[v2,4/4] virtio-net: Add support for USO features

Message ID 20230731223148.1002258-5-yuri.benditovich@daynix.com
State New
Series virtio-net: add USO feature (UDP segmentation offload)

Commit Message

Yuri Benditovich July 31, 2023, 10:31 p.m. UTC
USO features of the virtio-net device depend on the kernel's ability
to support them; for backward compatibility, the features are disabled
by default on 8.0 and earlier machine types.

Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
Signed-off-by: Andrew Melnychecnko <andrew@daynix.com>
---
 hw/core/machine.c   |  4 ++++
 hw/net/virtio-net.c | 31 +++++++++++++++++++++++++++++--
 2 files changed, 33 insertions(+), 2 deletions(-)

Comments

Akihiko Odaki Aug. 2, 2023, 5:17 a.m. UTC | #1
On 2023/08/01 7:31, Yuri Benditovich wrote:
> USO features of virtio-net device depend on kernel ability
> to support them, for backward compatibility by default the
> features are disabled on 8.0 and earlier.
> 
> Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
> Signed-off-by: Andrew Melnychecnko <andrew@daynix.com>
> ---
>   hw/core/machine.c   |  4 ++++
>   hw/net/virtio-net.c | 31 +++++++++++++++++++++++++++++--
>   2 files changed, 33 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/core/machine.c b/hw/core/machine.c
> index f0d35c6401..a725e76738 100644
> --- a/hw/core/machine.c
> +++ b/hw/core/machine.c
> @@ -38,10 +38,14 @@
>   #include "exec/confidential-guest-support.h"
>   #include "hw/virtio/virtio.h"
>   #include "hw/virtio/virtio-pci.h"
> +#include "hw/virtio/virtio-net.h"
>   
>   GlobalProperty hw_compat_8_0[] = {
>       { "migration", "multifd-flush-after-each-section", "on"},
>       { TYPE_PCI_DEVICE, "x-pcie-ari-nextfn-1", "on" },
> +    { TYPE_VIRTIO_NET, "host_uso", "off"},
> +    { TYPE_VIRTIO_NET, "guest_uso4", "off"},
> +    { TYPE_VIRTIO_NET, "guest_uso6", "off"},

Nitpick: add a space before the closing braces '}'.
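
For reference, the three entries with the suggested spacing would read
(whitespace only, no functional change):

    { TYPE_VIRTIO_NET, "host_uso", "off" },
    { TYPE_VIRTIO_NET, "guest_uso4", "off" },
    { TYPE_VIRTIO_NET, "guest_uso6", "off" },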

>   };
>   const size_t hw_compat_8_0_len = G_N_ELEMENTS(hw_compat_8_0);
>   
> diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
> index d2311e7d6e..bd0ead94fe 100644
> --- a/hw/net/virtio-net.c
> +++ b/hw/net/virtio-net.c
> @@ -659,6 +659,15 @@ static int peer_has_ufo(VirtIONet *n)
>       return n->has_ufo;
>   }
>   
> +static int peer_has_uso(VirtIONet *n)
> +{
> +    if (!peer_has_vnet_hdr(n)) {
> +        return 0;
> +    }
> +
> +    return qemu_has_uso(qemu_get_queue(n->nic)->peer);
> +}
> +
>   static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs,
>                                          int version_1, int hash_report)
>   {
> @@ -796,6 +805,10 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features,
>           virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO6);
>           virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ECN);
>   
> +        virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO);
> +        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4);
> +        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6);
> +
>           virtio_clear_feature(&features, VIRTIO_NET_F_HASH_REPORT);
>       }
>   
> @@ -804,6 +817,12 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features,
>           virtio_clear_feature(&features, VIRTIO_NET_F_HOST_UFO);
>       }
>   
> +    if (!peer_has_uso(n)) {
> +        virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO);
> +        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4);
> +        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6);
> +    }
> +
>       if (!get_vhost_net(nc->peer)) {
>           return features;
>       }
> @@ -864,14 +883,16 @@ static void virtio_net_apply_guest_offloads(VirtIONet *n)
>               !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO6)));
>   }
>   
> -static uint64_t virtio_net_guest_offloads_by_features(uint32_t features)
> +static uint64_t virtio_net_guest_offloads_by_features(uint64_t features)
>   {
>       static const uint64_t guest_offloads_mask =
>           (1ULL << VIRTIO_NET_F_GUEST_CSUM) |
>           (1ULL << VIRTIO_NET_F_GUEST_TSO4) |
>           (1ULL << VIRTIO_NET_F_GUEST_TSO6) |
>           (1ULL << VIRTIO_NET_F_GUEST_ECN)  |
> -        (1ULL << VIRTIO_NET_F_GUEST_UFO);
> +        (1ULL << VIRTIO_NET_F_GUEST_UFO)  |
> +        (1ULL << VIRTIO_NET_F_GUEST_USO4) |
> +        (1ULL << VIRTIO_NET_F_GUEST_USO6);
>   
>       return guest_offloads_mask & features;
>   }
> @@ -3924,6 +3945,12 @@ static Property virtio_net_properties[] = {
>       DEFINE_PROP_INT32("speed", VirtIONet, net_conf.speed, SPEED_UNKNOWN),
>       DEFINE_PROP_STRING("duplex", VirtIONet, net_conf.duplex_str),
>       DEFINE_PROP_BOOL("failover", VirtIONet, failover, false),
> +    DEFINE_PROP_BIT64("guest_uso4", VirtIONet, host_features,
> +                      VIRTIO_NET_F_GUEST_USO4, true),
> +    DEFINE_PROP_BIT64("guest_uso6", VirtIONet, host_features,
> +                      VIRTIO_NET_F_GUEST_USO6, true),
> +    DEFINE_PROP_BIT64("host_uso", VirtIONet, host_features,
> +                      VIRTIO_NET_F_HOST_USO, true),
>       DEFINE_PROP_END_OF_LIST(),
>   };
>
Peter Xu July 25, 2024, 10:18 p.m. UTC | #2
On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote:
> USO features of virtio-net device depend on kernel ability
> to support them, for backward compatibility by default the
> features are disabled on 8.0 and earlier.
> 
> Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
> Signed-off-by: Andrew Melnychecnko <andrew@daynix.com>

Looks like this patch broke migration when the VM starts on a host that
has USO support and is migrated to another host that doesn't..

Yuri, would it be possible to always keep all the USO* features off by
default (so these feature bits never affect the migration ABI), but then:

  - only enable them when the user specifies ON

  - meanwhile, if the host is detected not to support USO*, fail QEMU at
    boot rather than silently turning the feature from ON to OFF

?

Silently flipping the bit may cause migration issues like this.
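
As a rough sketch of the fail-at-boot idea (hedged: peer_has_uso() and
virtio_has_feature() exist in QEMU, but putting such a check in the
device realize path is an assumption, not code from this patch):

    /* Hypothetical check in virtio_net_device_realize(): refuse to
     * start instead of silently clearing the host feature bit. */
    if (virtio_has_feature(n->host_features, VIRTIO_NET_F_HOST_USO) &&
        !peer_has_uso(n)) {
        error_setg(errp, "peer does not support USO; set host_uso=off");
        return;
    }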

Or any suggestion on how to fix migration?

Thanks,

> ---
>  hw/core/machine.c   |  4 ++++
>  hw/net/virtio-net.c | 31 +++++++++++++++++++++++++++++--
>  2 files changed, 33 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/core/machine.c b/hw/core/machine.c
> index f0d35c6401..a725e76738 100644
> --- a/hw/core/machine.c
> +++ b/hw/core/machine.c
> @@ -38,10 +38,14 @@
>  #include "exec/confidential-guest-support.h"
>  #include "hw/virtio/virtio.h"
>  #include "hw/virtio/virtio-pci.h"
> +#include "hw/virtio/virtio-net.h"
>  
>  GlobalProperty hw_compat_8_0[] = {
>      { "migration", "multifd-flush-after-each-section", "on"},
>      { TYPE_PCI_DEVICE, "x-pcie-ari-nextfn-1", "on" },
> +    { TYPE_VIRTIO_NET, "host_uso", "off"},
> +    { TYPE_VIRTIO_NET, "guest_uso4", "off"},
> +    { TYPE_VIRTIO_NET, "guest_uso6", "off"},
>  };
>  const size_t hw_compat_8_0_len = G_N_ELEMENTS(hw_compat_8_0);
>  
> diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
> index d2311e7d6e..bd0ead94fe 100644
> --- a/hw/net/virtio-net.c
> +++ b/hw/net/virtio-net.c
> @@ -659,6 +659,15 @@ static int peer_has_ufo(VirtIONet *n)
>      return n->has_ufo;
>  }
>  
> +static int peer_has_uso(VirtIONet *n)
> +{
> +    if (!peer_has_vnet_hdr(n)) {
> +        return 0;
> +    }
> +
> +    return qemu_has_uso(qemu_get_queue(n->nic)->peer);
> +}
> +
>  static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs,
>                                         int version_1, int hash_report)
>  {
> @@ -796,6 +805,10 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features,
>          virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO6);
>          virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ECN);
>  
> +        virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO);
> +        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4);
> +        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6);
> +
>          virtio_clear_feature(&features, VIRTIO_NET_F_HASH_REPORT);
>      }
>  
> @@ -804,6 +817,12 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features,
>          virtio_clear_feature(&features, VIRTIO_NET_F_HOST_UFO);
>      }
>  
> +    if (!peer_has_uso(n)) {
> +        virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO);
> +        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4);
> +        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6);
> +    }
> +
>      if (!get_vhost_net(nc->peer)) {
>          return features;
>      }
> @@ -864,14 +883,16 @@ static void virtio_net_apply_guest_offloads(VirtIONet *n)
>              !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO6)));
>  }
>  
> -static uint64_t virtio_net_guest_offloads_by_features(uint32_t features)
> +static uint64_t virtio_net_guest_offloads_by_features(uint64_t features)
>  {
>      static const uint64_t guest_offloads_mask =
>          (1ULL << VIRTIO_NET_F_GUEST_CSUM) |
>          (1ULL << VIRTIO_NET_F_GUEST_TSO4) |
>          (1ULL << VIRTIO_NET_F_GUEST_TSO6) |
>          (1ULL << VIRTIO_NET_F_GUEST_ECN)  |
> -        (1ULL << VIRTIO_NET_F_GUEST_UFO);
> +        (1ULL << VIRTIO_NET_F_GUEST_UFO)  |
> +        (1ULL << VIRTIO_NET_F_GUEST_USO4) |
> +        (1ULL << VIRTIO_NET_F_GUEST_USO6);
>  
>      return guest_offloads_mask & features;
>  }
> @@ -3924,6 +3945,12 @@ static Property virtio_net_properties[] = {
>      DEFINE_PROP_INT32("speed", VirtIONet, net_conf.speed, SPEED_UNKNOWN),
>      DEFINE_PROP_STRING("duplex", VirtIONet, net_conf.duplex_str),
>      DEFINE_PROP_BOOL("failover", VirtIONet, failover, false),
> +    DEFINE_PROP_BIT64("guest_uso4", VirtIONet, host_features,
> +                      VIRTIO_NET_F_GUEST_USO4, true),
> +    DEFINE_PROP_BIT64("guest_uso6", VirtIONet, host_features,
> +                      VIRTIO_NET_F_GUEST_USO6, true),
> +    DEFINE_PROP_BIT64("host_uso", VirtIONet, host_features,
> +                      VIRTIO_NET_F_HOST_USO, true),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> -- 
> 2.34.3
> 
>
Jason Wang July 26, 2024, 2:12 a.m. UTC | #3
On Fri, Jul 26, 2024 at 6:19 AM Peter Xu <peterx@redhat.com> wrote:
>
> On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote:
> > USO features of virtio-net device depend on kernel ability
> > to support them, for backward compatibility by default the
> > features are disabled on 8.0 and earlier.
> >
> > Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
> > Signed-off-by: Andrew Melnychecnko <andrew@daynix.com>
>
> Looks like this patch broke migration when the VM starts on a host that has
> USO supported, to another host that doesn't..
>
> Yuri, would it be possible we always keep all the USO* features off by
> default (so this feature bit never affects migration ABI), but then:
>
>   - only enable them when the user specified ON
>
>   - meanwhile, if detecting host feature doesn't support USO*, it could
>     fail qemu from boot, rather than silently turning it from ON->OFF
>
> ?

I agree, I have raised the same issue several times in the past.

>
> Silently flipping the bit may cause migration issues like this.

Looking at virtio_net_get_features(), it silently clears a lot of features...

Thanks
>
> Or any suggestion on how to fix migration?
>
> Thanks,
>
> > ---
> >  hw/core/machine.c   |  4 ++++
> >  hw/net/virtio-net.c | 31 +++++++++++++++++++++++++++++--
> >  2 files changed, 33 insertions(+), 2 deletions(-)
> >
> > diff --git a/hw/core/machine.c b/hw/core/machine.c
> > index f0d35c6401..a725e76738 100644
> > --- a/hw/core/machine.c
> > +++ b/hw/core/machine.c
> > @@ -38,10 +38,14 @@
> >  #include "exec/confidential-guest-support.h"
> >  #include "hw/virtio/virtio.h"
> >  #include "hw/virtio/virtio-pci.h"
> > +#include "hw/virtio/virtio-net.h"
> >
> >  GlobalProperty hw_compat_8_0[] = {
> >      { "migration", "multifd-flush-after-each-section", "on"},
> >      { TYPE_PCI_DEVICE, "x-pcie-ari-nextfn-1", "on" },
> > +    { TYPE_VIRTIO_NET, "host_uso", "off"},
> > +    { TYPE_VIRTIO_NET, "guest_uso4", "off"},
> > +    { TYPE_VIRTIO_NET, "guest_uso6", "off"},
> >  };
> >  const size_t hw_compat_8_0_len = G_N_ELEMENTS(hw_compat_8_0);
> >
> > diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
> > index d2311e7d6e..bd0ead94fe 100644
> > --- a/hw/net/virtio-net.c
> > +++ b/hw/net/virtio-net.c
> > @@ -659,6 +659,15 @@ static int peer_has_ufo(VirtIONet *n)
> >      return n->has_ufo;
> >  }
> >
> > +static int peer_has_uso(VirtIONet *n)
> > +{
> > +    if (!peer_has_vnet_hdr(n)) {
> > +        return 0;
> > +    }
> > +
> > +    return qemu_has_uso(qemu_get_queue(n->nic)->peer);
> > +}
> > +
> >  static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs,
> >                                         int version_1, int hash_report)
> >  {
> > @@ -796,6 +805,10 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features,
> >          virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO6);
> >          virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ECN);
> >
> > +        virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO);
> > +        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4);
> > +        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6);
> > +
> >          virtio_clear_feature(&features, VIRTIO_NET_F_HASH_REPORT);
> >      }
> >
> > @@ -804,6 +817,12 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features,
> >          virtio_clear_feature(&features, VIRTIO_NET_F_HOST_UFO);
> >      }
> >
> > +    if (!peer_has_uso(n)) {
> > +        virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO);
> > +        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4);
> > +        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6);
> > +    }
> > +
> >      if (!get_vhost_net(nc->peer)) {
> >          return features;
> >      }
> > @@ -864,14 +883,16 @@ static void virtio_net_apply_guest_offloads(VirtIONet *n)
> >              !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO6)));
> >  }
> >
> > -static uint64_t virtio_net_guest_offloads_by_features(uint32_t features)
> > +static uint64_t virtio_net_guest_offloads_by_features(uint64_t features)
> >  {
> >      static const uint64_t guest_offloads_mask =
> >          (1ULL << VIRTIO_NET_F_GUEST_CSUM) |
> >          (1ULL << VIRTIO_NET_F_GUEST_TSO4) |
> >          (1ULL << VIRTIO_NET_F_GUEST_TSO6) |
> >          (1ULL << VIRTIO_NET_F_GUEST_ECN)  |
> > -        (1ULL << VIRTIO_NET_F_GUEST_UFO);
> > +        (1ULL << VIRTIO_NET_F_GUEST_UFO)  |
> > +        (1ULL << VIRTIO_NET_F_GUEST_USO4) |
> > +        (1ULL << VIRTIO_NET_F_GUEST_USO6);
> >
> >      return guest_offloads_mask & features;
> >  }
> > @@ -3924,6 +3945,12 @@ static Property virtio_net_properties[] = {
> >      DEFINE_PROP_INT32("speed", VirtIONet, net_conf.speed, SPEED_UNKNOWN),
> >      DEFINE_PROP_STRING("duplex", VirtIONet, net_conf.duplex_str),
> >      DEFINE_PROP_BOOL("failover", VirtIONet, failover, false),
> > +    DEFINE_PROP_BIT64("guest_uso4", VirtIONet, host_features,
> > +                      VIRTIO_NET_F_GUEST_USO4, true),
> > +    DEFINE_PROP_BIT64("guest_uso6", VirtIONet, host_features,
> > +                      VIRTIO_NET_F_GUEST_USO6, true),
> > +    DEFINE_PROP_BIT64("host_uso", VirtIONet, host_features,
> > +                      VIRTIO_NET_F_HOST_USO, true),
> >      DEFINE_PROP_END_OF_LIST(),
> >  };
> >
> > --
> > 2.34.3
> >
> >
>
> --
> Peter Xu
>
Michael S. Tsirkin July 26, 2024, 6:08 a.m. UTC | #4
On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote:
> On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote:
> > USO features of virtio-net device depend on kernel ability
> > to support them, for backward compatibility by default the
> > features are disabled on 8.0 and earlier.
> > 
> > Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
> > Signed-off-by: Andrew Melnychecnko <andrew@daynix.com>
> 
> Looks like this patch broke migration when the VM starts on a host that has
> USO supported, to another host that doesn't..

This was always the case with all offloads. The answer at the moment is,
don't do this. Long term, we need to start exposing management APIs
to discover this, and management has to disable unsupported features.
Thomas Huth July 26, 2024, 7:03 a.m. UTC | #5
On 26/07/2024 08.08, Michael S. Tsirkin wrote:
> On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote:
>> On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote:
>>> USO features of virtio-net device depend on kernel ability
>>> to support them, for backward compatibility by default the
>>> features are disabled on 8.0 and earlier.
>>>
>>> Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
>>> Signed-off-by: Andrew Melnychecnko <andrew@daynix.com>
>>
>> Looks like this patch broke migration when the VM starts on a host that has
>> USO supported, to another host that doesn't..
> 
> This was always the case with all offloads. The answer at the moment is,
> don't do this.

May I ask for my understanding:
"don't do this" = don't automatically enable/disable virtio features in QEMU 
depending on host kernel features, or "don't do this" = don't try to migrate 
between machines that have different host kernel features?

> Long term, we need to start exposing management APIs
> to discover this, and management has to disable unsupported features.

Ack, this likely needs some treatment from the libvirt side, too.

  Thomas
Michael S. Tsirkin July 26, 2024, 7:25 a.m. UTC | #6
On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote:
> On 26/07/2024 08.08, Michael S. Tsirkin wrote:
> > On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote:
> > > On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote:
> > > > USO features of virtio-net device depend on kernel ability
> > > > to support them, for backward compatibility by default the
> > > > features are disabled on 8.0 and earlier.
> > > > 
> > > > Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
> > > > Signed-off-by: Andrew Melnychecnko <andrew@daynix.com>
> > > 
> > > Looks like this patch broke migration when the VM starts on a host that has
> > > USO supported, to another host that doesn't..
> > 
> > This was always the case with all offloads. The answer at the moment is,
> > don't do this.
> 
> May I ask for my understanding:
> "don't do this" = don't automatically enable/disable virtio features in QEMU
> depending on host kernel features, or "don't do this" = don't try to migrate
> between machines that have different host kernel features?

The latter.

> > Long term, we need to start exposing management APIs
> > to discover this, and management has to disable unsupported features.
> 
> Ack, this likely needs some treatments from the libvirt side, too.
> 
>  Thomas
Daniel P. Berrangé July 26, 2024, 8:48 a.m. UTC | #7
On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote:
> On 26/07/2024 08.08, Michael S. Tsirkin wrote:
> > On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote:
> > > On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote:
> > > > USO features of virtio-net device depend on kernel ability
> > > > to support them, for backward compatibility by default the
> > > > features are disabled on 8.0 and earlier.
> > > > 
> > > > Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
> > > > Signed-off-by: Andrew Melnychecnko <andrew@daynix.com>
> > > 
> > > Looks like this patch broke migration when the VM starts on a host that has
> > > USO supported, to another host that doesn't..
> > 
> > This was always the case with all offloads. The answer at the moment is,
> > don't do this.
> 
> May I ask for my understanding:
> "don't do this" = don't automatically enable/disable virtio features in QEMU
> depending on host kernel features, or "don't do this" = don't try to migrate
> between machines that have different host kernel features?
> 
> > Long term, we need to start exposing management APIs
> > to discover this, and management has to disable unsupported features.
> 
> Ack, this likely needs some treatments from the libvirt side, too.

When QEMU automatically toggles machine type features based on the host
kernel, relying on libvirt to then disable them again is impractical,
as we cannot assume that the libvirt version people are using knows
about newly introduced features. Even if libvirt is updated to know
about it, people can easily be using a previous libvirt release.

QEMU itself needs to make the machine types do what they are there
to do, which is to define a stable machine ABI.

What QEMU is missing here is a "platform ABI" concept, to encode
sets of features which are tied to specific platform generations.
As long as we don't have that we'll keep having these broken
migration problems from machine types dynamically changing instead
of providing a stable guest ABI.

With regards,
Daniel
Peter Xu July 26, 2024, 11:32 a.m. UTC | #8
On Fri, Jul 26, 2024 at 03:25:31AM -0400, Michael S. Tsirkin wrote:
> On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote:
> > On 26/07/2024 08.08, Michael S. Tsirkin wrote:
> > > On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote:
> > > > On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote:
> > > > > USO features of virtio-net device depend on kernel ability
> > > > > to support them, for backward compatibility by default the
> > > > > features are disabled on 8.0 and earlier.
> > > > > 
> > > > > Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
> > > > > Signed-off-by: Andrew Melnychecnko <andrew@daynix.com>
> > > > 
> > > > Looks like this patch broke migration when the VM starts on a host that has
> > > > USO supported, to another host that doesn't..
> > > 
> > > This was always the case with all offloads. The answer at the moment is,
> > > don't do this.
> > 
> > May I ask for my understanding:
> > "don't do this" = don't automatically enable/disable virtio features in QEMU
> > depending on host kernel features, or "don't do this" = don't try to migrate
> > between machines that have different host kernel features?
> 
> The later.

The question is how a user should know that a migration is not supported.

The user can be using exactly the same QEMU binary on two hosts, while
there can be a tiny difference in host kernel version, and then
migration can fail between them mysteriously.

There are too many kernel features that can be on/off when kernels
differ, even slightly.  I don't see how someone could even identify such
an issue, unless they use exactly the same host kernel on both sides..
Peter Xu July 26, 2024, 2:43 p.m. UTC | #9
On Fri, Jul 26, 2024 at 09:48:02AM +0100, Daniel P. Berrangé wrote:
> On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote:
> > On 26/07/2024 08.08, Michael S. Tsirkin wrote:
> > > On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote:
> > > > On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote:
> > > > > USO features of virtio-net device depend on kernel ability
> > > > > to support them, for backward compatibility by default the
> > > > > features are disabled on 8.0 and earlier.
> > > > > 
> > > > > Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
> > > > > Signed-off-by: Andrew Melnychecnko <andrew@daynix.com>
> > > > 
> > > > Looks like this patch broke migration when the VM starts on a host that has
> > > > USO supported, to another host that doesn't..
> > > 
> > > This was always the case with all offloads. The answer at the moment is,
> > > don't do this.
> > 
> > May I ask for my understanding:
> > "don't do this" = don't automatically enable/disable virtio features in QEMU
> > depending on host kernel features, or "don't do this" = don't try to migrate
> > between machines that have different host kernel features?
> > 
> > > Long term, we need to start exposing management APIs
> > > to discover this, and management has to disable unsupported features.
> > 
> > Ack, this likely needs some treatments from the libvirt side, too.
> 
> When QEMU automatically toggles machine type featuers based on host
> kernel, relying on libvirt to then disable them again is impractical,
> as we cannot assume that the libvirt people are using knows about
> newly introduced features. Even if libvirt is updated to know about
> it, people can easily be using a previous libvirt release.
> 
> QEMU itself needs to make the machine types do that they are there
> todo, which is to define a stable machine ABI. 
> 
> What QEMU is missing here is a "platform ABI" concept, to encode
> sets of features which are tied to specific platform generations.
> As long as we don't have that we'll keep having these broken
> migration problems from machine types dynamically changing instead
> of providing a stable guest ABI.

Any more elaboration on this idea?  Would it be feasible to implement?

I'd second any sane solution that avoids similar breakages happening in
the future.

I also wonder what else might be affected like this, where migration can
break with a changed kernel or changed HW.  I suppose the CPU model is
well covered by libvirt, so we're fine at least on x86 etc.  IIUC KVM
always keeps such concerns in mind, so KVM will make sure not to break
userspace in such a way, or it's simply a KVM bug to be fixed.

Thanks,
Peter Xu July 26, 2024, 3:01 p.m. UTC | #10
On Fri, Jul 26, 2024 at 10:12:31AM +0800, Jason Wang wrote:
> On Fri, Jul 26, 2024 at 6:19 AM Peter Xu <peterx@redhat.com> wrote:
> >
> > On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote:
> > > USO features of virtio-net device depend on kernel ability
> > > to support them, for backward compatibility by default the
> > > features are disabled on 8.0 and earlier.
> > >
> > > Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
> > > Signed-off-by: Andrew Melnychecnko <andrew@daynix.com>
> >
> > Looks like this patch broke migration when the VM starts on a host that has
> > USO supported, to another host that doesn't..
> >
> > Yuri, would it be possible we always keep all the USO* features off by
> > default (so this feature bit never affects migration ABI), but then:
> >
> >   - only enable them when the user specified ON
> >
> >   - meanwhile, if detecting host feature doesn't support USO*, it could
> >     fail qemu from boot, rather than silently turning it from ON->OFF
> >
> > ?
> 
> I agree, I have raised the same issue several times in the past.
> 
> >
> > Silently flipping the bit may cause migration issues like this.
> 
> Looking at virtio_net_get_features(), it silently clears a lot of features...

Yes.. :-( I saw that too when looking at this.

Is it because most of those features are supported on most kernels, so
we have been fine until now by chance?  It looks like e.g. TUN_F_USO4
has only been supported for ~1.5 years, so it's relatively new.

Thanks,
Daniel P. Berrangé July 26, 2024, 3:17 p.m. UTC | #11
On Fri, Jul 26, 2024 at 10:43:42AM -0400, Peter Xu wrote:
> On Fri, Jul 26, 2024 at 09:48:02AM +0100, Daniel P. Berrangé wrote:
> > On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote:
> > > On 26/07/2024 08.08, Michael S. Tsirkin wrote:
> > > > On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote:
> > > > > On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote:
> > > > > > USO features of virtio-net device depend on kernel ability
> > > > > > to support them, for backward compatibility by default the
> > > > > > features are disabled on 8.0 and earlier.
> > > > > > 
> > > > > > Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
> > > > > > Signed-off-by: Andrew Melnychecnko <andrew@daynix.com>
> > > > > 
> > > > > Looks like this patch broke migration when the VM starts on a host that has
> > > > > USO supported, to another host that doesn't..
> > > > 
> > > > This was always the case with all offloads. The answer at the moment is,
> > > > don't do this.
> > > 
> > > May I ask for my understanding:
> > > "don't do this" = don't automatically enable/disable virtio features in QEMU
> > > depending on host kernel features, or "don't do this" = don't try to migrate
> > > between machines that have different host kernel features?
> > > 
> > > > Long term, we need to start exposing management APIs
> > > > to discover this, and management has to disable unsupported features.
> > > 
> > > Ack, this likely needs some treatments from the libvirt side, too.
> > 
> > When QEMU automatically toggles machine type featuers based on host
> > kernel, relying on libvirt to then disable them again is impractical,
> > as we cannot assume that the libvirt people are using knows about
> > newly introduced features. Even if libvirt is updated to know about
> > it, people can easily be using a previous libvirt release.
> > 
> > QEMU itself needs to make the machine types do that they are there
> > todo, which is to define a stable machine ABI. 
> > 
> > What QEMU is missing here is a "platform ABI" concept, to encode
> > sets of features which are tied to specific platform generations.
> > As long as we don't have that we'll keep having these broken
> > migration problems from machine types dynamically changing instead
> > of providing a stable guest ABI.
> 
> Any more elaboration on this idea?  Would it be easily feasible in
> implementation?

In terms of launching QEMU I'd imagine:

  $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args...

Any virtual machine HW features which are tied to host kernel features
would have their defaults set based on the requested -platform. The
-machine will be fully invariant wrt the host kernel.

You would have -platform help to list available platforms, and a
corresponding QMP "query-platforms" command to list what platforms
are supported on a given host OS.

Downstream distros can provide their own platforms definitions
(eg "linux-rhel-9.5") if they have kernels whose feature set
diverges from upstream due to backports.

Mgmt apps won't need to be taught about every single little QEMU
setting whose default is derived from the kernel. Individual
defaults are opaque and controlled by the requested platform.

Live migration has clearly defined semantics, and mgmt app can
use query-platforms to validate two hosts are compatible.

Omitting -platform should pick the very latest platform that is
compatible with the current host (not necessarily the latest
platform built into QEMU).
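
A hedged sketch of what a platform definition could look like if it
reused QEMU's existing GlobalProperty machinery (the platform_linux_6_9
array and its wiring are assumptions for illustration, not an existing
QEMU API):

    /* Hypothetical platform definition: feature defaults that a given
     * host kernel generation is known to support. */
    static const GlobalProperty platform_linux_6_9[] = {
        { TYPE_VIRTIO_NET, "host_uso", "on" },
        { TYPE_VIRTIO_NET, "guest_uso4", "on" },
        { TYPE_VIRTIO_NET, "guest_uso6", "on" },
    };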


With regards,
Daniel
Thomas Huth July 26, 2024, 5:39 p.m. UTC | #12
On 26/07/2024 09.25, Michael S. Tsirkin wrote:
> On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote:
>> On 26/07/2024 08.08, Michael S. Tsirkin wrote:
>>> On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote:
>>>> On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote:
>>>>> USO features of virtio-net device depend on kernel ability
>>>>> to support them, for backward compatibility by default the
>>>>> features are disabled on 8.0 and earlier.
>>>>>
>>>>> Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
>>>>> Signed-off-by: Andrew Melnychecnko <andrew@daynix.com>
>>>>
>>>> Looks like this patch broke migration when the VM starts on a host that has
>>>> USO supported, to another host that doesn't..
>>>
>>> This was always the case with all offloads. The answer at the moment is,
>>> don't do this.
>>
>> May I ask for my understanding:
>> "don't do this" = don't automatically enable/disable virtio features in QEMU
>> depending on host kernel features, or "don't do this" = don't try to migrate
>> between machines that have different host kernel features?
> 
> The later.

From my experience, it should rather be the former. We've seen similar
issues with the s390x machine in the past when trying to automatically
enable features depending on the availability of a kernel feature. While
it looks nicer at first glance ("hey, a new feature is available, we
enable that for you, dear user!"), you end up in migration hell pretty
quickly.

Maybe we could elevate the "--nodefaults" command line switch to avoid
enabling such features automatically?

Anyway, while we're discussing solutions: we are in softfreeze already.
Should we disable the USO bits in the new 9.1 machine type for the time
being to avoid more people running into this problem?

  Thomas
Peter Xu July 26, 2024, 8:47 p.m. UTC | #13
On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote:
> On Fri, Jul 26, 2024 at 10:43:42AM -0400, Peter Xu wrote:
> > On Fri, Jul 26, 2024 at 09:48:02AM +0100, Daniel P. Berrangé wrote:
> > > On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote:
> > > > On 26/07/2024 08.08, Michael S. Tsirkin wrote:
> > > > > On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote:
> > > > > > On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote:
> > > > > > > USO features of virtio-net device depend on kernel ability
> > > > > > > to support them, for backward compatibility by default the
> > > > > > > features are disabled on 8.0 and earlier.
> > > > > > > 
> > > > > > > Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
> > > > > > > Signed-off-by: Andrew Melnychecnko <andrew@daynix.com>
> > > > > > 
> > > > > > Looks like this patch broke migration when the VM starts on a host that has
> > > > > > USO supported, to another host that doesn't..
> > > > > 
> > > > > This was always the case with all offloads. The answer at the moment is,
> > > > > don't do this.
> > > > 
> > > > May I ask for my understanding:
> > > > "don't do this" = don't automatically enable/disable virtio features in QEMU
> > > > depending on host kernel features, or "don't do this" = don't try to migrate
> > > > between machines that have different host kernel features?
> > > > 
> > > > > Long term, we need to start exposing management APIs
> > > > > to discover this, and management has to disable unsupported features.
> > > > 
> > > > Ack, this likely needs some treatments from the libvirt side, too.
> > > 
> > > When QEMU automatically toggles machine type featuers based on host
> > > kernel, relying on libvirt to then disable them again is impractical,
> > > as we cannot assume that the libvirt people are using knows about
> > > newly introduced features. Even if libvirt is updated to know about
> > > it, people can easily be using a previous libvirt release.
> > > 
> > > QEMU itself needs to make the machine types do that they are there
> > > todo, which is to define a stable machine ABI. 
> > > 
> > > What QEMU is missing here is a "platform ABI" concept, to encode
> > > sets of features which are tied to specific platform generations.
> > > As long as we don't have that we'll keep having these broken
> > > migration problems from machine types dynamically changing instead
> > > of providing a stable guest ABI.
> > 
> > Any more elaboration on this idea?  Would it be easily feasible in
> > implementation?
> 
> In terms of launching QEMU I'd imagine:
> 
>   $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args...
> 
> Any virtual machine HW features which are tied to host kernel features
> would have their defaults set based on the requested -platform. The
> -machine will be fully invariant wrt the host kernel.
> 
> You would have -platform hlep to list available platforms, and
> corresonding QMP "query-platforms" command to list what platforms
> are supported on a given host OS.
> 
> Downstream distros can provide their own platforms definitions
> (eg "linux-rhel-9.5") if they have kernels whose feature set
> diverges from upstream due to backports.
> 
> Mgmt apps won't need to be taught about every single little QEMU
> setting whose default is derived from the kernel. Individual
> defaults are opaque and controlled by the requested platform.
> 
> Live migration has clearly defined semantics, and mgmt app can
> use query-platforms to validate two hosts are compatible.
> 
> Omitting -platform should pick the very latest platform that is
> cmpatible with the current host (not neccessarily the latest
> platform built-in to QEMU).

This seems to add one more layer to maintain, and so far I don't know
whether it's a must.

To put it simply, can we rely on the QEMU cmdline as "the guest ABI"?  I
thought that was mostly the case already, except for some extremely rare
outliers.

When we have one host that boots up a VM using:

  $QEMU1 $cmdline

Then another host boots up:

  $QEMU2 $cmdline -incoming XXX

Then migration should succeed if $cmdline is exactly the same, and the VM
can boot up all fine without errors on both sides.

AFAICT this has nothing to do with what kernel is underneath, even a
non-Linux one?  I think either QEMU1 or QEMU2 has the option to fail.
But if it didn't, I thought the ABI should be guaranteed.

That's why I think this is a migration violation, as 99.99% of other
device properties follow this rule.  The issue here is, we have the same
virtio-net-pci cmdline on both sides in this case, but the ABI got
broken.

That's also why I was suggesting that if a property contributes to the
guest ABI, then AFAIU QEMU needs to:

  - Firstly, never quietly flip any bit that affects the ABI...

  - Have a default value of off, so that QEMU will always allow the VM
    to boot by default, while advanced users can opt in to new features
    (see the sketch below).  We can't make this ON by default, otherwise
    some VMs can already fail to boot,

  - If the host doesn't support the feature while the cmdline enabled
    it, QEMU needs to fail at boot rather than flip the bit, so that it
    says "hey, this host does not support running such a VM as
    specified, due to the XXX feature missing".
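
A minimal sketch of the opt-in default (the same properties as in this
patch with the default flipped to false; this reflects the suggestion
above, not the code as merged):

    DEFINE_PROP_BIT64("guest_uso4", VirtIONet, host_features,
                      VIRTIO_NET_F_GUEST_USO4, false),
    DEFINE_PROP_BIT64("guest_uso6", VirtIONet, host_features,
                      VIRTIO_NET_F_GUEST_USO6, false),
    DEFINE_PROP_BIT64("host_uso", VirtIONet, host_features,
                      VIRTIO_NET_F_HOST_USO, false),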

That's the only way a user could understand what happened, and IMHO
that's a clean way: we stick with the QEMU cmdline for defining the
guest ABI, in which the machine type is the foundation of such a
definition, as the machine type decides many of the rest of the compat
properties.  And that's the whole point of the compat properties too (to
make sure the guest ABI is stable).

If the kernel breaks it easily, all the compat property things that we
maintain can already stop making sense in general, because they didn't
define the whole guest ABI..

So AFAIU that's really what we have used for years, I hope I didn't
overlook something.  And maybe we don't yet need the "-platform" layer
if we can keep up with this rule?

Thanks,
Peter Xu July 26, 2024, 8:55 p.m. UTC | #14
On Fri, Jul 26, 2024 at 07:39:46PM +0200, Thomas Huth wrote:
> On 26/07/2024 09.25, Michael S. Tsirkin wrote:
> > On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote:
> > > On 26/07/2024 08.08, Michael S. Tsirkin wrote:
> > > > On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote:
> > > > > On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote:
> > > > > > USO features of virtio-net device depend on kernel ability
> > > > > > to support them, for backward compatibility by default the
> > > > > > features are disabled on 8.0 and earlier.
> > > > > > 
> > > > > > Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
> > > > > > Signed-off-by: Andrew Melnychecnko <andrew@daynix.com>
> > > > > 
> > > > > Looks like this patch broke migration when the VM starts on a host that has
> > > > > USO supported, to another host that doesn't..
> > > > 
> > > > This was always the case with all offloads. The answer at the moment is,
> > > > don't do this.
> > > 
> > > May I ask for my understanding:
> > > "don't do this" = don't automatically enable/disable virtio features in QEMU
> > > depending on host kernel features, or "don't do this" = don't try to migrate
> > > between machines that have different host kernel features?
> > 
> > The later.
> 
> From my experience, it should rather be the former. We've seen similar
> issues with the s390x machine in the past when trying to automatically
> enable features depending on the availability of a kernel features. While it
> looks nicer at a very first glance ("hey, a new feature is available, we
> enable that for you, dear user!"), you end up in migration hell pretty
> quickly.
> 
> Maybe we could elevate the "--nodefaults" command line switch to avoid
> enabling such features automatically?
> 
> Anyway, while we're discussing solutions: We are in softfreeze already.
> Should we disable the UFO bits in the new 9.1 machine type for the time
> being to avoid that more people are running into this problem?

Probably too late for this one; this patch was merged in 8.2.
Unfortunately CIs won't even cover a test across two host kernels, and
even then one would need to be unlucky enough that one host has USO and
the other doesn't..

But I do agree with Thomas here.

I think the only features that can be auto-enabled are the ones that do
not affect the guest ABI.  When the ABI is affected, the only right way
to enable them, to me, is exposing a -device interface so that libvirt
can opt in to enabling them when host support is detected.  For QEMU
users, that means the user needs to explicitly enable them or they're
off.

Or, there's another option: turn the default to ON for such a feature,
but only once most kernels support it.  With that, we can set OFF in a
compat property for old machine types, and we should fail the new
machine type at boot when running on an old kernel without the feature.

Thanks,
Akihiko Odaki July 28, 2024, 3:18 p.m. UTC | #15
On 2024/07/27 5:47, Peter Xu wrote:
> On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote:
>> On Fri, Jul 26, 2024 at 10:43:42AM -0400, Peter Xu wrote:
>>> On Fri, Jul 26, 2024 at 09:48:02AM +0100, Daniel P. Berrangé wrote:
>>>> On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote:
>>>>> On 26/07/2024 08.08, Michael S. Tsirkin wrote:
>>>>>> On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote:
>>>>>>> On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote:
>>>>>>>> USO features of virtio-net device depend on kernel ability
>>>>>>>> to support them, for backward compatibility by default the
>>>>>>>> features are disabled on 8.0 and earlier.
>>>>>>>>
>>>>>>>> Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
>>>>>>>> Signed-off-by: Andrew Melnychecnko <andrew@daynix.com>
>>>>>>>
>>>>>>> Looks like this patch broke migration when the VM starts on a host that has
>>>>>>> USO supported, to another host that doesn't..
>>>>>>
>>>>>> This was always the case with all offloads. The answer at the moment is,
>>>>>> don't do this.
>>>>>
>>>>> May I ask for my understanding:
>>>>> "don't do this" = don't automatically enable/disable virtio features in QEMU
>>>>> depending on host kernel features, or "don't do this" = don't try to migrate
>>>>> between machines that have different host kernel features?
>>>>>
>>>>>> Long term, we need to start exposing management APIs
>>>>>> to discover this, and management has to disable unsupported features.
>>>>>
>>>>> Ack, this likely needs some treatments from the libvirt side, too.
>>>>
>>>> When QEMU automatically toggles machine type featuers based on host
>>>> kernel, relying on libvirt to then disable them again is impractical,
>>>> as we cannot assume that the libvirt people are using knows about
>>>> newly introduced features. Even if libvirt is updated to know about
>>>> it, people can easily be using a previous libvirt release.
>>>>
>>>> QEMU itself needs to make the machine types do that they are there
>>>> todo, which is to define a stable machine ABI.
>>>>
>>>> What QEMU is missing here is a "platform ABI" concept, to encode
>>>> sets of features which are tied to specific platform generations.
>>>> As long as we don't have that we'll keep having these broken
>>>> migration problems from machine types dynamically changing instead
>>>> of providing a stable guest ABI.
>>>
>>> Any more elaboration on this idea?  Would it be easily feasible in
>>> implementation?
>>
>> In terms of launching QEMU I'd imagine:
>>
>>    $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args...
>>
>> Any virtual machine HW features which are tied to host kernel features
>> would have their defaults set based on the requested -platform. The
>> -machine will be fully invariant wrt the host kernel.
>>
>> You would have -platform hlep to list available platforms, and
>> corresonding QMP "query-platforms" command to list what platforms
>> are supported on a given host OS.
>>
>> Downstream distros can provide their own platforms definitions
>> (eg "linux-rhel-9.5") if they have kernels whose feature set
>> diverges from upstream due to backports.
>>
>> Mgmt apps won't need to be taught about every single little QEMU
>> setting whose default is derived from the kernel. Individual
>> defaults are opaque and controlled by the requested platform.
>>
>> Live migration has clearly defined semantics, and mgmt app can
>> use query-platforms to validate two hosts are compatible.
>>
>> Omitting -platform should pick the very latest platform that is
>> cmpatible with the current host (not neccessarily the latest
>> platform built-in to QEMU).
> 
> This seems to add one more layer to maintain, and so far I don't know
> whether it's a must.
> 
> To put it simple, can we simply rely on qemu cmdline as "the guest ABI"?  I
> thought it was mostly the case already, except some extremely rare
> outliers.
> 
> When we have one host that boots up a VM using:
> 
>    $QEMU1 $cmdline
> 
> Then another host boots up:
> 
>    $QEMU2 $cmdline -incoming XXX
> 
> Then migration should succeed if $cmdline is exactly the same, and the VM
> can boot up all fine without errors on both sides.
> 
> AFAICT this has nothing to do with what kernel is underneath, even not
> Linux?  I think either QEMU1 / QEMU2 has the option to fail.  But if it
> didn't, I thought the ABI should be guaranteed.
> 
> That's why I think this is a migration violation, as 99.99% of other device
> properties should be following this rule.  The issue here is, we have the
> same virtio-net-pci cmdline on both sides in this case, but the ABI got
> break.
> 
> That's also why I was suggesting if the property contributes to the guest
> ABI, then AFAIU QEMU needs to:
> 
>    - Firstly, never quietly flipping any bit that affects the ABI...
> 
>    - Have a default value of off, then QEMU will always allow the VM to boot
>      by default, while advanced users can opt-in on new features.  We can't
>      make this ON by default otherwise some VMs can already fail to boot,

It may not necessarily be the case that old features are supported by
every system. In an extreme case, a user may migrate a VM from Linux to
Windows, which probably doesn't support any offloading at all. A more
convincing scenario is RSS offloading with eBPF; using eBPF requires a
privilege, so we cannot assume it is always available even on the latest
version of Linux.

> 
>    - If the host doesn't support the feature while the cmdline enabled it,
>      it needs to fail QEMU boot rather than flipping, so that it says "hey,
>      this host does not support running such VM specified, due to XXX
>      feature missing".

This is handled in:

"virtio-net: Convert feature properties to OnOffAuto"
https://patchew.org/QEMU/20240714-auto-v3-0-e27401aabab3@daynix.com/
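
A hedged sketch of the OnOffAuto idea (the referenced series defines the
actual plumbing; the property field name "uso" and its placement here
are assumptions for illustration):

    /* AUTO follows host support; ON fails realize when unsupported. */
    switch (n->uso) {
    case ON_OFF_AUTO_ON:
        if (!peer_has_uso(n)) {
            error_setg(errp, "USO is not supported by the peer");
            return;
        }
        uso = true;
        break;
    case ON_OFF_AUTO_AUTO:
        uso = peer_has_uso(n);
        break;
    default:
        uso = false;
    }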

> 
> That's the only way an user could understand what happened, and IMHO that's
> a clean way that we stick with QEMU cmdline on defining the guest ABI,
> while in which the machine type is the fundation of such definition, as the
> machine type can decides many of the rest compat properties.  And that's
> the whole point of the compat properties too (to make sure the guest ABI is
> stable).
> 
> If kernel breaks it easily, all compat property things that we maintain can
> already stop making sense in general, because it didn't define the whol
> guest ABI..
> 
> So AFAIU that's really what we used for years, I hope I didn't overlook
> somehting.  And maybe we don't yet need the "-platform" layer if we can
> keep up with this rule?

I think a device which cannot conform to that rule should be 
non-migratable. For example, virtio-gpu-gl does not conform to it, and 
does not support migration either.

Regards,
Akihiko Odaki
Jason Wang July 29, 2024, 3:50 a.m. UTC | #16
On Sun, Jul 28, 2024 at 11:19 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>
> On 2024/07/27 5:47, Peter Xu wrote:
> > On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote:
> >> On Fri, Jul 26, 2024 at 10:43:42AM -0400, Peter Xu wrote:
> >>> On Fri, Jul 26, 2024 at 09:48:02AM +0100, Daniel P. Berrangé wrote:
> >>>> On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote:
> >>>>> On 26/07/2024 08.08, Michael S. Tsirkin wrote:
> >>>>>> On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote:
> >>>>>>> On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote:
> >>>>>>>> USO features of virtio-net device depend on kernel ability
> >>>>>>>> to support them, for backward compatibility by default the
> >>>>>>>> features are disabled on 8.0 and earlier.
> >>>>>>>>
> >>>>>>>> Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
> >>>>>>>> Signed-off-by: Andrew Melnychecnko <andrew@daynix.com>
> >>>>>>>
> >>>>>>> Looks like this patch broke migration when the VM starts on a host that has
> >>>>>>> USO supported, to another host that doesn't..
> >>>>>>
> >>>>>> This was always the case with all offloads. The answer at the moment is,
> >>>>>> don't do this.
> >>>>>
> >>>>> May I ask for my understanding:
> >>>>> "don't do this" = don't automatically enable/disable virtio features in QEMU
> >>>>> depending on host kernel features, or "don't do this" = don't try to migrate
> >>>>> between machines that have different host kernel features?
> >>>>>
> >>>>>> Long term, we need to start exposing management APIs
> >>>>>> to discover this, and management has to disable unsupported features.
> >>>>>
> >>>>> Ack, this likely needs some treatments from the libvirt side, too.
> >>>>
> >>>> When QEMU automatically toggles machine type featuers based on host
> >>>> kernel, relying on libvirt to then disable them again is impractical,
> >>>> as we cannot assume that the libvirt people are using knows about
> >>>> newly introduced features. Even if libvirt is updated to know about
> >>>> it, people can easily be using a previous libvirt release.
> >>>>
> >>>> QEMU itself needs to make the machine types do that they are there
> >>>> todo, which is to define a stable machine ABI.
> >>>>
> >>>> What QEMU is missing here is a "platform ABI" concept, to encode
> >>>> sets of features which are tied to specific platform generations.
> >>>> As long as we don't have that we'll keep having these broken
> >>>> migration problems from machine types dynamically changing instead
> >>>> of providing a stable guest ABI.
> >>>
> >>> Any more elaboration on this idea?  Would it be easily feasible in
> >>> implementation?
> >>
> >> In terms of launching QEMU I'd imagine:
> >>
> >>    $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args...
> >>
> >> Any virtual machine HW features which are tied to host kernel features
> >> would have their defaults set based on the requested -platform. The
> >> -machine will be fully invariant wrt the host kernel.
> >>
> >> You would have -platform hlep to list available platforms, and
> >> corresonding QMP "query-platforms" command to list what platforms
> >> are supported on a given host OS.
> >>
> >> Downstream distros can provide their own platforms definitions
> >> (eg "linux-rhel-9.5") if they have kernels whose feature set
> >> diverges from upstream due to backports.
> >>
> >> Mgmt apps won't need to be taught about every single little QEMU
> >> setting whose default is derived from the kernel. Individual
> >> defaults are opaque and controlled by the requested platform.
> >>
> >> Live migration has clearly defined semantics, and mgmt app can
> >> use query-platforms to validate two hosts are compatible.
> >>
> >> Omitting -platform should pick the very latest platform that is
> >> cmpatible with the current host (not neccessarily the latest
> >> platform built-in to QEMU).
> >
> > This seems to add one more layer to maintain, and so far I don't know
> > whether it's a must.
> >
> > To put it simple, can we simply rely on qemu cmdline as "the guest ABI"?  I
> > thought it was mostly the case already, except some extremely rare
> > outliers.
> >
> > When we have one host that boots up a VM using:
> >
> >    $QEMU1 $cmdline
> >
> > Then another host boots up:
> >
> >    $QEMU2 $cmdline -incoming XXX
> >
> > Then migration should succeed if $cmdline is exactly the same, and the VM
> > can boot up all fine without errors on both sides.
> >
> > AFAICT this has nothing to do with what kernel is underneath, even not
> > Linux?  I think either QEMU1 / QEMU2 has the option to fail.  But if it
> > didn't, I thought the ABI should be guaranteed.
> >
> > That's why I think this is a migration violation, as 99.99% of other device
> > properties should be following this rule.  The issue here is, we have the
> > same virtio-net-pci cmdline on both sides in this case, but the ABI got
> > break.
> >
> > That's also why I was suggesting if the property contributes to the guest
> > ABI, then AFAIU QEMU needs to:
> >
> >    - Firstly, never quietly flipping any bit that affects the ABI...
> >
> >    - Have a default value of off, then QEMU will always allow the VM to boot
> >      by default, while advanced users can opt-in on new features.  We can't
> >      make this ON by default otherwise some VMs can already fail to boot,
>
> It may not be necessary the case that old features are supported by
> every systems. In an extreme case, a user may migrate a VM from Linux to
> Windows, which probably doesn't support any offloading at all. A more
> convincing scenario is RSS offloading with eBPF; using eBPF requires a
> privilege so we cannot assume it is always available even on the latest
> version of Linux.

I don't get why eBPF matters here. It is something that is not noticed
by the guest and we have a fallback anyhow.

>
> >
> >    - If the host doesn't support the feature while the cmdline enabled it,
> >      it needs to fail QEMU boot rather than flipping, so that it says "hey,
> >      this host does not support running such VM specified, due to XXX
> >      feature missing".
>
> This is handled in:
>
> "virtio-net: Convert feature properties to OnOffAuto"
> https://patchew.org/QEMU/20240714-auto-v3-0-e27401aabab3@daynix.com/

I may be missing something, but I think "Auto" doesn't make sense to libvirt.

>
> >
> > That's the only way an user could understand what happened, and IMHO that's
> > a clean way that we stick with QEMU cmdline on defining the guest ABI,
> > while in which the machine type is the fundation of such definition, as the
> > machine type can decides many of the rest compat properties.  And that's
> > the whole point of the compat properties too (to make sure the guest ABI is
> > stable).
> >
> > If kernel breaks it easily, all compat property things that we maintain can
> > already stop making sense in general, because it didn't define the whol
> > guest ABI..
> >
> > So AFAIU that's really what we used for years, I hope I didn't overlook
> > somehting.  And maybe we don't yet need the "-platform" layer if we can
> > keep up with this rule?
>
> I think a device which cannot conform to that rule should be
> non-migratable. For example, virtio-gpu-gl does not conform to it, and
> does not support migration either.
>
> Regards,
> Akihiko Odaki
>

Thanks
Jason Wang July 29, 2024, 3:52 a.m. UTC | #17
On Fri, Jul 26, 2024 at 2:08 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote:
> > On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote:
> > > USO features of virtio-net device depend on kernel ability
> > > to support them, for backward compatibility by default the
> > > features are disabled on 8.0 and earlier.
> > >
> > > Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
> > > Signed-off-by: Andrew Melnychecnko <andrew@daynix.com>
> >
> > Looks like this patch broke migration when the VM starts on a host that has
> > USO supported, to another host that doesn't..
>
> This was always the case with all offloads. The answer at the moment is,
> don't do this.

Sometimes, it's not easy for management to know this.

For example, in the past we suffered from the removal of UFO ....

> Long term, we need to start exposing management APIs
> to discover this, and management has to disable unsupported features.
>
> --
> MST
>

Thanks
Akihiko Odaki July 29, 2024, 4:45 a.m. UTC | #18
On 2024/07/29 12:50, Jason Wang wrote:
> On Sun, Jul 28, 2024 at 11:19 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>>
>> On 2024/07/27 5:47, Peter Xu wrote:
>>> On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote:
>>>> On Fri, Jul 26, 2024 at 10:43:42AM -0400, Peter Xu wrote:
>>>>> On Fri, Jul 26, 2024 at 09:48:02AM +0100, Daniel P. Berrangé wrote:
>>>>>> On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote:
>>>>>>> On 26/07/2024 08.08, Michael S. Tsirkin wrote:
>>>>>>>> On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote:
>>>>>>>>> On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote:
>>>>>>>>>> USO features of virtio-net device depend on kernel ability
>>>>>>>>>> to support them, for backward compatibility by default the
>>>>>>>>>> features are disabled on 8.0 and earlier.
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
>>>>>>>>>> Signed-off-by: Andrew Melnychecnko <andrew@daynix.com>
>>>>>>>>>
>>>>>>>>> Looks like this patch broke migration when the VM starts on a host that has
>>>>>>>>> USO support, to another host that doesn't..
>>>>>>>>
>>>>>>>> This was always the case with all offloads. The answer at the moment is,
>>>>>>>> don't do this.
>>>>>>>
>>>>>>> May I ask for my understanding:
>>>>>>> "don't do this" = don't automatically enable/disable virtio features in QEMU
>>>>>>> depending on host kernel features, or "don't do this" = don't try to migrate
>>>>>>> between machines that have different host kernel features?
>>>>>>>
>>>>>>>> Long term, we need to start exposing management APIs
>>>>>>>> to discover this, and management has to disable unsupported features.
>>>>>>>
>>>>>>> Ack, this likely needs some treatments from the libvirt side, too.
>>>>>>
>>>>>> When QEMU automatically toggles machine type features based on host
>>>>>> kernel, relying on libvirt to then disable them again is impractical,
>>>>>> as we cannot assume that the libvirt version people are using knows about
>>>>>> newly introduced features. Even if libvirt is updated to know about
>>>>>> it, people can easily be using a previous libvirt release.
>>>>>>
>>>>>> QEMU itself needs to make the machine types do what they are there
>>>>>> to do, which is to define a stable machine ABI.
>>>>>>
>>>>>> What QEMU is missing here is a "platform ABI" concept, to encode
>>>>>> sets of features which are tied to specific platform generations.
>>>>>> As long as we don't have that we'll keep having these broken
>>>>>> migration problems from machine types dynamically changing instead
>>>>>> of providing a stable guest ABI.
>>>>>
>>>>> Any more elaboration on this idea?  Would it be easily feasible in
>>>>> implementation?
>>>>
>>>> In terms of launching QEMU I'd imagine:
>>>>
>>>>     $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args...
>>>>
>>>> Any virtual machine HW features which are tied to host kernel features
>>>> would have their defaults set based on the requested -platform. The
>>>> -machine will be fully invariant wrt the host kernel.
>>>>
>>>>>> You would have -platform help to list available platforms, and
>>>>>> corresponding QMP "query-platforms" command to list what platforms
>>>> are supported on a given host OS.
>>>>
>>>>>> Downstream distros can provide their own platform definitions
>>>> (eg "linux-rhel-9.5") if they have kernels whose feature set
>>>> diverges from upstream due to backports.
>>>>
>>>> Mgmt apps won't need to be taught about every single little QEMU
>>>> setting whose default is derived from the kernel. Individual
>>>> defaults are opaque and controlled by the requested platform.
>>>>
>>>> Live migration has clearly defined semantics, and mgmt app can
>>>> use query-platforms to validate two hosts are compatible.
>>>>
>>>> Omitting -platform should pick the very latest platform that is
>>>>>> compatible with the current host (not necessarily the latest
>>>> platform built-in to QEMU).
>>>
>>> This seems to add one more layer to maintain, and so far I don't know
>>> whether it's a must.
>>>
>>> To put it simply, can we simply rely on qemu cmdline as "the guest ABI"?  I
>>> thought it was mostly the case already, except some extremely rare
>>> outliers.
>>>
>>> When we have one host that boots up a VM using:
>>>
>>>     $QEMU1 $cmdline
>>>
>>> Then another host boots up:
>>>
>>>     $QEMU2 $cmdline -incoming XXX
>>>
>>> Then migration should succeed if $cmdline is exactly the same, and the VM
>>> can boot up all fine without errors on both sides.
>>>
>>> AFAICT this has nothing to do with what kernel is underneath, even not
>>> Linux?  I think either QEMU1 / QEMU2 has the option to fail.  But if it
>>> didn't, I thought the ABI should be guaranteed.
>>>
>>> That's why I think this is a migration violation, as 99.99% of other device
>>> properties should be following this rule.  The issue here is, we have the
>>> same virtio-net-pci cmdline on both sides in this case, but the ABI got
>>> broken.
>>>
>>> That's also why I was suggesting if the property contributes to the guest
>>> ABI, then AFAIU QEMU needs to:
>>>
>>>     - Firstly, never quietly flip any bit that affects the ABI...
>>>
>>>     - Have a default value of off, then QEMU will always allow the VM to boot
>>>       by default, while advanced users can opt-in on new features.  We can't
>>>       make this ON by default otherwise some VMs can already fail to boot,
>>
>> It may not necessarily be the case that old features are supported by
>> every system. In an extreme case, a user may migrate a VM from Linux to
>> Windows, which probably doesn't support any offloading at all. A more
>> convincing scenario is RSS offloading with eBPF; using eBPF requires a
>> privilege so we cannot assume it is always available even on the latest
>> version of Linux.
> 
> I don't get why eBPF matters here. It is something that is not noticed
> by the guest and we have a fallback anyhow.
> 
>>
>>>
>>>     - If the host doesn't support the feature while the cmdline enabled it,
>>>       it needs to fail QEMU boot rather than flipping, so that it says "hey,
>>>       this host does not support running such VM specified, due to XXX
>>>       feature missing".
>>
>> This is handled in:
>>
>> "virtio-net: Convert feature properties to OnOffAuto"
>> https://patchew.org/QEMU/20240714-auto-v3-0-e27401aabab3@daynix.com/
> 
> I may be missing something, but I think "Auto" doesn't make sense to libvirt.

The point is libvirt can explicitly set "on" to avoid the "auto" 
behavior. libvirt does not have to use the "auto" value.

libvirt can still use "auto" if desired. virDomainNetDefParseXMLDriver() 
in libvirt actually parses tristate values (libvirt uses "default" 
instead of "auto" as the mnemonic) for these features though "default" 
is currently disabled by the schema (src/conf/schemas/domaincommon.rng). 
Allowing the user to specify "default" is only a matter of editing the
schema. Of course specifying "default" will make the VM unsafe for 
migration.
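
To make the tristate concrete, here is a minimal sketch of how such a
property could be resolved at realize time. This is not the code from
that series; host_has_uso stands in for the result of the real
capability probe:

#include "qapi/error.h"   /* error_setg() */
/* OnOffAuto comes from qapi/qapi-types-common.h */

static bool uso_feature_resolve(OnOffAuto requested, bool host_has_uso,
                                Error **errp)
{
    switch (requested) {
    case ON_OFF_AUTO_ON:
        if (!host_has_uso) {
            /* fail realize instead of silently flipping the bit;
             * callers tell this apart from "off" via errp */
            error_setg(errp, "USO is not supported by this host");
            return false;
        }
        return true;
    case ON_OFF_AUTO_OFF:
        return false;
    case ON_OFF_AUTO_AUTO:
    default:
        return host_has_uso;   /* follow the host */
    }
}

With "on" and "off" the guest ABI is fixed by the cmdline alone; only
"auto" makes it depend on the host.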

Regards,
Akihiko Odaki
Peter Xu July 29, 2024, 2:29 p.m. UTC | #19
On Mon, Jul 29, 2024 at 01:45:12PM +0900, Akihiko Odaki wrote:
> On 2024/07/29 12:50, Jason Wang wrote:
> > On Sun, Jul 28, 2024 at 11:19 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
> > > 
> > > On 2024/07/27 5:47, Peter Xu wrote:
> > > > On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote:
> > > > > On Fri, Jul 26, 2024 at 10:43:42AM -0400, Peter Xu wrote:
> > > > > > On Fri, Jul 26, 2024 at 09:48:02AM +0100, Daniel P. Berrangé wrote:
> > > > > > > On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote:
> > > > > > > > On 26/07/2024 08.08, Michael S. Tsirkin wrote:
> > > > > > > > > On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote:
> > > > > > > > > > On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote:
> > > > > > > > > > > USO features of virtio-net device depend on kernel ability
> > > > > > > > > > > to support them, for backward compatibility by default the
> > > > > > > > > > > features are disabled on 8.0 and earlier.
> > > > > > > > > > > 
> > > > > > > > > > > Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
> > > > > > > > > > > Signed-off-by: Andrew Melnychecnko <andrew@daynix.com>
> > > > > > > > > > 
> > > > > > > > > > Looks like this patch broke migration when the VM starts on a host that has
> > > > > > > > > > USO support, to another host that doesn't..
> > > > > > > > > 
> > > > > > > > > This was always the case with all offloads. The answer at the moment is,
> > > > > > > > > don't do this.
> > > > > > > > 
> > > > > > > > May I ask for my understanding:
> > > > > > > > "don't do this" = don't automatically enable/disable virtio features in QEMU
> > > > > > > > depending on host kernel features, or "don't do this" = don't try to migrate
> > > > > > > > between machines that have different host kernel features?
> > > > > > > > 
> > > > > > > > > Long term, we need to start exposing management APIs
> > > > > > > > > to discover this, and management has to disable unsupported features.
> > > > > > > > 
> > > > > > > > Ack, this likely needs some treatments from the libvirt side, too.
> > > > > > > 
> > > > > > > When QEMU automatically toggles machine type features based on host
> > > > > > > kernel, relying on libvirt to then disable them again is impractical,
> > > > > > > as we cannot assume that the libvirt version people are using knows about
> > > > > > > newly introduced features. Even if libvirt is updated to know about
> > > > > > > it, people can easily be using a previous libvirt release.
> > > > > > >
> > > > > > > QEMU itself needs to make the machine types do what they are there
> > > > > > > to do, which is to define a stable machine ABI.
> > > > > > > 
> > > > > > > What QEMU is missing here is a "platform ABI" concept, to encode
> > > > > > > sets of features which are tied to specific platform generations.
> > > > > > > As long as we don't have that we'll keep having these broken
> > > > > > > migration problems from machine types dynamically changing instead
> > > > > > > of providing a stable guest ABI.
> > > > > > 
> > > > > > Any more elaboration on this idea?  Would it be easily feasible in
> > > > > > implementation?
> > > > > 
> > > > > In terms of launching QEMU I'd imagine:
> > > > > 
> > > > >     $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args...
> > > > > 
> > > > > Any virtual machine HW features which are tied to host kernel features
> > > > > would have their defaults set based on the requested -platform. The
> > > > > -machine will be fully invariant wrt the host kernel.
> > > > > 
> > > > > You would have -platform help to list available platforms, and
> > > > > corresponding QMP "query-platforms" command to list what platforms
> > > > > are supported on a given host OS.
> > > > > 
> > > > > Downstream distros can provide their own platform definitions
> > > > > (eg "linux-rhel-9.5") if they have kernels whose feature set
> > > > > diverges from upstream due to backports.
> > > > > 
> > > > > Mgmt apps won't need to be taught about every single little QEMU
> > > > > setting whose default is derived from the kernel. Individual
> > > > > defaults are opaque and controlled by the requested platform.
> > > > > 
> > > > > Live migration has clearly defined semantics, and mgmt app can
> > > > > use query-platforms to validate two hosts are compatible.
> > > > > 
> > > > > Omitting -platform should pick the very latest platform that is
> > > > > compatible with the current host (not necessarily the latest
> > > > > platform built-in to QEMU).
> > > > 
> > > > This seems to add one more layer to maintain, and so far I don't know
> > > > whether it's a must.
> > > > 
> > > > To put it simply, can we simply rely on qemu cmdline as "the guest ABI"?  I
> > > > thought it was mostly the case already, except some extremely rare
> > > > outliers.
> > > > 
> > > > When we have one host that boots up a VM using:
> > > > 
> > > >     $QEMU1 $cmdline
> > > > 
> > > > Then another host boots up:
> > > > 
> > > >     $QEMU2 $cmdline -incoming XXX
> > > > 
> > > > Then migration should succeed if $cmdline is exactly the same, and the VM
> > > > can boot up all fine without errors on both sides.
> > > > 
> > > > AFAICT this has nothing to do with what kernel is underneath, even not
> > > > Linux?  I think either QEMU1 / QEMU2 has the option to fail.  But if it
> > > > didn't, I thought the ABI should be guaranteed.
> > > > 
> > > > That's why I think this is a migration violation, as 99.99% of other device
> > > > properties should be following this rule.  The issue here is, we have the
> > > > same virtio-net-pci cmdline on both sides in this case, but the ABI got
> > > > broken.
> > > > 
> > > > That's also why I was suggesting if the property contributes to the guest
> > > > ABI, then AFAIU QEMU needs to:
> > > > 
> > > >      - Firstly, never quietly flip any bit that affects the ABI...
> > > > 
> > > >     - Have a default value of off, then QEMU will always allow the VM to boot
> > > >       by default, while advanced users can opt-in on new features.  We can't
> > > >       make this ON by default otherwise some VMs can already fail to boot,
> > > 
> > > It may not necessarily be the case that old features are supported by
> > > every system. In an extreme case, a user may migrate a VM from Linux to
> > > Windows, which probably doesn't support any offloading at all. A more
> > > convincing scenario is RSS offloading with eBPF; using eBPF requires a
> > > privilege so we cannot assume it is always available even on the latest
> > > version of Linux.
> > 
> > I don't get why eBPF matters here. It is something that is not noticed
> > by the guest and we have a fallback anyhow.
> > 
> > > 
> > > > 
> > > >     - If the host doesn't support the feature while the cmdline enabled it,
> > > >       it needs to fail QEMU boot rather than flipping, so that it says "hey,
> > > >       this host does not support running such VM specified, due to XXX
> > > >       feature missing".
> > > 
> > > This is handled in:
> > > 
> > > "virtio-net: Convert feature properties to OnOffAuto"
> > > https://patchew.org/QEMU/20240714-auto-v3-0-e27401aabab3@daynix.com/
> > 
> > I may be missing something, but I think "Auto" doesn't make sense to libvirt.
> 
> The point is libvirt can explicitly set "on" to avoid the "auto" behavior.
> libvirt does not have to use the "auto" value.
> 
> libvirt can still use "auto" if desired. virDomainNetDefParseXMLDriver() in
> libvirt actually parses tristate values (libvirt uses "default" instead of
> "auto" as the mnemonic) for these features though "default" is currently
> disabled by the schema (src/conf/schemas/domaincommon.rng). Allowing the user to
> specify "default" is only a matter of editing the schema. Of course
> specifying "default" will make the VM unsafe for migration.

Isn't keeping the default AUTO the same as before when it used to be ON?  I
mean, AUTO in a qemu cmdline doesn't guarantee the guest ABI either.

Indeed it looks like it's a step forward to make ON have the clear
semantics of "fail when unsupported".  It's just that I am not sure how
useful AUTO is here, because anyway we'll need to break ON semantics even
with AUTO, so that an old QEMU script with USO=ON used to boot on old
kernels but now it won't.

What I was trying to say is whether we should make the default parameter
migratable.  IOW, it looks to me like AUTO deserves a migration
blocker when chosen.
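
As a rough sketch of that idea (the field and function names here are
invented; only migrate_add_blocker() from migration/blocker.h is the
real API):

#include "qapi/error.h"
#include "migration/blocker.h"

/* Called once a feature requested as AUTO has been resolved against
 * the host: the VM keeps running, but refuses to migrate. The caller
 * owns the Error* storage, e.g. a field in the device state. */
static int uso_auto_add_blocker(Error **blocker, Error **errp)
{
    error_setg(blocker,
               "virtio-net: USO 'auto' was resolved against the host "
               "kernel; the resulting guest ABI is not migration-safe");
    return migrate_add_blocker(blocker, errp);
}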

After all, libvirt hopefully shouldn't use AUTO at all but only ON/OFF,
while any user who doesn't care much about these perf details should always use
OFF for any kernel-dependent features that may affect the guest ABI.

Thanks,
Daniel P. Berrangé July 29, 2024, 3:58 p.m. UTC | #20
On Fri, Jul 26, 2024 at 04:47:40PM -0400, Peter Xu wrote:
> On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote:
> > 
> > In terms of launching QEMU I'd imagine:
> > 
> >   $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args...
> > 
> > Any virtual machine HW features which are tied to host kernel features
> > would have their defaults set based on the requested -platform. The
> > -machine will be fully invariant wrt the host kernel.
> > 
> > You would have -platform help to list available platforms, and
> > corresponding QMP "query-platforms" command to list what platforms
> > are supported on a given host OS.
> > 
> > Downstream distros can provide their own platform definitions
> > (eg "linux-rhel-9.5") if they have kernels whose feature set
> > diverges from upstream due to backports.
> > 
> > Mgmt apps won't need to be taught about every single little QEMU
> > setting whose default is derived from the kernel. Individual
> > defaults are opaque and controlled by the requested platform.
> > 
> > Live migration has clearly defined semantics, and mgmt app can
> > use query-platforms to validate two hosts are compatible.
> > 
> > Omitting -platform should pick the very latest platform that is
> > compatible with the current host (not necessarily the latest
> > platform built-in to QEMU).
> 
> This seems to add one more layer to maintain, and so far I don't know
> whether it's a must.
> 
> To put it simply, can we simply rely on qemu cmdline as "the guest ABI"?  I
> thought it was mostly the case already, except some extremely rare
> outliers.
> 
> When we have one host that boots up a VM using:
> 
>   $QEMU1 $cmdline
> 
> Then another host boots up:
> 
>   $QEMU2 $cmdline -incoming XXX
> 
> Then migration should succeed if $cmdline is exactly the same, and the VM
> can boot up all fine without errors on both sides.
> 
> AFAICT this has nothing to do with what kernel is underneath, even not
> Linux?  I think either QEMU1 / QEMU2 has the option to fail.  But if it
> didn't, I thought the ABI should be guaranteed.

We've got two mutually conflicting goals with the machine type
definitions.

Primarily we use them to ensure stable ABI, but an important
secondary goal is to enable new tunables to have new defaults
set, without having to update every mgmt app.  The latter
works very well when the defaults have no dependency on the
platform kernel/OS, but breaks migration when they do have a
platform dependency.

>   - Firstly, never quietly flip any bit that affects the ABI...
> 
>   - Have a default value of off, then QEMU will always allow the VM to boot
>     by default, while advanced users can opt-in on new features.  We can't
>     make this ON by default otherwise some VMs can already fail to boot,
> 
>   - If the host doesn't support the feature while the cmdline enabled it,
>     it needs to fail QEMU boot rather than flipping, so that it says "hey,
>     this host does not support running such VM specified, due to XXX
>     feature missing".
> 
> That's the only way a user could understand what happened, and IMHO that's
> a clean way that we stick with QEMU cmdline on defining the guest ABI,
> while in which the machine type is the foundation of such definition, as the
> machine type can decide many of the rest of the compat properties.  And that's
> the whole point of the compat properties too (to make sure the guest ABI is
> stable).
> 
> If kernel breaks it easily, all compat property things that we maintain can
> already stop making sense in general, because it didn't define the whole
> guest ABI..
> 
> So AFAIU that's really what we used for years, I hope I didn't overlook
> something.  And maybe we don't yet need the "-platform" layer if we can
> keep up with this rule?

We've failed at this for years wrt enabling use of new defaults that have
a platform dependency, so historical practice isn't a good reference.

There are 100's (possibly 1000's) of tunables set implicitly as part of
the machine type, and of those, libvirt likely only exposes a few 10's
of tunables. The vast majority are low level details that no mgmt app
wants to know about; they just want to accept QEMU's new defaults,
while preserving machine ABI. This is a good thing. No one wants the
burden of wiring up every single tunable into libvirt and mgmt apps.

This is what the "-platform" concept would be intended to preserve. It
would allow a way to enable groups of settings that have a platform level
dependency, without ever having to teach either libvirt or the mgmt apps
about the individual tunables.
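
To sketch how little new machinery this might need (illustrative only,
nothing like this exists in QEMU today), a platform could be a named
group of property defaults reusing the existing GlobalProperty type:

#include "hw/qdev-core.h"   /* GlobalProperty */

typedef struct PlatformDef {
    const char *name;        /* e.g. "linux-6.9", "linux-rhel-9.5" */
    GlobalProperty *props;   /* defaults tied to this platform */
    size_t nprops;
} PlatformDef;

static GlobalProperty platform_linux_6_9_props[] = {
    { "virtio-net-pci", "guest_uso4", "on" },
    { "virtio-net-pci", "guest_uso6", "on" },
    { "virtio-net-pci", "host_uso", "on" },
};

static const PlatformDef platform_defs[] = {
    { "linux-6.9", platform_linux_6_9_props,
      G_N_ELEMENTS(platform_linux_6_9_props) },
};

"-platform help" and "query-platforms" would then just walk this table
and filter it against what the running host supports.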


With regards,
Daniel
Akihiko Odaki July 29, 2024, 4:43 p.m. UTC | #21
On 2024/07/29 23:29, Peter Xu wrote:
> On Mon, Jul 29, 2024 at 01:45:12PM +0900, Akihiko Odaki wrote:
>> On 2024/07/29 12:50, Jason Wang wrote:
>>> On Sun, Jul 28, 2024 at 11:19 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>>>>
>>>> On 2024/07/27 5:47, Peter Xu wrote:
>>>>> On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote:
>>>>>> On Fri, Jul 26, 2024 at 10:43:42AM -0400, Peter Xu wrote:
>>>>>>> On Fri, Jul 26, 2024 at 09:48:02AM +0100, Daniel P. Berrangé wrote:
>>>>>>>> On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote:
>>>>>>>>> On 26/07/2024 08.08, Michael S. Tsirkin wrote:
>>>>>>>>>> On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote:
>>>>>>>>>>> On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote:
>>>>>>>>>>>> USO features of virtio-net device depend on kernel ability
>>>>>>>>>>>> to support them, for backward compatibility by default the
>>>>>>>>>>>> features are disabled on 8.0 and earlier.
>>>>>>>>>>>>
>>>>>>>>>>>> Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
>>>>>>>>>>>> Signed-off-by: Andrew Melnychecnko <andrew@daynix.com>
>>>>>>>>>>>
>>>>>>>>>>> Looks like this patch broke migration when the VM starts on a host that has
>>>>>>>>>>> USO support, to another host that doesn't..
>>>>>>>>>>
>>>>>>>>>> This was always the case with all offloads. The answer at the moment is,
>>>>>>>>>> don't do this.
>>>>>>>>>
>>>>>>>>> May I ask for my understanding:
>>>>>>>>> "don't do this" = don't automatically enable/disable virtio features in QEMU
>>>>>>>>> depending on host kernel features, or "don't do this" = don't try to migrate
>>>>>>>>> between machines that have different host kernel features?
>>>>>>>>>
>>>>>>>>>> Long term, we need to start exposing management APIs
>>>>>>>>>> to discover this, and management has to disable unsupported features.
>>>>>>>>>
>>>>>>>>> Ack, this likely needs some treatments from the libvirt side, too.
>>>>>>>>
>>>>>>>> When QEMU automatically toggles machine type features based on host
>>>>>>>> kernel, relying on libvirt to then disable them again is impractical,
>>>>>>>> as we cannot assume that the libvirt version people are using knows about
>>>>>>>> newly introduced features. Even if libvirt is updated to know about
>>>>>>>> it, people can easily be using a previous libvirt release.
>>>>>>>>
>>>>>>>> QEMU itself needs to make the machine types do what they are there
>>>>>>>> to do, which is to define a stable machine ABI.
>>>>>>>>
>>>>>>>> What QEMU is missing here is a "platform ABI" concept, to encode
>>>>>>>> sets of features which are tied to specific platform generations.
>>>>>>>> As long as we don't have that we'll keep having these broken
>>>>>>>> migration problems from machine types dynamically changing instead
>>>>>>>> of providing a stable guest ABI.
>>>>>>>
>>>>>>> Any more elaboration on this idea?  Would it be easily feasible in
>>>>>>> implementation?
>>>>>>
>>>>>> In terms of launching QEMU I'd imagine:
>>>>>>
>>>>>>      $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args...
>>>>>>
>>>>>> Any virtual machine HW features which are tied to host kernel features
>>>>>> would have their defaults set based on the requested -platform. The
>>>>>> -machine will be fully invariant wrt the host kernel.
>>>>>>
>>>>>> You would have -platform help to list available platforms, and
>>>>>> corresponding QMP "query-platforms" command to list what platforms
>>>>>> are supported on a given host OS.
>>>>>>
>>>>>> Downstream distros can provide their own platform definitions
>>>>>> (eg "linux-rhel-9.5") if they have kernels whose feature set
>>>>>> diverges from upstream due to backports.
>>>>>>
>>>>>> Mgmt apps won't need to be taught about every single little QEMU
>>>>>> setting whose default is derived from the kernel. Individual
>>>>>> defaults are opaque and controlled by the requested platform.
>>>>>>
>>>>>> Live migration has clearly defined semantics, and mgmt app can
>>>>>> use query-platforms to validate two hosts are compatible.
>>>>>>
>>>>>> Omitting -platform should pick the very latest platform that is
>>>>>> compatible with the current host (not necessarily the latest
>>>>>> platform built-in to QEMU).
>>>>>
>>>>> This seems to add one more layer to maintain, and so far I don't know
>>>>> whether it's a must.
>>>>>
>>>>> To put it simply, can we simply rely on qemu cmdline as "the guest ABI"?  I
>>>>> thought it was mostly the case already, except some extremely rare
>>>>> outliers.
>>>>>
>>>>> When we have one host that boots up a VM using:
>>>>>
>>>>>      $QEMU1 $cmdline
>>>>>
>>>>> Then another host boots up:
>>>>>
>>>>>      $QEMU2 $cmdline -incoming XXX
>>>>>
>>>>> Then migration should succeed if $cmdline is exactly the same, and the VM
>>>>> can boot up all fine without errors on both sides.
>>>>>
>>>>> AFAICT this has nothing to do with what kernel is underneath, even not
>>>>> Linux?  I think either QEMU1 / QEMU2 has the option to fail.  But if it
>>>>> didn't, I thought the ABI should be guaranteed.
>>>>>
>>>>> That's why I think this is a migration violation, as 99.99% of other device
>>>>> properties should be following this rule.  The issue here is, we have the
>>>>> same virtio-net-pci cmdline on both sides in this case, but the ABI got
>>>>> broken.
>>>>>
>>>>> That's also why I was suggesting if the property contributes to the guest
>>>>> ABI, then AFAIU QEMU needs to:
>>>>>
>>>>>      - Firstly, never quietly flip any bit that affects the ABI...
>>>>>
>>>>>      - Have a default value of off, then QEMU will always allow the VM to boot
>>>>>        by default, while advanced users can opt-in on new features.  We can't
>>>>>        make this ON by default otherwise some VMs can already fail to boot,
>>>>
>>>> It may not necessarily be the case that old features are supported by
>>>> every system. In an extreme case, a user may migrate a VM from Linux to
>>>> Windows, which probably doesn't support any offloading at all. A more
>>>> convincing scenario is RSS offloading with eBPF; using eBPF requires a
>>>> privilege so we cannot assume it is always available even on the latest
>>>> version of Linux.
>>>
>>> I don't get why eBPF matters here. It is something that is not noticed
>>> by the guest and we have a fallback anyhow.

It is noticeable to the guest, and the fallback is not effective with
vhost. Enabling RSS by default will result in a similar problem, although
RSS is older than USO.

>>>
>>>>
>>>>>
>>>>>      - If the host doesn't support the feature while the cmdline enabled it,
>>>>>        it needs to fail QEMU boot rather than flipping, so that it says "hey,
>>>>>        this host does not support running such VM specified, due to XXX
>>>>>        feature missing".
>>>>
>>>> This is handled in:
>>>>
>>>> "virtio-net: Convert feature properties to OnOffAuto"
>>>> https://patchew.org/QEMU/20240714-auto-v3-0-e27401aabab3@daynix.com/
>>>
>>> I may be missing something, but I think "Auto" doesn't make sense to libvirt.
>>
>> The point is libvirt can explicitly set "on" to avoid the "auto" behavior.
>> libvirt does not have to use the "auto" value.
>>
>> libvirt can still use "auto" if desired. virDomainNetDefParseXMLDriver() in
>> libvirt actually parses tristate values (libvirt uses "default" instead of
>> "auto" as the mnemonic) for these features though "default" is currently
>> disabled by the schema (src/conf/schemas/domaincommon.rng). Allowing the user to
>> specify "default" is only a matter of editing the schema. Of course
>> specifying "default" will make the VM unsafe for migration.
> 
> Isn't keeping the default AUTO the same as before when it used to be ON?  I
> mean, AUTO in a qemu cmdline doesn't guarantee the guest ABI either.

True. It only deals with the situation where "the host doesn't support
the feature while the cmdline enabled it".

> 
> Indeed it looks like it's a step forward to make ON have the clear
> semantics of "fail when unsupported".  It's just that I am not sure how
> useful AUTO is here, because anyway we'll need to break ON semantics even
> with AUTO, so that an old QEMU script with USO=ON used to boot on old
> kernels but now it won't.
>
> What I was trying to say is whether we should make the default parameter
> migratable.  IOW, it looks to me like AUTO deserves a migration
> blocker when chosen.
> 
> After all, libvirt hopefully shouldn't use AUTO at all but only ON/OFF,
> while any user who doesn't care much about these perf details should always use
> OFF for any kernel-dependent features that may affect the guest ABI.

Well, there should be libvirt users who care about performance and do not use
migration, so it's better for them if libvirt can use auto. But the use
of "auto" should be mutually exclusive with migration, of course.

Regards,
Akihiko Odaki
Peter Xu July 29, 2024, 5 p.m. UTC | #22
On Mon, Jul 29, 2024 at 04:58:03PM +0100, Daniel P. Berrangé wrote:
> On Fri, Jul 26, 2024 at 04:47:40PM -0400, Peter Xu wrote:
> > On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote:
> > > 
> > > In terms of launching QEMU I'd imagine:
> > > 
> > >   $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args...
> > > 
> > > Any virtual machine HW features which are tied to host kernel features
> > > would have their defaults set based on the requested -platform. The
> > > -machine will be fully invariant wrt the host kernel.
> > > 
> > > You would have -platform help to list available platforms, and
> > > corresponding QMP "query-platforms" command to list what platforms
> > > are supported on a given host OS.
> > > 
> > > Downstream distros can provide their own platform definitions
> > > (eg "linux-rhel-9.5") if they have kernels whose feature set
> > > diverges from upstream due to backports.
> > > 
> > > Mgmt apps won't need to be taught about every single little QEMU
> > > setting whose default is derived from the kernel. Individual
> > > defaults are opaque and controlled by the requested platform.
> > > 
> > > Live migration has clearly defined semantics, and mgmt app can
> > > use query-platforms to validate two hosts are compatible.
> > > 
> > > Omitting -platform should pick the very latest platform that is
> > > compatible with the current host (not necessarily the latest
> > > platform built-in to QEMU).
> > 
> > This seems to add one more layer to maintain, and so far I don't know
> > whether it's a must.
> > 
> > To put it simply, can we simply rely on qemu cmdline as "the guest ABI"?  I
> > thought it was mostly the case already, except some extremely rare
> > outliers.
> > 
> > When we have one host that boots up a VM using:
> > 
> >   $QEMU1 $cmdline
> > 
> > Then another host boots up:
> > 
> >   $QEMU2 $cmdline -incoming XXX
> > 
> > Then migration should succeed if $cmdline is exactly the same, and the VM
> > can boot up all fine without errors on both sides.
> > 
> > AFAICT this has nothing to do with what kernel is underneath, even not
> > Linux?  I think either QEMU1 / QEMU2 has the option to fail.  But if it
> > didn't, I thought the ABI should be guaranteed.
> 
> We've got two mutually conflicting goals with the machine type
> definitions.
> 
> Primarily we use them to ensure stable ABI, but an important
> secondary goal is to enable new tunables to have new defaults
> set, without having to update every mgmt app.  The latter
> works very well when the defaults have no dependency on the
> platform kernel/OS, but breaks migration when they do have a
> platform dependency.
> 
> >   - Firstly, never quietly flip any bit that affects the ABI...
> > 
> >   - Have a default value of off, then QEMU will always allow the VM to boot
> >     by default, while advanced users can opt-in on new features.  We can't
> >     make this ON by default otherwise some VMs can already fail to boot,
> > 
> >   - If the host doesn't support the feature while the cmdline enabled it,
> >     it needs to fail QEMU boot rather than flipping, so that it says "hey,
> >     this host does not support running such VM specified, due to XXX
> >     feature missing".
> > 
> > That's the only way a user could understand what happened, and IMHO that's
> > a clean way that we stick with QEMU cmdline on defining the guest ABI,
> > while in which the machine type is the foundation of such definition, as the
> > machine type can decide many of the rest of the compat properties.  And that's
> > the whole point of the compat properties too (to make sure the guest ABI is
> > stable).
> > 
> > If kernel breaks it easily, all compat property things that we maintain can
> > already stop making sense in general, because it didn't define the whole
> > guest ABI..
> > 
> > So AFAIU that's really what we used for years, I hope I didn't overlook
> > something.  And maybe we don't yet need the "-platform" layer if we can
> > keep up with this rule?
> 
> We've failed at this for years wrt enabling use of new defaults that have
> a platform dependency, so historical practice isn't a good reference.
> 
> There are 100's (possibly 1000's) of tunables set implicitly as part of
> the machine type, and of those, libvirt likely only exposes a few 10's
> of tunables. The vast majority are low level details that no mgmt app
> wants to know about; they just want to accept QEMU's new defaults,
> while preserving machine ABI. This is a good thing. No one wants the
> burden of wiring up every single tunable into libvirt and mgmt apps.
> 
> This is what the "-platform" concept would be intended to preserve. It
> would allow a way to enable groups of settings that have a platform level
> dependency, without ever having to teach either libvirt or the mgmt apps
> about the individual tunables.

Do you think we can achieve a similar goal by simply turning the feature to
ON only after a few QEMU releases?  I also mentioned that idea below.

https://lore.kernel.org/r/ZqQNKZ9_OPhDq2AK@x1n

So far it really sounds to me like the right thing to do to fix all similar
issues, even without introducing anything new we need to maintain.

To put that again, what we need to do is this:

  - To start: we should NEVER flip any guest-ABI-relevant bits
    automatically in QEMU, for sure..

  - When introducing any new device feature that may both (1) affect the
    guest ABI, and (2) depend on host kernel features, we set those default
    values to OFF always at start. So this already covers old machine
    types, no compat property needed so far.

  - We always fail hard on QEMU boot whenever we detect such a property is
    not supported by the current host when set to ON (and since it's OFF by
    default, it must be that the user explicitly specified ON).

  - After a stabilized period of time that lets the new feature land in most
    kernels (we may consider looking at how major Linux distros update their
    kernel versions), when we're pretty sure the new feature should be
    available to most modern QEMU users, we add a patch to make the
    property default ON on the new machine type, and add a compat property
    for old machines.

The last bullet also means we'll start to fail the new machine type from
booting when running that very new QEMU on a very old kernel, but that's
the trade-off, and when doing it right on "stabilizing the feature in the
kernel world", it should really be a corner case.  The user should simply
invoke an old machine type on that old kernel, even if the qemu is new.
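
In code, that flip would look like the usual default-plus-compat dance
(purely illustrative; the version number and the OnOffAuto field are
made up):

#include "hw/qdev-properties.h"

static Property virtio_net_feature_properties[] = {
    /* new machine types get ON by default... */
    DEFINE_PROP_ON_OFF_AUTO("guest_uso4", VirtIONet, guest_uso4,
                            ON_OFF_AUTO_ON),
    DEFINE_PROP_END_OF_LIST(),
};

/* ...while a compat entry such as
 *     { TYPE_VIRTIO_NET, "guest_uso4", "off" }
 * in the previous release's hw_compat table pins old machine types to
 * the old default, exactly like any other compat property. */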

Thanks,
Akihiko Odaki July 29, 2024, 5:02 p.m. UTC | #23
On 2024/07/30 0:58, Daniel P. Berrangé wrote:
> On Fri, Jul 26, 2024 at 04:47:40PM -0400, Peter Xu wrote:
>> On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote:
>>>
>>> In terms of launching QEMU I'd imagine:
>>>
>>>    $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args...
>>>
>>> Any virtual machine HW features which are tied to host kernel features
>>> would have their defaults set based on the requested -platform. The
>>> -machine will be fully invariant wrt the host kernel.
>>>
>>> You would have -platform help to list available platforms, and
>>> corresponding QMP "query-platforms" command to list what platforms
>>> are supported on a given host OS.
>>>
>>> Downstream distros can provide their own platform definitions
>>> (eg "linux-rhel-9.5") if they have kernels whose feature set
>>> diverges from upstream due to backports.
>>>
>>> Mgmt apps won't need to be taught about every single little QEMU
>>> setting whose default is derived from the kernel. Individual
>>> defaults are opaque and controlled by the requested platform.
>>>
>>> Live migration has clearly defined semantics, and mgmt app can
>>> use query-platforms to validate two hosts are compatible.
>>>
>>> Omitting -platform should pick the very latest platform that is
>>> compatible with the current host (not necessarily the latest
>>> platform built-in to QEMU).
>>
>> This seems to add one more layer to maintain, and so far I don't know
>> whether it's a must.
>>
>> To put it simply, can we simply rely on qemu cmdline as "the guest ABI"?  I
>> thought it was mostly the case already, except some extremely rare
>> outliers.
>>
>> When we have one host that boots up a VM using:
>>
>>    $QEMU1 $cmdline
>>
>> Then another host boots up:
>>
>>    $QEMU2 $cmdline -incoming XXX
>>
>> Then migration should succeed if $cmdline is exactly the same, and the VM
>> can boot up all fine without errors on both sides.
>>
>> AFAICT this has nothing to do with what kernel is underneath, even not
>> Linux?  I think either QEMU1 / QEMU2 has the option to fail.  But if it
>> didn't, I thought the ABI should be guaranteed.
> 
> We've got two mutually conflicting goals with the machine type
> definitions.
> 
> Primarily we use them to ensure stable ABI, but an important
> secondary goal is to enable new tunables to have new defaults
> set, without having to update every mgmt app.  The latter
> works very well when the defaults have no dependency on the
> platform kernel/OS, but breaks migration when they do have a
> platform dependency.
> 
>>    - Firstly, never quietly flip any bit that affects the ABI...
>>
>>    - Have a default value of off, then QEMU will always allow the VM to boot
>>      by default, while advanced users can opt-in on new features.  We can't
>>      make this ON by default otherwise some VMs can already fail to boot,
>>
>>    - If the host doesn't support the feature while the cmdline enabled it,
>>      it needs to fail QEMU boot rather than flipping, so that it says "hey,
>>      this host does not support running such VM specified, due to XXX
>>      feature missing".
>>
>> That's the only way a user could understand what happened, and IMHO that's
>> a clean way that we stick with QEMU cmdline on defining the guest ABI,
>> while in which the machine type is the foundation of such definition, as the
>> machine type can decide many of the rest of the compat properties.  And that's
>> the whole point of the compat properties too (to make sure the guest ABI is
>> stable).
>>
>> If kernel breaks it easily, all compat property things that we maintain can
>> already stop making sense in general, because it didn't define the whole
>> guest ABI..
>>
>> So AFAIU that's really what we used for years, I hope I didn't overlook
>> something.  And maybe we don't yet need the "-platform" layer if we can
>> keep up with this rule?
> 
> We've failed at this for years wrt enabling use of new defaults that have
> a platform dependency, so historical practice isn't a good reference.
> 
> There are 100's (possibly 1000's) of tunables set implicitly as part of
> the machine type, and of those, libvirt likely only exposes a few 10's
> of tunables. The vast majority are low level details that no mgmt app
> wants to know about; they just want to accept QEMU's new defaults,
> while preserving machine ABI. This is a good thing. No one wants the
> burden of wiring up every single tunable into libvirt and mgmt apps.
> 
> This is what the "-platform" concept would be intended to preserve. It
> would allow a way to enable groups of settings that have a platform level
> dependency, without ever having to teach either libvirt or the mgmt apps
> about the individual tunables.

The concept of -platform will certainly reduce the number of tunables, 
but I'm a bit worried that such platform definitions can still have too 
much variety.

The variety of kernels is one; a downstream distro can have
linux-rhel-9.5 or something as you suggested, but it is still a chore. 
Some features like eBPF may need privilege. Others may depend on 
hardware features.

I think it is simpler to analyze the platform dependency and dump it for 
the management layer. For example, libvirt can request QEMU to analyze 
the platform dependency when it creates a new domain. QEMU will then 
figure out that the host kernel is capable of USO and record that as a
platform dependency.
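
A very rough sketch of that flow (nothing like this exists; all names
here are invented):

#include <stdbool.h>
#include <stdio.h>

/* What the resolved configuration ended up depending on, recorded at
 * domain creation so management can match hosts before migrating. */
typedef struct PlatformDeps {
    bool needs_uso;       /* tap accepted TUN_F_USO4/TUN_F_USO6 */
    bool needs_ebpf_rss;  /* the eBPF RSS program could be loaded */
} PlatformDeps;

static void platform_deps_print(const PlatformDeps *deps, FILE *out)
{
    fprintf(out, "uso=%s ebpf-rss=%s\n",
            deps->needs_uso ? "on" : "off",
            deps->needs_ebpf_rss ? "on" : "off");
}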

Regards,
Akihiko Odaki
Akihiko Odaki July 29, 2024, 5:23 p.m. UTC | #24
On 2024/07/30 2:00, Peter Xu wrote:
> On Mon, Jul 29, 2024 at 04:58:03PM +0100, Daniel P. Berrangé wrote:
>> On Fri, Jul 26, 2024 at 04:47:40PM -0400, Peter Xu wrote:
>>> On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote:
>>>>
>>>> In terms of launching QEMU I'd imagine:
>>>>
>>>>    $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args...
>>>>
>>>> Any virtual machine HW features which are tied to host kernel features
>>>> would have their defaults set based on the requested -platform. The
>>>> -machine will be fully invariant wrt the host kernel.
>>>>
>>>> You would have -platform help to list available platforms, and
>>>> corresponding QMP "query-platforms" command to list what platforms
>>>> are supported on a given host OS.
>>>>
>>>> Downstream distros can provide their own platform definitions
>>>> (eg "linux-rhel-9.5") if they have kernels whose feature set
>>>> diverges from upstream due to backports.
>>>>
>>>> Mgmt apps won't need to be taught about every single little QEMU
>>>> setting whose default is derived from the kernel. Individual
>>>> defaults are opaque and controlled by the requested platform.
>>>>
>>>> Live migration has clearly defined semantics, and mgmt app can
>>>> use query-platforms to validate two hosts are compatible.
>>>>
>>>> Omitting -platform should pick the very latest platform that is
>>>> compatible with the current host (not necessarily the latest
>>>> platform built-in to QEMU).
>>>
>>> This seems to add one more layer to maintain, and so far I don't know
>>> whether it's a must.
>>>
>>> To put it simply, can we simply rely on qemu cmdline as "the guest ABI"?  I
>>> thought it was mostly the case already, except some extremely rare
>>> outliers.
>>>
>>> When we have one host that boots up a VM using:
>>>
>>>    $QEMU1 $cmdline
>>>
>>> Then another host boots up:
>>>
>>>    $QEMU2 $cmdline -incoming XXX
>>>
>>> Then migration should succeed if $cmdline is exactly the same, and the VM
>>> can boot up all fine without errors on both sides.
>>>
>>> AFAICT this has nothing to do with what kernel is underneath, even not
>>> Linux?  I think either QEMU1 / QEMU2 has the option to fail.  But if it
>>> didn't, I thought the ABI should be guaranteed.
>>
>> We've got two mutually conflicting goals with the machine type
>> definitions.
>>
>> Primarily we use them to ensure stable ABI, but an important
>> secondary goal is to enable new tunables to have new defaults
>> set, without having to update every mgmt app.  The latter
>> works very well when the defaults have no dependency on the
>> platform kernel/OS, but breaks migration when they do have a
>> platform dependency.
>>
>>>    - Firstly, never quietly flip any bit that affects the ABI...
>>>
>>>    - Have a default value of off, then QEMU will always allow the VM to boot
>>>      by default, while advanced users can opt-in on new features.  We can't
>>>      make this ON by default otherwise some VMs can already fail to boot,
>>>
>>>    - If the host doesn't support the feature while the cmdline enabled it,
>>>      it needs to fail QEMU boot rather than flipping, so that it says "hey,
>>>      this host does not support running such VM specified, due to XXX
>>>      feature missing".
>>>
>>> That's the only way a user could understand what happened, and IMHO that's
>>> a clean way that we stick with QEMU cmdline on defining the guest ABI,
>>> while in which the machine type is the foundation of such definition, as the
>>> machine type can decide many of the rest of the compat properties.  And that's
>>> the whole point of the compat properties too (to make sure the guest ABI is
>>> stable).
>>>
>>> If kernel breaks it easily, all compat property things that we maintain can
>>> already stop making sense in general, because it didn't define the whole
>>> guest ABI..
>>>
>>> So AFAIU that's really what we used for years, I hope I didn't overlook
>>> something.  And maybe we don't yet need the "-platform" layer if we can
>>> keep up with this rule?
>>
>> We've failed at this for years wrt enabling use of new defaults that have
>> a platform dependency, so historical practice isn't a good reference.
>>
>> There are 100's (possibly 1000's) of tunables set implicitly as part of
>> the machine type, and of those, libvirt likely only exposes a few 10's
>> of tunables. The vast majority are low level details that no mgmt app
>> wants to know about; they just want to accept QEMU's new defaults,
>> while preserving machine ABI. This is a good thing. No one wants the
>> burden of wiring up every single tunable into libvirt and mgmt apps.
>>
>> This is what the "-platform" concept would be intended to preserve. It
>> would allow a way to enable groups of settings that have a platform level
>> dependency, without ever having to teach either libvirt or the mgmt apps
>> about the individual tunables.
> 
> Do you think we can achieve a similar goal by simply turning the feature to
> ON only after a few QEMU releases?  I also mentioned that idea below.
> 
> https://lore.kernel.org/r/ZqQNKZ9_OPhDq2AK@x1n
> 
> So far it really sounds to me like the right thing to do to fix all similar
> issues, even without introducing anything new we need to maintain.
> 
> To put that again, what we need to do is this:
> 
>    - To start: we should NEVER flip any guest-ABI-relevant bits
>      automatically in QEMU, for sure..
> 
>    - When introducing any new device feature that may both (1) affect the
>      guest ABI, and (2) depend on host kernel features, we set those default
>      values to OFF always at start. So this already covers old machine
>      types, no compat property needed so far.
> 
>    - We always fail hard on QEMU boot whenever we detect such a property is
>      not supported by the current host when set to ON (and since it's OFF by
>      default, it must be that the user explicitly specified ON).
> 
>    - After a stabilized period of time that lets the new feature land in most
>      kernels (we may consider looking at how major Linux distros update their
>      kernel versions), when we're pretty sure the new feature should be
>      available to most modern QEMU users, we add a patch to make the
>      property default ON on the new machine type, and add a compat property
>      for old machines.
> 
> The last bullet also means we'll start to fail the new machine type from
> booting when running that very new QEMU on a very old kernel, but that's
> the trade-off, and when doing it right on "stabilizing the feature in the
> kernel world", it should really be a corner case.  The user should simply
> invoke an old machine type on that old kernel, even if the qemu is new.

docs/about/build-platforms.rst already defines supported platforms. One 
of the supported platforms is Debian 11 (bullseye), and it carries Linux 
5.10, which was released in December 2020. If we follow this platform
support, a new feature added to upstream Linux may take about 4 years
before it gets enabled by default in QEMU.

As an upstream developer, I feel it is too long, but I'm sure there are 
different opinions from different perspectives.

Regards,
Akihiko Odaki
Daniel P. Berrangé July 29, 2024, 5:26 p.m. UTC | #25
On Mon, Jul 29, 2024 at 01:00:30PM -0400, Peter Xu wrote:
> On Mon, Jul 29, 2024 at 04:58:03PM +0100, Daniel P. Berrangé wrote:
> > 
> > We've got two mutually conflicting goals with the machine type
> > definitions.
> > 
> > Primarily we use them to ensure stable ABI, but an important
> > secondary goal is to enable new tunables to have new defaults
> > set, without having to update every mgmt app.  The latter
> > works very well when the defaults have no dependency on the
> > platform kernel/OS, but breaks migration when they do have a
> > platform dependency.
> > 
> > >   - Firstly, never quietly flip any bit that affects the ABI...
> > > 
> > >   - Have a default value of off, then QEMU will always allow the VM to boot
> > >     by default, while advanced users can opt-in on new features.  We can't
> > >     make this ON by default otherwise some VMs can already fail to boot,
> > > 
> > >   - If the host doesn't support the feature while the cmdline enabled it,
> > >     it needs to fail QEMU boot rather than flipping, so that it says "hey,
> > >     this host does not support running such VM specified, due to XXX
> > >     feature missing".
> > > 
> > > That's the only way a user could understand what happened, and IMHO that's
> > > a clean way that we stick with QEMU cmdline on defining the guest ABI,
> > > while in which the machine type is the foundation of such definition, as the
> > > machine type can decide many of the rest of the compat properties.  And that's
> > > the whole point of the compat properties too (to make sure the guest ABI is
> > > stable).
> > > 
> > > If kernel breaks it easily, all compat property things that we maintain can
> > > already stop making sense in general, because it didn't define the whole
> > > guest ABI..
> > > 
> > > So AFAIU that's really what we used for years, I hope I didn't overlook
> > > something.  And maybe we don't yet need the "-platform" layer if we can
> > > keep up with this rule?
> > 
> > We've failed at this for years wrt enabling use of new defaults that have
> > a platform dependency, so historical practice isn't a good reference.
> > 
> > There are 100's (possibly 1000's) of tunables set implicitly as part of
> > the machine type, and of those, libvirt likely only exposes a few 10's
> > of tunables. The vast majority are low level details that no mgmt app
> > wants to know about; they just want to accept QEMU's new defaults,
> > while preserving machine ABI. This is a good thing. No one wants the
> > burden of wiring up every single tunable into libvirt and mgmt apps.
> > 
> > This is what the "-platform" concept would be intended to preserve. It
> > would allow a way to enable groups of settings that have a platform level
> > dependency, without ever having to teach either libvirt or the mgmt apps
> > about the individual tunables.
> 
> Do you think we can achieve a similar goal by simply turning the feature to
> ON only after a few QEMU releases?  I also mentioned that idea below.
> 
> https://lore.kernel.org/r/ZqQNKZ9_OPhDq2AK@x1n
> 
> So far it really sounds to me like the right thing to do to fix all similar
> issues, even without introducing anything new we need to maintain.

Turning a feature with a platform dependency to "on" implies that
the machine type will cease to work out of the box for platforms
which lack the feature. IMHO that's not acceptable behaviour for
any of our supported platforms.

IOW, "after a few QEMU releases" implies a delay of as much as
5 years, while we wait for platforms which don't support the
feature to drop out of our supported targets list.  I don't
think that'll satisfy the desire to get the new feature
available to users as soon as practical for their particular
platform.

> 
> To put that again, what we need to do is this:
> 
>   - To start: we should NEVER flip any guest-ABI-relevant bits
>     automatically in QEMU, for sure..
> 
>   - When introducing any new device feature that may both (1) affect the
>     guest ABI, and (2) depend on host kernel features, we set those default
>     values to OFF always at start. So this already covers old machine
>     types, no compat property needed so far.
> 
>   - We always fail hard on QEMU boot whenever we detect such a property is
>     not supported by the current host when set to ON (and since it's OFF by
>     default, it must be that the user explicitly specified ON).
> 
>   - After a stabilized period of time that lets the new feature land in most
>     kernels (we may consider looking at how major Linux distros update their
>     kernel versions), when we're pretty sure the new feature should be
>     available to most modern QEMU users, we add a patch to make the
>     property default ON on the new machine type, and add a compat property
>     for old machines.

Our supported platform list determines when this will be, and given
our current criteria, this can be as long as 5 years.


With regards,
Daniel
Jason Wang July 30, 2024, 2:04 a.m. UTC | #26
On Tue, Jul 30, 2024 at 12:43 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>
> On 2024/07/29 23:29, Peter Xu wrote:
> > On Mon, Jul 29, 2024 at 01:45:12PM +0900, Akihiko Odaki wrote:
> >> On 2024/07/29 12:50, Jason Wang wrote:
> >>> On Sun, Jul 28, 2024 at 11:19 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
> >>>>
> >>>> On 2024/07/27 5:47, Peter Xu wrote:
> >>>>> On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote:
> >>>>>> On Fri, Jul 26, 2024 at 10:43:42AM -0400, Peter Xu wrote:
> >>>>>>> On Fri, Jul 26, 2024 at 09:48:02AM +0100, Daniel P. Berrangé wrote:
> >>>>>>>> On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote:
> >>>>>>>>> On 26/07/2024 08.08, Michael S. Tsirkin wrote:
> >>>>>>>>>> On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote:
> >>>>>>>>>>> On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote:
> >>>>>>>>>>>> USO features of virtio-net device depend on kernel ability
> >>>>>>>>>>>> to support them, for backward compatibility by default the
> >>>>>>>>>>>> features are disabled on 8.0 and earlier.
> >>>>>>>>>>>>
> >>>>>>>>>>>> Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
> >>>>>>>>>>>> Signed-off-by: Andrew Melnychecnko <andrew@daynix.com>
> >>>>>>>>>>>
> >>>>>>>>>>> Looks like this patch broke migration when the VM starts on a host that has
> >>>>>>>>>>> USO support, to another host that doesn't..
> >>>>>>>>>>
> >>>>>>>>>> This was always the case with all offloads. The answer at the moment is,
> >>>>>>>>>> don't do this.
> >>>>>>>>>
> >>>>>>>>> May I ask for my understanding:
> >>>>>>>>> "don't do this" = don't automatically enable/disable virtio features in QEMU
> >>>>>>>>> depending on host kernel features, or "don't do this" = don't try to migrate
> >>>>>>>>> between machines that have different host kernel features?
> >>>>>>>>>
> >>>>>>>>>> Long term, we need to start exposing management APIs
> >>>>>>>>>> to discover this, and management has to disable unsupported features.
> >>>>>>>>>
> >>>>>>>>> Ack, this likely needs some treatments from the libvirt side, too.
> >>>>>>>>
> >>>>>>>> When QEMU automatically toggles machine type features based on host
> >>>>>>>> kernel, relying on libvirt to then disable them again is impractical,
> >>>>>>>> as we cannot assume that the libvirt version people are using knows about
> >>>>>>>> newly introduced features. Even if libvirt is updated to know about
> >>>>>>>> it, people can easily be using a previous libvirt release.
> >>>>>>>>
> >>>>>>>> QEMU itself needs to make the machine types do what they are there
> >>>>>>>> to do, which is to define a stable machine ABI.
> >>>>>>>>
> >>>>>>>> What QEMU is missing here is a "platform ABI" concept, to encode
> >>>>>>>> sets of features which are tied to specific platform generations.
> >>>>>>>> As long as we don't have that we'll keep having these broken
> >>>>>>>> migration problems from machine types dynamically changing instead
> >>>>>>>> of providing a stable guest ABI.
> >>>>>>>
> >>>>>>> Any more elaboration on this idea?  Would it be easily feasible in
> >>>>>>> implementation?
> >>>>>>
> >>>>>> In terms of launching QEMU I'd imagine:
> >>>>>>
> >>>>>>      $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args...
> >>>>>>
> >>>>>> Any virtual machine HW features which are tied to host kernel features
> >>>>>> would have their defaults set based on the requested -platform. The
> >>>>>> -machine will be fully invariant wrt the host kernel.
> >>>>>>
> >>>>>> You would have -platform help to list available platforms, and
> >>>>>> corresponding QMP "query-platforms" command to list what platforms
> >>>>>> are supported on a given host OS.
> >>>>>>
> >>>>>> Downstream distros can provide their own platforms definitions
> >>>>>> (eg "linux-rhel-9.5") if they have kernels whose feature set
> >>>>>> diverges from upstream due to backports.
> >>>>>>
> >>>>>> Mgmt apps won't need to be taught about every single little QEMU
> >>>>>> setting whose default is derived from the kernel. Individual
> >>>>>> defaults are opaque and controlled by the requested platform.
> >>>>>>
> >>>>>> Live migration has clearly defined semantics, and mgmt app can
> >>>>>> use query-platforms to validate two hosts are compatible.
> >>>>>>
> >>>>>> Omitting -platform should pick the very latest platform that is
> >>>>>> cmpatible with the current host (not neccessarily the latest
> >>>>>> platform built-in to QEMU).
> >>>>>
> >>>>> This seems to add one more layer to maintain, and so far I don't know
> >>>>> whether it's a must.
> >>>>>
> >>>>> To put it simple, can we simply rely on qemu cmdline as "the guest ABI"?  I
> >>>>> thought it was mostly the case already, except some extremely rare
> >>>>> outliers.
> >>>>>
> >>>>> When we have one host that boots up a VM using:
> >>>>>
> >>>>>      $QEMU1 $cmdline
> >>>>>
> >>>>> Then another host boots up:
> >>>>>
> >>>>>      $QEMU2 $cmdline -incoming XXX
> >>>>>
> >>>>> Then migration should succeed if $cmdline is exactly the same, and the VM
> >>>>> can boot up all fine without errors on both sides.
> >>>>>
> >>>>> AFAICT this has nothing to do with what kernel is underneath, even not
> >>>>> Linux?  I think either QEMU1 / QEMU2 has the option to fail.  But if it
> >>>>> didn't, I thought the ABI should be guaranteed.
> >>>>>
> >>>>> That's why I think this is a migration violation, as 99.99% of other device
> >>>>> properties should be following this rule.  The issue here is, we have the
> >>>>> same virtio-net-pci cmdline on both sides in this case, but the ABI got
> >>>>> break.
> >>>>>
> >>>>> That's also why I was suggesting if the property contributes to the guest
> >>>>> ABI, then AFAIU QEMU needs to:
> >>>>>
> >>>>>      - Firstly, never quietly flipping any bit that affects the ABI...
> >>>>>
> >>>>>      - Have a default value of off, then QEMU will always allow the VM to boot
> >>>>>        by default, while advanced users can opt-in on new features.  We can't
> >>>>>        make this ON by default otherwise some VMs can already fail to boot,
> >>>>
> >>>> It may not be necessary the case that old features are supported by
> >>>> every systems. In an extreme case, a user may migrate a VM from Linux to
> >>>> Windows, which probably doesn't support any offloading at all. A more
> >>>> convincing scenario is RSS offloading with eBPF; using eBPF requires a
> >>>> privilege so we cannot assume it is always available even on the latest
> >>>> version of Linux.
> >>>
> >>> I don't get why eBPF matters here. It is something that is not noticed
> >>> by the guest and we have a fallback anyhow.
>
> It is noticeable for the guest, and the fallback is not effective with
> vhost.

It's a bug then. QEMU can fall back to tuntap if it sees issues in vhost.

Thanks
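
As a side note on the fallback being referred to here: a minimal
sketch, assuming the tap backend's documented "vhost" and "vhostforce"
options behave as described in the QEMU manual. With plain vhost=on, a
failure to set up vhost-net is reported as a warning and the device
keeps working through the userspace virtio path; vhostforce=on turns
that failure into a hard error instead:

    $QEMU -netdev tap,id=n0,vhost=on -device virtio-net-pci,netdev=n0
    $QEMU -netdev tap,id=n0,vhostforce=on -device virtio-net-pci,netdev=n0
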
Akihiko Odaki July 30, 2024, 2:57 a.m. UTC | #27
On 2024/07/30 11:04, Jason Wang wrote:
> [...]
>
>>>>> I don't get why eBPF matters here. It is something that is not noticed
>>>>> by the guest and we have a fallback anyhow.
>>
>> It is noticeable for the guest, and the fallback is not effective with
>> vhost.
> 
> It's a bug then. QEMU can fall back to tuntap if it sees issues in vhost.

We can certainly fall back to in-QEMU RSS by disabling vhost, but I
would not say the lack of such a fallback is a bug. We don't provide
an in-QEMU fallback for other offloads.

Regards,
Akihiko Odaki
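
For context, the in-QEMU RSS path mentioned above is the one selected
by the virtio-net-pci "rss" and "hash" properties; a minimal sketch,
assuming a multiqueue tap backend with vhost explicitly disabled so
that steering happens in the device model (or in its eBPF helper when
the privilege is available):

    $QEMU -netdev tap,id=n0,queues=4,vhost=off \
          -device virtio-net-pci,netdev=n0,mq=on,rss=on,hash=on
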
Jason Wang July 30, 2024, 3:03 a.m. UTC | #28
On Tue, Jul 30, 2024 at 10:57 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
> [...]
>
> >>>>> I don't get why eBPF matters here. It is something that is not noticed
> >>>>> by the guest and we have a fallback anyhow.
> >>
> >> It is noticeable for the guest, and the fallback is not effective with
> >> vhost.
> >
> > It's a bug then. QEMU can fall back to tuntap if it sees issues in vhost.
> 
> We can certainly fall back to in-QEMU RSS by disabling vhost, but I
> would not say the lack of such a fallback is a bug.

Such a fallback has been part of the design since the introduction of vhost.

> We don't provide an in-QEMU
> fallback for other offloads.

Yes, but what I want to say is that eBPF RSS is different from those
segmentation offloads. And technically, QEMU can provide fallbacks for
offloads (as RSC did).

Thanks

>
> Regards,
> Akihiko Odaki
>
Akihiko Odaki July 30, 2024, 3:11 a.m. UTC | #29
On 2024/07/30 12:03, Jason Wang wrote:
> On Tue, Jul 30, 2024 at 10:57 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>>
>> On 2024/07/30 11:04, Jason Wang wrote:
>>> On Tue, Jul 30, 2024 at 12:43 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>>>>
>>>> On 2024/07/29 23:29, Peter Xu wrote:
>>>>> On Mon, Jul 29, 2024 at 01:45:12PM +0900, Akihiko Odaki wrote:
>>>>>> On 2024/07/29 12:50, Jason Wang wrote:
>>>>>>> On Sun, Jul 28, 2024 at 11:19 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>>>>>>>>
>>>>>>>> On 2024/07/27 5:47, Peter Xu wrote:
>>>>>>>>> On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote:
>>>>>>>>>> On Fri, Jul 26, 2024 at 10:43:42AM -0400, Peter Xu wrote:
>>>>>>>>>>> On Fri, Jul 26, 2024 at 09:48:02AM +0100, Daniel P. Berrangé wrote:
>>>>>>>>>>>> On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote:
>>>>>>>>>>>>> On 26/07/2024 08.08, Michael S. Tsirkin wrote:
>>>>>>>>>>>>>> On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote:
>>>>>>>>>>>>>>> On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote:
>>>>>>>>>>>>>>>> USO features of virtio-net device depend on kernel ability
>>>>>>>>>>>>>>>> to support them, for backward compatibility by default the
>>>>>>>>>>>>>>>> features are disabled on 8.0 and earlier.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
>>>>>>>>>>>>>>>> Signed-off-by: Andrew Melnychecnko <andrew@daynix.com>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Looks like this patch broke migration when the VM starts on a host that has
>>>>>>>>>>>>>>> USO supported, to another host that doesn't..
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> This was always the case with all offloads. The answer at the moment is,
>>>>>>>>>>>>>> don't do this.
>>>>>>>>>>>>>
>>>>>>>>>>>>> May I ask for my understanding:
>>>>>>>>>>>>> "don't do this" = don't automatically enable/disable virtio features in QEMU
>>>>>>>>>>>>> depending on host kernel features, or "don't do this" = don't try to migrate
>>>>>>>>>>>>> between machines that have different host kernel features?
>>>>>>>>>>>>>
>>>>>>>>>>>>>> Long term, we need to start exposing management APIs
>>>>>>>>>>>>>> to discover this, and management has to disable unsupported features.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Ack, this likely needs some treatments from the libvirt side, too.
>>>>>>>>>>>>
>>>>>>>>>>>> When QEMU automatically toggles machine type featuers based on host
>>>>>>>>>>>> kernel, relying on libvirt to then disable them again is impractical,
>>>>>>>>>>>> as we cannot assume that the libvirt people are using knows about
>>>>>>>>>>>> newly introduced features. Even if libvirt is updated to know about
>>>>>>>>>>>> it, people can easily be using a previous libvirt release.
>>>>>>>>>>>>
>>>>>>>>>>>> QEMU itself needs to make the machine types do that they are there
>>>>>>>>>>>> todo, which is to define a stable machine ABI.
>>>>>>>>>>>>
>>>>>>>>>>>> What QEMU is missing here is a "platform ABI" concept, to encode
>>>>>>>>>>>> sets of features which are tied to specific platform generations.
>>>>>>>>>>>> As long as we don't have that we'll keep having these broken
>>>>>>>>>>>> migration problems from machine types dynamically changing instead
>>>>>>>>>>>> of providing a stable guest ABI.
>>>>>>>>>>>
>>>>>>>>>>> Any more elaboration on this idea?  Would it be easily feasible in
>>>>>>>>>>> implementation?
>>>>>>>>>>
>>>>>>>>>> In terms of launching QEMU I'd imagine:
>>>>>>>>>>
>>>>>>>>>>        $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args...
>>>>>>>>>>
>>>>>>>>>> Any virtual machine HW features which are tied to host kernel features
>>>>>>>>>> would have their defaults set based on the requested -platform. The
>>>>>>>>>> -machine will be fully invariant wrt the host kernel.
>>>>>>>>>>
>>>>>>>>>> You would have -platform hlep to list available platforms, and
>>>>>>>>>> corresonding QMP "query-platforms" command to list what platforms
>>>>>>>>>> are supported on a given host OS.
>>>>>>>>>>
>>>>>>>>>> Downstream distros can provide their own platforms definitions
>>>>>>>>>> (eg "linux-rhel-9.5") if they have kernels whose feature set
>>>>>>>>>> diverges from upstream due to backports.
>>>>>>>>>>
>>>>>>>>>> Mgmt apps won't need to be taught about every single little QEMU
>>>>>>>>>> setting whose default is derived from the kernel. Individual
>>>>>>>>>> defaults are opaque and controlled by the requested platform.
>>>>>>>>>>
>>>>>>>>>> Live migration has clearly defined semantics, and mgmt app can
>>>>>>>>>> use query-platforms to validate two hosts are compatible.
>>>>>>>>>>
>>>>>>>>>> Omitting -platform should pick the very latest platform that is
>>>>>>>>>> cmpatible with the current host (not neccessarily the latest
>>>>>>>>>> platform built-in to QEMU).
>>>>>>>>>
>>>>>>>>> This seems to add one more layer to maintain, and so far I don't know
>>>>>>>>> whether it's a must.
>>>>>>>>>
>>>>>>>>> To put it simple, can we simply rely on qemu cmdline as "the guest ABI"?  I
>>>>>>>>> thought it was mostly the case already, except some extremely rare
>>>>>>>>> outliers.
>>>>>>>>>
>>>>>>>>> When we have one host that boots up a VM using:
>>>>>>>>>
>>>>>>>>>        $QEMU1 $cmdline
>>>>>>>>>
>>>>>>>>> Then another host boots up:
>>>>>>>>>
>>>>>>>>>        $QEMU2 $cmdline -incoming XXX
>>>>>>>>>
>>>>>>>>> Then migration should succeed if $cmdline is exactly the same, and the VM
>>>>>>>>> can boot up all fine without errors on both sides.
>>>>>>>>>
>>>>>>>>> AFAICT this has nothing to do with what kernel is underneath, even not
>>>>>>>>> Linux?  I think either QEMU1 / QEMU2 has the option to fail.  But if it
>>>>>>>>> didn't, I thought the ABI should be guaranteed.
>>>>>>>>>
>>>>>>>>> That's why I think this is a migration violation, as 99.99% of other device
>>>>>>>>> properties should be following this rule.  The issue here is, we have the
>>>>>>>>> same virtio-net-pci cmdline on both sides in this case, but the ABI got
>>>>>>>>> break.
>>>>>>>>>
>>>>>>>>> That's also why I was suggesting if the property contributes to the guest
>>>>>>>>> ABI, then AFAIU QEMU needs to:
>>>>>>>>>
>>>>>>>>>        - Firstly, never quietly flipping any bit that affects the ABI...
>>>>>>>>>
>>>>>>>>>        - Have a default value of off, then QEMU will always allow the VM to boot
>>>>>>>>>          by default, while advanced users can opt-in on new features.  We can't
>>>>>>>>>          make this ON by default otherwise some VMs can already fail to boot,
>>>>>>>>
>>>>>>>> It may not be necessary the case that old features are supported by
>>>>>>>> every systems. In an extreme case, a user may migrate a VM from Linux to
>>>>>>>> Windows, which probably doesn't support any offloading at all. A more
>>>>>>>> convincing scenario is RSS offloading with eBPF; using eBPF requires a
>>>>>>>> privilege so we cannot assume it is always available even on the latest
>>>>>>>> version of Linux.
>>>>>>>
>>>>>>> I don't get why eBPF matters here. It is something that is not noticed
>>>>>>> by the guest and we have a fallback anyhow.
>>>>
>>>> It is noticeable for the guest, and the fallback is not effective with
>>>> vhost.
>>>
>>> It's a bug then. QEMU can fall back to tuntap if it sees issues in vhost.
>>
>> We can certainly fall back to in-QEMU RSS by disabling vhost, but I
>> would not say the lack of such a fallback is a bug.
> 
> Such a fallback has been part of the design since the introduction of vhost.
> 
>> We don't provide an in-QEMU
>> fallback for other offloads.
> 
> Yes, but what I want to say is that eBPF RSS is different from those
> segmentation offloads. And technically, QEMU can provide fallbacks for
> offloads (as RSC did).

Well, I couldn't find any code disabling vhost for the in-QEMU RSC 
implementation.

Looking at the code, I also found the case of vhost-vdpa. vhost can be
simply disabled if it is backed by tuntap, but that is not the case for vDPA.

Regards,
Akihiko Odaki
Jason Wang July 30, 2024, 3:17 a.m. UTC | #30
On Tue, Jul 30, 2024 at 11:12 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
> [...]
>
> >>>>>>> I don't get why eBPF matters here. It is something that is not noticed
> >>>>>>> by the guest and we have a fallback anyhow.
> >>>>
> >>>> It is noticeable for the guest, and the fallback is not effective with
> >>>> vhost.
> >>>
> >>> It's a bug then. QEMU can fall back to tuntap if it sees issues in vhost.
> >>
> >> We can certainly fall back to in-QEMU RSS by disabling vhost, but I
> >> would not say the lack of such a fallback is a bug.
> >
> > Such a fallback has been part of the design since the introduction of vhost.
> >
> >> We don't provide an in-QEMU
> >> fallback for other offloads.
> >
> > Yes, but what I want to say is that eBPF RSS is different from those
> > segmentation offloads. And technically, QEMU can provide fallbacks for
> > offloads (as RSC did).
>
> Well, I couldn't find any code disabling vhost for the in-QEMU RSC
> implementation.

It should be a bug (and I remember we disabled vhost when the patches
were merged). Have you tested it in a guest to see if it can see RSC
when vhost is enabled?

I suspect we need to add the RSC bit to the current kernel_feature_bits:

/* Features supported by host kernel. */
static const int kernel_feature_bits[] = {
    VIRTIO_F_NOTIFY_ON_EMPTY,
    VIRTIO_RING_F_INDIRECT_DESC,
    VIRTIO_RING_F_EVENT_IDX,
    VIRTIO_NET_F_MRG_RXBUF,
    VIRTIO_F_VERSION_1,
    VIRTIO_NET_F_MTU,
    VIRTIO_F_IOMMU_PLATFORM,
    VIRTIO_F_RING_PACKED,
    VIRTIO_F_RING_RESET,
    VIRTIO_NET_F_HASH_REPORT,
    VHOST_INVALID_FEATURE_BIT
};

As RSC won't be provided by TUN/TAP anyhow.
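
A sketch of that suggestion, assuming VIRTIO_NET_F_RSC_EXT is the
feature-bit macro in question; listing the bit here would make vhost
feature negotiation mask it out rather than advertise an offload the
kernel backend cannot provide:

@@ static const int kernel_feature_bits[] = {
     VIRTIO_NET_F_HASH_REPORT,
+    /* tuntap never implements RSC, so never offer it with vhost */
+    VIRTIO_NET_F_RSC_EXT,
     VHOST_INVALID_FEATURE_BIT
 };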

>
> Looking at the code, I also found the case of vhost-vdpa. vhost can be
> simply disabled if it is backed by tuntap, but that is not the case for vDPA.

True, technically, vDPA can fall back to SVQ, but it's another topic.

Thanks

>
> Regards,
> Akihiko Odaki
>
Akihiko Odaki July 30, 2024, 3:28 a.m. UTC | #31
On 2024/07/30 12:17, Jason Wang wrote:
> [...]
>
>> Well, I couldn't find any code disabling vhost for the in-QEMU RSC
>> implementation.
> 
> It should be a bug (and I remember we disabled vhost when the patches
> were merged). Have you tested it in a guest to see if it can see RSC
> when vhost is enabled?
> 
> I suspect we need to add the RSC bit to the current kernel_feature_bits:
> 
> /* Features supported by host kernel. */
> static const int kernel_feature_bits[] = {
>      VIRTIO_F_NOTIFY_ON_EMPTY,
>      VIRTIO_RING_F_INDIRECT_DESC,
>      VIRTIO_RING_F_EVENT_IDX,
>      VIRTIO_NET_F_MRG_RXBUF,
>      VIRTIO_F_VERSION_1,
>      VIRTIO_NET_F_MTU,
>      VIRTIO_F_IOMMU_PLATFORM,
>      VIRTIO_F_RING_PACKED,
>      VIRTIO_F_RING_RESET,
>      VIRTIO_NET_F_HASH_REPORT,
>      VHOST_INVALID_FEATURE_BIT
> };
> 
> As RSC won't be provided by TUN/TAP anyhow.

Adding the RSC bit does not let QEMU disable vhost for RSC, but instead 
it implicitly disables RSC in my understanding. It is still better than 
advertising the availability of that feature while it is missing.

> 
>>
>> Looking at the code, I also found the case of vhost-vdpa. vhost can be
>> simply disabled if it is backed by tuntap, but that is not the case for vDPA.
> 
> True, technically, vDPA can fall back to SVQ, but it's another topic.

My point in this discussion is that we cannot enable features just
because they are sufficiently old or because the user claims QEMU runs
on a sufficiently new Linux. eBPF requires privilege, and vDPA requires
a hardware feature. A fallback is not a silver bullet either, and there
are situations where providing a fallback is not a trivial task.

Regards,
Akihiko Odaki
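
On the privilege point: whether an unprivileged QEMU process can load
the RSS eBPF program depends on the kernel.unprivileged_bpf_disabled
sysctl and on capabilities such as CAP_BPF (or CAP_SYS_ADMIN on older
kernels); a quick, purely illustrative check from a shell:

    $ sysctl kernel.unprivileged_bpf_disabled
    $ capsh --print | grep -i 'cap_bpf\|cap_sys_admin'
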
Jason Wang July 30, 2024, 3:45 a.m. UTC | #32
On Tue, Jul 30, 2024 at 11:29 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
> [...]
>
> Adding the RSC bit does not let QEMU disable vhost for RSC, but instead
> it implicitly disables RSC in my understanding.

Yes.

> It is still better than
> advertising the availability of that feature while it is missing.

Down the road, we probably need to change the behaviour of disabling vhost-net.

>
> >
> >>
> >> Looking at the code, I also found the case of vhost-vdpa. vhost can be
> >> simply disabled if it is backed by tuntap, but that is not the case for vDPA.
> >
> > True, technically, vDPA can fall back to SVQ, but it's another topic.
>
> My point in this discussion is that we cannot enable features just
> because they are sufficiently old or because the user claims QEMU runs
> on a sufficiently new Linux. eBPF requires privilege, and vDPA requires
> a hardware feature. A fallback is not a silver bullet either, and there
> are situations where providing a fallback is not a trivial task.

To make sure we are on the same page: I just want to point out that
eBPF RSS is not a good example in this context. It works only for
tuntap, so we should stick to the behaviour of trying to fall back to
userspace when we can, as we already have a userspace fallback. This is
the fundamental difference from other features (like segmentation
offload) or backends (vDPA) that don't have an existing fallback.

Thanks

>
> Regards,
> Akihiko Odaki
>
Akihiko Odaki July 30, 2024, 10:23 a.m. UTC | #33
On 2024/07/30 12:45, Jason Wang wrote:
> On Tue, Jul 30, 2024 at 11:29 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>>
>> On 2024/07/30 12:17, Jason Wang wrote:
>>> On Tue, Jul 30, 2024 at 11:12 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>>>>
>>>> On 2024/07/30 12:03, Jason Wang wrote:
>>>>> On Tue, Jul 30, 2024 at 10:57 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>>>>>>
>>>>>> On 2024/07/30 11:04, Jason Wang wrote:
>>>>>>> On Tue, Jul 30, 2024 at 12:43 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>>>>>>>>
>>>>>>>> On 2024/07/29 23:29, Peter Xu wrote:
>>>>>>>>> On Mon, Jul 29, 2024 at 01:45:12PM +0900, Akihiko Odaki wrote:
>>>>>>>>>> On 2024/07/29 12:50, Jason Wang wrote:
>>>>>>>>>>> On Sun, Jul 28, 2024 at 11:19 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>>>>>>>>>>>>
>>>>>>>>>>>> On 2024/07/27 5:47, Peter Xu wrote:
>>>>>>>>>>>>> On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote:
>>>>>>>>>>>>>> On Fri, Jul 26, 2024 at 10:43:42AM -0400, Peter Xu wrote:
>>>>>>>>>>>>>>> On Fri, Jul 26, 2024 at 09:48:02AM +0100, Daniel P. Berrangé wrote:
>>>>>>>>>>>>>>>> On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote:
>>>>>>>>>>>>>>>>> On 26/07/2024 08.08, Michael S. Tsirkin wrote:
>>>>>>>>>>>>>>>>>> On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote:
>>>>>>>>>>>>>>>>>>> On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote:
>>>>>>>>>>>>>>>>>>>> USO features of virtio-net device depend on kernel ability
>>>>>>>>>>>>>>>>>>>> to support them, for backward compatibility by default the
>>>>>>>>>>>>>>>>>>>> features are disabled on 8.0 and earlier.
>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>> Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
>>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrew Melnychecnko <andrew@daynix.com>
>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>> Looks like this patch broke migration when the VM starts on a host that has
>>>>>>>>>>>>>>>>>>> USO support and is migrated to another host that doesn't..
>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> This was always the case with all offloads. The answer at the moment is,
>>>>>>>>>>>>>>>>>> don't do this.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> May I ask for my understanding:
>>>>>>>>>>>>>>>>> "don't do this" = don't automatically enable/disable virtio features in QEMU
>>>>>>>>>>>>>>>>> depending on host kernel features, or "don't do this" = don't try to migrate
>>>>>>>>>>>>>>>>> between machines that have different host kernel features?
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>> Long term, we need to start exposing management APIs
>>>>>>>>>>>>>>>>>> to discover this, and management has to disable unsupported features.
>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Ack, this likely needs some treatments from the libvirt side, too.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> When QEMU automatically toggles machine type features based on the host
>>>>>>>>>>>>>>>> kernel, relying on libvirt to then disable them again is impractical,
>>>>>>>>>>>>>>>> as we cannot assume that the libvirt version people are using knows about
>>>>>>>>>>>>>>>> newly introduced features. Even if libvirt is updated to know about
>>>>>>>>>>>>>>>> it, people can easily be using a previous libvirt release.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> QEMU itself needs to make the machine types do what they are there
>>>>>>>>>>>>>>>> to do, which is to define a stable machine ABI.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> What QEMU is missing here is a "platform ABI" concept, to encode
>>>>>>>>>>>>>>>> sets of features which are tied to specific platform generations.
>>>>>>>>>>>>>>>> As long as we don't have that we'll keep having these broken
>>>>>>>>>>>>>>>> migration problems from machine types dynamically changing instead
>>>>>>>>>>>>>>>> of providing a stable guest ABI.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Any more elaboration on this idea?  Would it be easily feasible in
>>>>>>>>>>>>>>> implementation?
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> In terms of launching QEMU I'd imagine:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>          $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args...
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Any virtual machine HW features which are tied to host kernel features
>>>>>>>>>>>>>> would have their defaults set based on the requested -platform. The
>>>>>>>>>>>>>> -machine will be fully invariant wrt the host kernel.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> You would have -platform help to list available platforms, and a
>>>>>>>>>>>>>> corresponding QMP "query-platforms" command to list what platforms
>>>>>>>>>>>>>> are supported on a given host OS.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Downstream distros can provide their own platform definitions
>>>>>>>>>>>>>> (eg "linux-rhel-9.5") if they have kernels whose feature set
>>>>>>>>>>>>>> diverges from upstream due to backports.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Mgmt apps won't need to be taught about every single little QEMU
>>>>>>>>>>>>>> setting whose default is derived from the kernel. Individual
>>>>>>>>>>>>>> defaults are opaque and controlled by the requested platform.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Live migration has clearly defined semantics, and mgmt app can
>>>>>>>>>>>>>> use query-platforms to validate two hosts are compatible.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Omitting -platform should pick the very latest platform that is
>>>>>>>>>>>>>> compatible with the current host (not necessarily the latest
>>>>>>>>>>>>>> platform built into QEMU).
>>>>>>>>>>>>>
>>>>>>>>>>>>> This seems to add one more layer to maintain, and so far I don't know
>>>>>>>>>>>>> whether it's a must.
>>>>>>>>>>>>>
>>>>>>>>>>>>> To put it simply, can we simply rely on the QEMU cmdline as "the guest ABI"?  I
>>>>>>>>>>>>> thought it was mostly the case already, except some extremely rare
>>>>>>>>>>>>> outliers.
>>>>>>>>>>>>>
>>>>>>>>>>>>> When we have one host that boots up a VM using:
>>>>>>>>>>>>>
>>>>>>>>>>>>>          $QEMU1 $cmdline
>>>>>>>>>>>>>
>>>>>>>>>>>>> Then another host boots up:
>>>>>>>>>>>>>
>>>>>>>>>>>>>          $QEMU2 $cmdline -incoming XXX
>>>>>>>>>>>>>
>>>>>>>>>>>>> Then migration should succeed if $cmdline is exactly the same, and the VM
>>>>>>>>>>>>> can boot up all fine without errors on both sides.
>>>>>>>>>>>>>
>>>>>>>>>>>>> AFAICT this has nothing to do with what kernel is underneath, or even
>>>>>>>>>>>>> whether it is Linux at all?  I think either QEMU1 / QEMU2 has the option to fail.  But if it
>>>>>>>>>>>>> didn't, I thought the ABI should be guaranteed.
>>>>>>>>>>>>>
>>>>>>>>>>>>> That's why I think this is a migration violation, as 99.99% of other device
>>>>>>>>>>>>> properties should be following this rule.  The issue here is, we have the
>>>>>>>>>>>>> same virtio-net-pci cmdline on both sides in this case, but the ABI got
>>>>>>>>>>>>> broken.
>>>>>>>>>>>>>
>>>>>>>>>>>>> That's also why I was suggesting if the property contributes to the guest
>>>>>>>>>>>>> ABI, then AFAIU QEMU needs to:
>>>>>>>>>>>>>
>>>>>>>>>>>>>          - Firstly, never quietly flipping any bit that affects the ABI...
>>>>>>>>>>>>>
>>>>>>>>>>>>>          - Have a default value of off, then QEMU will always allow the VM to boot
>>>>>>>>>>>>>            by default, while advanced users can opt-in on new features.  We can't
>>>>>>>>>>>>>            make this ON by default otherwise some VMs can already fail to boot,
>>>>>>>>>>>>
>>>>>>>>>>>> It may not necessarily be the case that old features are supported by
>>>>>>>>>>>> every system. In an extreme case, a user may migrate a VM from Linux to
>>>>>>>>>>>> Windows, which probably doesn't support any offloading at all. A more
>>>>>>>>>>>> convincing scenario is RSS offloading with eBPF; using eBPF requires a
>>>>>>>>>>>> privilege so we cannot assume it is always available even on the latest
>>>>>>>>>>>> version of Linux.
>>>>>>>>>>>
>>>>>>>>>>> I don't get why eBPF matters here. It is something that is not noticed
>>>>>>>>>>> by the guest and we have a fallback anyhow.
>>>>>>>>
>>>>>>>> It is noticeable for the guest, and the fallback is not effective with
>>>>>>>> vhost.
>>>>>>>
>>>>>>> It's a bug then. Qemu can fall back to tuntap if it sees issues in vhost.
>>>>>>
>>>>>> We can certainly fall back to in-QEMU RSS by disabling vhost, but I would
>>>>>> not say lack of such fallback is a bug.
>>>>>
>>>>> Such fallback is by design since the introduction of vhost.
>>>>>
>>>>>> We don't provide in-QEMU
>>>>>> fallback for other offloads.
>>>>>
>>>>> Yes but what I want to say is that eBPF RSS is different from those
>>>>> segmentation offloads. And technically, Qemu can do fallback for
>>>>> offloads (as RSC did).
>>>>
>>>> Well, I couldn't find any code disabling vhost for the in-QEMU RSC
>>>> implementation.
>>>
>>> It should be a bug (and I remember we disabled vhost when the patches
>>> were merged). Have you tested it in a guest to see if it can see RSC
>>> when vhost is enabled?
>>>
>>> I suspect we need to add the RSC bit into current kernel_feature_bits:
>>>
>>> /* Features supported by host kernel. */
>>> static const int kernel_feature_bits[] = {
>>>       VIRTIO_F_NOTIFY_ON_EMPTY,
>>>       VIRTIO_RING_F_INDIRECT_DESC,
>>>       VIRTIO_RING_F_EVENT_IDX,
>>>       VIRTIO_NET_F_MRG_RXBUF,
>>>       VIRTIO_F_VERSION_1,
>>>       VIRTIO_NET_F_MTU,
>>>       VIRTIO_F_IOMMU_PLATFORM,
>>>       VIRTIO_F_RING_PACKED,
>>>       VIRTIO_F_RING_RESET,
>>>       VIRTIO_NET_F_HASH_REPORT,
>>>       VHOST_INVALID_FEATURE_BIT
>>> };
>>>
>>> As RSC won't be provided by TUN/TAP anyhow.
>>
>> Adding the RSC bit does not let QEMU disable vhost for RSC, but instead
>> it implicitly disables RSC in my understanding.
> 
> Yes.
> 
>> It is still better than
>> advertising the availability of that feature while it is missing.
> 
> Down the road, we probably need to change the behaviour of disabling vhost-net.
> 
>>
>>>
>>>>
>>>> Looking at the code, I also found the case of vhost-vdpa. vhost can be
>>>> simply disabled if it is backed by tuntap, but it is not the case for vDPA.
>>>
>>> True, technically, vDPA can fall back to SVQ, but it's another topic.
>>
>> My point of this discussion is that we cannot enable features just
>> because they are sufficiently old or because the user claims QEMU runs
>> on a sufficiently new Linux. eBPF requires privileges, and vDPA requires
>> a hardware feature. A fallback is not a silver bullet either, and there
>> are situations where providing a fallback is not a trivial task.
> 
> To make sure we are on the same page: I just want to point out that
> eBPF RSS is not a good example in this context.
> 
> It works only for tuntap, so we should stick to the behaviour of
> trying to fall back to userspace when we can, as we already have a
> userspace fallback. This is the fundamental difference from other
> features (like segmentation offload) or backends (like vDPA) that
> don't have an existing fallback.

Some (probably not all) offloads are implemented in hw/net/net_tx_pkt.c.
They are not wired up to behave as a fallback when tuntap's vhost is
enabled, just as the in-QEMU RSS is not. In either case, we would need
to spend some effort wiring things up.

I'm not sure it is worthwhile. I think there is a high chance that 
selectively disabling vhost and keeping RSS enabled with fallback will 
result in worse performance than keeping vhost enabled and disabling 
RSS. Such a fallback can still function as an emergency escape hatch, 
but it is also incomplete as we don't have fallbacks for other features. 
I would rather make enabling any feature missing in the vhost backend
fail, to keep things consistent.
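To illustrate the shape of that stricter policy (a sketch only — wiring the check into virtio-net's realize path is the hypothetical part; today QEMU silently clears unsupported bits through the per-backend feature-bit lists instead):

/*
 * Sketch: fail realize instead of silently dropping user-requested
 * features the vhost backend cannot provide. vhost_net_get_features()
 * and get_vhost_net() exist in QEMU; this wrapper does not.
 */
static bool vhost_backend_check_features(VirtIONet *n,
                                         NetClientState *peer,
                                         Error **errp)
{
    uint64_t requested = n->host_features;
    uint64_t supported = vhost_net_get_features(get_vhost_net(peer),
                                                requested);

    if (requested & ~supported) {
        error_setg(errp, "features 0x%" PRIx64 " not supported by the "
                   "vhost backend", requested & ~supported);
        return false;
    }
    return true;
}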

Regards,
Akihiko Odaki
Yuri Benditovich July 30, 2024, 11:52 a.m. UTC | #34
@Akihiko Odaki The RSC is supported both with and without vhost.
The 'in-qemu RSC' is related to the VIRTIO_NET_F_RSC_EXT feature; it is
intended for one specific WHCK test only and should not be used in any
functional setup.
When it is used, vhost should be off.

On Tue, Jul 30, 2024 at 1:23 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>
> On 2024/07/30 12:45, Jason Wang wrote:
> > On Tue, Jul 30, 2024 at 11:29 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
> >>
> >> On 2024/07/30 12:17, Jason Wang wrote:
> >>> On Tue, Jul 30, 2024 at 11:12 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
> >>>>
> >>>> On 2024/07/30 12:03, Jason Wang wrote:
> >>>>> On Tue, Jul 30, 2024 at 10:57 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
> >>>>>>
> >>>>>> On 2024/07/30 11:04, Jason Wang wrote:
> >>>>>>> On Tue, Jul 30, 2024 at 12:43 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
> >>>>>>>>
> >>>>>>>> On 2024/07/29 23:29, Peter Xu wrote:
> >>>>>>>>> On Mon, Jul 29, 2024 at 01:45:12PM +0900, Akihiko Odaki wrote:
> >>>>>>>>>> On 2024/07/29 12:50, Jason Wang wrote:
> >>>>>>>>>>> On Sun, Jul 28, 2024 at 11:19 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
> >>>>>>>>>>>>
> >>>>>>>>>>>> On 2024/07/27 5:47, Peter Xu wrote:
> >>>>>>>>>>>>> On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote:
> >>>>>>>>>>>>>> On Fri, Jul 26, 2024 at 10:43:42AM -0400, Peter Xu wrote:
> >>>>>>>>>>>>>>> On Fri, Jul 26, 2024 at 09:48:02AM +0100, Daniel P. Berrangé wrote:
> >>>>>>>>>>>>>>>> On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote:
> >>>>>>>>>>>>>>>>> On 26/07/2024 08.08, Michael S. Tsirkin wrote:
> >>>>>>>>>>>>>>>>>> On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote:
> >>>>>>>>>>>>>>>>>>> On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote:
> >>>>>>>>>>>>>>>>>>>> USO features of virtio-net device depend on kernel ability
> >>>>>>>>>>>>>>>>>>>> to support them, for backward compatibility by default the
> >>>>>>>>>>>>>>>>>>>> features are disabled on 8.0 and earlier.
> >>>>>>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>>>>>> Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
> >>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrew Melnychecnko <andrew@daynix.com>
> >>>>>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>>>>> Looks like this patch broke migration when the VM starts on a host that has
> >>>>>>>>>>>>>>>>>>> USO support and is migrated to another host that doesn't..
> >>>>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>>>> This was always the case with all offloads. The answer at the moment is,
> >>>>>>>>>>>>>>>>>> don't do this.
> >>>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>>> May I ask for my understanding:
> >>>>>>>>>>>>>>>>> "don't do this" = don't automatically enable/disable virtio features in QEMU
> >>>>>>>>>>>>>>>>> depending on host kernel features, or "don't do this" = don't try to migrate
> >>>>>>>>>>>>>>>>> between machines that have different host kernel features?
> >>>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>>>> Long term, we need to start exposing management APIs
> >>>>>>>>>>>>>>>>>> to discover this, and management has to disable unsupported features.
> >>>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>>> Ack, this likely needs some treatments from the libvirt side, too.
> >>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>> When QEMU automatically toggles machine type features based on the host
> >>>>>>>>>>>>>>>> kernel, relying on libvirt to then disable them again is impractical,
> >>>>>>>>>>>>>>>> as we cannot assume that the libvirt version people are using knows about
> >>>>>>>>>>>>>>>> newly introduced features. Even if libvirt is updated to know about
> >>>>>>>>>>>>>>>> it, people can easily be using a previous libvirt release.
> >>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>> QEMU itself needs to make the machine types do what they are there
> >>>>>>>>>>>>>>>> to do, which is to define a stable machine ABI.
> >>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>> What QEMU is missing here is a "platform ABI" concept, to encode
> >>>>>>>>>>>>>>>> sets of features which are tied to specific platform generations.
> >>>>>>>>>>>>>>>> As long as we don't have that we'll keep having these broken
> >>>>>>>>>>>>>>>> migration problems from machine types dynamically changing instead
> >>>>>>>>>>>>>>>> of providing a stable guest ABI.
> >>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>> Any more elaboration on this idea?  Would it be easily feasible in
> >>>>>>>>>>>>>>> implementation?
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> In terms of launching QEMU I'd imagine:
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>>          $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args...
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> Any virtual machine HW features which are tied to host kernel features
> >>>>>>>>>>>>>> would have their defaults set based on the requested -platform. The
> >>>>>>>>>>>>>> -machine will be fully invariant wrt the host kernel.
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> You would have -platform help to list available platforms, and a
> >>>>>>>>>>>>>> corresponding QMP "query-platforms" command to list what platforms
> >>>>>>>>>>>>>> are supported on a given host OS.
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> Downstream distros can provide their own platform definitions
> >>>>>>>>>>>>>> (eg "linux-rhel-9.5") if they have kernels whose feature set
> >>>>>>>>>>>>>> diverges from upstream due to backports.
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> Mgmt apps won't need to be taught about every single little QEMU
> >>>>>>>>>>>>>> setting whose default is derived from the kernel. Individual
> >>>>>>>>>>>>>> defaults are opaque and controlled by the requested platform.
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> Live migration has clearly defined semantics, and mgmt app can
> >>>>>>>>>>>>>> use query-platforms to validate two hosts are compatible.
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> Omitting -platform should pick the very latest platform that is
> >>>>>>>>>>>>>> compatible with the current host (not necessarily the latest
> >>>>>>>>>>>>>> platform built into QEMU).
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> This seems to add one more layer to maintain, and so far I don't know
> >>>>>>>>>>>>> whether it's a must.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> To put it simply, can we simply rely on the QEMU cmdline as "the guest ABI"?  I
> >>>>>>>>>>>>> thought it was mostly the case already, except some extremely rare
> >>>>>>>>>>>>> outliers.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> When we have one host that boots up a VM using:
> >>>>>>>>>>>>>
> >>>>>>>>>>>>>          $QEMU1 $cmdline
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> Then another host boots up:
> >>>>>>>>>>>>>
> >>>>>>>>>>>>>          $QEMU2 $cmdline -incoming XXX
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> Then migration should succeed if $cmdline is exactly the same, and the VM
> >>>>>>>>>>>>> can boot up all fine without errors on both sides.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> AFAICT this has nothing to do with what kernel is underneath, or even
> >>>>>>>>>>>>> whether it is Linux at all?  I think either QEMU1 / QEMU2 has the option to fail.  But if it
> >>>>>>>>>>>>> didn't, I thought the ABI should be guaranteed.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> That's why I think this is a migration violation, as 99.99% of other device
> >>>>>>>>>>>>> properties should be following this rule.  The issue here is, we have the
> >>>>>>>>>>>>> same virtio-net-pci cmdline on both sides in this case, but the ABI got
> >>>>>>>>>>>>> broken.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> That's also why I was suggesting if the property contributes to the guest
> >>>>>>>>>>>>> ABI, then AFAIU QEMU needs to:
> >>>>>>>>>>>>>
> >>>>>>>>>>>>>          - Firstly, never quietly flipping any bit that affects the ABI...
> >>>>>>>>>>>>>
> >>>>>>>>>>>>>          - Have a default value of off, then QEMU will always allow the VM to boot
> >>>>>>>>>>>>>            by default, while advanced users can opt-in on new features.  We can't
> >>>>>>>>>>>>>            make this ON by default otherwise some VMs can already fail to boot,
> >>>>>>>>>>>>
> >>>>>>>>>>>> It may not necessarily be the case that old features are supported by
> >>>>>>>>>>>> every system. In an extreme case, a user may migrate a VM from Linux to
> >>>>>>>>>>>> Windows, which probably doesn't support any offloading at all. A more
> >>>>>>>>>>>> convincing scenario is RSS offloading with eBPF; using eBPF requires a
> >>>>>>>>>>>> privilege so we cannot assume it is always available even on the latest
> >>>>>>>>>>>> version of Linux.
> >>>>>>>>>>>
> >>>>>>>>>>> I don't get why eBPF matters here. It is something that is not noticed
> >>>>>>>>>>> by the guest and we have a fallback anyhow.
> >>>>>>>>
> >>>>>>>> It is noticeable for the guest, and the fallback is not effective with
> >>>>>>>> vhost.
> >>>>>>>
> >>>>>>> It's a bug then. Qemu can fall back to tuntap if it sees issues in vhost.
> >>>>>>
> >>>>>> We can certainly fall back to in-QEMU RSS by disabling vhost, but I would
> >>>>>> not say lack of such fallback is a bug.
> >>>>>
> >>>>> Such fallback is by design since the introduction of vhost.
> >>>>>
> >>>>>> We don't provide in-QEMU
> >>>>>> fallback for other offloads.
> >>>>>
> >>>>> Yes but what I want to say is that eBPF RSS is different from those
> >>>>> segmentation offloads. And technically, Qemu can do fallback for
> >>>>> offloads (as RSC did).
> >>>>
> >>>> Well, I couldn't find any code disabling vhost for the in-QEMU RSC
> >>>> implementation.
> >>>
> >>> It should be a bug (and I remember we disabled vhost when the patches
> >>> were merged). Have you tested it in a guest to see if it can see RSC
> >>> when vhost is enabled?
> >>>
> >>> I suspect we need to add the RSC bit into current kernel_feature_bits:
> >>>
> >>> /* Features supported by host kernel. */
> >>> static const int kernel_feature_bits[] = {
> >>>       VIRTIO_F_NOTIFY_ON_EMPTY,
> >>>       VIRTIO_RING_F_INDIRECT_DESC,
> >>>       VIRTIO_RING_F_EVENT_IDX,
> >>>       VIRTIO_NET_F_MRG_RXBUF,
> >>>       VIRTIO_F_VERSION_1,
> >>>       VIRTIO_NET_F_MTU,
> >>>       VIRTIO_F_IOMMU_PLATFORM,
> >>>       VIRTIO_F_RING_PACKED,
> >>>       VIRTIO_F_RING_RESET,
> >>>       VIRTIO_NET_F_HASH_REPORT,
> >>>       VHOST_INVALID_FEATURE_BIT
> >>> };
> >>>
> >>> As RSC won't be provided by TUN/TAP anyhow.
> >>
> >> Adding the RSC bit does not let QEMU disable vhost for RSC, but instead
> >> it implicitly disables RSC in my understanding.
> >
> > Yes.
> >
> >> It is still better than
> >> advertising the availability of that feature while it is missing.
> >
> > Down the road, we probably need to change the behaviour of disabling vhost-net.
> >
> >>
> >>>
> >>>>
> >>>> Looking at the code, I also found the case of vhost-vdpa. vhost can be
> >>>> simply disabled if it is backed by tuntap, but it is not the case for vDPA.
> >>>
> >>> True, technically, vDPA can fall back to SVQ, but it's another topic.
> >>
> >> My point of this discussion is that we cannot enable features just
> >> because they are sufficiently old or because the user claims QEMU runs
> >> on a sufficiently new Linux. eBPF requires privileges, and vDPA requires
> >> a hardware feature. A fallback is not a silver bullet either, and there
> >> are situations where providing a fallback is not a trivial task.
> >
> > To make sure we are on the same page: I just want to point out that
> > eBPF RSS is not a good example in this context.
> >
> > It works only for tuntap, so we should stick to the behaviour of
> > trying to fall back to userspace when we can, as we already have a
> > userspace fallback. This is the fundamental difference from other
> > features (like segmentation offload) or backends (like vDPA) that
> > don't have an existing fallback.
>
> Some (probably not all) offloads are implemented in hw/net/net_tx_pkt.c.
> They are not wired up to behave as a fallback when tuntap's vhost is
> enabled, just as the in-QEMU RSS is not. In either case, we would need
> to spend some effort wiring things up.
>
> I'm not sure it is worthwhile. I think there is a high chance that
> selectively disabling vhost and keeping RSS enabled with fallback will
> result in worse performance than keeping vhost enabled and disabling
> RSS. Such a fallback can still function as an emergency escape hatch,
> but it is also incomplete as we don't have fallbacks for other features.
> I would rather make enabling any feature missing in the vhost backend
> fail, to keep things consistent.
>
> Regards,
> Akihiko Odaki
Peter Xu July 30, 2024, 6:02 p.m. UTC | #35
On Tue, Jul 30, 2024 at 02:23:46AM +0900, Akihiko Odaki wrote:
> On 2024/07/30 2:00, Peter Xu wrote:
> > On Mon, Jul 29, 2024 at 04:58:03PM +0100, Daniel P. Berrangé wrote:
> > > On Fri, Jul 26, 2024 at 04:47:40PM -0400, Peter Xu wrote:
> > > > On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote:
> > > > > 
> > > > > In terms of launching QEMU I'd imagine:
> > > > > 
> > > > >    $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args...
> > > > > 
> > > > > Any virtual machine HW features which are tied to host kernel features
> > > > > would have their defaults set based on the requested -platform. The
> > > > > -machine will be fully invariant wrt the host kernel.
> > > > > 
> > > > > You would have -platform help to list available platforms, and a
> > > > > corresponding QMP "query-platforms" command to list what platforms
> > > > > are supported on a given host OS.
> > > > > 
> > > > > Downstream distros can provide their own platform definitions
> > > > > (eg "linux-rhel-9.5") if they have kernels whose feature set
> > > > > diverges from upstream due to backports.
> > > > > 
> > > > > Mgmt apps won't need to be taught about every single little QEMU
> > > > > setting whose default is derived from the kernel. Individual
> > > > > defaults are opaque and controlled by the requested platform.
> > > > > 
> > > > > Live migration has clearly defined semantics, and mgmt app can
> > > > > use query-platforms to validate two hosts are compatible.
> > > > > 
> > > > > Omitting -platform should pick the very latest platform that is
> > > > > compatible with the current host (not necessarily the latest
> > > > > platform built into QEMU).
> > > > 
> > > > This seems to add one more layer to maintain, and so far I don't know
> > > > whether it's a must.
> > > > 
> > > > To put it simply, can we simply rely on the QEMU cmdline as "the guest ABI"?  I
> > > > thought it was mostly the case already, except some extremely rare
> > > > outliers.
> > > > 
> > > > When we have one host that boots up a VM using:
> > > > 
> > > >    $QEMU1 $cmdline
> > > > 
> > > > Then another host boots up:
> > > > 
> > > >    $QEMU2 $cmdline -incoming XXX
> > > > 
> > > > Then migration should succeed if $cmdline is exactly the same, and the VM
> > > > can boot up all fine without errors on both sides.
> > > > 
> > > > AFAICT this has nothing to do with what kernel is underneath, or even
> > > > whether it is Linux at all?  I think either QEMU1 / QEMU2 has the option to fail.  But if it
> > > > didn't, I thought the ABI should be guaranteed.
> > > 
> > > We've got two mutually conflicting goals with the machine type
> > > definitions.
> > > 
> > > Primarily we use them to ensure stable ABI, but an important
> > > secondary goal is to enable new tunables to have new defaults
> > > set, without having to update every mgmt app.  The latter
> > > works very well when the defaults have no dependency on the
> > > platform kernel/OS, but breaks migration when they do have a
> > > platform dependency.
> > > 
> > > >    - Firstly, never quietly flipping any bit that affects the ABI...
> > > > 
> > > >    - Have a default value of off, then QEMU will always allow the VM to boot
> > > >      by default, while advanced users can opt-in on new features.  We can't
> > > >      make this ON by default otherwise some VMs can already fail to boot,
> > > > 
> > > >    - If the host doesn't support the feature while the cmdline enabled it,
> > > >      it needs to fail QEMU boot rather than flipping, so that it says "hey,
> > > >      this host does not support running such VM specified, due to XXX
> > > >      feature missing".
> > > > 
> > > > That's the only way a user could understand what happened, and IMHO that's
> > > > a clean way that we stick with the QEMU cmdline on defining the guest ABI,
> > > > in which the machine type is the foundation of such a definition, as the
> > > > machine type decides many of the rest of the compat properties.  And that's
> > > > the whole point of the compat properties too (to make sure the guest ABI is
> > > > stable).
> > > > 
> > > > If kernel breaks it easily, all compat property things that we maintain can
> > > > already stop making sense in general, because it didn't define the whole
> > > > guest ABI..
> > > > 
> > > > So AFAIU that's really what we used for years, I hope I didn't overlook
> > > > something.  And maybe we don't yet need the "-platform" layer if we can
> > > > keep up with this rule?
> > > 
> > > We've failed at this for years wrt enabling use of new defaults that have
> > > a platform dependency, so historical practice isn't a good reference.
> > > 
> > > There are 100's (possibly 1000's) of tunables set implicitly as part of
> > > the machine type, and of those, libvirt likely only exposes a few 10's
> > > of tunables. The vast majority are low level details that no mgmt app
> > > wants to know about, they just want to accept QEMU's new defaults,
> > > while preserving machine ABI. This is a good thing. No one wants the
> > > burden of wiring up every single tunable into libvirt and mgmt apps.
> > > 
> > > This is what the "-platform" concept would be intended to preserve. It
> > > would allow a way to enable groups of settings that have a platform-level
> > > dependency, without ever having to teach either libvirt or the mgmt apps
> > > about the individual tunables.
> > 
> > Do you think we can achieve similar goal by simply turning the feature to
> > ON only after a few QEMU releases?  I also mentioned that idea below.
> > 
> > https://lore.kernel.org/r/ZqQNKZ9_OPhDq2AK@x1n
> > 
> > So far it really sounds like the right thing to do to me to fix all similar
> > issues, even without introducing anything new we need to maintain.
> > 
> > To put that again, what we need to do is this:
> > 
> >    - To start: we should NEVER turn any guest-ABI-relevant bits
> >      automatically by QEMU, for sure..
> > 
> >    - When introducing any new device feature that may both (1) affect the
> >      guest ABI and (2) depend on host kernel features, we set those default
> >      values to OFF always at start. So this already covers old machine
> >      types, no compat property needed so far.
> > 
> >    - We always fail hard on QEMU boot whenever we detect that such a
> >      property is not supported by the current host while set to ON (and
> >      since it's OFF by default, it must be that the user specified ON).
> > 
> >    - After a stabilization period that lets the new feature land in most
> >      kernels (we may consider looking at how major Linux distros update
> >      their kernel versions), when we're pretty sure the new feature is
> >      available to most modern QEMU users, we add a patch to make the
> >      property default to ON on the new machine type, and add a compat
> >      property for old machines.
> > 
> > The last bullet also means we'll start to fail to boot the new machine type
> > when running that very new QEMU on a very old kernel, but that's the
> > trade-off, and if we get the "stabilizing the feature in the kernel world"
> > part right, it should really be a corner case.  The user should simply
> > invoke an old machine type on that old kernel, even if the QEMU is new.
> 
> docs/about/build-platforms.rst already defines supported platforms. One of
> the supported platforms is Debian 11 (bullseye), and it carries Linux 5.10,
> which was released December 2020. If we follow this platform support, a new
> feature added to upstream Linux may take about 4 years before it gets
> enabled by default on QEMU.
> 
> As an upstream developer, I feel it is too long, but I'm sure there are
> different opinions from different perspectives.

The above rule won't stop the supported platforms from running the QEMU
binaries, am I right?  Especially for a serious user, the VMs should
always be invoked with an old machine type, and that shouldn't be impacted,
as the old machine types simply won't support such new kernel features.

The trade-off here only shows up when the user tries to start the VM using
the default / latest machine type.  Then, with the above rule, it should
fail with a clear message about what needs to be turned OFF to boot that VM.

Then the user has two options: turn that feature OFF manually, or switch to
an old machine type.

This is all still based on the assumption that we do plan to keep that OFF
for a while.  So if we think "a few years" is too long, one option is to set
it to ON after e.g. 1-2 years, a middle ground where some such new users
will fail to boot the VM on old hosts, but it'll start to benefit whoever
runs the same on a new host.

So far I think it's not a major deal, especially considering that this
still looks to me like the easiest workable solution to make migration
always work, which is IMHO more important to serious VM users.

I'm definitely open to other options or suggestions if there are.  I just
don't see anything yet that is easily applicable..
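To make the "fail clearly" part concrete for the USO case, a minimal sketch — it reuses the patch's peer_has_uso() helper, and putting the check in the realize path like this is an assumption, since today the bit is silently cleared during feature negotiation instead:

/* In virtio_net_device_realize(), before feature negotiation: */
if (virtio_has_feature(n->host_features, VIRTIO_NET_F_HOST_USO) &&
    !peer_has_uso(n)) {
    error_setg(errp, "host_uso=on, but the network backend on this "
               "host does not support USO");
    return;
}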

Thanks,
Peter Xu July 30, 2024, 6:13 p.m. UTC | #36
On Mon, Jul 29, 2024 at 06:26:41PM +0100, Daniel P. Berrangé wrote:
> On Mon, Jul 29, 2024 at 01:00:30PM -0400, Peter Xu wrote:
> > On Mon, Jul 29, 2024 at 04:58:03PM +0100, Daniel P. Berrangé wrote:
> > > 
> > > We've got two mutually conflicting goals with the machine type
> > > definitions.
> > > 
> > > Primarily we use them to ensure stable ABI, but an important
> > > secondary goal is to enable new tunables to have new defaults
> > > set, without having to update every mgmt app.  The latter
> > > works very well when the defaults have no dependency on the
> > > platform kernel/OS, but breaks migration when they do have a
> > > platform dependency.
> > > 
> > > >   - Firstly, never quietly flipping any bit that affects the ABI...
> > > > 
> > > >   - Have a default value of off, then QEMU will always allow the VM to boot
> > > >     by default, while advanced users can opt-in on new features.  We can't
> > > >     make this ON by default otherwise some VMs can already fail to boot,
> > > > 
> > > >   - If the host doesn't support the feature while the cmdline enabled it,
> > > >     it needs to fail QEMU boot rather than flipping, so that it says "hey,
> > > >     this host does not support running such VM specified, due to XXX
> > > >     feature missing".
> > > > 
> > > > That's the only way a user could understand what happened, and IMHO that's
> > > > a clean way that we stick with the QEMU cmdline on defining the guest ABI,
> > > > in which the machine type is the foundation of such a definition, as the
> > > > machine type decides many of the rest of the compat properties.  And that's
> > > > the whole point of the compat properties too (to make sure the guest ABI is
> > > > stable).
> > > > 
> > > > If kernel breaks it easily, all compat property things that we maintain can
> > > > already stop making sense in general, because it didn't define the whole
> > > > guest ABI..
> > > > 
> > > > So AFAIU that's really what we used for years, I hope I didn't overlook
> > > > something.  And maybe we don't yet need the "-platform" layer if we can
> > > > keep up with this rule?
> > > 
> > > We've failed at this for years wrt enabling use of new defaults that have
> > > a platform dependency, so historical practice isn't a good reference.
> > > 
> > > There are 100's (possibly 1000's) of tunables set implicitly as part of
> > > the machine type, and of those, libvirt likely only exposes a few 10's
> > > of tunables. The vast majority are low level details that no mgmt app
> > > wants to know about, they just want to accept QEMU's new defaults,
> > > while preserving machine ABI. This is a good thing. No one wants the
> > > burden of wiring up every single tunable into libvirt and mgmt apps.
> > > 
> > > This is what the "-platform" concept would be intended to preserve. It
> > > would allow a way to enable groups of settings that have a platform-level
> > > dependency, without ever having to teach either libvirt or the mgmt apps
> > > about the individual tunables.
> > 
> > Do you think we can achieve similar goal by simply turning the feature to
> > ON only after a few QEMU releases?  I also mentioned that idea below.
> > 
> > https://lore.kernel.org/r/ZqQNKZ9_OPhDq2AK@x1n
> > 
> > So far it really sounds like the right thing to do to me to fix all similar
> > issues, even without introducing anything new we need to maintain.
> 
> Turning a feature with a platform dependency to "on" implies that
> the machine type will cease to work out of the box for platforms
> which lack the feature. IMHO that's not acceptable behaviour for
> any of our supported platforms.

Right, that's why I was thinking whether we should just always be on the
safe side, even though, as I just replied in the other email to Akihiko,
we do have the option to make this more aggressive by turning those to ON
after even 1-2 years or even less.. and we have control of how aggressive
this can be.

> 
> IOW, "after a few QEMU releases" implies a delay of as much as
> 5 years, while we wait for platforms which don't support the
> feature to drop out of our supported targets list.  I don't
> think that'll satisfy the desire to get the new feature
> available to users as soon as practical for their particular
> platform.

The feature has been available since day one, right?  We just need the
user to opt in, by specifying ON in the cmdline.

My take on this is that QEMU's default VM setup should always be
bootable, migratable, and so on.  Then users opt in on stuff like this
one, where there are implications for the ABI.  The "user" can also
include Libvirt.  I mean, when something is really important, Libvirt
should, IMHO, opt in by treating it similarly to many CPU properties,
and by probing the host first.

IIUC there aren't a lot of things like that (part of the guest ABI and
host kernel / HW dependent), am I right?  Otherwise I would expect more
failures like this one, but it isn't that common yet.  IIUC that means
the effort to get Libvirt involved should hopefully be under control
too.  The worst case is that Libvirt doesn't auto-enable it, but again
the user always has the option to turn it on when necessary.

Thanks,

> 
> > 
> > To put that again, what we need to do is this:
> > 
> >   - To start: we should NEVER turn any guest ABI relevant bits
> >     automatically by QEMU, for sure..
> > 
> >   - When introducing any new device feature that may both (1) affects guest
> >     ABI, and (2) depends on host kernel features, we set those default
> >     values to OFF always at start. So this already covers old machine
> >     types, no compat property needed so far.
> > 
> >   - We always fail hard on QEMU boot whenever we detected such property is
> >     not supported by the current host when with ON (and since it's OFF by
> >     default it must be that the user specified that ON).
> > 
> >   - When after a stablized period of time for that new feature to land most
> >     kernels (we may consider to look at how major Linux distros updates the
> >     kernel versions) when we're pretty sure the new feature should be
> >     available on most of the QEMU modern users, we add a patch to make the
> >     property default ON on the new machine type, add a compat property for
> >     old machines.
> 
> Our supported platform list determines when this will be, and given
> our current criteria, this can be as long as 5 years.
> 
> 
> With regards,
> Daniel
> -- 
> |: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org         -o-            https://fstop138.berrange.com :|
> |: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|
>
Daniel P. Berrangé July 30, 2024, 6:46 p.m. UTC | #37
On Tue, Jul 30, 2024 at 02:13:51PM -0400, Peter Xu wrote:
> On Mon, Jul 29, 2024 at 06:26:41PM +0100, Daniel P. Berrangé wrote:
> > On Mon, Jul 29, 2024 at 01:00:30PM -0400, Peter Xu wrote:
> > > On Mon, Jul 29, 2024 at 04:58:03PM +0100, Daniel P. Berrangé wrote:
> > > > 
> > > > We've got two mutually conflicting goals with the machine type
> > > > definitions.
> > > > 
> > > > Primarily we use them to ensure stable ABI, but an important
> > > > secondary goal is to enable new tunables to have new defaults
> > > > set, without having to update every mgmt app.  The latter
> > > > works very well when the defaults have no dependency on the
> > > > platform kernel/OS, but breaks migration when they do have a
> > > > platform dependency.
> > > > 
> > > > >   - Firstly, never quietly flipping any bit that affects the ABI...
> > > > > 
> > > > >   - Have a default value of off, then QEMU will always allow the VM to boot
> > > > >     by default, while advanced users can opt-in on new features.  We can't
> > > > >     make this ON by default otherwise some VMs can already fail to boot,
> > > > > 
> > > > >   - If the host doesn't support the feature while the cmdline enabled it,
> > > > >     it needs to fail QEMU boot rather than flipping, so that it says "hey,
> > > > >     this host does not support running such VM specified, due to XXX
> > > > >     feature missing".
> > > > > 
> > > > > That's the only way a user could understand what happened, and IMHO that's
> > > > > a clean way that we stick with the QEMU cmdline on defining the guest ABI,
> > > > > in which the machine type is the foundation of such a definition, as the
> > > > > machine type decides many of the rest of the compat properties.  And that's
> > > > > the whole point of the compat properties too (to make sure the guest ABI is
> > > > > stable).
> > > > > 
> > > > > If kernel breaks it easily, all compat property things that we maintain can
> > > > > already stop making sense in general, because it didn't define the whole
> > > > > guest ABI..
> > > > > 
> > > > > So AFAIU that's really what we used for years, I hope I didn't overlook
> > > > > something.  And maybe we don't yet need the "-platform" layer if we can
> > > > > keep up with this rule?
> > > > 
> > > > We've failed at this for years wrt enabling use of new defaults that have
> > > > a platform dependency, so historical practice isn't a good reference.
> > > > 
> > > > There are 100's (possibly 1000's) of tunables set implicitly as part of
> > > > the machine type, and of those, libvirt likely only exposes a few 10's
> > > > of tunables. The vast majority are low level details that no mgmt app
> > > > wants to know about, they just want to accept QEMU's new defaults,
> > > > while preserving machine ABI. This is a good thing. No one wants the
> > > > burden of wiring up every single tunable into libvirt and mgmt apps.
> > > > 
> > > > This is what the "-platform" concept would be intended to preserve. It
> > > > would allow a way to enable groups of settings that have a platform-level
> > > > dependency, without ever having to teach either libvirt or the mgmt apps
> > > > about the individual tunables.
> > > 
> > > Do you think we can achieve similar goal by simply turning the feature to
> > > ON only after a few QEMU releases?  I also mentioned that idea below.
> > > 
> > > https://lore.kernel.org/r/ZqQNKZ9_OPhDq2AK@x1n
> > > 
> > > So far it really sounds like the right thing to do to me to fix all similar
> > > issues, even without introducing anything new we need to maintain.
> > 
> > Turning a feature with a platform dependency to "on" implies that
> > the machine type will cease to work out of the box for platforms
> > which lack the feature. IMHO that's not acceptable behaviour for
> > any of our supported platforms.
> 
> Right, that's why I was thinking whether we should just always be on the
> safe side, even though, as I just replied in the other email to Akihiko,
> we do have the option to make this more aggressive by turning those to ON
> after even 1-2 years or even less.. and we have control of how aggressive
> this can be.
> 
> > 
> > IOW, "after a few QEMU releases" implies a delay of as much as
> > 5 years, while we wait for platforms which don't support the
> > feature to drop out of our supported targets list.  I don't
> > think that'll satisfy the desire to get the new feature
> > available to users as soon as practical for their particular
> > platform.
> 
> The feature has been available since day one, right?  We just need the
> user to opt in, by specifying ON in the cmdline.
> 
> My take on this is that QEMU's default VM setup should always be
> bootable, migratable, and so on.  Then users opt in on stuff like this
> one, where there are implications for the ABI.  The "user" can also
> include Libvirt.  I mean, when something is really important, Libvirt
> should, IMHO, opt in by treating it similarly to many CPU properties,
> and by probing the host first.
> 
> IIUC there aren't a lot of things like that (part of the guest ABI and
> host kernel / HW dependent), am I right?  Otherwise I would expect more
> failures like this one, but it isn't that common yet.  IIUC that means
> the effort to get Libvirt involved should hopefully be under control
> too.  The worst case is that Libvirt doesn't auto-enable it, but again
> the user always has the option to turn it on when necessary.

If it is left to libvirt, then it would very likely end up being a user
opt-in, not auto-enabled.

With regards,
Daniel
Peter Xu July 30, 2024, 7:11 p.m. UTC | #38
On Tue, Jul 30, 2024 at 07:46:12PM +0100, Daniel P. Berrangé wrote:
> On Tue, Jul 30, 2024 at 02:13:51PM -0400, Peter Xu wrote:
> > On Mon, Jul 29, 2024 at 06:26:41PM +0100, Daniel P. Berrangé wrote:
> > > On Mon, Jul 29, 2024 at 01:00:30PM -0400, Peter Xu wrote:
> > > > On Mon, Jul 29, 2024 at 04:58:03PM +0100, Daniel P. Berrangé wrote:
> > > > > 
> > > > > We've got two mutually conflicting goals with the machine type
> > > > > definitions.
> > > > > 
> > > > > Primarily we use them to ensure stable ABI, but an important
> > > > > secondary goal is to enable new tunables to have new defaults
> > > > > set, without having to update every mgmt app.  The latter
> > > > > works very well when the defaults have no dependency on the
> > > > > platform kernel/OS, but breaks migration when they do have a
> > > > > platform dependency.
> > > > > 
> > > > > >   - Firstly, never quietly flipping any bit that affects the ABI...
> > > > > > 
> > > > > >   - Have a default value of off, then QEMU will always allow the VM to boot
> > > > > >     by default, while advanced users can opt-in on new features.  We can't
> > > > > >     make this ON by default otherwise some VMs can already fail to boot,
> > > > > > 
> > > > > >   - If the host doesn't support the feature while the cmdline enabled it,
> > > > > >     it needs to fail QEMU boot rather than flipping, so that it says "hey,
> > > > > >     this host does not support running such VM specified, due to XXX
> > > > > >     feature missing".
> > > > > > 
> > > > > > That's the only way a user could understand what happened, and IMHO that's
> > > > > > a clean way that we stick with the QEMU cmdline on defining the guest ABI,
> > > > > > in which the machine type is the foundation of such a definition, as the
> > > > > > machine type decides many of the rest of the compat properties.  And that's
> > > > > > the whole point of the compat properties too (to make sure the guest ABI is
> > > > > > stable).
> > > > > > 
> > > > > > If kernel breaks it easily, all compat property things that we maintain can
> > > > > > already stop making sense in general, because it didn't define the whole
> > > > > > guest ABI..
> > > > > > 
> > > > > > So AFAIU that's really what we used for years, I hope I didn't overlook
> > > > > > something.  And maybe we don't yet need the "-platform" layer if we can
> > > > > > keep up with this rule?
> > > > > 
> > > > > We've failed at this for years wrt enabling use of new defaults that have
> > > > > a platform dependency, so historical practice isn't a good reference.
> > > > > 
> > > > > There are 100's (possibly 1000's) of tunables set implicitly as part of
> > > > > the machine type, and of those, libvirt likely only exposes a few 10's
> > > > > of tunables. The vast majority are low level details that no mgmt app
> > > > > wants to know about, they just want to accept QEMU's new defaults,
> > > > > while preserving machine ABI. This is a good thing. No one wants the
> > > > > burden of wiring up every single tunable into libvirt and mgmt apps.
> > > > > 
> > > > > This is what the "-platform" concept would be intended to preserve. It
> > > > > would allow a way to enable groups of settings that have a platform-level
> > > > > dependency, without ever having to teach either libvirt or the mgmt apps
> > > > > about the individual tunables.
> > > > 
> > > > Do you think we can achieve similar goal by simply turning the feature to
> > > > ON only after a few QEMU releases?  I also mentioned that idea below.
> > > > 
> > > > https://lore.kernel.org/r/ZqQNKZ9_OPhDq2AK@x1n
> > > > 
> > > > So far it really sounds like the right thing to do to me to fix all similar
> > > > issues, even without introducing anything new we need to maintain.
> > > 
> > > Turning a feature with a platform dependency to "on" implies that
> > > the machine type will cease to work out of the box for platforms
> > > which lack the feature. IMHO that's not acceptable behaviour for
> > > any of our supported platforms.
> > 
> > Right, that's why I was thinking whether we should just always be on the
> > safe side, even though, as I just replied in the other email to Akihiko,
> > we do have the option to make this more aggressive by turning those to ON
> > after even 1-2 years or even less.. and we have control of how aggressive
> > this can be.
> > 
> > > 
> > > IOW, "after a few QEMU releases" implies a delay of as much as
> > > 5 years, while we wait for platforms which don't support the
> > > feature to drop out of our supported targets list.  I don't
> > > think that'll satisfy the desire to get the new feature
> > > available to users as soon as practical for their particular
> > > platform.
> > 
> > The feature has been available since day one, right?  We just need the
> > user to opt in, by specifying ON in the cmdline.
> > 
> > My take on this is that QEMU's default VM setup should always be
> > bootable, migratable, and so on.  Then users opt in on stuff like this
> > one, where there are implications for the ABI.  The "user" can also
> > include Libvirt.  I mean, when something is really important, Libvirt
> > should, IMHO, opt in by treating it similarly to many CPU properties,
> > and by probing the host first.
> > 
> > IIUC there aren't a lot of things like that (part of the guest ABI and
> > host kernel / HW dependent), am I right?  Otherwise I would expect more
> > failures like this one, but it isn't that common yet.  IIUC that means
> > the effort to get Libvirt involved should hopefully be under control
> > too.  The worst case is that Libvirt doesn't auto-enable it, but again
> > the user always has the option to turn it on when necessary.
> 
> If it is left to libvirt, then it would very likely end up being a user
> opt-in, not auto-enabled.

Not sure whether there are other opinions, but that's definitely fine by me.

I think it even makes more sense, as even if Libvirt probed the host and
auto-enabled the feature, it would mean Libvirt made a decision for the
user, saying "having better performance" is more important than "being
able to migrate this VM everywhere".

I don't see a way to make such a decision fairly besides always requiring
the user to opt in for those; then the user is fully aware of what is
enabled, with the hope that when a migration later fails with "target host
doesn't support feature XXX" the user is crystal clear about what happened.

Thanks,
Michael S. Tsirkin July 30, 2024, 7:22 p.m. UTC | #39
On Tue, Jul 30, 2024 at 03:11:03PM -0400, Peter Xu wrote:
> On Tue, Jul 30, 2024 at 07:46:12PM +0100, Daniel P. Berrangé wrote:
> > On Tue, Jul 30, 2024 at 02:13:51PM -0400, Peter Xu wrote:
> > > On Mon, Jul 29, 2024 at 06:26:41PM +0100, Daniel P. Berrangé wrote:
> > > > On Mon, Jul 29, 2024 at 01:00:30PM -0400, Peter Xu wrote:
> > > > > On Mon, Jul 29, 2024 at 04:58:03PM +0100, Daniel P. Berrangé wrote:
> > > > > > 
> > > > > > We've got two mutually conflicting goals with the machine type
> > > > > > definitions.
> > > > > > 
> > > > > > Primarily we use them to ensure stable ABI, but an important
> > > > > > secondary goal is to enable new tunables to have new defaults
> > > > > > set, without having to update every mgmt app.  The latter
> > > > > > works very well when the defaults have no dependency on the
> > > > > > platform kernel/OS, but breaks migration when they do have a
> > > > > > platform dependency.
> > > > > > 
> > > > > > >   - Firstly, never quietly flipping any bit that affects the ABI...
> > > > > > > 
> > > > > > >   - Have a default value of off, then QEMU will always allow the VM to boot
> > > > > > >     by default, while advanced users can opt-in on new features.  We can't
> > > > > > >     make this ON by default otherwise some VMs can already fail to boot,
> > > > > > > 
> > > > > > >   - If the host doesn't support the feature while the cmdline enabled it,
> > > > > > >     it needs to fail QEMU boot rather than flipping, so that it says "hey,
> > > > > > >     this host does not support running such VM specified, due to XXX
> > > > > > >     feature missing".
> > > > > > > 
> > > > > > > That's the only way a user could understand what happened, and IMHO that's
> > > > > > > a clean way that we stick with the QEMU cmdline on defining the guest ABI,
> > > > > > > in which the machine type is the foundation of such a definition, as the
> > > > > > > machine type decides many of the rest of the compat properties.  And that's
> > > > > > > the whole point of the compat properties too (to make sure the guest ABI is
> > > > > > > stable).
> > > > > > > 
> > > > > > > If kernel breaks it easily, all compat property things that we maintain can
> > > > > > > already stop making sense in general, because it didn't define the whole
> > > > > > > guest ABI..
> > > > > > > 
> > > > > > > So AFAIU that's really what we used for years, I hope I didn't overlook
> > > > > > > something.  And maybe we don't yet need the "-platform" layer if we can
> > > > > > > keep up with this rule?
> > > > > > 
> > > > > > We've failed at this for years wrt enabling use of new defaults that have
> > > > > > a platform dependency, so historical practice isn't a good reference.
> > > > > > 
> > > > > > There are 100's (possibly 1000's) of tunables set implicitly as part of
> > > > > > the machine type, and of those, libvirt likely only exposes a few 10's
> > > > > > of tunables. The vast majority are low level details that no mgmt app
> > > > > > wants to know about, they just want to accept QEMU's new defaults,
> > > > > > while preserving machine ABI. This is a good thing. No one wants the
> > > > > > burden of wiring up every single tunable into libvirt and mgmt apps.
> > > > > > 
> > > > > > This is what the "-platform" concept would be intended to preserve. It
> > > > > > would allow a way to enable groups of settings that have a platform-level
> > > > > > dependency, without ever having to teach either libvirt or the mgmt apps
> > > > > > about the individual tunables.
> > > > > 
> > > > > Do you think we can achieve similar goal by simply turning the feature to
> > > > > ON only after a few QEMU releases?  I also mentioned that idea below.
> > > > > 
> > > > > https://lore.kernel.org/r/ZqQNKZ9_OPhDq2AK@x1n
> > > > > 
> > > > > So far it really sounds like the right thing to do to me to fix all similar
> > > > > issues, even without introducing anything new we need to maintain.
> > > > 
> > > > Turning a feature with a platform dependency to "on" implies that
> > > > the machine type will cease to work out of the box for platforms
> > > > which lack the feature. IMHO that's not acceptable behaviour for
> > > > any of our supported platforms.
> > > 
> > > Right, that's why I was thinking whether we should just always be on the
> > > safe side, even though, as I just replied in the other email to Akihiko,
> > > we do have the option to make this more aggressive by turning those to ON
> > > after even 1-2 years or even less.. and we have control of how aggressive
> > > this can be.
> > > 
> > > > 
> > > > IOW, "after a few QEMU releases" implies a delay of as much as
> > > > 5 years, while we wait for platforms which don't support the
> > > > feature to drop out of our supported targets list.  I don't
> > > > think that'll satisfy the desire to get the new feature
> > > > available to users as soon as practical for their particular
> > > > platform.
> > > 
> > > The feature has been available since day one, right?  We just need the
> > > user to opt in, by specifying ON in the cmdline.
> > > 
> > > My take on this is that QEMU's default VM setup should always be
> > > bootable, migratable, and so on.  Then users opt in on stuff like this
> > > one, where there are implications for the ABI.  The "user" can also
> > > include Libvirt.  I mean, when something is really important, Libvirt
> > > should, IMHO, opt in by treating it similarly to many CPU properties,
> > > and by probing the host first.
> > > 
> > > IIUC there aren't a lot of things like that (part of the guest ABI and
> > > host kernel / HW dependent), am I right?  Otherwise I would expect more
> > > failures like this one, but it isn't that common yet.  IIUC that means
> > > the effort to get Libvirt involved should hopefully be under control
> > > too.  The worst case is that Libvirt doesn't auto-enable it, but again
> > > the user always has the option to turn it on when necessary.
> > 
> > If it is left to libvirt, then it would very likely end up being a user
> > opt-in, not auto-enabled.
> 
> Not sure whether there are other opinions, but that's definitely fine by me.
> 
> I think it even makes more sense, as even if Libvirt probed the host and
> auto-enabled the feature, it would mean Libvirt made a decision for the
> user, saying "having better performance" is more important than "being
> able to migrate this VM everywhere".
> 
> I don't see a way to make such a decision fairly besides always requiring
> the user to opt in for those; then the user is fully aware of what is
> enabled, with the hope that when a migration later fails with "target host
> doesn't support feature XXX" the user is crystal clear about what happened.
> 
> Thanks,
> 
> -- 
> Peter Xu


This is not what we did historically. Why should we start now?
Peter Xu July 30, 2024, 8:03 p.m. UTC | #40
On Tue, Jul 30, 2024 at 03:22:50PM -0400, Michael S. Tsirkin wrote:
> This is not what we did historically. Why should we start now?

It's a matter of whether we still want migration to randomly fail, like
what this patch does.

Or any better suggestions?  I'm definitely open to that.

Thanks,
Michael S. Tsirkin July 30, 2024, 9:32 p.m. UTC | #41
On Tue, Jul 30, 2024 at 04:03:53PM -0400, Peter Xu wrote:
> On Tue, Jul 30, 2024 at 03:22:50PM -0400, Michael S. Tsirkin wrote:
> > This is not what we did historically. Why should we start now?
> 
> It's a matter of whether we still want migration to randomly fail, like
> what this patch does.
> 
> Or any better suggestions?  I'm definitely open to that.
> 
> Thanks,
> 
> -- 
> Peter Xu

Randomly is an overstatement. You need to switch between kernels
where this feature differs. We did it with a ton of features
in the past, dunno why we single out USO now.

Basically downstreams just don't separately add kernel features vs
qemu features. There's little reason for them to do so.
Peter Xu July 30, 2024, 10:01 p.m. UTC | #42
On Tue, Jul 30, 2024 at 05:32:48PM -0400, Michael S. Tsirkin wrote:
> On Tue, Jul 30, 2024 at 04:03:53PM -0400, Peter Xu wrote:
> > On Tue, Jul 30, 2024 at 03:22:50PM -0400, Michael S. Tsirkin wrote:
> > > This is not what we did historically. Why should we start now?
> > 
> > It's a matter of whether we still want migration to randomly fail, like
> > what this patch does.
> > 
> > Or any better suggestions?  I'm definitely open to that.
> > 
> > Thanks,
> > 
> > -- 
> > Peter Xu
> 
> Randomly is an overstatement. You need to switch between kernels
> where this feature differs. We did it with a ton of features
> in the past, dunno why we single out USO now.

Right, my previous comment should apply to all such features, so it's not
solely about USO*.

For old features that Jason mentioned that can also be auto-OFF, my wild
guess was that most of them should be supported in most of the kernels that
people are using, so they're fine.  Otherwise I don't see what stops it
from happening in other features too.  And that's also why I am thinking
maybe we don't need to fix old features, but for this USO* one - I'm not
sure yet; it could hit already.

For the future, I definitely want to avoid such issue; that's also one
major reason / goal I wanted to discuss this thoroughly this time..

> 
> Basically downstreams just don't separately add kernel features vs
> qemu features. There's little reason for them to do so.

But we hit this bug in downstream tests..  IIUC it means this is not the
case?

To be explicit, for some RHEL9 version we added the USO* features to QEMU, but
not yet to the kernel TAP driver.  AFAIU that's the context where we
tripped over this failure: some systems support the QEMU feature
but not the kernel one, while some newer systems support
both.  Then we hit this when migrating back to the older RHEL9 system.

Thanks,
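
For reference, the host-side capability check this hinges on boils down
to a tap ioctl.  A minimal sketch (assuming kernel headers new enough to
define TUN_F_USO4/TUN_F_USO6; the actual QEMU helper may differ
slightly):

    #include <sys/ioctl.h>
    #include <linux/if_tun.h>

    /* Returns 1 if the kernel tap driver accepts the USO offload bits. */
    static int probe_tap_uso(int tap_fd)
    {
        unsigned offload = TUN_F_CSUM | TUN_F_USO4 | TUN_F_USO6;

        /* Kernels without USO reject unknown TUN_F_* bits with EINVAL. */
        return ioctl(tap_fd, TUNSETOFFLOAD, offload) == 0;
    }

On a kernel without USO the probe fails, the features get masked, and the
two sides of the migration end up with different guest ABIs - which is
the mismatch being debated here.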
Jason Wang July 31, 2024, 2:01 a.m. UTC | #43
On Wed, Jul 31, 2024 at 5:33 AM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Tue, Jul 30, 2024 at 04:03:53PM -0400, Peter Xu wrote:
> > On Tue, Jul 30, 2024 at 03:22:50PM -0400, Michael S. Tsirkin wrote:
> > > This is not what we did historically. Why should we start now?
> >
> > It's a matter of whether we still want migration to randomly fail, like
> > what this patch does.
> >
> > Or any better suggestions?  I'm definitely open to that.
> >
> > Thanks,
> >
> > --
> > Peter Xu
>
> Randomly is an overstatement. You need to switch between kernels
> where this feature differs. We did it with a ton of features
> in the past, dunno why we single out USO now.

I guess the reason is that the offload features other than USO
landed in early kernels, so we never had a chance to hit
this case. But this is not the case for USO.

Thanks

>
> Basically downstreams just don't separately add kernel features vs
> qemu features. There's little reason for them to do so.
>
>
>
> --
> MST
>
Jason Wang July 31, 2024, 2:05 a.m. UTC | #44
On Tue, Jul 30, 2024 at 6:23 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
>
> On 2024/07/30 12:45, Jason Wang wrote:
> > On Tue, Jul 30, 2024 at 11:29 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
> >>
> >> On 2024/07/30 12:17, Jason Wang wrote:
> >>> On Tue, Jul 30, 2024 at 11:12 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
> >>>>
> >>>> On 2024/07/30 12:03, Jason Wang wrote:
> >>>>> On Tue, Jul 30, 2024 at 10:57 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
> >>>>>>
> >>>>>> On 2024/07/30 11:04, Jason Wang wrote:
> >>>>>>> On Tue, Jul 30, 2024 at 12:43 AM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
> >>>>>>>>
> >>>>>>>> On 2024/07/29 23:29, Peter Xu wrote:
> >>>>>>>>> On Mon, Jul 29, 2024 at 01:45:12PM +0900, Akihiko Odaki wrote:
> >>>>>>>>>> On 2024/07/29 12:50, Jason Wang wrote:
> >>>>>>>>>>> On Sun, Jul 28, 2024 at 11:19 PM Akihiko Odaki <akihiko.odaki@daynix.com> wrote:
> >>>>>>>>>>>>
> >>>>>>>>>>>> On 2024/07/27 5:47, Peter Xu wrote:
> >>>>>>>>>>>>> On Fri, Jul 26, 2024 at 04:17:12PM +0100, Daniel P. Berrangé wrote:
> >>>>>>>>>>>>>> On Fri, Jul 26, 2024 at 10:43:42AM -0400, Peter Xu wrote:
> >>>>>>>>>>>>>>> On Fri, Jul 26, 2024 at 09:48:02AM +0100, Daniel P. Berrangé wrote:
> >>>>>>>>>>>>>>>> On Fri, Jul 26, 2024 at 09:03:24AM +0200, Thomas Huth wrote:
> >>>>>>>>>>>>>>>>> On 26/07/2024 08.08, Michael S. Tsirkin wrote:
> >>>>>>>>>>>>>>>>>> On Thu, Jul 25, 2024 at 06:18:20PM -0400, Peter Xu wrote:
> >>>>>>>>>>>>>>>>>>> On Tue, Aug 01, 2023 at 01:31:48AM +0300, Yuri Benditovich wrote:
> >>>>>>>>>>>>>>>>>>>> USO features of virtio-net device depend on kernel ability
> >>>>>>>>>>>>>>>>>>>> to support them, for backward compatibility by default the
> >>>>>>>>>>>>>>>>>>>> features are disabled on 8.0 and earlier.
> >>>>>>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>>>>>> Signed-off-by: Yuri Benditovich <yuri.benditovich@daynix.com>
> >>>>>>>>>>>>>>>>>>>> Signed-off-by: Andrew Melnychecnko <andrew@daynix.com>
> >>>>>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>>>>> Looks like this patch broke migration when the VM starts on a host that has
> >>>>>>>>>>>>>>>>>>> USO supported, to another host that doesn't..
> >>>>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>>>> This was always the case with all offloads. The answer at the moment is,
> >>>>>>>>>>>>>>>>>> don't do this.
> >>>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>>> May I ask for my understanding:
> >>>>>>>>>>>>>>>>> "don't do this" = don't automatically enable/disable virtio features in QEMU
> >>>>>>>>>>>>>>>>> depending on host kernel features, or "don't do this" = don't try to migrate
> >>>>>>>>>>>>>>>>> between machines that have different host kernel features?
> >>>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>>>> Long term, we need to start exposing management APIs
> >>>>>>>>>>>>>>>>>> to discover this, and management has to disable unsupported features.
> >>>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>>> Ack, this likely needs some treatments from the libvirt side, too.
> >>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>> When QEMU automatically toggles machine type features based on host
> >>>>>>>>>>>>>>>> kernel, relying on libvirt to then disable them again is impractical,
> >>>>>>>>>>>>>>>> as we cannot assume that the libvirt version people are using knows about
> >>>>>>>>>>>>>>>> newly introduced features. Even if libvirt is updated to know about
> >>>>>>>>>>>>>>>> it, people can easily be using a previous libvirt release.
> >>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>> QEMU itself needs to make the machine types do what they are there
> >>>>>>>>>>>>>>>> to do, which is to define a stable machine ABI.
> >>>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>>> What QEMU is missing here is a "platform ABI" concept, to encode
> >>>>>>>>>>>>>>>> sets of features which are tied to specific platform generations.
> >>>>>>>>>>>>>>>> As long as we don't have that we'll keep having these broken
> >>>>>>>>>>>>>>>> migration problems from machine types dynamically changing instead
> >>>>>>>>>>>>>>>> of providing a stable guest ABI.
> >>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>> Any more elaboration on this idea?  Would it be easily feasible in
> >>>>>>>>>>>>>>> implementation?
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> In terms of launching QEMU I'd imagine:
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>>          $QEMU -machine pc-q35-9.1 -platform linux-6.9 ...args...
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> Any virtual machine HW features which are tied to host kernel features
> >>>>>>>>>>>>>> would have their defaults set based on the requested -platform. The
> >>>>>>>>>>>>>> -machine will be fully invariant wrt the host kernel.
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> You would have -platform help to list available platforms, and
> >>>>>>>>>>>>>> a corresponding QMP "query-platforms" command to list what platforms
> >>>>>>>>>>>>>> are supported on a given host OS.
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> Downstream distros can provide their own platform definitions
> >>>>>>>>>>>>>> (eg "linux-rhel-9.5") if they have kernels whose feature set
> >>>>>>>>>>>>>> diverges from upstream due to backports.
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> Mgmt apps won't need to be taught about every single little QEMU
> >>>>>>>>>>>>>> setting whose default is derived from the kernel. Individual
> >>>>>>>>>>>>>> defaults are opaque and controlled by the requested platform.
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> Live migration has clearly defined semantics, and mgmt app can
> >>>>>>>>>>>>>> use query-platforms to validate two hosts are compatible.
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> Omitting -platform should pick the very latest platform that is
> >>>>>>>>>>>>>> compatible with the current host (not necessarily the latest
> >>>>>>>>>>>>>> platform built into QEMU).
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> This seems to add one more layer to maintain, and so far I don't know
> >>>>>>>>>>>>> whether it's a must.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> To put it simple, can we simply rely on qemu cmdline as "the guest ABI"?  I
> >>>>>>>>>>>>> thought it was mostly the case already, except some extremely rare
> >>>>>>>>>>>>> outliers.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> When we have one host that boots up a VM using:
> >>>>>>>>>>>>>
> >>>>>>>>>>>>>          $QEMU1 $cmdline
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> Then another host boots up:
> >>>>>>>>>>>>>
> >>>>>>>>>>>>>          $QEMU2 $cmdline -incoming XXX
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> Then migration should succeed if $cmdline is exactly the same, and the VM
> >>>>>>>>>>>>> can boot up all fine without errors on both sides.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> AFAICT this has nothing to do with what kernel is underneath, even not
> >>>>>>>>>>>>> Linux?  I think either QEMU1 / QEMU2 has the option to fail.  But if it
> >>>>>>>>>>>>> didn't, I thought the ABI should be guaranteed.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> That's why I think this is a migration violation, as 99.99% of other device
> >>>>>>>>>>>>> properties should be following this rule.  The issue here is, we have the
> >>>>>>>>>>>>> same virtio-net-pci cmdline on both sides in this case, but the ABI got
> >>>>>>>>>>>>> broken.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> That's also why I was suggesting if the property contributes to the guest
> >>>>>>>>>>>>> ABI, then AFAIU QEMU needs to:
> >>>>>>>>>>>>>
> >>>>>>>>>>>>>          - Firstly, never quietly flipping any bit that affects the ABI...
> >>>>>>>>>>>>>
> >>>>>>>>>>>>>          - Have a default value of off, then QEMU will always allow the VM to boot
> >>>>>>>>>>>>>            by default, while advanced users can opt-in on new features.  We can't
> >>>>>>>>>>>>>            make this ON by default otherwise some VMs can already fail to boot,
> >>>>>>>>>>>>
> >>>>>>>>>>>> It may not necessarily be the case that old features are supported by
> >>>>>>>>>>>> every system. In an extreme case, a user may migrate a VM from Linux to
> >>>>>>>>>>>> Windows, which probably doesn't support any offloading at all. A more
> >>>>>>>>>>>> convincing scenario is RSS offloading with eBPF; using eBPF requires a
> >>>>>>>>>>>> privilege so we cannot assume it is always available even on the latest
> >>>>>>>>>>>> version of Linux.
> >>>>>>>>>>>
> >>>>>>>>>>> I don't get why eBPF matters here. It is something that is not noticed
> >>>>>>>>>>> by the guest and we have a fallback anyhow.
> >>>>>>>>
> >>>>>>>> It is noticeable for the guest, and the fallback is not effective with
> >>>>>>>> vhost.
> >>>>>>>
> >>>>>>> It's a bug then. Qemu can fallback to tuntap if it sees issues in vhost.
> >>>>>>
> >>>>>> We can certainly fallback to in-QEMU RSS by disabling vhost, but I would
> >>>>>> not say lack of such fallback is a bug.
> >>>>>
> >>>>> Such fallback is by design since the introduction of vhost.
> >>>>>
> >>>>>> We don't provide in-QEMU
> >>>>>> fallback for other offloads.
> >>>>>
> >>>>> Yes but what I want to say is that eBPF RSS is different from those
> >>>>> segmentation offloads. And technically, Qemu can do fallback for
> >>>>> offloads (as RSC did).
> >>>>
> >>>> Well, I couldn't find any code disabling vhost for the in-QEMU RSC
> >>>> implementation.
> >>>
> >>> It should be a bug (and I remember we disabled vhost when the patches
> >>> were merged). Have you tested it in a guest to see if it can see RSC
> >>> when vhost is enabled?
> >>>
> >>> I suspect we need to add the RSC bit into current kernel_feature_bits:
> >>>
> >>> /* Features supported by host kernel. */
> >>> static const int kernel_feature_bits[] = {
> >>>       VIRTIO_F_NOTIFY_ON_EMPTY,
> >>>       VIRTIO_RING_F_INDIRECT_DESC,
> >>>       VIRTIO_RING_F_EVENT_IDX,
> >>>       VIRTIO_NET_F_MRG_RXBUF,
> >>>       VIRTIO_F_VERSION_1,
> >>>       VIRTIO_NET_F_MTU,
> >>>       VIRTIO_F_IOMMU_PLATFORM,
> >>>       VIRTIO_F_RING_PACKED,
> >>>       VIRTIO_F_RING_RESET,
> >>>       VIRTIO_NET_F_HASH_REPORT,
> >>>       VHOST_INVALID_FEATURE_BIT
> >>> };
> >>>
> >>> As RSC won't be provided by TUN/TAP anyhow.
> >>
> >> Adding the RSC bit does not let QEMU disable vhost for RSC, but instead
> >> it implicitly disables RSC in my understanding.
> >
> > Yes.
> >
> >> It is still better than
> >> advertising the availability of that feature while it is missing.
> >
> > Down the road, we probably need to change the behaviour of disabling vhost-net.
> >
> >>
> >>>
> >>>>
> >>>> Looking at the code, I also found the case of vhost-vdpa. vhost can be
> >>>> simply disabled if it is backed by tuntap, but it is not the case for vDPA.
> >>>
> >>> True, technically, vDPA can fallback to SVQ, but it's another topic.
> >>
> >> My point of this discussion is that we cannot enable features just
> >> because they are sufficiently old or because the user claims QEMU runs
> >> on a sufficiently new Linux. eBPF requires privilege, and vDPA requires
> >> a hardware feature. A fallback is not a silver bullet either, and there
> >> are situations where providing a fallback is not a trivial task.
> >
> > To make sure we are on the same page. I just want to point out that
> > eBPF RSS is not a good example in this context.
> >
> > It works only for tuntap, so we should stick to the behaviour of
> > trying to fall back to userspace when we can, as we already have a
> > userspace fallback. This is the fundamental difference with other
> > features (like segmentation offload) or backend (vDPA) that doesn't
> > have an existing fallback.
>
> Some (probably not all) offloads are implemented in hw/net/net_tx_pkt.c.
> They are not wired up to behave as a fallback when tuntap's vhost is
> enabled, just as the in-QEMU RSS is not. In either case, we need to put
> some effort into wiring things up.
>
> I'm not sure it is worthwhile. I think there is a high chance that
> selectively disabling vhost and keeping RSS enabled with fallback will
> result in worse performance than keeping vhost enabled and disabling
> RSS. Such a fallback can still function as an emergency escape hatch,
> but it is also incomplete as we don't have fallbacks for other features.

The reason is that we depend on ioctl to configure and negotiate with
tuntap correctly.

> I would rather make any features missing in the vhost backend fail to
> keep things consistent.

You might be right but it's too late to do that.

Thanks

>
> Regards,
> Akihiko Odaki
>
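
For clarity, Jason's suggestion would amount to appending the guest RSC
bit to the list quoted above, so vhost-net stops advertising a feature
TUN/TAP cannot provide.  A sketch (whether VIRTIO_NET_F_RSC_EXT is the
right bit to add is exactly what is being discussed):

    /* Features supported by host kernel. */
    static const int kernel_feature_bits[] = {
        VIRTIO_F_NOTIFY_ON_EMPTY,
        VIRTIO_RING_F_INDIRECT_DESC,
        VIRTIO_RING_F_EVENT_IDX,
        VIRTIO_NET_F_MRG_RXBUF,
        VIRTIO_F_VERSION_1,
        VIRTIO_NET_F_MTU,
        VIRTIO_F_IOMMU_PLATFORM,
        VIRTIO_F_RING_PACKED,
        VIRTIO_F_RING_RESET,
        VIRTIO_NET_F_HASH_REPORT,
        VIRTIO_NET_F_RSC_EXT,       /* new: mask RSC when vhost is in use */
        VHOST_INVALID_FEATURE_BIT
    };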
Daniel P. Berrangé July 31, 2024, 7:04 a.m. UTC | #45
On Tue, Jul 30, 2024 at 05:32:48PM -0400, Michael S. Tsirkin wrote:
> On Tue, Jul 30, 2024 at 04:03:53PM -0400, Peter Xu wrote:
> > On Tue, Jul 30, 2024 at 03:22:50PM -0400, Michael S. Tsirkin wrote:
> > > This is not what we did historically. Why should we start now?
> > 
> > It's a matter of whether we still want migration to randomly fail, like
> > what this patch does.
> > 
> > Or any better suggestions?  I'm definitely open to that.
> > 
> > Thanks,
> > 
> > -- 
> > Peter Xu
> 
> Randomly is an overstatement. You need to switch between kernels
> where this feature differs. We did it with a ton of features
> in the past, dunno why we single out USO now.

This has been a problem with a ton of features in the past. We've
ignored the problem, but that doesn't make it the right solution

With regards,
Daniel
Michael S. Tsirkin July 31, 2024, 7:41 a.m. UTC | #46
On Wed, Jul 31, 2024 at 08:04:24AM +0100, Daniel P. Berrangé wrote:
> On Tue, Jul 30, 2024 at 05:32:48PM -0400, Michael S. Tsirkin wrote:
> > On Tue, Jul 30, 2024 at 04:03:53PM -0400, Peter Xu wrote:
> > > On Tue, Jul 30, 2024 at 03:22:50PM -0400, Michael S. Tsirkin wrote:
> > > > This is not what we did historically. Why should we start now?
> > > 
> > > It's a matter of whether we still want migration to randomly fail, like
> > > what this patch does.
> > > 
> > > Or any better suggestions?  I'm definitely open to that.
> > > 
> > > Thanks,
> > > 
> > > -- 
> > > Peter Xu
> > 
> > Randomly is an overstatement. You need to switch between kernels
> > where this feature differs. We did it with a ton of features
> > in the past, dunno why we single out USO now.
> 
> This has been a problem with a ton of features in the past. We've
> ignored the problem, but that doesn't make it the right solution
> 
> With regards,
> Daniel

Pushing it to domain xml does not really help,
migration will still fail unexpectedly (after wasting
a ton of resources copying memory, and getting
a downtime bump, I might add).

The right solution is to have a tool that can query
backends, and that given the results from all of the cluster,
generate a set of parameters that will ensure migration works.
Kind of like qemu-img, but for migration.
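
A purely hypothetical sketch of how such a tool could be driven (the
tool name and every option below are invented for illustration; only the
virtio-net properties in the last line are real):

    $ qemu-migrate-caps probe > hostA.caps       # run once on each host
    $ qemu-migrate-caps probe > hostB.caps
    $ qemu-migrate-caps merge hostA.caps hostB.caps > cluster.caps
    $ qemu-migrate-caps args cluster.caps        # emit safe device flags
    -device virtio-net-pci,host_uso=off,guest_uso4=off,guest_uso6=off ...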
Peter Xu July 31, 2024, 12:57 p.m. UTC | #47
On Wed, Jul 31, 2024 at 03:41:00AM -0400, Michael S. Tsirkin wrote:
> On Wed, Jul 31, 2024 at 08:04:24AM +0100, Daniel P. Berrangé wrote:
> > On Tue, Jul 30, 2024 at 05:32:48PM -0400, Michael S. Tsirkin wrote:
> > > On Tue, Jul 30, 2024 at 04:03:53PM -0400, Peter Xu wrote:
> > > > On Tue, Jul 30, 2024 at 03:22:50PM -0400, Michael S. Tsirkin wrote:
> > > > > This is not what we did historically. Why should we start now?
> > > > 
> > > > It's a matter of whether we still want migration to randomly fail, like
> > > > what this patch does.
> > > > 
> > > > Or any better suggestions?  I'm definitely open to that.
> > > > 
> > > > Thanks,
> > > > 
> > > > -- 
> > > > Peter Xu
> > > 
> > > Randomly is an overstatement. You need to switch between kernels
> > > where this feature differs. We did it with a ton of features
> > > in the past, dunno why we single out USO now.
> > 
> > This has been a problem with a ton of features in the past. We've
> > ignored the problem, but that doesn't make it the right solution
> > 
> > With regards,
> > Daniel
> 
> Pushing it to domain xml does not really help,
> migration will still fail unexpectedly (after wasting
> a ton of resources copying memory, and getting
> a downtime bump, I might add).

Could you elaborate why it would fail if with what I proposed?

Note that if this is a generic comment about "any migration can fail if we
found a device mismatch", we have a plan to fix that to some degree. It's
just that we don't have enough people working on these topics yet. See:

https://wiki.qemu.org/ToDo/LiveMigration#Migration_handshake

It includes:

 "Check device tree on both sides, etc., to make sure the migration is
  applicable. E.g., we should fail early and clearly on any device
  mismatch."

However I don't think it'll cover all checks, e.g. I _think_ even if we
verify VMSDs then post_load() hooks can still fail, and there can be some
corner cases to think about.  And of course, this may not even apply to virtio
since virtio manages migration itself, without providing a top-level vmsd.

> 
> The right solution is to have a tool that can query
> backends, and that given the results from all of the cluster,
> generate a set of parameters that will ensure migration works.
> Kind of like qemu-img, but for migration.

This is adding extra work, IMHO.

If we stick with "qemu cmdline as guest ABI" concept, I think we're all
fine, as that work is done by QEMU booting up first on both sides,
including dest.  Basically Libvirt already plays this role of the new tool
without any new code to be added at all: what's captured in the boot failure
log would be the output of that tool if we wrote it.

Thanks,
Jason Wang Aug. 1, 2024, 2:28 a.m. UTC | #48
On Wed, Jul 31, 2024 at 8:58 PM Peter Xu <peterx@redhat.com> wrote:
>
> On Wed, Jul 31, 2024 at 03:41:00AM -0400, Michael S. Tsirkin wrote:
> > On Wed, Jul 31, 2024 at 08:04:24AM +0100, Daniel P. Berrangé wrote:
> > > On Tue, Jul 30, 2024 at 05:32:48PM -0400, Michael S. Tsirkin wrote:
> > > > On Tue, Jul 30, 2024 at 04:03:53PM -0400, Peter Xu wrote:
> > > > > On Tue, Jul 30, 2024 at 03:22:50PM -0400, Michael S. Tsirkin wrote:
> > > > > > This is not what we did historically. Why should we start now?
> > > > >
> > > > > It's a matter of whether we still want migration to randomly fail, like
> > > > > what this patch does.
> > > > >
> > > > > Or any better suggestions?  I'm definitely open to that.
> > > > >
> > > > > Thanks,
> > > > >
> > > > > --
> > > > > Peter Xu
> > > >
> > > > Randomly is an overstatement. You need to switch between kernels
> > > > where this feature differs. We did it with a ton of features
> > > > in the past, dunno why we single out USO now.
> > >
> > > This has been a problem with a ton of features in the past. We've
> > > ignored the problem, but that doesn't make it the right solution
> > >
> > > With regards,
> > > Daniel
> >
> > Pushing it to domain xml does not really help,
> > migration will still fail unexpectedly (after wasting
> > a ton of resources copying memory, and getting
> > a downtime bump, I might add).
>
> Could you elaborate why it would fail if with what I proposed?
>
> Note that if this is a generic comment about "any migration can fail if we
> found a device mismatch", we have a plan to fix that to some degree. It's
> just that we don't have enough people working on these topics yet. See:
>
> https://wiki.qemu.org/ToDo/LiveMigration#Migration_handshake
>
> It includes:
>
>  "Check device tree on both sides, etc., to make sure the migration is
>   applicable. E.g., we should fail early and clearly on any device
>   mismatch."
>
> However I don't think it'll cover all checks, e.g. I _think_ even if we
> verify VMSDs then post_load() hooks can still fail, and there can be some
> corner cases to think about.  And of course, this may not even apply to virtio
> since virtio manages migration itself, without providing a top-level vmsd.
>
> >
> > The right solution is to have a tool that can query
> > backends, and that given the results from all of the cluster,
> > generate a set of parameters that will ensure migration works.

This seems to be very hard for vhost-user.

> > Kind of like qemu-img, but for migration.
>
> This is adding extra work, IMHO.
>
> If we stick with "qemu cmdline as guest ABI" concept, I think we're all
> fine, as that work is done by QEMU booting up first on both sides,
> including dest.

Probably, letting Qemu probe is much easier than rewriting the
probe in the upper layer.

>  Basically Libvirt already plays this role of the new tool
> without any new code to be added at all: what's captured in the boot failure
> log would be the output of that tool if we wrote it.
>
> Thanks,

Thanks

>
> --
> Peter Xu
>
Akihiko Odaki Aug. 1, 2024, 5:05 a.m. UTC | #49
On 2024/07/31 4:11, Peter Xu wrote:
> On Tue, Jul 30, 2024 at 07:46:12PM +0100, Daniel P. Berrangé wrote:
>> On Tue, Jul 30, 2024 at 02:13:51PM -0400, Peter Xu wrote:
>>> On Mon, Jul 29, 2024 at 06:26:41PM +0100, Daniel P. Berrangé wrote:
>>>> On Mon, Jul 29, 2024 at 01:00:30PM -0400, Peter Xu wrote:
>>>>> On Mon, Jul 29, 2024 at 04:58:03PM +0100, Daniel P. Berrangé wrote:
>>>>>>
>>>>>> We've got two mutually conflicting goals with the machine type
>>>>>> definitions.
>>>>>>
>>>>>> Primarily we use them to ensure stable ABI, but an important
>>>>>> secondary goal is to enable new tunables to have new defaults
>>>>>> set, without having to update every mgmt app.  The latter
>>>>>> works very well when the defaults have no dependency on the
>>>>>> platform kernel/OS, but breaks migration when they do have a
>>>>>> platform dependency.
>>>>>>
>>>>>>>    - Firstly, never quietly flipping any bit that affects the ABI...
>>>>>>>
>>>>>>>    - Have a default value of off, then QEMU will always allow the VM to boot
>>>>>>>      by default, while advanced users can opt-in on new features.  We can't
>>>>>>>      make this ON by default otherwise some VMs can already fail to boot,
>>>>>>>
>>>>>>>    - If the host doesn't support the feature while the cmdline enabled it,
>>>>>>>      it needs to fail QEMU boot rather than flipping, so that it says "hey,
>>>>>>>      this host does not support running such VM specified, due to XXX
>>>>>>>      feature missing".
>>>>>>>
>>>>>>> That's the only way a user could understand what happened, and IMHO that's
>>>>>>> a clean way that we stick with QEMU cmdline on defining the guest ABI,
>>>>>>> while in which the machine type is the foundation of such definition, as the
>>>>>>> machine type can decides many of the rest compat properties.  And that's
>>>>>>> the whole point of the compat properties too (to make sure the guest ABI is
>>>>>>> stable).
>>>>>>>
>>>>>>> If kernel breaks it easily, all compat property things that we maintain can
>>>>>>> already stop making sense in general, because it didn't define the whole
>>>>>>> guest ABI..
>>>>>>>
>>>>>>> So AFAIU that's really what we used for years, I hope I didn't overlook
>>>>>>> something.  And maybe we don't yet need the "-platform" layer if we can
>>>>>>> keep up with this rule?
>>>>>>
>>>>>> We've failed at this for years wrt enabling use of new defaults that have
>>>>>> a platform dependency, so historical practice isn't a good reference.
>>>>>>
>>>>>> There are 100's (possibly 1000's) of tunables set implicitly as part of
>>>>>> the machine type, and of those, libvirt likely only exposes a few 10's
>>>>>> of tunables. The vast majority are low level details that no mgmt app
>>>>>> wants to know about, they just want to accept QEMU's new defaults,
>>>>>> while preserving machine ABI. This is a good thing. No one wants the
>>>>>> burden of wiring up every single tunable into libvirt and mgmt apps.
>>>>>>
>>>>>> This is what the "-platform" concept would be intended to preserve. It
>>>>>> would allow a way to enable groups of settings that have a platform level
>>>>>> dependency, without ever having to teach either libvirt or the mgmt apps
>>>>>> about the individual tunables.
>>>>>
>>>>> Do you think we can achieve similar goal by simply turning the feature to
>>>>> ON only after a few QEMU releases?  I also mentioned that idea below.
>>>>>
>>>>> https://lore.kernel.org/r/ZqQNKZ9_OPhDq2AK@x1n
>>>>>
>>>>> So far it really sounds like the right thing to do to me to fix all similar
>>>>> issues, even without introducing anything new we need to maintain.
>>>>
>>>> Turning a feature with a platform dependency to "on" implies that
>>>> the machine type will cease to work out of the box for platforms
>>>> which lack the feature. IMHO that's not acceptable behaviour for
>>>> any of our supported platforms.
>>>
>>> Right, that's why I was thinking whether we should just always be on the
>>> safe side, even if I just replied in the other email to Akihiko, that we do
>>> have the option to make this more aggressive by turning those to ON after
>>> even 1-2 years or even less.. and we have control of how aggressive this
>>> can be.
>>>
>>>>
>>>> IOW, "after a few QEMU releases" implies a delay of as much as
>>>> 5 years, while we wait for platforms which don't support the
>>>> feature to drop out of our supported targets list.  I don't
>>>> think that'll satisfy the desire to get the new feature
>>>> available to users as soon as practical for their particular
>>>> platform.
>>>
>>> The feature is always available since the 1st day, right?  We just need the
>>> user to opt-in, by specifying ON in the cmdline.
>>>
>>> That'll be my take on this that QEMU's default VM setup should be always
>>> bootable, migratable, and so on.  Then user opt-in on stuff like this one,
>>> where there's implication on the ABIs.  The "user" can also include
>>> Libvirt.  I mean when something is really important, Libvirt should, IMHO,
>>> opt-in by treating that similarly like many cpu properties, and by probing
>>> the host first.
>>>
>>> IIUC there aren't a lot of things like that (part of guest ABI & host
>>> kernel / HW dependent), am I right?  Otherwise I would expect more failures
>>> like this one, but it isn't as much as that yet.  IIUC it means the efforts
>>> to make Libvirt get involved should be hopefully under control too.  The
>>> worst case is Libvirt doesn't auto-on it, but again the user should always
>>> have the option to turn it on when it's necessary.
>>
>> If it is left to libvirt, then it would very likely end up being a user
>> opt-in, not auto-enabled.
> 
> Not sure whether there's other opinions, but that's definitely fine by me.
> 
> I think it even makes more sense, as even if Libvirt probed the host and
> auto-on the feature, it also means Libvirt made a decision for the user,
> saying "having a better performance" is more important than "being able to
> migrate this VM everywhere".
> 
> I don't see a way that can make such fair decision besides requesting the
> user to opt-in always for those, then the user is fully aware what is
> enabled, with the hope that when a migration fails later with "target host
> doesn't support feature XXX" the user is crystal clear on what happened.

I think it is better to distinguish saying "having a better performance 
is more important than being able to migrate this VM everywhere" from 
explicitly selecting all available offload features; the latter is a lot
of chores. More importantly, users may simply not know these features can
prevent migration; they may just look like performance features that are
nice to have at first glance.

I don't think what a user wants is individual performance
knobs; a user is more likely to need to express the platforms they
want to migrate VMs on. There are several possible scenarios in
particular:
1) Migration everywhere
2) Migration on specific machines
3) Migration on some known platforms
4) No migration (migration on nowhere)

If a user chooses 1-3), QEMU may reject platform-dependent features even 
if the user requests one; in this way, we don't need the users to make 
things crystal clear, but we can expect QEMU to do so.

If a user chooses 2-4), QEMU may enable all offloading features 
available on the specified platforms. Again, the user will no longer 
have to know each individual performance features. QEMU may also reject 
migration to platforms not specified to prevent misconfiguration.

The -platform proposal earlier corresponds to 3). However it has a 
downside that QEMU needs to know about platforms, which may not be 
trivial. In that case, we can support 1), 2), and 4).

Regards,
Akihiko Odaki
Akihiko Odaki Aug. 1, 2024, 5:28 a.m. UTC | #50
On 2024/08/01 11:28, Jason Wang wrote:
> On Wed, Jul 31, 2024 at 8:58 PM Peter Xu <peterx@redhat.com> wrote:
>>
>> On Wed, Jul 31, 2024 at 03:41:00AM -0400, Michael S. Tsirkin wrote:
>>> On Wed, Jul 31, 2024 at 08:04:24AM +0100, Daniel P. Berrangé wrote:
>>>> On Tue, Jul 30, 2024 at 05:32:48PM -0400, Michael S. Tsirkin wrote:
>>>>> On Tue, Jul 30, 2024 at 04:03:53PM -0400, Peter Xu wrote:
>>>>>> On Tue, Jul 30, 2024 at 03:22:50PM -0400, Michael S. Tsirkin wrote:
>>>>>>> This is not what we did historically. Why should we start now?
>>>>>>
>>>>>> It's a matter of whether we still want migration to randomly fail, like
>>>>>> what this patch does.
>>>>>>
>>>>>> Or any better suggestions?  I'm definitely open to that.
>>>>>>
>>>>>> Thanks,
>>>>>>
>>>>>> --
>>>>>> Peter Xu
>>>>>
>>>>> Randomly is an overstatement. You need to switch between kernels
>>>>> where this feature differs. We did it with a ton of features
>>>>> in the past, dunno why we single out USO now.
>>>>
>>>> This has been a problem with a ton of features in the past. We've
>>>> ignored the problem, but that doesn't make it the right solution
>>>>
>>>> With regards,
>>>> Daniel
>>>
>>> Pushing it to domain xml does not really help,
>>> migration will still fail unexpectedly (after wasting
>>> a ton of resources copying memory, and getting
>>> a downtime bump, I might add).
>>
>> Could you elaborate why it would fail if with what I proposed?
>>
>> Note that if this is a generic comment about "any migration can fail if we
>> found a device mismatch", we have a plan to fix that to some degree. It's
>> just that we don't have enough people working on these topics yet. See:
>>
>> https://wiki.qemu.org/ToDo/LiveMigration#Migration_handshake
>>
>> It includes:
>>
>>   "Check device tree on both sides, etc., to make sure the migration is
>>    applicable. E.g., we should fail early and clearly on any device
>>    mismatch."
>>
>> However I don't think it'll cover all checks, e.g. I _think_ even if we
>> verify VMSDs then post_load() hooks can still fail, and there can be some
>> corner cases to think about.  And of course, this may not even apply to virtio
>> since virtio manages migration itself, without providing a top-level vmsd.
>>
>>>
>>> The right solution is to have a tool that can query
>>> backends, and that given the results from all of the cluster,
>>> generate a set of parameters that will ensure migration works.
> 
> This seems to be very hard for vhost-user.

Can you elaborate more? I was thinking something like the following:
1. Prepare a QEMU command line.
2. Run the command line appended with -dump-platform on all hosts, which 
dumps platform features automatically enabled. For virtio devices, we 
can dump "host_features" variable.
3. Run the command line appended with -merge-platform with all dumps. 
For most virtio devices, this would be AND operations on "host_features" 
variable.
4. Run the command line appended with -use-platform with the merged 
dump. This will run VMs with features available on all hosts.

I may have missed something but this seems good enough for me. Of course 
this requires changes throughout the stack (QEMU common and 
device-specific code, libvirt, and even higher layers like OpenStack).

Regards,
Akihiko Odaki
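
Spelling the proposed flow out (the -dump-platform / -merge-platform /
-use-platform options are part of the proposal above, not existing QEMU
flags; for most virtio devices the merge in step 3 reduces to ANDing the
dumped "host_features" masks):

    $QEMU $cmdline -dump-platform > hostA.plat                          # step 2, host A
    $QEMU $cmdline -dump-platform > hostB.plat                          # step 2, host B
    $QEMU $cmdline -merge-platform hostA.plat hostB.plat > cluster.plat # step 3
    $QEMU $cmdline -use-platform cluster.plat                           # step 4, all hosts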
Michael S. Tsirkin Aug. 1, 2024, 5:34 a.m. UTC | #51
On Wed, Jul 31, 2024 at 08:57:52AM -0400, Peter Xu wrote:
> > 
> > The right solution is to have a tool that can query
> > backends, and that given the results from all of the cluster,
> > generate a set of parameters that will ensure migration works.
> > Kind of like qemu-img, but for migration.
> 
> This is adding extra work, IMHO.

Agreed that it's a lot of work.

> If we stick with "qemu cmdline as guest ABI" concept, I think we're all
> fine, as that work is done by QEMU booting up first on both sides,
> including dest.  Basically Libvirt already plays this role of the new tool
> without any new code to be added at all: what's captured in the boot failure
> log would be the output of that tool if we wrote it.

However, this means we can never add new features without
also teaching libvirt to enable them.


How about we add some kind of command on source qemu
to return a description of all working features?
Then when qemu is started on destination, this data can be passed in,
and validated.

Hmm?
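
Sketched as a flow, with invented option names (nothing like this exists
in QEMU today):

    src$ $QEMU $cmdline -dump-working-features > src.features
    # copy src.features over, then on the destination:
    dst$ $QEMU $cmdline -validate-features src.features -incoming tcp:0:4444

The destination would refuse to start when a listed feature cannot be
enabled, instead of failing mid-migration.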
Michael S. Tsirkin Aug. 1, 2024, 5:38 a.m. UTC | #52
On Tue, Jul 30, 2024 at 02:02:27AM +0900, Akihiko Odaki wrote:
> I think it is simpler to analyze the platform dependency and dump it for the
> management layer. For example, libvirt can request QEMU to analyze the
> platform dependency when it creates a new domain. QEMU will then figure out
> that the host kernel is capable of USO and bake it as a platform dependency.
> 
> Regards,
> Akihiko Odaki

I think for starters, we can just have dump-features as a QEMU command.
Pass it on the command line on the destination.
That achieves the same thing as making userspace pass each flag
manually, but without the pain of teaching management to enable
each new feature.
Michael S. Tsirkin Aug. 1, 2024, 5:41 a.m. UTC | #53
On Fri, Jul 26, 2024 at 07:39:46PM +0200, Thomas Huth wrote:
> Anyway, while we're discussing solutions: We are in softfreeze already.
> Should we disable the UFO bits in the new 9.1 machine type for the time
> being to avoid that more people are running into this problem?

At the moment I'm looking at solutions for 9.2; I don't see how
we can do much for 9.1. I mean, we can move it back to behave like
8.1 (IIRC), but that is not much.
Michael S. Tsirkin Aug. 1, 2024, 5:51 a.m. UTC | #54
On Wed, Jul 31, 2024 at 08:57:52AM -0400, Peter Xu wrote:
> Could you elaborate why it would fail if with what I proposed?

First, I think I was wrong: I misunderstood what you said.
To summarise, you said:

- any new feature depending on another package is off by default
- starting qemu on destination with feature enabled will fail
  thus migration is not started


My comment is that this "started" is from qemu's point of view;
from the user's POV, starting qemu on the destination is just the 1st
step of migration.


However I agree, this is better since we do not waste bandwidth,
and I was wrong to say we do.

My other comment is that adding features becomes even more work
than it is now.

So I suggest a single command that dumps some description of host
features, to be passed to qemu on destination. qemu then fails to
start on destination if some of these do not work.
The advantage is that this also helps things like -cpu host,
and a bunch of other things like vdpa where we like to pass through
config from kernel.

The disadvantage is that it does not exactly *fix* migration,
it just does not let you start it.
Peter Xu Aug. 1, 2024, 3:13 p.m. UTC | #55
On Thu, Aug 01, 2024 at 02:05:54PM +0900, Akihiko Odaki wrote:
> On 2024/07/31 4:11, Peter Xu wrote:
> > On Tue, Jul 30, 2024 at 07:46:12PM +0100, Daniel P. Berrangé wrote:
> > > On Tue, Jul 30, 2024 at 02:13:51PM -0400, Peter Xu wrote:
> > > > On Mon, Jul 29, 2024 at 06:26:41PM +0100, Daniel P. Berrangé wrote:
> > > > > On Mon, Jul 29, 2024 at 01:00:30PM -0400, Peter Xu wrote:
> > > > > > On Mon, Jul 29, 2024 at 04:58:03PM +0100, Daniel P. Berrangé wrote:
> > > > > > > 
> > > > > > > We've got two mutually conflicting goals with the machine type
> > > > > > > definitions.
> > > > > > > 
> > > > > > > Primarily we use them to ensure stable ABI, but an important
> > > > > > > secondary goal is to enable new tunables to have new defaults
> > > > > > > set, without having to update every mgmt app.  The latter
> > > > > > > works very well when the defaults have no dependency on the
> > > > > > > platform kernel/OS, but breaks migration when they do have a
> > > > > > > platform dependency.
> > > > > > > 
> > > > > > > >    - Firstly, never quietly flipping any bit that affects the ABI...
> > > > > > > > 
> > > > > > > >    - Have a default value of off, then QEMU will always allow the VM to boot
> > > > > > > >      by default, while advanced users can opt-in on new features.  We can't
> > > > > > > >      make this ON by default otherwise some VMs can already fail to boot,
> > > > > > > > 
> > > > > > > >    - If the host doesn't support the feature while the cmdline enabled it,
> > > > > > > >      it needs to fail QEMU boot rather than flipping, so that it says "hey,
> > > > > > > >      this host does not support running such VM specified, due to XXX
> > > > > > > >      feature missing".
> > > > > > > > 
> > > > > > > > That's the only way a user could understand what happened, and IMHO that's
> > > > > > > > a clean way that we stick with QEMU cmdline on defining the guest ABI,
> > > > > > > > while in which the machine type is the foundation of such definition, as the
> > > > > > > > machine type can decides many of the rest compat properties.  And that's
> > > > > > > > the whole point of the compat properties too (to make sure the guest ABI is
> > > > > > > > stable).
> > > > > > > > 
> > > > > > > > If kernel breaks it easily, all compat property things that we maintain can
> > > > > > > > already stop making sense in general, because it didn't define the whole
> > > > > > > > guest ABI..
> > > > > > > > 
> > > > > > > > So AFAIU that's really what we used for years, I hope I didn't overlook
> > > > > > > > something.  And maybe we don't yet need the "-platform" layer if we can
> > > > > > > > keep up with this rule?
> > > > > > > 
> > > > > > > We've failed at this for years wrt enabling use of new defaults that have
> > > > > > > a platform dependency, so historical practice isn't a good reference.
> > > > > > > 
> > > > > > > There are 100's (possibly 1000's) of tunables set implicitly as part of
> > > > > > > the machine type, and of those, libvirt likely only exposes a few 10's
> > > > > > > of tunables. The vast majority are low level details that no mgmt app
> > > > > > > wants to know about, they just want to accept QEMU's new defaults,
> > > > > > > while preserving machine ABI. This is a good thing. No one wants the
> > > > > > > burden of wiring up every single tunable into libvirt and mgmt apps.
> > > > > > > 
> > > > > > > This is what the "-platform" concept would be intended to preserve. It
> > > > > > > would allow a way to enable groups of settings that have a platform level
> > > > > > > dependency, without ever having to teach either libvirt or the mgmt apps
> > > > > > > about the individual tunables.
> > > > > > 
> > > > > > Do you think we can achieve similar goal by simply turning the feature to
> > > > > > ON only after a few QEMU releases?  I also mentioned that idea below.
> > > > > > 
> > > > > > https://lore.kernel.org/r/ZqQNKZ9_OPhDq2AK@x1n
> > > > > > 
> > > > > > So far it really sounds like the right thing to do to me to fix all similar
> > > > > > issues, even without introducing anything new we need to maintain.
> > > > > 
> > > > > Turning a feature with a platform dependency to "on" implies that
> > > > > the machine type will cease to work out of the box for platforms
> > > > > which lack the feature. IMHO that's not acceptable behaviour for
> > > > > any of our supported platforms.
> > > > 
> > > > Right, that's why I was thinking whether we should just always be on the
> > > > safe side, even if I just replied in the other email to Akihiko, that we do
> > > > have the option to make this more aggressive by turning those to ON after
> > > > even 1-2 years or even less.. and we have control of how aggressive this
> > > > can be.
> > > > 
> > > > > 
> > > > > IOW, "after a few QEMU releases" implies a delay of as much as
> > > > > 5 years, while we wait for platforms which don't support the
> > > > > feature to drop out of our supported targets list.  I don't
> > > > > think that'll satisfy the desire to get the new feature
> > > > > available to users as soon as practical for their particular
> > > > > platform.
> > > > 
> > > > The feature is always available since the 1st day, right?  We just need the
> > > > user to opt-in, by specifying ON in the cmdline.
> > > > 
> > > > That'll be my take on this that QEMU's default VM setup should be always
> > > > bootable, migratable, and so on.  Then user opt-in on stuff like this one,
> > > > where there's implication on the ABIs.  The "user" can also include
> > > > Libvirt.  I mean when something is really important, Libvirt should, IMHO,
> > > > opt-in by treating that similarly like many cpu properties, and by probing
> > > > the host first.
> > > > 
> > > > IIUC there aren't a lot of things like that (part of guest ABI & host
> > > > kernel / HW dependent), am I right?  Otherwise I would expect more failures
> > > > like this one, but it isn't as much as that yet.  IIUC it means the efforts
> > > > to make Libvirt get involved should be hopefully under control too.  The
> > > > worst case is Libvirt doesn't auto-on it, but again the user should always
> > > > have the option to turn it on when it's necessary.
> > > 
> > > If it is left to libvirt, then it would very likely end up being a user
> > > opt-in, not auto-enabled.
> > 
> > Not sure whether there's other opinions, but that's definitely fine by me.
> > 
> > I think it even makes more sense, as even if Libvirt probed the host and
> > auto-on the feature, it also means Libvirt made a decision for the user,
> > saying "having a better performance" is more important than "being able to
> > migrate this VM everywhere".
> > 
> > I don't see a way that can make such fair decision besides requesting the
> > user to opt-in always for those, then the user is fully aware what is
> > enabled, with the hope that when a migration fails later with "target host
> > doesn't support feature XXX" the user is crystal clear on what happened.
> 
> I think it is better to distinguish saying "having a better performance is
> more important than being able to migrate this VM everywhere" from
> explicitly selecting all available offload features; the latter is a lot of
> chores. More importantly, users may simply not know these features can prevent
> migration; they may just look like performance features that are nice to have at
> first glance.
> 
> I don't think what a user wants is individual performance knobs;
> a user is more likely to need to express the platforms they want
> to migrate VMs on. There are several possible scenarios in particular:
> 1) Migration everywhere
> 2) Migration on specific machines
> 3) Migration on some known platforms
> 4) No migration (migration on nowhere)
> 
> If a user chooses 1-3), QEMU may reject platform-dependent features even if
> the user requests one; in this way, we don't need the users to make things
> crystal clear, but we can expect QEMU to do so.
> 
> If a user chooses 2-4), QEMU may enable all offloading features available on
> the specified platforms. Again, the user will no longer have to know each
> individual performance feature. QEMU may also reject migration to platforms
> not specified to prevent misconfiguration.
> 
> The -platform proposal earlier corresponds to 3). However it has a downside
> that QEMU needs to know about platforms, which may not be trivial. In that
> case, we can support 1), 2), and 4).

I'm not sure if I read it right.  Perhaps you meant something more generic
than -platform but similar?

For example, "-profile [PROFILE]" qemu cmdline, where PROFILE can be either
"perf" or "compat", while by default to "compat"?

If so, I think I get the idea, but it'll be challenging in at least these
aspects:

  - We already have (at least.. that I'm aware of) three layers of
    specifying a property for a device, they are:

    (1) default value
    (2) compat property (from machine type definitions)
    (3) qemu cmdline (specify one property explicitly)

    So far, there's an order we apply these (1-3), while (3) has the top
    priority to overwrite (1-2), and (2) to overwrite (1).

    The new "-profile", if I read it right, introduce (4), and it's already
    unclear to me how that interacts with (3) when -profile says "turn
    FEAT1 on" while cmdline says otherwise.

    It can make things very compilcated, IMHO.

  - This still will break the "QEMU cmdline defines the guest ABI" rule, e.g.,
    consider this USO* case where we boot an old machine type on a new
    system that has QEMU+Linux USO* all enabled.  We specify "-profile
    perf" there.  Then when we try to migrate to another, older QEMU it'll
    still fail the migration instead of in any way telling us "migration is
    not compatible".  So even if it helps the user turn on knobs, it
    doesn't sound like it fixes the problem we're working on?
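
A sketch of how layers (2) and (3) interact today, using the USO
properties from this series as the example (machine-type version and
netdev details are illustrative only):

    # (2) compat property: the old machine type defaults the USO
    #     properties to off, so this boots and migrates anywhere:
    $QEMU -machine pc-q35-8.0 -netdev tap,id=n0 \
          -device virtio-net-pci,netdev=n0 ...

    # (3) explicit cmdline property: overrides (1) and (2), opting in
    #     to the feature and narrowing where the VM can migrate:
    $QEMU -machine pc-q35-8.0 -netdev tap,id=n0 \
          -device virtio-net-pci,netdev=n0,guest_uso4=on,guest_uso6=on ...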

For whatever profile setup, it sounds more applicable as a Libvirt
option that the user can choose.  That may avoid the above two concerns I
have, especially the latter. But I really don't know much about Libvirt,
and this can be some extra effort too on top of either QEMU / Libvirt, and
we may need to justify whether it's worthwhile.

Are we really that concerned about users not enabling features?  I
thought users always can manually change the XML and add whatever they
need, and device properties don't look too special here to me.  I mean, we
have a bunch of "features" exported as new "-devices" and users must opt-in
for them by changing the XML.  We never worried about users not using them.  I
doubt we should worry too much about users not opting in, especially for
performance features, because they're, IMHO, targeting advanced users.

Thanks,
Michael S. Tsirkin Aug. 1, 2024, 3:15 p.m. UTC | #56
On Thu, Aug 01, 2024 at 11:13:37AM -0400, Peter Xu wrote:
> Are we really that concerned about users not enabling features?  I
> thought users always can manually change the XML and add whatever they
> need, and device properties don't look too special here to me.  I mean, we
> have a bunch of "features" exported as new "-devices" and users must opt-in
> for them by changing the XML.  We never worried about users not using them.  I
> doubt we should worry too much about users not opting in, especially for
> performance features, because they're, IMHO, targeting advanced users.

What I do not like is pushing the knowledge of what good defaults
are to libvirt.
Daniel P. Berrangé Aug. 1, 2024, 3:25 p.m. UTC | #57
On Thu, Aug 01, 2024 at 11:15:47AM -0400, Michael S. Tsirkin wrote:
> On Thu, Aug 01, 2024 at 11:13:37AM -0400, Peter Xu wrote:
> > Are we really that concerned about users not enabling features?  I
> > thought users always can manually change the XML and add whatever they
> > need, and device properties don't look too special here to me.  I mean, we
> > have a bunch of "features" exported as new "-devices" and users must opt-in
> > for them by changing the XML.  We never worried about users not using them.  I
> > doubt we should worry too much about users not opting in, especially for
> > performance features, because they're, IMHO, targeting advanced users.
> 
> What I do not like is pushing the knowledge of what good defaults
> are to libvirt.

With the -platform concept, libvirt wouldn't need to know anything about
the settings being used, nor the defaults.

Consider how it works for machine types. Libvirt queries the machine
types, and gets a list back, and QEMU expresses a default. eg saying
that 'pc-i440fx-9.1.0' is aliased to 'pc'. So libvirt can expand
'pc' to a particular version that QEMU has chosen as the default.

Conceptually I could see something similar working for the -platform
concept. Libvirt would ask QEMU for all the "platform" variants that
are available on the current running kernel. QEMU can reply with the
list, and indicate which of those is the "newest" in some manner.

Absent any preference from the mgmt app, libvirt would use whichever
one QEMU indicates was the newest. This optimizes for best featureset
on the current kernel, as the cost of possibly reduced migration
compatibility.

When a mgmt app is caring about migration, they would explicitly tell
libvirt which platform version to use, just as they would explicitly
ask for a specific machine type version, rather than accepting the 'pc'
default.

With regards,
Daniel
Peter Xu Aug. 1, 2024, 3:36 p.m. UTC | #58
On Thu, Aug 01, 2024 at 01:51:00AM -0400, Michael S. Tsirkin wrote:
> So I suggest a single command that dumps some description of host
> features, to be passed to qemu on destination. qemu then fails to
> start on destination if some of these do not work.
> The advantage is that this also helps things like -cpu host,
> and a bunch of other things like vdpa where we like to pass through
> config from kernel.

Something like that could work indeed.  I'm wondering whether it should
require a new QMP command; that sounds like more work, and we'd also need
Libvirt's cooperation, or QEMU migration will still fail.  I wonder whether we can
integrate it into migration handshake that I referred previously in our
TODO item here:

https://wiki.qemu.org/ToDo/LiveMigration#Migration_handshake

The "device handshake" part (in the previous plan) was that we at least can
verify VMSD fields matching on both sides - VMSDs are defined in both QEMU
binaries, so migration can do that already without device opt-in.

What we can do on top of that (or even, before that) is, maybe, allow
devices to opt in to such a handshake besides a "VMSD check", so that there
can be something hooked to the VMSDs or similar structures, so the src
QEMU's device A can talk to the dest QEMU's device A, making sure everything
is good for migration.

Virtio can handshake on host feature lists and we can fail the whole
handshake there.  Same to -cpu, or vDPA, as long as opt-in hook is provided
on both sides.

The good side of it is it sounds natural to integrate this with a handshake
(when we can have it).  Meanwhile, we restrict everything within the device
scope, so neither QEMU nor migration needs to know what happened exactly.

Would that sound workable and better?

Besides, I also wonder what's our next step for this issue.  Should we fix
this on the safe side, and only set ON by default when we have the
handshake ready (in whatever form, either above, or a new QMP command)?

It's just that the handshake in general may still need some thoughts, so
I'm not sure how fast that can be ready, considering our very limited
bandwidth so far.  Maybe that can be done separately, but I remember Dan
used to suggest we do handshake right in one shot, and I tend to agree
that'll be nicer.

Thanks,
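
To make the shape of such a per-device hook concrete - purely
hypothetical, since no such handshake API exists in QEMU today (assumes
QEMU's usual error_report()/PRIx64 includes):

    /* Hypothetical opt-in handshake hook: the dest fails early when it
     * cannot offer every feature the source guest has already acked. */
    static int virtio_net_handshake_check(uint64_t src_acked,
                                          uint64_t dst_supported)
    {
        uint64_t missing = src_acked & ~dst_supported;

        if (missing) {
            error_report("virtio-net: dest lacks features 0x%" PRIx64,
                         missing);
            return -1;
        }
        return 0;
    }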
Michael S. Tsirkin Aug. 1, 2024, 3:39 p.m. UTC | #59
On Thu, Aug 01, 2024 at 11:36:19AM -0400, Peter Xu wrote:
> On Thu, Aug 01, 2024 at 01:51:00AM -0400, Michael S. Tsirkin wrote:
> > So I suggest a single command that dumps some description of host
> > features, to be passed to qemu on destination. qemu then fails to
> > start on destination if some of these do not work.
> > The advantage is that this also helps things like -cpu host,
> > and a bunch of other things like vdpa where we like to pass through
> > config from kernel.
> 
> Something like that could work indeed.  I'm wondering whether it should
> require a new QMP command; that sounds like more work, and we'd also need
> Libvirt's cooperation, or QEMU migration will still fail.  I wonder whether we can
> integrate it into migration handshake that I referred previously in our
> TODO item here:
> 
> https://wiki.qemu.org/ToDo/LiveMigration#Migration_handshake

This is different. You can start migration on destination
without touching source at all. This allows e.g. finding
a destination that can support your source.
Daniel P. Berrangé Aug. 1, 2024, 3:45 p.m. UTC | #60
On Thu, Aug 01, 2024 at 01:51:00AM -0400, Michael S. Tsirkin wrote:
> On Wed, Jul 31, 2024 at 08:57:52AM -0400, Peter Xu wrote:
> > Could you elaborate why it would fail if with what I proposed?
> 
> First, I think I was wrong: I misunderstood what you said.
> To summarise, you said:
> 
> - any new feature depending on another package is off by default
> - starting qemu on destination with feature enabled will fail
>   thus migration is not started
> 
> 
> My comment is that this "started" is from qemu's point of view;
> from the user's POV, starting qemu on the destination is just the 1st
> step of migration.
> 
> 
> However I agree, this is better since we do not waste bandwidth,
> and I was wrong to say we do.
> 
> My other comment is that adding features becomes even more work
> than it is now.
> 
> So I suggest a single command that dumps some description of host
> features, to be passed to qemu on destination. qemu then fails to
> start on destination if some of these do not work.
> The advantage is that this also helps things like -cpu host,
> and a bunch of other things like vdpa where we like to pass through
> config from kernel.
> 
> The disadvantage is that it does not exactly *fix* migration,
> it just does not let you start it.

This feels like only half a solution, and not the most helpful half.
It prevents you from accidentally migrating to a host that lacks some
features, but doesn't help with starting a VM that has
migration-compatible features in the first place.

From a user POV, the latter is what's most important. Checking for
incompatible features is just a safety net that you should never
need to hit, if QEMU was configured suitably to start with.

So ensuring a QEMU is started with migration-compatible features
will still require teaching libvirt about every single feature
that has a host kernel dependency, so libvirt (or the app using
libvirt) knows to turn it off. This is a lot more work for both
libvirt & the mgmt app than having QEMU provide the generic
"platforms" concept, which is extensible without needing further
work outside QEMU.

With regards,
Daniel
Michael S. Tsirkin Aug. 1, 2024, 3:50 p.m. UTC | #61
On Thu, Aug 01, 2024 at 04:45:17PM +0100, Daniel P. Berrangé wrote:
> So to ensure a QEMU is started with migration compatible features
> will still require teaching libvirt about every single feature
> that has a host kernel dependency, so libvirt (or the app using
> libvirt) knows to turn this off. This is a lot more work for both
> libvirt & the mgmt app, than having QEMU provide the generic
> "platforms" concept which is extensible without needing further
> work outside QEMU.

I am just not sure it can all amount to selecting from a list.
For example, some resource can be limited on one host or another.
Thus we get a number. Or there could be a set of N flags, with 2^N
combinations.
Daniel P. Berrangé Aug. 1, 2024, 3:58 p.m. UTC | #62
On Thu, Aug 01, 2024 at 11:50:40AM -0400, Michael S. Tsirkin wrote:
> On Thu, Aug 01, 2024 at 04:45:17PM +0100, Daniel P. Berrangé wrote:
> > So to ensure a QEMU is started with migration compatible features
> > will still require teaching libvirt about every single feature
> > that has a host kernel dependency, so libvirt (or the app using
> > libvirt) knows to turn this off. This is a lot more work for both
> > libvirt & the mgmt app, than having QEMU provide the generic
> > "platforms" concept which is extensible without needing further
> > work outside QEMU.
> 
> I am just not sure it can all amount to selecting from a list.
> For example, some resource can be limited on one host or another.
> Thus we get a number. Or there could be a set of N flags, with 2^N
> combinations.

We don't have to support all possible combinations IMHO. If a user
really does require precise control over every combination of some
settings, then exposing those tunables in libvirt is inevitable.

The platform concept only has to be able to express a "good enough"
subset of combinations, such that it is unlikely users will need to
have fine tuning for most of the tunables. We might end up exposing
a handful of tunables in libvirt anyway, but as long as we get the
common case satisfied, we'll eliminate most of the ongoing burden.

With regards,
Daniel
Akihiko Odaki Aug. 2, 2024, 4:30 a.m. UTC | #63
On 2024/08/02 0:13, Peter Xu wrote:
> On Thu, Aug 01, 2024 at 02:05:54PM +0900, Akihiko Odaki wrote:
>> On 2024/07/31 4:11, Peter Xu wrote:
>>> On Tue, Jul 30, 2024 at 07:46:12PM +0100, Daniel P. Berrangé wrote:
>>>> On Tue, Jul 30, 2024 at 02:13:51PM -0400, Peter Xu wrote:
>>>>> On Mon, Jul 29, 2024 at 06:26:41PM +0100, Daniel P. Berrangé wrote:
>>>>>> On Mon, Jul 29, 2024 at 01:00:30PM -0400, Peter Xu wrote:
>>>>>>> On Mon, Jul 29, 2024 at 04:58:03PM +0100, Daniel P. Berrangé wrote:
>>>>>>>>
>>>>>>>> We've got two mutually conflicting goals with the machine type
>>>>>>>> definitions.
>>>>>>>>
>>>>>>>> Primarily we use them to ensure stable ABI, but an important
>>>>>>>> secondary goal is to enable new tunables to have new defaults
>>>>>>>> set, without having to update every mgmt app.  The latter
>>>>>>>> works very well when the defaults have no dependency on the
>>>>>>>> platform kernel/OS, but breaks migration when they do have a
>>>>>>>> platform dependency.
>>>>>>>>
>>>>>>>>>     - Firstly, never quietly flipping any bit that affects the ABI...
>>>>>>>>>
>>>>>>>>>     - Have a default value of off, then QEMU will always allow the VM to boot
>>>>>>>>>       by default, while advanced users can opt-in on new features.  We can't
>>>>>>>>>       make this ON by default otherwise some VMs can already fail to boot,
>>>>>>>>>
>>>>>>>>>     - If the host doesn't support the feature while the cmdline enabled it,
>>>>>>>>>       it needs to fail QEMU boot rather than flipping, so that it says "hey,
>>>>>>>>>       this host does not support running such VM specified, due to XXX
>>>>>>>>>       feature missing".
>>>>>>>>>
>>>>>>>>> That's the only way a user could understand what happened, and IMHO that's
>>>>>>>>> a clean way that we stick with the QEMU cmdline on defining the guest ABI,
>>>>>>>>> in which the machine type is the foundation of such a definition, as the
>>>>>>>>> machine type can decide many of the rest of the compat properties.  And that's
>>>>>>>>> the whole point of the compat properties too (to make sure the guest ABI is
>>>>>>>>> stable).
>>>>>>>>>
>>>>>>>>> If kernel breaks it easily, all compat property things that we maintain can
>>>>>>>>> already stop making sense in general, because it didn't define the whole
>>>>>>>>> guest ABI..
>>>>>>>>>
>>>>>>>>> So AFAIU that's really what we used for years, I hope I didn't overlook
>>>>>>>>> something.  And maybe we don't yet need the "-platform" layer if we can
>>>>>>>>> keep up with this rule?
>>>>>>>>
>>>>>>>> We've failed at this for years wrt enabling use of new defaults that have
>>>>>>>> a platform dependency, so historical practice isn't a good reference.
>>>>>>>>
>>>>>>>> There are 100's (possibly 1000's) of tunables set implicitly as part of
>>>>>>>> the machine type, and of those, libvirt likely only exposes a few 10's
>>>>>>>> of tunables. The vast majority are low level details that no mgmt app
>>>>>>>> wants to know about, they just want to accept QEMU's new defaults,
>>>>>>>> while preserving machine ABI. This is a good thing. No one wants the
>>>>>>>> burden of wiring up every single tunable into libvirt and mgmt apps.
>>>>>>>>
>>>>>>>> This is what the "-platform" concept would be intended to preserve. It
>>>>>>>> would allow a way to enable groups of settings that have a platform level
>>>>>>>> dependancy, without ever having to teach either libvirt or the mgmt apps
>>>>>>>> about the individual tunables.
>>>>>>>
>>>>>>> Do you think we can achieve similar goal by simply turning the feature to
>>>>>>> ON only after a few QEMU releases?  I also mentioned that idea below.
>>>>>>>
>>>>>>> https://lore.kernel.org/r/ZqQNKZ9_OPhDq2AK@x1n
>>>>>>>
>>>>>>> So far it really sounds like the right thing to do to me to fix all similar
>>>>>>> issues, even without introducing anything new we need to maintain.
>>>>>>
>>>>>> Turning a feature with a platform dependency to "on" implies that
>>>>>> the machine type will cease to work out of the box for platforms
>>>>>> which lack the feature. IMHO that's not acceptable behaviour for
>>>>>> any of our supported platforms.
>>>>>
>>>>> Right, that's why I was thinking whether we should just always be on the
>>>>> safe side, even if I just replied in the other email to Akihiko, that we do
>>>>>> have the option to make this more aggressive by turning those to ON after
>>>>> even 1-2 years or even less.. and we have control of how aggressive this
>>>>> can be.
>>>>>
>>>>>>
>>>>>> IOW, "after a few QEMU releases" implies a delay of as much as
>>>>>> 5 years, while we wait for platforms which don't support the
>>>>>> feature to drop out of our supported targets list.  I don't
>>>>>> think that'll satisfy the desire to get the new feature
>>>>>> available to users as soon as practical for their particular
>>>>>> platform.
>>>>>
>>>>> The feature is always available since the 1st day, right?  We just need the
>>>>> user to opt-in, by specifying ON in the cmdline.
>>>>>
>>>>> That'll be my take on this that QEMU's default VM setup should be always
>>>>> bootable, migratable, and so on.  Then user opt-in on stuff like this one,
>>>>> where there's implication on the ABIs.  The "user" can also include
>>>>> Libvirt.  I mean when something is really important, Libvirt should, IMHO,
>>>>> opt-in by treating that similarly like many cpu properties, and by probing
>>>>> the host first.
>>>>>
>>>>> IIUC there aren't a lot of things like that (part of guest ABI & host
>>>>> kernel / HW dependent), am I right?  Otherwise I would expect more failures
>>>>> like this one, but it isn't as much as that yet.  IIUC it means the efforts
>>>>> to make Libvirt get involved should be hopefully under control too.  The
>>>>> worst case is Libvirt doesn't auto-on it, but again the user should always
>>>>> have the option to turn it on when it's necessary.
>>>>
>>>> If it is left to libvirt, then it would very likely end up being a user
>>>> opt-in, not auto-enabled.
>>>
>>> Not sure whether there's other opinions, but that's definitely fine by me.
>>>
>>> I think it even makes more sense, as even if Libvirt probed the host and
>>> auto-on the feature, it also means Libvirt made a decision for the user,
>>> saying "having a better performance" is more important than "being able to
>>> migrate this VM everywhere".
>>>
>>> I don't see a way that can make such fair decision besides requesting the
>>> user to opt-in always for those, then the user is fully aware what is
>>> enabled, with the hope that when a migration fails later with "target host
>>> doesn't support feature XXX" the user is crystal clear on what happened.
>>
>> I think it is better to distinguish saying "having a better performance is
>> more important than being able to migrate this VM everywhere" from
>> explicitly selecting all available offload features; the latter is a lot of
>> chores. More importantly, users may simply not know these features can prevent
>> migration; they may just look like performance features nice to have at
>> first glance.
>>
>> I don't think what a user would want is individual performance knobs;
>> a user is more likely to need to express the platforms they would want
>> to migrate VMs on. There are several possible scenarios in particular:
>> 1) Migration everywhere
>> 2) Migration on specific machines
>> 3) Migration on some known platforms
>> 4) No migration (migration on nowhere)
>>
>> If a user chooses 1-3), QEMU may reject platform-dependent features even if
>> the user requests one; in this way, we don't need the users to make things
>> crystal clear, but we can expect QEMU does so.
>>
>> If a user chooses 2-4), QEMU may enable all offloading features available on
>> the specified platforms. Again, the user will no longer have to know each
>> individual performance features. QEMU may also reject migration to platforms
>> not specified to prevent misconfiguration.
>>
>> The -platform proposal earlier corresponds to 3). However it has a downside
>> that QEMU needs to know about platforms, which may not be trivial. In that
>> case, we can support 1), 2), and 4).
> 
> I'm not sure if I read it right.  Perhaps you meant something more generic
> than -platform but similar?
> 
> For example, "-profile [PROFILE]" qemu cmdline, where PROFILE can be either
> "perf" or "compat", while by default to "compat"?

"perf" would cover 4) and "compat" will cover 1). However neither of 
them will cover 2) because an enum is not enough to know about all 
hosts. I presented a design that will cover 2) in:
https://lore.kernel.org/r/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com

I also want to point out that "perf" should rather be named something 
like "nomigrate". In general, a program should expose a functional 
requirement on its interface; it can then do its best to achieve high 
performance under that requirement.

> 
> If so, I think I get the idea, but it'll be challenging in at least these
> aspects:
> 
>    - We already have (at least.. that I'm aware of) three layers of
>      specifying a property for a device, they are:
> 
>      (1) default value
>      (2) compat property (from machine type definitions)
>      (3) qemu cmdline (specify one property explicitly)
> 
>      So far, there's an order we apply these (1-3), while (3) has the top
>      priority to overwrite (1-2), and (2) to overwrite (1).
> 
>      The new "-profile", if I read it right, introduces (4), and it's already
>      unclear to me how that interacts with (3) when -profile says "turn
>      FEAT1 on" while the cmdline says otherwise.
> 
>      It can make things very complicated, IMHO.
> 
>    - This still will break the "QEMU cmdline defines the guest ABI", e.g.,
>      consider this USO* thing that we boot an old machine type on a new
>      system that has QEMU+Linux USO* all enabled.  We specify "-profile
>      perf" there.  Then when we try to migrate to another older QEMU it'll
>      still fail the migration instead of any way telling us "migration is
>      not compatible".  So even if it helps the user turning on knobs, it
>      doesn't sound like to fix the problem we're working on?

When it is named nomigrate, it is obvious that migration does not work.

> 
> For whatever profile setup, it sounds more applicable to a Libvirt
> option that the user can choose.  That may avoid the above two concerns I
> have, especially the latter.  But I really don't know much about Libvirt,
> and this can be some extra effort too on top of either QEMU / Libvirt, and
> we may need to justify whether it's worthwhile.
> 
> Do we really worry about users not enabling features that much?  I
> thought users can always manually change the XML and add whatever they
> need, and device properties do not look too special here to me.  I mean, we
> have a bunch of "features" exported as new "-devices" and users must opt in
> to them by changing the XML.  We never worried about users not using them.  I
> doubt whether we worry too much about users not opting in, especially for
> performance features, because they're, IMHO, targeting advanced users.

It is not about whether the user is knowledgeable or not, but it is 
about what the user wants. Migration is mandatory for a user who runs 
multi-tenant platforms, but it doesn't really matter for desktop users. 
Which are more knowledgeable? Personally, I want to have higher 
expectation for users running multi-tenant platforms, but it all depends.

You asked for the next step in another email. My suggestion is to 
satisfy 1) first because it is the easiest and safest. In particular, I 
suggest disabling all platform-dependent features by default to satisfy 
1). Combined with an existing option, -only-migratable, users will get 
the maximum assurance of migratability.

4) is the second easiest to implement, but the design of 4) will depend 
on whether we will satisfy 2) or 3). In the email I cited earlier, I 
suggested an option -use-platform to specify the expectation on the 
platform. If it is ever to be implemented, that option can take a 
special value, "host" to tell QEMU that it can use any features it finds 
on the current host.

Regards,
Akihiko Odaki
Michael S. Tsirkin Aug. 2, 2024, 1:21 p.m. UTC | #64
On Fri, Aug 02, 2024 at 01:30:51PM +0900, Akihiko Odaki wrote:
> 4) is the second easiest to implement, but the design of 4) will depend on
> whether we will satisfy 2) or 3). In the email I cited earlier, I suggested
> an option -use-platform to specify the expectation on the platform. If it is
> ever to be implemented, that option can take a special value, "host" to tell
> QEMU that it can use any features it finds on the current host.

In practice, lots of people would benefit from the ability to migrate
using host features (checking that hosts are compatible,
as they often are).
If we are going to go to great lengths adding new interfaces,
I think that would be a really useful thing to address.
Peter Xu Aug. 2, 2024, 3:05 p.m. UTC | #65
On Fri, Aug 02, 2024 at 01:30:51PM +0900, Akihiko Odaki wrote:
> On 2024/08/02 0:13, Peter Xu wrote:
> > [...]
> > 
> > I'm not sure if I read it right.  Perhaps you meant something more generic
> > than -platform but similar?
> > 
> > For example, "-profile [PROFILE]" qemu cmdline, where PROFILE can be either
> > "perf" or "compat", while by default to "compat"?
> 
> "perf" would cover 4) and "compat" will cover 1). However neither of them
> will cover 2) because an enum is not enough to know about all hosts. I
> presented a design that will cover 2) in:
> https://lore.kernel.org/r/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com

"-merge-platform" shouldn't be a QEMU parameter, but should be something
separate.

Yes, as you mentioned there it could be a lot of work, we may need to think
it through and collect enough input before working on something like that.

> 
> I also want to point out that "perf" should rather be named something like
> "nomigrate". In general, a program should expose a functional
> requirement on its interface. It can then do its best to achieve high
> performance under that requirement.

"nomigrate" may be inaccurate or even wrong in this case, because as long
as the features are supported on both hosts it's migratable.

> 
> > 
> > If so, I think I get the idea, but it'll be challenging in at least these
> > aspects:
> > 
> >    - We already have (at least.. that I'm aware of) three layers of
> >      specifying a property for a device, they are:
> > 
> >      (1) default value
> >      (2) compat property (from machine type definitions)
> >      (3) qemu cmdline (specify one property explicitly)
> > 
> >      So far, there's an order we apply these (1-3), while (3) has the top
> >      priority to overwrite (1-2), and (2) to overwrite (1).
> > 
> >      The new "-profile", if I read it right, introduces (4), and it's already
> >      unclear to me how that interacts with (3) when -profile says "turn
> >      FEAT1 on" while the cmdline says otherwise.
> > 
> >      It can make things very complicated, IMHO.
> > 
> >    - This still will break the "QEMU cmdline defines the guest ABI", e.g.,
> >      consider this USO* thing that we boot an old machine type on a new
> >      system that has QEMU+Linux USO* all enabled.  We specify "-profile
> >      perf" there.  Then when we try to migrate to another older QEMU it'll
> >      still fail the migration instead of any way telling us "migration is
> >      not compatible".  So even if it helps the user turning on knobs, it
> >      doesn't sound like to fix the problem we're working on?
> 
> When it is named nomigrate, it is obvious that migration does not work.

I am not sure whether you meant to e.g. add a migration blocker in this
case even if migration can be supported between some hosts.  But if so it
may not be wise either to block users trying to migrate where it is still
applicable.  So maybe I misunderstood.

> 
> > 
> > For whatever profile setup, it sounds more applicable to a Libvirt
> > option that the user can choose.  That may avoid the above two concerns I
> > have, especially the latter.  But I really don't know much about Libvirt,
> > and this can be some extra effort too on top of either QEMU / Libvirt, and
> > we may need to justify whether it's worthwhile.
> > 
> > Do we really worry about users not enabling features that much?  I
> > thought users can always manually change the XML and add whatever they
> > need, and device properties do not look too special here to me.  I mean, we
> > have a bunch of "features" exported as new "-devices" and users must opt in
> > to them by changing the XML.  We never worried about users not using them.  I
> > doubt whether we worry too much about users not opting in, especially for
> > performance features, because they're, IMHO, targeting advanced users.
> 
> It is not about whether the user is knowledgeable or not, but it is about
> what the user wants. Migration is mandatory for a user who runs multi-tenant
> platforms, but it doesn't really matter for desktop users. Which are more
> knowledgeable? Personally, I want to have higher expectation for users
> running multi-tenant platforms, but it all depends.
> 
> You asked for the next step in another email. My suggestion is to satisfy 1)
> first because it is the easiest and safest. In particular, I suggest
> disabling all platform-dependent features by default to satisfy 1). Combined
> with an existing option, -only-migratable, users will get the maximum
> assurance of migratability.
> 
> 4) is the second easiest to implement, but the design of 4) will depend on
> whether we will satisfy 2) or 3). In the email I cited earlier, I suggested
> an option -use-platform to specify the expectation on the platform. If it is
> ever to be implemented, that option can take a special value, "host" to tell
> QEMU that it can use any features it finds on the current host.

So I don't know what's the best plan yet for the longer term, but I'm
completely with you on starting with 1).

Thanks,
Akihiko Odaki Aug. 2, 2024, 3:54 p.m. UTC | #66
On 2024/08/03 0:05, Peter Xu wrote:
> On Fri, Aug 02, 2024 at 01:30:51PM +0900, Akihiko Odaki wrote:
>> On 2024/08/02 0:13, Peter Xu wrote:
>>> [...]
>>
>> "perf" would cover 4) and "compat" will cover 1). However neither of them
>> will cover 2) because an enum is not enough to know about all hosts. I
>> presented a design that will cover 2) in:
>> https://lore.kernel.org/r/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com
> 
> "-merge-platform" shouldn't be a QEMU parameter, but should be something
> separate.

Do you mean merging platform dumps should be done with another command? 
I think we will want to know the QOM tree in use when implementing 
-merge-platform. For example, you cannot define a "platform" when, e.g., 
you don't know what netdev backend (user, vhost-net, vhost-vdpa) is 
connected to virtio-net devices. Of course we can include that 
information in dumps, but we don't do so for VMState.

> 
> Yes, as you mentioned there it could be a lot of work, we may need to think
> it through and collect enough input before working on something like that.
> 
>>
>> I also want to point out that "perf" should be rather named like
>> "nomigrate". In general, a program should expose a functional requirement on
>> the interface. It can then do its best to achieve high performance under
>> that requirement.
> 
> "nomigrate" may be inaccurate or even wrong in this case, because as long
> as the features are supported on both hosts it's migratable.

Perhaps it may be named no-cross-migrate or something. There are lots of 
details we need to figure out.

> 
>>
>>> [...]
>>
>> When it is named nomigrate, it is obvious that migration does not work.
> 
> I am not sure whether you meant to e.g. add a migration blocker in this
> case even if migration can be supported between some hosts.  But if so it
> may not be wise either to block users trying to migrate where it is still
> applicable.  So maybe I misunderstood.

There are certainly downsides and upsides to adding a migration blocker, 
and I don't have a strong opinion here.

Regards,
Akihiko Odaki
Peter Xu Aug. 2, 2024, 4:26 p.m. UTC | #67
On Sat, Aug 03, 2024 at 12:54:51AM +0900, Akihiko Odaki wrote:
> > > > I'm not sure if I read it right.  Perhaps you meant something more generic
> > > > than -platform but similar?
> > > > 
> > > > For example, "-profile [PROFILE]" qemu cmdline, where PROFILE can be either
> > > > "perf" or "compat", while by default to "compat"?
> > > 
> > > "perf" would cover 4) and "compat" will cover 1). However neither of them
> > > will cover 2) because an enum is not enough to know about all hosts. I
> > > presented a design that will cover 2) in:
> > > https://lore.kernel.org/r/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com
> > 
> > "-merge-platform" shouldn't be a QEMU parameter, but should be something
> > separate.
> 
> Do you mean merging platform dumps should be done with another command? I
> think we will want to know the QOM tree is in use when implementing
> -merge-platform. For example, you cannot define a "platform" when e.g., you
> don't know what netdev backend (e.g., user, vhost-net, vhost-vdpa) is
> connected to virtio-net devices. Of course we can include those information
> in dumps, but we don't do so for VMState.

What I was thinking is the generated platform dump shouldn't care about
what is used as the backend: it should try to probe whatever is specified
in the qemu cmdline, and it's the user's job to make sure the exact same
qemu cmdline is used on other hosts to dump this information.

IOW, the dump will only contain information based on the qemu cmdline.
E.g., if the cmdline doesn't include a virtio device at all, and we only
support such dumps for virtio, it should dump nothing.
support such dump for virtio, it should dump nothing.

Then the -merge-platform will expect all dumps to look the same too,
merging them with AND on each field.
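
To make the AND-merge concrete, a minimal sketch (the dump layout below is
completely made up, only to show the shape of the merge):

  #include <stddef.h>
  #include <stdint.h>

  /*
   * Made-up dump layout: one feature bitmap per probed device class,
   * generated on each host with the same QEMU cmdline.  AND-merging
   * keeps only the bits every host can offer, so the result is safe
   * to use across the whole set of hosts.
   */
  typedef struct PlatformDump {
      uint64_t virtio_net_features;   /* e.g. USO4/USO6/UFO host bits */
  } PlatformDump;

  static PlatformDump merge_platform(const PlatformDump *dumps, size_t n)
  {
      PlatformDump out = { .virtio_net_features = UINT64_MAX };

      for (size_t i = 0; i < n; i++) {
          out.virtio_net_features &= dumps[i].virtio_net_features;
      }
      return out;
  }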

That said, I actually am still not clear on how / whether it should work in
the end.  At least my previous concern (1) doesn't have a good answer yet:
what do we do when a profile collides with the qemu cmdline?  So far I
actually still think it more straightforward that we handshake on these
capabilities during migration, if possible.

And that's why I was thinking (where I totally agree with you on this)
whether we should settle a short-term plan first to be on the safe side,
where we start with migration always being compatible, then we figure out
the other approach.  That seems easier to me, and it's also a matter of
whether we want to do something for 9.1, or leave that for 9.2 for USO*.

Thanks,
Michael S. Tsirkin Aug. 2, 2024, 4:40 p.m. UTC | #68
On Fri, Aug 02, 2024 at 12:26:22PM -0400, Peter Xu wrote:
> And that's why I was thinking (where I totally agree with you on this) that
> whether we should settle a short term plan first to be on the safe side
> that we start with migration always being compatible, then we figure the
> other approach.


We have two big issues around migration compatibility we never solved:

- some guest visible behaviour depends on a package outside of qemu:
  as that package can change, so can qemu behaviour

- sometimes we change guest visible behaviour and only
  discover this after the release: fixing that breaks
  migration to one version, not fixing breaks migration to another


These, to me, look similar enough that I feel we should look
at them together from QAPI POV.

Both issues sometimes can have work-arounds, enabling these
would be nice.
Also, both issues have a clean solution, which can come in
two flavors:
1. basic: detecting incompatibility
and not starting qemu on destination (or failing migration,
possibly early, which I consider a less clean solution).
2. advanced: ability to go from a set of configurations to
a flag making them compatible.
Peter Xu Aug. 2, 2024, 8:56 p.m. UTC | #69
On Fri, Aug 02, 2024 at 12:40:33PM -0400, Michael S. Tsirkin wrote:
> On Fri, Aug 02, 2024 at 12:26:22PM -0400, Peter Xu wrote:
> > And that's why I was thinking (where I totally agree with you on this) that
> > whether we should settle a short term plan first to be on the safe side
> > that we start with migration always being compatible, then we figure the
> > other approach.
> 
> We have two big issues around migration compatibility we never solved:
> 
> - some guest visible behaviour depends on a package outside of qemu:
>   as that package can change, so can qemu behaviour

Any example, or bug link for this one?

> 
> - sometimes we change guest visible behaviour and only
>   discover this after the release: fixing that breaks
>   migration to one version, not fixing breaks migration to another

In this case it is a bug, IMHO, and not always fixable.  It's like when
QEMU can crash and coredump: not fixable unless the user upgrades.

Here "upgrades" for migration purposes means the user should avoid
migrating with a broken QEMU version, and one needs to cold reboot into a
new, fixed binary rather than live migrate.

The good thing is that as long as the user doesn't trigger migration, the
bug can logically be avoided.

The bad thing is that since it's a migration bug, it cannot be fixed by
live migrating to a new QEMU..

AFAICT we did that before, where for downstream we fix X.Y.0 with X.Y.1,
then declare X.Y.0 broken, something like that.  It's the same for
downstream, where we put similar notes into the documentation.

> 
> 
> These, to me, look similar enough that I feel we should look
> at them together from QAPI POV.

Or maybe I misunderstood here; in that case, some elaboration on the QAPI
part mentioned here could help clarify things.

So far I don't see how any QAPI command can fix a migration bug, for
example one that falls into category 2 above.

> 
> Both issues sometimes can have work-arounds, enabling these
> would be nice.
> Also, both issues have a clean solution, which can come in
> two flavors:
> 1. basic: detecting incompatibility
> and not starting qemu on destination (or failing migration,
> possibly early, which I consider a less clean solution).
> 2. advanced: ability to go from a set of configurations to
> a flag making them compatible.

Thanks,
Akihiko Odaki Aug. 4, 2024, 6:49 a.m. UTC | #70
On 2024/08/03 1:26, Peter Xu wrote:
> On Sat, Aug 03, 2024 at 12:54:51AM +0900, Akihiko Odaki wrote:
>>>> [...]
>>>
>>> "-merge-platform" shouldn't be a QEMU parameter, but should be something
>>> separate.
>>
>> Do you mean merging platform dumps should be done with another command? I
>> think we will want to know the QOM tree is in use when implementing
>> -merge-platform. For example, you cannot define a "platform" when e.g., you
>> don't know what netdev backend (e.g., user, vhost-net, vhost-vdpa) is
>> connected to virtio-net devices. Of course we can include those information
>> in dumps, but we don't do so for VMState.
> 
> What I was thinking is the generated platform dump shouldn't care about
> what is used as backend: it should try to probe whatever is specified in
> the qemu cmdline, and it's the user's job to make sure the exact same qemu
> cmdline is used in other hosts to dump this information.
> 
> IOW, the dump will only contain the information that was based on the qemu
> cmdline.  E.g., if it doesn't include virtio device at all, and if we only
> support such dump for virtio, it should dump nothing.
> 
> Then the -merge-platform will expect all dumps to look the same too,
> merging them with AND on each field.

I think we will still need the QOM tree in that case. I think the 
platform information will look somewhat similar to VMState, which 
requires the QOM tree to interpret.

> 
> Said that, I actually am still not clear on how / whether it should work at
> last.  At least my previous concern (1) didn't has a good answer yet, on
> what we do when profile collisions with qemu cmdlines.  So far I actually
> still think it more straightforward that in migration we handshake on these
> capabilities if possible.
> 
> And that's why I was thinking (where I totally agree with you on this) that
> whether we should settle a short term plan first to be on the safe side
> that we start with migration always being compatible, then we figure the
> other approach.  That seems easier to me, and it's also a matter of whether
> we want to do something for 9.1, or leaving that for 9.2 for USO*.

I suggest disabling all offload features of virtio-net with 9.2.

I want to keep things consistent so I want to disable all at once. This 
change will be very uncomfortable for us, who are implementing offload 
features, but I hope it will motivate us to implement a proper solution.

That said, it will surely be a breaking change, so we should wait until 
9.1 is out before making such a change.

By the way, I am wondering whether the "no-cross-migrate" scenario can 
be implemented relatively easily in a way similar to compatibility 
properties. The idea is to add a "no-cross-migrate" property to 
machines. If the property is set to "on", all offload features of 
virtio-net will be set to "auto". virtio-net will then probe the 
offload features and enable the ones available on the host.
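
A rough sketch of the "auto" resolution step, with made-up names (QEMU's
qapi already has an OnOffAuto type that this could reuse):

  #include <stdbool.h>

  /* Mirrors qapi's OnOffAuto; redeclared only to keep the sketch
   * self-contained. */
  typedef enum { F_OFF, F_ON, F_AUTO } OnOffAutoSketch;

  /*
   * With no-cross-migrate=on, the machine would turn each offload
   * default from OFF into AUTO; AUTO then resolves from what the
   * peer (tap, vhost, ...) actually supports on this host.
   */
  static bool resolve_offload(OnOffAutoSketch requested, bool peer_has_it)
  {
      switch (requested) {
      case F_ON:
          return true;        /* enforced; realize fails if unsupported */
      case F_AUTO:
          return peer_has_it; /* best effort, no cross-migration promise */
      case F_OFF:
      default:
          return false;
      }
  }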

Regards,
Akihiko Odaki
Peter Xu Aug. 4, 2024, 1:08 p.m. UTC | #71
On Sun, Aug 04, 2024 at 03:49:45PM +0900, Akihiko Odaki wrote:
> On 2024/08/03 1:26, Peter Xu wrote:
> > On Sat, Aug 03, 2024 at 12:54:51AM +0900, Akihiko Odaki wrote:
> > > > > [...]
> > > > 
> > > > "-merge-platform" shouldn't be a QEMU parameter, but should be something
> > > > separate.
> > > 
> > > Do you mean merging platform dumps should be done with another command? I
> > > think we will want to know the QOM tree is in use when implementing
> > > -merge-platform. For example, you cannot define a "platform" when e.g., you
> > > don't know what netdev backend (e.g., user, vhost-net, vhost-vdpa) is
> > > connected to virtio-net devices. Of course we can include those information
> > > in dumps, but we don't do so for VMState.
> > 
> > What I was thinking is the generated platform dump shouldn't care about
> > what is used as backend: it should try to probe whatever is specified in
> > the qemu cmdline, and it's the user's job to make sure the exact same qemu
> > cmdline is used in other hosts to dump this information.
> > 
> > IOW, the dump will only contain the information that was based on the qemu
> > cmdline.  E.g., if it doesn't include virtio device at all, and if we only
> > support such dump for virtio, it should dump nothing.
> > 
> > Then the -merge-platform will expect all dumps to look the same too,
> > merging them with AND on each field.
> 
> I think we will still need the QOM tree in that case. I think the platform
> information will look somewhat similar to VMState, which requires the QOM
> tree to interpret.

Ah yes, I assume you meant when multiple devices can report different
things even with the same frontend / device type.  QOM should work, or
anything that can identify a device, e.g. with id / instance_id attached
along with the device class.

One thing that I still don't know is how this would interact with new
hosts being added.

This idea is based on the fact that the cluster is known before starting
any VM.  However, in reality VMs may be started on a small cluster that is
later extended, after the -merge-platform was done on the smaller set.

> 
> > 
> > Said that, I actually am still not clear on how / whether it should work at
> > last.  At least my previous concern (1) didn't has a good answer yet, on
> > what we do when profile collisions with qemu cmdlines.  So far I actually
> > still think it more straightforward that in migration we handshake on these
> > capabilities if possible.
> > 
> > And that's why I was thinking (where I totally agree with you on this) that
> > whether we should settle a short term plan first to be on the safe side
> > that we start with migration always being compatible, then we figure the
> > other approach.  That seems easier to me, and it's also a matter of whether
> > we want to do something for 9.1, or leaving that for 9.2 for USO*.
> 
> I suggest disabling all offload features of virtio-net with 9.2.
> 
> I want to keep things consistent so I want to disable all at once. This
> change will be very uncomfortable for us, who are implementing offload
> features, but I hope it will motivate us to implement a proper solution.
> 
> That said, it will be surely a breaking change so we should wait for 9.1
> before making such a change.

Personally I don't worry too much about the other offload bits besides
USO*, since they have been ON for a long time.  My hope is that they're
good old kernel features supported mostly everywhere QEMU runs; then we're
good.

And I definitely worry about future offload features, or any feature that
may probe the host like this and auto-OFF: I hope we can do them on the
safe side starting from day 1.

So I don't know whether we should do that for USO* only or for all.  But I
agree with you that the latter would definitely be cleaner.

On the details of how to turn them off properly...  Take the example where
we want to turn off all the offload features by default (or simply USO
only)...

Upstream machine types are flexible to all kinds of kernels, so we may not
want to regress anyone using an existing machine type even on performance,
especially if we want to turn all of them off.

In that case we may need one more knob (I'm assuming this is a virtio-net
specific issue, but maybe not; using it as an example) to make sure the old
machine types keep performing well, with (a code sketch follows the list
below):

  - x-virtio-net-offload-enforce

    When set, the offload features with value ON are enforced, so when
    the host doesn't support an offload feature it will fail to boot,
    showing an error that the specific offload feature is not supported by
    the virtio backend.

    When clear, the offload features with value ON are not enforced, so
    these features can be automatically turned OFF when it's detected the
    backend doesn't support them.  This may bring best perf but has the
    risk of breaking migration.
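
For illustration, a minimal sketch of that behavior for a single offload
bit; the "enforce" flag stands for the proposed property and everything
around it is an assumption, not code from this series:

    /*
     * Hedged sketch: enforce vs. auto-clear for one offload bit.
     * VIRTIO_NET_F_HOST_USO is feature bit 56 per the virtio spec,
     * used here directly as a mask for brevity.
     */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define F_HOST_USO (1ULL << 56)

    static uint64_t adjust_host_uso(uint64_t features, bool backend_has_uso,
                                    bool enforce, bool *failed)
    {
        *failed = false;
        if ((features & F_HOST_USO) && !backend_has_uso) {
            if (enforce) {
                /* new behavior: refuse to start instead of a silent downgrade */
                fprintf(stderr, "USO is not supported by the virtio backend\n");
                *failed = true;
            } else {
                /* old behavior: auto-clear the bit; best perf, migration risk */
                features &= ~F_HOST_USO;
            }
        }
        return features;
    }

The real hook would presumably sit at realize time, so that enforcement
shows up as a clean -device error rather than a runtime failure.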

With that,

  - On old machine types (compat properties):

    - set "x-virtio-net-offload-enforce" OFF
    - set all offload features ON

  - On new machine types (the default values):

    - set "x-virtio-net-offload-enforce" ON
    - set all offload features OFF
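
Expressed as compat properties, the split could look like the following
sketch (the array name is hypothetical; "x-virtio-net-offload-enforce" is
the knob proposed above, the offload property names are from this series):

    /* old machine types: keep best-effort auto-clearing, offloads ON */
    GlobalProperty hw_compat_9_1[] = {
        { TYPE_VIRTIO_NET, "x-virtio-net-offload-enforce", "off" },
        { TYPE_VIRTIO_NET, "host_uso", "on" },
        { TYPE_VIRTIO_NET, "guest_uso4", "on" },
        { TYPE_VIRTIO_NET, "guest_uso6", "on" },
    };
    /* new machine types would instead carry enforce=ON and all offload
     * features OFF as the device's own defaults */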

And yes, we could wait for 9.2 to do that, but with the above even 9.1
should be safe.  9.2 might still be easier, just to think everything
through again; after all, USO was only introduced in 8.2, so it's not a
regression in 9.1.

> 
> By the way, I am wondering perhaps the "no-cross-migrate" scenario can be
> implemented relatively easy in a way similar to compatibility properties.
> The idea is to add the "no-cross-migrate" property to machines. If the
> property is set to "on", all offload features of virtio-net will be set to
> "auto". virtio-net will then probe the offload features and enable available
> offloading features.

If it becomes a device property, there's still the trick / concern that
no-cross-migrate could conflict with an offload feature that was selected
explicitly by a user (e.g. no-cross-migrate=ON + uso=OFF).

Thanks,
Michael S. Tsirkin Aug. 4, 2024, 1:41 p.m. UTC | #72
On Sun, Aug 04, 2024 at 09:08:05AM -0400, Peter Xu wrote:
> On Sun, Aug 04, 2024 at 03:49:45PM +0900, Akihiko Odaki wrote:
> > On 2024/08/03 1:26, Peter Xu wrote:
> > > On Sat, Aug 03, 2024 at 12:54:51AM +0900, Akihiko Odaki wrote:
> > > > > > > I'm not sure if I read it right.  Perhaps you meant something more generic
> > > > > > > than -platform but similar?
> > > > > > > 
> > > > > > > For example, "-profile [PROFILE]" qemu cmdline, where PROFILE can be either
> > > > > > > "perf" or "compat", while by default to "compat"?
> > > > > > 
> > > > > > "perf" would cover 4) and "compat" will cover 1). However neither of them
> > > > > > will cover 2) because an enum is not enough to know about all hosts. I
> > > > > > presented a design that will cover 2) in:
> > > > > > https://lore.kernel.org/r/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com
> > > > > 
> > > > > "-merge-platform" shouldn't be a QEMU parameter, but should be something
> > > > > separate.
> > > > 
> > > > Do you mean merging platform dumps should be done with another command? I
> > > > think we will want to know the QOM tree is in use when implementing
> > > > -merge-platform. For example, you cannot define a "platform" when e.g., you
> > > > don't know what netdev backend (e.g., user, vhost-net, vhost-vdpa) is
> > > > connected to virtio-net devices. Of course we can include those information
> > > > in dumps, but we don't do so for VMState.
> > > 
> > > What I was thinking is the generated platform dump shouldn't care about
> > > what is used as backend: it should try to probe whatever is specified in
> > > the qemu cmdline, and it's the user's job to make sure the exact same qemu
> > > cmdline is used in other hosts to dump this information.
> > > 
> > > IOW, the dump will only contain the information that was based on the qemu
> > > cmdline.  E.g., if it doesn't include virtio device at all, and if we only
> > > support such dump for virtio, it should dump nothing.
> > > 
> > > Then the -merge-platform will expect all dumps to look the same too,
> > > merging them with AND on each field.
> > 
> > I think we will still need the QOM tree in that case. I think the platform
> > information will look somewhat similar to VMState, which requires the QOM
> > tree to interpret.
> 
> Ah yes, I assume you meant when multiple devices can report different thing
> even if with the same frontend / device type.  QOM should work, or anything
> that can identify a device, e.g. with id / instance_id attached along with
> the device class.
> 
> One thing that I still don't know how it works is how it interacts with new
> hosts being added.
> 
> This idea is based on the fact that the cluster is known before starting
> any VM.  However in reality I think it can happen when VMs started with a
> small cluster but then cluster extended, when the -merge-platform has been
> done on the smaller set.

This is why I think we (also?) need a way to dump the config and then
give it to qemu on the destination: to give management a simple way to
know whether migration has a chance of working.


> > 
> > > 
> > > Said that, I actually am still not clear on how / whether it should work at
> > > last.  At least my previous concern (1) didn't has a good answer yet, on
> > > what we do when profile collisions with qemu cmdlines.  So far I actually
> > > still think it more straightforward that in migration we handshake on these
> > > capabilities if possible.
> > > 
> > > And that's why I was thinking (where I totally agree with you on this) that
> > > whether we should settle a short term plan first to be on the safe side
> > > that we start with migration always being compatible, then we figure the
> > > other approach.  That seems easier to me, and it's also a matter of whether
> > > we want to do something for 9.1, or leaving that for 9.2 for USO*.
> > 
> > I suggest disabling all offload features of virtio-net with 9.2.
> > 
> > I want to keep things consistent so I want to disable all at once. This
> > change will be very uncomfortable for us, who are implementing offload
> > features, but I hope it will motivate us to implement a proper solution.
> > 
> > That said, it will be surely a breaking change so we should wait for 9.1
> > before making such a change.
> 
> Personally I don't worry too much on other offload bits besides USO* so far
> if we have them ON for longer time.  My wish was that they're old good
> kernel features mostly supported everywhere who runs QEMU, then we're good.
> 
> And I definitely worry about future offload features, or any feature that
> may probe host like this and auto-OFF: I hope we can do them on the safe
> side starting from day1.
> 
> So I don't know whether we should do that to USO* only or all.  But I agree
> with you that'll definitely be cleaner.
> 
> On the details of how to turn them off properly..  Taking an example if we
> want to turn off all the offload features by default (or simply we replace
> that with USO-only)..
> 
> Upstream machine type is flexible to all kinds of kernels, so we may not
> want to regress anyone using an existing machine type even on perf,
> especially if we want to turn off all.
> 
> In that case we may need one more knob (I'm assuming this is virtio-net
> specific issue, but maybe not; using it as an example) to make sure the old
> machine types perfs as well, with:
> 
>   - x-virtio-net-offload-enforce
> 
>     When set, the offload features with value ON are enforced, so when
>     the host doesn't support a offload feature it will fail to boot,
>     showing the error that specific offload feature is not supported by the
>     virtio backend.
> 
>     When clear, the offload features with value ON are not enforced, so
>     these features can be automatically turned OFF when it's detected the
>     backend doesn't support them.  This may bring best perf but has the
>     risk of breaking migration.
> 
> With that,
> 
>   - On old machine types (compat properties):
> 
>     - set "x-virtio-net-offload-enforce" OFF
>     - set all offload features ON
> 
>   - On new machine types (the default values):
> 
>     - set "x-virtio-net-offload-enforce" ON
>     - set all offload features OFF
> 
> And yes, we can do that until 9.2, but with above even 9.1 should be safe
> to do.  9.2 might be still easier just to think everything through again,
> after all at least USO was introduced in 8.2 so not a regress in 9.1.
> 
> > 
> > By the way, I am wondering perhaps the "no-cross-migrate" scenario can be
> > implemented relatively easy in a way similar to compatibility properties.
> > The idea is to add the "no-cross-migrate" property to machines. If the
> > property is set to "on", all offload features of virtio-net will be set to
> > "auto". virtio-net will then probe the offload features and enable available
> > offloading features.
> 
> If it'll become a device property, there's still the trick / concern where
> no-cross-migrate could conflict with the other offload feature that was
> selected explicilty by an user (e.g. no-cross-migrate=ON + uso=OFF).
> 
> Thanks,
> 
> -- 
> Peter Xu
Akihiko Odaki Aug. 5, 2024, 7:27 a.m. UTC | #73
On 2024/08/04 22:08, Peter Xu wrote:
> On Sun, Aug 04, 2024 at 03:49:45PM +0900, Akihiko Odaki wrote:
>> On 2024/08/03 1:26, Peter Xu wrote:
>>> On Sat, Aug 03, 2024 at 12:54:51AM +0900, Akihiko Odaki wrote:
>>>>>>> I'm not sure if I read it right.  Perhaps you meant something more generic
>>>>>>> than -platform but similar?
>>>>>>>
>>>>>>> For example, "-profile [PROFILE]" qemu cmdline, where PROFILE can be either
>>>>>>> "perf" or "compat", while by default to "compat"?
>>>>>>
>>>>>> "perf" would cover 4) and "compat" will cover 1). However neither of them
>>>>>> will cover 2) because an enum is not enough to know about all hosts. I
>>>>>> presented a design that will cover 2) in:
>>>>>> https://lore.kernel.org/r/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com
>>>>>
>>>>> "-merge-platform" shouldn't be a QEMU parameter, but should be something
>>>>> separate.
>>>>
>>>> Do you mean merging platform dumps should be done with another command? I
>>>> think we will want to know the QOM tree is in use when implementing
>>>> -merge-platform. For example, you cannot define a "platform" when e.g., you
>>>> don't know what netdev backend (e.g., user, vhost-net, vhost-vdpa) is
>>>> connected to virtio-net devices. Of course we can include those information
>>>> in dumps, but we don't do so for VMState.
>>>
>>> What I was thinking is the generated platform dump shouldn't care about
>>> what is used as backend: it should try to probe whatever is specified in
>>> the qemu cmdline, and it's the user's job to make sure the exact same qemu
>>> cmdline is used in other hosts to dump this information.
>>>
>>> IOW, the dump will only contain the information that was based on the qemu
>>> cmdline.  E.g., if it doesn't include virtio device at all, and if we only
>>> support such dump for virtio, it should dump nothing.
>>>
>>> Then the -merge-platform will expect all dumps to look the same too,
>>> merging them with AND on each field.
>>
>> I think we will still need the QOM tree in that case. I think the platform
>> information will look somewhat similar to VMState, which requires the QOM
>> tree to interpret.
> 
> Ah yes, I assume you meant when multiple devices can report different thing
> even if with the same frontend / device type.  QOM should work, or anything
> that can identify a device, e.g. with id / instance_id attached along with
> the device class.
> 
> One thing that I still don't know how it works is how it interacts with new
> hosts being added.
> 
> This idea is based on the fact that the cluster is known before starting
> any VM.  However in reality I think it can happen when VMs started with a
> small cluster but then cluster extended, when the -merge-platform has been
> done on the smaller set.
> 
>>
>>>
>>> Said that, I actually am still not clear on how / whether it should work at
>>> last.  At least my previous concern (1) didn't has a good answer yet, on
>>> what we do when profile collisions with qemu cmdlines.  So far I actually
>>> still think it more straightforward that in migration we handshake on these
>>> capabilities if possible.
>>>
>>> And that's why I was thinking (where I totally agree with you on this) that
>>> whether we should settle a short term plan first to be on the safe side
>>> that we start with migration always being compatible, then we figure the
>>> other approach.  That seems easier to me, and it's also a matter of whether
>>> we want to do something for 9.1, or leaving that for 9.2 for USO*.
>>
>> I suggest disabling all offload features of virtio-net with 9.2.
>>
>> I want to keep things consistent so I want to disable all at once. This
>> change will be very uncomfortable for us, who are implementing offload
>> features, but I hope it will motivate us to implement a proper solution.
>>
>> That said, it will be surely a breaking change so we should wait for 9.1
>> before making such a change.
> 
> Personally I don't worry too much on other offload bits besides USO* so far
> if we have them ON for longer time.  My wish was that they're old good
> kernel features mostly supported everywhere who runs QEMU, then we're good.

Unfortunately, we cannot expect everyone to run Linux, and the offload
features are provided by Linux. However, QEMU can run on other platforms,
where offload features may be provided by vhost-user or vhost-vdpa.

> 
> And I definitely worry about future offload features, or any feature that
> may probe host like this and auto-OFF: I hope we can do them on the safe
> side starting from day1.
> 
> So I don't know whether we should do that to USO* only or all.  But I agree
> with you that'll definitely be cleaner.
> 
> On the details of how to turn them off properly..  Taking an example if we
> want to turn off all the offload features by default (or simply we replace
> that with USO-only)..
> 
> Upstream machine type is flexible to all kinds of kernels, so we may not
> want to regress anyone using an existing machine type even on perf,
> especially if we want to turn off all.
> 
> In that case we may need one more knob (I'm assuming this is virtio-net
> specific issue, but maybe not; using it as an example) to make sure the old
> machine types perfs as well, with:
> 
>    - x-virtio-net-offload-enforce
> 
>      When set, the offload features with value ON are enforced, so when
>      the host doesn't support a offload feature it will fail to boot,
>      showing the error that specific offload feature is not supported by the
>      virtio backend.
> 
>      When clear, the offload features with value ON are not enforced, so
>      these features can be automatically turned OFF when it's detected the
>      backend doesn't support them.  This may bring best perf but has the
>      risk of breaking migration.

"[PATCH v3 0/5] virtio-net: Convert feature properties to OnOffAuto" 
adds "x-force-features-auto" compatibility property to virtio-net for 
this purpose:
https://lore.kernel.org/r/20240714-auto-v3-0-e27401aabab3@daynix.com

> 
> With that,
> 
>    - On old machine types (compat properties):
> 
>      - set "x-virtio-net-offload-enforce" OFF
>      - set all offload features ON
> 
>    - On new machine types (the default values):
> 
>      - set "x-virtio-net-offload-enforce" ON
>      - set all offload features OFF
> 
> And yes, we can do that until 9.2, but with above even 9.1 should be safe
> to do.  9.2 might be still easier just to think everything through again,
> after all at least USO was introduced in 8.2 so not a regress in 9.1.
> 
>>
>> By the way, I am wondering perhaps the "no-cross-migrate" scenario can be
>> implemented relatively easy in a way similar to compatibility properties.
>> The idea is to add the "no-cross-migrate" property to machines. If the
>> property is set to "on", all offload features of virtio-net will be set to
>> "auto". virtio-net will then probe the offload features and enable available
>> offloading features.
> 
> If it'll become a device property, there's still the trick / concern where
> no-cross-migrate could conflict with the other offload feature that was
> selected explicilty by an user (e.g. no-cross-migrate=ON + uso=OFF).
With no-cross-migrate=ON + uso=OFF, no-cross-migrate will set uso=auto,
but the user overrides that with uso=off. As a consequence, USO will be
disabled but all other available offload features will be enabled.
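
A minimal sketch of that precedence, assuming the OnOffAuto conversion
from the series above; the helper itself is hypothetical:

    #include <stdbool.h>

    /* mirrors QEMU's qapi OnOffAuto, redeclared to keep the sketch
     * self-contained */
    typedef enum { ON_OFF_AUTO_AUTO, ON_OFF_AUTO_ON, ON_OFF_AUTO_OFF } OnOffAuto;

    static bool resolve_offload(OnOffAuto requested, bool backend_has_it)
    {
        switch (requested) {
        case ON_OFF_AUTO_ON:  return true;            /* enforced by the user */
        case ON_OFF_AUTO_OFF: return false;           /* explicit override */
        default:              return backend_has_it;  /* AUTO: probe backend */
        }
    }

With no-cross-migrate=ON + uso=OFF, uso resolves through the OFF arm
while every other offload resolves through AUTO.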

Regards,
Akihiko Odaki
Michael S. Tsirkin Aug. 5, 2024, 7:30 a.m. UTC | #74
On Sun, Aug 04, 2024 at 03:49:45PM +0900, Akihiko Odaki wrote:
> I suggest disabling all offload features of virtio-net with 9.2.

Yea ... no.

> I want to keep things consistent so I want to disable all at once. This
> change will be very uncomfortable for us, who are implementing offload
> features, but I hope it will motivate us to implement a proper solution.

It's uncomfortable for users.
Akihiko Odaki Aug. 5, 2024, 7:53 a.m. UTC | #75
On 2024/08/05 16:30, Michael S. Tsirkin wrote:
> On Sun, Aug 04, 2024 at 03:49:45PM +0900, Akihiko Odaki wrote:
>> I suggest disabling all offload features of virtio-net with 9.2.
> 
> Yea ... no.
> 
>> I want to keep things consistent so I want to disable all at once. This
>> change will be very uncomfortable for us, who are implementing offload
>> features, but I hope it will motivate us to implement a proper solution.
> 
> It's uncomfortable for users.

An obvious alternative is to set cross-migrate=off by default (I dropped
the no- prefix because no-cross-migrate=off is confusing). I don't have a
strong opinion on whether cross-migrate should be on or off by default.

This is a trade-off between safety and performance. In general, I believe
safety should come before performance.

On the other hand, disabling offload features is a breaking change. QEMU
also has the -only-migratable option; it would be more consistent to make
the additional assurance for migration opt-in instead of opt-out. Finally,
I see migration across hosts as an advanced feature, so perhaps it can be
justified to treat it as an optional one.

Regards,
Akihiko Odaki
Michael S. Tsirkin Aug. 5, 2024, 8:23 a.m. UTC | #76
On Mon, Aug 05, 2024 at 04:53:52PM +0900, Akihiko Odaki wrote:
> On 2024/08/05 16:30, Michael S. Tsirkin wrote:
> > On Sun, Aug 04, 2024 at 03:49:45PM +0900, Akihiko Odaki wrote:
> > > I suggest disabling all offload features of virtio-net with 9.2.
> > 
> > Yea ... no.
> > 
> > > I want to keep things consistent so I want to disable all at once. This
> > > change will be very uncomfortable for us, who are implementing offload
> > > features, but I hope it will motivate us to implement a proper solution.
> > 
> > It's uncomfortable for users.
> 
> An obvious alternative is to set cross-migrate=off by default (I dropped the
> no- prefix because no-cross-migrate=off is confusing). I don't have a
> particular idea whether cross-migrate should be on or off by default.
> 
> This is a trade-off of safety and performance. In general, I believe safety
> should come first before performance.

There's no actual safety issue here. You can't migrate certain configurations.
So I don't really understand what "cross-migrate" is supposed to do:
fail migration in 100% of cases?

I can see value in getting the configuration from the source and not
starting qemu on the destination if it cannot be migrated. This is rather
straightforward, and seems directly useful for management.
I also see value in getting the configuration from the destination and
starting on the source only if it can be migrated. As a variant of the
last one, I may see value in getting that info from multiple destinations.
Using this last option would be trickier because it's not at the libvirt
level, so we would need very good documentation.
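
For the multiple-destinations variant, the AND-merge mentioned earlier in
the thread could be as simple as the following sketch; the dump layout is
purely an assumption:

    #include <stdint.h>

    typedef struct PlatformDump {
        uint64_t virtio_net_features;   /* offload bits probed on one host */
    } PlatformDump;

    /* keep only what every candidate host supports */
    static void merge_platform(PlatformDump *acc, const PlatformDump *host)
    {
        acc->virtio_net_features &= host->virtio_net_features;
    }

Management would run the same probe (with the same qemu cmdline) on each
candidate host, merge the dumps, and feed the result back when starting
the VM.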

> On the other hand, disabling offload features is a breaking change. QEMU
> also has -only-migratable option; it is more consistent to make the
> additional assurance for migration opt-in instead of opt-out. Finally, I see
> migration across hosts as an advanced feature, and perhaps it can be
> justified to make it more like an optional feature.
> 
> Regards,
> Akihiko Odaki

It's already an optional feature.
Akihiko Odaki Aug. 5, 2024, 9:37 a.m. UTC | #77
On 2024/08/05 17:23, Michael S. Tsirkin wrote:
> On Mon, Aug 05, 2024 at 04:53:52PM +0900, Akihiko Odaki wrote:
>> On 2024/08/05 16:30, Michael S. Tsirkin wrote:
>>> On Sun, Aug 04, 2024 at 03:49:45PM +0900, Akihiko Odaki wrote:
>>>> I suggest disabling all offload features of virtio-net with 9.2.
>>>
>>> Yea ... no.
>>>
>>>> I want to keep things consistent so I want to disable all at once. This
>>>> change will be very uncomfortable for us, who are implementing offload
>>>> features, but I hope it will motivate us to implement a proper solution.
>>>
>>> It's uncomfortable for users.
>>
>> An obvious alternative is to set cross-migrate=off by default (I dropped the
>> no- prefix because no-cross-migrate=off is confusing). I don't have a
>> particular idea whether cross-migrate should be on or off by default.
>>
>> This is a trade-off of safety and performance. In general, I believe safety
>> should come first before performance.
> 
> There's no actual safety issue here. You can't migrate certain configurations.
> So I don't really understand what "cross-migrate" is supposed to do:
> fail migration in 100% of cases?

"cross-migrate" means migration among hosts with different platforms 
(e.g., different kernels, vDPA devices).

If cross-migrate=off, QEMU can still migrate on the same host 
(checkpoint and restart). QEMU can also migrate across hosts if the user 
ensures they are on the same platform.
Michael S. Tsirkin Aug. 5, 2024, 10:08 a.m. UTC | #78
On Mon, Aug 05, 2024 at 06:37:58PM +0900, Akihiko Odaki wrote:
> If cross-migrate=off, QEMU can still migrate on the same host (checkpoint
> and restart). QEMU can also migrate across hosts if the user ensures they
> are on the same platform.

What is so special about checkpoint/restart? I guess we hope that
downgrades are uncommon, but they are possible...
Akihiko Odaki Aug. 6, 2024, 7:35 a.m. UTC | #79
On 2024/08/05 19:08, Michael S. Tsirkin wrote:
> On Mon, Aug 05, 2024 at 06:37:58PM +0900, Akihiko Odaki wrote:
>> If cross-migrate=off, QEMU can still migrate on the same host (checkpoint
>> and restart). QEMU can also migrate across hosts if the user ensures they
>> are on the same platform.
> 
> What is so special about checkpoint/restart? I guess we hope that
> downgrades are uncommon, but they are possible...

Downgrades will not work with cross-migrate=off. Users who want 
downgrades should use cross-migrate=on.
Michael S. Tsirkin Aug. 6, 2024, 1:29 p.m. UTC | #80
On Tue, Aug 06, 2024 at 04:35:44PM +0900, Akihiko Odaki wrote:
> On 2024/08/05 19:08, Michael S. Tsirkin wrote:
> > On Mon, Aug 05, 2024 at 06:37:58PM +0900, Akihiko Odaki wrote:
> > > If cross-migrate=off, QEMU can still migrate on the same host (checkpoint
> > > and restart). QEMU can also migrate across hosts if the user ensures they
> > > are on the same platform.
> > 
> > What is so special about checkpoint/restart? I guess we hope that
> > downgrades are uncommon, but they are possible...
> 
> Downgrades will not work with cross-migrate=off. Users who want downgrades
> should use cross-migrate=on.

We also don't know that an upgrade will not disable a feature: that can
happen if, e.g., there's a serious bug in the feature.
Basically, this makes the feature too fragile, in my opinion.
Peter Xu Aug. 6, 2024, 8:41 p.m. UTC | #81
On Mon, Aug 05, 2024 at 04:27:43PM +0900, Akihiko Odaki wrote:
> On 2024/08/04 22:08, Peter Xu wrote:
> > On Sun, Aug 04, 2024 at 03:49:45PM +0900, Akihiko Odaki wrote:
> > > On 2024/08/03 1:26, Peter Xu wrote:
> > > > On Sat, Aug 03, 2024 at 12:54:51AM +0900, Akihiko Odaki wrote:
> > > > > > > > I'm not sure if I read it right.  Perhaps you meant something more generic
> > > > > > > > than -platform but similar?
> > > > > > > > 
> > > > > > > > For example, "-profile [PROFILE]" qemu cmdline, where PROFILE can be either
> > > > > > > > "perf" or "compat", while by default to "compat"?
> > > > > > > 
> > > > > > > "perf" would cover 4) and "compat" will cover 1). However neither of them
> > > > > > > will cover 2) because an enum is not enough to know about all hosts. I
> > > > > > > presented a design that will cover 2) in:
> > > > > > > https://lore.kernel.org/r/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com
> > > > > > 
> > > > > > "-merge-platform" shouldn't be a QEMU parameter, but should be something
> > > > > > separate.
> > > > > 
> > > > > Do you mean merging platform dumps should be done with another command? I
> > > > > think we will want to know the QOM tree is in use when implementing
> > > > > -merge-platform. For example, you cannot define a "platform" when e.g., you
> > > > > don't know what netdev backend (e.g., user, vhost-net, vhost-vdpa) is
> > > > > connected to virtio-net devices. Of course we can include those information
> > > > > in dumps, but we don't do so for VMState.
> > > > 
> > > > What I was thinking is the generated platform dump shouldn't care about
> > > > what is used as backend: it should try to probe whatever is specified in
> > > > the qemu cmdline, and it's the user's job to make sure the exact same qemu
> > > > cmdline is used in other hosts to dump this information.
> > > > 
> > > > IOW, the dump will only contain the information that was based on the qemu
> > > > cmdline.  E.g., if it doesn't include virtio device at all, and if we only
> > > > support such dump for virtio, it should dump nothing.
> > > > 
> > > > Then the -merge-platform will expect all dumps to look the same too,
> > > > merging them with AND on each field.
> > > 
> > > I think we will still need the QOM tree in that case. I think the platform
> > > information will look somewhat similar to VMState, which requires the QOM
> > > tree to interpret.
> > 
> > Ah yes, I assume you meant when multiple devices can report different thing
> > even if with the same frontend / device type.  QOM should work, or anything
> > that can identify a device, e.g. with id / instance_id attached along with
> > the device class.
> > 
> > One thing that I still don't know how it works is how it interacts with new
> > hosts being added.
> > 
> > This idea is based on the fact that the cluster is known before starting
> > any VM.  However in reality I think it can happen when VMs started with a
> > small cluster but then cluster extended, when the -merge-platform has been
> > done on the smaller set.
> > 
> > > 
> > > > 
> > > > Said that, I actually am still not clear on how / whether it should work at
> > > > last.  At least my previous concern (1) didn't has a good answer yet, on
> > > > what we do when profile collisions with qemu cmdlines.  So far I actually
> > > > still think it more straightforward that in migration we handshake on these
> > > > capabilities if possible.
> > > > 
> > > > And that's why I was thinking (where I totally agree with you on this) that
> > > > whether we should settle a short term plan first to be on the safe side
> > > > that we start with migration always being compatible, then we figure the
> > > > other approach.  That seems easier to me, and it's also a matter of whether
> > > > we want to do something for 9.1, or leaving that for 9.2 for USO*.
> > > 
> > > I suggest disabling all offload features of virtio-net with 9.2.
> > > 
> > > I want to keep things consistent so I want to disable all at once. This
> > > change will be very uncomfortable for us, who are implementing offload
> > > features, but I hope it will motivate us to implement a proper solution.
> > > 
> > > That said, it will be surely a breaking change so we should wait for 9.1
> > > before making such a change.
> > 
> > Personally I don't worry too much on other offload bits besides USO* so far
> > if we have them ON for longer time.  My wish was that they're old good
> > kernel features mostly supported everywhere who runs QEMU, then we're good.
> 
> Unfortunately, we cannot expect everyone runs Linux, and the offload
> features are provided by Linux. However, QEMU can run on other platforms,
> and offload features may be provided by vhost-user or vhost-vdpa.

I see.  I am not familiar with the status quo there, so I'll leave that
to you and other experts who know this better.

Personally I care more about Linux, as that's what we ship within RH.

> 
> > 
> > And I definitely worry about future offload features, or any feature that
> > may probe host like this and auto-OFF: I hope we can do them on the safe
> > side starting from day1.
> > 
> > So I don't know whether we should do that to USO* only or all.  But I agree
> > with you that'll definitely be cleaner.
> > 
> > On the details of how to turn them off properly..  Taking an example if we
> > want to turn off all the offload features by default (or simply we replace
> > that with USO-only)..
> > 
> > Upstream machine type is flexible to all kinds of kernels, so we may not
> > want to regress anyone using an existing machine type even on perf,
> > especially if we want to turn off all.
> > 
> > In that case we may need one more knob (I'm assuming this is virtio-net
> > specific issue, but maybe not; using it as an example) to make sure the old
> > machine types perfs as well, with:
> > 
> >    - x-virtio-net-offload-enforce
> > 
> >      When set, the offload features with value ON are enforced, so when
> >      the host doesn't support a offload feature it will fail to boot,
> >      showing the error that specific offload feature is not supported by the
> >      virtio backend.
> > 
> >      When clear, the offload features with value ON are not enforced, so
> >      these features can be automatically turned OFF when it's detected the
> >      backend doesn't support them.  This may bring best perf but has the
> >      risk of breaking migration.
> 
> "[PATCH v3 0/5] virtio-net: Convert feature properties to OnOffAuto" adds
> "x-force-features-auto" compatibility property to virtio-net for this
> purpose:
> https://lore.kernel.org/r/20240714-auto-v3-0-e27401aabab3@daynix.com

Ah ok.  But note that there's still a slight difference: we need to avoid
AUTO being an option at all, IMHO.

It's about making the qemu cmdline the ABI: with AUTO it's still possible
that the user uses AUTO on both sides, and then the ABI may not be
guaranteed.

AUTO would be fine if: (1) the property doesn't affect the guest ABI, or
(2) the AUTO bit will always generate the same thing on both hosts.
However, USO* is not such a case, so the AUTO option is IMHO not wanted.

The "x-virtio-net-offload-enforce" knob I mentioned above shouldn't add
anything new to "uso"; it can still only be ON/OFF.  However, it should
select between the "flip it to OFF automatically" and "fail the boot"
behaviors on missing features.

> 
> > 
> > With that,
> > 
> >    - On old machine types (compat properties):
> > 
> >      - set "x-virtio-net-offload-enforce" OFF
> >      - set all offload features ON
> > 
> >    - On new machine types (the default values):
> > 
> >      - set "x-virtio-net-offload-enforce" ON
> >      - set all offload features OFF
> > 
> > And yes, we can do that until 9.2, but with above even 9.1 should be safe
> > to do.  9.2 might be still easier just to think everything through again,
> > after all at least USO was introduced in 8.2 so not a regress in 9.1.
> > 
> > > 
> > > By the way, I am wondering perhaps the "no-cross-migrate" scenario can be
> > > implemented relatively easy in a way similar to compatibility properties.
> > > The idea is to add the "no-cross-migrate" property to machines. If the
> > > property is set to "on", all offload features of virtio-net will be set to
> > > "auto". virtio-net will then probe the offload features and enable available
> > > offloading features.
> > 
> > If it'll become a device property, there's still the trick / concern where
> > no-cross-migrate could conflict with the other offload feature that was
> > selected explicilty by an user (e.g. no-cross-migrate=ON + uso=OFF).
> With no-cross-migrate=ON + uso=OFF, no-cross-migrate will set uso=auto, but
> the user overrides with uso=off. As the consequence, USO will be disabled
> but all other available offload features will be enabled.

Basically you're saying that no-cross-migrate has lower priority than
specific feature bits.  That's OK with me.

Thanks,
Akihiko Odaki Aug. 8, 2024, 10:52 a.m. UTC | #82
On 2024/08/06 22:29, Michael S. Tsirkin wrote:
> On Tue, Aug 06, 2024 at 04:35:44PM +0900, Akihiko Odaki wrote:
>> On 2024/08/05 19:08, Michael S. Tsirkin wrote:
>>> On Mon, Aug 05, 2024 at 06:37:58PM +0900, Akihiko Odaki wrote:
>>>> If cross-migrate=off, QEMU can still migrate on the same host (checkpoint
>>>> and restart). QEMU can also migrate across hosts if the user ensures they
>>>> are on the same platform.
>>>
>>> What is so special about checkpoint/restart? I guess we hope that
>>> downgrades are uncommon, but they are possible...
>>
>> Downgrades will not work with cross-migrate=off. Users who want downgrades
>> should use cross-migrate=on.
> 
> We also don't know that upgrades do not disable a feature:
> can happen if e.g. there's a serious bug in the feature.
> Basically, this makes the feature too fragile, in my opinion.

There is nothing we can do in such a case. Whether it is on a single
host or across multiple hosts, we cannot support migration if features
that were once enabled disappear.

Regards,
Akihiko Odaki
Michael S. Tsirkin Aug. 8, 2024, 10:54 a.m. UTC | #83
On Thu, Aug 08, 2024 at 07:52:37PM +0900, Akihiko Odaki wrote:
> On 2024/08/06 22:29, Michael S. Tsirkin wrote:
> > On Tue, Aug 06, 2024 at 04:35:44PM +0900, Akihiko Odaki wrote:
> > > On 2024/08/05 19:08, Michael S. Tsirkin wrote:
> > > > On Mon, Aug 05, 2024 at 06:37:58PM +0900, Akihiko Odaki wrote:
> > > > > If cross-migrate=off, QEMU can still migrate on the same host (checkpoint
> > > > > and restart). QEMU can also migrate across hosts if the user ensures they
> > > > > are on the same platform.
> > > > 
> > > > What is so special about checkpoint/restart? I guess we hope that
> > > > downgrades are uncommon, but they are possible...
> > > 
> > > Downgrades will not work with cross-migrate=off. Users who want downgrades
> > > should use cross-migrate=on.
> > 
> > We also don't know that upgrades do not disable a feature:
> > can happen if e.g. there's a serious bug in the feature.
> > Basically, this makes the feature too fragile, in my opinion.
> 
> We can do nothing in such a case. Whether it is on a single host or multiple
> hosts, we cannot support migration if features once enabled disappear.
> 
> Regards,
> Akihiko Odaki

It does not follow that because we have to do something, and this is
something, we therefore have to do this.

This is just a reason not to handle checkpoint/restart any differently
than any other migration.
Akihiko Odaki Aug. 8, 2024, 11:03 a.m. UTC | #84
On 2024/08/08 19:54, Michael S. Tsirkin wrote:
> On Thu, Aug 08, 2024 at 07:52:37PM +0900, Akihiko Odaki wrote:
>> On 2024/08/06 22:29, Michael S. Tsirkin wrote:
>>> On Tue, Aug 06, 2024 at 04:35:44PM +0900, Akihiko Odaki wrote:
>>>> On 2024/08/05 19:08, Michael S. Tsirkin wrote:
>>>>> On Mon, Aug 05, 2024 at 06:37:58PM +0900, Akihiko Odaki wrote:
>>>>>> If cross-migrate=off, QEMU can still migrate on the same host (checkpoint
>>>>>> and restart). QEMU can also migrate across hosts if the user ensures they
>>>>>> are on the same platform.
>>>>>
>>>>> What is so special about checkpoint/restart? I guess we hope that
>>>>> downgrades are uncommon, but they are possible...
>>>>
>>>> Downgrades will not work with cross-migrate=off. Users who want downgrades
>>>> should use cross-migrate=on.
>>>
>>> We also don't know that upgrades do not disable a feature:
>>> can happen if e.g. there's a serious bug in the feature.
>>> Basically, this makes the feature too fragile, in my opinion.
>>
>> We can do nothing in such a case. Whether it is on a single host or multiple
>> hosts, we cannot support migration if features once enabled disappear.
>>
>> Regards,
>> Akihiko Odaki
> 
> It does not follow that we have to do something, and this is something,
> therefore that we have to do this.
> 
> This is just a reason not to handle checkpoint/restart any different
> than any other migration.

Whether it is checkpoint/restart or any other migration, I expect
platform features won't disappear from the host(s); we can't readily
support migration in such a situation.

As long as platform features don't disappear, for checkpoint/restart we
can enable all available features without disrupting migration;
cross-migrate=off will instruct that.

However, if we are migrating a VM across hosts and the user doesn't
ensure they are on the same platform, we cannot enable optional platform
features, even if we are sure that features already present on a host
won't disappear, because some hosts may not have those features in the
first place. We can set cross-migrate=on in such a case to disable
optional platform features.
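
A sketch of those two modes as a feature-mask decision; both helpers are
hypothetical stand-ins for a machine property and a host probe:

    #include <stdbool.h>
    #include <stdint.h>

    static uint64_t allowed_features(uint64_t baseline, uint64_t host_probed,
                                     bool cross_migrate)
    {
        if (cross_migrate) {
            /* only the baseline every target platform is assumed to have */
            return baseline;
        }
        /* same host (checkpoint/restart) or known-identical hosts:
         * everything the current host offers is fair game */
        return baseline | host_probed;
    }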

Regards,
Akihiko Odaki
Michael S. Tsirkin Aug. 8, 2024, 11:12 a.m. UTC | #85
On Thu, Aug 08, 2024 at 08:03:25PM +0900, Akihiko Odaki wrote:
> On 2024/08/08 19:54, Michael S. Tsirkin wrote:
> > On Thu, Aug 08, 2024 at 07:52:37PM +0900, Akihiko Odaki wrote:
> > > On 2024/08/06 22:29, Michael S. Tsirkin wrote:
> > > > On Tue, Aug 06, 2024 at 04:35:44PM +0900, Akihiko Odaki wrote:
> > > > > On 2024/08/05 19:08, Michael S. Tsirkin wrote:
> > > > > > On Mon, Aug 05, 2024 at 06:37:58PM +0900, Akihiko Odaki wrote:
> > > > > > > If cross-migrate=off, QEMU can still migrate on the same host (checkpoint
> > > > > > > and restart). QEMU can also migrate across hosts if the user ensures they
> > > > > > > are on the same platform.
> > > > > > 
> > > > > > What is so special about checkpoint/restart? I guess we hope that
> > > > > > downgrades are uncommon, but they are possible...
> > > > > 
> > > > > Downgrades will not work with cross-migrate=off. Users who want downgrades
> > > > > should use cross-migrate=on.
> > > > 
> > > > We also don't know that upgrades do not disable a feature:
> > > > can happen if e.g. there's a serious bug in the feature.
> > > > Basically, this makes the feature too fragile, in my opinion.
> > > 
> > > We can do nothing in such a case. Whether it is on a single host or multiple
> > > hosts, we cannot support migration if features once enabled disappear.
> > > 
> > > Regards,
> > > Akihiko Odaki
> > 
> > It does not follow that we have to do something, and this is something,
> > therefore that we have to do this.
> > 
> > This is just a reason not to handle checkpoint/restart any different
> > than any other migration.
> 
> Whethere it is checkpoint/restart or any other migration, I expect platform
> features won't disappear from the host(s); we can't readily support
> migration in such a situation.


We can if we mask the features from the guest before starting the VM.

Or, if we didn't, we can fail gracefully.

> When platform features won't disappear, for checkpoint/restart, we can
> enable all available features without disrupting migration;
> cross-migrate=off will instruct that.
> 
> However, if we are migrating a VM across hosts and the user doesn't ensure
> they are on the same platform, we cannot enable platform features even if we
> are sure that platform features already present on a host won't disappear
> because some hosts may not have features in the first place. We can set
> cross-migrate=on in such a case to disable optional platform features.
> 
> Regards,
> Akihiko Odaki


This is too big a hammer. People have already been using what you call
"cross migrate" for years. We are not going to stop developing features
just because someone suddenly became aware of some such bit.
If you care, you will have to work to solve the problem properly -
nacking half-baked hacks is the only tool maintainers have to make
people work on hard problems.
Akihiko Odaki Aug. 8, 2024, 11:32 a.m. UTC | #86
On 2024/08/08 20:12, Michael S. Tsirkin wrote:
> On Thu, Aug 08, 2024 at 08:03:25PM +0900, Akihiko Odaki wrote:
>> On 2024/08/08 19:54, Michael S. Tsirkin wrote:
>>> On Thu, Aug 08, 2024 at 07:52:37PM +0900, Akihiko Odaki wrote:
>>>> On 2024/08/06 22:29, Michael S. Tsirkin wrote:
>>>>> On Tue, Aug 06, 2024 at 04:35:44PM +0900, Akihiko Odaki wrote:
>>>>>> On 2024/08/05 19:08, Michael S. Tsirkin wrote:
>>>>>>> On Mon, Aug 05, 2024 at 06:37:58PM +0900, Akihiko Odaki wrote:
>>>>>>>> If cross-migrate=off, QEMU can still migrate on the same host (checkpoint
>>>>>>>> and restart). QEMU can also migrate across hosts if the user ensures they
>>>>>>>> are on the same platform.
>>>>>>>
>>>>>>> What is so special about checkpoint/restart? I guess we hope that
>>>>>>> downgrades are uncommon, but they are possible...
>>>>>>
>>>>>> Downgrades will not work with cross-migrate=off. Users who want downgrades
>>>>>> should use cross-migrate=on.
>>>>>
>>>>> We also don't know that upgrades do not disable a feature:
>>>>> can happen if e.g. there's a serious bug in the feature.
>>>>> Basically, this makes the feature too fragile, in my opinion.
>>>>
>>>> We can do nothing in such a case. Whether it is on a single host or multiple
>>>> hosts, we cannot support migration if features once enabled disappear.
>>>>
>>>> Regards,
>>>> Akihiko Odaki
>>>
>>> It does not follow that we have to do something, and this is something,
>>> therefore that we have to do this.
>>>
>>> This is just a reason not to handle checkpoint/restart any different
>>> than any other migration.
>>
>> Whethere it is checkpoint/restart or any other migration, I expect platform
>> features won't disappear from the host(s); we can't readily support
>> migration in such a situation.
> 
> 
> We can if we mask the features from the guest before starting VM.
> 
> Or if we didn't, we can fail gracefully.
> 
>> When platform features won't disappear, for checkpoint/restart, we can
>> enable all available features without disrupting migration;
>> cross-migrate=off will instruct that.
>>
>> However, if we are migrating a VM across hosts and the user doesn't ensure
>> they are on the same platform, we cannot enable platform features even if we
>> are sure that platform features already present on a host won't disappear
>> because some hosts may not have features in the first place. We can set
>> cross-migrate=on in such a case to disable optional platform features.
>>
>> Regards,
>> Akihiko Odaki
> 
> 
> This is too big of a hammer. People already use what you call "cross
> migrate" and have for years. We are not going to stop developing
> features just because someone suddenly became aware of some such bit.
> If you care, you will have to work to solve the problem properly -
> nacking half baked hacks is the only tool maintainers have to make
> people work on hard problems.

I think you meant cross-migrate=off, which is the current behavior.

I am not suggesting forcing cross-migrate=on or even making it the
default.
I have shown four possible scenarios earlier[a]:

1) Migration everywhere
2) Migration on specific machines
3) Migration on some known platforms
4) No migration (migration on nowhere)

Taking the discussion with Peter into account, I amend 4) as follows:
4*) Migration on one platform (checkpoint/restore)

cross-migrate=on is a complete solution for 1).
2) is dealt with another proposal of mine.[b]
3) can be solved with the -platform proposal by Daniel.[c]
4*) is what QEMU currently implements.

[a] https://lore.kernel.org/all/39a8bb8b-4191-4f41-aaf7-06df24bf3280@daynix.com/
[b] https://lore.kernel.org/all/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com/
[c] https://lore.kernel.org/all/ZqO7cR-UiGpX2rk0@redhat.com/

Regards,
Akihiko Odaki
Akihiko Odaki Aug. 8, 2024, 11:43 a.m. UTC | #87
On 2024/08/07 5:41, Peter Xu wrote:
> On Mon, Aug 05, 2024 at 04:27:43PM +0900, Akihiko Odaki wrote:
>> On 2024/08/04 22:08, Peter Xu wrote:
>>> On Sun, Aug 04, 2024 at 03:49:45PM +0900, Akihiko Odaki wrote:
>>>> On 2024/08/03 1:26, Peter Xu wrote:
>>>>> On Sat, Aug 03, 2024 at 12:54:51AM +0900, Akihiko Odaki wrote:
>>>>>>>>> I'm not sure if I read it right.  Perhaps you meant something more generic
>>>>>>>>> than -platform but similar?
>>>>>>>>>
>>>>>>>>> For example, "-profile [PROFILE]" qemu cmdline, where PROFILE can be either
>>>>>>>>> "perf" or "compat", while by default to "compat"?
>>>>>>>>
>>>>>>>> "perf" would cover 4) and "compat" will cover 1). However neither of them
>>>>>>>> will cover 2) because an enum is not enough to know about all hosts. I
>>>>>>>> presented a design that will cover 2) in:
>>>>>>>> https://lore.kernel.org/r/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com
>>>>>>>
>>>>>>> "-merge-platform" shouldn't be a QEMU parameter, but should be something
>>>>>>> separate.
>>>>>>
>>>>>> Do you mean merging platform dumps should be done with another command? I
>>>>>> think we will want to know the QOM tree is in use when implementing
>>>>>> -merge-platform. For example, you cannot define a "platform" when e.g., you
>>>>>> don't know what netdev backend (e.g., user, vhost-net, vhost-vdpa) is
>>>>>> connected to virtio-net devices. Of course we can include those information
>>>>>> in dumps, but we don't do so for VMState.
>>>>>
>>>>> What I was thinking is the generated platform dump shouldn't care about
>>>>> what is used as backend: it should try to probe whatever is specified in
>>>>> the qemu cmdline, and it's the user's job to make sure the exact same qemu
>>>>> cmdline is used in other hosts to dump this information.
>>>>>
>>>>> IOW, the dump will only contain the information that was based on the qemu
>>>>> cmdline.  E.g., if it doesn't include virtio device at all, and if we only
>>>>> support such dump for virtio, it should dump nothing.
>>>>>
>>>>> Then the -merge-platform will expect all dumps to look the same too,
>>>>> merging them with AND on each field.
>>>>
>>>> I think we will still need the QOM tree in that case. I think the platform
>>>> information will look somewhat similar to VMState, which requires the QOM
>>>> tree to interpret.
>>>
>>> Ah yes, I assume you meant when multiple devices can report different thing
>>> even if with the same frontend / device type.  QOM should work, or anything
>>> that can identify a device, e.g. with id / instance_id attached along with
>>> the device class.
>>>
>>> One thing that I still don't know how it works is how it interacts with new
>>> hosts being added.
>>>
>>> This idea is based on the fact that the cluster is known before starting
>>> any VM.  However in reality I think it can happen when VMs started with a
>>> small cluster but then cluster extended, when the -merge-platform has been
>>> done on the smaller set.
>>>
>>>>
>>>>>
>>>>> Said that, I actually am still not clear on how / whether it should work at
>>>>> last.  At least my previous concern (1) didn't has a good answer yet, on
>>>>> what we do when profile collisions with qemu cmdlines.  So far I actually
>>>>> still think it more straightforward that in migration we handshake on these
>>>>> capabilities if possible.
>>>>>
>>>>> And that's why I was thinking (where I totally agree with you on this) that
>>>>> whether we should settle a short term plan first to be on the safe side
>>>>> that we start with migration always being compatible, then we figure the
>>>>> other approach.  That seems easier to me, and it's also a matter of whether
>>>>> we want to do something for 9.1, or leaving that for 9.2 for USO*.
>>>>
>>>> I suggest disabling all offload features of virtio-net with 9.2.
>>>>
>>>> I want to keep things consistent so I want to disable all at once. This
>>>> change will be very uncomfortable for us, who are implementing offload
>>>> features, but I hope it will motivate us to implement a proper solution.
>>>>
>>>> That said, it will be surely a breaking change so we should wait for 9.1
>>>> before making such a change.
>>>
>>> Personally I don't worry too much on other offload bits besides USO* so far
>>> if we have them ON for longer time.  My wish was that they're old good
>>> kernel features mostly supported everywhere who runs QEMU, then we're good.
>>
>> Unfortunately, we cannot expect everyone runs Linux, and the offload
>> features are provided by Linux. However, QEMU can run on other platforms,
>> and offload features may be provided by vhost-user or vhost-vdpa.
> 
> I see.  I am not familiar with the status quo there, so I'll leave that to
> you and other experts that know better on this..
> 
> Personally I do care more on Linux, as that's what we ship within RH..
> 
>>
>>>
>>> And I definitely worry about future offload features, or any feature that
>>> may probe host like this and auto-OFF: I hope we can do them on the safe
>>> side starting from day1.
>>>
>>> So I don't know whether we should do that to USO* only or all.  But I agree
>>> with you that'll definitely be cleaner.
>>>
>>> On the details of how to turn them off properly..  Taking an example if we
>>> want to turn off all the offload features by default (or simply we replace
>>> that with USO-only)..
>>>
>>> Upstream machine type is flexible to all kinds of kernels, so we may not
>>> want to regress anyone using an existing machine type even on perf,
>>> especially if we want to turn off all.
>>>
>>> In that case we may need one more knob (I'm assuming this is virtio-net
>>> specific issue, but maybe not; using it as an example) to make sure the old
>>> machine types perfs as well, with:
>>>
>>>     - x-virtio-net-offload-enforce
>>>
>>>       When set, the offload features with value ON are enforced, so when
>>>       the host doesn't support a offload feature it will fail to boot,
>>>       showing the error that specific offload feature is not supported by the
>>>       virtio backend.
>>>
>>>       When clear, the offload features with value ON are not enforced, so
>>>       these features can be automatically turned OFF when it's detected the
>>>       backend doesn't support them.  This may bring best perf but has the
>>>       risk of breaking migration.
>>
>> "[PATCH v3 0/5] virtio-net: Convert feature properties to OnOffAuto" adds
>> "x-force-features-auto" compatibility property to virtio-net for this
>> purpose:
>> https://lore.kernel.org/r/20240714-auto-v3-0-e27401aabab3@daynix.com
> 
> Ah ok.  But note that there's still a slight difference: we need to avoid
> AUTO being an option, at all, IMHO.
> 
> It's about making qemu cmdline the ABI: when with AUTO it's still possible
> the user uses AUTO on both sides, then ABI may not be guaranteed.
> 
> AUTO would be fine if: (1) the property doesn't affect guest ABI, or (2)
> the AUTO bit will always generate the same thing on both hosts.  However
> USO* isn't such case.. so the AUTO option is IMHO not wanted.
> 
> What I mentioned above "x-virtio-net-offload-enforce" shouldn't add
> anything new to "uso"; it still can only be ON/OFF.  However it should
> affect "flip that to OFF automatically" or "fail the boot" behavior on
> missing features.

My rationale for the OnOffAuto change is that "flipping ON to OFF
automatically" is more confusing than making users specify AUTO to allow
QEMU to turn the feature OFF. "ON" will always make the boot fail when
the backend lacks the feature.

The ABI guarantee will be gone anyway if 
x-virtio-net-offload-enforce=off. AUTO is no different in that sense.
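
As a sketch, the compat property could map the legacy semantics onto the
new ones like this (the helper and field names are assumptions; OnOffAuto
is QEMU's qapi type, redeclared here for self-containment):

    #include <stdbool.h>

    typedef enum { ON_OFF_AUTO_AUTO, ON_OFF_AUTO_ON, ON_OFF_AUTO_OFF } OnOffAuto;

    static OnOffAuto effective_request(OnOffAuto requested,
                                       bool force_features_auto)
    {
        if (force_features_auto && requested == ON_OFF_AUTO_ON) {
            return ON_OFF_AUTO_AUTO;  /* legacy: may be flipped to OFF */
        }
        return requested;             /* new: ON fails if unsupported */
    }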

Regards,
Akihiko Odaki
Peter Xu Aug. 8, 2024, 1:55 p.m. UTC | #88
On Thu, Aug 08, 2024 at 08:43:22PM +0900, Akihiko Odaki wrote:
> On 2024/08/07 5:41, Peter Xu wrote:
> > On Mon, Aug 05, 2024 at 04:27:43PM +0900, Akihiko Odaki wrote:
> > > On 2024/08/04 22:08, Peter Xu wrote:
> > > > On Sun, Aug 04, 2024 at 03:49:45PM +0900, Akihiko Odaki wrote:
> > > > > On 2024/08/03 1:26, Peter Xu wrote:
> > > > > > On Sat, Aug 03, 2024 at 12:54:51AM +0900, Akihiko Odaki wrote:
> > > > > > > > > > I'm not sure if I read it right.  Perhaps you meant something more generic
> > > > > > > > > > than -platform but similar?
> > > > > > > > > > 
> > > > > > > > > > For example, "-profile [PROFILE]" qemu cmdline, where PROFILE can be either
> > > > > > > > > > "perf" or "compat", while by default to "compat"?
> > > > > > > > > 
> > > > > > > > > "perf" would cover 4) and "compat" will cover 1). However neither of them
> > > > > > > > > will cover 2) because an enum is not enough to know about all hosts. I
> > > > > > > > > presented a design that will cover 2) in:
> > > > > > > > > https://lore.kernel.org/r/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com
> > > > > > > > 
> > > > > > > > "-merge-platform" shouldn't be a QEMU parameter, but should be something
> > > > > > > > separate.
> > > > > > > 
> > > > > > > Do you mean merging platform dumps should be done with another command? I
> > > > > > > think we will want to know the QOM tree is in use when implementing
> > > > > > > -merge-platform. For example, you cannot define a "platform" when e.g., you
> > > > > > > don't know what netdev backend (e.g., user, vhost-net, vhost-vdpa) is
> > > > > > > connected to virtio-net devices. Of course we can include those information
> > > > > > > in dumps, but we don't do so for VMState.
> > > > > > 
> > > > > > What I was thinking is the generated platform dump shouldn't care about
> > > > > > what is used as backend: it should try to probe whatever is specified in
> > > > > > the qemu cmdline, and it's the user's job to make sure the exact same qemu
> > > > > > cmdline is used in other hosts to dump this information.
> > > > > > 
> > > > > > IOW, the dump will only contain the information that was based on the qemu
> > > > > > cmdline.  E.g., if it doesn't include virtio device at all, and if we only
> > > > > > support such dump for virtio, it should dump nothing.
> > > > > > 
> > > > > > Then the -merge-platform will expect all dumps to look the same too,
> > > > > > merging them with AND on each field.
> > > > > 
> > > > > I think we will still need the QOM tree in that case. I think the platform
> > > > > information will look somewhat similar to VMState, which requires the QOM
> > > > > tree to interpret.
> > > > 
> > > > Ah yes, I assume you meant when multiple devices can report different thing
> > > > even if with the same frontend / device type.  QOM should work, or anything
> > > > that can identify a device, e.g. with id / instance_id attached along with
> > > > the device class.
> > > > 
> > > > One thing that I still don't know how it works is how it interacts with new
> > > > hosts being added.
> > > > 
> > > > This idea is based on the fact that the cluster is known before starting
> > > > any VM.  However in reality I think it can happen when VMs started with a
> > > > small cluster but then cluster extended, when the -merge-platform has been
> > > > done on the smaller set.
> > > > 
> > > > > 
> > > > > > 
> > > > > > Said that, I actually am still not clear on how / whether it should work at
> > > > > > last.  At least my previous concern (1) didn't has a good answer yet, on
> > > > > > what we do when profile collisions with qemu cmdlines.  So far I actually
> > > > > > still think it more straightforward that in migration we handshake on these
> > > > > > capabilities if possible.
> > > > > > 
> > > > > > And that's why I was thinking (where I totally agree with you on this) that
> > > > > > whether we should settle a short term plan first to be on the safe side
> > > > > > that we start with migration always being compatible, then we figure the
> > > > > > other approach.  That seems easier to me, and it's also a matter of whether
> > > > > > we want to do something for 9.1, or leaving that for 9.2 for USO*.
> > > > > 
> > > > > I suggest disabling all offload features of virtio-net with 9.2.
> > > > > 
> > > > > I want to keep things consistent so I want to disable all at once. This
> > > > > change will be very uncomfortable for us, who are implementing offload
> > > > > features, but I hope it will motivate us to implement a proper solution.
> > > > > 
> > > > > That said, it will be surely a breaking change so we should wait for 9.1
> > > > > before making such a change.
> > > > 
> > > > Personally I don't worry too much on other offload bits besides USO* so far
> > > > if we have them ON for longer time.  My wish was that they're old good
> > > > kernel features mostly supported everywhere who runs QEMU, then we're good.
> > > 
> > > Unfortunately, we cannot expect everyone runs Linux, and the offload
> > > features are provided by Linux. However, QEMU can run on other platforms,
> > > and offload features may be provided by vhost-user or vhost-vdpa.
> > 
> > I see.  I am not familiar with the status quo there, so I'll leave that to
> > you and other experts that know better on this..
> > 
> > Personally I do care more on Linux, as that's what we ship within RH..
> > 
> > > 
> > > > 
> > > > And I definitely worry about future offload features, or any feature that
> > > > may probe host like this and auto-OFF: I hope we can do them on the safe
> > > > side starting from day1.
> > > > 
> > > > So I don't know whether we should do that to USO* only or all.  But I agree
> > > > with you that'll definitely be cleaner.
> > > > 
> > > > On the details of how to turn them off properly..  Taking an example if we
> > > > want to turn off all the offload features by default (or simply we replace
> > > > that with USO-only)..
> > > > 
> > > > Upstream machine type is flexible to all kinds of kernels, so we may not
> > > > want to regress anyone using an existing machine type even on perf,
> > > > especially if we want to turn off all.
> > > > 
> > > > In that case we may need one more knob (I'm assuming this is a virtio-net
> > > > specific issue, but maybe not; using it as an example) to make sure the old
> > > > machine types perform as well, with:
> > > > 
> > > >     - x-virtio-net-offload-enforce
> > > > 
> > > >       When set, the offload features with value ON are enforced, so when
> > > >       the host doesn't support an offload feature it will fail to boot,
> > > >       showing the error that specific offload feature is not supported by the
> > > >       virtio backend.
> > > > 
> > > >       When clear, the offload features with value ON are not enforced, so
> > > >       these features can be automatically turned OFF when it's detected the
> > > >       backend doesn't support them.  This may bring best perf but has the
> > > >       risk of breaking migration.
> > > 
> > > "[PATCH v3 0/5] virtio-net: Convert feature properties to OnOffAuto" adds
> > > "x-force-features-auto" compatibility property to virtio-net for this
> > > purpose:
> > > https://lore.kernel.org/r/20240714-auto-v3-0-e27401aabab3@daynix.com
> > 
> > Ah ok.  But note that there's still a slight difference: we need to avoid
> > AUTO being an option, at all, IMHO.
> > 
> > It's about making the qemu cmdline the ABI: with AUTO it's still possible
> > the user uses AUTO on both sides, and then the ABI may not be guaranteed.
> > 
> > AUTO would be fine if: (1) the property doesn't affect guest ABI, or (2)
> > the AUTO bit will always generate the same thing on both hosts.  However
> > USO* isn't such a case.. so the AUTO option is IMHO not wanted.
> > 
> > What I mentioned above "x-virtio-net-offload-enforce" shouldn't add
> > anything new to "uso"; it still can only be ON/OFF.  However it should
> > affect "flip that to OFF automatically" or "fail the boot" behavior on
> > missing features.
> 
> My rationale for the OnOffAuto change is that "flipping ON to OFF
> automatically" is more confusing than making users specify AUTO to allow
> QEMU to turn the feature OFF. "ON" will always make the boot fail.
> 
> The ABI guarantee will be gone anyway if x-virtio-net-offload-enforce=off.
> AUTO is no different in that sense.

Hmm yes; I wish we could have device properties that the user can never
specify, but that are only set from internals.  It's just that applying a compat
property so far requires a generic device property.  Or put another way, it would
be nice if a compat property could tweak a class variable too, so that there is
no new property to introduce.

We could even add a migration blocker for x-virtio-net-offload-enforce=ON,
but again it could be too aggressive.  I think it might be better to bet that
nobody will even know the parameter exists, so it won't be used in manual
setups.  OTOH, with "guest_uso4" it is too easy to find that there's an AUTO
option: I normally use ",guest_uso4=?" to just dump the possible values.

Thanks,
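
For illustration, a minimal sketch of the behavior split described above,
assuming a hypothetical "offload_enforce" bool on VirtIONet wired to the
proposed "x-virtio-net-offload-enforce" property (not merged code), with the
usual qemu/osdep.h and qapi/error.h includes; peer_has_uso() and
virtio_clear_feature() are the helpers from this series:

    /* Sketch only -- "n->offload_enforce" is hypothetical, wired to the
     * proposed "x-virtio-net-offload-enforce" property. */
    static uint64_t virtio_net_resolve_uso(VirtIONet *n, uint64_t features,
                                           Error **errp)
    {
        if (peer_has_uso(n)) {
            return features;        /* backend supports USO: keep the bits */
        }

        if (n->offload_enforce) {
            /* Enforced: fail the boot instead of silently changing the
             * guest-visible feature set. */
            error_setg(errp, "virtio-net: USO is on but the backend "
                       "does not support it");
        } else {
            /* Legacy: flip the bits to OFF automatically; best perf on
             * old hosts, but risks breaking migration if the two sides
             * resolve differently. */
            virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO);
            virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4);
            virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6);
        }
        return features;
    }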
Peter Xu Aug. 8, 2024, 2:15 p.m. UTC | #89
On Thu, Aug 08, 2024 at 07:12:14AM -0400, Michael S. Tsirkin wrote:
> This is too big of a hammer. People already use what you call "cross
> migrate" and have for years. We are not going to stop developing
> features just because someone suddenly became aware of some such bit.
> If you care, you will have to work to solve the problem properly -
> nacking half baked hacks is the only tool maintainers have to make
> people work on hard problems.

IMHO this is totally different thing.  It's not about proposing a new
feature yet so far, it's about how we should fix a breakage first.

And that's why I think we should fix it even in the simple way first, then
we consider anything more beneficial from perf side without breaking
anything, which should be on top of that.

Thanks,
Peter Xu Aug. 8, 2024, 2:21 p.m. UTC | #90
On Thu, Aug 08, 2024 at 08:32:58PM +0900, Akihiko Odaki wrote:
> On 2024/08/08 20:12, Michael S. Tsirkin wrote:
> > On Thu, Aug 08, 2024 at 08:03:25PM +0900, Akihiko Odaki wrote:
> > > On 2024/08/08 19:54, Michael S. Tsirkin wrote:
> > > > On Thu, Aug 08, 2024 at 07:52:37PM +0900, Akihiko Odaki wrote:
> > > > > On 2024/08/06 22:29, Michael S. Tsirkin wrote:
> > > > > > On Tue, Aug 06, 2024 at 04:35:44PM +0900, Akihiko Odaki wrote:
> > > > > > > On 2024/08/05 19:08, Michael S. Tsirkin wrote:
> > > > > > > > On Mon, Aug 05, 2024 at 06:37:58PM +0900, Akihiko Odaki wrote:
> > > > > > > > > If cross-migrate=off, QEMU can still migrate on the same host (checkpoint
> > > > > > > > > and restart). QEMU can also migrate across hosts if the user ensures they
> > > > > > > > > are on the same platform.
> > > > > > > > 
> > > > > > > > What is so special about checkpoint/restart? I guess we hope that
> > > > > > > > downgrades are uncommon, but they are possible...
> > > > > > > 
> > > > > > > Downgrades will not work with cross-migrate=off. Users who want downgrades
> > > > > > > should use cross-migrate=on.
> > > > > > 
> > > > > > We also don't know that upgrades do not disable a feature:
> > > > > > that can happen if e.g. there's a serious bug in the feature.
> > > > > > Basically, this makes the feature too fragile, in my opinion.
> > > > > 
> > > > > We can do nothing in such a case. Whether it is on a single host or multiple
> > > > > hosts, we cannot support migration if features once enabled disappear.
> > > > > 
> > > > > Regards,
> > > > > Akihiko Odaki
> > > > 
> > > > It does not follow that we have to do something, and this is something,
> > > > therefore we have to do this.
> > > > 
> > > > This is just a reason not to handle checkpoint/restart any different
> > > > than any other migration.
> > > 
> > > Whether it is checkpoint/restart or any other migration, I expect platform
> > > features won't disappear from the host(s); we can't readily support
> > > migration in such a situation.
> > 
> > 
> > We can if we mask the features from the guest before starting the VM.
> > 
> > Or if we didn't, we can fail gracefully.
> > 
> > > When platform features won't disappear, for checkpoint/restart, we can
> > > enable all available features without disrupting migration;
> > > cross-migrate=off will instruct that.
> > > 
> > > However, if we are migrating a VM across hosts and the user doesn't ensure
> > > they are on the same platform, we cannot enable platform features even if we
> > > are sure that platform features already present on a host won't disappear
> > > because some hosts may not have features in the first place. We can set
> > > cross-migrate=on in such a case to disable optional platform features.
> > > 
> > > Regards,
> > > Akihiko Odaki
> > 
> > 
> > This is too big of a hammer. People already use what you call "cross
> > migrate" and have for years. We are not going to stop developing
> > features just because someone suddenly became aware of some such bit.
> > If you care, you will have to work to solve the problem properly -
> > nacking half baked hacks is the only tool maintainers have to make
> > people work on hard problems.
> 
> I think you meant cross-migrate=off, which is the current behavior.
> 
> I am not suggesting forcing cross-migrate=on or even making it the default. I
> have shown four possible scenarios earlier[a]:
> 
> 1) Migration everywhere
> 2) Migration on specific machines
> 3) Migration on some known platforms
> 4) No migration (migration on nowhere)
> 
> Taking the discussion with Peter into account, I amend 4) as follows:
> 4*) Migration on one platform (checkpoint/restore)

Maybe we can avoid calling out "checkpoint/restore", and instead say something
like "migration on identical hosts".

AFAIU that's what we do with many arm64 systems on the vcpu models with KVM
(IIRC it's still about using "virt" machines), where we mostly
require the identical bare-metal host, or weird things can happen when
migration happens.

> 
> cross-migrate=on is a complete solution for 1).
> 2) is dealt with by another proposal of mine.[b]
> 3) can be solved with the -platform proposal by Daniel.[c]
> 4*) is what QEMU currently implements.
> 
> [a]
> https://lore.kernel.org/all/39a8bb8b-4191-4f41-aaf7-06df24bf3280@daynix.com/
> [b]
> https://lore.kernel.org/all/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com/
> [c] https://lore.kernel.org/all/ZqO7cR-UiGpX2rk0@redhat.com/
> 
> Regards,
> Akihiko Odaki
> 

Thanks,
Michael S. Tsirkin Aug. 8, 2024, 2:45 p.m. UTC | #91
On Thu, Aug 08, 2024 at 09:55:49AM -0400, Peter Xu wrote:
> On Thu, Aug 08, 2024 at 08:43:22PM +0900, Akihiko Odaki wrote:
> > On 2024/08/07 5:41, Peter Xu wrote:
> > > On Mon, Aug 05, 2024 at 04:27:43PM +0900, Akihiko Odaki wrote:
> > > > On 2024/08/04 22:08, Peter Xu wrote:
> > > > > On Sun, Aug 04, 2024 at 03:49:45PM +0900, Akihiko Odaki wrote:
> > > > > > On 2024/08/03 1:26, Peter Xu wrote:
> > > > > > > On Sat, Aug 03, 2024 at 12:54:51AM +0900, Akihiko Odaki wrote:
> > > > > > > > > > > I'm not sure if I read it right.  Perhaps you meant something more generic
> > > > > > > > > > > than -platform but similar?
> > > > > > > > > > > 
> > > > > > > > > > > For example, "-profile [PROFILE]" qemu cmdline, where PROFILE can be either
> > > > > > > > > > > "perf" or "compat", while by default to "compat"?
> > > > > > > > > > 
> > > > > > > > > > "perf" would cover 4) and "compat" will cover 1). However neither of them
> > > > > > > > > > will cover 2) because an enum is not enough to know about all hosts. I
> > > > > > > > > > presented a design that will cover 2) in:
> > > > > > > > > > https://lore.kernel.org/r/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com
> > > > > > > > > 
> > > > > > > > > "-merge-platform" shouldn't be a QEMU parameter, but should be something
> > > > > > > > > separate.
> > > > > > > > 
> > > > > > > > Do you mean merging platform dumps should be done with another command? I
> > > > > > > > think we will want to know the QOM tree is in use when implementing
> > > > > > > > -merge-platform. For example, you cannot define a "platform" when e.g., you
> > > > > > > > don't know what netdev backend (e.g., user, vhost-net, vhost-vdpa) is
> > > > > > > > connected to virtio-net devices. Of course we can include those information
> > > > > > > > in dumps, but we don't do so for VMState.
> > > > > > > 
> > > > > > > What I was thinking is the generated platform dump shouldn't care about
> > > > > > > what is used as backend: it should try to probe whatever is specified in
> > > > > > > the qemu cmdline, and it's the user's job to make sure the exact same qemu
> > > > > > > cmdline is used in other hosts to dump this information.
> > > > > > > 
> > > > > > > IOW, the dump will only contain the information that was based on the qemu
> > > > > > > cmdline.  E.g., if it doesn't include virtio device at all, and if we only
> > > > > > > support such dump for virtio, it should dump nothing.
> > > > > > > 
> > > > > > > Then the -merge-platform will expect all dumps to look the same too,
> > > > > > > merging them with AND on each field.
> > > > > > 
> > > > > > I think we will still need the QOM tree in that case. I think the platform
> > > > > > information will look somewhat similar to VMState, which requires the QOM
> > > > > > tree to interpret.
> > > > > 
> > > > > Ah yes, I assume you meant when multiple devices can report different things
> > > > > even if with the same frontend / device type.  QOM should work, or anything
> > > > > that can identify a device, e.g. with id / instance_id attached along with
> > > > > the device class.
> > > > > 
> > > > > One thing that I still don't know how it works is how it interacts with new
> > > > > hosts being added.
> > > > > 
> > > > > This idea is based on the fact that the cluster is known before starting
> > > > > any VM.  However in reality I think it can happen that VMs start with a
> > > > > small cluster which is then extended, after the -merge-platform has been
> > > > > done on the smaller set.
> > > > > 
> > > > > > 
> > > > > > > 
> > > > > > > Said that, I actually am still not clear on how / whether it should work at
> > > > > > > last.  At least my previous concern (1) didn't have a good answer yet, on
> > > > > > > what we do when profile collisions with qemu cmdlines.  So far I actually
> > > > > > > still think it more straightforward that in migration we handshake on these
> > > > > > > capabilities if possible.
> > > > > > > 
> > > > > > > And that's why I was thinking (where I totally agree with you on this) about
> > > > > > > whether we should settle a short term plan first to be on the safe side
> > > > > > > that we start with migration always being compatible, then we figure the
> > > > > > > other approach.  That seems easier to me, and it's also a matter of whether
> > > > > > > we want to do something for 9.1, or leaving that for 9.2 for USO*.
> > > > > > 
> > > > > > I suggest disabling all offload features of virtio-net with 9.2.
> > > > > > 
> > > > > > I want to keep things consistent so I want to disable all at once. This
> > > > > > change will be very uncomfortable for us, who are implementing offload
> > > > > > features, but I hope it will motivate us to implement a proper solution.
> > > > > > 
> > > > > > That said, it will be surely a breaking change so we should wait for 9.1
> > > > > > before making such a change.
> > > > > 
> > > > > Personally I don't worry too much about the other offload bits besides USO* so far
> > > > > if we have had them ON for a longer time.  My wish was that they're good old
> > > > > kernel features mostly supported everywhere QEMU runs, then we're good.
> > > > 
> > > > Unfortunately, we cannot expect that everyone runs Linux, and the offload
> > > > features are provided by Linux. However, QEMU can run on other platforms,
> > > > and offload features may be provided by vhost-user or vhost-vdpa.
> > > 
> > > I see.  I am not familiar with the status quo there, so I'll leave that to
> > > you and other experts that know better on this..
> > > 
> > > Personally I do care more on Linux, as that's what we ship within RH..
> > > 
> > > > 
> > > > > 
> > > > > And I definitely worry about future offload features, or any feature that
> > > > > may probe host like this and auto-OFF: I hope we can do them on the safe
> > > > > side starting from day1.
> > > > > 
> > > > > So I don't know whether we should do that to USO* only or all.  But I agree
> > > > > with you that'll definitely be cleaner.
> > > > > 
> > > > > On the details of how to turn them off properly..  Taking an example if we
> > > > > want to turn off all the offload features by default (or simply we replace
> > > > > that with USO-only)..
> > > > > 
> > > > > Upstream machine type is flexible to all kinds of kernels, so we may not
> > > > > want to regress anyone using an existing machine type even on perf,
> > > > > especially if we want to turn off all.
> > > > > 
> > > > > In that case we may need one more knob (I'm assuming this is a virtio-net
> > > > > specific issue, but maybe not; using it as an example) to make sure the old
> > > > > machine types perform as well, with:
> > > > > 
> > > > >     - x-virtio-net-offload-enforce
> > > > > 
> > > > >       When set, the offload features with value ON are enforced, so when
> > > > >       the host doesn't support an offload feature it will fail to boot,
> > > > >       showing the error that specific offload feature is not supported by the
> > > > >       virtio backend.
> > > > > 
> > > > >       When clear, the offload features with value ON are not enforced, so
> > > > >       these features can be automatically turned OFF when it's detected the
> > > > >       backend doesn't support them.  This may bring best perf but has the
> > > > >       risk of breaking migration.
> > > > 
> > > > "[PATCH v3 0/5] virtio-net: Convert feature properties to OnOffAuto" adds
> > > > "x-force-features-auto" compatibility property to virtio-net for this
> > > > purpose:
> > > > https://lore.kernel.org/r/20240714-auto-v3-0-e27401aabab3@daynix.com
> > > 
> > > Ah ok.  But note that there's still a slight difference: we need to avoid
> > > AUTO being an option, at all, IMHO.
> > > 
> > > It's about making the qemu cmdline the ABI: with AUTO it's still possible
> > > the user uses AUTO on both sides, and then the ABI may not be guaranteed.
> > > 
> > > AUTO would be fine if: (1) the property doesn't affect guest ABI, or (2)
> > > the AUTO bit will always generate the same thing on both hosts.  However
> > > USO* isn't such a case.. so the AUTO option is IMHO not wanted.
> > > 
> > > What I mentioned above "x-virtio-net-offload-enforce" shouldn't add
> > > anything new to "uso"; it still can only be ON/OFF.  However it should
> > > affect "flip that to OFF automatically" or "fail the boot" behavior on
> > > missing features.
> > 
> > My rationale for the OnOffAuto change is that "flipping ON to OFF
> > automatically" is more confusing than making users specify AUTO to allow
>> QEMU to turn the feature OFF. "ON" will always make the boot fail.
> > 
> > The ABI guarantee will be gone anyway if x-virtio-net-offload-enforce=off.
> > AUTO is no different in that sense.
> 
> Hmm yes; I wish we could have device properties that the user can never
> specify, but that are only set from internals.


I think prefixing with "x-" is good enough.

>  It's just that applying a compat
> property so far requires a generic device property.  Or put another way, it would
> be nice if a compat property could tweak a class variable too, so that there is
> no new property to introduce.
> 
> We could even add a migration blocker for x-virtio-net-offload-enforce=ON,
> but again it could be too aggressive.  I think it might be better to bet that
> nobody will even know the parameter exists, so it won't be used in manual
> setups.  OTOH, with "guest_uso4" it is too easy to find that there's an AUTO
> option: I normally use ",guest_uso4=?" to just dump the possible values.
> 
> Thanks,
> 
> -- 
> Peter Xu
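
For reference, a sketch of how such an "x-"-prefixed knob could be flipped for
old machine types through the compat-property machinery, modeled on the
hw_compat_8_0 entries in this patch; the array and property names below are
purely illustrative, not merged code:

    /* Illustrative only: a later hw_compat_N array could keep the old
     * auto-OFF behavior for pre-existing machine types via a hidden
     * "x-" property, applied through the compat-property machinery. */
    GlobalProperty hw_compat_example[] = {
        { TYPE_VIRTIO_NET, "x-virtio-net-offload-enforce", "off" },
    };
    const size_t hw_compat_example_len = G_N_ELEMENTS(hw_compat_example);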
Michael S. Tsirkin Aug. 8, 2024, 2:47 p.m. UTC | #92
On Thu, Aug 08, 2024 at 10:15:36AM -0400, Peter Xu wrote:
> On Thu, Aug 08, 2024 at 07:12:14AM -0400, Michael S. Tsirkin wrote:
> > This is too big of a hammer. People already use what you call "cross
> > migrate" and have for years. We are not going to stop developing
> > features just because someone suddenly became aware of some such bit.
> > If you care, you will have to work to solve the problem properly -
> > nacking half baked hacks is the only tool maintainers have to make
> > people work on hard problems.
> 
> IMHO this is totally different thing.  It's not about proposing a new
> feature yet so far, it's about how we should fix a breakage first.
> 
> And that's why I think we should fix it even in the simple way first, then
> we consider anything more beneficial from perf side without breaking
> anything, which should be on top of that.
> 
> Thanks,

As I said, once the quick hack is merged people stop caring.
Mixing different kernel versions in migration is esoteric enough for
this not to matter to most people. There's no rush I think, address
it properly.
Peter Xu Aug. 8, 2024, 3:25 p.m. UTC | #93
On Thu, Aug 08, 2024 at 10:47:28AM -0400, Michael S. Tsirkin wrote:
> On Thu, Aug 08, 2024 at 10:15:36AM -0400, Peter Xu wrote:
> > On Thu, Aug 08, 2024 at 07:12:14AM -0400, Michael S. Tsirkin wrote:
> > > This is too big of a hammer. People already use what you call "cross
> > > migrate" and have for years. We are not going to stop developing
> > > features just because someone suddenly became aware of some such bit.
> > > If you care, you will have to work to solve the problem properly -
> > > nacking half baked hacks is the only tool maintainers have to make
> > > people work on hard problems.
> > 
> > IMHO this is totally different thing.  It's not about proposing a new
> > feature yet so far, it's about how we should fix a breakage first.
> > 
> > And that's why I think we should fix it even in the simple way first, then
> > we consider anything more beneficial from perf side without breaking
> > anything, which should be on top of that.
> > 
> > Thanks,
> 
> As I said, once the quick hack is merged people stop caring.

IMHO it's not a hack. It's a proper fix to me to disable it by default for
now.

OTOH, having it ON always even knowing it can break migration is a hack to
me, when we don't have anything else to guard the migration.

> Mixing different kernel versions in migration is esoteric enough for
> this not to matter to most people. There's no rush I think, address
> it properly.

Exactly which kernel versions get mixed will be tricky for users to identify, but
that's, AFAICT, exactly what's happening everywhere.  We can't urge users to always
use the exact same kernels when we're talking about a VM cluster.  That's
why I think allowing migration to work across those kernels matters.

I will agree there's no rush iff the RHEL9 kernel won't backport TAP at all;
otherwise this will trigger between y-streams after people upgrade part
of their clusters.

Thanks,
Akihiko Odaki Aug. 9, 2024, 10:28 a.m. UTC | #94
On 2024/08/08 22:55, Peter Xu wrote:
> On Thu, Aug 08, 2024 at 08:43:22PM +0900, Akihiko Odaki wrote:
>> On 2024/08/07 5:41, Peter Xu wrote:
>>> On Mon, Aug 05, 2024 at 04:27:43PM +0900, Akihiko Odaki wrote:
>>>> On 2024/08/04 22:08, Peter Xu wrote:
>>>>> On Sun, Aug 04, 2024 at 03:49:45PM +0900, Akihiko Odaki wrote:
>>>>>> On 2024/08/03 1:26, Peter Xu wrote:
>>>>>>> On Sat, Aug 03, 2024 at 12:54:51AM +0900, Akihiko Odaki wrote:
>>>>>>>>>>> I'm not sure if I read it right.  Perhaps you meant something more generic
>>>>>>>>>>> than -platform but similar?
>>>>>>>>>>>
>>>>>>>>>>> For example, "-profile [PROFILE]" qemu cmdline, where PROFILE can be either
>>>>>>>>>>> "perf" or "compat", while by default to "compat"?
>>>>>>>>>>
>>>>>>>>>> "perf" would cover 4) and "compat" will cover 1). However neither of them
>>>>>>>>>> will cover 2) because an enum is not enough to know about all hosts. I
>>>>>>>>>> presented a design that will cover 2) in:
>>>>>>>>>> https://lore.kernel.org/r/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com
>>>>>>>>>
>>>>>>>>> "-merge-platform" shouldn't be a QEMU parameter, but should be something
>>>>>>>>> separate.
>>>>>>>>
>>>>>>>> Do you mean merging platform dumps should be done with another command? I
>>>>>>>> think we will want to know the QOM tree is in use when implementing
>>>>>>>> -merge-platform. For example, you cannot define a "platform" when e.g., you
>>>>>>>> don't know what netdev backend (e.g., user, vhost-net, vhost-vdpa) is
>>>>>>>> connected to virtio-net devices. Of course we can include those information
>>>>>>>> in dumps, but we don't do so for VMState.
>>>>>>>
>>>>>>> What I was thinking is the generated platform dump shouldn't care about
>>>>>>> what is used as backend: it should try to probe whatever is specified in
>>>>>>> the qemu cmdline, and it's the user's job to make sure the exact same qemu
>>>>>>> cmdline is used in other hosts to dump this information.
>>>>>>>
>>>>>>> IOW, the dump will only contain the information that was based on the qemu
>>>>>>> cmdline.  E.g., if it doesn't include virtio device at all, and if we only
>>>>>>> support such dump for virtio, it should dump nothing.
>>>>>>>
>>>>>>> Then the -merge-platform will expect all dumps to look the same too,
>>>>>>> merging them with AND on each field.
>>>>>>
>>>>>> I think we will still need the QOM tree in that case. I think the platform
>>>>>> information will look somewhat similar to VMState, which requires the QOM
>>>>>> tree to interpret.
>>>>>
>>>>> Ah yes, I assume you meant when multiple devices can report different things
>>>>> even if with the same frontend / device type.  QOM should work, or anything
>>>>> that can identify a device, e.g. with id / instance_id attached along with
>>>>> the device class.
>>>>>
>>>>> One thing that I still don't know how it works is how it interacts with new
>>>>> hosts being added.
>>>>>
>>>>> This idea is based on the fact that the cluster is known before starting
>>>>> any VM.  However in reality I think it can happen that VMs start with a
>>>>> small cluster which is then extended, after the -merge-platform has been
>>>>> done on the smaller set.
>>>>>
>>>>>>
>>>>>>>
>>>>>>> Said that, I actually am still not clear on how / whether it should work at
>>>>>>> last.  At least my previous concern (1) didn't have a good answer yet, on
>>>>>>> what we do when profile collisions with qemu cmdlines.  So far I actually
>>>>>>> still think it more straightforward that in migration we handshake on these
>>>>>>> capabilities if possible.
>>>>>>>
>>>>>>> And that's why I was thinking (where I totally agree with you on this) about
>>>>>>> whether we should settle a short term plan first to be on the safe side
>>>>>>> that we start with migration always being compatible, then we figure the
>>>>>>> other approach.  That seems easier to me, and it's also a matter of whether
>>>>>>> we want to do something for 9.1, or leaving that for 9.2 for USO*.
>>>>>>
>>>>>> I suggest disabling all offload features of virtio-net with 9.2.
>>>>>>
>>>>>> I want to keep things consistent so I want to disable all at once. This
>>>>>> change will be very uncomfortable for us, who are implementing offload
>>>>>> features, but I hope it will motivate us to implement a proper solution.
>>>>>>
>>>>>> That said, it will be surely a breaking change so we should wait for 9.1
>>>>>> before making such a change.
>>>>>
>>>>> Personally I don't worry too much about the other offload bits besides USO* so far
>>>>> if we have had them ON for a longer time.  My wish was that they're good old
>>>>> kernel features mostly supported everywhere QEMU runs, then we're good.
>>>>
>>>> Unfortunately, we cannot expect that everyone runs Linux, and the offload
>>>> features are provided by Linux. However, QEMU can run on other platforms,
>>>> and offload features may be provided by vhost-user or vhost-vdpa.
>>>
>>> I see.  I am not familiar with the status quo there, so I'll leave that to
>>> you and other experts that know better on this..
>>>
>>> Personally I do care more on Linux, as that's what we ship within RH..
>>>
>>>>
>>>>>
>>>>> And I definitely worry about future offload features, or any feature that
>>>>> may probe host like this and auto-OFF: I hope we can do them on the safe
>>>>> side starting from day1.
>>>>>
>>>>> So I don't know whether we should do that to USO* only or all.  But I agree
>>>>> with you that'll definitely be cleaner.
>>>>>
>>>>> On the details of how to turn them off properly..  Taking an example if we
>>>>> want to turn off all the offload features by default (or simply we replace
>>>>> that with USO-only)..
>>>>>
>>>>> Upstream machine type is flexible to all kinds of kernels, so we may not
>>>>> want to regress anyone using an existing machine type even on perf,
>>>>> especially if we want to turn off all.
>>>>>
>>>>> In that case we may need one more knob (I'm assuming this is a virtio-net
>>>>> specific issue, but maybe not; using it as an example) to make sure the old
>>>>> machine types perform as well, with:
>>>>>
>>>>>      - x-virtio-net-offload-enforce
>>>>>
>>>>>        When set, the offload features with value ON are enforced, so when
>>>>>        the host doesn't support an offload feature it will fail to boot,
>>>>>        showing the error that specific offload feature is not supported by the
>>>>>        virtio backend.
>>>>>
>>>>>        When clear, the offload features with value ON are not enforced, so
>>>>>        these features can be automatically turned OFF when it's detected the
>>>>>        backend doesn't support them.  This may bring best perf but has the
>>>>>        risk of breaking migration.
>>>>
>>>> "[PATCH v3 0/5] virtio-net: Convert feature properties to OnOffAuto" adds
>>>> "x-force-features-auto" compatibility property to virtio-net for this
>>>> purpose:
>>>> https://lore.kernel.org/r/20240714-auto-v3-0-e27401aabab3@daynix.com
>>>
>>> Ah ok.  But note that there's still a slight difference: we need to avoid
>>> AUTO being an option, at all, IMHO.
>>>
>>> It's about making the qemu cmdline the ABI: with AUTO it's still possible
>>> the user uses AUTO on both sides, and then the ABI may not be guaranteed.
>>>
>>> AUTO would be fine if: (1) the property doesn't affect guest ABI, or (2)
>>> the AUTO bit will always generate the same thing on both hosts.  However
>>> USO* isn't such a case.. so the AUTO option is IMHO not wanted.
>>>
>>> What I mentioned above "x-virtio-net-offload-enforce" shouldn't add
>>> anything new to "uso"; it still can only be ON/OFF.  However it should
>>> affect "flip that to OFF automatically" or "fail the boot" behavior on
>>> missing features.
>>
>> My rationale for the OnOffAuto change is that "flipping ON to OFF
>> automatically" is more confusing than making users specify AUTO to allow
>> QEMU to turn the feature OFF. "ON" will always make the boot fail.
>>
>> The ABI guarantee will be gone anyway if x-virtio-net-offload-enforce=off.
>> AUTO is no different in that sense.
> 
> Hmm yes; I wish we could have device properties that the user can never
> specify, but that are only set from internals.  It's just that applying a compat
> property so far requires a generic device property.  Or put another way, it would
> be nice if a compat property could tweak a class variable too, so that there is
> no new property to introduce.
> 
> We could even add a migration blocker for x-virtio-net-offload-enforce=ON,
> but again it could be too aggressive.  I think it might be better to bet that
> nobody will even know the parameter exists, so it won't be used in manual
> setups.  OTOH, with "guest_uso4" it is too easy to find that there's an AUTO
> option: I normally use ",guest_uso4=?" to just dump the possible values.

We can detect and reject AUTO when cross-migrate=on if desired, but I'm 
not sure it's worthwhile.

Regards,
Akihiko Odaki
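
A minimal sketch of what such a rejection could look like, assuming the
OnOffAuto property from the v3 series and a hypothetical cross_migrate flag
(neither is merged code; error_setg() is QEMU's usual error helper):

    /* Hypothetical check: refuse AUTO-valued feature properties when the
     * user asked for cross-host migratability. */
    static bool validate_uso_props(OnOffAuto guest_uso4, bool cross_migrate,
                                   Error **errp)
    {
        if (cross_migrate && guest_uso4 == ON_OFF_AUTO_AUTO) {
            error_setg(errp,
                       "guest_uso4=auto is not allowed with cross-migrate=on");
            return false;
        }
        return true;
    }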
Fabiano Rosas Aug. 9, 2024, 12:50 p.m. UTC | #95
Peter Xu <peterx@redhat.com> writes:

> On Thu, Aug 08, 2024 at 10:47:28AM -0400, Michael S. Tsirkin wrote:
>> On Thu, Aug 08, 2024 at 10:15:36AM -0400, Peter Xu wrote:
>> > On Thu, Aug 08, 2024 at 07:12:14AM -0400, Michael S. Tsirkin wrote:
>> > > This is too big of a hammer. People already use what you call "cross
>> > > migrate" and have for years. We are not going to stop developing
>> > > features just because someone suddenly became aware of some such bit.
>> > > If you care, you will have to work to solve the problem properly -
>> > > nacking half baked hacks is the only tool maintainers have to make
>> > > people work on hard problems.
>> > 
>> > IMHO this is totally different thing.  It's not about proposing a new
>> > feature yet so far, it's about how we should fix a breakage first.
>> > 
>> > And that's why I think we should fix it even in the simple way first, then
>> > we consider anything more beneficial from perf side without breaking
>> > anything, which should be on top of that.
>> > 
>> > Thanks,
>> 
>> As I said, once the quick hack is merged people stop caring.
>
> IMHO it's not a hack. It's a proper fix to me to disable it by default for
> now.
>
> OTOH, having it ON always even knowing it can break migration is a hack to
> me, when we don't have anything else to guard the migration.
>
>> Mixing different kernel versions in migration is esoteric enough for
>> this not to matter to most people. There's no rush I think, address
>> it properly.
>
> Exactly which kernel versions get mixed will be tricky for users to identify, but
> that's, AFAICT, exactly what's happening everywhere.  We can't urge users to always
> use the exact same kernels when we're talking about a VM cluster.  That's
> why I think allowing migration to work across those kernels matters.

I also worry a bit about the scenario where the cluster changes slightly
and now all VMs are already restricted by some option that requires the
exact same kernel. Specifically, kernel changes in a cloud environment
also happen due to factors completely unrelated to migration. I'm not
sure the people managing the infra (who care about migration) will be
gating kernel changes just because QEMU has been configured in a
specific manner.
Michael S. Tsirkin Aug. 11, 2024, 7:35 a.m. UTC | #96
On Thu, Aug 08, 2024 at 11:25:29AM -0400, Peter Xu wrote:
> On Thu, Aug 08, 2024 at 10:47:28AM -0400, Michael S. Tsirkin wrote:
> > On Thu, Aug 08, 2024 at 10:15:36AM -0400, Peter Xu wrote:
> > > On Thu, Aug 08, 2024 at 07:12:14AM -0400, Michael S. Tsirkin wrote:
> > > > This is too big of a hammer. People already use what you call "cross
> > > > migrate" and have for years. We are not going to stop developing
> > > > features just because someone suddenly became aware of some such bit.
> > > > If you care, you will have to work to solve the problem properly -
> > > > nacking half baked hacks is the only tool maintainers have to make
> > > > people work on hard problems.
> > > 
> > > IMHO this is totally different thing.  It's not about proposing a new
> > > feature yet so far, it's about how we should fix a breakage first.
> > > 
> > > And that's why I think we should fix it even in the simple way first, then
> > > we consider anything more beneficial from perf side without breaking
> > > anything, which should be on top of that.
> > > 
> > > Thanks,
> > 
> > As I said, once the quick hack is merged people stop caring.
> 
> IMHO it's not a hack. It's a proper fix to me to disable it by default for
> now.
> 
> OTOH, having it ON always even knowing it can break migration is a hack to
> me, when we don't have anything else to guard the migration.

It's a hack in the sense that it's specific to this option.
But hack or not, it's the only way I have to make people work on
a full solution.

> > Mixing different kernel versions in migration is esoteric enough for
> > this not to matter to most people. There's no rush I think, address
> > it properly.
> 
> Exactly which kernel versions get mixed will be tricky for users to identify, but
> that's, AFAICT, exactly what's happening everywhere.  We can't urge users to always
> use the exact same kernels when we're talking about a VM cluster.  That's
> why I think allowing migration to work across those kernels matters.
> 
> I will agree there's no rush iff the RHEL9 kernel won't backport TAP at all;
> otherwise this will trigger between y-streams after people upgrade part
> of their clusters.
> 
> Thanks,
> 
> -- 
> Peter Xu
Akihiko Odaki Aug. 18, 2024, 5:04 a.m. UTC | #97
On 2024/08/09 21:50, Fabiano Rosas wrote:
> Peter Xu <peterx@redhat.com> writes:
> 
>> On Thu, Aug 08, 2024 at 10:47:28AM -0400, Michael S. Tsirkin wrote:
>>> On Thu, Aug 08, 2024 at 10:15:36AM -0400, Peter Xu wrote:
>>>> On Thu, Aug 08, 2024 at 07:12:14AM -0400, Michael S. Tsirkin wrote:
>>>>> This is too big of a hammer. People already use what you call "cross
>>>>> migrate" and have for years. We are not going to stop developing
>>>>> features just because someone suddenly became aware of some such bit.
>>>>> If you care, you will have to work to solve the problem properly -
>>>>> nacking half baked hacks is the only tool maintainers have to make
>>>>> people work on hard problems.
>>>>
>>>> IMHO this is totally different thing.  It's not about proposing a new
>>>> feature yet so far, it's about how we should fix a breakage first.
>>>>
>>>> And that's why I think we should fix it even in the simple way first, then
>>>> we consider anything more beneficial from perf side without breaking
>>>> anything, which should be on top of that.
>>>>
>>>> Thanks,
>>>
>>> As I said, once the quick hack is merged people stop caring.
>>
>> IMHO it's not a hack. It's a proper fix to me to disable it by default for
>> now.
>>
>> OTOH, having it ON always even knowing it can break migration is a hack to
>> me, when we don't have anything else to guard the migration.
>>
>>> Mixing different kernel versions in migration is esoteric enough for
>>> this not to matter to most people. There's no rush I think, address
>>> it properly.
>>
>> Exactly which kernel versions get mixed will be tricky for users to identify, but
>> that's, AFAICT, exactly what's happening everywhere.  We can't urge users to always
>> use the exact same kernels when we're talking about a VM cluster.  That's
>> why I think allowing migration to work across those kernels matters.
> 
> I also worry a bit about the scenario where the cluster changes slightly
> and now all VMs are already restricted by some option that requires the
> exact same kernel. Specifically, kernel changes in a cloud environment
> also happen due to factors completely unrelated to migration. I'm not
> sure the people managing the infra (who care about migration) will be
> gating kernel changes just because QEMU has been configured in a
> specific manner.

I have written a bit about the expectations on the platform earlier[1], but
let me summarize them here.

1. I expect the user will not downgrade the platform of hosts after 
setting up a VM. This is essential to enable any platform feature.

2. The user is allowed to upgrade the platform of hosts gradually. This 
results in a situation with mixed platforms. The oldest platform is 
still not older than the platform the VM is set up for. This enables the 
gradual deployment strategy.

3. The user is allowed to downgrade the platform of hosts to the version
used when setting up the VM. This enables rollbacks in case of regression.

With these expectations, we can ensure migratability by a) enabling 
platform features available on all hosts when setting up the VM and b) 
saving the enabled features. This is covered with my 
-dump-platform/-merge-platform/-use-platform proposal[2].

Regards,
Akihiko Odaki

[1] 
https://lore.kernel.org/r/2b62780c-a6cb-4262-beb5-81d54c14f545@daynix.com
[2] 
https://lore.kernel.org/all/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com/
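
To make b) concrete, a toy sketch of the AND-merge described earlier in the
thread ("merging them with AND on each field"); the dump layout is invented
for illustration:

    #include <stdint.h>
    #include <stddef.h>

    typedef struct PlatformDump {
        uint64_t virtio_net_features;   /* feature bits a host can offer */
    } PlatformDump;

    /* A feature stays enabled only if every host in the cluster offers it. */
    static PlatformDump merge_platform(const PlatformDump *dumps, size_t n)
    {
        PlatformDump merged = { .virtio_net_features = UINT64_MAX };

        for (size_t i = 0; i < n; i++) {
            merged.virtio_net_features &= dumps[i].virtio_net_features;
        }
        return merged;
    }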
Akihiko Odaki Aug. 18, 2024, 5:09 a.m. UTC | #98
On 2024/08/09 0:25, Peter Xu wrote:
> On Thu, Aug 08, 2024 at 10:47:28AM -0400, Michael S. Tsirkin wrote:
>> On Thu, Aug 08, 2024 at 10:15:36AM -0400, Peter Xu wrote:
>>> On Thu, Aug 08, 2024 at 07:12:14AM -0400, Michael S. Tsirkin wrote:
>>>> This is too big of a hammer. People already use what you call "cross
>>>> migrate" and have for years. We are not going to stop developing
>>>> features just because someone suddenly became aware of some such bit.
>>>> If you care, you will have to work to solve the problem properly -
>>>> nacking half baked hacks is the only tool maintainers have to make
>>>> people work on hard problems.
>>>
>>> IMHO this is totally different thing.  It's not about proposing a new
>>> feature yet so far, it's about how we should fix a breakage first.
>>>
>>> And that's why I think we should fix it even in the simple way first, then
>>> we consider anything more beneficial from perf side without breaking
>>> anything, which should be on top of that.
>>>
>>> Thanks,
>>
>> As I said, once the quick hack is merged people stop caring.
> 
> IMHO it's not a hack. It's a proper fix to me to disable it by default for
> now.
> 
> OTOH, having it ON always even knowing it can break migration is a hack to
> me, when we don't have anything else to guard the migration.

I think neither of them is a hack; they just deal with different 
scenarios summarized in [1]. We need to apply a solution appropriate for
each scenario, or we will end up with a broken system.

Regards,
Akihiko Odaki

[1] 
https://lore.kernel.org/r/770300ac-7ed3-4aba-addb-b3f987cc6376@daynix.com/
Michael S. Tsirkin Aug. 18, 2024, 7:03 a.m. UTC | #99
On Sun, Aug 18, 2024 at 02:04:29PM +0900, Akihiko Odaki wrote:
> On 2024/08/09 21:50, Fabiano Rosas wrote:
> > Peter Xu <peterx@redhat.com> writes:
> > 
> > > On Thu, Aug 08, 2024 at 10:47:28AM -0400, Michael S. Tsirkin wrote:
> > > > On Thu, Aug 08, 2024 at 10:15:36AM -0400, Peter Xu wrote:
> > > > > On Thu, Aug 08, 2024 at 07:12:14AM -0400, Michael S. Tsirkin wrote:
> > > > > > This is too big of a hammer. People already use what you call "cross
> > > > > > migrate" and have for years. We are not going to stop developing
> > > > > > features just because someone suddenly became aware of some such bit.
> > > > > > If you care, you will have to work to solve the problem properly -
> > > > > > nacking half baked hacks is the only tool maintainers have to make
> > > > > > people work on hard problems.
> > > > > 
> > > > > IMHO this is totally different thing.  It's not about proposing a new
> > > > > feature yet so far, it's about how we should fix a breakage first.
> > > > > 
> > > > > And that's why I think we should fix it even in the simple way first, then
> > > > > we consider anything more beneficial from perf side without breaking
> > > > > anything, which should be on top of that.
> > > > > 
> > > > > Thanks,
> > > > 
> > > > As I said, once the quick hack is merged people stop caring.
> > > 
> > > IMHO it's not a hack. It's a proper fix to me to disable it by default for
> > > now.
> > > 
> > > OTOH, having it ON always even knowing it can break migration is a hack to
> > > me, when we don't have anything else to guard the migration.
> > > 
> > > > Mixing different kernel versions in migration is esoteric enough for
> > > > this not to matter to most people. There's no rush I think, address
> > > > it properly.
> > > 
> > > Exactly which kernel versions get mixed will be tricky for users to identify, but
> > > that's, AFAICT, exactly what's happening everywhere.  We can't urge users to always
> > > use the exact same kernels when we're talking about a VM cluster.  That's
> > > why I think allowing migration to work across those kernels matters.
> > 
> > I also worry a bit about the scenario where the cluster changes slightly
> > and now all VMs are already restricted by some option that requires the
> > exact same kernel. Specifically, kernel changes in a cloud environment
> > also happen due to factors completely unrelated to migration. I'm not
> > sure the people managing the infra (who care about migration) will be
> > gating kernel changes just because QEMU has been configured in a
> > specific manner.
> 
> I have written a bit about the expectations on the platform earlier[1], but let
> me summarize them here.
> 
> 1. I expect the user will not downgrade the platform of hosts after setting
> up a VM. This is essential to enable any platform feature.
> 
> 2. The user is allowed to upgrade the platform of hosts gradually. This
> results in a situation with mixed platforms. The oldest platform is still
> not older than the platform the VM is set up for. This enables the gradual
> deployment strategy.
> 
> 3. The user is allowed to downgrade the platform of hosts to the version
> used when setting up the VM. This enables rollbacks in case of regression.
> 
> With these expectations, we can ensure migratability by a) enabling platform
> features available on all hosts when setting up the VM and b) saving the
> enabled features. This is covered with my
> -dump-platform/-merge-platform/-use-platform proposal[2].

I really like [2]. Do you plan to work on it? Does anyone else?

> Regards,
> Akihiko Odaki
> 
> [1]
> https://lore.kernel.org/r/2b62780c-a6cb-4262-beb5-81d54c14f545@daynix.com
> [2]
> https://lore.kernel.org/all/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com/
Akihiko Odaki Aug. 19, 2024, 4:27 a.m. UTC | #100
On 2024/08/18 16:03, Michael S. Tsirkin wrote:
> On Sun, Aug 18, 2024 at 02:04:29PM +0900, Akihiko Odaki wrote:
>> On 2024/08/09 21:50, Fabiano Rosas wrote:
>>> Peter Xu <peterx@redhat.com> writes:
>>>
>>>> On Thu, Aug 08, 2024 at 10:47:28AM -0400, Michael S. Tsirkin wrote:
>>>>> On Thu, Aug 08, 2024 at 10:15:36AM -0400, Peter Xu wrote:
>>>>>> On Thu, Aug 08, 2024 at 07:12:14AM -0400, Michael S. Tsirkin wrote:
>>>>>>> This is too big of a hammer. People already use what you call "cross
>>>>>>> migrate" and have for years. We are not going to stop developing
>>>>>>> features just because someone suddenly became aware of some such bit.
>>>>>>> If you care, you will have to work to solve the problem properly -
>>>>>>> nacking half baked hacks is the only tool maintainers have to make
>>>>>>> people work on hard problems.
>>>>>>
>>>>>> IMHO this is totally different thing.  It's not about proposing a new
>>>>>> feature yet so far, it's about how we should fix a breakage first.
>>>>>>
>>>>>> And that's why I think we should fix it even in the simple way first, then
>>>>>> we consider anything more beneficial from perf side without breaking
>>>>>> anything, which should be on top of that.
>>>>>>
>>>>>> Thanks,
>>>>>
>>>>> As I said, once the quick hack is merged people stop caring.
>>>>
>>>> IMHO it's not a hack. It's a proper fix to me to disable it by default for
>>>> now.
>>>>
>>>> OTOH, having it ON always even knowing it can break migration is a hack to
>>>> me, when we don't have anything else to guard the migration.
>>>>
>>>>> Mixing different kernel versions in migration is esoteric enough for
>>>>> this not to matter to most people. There's no rush I think, address
>>>>> it properly.
>>>>
>>>> Exactly which kernel versions get mixed will be tricky for users to identify, but
>>>> that's, AFAICT, exactly what's happening everywhere.  We can't urge users to always
>>>> use the exact same kernels when we're talking about a VM cluster.  That's
>>>> why I think allowing migration to work across those kernels matters.
>>>
>>> I also worry a bit about the scenario where the cluster changes slightly
>>> and now all VMs are already restricted by some option that requires the
>>> exact same kernel. Specifically, kernel changes in a cloud environment
>>> also happen due to factors completely unrelated to migration. I'm not
>>> sure the people managing the infra (who care about migration) will be
>>> gating kernel changes just because QEMU has been configured in a
>>> specific manner.
>>
>> I have written a bit about the expectations on the platform earlier[1], but let
>> me summarize them here.
>>
>> 1. I expect the user will not downgrade the platform of hosts after setting
>> up a VM. This is essential to enable any platform feature.
>>
>> 2. The user is allowed to upgrade the platform of hosts gradually. This
>> results in a situation with mixed platforms. The oldest platform is still
>> not older than the platform the VM is set up for. This enables the gradual
>> deployment strategy.
>>
>> 3. The user is allowed to downgrade the platform of hosts to the version
>> used when setting up the VM. This enables rollbacks in case of regression.
>>
>> With these expectations, we can ensure migratability by a) enabling platform
>> features available on all hosts when setting up the VM and b) saving the
>> enabled features. This is covered with my
>> -dump-platform/-merge-platform/-use-platform proposal[2].
> 
> I really like [2]. Do you plan to work on it? Does anyone else?

No, but I want to move "[PATCH v3 0/5] virtio-net: Convert feature 
properties to OnOffAuto" forward:
https://patchew.org/QEMU/20240714-auto-v3-0-e27401aabab3@daynix.com/

This will clarify the existence of the "auto" semantics, which is to 
enable a platform feature based on availability. [2] will be regarded as 
a feature to improve the handling of the "auto" semantics once this 
change lands.

Regards,
Akihiko Odaki

> 
>> Regards,
>> Akihiko Odaki
>>
>> [1]
>> https://lore.kernel.org/r/2b62780c-a6cb-4262-beb5-81d54c14f545@daynix.com
>> [2]
>> https://lore.kernel.org/all/2da4ebcd-2058-49c3-a4ec-8e60536e5cbb@daynix.com/
>
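
As a reader's note on the semantics that series describes (ON fails the boot
when the backend lacks the feature, AUTO falls back to OFF, OFF always
disables), a hedged sketch, not the merged code:

    /* Names are illustrative; OnOffAuto and error_setg() are QEMU's. */
    static bool resolve_offload(OnOffAuto requested, bool backend_has,
                                const char *name, Error **errp)
    {
        switch (requested) {
        case ON_OFF_AUTO_ON:
            if (!backend_has) {
                error_setg(errp, "%s is on but the backend lacks it", name);
            }
            return backend_has;     /* ON: enforce, fail the boot otherwise */
        case ON_OFF_AUTO_AUTO:
            return backend_has;     /* AUTO: enable only when available */
        default:
            return false;           /* OFF: never enable */
        }
    }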
diff mbox series

Patch

diff --git a/hw/core/machine.c b/hw/core/machine.c
index f0d35c6401..a725e76738 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -38,10 +38,14 @@ 
 #include "exec/confidential-guest-support.h"
 #include "hw/virtio/virtio.h"
 #include "hw/virtio/virtio-pci.h"
+#include "hw/virtio/virtio-net.h"
 
 GlobalProperty hw_compat_8_0[] = {
     { "migration", "multifd-flush-after-each-section", "on"},
     { TYPE_PCI_DEVICE, "x-pcie-ari-nextfn-1", "on" },
+    { TYPE_VIRTIO_NET, "host_uso", "off"},
+    { TYPE_VIRTIO_NET, "guest_uso4", "off"},
+    { TYPE_VIRTIO_NET, "guest_uso6", "off"},
 };
 const size_t hw_compat_8_0_len = G_N_ELEMENTS(hw_compat_8_0);
 
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index d2311e7d6e..bd0ead94fe 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -659,6 +659,15 @@  static int peer_has_ufo(VirtIONet *n)
     return n->has_ufo;
 }
 
+static int peer_has_uso(VirtIONet *n)
+{
+    if (!peer_has_vnet_hdr(n)) {
+        return 0;
+    }
+
+    return qemu_has_uso(qemu_get_queue(n->nic)->peer);
+}
+
 static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs,
                                        int version_1, int hash_report)
 {
@@ -796,6 +805,10 @@  static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features,
         virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO6);
         virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ECN);
 
+        virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO);
+        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4);
+        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6);
+
         virtio_clear_feature(&features, VIRTIO_NET_F_HASH_REPORT);
     }
 
@@ -804,6 +817,12 @@  static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features,
         virtio_clear_feature(&features, VIRTIO_NET_F_HOST_UFO);
     }
 
+    if (!peer_has_uso(n)) {
+        virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO);
+        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4);
+        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6);
+    }
+
     if (!get_vhost_net(nc->peer)) {
         return features;
     }
@@ -864,14 +883,16 @@  static void virtio_net_apply_guest_offloads(VirtIONet *n)
             !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO6)));
 }
 
-static uint64_t virtio_net_guest_offloads_by_features(uint32_t features)
+static uint64_t virtio_net_guest_offloads_by_features(uint64_t features)
 {
     static const uint64_t guest_offloads_mask =
         (1ULL << VIRTIO_NET_F_GUEST_CSUM) |
         (1ULL << VIRTIO_NET_F_GUEST_TSO4) |
         (1ULL << VIRTIO_NET_F_GUEST_TSO6) |
         (1ULL << VIRTIO_NET_F_GUEST_ECN)  |
-        (1ULL << VIRTIO_NET_F_GUEST_UFO);
+        (1ULL << VIRTIO_NET_F_GUEST_UFO)  |
+        (1ULL << VIRTIO_NET_F_GUEST_USO4) |
+        (1ULL << VIRTIO_NET_F_GUEST_USO6);
 
     return guest_offloads_mask & features;
 }
@@ -3924,6 +3945,12 @@  static Property virtio_net_properties[] = {
     DEFINE_PROP_INT32("speed", VirtIONet, net_conf.speed, SPEED_UNKNOWN),
     DEFINE_PROP_STRING("duplex", VirtIONet, net_conf.duplex_str),
     DEFINE_PROP_BOOL("failover", VirtIONet, failover, false),
+    DEFINE_PROP_BIT64("guest_uso4", VirtIONet, host_features,
+                      VIRTIO_NET_F_GUEST_USO4, true),
+    DEFINE_PROP_BIT64("guest_uso6", VirtIONet, host_features,
+                      VIRTIO_NET_F_GUEST_USO6, true),
+    DEFINE_PROP_BIT64("host_uso", VirtIONet, host_features,
+                      VIRTIO_NET_F_HOST_USO, true),
     DEFINE_PROP_END_OF_LIST(),
 };