diff mbox

[PATCHv4,3/4] cpuid: disable pv eoi for 1.1 and older compat types

Message ID a395c68c2e1e3002a23f1fce165ec723b1c1561a.1346175787.git.mst@redhat.com
State New
Headers show

Commit Message

Michael S. Tsirkin Aug. 28, 2012, 5:43 p.m. UTC
In preparation for adding PV EOI support, disable PV EOI by default for
1.1 and older machine types, to avoid CPUID changing during migration.

PV EOI can still be enabled/disabled by specifying it explicitly.
    Enable for 1.1
    -M pc-1.1 -cpu kvm64,+kvm_pv_eoi
    Disable for 1.2
    -M pc-1.2 -cpu kvm64,-kvm_pv_eoi

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/Makefile.objs  |  2 +-
 hw/cpu_flags.c    | 32 ++++++++++++++++++++++++++++++++
 hw/cpu_flags.h    |  9 +++++++++
 hw/pc_piix.c      |  2 ++
 target-i386/cpu.c |  8 ++++++++
 5 files changed, 52 insertions(+), 1 deletion(-)
 create mode 100644 hw/cpu_flags.c
 create mode 100644 hw/cpu_flags.h

Comments

Eduardo Habkost Aug. 28, 2012, 7:13 p.m. UTC | #1
On Tue, Aug 28, 2012 at 08:43:52PM +0300, Michael S. Tsirkin wrote:
> In preparation for adding PV EOI support, disable PV EOI by default for
> 1.1 and older machine types, to avoid CPUID changing during migration.
> 
> PV EOI can still be enabled/disabled by specifying it explicitly.
>     Enable for 1.1
>     -M pc-1.1 -cpu kvm64,+kvm_pv_eoi
>     Disable for 1.2
>     -M pc-1.2 -cpu kvm64,-kvm_pv_eoi
> 

What about users that are already running "qemu-1.1 -M pc-1.1" on a host
kernel that supports PV EOI already? They would get PV EOI disabled when
migrating to a destination running "qemu-1.2 -M pc-1.1".

(On the other hand, people running "qemu-1.1 -M pc-1.1" on a host kernel
supporting PV EOI already have migration broken, so there's not much we
can do for them)

While we don't make the KVM feature-bit handling sane (with defaults
that are not blindly derived from the host kernel capabilities), maybe
the safest bet is to expect users to not migrate between hosts running
kernels with different KVM capabilities? (I am not sure which option is
better)


> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>  hw/Makefile.objs  |  2 +-
>  hw/cpu_flags.c    | 32 ++++++++++++++++++++++++++++++++
>  hw/cpu_flags.h    |  9 +++++++++
>  hw/pc_piix.c      |  2 ++
>  target-i386/cpu.c |  8 ++++++++
>  5 files changed, 52 insertions(+), 1 deletion(-)
>  create mode 100644 hw/cpu_flags.c
>  create mode 100644 hw/cpu_flags.h
> 
> diff --git a/hw/Makefile.objs b/hw/Makefile.objs
> index 850b87b..3f2532a 100644
> --- a/hw/Makefile.objs
> +++ b/hw/Makefile.objs
> @@ -1,5 +1,5 @@
>  hw-obj-y = usb/ ide/
> -hw-obj-y += loader.o
> +hw-obj-y += loader.o cpu_flags.o
>  hw-obj-$(CONFIG_VIRTIO) += virtio-console.o
>  hw-obj-$(CONFIG_VIRTIO_PCI) += virtio-pci.o
>  hw-obj-y += fw_cfg.o
> diff --git a/hw/cpu_flags.c b/hw/cpu_flags.c
> new file mode 100644
> index 0000000..7a633c0
> --- /dev/null
> +++ b/hw/cpu_flags.c
> @@ -0,0 +1,32 @@
> +/*
> + * CPU compatibility flags.
> + *
> + * Copyright (c) 2012 Red Hat Inc.
> + * Author: Michael S. Tsirkin.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +#include "hw/cpu_flags.h"
> +
> +static bool kvm_pv_eoi_disabled_state;
> +
> +void disable_kvm_pv_eoi(void)
> +{
> +       kvm_pv_eoi_disabled_state = true;
> +}
> +
> +bool kvm_pv_eoi_disabled(void)
> +{
> +       return kvm_pv_eoi_disabled_state;
> +}
> diff --git a/hw/cpu_flags.h b/hw/cpu_flags.h
> new file mode 100644
> index 0000000..05777b6
> --- /dev/null
> +++ b/hw/cpu_flags.h
> @@ -0,0 +1,9 @@
> +#ifndef HW_CPU_FLAGS_H
> +#define HW_CPU_FLAGS_H
> +
> +#include <stdbool.h>
> +
> +void disable_kvm_pv_eoi(void);
> +bool kvm_pv_eoi_disabled(void);
> +
> +#endif
> diff --git a/hw/pc_piix.c b/hw/pc_piix.c
> index 008d42f..bdbceda 100644
> --- a/hw/pc_piix.c
> +++ b/hw/pc_piix.c
> @@ -46,6 +46,7 @@
>  #ifdef CONFIG_XEN
>  #  include <xen/hvm/hvm_info_table.h>
>  #endif
> +#include "cpu_flags.h"
>  
>  #define MAX_IDE_BUS 2
>  
> @@ -371,6 +372,7 @@ static QEMUMachine pc_machine_v1_2 = {
>  
>  static void pc_machine_v1_1_compat(void)
>  {
> +    disable_kvm_pv_eoi();
>  }
>  
>  static void pc_init_pci_v1_1(ram_addr_t ram_size,
> diff --git a/target-i386/cpu.c b/target-i386/cpu.c
> index 120a2e3..0d02fd1 100644
> --- a/target-i386/cpu.c
> +++ b/target-i386/cpu.c
> @@ -23,6 +23,7 @@
>  
>  #include "cpu.h"
>  #include "kvm.h"
> +#include "asm/kvm_para.h"
>  
>  #include "qemu-option.h"
>  #include "qemu-config.h"
> @@ -33,6 +34,7 @@
>  #include "hyperv.h"
>  
>  #include "hw/hw.h"
> +#include "hw/cpu_flags.h"
>  
>  /* feature flags taken from "Intel Processor Identification and the CPUID
>   * Instruction" and AMD's "CPUID Specification".  In cases of disagreement
> @@ -889,6 +891,12 @@ static int cpu_x86_find_by_name(x86_def_t *x86_cpu_def, const char *cpu_model)
>  
>      plus_kvm_features = ~0; /* not supported bits will be filtered out later */
>  
> +    /* Disable PV EOI for old machine types.
> +     * Feature flags can still override. */
> +    if (kvm_pv_eoi_disabled()) {
> +        plus_kvm_features &= ~(0x1 << KVM_FEATURE_PV_EOI);
> +    }
> +
>      add_flagname_to_bitmaps("hypervisor", &plus_features,
>          &plus_ext_features, &plus_ext2_features, &plus_ext3_features,
>          &plus_kvm_features, &plus_svm_features);
> -- 
> MST
> 
>
Michael S. Tsirkin Aug. 28, 2012, 9:35 p.m. UTC | #2
On Tue, Aug 28, 2012 at 04:13:38PM -0300, Eduardo Habkost wrote:
> On Tue, Aug 28, 2012 at 08:43:52PM +0300, Michael S. Tsirkin wrote:
> > In preparation for adding PV EOI support, disable PV EOI by default for
> > 1.1 and older machine types, to avoid CPUID changing during migration.
> > 
> > PV EOI can still be enabled/disabled by specifying it explicitly.
> >     Enable for 1.1
> >     -M pc-1.1 -cpu kvm64,+kvm_pv_eoi
> >     Disable for 1.2
> >     -M pc-1.2 -cpu kvm64,-kvm_pv_eoi
> > 
> 
> What about users that are already running "qemu-1.1 -M pc-1.1" on a host
> kernel that supports PV EOI already? They would get PV EOI disabled when
> migrating to a destination running "qemu-1.2 -M pc-1.1".
> 
> (On the other hand, people running "qemu-1.1 -M pc-1.1" on a host kernel
> supporting PV EOI already have migration broken, so there's not much we
> can do for them)

Exactly.

Talked to Gleb, long term I think we should rework code to make
it forward-compatible wrt adding new MSRs:
- source gets list of MSRs to be migrated from KVM and simply sends them all
- send all MSRS in key/value format
- destination gets list of MSRs to be migrated from KVM and
  only restores the supported ones
Too late for 1.2?

> While we don't make the KVM feature-bit handling sane (with defaults
> that are not blindly derived from the host kernel capabilities), maybe
> the safest bet is to expect users to not migrate between hosts running
> kernels with different KVM capabilities? (I am not sure which option is
> better)

Sorry not sure what you talk about here. What has KVM feature-bit
handling to do with this patchset?

> 
> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > ---
> >  hw/Makefile.objs  |  2 +-
> >  hw/cpu_flags.c    | 32 ++++++++++++++++++++++++++++++++
> >  hw/cpu_flags.h    |  9 +++++++++
> >  hw/pc_piix.c      |  2 ++
> >  target-i386/cpu.c |  8 ++++++++
> >  5 files changed, 52 insertions(+), 1 deletion(-)
> >  create mode 100644 hw/cpu_flags.c
> >  create mode 100644 hw/cpu_flags.h
> > 
> > diff --git a/hw/Makefile.objs b/hw/Makefile.objs
> > index 850b87b..3f2532a 100644
> > --- a/hw/Makefile.objs
> > +++ b/hw/Makefile.objs
> > @@ -1,5 +1,5 @@
> >  hw-obj-y = usb/ ide/
> > -hw-obj-y += loader.o
> > +hw-obj-y += loader.o cpu_flags.o
> >  hw-obj-$(CONFIG_VIRTIO) += virtio-console.o
> >  hw-obj-$(CONFIG_VIRTIO_PCI) += virtio-pci.o
> >  hw-obj-y += fw_cfg.o
> > diff --git a/hw/cpu_flags.c b/hw/cpu_flags.c
> > new file mode 100644
> > index 0000000..7a633c0
> > --- /dev/null
> > +++ b/hw/cpu_flags.c
> > @@ -0,0 +1,32 @@
> > +/*
> > + * CPU compatibility flags.
> > + *
> > + * Copyright (c) 2012 Red Hat Inc.
> > + * Author: Michael S. Tsirkin.
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; either version 2 of the License, or
> > + * (at your option) any later version.
> > + *
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License along
> > + * with this program; if not, see <http://www.gnu.org/licenses/>.
> > + */
> > +#include "hw/cpu_flags.h"
> > +
> > +static bool kvm_pv_eoi_disabled_state;
> > +
> > +void disable_kvm_pv_eoi(void)
> > +{
> > +       kvm_pv_eoi_disabled_state = true;
> > +}
> > +
> > +bool kvm_pv_eoi_disabled(void)
> > +{
> > +       return kvm_pv_eoi_disabled_state;
> > +}
> > diff --git a/hw/cpu_flags.h b/hw/cpu_flags.h
> > new file mode 100644
> > index 0000000..05777b6
> > --- /dev/null
> > +++ b/hw/cpu_flags.h
> > @@ -0,0 +1,9 @@
> > +#ifndef HW_CPU_FLAGS_H
> > +#define HW_CPU_FLAGS_H
> > +
> > +#include <stdbool.h>
> > +
> > +void disable_kvm_pv_eoi(void);
> > +bool kvm_pv_eoi_disabled(void);
> > +
> > +#endif
> > diff --git a/hw/pc_piix.c b/hw/pc_piix.c
> > index 008d42f..bdbceda 100644
> > --- a/hw/pc_piix.c
> > +++ b/hw/pc_piix.c
> > @@ -46,6 +46,7 @@
> >  #ifdef CONFIG_XEN
> >  #  include <xen/hvm/hvm_info_table.h>
> >  #endif
> > +#include "cpu_flags.h"
> >  
> >  #define MAX_IDE_BUS 2
> >  
> > @@ -371,6 +372,7 @@ static QEMUMachine pc_machine_v1_2 = {
> >  
> >  static void pc_machine_v1_1_compat(void)
> >  {
> > +    disable_kvm_pv_eoi();
> >  }
> >  
> >  static void pc_init_pci_v1_1(ram_addr_t ram_size,
> > diff --git a/target-i386/cpu.c b/target-i386/cpu.c
> > index 120a2e3..0d02fd1 100644
> > --- a/target-i386/cpu.c
> > +++ b/target-i386/cpu.c
> > @@ -23,6 +23,7 @@
> >  
> >  #include "cpu.h"
> >  #include "kvm.h"
> > +#include "asm/kvm_para.h"
> >  
> >  #include "qemu-option.h"
> >  #include "qemu-config.h"
> > @@ -33,6 +34,7 @@
> >  #include "hyperv.h"
> >  
> >  #include "hw/hw.h"
> > +#include "hw/cpu_flags.h"
> >  
> >  /* feature flags taken from "Intel Processor Identification and the CPUID
> >   * Instruction" and AMD's "CPUID Specification".  In cases of disagreement
> > @@ -889,6 +891,12 @@ static int cpu_x86_find_by_name(x86_def_t *x86_cpu_def, const char *cpu_model)
> >  
> >      plus_kvm_features = ~0; /* not supported bits will be filtered out later */
> >  
> > +    /* Disable PV EOI for old machine types.
> > +     * Feature flags can still override. */
> > +    if (kvm_pv_eoi_disabled()) {
> > +        plus_kvm_features &= ~(0x1 << KVM_FEATURE_PV_EOI);
> > +    }
> > +
> >      add_flagname_to_bitmaps("hypervisor", &plus_features,
> >          &plus_ext_features, &plus_ext2_features, &plus_ext3_features,
> >          &plus_kvm_features, &plus_svm_features);
> > -- 
> > MST
> > 
> > 
> 
> -- 
> Eduardo
Eduardo Habkost Aug. 28, 2012, 10:02 p.m. UTC | #3
On Wed, Aug 29, 2012 at 12:35:28AM +0300, Michael S. Tsirkin wrote:
> On Tue, Aug 28, 2012 at 04:13:38PM -0300, Eduardo Habkost wrote:
> > On Tue, Aug 28, 2012 at 08:43:52PM +0300, Michael S. Tsirkin wrote:
> > > In preparation for adding PV EOI support, disable PV EOI by default for
> > > 1.1 and older machine types, to avoid CPUID changing during migration.
> > > 
> > > PV EOI can still be enabled/disabled by specifying it explicitly.
> > >     Enable for 1.1
> > >     -M pc-1.1 -cpu kvm64,+kvm_pv_eoi
> > >     Disable for 1.2
> > >     -M pc-1.2 -cpu kvm64,-kvm_pv_eoi
> > > 
> > 
> > What about users that are already running "qemu-1.1 -M pc-1.1" on a host
> > kernel that supports PV EOI already? They would get PV EOI disabled when
> > migrating to a destination running "qemu-1.2 -M pc-1.1".
> > 
> > (On the other hand, people running "qemu-1.1 -M pc-1.1" on a host kernel
> > supporting PV EOI already have migration broken, so there's not much we
> > can do for them)
> 
> Exactly.
> 
> Talked to Gleb, long term I think we should rework code to make
> it forward-compatible wrt adding new MSRs:
> - source gets list of MSRs to be migrated from KVM and simply sends them all
> - send all MSRS in key/value format
> - destination gets list of MSRs to be migrated from KVM and
>   only restores the supported ones

As far as I understand the migration code requirements/expectations, if
the origin is sending some data, it is because it is part of the
guest-visible machine state that must be kept while migrating. Because
of that, the destination is not allowed to drop anything it doesn't know
about. At the same time, if it's not part of guest-visible machine
state, it doesn't have to be sent by the migration origin.

On the other hand, a mode of operation that doesn't require updating
QEMU every time there's a new bit of guest-visible state to be migrated
would be nice (just like the "-cpu host" mode, that doesn't require
updating QEMU for every new CPU feature, is nice for some use cases). I
just don't know how to make work with the current migration protocol.



> Too late for 1.2?

Absolutely (in my opinion).

> 
> > While we don't make the KVM feature-bit handling sane (with defaults
> > that are not blindly derived from the host kernel capabilities), maybe
> > the safest bet is to expect users to not migrate between hosts running
> > kernels with different KVM capabilities? (I am not sure which option is
> > better)
> 
> Sorry not sure what you talk about here. What has KVM feature-bit
> handling to do with this patchset?

Everything? The whole point of this patch is to filter out the PV_EOI
KVM feature bit.


This part of the current code, specifically, is wrong:

> > >      plus_kvm_features = ~0; /* not supported bits will be filtered out later */

The QEMU-side list of KVM features should be whitelist-based, not
blacklist-based (unless the user doesn't need migration, in that case he
can use "-cpu host" and get every feature blindly enabled), because QEMU
can't know if a new feature involves guest-visible state that has to be
migrated.


> 
> > 
> > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > > ---
> > >  hw/Makefile.objs  |  2 +-
> > >  hw/cpu_flags.c    | 32 ++++++++++++++++++++++++++++++++
> > >  hw/cpu_flags.h    |  9 +++++++++
> > >  hw/pc_piix.c      |  2 ++
> > >  target-i386/cpu.c |  8 ++++++++
> > >  5 files changed, 52 insertions(+), 1 deletion(-)
> > >  create mode 100644 hw/cpu_flags.c
> > >  create mode 100644 hw/cpu_flags.h
> > > 
> > > diff --git a/hw/Makefile.objs b/hw/Makefile.objs
> > > index 850b87b..3f2532a 100644
> > > --- a/hw/Makefile.objs
> > > +++ b/hw/Makefile.objs
> > > @@ -1,5 +1,5 @@
> > >  hw-obj-y = usb/ ide/
> > > -hw-obj-y += loader.o
> > > +hw-obj-y += loader.o cpu_flags.o
> > >  hw-obj-$(CONFIG_VIRTIO) += virtio-console.o
> > >  hw-obj-$(CONFIG_VIRTIO_PCI) += virtio-pci.o
> > >  hw-obj-y += fw_cfg.o
> > > diff --git a/hw/cpu_flags.c b/hw/cpu_flags.c
> > > new file mode 100644
> > > index 0000000..7a633c0
> > > --- /dev/null
> > > +++ b/hw/cpu_flags.c
> > > @@ -0,0 +1,32 @@
> > > +/*
> > > + * CPU compatibility flags.
> > > + *
> > > + * Copyright (c) 2012 Red Hat Inc.
> > > + * Author: Michael S. Tsirkin.
> > > + *
> > > + * This program is free software; you can redistribute it and/or modify
> > > + * it under the terms of the GNU General Public License as published by
> > > + * the Free Software Foundation; either version 2 of the License, or
> > > + * (at your option) any later version.
> > > + *
> > > + * This program is distributed in the hope that it will be useful,
> > > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > > + * GNU General Public License for more details.
> > > + *
> > > + * You should have received a copy of the GNU General Public License along
> > > + * with this program; if not, see <http://www.gnu.org/licenses/>.
> > > + */
> > > +#include "hw/cpu_flags.h"
> > > +
> > > +static bool kvm_pv_eoi_disabled_state;
> > > +
> > > +void disable_kvm_pv_eoi(void)
> > > +{
> > > +       kvm_pv_eoi_disabled_state = true;
> > > +}
> > > +
> > > +bool kvm_pv_eoi_disabled(void)
> > > +{
> > > +       return kvm_pv_eoi_disabled_state;
> > > +}
> > > diff --git a/hw/cpu_flags.h b/hw/cpu_flags.h
> > > new file mode 100644
> > > index 0000000..05777b6
> > > --- /dev/null
> > > +++ b/hw/cpu_flags.h
> > > @@ -0,0 +1,9 @@
> > > +#ifndef HW_CPU_FLAGS_H
> > > +#define HW_CPU_FLAGS_H
> > > +
> > > +#include <stdbool.h>
> > > +
> > > +void disable_kvm_pv_eoi(void);
> > > +bool kvm_pv_eoi_disabled(void);
> > > +
> > > +#endif
> > > diff --git a/hw/pc_piix.c b/hw/pc_piix.c
> > > index 008d42f..bdbceda 100644
> > > --- a/hw/pc_piix.c
> > > +++ b/hw/pc_piix.c
> > > @@ -46,6 +46,7 @@
> > >  #ifdef CONFIG_XEN
> > >  #  include <xen/hvm/hvm_info_table.h>
> > >  #endif
> > > +#include "cpu_flags.h"
> > >  
> > >  #define MAX_IDE_BUS 2
> > >  
> > > @@ -371,6 +372,7 @@ static QEMUMachine pc_machine_v1_2 = {
> > >  
> > >  static void pc_machine_v1_1_compat(void)
> > >  {
> > > +    disable_kvm_pv_eoi();
> > >  }
> > >  
> > >  static void pc_init_pci_v1_1(ram_addr_t ram_size,
> > > diff --git a/target-i386/cpu.c b/target-i386/cpu.c
> > > index 120a2e3..0d02fd1 100644
> > > --- a/target-i386/cpu.c
> > > +++ b/target-i386/cpu.c
> > > @@ -23,6 +23,7 @@
> > >  
> > >  #include "cpu.h"
> > >  #include "kvm.h"
> > > +#include "asm/kvm_para.h"
> > >  
> > >  #include "qemu-option.h"
> > >  #include "qemu-config.h"
> > > @@ -33,6 +34,7 @@
> > >  #include "hyperv.h"
> > >  
> > >  #include "hw/hw.h"
> > > +#include "hw/cpu_flags.h"
> > >  
> > >  /* feature flags taken from "Intel Processor Identification and the CPUID
> > >   * Instruction" and AMD's "CPUID Specification".  In cases of disagreement
> > > @@ -889,6 +891,12 @@ static int cpu_x86_find_by_name(x86_def_t *x86_cpu_def, const char *cpu_model)
> > >  
> > >      plus_kvm_features = ~0; /* not supported bits will be filtered out later */
> > >  
> > > +    /* Disable PV EOI for old machine types.
> > > +     * Feature flags can still override. */
> > > +    if (kvm_pv_eoi_disabled()) {
> > > +        plus_kvm_features &= ~(0x1 << KVM_FEATURE_PV_EOI);
> > > +    }
> > > +
> > >      add_flagname_to_bitmaps("hypervisor", &plus_features,
> > >          &plus_ext_features, &plus_ext2_features, &plus_ext3_features,
> > >          &plus_kvm_features, &plus_svm_features);
> > > -- 
> > > MST
> > > 
> > > 
> > 
> > -- 
> > Eduardo
Michael S. Tsirkin Aug. 28, 2012, 10:21 p.m. UTC | #4
On Tue, Aug 28, 2012 at 07:02:42PM -0300, Eduardo Habkost wrote:
> On Wed, Aug 29, 2012 at 12:35:28AM +0300, Michael S. Tsirkin wrote:
> > On Tue, Aug 28, 2012 at 04:13:38PM -0300, Eduardo Habkost wrote:
> > > On Tue, Aug 28, 2012 at 08:43:52PM +0300, Michael S. Tsirkin wrote:
> > > > In preparation for adding PV EOI support, disable PV EOI by default for
> > > > 1.1 and older machine types, to avoid CPUID changing during migration.
> > > > 
> > > > PV EOI can still be enabled/disabled by specifying it explicitly.
> > > >     Enable for 1.1
> > > >     -M pc-1.1 -cpu kvm64,+kvm_pv_eoi
> > > >     Disable for 1.2
> > > >     -M pc-1.2 -cpu kvm64,-kvm_pv_eoi
> > > > 
> > > 
> > > What about users that are already running "qemu-1.1 -M pc-1.1" on a host
> > > kernel that supports PV EOI already? They would get PV EOI disabled when
> > > migrating to a destination running "qemu-1.2 -M pc-1.1".
> > > 
> > > (On the other hand, people running "qemu-1.1 -M pc-1.1" on a host kernel
> > > supporting PV EOI already have migration broken, so there's not much we
> > > can do for them)
> > 
> > Exactly.
> > 
> > Talked to Gleb, long term I think we should rework code to make
> > it forward-compatible wrt adding new MSRs:
> > - source gets list of MSRs to be migrated from KVM and simply sends them all
> > - send all MSRS in key/value format
> > - destination gets list of MSRs to be migrated from KVM and
> >   only restores the supported ones
> 
> As far as I understand the migration code requirements/expectations, if
> the origin is sending some data, it is because it is part of the
> guest-visible machine state that must be kept while migrating. Because
> of that, the destination is not allowed to drop anything it doesn't know
> about.


We have a ton of code that reads in values then just
ignores them, for compat with old qemu.
This will be exactly such a case:
we don't drop anything - protocol does not
support this. We read and simply do not tell kvm
about it. We also have tons of code that sends
useless values again for compatibility.

> At the same time, if it's not part of guest-visible machine
> state, it doesn't have to be sent by the migration origin.
> 

False, we often send internal device state which is not
directly guest visible.

> On the other hand, a mode of operation that doesn't require updating
> QEMU every time there's a new bit of guest-visible state to be migrated
> would be nice (just like the "-cpu host" mode, that doesn't require
> updating QEMU for every new CPU feature, is nice for some use cases). I
> just don't know how to make work with the current migration protocol.
> 

I don't understand. What is the problem with the proposal?
What will not work with our protocol?
Can you give an example please?


> > Too late for 1.2?
> 
> Absolutely (in my opinion).
> 
> > 
> > > While we don't make the KVM feature-bit handling sane (with defaults
> > > that are not blindly derived from the host kernel capabilities), maybe
> > > the safest bet is to expect users to not migrate between hosts running
> > > kernels with different KVM capabilities? (I am not sure which option is
> > > better)
> > 
> > Sorry not sure what you talk about here. What has KVM feature-bit
> > handling to do with this patchset?
> 
> Everything? The whole point of this patch is to filter out the PV_EOI
> KVM feature bit.
> 
> 
> This part of the current code, specifically, is wrong:
> 
> > > >      plus_kvm_features = ~0; /* not supported bits will be filtered out later */
> 
> The QEMU-side list of KVM features should be whitelist-based, not
> blacklist-based (unless the user doesn't need migration, in that case he
> can use "-cpu host" and get every feature blindly enabled), because QEMU
> can't know if a new feature involves guest-visible state that has to be
> migrated.
> 
> 
> > 
> > > 
> > > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > > > ---
> > > >  hw/Makefile.objs  |  2 +-
> > > >  hw/cpu_flags.c    | 32 ++++++++++++++++++++++++++++++++
> > > >  hw/cpu_flags.h    |  9 +++++++++
> > > >  hw/pc_piix.c      |  2 ++
> > > >  target-i386/cpu.c |  8 ++++++++
> > > >  5 files changed, 52 insertions(+), 1 deletion(-)
> > > >  create mode 100644 hw/cpu_flags.c
> > > >  create mode 100644 hw/cpu_flags.h
> > > > 
> > > > diff --git a/hw/Makefile.objs b/hw/Makefile.objs
> > > > index 850b87b..3f2532a 100644
> > > > --- a/hw/Makefile.objs
> > > > +++ b/hw/Makefile.objs
> > > > @@ -1,5 +1,5 @@
> > > >  hw-obj-y = usb/ ide/
> > > > -hw-obj-y += loader.o
> > > > +hw-obj-y += loader.o cpu_flags.o
> > > >  hw-obj-$(CONFIG_VIRTIO) += virtio-console.o
> > > >  hw-obj-$(CONFIG_VIRTIO_PCI) += virtio-pci.o
> > > >  hw-obj-y += fw_cfg.o
> > > > diff --git a/hw/cpu_flags.c b/hw/cpu_flags.c
> > > > new file mode 100644
> > > > index 0000000..7a633c0
> > > > --- /dev/null
> > > > +++ b/hw/cpu_flags.c
> > > > @@ -0,0 +1,32 @@
> > > > +/*
> > > > + * CPU compatibility flags.
> > > > + *
> > > > + * Copyright (c) 2012 Red Hat Inc.
> > > > + * Author: Michael S. Tsirkin.
> > > > + *
> > > > + * This program is free software; you can redistribute it and/or modify
> > > > + * it under the terms of the GNU General Public License as published by
> > > > + * the Free Software Foundation; either version 2 of the License, or
> > > > + * (at your option) any later version.
> > > > + *
> > > > + * This program is distributed in the hope that it will be useful,
> > > > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > > > + * GNU General Public License for more details.
> > > > + *
> > > > + * You should have received a copy of the GNU General Public License along
> > > > + * with this program; if not, see <http://www.gnu.org/licenses/>.
> > > > + */
> > > > +#include "hw/cpu_flags.h"
> > > > +
> > > > +static bool kvm_pv_eoi_disabled_state;
> > > > +
> > > > +void disable_kvm_pv_eoi(void)
> > > > +{
> > > > +       kvm_pv_eoi_disabled_state = true;
> > > > +}
> > > > +
> > > > +bool kvm_pv_eoi_disabled(void)
> > > > +{
> > > > +       return kvm_pv_eoi_disabled_state;
> > > > +}
> > > > diff --git a/hw/cpu_flags.h b/hw/cpu_flags.h
> > > > new file mode 100644
> > > > index 0000000..05777b6
> > > > --- /dev/null
> > > > +++ b/hw/cpu_flags.h
> > > > @@ -0,0 +1,9 @@
> > > > +#ifndef HW_CPU_FLAGS_H
> > > > +#define HW_CPU_FLAGS_H
> > > > +
> > > > +#include <stdbool.h>
> > > > +
> > > > +void disable_kvm_pv_eoi(void);
> > > > +bool kvm_pv_eoi_disabled(void);
> > > > +
> > > > +#endif
> > > > diff --git a/hw/pc_piix.c b/hw/pc_piix.c
> > > > index 008d42f..bdbceda 100644
> > > > --- a/hw/pc_piix.c
> > > > +++ b/hw/pc_piix.c
> > > > @@ -46,6 +46,7 @@
> > > >  #ifdef CONFIG_XEN
> > > >  #  include <xen/hvm/hvm_info_table.h>
> > > >  #endif
> > > > +#include "cpu_flags.h"
> > > >  
> > > >  #define MAX_IDE_BUS 2
> > > >  
> > > > @@ -371,6 +372,7 @@ static QEMUMachine pc_machine_v1_2 = {
> > > >  
> > > >  static void pc_machine_v1_1_compat(void)
> > > >  {
> > > > +    disable_kvm_pv_eoi();
> > > >  }
> > > >  
> > > >  static void pc_init_pci_v1_1(ram_addr_t ram_size,
> > > > diff --git a/target-i386/cpu.c b/target-i386/cpu.c
> > > > index 120a2e3..0d02fd1 100644
> > > > --- a/target-i386/cpu.c
> > > > +++ b/target-i386/cpu.c
> > > > @@ -23,6 +23,7 @@
> > > >  
> > > >  #include "cpu.h"
> > > >  #include "kvm.h"
> > > > +#include "asm/kvm_para.h"
> > > >  
> > > >  #include "qemu-option.h"
> > > >  #include "qemu-config.h"
> > > > @@ -33,6 +34,7 @@
> > > >  #include "hyperv.h"
> > > >  
> > > >  #include "hw/hw.h"
> > > > +#include "hw/cpu_flags.h"
> > > >  
> > > >  /* feature flags taken from "Intel Processor Identification and the CPUID
> > > >   * Instruction" and AMD's "CPUID Specification".  In cases of disagreement
> > > > @@ -889,6 +891,12 @@ static int cpu_x86_find_by_name(x86_def_t *x86_cpu_def, const char *cpu_model)
> > > >  
> > > >      plus_kvm_features = ~0; /* not supported bits will be filtered out later */
> > > >  
> > > > +    /* Disable PV EOI for old machine types.
> > > > +     * Feature flags can still override. */
> > > > +    if (kvm_pv_eoi_disabled()) {
> > > > +        plus_kvm_features &= ~(0x1 << KVM_FEATURE_PV_EOI);
> > > > +    }
> > > > +
> > > >      add_flagname_to_bitmaps("hypervisor", &plus_features,
> > > >          &plus_ext_features, &plus_ext2_features, &plus_ext3_features,
> > > >          &plus_kvm_features, &plus_svm_features);
> > > > -- 
> > > > MST
> > > > 
> > > > 
> > > 
> > > -- 
> > > Eduardo
> 
> -- 
> Eduardo
Michael S. Tsirkin Aug. 28, 2012, 10:25 p.m. UTC | #5
On Wed, Aug 29, 2012 at 01:21:13AM +0300, Michael S. Tsirkin wrote:
> On Tue, Aug 28, 2012 at 07:02:42PM -0300, Eduardo Habkost wrote:
> > On Wed, Aug 29, 2012 at 12:35:28AM +0300, Michael S. Tsirkin wrote:
> > > On Tue, Aug 28, 2012 at 04:13:38PM -0300, Eduardo Habkost wrote:
> > > > On Tue, Aug 28, 2012 at 08:43:52PM +0300, Michael S. Tsirkin wrote:
> > > > > In preparation for adding PV EOI support, disable PV EOI by default for
> > > > > 1.1 and older machine types, to avoid CPUID changing during migration.
> > > > > 
> > > > > PV EOI can still be enabled/disabled by specifying it explicitly.
> > > > >     Enable for 1.1
> > > > >     -M pc-1.1 -cpu kvm64,+kvm_pv_eoi
> > > > >     Disable for 1.2
> > > > >     -M pc-1.2 -cpu kvm64,-kvm_pv_eoi
> > > > > 
> > > > 
> > > > What about users that are already running "qemu-1.1 -M pc-1.1" on a host
> > > > kernel that supports PV EOI already? They would get PV EOI disabled when
> > > > migrating to a destination running "qemu-1.2 -M pc-1.1".
> > > > 
> > > > (On the other hand, people running "qemu-1.1 -M pc-1.1" on a host kernel
> > > > supporting PV EOI already have migration broken, so there's not much we
> > > > can do for them)
> > > 
> > > Exactly.
> > > 
> > > Talked to Gleb, long term I think we should rework code to make
> > > it forward-compatible wrt adding new MSRs:
> > > - source gets list of MSRs to be migrated from KVM and simply sends them all
> > > - send all MSRS in key/value format
> > > - destination gets list of MSRs to be migrated from KVM and
> > >   only restores the supported ones
> > 
> > As far as I understand the migration code requirements/expectations, if
> > the origin is sending some data, it is because it is part of the
> > guest-visible machine state that must be kept while migrating. Because
> > of that, the destination is not allowed to drop anything it doesn't know
> > about.
> 
> 
> We have a ton of code that reads in values then just
> ignores them, for compat with old qemu.
> This will be exactly such a case:
> we don't drop anything - protocol does not
> support this. We read and simply do not tell kvm
> about it. We also have tons of code that sends
> useless values again for compatibility.
> 
> > At the same time, if it's not part of guest-visible machine
> > state, it doesn't have to be sent by the migration origin.
> > 
> 
> False, we often send internal device state which is not
> directly guest visible.

Besides, all MSRs in KVM's MSR list are actually guest visible, even if
guests normally do not invoke these MSRs - they can.

> > On the other hand, a mode of operation that doesn't require updating
> > QEMU every time there's a new bit of guest-visible state to be migrated
> > would be nice (just like the "-cpu host" mode, that doesn't require
> > updating QEMU for every new CPU feature, is nice for some use cases). I
> > just don't know how to make work with the current migration protocol.
> > 
> 
> I don't understand. What is the problem with the proposal?
> What will not work with our protocol?
> Can you give an example please?
> 
> 
> > > Too late for 1.2?
> > 
> > Absolutely (in my opinion).
> > 
> > > 
> > > > While we don't make the KVM feature-bit handling sane (with defaults
> > > > that are not blindly derived from the host kernel capabilities), maybe
> > > > the safest bet is to expect users to not migrate between hosts running
> > > > kernels with different KVM capabilities? (I am not sure which option is
> > > > better)
> > > 
> > > Sorry not sure what you talk about here. What has KVM feature-bit
> > > handling to do with this patchset?
> > 
> > Everything? The whole point of this patch is to filter out the PV_EOI
> > KVM feature bit.
> > 
> > 
> > This part of the current code, specifically, is wrong:
> > 
> > > > >      plus_kvm_features = ~0; /* not supported bits will be filtered out later */
> > 
> > The QEMU-side list of KVM features should be whitelist-based, not
> > blacklist-based (unless the user doesn't need migration, in that case he
> > can use "-cpu host" and get every feature blindly enabled), because QEMU
> > can't know if a new feature involves guest-visible state that has to be
> > migrated.
> > 
> > 
> > > 
> > > > 
> > > > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > > > > ---
> > > > >  hw/Makefile.objs  |  2 +-
> > > > >  hw/cpu_flags.c    | 32 ++++++++++++++++++++++++++++++++
> > > > >  hw/cpu_flags.h    |  9 +++++++++
> > > > >  hw/pc_piix.c      |  2 ++
> > > > >  target-i386/cpu.c |  8 ++++++++
> > > > >  5 files changed, 52 insertions(+), 1 deletion(-)
> > > > >  create mode 100644 hw/cpu_flags.c
> > > > >  create mode 100644 hw/cpu_flags.h
> > > > > 
> > > > > diff --git a/hw/Makefile.objs b/hw/Makefile.objs
> > > > > index 850b87b..3f2532a 100644
> > > > > --- a/hw/Makefile.objs
> > > > > +++ b/hw/Makefile.objs
> > > > > @@ -1,5 +1,5 @@
> > > > >  hw-obj-y = usb/ ide/
> > > > > -hw-obj-y += loader.o
> > > > > +hw-obj-y += loader.o cpu_flags.o
> > > > >  hw-obj-$(CONFIG_VIRTIO) += virtio-console.o
> > > > >  hw-obj-$(CONFIG_VIRTIO_PCI) += virtio-pci.o
> > > > >  hw-obj-y += fw_cfg.o
> > > > > diff --git a/hw/cpu_flags.c b/hw/cpu_flags.c
> > > > > new file mode 100644
> > > > > index 0000000..7a633c0
> > > > > --- /dev/null
> > > > > +++ b/hw/cpu_flags.c
> > > > > @@ -0,0 +1,32 @@
> > > > > +/*
> > > > > + * CPU compatibility flags.
> > > > > + *
> > > > > + * Copyright (c) 2012 Red Hat Inc.
> > > > > + * Author: Michael S. Tsirkin.
> > > > > + *
> > > > > + * This program is free software; you can redistribute it and/or modify
> > > > > + * it under the terms of the GNU General Public License as published by
> > > > > + * the Free Software Foundation; either version 2 of the License, or
> > > > > + * (at your option) any later version.
> > > > > + *
> > > > > + * This program is distributed in the hope that it will be useful,
> > > > > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > > > > + * GNU General Public License for more details.
> > > > > + *
> > > > > + * You should have received a copy of the GNU General Public License along
> > > > > + * with this program; if not, see <http://www.gnu.org/licenses/>.
> > > > > + */
> > > > > +#include "hw/cpu_flags.h"
> > > > > +
> > > > > +static bool kvm_pv_eoi_disabled_state;
> > > > > +
> > > > > +void disable_kvm_pv_eoi(void)
> > > > > +{
> > > > > +       kvm_pv_eoi_disabled_state = true;
> > > > > +}
> > > > > +
> > > > > +bool kvm_pv_eoi_disabled(void)
> > > > > +{
> > > > > +       return kvm_pv_eoi_disabled_state;
> > > > > +}
> > > > > diff --git a/hw/cpu_flags.h b/hw/cpu_flags.h
> > > > > new file mode 100644
> > > > > index 0000000..05777b6
> > > > > --- /dev/null
> > > > > +++ b/hw/cpu_flags.h
> > > > > @@ -0,0 +1,9 @@
> > > > > +#ifndef HW_CPU_FLAGS_H
> > > > > +#define HW_CPU_FLAGS_H
> > > > > +
> > > > > +#include <stdbool.h>
> > > > > +
> > > > > +void disable_kvm_pv_eoi(void);
> > > > > +bool kvm_pv_eoi_disabled(void);
> > > > > +
> > > > > +#endif
> > > > > diff --git a/hw/pc_piix.c b/hw/pc_piix.c
> > > > > index 008d42f..bdbceda 100644
> > > > > --- a/hw/pc_piix.c
> > > > > +++ b/hw/pc_piix.c
> > > > > @@ -46,6 +46,7 @@
> > > > >  #ifdef CONFIG_XEN
> > > > >  #  include <xen/hvm/hvm_info_table.h>
> > > > >  #endif
> > > > > +#include "cpu_flags.h"
> > > > >  
> > > > >  #define MAX_IDE_BUS 2
> > > > >  
> > > > > @@ -371,6 +372,7 @@ static QEMUMachine pc_machine_v1_2 = {
> > > > >  
> > > > >  static void pc_machine_v1_1_compat(void)
> > > > >  {
> > > > > +    disable_kvm_pv_eoi();
> > > > >  }
> > > > >  
> > > > >  static void pc_init_pci_v1_1(ram_addr_t ram_size,
> > > > > diff --git a/target-i386/cpu.c b/target-i386/cpu.c
> > > > > index 120a2e3..0d02fd1 100644
> > > > > --- a/target-i386/cpu.c
> > > > > +++ b/target-i386/cpu.c
> > > > > @@ -23,6 +23,7 @@
> > > > >  
> > > > >  #include "cpu.h"
> > > > >  #include "kvm.h"
> > > > > +#include "asm/kvm_para.h"
> > > > >  
> > > > >  #include "qemu-option.h"
> > > > >  #include "qemu-config.h"
> > > > > @@ -33,6 +34,7 @@
> > > > >  #include "hyperv.h"
> > > > >  
> > > > >  #include "hw/hw.h"
> > > > > +#include "hw/cpu_flags.h"
> > > > >  
> > > > >  /* feature flags taken from "Intel Processor Identification and the CPUID
> > > > >   * Instruction" and AMD's "CPUID Specification".  In cases of disagreement
> > > > > @@ -889,6 +891,12 @@ static int cpu_x86_find_by_name(x86_def_t *x86_cpu_def, const char *cpu_model)
> > > > >  
> > > > >      plus_kvm_features = ~0; /* not supported bits will be filtered out later */
> > > > >  
> > > > > +    /* Disable PV EOI for old machine types.
> > > > > +     * Feature flags can still override. */
> > > > > +    if (kvm_pv_eoi_disabled()) {
> > > > > +        plus_kvm_features &= ~(0x1 << KVM_FEATURE_PV_EOI);
> > > > > +    }
> > > > > +
> > > > >      add_flagname_to_bitmaps("hypervisor", &plus_features,
> > > > >          &plus_ext_features, &plus_ext2_features, &plus_ext3_features,
> > > > >          &plus_kvm_features, &plus_svm_features);
> > > > > -- 
> > > > > MST
> > > > > 
> > > > > 
> > > > 
> > > > -- 
> > > > Eduardo
> > 
> > -- 
> > Eduardo
Eduardo Habkost Aug. 28, 2012, 11:50 p.m. UTC | #6
On Wed, Aug 29, 2012 at 01:25:35AM +0300, Michael S. Tsirkin wrote:
> On Wed, Aug 29, 2012 at 01:21:13AM +0300, Michael S. Tsirkin wrote:
> > On Tue, Aug 28, 2012 at 07:02:42PM -0300, Eduardo Habkost wrote:
> > > On Wed, Aug 29, 2012 at 12:35:28AM +0300, Michael S. Tsirkin wrote:
> > > > On Tue, Aug 28, 2012 at 04:13:38PM -0300, Eduardo Habkost wrote:
> > > > > On Tue, Aug 28, 2012 at 08:43:52PM +0300, Michael S. Tsirkin wrote:
> > > > > > In preparation for adding PV EOI support, disable PV EOI by default for
> > > > > > 1.1 and older machine types, to avoid CPUID changing during migration.
> > > > > > 
> > > > > > PV EOI can still be enabled/disabled by specifying it explicitly.
> > > > > >     Enable for 1.1
> > > > > >     -M pc-1.1 -cpu kvm64,+kvm_pv_eoi
> > > > > >     Disable for 1.2
> > > > > >     -M pc-1.2 -cpu kvm64,-kvm_pv_eoi
> > > > > > 
> > > > > 
> > > > > What about users that are already running "qemu-1.1 -M pc-1.1" on a host
> > > > > kernel that supports PV EOI already? They would get PV EOI disabled when
> > > > > migrating to a destination running "qemu-1.2 -M pc-1.1".
> > > > > 
> > > > > (On the other hand, people running "qemu-1.1 -M pc-1.1" on a host kernel
> > > > > supporting PV EOI already have migration broken, so there's not much we
> > > > > can do for them)
> > > > 
> > > > Exactly.
> > > > 
> > > > Talked to Gleb, long term I think we should rework code to make
> > > > it forward-compatible wrt adding new MSRs:
> > > > - source gets list of MSRs to be migrated from KVM and simply sends them all
> > > > - send all MSRS in key/value format
> > > > - destination gets list of MSRs to be migrated from KVM and
> > > >   only restores the supported ones
> > > 
> > > As far as I understand the migration code requirements/expectations, if
> > > the origin is sending some data, it is because it is part of the
> > > guest-visible machine state that must be kept while migrating. Because
> > > of that, the destination is not allowed to drop anything it doesn't know
> > > about.
> > 
> > 
> > We have a ton of code that reads in values then just
> > ignores them, for compat with old qemu.

The difference is that you ignore only the values you _know_ to be safe
to be ignored. You can't ignore a value simply because you don't know
what it means, and if it's important or not.

> > This will be exactly such a case:
> > we don't drop anything - protocol does not
> > support this. We read and simply do not tell kvm
> > about it.

This fits the definition of "dropping", to me.

> We also have tons of code that sends
> > useless values again for compatibility.

But these values should be ignored only if the receiver knows exactly
what it means, and knows exactly why/when it can be ignored.

> > 
> > > At the same time, if it's not part of guest-visible machine
> > > state, it doesn't have to be sent by the migration origin.

It may be not directly visible, but if it's part of the machine state,
it affects the guest in some way. If it didn't affect the guest in any
way, we wouldn't even have to send it while migrating, in the first
place.

> > > 
> > 
> > False, we often send internal device state which is not
> > directly guest visible.
> 
> Besides, all MSRs in KVM's MSR list are actually guest visible, even if
> guests normally do not invoke these MSRs - they can.

The difference is that the machine doesn't guarantee a MSR to be
migrated unless the corresponding feature bit reports the MSR as
supported.

e.g. using the PV EOI MSR without having the PV EOI CPUID bit set is
undefined behavior. The guest shouldn't do it.

On the other hand, if the PV EOI CPUID bit is set, the MSR should be
always kept, no matter what.

This means that we need to migrate the MSR only if the corresponding
CPUID bit is enabled.


> 
> > > On the other hand, a mode of operation that doesn't require updating
> > > QEMU every time there's a new bit of guest-visible state to be migrated
> > > would be nice (just like the "-cpu host" mode, that doesn't require
> > > updating QEMU for every new CPU feature, is nice for some use cases). I
> > > just don't know how to make work with the current migration protocol.
> > > 
> > 
> > I don't understand. What is the problem with the proposal?
> > What will not work with our protocol?
> > Can you give an example please?

Yes:

Suppose kernel 3.7 introduces KVM_FOO_MSR and CPUID_KVM_FOO.

Also, suppose QEMU 1.2 doesn't know anything about KVM_FOO, because it
was release before this feature was introduced.


Then you run "qemu-1.2 -M pc-1.2" on a 3.7 host kernel. qemu-1.2 can do
two things here:

1) Not enable CPUID_KVM_FOO

In this case, the guest should not use KVM_FOO_MSR and the MSR does not
need to be migrated (the guest may try to use it, but the behavior when
trying to use it is undefined). Sending the MSR value when migrating
would be useless.


2) Enable CPUID_KVM_FOO.

In this case, the guest may try to use the feature and write something
into KVM_FOO_MSR. Sending the MSR value when migrating is absolutely
necessary

---

Now assume you run "qemu-1.2 -M pc-1.2" in the destination host, running
the 3.6 kernel (without KVM_FOO).

Then qemu-1.2 receives the KVM_FOO_MSR data in the migration stream. In
this case, qemu-1.2 simply can't decide if it's safe to drop the data
(and not tell KVM about it), or not.

If we simply send every MSR reported by the kernel, both the origin and
destination qemu-1.2 processes can't ever know if the MSR value is
important or not, because it doesn't know if it's part of the machine
state that has to be kept consistent.

We could introduce a mode of operation where _every_ MSR reported by KVM
is important and sent by the origin (and also where every MSR must be
handled by the destination, otherwise migration would fail). But this
new mode would break migration compatibility between two hosts running
the same machine-type, only because they are running different kernel
versions. But it may be useful for some use cases, so maybe it's
appropriate for a future "pc-next" machine-type (and for "-cpu host"),
but not for "pc-<version>".




> > 
> > 
> > > > Too late for 1.2?
> > > 
> > > Absolutely (in my opinion).
> > > 
> > > > 
> > > > > While we don't make the KVM feature-bit handling sane (with defaults
> > > > > that are not blindly derived from the host kernel capabilities), maybe
> > > > > the safest bet is to expect users to not migrate between hosts running
> > > > > kernels with different KVM capabilities? (I am not sure which option is
> > > > > better)
> > > > 
> > > > Sorry not sure what you talk about here. What has KVM feature-bit
> > > > handling to do with this patchset?
> > > 
> > > Everything? The whole point of this patch is to filter out the PV_EOI
> > > KVM feature bit.
> > > 
> > > 
> > > This part of the current code, specifically, is wrong:
> > > 
> > > > > >      plus_kvm_features = ~0; /* not supported bits will be filtered out later */
> > > 
> > > The QEMU-side list of KVM features should be whitelist-based, not
> > > blacklist-based (unless the user doesn't need migration, in that case he
> > > can use "-cpu host" and get every feature blindly enabled), because QEMU
> > > can't know if a new feature involves guest-visible state that has to be
> > > migrated.
> > > 
> > > 
> > > > 
> > > > > 
> > > > > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > > > > > ---
> > > > > >  hw/Makefile.objs  |  2 +-
> > > > > >  hw/cpu_flags.c    | 32 ++++++++++++++++++++++++++++++++
> > > > > >  hw/cpu_flags.h    |  9 +++++++++
> > > > > >  hw/pc_piix.c      |  2 ++
> > > > > >  target-i386/cpu.c |  8 ++++++++
> > > > > >  5 files changed, 52 insertions(+), 1 deletion(-)
> > > > > >  create mode 100644 hw/cpu_flags.c
> > > > > >  create mode 100644 hw/cpu_flags.h
> > > > > > 
> > > > > > diff --git a/hw/Makefile.objs b/hw/Makefile.objs
> > > > > > index 850b87b..3f2532a 100644
> > > > > > --- a/hw/Makefile.objs
> > > > > > +++ b/hw/Makefile.objs
> > > > > > @@ -1,5 +1,5 @@
> > > > > >  hw-obj-y = usb/ ide/
> > > > > > -hw-obj-y += loader.o
> > > > > > +hw-obj-y += loader.o cpu_flags.o
> > > > > >  hw-obj-$(CONFIG_VIRTIO) += virtio-console.o
> > > > > >  hw-obj-$(CONFIG_VIRTIO_PCI) += virtio-pci.o
> > > > > >  hw-obj-y += fw_cfg.o
> > > > > > diff --git a/hw/cpu_flags.c b/hw/cpu_flags.c
> > > > > > new file mode 100644
> > > > > > index 0000000..7a633c0
> > > > > > --- /dev/null
> > > > > > +++ b/hw/cpu_flags.c
> > > > > > @@ -0,0 +1,32 @@
> > > > > > +/*
> > > > > > + * CPU compatibility flags.
> > > > > > + *
> > > > > > + * Copyright (c) 2012 Red Hat Inc.
> > > > > > + * Author: Michael S. Tsirkin.
> > > > > > + *
> > > > > > + * This program is free software; you can redistribute it and/or modify
> > > > > > + * it under the terms of the GNU General Public License as published by
> > > > > > + * the Free Software Foundation; either version 2 of the License, or
> > > > > > + * (at your option) any later version.
> > > > > > + *
> > > > > > + * This program is distributed in the hope that it will be useful,
> > > > > > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > > > > > + * GNU General Public License for more details.
> > > > > > + *
> > > > > > + * You should have received a copy of the GNU General Public License along
> > > > > > + * with this program; if not, see <http://www.gnu.org/licenses/>.
> > > > > > + */
> > > > > > +#include "hw/cpu_flags.h"
> > > > > > +
> > > > > > +static bool kvm_pv_eoi_disabled_state;
> > > > > > +
> > > > > > +void disable_kvm_pv_eoi(void)
> > > > > > +{
> > > > > > +       kvm_pv_eoi_disabled_state = true;
> > > > > > +}
> > > > > > +
> > > > > > +bool kvm_pv_eoi_disabled(void)
> > > > > > +{
> > > > > > +       return kvm_pv_eoi_disabled_state;
> > > > > > +}
> > > > > > diff --git a/hw/cpu_flags.h b/hw/cpu_flags.h
> > > > > > new file mode 100644
> > > > > > index 0000000..05777b6
> > > > > > --- /dev/null
> > > > > > +++ b/hw/cpu_flags.h
> > > > > > @@ -0,0 +1,9 @@
> > > > > > +#ifndef HW_CPU_FLAGS_H
> > > > > > +#define HW_CPU_FLAGS_H
> > > > > > +
> > > > > > +#include <stdbool.h>
> > > > > > +
> > > > > > +void disable_kvm_pv_eoi(void);
> > > > > > +bool kvm_pv_eoi_disabled(void);
> > > > > > +
> > > > > > +#endif
> > > > > > diff --git a/hw/pc_piix.c b/hw/pc_piix.c
> > > > > > index 008d42f..bdbceda 100644
> > > > > > --- a/hw/pc_piix.c
> > > > > > +++ b/hw/pc_piix.c
> > > > > > @@ -46,6 +46,7 @@
> > > > > >  #ifdef CONFIG_XEN
> > > > > >  #  include <xen/hvm/hvm_info_table.h>
> > > > > >  #endif
> > > > > > +#include "cpu_flags.h"
> > > > > >  
> > > > > >  #define MAX_IDE_BUS 2
> > > > > >  
> > > > > > @@ -371,6 +372,7 @@ static QEMUMachine pc_machine_v1_2 = {
> > > > > >  
> > > > > >  static void pc_machine_v1_1_compat(void)
> > > > > >  {
> > > > > > +    disable_kvm_pv_eoi();
> > > > > >  }
> > > > > >  
> > > > > >  static void pc_init_pci_v1_1(ram_addr_t ram_size,
> > > > > > diff --git a/target-i386/cpu.c b/target-i386/cpu.c
> > > > > > index 120a2e3..0d02fd1 100644
> > > > > > --- a/target-i386/cpu.c
> > > > > > +++ b/target-i386/cpu.c
> > > > > > @@ -23,6 +23,7 @@
> > > > > >  
> > > > > >  #include "cpu.h"
> > > > > >  #include "kvm.h"
> > > > > > +#include "asm/kvm_para.h"
> > > > > >  
> > > > > >  #include "qemu-option.h"
> > > > > >  #include "qemu-config.h"
> > > > > > @@ -33,6 +34,7 @@
> > > > > >  #include "hyperv.h"
> > > > > >  
> > > > > >  #include "hw/hw.h"
> > > > > > +#include "hw/cpu_flags.h"
> > > > > >  
> > > > > >  /* feature flags taken from "Intel Processor Identification and the CPUID
> > > > > >   * Instruction" and AMD's "CPUID Specification".  In cases of disagreement
> > > > > > @@ -889,6 +891,12 @@ static int cpu_x86_find_by_name(x86_def_t *x86_cpu_def, const char *cpu_model)
> > > > > >  
> > > > > >      plus_kvm_features = ~0; /* not supported bits will be filtered out later */
> > > > > >  
> > > > > > +    /* Disable PV EOI for old machine types.
> > > > > > +     * Feature flags can still override. */
> > > > > > +    if (kvm_pv_eoi_disabled()) {
> > > > > > +        plus_kvm_features &= ~(0x1 << KVM_FEATURE_PV_EOI);
> > > > > > +    }
> > > > > > +
> > > > > >      add_flagname_to_bitmaps("hypervisor", &plus_features,
> > > > > >          &plus_ext_features, &plus_ext2_features, &plus_ext3_features,
> > > > > >          &plus_kvm_features, &plus_svm_features);
> > > > > > -- 
> > > > > > MST
> > > > > > 
> > > > > > 
> > > > > 
> > > > > -- 
> > > > > Eduardo
> > > 
> > > -- 
> > > Eduardo
Marcelo Tosatti Aug. 29, 2012, 9:59 a.m. UTC | #7
On Wed, Aug 29, 2012 at 01:21:13AM +0300, Michael S. Tsirkin wrote:
> On Tue, Aug 28, 2012 at 07:02:42PM -0300, Eduardo Habkost wrote:
> > On Wed, Aug 29, 2012 at 12:35:28AM +0300, Michael S. Tsirkin wrote:
> > > On Tue, Aug 28, 2012 at 04:13:38PM -0300, Eduardo Habkost wrote:
> > > > On Tue, Aug 28, 2012 at 08:43:52PM +0300, Michael S. Tsirkin wrote:
> > > > > In preparation for adding PV EOI support, disable PV EOI by default for
> > > > > 1.1 and older machine types, to avoid CPUID changing during migration.
> > > > > 
> > > > > PV EOI can still be enabled/disabled by specifying it explicitly.
> > > > >     Enable for 1.1
> > > > >     -M pc-1.1 -cpu kvm64,+kvm_pv_eoi
> > > > >     Disable for 1.2
> > > > >     -M pc-1.2 -cpu kvm64,-kvm_pv_eoi
> > > > > 
> > > > 
> > > > What about users that are already running "qemu-1.1 -M pc-1.1" on a host
> > > > kernel that supports PV EOI already? They would get PV EOI disabled when
> > > > migrating to a destination running "qemu-1.2 -M pc-1.1".
> > > > 
> > > > (On the other hand, people running "qemu-1.1 -M pc-1.1" on a host kernel
> > > > supporting PV EOI already have migration broken, so there's not much we
> > > > can do for them)
> > > 
> > > Exactly.
> > > 
> > > Talked to Gleb, long term I think we should rework code to make
> > > it forward-compatible wrt adding new MSRs:
> > > - source gets list of MSRs to be migrated from KVM and simply sends them all
> > > - send all MSRS in key/value format
> > > - destination gets list of MSRs to be migrated from KVM and
> > >   only restores the supported ones
> > 
> > As far as I understand the migration code requirements/expectations, if
> > the origin is sending some data, it is because it is part of the
> > guest-visible machine state that must be kept while migrating. Because
> > of that, the destination is not allowed to drop anything it doesn't know
> > about.
> 
> 
> We have a ton of code that reads in values then just
> ignores them, for compat with old qemu.
> This will be exactly such a case:
> we don't drop anything - protocol does not
> support this. We read and simply do not tell kvm
> about it. We also have tons of code that sends
> useless values again for compatibility.
> 
> > At the same time, if it's not part of guest-visible machine
> > state, it doesn't have to be sent by the migration origin.
> > 
> 
> False, we often send internal device state which is not
> directly guest visible.
> 
> > On the other hand, a mode of operation that doesn't require updating
> > QEMU every time there's a new bit of guest-visible state to be migrated
> > would be nice (just like the "-cpu host" mode, that doesn't require
> > updating QEMU for every new CPU feature, is nice for some use cases). I
> > just don't know how to make work with the current migration protocol.
> > 
> 
> I don't understand. What is the problem with the proposal?
> What will not work with our protocol?
> Can you give an example please?
> 
> 
> > > Too late for 1.2?
> > 
> > Absolutely (in my opinion).

My concern here is that you are introducing infrastructure by using
_one example_, claiming it to solve a general problem, without the
appropriate amount of energy spent in this solution.

Using old machines is supposed to be supported on older guests.

Michael, not migrating the PVEOI MSR actually crashes the guest?
Marcelo Tosatti Aug. 29, 2012, 10:03 a.m. UTC | #8
On Wed, Aug 29, 2012 at 06:59:03AM -0300, Marcelo Tosatti wrote:
> > I don't understand. What is the problem with the proposal?
> > What will not work with our protocol?
> > Can you give an example please?
> > 
> > 
> > > > Too late for 1.2?
> > > 
> > > Absolutely (in my opinion).
> 
> My concern here is that you are introducing infrastructure by using
> _one example_, claiming it to solve a general problem, without the
> appropriate amount of energy spent in this solution.
> 
> Using old machines is supposed to be supported on older guests.
> 
> Michael, not migrating the PVEOI MSR actually crashes the guest?

Which is irrelevant since it won't be able to migrate to an older 
qemu.
Michael S. Tsirkin Aug. 29, 2012, 10:06 a.m. UTC | #9
On Tue, Aug 28, 2012 at 08:50:09PM -0300, Eduardo Habkost wrote:
> On Wed, Aug 29, 2012 at 01:25:35AM +0300, Michael S. Tsirkin wrote:
> > On Wed, Aug 29, 2012 at 01:21:13AM +0300, Michael S. Tsirkin wrote:
> > > On Tue, Aug 28, 2012 at 07:02:42PM -0300, Eduardo Habkost wrote:
> > > > On Wed, Aug 29, 2012 at 12:35:28AM +0300, Michael S. Tsirkin wrote:
> > > > > On Tue, Aug 28, 2012 at 04:13:38PM -0300, Eduardo Habkost wrote:
> > > > > > On Tue, Aug 28, 2012 at 08:43:52PM +0300, Michael S. Tsirkin wrote:
> > > > > > > In preparation for adding PV EOI support, disable PV EOI by default for
> > > > > > > 1.1 and older machine types, to avoid CPUID changing during migration.
> > > > > > > 
> > > > > > > PV EOI can still be enabled/disabled by specifying it explicitly.
> > > > > > >     Enable for 1.1
> > > > > > >     -M pc-1.1 -cpu kvm64,+kvm_pv_eoi
> > > > > > >     Disable for 1.2
> > > > > > >     -M pc-1.2 -cpu kvm64,-kvm_pv_eoi
> > > > > > > 
> > > > > > 
> > > > > > What about users that are already running "qemu-1.1 -M pc-1.1" on a host
> > > > > > kernel that supports PV EOI already? They would get PV EOI disabled when
> > > > > > migrating to a destination running "qemu-1.2 -M pc-1.1".
> > > > > > 
> > > > > > (On the other hand, people running "qemu-1.1 -M pc-1.1" on a host kernel
> > > > > > supporting PV EOI already have migration broken, so there's not much we
> > > > > > can do for them)
> > > > > 
> > > > > Exactly.
> > > > > 
> > > > > Talked to Gleb, long term I think we should rework code to make
> > > > > it forward-compatible wrt adding new MSRs:
> > > > > - source gets list of MSRs to be migrated from KVM and simply sends them all
> > > > > - send all MSRS in key/value format
> > > > > - destination gets list of MSRs to be migrated from KVM and
> > > > >   only restores the supported ones
> > > > 
> > > > As far as I understand the migration code requirements/expectations, if
> > > > the origin is sending some data, it is because it is part of the
> > > > guest-visible machine state that must be kept while migrating. Because
> > > > of that, the destination is not allowed to drop anything it doesn't know
> > > > about.
> > > 
> > > 
> > > We have a ton of code that reads in values then just
> > > ignores them, for compat with old qemu.
> 
> The difference is that you ignore only the values you _know_ to be safe
> to be ignored. You can't ignore a value simply because you don't know
> what it means, and if it's important or not.
> > > This will be exactly such a case:
> > > we don't drop anything - protocol does not
> > > support this. We read and simply do not tell kvm
> > > about it.
> 
> This fits the definition of "dropping", to me.
> 
> > We also have tons of code that sends
> > > useless values again for compatibility.
> 
> But these values should be ignored only if the receiver knows exactly
> what it means, and knows exactly why/when it can be ignored.

Sorry I just do not understand these meta arguments.  Do you mean
the example below? If yes let us focus on it please.

> > > 
> > > > At the same time, if it's not part of guest-visible machine
> > > > state, it doesn't have to be sent by the migration origin.
> 
> It may be not directly visible, but if it's part of the machine state,
> it affects the guest in some way. If it didn't affect the guest in any
> way, we wouldn't even have to send it while migrating, in the first
> place.
> 
> > > > 
> > > 
> > > False, we often send internal device state which is not
> > > directly guest visible.
> > 
> > Besides, all MSRs in KVM's MSR list are actually guest visible, even if
> > guests normally do not invoke these MSRs - they can.
> 
> The difference is that the machine doesn't guarantee a MSR to be
> migrated unless the corresponding feature bit reports the MSR as
> supported.
> 
> e.g. using the PV EOI MSR without having the PV EOI CPUID bit set is
> undefined behavior. The guest shouldn't do it.

So the problematic scenario involves a guest that sets PV EOI when CPUID
bit is off on source. Why do we even care what happens on migration?

> On the other hand, if the PV EOI CPUID bit is set, the MSR should be
> always kept, no matter what.
> 
> This means that we need to migrate the MSR only if the corresponding
> CPUID bit is enabled.
> 

I do not follow. Why does it mean that?
It seems completely safe to migrate this MSR no matter what.

> > 
> > > > On the other hand, a mode of operation that doesn't require updating
> > > > QEMU every time there's a new bit of guest-visible state to be migrated
> > > > would be nice (just like the "-cpu host" mode, that doesn't require
> > > > updating QEMU for every new CPU feature, is nice for some use cases). I
> > > > just don't know how to make work with the current migration protocol.
> > > > 
> > > 
> > > I don't understand. What is the problem with the proposal?
> > > What will not work with our protocol?
> > > Can you give an example please?
> 
> Yes:
> 
> Suppose kernel 3.7 introduces KVM_FOO_MSR and CPUID_KVM_FOO.
> 
> Also, suppose QEMU 1.2 doesn't know anything about KVM_FOO, because it
> was release before this feature was introduced.
> 
> 
> Then you run "qemu-1.2 -M pc-1.2" on a 3.7 host kernel. qemu-1.2 can do
> two things here:
> 
> 1) Not enable CPUID_KVM_FOO
> 
> In this case, the guest should not use KVM_FOO_MSR and the MSR does not
> need to be migrated (the guest may try to use it, but the behavior when
> trying to use it is undefined). Sending the MSR value when migrating
> would be useless.
> 
> 
> 2) Enable CPUID_KVM_FOO.
> 
> In this case, the guest may try to use the feature and write something
> into KVM_FOO_MSR. Sending the MSR value when migrating is absolutely
> necessary
> 
> ---
> 
> Now assume you run "qemu-1.2 -M pc-1.2" in the destination host, running
> the 3.6 kernel (without KVM_FOO).
> 
> Then qemu-1.2 receives the KVM_FOO_MSR data in the migration stream. In
> this case, qemu-1.2 simply can't decide if it's safe to drop the data
> (and not tell KVM about it), or not.
> If we simply send every MSR reported by the kernel, both the origin and
> destination qemu-1.2 processes can't ever know if the MSR value is
> important or not, because it doesn't know if it's part of the machine
> state that has to be kept consistent.
> We could introduce a mode of operation where _every_ MSR reported by KVM
> is important and sent by the origin (and also where every MSR must be
> handled by the destination, otherwise migration would fail). But this
> new mode would break migration compatibility between two hosts running
> the same machine-type, only because they are running different kernel
> versions. But it may be useful for some use cases, so maybe it's
> appropriate for a future "pc-next" machine-type (and for "-cpu host"),
> but not for "pc-<version>".
> 

In this example, we should migrate CPUID (don't we?).
Destination should validate that CPUID supplied by source
matches what it can support (doesn't it?).

If we do and it does not, it's an unrelated bug:
CPUID changing under guest's feet.

> > > 
> > > 
> > > > > Too late for 1.2?
> > > > 
> > > > Absolutely (in my opinion).
> > > > 
> > > > > 
> > > > > > While we don't make the KVM feature-bit handling sane (with defaults
> > > > > > that are not blindly derived from the host kernel capabilities), maybe
> > > > > > the safest bet is to expect users to not migrate between hosts running
> > > > > > kernels with different KVM capabilities? (I am not sure which option is
> > > > > > better)
> > > > > 
> > > > > Sorry not sure what you talk about here. What has KVM feature-bit
> > > > > handling to do with this patchset?
> > > > 
> > > > Everything? The whole point of this patch is to filter out the PV_EOI
> > > > KVM feature bit.
> > > > 
> > > > 
> > > > This part of the current code, specifically, is wrong:
> > > > 
> > > > > > >      plus_kvm_features = ~0; /* not supported bits will be filtered out later */
> > > > 
> > > > The QEMU-side list of KVM features should be whitelist-based, not
> > > > blacklist-based (unless the user doesn't need migration, in that case he
> > > > can use "-cpu host" and get every feature blindly enabled), because QEMU
> > > > can't know if a new feature involves guest-visible state that has to be
> > > > migrated.
> > > > 
> > > > 
> > > > > 
> > > > > > 
> > > > > > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > > > > > > ---
> > > > > > >  hw/Makefile.objs  |  2 +-
> > > > > > >  hw/cpu_flags.c    | 32 ++++++++++++++++++++++++++++++++
> > > > > > >  hw/cpu_flags.h    |  9 +++++++++
> > > > > > >  hw/pc_piix.c      |  2 ++
> > > > > > >  target-i386/cpu.c |  8 ++++++++
> > > > > > >  5 files changed, 52 insertions(+), 1 deletion(-)
> > > > > > >  create mode 100644 hw/cpu_flags.c
> > > > > > >  create mode 100644 hw/cpu_flags.h
> > > > > > > 
> > > > > > > diff --git a/hw/Makefile.objs b/hw/Makefile.objs
> > > > > > > index 850b87b..3f2532a 100644
> > > > > > > --- a/hw/Makefile.objs
> > > > > > > +++ b/hw/Makefile.objs
> > > > > > > @@ -1,5 +1,5 @@
> > > > > > >  hw-obj-y = usb/ ide/
> > > > > > > -hw-obj-y += loader.o
> > > > > > > +hw-obj-y += loader.o cpu_flags.o
> > > > > > >  hw-obj-$(CONFIG_VIRTIO) += virtio-console.o
> > > > > > >  hw-obj-$(CONFIG_VIRTIO_PCI) += virtio-pci.o
> > > > > > >  hw-obj-y += fw_cfg.o
> > > > > > > diff --git a/hw/cpu_flags.c b/hw/cpu_flags.c
> > > > > > > new file mode 100644
> > > > > > > index 0000000..7a633c0
> > > > > > > --- /dev/null
> > > > > > > +++ b/hw/cpu_flags.c
> > > > > > > @@ -0,0 +1,32 @@
> > > > > > > +/*
> > > > > > > + * CPU compatibility flags.
> > > > > > > + *
> > > > > > > + * Copyright (c) 2012 Red Hat Inc.
> > > > > > > + * Author: Michael S. Tsirkin.
> > > > > > > + *
> > > > > > > + * This program is free software; you can redistribute it and/or modify
> > > > > > > + * it under the terms of the GNU General Public License as published by
> > > > > > > + * the Free Software Foundation; either version 2 of the License, or
> > > > > > > + * (at your option) any later version.
> > > > > > > + *
> > > > > > > + * This program is distributed in the hope that it will be useful,
> > > > > > > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > > > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > > > > > > + * GNU General Public License for more details.
> > > > > > > + *
> > > > > > > + * You should have received a copy of the GNU General Public License along
> > > > > > > + * with this program; if not, see <http://www.gnu.org/licenses/>.
> > > > > > > + */
> > > > > > > +#include "hw/cpu_flags.h"
> > > > > > > +
> > > > > > > +static bool kvm_pv_eoi_disabled_state;
> > > > > > > +
> > > > > > > +void disable_kvm_pv_eoi(void)
> > > > > > > +{
> > > > > > > +       kvm_pv_eoi_disabled_state = true;
> > > > > > > +}
> > > > > > > +
> > > > > > > +bool kvm_pv_eoi_disabled(void)
> > > > > > > +{
> > > > > > > +       return kvm_pv_eoi_disabled_state;
> > > > > > > +}
> > > > > > > diff --git a/hw/cpu_flags.h b/hw/cpu_flags.h
> > > > > > > new file mode 100644
> > > > > > > index 0000000..05777b6
> > > > > > > --- /dev/null
> > > > > > > +++ b/hw/cpu_flags.h
> > > > > > > @@ -0,0 +1,9 @@
> > > > > > > +#ifndef HW_CPU_FLAGS_H
> > > > > > > +#define HW_CPU_FLAGS_H
> > > > > > > +
> > > > > > > +#include <stdbool.h>
> > > > > > > +
> > > > > > > +void disable_kvm_pv_eoi(void);
> > > > > > > +bool kvm_pv_eoi_disabled(void);
> > > > > > > +
> > > > > > > +#endif
> > > > > > > diff --git a/hw/pc_piix.c b/hw/pc_piix.c
> > > > > > > index 008d42f..bdbceda 100644
> > > > > > > --- a/hw/pc_piix.c
> > > > > > > +++ b/hw/pc_piix.c
> > > > > > > @@ -46,6 +46,7 @@
> > > > > > >  #ifdef CONFIG_XEN
> > > > > > >  #  include <xen/hvm/hvm_info_table.h>
> > > > > > >  #endif
> > > > > > > +#include "cpu_flags.h"
> > > > > > >  
> > > > > > >  #define MAX_IDE_BUS 2
> > > > > > >  
> > > > > > > @@ -371,6 +372,7 @@ static QEMUMachine pc_machine_v1_2 = {
> > > > > > >  
> > > > > > >  static void pc_machine_v1_1_compat(void)
> > > > > > >  {
> > > > > > > +    disable_kvm_pv_eoi();
> > > > > > >  }
> > > > > > >  
> > > > > > >  static void pc_init_pci_v1_1(ram_addr_t ram_size,
> > > > > > > diff --git a/target-i386/cpu.c b/target-i386/cpu.c
> > > > > > > index 120a2e3..0d02fd1 100644
> > > > > > > --- a/target-i386/cpu.c
> > > > > > > +++ b/target-i386/cpu.c
> > > > > > > @@ -23,6 +23,7 @@
> > > > > > >  
> > > > > > >  #include "cpu.h"
> > > > > > >  #include "kvm.h"
> > > > > > > +#include "asm/kvm_para.h"
> > > > > > >  
> > > > > > >  #include "qemu-option.h"
> > > > > > >  #include "qemu-config.h"
> > > > > > > @@ -33,6 +34,7 @@
> > > > > > >  #include "hyperv.h"
> > > > > > >  
> > > > > > >  #include "hw/hw.h"
> > > > > > > +#include "hw/cpu_flags.h"
> > > > > > >  
> > > > > > >  /* feature flags taken from "Intel Processor Identification and the CPUID
> > > > > > >   * Instruction" and AMD's "CPUID Specification".  In cases of disagreement
> > > > > > > @@ -889,6 +891,12 @@ static int cpu_x86_find_by_name(x86_def_t *x86_cpu_def, const char *cpu_model)
> > > > > > >  
> > > > > > >      plus_kvm_features = ~0; /* not supported bits will be filtered out later */
> > > > > > >  
> > > > > > > +    /* Disable PV EOI for old machine types.
> > > > > > > +     * Feature flags can still override. */
> > > > > > > +    if (kvm_pv_eoi_disabled()) {
> > > > > > > +        plus_kvm_features &= ~(0x1 << KVM_FEATURE_PV_EOI);
> > > > > > > +    }
> > > > > > > +
> > > > > > >      add_flagname_to_bitmaps("hypervisor", &plus_features,
> > > > > > >          &plus_ext_features, &plus_ext2_features, &plus_ext3_features,
> > > > > > >          &plus_kvm_features, &plus_svm_features);
> > > > > > > -- 
> > > > > > > MST
> > > > > > > 
> > > > > > > 
> > > > > > 
> > > > > > -- 
> > > > > > Eduardo
> > > > 
> > > > -- 
> > > > Eduardo
> 
> -- 
> Eduardo
Michael S. Tsirkin Aug. 29, 2012, 10:23 a.m. UTC | #10
On Wed, Aug 29, 2012 at 06:59:03AM -0300, Marcelo Tosatti wrote:
> On Wed, Aug 29, 2012 at 01:21:13AM +0300, Michael S. Tsirkin wrote:
> > On Tue, Aug 28, 2012 at 07:02:42PM -0300, Eduardo Habkost wrote:
> > > On Wed, Aug 29, 2012 at 12:35:28AM +0300, Michael S. Tsirkin wrote:
> > > > On Tue, Aug 28, 2012 at 04:13:38PM -0300, Eduardo Habkost wrote:
> > > > > On Tue, Aug 28, 2012 at 08:43:52PM +0300, Michael S. Tsirkin wrote:
> > > > > > In preparation for adding PV EOI support, disable PV EOI by default for
> > > > > > 1.1 and older machine types, to avoid CPUID changing during migration.
> > > > > > 
> > > > > > PV EOI can still be enabled/disabled by specifying it explicitly.
> > > > > >     Enable for 1.1
> > > > > >     -M pc-1.1 -cpu kvm64,+kvm_pv_eoi
> > > > > >     Disable for 1.2
> > > > > >     -M pc-1.2 -cpu kvm64,-kvm_pv_eoi
> > > > > > 
> > > > > 
> > > > > What about users that are already running "qemu-1.1 -M pc-1.1" on a host
> > > > > kernel that supports PV EOI already? They would get PV EOI disabled when
> > > > > migrating to a destination running "qemu-1.2 -M pc-1.1".
> > > > > 
> > > > > (On the other hand, people running "qemu-1.1 -M pc-1.1" on a host kernel
> > > > > supporting PV EOI already have migration broken, so there's not much we
> > > > > can do for them)
> > > > 
> > > > Exactly.
> > > > 
> > > > Talked to Gleb, long term I think we should rework code to make
> > > > it forward-compatible wrt adding new MSRs:
> > > > - source gets list of MSRs to be migrated from KVM and simply sends them all
> > > > - send all MSRS in key/value format
> > > > - destination gets list of MSRs to be migrated from KVM and
> > > >   only restores the supported ones
> > > 
> > > As far as I understand the migration code requirements/expectations, if
> > > the origin is sending some data, it is because it is part of the
> > > guest-visible machine state that must be kept while migrating. Because
> > > of that, the destination is not allowed to drop anything it doesn't know
> > > about.
> > 
> > 
> > We have a ton of code that reads in values then just
> > ignores them, for compat with old qemu.
> > This will be exactly such a case:
> > we don't drop anything - protocol does not
> > support this. We read and simply do not tell kvm
> > about it. We also have tons of code that sends
> > useless values again for compatibility.
> > 
> > > At the same time, if it's not part of guest-visible machine
> > > state, it doesn't have to be sent by the migration origin.
> > > 
> > 
> > False, we often send internal device state which is not
> > directly guest visible.
> > 
> > > On the other hand, a mode of operation that doesn't require updating
> > > QEMU every time there's a new bit of guest-visible state to be migrated
> > > would be nice (just like the "-cpu host" mode, that doesn't require
> > > updating QEMU for every new CPU feature, is nice for some use cases). I
> > > just don't know how to make work with the current migration protocol.
> > > 
> > 
> > I don't understand. What is the problem with the proposal?
> > What will not work with our protocol?
> > Can you give an example please?
> > 
> > 
> > > > Too late for 1.2?
> > > 
> > > Absolutely (in my opinion).
> 
> My concern here is that you are introducing infrastructure by using
> _one example_, claiming it to solve a general problem, without the
> appropriate amount of energy spent in this solution.

What are you talking about? This new idea of mine?
Do you just assume this? Why? I can give lots of
past examples where it would have worked and
gave us forward compatibility.
For example, APF.
In another example, kvm clock.


> Using old machines is supposed to be supported on older guests.

Sorry, what does this mean exactly?

> Michael, not migrating the PVEOI MSR actually crashes the guest?


Not at all. migration currently simply disables PV EOI, so
each EOI triggers exits after migration.
Michael S. Tsirkin Aug. 29, 2012, 10:31 a.m. UTC | #11
On Wed, Aug 29, 2012 at 01:23:24PM +0300, Michael S. Tsirkin wrote:
> > Michael, not migrating the PVEOI MSR actually crashes the guest?
> 
> 
> Not at all. migration currently simply disables PV EOI, so
> each EOI triggers exits after migration.

I would like to add: without this patch guest will crash.
But what will crash guest is *not* the fact
we do not migrate the PV EOI MSR.
It is the fact that PV EOI bit is set in CPUID.
If guest reads CPUID before migration and then
writes the MSR after migration, it will crash.

> -- 
> MST
Michael S. Tsirkin Aug. 29, 2012, 10:32 a.m. UTC | #12
On Wed, Aug 29, 2012 at 07:03:34AM -0300, Marcelo Tosatti wrote:
> On Wed, Aug 29, 2012 at 06:59:03AM -0300, Marcelo Tosatti wrote:
> > > I don't understand. What is the problem with the proposal?
> > > What will not work with our protocol?
> > > Can you give an example please?
> > > 
> > > 
> > > > > Too late for 1.2?
> > > > 
> > > > Absolutely (in my opinion).
> > 
> > My concern here is that you are introducing infrastructure by using
> > _one example_, claiming it to solve a general problem, without the
> > appropriate amount of energy spent in this solution.
> > 
> > Using old machines is supposed to be supported on older guests.
> > 
> > Michael, not migrating the PVEOI MSR actually crashes the guest?
> 
> Which is irrelevant since it won't be able to migrate to an older 
> qemu.
> 

With this patch, -M pc-1.1 migrates fine to an older qemu
and no crash.

Without, PV EOI bit in CPUID is set on source so
we have a problem.
Eduardo Habkost Aug. 29, 2012, 12:56 p.m. UTC | #13
On Wed, Aug 29, 2012 at 01:06:32PM +0300, Michael S. Tsirkin wrote:
> On Tue, Aug 28, 2012 at 08:50:09PM -0300, Eduardo Habkost wrote:
> > On Wed, Aug 29, 2012 at 01:25:35AM +0300, Michael S. Tsirkin wrote:
> > > On Wed, Aug 29, 2012 at 01:21:13AM +0300, Michael S. Tsirkin wrote:
> > > > On Tue, Aug 28, 2012 at 07:02:42PM -0300, Eduardo Habkost wrote:
> > > > > On Wed, Aug 29, 2012 at 12:35:28AM +0300, Michael S. Tsirkin wrote:
> > > > > > On Tue, Aug 28, 2012 at 04:13:38PM -0300, Eduardo Habkost wrote:
> > > > > > > On Tue, Aug 28, 2012 at 08:43:52PM +0300, Michael S. Tsirkin wrote:
> > > > > > > > In preparation for adding PV EOI support, disable PV EOI by default for
> > > > > > > > 1.1 and older machine types, to avoid CPUID changing during migration.
> > > > > > > > 
> > > > > > > > PV EOI can still be enabled/disabled by specifying it explicitly.
> > > > > > > >     Enable for 1.1
> > > > > > > >     -M pc-1.1 -cpu kvm64,+kvm_pv_eoi
> > > > > > > >     Disable for 1.2
> > > > > > > >     -M pc-1.2 -cpu kvm64,-kvm_pv_eoi
> > > > > > > > 
> > > > > > > 
> > > > > > > What about users that are already running "qemu-1.1 -M pc-1.1" on a host
> > > > > > > kernel that supports PV EOI already? They would get PV EOI disabled when
> > > > > > > migrating to a destination running "qemu-1.2 -M pc-1.1".
> > > > > > > 
> > > > > > > (On the other hand, people running "qemu-1.1 -M pc-1.1" on a host kernel
> > > > > > > supporting PV EOI already have migration broken, so there's not much we
> > > > > > > can do for them)
> > > > > > 
> > > > > > Exactly.
> > > > > > 
> > > > > > Talked to Gleb, long term I think we should rework code to make
> > > > > > it forward-compatible wrt adding new MSRs:
> > > > > > - source gets list of MSRs to be migrated from KVM and simply sends them all
> > > > > > - send all MSRS in key/value format
> > > > > > - destination gets list of MSRs to be migrated from KVM and
> > > > > >   only restores the supported ones
> > > > > 
> > > > > As far as I understand the migration code requirements/expectations, if
> > > > > the origin is sending some data, it is because it is part of the
> > > > > guest-visible machine state that must be kept while migrating. Because
> > > > > of that, the destination is not allowed to drop anything it doesn't know
> > > > > about.
> > > > 
> > > > 
> > > > We have a ton of code that reads in values then just
> > > > ignores them, for compat with old qemu.
> > 
> > The difference is that you ignore only the values you _know_ to be safe
> > to be ignored. You can't ignore a value simply because you don't know
> > what it means, and if it's important or not.
> > > > This will be exactly such a case:
> > > > we don't drop anything - protocol does not
> > > > support this. We read and simply do not tell kvm
> > > > about it.
> > 
> > This fits the definition of "dropping", to me.
> > 
> > > We also have tons of code that sends
> > > > useless values again for compatibility.
> > 
> > But these values should be ignored only if the receiver knows exactly
> > what it means, and knows exactly why/when it can be ignored.
> 
> Sorry I just do not understand these meta arguments.  Do you mean
> the example below? If yes let us focus on it please.

We have to have these meta arguments, because the problem is about
features/MSRs that will be introduced in the future. But okay, let's
follow with the specific argument:

> 
> > > > 
> > > > > At the same time, if it's not part of guest-visible machine
> > > > > state, it doesn't have to be sent by the migration origin.
> > 
> > It may be not directly visible, but if it's part of the machine state,
> > it affects the guest in some way. If it didn't affect the guest in any
> > way, we wouldn't even have to send it while migrating, in the first
> > place.
> > 
> > > > > 
> > > > 
> > > > False, we often send internal device state which is not
> > > > directly guest visible.
> > > 
> > > Besides, all MSRs in KVM's MSR list are actually guest visible, even if
> > > guests normally do not invoke these MSRs - they can.
> > 
> > The difference is that the machine doesn't guarantee a MSR to be
> > migrated unless the corresponding feature bit reports the MSR as
> > supported.
> > 
> > e.g. using the PV EOI MSR without having the PV EOI CPUID bit set is
> > undefined behavior. The guest shouldn't do it.
> 
> So the problematic scenario involves a guest that sets PV EOI when CPUID
> bit is off on source. Why do we even care what happens on migration?

This is the case where there is _no_ problem. But the migration
destination can't know that, how can it decide if it's safe to drop the
MSR value or not?

> 
> > On the other hand, if the PV EOI CPUID bit is set, the MSR should be
> > always kept, no matter what.
> > 
> > This means that we need to migrate the MSR only if the corresponding
> > CPUID bit is enabled.
> > 
> 
> I do not follow. Why does it mean that?
> It seems completely safe to migrate this MSR no matter what.

It is completely safe to migrate the MSR, but it is completely unsafe to
_ignore_ the incoming MSR value. That's the problem.

> 
> > > 
> > > > > On the other hand, a mode of operation that doesn't require updating
> > > > > QEMU every time there's a new bit of guest-visible state to be migrated
> > > > > would be nice (just like the "-cpu host" mode, that doesn't require
> > > > > updating QEMU for every new CPU feature, is nice for some use cases). I
> > > > > just don't know how to make work with the current migration protocol.
> > > > > 
> > > > 
> > > > I don't understand. What is the problem with the proposal?
> > > > What will not work with our protocol?
> > > > Can you give an example please?
> > 
> > Yes:
> > 
> > Suppose kernel 3.7 introduces KVM_FOO_MSR and CPUID_KVM_FOO.
> > 
> > Also, suppose QEMU 1.2 doesn't know anything about KVM_FOO, because it
> > was release before this feature was introduced.
> > 
> > 
> > Then you run "qemu-1.2 -M pc-1.2" on a 3.7 host kernel. qemu-1.2 can do
> > two things here:
> > 
> > 1) Not enable CPUID_KVM_FOO
> > 
> > In this case, the guest should not use KVM_FOO_MSR and the MSR does not
> > need to be migrated (the guest may try to use it, but the behavior when
> > trying to use it is undefined). Sending the MSR value when migrating
> > would be useless.
> > 
> > 
> > 2) Enable CPUID_KVM_FOO.
> > 
> > In this case, the guest may try to use the feature and write something
> > into KVM_FOO_MSR. Sending the MSR value when migrating is absolutely
> > necessary
> > 
> > ---
> > 
> > Now assume you run "qemu-1.2 -M pc-1.2" in the destination host, running
> > the 3.6 kernel (without KVM_FOO).
> > 
> > Then qemu-1.2 receives the KVM_FOO_MSR data in the migration stream. In
> > this case, qemu-1.2 simply can't decide if it's safe to drop the data
> > (and not tell KVM about it), or not.
> > If we simply send every MSR reported by the kernel, both the origin and
> > destination qemu-1.2 processes can't ever know if the MSR value is
> > important or not, because it doesn't know if it's part of the machine
> > state that has to be kept consistent.
> > We could introduce a mode of operation where _every_ MSR reported by KVM
> > is important and sent by the origin (and also where every MSR must be
> > handled by the destination, otherwise migration would fail). But this
> > new mode would break migration compatibility between two hosts running
> > the same machine-type, only because they are running different kernel
> > versions. But it may be useful for some use cases, so maybe it's
> > appropriate for a future "pc-next" machine-type (and for "-cpu host"),
> > but not for "pc-<version>".
> > 
> 
> In this example, we should migrate CPUID (don't we?).
> Destination should validate that CPUID supplied by source
> matches what it can support (doesn't it?).
> 
> If we do and it does not, it's an unrelated bug:
> CPUID changing under guest's feet.

CPUID changing under guest's feet is another problem, that we also have
to solve. But we also have the problem of migration compatibility
between different host kernels.

Note that I am not saying that migrating all MSRs is wrong. It _is_
good, as long as:
- The destination never ignores any of the incoming MSR values.
- We don't do that by default on "pc-<version>", or we defeat the
  purpose of machine-types.

I'll try to enumerate the problems I am trying to address (that are
related, but are separate problems):

- MSR not being migrated when it should:
  - Possible solution: migrate all MSRs even if qemu doesn't know what
    they are.
    - Constraint: migration destination must _never_ ignore any incoming
      MSR value, because it can't decide if it is important to the guest
      or not (with the current KVM interfaces).
    - Problem with this solution: if we do that by default on
      "pc-<version>", we break migration compatibility between hosts
      with different kernel versions.
- Changing CPUID bits under guest's feet.
  - Proposed solution: migrating CPUID bits, refuse migration if
    destination doesn't support the same bits.
    - It solves the compatibility problem for migration to a newer
      kernel, but not to an older kernel. It helps to solve part of
      the problem, but not all.
  - Alternative solution: simply make the resulting CPUID bits not be a
    function of the host kernel capabilities, but only of the qemu
    configuration (except on "-cpu host" and "-M pc-next").
- Migration compatibility/predictability:
  - See my example above: feature introduced in a newer kernel,
    migration to an older kernel.
  - The only way I see to guarantee this is to never enable unknown
    CPUID bits or MSRs. People who don't care about predictable
    migration compatibility can use "-M pc-next", then.


> 
> > > > 
> > > > 
> > > > > > Too late for 1.2?
> > > > > 
> > > > > Absolutely (in my opinion).
> > > > > 
> > > > > > 
> > > > > > > While we don't make the KVM feature-bit handling sane (with defaults
> > > > > > > that are not blindly derived from the host kernel capabilities), maybe
> > > > > > > the safest bet is to expect users to not migrate between hosts running
> > > > > > > kernels with different KVM capabilities? (I am not sure which option is
> > > > > > > better)
> > > > > > 
> > > > > > Sorry not sure what you talk about here. What has KVM feature-bit
> > > > > > handling to do with this patchset?
> > > > > 
> > > > > Everything? The whole point of this patch is to filter out the PV_EOI
> > > > > KVM feature bit.
> > > > > 
> > > > > 
> > > > > This part of the current code, specifically, is wrong:
> > > > > 
> > > > > > > >      plus_kvm_features = ~0; /* not supported bits will be filtered out later */
> > > > > 
> > > > > The QEMU-side list of KVM features should be whitelist-based, not
> > > > > blacklist-based (unless the user doesn't need migration, in that case he
> > > > > can use "-cpu host" and get every feature blindly enabled), because QEMU
> > > > > can't know if a new feature involves guest-visible state that has to be
> > > > > migrated.
> > > > > 
> > > > > 
> > > > > > 
> > > > > > > 
> > > > > > > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > > > > > > > ---
> > > > > > > >  hw/Makefile.objs  |  2 +-
> > > > > > > >  hw/cpu_flags.c    | 32 ++++++++++++++++++++++++++++++++
> > > > > > > >  hw/cpu_flags.h    |  9 +++++++++
> > > > > > > >  hw/pc_piix.c      |  2 ++
> > > > > > > >  target-i386/cpu.c |  8 ++++++++
> > > > > > > >  5 files changed, 52 insertions(+), 1 deletion(-)
> > > > > > > >  create mode 100644 hw/cpu_flags.c
> > > > > > > >  create mode 100644 hw/cpu_flags.h
> > > > > > > > 
> > > > > > > > diff --git a/hw/Makefile.objs b/hw/Makefile.objs
> > > > > > > > index 850b87b..3f2532a 100644
> > > > > > > > --- a/hw/Makefile.objs
> > > > > > > > +++ b/hw/Makefile.objs
> > > > > > > > @@ -1,5 +1,5 @@
> > > > > > > >  hw-obj-y = usb/ ide/
> > > > > > > > -hw-obj-y += loader.o
> > > > > > > > +hw-obj-y += loader.o cpu_flags.o
> > > > > > > >  hw-obj-$(CONFIG_VIRTIO) += virtio-console.o
> > > > > > > >  hw-obj-$(CONFIG_VIRTIO_PCI) += virtio-pci.o
> > > > > > > >  hw-obj-y += fw_cfg.o
> > > > > > > > diff --git a/hw/cpu_flags.c b/hw/cpu_flags.c
> > > > > > > > new file mode 100644
> > > > > > > > index 0000000..7a633c0
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/hw/cpu_flags.c
> > > > > > > > @@ -0,0 +1,32 @@
> > > > > > > > +/*
> > > > > > > > + * CPU compatibility flags.
> > > > > > > > + *
> > > > > > > > + * Copyright (c) 2012 Red Hat Inc.
> > > > > > > > + * Author: Michael S. Tsirkin.
> > > > > > > > + *
> > > > > > > > + * This program is free software; you can redistribute it and/or modify
> > > > > > > > + * it under the terms of the GNU General Public License as published by
> > > > > > > > + * the Free Software Foundation; either version 2 of the License, or
> > > > > > > > + * (at your option) any later version.
> > > > > > > > + *
> > > > > > > > + * This program is distributed in the hope that it will be useful,
> > > > > > > > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > > > > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > > > > > > > + * GNU General Public License for more details.
> > > > > > > > + *
> > > > > > > > + * You should have received a copy of the GNU General Public License along
> > > > > > > > + * with this program; if not, see <http://www.gnu.org/licenses/>.
> > > > > > > > + */
> > > > > > > > +#include "hw/cpu_flags.h"
> > > > > > > > +
> > > > > > > > +static bool kvm_pv_eoi_disabled_state;
> > > > > > > > +
> > > > > > > > +void disable_kvm_pv_eoi(void)
> > > > > > > > +{
> > > > > > > > +       kvm_pv_eoi_disabled_state = true;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +bool kvm_pv_eoi_disabled(void)
> > > > > > > > +{
> > > > > > > > +       return kvm_pv_eoi_disabled_state;
> > > > > > > > +}
> > > > > > > > diff --git a/hw/cpu_flags.h b/hw/cpu_flags.h
> > > > > > > > new file mode 100644
> > > > > > > > index 0000000..05777b6
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/hw/cpu_flags.h
> > > > > > > > @@ -0,0 +1,9 @@
> > > > > > > > +#ifndef HW_CPU_FLAGS_H
> > > > > > > > +#define HW_CPU_FLAGS_H
> > > > > > > > +
> > > > > > > > +#include <stdbool.h>
> > > > > > > > +
> > > > > > > > +void disable_kvm_pv_eoi(void);
> > > > > > > > +bool kvm_pv_eoi_disabled(void);
> > > > > > > > +
> > > > > > > > +#endif
> > > > > > > > diff --git a/hw/pc_piix.c b/hw/pc_piix.c
> > > > > > > > index 008d42f..bdbceda 100644
> > > > > > > > --- a/hw/pc_piix.c
> > > > > > > > +++ b/hw/pc_piix.c
> > > > > > > > @@ -46,6 +46,7 @@
> > > > > > > >  #ifdef CONFIG_XEN
> > > > > > > >  #  include <xen/hvm/hvm_info_table.h>
> > > > > > > >  #endif
> > > > > > > > +#include "cpu_flags.h"
> > > > > > > >  
> > > > > > > >  #define MAX_IDE_BUS 2
> > > > > > > >  
> > > > > > > > @@ -371,6 +372,7 @@ static QEMUMachine pc_machine_v1_2 = {
> > > > > > > >  
> > > > > > > >  static void pc_machine_v1_1_compat(void)
> > > > > > > >  {
> > > > > > > > +    disable_kvm_pv_eoi();
> > > > > > > >  }
> > > > > > > >  
> > > > > > > >  static void pc_init_pci_v1_1(ram_addr_t ram_size,
> > > > > > > > diff --git a/target-i386/cpu.c b/target-i386/cpu.c
> > > > > > > > index 120a2e3..0d02fd1 100644
> > > > > > > > --- a/target-i386/cpu.c
> > > > > > > > +++ b/target-i386/cpu.c
> > > > > > > > @@ -23,6 +23,7 @@
> > > > > > > >  
> > > > > > > >  #include "cpu.h"
> > > > > > > >  #include "kvm.h"
> > > > > > > > +#include "asm/kvm_para.h"
> > > > > > > >  
> > > > > > > >  #include "qemu-option.h"
> > > > > > > >  #include "qemu-config.h"
> > > > > > > > @@ -33,6 +34,7 @@
> > > > > > > >  #include "hyperv.h"
> > > > > > > >  
> > > > > > > >  #include "hw/hw.h"
> > > > > > > > +#include "hw/cpu_flags.h"
> > > > > > > >  
> > > > > > > >  /* feature flags taken from "Intel Processor Identification and the CPUID
> > > > > > > >   * Instruction" and AMD's "CPUID Specification".  In cases of disagreement
> > > > > > > > @@ -889,6 +891,12 @@ static int cpu_x86_find_by_name(x86_def_t *x86_cpu_def, const char *cpu_model)
> > > > > > > >  
> > > > > > > >      plus_kvm_features = ~0; /* not supported bits will be filtered out later */
> > > > > > > >  
> > > > > > > > +    /* Disable PV EOI for old machine types.
> > > > > > > > +     * Feature flags can still override. */
> > > > > > > > +    if (kvm_pv_eoi_disabled()) {
> > > > > > > > +        plus_kvm_features &= ~(0x1 << KVM_FEATURE_PV_EOI);
> > > > > > > > +    }
> > > > > > > > +
> > > > > > > >      add_flagname_to_bitmaps("hypervisor", &plus_features,
> > > > > > > >          &plus_ext_features, &plus_ext2_features, &plus_ext3_features,
> > > > > > > >          &plus_kvm_features, &plus_svm_features);
> > > > > > > > -- 
> > > > > > > > MST
> > > > > > > > 
> > > > > > > > 
> > > > > > > 
> > > > > > > -- 
> > > > > > > Eduardo
> > > > > 
> > > > > -- 
> > > > > Eduardo
> > 
> > -- 
> > Eduardo
Michael S. Tsirkin Aug. 29, 2012, 1:18 p.m. UTC | #14
On Wed, Aug 29, 2012 at 09:56:12AM -0300, Eduardo Habkost wrote:
> On Wed, Aug 29, 2012 at 01:06:32PM +0300, Michael S. Tsirkin wrote:
> > On Tue, Aug 28, 2012 at 08:50:09PM -0300, Eduardo Habkost wrote:
> > > On Wed, Aug 29, 2012 at 01:25:35AM +0300, Michael S. Tsirkin wrote:
> > > > On Wed, Aug 29, 2012 at 01:21:13AM +0300, Michael S. Tsirkin wrote:
> > > > > On Tue, Aug 28, 2012 at 07:02:42PM -0300, Eduardo Habkost wrote:
> > > > > > On Wed, Aug 29, 2012 at 12:35:28AM +0300, Michael S. Tsirkin wrote:
> > > > > > > On Tue, Aug 28, 2012 at 04:13:38PM -0300, Eduardo Habkost wrote:
> > > > > > > > On Tue, Aug 28, 2012 at 08:43:52PM +0300, Michael S. Tsirkin wrote:
> > > > > > > > > In preparation for adding PV EOI support, disable PV EOI by default for
> > > > > > > > > 1.1 and older machine types, to avoid CPUID changing during migration.
> > > > > > > > > 
> > > > > > > > > PV EOI can still be enabled/disabled by specifying it explicitly.
> > > > > > > > >     Enable for 1.1
> > > > > > > > >     -M pc-1.1 -cpu kvm64,+kvm_pv_eoi
> > > > > > > > >     Disable for 1.2
> > > > > > > > >     -M pc-1.2 -cpu kvm64,-kvm_pv_eoi
> > > > > > > > > 
> > > > > > > > 
> > > > > > > > What about users that are already running "qemu-1.1 -M pc-1.1" on a host
> > > > > > > > kernel that supports PV EOI already? They would get PV EOI disabled when
> > > > > > > > migrating to a destination running "qemu-1.2 -M pc-1.1".
> > > > > > > > 
> > > > > > > > (On the other hand, people running "qemu-1.1 -M pc-1.1" on a host kernel
> > > > > > > > supporting PV EOI already have migration broken, so there's not much we
> > > > > > > > can do for them)
> > > > > > > 
> > > > > > > Exactly.
> > > > > > > 
> > > > > > > Talked to Gleb, long term I think we should rework code to make
> > > > > > > it forward-compatible wrt adding new MSRs:
> > > > > > > - source gets list of MSRs to be migrated from KVM and simply sends them all
> > > > > > > - send all MSRS in key/value format
> > > > > > > - destination gets list of MSRs to be migrated from KVM and
> > > > > > >   only restores the supported ones
> > > > > > 
> > > > > > As far as I understand the migration code requirements/expectations, if
> > > > > > the origin is sending some data, it is because it is part of the
> > > > > > guest-visible machine state that must be kept while migrating. Because
> > > > > > of that, the destination is not allowed to drop anything it doesn't know
> > > > > > about.
> > > > > 
> > > > > 
> > > > > We have a ton of code that reads in values then just
> > > > > ignores them, for compat with old qemu.
> > > 
> > > The difference is that you ignore only the values you _know_ to be safe
> > > to be ignored. You can't ignore a value simply because you don't know
> > > what it means, and if it's important or not.
> > > > > This will be exactly such a case:
> > > > > we don't drop anything - protocol does not
> > > > > support this. We read and simply do not tell kvm
> > > > > about it.
> > > 
> > > This fits the definition of "dropping", to me.
> > > 
> > > > We also have tons of code that sends
> > > > > useless values again for compatibility.
> > > 
> > > But these values should be ignored only if the receiver knows exactly
> > > what it means, and knows exactly why/when it can be ignored.
> > 
> > Sorry I just do not understand these meta arguments.  Do you mean
> > the example below? If yes let us focus on it please.
> 
> We have to have these meta arguments, because the problem is about
> features/MSRs that will be introduced in the future. But okay, let's
> follow with the specific argument:
> 
> > 
> > > > > 
> > > > > > At the same time, if it's not part of guest-visible machine
> > > > > > state, it doesn't have to be sent by the migration origin.
> > > 
> > > It may be not directly visible, but if it's part of the machine state,
> > > it affects the guest in some way. If it didn't affect the guest in any
> > > way, we wouldn't even have to send it while migrating, in the first
> > > place.
> > > 
> > > > > > 
> > > > > 
> > > > > False, we often send internal device state which is not
> > > > > directly guest visible.
> > > > 
> > > > Besides, all MSRs in KVM's MSR list are actually guest visible, even if
> > > > guests normally do not invoke these MSRs - they can.
> > > 
> > > The difference is that the machine doesn't guarantee a MSR to be
> > > migrated unless the corresponding feature bit reports the MSR as
> > > supported.
> > > 
> > > e.g. using the PV EOI MSR without having the PV EOI CPUID bit set is
> > > undefined behavior. The guest shouldn't do it.
> > 
> > So the problematic scenario involves a guest that sets PV EOI when CPUID
> > bit is off on source. Why do we even care what happens on migration?
> 
> This is the case where there is _no_ problem. But the migration
> destination can't know that, how can it decide if it's safe to drop the
> MSR value or not?

I looked and it seems always safe to drop MSRs that we have.
Any counter examples.

> > 
> > > On the other hand, if the PV EOI CPUID bit is set, the MSR should be
> > > always kept, no matter what.
> > > 
> > > This means that we need to migrate the MSR only if the corresponding
> > > CPUID bit is enabled.
> > > 
> > 
> > I do not follow. Why does it mean that?
> > It seems completely safe to migrate this MSR no matter what.
> 
> It is completely safe to migrate the MSR, but it is completely unsafe to
> _ignore_ the incoming MSR value. That's the problem.

Normally CPUID will tell you if such important MSR is available.
So we can check that at destination.

> > 
> > > > 
> > > > > > On the other hand, a mode of operation that doesn't require updating
> > > > > > QEMU every time there's a new bit of guest-visible state to be migrated
> > > > > > would be nice (just like the "-cpu host" mode, that doesn't require
> > > > > > updating QEMU for every new CPU feature, is nice for some use cases). I
> > > > > > just don't know how to make work with the current migration protocol.
> > > > > > 
> > > > > 
> > > > > I don't understand. What is the problem with the proposal?
> > > > > What will not work with our protocol?
> > > > > Can you give an example please?
> > > 
> > > Yes:
> > > 
> > > Suppose kernel 3.7 introduces KVM_FOO_MSR and CPUID_KVM_FOO.
> > > 
> > > Also, suppose QEMU 1.2 doesn't know anything about KVM_FOO, because it
> > > was release before this feature was introduced.
> > > 
> > > 
> > > Then you run "qemu-1.2 -M pc-1.2" on a 3.7 host kernel. qemu-1.2 can do
> > > two things here:
> > > 
> > > 1) Not enable CPUID_KVM_FOO
> > > 
> > > In this case, the guest should not use KVM_FOO_MSR and the MSR does not
> > > need to be migrated (the guest may try to use it, but the behavior when
> > > trying to use it is undefined). Sending the MSR value when migrating
> > > would be useless.
> > > 
> > > 
> > > 2) Enable CPUID_KVM_FOO.
> > > 
> > > In this case, the guest may try to use the feature and write something
> > > into KVM_FOO_MSR. Sending the MSR value when migrating is absolutely
> > > necessary
> > > 
> > > ---
> > > 
> > > Now assume you run "qemu-1.2 -M pc-1.2" in the destination host, running
> > > the 3.6 kernel (without KVM_FOO).
> > > 
> > > Then qemu-1.2 receives the KVM_FOO_MSR data in the migration stream. In
> > > this case, qemu-1.2 simply can't decide if it's safe to drop the data
> > > (and not tell KVM about it), or not.
> > > If we simply send every MSR reported by the kernel, both the origin and
> > > destination qemu-1.2 processes can't ever know if the MSR value is
> > > important or not, because it doesn't know if it's part of the machine
> > > state that has to be kept consistent.
> > > We could introduce a mode of operation where _every_ MSR reported by KVM
> > > is important and sent by the origin (and also where every MSR must be
> > > handled by the destination, otherwise migration would fail). But this
> > > new mode would break migration compatibility between two hosts running
> > > the same machine-type, only because they are running different kernel
> > > versions. But it may be useful for some use cases, so maybe it's
> > > appropriate for a future "pc-next" machine-type (and for "-cpu host"),
> > > but not for "pc-<version>".
> > > 
> > 
> > In this example, we should migrate CPUID (don't we?).
> > Destination should validate that CPUID supplied by source
> > matches what it can support (doesn't it?).
> > 
> > If we do and it does not, it's an unrelated bug:
> > CPUID changing under guest's feet.
> 
> CPUID changing under guest's feet is another problem, that we also have
> to solve.
> But we also have the problem of migration compatibility
> between different host kernels.

So here is the solution for both: on destination pass CPUID to kvm and
it should come back unchanged.  If it changed you fail migration.

> 
> Note that I am not saying that migrating all MSRs is wrong. It _is_
> good, as long as:
> - The destination never ignores any of the incoming MSR values.

What I am saying for MSRs added in last 2 years it is OK to ignore
because CPUID check will tell you if it is supported
and fail migration.

> - We don't do that by default on "pc-<version>", or we defeat the
>   purpose of machine-types.
> 
> I'll try to enumerate the problems I am trying to address (that are
> related, but are separate problems):
> 
> - MSR not being migrated when it should:
>   - Possible solution: migrate all MSRs even if qemu doesn't know what
>     they are.
>     - Constraint: migration destination must _never_ ignore any incoming
>       MSR value, because it can't decide if it is important to the guest
>       or not (with the current KVM interfaces).
>     - Problem with this solution: if we do that by default on
>       "pc-<version>", we break migration compatibility between hosts
>       with different kernel versions.

Solution: add vcpu ioctl that tells you which MSRs to migrate
(on source), depending on CPUID.

> - Changing CPUID bits under guest's feet.
>   - Proposed solution: migrating CPUID bits, refuse migration if
>     destination doesn't support the same bits.
>     - It solves the compatibility problem for migration to a newer
>       kernel, but not to an older kernel. It helps to solve part of
>       the problem, but not all.

How does not it save all of the problem? If destination kernel
can support cpuid, then we are fine - it is new enough.

>   - Alternative solution: simply make the resulting CPUID bits not be a
>     function of the host kernel capabilities, but only of the qemu
>     configuration (except on "-cpu host" and "-M pc-next").

This perpetuates existing duplication of code between
kvm and qemu. We are better off with logic in 1 place.

> - Migration compatibility/predictability:
>   - See my example above: feature introduced in a newer kernel,
>     migration to an older kernel.

If it is enabled then migration fails.

>   - The only way I see to guarantee this is to never enable unknown
>     CPUID bits or MSRs. People who don't care about predictable
>     migration compatibility can use "-M pc-next", then.
> 

Guarantee what?
Just check dst can support all msrs and cpuid bits of src.
Way to check is to ask kvm :) Not to add logic in qemu.



> > 
> > > > > 
> > > > > 
> > > > > > > Too late for 1.2?
> > > > > > 
> > > > > > Absolutely (in my opinion).
> > > > > > 
> > > > > > > 
> > > > > > > > While we don't make the KVM feature-bit handling sane (with defaults
> > > > > > > > that are not blindly derived from the host kernel capabilities), maybe
> > > > > > > > the safest bet is to expect users to not migrate between hosts running
> > > > > > > > kernels with different KVM capabilities? (I am not sure which option is
> > > > > > > > better)
> > > > > > > 
> > > > > > > Sorry not sure what you talk about here. What has KVM feature-bit
> > > > > > > handling to do with this patchset?
> > > > > > 
> > > > > > Everything? The whole point of this patch is to filter out the PV_EOI
> > > > > > KVM feature bit.
> > > > > > 
> > > > > > 
> > > > > > This part of the current code, specifically, is wrong:
> > > > > > 
> > > > > > > > >      plus_kvm_features = ~0; /* not supported bits will be filtered out later */
> > > > > > 
> > > > > > The QEMU-side list of KVM features should be whitelist-based, not
> > > > > > blacklist-based (unless the user doesn't need migration, in that case he
> > > > > > can use "-cpu host" and get every feature blindly enabled), because QEMU
> > > > > > can't know if a new feature involves guest-visible state that has to be
> > > > > > migrated.
> > > > > > 
> > > > > > 
> > > > > > > 
> > > > > > > > 
> > > > > > > > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > > > > > > > > ---
> > > > > > > > >  hw/Makefile.objs  |  2 +-
> > > > > > > > >  hw/cpu_flags.c    | 32 ++++++++++++++++++++++++++++++++
> > > > > > > > >  hw/cpu_flags.h    |  9 +++++++++
> > > > > > > > >  hw/pc_piix.c      |  2 ++
> > > > > > > > >  target-i386/cpu.c |  8 ++++++++
> > > > > > > > >  5 files changed, 52 insertions(+), 1 deletion(-)
> > > > > > > > >  create mode 100644 hw/cpu_flags.c
> > > > > > > > >  create mode 100644 hw/cpu_flags.h
> > > > > > > > > 
> > > > > > > > > diff --git a/hw/Makefile.objs b/hw/Makefile.objs
> > > > > > > > > index 850b87b..3f2532a 100644
> > > > > > > > > --- a/hw/Makefile.objs
> > > > > > > > > +++ b/hw/Makefile.objs
> > > > > > > > > @@ -1,5 +1,5 @@
> > > > > > > > >  hw-obj-y = usb/ ide/
> > > > > > > > > -hw-obj-y += loader.o
> > > > > > > > > +hw-obj-y += loader.o cpu_flags.o
> > > > > > > > >  hw-obj-$(CONFIG_VIRTIO) += virtio-console.o
> > > > > > > > >  hw-obj-$(CONFIG_VIRTIO_PCI) += virtio-pci.o
> > > > > > > > >  hw-obj-y += fw_cfg.o
> > > > > > > > > diff --git a/hw/cpu_flags.c b/hw/cpu_flags.c
> > > > > > > > > new file mode 100644
> > > > > > > > > index 0000000..7a633c0
> > > > > > > > > --- /dev/null
> > > > > > > > > +++ b/hw/cpu_flags.c
> > > > > > > > > @@ -0,0 +1,32 @@
> > > > > > > > > +/*
> > > > > > > > > + * CPU compatibility flags.
> > > > > > > > > + *
> > > > > > > > > + * Copyright (c) 2012 Red Hat Inc.
> > > > > > > > > + * Author: Michael S. Tsirkin.
> > > > > > > > > + *
> > > > > > > > > + * This program is free software; you can redistribute it and/or modify
> > > > > > > > > + * it under the terms of the GNU General Public License as published by
> > > > > > > > > + * the Free Software Foundation; either version 2 of the License, or
> > > > > > > > > + * (at your option) any later version.
> > > > > > > > > + *
> > > > > > > > > + * This program is distributed in the hope that it will be useful,
> > > > > > > > > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > > > > > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > > > > > > > > + * GNU General Public License for more details.
> > > > > > > > > + *
> > > > > > > > > + * You should have received a copy of the GNU General Public License along
> > > > > > > > > + * with this program; if not, see <http://www.gnu.org/licenses/>.
> > > > > > > > > + */
> > > > > > > > > +#include "hw/cpu_flags.h"
> > > > > > > > > +
> > > > > > > > > +static bool kvm_pv_eoi_disabled_state;
> > > > > > > > > +
> > > > > > > > > +void disable_kvm_pv_eoi(void)
> > > > > > > > > +{
> > > > > > > > > +       kvm_pv_eoi_disabled_state = true;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +bool kvm_pv_eoi_disabled(void)
> > > > > > > > > +{
> > > > > > > > > +       return kvm_pv_eoi_disabled_state;
> > > > > > > > > +}
> > > > > > > > > diff --git a/hw/cpu_flags.h b/hw/cpu_flags.h
> > > > > > > > > new file mode 100644
> > > > > > > > > index 0000000..05777b6
> > > > > > > > > --- /dev/null
> > > > > > > > > +++ b/hw/cpu_flags.h
> > > > > > > > > @@ -0,0 +1,9 @@
> > > > > > > > > +#ifndef HW_CPU_FLAGS_H
> > > > > > > > > +#define HW_CPU_FLAGS_H
> > > > > > > > > +
> > > > > > > > > +#include <stdbool.h>
> > > > > > > > > +
> > > > > > > > > +void disable_kvm_pv_eoi(void);
> > > > > > > > > +bool kvm_pv_eoi_disabled(void);
> > > > > > > > > +
> > > > > > > > > +#endif
> > > > > > > > > diff --git a/hw/pc_piix.c b/hw/pc_piix.c
> > > > > > > > > index 008d42f..bdbceda 100644
> > > > > > > > > --- a/hw/pc_piix.c
> > > > > > > > > +++ b/hw/pc_piix.c
> > > > > > > > > @@ -46,6 +46,7 @@
> > > > > > > > >  #ifdef CONFIG_XEN
> > > > > > > > >  #  include <xen/hvm/hvm_info_table.h>
> > > > > > > > >  #endif
> > > > > > > > > +#include "cpu_flags.h"
> > > > > > > > >  
> > > > > > > > >  #define MAX_IDE_BUS 2
> > > > > > > > >  
> > > > > > > > > @@ -371,6 +372,7 @@ static QEMUMachine pc_machine_v1_2 = {
> > > > > > > > >  
> > > > > > > > >  static void pc_machine_v1_1_compat(void)
> > > > > > > > >  {
> > > > > > > > > +    disable_kvm_pv_eoi();
> > > > > > > > >  }
> > > > > > > > >  
> > > > > > > > >  static void pc_init_pci_v1_1(ram_addr_t ram_size,
> > > > > > > > > diff --git a/target-i386/cpu.c b/target-i386/cpu.c
> > > > > > > > > index 120a2e3..0d02fd1 100644
> > > > > > > > > --- a/target-i386/cpu.c
> > > > > > > > > +++ b/target-i386/cpu.c
> > > > > > > > > @@ -23,6 +23,7 @@
> > > > > > > > >  
> > > > > > > > >  #include "cpu.h"
> > > > > > > > >  #include "kvm.h"
> > > > > > > > > +#include "asm/kvm_para.h"
> > > > > > > > >  
> > > > > > > > >  #include "qemu-option.h"
> > > > > > > > >  #include "qemu-config.h"
> > > > > > > > > @@ -33,6 +34,7 @@
> > > > > > > > >  #include "hyperv.h"
> > > > > > > > >  
> > > > > > > > >  #include "hw/hw.h"
> > > > > > > > > +#include "hw/cpu_flags.h"
> > > > > > > > >  
> > > > > > > > >  /* feature flags taken from "Intel Processor Identification and the CPUID
> > > > > > > > >   * Instruction" and AMD's "CPUID Specification".  In cases of disagreement
> > > > > > > > > @@ -889,6 +891,12 @@ static int cpu_x86_find_by_name(x86_def_t *x86_cpu_def, const char *cpu_model)
> > > > > > > > >  
> > > > > > > > >      plus_kvm_features = ~0; /* not supported bits will be filtered out later */
> > > > > > > > >  
> > > > > > > > > +    /* Disable PV EOI for old machine types.
> > > > > > > > > +     * Feature flags can still override. */
> > > > > > > > > +    if (kvm_pv_eoi_disabled()) {
> > > > > > > > > +        plus_kvm_features &= ~(0x1 << KVM_FEATURE_PV_EOI);
> > > > > > > > > +    }
> > > > > > > > > +
> > > > > > > > >      add_flagname_to_bitmaps("hypervisor", &plus_features,
> > > > > > > > >          &plus_ext_features, &plus_ext2_features, &plus_ext3_features,
> > > > > > > > >          &plus_kvm_features, &plus_svm_features);
> > > > > > > > > -- 
> > > > > > > > > MST
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > 
> > > > > > > > -- 
> > > > > > > > Eduardo
> > > > > > 
> > > > > > -- 
> > > > > > Eduardo
> > > 
> > > -- 
> > > Eduardo
> 
> -- 
> Eduardo
Anthony Liguori Aug. 29, 2012, 1:36 p.m. UTC | #15
"Michael S. Tsirkin" <mst@redhat.com> writes:

> In preparation for adding PV EOI support, disable PV EOI by default for
> 1.1 and older machine types, to avoid CPUID changing during migration.
>
> PV EOI can still be enabled/disabled by specifying it explicitly.
>     Enable for 1.1
>     -M pc-1.1 -cpu kvm64,+kvm_pv_eoi
>     Disable for 1.2
>     -M pc-1.2 -cpu kvm64,-kvm_pv_eoi
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

As best I can tell, we're papering over an ABI breakage in KVM.

If an old QEMU attempts to do a live migration on a new kernel,
migrating to an QEMU on a different box with an older kernel, it will
fail because there is state that isn't being migrated.

This ought to be fixed in the kernel by making these features
whitelisted by userspace.

Regards,

Anthony Liguori


> ---
>  hw/Makefile.objs  |  2 +-
>  hw/cpu_flags.c    | 32 ++++++++++++++++++++++++++++++++
>  hw/cpu_flags.h    |  9 +++++++++
>  hw/pc_piix.c      |  2 ++
>  target-i386/cpu.c |  8 ++++++++
>  5 files changed, 52 insertions(+), 1 deletion(-)
>  create mode 100644 hw/cpu_flags.c
>  create mode 100644 hw/cpu_flags.h
>
> diff --git a/hw/Makefile.objs b/hw/Makefile.objs
> index 850b87b..3f2532a 100644
> --- a/hw/Makefile.objs
> +++ b/hw/Makefile.objs
> @@ -1,5 +1,5 @@
>  hw-obj-y = usb/ ide/
> -hw-obj-y += loader.o
> +hw-obj-y += loader.o cpu_flags.o
>  hw-obj-$(CONFIG_VIRTIO) += virtio-console.o
>  hw-obj-$(CONFIG_VIRTIO_PCI) += virtio-pci.o
>  hw-obj-y += fw_cfg.o
> diff --git a/hw/cpu_flags.c b/hw/cpu_flags.c
> new file mode 100644
> index 0000000..7a633c0
> --- /dev/null
> +++ b/hw/cpu_flags.c
> @@ -0,0 +1,32 @@
> +/*
> + * CPU compatibility flags.
> + *
> + * Copyright (c) 2012 Red Hat Inc.
> + * Author: Michael S. Tsirkin.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +#include "hw/cpu_flags.h"
> +
> +static bool kvm_pv_eoi_disabled_state;
> +
> +void disable_kvm_pv_eoi(void)
> +{
> +       kvm_pv_eoi_disabled_state = true;
> +}
> +
> +bool kvm_pv_eoi_disabled(void)
> +{
> +       return kvm_pv_eoi_disabled_state;
> +}
> diff --git a/hw/cpu_flags.h b/hw/cpu_flags.h
> new file mode 100644
> index 0000000..05777b6
> --- /dev/null
> +++ b/hw/cpu_flags.h
> @@ -0,0 +1,9 @@
> +#ifndef HW_CPU_FLAGS_H
> +#define HW_CPU_FLAGS_H
> +
> +#include <stdbool.h>
> +
> +void disable_kvm_pv_eoi(void);
> +bool kvm_pv_eoi_disabled(void);
> +
> +#endif
> diff --git a/hw/pc_piix.c b/hw/pc_piix.c
> index 008d42f..bdbceda 100644
> --- a/hw/pc_piix.c
> +++ b/hw/pc_piix.c
> @@ -46,6 +46,7 @@
>  #ifdef CONFIG_XEN
>  #  include <xen/hvm/hvm_info_table.h>
>  #endif
> +#include "cpu_flags.h"
>  
>  #define MAX_IDE_BUS 2
>  
> @@ -371,6 +372,7 @@ static QEMUMachine pc_machine_v1_2 = {
>  
>  static void pc_machine_v1_1_compat(void)
>  {
> +    disable_kvm_pv_eoi();
>  }
>  
>  static void pc_init_pci_v1_1(ram_addr_t ram_size,
> diff --git a/target-i386/cpu.c b/target-i386/cpu.c
> index 120a2e3..0d02fd1 100644
> --- a/target-i386/cpu.c
> +++ b/target-i386/cpu.c
> @@ -23,6 +23,7 @@
>  
>  #include "cpu.h"
>  #include "kvm.h"
> +#include "asm/kvm_para.h"
>  
>  #include "qemu-option.h"
>  #include "qemu-config.h"
> @@ -33,6 +34,7 @@
>  #include "hyperv.h"
>  
>  #include "hw/hw.h"
> +#include "hw/cpu_flags.h"
>  
>  /* feature flags taken from "Intel Processor Identification and the CPUID
>   * Instruction" and AMD's "CPUID Specification".  In cases of disagreement
> @@ -889,6 +891,12 @@ static int cpu_x86_find_by_name(x86_def_t *x86_cpu_def, const char *cpu_model)
>  
>      plus_kvm_features = ~0; /* not supported bits will be filtered out later */
>  
> +    /* Disable PV EOI for old machine types.
> +     * Feature flags can still override. */
> +    if (kvm_pv_eoi_disabled()) {
> +        plus_kvm_features &= ~(0x1 << KVM_FEATURE_PV_EOI);
> +    }
> +
>      add_flagname_to_bitmaps("hypervisor", &plus_features,
>          &plus_ext_features, &plus_ext2_features, &plus_ext3_features,
>          &plus_kvm_features, &plus_svm_features);
> -- 
> MST
Gleb Natapov Aug. 29, 2012, 1:40 p.m. UTC | #16
On Wed, Aug 29, 2012 at 08:36:30AM -0500, Anthony Liguori wrote:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> 
> > In preparation for adding PV EOI support, disable PV EOI by default for
> > 1.1 and older machine types, to avoid CPUID changing during migration.
> >
> > PV EOI can still be enabled/disabled by specifying it explicitly.
> >     Enable for 1.1
> >     -M pc-1.1 -cpu kvm64,+kvm_pv_eoi
> >     Disable for 1.2
> >     -M pc-1.2 -cpu kvm64,-kvm_pv_eoi
> >
> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> 
> As best I can tell, we're papering over an ABI breakage in KVM.
> 
> If an old QEMU attempts to do a live migration on a new kernel,
> migrating to an QEMU on a different box with an older kernel, it will
> fail because there is state that isn't being migrated.
> 
> This ought to be fixed in the kernel by making these features
> whitelisted by userspace.
> 
What do you mean. Userspace and only userspace decides what cpuid bits
will be seen by a guest. Currently userspace enables all PV cpuid bits
it finds.

--
			Gleb.
Eduardo Habkost Aug. 29, 2012, 1:49 p.m. UTC | #17
On Wed, Aug 29, 2012 at 04:18:12PM +0300, Michael S. Tsirkin wrote:
> On Wed, Aug 29, 2012 at 09:56:12AM -0300, Eduardo Habkost wrote:
> > On Wed, Aug 29, 2012 at 01:06:32PM +0300, Michael S. Tsirkin wrote:
> > > On Tue, Aug 28, 2012 at 08:50:09PM -0300, Eduardo Habkost wrote:
> > > > On Wed, Aug 29, 2012 at 01:25:35AM +0300, Michael S. Tsirkin wrote:
> > > > > On Wed, Aug 29, 2012 at 01:21:13AM +0300, Michael S. Tsirkin wrote:
> > > > > > On Tue, Aug 28, 2012 at 07:02:42PM -0300, Eduardo Habkost wrote:
> > > > > > > On Wed, Aug 29, 2012 at 12:35:28AM +0300, Michael S. Tsirkin wrote:
> > > > > > > > On Tue, Aug 28, 2012 at 04:13:38PM -0300, Eduardo Habkost wrote:
> > > > > > > > > On Tue, Aug 28, 2012 at 08:43:52PM +0300, Michael S. Tsirkin wrote:
> > > > > > > > > > In preparation for adding PV EOI support, disable PV EOI by default for
> > > > > > > > > > 1.1 and older machine types, to avoid CPUID changing during migration.
> > > > > > > > > > 
> > > > > > > > > > PV EOI can still be enabled/disabled by specifying it explicitly.
> > > > > > > > > >     Enable for 1.1
> > > > > > > > > >     -M pc-1.1 -cpu kvm64,+kvm_pv_eoi
> > > > > > > > > >     Disable for 1.2
> > > > > > > > > >     -M pc-1.2 -cpu kvm64,-kvm_pv_eoi
> > > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > What about users that are already running "qemu-1.1 -M pc-1.1" on a host
> > > > > > > > > kernel that supports PV EOI already? They would get PV EOI disabled when
> > > > > > > > > migrating to a destination running "qemu-1.2 -M pc-1.1".
> > > > > > > > > 
> > > > > > > > > (On the other hand, people running "qemu-1.1 -M pc-1.1" on a host kernel
> > > > > > > > > supporting PV EOI already have migration broken, so there's not much we
> > > > > > > > > can do for them)
> > > > > > > > 
> > > > > > > > Exactly.
> > > > > > > > 
> > > > > > > > Talked to Gleb, long term I think we should rework code to make
> > > > > > > > it forward-compatible wrt adding new MSRs:
> > > > > > > > - source gets list of MSRs to be migrated from KVM and simply sends them all
> > > > > > > > - send all MSRS in key/value format
> > > > > > > > - destination gets list of MSRs to be migrated from KVM and
> > > > > > > >   only restores the supported ones
> > > > > > > 
> > > > > > > As far as I understand the migration code requirements/expectations, if
> > > > > > > the origin is sending some data, it is because it is part of the
> > > > > > > guest-visible machine state that must be kept while migrating. Because
> > > > > > > of that, the destination is not allowed to drop anything it doesn't know
> > > > > > > about.
> > > > > > 
> > > > > > 
> > > > > > We have a ton of code that reads in values then just
> > > > > > ignores them, for compat with old qemu.
> > > > 
> > > > The difference is that you ignore only the values you _know_ to be safe
> > > > to be ignored. You can't ignore a value simply because you don't know
> > > > what it means, and if it's important or not.
> > > > > > This will be exactly such a case:
> > > > > > we don't drop anything - protocol does not
> > > > > > support this. We read and simply do not tell kvm
> > > > > > about it.
> > > > 
> > > > This fits the definition of "dropping", to me.
> > > > 
> > > > > We also have tons of code that sends
> > > > > > useless values again for compatibility.
> > > > 
> > > > But these values should be ignored only if the receiver knows exactly
> > > > what it means, and knows exactly why/when it can be ignored.
> > > 
> > > Sorry I just do not understand these meta arguments.  Do you mean
> > > the example below? If yes let us focus on it please.
> > 
> > We have to have these meta arguments, because the problem is about
> > features/MSRs that will be introduced in the future. But okay, let's
> > follow with the specific argument:
> > 
> > > 
> > > > > > 
> > > > > > > At the same time, if it's not part of guest-visible machine
> > > > > > > state, it doesn't have to be sent by the migration origin.
> > > > 
> > > > It may be not directly visible, but if it's part of the machine state,
> > > > it affects the guest in some way. If it didn't affect the guest in any
> > > > way, we wouldn't even have to send it while migrating, in the first
> > > > place.
> > > > 
> > > > > > > 
> > > > > > 
> > > > > > False, we often send internal device state which is not
> > > > > > directly guest visible.
> > > > > 
> > > > > Besides, all MSRs in KVM's MSR list are actually guest visible, even if
> > > > > guests normally do not invoke these MSRs - they can.
> > > > 
> > > > The difference is that the machine doesn't guarantee a MSR to be
> > > > migrated unless the corresponding feature bit reports the MSR as
> > > > supported.
> > > > 
> > > > e.g. using the PV EOI MSR without having the PV EOI CPUID bit set is
> > > > undefined behavior. The guest shouldn't do it.
> > > 
> > > So the problematic scenario involves a guest that sets PV EOI when CPUID
> > > bit is off on source. Why do we even care what happens on migration?
> > 
> > This is the case where there is _no_ problem. But the migration
> > destination can't know that, how can it decide if it's safe to drop the
> > MSR value or not?
> 
> I looked and it seems always safe to drop MSRs that we have.
> Any counter examples.

It's never correct to drop any MSR that is reported as present to the
guest. If it was safe and correct to drop them, why are you trying to
migrate them?  ;-)

Also, if we did that, we would be forced to never add a
"not-safe-to-drop" MSR to the list of MSRs to save in KVM. I don't think
that's part of the KVM_GET_MSR_INDEX_LIST semantics.


> 
> > > 
> > > > On the other hand, if the PV EOI CPUID bit is set, the MSR should be
> > > > always kept, no matter what.
> > > > 
> > > > This means that we need to migrate the MSR only if the corresponding
> > > > CPUID bit is enabled.
> > > > 
> > > 
> > > I do not follow. Why does it mean that?
> > > It seems completely safe to migrate this MSR no matter what.
> > 
> > It is completely safe to migrate the MSR, but it is completely unsafe to
> > _ignore_ the incoming MSR value. That's the problem.
> 
> Normally CPUID will tell you if such important MSR is available.
> So we can check that at destination.

How can qemu check it, if when the qemu code was written when the MSR
didn't even exist yet?

(You could add an interface to check for that, but there's no KVM
ioctl() to tell qemu "given these CPUID bits, can I safely drop this MSR
that I don't even know about?")


> 
> > > 
> > > > > 
> > > > > > > On the other hand, a mode of operation that doesn't require updating
> > > > > > > QEMU every time there's a new bit of guest-visible state to be migrated
> > > > > > > would be nice (just like the "-cpu host" mode, that doesn't require
> > > > > > > updating QEMU for every new CPU feature, is nice for some use cases). I
> > > > > > > just don't know how to make work with the current migration protocol.
> > > > > > > 
> > > > > > 
> > > > > > I don't understand. What is the problem with the proposal?
> > > > > > What will not work with our protocol?
> > > > > > Can you give an example please?
> > > > 
> > > > Yes:
> > > > 
> > > > Suppose kernel 3.7 introduces KVM_FOO_MSR and CPUID_KVM_FOO.
> > > > 
> > > > Also, suppose QEMU 1.2 doesn't know anything about KVM_FOO, because it
> > > > was release before this feature was introduced.
> > > > 
> > > > 
> > > > Then you run "qemu-1.2 -M pc-1.2" on a 3.7 host kernel. qemu-1.2 can do
> > > > two things here:
> > > > 
> > > > 1) Not enable CPUID_KVM_FOO
> > > > 
> > > > In this case, the guest should not use KVM_FOO_MSR and the MSR does not
> > > > need to be migrated (the guest may try to use it, but the behavior when
> > > > trying to use it is undefined). Sending the MSR value when migrating
> > > > would be useless.
> > > > 
> > > > 
> > > > 2) Enable CPUID_KVM_FOO.
> > > > 
> > > > In this case, the guest may try to use the feature and write something
> > > > into KVM_FOO_MSR. Sending the MSR value when migrating is absolutely
> > > > necessary
> > > > 
> > > > ---
> > > > 
> > > > Now assume you run "qemu-1.2 -M pc-1.2" in the destination host, running
> > > > the 3.6 kernel (without KVM_FOO).
> > > > 
> > > > Then qemu-1.2 receives the KVM_FOO_MSR data in the migration stream. In
> > > > this case, qemu-1.2 simply can't decide if it's safe to drop the data
> > > > (and not tell KVM about it), or not.
> > > > If we simply send every MSR reported by the kernel, both the origin and
> > > > destination qemu-1.2 processes can't ever know if the MSR value is
> > > > important or not, because it doesn't know if it's part of the machine
> > > > state that has to be kept consistent.
> > > > We could introduce a mode of operation where _every_ MSR reported by KVM
> > > > is important and sent by the origin (and also where every MSR must be
> > > > handled by the destination, otherwise migration would fail). But this
> > > > new mode would break migration compatibility between two hosts running
> > > > the same machine-type, only because they are running different kernel
> > > > versions. But it may be useful for some use cases, so maybe it's
> > > > appropriate for a future "pc-next" machine-type (and for "-cpu host"),
> > > > but not for "pc-<version>".
> > > > 
> > > 
> > > In this example, we should migrate CPUID (don't we?).
> > > Destination should validate that CPUID supplied by source
> > > matches what it can support (doesn't it?).
> > > 
> > > If we do and it does not, it's an unrelated bug:
> > > CPUID changing under guest's feet.
> > 
> > CPUID changing under guest's feet is another problem, that we also have
> > to solve.
> > But we also have the problem of migration compatibility
> > between different host kernels.
> 
> So here is the solution for both: on destination pass CPUID to kvm and
> it should come back unchanged.  If it changed you fail migration.

This doesn't solve the problem of having predictable migration
compatibility for "-M pc-<old-version>".

The whole point of machine-types is to expose the "same machine" to the
guest, even if you change the hardware or host kernel. "qemu -M pc-1.x"
must expose the same machine configuration to the guest, it doesn 't
matter what's the host kernel version.


> 
> > 
> > Note that I am not saying that migrating all MSRs is wrong. It _is_
> > good, as long as:
> > - The destination never ignores any of the incoming MSR values.
> 
> What I am saying for MSRs added in last 2 years it is OK to ignore
> because CPUID check will tell you if it is supported
> and fail migration.

Existing MSRs are easy to make work. The problem is about MSRs added to
the "msrs_to_save" list in the future.

Also, the problem is not about being "safe" to ignore the MSR values,
it's about being "correct" (part of the expected behavior of the virtual
machine). The fact that most guests doen't crash when the virtual
machine doesn't behave as it should doesn't mean we should do it.

Either the MSR is part of the machine state (and relevant to the guest),
or not. If it is relevant, it must be _always_ migrated and never
dropped by the destination. If it is not, it's useless to migrate it.


> 
> > - We don't do that by default on "pc-<version>", or we defeat the
> >   purpose of machine-types.
> > 
> > I'll try to enumerate the problems I am trying to address (that are
> > related, but are separate problems):
> > 
> > - MSR not being migrated when it should:
> >   - Possible solution: migrate all MSRs even if qemu doesn't know what
> >     they are.
> >     - Constraint: migration destination must _never_ ignore any incoming
> >       MSR value, because it can't decide if it is important to the guest
> >       or not (with the current KVM interfaces).
> >     - Problem with this solution: if we do that by default on
> >       "pc-<version>", we break migration compatibility between hosts
> >       with different kernel versions.
> 
> Solution: add vcpu ioctl that tells you which MSRs to migrate
> (on source), depending on CPUID.

This may be a solution for old-kernel => new-kernel migration, yes. But
this still doesn't solve the migration compatibility problem for
new-kernel => old-kernel migration (see below).


> 
> > - Changing CPUID bits under guest's feet.
> >   - Proposed solution: migrating CPUID bits, refuse migration if
> >     destination doesn't support the same bits.
> >     - It solves the compatibility problem for migration to a newer
> >       kernel, but not to an older kernel. It helps to solve part of
> >       the problem, but not all.
> 
> How does not it save all of the problem? If destination kernel
> can support cpuid, then we are fine - it is new enough.

See below. The problem is being able to migrate to an older host.
That's the whole point of machine-types!

> 
> >   - Alternative solution: simply make the resulting CPUID bits not be a
> >     function of the host kernel capabilities, but only of the qemu
> >     configuration (except on "-cpu host" and "-M pc-next").
> 
> This perpetuates existing duplication of code between
> kvm and qemu. We are better off with logic in 1 place.

Yes, it does, and I would love to avoid having the list inside QEMU,
too. But we can't avoid that because each machine-type defines a set of
available features/MSRs, so we have to have a machine-type =>
list-of-features list somewhere, unfortunately.


> 
> > - Migration compatibility/predictability:
> >   - See my example above: feature introduced in a newer kernel,
> >     migration to an older kernel.
> 
> If it is enabled then migration fails.
> 
> >   - The only way I see to guarantee this is to never enable unknown
> >     CPUID bits or MSRs. People who don't care about predictable
> >     migration compatibility can use "-M pc-next", then.
> > 
> 
> Guarantee what?

Guarantee that "-M pc-1.1" machines can be migrated to any host that is
already capable of running "-M pc-1.1".


> Just check dst can support all msrs and cpuid bits of src.
> Way to check is to ask kvm :) Not to add logic in qemu.

Checking and making migration fail when it has to fail is not the
problem. The problem is that now "qemu -M pc-1.x" will result in a
different machine, depending on the host kernel version. This causes two
problems:

- Now you don't know if your existing machine can be migrated to a
  host running an older kernel (because now migration can fail even when
  you are using the same machine-type on both sides).
- Different VMs using the same machine-type will get different machines
  (with different sets of features), because they are running on
  different kernel versions.

This may be acceptable for "pc-next", but not for "pc-<version>".

> 
> 
> 
> > > 
> > > > > > 
> > > > > > 
> > > > > > > > Too late for 1.2?
> > > > > > > 
> > > > > > > Absolutely (in my opinion).
> > > > > > > 
> > > > > > > > 
> > > > > > > > > While we don't make the KVM feature-bit handling sane (with defaults
> > > > > > > > > that are not blindly derived from the host kernel capabilities), maybe
> > > > > > > > > the safest bet is to expect users to not migrate between hosts running
> > > > > > > > > kernels with different KVM capabilities? (I am not sure which option is
> > > > > > > > > better)
> > > > > > > > 
> > > > > > > > Sorry not sure what you talk about here. What has KVM feature-bit
> > > > > > > > handling to do with this patchset?
> > > > > > > 
> > > > > > > Everything? The whole point of this patch is to filter out the PV_EOI
> > > > > > > KVM feature bit.
> > > > > > > 
> > > > > > > 
> > > > > > > This part of the current code, specifically, is wrong:
> > > > > > > 
> > > > > > > > > >      plus_kvm_features = ~0; /* not supported bits will be filtered out later */
> > > > > > > 
> > > > > > > The QEMU-side list of KVM features should be whitelist-based, not
> > > > > > > blacklist-based (unless the user doesn't need migration, in that case he
> > > > > > > can use "-cpu host" and get every feature blindly enabled), because QEMU
> > > > > > > can't know if a new feature involves guest-visible state that has to be
> > > > > > > migrated.
> > > > > > > 
> > > > > > > 
> > > > > > > > 
> > > > > > > > > 
> > > > > > > > > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > > > > > > > > > ---
> > > > > > > > > >  hw/Makefile.objs  |  2 +-
> > > > > > > > > >  hw/cpu_flags.c    | 32 ++++++++++++++++++++++++++++++++
> > > > > > > > > >  hw/cpu_flags.h    |  9 +++++++++
> > > > > > > > > >  hw/pc_piix.c      |  2 ++
> > > > > > > > > >  target-i386/cpu.c |  8 ++++++++
> > > > > > > > > >  5 files changed, 52 insertions(+), 1 deletion(-)
> > > > > > > > > >  create mode 100644 hw/cpu_flags.c
> > > > > > > > > >  create mode 100644 hw/cpu_flags.h
> > > > > > > > > > 
> > > > > > > > > > diff --git a/hw/Makefile.objs b/hw/Makefile.objs
> > > > > > > > > > index 850b87b..3f2532a 100644
> > > > > > > > > > --- a/hw/Makefile.objs
> > > > > > > > > > +++ b/hw/Makefile.objs
> > > > > > > > > > @@ -1,5 +1,5 @@
> > > > > > > > > >  hw-obj-y = usb/ ide/
> > > > > > > > > > -hw-obj-y += loader.o
> > > > > > > > > > +hw-obj-y += loader.o cpu_flags.o
> > > > > > > > > >  hw-obj-$(CONFIG_VIRTIO) += virtio-console.o
> > > > > > > > > >  hw-obj-$(CONFIG_VIRTIO_PCI) += virtio-pci.o
> > > > > > > > > >  hw-obj-y += fw_cfg.o
> > > > > > > > > > diff --git a/hw/cpu_flags.c b/hw/cpu_flags.c
> > > > > > > > > > new file mode 100644
> > > > > > > > > > index 0000000..7a633c0
> > > > > > > > > > --- /dev/null
> > > > > > > > > > +++ b/hw/cpu_flags.c
> > > > > > > > > > @@ -0,0 +1,32 @@
> > > > > > > > > > +/*
> > > > > > > > > > + * CPU compatibility flags.
> > > > > > > > > > + *
> > > > > > > > > > + * Copyright (c) 2012 Red Hat Inc.
> > > > > > > > > > + * Author: Michael S. Tsirkin.
> > > > > > > > > > + *
> > > > > > > > > > + * This program is free software; you can redistribute it and/or modify
> > > > > > > > > > + * it under the terms of the GNU General Public License as published by
> > > > > > > > > > + * the Free Software Foundation; either version 2 of the License, or
> > > > > > > > > > + * (at your option) any later version.
> > > > > > > > > > + *
> > > > > > > > > > + * This program is distributed in the hope that it will be useful,
> > > > > > > > > > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > > > > > > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > > > > > > > > > + * GNU General Public License for more details.
> > > > > > > > > > + *
> > > > > > > > > > + * You should have received a copy of the GNU General Public License along
> > > > > > > > > > + * with this program; if not, see <http://www.gnu.org/licenses/>.
> > > > > > > > > > + */
> > > > > > > > > > +#include "hw/cpu_flags.h"
> > > > > > > > > > +
> > > > > > > > > > +static bool kvm_pv_eoi_disabled_state;
> > > > > > > > > > +
> > > > > > > > > > +void disable_kvm_pv_eoi(void)
> > > > > > > > > > +{
> > > > > > > > > > +       kvm_pv_eoi_disabled_state = true;
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > > +bool kvm_pv_eoi_disabled(void)
> > > > > > > > > > +{
> > > > > > > > > > +       return kvm_pv_eoi_disabled_state;
> > > > > > > > > > +}
> > > > > > > > > > diff --git a/hw/cpu_flags.h b/hw/cpu_flags.h
> > > > > > > > > > new file mode 100644
> > > > > > > > > > index 0000000..05777b6
> > > > > > > > > > --- /dev/null
> > > > > > > > > > +++ b/hw/cpu_flags.h
> > > > > > > > > > @@ -0,0 +1,9 @@
> > > > > > > > > > +#ifndef HW_CPU_FLAGS_H
> > > > > > > > > > +#define HW_CPU_FLAGS_H
> > > > > > > > > > +
> > > > > > > > > > +#include <stdbool.h>
> > > > > > > > > > +
> > > > > > > > > > +void disable_kvm_pv_eoi(void);
> > > > > > > > > > +bool kvm_pv_eoi_disabled(void);
> > > > > > > > > > +
> > > > > > > > > > +#endif
> > > > > > > > > > diff --git a/hw/pc_piix.c b/hw/pc_piix.c
> > > > > > > > > > index 008d42f..bdbceda 100644
> > > > > > > > > > --- a/hw/pc_piix.c
> > > > > > > > > > +++ b/hw/pc_piix.c
> > > > > > > > > > @@ -46,6 +46,7 @@
> > > > > > > > > >  #ifdef CONFIG_XEN
> > > > > > > > > >  #  include <xen/hvm/hvm_info_table.h>
> > > > > > > > > >  #endif
> > > > > > > > > > +#include "cpu_flags.h"
> > > > > > > > > >  
> > > > > > > > > >  #define MAX_IDE_BUS 2
> > > > > > > > > >  
> > > > > > > > > > @@ -371,6 +372,7 @@ static QEMUMachine pc_machine_v1_2 = {
> > > > > > > > > >  
> > > > > > > > > >  static void pc_machine_v1_1_compat(void)
> > > > > > > > > >  {
> > > > > > > > > > +    disable_kvm_pv_eoi();
> > > > > > > > > >  }
> > > > > > > > > >  
> > > > > > > > > >  static void pc_init_pci_v1_1(ram_addr_t ram_size,
> > > > > > > > > > diff --git a/target-i386/cpu.c b/target-i386/cpu.c
> > > > > > > > > > index 120a2e3..0d02fd1 100644
> > > > > > > > > > --- a/target-i386/cpu.c
> > > > > > > > > > +++ b/target-i386/cpu.c
> > > > > > > > > > @@ -23,6 +23,7 @@
> > > > > > > > > >  
> > > > > > > > > >  #include "cpu.h"
> > > > > > > > > >  #include "kvm.h"
> > > > > > > > > > +#include "asm/kvm_para.h"
> > > > > > > > > >  
> > > > > > > > > >  #include "qemu-option.h"
> > > > > > > > > >  #include "qemu-config.h"
> > > > > > > > > > @@ -33,6 +34,7 @@
> > > > > > > > > >  #include "hyperv.h"
> > > > > > > > > >  
> > > > > > > > > >  #include "hw/hw.h"
> > > > > > > > > > +#include "hw/cpu_flags.h"
> > > > > > > > > >  
> > > > > > > > > >  /* feature flags taken from "Intel Processor Identification and the CPUID
> > > > > > > > > >   * Instruction" and AMD's "CPUID Specification".  In cases of disagreement
> > > > > > > > > > @@ -889,6 +891,12 @@ static int cpu_x86_find_by_name(x86_def_t *x86_cpu_def, const char *cpu_model)
> > > > > > > > > >  
> > > > > > > > > >      plus_kvm_features = ~0; /* not supported bits will be filtered out later */
> > > > > > > > > >  
> > > > > > > > > > +    /* Disable PV EOI for old machine types.
> > > > > > > > > > +     * Feature flags can still override. */
> > > > > > > > > > +    if (kvm_pv_eoi_disabled()) {
> > > > > > > > > > +        plus_kvm_features &= ~(0x1 << KVM_FEATURE_PV_EOI);
> > > > > > > > > > +    }
> > > > > > > > > > +
> > > > > > > > > >      add_flagname_to_bitmaps("hypervisor", &plus_features,
> > > > > > > > > >          &plus_ext_features, &plus_ext2_features, &plus_ext3_features,
> > > > > > > > > >          &plus_kvm_features, &plus_svm_features);
> > > > > > > > > > -- 
> > > > > > > > > > MST
> > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > -- 
> > > > > > > > > Eduardo
> > > > > > > 
> > > > > > > -- 
> > > > > > > Eduardo
> > > > 
> > > > -- 
> > > > Eduardo
> > 
> > -- 
> > Eduardo
Eduardo Habkost Aug. 29, 2012, 2:03 p.m. UTC | #18
On Wed, Aug 29, 2012 at 08:36:30AM -0500, Anthony Liguori wrote:
> "Michael S. Tsirkin" <mst@redhat.com> writes:
> 
> > In preparation for adding PV EOI support, disable PV EOI by default for
> > 1.1 and older machine types, to avoid CPUID changing during migration.
> >
> > PV EOI can still be enabled/disabled by specifying it explicitly.
> >     Enable for 1.1
> >     -M pc-1.1 -cpu kvm64,+kvm_pv_eoi
> >     Disable for 1.2
> >     -M pc-1.2 -cpu kvm64,-kvm_pv_eoi
> >
> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> 
> As best I can tell, we're papering over an ABI breakage in KVM.
> 
> If an old QEMU attempts to do a live migration on a new kernel,
> migrating to an QEMU on a different box with an older kernel, it will
> fail because there is state that isn't being migrated.
> 
> This ought to be fixed in the kernel by making these features
> whitelisted by userspace.

The kernel interfaces are good. They report what's supported and let
userspace set the right CPUID bits based on a whitelist. What's broken
is the fact that QEMU simply does not have a whitelist for the KVM
features, simply blindly enabling everything. I am going to send fixes
for that for 1.3.

> 
> Regards,
> 
> Anthony Liguori
> 
> 
> > ---
> >  hw/Makefile.objs  |  2 +-
> >  hw/cpu_flags.c    | 32 ++++++++++++++++++++++++++++++++
> >  hw/cpu_flags.h    |  9 +++++++++
> >  hw/pc_piix.c      |  2 ++
> >  target-i386/cpu.c |  8 ++++++++
> >  5 files changed, 52 insertions(+), 1 deletion(-)
> >  create mode 100644 hw/cpu_flags.c
> >  create mode 100644 hw/cpu_flags.h
> >
> > diff --git a/hw/Makefile.objs b/hw/Makefile.objs
> > index 850b87b..3f2532a 100644
> > --- a/hw/Makefile.objs
> > +++ b/hw/Makefile.objs
> > @@ -1,5 +1,5 @@
> >  hw-obj-y = usb/ ide/
> > -hw-obj-y += loader.o
> > +hw-obj-y += loader.o cpu_flags.o
> >  hw-obj-$(CONFIG_VIRTIO) += virtio-console.o
> >  hw-obj-$(CONFIG_VIRTIO_PCI) += virtio-pci.o
> >  hw-obj-y += fw_cfg.o
> > diff --git a/hw/cpu_flags.c b/hw/cpu_flags.c
> > new file mode 100644
> > index 0000000..7a633c0
> > --- /dev/null
> > +++ b/hw/cpu_flags.c
> > @@ -0,0 +1,32 @@
> > +/*
> > + * CPU compatibility flags.
> > + *
> > + * Copyright (c) 2012 Red Hat Inc.
> > + * Author: Michael S. Tsirkin.
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License as published by
> > + * the Free Software Foundation; either version 2 of the License, or
> > + * (at your option) any later version.
> > + *
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License along
> > + * with this program; if not, see <http://www.gnu.org/licenses/>.
> > + */
> > +#include "hw/cpu_flags.h"
> > +
> > +static bool kvm_pv_eoi_disabled_state;
> > +
> > +void disable_kvm_pv_eoi(void)
> > +{
> > +       kvm_pv_eoi_disabled_state = true;
> > +}
> > +
> > +bool kvm_pv_eoi_disabled(void)
> > +{
> > +       return kvm_pv_eoi_disabled_state;
> > +}
> > diff --git a/hw/cpu_flags.h b/hw/cpu_flags.h
> > new file mode 100644
> > index 0000000..05777b6
> > --- /dev/null
> > +++ b/hw/cpu_flags.h
> > @@ -0,0 +1,9 @@
> > +#ifndef HW_CPU_FLAGS_H
> > +#define HW_CPU_FLAGS_H
> > +
> > +#include <stdbool.h>
> > +
> > +void disable_kvm_pv_eoi(void);
> > +bool kvm_pv_eoi_disabled(void);
> > +
> > +#endif
> > diff --git a/hw/pc_piix.c b/hw/pc_piix.c
> > index 008d42f..bdbceda 100644
> > --- a/hw/pc_piix.c
> > +++ b/hw/pc_piix.c
> > @@ -46,6 +46,7 @@
> >  #ifdef CONFIG_XEN
> >  #  include <xen/hvm/hvm_info_table.h>
> >  #endif
> > +#include "cpu_flags.h"
> >  
> >  #define MAX_IDE_BUS 2
> >  
> > @@ -371,6 +372,7 @@ static QEMUMachine pc_machine_v1_2 = {
> >  
> >  static void pc_machine_v1_1_compat(void)
> >  {
> > +    disable_kvm_pv_eoi();
> >  }
> >  
> >  static void pc_init_pci_v1_1(ram_addr_t ram_size,
> > diff --git a/target-i386/cpu.c b/target-i386/cpu.c
> > index 120a2e3..0d02fd1 100644
> > --- a/target-i386/cpu.c
> > +++ b/target-i386/cpu.c
> > @@ -23,6 +23,7 @@
> >  
> >  #include "cpu.h"
> >  #include "kvm.h"
> > +#include "asm/kvm_para.h"
> >  
> >  #include "qemu-option.h"
> >  #include "qemu-config.h"
> > @@ -33,6 +34,7 @@
> >  #include "hyperv.h"
> >  
> >  #include "hw/hw.h"
> > +#include "hw/cpu_flags.h"
> >  
> >  /* feature flags taken from "Intel Processor Identification and the CPUID
> >   * Instruction" and AMD's "CPUID Specification".  In cases of disagreement
> > @@ -889,6 +891,12 @@ static int cpu_x86_find_by_name(x86_def_t *x86_cpu_def, const char *cpu_model)
> >  
> >      plus_kvm_features = ~0; /* not supported bits will be filtered out later */
> >  
> > +    /* Disable PV EOI for old machine types.
> > +     * Feature flags can still override. */
> > +    if (kvm_pv_eoi_disabled()) {
> > +        plus_kvm_features &= ~(0x1 << KVM_FEATURE_PV_EOI);
> > +    }
> > +
> >      add_flagname_to_bitmaps("hypervisor", &plus_features,
> >          &plus_ext_features, &plus_ext2_features, &plus_ext3_features,
> >          &plus_kvm_features, &plus_svm_features);
> > -- 
> > MST
>
Anthony Liguori Aug. 29, 2012, 2:09 p.m. UTC | #19
Gleb Natapov <gleb@redhat.com> writes:

> On Wed, Aug 29, 2012 at 08:36:30AM -0500, Anthony Liguori wrote:
>> "Michael S. Tsirkin" <mst@redhat.com> writes:
>> 
>> > In preparation for adding PV EOI support, disable PV EOI by default for
>> > 1.1 and older machine types, to avoid CPUID changing during migration.
>> >
>> > PV EOI can still be enabled/disabled by specifying it explicitly.
>> >     Enable for 1.1
>> >     -M pc-1.1 -cpu kvm64,+kvm_pv_eoi
>> >     Disable for 1.2
>> >     -M pc-1.2 -cpu kvm64,-kvm_pv_eoi
>> >
>> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
>> 
>> As best I can tell, we're papering over an ABI breakage in KVM.
>> 
>> If an old QEMU attempts to do a live migration on a new kernel,
>> migrating to an QEMU on a different box with an older kernel, it will
>> fail because there is state that isn't being migrated.
>> 
>> This ought to be fixed in the kernel by making these features
>> whitelisted by userspace.
>> 
> What do you mean. Userspace and only userspace decides what cpuid bits
> will be seen by a guest. Currently userspace enables all PV cpuid bits
> it finds.

Right, I misunderstood from the commit message.  I see now that the
problem is that bfee7546df51c08e395dc8a7676a5c7f20186fee unconditionally
enabled kvm_pv_eoi without taking into account migration support.

I think for 1.2 we should simply revert the above commit and then we can
restore it for 1.3 with proper support for migration.

Regards,

Anthony Liguori

>
> --
> 			Gleb.
Michael S. Tsirkin Aug. 29, 2012, 2:11 p.m. UTC | #20
On Wed, Aug 29, 2012 at 10:49:04AM -0300, Eduardo Habkost wrote:
> > Normally CPUID will tell you if such important MSR is available.
> > So we can check that at destination.
> 
> How can qemu check it, if when the qemu code was written when the MSR
> didn't even exist yet?
> 
> (You could add an interface to check for that, but there's no KVM
> ioctl() to tell qemu "given these CPUID bits, can I safely drop this MSR
> that I don't even know about?")
> 

So this is what I suggest exactly. Add a new ioctl like that.

> > 
> > > > 
> > > > > > 
> > > > > > > > On the other hand, a mode of operation that doesn't require updating
> > > > > > > > QEMU every time there's a new bit of guest-visible state to be migrated
> > > > > > > > would be nice (just like the "-cpu host" mode, that doesn't require
> > > > > > > > updating QEMU for every new CPU feature, is nice for some use cases). I
> > > > > > > > just don't know how to make work with the current migration protocol.
> > > > > > > > 
> > > > > > > 
> > > > > > > I don't understand. What is the problem with the proposal?
> > > > > > > What will not work with our protocol?
> > > > > > > Can you give an example please?
> > > > > 
> > > > > Yes:
> > > > > 
> > > > > Suppose kernel 3.7 introduces KVM_FOO_MSR and CPUID_KVM_FOO.
> > > > > 
> > > > > Also, suppose QEMU 1.2 doesn't know anything about KVM_FOO, because it
> > > > > was release before this feature was introduced.
> > > > > 
> > > > > 
> > > > > Then you run "qemu-1.2 -M pc-1.2" on a 3.7 host kernel. qemu-1.2 can do
> > > > > two things here:
> > > > > 
> > > > > 1) Not enable CPUID_KVM_FOO
> > > > > 
> > > > > In this case, the guest should not use KVM_FOO_MSR and the MSR does not
> > > > > need to be migrated (the guest may try to use it, but the behavior when
> > > > > trying to use it is undefined). Sending the MSR value when migrating
> > > > > would be useless.
> > > > > 
> > > > > 
> > > > > 2) Enable CPUID_KVM_FOO.
> > > > > 
> > > > > In this case, the guest may try to use the feature and write something
> > > > > into KVM_FOO_MSR. Sending the MSR value when migrating is absolutely
> > > > > necessary
> > > > > 
> > > > > ---
> > > > > 
> > > > > Now assume you run "qemu-1.2 -M pc-1.2" in the destination host, running
> > > > > the 3.6 kernel (without KVM_FOO).
> > > > > 
> > > > > Then qemu-1.2 receives the KVM_FOO_MSR data in the migration stream. In
> > > > > this case, qemu-1.2 simply can't decide if it's safe to drop the data
> > > > > (and not tell KVM about it), or not.
> > > > > If we simply send every MSR reported by the kernel, both the origin and
> > > > > destination qemu-1.2 processes can't ever know if the MSR value is
> > > > > important or not, because it doesn't know if it's part of the machine
> > > > > state that has to be kept consistent.
> > > > > We could introduce a mode of operation where _every_ MSR reported by KVM
> > > > > is important and sent by the origin (and also where every MSR must be
> > > > > handled by the destination, otherwise migration would fail). But this
> > > > > new mode would break migration compatibility between two hosts running
> > > > > the same machine-type, only because they are running different kernel
> > > > > versions. But it may be useful for some use cases, so maybe it's
> > > > > appropriate for a future "pc-next" machine-type (and for "-cpu host"),
> > > > > but not for "pc-<version>".
> > > > > 
> > > > 
> > > > In this example, we should migrate CPUID (don't we?).
> > > > Destination should validate that CPUID supplied by source
> > > > matches what it can support (doesn't it?).
> > > > 
> > > > If we do and it does not, it's an unrelated bug:
> > > > CPUID changing under guest's feet.
> > > 
> > > CPUID changing under guest's feet is another problem, that we also have
> > > to solve.
> > > But we also have the problem of migration compatibility
> > > between different host kernels.
> > 
> > So here is the solution for both: on destination pass CPUID to kvm and
> > it should come back unchanged.  If it changed you fail migration.
> 
> This doesn't solve the problem of having predictable migration
> compatibility for "-M pc-<old-version>".
> 
> The whole point of machine-types is to expose the "same machine" to the
> guest, even if you change the hardware or host kernel. "qemu -M pc-1.x"
> must expose the same machine configuration to the guest, it doesn 't
> matter what's the host kernel version.

I'd tend to disagree. The point is to make migration work
and avoid things like windows re-activation trigger.
Let's not be purists - many internal changes in qemu
introduce subtle guest visible changes, if even in timing.

> > 
> > > 
> > > Note that I am not saying that migrating all MSRs is wrong. It _is_
> > > good, as long as:
> > > - The destination never ignores any of the incoming MSR values.
> > 
> > What I am saying for MSRs added in last 2 years it is OK to ignore
> > because CPUID check will tell you if it is supported
> > and fail migration.
> 
> Existing MSRs are easy to make work. The problem is about MSRs added to
> the "msrs_to_save" list in the future.
> 
> Also, the problem is not about being "safe" to ignore the MSR values,
> it's about being "correct" (part of the expected behavior of the virtual
> machine). The fact that most guests doen't crash when the virtual
> machine doesn't behave as it should doesn't mean we should do it.
> 
> Either the MSR is part of the machine state (and relevant to the guest),
> or not. If it is relevant, it must be _always_ migrated and never
> dropped by the destination. If it is not, it's useless to migrate it.
> 

Yes. But it is better to keep all knowledge which is which
in one place which is in kvm.

> > 
> > > - We don't do that by default on "pc-<version>", or we defeat the
> > >   purpose of machine-types.
> > > 
> > > I'll try to enumerate the problems I am trying to address (that are
> > > related, but are separate problems):
> > > 
> > > - MSR not being migrated when it should:
> > >   - Possible solution: migrate all MSRs even if qemu doesn't know what
> > >     they are.
> > >     - Constraint: migration destination must _never_ ignore any incoming
> > >       MSR value, because it can't decide if it is important to the guest
> > >       or not (with the current KVM interfaces).
> > >     - Problem with this solution: if we do that by default on
> > >       "pc-<version>", we break migration compatibility between hosts
> > >       with different kernel versions.
> > 
> > Solution: add vcpu ioctl that tells you which MSRs to migrate
> > (on source), depending on CPUID.
> 
> This may be a solution for old-kernel => new-kernel migration, yes. But
> this still doesn't solve the migration compatibility problem for
> new-kernel => old-kernel migration (see below).
> 
> 
> > 
> > > - Changing CPUID bits under guest's feet.
> > >   - Proposed solution: migrating CPUID bits, refuse migration if
> > >     destination doesn't support the same bits.
> > >     - It solves the compatibility problem for migration to a newer
> > >       kernel, but not to an older kernel. It helps to solve part of
> > >       the problem, but not all.
> > 
> > How does not it save all of the problem? If destination kernel
> > can support cpuid, then we are fine - it is new enough.
> 
> See below. The problem is being able to migrate to an older host.
> That's the whole point of machine-types!
> 
> > 
> > >   - Alternative solution: simply make the resulting CPUID bits not be a
> > >     function of the host kernel capabilities, but only of the qemu
> > >     configuration (except on "-cpu host" and "-M pc-next").
> > 
> > This perpetuates existing duplication of code between
> > kvm and qemu. We are better off with logic in 1 place.
> 
> Yes, it does, and I would love to avoid having the list inside QEMU,
> too. But we can't avoid that because each machine-type defines a set of
> available features/MSRs, so we have to have a machine-type =>
> list-of-features list somewhere, unfortunately.
> 

Yes. But let us have machine type->cpuid list in qemu.
kvm will have the cpuid->MSR logic.


> > 
> > > - Migration compatibility/predictability:
> > >   - See my example above: feature introduced in a newer kernel,
> > >     migration to an older kernel.
> > 
> > If it is enabled then migration fails.
> > 
> > >   - The only way I see to guarantee this is to never enable unknown
> > >     CPUID bits or MSRs. People who don't care about predictable
> > >     migration compatibility can use "-M pc-next", then.
> > > 
> > 
> > Guarantee what?
> 
> Guarantee that "-M pc-1.1" machines can be migrated to any host that is
> already capable of running "-M pc-1.1".
> 

I can't change the past. I am suggesting forward compatible
approach so we'll be able to guarantee this for
-M pc-1.2 and on.

> > Just check dst can support all msrs and cpuid bits of src.
> > Way to check is to ask kvm :) Not to add logic in qemu.
> 
> Checking and making migration fail when it has to fail is not the
> problem. The problem is that now "qemu -M pc-1.x" will result in a
> different machine, depending on the host kernel version. This causes two
> problems:
> 
> - Now you don't know if your existing machine can be migrated to a
>   host running an older kernel (because now migration can fail even when
>   you are using the same machine-type on both sides).
> - Different VMs using the same machine-type will get different machines
>   (with different sets of features), because they are running on
>   different kernel versions.
> 
> This may be acceptable for "pc-next", but not for "pc-<version>".

So you can whitelist CPUID bits. But leave MSRs alone, it is nasty
enough with CPUID.
Eduardo Habkost Aug. 29, 2012, 2:21 p.m. UTC | #21
On Wed, Aug 29, 2012 at 05:11:12PM +0300, Michael S. Tsirkin wrote:
> On Wed, Aug 29, 2012 at 10:49:04AM -0300, Eduardo Habkost wrote:
> > > Normally CPUID will tell you if such important MSR is available.
> > > So we can check that at destination.
> > 
> > How can qemu check it, if when the qemu code was written when the MSR
> > didn't even exist yet?
> > 
> > (You could add an interface to check for that, but there's no KVM
> > ioctl() to tell qemu "given these CPUID bits, can I safely drop this MSR
> > that I don't even know about?")
> > 
> 
> So this is what I suggest exactly. Add a new ioctl like that.
> 
> > > 
> > > > > 
> > > > > > > 
> > > > > > > > > On the other hand, a mode of operation that doesn't require updating
> > > > > > > > > QEMU every time there's a new bit of guest-visible state to be migrated
> > > > > > > > > would be nice (just like the "-cpu host" mode, that doesn't require
> > > > > > > > > updating QEMU for every new CPU feature, is nice for some use cases). I
> > > > > > > > > just don't know how to make work with the current migration protocol.
> > > > > > > > > 
> > > > > > > > 
> > > > > > > > I don't understand. What is the problem with the proposal?
> > > > > > > > What will not work with our protocol?
> > > > > > > > Can you give an example please?
> > > > > > 
> > > > > > Yes:
> > > > > > 
> > > > > > Suppose kernel 3.7 introduces KVM_FOO_MSR and CPUID_KVM_FOO.
> > > > > > 
> > > > > > Also, suppose QEMU 1.2 doesn't know anything about KVM_FOO, because it
> > > > > > was release before this feature was introduced.
> > > > > > 
> > > > > > 
> > > > > > Then you run "qemu-1.2 -M pc-1.2" on a 3.7 host kernel. qemu-1.2 can do
> > > > > > two things here:
> > > > > > 
> > > > > > 1) Not enable CPUID_KVM_FOO
> > > > > > 
> > > > > > In this case, the guest should not use KVM_FOO_MSR and the MSR does not
> > > > > > need to be migrated (the guest may try to use it, but the behavior when
> > > > > > trying to use it is undefined). Sending the MSR value when migrating
> > > > > > would be useless.
> > > > > > 
> > > > > > 
> > > > > > 2) Enable CPUID_KVM_FOO.
> > > > > > 
> > > > > > In this case, the guest may try to use the feature and write something
> > > > > > into KVM_FOO_MSR. Sending the MSR value when migrating is absolutely
> > > > > > necessary
> > > > > > 
> > > > > > ---
> > > > > > 
> > > > > > Now assume you run "qemu-1.2 -M pc-1.2" in the destination host, running
> > > > > > the 3.6 kernel (without KVM_FOO).
> > > > > > 
> > > > > > Then qemu-1.2 receives the KVM_FOO_MSR data in the migration stream. In
> > > > > > this case, qemu-1.2 simply can't decide if it's safe to drop the data
> > > > > > (and not tell KVM about it), or not.
> > > > > > If we simply send every MSR reported by the kernel, both the origin and
> > > > > > destination qemu-1.2 processes can't ever know if the MSR value is
> > > > > > important or not, because it doesn't know if it's part of the machine
> > > > > > state that has to be kept consistent.
> > > > > > We could introduce a mode of operation where _every_ MSR reported by KVM
> > > > > > is important and sent by the origin (and also where every MSR must be
> > > > > > handled by the destination, otherwise migration would fail). But this
> > > > > > new mode would break migration compatibility between two hosts running
> > > > > > the same machine-type, only because they are running different kernel
> > > > > > versions. But it may be useful for some use cases, so maybe it's
> > > > > > appropriate for a future "pc-next" machine-type (and for "-cpu host"),
> > > > > > but not for "pc-<version>".
> > > > > > 
> > > > > 
> > > > > In this example, we should migrate CPUID (don't we?).
> > > > > Destination should validate that CPUID supplied by source
> > > > > matches what it can support (doesn't it?).
> > > > > 
> > > > > If we do and it does not, it's an unrelated bug:
> > > > > CPUID changing under guest's feet.
> > > > 
> > > > CPUID changing under guest's feet is another problem, that we also have
> > > > to solve.
> > > > But we also have the problem of migration compatibility
> > > > between different host kernels.
> > > 
> > > So here is the solution for both: on destination pass CPUID to kvm and
> > > it should come back unchanged.  If it changed you fail migration.
> > 
> > This doesn't solve the problem of having predictable migration
> > compatibility for "-M pc-<old-version>".
> > 
> > The whole point of machine-types is to expose the "same machine" to the
> > guest, even if you change the hardware or host kernel. "qemu -M pc-1.x"
> > must expose the same machine configuration to the guest, it doesn 't
> > matter what's the host kernel version.
> 
> I'd tend to disagree. The point is to make migration work
> and avoid things like windows re-activation trigger.
> Let's not be purists - many internal changes in qemu
> introduce subtle guest visible changes, if even in timing.

Yes, but in this case silently adding MSRs and/or new feature bits after
a kernel upgrade is not just a "subtle" change, it's a change that
breaks migration.


> 
> > > 
> > > > 
> > > > Note that I am not saying that migrating all MSRs is wrong. It _is_
> > > > good, as long as:
> > > > - The destination never ignores any of the incoming MSR values.
> > > 
> > > What I am saying for MSRs added in last 2 years it is OK to ignore
> > > because CPUID check will tell you if it is supported
> > > and fail migration.
> > 
> > Existing MSRs are easy to make work. The problem is about MSRs added to
> > the "msrs_to_save" list in the future.
> > 
> > Also, the problem is not about being "safe" to ignore the MSR values,
> > it's about being "correct" (part of the expected behavior of the virtual
> > machine). The fact that most guests doen't crash when the virtual
> > machine doesn't behave as it should doesn't mean we should do it.
> > 
> > Either the MSR is part of the machine state (and relevant to the guest),
> > or not. If it is relevant, it must be _always_ migrated and never
> > dropped by the destination. If it is not, it's useless to migrate it.
> > 
> 
> Yes. But it is better to keep all knowledge which is which
> in one place which is in kvm.

True, but doing this may need additional interfaces, not provided by KVM
yet (see below).

> 
> > > 
> > > > - We don't do that by default on "pc-<version>", or we defeat the
> > > >   purpose of machine-types.
> > > > 
> > > > I'll try to enumerate the problems I am trying to address (that are
> > > > related, but are separate problems):
> > > > 
> > > > - MSR not being migrated when it should:
> > > >   - Possible solution: migrate all MSRs even if qemu doesn't know what
> > > >     they are.
> > > >     - Constraint: migration destination must _never_ ignore any incoming
> > > >       MSR value, because it can't decide if it is important to the guest
> > > >       or not (with the current KVM interfaces).
> > > >     - Problem with this solution: if we do that by default on
> > > >       "pc-<version>", we break migration compatibility between hosts
> > > >       with different kernel versions.
> > > 
> > > Solution: add vcpu ioctl that tells you which MSRs to migrate
> > > (on source), depending on CPUID.
> > 
> > This may be a solution for old-kernel => new-kernel migration, yes. But
> > this still doesn't solve the migration compatibility problem for
> > new-kernel => old-kernel migration (see below).
> > 
> > 
> > > 
> > > > - Changing CPUID bits under guest's feet.
> > > >   - Proposed solution: migrating CPUID bits, refuse migration if
> > > >     destination doesn't support the same bits.
> > > >     - It solves the compatibility problem for migration to a newer
> > > >       kernel, but not to an older kernel. It helps to solve part of
> > > >       the problem, but not all.
> > > 
> > > How does not it save all of the problem? If destination kernel
> > > can support cpuid, then we are fine - it is new enough.
> > 
> > See below. The problem is being able to migrate to an older host.
> > That's the whole point of machine-types!
> > 
> > > 
> > > >   - Alternative solution: simply make the resulting CPUID bits not be a
> > > >     function of the host kernel capabilities, but only of the qemu
> > > >     configuration (except on "-cpu host" and "-M pc-next").
> > > 
> > > This perpetuates existing duplication of code between
> > > kvm and qemu. We are better off with logic in 1 place.
> > 
> > Yes, it does, and I would love to avoid having the list inside QEMU,
> > too. But we can't avoid that because each machine-type defines a set of
> > available features/MSRs, so we have to have a machine-type =>
> > list-of-features list somewhere, unfortunately.
> > 
> 
> Yes. But let us have machine type->cpuid list in qemu.
> kvm will have the cpuid->MSR logic.

That would work, too.

> 
> 
> > > 
> > > > - Migration compatibility/predictability:
> > > >   - See my example above: feature introduced in a newer kernel,
> > > >     migration to an older kernel.
> > > 
> > > If it is enabled then migration fails.
> > > 
> > > >   - The only way I see to guarantee this is to never enable unknown
> > > >     CPUID bits or MSRs. People who don't care about predictable
> > > >     migration compatibility can use "-M pc-next", then.
> > > > 
> > > 
> > > Guarantee what?
> > 
> > Guarantee that "-M pc-1.1" machines can be migrated to any host that is
> > already capable of running "-M pc-1.1".
> > 
> 
> I can't change the past. I am suggesting forward compatible
> approach so we'll be able to guarantee this for
> -M pc-1.2 and on.

pc-1.1 is already broken, but at least pc-1.3 and later should work this
way (so, for example, "qemu-1.5 -M pc-1.3" can be guaranteed to be
migratable to any host that can run "qemu -M pc-1.3").


> 
> > > Just check dst can support all msrs and cpuid bits of src.
> > > Way to check is to ask kvm :) Not to add logic in qemu.
> > 
> > Checking and making migration fail when it has to fail is not the
> > problem. The problem is that now "qemu -M pc-1.x" will result in a
> > different machine, depending on the host kernel version. This causes two
> > problems:
> > 
> > - Now you don't know if your existing machine can be migrated to a
> >   host running an older kernel (because now migration can fail even when
> >   you are using the same machine-type on both sides).
> > - Different VMs using the same machine-type will get different machines
> >   (with different sets of features), because they are running on
> >   different kernel versions.
> > 
> > This may be acceptable for "pc-next", but not for "pc-<version>".
> 
> So you can whitelist CPUID bits. But leave MSRs alone, it is nasty
> enough with CPUID.

That may be a solution, yes, as long as we have an interface to map
CPUID bits to a list of "relevant MSRs that have to be migrated", like
you suggests (that we don't have yet).

So, in short:

- A machine-type must define a list of machine "features", many of them
  exposed through CPUID bits
- The exposure of a feature (in this case, always throught a CPUID bit?)
  may make migration of a MSR necessary.
  - We need the CPUID => MSRs mapping somewhere.

The "machine-type => features" mapping has to live in QEMU, but the
"features => MSRs" mapping may live in the kernel, if you want to. But
today we don't have an interface in KVM for that.
Michael S. Tsirkin Aug. 29, 2012, 2:29 p.m. UTC | #22
On Wed, Aug 29, 2012 at 09:09:16AM -0500, Anthony Liguori wrote:
> Gleb Natapov <gleb@redhat.com> writes:
> 
> > On Wed, Aug 29, 2012 at 08:36:30AM -0500, Anthony Liguori wrote:
> >> "Michael S. Tsirkin" <mst@redhat.com> writes:
> >> 
> >> > In preparation for adding PV EOI support, disable PV EOI by default for
> >> > 1.1 and older machine types, to avoid CPUID changing during migration.
> >> >
> >> > PV EOI can still be enabled/disabled by specifying it explicitly.
> >> >     Enable for 1.1
> >> >     -M pc-1.1 -cpu kvm64,+kvm_pv_eoi
> >> >     Disable for 1.2
> >> >     -M pc-1.2 -cpu kvm64,-kvm_pv_eoi
> >> >
> >> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> >> 
> >> As best I can tell, we're papering over an ABI breakage in KVM.
> >> 
> >> If an old QEMU attempts to do a live migration on a new kernel,
> >> migrating to an QEMU on a different box with an older kernel, it will
> >> fail because there is state that isn't being migrated.
> >> 
> >> This ought to be fixed in the kernel by making these features
> >> whitelisted by userspace.
> >> 
> > What do you mean. Userspace and only userspace decides what cpuid bits
> > will be seen by a guest. Currently userspace enables all PV cpuid bits
> > it finds.
> 
> Right, I misunderstood from the commit message.  I see now that the
> problem is that bfee7546df51c08e395dc8a7676a5c7f20186fee unconditionally
> enabled kvm_pv_eoi without taking into account migration support.

All that commit does it let user disable pv eoi.
Don't see how reverting it will help ...

> I think for 1.2 we should simply revert the above commit and then we can
> restore it for 1.3 with proper support for migration.
> 
> Regards,
> 
> Anthony Liguori
> 
> >
> > --
> > 			Gleb.
Eduardo Habkost Aug. 29, 2012, 2:41 p.m. UTC | #23
On Wed, Aug 29, 2012 at 09:09:16AM -0500, Anthony Liguori wrote:
> Gleb Natapov <gleb@redhat.com> writes:
> 
> > On Wed, Aug 29, 2012 at 08:36:30AM -0500, Anthony Liguori wrote:
> >> "Michael S. Tsirkin" <mst@redhat.com> writes:
> >> 
> >> > In preparation for adding PV EOI support, disable PV EOI by default for
> >> > 1.1 and older machine types, to avoid CPUID changing during migration.
> >> >
> >> > PV EOI can still be enabled/disabled by specifying it explicitly.
> >> >     Enable for 1.1
> >> >     -M pc-1.1 -cpu kvm64,+kvm_pv_eoi
> >> >     Disable for 1.2
> >> >     -M pc-1.2 -cpu kvm64,-kvm_pv_eoi
> >> >
> >> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> >> 
> >> As best I can tell, we're papering over an ABI breakage in KVM.
> >> 
> >> If an old QEMU attempts to do a live migration on a new kernel,
> >> migrating to an QEMU on a different box with an older kernel, it will
> >> fail because there is state that isn't being migrated.
> >> 
> >> This ought to be fixed in the kernel by making these features
> >> whitelisted by userspace.
> >> 
> > What do you mean. Userspace and only userspace decides what cpuid bits
> > will be seen by a guest. Currently userspace enables all PV cpuid bits
> > it finds.
> 
> Right, I misunderstood from the commit message.  I see now that the
> problem is that bfee7546df51c08e395dc8a7676a5c7f20186fee unconditionally
> enabled kvm_pv_eoi without taking into account migration support.

commit bfee7546df51c08e395dc8a7676a5c7f20186fee didn't enable pv_eoi, it
simple gave it a name.

> 
> I think for 1.2 we should simply revert the above commit and then we can
> restore it for 1.3 with proper support for migration.

Reverting the commit won't do anything. The problem is at:

  static int cpu_x86_find_by_name(x86_def_t *x86_cpu_def, const char *cpu_model)
  {
      [...]
      plus_kvm_features = ~0; /* not supported bits will be filtered out later */
  [...]

The bits are filtered out using GET_SUPPORTED_CPUID later.

For 1.2, we may temporarily change that ~0 into a hardcoded whitelist, so no
unknown feature will be enabled silently after a kernel upgrade.
Juan Quintela Aug. 29, 2012, 2:43 p.m. UTC | #24
"Michael S. Tsirkin" <mst@redhat.com> wrote:
> On Tue, Aug 28, 2012 at 04:13:38PM -0300, Eduardo Habkost wrote:
>> On Tue, Aug 28, 2012 at 08:43:52PM +0300, Michael S. Tsirkin wrote:
>> > In preparation for adding PV EOI support, disable PV EOI by default for
>> > 1.1 and older machine types, to avoid CPUID changing during migration.
>> > 
>> > PV EOI can still be enabled/disabled by specifying it explicitly.
>> >     Enable for 1.1
>> >     -M pc-1.1 -cpu kvm64,+kvm_pv_eoi
>> >     Disable for 1.2
>> >     -M pc-1.2 -cpu kvm64,-kvm_pv_eoi
>> > 
>> 
>> What about users that are already running "qemu-1.1 -M pc-1.1" on a host
>> kernel that supports PV EOI already? They would get PV EOI disabled when
>> migrating to a destination running "qemu-1.2 -M pc-1.1".
>> 
>> (On the other hand, people running "qemu-1.1 -M pc-1.1" on a host kernel
>> supporting PV EOI already have migration broken, so there's not much we
>> can do for them)
>
> Exactly.
>
> Talked to Gleb, long term I think we should rework code to make
> it forward-compatible wrt adding new MSRs:
> - source gets list of MSRs to be migrated from KVM and simply sends
> them all

This is a bad idea, each time that we add a new MSR, we broke migration.
A better idea is just to _know_ what MSR's have been used by the guest,
and sent that ones.

> - send all MSRS in key/value format

Instead send all MSR's that have changed by its default value (for some
definition of default value).

> - destination gets list of MSRs to be migrated from KVM and
>   only restores the supported ones

it should be the other way around.  If source knows that an MSR is not
needed (it has never been read/writen/configured/... depends on the
MSR), then jsut dont' sent it.

> Too late for 1.2?

Yes.

ARM is trying to come with some fix for this (they have the CP15
registers, I think, that look a lot like MSR's.  I think that only
solution is what  I described here.

Current solution of explicitely list all of them is just a mass (TM).

Later, Juan.)
diff mbox

Patch

diff --git a/hw/Makefile.objs b/hw/Makefile.objs
index 850b87b..3f2532a 100644
--- a/hw/Makefile.objs
+++ b/hw/Makefile.objs
@@ -1,5 +1,5 @@ 
 hw-obj-y = usb/ ide/
-hw-obj-y += loader.o
+hw-obj-y += loader.o cpu_flags.o
 hw-obj-$(CONFIG_VIRTIO) += virtio-console.o
 hw-obj-$(CONFIG_VIRTIO_PCI) += virtio-pci.o
 hw-obj-y += fw_cfg.o
diff --git a/hw/cpu_flags.c b/hw/cpu_flags.c
new file mode 100644
index 0000000..7a633c0
--- /dev/null
+++ b/hw/cpu_flags.c
@@ -0,0 +1,32 @@ 
+/*
+ * CPU compatibility flags.
+ *
+ * Copyright (c) 2012 Red Hat Inc.
+ * Author: Michael S. Tsirkin.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#include "hw/cpu_flags.h"
+
+static bool kvm_pv_eoi_disabled_state;
+
+void disable_kvm_pv_eoi(void)
+{
+       kvm_pv_eoi_disabled_state = true;
+}
+
+bool kvm_pv_eoi_disabled(void)
+{
+       return kvm_pv_eoi_disabled_state;
+}
diff --git a/hw/cpu_flags.h b/hw/cpu_flags.h
new file mode 100644
index 0000000..05777b6
--- /dev/null
+++ b/hw/cpu_flags.h
@@ -0,0 +1,9 @@ 
+#ifndef HW_CPU_FLAGS_H
+#define HW_CPU_FLAGS_H
+
+#include <stdbool.h>
+
+void disable_kvm_pv_eoi(void);
+bool kvm_pv_eoi_disabled(void);
+
+#endif
diff --git a/hw/pc_piix.c b/hw/pc_piix.c
index 008d42f..bdbceda 100644
--- a/hw/pc_piix.c
+++ b/hw/pc_piix.c
@@ -46,6 +46,7 @@ 
 #ifdef CONFIG_XEN
 #  include <xen/hvm/hvm_info_table.h>
 #endif
+#include "cpu_flags.h"
 
 #define MAX_IDE_BUS 2
 
@@ -371,6 +372,7 @@  static QEMUMachine pc_machine_v1_2 = {
 
 static void pc_machine_v1_1_compat(void)
 {
+    disable_kvm_pv_eoi();
 }
 
 static void pc_init_pci_v1_1(ram_addr_t ram_size,
diff --git a/target-i386/cpu.c b/target-i386/cpu.c
index 120a2e3..0d02fd1 100644
--- a/target-i386/cpu.c
+++ b/target-i386/cpu.c
@@ -23,6 +23,7 @@ 
 
 #include "cpu.h"
 #include "kvm.h"
+#include "asm/kvm_para.h"
 
 #include "qemu-option.h"
 #include "qemu-config.h"
@@ -33,6 +34,7 @@ 
 #include "hyperv.h"
 
 #include "hw/hw.h"
+#include "hw/cpu_flags.h"
 
 /* feature flags taken from "Intel Processor Identification and the CPUID
  * Instruction" and AMD's "CPUID Specification".  In cases of disagreement
@@ -889,6 +891,12 @@  static int cpu_x86_find_by_name(x86_def_t *x86_cpu_def, const char *cpu_model)
 
     plus_kvm_features = ~0; /* not supported bits will be filtered out later */
 
+    /* Disable PV EOI for old machine types.
+     * Feature flags can still override. */
+    if (kvm_pv_eoi_disabled()) {
+        plus_kvm_features &= ~(0x1 << KVM_FEATURE_PV_EOI);
+    }
+
     add_flagname_to_bitmaps("hypervisor", &plus_features,
         &plus_ext_features, &plus_ext2_features, &plus_ext3_features,
         &plus_kvm_features, &plus_svm_features);