diff mbox

[v2,3/9] provide in-kernel ioapic

Message ID 1254953315-5761-4-git-send-email-glommer@redhat.com
State Changes Requested
Headers show

Commit Message

Glauber Costa Oct. 7, 2009, 10:08 p.m. UTC
This patch provides kvm with an in-kernel ioapic. We are currently not enabling it.
The code is heavily based on what's in qemu-kvm.git.

Signed-off-by: Glauber Costa <glommer@redhat.com>
---
 hw/ioapic.c |   84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 kvm-all.c   |   20 ++++++++++++++
 kvm.h       |    4 +++
 3 files changed, 107 insertions(+), 1 deletions(-)

Comments

Jan Kiszka Oct. 8, 2009, 11:46 a.m. UTC | #1
Glauber Costa wrote:
> This patch provides kvm with an in-kernel ioapic. We are currently not enabling it.
> The code is heavily based on what's in qemu-kvm.git.
> 
> Signed-off-by: Glauber Costa <glommer@redhat.com>
> ---
>  hw/ioapic.c |   84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  kvm-all.c   |   20 ++++++++++++++
>  kvm.h       |    4 +++
>  3 files changed, 107 insertions(+), 1 deletions(-)
> 
> diff --git a/hw/ioapic.c b/hw/ioapic.c
> index d475654..d6a9dac 100644
> --- a/hw/ioapic.c
> +++ b/hw/ioapic.c
> @@ -24,6 +24,7 @@
>  #include "pc.h"
>  #include "qemu-timer.h"
>  #include "host-utils.h"
> +#include "kvm.h"
>  
>  //#define DEBUG_IOAPIC
>  
> @@ -193,6 +194,79 @@ static void ioapic_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t va
>      }
>  }
>  
> +static int kvm_kernel_ioapic_load_from_user(IOAPICState *s)
> +{
> +    int r = 0;
> +#if defined(KVM_CAP_IRQCHIP) && defined(TARGET_I386)

Hmm, here we additionally depend on TARGET_I386...

> +    struct kvm_irqchip chip;
> +    struct kvm_ioapic_state *kioapic;
> +    int i;
> +
> +    if (!(kvm_enabled() && kvm_irqchip_in_kernel()))
> +        return 0;
> +
> +    chip.chip_id = KVM_IRQCHIP_IOAPIC;
> +    kioapic = &chip.chip.ioapic;
> +
> +    kioapic->id = s->id;
> +    kioapic->ioregsel = s->ioregsel;
> +    kioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
> +    kioapic->irr = s->irr;
> +    for (i = 0; i < IOAPIC_NUM_PINS; i++) {
> +        kioapic->redirtbl[i].bits = s->ioredtbl[i];
> +    }
> +
> +    r = kvm_set_irqchip(&chip);
> +#endif
> +    return r;
> +}
> +
> +static void kvm_kernel_ioapic_save_to_user(IOAPICState *s)
> +{
> +#if defined(KVM_CAP_IRQCHIP) && defined(TARGET_I386)
> +    struct kvm_irqchip chip;
> +    struct kvm_ioapic_state *kioapic;
> +    int i;
> +
> +    if (!(kvm_enabled() && kvm_irqchip_in_kernel()))
> +        return;
> +    chip.chip_id = KVM_IRQCHIP_IOAPIC;
> +    kvm_get_irqchip(&chip);
> +    kioapic = &chip.chip.ioapic;
> +
> +    s->id = kioapic->id;
> +    s->ioregsel = kioapic->ioregsel;
> +    s->irr = kioapic->irr;
> +    for (i = 0; i < IOAPIC_NUM_PINS; i++) {
> +        s->ioredtbl[i] = kioapic->redirtbl[i].bits;
> +    }
> +#endif
> +}
> +
> +static void ioapic_pre_save(void *opaque)
> +{
> +    IOAPICState *s = (void *)opaque;
> +
> +    kvm_kernel_ioapic_save_to_user(s);
> +}
> +
> +static int ioapic_pre_load(void *opaque)
> +{
> +    IOAPICState *s = opaque;
> +
> +    /* in case we are doing version 1, we just set these to sane values */
> +    s->irr = 0;
> +    return 0;
> +}
> +
> +static int ioapic_post_load(void *opaque, int version_id)
> +{
> +    IOAPICState *s = opaque;
> +
> +    return kvm_kernel_ioapic_load_from_user(s);
> +}
> +
> +
>  static const VMStateDescription vmstate_ioapic = {
>      .name = "ioapic",
>      .version_id = 2,
> @@ -205,7 +279,10 @@ static const VMStateDescription vmstate_ioapic = {
>          VMSTATE_UINT32_V(irr, IOAPICState, 2),
>          VMSTATE_UINT64_ARRAY(ioredtbl, IOAPICState, IOAPIC_NUM_PINS),
>          VMSTATE_END_OF_LIST()
> -    }
> +    },
> +    .pre_load = ioapic_pre_load,
> +    .post_load = ioapic_post_load,
> +    .pre_save = ioapic_pre_save,
>  };
>  
>  static void ioapic_reset(void *opaque)
> @@ -217,6 +294,11 @@ static void ioapic_reset(void *opaque)
>      s->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
>      for(i = 0; i < IOAPIC_NUM_PINS; i++)
>          s->ioredtbl[i] = 1 << 16; /* mask LVT */
> +#ifdef KVM_CAP_IRQCHIP

... but here only KVM_CAP_IRQCHIP suffices?

> +    if (kvm_enabled() && kvm_irqchip_in_kernel()) {
> +        kvm_kernel_ioapic_load_from_user(s);
> +    }
> +#endif

Independent of this, both kvm_irqchip_in_kernel() and
kvm_kernel_ioapic_load_from_user() should also be defined for the
!KVM_CAP_IRQCHIP case to avoid #ifdefs here and in many other places
this series touches.

>  }
>  
>  static CPUReadMemoryFunc * const ioapic_mem_read[3] = {
> diff --git a/kvm-all.c b/kvm-all.c
> index 48ae26c..d795285 100644
> --- a/kvm-all.c
> +++ b/kvm-all.c
> @@ -411,6 +411,26 @@ int kvm_check_extension(KVMState *s, unsigned int extension)
>      return ret;
>  }
>  
> +#ifdef KVM_CAP_IRQCHIP

Again, only KVM_CAP_IRQCHIP - what is the actually required dependency?

> +int kvm_set_irqchip(struct kvm_irqchip *chip)
> +{
> +    if (!kvm_state->irqchip_in_kernel) {
> +        return 0;
> +    }
> +
> +    return kvm_vm_ioctl(kvm_state, KVM_SET_IRQCHIP, chip);
> +}
> +
> +int kvm_get_irqchip(struct kvm_irqchip *chip)
> +{
> +    if (!kvm_state->irqchip_in_kernel) {
> +        return 0;
> +    }
> +
> +    return kvm_vm_ioctl(kvm_state, KVM_GET_IRQCHIP, chip);
> +}
> +#endif
> +
>  int kvm_init(int smp_cpus)
>  {
>      static const char upgrade_note[] =
> diff --git a/kvm.h b/kvm.h
> index e7d5beb..8d4afa0 100644
> --- a/kvm.h
> +++ b/kvm.h
> @@ -16,6 +16,7 @@
>  
>  #include "config.h"
>  #include "qemu-queue.h"
> +#include <linux/kvm.h>
>  
>  #ifdef CONFIG_KVM
>  extern int kvm_allowed;
> @@ -63,6 +64,9 @@ int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap);
>  int kvm_pit_in_kernel(void);
>  int kvm_irqchip_in_kernel(void);
>  
> +int kvm_set_irqchip(struct kvm_irqchip *chip);
> +int kvm_get_irqchip(struct kvm_irqchip *chip);
> +
>  /* internal API */
>  
>  struct KVMState;

Jan
Anthony Liguori Oct. 8, 2009, 1:49 p.m. UTC | #2
Glauber Costa wrote:
> This patch provides kvm with an in-kernel ioapic. We are currently not enabling it.
> The code is heavily based on what's in qemu-kvm.git.
>   

It really ought to be it's own file and own device model.  Having the 
code mixed in with ioapic.c is confusing because it's unclear what code 
is in use when the in-kernel model is used.

> @@ -193,6 +194,79 @@ static void ioapic_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t va
>      }
>  }
>
> +static int kvm_kernel_ioapic_load_from_user(IOAPICState *s)
> +{
> +    int r = 0;
> +#if defined(KVM_CAP_IRQCHIP) && defined(TARGET_I386)
>   

No point in checking the KVM_CAP_IRQCHIP.  Just require it during 
build.  Otherwise, !KVM_CAP_IRQCHIP is dead code since I'm sure noone is 
actually testing kernels that old with modern qemu.

There's no point in restricting to I386 either.

> +    struct kvm_irqchip chip;
> +    struct kvm_ioapic_state *kioapic;
> +    int i;
> +
> +    if (!(kvm_enabled() && kvm_irqchip_in_kernel()))
> +        return 0;
> +
> +    chip.chip_id = KVM_IRQCHIP_IOAPIC;
> +    kioapic = &chip.chip.ioapic;
> +
> +    kioapic->id = s->id;
> +    kioapic->ioregsel = s->ioregsel;
> +    kioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
> +    kioapic->irr = s->irr;
> +    for (i = 0; i < IOAPIC_NUM_PINS; i++) {
> +        kioapic->redirtbl[i].bits = s->ioredtbl[i];
> +    }
> +
> +    r = kvm_set_irqchip(&chip);
> +#endif
> +    return r;
> +}
> +
> +static void kvm_kernel_ioapic_save_to_user(IOAPICState *s)
> +{
> +#if defined(KVM_CAP_IRQCHIP) && defined(TARGET_I386)
> +    struct kvm_irqchip chip;
> +    struct kvm_ioapic_state *kioapic;
> +    int i;
> +
> +    if (!(kvm_enabled() && kvm_irqchip_in_kernel()))
> +        return;
> +    chip.chip_id = KVM_IRQCHIP_IOAPIC;
> +    kvm_get_irqchip(&chip);
> +    kioapic = &chip.chip.ioapic;
> +
> +    s->id = kioapic->id;
> +    s->ioregsel = kioapic->ioregsel;
> +    s->irr = kioapic->irr;
> +    for (i = 0; i < IOAPIC_NUM_PINS; i++) {
> +        s->ioredtbl[i] = kioapic->redirtbl[i].bits;
> +    }
> +#endif
> +}
> +
> +static void ioapic_pre_save(void *opaque)
> +{
> +    IOAPICState *s = (void *)opaque;
> +
> +    kvm_kernel_ioapic_save_to_user(s);
> +}
> +
> +static int ioapic_pre_load(void *opaque)
> +{
> +    IOAPICState *s = opaque;
> +
> +    /* in case we are doing version 1, we just set these to sane values */
> +    s->irr = 0;
> +    return 0;
> +}
> +
> +static int ioapic_post_load(void *opaque, int version_id)
> +{
> +    IOAPICState *s = opaque;
> +
> +    return kvm_kernel_ioapic_load_from_user(s);
> +}
> +
> +
>  static const VMStateDescription vmstate_ioapic = {
>      .name = "ioapic",
>      .version_id = 2,
> @@ -205,7 +279,10 @@ static const VMStateDescription vmstate_ioapic = {
>          VMSTATE_UINT32_V(irr, IOAPICState, 2),
>          VMSTATE_UINT64_ARRAY(ioredtbl, IOAPICState, IOAPIC_NUM_PINS),
>          VMSTATE_END_OF_LIST()
> -    }
> +    },
> +    .pre_load = ioapic_pre_load,
> +    .post_load = ioapic_post_load,
> +    .pre_save = ioapic_pre_save,
>  };
>   

The in kernel apic should be a separate device model with a separate 
savevm section.  They are different devices and there's no real 
advantage to pretending like they're the same device.

>  static CPUReadMemoryFunc * const ioapic_mem_read[3] = {
> diff --git a/kvm-all.c b/kvm-all.c
> index 48ae26c..d795285 100644
> --- a/kvm-all.c
> +++ b/kvm-all.c
> @@ -411,6 +411,26 @@ int kvm_check_extension(KVMState *s, unsigned int extension)
>      return ret;
>  }
>
> +#ifdef KVM_CAP_IRQCHIP
> +int kvm_set_irqchip(struct kvm_irqchip *chip)
> +{
> +    if (!kvm_state->irqchip_in_kernel) {
> +        return 0;
> +    }
> +
> +    return kvm_vm_ioctl(kvm_state, KVM_SET_IRQCHIP, chip);
> +}
>   

irqchip_in_kernel ought to disappear.
Avi Kivity Oct. 8, 2009, 1:54 p.m. UTC | #3
On 10/08/2009 03:49 PM, Anthony Liguori wrote:
> Glauber Costa wrote:
>> This patch provides kvm with an in-kernel ioapic. We are currently 
>> not enabling it.
>> The code is heavily based on what's in qemu-kvm.git.
>
> It really ought to be it's own file and own device model.  Having the 
> code mixed in with ioapic.c is confusing because it's unclear what 
> code is in use when the in-kernel model is used.

I disagree.  It's the same device with the same guest-visible interface 
and the same host-visible interface (save/restore, 'info ioapic' if we 
write one).  Splitting it into two files will only result in code 
duplication.

Think of it as an ioapic accelerator.
Jan Kiszka Oct. 8, 2009, 3:53 p.m. UTC | #4
Avi Kivity wrote:
> On 10/08/2009 03:49 PM, Anthony Liguori wrote:
>> Glauber Costa wrote:
>>> This patch provides kvm with an in-kernel ioapic. We are currently
>>> not enabling it.
>>> The code is heavily based on what's in qemu-kvm.git.
>>
>> It really ought to be it's own file and own device model.  Having the
>> code mixed in with ioapic.c is confusing because it's unclear what
>> code is in use when the in-kernel model is used.
> 
> I disagree.  It's the same device with the same guest-visible interface
> and the same host-visible interface (save/restore, 'info ioapic' if we
> write one).  Splitting it into two files will only result in code
> duplication.

Shared functions can be pushed into a commonly used module
(ioapic-common.c). And everyone touching it should realize then that it
could affect more than one user.

Jan
Jamie Lokier Oct. 8, 2009, 4:07 p.m. UTC | #5
Avi Kivity wrote:
> On 10/08/2009 03:49 PM, Anthony Liguori wrote:
> >Glauber Costa wrote:
> >>This patch provides kvm with an in-kernel ioapic. We are currently 
> >>not enabling it.
> >>The code is heavily based on what's in qemu-kvm.git.
> >
> >It really ought to be it's own file and own device model.  Having the 
> >code mixed in with ioapic.c is confusing because it's unclear what 
> >code is in use when the in-kernel model is used.
> 
> I disagree.  It's the same device with the same guest-visible interface 
> and the same host-visible interface (save/restore, 'info ioapic' if we 
> write one).  Splitting it into two files will only result in code 
> duplication.
> 
> Think of it as an ioapic accelerator.

Haven't we already confirmed that it *isn't* just an ioapic accelerator
because you can't migrate between in-kernel iopic and qemu's ioapic?

Imho, if they cannot be swapped transparently, they are different
device emulations.

OF course there's nothing wrong with sharing lots of code.

Maybe ioapic.c and ioapic-kvm.c, with shared code in ioapic-common.c?

-- Jamie
Anthony Liguori Oct. 8, 2009, 4:12 p.m. UTC | #6
Jamie Lokier wrote:
> Avi Kivity wrote:
>   
>> On 10/08/2009 03:49 PM, Anthony Liguori wrote:
>>     
>>> Glauber Costa wrote:
>>>       
>>>> This patch provides kvm with an in-kernel ioapic. We are currently 
>>>> not enabling it.
>>>> The code is heavily based on what's in qemu-kvm.git.
>>>>         
>>> It really ought to be it's own file and own device model.  Having the 
>>> code mixed in with ioapic.c is confusing because it's unclear what 
>>> code is in use when the in-kernel model is used.
>>>       
>> I disagree.  It's the same device with the same guest-visible interface 
>> and the same host-visible interface (save/restore, 'info ioapic' if we 
>> write one).  Splitting it into two files will only result in code 
>> duplication.
>>
>> Think of it as an ioapic accelerator.
>>     
>
> Haven't we already confirmed that it *isn't* just an ioapic accelerator
> because you can't migrate between in-kernel iopic and qemu's ioapic?
>
> Imho, if they cannot be swapped transparently, they are different
> device emulations.
>
> OF course there's nothing wrong with sharing lots of code.
>   

If you avoid having a common save format, you get an overall reduction 
in code size and there's virtually no code to share.
Avi Kivity Oct. 8, 2009, 4:17 p.m. UTC | #7
On 10/08/2009 06:07 PM, Jamie Lokier wrote:
> Haven't we already confirmed that it *isn't* just an ioapic accelerator
> because you can't migrate between in-kernel iopic and qemu's ioapic?
>    

We haven't confirmed it.  Both implement the same spec, and if you can't 
migrate between them, one of them has a bug (for example, qemu ioapic 
doesn't implement polarity - but it's still just a bug).
Gleb Natapov Oct. 8, 2009, 4:22 p.m. UTC | #8
On Thu, Oct 08, 2009 at 06:17:57PM +0200, Avi Kivity wrote:
> On 10/08/2009 06:07 PM, Jamie Lokier wrote:
> >Haven't we already confirmed that it *isn't* just an ioapic accelerator
> >because you can't migrate between in-kernel iopic and qemu's ioapic?
> 
> We haven't confirmed it.  Both implement the same spec, and if you
> can't migrate between them, one of them has a bug (for example, qemu
> ioapic doesn't implement polarity - but it's still just a bug).
> 
Are you saying that HW spec (that only describes software visible behavior)
completely defines implementation? No other internal state is needed
that may be done differently by different implementations?

--
			Gleb.
Avi Kivity Oct. 8, 2009, 4:29 p.m. UTC | #9
On 10/08/2009 06:22 PM, Gleb Natapov wrote:
> On Thu, Oct 08, 2009 at 06:17:57PM +0200, Avi Kivity wrote:
>    
>> On 10/08/2009 06:07 PM, Jamie Lokier wrote:
>>      
>>> Haven't we already confirmed that it *isn't* just an ioapic accelerator
>>> because you can't migrate between in-kernel iopic and qemu's ioapic?
>>>        
>> We haven't confirmed it.  Both implement the same spec, and if you
>> can't migrate between them, one of them has a bug (for example, qemu
>> ioapic doesn't implement polarity - but it's still just a bug).
>>
>>      
> Are you saying that HW spec (that only describes software visible behavior)
> completely defines implementation? No other internal state is needed
> that may be done differently by different implementations?
>    

It may be done differently (for example, selecting the cpu to deliver 
the interrupt to), but as the guest cannot rely on the differences, 
there's no need to save the state that can cause these differences.
Gleb Natapov Oct. 8, 2009, 4:34 p.m. UTC | #10
On Thu, Oct 08, 2009 at 06:29:53PM +0200, Avi Kivity wrote:
> On 10/08/2009 06:22 PM, Gleb Natapov wrote:
> >On Thu, Oct 08, 2009 at 06:17:57PM +0200, Avi Kivity wrote:
> >>On 10/08/2009 06:07 PM, Jamie Lokier wrote:
> >>>Haven't we already confirmed that it *isn't* just an ioapic accelerator
> >>>because you can't migrate between in-kernel iopic and qemu's ioapic?
> >>We haven't confirmed it.  Both implement the same spec, and if you
> >>can't migrate between them, one of them has a bug (for example, qemu
> >>ioapic doesn't implement polarity - but it's still just a bug).
> >>
> >Are you saying that HW spec (that only describes software visible behavior)
> >completely defines implementation? No other internal state is needed
> >that may be done differently by different implementations?
> 
> It may be done differently (for example, selecting the cpu to
> deliver the interrupt to), but as the guest cannot rely on the
> differences, there's no need to save the state that can cause these
> differences.
> 
So suppose I have simple watchdog device that required to be poked every
second, otherwise it resets a computer. On migration we have to migrate
time elapsed since last poke, but if device doesn't expose it to
software in any way you are saying we can recreate is some other way?

--
			Gleb.
Avi Kivity Oct. 8, 2009, 4:42 p.m. UTC | #11
On 10/08/2009 06:34 PM, Gleb Natapov wrote:
> So suppose I have simple watchdog device that required to be poked every
> second, otherwise it resets a computer. On migration we have to migrate
> time elapsed since last poke, but if device doesn't expose it to
> software in any way you are saying we can recreate is some other way?
>    

The time is exposed (you can measure it by poking the device and 
measuring the time till reset) and will be described in the spec 
(otherwise users will be surprised when their machine resets).

You don't have to migrate all exposed state, just exposed state that the 
guest may rely on.
Gleb Natapov Oct. 8, 2009, 5:11 p.m. UTC | #12
On Thu, Oct 08, 2009 at 06:42:07PM +0200, Avi Kivity wrote:
> On 10/08/2009 06:34 PM, Gleb Natapov wrote:
> >So suppose I have simple watchdog device that required to be poked every
> >second, otherwise it resets a computer. On migration we have to migrate
> >time elapsed since last poke, but if device doesn't expose it to
> >software in any way you are saying we can recreate is some other way?
> 
> The time is exposed (you can measure it by poking the device and
The time yes, not its internal representation. What if one implementation 
stores how much time passed and another how much time left. One counts
in wall clack another only when guest runs. etc... and this is a device
with only one write only register.

> measuring the time till reset) and will be described in the spec
> (otherwise users will be surprised when their machine resets).
> 
> You don't have to migrate all exposed state, just exposed state that
> the guest may rely on.

--
			Gleb.
Jamie Lokier Oct. 9, 2009, 10:02 a.m. UTC | #13
Gleb Natapov wrote:
> On Thu, Oct 08, 2009 at 06:42:07PM +0200, Avi Kivity wrote:
> > On 10/08/2009 06:34 PM, Gleb Natapov wrote:
> > >So suppose I have simple watchdog device that required to be poked every
> > >second, otherwise it resets a computer. On migration we have to migrate
> > >time elapsed since last poke, but if device doesn't expose it to
> > >software in any way you are saying we can recreate is some other way?
> > 
> > The time is exposed (you can measure it by poking the device and
> The time yes, not its internal representation. What if one implementation 
> stores how much time passed and another how much time left.
> One counts in wall clack another only when guest runs. etc... and
> this is a device with only one write only register.

In that case you can decide between calling it two different devices
(which have the same guest-visible behaviour but are not
interchangable), or calling them different implementations of one
device - by adding a little more code to save state in a common format.

(Although they may have to be different devices for qemu
configuration, it's ok for them to have the same PCI IDs and for the
guest not to know the difference)

For your watchdog example, it's not hard to pick a saved state which
works for both.

ioapic will be harder to find a useful common saved state, and there
might need to be an *optional hint* section (e.g. for selecting the
next CPU to get an interrupt), but I think it would be worth it in
this case.  Being able to load a KVM image into TCG and vice versa is
quite useful sometimes.  E.g. I've had to do some OS installs using
TCG at first, then switch to KVM later for performance.

-- Jamie
Gleb Natapov Oct. 9, 2009, 12:02 p.m. UTC | #14
On Fri, Oct 09, 2009 at 11:02:31AM +0100, Jamie Lokier wrote:
> Gleb Natapov wrote:
> > On Thu, Oct 08, 2009 at 06:42:07PM +0200, Avi Kivity wrote:
> > > On 10/08/2009 06:34 PM, Gleb Natapov wrote:
> > > >So suppose I have simple watchdog device that required to be poked every
> > > >second, otherwise it resets a computer. On migration we have to migrate
> > > >time elapsed since last poke, but if device doesn't expose it to
> > > >software in any way you are saying we can recreate is some other way?
> > > 
> > > The time is exposed (you can measure it by poking the device and
> > The time yes, not its internal representation. What if one implementation 
> > stores how much time passed and another how much time left.
> > One counts in wall clack another only when guest runs. etc... and
> > this is a device with only one write only register.
> 
> In that case you can decide between calling it two different devices
> (which have the same guest-visible behaviour but are not
> interchangable), or calling them different implementations of one
> device - by adding a little more code to save state in a common format.
> 
That is what currency done for in-kernel/out-of-kernel irq chips. Save
state transformation. The problem begins if one of the devices has more
state (not just the same state but in a different format). You need to
drop info on migration.

> (Although they may have to be different devices for qemu
> configuration, it's ok for them to have the same PCI IDs and for the
> guest not to know the difference)
> 
> For your watchdog example, it's not hard to pick a saved state which
> works for both.
If you can't migrate from one to the other why even bother? In my
example if one device counts wallclock time and another guest cpu time
you can't migrate from one implementation to another.

> 
> ioapic will be harder to find a useful common saved state, and there
> might need to be an *optional hint* section (e.g. for selecting the
> next CPU to get an interrupt), but I think it would be worth it in
> this case.  Being able to load a KVM image into TCG and vice versa is
> quite useful sometimes.  E.g. I've had to do some OS installs using
> TCG at first, then switch to KVM later for performance.
> 
Reboot :)

--
			Gleb.
Glauber Costa Oct. 9, 2009, 2:32 p.m. UTC | #15
On Thu, Oct 08, 2009 at 06:22:48PM +0200, Gleb Natapov wrote:
> On Thu, Oct 08, 2009 at 06:17:57PM +0200, Avi Kivity wrote:
> > On 10/08/2009 06:07 PM, Jamie Lokier wrote:
> > >Haven't we already confirmed that it *isn't* just an ioapic accelerator
> > >because you can't migrate between in-kernel iopic and qemu's ioapic?
> > 
> > We haven't confirmed it.  Both implement the same spec, and if you
> > can't migrate between them, one of them has a bug (for example, qemu
> > ioapic doesn't implement polarity - but it's still just a bug).
> > 
> Are you saying that HW spec (that only describes software visible behavior)
> completely defines implementation? No other internal state is needed
> that may be done differently by different implementations?
Most specifications leaves a lot as implementation specific.

It's not hard to imagine a case in which both devices will follow the spec correctly,
(no bugs involved), and yet differ in the implementation.
Jamie Lokier Oct. 9, 2009, 4:49 p.m. UTC | #16
Glauber Costa wrote:
> On Thu, Oct 08, 2009 at 06:22:48PM +0200, Gleb Natapov wrote:
> > On Thu, Oct 08, 2009 at 06:17:57PM +0200, Avi Kivity wrote:
> > > On 10/08/2009 06:07 PM, Jamie Lokier wrote:
> > > >Haven't we already confirmed that it *isn't* just an ioapic accelerator
> > > >because you can't migrate between in-kernel iopic and qemu's ioapic?
> > > 
> > > We haven't confirmed it.  Both implement the same spec, and if you
> > > can't migrate between them, one of them has a bug (for example, qemu
> > > ioapic doesn't implement polarity - but it's still just a bug).
> > > 
> > Are you saying that HW spec (that only describes software visible behavior)
> > completely defines implementation? No other internal state is needed
> > that may be done differently by different implementations?
> Most specifications leaves a lot as implementation specific.
> 
> It's not hard to imagine a case in which both devices will follow
> the spec correctly, (no bugs involved), and yet differ in the
> implementation.

Avi's not saying the implementations won't differ.  I believe he's
saying that implementation-specific states don't need to be saved if
they have no effect on guest visible behaviour.

-- Jamie
Glauber Costa Oct. 9, 2009, 9:34 p.m. UTC | #17
On Fri, Oct 09, 2009 at 09:55:13PM +0200, Juan Quintela wrote:
> Jamie Lokier <jamie@shareable.org> wrote:
> > Glauber Costa wrote:
> >> On Thu, Oct 08, 2009 at 06:22:48PM +0200, Gleb Natapov wrote:
> >> > On Thu, Oct 08, 2009 at 06:17:57PM +0200, Avi Kivity wrote:
> >> > > On 10/08/2009 06:07 PM, Jamie Lokier wrote:
> >> > > >Haven't we already confirmed that it *isn't* just an ioapic accelerator
> >> > > >because you can't migrate between in-kernel iopic and qemu's ioapic?
> >> > > 
> >> > > We haven't confirmed it.  Both implement the same spec, and if you
> >> > > can't migrate between them, one of them has a bug (for example, qemu
> >> > > ioapic doesn't implement polarity - but it's still just a bug).
> >> > > 
> >> > Are you saying that HW spec (that only describes software visible behavior)
> >> > completely defines implementation? No other internal state is needed
> >> > that may be done differently by different implementations?
> >> Most specifications leaves a lot as implementation specific.
> >> 
> >> It's not hard to imagine a case in which both devices will follow
> >> the spec correctly, (no bugs involved), and yet differ in the
> >> implementation.
> >
> > Avi's not saying the implementations won't differ.  I believe he's
> > saying that implementation-specific states don't need to be saved if
> > they have no effect on guest visible behaviour.
> 
> Just to re-state.  I would also prefer to have a single device.  Reasons
> (majority already told in the thread):
> - We can switch between devices more easily
> - They are emulating the same device.
> - At the moment that you have two different devices, one of them will
>   rot :(
> - Adding state to the save/load format that is used only from one device
>   is not a problem.
> 
> I notice that discussion is going nowhere, basically we are in the
> state:
> - people that want one device
>   * they emulate the same hardware
>   * lots of code is shared
>   * they should be interchageable
>   * if they are not interchageable, it is a bug
>   * once that they are split, it is basically imposible to join then
>     again.
> - people that want 2 devices:
>   * The devices can more easily diverge if they are two devices
>   * They are not interchageable now
>   * It allows you more freedom in changing any of them if they are
>     separate.
> 
> As you can see, none of the proposals is a clear winner.  And what is
> worse, we have the two maintainers (avi and anthony), the two with more
> experience having to deal with this kind of situation disagreeing.
> 
> How to fix the impass?
a deathmatch?
Anthony Liguori Oct. 12, 2009, 1:20 p.m. UTC | #18
Juan Quintela wrote:
> I notice that discussion is going nowhere, basically we are in the
> state:
> - people that want one device
>   * they emulate the same hardware
>   * lots of code is shared
>   * they should be interchageable
>   * if they are not interchageable, it is a bug
>   * once that they are split, it is basically imposible to join then
>     again.
> - people that want 2 devices:
>   * The devices can more easily diverge if they are two devices
>   * They are not interchageable now
>   * It allows you more freedom in changing any of them if they are
>     separate.
>
> As you can see, none of the proposals is a clear winner.  And what is
> worse, we have the two maintainers (avi and anthony), the two with more
> experience having to deal with this kind of situation disagreeing.
>
> How to fix the impass?
>   

We already have the single device model implementation and the 
limitations are well known.  The best way to move forward is for someone 
to send out patches implementing separate device models.

At that point, it becomes a discussion of two concrete pieces of code 
verses hand waving.

> Later, Juan.
>
Jamie Lokier Oct. 12, 2009, 2:18 p.m. UTC | #19
Anthony Liguori wrote:
> We already have the single device model implementation and the 
> limitations are well known.  The best way to move forward is for someone 
> to send out patches implementing separate device models.
> 
> At that point, it becomes a discussion of two concrete pieces of code 
> verses hand waving.

Out of curiosity now, what _are_ the behavioural differences between
the in-kernel irqchip and the qemu one?

Are the differences significant to guests, such that it might be
necessary to disable the in-kernel irqchip for some guests, or
conversely, necessary to use KVM for some guests?

Thanks,
-- Jamie
Anthony Liguori Oct. 12, 2009, 2:49 p.m. UTC | #20
Jamie Lokier wrote:
> Anthony Liguori wrote:
>   
>> We already have the single device model implementation and the 
>> limitations are well known.  The best way to move forward is for someone 
>> to send out patches implementing separate device models.
>>
>> At that point, it becomes a discussion of two concrete pieces of code 
>> verses hand waving.
>>     
>
> Out of curiosity now, what _are_ the behavioural differences between
> the in-kernel irqchip and the qemu one?
>
> Are the differences significant to guests, such that it might be
> necessary to disable the in-kernel irqchip for some guests, or
> conversely, necessary to use KVM for some guests?
>   

No, the behavior differences are not terribly significant for the apic.  
Disabling it is really most useful for debugging purposes.

Regards,

Anthony Liguori
diff mbox

Patch

diff --git a/hw/ioapic.c b/hw/ioapic.c
index d475654..d6a9dac 100644
--- a/hw/ioapic.c
+++ b/hw/ioapic.c
@@ -24,6 +24,7 @@ 
 #include "pc.h"
 #include "qemu-timer.h"
 #include "host-utils.h"
+#include "kvm.h"
 
 //#define DEBUG_IOAPIC
 
@@ -193,6 +194,79 @@  static void ioapic_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t va
     }
 }
 
+static int kvm_kernel_ioapic_load_from_user(IOAPICState *s)
+{
+    int r = 0;
+#if defined(KVM_CAP_IRQCHIP) && defined(TARGET_I386)
+    struct kvm_irqchip chip;
+    struct kvm_ioapic_state *kioapic;
+    int i;
+
+    if (!(kvm_enabled() && kvm_irqchip_in_kernel()))
+        return 0;
+
+    chip.chip_id = KVM_IRQCHIP_IOAPIC;
+    kioapic = &chip.chip.ioapic;
+
+    kioapic->id = s->id;
+    kioapic->ioregsel = s->ioregsel;
+    kioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
+    kioapic->irr = s->irr;
+    for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+        kioapic->redirtbl[i].bits = s->ioredtbl[i];
+    }
+
+    r = kvm_set_irqchip(&chip);
+#endif
+    return r;
+}
+
+static void kvm_kernel_ioapic_save_to_user(IOAPICState *s)
+{
+#if defined(KVM_CAP_IRQCHIP) && defined(TARGET_I386)
+    struct kvm_irqchip chip;
+    struct kvm_ioapic_state *kioapic;
+    int i;
+
+    if (!(kvm_enabled() && kvm_irqchip_in_kernel()))
+        return;
+    chip.chip_id = KVM_IRQCHIP_IOAPIC;
+    kvm_get_irqchip(&chip);
+    kioapic = &chip.chip.ioapic;
+
+    s->id = kioapic->id;
+    s->ioregsel = kioapic->ioregsel;
+    s->irr = kioapic->irr;
+    for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+        s->ioredtbl[i] = kioapic->redirtbl[i].bits;
+    }
+#endif
+}
+
+static void ioapic_pre_save(void *opaque)
+{
+    IOAPICState *s = (void *)opaque;
+
+    kvm_kernel_ioapic_save_to_user(s);
+}
+
+static int ioapic_pre_load(void *opaque)
+{
+    IOAPICState *s = opaque;
+
+    /* in case we are doing version 1, we just set these to sane values */
+    s->irr = 0;
+    return 0;
+}
+
+static int ioapic_post_load(void *opaque, int version_id)
+{
+    IOAPICState *s = opaque;
+
+    return kvm_kernel_ioapic_load_from_user(s);
+}
+
+
 static const VMStateDescription vmstate_ioapic = {
     .name = "ioapic",
     .version_id = 2,
@@ -205,7 +279,10 @@  static const VMStateDescription vmstate_ioapic = {
         VMSTATE_UINT32_V(irr, IOAPICState, 2),
         VMSTATE_UINT64_ARRAY(ioredtbl, IOAPICState, IOAPIC_NUM_PINS),
         VMSTATE_END_OF_LIST()
-    }
+    },
+    .pre_load = ioapic_pre_load,
+    .post_load = ioapic_post_load,
+    .pre_save = ioapic_pre_save,
 };
 
 static void ioapic_reset(void *opaque)
@@ -217,6 +294,11 @@  static void ioapic_reset(void *opaque)
     s->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
     for(i = 0; i < IOAPIC_NUM_PINS; i++)
         s->ioredtbl[i] = 1 << 16; /* mask LVT */
+#ifdef KVM_CAP_IRQCHIP
+    if (kvm_enabled() && kvm_irqchip_in_kernel()) {
+        kvm_kernel_ioapic_load_from_user(s);
+    }
+#endif
 }
 
 static CPUReadMemoryFunc * const ioapic_mem_read[3] = {
diff --git a/kvm-all.c b/kvm-all.c
index 48ae26c..d795285 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -411,6 +411,26 @@  int kvm_check_extension(KVMState *s, unsigned int extension)
     return ret;
 }
 
+#ifdef KVM_CAP_IRQCHIP
+int kvm_set_irqchip(struct kvm_irqchip *chip)
+{
+    if (!kvm_state->irqchip_in_kernel) {
+        return 0;
+    }
+
+    return kvm_vm_ioctl(kvm_state, KVM_SET_IRQCHIP, chip);
+}
+
+int kvm_get_irqchip(struct kvm_irqchip *chip)
+{
+    if (!kvm_state->irqchip_in_kernel) {
+        return 0;
+    }
+
+    return kvm_vm_ioctl(kvm_state, KVM_GET_IRQCHIP, chip);
+}
+#endif
+
 int kvm_init(int smp_cpus)
 {
     static const char upgrade_note[] =
diff --git a/kvm.h b/kvm.h
index e7d5beb..8d4afa0 100644
--- a/kvm.h
+++ b/kvm.h
@@ -16,6 +16,7 @@ 
 
 #include "config.h"
 #include "qemu-queue.h"
+#include <linux/kvm.h>
 
 #ifdef CONFIG_KVM
 extern int kvm_allowed;
@@ -63,6 +64,9 @@  int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap);
 int kvm_pit_in_kernel(void);
 int kvm_irqchip_in_kernel(void);
 
+int kvm_set_irqchip(struct kvm_irqchip *chip);
+int kvm_get_irqchip(struct kvm_irqchip *chip);
+
 /* internal API */
 
 struct KVMState;