Message ID | 1254953315-5761-4-git-send-email-glommer@redhat.com |
---|---|
State | Changes Requested |
Headers | show |
Glauber Costa wrote: > This patch provides kvm with an in-kernel ioapic. We are currently not enabling it. > The code is heavily based on what's in qemu-kvm.git. > > Signed-off-by: Glauber Costa <glommer@redhat.com> > --- > hw/ioapic.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- > kvm-all.c | 20 ++++++++++++++ > kvm.h | 4 +++ > 3 files changed, 107 insertions(+), 1 deletions(-) > > diff --git a/hw/ioapic.c b/hw/ioapic.c > index d475654..d6a9dac 100644 > --- a/hw/ioapic.c > +++ b/hw/ioapic.c > @@ -24,6 +24,7 @@ > #include "pc.h" > #include "qemu-timer.h" > #include "host-utils.h" > +#include "kvm.h" > > //#define DEBUG_IOAPIC > > @@ -193,6 +194,79 @@ static void ioapic_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t va > } > } > > +static int kvm_kernel_ioapic_load_from_user(IOAPICState *s) > +{ > + int r = 0; > +#if defined(KVM_CAP_IRQCHIP) && defined(TARGET_I386) Hmm, here we additionally depend on TARGET_I386... > + struct kvm_irqchip chip; > + struct kvm_ioapic_state *kioapic; > + int i; > + > + if (!(kvm_enabled() && kvm_irqchip_in_kernel())) > + return 0; > + > + chip.chip_id = KVM_IRQCHIP_IOAPIC; > + kioapic = &chip.chip.ioapic; > + > + kioapic->id = s->id; > + kioapic->ioregsel = s->ioregsel; > + kioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; > + kioapic->irr = s->irr; > + for (i = 0; i < IOAPIC_NUM_PINS; i++) { > + kioapic->redirtbl[i].bits = s->ioredtbl[i]; > + } > + > + r = kvm_set_irqchip(&chip); > +#endif > + return r; > +} > + > +static void kvm_kernel_ioapic_save_to_user(IOAPICState *s) > +{ > +#if defined(KVM_CAP_IRQCHIP) && defined(TARGET_I386) > + struct kvm_irqchip chip; > + struct kvm_ioapic_state *kioapic; > + int i; > + > + if (!(kvm_enabled() && kvm_irqchip_in_kernel())) > + return; > + chip.chip_id = KVM_IRQCHIP_IOAPIC; > + kvm_get_irqchip(&chip); > + kioapic = &chip.chip.ioapic; > + > + s->id = kioapic->id; > + s->ioregsel = kioapic->ioregsel; > + s->irr = kioapic->irr; > + for (i = 0; i < IOAPIC_NUM_PINS; i++) { > + s->ioredtbl[i] = kioapic->redirtbl[i].bits; > + } > +#endif > +} > + > +static void ioapic_pre_save(void *opaque) > +{ > + IOAPICState *s = (void *)opaque; > + > + kvm_kernel_ioapic_save_to_user(s); > +} > + > +static int ioapic_pre_load(void *opaque) > +{ > + IOAPICState *s = opaque; > + > + /* in case we are doing version 1, we just set these to sane values */ > + s->irr = 0; > + return 0; > +} > + > +static int ioapic_post_load(void *opaque, int version_id) > +{ > + IOAPICState *s = opaque; > + > + return kvm_kernel_ioapic_load_from_user(s); > +} > + > + > static const VMStateDescription vmstate_ioapic = { > .name = "ioapic", > .version_id = 2, > @@ -205,7 +279,10 @@ static const VMStateDescription vmstate_ioapic = { > VMSTATE_UINT32_V(irr, IOAPICState, 2), > VMSTATE_UINT64_ARRAY(ioredtbl, IOAPICState, IOAPIC_NUM_PINS), > VMSTATE_END_OF_LIST() > - } > + }, > + .pre_load = ioapic_pre_load, > + .post_load = ioapic_post_load, > + .pre_save = ioapic_pre_save, > }; > > static void ioapic_reset(void *opaque) > @@ -217,6 +294,11 @@ static void ioapic_reset(void *opaque) > s->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; > for(i = 0; i < IOAPIC_NUM_PINS; i++) > s->ioredtbl[i] = 1 << 16; /* mask LVT */ > +#ifdef KVM_CAP_IRQCHIP ... but here only KVM_CAP_IRQCHIP suffices? > + if (kvm_enabled() && kvm_irqchip_in_kernel()) { > + kvm_kernel_ioapic_load_from_user(s); > + } > +#endif Independent of this, both kvm_irqchip_in_kernel() and kvm_kernel_ioapic_load_from_user() should also be defined for the !KVM_CAP_IRQCHIP case to avoid #ifdefs here and in many other places this series touches. > } > > static CPUReadMemoryFunc * const ioapic_mem_read[3] = { > diff --git a/kvm-all.c b/kvm-all.c > index 48ae26c..d795285 100644 > --- a/kvm-all.c > +++ b/kvm-all.c > @@ -411,6 +411,26 @@ int kvm_check_extension(KVMState *s, unsigned int extension) > return ret; > } > > +#ifdef KVM_CAP_IRQCHIP Again, only KVM_CAP_IRQCHIP - what is the actually required dependency? > +int kvm_set_irqchip(struct kvm_irqchip *chip) > +{ > + if (!kvm_state->irqchip_in_kernel) { > + return 0; > + } > + > + return kvm_vm_ioctl(kvm_state, KVM_SET_IRQCHIP, chip); > +} > + > +int kvm_get_irqchip(struct kvm_irqchip *chip) > +{ > + if (!kvm_state->irqchip_in_kernel) { > + return 0; > + } > + > + return kvm_vm_ioctl(kvm_state, KVM_GET_IRQCHIP, chip); > +} > +#endif > + > int kvm_init(int smp_cpus) > { > static const char upgrade_note[] = > diff --git a/kvm.h b/kvm.h > index e7d5beb..8d4afa0 100644 > --- a/kvm.h > +++ b/kvm.h > @@ -16,6 +16,7 @@ > > #include "config.h" > #include "qemu-queue.h" > +#include <linux/kvm.h> > > #ifdef CONFIG_KVM > extern int kvm_allowed; > @@ -63,6 +64,9 @@ int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap); > int kvm_pit_in_kernel(void); > int kvm_irqchip_in_kernel(void); > > +int kvm_set_irqchip(struct kvm_irqchip *chip); > +int kvm_get_irqchip(struct kvm_irqchip *chip); > + > /* internal API */ > > struct KVMState; Jan
Glauber Costa wrote: > This patch provides kvm with an in-kernel ioapic. We are currently not enabling it. > The code is heavily based on what's in qemu-kvm.git. > It really ought to be it's own file and own device model. Having the code mixed in with ioapic.c is confusing because it's unclear what code is in use when the in-kernel model is used. > @@ -193,6 +194,79 @@ static void ioapic_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t va > } > } > > +static int kvm_kernel_ioapic_load_from_user(IOAPICState *s) > +{ > + int r = 0; > +#if defined(KVM_CAP_IRQCHIP) && defined(TARGET_I386) > No point in checking the KVM_CAP_IRQCHIP. Just require it during build. Otherwise, !KVM_CAP_IRQCHIP is dead code since I'm sure noone is actually testing kernels that old with modern qemu. There's no point in restricting to I386 either. > + struct kvm_irqchip chip; > + struct kvm_ioapic_state *kioapic; > + int i; > + > + if (!(kvm_enabled() && kvm_irqchip_in_kernel())) > + return 0; > + > + chip.chip_id = KVM_IRQCHIP_IOAPIC; > + kioapic = &chip.chip.ioapic; > + > + kioapic->id = s->id; > + kioapic->ioregsel = s->ioregsel; > + kioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; > + kioapic->irr = s->irr; > + for (i = 0; i < IOAPIC_NUM_PINS; i++) { > + kioapic->redirtbl[i].bits = s->ioredtbl[i]; > + } > + > + r = kvm_set_irqchip(&chip); > +#endif > + return r; > +} > + > +static void kvm_kernel_ioapic_save_to_user(IOAPICState *s) > +{ > +#if defined(KVM_CAP_IRQCHIP) && defined(TARGET_I386) > + struct kvm_irqchip chip; > + struct kvm_ioapic_state *kioapic; > + int i; > + > + if (!(kvm_enabled() && kvm_irqchip_in_kernel())) > + return; > + chip.chip_id = KVM_IRQCHIP_IOAPIC; > + kvm_get_irqchip(&chip); > + kioapic = &chip.chip.ioapic; > + > + s->id = kioapic->id; > + s->ioregsel = kioapic->ioregsel; > + s->irr = kioapic->irr; > + for (i = 0; i < IOAPIC_NUM_PINS; i++) { > + s->ioredtbl[i] = kioapic->redirtbl[i].bits; > + } > +#endif > +} > + > +static void ioapic_pre_save(void *opaque) > +{ > + IOAPICState *s = (void *)opaque; > + > + kvm_kernel_ioapic_save_to_user(s); > +} > + > +static int ioapic_pre_load(void *opaque) > +{ > + IOAPICState *s = opaque; > + > + /* in case we are doing version 1, we just set these to sane values */ > + s->irr = 0; > + return 0; > +} > + > +static int ioapic_post_load(void *opaque, int version_id) > +{ > + IOAPICState *s = opaque; > + > + return kvm_kernel_ioapic_load_from_user(s); > +} > + > + > static const VMStateDescription vmstate_ioapic = { > .name = "ioapic", > .version_id = 2, > @@ -205,7 +279,10 @@ static const VMStateDescription vmstate_ioapic = { > VMSTATE_UINT32_V(irr, IOAPICState, 2), > VMSTATE_UINT64_ARRAY(ioredtbl, IOAPICState, IOAPIC_NUM_PINS), > VMSTATE_END_OF_LIST() > - } > + }, > + .pre_load = ioapic_pre_load, > + .post_load = ioapic_post_load, > + .pre_save = ioapic_pre_save, > }; > The in kernel apic should be a separate device model with a separate savevm section. They are different devices and there's no real advantage to pretending like they're the same device. > static CPUReadMemoryFunc * const ioapic_mem_read[3] = { > diff --git a/kvm-all.c b/kvm-all.c > index 48ae26c..d795285 100644 > --- a/kvm-all.c > +++ b/kvm-all.c > @@ -411,6 +411,26 @@ int kvm_check_extension(KVMState *s, unsigned int extension) > return ret; > } > > +#ifdef KVM_CAP_IRQCHIP > +int kvm_set_irqchip(struct kvm_irqchip *chip) > +{ > + if (!kvm_state->irqchip_in_kernel) { > + return 0; > + } > + > + return kvm_vm_ioctl(kvm_state, KVM_SET_IRQCHIP, chip); > +} > irqchip_in_kernel ought to disappear.
On 10/08/2009 03:49 PM, Anthony Liguori wrote: > Glauber Costa wrote: >> This patch provides kvm with an in-kernel ioapic. We are currently >> not enabling it. >> The code is heavily based on what's in qemu-kvm.git. > > It really ought to be it's own file and own device model. Having the > code mixed in with ioapic.c is confusing because it's unclear what > code is in use when the in-kernel model is used. I disagree. It's the same device with the same guest-visible interface and the same host-visible interface (save/restore, 'info ioapic' if we write one). Splitting it into two files will only result in code duplication. Think of it as an ioapic accelerator.
Avi Kivity wrote: > On 10/08/2009 03:49 PM, Anthony Liguori wrote: >> Glauber Costa wrote: >>> This patch provides kvm with an in-kernel ioapic. We are currently >>> not enabling it. >>> The code is heavily based on what's in qemu-kvm.git. >> >> It really ought to be it's own file and own device model. Having the >> code mixed in with ioapic.c is confusing because it's unclear what >> code is in use when the in-kernel model is used. > > I disagree. It's the same device with the same guest-visible interface > and the same host-visible interface (save/restore, 'info ioapic' if we > write one). Splitting it into two files will only result in code > duplication. Shared functions can be pushed into a commonly used module (ioapic-common.c). And everyone touching it should realize then that it could affect more than one user. Jan
Avi Kivity wrote: > On 10/08/2009 03:49 PM, Anthony Liguori wrote: > >Glauber Costa wrote: > >>This patch provides kvm with an in-kernel ioapic. We are currently > >>not enabling it. > >>The code is heavily based on what's in qemu-kvm.git. > > > >It really ought to be it's own file and own device model. Having the > >code mixed in with ioapic.c is confusing because it's unclear what > >code is in use when the in-kernel model is used. > > I disagree. It's the same device with the same guest-visible interface > and the same host-visible interface (save/restore, 'info ioapic' if we > write one). Splitting it into two files will only result in code > duplication. > > Think of it as an ioapic accelerator. Haven't we already confirmed that it *isn't* just an ioapic accelerator because you can't migrate between in-kernel iopic and qemu's ioapic? Imho, if they cannot be swapped transparently, they are different device emulations. OF course there's nothing wrong with sharing lots of code. Maybe ioapic.c and ioapic-kvm.c, with shared code in ioapic-common.c? -- Jamie
Jamie Lokier wrote: > Avi Kivity wrote: > >> On 10/08/2009 03:49 PM, Anthony Liguori wrote: >> >>> Glauber Costa wrote: >>> >>>> This patch provides kvm with an in-kernel ioapic. We are currently >>>> not enabling it. >>>> The code is heavily based on what's in qemu-kvm.git. >>>> >>> It really ought to be it's own file and own device model. Having the >>> code mixed in with ioapic.c is confusing because it's unclear what >>> code is in use when the in-kernel model is used. >>> >> I disagree. It's the same device with the same guest-visible interface >> and the same host-visible interface (save/restore, 'info ioapic' if we >> write one). Splitting it into two files will only result in code >> duplication. >> >> Think of it as an ioapic accelerator. >> > > Haven't we already confirmed that it *isn't* just an ioapic accelerator > because you can't migrate between in-kernel iopic and qemu's ioapic? > > Imho, if they cannot be swapped transparently, they are different > device emulations. > > OF course there's nothing wrong with sharing lots of code. > If you avoid having a common save format, you get an overall reduction in code size and there's virtually no code to share.
On 10/08/2009 06:07 PM, Jamie Lokier wrote: > Haven't we already confirmed that it *isn't* just an ioapic accelerator > because you can't migrate between in-kernel iopic and qemu's ioapic? > We haven't confirmed it. Both implement the same spec, and if you can't migrate between them, one of them has a bug (for example, qemu ioapic doesn't implement polarity - but it's still just a bug).
On Thu, Oct 08, 2009 at 06:17:57PM +0200, Avi Kivity wrote: > On 10/08/2009 06:07 PM, Jamie Lokier wrote: > >Haven't we already confirmed that it *isn't* just an ioapic accelerator > >because you can't migrate between in-kernel iopic and qemu's ioapic? > > We haven't confirmed it. Both implement the same spec, and if you > can't migrate between them, one of them has a bug (for example, qemu > ioapic doesn't implement polarity - but it's still just a bug). > Are you saying that HW spec (that only describes software visible behavior) completely defines implementation? No other internal state is needed that may be done differently by different implementations? -- Gleb.
On 10/08/2009 06:22 PM, Gleb Natapov wrote: > On Thu, Oct 08, 2009 at 06:17:57PM +0200, Avi Kivity wrote: > >> On 10/08/2009 06:07 PM, Jamie Lokier wrote: >> >>> Haven't we already confirmed that it *isn't* just an ioapic accelerator >>> because you can't migrate between in-kernel iopic and qemu's ioapic? >>> >> We haven't confirmed it. Both implement the same spec, and if you >> can't migrate between them, one of them has a bug (for example, qemu >> ioapic doesn't implement polarity - but it's still just a bug). >> >> > Are you saying that HW spec (that only describes software visible behavior) > completely defines implementation? No other internal state is needed > that may be done differently by different implementations? > It may be done differently (for example, selecting the cpu to deliver the interrupt to), but as the guest cannot rely on the differences, there's no need to save the state that can cause these differences.
On Thu, Oct 08, 2009 at 06:29:53PM +0200, Avi Kivity wrote: > On 10/08/2009 06:22 PM, Gleb Natapov wrote: > >On Thu, Oct 08, 2009 at 06:17:57PM +0200, Avi Kivity wrote: > >>On 10/08/2009 06:07 PM, Jamie Lokier wrote: > >>>Haven't we already confirmed that it *isn't* just an ioapic accelerator > >>>because you can't migrate between in-kernel iopic and qemu's ioapic? > >>We haven't confirmed it. Both implement the same spec, and if you > >>can't migrate between them, one of them has a bug (for example, qemu > >>ioapic doesn't implement polarity - but it's still just a bug). > >> > >Are you saying that HW spec (that only describes software visible behavior) > >completely defines implementation? No other internal state is needed > >that may be done differently by different implementations? > > It may be done differently (for example, selecting the cpu to > deliver the interrupt to), but as the guest cannot rely on the > differences, there's no need to save the state that can cause these > differences. > So suppose I have simple watchdog device that required to be poked every second, otherwise it resets a computer. On migration we have to migrate time elapsed since last poke, but if device doesn't expose it to software in any way you are saying we can recreate is some other way? -- Gleb.
On 10/08/2009 06:34 PM, Gleb Natapov wrote: > So suppose I have simple watchdog device that required to be poked every > second, otherwise it resets a computer. On migration we have to migrate > time elapsed since last poke, but if device doesn't expose it to > software in any way you are saying we can recreate is some other way? > The time is exposed (you can measure it by poking the device and measuring the time till reset) and will be described in the spec (otherwise users will be surprised when their machine resets). You don't have to migrate all exposed state, just exposed state that the guest may rely on.
On Thu, Oct 08, 2009 at 06:42:07PM +0200, Avi Kivity wrote: > On 10/08/2009 06:34 PM, Gleb Natapov wrote: > >So suppose I have simple watchdog device that required to be poked every > >second, otherwise it resets a computer. On migration we have to migrate > >time elapsed since last poke, but if device doesn't expose it to > >software in any way you are saying we can recreate is some other way? > > The time is exposed (you can measure it by poking the device and The time yes, not its internal representation. What if one implementation stores how much time passed and another how much time left. One counts in wall clack another only when guest runs. etc... and this is a device with only one write only register. > measuring the time till reset) and will be described in the spec > (otherwise users will be surprised when their machine resets). > > You don't have to migrate all exposed state, just exposed state that > the guest may rely on. -- Gleb.
Gleb Natapov wrote: > On Thu, Oct 08, 2009 at 06:42:07PM +0200, Avi Kivity wrote: > > On 10/08/2009 06:34 PM, Gleb Natapov wrote: > > >So suppose I have simple watchdog device that required to be poked every > > >second, otherwise it resets a computer. On migration we have to migrate > > >time elapsed since last poke, but if device doesn't expose it to > > >software in any way you are saying we can recreate is some other way? > > > > The time is exposed (you can measure it by poking the device and > The time yes, not its internal representation. What if one implementation > stores how much time passed and another how much time left. > One counts in wall clack another only when guest runs. etc... and > this is a device with only one write only register. In that case you can decide between calling it two different devices (which have the same guest-visible behaviour but are not interchangable), or calling them different implementations of one device - by adding a little more code to save state in a common format. (Although they may have to be different devices for qemu configuration, it's ok for them to have the same PCI IDs and for the guest not to know the difference) For your watchdog example, it's not hard to pick a saved state which works for both. ioapic will be harder to find a useful common saved state, and there might need to be an *optional hint* section (e.g. for selecting the next CPU to get an interrupt), but I think it would be worth it in this case. Being able to load a KVM image into TCG and vice versa is quite useful sometimes. E.g. I've had to do some OS installs using TCG at first, then switch to KVM later for performance. -- Jamie
On Fri, Oct 09, 2009 at 11:02:31AM +0100, Jamie Lokier wrote: > Gleb Natapov wrote: > > On Thu, Oct 08, 2009 at 06:42:07PM +0200, Avi Kivity wrote: > > > On 10/08/2009 06:34 PM, Gleb Natapov wrote: > > > >So suppose I have simple watchdog device that required to be poked every > > > >second, otherwise it resets a computer. On migration we have to migrate > > > >time elapsed since last poke, but if device doesn't expose it to > > > >software in any way you are saying we can recreate is some other way? > > > > > > The time is exposed (you can measure it by poking the device and > > The time yes, not its internal representation. What if one implementation > > stores how much time passed and another how much time left. > > One counts in wall clack another only when guest runs. etc... and > > this is a device with only one write only register. > > In that case you can decide between calling it two different devices > (which have the same guest-visible behaviour but are not > interchangable), or calling them different implementations of one > device - by adding a little more code to save state in a common format. > That is what currency done for in-kernel/out-of-kernel irq chips. Save state transformation. The problem begins if one of the devices has more state (not just the same state but in a different format). You need to drop info on migration. > (Although they may have to be different devices for qemu > configuration, it's ok for them to have the same PCI IDs and for the > guest not to know the difference) > > For your watchdog example, it's not hard to pick a saved state which > works for both. If you can't migrate from one to the other why even bother? In my example if one device counts wallclock time and another guest cpu time you can't migrate from one implementation to another. > > ioapic will be harder to find a useful common saved state, and there > might need to be an *optional hint* section (e.g. for selecting the > next CPU to get an interrupt), but I think it would be worth it in > this case. Being able to load a KVM image into TCG and vice versa is > quite useful sometimes. E.g. I've had to do some OS installs using > TCG at first, then switch to KVM later for performance. > Reboot :) -- Gleb.
On Thu, Oct 08, 2009 at 06:22:48PM +0200, Gleb Natapov wrote: > On Thu, Oct 08, 2009 at 06:17:57PM +0200, Avi Kivity wrote: > > On 10/08/2009 06:07 PM, Jamie Lokier wrote: > > >Haven't we already confirmed that it *isn't* just an ioapic accelerator > > >because you can't migrate between in-kernel iopic and qemu's ioapic? > > > > We haven't confirmed it. Both implement the same spec, and if you > > can't migrate between them, one of them has a bug (for example, qemu > > ioapic doesn't implement polarity - but it's still just a bug). > > > Are you saying that HW spec (that only describes software visible behavior) > completely defines implementation? No other internal state is needed > that may be done differently by different implementations? Most specifications leaves a lot as implementation specific. It's not hard to imagine a case in which both devices will follow the spec correctly, (no bugs involved), and yet differ in the implementation.
Glauber Costa wrote: > On Thu, Oct 08, 2009 at 06:22:48PM +0200, Gleb Natapov wrote: > > On Thu, Oct 08, 2009 at 06:17:57PM +0200, Avi Kivity wrote: > > > On 10/08/2009 06:07 PM, Jamie Lokier wrote: > > > >Haven't we already confirmed that it *isn't* just an ioapic accelerator > > > >because you can't migrate between in-kernel iopic and qemu's ioapic? > > > > > > We haven't confirmed it. Both implement the same spec, and if you > > > can't migrate between them, one of them has a bug (for example, qemu > > > ioapic doesn't implement polarity - but it's still just a bug). > > > > > Are you saying that HW spec (that only describes software visible behavior) > > completely defines implementation? No other internal state is needed > > that may be done differently by different implementations? > Most specifications leaves a lot as implementation specific. > > It's not hard to imagine a case in which both devices will follow > the spec correctly, (no bugs involved), and yet differ in the > implementation. Avi's not saying the implementations won't differ. I believe he's saying that implementation-specific states don't need to be saved if they have no effect on guest visible behaviour. -- Jamie
On Fri, Oct 09, 2009 at 09:55:13PM +0200, Juan Quintela wrote: > Jamie Lokier <jamie@shareable.org> wrote: > > Glauber Costa wrote: > >> On Thu, Oct 08, 2009 at 06:22:48PM +0200, Gleb Natapov wrote: > >> > On Thu, Oct 08, 2009 at 06:17:57PM +0200, Avi Kivity wrote: > >> > > On 10/08/2009 06:07 PM, Jamie Lokier wrote: > >> > > >Haven't we already confirmed that it *isn't* just an ioapic accelerator > >> > > >because you can't migrate between in-kernel iopic and qemu's ioapic? > >> > > > >> > > We haven't confirmed it. Both implement the same spec, and if you > >> > > can't migrate between them, one of them has a bug (for example, qemu > >> > > ioapic doesn't implement polarity - but it's still just a bug). > >> > > > >> > Are you saying that HW spec (that only describes software visible behavior) > >> > completely defines implementation? No other internal state is needed > >> > that may be done differently by different implementations? > >> Most specifications leaves a lot as implementation specific. > >> > >> It's not hard to imagine a case in which both devices will follow > >> the spec correctly, (no bugs involved), and yet differ in the > >> implementation. > > > > Avi's not saying the implementations won't differ. I believe he's > > saying that implementation-specific states don't need to be saved if > > they have no effect on guest visible behaviour. > > Just to re-state. I would also prefer to have a single device. Reasons > (majority already told in the thread): > - We can switch between devices more easily > - They are emulating the same device. > - At the moment that you have two different devices, one of them will > rot :( > - Adding state to the save/load format that is used only from one device > is not a problem. > > I notice that discussion is going nowhere, basically we are in the > state: > - people that want one device > * they emulate the same hardware > * lots of code is shared > * they should be interchageable > * if they are not interchageable, it is a bug > * once that they are split, it is basically imposible to join then > again. > - people that want 2 devices: > * The devices can more easily diverge if they are two devices > * They are not interchageable now > * It allows you more freedom in changing any of them if they are > separate. > > As you can see, none of the proposals is a clear winner. And what is > worse, we have the two maintainers (avi and anthony), the two with more > experience having to deal with this kind of situation disagreeing. > > How to fix the impass? a deathmatch?
Juan Quintela wrote: > I notice that discussion is going nowhere, basically we are in the > state: > - people that want one device > * they emulate the same hardware > * lots of code is shared > * they should be interchageable > * if they are not interchageable, it is a bug > * once that they are split, it is basically imposible to join then > again. > - people that want 2 devices: > * The devices can more easily diverge if they are two devices > * They are not interchageable now > * It allows you more freedom in changing any of them if they are > separate. > > As you can see, none of the proposals is a clear winner. And what is > worse, we have the two maintainers (avi and anthony), the two with more > experience having to deal with this kind of situation disagreeing. > > How to fix the impass? > We already have the single device model implementation and the limitations are well known. The best way to move forward is for someone to send out patches implementing separate device models. At that point, it becomes a discussion of two concrete pieces of code verses hand waving. > Later, Juan. >
Anthony Liguori wrote: > We already have the single device model implementation and the > limitations are well known. The best way to move forward is for someone > to send out patches implementing separate device models. > > At that point, it becomes a discussion of two concrete pieces of code > verses hand waving. Out of curiosity now, what _are_ the behavioural differences between the in-kernel irqchip and the qemu one? Are the differences significant to guests, such that it might be necessary to disable the in-kernel irqchip for some guests, or conversely, necessary to use KVM for some guests? Thanks, -- Jamie
Jamie Lokier wrote: > Anthony Liguori wrote: > >> We already have the single device model implementation and the >> limitations are well known. The best way to move forward is for someone >> to send out patches implementing separate device models. >> >> At that point, it becomes a discussion of two concrete pieces of code >> verses hand waving. >> > > Out of curiosity now, what _are_ the behavioural differences between > the in-kernel irqchip and the qemu one? > > Are the differences significant to guests, such that it might be > necessary to disable the in-kernel irqchip for some guests, or > conversely, necessary to use KVM for some guests? > No, the behavior differences are not terribly significant for the apic. Disabling it is really most useful for debugging purposes. Regards, Anthony Liguori
diff --git a/hw/ioapic.c b/hw/ioapic.c index d475654..d6a9dac 100644 --- a/hw/ioapic.c +++ b/hw/ioapic.c @@ -24,6 +24,7 @@ #include "pc.h" #include "qemu-timer.h" #include "host-utils.h" +#include "kvm.h" //#define DEBUG_IOAPIC @@ -193,6 +194,79 @@ static void ioapic_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t va } } +static int kvm_kernel_ioapic_load_from_user(IOAPICState *s) +{ + int r = 0; +#if defined(KVM_CAP_IRQCHIP) && defined(TARGET_I386) + struct kvm_irqchip chip; + struct kvm_ioapic_state *kioapic; + int i; + + if (!(kvm_enabled() && kvm_irqchip_in_kernel())) + return 0; + + chip.chip_id = KVM_IRQCHIP_IOAPIC; + kioapic = &chip.chip.ioapic; + + kioapic->id = s->id; + kioapic->ioregsel = s->ioregsel; + kioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; + kioapic->irr = s->irr; + for (i = 0; i < IOAPIC_NUM_PINS; i++) { + kioapic->redirtbl[i].bits = s->ioredtbl[i]; + } + + r = kvm_set_irqchip(&chip); +#endif + return r; +} + +static void kvm_kernel_ioapic_save_to_user(IOAPICState *s) +{ +#if defined(KVM_CAP_IRQCHIP) && defined(TARGET_I386) + struct kvm_irqchip chip; + struct kvm_ioapic_state *kioapic; + int i; + + if (!(kvm_enabled() && kvm_irqchip_in_kernel())) + return; + chip.chip_id = KVM_IRQCHIP_IOAPIC; + kvm_get_irqchip(&chip); + kioapic = &chip.chip.ioapic; + + s->id = kioapic->id; + s->ioregsel = kioapic->ioregsel; + s->irr = kioapic->irr; + for (i = 0; i < IOAPIC_NUM_PINS; i++) { + s->ioredtbl[i] = kioapic->redirtbl[i].bits; + } +#endif +} + +static void ioapic_pre_save(void *opaque) +{ + IOAPICState *s = (void *)opaque; + + kvm_kernel_ioapic_save_to_user(s); +} + +static int ioapic_pre_load(void *opaque) +{ + IOAPICState *s = opaque; + + /* in case we are doing version 1, we just set these to sane values */ + s->irr = 0; + return 0; +} + +static int ioapic_post_load(void *opaque, int version_id) +{ + IOAPICState *s = opaque; + + return kvm_kernel_ioapic_load_from_user(s); +} + + static const VMStateDescription vmstate_ioapic = { .name = "ioapic", .version_id = 2, @@ -205,7 +279,10 @@ static const VMStateDescription vmstate_ioapic = { VMSTATE_UINT32_V(irr, IOAPICState, 2), VMSTATE_UINT64_ARRAY(ioredtbl, IOAPICState, IOAPIC_NUM_PINS), VMSTATE_END_OF_LIST() - } + }, + .pre_load = ioapic_pre_load, + .post_load = ioapic_post_load, + .pre_save = ioapic_pre_save, }; static void ioapic_reset(void *opaque) @@ -217,6 +294,11 @@ static void ioapic_reset(void *opaque) s->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; for(i = 0; i < IOAPIC_NUM_PINS; i++) s->ioredtbl[i] = 1 << 16; /* mask LVT */ +#ifdef KVM_CAP_IRQCHIP + if (kvm_enabled() && kvm_irqchip_in_kernel()) { + kvm_kernel_ioapic_load_from_user(s); + } +#endif } static CPUReadMemoryFunc * const ioapic_mem_read[3] = { diff --git a/kvm-all.c b/kvm-all.c index 48ae26c..d795285 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -411,6 +411,26 @@ int kvm_check_extension(KVMState *s, unsigned int extension) return ret; } +#ifdef KVM_CAP_IRQCHIP +int kvm_set_irqchip(struct kvm_irqchip *chip) +{ + if (!kvm_state->irqchip_in_kernel) { + return 0; + } + + return kvm_vm_ioctl(kvm_state, KVM_SET_IRQCHIP, chip); +} + +int kvm_get_irqchip(struct kvm_irqchip *chip) +{ + if (!kvm_state->irqchip_in_kernel) { + return 0; + } + + return kvm_vm_ioctl(kvm_state, KVM_GET_IRQCHIP, chip); +} +#endif + int kvm_init(int smp_cpus) { static const char upgrade_note[] = diff --git a/kvm.h b/kvm.h index e7d5beb..8d4afa0 100644 --- a/kvm.h +++ b/kvm.h @@ -16,6 +16,7 @@ #include "config.h" #include "qemu-queue.h" +#include <linux/kvm.h> #ifdef CONFIG_KVM extern int kvm_allowed; @@ -63,6 +64,9 @@ int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap); int kvm_pit_in_kernel(void); int kvm_irqchip_in_kernel(void); +int kvm_set_irqchip(struct kvm_irqchip *chip); +int kvm_get_irqchip(struct kvm_irqchip *chip); + /* internal API */ struct KVMState;
This patch provides kvm with an in-kernel ioapic. We are currently not enabling it. The code is heavily based on what's in qemu-kvm.git. Signed-off-by: Glauber Costa <glommer@redhat.com> --- hw/ioapic.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- kvm-all.c | 20 ++++++++++++++ kvm.h | 4 +++ 3 files changed, 107 insertions(+), 1 deletions(-)