Message ID | 20091014143042.GD8092@mothafucka.localdomain |
---|---|
State | New |
Headers | show |
Specially Avi, comments on this one? 2009/10/14 Glauber Costa <glommer@redhat.com>: > Hello people, > > As I promised, I am sending a very brief PoC wrt split devices and in-kernel irqchip. > In this mail, I am including only the ioapic version for apreciation. I also have i8259, > and apic will take me a little bit more. This is just to try to bind the discussion to real > code. > > Note that we end up with a very slim representation of the device, and the code is much less > confusing, IMHO. > > > > Index: qemu/Makefile.target > =================================================================== > --- qemu.orig/Makefile.target > +++ qemu/Makefile.target > @@ -197,6 +197,8 @@ obj-i386-y += usb-uhci.o vmmouse.o vmpor > obj-i386-y += device-hotplug.o pci-hotplug.o smbios.o wdt_ib700.o > obj-i386-y += ne2000-isa.o > > +obj-i386-$(CONFIG_KVM) += ioapic-kvm.o > + > # shared objects > obj-ppc-y = ppc.o ide/core.o ide/qdev.o ide/isa.o ide/pci.o ide/macio.o > obj-ppc-y += vga.o vga-pci.o $(sound-obj-y) dma.o openpic.o > Index: qemu/hw/ioapic-kvm.c > =================================================================== > --- /dev/null > +++ qemu/hw/ioapic-kvm.c > @@ -0,0 +1,81 @@ > +#include "hw.h" > +#include "pc.h" > +#include "qemu-timer.h" > +#include "host-utils.h" > +#include "kvm.h" > + > +#define IOAPIC_NUM_PINS 0x18 > +#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000 > + > +static void ioapic_reset(void *opaque) > +{ > + struct kvm_ioapic_state *s = opaque; > + struct kvm_irqchip *chip; > + int i; > + > + chip = container_of(s, struct kvm_irqchip, chip.ioapic); > + > + chip->chip_id = KVM_IRQCHIP_IOAPIC; > + > + memset(s, 0, sizeof(*s)); > + s->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; > + for(i = 0; i < IOAPIC_NUM_PINS; i++) > + s->redirtbl[i].bits = 1 << 16; /* mask LVT */ > + > + kvm_set_irqchip(chip); > +} > + > +static void ioapic_pre_save(void *opaque) > +{ > + struct kvm_ioapic_state *s = opaque; > + struct kvm_irqchip *chip; > + > + chip = container_of(s, 
struct kvm_irqchip, chip.ioapic); > + > + kvm_get_irqchip(chip); > +} > + > +static int ioapic_post_load(void *opaque, int version_id) > +{ > + struct kvm_ioapic_state *s = opaque; > + struct kvm_irqchip *chip; > + > + chip = container_of(s, struct kvm_irqchip, chip.ioapic); > + > + return kvm_set_irqchip(chip); > +} > + > +static const VMStateDescription vmstate_kvm_ioapic = { > + .name = "ioapic-kvm", > + .version_id = 1, > + .minimum_version_id = 1, > + .post_load = ioapic_post_load, > + .pre_save = ioapic_pre_save, > + .fields = (VMStateField []) { > + VMSTATE_U64(base_address, struct kvm_ioapic_state), > + VMSTATE_UINT32(id, struct kvm_ioapic_state), > + VMSTATE_UINT32(ioregsel, struct kvm_ioapic_state), > + VMSTATE_UINT32(irr, struct kvm_ioapic_state), > + VMSTATE_ARRAY_UNSAFE(redirtbl, struct kvm_ioapic_state, IOAPIC_NUM_PINS, 0, vmstate_info_u64, __u64), > + VMSTATE_END_OF_LIST() > + } > +}; > + > + > +static void kvm_ioapic_set_irq(void *opaque, int vector, int level) > +{ > +} > + > +qemu_irq *kvm_ioapic_init(void) > +{ > + struct kvm_irqchip *s; > + > + s = qemu_mallocz(sizeof(*s)); > + > + ioapic_reset(&s->chip.ioapic); > + > + vmstate_register(0, &vmstate_kvm_ioapic, &s->chip.ioapic); > + qemu_register_reset(ioapic_reset, &s->chip.ioapic); > + > + return qemu_allocate_irqs(kvm_ioapic_set_irq, &s->chip.ioapic, IOAPIC_NUM_PINS); > +} > Index: qemu/hw/pc.h > =================================================================== > --- qemu.orig/hw/pc.h > +++ qemu/hw/pc.h > @@ -48,6 +48,8 @@ void ioapic_set_irq(void *opaque, int ve > void apic_reset_irq_delivered(void); > int apic_get_irq_delivered(void); > > +qemu_irq *kvm_ioapic_init(void); > + > /* i8254.c */ > > #define PIT_FREQ 1193182 > Index: qemu/kvm-all.c > =================================================================== > --- qemu.orig/kvm-all.c > +++ qemu/kvm-all.c > @@ -411,6 +411,26 @@ int kvm_check_extension(KVMState *s, uns > return ret; > } > > +#ifdef KVM_CAP_IRQCHIP > +int 
kvm_set_irqchip(struct kvm_irqchip *chip) > +{ > + if (!kvm_state->irqchip_in_kernel) { > + return 0; > + } > + > + return kvm_vm_ioctl(kvm_state, KVM_SET_IRQCHIP, chip); > +} > + > +int kvm_get_irqchip(struct kvm_irqchip *chip) > +{ > + if (!kvm_state->irqchip_in_kernel) { > + return 0; > + } > + > + return kvm_vm_ioctl(kvm_state, KVM_GET_IRQCHIP, chip); > +} > +#endif > + > int kvm_init(int smp_cpus) > { > static const char upgrade_note[] = > Index: qemu/kvm.h > =================================================================== > --- qemu.orig/kvm.h > +++ qemu/kvm.h > @@ -16,6 +16,7 @@ > > #include "config.h" > #include "qemu-queue.h" > +#include <linux/kvm.h> > > #ifdef CONFIG_KVM > extern int kvm_allowed; > @@ -63,6 +64,9 @@ int kvm_update_guest_debug(CPUState *env > int kvm_pit_in_kernel(void); > int kvm_irqchip_in_kernel(void); > > +int kvm_set_irqchip(struct kvm_irqchip *chip); > +int kvm_get_irqchip(struct kvm_irqchip *chip); > + > /* internal API */ > > struct KVMState; > >
On Wed, Oct 14, 2009 at 11:30:43AM -0300, Glauber Costa wrote: > Hello people, > > As I promised, I am sending a very brief PoC wrt split devices and in-kernel irqchip. > In this mail, I am including only the ioapic version for apreciation. I also have i8259, > and apic will take me a little bit more. This is just to try to bind the discussion to real > code. > > Note that we end up with a very slim representation of the device, and the code is much less > confusing, IMHO. Looks good to me. > Index: qemu/Makefile.target > =================================================================== > --- qemu.orig/Makefile.target > +++ qemu/Makefile.target > @@ -197,6 +197,8 @@ obj-i386-y += usb-uhci.o vmmouse.o vmpor > obj-i386-y += device-hotplug.o pci-hotplug.o smbios.o wdt_ib700.o > obj-i386-y += ne2000-isa.o > > +obj-i386-$(CONFIG_KVM) += ioapic-kvm.o > + > # shared objects > obj-ppc-y = ppc.o ide/core.o ide/qdev.o ide/isa.o ide/pci.o ide/macio.o > obj-ppc-y += vga.o vga-pci.o $(sound-obj-y) dma.o openpic.o > Index: qemu/hw/ioapic-kvm.c > =================================================================== > --- /dev/null > +++ qemu/hw/ioapic-kvm.c > @@ -0,0 +1,81 @@ > +#include "hw.h" > +#include "pc.h" > +#include "qemu-timer.h" > +#include "host-utils.h" > +#include "kvm.h" > + > +#define IOAPIC_NUM_PINS 0x18 > +#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000 > + > +static void ioapic_reset(void *opaque) > +{ > + struct kvm_ioapic_state *s = opaque; > + struct kvm_irqchip *chip; > + int i; > + > + chip = container_of(s, struct kvm_irqchip, chip.ioapic); > + > + chip->chip_id = KVM_IRQCHIP_IOAPIC; > + > + memset(s, 0, sizeof(*s)); > + s->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; > + for(i = 0; i < IOAPIC_NUM_PINS; i++) > + s->redirtbl[i].bits = 1 << 16; /* mask LVT */ > + > + kvm_set_irqchip(chip); > +} > + > +static void ioapic_pre_save(void *opaque) > +{ > + struct kvm_ioapic_state *s = opaque; > + struct kvm_irqchip *chip; > + > + chip = container_of(s, struct 
kvm_irqchip, chip.ioapic); > + > + kvm_get_irqchip(chip); > +} > + > +static int ioapic_post_load(void *opaque, int version_id) > +{ > + struct kvm_ioapic_state *s = opaque; > + struct kvm_irqchip *chip; > + > + chip = container_of(s, struct kvm_irqchip, chip.ioapic); > + > + return kvm_set_irqchip(chip); > +} > + > +static const VMStateDescription vmstate_kvm_ioapic = { > + .name = "ioapic-kvm", > + .version_id = 1, > + .minimum_version_id = 1, > + .post_load = ioapic_post_load, > + .pre_save = ioapic_pre_save, > + .fields = (VMStateField []) { > + VMSTATE_U64(base_address, struct kvm_ioapic_state), > + VMSTATE_UINT32(id, struct kvm_ioapic_state), > + VMSTATE_UINT32(ioregsel, struct kvm_ioapic_state), > + VMSTATE_UINT32(irr, struct kvm_ioapic_state), > + VMSTATE_ARRAY_UNSAFE(redirtbl, struct kvm_ioapic_state, IOAPIC_NUM_PINS, 0, vmstate_info_u64, __u64), > + VMSTATE_END_OF_LIST() > + } > +}; > + > + > +static void kvm_ioapic_set_irq(void *opaque, int vector, int level) > +{ > +} > + > +qemu_irq *kvm_ioapic_init(void) > +{ > + struct kvm_irqchip *s; > + > + s = qemu_mallocz(sizeof(*s)); > + > + ioapic_reset(&s->chip.ioapic); > + > + vmstate_register(0, &vmstate_kvm_ioapic, &s->chip.ioapic); > + qemu_register_reset(ioapic_reset, &s->chip.ioapic); > + > + return qemu_allocate_irqs(kvm_ioapic_set_irq, &s->chip.ioapic, IOAPIC_NUM_PINS); > +} > Index: qemu/hw/pc.h > =================================================================== > --- qemu.orig/hw/pc.h > +++ qemu/hw/pc.h > @@ -48,6 +48,8 @@ void ioapic_set_irq(void *opaque, int ve > void apic_reset_irq_delivered(void); > int apic_get_irq_delivered(void); > > +qemu_irq *kvm_ioapic_init(void); > + > /* i8254.c */ > > #define PIT_FREQ 1193182 > Index: qemu/kvm-all.c > =================================================================== > --- qemu.orig/kvm-all.c > +++ qemu/kvm-all.c > @@ -411,6 +411,26 @@ int kvm_check_extension(KVMState *s, uns > return ret; > } > > +#ifdef KVM_CAP_IRQCHIP > +int kvm_set_irqchip(struct 
kvm_irqchip *chip) > +{ > + if (!kvm_state->irqchip_in_kernel) { > + return 0; > + } > + > + return kvm_vm_ioctl(kvm_state, KVM_SET_IRQCHIP, chip); > +} > + > +int kvm_get_irqchip(struct kvm_irqchip *chip) > +{ > + if (!kvm_state->irqchip_in_kernel) { > + return 0; > + } > + > + return kvm_vm_ioctl(kvm_state, KVM_GET_IRQCHIP, chip); > +} > +#endif > + > int kvm_init(int smp_cpus) > { > static const char upgrade_note[] = > Index: qemu/kvm.h > =================================================================== > --- qemu.orig/kvm.h > +++ qemu/kvm.h > @@ -16,6 +16,7 @@ > > #include "config.h" > #include "qemu-queue.h" > +#include <linux/kvm.h> > > #ifdef CONFIG_KVM > extern int kvm_allowed; > @@ -63,6 +64,9 @@ int kvm_update_guest_debug(CPUState *env > int kvm_pit_in_kernel(void); > int kvm_irqchip_in_kernel(void); > > +int kvm_set_irqchip(struct kvm_irqchip *chip); > +int kvm_get_irqchip(struct kvm_irqchip *chip); > + > /* internal API */ > > struct KVMState;
On 10/14/2009 04:30 PM, Glauber Costa wrote: > Hello people, > > As I promised, I am sending a very brief PoC wrt split devices and in-kernel irqchip. > In this mail, I am including only the ioapic version for apreciation. I also have i8259, > and apic will take me a little bit more. This is just to try to bind the discussion to real > code. > > I still can't say I like it. The reset function is duplicated, the state representation (which is an ABI) is gratuitously forked. You can't save/restore in-kernel irqchip and userspace irqchip, even though where the code is located is an implementation detail. While we may not care much for the ioapic, it sets a bad precedent for vhost-net, where we'd like to migrate from non-vhost-net hosts to vhost-net hosts without the user noticing anything. > Note that we end up with a very slim representation of the device, and the code is much less > confusing, IMHO. > You can always remove if statements by duplicating the code and pushing the if one level upwards. In total, there is more code, and it is more confusing (since you need to deal with implementation details at a higher level).
On Sun, Oct 25, 2009 at 12:26:51PM +0200, Avi Kivity wrote: > On 10/14/2009 04:30 PM, Glauber Costa wrote: >> Hello people, >> >> As I promised, I am sending a very brief PoC wrt split devices and in-kernel irqchip. >> In this mail, I am including only the ioapic version for apreciation. I also have i8259, >> and apic will take me a little bit more. This is just to try to bind the discussion to real >> code. >> >> > > I still can't say I like it. The reset function is duplicated, the > state representation (which is an ABI) is gratuitously forked. > > You can't save/restore in-kernel irqchip and userspace irqchip, even > though where the code is located is an implementation detail. While we > may not care much for the ioapic, it sets a bad precedent for vhost-net, > where we'd like to migrate from non-vhost-net hosts to vhost-net hosts > without the user noticing anything. > >> Note that we end up with a very slim representation of the device, and the code is much less >> confusing, IMHO. >> > > You can always remove if statements by duplicating the code and pushing > the if one level upwards. In total, there is more code, and it is more > confusing (since you need to deal with implementation details at a > higher level). > It pretty much depends on your definition of confusing. Larger? Yes, probably. It has a separate file, and it doesn't matter how hard we fight duplicates, some will persist. But just the other day, I was on IRC with Anthony, trying to draw some conclusions about the behaviour of our ioapic. We went through it, and it took us quite a while to determine which pieces of code were being used, and which were not. This is pretty much what I mean by confusing. With the approach we are proposing, things get much more straightforward. > -- > error compiling committee.c: too many arguments to function >
Avi Kivity wrote: > On 10/14/2009 04:30 PM, Glauber Costa wrote: >> Hello people, >> >> As I promised, I am sending a very brief PoC wrt split devices and >> in-kernel irqchip. >> In this mail, I am including only the ioapic version for apreciation. >> I also have i8259, >> and apic will take me a little bit more. This is just to try to bind >> the discussion to real >> code. >> >> > > I still can't say I like it. The reset function is duplicated, the > state representation (which is an ABI) is gratuitously forked. > > You can't save/restore in-kernel irqchip and userspace irqchip, even > though where the code is located is an implementation detail. While > we may not care much for the ioapic, it sets a bad precedent for > vhost-net, where we'd like to migrate from non-vhost-net hosts to > vhost-net hosts without the user noticing anything. > >> Note that we end up with a very slim representation of the device, >> and the code is much less >> confusing, IMHO. >> > > You can always remove if statements by duplicating the code and > pushing the if one level upwards. In total, there is more code, and > it is more confusing (since you need to deal with implementation > details at a higher level). I'm surprised you feel this way. Maybe this is an issue of having the model in your head vs. not having it because the current in-kernel code is extremely confusing IMHO. When you look at ioapic.c in qemu-kvm, the first question I ask is, "which parts of this code are used when using in-kernel apic?". The answer is not at all obvious. To understand it, you have to first search for kvm_enabled() and you'll see that during save/restore the state is synced with the in-kernel state. However, it's not clear whether pio/mmio operations still get processed and certainly not clear whether ioapic_set_irq() is not called anymore. In fact, I think you have to start with the assumption that it is, which leads you to wonder why it doesn't do kvm_set_irq(). 
The answers are all subtle and have to do with weird things about how the isa irqs are allocated. It's extremely confusing to someone who doesn't know exactly what's going on. OTOH, the split model makes this all very obvious. Sure there's some duplication but at the end of the day, you don't have to understand very much to see what's going on. We just use userspace for device save/restore and reset support. Code readability wins in my mind over reducing a couple dozen lines of code. Regards, Anthony Liguori
Index: qemu/Makefile.target =================================================================== --- qemu.orig/Makefile.target +++ qemu/Makefile.target @@ -197,6 +197,8 @@ obj-i386-y += usb-uhci.o vmmouse.o vmpor obj-i386-y += device-hotplug.o pci-hotplug.o smbios.o wdt_ib700.o obj-i386-y += ne2000-isa.o +obj-i386-$(CONFIG_KVM) += ioapic-kvm.o + # shared objects obj-ppc-y = ppc.o ide/core.o ide/qdev.o ide/isa.o ide/pci.o ide/macio.o obj-ppc-y += vga.o vga-pci.o $(sound-obj-y) dma.o openpic.o Index: qemu/hw/ioapic-kvm.c =================================================================== --- /dev/null +++ qemu/hw/ioapic-kvm.c @@ -0,0 +1,81 @@ +#include "hw.h" +#include "pc.h" +#include "qemu-timer.h" +#include "host-utils.h" +#include "kvm.h" + +#define IOAPIC_NUM_PINS 0x18 +#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000 + +static void ioapic_reset(void *opaque) +{ + struct kvm_ioapic_state *s = opaque; + struct kvm_irqchip *chip; + int i; + + chip = container_of(s, struct kvm_irqchip, chip.ioapic); + + chip->chip_id = KVM_IRQCHIP_IOAPIC; + + memset(s, 0, sizeof(*s)); + s->base_address = IOAPIC_DEFAULT_BASE_ADDRESS; + for(i = 0; i < IOAPIC_NUM_PINS; i++) + s->redirtbl[i].bits = 1 << 16; /* mask LVT */ + + kvm_set_irqchip(chip); +} + +static void ioapic_pre_save(void *opaque) +{ + struct kvm_ioapic_state *s = opaque; + struct kvm_irqchip *chip; + + chip = container_of(s, struct kvm_irqchip, chip.ioapic); + + kvm_get_irqchip(chip); +} + +static int ioapic_post_load(void *opaque, int version_id) +{ + struct kvm_ioapic_state *s = opaque; + struct kvm_irqchip *chip; + + chip = container_of(s, struct kvm_irqchip, chip.ioapic); + + return kvm_set_irqchip(chip); +} + +static const VMStateDescription vmstate_kvm_ioapic = { + .name = "ioapic-kvm", + .version_id = 1, + .minimum_version_id = 1, + .post_load = ioapic_post_load, + .pre_save = ioapic_pre_save, + .fields = (VMStateField []) { + VMSTATE_U64(base_address, struct kvm_ioapic_state), + VMSTATE_UINT32(id, struct 
kvm_ioapic_state), + VMSTATE_UINT32(ioregsel, struct kvm_ioapic_state), + VMSTATE_UINT32(irr, struct kvm_ioapic_state), + VMSTATE_ARRAY_UNSAFE(redirtbl, struct kvm_ioapic_state, IOAPIC_NUM_PINS, 0, vmstate_info_u64, __u64), + VMSTATE_END_OF_LIST() + } +}; + + +static void kvm_ioapic_set_irq(void *opaque, int vector, int level) +{ +} + +qemu_irq *kvm_ioapic_init(void) +{ + struct kvm_irqchip *s; + + s = qemu_mallocz(sizeof(*s)); + + ioapic_reset(&s->chip.ioapic); + + vmstate_register(0, &vmstate_kvm_ioapic, &s->chip.ioapic); + qemu_register_reset(ioapic_reset, &s->chip.ioapic); + + return qemu_allocate_irqs(kvm_ioapic_set_irq, &s->chip.ioapic, IOAPIC_NUM_PINS); +} Index: qemu/hw/pc.h =================================================================== --- qemu.orig/hw/pc.h +++ qemu/hw/pc.h @@ -48,6 +48,8 @@ void ioapic_set_irq(void *opaque, int ve void apic_reset_irq_delivered(void); int apic_get_irq_delivered(void); +qemu_irq *kvm_ioapic_init(void); + /* i8254.c */ #define PIT_FREQ 1193182 Index: qemu/kvm-all.c =================================================================== --- qemu.orig/kvm-all.c +++ qemu/kvm-all.c @@ -411,6 +411,26 @@ int kvm_check_extension(KVMState *s, uns return ret; } +#ifdef KVM_CAP_IRQCHIP +int kvm_set_irqchip(struct kvm_irqchip *chip) +{ + if (!kvm_state->irqchip_in_kernel) { + return 0; + } + + return kvm_vm_ioctl(kvm_state, KVM_SET_IRQCHIP, chip); +} + +int kvm_get_irqchip(struct kvm_irqchip *chip) +{ + if (!kvm_state->irqchip_in_kernel) { + return 0; + } + + return kvm_vm_ioctl(kvm_state, KVM_GET_IRQCHIP, chip); +} +#endif + int kvm_init(int smp_cpus) { static const char upgrade_note[] = Index: qemu/kvm.h =================================================================== --- qemu.orig/kvm.h +++ qemu/kvm.h @@ -16,6 +16,7 @@ #include "config.h" #include "qemu-queue.h" +#include <linux/kvm.h> #ifdef CONFIG_KVM extern int kvm_allowed; @@ -63,6 +64,9 @@ int kvm_update_guest_debug(CPUState *env int kvm_pit_in_kernel(void); int 
kvm_irqchip_in_kernel(void); +int kvm_set_irqchip(struct kvm_irqchip *chip); +int kvm_get_irqchip(struct kvm_irqchip *chip); + /* internal API */ struct KVMState;