Message ID: ae30bba4024251d46e1d139d509c817fc4002b7e.1338799935.git.jan.kiszka@siemens.com
State: New
On Mon, Jun 04, 2012 at 10:52:13AM +0200, Jan Kiszka wrote:
> @@ -1089,6 +1093,14 @@ static void pci_set_irq(void *opaque, int irq_num, int level)
>          pci_change_irq_level(pci_dev, irq_num, change);
>  }
>  
> +PCIINTxRoute pci_device_route_intx_to_irq(PCIDevice *dev, int pin)
> +{
> +    PCIBus *bus = dev->host_bus;
> +
> +    assert(bus->route_intx_to_irq);
> +    return bus->route_intx_to_irq(bus->irq_opaque, dev->host_intx_pin[pin]);
> +}
> +
>  /***********************************************************/
>  /* monitor info on PCI */
> 

Just an idea: can devices cache this result, bypassing the
intx to irq lookup on data path?

> diff --git a/hw/pci.h b/hw/pci.h
> index 5b54e2d..bbba01e 100644
> --- a/hw/pci.h
> +++ b/hw/pci.h
> @@ -141,6 +141,15 @@ enum {
>  #define PCI_DEVICE_GET_CLASS(obj) \
>      OBJECT_GET_CLASS(PCIDeviceClass, (obj), TYPE_PCI_DEVICE)
>  
> +typedef struct PCIINTxRoute {
> +    enum {
> +        PCI_INTX_ENABLED,
> +        PCI_INTX_INVERTED,
> +        PCI_INTX_DISABLED,
> +    } mode;
> +    int irq;
> +} PCIINTxRoute;

Is this INTX route or IRQ route?
Is the INTX enabled/disabled/inverted or the IRQ?

I have the impression it's the IRQ, in the apic.
PCI INTX are never inverted they are always active low.
On 2012-06-07 16:32, Michael S. Tsirkin wrote:
> On Mon, Jun 04, 2012 at 10:52:13AM +0200, Jan Kiszka wrote:
>> @@ -1089,6 +1093,14 @@ static void pci_set_irq(void *opaque, int irq_num, int level)
>>          pci_change_irq_level(pci_dev, irq_num, change);
>>  }
>>  
>> +PCIINTxRoute pci_device_route_intx_to_irq(PCIDevice *dev, int pin)
>> +{
>> +    PCIBus *bus = dev->host_bus;
>> +
>> +    assert(bus->route_intx_to_irq);
>> +    return bus->route_intx_to_irq(bus->irq_opaque, dev->host_intx_pin[pin]);
>> +}
>> +
>>  /***********************************************************/
>>  /* monitor info on PCI */
>> 
> 
> Just an idea: can devices cache this result, bypassing the
> intx to irq lookup on data path?

That lookup is part of set_irq which we don't bypass so far and where
this is generally trivial. If we want to cache the effects of set_irq as
well, I guess things would become pretty complex (e.g. due to vmstate
compatibility), and I'm unsure if it would buy us much.

> 
>> diff --git a/hw/pci.h b/hw/pci.h
>> index 5b54e2d..bbba01e 100644
>> --- a/hw/pci.h
>> +++ b/hw/pci.h
>> @@ -141,6 +141,15 @@ enum {
>>  #define PCI_DEVICE_GET_CLASS(obj) \
>>      OBJECT_GET_CLASS(PCIDeviceClass, (obj), TYPE_PCI_DEVICE)
>>  
>> +typedef struct PCIINTxRoute {
>> +    enum {
>> +        PCI_INTX_ENABLED,
>> +        PCI_INTX_INVERTED,
>> +        PCI_INTX_DISABLED,
>> +    } mode;
>> +    int irq;
>> +} PCIINTxRoute;
> 
> Is this INTX route or IRQ route?
> Is the INTX enabled/disabled/inverted or the IRQ?
> 
> I have the impression it's the IRQ, in the apic.
> PCI INTX are never inverted they are always active low.

This should be considered as "the route *of* an INTx", not "to some
IRQ". I could call it PCIINTxToIRQRoute if you prefer, but it's a bit
lengthy.

Jan
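[As a rough illustration of what a caller gains from this API, a consumer might act on the returned route along the following lines. The example_wire_intx() and connect_intx_as_irq() names are invented for this sketch and are not part of the series.]

/* Placeholder for whatever backend hook the caller uses (e.g. an irqfd). */
static void connect_intx_as_irq(PCIDevice *dev, int pin, int irq, bool inverted);

/* Illustrative sketch only: acting on the route returned by
 * pci_device_route_intx_to_irq(). */
static void example_wire_intx(PCIDevice *dev, int pin)
{
    PCIINTxRoute route = pci_device_route_intx_to_irq(dev, pin);

    switch (route.mode) {
    case PCI_INTX_ENABLED:
        /* The pin currently reaches system IRQ route.irq unmodified. */
        connect_intx_as_irq(dev, pin, route.irq, false);
        break;
    case PCI_INTX_INVERTED:
        /* The host bridge inverts the level on the way to route.irq. */
        connect_intx_as_irq(dev, pin, route.irq, true);
        break;
    case PCI_INTX_DISABLED:
        /* The pin is currently not routed to any IRQ. */
        break;
    }
}

[This is the kind of lookup an assigned device would perform once and then refresh whenever the chipset reprograms its routing.]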
On Thu, Jun 07, 2012 at 05:10:17PM +0200, Jan Kiszka wrote: > On 2012-06-07 16:32, Michael S. Tsirkin wrote: > > On Mon, Jun 04, 2012 at 10:52:13AM +0200, Jan Kiszka wrote: > >> @@ -1089,6 +1093,14 @@ static void pci_set_irq(void *opaque, int irq_num, int level) > >> pci_change_irq_level(pci_dev, irq_num, change); > >> } > >> > >> +PCIINTxRoute pci_device_route_intx_to_irq(PCIDevice *dev, int pin) > >> +{ > >> + PCIBus *bus = dev->host_bus; > >> + > >> + assert(bus->route_intx_to_irq); > >> + return bus->route_intx_to_irq(bus->irq_opaque, dev->host_intx_pin[pin]); > >> +} > >> + > >> /***********************************************************/ > >> /* monitor info on PCI */ > >> > > > > Just an idea: can devices cache this result, bypassing the > > intx to irq lookup on data path? > > That lookup is part of set_irq which we don't bypass so far and where > this is generally trivial. If we want to cache the effects of set_irq as > well, I guess things would become pretty complex (e.g. due to vmstate > compatibility), and I'm unsure if it would buy us much. This is less for performance but more for making everyone use the same infrastructure rather than assigned devices being the weird case. > > > >> diff --git a/hw/pci.h b/hw/pci.h > >> index 5b54e2d..bbba01e 100644 > >> --- a/hw/pci.h > >> +++ b/hw/pci.h > >> @@ -141,6 +141,15 @@ enum { > >> #define PCI_DEVICE_GET_CLASS(obj) \ > >> OBJECT_GET_CLASS(PCIDeviceClass, (obj), TYPE_PCI_DEVICE) > >> > >> +typedef struct PCIINTxRoute { > >> + enum { > >> + PCI_INTX_ENABLED, > >> + PCI_INTX_INVERTED, > >> + PCI_INTX_DISABLED, > >> + } mode; > >> + int irq; > >> +} PCIINTxRoute; > > > > Is this INTX route or IRQ route? > > Is the INTX enabled/disabled/inverted or the IRQ? > > > > I have the impression it's the IRQ, in the apic. > > PCI INTX are never inverted they are always active low. > > This should be considered as "the route *of* an INTx", not "to some > IRQ". I could call it PCIINTxToIRQRoute if you prefer, but it's a bit > lengthy. > > Jan Yes but the polarity is in apic? Or is it in host bridge?
On 2012-06-07 18:28, Michael S. Tsirkin wrote: > On Thu, Jun 07, 2012 at 05:10:17PM +0200, Jan Kiszka wrote: >> On 2012-06-07 16:32, Michael S. Tsirkin wrote: >>> On Mon, Jun 04, 2012 at 10:52:13AM +0200, Jan Kiszka wrote: >>>> @@ -1089,6 +1093,14 @@ static void pci_set_irq(void *opaque, int irq_num, int level) >>>> pci_change_irq_level(pci_dev, irq_num, change); >>>> } >>>> >>>> +PCIINTxRoute pci_device_route_intx_to_irq(PCIDevice *dev, int pin) >>>> +{ >>>> + PCIBus *bus = dev->host_bus; >>>> + >>>> + assert(bus->route_intx_to_irq); >>>> + return bus->route_intx_to_irq(bus->irq_opaque, dev->host_intx_pin[pin]); >>>> +} >>>> + >>>> /***********************************************************/ >>>> /* monitor info on PCI */ >>>> >>> >>> Just an idea: can devices cache this result, bypassing the >>> intx to irq lookup on data path? >> >> That lookup is part of set_irq which we don't bypass so far and where >> this is generally trivial. If we want to cache the effects of set_irq as >> well, I guess things would become pretty complex (e.g. due to vmstate >> compatibility), and I'm unsure if it would buy us much. > > This is less for performance but more for making > everyone use the same infrastructure rather than > assigned devices being the weird case. Device assignment is weird. It bypasses all state updates as it does not have to bother about migratability. Well, of course we could cache the host bridge routing result as well, for every device. It would have to be in addition to host_intx_pin. But the result would look pretty strange to me. In any case, I would prefer to do this, if at all, on top of this series, specifically as it will require to touch all host bridges. > >>> >>>> diff --git a/hw/pci.h b/hw/pci.h >>>> index 5b54e2d..bbba01e 100644 >>>> --- a/hw/pci.h >>>> +++ b/hw/pci.h >>>> @@ -141,6 +141,15 @@ enum { >>>> #define PCI_DEVICE_GET_CLASS(obj) \ >>>> OBJECT_GET_CLASS(PCIDeviceClass, (obj), TYPE_PCI_DEVICE) >>>> >>>> +typedef struct PCIINTxRoute { >>>> + enum { >>>> + PCI_INTX_ENABLED, >>>> + PCI_INTX_INVERTED, >>>> + PCI_INTX_DISABLED, >>>> + } mode; >>>> + int irq; >>>> +} PCIINTxRoute; >>> >>> Is this INTX route or IRQ route? >>> Is the INTX enabled/disabled/inverted or the IRQ? >>> >>> I have the impression it's the IRQ, in the apic. >>> PCI INTX are never inverted they are always active low. >> >> This should be considered as "the route *of* an INTx", not "to some >> IRQ". I could call it PCIINTxToIRQRoute if you prefer, but it's a bit >> lengthy. >> >> Jan > > Yes but the polarity is in apic? Or is it in host bridge? > Nope (then we would not have to bother). At least one host bridge (bonito) is apparently able to invert the polarity. Jan
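[For context on the polarity question, a routing callback for a host bridge that can invert the INTx polarity, as bonito apparently can, might report it roughly as below. MyChipState and all mychip_* names are hypothetical and not taken from the series.]

typedef struct MyChipState {
    bool intx_enabled[PCI_NUM_PINS];   /* hypothetical routing state */
    bool intx_inverted[PCI_NUM_PINS];
    int  intx_to_irq[PCI_NUM_PINS];
} MyChipState;

/* Hypothetical pci_route_irq_fn for a bridge with per-pin polarity
 * inversion; the struct fields above are invented for the example. */
static PCIINTxRoute mychip_route_intx_pin_to_irq(void *opaque, int pin)
{
    MyChipState *s = opaque;
    PCIINTxRoute route;

    if (!s->intx_enabled[pin]) {
        route.mode = PCI_INTX_DISABLED;
        route.irq = -1;
    } else {
        route.mode = s->intx_inverted[pin] ? PCI_INTX_INVERTED
                                           : PCI_INTX_ENABLED;
        route.irq = s->intx_to_irq[pin];
    }
    return route;
}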
On Thu, Jun 07, 2012 at 06:46:38PM +0200, Jan Kiszka wrote: > On 2012-06-07 18:28, Michael S. Tsirkin wrote: > > On Thu, Jun 07, 2012 at 05:10:17PM +0200, Jan Kiszka wrote: > >> On 2012-06-07 16:32, Michael S. Tsirkin wrote: > >>> On Mon, Jun 04, 2012 at 10:52:13AM +0200, Jan Kiszka wrote: > >>>> @@ -1089,6 +1093,14 @@ static void pci_set_irq(void *opaque, int irq_num, int level) > >>>> pci_change_irq_level(pci_dev, irq_num, change); > >>>> } > >>>> > >>>> +PCIINTxRoute pci_device_route_intx_to_irq(PCIDevice *dev, int pin) > >>>> +{ > >>>> + PCIBus *bus = dev->host_bus; > >>>> + > >>>> + assert(bus->route_intx_to_irq); > >>>> + return bus->route_intx_to_irq(bus->irq_opaque, dev->host_intx_pin[pin]); > >>>> +} > >>>> + > >>>> /***********************************************************/ > >>>> /* monitor info on PCI */ > >>>> > >>> > >>> Just an idea: can devices cache this result, bypassing the > >>> intx to irq lookup on data path? > >> > >> That lookup is part of set_irq which we don't bypass so far and where > >> this is generally trivial. If we want to cache the effects of set_irq as > >> well, I guess things would become pretty complex (e.g. due to vmstate > >> compatibility), and I'm unsure if it would buy us much. > > > > This is less for performance but more for making > > everyone use the same infrastructure rather than > > assigned devices being the weird case. > > Device assignment is weird. It bypasses all state updates as it does not > have to bother about migratability. > > Well, of course we could cache the host bridge routing result as well, > for every device. It would have to be in addition to host_intx_pin. But > the result would look pretty strange to me. > > In any case, I would prefer to do this, if at all, on top of this > series, specifically as it will require to touch all host bridges. Yes that's fine. > > > >>> > >>>> diff --git a/hw/pci.h b/hw/pci.h > >>>> index 5b54e2d..bbba01e 100644 > >>>> --- a/hw/pci.h > >>>> +++ b/hw/pci.h > >>>> @@ -141,6 +141,15 @@ enum { > >>>> #define PCI_DEVICE_GET_CLASS(obj) \ > >>>> OBJECT_GET_CLASS(PCIDeviceClass, (obj), TYPE_PCI_DEVICE) > >>>> > >>>> +typedef struct PCIINTxRoute { > >>>> + enum { > >>>> + PCI_INTX_ENABLED, > >>>> + PCI_INTX_INVERTED, > >>>> + PCI_INTX_DISABLED, > >>>> + } mode; > >>>> + int irq; > >>>> +} PCIINTxRoute; > >>> > >>> Is this INTX route or IRQ route? > >>> Is the INTX enabled/disabled/inverted or the IRQ? > >>> > >>> I have the impression it's the IRQ, in the apic. > >>> PCI INTX are never inverted they are always active low. > >> > >> This should be considered as "the route *of* an INTx", not "to some > >> IRQ". I could call it PCIINTxToIRQRoute if you prefer, but it's a bit > >> lengthy. > >> > >> Jan > > > > Yes but the polarity is in apic? Or is it in host bridge? > > > > Nope (then we would not have to bother). At least one host bridge > (bonito) is apparently able to invert the polarity. > > Jan >
On Thu, Jun 07, 2012 at 06:46:38PM +0200, Jan Kiszka wrote: > On 2012-06-07 18:28, Michael S. Tsirkin wrote: > > On Thu, Jun 07, 2012 at 05:10:17PM +0200, Jan Kiszka wrote: > >> On 2012-06-07 16:32, Michael S. Tsirkin wrote: > >>> On Mon, Jun 04, 2012 at 10:52:13AM +0200, Jan Kiszka wrote: > >>>> @@ -1089,6 +1093,14 @@ static void pci_set_irq(void *opaque, int irq_num, int level) > >>>> pci_change_irq_level(pci_dev, irq_num, change); > >>>> } > >>>> > >>>> +PCIINTxRoute pci_device_route_intx_to_irq(PCIDevice *dev, int pin) > >>>> +{ > >>>> + PCIBus *bus = dev->host_bus; > >>>> + > >>>> + assert(bus->route_intx_to_irq); > >>>> + return bus->route_intx_to_irq(bus->irq_opaque, dev->host_intx_pin[pin]); > >>>> +} > >>>> + > >>>> /***********************************************************/ > >>>> /* monitor info on PCI */ > >>>> > >>> > >>> Just an idea: can devices cache this result, bypassing the > >>> intx to irq lookup on data path? > >> > >> That lookup is part of set_irq which we don't bypass so far and where > >> this is generally trivial. If we want to cache the effects of set_irq as > >> well, I guess things would become pretty complex (e.g. due to vmstate > >> compatibility), and I'm unsure if it would buy us much. > > > > This is less for performance but more for making > > everyone use the same infrastructure rather than > > assigned devices being the weird case. > > Device assignment is weird. It bypasses all state updates as it does not > have to bother about migratability. > > Well, of course we could cache the host bridge routing result as well, > for every device. It would have to be in addition to host_intx_pin. But > the result would look pretty strange to me. > > In any case, I would prefer to do this, if at all, on top of this > series, specifically as it will require to touch all host bridges. I'd like to ponder this a bit more then. If the claim is that device assignment is only needed for piix anyway, then why not make it depend on piix *explicitly*? Yes ugly but this will make it very easy to find and address any missing pieces. As it is you are adding APIs that in theory address non PIIX issues but in practice don't implement for non PIIX. So we never really know. > > > >>> > >>>> diff --git a/hw/pci.h b/hw/pci.h > >>>> index 5b54e2d..bbba01e 100644 > >>>> --- a/hw/pci.h > >>>> +++ b/hw/pci.h > >>>> @@ -141,6 +141,15 @@ enum { > >>>> #define PCI_DEVICE_GET_CLASS(obj) \ > >>>> OBJECT_GET_CLASS(PCIDeviceClass, (obj), TYPE_PCI_DEVICE) > >>>> > >>>> +typedef struct PCIINTxRoute { > >>>> + enum { > >>>> + PCI_INTX_ENABLED, > >>>> + PCI_INTX_INVERTED, > >>>> + PCI_INTX_DISABLED, > >>>> + } mode; > >>>> + int irq; > >>>> +} PCIINTxRoute; > >>> > >>> Is this INTX route or IRQ route? > >>> Is the INTX enabled/disabled/inverted or the IRQ? > >>> > >>> I have the impression it's the IRQ, in the apic. > >>> PCI INTX are never inverted they are always active low. > >> > >> This should be considered as "the route *of* an INTx", not "to some > >> IRQ". I could call it PCIINTxToIRQRoute if you prefer, but it's a bit > >> lengthy. > >> > >> Jan > > > > Yes but the polarity is in apic? Or is it in host bridge? > > > > Nope (then we would not have to bother). At least one host bridge > (bonito) is apparently able to invert the polarity. > > Jan >
On 2012-06-10 11:55, Michael S. Tsirkin wrote: > On Thu, Jun 07, 2012 at 06:46:38PM +0200, Jan Kiszka wrote: >> On 2012-06-07 18:28, Michael S. Tsirkin wrote: >>> On Thu, Jun 07, 2012 at 05:10:17PM +0200, Jan Kiszka wrote: >>>> On 2012-06-07 16:32, Michael S. Tsirkin wrote: >>>>> On Mon, Jun 04, 2012 at 10:52:13AM +0200, Jan Kiszka wrote: >>>>>> @@ -1089,6 +1093,14 @@ static void pci_set_irq(void *opaque, int irq_num, int level) >>>>>> pci_change_irq_level(pci_dev, irq_num, change); >>>>>> } >>>>>> >>>>>> +PCIINTxRoute pci_device_route_intx_to_irq(PCIDevice *dev, int pin) >>>>>> +{ >>>>>> + PCIBus *bus = dev->host_bus; >>>>>> + >>>>>> + assert(bus->route_intx_to_irq); >>>>>> + return bus->route_intx_to_irq(bus->irq_opaque, dev->host_intx_pin[pin]); >>>>>> +} >>>>>> + >>>>>> /***********************************************************/ >>>>>> /* monitor info on PCI */ >>>>>> >>>>> >>>>> Just an idea: can devices cache this result, bypassing the >>>>> intx to irq lookup on data path? >>>> >>>> That lookup is part of set_irq which we don't bypass so far and where >>>> this is generally trivial. If we want to cache the effects of set_irq as >>>> well, I guess things would become pretty complex (e.g. due to vmstate >>>> compatibility), and I'm unsure if it would buy us much. >>> >>> This is less for performance but more for making >>> everyone use the same infrastructure rather than >>> assigned devices being the weird case. >> >> Device assignment is weird. It bypasses all state updates as it does not >> have to bother about migratability. >> >> Well, of course we could cache the host bridge routing result as well, >> for every device. It would have to be in addition to host_intx_pin. But >> the result would look pretty strange to me. >> >> In any case, I would prefer to do this, if at all, on top of this >> series, specifically as it will require to touch all host bridges. > > I'd like to ponder this a bit more then. > > If the claim is that device assignment is only needed for > piix anyway, then why not make it depend on piix *explicitly*? > Yes ugly but this will make it very easy to find and > address any missing pieces. Because it is conceptually independent of the PIIX, we will need it for successors of that x86 chipset as well, and I won't add the ugly hack of qemu-kvm upstream > > As it is you are adding APIs that in theory address > non PIIX issues but in practice don't implement for non > PIIX. So we never really know. I once hacked q35 to make it work with device assignment. This really requires something like this. It actually requires something generic, independent of PCI, but that's too much for this round. Jan
On Sun, Jun 10, 2012 at 12:08:23PM +0200, Jan Kiszka wrote: > On 2012-06-10 11:55, Michael S. Tsirkin wrote: > > On Thu, Jun 07, 2012 at 06:46:38PM +0200, Jan Kiszka wrote: > >> On 2012-06-07 18:28, Michael S. Tsirkin wrote: > >>> On Thu, Jun 07, 2012 at 05:10:17PM +0200, Jan Kiszka wrote: > >>>> On 2012-06-07 16:32, Michael S. Tsirkin wrote: > >>>>> On Mon, Jun 04, 2012 at 10:52:13AM +0200, Jan Kiszka wrote: > >>>>>> @@ -1089,6 +1093,14 @@ static void pci_set_irq(void *opaque, int irq_num, int level) > >>>>>> pci_change_irq_level(pci_dev, irq_num, change); > >>>>>> } > >>>>>> > >>>>>> +PCIINTxRoute pci_device_route_intx_to_irq(PCIDevice *dev, int pin) > >>>>>> +{ > >>>>>> + PCIBus *bus = dev->host_bus; > >>>>>> + > >>>>>> + assert(bus->route_intx_to_irq); > >>>>>> + return bus->route_intx_to_irq(bus->irq_opaque, dev->host_intx_pin[pin]); > >>>>>> +} > >>>>>> + > >>>>>> /***********************************************************/ > >>>>>> /* monitor info on PCI */ > >>>>>> > >>>>> > >>>>> Just an idea: can devices cache this result, bypassing the > >>>>> intx to irq lookup on data path? > >>>> > >>>> That lookup is part of set_irq which we don't bypass so far and where > >>>> this is generally trivial. If we want to cache the effects of set_irq as > >>>> well, I guess things would become pretty complex (e.g. due to vmstate > >>>> compatibility), and I'm unsure if it would buy us much. > >>> > >>> This is less for performance but more for making > >>> everyone use the same infrastructure rather than > >>> assigned devices being the weird case. > >> > >> Device assignment is weird. It bypasses all state updates as it does not > >> have to bother about migratability. > >> > >> Well, of course we could cache the host bridge routing result as well, > >> for every device. It would have to be in addition to host_intx_pin. But > >> the result would look pretty strange to me. > >> > >> In any case, I would prefer to do this, if at all, on top of this > >> series, specifically as it will require to touch all host bridges. > > > > I'd like to ponder this a bit more then. > > > > If the claim is that device assignment is only needed for > > piix anyway, then why not make it depend on piix *explicitly*? > > Yes ugly but this will make it very easy to find and > > address any missing pieces. > > Because it is conceptually independent of the PIIX, we will need it for > successors of that x86 chipset as well, and I won't add the ugly hack of > qemu-kvm upstream So you look at an API and see it requires a route callback. And you ask "why doesn't chipset X implement it"? And the answer is "because it's only used by device assignment". Which you will only know if you read this thread. So it's a hack. And I'd rather have the hacks in device-assignment.c than in pci.c even if the former are nastier. > > > > As it is you are adding APIs that in theory address > > non PIIX issues but in practice don't implement for non > > PIIX. So we never really know. > > I once hacked q35 to make it work with device assignment. This really > requires something like this. Yes I'm aware of this motivation. This does not do much to address the concerns though. > It actually requires something generic, > independent of PCI, but that's too much for this round. > > Jan And this just makes the concerns worse :(
On 2012-06-10 12:41, Michael S. Tsirkin wrote: > On Sun, Jun 10, 2012 at 12:08:23PM +0200, Jan Kiszka wrote: >> On 2012-06-10 11:55, Michael S. Tsirkin wrote: >>> On Thu, Jun 07, 2012 at 06:46:38PM +0200, Jan Kiszka wrote: >>>> On 2012-06-07 18:28, Michael S. Tsirkin wrote: >>>>> On Thu, Jun 07, 2012 at 05:10:17PM +0200, Jan Kiszka wrote: >>>>>> On 2012-06-07 16:32, Michael S. Tsirkin wrote: >>>>>>> On Mon, Jun 04, 2012 at 10:52:13AM +0200, Jan Kiszka wrote: >>>>>>>> @@ -1089,6 +1093,14 @@ static void pci_set_irq(void *opaque, int irq_num, int level) >>>>>>>> pci_change_irq_level(pci_dev, irq_num, change); >>>>>>>> } >>>>>>>> >>>>>>>> +PCIINTxRoute pci_device_route_intx_to_irq(PCIDevice *dev, int pin) >>>>>>>> +{ >>>>>>>> + PCIBus *bus = dev->host_bus; >>>>>>>> + >>>>>>>> + assert(bus->route_intx_to_irq); >>>>>>>> + return bus->route_intx_to_irq(bus->irq_opaque, dev->host_intx_pin[pin]); >>>>>>>> +} >>>>>>>> + >>>>>>>> /***********************************************************/ >>>>>>>> /* monitor info on PCI */ >>>>>>>> >>>>>>> >>>>>>> Just an idea: can devices cache this result, bypassing the >>>>>>> intx to irq lookup on data path? >>>>>> >>>>>> That lookup is part of set_irq which we don't bypass so far and where >>>>>> this is generally trivial. If we want to cache the effects of set_irq as >>>>>> well, I guess things would become pretty complex (e.g. due to vmstate >>>>>> compatibility), and I'm unsure if it would buy us much. >>>>> >>>>> This is less for performance but more for making >>>>> everyone use the same infrastructure rather than >>>>> assigned devices being the weird case. >>>> >>>> Device assignment is weird. It bypasses all state updates as it does not >>>> have to bother about migratability. >>>> >>>> Well, of course we could cache the host bridge routing result as well, >>>> for every device. It would have to be in addition to host_intx_pin. But >>>> the result would look pretty strange to me. >>>> >>>> In any case, I would prefer to do this, if at all, on top of this >>>> series, specifically as it will require to touch all host bridges. >>> >>> I'd like to ponder this a bit more then. >>> >>> If the claim is that device assignment is only needed for >>> piix anyway, then why not make it depend on piix *explicitly*? >>> Yes ugly but this will make it very easy to find and >>> address any missing pieces. >> >> Because it is conceptually independent of the PIIX, we will need it for >> successors of that x86 chipset as well, and I won't add the ugly hack of >> qemu-kvm upstream > > So you look at an API and see it requires a route > callback. And you ask "why doesn't chipset X implement it"? > And the answer is "because it's only used by device assignment". > Which you will only know if you read this thread. So it's > a hack. And I'd rather have the hacks in device-assignment.c > than in pci.c even if the former are nastier. I don't share your view on this. It is surely _not_ a hack, specifically when compared to what we have so far and what could be done otherwise, e.g. hacking device-assignment and PIIX to make them cooperate (sorry, I would vote against such an attempt). This is just a partially used generic API. Any chipset not providing the required routing function will cause an assert once some tries to make use of it. So, what can I do to make this API more acceptable for you? Jan
On Sun, Jun 10, 2012 at 12:49:11PM +0200, Jan Kiszka wrote: > On 2012-06-10 12:41, Michael S. Tsirkin wrote: > > On Sun, Jun 10, 2012 at 12:08:23PM +0200, Jan Kiszka wrote: > >> On 2012-06-10 11:55, Michael S. Tsirkin wrote: > >>> On Thu, Jun 07, 2012 at 06:46:38PM +0200, Jan Kiszka wrote: > >>>> On 2012-06-07 18:28, Michael S. Tsirkin wrote: > >>>>> On Thu, Jun 07, 2012 at 05:10:17PM +0200, Jan Kiszka wrote: > >>>>>> On 2012-06-07 16:32, Michael S. Tsirkin wrote: > >>>>>>> On Mon, Jun 04, 2012 at 10:52:13AM +0200, Jan Kiszka wrote: > >>>>>>>> @@ -1089,6 +1093,14 @@ static void pci_set_irq(void *opaque, int irq_num, int level) > >>>>>>>> pci_change_irq_level(pci_dev, irq_num, change); > >>>>>>>> } > >>>>>>>> > >>>>>>>> +PCIINTxRoute pci_device_route_intx_to_irq(PCIDevice *dev, int pin) > >>>>>>>> +{ > >>>>>>>> + PCIBus *bus = dev->host_bus; > >>>>>>>> + > >>>>>>>> + assert(bus->route_intx_to_irq); > >>>>>>>> + return bus->route_intx_to_irq(bus->irq_opaque, dev->host_intx_pin[pin]); > >>>>>>>> +} > >>>>>>>> + > >>>>>>>> /***********************************************************/ > >>>>>>>> /* monitor info on PCI */ > >>>>>>>> > >>>>>>> > >>>>>>> Just an idea: can devices cache this result, bypassing the > >>>>>>> intx to irq lookup on data path? > >>>>>> > >>>>>> That lookup is part of set_irq which we don't bypass so far and where > >>>>>> this is generally trivial. If we want to cache the effects of set_irq as > >>>>>> well, I guess things would become pretty complex (e.g. due to vmstate > >>>>>> compatibility), and I'm unsure if it would buy us much. > >>>>> > >>>>> This is less for performance but more for making > >>>>> everyone use the same infrastructure rather than > >>>>> assigned devices being the weird case. > >>>> > >>>> Device assignment is weird. It bypasses all state updates as it does not > >>>> have to bother about migratability. > >>>> > >>>> Well, of course we could cache the host bridge routing result as well, > >>>> for every device. It would have to be in addition to host_intx_pin. But > >>>> the result would look pretty strange to me. > >>>> > >>>> In any case, I would prefer to do this, if at all, on top of this > >>>> series, specifically as it will require to touch all host bridges. > >>> > >>> I'd like to ponder this a bit more then. > >>> > >>> If the claim is that device assignment is only needed for > >>> piix anyway, then why not make it depend on piix *explicitly*? > >>> Yes ugly but this will make it very easy to find and > >>> address any missing pieces. > >> > >> Because it is conceptually independent of the PIIX, we will need it for > >> successors of that x86 chipset as well, and I won't add the ugly hack of > >> qemu-kvm upstream > > > > So you look at an API and see it requires a route > > callback. And you ask "why doesn't chipset X implement it"? > > And the answer is "because it's only used by device assignment". > > Which you will only know if you read this thread. So it's > > a hack. And I'd rather have the hacks in device-assignment.c > > than in pci.c even if the former are nastier. > > I don't share your view on this. It is surely _not_ a hack, specifically > when compared to what we have so far and what could be done otherwise, > e.g. hacking device-assignment and PIIX to make them cooperate (sorry, I > would vote against such an attempt). This is just a partially used > generic API. Any chipset not providing the required routing function > will cause an assert once some tries to make use of it. 
> 
> So, what can I do to make this API more acceptable for you?
> 
> Jan
> 

The idea that I suggested above would address this concern. Or wait a
bit, maybe I'll come around when I ponder the alternatives.
On Sun, 2012-06-10 at 12:49 +0200, Jan Kiszka wrote: > On 2012-06-10 12:41, Michael S. Tsirkin wrote: > > On Sun, Jun 10, 2012 at 12:08:23PM +0200, Jan Kiszka wrote: > >> On 2012-06-10 11:55, Michael S. Tsirkin wrote: > >>> On Thu, Jun 07, 2012 at 06:46:38PM +0200, Jan Kiszka wrote: > >>>> On 2012-06-07 18:28, Michael S. Tsirkin wrote: > >>>>> On Thu, Jun 07, 2012 at 05:10:17PM +0200, Jan Kiszka wrote: > >>>>>> On 2012-06-07 16:32, Michael S. Tsirkin wrote: > >>>>>>> On Mon, Jun 04, 2012 at 10:52:13AM +0200, Jan Kiszka wrote: > >>>>>>>> @@ -1089,6 +1093,14 @@ static void pci_set_irq(void *opaque, int irq_num, int level) > >>>>>>>> pci_change_irq_level(pci_dev, irq_num, change); > >>>>>>>> } > >>>>>>>> > >>>>>>>> +PCIINTxRoute pci_device_route_intx_to_irq(PCIDevice *dev, int pin) > >>>>>>>> +{ > >>>>>>>> + PCIBus *bus = dev->host_bus; > >>>>>>>> + > >>>>>>>> + assert(bus->route_intx_to_irq); > >>>>>>>> + return bus->route_intx_to_irq(bus->irq_opaque, dev->host_intx_pin[pin]); > >>>>>>>> +} > >>>>>>>> + > >>>>>>>> /***********************************************************/ > >>>>>>>> /* monitor info on PCI */ > >>>>>>>> > >>>>>>> > >>>>>>> Just an idea: can devices cache this result, bypassing the > >>>>>>> intx to irq lookup on data path? > >>>>>> > >>>>>> That lookup is part of set_irq which we don't bypass so far and where > >>>>>> this is generally trivial. If we want to cache the effects of set_irq as > >>>>>> well, I guess things would become pretty complex (e.g. due to vmstate > >>>>>> compatibility), and I'm unsure if it would buy us much. > >>>>> > >>>>> This is less for performance but more for making > >>>>> everyone use the same infrastructure rather than > >>>>> assigned devices being the weird case. > >>>> > >>>> Device assignment is weird. It bypasses all state updates as it does not > >>>> have to bother about migratability. > >>>> > >>>> Well, of course we could cache the host bridge routing result as well, > >>>> for every device. It would have to be in addition to host_intx_pin. But > >>>> the result would look pretty strange to me. > >>>> > >>>> In any case, I would prefer to do this, if at all, on top of this > >>>> series, specifically as it will require to touch all host bridges. > >>> > >>> I'd like to ponder this a bit more then. > >>> > >>> If the claim is that device assignment is only needed for > >>> piix anyway, then why not make it depend on piix *explicitly*? > >>> Yes ugly but this will make it very easy to find and > >>> address any missing pieces. > >> > >> Because it is conceptually independent of the PIIX, we will need it for > >> successors of that x86 chipset as well, and I won't add the ugly hack of > >> qemu-kvm upstream > > > > So you look at an API and see it requires a route > > callback. And you ask "why doesn't chipset X implement it"? > > And the answer is "because it's only used by device assignment". > > Which you will only know if you read this thread. So it's > > a hack. And I'd rather have the hacks in device-assignment.c > > than in pci.c even if the former are nastier. > > I don't share your view on this. It is surely _not_ a hack, specifically > when compared to what we have so far and what could be done otherwise, > e.g. hacking device-assignment and PIIX to make them cooperate (sorry, I > would vote against such an attempt). This is just a partially used > generic API. Any chipset not providing the required routing function > will cause an assert once some tries to make use of it. 
I agree, and frankly it sounds like a rather biased attitude against device assignment. piix and q35 may be the only initial chipsets to implement this, but VFIO is designed to be architecture neutral. It's already working on POWER. Interrupt routing is one of the most intrusive parts because we cannot know from inside the device assignment code how the chipset exposes intx out to an irq. Jan's interface makes it easy for a chipset to add this, versus hacks in device assignment code, even if they were possible. If it needs to be tweaked for other chipsets, we'll do that when they come. Please don't roadblock device assignment or VFIO support by not allowing us a well architected, generic interface to track interrupts. Thanks, Alex
On Sun, Jun 10, 2012 at 08:19:28AM -0600, Alex Williamson wrote: > On Sun, 2012-06-10 at 12:49 +0200, Jan Kiszka wrote: > > On 2012-06-10 12:41, Michael S. Tsirkin wrote: > > > On Sun, Jun 10, 2012 at 12:08:23PM +0200, Jan Kiszka wrote: > > >> On 2012-06-10 11:55, Michael S. Tsirkin wrote: > > >>> On Thu, Jun 07, 2012 at 06:46:38PM +0200, Jan Kiszka wrote: > > >>>> On 2012-06-07 18:28, Michael S. Tsirkin wrote: > > >>>>> On Thu, Jun 07, 2012 at 05:10:17PM +0200, Jan Kiszka wrote: > > >>>>>> On 2012-06-07 16:32, Michael S. Tsirkin wrote: > > >>>>>>> On Mon, Jun 04, 2012 at 10:52:13AM +0200, Jan Kiszka wrote: > > >>>>>>>> @@ -1089,6 +1093,14 @@ static void pci_set_irq(void *opaque, int irq_num, int level) > > >>>>>>>> pci_change_irq_level(pci_dev, irq_num, change); > > >>>>>>>> } > > >>>>>>>> > > >>>>>>>> +PCIINTxRoute pci_device_route_intx_to_irq(PCIDevice *dev, int pin) > > >>>>>>>> +{ > > >>>>>>>> + PCIBus *bus = dev->host_bus; > > >>>>>>>> + > > >>>>>>>> + assert(bus->route_intx_to_irq); > > >>>>>>>> + return bus->route_intx_to_irq(bus->irq_opaque, dev->host_intx_pin[pin]); > > >>>>>>>> +} > > >>>>>>>> + > > >>>>>>>> /***********************************************************/ > > >>>>>>>> /* monitor info on PCI */ > > >>>>>>>> > > >>>>>>> > > >>>>>>> Just an idea: can devices cache this result, bypassing the > > >>>>>>> intx to irq lookup on data path? > > >>>>>> > > >>>>>> That lookup is part of set_irq which we don't bypass so far and where > > >>>>>> this is generally trivial. If we want to cache the effects of set_irq as > > >>>>>> well, I guess things would become pretty complex (e.g. due to vmstate > > >>>>>> compatibility), and I'm unsure if it would buy us much. > > >>>>> > > >>>>> This is less for performance but more for making > > >>>>> everyone use the same infrastructure rather than > > >>>>> assigned devices being the weird case. > > >>>> > > >>>> Device assignment is weird. It bypasses all state updates as it does not > > >>>> have to bother about migratability. > > >>>> > > >>>> Well, of course we could cache the host bridge routing result as well, > > >>>> for every device. It would have to be in addition to host_intx_pin. But > > >>>> the result would look pretty strange to me. > > >>>> > > >>>> In any case, I would prefer to do this, if at all, on top of this > > >>>> series, specifically as it will require to touch all host bridges. > > >>> > > >>> I'd like to ponder this a bit more then. > > >>> > > >>> If the claim is that device assignment is only needed for > > >>> piix anyway, then why not make it depend on piix *explicitly*? > > >>> Yes ugly but this will make it very easy to find and > > >>> address any missing pieces. > > >> > > >> Because it is conceptually independent of the PIIX, we will need it for > > >> successors of that x86 chipset as well, and I won't add the ugly hack of > > >> qemu-kvm upstream > > > > > > So you look at an API and see it requires a route > > > callback. And you ask "why doesn't chipset X implement it"? > > > And the answer is "because it's only used by device assignment". > > > Which you will only know if you read this thread. So it's > > > a hack. And I'd rather have the hacks in device-assignment.c > > > than in pci.c even if the former are nastier. > > > > I don't share your view on this. It is surely _not_ a hack, specifically > > when compared to what we have so far and what could be done otherwise, > > e.g. hacking device-assignment and PIIX to make them cooperate (sorry, I > > would vote against such an attempt). 
This is just a partially used > > generic API. Any chipset not providing the required routing function > > will cause an assert once some tries to make use of it. > > I agree, and frankly it sounds like a rather biased attitude against > device assignment. > piix and q35 may be the only initial chipsets to > implement this, but VFIO is designed to be architecture neutral. It's > already working on POWER. Not with this patch it doesn't. > Interrupt routing is one of the most > intrusive parts because we cannot know from inside the device assignment > code how the chipset exposes intx out to an irq. Jan's interface makes > it easy for a chipset to add this, versus hacks in device assignment > code, even if they were possible. If it needs to be tweaked for other > chipsets, we'll do that when they come. Please don't roadblock device > assignment or VFIO support by not allowing us a well architected, > generic interface to track interrupts. Thanks, > > Alex Problems are: 1. This sticks NULL in all chipsets except one. This means it's hard to find and fill out. 2. Adds a function, in every chipset, that is only used by assignment. This means many maintainers have no way to test it. Ways to handle this that came up so far, in order of my preference: 1. Cache all of route in devices. 2. Some ugly hack in device assignment. 3. Merge as is and fix it when it gets broken. So if you expect me to merge this work, then either Jan does (1), or gives up and does (2), or I find some time and do (1), or I fail to do (1) and get convinced that we need to do (3). Or someone else gets involved.
On Sun, 2012-06-10 at 17:43 +0300, Michael S. Tsirkin wrote: > On Sun, Jun 10, 2012 at 08:19:28AM -0600, Alex Williamson wrote: > > On Sun, 2012-06-10 at 12:49 +0200, Jan Kiszka wrote: > > > On 2012-06-10 12:41, Michael S. Tsirkin wrote: > > > > On Sun, Jun 10, 2012 at 12:08:23PM +0200, Jan Kiszka wrote: > > > >> On 2012-06-10 11:55, Michael S. Tsirkin wrote: > > > >>> On Thu, Jun 07, 2012 at 06:46:38PM +0200, Jan Kiszka wrote: > > > >>>> On 2012-06-07 18:28, Michael S. Tsirkin wrote: > > > >>>>> On Thu, Jun 07, 2012 at 05:10:17PM +0200, Jan Kiszka wrote: > > > >>>>>> On 2012-06-07 16:32, Michael S. Tsirkin wrote: > > > >>>>>>> On Mon, Jun 04, 2012 at 10:52:13AM +0200, Jan Kiszka wrote: > > > >>>>>>>> @@ -1089,6 +1093,14 @@ static void pci_set_irq(void *opaque, int irq_num, int level) > > > >>>>>>>> pci_change_irq_level(pci_dev, irq_num, change); > > > >>>>>>>> } > > > >>>>>>>> > > > >>>>>>>> +PCIINTxRoute pci_device_route_intx_to_irq(PCIDevice *dev, int pin) > > > >>>>>>>> +{ > > > >>>>>>>> + PCIBus *bus = dev->host_bus; > > > >>>>>>>> + > > > >>>>>>>> + assert(bus->route_intx_to_irq); > > > >>>>>>>> + return bus->route_intx_to_irq(bus->irq_opaque, dev->host_intx_pin[pin]); > > > >>>>>>>> +} > > > >>>>>>>> + > > > >>>>>>>> /***********************************************************/ > > > >>>>>>>> /* monitor info on PCI */ > > > >>>>>>>> > > > >>>>>>> > > > >>>>>>> Just an idea: can devices cache this result, bypassing the > > > >>>>>>> intx to irq lookup on data path? > > > >>>>>> > > > >>>>>> That lookup is part of set_irq which we don't bypass so far and where > > > >>>>>> this is generally trivial. If we want to cache the effects of set_irq as > > > >>>>>> well, I guess things would become pretty complex (e.g. due to vmstate > > > >>>>>> compatibility), and I'm unsure if it would buy us much. > > > >>>>> > > > >>>>> This is less for performance but more for making > > > >>>>> everyone use the same infrastructure rather than > > > >>>>> assigned devices being the weird case. > > > >>>> > > > >>>> Device assignment is weird. It bypasses all state updates as it does not > > > >>>> have to bother about migratability. > > > >>>> > > > >>>> Well, of course we could cache the host bridge routing result as well, > > > >>>> for every device. It would have to be in addition to host_intx_pin. But > > > >>>> the result would look pretty strange to me. > > > >>>> > > > >>>> In any case, I would prefer to do this, if at all, on top of this > > > >>>> series, specifically as it will require to touch all host bridges. > > > >>> > > > >>> I'd like to ponder this a bit more then. > > > >>> > > > >>> If the claim is that device assignment is only needed for > > > >>> piix anyway, then why not make it depend on piix *explicitly*? > > > >>> Yes ugly but this will make it very easy to find and > > > >>> address any missing pieces. > > > >> > > > >> Because it is conceptually independent of the PIIX, we will need it for > > > >> successors of that x86 chipset as well, and I won't add the ugly hack of > > > >> qemu-kvm upstream > > > > > > > > So you look at an API and see it requires a route > > > > callback. And you ask "why doesn't chipset X implement it"? > > > > And the answer is "because it's only used by device assignment". > > > > Which you will only know if you read this thread. So it's > > > > a hack. And I'd rather have the hacks in device-assignment.c > > > > than in pci.c even if the former are nastier. > > > > > > I don't share your view on this. 
It is surely _not_ a hack, specifically > > > when compared to what we have so far and what could be done otherwise, > > > e.g. hacking device-assignment and PIIX to make them cooperate (sorry, I > > > would vote against such an attempt). This is just a partially used > > > generic API. Any chipset not providing the required routing function > > > will cause an assert once some tries to make use of it. > > > > I agree, and frankly it sounds like a rather biased attitude against > > device assignment. > > piix and q35 may be the only initial chipsets to > > implement this, but VFIO is designed to be architecture neutral. It's > > already working on POWER. > > Not with this patch it doesn't. Alexey, how are you handling INTx routing and EOI on POWER? > > Interrupt routing is one of the most > > intrusive parts because we cannot know from inside the device assignment > > code how the chipset exposes intx out to an irq. Jan's interface makes > > it easy for a chipset to add this, versus hacks in device assignment > > code, even if they were possible. If it needs to be tweaked for other > > chipsets, we'll do that when they come. Please don't roadblock device > > assignment or VFIO support by not allowing us a well architected, > > generic interface to track interrupts. Thanks, > > > > Alex > > Problems are: > 1. This sticks NULL in all chipsets except one. This means it's > hard to find and fill out. I don't buy that. Look for all the callers, find the one that's not NULL, look at what it does... How's that a problem? This can be solved by documentation. > 2. Adds a function, in every chipset, that is only used by assignment. > This means many maintainers have no way to test it. It's effectively optional, so they don't have to implement it. They can wait to care about it until they want device assignment. This can also be solved by documentation so the maintainer knows when and how this is used and can choose to implement it or not. > Ways to handle this that came up so far, in order of my preference: > 1. Cache all of route in devices. How do we get the route to cache it? > 2. Some ugly hack in device assignment. We still have the problem of how do we get the route to start with? > 3. Merge as is and fix it when it gets broken. Isn't that how open source works? ;^) > So if you expect me to merge this work, then either Jan does (1), or > gives up and does (2), or I find some time and do (1), or I fail to do > (1) and get convinced that we need to do (3). Or someone else gets > involved. I still have trouble seeing how (3) is a problem. The translation of an INTx to an irq is chipset specific. We need a chipset function to perform that transform. We don't know how to do this for every chipset in the tree, nor do many of those chipset care at the moment about device assignment. Jan has created an arrangement where chipsets can easily opt-in to this support. Aside from asking us to go spend a month digging up specs for every chipset in tree and trying to implement this ourselves, what kind of enhancement to the interface are you looking for? Thanks, Alex
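[To make the "opt-in" concrete: under this series, a host bridge that wants to support route lookups only has to pass a routing callback instead of NULL when it registers its bus, reusing something like the mychip_route_intx_pin_to_irq callback sketched earlier. All mychip_* names and MyChipHostState are placeholders; only the pci_register_bus() signature is the one introduced by the patch at the end of the thread.]

/* Hypothetical host bridge init code opting in to INTx-to-IRQ routing. */
static void mychip_pcihost_init_sketch(MyChipHostState *s)
{
    s->bus = pci_register_bus(&s->busdev.qdev, "pci",
                              mychip_set_irq, mychip_map_irq,
                              mychip_route_intx_pin_to_irq, /* NULL if unsupported */
                              s, get_system_memory(), get_system_io(),
                              0, PCI_NUM_PINS);
}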
On Sun, Jun 10, 2012 at 09:25:06AM -0600, Alex Williamson wrote: > On Sun, 2012-06-10 at 17:43 +0300, Michael S. Tsirkin wrote: > > On Sun, Jun 10, 2012 at 08:19:28AM -0600, Alex Williamson wrote: > > > On Sun, 2012-06-10 at 12:49 +0200, Jan Kiszka wrote: > > > > On 2012-06-10 12:41, Michael S. Tsirkin wrote: > > > > > On Sun, Jun 10, 2012 at 12:08:23PM +0200, Jan Kiszka wrote: > > > > >> On 2012-06-10 11:55, Michael S. Tsirkin wrote: > > > > >>> On Thu, Jun 07, 2012 at 06:46:38PM +0200, Jan Kiszka wrote: > > > > >>>> On 2012-06-07 18:28, Michael S. Tsirkin wrote: > > > > >>>>> On Thu, Jun 07, 2012 at 05:10:17PM +0200, Jan Kiszka wrote: > > > > >>>>>> On 2012-06-07 16:32, Michael S. Tsirkin wrote: > > > > >>>>>>> On Mon, Jun 04, 2012 at 10:52:13AM +0200, Jan Kiszka wrote: > > > > >>>>>>>> @@ -1089,6 +1093,14 @@ static void pci_set_irq(void *opaque, int irq_num, int level) > > > > >>>>>>>> pci_change_irq_level(pci_dev, irq_num, change); > > > > >>>>>>>> } > > > > >>>>>>>> > > > > >>>>>>>> +PCIINTxRoute pci_device_route_intx_to_irq(PCIDevice *dev, int pin) > > > > >>>>>>>> +{ > > > > >>>>>>>> + PCIBus *bus = dev->host_bus; > > > > >>>>>>>> + > > > > >>>>>>>> + assert(bus->route_intx_to_irq); > > > > >>>>>>>> + return bus->route_intx_to_irq(bus->irq_opaque, dev->host_intx_pin[pin]); > > > > >>>>>>>> +} > > > > >>>>>>>> + > > > > >>>>>>>> /***********************************************************/ > > > > >>>>>>>> /* monitor info on PCI */ > > > > >>>>>>>> > > > > >>>>>>> > > > > >>>>>>> Just an idea: can devices cache this result, bypassing the > > > > >>>>>>> intx to irq lookup on data path? > > > > >>>>>> > > > > >>>>>> That lookup is part of set_irq which we don't bypass so far and where > > > > >>>>>> this is generally trivial. If we want to cache the effects of set_irq as > > > > >>>>>> well, I guess things would become pretty complex (e.g. due to vmstate > > > > >>>>>> compatibility), and I'm unsure if it would buy us much. > > > > >>>>> > > > > >>>>> This is less for performance but more for making > > > > >>>>> everyone use the same infrastructure rather than > > > > >>>>> assigned devices being the weird case. > > > > >>>> > > > > >>>> Device assignment is weird. It bypasses all state updates as it does not > > > > >>>> have to bother about migratability. > > > > >>>> > > > > >>>> Well, of course we could cache the host bridge routing result as well, > > > > >>>> for every device. It would have to be in addition to host_intx_pin. But > > > > >>>> the result would look pretty strange to me. > > > > >>>> > > > > >>>> In any case, I would prefer to do this, if at all, on top of this > > > > >>>> series, specifically as it will require to touch all host bridges. > > > > >>> > > > > >>> I'd like to ponder this a bit more then. > > > > >>> > > > > >>> If the claim is that device assignment is only needed for > > > > >>> piix anyway, then why not make it depend on piix *explicitly*? > > > > >>> Yes ugly but this will make it very easy to find and > > > > >>> address any missing pieces. > > > > >> > > > > >> Because it is conceptually independent of the PIIX, we will need it for > > > > >> successors of that x86 chipset as well, and I won't add the ugly hack of > > > > >> qemu-kvm upstream > > > > > > > > > > So you look at an API and see it requires a route > > > > > callback. And you ask "why doesn't chipset X implement it"? > > > > > And the answer is "because it's only used by device assignment". > > > > > Which you will only know if you read this thread. So it's > > > > > a hack. 
And I'd rather have the hacks in device-assignment.c > > > > > than in pci.c even if the former are nastier. > > > > > > > > I don't share your view on this. It is surely _not_ a hack, specifically > > > > when compared to what we have so far and what could be done otherwise, > > > > e.g. hacking device-assignment and PIIX to make them cooperate (sorry, I > > > > would vote against such an attempt). This is just a partially used > > > > generic API. Any chipset not providing the required routing function > > > > will cause an assert once some tries to make use of it. > > > > > > I agree, and frankly it sounds like a rather biased attitude against > > > device assignment. > > > piix and q35 may be the only initial chipsets to > > > implement this, but VFIO is designed to be architecture neutral. It's > > > already working on POWER. > > > > Not with this patch it doesn't. > > Alexey, how are you handling INTx routing and EOI on POWER? > > > > Interrupt routing is one of the most > > > intrusive parts because we cannot know from inside the device assignment > > > code how the chipset exposes intx out to an irq. Jan's interface makes > > > it easy for a chipset to add this, versus hacks in device assignment > > > code, even if they were possible. If it needs to be tweaked for other > > > chipsets, we'll do that when they come. Please don't roadblock device > > > assignment or VFIO support by not allowing us a well architected, > > > generic interface to track interrupts. Thanks, > > > > > > Alex > > > > Problems are: > > 1. This sticks NULL in all chipsets except one. This means it's > > hard to find and fill out. > > I don't buy that. Look for all the callers, find the one that's not > NULL, look at what it does... How's that a problem? This can be solved > by documentation. Yes but it's not there. > > 2. Adds a function, in every chipset, that is only used by assignment. > > This means many maintainers have no way to test it. > > It's effectively optional, so they don't have to implement it. They can > wait to care about it until they want device assignment. This can also > be solved by documentation so the maintainer knows when and how this is > used and can choose to implement it or not. No, what I meant is it's hard to test on systems where it's implemented. > > Ways to handle this that came up so far, in order of my preference: > > 1. Cache all of route in devices. > > How do we get the route to cache it? Move routing logic from piix3_set_irq_level to a route_intx callback, or equivalent. Then you just look it up in device->root_route. > > 2. Some ugly hack in device assignment. > > We still have the problem of how do we get the route to start with? You are asking how an ugly hack might work? Think it up yourself, I'm not gonnu mention hacks at this point yet. > > 3. Merge as is and fix it when it gets broken. > > Isn't that how open source works? ;^) It works by darwinism. This is exactly what's going on here. > > So if you expect me to merge this work, then either Jan does (1), or > > gives up and does (2), or I find some time and do (1), or I fail to do > > (1) and get convinced that we need to do (3). Or someone else gets > > involved. > > I still have trouble seeing how (3) is a problem. The translation of an > INTx to an irq is chipset specific. We need a chipset function to > perform that transform. We don't know how to do this for every chipset > in the tree, nor do many of those chipset care at the moment about > device assignment. 
Jan has created an arrangement where chipsets can > easily opt-in to this support. Aside from asking us to go spend a month > digging up specs for every chipset in tree and trying to implement this > ourselves, what kind of enhancement to the interface are you looking > for? Thanks, > > Alex Sorry I don't understand what you are talking about. It's better to have one way to determine interrupt routing than two. It can be done, and IIUC Jan doesn't disagree. There are gnarly issues related to migration compatibility with old qemu sorrounding the solution which makes Jan think it's too complex, but nothing remotely related to digging up any specs.
On 2012-06-10 17:55, Michael S. Tsirkin wrote: >>> So if you expect me to merge this work, then either Jan does (1), or >>> gives up and does (2), or I find some time and do (1), or I fail to do >>> (1) and get convinced that we need to do (3). Or someone else gets >>> involved. >> >> I still have trouble seeing how (3) is a problem. The translation of an >> INTx to an irq is chipset specific. We need a chipset function to >> perform that transform. We don't know how to do this for every chipset >> in the tree, nor do many of those chipset care at the moment about >> device assignment. Jan has created an arrangement where chipsets can >> easily opt-in to this support. Aside from asking us to go spend a month >> digging up specs for every chipset in tree and trying to implement this >> ourselves, what kind of enhancement to the interface are you looking >> for? Thanks, >> >> Alex > > Sorry I don't understand what you are talking about. > It's better to have one way to determine interrupt routing than two. > It can be done, and IIUC Jan doesn't disagree. > There are gnarly issues related to migration compatibility > with old qemu sorrounding the solution which makes Jan think it's too complex, > but nothing remotely related to digging up any specs. Caching the host bridge generically means changing all chipsets and, well, also testing that they still work afterward. As explained before, I'd really like to avoid doing this in a single step. And there are still migration issues - as explained based on the PIIX3. Until it is officially stated that we give up on backward migratability, it will be quite some effort (ie. lots of additional code) to provide compatibility on top of generic intx-to-irq routing. This interface here is surely not the perfect solution. But it is better than a) hacking device assignment and VFIO into a specific chipset and b) starting a huge refactoring now. We can surely tweak some aspects of this approach before merging, e.g. documentation. But lets try to move forward in small, well testable steps. I really don't see the need for touching all chipsets and migration formats for this purpose ATM. Jan
On Sun, Jun 10, 2012 at 06:30:59PM +0200, Jan Kiszka wrote:
> Caching the host bridge generically means changing all chipsets and,
> well, also testing that they still work afterward. As explained before,
> I'd really like to avoid doing this in a single step.

Surely it is not hard to find a way to switch chipsets gradually.
On Sun, Jun 10, 2012 at 07:50:07PM +0300, Michael S. Tsirkin wrote:
> On Sun, Jun 10, 2012 at 06:30:59PM +0200, Jan Kiszka wrote:
> > Caching the host bridge generically means changing all chipsets and,
> > well, also testing that they still work afterward. As explained before,
> > I'd really like to avoid doing this in a single step.
> 
> Surely it is not hard to find a way to switch chipsets gradually.

Just to stress: the problem is that with this patch there are two ways to
get the routing, even for the PIIX. So instead of your patch, which always
caches just the INTx route and which I applied, you could have one that
only caches if the root supplies a new route callback. Then, to set an IRQ,
you check the cache and, if it is not valid, fall back on the legacy
interface. With time we convert everyone and drop the legacy interface.

> -- 
> MST
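[A rough sketch of the transition scheme suggested above. The per-device cache fields intx_route[] and intx_route_valid[], as well as the pci_set_irq_legacy() and pci_deliver_routed_irq() helpers, are invented purely for illustration.]

/* Sketch of the suggested transition: cache the route per device only
 * when the host bridge supplies route_intx_to_irq, otherwise fall back
 * on the legacy interface. */
static void pci_set_intx_level_sketch(PCIDevice *dev, int pin, int level)
{
    PCIBus *bus = dev->host_bus;

    if (!bus->route_intx_to_irq) {
        /* Legacy path: resolve the pin through the bridge chain on
         * every level change, as pci_set_irq() does today. */
        pci_set_irq_legacy(dev, pin, level);
        return;
    }

    if (!dev->intx_route_valid[pin]) {
        dev->intx_route[pin] = pci_device_route_intx_to_irq(dev, pin);
        dev->intx_route_valid[pin] = true;  /* cleared on routing changes */
    }
    pci_deliver_routed_irq(&dev->intx_route[pin], level);
}

[Any routing change, for instance the guest reprogramming the PIIX3 PIRQ registers, would have to invalidate the cached entries, which is where the complexity and migration-compatibility concerns mentioned earlier in the thread come in.]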
diff --git a/hw/alpha_typhoon.c b/hw/alpha_typhoon.c
index d5193bb..1056b50 100644
--- a/hw/alpha_typhoon.c
+++ b/hw/alpha_typhoon.c
@@ -764,7 +764,7 @@ PCIBus *typhoon_init(ram_addr_t ram_size, ISABus **isa_bus,
                                 &s->pchip.reg_io);
 
     b = pci_register_bus(&s->host.busdev.qdev, "pci",
-                         typhoon_set_irq, sys_map_irq, s,
+                         typhoon_set_irq, sys_map_irq, NULL, s,
                          &s->pchip.reg_mem, addr_space_io, 0, 64);
     s->host.bus = b;
diff --git a/hw/apb_pci.c b/hw/apb_pci.c
index 57ead09..270a785 100644
--- a/hw/apb_pci.c
+++ b/hw/apb_pci.c
@@ -378,7 +378,7 @@ PCIBus *pci_apb_init(target_phys_addr_t special_base,
     memory_region_add_subregion(get_system_memory(), mem_base, &d->pci_mmio);
 
     d->bus = pci_register_bus(&d->busdev.qdev, "pci",
-                              pci_apb_set_irq, pci_pbm_map_irq, d,
+                              pci_apb_set_irq, pci_pbm_map_irq, NULL, d,
                               &d->pci_mmio,
                               get_system_io(),
                               0, 32);
diff --git a/hw/bonito.c b/hw/bonito.c
index 77786f8..7ce5993 100644
--- a/hw/bonito.c
+++ b/hw/bonito.c
@@ -750,7 +750,7 @@ PCIBus *bonito_init(qemu_irq *pic)
     dev = qdev_create(NULL, "Bonito-pcihost");
     pcihost = FROM_SYSBUS(BonitoState, sysbus_from_qdev(dev));
     b = pci_register_bus(&pcihost->busdev.qdev, "pci", pci_bonito_set_irq,
-                         pci_bonito_map_irq, pic, get_system_memory(),
+                         pci_bonito_map_irq, NULL, pic, get_system_memory(),
                          get_system_io(),
                          0x28, 32);
     pcihost->bus = b;
diff --git a/hw/grackle_pci.c b/hw/grackle_pci.c
index 81ff3a3..f47d9fe 100644
--- a/hw/grackle_pci.c
+++ b/hw/grackle_pci.c
@@ -85,6 +85,7 @@ PCIBus *pci_grackle_init(uint32_t base, qemu_irq *pic,
     d->host_state.bus = pci_register_bus(&d->busdev.qdev, "pci",
                                          pci_grackle_set_irq,
                                          pci_grackle_map_irq,
+                                         NULL,
                                          pic,
                                          &d->pci_mmio,
                                          address_space_io,
diff --git a/hw/gt64xxx.c b/hw/gt64xxx.c
index a2d0e5a..2418238 100644
--- a/hw/gt64xxx.c
+++ b/hw/gt64xxx.c
@@ -1093,6 +1093,7 @@ PCIBus *gt64120_register(qemu_irq *pic)
     d = FROM_SYSBUS(GT64120State, s);
     d->pci.bus = pci_register_bus(&d->busdev.qdev, "pci",
                                   gt64120_pci_set_irq, gt64120_pci_map_irq,
+                                  NULL,
                                   pic,
                                   get_system_memory(),
                                   get_system_io(),
diff --git a/hw/pci.c b/hw/pci.c
index 9a2b4a3..8878a11 100644
--- a/hw/pci.c
+++ b/hw/pci.c
@@ -320,10 +320,12 @@ PCIBus *pci_bus_new(DeviceState *parent, const char *name,
 
 void pci_bus_irqs(PCIBus *bus, pci_set_irq_fn set_irq,
                   pci_route_pin_fn route_intx_pin,
+                  pci_route_irq_fn route_intx_to_irq,
                   void *irq_opaque, int nirq)
 {
     bus->set_irq = set_irq;
     bus->route_intx_pin = route_intx_pin;
+    bus->route_intx_to_irq = route_intx_to_irq;
     bus->irq_opaque = irq_opaque;
     bus->nirq = nirq;
     bus->irq_count = g_malloc0(nirq * sizeof(bus->irq_count[0]));
@@ -340,6 +342,7 @@ void pci_bus_hotplug(PCIBus *bus, pci_hotplug_fn hotplug, DeviceState *qdev)
 PCIBus *pci_register_bus(DeviceState *parent, const char *name,
                          pci_set_irq_fn set_irq,
                          pci_route_pin_fn route_intx_pin,
+                         pci_route_irq_fn route_intx_to_irq,
                          void *irq_opaque,
                          MemoryRegion *address_space_mem,
                          MemoryRegion *address_space_io,
@@ -349,7 +352,8 @@ PCIBus *pci_register_bus(DeviceState *parent, const char *name,
 
     bus = pci_bus_new(parent, name, address_space_mem,
                       address_space_io, devfn_min);
-    pci_bus_irqs(bus, set_irq, route_intx_pin, irq_opaque, nirq);
+    pci_bus_irqs(bus, set_irq, route_intx_pin, route_intx_to_irq, irq_opaque,
+                 nirq);
     return bus;
 }
 
@@ -1089,6 +1093,14 @@ static void pci_set_irq(void *opaque, int irq_num, int level)
     pci_change_irq_level(pci_dev, irq_num, change);
 }
 
+PCIINTxRoute pci_device_route_intx_to_irq(PCIDevice *dev, int pin)
+{
+    PCIBus *bus = dev->host_bus;
+
+    assert(bus->route_intx_to_irq);
+    return bus->route_intx_to_irq(bus->irq_opaque, dev->host_intx_pin[pin]);
+}
+
 /***********************************************************/
 /* monitor info on PCI */
 
diff --git a/hw/pci.h b/hw/pci.h
index 5b54e2d..bbba01e 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -141,6 +141,15 @@ enum {
 #define PCI_DEVICE_GET_CLASS(obj) \
      OBJECT_GET_CLASS(PCIDeviceClass, (obj), TYPE_PCI_DEVICE)
 
+typedef struct PCIINTxRoute {
+    enum {
+        PCI_INTX_ENABLED,
+        PCI_INTX_INVERTED,
+        PCI_INTX_DISABLED,
+    } mode;
+    int irq;
+} PCIINTxRoute;
+
 typedef struct PCIDeviceClass {
     DeviceClass parent_class;
 
@@ -280,6 +289,7 @@ MemoryRegion *pci_address_space_io(PCIDevice *dev);
 
 typedef void (*pci_set_irq_fn)(void *opaque, int irq_num, int level);
 typedef int (*pci_route_pin_fn)(PCIDevice *pci_dev, int pin);
+typedef PCIINTxRoute (*pci_route_irq_fn)(void *opaque, int pin);
 
 typedef enum {
     PCI_HOTPLUG_DISABLED,
@@ -295,16 +305,19 @@ PCIBus *pci_bus_new(DeviceState *parent, const char *name,
                     uint8_t devfn_min);
 void pci_bus_irqs(PCIBus *bus, pci_set_irq_fn set_irq,
                   pci_route_pin_fn route_intx_pin,
+                  pci_route_irq_fn route_intx_to_irq,
                   void *irq_opaque, int nirq);
 int pci_bus_get_irq_level(PCIBus *bus, int irq_num);
 void pci_bus_hotplug(PCIBus *bus, pci_hotplug_fn hotplug, DeviceState *dev);
 PCIBus *pci_register_bus(DeviceState *parent, const char *name,
                          pci_set_irq_fn set_irq,
                          pci_route_pin_fn route_intx_pin,
+                         pci_route_irq_fn route_intx_to_irq,
                          void *irq_opaque,
                          MemoryRegion *address_space_mem,
                          MemoryRegion *address_space_io,
                          uint8_t devfn_min, int nirq);
+PCIINTxRoute pci_device_route_intx_to_irq(PCIDevice *dev, int pin);
 
 void pci_device_reset(PCIDevice *dev);
 void pci_bus_reset(PCIBus *bus);
diff --git a/hw/pci_internals.h b/hw/pci_internals.h
index be5594b..6639232 100644
--- a/hw/pci_internals.h
+++ b/hw/pci_internals.h
@@ -19,6 +19,7 @@ struct PCIBus {
     uint8_t devfn_min;
     pci_set_irq_fn set_irq;
     pci_route_pin_fn route_intx_pin;
+    pci_route_irq_fn route_intx_to_irq;
     pci_hotplug_fn hotplug;
     DeviceState *hotplug_qdev;
     void *irq_opaque;
diff --git a/hw/piix_pci.c b/hw/piix_pci.c
index 09e84f5..347177f 100644
--- a/hw/piix_pci.c
+++ b/hw/piix_pci.c
@@ -89,6 +89,7 @@ struct PCII440FXState {
 #define I440FX_SMRAM 0x72
 
 static void piix3_set_irq(void *opaque, int pirq, int level);
+static PCIINTxRoute piix3_route_intx_pin_to_irq(void *opaque, int pci_intx);
 static void piix3_write_config_xen(PCIDevice *dev,
                                    uint32_t address, uint32_t val, int len);
 
@@ -308,13 +309,13 @@ static PCIBus *i440fx_common_init(const char *device_name,
     if (xen_enabled()) {
         piix3 = DO_UPCAST(PIIX3State, dev,
                 pci_create_simple_multifunction(b, -1, true, "PIIX3-xen"));
-        pci_bus_irqs(b, xen_piix3_set_irq, xen_pci_slot_get_pirq,
+        pci_bus_irqs(b, xen_piix3_set_irq, xen_pci_slot_get_pirq, NULL,
                      piix3, XEN_PIIX_NUM_PIRQS);
     } else {
         piix3 = DO_UPCAST(PIIX3State, dev,
                 pci_create_simple_multifunction(b, -1, true, "PIIX3"));
-        pci_bus_irqs(b, piix3_set_irq, pci_slot_get_pirq, piix3,
-                     PIIX_NUM_PIRQS);
+        pci_bus_irqs(b, piix3_set_irq, pci_slot_get_pirq,
+                     piix3_route_intx_pin_to_irq, piix3, PIIX_NUM_PIRQS);
     }
     piix3->pic = pic;
     *isa_bus = DO_UPCAST(ISABus, qbus,
@@ -386,6 +387,22 @@ static void piix3_set_irq(void *opaque, int pirq, int level)
     piix3_set_irq_level(piix3, pirq, level);
 }
 
+static PCIINTxRoute piix3_route_intx_pin_to_irq(void *opaque, int pin)
+{
+    PIIX3State *piix3 = opaque;
+    int irq = piix3->dev.config[PIIX_PIRQC + pin];
+    PCIINTxRoute route;
+
+    if (irq < PIIX_NUM_PIC_IRQS) {
+        route.mode = PCI_INTX_ENABLED;
+        route.irq = irq;
+    } else {
+        route.mode = PCI_INTX_DISABLED;
+        route.irq = -1;
+    }
+    return route;
+}
+
 /* irq routing is changed. so rebuild bitmap */
 static void piix3_update_irq_levels(PIIX3State *piix3)
 {
diff --git a/hw/ppc4xx_pci.c b/hw/ppc4xx_pci.c
index 203c3cd..224c4a0 100644
--- a/hw/ppc4xx_pci.c
+++ b/hw/ppc4xx_pci.c
@@ -343,7 +343,7 @@ static int ppc4xx_pcihost_initfn(SysBusDevice *dev)
     }
 
     b = pci_register_bus(&s->pci_state.busdev.qdev, NULL, ppc4xx_pci_set_irq,
-                         ppc4xx_pci_map_irq, s->irq, get_system_memory(),
+                         ppc4xx_pci_map_irq, NULL, s->irq, get_system_memory(),
                          get_system_io(), 0, 4);
     s->pci_state.bus = b;
diff --git a/hw/ppce500_pci.c b/hw/ppce500_pci.c
index 0f60b24..dd924ae 100644
--- a/hw/ppce500_pci.c
+++ b/hw/ppce500_pci.c
@@ -318,7 +318,7 @@ static int e500_pcihost_initfn(SysBusDevice *dev)
     }
 
     b = pci_register_bus(&s->pci_state.busdev.qdev, NULL, mpc85xx_pci_set_irq,
-                         mpc85xx_pci_map_irq, s->irq, address_space_mem,
+                         mpc85xx_pci_map_irq, NULL, s->irq, address_space_mem,
                          address_space_io, PCI_DEVFN(0x11, 0), 4);
     s->pci_state.bus = b;
diff --git a/hw/prep_pci.c b/hw/prep_pci.c
index 38dbff4..9d7bec7 100644
--- a/hw/prep_pci.c
+++ b/hw/prep_pci.c
@@ -108,7 +108,7 @@ static int raven_pcihost_init(SysBusDevice *dev)
     }
 
     bus = pci_register_bus(&h->busdev.qdev, NULL,
-                           prep_set_irq, prep_map_irq, s->irq,
+                           prep_set_irq, prep_map_irq, NULL, s->irq,
                            address_space_mem, address_space_io, 0, 4);
     h->bus = bus;
diff --git a/hw/sh_pci.c b/hw/sh_pci.c
index 0cfac46..1cea12b 100644
--- a/hw/sh_pci.c
+++ b/hw/sh_pci.c
@@ -120,7 +120,7 @@ static int sh_pci_device_init(SysBusDevice *dev)
         sysbus_init_irq(dev, &s->irq[i]);
     }
     s->bus = pci_register_bus(&s->busdev.qdev, "pci",
-                              sh_pci_set_irq, sh_pci_map_irq,
+                              sh_pci_set_irq, sh_pci_map_irq, NULL,
                               s->irq,
                               get_system_memory(),
                               get_system_io(),
diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c
index 25b400a..4a43062 100644
--- a/hw/spapr_pci.c
+++ b/hw/spapr_pci.c
@@ -306,7 +306,7 @@ static int spapr_phb_init(SysBusDevice *s)
     bus = pci_register_bus(&phb->busdev.qdev,
                            phb->busname ? phb->busname : phb->dtbusname,
-                           pci_spapr_set_irq, pci_spapr_map_irq, phb,
+                           pci_spapr_set_irq, pci_spapr_map_irq, NULL, phb,
                            &phb->memspace, &phb->iospace,
                            PCI_DEVFN(0, 0), PCI_NUM_PINS);
     phb->host_state.bus = bus;
diff --git a/hw/unin_pci.c b/hw/unin_pci.c
index 409bcd4..056e3bc 100644
--- a/hw/unin_pci.c
+++ b/hw/unin_pci.c
@@ -227,7 +227,7 @@ PCIBus *pci_pmac_init(qemu_irq *pic,
     d->host_state.bus = pci_register_bus(dev, "pci",
                                          pci_unin_set_irq, pci_unin_map_irq,
-                                         pic,
+                                         NULL, pic,
                                          &d->pci_mmio,
                                          address_space_io,
                                          PCI_DEVFN(11, 0), 4);
@@ -293,7 +293,7 @@ PCIBus *pci_pmac_u3_init(qemu_irq *pic,
     d->host_state.bus = pci_register_bus(dev, "pci",
                                          pci_unin_set_irq, pci_unin_map_irq,
-                                         pic,
+                                         NULL, pic,
                                          &d->pci_mmio,
                                          address_space_io,
                                          PCI_DEVFN(11, 0), 4);
diff --git a/hw/versatile_pci.c b/hw/versatile_pci.c
index ae53a8b..90c400e 100644
--- a/hw/versatile_pci.c
+++ b/hw/versatile_pci.c
@@ -68,7 +68,7 @@ static int pci_vpb_init(SysBusDevice *dev)
         sysbus_init_irq(dev, &s->irq[i]);
     }
     bus = pci_register_bus(&dev->qdev, "pci",
-                           pci_vpb_set_irq, pci_vpb_map_irq, s->irq,
+                           pci_vpb_set_irq, pci_vpb_map_irq, NULL, s->irq,
                            get_system_memory(), get_system_io(),
                            PCI_DEVFN(11, 0), 4);
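For context, this is roughly how a consumer such as a pass-through device
model could query the new entry point. The forwarding helpers below are
placeholder names, not existing QEMU or KVM functions; they stand in for
whatever the device model does with the resolved IRQ (e.g. programming an
irqfd for KVM device assignment).

/* Sketch of a caller: resolve where INTA currently ends up and act on it. */
static void update_intx_forwarding(PCIDevice *dev)
{
    PCIINTxRoute route = pci_device_route_intx_to_irq(dev, 0 /* INTA */);

    switch (route.mode) {
    case PCI_INTX_ENABLED:
        /* route.irq is the interrupt controller input the pin is
         * currently steered to (an ISA IRQ in the PIIX3 case). */
        setup_host_intx_forwarding(dev, route.irq);   /* placeholder helper */
        break;
    case PCI_INTX_DISABLED:
        teardown_host_intx_forwarding(dev);           /* placeholder helper */
        break;
    case PCI_INTX_INVERTED:
        /* Not generated by the PIIX3 implementation above. */
        break;
    }
}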
Add a PCI IRQ path discovery function that walks from a given device to
the host bridge, returning the mode (enabled/inverted/disabled) and the
IRQ number that is reported to the attached interrupt controller. For
this purpose, another host bridge callback function is introduced:
route_intx_to_irq. It is so far only implemented by the PIIX3; other
host bridges can be added later on as required.

Will be used for KVM PCI device assignment and VFIO.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---
 hw/alpha_typhoon.c |    2 +-
 hw/apb_pci.c       |    2 +-
 hw/bonito.c        |    2 +-
 hw/grackle_pci.c   |    1 +
 hw/gt64xxx.c       |    1 +
 hw/pci.c           |   14 +++++++++++++-
 hw/pci.h           |   13 +++++++++++++
 hw/pci_internals.h |    1 +
 hw/piix_pci.c      |   23 ++++++++++++++++++++---
 hw/ppc4xx_pci.c    |    2 +-
 hw/ppce500_pci.c   |    2 +-
 hw/prep_pci.c      |    2 +-
 hw/sh_pci.c        |    2 +-
 hw/spapr_pci.c     |    2 +-
 hw/unin_pci.c      |    4 ++--
 hw/versatile_pci.c |    2 +-
 16 files changed, 60 insertions(+), 15 deletions(-)
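As a rough guide for the follow-up conversions mentioned above, wiring
another host bridge into this scheme would look roughly like the sketch
below. The foo_* names, the FooPCIHostState type and the intx_to_irq table
are invented for illustration; only the pci_register_bus()/route callback
shapes come from the patch.

/* Hypothetical chipset with a fixed pin-to-IRQ mapping. */
static PCIINTxRoute foo_pcihost_route_intx_to_irq(void *opaque, int pin)
{
    FooPCIHostState *s = opaque;       /* invented state type */
    PCIINTxRoute route;

    route.mode = PCI_INTX_ENABLED;
    route.irq = s->intx_to_irq[pin];   /* invented mapping table */
    return route;
}

/* ...and pass it as the new pci_register_bus() argument: */
    bus = pci_register_bus(&s->busdev.qdev, "pci",
                           foo_set_irq, foo_map_irq,
                           foo_pcihost_route_intx_to_irq,
                           s, address_space_mem, address_space_io,
                           PCI_DEVFN(0, 0), 4);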