diff mbox

[v2,12/13] intel_iommu: ioapic: IR support for emulated IOAPIC

Message ID 1460366363-4589-13-git-send-email-peterx@redhat.com
State New
Headers show

Commit Message

Peter Xu April 11, 2016, 9:19 a.m. UTC
This patch add the first device support for Intel IOMMU interrupt
remapping, which is the default IOAPIC device created alongside with Q35
platform. This will be the first step along the way to fully enable
IOMMU IR on x86 systems.

Currently, only emluated IOAPIC is supported. This requires
"kernel_irqchip=off" parameter specified.

Originally, IOAPIC has its own table to maintain IRQ information. When
IOMMU IR is enabled, guest OS will fill in the real IRQ data into IRTE
entries of IOMMU IR root table, while in IOAPIC only the index
information is maintained (with several legacy bits which might not be
covered by VT-d IR). If so, we need to talk to IOMMU to get the real IRQ
information to deliver.

Please refer to VT-d spec 5.1.5.1 for more information.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 hw/i386/intel_iommu.c         | 127 ++++++++++++++++++++++++++++++++++++++++++
 hw/intc/ioapic.c              |  36 +++++++++---
 include/hw/i386/intel_iommu.h |  17 ++++++
 3 files changed, 173 insertions(+), 7 deletions(-)

Comments

Jan Kiszka April 12, 2016, 5:22 a.m. UTC | #1
On 2016-04-11 02:19, Peter Xu wrote:
> diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c
> index 378e663..d963d45 100644
> --- a/hw/intc/ioapic.c
> +++ b/hw/intc/ioapic.c
> @@ -57,6 +57,8 @@ static void ioapic_service(IOAPICCommonState *s)
>      uint64_t entry;
>      uint8_t dest;
>      uint8_t dest_mode;
> +    IntelIOMMUState *iommu = s->iommu;
> +    VTDIrq irq = {0};
>  
>      for (i = 0; i < IOAPIC_NUM_PINS; i++) {
>          mask = 1 << i;
> @@ -65,11 +67,33 @@ static void ioapic_service(IOAPICCommonState *s)
>  
>              entry = s->ioredtbl[i];
>              if (!(entry & IOAPIC_LVT_MASKED)) {
> -                trig_mode = ((entry >> IOAPIC_LVT_TRIGGER_MODE_SHIFT) & 1);
> -                dest = entry >> IOAPIC_LVT_DEST_SHIFT;
> -                dest_mode = (entry >> IOAPIC_LVT_DEST_MODE_SHIFT) & 1;
> -                delivery_mode =
> -                    (entry >> IOAPIC_LVT_DELIV_MODE_SHIFT) & IOAPIC_DM_MASK;
> +
> +                if (iommu && iommu->intr_enabled) {
> +                    /*
> +                    * Interrupt remapping is enabled in owner IOMMU,
> +                    * we need to fetch the real IRQ information via
> +                    * IRTE of the root mapping table
> +                    */
> +                    if (vtd_interrupt_remap_ioapic(iommu, &entry, &irq)) {

VT-d is only one of the possible IOMMUs on x86. Please introduce a
generic interface.

Look at Rita's and my patches: they translate the IOAPIC (and HPET...)
interrupts into MSI messages that are then - in a generic way -
intercepted by the respective IOMMU or directly dispatched to the APICs.
We may no longer need new memory regions for this, thanks to the region
attributes, but we also need no hard-coded hooks here.

Thanks,
Jan
Peter Xu April 12, 2016, 9:02 a.m. UTC | #2
On Mon, Apr 11, 2016 at 10:22:18PM -0700, Jan Kiszka wrote:
> On 2016-04-11 02:19, Peter Xu wrote:
> > diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c
> > index 378e663..d963d45 100644
> > --- a/hw/intc/ioapic.c
> > +++ b/hw/intc/ioapic.c
> > @@ -57,6 +57,8 @@ static void ioapic_service(IOAPICCommonState *s)
> >      uint64_t entry;
> >      uint8_t dest;
> >      uint8_t dest_mode;
> > +    IntelIOMMUState *iommu = s->iommu;
> > +    VTDIrq irq = {0};
> >  
> >      for (i = 0; i < IOAPIC_NUM_PINS; i++) {
> >          mask = 1 << i;
> > @@ -65,11 +67,33 @@ static void ioapic_service(IOAPICCommonState *s)
> >  
> >              entry = s->ioredtbl[i];
> >              if (!(entry & IOAPIC_LVT_MASKED)) {
> > -                trig_mode = ((entry >> IOAPIC_LVT_TRIGGER_MODE_SHIFT) & 1);
> > -                dest = entry >> IOAPIC_LVT_DEST_SHIFT;
> > -                dest_mode = (entry >> IOAPIC_LVT_DEST_MODE_SHIFT) & 1;
> > -                delivery_mode =
> > -                    (entry >> IOAPIC_LVT_DELIV_MODE_SHIFT) & IOAPIC_DM_MASK;
> > +
> > +                if (iommu && iommu->intr_enabled) {
> > +                    /*
> > +                    * Interrupt remapping is enabled in owner IOMMU,
> > +                    * we need to fetch the real IRQ information via
> > +                    * IRTE of the root mapping table
> > +                    */
> > +                    if (vtd_interrupt_remap_ioapic(iommu, &entry, &irq)) {
> 
> VT-d is only one of the possible IOMMUs on x86. Please introduce a
> generic interface.
> 
> Look at Rita's and my patches: they translate the IOAPIC (and HPET...)
> interrupts into MSI messages that are then - in a generic way -
> intercepted by the respective IOMMU or directly dispatched to the APICs.
> We may no longer need new memory regions for this, thanks to the region
> attributes, but we also need no hard-coded hooks here.

Yes, I should consider other x86 platforms like AMD. Thanks to point
out. It seems that there are many places in the patchset that lacks
thorough consideration about this. Will try to fix them in next
version.

Regarding to the above MSI solution: I'd say it is a good way to
hide everything else behind.  However, since we introduced one extra
layer (MSI) which actually does not exist, not sure there would be
problem too.  Also, I feel it a little bit hacky if we "create" one
MSI out of the air...  For example, if someone tries to capture MSIs
from QEMU inside in the APIC memory writes, he will see something he
cannot explain if he never knows this hack's there.  Considering the
above, I would prefer hooks, or better to provide a callback (a
function pointer that others like AMD can override) to do the
translation.  How do you think?

Thanks.

-- peterx
Jan Kiszka April 12, 2016, 3:39 p.m. UTC | #3
On 2016-04-12 02:02, Peter Xu wrote:
> On Mon, Apr 11, 2016 at 10:22:18PM -0700, Jan Kiszka wrote:
>> On 2016-04-11 02:19, Peter Xu wrote:
>>> diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c
>>> index 378e663..d963d45 100644
>>> --- a/hw/intc/ioapic.c
>>> +++ b/hw/intc/ioapic.c
>>> @@ -57,6 +57,8 @@ static void ioapic_service(IOAPICCommonState *s)
>>>      uint64_t entry;
>>>      uint8_t dest;
>>>      uint8_t dest_mode;
>>> +    IntelIOMMUState *iommu = s->iommu;
>>> +    VTDIrq irq = {0};
>>>  
>>>      for (i = 0; i < IOAPIC_NUM_PINS; i++) {
>>>          mask = 1 << i;
>>> @@ -65,11 +67,33 @@ static void ioapic_service(IOAPICCommonState *s)
>>>  
>>>              entry = s->ioredtbl[i];
>>>              if (!(entry & IOAPIC_LVT_MASKED)) {
>>> -                trig_mode = ((entry >> IOAPIC_LVT_TRIGGER_MODE_SHIFT) & 1);
>>> -                dest = entry >> IOAPIC_LVT_DEST_SHIFT;
>>> -                dest_mode = (entry >> IOAPIC_LVT_DEST_MODE_SHIFT) & 1;
>>> -                delivery_mode =
>>> -                    (entry >> IOAPIC_LVT_DELIV_MODE_SHIFT) & IOAPIC_DM_MASK;
>>> +
>>> +                if (iommu && iommu->intr_enabled) {
>>> +                    /*
>>> +                    * Interrupt remapping is enabled in owner IOMMU,
>>> +                    * we need to fetch the real IRQ information via
>>> +                    * IRTE of the root mapping table
>>> +                    */
>>> +                    if (vtd_interrupt_remap_ioapic(iommu, &entry, &irq)) {
>>
>> VT-d is only one of the possible IOMMUs on x86. Please introduce a
>> generic interface.
>>
>> Look at Rita's and my patches: they translate the IOAPIC (and HPET...)
>> interrupts into MSI messages that are then - in a generic way -
>> intercepted by the respective IOMMU or directly dispatched to the APICs.
>> We may no longer need new memory regions for this, thanks to the region
>> attributes, but we also need no hard-coded hooks here.
> 
> Yes, I should consider other x86 platforms like AMD. Thanks to point
> out. It seems that there are many places in the patchset that lacks
> thorough consideration about this. Will try to fix them in next
> version.
> 
> Regarding to the above MSI solution: I'd say it is a good way to
> hide everything else behind.  However, since we introduced one extra
> layer (MSI) which actually does not exist, not sure there would be
> problem too.  Also, I feel it a little bit hacky if we "create" one
> MSI out of the air...  For example, if someone tries to capture MSIs
> from QEMU inside in the APIC memory writes, he will see something he
> cannot explain if he never knows this hack's there.  Considering the
> above, I would prefer hooks, or better to provide a callback (a
> function pointer that others like AMD can override) to do the
> translation.  How do you think?

The HPET does send MSIs, and I'm not sure how much different the
IOAPIC's message actually is. In any case, modelling it as MSI is
neither adding incorrectness nor making the code more complex (in fact,
the contrary is true!). Last but not least, it would be trivial to
filter out non-PCI MSI sources if we wanted to trace only PCI - because
we need to identify the origin anyway for remapping purposes. So,
explicit hooking looks like the wrong way to me.

Jan
Peter Xu April 13, 2016, 3:33 a.m. UTC | #4
On Tue, Apr 12, 2016 at 08:39:02AM -0700, Jan Kiszka wrote:
> On 2016-04-12 02:02, Peter Xu wrote:

[...]

> > Yes, I should consider other x86 platforms like AMD. Thanks to point
> > out. It seems that there are many places in the patchset that lacks
> > thorough consideration about this. Will try to fix them in next
> > version.
> > 
> > Regarding to the above MSI solution: I'd say it is a good way to
> > hide everything else behind.  However, since we introduced one extra
> > layer (MSI) which actually does not exist, not sure there would be
> > problem too.  Also, I feel it a little bit hacky if we "create" one
> > MSI out of the air...  For example, if someone tries to capture MSIs
> > from QEMU inside in the APIC memory writes, he will see something he
> > cannot explain if he never knows this hack's there.  Considering the
> > above, I would prefer hooks, or better to provide a callback (a
> > function pointer that others like AMD can override) to do the
> > translation.  How do you think?
> 
> The HPET does send MSIs, and I'm not sure how much different the
> IOAPIC's message actually is. In any case, modelling it as MSI is
> neither adding incorrectness nor making the code more complex (in fact,
> the contrary is true!). Last but not least, it would be trivial to
> filter out non-PCI MSI sources if we wanted to trace only PCI - because
> we need to identify the origin anyway for remapping purposes. So,
> explicit hooking looks like the wrong way to me.

I am just not sure about the difference between IOAPIC's messages
and MSI ones. For now, they seems very alike. However, I am not sure
whether it would be not alike in the future. E.g., if one day, we
extend APIC bus to support more than 255 CPUs (could it? I do not
know for sure), here if we are with this "MSI layer", we would not
be able to do that, since MSI only support 8 bits for destination ID
field. That's my only worry now. If you (or Radim? or anyone more
experienced on this than me) can confirm that this would never be a
problem, I'd be glad to take the MSI way.

Thanks.

-- peterx
Jan Kiszka April 13, 2016, 3:39 a.m. UTC | #5
On 2016-04-12 20:33, Peter Xu wrote:
> On Tue, Apr 12, 2016 at 08:39:02AM -0700, Jan Kiszka wrote:
>> On 2016-04-12 02:02, Peter Xu wrote:
> 
> [...]
> 
>>> Yes, I should consider other x86 platforms like AMD. Thanks to point
>>> out. It seems that there are many places in the patchset that lacks
>>> thorough consideration about this. Will try to fix them in next
>>> version.
>>>
>>> Regarding to the above MSI solution: I'd say it is a good way to
>>> hide everything else behind.  However, since we introduced one extra
>>> layer (MSI) which actually does not exist, not sure there would be
>>> problem too.  Also, I feel it a little bit hacky if we "create" one
>>> MSI out of the air...  For example, if someone tries to capture MSIs
>>> from QEMU inside in the APIC memory writes, he will see something he
>>> cannot explain if he never knows this hack's there.  Considering the
>>> above, I would prefer hooks, or better to provide a callback (a
>>> function pointer that others like AMD can override) to do the
>>> translation.  How do you think?
>>
>> The HPET does send MSIs, and I'm not sure how much different the
>> IOAPIC's message actually is. In any case, modelling it as MSI is
>> neither adding incorrectness nor making the code more complex (in fact,
>> the contrary is true!). Last but not least, it would be trivial to
>> filter out non-PCI MSI sources if we wanted to trace only PCI - because
>> we need to identify the origin anyway for remapping purposes. So,
>> explicit hooking looks like the wrong way to me.
> 
> I am just not sure about the difference between IOAPIC's messages
> and MSI ones. For now, they seems very alike. However, I am not sure
> whether it would be not alike in the future. E.g., if one day, we
> extend APIC bus to support more than 255 CPUs (could it? I do not
> know for sure), here if we are with this "MSI layer", we would not
> be able to do that, since MSI only support 8 bits for destination ID
> field. That's my only worry now. If you (or Radim? or anyone more
> experienced on this than me) can confirm that this would never be a
> problem, I'd be glad to take the MSI way.

That's one of the reason why we need IR: >255 is only possible this way,
because it requires x2APIC and that requires IR (see Intel spec). So,
IOAPIC messages will then always travel via VT-d. No need to worry at all.

Jan
Peter Xu April 13, 2016, 5:09 a.m. UTC | #6
On Tue, Apr 12, 2016 at 08:39:21PM -0700, Jan Kiszka wrote:
> On 2016-04-12 20:33, Peter Xu wrote:
> > On Tue, Apr 12, 2016 at 08:39:02AM -0700, Jan Kiszka wrote:
> >> On 2016-04-12 02:02, Peter Xu wrote:
> > 
> > [...]
> > 
> >>> Yes, I should consider other x86 platforms like AMD. Thanks to point
> >>> out. It seems that there are many places in the patchset that lacks
> >>> thorough consideration about this. Will try to fix them in next
> >>> version.
> >>>
> >>> Regarding to the above MSI solution: I'd say it is a good way to
> >>> hide everything else behind.  However, since we introduced one extra
> >>> layer (MSI) which actually does not exist, not sure there would be
> >>> problem too.  Also, I feel it a little bit hacky if we "create" one
> >>> MSI out of the air...  For example, if someone tries to capture MSIs
> >>> from QEMU inside in the APIC memory writes, he will see something he
> >>> cannot explain if he never knows this hack's there.  Considering the
> >>> above, I would prefer hooks, or better to provide a callback (a
> >>> function pointer that others like AMD can override) to do the
> >>> translation.  How do you think?
> >>
> >> The HPET does send MSIs, and I'm not sure how much different the
> >> IOAPIC's message actually is. In any case, modelling it as MSI is
> >> neither adding incorrectness nor making the code more complex (in fact,
> >> the contrary is true!). Last but not least, it would be trivial to
> >> filter out non-PCI MSI sources if we wanted to trace only PCI - because
> >> we need to identify the origin anyway for remapping purposes. So,
> >> explicit hooking looks like the wrong way to me.
> > 
> > I am just not sure about the difference between IOAPIC's messages
> > and MSI ones. For now, they seems very alike. However, I am not sure
> > whether it would be not alike in the future. E.g., if one day, we
> > extend APIC bus to support more than 255 CPUs (could it? I do not
> > know for sure), here if we are with this "MSI layer", we would not
> > be able to do that, since MSI only support 8 bits for destination ID
> > field. That's my only worry now. If you (or Radim? or anyone more
> > experienced on this than me) can confirm that this would never be a
> > problem, I'd be glad to take the MSI way.
> 
> That's one of the reason why we need IR: >255 is only possible this way,
> because it requires x2APIC and that requires IR (see Intel spec). So,
> IOAPIC messages will then always travel via VT-d. No need to worry at all.

Ah, right. When we deliver the MSI, it's in remappable format, so
there is no destination ID at all... Okay, I can take this in
v3. Thanks.

-- peterx
Peter Xu April 13, 2016, 10:06 a.m. UTC | #7
On Tue, Apr 12, 2016 at 08:39:21PM -0700, Jan Kiszka wrote:
> On 2016-04-12 20:33, Peter Xu wrote:
> > On Tue, Apr 12, 2016 at 08:39:02AM -0700, Jan Kiszka wrote:
> >> On 2016-04-12 02:02, Peter Xu wrote:
> > 
> > [...]
> > 
> >>> Yes, I should consider other x86 platforms like AMD. Thanks to point
> >>> out. It seems that there are many places in the patchset that lacks
> >>> thorough consideration about this. Will try to fix them in next
> >>> version.
> >>>
> >>> Regarding to the above MSI solution: I'd say it is a good way to
> >>> hide everything else behind.  However, since we introduced one extra
> >>> layer (MSI) which actually does not exist, not sure there would be
> >>> problem too.  Also, I feel it a little bit hacky if we "create" one
> >>> MSI out of the air...  For example, if someone tries to capture MSIs
> >>> from QEMU inside in the APIC memory writes, he will see something he
> >>> cannot explain if he never knows this hack's there.  Considering the
> >>> above, I would prefer hooks, or better to provide a callback (a
> >>> function pointer that others like AMD can override) to do the
> >>> translation.  How do you think?
> >>
> >> The HPET does send MSIs, and I'm not sure how much different the
> >> IOAPIC's message actually is. In any case, modelling it as MSI is
> >> neither adding incorrectness nor making the code more complex (in fact,
> >> the contrary is true!). Last but not least, it would be trivial to
> >> filter out non-PCI MSI sources if we wanted to trace only PCI - because
> >> we need to identify the origin anyway for remapping purposes. So,
> >> explicit hooking looks like the wrong way to me.
> > 
> > I am just not sure about the difference between IOAPIC's messages
> > and MSI ones. For now, they seems very alike. However, I am not sure
> > whether it would be not alike in the future. E.g., if one day, we
> > extend APIC bus to support more than 255 CPUs (could it? I do not
> > know for sure), here if we are with this "MSI layer", we would not
> > be able to do that, since MSI only support 8 bits for destination ID
> > field. That's my only worry now. If you (or Radim? or anyone more
> > experienced on this than me) can confirm that this would never be a
> > problem, I'd be glad to take the MSI way.
> 
> That's one of the reason why we need IR: >255 is only possible this way,
> because it requires x2APIC and that requires IR (see Intel spec). So,
> IOAPIC messages will then always travel via VT-d. No need to worry at all.

One thing I am curious about: I see that in vtd spec 5.1.5.1:

"RTE bits 10:8 is programmed to 000b (Fixed) to force the SHV
(SubHandle Valid) field as Clear in the interrupt address
generated."

So... In real IOMMU hardwares, IOAPIC interrupt will finally be
translated to MSI as well? am I understanding it correctly?

Btw, if to use the way you suggested, the patch content would
possibly be very alike the one you and Rita has posted, which is:

https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg01933.html

I will only pick up those lines I needed in supporting IOAPIC. If
so, do you mind I add your s-o-b as well above mine (maybe add
Rita's too)?

Thanks.

-- peterx
Jan Kiszka April 13, 2016, 2:44 p.m. UTC | #8
On 2016-04-13 03:06, Peter Xu wrote:
> On Tue, Apr 12, 2016 at 08:39:21PM -0700, Jan Kiszka wrote:
>> On 2016-04-12 20:33, Peter Xu wrote:
>>> On Tue, Apr 12, 2016 at 08:39:02AM -0700, Jan Kiszka wrote:
>>>> On 2016-04-12 02:02, Peter Xu wrote:
>>>
>>> [...]
>>>
>>>>> Yes, I should consider other x86 platforms like AMD. Thanks to point
>>>>> out. It seems that there are many places in the patchset that lacks
>>>>> thorough consideration about this. Will try to fix them in next
>>>>> version.
>>>>>
>>>>> Regarding to the above MSI solution: I'd say it is a good way to
>>>>> hide everything else behind.  However, since we introduced one extra
>>>>> layer (MSI) which actually does not exist, not sure there would be
>>>>> problem too.  Also, I feel it a little bit hacky if we "create" one
>>>>> MSI out of the air...  For example, if someone tries to capture MSIs
>>>>> from QEMU inside in the APIC memory writes, he will see something he
>>>>> cannot explain if he never knows this hack's there.  Considering the
>>>>> above, I would prefer hooks, or better to provide a callback (a
>>>>> function pointer that others like AMD can override) to do the
>>>>> translation.  How do you think?
>>>>
>>>> The HPET does send MSIs, and I'm not sure how much different the
>>>> IOAPIC's message actually is. In any case, modelling it as MSI is
>>>> neither adding incorrectness nor making the code more complex (in fact,
>>>> the contrary is true!). Last but not least, it would be trivial to
>>>> filter out non-PCI MSI sources if we wanted to trace only PCI - because
>>>> we need to identify the origin anyway for remapping purposes. So,
>>>> explicit hooking looks like the wrong way to me.
>>>
>>> I am just not sure about the difference between IOAPIC's messages
>>> and MSI ones. For now, they seems very alike. However, I am not sure
>>> whether it would be not alike in the future. E.g., if one day, we
>>> extend APIC bus to support more than 255 CPUs (could it? I do not
>>> know for sure), here if we are with this "MSI layer", we would not
>>> be able to do that, since MSI only support 8 bits for destination ID
>>> field. That's my only worry now. If you (or Radim? or anyone more
>>> experienced on this than me) can confirm that this would never be a
>>> problem, I'd be glad to take the MSI way.
>>
>> That's one of the reason why we need IR: >255 is only possible this way,
>> because it requires x2APIC and that requires IR (see Intel spec). So,
>> IOAPIC messages will then always travel via VT-d. No need to worry at all.
> 
> One thing I am curious about: I see that in vtd spec 5.1.5.1:
> 
> "RTE bits 10:8 is programmed to 000b (Fixed) to force the SHV
> (SubHandle Valid) field as Clear in the interrupt address
> generated."
> 
> So... In real IOMMU hardwares, IOAPIC interrupt will finally be
> translated to MSI as well? am I understanding it correctly?

It will be translated into something that the IR unit can receive -
whatever that is technically. Logically, there is no difference to MSIs
received from PCI devices.

> 
> Btw, if to use the way you suggested, the patch content would
> possibly be very alike the one you and Rita has posted, which is:
> 
> https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg01933.html
> 
> I will only pick up those lines I needed in supporting IOAPIC. If
> so, do you mind I add your s-o-b as well above mine (maybe add
> Rita's too)?

If a patch is almost identical, add your [Peter: my changes...] line and
your signed of to it. If it is more modified, claim authorship and just
refer to the original authors in the commit log.

Jan
Peter Xu April 14, 2016, 2:46 a.m. UTC | #9
On Wed, Apr 13, 2016 at 07:44:54AM -0700, Jan Kiszka wrote:
[...]
> > One thing I am curious about: I see that in vtd spec 5.1.5.1:
> > 
> > "RTE bits 10:8 is programmed to 000b (Fixed) to force the SHV
> > (SubHandle Valid) field as Clear in the interrupt address
> > generated."
> > 
> > So... In real IOMMU hardwares, IOAPIC interrupt will finally be
> > translated to MSI as well? am I understanding it correctly?
> 
> It will be translated into something that the IR unit can receive -
> whatever that is technically. Logically, there is no difference to MSIs
> received from PCI devices.

Ok. I see there are still differences between IOAPIC and MSI, e.g.,
for IOAPIC entries, it has "Interrupt Input Pin Polarity", which is
bit 13 of the entry, to show whether 1 or 0 is taken as assertion
for level-triggered interrupts. While in MSI, I assume assertion
will be 1 always. Can I take it as "obsolete" and we will never use
it?

If to take IOAPIC entries as MSI messages, all these extra bits will
be dropped (it's dropped in ioapic_service() already, but not sure
whether we will pick them up in the future).

> 
> > 
> > Btw, if to use the way you suggested, the patch content would
> > possibly be very alike the one you and Rita has posted, which is:
> > 
> > https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg01933.html
> > 
> > I will only pick up those lines I needed in supporting IOAPIC. If
> > so, do you mind I add your s-o-b as well above mine (maybe add
> > Rita's too)?
> 
> If a patch is almost identical, add your [Peter: my changes...] line and
> your signed of to it. If it is more modified, claim authorship and just
> refer to the original authors in the commit log.

Ok. Thanks.

-- peterx
Jan Kiszka April 14, 2016, 5:42 a.m. UTC | #10
On 2016-04-13 19:46, Peter Xu wrote:
> On Wed, Apr 13, 2016 at 07:44:54AM -0700, Jan Kiszka wrote:
> [...]
>>> One thing I am curious about: I see that in vtd spec 5.1.5.1:
>>>
>>> "RTE bits 10:8 is programmed to 000b (Fixed) to force the SHV
>>> (SubHandle Valid) field as Clear in the interrupt address
>>> generated."
>>>
>>> So... In real IOMMU hardwares, IOAPIC interrupt will finally be
>>> translated to MSI as well? am I understanding it correctly?
>>
>> It will be translated into something that the IR unit can receive -
>> whatever that is technically. Logically, there is no difference to MSIs
>> received from PCI devices.
> 
> Ok. I see there are still differences between IOAPIC and MSI, e.g.,
> for IOAPIC entries, it has "Interrupt Input Pin Polarity", which is
> bit 13 of the entry, to show whether 1 or 0 is taken as assertion
> for level-triggered interrupts. While in MSI, I assume assertion
> will be 1 always. Can I take it as "obsolete" and we will never use
> it?

I can't check details right now, but I would recommend studying in the
specs if any of the *receiver* (APIC and IOMMU) can make sense of that
bit at all, and how. Also study (commit logs) if there is a reason why
the bit is unused.

> 
> If to take IOAPIC entries as MSI messages, all these extra bits will
> be dropped (it's dropped in ioapic_service() already, but not sure
> whether we will pick them up in the future).

What other bits?

Jan
Peter Xu April 14, 2016, 8:28 a.m. UTC | #11
On Wed, Apr 13, 2016 at 10:42:26PM -0700, Jan Kiszka wrote:
> On 2016-04-13 19:46, Peter Xu wrote:
> > On Wed, Apr 13, 2016 at 07:44:54AM -0700, Jan Kiszka wrote:
> > [...]
> >>> One thing I am curious about: I see that in vtd spec 5.1.5.1:
> >>>
> >>> "RTE bits 10:8 is programmed to 000b (Fixed) to force the SHV
> >>> (SubHandle Valid) field as Clear in the interrupt address
> >>> generated."
> >>>
> >>> So... In real IOMMU hardwares, IOAPIC interrupt will finally be
> >>> translated to MSI as well? am I understanding it correctly?
> >>
> >> It will be translated into something that the IR unit can receive -
> >> whatever that is technically. Logically, there is no difference to MSIs
> >> received from PCI devices.
> > 
> > Ok. I see there are still differences between IOAPIC and MSI, e.g.,
> > for IOAPIC entries, it has "Interrupt Input Pin Polarity", which is
> > bit 13 of the entry, to show whether 1 or 0 is taken as assertion
> > for level-triggered interrupts. While in MSI, I assume assertion
> > will be 1 always. Can I take it as "obsolete" and we will never use
> > it?
> 
> I can't check details right now, but I would recommend studying in the
> specs if any of the *receiver* (APIC and IOMMU) can make sense of that
> bit at all, and how. Also study (commit logs) if there is a reason why
> the bit is unused.

Thanks for the advices. Will add this in my todo list. I am guessing
that, all devices emulated by QEMU are using 1 as assertions..

It's defined as IOAPIC_LVT_POLARITY in QEMU. As long as I am sure
that current QEMU does not use it, I think I can move on to v3 using
MSI path, at least to make sure I'm not making things worse, or
bringing any functionality loss.

> 
> > 
> > If to take IOAPIC entries as MSI messages, all these extra bits will
> > be dropped (it's dropped in ioapic_service() already, but not sure
> > whether we will pick them up in the future).
> 
> What other bits?

Besides polarity bit, there is another one IOAPIC_LVT_DELIV_STATUS,
which is not used too. But after more careful look, I see this bit
is not too relevant if we are talking about translating IOAPIC
entries into MSI.

Thanks.

-- peterx
diff mbox

Patch

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index a44289f..1dcdc7b 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2014,6 +2014,133 @@  IntelIOMMUState *vtd_iommu_get(void)
     return (IntelIOMMUState *)intel_iommu;
 }
 
+/* Read IRTE entry with specific index */
+static int vtd_irte_get(IntelIOMMUState *iommu, uint16_t index,
+                        VTD_IRTE *entry)
+{
+    dma_addr_t addr = 0x00;
+
+    addr = iommu->intr_root + index * sizeof(*entry);
+    if (dma_memory_read(&address_space_memory, addr, entry,
+                        sizeof(*entry))) {
+        VTD_DPRINTF(GENERAL, "error: fail to access IR root at 0x%"PRIx64
+                    " + %"PRIu16, iommu->intr_root, index);
+        return -VTD_FR_IR_ROOT_INVAL;
+    }
+
+    if (!entry->present) {
+        VTD_DPRINTF(GENERAL, "error: present flag not set in IRTE"
+                    " entry index %u value 0x%"PRIx64 " 0x%"PRIx64,
+                    index, le64_to_cpu(entry->data[1]),
+                    le64_to_cpu(entry->data[0]));
+        return -VTD_FR_IR_ENTRY_P;
+    }
+
+    if (entry->__reserved_0 || entry->__reserved_1 || \
+        entry->__reserved_2) {
+        VTD_DPRINTF(GENERAL, "error: IRTE entry index %"PRIu16
+                    " reserved fields non-zero: 0x%"PRIx64 " 0x%"PRIx64,
+                    index, le64_to_cpu(entry->data[1]),
+                    le64_to_cpu(entry->data[0]));
+        return -VTD_FR_IR_IRTE_RSVD;
+    }
+
+    /*
+     * TODO: Check Source-ID corresponds to SVT (Source Validation
+     * Type) bits
+     */
+
+    return 0;
+}
+
+/* Fetch IRQ information of specific IR index */
+static int vtd_remap_irq_get(IntelIOMMUState *iommu, uint16_t index, VTDIrq *irq)
+{
+    VTD_IRTE irte;
+    int ret = 0;
+
+    bzero(&irte, sizeof(irte));
+
+    ret = vtd_irte_get(iommu, index, &irte);
+    if (ret) {
+        return ret;
+    }
+
+    irq->trigger_mode = irte.trigger_mode;
+    irq->vector = irte.vector;
+    irq->delivery_mode = irte.delivery_mode;
+    /* Not support EIM yet: please refer to vt-d 9.10 DST bits */
+#define  VTD_IR_APIC_DEST_MASK         (0xff00ULL)
+#define  VTD_IR_APIC_DEST_SHIFT        (8)
+    irq->dest = (irte.dest_id & VTD_IR_APIC_DEST_MASK) >> \
+        VTD_IR_APIC_DEST_SHIFT;
+    irq->dest_mode = irte.dest_mode;
+
+    VTD_DPRINTF(IR, "remapping interrupt index %d: trig:%u,vec:%u,"
+                "deliver:%u,dest:%u,dest_mode:%u", index,
+                irq->trigger_mode, irq->vector, irq->delivery_mode,
+                irq->dest, irq->dest_mode);
+
+    return 0;
+}
+
+/* Interrupt remapping for IOAPIC IRQ entry */
+int vtd_interrupt_remap_ioapic(IntelIOMMUState *iommu,
+                               uint64_t *ioapic_entry, VTDIrq *irq)
+{
+    int ret = 0;
+    uint16_t index = 0;
+    VTD_IR_IOAPICEntry *entry = (VTD_IR_IOAPICEntry *)ioapic_entry;
+
+    assert(iommu && entry && irq);
+    assert(iommu->intr_enabled);
+
+    /* TODO: Currently we still do not support compatible mode */
+    if (entry->int_mode != VTD_IR_INT_FORMAT_REMAP) {
+        VTD_DPRINTF(GENERAL, "error: trying to remap IOAPIC entry"
+                    " with compatible format: 0x%"PRIx64,
+                    le64_to_cpu(entry->data));
+        return -VTD_FR_IR_REQ_COMPAT;
+    }
+
+    if (entry->__zeros || entry->__reserved) {
+        VTD_DPRINTF(GENERAL, "error: reserved not empty for IOAPIC"
+                    "entry 0x%"PRIx64, le64_to_cpu(entry->data));
+        return -VTD_FR_IR_REQ_RSVD;
+    }
+
+    index = entry->index_h << 15 | entry->index_l;
+    ret = vtd_remap_irq_get(iommu, index, irq);
+    if (ret) {
+        return ret;
+    }
+
+    /* Trigger mode should be aligned between IOAPIC entry and IRTE
+     * entry */
+    if (irq->trigger_mode != entry->trigger_mode) {
+        /* This is possibly guest OS bug?! */
+        VTD_DPRINTF(GENERAL, "error: IOAPIC trigger mode inconsistent: "
+                    "0x%"PRIx64 " with IR table index %d",
+                    le64_to_cpu(entry->data), index);
+        /* Currently no such error defined */
+        return -VTD_FR_RESERVED_ERR;
+    }
+
+    /* Vector should be aligned too */
+    if (irq->vector != entry->vector) {
+        /*
+         * Latest linux kernel will not provide consistent
+         * vectors. Need some more digging to know why. Whatever,
+         * the one in IRTE is always correct. So directly use it.
+         */
+        VTD_DPRINTF(IR, "warn: IOAPIC vector inconsistent: "
+                    "index %d: entry=%d, IRTE=%d", index,
+                    entry->vector, irq->vector);
+    }
+
+    return 0;
+}
+
 /* Do the initialization. It will also be called when reset, so pay
  * attention when adding new initialization stuff.
  */
diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c
index 378e663..d963d45 100644
--- a/hw/intc/ioapic.c
+++ b/hw/intc/ioapic.c
@@ -57,6 +57,8 @@  static void ioapic_service(IOAPICCommonState *s)
     uint64_t entry;
     uint8_t dest;
     uint8_t dest_mode;
+    IntelIOMMUState *iommu = s->iommu;
+    VTDIrq irq = {0};
 
     for (i = 0; i < IOAPIC_NUM_PINS; i++) {
         mask = 1 << i;
@@ -65,11 +67,33 @@  static void ioapic_service(IOAPICCommonState *s)
 
             entry = s->ioredtbl[i];
             if (!(entry & IOAPIC_LVT_MASKED)) {
-                trig_mode = ((entry >> IOAPIC_LVT_TRIGGER_MODE_SHIFT) & 1);
-                dest = entry >> IOAPIC_LVT_DEST_SHIFT;
-                dest_mode = (entry >> IOAPIC_LVT_DEST_MODE_SHIFT) & 1;
-                delivery_mode =
-                    (entry >> IOAPIC_LVT_DELIV_MODE_SHIFT) & IOAPIC_DM_MASK;
+
+                if (iommu && iommu->intr_enabled) {
+                    /*
+                    * Interrupt remapping is enabled in owner IOMMU,
+                    * we need to fetch the real IRQ information via
+                    * IRTE of the root mapping table
+                    */
+                    if (vtd_interrupt_remap_ioapic(iommu, &entry, &irq)) {
+                        DPRINTF("%s: IOAPIC remap fail on index %d "
+                                "entry 0x%lx, drop it for now\n",
+                                __func__, index, entry);
+                        return;
+                    }
+                    trig_mode = irq.trigger_mode;
+                    dest = irq.dest;
+                    dest_mode = irq.dest_mode;
+                    delivery_mode = irq.delivery_mode;
+                    vector = irq.vector;
+                } else {
+                    /* This is generic IOAPIC entry */
+                    trig_mode = ((entry >> IOAPIC_LVT_TRIGGER_MODE_SHIFT) & 1);
+                    dest = entry >> IOAPIC_LVT_DEST_SHIFT;
+                    dest_mode = (entry >> IOAPIC_LVT_DEST_MODE_SHIFT) & 1;
+                    delivery_mode =
+                        (entry >> IOAPIC_LVT_DELIV_MODE_SHIFT) & IOAPIC_DM_MASK;
+                    vector = entry & IOAPIC_VECTOR_MASK;
+                }
                 if (trig_mode == IOAPIC_TRIGGER_EDGE) {
                     s->irr &= ~mask;
                 } else {
@@ -78,8 +102,6 @@  static void ioapic_service(IOAPICCommonState *s)
                 }
                 if (delivery_mode == IOAPIC_DM_EXTINT) {
                     vector = pic_read_irq(isa_pic);
-                } else {
-                    vector = entry & IOAPIC_VECTOR_MASK;
                 }
 #ifdef CONFIG_KVM
                 if (kvm_irqchip_is_split()) {
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 9ee84f7..6a79207 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -23,6 +23,7 @@ 
 #define INTEL_IOMMU_H
 #include "hw/qdev.h"
 #include "sysemu/dma.h"
+#include "hw/i386/ioapic.h"
 
 #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
 #define INTEL_IOMMU_DEVICE(obj) \
@@ -55,6 +56,7 @@  typedef struct VTDBus VTDBus;
 typedef union VTD_IRTE VTD_IRTE;
 typedef union VTD_IR_IOAPICEntry VTD_IR_IOAPICEntry;
 typedef union VTD_IR_MSIAddress VTD_IR_MSIAddress;
+typedef struct VTDIrq VTDIrq;
 
 /* Context-Entry */
 struct VTDContextEntry {
@@ -116,6 +118,9 @@  union VTD_IRTE {
     uint64_t data[2];
 };
 
+#define VTD_IR_INT_FORMAT_COMPAT     (0) /* Compatible Interrupt */
+#define VTD_IR_INT_FORMAT_REMAP      (1) /* Remappable Interrupt */
+
 /* Programming format for IOAPIC table entries */
 union VTD_IR_IOAPICEntry {
     struct {
@@ -147,6 +152,15 @@  union VTD_IR_MSIAddress {
     uint32_t data;
 };
 
+/* Generic IRQ entry information */
+struct VTDIrq {
+    uint8_t trigger_mode;
+    uint8_t vector;
+    uint8_t delivery_mode;
+    uint32_t dest;
+    uint8_t dest_mode;
+};
+
 /* When IR is enabled, all MSI/MSI-X data bits should be zero */
 #define VTD_IR_MSI_DATA          (0)
 
@@ -198,5 +212,8 @@  struct IntelIOMMUState {
 VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn);
 /* Get default IOMMU object */
 IntelIOMMUState *vtd_iommu_get(void);
+/* Interrupt remapping for IOAPIC IRQ entry */
+int vtd_interrupt_remap_ioapic(IntelIOMMUState *iommu,
+                               uint64_t *ioapic_entry, VTDIrq *irq);
 
 #endif