diff mbox

[v2,13/13] intel_iommu: Add support for PCI MSI remap

Message ID 1460366363-4589-14-git-send-email-peterx@redhat.com
State New
Headers show

Commit Message

Peter Xu April 11, 2016, 9:19 a.m. UTC
This patch enables interrupt remapping for PCI devices.

To play the trick, one memory region "iommu_ir" is added as child region
of the original iommu memory region, covering range 0xfeeXXXXX (which is
the address range for APIC). All the writes to this range will be taken
as MSI, and translation is carried out when IR is enabled.

The translation process tried to make full use of the helper function
from IOAPIC patch. Several more bits are added to VTDIrq as MSI specific
fields.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 hw/i386/intel_iommu.c          | 172 ++++++++++++++++++++++++++++++++++++++++-
 hw/i386/intel_iommu_internal.h |   2 +
 include/hw/i386/intel_iommu.h  |  34 ++++++++
 include/hw/pci/msi.h           |   4 +
 4 files changed, 211 insertions(+), 1 deletion(-)

Comments

Michael S. Tsirkin April 11, 2016, 12:41 p.m. UTC | #1
On Mon, Apr 11, 2016 at 05:19:23PM +0800, Peter Xu wrote:
> This patch enables interrupt remapping for PCI devices.
> 
> To play the trick, one memory region "iommu_ir" is added as child region
> of the original iommu memory region, covering range 0xfeeXXXXX (which is
> the address range for APIC). All the writes to this range will be taken
> as MSI, and translation is carried out when IR is enabled.
> 
> The translation process tried to make full use of the helper function
> from IOAPIC patch. Several more bits are added to VTDIrq as MSI specific
> fields.
> 
> Signed-off-by: Peter Xu <peterx@redhat.com>

So far so good, but I wonder how we are going to
support irqfd with this approach.
It would be nicer to have a single module that
somehow handles both ioapic and non ioapic.

> ---
>  hw/i386/intel_iommu.c          | 172 ++++++++++++++++++++++++++++++++++++++++-
>  hw/i386/intel_iommu_internal.h |   2 +
>  include/hw/i386/intel_iommu.h  |  34 ++++++++
>  include/hw/pci/msi.h           |   4 +
>  4 files changed, 211 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index 1dcdc7b..322b5ac 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -44,6 +44,10 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) | VTD_DBGBIT(CSR);
>  #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
>  #endif
>  
> +static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu,
> +                                   MSIMessage *origin,
> +                                   MSIMessage *translated);
> +
>  static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
>                              uint64_t wmask, uint64_t w1cmask)
>  {

don't forward-declare static functions, just arrange them sensibly.

> @@ -1964,12 +1968,70 @@ static const MemoryRegionOps vtd_mem_ops = {
>      },
>  };
>  
> +static uint64_t vtd_mem_ir_read(void *opaque, hwaddr addr, unsigned size)
> +{
> +    uint64_t data = 0;
> +
> +    addr += VTD_INTERRUPT_ADDR_FIRST;
> +
> +    VTD_DPRINTF(IR, "read mem_ir addr 0x%"PRIx64 " size %u",
> +                addr, size);
> +
> +    if (dma_memory_read(&address_space_memory, addr, &data, size)) {
> +        VTD_DPRINTF(GENERAL, "error: fail to access 0x%"PRIx64, addr);
> +        return (uint32_t) -1;
> +    }
> +
> +    return data;
> +}
> +
> +static void vtd_mem_ir_write(void *opaque, hwaddr addr,
> +                             uint64_t value, unsigned size)
> +{
> +    int ret = 0;
> +    MSIMessage from = {0}, to = {0};
> +
> +    from.address = (uint64_t) addr + VTD_INTERRUPT_ADDR_FIRST;
> +    from.data = (uint32_t) value;
> +
> +    ret = vtd_interrupt_remap_msi(opaque, &from, &to);
> +    if (ret) {
> +        /* TODO: report error */
> +        VTD_DPRINTF(GENERAL, "int remap fail for addr 0x%"PRIx64
> +                    " data 0x%"PRIx32, from.address, from.data);
> +        /* Drop this interrupt */
> +        return;
> +    }
> +
> +    VTD_DPRINTF(IR, "delivering MSI 0x%"PRIx64":0x%"PRIx32,
> +                to.address, to.data);
> +
> +    if (dma_memory_write(&address_space_memory, to.address,
> +                         &to.data, size)) {
> +        VTD_DPRINTF(GENERAL, "error: fail to write 0x%"PRIx64
> +                    " value 0x%"PRIx32, to.address, to.data);
> +    }
> +}
> +
> +static const MemoryRegionOps vtd_mem_ir_ops = {
> +    .read = vtd_mem_ir_read,
> +    .write = vtd_mem_ir_write,
> +    .endianness = DEVICE_LITTLE_ENDIAN,
> +    .impl = {
> +        .min_access_size = 4,
> +        .max_access_size = 4,
> +    },
> +    .valid = {
> +        .min_access_size = 4,
> +        .max_access_size = 4,
> +    },
> +};
> +
>  static Property vtd_properties[] = {
>      DEFINE_PROP_UINT32("version", IntelIOMMUState, version, 0),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> -
>  VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
>  {
>      uintptr_t key = (uintptr_t)bus;
> @@ -1995,6 +2057,11 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
>          vtd_dev_as->context_cache_entry.context_cache_gen = 0;
>          memory_region_init_iommu(&vtd_dev_as->iommu, OBJECT(s),
>                                   &s->iommu_ops, "intel_iommu", UINT64_MAX);
> +        memory_region_init_io(&vtd_dev_as->iommu_ir, OBJECT(s),
> +                              &vtd_mem_ir_ops, s, "intel_iommu_ir",
> +                              VTD_INTERRUPT_ADDR_SIZE);
> +        memory_region_add_subregion(&vtd_dev_as->iommu, VTD_INTERRUPT_ADDR_FIRST,
> +                                    &vtd_dev_as->iommu_ir);
>          address_space_init(&vtd_dev_as->as,
>                             &vtd_dev_as->iommu, "intel_iommu");
>      }
> @@ -2075,6 +2142,7 @@ static int vtd_remap_irq_get(IntelIOMMUState *iommu, uint16_t index, VTDIrq *irq
>      irq->dest = (irte.dest_id & VTD_IR_APIC_DEST_MASK) >> \
>          VTD_IR_APIC_DEST_SHIFT;
>      irq->dest_mode = irte.dest_mode;
> +    irq->redir_hint = irte.redir_hint;
>  
>      VTD_DPRINTF(IR, "remapping interrupt index %d: trig:%u,vec:%u,"
>                  "deliver:%u,dest:%u,dest_mode:%u", index,
> @@ -2141,6 +2209,108 @@ int vtd_interrupt_remap_ioapic(IntelIOMMUState *iommu,
>      return 0;
>  }
>  
> +/* Generate one MSI message from VTDIrq info */
> +static void vtd_generate_msi_message(VTDIrq *irq, MSIMessage *msg_out)
> +{
> +    VTD_MSIMessage msg;
> +
> +    bzero(&msg, sizeof(msg));
> +

Just msg = {} will do it.

> +    /* Generate address bits */
> +    msg.dest_mode = irq->dest_mode;
> +    msg.redir_hint = irq->redir_hint;
> +    msg.dest = irq->dest;
> +    msg.__addr_head = 0xfee;
> +    /* Keep this from original MSI address bits */
> +    msg.__not_used = irq->msi_addr_last_bits;
> +
> +    /* Generate data bits */
> +    msg.vector = irq->vector;
> +    msg.delivery_mode = irq->delivery_mode;
> +    /* FIXME: assert always for level mode interrupt */

what does this mean?

> +    msg.level = 1;
> +    msg.trigger_mode = irq->trigger_mode;
> +
> +    msg_out->address = msg.msi_addr;
> +    msg_out->data = msg.msi_data;
> +}
> +
> +/* Interrupt remapping for MSI/MSI-X entry */
> +static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu,
> +                                   MSIMessage *origin,
> +                                   MSIMessage *translated)
> +{
> +    int ret = 0;
> +    VTD_IR_MSIAddress addr;
> +    uint16_t index = 0;
> +    VTDIrq irq = {0};
> +
> +    assert(iommu && origin && translated);
> +
> +    if (!iommu->intr_enabled) {
> +        memcpy(translated, origin, sizeof(*origin));
> +        return 0;
> +    }
> +
> +    if (origin->data) {
> +        VTD_DPRINTF(GENERAL, "error: MSI data bits non-zero for "
> +                    "interrupt remappable entry: 0x%"PRIx32,
> +                    origin->data);
> +        return -VTD_FR_IR_REQ_RSVD;
> +    }
> +
> +    if (origin->address & MSI_ADDR_HI_MASK) {
> +        VTD_DPRINTF(GENERAL, "error: MSI addr high 32 bits nonzero"
> +                    " during interrupt remapping: 0x%"PRIx32,
> +                    (uint32_t)((origin->address & MSI_ADDR_HI_MASK) >> \
> +                    MSI_ADDR_HI_SHIFT));
> +        return -VTD_FR_IR_REQ_RSVD;
> +    }
> +
> +    addr.data = origin->address & MSI_ADDR_LO_MASK;
> +    if (addr.__head != 0xfee) {
> +        VTD_DPRINTF(GENERAL, "error: MSI addr low 32 bits invalid: "
> +                    "0x%"PRIx32, addr.data);
> +        return -VTD_FR_IR_REQ_RSVD;
> +    }
> +
> +    if (addr.sub_valid != 1) {
> +        VTD_DPRINTF(GENERAL, "error: SHV not set for MSI addr: "
> +                    "0x%"PRIx32, addr.data);
> +        return -VTD_FR_IR_REQ_RSVD;
> +    }
> +
> +    /* TODO: Currently we still do not support compatible mode */
> +    if (addr.int_mode != VTD_IR_INT_FORMAT_REMAP) {
> +        VTD_DPRINTF(GENERAL, "error: trying to remap MSI entry"
> +                    " with compatible format: 0x%"PRIx32,
> +                    addr.data);
> +        return -VTD_FR_IR_REQ_COMPAT;
> +    }
> +
> +    index = addr.index_h << 15 | addr.index_l;
> +
> +    ret = vtd_remap_irq_get(iommu, index, &irq);
> +    if (ret) {
> +        return ret;
> +    }
> +
> +    /*
> +     * We'd better keep the last two bits, assuming that guest OS
> +     * might modify it. Keep it does not hurt after all.
> +     */
> +    irq.msi_addr_last_bits = addr.__not_care;
> +
> +    /* Translate VTDIrq to MSI message */
> +    vtd_generate_msi_message(&irq, translated);
> +
> +    VTD_DPRINTF(IR, "mapping MSI 0x%"PRIx64":0x%"PRIx32 " -> "
> +                "0x%"PRIx64":0x%"PRIx32, origin->address, origin->data,
> +                translated->address, translated->data);
> +
> +    return 0;
> +}
> +
>  /* Do the initialization. It will also be called when reset, so pay
>   * attention when adding new initialization stuff.
>   */
> diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
> index 2a9987f..e1a08cb 100644
> --- a/hw/i386/intel_iommu_internal.h
> +++ b/hw/i386/intel_iommu_internal.h
> @@ -110,6 +110,8 @@
>  /* Interrupt Address Range */
>  #define VTD_INTERRUPT_ADDR_FIRST    0xfee00000ULL
>  #define VTD_INTERRUPT_ADDR_LAST     0xfeefffffULL
> +#define VTD_INTERRUPT_ADDR_SIZE     (VTD_INTERRUPT_ADDR_LAST - \
> +                                     VTD_INTERRUPT_ADDR_FIRST + 1)
>  
>  /* The shift of source_id in the key of IOTLB hash table */
>  #define VTD_IOTLB_SID_SHIFT         36
> diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
> index 6a79207..9297eba 100644
> --- a/include/hw/i386/intel_iommu.h
> +++ b/include/hw/i386/intel_iommu.h
> @@ -24,6 +24,7 @@
>  #include "hw/qdev.h"
>  #include "sysemu/dma.h"
>  #include "hw/i386/ioapic.h"
> +#include "hw/pci/msi.h"
>  
>  #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
>  #define INTEL_IOMMU_DEVICE(obj) \
> @@ -57,6 +58,7 @@ typedef union VTD_IRTE VTD_IRTE;
>  typedef union VTD_IR_IOAPICEntry VTD_IR_IOAPICEntry;
>  typedef union VTD_IR_MSIAddress VTD_IR_MSIAddress;
>  typedef struct VTDIrq VTDIrq;
> +typedef struct VTD_MSIMessage VTD_MSIMessage;
>  
>  /* Context-Entry */
>  struct VTDContextEntry {
> @@ -77,6 +79,7 @@ struct VTDAddressSpace {
>      uint8_t devfn;
>      AddressSpace as;
>      MemoryRegion iommu;
> +    MemoryRegion iommu_ir;      /* Interrupt region: 0xfeeXXXXX */
>      IntelIOMMUState *iommu_state;
>      VTDContextCacheEntry context_cache_entry;
>  };
> @@ -154,11 +157,42 @@ union VTD_IR_MSIAddress {
>  
>  /* Generic IRQ entry information */
>  struct VTDIrq {
> +    /* Used by both IOAPIC/MSI interrupt remapping */
>      uint8_t trigger_mode;
>      uint8_t vector;
>      uint8_t delivery_mode;
>      uint32_t dest;
>      uint8_t dest_mode;
> +
> +    /* only used by MSI interrupt remapping */
> +    uint8_t redir_hint;
> +    uint8_t msi_addr_last_bits;
> +};
> +
> +struct VTD_MSIMessage {
> +    union {
> +        struct {
> +            uint16_t __not_used:2;
> +            uint16_t dest_mode:1;
> +            uint16_t redir_hint:1;
> +            uint16_t __reserved:8;
> +            uint16_t dest:8;
> +            uint16_t __addr_head:12; /* 0xfee */
> +            uint32_t __addr_hi:32;
> +        } QEMU_PACKED;
> +        uint64_t msi_addr;
> +    };
> +    union {
> +        struct {
> +            uint16_t vector:8;
> +            uint16_t delivery_mode:3;
> +            uint16_t __resved:3;
> +            uint16_t level:1;
> +            uint16_t trigger_mode:1;
> +            uint16_t __resved1:16;
> +        } QEMU_PACKED;
> +        uint32_t msi_data;
> +    };
>  };
>  
>  /* When IR is enabled, all MSI/MSI-X data bits should be zero */
> diff --git a/include/hw/pci/msi.h b/include/hw/pci/msi.h
> index 8124908..8e94fca 100644
> --- a/include/hw/pci/msi.h
> +++ b/include/hw/pci/msi.h
> @@ -24,6 +24,10 @@
>  #include "qemu-common.h"
>  #include "hw/pci/pci.h"
>  
> +#define  MSI_ADDR_HI_MASK        (0xffffffff00000000ULL)
> +#define  MSI_ADDR_HI_SHIFT       (32)
> +#define  MSI_ADDR_LO_MASK        (0x00000000ffffffffULL)
> +

I don't think these are needed in msi.h, rename them with
vtd prefix and put them where they are used.

>  struct MSIMessage {
>      uint64_t address;
>      uint32_t data;
> -- 
> 2.4.3
Peter Xu April 13, 2016, 7:23 a.m. UTC | #2
Hi, Michael,

On Mon, Apr 11, 2016 at 03:41:29PM +0300, Michael S. Tsirkin wrote:
> On Mon, Apr 11, 2016 at 05:19:23PM +0800, Peter Xu wrote:
> > This patch enables interrupt remapping for PCI devices.
> > 
> > To play the trick, one memory region "iommu_ir" is added as child region
> > of the original iommu memory region, covering range 0xfeeXXXXX (which is
> > the address range for APIC). All the writes to this range will be taken
> > as MSI, and translation is carried out when IR is enabled.
> > 
> > The translation process tried to make full use of the helper function
> > from IOAPIC patch. Several more bits are added to VTDIrq as MSI specific
> > fields.
> > 
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> 
> So far so good, but I wonder how we are going to
> support irqfd with this approach.

I suppose the support for irqfd can be considered together with
kvm-ioapic support (though if to implement it, I will start from
kvm-ioapic). I haven't thought about it in depth, but currently I
have two plans for it:

- translate interrupt in kernel: firstly, we may need to provide a
  new x86-specific ioctl to tell the kernel the location of the IR
  table. After that, when we got irqfd signal, we translate before
  putting it onto APIC bus. The problem is that (at least), we may
  need to implement the logic twice (both in QEMU and KVM, though
  the logic to parse IR is not very complicated), and everything
  will be doubled in the future (e.g., future IR caches, etc).

- translate interrupt in user space: this will be something like
  Jason's DMAR patchset. When kernel receive IR request (either
  kvm-ioapic, or irqfd), it can ask QEMU about it. Here, we'd better
  introduce IR cache in kernel, and we may further need to enable IR
  cache invalidation in QEMU side, so that when the IR cache is
  polluted, QEMU can let KVM know it.

> It would be nicer to have a single module that
> somehow handles both ioapic and non ioapic.

I will take Jan's advice to handle IOAPIC leveraging MSI
path. That will make things cleaner.

> 
> > ---
> >  hw/i386/intel_iommu.c          | 172 ++++++++++++++++++++++++++++++++++++++++-
> >  hw/i386/intel_iommu_internal.h |   2 +
> >  include/hw/i386/intel_iommu.h  |  34 ++++++++
> >  include/hw/pci/msi.h           |   4 +
> >  4 files changed, 211 insertions(+), 1 deletion(-)
> > 
> > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > index 1dcdc7b..322b5ac 100644
> > --- a/hw/i386/intel_iommu.c
> > +++ b/hw/i386/intel_iommu.c
> > @@ -44,6 +44,10 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) | VTD_DBGBIT(CSR);
> >  #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
> >  #endif
> >  
> > +static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu,
> > +                                   MSIMessage *origin,
> > +                                   MSIMessage *translated);
> > +
> >  static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
> >                              uint64_t wmask, uint64_t w1cmask)
> >  {
> 
> don't forward-declare static functions, just arrange them sensibly.

Will do.

> 
> > @@ -1964,12 +1968,70 @@ static const MemoryRegionOps vtd_mem_ops = {
> >      },
> >  };
> >  
> > +static uint64_t vtd_mem_ir_read(void *opaque, hwaddr addr, unsigned size)
> > +{
> > +    uint64_t data = 0;
> > +
> > +    addr += VTD_INTERRUPT_ADDR_FIRST;
> > +
> > +    VTD_DPRINTF(IR, "read mem_ir addr 0x%"PRIx64 " size %u",
> > +                addr, size);
> > +
> > +    if (dma_memory_read(&address_space_memory, addr, &data, size)) {
> > +        VTD_DPRINTF(GENERAL, "error: fail to access 0x%"PRIx64, addr);
> > +        return (uint32_t) -1;
> > +    }
> > +
> > +    return data;
> > +}
> > +
> > +static void vtd_mem_ir_write(void *opaque, hwaddr addr,
> > +                             uint64_t value, unsigned size)
> > +{
> > +    int ret = 0;
> > +    MSIMessage from = {0}, to = {0};
> > +
> > +    from.address = (uint64_t) addr + VTD_INTERRUPT_ADDR_FIRST;
> > +    from.data = (uint32_t) value;
> > +
> > +    ret = vtd_interrupt_remap_msi(opaque, &from, &to);
> > +    if (ret) {
> > +        /* TODO: report error */
> > +        VTD_DPRINTF(GENERAL, "int remap fail for addr 0x%"PRIx64
> > +                    " data 0x%"PRIx32, from.address, from.data);
> > +        /* Drop this interrupt */
> > +        return;
> > +    }
> > +
> > +    VTD_DPRINTF(IR, "delivering MSI 0x%"PRIx64":0x%"PRIx32,
> > +                to.address, to.data);
> > +
> > +    if (dma_memory_write(&address_space_memory, to.address,
> > +                         &to.data, size)) {
> > +        VTD_DPRINTF(GENERAL, "error: fail to write 0x%"PRIx64
> > +                    " value 0x%"PRIx32, to.address, to.data);
> > +    }
> > +}
> > +
> > +static const MemoryRegionOps vtd_mem_ir_ops = {
> > +    .read = vtd_mem_ir_read,
> > +    .write = vtd_mem_ir_write,
> > +    .endianness = DEVICE_LITTLE_ENDIAN,
> > +    .impl = {
> > +        .min_access_size = 4,
> > +        .max_access_size = 4,
> > +    },
> > +    .valid = {
> > +        .min_access_size = 4,
> > +        .max_access_size = 4,
> > +    },
> > +};
> > +
> >  static Property vtd_properties[] = {
> >      DEFINE_PROP_UINT32("version", IntelIOMMUState, version, 0),
> >      DEFINE_PROP_END_OF_LIST(),
> >  };
> >  
> > -
> >  VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
> >  {
> >      uintptr_t key = (uintptr_t)bus;
> > @@ -1995,6 +2057,11 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
> >          vtd_dev_as->context_cache_entry.context_cache_gen = 0;
> >          memory_region_init_iommu(&vtd_dev_as->iommu, OBJECT(s),
> >                                   &s->iommu_ops, "intel_iommu", UINT64_MAX);
> > +        memory_region_init_io(&vtd_dev_as->iommu_ir, OBJECT(s),
> > +                              &vtd_mem_ir_ops, s, "intel_iommu_ir",
> > +                              VTD_INTERRUPT_ADDR_SIZE);
> > +        memory_region_add_subregion(&vtd_dev_as->iommu, VTD_INTERRUPT_ADDR_FIRST,
> > +                                    &vtd_dev_as->iommu_ir);
> >          address_space_init(&vtd_dev_as->as,
> >                             &vtd_dev_as->iommu, "intel_iommu");
> >      }
> > @@ -2075,6 +2142,7 @@ static int vtd_remap_irq_get(IntelIOMMUState *iommu, uint16_t index, VTDIrq *irq
> >      irq->dest = (irte.dest_id & VTD_IR_APIC_DEST_MASK) >> \
> >          VTD_IR_APIC_DEST_SHIFT;
> >      irq->dest_mode = irte.dest_mode;
> > +    irq->redir_hint = irte.redir_hint;
> >  
> >      VTD_DPRINTF(IR, "remapping interrupt index %d: trig:%u,vec:%u,"
> >                  "deliver:%u,dest:%u,dest_mode:%u", index,
> > @@ -2141,6 +2209,108 @@ int vtd_interrupt_remap_ioapic(IntelIOMMUState *iommu,
> >      return 0;
> >  }
> >  
> > +/* Generate one MSI message from VTDIrq info */
> > +static void vtd_generate_msi_message(VTDIrq *irq, MSIMessage *msg_out)
> > +{
> > +    VTD_MSIMessage msg;
> > +
> > +    bzero(&msg, sizeof(msg));
> > +
> 
> Just msg = {} will do it.

Ok.

> 
> > +    /* Generate address bits */
> > +    msg.dest_mode = irq->dest_mode;
> > +    msg.redir_hint = irq->redir_hint;
> > +    msg.dest = irq->dest;
> > +    msg.__addr_head = 0xfee;
> > +    /* Keep this from original MSI address bits */
> > +    msg.__not_used = irq->msi_addr_last_bits;
> > +
> > +    /* Generate data bits */
> > +    msg.vector = irq->vector;
> > +    msg.delivery_mode = irq->delivery_mode;
> > +    /* FIXME: assert always for level mode interrupt */
> 
> what does this mean?

I will remove above line.

Just to confirm: for level-triggered interrupt, we will always set
msg.level to 1, right?

> 
> > +    msg.level = 1;
> > +    msg.trigger_mode = irq->trigger_mode;
> > +
> > +    msg_out->address = msg.msi_addr;
> > +    msg_out->data = msg.msi_data;
> > +}
> > +
> > +/* Interrupt remapping for MSI/MSI-X entry */
> > +static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu,
> > +                                   MSIMessage *origin,
> > +                                   MSIMessage *translated)
> > +{
> > +    int ret = 0;
> > +    VTD_IR_MSIAddress addr;
> > +    uint16_t index = 0;
> > +    VTDIrq irq = {0};
> > +
> > +    assert(iommu && origin && translated);
> > +
> > +    if (!iommu->intr_enabled) {
> > +        memcpy(translated, origin, sizeof(*origin));
> > +        return 0;
> > +    }
> > +
> > +    if (origin->data) {
> > +        VTD_DPRINTF(GENERAL, "error: MSI data bits non-zero for "
> > +                    "interrupt remappable entry: 0x%"PRIx32,
> > +                    origin->data);
> > +        return -VTD_FR_IR_REQ_RSVD;
> > +    }
> > +
> > +    if (origin->address & MSI_ADDR_HI_MASK) {
> > +        VTD_DPRINTF(GENERAL, "error: MSI addr high 32 bits nonzero"
> > +                    " during interrupt remapping: 0x%"PRIx32,
> > +                    (uint32_t)((origin->address & MSI_ADDR_HI_MASK) >> \
> > +                    MSI_ADDR_HI_SHIFT));
> > +        return -VTD_FR_IR_REQ_RSVD;
> > +    }
> > +
> > +    addr.data = origin->address & MSI_ADDR_LO_MASK;
> > +    if (addr.__head != 0xfee) {
> > +        VTD_DPRINTF(GENERAL, "error: MSI addr low 32 bits invalid: "
> > +                    "0x%"PRIx32, addr.data);
> > +        return -VTD_FR_IR_REQ_RSVD;
> > +    }
> > +
> > +    if (addr.sub_valid != 1) {
> > +        VTD_DPRINTF(GENERAL, "error: SHV not set for MSI addr: "
> > +                    "0x%"PRIx32, addr.data);
> > +        return -VTD_FR_IR_REQ_RSVD;
> > +    }
> > +
> > +    /* TODO: Currently we still do not support compatible mode */
> > +    if (addr.int_mode != VTD_IR_INT_FORMAT_REMAP) {
> > +        VTD_DPRINTF(GENERAL, "error: trying to remap MSI entry"
> > +                    " with compatible format: 0x%"PRIx32,
> > +                    addr.data);
> > +        return -VTD_FR_IR_REQ_COMPAT;
> > +    }
> > +
> > +    index = addr.index_h << 15 | addr.index_l;
> > +
> > +    ret = vtd_remap_irq_get(iommu, index, &irq);
> > +    if (ret) {
> > +        return ret;
> > +    }
> > +
> > +    /*
> > +     * We'd better keep the last two bits, assuming that guest OS
> > +     * might modify it. Keep it does not hurt after all.
> > +     */
> > +    irq.msi_addr_last_bits = addr.__not_care;
> > +
> > +    /* Translate VTDIrq to MSI message */
> > +    vtd_generate_msi_message(&irq, translated);
> > +
> > +    VTD_DPRINTF(IR, "mapping MSI 0x%"PRIx64":0x%"PRIx32 " -> "
> > +                "0x%"PRIx64":0x%"PRIx32, origin->address, origin->data,
> > +                translated->address, translated->data);
> > +
> > +    return 0;
> > +}
> > +
> >  /* Do the initialization. It will also be called when reset, so pay
> >   * attention when adding new initialization stuff.
> >   */
> > diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
> > index 2a9987f..e1a08cb 100644
> > --- a/hw/i386/intel_iommu_internal.h
> > +++ b/hw/i386/intel_iommu_internal.h
> > @@ -110,6 +110,8 @@
> >  /* Interrupt Address Range */
> >  #define VTD_INTERRUPT_ADDR_FIRST    0xfee00000ULL
> >  #define VTD_INTERRUPT_ADDR_LAST     0xfeefffffULL
> > +#define VTD_INTERRUPT_ADDR_SIZE     (VTD_INTERRUPT_ADDR_LAST - \
> > +                                     VTD_INTERRUPT_ADDR_FIRST + 1)
> >  
> >  /* The shift of source_id in the key of IOTLB hash table */
> >  #define VTD_IOTLB_SID_SHIFT         36
> > diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
> > index 6a79207..9297eba 100644
> > --- a/include/hw/i386/intel_iommu.h
> > +++ b/include/hw/i386/intel_iommu.h
> > @@ -24,6 +24,7 @@
> >  #include "hw/qdev.h"
> >  #include "sysemu/dma.h"
> >  #include "hw/i386/ioapic.h"
> > +#include "hw/pci/msi.h"
> >  
> >  #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
> >  #define INTEL_IOMMU_DEVICE(obj) \
> > @@ -57,6 +58,7 @@ typedef union VTD_IRTE VTD_IRTE;
> >  typedef union VTD_IR_IOAPICEntry VTD_IR_IOAPICEntry;
> >  typedef union VTD_IR_MSIAddress VTD_IR_MSIAddress;
> >  typedef struct VTDIrq VTDIrq;
> > +typedef struct VTD_MSIMessage VTD_MSIMessage;
> >  
> >  /* Context-Entry */
> >  struct VTDContextEntry {
> > @@ -77,6 +79,7 @@ struct VTDAddressSpace {
> >      uint8_t devfn;
> >      AddressSpace as;
> >      MemoryRegion iommu;
> > +    MemoryRegion iommu_ir;      /* Interrupt region: 0xfeeXXXXX */
> >      IntelIOMMUState *iommu_state;
> >      VTDContextCacheEntry context_cache_entry;
> >  };
> > @@ -154,11 +157,42 @@ union VTD_IR_MSIAddress {
> >  
> >  /* Generic IRQ entry information */
> >  struct VTDIrq {
> > +    /* Used by both IOAPIC/MSI interrupt remapping */
> >      uint8_t trigger_mode;
> >      uint8_t vector;
> >      uint8_t delivery_mode;
> >      uint32_t dest;
> >      uint8_t dest_mode;
> > +
> > +    /* only used by MSI interrupt remapping */
> > +    uint8_t redir_hint;
> > +    uint8_t msi_addr_last_bits;
> > +};
> > +
> > +struct VTD_MSIMessage {
> > +    union {
> > +        struct {
> > +            uint16_t __not_used:2;
> > +            uint16_t dest_mode:1;
> > +            uint16_t redir_hint:1;
> > +            uint16_t __reserved:8;
> > +            uint16_t dest:8;
> > +            uint16_t __addr_head:12; /* 0xfee */
> > +            uint32_t __addr_hi:32;
> > +        } QEMU_PACKED;
> > +        uint64_t msi_addr;
> > +    };
> > +    union {
> > +        struct {
> > +            uint16_t vector:8;
> > +            uint16_t delivery_mode:3;
> > +            uint16_t __resved:3;
> > +            uint16_t level:1;
> > +            uint16_t trigger_mode:1;
> > +            uint16_t __resved1:16;
> > +        } QEMU_PACKED;
> > +        uint32_t msi_data;
> > +    };
> >  };
> >  
> >  /* When IR is enabled, all MSI/MSI-X data bits should be zero */
> > diff --git a/include/hw/pci/msi.h b/include/hw/pci/msi.h
> > index 8124908..8e94fca 100644
> > --- a/include/hw/pci/msi.h
> > +++ b/include/hw/pci/msi.h
> > @@ -24,6 +24,10 @@
> >  #include "qemu-common.h"
> >  #include "hw/pci/pci.h"
> >  
> > +#define  MSI_ADDR_HI_MASK        (0xffffffff00000000ULL)
> > +#define  MSI_ADDR_HI_SHIFT       (32)
> > +#define  MSI_ADDR_LO_MASK        (0x00000000ffffffffULL)
> > +
> 
> I don't think these are needed in msi.h, rename them with
> vtd prefix and put them where they are used.

Ok.

Thanks.

-- peterx
diff mbox

Patch

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 1dcdc7b..322b5ac 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -44,6 +44,10 @@  static int vtd_dbgflags = VTD_DBGBIT(GENERAL) | VTD_DBGBIT(CSR);
 #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
 #endif
 
+static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu,
+                                   MSIMessage *origin,
+                                   MSIMessage *translated);
+
 static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
                             uint64_t wmask, uint64_t w1cmask)
 {
@@ -1964,12 +1968,70 @@  static const MemoryRegionOps vtd_mem_ops = {
     },
 };
 
+static uint64_t vtd_mem_ir_read(void *opaque, hwaddr addr, unsigned size)
+{
+    uint64_t data = 0;
+
+    addr += VTD_INTERRUPT_ADDR_FIRST;
+
+    VTD_DPRINTF(IR, "read mem_ir addr 0x%"PRIx64 " size %u",
+                addr, size);
+
+    if (dma_memory_read(&address_space_memory, addr, &data, size)) {
+        VTD_DPRINTF(GENERAL, "error: fail to access 0x%"PRIx64, addr);
+        return (uint32_t) -1;
+    }
+
+    return data;
+}
+
+static void vtd_mem_ir_write(void *opaque, hwaddr addr,
+                             uint64_t value, unsigned size)
+{
+    int ret = 0;
+    MSIMessage from = {0}, to = {0};
+
+    from.address = (uint64_t) addr + VTD_INTERRUPT_ADDR_FIRST;
+    from.data = (uint32_t) value;
+
+    ret = vtd_interrupt_remap_msi(opaque, &from, &to);
+    if (ret) {
+        /* TODO: report error */
+        VTD_DPRINTF(GENERAL, "int remap fail for addr 0x%"PRIx64
+                    " data 0x%"PRIx32, from.address, from.data);
+        /* Drop this interrupt */
+        return;
+    }
+
+    VTD_DPRINTF(IR, "delivering MSI 0x%"PRIx64":0x%"PRIx32,
+                to.address, to.data);
+
+    if (dma_memory_write(&address_space_memory, to.address,
+                         &to.data, size)) {
+        VTD_DPRINTF(GENERAL, "error: fail to write 0x%"PRIx64
+                    " value 0x%"PRIx32, to.address, to.data);
+    }
+}
+
+static const MemoryRegionOps vtd_mem_ir_ops = {
+    .read = vtd_mem_ir_read,
+    .write = vtd_mem_ir_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+    .impl = {
+        .min_access_size = 4,
+        .max_access_size = 4,
+    },
+    .valid = {
+        .min_access_size = 4,
+        .max_access_size = 4,
+    },
+};
+
 static Property vtd_properties[] = {
     DEFINE_PROP_UINT32("version", IntelIOMMUState, version, 0),
     DEFINE_PROP_END_OF_LIST(),
 };
 
-
 VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
 {
     uintptr_t key = (uintptr_t)bus;
@@ -1995,6 +2057,11 @@  VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
         vtd_dev_as->context_cache_entry.context_cache_gen = 0;
         memory_region_init_iommu(&vtd_dev_as->iommu, OBJECT(s),
                                  &s->iommu_ops, "intel_iommu", UINT64_MAX);
+        memory_region_init_io(&vtd_dev_as->iommu_ir, OBJECT(s),
+                              &vtd_mem_ir_ops, s, "intel_iommu_ir",
+                              VTD_INTERRUPT_ADDR_SIZE);
+        memory_region_add_subregion(&vtd_dev_as->iommu, VTD_INTERRUPT_ADDR_FIRST,
+                                    &vtd_dev_as->iommu_ir);
         address_space_init(&vtd_dev_as->as,
                            &vtd_dev_as->iommu, "intel_iommu");
     }
@@ -2075,6 +2142,7 @@  static int vtd_remap_irq_get(IntelIOMMUState *iommu, uint16_t index, VTDIrq *irq
     irq->dest = (irte.dest_id & VTD_IR_APIC_DEST_MASK) >> \
         VTD_IR_APIC_DEST_SHIFT;
     irq->dest_mode = irte.dest_mode;
+    irq->redir_hint = irte.redir_hint;
 
     VTD_DPRINTF(IR, "remapping interrupt index %d: trig:%u,vec:%u,"
                 "deliver:%u,dest:%u,dest_mode:%u", index,
@@ -2141,6 +2209,108 @@  int vtd_interrupt_remap_ioapic(IntelIOMMUState *iommu,
     return 0;
 }
 
+/* Generate one MSI message from VTDIrq info */
+static void vtd_generate_msi_message(VTDIrq *irq, MSIMessage *msg_out)
+{
+    VTD_MSIMessage msg;
+
+    bzero(&msg, sizeof(msg));
+
+    /* Generate address bits */
+    msg.dest_mode = irq->dest_mode;
+    msg.redir_hint = irq->redir_hint;
+    msg.dest = irq->dest;
+    msg.__addr_head = 0xfee;
+    /* Keep this from original MSI address bits */
+    msg.__not_used = irq->msi_addr_last_bits;
+
+    /* Generate data bits */
+    msg.vector = irq->vector;
+    msg.delivery_mode = irq->delivery_mode;
+    /* FIXME: assert always for level mode interrupt */
+    msg.level = 1;
+    msg.trigger_mode = irq->trigger_mode;
+
+    msg_out->address = msg.msi_addr;
+    msg_out->data = msg.msi_data;
+}
+
+/* Interrupt remapping for MSI/MSI-X entry */
+static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu,
+                                   MSIMessage *origin,
+                                   MSIMessage *translated)
+{
+    int ret = 0;
+    VTD_IR_MSIAddress addr;
+    uint16_t index = 0;
+    VTDIrq irq = {0};
+
+    assert(iommu && origin && translated);
+
+    if (!iommu->intr_enabled) {
+        memcpy(translated, origin, sizeof(*origin));
+        return 0;
+    }
+
+    if (origin->data) {
+        VTD_DPRINTF(GENERAL, "error: MSI data bits non-zero for "
+                    "interrupt remappable entry: 0x%"PRIx32,
+                    origin->data);
+        return -VTD_FR_IR_REQ_RSVD;
+    }
+
+    if (origin->address & MSI_ADDR_HI_MASK) {
+        VTD_DPRINTF(GENERAL, "error: MSI addr high 32 bits nonzero"
+                    " during interrupt remapping: 0x%"PRIx32,
+                    (uint32_t)((origin->address & MSI_ADDR_HI_MASK) >> \
+                    MSI_ADDR_HI_SHIFT));
+        return -VTD_FR_IR_REQ_RSVD;
+    }
+
+    addr.data = origin->address & MSI_ADDR_LO_MASK;
+    if (addr.__head != 0xfee) {
+        VTD_DPRINTF(GENERAL, "error: MSI addr low 32 bits invalid: "
+                    "0x%"PRIx32, addr.data);
+        return -VTD_FR_IR_REQ_RSVD;
+    }
+
+    if (addr.sub_valid != 1) {
+        VTD_DPRINTF(GENERAL, "error: SHV not set for MSI addr: "
+                    "0x%"PRIx32, addr.data);
+        return -VTD_FR_IR_REQ_RSVD;
+    }
+
+    /* TODO: Currently we still do not support compatible mode */
+    if (addr.int_mode != VTD_IR_INT_FORMAT_REMAP) {
+        VTD_DPRINTF(GENERAL, "error: trying to remap MSI entry"
+                    " with compatible format: 0x%"PRIx32,
+                    addr.data);
+        return -VTD_FR_IR_REQ_COMPAT;
+    }
+
+    index = addr.index_h << 15 | addr.index_l;
+
+    ret = vtd_remap_irq_get(iommu, index, &irq);
+    if (ret) {
+        return ret;
+    }
+
+    /*
+     * We'd better keep the last two bits, assuming that guest OS
+     * might modify it. Keep it does not hurt after all.
+     */
+    irq.msi_addr_last_bits = addr.__not_care;
+
+    /* Translate VTDIrq to MSI message */
+    vtd_generate_msi_message(&irq, translated);
+
+    VTD_DPRINTF(IR, "mapping MSI 0x%"PRIx64":0x%"PRIx32 " -> "
+                "0x%"PRIx64":0x%"PRIx32, origin->address, origin->data,
+                translated->address, translated->data);
+
+    return 0;
+}
+
 /* Do the initialization. It will also be called when reset, so pay
  * attention when adding new initialization stuff.
  */
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 2a9987f..e1a08cb 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -110,6 +110,8 @@ 
 /* Interrupt Address Range */
 #define VTD_INTERRUPT_ADDR_FIRST    0xfee00000ULL
 #define VTD_INTERRUPT_ADDR_LAST     0xfeefffffULL
+#define VTD_INTERRUPT_ADDR_SIZE     (VTD_INTERRUPT_ADDR_LAST - \
+                                     VTD_INTERRUPT_ADDR_FIRST + 1)
 
 /* The shift of source_id in the key of IOTLB hash table */
 #define VTD_IOTLB_SID_SHIFT         36
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 6a79207..9297eba 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -24,6 +24,7 @@ 
 #include "hw/qdev.h"
 #include "sysemu/dma.h"
 #include "hw/i386/ioapic.h"
+#include "hw/pci/msi.h"
 
 #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
 #define INTEL_IOMMU_DEVICE(obj) \
@@ -57,6 +58,7 @@  typedef union VTD_IRTE VTD_IRTE;
 typedef union VTD_IR_IOAPICEntry VTD_IR_IOAPICEntry;
 typedef union VTD_IR_MSIAddress VTD_IR_MSIAddress;
 typedef struct VTDIrq VTDIrq;
+typedef struct VTD_MSIMessage VTD_MSIMessage;
 
 /* Context-Entry */
 struct VTDContextEntry {
@@ -77,6 +79,7 @@  struct VTDAddressSpace {
     uint8_t devfn;
     AddressSpace as;
     MemoryRegion iommu;
+    MemoryRegion iommu_ir;      /* Interrupt region: 0xfeeXXXXX */
     IntelIOMMUState *iommu_state;
     VTDContextCacheEntry context_cache_entry;
 };
@@ -154,11 +157,42 @@  union VTD_IR_MSIAddress {
 
 /* Generic IRQ entry information */
 struct VTDIrq {
+    /* Used by both IOAPIC/MSI interrupt remapping */
     uint8_t trigger_mode;
     uint8_t vector;
     uint8_t delivery_mode;
     uint32_t dest;
     uint8_t dest_mode;
+
+    /* only used by MSI interrupt remapping */
+    uint8_t redir_hint;
+    uint8_t msi_addr_last_bits;
+};
+
+struct VTD_MSIMessage {
+    union {
+        struct {
+            uint16_t __not_used:2;
+            uint16_t dest_mode:1;
+            uint16_t redir_hint:1;
+            uint16_t __reserved:8;
+            uint16_t dest:8;
+            uint16_t __addr_head:12; /* 0xfee */
+            uint32_t __addr_hi:32;
+        } QEMU_PACKED;
+        uint64_t msi_addr;
+    };
+    union {
+        struct {
+            uint16_t vector:8;
+            uint16_t delivery_mode:3;
+            uint16_t __resved:3;
+            uint16_t level:1;
+            uint16_t trigger_mode:1;
+            uint16_t __resved1:16;
+        } QEMU_PACKED;
+        uint32_t msi_data;
+    };
 };
 
 /* When IR is enabled, all MSI/MSI-X data bits should be zero */
diff --git a/include/hw/pci/msi.h b/include/hw/pci/msi.h
index 8124908..8e94fca 100644
--- a/include/hw/pci/msi.h
+++ b/include/hw/pci/msi.h
@@ -24,6 +24,10 @@ 
 #include "qemu-common.h"
 #include "hw/pci/pci.h"
 
+#define  MSI_ADDR_HI_MASK        (0xffffffff00000000ULL)
+#define  MSI_ADDR_HI_SHIFT       (32)
+#define  MSI_ADDR_LO_MASK        (0x00000000ffffffffULL)
+
 struct MSIMessage {
     uint64_t address;
     uint32_t data;