diff mbox

[v2,7/7] intel_iommu: support passthrough (PT)

Message ID 1492428730-13438-8-git-send-email-peterx@redhat.com
State New
Headers show

Commit Message

Peter Xu April 17, 2017, 11:32 a.m. UTC
Signed-off-by: Peter Xu <peterx@redhat.com>
---
 hw/i386/intel_iommu.c          | 109 +++++++++++++++++++++++++++++++++--------
 hw/i386/intel_iommu_internal.h |   1 +
 hw/i386/trace-events           |   1 +
 hw/i386/x86-iommu.c            |   1 +
 include/hw/i386/x86-iommu.h    |   1 +
 5 files changed, 93 insertions(+), 20 deletions(-)

Comments

Yi Liu April 18, 2017, 4:30 a.m. UTC | #1
> -----Original Message-----
> From: Qemu-devel [mailto:qemu-devel-bounces+yi.l.liu=intel.com@nongnu.org] On
> Behalf Of Peter Xu
> Sent: Monday, April 17, 2017 7:32 PM
> To: qemu-devel@nongnu.org
> Cc: Lan, Tianyu <tianyu.lan@intel.com>; Michael S . Tsirkin <mst@redhat.com>;
> Jason Wang <jasowang@redhat.com>; peterx@redhat.com; Marcel Apfelbaum
> <marcel@redhat.com>; David Gibson <david@gibson.dropbear.id.au>
> Subject: [Qemu-devel] [PATCH v2 7/7] intel_iommu: support passthrough (PT)
> 
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  hw/i386/intel_iommu.c          | 109 +++++++++++++++++++++++++++++++++--------
>  hw/i386/intel_iommu_internal.h |   1 +
>  hw/i386/trace-events           |   1 +
>  hw/i386/x86-iommu.c            |   1 +
>  include/hw/i386/x86-iommu.h    |   1 +
>  5 files changed, 93 insertions(+), 20 deletions(-)
> 
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index
> 05ae631..deb2007 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -872,7 +872,7 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s,
> uint8_t bus_num,
>      } else {
>          switch (vtd_ce_get_type(ce)) {
>          case VTD_CONTEXT_TT_MULTI_LEVEL:
> -            /* fall through */
> +        case VTD_CONTEXT_TT_PASS_THROUGH:
>          case VTD_CONTEXT_TT_DEV_IOTLB:
>              break;
>          default:
> @@ -883,6 +883,73 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s,
> uint8_t bus_num,
>      return 0;
>  }
> 
> +/* Fetch translation type for specific device. Returns <0 if error
> + * happens, otherwise return the shifted type to check against
> + * VTD_CONTEXT_TT_*. */
> +static int vtd_dev_get_trans_type(VTDAddressSpace *as) {
> +    IntelIOMMUState *s;
> +    VTDContextEntry ce;
> +    int ret;
> +
> +    s = as->iommu_state;
> +
> +    ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus),
> +                                   as->devfn, &ce);
> +    if (ret) {
> +        return ret;
> +    }
> +
> +    return vtd_ce_get_type(&ce);
> +}
> +
> +static bool vtd_dev_pt_enabled(VTDAddressSpace *as) {
> +    int ret;
> +
> +    assert(as);
> +
> +    ret = vtd_dev_get_trans_type(as);
> +    if (ret < 0) {
> +        /*
> +         * Possibly failed to parse the context entry for some reason
> +         * (e.g., during init, or any guest configuration errors on
> +         * context entries). We should assume PT not enabled for
> +         * safety.
> +         */
> +        return false;
> +    }
> +
> +    return ret == VTD_CONTEXT_TT_PASS_THROUGH; }
> +
> +static void vtd_switch_address_space(VTDAddressSpace *as) {
> +    bool use_iommu;
> +
> +    assert(as);
> +
> +    use_iommu = as->iommu_state->dmar_enabled;
> +    if (use_iommu) {
> +        /* Further checks per-device configuration */
> +        use_iommu &= !vtd_dev_pt_enabled(as);
> +    }
> +
> +    trace_vtd_switch_address_space(pci_bus_num(as->bus),
> +                                   VTD_PCI_SLOT(as->devfn),
> +                                   VTD_PCI_FUNC(as->devfn),
> +                                   use_iommu);

Hi Peter,

Skip address space switching is a good idea to support Passthru mode.
However, without the address space, the vfio notifier would not be
registered, thus vIOMMU emulator has no way to connect to host. It is
no harm if there is only map/unmap notifier. But if we have more notifiers
other than map/unmap, it may be a problem.

I think we need to reconsider it here. 

Regards,
Yi L
> +    /* Turn off first then on the other */
> +    if (use_iommu) {
> +        memory_region_set_enabled(&as->sys_alias, false);
> +        memory_region_set_enabled(&as->iommu, true);
> +    } else {
> +        memory_region_set_enabled(&as->iommu, false);
> +        memory_region_set_enabled(&as->sys_alias, true);
> +    }
> +}
> +
>  static inline uint16_t vtd_make_source_id(uint8_t bus_num, uint8_t devfn)  {
>      return ((bus_num & 0xffUL) << 8) | (devfn & 0xffUL); @@ -991,6 +1058,18 @@
> static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
>          cc_entry->context_cache_gen = s->context_cache_gen;
>      }
> 
> +    /*
> +     * We don't need to translate for pass-through context entries.
> +     * Also, let's ignore IOTLB caching as well for PT devices.
> +     */
> +    if (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH) {
> +        entry->translated_addr = entry->iova;
> +        entry->addr_mask = VTD_PAGE_SIZE - 1;
> +        entry->perm = IOMMU_RW;
> +        trace_vtd_translate_pt(source_id, entry->iova);
> +        return;
> +    }
> +
>      ret_fr = vtd_iova_to_slpte(&ce, addr, is_write, &slpte, &level,
>                                 &reads, &writes);
>      if (ret_fr) {
> @@ -1135,6 +1214,11 @@ static void
> vtd_context_device_invalidate(IntelIOMMUState *s,
>                                               VTD_PCI_FUNC(devfn_it));
>                  vtd_as->context_cache_entry.context_cache_gen = 0;
>                  /*
> +                 * Do switch address space when needed, in case if the
> +                 * device passthrough bit is switched.
> +                 */
> +                vtd_switch_address_space(vtd_as);
> +                /*
>                   * So a device is moving out of (or moving into) a
>                   * domain, a replay() suites here to notify all the
>                   * IOMMU_NOTIFIER_MAP registers about this change.
> @@ -1366,25 +1450,6 @@ static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s)
>      vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS);  }
> 
> -static void vtd_switch_address_space(VTDAddressSpace *as) -{
> -    assert(as);
> -
> -    trace_vtd_switch_address_space(pci_bus_num(as->bus),
> -                                   VTD_PCI_SLOT(as->devfn),
> -                                   VTD_PCI_FUNC(as->devfn),
> -                                   as->iommu_state->dmar_enabled);
> -
> -    /* Turn off first then on the other */
> -    if (as->iommu_state->dmar_enabled) {
> -        memory_region_set_enabled(&as->sys_alias, false);
> -        memory_region_set_enabled(&as->iommu, true);
> -    } else {
> -        memory_region_set_enabled(&as->iommu, false);
> -        memory_region_set_enabled(&as->sys_alias, true);
> -    }
> -}
> -
>  static void vtd_switch_address_space_all(IntelIOMMUState *s)  {
>      GHashTableIter iter;
> @@ -2849,6 +2914,10 @@ static void vtd_init(IntelIOMMUState *s)
>          s->ecap |= VTD_ECAP_DT;
>      }
> 
> +    if (x86_iommu->pt_supported) {
> +        s->ecap |= VTD_ECAP_PT;
> +    }
> +
>      if (s->caching_mode) {
>          s->cap |= VTD_CAP_CM;
>      }
> diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h index
> 29d6707..0e73a65 100644
> --- a/hw/i386/intel_iommu_internal.h
> +++ b/hw/i386/intel_iommu_internal.h
> @@ -187,6 +187,7 @@
>  /* Interrupt Remapping support */
>  #define VTD_ECAP_IR                 (1ULL << 3)
>  #define VTD_ECAP_EIM                (1ULL << 4)
> +#define VTD_ECAP_PT                 (1ULL << 6)
>  #define VTD_ECAP_MHMV               (15ULL << 20)
> 
>  /* CAP_REG */
> diff --git a/hw/i386/trace-events b/hw/i386/trace-events index 04a6980..867ad0b
> 100644
> --- a/hw/i386/trace-events
> +++ b/hw/i386/trace-events
> @@ -38,6 +38,7 @@ vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page
> walk skip iova 0x%"P  vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next)
> "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set"
>  vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on)
> "Device %02x:%02x.%x switching address space (iommu enabled=%d)"
>  vtd_as_unmap_whole(uint8_t bus, uint8_t slot, uint8_t fn, uint64_t iova, uint64_t
> size) "Device %02x:%02x.%x start 0x%"PRIx64" size 0x%"PRIx64
> +vtd_translate_pt(uint16_t sid, uint64_t addr) "source id 0x%"PRIu16",
> +iova 0x%"PRIx64
> 
>  # hw/i386/amd_iommu.c
>  amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr
> 0x%"PRIx64" +  offset 0x%"PRIx32 diff --git a/hw/i386/x86-iommu.c b/hw/i386/x86-
> iommu.c index 02b8825..293caf8 100644
> --- a/hw/i386/x86-iommu.c
> +++ b/hw/i386/x86-iommu.c
> @@ -91,6 +91,7 @@ static void x86_iommu_realize(DeviceState *dev, Error **errp)
> static Property x86_iommu_properties[] = {
>      DEFINE_PROP_BOOL("intremap", X86IOMMUState, intr_supported, false),
>      DEFINE_PROP_BOOL("device-iotlb", X86IOMMUState, dt_supported, false),
> +    DEFINE_PROP_BOOL("pt", X86IOMMUState, pt_supported, true),
>      DEFINE_PROP_END_OF_LIST(),
>  };
> 
> diff --git a/include/hw/i386/x86-iommu.h b/include/hw/i386/x86-iommu.h index
> 361c07c..ef89c0c 100644
> --- a/include/hw/i386/x86-iommu.h
> +++ b/include/hw/i386/x86-iommu.h
> @@ -74,6 +74,7 @@ struct X86IOMMUState {
>      SysBusDevice busdev;
>      bool intr_supported;        /* Whether vIOMMU supports IR */
>      bool dt_supported;          /* Whether vIOMMU supports DT */
> +    bool pt_supported;          /* Whether vIOMMU supports pass-through */
>      IommuType type;             /* IOMMU type - AMD/Intel     */
>      QLIST_HEAD(, IEC_Notifier) iec_notifiers; /* IEC notify list */  };
> --
> 2.7.4
>
Peter Xu April 18, 2017, 4:54 a.m. UTC | #2
On Tue, Apr 18, 2017 at 04:30:40AM +0000, Liu, Yi L wrote:

[...]

> > +static void vtd_switch_address_space(VTDAddressSpace *as) {
> > +    bool use_iommu;
> > +
> > +    assert(as);
> > +
> > +    use_iommu = as->iommu_state->dmar_enabled;
> > +    if (use_iommu) {
> > +        /* Further checks per-device configuration */
> > +        use_iommu &= !vtd_dev_pt_enabled(as);
> > +    }
> > +
> > +    trace_vtd_switch_address_space(pci_bus_num(as->bus),
> > +                                   VTD_PCI_SLOT(as->devfn),
> > +                                   VTD_PCI_FUNC(as->devfn),
> > +                                   use_iommu);
> 
> Hi Peter,
> 
> Skip address space switching is a good idea to support Passthru mode.
> However, without the address space, the vfio notifier would not be
> registered, thus vIOMMU emulator has no way to connect to host. It is
> no harm if there is only map/unmap notifier. But if we have more notifiers
> other than map/unmap, it may be a problem.
> 
> I think we need to reconsider it here. 

For now I think as switching is good to us in general. Could I know
more context about this? Would it be okay to work on top of this in
the future?

Thanks,
Yi Liu April 18, 2017, 6:02 a.m. UTC | #3
> > Hi Peter,

> >

> > Skip address space switching is a good idea to support Passthru mode.

> > However, without the address space, the vfio notifier would not be

> > registered, thus vIOMMU emulator has no way to connect to host. It is

> > no harm if there is only map/unmap notifier. But if we have more

> > notifiers other than map/unmap, it may be a problem.

> >

> > I think we need to reconsider it here.

> 

> For now I think as switching is good to us in general. Could I know more context

> about this? Would it be okay to work on top of this in the future?

> 


Let me explain. For vSVM enabling, it needs to add new notifier to bind gPASID table
to host. If an assigned device uses SVM in guest, as designed guest IOMMU driver would
set "pt" for it. This is to make sure the host second-level page table stores GPA->HPA
mapping. So that pIOMMU can do nested translation to achieve gVA->GPA GPA->HPA 
mapping.

So the situation is if we want to keep GPA->HPA mapping, we should skip address space
switch, but the vfio listener would not know vIOMMU is added, so no vfio notifier would
be registered. But if we do the switch, the GPA->HPA mapping is unmapped. And need
another way to build the GPA->HPA mapping.

I think we may have two choice here. Pls let me know your ideas.

1. skip the switch for "pt" mode, find other way to register vfio notifiers. not sure if this
is workable. Just a quick thought.

2. do the switch, and rebuild GPA->HPA mapping if device use "pt" mode. For this option,
I also have two way to build the GPA->HPA mapping.
a) walk all the memory region sections in address_space_memory, and build the mapping.
address_space_memory.dispatch->map.sections, sections stores all the mapped sections.

b) let guest iommu driver mimics a 1:1 mapping for the devices use "pt" mode. in this way,
the GPA->HPA mapping could be setup by walking the guest SL page table. This is consistent
with your vIOVA replay solution.

Also I'm curious if Tianyu's fault report framework needs to register new notifiers.

Thanks,
Yi L
Peter Xu April 18, 2017, 7:27 a.m. UTC | #4
On Tue, Apr 18, 2017 at 06:02:44AM +0000, Liu, Yi L wrote:
> > > Hi Peter,
> > >
> > > Skip address space switching is a good idea to support Passthru mode.
> > > However, without the address space, the vfio notifier would not be
> > > registered, thus vIOMMU emulator has no way to connect to host. It is
> > > no harm if there is only map/unmap notifier. But if we have more
> > > notifiers other than map/unmap, it may be a problem.
> > >
> > > I think we need to reconsider it here.
> > 
> > For now I think as switching is good to us in general. Could I know more context
> > about this? Would it be okay to work on top of this in the future?
> > 
> 
> Let me explain. For vSVM enabling, it needs to add new notifier to bind gPASID table
> to host. If an assigned device uses SVM in guest, as designed guest IOMMU driver would
> set "pt" for it. This is to make sure the host second-level page table stores GPA->HPA
> mapping. So that pIOMMU can do nested translation to achieve gVA->GPA GPA->HPA 
> mapping.

That should mean that in the guest it will only enable first level
translation, while in the host it'll be nested (first + second)? Then
you should be trapping the guest extended context entry invalidation,
then pass the PASID table pointer downward to the host IOMMU, am I
correct?

> 
> So the situation is if we want to keep GPA->HPA mapping, we should skip address space
> switch, but the vfio listener would not know vIOMMU is added, so no vfio notifier would
> be registered. But if we do the switch, the GPA->HPA mapping is unmapped. And need
> another way to build the GPA->HPA mapping.

If my understanding above is correct, current IOMMU notifier may not
satisfy your need. If you see the current notify interface, it's
delivering IOMMUTLBEntry. I suspect it only suites for IOTLB
notifications, but not if you want to pass some pointer (one GPA)
downward somewhere.

> 
> I think we may have two choice here. Pls let me know your ideas.
> 
> 1. skip the switch for "pt" mode, find other way to register vfio notifiers. not sure if this
> is workable. Just a quick thought.
> 
> 2. do the switch, and rebuild GPA->HPA mapping if device use "pt" mode. For this option,
> I also have two way to build the GPA->HPA mapping.
> a) walk all the memory region sections in address_space_memory, and build the mapping.
> address_space_memory.dispatch->map.sections, sections stores all the mapped sections.
> 
> b) let guest iommu driver mimics a 1:1 mapping for the devices use "pt" mode. in this way,
> the GPA->HPA mapping could be setup by walking the guest SL page table. This is consistent
> with your vIOVA replay solution.

I'll prefer (1). Reason explained above.

> 
> Also I'm curious if Tianyu's fault report framework needs to register new notifiers.

For Tianyu's case, I feel like we need other kind of notifier as well,
but not the current IOTLB one. And, of course in this case it'll be in
an reverted direction, which is from device to vIOMMU.

(I am thinking whether it's a good time now to let any PCI device able
 to fetch its direct owner IOMMU object, then it can register anything
 it want onto that object maybe?)

Thanks,
Yi Liu April 18, 2017, 9:04 a.m. UTC | #5
> -----Original Message-----

> From: Peter Xu [mailto:peterx@redhat.com]

> Sent: Tuesday, April 18, 2017 3:27 PM

> To: Liu, Yi L <yi.l.liu@intel.com>

> Cc: qemu-devel@nongnu.org; Lan, Tianyu <tianyu.lan@intel.com>; Michael S .

> Tsirkin <mst@redhat.com>; Jason Wang <jasowang@redhat.com>; Marcel

> Apfelbaum <marcel@redhat.com>; David Gibson <david@gibson.dropbear.id.au>

> Subject: Re: [Qemu-devel] [PATCH v2 7/7] intel_iommu: support passthrough (PT)

> 

> On Tue, Apr 18, 2017 at 06:02:44AM +0000, Liu, Yi L wrote:

> > > > Hi Peter,

> > > >

> > > > Skip address space switching is a good idea to support Passthru mode.

> > > > However, without the address space, the vfio notifier would not be

> > > > registered, thus vIOMMU emulator has no way to connect to host. It

> > > > is no harm if there is only map/unmap notifier. But if we have

> > > > more notifiers other than map/unmap, it may be a problem.

> > > >

> > > > I think we need to reconsider it here.

> > >

> > > For now I think as switching is good to us in general. Could I know

> > > more context about this? Would it be okay to work on top of this in the future?

> > >

> >

> > Let me explain. For vSVM enabling, it needs to add new notifier to

> > bind gPASID table to host. If an assigned device uses SVM in guest, as

> > designed guest IOMMU driver would set "pt" for it. This is to make

> > sure the host second-level page table stores GPA->HPA mapping. So that

> > pIOMMU can do nested translation to achieve gVA->GPA GPA->HPA mapping.

> 

> That should mean that in the guest it will only enable first level translation, while in

> the host it'll be nested (first + second)? Then you should be trapping the guest

> extended context entry invalidation, then pass the PASID table pointer downward to

> the host IOMMU, am I correct?


exactly.

> 

> >

> > So the situation is if we want to keep GPA->HPA mapping, we should

> > skip address space switch, but the vfio listener would not know vIOMMU

> > is added, so no vfio notifier would be registered. But if we do the

> > switch, the GPA->HPA mapping is unmapped. And need another way to build the

> GPA->HPA mapping.

> 

> If my understanding above is correct, current IOMMU notifier may not satisfy your

> need. If you see the current notify interface, it's delivering IOMMUTLBEntry. I

> suspect it only suites for IOTLB notifications, but not if you want to pass some

> pointer (one GPA) downward somewhere.


Yeah, you got it more than absolutely. One of my patch is to modify the para to be
void * and let each notifiers to translate separately. I think it should be a reasonable
change.

Supposedly, I would send RFC for vSVM soon. We may talk more it at that thread.

> >

> > I think we may have two choice here. Pls let me know your ideas.

> >

> > 1. skip the switch for "pt" mode, find other way to register vfio

> > notifiers. not sure if this is workable. Just a quick thought.

> >

> > 2. do the switch, and rebuild GPA->HPA mapping if device use "pt"

> > mode. For this option, I also have two way to build the GPA->HPA mapping.

> > a) walk all the memory region sections in address_space_memory, and build the

> mapping.

> > address_space_memory.dispatch->map.sections, sections stores all the mapped

> sections.

> >

> > b) let guest iommu driver mimics a 1:1 mapping for the devices use

> > "pt" mode. in this way, the GPA->HPA mapping could be setup by walking

> > the guest SL page table. This is consistent with your vIOVA replay solution.

> 

> I'll prefer (1). Reason explained above.

> 

> >

> > Also I'm curious if Tianyu's fault report framework needs to register new notifiers.

> 

> For Tianyu's case, I feel like we need other kind of notifier as well, but not the

> current IOTLB one. And, of course in this case it'll be in an reverted direction, which

> is from device to vIOMMU.

> 

> (I am thinking whether it's a good time now to let any PCI device able  to fetch its

> direct owner IOMMU object, then it can register anything  it want onto that object

> maybe?)

> 

Hmmm, let's see Tianyu's comment as he's driving fault report. Let's keep in touch on
this Passthru-Mode enabling.

Thanks,
Yi L
Yi Liu April 19, 2017, 7:13 a.m. UTC | #6
Peter,

Besides the gPA->hPA mapping, pt mode enabling requires some sanity check on
s->pt_supported. See comments inline.

> -----Original Message-----
> From: Qemu-devel [mailto:qemu-devel-bounces+yi.l.liu=intel.com@nongnu.org] On
> Behalf Of Peter Xu
> Sent: Monday, April 17, 2017 7:32 PM
> To: qemu-devel@nongnu.org
> Cc: Lan, Tianyu <tianyu.lan@intel.com>; Michael S . Tsirkin <mst@redhat.com>;
> Jason Wang <jasowang@redhat.com>; peterx@redhat.com; Marcel Apfelbaum
> <marcel@redhat.com>; David Gibson <david@gibson.dropbear.id.au>
> Subject: [Qemu-devel] [PATCH v2 7/7] intel_iommu: support passthrough (PT)
> 
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  hw/i386/intel_iommu.c          | 109 +++++++++++++++++++++++++++++++++--------
>  hw/i386/intel_iommu_internal.h |   1 +
>  hw/i386/trace-events           |   1 +
>  hw/i386/x86-iommu.c            |   1 +
>  include/hw/i386/x86-iommu.h    |   1 +
>  5 files changed, 93 insertions(+), 20 deletions(-)
> 
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index
> 05ae631..deb2007 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -872,7 +872,7 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s,
> uint8_t bus_num,
>      } else {
>          switch (vtd_ce_get_type(ce)) {
>          case VTD_CONTEXT_TT_MULTI_LEVEL:
> -            /* fall through */
> +        case VTD_CONTEXT_TT_PASS_THROUGH:

Passthru type is a reserved type if ecap.PT=0, so add a check here. If s->pt_supported
is false, report this translation type as unsupported and report an invalid context 
programming to guest.

>          case VTD_CONTEXT_TT_DEV_IOTLB:
>              break;
>          default:
> @@ -883,6 +883,73 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s,
> uint8_t bus_num,
>      return 0;
>  }
> 
> +/* Fetch translation type for specific device. Returns <0 if error
> + * happens, otherwise return the shifted type to check against
> + * VTD_CONTEXT_TT_*. */
> +static int vtd_dev_get_trans_type(VTDAddressSpace *as) {
> +    IntelIOMMUState *s;
> +    VTDContextEntry ce;
> +    int ret;
> +
> +    s = as->iommu_state;
> +
> +    ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus),
> +                                   as->devfn, &ce);
> +    if (ret) {
> +        return ret;
> +    }
> +
> +    return vtd_ce_get_type(&ce);
> +}
> +
> +static bool vtd_dev_pt_enabled(VTDAddressSpace *as) {
> +    int ret;
> +
> +    assert(as);
> +
> +    ret = vtd_dev_get_trans_type(as);
> +    if (ret < 0) {
> +        /*
> +         * Possibly failed to parse the context entry for some reason
> +         * (e.g., during init, or any guest configuration errors on
> +         * context entries). We should assume PT not enabled for
> +         * safety.
> +         */
> +        return false;
> +    }
> +
> +    return ret == VTD_CONTEXT_TT_PASS_THROUGH; }
> +
> +static void vtd_switch_address_space(VTDAddressSpace *as) {
> +    bool use_iommu;
> +
> +    assert(as);
> +
> +    use_iommu = as->iommu_state->dmar_enabled;
> +    if (use_iommu) {

if s->pt_supported is false, no need to check the context.TT, even
context.TT=pt, then it should be an invalid context programming.

Regards,
Yi L
Lan Tianyu April 19, 2017, 7:27 a.m. UTC | #7
Hi All:
	Sorry for later response.
On 2017年04月18日 17:04, Liu, Yi L wrote:
>> -----Original Message-----
>> From: Peter Xu [mailto:peterx@redhat.com]
>> Sent: Tuesday, April 18, 2017 3:27 PM
>> To: Liu, Yi L <yi.l.liu@intel.com>
>> Cc: qemu-devel@nongnu.org; Lan, Tianyu <tianyu.lan@intel.com>; Michael S .
>> Tsirkin <mst@redhat.com>; Jason Wang <jasowang@redhat.com>; Marcel
>> Apfelbaum <marcel@redhat.com>; David Gibson <david@gibson.dropbear.id.au>
>> Subject: Re: [Qemu-devel] [PATCH v2 7/7] intel_iommu: support passthrough (PT)
>>
>> On Tue, Apr 18, 2017 at 06:02:44AM +0000, Liu, Yi L wrote:
>>>>> Hi Peter,
>>>>>
>>>>> Skip address space switching is a good idea to support Passthru mode.
>>>>> However, without the address space, the vfio notifier would not be
>>>>> registered, thus vIOMMU emulator has no way to connect to host. It
>>>>> is no harm if there is only map/unmap notifier. But if we have
>>>>> more notifiers other than map/unmap, it may be a problem.
>>>>>
>>>>> I think we need to reconsider it here.
>>>>
>>>> For now I think as switching is good to us in general. Could I know
>>>> more context about this? Would it be okay to work on top of this in the future?
>>>>
>>>
>>> Let me explain. For vSVM enabling, it needs to add new notifier to
>>> bind gPASID table to host. If an assigned device uses SVM in guest, as
>>> designed guest IOMMU driver would set "pt" for it. This is to make
>>> sure the host second-level page table stores GPA->HPA mapping. So that
>>> pIOMMU can do nested translation to achieve gVA->GPA GPA->HPA mapping.
>>
>> That should mean that in the guest it will only enable first level translation, while in
>> the host it'll be nested (first + second)? Then you should be trapping the guest
>> extended context entry invalidation, then pass the PASID table pointer downward to
>> the host IOMMU, am I correct?
> 
> exactly.
> 
>>
>>>
>>> So the situation is if we want to keep GPA->HPA mapping, we should
>>> skip address space switch, but the vfio listener would not know vIOMMU
>>> is added, so no vfio notifier would be registered. But if we do the
>>> switch, the GPA->HPA mapping is unmapped. And need another way to build the
>> GPA->HPA mapping.
>>
>> If my understanding above is correct, current IOMMU notifier may not satisfy your
>> need. If you see the current notify interface, it's delivering IOMMUTLBEntry. I
>> suspect it only suites for IOTLB notifications, but not if you want to pass some
>> pointer (one GPA) downward somewhere.
> 
> Yeah, you got it more than absolutely. One of my patch is to modify the para to be
> void * and let each notifiers to translate separately. I think it should be a reasonable
> change.
> 
> Supposedly, I would send RFC for vSVM soon. We may talk more it at that thread.
> 
>>>
>>> I think we may have two choice here. Pls let me know your ideas.
>>>
>>> 1. skip the switch for "pt" mode, find other way to register vfio
>>> notifiers. not sure if this is workable. Just a quick thought.
>>>
>>> 2. do the switch, and rebuild GPA->HPA mapping if device use "pt"
>>> mode. For this option, I also have two way to build the GPA->HPA mapping.
>>> a) walk all the memory region sections in address_space_memory, and build the
>> mapping.
>>> address_space_memory.dispatch->map.sections, sections stores all the mapped
>> sections.
>>>
>>> b) let guest iommu driver mimics a 1:1 mapping for the devices use
>>> "pt" mode. in this way, the GPA->HPA mapping could be setup by walking
>>> the guest SL page table. This is consistent with your vIOVA replay solution.
>>
>> I'll prefer (1). Reason explained above.
>>
>>>
>>> Also I'm curious if Tianyu's fault report framework needs to register new notifiers.
>>
>> For Tianyu's case, I feel like we need other kind of notifier as well, but not the
>> current IOTLB one. And, of course in this case it'll be in an reverted direction, which
>> is from device to vIOMMU.
>>
>> (I am thinking whether it's a good time now to let any PCI device able  to fetch its
>> direct owner IOMMU object, then it can register anything  it want onto that object
>> maybe?)
>>
> Hmmm, let's see Tianyu's comment as he's driving fault report. Let's keep in touch on
> this Passthru-Mode enabling.

In my previous RFC patchset of fault event reporting, I registered fault
notifier when there is a VFIO group attached to VFIO container and used
the address space to check whether vIOMMU is added. The address space is
returned by vtd_host_dma_iommu(). vtd_find_add_as() initializes device's
IOMMU memory region and put it into device's address space as root
memory region.
Does this make sense?

@@ -1103,6 +1132,14 @@ static int vfio_connect_container(VFIOGroup
*group, AddressSpace *as,
         goto listener_release_exit;
     }

+    if (memory_region_is_iommu(container->space->as->root)) {
+        if (vfio_set_iommu_fault_notifier(container)) {
+            error_setg_errno(errp, -ret,
+                "Fail to set IOMMU fault notifier");
+            goto listener_release_exit;
+        }
+    }
+
     container->initialized = true;
Peter Xu April 20, 2017, 3:04 a.m. UTC | #8
On Wed, Apr 19, 2017 at 03:27:52PM +0800, Lan Tianyu wrote:
> Hi All:
> 	Sorry for later response.
> On 2017年04月18日 17:04, Liu, Yi L wrote:
> >> -----Original Message-----
> >> From: Peter Xu [mailto:peterx@redhat.com]
> >> Sent: Tuesday, April 18, 2017 3:27 PM
> >> To: Liu, Yi L <yi.l.liu@intel.com>
> >> Cc: qemu-devel@nongnu.org; Lan, Tianyu <tianyu.lan@intel.com>; Michael S .
> >> Tsirkin <mst@redhat.com>; Jason Wang <jasowang@redhat.com>; Marcel
> >> Apfelbaum <marcel@redhat.com>; David Gibson <david@gibson.dropbear.id.au>
> >> Subject: Re: [Qemu-devel] [PATCH v2 7/7] intel_iommu: support passthrough (PT)
> >>
> >> On Tue, Apr 18, 2017 at 06:02:44AM +0000, Liu, Yi L wrote:
> >>>>> Hi Peter,
> >>>>>
> >>>>> Skip address space switching is a good idea to support Passthru mode.
> >>>>> However, without the address space, the vfio notifier would not be
> >>>>> registered, thus vIOMMU emulator has no way to connect to host. It
> >>>>> is no harm if there is only map/unmap notifier. But if we have
> >>>>> more notifiers other than map/unmap, it may be a problem.
> >>>>>
> >>>>> I think we need to reconsider it here.
> >>>>
> >>>> For now I think as switching is good to us in general. Could I know
> >>>> more context about this? Would it be okay to work on top of this in the future?
> >>>>
> >>>
> >>> Let me explain. For vSVM enabling, it needs to add new notifier to
> >>> bind gPASID table to host. If an assigned device uses SVM in guest, as
> >>> designed guest IOMMU driver would set "pt" for it. This is to make
> >>> sure the host second-level page table stores GPA->HPA mapping. So that
> >>> pIOMMU can do nested translation to achieve gVA->GPA GPA->HPA mapping.
> >>
> >> That should mean that in the guest it will only enable first level translation, while in
> >> the host it'll be nested (first + second)? Then you should be trapping the guest
> >> extended context entry invalidation, then pass the PASID table pointer downward to
> >> the host IOMMU, am I correct?
> > 
> > exactly.
> > 
> >>
> >>>
> >>> So the situation is if we want to keep GPA->HPA mapping, we should
> >>> skip address space switch, but the vfio listener would not know vIOMMU
> >>> is added, so no vfio notifier would be registered. But if we do the
> >>> switch, the GPA->HPA mapping is unmapped. And need another way to build the
> >> GPA->HPA mapping.
> >>
> >> If my understanding above is correct, current IOMMU notifier may not satisfy your
> >> need. If you see the current notify interface, it's delivering IOMMUTLBEntry. I
> >> suspect it only suites for IOTLB notifications, but not if you want to pass some
> >> pointer (one GPA) downward somewhere.
> > 
> > Yeah, you got it more than absolutely. One of my patch is to modify the para to be
> > void * and let each notifiers to translate separately. I think it should be a reasonable
> > change.
> > 
> > Supposedly, I would send RFC for vSVM soon. We may talk more it at that thread.

I suspect whether it'll be good that we hang everything under current
IOMMU notifiers... But sure I'm looking forward to your RFC. :)

> > 
> >>>
> >>> I think we may have two choice here. Pls let me know your ideas.
> >>>
> >>> 1. skip the switch for "pt" mode, find other way to register vfio
> >>> notifiers. not sure if this is workable. Just a quick thought.
> >>>
> >>> 2. do the switch, and rebuild GPA->HPA mapping if device use "pt"
> >>> mode. For this option, I also have two way to build the GPA->HPA mapping.
> >>> a) walk all the memory region sections in address_space_memory, and build the
> >> mapping.
> >>> address_space_memory.dispatch->map.sections, sections stores all the mapped
> >> sections.
> >>>
> >>> b) let guest iommu driver mimics a 1:1 mapping for the devices use
> >>> "pt" mode. in this way, the GPA->HPA mapping could be setup by walking
> >>> the guest SL page table. This is consistent with your vIOVA replay solution.
> >>
> >> I'll prefer (1). Reason explained above.
> >>
> >>>
> >>> Also I'm curious if Tianyu's fault report framework needs to register new notifiers.
> >>
> >> For Tianyu's case, I feel like we need other kind of notifier as well, but not the
> >> current IOTLB one. And, of course in this case it'll be in an reverted direction, which
> >> is from device to vIOMMU.
> >>
> >> (I am thinking whether it's a good time now to let any PCI device able  to fetch its
> >> direct owner IOMMU object, then it can register anything  it want onto that object
> >> maybe?)
> >>
> > Hmmm, let's see Tianyu's comment as he's driving fault report. Let's keep in touch on
> > this Passthru-Mode enabling.
> 
> In my previous RFC patchset of fault event reporting, I registered fault
> notifier when there is a VFIO group attached to VFIO container and used
> the address space to check whether vIOMMU is added. The address space is
> returned by vtd_host_dma_iommu(). vtd_find_add_as() initializes device's
> IOMMU memory region and put it into device's address space as root
> memory region.
> Does this make sense?
> 
> @@ -1103,6 +1132,14 @@ static int vfio_connect_container(VFIOGroup
> *group, AddressSpace *as,
>          goto listener_release_exit;
>      }
> 
> +    if (memory_region_is_iommu(container->space->as->root)) {

I would suggest we don't play with as->root any more. After vtd vfio
series, this may not be true if passthrough mode is enabled (then the
root may be switched to an system memory alias). I don't know the best
way to check this, one alternative might be that we check whether
container->space->as == system_memory(), it should be workable, but in
a slightly hackish way.

Thanks,

> +        if (vfio_set_iommu_fault_notifier(container)) {
> +            error_setg_errno(errp, -ret,
> +                "Fail to set IOMMU fault notifier");
> +            goto listener_release_exit;
> +        }
> +    }
> +
>      container->initialized = true;
> 
> -- 
> Best regards
> Tianyu Lan
Peter Xu April 20, 2017, 3:07 a.m. UTC | #9
On Wed, Apr 19, 2017 at 07:13:47AM +0000, Liu, Yi L wrote:
> Peter,
> 
> Besides the gPA->hPA mapping, pt mode enabling requires some sanity check on
> s->pt_supported. See comments inline.
> 
> > -----Original Message-----
> > From: Qemu-devel [mailto:qemu-devel-bounces+yi.l.liu=intel.com@nongnu.org] On
> > Behalf Of Peter Xu
> > Sent: Monday, April 17, 2017 7:32 PM
> > To: qemu-devel@nongnu.org
> > Cc: Lan, Tianyu <tianyu.lan@intel.com>; Michael S . Tsirkin <mst@redhat.com>;
> > Jason Wang <jasowang@redhat.com>; peterx@redhat.com; Marcel Apfelbaum
> > <marcel@redhat.com>; David Gibson <david@gibson.dropbear.id.au>
> > Subject: [Qemu-devel] [PATCH v2 7/7] intel_iommu: support passthrough (PT)
> > 
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> >  hw/i386/intel_iommu.c          | 109 +++++++++++++++++++++++++++++++++--------
> >  hw/i386/intel_iommu_internal.h |   1 +
> >  hw/i386/trace-events           |   1 +
> >  hw/i386/x86-iommu.c            |   1 +
> >  include/hw/i386/x86-iommu.h    |   1 +
> >  5 files changed, 93 insertions(+), 20 deletions(-)
> > 
> > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index
> > 05ae631..deb2007 100644
> > --- a/hw/i386/intel_iommu.c
> > +++ b/hw/i386/intel_iommu.c
> > @@ -872,7 +872,7 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s,
> > uint8_t bus_num,
> >      } else {
> >          switch (vtd_ce_get_type(ce)) {
> >          case VTD_CONTEXT_TT_MULTI_LEVEL:
> > -            /* fall through */
> > +        case VTD_CONTEXT_TT_PASS_THROUGH:
> 
> Passthru type is a reserved type if ecap.PT=0, so add a check here. If s->pt_supported
> is false, report this translation type as unsupported and report an invalid context 
> programming to guest.
> 
> >          case VTD_CONTEXT_TT_DEV_IOTLB:
> >              break;
> >          default:
> > @@ -883,6 +883,73 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s,
> > uint8_t bus_num,
> >      return 0;
> >  }
> > 
> > +/* Fetch translation type for specific device. Returns <0 if error
> > + * happens, otherwise return the shifted type to check against
> > + * VTD_CONTEXT_TT_*. */
> > +static int vtd_dev_get_trans_type(VTDAddressSpace *as) {
> > +    IntelIOMMUState *s;
> > +    VTDContextEntry ce;
> > +    int ret;
> > +
> > +    s = as->iommu_state;
> > +
> > +    ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus),
> > +                                   as->devfn, &ce);
> > +    if (ret) {
> > +        return ret;
> > +    }
> > +
> > +    return vtd_ce_get_type(&ce);
> > +}
> > +
> > +static bool vtd_dev_pt_enabled(VTDAddressSpace *as) {
> > +    int ret;
> > +
> > +    assert(as);
> > +
> > +    ret = vtd_dev_get_trans_type(as);
> > +    if (ret < 0) {
> > +        /*
> > +         * Possibly failed to parse the context entry for some reason
> > +         * (e.g., during init, or any guest configuration errors on
> > +         * context entries). We should assume PT not enabled for
> > +         * safety.
> > +         */
> > +        return false;
> > +    }
> > +
> > +    return ret == VTD_CONTEXT_TT_PASS_THROUGH; }
> > +
> > +static void vtd_switch_address_space(VTDAddressSpace *as) {
> > +    bool use_iommu;
> > +
> > +    assert(as);
> > +
> > +    use_iommu = as->iommu_state->dmar_enabled;
> > +    if (use_iommu) {
> 
> if s->pt_supported is false, no need to check the context.TT, even
> context.TT=pt, then it should be an invalid context programming.

Agree with you on both comments. Thanks. :-)
Yi Liu April 20, 2017, 4:55 a.m. UTC | #10
> -----Original Message-----

> From: Peter Xu [mailto:peterx@redhat.com]

> Sent: Thursday, April 20, 2017 11:04 AM

> To: Lan, Tianyu <tianyu.lan@intel.com>

> Cc: Liu, Yi L <yi.l.liu@intel.com>; qemu-devel@nongnu.org; Michael S . Tsirkin

> <mst@redhat.com>; Jason Wang <jasowang@redhat.com>; Marcel Apfelbaum

> <marcel@redhat.com>; David Gibson <david@gibson.dropbear.id.au>; Tian, Kevin

> <kevin.tian@intel.com>

> Subject: Re: [Qemu-devel] [PATCH v2 7/7] intel_iommu: support passthrough (PT)

> 

> On Wed, Apr 19, 2017 at 03:27:52PM +0800, Lan Tianyu wrote:

> > Hi All:

> > 	Sorry for later response.

> > On 2017年04月18日 17:04, Liu, Yi L wrote:

> > >> -----Original Message-----

> > >> From: Peter Xu [mailto:peterx@redhat.com]

> > >> Sent: Tuesday, April 18, 2017 3:27 PM

> > >> To: Liu, Yi L <yi.l.liu@intel.com>

> > >> Cc: qemu-devel@nongnu.org; Lan, Tianyu <tianyu.lan@intel.com>; Michael S .

> > >> Tsirkin <mst@redhat.com>; Jason Wang <jasowang@redhat.com>; Marcel

> > >> Apfelbaum <marcel@redhat.com>; David Gibson

> > >> <david@gibson.dropbear.id.au>

> > >> Subject: Re: [Qemu-devel] [PATCH v2 7/7] intel_iommu: support

> > >> passthrough (PT)

> > >>

> > >> On Tue, Apr 18, 2017 at 06:02:44AM +0000, Liu, Yi L wrote:

> > >>>>> Hi Peter,

> > >>>>>

> > >>>>> Skip address space switching is a good idea to support Passthru mode.

> > >>>>> However, without the address space, the vfio notifier would not

> > >>>>> be registered, thus vIOMMU emulator has no way to connect to

> > >>>>> host. It is no harm if there is only map/unmap notifier. But if

> > >>>>> we have more notifiers other than map/unmap, it may be a problem.

> > >>>>>

> > >>>>> I think we need to reconsider it here.

> > >>>>

> > >>>> For now I think as switching is good to us in general. Could I

> > >>>> know more context about this? Would it be okay to work on top of this in the

> future?

> > >>>>

> > >>>

> > >>> Let me explain. For vSVM enabling, it needs to add new notifier to

> > >>> bind gPASID table to host. If an assigned device uses SVM in

> > >>> guest, as designed guest IOMMU driver would set "pt" for it. This

> > >>> is to make sure the host second-level page table stores GPA->HPA

> > >>> mapping. So that pIOMMU can do nested translation to achieve gVA->GPA

> GPA->HPA mapping.

> > >>

> > >> That should mean that in the guest it will only enable first level

> > >> translation, while in the host it'll be nested (first + second)?

> > >> Then you should be trapping the guest extended context entry

> > >> invalidation, then pass the PASID table pointer downward to the host IOMMU,

> am I correct?

> > >

> > > exactly.

> > >

> > >>

> > >>>

> > >>> So the situation is if we want to keep GPA->HPA mapping, we should

> > >>> skip address space switch, but the vfio listener would not know

> > >>> vIOMMU is added, so no vfio notifier would be registered. But if

> > >>> we do the switch, the GPA->HPA mapping is unmapped. And need

> > >>> another way to build the

> > >> GPA->HPA mapping.

> > >>

> > >> If my understanding above is correct, current IOMMU notifier may

> > >> not satisfy your need. If you see the current notify interface,

> > >> it's delivering IOMMUTLBEntry. I suspect it only suites for IOTLB

> > >> notifications, but not if you want to pass some pointer (one GPA) downward

> somewhere.

> > >

> > > Yeah, you got it more than absolutely. One of my patch is to modify

> > > the para to be void * and let each notifiers to translate

> > > separately. I think it should be a reasonable change.

> > >

> > > Supposedly, I would send RFC for vSVM soon. We may talk more it at that thread.

> 

> I suspect whether it'll be good that we hang everything under current IOMMU

> notifiers... But sure I'm looking forward to your RFC. :)


The current IOMMU notifier is no doubt MAP/UNMAP specific. However, I think it should
be a common requirement that vIOMMU emulator needs notifiers other than MAP/UNMAP
to notify pIOMMU. So I think it's better to show the basic design for vSVM. And discuss
the best way to place the new notifiers. Would accelerate my work. And thx for your
understanding.

> > >

> > >>>

> > >>> I think we may have two choice here. Pls let me know your ideas.

> > >>>

> > >>> 1. skip the switch for "pt" mode, find other way to register vfio

> > >>> notifiers. not sure if this is workable. Just a quick thought.

> > >>>

> > >>> 2. do the switch, and rebuild GPA->HPA mapping if device use "pt"

> > >>> mode. For this option, I also have two way to build the GPA->HPA mapping.

> > >>> a) walk all the memory region sections in address_space_memory,

> > >>> and build the

> > >> mapping.

> > >>> address_space_memory.dispatch->map.sections, sections stores all

> > >>> the mapped

> > >> sections.

> > >>>

> > >>> b) let guest iommu driver mimics a 1:1 mapping for the devices use

> > >>> "pt" mode. in this way, the GPA->HPA mapping could be setup by

> > >>> walking the guest SL page table. This is consistent with your vIOVA replay

> solution.

> > >>

> > >> I'll prefer (1). Reason explained above.

> > >>

> > >>>

> > >>> Also I'm curious if Tianyu's fault report framework needs to register new

> notifiers.

> > >>

> > >> For Tianyu's case, I feel like we need other kind of notifier as

> > >> well, but not the current IOTLB one. And, of course in this case

> > >> it'll be in an reverted direction, which is from device to vIOMMU.

> > >>

> > >> (I am thinking whether it's a good time now to let any PCI device

> > >> able  to fetch its direct owner IOMMU object, then it can register

> > >> anything  it want onto that object

> > >> maybe?)

> > >>

> > > Hmmm, let's see Tianyu's comment as he's driving fault report. Let's

> > > keep in touch on this Passthru-Mode enabling.

> >

> > In my previous RFC patchset of fault event reporting, I registered

> > fault notifier when there is a VFIO group attached to VFIO container

> > and used the address space to check whether vIOMMU is added. The

> > address space is returned by vtd_host_dma_iommu(). vtd_find_add_as()

> > initializes device's IOMMU memory region and put it into device's

> > address space as root memory region.

> > Does this make sense?

> >

> > @@ -1103,6 +1132,14 @@ static int vfio_connect_container(VFIOGroup

> > *group, AddressSpace *as,

> >          goto listener_release_exit;

> >      }

> >

> > +    if (memory_region_is_iommu(container->space->as->root)) {

> 

> I would suggest we don't play with as->root any more. After vtd vfio series, this may

> not be true if passthrough mode is enabled (then the root may be switched to an

> system memory alias). I don't know the best way to check this, one alternative might

> be that we check whether

> container->space->as == system_memory(), it should be workable, but in

> a slightly hackish way.


In my understanding, container->space->as->root cannot work here no matter passthru-mode
is enabled or not. The code here is aiming to check if vIOMMU exists. After the vfio series,
the vtd_dev_as->root is not initialized to be a iommu MemoryRegion. Compared with checking
if it is system_memory(), I think adding a mechanism to get the iommu MemoryRegion may
be a better choice. Just like the current pci_device_iommu_address_space().

> 

> > +        if (vfio_set_iommu_fault_notifier(container)) {

> > +            error_setg_errno(errp, -ret,

> > +                "Fail to set IOMMU fault notifier");

> > +            goto listener_release_exit;

> > +        }

> > +    }

> > +

> >      container->initialized = true;

> >

> > --

> > Best regards

> > Tianyu Lan

> 

> --

> Peter Xu


Thanks,
Yi L
Peter Xu April 20, 2017, 5:40 a.m. UTC | #11
On Thu, Apr 20, 2017 at 04:55:24AM +0000, Liu, Yi L wrote:

[...]

> > > In my previous RFC patchset of fault event reporting, I registered
> > > fault notifier when there is a VFIO group attached to VFIO container
> > > and used the address space to check whether vIOMMU is added. The
> > > address space is returned by vtd_host_dma_iommu(). vtd_find_add_as()
> > > initializes device's IOMMU memory region and put it into device's
> > > address space as root memory region.
> > > Does this make sense?
> > >
> > > @@ -1103,6 +1132,14 @@ static int vfio_connect_container(VFIOGroup
> > > *group, AddressSpace *as,
> > >          goto listener_release_exit;
> > >      }
> > >
> > > +    if (memory_region_is_iommu(container->space->as->root)) {
> > 
> > I would suggest we don't play with as->root any more. After vtd vfio series, this may
> > not be true if passthrough mode is enabled (then the root may be switched to an
> > system memory alias). I don't know the best way to check this, one alternative might
> > be that we check whether
> > container->space->as == system_memory(), it should be workable, but in

Sorry, I was meaning &address_space_memory.

> > a slightly hackish way.
> 
> In my understanding, container->space->as->root cannot work here no matter passthru-mode
> is enabled or not. The code here is aiming to check if vIOMMU exists. After the vfio series,
> the vtd_dev_as->root is not initialized to be a iommu MemoryRegion. Compared with checking
> if it is system_memory(), I think adding a mechanism to get the iommu MemoryRegion may
> be a better choice. Just like the current pci_device_iommu_address_space().

Isn't pci_device_iommu_address_space() used to get that IOMMU memory
region? And, one thing to mention is that container->space->as is
actually derived from pci_device_iommu_address_space() (when calling
vfio_get_group()).

I feel like that playing around with an IOMMU memory region is still
not clear enough in many cases. I still feel like some day we would
like an "IOMMU object". Then, we can register non-iotlb notifiers
against that IOMMU object, rather than memory regions...

Thanks,
Yi Liu April 20, 2017, 6:36 a.m. UTC | #12
> -----Original Message-----

> From: Peter Xu [mailto:peterx@redhat.com]

> Sent: Thursday, April 20, 2017 1:40 PM

> To: Liu, Yi L <yi.l.liu@intel.com>

> Cc: Lan, Tianyu <tianyu.lan@intel.com>; qemu-devel@nongnu.org; Michael S .

> Tsirkin <mst@redhat.com>; Jason Wang <jasowang@redhat.com>; Marcel

> Apfelbaum <marcel@redhat.com>; David Gibson <david@gibson.dropbear.id.au>;

> Tian, Kevin <kevin.tian@intel.com>

> Subject: Re: [Qemu-devel] [PATCH v2 7/7] intel_iommu: support passthrough (PT)

> 

> On Thu, Apr 20, 2017 at 04:55:24AM +0000, Liu, Yi L wrote:

> 

> [...]

> 

> > > > In my previous RFC patchset of fault event reporting, I registered

> > > > fault notifier when there is a VFIO group attached to VFIO

> > > > container and used the address space to check whether vIOMMU is

> > > > added. The address space is returned by vtd_host_dma_iommu().

> > > > vtd_find_add_as() initializes device's IOMMU memory region and put

> > > > it into device's address space as root memory region.

> > > > Does this make sense?

> > > >

> > > > @@ -1103,6 +1132,14 @@ static int vfio_connect_container(VFIOGroup

> > > > *group, AddressSpace *as,

> > > >          goto listener_release_exit;

> > > >      }

> > > >

> > > > +    if (memory_region_is_iommu(container->space->as->root)) {

> > >

> > > I would suggest we don't play with as->root any more. After vtd vfio

> > > series, this may not be true if passthrough mode is enabled (then

> > > the root may be switched to an system memory alias). I don't know

> > > the best way to check this, one alternative might be that we check

> > > whether

> > > container->space->as == system_memory(), it should be workable, but

> > > container->space->in

> 

> Sorry, I was meaning &address_space_memory.


correct, I got your point all the same.

> 

> > > a slightly hackish way.

> >

> > In my understanding, container->space->as->root cannot work here no

> > matter passthru-mode is enabled or not. The code here is aiming to

> > check if vIOMMU exists. After the vfio series, the vtd_dev_as->root is

> > not initialized to be a iommu MemoryRegion. Compared with checking if

> > it is system_memory(), I think adding a mechanism to get the iommu

> MemoryRegion may be a better choice. Just like the current

> pci_device_iommu_address_space().

> 

> Isn't pci_device_iommu_address_space() used to get that IOMMU memory region?


It actually returns the AddressSpace, and the AddressSpace includes a memory region.
It is as->root. But after adding the vfio series, through the IOMMU memory region
is got, but it has no iommu_ops. Just as the following code shows. That's why I said
even without passthru-mode, Tianyu's that code snippet is not able to get the correct
check.

        memory_region_init(&vtd_dev_as->root, OBJECT(s),
                           "vtd_root", UINT64_MAX);
        address_space_init(&vtd_dev_as->as, &vtd_dev_as->root, name);

> And, one thing to mention is that container->space->as is actually derived from

> pci_device_iommu_address_space() (when calling vfio_get_group()).


yes, it is.

> I feel like that playing around with an IOMMU memory region is still not clear

> enough in many cases. I still feel like some day we would like an "IOMMU object".

> Then, we can register non-iotlb notifiers against that IOMMU object, rather than

> memory regions...

> 

haven’t think much about it. if you have any early patch, may be glad to have a look.

Thanks,
Yi L
Lan Tianyu April 20, 2017, 6:51 a.m. UTC | #13
On 4/20/2017 1:40 PM, Peter Xu wrote:
> On Thu, Apr 20, 2017 at 04:55:24AM +0000, Liu, Yi L wrote:
>
> [...]
>
>>>> In my previous RFC patchset of fault event reporting, I registered
>>>> fault notifier when there is a VFIO group attached to VFIO container
>>>> and used the address space to check whether vIOMMU is added. The
>>>> address space is returned by vtd_host_dma_iommu(). vtd_find_add_as()
>>>> initializes device's IOMMU memory region and put it into device's
>>>> address space as root memory region.
>>>> Does this make sense?
>>>>
>>>> @@ -1103,6 +1132,14 @@ static int vfio_connect_container(VFIOGroup
>>>> *group, AddressSpace *as,
>>>>          goto listener_release_exit;
>>>>      }
>>>>
>>>> +    if (memory_region_is_iommu(container->space->as->root)) {
>>>
>>> I would suggest we don't play with as->root any more. After vtd vfio series, this may
>>> not be true if passthrough mode is enabled (then the root may be switched to an
>>> system memory alias). I don't know the best way to check this, one alternative might
>>> be that we check whether
>>> container->space->as == system_memory(), it should be workable, but in
>
> Sorry, I was meaning &address_space_memory.
>
>>> a slightly hackish way.
>>
>> In my understanding, container->space->as->root cannot work here no matter passthru-mode
>> is enabled or not. The code here is aiming to check if vIOMMU exists. After the vfio series,
>> the vtd_dev_as->root is not initialized to be a iommu MemoryRegion. Compared with checking
>> if it is system_memory(), I think adding a mechanism to get the iommu MemoryRegion may
>> be a better choice. Just like the current pci_device_iommu_address_space().
>
> Isn't pci_device_iommu_address_space() used to get that IOMMU memory
> region? And, one thing to mention is that container->space->as is
> actually derived from pci_device_iommu_address_space() (when calling
> vfio_get_group()).
>
> I feel like that playing around with an IOMMU memory region is still
> not clear enough in many cases. I still feel like some day we would
> like an "IOMMU object". Then, we can register non-iotlb notifiers
> against that IOMMU object, rather than memory regions...

Our target is to check whether assigned device is under a vIOMMU.
We may check whether iommu_fn point has been populated in its pci bus' 
data structure(PCIDevice). This is what pci_device_iommu_address_space()
does.
Peter Xu April 20, 2017, 7 a.m. UTC | #14
On Thu, Apr 20, 2017 at 02:51:11PM +0800, Lan, Tianyu wrote:
> On 4/20/2017 1:40 PM, Peter Xu wrote:

[...]

> >>>a slightly hackish way.
> >>
> >>In my understanding, container->space->as->root cannot work here no matter passthru-mode
> >>is enabled or not. The code here is aiming to check if vIOMMU exists. After the vfio series,
> >>the vtd_dev_as->root is not initialized to be a iommu MemoryRegion. Compared with checking
> >>if it is system_memory(), I think adding a mechanism to get the iommu MemoryRegion may
> >>be a better choice. Just like the current pci_device_iommu_address_space().
> >
> >Isn't pci_device_iommu_address_space() used to get that IOMMU memory
> >region? And, one thing to mention is that container->space->as is
> >actually derived from pci_device_iommu_address_space() (when calling
> >vfio_get_group()).
> >
> >I feel like that playing around with an IOMMU memory region is still
> >not clear enough in many cases. I still feel like some day we would
> >like an "IOMMU object". Then, we can register non-iotlb notifiers
> >against that IOMMU object, rather than memory regions...
> 
> Our target is to check whether assigned device is under a vIOMMU.
> We may check whether iommu_fn point has been populated in its pci bus' data
> structure(PCIDevice). This is what pci_device_iommu_address_space()
> does.

Agreed. Thanks,
Peter Xu April 20, 2017, 7:04 a.m. UTC | #15
On Thu, Apr 20, 2017 at 06:36:16AM +0000, Liu, Yi L wrote:

[...]

> > > In my understanding, container->space->as->root cannot work here no
> > > matter passthru-mode is enabled or not. The code here is aiming to
> > > check if vIOMMU exists. After the vfio series, the vtd_dev_as->root is
> > > not initialized to be a iommu MemoryRegion. Compared with checking if
> > > it is system_memory(), I think adding a mechanism to get the iommu
> > MemoryRegion may be a better choice. Just like the current
> > pci_device_iommu_address_space().
> > 
> > Isn't pci_device_iommu_address_space() used to get that IOMMU memory region?

Again I should say s/memory region/address space/...

> 
> It actually returns the AddressSpace, and the AddressSpace includes a memory region.
> It is as->root. But after adding the vfio series, through the IOMMU memory region
> is got, but it has no iommu_ops. Just as the following code shows. That's why I said
> even without passthru-mode, Tianyu's that code snippet is not able to get the correct
> check.
> 
>         memory_region_init(&vtd_dev_as->root, OBJECT(s),
>                            "vtd_root", UINT64_MAX);
>         address_space_init(&vtd_dev_as->as, &vtd_dev_as->root, name);

The problem is, I am not sure whether there is always _only_ one IOMMU
region behind one device. E.g., IIUC ppc can have more than one IOMMU
memory regions, but translate for different address ranges.

Thanks,
Lan Tianyu April 20, 2017, 7:10 a.m. UTC | #16
On 2017年04月20日 15:04, Peter Xu wrote:
> On Thu, Apr 20, 2017 at 06:36:16AM +0000, Liu, Yi L wrote:
> 
> [...]
> 
>>>> In my understanding, container->space->as->root cannot work here no
>>>> matter passthru-mode is enabled or not. The code here is aiming to
>>>> check if vIOMMU exists. After the vfio series, the vtd_dev_as->root is
>>>> not initialized to be a iommu MemoryRegion. Compared with checking if
>>>> it is system_memory(), I think adding a mechanism to get the iommu
>>> MemoryRegion may be a better choice. Just like the current
>>> pci_device_iommu_address_space().
>>>
>>> Isn't pci_device_iommu_address_space() used to get that IOMMU memory region?
> 
> Again I should say s/memory region/address space/...
> 
>>
>> It actually returns the AddressSpace, and the AddressSpace includes a memory region.
>> It is as->root. But after adding the vfio series, through the IOMMU memory region
>> is got, but it has no iommu_ops. Just as the following code shows. That's why I said
>> even without passthru-mode, Tianyu's that code snippet is not able to get the correct
>> check.
>>
>>         memory_region_init(&vtd_dev_as->root, OBJECT(s),
>>                            "vtd_root", UINT64_MAX);
>>         address_space_init(&vtd_dev_as->as, &vtd_dev_as->root, name);
> 
> The problem is, I am not sure whether there is always _only_ one IOMMU
> region behind one device. E.g., IIUC ppc can have more than one IOMMU
> memory regions, but translate for different address ranges.
> 

If we really want to check this via memory region, we may check all
subregions under container's address space. Register vIOMMU related
notifiers if one of them has iommu_ops,
diff mbox

Patch

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 05ae631..deb2007 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -872,7 +872,7 @@  static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
     } else {
         switch (vtd_ce_get_type(ce)) {
         case VTD_CONTEXT_TT_MULTI_LEVEL:
-            /* fall through */
+        case VTD_CONTEXT_TT_PASS_THROUGH:
         case VTD_CONTEXT_TT_DEV_IOTLB:
             break;
         default:
@@ -883,6 +883,73 @@  static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
     return 0;
 }
 
+/* Fetch translation type for specific device. Returns <0 if error
+ * happens, otherwise return the shifted type to check against
+ * VTD_CONTEXT_TT_*. */
+static int vtd_dev_get_trans_type(VTDAddressSpace *as)
+{
+    IntelIOMMUState *s;
+    VTDContextEntry ce;
+    int ret;
+
+    s = as->iommu_state;
+
+    ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus),
+                                   as->devfn, &ce);
+    if (ret) {
+        return ret;
+    }
+
+    return vtd_ce_get_type(&ce);
+}
+
+static bool vtd_dev_pt_enabled(VTDAddressSpace *as)
+{
+    int ret;
+
+    assert(as);
+
+    ret = vtd_dev_get_trans_type(as);
+    if (ret < 0) {
+        /*
+         * Possibly failed to parse the context entry for some reason
+         * (e.g., during init, or any guest configuration errors on
+         * context entries). We should assume PT not enabled for
+         * safety.
+         */
+        return false;
+    }
+
+    return ret == VTD_CONTEXT_TT_PASS_THROUGH;
+}
+
+static void vtd_switch_address_space(VTDAddressSpace *as)
+{
+    bool use_iommu;
+
+    assert(as);
+
+    use_iommu = as->iommu_state->dmar_enabled;
+    if (use_iommu) {
+        /* Further checks per-device configuration */
+        use_iommu &= !vtd_dev_pt_enabled(as);
+    }
+
+    trace_vtd_switch_address_space(pci_bus_num(as->bus),
+                                   VTD_PCI_SLOT(as->devfn),
+                                   VTD_PCI_FUNC(as->devfn),
+                                   use_iommu);
+
+    /* Turn off first then on the other */
+    if (use_iommu) {
+        memory_region_set_enabled(&as->sys_alias, false);
+        memory_region_set_enabled(&as->iommu, true);
+    } else {
+        memory_region_set_enabled(&as->iommu, false);
+        memory_region_set_enabled(&as->sys_alias, true);
+    }
+}
+
 static inline uint16_t vtd_make_source_id(uint8_t bus_num, uint8_t devfn)
 {
     return ((bus_num & 0xffUL) << 8) | (devfn & 0xffUL);
@@ -991,6 +1058,18 @@  static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
         cc_entry->context_cache_gen = s->context_cache_gen;
     }
 
+    /*
+     * We don't need to translate for pass-through context entries.
+     * Also, let's ignore IOTLB caching as well for PT devices.
+     */
+    if (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH) {
+        entry->translated_addr = entry->iova;
+        entry->addr_mask = VTD_PAGE_SIZE - 1;
+        entry->perm = IOMMU_RW;
+        trace_vtd_translate_pt(source_id, entry->iova);
+        return;
+    }
+
     ret_fr = vtd_iova_to_slpte(&ce, addr, is_write, &slpte, &level,
                                &reads, &writes);
     if (ret_fr) {
@@ -1135,6 +1214,11 @@  static void vtd_context_device_invalidate(IntelIOMMUState *s,
                                              VTD_PCI_FUNC(devfn_it));
                 vtd_as->context_cache_entry.context_cache_gen = 0;
                 /*
+                 * Do switch address space when needed, in case if the
+                 * device passthrough bit is switched.
+                 */
+                vtd_switch_address_space(vtd_as);
+                /*
                  * So a device is moving out of (or moving into) a
                  * domain, a replay() suites here to notify all the
                  * IOMMU_NOTIFIER_MAP registers about this change.
@@ -1366,25 +1450,6 @@  static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s)
     vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS);
 }
 
-static void vtd_switch_address_space(VTDAddressSpace *as)
-{
-    assert(as);
-
-    trace_vtd_switch_address_space(pci_bus_num(as->bus),
-                                   VTD_PCI_SLOT(as->devfn),
-                                   VTD_PCI_FUNC(as->devfn),
-                                   as->iommu_state->dmar_enabled);
-
-    /* Turn off first then on the other */
-    if (as->iommu_state->dmar_enabled) {
-        memory_region_set_enabled(&as->sys_alias, false);
-        memory_region_set_enabled(&as->iommu, true);
-    } else {
-        memory_region_set_enabled(&as->iommu, false);
-        memory_region_set_enabled(&as->sys_alias, true);
-    }
-}
-
 static void vtd_switch_address_space_all(IntelIOMMUState *s)
 {
     GHashTableIter iter;
@@ -2849,6 +2914,10 @@  static void vtd_init(IntelIOMMUState *s)
         s->ecap |= VTD_ECAP_DT;
     }
 
+    if (x86_iommu->pt_supported) {
+        s->ecap |= VTD_ECAP_PT;
+    }
+
     if (s->caching_mode) {
         s->cap |= VTD_CAP_CM;
     }
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 29d6707..0e73a65 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -187,6 +187,7 @@ 
 /* Interrupt Remapping support */
 #define VTD_ECAP_IR                 (1ULL << 3)
 #define VTD_ECAP_EIM                (1ULL << 4)
+#define VTD_ECAP_PT                 (1ULL << 6)
 #define VTD_ECAP_MHMV               (15ULL << 20)
 
 /* CAP_REG */
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index 04a6980..867ad0b 100644
--- a/hw/i386/trace-events
+++ b/hw/i386/trace-events
@@ -38,6 +38,7 @@  vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"P
 vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set"
 vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)"
 vtd_as_unmap_whole(uint8_t bus, uint8_t slot, uint8_t fn, uint64_t iova, uint64_t size) "Device %02x:%02x.%x start 0x%"PRIx64" size 0x%"PRIx64
+vtd_translate_pt(uint16_t sid, uint64_t addr) "source id 0x%"PRIu16", iova 0x%"PRIx64
 
 # hw/i386/amd_iommu.c
 amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" +  offset 0x%"PRIx32
diff --git a/hw/i386/x86-iommu.c b/hw/i386/x86-iommu.c
index 02b8825..293caf8 100644
--- a/hw/i386/x86-iommu.c
+++ b/hw/i386/x86-iommu.c
@@ -91,6 +91,7 @@  static void x86_iommu_realize(DeviceState *dev, Error **errp)
 static Property x86_iommu_properties[] = {
     DEFINE_PROP_BOOL("intremap", X86IOMMUState, intr_supported, false),
     DEFINE_PROP_BOOL("device-iotlb", X86IOMMUState, dt_supported, false),
+    DEFINE_PROP_BOOL("pt", X86IOMMUState, pt_supported, true),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/include/hw/i386/x86-iommu.h b/include/hw/i386/x86-iommu.h
index 361c07c..ef89c0c 100644
--- a/include/hw/i386/x86-iommu.h
+++ b/include/hw/i386/x86-iommu.h
@@ -74,6 +74,7 @@  struct X86IOMMUState {
     SysBusDevice busdev;
     bool intr_supported;        /* Whether vIOMMU supports IR */
     bool dt_supported;          /* Whether vIOMMU supports DT */
+    bool pt_supported;          /* Whether vIOMMU supports pass-through */
     IommuType type;             /* IOMMU type - AMD/Intel     */
     QLIST_HEAD(, IEC_Notifier) iec_notifiers; /* IEC notify list */
 };