diff mbox

[v3,10/12] intel_iommu: support passthrough (PT)

Message ID 1494403315-12760-11-git-send-email-peterx@redhat.com
State New
Headers show

Commit Message

Peter Xu May 10, 2017, 8:01 a.m. UTC
Add hardware support for VT-d device passthrough. Although current Linux
can live with iommu=pt even without this, hardware passthrough is faster
than software passthrough.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 hw/i386/intel_iommu.c          | 210 ++++++++++++++++++++++++++++++++---------
 hw/i386/intel_iommu_internal.h |   1 +
 hw/i386/trace-events           |   2 +
 hw/i386/x86-iommu.c            |   1 +
 include/hw/i386/x86-iommu.h    |   1 +
 5 files changed, 171 insertions(+), 44 deletions(-)

Comments

Jason Wang May 11, 2017, 8:31 a.m. UTC | #1
On 2017年05月10日 16:01, Peter Xu wrote:
> Hardware support for VT-d device passthrough. Although current Linux can
> live with iommu=pt even without this, but this is faster than when using
> software passthrough.
>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>   hw/i386/intel_iommu.c          | 210 ++++++++++++++++++++++++++++++++---------
>   hw/i386/intel_iommu_internal.h |   1 +
>   hw/i386/trace-events           |   2 +
>   hw/i386/x86-iommu.c            |   1 +
>   include/hw/i386/x86-iommu.h    |   1 +
>   5 files changed, 171 insertions(+), 44 deletions(-)
>
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index 1a7eba2..1d034f9 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -640,6 +640,29 @@ static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level)
>       }
>   }
>   
> +/* Find the VTD address space associated with a given bus number */
> +static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num)
> +{
> +    VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num];
> +    if (!vtd_bus) {
> +        /*
> +         * Iterate over the registered buses to find the one which
> +         * currently hold this bus number, and update the bus_num
> +         * lookup table:
> +         */
> +        GHashTableIter iter;
> +
> +        g_hash_table_iter_init(&iter, s->vtd_as_by_busptr);
> +        while (g_hash_table_iter_next (&iter, NULL, (void**)&vtd_bus)) {
> +            if (pci_bus_num(vtd_bus->bus) == bus_num) {
> +                s->vtd_as_by_bus_num[bus_num] = vtd_bus;
> +                return vtd_bus;
> +            }
> +        }
> +    }
> +    return vtd_bus;
> +}
> +
>   /* Given the @iova, get relevant @slptep. @slpte_level will be the last level
>    * of the translation, can be used for deciding the size of large page.
>    */
> @@ -881,6 +904,11 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
>                   type_fail = true;
>               }
>               break;
> +        case VTD_CONTEXT_TT_PASS_THROUGH:
> +            if (!x86_iommu->pt_supported) {
> +                type_fail = true;
> +            }
> +            break;
>           default:
>               /* Unknwon type */
>               type_fail = true;
> @@ -894,6 +922,84 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
>       return 0;
>   }
>   
> +/*
> + * Fetch translation type for specific device. Returns <0 if error
> + * happens, otherwise return the shifted type to check against
> + * VTD_CONTEXT_TT_*.
> + */
> +static int vtd_dev_get_trans_type(VTDAddressSpace *as)
> +{
> +    IntelIOMMUState *s;
> +    VTDContextEntry ce;
> +    int ret;
> +
> +    s = as->iommu_state;
> +
> +    ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus),
> +                                   as->devfn, &ce);
> +    if (ret) {
> +        return ret;
> +    }
> +
> +    return vtd_ce_get_type(&ce);
> +}
> +
> +static bool vtd_dev_pt_enabled(VTDAddressSpace *as)
> +{
> +    int ret;
> +
> +    assert(as);
> +
> +    ret = vtd_dev_get_trans_type(as);
> +    if (ret < 0) {
> +        /*
> +         * Possibly failed to parse the context entry for some reason
> +         * (e.g., during init, or any guest configuration errors on
> +         * context entries). We should assume PT not enabled for
> +         * safety.
> +         */
> +        return false;
> +    }
> +
> +    return ret == VTD_CONTEXT_TT_PASS_THROUGH;
> +}
> +
> +/*
> + * When we are during init phase (device realizations, global
> + * enable/disable of translations), we should not detect PT
> + * (passthrough) when switching address spaces. In that cases, we
> + * should set `detect_pt' to false.
> + *
> + * Return whether the device is using IOMMU translation.
> + */
> +static bool vtd_switch_address_space(VTDAddressSpace *as, bool detect_pt)
> +{

The detect_pt parameter looks suspicious. E.g., if the context entry does
not exist, vtd_dev_pt_enabled() will return false.

> +    bool use_iommu;
> +
> +    assert(as);
> +
> +    use_iommu = as->iommu_state->dmar_enabled;
> +    if (detect_pt) {
> +        use_iommu &= !vtd_dev_pt_enabled(as);
> +    }
> +
> +    trace_vtd_switch_address_space(pci_bus_num(as->bus),
> +                                   VTD_PCI_SLOT(as->devfn),
> +                                   VTD_PCI_FUNC(as->devfn),
> +                                   use_iommu);
> +
> +    /* Turn off first then on the other */
> +    if (use_iommu) {
> +        memory_region_set_enabled(&as->sys_alias, false);
> +        memory_region_set_enabled(&as->iommu, true);
> +    } else {
> +        memory_region_set_enabled(&as->iommu, false);
> +        memory_region_set_enabled(&as->sys_alias, true);
> +    }
> +
> +    return use_iommu;
> +}
> +
>   static inline uint16_t vtd_make_source_id(uint8_t bus_num, uint8_t devfn)
>   {
>       return ((bus_num & 0xffUL) << 8) | (devfn & 0xffUL);
> @@ -931,6 +1037,31 @@ static inline bool vtd_is_interrupt_addr(hwaddr addr)
>       return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= VTD_INTERRUPT_ADDR_LAST;
>   }
>   
> +static void vtd_pt_enable_fast_path(IntelIOMMUState *s, uint16_t source_id)
> +{
> +    VTDBus *vtd_bus;
> +    VTDAddressSpace *vtd_as;
> +    const char *msg = "FAIL";
> +
> +    vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id));
> +    if (!vtd_bus) {
> +        goto out;
> +    }
> +
> +    vtd_as = vtd_bus->dev_as[VTD_SID_TO_DEVFN(source_id)];
> +    if (!vtd_as) {
> +        goto out;
> +    }
> +
> +    if (vtd_switch_address_space(vtd_as, true) == false) {
> +        /* We switched off IOMMU region successfully. */
> +        msg = "SUCCESS";
> +    }
> +
> +out:
> +    trace_vtd_pt_enable_fast_path(source_id, msg);

Looks like using a boolean is better here.

> +}
> +
>   /* Map dev to context-entry then do a paging-structures walk to do a iommu
>    * translation.
>    *
> @@ -1002,6 +1133,30 @@ static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
>           cc_entry->context_cache_gen = s->context_cache_gen;
>       }
>   
> +    /*
> +     * We don't need to translate for pass-through context entries.
> +     * Also, let's ignore IOTLB caching as well for PT devices.
> +     */
> +    if (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH) {
> +        entry->translated_addr = entry->iova;
> +        entry->addr_mask = VTD_PAGE_SIZE - 1;
> +        entry->perm = IOMMU_RW;
> +        trace_vtd_translate_pt(source_id, entry->iova);
> +
> +        /*
> +         * When this happens, it means firstly caching-mode is not
> +         * enabled, and this is the first passthrough translation for
> +         * the device. Let's enable the fast path for passthrough.
> +         *
> +         * When passthrough is disabled again for the device, we can
> +         * capture it via the context entry invalidation, then the
> +         * IOMMU region can be swapped back.
> +         */
> +        vtd_pt_enable_fast_path(s, source_id);
> +
> +        return;
> +    }
> +
>       ret_fr = vtd_iova_to_slpte(&ce, addr, is_write, &slpte, &level,
>                                  &reads, &writes);
>       if (ret_fr) {
> @@ -1081,29 +1236,6 @@ static void vtd_context_global_invalidate(IntelIOMMUState *s)
>       vtd_iommu_replay_all(s);
>   }
>   
> -
> -/* Find the VTD address space currently associated with a given bus number,
> - */
> -static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num)
> -{
> -    VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num];
> -    if (!vtd_bus) {
> -        /* Iterate over the registered buses to find the one
> -         * which currently hold this bus number, and update the bus_num lookup table:
> -         */
> -        GHashTableIter iter;
> -
> -        g_hash_table_iter_init(&iter, s->vtd_as_by_busptr);
> -        while (g_hash_table_iter_next (&iter, NULL, (void**)&vtd_bus)) {
> -            if (pci_bus_num(vtd_bus->bus) == bus_num) {
> -                s->vtd_as_by_bus_num[bus_num] = vtd_bus;
> -                return vtd_bus;
> -            }
> -        }
> -    }
> -    return vtd_bus;
> -}
> -
>   /* Do a context-cache device-selective invalidation.
>    * @func_mask: FM field after shifting
>    */
> @@ -1146,6 +1278,11 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s,
>                                                VTD_PCI_FUNC(devfn_it));
>                   vtd_as->context_cache_entry.context_cache_gen = 0;
>                   /*
> +                 * Do switch address space when needed, in case if the
> +                 * device passthrough bit is switched.
> +                 */
> +                vtd_switch_address_space(vtd_as, true);

Do we need to do this also in DSI and GLOBAL invalidation?

Thanks

> +                /*
>                    * So a device is moving out of (or moving into) a
>                    * domain, a replay() suites here to notify all the
>                    * IOMMU_NOTIFIER_MAP registers about this change.
> @@ -1377,25 +1514,6 @@ static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s)
>       vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS);
>   }
>   
> -static void vtd_switch_address_space(VTDAddressSpace *as)
> -{
> -    assert(as);
> -
> -    trace_vtd_switch_address_space(pci_bus_num(as->bus),
> -                                   VTD_PCI_SLOT(as->devfn),
> -                                   VTD_PCI_FUNC(as->devfn),
> -                                   as->iommu_state->dmar_enabled);
> -
> -    /* Turn off first then on the other */
> -    if (as->iommu_state->dmar_enabled) {
> -        memory_region_set_enabled(&as->sys_alias, false);
> -        memory_region_set_enabled(&as->iommu, true);
> -    } else {
> -        memory_region_set_enabled(&as->iommu, false);
> -        memory_region_set_enabled(&as->sys_alias, true);
> -    }
> -}
> -
>   static void vtd_switch_address_space_all(IntelIOMMUState *s)
>   {
>       GHashTableIter iter;
> @@ -1408,7 +1526,7 @@ static void vtd_switch_address_space_all(IntelIOMMUState *s)
>               if (!vtd_bus->dev_as[i]) {
>                   continue;
>               }
> -            vtd_switch_address_space(vtd_bus->dev_as[i]);
> +            vtd_switch_address_space(vtd_bus->dev_as[i], false);
>           }
>       }
>   }
> @@ -2712,7 +2830,7 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
>                                               &vtd_dev_as->sys_alias, 1);
>           memory_region_add_subregion_overlap(&vtd_dev_as->root, 0,
>                                               &vtd_dev_as->iommu, 1);
> -        vtd_switch_address_space(vtd_dev_as);
> +        vtd_switch_address_space(vtd_dev_as, false);
>       }
>       return vtd_dev_as;
>   }
> @@ -2860,6 +2978,10 @@ static void vtd_init(IntelIOMMUState *s)
>           s->ecap |= VTD_ECAP_DT;
>       }
>   
> +    if (x86_iommu->pt_supported) {
> +        s->ecap |= VTD_ECAP_PT;
> +    }
> +
>       if (s->caching_mode) {
>           s->cap |= VTD_CAP_CM;
>       }
> diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
> index 29d6707..0e73a65 100644
> --- a/hw/i386/intel_iommu_internal.h
> +++ b/hw/i386/intel_iommu_internal.h
> @@ -187,6 +187,7 @@
>   /* Interrupt Remapping support */
>   #define VTD_ECAP_IR                 (1ULL << 3)
>   #define VTD_ECAP_EIM                (1ULL << 4)
> +#define VTD_ECAP_PT                 (1ULL << 6)
>   #define VTD_ECAP_MHMV               (15ULL << 20)
>   
>   /* CAP_REG */
> diff --git a/hw/i386/trace-events b/hw/i386/trace-events
> index 04a6980..5c3e466 100644
> --- a/hw/i386/trace-events
> +++ b/hw/i386/trace-events
> @@ -38,6 +38,8 @@ vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"P
>   vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set"
>   vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)"
>   vtd_as_unmap_whole(uint8_t bus, uint8_t slot, uint8_t fn, uint64_t iova, uint64_t size) "Device %02x:%02x.%x start 0x%"PRIx64" size 0x%"PRIx64
> +vtd_translate_pt(uint16_t sid, uint64_t addr) "source id 0x%"PRIu16", iova 0x%"PRIx64
> +vtd_pt_enable_fast_path(uint16_t sid, const char *msg) "sid 0x%"PRIu16" %s"
>   
>   # hw/i386/amd_iommu.c
>   amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" +  offset 0x%"PRIx32
> diff --git a/hw/i386/x86-iommu.c b/hw/i386/x86-iommu.c
> index 02b8825..293caf8 100644
> --- a/hw/i386/x86-iommu.c
> +++ b/hw/i386/x86-iommu.c
> @@ -91,6 +91,7 @@ static void x86_iommu_realize(DeviceState *dev, Error **errp)
>   static Property x86_iommu_properties[] = {
>       DEFINE_PROP_BOOL("intremap", X86IOMMUState, intr_supported, false),
>       DEFINE_PROP_BOOL("device-iotlb", X86IOMMUState, dt_supported, false),
> +    DEFINE_PROP_BOOL("pt", X86IOMMUState, pt_supported, true),
>       DEFINE_PROP_END_OF_LIST(),
>   };
>   
> diff --git a/include/hw/i386/x86-iommu.h b/include/hw/i386/x86-iommu.h
> index 361c07c..ef89c0c 100644
> --- a/include/hw/i386/x86-iommu.h
> +++ b/include/hw/i386/x86-iommu.h
> @@ -74,6 +74,7 @@ struct X86IOMMUState {
>       SysBusDevice busdev;
>       bool intr_supported;        /* Whether vIOMMU supports IR */
>       bool dt_supported;          /* Whether vIOMMU supports DT */
> +    bool pt_supported;          /* Whether vIOMMU supports pass-through */
>       IommuType type;             /* IOMMU type - AMD/Intel     */
>       QLIST_HEAD(, IEC_Notifier) iec_notifiers; /* IEC notify list */
>   };
Peter Xu May 11, 2017, 8:48 a.m. UTC | #2
On Thu, May 11, 2017 at 04:31:40PM +0800, Jason Wang wrote:
> 
> 
> On 2017年05月10日 16:01, Peter Xu wrote:
> >Hardware support for VT-d device passthrough. Although current Linux can
> >live with iommu=pt even without this, but this is faster than when using
> >software passthrough.
> >
> >Signed-off-by: Peter Xu <peterx@redhat.com>
> >---
> >  hw/i386/intel_iommu.c          | 210 ++++++++++++++++++++++++++++++++---------
> >  hw/i386/intel_iommu_internal.h |   1 +
> >  hw/i386/trace-events           |   2 +
> >  hw/i386/x86-iommu.c            |   1 +
> >  include/hw/i386/x86-iommu.h    |   1 +
> >  5 files changed, 171 insertions(+), 44 deletions(-)
> >
> >diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> >index 1a7eba2..1d034f9 100644
> >--- a/hw/i386/intel_iommu.c
> >+++ b/hw/i386/intel_iommu.c
> >@@ -640,6 +640,29 @@ static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level)
> >      }
> >  }
> >+/* Find the VTD address space associated with a given bus number */
> >+static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num)
> >+{
> >+    VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num];
> >+    if (!vtd_bus) {
> >+        /*
> >+         * Iterate over the registered buses to find the one which
> >+         * currently hold this bus number, and update the bus_num
> >+         * lookup table:
> >+         */
> >+        GHashTableIter iter;
> >+
> >+        g_hash_table_iter_init(&iter, s->vtd_as_by_busptr);
> >+        while (g_hash_table_iter_next (&iter, NULL, (void**)&vtd_bus)) {
> >+            if (pci_bus_num(vtd_bus->bus) == bus_num) {
> >+                s->vtd_as_by_bus_num[bus_num] = vtd_bus;
> >+                return vtd_bus;
> >+            }
> >+        }
> >+    }
> >+    return vtd_bus;
> >+}
> >+
> >  /* Given the @iova, get relevant @slptep. @slpte_level will be the last level
> >   * of the translation, can be used for deciding the size of large page.
> >   */
> >@@ -881,6 +904,11 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
> >                  type_fail = true;
> >              }
> >              break;
> >+        case VTD_CONTEXT_TT_PASS_THROUGH:
> >+            if (!x86_iommu->pt_supported) {
> >+                type_fail = true;
> >+            }
> >+            break;
> >          default:
> >              /* Unknwon type */
> >              type_fail = true;
> >@@ -894,6 +922,84 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
> >      return 0;
> >  }
> >+/*
> >+ * Fetch translation type for specific device. Returns <0 if error
> >+ * happens, otherwise return the shifted type to check against
> >+ * VTD_CONTEXT_TT_*.
> >+ */
> >+static int vtd_dev_get_trans_type(VTDAddressSpace *as)
> >+{
> >+    IntelIOMMUState *s;
> >+    VTDContextEntry ce;
> >+    int ret;
> >+
> >+    s = as->iommu_state;
> >+
> >+    ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus),
> >+                                   as->devfn, &ce);
> >+    if (ret) {
> >+        return ret;
> >+    }
> >+
> >+    return vtd_ce_get_type(&ce);
> >+}
> >+
> >+static bool vtd_dev_pt_enabled(VTDAddressSpace *as)
> >+{
> >+    int ret;
> >+
> >+    assert(as);
> >+
> >+    ret = vtd_dev_get_trans_type(as);
> >+    if (ret < 0) {
> >+        /*
> >+         * Possibly failed to parse the context entry for some reason
> >+         * (e.g., during init, or any guest configuration errors on
> >+         * context entries). We should assume PT not enabled for
> >+         * safety.
> >+         */
> >+        return false;
> >+    }
> >+
> >+    return ret == VTD_CONTEXT_TT_PASS_THROUGH;
> >+}
> >+
> >+/*
> >+ * When we are during init phase (device realizations, global
> >+ * enable/disable of translations), we should not detect PT
> >+ * (passthrough) when switching address spaces. In that cases, we
> >+ * should set `detect_pt' to false.
> >+ *
> >+ * Return whether the device is using IOMMU translation.
> >+ */
> >+static bool vtd_switch_address_space(VTDAddressSpace *as, bool detect_pt)
> >+{
> 
> The detect_pt looks suspicious. E.g if the context entry does not exist,
> vtd_dev_pt_enabled() will return false.

I forgot why I added that, even after reading the comments I wrote. I
blame too many context switches in my brain recently. :(

(this is an excuse of mine :)

I did some tests and saw nothing wrong when not hacking on this bit. I
will remove it in the next version, at least until the day I remember
why I added it.

And I will try to add more detailed comments in the future.

> 
> >+    bool use_iommu;
> >+
> >+    assert(as);
> >+
> >+    use_iommu = as->iommu_state->dmar_enabled;
> >+    if (detect_pt) {
> >+        use_iommu &= !vtd_dev_pt_enabled(as);
> >+    }
> >+
> >+    trace_vtd_switch_address_space(pci_bus_num(as->bus),
> >+                                   VTD_PCI_SLOT(as->devfn),
> >+                                   VTD_PCI_FUNC(as->devfn),
> >+                                   use_iommu);
> >+
> >+    /* Turn off first then on the other */
> >+    if (use_iommu) {
> >+        memory_region_set_enabled(&as->sys_alias, false);
> >+        memory_region_set_enabled(&as->iommu, true);
> >+    } else {
> >+        memory_region_set_enabled(&as->iommu, false);
> >+        memory_region_set_enabled(&as->sys_alias, true);
> >+    }
> >+
> >+    return use_iommu;
> >+}
> >+
> >  static inline uint16_t vtd_make_source_id(uint8_t bus_num, uint8_t devfn)
> >  {
> >      return ((bus_num & 0xffUL) << 8) | (devfn & 0xffUL);
> >@@ -931,6 +1037,31 @@ static inline bool vtd_is_interrupt_addr(hwaddr addr)
> >      return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= VTD_INTERRUPT_ADDR_LAST;
> >  }
> >+static void vtd_pt_enable_fast_path(IntelIOMMUState *s, uint16_t source_id)
> >+{
> >+    VTDBus *vtd_bus;
> >+    VTDAddressSpace *vtd_as;
> >+    const char *msg = "FAIL";
> >+
> >+    vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id));
> >+    if (!vtd_bus) {
> >+        goto out;
> >+    }
> >+
> >+    vtd_as = vtd_bus->dev_as[VTD_SID_TO_DEVFN(source_id)];
> >+    if (!vtd_as) {
> >+        goto out;
> >+    }
> >+
> >+    if (vtd_switch_address_space(vtd_as, true) == false) {
> >+        /* We switched off IOMMU region successfully. */
> >+        msg = "SUCCESS";
> >+    }
> >+
> >+out:
> >+    trace_vtd_pt_enable_fast_path(source_id, msg);
> 
> Looks like using a boolean is better here.

Sure.

> 
> >+}
> >+
> >  /* Map dev to context-entry then do a paging-structures walk to do a iommu
> >   * translation.
> >   *
> >@@ -1002,6 +1133,30 @@ static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
> >          cc_entry->context_cache_gen = s->context_cache_gen;
> >      }
> >+    /*
> >+     * We don't need to translate for pass-through context entries.
> >+     * Also, let's ignore IOTLB caching as well for PT devices.
> >+     */
> >+    if (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH) {
> >+        entry->translated_addr = entry->iova;
> >+        entry->addr_mask = VTD_PAGE_SIZE - 1;
> >+        entry->perm = IOMMU_RW;
> >+        trace_vtd_translate_pt(source_id, entry->iova);
> >+
> >+        /*
> >+         * When this happens, it means firstly caching-mode is not
> >+         * enabled, and this is the first passthrough translation for
> >+         * the device. Let's enable the fast path for passthrough.
> >+         *
> >+         * When passthrough is disabled again for the device, we can
> >+         * capture it via the context entry invalidation, then the
> >+         * IOMMU region can be swapped back.
> >+         */
> >+        vtd_pt_enable_fast_path(s, source_id);
> >+
> >+        return;
> >+    }
> >+
> >      ret_fr = vtd_iova_to_slpte(&ce, addr, is_write, &slpte, &level,
> >                                 &reads, &writes);
> >      if (ret_fr) {
> >@@ -1081,29 +1236,6 @@ static void vtd_context_global_invalidate(IntelIOMMUState *s)
> >      vtd_iommu_replay_all(s);
> >  }
> >-
> >-/* Find the VTD address space currently associated with a given bus number,
> >- */
> >-static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num)
> >-{
> >-    VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num];
> >-    if (!vtd_bus) {
> >-        /* Iterate over the registered buses to find the one
> >-         * which currently hold this bus number, and update the bus_num lookup table:
> >-         */
> >-        GHashTableIter iter;
> >-
> >-        g_hash_table_iter_init(&iter, s->vtd_as_by_busptr);
> >-        while (g_hash_table_iter_next (&iter, NULL, (void**)&vtd_bus)) {
> >-            if (pci_bus_num(vtd_bus->bus) == bus_num) {
> >-                s->vtd_as_by_bus_num[bus_num] = vtd_bus;
> >-                return vtd_bus;
> >-            }
> >-        }
> >-    }
> >-    return vtd_bus;
> >-}
> >-
> >  /* Do a context-cache device-selective invalidation.
> >   * @func_mask: FM field after shifting
> >   */
> >@@ -1146,6 +1278,11 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s,
> >                                               VTD_PCI_FUNC(devfn_it));
> >                  vtd_as->context_cache_entry.context_cache_gen = 0;
> >                  /*
> >+                 * Do switch address space when needed, in case if the
> >+                 * device passthrough bit is switched.
> >+                 */
> >+                vtd_switch_address_space(vtd_as, true);
> 
> Do we need to do this also in DSI and GLOBAL invalidation?

Yes. This should be optional, at least for Linux, but I will add that
later.

Thanks!
diff mbox

Patch

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 1a7eba2..1d034f9 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -640,6 +640,29 @@  static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level)
     }
 }
 
+/* Find the VTD address space associated with a given bus number */
+static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num)
+{
+    VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num];
+    if (!vtd_bus) {
+        /*
+         * Iterate over the registered buses to find the one which
+         * currently hold this bus number, and update the bus_num
+         * lookup table:
+         */
+        GHashTableIter iter;
+
+        g_hash_table_iter_init(&iter, s->vtd_as_by_busptr);
+        while (g_hash_table_iter_next (&iter, NULL, (void**)&vtd_bus)) {
+            if (pci_bus_num(vtd_bus->bus) == bus_num) {
+                s->vtd_as_by_bus_num[bus_num] = vtd_bus;
+                return vtd_bus;
+            }
+        }
+    }
+    return vtd_bus;
+}
+
 /* Given the @iova, get relevant @slptep. @slpte_level will be the last level
  * of the translation, can be used for deciding the size of large page.
  */
@@ -881,6 +904,11 @@  static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
                 type_fail = true;
             }
             break;
+        case VTD_CONTEXT_TT_PASS_THROUGH:
+            if (!x86_iommu->pt_supported) {
+                type_fail = true;
+            }
+            break;
         default:
             /* Unknwon type */
             type_fail = true;
@@ -894,6 +922,84 @@  static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
     return 0;
 }
 
+/*
+ * Fetch translation type for specific device. Returns <0 if error
+ * happens, otherwise return the shifted type to check against
+ * VTD_CONTEXT_TT_*.
+ */
+static int vtd_dev_get_trans_type(VTDAddressSpace *as)
+{
+    IntelIOMMUState *s;
+    VTDContextEntry ce;
+    int ret;
+
+    s = as->iommu_state;
+
+    ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus),
+                                   as->devfn, &ce);
+    if (ret) {
+        return ret;
+    }
+
+    return vtd_ce_get_type(&ce);
+}
+
+static bool vtd_dev_pt_enabled(VTDAddressSpace *as)
+{
+    int ret;
+
+    assert(as);
+
+    ret = vtd_dev_get_trans_type(as);
+    if (ret < 0) {
+        /*
+         * Possibly failed to parse the context entry for some reason
+         * (e.g., during init, or any guest configuration errors on
+         * context entries). We should assume PT not enabled for
+         * safety.
+         */
+        return false;
+    }
+
+    return ret == VTD_CONTEXT_TT_PASS_THROUGH;
+}
+
+/*
+ * When we are during init phase (device realizations, global
+ * enable/disable of translations), we should not detect PT
+ * (passthrough) when switching address spaces. In that cases, we
+ * should set `detect_pt' to false.
+ *
+ * Return whether the device is using IOMMU translation.
+ */
+static bool vtd_switch_address_space(VTDAddressSpace *as, bool detect_pt)
+{
+    bool use_iommu;
+
+    assert(as);
+
+    use_iommu = as->iommu_state->dmar_enabled;
+    if (detect_pt) {
+        use_iommu &= !vtd_dev_pt_enabled(as);
+    }
+
+    trace_vtd_switch_address_space(pci_bus_num(as->bus),
+                                   VTD_PCI_SLOT(as->devfn),
+                                   VTD_PCI_FUNC(as->devfn),
+                                   use_iommu);
+
+    /* Turn off first then on the other */
+    if (use_iommu) {
+        memory_region_set_enabled(&as->sys_alias, false);
+        memory_region_set_enabled(&as->iommu, true);
+    } else {
+        memory_region_set_enabled(&as->iommu, false);
+        memory_region_set_enabled(&as->sys_alias, true);
+    }
+
+    return use_iommu;
+}
+
 static inline uint16_t vtd_make_source_id(uint8_t bus_num, uint8_t devfn)
 {
     return ((bus_num & 0xffUL) << 8) | (devfn & 0xffUL);
@@ -931,6 +1037,31 @@  static inline bool vtd_is_interrupt_addr(hwaddr addr)
     return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= VTD_INTERRUPT_ADDR_LAST;
 }
 
+static void vtd_pt_enable_fast_path(IntelIOMMUState *s, uint16_t source_id)
+{
+    VTDBus *vtd_bus;
+    VTDAddressSpace *vtd_as;
+    const char *msg = "FAIL";
+
+    vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id));
+    if (!vtd_bus) {
+        goto out;
+    }
+
+    vtd_as = vtd_bus->dev_as[VTD_SID_TO_DEVFN(source_id)];
+    if (!vtd_as) {
+        goto out;
+    }
+
+    if (vtd_switch_address_space(vtd_as, true) == false) {
+        /* We switched off IOMMU region successfully. */
+        msg = "SUCCESS";
+    }
+
+out:
+    trace_vtd_pt_enable_fast_path(source_id, msg);
+}
+
 /* Map dev to context-entry then do a paging-structures walk to do a iommu
  * translation.
  *
@@ -1002,6 +1133,30 @@  static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
         cc_entry->context_cache_gen = s->context_cache_gen;
     }
 
+    /*
+     * We don't need to translate for pass-through context entries.
+     * Also, let's ignore IOTLB caching as well for PT devices.
+     */
+    if (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH) {
+        entry->translated_addr = entry->iova;
+        entry->addr_mask = VTD_PAGE_SIZE - 1;
+        entry->perm = IOMMU_RW;
+        trace_vtd_translate_pt(source_id, entry->iova);
+
+        /*
+         * When this happens, it means firstly caching-mode is not
+         * enabled, and this is the first passthrough translation for
+         * the device. Let's enable the fast path for passthrough.
+         *
+         * When passthrough is disabled again for the device, we can
+         * capture it via the context entry invalidation, then the
+         * IOMMU region can be swapped back.
+         */
+        vtd_pt_enable_fast_path(s, source_id);
+
+        return;
+    }
+
     ret_fr = vtd_iova_to_slpte(&ce, addr, is_write, &slpte, &level,
                                &reads, &writes);
     if (ret_fr) {
@@ -1081,29 +1236,6 @@  static void vtd_context_global_invalidate(IntelIOMMUState *s)
     vtd_iommu_replay_all(s);
 }
 
-
-/* Find the VTD address space currently associated with a given bus number,
- */
-static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num)
-{
-    VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num];
-    if (!vtd_bus) {
-        /* Iterate over the registered buses to find the one
-         * which currently hold this bus number, and update the bus_num lookup table:
-         */
-        GHashTableIter iter;
-
-        g_hash_table_iter_init(&iter, s->vtd_as_by_busptr);
-        while (g_hash_table_iter_next (&iter, NULL, (void**)&vtd_bus)) {
-            if (pci_bus_num(vtd_bus->bus) == bus_num) {
-                s->vtd_as_by_bus_num[bus_num] = vtd_bus;
-                return vtd_bus;
-            }
-        }
-    }
-    return vtd_bus;
-}
-
 /* Do a context-cache device-selective invalidation.
  * @func_mask: FM field after shifting
  */
@@ -1146,6 +1278,11 @@  static void vtd_context_device_invalidate(IntelIOMMUState *s,
                                              VTD_PCI_FUNC(devfn_it));
                 vtd_as->context_cache_entry.context_cache_gen = 0;
                 /*
+                 * Do switch address space when needed, in case if the
+                 * device passthrough bit is switched.
+                 */
+                vtd_switch_address_space(vtd_as, true);
+                /*
                  * So a device is moving out of (or moving into) a
                  * domain, a replay() suites here to notify all the
                  * IOMMU_NOTIFIER_MAP registers about this change.
@@ -1377,25 +1514,6 @@  static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s)
     vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS);
 }
 
-static void vtd_switch_address_space(VTDAddressSpace *as)
-{
-    assert(as);
-
-    trace_vtd_switch_address_space(pci_bus_num(as->bus),
-                                   VTD_PCI_SLOT(as->devfn),
-                                   VTD_PCI_FUNC(as->devfn),
-                                   as->iommu_state->dmar_enabled);
-
-    /* Turn off first then on the other */
-    if (as->iommu_state->dmar_enabled) {
-        memory_region_set_enabled(&as->sys_alias, false);
-        memory_region_set_enabled(&as->iommu, true);
-    } else {
-        memory_region_set_enabled(&as->iommu, false);
-        memory_region_set_enabled(&as->sys_alias, true);
-    }
-}
-
 static void vtd_switch_address_space_all(IntelIOMMUState *s)
 {
     GHashTableIter iter;
@@ -1408,7 +1526,7 @@  static void vtd_switch_address_space_all(IntelIOMMUState *s)
             if (!vtd_bus->dev_as[i]) {
                 continue;
             }
-            vtd_switch_address_space(vtd_bus->dev_as[i]);
+            vtd_switch_address_space(vtd_bus->dev_as[i], false);
         }
     }
 }
@@ -2712,7 +2830,7 @@  VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
                                             &vtd_dev_as->sys_alias, 1);
         memory_region_add_subregion_overlap(&vtd_dev_as->root, 0,
                                             &vtd_dev_as->iommu, 1);
-        vtd_switch_address_space(vtd_dev_as);
+        vtd_switch_address_space(vtd_dev_as, false);
     }
     return vtd_dev_as;
 }
@@ -2860,6 +2978,10 @@  static void vtd_init(IntelIOMMUState *s)
         s->ecap |= VTD_ECAP_DT;
     }
 
+    if (x86_iommu->pt_supported) {
+        s->ecap |= VTD_ECAP_PT;
+    }
+
     if (s->caching_mode) {
         s->cap |= VTD_CAP_CM;
     }
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 29d6707..0e73a65 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -187,6 +187,7 @@ 
 /* Interrupt Remapping support */
 #define VTD_ECAP_IR                 (1ULL << 3)
 #define VTD_ECAP_EIM                (1ULL << 4)
+#define VTD_ECAP_PT                 (1ULL << 6)
 #define VTD_ECAP_MHMV               (15ULL << 20)
 
 /* CAP_REG */
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index 04a6980..5c3e466 100644
--- a/hw/i386/trace-events
+++ b/hw/i386/trace-events
@@ -38,6 +38,8 @@  vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"P
 vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set"
 vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)"
 vtd_as_unmap_whole(uint8_t bus, uint8_t slot, uint8_t fn, uint64_t iova, uint64_t size) "Device %02x:%02x.%x start 0x%"PRIx64" size 0x%"PRIx64
+vtd_translate_pt(uint16_t sid, uint64_t addr) "source id 0x%"PRIx16", iova 0x%"PRIx64
+vtd_pt_enable_fast_path(uint16_t sid, const char *msg) "sid 0x%"PRIx16" %s"
 
 # hw/i386/amd_iommu.c
 amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" +  offset 0x%"PRIx32
diff --git a/hw/i386/x86-iommu.c b/hw/i386/x86-iommu.c
index 02b8825..293caf8 100644
--- a/hw/i386/x86-iommu.c
+++ b/hw/i386/x86-iommu.c
@@ -91,6 +91,7 @@  static void x86_iommu_realize(DeviceState *dev, Error **errp)
 static Property x86_iommu_properties[] = {
     DEFINE_PROP_BOOL("intremap", X86IOMMUState, intr_supported, false),
     DEFINE_PROP_BOOL("device-iotlb", X86IOMMUState, dt_supported, false),
+    DEFINE_PROP_BOOL("pt", X86IOMMUState, pt_supported, true),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/include/hw/i386/x86-iommu.h b/include/hw/i386/x86-iommu.h
index 361c07c..ef89c0c 100644
--- a/include/hw/i386/x86-iommu.h
+++ b/include/hw/i386/x86-iommu.h
@@ -74,6 +74,7 @@  struct X86IOMMUState {
     SysBusDevice busdev;
     bool intr_supported;        /* Whether vIOMMU supports IR */
     bool dt_supported;          /* Whether vIOMMU supports DT */
+    bool pt_supported;          /* Whether vIOMMU supports pass-through */
     IommuType type;             /* IOMMU type - AMD/Intel     */
     QLIST_HEAD(, IEC_Notifier) iec_notifiers; /* IEC notify list */
 };