Message ID | 1494403315-12760-11-git-send-email-peterx@redhat.com |
---|---|
State | New |
Headers | show |
On 2017年05月10日 16:01, Peter Xu wrote: > Hardware support for VT-d device passthrough. Although current Linux can > live with iommu=pt even without this, but this is faster than when using > software passthrough. > > Signed-off-by: Peter Xu <peterx@redhat.com> > --- > hw/i386/intel_iommu.c | 210 ++++++++++++++++++++++++++++++++--------- > hw/i386/intel_iommu_internal.h | 1 + > hw/i386/trace-events | 2 + > hw/i386/x86-iommu.c | 1 + > include/hw/i386/x86-iommu.h | 1 + > 5 files changed, 171 insertions(+), 44 deletions(-) > > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c > index 1a7eba2..1d034f9 100644 > --- a/hw/i386/intel_iommu.c > +++ b/hw/i386/intel_iommu.c > @@ -640,6 +640,29 @@ static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level) > } > } > > +/* Find the VTD address space associated with a given bus number */ > +static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num) > +{ > + VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num]; > + if (!vtd_bus) { > + /* > + * Iterate over the registered buses to find the one which > + * currently hold this bus number, and update the bus_num > + * lookup table: > + */ > + GHashTableIter iter; > + > + g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); > + while (g_hash_table_iter_next (&iter, NULL, (void**)&vtd_bus)) { > + if (pci_bus_num(vtd_bus->bus) == bus_num) { > + s->vtd_as_by_bus_num[bus_num] = vtd_bus; > + return vtd_bus; > + } > + } > + } > + return vtd_bus; > +} > + > /* Given the @iova, get relevant @slptep. @slpte_level will be the last level > * of the translation, can be used for deciding the size of large page. 
> */ > @@ -881,6 +904,11 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, > type_fail = true; > } > break; > + case VTD_CONTEXT_TT_PASS_THROUGH: > + if (!x86_iommu->pt_supported) { > + type_fail = true; > + } > + break; > default: > /* Unknwon type */ > type_fail = true; > @@ -894,6 +922,84 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, > return 0; > } > > +/* > + * Fetch translation type for specific device. Returns <0 if error > + * happens, otherwise return the shifted type to check against > + * VTD_CONTEXT_TT_*. > + */ > +static int vtd_dev_get_trans_type(VTDAddressSpace *as) > +{ > + IntelIOMMUState *s; > + VTDContextEntry ce; > + int ret; > + > + s = as->iommu_state; > + > + ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus), > + as->devfn, &ce); > + if (ret) { > + return ret; > + } > + > + return vtd_ce_get_type(&ce); > +} > + > +static bool vtd_dev_pt_enabled(VTDAddressSpace *as) > +{ > + int ret; > + > + assert(as); > + > + ret = vtd_dev_get_trans_type(as); > + if (ret < 0) { > + /* > + * Possibly failed to parse the context entry for some reason > + * (e.g., during init, or any guest configuration errors on > + * context entries). We should assume PT not enabled for > + * safety. > + */ > + return false; > + } > + > + return ret == VTD_CONTEXT_TT_PASS_THROUGH; > +} > + > +/* > + * When we are during init phase (device realizations, global > + * enable/disable of translations), we should not detect PT > + * (passthrough) when switching address spaces. In that cases, we > + * should set `detect_pt' to false. > + * > + * Return whether the device is using IOMMU translation. > + */ > +static bool vtd_switch_address_space(VTDAddressSpace *as, bool detect_pt) > +{ The detect_pt looks suspicious. E.g if the context entry does not exist, vtd_dev_pt_enabled() will return false. 
> + bool use_iommu; > + > + assert(as); > + > + use_iommu = as->iommu_state->dmar_enabled; > + if (detect_pt) { > + use_iommu &= !vtd_dev_pt_enabled(as); > + } > + > + trace_vtd_switch_address_space(pci_bus_num(as->bus), > + VTD_PCI_SLOT(as->devfn), > + VTD_PCI_FUNC(as->devfn), > + use_iommu); > + > + /* Turn off first then on the other */ > + if (use_iommu) { > + memory_region_set_enabled(&as->sys_alias, false); > + memory_region_set_enabled(&as->iommu, true); > + } else { > + memory_region_set_enabled(&as->iommu, false); > + memory_region_set_enabled(&as->sys_alias, true); > + } > + > + return use_iommu; > +} > + > static inline uint16_t vtd_make_source_id(uint8_t bus_num, uint8_t devfn) > { > return ((bus_num & 0xffUL) << 8) | (devfn & 0xffUL); > @@ -931,6 +1037,31 @@ static inline bool vtd_is_interrupt_addr(hwaddr addr) > return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= VTD_INTERRUPT_ADDR_LAST; > } > > +static void vtd_pt_enable_fast_path(IntelIOMMUState *s, uint16_t source_id) > +{ > + VTDBus *vtd_bus; > + VTDAddressSpace *vtd_as; > + const char *msg = "FAIL"; > + > + vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id)); > + if (!vtd_bus) { > + goto out; > + } > + > + vtd_as = vtd_bus->dev_as[VTD_SID_TO_DEVFN(source_id)]; > + if (!vtd_as) { > + goto out; > + } > + > + if (vtd_switch_address_space(vtd_as, true) == false) { > + /* We switched off IOMMU region successfully. */ > + msg = "SUCCESS"; > + } > + > +out: > + trace_vtd_pt_enable_fast_path(source_id, msg); Looks like using a boolean is better here. > +} > + > /* Map dev to context-entry then do a paging-structures walk to do a iommu > * translation. > * > @@ -1002,6 +1133,30 @@ static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, > cc_entry->context_cache_gen = s->context_cache_gen; > } > > + /* > + * We don't need to translate for pass-through context entries. > + * Also, let's ignore IOTLB caching as well for PT devices. 
> + */ > + if (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH) { > + entry->translated_addr = entry->iova; > + entry->addr_mask = VTD_PAGE_SIZE - 1; > + entry->perm = IOMMU_RW; > + trace_vtd_translate_pt(source_id, entry->iova); > + > + /* > + * When this happens, it means firstly caching-mode is not > + * enabled, and this is the first passthrough translation for > + * the device. Let's enable the fast path for passthrough. > + * > + * When passthrough is disabled again for the device, we can > + * capture it via the context entry invalidation, then the > + * IOMMU region can be swapped back. > + */ > + vtd_pt_enable_fast_path(s, source_id); > + > + return; > + } > + > ret_fr = vtd_iova_to_slpte(&ce, addr, is_write, &slpte, &level, > &reads, &writes); > if (ret_fr) { > @@ -1081,29 +1236,6 @@ static void vtd_context_global_invalidate(IntelIOMMUState *s) > vtd_iommu_replay_all(s); > } > > - > -/* Find the VTD address space currently associated with a given bus number, > - */ > -static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num) > -{ > - VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num]; > - if (!vtd_bus) { > - /* Iterate over the registered buses to find the one > - * which currently hold this bus number, and update the bus_num lookup table: > - */ > - GHashTableIter iter; > - > - g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); > - while (g_hash_table_iter_next (&iter, NULL, (void**)&vtd_bus)) { > - if (pci_bus_num(vtd_bus->bus) == bus_num) { > - s->vtd_as_by_bus_num[bus_num] = vtd_bus; > - return vtd_bus; > - } > - } > - } > - return vtd_bus; > -} > - > /* Do a context-cache device-selective invalidation. > * @func_mask: FM field after shifting > */ > @@ -1146,6 +1278,11 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s, > VTD_PCI_FUNC(devfn_it)); > vtd_as->context_cache_entry.context_cache_gen = 0; > /* > + * Do switch address space when needed, in case if the > + * device passthrough bit is switched. 
> + */ > + vtd_switch_address_space(vtd_as, true); Do we need to do this also in DSI and GLOBAL invalidation? Thanks > + /* > * So a device is moving out of (or moving into) a > * domain, a replay() suites here to notify all the > * IOMMU_NOTIFIER_MAP registers about this change. > @@ -1377,25 +1514,6 @@ static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s) > vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS); > } > > -static void vtd_switch_address_space(VTDAddressSpace *as) > -{ > - assert(as); > - > - trace_vtd_switch_address_space(pci_bus_num(as->bus), > - VTD_PCI_SLOT(as->devfn), > - VTD_PCI_FUNC(as->devfn), > - as->iommu_state->dmar_enabled); > - > - /* Turn off first then on the other */ > - if (as->iommu_state->dmar_enabled) { > - memory_region_set_enabled(&as->sys_alias, false); > - memory_region_set_enabled(&as->iommu, true); > - } else { > - memory_region_set_enabled(&as->iommu, false); > - memory_region_set_enabled(&as->sys_alias, true); > - } > -} > - > static void vtd_switch_address_space_all(IntelIOMMUState *s) > { > GHashTableIter iter; > @@ -1408,7 +1526,7 @@ static void vtd_switch_address_space_all(IntelIOMMUState *s) > if (!vtd_bus->dev_as[i]) { > continue; > } > - vtd_switch_address_space(vtd_bus->dev_as[i]); > + vtd_switch_address_space(vtd_bus->dev_as[i], false); > } > } > } > @@ -2712,7 +2830,7 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn) > &vtd_dev_as->sys_alias, 1); > memory_region_add_subregion_overlap(&vtd_dev_as->root, 0, > &vtd_dev_as->iommu, 1); > - vtd_switch_address_space(vtd_dev_as); > + vtd_switch_address_space(vtd_dev_as, false); > } > return vtd_dev_as; > } > @@ -2860,6 +2978,10 @@ static void vtd_init(IntelIOMMUState *s) > s->ecap |= VTD_ECAP_DT; > } > > + if (x86_iommu->pt_supported) { > + s->ecap |= VTD_ECAP_PT; > + } > + > if (s->caching_mode) { > s->cap |= VTD_CAP_CM; > } > diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h > index 29d6707..0e73a65 100644 
> --- a/hw/i386/intel_iommu_internal.h > +++ b/hw/i386/intel_iommu_internal.h > @@ -187,6 +187,7 @@ > /* Interrupt Remapping support */ > #define VTD_ECAP_IR (1ULL << 3) > #define VTD_ECAP_EIM (1ULL << 4) > +#define VTD_ECAP_PT (1ULL << 6) > #define VTD_ECAP_MHMV (15ULL << 20) > > /* CAP_REG */ > diff --git a/hw/i386/trace-events b/hw/i386/trace-events > index 04a6980..5c3e466 100644 > --- a/hw/i386/trace-events > +++ b/hw/i386/trace-events > @@ -38,6 +38,8 @@ vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"P > vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set" > vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)" > vtd_as_unmap_whole(uint8_t bus, uint8_t slot, uint8_t fn, uint64_t iova, uint64_t size) "Device %02x:%02x.%x start 0x%"PRIx64" size 0x%"PRIx64 > +vtd_translate_pt(uint16_t sid, uint64_t addr) "source id 0x%"PRIu16", iova 0x%"PRIx64 > +vtd_pt_enable_fast_path(uint16_t sid, const char *msg) "sid 0x%"PRIu16" %s" > > # hw/i386/amd_iommu.c > amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" + offset 0x%"PRIx32 > diff --git a/hw/i386/x86-iommu.c b/hw/i386/x86-iommu.c > index 02b8825..293caf8 100644 > --- a/hw/i386/x86-iommu.c > +++ b/hw/i386/x86-iommu.c > @@ -91,6 +91,7 @@ static void x86_iommu_realize(DeviceState *dev, Error **errp) > static Property x86_iommu_properties[] = { > DEFINE_PROP_BOOL("intremap", X86IOMMUState, intr_supported, false), > DEFINE_PROP_BOOL("device-iotlb", X86IOMMUState, dt_supported, false), > + DEFINE_PROP_BOOL("pt", X86IOMMUState, pt_supported, true), > DEFINE_PROP_END_OF_LIST(), > }; > > diff --git a/include/hw/i386/x86-iommu.h b/include/hw/i386/x86-iommu.h > index 361c07c..ef89c0c 100644 > --- a/include/hw/i386/x86-iommu.h > +++ b/include/hw/i386/x86-iommu.h > @@ -74,6 +74,7 @@ struct X86IOMMUState { > 
SysBusDevice busdev; > bool intr_supported; /* Whether vIOMMU supports IR */ > bool dt_supported; /* Whether vIOMMU supports DT */ > + bool pt_supported; /* Whether vIOMMU supports pass-through */ > IommuType type; /* IOMMU type - AMD/Intel */ > QLIST_HEAD(, IEC_Notifier) iec_notifiers; /* IEC notify list */ > };
On Thu, May 11, 2017 at 04:31:40PM +0800, Jason Wang wrote: > > > On 2017年05月10日 16:01, Peter Xu wrote: > >Hardware support for VT-d device passthrough. Although current Linux can > >live with iommu=pt even without this, but this is faster than when using > >software passthrough. > > > >Signed-off-by: Peter Xu <peterx@redhat.com> > >--- > > hw/i386/intel_iommu.c | 210 ++++++++++++++++++++++++++++++++--------- > > hw/i386/intel_iommu_internal.h | 1 + > > hw/i386/trace-events | 2 + > > hw/i386/x86-iommu.c | 1 + > > include/hw/i386/x86-iommu.h | 1 + > > 5 files changed, 171 insertions(+), 44 deletions(-) > > > >diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c > >index 1a7eba2..1d034f9 100644 > >--- a/hw/i386/intel_iommu.c > >+++ b/hw/i386/intel_iommu.c > >@@ -640,6 +640,29 @@ static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level) > > } > > } > >+/* Find the VTD address space associated with a given bus number */ > >+static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num) > >+{ > >+ VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num]; > >+ if (!vtd_bus) { > >+ /* > >+ * Iterate over the registered buses to find the one which > >+ * currently hold this bus number, and update the bus_num > >+ * lookup table: > >+ */ > >+ GHashTableIter iter; > >+ > >+ g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); > >+ while (g_hash_table_iter_next (&iter, NULL, (void**)&vtd_bus)) { > >+ if (pci_bus_num(vtd_bus->bus) == bus_num) { > >+ s->vtd_as_by_bus_num[bus_num] = vtd_bus; > >+ return vtd_bus; > >+ } > >+ } > >+ } > >+ return vtd_bus; > >+} > >+ > > /* Given the @iova, get relevant @slptep. @slpte_level will be the last level > > * of the translation, can be used for deciding the size of large page. 
> > */ > >@@ -881,6 +904,11 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, > > type_fail = true; > > } > > break; > >+ case VTD_CONTEXT_TT_PASS_THROUGH: > >+ if (!x86_iommu->pt_supported) { > >+ type_fail = true; > >+ } > >+ break; > > default: > > /* Unknwon type */ > > type_fail = true; > >@@ -894,6 +922,84 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, > > return 0; > > } > >+/* > >+ * Fetch translation type for specific device. Returns <0 if error > >+ * happens, otherwise return the shifted type to check against > >+ * VTD_CONTEXT_TT_*. > >+ */ > >+static int vtd_dev_get_trans_type(VTDAddressSpace *as) > >+{ > >+ IntelIOMMUState *s; > >+ VTDContextEntry ce; > >+ int ret; > >+ > >+ s = as->iommu_state; > >+ > >+ ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus), > >+ as->devfn, &ce); > >+ if (ret) { > >+ return ret; > >+ } > >+ > >+ return vtd_ce_get_type(&ce); > >+} > >+ > >+static bool vtd_dev_pt_enabled(VTDAddressSpace *as) > >+{ > >+ int ret; > >+ > >+ assert(as); > >+ > >+ ret = vtd_dev_get_trans_type(as); > >+ if (ret < 0) { > >+ /* > >+ * Possibly failed to parse the context entry for some reason > >+ * (e.g., during init, or any guest configuration errors on > >+ * context entries). We should assume PT not enabled for > >+ * safety. > >+ */ > >+ return false; > >+ } > >+ > >+ return ret == VTD_CONTEXT_TT_PASS_THROUGH; > >+} > >+ > >+/* > >+ * When we are during init phase (device realizations, global > >+ * enable/disable of translations), we should not detect PT > >+ * (passthrough) when switching address spaces. In that cases, we > >+ * should set `detect_pt' to false. > >+ * > >+ * Return whether the device is using IOMMU translation. > >+ */ > >+static bool vtd_switch_address_space(VTDAddressSpace *as, bool detect_pt) > >+{ > > The detect_pt looks suspicious. E.g if the context entry does not exist, > vtd_dev_pt_enabled() will return false. 
I forgot why I added that, even after reading the comments I wrote. I blame too many context switches in my brain recently. :( (this is an excuse of mine :) I did some tests and I see nothing wrong with not hacking on this bit. I will remove that in the next version, at least until I remember why it was needed. And I will try to add more detailed comments in the future. > > >+ bool use_iommu; > >+ > >+ assert(as); > >+ > >+ use_iommu = as->iommu_state->dmar_enabled; > >+ if (detect_pt) { > >+ use_iommu &= !vtd_dev_pt_enabled(as); > >+ } > >+ > >+ trace_vtd_switch_address_space(pci_bus_num(as->bus), > >+ VTD_PCI_SLOT(as->devfn), > >+ VTD_PCI_FUNC(as->devfn), > >+ use_iommu); > >+ > >+ /* Turn off first then on the other */ > >+ if (use_iommu) { > >+ memory_region_set_enabled(&as->sys_alias, false); > >+ memory_region_set_enabled(&as->iommu, true); > >+ } else { > >+ memory_region_set_enabled(&as->iommu, false); > >+ memory_region_set_enabled(&as->sys_alias, true); > >+ } > >+ > >+ return use_iommu; > >+} > >+ > > static inline uint16_t vtd_make_source_id(uint8_t bus_num, uint8_t devfn) > > { > > return ((bus_num & 0xffUL) << 8) | (devfn & 0xffUL); > >@@ -931,6 +1037,31 @@ static inline bool vtd_is_interrupt_addr(hwaddr addr) > > return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= VTD_INTERRUPT_ADDR_LAST; > > } > >+static void vtd_pt_enable_fast_path(IntelIOMMUState *s, uint16_t source_id) > >+{ > >+ VTDBus *vtd_bus; > >+ VTDAddressSpace *vtd_as; > >+ const char *msg = "FAIL"; > >+ > >+ vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id)); > >+ if (!vtd_bus) { > >+ goto out; > >+ } > >+ > >+ vtd_as = vtd_bus->dev_as[VTD_SID_TO_DEVFN(source_id)]; > >+ if (!vtd_as) { > >+ goto out; > >+ } > >+ > >+ if (vtd_switch_address_space(vtd_as, true) == false) { > >+ /* We switched off IOMMU region successfully. */ > >+ msg = "SUCCESS"; > >+ } > >+ > >+out: > >+ trace_vtd_pt_enable_fast_path(source_id, msg); > > Looks like using a boolean is better here. Sure.
> > >+} > >+ > > /* Map dev to context-entry then do a paging-structures walk to do a iommu > > * translation. > > * > >@@ -1002,6 +1133,30 @@ static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, > > cc_entry->context_cache_gen = s->context_cache_gen; > > } > >+ /* > >+ * We don't need to translate for pass-through context entries. > >+ * Also, let's ignore IOTLB caching as well for PT devices. > >+ */ > >+ if (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH) { > >+ entry->translated_addr = entry->iova; > >+ entry->addr_mask = VTD_PAGE_SIZE - 1; > >+ entry->perm = IOMMU_RW; > >+ trace_vtd_translate_pt(source_id, entry->iova); > >+ > >+ /* > >+ * When this happens, it means firstly caching-mode is not > >+ * enabled, and this is the first passthrough translation for > >+ * the device. Let's enable the fast path for passthrough. > >+ * > >+ * When passthrough is disabled again for the device, we can > >+ * capture it via the context entry invalidation, then the > >+ * IOMMU region can be swapped back. 
> >+ */ > >+ vtd_pt_enable_fast_path(s, source_id); > >+ > >+ return; > >+ } > >+ > > ret_fr = vtd_iova_to_slpte(&ce, addr, is_write, &slpte, &level, > > &reads, &writes); > > if (ret_fr) { > >@@ -1081,29 +1236,6 @@ static void vtd_context_global_invalidate(IntelIOMMUState *s) > > vtd_iommu_replay_all(s); > > } > >- > >-/* Find the VTD address space currently associated with a given bus number, > >- */ > >-static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num) > >-{ > >- VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num]; > >- if (!vtd_bus) { > >- /* Iterate over the registered buses to find the one > >- * which currently hold this bus number, and update the bus_num lookup table: > >- */ > >- GHashTableIter iter; > >- > >- g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); > >- while (g_hash_table_iter_next (&iter, NULL, (void**)&vtd_bus)) { > >- if (pci_bus_num(vtd_bus->bus) == bus_num) { > >- s->vtd_as_by_bus_num[bus_num] = vtd_bus; > >- return vtd_bus; > >- } > >- } > >- } > >- return vtd_bus; > >-} > >- > > /* Do a context-cache device-selective invalidation. > > * @func_mask: FM field after shifting > > */ > >@@ -1146,6 +1278,11 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s, > > VTD_PCI_FUNC(devfn_it)); > > vtd_as->context_cache_entry.context_cache_gen = 0; > > /* > >+ * Do switch address space when needed, in case if the > >+ * device passthrough bit is switched. > >+ */ > >+ vtd_switch_address_space(vtd_as, true); > > Do we need to do this also in DSI and GLOBAL invalidation? Yes, though this should be optional, at least for Linux; I will add that later. Thanks!
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 1a7eba2..1d034f9 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -640,6 +640,29 @@ static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level) } } +/* Find the VTD address space associated with a given bus number */ +static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num) +{ + VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num]; + if (!vtd_bus) { + /* + * Iterate over the registered buses to find the one which + * currently hold this bus number, and update the bus_num + * lookup table: + */ + GHashTableIter iter; + + g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); + while (g_hash_table_iter_next (&iter, NULL, (void**)&vtd_bus)) { + if (pci_bus_num(vtd_bus->bus) == bus_num) { + s->vtd_as_by_bus_num[bus_num] = vtd_bus; + return vtd_bus; + } + } + } + return vtd_bus; +} + /* Given the @iova, get relevant @slptep. @slpte_level will be the last level * of the translation, can be used for deciding the size of large page. */ @@ -881,6 +904,11 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, type_fail = true; } break; + case VTD_CONTEXT_TT_PASS_THROUGH: + if (!x86_iommu->pt_supported) { + type_fail = true; + } + break; default: /* Unknwon type */ type_fail = true; @@ -894,6 +922,84 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, return 0; } +/* + * Fetch translation type for specific device. Returns <0 if error + * happens, otherwise return the shifted type to check against + * VTD_CONTEXT_TT_*. 
+ */ +static int vtd_dev_get_trans_type(VTDAddressSpace *as) +{ + IntelIOMMUState *s; + VTDContextEntry ce; + int ret; + + s = as->iommu_state; + + ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus), + as->devfn, &ce); + if (ret) { + return ret; + } + + return vtd_ce_get_type(&ce); +} + +static bool vtd_dev_pt_enabled(VTDAddressSpace *as) +{ + int ret; + + assert(as); + + ret = vtd_dev_get_trans_type(as); + if (ret < 0) { + /* + * Possibly failed to parse the context entry for some reason + * (e.g., during init, or any guest configuration errors on + * context entries). We should assume PT not enabled for + * safety. + */ + return false; + } + + return ret == VTD_CONTEXT_TT_PASS_THROUGH; +} + +/* + * When we are during init phase (device realizations, global + * enable/disable of translations), we should not detect PT + * (passthrough) when switching address spaces. In that cases, we + * should set `detect_pt' to false. + * + * Return whether the device is using IOMMU translation. + */ +static bool vtd_switch_address_space(VTDAddressSpace *as, bool detect_pt) +{ + bool use_iommu; + + assert(as); + + use_iommu = as->iommu_state->dmar_enabled; + if (detect_pt) { + use_iommu &= !vtd_dev_pt_enabled(as); + } + + trace_vtd_switch_address_space(pci_bus_num(as->bus), + VTD_PCI_SLOT(as->devfn), + VTD_PCI_FUNC(as->devfn), + use_iommu); + + /* Turn off first then on the other */ + if (use_iommu) { + memory_region_set_enabled(&as->sys_alias, false); + memory_region_set_enabled(&as->iommu, true); + } else { + memory_region_set_enabled(&as->iommu, false); + memory_region_set_enabled(&as->sys_alias, true); + } + + return use_iommu; +} + static inline uint16_t vtd_make_source_id(uint8_t bus_num, uint8_t devfn) { return ((bus_num & 0xffUL) << 8) | (devfn & 0xffUL); @@ -931,6 +1037,31 @@ static inline bool vtd_is_interrupt_addr(hwaddr addr) return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= VTD_INTERRUPT_ADDR_LAST; } +static void vtd_pt_enable_fast_path(IntelIOMMUState *s, 
uint16_t source_id) +{ + VTDBus *vtd_bus; + VTDAddressSpace *vtd_as; + const char *msg = "FAIL"; + + vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id)); + if (!vtd_bus) { + goto out; + } + + vtd_as = vtd_bus->dev_as[VTD_SID_TO_DEVFN(source_id)]; + if (!vtd_as) { + goto out; + } + + if (vtd_switch_address_space(vtd_as, true) == false) { + /* We switched off IOMMU region successfully. */ + msg = "SUCCESS"; + } + +out: + trace_vtd_pt_enable_fast_path(source_id, msg); +} + /* Map dev to context-entry then do a paging-structures walk to do a iommu * translation. * @@ -1002,6 +1133,30 @@ static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, cc_entry->context_cache_gen = s->context_cache_gen; } + /* + * We don't need to translate for pass-through context entries. + * Also, let's ignore IOTLB caching as well for PT devices. + */ + if (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH) { + entry->translated_addr = entry->iova; + entry->addr_mask = VTD_PAGE_SIZE - 1; + entry->perm = IOMMU_RW; + trace_vtd_translate_pt(source_id, entry->iova); + + /* + * When this happens, it means firstly caching-mode is not + * enabled, and this is the first passthrough translation for + * the device. Let's enable the fast path for passthrough. + * + * When passthrough is disabled again for the device, we can + * capture it via the context entry invalidation, then the + * IOMMU region can be swapped back. 
+ */ + vtd_pt_enable_fast_path(s, source_id); + + return; + } + ret_fr = vtd_iova_to_slpte(&ce, addr, is_write, &slpte, &level, &reads, &writes); if (ret_fr) { @@ -1081,29 +1236,6 @@ static void vtd_context_global_invalidate(IntelIOMMUState *s) vtd_iommu_replay_all(s); } - -/* Find the VTD address space currently associated with a given bus number, - */ -static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num) -{ - VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num]; - if (!vtd_bus) { - /* Iterate over the registered buses to find the one - * which currently hold this bus number, and update the bus_num lookup table: - */ - GHashTableIter iter; - - g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); - while (g_hash_table_iter_next (&iter, NULL, (void**)&vtd_bus)) { - if (pci_bus_num(vtd_bus->bus) == bus_num) { - s->vtd_as_by_bus_num[bus_num] = vtd_bus; - return vtd_bus; - } - } - } - return vtd_bus; -} - /* Do a context-cache device-selective invalidation. * @func_mask: FM field after shifting */ @@ -1146,6 +1278,11 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s, VTD_PCI_FUNC(devfn_it)); vtd_as->context_cache_entry.context_cache_gen = 0; /* + * Do switch address space when needed, in case if the + * device passthrough bit is switched. + */ + vtd_switch_address_space(vtd_as, true); + /* * So a device is moving out of (or moving into) a * domain, a replay() suites here to notify all the * IOMMU_NOTIFIER_MAP registers about this change. 
@@ -1377,25 +1514,6 @@ static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s) vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS); } -static void vtd_switch_address_space(VTDAddressSpace *as) -{ - assert(as); - - trace_vtd_switch_address_space(pci_bus_num(as->bus), - VTD_PCI_SLOT(as->devfn), - VTD_PCI_FUNC(as->devfn), - as->iommu_state->dmar_enabled); - - /* Turn off first then on the other */ - if (as->iommu_state->dmar_enabled) { - memory_region_set_enabled(&as->sys_alias, false); - memory_region_set_enabled(&as->iommu, true); - } else { - memory_region_set_enabled(&as->iommu, false); - memory_region_set_enabled(&as->sys_alias, true); - } -} - static void vtd_switch_address_space_all(IntelIOMMUState *s) { GHashTableIter iter; @@ -1408,7 +1526,7 @@ static void vtd_switch_address_space_all(IntelIOMMUState *s) if (!vtd_bus->dev_as[i]) { continue; } - vtd_switch_address_space(vtd_bus->dev_as[i]); + vtd_switch_address_space(vtd_bus->dev_as[i], false); } } } @@ -2712,7 +2830,7 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn) &vtd_dev_as->sys_alias, 1); memory_region_add_subregion_overlap(&vtd_dev_as->root, 0, &vtd_dev_as->iommu, 1); - vtd_switch_address_space(vtd_dev_as); + vtd_switch_address_space(vtd_dev_as, false); } return vtd_dev_as; } @@ -2860,6 +2978,10 @@ static void vtd_init(IntelIOMMUState *s) s->ecap |= VTD_ECAP_DT; } + if (x86_iommu->pt_supported) { + s->ecap |= VTD_ECAP_PT; + } + if (s->caching_mode) { s->cap |= VTD_CAP_CM; } diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h index 29d6707..0e73a65 100644 --- a/hw/i386/intel_iommu_internal.h +++ b/hw/i386/intel_iommu_internal.h @@ -187,6 +187,7 @@ /* Interrupt Remapping support */ #define VTD_ECAP_IR (1ULL << 3) #define VTD_ECAP_EIM (1ULL << 4) +#define VTD_ECAP_PT (1ULL << 6) #define VTD_ECAP_MHMV (15ULL << 20) /* CAP_REG */ diff --git a/hw/i386/trace-events b/hw/i386/trace-events index 04a6980..5c3e466 100644 --- a/hw/i386/trace-events +++ 
b/hw/i386/trace-events @@ -38,6 +38,8 @@ vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"P vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set" vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)" vtd_as_unmap_whole(uint8_t bus, uint8_t slot, uint8_t fn, uint64_t iova, uint64_t size) "Device %02x:%02x.%x start 0x%"PRIx64" size 0x%"PRIx64 +vtd_translate_pt(uint16_t sid, uint64_t addr) "source id 0x%"PRIu16", iova 0x%"PRIx64 +vtd_pt_enable_fast_path(uint16_t sid, const char *msg) "sid 0x%"PRIu16" %s" # hw/i386/amd_iommu.c amdvi_evntlog_fail(uint64_t addr, uint32_t head) "error: fail to write at addr 0x%"PRIx64" + offset 0x%"PRIx32 diff --git a/hw/i386/x86-iommu.c b/hw/i386/x86-iommu.c index 02b8825..293caf8 100644 --- a/hw/i386/x86-iommu.c +++ b/hw/i386/x86-iommu.c @@ -91,6 +91,7 @@ static void x86_iommu_realize(DeviceState *dev, Error **errp) static Property x86_iommu_properties[] = { DEFINE_PROP_BOOL("intremap", X86IOMMUState, intr_supported, false), DEFINE_PROP_BOOL("device-iotlb", X86IOMMUState, dt_supported, false), + DEFINE_PROP_BOOL("pt", X86IOMMUState, pt_supported, true), DEFINE_PROP_END_OF_LIST(), }; diff --git a/include/hw/i386/x86-iommu.h b/include/hw/i386/x86-iommu.h index 361c07c..ef89c0c 100644 --- a/include/hw/i386/x86-iommu.h +++ b/include/hw/i386/x86-iommu.h @@ -74,6 +74,7 @@ struct X86IOMMUState { SysBusDevice busdev; bool intr_supported; /* Whether vIOMMU supports IR */ bool dt_supported; /* Whether vIOMMU supports DT */ + bool pt_supported; /* Whether vIOMMU supports pass-through */ IommuType type; /* IOMMU type - AMD/Intel */ QLIST_HEAD(, IEC_Notifier) iec_notifiers; /* IEC notify list */ };
Hardware support for VT-d device passthrough. Although current Linux can live with iommu=pt even without this, hardware passthrough is faster than software passthrough. Signed-off-by: Peter Xu <peterx@redhat.com> --- hw/i386/intel_iommu.c | 210 ++++++++++++++++++++++++++++++++--------- hw/i386/intel_iommu_internal.h | 1 + hw/i386/trace-events | 2 + hw/i386/x86-iommu.c | 1 + include/hw/i386/x86-iommu.h | 1 + 5 files changed, 171 insertions(+), 44 deletions(-)