Message ID | 1349962023-560-4-git-send-email-avi@redhat.com
---|---
State | New
On 10/11/2012 03:26 PM, Avi Kivity wrote:
> Add a new memory region type that translates addresses it is given,
> then forwards them to a target address space.  This is similar to
> an alias, except that the mapping is more flexible than a linear
> translation and truncation, and also less efficient since the
> translation happens at runtime.
>
> The implementation uses an AddressSpace mapping the target region to
> avoid hierarchical dispatch all the way to the resolved region; only
> iommu regions are looked up dynamically.
>
> [...]
>
> @@ -3563,23 +3564,42 @@ void *address_space_map(AddressSpace *as,
> [...]
> +        while (section->mr->iommu_ops) {
> +            iotlb = section->mr->iommu_ops->translate(section->mr, addr, is_write);

should be using xlat here, or this fails the second time around.

> [...]
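For illustration, this is how the inner lookup loop might read with that change applied — my reading of the comment above, not code from the posted patch:

        /* Hypothetical corrected loop: translate the already-translated
         * address (xlat) each time around, so a second IOMMU in the path
         * sees the output of the first rather than the original address. */
        while (section->mr->iommu_ops) {
            iotlb = section->mr->iommu_ops->translate(section->mr, xlat, is_write);
            if (iotlb.valid) {
                xlat = ((iotlb.translated_addr & ~iotlb.addr_mask)
                        | (xlat & iotlb.addr_mask));
                as_xlat = section->mr->iommu_target_as;
                l = (MIN(xlat + l - 1, xlat | iotlb.addr_mask) - xlat) + 1;
                section = phys_page_find(as_xlat->dispatch, xlat >> TARGET_PAGE_BITS);
            } else {
                section = &phys_sections[phys_section_unassigned];
            }
        }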
On Thu, 2012-10-11 at 15:42 +0200, Paolo Bonzini wrote:
> Il 11/10/2012 15:26, Avi Kivity ha scritto:
> > +struct MemoryRegionIOMMUOps {
> > +    /* Returns a TLB entry that contains a given address. */
> > +    IOMMUTLBEntry (*translate)(MemoryRegion *iommu, target_phys_addr_t addr,
> > +                               bool is_write);
> > +};
>
> Do map/unmap still make sense in this model?  Ben & David, what were
> your plans there?

To keep it under the rug for as long as we could ? :-)

The problem with map and unmap is invalidations: how do you convey to
the devices having done a map that the guest has invalidated a
translation entry?  Especially nasty on PAPR, where the invalidation is
an hcall which is assumed to be synchronous.

We have simply not solved the problem for now.  The risk due to a
possible access beyond the end of life of a translation is negligible
as long as we are not playing funny mapping tricks with emulated
devices (which we might do with some in the future... but not today),
and the scope of the problem is limited to the guest corrupting itself.

Cheers,
Ben.
On Thu, 2012-10-11 at 15:57 +0200, Avi Kivity wrote:
> >> Map/unmap is supported via address_space_map(), which calls
> >> ->translate().  I don't see how a lower-level map/unmap helps, unless
> >> the hardware supplies such a function.
> >
> > Yep, it's just the map/unmap callbacks that are not supported anymore,
> > but nobody uses that feature of DMAContext yet.
>
> What do those callbacks even mean?

Well, the unmap callback was meant for notifying the device that did a
map() that the iommu has invalidated part of that mapping.

The rough idea was that the actual invalidations would be delayed until
all "previous" maps have gone away, which works fine without callbacks
for transient maps (packet buffers, etc.) but doesn't for long-lived
ones.

So in addition, we would call that callback for devices that own
long-lived maps, asking them to dispose of them (and eventually re-try
them, which might or might not fail depending on why the invalidation
occurred in the first place).

The invalidation would still be delayed until the last old map has gone
away, so it's not a synchronous callback, more like a notification to
the device to wake up & do something.

But in the latest patches that went in, because the whole scheme was too
complex and not really that useful, I ripped out the whole map tracking
etc...  I kept the unmap callback API there in case we want to re-do it
more sanely.

When emulating HW iommus, the "invalidation not complete" state is easy
to report asynchronously to the guest via a status bit that the guest is
supposedly polling after doing an invalidation request.

On something like synchronous hcalls (PAPR), the idea was to delay the
hcall completion by suspending the cpu that issued it.

A lot of pain for what is essentially a corner case that doesn't happen
in practice... unless we start doing mapping games.

By mapping games, I mean having an emulated device MMIO space being
mapped into user space in a way where the kernel might change the
mapping "live" (for example to point to backup memory as it migrates
things away, etc...).  This kind of stuff typically happens with
graphics, where graphic objects can move between memory and vram.

Cheers,
Ben
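To make the scheme described above a little more concrete, a purely hypothetical notifier interface (none of these names exist in the tree, and the map-tracking code it assumes was removed) could look like:

/* Hypothetical notifier registered by devices that hold long-lived maps.
 * The IOMMU model would call invalidate() to ask the device to drop (and
 * later retry) its mapping, and would only report "invalidation complete"
 * to the guest -- via a status bit, or the hcall return on PAPR -- once
 * the last outstanding map covering the range had actually gone away. */
typedef struct IOMMUMapNotifier {
    void (*invalidate)(void *opaque, target_phys_addr_t addr,
                       target_phys_addr_t len);
    void *opaque;
} IOMMUMapNotifier;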
On Fri, Oct 12, 2012 at 2:45 AM, Benjamin Herrenschmidt
<benh@kernel.crashing.org> wrote:
> On Thu, 2012-10-11 at 15:42 +0200, Paolo Bonzini wrote:
>> Il 11/10/2012 15:26, Avi Kivity ha scritto:
>> > +struct MemoryRegionIOMMUOps {
>> > +    /* Returns a TLB entry that contains a given address. */
>> > +    IOMMUTLBEntry (*translate)(MemoryRegion *iommu, target_phys_addr_t addr,
>> > +                               bool is_write);
>> > +};
>>
>> Do map/unmap still make sense in this model?  Ben & David, what were
>> your plans there?
>
> To keep it under the rug for as long as we could ? :-)
>
> The problem with map and unmap is invalidations.  How do you convey to
> the devices having done a map that the guest has invalidated a
> translation entry.

Also in Sparc32, IOMMU uses a table in RAM which the guest can change,
so a callback to update the translation tables should be available.  On
Sparc64 there's IOTLB but also a fallback to TSB translation table in
memory.  We could rely on the guest issuing demaps/flushes when the
memory changes and invalidate the translations then.

> Especially nasty on PAPR where the invalidation is a hcall which is
> assumed to be synchronous.
>
> We have simply not solved the problem for now.  The risk due to the
> possible access beyond the end of life of a translation is negligible
> as long as we are not playing funny mapping tricks with emulated
> devices (which we might do with some in the future... but not today)
> and the scope of the problem is limited to the guest corrupting itself.
>
> Cheers,
> Ben.
On Sat, 2012-10-13 at 09:30 +0000, Blue Swirl wrote:
> > The problem with map and unmap is invalidations.  How do you convey to
> > the devices having done a map that the guest has invalidated a
> > translation entry.
>
> Also in Sparc32, IOMMU uses a table in RAM which the guest can change,
> so a callback to update the translation tables should be available.  On
> Sparc64 there's IOTLB but also a fallback to TSB translation table in
> memory.  We could rely on the guest issuing demaps/flushes when the
> memory changes and invalidate the translations then.

Right, the table's in memory on power too, but such tables generally
also have a cache (TLB) with some MMIO-based logic to perform
invalidations.  Typically that logic involves a bit to perform a TLB
kill and some status bit to read back to get confirmation that the
flush is completed.  In that case we can probably delay that latter
status bit until all the maps we kept track of are gone... but that
means tracking them, which is expensive.

Also the IBM iommus are nasty here... some of them, if we ever emulate
them, actually participate in the fabric coherency protocol and thus
don't require an explicit MMIO for invalidations.  So if we were to
emulate such HW we would have to intercept accesses to the portion of
RAM that is configured as an iommu table.

Thankfully we only emulate those machines as "paravirt" with a
hypervisor interface to the iommu (aka TCEs), so we are fine for now.
Also, if we ever emulate the real HW, well, the latter models don't do
that anymore (but their MMIO for killing the cache doesn't have a
status bit either; the assumption is that the latency of a simple read
back is enough).

Overall, a bloody can of worms... under the rug sounds like a nice
place to leave it for now :-)

Cheers,
Ben.
On 10/12/2012 04:51 AM, Benjamin Herrenschmidt wrote:
> On Thu, 2012-10-11 at 15:57 +0200, Avi Kivity wrote:
>> >> Map/unmap is supported via address_space_map(), which calls
>> >> ->translate().  I don't see how a lower-level map/unmap helps, unless
>> >> the hardware supplies such a function.
>> >
>> > Yep, it's just the map/unmap callbacks that are not supported anymore,
>> > but nobody uses that feature of DMAContext yet.
>>
>> What do those callbacks even mean?
>
> Well, the unmap callback was meant for notifying the device that did a
> map() that the iommu has invalidated part of that mapping.
>
> The rough idea was that the actual invalidations would be delayed until
> all "previous" maps have gone away, which works fine without callbacks
> for transient maps (packet buffers, etc.) but doesn't for long-lived
> ones.

Something like the kernel's kvm_read_guest_cached()?  You can then
invalidate translations by incrementing a generation counter.  Problem
is you don't get a simple pointer, but rather something with an API
that needs to be used.

> So in addition, we would call that callback for devices that own
> long-lived maps, asking them to dispose of them (and eventually re-try
> them, which might or might not fail depending on why the invalidation
> occurred in the first place).
>
> The invalidation would still be delayed until the last old map has gone
> away, so it's not a synchronous callback, more like a notification to
> the device to wake up & do something.
>
> But in the latest patches that went in, because the whole scheme was too
> complex and not really that useful, I ripped out the whole map tracking
> etc...  I kept the unmap callback API there in case we want to re-do it
> more sanely.
>
> When emulating HW iommus, the "invalidation not complete" state is easy
> to report asynchronously to the guest via a status bit that the guest is
> supposedly polling after doing an invalidation request.
>
> On something like synchronous hcalls (PAPR), the idea was to delay the
> hcall completion by suspending the cpu that issued it.

With the above, you can synchronize using synchronize_rcu().

> A lot of pain for what is essentially a corner case that doesn't happen
> in practice... unless we start doing mapping games.
>
> By mapping games, I mean having an emulated device MMIO space being
> mapped into user space in a way where the kernel might change the
> mapping "live" (for example to point to backup memory as it migrates
> things away, etc...).  This kind of stuff typically happens with
> graphics, where graphic objects can move between memory and vram.

Sounds nasty.  In general we try to avoid special cases during
migration, it's fragile enough.
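As a rough illustration of the generation-counter idea mentioned above (in the spirit of kvm_read_guest_cached(); the structure and helper below are invented for illustration and do not exist in the tree):

typedef struct IOMMUMapCache {
    target_phys_addr_t addr;   /* device address the cached mapping covers */
    void *host;                /* cached host pointer, NULL if not mapped yet */
    unsigned generation;       /* generation observed when the map was made */
} IOMMUMapCache;

/* Return the cached pointer if it is still current, NULL if the caller must
 * re-translate and re-map.  The IOMMU model would bump current_generation on
 * every invalidation, so stale maps are refused without any callback. */
static void *iommu_cached_ptr(IOMMUMapCache *cache, target_phys_addr_t addr,
                              unsigned current_generation)
{
    if (cache->host && cache->addr == addr
        && cache->generation == current_generation) {
        return cache->host;
    }
    return NULL;
}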
diff --git a/exec.c b/exec.c
index 328753d..13df16c 100644
--- a/exec.c
+++ b/exec.c
@@ -3498,6 +3498,7 @@ void cpu_physical_memory_write_rom(target_phys_addr_t addr,
 
 typedef struct {
     void *buffer;
+    AddressSpace *as;
     target_phys_addr_t addr;
     target_phys_addr_t len;
 } BounceBuffer;
@@ -3563,23 +3564,42 @@ void *address_space_map(AddressSpace *as,
     ram_addr_t raddr = RAM_ADDR_MAX;
     ram_addr_t rlen;
     void *ret;
+    IOMMUTLBEntry iotlb;
+    target_phys_addr_t xlat;
+    AddressSpace *as_xlat;
 
     while (len > 0) {
+        xlat = addr;
+        as_xlat = as;
         page = addr & TARGET_PAGE_MASK;
         l = (page + TARGET_PAGE_SIZE) - addr;
         if (l > len)
             l = len;
         section = phys_page_find(d, page >> TARGET_PAGE_BITS);
 
+        while (section->mr->iommu_ops) {
+            iotlb = section->mr->iommu_ops->translate(section->mr, addr, is_write);
+            if (iotlb.valid) {
+                xlat = ((iotlb.translated_addr & ~iotlb.addr_mask)
+                        | (addr & iotlb.addr_mask));
+                as_xlat = section->mr->iommu_target_as;
+                l = (MIN(xlat + l - 1, xlat | iotlb.addr_mask) - xlat) + 1;
+                section = phys_page_find(as_xlat->dispatch, xlat >> TARGET_PAGE_BITS);
+            } else {
+                section = &phys_sections[phys_section_unassigned];
+            }
+        }
+
         if (!(memory_region_is_ram(section->mr) && !section->readonly)) {
             if (todo || bounce.buffer) {
                 break;
             }
             bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, TARGET_PAGE_SIZE);
-            bounce.addr = addr;
+            bounce.addr = xlat;
+            bounce.as = as_xlat;
             bounce.len = l;
             if (!is_write) {
-                address_space_read(as, addr, bounce.buffer, l);
+                address_space_read(as_xlat, xlat, bounce.buffer, l);
             }
 
             *plen = l;
@@ -3587,7 +3607,7 @@ void *address_space_map(AddressSpace *as,
         }
         if (!todo) {
             raddr = memory_region_get_ram_addr(section->mr)
-                + memory_region_section_addr(section, addr);
+                + memory_region_section_addr(section, xlat);
         }
 
         len -= l;
@@ -3632,7 +3652,7 @@ void address_space_unmap(AddressSpace *as, void *buffer, target_phys_addr_t len,
         return;
     }
     if (is_write) {
-        address_space_write(as, bounce.addr, bounce.buffer, access_len);
+        address_space_write(bounce.as, bounce.addr, bounce.buffer, access_len);
     }
     qemu_vfree(bounce.buffer);
     bounce.buffer = NULL;
diff --git a/memory.c b/memory.c
index 5df6177..1d92bb8 100644
--- a/memory.c
+++ b/memory.c
@@ -775,6 +775,12 @@ static void memory_region_destructor_rom_device(MemoryRegion *mr)
     qemu_ram_free(mr->ram_addr & TARGET_PAGE_MASK);
 }
 
+static void memory_region_destructor_iommu(MemoryRegion *mr)
+{
+    address_space_destroy(mr->iommu_target_as);
+    g_free(mr->iommu_target_as);
+}
+
 static bool memory_region_wrong_endianness(MemoryRegion *mr)
 {
 #ifdef TARGET_WORDS_BIGENDIAN
@@ -789,6 +795,7 @@ void memory_region_init(MemoryRegion *mr,
                         uint64_t size)
 {
     mr->ops = NULL;
+    mr->iommu_ops = NULL;
     mr->parent = NULL;
     mr->size = int128_make64(size);
     if (size == UINT64_MAX) {
@@ -980,6 +987,101 @@ void memory_region_init_rom_device(MemoryRegion *mr,
     mr->ram_addr = qemu_ram_alloc(size, mr);
 }
 
+static void memory_region_iommu_rw(MemoryRegion *iommu, target_phys_addr_t addr,
+                                   uint8_t *buf, unsigned len, bool is_write)
+{
+    IOMMUTLBEntry tlb;
+    unsigned clen;
+    target_phys_addr_t xlat;
+
+    while (len) {
+        tlb = iommu->iommu_ops->translate(iommu, addr, is_write);
+        clen = (MIN(addr | tlb.addr_mask, addr + len - 1) - addr) + 1;
+        if (tlb.valid) {
+            xlat = (tlb.translated_addr & ~tlb.addr_mask) | (addr & tlb.addr_mask);
+            address_space_rw(iommu->iommu_target_as, xlat, buf, clen, is_write);
+        } else {
+            if (!is_write) {
+                memset(buf, 0xff, clen);
+            }
+        }
+        buf += clen;
+        addr += clen;
+        len -= clen;
+    }
+}
+
+static uint64_t memory_region_iommu_read(void *opaque, target_phys_addr_t addr,
+                                         unsigned size)
+{
+    MemoryRegion *iommu = opaque;
+    union {
+        uint8_t buf[8];
+        uint8_t u8;
+        uint16_t u16;
+        uint32_t u32;
+        uint64_t u64;
+    } ret;
+
+    memory_region_iommu_rw(iommu, addr, ret.buf, size, false);
+    switch (size) {
+    case 1: return ret.u8;
+    case 2: return ret.u16;
+    case 4: return ret.u32;
+    case 8: return ret.u64;
+    default: abort();
+    }
+}
+
+static void memory_region_iommu_write(void *opaque, target_phys_addr_t addr,
+                                      uint64_t data, unsigned size)
+{
+    MemoryRegion *iommu = opaque;
+    union {
+        uint8_t buf[8];
+        uint8_t u8;
+        uint16_t u16;
+        uint32_t u32;
+        uint64_t u64;
+    } in;
+
+    switch (size) {
+    case 1: in.u8 = data; break;
+    case 2: in.u16 = data; break;
+    case 4: in.u32 = data; break;
+    case 8: in.u64 = data; break;
+    default: abort();
+    }
+    memory_region_iommu_rw(iommu, addr, in.buf, size, true);
+}
+
+static MemoryRegionOps memory_region_iommu_ops = {
+    .read = memory_region_iommu_read,
+    .write = memory_region_iommu_write,
+#ifdef HOST_BIGENDIAN
+    .endianness = DEVICE_BIG_ENDIAN,
+#else
+    .endianness = DEVICE_LITTLE_ENDIAN,
+#endif
+};
+
+void memory_region_init_iommu(MemoryRegion *mr,
+                              MemoryRegionIOMMUOps *ops,
+                              MemoryRegion *target,
+                              const char *name,
+                              uint64_t size)
+{
+    memory_region_init(mr, name, size);
+    mr->ops = &memory_region_iommu_ops;
+    mr->iommu_ops = ops,
+    mr->opaque = mr;
+    mr->terminates = true;  /* then re-forwards */
+    mr->destructor = memory_region_destructor_iommu;
+    mr->iommu_target = target;
+    mr->iommu_target_as = g_new(AddressSpace, 1);
+    address_space_init(mr->iommu_target_as, target);
+}
+
 static uint64_t invalid_read(void *opaque, target_phys_addr_t addr,
                              unsigned size)
 {
@@ -1053,6 +1155,11 @@ bool memory_region_is_rom(MemoryRegion *mr)
     return mr->ram && mr->readonly;
 }
 
+bool memory_region_is_iommu(MemoryRegion *mr)
+{
+    return mr->iommu_ops;
+}
+
 void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client)
 {
     uint8_t mask = 1 << client;
diff --git a/memory.h b/memory.h
index 79393f1..299d584 100644
--- a/memory.h
+++ b/memory.h
@@ -113,12 +113,29 @@ struct MemoryRegionOps {
     const MemoryRegionMmio old_mmio;
 };
 
+typedef struct IOMMUTLBEntry IOMMUTLBEntry;
+typedef struct MemoryRegionIOMMUOps MemoryRegionIOMMUOps;
+
+struct IOMMUTLBEntry {
+    target_phys_addr_t device_addr;
+    target_phys_addr_t translated_addr;
+    target_phys_addr_t addr_mask;  /* 0xfff = 4k translation */
+    bool valid;
+};
+
+struct MemoryRegionIOMMUOps {
+    /* Returns a TLB entry that contains a given address.  */
+    IOMMUTLBEntry (*translate)(MemoryRegion *iommu, target_phys_addr_t addr,
+                               bool is_write);
+};
+
 typedef struct CoalescedMemoryRange CoalescedMemoryRange;
 typedef struct MemoryRegionIoeventfd MemoryRegionIoeventfd;
 
 struct MemoryRegion {
     /* All fields are private - violators will be prosecuted */
     const MemoryRegionOps *ops;
+    const MemoryRegionIOMMUOps *iommu_ops;
     void *opaque;
     MemoryRegion *parent;
     Int128 size;
@@ -145,6 +162,8 @@ struct MemoryRegion {
     uint8_t dirty_log_mask;
     unsigned ioeventfd_nb;
     MemoryRegionIoeventfd *ioeventfds;
+    MemoryRegion *iommu_target;
+    struct AddressSpace *iommu_target_as;
 };
 
 struct MemoryRegionPortio {
@@ -334,6 +353,24 @@ void memory_region_init_rom_device(MemoryRegion *mr,
 void memory_region_init_reservation(MemoryRegion *mr,
                                     const char *name,
                                     uint64_t size);
+
+/**
+ * memory_region_init_iommu: Initialize a memory region that translates
+ * addresses
+ *
+ * An IOMMU region translates addresses and forwards accesses to a target
+ * memory region.
+ *
+ * @mr: the #MemoryRegion to be initialized
+ * @ops: a function that translates addresses into the @target region
+ * @target: a #MemoryRegion that will be used to satisfy accesses to
+ *          translated addresses
+ * @name: used for debugging; not visible to the user or ABI
+ * @size: size of the region.
+ */
+void memory_region_init_iommu(MemoryRegion *mr,
+                              MemoryRegionIOMMUOps *ops,
+                              MemoryRegion *target,
+                              const char *name,
+                              uint64_t size);
+
 /**
  * memory_region_destroy: Destroy a memory region and reclaim all resources.
  *
@@ -373,6 +410,15 @@ static inline bool memory_region_is_romd(MemoryRegion *mr)
 }
 
 /**
+ * memory_region_is_iommu: check whether a memory region is an iommu
+ *
+ * Returns %true if a memory region is an iommu.
+ *
+ * @mr: the memory region being queried
+ */
+bool memory_region_is_iommu(MemoryRegion *mr);
+
+/**
  * memory_region_name: get a memory region's name
  *
  * Returns the string that was used to initialize the memory region.
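As a concrete example of the addr_mask convention used throughout the patch (values invented, assuming a 4 KiB translation, i.e. addr_mask == 0xfff):

    target_phys_addr_t addr            = 0x12345678;   /* device address */
    target_phys_addr_t translated_addr = 0xabcde000;   /* from translate() */
    target_phys_addr_t addr_mask       = 0xfff;        /* 4 KiB page */

    /* The page frame comes from the TLB entry, the page offset from the
     * original address: */
    target_phys_addr_t xlat = (translated_addr & ~addr_mask)
                              | (addr & addr_mask);
    /* xlat == 0xabcde678 */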
Add a new memory region type that translates addresses it is given,
then forwards them to a target address space.  This is similar to an
alias, except that the mapping is more flexible than a linear
translation and truncation, and also less efficient since the
translation happens at runtime.

The implementation uses an AddressSpace mapping the target region to
avoid hierarchical dispatch all the way to the resolved region; only
iommu regions are looked up dynamically.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 exec.c   |  28 ++++++++++++++---
 memory.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 memory.h |  46 +++++++++++++++++++++++++++
 3 files changed, 177 insertions(+), 4 deletions(-)
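For readers new to this interface, here is a sketch of how a board or device model might use it.  The toy fixed-offset IOMMU and every toy_* name below are invented for illustration; only memory_region_init_iommu(), MemoryRegionIOMMUOps and IOMMUTLBEntry come from the patch itself.

/* A toy IOMMU that maps every 4 KiB page at a fixed 256 MiB offset into the
 * target address space.  A real IOMMU would walk its in-memory table or TLB
 * here and set .valid = false on a miss. */
static IOMMUTLBEntry toy_iommu_translate(MemoryRegion *iommu,
                                         target_phys_addr_t addr,
                                         bool is_write)
{
    IOMMUTLBEntry entry = {
        .device_addr     = addr & ~(target_phys_addr_t)0xfff,
        .translated_addr = (addr + 0x10000000) & ~(target_phys_addr_t)0xfff,
        .addr_mask       = 0xfff,
        .valid           = true,
    };
    return entry;
}

static MemoryRegionIOMMUOps toy_iommu_ops = {
    .translate = toy_iommu_translate,
};

static void toy_iommu_init(MemoryRegion *bus_mr, MemoryRegion *sysmem)
{
    static MemoryRegion iommu_mr;

    /* Accesses that land in iommu_mr are translated by toy_iommu_translate()
     * and then satisfied out of sysmem. */
    memory_region_init_iommu(&iommu_mr, &toy_iommu_ops, sysmem,
                             "toy-iommu", 0x10000000);
    memory_region_add_subregion(bus_mr, 0, &iommu_mr);
}

A device sitting behind the IOMMU would then simply issue its DMA to bus_mr (through an AddressSpace built on it) and have every access translated on the fly, either via the dispatch path or via address_space_map().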