diff mbox series

[for-9.2,v6,03/12] hw/riscv: add RISC-V IOMMU base emulation

Message ID 20240801154334.1009852-4-dbarboza@ventanamicro.com
State New
Headers show
Series riscv: QEMU RISC-V IOMMU Support | expand

Commit Message

Daniel Henrique Barboza Aug. 1, 2024, 3:43 p.m. UTC
From: Tomasz Jeznach <tjeznach@rivosinc.com>

The RISC-V IOMMU specification is now ratified as-per the RISC-V
international process. The latest frozen specification can be found at:

https://github.com/riscv-non-isa/riscv-iommu/releases/download/v1.0/riscv-iommu.pdf

Add the foundation of the device emulation for RISC-V IOMMU. It includes
support for s-stage (sv32, sv39, sv48, sv57 caps) and g-stage (sv32x4,
sv39x4, sv48x4, sv57x4 caps).

Other capabilities like ATS and DBG support will be added incrementally
in the next patches.

Co-developed-by: Sebastien Boeuf <seb@rivosinc.com>
Signed-off-by: Sebastien Boeuf <seb@rivosinc.com>
Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
---
 hw/riscv/Kconfig            |    4 +
 hw/riscv/meson.build        |    1 +
 hw/riscv/riscv-iommu-bits.h |   14 +
 hw/riscv/riscv-iommu.c      | 2017 +++++++++++++++++++++++++++++++++++
 hw/riscv/riscv-iommu.h      |  148 +++
 hw/riscv/trace-events       |   12 +
 hw/riscv/trace.h            |    1 +
 include/hw/riscv/iommu.h    |   36 +
 meson.build                 |    1 +
 9 files changed, 2234 insertions(+)
 create mode 100644 hw/riscv/riscv-iommu.c
 create mode 100644 hw/riscv/riscv-iommu.h
 create mode 100644 hw/riscv/trace-events
 create mode 100644 hw/riscv/trace.h
 create mode 100644 include/hw/riscv/iommu.h

Comments

Alistair Francis Aug. 4, 2024, 11:23 p.m. UTC | #1
On Fri, Aug 2, 2024 at 1:47 AM Daniel Henrique Barboza
<dbarboza@ventanamicro.com> wrote:
>
> From: Tomasz Jeznach <tjeznach@rivosinc.com>
>
> The RISC-V IOMMU specification is now ratified as-per the RISC-V
> international process. The latest frozen specification can be found at:
>
> https://github.com/riscv-non-isa/riscv-iommu/releases/download/v1.0/riscv-iommu.pdf
>
> Add the foundation of the device emulation for RISC-V IOMMU. It includes
> support for s-stage (sv32, sv39, sv48, sv57 caps) and g-stage (sv32x4,
> sv39x4, sv48x4, sv57x4 caps).
>
> Other capabilities like ATS and DBG support will be added incrementally
> in the next patches.
>
> Co-developed-by: Sebastien Boeuf <seb@rivosinc.com>
> Signed-off-by: Sebastien Boeuf <seb@rivosinc.com>
> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
> Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>

Acked-by: Alistair Francis <alistair.francis@wdc.com>

Alistair

> ---
>  hw/riscv/Kconfig            |    4 +
>  hw/riscv/meson.build        |    1 +
>  hw/riscv/riscv-iommu-bits.h |   14 +
>  hw/riscv/riscv-iommu.c      | 2017 +++++++++++++++++++++++++++++++++++
>  hw/riscv/riscv-iommu.h      |  148 +++
>  hw/riscv/trace-events       |   12 +
>  hw/riscv/trace.h            |    1 +
>  include/hw/riscv/iommu.h    |   36 +
>  meson.build                 |    1 +
>  9 files changed, 2234 insertions(+)
>  create mode 100644 hw/riscv/riscv-iommu.c
>  create mode 100644 hw/riscv/riscv-iommu.h
>  create mode 100644 hw/riscv/trace-events
>  create mode 100644 hw/riscv/trace.h
>  create mode 100644 include/hw/riscv/iommu.h
>
> diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig
> index a2030e3a6f..f69d6e3c8e 100644
> --- a/hw/riscv/Kconfig
> +++ b/hw/riscv/Kconfig
> @@ -1,3 +1,6 @@
> +config RISCV_IOMMU
> +    bool
> +
>  config RISCV_NUMA
>      bool
>
> @@ -47,6 +50,7 @@ config RISCV_VIRT
>      select SERIAL
>      select RISCV_ACLINT
>      select RISCV_APLIC
> +    select RISCV_IOMMU
>      select RISCV_IMSIC
>      select SIFIVE_PLIC
>      select SIFIVE_TEST
> diff --git a/hw/riscv/meson.build b/hw/riscv/meson.build
> index f872674093..cbc99c6e8e 100644
> --- a/hw/riscv/meson.build
> +++ b/hw/riscv/meson.build
> @@ -10,5 +10,6 @@ riscv_ss.add(when: 'CONFIG_SIFIVE_U', if_true: files('sifive_u.c'))
>  riscv_ss.add(when: 'CONFIG_SPIKE', if_true: files('spike.c'))
>  riscv_ss.add(when: 'CONFIG_MICROCHIP_PFSOC', if_true: files('microchip_pfsoc.c'))
>  riscv_ss.add(when: 'CONFIG_ACPI', if_true: files('virt-acpi-build.c'))
> +riscv_ss.add(when: 'CONFIG_RISCV_IOMMU', if_true: files('riscv-iommu.c'))
>
>  hw_arch += {'riscv': riscv_ss}
> diff --git a/hw/riscv/riscv-iommu-bits.h b/hw/riscv/riscv-iommu-bits.h
> index 9d9512ca71..6f1b9ab61b 100644
> --- a/hw/riscv/riscv-iommu-bits.h
> +++ b/hw/riscv/riscv-iommu-bits.h
> @@ -69,6 +69,14 @@ struct riscv_iommu_pq_record {
>  /* 5.3 IOMMU Capabilities (64bits) */
>  #define RISCV_IOMMU_REG_CAP             0x0000
>  #define RISCV_IOMMU_CAP_VERSION         GENMASK_ULL(7, 0)
> +#define RISCV_IOMMU_CAP_SV32            BIT_ULL(8)
> +#define RISCV_IOMMU_CAP_SV39            BIT_ULL(9)
> +#define RISCV_IOMMU_CAP_SV48            BIT_ULL(10)
> +#define RISCV_IOMMU_CAP_SV57            BIT_ULL(11)
> +#define RISCV_IOMMU_CAP_SV32X4          BIT_ULL(16)
> +#define RISCV_IOMMU_CAP_SV39X4          BIT_ULL(17)
> +#define RISCV_IOMMU_CAP_SV48X4          BIT_ULL(18)
> +#define RISCV_IOMMU_CAP_SV57X4          BIT_ULL(19)
>  #define RISCV_IOMMU_CAP_MSI_FLAT        BIT_ULL(22)
>  #define RISCV_IOMMU_CAP_MSI_MRIF        BIT_ULL(23)
>  #define RISCV_IOMMU_CAP_T2GPA           BIT_ULL(26)
> @@ -80,7 +88,9 @@ struct riscv_iommu_pq_record {
>
>  /* 5.4 Features control register (32bits) */
>  #define RISCV_IOMMU_REG_FCTL            0x0008
> +#define RISCV_IOMMU_FCTL_BE             BIT(0)
>  #define RISCV_IOMMU_FCTL_WSI            BIT(1)
> +#define RISCV_IOMMU_FCTL_GXL            BIT(2)
>
>  /* 5.5 Device-directory-table pointer (64bits) */
>  #define RISCV_IOMMU_REG_DDTP            0x0010
> @@ -203,6 +213,8 @@ struct riscv_iommu_dc {
>  #define RISCV_IOMMU_DC_TC_DTF           BIT_ULL(4)
>  #define RISCV_IOMMU_DC_TC_PDTV          BIT_ULL(5)
>  #define RISCV_IOMMU_DC_TC_PRPR          BIT_ULL(6)
> +#define RISCV_IOMMU_DC_TC_GADE          BIT_ULL(7)
> +#define RISCV_IOMMU_DC_TC_SADE          BIT_ULL(8)
>  #define RISCV_IOMMU_DC_TC_DPE           BIT_ULL(9)
>  #define RISCV_IOMMU_DC_TC_SBE           BIT_ULL(10)
>  #define RISCV_IOMMU_DC_TC_SXL           BIT_ULL(11)
> @@ -309,9 +321,11 @@ enum riscv_iommu_fq_causes {
>
>  /* Translation attributes fields */
>  #define RISCV_IOMMU_PC_TA_V             BIT_ULL(0)
> +#define RISCV_IOMMU_PC_TA_RESERVED      GENMASK_ULL(63, 32)
>
>  /* First stage context fields */
>  #define RISCV_IOMMU_PC_FSC_PPN          GENMASK_ULL(43, 0)
> +#define RISCV_IOMMU_PC_FSC_RESERVED     GENMASK_ULL(59, 44)
>
>  enum riscv_iommu_fq_ttypes {
>      RISCV_IOMMU_FQ_TTYPE_NONE = 0,
> diff --git a/hw/riscv/riscv-iommu.c b/hw/riscv/riscv-iommu.c
> new file mode 100644
> index 0000000000..ebe3a53a04
> --- /dev/null
> +++ b/hw/riscv/riscv-iommu.c
> @@ -0,0 +1,2017 @@
> +/*
> + * QEMU emulation of an RISC-V IOMMU
> + *
> + * Copyright (C) 2021-2023, Rivos Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qom/object.h"
> +#include "hw/pci/pci_bus.h"
> +#include "hw/pci/pci_device.h"
> +#include "hw/qdev-properties.h"
> +#include "hw/riscv/riscv_hart.h"
> +#include "migration/vmstate.h"
> +#include "qapi/error.h"
> +#include "qemu/timer.h"
> +
> +#include "cpu_bits.h"
> +#include "riscv-iommu.h"
> +#include "riscv-iommu-bits.h"
> +#include "trace.h"
> +
> +#define LIMIT_CACHE_CTX               (1U << 7)  /* 128; presumably a cap on cached contexts - users not visible here */
> +#define LIMIT_CACHE_IOT               (1U << 20) /* 1048576; presumably a cap on cached IOTLB entries - confirm at use site */
> +
> +/* Physical page number conversions */
> +#define PPN_PHYS(ppn)                 ((ppn) << TARGET_PAGE_BITS) /* page number -> byte address */
> +#define PPN_DOWN(phy)                 ((phy) >> TARGET_PAGE_BITS) /* byte address -> page number */
> +
> +typedef struct RISCVIOMMUContext RISCVIOMMUContext;
> +typedef struct RISCVIOMMUEntry RISCVIOMMUEntry;
> +
> +/*
> + * Device assigned I/O address space.
> + *
> + * Bundles the IOVA memory region and address space exposed to one
> + * requester (identified by devid) together with a back-pointer to the
> + * IOMMU instance that manages it.
> + */
> +struct RISCVIOMMUSpace {
> +    IOMMUMemoryRegion iova_mr;  /* IOVA memory region for attached device */
> +    AddressSpace iova_as;       /* IOVA address space for attached device */
> +    RISCVIOMMUState *iommu;     /* Managing IOMMU device state */
> +    uint32_t devid;             /* Requester identifier, AKA device_id */
> +    bool notifier;              /* IOMMU unmap notifier enabled */
> +    QLIST_ENTRY(RISCVIOMMUSpace) list; /* List linkage; the list head is not visible in this chunk */
> +};
> +
> +/*
> + * Device translation context state.
> + *
> + * The three leading bit-fields add up to 64 bits (24 + 20 + 20). The
> + * remaining members are full 64-bit values read by the translation and
> + * MSI redirection paths in this file.
> + */
> +struct RISCVIOMMUContext {
> +    uint64_t devid:24;          /* Requester Id, AKA device_id */
> +    uint64_t process_id:20;     /* Process ID. PASID for PCIe */
> +    uint64_t __rfu:20;          /* reserved. NOTE(review): names containing "__" are reserved for the C implementation */
> +    uint64_t tc;                /* Translation Control */
> +    uint64_t ta;                /* Translation Attributes */
> +    uint64_t satp;              /* S-Stage address translation and protection */
> +    uint64_t gatp;              /* G-Stage address translation and protection */
> +    uint64_t msi_addr_mask;     /* MSI filtering - address mask */
> +    uint64_t msi_addr_pattern;  /* MSI filtering - address pattern */
> +    uint64_t msiptp;            /* MSI redirection page table pointer */
> +};
> +
> +/* IOMMU index for transactions without process_id specified. */
> +#define RISCV_IOMMU_NOPROCID 0
> +
> +/*
> + * Return the 4-bit interrupt vector stored in @icvec for cause @vec_type.
> + * Each cause owns one nibble of @icvec; only causes 0..3 are accepted
> + * (anything else trips the assertion).
> + */
> +static uint8_t riscv_iommu_get_icvec_vector(uint32_t icvec, uint32_t vec_type)
> +{
> +    uint32_t nibble_shift;
> +
> +    g_assert(vec_type < 4);
> +
> +    nibble_shift = vec_type * 4;
> +    icvec >>= nibble_shift;
> +
> +    return icvec & 0x0F;
> +}
> +
> +/*
> + * Raise the interrupt for cause @vec through the s->notify callback.
> + *
> + * Nothing happens when FCTL.WSI is set or no callback is installed.
> + * Otherwise the cause bit is set in IPSR and the callback is invoked with
> + * the vector mapped in ICVEC, but only if the value returned by
> + * riscv_iommu_reg_mod32() did not already have that bit set (presumably
> + * the previous register value - confirm against the helper).
> + */
> +static void riscv_iommu_notify(RISCVIOMMUState *s, int vec)
> +{
> +    const uint32_t fctl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FCTL);
> +    uint32_t icvec;
> +    uint32_t ipsr_val;
> +
> +    if (!s->notify) {
> +        return;
> +    }
> +    if (fctl & RISCV_IOMMU_FCTL_WSI) {
> +        return;
> +    }
> +
> +    icvec = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_ICVEC);
> +    ipsr_val = riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, (1 << vec), 0);
> +
> +    if (ipsr_val & (1 << vec)) {
> +        /* Cause already pending, no new notification. */
> +        return;
> +    }
> +
> +    s->notify(s, riscv_iommu_get_icvec_vector(icvec, vec));
> +}
> +
> +/*
> + * Append fault record @ev to the in-memory fault queue.
> + *
> + * The record is always traced. It is dropped when the queue is not on
> + * (FQON clear) or an overflow / memory-fault flag is already latched.
> + * A full queue latches FQOF, a failed DMA write latches FQMF, otherwise
> + * the tail register is advanced. An interrupt is signalled afterwards
> + * when FIE was set in the control value sampled on entry.
> + */
> +static void riscv_iommu_fault(RISCVIOMMUState *s,
> +                              struct riscv_iommu_fq_record *ev)
> +{
> +    const uint32_t fqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
> +    const uint32_t rd_idx =
> +        riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQH) & s->fq_mask;
> +    const uint32_t wr_idx =
> +        riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQT) & s->fq_mask;
> +    const uint32_t wr_next = (wr_idx + 1) & s->fq_mask;
> +    const uint32_t devid = get_field(ev->hdr, RISCV_IOMMU_FQ_HDR_DID);
> +
> +    trace_riscv_iommu_flt(s->parent_obj.id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
> +                          PCI_FUNC(devid), ev->hdr, ev->iotval);
> +
> +    /* Queue disabled: drop the record. */
> +    if (!(fqcsr & RISCV_IOMMU_FQCSR_FQON)) {
> +        return;
> +    }
> +
> +    /* A previous overflow or memory fault is still latched. */
> +    if (fqcsr & (RISCV_IOMMU_FQCSR_FQOF | RISCV_IOMMU_FQCSR_FQMF)) {
> +        return;
> +    }
> +
> +    if (rd_idx != wr_next) {
> +        dma_addr_t slot = s->fq_addr + wr_idx * sizeof(*ev);
> +        MemTxResult wres = dma_memory_write(s->target_as, slot, ev,
> +                                            sizeof(*ev),
> +                                            MEMTXATTRS_UNSPECIFIED);
> +
> +        if (wres == MEMTX_OK) {
> +            riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_FQT, wr_next);
> +        } else {
> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR,
> +                                  RISCV_IOMMU_FQCSR_FQMF, 0);
> +        }
> +    } else {
> +        /* No free slot left in the ring. */
> +        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR,
> +                              RISCV_IOMMU_FQCSR_FQOF, 0);
> +    }
> +
> +    if (fqcsr & RISCV_IOMMU_FQCSR_FIE) {
> +        riscv_iommu_notify(s, RISCV_IOMMU_INTR_FQ);
> +    }
> +}
> +
> +/*
> + * Append page request record @pr to the in-memory page request queue.
> + *
> + * Mirrors the fault queue handling: the record is traced, dropped when
> + * the queue is off (PQON clear) or PQOF/PQMF is latched, PQOF is latched
> + * on a full queue, PQMF on a failed DMA write, and the tail register is
> + * advanced otherwise. An interrupt is signalled afterwards when PIE was
> + * set in the control value sampled on entry.
> + */
> +static void riscv_iommu_pri(RISCVIOMMUState *s,
> +    struct riscv_iommu_pq_record *pr)
> +{
> +    const uint32_t pqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
> +    const uint32_t rd_idx =
> +        riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQH) & s->pq_mask;
> +    const uint32_t wr_idx =
> +        riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQT) & s->pq_mask;
> +    const uint32_t wr_next = (wr_idx + 1) & s->pq_mask;
> +    const uint32_t devid = get_field(pr->hdr, RISCV_IOMMU_PREQ_HDR_DID);
> +
> +    trace_riscv_iommu_pri(s->parent_obj.id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
> +                          PCI_FUNC(devid), pr->payload);
> +
> +    /* Queue disabled: drop the request. */
> +    if (!(pqcsr & RISCV_IOMMU_PQCSR_PQON)) {
> +        return;
> +    }
> +
> +    /* A previous overflow or memory fault is still latched. */
> +    if (pqcsr & (RISCV_IOMMU_PQCSR_PQOF | RISCV_IOMMU_PQCSR_PQMF)) {
> +        return;
> +    }
> +
> +    if (rd_idx != wr_next) {
> +        dma_addr_t slot = s->pq_addr + wr_idx * sizeof(*pr);
> +        MemTxResult wres = dma_memory_write(s->target_as, slot, pr,
> +                                            sizeof(*pr),
> +                                            MEMTXATTRS_UNSPECIFIED);
> +
> +        if (wres == MEMTX_OK) {
> +            riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_PQT, wr_next);
> +        } else {
> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR,
> +                                  RISCV_IOMMU_PQCSR_PQMF, 0);
> +        }
> +    } else {
> +        /* No free slot left in the ring. */
> +        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR,
> +                              RISCV_IOMMU_PQCSR_PQOF, 0);
> +    }
> +
> +    if (pqcsr & RISCV_IOMMU_PQCSR_PIE) {
> +        riscv_iommu_notify(s, RISCV_IOMMU_INTR_PQ);
> +    }
> +}
> +
> +/*
> + * Portable implementation of pext_u64, bit-mask extraction.
> + *
> + * Collects the bits of @val found at the positions where @ext has a 1 and
> + * packs them, lowest position first, into the low bits of the result.
> + */
> +static uint64_t _pext_u64(uint64_t val, uint64_t ext)
> +{
> +    uint64_t packed = 0;
> +    unsigned int out_pos = 0;
> +
> +    for (; ext != 0; ext >>= 1, val >>= 1) {
> +        if (!(ext & 1)) {
> +            continue;       /* position not selected by the mask */
> +        }
> +        packed |= (val & 1) << out_pos;
> +        out_pos++;
> +    }
> +
> +    return packed;
> +}
> +
> +/*
> + * Check if GPA matches MSI/MRIF pattern.
> + *
> + * Returns true only when MSI support is enabled on the IOMMU, the
> + * context's MSI page table mode is FLAT, and the page number of @gpa
> + * equals ctx->msi_addr_pattern in every bit not covered by
> + * ctx->msi_addr_mask.
> + */
> +static bool riscv_iommu_msi_check(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
> +    dma_addr_t gpa)
> +{
> +    uint64_t diff;
> +
> +    if (!s->enable_msi) {
> +        return false;
> +    }
> +
> +    if (get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_MODE) !=
> +        RISCV_IOMMU_DC_MSIPTP_MODE_FLAT) {
> +        /* Invalid MSI/MRIF mode */
> +        return false;
> +    }
> +
> +    /* Non-zero: GPA not in MSI range defined by AIA IMSIC rules. */
> +    diff = (PPN_DOWN(gpa) ^ ctx->msi_addr_pattern) & ~ctx->msi_addr_mask;
> +
> +    return diff == 0;
> +}
> +
> +/*
> + * RISCV IOMMU Address Translation Lookup - Page Table Walk
> + *
> + * Note: Code is based on get_physical_address() from target/riscv/cpu_helper.c
> + * Both implementation can be merged into single helper function in future.
> + * Keeping them separate for now, as error reporting and flow specifics are
> + * sufficiently different for separate implementation.
> + *
> + * Flow, as implemented below:
> + *  - a write whose IOVA matches the MSI pattern is routed to s->trap_as
> + *    untranslated;
> + *  - when both stage modes are BARE the IOVA is returned unchanged with
> + *    read/write permission;
> + *  - otherwise per-stage walk parameters are derived from the mode fields
> + *    (rejecting modes not advertised in s->cap) and the tables are walked,
> + *    starting with the G-stage whenever it is enabled.
> + *
> + * @s        : IOMMU Device State
> + * @ctx      : Translation context for device id and process address space id.
> + * @iotlb    : translation data: physical address and access mode.
> + *             Input: iova and requested perm. Output on success:
> + *             translated_addr, addr_mask, perm and, for trapped MSI writes,
> + *             target_as.
> + * @return   : success or fault cause code.
> + *             0 on success, otherwise a RISCV_IOMMU_FQ_CAUSE_* value.
> + */
> +static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
> +    IOMMUTLBEntry *iotlb)
> +{
> +    dma_addr_t addr, base;
> +    uint64_t satp, gatp, pte;
> +    bool en_s, en_g;
> +    struct {
> +        unsigned char step;
> +        unsigned char levels;
> +        unsigned char ptidxbits;
> +        unsigned char ptesize;
> +    } sc[2];
> +    /* Translation stage phase; also used as index into sc[] */
> +    enum {
> +        S_STAGE = 0,
> +        G_STAGE = 1,
> +    } pass;
> +
> +    satp = get_field(ctx->satp, RISCV_IOMMU_ATP_MODE_FIELD);
> +    gatp = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);
> +
> +    en_s = satp != RISCV_IOMMU_DC_FSC_MODE_BARE;
> +    en_g = gatp != RISCV_IOMMU_DC_IOHGATP_MODE_BARE;
> +
> +    /* Early check for MSI address match when IOVA == GPA */
> +    if ((iotlb->perm & IOMMU_WO) &&
> +        riscv_iommu_msi_check(s, ctx, iotlb->iova)) {
> +        iotlb->target_as = &s->trap_as;
> +        iotlb->translated_addr = iotlb->iova;
> +        iotlb->addr_mask = ~TARGET_PAGE_MASK;
> +        return 0;
> +    }
> +
> +    /* Exit early for pass-through mode. */
> +    if (!(en_s || en_g)) {
> +        iotlb->translated_addr = iotlb->iova;
> +        iotlb->addr_mask = ~TARGET_PAGE_MASK;
> +        /* Allow R/W in pass-through mode */
> +        iotlb->perm = IOMMU_RW;
> +        return 0;
> +    }
> +
> +    /*
> +     * S/G translation parameters.
> +     *
> +     * NOTE(review): the IOHGATP_MODE_* constants are used as case labels for
> +     * both stages; this presumably relies on the S-stage (FSC) mode
> +     * encodings being numerically identical - confirm in riscv-iommu-bits.h.
> +     */
> +    for (pass = 0; pass < 2; pass++) {
> +        uint32_t sv_mode;
> +
> +        sc[pass].step = 0;
> +        if (pass ? (s->fctl & RISCV_IOMMU_FCTL_GXL) :
> +            (ctx->tc & RISCV_IOMMU_DC_TC_SXL)) {
> +            /* 32bit mode for GXL/SXL == 1 */
> +            switch (pass ? gatp : satp) {
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
> +                sc[pass].levels    = 0;
> +                sc[pass].ptidxbits = 0;
> +                sc[pass].ptesize   = 0;
> +                break;
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4:
> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV32X4 : RISCV_IOMMU_CAP_SV32;
> +                if (!(s->cap & sv_mode)) {
> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +                }
> +                sc[pass].levels    = 2;
> +                sc[pass].ptidxbits = 10;
> +                sc[pass].ptesize   = 4;
> +                break;
> +            default:
> +                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +            }
> +        } else {
> +            /* 64bit mode for GXL/SXL == 0 */
> +            switch (pass ? gatp : satp) {
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
> +                sc[pass].levels    = 0;
> +                sc[pass].ptidxbits = 0;
> +                sc[pass].ptesize   = 0;
> +                break;
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4:
> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV39X4 : RISCV_IOMMU_CAP_SV39;
> +                if (!(s->cap & sv_mode)) {
> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +                }
> +                sc[pass].levels    = 3;
> +                sc[pass].ptidxbits = 9;
> +                sc[pass].ptesize   = 8;
> +                break;
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4:
> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV48X4 : RISCV_IOMMU_CAP_SV48;
> +                if (!(s->cap & sv_mode)) {
> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +                }
> +                sc[pass].levels    = 4;
> +                sc[pass].ptidxbits = 9;
> +                sc[pass].ptesize   = 8;
> +                break;
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4:
> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV57X4 : RISCV_IOMMU_CAP_SV57;
> +                if (!(s->cap & sv_mode)) {
> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +                }
> +                sc[pass].levels    = 5;
> +                sc[pass].ptidxbits = 9;
> +                sc[pass].ptesize   = 8;
> +                break;
> +            default:
> +                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +            }
> +        }
> +    };
> +
> +    /*
> +     * S/G stages translation tables root pointers.
> +     * From here on satp/gatp hold root table addresses, not mode values.
> +     * With both stages enabled the first address to translate is the
> +     * S-stage root itself (a guest physical address).
> +     */
> +    gatp = PPN_PHYS(get_field(ctx->gatp, RISCV_IOMMU_ATP_PPN_FIELD));
> +    satp = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_ATP_PPN_FIELD));
> +    addr = (en_s && en_g) ? satp : iotlb->iova;
> +    base = en_g ? gatp : satp;
> +    pass = en_g ? G_STAGE : S_STAGE;
> +
> +    do { /* each iteration fetches and evaluates one PTE */
> +        const unsigned widened = (pass && !sc[pass].step) ? 2 : 0; /* 2 extra index bits at the G-stage root */
> +        const unsigned va_bits = widened + sc[pass].ptidxbits;
> +        const unsigned va_skip = TARGET_PAGE_BITS + sc[pass].ptidxbits *
> +                                 (sc[pass].levels - 1 - sc[pass].step);
> +        const unsigned idx = (addr >> va_skip) & ((1 << va_bits) - 1);
> +        const dma_addr_t pte_addr = base + idx * sc[pass].ptesize;
> +        const bool ade =
> +            ctx->tc & (pass ? RISCV_IOMMU_DC_TC_GADE : RISCV_IOMMU_DC_TC_SADE);
> +
> +        /* Address range check before first level lookup */
> +        if (!sc[pass].step) {
> +            const uint64_t va_mask = (1ULL << (va_skip + va_bits)) - 1;
> +            if ((addr & va_mask) != addr) {
> +                return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
> +            }
> +        }
> +
> +        /* Read page table entry */
> +        if (dma_memory_read(s->target_as, pte_addr, &pte,
> +                sc[pass].ptesize, MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +            return (iotlb->perm & IOMMU_WO) ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT
> +                                            : RISCV_IOMMU_FQ_CAUSE_RD_FAULT;
> +        }
> +
> +        if (sc[pass].ptesize == 4) { /* 32-bit PTE: only the first 4 bytes of pte were filled */
> +            pte = (uint64_t) le32_to_cpu(*((uint32_t *)&pte));
> +        } else {
> +            pte = le64_to_cpu(pte);
> +        }
> +
> +        sc[pass].step++;
> +        hwaddr ppn = pte >> PTE_PPN_SHIFT;
> +
> +        if (!(pte & PTE_V)) {
> +            break;                /* Invalid PTE */
> +        } else if (!(pte & (PTE_R | PTE_W | PTE_X))) {
> +            base = PPN_PHYS(ppn); /* Inner PTE, continue walking */
> +        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == PTE_W) {
> +            break;                /* Reserved leaf PTE flags: PTE_W */
> +        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == (PTE_W | PTE_X)) {
> +            break;                /* Reserved leaf PTE flags: PTE_W + PTE_X */
> +        } else if (ppn & ((1ULL << (va_skip - TARGET_PAGE_BITS)) - 1)) {
> +            break;                /* Misaligned PPN */
> +        } else if ((iotlb->perm & IOMMU_RO) && !(pte & PTE_R)) {
> +            break;                /* Read access check failed */
> +        } else if ((iotlb->perm & IOMMU_WO) && !(pte & PTE_W)) {
> +            break;                /* Write access check failed */
> +        } else if ((iotlb->perm & IOMMU_RO) && !ade && !(pte & PTE_A)) {
> +            break;                /* Access bit not set */
> +        } else if ((iotlb->perm & IOMMU_WO) && !ade && !(pte & PTE_D)) {
> +            break;                /* Dirty bit not set */
> +        } else {
> +            /* Leaf PTE, translation completed. */
> +            sc[pass].step = sc[pass].levels;
> +            base = PPN_PHYS(ppn) | (addr & ((1ULL << va_skip) - 1));
> +            /*
> +             * Update address mask based on smallest translation granularity.
> +             * NOTE(review): on this path addr_mask is only narrowed with &=
> +             * and never initialised inside this function; presumably the
> +             * caller seeds it - confirm.
> +             */
> +            iotlb->addr_mask &= (1ULL << va_skip) - 1;
> +            /* Continue with S-Stage translation? */
> +            if (pass && sc[0].step != sc[0].levels) {
> +                pass = S_STAGE;
> +                addr = iotlb->iova;
> +                continue;
> +            }
> +            /* Translation phase completed (GPA or SPA) */
> +            iotlb->translated_addr = base;
> +            iotlb->perm = (pte & PTE_W) ? ((pte & PTE_R) ? IOMMU_RW : IOMMU_WO)
> +                                                         : IOMMU_RO;
> +
> +            /* Check MSI GPA address match */
> +            if (pass == S_STAGE && (iotlb->perm & IOMMU_WO) &&
> +                riscv_iommu_msi_check(s, ctx, base)) {
> +                /* Trap MSI writes and return GPA address. */
> +                iotlb->target_as = &s->trap_as;
> +                iotlb->addr_mask = ~TARGET_PAGE_MASK;
> +                return 0;
> +            }
> +
> +            /* Continue with G-Stage translation? */
> +            if (!pass && en_g) {
> +                pass = G_STAGE;
> +                addr = base;
> +                base = gatp;
> +                sc[pass].step = 0;
> +                continue;
> +            }
> +
> +            return 0;
> +        }
> +
> +        if (sc[pass].step == sc[pass].levels) {
> +            break; /* Can't find leaf PTE */
> +        }
> +
> +        /* Continue with G-Stage translation? (next-level S-stage table address is a GPA) */
> +        if (!pass && en_g) {
> +            pass = G_STAGE;
> +            addr = base;
> +            base = gatp;
> +            sc[pass].step = 0;
> +        }
> +    } while (1);
> +
> +    return (iotlb->perm & IOMMU_WO) ?
> +                (pass ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS :
> +                        RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S) :
> +                (pass ? RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS :
> +                        RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S);
> +}
> +
> +/*
> + * Build a fault queue record and hand it to riscv_iommu_fault().
> + *
> + * @s          : IOMMU Device State
> + * @ctx        : Translation context the fault belongs to; supplies the
> + *               device_id, the process_id and the DTF control bit.
> + * @fault_type : value stored in the record's TTYPE field.
> + * @cause      : value stored in the record's CAUSE field.
> + * @pv         : true when ctx->process_id is meaningful for this fault.
> + * @iotval     : stored verbatim in the record.
> + * @iotval2    : stored verbatim in the record.
> + *
> + * When DC.tc.DTF is set only the causes listed in the switch below are
> + * reported; every other cause is silently dropped.
> + */
> +static void riscv_iommu_report_fault(RISCVIOMMUState *s,
> +                                     RISCVIOMMUContext *ctx,
> +                                     uint32_t fault_type, uint32_t cause,
> +                                     bool pv,
> +                                     uint64_t iotval, uint64_t iotval2)
> +{
> +    struct riscv_iommu_fq_record ev = { 0 };
> +
> +    if (ctx->tc & RISCV_IOMMU_DC_TC_DTF) {
> +        switch (cause) {
> +        case RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED:
> +        case RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT:
> +        case RISCV_IOMMU_FQ_CAUSE_DDT_INVALID:
> +        case RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED:
> +        case RISCV_IOMMU_FQ_CAUSE_DDT_CORRUPTED:
> +        case RISCV_IOMMU_FQ_CAUSE_INTERNAL_DP_ERROR:
> +        case RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT:
> +            break;
> +        default:
> +            /* DTF prevents reporting a fault for this given cause */
> +            return;
> +        }
> +    }
> +
> +    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_CAUSE, cause);
> +    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_TTYPE, fault_type);
> +    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_DID, ctx->devid);
> +
> +    /*
> +     * PV advertises that the PID field is valid, so it has to follow the
> +     * caller's @pv rather than being set unconditionally. Without a valid
> +     * process_id both PV and PID stay zero.
> +     */
> +    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PV, pv);
> +    if (pv) {
> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PID, ctx->process_id);
> +    }
> +
> +    ev.iotval = iotval;
> +    ev.iotval2 = iotval2;
> +
> +    riscv_iommu_fault(s, &ev);
> +}
> +
> +/*
> + * Redirect MSI write for given GPA.
> + *
> + * The interrupt file number is extracted from the page number of @gpa
> + * with ctx->msi_addr_mask and used to index the MSI page table rooted at
> + * ctx->msiptp (two 64-bit words per entry). Depending on the PTE mode the
> + * write is either forwarded to the translated page (basic mode) or folded
> + * into a memory-resident interrupt file (MRIF mode), optionally followed
> + * by a notification write.
> + *
> + * @s     : IOMMU Device State
> + * @ctx   : Translation context of the requester.
> + * @gpa   : guest physical address targeted by the MSI write.
> + * @data  : value being written (the interrupt identity in MRIF mode).
> + * @size  : size of the write in bytes; used for the basic-mode forward.
> + * @attrs : memory transaction attributes for the forwarded accesses.
> + * @return: MEMTX_OK on success. On failure a fault record is reported and
> + *          the failing MemTxResult is returned.
> + */
> +static MemTxResult riscv_iommu_msi_write(RISCVIOMMUState *s,
> +    RISCVIOMMUContext *ctx, uint64_t gpa, uint64_t data,
> +    unsigned size, MemTxAttrs attrs)
> +{
> +    MemTxResult res;
> +    dma_addr_t addr;
> +    uint64_t intn;
> +    uint32_t n190;
> +    uint64_t pte[2];
> +    int fault_type = RISCV_IOMMU_FQ_TTYPE_UADDR_WR;
> +    int cause;
> +
> +    /* Interrupt File Number */
> +    intn = _pext_u64(PPN_DOWN(gpa), ctx->msi_addr_mask);
> +    if (intn >= 256) {
> +        /* Interrupt file number out of range */
> +        res = MEMTX_ACCESS_ERROR;
> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
> +        goto err;
> +    }
> +
> +    /* fetch MSI PTE */
> +    addr = PPN_PHYS(get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_PPN));
> +    addr = addr | (intn * sizeof(pte));
> +    res = dma_memory_read(s->target_as, addr, &pte, sizeof(pte),
> +            MEMTXATTRS_UNSPECIFIED);
> +    if (res != MEMTX_OK) {
> +        if (res == MEMTX_DECODE_ERROR) {
> +            cause = RISCV_IOMMU_FQ_CAUSE_MSI_PT_CORRUPTED;
> +        } else {
> +            cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
> +        }
> +        goto err;
> +    }
> +
> +    le64_to_cpus(&pte[0]);
> +    le64_to_cpus(&pte[1]);
> +
> +    if (!(pte[0] & RISCV_IOMMU_MSI_PTE_V) || (pte[0] & RISCV_IOMMU_MSI_PTE_C)) {
> +        /*
> +         * The spec mentions that: "If msipte.C == 1, then further
> +         * processing to interpret the PTE is implementation
> +         * defined.". We'll abort with cause = 262 for this
> +         * case too.
> +         */
> +        res = MEMTX_ACCESS_ERROR;
> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_INVALID;
> +        goto err;
> +    }
> +
> +    switch (get_field(pte[0], RISCV_IOMMU_MSI_PTE_M)) {
> +    case RISCV_IOMMU_MSI_PTE_M_BASIC:
> +        /* MSI Pass-through mode */
> +        addr = PPN_PHYS(get_field(pte[0], RISCV_IOMMU_MSI_PTE_PPN));
> +        /*
> +         * Carry over only the offset within the page. TARGET_PAGE_MASK
> +         * selects the page-frame bits, so it must be inverted here;
> +         * otherwise the GPA frame number would be merged into the
> +         * translated page address.
> +         */
> +        addr = addr | (gpa & ~TARGET_PAGE_MASK);
> +
> +        trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
> +                              PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
> +                              gpa, addr);
> +
> +        res = dma_memory_write(s->target_as, addr, &data, size, attrs);
> +        if (res != MEMTX_OK) {
> +            cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
> +            goto err;
> +        }
> +
> +        return MEMTX_OK;
> +    case RISCV_IOMMU_MSI_PTE_M_MRIF:
> +        /* MRIF mode, continue. */
> +        break;
> +    default:
> +        res = MEMTX_ACCESS_ERROR;
> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED;
> +        goto err;
> +    }
> +
> +    /*
> +     * Report an error for interrupt identities exceeding the maximum allowed
> +     * for an IMSIC interrupt file (2047) or destination address is not 32-bit
> +     * aligned. See IOMMU Specification, Chapter 2.3. MSI page tables.
> +     */
> +    if ((data > 2047) || (gpa & 3)) {
> +        res = MEMTX_ACCESS_ERROR;
> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED;
> +        goto err;
> +    }
> +
> +    /* MSI MRIF mode, non atomic pending bit update */
> +
> +    /* MRIF pending bit address: one 64-bit pending word per 64 identities */
> +    addr = get_field(pte[0], RISCV_IOMMU_MSI_PTE_MRIF_ADDR) << 9;
> +    addr = addr | ((data & 0x7c0) >> 3);
> +
> +    trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
> +                          PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
> +                          gpa, addr);
> +
> +    /* MRIF pending bit mask; from here on @data holds the mask, and */
> +    /* @intn is reused as scratch for the pending/enable words. */
> +    data = 1ULL << (data & 0x03f);
> +    res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
> +    if (res != MEMTX_OK) {
> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
> +        goto err;
> +    }
> +
> +    intn = intn | data;
> +    res = dma_memory_write(s->target_as, addr, &intn, sizeof(intn), attrs);
> +    if (res != MEMTX_OK) {
> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
> +        goto err;
> +    }
> +
> +    /* Get MRIF enable bits (the word following the pending bits) */
> +    addr = addr + sizeof(intn);
> +    res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
> +    if (res != MEMTX_OK) {
> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
> +        goto err;
> +    }
> +
> +    if (!(intn & data)) {
> +        /* notification disabled, MRIF update completed. */
> +        return MEMTX_OK;
> +    }
> +
> +    /* Send notification message */
> +    addr = PPN_PHYS(get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NPPN));
> +    n190 = get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID) |
> +          (get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID_MSB) << 10);
> +
> +    res = dma_memory_write(s->target_as, addr, &n190, sizeof(n190), attrs);
> +    if (res != MEMTX_OK) {
> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
> +        goto err;
> +    }
> +
> +    trace_riscv_iommu_mrif_notification(s->parent_obj.id, n190, addr);
> +
> +    return MEMTX_OK;
> +
> +err:
> +    /* Every path reaching this label has set both res and cause. */
> +    riscv_iommu_report_fault(s, ctx, fault_type, cause,
> +                             !!ctx->process_id, 0, 0);
> +    return res;
> +}
> +
> +/*
> + * Check device context configuration as described by the
> + * riscv-iommu spec section "Device-context configuration
> + * checks".
> + */
> +static bool riscv_iommu_validate_device_ctx(RISCVIOMMUState *s,
> +                                            RISCVIOMMUContext *ctx)
> +{
> +    uint32_t fsc_mode, msi_mode;
> +
> +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_EN_PRI) &&
> +        ctx->tc & RISCV_IOMMU_DC_TC_PRPR) {
> +        return false;
> +    }
> +
> +    if (!(s->cap & RISCV_IOMMU_CAP_T2GPA) &&
> +        ctx->tc & RISCV_IOMMU_DC_TC_T2GPA) {
> +        return false;
> +    }
> +
> +    if (s->cap & RISCV_IOMMU_CAP_MSI_FLAT) {
> +        msi_mode = get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_MODE);
> +
> +        if (msi_mode != RISCV_IOMMU_DC_MSIPTP_MODE_OFF &&
> +            msi_mode != RISCV_IOMMU_DC_MSIPTP_MODE_FLAT) {
> +            return false;
> +        }
> +    }
> +
> +    fsc_mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
> +
> +    if (ctx->tc & RISCV_IOMMU_DC_TC_PDTV) {
> +        switch (fsc_mode) {
> +        case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8:
> +            if (!(s->cap & RISCV_IOMMU_CAP_PD8)) {
> +                return false;
> +            }
> +            break;
> +        case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD17:
> +            if (!(s->cap & RISCV_IOMMU_CAP_PD17)) {
> +                return false;
> +            }
> +            break;
> +        case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20:
> +            if (!(s->cap & RISCV_IOMMU_CAP_PD20)) {
> +                return false;
> +            }
> +            break;
> +        }
> +    } else {
> +        /* DC.tc.PDTV is 0 */
> +        if (ctx->tc & RISCV_IOMMU_DC_TC_DPE) {
> +            return false;
> +        }
> +
> +        if (ctx->tc & RISCV_IOMMU_DC_TC_SXL) {
> +            if (fsc_mode == RISCV_IOMMU_CAP_SV32 &&
> +                !(s->cap & RISCV_IOMMU_CAP_SV32)) {
> +                return false;
> +            }
> +        } else {
> +            switch (fsc_mode) {
> +            case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
> +                if (!(s->cap & RISCV_IOMMU_CAP_SV39)) {
> +                    return false;
> +                }
> +                break;
> +            case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
> +                if (!(s->cap & RISCV_IOMMU_CAP_SV48)) {
> +                    return false;
> +                }
> +            break;
> +            case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
> +                if (!(s->cap & RISCV_IOMMU_CAP_SV57)) {
> +                    return false;
> +                }
> +                break;
> +            }
> +        }
> +    }
> +
> +    /*
> +     * CAP_END is always zero (only one endianness). FCTL_BE is
> +     * always zero (little-endian accesses). Thus TC_SBE must
> +     * always be LE, i.e. zero.
> +     */
> +    if (ctx->tc & RISCV_IOMMU_DC_TC_SBE) {
> +        return false;
> +    }
> +
> +    return true;
> +}
> +
> +/*
> + * Validate process context (PC) according to section
> + * "Process-context configuration checks".
> + */
> +static bool riscv_iommu_validate_process_ctx(RISCVIOMMUState *s,
> +                                             RISCVIOMMUContext *ctx)
> +{
> +    uint32_t mode;
> +
> +    if (get_field(ctx->ta, RISCV_IOMMU_PC_TA_RESERVED)) {
> +        return false;
> +    }
> +
> +    if (get_field(ctx->satp, RISCV_IOMMU_PC_FSC_RESERVED)) {
> +        return false;
> +    }
> +
> +    mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
> +    switch (mode) {
> +    case RISCV_IOMMU_DC_FSC_MODE_BARE:
> +    /* sv39 and sv32 modes have the same value (8) */
> +    case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
> +    case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
> +    case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
> +        break;
> +    default:
> +        return false;
> +    }
> +
> +    if (ctx->tc & RISCV_IOMMU_DC_TC_SXL) {
> +        if (mode == RISCV_IOMMU_CAP_SV32 &&
> +            !(s->cap & RISCV_IOMMU_CAP_SV32)) {
> +                return false;
> +        }
> +    } else {
> +        switch (mode) {
> +        case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
> +            if (!(s->cap & RISCV_IOMMU_CAP_SV39)) {
> +                return false;
> +            }
> +            break;
> +        case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
> +            if (!(s->cap & RISCV_IOMMU_CAP_SV48)) {
> +                return false;
> +            }
> +            break;
> +        case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
> +            if (!(s->cap & RISCV_IOMMU_CAP_SV57)) {
> +                return false;
> +            }
> +            break;
> +        }
> +    }
> +
> +    return true;
> +}
> +
> +/*
> + * RISC-V IOMMU Device Context Lookup - Device Directory Tree Walk
> + *
> + * @s         : IOMMU Device State
> + * @ctx       : Device Translation Context with devid and process_id set.
> + * @return    : success or fault code.
> + */
> +static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
> +{
> +    const uint64_t ddtp = s->ddtp;
> +    unsigned mode = get_field(ddtp, RISCV_IOMMU_DDTP_MODE);
> +    dma_addr_t addr = PPN_PHYS(get_field(ddtp, RISCV_IOMMU_DDTP_PPN));
> +    struct riscv_iommu_dc dc;
> +    /* Device Context format: 0: extended (64 bytes) | 1: base (32 bytes) */
> +    const int dc_fmt = !s->enable_msi;
> +    const size_t dc_len = sizeof(dc) >> dc_fmt;
> +    unsigned depth;
> +    uint64_t de;
> +
> +    switch (mode) {
> +    case RISCV_IOMMU_DDTP_MODE_OFF:
> +        return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
> +
> +    case RISCV_IOMMU_DDTP_MODE_BARE:
> +        /* mock up pass-through translation context */
> +        ctx->gatp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
> +            RISCV_IOMMU_DC_IOHGATP_MODE_BARE);
> +        ctx->satp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
> +            RISCV_IOMMU_DC_FSC_MODE_BARE);
> +        ctx->tc = RISCV_IOMMU_DC_TC_V;
> +        ctx->ta = 0;
> +        ctx->msiptp = 0;
> +        return 0;
> +
> +    case RISCV_IOMMU_DDTP_MODE_1LVL:
> +        depth = 0;
> +        break;
> +
> +    case RISCV_IOMMU_DDTP_MODE_2LVL:
> +        depth = 1;
> +        break;
> +
> +    case RISCV_IOMMU_DDTP_MODE_3LVL:
> +        depth = 2;
> +        break;
> +
> +    default:
> +        return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +    }
> +
> +    /*
> +     * Check supported device id width (in bits).
> +     * See IOMMU Specification, Chapter 6. Software guidelines.
> +     * - if extended device-context format is used:
> +     *   1LVL: 6, 2LVL: 15, 3LVL: 24
> +     * - if base device-context format is used:
> +     *   1LVL: 7, 2LVL: 16, 3LVL: 24
> +     */
> +    if (ctx->devid >= (1 << (depth * 9 + 6 + (dc_fmt && depth != 2)))) {
> +        return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
> +    }
> +
> +    /* Device directory tree walk */
> +    for (; depth-- > 0; ) {
> +        /*
> +         * Select device id index bits based on device directory tree level
> +         * and device context format.
> +         * See IOMMU Specification, Chapter 2. Data Structures.
> +         * - if extended device-context format is used:
> +         *   device index: [23:15][14:6][5:0]
> +         * - if base device-context format is used:
> +         *   device index: [23:16][15:7][6:0]
> +         */
> +        const int split = depth * 9 + 6 + dc_fmt;
> +        addr |= ((ctx->devid >> split) << 3) & ~TARGET_PAGE_MASK;
> +        if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
> +                            MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +            return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
> +        }
> +        le64_to_cpus(&de);
> +        if (!(de & RISCV_IOMMU_DDTE_VALID)) {
> +            /* invalid directory entry */
> +            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
> +        }
> +        if (de & ~(RISCV_IOMMU_DDTE_PPN | RISCV_IOMMU_DDTE_VALID)) {
> +            /* reserved bits set */
> +            return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +        }
> +        addr = PPN_PHYS(get_field(de, RISCV_IOMMU_DDTE_PPN));
> +    }
> +
> +    /* index into device context entry page */
> +    addr |= (ctx->devid * dc_len) & ~TARGET_PAGE_MASK;
> +
> +    memset(&dc, 0, sizeof(dc));
> +    if (dma_memory_read(s->target_as, addr, &dc, dc_len,
> +                        MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +        return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
> +    }
> +
> +    /* Set translation context. */
> +    ctx->tc = le64_to_cpu(dc.tc);
> +    ctx->gatp = le64_to_cpu(dc.iohgatp);
> +    ctx->satp = le64_to_cpu(dc.fsc);
> +    ctx->ta = le64_to_cpu(dc.ta);
> +    ctx->msiptp = le64_to_cpu(dc.msiptp);
> +    ctx->msi_addr_mask = le64_to_cpu(dc.msi_addr_mask);
> +    ctx->msi_addr_pattern = le64_to_cpu(dc.msi_addr_pattern);
> +
> +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_V)) {
> +        return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
> +    }
> +
> +    if (!riscv_iommu_validate_device_ctx(s, ctx)) {
> +        return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +    }
> +
> +    /* FSC field checks */
> +    mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
> +    addr = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_DC_FSC_PPN));
> +
> +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_PDTV)) {
> +        if (ctx->process_id != RISCV_IOMMU_NOPROCID) {
> +            /* PID is disabled */
> +            return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
> +        }
> +        if (mode > RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57) {
> +            /* Invalid translation mode */
> +            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
> +        }
> +        return 0;
> +    }
> +
> +    if (ctx->process_id == RISCV_IOMMU_NOPROCID) {
> +        if (!(ctx->tc & RISCV_IOMMU_DC_TC_DPE)) {
> +            /* No default process_id enabled, set BARE mode */
> +            ctx->satp = 0ULL;
> +            return 0;
> +        } else {
> +            /* Use default process_id #0 */
> +            ctx->process_id = 0;
> +        }
> +    }
> +
> +    if (mode == RISCV_IOMMU_DC_FSC_MODE_BARE) {
> +        /* No S-Stage translation, done. */
> +        return 0;
> +    }
> +
> +    /* FSC.TC.PDTV enabled */
> +    if (mode > RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20) {
> +        /* Invalid PDTP.MODE */
> +        return RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED;
> +    }
> +
> +    for (depth = mode - RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8; depth-- > 0; ) {
> +        /*
> +         * Select process id index bits based on process directory tree
> +         * level. See IOMMU Specification, 2.2. Process-Directory-Table.
> +         */
> +        const int split = depth * 9 + 8;
> +        addr |= ((ctx->process_id >> split) << 3) & ~TARGET_PAGE_MASK;
> +        if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
> +                            MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +            return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
> +        }
> +        le64_to_cpus(&de);
> +        if (!(de & RISCV_IOMMU_PC_TA_V)) {
> +            return RISCV_IOMMU_FQ_CAUSE_PDT_INVALID;
> +        }
> +        addr = PPN_PHYS(get_field(de, RISCV_IOMMU_PC_FSC_PPN));
> +    }
> +
> +    /* Leaf entry in PDT */
> +    addr |= (ctx->process_id << 4) & ~TARGET_PAGE_MASK;
> +    if (dma_memory_read(s->target_as, addr, &dc.ta, sizeof(uint64_t) * 2,
> +                        MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +        return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
> +    }
> +
> +    /* Use FSC and TA from process directory entry. */
> +    ctx->ta = le64_to_cpu(dc.ta);
> +    ctx->satp = le64_to_cpu(dc.fsc);
> +
> +    if (!(ctx->ta & RISCV_IOMMU_PC_TA_V)) {
> +        return RISCV_IOMMU_FQ_CAUSE_PDT_INVALID;
> +    }
> +
> +    if (!riscv_iommu_validate_process_ctx(s, ctx)) {
> +        return RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED;
> +    }
> +
> +    return 0;
> +}
> +
> +/* Translation Context cache support */
> +static gboolean __ctx_equal(gconstpointer v1, gconstpointer v2)
> +{
> +    RISCVIOMMUContext *c1 = (RISCVIOMMUContext *) v1;
> +    RISCVIOMMUContext *c2 = (RISCVIOMMUContext *) v2;
> +    return c1->devid == c2->devid &&
> +           c1->process_id == c2->process_id;
> +}
> +
> +static guint __ctx_hash(gconstpointer v)
> +{
> +    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) v;
> +    /*
> +     * Generate simple hash of (process_id, devid)
> +     * assuming 24-bit wide devid.
> +     */
> +    return (guint)(ctx->devid) + ((guint)(ctx->process_id) << 24);
> +}
> +
> +static void __ctx_inval_devid_procid(gpointer key, gpointer value,
> +                                     gpointer data)
> +{
> +    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
> +    RISCVIOMMUContext *arg = (RISCVIOMMUContext *) data;
> +    if (ctx->tc & RISCV_IOMMU_DC_TC_V &&
> +        ctx->devid == arg->devid &&
> +        ctx->process_id == arg->process_id) {
> +        ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
> +    }
> +}
> +
> +static void __ctx_inval_devid(gpointer key, gpointer value, gpointer data)
> +{
> +    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
> +    RISCVIOMMUContext *arg = (RISCVIOMMUContext *) data;
> +    if (ctx->tc & RISCV_IOMMU_DC_TC_V &&
> +        ctx->devid == arg->devid) {
> +        ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
> +    }
> +}
> +
> +static void __ctx_inval_all(gpointer key, gpointer value, gpointer data)
> +{
> +    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
> +    if (ctx->tc & RISCV_IOMMU_DC_TC_V) {
> +        ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
> +    }
> +}
> +
> +static void riscv_iommu_ctx_inval(RISCVIOMMUState *s, GHFunc func,
> +                                  uint32_t devid, uint32_t process_id)
> +{
> +    GHashTable *ctx_cache;
> +    RISCVIOMMUContext key = {
> +        .devid = devid,
> +        .process_id = process_id,
> +    };
> +    ctx_cache = g_hash_table_ref(s->ctx_cache);
> +    qemu_mutex_lock(&s->ctx_lock);
> +    g_hash_table_foreach(ctx_cache, func, &key);
> +    qemu_mutex_unlock(&s->ctx_lock);
> +    g_hash_table_unref(ctx_cache);
> +}
> +
> +/* Find or allocate translation context for a given {device_id, process_id} */
> +static RISCVIOMMUContext *riscv_iommu_ctx(RISCVIOMMUState *s,
> +                                          unsigned devid, unsigned process_id,
> +                                          void **ref)
> +{
> +    GHashTable *ctx_cache;
> +    RISCVIOMMUContext *ctx;
> +    RISCVIOMMUContext key = {
> +        .devid = devid,
> +        .process_id = process_id,
> +    };
> +
> +    ctx_cache = g_hash_table_ref(s->ctx_cache);
> +    qemu_mutex_lock(&s->ctx_lock);
> +    ctx = g_hash_table_lookup(ctx_cache, &key);
> +    qemu_mutex_unlock(&s->ctx_lock);
> +
> +    if (ctx && (ctx->tc & RISCV_IOMMU_DC_TC_V)) {
> +        *ref = ctx_cache;
> +        return ctx;
> +    }
> +
> +    ctx = g_new0(RISCVIOMMUContext, 1);
> +    ctx->devid = devid;
> +    ctx->process_id = process_id;
> +
> +    int fault = riscv_iommu_ctx_fetch(s, ctx);
> +    if (!fault) {
> +        qemu_mutex_lock(&s->ctx_lock);
> +        if (g_hash_table_size(ctx_cache) >= LIMIT_CACHE_CTX) {
> +            g_hash_table_unref(ctx_cache);
> +            ctx_cache = g_hash_table_new_full(__ctx_hash, __ctx_equal,
> +                                              g_free, NULL);
> +            g_hash_table_ref(ctx_cache);
> +            g_hash_table_unref(qatomic_xchg(&s->ctx_cache, ctx_cache));
> +        }
> +        g_hash_table_add(ctx_cache, ctx);
> +        qemu_mutex_unlock(&s->ctx_lock);
> +        *ref = ctx_cache;
> +        return ctx;
> +    }
> +
> +    g_hash_table_unref(ctx_cache);
> +    *ref = NULL;
> +
> +    riscv_iommu_report_fault(s, ctx, RISCV_IOMMU_FQ_TTYPE_UADDR_RD,
> +                             fault, !!process_id, 0, 0);
> +
> +    g_free(ctx);
> +    return NULL;
> +}
> +
> +static void riscv_iommu_ctx_put(RISCVIOMMUState *s, void *ref)
> +{
> +    if (ref) {
> +        g_hash_table_unref((GHashTable *)ref);
> +    }
> +}
> +
> +/* Find or allocate address space for a given device */
> +static AddressSpace *riscv_iommu_space(RISCVIOMMUState *s, uint32_t devid)
> +{
> +    RISCVIOMMUSpace *as;
> +
> +    /* FIXME: PCIe bus remapping for attached endpoints. */
> +    devid |= s->bus << 8;
> +
> +    qemu_mutex_lock(&s->core_lock);
> +    QLIST_FOREACH(as, &s->spaces, list) {
> +        if (as->devid == devid) {
> +            break;
> +        }
> +    }
> +    qemu_mutex_unlock(&s->core_lock);
> +
> +    if (as == NULL) {
> +        char name[64];
> +        as = g_new0(RISCVIOMMUSpace, 1);
> +
> +        as->iommu = s;
> +        as->devid = devid;
> +
> +        snprintf(name, sizeof(name), "riscv-iommu-%04x:%02x.%d-iova",
> +            PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid), PCI_FUNC(as->devid));
> +
> +        /* IOVA address space, untranslated addresses */
> +        memory_region_init_iommu(&as->iova_mr, sizeof(as->iova_mr),
> +            TYPE_RISCV_IOMMU_MEMORY_REGION,
> +            OBJECT(as), "riscv_iommu", UINT64_MAX);
> +        address_space_init(&as->iova_as, MEMORY_REGION(&as->iova_mr), name);
> +
> +        qemu_mutex_lock(&s->core_lock);
> +        QLIST_INSERT_HEAD(&s->spaces, as, list);
> +        qemu_mutex_unlock(&s->core_lock);
> +
> +        trace_riscv_iommu_new(s->parent_obj.id, PCI_BUS_NUM(as->devid),
> +                PCI_SLOT(as->devid), PCI_FUNC(as->devid));
> +    }
> +    return &as->iova_as;
> +}
> +
> +static int riscv_iommu_translate(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
> +    IOMMUTLBEntry *iotlb)
> +{
> +    bool enable_pid;
> +    bool enable_pri;
> +    int fault;
> +
> +    /*
> +     * TC[32] is reserved for custom extensions, used here to temporarily
> +     * enable automatic page-request generation for ATS queries.
> +     */
> +    enable_pri = (iotlb->perm == IOMMU_NONE) && (ctx->tc & BIT_ULL(32));
> +    enable_pid = (ctx->tc & RISCV_IOMMU_DC_TC_PDTV);
> +
> +    /* Translate using device directory / page table information. */
> +    fault = riscv_iommu_spa_fetch(s, ctx, iotlb);
> +
> +    if (enable_pri && fault) {
> +        struct riscv_iommu_pq_record pr = {0};
> +        if (enable_pid) {
> +            pr.hdr = set_field(RISCV_IOMMU_PREQ_HDR_PV,
> +                               RISCV_IOMMU_PREQ_HDR_PID, ctx->process_id);
> +        }
> +        pr.hdr = set_field(pr.hdr, RISCV_IOMMU_PREQ_HDR_DID, ctx->devid);
> +        pr.payload = (iotlb->iova & TARGET_PAGE_MASK) |
> +                     RISCV_IOMMU_PREQ_PAYLOAD_M;
> +        riscv_iommu_pri(s, &pr);
> +        return fault;
> +    }
> +
> +    if (fault) {
> +        unsigned ttype;
> +
> +        if (iotlb->perm & IOMMU_RW) {
> +            ttype = RISCV_IOMMU_FQ_TTYPE_UADDR_WR;
> +        } else {
> +            ttype = RISCV_IOMMU_FQ_TTYPE_UADDR_RD;
> +        }
> +
> +        riscv_iommu_report_fault(s, ctx, ttype, fault, enable_pid,
> +                                 iotlb->iova, iotlb->translated_addr);
> +        return fault;
> +    }
> +
> +    return 0;
> +}
> +
> +/* IOMMU Command Interface */
> +static MemTxResult riscv_iommu_iofence(RISCVIOMMUState *s, bool notify,
> +    uint64_t addr, uint32_t data)
> +{
> +    /*
> +     * ATS processing in this implementation of the IOMMU is synchronous,
> +     * no need to wait for completions here.
> +     */
> +    if (!notify) {
> +        return MEMTX_OK;
> +    }
> +
> +    return dma_memory_write(s->target_as, addr, &data, sizeof(data),
> +        MEMTXATTRS_UNSPECIFIED);
> +}
> +
> +static void riscv_iommu_process_ddtp(RISCVIOMMUState *s)
> +{
> +    uint64_t old_ddtp = s->ddtp;
> +    uint64_t new_ddtp = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_DDTP);
> +    unsigned new_mode = get_field(new_ddtp, RISCV_IOMMU_DDTP_MODE);
> +    unsigned old_mode = get_field(old_ddtp, RISCV_IOMMU_DDTP_MODE);
> +    bool ok = false;
> +
> +    /*
> +     * Check for allowed DDTP.MODE transitions:
> +     * {OFF, BARE}        -> {OFF, BARE, 1LVL, 2LVL, 3LVL}
> +     * {1LVL, 2LVL, 3LVL} -> {OFF, BARE}
> +     */
> +    if (new_mode == old_mode ||
> +        new_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
> +        new_mode == RISCV_IOMMU_DDTP_MODE_BARE) {
> +        ok = true;
> +    } else if (new_mode == RISCV_IOMMU_DDTP_MODE_1LVL ||
> +               new_mode == RISCV_IOMMU_DDTP_MODE_2LVL ||
> +               new_mode == RISCV_IOMMU_DDTP_MODE_3LVL) {
> +        ok = old_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
> +             old_mode == RISCV_IOMMU_DDTP_MODE_BARE;
> +    }
> +
> +    if (ok) {
> +        /* clear reserved and busy bits, report back sanitized version */
> +        new_ddtp = set_field(new_ddtp & RISCV_IOMMU_DDTP_PPN,
> +                             RISCV_IOMMU_DDTP_MODE, new_mode);
> +    } else {
> +        new_ddtp = old_ddtp;
> +    }
> +    s->ddtp = new_ddtp;
> +
> +    riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_DDTP, new_ddtp);
> +}
> +
> +/* Command function and opcode field. */
> +#define RISCV_IOMMU_CMD(func, op) (((func) << 7) | (op))
> +
> +static void riscv_iommu_process_cq_tail(RISCVIOMMUState *s)
> +{
> +    struct riscv_iommu_command cmd;
> +    MemTxResult res;
> +    dma_addr_t addr;
> +    uint32_t tail, head, ctrl;
> +    uint64_t cmd_opcode;
> +    GHFunc func;
> +
> +    ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
> +    tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQT) & s->cq_mask;
> +    head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQH) & s->cq_mask;
> +
> +    /* Check for pending error or queue processing disabled */
> +    if (!(ctrl & RISCV_IOMMU_CQCSR_CQON) ||
> +        !!(ctrl & (RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CQMF))) {
> +        return;
> +    }
> +
> +    while (tail != head) {
> +        addr = s->cq_addr  + head * sizeof(cmd);
> +        res = dma_memory_read(s->target_as, addr, &cmd, sizeof(cmd),
> +                              MEMTXATTRS_UNSPECIFIED);
> +
> +        if (res != MEMTX_OK) {
> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
> +                                  RISCV_IOMMU_CQCSR_CQMF, 0);
> +            goto fault;
> +        }
> +
> +        trace_riscv_iommu_cmd(s->parent_obj.id, cmd.dword0, cmd.dword1);
> +
> +        cmd_opcode = get_field(cmd.dword0,
> +                               RISCV_IOMMU_CMD_OPCODE | RISCV_IOMMU_CMD_FUNC);
> +
> +        switch (cmd_opcode) {
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOFENCE_FUNC_C,
> +                             RISCV_IOMMU_CMD_IOFENCE_OPCODE):
> +            res = riscv_iommu_iofence(s,
> +                cmd.dword0 & RISCV_IOMMU_CMD_IOFENCE_AV, cmd.dword1,
> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IOFENCE_DATA));
> +
> +            if (res != MEMTX_OK) {
> +                riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
> +                                      RISCV_IOMMU_CQCSR_CQMF, 0);
> +                goto fault;
> +            }
> +            break;
> +
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA,
> +                             RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
> +            if (cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_PSCV) {
> +                /* illegal command arguments IOTINVAL.GVMA & PSCV == 1 */
> +                goto cmd_ill;
> +            }
> +            /* translation cache not implemented yet */
> +            break;
> +
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA,
> +                             RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
> +            /* translation cache not implemented yet */
> +            break;
> +
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT,
> +                             RISCV_IOMMU_CMD_IODIR_OPCODE):
> +            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
> +                /* invalidate all device context cache mappings */
> +                func = __ctx_inval_all;
> +            } else {
> +                /* invalidate all device contexts matching DID */
> +                func = __ctx_inval_devid;
> +            }
> +            riscv_iommu_ctx_inval(s, func,
> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID), 0);
> +            break;
> +
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT,
> +                             RISCV_IOMMU_CMD_IODIR_OPCODE):
> +            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
> +                /* illegal command arguments IODIR_PDT & DV == 0 */
> +                goto cmd_ill;
> +            } else {
> +                func = __ctx_inval_devid_procid;
> +            }
> +            riscv_iommu_ctx_inval(s, func,
> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID),
> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_PID));
> +            break;
> +
> +        default:
> +        cmd_ill:
> +            /* Invalid instruction, do not advance instruction index. */
> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
> +                RISCV_IOMMU_CQCSR_CMD_ILL, 0);
> +            goto fault;
> +        }
> +
> +        /* Advance and update head pointer after command completes. */
> +        head = (head + 1) & s->cq_mask;
> +        riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_CQH, head);
> +    }
> +    return;
> +
> +fault:
> +    if (ctrl & RISCV_IOMMU_CQCSR_CIE) {
> +        riscv_iommu_notify(s, RISCV_IOMMU_INTR_CQ);
> +    }
> +}
> +
> +static void riscv_iommu_process_cq_control(RISCVIOMMUState *s)
> +{
> +    uint64_t base;
> +    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
> +    uint32_t ctrl_clr;
> +    bool enable = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQEN);
> +    bool active = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQON);
> +
> +    if (enable && !active) {
> +        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_CQB);
> +        s->cq_mask = (2ULL << get_field(base, RISCV_IOMMU_CQB_LOG2SZ)) - 1;
> +        s->cq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_CQB_PPN));
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~s->cq_mask);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQH], 0);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQT], 0);
> +        ctrl_set = RISCV_IOMMU_CQCSR_CQON;
> +        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQMF |
> +                   RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CMD_TO |
> +                   RISCV_IOMMU_CQCSR_FENCE_W_IP;
> +    } else if (!enable && active) {
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~0);
> +        ctrl_set = 0;
> +        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQON;
> +    } else {
> +        ctrl_set = 0;
> +        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY;
> +    }
> +
> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR, ctrl_set, ctrl_clr);
> +}
> +
> +static void riscv_iommu_process_fq_control(RISCVIOMMUState *s)
> +{
> +    uint64_t base;
> +    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
> +    uint32_t ctrl_clr;
> +    bool enable = !!(ctrl_set & RISCV_IOMMU_FQCSR_FQEN);
> +    bool active = !!(ctrl_set & RISCV_IOMMU_FQCSR_FQON);
> +
> +    if (enable && !active) {
> +        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_FQB);
> +        s->fq_mask = (2ULL << get_field(base, RISCV_IOMMU_FQB_LOG2SZ)) - 1;
> +        s->fq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_FQB_PPN));
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~s->fq_mask);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQH], 0);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQT], 0);
> +        ctrl_set = RISCV_IOMMU_FQCSR_FQON;
> +        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQMF |
> +            RISCV_IOMMU_FQCSR_FQOF;
> +    } else if (!enable && active) {
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~0);
> +        ctrl_set = 0;
> +        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQON;
> +    } else {
> +        ctrl_set = 0;
> +        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY;
> +    }
> +
> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR, ctrl_set, ctrl_clr);
> +}
> +
> +static void riscv_iommu_process_pq_control(RISCVIOMMUState *s)
> +{
> +    uint64_t base;
> +    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
> +    uint32_t ctrl_clr;
> +    bool enable = !!(ctrl_set & RISCV_IOMMU_PQCSR_PQEN);
> +    bool active = !!(ctrl_set & RISCV_IOMMU_PQCSR_PQON);
> +
> +    if (enable && !active) {
> +        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_PQB);
> +        s->pq_mask = (2ULL << get_field(base, RISCV_IOMMU_PQB_LOG2SZ)) - 1;
> +        s->pq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_PQB_PPN));
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~s->pq_mask);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQH], 0);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQT], 0);
> +        ctrl_set = RISCV_IOMMU_PQCSR_PQON;
> +        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQMF |
> +            RISCV_IOMMU_PQCSR_PQOF;
> +    } else if (!enable && active) {
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~0);
> +        ctrl_set = 0;
> +        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQON;
> +    } else {
> +        ctrl_set = 0;
> +        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY;
> +    }
> +
> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR, ctrl_set, ctrl_clr);
> +}
> +
> +static void riscv_iommu_process_icvec_update(RISCVIOMMUState *s)
> +{
> +    uint32_t icvec = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_ICVEC);
> +
> +    s->icvec_update(s,
> +                    riscv_iommu_get_icvec_vector(icvec, RISCV_IOMMU_INTR_CQ),
> +                    riscv_iommu_get_icvec_vector(icvec, RISCV_IOMMU_INTR_FQ),
> +                    riscv_iommu_get_icvec_vector(icvec, RISCV_IOMMU_INTR_PM),
> +                    riscv_iommu_get_icvec_vector(icvec, RISCV_IOMMU_INTR_PQ));
> +}
> +
> +typedef void riscv_iommu_process_fn(RISCVIOMMUState *s);
> +
> +static void riscv_iommu_update_ipsr(RISCVIOMMUState *s, uint64_t data)
> +{
> +    uint32_t cqcsr, fqcsr, pqcsr;
> +    uint32_t ipsr_set = 0;
> +    uint32_t ipsr_clr = 0;
> +
> +    if (data & RISCV_IOMMU_IPSR_CIP) {
> +        cqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
> +
> +        if (cqcsr & RISCV_IOMMU_CQCSR_CIE &&
> +            (cqcsr & RISCV_IOMMU_CQCSR_FENCE_W_IP ||
> +             cqcsr & RISCV_IOMMU_CQCSR_CMD_ILL ||
> +             cqcsr & RISCV_IOMMU_CQCSR_CMD_TO ||
> +             cqcsr & RISCV_IOMMU_CQCSR_CQMF)) {
> +            ipsr_set |= RISCV_IOMMU_IPSR_CIP;
> +        } else {
> +            ipsr_clr |= RISCV_IOMMU_IPSR_CIP;
> +        }
> +    } else {
> +        ipsr_clr |= RISCV_IOMMU_IPSR_CIP;
> +    }
> +
> +    if (data & RISCV_IOMMU_IPSR_FIP) {
> +        fqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
> +
> +        if (fqcsr & RISCV_IOMMU_FQCSR_FIE &&
> +            (fqcsr & RISCV_IOMMU_FQCSR_FQOF ||
> +             fqcsr & RISCV_IOMMU_FQCSR_FQMF)) {
> +            ipsr_set |= RISCV_IOMMU_IPSR_FIP;
> +        } else {
> +            ipsr_clr |= RISCV_IOMMU_IPSR_FIP;
> +        }
> +    } else {
> +        ipsr_clr |= RISCV_IOMMU_IPSR_FIP;
> +    }
> +
> +    if (data & RISCV_IOMMU_IPSR_PIP) {
> +        pqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
> +
> +        if (pqcsr & RISCV_IOMMU_PQCSR_PIE &&
> +            (pqcsr & RISCV_IOMMU_PQCSR_PQOF ||
> +             pqcsr & RISCV_IOMMU_PQCSR_PQMF)) {
> +            ipsr_set |= RISCV_IOMMU_IPSR_PIP;
> +        } else {
> +            ipsr_clr |= RISCV_IOMMU_IPSR_PIP;
> +        }
> +    } else {
> +        ipsr_clr |= RISCV_IOMMU_IPSR_PIP;
> +    }
> +
> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, ipsr_set, ipsr_clr);
> +}
> +
> +static MemTxResult riscv_iommu_mmio_write(void *opaque, hwaddr addr,
> +    uint64_t data, unsigned size, MemTxAttrs attrs)
> +{
> +    riscv_iommu_process_fn *process_fn = NULL;
> +    RISCVIOMMUState *s = opaque;
> +    uint32_t regb = addr & ~3;
> +    uint32_t busy = 0;
> +    uint64_t val = 0;
> +
> +    if ((addr & (size - 1)) != 0) {
> +        /* Unsupported MMIO alignment or access size */
> +        return MEMTX_ERROR;
> +    }
> +
> +    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
> +        /* Unsupported MMIO access location. */
> +        return MEMTX_ACCESS_ERROR;
> +    }
> +
> +    /* Track actionable MMIO write. */
> +    switch (regb) {
> +    case RISCV_IOMMU_REG_DDTP:
> +    case RISCV_IOMMU_REG_DDTP + 4:
> +        process_fn = riscv_iommu_process_ddtp;
> +        regb = RISCV_IOMMU_REG_DDTP;
> +        busy = RISCV_IOMMU_DDTP_BUSY;
> +        break;
> +
> +    case RISCV_IOMMU_REG_CQT:
> +        process_fn = riscv_iommu_process_cq_tail;
> +        break;
> +
> +    case RISCV_IOMMU_REG_CQCSR:
> +        process_fn = riscv_iommu_process_cq_control;
> +        busy = RISCV_IOMMU_CQCSR_BUSY;
> +        break;
> +
> +    case RISCV_IOMMU_REG_FQCSR:
> +        process_fn = riscv_iommu_process_fq_control;
> +        busy = RISCV_IOMMU_FQCSR_BUSY;
> +        break;
> +
> +    case RISCV_IOMMU_REG_PQCSR:
> +        process_fn = riscv_iommu_process_pq_control;
> +        busy = RISCV_IOMMU_PQCSR_BUSY;
> +        break;
> +
> +    case RISCV_IOMMU_REG_ICVEC:
> +        process_fn = riscv_iommu_process_icvec_update;
> +        break;
> +
> +    case RISCV_IOMMU_REG_IPSR:
> +        /*
> +         * IPSR has special procedures to update. Execute it
> +         * and exit.
> +         */
> +        if (size == 4) {
> +            uint32_t ro = ldl_le_p(&s->regs_ro[addr]);
> +            uint32_t wc = ldl_le_p(&s->regs_wc[addr]);
> +            uint32_t rw = ldl_le_p(&s->regs_rw[addr]);
> +            stl_le_p(&val, ((rw & ro) | (data & ~ro)) & ~(data & wc));
> +        } else if (size == 8) {
> +            uint64_t ro = ldq_le_p(&s->regs_ro[addr]);
> +            uint64_t wc = ldq_le_p(&s->regs_wc[addr]);
> +            uint64_t rw = ldq_le_p(&s->regs_rw[addr]);
> +            stq_le_p(&val, ((rw & ro) | (data & ~ro)) & ~(data & wc));
> +        }
> +
> +        riscv_iommu_update_ipsr(s, val);
> +
> +        return MEMTX_OK;
> +
> +    default:
> +        break;
> +    }
> +
> +    /*
> +     * Registers update might be not synchronized with core logic.
> +     * If system software updates register when relevant BUSY bit
> +     * is set IOMMU behavior of additional writes to the register
> +     * is UNSPECIFIED.
> +     */
> +    qemu_spin_lock(&s->regs_lock);
> +    if (size == 1) {
> +        uint8_t ro = s->regs_ro[addr];
> +        uint8_t wc = s->regs_wc[addr];
> +        uint8_t rw = s->regs_rw[addr];
> +        s->regs_rw[addr] = ((rw & ro) | (data & ~ro)) & ~(data & wc);
> +    } else if (size == 2) {
> +        uint16_t ro = lduw_le_p(&s->regs_ro[addr]);
> +        uint16_t wc = lduw_le_p(&s->regs_wc[addr]);
> +        uint16_t rw = lduw_le_p(&s->regs_rw[addr]);
> +        stw_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
> +    } else if (size == 4) {
> +        uint32_t ro = ldl_le_p(&s->regs_ro[addr]);
> +        uint32_t wc = ldl_le_p(&s->regs_wc[addr]);
> +        uint32_t rw = ldl_le_p(&s->regs_rw[addr]);
> +        stl_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
> +    } else if (size == 8) {
> +        uint64_t ro = ldq_le_p(&s->regs_ro[addr]);
> +        uint64_t wc = ldq_le_p(&s->regs_wc[addr]);
> +        uint64_t rw = ldq_le_p(&s->regs_rw[addr]);
> +        stq_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
> +    }
> +
> +    /* Busy flag update, MSB 4-byte register. */
> +    if (busy) {
> +        uint32_t rw = ldl_le_p(&s->regs_rw[regb]);
> +        stl_le_p(&s->regs_rw[regb], rw | busy);
> +    }
> +    qemu_spin_unlock(&s->regs_lock);
> +
> +    if (process_fn) {
> +        qemu_mutex_lock(&s->core_lock);
> +        process_fn(s);
> +        qemu_mutex_unlock(&s->core_lock);
> +    }
> +
> +    return MEMTX_OK;
> +}
> +
> +static MemTxResult riscv_iommu_mmio_read(void *opaque, hwaddr addr,
> +    uint64_t *data, unsigned size, MemTxAttrs attrs)
> +{
> +    RISCVIOMMUState *s = opaque;
> +    uint64_t val = -1;
> +    uint8_t *ptr;
> +
> +    if ((addr & (size - 1)) != 0) {
> +        /* Unsupported MMIO alignment. */
> +        return MEMTX_ERROR;
> +    }
> +
> +    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
> +        return MEMTX_ACCESS_ERROR;
> +    }
> +
> +    ptr = &s->regs_rw[addr];
> +
> +    if (size == 1) {
> +        val = (uint64_t)*ptr;
> +    } else if (size == 2) {
> +        val = lduw_le_p(ptr);
> +    } else if (size == 4) {
> +        val = ldl_le_p(ptr);
> +    } else if (size == 8) {
> +        val = ldq_le_p(ptr);
> +    } else {
> +        return MEMTX_ERROR;
> +    }
> +
> +    *data = val;
> +
> +    return MEMTX_OK;
> +}
> +
> +static const MemoryRegionOps riscv_iommu_mmio_ops = {
> +    .read_with_attrs = riscv_iommu_mmio_read,
> +    .write_with_attrs = riscv_iommu_mmio_write,
> +    .endianness = DEVICE_NATIVE_ENDIAN,
> +    .impl = {
> +        .min_access_size = 4,
> +        .max_access_size = 8,
> +        .unaligned = false,
> +    },
> +    .valid = {
> +        .min_access_size = 4,
> +        .max_access_size = 8,
> +    }
> +};
> +
> +/*
> + * Translations matching MSI pattern check are redirected to "riscv-iommu-trap"
> + * memory region as untranslated address, for additional MSI/MRIF interception
> + * by IOMMU interrupt remapping implementation.
> + * Note: Device emulation code generating an MSI is expected to provide valid
> + * memory transaction attributes with requester_id set.
> + */
> +static MemTxResult riscv_iommu_trap_write(void *opaque, hwaddr addr,
> +    uint64_t data, unsigned size, MemTxAttrs attrs)
> +{
> +    RISCVIOMMUState* s = (RISCVIOMMUState *)opaque;
> +    RISCVIOMMUContext *ctx;
> +    MemTxResult res;
> +    void *ref;
> +    uint32_t devid = attrs.requester_id;
> +
> +    if (attrs.unspecified) {
> +        return MEMTX_ACCESS_ERROR;
> +    }
> +
> +    /* FIXME: PCIe bus remapping for attached endpoints. */
> +    devid |= s->bus << 8;
> +
> +    ctx = riscv_iommu_ctx(s, devid, 0, &ref);
> +    if (ctx == NULL) {
> +        res = MEMTX_ACCESS_ERROR;
> +    } else {
> +        res = riscv_iommu_msi_write(s, ctx, addr, data, size, attrs);
> +    }
> +    riscv_iommu_ctx_put(s, ref);
> +    return res;
> +}
> +
> +static MemTxResult riscv_iommu_trap_read(void *opaque, hwaddr addr,
> +    uint64_t *data, unsigned size, MemTxAttrs attrs)
> +{
> +    return MEMTX_ACCESS_ERROR;
> +}
> +
> +static const MemoryRegionOps riscv_iommu_trap_ops = {
> +    .read_with_attrs = riscv_iommu_trap_read,
> +    .write_with_attrs = riscv_iommu_trap_write,
> +    .endianness = DEVICE_LITTLE_ENDIAN,
> +    .impl = {
> +        .min_access_size = 4,
> +        .max_access_size = 8,
> +        .unaligned = true,
> +    },
> +    .valid = {
> +        .min_access_size = 4,
> +        .max_access_size = 8,
> +    }
> +};
> +
> +static void riscv_iommu_realize(DeviceState *dev, Error **errp)
> +{
> +    RISCVIOMMUState *s = RISCV_IOMMU(dev);
> +
> +    s->cap = s->version & RISCV_IOMMU_CAP_VERSION;
> +    if (s->enable_msi) {
> +        s->cap |= RISCV_IOMMU_CAP_MSI_FLAT | RISCV_IOMMU_CAP_MSI_MRIF;
> +    }
> +    if (s->enable_s_stage) {
> +        s->cap |= RISCV_IOMMU_CAP_SV32 | RISCV_IOMMU_CAP_SV39 |
> +                  RISCV_IOMMU_CAP_SV48 | RISCV_IOMMU_CAP_SV57;
> +    }
> +    if (s->enable_g_stage) {
> +        s->cap |= RISCV_IOMMU_CAP_SV32X4 | RISCV_IOMMU_CAP_SV39X4 |
> +                  RISCV_IOMMU_CAP_SV48X4 | RISCV_IOMMU_CAP_SV57X4;
> +    }
> +    /* Report QEMU target physical address space limits */
> +    s->cap = set_field(s->cap, RISCV_IOMMU_CAP_PAS,
> +                       TARGET_PHYS_ADDR_SPACE_BITS);
> +
> +    /* TODO: method to report supported PID bits */
> +    s->pid_bits = 8; /* restricted to size of MemTxAttrs.pid */
> +    s->cap |= RISCV_IOMMU_CAP_PD8;
> +
> +    /* Out-of-reset translation mode: OFF (DMA disabled) BARE (passthrough) */
> +    s->ddtp = set_field(0, RISCV_IOMMU_DDTP_MODE, s->enable_off ?
> +                        RISCV_IOMMU_DDTP_MODE_OFF : RISCV_IOMMU_DDTP_MODE_BARE);
> +
> +    /* register storage */
> +    s->regs_rw = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
> +    s->regs_ro = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
> +    s->regs_wc = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
> +
> +     /* Mark all registers read-only */
> +    memset(s->regs_ro, 0xff, RISCV_IOMMU_REG_SIZE);
> +
> +    /*
> +     * Register complete MMIO space, including MSI/PBA registers.
> +     * Note, PCIDevice implementation will add overlapping MR for MSI/PBA,
> +     * managed directly by the PCIDevice implementation.
> +     */
> +    memory_region_init_io(&s->regs_mr, OBJECT(dev), &riscv_iommu_mmio_ops, s,
> +        "riscv-iommu-regs", RISCV_IOMMU_REG_SIZE);
> +
> +    /* Set power-on register state */
> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_CAP], s->cap);
> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_FCTL], 0);
> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FCTL],
> +             ~(RISCV_IOMMU_FCTL_BE | RISCV_IOMMU_FCTL_WSI));
> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_DDTP],
> +        ~(RISCV_IOMMU_DDTP_PPN | RISCV_IOMMU_DDTP_MODE));
> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQB],
> +        ~(RISCV_IOMMU_CQB_LOG2SZ | RISCV_IOMMU_CQB_PPN));
> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQB],
> +        ~(RISCV_IOMMU_FQB_LOG2SZ | RISCV_IOMMU_FQB_PPN));
> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQB],
> +        ~(RISCV_IOMMU_PQB_LOG2SZ | RISCV_IOMMU_PQB_PPN));
> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQMF |
> +        RISCV_IOMMU_CQCSR_CMD_TO | RISCV_IOMMU_CQCSR_CMD_ILL);
> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQON |
> +        RISCV_IOMMU_CQCSR_BUSY);
> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQMF |
> +        RISCV_IOMMU_FQCSR_FQOF);
> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQON |
> +        RISCV_IOMMU_FQCSR_BUSY);
> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQMF |
> +        RISCV_IOMMU_PQCSR_PQOF);
> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQON |
> +        RISCV_IOMMU_PQCSR_BUSY);
> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_IPSR], ~0);
> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_ICVEC], 0);
> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_DDTP], s->ddtp);
> +
> +    /* Memory region for downstream access, if specified. */
> +    if (s->target_mr) {
> +        s->target_as = g_new0(AddressSpace, 1);
> +        address_space_init(s->target_as, s->target_mr,
> +            "riscv-iommu-downstream");
> +    } else {
> +        /* Fallback to global system memory. */
> +        s->target_as = &address_space_memory;
> +    }
> +
> +    /* Memory region for untranslated MRIF/MSI writes */
> +    memory_region_init_io(&s->trap_mr, OBJECT(dev), &riscv_iommu_trap_ops, s,
> +            "riscv-iommu-trap", ~0ULL);
> +    address_space_init(&s->trap_as, &s->trap_mr, "riscv-iommu-trap-as");
> +
> +    /* Device translation context cache */
> +    s->ctx_cache = g_hash_table_new_full(__ctx_hash, __ctx_equal,
> +                                         g_free, NULL);
> +    qemu_mutex_init(&s->ctx_lock);
> +
> +    s->iommus.le_next = NULL;
> +    s->iommus.le_prev = NULL;
> +    QLIST_INIT(&s->spaces);
> +    qemu_mutex_init(&s->core_lock);
> +    qemu_spin_init(&s->regs_lock);
> +}
> +
> +static void riscv_iommu_unrealize(DeviceState *dev)
> +{
> +    RISCVIOMMUState *s = RISCV_IOMMU(dev);
> +
> +    qemu_mutex_destroy(&s->core_lock);
> +    g_hash_table_unref(s->ctx_cache);
> +}
> +
> +static Property riscv_iommu_properties[] = {
> +    DEFINE_PROP_UINT32("version", RISCVIOMMUState, version,
> +        RISCV_IOMMU_SPEC_DOT_VER),
> +    DEFINE_PROP_UINT32("bus", RISCVIOMMUState, bus, 0x0),
> +    DEFINE_PROP_BOOL("intremap", RISCVIOMMUState, enable_msi, TRUE),
> +    DEFINE_PROP_BOOL("off", RISCVIOMMUState, enable_off, TRUE),
> +    DEFINE_PROP_BOOL("s-stage", RISCVIOMMUState, enable_s_stage, TRUE),
> +    DEFINE_PROP_BOOL("g-stage", RISCVIOMMUState, enable_g_stage, TRUE),
> +    DEFINE_PROP_LINK("downstream-mr", RISCVIOMMUState, target_mr,
> +        TYPE_MEMORY_REGION, MemoryRegion *),
> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
> +static void riscv_iommu_class_init(ObjectClass *klass, void* data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +
> +    /* internal device for riscv-iommu-{pci/sys}, not user-creatable */
> +    dc->user_creatable = false;
> +    dc->realize = riscv_iommu_realize;
> +    dc->unrealize = riscv_iommu_unrealize;
> +    device_class_set_props(dc, riscv_iommu_properties);
> +}
> +
> +static const TypeInfo riscv_iommu_info = {
> +    .name = TYPE_RISCV_IOMMU,
> +    .parent = TYPE_DEVICE,
> +    .instance_size = sizeof(RISCVIOMMUState),
> +    .class_init = riscv_iommu_class_init,
> +};
> +
> +static const char *IOMMU_FLAG_STR[] = {
> +    "NA",
> +    "RO",
> +    "WR",
> +    "RW",
> +};
> +
> +/* RISC-V IOMMU Memory Region - Address Translation Space */
> +static IOMMUTLBEntry riscv_iommu_memory_region_translate(
> +    IOMMUMemoryRegion *iommu_mr, hwaddr addr,
> +    IOMMUAccessFlags flag, int iommu_idx)
> +{
> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
> +    RISCVIOMMUContext *ctx;
> +    void *ref;
> +    IOMMUTLBEntry iotlb = {
> +        .iova = addr,
> +        .target_as = as->iommu->target_as,
> +        .addr_mask = ~0ULL,
> +        .perm = flag,
> +    };
> +
> +    ctx = riscv_iommu_ctx(as->iommu, as->devid, iommu_idx, &ref);
> +    if (ctx == NULL) {
> +        /* Translation disabled or invalid. */
> +        iotlb.addr_mask = 0;
> +        iotlb.perm = IOMMU_NONE;
> +    } else if (riscv_iommu_translate(as->iommu, ctx, &iotlb)) {
> +        /* Translation disabled or fault reported. */
> +        iotlb.addr_mask = 0;
> +        iotlb.perm = IOMMU_NONE;
> +    }
> +
> +    /* Trace all dma translations with original access flags. */
> +    trace_riscv_iommu_dma(as->iommu->parent_obj.id, PCI_BUS_NUM(as->devid),
> +                          PCI_SLOT(as->devid), PCI_FUNC(as->devid), iommu_idx,
> +                          IOMMU_FLAG_STR[flag & IOMMU_RW], iotlb.iova,
> +                          iotlb.translated_addr);
> +
> +    riscv_iommu_ctx_put(as->iommu, ref);
> +
> +    return iotlb;
> +}
> +
> +static int riscv_iommu_memory_region_notify(
> +    IOMMUMemoryRegion *iommu_mr, IOMMUNotifierFlag old,
> +    IOMMUNotifierFlag new, Error **errp)
> +{
> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
> +
> +    if (old == IOMMU_NOTIFIER_NONE) {
> +        as->notifier = true;
> +        trace_riscv_iommu_notifier_add(iommu_mr->parent_obj.name);
> +    } else if (new == IOMMU_NOTIFIER_NONE) {
> +        as->notifier = false;
> +        trace_riscv_iommu_notifier_del(iommu_mr->parent_obj.name);
> +    }
> +
> +    return 0;
> +}
> +
> +static inline bool pci_is_iommu(PCIDevice *pdev)
> +{
> +    return pci_get_word(pdev->config + PCI_CLASS_DEVICE) == 0x0806;
> +}
> +
> +static AddressSpace *riscv_iommu_find_as(PCIBus *bus, void *opaque, int devfn)
> +{
> +    RISCVIOMMUState *s = (RISCVIOMMUState *) opaque;
> +    PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
> +    AddressSpace *as = NULL;
> +
> +    if (pdev && pci_is_iommu(pdev)) {
> +        return s->target_as;
> +    }
> +
> +    /* Find first registered IOMMU device */
> +    while (s->iommus.le_prev) {
> +        s = *(s->iommus.le_prev);
> +    }
> +
> +    /* Find first matching IOMMU */
> +    while (s != NULL && as == NULL) {
> +        as = riscv_iommu_space(s, PCI_BUILD_BDF(pci_bus_num(bus), devfn));
> +        s = s->iommus.le_next;
> +    }
> +
> +    return as ? as : &address_space_memory;
> +}
> +
> +static const PCIIOMMUOps riscv_iommu_ops = {
> +    .get_address_space = riscv_iommu_find_as,
> +};
> +
> +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
> +        Error **errp)
> +{
> +    if (bus->iommu_ops &&
> +        bus->iommu_ops->get_address_space == riscv_iommu_find_as) {
> +        /* Allow multiple IOMMUs on the same PCIe bus, link known devices */
> +        RISCVIOMMUState *last = (RISCVIOMMUState *)bus->iommu_opaque;
> +        QLIST_INSERT_AFTER(last, iommu, iommus);
> +    } else if (!bus->iommu_ops && !bus->iommu_opaque) {
> +        pci_setup_iommu(bus, &riscv_iommu_ops, iommu);
> +    } else {
> +        error_setg(errp, "can't register secondary IOMMU for PCI bus #%d",
> +            pci_bus_num(bus));
> +    }
> +}
> +
> +static int riscv_iommu_memory_region_index(IOMMUMemoryRegion *iommu_mr,
> +    MemTxAttrs attrs)
> +{
> +    return attrs.unspecified ? RISCV_IOMMU_NOPROCID : (int)attrs.pid;
> +}
> +
> +static int riscv_iommu_memory_region_index_len(IOMMUMemoryRegion *iommu_mr)
> +{
> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
> +    return 1 << as->iommu->pid_bits;
> +}
> +
> +static void riscv_iommu_memory_region_init(ObjectClass *klass, void *data)
> +{
> +    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
> +
> +    imrc->translate = riscv_iommu_memory_region_translate;
> +    imrc->notify_flag_changed = riscv_iommu_memory_region_notify;
> +    imrc->attrs_to_index = riscv_iommu_memory_region_index;
> +    imrc->num_indexes = riscv_iommu_memory_region_index_len;
> +}
> +
> +static const TypeInfo riscv_iommu_memory_region_info = {
> +    .parent = TYPE_IOMMU_MEMORY_REGION,
> +    .name = TYPE_RISCV_IOMMU_MEMORY_REGION,
> +    .class_init = riscv_iommu_memory_region_init,
> +};
> +
> +static void riscv_iommu_register_mr_types(void)
> +{
> +    type_register_static(&riscv_iommu_memory_region_info);
> +    type_register_static(&riscv_iommu_info);
> +}
> +
> +type_init(riscv_iommu_register_mr_types);
> diff --git a/hw/riscv/riscv-iommu.h b/hw/riscv/riscv-iommu.h
> new file mode 100644
> index 0000000000..6d76cb9b1a
> --- /dev/null
> +++ b/hw/riscv/riscv-iommu.h
> @@ -0,0 +1,148 @@
> +/*
> + * QEMU emulation of an RISC-V IOMMU
> + *
> + * Copyright (C) 2022-2023 Rivos Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef HW_RISCV_IOMMU_STATE_H
> +#define HW_RISCV_IOMMU_STATE_H
> +
> +#include "qemu/osdep.h"
> +#include "qom/object.h"
> +
> +#include "hw/riscv/iommu.h"
> +
> +struct RISCVIOMMUState {
> +    /*< private >*/
> +    DeviceState parent_obj;
> +
> +    /*< public >*/
> +    uint32_t version;     /* Reported interface version number */
> +    uint32_t pid_bits;    /* process identifier width */
> +    uint32_t bus;         /* PCI bus mapping for non-root endpoints */
> +
> +    uint64_t cap;         /* IOMMU supported capabilities */
> +    uint64_t fctl;        /* IOMMU enabled features */
> +
> +    bool enable_off;      /* Enable out-of-reset OFF mode (DMA disabled) */
> +    bool enable_msi;      /* Enable MSI remapping */
> +    bool enable_s_stage;  /* Enable S/VS-Stage translation */
> +    bool enable_g_stage;  /* Enable G-Stage translation */
> +
> +    /* IOMMU Internal State */
> +    uint64_t ddtp;        /* Validated Device Directory Tree Root Pointer */
> +
> +    dma_addr_t cq_addr;   /* Command queue base physical address */
> +    dma_addr_t fq_addr;   /* Fault/event queue base physical address */
> +    dma_addr_t pq_addr;   /* Page request queue base physical address */
> +
> +    uint32_t cq_mask;     /* Command queue index bit mask */
> +    uint32_t fq_mask;     /* Fault/event queue index bit mask */
> +    uint32_t pq_mask;     /* Page request queue index bit mask */
> +
> +    /* interrupt notifier */
> +    void (*notify)(RISCVIOMMUState *iommu, unsigned vector);
> +    /* register icvec interrupts */
> +    void (*icvec_update)(RISCVIOMMUState *iommu,
> +                         uint32_t civ, uint32_t fiv,
> +                         uint32_t pmiv, uint32_t piv);
> +
> +    /* IOMMU State Machine */
> +    QemuThread core_proc; /* Background processing thread */
> +    QemuMutex core_lock;  /* Global IOMMU lock, used for cache/regs updates */
> +    QemuCond core_cond;   /* Background processing wake up signal */
> +    unsigned core_exec;   /* Processing thread execution actions */
> +
> +    /* IOMMU target address space */
> +    AddressSpace *target_as;
> +    MemoryRegion *target_mr;
> +
> +    /* MSI / MRIF access trap */
> +    AddressSpace trap_as;
> +    MemoryRegion trap_mr;
> +
> +    GHashTable *ctx_cache;          /* Device translation Context Cache */
> +    QemuMutex ctx_lock;      /* Device translation Cache update lock */
> +
> +    /* MMIO Hardware Interface */
> +    MemoryRegion regs_mr;
> +    QemuSpin regs_lock;
> +    uint8_t *regs_rw;  /* register state (user write) */
> +    uint8_t *regs_wc;  /* write-1-to-clear mask */
> +    uint8_t *regs_ro;  /* read-only mask */
> +
> +    QLIST_ENTRY(RISCVIOMMUState) iommus;
> +    QLIST_HEAD(, RISCVIOMMUSpace) spaces;
> +};
> +
> +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
> +         Error **errp);
> +
> +/* private helpers */
> +
> +/* Register helper functions */
> +static inline uint32_t riscv_iommu_reg_mod32(RISCVIOMMUState *s,
> +    unsigned idx, uint32_t set, uint32_t clr)
> +{
> +    uint32_t val;
> +    qemu_spin_lock(&s->regs_lock);
> +    val = ldl_le_p(s->regs_rw + idx);
> +    stl_le_p(s->regs_rw + idx, (val & ~clr) | set);
> +    qemu_spin_unlock(&s->regs_lock);
> +    return val;
> +}
> +
> +static inline void riscv_iommu_reg_set32(RISCVIOMMUState *s,
> +    unsigned idx, uint32_t set)
> +{
> +    qemu_spin_lock(&s->regs_lock);
> +    stl_le_p(s->regs_rw + idx, set);
> +    qemu_spin_unlock(&s->regs_lock);
> +}
> +
> +static inline uint32_t riscv_iommu_reg_get32(RISCVIOMMUState *s,
> +    unsigned idx)
> +{
> +    return ldl_le_p(s->regs_rw + idx);
> +}
> +
> +static inline uint64_t riscv_iommu_reg_mod64(RISCVIOMMUState *s,
> +    unsigned idx, uint64_t set, uint64_t clr)
> +{
> +    uint64_t val;
> +    qemu_spin_lock(&s->regs_lock);
> +    val = ldq_le_p(s->regs_rw + idx);
> +    stq_le_p(s->regs_rw + idx, (val & ~clr) | set);
> +    qemu_spin_unlock(&s->regs_lock);
> +    return val;
> +}
> +
> +static inline void riscv_iommu_reg_set64(RISCVIOMMUState *s,
> +    unsigned idx, uint64_t set)
> +{
> +    qemu_spin_lock(&s->regs_lock);
> +    stq_le_p(s->regs_rw + idx, set);
> +    qemu_spin_unlock(&s->regs_lock);
> +}
> +
> +static inline uint64_t riscv_iommu_reg_get64(RISCVIOMMUState *s,
> +    unsigned idx)
> +{
> +    return ldq_le_p(s->regs_rw + idx);
> +}
> +
> +
> +
> +#endif
> diff --git a/hw/riscv/trace-events b/hw/riscv/trace-events
> new file mode 100644
> index 0000000000..bdd8b657a6
> --- /dev/null
> +++ b/hw/riscv/trace-events
> @@ -0,0 +1,12 @@
> +# See documentation at docs/devel/tracing.rst
> +
> +# riscv-iommu.c
> +riscv_iommu_new(const char *id, unsigned b, unsigned d, unsigned f) "%s: device attached %04x:%02x.%d"
> +riscv_iommu_flt(const char *id, unsigned b, unsigned d, unsigned f, uint64_t reason, uint64_t iova) "%s: fault %04x:%02x.%u reason: 0x%"PRIx64" iova: 0x%"PRIx64
> +riscv_iommu_pri(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova) "%s: page request %04x:%02x.%u iova: 0x%"PRIx64
> +riscv_iommu_dma(const char *id, unsigned b, unsigned d, unsigned f, unsigned pasid, const char *dir, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u #%u %s 0x%"PRIx64" -> 0x%"PRIx64
> +riscv_iommu_msi(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u MSI 0x%"PRIx64" -> 0x%"PRIx64
> +riscv_iommu_mrif_notification(const char *id, uint32_t nid, uint64_t phys) "%s: sent MRIF notification 0x%x to 0x%"PRIx64
> +riscv_iommu_cmd(const char *id, uint64_t l, uint64_t u) "%s: command 0x%"PRIx64" 0x%"PRIx64
> +riscv_iommu_notifier_add(const char *id) "%s: dev-iotlb notifier added"
> +riscv_iommu_notifier_del(const char *id) "%s: dev-iotlb notifier removed"
> diff --git a/hw/riscv/trace.h b/hw/riscv/trace.h
> new file mode 100644
> index 0000000000..8c0e3ca1f3
> --- /dev/null
> +++ b/hw/riscv/trace.h
> @@ -0,0 +1 @@
> +#include "trace/trace-hw_riscv.h"
> diff --git a/include/hw/riscv/iommu.h b/include/hw/riscv/iommu.h
> new file mode 100644
> index 0000000000..070ee69973
> --- /dev/null
> +++ b/include/hw/riscv/iommu.h
> @@ -0,0 +1,36 @@
> +/*
> + * QEMU emulation of an RISC-V IOMMU
> + *
> + * Copyright (C) 2022-2023 Rivos Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef HW_RISCV_IOMMU_H
> +#define HW_RISCV_IOMMU_H
> +
> +#include "qemu/osdep.h"
> +#include "qom/object.h"
> +
> +#define TYPE_RISCV_IOMMU "riscv-iommu"
> +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUState, RISCV_IOMMU)
> +typedef struct RISCVIOMMUState RISCVIOMMUState;
> +
> +#define TYPE_RISCV_IOMMU_MEMORY_REGION "riscv-iommu-mr"
> +typedef struct RISCVIOMMUSpace RISCVIOMMUSpace;
> +
> +#define TYPE_RISCV_IOMMU_PCI "riscv-iommu-pci"
> +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUStatePci, RISCV_IOMMU_PCI)
> +typedef struct RISCVIOMMUStatePci RISCVIOMMUStatePci;
> +
> +#endif
> diff --git a/meson.build b/meson.build
> index a1e51277b0..359f836d8e 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -3373,6 +3373,7 @@ if have_system
>      'hw/pci-host',
>      'hw/ppc',
>      'hw/rtc',
> +    'hw/riscv',
>      'hw/s390x',
>      'hw/scsi',
>      'hw/sd',
> --
> 2.45.2
>
>
Andrew Jones Aug. 17, 2024, 11:34 a.m. UTC | #2
On Thu, Aug 01, 2024 at 12:43:24PM GMT, Daniel Henrique Barboza wrote:
> From: Tomasz Jeznach <tjeznach@rivosinc.com>
> 
> The RISC-V IOMMU specification is now ratified as per the RISC-V
> international process. The latest frozen specification can be found at:
> 
> https://github.com/riscv-non-isa/riscv-iommu/releases/download/v1.0/riscv-iommu.pdf
> 
> Add the foundation of the device emulation for RISC-V IOMMU. It includes
> support for s-stage (sv32, sv39, sv48, sv57 caps) and g-stage (sv32x4,
> sv39x4, sv48x4, sv57x4 caps).
> 
> Other capabilities like ATS and DBG support will be added incrementally
> in the next patches.
> 
> Co-developed-by: Sebastien Boeuf <seb@rivosinc.com>
> Signed-off-by: Sebastien Boeuf <seb@rivosinc.com>
> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
> Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
> ---
>  hw/riscv/Kconfig            |    4 +
>  hw/riscv/meson.build        |    1 +
>  hw/riscv/riscv-iommu-bits.h |   14 +
>  hw/riscv/riscv-iommu.c      | 2017 +++++++++++++++++++++++++++++++++++
>  hw/riscv/riscv-iommu.h      |  148 +++
>  hw/riscv/trace-events       |   12 +
>  hw/riscv/trace.h            |    1 +
>  include/hw/riscv/iommu.h    |   36 +
>  meson.build                 |    1 +
>  9 files changed, 2234 insertions(+)
>  create mode 100644 hw/riscv/riscv-iommu.c
>  create mode 100644 hw/riscv/riscv-iommu.h
>  create mode 100644 hw/riscv/trace-events
>  create mode 100644 hw/riscv/trace.h
>  create mode 100644 include/hw/riscv/iommu.h
> 
> diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig
> index a2030e3a6f..f69d6e3c8e 100644
> --- a/hw/riscv/Kconfig
> +++ b/hw/riscv/Kconfig
> @@ -1,3 +1,6 @@
> +config RISCV_IOMMU
> +    bool
> +
>  config RISCV_NUMA
>      bool
>  
> @@ -47,6 +50,7 @@ config RISCV_VIRT
>      select SERIAL
>      select RISCV_ACLINT
>      select RISCV_APLIC
> +    select RISCV_IOMMU
>      select RISCV_IMSIC
>      select SIFIVE_PLIC
>      select SIFIVE_TEST
> diff --git a/hw/riscv/meson.build b/hw/riscv/meson.build
> index f872674093..cbc99c6e8e 100644
> --- a/hw/riscv/meson.build
> +++ b/hw/riscv/meson.build
> @@ -10,5 +10,6 @@ riscv_ss.add(when: 'CONFIG_SIFIVE_U', if_true: files('sifive_u.c'))
>  riscv_ss.add(when: 'CONFIG_SPIKE', if_true: files('spike.c'))
>  riscv_ss.add(when: 'CONFIG_MICROCHIP_PFSOC', if_true: files('microchip_pfsoc.c'))
>  riscv_ss.add(when: 'CONFIG_ACPI', if_true: files('virt-acpi-build.c'))
> +riscv_ss.add(when: 'CONFIG_RISCV_IOMMU', if_true: files('riscv-iommu.c'))
>  
>  hw_arch += {'riscv': riscv_ss}
> diff --git a/hw/riscv/riscv-iommu-bits.h b/hw/riscv/riscv-iommu-bits.h
> index 9d9512ca71..6f1b9ab61b 100644
> --- a/hw/riscv/riscv-iommu-bits.h
> +++ b/hw/riscv/riscv-iommu-bits.h
> @@ -69,6 +69,14 @@ struct riscv_iommu_pq_record {
>  /* 5.3 IOMMU Capabilities (64bits) */
>  #define RISCV_IOMMU_REG_CAP             0x0000
>  #define RISCV_IOMMU_CAP_VERSION         GENMASK_ULL(7, 0)
> +#define RISCV_IOMMU_CAP_SV32            BIT_ULL(8)
> +#define RISCV_IOMMU_CAP_SV39            BIT_ULL(9)
> +#define RISCV_IOMMU_CAP_SV48            BIT_ULL(10)
> +#define RISCV_IOMMU_CAP_SV57            BIT_ULL(11)
> +#define RISCV_IOMMU_CAP_SV32X4          BIT_ULL(16)
> +#define RISCV_IOMMU_CAP_SV39X4          BIT_ULL(17)
> +#define RISCV_IOMMU_CAP_SV48X4          BIT_ULL(18)
> +#define RISCV_IOMMU_CAP_SV57X4          BIT_ULL(19)
>  #define RISCV_IOMMU_CAP_MSI_FLAT        BIT_ULL(22)
>  #define RISCV_IOMMU_CAP_MSI_MRIF        BIT_ULL(23)
>  #define RISCV_IOMMU_CAP_T2GPA           BIT_ULL(26)
> @@ -80,7 +88,9 @@ struct riscv_iommu_pq_record {
>  
>  /* 5.4 Features control register (32bits) */
>  #define RISCV_IOMMU_REG_FCTL            0x0008
> +#define RISCV_IOMMU_FCTL_BE             BIT(0)
>  #define RISCV_IOMMU_FCTL_WSI            BIT(1)
> +#define RISCV_IOMMU_FCTL_GXL            BIT(2)
>  
>  /* 5.5 Device-directory-table pointer (64bits) */
>  #define RISCV_IOMMU_REG_DDTP            0x0010
> @@ -203,6 +213,8 @@ struct riscv_iommu_dc {
>  #define RISCV_IOMMU_DC_TC_DTF           BIT_ULL(4)
>  #define RISCV_IOMMU_DC_TC_PDTV          BIT_ULL(5)
>  #define RISCV_IOMMU_DC_TC_PRPR          BIT_ULL(6)
> +#define RISCV_IOMMU_DC_TC_GADE          BIT_ULL(7)
> +#define RISCV_IOMMU_DC_TC_SADE          BIT_ULL(8)
>  #define RISCV_IOMMU_DC_TC_DPE           BIT_ULL(9)
>  #define RISCV_IOMMU_DC_TC_SBE           BIT_ULL(10)
>  #define RISCV_IOMMU_DC_TC_SXL           BIT_ULL(11)
> @@ -309,9 +321,11 @@ enum riscv_iommu_fq_causes {
>  
>  /* Translation attributes fields */
>  #define RISCV_IOMMU_PC_TA_V             BIT_ULL(0)
> +#define RISCV_IOMMU_PC_TA_RESERVED      GENMASK_ULL(63, 32)
>  
>  /* First stage context fields */
>  #define RISCV_IOMMU_PC_FSC_PPN          GENMASK_ULL(43, 0)
> +#define RISCV_IOMMU_PC_FSC_RESERVED     GENMASK_ULL(59, 44)
>  
>  enum riscv_iommu_fq_ttypes {
>      RISCV_IOMMU_FQ_TTYPE_NONE = 0,
> diff --git a/hw/riscv/riscv-iommu.c b/hw/riscv/riscv-iommu.c
> new file mode 100644
> index 0000000000..ebe3a53a04
> --- /dev/null
> +++ b/hw/riscv/riscv-iommu.c
> @@ -0,0 +1,2017 @@
> +/*
> + * QEMU emulation of an RISC-V IOMMU
> + *
> + * Copyright (C) 2021-2023, Rivos Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qom/object.h"
> +#include "hw/pci/pci_bus.h"
> +#include "hw/pci/pci_device.h"
> +#include "hw/qdev-properties.h"
> +#include "hw/riscv/riscv_hart.h"
> +#include "migration/vmstate.h"
> +#include "qapi/error.h"
> +#include "qemu/timer.h"
> +
> +#include "cpu_bits.h"
> +#include "riscv-iommu.h"
> +#include "riscv-iommu-bits.h"
> +#include "trace.h"
> +
> +#define LIMIT_CACHE_CTX               (1U << 7)
> +#define LIMIT_CACHE_IOT               (1U << 20)
> +
> +/* Physical page number conversions */
> +#define PPN_PHYS(ppn)                 ((ppn) << TARGET_PAGE_BITS)
> +#define PPN_DOWN(phy)                 ((phy) >> TARGET_PAGE_BITS)
> +
> +typedef struct RISCVIOMMUContext RISCVIOMMUContext;
> +typedef struct RISCVIOMMUEntry RISCVIOMMUEntry;
> +
> +/* Device assigned I/O address space */
> +struct RISCVIOMMUSpace {
> +    IOMMUMemoryRegion iova_mr;  /* IOVA memory region for attached device */
> +    AddressSpace iova_as;       /* IOVA address space for attached device */
> +    RISCVIOMMUState *iommu;     /* Managing IOMMU device state */
> +    uint32_t devid;             /* Requester identifier, AKA device_id */
> +    bool notifier;              /* IOMMU unmap notifier enabled */
> +    QLIST_ENTRY(RISCVIOMMUSpace) list;
> +};
> +
> +/* Device translation context state. */
> +struct RISCVIOMMUContext {
> +    uint64_t devid:24;          /* Requester Id, AKA device_id */
> +    uint64_t process_id:20;     /* Process ID. PASID for PCIe */
> +    uint64_t __rfu:20;          /* reserved */
> +    uint64_t tc;                /* Translation Control */
> +    uint64_t ta;                /* Translation Attributes */
> +    uint64_t satp;              /* S-Stage address translation and protection */
> +    uint64_t gatp;              /* G-Stage address translation and protection */
> +    uint64_t msi_addr_mask;     /* MSI filtering - address mask */
> +    uint64_t msi_addr_pattern;  /* MSI filtering - address pattern */
> +    uint64_t msiptp;            /* MSI redirection page table pointer */
> +};
> +
> +/* IOMMU index for transactions without process_id specified. */
> +#define RISCV_IOMMU_NOPROCID 0
> +
> +static uint8_t riscv_iommu_get_icvec_vector(uint32_t icvec, uint32_t vec_type)
> +{
> +    g_assert(vec_type < 4);
> +    return (icvec >> (vec_type * 4)) & 0x0F;
> +}
> +
> +static void riscv_iommu_notify(RISCVIOMMUState *s, int vec)
> +{
> +    const uint32_t fctl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FCTL);
> +    uint32_t ipsr, icvec;
> +
> +    if (fctl & RISCV_IOMMU_FCTL_WSI || !s->notify) {
> +        return;
> +    }
> +
> +    icvec = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_ICVEC);
> +    ipsr = riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, (1 << vec), 0);
> +
> +    if (!(ipsr & (1 << vec))) {
> +        s->notify(s, riscv_iommu_get_icvec_vector(icvec, vec));
> +    }
> +}
> +
> +static void riscv_iommu_fault(RISCVIOMMUState *s,
> +                              struct riscv_iommu_fq_record *ev)
> +{
> +    uint32_t ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
> +    uint32_t head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQH) & s->fq_mask;
> +    uint32_t tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQT) & s->fq_mask;
> +    uint32_t next = (tail + 1) & s->fq_mask;
> +    uint32_t devid = get_field(ev->hdr, RISCV_IOMMU_FQ_HDR_DID);
> +
> +    trace_riscv_iommu_flt(s->parent_obj.id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
> +                          PCI_FUNC(devid), ev->hdr, ev->iotval);
> +
> +    if (!(ctrl & RISCV_IOMMU_FQCSR_FQON) ||
> +        !!(ctrl & (RISCV_IOMMU_FQCSR_FQOF | RISCV_IOMMU_FQCSR_FQMF))) {
> +        return;
> +    }
> +
> +    if (head == next) {
> +        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR,
> +                              RISCV_IOMMU_FQCSR_FQOF, 0);
> +    } else {
> +        dma_addr_t addr = s->fq_addr + tail * sizeof(*ev);
> +        if (dma_memory_write(s->target_as, addr, ev, sizeof(*ev),
> +                             MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR,
> +                                  RISCV_IOMMU_FQCSR_FQMF, 0);
> +        } else {
> +            riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_FQT, next);
> +        }
> +    }
> +
> +    if (ctrl & RISCV_IOMMU_FQCSR_FIE) {
> +        riscv_iommu_notify(s, RISCV_IOMMU_INTR_FQ);
> +    }
> +}
> +
> +static void riscv_iommu_pri(RISCVIOMMUState *s,
> +    struct riscv_iommu_pq_record *pr)
> +{
> +    uint32_t ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
> +    uint32_t head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQH) & s->pq_mask;
> +    uint32_t tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQT) & s->pq_mask;
> +    uint32_t next = (tail + 1) & s->pq_mask;
> +    uint32_t devid = get_field(pr->hdr, RISCV_IOMMU_PREQ_HDR_DID);
> +
> +    trace_riscv_iommu_pri(s->parent_obj.id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
> +                          PCI_FUNC(devid), pr->payload);
> +
> +    if (!(ctrl & RISCV_IOMMU_PQCSR_PQON) ||
> +        !!(ctrl & (RISCV_IOMMU_PQCSR_PQOF | RISCV_IOMMU_PQCSR_PQMF))) {
> +        return;
> +    }
> +
> +    if (head == next) {
> +        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR,
> +                              RISCV_IOMMU_PQCSR_PQOF, 0);
> +    } else {
> +        dma_addr_t addr = s->pq_addr + tail * sizeof(*pr);
> +        if (dma_memory_write(s->target_as, addr, pr, sizeof(*pr),
> +                             MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR,
> +                                  RISCV_IOMMU_PQCSR_PQMF, 0);
> +        } else {
> +            riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_PQT, next);
> +        }
> +    }
> +
> +    if (ctrl & RISCV_IOMMU_PQCSR_PIE) {
> +        riscv_iommu_notify(s, RISCV_IOMMU_INTR_PQ);
> +    }
> +}
> +
> +/* Portable implementation of pext_u64, bit-mask extraction. */
> +static uint64_t _pext_u64(uint64_t val, uint64_t ext)
> +{
> +    uint64_t ret = 0;
> +    uint64_t rot = 1;
> +
> +    while (ext) {
> +        if (ext & 1) {
> +            if (val & 1) {
> +                ret |= rot;
> +            }
> +            rot <<= 1;
> +        }
> +        val >>= 1;
> +        ext >>= 1;
> +    }
> +
> +    return ret;
> +}
> +
> +/* Check if GPA matches MSI/MRIF pattern. */
> +static bool riscv_iommu_msi_check(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
> +    dma_addr_t gpa)
> +{
> +    if (!s->enable_msi) {
> +        return false;
> +    }
> +
> +    if (get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_MODE) !=
> +        RISCV_IOMMU_DC_MSIPTP_MODE_FLAT) {
> +        return false; /* Invalid MSI/MRIF mode */
> +    }
> +
> +    if ((PPN_DOWN(gpa) ^ ctx->msi_addr_pattern) & ~ctx->msi_addr_mask) {
> +        return false; /* GPA not in MSI range defined by AIA IMSIC rules. */
> +    }
> +
> +    return true;
> +}
> +
> +/*
> + * RISCV IOMMU Address Translation Lookup - Page Table Walk
> + *
> + * Note: Code is based on get_physical_address() from target/riscv/cpu_helper.c
> + * Both implementations can be merged into a single helper function in future.
> + * Keeping them separate for now, as error reporting and flow specifics are
> + * sufficiently different for separate implementation.
> + *
> + * @s        : IOMMU Device State
> + * @ctx      : Translation context for device id and process address space id.
> + * @iotlb    : translation data: physical address and access mode.
> + * @return   : success or fault cause code.
> + */
> +static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
> +    IOMMUTLBEntry *iotlb)
> +{
> +    dma_addr_t addr, base;
> +    uint64_t satp, gatp, pte;
> +    bool en_s, en_g;
> +    struct {
> +        unsigned char step;
> +        unsigned char levels;
> +        unsigned char ptidxbits;
> +        unsigned char ptesize;
> +    } sc[2];
> +    /* Translation stage phase */
> +    enum {
> +        S_STAGE = 0,
> +        G_STAGE = 1,
> +    } pass;
> +
> +    satp = get_field(ctx->satp, RISCV_IOMMU_ATP_MODE_FIELD);
> +    gatp = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);
> +
> +    en_s = satp != RISCV_IOMMU_DC_FSC_MODE_BARE;
> +    en_g = gatp != RISCV_IOMMU_DC_IOHGATP_MODE_BARE;
> +
> +    /* Early check for MSI address match when IOVA == GPA */
> +    if ((iotlb->perm & IOMMU_WO) &&
> +        riscv_iommu_msi_check(s, ctx, iotlb->iova)) {
> +        iotlb->target_as = &s->trap_as;
> +        iotlb->translated_addr = iotlb->iova;
> +        iotlb->addr_mask = ~TARGET_PAGE_MASK;
> +        return 0;
> +    }
> +
> +    /* Exit early for pass-through mode. */
> +    if (!(en_s || en_g)) {
> +        iotlb->translated_addr = iotlb->iova;
> +        iotlb->addr_mask = ~TARGET_PAGE_MASK;
> +        /* Allow R/W in pass-through mode */
> +        iotlb->perm = IOMMU_RW;
> +        return 0;
> +    }
> +
> +    /* S/G translation parameters. */
> +    for (pass = 0; pass < 2; pass++) {
> +        uint32_t sv_mode;
> +
> +        sc[pass].step = 0;
> +        if (pass ? (s->fctl & RISCV_IOMMU_FCTL_GXL) :
> +            (ctx->tc & RISCV_IOMMU_DC_TC_SXL)) {
> +            /* 32bit mode for GXL/SXL == 1 */
> +            switch (pass ? gatp : satp) {
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
> +                sc[pass].levels    = 0;
> +                sc[pass].ptidxbits = 0;
> +                sc[pass].ptesize   = 0;
> +                break;
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4:
> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV32X4 : RISCV_IOMMU_CAP_SV32;
> +                if (!(s->cap & sv_mode)) {
> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +                }
> +                sc[pass].levels    = 2;
> +                sc[pass].ptidxbits = 10;
> +                sc[pass].ptesize   = 4;
> +                break;
> +            default:
> +                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +            }
> +        } else {
> +            /* 64bit mode for GXL/SXL == 0 */
> +            switch (pass ? gatp : satp) {
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
> +                sc[pass].levels    = 0;
> +                sc[pass].ptidxbits = 0;
> +                sc[pass].ptesize   = 0;
> +                break;
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4:
> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV39X4 : RISCV_IOMMU_CAP_SV39;
> +                if (!(s->cap & sv_mode)) {
> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +                }
> +                sc[pass].levels    = 3;
> +                sc[pass].ptidxbits = 9;
> +                sc[pass].ptesize   = 8;
> +                break;
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4:
> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV48X4 : RISCV_IOMMU_CAP_SV48;
> +                if (!(s->cap & sv_mode)) {
> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +                }
> +                sc[pass].levels    = 4;
> +                sc[pass].ptidxbits = 9;
> +                sc[pass].ptesize   = 8;
> +                break;
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4:
> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV57X4 : RISCV_IOMMU_CAP_SV57;
> +                if (!(s->cap & sv_mode)) {
> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +                }
> +                sc[pass].levels    = 5;
> +                sc[pass].ptidxbits = 9;
> +                sc[pass].ptesize   = 8;
> +                break;
> +            default:
> +                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +            }
> +        }
> +    };
> +
> +    /* S/G stages translation tables root pointers */
> +    gatp = PPN_PHYS(get_field(ctx->gatp, RISCV_IOMMU_ATP_PPN_FIELD));
> +    satp = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_ATP_PPN_FIELD));
> +    addr = (en_s && en_g) ? satp : iotlb->iova;
> +    base = en_g ? gatp : satp;
> +    pass = en_g ? G_STAGE : S_STAGE;
> +
> +    do {
> +        const unsigned widened = (pass && !sc[pass].step) ? 2 : 0;
> +        const unsigned va_bits = widened + sc[pass].ptidxbits;
> +        const unsigned va_skip = TARGET_PAGE_BITS + sc[pass].ptidxbits *
> +                                 (sc[pass].levels - 1 - sc[pass].step);
> +        const unsigned idx = (addr >> va_skip) & ((1 << va_bits) - 1);
> +        const dma_addr_t pte_addr = base + idx * sc[pass].ptesize;
> +        const bool ade =
> +            ctx->tc & (pass ? RISCV_IOMMU_DC_TC_GADE : RISCV_IOMMU_DC_TC_SADE);
> +
> +        /* Address range check before first level lookup */
> +        if (!sc[pass].step) {
> +            const uint64_t va_mask = (1ULL << (va_skip + va_bits)) - 1;
> +            if ((addr & va_mask) != addr) {
> +                return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
> +            }
> +        }
> +
> +        /* Read page table entry */
> +        if (dma_memory_read(s->target_as, pte_addr, &pte,
> +                sc[pass].ptesize, MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +            return (iotlb->perm & IOMMU_WO) ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT
> +                                            : RISCV_IOMMU_FQ_CAUSE_RD_FAULT;
> +        }
> +
> +        if (sc[pass].ptesize == 4) {
> +            pte = (uint64_t) le32_to_cpu(*((uint32_t *)&pte));
> +        } else {
> +            pte = le64_to_cpu(pte);
> +        }
> +
> +        sc[pass].step++;
> +        hwaddr ppn = pte >> PTE_PPN_SHIFT;
> +
> +        if (!(pte & PTE_V)) {
> +            break;                /* Invalid PTE */
> +        } else if (!(pte & (PTE_R | PTE_W | PTE_X))) {
> +            base = PPN_PHYS(ppn); /* Inner PTE, continue walking */
> +        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == PTE_W) {
> +            break;                /* Reserved leaf PTE flags: PTE_W */
> +        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == (PTE_W | PTE_X)) {
> +            break;                /* Reserved leaf PTE flags: PTE_W + PTE_X */
> +        } else if (ppn & ((1ULL << (va_skip - TARGET_PAGE_BITS)) - 1)) {
> +            break;                /* Misaligned PPN */
> +        } else if ((iotlb->perm & IOMMU_RO) && !(pte & PTE_R)) {
> +            break;                /* Read access check failed */
> +        } else if ((iotlb->perm & IOMMU_WO) && !(pte & PTE_W)) {
> +            break;                /* Write access check failed */
> +        } else if ((iotlb->perm & IOMMU_RO) && !ade && !(pte & PTE_A)) {
> +            break;                /* Access bit not set */
> +        } else if ((iotlb->perm & IOMMU_WO) && !ade && !(pte & PTE_D)) {
> +            break;                /* Dirty bit not set */
> +        } else {
> +            /* Leaf PTE, translation completed. */
> +            sc[pass].step = sc[pass].levels;
> +            base = PPN_PHYS(ppn) | (addr & ((1ULL << va_skip) - 1));
> +            /* Update address mask based on smallest translation granularity */
> +            iotlb->addr_mask &= (1ULL << va_skip) - 1;
> +            /* Continue with S-Stage translation? */
> +            if (pass && sc[0].step != sc[0].levels) {
> +                pass = S_STAGE;
> +                addr = iotlb->iova;
> +                continue;
> +            }
> +            /* Translation phase completed (GPA or SPA) */
> +            iotlb->translated_addr = base;
> +            iotlb->perm = (pte & PTE_W) ? ((pte & PTE_R) ? IOMMU_RW : IOMMU_WO)
> +                                                         : IOMMU_RO;
> +
> +            /* Check MSI GPA address match */
> +            if (pass == S_STAGE && (iotlb->perm & IOMMU_WO) &&
> +                riscv_iommu_msi_check(s, ctx, base)) {
> +                /* Trap MSI writes and return GPA address. */
> +                iotlb->target_as = &s->trap_as;
> +                iotlb->addr_mask = ~TARGET_PAGE_MASK;
> +                return 0;
> +            }
> +
> +            /* Continue with G-Stage translation? */
> +            if (!pass && en_g) {
> +                pass = G_STAGE;
> +                addr = base;
> +                base = gatp;
> +                sc[pass].step = 0;
> +                continue;
> +            }
> +
> +            return 0;
> +        }
> +
> +        if (sc[pass].step == sc[pass].levels) {
> +            break; /* Can't find leaf PTE */
> +        }
> +
> +        /* Continue with G-Stage translation? */
> +        if (!pass && en_g) {
> +            pass = G_STAGE;
> +            addr = base;
> +            base = gatp;
> +            sc[pass].step = 0;
> +        }
> +    } while (1);
> +
> +    return (iotlb->perm & IOMMU_WO) ?
> +                (pass ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS :
> +                        RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S) :
> +                (pass ? RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS :
> +                        RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S);
> +}
> +
> +static void riscv_iommu_report_fault(RISCVIOMMUState *s,
> +                                     RISCVIOMMUContext *ctx,
> +                                     uint32_t fault_type, uint32_t cause,
> +                                     bool pv,
> +                                     uint64_t iotval, uint64_t iotval2)
> +{
> +    struct riscv_iommu_fq_record ev = { 0 };
> +
> +    if (ctx->tc & RISCV_IOMMU_DC_TC_DTF) {
> +        switch (cause) {
> +        case RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED:
> +        case RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT:
> +        case RISCV_IOMMU_FQ_CAUSE_DDT_INVALID:
> +        case RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED:
> +        case RISCV_IOMMU_FQ_CAUSE_DDT_CORRUPTED:
> +        case RISCV_IOMMU_FQ_CAUSE_INTERNAL_DP_ERROR:
> +        case RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT:
> +            break;
> +        default:
> +            /* DTF prevents reporting a fault for this given cause */
> +            return;
> +        }
> +    }
> +
> +    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_CAUSE, cause);
> +    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_TTYPE, fault_type);
> +    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_DID, ctx->devid);
> +    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PV, true);
> +
> +    if (pv) {
> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PID, ctx->process_id);
> +    }
> +
> +    ev.iotval = iotval;
> +    ev.iotval2 = iotval2;
> +
> +    riscv_iommu_fault(s, &ev);
> +}
> +
> +/* Redirect MSI write for given GPA. */
> +static MemTxResult riscv_iommu_msi_write(RISCVIOMMUState *s,
> +    RISCVIOMMUContext *ctx, uint64_t gpa, uint64_t data,
> +    unsigned size, MemTxAttrs attrs)
> +{
> +    MemTxResult res;
> +    dma_addr_t addr;
> +    uint64_t intn;
> +    uint32_t n190;
> +    uint64_t pte[2];
> +    int fault_type = RISCV_IOMMU_FQ_TTYPE_UADDR_WR;
> +    int cause;
> +
> +    /* Interrupt File Number */
> +    intn = _pext_u64(PPN_DOWN(gpa), ctx->msi_addr_mask);
> +    if (intn >= 256) {
> +        /* Interrupt file number out of range */
> +        res = MEMTX_ACCESS_ERROR;
> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
> +        goto err;
> +    }
> +
> +    /* fetch MSI PTE */
> +    addr = PPN_PHYS(get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_PPN));
> +    addr = addr | (intn * sizeof(pte));
> +    res = dma_memory_read(s->target_as, addr, &pte, sizeof(pte),
> +            MEMTXATTRS_UNSPECIFIED);
> +    if (res != MEMTX_OK) {
> +        if (res == MEMTX_DECODE_ERROR) {
> +            cause = RISCV_IOMMU_FQ_CAUSE_MSI_PT_CORRUPTED;
> +        } else {
> +            cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
> +        }
> +        goto err;
> +    }
> +
> +    le64_to_cpus(&pte[0]);
> +    le64_to_cpus(&pte[1]);
> +
> +    if (!(pte[0] & RISCV_IOMMU_MSI_PTE_V) || (pte[0] & RISCV_IOMMU_MSI_PTE_C)) {
> +        /*
> +         * The spec mentions that: "If msipte.C == 1, then further
> +         * processing to interpret the PTE is implementation
> +         * defined.". We'll abort with cause = 262 for this
> +         * case too.
> +         */
> +        res = MEMTX_ACCESS_ERROR;
> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_INVALID;
> +        goto err;
> +    }
> +
> +    switch (get_field(pte[0], RISCV_IOMMU_MSI_PTE_M)) {
> +    case RISCV_IOMMU_MSI_PTE_M_BASIC:
> +        /* MSI Pass-through mode */
> +        addr = PPN_PHYS(get_field(pte[0], RISCV_IOMMU_MSI_PTE_PPN));
> +        addr = addr | (gpa & TARGET_PAGE_MASK);

I'm not sure what the idea was with this (maybe a misunderstanding of how
guest interrupt files get targeted?), but we shouldn't be modifying the
result of a translation with the input to that translation. It breaks
translations where guest imsic address bits don't strictly overlap host
imsic address bits and it allows the guest to access host memory it
shouldn't. The fix is just to remove the line.

Thanks,
drew
Daniel Henrique Barboza Aug. 18, 2024, 6:20 p.m. UTC | #3
On 8/17/24 8:34 AM, Andrew Jones wrote:
> On Thu, Aug 01, 2024 at 12:43:24PM GMT, Daniel Henrique Barboza wrote:
>> From: Tomasz Jeznach <tjeznach@rivosinc.com>
>>
>> The RISC-V IOMMU specification is now ratified as-per the RISC-V
>> international process. The latest frozen specification can be found at:
>>
>> https://github.com/riscv-non-isa/riscv-iommu/releases/download/v1.0/riscv-iommu.pdf
>>
>> Add the foundation of the device emulation for RISC-V IOMMU. It includes
>> support for s-stage (sv32, sv39, sv48, sv57 caps) and g-stage (sv32x4,
>> sv39x4, sv48x4, sv57x4 caps).
>>
>> Other capabilities like ATS and DBG support will be added incrementally
>> in the next patches.
>>
>> Co-developed-by: Sebastien Boeuf <seb@rivosinc.com>
>> Signed-off-by: Sebastien Boeuf <seb@rivosinc.com>
>> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
>> Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
>> ---

  (...)

>> +/* Redirect MSI write for given GPA. */
>> +static MemTxResult riscv_iommu_msi_write(RISCVIOMMUState *s,
>> +    RISCVIOMMUContext *ctx, uint64_t gpa, uint64_t data,
>> +    unsigned size, MemTxAttrs attrs)
>> +{
>> +    MemTxResult res;
>> +    dma_addr_t addr;
>> +    uint64_t intn;
>> +    uint32_t n190;
>> +    uint64_t pte[2];
>> +    int fault_type = RISCV_IOMMU_FQ_TTYPE_UADDR_WR;
>> +    int cause;
>> +
>> +    /* Interrupt File Number */
>> +    intn = _pext_u64(PPN_DOWN(gpa), ctx->msi_addr_mask);
>> +    if (intn >= 256) {
>> +        /* Interrupt file number out of range */
>> +        res = MEMTX_ACCESS_ERROR;
>> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
>> +        goto err;
>> +    }
>> +
>> +    /* fetch MSI PTE */
>> +    addr = PPN_PHYS(get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_PPN));
>> +    addr = addr | (intn * sizeof(pte));
>> +    res = dma_memory_read(s->target_as, addr, &pte, sizeof(pte),
>> +            MEMTXATTRS_UNSPECIFIED);
>> +    if (res != MEMTX_OK) {
>> +        if (res == MEMTX_DECODE_ERROR) {
>> +            cause = RISCV_IOMMU_FQ_CAUSE_MSI_PT_CORRUPTED;
>> +        } else {
>> +            cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
>> +        }
>> +        goto err;
>> +    }
>> +
>> +    le64_to_cpus(&pte[0]);
>> +    le64_to_cpus(&pte[1]);
>> +
>> +    if (!(pte[0] & RISCV_IOMMU_MSI_PTE_V) || (pte[0] & RISCV_IOMMU_MSI_PTE_C)) {
>> +        /*
>> +         * The spec mentions that: "If msipte.C == 1, then further
>> +         * processing to interpret the PTE is implementation
>> +         * defined.". We'll abort with cause = 262 for this
>> +         * case too.
>> +         */
>> +        res = MEMTX_ACCESS_ERROR;
>> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_INVALID;
>> +        goto err;
>> +    }
>> +
>> +    switch (get_field(pte[0], RISCV_IOMMU_MSI_PTE_M)) {
>> +    case RISCV_IOMMU_MSI_PTE_M_BASIC:
>> +        /* MSI Pass-through mode */
>> +        addr = PPN_PHYS(get_field(pte[0], RISCV_IOMMU_MSI_PTE_PPN));
>> +        addr = addr | (gpa & TARGET_PAGE_MASK);
> 
> I'm not sure what the idea was with this (maybe a misunderstanding of how
> guest interrupt files get targeted?), but we shouldn't be modifying the
> result of a translation with the input to that translation. It breaks
> translations where guest imsic address bits don't strictly overlap host
> imsic address bits and it allows the guest to access host memory it
> shouldn't. The fix is just to remove the line.

I'll remove this line in v7. Thanks,


Daniel

> 
> Thanks,
> drew
Jason Chien Aug. 20, 2024, 3:16 p.m. UTC | #4
Hi Daniel,

On 2024/8/1 下午 11:43, Daniel Henrique Barboza wrote:
> From: Tomasz Jeznach <tjeznach@rivosinc.com>
>
> The RISC-V IOMMU specification is now ratified as-per the RISC-V
> international process. The latest frozen specification can be found at:
>
> https://github.com/riscv-non-isa/riscv-iommu/releases/download/v1.0/riscv-iommu.pdf
>
> Add the foundation of the device emulation for RISC-V IOMMU. It includes
> support for s-stage (sv32, sv39, sv48, sv57 caps) and g-stage (sv32x4,
> sv39x4, sv48x4, sv57x4 caps).
>
> Other capabilities like ATS and DBG support will be added incrementally
> in the next patches.
>
> Co-developed-by: Sebastien Boeuf <seb@rivosinc.com>
> Signed-off-by: Sebastien Boeuf <seb@rivosinc.com>
> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
> Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
> ---
>   hw/riscv/Kconfig            |    4 +
>   hw/riscv/meson.build        |    1 +
>   hw/riscv/riscv-iommu-bits.h |   14 +
>   hw/riscv/riscv-iommu.c      | 2017 +++++++++++++++++++++++++++++++++++
>   hw/riscv/riscv-iommu.h      |  148 +++
>   hw/riscv/trace-events       |   12 +
>   hw/riscv/trace.h            |    1 +
>   include/hw/riscv/iommu.h    |   36 +
>   meson.build                 |    1 +
>   9 files changed, 2234 insertions(+)
>   create mode 100644 hw/riscv/riscv-iommu.c
>   create mode 100644 hw/riscv/riscv-iommu.h
>   create mode 100644 hw/riscv/trace-events
>   create mode 100644 hw/riscv/trace.h
>   create mode 100644 include/hw/riscv/iommu.h
>
> diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig
> index a2030e3a6f..f69d6e3c8e 100644
> --- a/hw/riscv/Kconfig
> +++ b/hw/riscv/Kconfig
> @@ -1,3 +1,6 @@
> +config RISCV_IOMMU
> +    bool
> +
>   config RISCV_NUMA
>       bool
>   
> @@ -47,6 +50,7 @@ config RISCV_VIRT
>       select SERIAL
>       select RISCV_ACLINT
>       select RISCV_APLIC
> +    select RISCV_IOMMU
>       select RISCV_IMSIC
>       select SIFIVE_PLIC
>       select SIFIVE_TEST
> diff --git a/hw/riscv/meson.build b/hw/riscv/meson.build
> index f872674093..cbc99c6e8e 100644
> --- a/hw/riscv/meson.build
> +++ b/hw/riscv/meson.build
> @@ -10,5 +10,6 @@ riscv_ss.add(when: 'CONFIG_SIFIVE_U', if_true: files('sifive_u.c'))
>   riscv_ss.add(when: 'CONFIG_SPIKE', if_true: files('spike.c'))
>   riscv_ss.add(when: 'CONFIG_MICROCHIP_PFSOC', if_true: files('microchip_pfsoc.c'))
>   riscv_ss.add(when: 'CONFIG_ACPI', if_true: files('virt-acpi-build.c'))
> +riscv_ss.add(when: 'CONFIG_RISCV_IOMMU', if_true: files('riscv-iommu.c'))
>   
>   hw_arch += {'riscv': riscv_ss}
> diff --git a/hw/riscv/riscv-iommu-bits.h b/hw/riscv/riscv-iommu-bits.h
> index 9d9512ca71..6f1b9ab61b 100644
> --- a/hw/riscv/riscv-iommu-bits.h
> +++ b/hw/riscv/riscv-iommu-bits.h
> @@ -69,6 +69,14 @@ struct riscv_iommu_pq_record {
>   /* 5.3 IOMMU Capabilities (64bits) */
>   #define RISCV_IOMMU_REG_CAP             0x0000
>   #define RISCV_IOMMU_CAP_VERSION         GENMASK_ULL(7, 0)
> +#define RISCV_IOMMU_CAP_SV32            BIT_ULL(8)
> +#define RISCV_IOMMU_CAP_SV39            BIT_ULL(9)
> +#define RISCV_IOMMU_CAP_SV48            BIT_ULL(10)
> +#define RISCV_IOMMU_CAP_SV57            BIT_ULL(11)
> +#define RISCV_IOMMU_CAP_SV32X4          BIT_ULL(16)
> +#define RISCV_IOMMU_CAP_SV39X4          BIT_ULL(17)
> +#define RISCV_IOMMU_CAP_SV48X4          BIT_ULL(18)
> +#define RISCV_IOMMU_CAP_SV57X4          BIT_ULL(19)
>   #define RISCV_IOMMU_CAP_MSI_FLAT        BIT_ULL(22)
>   #define RISCV_IOMMU_CAP_MSI_MRIF        BIT_ULL(23)
>   #define RISCV_IOMMU_CAP_T2GPA           BIT_ULL(26)
> @@ -80,7 +88,9 @@ struct riscv_iommu_pq_record {
>   
>   /* 5.4 Features control register (32bits) */
>   #define RISCV_IOMMU_REG_FCTL            0x0008
> +#define RISCV_IOMMU_FCTL_BE             BIT(0)
>   #define RISCV_IOMMU_FCTL_WSI            BIT(1)
> +#define RISCV_IOMMU_FCTL_GXL            BIT(2)
>   
>   /* 5.5 Device-directory-table pointer (64bits) */
>   #define RISCV_IOMMU_REG_DDTP            0x0010
> @@ -203,6 +213,8 @@ struct riscv_iommu_dc {
>   #define RISCV_IOMMU_DC_TC_DTF           BIT_ULL(4)
>   #define RISCV_IOMMU_DC_TC_PDTV          BIT_ULL(5)
>   #define RISCV_IOMMU_DC_TC_PRPR          BIT_ULL(6)
> +#define RISCV_IOMMU_DC_TC_GADE          BIT_ULL(7)
> +#define RISCV_IOMMU_DC_TC_SADE          BIT_ULL(8)
>   #define RISCV_IOMMU_DC_TC_DPE           BIT_ULL(9)
>   #define RISCV_IOMMU_DC_TC_SBE           BIT_ULL(10)
>   #define RISCV_IOMMU_DC_TC_SXL           BIT_ULL(11)
> @@ -309,9 +321,11 @@ enum riscv_iommu_fq_causes {
>   
>   /* Translation attributes fields */
>   #define RISCV_IOMMU_PC_TA_V             BIT_ULL(0)
> +#define RISCV_IOMMU_PC_TA_RESERVED      GENMASK_ULL(63, 32)
>   
>   /* First stage context fields */
>   #define RISCV_IOMMU_PC_FSC_PPN          GENMASK_ULL(43, 0)
> +#define RISCV_IOMMU_PC_FSC_RESERVED     GENMASK_ULL(59, 44)
>   
>   enum riscv_iommu_fq_ttypes {
>       RISCV_IOMMU_FQ_TTYPE_NONE = 0,
> diff --git a/hw/riscv/riscv-iommu.c b/hw/riscv/riscv-iommu.c
> new file mode 100644
> index 0000000000..ebe3a53a04
> --- /dev/null
> +++ b/hw/riscv/riscv-iommu.c
> @@ -0,0 +1,2017 @@
> +/*
> + * QEMU emulation of an RISC-V IOMMU
> + *
> + * Copyright (C) 2021-2023, Rivos Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qom/object.h"
> +#include "hw/pci/pci_bus.h"
> +#include "hw/pci/pci_device.h"
> +#include "hw/qdev-properties.h"
> +#include "hw/riscv/riscv_hart.h"
> +#include "migration/vmstate.h"
> +#include "qapi/error.h"
> +#include "qemu/timer.h"
> +
> +#include "cpu_bits.h"
> +#include "riscv-iommu.h"
> +#include "riscv-iommu-bits.h"
> +#include "trace.h"
> +
> +#define LIMIT_CACHE_CTX               (1U << 7)
> +#define LIMIT_CACHE_IOT               (1U << 20)
> +
> +/* Physical page number conversions */
> +#define PPN_PHYS(ppn)                 ((ppn) << TARGET_PAGE_BITS)
> +#define PPN_DOWN(phy)                 ((phy) >> TARGET_PAGE_BITS)
> +
> +typedef struct RISCVIOMMUContext RISCVIOMMUContext;
> +typedef struct RISCVIOMMUEntry RISCVIOMMUEntry;
> +
> +/* Device assigned I/O address space */
> +struct RISCVIOMMUSpace {
> +    IOMMUMemoryRegion iova_mr;  /* IOVA memory region for attached device */
> +    AddressSpace iova_as;       /* IOVA address space for attached device */
> +    RISCVIOMMUState *iommu;     /* Managing IOMMU device state */
> +    uint32_t devid;             /* Requester identifier, AKA device_id */
> +    bool notifier;              /* IOMMU unmap notifier enabled */
> +    QLIST_ENTRY(RISCVIOMMUSpace) list;
> +};
> +
> +/* Device translation context state. */
> +struct RISCVIOMMUContext {
> +    uint64_t devid:24;          /* Requester Id, AKA device_id */
> +    uint64_t process_id:20;     /* Process ID. PASID for PCIe */
> +    uint64_t __rfu:20;          /* reserved */
> +    uint64_t tc;                /* Translation Control */
> +    uint64_t ta;                /* Translation Attributes */
> +    uint64_t satp;              /* S-Stage address translation and protection */
> +    uint64_t gatp;              /* G-Stage address translation and protection */
> +    uint64_t msi_addr_mask;     /* MSI filtering - address mask */
> +    uint64_t msi_addr_pattern;  /* MSI filtering - address pattern */
> +    uint64_t msiptp;            /* MSI redirection page table pointer */
> +};
> +
> +/* IOMMU index for transactions without process_id specified. */
> +#define RISCV_IOMMU_NOPROCID 0
> +
> +static uint8_t riscv_iommu_get_icvec_vector(uint32_t icvec, uint32_t vec_type)
> +{
> +    g_assert(vec_type < 4);
> +    return (icvec >> (vec_type * 4)) & 0x0F;
> +}
> +
> +static void riscv_iommu_notify(RISCVIOMMUState *s, int vec)
> +{
> +    const uint32_t fctl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FCTL);
> +    uint32_t ipsr, icvec;
> +
> +    if (fctl & RISCV_IOMMU_FCTL_WSI || !s->notify) {
> +        return;
> +    }
> +
> +    icvec = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_ICVEC);
> +    ipsr = riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, (1 << vec), 0);
> +
> +    if (!(ipsr & (1 << vec))) {
> +        s->notify(s, riscv_iommu_get_icvec_vector(icvec, vec));
> +    }
> +}
> +
> +static void riscv_iommu_fault(RISCVIOMMUState *s,
> +                              struct riscv_iommu_fq_record *ev)
> +{
> +    uint32_t ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
> +    uint32_t head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQH) & s->fq_mask;
> +    uint32_t tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQT) & s->fq_mask;
> +    uint32_t next = (tail + 1) & s->fq_mask;
> +    uint32_t devid = get_field(ev->hdr, RISCV_IOMMU_FQ_HDR_DID);
> +
> +    trace_riscv_iommu_flt(s->parent_obj.id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
> +                          PCI_FUNC(devid), ev->hdr, ev->iotval);
> +
> +    if (!(ctrl & RISCV_IOMMU_FQCSR_FQON) ||
> +        !!(ctrl & (RISCV_IOMMU_FQCSR_FQOF | RISCV_IOMMU_FQCSR_FQMF))) {
> +        return;
> +    }
> +
> +    if (head == next) {
> +        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR,
> +                              RISCV_IOMMU_FQCSR_FQOF, 0);
> +    } else {
> +        dma_addr_t addr = s->fq_addr + tail * sizeof(*ev);
> +        if (dma_memory_write(s->target_as, addr, ev, sizeof(*ev),
> +                             MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR,
> +                                  RISCV_IOMMU_FQCSR_FQMF, 0);
> +        } else {
> +            riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_FQT, next);
> +        }
> +    }
> +
> +    if (ctrl & RISCV_IOMMU_FQCSR_FIE) {
> +        riscv_iommu_notify(s, RISCV_IOMMU_INTR_FQ);
> +    }
> +}
> +
> +static void riscv_iommu_pri(RISCVIOMMUState *s,
> +    struct riscv_iommu_pq_record *pr)
> +{
> +    uint32_t ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
> +    uint32_t head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQH) & s->pq_mask;
> +    uint32_t tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQT) & s->pq_mask;
> +    uint32_t next = (tail + 1) & s->pq_mask;
> +    uint32_t devid = get_field(pr->hdr, RISCV_IOMMU_PREQ_HDR_DID);
> +
> +    trace_riscv_iommu_pri(s->parent_obj.id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
> +                          PCI_FUNC(devid), pr->payload);
> +
> +    if (!(ctrl & RISCV_IOMMU_PQCSR_PQON) ||
> +        !!(ctrl & (RISCV_IOMMU_PQCSR_PQOF | RISCV_IOMMU_PQCSR_PQMF))) {
> +        return;
> +    }
> +
> +    if (head == next) {
> +        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR,
> +                              RISCV_IOMMU_PQCSR_PQOF, 0);
> +    } else {
> +        dma_addr_t addr = s->pq_addr + tail * sizeof(*pr);
> +        if (dma_memory_write(s->target_as, addr, pr, sizeof(*pr),
> +                             MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR,
> +                                  RISCV_IOMMU_PQCSR_PQMF, 0);
> +        } else {
> +            riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_PQT, next);
> +        }
> +    }
> +
> +    if (ctrl & RISCV_IOMMU_PQCSR_PIE) {
> +        riscv_iommu_notify(s, RISCV_IOMMU_INTR_PQ);
> +    }
> +}
> +
> +/* Portable implementation of pext_u64, bit-mask extraction. */
> +static uint64_t _pext_u64(uint64_t val, uint64_t ext)
> +{
> +    uint64_t ret = 0;
> +    uint64_t rot = 1;
> +
> +    while (ext) {
> +        if (ext & 1) {
> +            if (val & 1) {
> +                ret |= rot;
> +            }
> +            rot <<= 1;
> +        }
> +        val >>= 1;
> +        ext >>= 1;
> +    }
> +
> +    return ret;
> +}
> +
> +/* Check if GPA matches MSI/MRIF pattern. */
> +static bool riscv_iommu_msi_check(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
> +    dma_addr_t gpa)
> +{
> +    if (!s->enable_msi) {
> +        return false;
> +    }
> +
> +    if (get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_MODE) !=
> +        RISCV_IOMMU_DC_MSIPTP_MODE_FLAT) {
> +        return false; /* Invalid MSI/MRIF mode */
> +    }
> +
> +    if ((PPN_DOWN(gpa) ^ ctx->msi_addr_pattern) & ~ctx->msi_addr_mask) {
> +        return false; /* GPA not in MSI range defined by AIA IMSIC rules. */
> +    }
> +
> +    return true;
> +}
> +
> +/*
> + * RISCV IOMMU Address Translation Lookup - Page Table Walk
> + *
> + * Note: Code is based on get_physical_address() from target/riscv/cpu_helper.c
> + * Both implementation can be merged into single helper function in future.
> + * Keeping them separate for now, as error reporting and flow specifics are
> + * sufficiently different for separate implementation.
> + *
> + * @s        : IOMMU Device State
> + * @ctx      : Translation context for device id and process address space id.
> + * @iotlb    : translation data: physical address and access mode.
> + * @return   : success or fault cause code.
> + */
> +static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
> +    IOMMUTLBEntry *iotlb)
> +{
> +    dma_addr_t addr, base;
> +    uint64_t satp, gatp, pte;
> +    bool en_s, en_g;
> +    struct {
> +        unsigned char step;
> +        unsigned char levels;
> +        unsigned char ptidxbits;
> +        unsigned char ptesize;
> +    } sc[2];
> +    /* Translation stage phase */
> +    enum {
> +        S_STAGE = 0,
> +        G_STAGE = 1,
> +    } pass;
> +
> +    satp = get_field(ctx->satp, RISCV_IOMMU_ATP_MODE_FIELD);
> +    gatp = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);
> +
> +    en_s = satp != RISCV_IOMMU_DC_FSC_MODE_BARE;
> +    en_g = gatp != RISCV_IOMMU_DC_IOHGATP_MODE_BARE;
> +
> +    /* Early check for MSI address match when IOVA == GPA */
> +    if ((iotlb->perm & IOMMU_WO) &&
> +        riscv_iommu_msi_check(s, ctx, iotlb->iova)) {
I think the IOVA here may be a GVA, and we should not use a GVA to
perform the MSI check. Section 2.3 of the IOMMU specification states that
we should always walk the first-stage page table to get the GPA (step 17)
and only then use that GPA to do the MSI check (step 18).
> +        iotlb->target_as = &s->trap_as;
> +        iotlb->translated_addr = iotlb->iova;
> +        iotlb->addr_mask = ~TARGET_PAGE_MASK;
> +        return 0;
> +    }
> +
> +    /* Exit early for pass-through mode. */
> +    if (!(en_s || en_g)) {
> +        iotlb->translated_addr = iotlb->iova;
> +        iotlb->addr_mask = ~TARGET_PAGE_MASK;
> +        /* Allow R/W in pass-through mode */
> +        iotlb->perm = IOMMU_RW;
> +        return 0;
> +    }
> +
> +    /* S/G translation parameters. */
> +    for (pass = 0; pass < 2; pass++) {
> +        uint32_t sv_mode;
> +
> +        sc[pass].step = 0;
> +        if (pass ? (s->fctl & RISCV_IOMMU_FCTL_GXL) :
> +            (ctx->tc & RISCV_IOMMU_DC_TC_SXL)) {
> +            /* 32bit mode for GXL/SXL == 1 */
> +            switch (pass ? gatp : satp) {
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
> +                sc[pass].levels    = 0;
> +                sc[pass].ptidxbits = 0;
> +                sc[pass].ptesize   = 0;
> +                break;
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4:
> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV32X4 : RISCV_IOMMU_CAP_SV32;
> +                if (!(s->cap & sv_mode)) {
> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +                }
> +                sc[pass].levels    = 2;
> +                sc[pass].ptidxbits = 10;
> +                sc[pass].ptesize   = 4;
> +                break;
> +            default:
> +                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +            }
> +        } else {
> +            /* 64bit mode for GXL/SXL == 0 */
> +            switch (pass ? gatp : satp) {
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
> +                sc[pass].levels    = 0;
> +                sc[pass].ptidxbits = 0;
> +                sc[pass].ptesize   = 0;
> +                break;
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4:
> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV39X4 : RISCV_IOMMU_CAP_SV39;
> +                if (!(s->cap & sv_mode)) {
> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +                }
> +                sc[pass].levels    = 3;
> +                sc[pass].ptidxbits = 9;
> +                sc[pass].ptesize   = 8;
> +                break;
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4:
> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV48X4 : RISCV_IOMMU_CAP_SV48;
> +                if (!(s->cap & sv_mode)) {
> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +                }
> +                sc[pass].levels    = 4;
> +                sc[pass].ptidxbits = 9;
> +                sc[pass].ptesize   = 8;
> +                break;
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4:
> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV57X4 : RISCV_IOMMU_CAP_SV57;
> +                if (!(s->cap & sv_mode)) {
> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +                }
> +                sc[pass].levels    = 5;
> +                sc[pass].ptidxbits = 9;
> +                sc[pass].ptesize   = 8;
> +                break;
> +            default:
> +                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +            }
> +        }
> +    };
> +
> +    /* S/G stages translation tables root pointers */
> +    gatp = PPN_PHYS(get_field(ctx->gatp, RISCV_IOMMU_ATP_PPN_FIELD));
> +    satp = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_ATP_PPN_FIELD));
> +    addr = (en_s && en_g) ? satp : iotlb->iova;
> +    base = en_g ? gatp : satp;
> +    pass = en_g ? G_STAGE : S_STAGE;
> +
> +    do {
> +        const unsigned widened = (pass && !sc[pass].step) ? 2 : 0;
> +        const unsigned va_bits = widened + sc[pass].ptidxbits;
> +        const unsigned va_skip = TARGET_PAGE_BITS + sc[pass].ptidxbits *
> +                                 (sc[pass].levels - 1 - sc[pass].step);
> +        const unsigned idx = (addr >> va_skip) & ((1 << va_bits) - 1);
> +        const dma_addr_t pte_addr = base + idx * sc[pass].ptesize;
> +        const bool ade =
> +            ctx->tc & (pass ? RISCV_IOMMU_DC_TC_GADE : RISCV_IOMMU_DC_TC_SADE);
> +
> +        /* Address range check before first level lookup */
> +        if (!sc[pass].step) {
> +            const uint64_t va_mask = (1ULL << (va_skip + va_bits)) - 1;
> +            if ((addr & va_mask) != addr) {
> +                return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
> +            }
> +        }
> +
> +        /* Read page table entry */
> +        if (dma_memory_read(s->target_as, pte_addr, &pte,
> +                sc[pass].ptesize, MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +            return (iotlb->perm & IOMMU_WO) ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT
> +                                            : RISCV_IOMMU_FQ_CAUSE_RD_FAULT;
> +        }
> +
> +        if (sc[pass].ptesize == 4) {
> +            pte = (uint64_t) le32_to_cpu(*((uint32_t *)&pte));
> +        } else {
> +            pte = le64_to_cpu(pte);
> +        }
> +
> +        sc[pass].step++;
> +        hwaddr ppn = pte >> PTE_PPN_SHIFT;
> +
> +        if (!(pte & PTE_V)) {
> +            break;                /* Invalid PTE */
> +        } else if (!(pte & (PTE_R | PTE_W | PTE_X))) {
> +            base = PPN_PHYS(ppn); /* Inner PTE, continue walking */
> +        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == PTE_W) {
> +            break;                /* Reserved leaf PTE flags: PTE_W */
> +        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == (PTE_W | PTE_X)) {
> +            break;                /* Reserved leaf PTE flags: PTE_W + PTE_X */
> +        } else if (ppn & ((1ULL << (va_skip - TARGET_PAGE_BITS)) - 1)) {
> +            break;                /* Misaligned PPN */
> +        } else if ((iotlb->perm & IOMMU_RO) && !(pte & PTE_R)) {
> +            break;                /* Read access check failed */
> +        } else if ((iotlb->perm & IOMMU_WO) && !(pte & PTE_W)) {
> +            break;                /* Write access check failed */
> +        } else if ((iotlb->perm & IOMMU_RO) && !ade && !(pte & PTE_A)) {
> +            break;                /* Access bit not set */
> +        } else if ((iotlb->perm & IOMMU_WO) && !ade && !(pte & PTE_D)) {
> +            break;                /* Dirty bit not set */
> +        } else {
> +            /* Leaf PTE, translation completed. */
> +            sc[pass].step = sc[pass].levels;
> +            base = PPN_PHYS(ppn) | (addr & ((1ULL << va_skip) - 1));
> +            /* Update address mask based on smallest translation granularity */
> +            iotlb->addr_mask &= (1ULL << va_skip) - 1;
> +            /* Continue with S-Stage translation? */
> +            if (pass && sc[0].step != sc[0].levels) {
> +                pass = S_STAGE;
> +                addr = iotlb->iova;
> +                continue;
> +            }
> +            /* Translation phase completed (GPA or SPA) */
> +            iotlb->translated_addr = base;
> +            iotlb->perm = (pte & PTE_W) ? ((pte & PTE_R) ? IOMMU_RW : IOMMU_WO)
> +                                                         : IOMMU_RO;
> +
> +            /* Check MSI GPA address match */
> +            if (pass == S_STAGE && (iotlb->perm & IOMMU_WO) &&
> +                riscv_iommu_msi_check(s, ctx, base)) {
> +                /* Trap MSI writes and return GPA address. */
> +                iotlb->target_as = &s->trap_as;
> +                iotlb->addr_mask = ~TARGET_PAGE_MASK;
> +                return 0;
> +            }
> +
> +            /* Continue with G-Stage translation? */
> +            if (!pass && en_g) {
> +                pass = G_STAGE;
> +                addr = base;
> +                base = gatp;
> +                sc[pass].step = 0;
> +                continue;
> +            }
> +
> +            return 0;
> +        }
> +
> +        if (sc[pass].step == sc[pass].levels) {
> +            break; /* Can't find leaf PTE */
> +        }
> +
> +        /* Continue with G-Stage translation? */
> +        if (!pass && en_g) {
> +            pass = G_STAGE;
> +            addr = base;
> +            base = gatp;
> +            sc[pass].step = 0;
> +        }
> +    } while (1);
> +
> +    return (iotlb->perm & IOMMU_WO) ?
> +                (pass ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS :
> +                        RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S) :
> +                (pass ? RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS :
> +                        RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S);
> +}
> +
> +static void riscv_iommu_report_fault(RISCVIOMMUState *s,
> +                                     RISCVIOMMUContext *ctx,
> +                                     uint32_t fault_type, uint32_t cause,
> +                                     bool pv,
> +                                     uint64_t iotval, uint64_t iotval2)
> +{
> +    struct riscv_iommu_fq_record ev = { 0 };
> +
> +    if (ctx->tc & RISCV_IOMMU_DC_TC_DTF) {
> +        switch (cause) {
> +        case RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED:
> +        case RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT:
> +        case RISCV_IOMMU_FQ_CAUSE_DDT_INVALID:
> +        case RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED:
> +        case RISCV_IOMMU_FQ_CAUSE_DDT_CORRUPTED:
> +        case RISCV_IOMMU_FQ_CAUSE_INTERNAL_DP_ERROR:
> +        case RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT:
> +            break;
> +        default:
> +            /* DTF prevents reporting a fault for this given cause */
> +            return;
> +        }
> +    }
> +
> +    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_CAUSE, cause);
> +    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_TTYPE, fault_type);
> +    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_DID, ctx->devid);
> +    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PV, true);
> +
> +    if (pv) {
> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PID, ctx->process_id);
> +    }
> +
> +    ev.iotval = iotval;
> +    ev.iotval2 = iotval2;
> +
> +    riscv_iommu_fault(s, &ev);
> +}
> +
> +/* Redirect MSI write for given GPA. */
> +static MemTxResult riscv_iommu_msi_write(RISCVIOMMUState *s,
> +    RISCVIOMMUContext *ctx, uint64_t gpa, uint64_t data,
> +    unsigned size, MemTxAttrs attrs)
> +{
> +    MemTxResult res;
> +    dma_addr_t addr;
> +    uint64_t intn;
> +    uint32_t n190;
> +    uint64_t pte[2];
> +    int fault_type = RISCV_IOMMU_FQ_TTYPE_UADDR_WR;
> +    int cause;
> +
> +    /* Interrupt File Number */
> +    intn = _pext_u64(PPN_DOWN(gpa), ctx->msi_addr_mask);
> +    if (intn >= 256) {
> +        /* Interrupt file number out of range */
> +        res = MEMTX_ACCESS_ERROR;
> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
> +        goto err;
> +    }
> +
> +    /* fetch MSI PTE */
> +    addr = PPN_PHYS(get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_PPN));
> +    addr = addr | (intn * sizeof(pte));
> +    res = dma_memory_read(s->target_as, addr, &pte, sizeof(pte),
> +            MEMTXATTRS_UNSPECIFIED);
> +    if (res != MEMTX_OK) {
> +        if (res == MEMTX_DECODE_ERROR) {
> +            cause = RISCV_IOMMU_FQ_CAUSE_MSI_PT_CORRUPTED;
> +        } else {
> +            cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
> +        }
> +        goto err;
> +    }
> +
> +    le64_to_cpus(&pte[0]);
> +    le64_to_cpus(&pte[1]);
> +
> +    if (!(pte[0] & RISCV_IOMMU_MSI_PTE_V) || (pte[0] & RISCV_IOMMU_MSI_PTE_C)) {
> +        /*
> +         * The spec mentions that: "If msipte.C == 1, then further
> +         * processing to interpret the PTE is implementation
> +         * defined.". We'll abort with cause = 262 for this
> +         * case too.
> +         */
> +        res = MEMTX_ACCESS_ERROR;
> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_INVALID;
> +        goto err;
> +    }
> +
> +    switch (get_field(pte[0], RISCV_IOMMU_MSI_PTE_M)) {
> +    case RISCV_IOMMU_MSI_PTE_M_BASIC:
> +        /* MSI Pass-through mode */
> +        addr = PPN_PHYS(get_field(pte[0], RISCV_IOMMU_MSI_PTE_PPN));
> +        addr = addr | (gpa & TARGET_PAGE_MASK);
> +
> +        trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
> +                              PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
> +                              gpa, addr);
> +
> +        res = dma_memory_write(s->target_as, addr, &data, size, attrs);
> +        if (res != MEMTX_OK) {
> +            cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
> +            goto err;
> +        }
> +
> +        return MEMTX_OK;
> +    case RISCV_IOMMU_MSI_PTE_M_MRIF:
> +        /* MRIF mode, continue. */
> +        break;
> +    default:
> +        res = MEMTX_ACCESS_ERROR;
> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED;
> +        goto err;
> +    }
> +
> +    /*
> +     * Report an error for interrupt identities exceeding the maximum allowed
> +     * for an IMSIC interrupt file (2047) or destination address is not 32-bit
> +     * aligned. See IOMMU Specification, Chapter 2.3. MSI page tables.
> +     */
> +    if ((data > 2047) || (gpa & 3)) {
> +        res = MEMTX_ACCESS_ERROR;
> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED;
> +        goto err;
> +    }
> +
> +    /* MSI MRIF mode, non atomic pending bit update */
> +
> +    /* MRIF pending bit address */
> +    addr = get_field(pte[0], RISCV_IOMMU_MSI_PTE_MRIF_ADDR) << 9;
> +    addr = addr | ((data & 0x7c0) >> 3);
> +
> +    trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
> +                          PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
> +                          gpa, addr);
> +
> +    /* MRIF pending bit mask */
> +    data = 1ULL << (data & 0x03f);
> +    res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
> +    if (res != MEMTX_OK) {
> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
> +        goto err;
> +    }
> +
> +    intn = intn | data;
> +    res = dma_memory_write(s->target_as, addr, &intn, sizeof(intn), attrs);
> +    if (res != MEMTX_OK) {
> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
> +        goto err;
> +    }
> +
> +    /* Get MRIF enable bits */
> +    addr = addr + sizeof(intn);
> +    res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
> +    if (res != MEMTX_OK) {
> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
> +        goto err;
> +    }
> +
> +    if (!(intn & data)) {
> +        /* notification disabled, MRIF update completed. */
> +        return MEMTX_OK;
> +    }
> +
> +    /* Send notification message */
> +    addr = PPN_PHYS(get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NPPN));
> +    n190 = get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID) |
> +          (get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID_MSB) << 10);
> +
> +    res = dma_memory_write(s->target_as, addr, &n190, sizeof(n190), attrs);
> +    if (res != MEMTX_OK) {
> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
> +        goto err;
> +    }
> +
> +    trace_riscv_iommu_mrif_notification(s->parent_obj.id, n190, addr);
> +
> +    return MEMTX_OK;
> +
> +err:
> +    riscv_iommu_report_fault(s, ctx, fault_type, cause,
> +                             !!ctx->process_id, 0, 0);
> +    return res;
> +}
> +
> +/*
> + * Check device context configuration as described by the
> + * riscv-iommu spec section "Device-context configuration
> + * checks".
> + */
> +static bool riscv_iommu_validate_device_ctx(RISCVIOMMUState *s,
> +                                            RISCVIOMMUContext *ctx)
> +{
> +    uint32_t fsc_mode, msi_mode;
> +
> +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_EN_PRI) &&
> +        ctx->tc & RISCV_IOMMU_DC_TC_PRPR) {
> +        return false;
> +    }
> +
> +    if (!(s->cap & RISCV_IOMMU_CAP_T2GPA) &&
> +        ctx->tc & RISCV_IOMMU_DC_TC_T2GPA) {
> +        return false;
> +    }
> +
> +    if (s->cap & RISCV_IOMMU_CAP_MSI_FLAT) {
> +        msi_mode = get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_MODE);
> +
> +        if (msi_mode != RISCV_IOMMU_DC_MSIPTP_MODE_OFF &&
> +            msi_mode != RISCV_IOMMU_DC_MSIPTP_MODE_FLAT) {
> +            return false;
> +        }
> +    }
> +
> +    fsc_mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
> +
> +    if (ctx->tc & RISCV_IOMMU_DC_TC_PDTV) {
> +        switch (fsc_mode) {
> +        case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8:
> +            if (!(s->cap & RISCV_IOMMU_CAP_PD8)) {
> +                return false;
> +            }
> +            break;
> +        case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD17:
> +            if (!(s->cap & RISCV_IOMMU_CAP_PD17)) {
> +                return false;
> +            }
> +            break;
> +        case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20:
> +            if (!(s->cap & RISCV_IOMMU_CAP_PD20)) {
> +                return false;
> +            }
> +            break;
> +        }
> +    } else {
> +        /* DC.tc.PDTV is 0 */
> +        if (ctx->tc & RISCV_IOMMU_DC_TC_DPE) {
> +            return false;
> +        }
> +
> +        if (ctx->tc & RISCV_IOMMU_DC_TC_SXL) {
> +            if (fsc_mode == RISCV_IOMMU_CAP_SV32 &&
> +                !(s->cap & RISCV_IOMMU_CAP_SV32)) {
> +                return false;
> +            }
> +        } else {
> +            switch (fsc_mode) {
> +            case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
> +                if (!(s->cap & RISCV_IOMMU_CAP_SV39)) {
> +                    return false;
> +                }
> +                break;
> +            case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
> +                if (!(s->cap & RISCV_IOMMU_CAP_SV48)) {
> +                    return false;
> +                }
> +            break;
> +            case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
> +                if (!(s->cap & RISCV_IOMMU_CAP_SV57)) {
> +                    return false;
> +                }
> +                break;
> +            }
> +        }
> +    }
> +
> +    /*
> +     * CAP_END is always zero (only one endianess). FCTL_BE is
> +     * always zero (little-endian accesses). Thus TC_SBE must
> +     * always be LE, i.e. zero.
> +     */
> +    if (ctx->tc & RISCV_IOMMU_DC_TC_SBE) {
> +        return false;
> +    }
> +
> +    return true;
> +}
> +
> +/*
> + * Validate process context (PC) according to section
> + * "Process-context configuration checks".
> + */
> +static bool riscv_iommu_validate_process_ctx(RISCVIOMMUState *s,
> +                                             RISCVIOMMUContext *ctx)
> +{
> +    uint32_t mode;
> +
> +    if (get_field(ctx->ta, RISCV_IOMMU_PC_TA_RESERVED)) {
> +        return false;
> +    }
> +
> +    if (get_field(ctx->satp, RISCV_IOMMU_PC_FSC_RESERVED)) {
> +        return false;
> +    }
> +
> +    mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
> +    switch (mode) {
> +    case RISCV_IOMMU_DC_FSC_MODE_BARE:
> +    /* sv39 and sv32 modes have the same value (8) */
> +    case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
> +    case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
> +    case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
> +        break;
> +    default:
> +        return false;
> +    }
> +
> +    if (ctx->tc & RISCV_IOMMU_DC_TC_SXL) {
> +        if (mode == RISCV_IOMMU_CAP_SV32 &&
> +            !(s->cap & RISCV_IOMMU_CAP_SV32)) {
> +                return false;
> +        }
> +    } else {
> +        switch (mode) {
> +        case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
> +            if (!(s->cap & RISCV_IOMMU_CAP_SV39)) {
> +                return false;
> +            }
> +            break;
> +        case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
> +            if (!(s->cap & RISCV_IOMMU_CAP_SV48)) {
> +                return false;
> +            }
> +            break;
> +        case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
> +            if (!(s->cap & RISCV_IOMMU_CAP_SV57)) {
> +                return false;
> +            }
> +            break;
> +        }
> +    }
> +
> +    return true;
> +}
> +
> +/*
> + * RISC-V IOMMU Device Context Loopkup - Device Directory Tree Walk
> + *
> + * @s         : IOMMU Device State
> + * @ctx       : Device Translation Context with devid and process_id set.
> + * @return    : success or fault code.
> + */
> +static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
> +{
> +    const uint64_t ddtp = s->ddtp;
> +    unsigned mode = get_field(ddtp, RISCV_IOMMU_DDTP_MODE);
> +    dma_addr_t addr = PPN_PHYS(get_field(ddtp, RISCV_IOMMU_DDTP_PPN));
> +    struct riscv_iommu_dc dc;
> +    /* Device Context format: 0: extended (64 bytes) | 1: base (32 bytes) */
> +    const int dc_fmt = !s->enable_msi;
> +    const size_t dc_len = sizeof(dc) >> dc_fmt;
> +    unsigned depth;
> +    uint64_t de;
> +
> +    switch (mode) {
> +    case RISCV_IOMMU_DDTP_MODE_OFF:
> +        return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
> +
> +    case RISCV_IOMMU_DDTP_MODE_BARE:
> +        /* mock up pass-through translation context */
> +        ctx->gatp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
> +            RISCV_IOMMU_DC_IOHGATP_MODE_BARE);
> +        ctx->satp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
> +            RISCV_IOMMU_DC_FSC_MODE_BARE);
> +        ctx->tc = RISCV_IOMMU_DC_TC_V;
> +        ctx->ta = 0;
> +        ctx->msiptp = 0;
> +        return 0;
> +
> +    case RISCV_IOMMU_DDTP_MODE_1LVL:
> +        depth = 0;
> +        break;
> +
> +    case RISCV_IOMMU_DDTP_MODE_2LVL:
> +        depth = 1;
> +        break;
> +
> +    case RISCV_IOMMU_DDTP_MODE_3LVL:
> +        depth = 2;
> +        break;
> +
> +    default:
> +        return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +    }
> +
> +    /*
> +     * Check supported device id width (in bits).
> +     * See IOMMU Specification, Chapter 6. Software guidelines.
> +     * - if extended device-context format is used:
> +     *   1LVL: 6, 2LVL: 15, 3LVL: 24
> +     * - if base device-context format is used:
> +     *   1LVL: 7, 2LVL: 16, 3LVL: 24
> +     */
> +    if (ctx->devid >= (1 << (depth * 9 + 6 + (dc_fmt && depth != 2)))) {
> +        return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
> +    }
> +
> +    /* Device directory tree walk */
> +    for (; depth-- > 0; ) {
> +        /*
> +         * Select device id index bits based on device directory tree level
> +         * and device context format.
> +         * See IOMMU Specification, Chapter 2. Data Structures.
> +         * - if extended device-context format is used:
> +         *   device index: [23:15][14:6][5:0]
> +         * - if base device-context format is used:
> +         *   device index: [23:16][15:7][6:0]
> +         */
> +        const int split = depth * 9 + 6 + dc_fmt;
> +        addr |= ((ctx->devid >> split) << 3) & ~TARGET_PAGE_MASK;
> +        if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
> +                            MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +            return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
> +        }
> +        le64_to_cpus(&de);
> +        if (!(de & RISCV_IOMMU_DDTE_VALID)) {
> +            /* invalid directory entry */
> +            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
> +        }
> +        if (de & ~(RISCV_IOMMU_DDTE_PPN | RISCV_IOMMU_DDTE_VALID)) {
> +            /* reserved bits set */
> +            return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +        }
> +        addr = PPN_PHYS(get_field(de, RISCV_IOMMU_DDTE_PPN));
> +    }
> +
> +    /* index into device context entry page */
> +    addr |= (ctx->devid * dc_len) & ~TARGET_PAGE_MASK;
> +
> +    memset(&dc, 0, sizeof(dc));
> +    if (dma_memory_read(s->target_as, addr, &dc, dc_len,
> +                        MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +        return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
> +    }
> +
> +    /* Set translation context. */
> +    ctx->tc = le64_to_cpu(dc.tc);
> +    ctx->gatp = le64_to_cpu(dc.iohgatp);
> +    ctx->satp = le64_to_cpu(dc.fsc);
> +    ctx->ta = le64_to_cpu(dc.ta);
> +    ctx->msiptp = le64_to_cpu(dc.msiptp);
> +    ctx->msi_addr_mask = le64_to_cpu(dc.msi_addr_mask);
> +    ctx->msi_addr_pattern = le64_to_cpu(dc.msi_addr_pattern);
> +
> +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_V)) {
> +        return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
> +    }
> +
> +    if (!riscv_iommu_validate_device_ctx(s, ctx)) {
> +        return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +    }
> +
> +    /* FSC field checks */
> +    mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
> +    addr = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_DC_FSC_PPN));
> +
> +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_PDTV)) {
> +        if (ctx->process_id != RISCV_IOMMU_NOPROCID) {
> +            /* PID is disabled */
> +            return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
> +        }
> +        if (mode > RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57) {
> +            /* Invalid translation mode */
> +            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
> +        }
> +        return 0;
> +    }
> +
> +    if (ctx->process_id == RISCV_IOMMU_NOPROCID) {
> +        if (!(ctx->tc & RISCV_IOMMU_DC_TC_DPE)) {
> +            /* No default process_id enabled, set BARE mode */
> +            ctx->satp = 0ULL;
> +            return 0;
> +        } else {
> +            /* Use default process_id #0 */
> +            ctx->process_id = 0;
> +        }
> +    }
> +
> +    if (mode == RISCV_IOMMU_DC_FSC_MODE_BARE) {
> +        /* No S-Stage translation, done. */
> +        return 0;
> +    }
> +
> +    /* FSC.TC.PDTV enabled */
> +    if (mode > RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20) {
> +        /* Invalid PDTP.MODE */
> +        return RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED;
> +    }
> +
> +    for (depth = mode - RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8; depth-- > 0; ) {
> +        /*
> +         * Select process id index bits based on process directory tree
> +         * level. See IOMMU Specification, 2.2. Process-Directory-Table.
> +         */
> +        const int split = depth * 9 + 8;
> +        addr |= ((ctx->process_id >> split) << 3) & ~TARGET_PAGE_MASK;
> +        if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
> +                            MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +            return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
> +        }
> +        le64_to_cpus(&de);
> +        if (!(de & RISCV_IOMMU_PC_TA_V)) {
> +            return RISCV_IOMMU_FQ_CAUSE_PDT_INVALID;
> +        }
> +        addr = PPN_PHYS(get_field(de, RISCV_IOMMU_PC_FSC_PPN));
> +    }
> +
> +    /* Leaf entry in PDT */
> +    addr |= (ctx->process_id << 4) & ~TARGET_PAGE_MASK;
> +    if (dma_memory_read(s->target_as, addr, &dc.ta, sizeof(uint64_t) * 2,
> +                        MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +        return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
> +    }
> +
> +    /* Use FSC and TA from process directory entry. */
> +    ctx->ta = le64_to_cpu(dc.ta);
> +    ctx->satp = le64_to_cpu(dc.fsc);
> +
> +    if (!(ctx->ta & RISCV_IOMMU_PC_TA_V)) {
> +        return RISCV_IOMMU_FQ_CAUSE_PDT_INVALID;
> +    }
> +
> +    if (!riscv_iommu_validate_process_ctx(s, ctx)) {
> +        return RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED;
> +    }
> +
> +    return 0;
> +}
> +
> +/* Translation Context cache support */
> +static gboolean __ctx_equal(gconstpointer v1, gconstpointer v2)
> +{
> +    /* Key equality for the context cache: both identifiers must match. */
> +    const RISCVIOMMUContext *lhs = v1;
> +    const RISCVIOMMUContext *rhs = v2;
> +
> +    if (lhs->devid != rhs->devid) {
> +        return 0;
> +    }
> +    return lhs->process_id == rhs->process_id;
> +}
> +
> +static guint __ctx_hash(gconstpointer v)
> +{
> +    const RISCVIOMMUContext *ctx = v;
> +    /*
> +     * Simple hash of (process_id, devid): devid is assumed to fit in
> +     * 24 bits, process_id is placed above it.
> +     */
> +    const guint devid_part = (guint)(ctx->devid);
> +    const guint procid_part = (guint)(ctx->process_id) << 24;
> +
> +    return devid_part + procid_part;
> +}
> +
> +/* GHFunc: drop the valid bit of the cached entry matching devid + process_id */
> +static void __ctx_inval_devid_procid(gpointer key, gpointer value,
> +                                     gpointer data)
> +{
> +    RISCVIOMMUContext *entry = value;
> +    const RISCVIOMMUContext *match = data;
> +
> +    if (!(entry->tc & RISCV_IOMMU_DC_TC_V)) {
> +        /* Already invalid, nothing to do. */
> +        return;
> +    }
> +    if (entry->devid != match->devid) {
> +        return;
> +    }
> +    if (entry->process_id != match->process_id) {
> +        return;
> +    }
> +    entry->tc &= ~RISCV_IOMMU_DC_TC_V;
> +}
> +
> +/* GHFunc: drop the valid bit of every cached entry with a matching devid */
> +static void __ctx_inval_devid(gpointer key, gpointer value, gpointer data)
> +{
> +    RISCVIOMMUContext *entry = value;
> +    const RISCVIOMMUContext *match = data;
> +
> +    if (!(entry->tc & RISCV_IOMMU_DC_TC_V)) {
> +        /* Already invalid, nothing to do. */
> +        return;
> +    }
> +    if (entry->devid != match->devid) {
> +        return;
> +    }
> +    entry->tc &= ~RISCV_IOMMU_DC_TC_V;
> +}
> +
> +/* GHFunc: drop the valid bit of every cached entry; data is not used */
> +static void __ctx_inval_all(gpointer key, gpointer value, gpointer data)
> +{
> +    RISCVIOMMUContext *entry = value;
> +
> +    if (!(entry->tc & RISCV_IOMMU_DC_TC_V)) {
> +        return;
> +    }
> +    entry->tc &= ~RISCV_IOMMU_DC_TC_V;
> +}
> +
> +/*
> + * Run the invalidation callback 'func' over every entry of the context
> + * cache. The callback receives a temporary context carrying 'devid' and
> + * 'process_id' as its user data.
> + */
> +static void riscv_iommu_ctx_inval(RISCVIOMMUState *s, GHFunc func,
> +                                  uint32_t devid, uint32_t process_id)
> +{
> +    RISCVIOMMUContext match = { 0 };
> +    GHashTable *table;
> +
> +    match.devid = devid;
> +    match.process_id = process_id;
> +
> +    /* Hold a reference on the table for the duration of the walk. */
> +    table = g_hash_table_ref(s->ctx_cache);
> +
> +    qemu_mutex_lock(&s->ctx_lock);
> +    g_hash_table_foreach(table, func, &match);
> +    qemu_mutex_unlock(&s->ctx_lock);
> +
> +    g_hash_table_unref(table);
> +}
> +
> +/*
> + * Find or allocate translation context for a given {device_id, process_id}.
> + *
> + * Looks the pair up in s->ctx_cache. A cached entry is returned only while
> + * its TC.V bit is still set; otherwise a new context is allocated and
> + * populated with riscv_iommu_ctx_fetch().
> + *
> + * On success the context is returned and *ref is set to a referenced
> + * GHashTable; the caller releases it with riscv_iommu_ctx_put().
> + * On fetch failure the fault is reported, *ref is set to NULL and NULL is
> + * returned.
> + */
> +static RISCVIOMMUContext *riscv_iommu_ctx(RISCVIOMMUState *s,
> +                                          unsigned devid, unsigned process_id,
> +                                          void **ref)
> +{
> +    GHashTable *ctx_cache;
> +    RISCVIOMMUContext *ctx;
> +    RISCVIOMMUContext key = {
> +        .devid = devid,
> +        .process_id = process_id,
> +    };
> +
> +    /* Take our own reference; it is handed to the caller through *ref. */
> +    ctx_cache = g_hash_table_ref(s->ctx_cache);
> +    qemu_mutex_lock(&s->ctx_lock);
> +    ctx = g_hash_table_lookup(ctx_cache, &key);
> +    qemu_mutex_unlock(&s->ctx_lock);
> +
> +    /* Cache hit: only usable if the entry has not been invalidated. */
> +    if (ctx && (ctx->tc & RISCV_IOMMU_DC_TC_V)) {
> +        *ref = ctx_cache;
> +        return ctx;
> +    }
> +
> +    /* Miss or invalidated entry: build a fresh context. */
> +    ctx = g_new0(RISCVIOMMUContext, 1);
> +    ctx->devid = devid;
> +    ctx->process_id = process_id;
> +
> +    int fault = riscv_iommu_ctx_fetch(s, ctx);
> +    if (!fault) {
> +        qemu_mutex_lock(&s->ctx_lock);
> +        if (g_hash_table_size(ctx_cache) >= LIMIT_CACHE_CTX) {
> +            /*
> +             * Cache is full: drop our reference to the current table and
> +             * replace it with a new, empty one. The extra ref taken on the
> +             * new table is the one returned through *ref; the reference
> +             * that s->ctx_cache held on the old table is released after
> +             * the exchange.
> +             */
> +            g_hash_table_unref(ctx_cache);
> +            ctx_cache = g_hash_table_new_full(__ctx_hash, __ctx_equal,
> +                                              g_free, NULL);
> +            g_hash_table_ref(ctx_cache);
> +            g_hash_table_unref(qatomic_xchg(&s->ctx_cache, ctx_cache));
> +        }
> +        /* The context is its own key; the table frees it with g_free. */
> +        g_hash_table_add(ctx_cache, ctx);
> +        qemu_mutex_unlock(&s->ctx_lock);
> +        *ref = ctx_cache;
> +        return ctx;
> +    }
> +
> +    /* Fetch failed: release the table reference, nothing for the caller. */
> +    g_hash_table_unref(ctx_cache);
> +    *ref = NULL;
> +
> +    riscv_iommu_report_fault(s, ctx, RISCV_IOMMU_FQ_TTYPE_UADDR_RD,
> +                             fault, !!process_id, 0, 0);
> +
> +    g_free(ctx);
> +    return NULL;
> +}
> +
> +/* Release the cache reference handed out by riscv_iommu_ctx(), if any. */
> +static void riscv_iommu_ctx_put(RISCVIOMMUState *s, void *ref)
> +{
> +    if (ref == NULL) {
> +        return;
> +    }
> +    g_hash_table_unref((GHashTable *)ref);
> +}
> +
> +/*
> + * Return the IOVA address space registered for a given device, creating
> + * and adding one to s->spaces when none exists yet.
> + */
> +static AddressSpace *riscv_iommu_space(RISCVIOMMUState *s, uint32_t devid)
> +{
> +    RISCVIOMMUSpace *as;
> +    char name[64];
> +
> +    /* FIXME: PCIe bus remapping for attached endpoints. */
> +    devid |= s->bus << 8;
> +
> +    /* Search the list of already known spaces. */
> +    qemu_mutex_lock(&s->core_lock);
> +    QLIST_FOREACH(as, &s->spaces, list) {
> +        if (as->devid == devid) {
> +            break;
> +        }
> +    }
> +    qemu_mutex_unlock(&s->core_lock);
> +
> +    if (as != NULL) {
> +        return &as->iova_as;
> +    }
> +
> +    /* Not found: set up a new space for this device. */
> +    as = g_new0(RISCVIOMMUSpace, 1);
> +    as->iommu = s;
> +    as->devid = devid;
> +
> +    snprintf(name, sizeof(name), "riscv-iommu-%04x:%02x.%d-iova",
> +             PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid),
> +             PCI_FUNC(as->devid));
> +
> +    /* IOVA address space, untranslated addresses */
> +    memory_region_init_iommu(&as->iova_mr, sizeof(as->iova_mr),
> +                             TYPE_RISCV_IOMMU_MEMORY_REGION,
> +                             OBJECT(as), "riscv_iommu", UINT64_MAX);
> +    address_space_init(&as->iova_as, MEMORY_REGION(&as->iova_mr), name);
> +
> +    /* Publish the new space. */
> +    qemu_mutex_lock(&s->core_lock);
> +    QLIST_INSERT_HEAD(&s->spaces, as, list);
> +    qemu_mutex_unlock(&s->core_lock);
> +
> +    trace_riscv_iommu_new(s->parent_obj.id, PCI_BUS_NUM(as->devid),
> +                          PCI_SLOT(as->devid), PCI_FUNC(as->devid));
> +
> +    return &as->iova_as;
> +}
> +
> +/*
> + * Translate the request described by 'iotlb' using context 'ctx'.
> + *
> + * Returns 0 on success, or the fault cause returned by
> + * riscv_iommu_spa_fetch(). On failure either a page request is queued
> + * (automatic PRI, see below) or a fault record is reported.
> + */
> +static int riscv_iommu_translate(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
> +    IOMMUTLBEntry *iotlb)
> +{
> +    bool enable_pid;
> +    bool enable_pri;
> +    int fault;
> +
> +    /*
> +     * TC[32] is reserved for custom extensions, used here to temporarily
> +     * enable automatic page-request generation for ATS queries.
> +     */
> +    enable_pri = (iotlb->perm == IOMMU_NONE) && (ctx->tc & BIT_ULL(32));
> +    enable_pid = (ctx->tc & RISCV_IOMMU_DC_TC_PDTV);
> +
> +    /* Translate using device directory / page table information. */
> +    fault = riscv_iommu_spa_fetch(s, ctx, iotlb);
> +
> +    if (!fault) {
> +        return 0;
> +    }
> +
> +    if (enable_pri) {
> +        /* Queue a page request instead of reporting a fault. */
> +        struct riscv_iommu_pq_record pr = {0};
> +        if (enable_pid) {
> +            pr.hdr = set_field(RISCV_IOMMU_PREQ_HDR_PV,
> +                               RISCV_IOMMU_PREQ_HDR_PID, ctx->process_id);
> +        }
> +        pr.hdr = set_field(pr.hdr, RISCV_IOMMU_PREQ_HDR_DID, ctx->devid);
> +        pr.payload = (iotlb->iova & TARGET_PAGE_MASK) |
> +                     RISCV_IOMMU_PREQ_PAYLOAD_M;
> +        riscv_iommu_pri(s, &pr);
> +    } else {
> +        unsigned ttype;
> +
> +        /*
> +         * Classify by the write permission bit only. Testing against
> +         * IOMMU_RW (read | write) would flag plain reads as writes.
> +         */
> +        if (iotlb->perm & IOMMU_WO) {
> +            ttype = RISCV_IOMMU_FQ_TTYPE_UADDR_WR;
> +        } else {
> +            ttype = RISCV_IOMMU_FQ_TTYPE_UADDR_RD;
> +        }
> +
> +        riscv_iommu_report_fault(s, ctx, ttype, fault, enable_pid,
> +                                 iotlb->iova, iotlb->translated_addr);
> +    }
> +
> +    return fault;
> +}
> +
> +/* IOMMU Command Interface */
> +
> +/*
> + * IOFENCE handling: when 'notify' is set, write 'data' to 'addr' in the
> + * downstream address space and return the result of that write.
> + */
> +static MemTxResult riscv_iommu_iofence(RISCVIOMMUState *s, bool notify,
> +    uint64_t addr, uint32_t data)
> +{
> +    /*
> +     * ATS processing in this implementation of the IOMMU is synchronous,
> +     * no need to wait for completions here.
> +     */
> +    if (notify) {
> +        return dma_memory_write(s->target_as, addr, &data, sizeof(data),
> +                                MEMTXATTRS_UNSPECIFIED);
> +    }
> +
> +    return MEMTX_OK;
> +}
> +
> +/*
> + * Apply a guest write to DDTP: accept it if the mode transition is
> + * allowed, otherwise restore the previous value. The resulting value is
> + * stored in s->ddtp and written back to the register.
> + */
> +static void riscv_iommu_process_ddtp(RISCVIOMMUState *s)
> +{
> +    const uint64_t prev = s->ddtp;
> +    const uint64_t requested = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_DDTP);
> +    const unsigned new_mode = get_field(requested, RISCV_IOMMU_DDTP_MODE);
> +    const unsigned old_mode = get_field(prev, RISCV_IOMMU_DDTP_MODE);
> +
> +    /*
> +     * Check for allowed DDTP.MODE transitions:
> +     * {OFF, BARE}        -> {OFF, BARE, 1LVL, 2LVL, 3LVL}
> +     * {1LVL, 2LVL, 3LVL} -> {OFF, BARE}
> +     */
> +    const bool to_flat = new_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
> +                         new_mode == RISCV_IOMMU_DDTP_MODE_BARE;
> +    const bool to_tree = new_mode == RISCV_IOMMU_DDTP_MODE_1LVL ||
> +                         new_mode == RISCV_IOMMU_DDTP_MODE_2LVL ||
> +                         new_mode == RISCV_IOMMU_DDTP_MODE_3LVL;
> +    const bool from_flat = old_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
> +                           old_mode == RISCV_IOMMU_DDTP_MODE_BARE;
> +    const bool allowed = new_mode == old_mode || to_flat ||
> +                         (to_tree && from_flat);
> +    uint64_t result = prev;
> +
> +    if (allowed) {
> +        /* clear reserved and busy bits, report back sanitized version */
> +        result = set_field(requested & RISCV_IOMMU_DDTP_PPN,
> +                           RISCV_IOMMU_DDTP_MODE, new_mode);
> +    }
> +
> +    s->ddtp = result;
> +    riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_DDTP, result);
> +}
> +
> +/*
> + * Command function and opcode field: combines 'func' (shifted left by 7)
> + * with 'op' into the value compared against the decoded command in
> + * riscv_iommu_process_cq_tail().
> + */
> +#define RISCV_IOMMU_CMD(func, op) (((func) << 7) | (op))
> +
> +/*
> + * Process pending commands in the command queue.
> + *
> + * Commands are read one at a time from s->cq_addr, starting at the head
> + * index, until head reaches tail. After each successfully handled command
> + * the head index is advanced and written back to CQH. A memory error sets
> + * CQCSR.CQMF and an unknown or malformed command sets CQCSR.CMD_ILL; in
> + * both cases processing stops without advancing head, and an interrupt is
> + * raised if CQCSR.CIE was set when the function was entered.
> + */
> +static void riscv_iommu_process_cq_tail(RISCVIOMMUState *s)
> +{
> +    struct riscv_iommu_command cmd;
> +    MemTxResult res;
> +    dma_addr_t addr;
> +    uint32_t tail, head, ctrl;
> +    uint64_t cmd_opcode;
> +    GHFunc func;
> +
> +    /* Snapshot of control register and queue indices (masked to size). */
> +    ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
> +    tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQT) & s->cq_mask;
> +    head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQH) & s->cq_mask;
> +
> +    /* Check for pending error or queue processing disabled */
> +    if (!(ctrl & RISCV_IOMMU_CQCSR_CQON) ||
> +        !!(ctrl & (RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CQMF))) {
> +        return;
> +    }
> +
> +    while (tail != head) {
> +        /* Fetch the command at the current head index. */
> +        addr = s->cq_addr  + head * sizeof(cmd);
> +        res = dma_memory_read(s->target_as, addr, &cmd, sizeof(cmd),
> +                              MEMTXATTRS_UNSPECIFIED);
> +
> +        if (res != MEMTX_OK) {
> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
> +                                  RISCV_IOMMU_CQCSR_CQMF, 0);
> +            goto fault;
> +        }
> +
> +        trace_riscv_iommu_cmd(s->parent_obj.id, cmd.dword0, cmd.dword1);
> +
> +        /* Extract opcode and function together for a single dispatch. */
> +        cmd_opcode = get_field(cmd.dword0,
> +                               RISCV_IOMMU_CMD_OPCODE | RISCV_IOMMU_CMD_FUNC);
> +
> +        switch (cmd_opcode) {
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOFENCE_FUNC_C,
> +                             RISCV_IOMMU_CMD_IOFENCE_OPCODE):
> +            /* dword1 is used as the write address, DATA comes from dword0 */
> +            res = riscv_iommu_iofence(s,
> +                cmd.dword0 & RISCV_IOMMU_CMD_IOFENCE_AV, cmd.dword1,
> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IOFENCE_DATA));
> +
> +            if (res != MEMTX_OK) {
> +                riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
> +                                      RISCV_IOMMU_CQCSR_CQMF, 0);
> +                goto fault;
> +            }
> +            break;
> +
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA,
> +                             RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
> +            if (cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_PSCV) {
> +                /* illegal command arguments IOTINVAL.GVMA & PSCV == 1 */
> +                goto cmd_ill;
> +            }
> +            /* translation cache not implemented yet */
> +            break;
> +
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA,
> +                             RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
> +            /* translation cache not implemented yet */
> +            break;
> +
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT,
> +                             RISCV_IOMMU_CMD_IODIR_OPCODE):
> +            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
> +                /* invalidate all device context cache mappings */
> +                func = __ctx_inval_all;
> +            } else {
> +                /* invalidate all device context matching DID */
> +                func = __ctx_inval_devid;
> +            }
> +            riscv_iommu_ctx_inval(s, func,
> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID), 0);
> +            break;
> +
> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT,
> +                             RISCV_IOMMU_CMD_IODIR_OPCODE):
> +            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
> +                /* illegal command arguments IODIR_PDT & DV == 0 */
> +                goto cmd_ill;
> +            } else {
> +                func = __ctx_inval_devid_procid;
> +            }
> +            riscv_iommu_ctx_inval(s, func,
> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID),
> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_PID));
> +            break;
> +
> +        default:
> +        cmd_ill:
> +            /* Invalid instruction, do not advance instruction index. */
> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
> +                RISCV_IOMMU_CQCSR_CMD_ILL, 0);
> +            goto fault;
> +        }
> +
> +        /* Advance and update head pointer after command completes. */
> +        head = (head + 1) & s->cq_mask;
> +        riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_CQH, head);
> +    }
> +    return;
> +
> +fault:
> +    /* 'ctrl' is the value sampled on entry, before any error bit was set. */
> +    if (ctrl & RISCV_IOMMU_CQCSR_CIE) {
> +        riscv_iommu_notify(s, RISCV_IOMMU_INTR_CQ);
> +    }
> +}
> +
> +/*
> + * Handle a write to CQCSR: switch the command queue on or off according
> + * to CQEN/CQON and clear the BUSY bit.
> + */
> +static void riscv_iommu_process_cq_control(RISCVIOMMUState *s)
> +{
> +    const uint32_t csr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
> +    const bool requested = !!(csr & RISCV_IOMMU_CQCSR_CQEN);
> +    const bool running = !!(csr & RISCV_IOMMU_CQCSR_CQON);
> +    uint32_t set_bits = 0;
> +    uint32_t clr_bits = RISCV_IOMMU_CQCSR_BUSY;
> +
> +    if (requested && !running) {
> +        /* Enabling: latch base/size, reset indices, clear stale errors. */
> +        const uint64_t base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_CQB);
> +
> +        s->cq_mask = (2ULL << get_field(base, RISCV_IOMMU_CQB_LOG2SZ)) - 1;
> +        s->cq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_CQB_PPN));
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~s->cq_mask);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQH], 0);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQT], 0);
> +        set_bits = RISCV_IOMMU_CQCSR_CQON;
> +        clr_bits |= RISCV_IOMMU_CQCSR_CQMF |
> +                    RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CMD_TO |
> +                    RISCV_IOMMU_CQCSR_FENCE_W_IP;
> +    } else if (running && !requested) {
> +        /* Disabling: make the tail register fully read-only again. */
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~0);
> +        clr_bits |= RISCV_IOMMU_CQCSR_CQON;
> +    }
> +
> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR, set_bits, clr_bits);
> +}
> +
> +/*
> + * Handle a write to FQCSR: switch the fault queue on or off according
> + * to FQEN/FQON and clear the BUSY bit.
> + */
> +static void riscv_iommu_process_fq_control(RISCVIOMMUState *s)
> +{
> +    const uint32_t csr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
> +    const bool requested = !!(csr & RISCV_IOMMU_FQCSR_FQEN);
> +    const bool running = !!(csr & RISCV_IOMMU_FQCSR_FQON);
> +    uint32_t set_bits = 0;
> +    uint32_t clr_bits = RISCV_IOMMU_FQCSR_BUSY;
> +
> +    if (requested && !running) {
> +        /* Enabling: latch base/size, reset indices, clear stale errors. */
> +        const uint64_t base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_FQB);
> +
> +        s->fq_mask = (2ULL << get_field(base, RISCV_IOMMU_FQB_LOG2SZ)) - 1;
> +        s->fq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_FQB_PPN));
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~s->fq_mask);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQH], 0);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQT], 0);
> +        set_bits = RISCV_IOMMU_FQCSR_FQON;
> +        clr_bits |= RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF;
> +    } else if (running && !requested) {
> +        /* Disabling: make the head register fully read-only again. */
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~0);
> +        clr_bits |= RISCV_IOMMU_FQCSR_FQON;
> +    }
> +
> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR, set_bits, clr_bits);
> +}
> +
> +/*
> + * Handle a write to PQCSR: switch the page-request queue on or off
> + * according to PQEN/PQON and clear the BUSY bit.
> + */
> +static void riscv_iommu_process_pq_control(RISCVIOMMUState *s)
> +{
> +    const uint32_t csr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
> +    const bool requested = !!(csr & RISCV_IOMMU_PQCSR_PQEN);
> +    const bool running = !!(csr & RISCV_IOMMU_PQCSR_PQON);
> +    uint32_t set_bits = 0;
> +    uint32_t clr_bits = RISCV_IOMMU_PQCSR_BUSY;
> +
> +    if (requested && !running) {
> +        /* Enabling: latch base/size, reset indices, clear stale errors. */
> +        const uint64_t base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_PQB);
> +
> +        s->pq_mask = (2ULL << get_field(base, RISCV_IOMMU_PQB_LOG2SZ)) - 1;
> +        s->pq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_PQB_PPN));
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~s->pq_mask);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQH], 0);
> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQT], 0);
> +        set_bits = RISCV_IOMMU_PQCSR_PQON;
> +        clr_bits |= RISCV_IOMMU_PQCSR_PQMF | RISCV_IOMMU_PQCSR_PQOF;
> +    } else if (running && !requested) {
> +        /* Disabling: make the head register fully read-only again. */
> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~0);
> +        clr_bits |= RISCV_IOMMU_PQCSR_PQON;
> +    }
> +
> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR, set_bits, clr_bits);
> +}
> +
> +/*
> + * Handle a write to ICVEC: decode the vector assigned to each interrupt
> + * source and pass them to the icvec_update callback.
> + */
> +static void riscv_iommu_process_icvec_update(RISCVIOMMUState *s)
> +{
> +    const uint32_t reg = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_ICVEC);
> +
> +    /* Callback argument order: CQ, FQ, PM, PQ. */
> +    s->icvec_update(
> +        s,
> +        riscv_iommu_get_icvec_vector(reg, RISCV_IOMMU_INTR_CQ),
> +        riscv_iommu_get_icvec_vector(reg, RISCV_IOMMU_INTR_FQ),
> +        riscv_iommu_get_icvec_vector(reg, RISCV_IOMMU_INTR_PM),
> +        riscv_iommu_get_icvec_vector(reg, RISCV_IOMMU_INTR_PQ));
> +}
> +
> +/*
> + * Signature of the register post-write handlers selected and invoked by
> + * riscv_iommu_mmio_write().
> + */
> +typedef void riscv_iommu_process_fn(RISCVIOMMUState *s);
> +
> +/*
> + * Recompute the IPSR pending bits from 'data'.
> + *
> + * For each of CIP, FIP and PIP: the bit is set when it is present in
> + * 'data', the matching queue has its interrupt enable bit set, and at
> + * least one of its cause bits is set; otherwise the bit is cleared.
> + */
> +static void riscv_iommu_update_ipsr(RISCVIOMMUState *s, uint64_t data)
> +{
> +    uint32_t to_set = 0;
> +    uint32_t to_clear = 0;
> +    bool pending;
> +
> +    /* Command queue interrupt */
> +    pending = false;
> +    if (data & RISCV_IOMMU_IPSR_CIP) {
> +        const uint32_t cqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
> +
> +        pending = (cqcsr & RISCV_IOMMU_CQCSR_CIE) &&
> +                  ((cqcsr & RISCV_IOMMU_CQCSR_FENCE_W_IP) ||
> +                   (cqcsr & RISCV_IOMMU_CQCSR_CMD_ILL) ||
> +                   (cqcsr & RISCV_IOMMU_CQCSR_CMD_TO) ||
> +                   (cqcsr & RISCV_IOMMU_CQCSR_CQMF));
> +    }
> +    if (pending) {
> +        to_set |= RISCV_IOMMU_IPSR_CIP;
> +    } else {
> +        to_clear |= RISCV_IOMMU_IPSR_CIP;
> +    }
> +
> +    /* Fault queue interrupt */
> +    pending = false;
> +    if (data & RISCV_IOMMU_IPSR_FIP) {
> +        const uint32_t fqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
> +
> +        pending = (fqcsr & RISCV_IOMMU_FQCSR_FIE) &&
> +                  ((fqcsr & RISCV_IOMMU_FQCSR_FQOF) ||
> +                   (fqcsr & RISCV_IOMMU_FQCSR_FQMF));
> +    }
> +    if (pending) {
> +        to_set |= RISCV_IOMMU_IPSR_FIP;
> +    } else {
> +        to_clear |= RISCV_IOMMU_IPSR_FIP;
> +    }
> +
> +    /* Page-request queue interrupt */
> +    pending = false;
> +    if (data & RISCV_IOMMU_IPSR_PIP) {
> +        const uint32_t pqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
> +
> +        pending = (pqcsr & RISCV_IOMMU_PQCSR_PIE) &&
> +                  ((pqcsr & RISCV_IOMMU_PQCSR_PQOF) ||
> +                   (pqcsr & RISCV_IOMMU_PQCSR_PQMF));
> +    }
> +    if (pending) {
> +        to_set |= RISCV_IOMMU_IPSR_PIP;
> +    } else {
> +        to_clear |= RISCV_IOMMU_IPSR_PIP;
> +    }
> +
> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, to_set, to_clear);
> +}
> +
> +/*
> + * MMIO write handler for the IOMMU register file.
> + *
> + * Rejects misaligned accesses (MEMTX_ERROR) and accesses reaching
> + * RISCV_IOMMU_REG_MSI_CONFIG or beyond (MEMTX_ACCESS_ERROR). IPSR writes
> + * are handled by riscv_iommu_update_ipsr() and return early. All other
> + * writes are merged into regs_rw honouring the read-only (regs_ro) and
> + * write-1-to-clear (regs_wc) masks, the register's BUSY bit is set where
> + * one is defined, and the register's processing function, if any, is run
> + * under core_lock.
> + */
> +static MemTxResult riscv_iommu_mmio_write(void *opaque, hwaddr addr,
> +    uint64_t data, unsigned size, MemTxAttrs attrs)
> +{
> +    riscv_iommu_process_fn *process_fn = NULL;
> +    RISCVIOMMUState *s = opaque;
> +    uint32_t regb = addr & ~3;
> +    uint32_t busy = 0;
> +    uint64_t val = 0;
> +
> +    if ((addr & (size - 1)) != 0) {
> +        /* Unsupported MMIO alignment or access size */
> +        return MEMTX_ERROR;
> +    }
> +
> +    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
> +        /* Unsupported MMIO access location. */
> +        return MEMTX_ACCESS_ERROR;
> +    }
> +
> +    /* Track actionable MMIO write. */
> +    switch (regb) {
> +    case RISCV_IOMMU_REG_DDTP:
> +    case RISCV_IOMMU_REG_DDTP + 4:
> +        process_fn = riscv_iommu_process_ddtp;
> +        regb = RISCV_IOMMU_REG_DDTP;
> +        busy = RISCV_IOMMU_DDTP_BUSY;
> +        break;
> +
> +    case RISCV_IOMMU_REG_CQT:
> +        process_fn = riscv_iommu_process_cq_tail;
> +        break;
> +
> +    case RISCV_IOMMU_REG_CQCSR:
> +        process_fn = riscv_iommu_process_cq_control;
> +        busy = RISCV_IOMMU_CQCSR_BUSY;
> +        break;
> +
> +    case RISCV_IOMMU_REG_FQCSR:
> +        process_fn = riscv_iommu_process_fq_control;
> +        busy = RISCV_IOMMU_FQCSR_BUSY;
> +        break;
> +
> +    case RISCV_IOMMU_REG_PQCSR:
> +        process_fn = riscv_iommu_process_pq_control;
> +        busy = RISCV_IOMMU_PQCSR_BUSY;
> +        break;
> +
> +    case RISCV_IOMMU_REG_ICVEC:
> +        process_fn = riscv_iommu_process_icvec_update;
> +        break;
> +
> +    case RISCV_IOMMU_REG_IPSR:
> +        /*
> +         * IPSR has special procedures to update. Execute it
> +         * and exit.
> +         *
> +         * 'val' is consumed as a host-endian integer by
> +         * riscv_iommu_update_ipsr(), so assign it directly instead of
> +         * storing it with the little-endian byte helpers, which would
> +         * scramble the value on big-endian hosts.
> +         */
> +        if (size == 4) {
> +            uint32_t ro = ldl_le_p(&s->regs_ro[addr]);
> +            uint32_t wc = ldl_le_p(&s->regs_wc[addr]);
> +            uint32_t rw = ldl_le_p(&s->regs_rw[addr]);
> +            val = (uint32_t)(((rw & ro) | (data & ~ro)) & ~(data & wc));
> +        } else if (size == 8) {
> +            uint64_t ro = ldq_le_p(&s->regs_ro[addr]);
> +            uint64_t wc = ldq_le_p(&s->regs_wc[addr]);
> +            uint64_t rw = ldq_le_p(&s->regs_rw[addr]);
> +            val = ((rw & ro) | (data & ~ro)) & ~(data & wc);
> +        }
> +
> +        riscv_iommu_update_ipsr(s, val);
> +
> +        return MEMTX_OK;
> +
> +    default:
> +        break;
> +    }
> +
> +    /*
> +     * Registers update might be not synchronized with core logic.
> +     * If system software updates register when relevant BUSY bit
> +     * is set IOMMU behavior of additional writes to the register
> +     * is UNSPECIFIED.
> +     *
> +     * Merge rule for every access size: keep read-only bits from the
> +     * current value, take writable bits from 'data', then clear any
> +     * write-1-to-clear bit that 'data' has set.
> +     */
> +    qemu_spin_lock(&s->regs_lock);
> +    if (size == 1) {
> +        uint8_t ro = s->regs_ro[addr];
> +        uint8_t wc = s->regs_wc[addr];
> +        uint8_t rw = s->regs_rw[addr];
> +        s->regs_rw[addr] = ((rw & ro) | (data & ~ro)) & ~(data & wc);
> +    } else if (size == 2) {
> +        uint16_t ro = lduw_le_p(&s->regs_ro[addr]);
> +        uint16_t wc = lduw_le_p(&s->regs_wc[addr]);
> +        uint16_t rw = lduw_le_p(&s->regs_rw[addr]);
> +        stw_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
> +    } else if (size == 4) {
> +        uint32_t ro = ldl_le_p(&s->regs_ro[addr]);
> +        uint32_t wc = ldl_le_p(&s->regs_wc[addr]);
> +        uint32_t rw = ldl_le_p(&s->regs_rw[addr]);
> +        stl_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
> +    } else if (size == 8) {
> +        uint64_t ro = ldq_le_p(&s->regs_ro[addr]);
> +        uint64_t wc = ldq_le_p(&s->regs_wc[addr]);
> +        uint64_t rw = ldq_le_p(&s->regs_rw[addr]);
> +        stq_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
> +    }
> +
> +    /* Busy flag update, MSB 4-byte register. */
> +    if (busy) {
> +        uint32_t rw = ldl_le_p(&s->regs_rw[regb]);
> +        stl_le_p(&s->regs_rw[regb], rw | busy);
> +    }
> +    qemu_spin_unlock(&s->regs_lock);
> +
> +    /* Run the register-specific handler outside of the register lock. */
> +    if (process_fn) {
> +        qemu_mutex_lock(&s->core_lock);
> +        process_fn(s);
> +        qemu_mutex_unlock(&s->core_lock);
> +    }
> +
> +    return MEMTX_OK;
> +}
> +
> +/*
> + * MMIO read handler for the IOMMU register file: returns the current
> + * little-endian contents of regs_rw at 'addr' for 1, 2, 4 or 8 byte
> + * accesses.
> + */
> +static MemTxResult riscv_iommu_mmio_read(void *opaque, hwaddr addr,
> +    uint64_t *data, unsigned size, MemTxAttrs attrs)
> +{
> +    RISCVIOMMUState *s = opaque;
> +    uint64_t result;
> +    uint8_t *reg;
> +
> +    if ((addr & (size - 1)) != 0) {
> +        /* Unsupported MMIO alignment. */
> +        return MEMTX_ERROR;
> +    }
> +
> +    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
> +        return MEMTX_ACCESS_ERROR;
> +    }
> +
> +    reg = &s->regs_rw[addr];
> +
> +    switch (size) {
> +    case 1:
> +        result = (uint64_t)*reg;
> +        break;
> +    case 2:
> +        result = lduw_le_p(reg);
> +        break;
> +    case 4:
> +        result = ldl_le_p(reg);
> +        break;
> +    case 8:
> +        result = ldq_le_p(reg);
> +        break;
> +    default:
> +        /* Unsupported access size. */
> +        return MEMTX_ERROR;
> +    }
> +
> +    *data = result;
> +
> +    return MEMTX_OK;
> +}
> +
> +/*
> + * Access callbacks for the register MMIO region. Both the accepted
> + * (.valid) and implemented (.impl) access sizes are 4 to 8 bytes.
> + */
> +static const MemoryRegionOps riscv_iommu_mmio_ops = {
> +    .read_with_attrs = riscv_iommu_mmio_read,
> +    .write_with_attrs = riscv_iommu_mmio_write,
> +    .endianness = DEVICE_NATIVE_ENDIAN,
> +    .impl = {
> +        .min_access_size = 4,
> +        .max_access_size = 8,
> +        .unaligned = false,
> +    },
> +    .valid = {
> +        .min_access_size = 4,
> +        .max_access_size = 8,
> +    }
> +};
> +
> +/*
> + * Translations matching MSI pattern check are redirected to "riscv-iommu-trap"
> + * memory region as untranslated address, for additional MSI/MRIF interception
> + * by IOMMU interrupt remapping implementation.
> + * Note: Device emulation code generating an MSI is expected to provide valid
> + * memory transaction attributes with requester_id set.
> + */
> +/*
> + * Write handler for the trap region: looks up the device context of the
> + * requester (process_id 0) and forwards the write to
> + * riscv_iommu_msi_write(). Fails with MEMTX_ACCESS_ERROR when the
> + * transaction attributes are unspecified or no context is available.
> + */
> +static MemTxResult riscv_iommu_trap_write(void *opaque, hwaddr addr,
> +    uint64_t data, unsigned size, MemTxAttrs attrs)
> +{
> +    RISCVIOMMUState *s = (RISCVIOMMUState *)opaque;
> +    RISCVIOMMUContext *ctx;
> +    MemTxResult res = MEMTX_ACCESS_ERROR;
> +    uint32_t devid;
> +    void *ref;
> +
> +    if (attrs.unspecified) {
> +        return MEMTX_ACCESS_ERROR;
> +    }
> +
> +    /* FIXME: PCIe bus remapping for attached endpoints. */
> +    devid = attrs.requester_id;
> +    devid |= s->bus << 8;
> +
> +    ctx = riscv_iommu_ctx(s, devid, 0, &ref);
> +    if (ctx != NULL) {
> +        res = riscv_iommu_msi_write(s, ctx, addr, data, size, attrs);
> +    }
> +    /* Always drop the cache reference; ref is NULL when the lookup failed. */
> +    riscv_iommu_ctx_put(s, ref);
> +
> +    return res;
> +}
> +
> +/* Read handler for the trap region: every read fails with an access error. */
> +static MemTxResult riscv_iommu_trap_read(void *opaque, hwaddr addr,
> +    uint64_t *data, unsigned size, MemTxAttrs attrs)
> +{
> +    return MEMTX_ACCESS_ERROR;
> +}
> +
> +/*
> + * Access callbacks for the trap region: little-endian, 4 to 8 byte
> + * accesses, with unaligned accesses enabled in the implementation
> + * constraints.
> + */
> +static const MemoryRegionOps riscv_iommu_trap_ops = {
> +    .read_with_attrs = riscv_iommu_trap_read,
> +    .write_with_attrs = riscv_iommu_trap_write,
> +    .endianness = DEVICE_LITTLE_ENDIAN,
> +    .impl = {
> +        .min_access_size = 4,
> +        .max_access_size = 8,
> +        .unaligned = true,
> +    },
> +    .valid = {
> +        .min_access_size = 4,
> +        .max_access_size = 8,
> +    }
> +};
> +
> +/*
> + * Device realize: derive the capability register from the device
> + * properties, allocate the register file with its read-only and
> + * write-1-to-clear masks, set the power-on register state, and create
> + * the downstream and trap address spaces, the context cache and the locks.
> + */
> +static void riscv_iommu_realize(DeviceState *dev, Error **errp)
> +{
> +    RISCVIOMMUState *s = RISCV_IOMMU(dev);
> +
> +    /* Build the capabilities value from the enabled features. */
> +    s->cap = s->version & RISCV_IOMMU_CAP_VERSION;
> +    if (s->enable_msi) {
> +        s->cap |= RISCV_IOMMU_CAP_MSI_FLAT | RISCV_IOMMU_CAP_MSI_MRIF;
> +    }
> +    if (s->enable_s_stage) {
> +        s->cap |= RISCV_IOMMU_CAP_SV32 | RISCV_IOMMU_CAP_SV39 |
> +                  RISCV_IOMMU_CAP_SV48 | RISCV_IOMMU_CAP_SV57;
> +    }
> +    if (s->enable_g_stage) {
> +        s->cap |= RISCV_IOMMU_CAP_SV32X4 | RISCV_IOMMU_CAP_SV39X4 |
> +                  RISCV_IOMMU_CAP_SV48X4 | RISCV_IOMMU_CAP_SV57X4;
> +    }
> +    /* Report QEMU target physical address space limits */
> +    s->cap = set_field(s->cap, RISCV_IOMMU_CAP_PAS,
> +                       TARGET_PHYS_ADDR_SPACE_BITS);
> +
> +    /* TODO: method to report supported PID bits */
> +    s->pid_bits = 8; /* restricted to size of MemTxAttrs.pid */
> +    s->cap |= RISCV_IOMMU_CAP_PD8;
> +
> +    /*
> +     * Out-of-reset translation mode: OFF (DMA disabled) when enable_off
> +     * is set, BARE (passthrough) otherwise.
> +     */
> +    s->ddtp = set_field(0, RISCV_IOMMU_DDTP_MODE, s->enable_off ?
> +                        RISCV_IOMMU_DDTP_MODE_OFF : RISCV_IOMMU_DDTP_MODE_BARE);
> +
> +    /* register storage */
> +    s->regs_rw = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
> +    s->regs_ro = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
> +    s->regs_wc = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
> +
> +    /* Mark all registers read-only */
> +    memset(s->regs_ro, 0xff, RISCV_IOMMU_REG_SIZE);
> +
> +    /*
> +     * Register complete MMIO space, including MSI/PBA registers.
> +     * Note, PCIDevice implementation will add overlapping MR for MSI/PBA,
> +     * managed directly by the PCIDevice implementation.
> +     */
> +    memory_region_init_io(&s->regs_mr, OBJECT(dev), &riscv_iommu_mmio_ops, s,
> +        "riscv-iommu-regs", RISCV_IOMMU_REG_SIZE);
> +
> +    /*
> +     * Set power-on register state. A bit set in regs_ro is read-only, so
> +     * each ~(...) store below opens up only the listed fields for writing;
> +     * regs_wc marks write-1-to-clear bits.
> +     */
> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_CAP], s->cap);
> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_FCTL], 0);
> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FCTL],
> +             ~(RISCV_IOMMU_FCTL_BE | RISCV_IOMMU_FCTL_WSI));
> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_DDTP],
> +        ~(RISCV_IOMMU_DDTP_PPN | RISCV_IOMMU_DDTP_MODE));
> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQB],
> +        ~(RISCV_IOMMU_CQB_LOG2SZ | RISCV_IOMMU_CQB_PPN));
> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQB],
> +        ~(RISCV_IOMMU_FQB_LOG2SZ | RISCV_IOMMU_FQB_PPN));
> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQB],
> +        ~(RISCV_IOMMU_PQB_LOG2SZ | RISCV_IOMMU_PQB_PPN));
> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQMF |
> +        RISCV_IOMMU_CQCSR_CMD_TO | RISCV_IOMMU_CQCSR_CMD_ILL);
> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQON |
> +        RISCV_IOMMU_CQCSR_BUSY);
> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQMF |
> +        RISCV_IOMMU_FQCSR_FQOF);
> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQON |
> +        RISCV_IOMMU_FQCSR_BUSY);
> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQMF |
> +        RISCV_IOMMU_PQCSR_PQOF);
> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQON |
> +        RISCV_IOMMU_PQCSR_BUSY);
> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_IPSR], ~0);
> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_ICVEC], 0);
> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_DDTP], s->ddtp);
> +
> +    /* Memory region for downstream access, if specified. */
> +    if (s->target_mr) {
> +        s->target_as = g_new0(AddressSpace, 1);
> +        address_space_init(s->target_as, s->target_mr,
> +            "riscv-iommu-downstream");
> +    } else {
> +        /* Fallback to global system memory. */
> +        s->target_as = &address_space_memory;
> +    }
> +
> +    /* Memory region for untranslated MRIF/MSI writes */
> +    memory_region_init_io(&s->trap_mr, OBJECT(dev), &riscv_iommu_trap_ops, s,
> +            "riscv-iommu-trap", ~0ULL);
> +    address_space_init(&s->trap_as, &s->trap_mr, "riscv-iommu-trap-as");
> +
> +    /* Device translation context cache; entries are freed with g_free */
> +    s->ctx_cache = g_hash_table_new_full(__ctx_hash, __ctx_equal,
> +                                         g_free, NULL);
> +    qemu_mutex_init(&s->ctx_lock);
> +
> +    /* Not linked into any IOMMU list yet; no device address spaces. */
> +    s->iommus.le_next = NULL;
> +    s->iommus.le_prev = NULL;
> +    QLIST_INIT(&s->spaces);
> +    qemu_mutex_init(&s->core_lock);
> +    qemu_spin_init(&s->regs_lock);
> +}
> +
> +static void riscv_iommu_unrealize(DeviceState *dev)
> +{
> +    RISCVIOMMUState *s = RISCV_IOMMU(dev);
> +
> +    qemu_mutex_destroy(&s->core_lock);
> +    g_hash_table_unref(s->ctx_cache);
> +}
> +
> +static Property riscv_iommu_properties[] = {
> +    DEFINE_PROP_UINT32("version", RISCVIOMMUState, version,
> +        RISCV_IOMMU_SPEC_DOT_VER),
> +    DEFINE_PROP_UINT32("bus", RISCVIOMMUState, bus, 0x0),
> +    DEFINE_PROP_BOOL("intremap", RISCVIOMMUState, enable_msi, TRUE),
> +    DEFINE_PROP_BOOL("off", RISCVIOMMUState, enable_off, TRUE),
> +    DEFINE_PROP_BOOL("s-stage", RISCVIOMMUState, enable_s_stage, TRUE),
> +    DEFINE_PROP_BOOL("g-stage", RISCVIOMMUState, enable_g_stage, TRUE),
> +    DEFINE_PROP_LINK("downstream-mr", RISCVIOMMUState, target_mr,
> +        TYPE_MEMORY_REGION, MemoryRegion *),
> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
> +static void riscv_iommu_class_init(ObjectClass *klass, void* data)
> +{
> +    DeviceClass *dc = DEVICE_CLASS(klass);
> +
> +    /* internal device for riscv-iommu-{pci/sys}, not user-creatable */
> +    dc->user_creatable = false;
> +    dc->realize = riscv_iommu_realize;
> +    dc->unrealize = riscv_iommu_unrealize;
> +    device_class_set_props(dc, riscv_iommu_properties);
> +}
> +
> +static const TypeInfo riscv_iommu_info = {
> +    .name = TYPE_RISCV_IOMMU,
> +    .parent = TYPE_DEVICE,
> +    .instance_size = sizeof(RISCVIOMMUState),
> +    .class_init = riscv_iommu_class_init,
> +};
> +
> +static const char *IOMMU_FLAG_STR[] = {
> +    "NA",
> +    "RO",
> +    "WR",
> +    "RW",
> +};
> +
> +/* RISC-V IOMMU Memory Region - Address Translation Space */
> +static IOMMUTLBEntry riscv_iommu_memory_region_translate(
> +    IOMMUMemoryRegion *iommu_mr, hwaddr addr,
> +    IOMMUAccessFlags flag, int iommu_idx)
> +{
> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
> +    RISCVIOMMUContext *ctx;
> +    void *ref;
> +    IOMMUTLBEntry iotlb = {
> +        .iova = addr,
> +        .target_as = as->iommu->target_as,
> +        .addr_mask = ~0ULL,
> +        .perm = flag,
> +    };
> +
> +    ctx = riscv_iommu_ctx(as->iommu, as->devid, iommu_idx, &ref);
> +    if (ctx == NULL) {
> +        /* Translation disabled or invalid. */
> +        iotlb.addr_mask = 0;
> +        iotlb.perm = IOMMU_NONE;
> +    } else if (riscv_iommu_translate(as->iommu, ctx, &iotlb)) {
> +        /* Translation disabled or fault reported. */
> +        iotlb.addr_mask = 0;
> +        iotlb.perm = IOMMU_NONE;
> +    }
> +
> +    /* Trace all dma translations with original access flags. */
> +    trace_riscv_iommu_dma(as->iommu->parent_obj.id, PCI_BUS_NUM(as->devid),
> +                          PCI_SLOT(as->devid), PCI_FUNC(as->devid), iommu_idx,
> +                          IOMMU_FLAG_STR[flag & IOMMU_RW], iotlb.iova,
> +                          iotlb.translated_addr);
> +
> +    riscv_iommu_ctx_put(as->iommu, ref);
> +
> +    return iotlb;
> +}
> +
> +static int riscv_iommu_memory_region_notify(
> +    IOMMUMemoryRegion *iommu_mr, IOMMUNotifierFlag old,
> +    IOMMUNotifierFlag new, Error **errp)
> +{
> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
> +
> +    if (old == IOMMU_NOTIFIER_NONE) {
> +        as->notifier = true;
> +        trace_riscv_iommu_notifier_add(iommu_mr->parent_obj.name);
> +    } else if (new == IOMMU_NOTIFIER_NONE) {
> +        as->notifier = false;
> +        trace_riscv_iommu_notifier_del(iommu_mr->parent_obj.name);
> +    }
> +
> +    return 0;
> +}
> +
> +static inline bool pci_is_iommu(PCIDevice *pdev)
> +{
> +    return pci_get_word(pdev->config + PCI_CLASS_DEVICE) == 0x0806;
> +}
> +
> +static AddressSpace *riscv_iommu_find_as(PCIBus *bus, void *opaque, int devfn)
> +{
> +    RISCVIOMMUState *s = (RISCVIOMMUState *) opaque;
> +    PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
> +    AddressSpace *as = NULL;
> +
> +    if (pdev && pci_is_iommu(pdev)) {
> +        return s->target_as;
> +    }
> +
> +    /* Find first registered IOMMU device */
> +    while (s->iommus.le_prev) {
> +        s = *(s->iommus.le_prev);
> +    }
> +
> +    /* Find first matching IOMMU */
> +    while (s != NULL && as == NULL) {
> +        as = riscv_iommu_space(s, PCI_BUILD_BDF(pci_bus_num(bus), devfn));
> +        s = s->iommus.le_next;
> +    }
> +
> +    return as ? as : &address_space_memory;
> +}
> +
> +static const PCIIOMMUOps riscv_iommu_ops = {
> +    .get_address_space = riscv_iommu_find_as,
> +};
> +
> +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
> +        Error **errp)
> +{
> +    if (bus->iommu_ops &&
> +        bus->iommu_ops->get_address_space == riscv_iommu_find_as) {
> +        /* Allow multiple IOMMUs on the same PCIe bus, link known devices */
> +        RISCVIOMMUState *last = (RISCVIOMMUState *)bus->iommu_opaque;
> +        QLIST_INSERT_AFTER(last, iommu, iommus);
> +    } else if (!bus->iommu_ops && !bus->iommu_opaque) {
> +        pci_setup_iommu(bus, &riscv_iommu_ops, iommu);
> +    } else {
> +        error_setg(errp, "can't register secondary IOMMU for PCI bus #%d",
> +            pci_bus_num(bus));
> +    }
> +}
> +
> +static int riscv_iommu_memory_region_index(IOMMUMemoryRegion *iommu_mr,
> +    MemTxAttrs attrs)
> +{
> +    return attrs.unspecified ? RISCV_IOMMU_NOPROCID : (int)attrs.pid;
> +}
> +
> +static int riscv_iommu_memory_region_index_len(IOMMUMemoryRegion *iommu_mr)
> +{
> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
> +    return 1 << as->iommu->pid_bits;
> +}
> +
> +static void riscv_iommu_memory_region_init(ObjectClass *klass, void *data)
> +{
> +    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
> +
> +    imrc->translate = riscv_iommu_memory_region_translate;
> +    imrc->notify_flag_changed = riscv_iommu_memory_region_notify;
> +    imrc->attrs_to_index = riscv_iommu_memory_region_index;
> +    imrc->num_indexes = riscv_iommu_memory_region_index_len;
> +}
> +
> +static const TypeInfo riscv_iommu_memory_region_info = {
> +    .parent = TYPE_IOMMU_MEMORY_REGION,
> +    .name = TYPE_RISCV_IOMMU_MEMORY_REGION,
> +    .class_init = riscv_iommu_memory_region_init,
> +};
> +
> +static void riscv_iommu_register_mr_types(void)
> +{
> +    type_register_static(&riscv_iommu_memory_region_info);
> +    type_register_static(&riscv_iommu_info);
> +}
> +
> +type_init(riscv_iommu_register_mr_types);
> diff --git a/hw/riscv/riscv-iommu.h b/hw/riscv/riscv-iommu.h
> new file mode 100644
> index 0000000000..6d76cb9b1a
> --- /dev/null
> +++ b/hw/riscv/riscv-iommu.h
> @@ -0,0 +1,148 @@
> +/*
> + * QEMU emulation of an RISC-V IOMMU
> + *
> + * Copyright (C) 2022-2023 Rivos Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef HW_RISCV_IOMMU_STATE_H
> +#define HW_RISCV_IOMMU_STATE_H
> +
> +#include "qemu/osdep.h"
> +#include "qom/object.h"
> +
> +#include "hw/riscv/iommu.h"
> +
> +struct RISCVIOMMUState {
> +    /*< private >*/
> +    DeviceState parent_obj;
> +
> +    /*< public >*/
> +    uint32_t version;     /* Reported interface version number */
> +    uint32_t pid_bits;    /* process identifier width */
> +    uint32_t bus;         /* PCI bus mapping for non-root endpoints */
> +
> +    uint64_t cap;         /* IOMMU supported capabilities */
> +    uint64_t fctl;        /* IOMMU enabled features */
> +
> +    bool enable_off;      /* Enable out-of-reset OFF mode (DMA disabled) */
> +    bool enable_msi;      /* Enable MSI remapping */
> +    bool enable_s_stage;  /* Enable S/VS-Stage translation */
> +    bool enable_g_stage;  /* Enable G-Stage translation */
> +
> +    /* IOMMU Internal State */
> +    uint64_t ddtp;        /* Validated Device Directory Tree Root Pointer */
> +
> +    dma_addr_t cq_addr;   /* Command queue base physical address */
> +    dma_addr_t fq_addr;   /* Fault/event queue base physical address */
> +    dma_addr_t pq_addr;   /* Page request queue base physical address */
> +
> +    uint32_t cq_mask;     /* Command queue index bit mask */
> +    uint32_t fq_mask;     /* Fault/event queue index bit mask */
> +    uint32_t pq_mask;     /* Page request queue index bit mask */
> +
> +    /* interrupt notifier */
> +    void (*notify)(RISCVIOMMUState *iommu, unsigned vector);
> +    /* register icvec interrupts */
> +    void (*icvec_update)(RISCVIOMMUState *iommu,
> +                         uint32_t civ, uint32_t fiv,
> +                         uint32_t pmiv, uint32_t piv);
> +
> +    /* IOMMU State Machine */
> +    QemuThread core_proc; /* Background processing thread */
> +    QemuMutex core_lock;  /* Global IOMMU lock, used for cache/regs updates */
> +    QemuCond core_cond;   /* Background processing wake up signal */
> +    unsigned core_exec;   /* Processing thread execution actions */
> +
> +    /* IOMMU target address space */
> +    AddressSpace *target_as;
> +    MemoryRegion *target_mr;
> +
> +    /* MSI / MRIF access trap */
> +    AddressSpace trap_as;
> +    MemoryRegion trap_mr;
> +
> +    GHashTable *ctx_cache;          /* Device translation Context Cache */
> +    QemuMutex ctx_lock;      /* Device translation Cache update lock */
> +
> +    /* MMIO Hardware Interface */
> +    MemoryRegion regs_mr;
> +    QemuSpin regs_lock;
> +    uint8_t *regs_rw;  /* register state (user write) */
> +    uint8_t *regs_wc;  /* write-1-to-clear mask */
> +    uint8_t *regs_ro;  /* read-only mask */
> +
> +    QLIST_ENTRY(RISCVIOMMUState) iommus;
> +    QLIST_HEAD(, RISCVIOMMUSpace) spaces;
> +};
> +
> +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
> +         Error **errp);
> +
> +/* private helpers */
> +
> +/* Register helper functions */
> +static inline uint32_t riscv_iommu_reg_mod32(RISCVIOMMUState *s,
> +    unsigned idx, uint32_t set, uint32_t clr)
> +{
> +    uint32_t val;
> +    qemu_spin_lock(&s->regs_lock);
> +    val = ldl_le_p(s->regs_rw + idx);
> +    stl_le_p(s->regs_rw + idx, (val & ~clr) | set);
> +    qemu_spin_unlock(&s->regs_lock);
> +    return val;
> +}
> +
> +static inline void riscv_iommu_reg_set32(RISCVIOMMUState *s,
> +    unsigned idx, uint32_t set)
> +{
> +    qemu_spin_lock(&s->regs_lock);
> +    stl_le_p(s->regs_rw + idx, set);
> +    qemu_spin_unlock(&s->regs_lock);
> +}
> +
> +static inline uint32_t riscv_iommu_reg_get32(RISCVIOMMUState *s,
> +    unsigned idx)
> +{
> +    return ldl_le_p(s->regs_rw + idx);
> +}
> +
> +static inline uint64_t riscv_iommu_reg_mod64(RISCVIOMMUState *s,
> +    unsigned idx, uint64_t set, uint64_t clr)
> +{
> +    uint64_t val;
> +    qemu_spin_lock(&s->regs_lock);
> +    val = ldq_le_p(s->regs_rw + idx);
> +    stq_le_p(s->regs_rw + idx, (val & ~clr) | set);
> +    qemu_spin_unlock(&s->regs_lock);
> +    return val;
> +}
> +
> +static inline void riscv_iommu_reg_set64(RISCVIOMMUState *s,
> +    unsigned idx, uint64_t set)
> +{
> +    qemu_spin_lock(&s->regs_lock);
> +    stq_le_p(s->regs_rw + idx, set);
> +    qemu_spin_unlock(&s->regs_lock);
> +}
> +
> +static inline uint64_t riscv_iommu_reg_get64(RISCVIOMMUState *s,
> +    unsigned idx)
> +{
> +    return ldq_le_p(s->regs_rw + idx);
> +}
> +
> +
> +
> +#endif
> diff --git a/hw/riscv/trace-events b/hw/riscv/trace-events
> new file mode 100644
> index 0000000000..bdd8b657a6
> --- /dev/null
> +++ b/hw/riscv/trace-events
> @@ -0,0 +1,12 @@
> +# See documentation at docs/devel/tracing.rst
> +
> +# riscv-iommu.c
> +riscv_iommu_new(const char *id, unsigned b, unsigned d, unsigned f) "%s: device attached %04x:%02x.%d"
> +riscv_iommu_flt(const char *id, unsigned b, unsigned d, unsigned f, uint64_t reason, uint64_t iova) "%s: fault %04x:%02x.%u reason: 0x%"PRIx64" iova: 0x%"PRIx64
> +riscv_iommu_pri(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova) "%s: page request %04x:%02x.%u iova: 0x%"PRIx64
> +riscv_iommu_dma(const char *id, unsigned b, unsigned d, unsigned f, unsigned pasid, const char *dir, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u #%u %s 0x%"PRIx64" -> 0x%"PRIx64
> +riscv_iommu_msi(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u MSI 0x%"PRIx64" -> 0x%"PRIx64
> +riscv_iommu_mrif_notification(const char *id, uint32_t nid, uint64_t phys) "%s: sent MRIF notification 0x%x to 0x%"PRIx64
> +riscv_iommu_cmd(const char *id, uint64_t l, uint64_t u) "%s: command 0x%"PRIx64" 0x%"PRIx64
> +riscv_iommu_notifier_add(const char *id) "%s: dev-iotlb notifier added"
> +riscv_iommu_notifier_del(const char *id) "%s: dev-iotlb notifier removed"
> diff --git a/hw/riscv/trace.h b/hw/riscv/trace.h
> new file mode 100644
> index 0000000000..8c0e3ca1f3
> --- /dev/null
> +++ b/hw/riscv/trace.h
> @@ -0,0 +1 @@
> +#include "trace/trace-hw_riscv.h"
> diff --git a/include/hw/riscv/iommu.h b/include/hw/riscv/iommu.h
> new file mode 100644
> index 0000000000..070ee69973
> --- /dev/null
> +++ b/include/hw/riscv/iommu.h
> @@ -0,0 +1,36 @@
> +/*
> + * QEMU emulation of an RISC-V IOMMU
> + *
> + * Copyright (C) 2022-2023 Rivos Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef HW_RISCV_IOMMU_H
> +#define HW_RISCV_IOMMU_H
> +
> +#include "qemu/osdep.h"
> +#include "qom/object.h"
> +
> +#define TYPE_RISCV_IOMMU "riscv-iommu"
> +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUState, RISCV_IOMMU)
> +typedef struct RISCVIOMMUState RISCVIOMMUState;
> +
> +#define TYPE_RISCV_IOMMU_MEMORY_REGION "riscv-iommu-mr"
> +typedef struct RISCVIOMMUSpace RISCVIOMMUSpace;
> +
> +#define TYPE_RISCV_IOMMU_PCI "riscv-iommu-pci"
> +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUStatePci, RISCV_IOMMU_PCI)
> +typedef struct RISCVIOMMUStatePci RISCVIOMMUStatePci;
> +
> +#endif
> diff --git a/meson.build b/meson.build
> index a1e51277b0..359f836d8e 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -3373,6 +3373,7 @@ if have_system
>       'hw/pci-host',
>       'hw/ppc',
>       'hw/rtc',
> +    'hw/riscv',
>       'hw/s390x',
>       'hw/scsi',
>       'hw/sd',
Daniel Henrique Barboza Aug. 23, 2024, 12:42 p.m. UTC | #5
On 8/20/24 12:16 PM, Jason Chien wrote:
> Hi Daniel,
> 
> On 2024/8/1 下午 11:43, Daniel Henrique Barboza wrote:
>> From: Tomasz Jeznach <tjeznach@rivosinc.com>
>>
>> The RISC-V IOMMU specification is now ratified as-per the RISC-V
>> international process. The latest frozen specification can be found at:
>>
>> https://github.com/riscv-non-isa/riscv-iommu/releases/download/v1.0/riscv-iommu.pdf
>>
>> Add the foundation of the device emulation for RISC-V IOMMU. It includes
>> support for s-stage (sv32, sv39, sv48, sv57 caps) and g-stage (sv32x4,
>> sv39x4, sv48x4, sv57x4 caps).
>>
>> Other capabilities like ATS and DBG support will be added incrementally
>> in the next patches.
>>
>> Co-developed-by: Sebastien Boeuf <seb@rivosinc.com>
>> Signed-off-by: Sebastien Boeuf <seb@rivosinc.com>
>> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
>> Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
>> ---
>>   hw/riscv/Kconfig            |    4 +
>>   hw/riscv/meson.build        |    1 +
>>   hw/riscv/riscv-iommu-bits.h |   14 +
>>   hw/riscv/riscv-iommu.c      | 2017 +++++++++++++++++++++++++++++++++++
>>   hw/riscv/riscv-iommu.h      |  148 +++
>>   hw/riscv/trace-events       |   12 +
>>   hw/riscv/trace.h            |    1 +
>>   include/hw/riscv/iommu.h    |   36 +
>>   meson.build                 |    1 +
>>   9 files changed, 2234 insertions(+)
>>   create mode 100644 hw/riscv/riscv-iommu.c
>>   create mode 100644 hw/riscv/riscv-iommu.h
>>   create mode 100644 hw/riscv/trace-events
>>   create mode 100644 hw/riscv/trace.h
>>   create mode 100644 include/hw/riscv/iommu.h
>>
>> diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig
>> index a2030e3a6f..f69d6e3c8e 100644
>> --- a/hw/riscv/Kconfig
>> +++ b/hw/riscv/Kconfig
>> @@ -1,3 +1,6 @@
>> +config RISCV_IOMMU
>> +    bool
>> +
>>   config RISCV_NUMA
>>       bool
>> @@ -47,6 +50,7 @@ config RISCV_VIRT
>>       select SERIAL
>>       select RISCV_ACLINT
>>       select RISCV_APLIC
>> +    select RISCV_IOMMU
>>       select RISCV_IMSIC
>>       select SIFIVE_PLIC
>>       select SIFIVE_TEST
>> diff --git a/hw/riscv/meson.build b/hw/riscv/meson.build
>> index f872674093..cbc99c6e8e 100644
>> --- a/hw/riscv/meson.build
>> +++ b/hw/riscv/meson.build
>> @@ -10,5 +10,6 @@ riscv_ss.add(when: 'CONFIG_SIFIVE_U', if_true: files('sifive_u.c'))
>>   riscv_ss.add(when: 'CONFIG_SPIKE', if_true: files('spike.c'))
>>   riscv_ss.add(when: 'CONFIG_MICROCHIP_PFSOC', if_true: files('microchip_pfsoc.c'))
>>   riscv_ss.add(when: 'CONFIG_ACPI', if_true: files('virt-acpi-build.c'))
>> +riscv_ss.add(when: 'CONFIG_RISCV_IOMMU', if_true: files('riscv-iommu.c'))
>>   hw_arch += {'riscv': riscv_ss}
>> diff --git a/hw/riscv/riscv-iommu-bits.h b/hw/riscv/riscv-iommu-bits.h
>> index 9d9512ca71..6f1b9ab61b 100644
>> --- a/hw/riscv/riscv-iommu-bits.h
>> +++ b/hw/riscv/riscv-iommu-bits.h
>> @@ -69,6 +69,14 @@ struct riscv_iommu_pq_record {
>>   /* 5.3 IOMMU Capabilities (64bits) */
>>   #define RISCV_IOMMU_REG_CAP             0x0000
>>   #define RISCV_IOMMU_CAP_VERSION         GENMASK_ULL(7, 0)
>> +#define RISCV_IOMMU_CAP_SV32            BIT_ULL(8)
>> +#define RISCV_IOMMU_CAP_SV39            BIT_ULL(9)
>> +#define RISCV_IOMMU_CAP_SV48            BIT_ULL(10)
>> +#define RISCV_IOMMU_CAP_SV57            BIT_ULL(11)
>> +#define RISCV_IOMMU_CAP_SV32X4          BIT_ULL(16)
>> +#define RISCV_IOMMU_CAP_SV39X4          BIT_ULL(17)
>> +#define RISCV_IOMMU_CAP_SV48X4          BIT_ULL(18)
>> +#define RISCV_IOMMU_CAP_SV57X4          BIT_ULL(19)
>>   #define RISCV_IOMMU_CAP_MSI_FLAT        BIT_ULL(22)
>>   #define RISCV_IOMMU_CAP_MSI_MRIF        BIT_ULL(23)
>>   #define RISCV_IOMMU_CAP_T2GPA           BIT_ULL(26)
>> @@ -80,7 +88,9 @@ struct riscv_iommu_pq_record {
>>   /* 5.4 Features control register (32bits) */
>>   #define RISCV_IOMMU_REG_FCTL            0x0008
>> +#define RISCV_IOMMU_FCTL_BE             BIT(0)
>>   #define RISCV_IOMMU_FCTL_WSI            BIT(1)
>> +#define RISCV_IOMMU_FCTL_GXL            BIT(2)
>>   /* 5.5 Device-directory-table pointer (64bits) */
>>   #define RISCV_IOMMU_REG_DDTP            0x0010
>> @@ -203,6 +213,8 @@ struct riscv_iommu_dc {
>>   #define RISCV_IOMMU_DC_TC_DTF           BIT_ULL(4)
>>   #define RISCV_IOMMU_DC_TC_PDTV          BIT_ULL(5)
>>   #define RISCV_IOMMU_DC_TC_PRPR          BIT_ULL(6)
>> +#define RISCV_IOMMU_DC_TC_GADE          BIT_ULL(7)
>> +#define RISCV_IOMMU_DC_TC_SADE          BIT_ULL(8)
>>   #define RISCV_IOMMU_DC_TC_DPE           BIT_ULL(9)
>>   #define RISCV_IOMMU_DC_TC_SBE           BIT_ULL(10)
>>   #define RISCV_IOMMU_DC_TC_SXL           BIT_ULL(11)
>> @@ -309,9 +321,11 @@ enum riscv_iommu_fq_causes {
>>   /* Translation attributes fields */
>>   #define RISCV_IOMMU_PC_TA_V             BIT_ULL(0)
>> +#define RISCV_IOMMU_PC_TA_RESERVED      GENMASK_ULL(63, 32)
>>   /* First stage context fields */
>>   #define RISCV_IOMMU_PC_FSC_PPN          GENMASK_ULL(43, 0)
>> +#define RISCV_IOMMU_PC_FSC_RESERVED     GENMASK_ULL(59, 44)
>>   enum riscv_iommu_fq_ttypes {
>>       RISCV_IOMMU_FQ_TTYPE_NONE = 0,
>> diff --git a/hw/riscv/riscv-iommu.c b/hw/riscv/riscv-iommu.c
>> new file mode 100644
>> index 0000000000..ebe3a53a04
>> --- /dev/null
>> +++ b/hw/riscv/riscv-iommu.c
>> @@ -0,0 +1,2017 @@
(...)

>> +
>> +/*
>> + * RISCV IOMMU Address Translation Lookup - Page Table Walk
>> + *
>> + * Note: Code is based on get_physical_address() from target/riscv/cpu_helper.c
>> + * Both implementations can be merged into a single helper function in the
>> + * future. Keeping them separate for now, as error reporting and flow specifics
>> + * are sufficiently different to warrant separate implementations.
>> + *
>> + * @s        : IOMMU Device State
>> + * @ctx      : Translation context for device id and process address space id.
>> + * @iotlb    : translation data: physical address and access mode.
>> + * @return   : success or fault cause code.
>> + */
>> +static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
>> +    IOMMUTLBEntry *iotlb)
>> +{
>> +    dma_addr_t addr, base;
>> +    uint64_t satp, gatp, pte;
>> +    bool en_s, en_g;
>> +    struct {
>> +        unsigned char step;
>> +        unsigned char levels;
>> +        unsigned char ptidxbits;
>> +        unsigned char ptesize;
>> +    } sc[2];
>> +    /* Translation stage phase */
>> +    enum {
>> +        S_STAGE = 0,
>> +        G_STAGE = 1,
>> +    } pass;
>> +
>> +    satp = get_field(ctx->satp, RISCV_IOMMU_ATP_MODE_FIELD);
>> +    gatp = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);
>> +
>> +    en_s = satp != RISCV_IOMMU_DC_FSC_MODE_BARE;
>> +    en_g = gatp != RISCV_IOMMU_DC_IOHGATP_MODE_BARE;
>> +
>> +    /* Early check for MSI address match when IOVA == GPA */
>> +    if ((iotlb->perm & IOMMU_WO) &&
>> +        riscv_iommu_msi_check(s, ctx, iotlb->iova)) {
> I think the IOVA here may be a GVA, and we should not use a GVA to perform the MSI check. Section 2.3 states that we should always walk the first-stage page table to get the GPA (step 17) and then use that GPA to do the MSI check (step 18).

That's a good point. Even if we rule out the address being a GVA by doing a
second-stage check (via en_g), we're still assuming that iotlb->iova == GPA without
going through the first-stage translation. We're basically gambling on a match by
calling riscv_iommu_msi_check() this early.

I'll remove this early check entirely, since we're already doing a proper MSI check
with a translated GPA at the end of the first stage:


>> +            /* Translation phase completed (GPA or SPA) */
>> +            iotlb->translated_addr = base;
>> +            iotlb->perm = (pte & PTE_W) ? ((pte & PTE_R) ? IOMMU_RW : IOMMU_WO)
>> +                                                         : IOMMU_RO;
>> +
>> +            /* Check MSI GPA address match */
>> +            if (pass == S_STAGE && (iotlb->perm & IOMMU_WO) &&
>> +                riscv_iommu_msi_check(s, ctx, base)) {
>> +                /* Trap MSI writes and return GPA address. */
>> +                iotlb->target_as = &s->trap_as;
>> +                iotlb->addr_mask = ~TARGET_PAGE_MASK;
>> +                return 0;
>> +            }


This will be suboptimal for cases where IOVA == GPA, where guessing right would have
given us an early exit, but I'm more comfortable adhering fully to the spec this time.


Thanks,

Daniel


>> +        iotlb->target_as = &s->trap_as;
>> +        iotlb->translated_addr = iotlb->iova;
>> +        iotlb->addr_mask = ~TARGET_PAGE_MASK;
>> +        return 0;
>> +    }
>> +
>> +    /* Exit early for pass-through mode. */
>> +    if (!(en_s || en_g)) {
>> +        iotlb->translated_addr = iotlb->iova;
>> +        iotlb->addr_mask = ~TARGET_PAGE_MASK;
>> +        /* Allow R/W in pass-through mode */
>> +        iotlb->perm = IOMMU_RW;
>> +        return 0;
>> +    }
>> +
>> +    /* S/G translation parameters. */
>> +    for (pass = 0; pass < 2; pass++) {
>> +        uint32_t sv_mode;
>> +
>> +        sc[pass].step = 0;
>> +        if (pass ? (s->fctl & RISCV_IOMMU_FCTL_GXL) :
>> +            (ctx->tc & RISCV_IOMMU_DC_TC_SXL)) {
>> +            /* 32bit mode for GXL/SXL == 1 */
>> +            switch (pass ? gatp : satp) {
>> +            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
>> +                sc[pass].levels    = 0;
>> +                sc[pass].ptidxbits = 0;
>> +                sc[pass].ptesize   = 0;
>> +                break;
>> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4:
>> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV32X4 : RISCV_IOMMU_CAP_SV32;
>> +                if (!(s->cap & sv_mode)) {
>> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
>> +                }
>> +                sc[pass].levels    = 2;
>> +                sc[pass].ptidxbits = 10;
>> +                sc[pass].ptesize   = 4;
>> +                break;
>> +            default:
>> +                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
>> +            }
>> +        } else {
>> +            /* 64bit mode for GXL/SXL == 0 */
>> +            switch (pass ? gatp : satp) {
>> +            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
>> +                sc[pass].levels    = 0;
>> +                sc[pass].ptidxbits = 0;
>> +                sc[pass].ptesize   = 0;
>> +                break;
>> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4:
>> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV39X4 : RISCV_IOMMU_CAP_SV39;
>> +                if (!(s->cap & sv_mode)) {
>> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
>> +                }
>> +                sc[pass].levels    = 3;
>> +                sc[pass].ptidxbits = 9;
>> +                sc[pass].ptesize   = 8;
>> +                break;
>> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4:
>> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV48X4 : RISCV_IOMMU_CAP_SV48;
>> +                if (!(s->cap & sv_mode)) {
>> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
>> +                }
>> +                sc[pass].levels    = 4;
>> +                sc[pass].ptidxbits = 9;
>> +                sc[pass].ptesize   = 8;
>> +                break;
>> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4:
>> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV57X4 : RISCV_IOMMU_CAP_SV57;
>> +                if (!(s->cap & sv_mode)) {
>> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
>> +                }
>> +                sc[pass].levels    = 5;
>> +                sc[pass].ptidxbits = 9;
>> +                sc[pass].ptesize   = 8;
>> +                break;
>> +            default:
>> +                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
>> +            }
>> +        }
>> +    };
>> +
>> +    /* S/G stages translation tables root pointers */
>> +    gatp = PPN_PHYS(get_field(ctx->gatp, RISCV_IOMMU_ATP_PPN_FIELD));
>> +    satp = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_ATP_PPN_FIELD));
>> +    addr = (en_s && en_g) ? satp : iotlb->iova;
>> +    base = en_g ? gatp : satp;
>> +    pass = en_g ? G_STAGE : S_STAGE;
>> +
>> +    do {
>> +        const unsigned widened = (pass && !sc[pass].step) ? 2 : 0;
>> +        const unsigned va_bits = widened + sc[pass].ptidxbits;
>> +        const unsigned va_skip = TARGET_PAGE_BITS + sc[pass].ptidxbits *
>> +                                 (sc[pass].levels - 1 - sc[pass].step);
>> +        const unsigned idx = (addr >> va_skip) & ((1 << va_bits) - 1);
>> +        const dma_addr_t pte_addr = base + idx * sc[pass].ptesize;
>> +        const bool ade =
>> +            ctx->tc & (pass ? RISCV_IOMMU_DC_TC_GADE : RISCV_IOMMU_DC_TC_SADE);
>> +
>> +        /* Address range check before first level lookup */
>> +        if (!sc[pass].step) {
>> +            const uint64_t va_mask = (1ULL << (va_skip + va_bits)) - 1;
>> +            if ((addr & va_mask) != addr) {
>> +                return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
>> +            }
>> +        }
>> +
>> +        /* Read page table entry */
>> +        if (dma_memory_read(s->target_as, pte_addr, &pte,
>> +                sc[pass].ptesize, MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
>> +            return (iotlb->perm & IOMMU_WO) ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT
>> +                                            : RISCV_IOMMU_FQ_CAUSE_RD_FAULT;
>> +        }
>> +
>> +        if (sc[pass].ptesize == 4) {
>> +            pte = (uint64_t) le32_to_cpu(*((uint32_t *)&pte));
>> +        } else {
>> +            pte = le64_to_cpu(pte);
>> +        }
>> +
>> +        sc[pass].step++;
>> +        hwaddr ppn = pte >> PTE_PPN_SHIFT;
>> +
>> +        if (!(pte & PTE_V)) {
>> +            break;                /* Invalid PTE */
>> +        } else if (!(pte & (PTE_R | PTE_W | PTE_X))) {
>> +            base = PPN_PHYS(ppn); /* Inner PTE, continue walking */
>> +        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == PTE_W) {
>> +            break;                /* Reserved leaf PTE flags: PTE_W */
>> +        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == (PTE_W | PTE_X)) {
>> +            break;                /* Reserved leaf PTE flags: PTE_W + PTE_X */
>> +        } else if (ppn & ((1ULL << (va_skip - TARGET_PAGE_BITS)) - 1)) {
>> +            break;                /* Misaligned PPN */
>> +        } else if ((iotlb->perm & IOMMU_RO) && !(pte & PTE_R)) {
>> +            break;                /* Read access check failed */
>> +        } else if ((iotlb->perm & IOMMU_WO) && !(pte & PTE_W)) {
>> +            break;                /* Write access check failed */
>> +        } else if ((iotlb->perm & IOMMU_RO) && !ade && !(pte & PTE_A)) {
>> +            break;                /* Access bit not set */
>> +        } else if ((iotlb->perm & IOMMU_WO) && !ade && !(pte & PTE_D)) {
>> +            break;                /* Dirty bit not set */
>> +        } else {
>> +            /* Leaf PTE, translation completed. */
>> +            sc[pass].step = sc[pass].levels;
>> +            base = PPN_PHYS(ppn) | (addr & ((1ULL << va_skip) - 1));
>> +            /* Update address mask based on smallest translation granularity */
>> +            iotlb->addr_mask &= (1ULL << va_skip) - 1;
>> +            /* Continue with S-Stage translation? */
>> +            if (pass && sc[0].step != sc[0].levels) {
>> +                pass = S_STAGE;
>> +                addr = iotlb->iova;
>> +                continue;
>> +            }
>> +            /* Translation phase completed (GPA or SPA) */
>> +            iotlb->translated_addr = base;
>> +            iotlb->perm = (pte & PTE_W) ? ((pte & PTE_R) ? IOMMU_RW : IOMMU_WO)
>> +                                                         : IOMMU_RO;
>> +
>> +            /* Check MSI GPA address match */
>> +            if (pass == S_STAGE && (iotlb->perm & IOMMU_WO) &&
>> +                riscv_iommu_msi_check(s, ctx, base)) {
>> +                /* Trap MSI writes and return GPA address. */
>> +                iotlb->target_as = &s->trap_as;
>> +                iotlb->addr_mask = ~TARGET_PAGE_MASK;
>> +                return 0;
>> +            }
>> +
>> +            /* Continue with G-Stage translation? */
>> +            if (!pass && en_g) {
>> +                pass = G_STAGE;
>> +                addr = base;
>> +                base = gatp;
>> +                sc[pass].step = 0;
>> +                continue;
>> +            }
>> +
>> +            return 0;
>> +        }
>> +
>> +        if (sc[pass].step == sc[pass].levels) {
>> +            break; /* Can't find leaf PTE */
>> +        }
>> +
>> +        /* Continue with G-Stage translation? */
>> +        if (!pass && en_g) {
>> +            pass = G_STAGE;
>> +            addr = base;
>> +            base = gatp;
>> +            sc[pass].step = 0;
>> +        }
>> +    } while (1);
>> +
>> +    return (iotlb->perm & IOMMU_WO) ?
>> +                (pass ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS :
>> +                        RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S) :
>> +                (pass ? RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS :
>> +                        RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S);
>> +}
>> +
>> +/*
>> + * Build a fault queue record and pass it to riscv_iommu_fault().
>> + *
>> + * @s          : IOMMU Device State
>> + * @ctx        : Translation context; tc, devid and process_id are read.
>> + * @fault_type : Value stored in the record TTYPE field.
>> + * @cause      : Value stored in the record CAUSE field.
>> + * @pv         : True when ctx->process_id should be reported as valid.
>> + * @iotval     : Stored in the record as iotval.
>> + * @iotval2    : Stored in the record as iotval2.
>> + */
>> +static void riscv_iommu_report_fault(RISCVIOMMUState *s,
>> +                                     RISCVIOMMUContext *ctx,
>> +                                     uint32_t fault_type, uint32_t cause,
>> +                                     bool pv,
>> +                                     uint64_t iotval, uint64_t iotval2)
>> +{
>> +    struct riscv_iommu_fq_record ev = { 0 };
>> +
>> +    if (ctx->tc & RISCV_IOMMU_DC_TC_DTF) {
>> +        /* With DTF set only the causes listed here are still reported. */
>> +        switch (cause) {
>> +        case RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED:
>> +        case RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT:
>> +        case RISCV_IOMMU_FQ_CAUSE_DDT_INVALID:
>> +        case RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED:
>> +        case RISCV_IOMMU_FQ_CAUSE_DDT_CORRUPTED:
>> +        case RISCV_IOMMU_FQ_CAUSE_INTERNAL_DP_ERROR:
>> +        case RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT:
>> +            break;
>> +        default:
>> +            /* DTF prevents reporting a fault for this given cause */
>> +            return;
>> +        }
>> +    }
>> +
>> +    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_CAUSE, cause);
>> +    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_TTYPE, fault_type);
>> +    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_DID, ctx->devid);
>> +
>> +    /*
>> +     * PV marks the PID field as valid, so both are written together.
>> +     * ev starts zeroed: without pv the record carries PV = 0 and PID = 0.
>> +     */
>> +    if (pv) {
>> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PV, true);
>> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PID, ctx->process_id);
>> +    }
>> +
>> +    ev.iotval = iotval;
>> +    ev.iotval2 = iotval2;
>> +
>> +    riscv_iommu_fault(s, &ev);
>> +}
>> +
>> +/*
>> + * Redirect MSI write for given GPA.
>> + *
>> + * @s     : IOMMU Device State
>> + * @ctx   : Translation context; msi_addr_mask, msiptp, devid and
>> + *          process_id are read.
>> + * @gpa   : Guest physical address targeted by the write.
>> + * @data  : Value being written.
>> + * @size  : Access size in bytes, used for the pass-through write.
>> + * @attrs : Memory transaction attributes used for the MSI/MRIF accesses
>> + *          (the MSI PTE fetch itself uses MEMTXATTRS_UNSPECIFIED).
>> + * @return: MEMTX_OK on success. On failure a fault record is reported via
>> + *          riscv_iommu_report_fault() and the failing result is returned.
>> + */
>> +static MemTxResult riscv_iommu_msi_write(RISCVIOMMUState *s,
>> +    RISCVIOMMUContext *ctx, uint64_t gpa, uint64_t data,
>> +    unsigned size, MemTxAttrs attrs)
>> +{
>> +    MemTxResult res;
>> +    dma_addr_t addr;
>> +    uint64_t intn;
>> +    uint32_t n190;
>> +    uint64_t pte[2];
>> +    int fault_type = RISCV_IOMMU_FQ_TTYPE_UADDR_WR;
>> +    int cause;
>> +
>> +    /* Interrupt File Number */
>> +    intn = _pext_u64(PPN_DOWN(gpa), ctx->msi_addr_mask);
>> +    if (intn >= 256) {
>> +        /* Interrupt file number out of range */
>> +        res = MEMTX_ACCESS_ERROR;
>> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
>> +        goto err;
>> +    }
>> +
>> +    /* fetch MSI PTE */
>> +    addr = PPN_PHYS(get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_PPN));
>> +    addr = addr | (intn * sizeof(pte));
>> +    res = dma_memory_read(s->target_as, addr, &pte, sizeof(pte),
>> +            MEMTXATTRS_UNSPECIFIED);
>> +    if (res != MEMTX_OK) {
>> +        if (res == MEMTX_DECODE_ERROR) {
>> +            cause = RISCV_IOMMU_FQ_CAUSE_MSI_PT_CORRUPTED;
>> +        } else {
>> +            cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
>> +        }
>> +        goto err;
>> +    }
>> +
>> +    le64_to_cpus(&pte[0]);
>> +    le64_to_cpus(&pte[1]);
>> +
>> +    if (!(pte[0] & RISCV_IOMMU_MSI_PTE_V) || (pte[0] & RISCV_IOMMU_MSI_PTE_C)) {
>> +        /*
>> +         * The spec mentions that: "If msipte.C == 1, then further
>> +         * processing to interpret the PTE is implementation
>> +         * defined.". We'll abort with cause = 262 for this
>> +         * case too.
>> +         */
>> +        res = MEMTX_ACCESS_ERROR;
>> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_INVALID;
>> +        goto err;
>> +    }
>> +
>> +    switch (get_field(pte[0], RISCV_IOMMU_MSI_PTE_M)) {
>> +    case RISCV_IOMMU_MSI_PTE_M_BASIC:
>> +        /* MSI Pass-through mode */
>> +        addr = PPN_PHYS(get_field(pte[0], RISCV_IOMMU_MSI_PTE_PPN));
>> +        /*
>> +         * The page comes from the PTE; only the offset within the page is
>> +         * taken from the GPA. TARGET_PAGE_MASK selects the page-frame
>> +         * bits, hence the inversion.
>> +         */
>> +        addr = addr | (gpa & ~TARGET_PAGE_MASK);
>> +
>> +        trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
>> +                              PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
>> +                              gpa, addr);
>> +
>> +        res = dma_memory_write(s->target_as, addr, &data, size, attrs);
>> +        if (res != MEMTX_OK) {
>> +            cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
>> +            goto err;
>> +        }
>> +
>> +        return MEMTX_OK;
>> +    case RISCV_IOMMU_MSI_PTE_M_MRIF:
>> +        /* MRIF mode, continue. */
>> +        break;
>> +    default:
>> +        res = MEMTX_ACCESS_ERROR;
>> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED;
>> +        goto err;
>> +    }
>> +
>> +    /*
>> +     * Report an error for interrupt identities exceeding the maximum allowed
>> +     * for an IMSIC interrupt file (2047) or destination address is not 32-bit
>> +     * aligned. See IOMMU Specification, Chapter 2.3. MSI page tables.
>> +     */
>> +    if ((data > 2047) || (gpa & 3)) {
>> +        res = MEMTX_ACCESS_ERROR;
>> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED;
>> +        goto err;
>> +    }
>> +
>> +    /* MSI MRIF mode, non atomic pending bit update */
>> +
>> +    /* MRIF pending bit address */
>> +    addr = get_field(pte[0], RISCV_IOMMU_MSI_PTE_MRIF_ADDR) << 9;
>> +    addr = addr | ((data & 0x7c0) >> 3);
>> +
>> +    trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
>> +                          PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
>> +                          gpa, addr);
>> +
>> +    /* MRIF pending bit mask; from here on 'data' holds the mask */
>> +    data = 1ULL << (data & 0x03f);
>> +    res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
>> +    if (res != MEMTX_OK) {
>> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
>> +        goto err;
>> +    }
>> +
>> +    intn = intn | data;
>> +    res = dma_memory_write(s->target_as, addr, &intn, sizeof(intn), attrs);
>> +    if (res != MEMTX_OK) {
>> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
>> +        goto err;
>> +    }
>> +
>> +    /* Get MRIF enable bits (the 64-bit word following the pending bits) */
>> +    addr = addr + sizeof(intn);
>> +    res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
>> +    if (res != MEMTX_OK) {
>> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
>> +        goto err;
>> +    }
>> +
>> +    if (!(intn & data)) {
>> +        /* notification disabled, MRIF update completed. */
>> +        return MEMTX_OK;
>> +    }
>> +
>> +    /* Send notification message */
>> +    addr = PPN_PHYS(get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NPPN));
>> +    n190 = get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID) |
>> +          (get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID_MSB) << 10);
>> +
>> +    res = dma_memory_write(s->target_as, addr, &n190, sizeof(n190), attrs);
>> +    if (res != MEMTX_OK) {
>> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
>> +        goto err;
>> +    }
>> +
>> +    trace_riscv_iommu_mrif_notification(s->parent_obj.id, n190, addr);
>> +
>> +    return MEMTX_OK;
>> +
>> +err:
>> +    riscv_iommu_report_fault(s, ctx, fault_type, cause,
>> +                             !!ctx->process_id, 0, 0);
>> +    return res;
>> +}
>> +
>> +/*
>> + * Check device context configuration as described by the
>> + * riscv-iommu spec section "Device-context configuration
>> + * checks".
>> + *
>> + * @s      : IOMMU Device State; only s->cap is read.
>> + * @ctx    : Device context to validate; tc, msiptp and satp are read.
>> + * @return : true when no check fails, false otherwise.
>> + */
>> +static bool riscv_iommu_validate_device_ctx(RISCVIOMMUState *s,
>> +                                            RISCVIOMMUContext *ctx)
>> +{
>> +    uint32_t fsc_mode, msi_mode;
>> +
>> +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_EN_PRI) &&
>> +        ctx->tc & RISCV_IOMMU_DC_TC_PRPR) {
>> +        return false;
>> +    }
>> +
>> +    if (!(s->cap & RISCV_IOMMU_CAP_T2GPA) &&
>> +        ctx->tc & RISCV_IOMMU_DC_TC_T2GPA) {
>> +        return false;
>> +    }
>> +
>> +    if (s->cap & RISCV_IOMMU_CAP_MSI_FLAT) {
>> +        msi_mode = get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_MODE);
>> +
>> +        if (msi_mode != RISCV_IOMMU_DC_MSIPTP_MODE_OFF &&
>> +            msi_mode != RISCV_IOMMU_DC_MSIPTP_MODE_FLAT) {
>> +            return false;
>> +        }
>> +    }
>> +
>> +    fsc_mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
>> +
>> +    if (ctx->tc & RISCV_IOMMU_DC_TC_PDTV) {
>> +        /* FSC holds a process directory pointer: check the PDT mode. */
>> +        switch (fsc_mode) {
>> +        case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8:
>> +            if (!(s->cap & RISCV_IOMMU_CAP_PD8)) {
>> +                return false;
>> +            }
>> +            break;
>> +        case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD17:
>> +            if (!(s->cap & RISCV_IOMMU_CAP_PD17)) {
>> +                return false;
>> +            }
>> +            break;
>> +        case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20:
>> +            if (!(s->cap & RISCV_IOMMU_CAP_PD20)) {
>> +                return false;
>> +            }
>> +            break;
>> +        }
>> +    } else {
>> +        /* DC.tc.PDTV is 0 */
>> +        if (ctx->tc & RISCV_IOMMU_DC_TC_DPE) {
>> +            return false;
>> +        }
>> +
>> +        if (ctx->tc & RISCV_IOMMU_DC_TC_SXL) {
>> +            /*
>> +             * The sv32 mode has the same encoding (8) as sv39, so the
>> +             * mode field is compared against that value.
>> +             * RISCV_IOMMU_CAP_SV32 is a capability mask and is only
>> +             * meaningful when tested against s->cap.
>> +             */
>> +            if (fsc_mode == RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 &&
>> +                !(s->cap & RISCV_IOMMU_CAP_SV32)) {
>> +                return false;
>> +            }
>> +        } else {
>> +            switch (fsc_mode) {
>> +            case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
>> +                if (!(s->cap & RISCV_IOMMU_CAP_SV39)) {
>> +                    return false;
>> +                }
>> +                break;
>> +            case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
>> +                if (!(s->cap & RISCV_IOMMU_CAP_SV48)) {
>> +                    return false;
>> +                }
>> +                break;
>> +            case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
>> +                if (!(s->cap & RISCV_IOMMU_CAP_SV57)) {
>> +                    return false;
>> +                }
>> +                break;
>> +            }
>> +        }
>> +    }
>> +
>> +    /*
>> +     * CAP_END is always zero (only one endianness). FCTL_BE is
>> +     * always zero (little-endian accesses). Thus TC_SBE must
>> +     * always be LE, i.e. zero.
>> +     */
>> +    if (ctx->tc & RISCV_IOMMU_DC_TC_SBE) {
>> +        return false;
>> +    }
>> +
>> +    return true;
>> +}
>> +
>> +/*
>> + * Validate process context (PC) according to section
>> + * "Process-context configuration checks".
>> + *
>> + * @s      : IOMMU Device State; only s->cap is read.
>> + * @ctx    : Context holding the process context's ta and satp (FSC),
>> + *           plus the device context's tc.
>> + * @return : true when no check fails, false otherwise.
>> + */
>> +static bool riscv_iommu_validate_process_ctx(RISCVIOMMUState *s,
>> +                                             RISCVIOMMUContext *ctx)
>> +{
>> +    uint32_t mode;
>> +
>> +    if (get_field(ctx->ta, RISCV_IOMMU_PC_TA_RESERVED)) {
>> +        return false;
>> +    }
>> +
>> +    if (get_field(ctx->satp, RISCV_IOMMU_PC_FSC_RESERVED)) {
>> +        return false;
>> +    }
>> +
>> +    mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
>> +    switch (mode) {
>> +    case RISCV_IOMMU_DC_FSC_MODE_BARE:
>> +    /* sv39 and sv32 modes have the same value (8) */
>> +    case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
>> +    case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
>> +    case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
>> +        break;
>> +    default:
>> +        return false;
>> +    }
>> +
>> +    if (ctx->tc & RISCV_IOMMU_DC_TC_SXL) {
>> +        /*
>> +         * Compare against the mode encoding (shared by sv32 and sv39).
>> +         * RISCV_IOMMU_CAP_SV32 is a capability mask and is only
>> +         * meaningful when tested against s->cap.
>> +         */
>> +        if (mode == RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 &&
>> +            !(s->cap & RISCV_IOMMU_CAP_SV32)) {
>> +            return false;
>> +        }
>> +    } else {
>> +        switch (mode) {
>> +        case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
>> +            if (!(s->cap & RISCV_IOMMU_CAP_SV39)) {
>> +                return false;
>> +            }
>> +            break;
>> +        case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
>> +            if (!(s->cap & RISCV_IOMMU_CAP_SV48)) {
>> +                return false;
>> +            }
>> +            break;
>> +        case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
>> +            if (!(s->cap & RISCV_IOMMU_CAP_SV57)) {
>> +                return false;
>> +            }
>> +            break;
>> +        }
>> +    }
>> +
>> +    return true;
>> +}
>> +
>> +/*
>> + * RISC-V IOMMU Device Context Lookup - Device Directory Tree Walk
>> + *
>> + * Fills the translation context (tc, gatp, satp, ta, msiptp and the MSI
>> + * address mask/pattern) from guest memory, using s->ddtp as the root of
>> + * the device directory. When DC.tc.PDTV is set the process directory is
>> + * walked as well and ta/satp are replaced by the process context values.
>> + *
>> + * @s         : IOMMU Device State
>> + * @ctx       : Device Translation Context with devid and process_id set.
>> + * @return    : success (0) or fault code (RISCV_IOMMU_FQ_CAUSE_*).
>> + */
>> +static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
>> +{
>> +    const uint64_t ddtp = s->ddtp;
>> +    unsigned mode = get_field(ddtp, RISCV_IOMMU_DDTP_MODE);
>> +    dma_addr_t addr = PPN_PHYS(get_field(ddtp, RISCV_IOMMU_DDTP_PPN));
>> +    struct riscv_iommu_dc dc;
>> +    /* Device Context format: 0: extended (64 bytes) | 1: base (32 bytes) */
>> +    const int dc_fmt = !s->enable_msi;
>> +    const size_t dc_len = sizeof(dc) >> dc_fmt;
>> +    unsigned depth;
>> +    uint64_t de;
>> +
>> +    switch (mode) {
>> +    case RISCV_IOMMU_DDTP_MODE_OFF:
>> +        return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
>> +
>> +    case RISCV_IOMMU_DDTP_MODE_BARE:
>> +        /* mock up pass-through translation context */
>> +        ctx->gatp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
>> +            RISCV_IOMMU_DC_IOHGATP_MODE_BARE);
>> +        ctx->satp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
>> +            RISCV_IOMMU_DC_FSC_MODE_BARE);
>> +        ctx->tc = RISCV_IOMMU_DC_TC_V;
>> +        ctx->ta = 0;
>> +        ctx->msiptp = 0;
>> +        return 0;
>> +
>> +    case RISCV_IOMMU_DDTP_MODE_1LVL:
>> +        depth = 0;
>> +        break;
>> +
>> +    case RISCV_IOMMU_DDTP_MODE_2LVL:
>> +        depth = 1;
>> +        break;
>> +
>> +    case RISCV_IOMMU_DDTP_MODE_3LVL:
>> +        depth = 2;
>> +        break;
>> +
>> +    default:
>> +        return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
>> +    }
>> +
>> +    /*
>> +     * Check supported device id width (in bits).
>> +     * See IOMMU Specification, Chapter 6. Software guidelines.
>> +     * - if extended device-context format is used:
>> +     *   1LVL: 6, 2LVL: 15, 3LVL: 24
>> +     * - if base device-context format is used:
>> +     *   1LVL: 7, 2LVL: 16, 3LVL: 24
>> +     */
>> +    if (ctx->devid >= (1 << (depth * 9 + 6 + (dc_fmt && depth != 2)))) {
>> +        return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
>> +    }
>> +
>> +    /*
>> +     * Device directory tree walk: one non-leaf entry is read per remaining
>> +     * level; depth counts down to 0, leaving addr at the leaf page.
>> +     */
>> +    for (; depth-- > 0; ) {
>> +        /*
>> +         * Select device id index bits based on device directory tree level
>> +         * and device context format.
>> +         * See IOMMU Specification, Chapter 2. Data Structures.
>> +         * - if extended device-context format is used:
>> +         *   device index: [23:15][14:6][5:0]
>> +         * - if base device-context format is used:
>> +         *   device index: [23:16][15:7][6:0]
>> +         */
>> +        const int split = depth * 9 + 6 + dc_fmt;
>> +        addr |= ((ctx->devid >> split) << 3) & ~TARGET_PAGE_MASK;
>> +        if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
>> +                            MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
>> +            return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
>> +        }
>> +        le64_to_cpus(&de);
>> +        if (!(de & RISCV_IOMMU_DDTE_VALID)) {
>> +            /* invalid directory entry */
>> +            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
>> +        }
>> +        if (de & ~(RISCV_IOMMU_DDTE_PPN | RISCV_IOMMU_DDTE_VALID)) {
>> +            /* reserved bits set */
>> +            return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
>> +        }
>> +        addr = PPN_PHYS(get_field(de, RISCV_IOMMU_DDTE_PPN));
>> +    }
>> +
>> +    /*
>> +     * index into device context entry page
>> +     *
>> +     * dc is zeroed first and only dc_len bytes are read, so with the base
>> +     * format the second half of dc stays zero.
>> +     */
>> +    addr |= (ctx->devid * dc_len) & ~TARGET_PAGE_MASK;
>> +
>> +    memset(&dc, 0, sizeof(dc));
>> +    if (dma_memory_read(s->target_as, addr, &dc, dc_len,
>> +                        MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
>> +        return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
>> +    }
>> +
>> +    /* Set translation context (device context fields are little-endian). */
>> +    ctx->tc = le64_to_cpu(dc.tc);
>> +    ctx->gatp = le64_to_cpu(dc.iohgatp);
>> +    ctx->satp = le64_to_cpu(dc.fsc);
>> +    ctx->ta = le64_to_cpu(dc.ta);
>> +    ctx->msiptp = le64_to_cpu(dc.msiptp);
>> +    ctx->msi_addr_mask = le64_to_cpu(dc.msi_addr_mask);
>> +    ctx->msi_addr_pattern = le64_to_cpu(dc.msi_addr_pattern);
>> +
>> +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_V)) {
>> +        return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
>> +    }
>> +
>> +    if (!riscv_iommu_validate_device_ctx(s, ctx)) {
>> +        return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
>> +    }
>> +
>> +    /*
>> +     * FSC field checks
>> +     *
>> +     * mode and addr are reused from here on: they now hold the FSC mode
>> +     * and the address derived from the FSC PPN.
>> +     */
>> +    mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
>> +    addr = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_DC_FSC_PPN));
>> +
>> +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_PDTV)) {
>> +        if (ctx->process_id != RISCV_IOMMU_NOPROCID) {
>> +            /* PID is disabled */
>> +            return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
>> +        }
>> +        if (mode > RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57) {
>> +            /* Invalid translation mode */
>> +            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
>> +        }
>> +        return 0;
>> +    }
>> +
>> +    if (ctx->process_id == RISCV_IOMMU_NOPROCID) {
>> +        if (!(ctx->tc & RISCV_IOMMU_DC_TC_DPE)) {
>> +            /* No default process_id enabled, set BARE mode */
>> +            ctx->satp = 0ULL;
>> +            return 0;
>> +        } else {
>> +            /* Use default process_id #0 */
>> +            ctx->process_id = 0;
>> +        }
>> +    }
>> +
>> +    if (mode == RISCV_IOMMU_DC_FSC_MODE_BARE) {
>> +        /* No S-Stage translation, done. */
>> +        return 0;
>> +    }
>> +
>> +    /* FSC.TC.PDTV enabled: FSC mode selects the process directory depth */
>> +    if (mode > RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20) {
>> +        /* Invalid PDTP.MODE */
>> +        return RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED;
>> +    }
>> +
>> +    for (depth = mode - RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8; depth-- > 0; ) {
>> +        /*
>> +         * Select process id index bits based on process directory tree
>> +         * level. See IOMMU Specification, 2.2. Process-Directory-Table.
>> +         */
>> +        const int split = depth * 9 + 8;
>> +        addr |= ((ctx->process_id >> split) << 3) & ~TARGET_PAGE_MASK;
>> +        if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
>> +                            MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
>> +            return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
>> +        }
>> +        le64_to_cpus(&de);
>> +        if (!(de & RISCV_IOMMU_PC_TA_V)) {
>> +            return RISCV_IOMMU_FQ_CAUSE_PDT_INVALID;
>> +        }
>> +        addr = PPN_PHYS(get_field(de, RISCV_IOMMU_PC_FSC_PPN));
>> +    }
>> +
>> +    /*
>> +     * Leaf entry in PDT
>> +     *
>> +     * Two 64-bit words are read starting at dc.ta and then consumed as
>> +     * dc.ta and dc.fsc. NOTE(review): this assumes dc.fsc immediately
>> +     * follows dc.ta in struct riscv_iommu_dc - confirm against the header.
>> +     */
>> +    addr |= (ctx->process_id << 4) & ~TARGET_PAGE_MASK;
>> +    if (dma_memory_read(s->target_as, addr, &dc.ta, sizeof(uint64_t) * 2,
>> +                        MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
>> +        return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
>> +    }
>> +
>> +    /* Use FSC and TA from process directory entry. */
>> +    ctx->ta = le64_to_cpu(dc.ta);
>> +    ctx->satp = le64_to_cpu(dc.fsc);
>> +
>> +    if (!(ctx->ta & RISCV_IOMMU_PC_TA_V)) {
>> +        return RISCV_IOMMU_FQ_CAUSE_PDT_INVALID;
>> +    }
>> +
>> +    if (!riscv_iommu_validate_process_ctx(s, ctx)) {
>> +        return RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +/* Translation Context cache support */
>> +
>> +/* Key equality: two contexts match when devid and process_id both match. */
>> +static gboolean __ctx_equal(gconstpointer v1, gconstpointer v2)
>> +{
>> +    const RISCVIOMMUContext *lhs = v1;
>> +    const RISCVIOMMUContext *rhs = v2;
>> +
>> +    if (lhs->devid != rhs->devid) {
>> +        return false;
>> +    }
>> +    return lhs->process_id == rhs->process_id;
>> +}
>> +
>> +/*
>> + * Hash a context key: devid in the low bits, process_id shifted above
>> + * them (a 24-bit wide devid is assumed).
>> + */
>> +static guint __ctx_hash(gconstpointer v)
>> +{
>> +    const RISCVIOMMUContext *key = v;
>> +    const guint devid_bits = (guint)key->devid;
>> +    const guint procid_bits = (guint)key->process_id << 24;
>> +
>> +    return devid_bits + procid_bits;
>> +}
>> +
>> +/*
>> + * Hash table visitor: clear the valid bit of a cached context whose devid
>> + * and process_id both match the ones passed in @data.
>> + */
>> +static void __ctx_inval_devid_procid(gpointer key, gpointer value,
>> +                                     gpointer data)
>> +{
>> +    RISCVIOMMUContext *entry = value;
>> +    const RISCVIOMMUContext *match = data;
>> +
>> +    if (!(entry->tc & RISCV_IOMMU_DC_TC_V)) {
>> +        return;
>> +    }
>> +    if (entry->devid != match->devid ||
>> +        entry->process_id != match->process_id) {
>> +        return;
>> +    }
>> +    entry->tc &= ~RISCV_IOMMU_DC_TC_V;
>> +}
>> +
>> +/*
>> + * Hash table visitor: clear the valid bit of a cached context whose devid
>> + * matches the one passed in @data, whatever its process_id.
>> + */
>> +static void __ctx_inval_devid(gpointer key, gpointer value, gpointer data)
>> +{
>> +    RISCVIOMMUContext *entry = value;
>> +    const RISCVIOMMUContext *match = data;
>> +
>> +    if (!(entry->tc & RISCV_IOMMU_DC_TC_V)) {
>> +        return;
>> +    }
>> +    if (entry->devid != match->devid) {
>> +        return;
>> +    }
>> +    entry->tc &= ~RISCV_IOMMU_DC_TC_V;
>> +}
>> +
>> +/* Hash table visitor: clear the valid bit of every valid cached context. */
>> +static void __ctx_inval_all(gpointer key, gpointer value, gpointer data)
>> +{
>> +    RISCVIOMMUContext *entry = value;
>> +
>> +    if (!(entry->tc & RISCV_IOMMU_DC_TC_V)) {
>> +        return;
>> +    }
>> +    entry->tc &= ~RISCV_IOMMU_DC_TC_V;
>> +}
>> +
>> +/*
>> + * Run the invalidation visitor @func over the whole context cache, passing
>> + * it a key built from @devid and @process_id. The cache is referenced for
>> + * the duration of the call and visited with ctx_lock held.
>> + */
>> +static void riscv_iommu_ctx_inval(RISCVIOMMUState *s, GHFunc func,
>> +                                  uint32_t devid, uint32_t process_id)
>> +{
>> +    RISCVIOMMUContext match = {
>> +        .devid = devid,
>> +        .process_id = process_id,
>> +    };
>> +    GHashTable *cache = g_hash_table_ref(s->ctx_cache);
>> +
>> +    qemu_mutex_lock(&s->ctx_lock);
>> +    g_hash_table_foreach(cache, func, &match);
>> +    qemu_mutex_unlock(&s->ctx_lock);
>> +
>> +    g_hash_table_unref(cache);
>> +}
>> +
>> +/*
>> + * Find or allocate translation context for a given {device_id, process_id}
>> + *
>> + * On success the context is returned and *ref holds a referenced cache
>> + * handle to be released with riscv_iommu_ctx_put(). On failure a fault is
>> + * reported, *ref is set to NULL and NULL is returned.
>> + */
>> +static RISCVIOMMUContext *riscv_iommu_ctx(RISCVIOMMUState *s,
>> +                                          unsigned devid, unsigned process_id,
>> +                                          void **ref)
>> +{
>> +    RISCVIOMMUContext probe = {
>> +        .devid = devid,
>> +        .process_id = process_id,
>> +    };
>> +    GHashTable *cache = g_hash_table_ref(s->ctx_cache);
>> +    RISCVIOMMUContext *ctx;
>> +    int fault;
>> +
>> +    /* Fast path: a cached entry that is still marked valid. */
>> +    qemu_mutex_lock(&s->ctx_lock);
>> +    ctx = g_hash_table_lookup(cache, &probe);
>> +    qemu_mutex_unlock(&s->ctx_lock);
>> +
>> +    if (ctx && (ctx->tc & RISCV_IOMMU_DC_TC_V)) {
>> +        *ref = cache;
>> +        return ctx;
>> +    }
>> +
>> +    /* Slow path: build a fresh context from the in-memory tables. */
>> +    ctx = g_new0(RISCVIOMMUContext, 1);
>> +    ctx->devid = devid;
>> +    ctx->process_id = process_id;
>> +
>> +    fault = riscv_iommu_ctx_fetch(s, ctx);
>> +    if (fault) {
>> +        g_hash_table_unref(cache);
>> +        *ref = NULL;
>> +
>> +        riscv_iommu_report_fault(s, ctx, RISCV_IOMMU_FQ_TTYPE_UADDR_RD,
>> +                                 fault, !!process_id, 0, 0);
>> +
>> +        g_free(ctx);
>> +        return NULL;
>> +    }
>> +
>> +    qemu_mutex_lock(&s->ctx_lock);
>> +    if (g_hash_table_size(cache) >= LIMIT_CACHE_CTX) {
>> +        /* Cache is full: swap in a new empty table and drop the old one. */
>> +        g_hash_table_unref(cache);
>> +        cache = g_hash_table_new_full(__ctx_hash, __ctx_equal,
>> +                                      g_free, NULL);
>> +        g_hash_table_ref(cache);
>> +        g_hash_table_unref(qatomic_xchg(&s->ctx_cache, cache));
>> +    }
>> +    g_hash_table_add(cache, ctx);
>> +    qemu_mutex_unlock(&s->ctx_lock);
>> +
>> +    *ref = cache;
>> +    return ctx;
>> +}
>> +
>> +/* Drop the cache reference handed out by riscv_iommu_ctx(); NULL is a no-op. */
>> +static void riscv_iommu_ctx_put(RISCVIOMMUState *s, void *ref)
>> +{
>> +    GHashTable *cache = ref;
>> +
>> +    if (!cache) {
>> +        return;
>> +    }
>> +    g_hash_table_unref(cache);
>> +}
>> +
>> +/*
>> + * Find or allocate address space for a given device
>> + *
>> + * Returns the IOVA address space registered for @devid (combined with the
>> + * IOMMU bus number), creating and registering it on first use.
>> + */
>> +static AddressSpace *riscv_iommu_space(RISCVIOMMUState *s, uint32_t devid)
>> +{
>> +    RISCVIOMMUSpace *as;
>> +    char as_name[64];
>> +
>> +    /* FIXME: PCIe bus remapping for attached endpoints. */
>> +    devid |= s->bus << 8;
>> +
>> +    /* Look for a space already registered for this device id. */
>> +    qemu_mutex_lock(&s->core_lock);
>> +    QLIST_FOREACH(as, &s->spaces, list) {
>> +        if (as->devid == devid) {
>> +            break;
>> +        }
>> +    }
>> +    qemu_mutex_unlock(&s->core_lock);
>> +
>> +    if (as != NULL) {
>> +        return &as->iova_as;
>> +    }
>> +
>> +    /* None found: create one and add it to the list. */
>> +    as = g_new0(RISCVIOMMUSpace, 1);
>> +    as->iommu = s;
>> +    as->devid = devid;
>> +
>> +    snprintf(as_name, sizeof(as_name), "riscv-iommu-%04x:%02x.%d-iova",
>> +        PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid), PCI_FUNC(as->devid));
>> +
>> +    /* IOVA address space, untranslated addresses */
>> +    memory_region_init_iommu(&as->iova_mr, sizeof(as->iova_mr),
>> +        TYPE_RISCV_IOMMU_MEMORY_REGION,
>> +        OBJECT(as), "riscv_iommu", UINT64_MAX);
>> +    address_space_init(&as->iova_as, MEMORY_REGION(&as->iova_mr), as_name);
>> +
>> +    qemu_mutex_lock(&s->core_lock);
>> +    QLIST_INSERT_HEAD(&s->spaces, as, list);
>> +    qemu_mutex_unlock(&s->core_lock);
>> +
>> +    trace_riscv_iommu_new(s->parent_obj.id, PCI_BUS_NUM(as->devid),
>> +            PCI_SLOT(as->devid), PCI_FUNC(as->devid));
>> +
>> +    return &as->iova_as;
>> +}
>> +
>> +/*
>> + * Translate iotlb->iova for the given context and report any failure.
>> + *
>> + * @s      : IOMMU Device State
>> + * @ctx    : Translation context; tc, devid and process_id are read.
>> + * @iotlb  : Request (iova, perm) on input, filled by
>> + *           riscv_iommu_spa_fetch().
>> + * @return : 0 on success, otherwise the fault code returned by
>> + *           riscv_iommu_spa_fetch(). A failure is reported either as a
>> + *           page request (when enable_pri applies) or as a fault record.
>> + */
>> +static int riscv_iommu_translate(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
>> +    IOMMUTLBEntry *iotlb)
>> +{
>> +    bool enable_pid;
>> +    bool enable_pri;
>> +    int fault;
>> +
>> +    /*
>> +     * TC[32] is reserved for custom extensions, used here to temporarily
>> +     * enable automatic page-request generation for ATS queries.
>> +     */
>> +    enable_pri = (iotlb->perm == IOMMU_NONE) && (ctx->tc & BIT_ULL(32));
>> +    enable_pid = (ctx->tc & RISCV_IOMMU_DC_TC_PDTV);
>> +
>> +    /* Translate using device directory / page table information. */
>> +    fault = riscv_iommu_spa_fetch(s, ctx, iotlb);
>> +
>> +    if (enable_pri && fault) {
>> +        struct riscv_iommu_pq_record pr = {0};
>> +        if (enable_pid) {
>> +            pr.hdr = set_field(RISCV_IOMMU_PREQ_HDR_PV,
>> +                               RISCV_IOMMU_PREQ_HDR_PID, ctx->process_id);
>> +        }
>> +        pr.hdr = set_field(pr.hdr, RISCV_IOMMU_PREQ_HDR_DID, ctx->devid);
>> +        pr.payload = (iotlb->iova & TARGET_PAGE_MASK) |
>> +                     RISCV_IOMMU_PREQ_PAYLOAD_M;
>> +        riscv_iommu_pri(s, &pr);
>> +        return fault;
>> +    }
>> +
>> +    if (fault) {
>> +        unsigned ttype;
>> +
>> +        /*
>> +         * Only requests asking for write permission are write
>> +         * transactions. Testing IOMMU_RW here would also match read-only
>> +         * requests and report them as writes.
>> +         */
>> +        if (iotlb->perm & IOMMU_WO) {
>> +            ttype = RISCV_IOMMU_FQ_TTYPE_UADDR_WR;
>> +        } else {
>> +            ttype = RISCV_IOMMU_FQ_TTYPE_UADDR_RD;
>> +        }
>> +
>> +        riscv_iommu_report_fault(s, ctx, ttype, fault, enable_pid,
>> +                                 iotlb->iova, iotlb->translated_addr);
>> +        return fault;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +/* IOMMU Command Interface */
>> +
>> +/*
>> + * Complete an IOFENCE command: when @notify is set, write @data to @addr
>> + * and return the result of that write; otherwise return MEMTX_OK.
>> + */
>> +static MemTxResult riscv_iommu_iofence(RISCVIOMMUState *s, bool notify,
>> +    uint64_t addr, uint32_t data)
>> +{
>> +    if (notify) {
>> +        return dma_memory_write(s->target_as, addr, &data, sizeof(data),
>> +            MEMTXATTRS_UNSPECIFIED);
>> +    }
>> +
>> +    /*
>> +     * ATS processing in this implementation of the IOMMU is synchronous,
>> +     * no need to wait for completions here.
>> +     */
>> +    return MEMTX_OK;
>> +}
>> +
>> +/*
>> + * Apply a guest write to the DDTP register: accept it when the mode
>> + * transition is allowed, otherwise keep the previous value. The resulting
>> + * value is stored in s->ddtp and written back to the register.
>> + */
>> +static void riscv_iommu_process_ddtp(RISCVIOMMUState *s)
>> +{
>> +    const uint64_t cur_ddtp = s->ddtp;
>> +    const uint64_t req_ddtp = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_DDTP);
>> +    const unsigned req_mode = get_field(req_ddtp, RISCV_IOMMU_DDTP_MODE);
>> +    const unsigned cur_mode = get_field(cur_ddtp, RISCV_IOMMU_DDTP_MODE);
>> +    uint64_t result;
>> +    bool allowed;
>> +
>> +    /*
>> +     * Check for allowed DDTP.MODE transitions:
>> +     * {OFF, BARE}        -> {OFF, BARE, 1LVL, 2LVL, 3LVL}
>> +     * {1LVL, 2LVL, 3LVL} -> {OFF, BARE}
>> +     * Keeping the current mode is always accepted.
>> +     */
>> +    switch (req_mode) {
>> +    case RISCV_IOMMU_DDTP_MODE_OFF:
>> +    case RISCV_IOMMU_DDTP_MODE_BARE:
>> +        allowed = true;
>> +        break;
>> +    case RISCV_IOMMU_DDTP_MODE_1LVL:
>> +    case RISCV_IOMMU_DDTP_MODE_2LVL:
>> +    case RISCV_IOMMU_DDTP_MODE_3LVL:
>> +        allowed = req_mode == cur_mode ||
>> +                  cur_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
>> +                  cur_mode == RISCV_IOMMU_DDTP_MODE_BARE;
>> +        break;
>> +    default:
>> +        allowed = req_mode == cur_mode;
>> +        break;
>> +    }
>> +
>> +    if (allowed) {
>> +        /* clear reserved and busy bits, report back sanitized version */
>> +        result = set_field(req_ddtp & RISCV_IOMMU_DDTP_PPN,
>> +                           RISCV_IOMMU_DDTP_MODE, req_mode);
>> +    } else {
>> +        result = cur_ddtp;
>> +    }
>> +
>> +    s->ddtp = result;
>> +    riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_DDTP, result);
>> +}
>> +
>> +/*
>> + * Command function and opcode field.
>> + *
>> + * Shifts func left by 7 bits and ORs in op. The result is used as the case
>> + * labels matched against cmd_opcode in riscv_iommu_process_cq_tail().
>> + */
>> +#define RISCV_IOMMU_CMD(func, op) (((func) << 7) | (op))
>> +
>> +static void riscv_iommu_process_cq_tail(RISCVIOMMUState *s)
>> +{
>> +    struct riscv_iommu_command cmd;
>> +    MemTxResult res;
>> +    dma_addr_t addr;
>> +    uint32_t tail, head, ctrl;
>> +    uint64_t cmd_opcode;
>> +    GHFunc func;
>> +
>> +    ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
>> +    tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQT) & s->cq_mask;
>> +    head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQH) & s->cq_mask;
>> +
>> +    /*
>> +     * Check for pending error or queue processing disabled
>> +     *
>> +     * This function drains the command queue from head up to tail, using
>> +     * the values sampled above. Nothing is processed when CQON is clear or
>> +     * when CMD_ILL or CQMF is already set in CQCSR.
>> +     */
>> +    if (!(ctrl & RISCV_IOMMU_CQCSR_CQON) ||
>> +        !!(ctrl & (RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CQMF))) {
>> +        return;
>> +    }
>> +
>> +    while (tail != head) {
>> +        addr = s->cq_addr  + head * sizeof(cmd);
>> +        res = dma_memory_read(s->target_as, addr, &cmd, sizeof(cmd),
>> +                              MEMTXATTRS_UNSPECIFIED);
>> +
>> +        if (res != MEMTX_OK) {
>> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
>> +                                  RISCV_IOMMU_CQCSR_CQMF, 0);
>> +            goto fault;
>> +        }
>> +
>> +        trace_riscv_iommu_cmd(s->parent_obj.id, cmd.dword0, cmd.dword1);
>> +
>> +        cmd_opcode = get_field(cmd.dword0,
>> +                               RISCV_IOMMU_CMD_OPCODE | RISCV_IOMMU_CMD_FUNC);
>> +
>> +        switch (cmd_opcode) {
>> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOFENCE_FUNC_C,
>> +                             RISCV_IOMMU_CMD_IOFENCE_OPCODE):
>> +            res = riscv_iommu_iofence(s,
>> +                cmd.dword0 & RISCV_IOMMU_CMD_IOFENCE_AV, cmd.dword1,
>> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IOFENCE_DATA));
>> +
>> +            if (res != MEMTX_OK) {
>> +                riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
>> +                                      RISCV_IOMMU_CQCSR_CQMF, 0);
>> +                goto fault;
>> +            }
>> +            break;
>> +
>> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA,
>> +                             RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
>> +            if (cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_PSCV) {
>> +                /* illegal command arguments IOTINVAL.GVMA & PSCV == 1 */
>> +                goto cmd_ill;
>> +            }
>> +            /* translation cache not implemented yet */
>> +            break;
>> +
>> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA,
>> +                             RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
>> +            /* translation cache not implemented yet */
>> +            break;
>> +
>> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT,
>> +                             RISCV_IOMMU_CMD_IODIR_OPCODE):
>> +            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
>> +                /* invalidate all device context cache mappings */
>> +                func = __ctx_inval_all;
>> +            } else {
>> +                /* invalidate all device context matching DID */
>> +                func = __ctx_inval_devid;
>> +            }
>> +            riscv_iommu_ctx_inval(s, func,
>> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID), 0);
>> +            break;
>> +
>> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT,
>> +                             RISCV_IOMMU_CMD_IODIR_OPCODE):
>> +            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
>> +                /* illegal command arguments IODIR_PDT & DV == 0 */
>> +                goto cmd_ill;
>> +            } else {
>> +                func = __ctx_inval_devid_procid;
>> +            }
>> +            riscv_iommu_ctx_inval(s, func,
>> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID),
>> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_PID));
>> +            break;
>> +
>> +        default:
>> +        cmd_ill:
>> +            /*
>> +             * Invalid instruction, do not advance instruction index.
>> +             * Also reached via goto from the cases above when a known
>> +             * command carries illegal arguments.
>> +             */
>> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
>> +                RISCV_IOMMU_CQCSR_CMD_ILL, 0);
>> +            goto fault;
>> +        }
>> +
>> +        /*
>> +         * Advance and update head pointer after command completes.
>> +         * The fault paths above jump past this, so CQH keeps pointing at
>> +         * the command that failed.
>> +         */
>> +        head = (head + 1) & s->cq_mask;
>> +        riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_CQH, head);
>> +    }
>> +    return;
>> +
>> +fault:
>> +    if (ctrl & RISCV_IOMMU_CQCSR_CIE) {
>> +        riscv_iommu_notify(s, RISCV_IOMMU_INTR_CQ);
>> +    }
>> +}
>> +
>> +static void riscv_iommu_process_cq_control(RISCVIOMMUState *s)
>> +{
>> +    uint64_t base;
>> +    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
>> +    uint32_t ctrl_clr;
>> +    bool enable = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQEN);
>> +    bool active = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQON);
>> +
>> +    if (enable && !active) {
>> +        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_CQB);
>> +        s->cq_mask = (2ULL << get_field(base, RISCV_IOMMU_CQB_LOG2SZ)) - 1;
>> +        s->cq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_CQB_PPN));
>> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~s->cq_mask);
>> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQH], 0);
>> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQT], 0);
>> +        ctrl_set = RISCV_IOMMU_CQCSR_CQON;
>> +        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQMF |
>> +                   RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CMD_TO |
>> +                   RISCV_IOMMU_CQCSR_FENCE_W_IP;
>> +    } else if (!enable && active) {
>> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~0);
>> +        ctrl_set = 0;
>> +        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQON;
>> +    } else {
>> +        ctrl_set = 0;
>> +        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY;
>> +    }
>> +
>> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR, ctrl_set, ctrl_clr);
>> +}
>> +
>> +static void riscv_iommu_process_fq_control(RISCVIOMMUState *s)
>> +{
>> +    uint64_t base;
>> +    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
>> +    uint32_t ctrl_clr;
>> +    bool enable = !!(ctrl_set & RISCV_IOMMU_FQCSR_FQEN);
>> +    bool active = !!(ctrl_set & RISCV_IOMMU_FQCSR_FQON);
>> +
>> +    if (enable && !active) {
>> +        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_FQB);
>> +        s->fq_mask = (2ULL << get_field(base, RISCV_IOMMU_FQB_LOG2SZ)) - 1;
>> +        s->fq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_FQB_PPN));
>> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~s->fq_mask);
>> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQH], 0);
>> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQT], 0);
>> +        ctrl_set = RISCV_IOMMU_FQCSR_FQON;
>> +        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQMF |
>> +            RISCV_IOMMU_FQCSR_FQOF;
>> +    } else if (!enable && active) {
>> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~0);
>> +        ctrl_set = 0;
>> +        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQON;
>> +    } else {
>> +        ctrl_set = 0;
>> +        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY;
>> +    }
>> +
>> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR, ctrl_set, ctrl_clr);
>> +}
>> +
>> +static void riscv_iommu_process_pq_control(RISCVIOMMUState *s)
>> +{
>> +    uint64_t base;
>> +    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
>> +    uint32_t ctrl_clr;
>> +    bool enable = !!(ctrl_set & RISCV_IOMMU_PQCSR_PQEN);
>> +    bool active = !!(ctrl_set & RISCV_IOMMU_PQCSR_PQON);
>> +
>> +    if (enable && !active) {
>> +        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_PQB);
>> +        s->pq_mask = (2ULL << get_field(base, RISCV_IOMMU_PQB_LOG2SZ)) - 1;
>> +        s->pq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_PQB_PPN));
>> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~s->pq_mask);
>> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQH], 0);
>> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQT], 0);
>> +        ctrl_set = RISCV_IOMMU_PQCSR_PQON;
>> +        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQMF |
>> +            RISCV_IOMMU_PQCSR_PQOF;
>> +    } else if (!enable && active) {
>> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~0);
>> +        ctrl_set = 0;
>> +        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQON;
>> +    } else {
>> +        ctrl_set = 0;
>> +        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY;
>> +    }
>> +
>> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR, ctrl_set, ctrl_clr);
>> +}
>> +
>> +static void riscv_iommu_process_icvec_update(RISCVIOMMUState *s)
>> +{
>> +    uint32_t icvec = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_ICVEC);
>> +
>> +    s->icvec_update(s,
>> +                    riscv_iommu_get_icvec_vector(icvec, RISCV_IOMMU_INTR_CQ),
>> +                    riscv_iommu_get_icvec_vector(icvec, RISCV_IOMMU_INTR_FQ),
>> +                    riscv_iommu_get_icvec_vector(icvec, RISCV_IOMMU_INTR_PM),
>> +                    riscv_iommu_get_icvec_vector(icvec, RISCV_IOMMU_INTR_PQ));
>> +}
>> +
>> +typedef void riscv_iommu_process_fn(RISCVIOMMUState *s);
>> +
>> +static void riscv_iommu_update_ipsr(RISCVIOMMUState *s, uint64_t data)
>> +{
>> +    uint32_t cqcsr, fqcsr, pqcsr;
>> +    uint32_t ipsr_set = 0;
>> +    uint32_t ipsr_clr = 0;
>> +
>> +    if (data & RISCV_IOMMU_IPSR_CIP) {
>> +        cqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
>> +
>> +        if (cqcsr & RISCV_IOMMU_CQCSR_CIE &&
>> +            (cqcsr & RISCV_IOMMU_CQCSR_FENCE_W_IP ||
>> +             cqcsr & RISCV_IOMMU_CQCSR_CMD_ILL ||
>> +             cqcsr & RISCV_IOMMU_CQCSR_CMD_TO ||
>> +             cqcsr & RISCV_IOMMU_CQCSR_CQMF)) {
>> +            ipsr_set |= RISCV_IOMMU_IPSR_CIP;
>> +        } else {
>> +            ipsr_clr |= RISCV_IOMMU_IPSR_CIP;
>> +        }
>> +    } else {
>> +        ipsr_clr |= RISCV_IOMMU_IPSR_CIP;
>> +    }
>> +
>> +    if (data & RISCV_IOMMU_IPSR_FIP) {
>> +        fqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
>> +
>> +        if (fqcsr & RISCV_IOMMU_FQCSR_FIE &&
>> +            (fqcsr & RISCV_IOMMU_FQCSR_FQOF ||
>> +             fqcsr & RISCV_IOMMU_FQCSR_FQMF)) {
>> +            ipsr_set |= RISCV_IOMMU_IPSR_FIP;
>> +        } else {
>> +            ipsr_clr |= RISCV_IOMMU_IPSR_FIP;
>> +        }
>> +    } else {
>> +        ipsr_clr |= RISCV_IOMMU_IPSR_FIP;
>> +    }
>> +
>> +    if (data & RISCV_IOMMU_IPSR_PIP) {
>> +        pqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
>> +
>> +        if (pqcsr & RISCV_IOMMU_PQCSR_PIE &&
>> +            (pqcsr & RISCV_IOMMU_PQCSR_PQOF ||
>> +             pqcsr & RISCV_IOMMU_PQCSR_PQMF)) {
>> +            ipsr_set |= RISCV_IOMMU_IPSR_PIP;
>> +        } else {
>> +            ipsr_clr |= RISCV_IOMMU_IPSR_PIP;
>> +        }
>> +    } else {
>> +        ipsr_clr |= RISCV_IOMMU_IPSR_PIP;
>> +    }
>> +
>> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, ipsr_set, ipsr_clr);
>> +}
>> +
>> +static MemTxResult riscv_iommu_mmio_write(void *opaque, hwaddr addr,
>> +    uint64_t data, unsigned size, MemTxAttrs attrs)
>> +{
>> +    riscv_iommu_process_fn *process_fn = NULL;
>> +    RISCVIOMMUState *s = opaque;
>> +    uint32_t regb = addr & ~3;
>> +    uint32_t busy = 0;
>> +    uint64_t val = 0;
>> +
>> +    if ((addr & (size - 1)) != 0) {
>> +        /* Unsupported MMIO alignment or access size */
>> +        return MEMTX_ERROR;
>> +    }
>> +
>> +    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
>> +        /* Unsupported MMIO access location. */
>> +        return MEMTX_ACCESS_ERROR;
>> +    }
>> +
>> +    /* Track actionable MMIO write. */
>> +    switch (regb) {
>> +    case RISCV_IOMMU_REG_DDTP:
>> +    case RISCV_IOMMU_REG_DDTP + 4:
>> +        process_fn = riscv_iommu_process_ddtp;
>> +        regb = RISCV_IOMMU_REG_DDTP;
>> +        busy = RISCV_IOMMU_DDTP_BUSY;
>> +        break;
>> +
>> +    case RISCV_IOMMU_REG_CQT:
>> +        process_fn = riscv_iommu_process_cq_tail;
>> +        break;
>> +
>> +    case RISCV_IOMMU_REG_CQCSR:
>> +        process_fn = riscv_iommu_process_cq_control;
>> +        busy = RISCV_IOMMU_CQCSR_BUSY;
>> +        break;
>> +
>> +    case RISCV_IOMMU_REG_FQCSR:
>> +        process_fn = riscv_iommu_process_fq_control;
>> +        busy = RISCV_IOMMU_FQCSR_BUSY;
>> +        break;
>> +
>> +    case RISCV_IOMMU_REG_PQCSR:
>> +        process_fn = riscv_iommu_process_pq_control;
>> +        busy = RISCV_IOMMU_PQCSR_BUSY;
>> +        break;
>> +
>> +    case RISCV_IOMMU_REG_ICVEC:
>> +        process_fn = riscv_iommu_process_icvec_update;
>> +        break;
>> +
>> +    case RISCV_IOMMU_REG_IPSR:
>> +        /*
>> +         * IPSR has a special update procedure. Execute it
>> +         * and exit.
>> +         */
>> +        if (size == 4) {
>> +            uint32_t ro = ldl_le_p(&s->regs_ro[addr]);
>> +            uint32_t wc = ldl_le_p(&s->regs_wc[addr]);
>> +            uint32_t rw = ldl_le_p(&s->regs_rw[addr]);
>> +            stl_le_p(&val, ((rw & ro) | (data & ~ro)) & ~(data & wc));
>> +        } else if (size == 8) {
>> +            uint64_t ro = ldq_le_p(&s->regs_ro[addr]);
>> +            uint64_t wc = ldq_le_p(&s->regs_wc[addr]);
>> +            uint64_t rw = ldq_le_p(&s->regs_rw[addr]);
>> +            stq_le_p(&val, ((rw & ro) | (data & ~ro)) & ~(data & wc));
>> +        }
>> +
>> +        riscv_iommu_update_ipsr(s, val);
>> +
>> +        return MEMTX_OK;
>> +
>> +    default:
>> +        break;
>> +    }
>> +
>> +    /*
>> +     * Register updates might not be synchronized with core logic.
>> +     * If system software updates a register while the relevant BUSY
>> +     * bit is set, IOMMU behavior for additional writes to the register
>> +     * is UNSPECIFIED.
>> +     */
>> +    qemu_spin_lock(&s->regs_lock);
>> +    if (size == 1) {
>> +        uint8_t ro = s->regs_ro[addr];
>> +        uint8_t wc = s->regs_wc[addr];
>> +        uint8_t rw = s->regs_rw[addr];
>> +        s->regs_rw[addr] = ((rw & ro) | (data & ~ro)) & ~(data & wc);
>> +    } else if (size == 2) {
>> +        uint16_t ro = lduw_le_p(&s->regs_ro[addr]);
>> +        uint16_t wc = lduw_le_p(&s->regs_wc[addr]);
>> +        uint16_t rw = lduw_le_p(&s->regs_rw[addr]);
>> +        stw_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
>> +    } else if (size == 4) {
>> +        uint32_t ro = ldl_le_p(&s->regs_ro[addr]);
>> +        uint32_t wc = ldl_le_p(&s->regs_wc[addr]);
>> +        uint32_t rw = ldl_le_p(&s->regs_rw[addr]);
>> +        stl_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
>> +    } else if (size == 8) {
>> +        uint64_t ro = ldq_le_p(&s->regs_ro[addr]);
>> +        uint64_t wc = ldq_le_p(&s->regs_wc[addr]);
>> +        uint64_t rw = ldq_le_p(&s->regs_rw[addr]);
>> +        stq_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
>> +    }
>> +
>> +    /* Busy flag update, MSB 4-byte register. */
>> +    if (busy) {
>> +        uint32_t rw = ldl_le_p(&s->regs_rw[regb]);
>> +        stl_le_p(&s->regs_rw[regb], rw | busy);
>> +    }
>> +    qemu_spin_unlock(&s->regs_lock);
>> +
>> +    if (process_fn) {
>> +        qemu_mutex_lock(&s->core_lock);
>> +        process_fn(s);
>> +        qemu_mutex_unlock(&s->core_lock);
>> +    }
>> +
>> +    return MEMTX_OK;
>> +}
>> +
>> +static MemTxResult riscv_iommu_mmio_read(void *opaque, hwaddr addr,
>> +    uint64_t *data, unsigned size, MemTxAttrs attrs)
>> +{
>> +    RISCVIOMMUState *s = opaque;
>> +    uint64_t val = -1;
>> +    uint8_t *ptr;
>> +
>> +    if ((addr & (size - 1)) != 0) {
>> +        /* Unsupported MMIO alignment. */
>> +        return MEMTX_ERROR;
>> +    }
>> +
>> +    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
>> +        return MEMTX_ACCESS_ERROR;
>> +    }
>> +
>> +    ptr = &s->regs_rw[addr];
>> +
>> +    if (size == 1) {
>> +        val = (uint64_t)*ptr;
>> +    } else if (size == 2) {
>> +        val = lduw_le_p(ptr);
>> +    } else if (size == 4) {
>> +        val = ldl_le_p(ptr);
>> +    } else if (size == 8) {
>> +        val = ldq_le_p(ptr);
>> +    } else {
>> +        return MEMTX_ERROR;
>> +    }
>> +
>> +    *data = val;
>> +
>> +    return MEMTX_OK;
>> +}
>> +
>> +static const MemoryRegionOps riscv_iommu_mmio_ops = {
>> +    .read_with_attrs = riscv_iommu_mmio_read,
>> +    .write_with_attrs = riscv_iommu_mmio_write,
>> +    .endianness = DEVICE_NATIVE_ENDIAN,
>> +    .impl = {
>> +        .min_access_size = 4,
>> +        .max_access_size = 8,
>> +        .unaligned = false,
>> +    },
>> +    .valid = {
>> +        .min_access_size = 4,
>> +        .max_access_size = 8,
>> +    }
>> +};
>> +
>> +/*
>> + * Translations matching MSI pattern check are redirected to "riscv-iommu-trap"
>> + * memory region as untranslated address, for additional MSI/MRIF interception
>> + * by IOMMU interrupt remapping implementation.
>> + * Note: Device emulation code generating an MSI is expected to provide valid
>> + * memory transaction attributes with requester_id set.
>> + */
>> +static MemTxResult riscv_iommu_trap_write(void *opaque, hwaddr addr,
>> +    uint64_t data, unsigned size, MemTxAttrs attrs)
>> +{
>> +    RISCVIOMMUState* s = (RISCVIOMMUState *)opaque;
>> +    RISCVIOMMUContext *ctx;
>> +    MemTxResult res;
>> +    void *ref;
>> +    uint32_t devid = attrs.requester_id;
>> +
>> +    if (attrs.unspecified) {
>> +        return MEMTX_ACCESS_ERROR;
>> +    }
>> +
>> +    /* FIXME: PCIe bus remapping for attached endpoints. */
>> +    devid |= s->bus << 8;
>> +
>> +    ctx = riscv_iommu_ctx(s, devid, 0, &ref);
>> +    if (ctx == NULL) {
>> +        res = MEMTX_ACCESS_ERROR;
>> +    } else {
>> +        res = riscv_iommu_msi_write(s, ctx, addr, data, size, attrs);
>> +    }
>> +    riscv_iommu_ctx_put(s, ref);
>> +    return res;
>> +}
>> +
>> +static MemTxResult riscv_iommu_trap_read(void *opaque, hwaddr addr,
>> +    uint64_t *data, unsigned size, MemTxAttrs attrs)
>> +{
>> +    return MEMTX_ACCESS_ERROR;
>> +}
>> +
>> +static const MemoryRegionOps riscv_iommu_trap_ops = {
>> +    .read_with_attrs = riscv_iommu_trap_read,
>> +    .write_with_attrs = riscv_iommu_trap_write,
>> +    .endianness = DEVICE_LITTLE_ENDIAN,
>> +    .impl = {
>> +        .min_access_size = 4,
>> +        .max_access_size = 8,
>> +        .unaligned = true,
>> +    },
>> +    .valid = {
>> +        .min_access_size = 4,
>> +        .max_access_size = 8,
>> +    }
>> +};
>> +
>> +static void riscv_iommu_realize(DeviceState *dev, Error **errp)
>> +{
>> +    RISCVIOMMUState *s = RISCV_IOMMU(dev);
>> +
>> +    s->cap = s->version & RISCV_IOMMU_CAP_VERSION;
>> +    if (s->enable_msi) {
>> +        s->cap |= RISCV_IOMMU_CAP_MSI_FLAT | RISCV_IOMMU_CAP_MSI_MRIF;
>> +    }
>> +    if (s->enable_s_stage) {
>> +        s->cap |= RISCV_IOMMU_CAP_SV32 | RISCV_IOMMU_CAP_SV39 |
>> +                  RISCV_IOMMU_CAP_SV48 | RISCV_IOMMU_CAP_SV57;
>> +    }
>> +    if (s->enable_g_stage) {
>> +        s->cap |= RISCV_IOMMU_CAP_SV32X4 | RISCV_IOMMU_CAP_SV39X4 |
>> +                  RISCV_IOMMU_CAP_SV48X4 | RISCV_IOMMU_CAP_SV57X4;
>> +    }
>> +    /* Report QEMU target physical address space limits */
>> +    s->cap = set_field(s->cap, RISCV_IOMMU_CAP_PAS,
>> +                       TARGET_PHYS_ADDR_SPACE_BITS);
>> +
>> +    /* TODO: method to report supported PID bits */
>> +    s->pid_bits = 8; /* restricted to size of MemTxAttrs.pid */
>> +    s->cap |= RISCV_IOMMU_CAP_PD8;
>> +
>> +    /* Out-of-reset DDTP mode: OFF (DMA disabled) or BARE (passthrough) */
>> +    s->ddtp = set_field(0, RISCV_IOMMU_DDTP_MODE, s->enable_off ?
>> +                        RISCV_IOMMU_DDTP_MODE_OFF : RISCV_IOMMU_DDTP_MODE_BARE);
>> +
>> +    /* register storage */
>> +    s->regs_rw = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
>> +    s->regs_ro = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
>> +    s->regs_wc = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
>> +
>> +    /* Mark all registers read-only by default */
>> +    memset(s->regs_ro, 0xff, RISCV_IOMMU_REG_SIZE);
>> +
>> +    /*
>> +     * Register complete MMIO space, including MSI/PBA registers.
>> +     * Note, PCIDevice implementation will add overlapping MR for MSI/PBA,
>> +     * managed directly by the PCIDevice implementation.
>> +     */
>> +    memory_region_init_io(&s->regs_mr, OBJECT(dev), &riscv_iommu_mmio_ops, s,
>> +        "riscv-iommu-regs", RISCV_IOMMU_REG_SIZE);
>> +
>> +    /* Set power-on register state */
>> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_CAP], s->cap);
>> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_FCTL], 0);
>> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FCTL],
>> +             ~(RISCV_IOMMU_FCTL_BE | RISCV_IOMMU_FCTL_WSI));
>> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_DDTP],
>> +        ~(RISCV_IOMMU_DDTP_PPN | RISCV_IOMMU_DDTP_MODE));
>> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQB],
>> +        ~(RISCV_IOMMU_CQB_LOG2SZ | RISCV_IOMMU_CQB_PPN));
>> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQB],
>> +        ~(RISCV_IOMMU_FQB_LOG2SZ | RISCV_IOMMU_FQB_PPN));
>> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQB],
>> +        ~(RISCV_IOMMU_PQB_LOG2SZ | RISCV_IOMMU_PQB_PPN));
>> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQMF |
>> +        RISCV_IOMMU_CQCSR_CMD_TO | RISCV_IOMMU_CQCSR_CMD_ILL);
>> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQON |
>> +        RISCV_IOMMU_CQCSR_BUSY);
>> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQMF |
>> +        RISCV_IOMMU_FQCSR_FQOF);
>> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQON |
>> +        RISCV_IOMMU_FQCSR_BUSY);
>> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQMF |
>> +        RISCV_IOMMU_PQCSR_PQOF);
>> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQON |
>> +        RISCV_IOMMU_PQCSR_BUSY);
>> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_IPSR], ~0);
>> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_ICVEC], 0);
>> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_DDTP], s->ddtp);
>> +
>> +    /* Memory region for downstream access, if specified. */
>> +    if (s->target_mr) {
>> +        s->target_as = g_new0(AddressSpace, 1);
>> +        address_space_init(s->target_as, s->target_mr,
>> +            "riscv-iommu-downstream");
>> +    } else {
>> +        /* Fallback to global system memory. */
>> +        s->target_as = &address_space_memory;
>> +    }
>> +
>> +    /* Memory region for untranslated MRIF/MSI writes */
>> +    memory_region_init_io(&s->trap_mr, OBJECT(dev), &riscv_iommu_trap_ops, s,
>> +            "riscv-iommu-trap", ~0ULL);
>> +    address_space_init(&s->trap_as, &s->trap_mr, "riscv-iommu-trap-as");
>> +
>> +    /* Device translation context cache */
>> +    s->ctx_cache = g_hash_table_new_full(__ctx_hash, __ctx_equal,
>> +                                         g_free, NULL);
>> +    qemu_mutex_init(&s->ctx_lock);
>> +
>> +    s->iommus.le_next = NULL;
>> +    s->iommus.le_prev = NULL;
>> +    QLIST_INIT(&s->spaces);
>> +    qemu_mutex_init(&s->core_lock);
>> +    qemu_spin_init(&s->regs_lock);
>> +}
>> +
>> +static void riscv_iommu_unrealize(DeviceState *dev)
>> +{
>> +    RISCVIOMMUState *s = RISCV_IOMMU(dev);
>> +
>> +    qemu_mutex_destroy(&s->core_lock);
>> +    g_hash_table_unref(s->ctx_cache);
>> +}
>> +
>> +static Property riscv_iommu_properties[] = {
>> +    DEFINE_PROP_UINT32("version", RISCVIOMMUState, version,
>> +        RISCV_IOMMU_SPEC_DOT_VER),
>> +    DEFINE_PROP_UINT32("bus", RISCVIOMMUState, bus, 0x0),
>> +    DEFINE_PROP_BOOL("intremap", RISCVIOMMUState, enable_msi, TRUE),
>> +    DEFINE_PROP_BOOL("off", RISCVIOMMUState, enable_off, TRUE),
>> +    DEFINE_PROP_BOOL("s-stage", RISCVIOMMUState, enable_s_stage, TRUE),
>> +    DEFINE_PROP_BOOL("g-stage", RISCVIOMMUState, enable_g_stage, TRUE),
>> +    DEFINE_PROP_LINK("downstream-mr", RISCVIOMMUState, target_mr,
>> +        TYPE_MEMORY_REGION, MemoryRegion *),
>> +    DEFINE_PROP_END_OF_LIST(),
>> +};
>> +
>> +static void riscv_iommu_class_init(ObjectClass *klass, void* data)
>> +{
>> +    DeviceClass *dc = DEVICE_CLASS(klass);
>> +
>> +    /* internal device for riscv-iommu-{pci/sys}, not user-creatable */
>> +    dc->user_creatable = false;
>> +    dc->realize = riscv_iommu_realize;
>> +    dc->unrealize = riscv_iommu_unrealize;
>> +    device_class_set_props(dc, riscv_iommu_properties);
>> +}
>> +
>> +static const TypeInfo riscv_iommu_info = {
>> +    .name = TYPE_RISCV_IOMMU,
>> +    .parent = TYPE_DEVICE,
>> +    .instance_size = sizeof(RISCVIOMMUState),
>> +    .class_init = riscv_iommu_class_init,
>> +};
>> +
>> +static const char *IOMMU_FLAG_STR[] = {
>> +    "NA",
>> +    "RO",
>> +    "WR",
>> +    "RW",
>> +};
>> +
>> +/* RISC-V IOMMU Memory Region - Address Translation Space */
>> +static IOMMUTLBEntry riscv_iommu_memory_region_translate(
>> +    IOMMUMemoryRegion *iommu_mr, hwaddr addr,
>> +    IOMMUAccessFlags flag, int iommu_idx)
>> +{
>> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
>> +    RISCVIOMMUContext *ctx;
>> +    void *ref;
>> +    IOMMUTLBEntry iotlb = {
>> +        .iova = addr,
>> +        .target_as = as->iommu->target_as,
>> +        .addr_mask = ~0ULL,
>> +        .perm = flag,
>> +    };
>> +
>> +    ctx = riscv_iommu_ctx(as->iommu, as->devid, iommu_idx, &ref);
>> +    if (ctx == NULL) {
>> +        /* Translation disabled or invalid. */
>> +        iotlb.addr_mask = 0;
>> +        iotlb.perm = IOMMU_NONE;
>> +    } else if (riscv_iommu_translate(as->iommu, ctx, &iotlb)) {
>> +        /* Translation disabled or fault reported. */
>> +        iotlb.addr_mask = 0;
>> +        iotlb.perm = IOMMU_NONE;
>> +    }
>> +
>> +    /* Trace all dma translations with original access flags. */
>> +    trace_riscv_iommu_dma(as->iommu->parent_obj.id, PCI_BUS_NUM(as->devid),
>> +                          PCI_SLOT(as->devid), PCI_FUNC(as->devid), iommu_idx,
>> +                          IOMMU_FLAG_STR[flag & IOMMU_RW], iotlb.iova,
>> +                          iotlb.translated_addr);
>> +
>> +    riscv_iommu_ctx_put(as->iommu, ref);
>> +
>> +    return iotlb;
>> +}
>> +
>> +static int riscv_iommu_memory_region_notify(
>> +    IOMMUMemoryRegion *iommu_mr, IOMMUNotifierFlag old,
>> +    IOMMUNotifierFlag new, Error **errp)
>> +{
>> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
>> +
>> +    if (old == IOMMU_NOTIFIER_NONE) {
>> +        as->notifier = true;
>> +        trace_riscv_iommu_notifier_add(iommu_mr->parent_obj.name);
>> +    } else if (new == IOMMU_NOTIFIER_NONE) {
>> +        as->notifier = false;
>> +        trace_riscv_iommu_notifier_del(iommu_mr->parent_obj.name);
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static inline bool pci_is_iommu(PCIDevice *pdev)
>> +{
>> +    return pci_get_word(pdev->config + PCI_CLASS_DEVICE) == 0x0806;
>> +}
>> +
>> +static AddressSpace *riscv_iommu_find_as(PCIBus *bus, void *opaque, int devfn)
>> +{
>> +    RISCVIOMMUState *s = (RISCVIOMMUState *) opaque;
>> +    PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
>> +    AddressSpace *as = NULL;
>> +
>> +    if (pdev && pci_is_iommu(pdev)) {
>> +        return s->target_as;
>> +    }
>> +
>> +    /* Find first registered IOMMU device */
>> +    while (s->iommus.le_prev) {
>> +        s = *(s->iommus.le_prev);
>> +    }
>> +
>> +    /* Find first matching IOMMU */
>> +    while (s != NULL && as == NULL) {
>> +        as = riscv_iommu_space(s, PCI_BUILD_BDF(pci_bus_num(bus), devfn));
>> +        s = s->iommus.le_next;
>> +    }
>> +
>> +    return as ? as : &address_space_memory;
>> +}
>> +
>> +static const PCIIOMMUOps riscv_iommu_ops = {
>> +    .get_address_space = riscv_iommu_find_as,
>> +};
>> +
>> +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
>> +        Error **errp)
>> +{
>> +    if (bus->iommu_ops &&
>> +        bus->iommu_ops->get_address_space == riscv_iommu_find_as) {
>> +        /* Allow multiple IOMMUs on the same PCIe bus, link known devices */
>> +        RISCVIOMMUState *last = (RISCVIOMMUState *)bus->iommu_opaque;
>> +        QLIST_INSERT_AFTER(last, iommu, iommus);
>> +    } else if (!bus->iommu_ops && !bus->iommu_opaque) {
>> +        pci_setup_iommu(bus, &riscv_iommu_ops, iommu);
>> +    } else {
>> +        error_setg(errp, "can't register secondary IOMMU for PCI bus #%d",
>> +            pci_bus_num(bus));
>> +    }
>> +}
>> +
>> +static int riscv_iommu_memory_region_index(IOMMUMemoryRegion *iommu_mr,
>> +    MemTxAttrs attrs)
>> +{
>> +    return attrs.unspecified ? RISCV_IOMMU_NOPROCID : (int)attrs.pid;
>> +}
>> +
>> +static int riscv_iommu_memory_region_index_len(IOMMUMemoryRegion *iommu_mr)
>> +{
>> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
>> +    return 1 << as->iommu->pid_bits;
>> +}
>> +
>> +static void riscv_iommu_memory_region_init(ObjectClass *klass, void *data)
>> +{
>> +    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
>> +
>> +    imrc->translate = riscv_iommu_memory_region_translate;
>> +    imrc->notify_flag_changed = riscv_iommu_memory_region_notify;
>> +    imrc->attrs_to_index = riscv_iommu_memory_region_index;
>> +    imrc->num_indexes = riscv_iommu_memory_region_index_len;
>> +}
>> +
>> +static const TypeInfo riscv_iommu_memory_region_info = {
>> +    .parent = TYPE_IOMMU_MEMORY_REGION,
>> +    .name = TYPE_RISCV_IOMMU_MEMORY_REGION,
>> +    .class_init = riscv_iommu_memory_region_init,
>> +};
>> +
>> +static void riscv_iommu_register_mr_types(void)
>> +{
>> +    type_register_static(&riscv_iommu_memory_region_info);
>> +    type_register_static(&riscv_iommu_info);
>> +}
>> +
>> +type_init(riscv_iommu_register_mr_types);
>> diff --git a/hw/riscv/riscv-iommu.h b/hw/riscv/riscv-iommu.h
>> new file mode 100644
>> index 0000000000..6d76cb9b1a
>> --- /dev/null
>> +++ b/hw/riscv/riscv-iommu.h
>> @@ -0,0 +1,148 @@
>> +/*
>> + * QEMU emulation of a RISC-V IOMMU
>> + *
>> + * Copyright (C) 2022-2023 Rivos Inc.
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License as published by
>> + * the Free Software Foundation; either version 2 of the License.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License along
>> + * with this program; if not, see <http://www.gnu.org/licenses/>.
>> + */
>> +
>> +#ifndef HW_RISCV_IOMMU_STATE_H
>> +#define HW_RISCV_IOMMU_STATE_H
>> +
>> +#include "qemu/osdep.h"
>> +#include "qom/object.h"
>> +
>> +#include "hw/riscv/iommu.h"
>> +
>> +struct RISCVIOMMUState {
>> +    /*< private >*/
>> +    DeviceState parent_obj;
>> +
>> +    /*< public >*/
>> +    uint32_t version;     /* Reported interface version number */
>> +    uint32_t pid_bits;    /* process identifier width */
>> +    uint32_t bus;         /* PCI bus mapping for non-root endpoints */
>> +
>> +    uint64_t cap;         /* IOMMU supported capabilities */
>> +    uint64_t fctl;        /* IOMMU enabled features */
>> +
>> +    bool enable_off;      /* Enable out-of-reset OFF mode (DMA disabled) */
>> +    bool enable_msi;      /* Enable MSI remapping */
>> +    bool enable_s_stage;  /* Enable S/VS-Stage translation */
>> +    bool enable_g_stage;  /* Enable G-Stage translation */
>> +
>> +    /* IOMMU Internal State */
>> +    uint64_t ddtp;        /* Validated Device Directory Tree Root Pointer */
>> +
>> +    dma_addr_t cq_addr;   /* Command queue base physical address */
>> +    dma_addr_t fq_addr;   /* Fault/event queue base physical address */
>> +    dma_addr_t pq_addr;   /* Page request queue base physical address */
>> +
>> +    uint32_t cq_mask;     /* Command queue index bit mask */
>> +    uint32_t fq_mask;     /* Fault/event queue index bit mask */
>> +    uint32_t pq_mask;     /* Page request queue index bit mask */
>> +
>> +    /* interrupt notifier */
>> +    void (*notify)(RISCVIOMMUState *iommu, unsigned vector);
>> +    /* register icvec interrupts */
>> +    void (*icvec_update)(RISCVIOMMUState *iommu,
>> +                         uint32_t civ, uint32_t fiv,
>> +                         uint32_t pmiv, uint32_t piv);
>> +
>> +    /* IOMMU State Machine */
>> +    QemuThread core_proc; /* Background processing thread */
>> +    QemuMutex core_lock;  /* Global IOMMU lock, used for cache/regs updates */
>> +    QemuCond core_cond;   /* Background processing wake up signal */
>> +    unsigned core_exec;   /* Processing thread execution actions */
>> +
>> +    /* IOMMU target address space */
>> +    AddressSpace *target_as;
>> +    MemoryRegion *target_mr;
>> +
>> +    /* MSI / MRIF access trap */
>> +    AddressSpace trap_as;
>> +    MemoryRegion trap_mr;
>> +
>> +    GHashTable *ctx_cache;          /* Device translation Context Cache */
>> +    QemuMutex ctx_lock;      /* Device translation Cache update lock */
>> +
>> +    /* MMIO Hardware Interface */
>> +    MemoryRegion regs_mr;
>> +    QemuSpin regs_lock;
>> +    uint8_t *regs_rw;  /* register state (user write) */
>> +    uint8_t *regs_wc;  /* write-1-to-clear mask */
>> +    uint8_t *regs_ro;  /* read-only mask */
>> +
>> +    QLIST_ENTRY(RISCVIOMMUState) iommus;
>> +    QLIST_HEAD(, RISCVIOMMUSpace) spaces;
>> +};
>> +
>> +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
>> +         Error **errp);
>> +
>> +/* private helpers */
>> +
>> +/* Register helper functions */
>> +static inline uint32_t riscv_iommu_reg_mod32(RISCVIOMMUState *s,
>> +    unsigned idx, uint32_t set, uint32_t clr)
>> +{
>> +    uint32_t val;
>> +    qemu_spin_lock(&s->regs_lock);
>> +    val = ldl_le_p(s->regs_rw + idx);
>> +    stl_le_p(s->regs_rw + idx, (val & ~clr) | set);
>> +    qemu_spin_unlock(&s->regs_lock);
>> +    return val;
>> +}
>> +
>> +static inline void riscv_iommu_reg_set32(RISCVIOMMUState *s,
>> +    unsigned idx, uint32_t set)
>> +{
>> +    qemu_spin_lock(&s->regs_lock);
>> +    stl_le_p(s->regs_rw + idx, set);
>> +    qemu_spin_unlock(&s->regs_lock);
>> +}
>> +
>> +static inline uint32_t riscv_iommu_reg_get32(RISCVIOMMUState *s,
>> +    unsigned idx)
>> +{
>> +    return ldl_le_p(s->regs_rw + idx);
>> +}
>> +
>> +static inline uint64_t riscv_iommu_reg_mod64(RISCVIOMMUState *s,
>> +    unsigned idx, uint64_t set, uint64_t clr)
>> +{
>> +    uint64_t val;
>> +    qemu_spin_lock(&s->regs_lock);
>> +    val = ldq_le_p(s->regs_rw + idx);
>> +    stq_le_p(s->regs_rw + idx, (val & ~clr) | set);
>> +    qemu_spin_unlock(&s->regs_lock);
>> +    return val;
>> +}
>> +
>> +static inline void riscv_iommu_reg_set64(RISCVIOMMUState *s,
>> +    unsigned idx, uint64_t set)
>> +{
>> +    qemu_spin_lock(&s->regs_lock);
>> +    stq_le_p(s->regs_rw + idx, set);
>> +    qemu_spin_unlock(&s->regs_lock);
>> +}
>> +
>> +static inline uint64_t riscv_iommu_reg_get64(RISCVIOMMUState *s,
>> +    unsigned idx)
>> +{
>> +    return ldq_le_p(s->regs_rw + idx);
>> +}
>> +
>> +
>> +
>> +#endif
>> diff --git a/hw/riscv/trace-events b/hw/riscv/trace-events
>> new file mode 100644
>> index 0000000000..bdd8b657a6
>> --- /dev/null
>> +++ b/hw/riscv/trace-events
>> @@ -0,0 +1,12 @@
>> +# See documentation at docs/devel/tracing.rst
>> +
>> +# riscv-iommu.c
>> +riscv_iommu_new(const char *id, unsigned b, unsigned d, unsigned f) "%s: device attached %04x:%02x.%d"
>> +riscv_iommu_flt(const char *id, unsigned b, unsigned d, unsigned f, uint64_t reason, uint64_t iova) "%s: fault %04x:%02x.%u reason: 0x%"PRIx64" iova: 0x%"PRIx64
>> +riscv_iommu_pri(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova) "%s: page request %04x:%02x.%u iova: 0x%"PRIx64
>> +riscv_iommu_dma(const char *id, unsigned b, unsigned d, unsigned f, unsigned pasid, const char *dir, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u #%u %s 0x%"PRIx64" -> 0x%"PRIx64
>> +riscv_iommu_msi(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u MSI 0x%"PRIx64" -> 0x%"PRIx64
>> +riscv_iommu_mrif_notification(const char *id, uint32_t nid, uint64_t phys) "%s: sent MRIF notification 0x%x to 0x%"PRIx64
>> +riscv_iommu_cmd(const char *id, uint64_t l, uint64_t u) "%s: command 0x%"PRIx64" 0x%"PRIx64
>> +riscv_iommu_notifier_add(const char *id) "%s: dev-iotlb notifier added"
>> +riscv_iommu_notifier_del(const char *id) "%s: dev-iotlb notifier removed"
>> diff --git a/hw/riscv/trace.h b/hw/riscv/trace.h
>> new file mode 100644
>> index 0000000000..8c0e3ca1f3
>> --- /dev/null
>> +++ b/hw/riscv/trace.h
>> @@ -0,0 +1 @@
>> +#include "trace/trace-hw_riscv.h"
>> diff --git a/include/hw/riscv/iommu.h b/include/hw/riscv/iommu.h
>> new file mode 100644
>> index 0000000000..070ee69973
>> --- /dev/null
>> +++ b/include/hw/riscv/iommu.h
>> @@ -0,0 +1,36 @@
>> +/*
>> + * QEMU emulation of an RISC-V IOMMU
>> + *
>> + * Copyright (C) 2022-2023 Rivos Inc.
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License as published by
>> + * the Free Software Foundation; either version 2 of the License.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License along
>> + * with this program; if not, see <http://www.gnu.org/licenses/>.
>> + */
>> +
>> +#ifndef HW_RISCV_IOMMU_H
>> +#define HW_RISCV_IOMMU_H
>> +
>> +#include "qemu/osdep.h"
>> +#include "qom/object.h"
>> +
>> +#define TYPE_RISCV_IOMMU "riscv-iommu"
>> +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUState, RISCV_IOMMU)
>> +typedef struct RISCVIOMMUState RISCVIOMMUState;
>> +
>> +#define TYPE_RISCV_IOMMU_MEMORY_REGION "riscv-iommu-mr"
>> +typedef struct RISCVIOMMUSpace RISCVIOMMUSpace;
>> +
>> +#define TYPE_RISCV_IOMMU_PCI "riscv-iommu-pci"
>> +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUStatePci, RISCV_IOMMU_PCI)
>> +typedef struct RISCVIOMMUStatePci RISCVIOMMUStatePci;
>> +
>> +#endif
>> diff --git a/meson.build b/meson.build
>> index a1e51277b0..359f836d8e 100644
>> --- a/meson.build
>> +++ b/meson.build
>> @@ -3373,6 +3373,7 @@ if have_system
>>       'hw/pci-host',
>>       'hw/ppc',
>>       'hw/rtc',
>> +    'hw/riscv',
>>       'hw/s390x',
>>       'hw/scsi',
>>       'hw/sd',
Tomasz Jeznach Aug. 27, 2024, 2:18 a.m. UTC | #6
On Fri, Aug 23, 2024 at 5:42 AM Daniel Henrique Barboza
<dbarboza@ventanamicro.com> wrote:
>
>
>
> On 8/20/24 12:16 PM, Jason Chien wrote:
> > Hi Daniel,
> >
> > On 2024/8/1 下午 11:43, Daniel Henrique Barboza wrote:
> >> From: Tomasz Jeznach <tjeznach@rivosinc.com>
> >>
> >> The RISC-V IOMMU specification is now ratified as-per the RISC-V
> >> international process. The latest frozen specification can be found at:
> >>
> >> https://github.com/riscv-non-isa/riscv-iommu/releases/download/v1.0/riscv-iommu.pdf
> >>
> >> Add the foundation of the device emulation for RISC-V IOMMU. It includes
> >> support for s-stage (sv32, sv39, sv48, sv57 caps) and g-stage (sv32x4,
> >> sv39x4, sv48x4, sv57x4 caps).
> >>
> >> Other capabilities like ATS and DBG support will be added incrementally
> >> in the next patches.
> >>
> >> Co-developed-by: Sebastien Boeuf <seb@rivosinc.com>
> >> Signed-off-by: Sebastien Boeuf <seb@rivosinc.com>
> >> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
> >> Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
> >> ---
> >>   hw/riscv/Kconfig            |    4 +
> >>   hw/riscv/meson.build        |    1 +
> >>   hw/riscv/riscv-iommu-bits.h |   14 +
> >>   hw/riscv/riscv-iommu.c      | 2017 +++++++++++++++++++++++++++++++++++
> >>   hw/riscv/riscv-iommu.h      |  148 +++
> >>   hw/riscv/trace-events       |   12 +
> >>   hw/riscv/trace.h            |    1 +
> >>   include/hw/riscv/iommu.h    |   36 +
> >>   meson.build                 |    1 +
> >>   9 files changed, 2234 insertions(+)
> >>   create mode 100644 hw/riscv/riscv-iommu.c
> >>   create mode 100644 hw/riscv/riscv-iommu.h
> >>   create mode 100644 hw/riscv/trace-events
> >>   create mode 100644 hw/riscv/trace.h
> >>   create mode 100644 include/hw/riscv/iommu.h
> >>
> >> diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig
> >> index a2030e3a6f..f69d6e3c8e 100644
> >> --- a/hw/riscv/Kconfig
> >> +++ b/hw/riscv/Kconfig
> >> @@ -1,3 +1,6 @@
> >> +config RISCV_IOMMU
> >> +    bool
> >> +
> >>   config RISCV_NUMA
> >>       bool
> >> @@ -47,6 +50,7 @@ config RISCV_VIRT
> >>       select SERIAL
> >>       select RISCV_ACLINT
> >>       select RISCV_APLIC
> >> +    select RISCV_IOMMU
> >>       select RISCV_IMSIC
> >>       select SIFIVE_PLIC
> >>       select SIFIVE_TEST
> >> diff --git a/hw/riscv/meson.build b/hw/riscv/meson.build
> >> index f872674093..cbc99c6e8e 100644
> >> --- a/hw/riscv/meson.build
> >> +++ b/hw/riscv/meson.build
> >> @@ -10,5 +10,6 @@ riscv_ss.add(when: 'CONFIG_SIFIVE_U', if_true: files('sifive_u.c'))
> >>   riscv_ss.add(when: 'CONFIG_SPIKE', if_true: files('spike.c'))
> >>   riscv_ss.add(when: 'CONFIG_MICROCHIP_PFSOC', if_true: files('microchip_pfsoc.c'))
> >>   riscv_ss.add(when: 'CONFIG_ACPI', if_true: files('virt-acpi-build.c'))
> >> +riscv_ss.add(when: 'CONFIG_RISCV_IOMMU', if_true: files('riscv-iommu.c'))
> >>   hw_arch += {'riscv': riscv_ss}
> >> diff --git a/hw/riscv/riscv-iommu-bits.h b/hw/riscv/riscv-iommu-bits.h
> >> index 9d9512ca71..6f1b9ab61b 100644
> >> --- a/hw/riscv/riscv-iommu-bits.h
> >> +++ b/hw/riscv/riscv-iommu-bits.h
> >> @@ -69,6 +69,14 @@ struct riscv_iommu_pq_record {
> >>   /* 5.3 IOMMU Capabilities (64bits) */
> >>   #define RISCV_IOMMU_REG_CAP             0x0000
> >>   #define RISCV_IOMMU_CAP_VERSION         GENMASK_ULL(7, 0)
> >> +#define RISCV_IOMMU_CAP_SV32            BIT_ULL(8)
> >> +#define RISCV_IOMMU_CAP_SV39            BIT_ULL(9)
> >> +#define RISCV_IOMMU_CAP_SV48            BIT_ULL(10)
> >> +#define RISCV_IOMMU_CAP_SV57            BIT_ULL(11)
> >> +#define RISCV_IOMMU_CAP_SV32X4          BIT_ULL(16)
> >> +#define RISCV_IOMMU_CAP_SV39X4          BIT_ULL(17)
> >> +#define RISCV_IOMMU_CAP_SV48X4          BIT_ULL(18)
> >> +#define RISCV_IOMMU_CAP_SV57X4          BIT_ULL(19)
> >>   #define RISCV_IOMMU_CAP_MSI_FLAT        BIT_ULL(22)
> >>   #define RISCV_IOMMU_CAP_MSI_MRIF        BIT_ULL(23)
> >>   #define RISCV_IOMMU_CAP_T2GPA           BIT_ULL(26)
> >> @@ -80,7 +88,9 @@ struct riscv_iommu_pq_record {
> >>   /* 5.4 Features control register (32bits) */
> >>   #define RISCV_IOMMU_REG_FCTL            0x0008
> >> +#define RISCV_IOMMU_FCTL_BE             BIT(0)
> >>   #define RISCV_IOMMU_FCTL_WSI            BIT(1)
> >> +#define RISCV_IOMMU_FCTL_GXL            BIT(2)
> >>   /* 5.5 Device-directory-table pointer (64bits) */
> >>   #define RISCV_IOMMU_REG_DDTP            0x0010
> >> @@ -203,6 +213,8 @@ struct riscv_iommu_dc {
> >>   #define RISCV_IOMMU_DC_TC_DTF           BIT_ULL(4)
> >>   #define RISCV_IOMMU_DC_TC_PDTV          BIT_ULL(5)
> >>   #define RISCV_IOMMU_DC_TC_PRPR          BIT_ULL(6)
> >> +#define RISCV_IOMMU_DC_TC_GADE          BIT_ULL(7)
> >> +#define RISCV_IOMMU_DC_TC_SADE          BIT_ULL(8)
> >>   #define RISCV_IOMMU_DC_TC_DPE           BIT_ULL(9)
> >>   #define RISCV_IOMMU_DC_TC_SBE           BIT_ULL(10)
> >>   #define RISCV_IOMMU_DC_TC_SXL           BIT_ULL(11)
> >> @@ -309,9 +321,11 @@ enum riscv_iommu_fq_causes {
> >>   /* Translation attributes fields */
> >>   #define RISCV_IOMMU_PC_TA_V             BIT_ULL(0)
> >> +#define RISCV_IOMMU_PC_TA_RESERVED      GENMASK_ULL(63, 32)
> >>   /* First stage context fields */
> >>   #define RISCV_IOMMU_PC_FSC_PPN          GENMASK_ULL(43, 0)
> >> +#define RISCV_IOMMU_PC_FSC_RESERVED     GENMASK_ULL(59, 44)
> >>   enum riscv_iommu_fq_ttypes {
> >>       RISCV_IOMMU_FQ_TTYPE_NONE = 0,
> >> diff --git a/hw/riscv/riscv-iommu.c b/hw/riscv/riscv-iommu.c
> >> new file mode 100644
> >> index 0000000000..ebe3a53a04
> >> --- /dev/null
> >> +++ b/hw/riscv/riscv-iommu.c
> >> @@ -0,0 +1,2017 @@
> (...)
>
> >> +
> >> +/*
> >> + * RISCV IOMMU Address Translation Lookup - Page Table Walk
> >> + *
> >> + * Note: Code is based on get_physical_address() from target/riscv/cpu_helper.c
> >> + * Both implementations can be merged into a single helper function in future.
> >> + * Keeping them separate for now, as error reporting and flow specifics are
> >> + * sufficiently different for separate implementation.
> >> + *
> >> + * @s        : IOMMU Device State
> >> + * @ctx      : Translation context for device id and process address space id.
> >> + * @iotlb    : translation data: physical address and access mode.
> >> + * @return   : success or fault cause code.
> >> + */
> >> +static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
> >> +    IOMMUTLBEntry *iotlb)
> >> +{
> >> +    dma_addr_t addr, base;
> >> +    uint64_t satp, gatp, pte;
> >> +    bool en_s, en_g;
> >> +    struct {
> >> +        unsigned char step;
> >> +        unsigned char levels;
> >> +        unsigned char ptidxbits;
> >> +        unsigned char ptesize;
> >> +    } sc[2];
> >> +    /* Translation stage phase */
> >> +    enum {
> >> +        S_STAGE = 0,
> >> +        G_STAGE = 1,
> >> +    } pass;
> >> +
> >> +    satp = get_field(ctx->satp, RISCV_IOMMU_ATP_MODE_FIELD);
> >> +    gatp = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);
> >> +
> >> +    en_s = satp != RISCV_IOMMU_DC_FSC_MODE_BARE;
> >> +    en_g = gatp != RISCV_IOMMU_DC_IOHGATP_MODE_BARE;
> >> +
> >> +    /* Early check for MSI address match when IOVA == GPA */
> >> +    if ((iotlb->perm & IOMMU_WO) &&
> >> +        riscv_iommu_msi_check(s, ctx, iotlb->iova)) {
> > I think the IOVA here may be a GVA, and we should not use a GVA to perform the MSI check. Section 2.3 states that we should always walk the first-stage page table to get the GPA (step 17) and then use the GPA to do the MSI check (step 18).
>
> That's a good point. Even if we rule out the address as a GVA by doing a second stage
> check (via en_g) we're still assuming that iotlb->iova == GPA without going through
> the first stage translation. We're basically gambling on a match by calling
> riscv_iommu_msi_check() this early.
>
> I'll remove this early check entirely since we're doing a proper msi check with a
> translated GPA at the end of first stage:
>
>

This check was required to ensure that MSI translation is applied when
first-stage translation is set to BARE mode. In that case the IOVA
provided is already a valid GPA. Running the translation through the page
walk with second-stage translation would incorrectly try to translate the
GPA to a host physical page, likely hitting an IOPF.

Looking back, I missed that an important check of the en_s flag was
removed in the v2->v3 changes:

     /* Early check for MSI address match when IOVA == GPA */
-    if (!en_s && (iotlb->perm & IOMMU_WO) &&
+    if ((iotlb->perm & IOMMU_WO) &&
         riscv_iommu_msi_check(s, ctx, iotlb->iova)) {
         iotlb->target_as = &s->trap_as;

I'd suggest restoring the missing en_s check to enable proper handling of MSIP.

Thanks,
- Tomasz

> >> +            /* Translation phase completed (GPA or SPA) */
> >> +            iotlb->translated_addr = base;
> >> +            iotlb->perm = (pte & PTE_W) ? ((pte & PTE_R) ? IOMMU_RW : IOMMU_WO)
> >> +                                                         : IOMMU_RO;
> >> +
> >> +            /* Check MSI GPA address match */
> >> +            if (pass == S_STAGE && (iotlb->perm & IOMMU_WO) &&
> >> +                riscv_iommu_msi_check(s, ctx, base)) {
> >> +                /* Trap MSI writes and return GPA address. */
> >> +                iotlb->target_as = &s->trap_as;
> >> +                iotlb->addr_mask = ~TARGET_PAGE_MASK;
> >> +                return 0;
> >> +            }
>
>
> This will be suboptimal for cases where IOVA == GPA and we would have an early exit by
> guessing it right, but I'm more comfortable adhering fully to the spec this time.
>
>
> Thanks,
>
> Daniel
>
>
> >> +        iotlb->target_as = &s->trap_as;
> >> +        iotlb->translated_addr = iotlb->iova;
> >> +        iotlb->addr_mask = ~TARGET_PAGE_MASK;
> >> +        return 0;
> >> +    }
> >> +
> >> +    /* Exit early for pass-through mode. */
> >> +    if (!(en_s || en_g)) {
> >> +        iotlb->translated_addr = iotlb->iova;
> >> +        iotlb->addr_mask = ~TARGET_PAGE_MASK;
> >> +        /* Allow R/W in pass-through mode */
> >> +        iotlb->perm = IOMMU_RW;
> >> +        return 0;
> >> +    }
> >> +
> >> +    /* S/G translation parameters. */
> >> +    for (pass = 0; pass < 2; pass++) {
> >> +        uint32_t sv_mode;
> >> +
> >> +        sc[pass].step = 0;
> >> +        if (pass ? (s->fctl & RISCV_IOMMU_FCTL_GXL) :
> >> +            (ctx->tc & RISCV_IOMMU_DC_TC_SXL)) {
> >> +            /* 32bit mode for GXL/SXL == 1 */
> >> +            switch (pass ? gatp : satp) {
> >> +            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
> >> +                sc[pass].levels    = 0;
> >> +                sc[pass].ptidxbits = 0;
> >> +                sc[pass].ptesize   = 0;
> >> +                break;
> >> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4:
> >> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV32X4 : RISCV_IOMMU_CAP_SV32;
> >> +                if (!(s->cap & sv_mode)) {
> >> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> >> +                }
> >> +                sc[pass].levels    = 2;
> >> +                sc[pass].ptidxbits = 10;
> >> +                sc[pass].ptesize   = 4;
> >> +                break;
> >> +            default:
> >> +                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> >> +            }
> >> +        } else {
> >> +            /* 64bit mode for GXL/SXL == 0 */
> >> +            switch (pass ? gatp : satp) {
> >> +            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
> >> +                sc[pass].levels    = 0;
> >> +                sc[pass].ptidxbits = 0;
> >> +                sc[pass].ptesize   = 0;
> >> +                break;
> >> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4:
> >> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV39X4 : RISCV_IOMMU_CAP_SV39;
> >> +                if (!(s->cap & sv_mode)) {
> >> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> >> +                }
> >> +                sc[pass].levels    = 3;
> >> +                sc[pass].ptidxbits = 9;
> >> +                sc[pass].ptesize   = 8;
> >> +                break;
> >> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4:
> >> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV48X4 : RISCV_IOMMU_CAP_SV48;
> >> +                if (!(s->cap & sv_mode)) {
> >> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> >> +                }
> >> +                sc[pass].levels    = 4;
> >> +                sc[pass].ptidxbits = 9;
> >> +                sc[pass].ptesize   = 8;
> >> +                break;
> >> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4:
> >> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV57X4 : RISCV_IOMMU_CAP_SV57;
> >> +                if (!(s->cap & sv_mode)) {
> >> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> >> +                }
> >> +                sc[pass].levels    = 5;
> >> +                sc[pass].ptidxbits = 9;
> >> +                sc[pass].ptesize   = 8;
> >> +                break;
> >> +            default:
> >> +                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> >> +            }
> >> +        }
> >> +    };
> >> +
> >> +    /* S/G stages translation tables root pointers */
> >> +    gatp = PPN_PHYS(get_field(ctx->gatp, RISCV_IOMMU_ATP_PPN_FIELD));
> >> +    satp = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_ATP_PPN_FIELD));
> >> +    addr = (en_s && en_g) ? satp : iotlb->iova;
> >> +    base = en_g ? gatp : satp;
> >> +    pass = en_g ? G_STAGE : S_STAGE;
> >> +
> >> +    do {
> >> +        const unsigned widened = (pass && !sc[pass].step) ? 2 : 0;
> >> +        const unsigned va_bits = widened + sc[pass].ptidxbits;
> >> +        const unsigned va_skip = TARGET_PAGE_BITS + sc[pass].ptidxbits *
> >> +                                 (sc[pass].levels - 1 - sc[pass].step);
> >> +        const unsigned idx = (addr >> va_skip) & ((1 << va_bits) - 1);
> >> +        const dma_addr_t pte_addr = base + idx * sc[pass].ptesize;
> >> +        const bool ade =
> >> +            ctx->tc & (pass ? RISCV_IOMMU_DC_TC_GADE : RISCV_IOMMU_DC_TC_SADE);
> >> +
> >> +        /* Address range check before first level lookup */
> >> +        if (!sc[pass].step) {
> >> +            const uint64_t va_mask = (1ULL << (va_skip + va_bits)) - 1;
> >> +            if ((addr & va_mask) != addr) {
> >> +                return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
> >> +            }
> >> +        }
> >> +
> >> +        /* Read page table entry */
> >> +        if (dma_memory_read(s->target_as, pte_addr, &pte,
> >> +                sc[pass].ptesize, MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> >> +            return (iotlb->perm & IOMMU_WO) ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT
> >> +                                            : RISCV_IOMMU_FQ_CAUSE_RD_FAULT;
> >> +        }
> >> +
> >> +        if (sc[pass].ptesize == 4) {
> >> +            pte = (uint64_t) le32_to_cpu(*((uint32_t *)&pte));
> >> +        } else {
> >> +            pte = le64_to_cpu(pte);
> >> +        }
> >> +
> >> +        sc[pass].step++;
> >> +        hwaddr ppn = pte >> PTE_PPN_SHIFT;
> >> +
> >> +        if (!(pte & PTE_V)) {
> >> +            break;                /* Invalid PTE */
> >> +        } else if (!(pte & (PTE_R | PTE_W | PTE_X))) {
> >> +            base = PPN_PHYS(ppn); /* Inner PTE, continue walking */
> >> +        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == PTE_W) {
> >> +            break;                /* Reserved leaf PTE flags: PTE_W */
> >> +        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == (PTE_W | PTE_X)) {
> >> +            break;                /* Reserved leaf PTE flags: PTE_W + PTE_X */
> >> +        } else if (ppn & ((1ULL << (va_skip - TARGET_PAGE_BITS)) - 1)) {
> >> +            break;                /* Misaligned PPN */
> >> +        } else if ((iotlb->perm & IOMMU_RO) && !(pte & PTE_R)) {
> >> +            break;                /* Read access check failed */
> >> +        } else if ((iotlb->perm & IOMMU_WO) && !(pte & PTE_W)) {
> >> +            break;                /* Write access check failed */
> >> +        } else if ((iotlb->perm & IOMMU_RO) && !ade && !(pte & PTE_A)) {
> >> +            break;                /* Access bit not set */
> >> +        } else if ((iotlb->perm & IOMMU_WO) && !ade && !(pte & PTE_D)) {
> >> +            break;                /* Dirty bit not set */
> >> +        } else {
> >> +            /* Leaf PTE, translation completed. */
> >> +            sc[pass].step = sc[pass].levels;
> >> +            base = PPN_PHYS(ppn) | (addr & ((1ULL << va_skip) - 1));
> >> +            /* Update address mask based on smallest translation granularity */
> >> +            iotlb->addr_mask &= (1ULL << va_skip) - 1;
> >> +            /* Continue with S-Stage translation? */
> >> +            if (pass && sc[0].step != sc[0].levels) {
> >> +                pass = S_STAGE;
> >> +                addr = iotlb->iova;
> >> +                continue;
> >> +            }
> >> +            /* Translation phase completed (GPA or SPA) */
> >> +            iotlb->translated_addr = base;
> >> +            iotlb->perm = (pte & PTE_W) ? ((pte & PTE_R) ? IOMMU_RW : IOMMU_WO)
> >> +                                                         : IOMMU_RO;
> >> +
> >> +            /* Check MSI GPA address match */
> >> +            if (pass == S_STAGE && (iotlb->perm & IOMMU_WO) &&
> >> +                riscv_iommu_msi_check(s, ctx, base)) {
> >> +                /* Trap MSI writes and return GPA address. */
> >> +                iotlb->target_as = &s->trap_as;
> >> +                iotlb->addr_mask = ~TARGET_PAGE_MASK;
> >> +                return 0;
> >> +            }
> >> +
> >> +            /* Continue with G-Stage translation? */
> >> +            if (!pass && en_g) {
> >> +                pass = G_STAGE;
> >> +                addr = base;
> >> +                base = gatp;
> >> +                sc[pass].step = 0;
> >> +                continue;
> >> +            }
> >> +
> >> +            return 0;
> >> +        }
> >> +
> >> +        if (sc[pass].step == sc[pass].levels) {
> >> +            break; /* Can't find leaf PTE */
> >> +        }
> >> +
> >> +        /* Continue with G-Stage translation? */
> >> +        if (!pass && en_g) {
> >> +            pass = G_STAGE;
> >> +            addr = base;
> >> +            base = gatp;
> >> +            sc[pass].step = 0;
> >> +        }
> >> +    } while (1);
> >> +
> >> +    return (iotlb->perm & IOMMU_WO) ?
> >> +                (pass ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS :
> >> +                        RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S) :
> >> +                (pass ? RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS :
> >> +                        RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S);
> >> +}
> >> +
> >> +static void riscv_iommu_report_fault(RISCVIOMMUState *s,
> >> +                                     RISCVIOMMUContext *ctx,
> >> +                                     uint32_t fault_type, uint32_t cause,
> >> +                                     bool pv,
> >> +                                     uint64_t iotval, uint64_t iotval2)
> >> +{
> >> +    struct riscv_iommu_fq_record ev = { 0 };
> >> +
> >> +    if (ctx->tc & RISCV_IOMMU_DC_TC_DTF) {
> >> +        switch (cause) {
> >> +        case RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED:
> >> +        case RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT:
> >> +        case RISCV_IOMMU_FQ_CAUSE_DDT_INVALID:
> >> +        case RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED:
> >> +        case RISCV_IOMMU_FQ_CAUSE_DDT_CORRUPTED:
> >> +        case RISCV_IOMMU_FQ_CAUSE_INTERNAL_DP_ERROR:
> >> +        case RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT:
> >> +            break;
> >> +        default:
> >> +            /* DTF prevents reporting a fault for this given cause */
> >> +            return;
> >> +        }
> >> +    }
> >> +
> >> +    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_CAUSE, cause);
> >> +    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_TTYPE, fault_type);
> >> +    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_DID, ctx->devid);
> >> +    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PV, true);
> >> +
> >> +    if (pv) {
> >> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PID, ctx->process_id);
> >> +    }
> >> +
> >> +    ev.iotval = iotval;
> >> +    ev.iotval2 = iotval2;
> >> +
> >> +    riscv_iommu_fault(s, &ev);
> >> +}
> >> +
> >> +/* Redirect MSI write for given GPA. */
> >> +static MemTxResult riscv_iommu_msi_write(RISCVIOMMUState *s,
> >> +    RISCVIOMMUContext *ctx, uint64_t gpa, uint64_t data,
> >> +    unsigned size, MemTxAttrs attrs)
> >> +{
> >> +    MemTxResult res;
> >> +    dma_addr_t addr;
> >> +    uint64_t intn;
> >> +    uint32_t n190;
> >> +    uint64_t pte[2];
> >> +    int fault_type = RISCV_IOMMU_FQ_TTYPE_UADDR_WR;
> >> +    int cause;
> >> +
> >> +    /* Interrupt File Number */
> >> +    intn = _pext_u64(PPN_DOWN(gpa), ctx->msi_addr_mask);
> >> +    if (intn >= 256) {
> >> +        /* Interrupt file number out of range */
> >> +        res = MEMTX_ACCESS_ERROR;
> >> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
> >> +        goto err;
> >> +    }
> >> +
> >> +    /* fetch MSI PTE */
> >> +    addr = PPN_PHYS(get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_PPN));
> >> +    addr = addr | (intn * sizeof(pte));
> >> +    res = dma_memory_read(s->target_as, addr, &pte, sizeof(pte),
> >> +            MEMTXATTRS_UNSPECIFIED);
> >> +    if (res != MEMTX_OK) {
> >> +        if (res == MEMTX_DECODE_ERROR) {
> >> +            cause = RISCV_IOMMU_FQ_CAUSE_MSI_PT_CORRUPTED;
> >> +        } else {
> >> +            cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
> >> +        }
> >> +        goto err;
> >> +    }
> >> +
> >> +    le64_to_cpus(&pte[0]);
> >> +    le64_to_cpus(&pte[1]);
> >> +
> >> +    if (!(pte[0] & RISCV_IOMMU_MSI_PTE_V) || (pte[0] & RISCV_IOMMU_MSI_PTE_C)) {
> >> +        /*
> >> +         * The spec mentions that: "If msipte.C == 1, then further
> >> +         * processing to interpret the PTE is implementation
> >> +         * defined.". We'll abort with cause = 262 for this
> >> +         * case too.
> >> +         */
> >> +        res = MEMTX_ACCESS_ERROR;
> >> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_INVALID;
> >> +        goto err;
> >> +    }
> >> +
> >> +    switch (get_field(pte[0], RISCV_IOMMU_MSI_PTE_M)) {
> >> +    case RISCV_IOMMU_MSI_PTE_M_BASIC:
> >> +        /* MSI Pass-through mode */
> >> +        addr = PPN_PHYS(get_field(pte[0], RISCV_IOMMU_MSI_PTE_PPN));
> >> +        addr = addr | (gpa & TARGET_PAGE_MASK);
> >> +
> >> +        trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
> >> +                              PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
> >> +                              gpa, addr);
> >> +
> >> +        res = dma_memory_write(s->target_as, addr, &data, size, attrs);
> >> +        if (res != MEMTX_OK) {
> >> +            cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
> >> +            goto err;
> >> +        }
> >> +
> >> +        return MEMTX_OK;
> >> +    case RISCV_IOMMU_MSI_PTE_M_MRIF:
> >> +        /* MRIF mode, continue. */
> >> +        break;
> >> +    default:
> >> +        res = MEMTX_ACCESS_ERROR;
> >> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED;
> >> +        goto err;
> >> +    }
> >> +
> >> +    /*
> >> +     * Report an error for interrupt identities exceeding the maximum allowed
> >> +     * for an IMSIC interrupt file (2047) or destination address is not 32-bit
> >> +     * aligned. See IOMMU Specification, Chapter 2.3. MSI page tables.
> >> +     */
> >> +    if ((data > 2047) || (gpa & 3)) {
> >> +        res = MEMTX_ACCESS_ERROR;
> >> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED;
> >> +        goto err;
> >> +    }
> >> +
> >> +    /* MSI MRIF mode, non atomic pending bit update */
> >> +
> >> +    /* MRIF pending bit address */
> >> +    addr = get_field(pte[0], RISCV_IOMMU_MSI_PTE_MRIF_ADDR) << 9;
> >> +    addr = addr | ((data & 0x7c0) >> 3);
> >> +
> >> +    trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
> >> +                          PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
> >> +                          gpa, addr);
> >> +
> >> +    /* MRIF pending bit mask */
> >> +    data = 1ULL << (data & 0x03f);
> >> +    res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
> >> +    if (res != MEMTX_OK) {
> >> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
> >> +        goto err;
> >> +    }
> >> +
> >> +    intn = intn | data;
> >> +    res = dma_memory_write(s->target_as, addr, &intn, sizeof(intn), attrs);
> >> +    if (res != MEMTX_OK) {
> >> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
> >> +        goto err;
> >> +    }
> >> +
> >> +    /* Get MRIF enable bits */
> >> +    addr = addr + sizeof(intn);
> >> +    res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
> >> +    if (res != MEMTX_OK) {
> >> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
> >> +        goto err;
> >> +    }
> >> +
> >> +    if (!(intn & data)) {
> >> +        /* notification disabled, MRIF update completed. */
> >> +        return MEMTX_OK;
> >> +    }
> >> +
> >> +    /* Send notification message */
> >> +    addr = PPN_PHYS(get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NPPN));
> >> +    n190 = get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID) |
> >> +          (get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID_MSB) << 10);
> >> +
> >> +    res = dma_memory_write(s->target_as, addr, &n190, sizeof(n190), attrs);
> >> +    if (res != MEMTX_OK) {
> >> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
> >> +        goto err;
> >> +    }
> >> +
> >> +    trace_riscv_iommu_mrif_notification(s->parent_obj.id, n190, addr);
> >> +
> >> +    return MEMTX_OK;
> >> +
> >> +err:
> >> +    riscv_iommu_report_fault(s, ctx, fault_type, cause,
> >> +                             !!ctx->process_id, 0, 0);
> >> +    return res;
> >> +}
> >> +
> >> +/*
> >> + * Check device context configuration as described by the
> >> + * riscv-iommu spec section "Device-context configuration
> >> + * checks".
> >> + */
> >> +static bool riscv_iommu_validate_device_ctx(RISCVIOMMUState *s,
> >> +                                            RISCVIOMMUContext *ctx)
> >> +{
> >> +    uint32_t fsc_mode, msi_mode;
> >> +
> >> +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_EN_PRI) &&
> >> +        ctx->tc & RISCV_IOMMU_DC_TC_PRPR) {
> >> +        return false;
> >> +    }
> >> +
> >> +    if (!(s->cap & RISCV_IOMMU_CAP_T2GPA) &&
> >> +        ctx->tc & RISCV_IOMMU_DC_TC_T2GPA) {
> >> +        return false;
> >> +    }
> >> +
> >> +    if (s->cap & RISCV_IOMMU_CAP_MSI_FLAT) {
> >> +        msi_mode = get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_MODE);
> >> +
> >> +        if (msi_mode != RISCV_IOMMU_DC_MSIPTP_MODE_OFF &&
> >> +            msi_mode != RISCV_IOMMU_DC_MSIPTP_MODE_FLAT) {
> >> +            return false;
> >> +        }
> >> +    }
> >> +
> >> +    fsc_mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
> >> +
> >> +    if (ctx->tc & RISCV_IOMMU_DC_TC_PDTV) {
> >> +        switch (fsc_mode) {
> >> +        case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8:
> >> +            if (!(s->cap & RISCV_IOMMU_CAP_PD8)) {
> >> +                return false;
> >> +            }
> >> +            break;
> >> +        case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD17:
> >> +            if (!(s->cap & RISCV_IOMMU_CAP_PD17)) {
> >> +                return false;
> >> +            }
> >> +            break;
> >> +        case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20:
> >> +            if (!(s->cap & RISCV_IOMMU_CAP_PD20)) {
> >> +                return false;
> >> +            }
> >> +            break;
> >> +        }
> >> +    } else {
> >> +        /* DC.tc.PDTV is 0 */
> >> +        if (ctx->tc & RISCV_IOMMU_DC_TC_DPE) {
> >> +            return false;
> >> +        }
> >> +
> >> +        if (ctx->tc & RISCV_IOMMU_DC_TC_SXL) {
> >> +            if (fsc_mode == RISCV_IOMMU_CAP_SV32 &&
> >> +                !(s->cap & RISCV_IOMMU_CAP_SV32)) {
> >> +                return false;
> >> +            }
> >> +        } else {
> >> +            switch (fsc_mode) {
> >> +            case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
> >> +                if (!(s->cap & RISCV_IOMMU_CAP_SV39)) {
> >> +                    return false;
> >> +                }
> >> +                break;
> >> +            case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
> >> +                if (!(s->cap & RISCV_IOMMU_CAP_SV48)) {
> >> +                    return false;
> >> +                }
> >> +            break;
> >> +            case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
> >> +                if (!(s->cap & RISCV_IOMMU_CAP_SV57)) {
> >> +                    return false;
> >> +                }
> >> +                break;
> >> +            }
> >> +        }
> >> +    }
> >> +
> >> +    /*
> >> +     * CAP_END is always zero (only one endianess). FCTL_BE is
> >> +     * always zero (little-endian accesses). Thus TC_SBE must
> >> +     * always be LE, i.e. zero.
> >> +     */
> >> +    if (ctx->tc & RISCV_IOMMU_DC_TC_SBE) {
> >> +        return false;
> >> +    }
> >> +
> >> +    return true;
> >> +}
> >> +
> >> +/*
> >> + * Validate process context (PC) according to section
> >> + * "Process-context configuration checks".
> >> + *
> >> + * @s      : IOMMU state; s->cap supplies the supported translation modes.
> >> + * @ctx    : context whose ta, satp and tc fields are checked.
> >> + * @return : true when the process context passes every check.
> >> + */
> >> +static bool riscv_iommu_validate_process_ctx(RISCVIOMMUState *s,
> >> +                                             RISCVIOMMUContext *ctx)
> >> +{
> >> +    uint32_t mode;
> >> +
> >> +    /* Reserved bits of PC.ta and PC.fsc must be zero. */
> >> +    if (get_field(ctx->ta, RISCV_IOMMU_PC_TA_RESERVED)) {
> >> +        return false;
> >> +    }
> >> +
> >> +    if (get_field(ctx->satp, RISCV_IOMMU_PC_FSC_RESERVED)) {
> >> +        return false;
> >> +    }
> >> +
> >> +    /* Only Bare and the SvNN encodings are legal in PC.fsc.MODE. */
> >> +    mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
> >> +    switch (mode) {
> >> +    case RISCV_IOMMU_DC_FSC_MODE_BARE:
> >> +    /* sv39 and sv32 modes have the same value (8) */
> >> +    case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
> >> +    case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
> >> +    case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
> >> +        break;
> >> +    default:
> >> +        return false;
> >> +    }
> >> +
> >> +    if (ctx->tc & RISCV_IOMMU_DC_TC_SXL) {
> >> +        /*
> >> +         * With SXL set, mode encoding 8 means Sv32. The mode field must
> >> +         * be compared against that encoding (shared with the SV39
> >> +         * constant), not against the RISCV_IOMMU_CAP_SV32 capability
> >> +         * mask, which a 4-bit mode value can never equal.
> >> +         */
> >> +        if (mode == RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 &&
> >> +            !(s->cap & RISCV_IOMMU_CAP_SV32)) {
> >> +            return false;
> >> +        }
> >> +    } else {
> >> +        /* Each selected SvNN mode needs the matching capability bit. */
> >> +        switch (mode) {
> >> +        case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
> >> +            if (!(s->cap & RISCV_IOMMU_CAP_SV39)) {
> >> +                return false;
> >> +            }
> >> +            break;
> >> +        case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
> >> +            if (!(s->cap & RISCV_IOMMU_CAP_SV48)) {
> >> +                return false;
> >> +            }
> >> +            break;
> >> +        case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
> >> +            if (!(s->cap & RISCV_IOMMU_CAP_SV57)) {
> >> +                return false;
> >> +            }
> >> +            break;
> >> +        }
> >> +    }
> >> +
> >> +    return true;
> >> +}
> >> +
> >> +/*
> >> + * RISC-V IOMMU Device Context Lookup - Device Directory Tree Walk
> >> + *
> >> + * Loads the device context for ctx->devid by walking the device directory
> >> + * tree rooted at s->ddtp and copies it into ctx (tc, gatp, satp, ta,
> >> + * msiptp, msi_addr_mask, msi_addr_pattern). When DC.tc.PDTV is set the
> >> + * walk continues through the process directory tree and ctx->ta and
> >> + * ctx->satp are replaced by the process context selected by
> >> + * ctx->process_id.
> >> + *
> >> + * @s         : IOMMU Device State
> >> + * @ctx       : Device Translation Context with devid and process_id set.
> >> + * @return    : 0 on success or a RISCV_IOMMU_FQ_CAUSE_* fault code.
> >> + */
> >> +static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
> >> +{
> >> +    const uint64_t ddtp = s->ddtp;
> >> +    unsigned mode = get_field(ddtp, RISCV_IOMMU_DDTP_MODE);
> >> +    dma_addr_t addr = PPN_PHYS(get_field(ddtp, RISCV_IOMMU_DDTP_PPN));
> >> +    struct riscv_iommu_dc dc;
> >> +    /* Device Context format: 0: extended (64 bytes) | 1: base (32 bytes) */
> >> +    const int dc_fmt = !s->enable_msi;
> >> +    const size_t dc_len = sizeof(dc) >> dc_fmt;
> >> +    unsigned depth;
> >> +    uint64_t de;
> >> +
> >> +    switch (mode) {
> >> +    case RISCV_IOMMU_DDTP_MODE_OFF:
> >> +        return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
> >> +
> >> +    case RISCV_IOMMU_DDTP_MODE_BARE:
> >> +        /* mock up pass-through translation context */
> >> +        ctx->gatp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
> >> +            RISCV_IOMMU_DC_IOHGATP_MODE_BARE);
> >> +        ctx->satp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
> >> +            RISCV_IOMMU_DC_FSC_MODE_BARE);
> >> +        ctx->tc = RISCV_IOMMU_DC_TC_V;
> >> +        ctx->ta = 0;
> >> +        ctx->msiptp = 0;
> >> +        return 0;
> >> +
> >> +    case RISCV_IOMMU_DDTP_MODE_1LVL:
> >> +        depth = 0;
> >> +        break;
> >> +
> >> +    case RISCV_IOMMU_DDTP_MODE_2LVL:
> >> +        depth = 1;
> >> +        break;
> >> +
> >> +    case RISCV_IOMMU_DDTP_MODE_3LVL:
> >> +        depth = 2;
> >> +        break;
> >> +
> >> +    default:
> >> +        return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> >> +    }
> >> +
> >> +    /*
> >> +     * Check supported device id width (in bits).
> >> +     * See IOMMU Specification, Chapter 6. Software guidelines.
> >> +     * - if extended device-context format is used:
> >> +     *   1LVL: 6, 2LVL: 15, 3LVL: 24
> >> +     * - if base device-context format is used:
> >> +     *   1LVL: 7, 2LVL: 16, 3LVL: 24
> >> +     *
> >> +     * depth is the number of non-leaf levels; each adds 9 index bits.
> >> +     */
> >> +    if (ctx->devid >= (1 << (depth * 9 + 6 + (dc_fmt && depth != 2)))) {
> >> +        return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
> >> +    }
> >> +
> >> +    /* Device directory tree walk: resolve the non-leaf levels, top down */
> >> +    for (; depth-- > 0; ) {
> >> +        /*
> >> +         * Select device id index bits based on device directory tree level
> >> +         * and device context format.
> >> +         * See IOMMU Specification, Chapter 2. Data Structures.
> >> +         * - if extended device-context format is used:
> >> +         *   device index: [23:15][14:6][5:0]
> >> +         * - if base device-context format is used:
> >> +         *   device index: [23:16][15:7][6:0]
> >> +         */
> >> +        const int split = depth * 9 + 6 + dc_fmt;
> >> +        addr |= ((ctx->devid >> split) << 3) & ~TARGET_PAGE_MASK;
> >> +        if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
> >> +                            MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> >> +            return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
> >> +        }
> >> +        le64_to_cpus(&de);
> >> +        if (!(de & RISCV_IOMMU_DDTE_VALID)) {
> >> +            /* invalid directory entry */
> >> +            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
> >> +        }
> >> +        if (de & ~(RISCV_IOMMU_DDTE_PPN | RISCV_IOMMU_DDTE_VALID)) {
> >> +            /* reserved bits set */
> >> +            return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> >> +        }
> >> +        addr = PPN_PHYS(get_field(de, RISCV_IOMMU_DDTE_PPN));
> >> +    }
> >> +
> >> +    /* index into device context entry page */
> >> +    addr |= (ctx->devid * dc_len) & ~TARGET_PAGE_MASK;
> >> +
> >> +    memset(&dc, 0, sizeof(dc)); /* only dc_len bytes are read below */
> >> +    if (dma_memory_read(s->target_as, addr, &dc, dc_len,
> >> +                        MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> >> +        return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
> >> +    }
> >> +
> >> +    /* Set translation context. */
> >> +    ctx->tc = le64_to_cpu(dc.tc);
> >> +    ctx->gatp = le64_to_cpu(dc.iohgatp);
> >> +    ctx->satp = le64_to_cpu(dc.fsc);
> >> +    ctx->ta = le64_to_cpu(dc.ta);
> >> +    ctx->msiptp = le64_to_cpu(dc.msiptp);
> >> +    ctx->msi_addr_mask = le64_to_cpu(dc.msi_addr_mask);
> >> +    ctx->msi_addr_pattern = le64_to_cpu(dc.msi_addr_pattern);
> >> +
> >> +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_V)) {
> >> +        return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
> >> +    }
> >> +
> >> +    if (!riscv_iommu_validate_device_ctx(s, ctx)) {
> >> +        return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> >> +    }
> >> +
> >> +    /* FSC field checks; mode and addr are reused for the DC.fsc fields */
> >> +    mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
> >> +    addr = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_DC_FSC_PPN));
> >> +
> >> +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_PDTV)) {
> >> +        if (ctx->process_id != RISCV_IOMMU_NOPROCID) {
> >> +            /* PID is disabled */
> >> +            return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
> >> +        }
> >> +        if (mode > RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57) {
> >> +            /* Invalid translation mode */
> >> +            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
> >> +        }
> >> +        return 0;
> >> +    }
> >> +
> >> +    if (ctx->process_id == RISCV_IOMMU_NOPROCID) {
> >> +        if (!(ctx->tc & RISCV_IOMMU_DC_TC_DPE)) {
> >> +            /* No default process_id enabled, set BARE mode */
> >> +            ctx->satp = 0ULL;
> >> +            return 0;
> >> +        } else {
> >> +            /* Use default process_id #0 */
> >> +            ctx->process_id = 0;
> >> +        }
> >> +    }
> >> +
> >> +    if (mode == RISCV_IOMMU_DC_FSC_MODE_BARE) {
> >> +        /* No S-Stage translation, done. */
> >> +        return 0;
> >> +    }
> >> +
> >> +    /* DC.tc.PDTV enabled: DC.fsc is used as the PDT root pointer */
> >> +    if (mode > RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20) {
> >> +        /* Invalid PDTP.MODE */
> >> +        return RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED;
> >> +    }
> >> +
> >> +    for (depth = mode - RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8; depth-- > 0; ) {
> >> +        /*
> >> +         * Select process id index bits based on process directory tree
> >> +         * level. See IOMMU Specification, 2.2. Process-Directory-Table.
> >> +         */
> >> +        const int split = depth * 9 + 8;
> >> +        addr |= ((ctx->process_id >> split) << 3) & ~TARGET_PAGE_MASK;
> >> +        if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
> >> +                            MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> >> +            return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
> >> +        }
> >> +        le64_to_cpus(&de);
> >> +        if (!(de & RISCV_IOMMU_PC_TA_V)) {
> >> +            return RISCV_IOMMU_FQ_CAUSE_PDT_INVALID;
> >> +        }
> >> +        addr = PPN_PHYS(get_field(de, RISCV_IOMMU_PC_FSC_PPN));
> >> +    }
> >> +
> >> +    /*
> >> +     * Leaf entry in PDT (16 bytes per process context).
> >> +     * NOTE(review): the two 64-bit words are read starting at dc.ta and
> >> +     * then consumed as dc.ta and dc.fsc, which presumably relies on fsc
> >> +     * directly following ta in struct riscv_iommu_dc - confirm against
> >> +     * the struct definition.
> >> +     */
> >> +    addr |= (ctx->process_id << 4) & ~TARGET_PAGE_MASK;
> >> +    if (dma_memory_read(s->target_as, addr, &dc.ta, sizeof(uint64_t) * 2,
> >> +                        MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> >> +        return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
> >> +    }
> >> +
> >> +    /* Use FSC and TA from process directory entry. */
> >> +    ctx->ta = le64_to_cpu(dc.ta);
> >> +    ctx->satp = le64_to_cpu(dc.fsc);
> >> +
> >> +    if (!(ctx->ta & RISCV_IOMMU_PC_TA_V)) {
> >> +        return RISCV_IOMMU_FQ_CAUSE_PDT_INVALID;
> >> +    }
> >> +
> >> +    if (!riscv_iommu_validate_process_ctx(s, ctx)) {
> >> +        return RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED;
> >> +    }
> >> +
> >> +    return 0;
> >> +}
> >> +
> >> +/* Translation Context cache support */
> >> +
> >> +/*
> >> + * Hash table equality callback: two contexts are the same cache entry
> >> + * when both their devid and their process_id match.
> >> + */
> >> +static gboolean __ctx_equal(gconstpointer v1, gconstpointer v2)
> >> +{
> >> +    const RISCVIOMMUContext *lhs = v1;
> >> +    const RISCVIOMMUContext *rhs = v2;
> >> +
> >> +    if (lhs->devid != rhs->devid) {
> >> +        return false;
> >> +    }
> >> +
> >> +    return lhs->process_id == rhs->process_id;
> >> +}
> >> +
> >> +/*
> >> + * Hash table hash callback for translation contexts.
> >> + *
> >> + * Combines the pair as devid + (process_id << 24), i.e. a simple hash of
> >> + * (process_id, devid) that assumes a 24-bit wide devid.
> >> + */
> >> +static guint __ctx_hash(gconstpointer v)
> >> +{
> >> +    const RISCVIOMMUContext *entry = v;
> >> +    const guint devid_part = (guint)entry->devid;
> >> +    const guint procid_part = (guint)entry->process_id << 24;
> >> +
> >> +    return devid_part + procid_part;
> >> +}
> >> +
> >> +/*
> >> + * Hash table foreach callback: clear the valid bit of a cached context
> >> + * whose devid and process_id both match the key passed in @data.
> >> + */
> >> +static void __ctx_inval_devid_procid(gpointer key, gpointer value,
> >> +                                     gpointer data)
> >> +{
> >> +    RISCVIOMMUContext *entry = value;
> >> +    const RISCVIOMMUContext *match = data;
> >> +
> >> +    if (!(entry->tc & RISCV_IOMMU_DC_TC_V)) {
> >> +        /* already invalid */
> >> +        return;
> >> +    }
> >> +
> >> +    if (entry->devid != match->devid ||
> >> +        entry->process_id != match->process_id) {
> >> +        return;
> >> +    }
> >> +
> >> +    entry->tc &= ~RISCV_IOMMU_DC_TC_V;
> >> +}
> >> +
> >> +/*
> >> + * Hash table foreach callback: clear the valid bit of a cached context
> >> + * whose devid matches the key passed in @data, whatever its process_id.
> >> + */
> >> +static void __ctx_inval_devid(gpointer key, gpointer value, gpointer data)
> >> +{
> >> +    RISCVIOMMUContext *entry = value;
> >> +    const RISCVIOMMUContext *match = data;
> >> +
> >> +    if (!(entry->tc & RISCV_IOMMU_DC_TC_V)) {
> >> +        /* already invalid */
> >> +        return;
> >> +    }
> >> +
> >> +    if (entry->devid == match->devid) {
> >> +        entry->tc &= ~RISCV_IOMMU_DC_TC_V;
> >> +    }
> >> +}
> >> +
> >> +/*
> >> + * Hash table foreach callback: clear the valid bit of every cached
> >> + * context that still has it set. @key and @data are unused.
> >> + */
> >> +static void __ctx_inval_all(gpointer key, gpointer value, gpointer data)
> >> +{
> >> +    RISCVIOMMUContext *entry = value;
> >> +
> >> +    if (!(entry->tc & RISCV_IOMMU_DC_TC_V)) {
> >> +        return;
> >> +    }
> >> +
> >> +    entry->tc &= ~RISCV_IOMMU_DC_TC_V;
> >> +}
> >> +
> >> +/*
> >> + * Run an invalidation callback over every cached translation context.
> >> + *
> >> + * @s          : IOMMU state owning the context cache.
> >> + * @func       : foreach callback; it receives a key context carrying
> >> + *               @devid and @process_id as its data argument.
> >> + * @devid      : device id to match.
> >> + * @process_id : process id to match.
> >> + */
> >> +static void riscv_iommu_ctx_inval(RISCVIOMMUState *s, GHFunc func,
> >> +                                  uint32_t devid, uint32_t process_id)
> >> +{
> >> +    RISCVIOMMUContext match = {
> >> +        .devid = devid,
> >> +        .process_id = process_id,
> >> +    };
> >> +    /* Hold our own reference on the table for the duration of the scan. */
> >> +    GHashTable *cache = g_hash_table_ref(s->ctx_cache);
> >> +
> >> +    qemu_mutex_lock(&s->ctx_lock);
> >> +    g_hash_table_foreach(cache, func, &match);
> >> +    qemu_mutex_unlock(&s->ctx_lock);
> >> +
> >> +    g_hash_table_unref(cache);
> >> +}
> >> +
> >> +/*
> >> + * Find or allocate translation context for a given {device_id, process_id}
> >> + *
> >> + * Looks the pair up in s->ctx_cache under ctx_lock and returns the cached
> >> + * context when it is still valid (DC.tc.V set). Otherwise a new context is
> >> + * allocated and filled by riscv_iommu_ctx_fetch(). On success it is added
> >> + * to the cache; once the cache holds LIMIT_CACHE_CTX entries a fresh table
> >> + * is installed in s->ctx_cache first. On a fetch fault the fault is
> >> + * reported, the context is freed and NULL is returned.
> >> + *
> >> + * *ref receives the hash table reference backing the returned context
> >> + * (NULL on failure); release it with riscv_iommu_ctx_put().
> >> + *
> >> + * NOTE(review): the context found by the lookup is used after ctx_lock is
> >> + * dropped; its lifetime presumably relies on the table reference held in
> >> + * *ref and on g_hash_table_add() replacing equal keys - confirm.
> >> + */
> >> +static RISCVIOMMUContext *riscv_iommu_ctx(RISCVIOMMUState *s,
> >> +                                          unsigned devid, unsigned process_id,
> >> +                                          void **ref)
> >> +{
> >> +    GHashTable *ctx_cache;
> >> +    RISCVIOMMUContext *ctx;
> >> +    RISCVIOMMUContext key = {
> >> +        .devid = devid,
> >> +        .process_id = process_id,
> >> +    };
> >> +
> >> +    ctx_cache = g_hash_table_ref(s->ctx_cache);
> >> +    qemu_mutex_lock(&s->ctx_lock);
> >> +    ctx = g_hash_table_lookup(ctx_cache, &key);
> >> +    qemu_mutex_unlock(&s->ctx_lock);
> >> +
> >> +    if (ctx && (ctx->tc & RISCV_IOMMU_DC_TC_V)) {
> >> +        *ref = ctx_cache;
> >> +        return ctx;
> >> +    }
> >> +
> >> +    ctx = g_new0(RISCVIOMMUContext, 1);
> >> +    ctx->devid = devid;
> >> +    ctx->process_id = process_id;
> >> +
> >> +    int fault = riscv_iommu_ctx_fetch(s, ctx);
> >> +    if (!fault) {
> >> +        qemu_mutex_lock(&s->ctx_lock);
> >> +        if (g_hash_table_size(ctx_cache) >= LIMIT_CACHE_CTX) {
> >> +            g_hash_table_unref(ctx_cache); /* drop our ref on full table */
> >> +            ctx_cache = g_hash_table_new_full(__ctx_hash, __ctx_equal,
> >> +                                              g_free, NULL);
> >> +            g_hash_table_ref(ctx_cache); /* extra ref returned via *ref */
> >> +            g_hash_table_unref(qatomic_xchg(&s->ctx_cache, ctx_cache));
> >> +        }
> >> +        g_hash_table_add(ctx_cache, ctx);
> >> +        qemu_mutex_unlock(&s->ctx_lock);
> >> +        *ref = ctx_cache;
> >> +        return ctx;
> >> +    }
> >> +
> >> +    g_hash_table_unref(ctx_cache);
> >> +    *ref = NULL;
> >> +
> >> +    riscv_iommu_report_fault(s, ctx, RISCV_IOMMU_FQ_TTYPE_UADDR_RD,
> >> +                             fault, !!process_id, 0, 0);
> >> +
> >> +    g_free(ctx);
> >> +    return NULL;
> >> +}
> >> +
> >> +/*
> >> + * Release the context cache reference handed out by riscv_iommu_ctx().
> >> + *
> >> + * @s   : IOMMU state (unused).
> >> + * @ref : hash table reference to drop; NULL is ignored.
> >> + */
> >> +static void riscv_iommu_ctx_put(RISCVIOMMUState *s, void *ref)
> >> +{
> >> +    GHashTable *cache = ref;
> >> +
> >> +    if (cache == NULL) {
> >> +        return;
> >> +    }
> >> +
> >> +    g_hash_table_unref(cache);
> >> +}
> >> +
> >> +/*
> >> + * Find or allocate address space for a given device
> >> + *
> >> + * Searches s->spaces for an entry with the requested devid (after OR-ing
> >> + * in s->bus << 8). When none exists a new RISCVIOMMUSpace is allocated,
> >> + * its IOMMU memory region and IOVA address space are initialised, and it
> >> + * is inserted at the head of s->spaces. Returns the entry's iova_as.
> >> + *
> >> + * NOTE(review): core_lock is released between the lookup and the insert,
> >> + * so two concurrent callers for the same devid could presumably both
> >> + * allocate an entry - confirm that callers are serialized.
> >> + */
> >> +static AddressSpace *riscv_iommu_space(RISCVIOMMUState *s, uint32_t devid)
> >> +{
> >> +    RISCVIOMMUSpace *as;
> >> +
> >> +    /* FIXME: PCIe bus remapping for attached endpoints. */
> >> +    devid |= s->bus << 8;
> >> +
> >> +    qemu_mutex_lock(&s->core_lock);
> >> +    QLIST_FOREACH(as, &s->spaces, list) {
> >> +        if (as->devid == devid) {
> >> +            break;
> >> +        }
> >> +    }
> >> +    qemu_mutex_unlock(&s->core_lock);
> >> +
> >> +    if (as == NULL) {
> >> +        char name[64];
> >> +        as = g_new0(RISCVIOMMUSpace, 1);
> >> +
> >> +        as->iommu = s;
> >> +        as->devid = devid;
> >> +
> >> +        snprintf(name, sizeof(name), "riscv-iommu-%04x:%02x.%d-iova",
> >> +            PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid), PCI_FUNC(as->devid));
> >> +
> >> +        /* IOVA address space, untranslated addresses */
> >> +        memory_region_init_iommu(&as->iova_mr, sizeof(as->iova_mr),
> >> +            TYPE_RISCV_IOMMU_MEMORY_REGION,
> >> +            OBJECT(as), "riscv_iommu", UINT64_MAX);
> >> +        address_space_init(&as->iova_as, MEMORY_REGION(&as->iova_mr), name);
> >> +
> >> +        qemu_mutex_lock(&s->core_lock);
> >> +        QLIST_INSERT_HEAD(&s->spaces, as, list);
> >> +        qemu_mutex_unlock(&s->core_lock);
> >> +
> >> +        trace_riscv_iommu_new(s->parent_obj.id, PCI_BUS_NUM(as->devid),
> >> +                PCI_SLOT(as->devid), PCI_FUNC(as->devid));
> >> +    }
> >> +    return &as->iova_as;
> >> +}
> >> +
> >> +/*
> >> + * Translate one request through a translation context and report any
> >> + * failure.
> >> + *
> >> + * @s      : IOMMU state.
> >> + * @ctx    : translation context for the requesting device/process.
> >> + * @iotlb  : request; iova and perm are read here and the entry is
> >> + *           passed on to riscv_iommu_spa_fetch().
> >> + * @return : 0 on success, otherwise the fault code returned by
> >> + *           riscv_iommu_spa_fetch().
> >> + *
> >> + * On a fault either a page request is queued (when automatic page-request
> >> + * generation applies) or a fault record is reported, never both.
> >> + */
> >> +static int riscv_iommu_translate(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
> >> +    IOMMUTLBEntry *iotlb)
> >> +{
> >> +    bool enable_pid;
> >> +    bool enable_pri;
> >> +    unsigned ttype;
> >> +    int fault;
> >> +
> >> +    /*
> >> +     * TC[32] is reserved for custom extensions, used here to temporarily
> >> +     * enable automatic page-request generation for ATS queries.
> >> +     */
> >> +    enable_pri = (iotlb->perm == IOMMU_NONE) && (ctx->tc & BIT_ULL(32));
> >> +    enable_pid = (ctx->tc & RISCV_IOMMU_DC_TC_PDTV);
> >> +
> >> +    /* Translate using device directory / page table information. */
> >> +    fault = riscv_iommu_spa_fetch(s, ctx, iotlb);
> >> +    if (!fault) {
> >> +        return 0;
> >> +    }
> >> +
> >> +    if (enable_pri) {
> >> +        struct riscv_iommu_pq_record pr = {0};
> >> +        if (enable_pid) {
> >> +            pr.hdr = set_field(RISCV_IOMMU_PREQ_HDR_PV,
> >> +                               RISCV_IOMMU_PREQ_HDR_PID, ctx->process_id);
> >> +        }
> >> +        pr.hdr = set_field(pr.hdr, RISCV_IOMMU_PREQ_HDR_DID, ctx->devid);
> >> +        pr.payload = (iotlb->iova & TARGET_PAGE_MASK) |
> >> +                     RISCV_IOMMU_PREQ_PAYLOAD_M;
> >> +        riscv_iommu_pri(s, &pr);
> >> +        return fault;
> >> +    }
> >> +
> >> +    /*
> >> +     * Classify the faulting access by its write bit. Testing against
> >> +     * IOMMU_RW would also match read-only requests, because IOMMU_RW
> >> +     * includes the read bit, and would report every read as a write.
> >> +     */
> >> +    if (iotlb->perm & IOMMU_WO) {
> >> +        ttype = RISCV_IOMMU_FQ_TTYPE_UADDR_WR;
> >> +    } else {
> >> +        ttype = RISCV_IOMMU_FQ_TTYPE_UADDR_RD;
> >> +    }
> >> +
> >> +    riscv_iommu_report_fault(s, ctx, ttype, fault, enable_pid,
> >> +                             iotlb->iova, iotlb->translated_addr);
> >> +    return fault;
> >> +}
> >> +
> >> +/* IOMMU Command Interface */
> >> +
> >> +/*
> >> + * Complete an IOFENCE command.
> >> + *
> >> + * @s      : IOMMU state.
> >> + * @notify : when false nothing is written and MEMTX_OK is returned.
> >> + * @addr   : target address of the completion write.
> >> + * @data   : 32-bit value to write at @addr.
> >> + * @return : result of the memory write, or MEMTX_OK when @notify is false.
> >> + */
> >> +static MemTxResult riscv_iommu_iofence(RISCVIOMMUState *s, bool notify,
> >> +    uint64_t addr, uint32_t data)
> >> +{
> >> +    uint32_t le_data;
> >> +
> >> +    /*
> >> +     * ATS processing in this implementation of the IOMMU is synchronous,
> >> +     * no need to wait for completions here.
> >> +     */
> >> +    if (!notify) {
> >> +        return MEMTX_OK;
> >> +    }
> >> +
> >> +    /*
> >> +     * Guest-visible IOMMU data is little-endian (the directory walks
> >> +     * convert with le64_to_cpu), so store the payload in that byte order
> >> +     * instead of the host's.
> >> +     */
> >> +    le_data = cpu_to_le32(data);
> >> +
> >> +    return dma_memory_write(s->target_as, addr, &le_data, sizeof(le_data),
> >> +        MEMTXATTRS_UNSPECIFIED);
> >> +}
> >> +
> >> +/*
> >> + * Apply a guest update of the DDTP register.
> >> + *
> >> + * The requested DDTP.MODE is accepted only for an allowed transition:
> >> + *   {OFF, BARE}        -> {OFF, BARE, 1LVL, 2LVL, 3LVL}
> >> + *   {1LVL, 2LVL, 3LVL} -> {OFF, BARE}
> >> + * or when the mode is unchanged. An accepted value is stored with only
> >> + * its PPN and MODE fields kept; a rejected one is replaced by the
> >> + * previous s->ddtp. The result is written back to the register.
> >> + */
> >> +static void riscv_iommu_process_ddtp(RISCVIOMMUState *s)
> >> +{
> >> +    const uint64_t cur = s->ddtp;
> >> +    const uint64_t req = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_DDTP);
> >> +    const unsigned req_mode = get_field(req, RISCV_IOMMU_DDTP_MODE);
> >> +    const unsigned cur_mode = get_field(cur, RISCV_IOMMU_DDTP_MODE);
> >> +    uint64_t result;
> >> +    bool allowed;
> >> +
> >> +    switch (req_mode) {
> >> +    case RISCV_IOMMU_DDTP_MODE_OFF:
> >> +    case RISCV_IOMMU_DDTP_MODE_BARE:
> >> +        /* always reachable */
> >> +        allowed = true;
> >> +        break;
> >> +
> >> +    case RISCV_IOMMU_DDTP_MODE_1LVL:
> >> +    case RISCV_IOMMU_DDTP_MODE_2LVL:
> >> +    case RISCV_IOMMU_DDTP_MODE_3LVL:
> >> +        /* reachable from OFF/BARE only, unless the mode is unchanged */
> >> +        allowed = req_mode == cur_mode ||
> >> +                  cur_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
> >> +                  cur_mode == RISCV_IOMMU_DDTP_MODE_BARE;
> >> +        break;
> >> +
> >> +    default:
> >> +        allowed = req_mode == cur_mode;
> >> +        break;
> >> +    }
> >> +
> >> +    if (allowed) {
> >> +        /* clear reserved and busy bits, report back sanitized version */
> >> +        result = set_field(req & RISCV_IOMMU_DDTP_PPN,
> >> +                           RISCV_IOMMU_DDTP_MODE, req_mode);
> >> +    } else {
> >> +        result = cur;
> >> +    }
> >> +
> >> +    s->ddtp = result;
> >> +    riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_DDTP, result);
> >> +}
> >> +
> >> +/* Command function and opcode field. */
> >> +#define RISCV_IOMMU_CMD(func, op) (((func) << 7) | (op))
> >> +/*
> >> + * Drain the command queue: execute commands from CQH up to CQT, advancing
> >> + * CQH after each completed command. A failed command fetch or IOFENCE
> >> + * write sets CQCSR.CQMF and an unknown or illegal command sets
> >> + * CQCSR.CMD_ILL; in both cases processing stops without advancing CQH and
> >> + * the command-queue interrupt is raised if CQCSR.CIE was set on entry.
> >> + *
> >> + * NOTE(review): cmd.dword0/dword1 are decoded as read from guest memory,
> >> + * with no le64_to_cpu() conversion unlike the directory walks in this
> >> + * file - confirm behaviour on big-endian hosts.
> >> + * NOTE(review): cmd.dword1 is passed unmodified as the IOFENCE write
> >> + * address - confirm against the IOFENCE.C ADDR field encoding in the spec.
> >> + */
> >> +static void riscv_iommu_process_cq_tail(RISCVIOMMUState *s)
> >> +{
> >> +    struct riscv_iommu_command cmd;
> >> +    MemTxResult res;
> >> +    dma_addr_t addr;
> >> +    uint32_t tail, head, ctrl;
> >> +    uint64_t cmd_opcode;
> >> +    GHFunc func;
> >> +
> >> +    ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
> >> +    tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQT) & s->cq_mask;
> >> +    head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQH) & s->cq_mask;
> >> +
> >> +    /*
> >> +     * Check for pending error or queue processing disabled; commands are
> >> +     * left untouched until CMD_ILL / CQMF are no longer set.
> >> +     */
> >> +    if (!(ctrl & RISCV_IOMMU_CQCSR_CQON) ||
> >> +        !!(ctrl & (RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CQMF))) {
> >> +        return;
> >> +    }
> >> +
> >> +    while (tail != head) {
> >> +        addr = s->cq_addr  + head * sizeof(cmd);
> >> +        res = dma_memory_read(s->target_as, addr, &cmd, sizeof(cmd),
> >> +                              MEMTXATTRS_UNSPECIFIED);
> >> +
> >> +        if (res != MEMTX_OK) {
> >> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
> >> +                                  RISCV_IOMMU_CQCSR_CQMF, 0);
> >> +            goto fault;
> >> +        }
> >> +
> >> +        trace_riscv_iommu_cmd(s->parent_obj.id, cmd.dword0, cmd.dword1);
> >> +
> >> +        cmd_opcode = get_field(cmd.dword0,
> >> +                               RISCV_IOMMU_CMD_OPCODE | RISCV_IOMMU_CMD_FUNC);
> >> +
> >> +        switch (cmd_opcode) {
> >> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOFENCE_FUNC_C,
> >> +                             RISCV_IOMMU_CMD_IOFENCE_OPCODE):
> >> +            res = riscv_iommu_iofence(s,
> >> +                cmd.dword0 & RISCV_IOMMU_CMD_IOFENCE_AV, cmd.dword1,
> >> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IOFENCE_DATA));
> >> +
> >> +            if (res != MEMTX_OK) {
> >> +                riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
> >> +                                      RISCV_IOMMU_CQCSR_CQMF, 0);
> >> +                goto fault;
> >> +            }
> >> +            break;
> >> +
> >> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA,
> >> +                             RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
> >> +            if (cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_PSCV) {
> >> +                /* illegal command arguments IOTINVAL.GVMA & PSCV == 1 */
> >> +                goto cmd_ill;
> >> +            }
> >> +            /* translation cache not implemented yet */
> >> +            break;
> >> +
> >> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA,
> >> +                             RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
> >> +            /* translation cache not implemented yet */
> >> +            break;
> >> +
> >> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT,
> >> +                             RISCV_IOMMU_CMD_IODIR_OPCODE):
> >> +            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
> >> +                /* invalidate all device context cache mappings */
> >> +                func = __ctx_inval_all;
> >> +            } else {
> >> +                /* invalidate all device context matching DID */
> >> +                func = __ctx_inval_devid;
> >> +            }
> >> +            riscv_iommu_ctx_inval(s, func,
> >> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID), 0);
> >> +            break;
> >> +
> >> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT,
> >> +                             RISCV_IOMMU_CMD_IODIR_OPCODE):
> >> +            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
> >> +                /* illegal command arguments IODIR_PDT & DV == 0 */
> >> +                goto cmd_ill;
> >> +            } else {
> >> +                func = __ctx_inval_devid_procid;
> >> +            }
> >> +            riscv_iommu_ctx_inval(s, func,
> >> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID),
> >> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_PID));
> >> +            break;
> >> +
> >> +        default:
> >> +        cmd_ill:
> >> +            /* Invalid instruction, do not advance instruction index. */
> >> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
> >> +                RISCV_IOMMU_CQCSR_CMD_ILL, 0);
> >> +            goto fault;
> >> +        }
> >> +
> >> +        /* Advance and update head pointer after command completes. */
> >> +        head = (head + 1) & s->cq_mask;
> >> +        riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_CQH, head);
> >> +    }
> >> +    return;
> >> +
> >> +fault:
> >> +    if (ctrl & RISCV_IOMMU_CQCSR_CIE) {
> >> +        riscv_iommu_notify(s, RISCV_IOMMU_INTR_CQ);
> >> +    }
> >> +}
> >> +
> >> +/*
> >> + * Handle a CQCSR update: bring the command queue state (CQON) in line
> >> + * with the requested enable bit (CQEN) and clear CQCSR.BUSY.
> >> + *
> >> + * Enabling latches the queue base and size from CQB, resets the head and
> >> + * tail registers and clears stale error bits. Disabling clears CQON.
> >> + */
> >> +static void riscv_iommu_process_cq_control(RISCVIOMMUState *s)
> >> +{
> >> +    const uint32_t csr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
> >> +    const bool want_on = (csr & RISCV_IOMMU_CQCSR_CQEN) != 0;
> >> +    const bool is_on = (csr & RISCV_IOMMU_CQCSR_CQON) != 0;
> >> +    uint32_t set_bits = 0;
> >> +    uint32_t clr_bits = RISCV_IOMMU_CQCSR_BUSY;
> >> +
> >> +    if (want_on && !is_on) {
> >> +        const uint64_t cqb = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_CQB);
> >> +
> >> +        s->cq_mask = (2ULL << get_field(cqb, RISCV_IOMMU_CQB_LOG2SZ)) - 1;
> >> +        s->cq_addr = PPN_PHYS(get_field(cqb, RISCV_IOMMU_CQB_PPN));
> >> +        /* tail bits outside the index mask go into the read-only mask */
> >> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~s->cq_mask);
> >> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQH], 0);
> >> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQT], 0);
> >> +        set_bits = RISCV_IOMMU_CQCSR_CQON;
> >> +        clr_bits |= RISCV_IOMMU_CQCSR_CQMF |
> >> +                    RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CMD_TO |
> >> +                    RISCV_IOMMU_CQCSR_FENCE_W_IP;
> >> +    } else if (is_on && !want_on) {
> >> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~0);
> >> +        clr_bits |= RISCV_IOMMU_CQCSR_CQON;
> >> +    }
> >> +
> >> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR, set_bits, clr_bits);
> >> +}
> >> +
> >> +/*
> >> + * Handle a FQCSR update: bring the fault queue state (FQON) in line with
> >> + * the requested enable bit (FQEN) and clear FQCSR.BUSY.
> >> + *
> >> + * Enabling latches the queue base and size from FQB, resets the head and
> >> + * tail registers and clears stale error bits. Disabling clears FQON.
> >> + */
> >> +static void riscv_iommu_process_fq_control(RISCVIOMMUState *s)
> >> +{
> >> +    const uint32_t csr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
> >> +    const bool want_on = (csr & RISCV_IOMMU_FQCSR_FQEN) != 0;
> >> +    const bool is_on = (csr & RISCV_IOMMU_FQCSR_FQON) != 0;
> >> +    uint32_t set_bits = 0;
> >> +    uint32_t clr_bits = RISCV_IOMMU_FQCSR_BUSY;
> >> +
> >> +    if (want_on && !is_on) {
> >> +        const uint64_t fqb = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_FQB);
> >> +
> >> +        s->fq_mask = (2ULL << get_field(fqb, RISCV_IOMMU_FQB_LOG2SZ)) - 1;
> >> +        s->fq_addr = PPN_PHYS(get_field(fqb, RISCV_IOMMU_FQB_PPN));
> >> +        /* head bits outside the index mask go into the read-only mask */
> >> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~s->fq_mask);
> >> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQH], 0);
> >> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQT], 0);
> >> +        set_bits = RISCV_IOMMU_FQCSR_FQON;
> >> +        clr_bits |= RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF;
> >> +    } else if (is_on && !want_on) {
> >> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~0);
> >> +        clr_bits |= RISCV_IOMMU_FQCSR_FQON;
> >> +    }
> >> +
> >> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR, set_bits, clr_bits);
> >> +}
> >> +
> >> +/*
> >> + * Handle a PQCSR update: bring the page-request queue state (PQON) in
> >> + * line with the requested enable bit (PQEN) and clear PQCSR.BUSY.
> >> + *
> >> + * Enabling latches the queue base and size from PQB, resets the head and
> >> + * tail registers and clears stale error bits. Disabling clears PQON.
> >> + */
> >> +static void riscv_iommu_process_pq_control(RISCVIOMMUState *s)
> >> +{
> >> +    const uint32_t csr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
> >> +    const bool want_on = (csr & RISCV_IOMMU_PQCSR_PQEN) != 0;
> >> +    const bool is_on = (csr & RISCV_IOMMU_PQCSR_PQON) != 0;
> >> +    uint32_t set_bits = 0;
> >> +    uint32_t clr_bits = RISCV_IOMMU_PQCSR_BUSY;
> >> +
> >> +    if (want_on && !is_on) {
> >> +        const uint64_t pqb = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_PQB);
> >> +
> >> +        s->pq_mask = (2ULL << get_field(pqb, RISCV_IOMMU_PQB_LOG2SZ)) - 1;
> >> +        s->pq_addr = PPN_PHYS(get_field(pqb, RISCV_IOMMU_PQB_PPN));
> >> +        /* head bits outside the index mask go into the read-only mask */
> >> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~s->pq_mask);
> >> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQH], 0);
> >> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQT], 0);
> >> +        set_bits = RISCV_IOMMU_PQCSR_PQON;
> >> +        clr_bits |= RISCV_IOMMU_PQCSR_PQMF | RISCV_IOMMU_PQCSR_PQOF;
> >> +    } else if (is_on && !want_on) {
> >> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~0);
> >> +        clr_bits |= RISCV_IOMMU_PQCSR_PQON;
> >> +    }
> >> +
> >> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR, set_bits, clr_bits);
> >> +}
> >> +
> >> +/*
> >> + * Handle an ICVEC update: read the register and pass the vectors
> >> + * extracted for the CQ, FQ, PM and PQ interrupt sources, in that order,
> >> + * to the s->icvec_update() callback.
> >> + */
> >> +static void riscv_iommu_process_icvec_update(RISCVIOMMUState *s)
> >> +{
> >> +    const uint32_t vectors = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_ICVEC);
> >> +
> >> +    s->icvec_update(
> >> +        s,
> >> +        riscv_iommu_get_icvec_vector(vectors, RISCV_IOMMU_INTR_CQ),
> >> +        riscv_iommu_get_icvec_vector(vectors, RISCV_IOMMU_INTR_FQ),
> >> +        riscv_iommu_get_icvec_vector(vectors, RISCV_IOMMU_INTR_PM),
> >> +        riscv_iommu_get_icvec_vector(vectors, RISCV_IOMMU_INTR_PQ));
> >> +}
> >> +
> >> +/* Register processing hook, selected per register by the MMIO write path. */
> >> +typedef void riscv_iommu_process_fn(RISCVIOMMUState *s);
> >> +
> >> +/*
> >> + * Recompute the CIP, FIP and PIP bits of IPSR.
> >> + *
> >> + * For each of the three bits that is set in @data, the bit stays pending
> >> + * only if the matching queue CSR has its interrupt-enable bit set together
> >> + * with at least one of its error/notification bits. Every other one of
> >> + * the three bits is cleared.
> >> + *
> >> + * @s    : IOMMU state.
> >> + * @data : value carrying the IPSR bits to evaluate.
> >> + */
> >> +static void riscv_iommu_update_ipsr(RISCVIOMMUState *s, uint64_t data)
> >> +{
> >> +    const uint32_t all_bits = RISCV_IOMMU_IPSR_CIP | RISCV_IOMMU_IPSR_FIP |
> >> +                              RISCV_IOMMU_IPSR_PIP;
> >> +    uint32_t pending = 0;
> >> +
> >> +    if (data & RISCV_IOMMU_IPSR_CIP) {
> >> +        const uint32_t cqcsr = riscv_iommu_reg_get32(s,
> >> +                                                     RISCV_IOMMU_REG_CQCSR);
> >> +        const uint32_t causes = RISCV_IOMMU_CQCSR_FENCE_W_IP |
> >> +                                RISCV_IOMMU_CQCSR_CMD_ILL |
> >> +                                RISCV_IOMMU_CQCSR_CMD_TO |
> >> +                                RISCV_IOMMU_CQCSR_CQMF;
> >> +
> >> +        if ((cqcsr & RISCV_IOMMU_CQCSR_CIE) && (cqcsr & causes)) {
> >> +            pending |= RISCV_IOMMU_IPSR_CIP;
> >> +        }
> >> +    }
> >> +
> >> +    if (data & RISCV_IOMMU_IPSR_FIP) {
> >> +        const uint32_t fqcsr = riscv_iommu_reg_get32(s,
> >> +                                                     RISCV_IOMMU_REG_FQCSR);
> >> +        const uint32_t causes = RISCV_IOMMU_FQCSR_FQOF |
> >> +                                RISCV_IOMMU_FQCSR_FQMF;
> >> +
> >> +        if ((fqcsr & RISCV_IOMMU_FQCSR_FIE) && (fqcsr & causes)) {
> >> +            pending |= RISCV_IOMMU_IPSR_FIP;
> >> +        }
> >> +    }
> >> +
> >> +    if (data & RISCV_IOMMU_IPSR_PIP) {
> >> +        const uint32_t pqcsr = riscv_iommu_reg_get32(s,
> >> +                                                     RISCV_IOMMU_REG_PQCSR);
> >> +        const uint32_t causes = RISCV_IOMMU_PQCSR_PQOF |
> >> +                                RISCV_IOMMU_PQCSR_PQMF;
> >> +
> >> +        if ((pqcsr & RISCV_IOMMU_PQCSR_PIE) && (pqcsr & causes)) {
> >> +            pending |= RISCV_IOMMU_IPSR_PIP;
> >> +        }
> >> +    }
> >> +
> >> +    /* Whatever is not pending among the three bits gets cleared. */
> >> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, pending,
> >> +                          all_bits & ~pending);
> >> +}
> >> +
> >> +static MemTxResult riscv_iommu_mmio_write(void *opaque, hwaddr addr,
> >> +    uint64_t data, unsigned size, MemTxAttrs attrs)
> >> +{
> >> +    riscv_iommu_process_fn *process_fn = NULL;
> >> +    RISCVIOMMUState *s = opaque;
> >> +    uint32_t regb = addr & ~3;
> >> +    uint32_t busy = 0;
> >> +    uint64_t val = 0;
> >> +
> >> +    if ((addr & (size - 1)) != 0) {
> >> +        /* Unsupported MMIO alignment or access size */
> >> +        return MEMTX_ERROR;
> >> +    }
> >> +
> >> +    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
> >> +        /* Unsupported MMIO access location. */
> >> +        return MEMTX_ACCESS_ERROR;
> >> +    }
> >> +
> >> +    /* Track actionable MMIO write. */
> >> +    switch (regb) {
> >> +    case RISCV_IOMMU_REG_DDTP:
> >> +    case RISCV_IOMMU_REG_DDTP + 4:
> >> +        process_fn = riscv_iommu_process_ddtp;
> >> +        regb = RISCV_IOMMU_REG_DDTP;
> >> +        busy = RISCV_IOMMU_DDTP_BUSY;
> >> +        break;
> >> +
> >> +    case RISCV_IOMMU_REG_CQT:
> >> +        process_fn = riscv_iommu_process_cq_tail;
> >> +        break;
> >> +
> >> +    case RISCV_IOMMU_REG_CQCSR:
> >> +        process_fn = riscv_iommu_process_cq_control;
> >> +        busy = RISCV_IOMMU_CQCSR_BUSY;
> >> +        break;
> >> +
> >> +    case RISCV_IOMMU_REG_FQCSR:
> >> +        process_fn = riscv_iommu_process_fq_control;
> >> +        busy = RISCV_IOMMU_FQCSR_BUSY;
> >> +        break;
> >> +
> >> +    case RISCV_IOMMU_REG_PQCSR:
> >> +        process_fn = riscv_iommu_process_pq_control;
> >> +        busy = RISCV_IOMMU_PQCSR_BUSY;
> >> +        break;
> >> +
> >> +    case RISCV_IOMMU_REG_ICVEC:
> >> +        process_fn = riscv_iommu_process_icvec_update;
> >> +        break;
> >> +
> >> +    case RISCV_IOMMU_REG_IPSR:
> >> +        /*
> >> +         * IPSR has special procedures to update. Execute it
> >> +         * and exit.
> >> +         */
> >> +        if (size == 4) {
> >> +            uint32_t ro = ldl_le_p(&s->regs_ro[addr]);
> >> +            uint32_t wc = ldl_le_p(&s->regs_wc[addr]);
> >> +            uint32_t rw = ldl_le_p(&s->regs_rw[addr]);
> >> +            stl_le_p(&val, ((rw & ro) | (data & ~ro)) & ~(data & wc));
> >> +        } else if (size == 8) {
> >> +            uint64_t ro = ldq_le_p(&s->regs_ro[addr]);
> >> +            uint64_t wc = ldq_le_p(&s->regs_wc[addr]);
> >> +            uint64_t rw = ldq_le_p(&s->regs_rw[addr]);
> >> +            stq_le_p(&val, ((rw & ro) | (data & ~ro)) & ~(data & wc));
> >> +        }
> >> +
> >> +        riscv_iommu_update_ipsr(s, val);
> >> +
> >> +        return MEMTX_OK;
> >> +
> >> +    default:
> >> +        break;
> >> +    }
> >> +
> >> +    /*
> >> +     * Register updates might not be synchronized with core logic.
> >> +     * If system software updates a register while the relevant BUSY
> >> +     * bit is set, IOMMU behavior of additional writes to the register
> >> +     * is UNSPECIFIED.
> >> +     */
> >> +    qemu_spin_lock(&s->regs_lock);
> >> +    if (size == 1) {
> >> +        uint8_t ro = s->regs_ro[addr];
> >> +        uint8_t wc = s->regs_wc[addr];
> >> +        uint8_t rw = s->regs_rw[addr];
> >> +        s->regs_rw[addr] = ((rw & ro) | (data & ~ro)) & ~(data & wc);
> >> +    } else if (size == 2) {
> >> +        uint16_t ro = lduw_le_p(&s->regs_ro[addr]);
> >> +        uint16_t wc = lduw_le_p(&s->regs_wc[addr]);
> >> +        uint16_t rw = lduw_le_p(&s->regs_rw[addr]);
> >> +        stw_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
> >> +    } else if (size == 4) {
> >> +        uint32_t ro = ldl_le_p(&s->regs_ro[addr]);
> >> +        uint32_t wc = ldl_le_p(&s->regs_wc[addr]);
> >> +        uint32_t rw = ldl_le_p(&s->regs_rw[addr]);
> >> +        stl_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
> >> +    } else if (size == 8) {
> >> +        uint64_t ro = ldq_le_p(&s->regs_ro[addr]);
> >> +        uint64_t wc = ldq_le_p(&s->regs_wc[addr]);
> >> +        uint64_t rw = ldq_le_p(&s->regs_rw[addr]);
> >> +        stq_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
> >> +    }
> >> +
> >> +    /* Busy flag update, MSB 4-byte register. */
> >> +    if (busy) {
> >> +        uint32_t rw = ldl_le_p(&s->regs_rw[regb]);
> >> +        stl_le_p(&s->regs_rw[regb], rw | busy);
> >> +    }
> >> +    qemu_spin_unlock(&s->regs_lock);
> >> +
> >> +    if (process_fn) {
> >> +        qemu_mutex_lock(&s->core_lock);
> >> +        process_fn(s);
> >> +        qemu_mutex_unlock(&s->core_lock);
> >> +    }
> >> +
> >> +    return MEMTX_OK;
> >> +}
> >> +
> >> +static MemTxResult riscv_iommu_mmio_read(void *opaque, hwaddr addr,
> >> +    uint64_t *data, unsigned size, MemTxAttrs attrs)
> >> +{
> >> +    RISCVIOMMUState *s = opaque;
> >> +    uint64_t val = -1;
> >> +    uint8_t *ptr;
> >> +
> >> +    if ((addr & (size - 1)) != 0) {
> >> +        /* Unsupported MMIO alignment. */
> >> +        return MEMTX_ERROR;
> >> +    }
> >> +
> >> +    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
> >> +        return MEMTX_ACCESS_ERROR;
> >> +    }
> >> +
> >> +    ptr = &s->regs_rw[addr];
> >> +
> >> +    if (size == 1) {
> >> +        val = (uint64_t)*ptr;
> >> +    } else if (size == 2) {
> >> +        val = lduw_le_p(ptr);
> >> +    } else if (size == 4) {
> >> +        val = ldl_le_p(ptr);
> >> +    } else if (size == 8) {
> >> +        val = ldq_le_p(ptr);
> >> +    } else {
> >> +        return MEMTX_ERROR;
> >> +    }
> >> +
> >> +    *data = val;
> >> +
> >> +    return MEMTX_OK;
> >> +}
> >> +
> >> +static const MemoryRegionOps riscv_iommu_mmio_ops = {
> >> +    .read_with_attrs = riscv_iommu_mmio_read,
> >> +    .write_with_attrs = riscv_iommu_mmio_write,
> >> +    .endianness = DEVICE_NATIVE_ENDIAN,
> >> +    .impl = {
> >> +        .min_access_size = 4,
> >> +        .max_access_size = 8,
> >> +        .unaligned = false,
> >> +    },
> >> +    .valid = {
> >> +        .min_access_size = 4,
> >> +        .max_access_size = 8,
> >> +    }
> >> +};
> >> +
> >> +/*
> >> + * Translations matching MSI pattern check are redirected to "riscv-iommu-trap"
> >> + * memory region as untranslated address, for additional MSI/MRIF interception
> >> + * by IOMMU interrupt remapping implementation.
> >> + * Note: Device emulation code generating an MSI is expected to provide valid
> >> + * memory transaction attributes with requester_id set.
> >> + */
> >> +static MemTxResult riscv_iommu_trap_write(void *opaque, hwaddr addr,
> >> +    uint64_t data, unsigned size, MemTxAttrs attrs)
> >> +{
> >> +    RISCVIOMMUState* s = (RISCVIOMMUState *)opaque;
> >> +    RISCVIOMMUContext *ctx;
> >> +    MemTxResult res;
> >> +    void *ref;
> >> +    uint32_t devid = attrs.requester_id;
> >> +
> >> +    if (attrs.unspecified) {
> >> +        return MEMTX_ACCESS_ERROR;
> >> +    }
> >> +
> >> +    /* FIXME: PCIe bus remapping for attached endpoints. */
> >> +    devid |= s->bus << 8;
> >> +
> >> +    ctx = riscv_iommu_ctx(s, devid, 0, &ref);
> >> +    if (ctx == NULL) {
> >> +        res = MEMTX_ACCESS_ERROR;
> >> +    } else {
> >> +        res = riscv_iommu_msi_write(s, ctx, addr, data, size, attrs);
> >> +    }
> >> +    riscv_iommu_ctx_put(s, ref);
> >> +    return res;
> >> +}
> >> +
> >> +static MemTxResult riscv_iommu_trap_read(void *opaque, hwaddr addr,
> >> +    uint64_t *data, unsigned size, MemTxAttrs attrs)
> >> +{
> >> +    return MEMTX_ACCESS_ERROR;
> >> +}
> >> +
> >> +static const MemoryRegionOps riscv_iommu_trap_ops = {
> >> +    .read_with_attrs = riscv_iommu_trap_read,
> >> +    .write_with_attrs = riscv_iommu_trap_write,
> >> +    .endianness = DEVICE_LITTLE_ENDIAN,
> >> +    .impl = {
> >> +        .min_access_size = 4,
> >> +        .max_access_size = 8,
> >> +        .unaligned = true,
> >> +    },
> >> +    .valid = {
> >> +        .min_access_size = 4,
> >> +        .max_access_size = 8,
> >> +    }
> >> +};
> >> +
> >> +static void riscv_iommu_realize(DeviceState *dev, Error **errp)
> >> +{
> >> +    RISCVIOMMUState *s = RISCV_IOMMU(dev);
> >> +
> >> +    s->cap = s->version & RISCV_IOMMU_CAP_VERSION;
> >> +    if (s->enable_msi) {
> >> +        s->cap |= RISCV_IOMMU_CAP_MSI_FLAT | RISCV_IOMMU_CAP_MSI_MRIF;
> >> +    }
> >> +    if (s->enable_s_stage) {
> >> +        s->cap |= RISCV_IOMMU_CAP_SV32 | RISCV_IOMMU_CAP_SV39 |
> >> +                  RISCV_IOMMU_CAP_SV48 | RISCV_IOMMU_CAP_SV57;
> >> +    }
> >> +    if (s->enable_g_stage) {
> >> +        s->cap |= RISCV_IOMMU_CAP_SV32X4 | RISCV_IOMMU_CAP_SV39X4 |
> >> +                  RISCV_IOMMU_CAP_SV48X4 | RISCV_IOMMU_CAP_SV57X4;
> >> +    }
> >> +    /* Report QEMU target physical address space limits */
> >> +    s->cap = set_field(s->cap, RISCV_IOMMU_CAP_PAS,
> >> +                       TARGET_PHYS_ADDR_SPACE_BITS);
> >> +
> >> +    /* TODO: method to report supported PID bits */
> >> +    s->pid_bits = 8; /* restricted to size of MemTxAttrs.pid */
> >> +    s->cap |= RISCV_IOMMU_CAP_PD8;
> >> +
> >> +    /* Out-of-reset translation mode: OFF (DMA disabled) BARE (passthrough) */
> >> +    s->ddtp = set_field(0, RISCV_IOMMU_DDTP_MODE, s->enable_off ?
> >> +                        RISCV_IOMMU_DDTP_MODE_OFF : RISCV_IOMMU_DDTP_MODE_BARE);
> >> +
> >> +    /* register storage */
> >> +    s->regs_rw = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
> >> +    s->regs_ro = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
> >> +    s->regs_wc = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
> >> +
> >> +     /* Mark all registers read-only */
> >> +    memset(s->regs_ro, 0xff, RISCV_IOMMU_REG_SIZE);
> >> +
> >> +    /*
> >> +     * Register complete MMIO space, including MSI/PBA registers.
> >> +     * Note, PCIDevice implementation will add overlapping MR for MSI/PBA,
> >> +     * managed directly by the PCIDevice implementation.
> >> +     */
> >> +    memory_region_init_io(&s->regs_mr, OBJECT(dev), &riscv_iommu_mmio_ops, s,
> >> +        "riscv-iommu-regs", RISCV_IOMMU_REG_SIZE);
> >> +
> >> +    /* Set power-on register state */
> >> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_CAP], s->cap);
> >> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_FCTL], 0);
> >> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FCTL],
> >> +             ~(RISCV_IOMMU_FCTL_BE | RISCV_IOMMU_FCTL_WSI));
> >> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_DDTP],
> >> +        ~(RISCV_IOMMU_DDTP_PPN | RISCV_IOMMU_DDTP_MODE));
> >> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQB],
> >> +        ~(RISCV_IOMMU_CQB_LOG2SZ | RISCV_IOMMU_CQB_PPN));
> >> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQB],
> >> +        ~(RISCV_IOMMU_FQB_LOG2SZ | RISCV_IOMMU_FQB_PPN));
> >> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQB],
> >> +        ~(RISCV_IOMMU_PQB_LOG2SZ | RISCV_IOMMU_PQB_PPN));
> >> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQMF |
> >> +        RISCV_IOMMU_CQCSR_CMD_TO | RISCV_IOMMU_CQCSR_CMD_ILL);
> >> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQON |
> >> +        RISCV_IOMMU_CQCSR_BUSY);
> >> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQMF |
> >> +        RISCV_IOMMU_FQCSR_FQOF);
> >> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQON |
> >> +        RISCV_IOMMU_FQCSR_BUSY);
> >> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQMF |
> >> +        RISCV_IOMMU_PQCSR_PQOF);
> >> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQON |
> >> +        RISCV_IOMMU_PQCSR_BUSY);
> >> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_IPSR], ~0);
> >> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_ICVEC], 0);
> >> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_DDTP], s->ddtp);
> >> +
> >> +    /* Memory region for downstream access, if specified. */
> >> +    if (s->target_mr) {
> >> +        s->target_as = g_new0(AddressSpace, 1);
> >> +        address_space_init(s->target_as, s->target_mr,
> >> +            "riscv-iommu-downstream");
> >> +    } else {
> >> +        /* Fallback to global system memory. */
> >> +        s->target_as = &address_space_memory;
> >> +    }
> >> +
> >> +    /* Memory region for untranslated MRIF/MSI writes */
> >> +    memory_region_init_io(&s->trap_mr, OBJECT(dev), &riscv_iommu_trap_ops, s,
> >> +            "riscv-iommu-trap", ~0ULL);
> >> +    address_space_init(&s->trap_as, &s->trap_mr, "riscv-iommu-trap-as");
> >> +
> >> +    /* Device translation context cache */
> >> +    s->ctx_cache = g_hash_table_new_full(__ctx_hash, __ctx_equal,
> >> +                                         g_free, NULL);
> >> +    qemu_mutex_init(&s->ctx_lock);
> >> +
> >> +    s->iommus.le_next = NULL;
> >> +    s->iommus.le_prev = NULL;
> >> +    QLIST_INIT(&s->spaces);
> >> +    qemu_mutex_init(&s->core_lock);
> >> +    qemu_spin_init(&s->regs_lock);
> >> +}
> >> +
> >> +static void riscv_iommu_unrealize(DeviceState *dev)
> >> +{
> >> +    RISCVIOMMUState *s = RISCV_IOMMU(dev);
> >> +
> >> +    qemu_mutex_destroy(&s->core_lock);
> >> +    g_hash_table_unref(s->ctx_cache);
> >> +}
> >> +
> >> +static Property riscv_iommu_properties[] = {
> >> +    DEFINE_PROP_UINT32("version", RISCVIOMMUState, version,
> >> +        RISCV_IOMMU_SPEC_DOT_VER),
> >> +    DEFINE_PROP_UINT32("bus", RISCVIOMMUState, bus, 0x0),
> >> +    DEFINE_PROP_BOOL("intremap", RISCVIOMMUState, enable_msi, TRUE),
> >> +    DEFINE_PROP_BOOL("off", RISCVIOMMUState, enable_off, TRUE),
> >> +    DEFINE_PROP_BOOL("s-stage", RISCVIOMMUState, enable_s_stage, TRUE),
> >> +    DEFINE_PROP_BOOL("g-stage", RISCVIOMMUState, enable_g_stage, TRUE),
> >> +    DEFINE_PROP_LINK("downstream-mr", RISCVIOMMUState, target_mr,
> >> +        TYPE_MEMORY_REGION, MemoryRegion *),
> >> +    DEFINE_PROP_END_OF_LIST(),
> >> +};
> >> +
> >> +static void riscv_iommu_class_init(ObjectClass *klass, void* data)
> >> +{
> >> +    DeviceClass *dc = DEVICE_CLASS(klass);
> >> +
> >> +    /* internal device for riscv-iommu-{pci/sys}, not user-creatable */
> >> +    dc->user_creatable = false;
> >> +    dc->realize = riscv_iommu_realize;
> >> +    dc->unrealize = riscv_iommu_unrealize;
> >> +    device_class_set_props(dc, riscv_iommu_properties);
> >> +}
> >> +
> >> +static const TypeInfo riscv_iommu_info = {
> >> +    .name = TYPE_RISCV_IOMMU,
> >> +    .parent = TYPE_DEVICE,
> >> +    .instance_size = sizeof(RISCVIOMMUState),
> >> +    .class_init = riscv_iommu_class_init,
> >> +};
> >> +
> >> +static const char *IOMMU_FLAG_STR[] = {
> >> +    "NA",
> >> +    "RO",
> >> +    "WR",
> >> +    "RW",
> >> +};
> >> +
> >> +/* RISC-V IOMMU Memory Region - Address Translation Space */
> >> +static IOMMUTLBEntry riscv_iommu_memory_region_translate(
> >> +    IOMMUMemoryRegion *iommu_mr, hwaddr addr,
> >> +    IOMMUAccessFlags flag, int iommu_idx)
> >> +{
> >> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
> >> +    RISCVIOMMUContext *ctx;
> >> +    void *ref;
> >> +    IOMMUTLBEntry iotlb = {
> >> +        .iova = addr,
> >> +        .target_as = as->iommu->target_as,
> >> +        .addr_mask = ~0ULL,
> >> +        .perm = flag,
> >> +    };
> >> +
> >> +    ctx = riscv_iommu_ctx(as->iommu, as->devid, iommu_idx, &ref);
> >> +    if (ctx == NULL) {
> >> +        /* Translation disabled or invalid. */
> >> +        iotlb.addr_mask = 0;
> >> +        iotlb.perm = IOMMU_NONE;
> >> +    } else if (riscv_iommu_translate(as->iommu, ctx, &iotlb)) {
> >> +        /* Translation disabled or fault reported. */
> >> +        iotlb.addr_mask = 0;
> >> +        iotlb.perm = IOMMU_NONE;
> >> +    }
> >> +
> >> +    /* Trace all dma translations with original access flags. */
> >> +    trace_riscv_iommu_dma(as->iommu->parent_obj.id, PCI_BUS_NUM(as->devid),
> >> +                          PCI_SLOT(as->devid), PCI_FUNC(as->devid), iommu_idx,
> >> +                          IOMMU_FLAG_STR[flag & IOMMU_RW], iotlb.iova,
> >> +                          iotlb.translated_addr);
> >> +
> >> +    riscv_iommu_ctx_put(as->iommu, ref);
> >> +
> >> +    return iotlb;
> >> +}
> >> +
> >> +static int riscv_iommu_memory_region_notify(
> >> +    IOMMUMemoryRegion *iommu_mr, IOMMUNotifierFlag old,
> >> +    IOMMUNotifierFlag new, Error **errp)
> >> +{
> >> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
> >> +
> >> +    if (old == IOMMU_NOTIFIER_NONE) {
> >> +        as->notifier = true;
> >> +        trace_riscv_iommu_notifier_add(iommu_mr->parent_obj.name);
> >> +    } else if (new == IOMMU_NOTIFIER_NONE) {
> >> +        as->notifier = false;
> >> +        trace_riscv_iommu_notifier_del(iommu_mr->parent_obj.name);
> >> +    }
> >> +
> >> +    return 0;
> >> +}
> >> +
> >> +static inline bool pci_is_iommu(PCIDevice *pdev)
> >> +{
> >> +    return pci_get_word(pdev->config + PCI_CLASS_DEVICE) == 0x0806;
> >> +}
> >> +
> >> +static AddressSpace *riscv_iommu_find_as(PCIBus *bus, void *opaque, int devfn)
> >> +{
> >> +    RISCVIOMMUState *s = (RISCVIOMMUState *) opaque;
> >> +    PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
> >> +    AddressSpace *as = NULL;
> >> +
> >> +    if (pdev && pci_is_iommu(pdev)) {
> >> +        return s->target_as;
> >> +    }
> >> +
> >> +    /* Find first registered IOMMU device */
> >> +    while (s->iommus.le_prev) {
> >> +        s = *(s->iommus.le_prev);
> >> +    }
> >> +
> >> +    /* Find first matching IOMMU */
> >> +    while (s != NULL && as == NULL) {
> >> +        as = riscv_iommu_space(s, PCI_BUILD_BDF(pci_bus_num(bus), devfn));
> >> +        s = s->iommus.le_next;
> >> +    }
> >> +
> >> +    return as ? as : &address_space_memory;
> >> +}
> >> +
> >> +static const PCIIOMMUOps riscv_iommu_ops = {
> >> +    .get_address_space = riscv_iommu_find_as,
> >> +};
> >> +
> >> +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
> >> +        Error **errp)
> >> +{
> >> +    if (bus->iommu_ops &&
> >> +        bus->iommu_ops->get_address_space == riscv_iommu_find_as) {
> >> +        /* Allow multiple IOMMUs on the same PCIe bus, link known devices */
> >> +        RISCVIOMMUState *last = (RISCVIOMMUState *)bus->iommu_opaque;
> >> +        QLIST_INSERT_AFTER(last, iommu, iommus);
> >> +    } else if (!bus->iommu_ops && !bus->iommu_opaque) {
> >> +        pci_setup_iommu(bus, &riscv_iommu_ops, iommu);
> >> +    } else {
> >> +        error_setg(errp, "can't register secondary IOMMU for PCI bus #%d",
> >> +            pci_bus_num(bus));
> >> +    }
> >> +}
> >> +
> >> +static int riscv_iommu_memory_region_index(IOMMUMemoryRegion *iommu_mr,
> >> +    MemTxAttrs attrs)
> >> +{
> >> +    return attrs.unspecified ? RISCV_IOMMU_NOPROCID : (int)attrs.pid;
> >> +}
> >> +
> >> +static int riscv_iommu_memory_region_index_len(IOMMUMemoryRegion *iommu_mr)
> >> +{
> >> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
> >> +    return 1 << as->iommu->pid_bits;
> >> +}
> >> +
> >> +static void riscv_iommu_memory_region_init(ObjectClass *klass, void *data)
> >> +{
> >> +    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
> >> +
> >> +    imrc->translate = riscv_iommu_memory_region_translate;
> >> +    imrc->notify_flag_changed = riscv_iommu_memory_region_notify;
> >> +    imrc->attrs_to_index = riscv_iommu_memory_region_index;
> >> +    imrc->num_indexes = riscv_iommu_memory_region_index_len;
> >> +}
> >> +
> >> +static const TypeInfo riscv_iommu_memory_region_info = {
> >> +    .parent = TYPE_IOMMU_MEMORY_REGION,
> >> +    .name = TYPE_RISCV_IOMMU_MEMORY_REGION,
> >> +    .class_init = riscv_iommu_memory_region_init,
> >> +};
> >> +
> >> +static void riscv_iommu_register_mr_types(void)
> >> +{
> >> +    type_register_static(&riscv_iommu_memory_region_info);
> >> +    type_register_static(&riscv_iommu_info);
> >> +}
> >> +
> >> +type_init(riscv_iommu_register_mr_types);
> >> diff --git a/hw/riscv/riscv-iommu.h b/hw/riscv/riscv-iommu.h
> >> new file mode 100644
> >> index 0000000000..6d76cb9b1a
> >> --- /dev/null
> >> +++ b/hw/riscv/riscv-iommu.h
> >> @@ -0,0 +1,148 @@
> >> +/*
> >> + * QEMU emulation of an RISC-V IOMMU
> >> + *
> >> + * Copyright (C) 2022-2023 Rivos Inc.
> >> + *
> >> + * This program is free software; you can redistribute it and/or modify
> >> + * it under the terms of the GNU General Public License as published by
> >> + * the Free Software Foundation; either version 2 of the License.
> >> + *
> >> + * This program is distributed in the hope that it will be useful,
> >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> >> + * GNU General Public License for more details.
> >> + *
> >> + * You should have received a copy of the GNU General Public License along
> >> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> >> + */
> >> +
> >> +#ifndef HW_RISCV_IOMMU_STATE_H
> >> +#define HW_RISCV_IOMMU_STATE_H
> >> +
> >> +#include "qemu/osdep.h"
> >> +#include "qom/object.h"
> >> +
> >> +#include "hw/riscv/iommu.h"
> >> +
> >> +struct RISCVIOMMUState {
> >> +    /*< private >*/
> >> +    DeviceState parent_obj;
> >> +
> >> +    /*< public >*/
> >> +    uint32_t version;     /* Reported interface version number */
> >> +    uint32_t pid_bits;    /* process identifier width */
> >> +    uint32_t bus;         /* PCI bus mapping for non-root endpoints */
> >> +
> >> +    uint64_t cap;         /* IOMMU supported capabilities */
> >> +    uint64_t fctl;        /* IOMMU enabled features */
> >> +
> >> +    bool enable_off;      /* Enable out-of-reset OFF mode (DMA disabled) */
> >> +    bool enable_msi;      /* Enable MSI remapping */
> >> +    bool enable_s_stage;  /* Enable S/VS-Stage translation */
> >> +    bool enable_g_stage;  /* Enable G-Stage translation */
> >> +
> >> +    /* IOMMU Internal State */
> >> +    uint64_t ddtp;        /* Validated Device Directory Tree Root Pointer */
> >> +
> >> +    dma_addr_t cq_addr;   /* Command queue base physical address */
> >> +    dma_addr_t fq_addr;   /* Fault/event queue base physical address */
> >> +    dma_addr_t pq_addr;   /* Page request queue base physical address */
> >> +
> >> +    uint32_t cq_mask;     /* Command queue index bit mask */
> >> +    uint32_t fq_mask;     /* Fault/event queue index bit mask */
> >> +    uint32_t pq_mask;     /* Page request queue index bit mask */
> >> +
> >> +    /* interrupt notifier */
> >> +    void (*notify)(RISCVIOMMUState *iommu, unsigned vector);
> >> +    /* register icvec interrupts */
> >> +    void (*icvec_update)(RISCVIOMMUState *iommu,
> >> +                         uint32_t civ, uint32_t fiv,
> >> +                         uint32_t pmiv, uint32_t piv);
> >> +
> >> +    /* IOMMU State Machine */
> >> +    QemuThread core_proc; /* Background processing thread */
> >> +    QemuMutex core_lock;  /* Global IOMMU lock, used for cache/regs updates */
> >> +    QemuCond core_cond;   /* Background processing wake up signal */
> >> +    unsigned core_exec;   /* Processing thread execution actions */
> >> +
> >> +    /* IOMMU target address space */
> >> +    AddressSpace *target_as;
> >> +    MemoryRegion *target_mr;
> >> +
> >> +    /* MSI / MRIF access trap */
> >> +    AddressSpace trap_as;
> >> +    MemoryRegion trap_mr;
> >> +
> >> +    GHashTable *ctx_cache;          /* Device translation Context Cache */
> >> +    QemuMutex ctx_lock;      /* Device translation Cache update lock */
> >> +
> >> +    /* MMIO Hardware Interface */
> >> +    MemoryRegion regs_mr;
> >> +    QemuSpin regs_lock;
> >> +    uint8_t *regs_rw;  /* register state (user write) */
> >> +    uint8_t *regs_wc;  /* write-1-to-clear mask */
> >> +    uint8_t *regs_ro;  /* read-only mask */
> >> +
> >> +    QLIST_ENTRY(RISCVIOMMUState) iommus;
> >> +    QLIST_HEAD(, RISCVIOMMUSpace) spaces;
> >> +};
> >> +
> >> +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
> >> +         Error **errp);
> >> +
> >> +/* private helpers */
> >> +
> >> +/* Register helper functions */
> >> +static inline uint32_t riscv_iommu_reg_mod32(RISCVIOMMUState *s,
> >> +    unsigned idx, uint32_t set, uint32_t clr)
> >> +{
> >> +    uint32_t val;
> >> +    qemu_spin_lock(&s->regs_lock);
> >> +    val = ldl_le_p(s->regs_rw + idx);
> >> +    stl_le_p(s->regs_rw + idx, (val & ~clr) | set);
> >> +    qemu_spin_unlock(&s->regs_lock);
> >> +    return val;
> >> +}
> >> +
> >> +static inline void riscv_iommu_reg_set32(RISCVIOMMUState *s,
> >> +    unsigned idx, uint32_t set)
> >> +{
> >> +    qemu_spin_lock(&s->regs_lock);
> >> +    stl_le_p(s->regs_rw + idx, set);
> >> +    qemu_spin_unlock(&s->regs_lock);
> >> +}
> >> +
> >> +static inline uint32_t riscv_iommu_reg_get32(RISCVIOMMUState *s,
> >> +    unsigned idx)
> >> +{
> >> +    return ldl_le_p(s->regs_rw + idx);
> >> +}
> >> +
> >> +static inline uint64_t riscv_iommu_reg_mod64(RISCVIOMMUState *s,
> >> +    unsigned idx, uint64_t set, uint64_t clr)
> >> +{
> >> +    uint64_t val;
> >> +    qemu_spin_lock(&s->regs_lock);
> >> +    val = ldq_le_p(s->regs_rw + idx);
> >> +    stq_le_p(s->regs_rw + idx, (val & ~clr) | set);
> >> +    qemu_spin_unlock(&s->regs_lock);
> >> +    return val;
> >> +}
> >> +
> >> +static inline void riscv_iommu_reg_set64(RISCVIOMMUState *s,
> >> +    unsigned idx, uint64_t set)
> >> +{
> >> +    qemu_spin_lock(&s->regs_lock);
> >> +    stq_le_p(s->regs_rw + idx, set);
> >> +    qemu_spin_unlock(&s->regs_lock);
> >> +}
> >> +
> >> +static inline uint64_t riscv_iommu_reg_get64(RISCVIOMMUState *s,
> >> +    unsigned idx)
> >> +{
> >> +    return ldq_le_p(s->regs_rw + idx);
> >> +}
> >> +
> >> +
> >> +
> >> +#endif
> >> diff --git a/hw/riscv/trace-events b/hw/riscv/trace-events
> >> new file mode 100644
> >> index 0000000000..bdd8b657a6
> >> --- /dev/null
> >> +++ b/hw/riscv/trace-events
> >> @@ -0,0 +1,12 @@
> >> +# See documentation at docs/devel/tracing.rst
> >> +
> >> +# riscv-iommu.c
> >> +riscv_iommu_new(const char *id, unsigned b, unsigned d, unsigned f) "%s: device attached %04x:%02x.%d"
> >> +riscv_iommu_flt(const char *id, unsigned b, unsigned d, unsigned f, uint64_t reason, uint64_t iova) "%s: fault %04x:%02x.%u reason: 0x%"PRIx64" iova: 0x%"PRIx64
> >> +riscv_iommu_pri(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova) "%s: page request %04x:%02x.%u iova: 0x%"PRIx64
> >> +riscv_iommu_dma(const char *id, unsigned b, unsigned d, unsigned f, unsigned pasid, const char *dir, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u #%u %s 0x%"PRIx64" -> 0x%"PRIx64
> >> +riscv_iommu_msi(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u MSI 0x%"PRIx64" -> 0x%"PRIx64
> >> +riscv_iommu_mrif_notification(const char *id, uint32_t nid, uint64_t phys) "%s: sent MRIF notification 0x%x to 0x%"PRIx64
> >> +riscv_iommu_cmd(const char *id, uint64_t l, uint64_t u) "%s: command 0x%"PRIx64" 0x%"PRIx64
> >> +riscv_iommu_notifier_add(const char *id) "%s: dev-iotlb notifier added"
> >> +riscv_iommu_notifier_del(const char *id) "%s: dev-iotlb notifier removed"
> >> diff --git a/hw/riscv/trace.h b/hw/riscv/trace.h
> >> new file mode 100644
> >> index 0000000000..8c0e3ca1f3
> >> --- /dev/null
> >> +++ b/hw/riscv/trace.h
> >> @@ -0,0 +1 @@
> >> +#include "trace/trace-hw_riscv.h"
> >> diff --git a/include/hw/riscv/iommu.h b/include/hw/riscv/iommu.h
> >> new file mode 100644
> >> index 0000000000..070ee69973
> >> --- /dev/null
> >> +++ b/include/hw/riscv/iommu.h
> >> @@ -0,0 +1,36 @@
> >> +/*
> >> + * QEMU emulation of an RISC-V IOMMU
> >> + *
> >> + * Copyright (C) 2022-2023 Rivos Inc.
> >> + *
> >> + * This program is free software; you can redistribute it and/or modify
> >> + * it under the terms of the GNU General Public License as published by
> >> + * the Free Software Foundation; either version 2 of the License.
> >> + *
> >> + * This program is distributed in the hope that it will be useful,
> >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> >> + * GNU General Public License for more details.
> >> + *
> >> + * You should have received a copy of the GNU General Public License along
> >> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> >> + */
> >> +
> >> +#ifndef HW_RISCV_IOMMU_H
> >> +#define HW_RISCV_IOMMU_H
> >> +
> >> +#include "qemu/osdep.h"
> >> +#include "qom/object.h"
> >> +
> >> +#define TYPE_RISCV_IOMMU "riscv-iommu"
> >> +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUState, RISCV_IOMMU)
> >> +typedef struct RISCVIOMMUState RISCVIOMMUState;
> >> +
> >> +#define TYPE_RISCV_IOMMU_MEMORY_REGION "riscv-iommu-mr"
> >> +typedef struct RISCVIOMMUSpace RISCVIOMMUSpace;
> >> +
> >> +#define TYPE_RISCV_IOMMU_PCI "riscv-iommu-pci"
> >> +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUStatePci, RISCV_IOMMU_PCI)
> >> +typedef struct RISCVIOMMUStatePci RISCVIOMMUStatePci;
> >> +
> >> +#endif
> >> diff --git a/meson.build b/meson.build
> >> index a1e51277b0..359f836d8e 100644
> >> --- a/meson.build
> >> +++ b/meson.build
> >> @@ -3373,6 +3373,7 @@ if have_system
> >>       'hw/pci-host',
> >>       'hw/ppc',
> >>       'hw/rtc',
> >> +    'hw/riscv',
> >>       'hw/s390x',
> >>       'hw/scsi',
> >>       'hw/sd',
Daniel Henrique Barboza Sept. 3, 2024, 11:46 a.m. UTC | #7
On 8/26/24 11:18 PM, Tomasz Jeznach wrote:
> On Fri, Aug 23, 2024 at 5:42 AM Daniel Henrique Barboza
> <dbarboza@ventanamicro.com> wrote:
>>
>>
>>
>> On 8/20/24 12:16 PM, Jason Chien wrote:
>>> Hi Daniel,
>>>
>>> On 2024/8/1 下午 11:43, Daniel Henrique Barboza wrote:
>>>> From: Tomasz Jeznach <tjeznach@rivosinc.com>
>>>>
>>>> The RISC-V IOMMU specification is now ratified as-per the RISC-V
>>>> international process. The latest frozen specification can be found at:
>>>>
>>>> https://github.com/riscv-non-isa/riscv-iommu/releases/download/v1.0/riscv-iommu.pdf
>>>>
>>>> Add the foundation of the device emulation for RISC-V IOMMU. It includes
>>>> support for s-stage (sv32, sv39, sv48, sv57 caps) and g-stage (sv32x4,
>>>> sv39x4, sv48x4, sv57x4 caps).
>>>>
>>>> Other capabilities like ATS and DBG support will be added incrementally
>>>> in the next patches.
>>>>
>>>> Co-developed-by: Sebastien Boeuf <seb@rivosinc.com>
>>>> Signed-off-by: Sebastien Boeuf <seb@rivosinc.com>
>>>> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
>>>> Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
>>>> ---
>>>>    hw/riscv/Kconfig            |    4 +
>>>>    hw/riscv/meson.build        |    1 +
>>>>    hw/riscv/riscv-iommu-bits.h |   14 +
>>>>    hw/riscv/riscv-iommu.c      | 2017 +++++++++++++++++++++++++++++++++++
>>>>    hw/riscv/riscv-iommu.h      |  148 +++
>>>>    hw/riscv/trace-events       |   12 +
>>>>    hw/riscv/trace.h            |    1 +
>>>>    include/hw/riscv/iommu.h    |   36 +
>>>>    meson.build                 |    1 +
>>>>    9 files changed, 2234 insertions(+)
>>>>    create mode 100644 hw/riscv/riscv-iommu.c
>>>>    create mode 100644 hw/riscv/riscv-iommu.h
>>>>    create mode 100644 hw/riscv/trace-events
>>>>    create mode 100644 hw/riscv/trace.h
>>>>    create mode 100644 include/hw/riscv/iommu.h
>>>>
>>>> diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig
>>>> index a2030e3a6f..f69d6e3c8e 100644
>>>> --- a/hw/riscv/Kconfig
>>>> +++ b/hw/riscv/Kconfig
>>>> @@ -1,3 +1,6 @@
>>>> +config RISCV_IOMMU
>>>> +    bool
>>>> +
>>>>    config RISCV_NUMA
>>>>        bool
>>>> @@ -47,6 +50,7 @@ config RISCV_VIRT
>>>>        select SERIAL
>>>>        select RISCV_ACLINT
>>>>        select RISCV_APLIC
>>>> +    select RISCV_IOMMU
>>>>        select RISCV_IMSIC
>>>>        select SIFIVE_PLIC
>>>>        select SIFIVE_TEST
>>>> diff --git a/hw/riscv/meson.build b/hw/riscv/meson.build
>>>> index f872674093..cbc99c6e8e 100644
>>>> --- a/hw/riscv/meson.build
>>>> +++ b/hw/riscv/meson.build
>>>> @@ -10,5 +10,6 @@ riscv_ss.add(when: 'CONFIG_SIFIVE_U', if_true: files('sifive_u.c'))
>>>>    riscv_ss.add(when: 'CONFIG_SPIKE', if_true: files('spike.c'))
>>>>    riscv_ss.add(when: 'CONFIG_MICROCHIP_PFSOC', if_true: files('microchip_pfsoc.c'))
>>>>    riscv_ss.add(when: 'CONFIG_ACPI', if_true: files('virt-acpi-build.c'))
>>>> +riscv_ss.add(when: 'CONFIG_RISCV_IOMMU', if_true: files('riscv-iommu.c'))
>>>>    hw_arch += {'riscv': riscv_ss}
>>>> diff --git a/hw/riscv/riscv-iommu-bits.h b/hw/riscv/riscv-iommu-bits.h
>>>> index 9d9512ca71..6f1b9ab61b 100644
>>>> --- a/hw/riscv/riscv-iommu-bits.h
>>>> +++ b/hw/riscv/riscv-iommu-bits.h
>>>> @@ -69,6 +69,14 @@ struct riscv_iommu_pq_record {
>>>>    /* 5.3 IOMMU Capabilities (64bits) */
>>>>    #define RISCV_IOMMU_REG_CAP             0x0000
>>>>    #define RISCV_IOMMU_CAP_VERSION         GENMASK_ULL(7, 0)
>>>> +#define RISCV_IOMMU_CAP_SV32            BIT_ULL(8)
>>>> +#define RISCV_IOMMU_CAP_SV39            BIT_ULL(9)
>>>> +#define RISCV_IOMMU_CAP_SV48            BIT_ULL(10)
>>>> +#define RISCV_IOMMU_CAP_SV57            BIT_ULL(11)
>>>> +#define RISCV_IOMMU_CAP_SV32X4          BIT_ULL(16)
>>>> +#define RISCV_IOMMU_CAP_SV39X4          BIT_ULL(17)
>>>> +#define RISCV_IOMMU_CAP_SV48X4          BIT_ULL(18)
>>>> +#define RISCV_IOMMU_CAP_SV57X4          BIT_ULL(19)
>>>>    #define RISCV_IOMMU_CAP_MSI_FLAT        BIT_ULL(22)
>>>>    #define RISCV_IOMMU_CAP_MSI_MRIF        BIT_ULL(23)
>>>>    #define RISCV_IOMMU_CAP_T2GPA           BIT_ULL(26)
>>>> @@ -80,7 +88,9 @@ struct riscv_iommu_pq_record {
>>>>    /* 5.4 Features control register (32bits) */
>>>>    #define RISCV_IOMMU_REG_FCTL            0x0008
>>>> +#define RISCV_IOMMU_FCTL_BE             BIT(0)
>>>>    #define RISCV_IOMMU_FCTL_WSI            BIT(1)
>>>> +#define RISCV_IOMMU_FCTL_GXL            BIT(2)
>>>>    /* 5.5 Device-directory-table pointer (64bits) */
>>>>    #define RISCV_IOMMU_REG_DDTP            0x0010
>>>> @@ -203,6 +213,8 @@ struct riscv_iommu_dc {
>>>>    #define RISCV_IOMMU_DC_TC_DTF           BIT_ULL(4)
>>>>    #define RISCV_IOMMU_DC_TC_PDTV          BIT_ULL(5)
>>>>    #define RISCV_IOMMU_DC_TC_PRPR          BIT_ULL(6)
>>>> +#define RISCV_IOMMU_DC_TC_GADE          BIT_ULL(7)
>>>> +#define RISCV_IOMMU_DC_TC_SADE          BIT_ULL(8)
>>>>    #define RISCV_IOMMU_DC_TC_DPE           BIT_ULL(9)
>>>>    #define RISCV_IOMMU_DC_TC_SBE           BIT_ULL(10)
>>>>    #define RISCV_IOMMU_DC_TC_SXL           BIT_ULL(11)
>>>> @@ -309,9 +321,11 @@ enum riscv_iommu_fq_causes {
>>>>    /* Translation attributes fields */
>>>>    #define RISCV_IOMMU_PC_TA_V             BIT_ULL(0)
>>>> +#define RISCV_IOMMU_PC_TA_RESERVED      GENMASK_ULL(63, 32)
>>>>    /* First stage context fields */
>>>>    #define RISCV_IOMMU_PC_FSC_PPN          GENMASK_ULL(43, 0)
>>>> +#define RISCV_IOMMU_PC_FSC_RESERVED     GENMASK_ULL(59, 44)
>>>>    enum riscv_iommu_fq_ttypes {
>>>>        RISCV_IOMMU_FQ_TTYPE_NONE = 0,
>>>> diff --git a/hw/riscv/riscv-iommu.c b/hw/riscv/riscv-iommu.c
>>>> new file mode 100644
>>>> index 0000000000..ebe3a53a04
>>>> --- /dev/null
>>>> +++ b/hw/riscv/riscv-iommu.c
>>>> @@ -0,0 +1,2017 @@
>> (...)
>>
>>>> +
>>>> +/*
>>>> + * RISCV IOMMU Address Translation Lookup - Page Table Walk
>>>> + *
>>>> + * Note: Code is based on get_physical_address() from target/riscv/cpu_helper.c
>>>> + * Both implementation can be merged into single helper function in future.
>>>> + * Keeping them separate for now, as error reporting and flow specifics are
>>>> + * sufficiently different for separate implementation.
>>>> + *
>>>> + * @s        : IOMMU Device State
>>>> + * @ctx      : Translation context for device id and process address space id.
>>>> + * @iotlb    : translation data: physical address and access mode.
>>>> + * @return   : success or fault cause code.
>>>> + */
>>>> +static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
>>>> +    IOMMUTLBEntry *iotlb)
>>>> +{
>>>> +    dma_addr_t addr, base;
>>>> +    uint64_t satp, gatp, pte;
>>>> +    bool en_s, en_g;
>>>> +    struct {
>>>> +        unsigned char step;
>>>> +        unsigned char levels;
>>>> +        unsigned char ptidxbits;
>>>> +        unsigned char ptesize;
>>>> +    } sc[2];
>>>> +    /* Translation stage phase */
>>>> +    enum {
>>>> +        S_STAGE = 0,
>>>> +        G_STAGE = 1,
>>>> +    } pass;
>>>> +
>>>> +    satp = get_field(ctx->satp, RISCV_IOMMU_ATP_MODE_FIELD);
>>>> +    gatp = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);
>>>> +
>>>> +    en_s = satp != RISCV_IOMMU_DC_FSC_MODE_BARE;
>>>> +    en_g = gatp != RISCV_IOMMU_DC_IOHGATP_MODE_BARE;
>>>> +
>>>> +    /* Early check for MSI address match when IOVA == GPA */
>>>> +    if ((iotlb->perm & IOMMU_WO) &&
>>>> +        riscv_iommu_msi_check(s, ctx, iotlb->iova)) {
>>> I think the IOVA here may be a GVA, and we should not use a GVA to perform the MSI check. Section 2.3 states that we should always walk the first-stage page table to get the GPA (step 17) and then use that GPA to do the MSI check (step 18).
>>
>> That's a good point. Even if we rule out the address being a GVA by doing a second-stage
>> check (via en_g), we're still assuming that iotlb->iova == GPA without going through
>> the first-stage translation. We're basically gambling on a match by calling
>> riscv_iommu_msi_check() this early.
>>
>> I'll remove this early check entirely since we're doing a proper msi check with a
>> translated GPA at the end of first stage:
>>
>>
> 
> This check was required to ensure MSI translation is applied in case
> first-stage translation is set to BARE mode. In this case the IOVA
> provided is a valid GPA. Running the translation through a page walk with
> second-stage translation will incorrectly try to translate the GPA to a host
> physical page, likely hitting an IOPF.

Right. Let's keep the check and add a comment with this observation about why it is needed.

> 
> Looking back, I missed the removal of an important check of the en_s flag
> in the v2->v3 changes:
> 
>       /* Early check for MSI address match when IOVA == GPA */
> -    if (!en_s && (iotlb->perm & IOMMU_WO) &&
> +    if ((iotlb->perm & IOMMU_WO) &&
>           riscv_iommu_msi_check(s, ctx, iotlb->iova)) {
>           iotlb->target_as = &s->trap_as;
> 
> I'd suggest fixing the missing en_s check, to enable proper handling of MSIP.

The '!en_s' check was removed because it was breaking irqbypass. I'll let Drew explain
more about it since he's been working on the kernel support for this use case.


Thanks,

Daniel


> 
> Thanks,
> - Tomasz
> 
>>>> +            /* Translation phase completed (GPA or SPA) */
>>>> +            iotlb->translated_addr = base;
>>>> +            iotlb->perm = (pte & PTE_W) ? ((pte & PTE_R) ? IOMMU_RW : IOMMU_WO)
>>>> +                                                         : IOMMU_RO;
>>>> +
>>>> +            /* Check MSI GPA address match */
>>>> +            if (pass == S_STAGE && (iotlb->perm & IOMMU_WO) &&
>>>> +                riscv_iommu_msi_check(s, ctx, base)) {
>>>> +                /* Trap MSI writes and return GPA address. */
>>>> +                iotlb->target_as = &s->trap_as;
>>>> +                iotlb->addr_mask = ~TARGET_PAGE_MASK;
>>>> +                return 0;
>>>> +            }
>>
>>
>> This will be suboptimal for cases where IOVA == GPA and we would have an early exit by
>> guessing it right, but I'm more comfortable adhering fully to the spec this time.
>>
>>
>> Thanks,
>>
>> Daniel
>>
>>
>>>> +        iotlb->target_as = &s->trap_as;
>>>> +        iotlb->translated_addr = iotlb->iova;
>>>> +        iotlb->addr_mask = ~TARGET_PAGE_MASK;
>>>> +        return 0;
>>>> +    }
>>>> +
>>>> +    /* Exit early for pass-through mode. */
>>>> +    if (!(en_s || en_g)) {
>>>> +        iotlb->translated_addr = iotlb->iova;
>>>> +        iotlb->addr_mask = ~TARGET_PAGE_MASK;
>>>> +        /* Allow R/W in pass-through mode */
>>>> +        iotlb->perm = IOMMU_RW;
>>>> +        return 0;
>>>> +    }
>>>> +
>>>> +    /* S/G translation parameters. */
>>>> +    for (pass = 0; pass < 2; pass++) {
>>>> +        uint32_t sv_mode;
>>>> +
>>>> +        sc[pass].step = 0;
>>>> +        if (pass ? (s->fctl & RISCV_IOMMU_FCTL_GXL) :
>>>> +            (ctx->tc & RISCV_IOMMU_DC_TC_SXL)) {
>>>> +            /* 32bit mode for GXL/SXL == 1 */
>>>> +            switch (pass ? gatp : satp) {
>>>> +            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
>>>> +                sc[pass].levels    = 0;
>>>> +                sc[pass].ptidxbits = 0;
>>>> +                sc[pass].ptesize   = 0;
>>>> +                break;
>>>> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4:
>>>> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV32X4 : RISCV_IOMMU_CAP_SV32;
>>>> +                if (!(s->cap & sv_mode)) {
>>>> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
>>>> +                }
>>>> +                sc[pass].levels    = 2;
>>>> +                sc[pass].ptidxbits = 10;
>>>> +                sc[pass].ptesize   = 4;
>>>> +                break;
>>>> +            default:
>>>> +                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
>>>> +            }
>>>> +        } else {
>>>> +            /* 64bit mode for GXL/SXL == 0 */
>>>> +            switch (pass ? gatp : satp) {
>>>> +            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
>>>> +                sc[pass].levels    = 0;
>>>> +                sc[pass].ptidxbits = 0;
>>>> +                sc[pass].ptesize   = 0;
>>>> +                break;
>>>> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4:
>>>> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV39X4 : RISCV_IOMMU_CAP_SV39;
>>>> +                if (!(s->cap & sv_mode)) {
>>>> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
>>>> +                }
>>>> +                sc[pass].levels    = 3;
>>>> +                sc[pass].ptidxbits = 9;
>>>> +                sc[pass].ptesize   = 8;
>>>> +                break;
>>>> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4:
>>>> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV48X4 : RISCV_IOMMU_CAP_SV48;
>>>> +                if (!(s->cap & sv_mode)) {
>>>> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
>>>> +                }
>>>> +                sc[pass].levels    = 4;
>>>> +                sc[pass].ptidxbits = 9;
>>>> +                sc[pass].ptesize   = 8;
>>>> +                break;
>>>> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4:
>>>> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV57X4 : RISCV_IOMMU_CAP_SV57;
>>>> +                if (!(s->cap & sv_mode)) {
>>>> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
>>>> +                }
>>>> +                sc[pass].levels    = 5;
>>>> +                sc[pass].ptidxbits = 9;
>>>> +                sc[pass].ptesize   = 8;
>>>> +                break;
>>>> +            default:
>>>> +                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
>>>> +            }
>>>> +        }
>>>> +    };
>>>> +
>>>> +    /* S/G stages translation tables root pointers */
>>>> +    gatp = PPN_PHYS(get_field(ctx->gatp, RISCV_IOMMU_ATP_PPN_FIELD));
>>>> +    satp = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_ATP_PPN_FIELD));
>>>> +    addr = (en_s && en_g) ? satp : iotlb->iova;
>>>> +    base = en_g ? gatp : satp;
>>>> +    pass = en_g ? G_STAGE : S_STAGE;
>>>> +
>>>> +    do {
>>>> +        const unsigned widened = (pass && !sc[pass].step) ? 2 : 0;
>>>> +        const unsigned va_bits = widened + sc[pass].ptidxbits;
>>>> +        const unsigned va_skip = TARGET_PAGE_BITS + sc[pass].ptidxbits *
>>>> +                                 (sc[pass].levels - 1 - sc[pass].step);
>>>> +        const unsigned idx = (addr >> va_skip) & ((1 << va_bits) - 1);
>>>> +        const dma_addr_t pte_addr = base + idx * sc[pass].ptesize;
>>>> +        const bool ade =
>>>> +            ctx->tc & (pass ? RISCV_IOMMU_DC_TC_GADE : RISCV_IOMMU_DC_TC_SADE);
>>>> +
>>>> +        /* Address range check before first level lookup */
>>>> +        if (!sc[pass].step) {
>>>> +            const uint64_t va_mask = (1ULL << (va_skip + va_bits)) - 1;
>>>> +            if ((addr & va_mask) != addr) {
>>>> +                return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
>>>> +            }
>>>> +        }
>>>> +
>>>> +        /* Read page table entry */
>>>> +        if (dma_memory_read(s->target_as, pte_addr, &pte,
>>>> +                sc[pass].ptesize, MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
>>>> +            return (iotlb->perm & IOMMU_WO) ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT
>>>> +                                            : RISCV_IOMMU_FQ_CAUSE_RD_FAULT;
>>>> +        }
>>>> +
>>>> +        if (sc[pass].ptesize == 4) {
>>>> +            pte = (uint64_t) le32_to_cpu(*((uint32_t *)&pte));
>>>> +        } else {
>>>> +            pte = le64_to_cpu(pte);
>>>> +        }
>>>> +
>>>> +        sc[pass].step++;
>>>> +        hwaddr ppn = pte >> PTE_PPN_SHIFT;
>>>> +
>>>> +        if (!(pte & PTE_V)) {
>>>> +            break;                /* Invalid PTE */
>>>> +        } else if (!(pte & (PTE_R | PTE_W | PTE_X))) {
>>>> +            base = PPN_PHYS(ppn); /* Inner PTE, continue walking */
>>>> +        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == PTE_W) {
>>>> +            break;                /* Reserved leaf PTE flags: PTE_W */
>>>> +        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == (PTE_W | PTE_X)) {
>>>> +            break;                /* Reserved leaf PTE flags: PTE_W + PTE_X */
>>>> +        } else if (ppn & ((1ULL << (va_skip - TARGET_PAGE_BITS)) - 1)) {
>>>> +            break;                /* Misaligned PPN */
>>>> +        } else if ((iotlb->perm & IOMMU_RO) && !(pte & PTE_R)) {
>>>> +            break;                /* Read access check failed */
>>>> +        } else if ((iotlb->perm & IOMMU_WO) && !(pte & PTE_W)) {
>>>> +            break;                /* Write access check failed */
>>>> +        } else if ((iotlb->perm & IOMMU_RO) && !ade && !(pte & PTE_A)) {
>>>> +            break;                /* Access bit not set */
>>>> +        } else if ((iotlb->perm & IOMMU_WO) && !ade && !(pte & PTE_D)) {
>>>> +            break;                /* Dirty bit not set */
>>>> +        } else {
>>>> +            /* Leaf PTE, translation completed. */
>>>> +            sc[pass].step = sc[pass].levels;
>>>> +            base = PPN_PHYS(ppn) | (addr & ((1ULL << va_skip) - 1));
>>>> +            /* Update address mask based on smallest translation granularity */
>>>> +            iotlb->addr_mask &= (1ULL << va_skip) - 1;
>>>> +            /* Continue with S-Stage translation? */
>>>> +            if (pass && sc[0].step != sc[0].levels) {
>>>> +                pass = S_STAGE;
>>>> +                addr = iotlb->iova;
>>>> +                continue;
>>>> +            }
>>>> +            /* Translation phase completed (GPA or SPA) */
>>>> +            iotlb->translated_addr = base;
>>>> +            iotlb->perm = (pte & PTE_W) ? ((pte & PTE_R) ? IOMMU_RW : IOMMU_WO)
>>>> +                                                         : IOMMU_RO;
>>>> +
>>>> +            /* Check MSI GPA address match */
>>>> +            if (pass == S_STAGE && (iotlb->perm & IOMMU_WO) &&
>>>> +                riscv_iommu_msi_check(s, ctx, base)) {
>>>> +                /* Trap MSI writes and return GPA address. */
>>>> +                iotlb->target_as = &s->trap_as;
>>>> +                iotlb->addr_mask = ~TARGET_PAGE_MASK;
>>>> +                return 0;
>>>> +            }
>>>> +
>>>> +            /* Continue with G-Stage translation? */
>>>> +            if (!pass && en_g) {
>>>> +                pass = G_STAGE;
>>>> +                addr = base;
>>>> +                base = gatp;
>>>> +                sc[pass].step = 0;
>>>> +                continue;
>>>> +            }
>>>> +
>>>> +            return 0;
>>>> +        }
>>>> +
>>>> +        if (sc[pass].step == sc[pass].levels) {
>>>> +            break; /* Can't find leaf PTE */
>>>> +        }
>>>> +
>>>> +        /* Continue with G-Stage translation? */
>>>> +        if (!pass && en_g) {
>>>> +            pass = G_STAGE;
>>>> +            addr = base;
>>>> +            base = gatp;
>>>> +            sc[pass].step = 0;
>>>> +        }
>>>> +    } while (1);
>>>> +
>>>> +    return (iotlb->perm & IOMMU_WO) ?
>>>> +                (pass ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS :
>>>> +                        RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S) :
>>>> +                (pass ? RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS :
>>>> +                        RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S);
>>>> +}
>>>> +
>>>> +static void riscv_iommu_report_fault(RISCVIOMMUState *s,
>>>> +                                     RISCVIOMMUContext *ctx,
>>>> +                                     uint32_t fault_type, uint32_t cause,
>>>> +                                     bool pv,
>>>> +                                     uint64_t iotval, uint64_t iotval2)
>>>> +{
>>>> +    struct riscv_iommu_fq_record ev = { 0 };
>>>> +
>>>> +    if (ctx->tc & RISCV_IOMMU_DC_TC_DTF) {
>>>> +        switch (cause) {
>>>> +        case RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED:
>>>> +        case RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT:
>>>> +        case RISCV_IOMMU_FQ_CAUSE_DDT_INVALID:
>>>> +        case RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED:
>>>> +        case RISCV_IOMMU_FQ_CAUSE_DDT_CORRUPTED:
>>>> +        case RISCV_IOMMU_FQ_CAUSE_INTERNAL_DP_ERROR:
>>>> +        case RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT:
>>>> +            break;
>>>> +        default:
>>>> +            /* DTF prevents reporting a fault for this given cause */
>>>> +            return;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_CAUSE, cause);
>>>> +    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_TTYPE, fault_type);
>>>> +    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_DID, ctx->devid);
>>>> +    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PV, true);
>>>> +
>>>> +    if (pv) {
>>>> +        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PID, ctx->process_id);
>>>> +    }
>>>> +
>>>> +    ev.iotval = iotval;
>>>> +    ev.iotval2 = iotval2;
>>>> +
>>>> +    riscv_iommu_fault(s, &ev);
>>>> +}
>>>> +
>>>> +/* Redirect MSI write for given GPA. */
>>>> +static MemTxResult riscv_iommu_msi_write(RISCVIOMMUState *s,
>>>> +    RISCVIOMMUContext *ctx, uint64_t gpa, uint64_t data,
>>>> +    unsigned size, MemTxAttrs attrs)
>>>> +{
>>>> +    MemTxResult res;
>>>> +    dma_addr_t addr;
>>>> +    uint64_t intn;
>>>> +    uint32_t n190;
>>>> +    uint64_t pte[2];
>>>> +    int fault_type = RISCV_IOMMU_FQ_TTYPE_UADDR_WR;
>>>> +    int cause;
>>>> +
>>>> +    /* Interrupt File Number */
>>>> +    intn = _pext_u64(PPN_DOWN(gpa), ctx->msi_addr_mask);
>>>> +    if (intn >= 256) {
>>>> +        /* Interrupt file number out of range */
>>>> +        res = MEMTX_ACCESS_ERROR;
>>>> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
>>>> +        goto err;
>>>> +    }
>>>> +
>>>> +    /* fetch MSI PTE */
>>>> +    addr = PPN_PHYS(get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_PPN));
>>>> +    addr = addr | (intn * sizeof(pte));
>>>> +    res = dma_memory_read(s->target_as, addr, &pte, sizeof(pte),
>>>> +            MEMTXATTRS_UNSPECIFIED);
>>>> +    if (res != MEMTX_OK) {
>>>> +        if (res == MEMTX_DECODE_ERROR) {
>>>> +            cause = RISCV_IOMMU_FQ_CAUSE_MSI_PT_CORRUPTED;
>>>> +        } else {
>>>> +            cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
>>>> +        }
>>>> +        goto err;
>>>> +    }
>>>> +
>>>> +    le64_to_cpus(&pte[0]);
>>>> +    le64_to_cpus(&pte[1]);
>>>> +
>>>> +    if (!(pte[0] & RISCV_IOMMU_MSI_PTE_V) || (pte[0] & RISCV_IOMMU_MSI_PTE_C)) {
>>>> +        /*
>>>> +         * The spec mentions that: "If msipte.C == 1, then further
>>>> +         * processing to interpret the PTE is implementation
>>>> +         * defined.". We'll abort with cause = 262 for this
>>>> +         * case too.
>>>> +         */
>>>> +        res = MEMTX_ACCESS_ERROR;
>>>> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_INVALID;
>>>> +        goto err;
>>>> +    }
>>>> +
>>>> +    switch (get_field(pte[0], RISCV_IOMMU_MSI_PTE_M)) {
>>>> +    case RISCV_IOMMU_MSI_PTE_M_BASIC:
>>>> +        /* MSI Pass-through mode */
>>>> +        addr = PPN_PHYS(get_field(pte[0], RISCV_IOMMU_MSI_PTE_PPN));
>>>> +        addr = addr | (gpa & TARGET_PAGE_MASK);
>>>> +
>>>> +        trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
>>>> +                              PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
>>>> +                              gpa, addr);
>>>> +
>>>> +        res = dma_memory_write(s->target_as, addr, &data, size, attrs);
>>>> +        if (res != MEMTX_OK) {
>>>> +            cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
>>>> +            goto err;
>>>> +        }
>>>> +
>>>> +        return MEMTX_OK;
>>>> +    case RISCV_IOMMU_MSI_PTE_M_MRIF:
>>>> +        /* MRIF mode, continue. */
>>>> +        break;
>>>> +    default:
>>>> +        res = MEMTX_ACCESS_ERROR;
>>>> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED;
>>>> +        goto err;
>>>> +    }
>>>> +
>>>> +    /*
>>>> +     * Report an error for interrupt identities exceeding the maximum allowed
>>>> +     * for an IMSIC interrupt file (2047) or destination address is not 32-bit
>>>> +     * aligned. See IOMMU Specification, Chapter 2.3. MSI page tables.
>>>> +     */
>>>> +    if ((data > 2047) || (gpa & 3)) {
>>>> +        res = MEMTX_ACCESS_ERROR;
>>>> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED;
>>>> +        goto err;
>>>> +    }
>>>> +
>>>> +    /* MSI MRIF mode, non atomic pending bit update */
>>>> +
>>>> +    /* MRIF pending bit address */
>>>> +    addr = get_field(pte[0], RISCV_IOMMU_MSI_PTE_MRIF_ADDR) << 9;
>>>> +    addr = addr | ((data & 0x7c0) >> 3);
>>>> +
>>>> +    trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
>>>> +                          PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
>>>> +                          gpa, addr);
>>>> +
>>>> +    /* MRIF pending bit mask */
>>>> +    data = 1ULL << (data & 0x03f);
>>>> +    res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
>>>> +    if (res != MEMTX_OK) {
>>>> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
>>>> +        goto err;
>>>> +    }
>>>> +
>>>> +    intn = intn | data;
>>>> +    res = dma_memory_write(s->target_as, addr, &intn, sizeof(intn), attrs);
>>>> +    if (res != MEMTX_OK) {
>>>> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
>>>> +        goto err;
>>>> +    }
>>>> +
>>>> +    /* Get MRIF enable bits */
>>>> +    addr = addr + sizeof(intn);
>>>> +    res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
>>>> +    if (res != MEMTX_OK) {
>>>> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
>>>> +        goto err;
>>>> +    }
>>>> +
>>>> +    if (!(intn & data)) {
>>>> +        /* notification disabled, MRIF update completed. */
>>>> +        return MEMTX_OK;
>>>> +    }
>>>> +
>>>> +    /* Send notification message */
>>>> +    addr = PPN_PHYS(get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NPPN));
>>>> +    n190 = get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID) |
>>>> +          (get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID_MSB) << 10);
>>>> +
>>>> +    res = dma_memory_write(s->target_as, addr, &n190, sizeof(n190), attrs);
>>>> +    if (res != MEMTX_OK) {
>>>> +        cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
>>>> +        goto err;
>>>> +    }
>>>> +
>>>> +    trace_riscv_iommu_mrif_notification(s->parent_obj.id, n190, addr);
>>>> +
>>>> +    return MEMTX_OK;
>>>> +
>>>> +err:
>>>> +    riscv_iommu_report_fault(s, ctx, fault_type, cause,
>>>> +                             !!ctx->process_id, 0, 0);
>>>> +    return res;
>>>> +}
>>>> +
>>>> +/*
>>>> + * Check device context configuration as described by the
>>>> + * riscv-iommu spec section "Device-context configuration
>>>> + * checks".
>>>> + */
>>>> +static bool riscv_iommu_validate_device_ctx(RISCVIOMMUState *s,
>>>> +                                            RISCVIOMMUContext *ctx)
>>>> +{
>>>> +    uint32_t fsc_mode, msi_mode;
>>>> +
>>>> +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_EN_PRI) &&
>>>> +        ctx->tc & RISCV_IOMMU_DC_TC_PRPR) {
>>>> +        return false;
>>>> +    }
>>>> +
>>>> +    if (!(s->cap & RISCV_IOMMU_CAP_T2GPA) &&
>>>> +        ctx->tc & RISCV_IOMMU_DC_TC_T2GPA) {
>>>> +        return false;
>>>> +    }
>>>> +
>>>> +    if (s->cap & RISCV_IOMMU_CAP_MSI_FLAT) {
>>>> +        msi_mode = get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_MODE);
>>>> +
>>>> +        if (msi_mode != RISCV_IOMMU_DC_MSIPTP_MODE_OFF &&
>>>> +            msi_mode != RISCV_IOMMU_DC_MSIPTP_MODE_FLAT) {
>>>> +            return false;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    fsc_mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
>>>> +
>>>> +    if (ctx->tc & RISCV_IOMMU_DC_TC_PDTV) {
>>>> +        switch (fsc_mode) {
>>>> +        case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8:
>>>> +            if (!(s->cap & RISCV_IOMMU_CAP_PD8)) {
>>>> +                return false;
>>>> +            }
>>>> +            break;
>>>> +        case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD17:
>>>> +            if (!(s->cap & RISCV_IOMMU_CAP_PD17)) {
>>>> +                return false;
>>>> +            }
>>>> +            break;
>>>> +        case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20:
>>>> +            if (!(s->cap & RISCV_IOMMU_CAP_PD20)) {
>>>> +                return false;
>>>> +            }
>>>> +            break;
>>>> +        }
>>>> +    } else {
>>>> +        /* DC.tc.PDTV is 0 */
>>>> +        if (ctx->tc & RISCV_IOMMU_DC_TC_DPE) {
>>>> +            return false;
>>>> +        }
>>>> +
>>>> +        if (ctx->tc & RISCV_IOMMU_DC_TC_SXL) {
>>>> +            if (fsc_mode == RISCV_IOMMU_CAP_SV32 &&
>>>> +                !(s->cap & RISCV_IOMMU_CAP_SV32)) {
>>>> +                return false;
>>>> +            }
>>>> +        } else {
>>>> +            switch (fsc_mode) {
>>>> +            case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
>>>> +                if (!(s->cap & RISCV_IOMMU_CAP_SV39)) {
>>>> +                    return false;
>>>> +                }
>>>> +                break;
>>>> +            case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
>>>> +                if (!(s->cap & RISCV_IOMMU_CAP_SV48)) {
>>>> +                    return false;
>>>> +                }
>>>> +            break;
>>>> +            case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
>>>> +                if (!(s->cap & RISCV_IOMMU_CAP_SV57)) {
>>>> +                    return false;
>>>> +                }
>>>> +                break;
>>>> +            }
>>>> +        }
>>>> +    }
>>>> +
>>>> +    /*
>>>> +     * CAP_END is always zero (only one endianess). FCTL_BE is
>>>> +     * always zero (little-endian accesses). Thus TC_SBE must
>>>> +     * always be LE, i.e. zero.
>>>> +     */
>>>> +    if (ctx->tc & RISCV_IOMMU_DC_TC_SBE) {
>>>> +        return false;
>>>> +    }
>>>> +
>>>> +    return true;
>>>> +}
>>>> +
>>>> +/*
>>>> + * Validate process context (PC) according to section
>>>> + * "Process-context configuration checks".
>>>> + */
>>>> +static bool riscv_iommu_validate_process_ctx(RISCVIOMMUState *s,
>>>> +                                             RISCVIOMMUContext *ctx)
>>>> +{
>>>> +    uint32_t mode;
>>>> +
>>>> +    if (get_field(ctx->ta, RISCV_IOMMU_PC_TA_RESERVED)) {
>>>> +        return false;
>>>> +    }
>>>> +
>>>> +    if (get_field(ctx->satp, RISCV_IOMMU_PC_FSC_RESERVED)) {
>>>> +        return false;
>>>> +    }
>>>> +
>>>> +    mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
>>>> +    switch (mode) {
>>>> +    case RISCV_IOMMU_DC_FSC_MODE_BARE:
>>>> +    /* sv39 and sv32 modes have the same value (8) */
>>>> +    case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
>>>> +    case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
>>>> +    case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
>>>> +        break;
>>>> +    default:
>>>> +        return false;
>>>> +    }
>>>> +
>>>> +    if (ctx->tc & RISCV_IOMMU_DC_TC_SXL) {
>>>> +        if (mode == RISCV_IOMMU_CAP_SV32 &&
>>>> +            !(s->cap & RISCV_IOMMU_CAP_SV32)) {
>>>> +                return false;
>>>> +        }
>>>> +    } else {
>>>> +        switch (mode) {
>>>> +        case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
>>>> +            if (!(s->cap & RISCV_IOMMU_CAP_SV39)) {
>>>> +                return false;
>>>> +            }
>>>> +            break;
>>>> +        case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
>>>> +            if (!(s->cap & RISCV_IOMMU_CAP_SV48)) {
>>>> +                return false;
>>>> +            }
>>>> +            break;
>>>> +        case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
>>>> +            if (!(s->cap & RISCV_IOMMU_CAP_SV57)) {
>>>> +                return false;
>>>> +            }
>>>> +            break;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    return true;
>>>> +}
>>>> +
>>>> +/*
>>>> + * RISC-V IOMMU Device Context Loopkup - Device Directory Tree Walk
>>>> + *
>>>> + * @s         : IOMMU Device State
>>>> + * @ctx       : Device Translation Context with devid and process_id set.
>>>> + * @return    : success or fault code.
>>>> + */
>>>> +static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
>>>> +{
>>>> +    const uint64_t ddtp = s->ddtp;
>>>> +    unsigned mode = get_field(ddtp, RISCV_IOMMU_DDTP_MODE);
>>>> +    dma_addr_t addr = PPN_PHYS(get_field(ddtp, RISCV_IOMMU_DDTP_PPN));
>>>> +    struct riscv_iommu_dc dc;
>>>> +    /* Device Context format: 0: extended (64 bytes) | 1: base (32 bytes) */
>>>> +    const int dc_fmt = !s->enable_msi;
>>>> +    const size_t dc_len = sizeof(dc) >> dc_fmt;
>>>> +    unsigned depth;
>>>> +    uint64_t de;
>>>> +
>>>> +    switch (mode) {
>>>> +    case RISCV_IOMMU_DDTP_MODE_OFF:
>>>> +        return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
>>>> +
>>>> +    case RISCV_IOMMU_DDTP_MODE_BARE:
>>>> +        /* mock up pass-through translation context */
>>>> +        ctx->gatp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
>>>> +            RISCV_IOMMU_DC_IOHGATP_MODE_BARE);
>>>> +        ctx->satp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
>>>> +            RISCV_IOMMU_DC_FSC_MODE_BARE);
>>>> +        ctx->tc = RISCV_IOMMU_DC_TC_V;
>>>> +        ctx->ta = 0;
>>>> +        ctx->msiptp = 0;
>>>> +        return 0;
>>>> +
>>>> +    case RISCV_IOMMU_DDTP_MODE_1LVL:
>>>> +        depth = 0;
>>>> +        break;
>>>> +
>>>> +    case RISCV_IOMMU_DDTP_MODE_2LVL:
>>>> +        depth = 1;
>>>> +        break;
>>>> +
>>>> +    case RISCV_IOMMU_DDTP_MODE_3LVL:
>>>> +        depth = 2;
>>>> +        break;
>>>> +
>>>> +    default:
>>>> +        return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
>>>> +    }
>>>> +
>>>> +    /*
>>>> +     * Check supported device id width (in bits).
>>>> +     * See IOMMU Specification, Chapter 6. Software guidelines.
>>>> +     * - if extended device-context format is used:
>>>> +     *   1LVL: 6, 2LVL: 15, 3LVL: 24
>>>> +     * - if base device-context format is used:
>>>> +     *   1LVL: 7, 2LVL: 16, 3LVL: 24
>>>> +     */
>>>> +    if (ctx->devid >= (1 << (depth * 9 + 6 + (dc_fmt && depth != 2)))) {
>>>> +        return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
>>>> +    }
>>>> +
>>>> +    /* Device directory tree walk */
>>>> +    for (; depth-- > 0; ) {
>>>> +        /*
>>>> +         * Select device id index bits based on device directory tree level
>>>> +         * and device context format.
>>>> +         * See IOMMU Specification, Chapter 2. Data Structures.
>>>> +         * - if extended device-context format is used:
>>>> +         *   device index: [23:15][14:6][5:0]
>>>> +         * - if base device-context format is used:
>>>> +         *   device index: [23:16][15:7][6:0]
>>>> +         */
>>>> +        const int split = depth * 9 + 6 + dc_fmt;
>>>> +        addr |= ((ctx->devid >> split) << 3) & ~TARGET_PAGE_MASK;
>>>> +        if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
>>>> +                            MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
>>>> +            return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
>>>> +        }
>>>> +        le64_to_cpus(&de);
>>>> +        if (!(de & RISCV_IOMMU_DDTE_VALID)) {
>>>> +            /* invalid directory entry */
>>>> +            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
>>>> +        }
>>>> +        if (de & ~(RISCV_IOMMU_DDTE_PPN | RISCV_IOMMU_DDTE_VALID)) {
>>>> +            /* reserved bits set */
>>>> +            return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
>>>> +        }
>>>> +        addr = PPN_PHYS(get_field(de, RISCV_IOMMU_DDTE_PPN));
>>>> +    }
>>>> +
>>>> +    /* index into device context entry page */
>>>> +    addr |= (ctx->devid * dc_len) & ~TARGET_PAGE_MASK;
>>>> +
>>>> +    memset(&dc, 0, sizeof(dc));
>>>> +    if (dma_memory_read(s->target_as, addr, &dc, dc_len,
>>>> +                        MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
>>>> +        return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
>>>> +    }
>>>> +
>>>> +    /* Set translation context. */
>>>> +    ctx->tc = le64_to_cpu(dc.tc);
>>>> +    ctx->gatp = le64_to_cpu(dc.iohgatp);
>>>> +    ctx->satp = le64_to_cpu(dc.fsc);
>>>> +    ctx->ta = le64_to_cpu(dc.ta);
>>>> +    ctx->msiptp = le64_to_cpu(dc.msiptp);
>>>> +    ctx->msi_addr_mask = le64_to_cpu(dc.msi_addr_mask);
>>>> +    ctx->msi_addr_pattern = le64_to_cpu(dc.msi_addr_pattern);
>>>> +
>>>> +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_V)) {
>>>> +        return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
>>>> +    }
>>>> +
>>>> +    if (!riscv_iommu_validate_device_ctx(s, ctx)) {
>>>> +        return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
>>>> +    }
>>>> +
>>>> +    /* FSC field checks */
>>>> +    mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
>>>> +    addr = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_DC_FSC_PPN));
>>>> +
>>>> +    if (!(ctx->tc & RISCV_IOMMU_DC_TC_PDTV)) {
>>>> +        if (ctx->process_id != RISCV_IOMMU_NOPROCID) {
>>>> +            /* PID is disabled */
>>>> +            return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
>>>> +        }
>>>> +        if (mode > RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57) {
>>>> +            /* Invalid translation mode */
>>>> +            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
>>>> +        }
>>>> +        return 0;
>>>> +    }
>>>> +
>>>> +    if (ctx->process_id == RISCV_IOMMU_NOPROCID) {
>>>> +        if (!(ctx->tc & RISCV_IOMMU_DC_TC_DPE)) {
>>>> +            /* No default process_id enabled, set BARE mode */
>>>> +            ctx->satp = 0ULL;
>>>> +            return 0;
>>>> +        } else {
>>>> +            /* Use default process_id #0 */
>>>> +            ctx->process_id = 0;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    if (mode == RISCV_IOMMU_DC_FSC_MODE_BARE) {
>>>> +        /* No S-Stage translation, done. */
>>>> +        return 0;
>>>> +    }
>>>> +
>>>> +    /* FSC.TC.PDTV enabled */
>>>> +    if (mode > RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20) {
>>>> +        /* Invalid PDTP.MODE */
>>>> +        return RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED;
>>>> +    }
>>>> +
>>>> +    for (depth = mode - RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8; depth-- > 0; ) {
>>>> +        /*
>>>> +         * Select process id index bits based on process directory tree
>>>> +         * level. See IOMMU Specification, 2.2. Process-Directory-Table.
>>>> +         */
>>>> +        const int split = depth * 9 + 8;
>>>> +        addr |= ((ctx->process_id >> split) << 3) & ~TARGET_PAGE_MASK;
>>>> +        if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
>>>> +                            MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
>>>> +            return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
>>>> +        }
>>>> +        le64_to_cpus(&de);
>>>> +        if (!(de & RISCV_IOMMU_PC_TA_V)) {
>>>> +            return RISCV_IOMMU_FQ_CAUSE_PDT_INVALID;
>>>> +        }
>>>> +        addr = PPN_PHYS(get_field(de, RISCV_IOMMU_PC_FSC_PPN));
>>>> +    }
>>>> +
>>>> +    /* Leaf entry in PDT */
>>>> +    addr |= (ctx->process_id << 4) & ~TARGET_PAGE_MASK;
>>>> +    if (dma_memory_read(s->target_as, addr, &dc.ta, sizeof(uint64_t) * 2,
>>>> +                        MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
>>>> +        return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
>>>> +    }
>>>> +
>>>> +    /* Use FSC and TA from process directory entry. */
>>>> +    ctx->ta = le64_to_cpu(dc.ta);
>>>> +    ctx->satp = le64_to_cpu(dc.fsc);
>>>> +
>>>> +    if (!(ctx->ta & RISCV_IOMMU_PC_TA_V)) {
>>>> +        return RISCV_IOMMU_FQ_CAUSE_PDT_INVALID;
>>>> +    }
>>>> +
>>>> +    if (!riscv_iommu_validate_process_ctx(s, ctx)) {
>>>> +        return RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED;
>>>> +    }
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +/* Translation Context cache support */
>>>> +static gboolean __ctx_equal(gconstpointer v1, gconstpointer v2)
>>>> +{
>>>> +    RISCVIOMMUContext *c1 = (RISCVIOMMUContext *) v1;
>>>> +    RISCVIOMMUContext *c2 = (RISCVIOMMUContext *) v2;
>>>> +    return c1->devid == c2->devid &&
>>>> +           c1->process_id == c2->process_id;
>>>> +}
>>>> +
>>>> +static guint __ctx_hash(gconstpointer v)
>>>> +{
>>>> +    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) v;
>>>> +    /*
>>>> +     * Generate simple hash of (process_id, devid)
>>>> +     * assuming 24-bit wide devid.
>>>> +     */
>>>> +    return (guint)(ctx->devid) + ((guint)(ctx->process_id) << 24);
>>>> +}
>>>> +
>>>> +static void __ctx_inval_devid_procid(gpointer key, gpointer value,
>>>> +                                     gpointer data)
>>>> +{
>>>> +    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
>>>> +    RISCVIOMMUContext *arg = (RISCVIOMMUContext *) data;
>>>> +    if (ctx->tc & RISCV_IOMMU_DC_TC_V &&
>>>> +        ctx->devid == arg->devid &&
>>>> +        ctx->process_id == arg->process_id) {
>>>> +        ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
>>>> +    }
>>>> +}
>>>> +
>>>> +static void __ctx_inval_devid(gpointer key, gpointer value, gpointer data)
>>>> +{
>>>> +    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
>>>> +    RISCVIOMMUContext *arg = (RISCVIOMMUContext *) data;
>>>> +    if (ctx->tc & RISCV_IOMMU_DC_TC_V &&
>>>> +        ctx->devid == arg->devid) {
>>>> +        ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
>>>> +    }
>>>> +}
>>>> +
>>>> +static void __ctx_inval_all(gpointer key, gpointer value, gpointer data)
>>>> +{
>>>> +    RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
>>>> +    if (ctx->tc & RISCV_IOMMU_DC_TC_V) {
>>>> +        ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
>>>> +    }
>>>> +}
>>>> +
>>>> +static void riscv_iommu_ctx_inval(RISCVIOMMUState *s, GHFunc func,
>>>> +                                  uint32_t devid, uint32_t process_id)
>>>> +{
>>>> +    GHashTable *ctx_cache;
>>>> +    RISCVIOMMUContext key = {
>>>> +        .devid = devid,
>>>> +        .process_id = process_id,
>>>> +    };
>>>> +    ctx_cache = g_hash_table_ref(s->ctx_cache);
>>>> +    qemu_mutex_lock(&s->ctx_lock);
>>>> +    g_hash_table_foreach(ctx_cache, func, &key);
>>>> +    qemu_mutex_unlock(&s->ctx_lock);
>>>> +    g_hash_table_unref(ctx_cache);
>>>> +}
>>>> +
>>>> +/* Find or allocate translation context for a given {device_id, process_id} */
>>>> +static RISCVIOMMUContext *riscv_iommu_ctx(RISCVIOMMUState *s,
>>>> +                                          unsigned devid, unsigned process_id,
>>>> +                                          void **ref)
>>>> +{
>>>> +    GHashTable *ctx_cache;
>>>> +    RISCVIOMMUContext *ctx;
>>>> +    RISCVIOMMUContext key = {
>>>> +        .devid = devid,
>>>> +        .process_id = process_id,
>>>> +    };
>>>> +
>>>> +    ctx_cache = g_hash_table_ref(s->ctx_cache);
>>>> +    qemu_mutex_lock(&s->ctx_lock);
>>>> +    ctx = g_hash_table_lookup(ctx_cache, &key);
>>>> +    qemu_mutex_unlock(&s->ctx_lock);
>>>> +
>>>> +    if (ctx && (ctx->tc & RISCV_IOMMU_DC_TC_V)) {
>>>> +        *ref = ctx_cache;
>>>> +        return ctx;
>>>> +    }
>>>> +
>>>> +    ctx = g_new0(RISCVIOMMUContext, 1);
>>>> +    ctx->devid = devid;
>>>> +    ctx->process_id = process_id;
>>>> +
>>>> +    int fault = riscv_iommu_ctx_fetch(s, ctx);
>>>> +    if (!fault) {
>>>> +        qemu_mutex_lock(&s->ctx_lock);
>>>> +        if (g_hash_table_size(ctx_cache) >= LIMIT_CACHE_CTX) {
>>>> +            g_hash_table_unref(ctx_cache);
>>>> +            ctx_cache = g_hash_table_new_full(__ctx_hash, __ctx_equal,
>>>> +                                              g_free, NULL);
>>>> +            g_hash_table_ref(ctx_cache);
>>>> +            g_hash_table_unref(qatomic_xchg(&s->ctx_cache, ctx_cache));
>>>> +        }
>>>> +        g_hash_table_add(ctx_cache, ctx);
>>>> +        qemu_mutex_unlock(&s->ctx_lock);
>>>> +        *ref = ctx_cache;
>>>> +        return ctx;
>>>> +    }
>>>> +
>>>> +    g_hash_table_unref(ctx_cache);
>>>> +    *ref = NULL;
>>>> +
>>>> +    riscv_iommu_report_fault(s, ctx, RISCV_IOMMU_FQ_TTYPE_UADDR_RD,
>>>> +                             fault, !!process_id, 0, 0);
>>>> +
>>>> +    g_free(ctx);
>>>> +    return NULL;
>>>> +}
>>>> +
>>>> +static void riscv_iommu_ctx_put(RISCVIOMMUState *s, void *ref)
>>>> +{
>>>> +    if (ref) {
>>>> +        g_hash_table_unref((GHashTable *)ref);
>>>> +    }
>>>> +}
>>>> +
>>>> +/* Find or allocate address space for a given device */
>>>> +static AddressSpace *riscv_iommu_space(RISCVIOMMUState *s, uint32_t devid)
>>>> +{
>>>> +    RISCVIOMMUSpace *as;
>>>> +
>>>> +    /* FIXME: PCIe bus remapping for attached endpoints. */
>>>> +    devid |= s->bus << 8;
>>>> +
>>>> +    qemu_mutex_lock(&s->core_lock);
>>>> +    QLIST_FOREACH(as, &s->spaces, list) {
>>>> +        if (as->devid == devid) {
>>>> +            break;
>>>> +        }
>>>> +    }
>>>> +    qemu_mutex_unlock(&s->core_lock);
>>>> +
>>>> +    if (as == NULL) {
>>>> +        char name[64];
>>>> +        as = g_new0(RISCVIOMMUSpace, 1);
>>>> +
>>>> +        as->iommu = s;
>>>> +        as->devid = devid;
>>>> +
>>>> +        snprintf(name, sizeof(name), "riscv-iommu-%04x:%02x.%d-iova",
>>>> +            PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid), PCI_FUNC(as->devid));
>>>> +
>>>> +        /* IOVA address space, untranslated addresses */
>>>> +        memory_region_init_iommu(&as->iova_mr, sizeof(as->iova_mr),
>>>> +            TYPE_RISCV_IOMMU_MEMORY_REGION,
>>>> +            OBJECT(as), "riscv_iommu", UINT64_MAX);
>>>> +        address_space_init(&as->iova_as, MEMORY_REGION(&as->iova_mr), name);
>>>> +
>>>> +        qemu_mutex_lock(&s->core_lock);
>>>> +        QLIST_INSERT_HEAD(&s->spaces, as, list);
>>>> +        qemu_mutex_unlock(&s->core_lock);
>>>> +
>>>> +        trace_riscv_iommu_new(s->parent_obj.id, PCI_BUS_NUM(as->devid),
>>>> +                PCI_SLOT(as->devid), PCI_FUNC(as->devid));
>>>> +    }
>>>> +    return &as->iova_as;
>>>> +}
>>>> +
>>>> +static int riscv_iommu_translate(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
>>>> +    IOMMUTLBEntry *iotlb)
>>>> +{
>>>> +    bool enable_pid;
>>>> +    bool enable_pri;
>>>> +    int fault;
>>>> +
>>>> +    /*
>>>> +     * TC[32] is reserved for custom extensions, used here to temporarily
>>>> +     * enable automatic page-request generation for ATS queries.
>>>> +     */
>>>> +    enable_pri = (iotlb->perm == IOMMU_NONE) && (ctx->tc & BIT_ULL(32));
>>>> +    enable_pid = (ctx->tc & RISCV_IOMMU_DC_TC_PDTV);
>>>> +
>>>> +    /* Translate using device directory / page table information. */
>>>> +    fault = riscv_iommu_spa_fetch(s, ctx, iotlb);
>>>> +
>>>> +    if (enable_pri && fault) {
>>>> +        struct riscv_iommu_pq_record pr = {0};
>>>> +        if (enable_pid) {
>>>> +            pr.hdr = set_field(RISCV_IOMMU_PREQ_HDR_PV,
>>>> +                               RISCV_IOMMU_PREQ_HDR_PID, ctx->process_id);
>>>> +        }
>>>> +        pr.hdr = set_field(pr.hdr, RISCV_IOMMU_PREQ_HDR_DID, ctx->devid);
>>>> +        pr.payload = (iotlb->iova & TARGET_PAGE_MASK) |
>>>> +                     RISCV_IOMMU_PREQ_PAYLOAD_M;
>>>> +        riscv_iommu_pri(s, &pr);
>>>> +        return fault;
>>>> +    }
>>>> +
>>>> +    if (fault) {
>>>> +        unsigned ttype;
>>>> +
>>>> +        if (iotlb->perm & IOMMU_RW) {
>>>> +            ttype = RISCV_IOMMU_FQ_TTYPE_UADDR_WR;
>>>> +        } else {
>>>> +            ttype = RISCV_IOMMU_FQ_TTYPE_UADDR_RD;
>>>> +        }
>>>> +
>>>> +        riscv_iommu_report_fault(s, ctx, ttype, fault, enable_pid,
>>>> +                                 iotlb->iova, iotlb->translated_addr);
>>>> +        return fault;
>>>> +    }
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +/* IOMMU Command Interface */
>>>> +static MemTxResult riscv_iommu_iofence(RISCVIOMMUState *s, bool notify,
>>>> +    uint64_t addr, uint32_t data)
>>>> +{
>>>> +    /*
>>>> +     * ATS processing in this implementation of the IOMMU is synchronous,
>>>> +     * no need to wait for completions here.
>>>> +     */
>>>> +    if (!notify) {
>>>> +        return MEMTX_OK;
>>>> +    }
>>>> +
>>>> +    return dma_memory_write(s->target_as, addr, &data, sizeof(data),
>>>> +        MEMTXATTRS_UNSPECIFIED);
>>>> +}
>>>> +
>>>> +static void riscv_iommu_process_ddtp(RISCVIOMMUState *s)
>>>> +{
>>>> +    uint64_t old_ddtp = s->ddtp;
>>>> +    uint64_t new_ddtp = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_DDTP);
>>>> +    unsigned new_mode = get_field(new_ddtp, RISCV_IOMMU_DDTP_MODE);
>>>> +    unsigned old_mode = get_field(old_ddtp, RISCV_IOMMU_DDTP_MODE);
>>>> +    bool ok = false;
>>>> +
>>>> +    /*
>>>> +     * Check for allowed DDTP.MODE transitions:
>>>> +     * {OFF, BARE}        -> {OFF, BARE, 1LVL, 2LVL, 3LVL}
>>>> +     * {1LVL, 2LVL, 3LVL} -> {OFF, BARE}
>>>> +     */
>>>> +    if (new_mode == old_mode ||
>>>> +        new_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
>>>> +        new_mode == RISCV_IOMMU_DDTP_MODE_BARE) {
>>>> +        ok = true;
>>>> +    } else if (new_mode == RISCV_IOMMU_DDTP_MODE_1LVL ||
>>>> +               new_mode == RISCV_IOMMU_DDTP_MODE_2LVL ||
>>>> +               new_mode == RISCV_IOMMU_DDTP_MODE_3LVL) {
>>>> +        ok = old_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
>>>> +             old_mode == RISCV_IOMMU_DDTP_MODE_BARE;
>>>> +    }
>>>> +
>>>> +    if (ok) {
>>>> +        /* clear reserved and busy bits, report back sanitized version */
>>>> +        new_ddtp = set_field(new_ddtp & RISCV_IOMMU_DDTP_PPN,
>>>> +                             RISCV_IOMMU_DDTP_MODE, new_mode);
>>>> +    } else {
>>>> +        new_ddtp = old_ddtp;
>>>> +    }
>>>> +    s->ddtp = new_ddtp;
>>>> +
>>>> +    riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_DDTP, new_ddtp);
>>>> +}
>>>> +
>>>> +/* Command function and opcode field. */
>>>> +#define RISCV_IOMMU_CMD(func, op) (((func) << 7) | (op))
>>>> +
>>>> +static void riscv_iommu_process_cq_tail(RISCVIOMMUState *s)
>>>> +{
>>>> +    struct riscv_iommu_command cmd;
>>>> +    MemTxResult res;
>>>> +    dma_addr_t addr;
>>>> +    uint32_t tail, head, ctrl;
>>>> +    uint64_t cmd_opcode;
>>>> +    GHFunc func;
>>>> +
>>>> +    ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
>>>> +    tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQT) & s->cq_mask;
>>>> +    head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQH) & s->cq_mask;
>>>> +
>>>> +    /* Check for pending error or queue processing disabled */
>>>> +    if (!(ctrl & RISCV_IOMMU_CQCSR_CQON) ||
>>>> +        !!(ctrl & (RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CQMF))) {
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    while (tail != head) {
>>>> +        addr = s->cq_addr  + head * sizeof(cmd);
>>>> +        res = dma_memory_read(s->target_as, addr, &cmd, sizeof(cmd),
>>>> +                              MEMTXATTRS_UNSPECIFIED);
>>>> +
>>>> +        if (res != MEMTX_OK) {
>>>> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
>>>> +                                  RISCV_IOMMU_CQCSR_CQMF, 0);
>>>> +            goto fault;
>>>> +        }
>>>> +
>>>> +        trace_riscv_iommu_cmd(s->parent_obj.id, cmd.dword0, cmd.dword1);
>>>> +
>>>> +        cmd_opcode = get_field(cmd.dword0,
>>>> +                               RISCV_IOMMU_CMD_OPCODE | RISCV_IOMMU_CMD_FUNC);
>>>> +
>>>> +        switch (cmd_opcode) {
>>>> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOFENCE_FUNC_C,
>>>> +                             RISCV_IOMMU_CMD_IOFENCE_OPCODE):
>>>> +            res = riscv_iommu_iofence(s,
>>>> +                cmd.dword0 & RISCV_IOMMU_CMD_IOFENCE_AV, cmd.dword1,
>>>> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IOFENCE_DATA));
>>>> +
>>>> +            if (res != MEMTX_OK) {
>>>> +                riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
>>>> +                                      RISCV_IOMMU_CQCSR_CQMF, 0);
>>>> +                goto fault;
>>>> +            }
>>>> +            break;
>>>> +
>>>> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA,
>>>> +                             RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
>>>> +            if (cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_PSCV) {
>>>> +                /* illegal command arguments IOTINVAL.GVMA & PSCV == 1 */
>>>> +                goto cmd_ill;
>>>> +            }
>>>> +            /* translation cache not implemented yet */
>>>> +            break;
>>>> +
>>>> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA,
>>>> +                             RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
>>>> +            /* translation cache not implemented yet */
>>>> +            break;
>>>> +
>>>> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT,
>>>> +                             RISCV_IOMMU_CMD_IODIR_OPCODE):
>>>> +            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
>>>> +                /* invalidate all device context cache mappings */
>>>> +                func = __ctx_inval_all;
>>>> +            } else {
>>>> +                /* invalidate all device context matching DID */
>>>> +                func = __ctx_inval_devid;
>>>> +            }
>>>> +            riscv_iommu_ctx_inval(s, func,
>>>> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID), 0);
>>>> +            break;
>>>> +
>>>> +        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT,
>>>> +                             RISCV_IOMMU_CMD_IODIR_OPCODE):
>>>> +            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
>>>> +                /* illegal command arguments IODIR_PDT & DV == 0 */
>>>> +                goto cmd_ill;
>>>> +            } else {
>>>> +                func = __ctx_inval_devid_procid;
>>>> +            }
>>>> +            riscv_iommu_ctx_inval(s, func,
>>>> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID),
>>>> +                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_PID));
>>>> +            break;
>>>> +
>>>> +        default:
>>>> +        cmd_ill:
>>>> +            /* Invalid instruction, do not advance instruction index. */
>>>> +            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
>>>> +                RISCV_IOMMU_CQCSR_CMD_ILL, 0);
>>>> +            goto fault;
>>>> +        }
>>>> +
>>>> +        /* Advance and update head pointer after command completes. */
>>>> +        head = (head + 1) & s->cq_mask;
>>>> +        riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_CQH, head);
>>>> +    }
>>>> +    return;
>>>> +
>>>> +fault:
>>>> +    if (ctrl & RISCV_IOMMU_CQCSR_CIE) {
>>>> +        riscv_iommu_notify(s, RISCV_IOMMU_INTR_CQ);
>>>> +    }
>>>> +}
>>>> +
>>>> +static void riscv_iommu_process_cq_control(RISCVIOMMUState *s)
>>>> +{
>>>> +    uint64_t base;
>>>> +    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
>>>> +    uint32_t ctrl_clr;
>>>> +    bool enable = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQEN);
>>>> +    bool active = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQON);
>>>> +
>>>> +    if (enable && !active) {
>>>> +        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_CQB);
>>>> +        s->cq_mask = (2ULL << get_field(base, RISCV_IOMMU_CQB_LOG2SZ)) - 1;
>>>> +        s->cq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_CQB_PPN));
>>>> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~s->cq_mask);
>>>> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQH], 0);
>>>> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQT], 0);
>>>> +        ctrl_set = RISCV_IOMMU_CQCSR_CQON;
>>>> +        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQMF |
>>>> +                   RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CMD_TO |
>>>> +                   RISCV_IOMMU_CQCSR_FENCE_W_IP;
>>>> +    } else if (!enable && active) {
>>>> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~0);
>>>> +        ctrl_set = 0;
>>>> +        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQON;
>>>> +    } else {
>>>> +        ctrl_set = 0;
>>>> +        ctrl_clr = RISCV_IOMMU_CQCSR_BUSY;
>>>> +    }
>>>> +
>>>> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR, ctrl_set, ctrl_clr);
>>>> +}
>>>> +
>>>> +static void riscv_iommu_process_fq_control(RISCVIOMMUState *s)
>>>> +{
>>>> +    uint64_t base;
>>>> +    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
>>>> +    uint32_t ctrl_clr;
>>>> +    bool enable = !!(ctrl_set & RISCV_IOMMU_FQCSR_FQEN);
>>>> +    bool active = !!(ctrl_set & RISCV_IOMMU_FQCSR_FQON);
>>>> +
>>>> +    if (enable && !active) {
>>>> +        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_FQB);
>>>> +        s->fq_mask = (2ULL << get_field(base, RISCV_IOMMU_FQB_LOG2SZ)) - 1;
>>>> +        s->fq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_FQB_PPN));
>>>> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~s->fq_mask);
>>>> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQH], 0);
>>>> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQT], 0);
>>>> +        ctrl_set = RISCV_IOMMU_FQCSR_FQON;
>>>> +        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQMF |
>>>> +            RISCV_IOMMU_FQCSR_FQOF;
>>>> +    } else if (!enable && active) {
>>>> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~0);
>>>> +        ctrl_set = 0;
>>>> +        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQON;
>>>> +    } else {
>>>> +        ctrl_set = 0;
>>>> +        ctrl_clr = RISCV_IOMMU_FQCSR_BUSY;
>>>> +    }
>>>> +
>>>> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR, ctrl_set, ctrl_clr);
>>>> +}
>>>> +
>>>> +static void riscv_iommu_process_pq_control(RISCVIOMMUState *s)
>>>> +{
>>>> +    uint64_t base;
>>>> +    uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
>>>> +    uint32_t ctrl_clr;
>>>> +    bool enable = !!(ctrl_set & RISCV_IOMMU_PQCSR_PQEN);
>>>> +    bool active = !!(ctrl_set & RISCV_IOMMU_PQCSR_PQON);
>>>> +
>>>> +    if (enable && !active) {
>>>> +        base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_PQB);
>>>> +        s->pq_mask = (2ULL << get_field(base, RISCV_IOMMU_PQB_LOG2SZ)) - 1;
>>>> +        s->pq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_PQB_PPN));
>>>> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~s->pq_mask);
>>>> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQH], 0);
>>>> +        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQT], 0);
>>>> +        ctrl_set = RISCV_IOMMU_PQCSR_PQON;
>>>> +        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQMF |
>>>> +            RISCV_IOMMU_PQCSR_PQOF;
>>>> +    } else if (!enable && active) {
>>>> +        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~0);
>>>> +        ctrl_set = 0;
>>>> +        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQON;
>>>> +    } else {
>>>> +        ctrl_set = 0;
>>>> +        ctrl_clr = RISCV_IOMMU_PQCSR_BUSY;
>>>> +    }
>>>> +
>>>> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR, ctrl_set, ctrl_clr);
>>>> +}
>>>> +
>>>> +static void riscv_iommu_process_icvec_update(RISCVIOMMUState *s)
>>>> +{
>>>> +    uint32_t icvec = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_ICVEC);
>>>> +
>>>> +    s->icvec_update(s,
>>>> +                    riscv_iommu_get_icvec_vector(icvec, RISCV_IOMMU_INTR_CQ),
>>>> +                    riscv_iommu_get_icvec_vector(icvec, RISCV_IOMMU_INTR_FQ),
>>>> +                    riscv_iommu_get_icvec_vector(icvec, RISCV_IOMMU_INTR_PM),
>>>> +                    riscv_iommu_get_icvec_vector(icvec, RISCV_IOMMU_INTR_PQ));
>>>> +}
>>>> +
>>>> +typedef void riscv_iommu_process_fn(RISCVIOMMUState *s);
>>>> +
>>>> +static void riscv_iommu_update_ipsr(RISCVIOMMUState *s, uint64_t data)
>>>> +{
>>>> +    uint32_t cqcsr, fqcsr, pqcsr;
>>>> +    uint32_t ipsr_set = 0;
>>>> +    uint32_t ipsr_clr = 0;
>>>> +
>>>> +    if (data & RISCV_IOMMU_IPSR_CIP) {
>>>> +        cqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
>>>> +
>>>> +        if (cqcsr & RISCV_IOMMU_CQCSR_CIE &&
>>>> +            (cqcsr & RISCV_IOMMU_CQCSR_FENCE_W_IP ||
>>>> +             cqcsr & RISCV_IOMMU_CQCSR_CMD_ILL ||
>>>> +             cqcsr & RISCV_IOMMU_CQCSR_CMD_TO ||
>>>> +             cqcsr & RISCV_IOMMU_CQCSR_CQMF)) {
>>>> +            ipsr_set |= RISCV_IOMMU_IPSR_CIP;
>>>> +        } else {
>>>> +            ipsr_clr |= RISCV_IOMMU_IPSR_CIP;
>>>> +        }
>>>> +    } else {
>>>> +        ipsr_clr |= RISCV_IOMMU_IPSR_CIP;
>>>> +    }
>>>> +
>>>> +    if (data & RISCV_IOMMU_IPSR_FIP) {
>>>> +        fqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
>>>> +
>>>> +        if (fqcsr & RISCV_IOMMU_FQCSR_FIE &&
>>>> +            (fqcsr & RISCV_IOMMU_FQCSR_FQOF ||
>>>> +             fqcsr & RISCV_IOMMU_FQCSR_FQMF)) {
>>>> +            ipsr_set |= RISCV_IOMMU_IPSR_FIP;
>>>> +        } else {
>>>> +            ipsr_clr |= RISCV_IOMMU_IPSR_FIP;
>>>> +        }
>>>> +    } else {
>>>> +        ipsr_clr |= RISCV_IOMMU_IPSR_FIP;
>>>> +    }
>>>> +
>>>> +    if (data & RISCV_IOMMU_IPSR_PIP) {
>>>> +        pqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
>>>> +
>>>> +        if (pqcsr & RISCV_IOMMU_PQCSR_PIE &&
>>>> +            (pqcsr & RISCV_IOMMU_PQCSR_PQOF ||
>>>> +             pqcsr & RISCV_IOMMU_PQCSR_PQMF)) {
>>>> +            ipsr_set |= RISCV_IOMMU_IPSR_PIP;
>>>> +        } else {
>>>> +            ipsr_clr |= RISCV_IOMMU_IPSR_PIP;
>>>> +        }
>>>> +    } else {
>>>> +        ipsr_clr |= RISCV_IOMMU_IPSR_PIP;
>>>> +    }
>>>> +
>>>> +    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, ipsr_set, ipsr_clr);
>>>> +}
>>>> +
>>>> +static MemTxResult riscv_iommu_mmio_write(void *opaque, hwaddr addr,
>>>> +    uint64_t data, unsigned size, MemTxAttrs attrs)
>>>> +{
>>>> +    riscv_iommu_process_fn *process_fn = NULL;
>>>> +    RISCVIOMMUState *s = opaque;
>>>> +    uint32_t regb = addr & ~3;
>>>> +    uint32_t busy = 0;
>>>> +    uint64_t val = 0;
>>>> +
>>>> +    if ((addr & (size - 1)) != 0) {
>>>> +        /* Unsupported MMIO alignment or access size */
>>>> +        return MEMTX_ERROR;
>>>> +    }
>>>> +
>>>> +    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
>>>> +        /* Unsupported MMIO access location. */
>>>> +        return MEMTX_ACCESS_ERROR;
>>>> +    }
>>>> +
>>>> +    /* Track actionable MMIO write. */
>>>> +    switch (regb) {
>>>> +    case RISCV_IOMMU_REG_DDTP:
>>>> +    case RISCV_IOMMU_REG_DDTP + 4:
>>>> +        process_fn = riscv_iommu_process_ddtp;
>>>> +        regb = RISCV_IOMMU_REG_DDTP;
>>>> +        busy = RISCV_IOMMU_DDTP_BUSY;
>>>> +        break;
>>>> +
>>>> +    case RISCV_IOMMU_REG_CQT:
>>>> +        process_fn = riscv_iommu_process_cq_tail;
>>>> +        break;
>>>> +
>>>> +    case RISCV_IOMMU_REG_CQCSR:
>>>> +        process_fn = riscv_iommu_process_cq_control;
>>>> +        busy = RISCV_IOMMU_CQCSR_BUSY;
>>>> +        break;
>>>> +
>>>> +    case RISCV_IOMMU_REG_FQCSR:
>>>> +        process_fn = riscv_iommu_process_fq_control;
>>>> +        busy = RISCV_IOMMU_FQCSR_BUSY;
>>>> +        break;
>>>> +
>>>> +    case RISCV_IOMMU_REG_PQCSR:
>>>> +        process_fn = riscv_iommu_process_pq_control;
>>>> +        busy = RISCV_IOMMU_PQCSR_BUSY;
>>>> +        break;
>>>> +
>>>> +    case RISCV_IOMMU_REG_ICVEC:
>>>> +        process_fn = riscv_iommu_process_icvec_update;
>>>> +        break;
>>>> +
>>>> +    case RISCV_IOMMU_REG_IPSR:
>>>> +        /*
>>>> +         * IPSR has a special update procedure. Execute it
>>>> +         * and exit.
>>>> +         */
>>>> +        if (size == 4) {
>>>> +            uint32_t ro = ldl_le_p(&s->regs_ro[addr]);
>>>> +            uint32_t wc = ldl_le_p(&s->regs_wc[addr]);
>>>> +            uint32_t rw = ldl_le_p(&s->regs_rw[addr]);
>>>> +            stl_le_p(&val, ((rw & ro) | (data & ~ro)) & ~(data & wc));
>>>> +        } else if (size == 8) {
>>>> +            uint64_t ro = ldq_le_p(&s->regs_ro[addr]);
>>>> +            uint64_t wc = ldq_le_p(&s->regs_wc[addr]);
>>>> +            uint64_t rw = ldq_le_p(&s->regs_rw[addr]);
>>>> +            stq_le_p(&val, ((rw & ro) | (data & ~ro)) & ~(data & wc));
>>>> +        }
>>>> +
>>>> +        riscv_iommu_update_ipsr(s, val);
>>>> +
>>>> +        return MEMTX_OK;
>>>> +
>>>> +    default:
>>>> +        break;
>>>> +    }
>>>> +
>>>> +    /*
>>>> +     * Register updates might not be synchronized with core logic.
>>>> +     * If system software updates a register while the relevant BUSY
>>>> +     * bit is set, the IOMMU behavior of additional writes to that
>>>> +     * register is UNSPECIFIED.
>>>> +     */
>>>> +    qemu_spin_lock(&s->regs_lock);
>>>> +    if (size == 1) {
>>>> +        uint8_t ro = s->regs_ro[addr];
>>>> +        uint8_t wc = s->regs_wc[addr];
>>>> +        uint8_t rw = s->regs_rw[addr];
>>>> +        s->regs_rw[addr] = ((rw & ro) | (data & ~ro)) & ~(data & wc);
>>>> +    } else if (size == 2) {
>>>> +        uint16_t ro = lduw_le_p(&s->regs_ro[addr]);
>>>> +        uint16_t wc = lduw_le_p(&s->regs_wc[addr]);
>>>> +        uint16_t rw = lduw_le_p(&s->regs_rw[addr]);
>>>> +        stw_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
>>>> +    } else if (size == 4) {
>>>> +        uint32_t ro = ldl_le_p(&s->regs_ro[addr]);
>>>> +        uint32_t wc = ldl_le_p(&s->regs_wc[addr]);
>>>> +        uint32_t rw = ldl_le_p(&s->regs_rw[addr]);
>>>> +        stl_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
>>>> +    } else if (size == 8) {
>>>> +        uint64_t ro = ldq_le_p(&s->regs_ro[addr]);
>>>> +        uint64_t wc = ldq_le_p(&s->regs_wc[addr]);
>>>> +        uint64_t rw = ldq_le_p(&s->regs_rw[addr]);
>>>> +        stq_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
>>>> +    }
>>>> +
>>>> +    /* Busy flag update, MSB 4-byte register. */
>>>> +    if (busy) {
>>>> +        uint32_t rw = ldl_le_p(&s->regs_rw[regb]);
>>>> +        stl_le_p(&s->regs_rw[regb], rw | busy);
>>>> +    }
>>>> +    qemu_spin_unlock(&s->regs_lock);
>>>> +
>>>> +    if (process_fn) {
>>>> +        qemu_mutex_lock(&s->core_lock);
>>>> +        process_fn(s);
>>>> +        qemu_mutex_unlock(&s->core_lock);
>>>> +    }
>>>> +
>>>> +    return MEMTX_OK;
>>>> +}
>>>> +
>>>> +static MemTxResult riscv_iommu_mmio_read(void *opaque, hwaddr addr,
>>>> +    uint64_t *data, unsigned size, MemTxAttrs attrs)
>>>> +{
>>>> +    RISCVIOMMUState *s = opaque;
>>>> +    uint64_t val = -1;
>>>> +    uint8_t *ptr;
>>>> +
>>>> +    if ((addr & (size - 1)) != 0) {
>>>> +        /* Unsupported MMIO alignment. */
>>>> +        return MEMTX_ERROR;
>>>> +    }
>>>> +
>>>> +    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
>>>> +        return MEMTX_ACCESS_ERROR;
>>>> +    }
>>>> +
>>>> +    ptr = &s->regs_rw[addr];
>>>> +
>>>> +    if (size == 1) {
>>>> +        val = (uint64_t)*ptr;
>>>> +    } else if (size == 2) {
>>>> +        val = lduw_le_p(ptr);
>>>> +    } else if (size == 4) {
>>>> +        val = ldl_le_p(ptr);
>>>> +    } else if (size == 8) {
>>>> +        val = ldq_le_p(ptr);
>>>> +    } else {
>>>> +        return MEMTX_ERROR;
>>>> +    }
>>>> +
>>>> +    *data = val;
>>>> +
>>>> +    return MEMTX_OK;
>>>> +}
>>>> +
>>>> +static const MemoryRegionOps riscv_iommu_mmio_ops = {
>>>> +    .read_with_attrs = riscv_iommu_mmio_read,
>>>> +    .write_with_attrs = riscv_iommu_mmio_write,
>>>> +    .endianness = DEVICE_NATIVE_ENDIAN,
>>>> +    .impl = {
>>>> +        .min_access_size = 4,
>>>> +        .max_access_size = 8,
>>>> +        .unaligned = false,
>>>> +    },
>>>> +    .valid = {
>>>> +        .min_access_size = 4,
>>>> +        .max_access_size = 8,
>>>> +    }
>>>> +};
>>>> +
>>>> +/*
>>>> + * Translations matching MSI pattern check are redirected to "riscv-iommu-trap"
>>>> + * memory region as untranslated address, for additional MSI/MRIF interception
>>>> + * by IOMMU interrupt remapping implementation.
>>>> + * Note: Device emulation code generating an MSI is expected to provide valid
>>>> + * memory transaction attributes with requester_id set.
>>>> + */
>>>> +static MemTxResult riscv_iommu_trap_write(void *opaque, hwaddr addr,
>>>> +    uint64_t data, unsigned size, MemTxAttrs attrs)
>>>> +{
>>>> +    RISCVIOMMUState* s = (RISCVIOMMUState *)opaque;
>>>> +    RISCVIOMMUContext *ctx;
>>>> +    MemTxResult res;
>>>> +    void *ref;
>>>> +    uint32_t devid = attrs.requester_id;
>>>> +
>>>> +    if (attrs.unspecified) {
>>>> +        return MEMTX_ACCESS_ERROR;
>>>> +    }
>>>> +
>>>> +    /* FIXME: PCIe bus remapping for attached endpoints. */
>>>> +    devid |= s->bus << 8;
>>>> +
>>>> +    ctx = riscv_iommu_ctx(s, devid, 0, &ref);
>>>> +    if (ctx == NULL) {
>>>> +        res = MEMTX_ACCESS_ERROR;
>>>> +    } else {
>>>> +        res = riscv_iommu_msi_write(s, ctx, addr, data, size, attrs);
>>>> +    }
>>>> +    riscv_iommu_ctx_put(s, ref);
>>>> +    return res;
>>>> +}
>>>> +
>>>> +static MemTxResult riscv_iommu_trap_read(void *opaque, hwaddr addr,
>>>> +    uint64_t *data, unsigned size, MemTxAttrs attrs)
>>>> +{
>>>> +    return MEMTX_ACCESS_ERROR;
>>>> +}
>>>> +
>>>> +static const MemoryRegionOps riscv_iommu_trap_ops = {
>>>> +    .read_with_attrs = riscv_iommu_trap_read,
>>>> +    .write_with_attrs = riscv_iommu_trap_write,
>>>> +    .endianness = DEVICE_LITTLE_ENDIAN,
>>>> +    .impl = {
>>>> +        .min_access_size = 4,
>>>> +        .max_access_size = 8,
>>>> +        .unaligned = true,
>>>> +    },
>>>> +    .valid = {
>>>> +        .min_access_size = 4,
>>>> +        .max_access_size = 8,
>>>> +    }
>>>> +};
>>>> +
>>>> +static void riscv_iommu_realize(DeviceState *dev, Error **errp)
>>>> +{
>>>> +    RISCVIOMMUState *s = RISCV_IOMMU(dev);
>>>> +
>>>> +    s->cap = s->version & RISCV_IOMMU_CAP_VERSION;
>>>> +    if (s->enable_msi) {
>>>> +        s->cap |= RISCV_IOMMU_CAP_MSI_FLAT | RISCV_IOMMU_CAP_MSI_MRIF;
>>>> +    }
>>>> +    if (s->enable_s_stage) {
>>>> +        s->cap |= RISCV_IOMMU_CAP_SV32 | RISCV_IOMMU_CAP_SV39 |
>>>> +                  RISCV_IOMMU_CAP_SV48 | RISCV_IOMMU_CAP_SV57;
>>>> +    }
>>>> +    if (s->enable_g_stage) {
>>>> +        s->cap |= RISCV_IOMMU_CAP_SV32X4 | RISCV_IOMMU_CAP_SV39X4 |
>>>> +                  RISCV_IOMMU_CAP_SV48X4 | RISCV_IOMMU_CAP_SV57X4;
>>>> +    }
>>>> +    /* Report QEMU target physical address space limits */
>>>> +    s->cap = set_field(s->cap, RISCV_IOMMU_CAP_PAS,
>>>> +                       TARGET_PHYS_ADDR_SPACE_BITS);
>>>> +
>>>> +    /* TODO: method to report supported PID bits */
>>>> +    s->pid_bits = 8; /* restricted to size of MemTxAttrs.pid */
>>>> +    s->cap |= RISCV_IOMMU_CAP_PD8;
>>>> +
>>>> +    /* Out-of-reset translation mode: OFF (DMA disabled) BARE (passthrough) */
>>>> +    s->ddtp = set_field(0, RISCV_IOMMU_DDTP_MODE, s->enable_off ?
>>>> +                        RISCV_IOMMU_DDTP_MODE_OFF : RISCV_IOMMU_DDTP_MODE_BARE);
>>>> +
>>>> +    /* register storage */
>>>> +    s->regs_rw = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
>>>> +    s->regs_ro = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
>>>> +    s->regs_wc = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
>>>> +
>>>> +    /* Mark all registers read-only */
>>>> +    memset(s->regs_ro, 0xff, RISCV_IOMMU_REG_SIZE);
>>>> +
>>>> +    /*
>>>> +     * Register complete MMIO space, including MSI/PBA registers.
>>>> +     * Note, PCIDevice implementation will add overlapping MR for MSI/PBA,
>>>> +     * managed directly by the PCIDevice implementation.
>>>> +     */
>>>> +    memory_region_init_io(&s->regs_mr, OBJECT(dev), &riscv_iommu_mmio_ops, s,
>>>> +        "riscv-iommu-regs", RISCV_IOMMU_REG_SIZE);
>>>> +
>>>> +    /* Set power-on register state */
>>>> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_CAP], s->cap);
>>>> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_FCTL], 0);
>>>> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FCTL],
>>>> +             ~(RISCV_IOMMU_FCTL_BE | RISCV_IOMMU_FCTL_WSI));
>>>> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_DDTP],
>>>> +        ~(RISCV_IOMMU_DDTP_PPN | RISCV_IOMMU_DDTP_MODE));
>>>> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQB],
>>>> +        ~(RISCV_IOMMU_CQB_LOG2SZ | RISCV_IOMMU_CQB_PPN));
>>>> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQB],
>>>> +        ~(RISCV_IOMMU_FQB_LOG2SZ | RISCV_IOMMU_FQB_PPN));
>>>> +    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQB],
>>>> +        ~(RISCV_IOMMU_PQB_LOG2SZ | RISCV_IOMMU_PQB_PPN));
>>>> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQMF |
>>>> +        RISCV_IOMMU_CQCSR_CMD_TO | RISCV_IOMMU_CQCSR_CMD_ILL);
>>>> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQON |
>>>> +        RISCV_IOMMU_CQCSR_BUSY);
>>>> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQMF |
>>>> +        RISCV_IOMMU_FQCSR_FQOF);
>>>> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQON |
>>>> +        RISCV_IOMMU_FQCSR_BUSY);
>>>> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQMF |
>>>> +        RISCV_IOMMU_PQCSR_PQOF);
>>>> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQON |
>>>> +        RISCV_IOMMU_PQCSR_BUSY);
>>>> +    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_IPSR], ~0);
>>>> +    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_ICVEC], 0);
>>>> +    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_DDTP], s->ddtp);
>>>> +
>>>> +    /* Memory region for downstream access, if specified. */
>>>> +    if (s->target_mr) {
>>>> +        s->target_as = g_new0(AddressSpace, 1);
>>>> +        address_space_init(s->target_as, s->target_mr,
>>>> +            "riscv-iommu-downstream");
>>>> +    } else {
>>>> +        /* Fallback to global system memory. */
>>>> +        s->target_as = &address_space_memory;
>>>> +    }
>>>> +
>>>> +    /* Memory region for untranslated MRIF/MSI writes */
>>>> +    memory_region_init_io(&s->trap_mr, OBJECT(dev), &riscv_iommu_trap_ops, s,
>>>> +            "riscv-iommu-trap", ~0ULL);
>>>> +    address_space_init(&s->trap_as, &s->trap_mr, "riscv-iommu-trap-as");
>>>> +
>>>> +    /* Device translation context cache */
>>>> +    s->ctx_cache = g_hash_table_new_full(__ctx_hash, __ctx_equal,
>>>> +                                         g_free, NULL);
>>>> +    qemu_mutex_init(&s->ctx_lock);
>>>> +
>>>> +    s->iommus.le_next = NULL;
>>>> +    s->iommus.le_prev = NULL;
>>>> +    QLIST_INIT(&s->spaces);
>>>> +    qemu_mutex_init(&s->core_lock);
>>>> +    qemu_spin_init(&s->regs_lock);
>>>> +}
>>>> +
>>>> +static void riscv_iommu_unrealize(DeviceState *dev)
>>>> +{
>>>> +    RISCVIOMMUState *s = RISCV_IOMMU(dev);
>>>> +
>>>> +    qemu_mutex_destroy(&s->core_lock);
>>>> +    g_hash_table_unref(s->ctx_cache);
>>>> +}
>>>> +
>>>> +static Property riscv_iommu_properties[] = {
>>>> +    DEFINE_PROP_UINT32("version", RISCVIOMMUState, version,
>>>> +        RISCV_IOMMU_SPEC_DOT_VER),
>>>> +    DEFINE_PROP_UINT32("bus", RISCVIOMMUState, bus, 0x0),
>>>> +    DEFINE_PROP_BOOL("intremap", RISCVIOMMUState, enable_msi, TRUE),
>>>> +    DEFINE_PROP_BOOL("off", RISCVIOMMUState, enable_off, TRUE),
>>>> +    DEFINE_PROP_BOOL("s-stage", RISCVIOMMUState, enable_s_stage, TRUE),
>>>> +    DEFINE_PROP_BOOL("g-stage", RISCVIOMMUState, enable_g_stage, TRUE),
>>>> +    DEFINE_PROP_LINK("downstream-mr", RISCVIOMMUState, target_mr,
>>>> +        TYPE_MEMORY_REGION, MemoryRegion *),
>>>> +    DEFINE_PROP_END_OF_LIST(),
>>>> +};
>>>> +
>>>> +static void riscv_iommu_class_init(ObjectClass *klass, void* data)
>>>> +{
>>>> +    DeviceClass *dc = DEVICE_CLASS(klass);
>>>> +
>>>> +    /* internal device for riscv-iommu-{pci/sys}, not user-creatable */
>>>> +    dc->user_creatable = false;
>>>> +    dc->realize = riscv_iommu_realize;
>>>> +    dc->unrealize = riscv_iommu_unrealize;
>>>> +    device_class_set_props(dc, riscv_iommu_properties);
>>>> +}
>>>> +
>>>> +static const TypeInfo riscv_iommu_info = {
>>>> +    .name = TYPE_RISCV_IOMMU,
>>>> +    .parent = TYPE_DEVICE,
>>>> +    .instance_size = sizeof(RISCVIOMMUState),
>>>> +    .class_init = riscv_iommu_class_init,
>>>> +};
>>>> +
>>>> +static const char *IOMMU_FLAG_STR[] = {
>>>> +    "NA",
>>>> +    "RO",
>>>> +    "WR",
>>>> +    "RW",
>>>> +};
>>>> +
>>>> +/* RISC-V IOMMU Memory Region - Address Translation Space */
>>>> +static IOMMUTLBEntry riscv_iommu_memory_region_translate(
>>>> +    IOMMUMemoryRegion *iommu_mr, hwaddr addr,
>>>> +    IOMMUAccessFlags flag, int iommu_idx)
>>>> +{
>>>> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
>>>> +    RISCVIOMMUContext *ctx;
>>>> +    void *ref;
>>>> +    IOMMUTLBEntry iotlb = {
>>>> +        .iova = addr,
>>>> +        .target_as = as->iommu->target_as,
>>>> +        .addr_mask = ~0ULL,
>>>> +        .perm = flag,
>>>> +    };
>>>> +
>>>> +    ctx = riscv_iommu_ctx(as->iommu, as->devid, iommu_idx, &ref);
>>>> +    if (ctx == NULL) {
>>>> +        /* Translation disabled or invalid. */
>>>> +        iotlb.addr_mask = 0;
>>>> +        iotlb.perm = IOMMU_NONE;
>>>> +    } else if (riscv_iommu_translate(as->iommu, ctx, &iotlb)) {
>>>> +        /* Translation disabled or fault reported. */
>>>> +        iotlb.addr_mask = 0;
>>>> +        iotlb.perm = IOMMU_NONE;
>>>> +    }
>>>> +
>>>> +    /* Trace all DMA translations with original access flags. */
>>>> +    trace_riscv_iommu_dma(as->iommu->parent_obj.id, PCI_BUS_NUM(as->devid),
>>>> +                          PCI_SLOT(as->devid), PCI_FUNC(as->devid), iommu_idx,
>>>> +                          IOMMU_FLAG_STR[flag & IOMMU_RW], iotlb.iova,
>>>> +                          iotlb.translated_addr);
>>>> +
>>>> +    riscv_iommu_ctx_put(as->iommu, ref);
>>>> +
>>>> +    return iotlb;
>>>> +}
>>>> +
>>>> +static int riscv_iommu_memory_region_notify(
>>>> +    IOMMUMemoryRegion *iommu_mr, IOMMUNotifierFlag old,
>>>> +    IOMMUNotifierFlag new, Error **errp)
>>>> +{
>>>> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
>>>> +
>>>> +    if (old == IOMMU_NOTIFIER_NONE) {
>>>> +        as->notifier = true;
>>>> +        trace_riscv_iommu_notifier_add(iommu_mr->parent_obj.name);
>>>> +    } else if (new == IOMMU_NOTIFIER_NONE) {
>>>> +        as->notifier = false;
>>>> +        trace_riscv_iommu_notifier_del(iommu_mr->parent_obj.name);
>>>> +    }
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static inline bool pci_is_iommu(PCIDevice *pdev)
>>>> +{
>>>> +    return pci_get_word(pdev->config + PCI_CLASS_DEVICE) == 0x0806;
>>>> +}
>>>> +
>>>> +static AddressSpace *riscv_iommu_find_as(PCIBus *bus, void *opaque, int devfn)
>>>> +{
>>>> +    RISCVIOMMUState *s = (RISCVIOMMUState *) opaque;
>>>> +    PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
>>>> +    AddressSpace *as = NULL;
>>>> +
>>>> +    if (pdev && pci_is_iommu(pdev)) {
>>>> +        return s->target_as;
>>>> +    }
>>>> +
>>>> +    /* Find first registered IOMMU device */
>>>> +    while (s->iommus.le_prev) {
>>>> +        s = *(s->iommus.le_prev);
>>>> +    }
>>>> +
>>>> +    /* Find first matching IOMMU */
>>>> +    while (s != NULL && as == NULL) {
>>>> +        as = riscv_iommu_space(s, PCI_BUILD_BDF(pci_bus_num(bus), devfn));
>>>> +        s = s->iommus.le_next;
>>>> +    }
>>>> +
>>>> +    return as ? as : &address_space_memory;
>>>> +}
>>>> +
>>>> +static const PCIIOMMUOps riscv_iommu_ops = {
>>>> +    .get_address_space = riscv_iommu_find_as,
>>>> +};
>>>> +
>>>> +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
>>>> +        Error **errp)
>>>> +{
>>>> +    if (bus->iommu_ops &&
>>>> +        bus->iommu_ops->get_address_space == riscv_iommu_find_as) {
>>>> +        /* Allow multiple IOMMUs on the same PCIe bus, link known devices */
>>>> +        RISCVIOMMUState *last = (RISCVIOMMUState *)bus->iommu_opaque;
>>>> +        QLIST_INSERT_AFTER(last, iommu, iommus);
>>>> +    } else if (!bus->iommu_ops && !bus->iommu_opaque) {
>>>> +        pci_setup_iommu(bus, &riscv_iommu_ops, iommu);
>>>> +    } else {
>>>> +        error_setg(errp, "can't register secondary IOMMU for PCI bus #%d",
>>>> +            pci_bus_num(bus));
>>>> +    }
>>>> +}
>>>> +
>>>> +static int riscv_iommu_memory_region_index(IOMMUMemoryRegion *iommu_mr,
>>>> +    MemTxAttrs attrs)
>>>> +{
>>>> +    return attrs.unspecified ? RISCV_IOMMU_NOPROCID : (int)attrs.pid;
>>>> +}
>>>> +
>>>> +static int riscv_iommu_memory_region_index_len(IOMMUMemoryRegion *iommu_mr)
>>>> +{
>>>> +    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
>>>> +    return 1 << as->iommu->pid_bits;
>>>> +}
>>>> +
>>>> +static void riscv_iommu_memory_region_init(ObjectClass *klass, void *data)
>>>> +{
>>>> +    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
>>>> +
>>>> +    imrc->translate = riscv_iommu_memory_region_translate;
>>>> +    imrc->notify_flag_changed = riscv_iommu_memory_region_notify;
>>>> +    imrc->attrs_to_index = riscv_iommu_memory_region_index;
>>>> +    imrc->num_indexes = riscv_iommu_memory_region_index_len;
>>>> +}
>>>> +
>>>> +static const TypeInfo riscv_iommu_memory_region_info = {
>>>> +    .parent = TYPE_IOMMU_MEMORY_REGION,
>>>> +    .name = TYPE_RISCV_IOMMU_MEMORY_REGION,
>>>> +    .class_init = riscv_iommu_memory_region_init,
>>>> +};
>>>> +
>>>> +static void riscv_iommu_register_mr_types(void)
>>>> +{
>>>> +    type_register_static(&riscv_iommu_memory_region_info);
>>>> +    type_register_static(&riscv_iommu_info);
>>>> +}
>>>> +
>>>> +type_init(riscv_iommu_register_mr_types);
>>>> diff --git a/hw/riscv/riscv-iommu.h b/hw/riscv/riscv-iommu.h
>>>> new file mode 100644
>>>> index 0000000000..6d76cb9b1a
>>>> --- /dev/null
>>>> +++ b/hw/riscv/riscv-iommu.h
>>>> @@ -0,0 +1,148 @@
>>>> +/*
>>>> + * QEMU emulation of a RISC-V IOMMU
>>>> + *
>>>> + * Copyright (C) 2022-2023 Rivos Inc.
>>>> + *
>>>> + * This program is free software; you can redistribute it and/or modify
>>>> + * it under the terms of the GNU General Public License as published by
>>>> + * the Free Software Foundation; either version 2 of the License.
>>>> + *
>>>> + * This program is distributed in the hope that it will be useful,
>>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>>>> + * GNU General Public License for more details.
>>>> + *
>>>> + * You should have received a copy of the GNU General Public License along
>>>> + * with this program; if not, see <http://www.gnu.org/licenses/>.
>>>> + */
>>>> +
>>>> +#ifndef HW_RISCV_IOMMU_STATE_H
>>>> +#define HW_RISCV_IOMMU_STATE_H
>>>> +
>>>> +#include "qemu/osdep.h"
>>>> +#include "qom/object.h"
>>>> +
>>>> +#include "hw/riscv/iommu.h"
>>>> +
>>>> +struct RISCVIOMMUState {
>>>> +    /*< private >*/
>>>> +    DeviceState parent_obj;
>>>> +
>>>> +    /*< public >*/
>>>> +    uint32_t version;     /* Reported interface version number */
>>>> +    uint32_t pid_bits;    /* process identifier width */
>>>> +    uint32_t bus;         /* PCI bus mapping for non-root endpoints */
>>>> +
>>>> +    uint64_t cap;         /* IOMMU supported capabilities */
>>>> +    uint64_t fctl;        /* IOMMU enabled features */
>>>> +
>>>> +    bool enable_off;      /* Enable out-of-reset OFF mode (DMA disabled) */
>>>> +    bool enable_msi;      /* Enable MSI remapping */
>>>> +    bool enable_s_stage;  /* Enable S/VS-Stage translation */
>>>> +    bool enable_g_stage;  /* Enable G-Stage translation */
>>>> +
>>>> +    /* IOMMU Internal State */
>>>> +    uint64_t ddtp;        /* Validated Device Directory Tree Root Pointer */
>>>> +
>>>> +    dma_addr_t cq_addr;   /* Command queue base physical address */
>>>> +    dma_addr_t fq_addr;   /* Fault/event queue base physical address */
>>>> +    dma_addr_t pq_addr;   /* Page request queue base physical address */
>>>> +
>>>> +    uint32_t cq_mask;     /* Command queue index bit mask */
>>>> +    uint32_t fq_mask;     /* Fault/event queue index bit mask */
>>>> +    uint32_t pq_mask;     /* Page request queue index bit mask */
>>>> +
>>>> +    /* interrupt notifier */
>>>> +    void (*notify)(RISCVIOMMUState *iommu, unsigned vector);
>>>> +    /* register icvec interrupts */
>>>> +    void (*icvec_update)(RISCVIOMMUState *iommu,
>>>> +                         uint32_t civ, uint32_t fiv,
>>>> +                         uint32_t pmiv, uint32_t piv);
>>>> +
>>>> +    /* IOMMU State Machine */
>>>> +    QemuThread core_proc; /* Background processing thread */
>>>> +    QemuMutex core_lock;  /* Global IOMMU lock, used for cache/regs updates */
>>>> +    QemuCond core_cond;   /* Background processing wake up signal */
>>>> +    unsigned core_exec;   /* Processing thread execution actions */
>>>> +
>>>> +    /* IOMMU target address space */
>>>> +    AddressSpace *target_as;
>>>> +    MemoryRegion *target_mr;
>>>> +
>>>> +    /* MSI / MRIF access trap */
>>>> +    AddressSpace trap_as;
>>>> +    MemoryRegion trap_mr;
>>>> +
>>>> +    GHashTable *ctx_cache;          /* Device translation Context Cache */
>>>> +    QemuMutex ctx_lock;      /* Device translation Cache update lock */
>>>> +
>>>> +    /* MMIO Hardware Interface */
>>>> +    MemoryRegion regs_mr;
>>>> +    QemuSpin regs_lock;
>>>> +    uint8_t *regs_rw;  /* register state (user write) */
>>>> +    uint8_t *regs_wc;  /* write-1-to-clear mask */
>>>> +    uint8_t *regs_ro;  /* read-only mask */
>>>> +
>>>> +    QLIST_ENTRY(RISCVIOMMUState) iommus;
>>>> +    QLIST_HEAD(, RISCVIOMMUSpace) spaces;
>>>> +};
>>>> +
>>>> +void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
>>>> +         Error **errp);
>>>> +
>>>> +/* private helpers */
>>>> +
>>>> +/* Register helper functions */
>>>> +static inline uint32_t riscv_iommu_reg_mod32(RISCVIOMMUState *s,
>>>> +    unsigned idx, uint32_t set, uint32_t clr)
>>>> +{
>>>> +    uint32_t val;
>>>> +    qemu_spin_lock(&s->regs_lock);
>>>> +    val = ldl_le_p(s->regs_rw + idx);
>>>> +    stl_le_p(s->regs_rw + idx, (val & ~clr) | set);
>>>> +    qemu_spin_unlock(&s->regs_lock);
>>>> +    return val;
>>>> +}
>>>> +
>>>> +static inline void riscv_iommu_reg_set32(RISCVIOMMUState *s,
>>>> +    unsigned idx, uint32_t set)
>>>> +{
>>>> +    qemu_spin_lock(&s->regs_lock);
>>>> +    stl_le_p(s->regs_rw + idx, set);
>>>> +    qemu_spin_unlock(&s->regs_lock);
>>>> +}
>>>> +
>>>> +static inline uint32_t riscv_iommu_reg_get32(RISCVIOMMUState *s,
>>>> +    unsigned idx)
>>>> +{
>>>> +    return ldl_le_p(s->regs_rw + idx);
>>>> +}
>>>> +
>>>> +static inline uint64_t riscv_iommu_reg_mod64(RISCVIOMMUState *s,
>>>> +    unsigned idx, uint64_t set, uint64_t clr)
>>>> +{
>>>> +    uint64_t val;
>>>> +    qemu_spin_lock(&s->regs_lock);
>>>> +    val = ldq_le_p(s->regs_rw + idx);
>>>> +    stq_le_p(s->regs_rw + idx, (val & ~clr) | set);
>>>> +    qemu_spin_unlock(&s->regs_lock);
>>>> +    return val;
>>>> +}
>>>> +
>>>> +static inline void riscv_iommu_reg_set64(RISCVIOMMUState *s,
>>>> +    unsigned idx, uint64_t set)
>>>> +{
>>>> +    qemu_spin_lock(&s->regs_lock);
>>>> +    stq_le_p(s->regs_rw + idx, set);
>>>> +    qemu_spin_unlock(&s->regs_lock);
>>>> +}
>>>> +
>>>> +static inline uint64_t riscv_iommu_reg_get64(RISCVIOMMUState *s,
>>>> +    unsigned idx)
>>>> +{
>>>> +    return ldq_le_p(s->regs_rw + idx);
>>>> +}
>>>> +
>>>> +
>>>> +
>>>> +#endif
>>>> diff --git a/hw/riscv/trace-events b/hw/riscv/trace-events
>>>> new file mode 100644
>>>> index 0000000000..bdd8b657a6
>>>> --- /dev/null
>>>> +++ b/hw/riscv/trace-events
>>>> @@ -0,0 +1,12 @@
>>>> +# See documentation at docs/devel/tracing.rst
>>>> +
>>>> +# riscv-iommu.c
>>>> +riscv_iommu_new(const char *id, unsigned b, unsigned d, unsigned f) "%s: device attached %04x:%02x.%d"
>>>> +riscv_iommu_flt(const char *id, unsigned b, unsigned d, unsigned f, uint64_t reason, uint64_t iova) "%s: fault %04x:%02x.%u reason: 0x%"PRIx64" iova: 0x%"PRIx64
>>>> +riscv_iommu_pri(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova) "%s: page request %04x:%02x.%u iova: 0x%"PRIx64
>>>> +riscv_iommu_dma(const char *id, unsigned b, unsigned d, unsigned f, unsigned pasid, const char *dir, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u #%u %s 0x%"PRIx64" -> 0x%"PRIx64
>>>> +riscv_iommu_msi(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u MSI 0x%"PRIx64" -> 0x%"PRIx64
>>>> +riscv_iommu_mrif_notification(const char *id, uint32_t nid, uint64_t phys) "%s: sent MRIF notification 0x%x to 0x%"PRIx64
>>>> +riscv_iommu_cmd(const char *id, uint64_t l, uint64_t u) "%s: command 0x%"PRIx64" 0x%"PRIx64
>>>> +riscv_iommu_notifier_add(const char *id) "%s: dev-iotlb notifier added"
>>>> +riscv_iommu_notifier_del(const char *id) "%s: dev-iotlb notifier removed"
>>>> diff --git a/hw/riscv/trace.h b/hw/riscv/trace.h
>>>> new file mode 100644
>>>> index 0000000000..8c0e3ca1f3
>>>> --- /dev/null
>>>> +++ b/hw/riscv/trace.h
>>>> @@ -0,0 +1 @@
>>>> +#include "trace/trace-hw_riscv.h"
>>>> diff --git a/include/hw/riscv/iommu.h b/include/hw/riscv/iommu.h
>>>> new file mode 100644
>>>> index 0000000000..070ee69973
>>>> --- /dev/null
>>>> +++ b/include/hw/riscv/iommu.h
>>>> @@ -0,0 +1,36 @@
>>>> +/*
>>>> + * QEMU emulation of a RISC-V IOMMU
>>>> + *
>>>> + * Copyright (C) 2022-2023 Rivos Inc.
>>>> + *
>>>> + * This program is free software; you can redistribute it and/or modify
>>>> + * it under the terms of the GNU General Public License as published by
>>>> + * the Free Software Foundation; either version 2 of the License.
>>>> + *
>>>> + * This program is distributed in the hope that it will be useful,
>>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>>>> + * GNU General Public License for more details.
>>>> + *
>>>> + * You should have received a copy of the GNU General Public License along
>>>> + * with this program; if not, see <http://www.gnu.org/licenses/>.
>>>> + */
>>>> +
>>>> +#ifndef HW_RISCV_IOMMU_H
>>>> +#define HW_RISCV_IOMMU_H
>>>> +
>>>> +#include "qemu/osdep.h"
>>>> +#include "qom/object.h"
>>>> +
>>>> +#define TYPE_RISCV_IOMMU "riscv-iommu"
>>>> +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUState, RISCV_IOMMU)
>>>> +typedef struct RISCVIOMMUState RISCVIOMMUState;
>>>> +
>>>> +#define TYPE_RISCV_IOMMU_MEMORY_REGION "riscv-iommu-mr"
>>>> +typedef struct RISCVIOMMUSpace RISCVIOMMUSpace;
>>>> +
>>>> +#define TYPE_RISCV_IOMMU_PCI "riscv-iommu-pci"
>>>> +OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUStatePci, RISCV_IOMMU_PCI)
>>>> +typedef struct RISCVIOMMUStatePci RISCVIOMMUStatePci;
>>>> +
>>>> +#endif
>>>> diff --git a/meson.build b/meson.build
>>>> index a1e51277b0..359f836d8e 100644
>>>> --- a/meson.build
>>>> +++ b/meson.build
>>>> @@ -3373,6 +3373,7 @@ if have_system
>>>>        'hw/pci-host',
>>>>        'hw/ppc',
>>>>        'hw/rtc',
>>>> +    'hw/riscv',
>>>>        'hw/s390x',
>>>>        'hw/scsi',
>>>>        'hw/sd',
diff mbox series

Patch

diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig
index a2030e3a6f..f69d6e3c8e 100644
--- a/hw/riscv/Kconfig
+++ b/hw/riscv/Kconfig
@@ -1,3 +1,6 @@ 
+config RISCV_IOMMU
+    bool
+
 config RISCV_NUMA
     bool
 
@@ -47,6 +50,7 @@  config RISCV_VIRT
     select SERIAL
     select RISCV_ACLINT
     select RISCV_APLIC
+    select RISCV_IOMMU
     select RISCV_IMSIC
     select SIFIVE_PLIC
     select SIFIVE_TEST
diff --git a/hw/riscv/meson.build b/hw/riscv/meson.build
index f872674093..cbc99c6e8e 100644
--- a/hw/riscv/meson.build
+++ b/hw/riscv/meson.build
@@ -10,5 +10,6 @@  riscv_ss.add(when: 'CONFIG_SIFIVE_U', if_true: files('sifive_u.c'))
 riscv_ss.add(when: 'CONFIG_SPIKE', if_true: files('spike.c'))
 riscv_ss.add(when: 'CONFIG_MICROCHIP_PFSOC', if_true: files('microchip_pfsoc.c'))
 riscv_ss.add(when: 'CONFIG_ACPI', if_true: files('virt-acpi-build.c'))
+riscv_ss.add(when: 'CONFIG_RISCV_IOMMU', if_true: files('riscv-iommu.c'))
 
 hw_arch += {'riscv': riscv_ss}
diff --git a/hw/riscv/riscv-iommu-bits.h b/hw/riscv/riscv-iommu-bits.h
index 9d9512ca71..6f1b9ab61b 100644
--- a/hw/riscv/riscv-iommu-bits.h
+++ b/hw/riscv/riscv-iommu-bits.h
@@ -69,6 +69,14 @@  struct riscv_iommu_pq_record {
 /* 5.3 IOMMU Capabilities (64bits) */
 #define RISCV_IOMMU_REG_CAP             0x0000
 #define RISCV_IOMMU_CAP_VERSION         GENMASK_ULL(7, 0)
+#define RISCV_IOMMU_CAP_SV32            BIT_ULL(8)
+#define RISCV_IOMMU_CAP_SV39            BIT_ULL(9)
+#define RISCV_IOMMU_CAP_SV48            BIT_ULL(10)
+#define RISCV_IOMMU_CAP_SV57            BIT_ULL(11)
+#define RISCV_IOMMU_CAP_SV32X4          BIT_ULL(16)
+#define RISCV_IOMMU_CAP_SV39X4          BIT_ULL(17)
+#define RISCV_IOMMU_CAP_SV48X4          BIT_ULL(18)
+#define RISCV_IOMMU_CAP_SV57X4          BIT_ULL(19)
 #define RISCV_IOMMU_CAP_MSI_FLAT        BIT_ULL(22)
 #define RISCV_IOMMU_CAP_MSI_MRIF        BIT_ULL(23)
 #define RISCV_IOMMU_CAP_T2GPA           BIT_ULL(26)
@@ -80,7 +88,9 @@  struct riscv_iommu_pq_record {
 
 /* 5.4 Features control register (32bits) */
 #define RISCV_IOMMU_REG_FCTL            0x0008
+#define RISCV_IOMMU_FCTL_BE             BIT(0)
 #define RISCV_IOMMU_FCTL_WSI            BIT(1)
+#define RISCV_IOMMU_FCTL_GXL            BIT(2)
 
 /* 5.5 Device-directory-table pointer (64bits) */
 #define RISCV_IOMMU_REG_DDTP            0x0010
@@ -203,6 +213,8 @@  struct riscv_iommu_dc {
 #define RISCV_IOMMU_DC_TC_DTF           BIT_ULL(4)
 #define RISCV_IOMMU_DC_TC_PDTV          BIT_ULL(5)
 #define RISCV_IOMMU_DC_TC_PRPR          BIT_ULL(6)
+#define RISCV_IOMMU_DC_TC_GADE          BIT_ULL(7)
+#define RISCV_IOMMU_DC_TC_SADE          BIT_ULL(8)
 #define RISCV_IOMMU_DC_TC_DPE           BIT_ULL(9)
 #define RISCV_IOMMU_DC_TC_SBE           BIT_ULL(10)
 #define RISCV_IOMMU_DC_TC_SXL           BIT_ULL(11)
@@ -309,9 +321,11 @@  enum riscv_iommu_fq_causes {
 
 /* Translation attributes fields */
 #define RISCV_IOMMU_PC_TA_V             BIT_ULL(0)
+#define RISCV_IOMMU_PC_TA_RESERVED      GENMASK_ULL(63, 32)
 
 /* First stage context fields */
 #define RISCV_IOMMU_PC_FSC_PPN          GENMASK_ULL(43, 0)
+#define RISCV_IOMMU_PC_FSC_RESERVED     GENMASK_ULL(59, 44)
 
 enum riscv_iommu_fq_ttypes {
     RISCV_IOMMU_FQ_TTYPE_NONE = 0,
diff --git a/hw/riscv/riscv-iommu.c b/hw/riscv/riscv-iommu.c
new file mode 100644
index 0000000000..ebe3a53a04
--- /dev/null
+++ b/hw/riscv/riscv-iommu.c
@@ -0,0 +1,2017 @@ 
+/*
+ * QEMU emulation of an RISC-V IOMMU
+ *
+ * Copyright (C) 2021-2023, Rivos Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qom/object.h"
+#include "hw/pci/pci_bus.h"
+#include "hw/pci/pci_device.h"
+#include "hw/qdev-properties.h"
+#include "hw/riscv/riscv_hart.h"
+#include "migration/vmstate.h"
+#include "qapi/error.h"
+#include "qemu/timer.h"
+
+#include "cpu_bits.h"
+#include "riscv-iommu.h"
+#include "riscv-iommu-bits.h"
+#include "trace.h"
+
+#define LIMIT_CACHE_CTX               (1U << 7)
+#define LIMIT_CACHE_IOT               (1U << 20)
+
+/* Physical page number conversions */
+#define PPN_PHYS(ppn)                 ((ppn) << TARGET_PAGE_BITS)
+#define PPN_DOWN(phy)                 ((phy) >> TARGET_PAGE_BITS)
+
+typedef struct RISCVIOMMUContext RISCVIOMMUContext;
+typedef struct RISCVIOMMUEntry RISCVIOMMUEntry;
+
+/*
+ * Device assigned I/O address space.
+ *
+ * Groups the IOVA memory region and address space of one attached device
+ * with the IOMMU state that manages it and the device's requester id.
+ */
+struct RISCVIOMMUSpace {
+    IOMMUMemoryRegion iova_mr;  /* IOVA memory region for attached device */
+    AddressSpace iova_as;       /* IOVA address space for attached device */
+    RISCVIOMMUState *iommu;     /* Managing IOMMU device state */
+    uint32_t devid;             /* Requester identifier, AKA device_id */
+    bool notifier;              /* IOMMU unmap notifier enabled */
+    QLIST_ENTRY(RISCVIOMMUSpace) list; /* QLIST linkage to sibling spaces */
+};
+
+/*
+ * Device translation context state.
+ *
+ * The three leading bit-fields (24 + 20 + 20 bits) share a single 64-bit
+ * word; the remaining members are full 64-bit values.
+ */
+struct RISCVIOMMUContext {
+    uint64_t devid:24;          /* Requester Id, AKA device_id */
+    uint64_t process_id:20;     /* Process ID. PASID for PCIe */
+    uint64_t __rfu:20;          /* reserved */
+    uint64_t tc;                /* Translation Control */
+    uint64_t ta;                /* Translation Attributes */
+    uint64_t satp;              /* S-Stage address translation and protection */
+    uint64_t gatp;              /* G-Stage address translation and protection */
+    uint64_t msi_addr_mask;     /* MSI filtering - address mask */
+    uint64_t msi_addr_pattern;  /* MSI filtering - address pattern */
+    uint64_t msiptp;            /* MSI redirection page table pointer */
+};
+
+/* IOMMU index for transactions without process_id specified. */
+#define RISCV_IOMMU_NOPROCID 0
+
+/*
+ * Return the 4-bit interrupt vector number stored in @icvec for @vec_type.
+ * @icvec holds one nibble per vector type, type 0 in the lowest nibble;
+ * @vec_type must be in the range 0..3.
+ */
+static uint8_t riscv_iommu_get_icvec_vector(uint32_t icvec, uint32_t vec_type)
+{
+    g_assert(vec_type < 4);
+
+    const uint32_t nibble_shift = vec_type * 4;
+    const uint32_t nibble = (icvec >> nibble_shift) & 0x0F;
+
+    return nibble;
+}
+
+/*
+ * Mark interrupt source @vec in IPSR and, if its bit was not already
+ * present in the value returned by the register update, invoke the
+ * s->notify() hook with the vector that ICVEC assigns to @vec.
+ *
+ * Nothing is done when FCTL.WSI is set or when no notify hook is installed.
+ */
+static void riscv_iommu_notify(RISCVIOMMUState *s, int vec)
+{
+    const uint32_t fctl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FCTL);
+    uint32_t icvec;
+    uint32_t ipsr_val;
+
+    if ((fctl & RISCV_IOMMU_FCTL_WSI) || !s->notify) {
+        return;
+    }
+
+    icvec = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_ICVEC);
+    /*
+     * NOTE(review): riscv_iommu_reg_mod32() presumably returns the register
+     * value from before the update - confirm against its definition.
+     */
+    ipsr_val = riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, (1 << vec), 0);
+
+    if (ipsr_val & (1 << vec)) {
+        return;
+    }
+
+    s->notify(s, riscv_iommu_get_icvec_vector(icvec, vec));
+}
+
+/*
+ * Append fault record @ev to the in-memory fault queue.
+ *
+ * The record is traced unconditionally. It is dropped when the queue is not
+ * on (FQON clear) or when an overflow / memory-fault condition (FQOF, FQMF)
+ * is already latched in FQCSR. Otherwise the record is written at the tail
+ * slot and FQT is advanced; a full queue latches FQOF and a failed memory
+ * write latches FQMF instead. If FQCSR.FIE was set on entry, the fault
+ * queue interrupt is signalled in all three of those cases.
+ */
+static void riscv_iommu_fault(RISCVIOMMUState *s,
+                              struct riscv_iommu_fq_record *ev)
+{
+    uint32_t fqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
+    uint32_t rd = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQH) & s->fq_mask;
+    uint32_t wr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQT) & s->fq_mask;
+    uint32_t wr_next = (wr + 1) & s->fq_mask;
+    uint32_t devid = get_field(ev->hdr, RISCV_IOMMU_FQ_HDR_DID);
+    dma_addr_t slot = s->fq_addr + wr * sizeof(*ev);
+
+    trace_riscv_iommu_flt(s->parent_obj.id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
+                          PCI_FUNC(devid), ev->hdr, ev->iotval);
+
+    if (!(fqcsr & RISCV_IOMMU_FQCSR_FQON)) {
+        return;
+    }
+    if (fqcsr & (RISCV_IOMMU_FQCSR_FQOF | RISCV_IOMMU_FQCSR_FQMF)) {
+        return;
+    }
+
+    if (rd == wr_next) {
+        /* No free slot: advancing the tail would run into the head. */
+        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR,
+                              RISCV_IOMMU_FQCSR_FQOF, 0);
+    } else if (dma_memory_write(s->target_as, slot, ev, sizeof(*ev),
+                                MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR,
+                              RISCV_IOMMU_FQCSR_FQMF, 0);
+    } else {
+        riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_FQT, wr_next);
+    }
+
+    if (fqcsr & RISCV_IOMMU_FQCSR_FIE) {
+        riscv_iommu_notify(s, RISCV_IOMMU_INTR_FQ);
+    }
+}
+
+/*
+ * Append page-request record @pr to the in-memory page-request queue.
+ *
+ * The record is traced unconditionally. It is dropped when the queue is not
+ * on (PQON clear) or when an overflow / memory-fault condition (PQOF, PQMF)
+ * is already latched in PQCSR. Otherwise the record is written at the tail
+ * slot and PQT is advanced; a full queue latches PQOF and a failed memory
+ * write latches PQMF instead. If PQCSR.PIE was set on entry, the page
+ * request queue interrupt is signalled in all three of those cases.
+ */
+static void riscv_iommu_pri(RISCVIOMMUState *s,
+    struct riscv_iommu_pq_record *pr)
+{
+    uint32_t pqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
+    uint32_t rd = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQH) & s->pq_mask;
+    uint32_t wr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQT) & s->pq_mask;
+    uint32_t wr_next = (wr + 1) & s->pq_mask;
+    uint32_t devid = get_field(pr->hdr, RISCV_IOMMU_PREQ_HDR_DID);
+    dma_addr_t slot = s->pq_addr + wr * sizeof(*pr);
+
+    trace_riscv_iommu_pri(s->parent_obj.id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
+                          PCI_FUNC(devid), pr->payload);
+
+    if (!(pqcsr & RISCV_IOMMU_PQCSR_PQON)) {
+        return;
+    }
+    if (pqcsr & (RISCV_IOMMU_PQCSR_PQOF | RISCV_IOMMU_PQCSR_PQMF)) {
+        return;
+    }
+
+    if (rd == wr_next) {
+        /* No free slot: advancing the tail would run into the head. */
+        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR,
+                              RISCV_IOMMU_PQCSR_PQOF, 0);
+    } else if (dma_memory_write(s->target_as, slot, pr, sizeof(*pr),
+                                MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+        riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR,
+                              RISCV_IOMMU_PQCSR_PQMF, 0);
+    } else {
+        riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_PQT, wr_next);
+    }
+
+    if (pqcsr & RISCV_IOMMU_PQCSR_PIE) {
+        riscv_iommu_notify(s, RISCV_IOMMU_INTR_PQ);
+    }
+}
+
+/*
+ * Portable implementation of pext_u64 (bit-mask extraction).
+ *
+ * Collects the bits of @val located at the positions where @ext has a bit
+ * set and packs them, lowest position first, into the low-order bits of
+ * the result. All remaining result bits are zero.
+ */
+static uint64_t _pext_u64(uint64_t val, uint64_t ext)
+{
+    uint64_t packed = 0;
+    unsigned out_pos = 0;
+
+    for (; ext != 0; ext >>= 1, val >>= 1) {
+        if (!(ext & 1)) {
+            continue;
+        }
+        /* out_pos is at most 63 here: ext has no more than 64 set bits. */
+        packed |= (val & 1) << out_pos;
+        out_pos++;
+    }
+
+    return packed;
+}
+
+/*
+ * Check if GPA matches MSI/MRIF pattern.
+ *
+ * Returns true only when MSI support is enabled on the IOMMU, the context's
+ * MSI page table pointer is in FLAT mode, and the page number of @gpa equals
+ * msi_addr_pattern in every bit that is clear in msi_addr_mask.
+ */
+static bool riscv_iommu_msi_check(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
+    dma_addr_t gpa)
+{
+    uint64_t mismatch;
+
+    if (!s->enable_msi) {
+        return false;
+    }
+
+    /* Any MSI/MRIF mode other than FLAT is treated as "no match". */
+    if (get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_MODE) !=
+        RISCV_IOMMU_DC_MSIPTP_MODE_FLAT) {
+        return false;
+    }
+
+    /* Bits set in the mask are "don't care"; all others must match. */
+    mismatch = (PPN_DOWN(gpa) ^ ctx->msi_addr_pattern) & ~ctx->msi_addr_mask;
+
+    return mismatch ? false : true;
+}
+
+/*
+ * RISCV IOMMU Address Translation Lookup - Page Table Walk
+ *
+ * Note: Code is based on get_physical_address() from target/riscv/cpu_helper.c
+ * Both implementations could be merged into a single helper function in the
+ * future. They are kept separate for now, as error reporting and flow
+ * specifics are sufficiently different to justify separate implementations.
+ *
+ * The walk handles S-stage only, G-stage only and nested (S + G) setups.
+ * In the nested case every S-stage table address (including the S-stage
+ * root pointer) is itself translated through the G-stage tables before it
+ * is dereferenced.
+ *
+ * @s        : IOMMU Device State
+ * @ctx      : Translation context for device id and process address space id.
+ * @iotlb    : translation data: physical address and access mode.
+ *             On entry iova and perm describe the request; on success
+ *             translated_addr, perm and addr_mask (and target_as for trapped
+ *             MSI writes) are updated.
+ * @return   : success (0) or fault cause code.
+ */
+static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
+    IOMMUTLBEntry *iotlb)
+{
+    dma_addr_t addr, base;
+    uint64_t satp, gatp, pte;
+    bool en_s, en_g;
+    /* Per-stage walk parameters and progress, indexed by S_STAGE/G_STAGE */
+    struct {
+        unsigned char step;
+        unsigned char levels;
+        unsigned char ptidxbits;
+        unsigned char ptesize;
+    } sc[2];
+    /* Translation stage phase */
+    enum {
+        S_STAGE = 0,
+        G_STAGE = 1,
+    } pass;
+
+    /* satp/gatp hold the MODE fields here; reused for root pointers below */
+    satp = get_field(ctx->satp, RISCV_IOMMU_ATP_MODE_FIELD);
+    gatp = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);
+
+    en_s = satp != RISCV_IOMMU_DC_FSC_MODE_BARE;
+    en_g = gatp != RISCV_IOMMU_DC_IOHGATP_MODE_BARE;
+
+    /* Early check for MSI address match when IOVA == GPA */
+    if ((iotlb->perm & IOMMU_WO) &&
+        riscv_iommu_msi_check(s, ctx, iotlb->iova)) {
+        iotlb->target_as = &s->trap_as;
+        iotlb->translated_addr = iotlb->iova;
+        iotlb->addr_mask = ~TARGET_PAGE_MASK;
+        return 0;
+    }
+
+    /* Exit early for pass-through mode. */
+    if (!(en_s || en_g)) {
+        iotlb->translated_addr = iotlb->iova;
+        iotlb->addr_mask = ~TARGET_PAGE_MASK;
+        /* Allow R/W in pass-through mode */
+        iotlb->perm = IOMMU_RW;
+        return 0;
+    }
+
+    /* S/G translation parameters. */
+    for (pass = 0; pass < 2; pass++) {
+        uint32_t sv_mode;
+
+        sc[pass].step = 0;
+        /* G-stage width comes from FCTL.GXL, S-stage width from DC.tc.SXL */
+        if (pass ? (s->fctl & RISCV_IOMMU_FCTL_GXL) :
+            (ctx->tc & RISCV_IOMMU_DC_TC_SXL)) {
+            /* 32bit mode for GXL/SXL == 1 */
+            switch (pass ? gatp : satp) {
+            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
+                sc[pass].levels    = 0;
+                sc[pass].ptidxbits = 0;
+                sc[pass].ptesize   = 0;
+                break;
+            case RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4:
+                sv_mode = pass ? RISCV_IOMMU_CAP_SV32X4 : RISCV_IOMMU_CAP_SV32;
+                if (!(s->cap & sv_mode)) {
+                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+                }
+                sc[pass].levels    = 2;
+                sc[pass].ptidxbits = 10;
+                sc[pass].ptesize   = 4;
+                break;
+            default:
+                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+            }
+        } else {
+            /* 64bit mode for GXL/SXL == 0 */
+            switch (pass ? gatp : satp) {
+            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
+                sc[pass].levels    = 0;
+                sc[pass].ptidxbits = 0;
+                sc[pass].ptesize   = 0;
+                break;
+            case RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4:
+                sv_mode = pass ? RISCV_IOMMU_CAP_SV39X4 : RISCV_IOMMU_CAP_SV39;
+                if (!(s->cap & sv_mode)) {
+                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+                }
+                sc[pass].levels    = 3;
+                sc[pass].ptidxbits = 9;
+                sc[pass].ptesize   = 8;
+                break;
+            case RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4:
+                sv_mode = pass ? RISCV_IOMMU_CAP_SV48X4 : RISCV_IOMMU_CAP_SV48;
+                if (!(s->cap & sv_mode)) {
+                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+                }
+                sc[pass].levels    = 4;
+                sc[pass].ptidxbits = 9;
+                sc[pass].ptesize   = 8;
+                break;
+            case RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4:
+                sv_mode = pass ? RISCV_IOMMU_CAP_SV57X4 : RISCV_IOMMU_CAP_SV57;
+                if (!(s->cap & sv_mode)) {
+                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+                }
+                sc[pass].levels    = 5;
+                sc[pass].ptidxbits = 9;
+                sc[pass].ptesize   = 8;
+                break;
+            default:
+                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+            }
+        }
+    };
+
+    /* S/G stages translation tables root pointers */
+    gatp = PPN_PHYS(get_field(ctx->gatp, RISCV_IOMMU_ATP_PPN_FIELD));
+    satp = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_ATP_PPN_FIELD));
+    /*
+     * With both stages enabled the first thing to translate is the S-stage
+     * root pointer itself (a guest physical address), using the G-stage
+     * tables. Otherwise the IOVA is walked directly through the single
+     * enabled stage.
+     */
+    addr = (en_s && en_g) ? satp : iotlb->iova;
+    base = en_g ? gatp : satp;
+    pass = en_g ? G_STAGE : S_STAGE;
+
+    do {
+        /* The first (root) level of a G-stage walk is indexed by 2 more bits */
+        const unsigned widened = (pass && !sc[pass].step) ? 2 : 0;
+        const unsigned va_bits = widened + sc[pass].ptidxbits;
+        /* Number of address bits below the index used at the current level */
+        const unsigned va_skip = TARGET_PAGE_BITS + sc[pass].ptidxbits *
+                                 (sc[pass].levels - 1 - sc[pass].step);
+        const unsigned idx = (addr >> va_skip) & ((1 << va_bits) - 1);
+        const dma_addr_t pte_addr = base + idx * sc[pass].ptesize;
+        /* DC.tc.GADE / DC.tc.SADE bit of the stage currently being walked */
+        const bool ade =
+            ctx->tc & (pass ? RISCV_IOMMU_DC_TC_GADE : RISCV_IOMMU_DC_TC_SADE);
+
+        /* Address range check before first level lookup */
+        if (!sc[pass].step) {
+            const uint64_t va_mask = (1ULL << (va_skip + va_bits)) - 1;
+            if ((addr & va_mask) != addr) {
+                return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
+            }
+        }
+
+        /* Read page table entry */
+        if (dma_memory_read(s->target_as, pte_addr, &pte,
+                sc[pass].ptesize, MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+            return (iotlb->perm & IOMMU_WO) ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT
+                                            : RISCV_IOMMU_FQ_CAUSE_RD_FAULT;
+        }
+
+        /* PTEs are stored little-endian; 4-byte PTEs are zero-extended */
+        if (sc[pass].ptesize == 4) {
+            pte = (uint64_t) le32_to_cpu(*((uint32_t *)&pte));
+        } else {
+            pte = le64_to_cpu(pte);
+        }
+
+        sc[pass].step++;
+        hwaddr ppn = pte >> PTE_PPN_SHIFT;
+
+        /* Every 'break' below leaves the loop to report a stage fault */
+        if (!(pte & PTE_V)) {
+            break;                /* Invalid PTE */
+        } else if (!(pte & (PTE_R | PTE_W | PTE_X))) {
+            base = PPN_PHYS(ppn); /* Inner PTE, continue walking */
+        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == PTE_W) {
+            break;                /* Reserved leaf PTE flags: PTE_W */
+        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == (PTE_W | PTE_X)) {
+            break;                /* Reserved leaf PTE flags: PTE_W + PTE_X */
+        } else if (ppn & ((1ULL << (va_skip - TARGET_PAGE_BITS)) - 1)) {
+            break;                /* Misaligned PPN */
+        } else if ((iotlb->perm & IOMMU_RO) && !(pte & PTE_R)) {
+            break;                /* Read access check failed */
+        } else if ((iotlb->perm & IOMMU_WO) && !(pte & PTE_W)) {
+            break;                /* Write access check failed */
+        } else if ((iotlb->perm & IOMMU_RO) && !ade && !(pte & PTE_A)) {
+            break;                /* Access bit not set */
+        } else if ((iotlb->perm & IOMMU_WO) && !ade && !(pte & PTE_D)) {
+            break;                /* Dirty bit not set */
+        } else {
+            /* Leaf PTE, translation completed. */
+            sc[pass].step = sc[pass].levels;
+            base = PPN_PHYS(ppn) | (addr & ((1ULL << va_skip) - 1));
+            /* Update address mask based on smallest translation granularity */
+            iotlb->addr_mask &= (1ULL << va_skip) - 1;
+            /*
+             * Continue with S-Stage translation? A G-stage leaf reached while
+             * the S-stage walk is unfinished only resolved an S-stage table
+             * address (now in 'base'); resume walking the IOVA from there.
+             */
+            if (pass && sc[0].step != sc[0].levels) {
+                pass = S_STAGE;
+                addr = iotlb->iova;
+                continue;
+            }
+            /* Translation phase completed (GPA or SPA) */
+            iotlb->translated_addr = base;
+            iotlb->perm = (pte & PTE_W) ? ((pte & PTE_R) ? IOMMU_RW : IOMMU_WO)
+                                                         : IOMMU_RO;
+
+            /* Check MSI GPA address match */
+            if (pass == S_STAGE && (iotlb->perm & IOMMU_WO) &&
+                riscv_iommu_msi_check(s, ctx, base)) {
+                /* Trap MSI writes and return GPA address. */
+                iotlb->target_as = &s->trap_as;
+                iotlb->addr_mask = ~TARGET_PAGE_MASK;
+                return 0;
+            }
+
+            /* Continue with G-Stage translation? */
+            if (!pass && en_g) {
+                pass = G_STAGE;
+                addr = base;
+                base = gatp;
+                sc[pass].step = 0;
+                continue;
+            }
+
+            return 0;
+        }
+
+        if (sc[pass].step == sc[pass].levels) {
+            break; /* Can't find leaf PTE */
+        }
+
+        /*
+         * Continue with G-Stage translation? The next-level S-stage table
+         * address in 'base' is translated through the G-stage first.
+         */
+        if (!pass && en_g) {
+            pass = G_STAGE;
+            addr = base;
+            base = gatp;
+            sc[pass].step = 0;
+        }
+    } while (1);
+
+    /*
+     * Walk failed: the cause depends on the requested access (write vs.
+     * read) and on the stage being walked when the failure occurred.
+     */
+    return (iotlb->perm & IOMMU_WO) ?
+                (pass ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS :
+                        RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S) :
+                (pass ? RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS :
+                        RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S);
+}
+
+/*
+ * Build a fault-queue record for @ctx and queue it via riscv_iommu_fault().
+ *
+ * @s          : IOMMU device state
+ * @ctx        : translation context; supplies tc, devid and process_id
+ * @fault_type : value stored in the record's TTYPE header field
+ * @cause      : value stored in the record's CAUSE header field
+ * @pv         : true when ctx->process_id is valid for the faulting
+ *               transaction; controls the record's PV and PID fields
+ * @iotval     : value stored in the record's iotval member
+ * @iotval2    : value stored in the record's iotval2 member
+ *
+ * When DC.tc.DTF is set, only the causes listed in the switch below are
+ * reported; every other cause is silently dropped.
+ */
+static void riscv_iommu_report_fault(RISCVIOMMUState *s,
+                                     RISCVIOMMUContext *ctx,
+                                     uint32_t fault_type, uint32_t cause,
+                                     bool pv,
+                                     uint64_t iotval, uint64_t iotval2)
+{
+    struct riscv_iommu_fq_record ev = { 0 };
+
+    if (ctx->tc & RISCV_IOMMU_DC_TC_DTF) {
+        switch (cause) {
+        case RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED:
+        case RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT:
+        case RISCV_IOMMU_FQ_CAUSE_DDT_INVALID:
+        case RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED:
+        case RISCV_IOMMU_FQ_CAUSE_DDT_CORRUPTED:
+        case RISCV_IOMMU_FQ_CAUSE_INTERNAL_DP_ERROR:
+        case RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT:
+            break;
+        default:
+            /* DTF prevents reporting a fault for this given cause */
+            return;
+        }
+    }
+
+    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_CAUSE, cause);
+    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_TTYPE, fault_type);
+    ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_DID, ctx->devid);
+
+    /*
+     * PV flags the PID field as valid, so it must follow @pv rather than
+     * being set unconditionally. The record is zero-initialized, hence PV
+     * and PID both stay 0 when no process id applies.
+     */
+    if (pv) {
+        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PV, true);
+        ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PID, ctx->process_id);
+    }
+
+    ev.iotval = iotval;
+    ev.iotval2 = iotval2;
+
+    riscv_iommu_fault(s, &ev);
+}
+
+/*
+ * Redirect MSI write for given GPA.
+ *
+ * Looks up the MSI PTE selected by @gpa in the context's MSI page table and
+ * either forwards the write to the translated address (basic mode) or
+ * updates the memory-resident interrupt file and, if enabled, sends the
+ * notification MSI (MRIF mode).
+ *
+ * @s     : IOMMU device state
+ * @ctx   : translation context supplying msiptp, msi_addr_mask and devid
+ * @gpa   : guest physical address targeted by the MSI write
+ * @data  : MSI payload
+ * @size  : size of the original write in bytes
+ * @attrs : memory transaction attributes of the original write
+ * @return: MEMTX_OK on success. On failure a fault record is reported via
+ *          riscv_iommu_report_fault() and the failing result is returned.
+ */
+static MemTxResult riscv_iommu_msi_write(RISCVIOMMUState *s,
+    RISCVIOMMUContext *ctx, uint64_t gpa, uint64_t data,
+    unsigned size, MemTxAttrs attrs)
+{
+    MemTxResult res;
+    dma_addr_t addr;
+    uint64_t intn;
+    uint32_t n190;
+    uint64_t pte[2];
+    int fault_type = RISCV_IOMMU_FQ_TTYPE_UADDR_WR;
+    int cause;
+
+    /* Interrupt File Number */
+    intn = _pext_u64(PPN_DOWN(gpa), ctx->msi_addr_mask);
+    if (intn >= 256) {
+        /* Interrupt file number out of range */
+        res = MEMTX_ACCESS_ERROR;
+        cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
+        goto err;
+    }
+
+    /* fetch MSI PTE */
+    addr = PPN_PHYS(get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_PPN));
+    addr = addr | (intn * sizeof(pte));
+    res = dma_memory_read(s->target_as, addr, &pte, sizeof(pte),
+            MEMTXATTRS_UNSPECIFIED);
+    if (res != MEMTX_OK) {
+        if (res == MEMTX_DECODE_ERROR) {
+            cause = RISCV_IOMMU_FQ_CAUSE_MSI_PT_CORRUPTED;
+        } else {
+            cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
+        }
+        goto err;
+    }
+
+    le64_to_cpus(&pte[0]);
+    le64_to_cpus(&pte[1]);
+
+    if (!(pte[0] & RISCV_IOMMU_MSI_PTE_V) || (pte[0] & RISCV_IOMMU_MSI_PTE_C)) {
+        /*
+         * The spec mentions that: "If msipte.C == 1, then further
+         * processing to interpret the PTE is implementation
+         * defined.". We'll abort with cause = 262 for this
+         * case too.
+         */
+        res = MEMTX_ACCESS_ERROR;
+        cause = RISCV_IOMMU_FQ_CAUSE_MSI_INVALID;
+        goto err;
+    }
+
+    switch (get_field(pte[0], RISCV_IOMMU_MSI_PTE_M)) {
+    case RISCV_IOMMU_MSI_PTE_M_BASIC:
+        /* MSI Pass-through mode */
+        addr = PPN_PHYS(get_field(pte[0], RISCV_IOMMU_MSI_PTE_PPN));
+        /*
+         * Carry over the offset within the page only. TARGET_PAGE_MASK
+         * selects the page-frame bits, so its complement is needed here;
+         * using the mask itself would OR the untranslated GPA page frame
+         * into the translated address.
+         */
+        addr = addr | (gpa & ~TARGET_PAGE_MASK);
+
+        trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
+                              PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
+                              gpa, addr);
+
+        res = dma_memory_write(s->target_as, addr, &data, size, attrs);
+        if (res != MEMTX_OK) {
+            cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
+            goto err;
+        }
+
+        return MEMTX_OK;
+    case RISCV_IOMMU_MSI_PTE_M_MRIF:
+        /* MRIF mode, continue. */
+        break;
+    default:
+        res = MEMTX_ACCESS_ERROR;
+        cause = RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED;
+        goto err;
+    }
+
+    /*
+     * Report an error for interrupt identities exceeding the maximum allowed
+     * for an IMSIC interrupt file (2047) or destination address is not 32-bit
+     * aligned. See IOMMU Specification, Chapter 2.3. MSI page tables.
+     */
+    if ((data > 2047) || (gpa & 3)) {
+        res = MEMTX_ACCESS_ERROR;
+        cause = RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED;
+        goto err;
+    }
+
+    /* MSI MRIF mode, non atomic pending bit update */
+
+    /* MRIF pending bit address: 64 identities per 16-byte pending/enable pair */
+    addr = get_field(pte[0], RISCV_IOMMU_MSI_PTE_MRIF_ADDR) << 9;
+    addr = addr | ((data & 0x7c0) >> 3);
+
+    trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
+                          PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
+                          gpa, addr);
+
+    /* MRIF pending bit mask; from here on 'data' holds the mask */
+    data = 1ULL << (data & 0x03f);
+    res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
+    if (res != MEMTX_OK) {
+        cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
+        goto err;
+    }
+
+    intn = intn | data;
+    res = dma_memory_write(s->target_as, addr, &intn, sizeof(intn), attrs);
+    if (res != MEMTX_OK) {
+        cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
+        goto err;
+    }
+
+    /* Get MRIF enable bits, stored right after the pending bits */
+    addr = addr + sizeof(intn);
+    res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
+    if (res != MEMTX_OK) {
+        cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
+        goto err;
+    }
+
+    if (!(intn & data)) {
+        /* notification disabled, MRIF update completed. */
+        return MEMTX_OK;
+    }
+
+    /* Send notification message */
+    addr = PPN_PHYS(get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NPPN));
+    n190 = get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID) |
+          (get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID_MSB) << 10);
+
+    res = dma_memory_write(s->target_as, addr, &n190, sizeof(n190), attrs);
+    if (res != MEMTX_OK) {
+        cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
+        goto err;
+    }
+
+    trace_riscv_iommu_mrif_notification(s->parent_obj.id, n190, addr);
+
+    return MEMTX_OK;
+
+err:
+    riscv_iommu_report_fault(s, ctx, fault_type, cause,
+                             !!ctx->process_id, 0, 0);
+    return res;
+}
+
+/*
+ * Check device context configuration as described by the
+ * riscv-iommu spec section "Device-context configuration
+ * checks".
+ *
+ * @s      : IOMMU device state; supplies the capability bits (s->cap)
+ * @ctx    : device context to validate (tc, msiptp and satp are inspected)
+ * @return : true if the context passes all checks, false if it must be
+ *           treated as misconfigured.
+ */
+static bool riscv_iommu_validate_device_ctx(RISCVIOMMUState *s,
+                                            RISCVIOMMUContext *ctx)
+{
+    uint32_t fsc_mode, msi_mode;
+
+    /* PRPR is only meaningful when PRI is enabled */
+    if (!(ctx->tc & RISCV_IOMMU_DC_TC_EN_PRI) &&
+        ctx->tc & RISCV_IOMMU_DC_TC_PRPR) {
+        return false;
+    }
+
+    if (!(s->cap & RISCV_IOMMU_CAP_T2GPA) &&
+        ctx->tc & RISCV_IOMMU_DC_TC_T2GPA) {
+        return false;
+    }
+
+    if (s->cap & RISCV_IOMMU_CAP_MSI_FLAT) {
+        msi_mode = get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_MODE);
+
+        if (msi_mode != RISCV_IOMMU_DC_MSIPTP_MODE_OFF &&
+            msi_mode != RISCV_IOMMU_DC_MSIPTP_MODE_FLAT) {
+            return false;
+        }
+    }
+
+    fsc_mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
+
+    if (ctx->tc & RISCV_IOMMU_DC_TC_PDTV) {
+        /* FSC holds a process-directory-table pointer: check PDT modes */
+        switch (fsc_mode) {
+        case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8:
+            if (!(s->cap & RISCV_IOMMU_CAP_PD8)) {
+                return false;
+            }
+            break;
+        case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD17:
+            if (!(s->cap & RISCV_IOMMU_CAP_PD17)) {
+                return false;
+            }
+            break;
+        case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20:
+            if (!(s->cap & RISCV_IOMMU_CAP_PD20)) {
+                return false;
+            }
+            break;
+        }
+    } else {
+        /* DC.tc.PDTV is 0 */
+        if (ctx->tc & RISCV_IOMMU_DC_TC_DPE) {
+            return false;
+        }
+
+        if (ctx->tc & RISCV_IOMMU_DC_TC_SXL) {
+            /*
+             * Compare against the iosatp MODE encoding, not against the
+             * capability bit: fsc_mode is a small field value and can
+             * never equal RISCV_IOMMU_CAP_SV32 (bit 8).
+             */
+            if (fsc_mode == RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV32 &&
+                !(s->cap & RISCV_IOMMU_CAP_SV32)) {
+                return false;
+            }
+        } else {
+            switch (fsc_mode) {
+            case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
+                if (!(s->cap & RISCV_IOMMU_CAP_SV39)) {
+                    return false;
+                }
+                break;
+            case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
+                if (!(s->cap & RISCV_IOMMU_CAP_SV48)) {
+                    return false;
+                }
+                break;
+            case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
+                if (!(s->cap & RISCV_IOMMU_CAP_SV57)) {
+                    return false;
+                }
+                break;
+            }
+        }
+    }
+
+    /*
+     * CAP_END is always zero (only one endianness). FCTL_BE is
+     * always zero (little-endian accesses). Thus TC_SBE must
+     * always be LE, i.e. zero.
+     */
+    if (ctx->tc & RISCV_IOMMU_DC_TC_SBE) {
+        return false;
+    }
+
+    return true;
+}
+
+/*
+ * Validate process context (PC) according to section
+ * "Process-context configuration checks".
+ *
+ * @s      : IOMMU device state; supplies the capability bits (s->cap)
+ * @ctx    : context whose ta and satp were loaded from the process context
+ *           (tc still comes from the device context)
+ * @return : true if the process context passes all checks, false otherwise.
+ */
+static bool riscv_iommu_validate_process_ctx(RISCVIOMMUState *s,
+                                             RISCVIOMMUContext *ctx)
+{
+    uint32_t mode;
+
+    if (get_field(ctx->ta, RISCV_IOMMU_PC_TA_RESERVED)) {
+        return false;
+    }
+
+    if (get_field(ctx->satp, RISCV_IOMMU_PC_FSC_RESERVED)) {
+        return false;
+    }
+
+    /* Only BARE and the Sv* first-stage modes are acceptable */
+    mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
+    switch (mode) {
+    case RISCV_IOMMU_DC_FSC_MODE_BARE:
+    /* sv39 and sv32 modes have the same value (8) */
+    case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
+    case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
+    case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
+        break;
+    default:
+        return false;
+    }
+
+    if (ctx->tc & RISCV_IOMMU_DC_TC_SXL) {
+        /*
+         * Compare against the iosatp MODE encoding, not against the
+         * capability bit: mode is a small field value and can never
+         * equal RISCV_IOMMU_CAP_SV32 (bit 8).
+         */
+        if (mode == RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV32 &&
+            !(s->cap & RISCV_IOMMU_CAP_SV32)) {
+            return false;
+        }
+    } else {
+        switch (mode) {
+        case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
+            if (!(s->cap & RISCV_IOMMU_CAP_SV39)) {
+                return false;
+            }
+            break;
+        case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
+            if (!(s->cap & RISCV_IOMMU_CAP_SV48)) {
+                return false;
+            }
+            break;
+        case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
+            if (!(s->cap & RISCV_IOMMU_CAP_SV57)) {
+                return false;
+            }
+            break;
+        }
+    }
+
+    return true;
+}
+
+/*
+ * RISC-V IOMMU Device Context Loopkup - Device Directory Tree Walk
+ *
+ * @s         : IOMMU Device State
+ * @ctx       : Device Translation Context with devid and process_id set.
+ * @return    : success or fault code.
+ */
+static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
+{
+    const uint64_t ddtp = s->ddtp;
+    unsigned mode = get_field(ddtp, RISCV_IOMMU_DDTP_MODE);
+    dma_addr_t addr = PPN_PHYS(get_field(ddtp, RISCV_IOMMU_DDTP_PPN));
+    struct riscv_iommu_dc dc;
+    /* Device Context format: 0: extended (64 bytes) | 1: base (32 bytes) */
+    const int dc_fmt = !s->enable_msi;
+    const size_t dc_len = sizeof(dc) >> dc_fmt;
+    unsigned depth;
+    uint64_t de;
+
+    switch (mode) {
+    case RISCV_IOMMU_DDTP_MODE_OFF:
+        return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
+
+    case RISCV_IOMMU_DDTP_MODE_BARE:
+        /* mock up pass-through translation context */
+        ctx->gatp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
+            RISCV_IOMMU_DC_IOHGATP_MODE_BARE);
+        ctx->satp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
+            RISCV_IOMMU_DC_FSC_MODE_BARE);
+        ctx->tc = RISCV_IOMMU_DC_TC_V;
+        ctx->ta = 0;
+        ctx->msiptp = 0;
+        return 0;
+
+    case RISCV_IOMMU_DDTP_MODE_1LVL:
+        depth = 0;
+        break;
+
+    case RISCV_IOMMU_DDTP_MODE_2LVL:
+        depth = 1;
+        break;
+
+    case RISCV_IOMMU_DDTP_MODE_3LVL:
+        depth = 2;
+        break;
+
+    default:
+        return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+    }
+
+    /*
+     * Check supported device id width (in bits).
+     * See IOMMU Specification, Chapter 6. Software guidelines.
+     * - if extended device-context format is used:
+     *   1LVL: 6, 2LVL: 15, 3LVL: 24
+     * - if base device-context format is used:
+     *   1LVL: 7, 2LVL: 16, 3LVL: 24
+     */
+    if (ctx->devid >= (1 << (depth * 9 + 6 + (dc_fmt && depth != 2)))) {
+        return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
+    }
+
+    /* Device directory tree walk */
+    for (; depth-- > 0; ) {
+        /*
+         * Select device id index bits based on device directory tree level
+         * and device context format.
+         * See IOMMU Specification, Chapter 2. Data Structures.
+         * - if extended device-context format is used:
+         *   device index: [23:15][14:6][5:0]
+         * - if base device-context format is used:
+         *   device index: [23:16][15:7][6:0]
+         */
+        const int split = depth * 9 + 6 + dc_fmt;
+        addr |= ((ctx->devid >> split) << 3) & ~TARGET_PAGE_MASK;
+        if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
+                            MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+            return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
+        }
+        le64_to_cpus(&de);
+        if (!(de & RISCV_IOMMU_DDTE_VALID)) {
+            /* invalid directory entry */
+            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
+        }
+        if (de & ~(RISCV_IOMMU_DDTE_PPN | RISCV_IOMMU_DDTE_VALID)) {
+            /* reserved bits set */
+            return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+        }
+        addr = PPN_PHYS(get_field(de, RISCV_IOMMU_DDTE_PPN));
+    }
+
+    /* index into device context entry page */
+    addr |= (ctx->devid * dc_len) & ~TARGET_PAGE_MASK;
+
+    memset(&dc, 0, sizeof(dc));
+    if (dma_memory_read(s->target_as, addr, &dc, dc_len,
+                        MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+        return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
+    }
+
+    /* Set translation context. */
+    ctx->tc = le64_to_cpu(dc.tc);
+    ctx->gatp = le64_to_cpu(dc.iohgatp);
+    ctx->satp = le64_to_cpu(dc.fsc);
+    ctx->ta = le64_to_cpu(dc.ta);
+    ctx->msiptp = le64_to_cpu(dc.msiptp);
+    ctx->msi_addr_mask = le64_to_cpu(dc.msi_addr_mask);
+    ctx->msi_addr_pattern = le64_to_cpu(dc.msi_addr_pattern);
+
+    if (!(ctx->tc & RISCV_IOMMU_DC_TC_V)) {
+        return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
+    }
+
+    if (!riscv_iommu_validate_device_ctx(s, ctx)) {
+        return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+    }
+
+    /* FSC field checks */
+    mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
+    addr = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_DC_FSC_PPN));
+
+    if (!(ctx->tc & RISCV_IOMMU_DC_TC_PDTV)) {
+        if (ctx->process_id != RISCV_IOMMU_NOPROCID) {
+            /* PID is disabled */
+            return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
+        }
+        if (mode > RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57) {
+            /* Invalid translation mode */
+            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
+        }
+        return 0;
+    }
+
+    if (ctx->process_id == RISCV_IOMMU_NOPROCID) {
+        if (!(ctx->tc & RISCV_IOMMU_DC_TC_DPE)) {
+            /* No default process_id enabled, set BARE mode */
+            ctx->satp = 0ULL;
+            return 0;
+        } else {
+            /* Use default process_id #0 */
+            ctx->process_id = 0;
+        }
+    }
+
+    if (mode == RISCV_IOMMU_DC_FSC_MODE_BARE) {
+        /* No S-Stage translation, done. */
+        return 0;
+    }
+
+    /* FSC.TC.PDTV enabled */
+    if (mode > RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20) {
+        /* Invalid PDTP.MODE */
+        return RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED;
+    }
+
+    for (depth = mode - RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8; depth-- > 0; ) {
+        /*
+         * Select process id index bits based on process directory tree
+         * level. See IOMMU Specification, 2.2. Process-Directory-Table.
+         */
+        const int split = depth * 9 + 8;
+        addr |= ((ctx->process_id >> split) << 3) & ~TARGET_PAGE_MASK;
+        if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
+                            MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+            return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
+        }
+        le64_to_cpus(&de);
+        if (!(de & RISCV_IOMMU_PC_TA_V)) {
+            return RISCV_IOMMU_FQ_CAUSE_PDT_INVALID;
+        }
+        addr = PPN_PHYS(get_field(de, RISCV_IOMMU_PC_FSC_PPN));
+    }
+
+    /* Leaf entry in PDT */
+    addr |= (ctx->process_id << 4) & ~TARGET_PAGE_MASK;
+    if (dma_memory_read(s->target_as, addr, &dc.ta, sizeof(uint64_t) * 2,
+                        MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+        return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
+    }
+
+    /* Use FSC and TA from process directory entry. */
+    ctx->ta = le64_to_cpu(dc.ta);
+    ctx->satp = le64_to_cpu(dc.fsc);
+
+    if (!(ctx->ta & RISCV_IOMMU_PC_TA_V)) {
+        return RISCV_IOMMU_FQ_CAUSE_PDT_INVALID;
+    }
+
+    if (!riscv_iommu_validate_process_ctx(s, ctx)) {
+        return RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED;
+    }
+
+    return 0;
+}
+
+/* Translation Context cache support */
+/* GEqualFunc for the context cache: keys match on (devid, process_id). */
+static gboolean __ctx_equal(gconstpointer v1, gconstpointer v2)
+{
+    const RISCVIOMMUContext *lhs = v1;
+    const RISCVIOMMUContext *rhs = v2;
+
+    if (lhs->devid != rhs->devid) {
+        return 0;
+    }
+    return lhs->process_id == rhs->process_id;
+}
+
+/*
+ * GHashFunc for the context cache.
+ * Combines devid with process_id shifted above bit 23, on the assumption
+ * that devid is at most 24 bits wide.
+ */
+static guint __ctx_hash(gconstpointer v)
+{
+    const RISCVIOMMUContext *c = v;
+    const guint dev_part = (guint)(c->devid);
+    const guint pid_part = (guint)(c->process_id) << 24;
+
+    return dev_part + pid_part;
+}
+
+/*
+ * GHFunc: clear the valid bit of a cached context whose devid and
+ * process_id both match the ones carried in @data.
+ */
+static void __ctx_inval_devid_procid(gpointer key, gpointer value,
+                                     gpointer data)
+{
+    RISCVIOMMUContext *entry = value;
+    const RISCVIOMMUContext *match = data;
+
+    if (!(entry->tc & RISCV_IOMMU_DC_TC_V)) {
+        /* already invalid */
+        return;
+    }
+    if (entry->devid != match->devid ||
+        entry->process_id != match->process_id) {
+        return;
+    }
+    entry->tc &= ~RISCV_IOMMU_DC_TC_V;
+}
+
+/*
+ * GHFunc: clear the valid bit of a cached context whose devid matches the
+ * one carried in @data, regardless of process_id.
+ */
+static void __ctx_inval_devid(gpointer key, gpointer value, gpointer data)
+{
+    RISCVIOMMUContext *entry = value;
+    const RISCVIOMMUContext *match = data;
+
+    if (!(entry->tc & RISCV_IOMMU_DC_TC_V)) {
+        /* already invalid */
+        return;
+    }
+    if (entry->devid != match->devid) {
+        return;
+    }
+    entry->tc &= ~RISCV_IOMMU_DC_TC_V;
+}
+
+/* GHFunc: clear the valid bit of every cached context that still has it. */
+static void __ctx_inval_all(gpointer key, gpointer value, gpointer data)
+{
+    RISCVIOMMUContext *entry = value;
+
+    if (!(entry->tc & RISCV_IOMMU_DC_TC_V)) {
+        return;
+    }
+    entry->tc &= ~RISCV_IOMMU_DC_TC_V;
+}
+
+/*
+ * Run @func over every entry of the context cache, handing it a key built
+ * from @devid and @process_id. The cache table is referenced for the
+ * duration of the walk and the walk runs with ctx_lock held.
+ */
+static void riscv_iommu_ctx_inval(RISCVIOMMUState *s, GHFunc func,
+                                  uint32_t devid, uint32_t process_id)
+{
+    RISCVIOMMUContext match = {
+        .process_id = process_id,
+        .devid = devid,
+    };
+    GHashTable *cache = g_hash_table_ref(s->ctx_cache);
+
+    qemu_mutex_lock(&s->ctx_lock);
+    g_hash_table_foreach(cache, func, &match);
+    qemu_mutex_unlock(&s->ctx_lock);
+
+    g_hash_table_unref(cache);
+}
+
+/*
+ * Find or allocate translation context for a given {device_id, process_id}.
+ *
+ * The pair is looked up in s->ctx_cache; a cached entry is returned only if
+ * its TC.V bit is still set. Otherwise a new context is allocated, filled by
+ * riscv_iommu_ctx_fetch() and, on success, added to the cache and returned.
+ * When the fetch fails a fault is reported and NULL is returned.
+ *
+ * On return *ref holds the referenced cache table (NULL on failure); callers
+ * in this file release it with riscv_iommu_ctx_put().
+ */
+static RISCVIOMMUContext *riscv_iommu_ctx(RISCVIOMMUState *s,
+                                          unsigned devid, unsigned process_id,
+                                          void **ref)
+{
+    GHashTable *ctx_cache;
+    RISCVIOMMUContext *ctx;
+    RISCVIOMMUContext key = {
+        .devid = devid,
+        .process_id = process_id,
+    };
+    /* Take a reference on the current table before using it. */
+    ctx_cache = g_hash_table_ref(s->ctx_cache);
+    qemu_mutex_lock(&s->ctx_lock);
+    ctx = g_hash_table_lookup(ctx_cache, &key);
+    qemu_mutex_unlock(&s->ctx_lock);
+    /* Cache hit: entry exists and has not been invalidated. */
+    if (ctx && (ctx->tc & RISCV_IOMMU_DC_TC_V)) {
+        *ref = ctx_cache;
+        return ctx;
+    }
+    /* Miss or invalidated entry: build a fresh context. */
+    ctx = g_new0(RISCVIOMMUContext, 1);
+    ctx->devid = devid;
+    ctx->process_id = process_id;
+
+    int fault = riscv_iommu_ctx_fetch(s, ctx);
+    if (!fault) {
+        qemu_mutex_lock(&s->ctx_lock);
+        if (g_hash_table_size(ctx_cache) >= LIMIT_CACHE_CTX) {
+            /*
+             * Cache is full: drop our reference to the old table, create an
+             * empty one, take a second reference for the caller, then install
+             * it in s->ctx_cache and release the reference the state held on
+             * the old table.
+             */
+            g_hash_table_unref(ctx_cache);
+            ctx_cache = g_hash_table_new_full(__ctx_hash, __ctx_equal,
+                                              g_free, NULL);
+            g_hash_table_ref(ctx_cache);
+            g_hash_table_unref(qatomic_xchg(&s->ctx_cache, ctx_cache));
+        }
+        g_hash_table_add(ctx_cache, ctx);
+        qemu_mutex_unlock(&s->ctx_lock);
+        *ref = ctx_cache;
+        return ctx;
+    }
+    /* Fetch failed: nothing is cached and the caller gets no reference. */
+    g_hash_table_unref(ctx_cache);
+    *ref = NULL;
+
+    riscv_iommu_report_fault(s, ctx, RISCV_IOMMU_FQ_TTYPE_UADDR_RD,
+                             fault, !!process_id, 0, 0);
+
+    g_free(ctx);
+    return NULL;
+}
+
+/*
+ * Release the cache table reference handed out by riscv_iommu_ctx().
+ * A NULL @ref (failed lookup) is ignored.
+ */
+static void riscv_iommu_ctx_put(RISCVIOMMUState *s, void *ref)
+{
+    GHashTable *cache = ref;
+
+    if (cache == NULL) {
+        return;
+    }
+    g_hash_table_unref(cache);
+}
+
+/*
+ * Return the IOVA address space for @devid, creating it on first use.
+ *
+ * The device id is first combined with the configured bus number. Existing
+ * spaces are searched on s->spaces under core_lock; if none matches, a new
+ * RISCVIOMMUSpace with its IOMMU memory region and address space is created
+ * and linked into the list.
+ */
+static AddressSpace *riscv_iommu_space(RISCVIOMMUState *s, uint32_t devid)
+{
+    RISCVIOMMUSpace *as;
+    char name[64];
+
+    /* FIXME: PCIe bus remapping for attached endpoints. */
+    devid |= s->bus << 8;
+
+    qemu_mutex_lock(&s->core_lock);
+    QLIST_FOREACH(as, &s->spaces, list) {
+        if (as->devid == devid) {
+            break;
+        }
+    }
+    qemu_mutex_unlock(&s->core_lock);
+
+    if (as != NULL) {
+        /* Already known device. */
+        return &as->iova_as;
+    }
+
+    as = g_new0(RISCVIOMMUSpace, 1);
+    as->iommu = s;
+    as->devid = devid;
+
+    snprintf(name, sizeof(name), "riscv-iommu-%04x:%02x.%d-iova",
+        PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid), PCI_FUNC(as->devid));
+
+    /* IOVA address space, untranslated addresses */
+    memory_region_init_iommu(&as->iova_mr, sizeof(as->iova_mr),
+        TYPE_RISCV_IOMMU_MEMORY_REGION,
+        OBJECT(as), "riscv_iommu", UINT64_MAX);
+    address_space_init(&as->iova_as, MEMORY_REGION(&as->iova_mr), name);
+
+    qemu_mutex_lock(&s->core_lock);
+    QLIST_INSERT_HEAD(&s->spaces, as, list);
+    qemu_mutex_unlock(&s->core_lock);
+
+    trace_riscv_iommu_new(s->parent_obj.id, PCI_BUS_NUM(as->devid),
+            PCI_SLOT(as->devid), PCI_FUNC(as->devid));
+
+    return &as->iova_as;
+}
+
+/*
+ * Translate iotlb->iova for the given context and report failures.
+ *
+ * Runs riscv_iommu_spa_fetch() and returns its result: 0 on success,
+ * otherwise the fault cause. On failure either a page request is queued
+ * (request carries no permissions and the custom TC[32] bit is set) or a
+ * fault record is reported with a transaction type derived from the
+ * requested permissions.
+ */
+static int riscv_iommu_translate(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
+    IOMMUTLBEntry *iotlb)
+{
+    bool enable_pid;
+    bool enable_pri;
+    int fault;
+
+    /*
+     * TC[32] is reserved for custom extensions, used here to temporarily
+     * enable automatic page-request generation for ATS queries.
+     */
+    enable_pri = (iotlb->perm == IOMMU_NONE) && (ctx->tc & BIT_ULL(32));
+    enable_pid = (ctx->tc & RISCV_IOMMU_DC_TC_PDTV);
+
+    /* Translate using device directory / page table information. */
+    fault = riscv_iommu_spa_fetch(s, ctx, iotlb);
+
+    if (enable_pri && fault) {
+        struct riscv_iommu_pq_record pr = {0};
+        if (enable_pid) {
+            pr.hdr = set_field(RISCV_IOMMU_PREQ_HDR_PV,
+                               RISCV_IOMMU_PREQ_HDR_PID, ctx->process_id);
+        }
+        pr.hdr = set_field(pr.hdr, RISCV_IOMMU_PREQ_HDR_DID, ctx->devid);
+        pr.payload = (iotlb->iova & TARGET_PAGE_MASK) |
+                     RISCV_IOMMU_PREQ_PAYLOAD_M;
+        riscv_iommu_pri(s, &pr);
+        return fault;
+    }
+
+    if (fault) {
+        unsigned ttype;
+
+        /*
+         * IOMMU_RW is IOMMU_RO | IOMMU_WO, so masking with it is also true
+         * for plain reads. Only the write bit selects the write fault type.
+         */
+        if (iotlb->perm & IOMMU_WO) {
+            ttype = RISCV_IOMMU_FQ_TTYPE_UADDR_WR;
+        } else {
+            ttype = RISCV_IOMMU_FQ_TTYPE_UADDR_RD;
+        }
+
+        riscv_iommu_report_fault(s, ctx, ttype, fault, enable_pid,
+                                 iotlb->iova, iotlb->translated_addr);
+        return fault;
+    }
+
+    return 0;
+}
+
+/* IOMMU Command Interface */
+/*
+ * Complete an IOFENCE command.
+ *
+ * When @notify is set, the 32-bit @data value is written to guest memory at
+ * @addr and the result of that write is returned; otherwise nothing is
+ * written and MEMTX_OK is returned.
+ */
+static MemTxResult riscv_iommu_iofence(RISCVIOMMUState *s, bool notify,
+    uint64_t addr, uint32_t data)
+{
+    uint32_t data_le;
+
+    /*
+     * ATS processing in this implementation of the IOMMU is synchronous,
+     * no need to wait for completions here.
+     */
+    if (!notify) {
+        return MEMTX_OK;
+    }
+
+    /*
+     * In-memory structures are accessed as little-endian elsewhere in this
+     * file (le64_to_cpu on directory/context reads); store the completion
+     * word the same way so the result does not depend on host byte order.
+     */
+    data_le = cpu_to_le32(data);
+
+    return dma_memory_write(s->target_as, addr, &data_le, sizeof(data_le),
+        MEMTXATTRS_UNSPECIFIED);
+}
+
+/*
+ * Apply a guest write to the DDTP register.
+ *
+ * Allowed DDTP.MODE transitions:
+ *   {OFF, BARE}        -> {OFF, BARE, 1LVL, 2LVL, 3LVL}
+ *   {1LVL, 2LVL, 3LVL} -> {OFF, BARE}
+ * plus any write that keeps the current mode. An accepted value is reduced
+ * to its PPN and MODE fields; a rejected one is replaced by the previous
+ * DDTP. The outcome is stored in s->ddtp and written back to the register.
+ */
+static void riscv_iommu_process_ddtp(RISCVIOMMUState *s)
+{
+    const uint64_t prev = s->ddtp;
+    const uint64_t req = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_DDTP);
+    const unsigned req_mode = get_field(req, RISCV_IOMMU_DDTP_MODE);
+    const unsigned prev_mode = get_field(prev, RISCV_IOMMU_DDTP_MODE);
+
+    const bool to_idle = req_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
+                         req_mode == RISCV_IOMMU_DDTP_MODE_BARE;
+    const bool to_table = req_mode == RISCV_IOMMU_DDTP_MODE_1LVL ||
+                          req_mode == RISCV_IOMMU_DDTP_MODE_2LVL ||
+                          req_mode == RISCV_IOMMU_DDTP_MODE_3LVL;
+    const bool from_idle = prev_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
+                           prev_mode == RISCV_IOMMU_DDTP_MODE_BARE;
+    uint64_t result = prev;
+
+    if (req_mode == prev_mode || to_idle || (to_table && from_idle)) {
+        /* clear reserved and busy bits, report back sanitized version */
+        result = set_field(req & RISCV_IOMMU_DDTP_PPN,
+                           RISCV_IOMMU_DDTP_MODE, req_mode);
+    }
+
+    s->ddtp = result;
+    riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_DDTP, result);
+}
+
+/*
+ * Command function and opcode field.
+ * Packs a function code (shifted left by 7) together with an opcode into a
+ * single value, used for the case labels of the command dispatch switch.
+ */
+#define RISCV_IOMMU_CMD(func, op) (((func) << 7) | (op))
+/*
+ * Execute pending command queue entries.
+ *
+ * Walks the queue from the head index (CQH) to the tail index (CQT), running
+ * one command per iteration and advancing CQH after each completed command.
+ * A failed memory access sets CQCSR.CQMF and an unknown or malformed command
+ * sets CQCSR.CMD_ILL; both stop processing without advancing CQH and raise
+ * the CQ interrupt if CQCSR.CIE was set on entry. Nothing is done when the
+ * queue is not on or an error bit is already pending.
+ */
+static void riscv_iommu_process_cq_tail(RISCVIOMMUState *s)
+{
+    struct riscv_iommu_command cmd;
+    MemTxResult res;
+    dma_addr_t addr;
+    uint32_t tail, head, ctrl;
+    uint64_t cmd_opcode;
+    GHFunc func;
+
+    ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
+    tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQT) & s->cq_mask;
+    head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQH) & s->cq_mask;
+
+    /* Check for pending error or queue processing disabled */
+    if (!(ctrl & RISCV_IOMMU_CQCSR_CQON) ||
+        !!(ctrl & (RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CQMF))) {
+        return;
+    }
+
+    while (tail != head) {
+        addr = s->cq_addr  + head * sizeof(cmd);
+        res = dma_memory_read(s->target_as, addr, &cmd, sizeof(cmd),
+                              MEMTXATTRS_UNSPECIFIED);
+
+        if (res != MEMTX_OK) {
+            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
+                                  RISCV_IOMMU_CQCSR_CQMF, 0);
+            goto fault;
+        }
+        /*
+         * NOTE(review): cmd.dword0/dword1 are used exactly as read from guest
+         * memory, with no byte swap - confirm whether big-endian hosts need
+         * an le64_to_cpu() conversion here.
+         */
+        trace_riscv_iommu_cmd(s->parent_obj.id, cmd.dword0, cmd.dword1);
+
+        cmd_opcode = get_field(cmd.dword0,
+                               RISCV_IOMMU_CMD_OPCODE | RISCV_IOMMU_CMD_FUNC);
+
+        switch (cmd_opcode) {
+        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOFENCE_FUNC_C,
+                             RISCV_IOMMU_CMD_IOFENCE_OPCODE):
+            res = riscv_iommu_iofence(s,
+                cmd.dword0 & RISCV_IOMMU_CMD_IOFENCE_AV, cmd.dword1,
+                get_field(cmd.dword0, RISCV_IOMMU_CMD_IOFENCE_DATA));
+
+            if (res != MEMTX_OK) {
+                riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
+                                      RISCV_IOMMU_CQCSR_CQMF, 0);
+                goto fault;
+            }
+            break;
+
+        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA,
+                             RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
+            if (cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_PSCV) {
+                /* illegal command arguments IOTINVAL.GVMA & PSCV == 1 */
+                goto cmd_ill;
+            }
+            /* translation cache not implemented yet */
+            break;
+
+        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA,
+                             RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
+            /* translation cache not implemented yet */
+            break;
+
+        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT,
+                             RISCV_IOMMU_CMD_IODIR_OPCODE):
+            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
+                /* invalidate all device context cache mappings */
+                func = __ctx_inval_all;
+            } else {
+                /* invalidate all device context matching DID */
+                func = __ctx_inval_devid;
+            }
+            riscv_iommu_ctx_inval(s, func,
+                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID), 0);
+            break;
+
+        case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT,
+                             RISCV_IOMMU_CMD_IODIR_OPCODE):
+            if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
+                /* illegal command arguments IODIR_PDT & DV == 0 */
+                goto cmd_ill;
+            } else {
+                func = __ctx_inval_devid_procid;
+            }
+            riscv_iommu_ctx_inval(s, func,
+                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID),
+                get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_PID));
+            break;
+
+        default:
+        cmd_ill:
+            /*
+             * Invalid instruction, do not advance instruction index.
+             * Also reached by "goto cmd_ill" from cases that reject their
+             * arguments.
+             */
+            riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
+                RISCV_IOMMU_CQCSR_CMD_ILL, 0);
+            goto fault;
+        }
+
+        /* Advance and update head pointer after command completes. */
+        head = (head + 1) & s->cq_mask;
+        riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_CQH, head);
+    }
+    return;
+
+fault:
+    /* Interrupt only if it was enabled in the CQCSR value read on entry. */
+    if (ctrl & RISCV_IOMMU_CQCSR_CIE) {
+        riscv_iommu_notify(s, RISCV_IOMMU_INTR_CQ);
+    }
+}
+
+/*
+ * React to a write of the command queue control register (CQCSR).
+ *
+ * Enable request on an inactive queue: latch base address and index mask
+ * from CQB, reset head and tail, set CQON and clear the error bits.
+ * Disable request on an active queue: make CQT fully read-only and clear
+ * CQON. In every case the BUSY bit is cleared.
+ */
+static void riscv_iommu_process_cq_control(RISCVIOMMUState *s)
+{
+    const uint32_t csr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
+    const bool want_on = !!(csr & RISCV_IOMMU_CQCSR_CQEN);
+    const bool is_on = !!(csr & RISCV_IOMMU_CQCSR_CQON);
+    uint32_t set_bits = 0;
+    uint32_t clr_bits = RISCV_IOMMU_CQCSR_BUSY;
+
+    if (want_on && !is_on) {
+        const uint64_t cqb = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_CQB);
+
+        s->cq_mask = (2ULL << get_field(cqb, RISCV_IOMMU_CQB_LOG2SZ)) - 1;
+        s->cq_addr = PPN_PHYS(get_field(cqb, RISCV_IOMMU_CQB_PPN));
+        /* only index bits within the queue size stay writable */
+        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~s->cq_mask);
+        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQH], 0);
+        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQT], 0);
+        set_bits = RISCV_IOMMU_CQCSR_CQON;
+        clr_bits = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQMF |
+                   RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CMD_TO |
+                   RISCV_IOMMU_CQCSR_FENCE_W_IP;
+    } else if (is_on && !want_on) {
+        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~0);
+        clr_bits = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQON;
+    }
+
+    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR, set_bits, clr_bits);
+}
+
+/*
+ * React to a write of the fault queue control register (FQCSR).
+ *
+ * Enable request on an inactive queue: latch base address and index mask
+ * from FQB, reset head and tail, set FQON and clear the error bits.
+ * Disable request on an active queue: make FQH fully read-only and clear
+ * FQON. In every case the BUSY bit is cleared.
+ */
+static void riscv_iommu_process_fq_control(RISCVIOMMUState *s)
+{
+    const uint32_t csr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
+    const bool want_on = !!(csr & RISCV_IOMMU_FQCSR_FQEN);
+    const bool is_on = !!(csr & RISCV_IOMMU_FQCSR_FQON);
+    uint32_t set_bits = 0;
+    uint32_t clr_bits = RISCV_IOMMU_FQCSR_BUSY;
+
+    if (want_on && !is_on) {
+        const uint64_t fqb = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_FQB);
+
+        s->fq_mask = (2ULL << get_field(fqb, RISCV_IOMMU_FQB_LOG2SZ)) - 1;
+        s->fq_addr = PPN_PHYS(get_field(fqb, RISCV_IOMMU_FQB_PPN));
+        /* only index bits within the queue size stay writable */
+        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~s->fq_mask);
+        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQH], 0);
+        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQT], 0);
+        set_bits = RISCV_IOMMU_FQCSR_FQON;
+        clr_bits = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQMF |
+            RISCV_IOMMU_FQCSR_FQOF;
+    } else if (is_on && !want_on) {
+        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~0);
+        clr_bits = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQON;
+    }
+
+    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR, set_bits, clr_bits);
+}
+
+/*
+ * React to a write of the page-request queue control register (PQCSR).
+ *
+ * Enable request on an inactive queue: latch base address and index mask
+ * from PQB, reset head and tail, set PQON and clear the error bits.
+ * Disable request on an active queue: make PQH fully read-only and clear
+ * PQON. In every case the BUSY bit is cleared.
+ */
+static void riscv_iommu_process_pq_control(RISCVIOMMUState *s)
+{
+    const uint32_t csr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
+    const bool want_on = !!(csr & RISCV_IOMMU_PQCSR_PQEN);
+    const bool is_on = !!(csr & RISCV_IOMMU_PQCSR_PQON);
+    uint32_t set_bits = 0;
+    uint32_t clr_bits = RISCV_IOMMU_PQCSR_BUSY;
+
+    if (want_on && !is_on) {
+        const uint64_t pqb = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_PQB);
+
+        s->pq_mask = (2ULL << get_field(pqb, RISCV_IOMMU_PQB_LOG2SZ)) - 1;
+        s->pq_addr = PPN_PHYS(get_field(pqb, RISCV_IOMMU_PQB_PPN));
+        /* only index bits within the queue size stay writable */
+        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~s->pq_mask);
+        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQH], 0);
+        stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQT], 0);
+        set_bits = RISCV_IOMMU_PQCSR_PQON;
+        clr_bits = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQMF |
+            RISCV_IOMMU_PQCSR_PQOF;
+    } else if (is_on && !want_on) {
+        stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~0);
+        clr_bits = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQON;
+    }
+
+    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR, set_bits, clr_bits);
+}
+/*
+ * React to a write of the ICVEC register: read the new value and pass the
+ * vectors extracted for the CQ, FQ, PM and PQ interrupt sources, in that
+ * order, to the s->icvec_update callback.
+ * NOTE(review): the callback is invoked unconditionally - confirm that every
+ * front end installs it.
+ */
+static void riscv_iommu_process_icvec_update(RISCVIOMMUState *s)
+{
+    uint32_t icvec = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_ICVEC);
+
+    s->icvec_update(s,
+                    riscv_iommu_get_icvec_vector(icvec, RISCV_IOMMU_INTR_CQ),
+                    riscv_iommu_get_icvec_vector(icvec, RISCV_IOMMU_INTR_FQ),
+                    riscv_iommu_get_icvec_vector(icvec, RISCV_IOMMU_INTR_PM),
+                    riscv_iommu_get_icvec_vector(icvec, RISCV_IOMMU_INTR_PQ));
+}
+/*
+ * Handler type for register writes that need follow-up processing;
+ * riscv_iommu_mmio_write() selects one per register and calls it after the
+ * register storage has been updated.
+ */
+typedef void riscv_iommu_process_fn(RISCVIOMMUState *s);
+
+/*
+ * Recompute the interrupt pending bits of IPSR.
+ *
+ * For each of CIP, FIP and PIP: the bit is set only if it is set in @data,
+ * the matching queue has its interrupt enable bit set and at least one of
+ * the queue's status/error bits is pending; otherwise the bit is cleared.
+ * A queue CSR is read only when its bit is set in @data.
+ */
+static void riscv_iommu_update_ipsr(RISCVIOMMUState *s, uint64_t data)
+{
+    bool cq_pending = false;
+    bool fq_pending = false;
+    bool pq_pending = false;
+    uint32_t to_set = 0;
+    uint32_t to_clear = 0;
+
+    if (data & RISCV_IOMMU_IPSR_CIP) {
+        const uint32_t csr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
+
+        cq_pending = (csr & RISCV_IOMMU_CQCSR_CIE) &&
+                     (csr & (RISCV_IOMMU_CQCSR_FENCE_W_IP |
+                             RISCV_IOMMU_CQCSR_CMD_ILL |
+                             RISCV_IOMMU_CQCSR_CMD_TO |
+                             RISCV_IOMMU_CQCSR_CQMF));
+    }
+
+    if (data & RISCV_IOMMU_IPSR_FIP) {
+        const uint32_t csr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
+
+        fq_pending = (csr & RISCV_IOMMU_FQCSR_FIE) &&
+                     (csr & (RISCV_IOMMU_FQCSR_FQOF |
+                             RISCV_IOMMU_FQCSR_FQMF));
+    }
+
+    if (data & RISCV_IOMMU_IPSR_PIP) {
+        const uint32_t csr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
+
+        pq_pending = (csr & RISCV_IOMMU_PQCSR_PIE) &&
+                     (csr & (RISCV_IOMMU_PQCSR_PQOF |
+                             RISCV_IOMMU_PQCSR_PQMF));
+    }
+
+    if (cq_pending) {
+        to_set |= RISCV_IOMMU_IPSR_CIP;
+    } else {
+        to_clear |= RISCV_IOMMU_IPSR_CIP;
+    }
+
+    if (fq_pending) {
+        to_set |= RISCV_IOMMU_IPSR_FIP;
+    } else {
+        to_clear |= RISCV_IOMMU_IPSR_FIP;
+    }
+
+    if (pq_pending) {
+        to_set |= RISCV_IOMMU_IPSR_PIP;
+    } else {
+        to_clear |= RISCV_IOMMU_IPSR_PIP;
+    }
+
+    riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, to_set, to_clear);
+}
+
+/*
+ * MMIO write handler for the IOMMU register file.
+ *
+ * Rejects misaligned accesses (MEMTX_ERROR) and accesses reaching
+ * RISCV_IOMMU_REG_MSI_CONFIG or beyond (MEMTX_ACCESS_ERROR). For all other
+ * writes the new register value is derived from three per-byte masks:
+ * bits set in regs_ro keep their current value, remaining bits take the
+ * written value, and bits set in regs_wc are cleared when written as 1.
+ *
+ * Registers with side effects get their BUSY bit set (where one exists) and
+ * their process function is run under core_lock after the update. IPSR is
+ * special: its value is only computed here and then handed to
+ * riscv_iommu_update_ipsr(), which updates the register itself.
+ */
+static MemTxResult riscv_iommu_mmio_write(void *opaque, hwaddr addr,
+    uint64_t data, unsigned size, MemTxAttrs attrs)
+{
+    riscv_iommu_process_fn *process_fn = NULL;
+    RISCVIOMMUState *s = opaque;
+    uint32_t regb = addr & ~3;
+    uint32_t busy = 0;
+    uint64_t val = 0;
+
+    if ((addr & (size - 1)) != 0) {
+        /* Unsupported MMIO alignment or access size */
+        return MEMTX_ERROR;
+    }
+
+    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
+        /* Unsupported MMIO access location. */
+        return MEMTX_ACCESS_ERROR;
+    }
+
+    /* Track actionable MMIO write. */
+    switch (regb) {
+    case RISCV_IOMMU_REG_DDTP:
+    case RISCV_IOMMU_REG_DDTP + 4:
+        process_fn = riscv_iommu_process_ddtp;
+        regb = RISCV_IOMMU_REG_DDTP;
+        busy = RISCV_IOMMU_DDTP_BUSY;
+        break;
+
+    case RISCV_IOMMU_REG_CQT:
+        process_fn = riscv_iommu_process_cq_tail;
+        break;
+
+    case RISCV_IOMMU_REG_CQCSR:
+        process_fn = riscv_iommu_process_cq_control;
+        busy = RISCV_IOMMU_CQCSR_BUSY;
+        break;
+
+    case RISCV_IOMMU_REG_FQCSR:
+        process_fn = riscv_iommu_process_fq_control;
+        busy = RISCV_IOMMU_FQCSR_BUSY;
+        break;
+
+    case RISCV_IOMMU_REG_PQCSR:
+        process_fn = riscv_iommu_process_pq_control;
+        busy = RISCV_IOMMU_PQCSR_BUSY;
+        break;
+
+    case RISCV_IOMMU_REG_ICVEC:
+        process_fn = riscv_iommu_process_icvec_update;
+        break;
+
+    case RISCV_IOMMU_REG_IPSR:
+        /*
+         * IPSR has special procedures to update. Execute it
+         * and exit.
+         *
+         * val is consumed below as a host-order integer, so the computed
+         * value is assigned directly rather than stored through the
+         * little-endian store helpers (which would scramble it on a
+         * big-endian host).
+         */
+        if (size == 4) {
+            uint32_t ro = ldl_le_p(&s->regs_ro[addr]);
+            uint32_t wc = ldl_le_p(&s->regs_wc[addr]);
+            uint32_t rw = ldl_le_p(&s->regs_rw[addr]);
+            val = (uint32_t)(((rw & ro) | (data & ~ro)) & ~(data & wc));
+        } else if (size == 8) {
+            uint64_t ro = ldq_le_p(&s->regs_ro[addr]);
+            uint64_t wc = ldq_le_p(&s->regs_wc[addr]);
+            uint64_t rw = ldq_le_p(&s->regs_rw[addr]);
+            val = ((rw & ro) | (data & ~ro)) & ~(data & wc);
+        }
+
+        riscv_iommu_update_ipsr(s, val);
+
+        return MEMTX_OK;
+
+    default:
+        break;
+    }
+
+    /*
+     * Registers update might be not synchronized with core logic.
+     * If system software updates register when relevant BUSY bit
+     * is set IOMMU behavior of additional writes to the register
+     * is UNSPECIFIED.
+     */
+    qemu_spin_lock(&s->regs_lock);
+    if (size == 1) {
+        uint8_t ro = s->regs_ro[addr];
+        uint8_t wc = s->regs_wc[addr];
+        uint8_t rw = s->regs_rw[addr];
+        s->regs_rw[addr] = ((rw & ro) | (data & ~ro)) & ~(data & wc);
+    } else if (size == 2) {
+        uint16_t ro = lduw_le_p(&s->regs_ro[addr]);
+        uint16_t wc = lduw_le_p(&s->regs_wc[addr]);
+        uint16_t rw = lduw_le_p(&s->regs_rw[addr]);
+        stw_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
+    } else if (size == 4) {
+        uint32_t ro = ldl_le_p(&s->regs_ro[addr]);
+        uint32_t wc = ldl_le_p(&s->regs_wc[addr]);
+        uint32_t rw = ldl_le_p(&s->regs_rw[addr]);
+        stl_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
+    } else if (size == 8) {
+        uint64_t ro = ldq_le_p(&s->regs_ro[addr]);
+        uint64_t wc = ldq_le_p(&s->regs_wc[addr]);
+        uint64_t rw = ldq_le_p(&s->regs_rw[addr]);
+        stq_le_p(&s->regs_rw[addr], ((rw & ro) | (data & ~ro)) & ~(data & wc));
+    }
+
+    /* Busy flag update, MSB 4-byte register. */
+    if (busy) {
+        uint32_t rw = ldl_le_p(&s->regs_rw[regb]);
+        stl_le_p(&s->regs_rw[regb], rw | busy);
+    }
+    qemu_spin_unlock(&s->regs_lock);
+
+    /* Side effects run after the register storage has been updated. */
+    if (process_fn) {
+        qemu_mutex_lock(&s->core_lock);
+        process_fn(s);
+        qemu_mutex_unlock(&s->core_lock);
+    }
+
+    return MEMTX_OK;
+}
+
+/*
+ * MMIO read handler for the IOMMU register file.
+ *
+ * Returns the little-endian value stored in regs_rw at @addr. Misaligned
+ * accesses and unsupported sizes yield MEMTX_ERROR; accesses reaching
+ * RISCV_IOMMU_REG_MSI_CONFIG or beyond yield MEMTX_ACCESS_ERROR. *data is
+ * written only on success.
+ */
+static MemTxResult riscv_iommu_mmio_read(void *opaque, hwaddr addr,
+    uint64_t *data, unsigned size, MemTxAttrs attrs)
+{
+    RISCVIOMMUState *s = opaque;
+    uint64_t value = -1;
+    uint8_t *reg;
+
+    if ((addr & (size - 1)) != 0) {
+        /* Unsupported MMIO alignment. */
+        return MEMTX_ERROR;
+    }
+
+    if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
+        return MEMTX_ACCESS_ERROR;
+    }
+
+    reg = &s->regs_rw[addr];
+
+    switch (size) {
+    case 1:
+        value = (uint64_t)*reg;
+        break;
+    case 2:
+        value = lduw_le_p(reg);
+        break;
+    case 4:
+        value = ldl_le_p(reg);
+        break;
+    case 8:
+        value = ldq_le_p(reg);
+        break;
+    default:
+        return MEMTX_ERROR;
+    }
+
+    *data = value;
+
+    return MEMTX_OK;
+}
+/*
+ * Access callbacks for the IOMMU register region ("riscv-iommu-regs").
+ * Both the accepted and the implemented access sizes are 4 to 8 bytes.
+ */
+static const MemoryRegionOps riscv_iommu_mmio_ops = {
+    .read_with_attrs = riscv_iommu_mmio_read,
+    .write_with_attrs = riscv_iommu_mmio_write,
+    .endianness = DEVICE_NATIVE_ENDIAN,
+    .impl = {
+        .min_access_size = 4,
+        .max_access_size = 8,
+        .unaligned = false,
+    },
+    .valid = {
+        .min_access_size = 4,
+        .max_access_size = 8,
+    }
+};
+
+/*
+ * Translations matching MSI pattern check are redirected to "riscv-iommu-trap"
+ * memory region as untranslated address, for additional MSI/MRIF interception
+ * by IOMMU interrupt remapping implementation.
+ * Note: Device emulation code generating an MSI is expected to provide valid
+ * memory transaction attributes with requester_id set.
+ */
+static MemTxResult riscv_iommu_trap_write(void *opaque, hwaddr addr,
+    uint64_t data, unsigned size, MemTxAttrs attrs)
+{
+    RISCVIOMMUState *s = opaque;
+    RISCVIOMMUContext *ctx;
+    MemTxResult res = MEMTX_ACCESS_ERROR;
+    void *ref;
+    uint32_t devid;
+
+    /* The requester id is needed to pick the device context. */
+    if (attrs.unspecified) {
+        return MEMTX_ACCESS_ERROR;
+    }
+
+    /* FIXME: PCIe bus remapping for attached endpoints. */
+    devid = attrs.requester_id;
+    devid |= s->bus << 8;
+
+    ctx = riscv_iommu_ctx(s, devid, 0, &ref);
+    if (ctx != NULL) {
+        res = riscv_iommu_msi_write(s, ctx, addr, data, size, attrs);
+    }
+
+    /* ref is NULL when no context was found; the put handles that. */
+    riscv_iommu_ctx_put(s, ref);
+    return res;
+}
+/* Reads from the trap region are never serviced: always an access error. */
+static MemTxResult riscv_iommu_trap_read(void *opaque, hwaddr addr,
+    uint64_t *data, unsigned size, MemTxAttrs attrs)
+{
+    return MEMTX_ACCESS_ERROR;
+}
+/*
+ * Access callbacks for the "riscv-iommu-trap" region: writes go through
+ * riscv_iommu_trap_write(), reads always fail.
+ */
+static const MemoryRegionOps riscv_iommu_trap_ops = {
+    .read_with_attrs = riscv_iommu_trap_read,
+    .write_with_attrs = riscv_iommu_trap_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+    .impl = {
+        .min_access_size = 4,
+        .max_access_size = 8,
+        .unaligned = true,
+    },
+    .valid = {
+        .min_access_size = 4,
+        .max_access_size = 8,
+    }
+};
+/*
+ * Realize the IOMMU core device.
+ *
+ * Builds the capability word from the configured properties, allocates the
+ * register file (current values in regs_rw, read-only masks in regs_ro,
+ * write-1-to-clear masks in regs_wc) and sets its power-on state, creates
+ * the register and trap memory regions, selects the downstream address
+ * space, and initializes the context cache, the space list and the locks.
+ * @errp is not used by this function.
+ */
+static void riscv_iommu_realize(DeviceState *dev, Error **errp)
+{
+    RISCVIOMMUState *s = RISCV_IOMMU(dev);
+
+    s->cap = s->version & RISCV_IOMMU_CAP_VERSION;
+    if (s->enable_msi) {
+        s->cap |= RISCV_IOMMU_CAP_MSI_FLAT | RISCV_IOMMU_CAP_MSI_MRIF;
+    }
+    if (s->enable_s_stage) {
+        s->cap |= RISCV_IOMMU_CAP_SV32 | RISCV_IOMMU_CAP_SV39 |
+                  RISCV_IOMMU_CAP_SV48 | RISCV_IOMMU_CAP_SV57;
+    }
+    if (s->enable_g_stage) {
+        s->cap |= RISCV_IOMMU_CAP_SV32X4 | RISCV_IOMMU_CAP_SV39X4 |
+                  RISCV_IOMMU_CAP_SV48X4 | RISCV_IOMMU_CAP_SV57X4;
+    }
+    /* Report QEMU target physical address space limits */
+    s->cap = set_field(s->cap, RISCV_IOMMU_CAP_PAS,
+                       TARGET_PHYS_ADDR_SPACE_BITS);
+
+    /* TODO: method to report supported PID bits */
+    s->pid_bits = 8; /* restricted to size of MemTxAttrs.pid */
+    s->cap |= RISCV_IOMMU_CAP_PD8;
+
+    /*
+     * Out-of-reset translation mode: OFF (DMA disabled) when the "off"
+     * property is set, otherwise BARE (passthrough).
+     */
+    s->ddtp = set_field(0, RISCV_IOMMU_DDTP_MODE, s->enable_off ?
+                        RISCV_IOMMU_DDTP_MODE_OFF : RISCV_IOMMU_DDTP_MODE_BARE);
+
+    /* register storage */
+    s->regs_rw = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
+    s->regs_ro = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
+    s->regs_wc = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
+
+     /*
+      * Mark all registers read-only; the writable fields are opened up
+      * below by storing masks with those bits cleared.
+      */
+    memset(s->regs_ro, 0xff, RISCV_IOMMU_REG_SIZE);
+
+    /*
+     * Register complete MMIO space, including MSI/PBA registers.
+     * Note, PCIDevice implementation will add overlapping MR for MSI/PBA,
+     * managed directly by the PCIDevice implementation.
+     */
+    memory_region_init_io(&s->regs_mr, OBJECT(dev), &riscv_iommu_mmio_ops, s,
+        "riscv-iommu-regs", RISCV_IOMMU_REG_SIZE);
+
+    /* Set power-on register state */
+    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_CAP], s->cap);
+    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_FCTL], 0);
+    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FCTL],
+             ~(RISCV_IOMMU_FCTL_BE | RISCV_IOMMU_FCTL_WSI));
+    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_DDTP],
+        ~(RISCV_IOMMU_DDTP_PPN | RISCV_IOMMU_DDTP_MODE));
+    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQB],
+        ~(RISCV_IOMMU_CQB_LOG2SZ | RISCV_IOMMU_CQB_PPN));
+    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQB],
+        ~(RISCV_IOMMU_FQB_LOG2SZ | RISCV_IOMMU_FQB_PPN));
+    stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQB],
+        ~(RISCV_IOMMU_PQB_LOG2SZ | RISCV_IOMMU_PQB_PPN));
+    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQMF |
+        RISCV_IOMMU_CQCSR_CMD_TO | RISCV_IOMMU_CQCSR_CMD_ILL);
+    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQON |
+        RISCV_IOMMU_CQCSR_BUSY);
+    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQMF |
+        RISCV_IOMMU_FQCSR_FQOF);
+    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQON |
+        RISCV_IOMMU_FQCSR_BUSY);
+    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQMF |
+        RISCV_IOMMU_PQCSR_PQOF);
+    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQON |
+        RISCV_IOMMU_PQCSR_BUSY);
+    stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_IPSR], ~0);
+    stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_ICVEC], 0);
+    stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_DDTP], s->ddtp);
+
+    /* Memory region for downstream access, if specified. */
+    if (s->target_mr) {
+        s->target_as = g_new0(AddressSpace, 1);
+        address_space_init(s->target_as, s->target_mr,
+            "riscv-iommu-downstream");
+    } else {
+        /* Fallback to global system memory. */
+        s->target_as = &address_space_memory;
+    }
+
+    /* Memory region for untranslated MRIF/MSI writes */
+    memory_region_init_io(&s->trap_mr, OBJECT(dev), &riscv_iommu_trap_ops, s,
+            "riscv-iommu-trap", ~0ULL);
+    address_space_init(&s->trap_as, &s->trap_mr, "riscv-iommu-trap-as");
+
+    /*
+     * Device translation context cache; keys are owned by the table and
+     * released with g_free.
+     */
+    s->ctx_cache = g_hash_table_new_full(__ctx_hash, __ctx_equal,
+                                         g_free, NULL);
+    qemu_mutex_init(&s->ctx_lock);
+
+    s->iommus.le_next = NULL;
+    s->iommus.le_prev = NULL;
+    QLIST_INIT(&s->spaces);
+    qemu_mutex_init(&s->core_lock);
+    qemu_spin_init(&s->regs_lock);
+}
+
+/*
+ * Undo riscv_iommu_realize(): drop the context cache, destroy every lock
+ * that realize initialized and release the register storage.
+ */
+static void riscv_iommu_unrealize(DeviceState *dev)
+{
+    RISCVIOMMUState *s = RISCV_IOMMU(dev);
+
+    qemu_mutex_destroy(&s->core_lock);
+    g_hash_table_unref(s->ctx_cache);
+
+    /* ctx_lock and regs_lock are initialized in realize as well. */
+    qemu_mutex_destroy(&s->ctx_lock);
+    qemu_spin_destroy(&s->regs_lock);
+
+    /* Register arrays are allocated with g_new0() in realize. */
+    g_free(s->regs_rw);
+    s->regs_rw = NULL;
+    g_free(s->regs_ro);
+    s->regs_ro = NULL;
+    g_free(s->regs_wc);
+    s->regs_wc = NULL;
+}
+
+/*
+ * qdev properties of the base RISC-V IOMMU device.
+ * All boolean feature switches default to enabled.
+ */
+static Property riscv_iommu_properties[] = {
+    /* Reported interface version number */
+    DEFINE_PROP_UINT32("version", RISCVIOMMUState, version,
+                       RISCV_IOMMU_SPEC_DOT_VER),
+    /* PCI bus mapping for non-root endpoints */
+    DEFINE_PROP_UINT32("bus", RISCVIOMMUState, bus, 0x0),
+    /* MSI remapping */
+    DEFINE_PROP_BOOL("intremap", RISCVIOMMUState, enable_msi, true),
+    /* Out-of-reset OFF mode (DMA disabled) */
+    DEFINE_PROP_BOOL("off", RISCVIOMMUState, enable_off, true),
+    /* S/VS-stage translation */
+    DEFINE_PROP_BOOL("s-stage", RISCVIOMMUState, enable_s_stage, true),
+    /* G-stage translation */
+    DEFINE_PROP_BOOL("g-stage", RISCVIOMMUState, enable_g_stage, true),
+    /* Optional memory region used for downstream accesses */
+    DEFINE_PROP_LINK("downstream-mr", RISCVIOMMUState, target_mr,
+                     TYPE_MEMORY_REGION, MemoryRegion *),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+/*
+ * QOM class initialiser for TYPE_RISCV_IOMMU: wires up the realize and
+ * unrealize hooks and registers the device properties.
+ */
+static void riscv_iommu_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    device_class_set_props(dc, riscv_iommu_properties);
+    dc->realize = riscv_iommu_realize;
+    dc->unrealize = riscv_iommu_unrealize;
+
+    /* internal device for riscv-iommu-{pci/sys}, not user-creatable */
+    dc->user_creatable = false;
+}
+
+/* QOM type description of the base RISC-V IOMMU device. */
+static const TypeInfo riscv_iommu_info = {
+    .parent = TYPE_DEVICE,
+    .name = TYPE_RISCV_IOMMU,
+    .class_init = riscv_iommu_class_init,
+    .instance_size = sizeof(RISCVIOMMUState),
+};
+
+/* Printable access direction for tracing, indexed by (flag & IOMMU_RW). */
+static const char *const IOMMU_FLAG_STR[] = {
+    [IOMMU_NONE] = "NA",
+    [IOMMU_RO]   = "RO",
+    [IOMMU_WO]   = "WR",
+    [IOMMU_RW]   = "RW",
+};
+
+/*
+ * RISC-V IOMMU Memory Region - Address Translation Space
+ *
+ * Translate callback of the IOMMU memory region. Looks up the device context
+ * for the address space owning @iommu_mr (with @iommu_idx as the process
+ * identifier) and asks riscv_iommu_translate() to resolve @addr for the
+ * access described by @flag.
+ *
+ * Returns the resulting IOTLB entry. When no context is available or the
+ * translation reports a failure, the entry is returned with an address mask
+ * of zero and IOMMU_NONE permissions.
+ */
+static IOMMUTLBEntry riscv_iommu_memory_region_translate(
+    IOMMUMemoryRegion *iommu_mr, hwaddr addr,
+    IOMMUAccessFlags flag, int iommu_idx)
+{
+    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
+    IOMMUTLBEntry iotlb = {
+        .iova = addr,
+        .target_as = as->iommu->target_as,
+        .addr_mask = ~0ULL,
+        .perm = flag,
+    };
+    void *ref;
+    RISCVIOMMUContext *ctx;
+
+    ctx = riscv_iommu_ctx(as->iommu, as->devid, iommu_idx, &ref);
+
+    /* The translation is attempted only when a context was obtained. */
+    if (ctx == NULL || riscv_iommu_translate(as->iommu, ctx, &iotlb)) {
+        /* Translation disabled, invalid, or fault reported. */
+        iotlb.addr_mask = 0;
+        iotlb.perm = IOMMU_NONE;
+    }
+
+    /* Trace all dma translations with original access flags. */
+    trace_riscv_iommu_dma(as->iommu->parent_obj.id, PCI_BUS_NUM(as->devid),
+                          PCI_SLOT(as->devid), PCI_FUNC(as->devid), iommu_idx,
+                          IOMMU_FLAG_STR[flag & IOMMU_RW], iotlb.iova,
+                          iotlb.translated_addr);
+
+    /* Hand back the reference obtained together with the context. */
+    riscv_iommu_ctx_put(as->iommu, ref);
+
+    return iotlb;
+}
+
+/*
+ * Notifier flag change callback of the IOMMU memory region.
+ *
+ * Records in the owning address space whether a notifier is registered:
+ * set when the previous flags were IOMMU_NOTIFIER_NONE, cleared when the
+ * new flags are IOMMU_NOTIFIER_NONE. @errp is not used; always returns 0.
+ */
+static int riscv_iommu_memory_region_notify(
+    IOMMUMemoryRegion *iommu_mr, IOMMUNotifierFlag old,
+    IOMMUNotifierFlag new, Error **errp)
+{
+    RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
+
+    if (old == IOMMU_NOTIFIER_NONE) {
+        /* Going from "no notifier" to some notifier. */
+        as->notifier = true;
+        trace_riscv_iommu_notifier_add(iommu_mr->parent_obj.name);
+        return 0;
+    }
+
+    if (new == IOMMU_NOTIFIER_NONE) {
+        /* Last notifier went away. */
+        as->notifier = false;
+        trace_riscv_iommu_notifier_del(iommu_mr->parent_obj.name);
+    }
+
+    return 0;
+}
+
+/*
+ * Tell whether the class code in @pdev's configuration space is 0x0806
+ * (base class 0x08, sub-class 0x06), i.e. the device is itself an IOMMU.
+ */
+static inline bool pci_is_iommu(PCIDevice *pdev)
+{
+    uint16_t class_id = pci_get_word(pdev->config + PCI_CLASS_DEVICE);
+
+    return class_id == 0x0806;
+}
+
+/*
+ * PCIIOMMUOps::get_address_space callback.
+ *
+ * @bus:    PCI bus the requester sits on
+ * @opaque: the RISCVIOMMUState registered with pci_setup_iommu()
+ * @devfn:  device/function number of the requester
+ *
+ * A requester that is itself an IOMMU gets the downstream address space of
+ * @opaque. Any other requester gets the address space of the first IOMMU
+ * in the chain for which riscv_iommu_space() returns one, or the global
+ * system memory address space when none does.
+ */
+static AddressSpace *riscv_iommu_find_as(PCIBus *bus, void *opaque, int devfn)
+{
+    RISCVIOMMUState *s = (RISCVIOMMUState *) opaque;
+    PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
+    AddressSpace *as = NULL;
+
+    if (pdev && pci_is_iommu(pdev)) {
+        return s->target_as;
+    }
+
+    /*
+     * Find first registered IOMMU device.
+     *
+     * le_prev holds the address of the previous element's le_next field,
+     * so dereferencing it yields the current element again and would never
+     * make progress. Recover the previous element from the field address
+     * instead. The first IOMMU is not linked from a list head and keeps
+     * le_prev == NULL, which terminates the walk.
+     */
+    while (s->iommus.le_prev) {
+        s = container_of(s->iommus.le_prev, RISCVIOMMUState, iommus.le_next);
+    }
+
+    /* Find first matching IOMMU */
+    while (s != NULL && as == NULL) {
+        as = riscv_iommu_space(s, PCI_BUILD_BDF(pci_bus_num(bus), devfn));
+        s = s->iommus.le_next;
+    }
+
+    return as ? as : &address_space_memory;
+}
+
+/* PCI IOMMU callbacks installed on a bus by riscv_iommu_pci_setup_iommu(). */
+static const PCIIOMMUOps riscv_iommu_ops = {
+    .get_address_space = riscv_iommu_find_as,
+};
+
+/*
+ * Attach @iommu to PCI bus @bus.
+ *
+ * - If the bus already uses the RISC-V IOMMU callbacks, @iommu is linked
+ *   into the chain right after the IOMMU registered as the bus opaque.
+ * - If the bus has no IOMMU at all, @iommu is registered as its IOMMU.
+ * - Otherwise the bus is owned by a different IOMMU and an error is set
+ *   in @errp.
+ */
+void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
+        Error **errp)
+{
+    const PCIIOMMUOps *ops = bus->iommu_ops;
+
+    if (ops && ops->get_address_space == riscv_iommu_find_as) {
+        /* Allow multiple IOMMUs on the same PCIe bus, link known devices */
+        RISCVIOMMUState *registered = (RISCVIOMMUState *)bus->iommu_opaque;
+
+        QLIST_INSERT_AFTER(registered, iommu, iommus);
+        return;
+    }
+
+    if (!ops && !bus->iommu_opaque) {
+        pci_setup_iommu(bus, &riscv_iommu_ops, iommu);
+        return;
+    }
+
+    error_setg(errp, "can't register secondary IOMMU for PCI bus #%d",
+        pci_bus_num(bus));
+}
+
+/*
+ * Map transaction attributes to an IOMMU index: the process identifier
+ * carried in @attrs, or RISCV_IOMMU_NOPROCID when the attributes are
+ * unspecified.
+ */
+static int riscv_iommu_memory_region_index(IOMMUMemoryRegion *iommu_mr,
+    MemTxAttrs attrs)
+{
+    if (attrs.unspecified) {
+        return RISCV_IOMMU_NOPROCID;
+    }
+
+    return (int)attrs.pid;
+}
+
+/*
+ * Number of IOMMU indexes of the region: 2^pid_bits of the IOMMU that owns
+ * the address space containing @iommu_mr.
+ */
+static int riscv_iommu_memory_region_index_len(IOMMUMemoryRegion *iommu_mr)
+{
+    RISCVIOMMUSpace *space;
+
+    space = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
+
+    return 1 << space->iommu->pid_bits;
+}
+
+/* QOM class initialiser of the RISC-V IOMMU memory region type. */
+static void riscv_iommu_memory_region_init(ObjectClass *klass, void *data)
+{
+    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
+
+    /* Index (process identifier) handling */
+    imrc->num_indexes = riscv_iommu_memory_region_index_len;
+    imrc->attrs_to_index = riscv_iommu_memory_region_index;
+
+    /* Translation and notifier bookkeeping */
+    imrc->notify_flag_changed = riscv_iommu_memory_region_notify;
+    imrc->translate = riscv_iommu_memory_region_translate;
+}
+
+/* QOM type description of the RISC-V IOMMU memory region. */
+static const TypeInfo riscv_iommu_memory_region_info = {
+    .name = TYPE_RISCV_IOMMU_MEMORY_REGION,
+    .parent = TYPE_IOMMU_MEMORY_REGION,
+    .class_init = riscv_iommu_memory_region_init,
+};
+
+/* Register the IOMMU memory region type and the base IOMMU device type. */
+static void riscv_iommu_register_mr_types(void)
+{
+    type_register_static(&riscv_iommu_memory_region_info);
+    type_register_static(&riscv_iommu_info);
+}
+
+type_init(riscv_iommu_register_mr_types);
diff --git a/hw/riscv/riscv-iommu.h b/hw/riscv/riscv-iommu.h
new file mode 100644
index 0000000000..6d76cb9b1a
--- /dev/null
+++ b/hw/riscv/riscv-iommu.h
@@ -0,0 +1,148 @@ 
+/*
+ * QEMU emulation of an RISC-V IOMMU
+ *
+ * Copyright (C) 2022-2023 Rivos Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HW_RISCV_IOMMU_STATE_H
+#define HW_RISCV_IOMMU_STATE_H
+
+#include "qemu/osdep.h"
+#include "qom/object.h"
+
+#include "hw/riscv/iommu.h"
+
+/*
+ * State of the base RISC-V IOMMU device (TYPE_RISCV_IOMMU).
+ */
+struct RISCVIOMMUState {
+    /*< private >*/
+    DeviceState parent_obj;
+
+    /*< public >*/
+    uint32_t version;     /* Reported interface version number */
+    uint32_t pid_bits;    /* process identifier width */
+    uint32_t bus;         /* PCI bus mapping for non-root endpoints */
+
+    uint64_t cap;         /* IOMMU supported capabilities */
+    uint64_t fctl;        /* IOMMU enabled features */
+
+    bool enable_off;      /* Enable out-of-reset OFF mode (DMA disabled) */
+    bool enable_msi;      /* Enable MSI remapping */
+    bool enable_s_stage;  /* Enable S/VS-Stage translation */
+    bool enable_g_stage;  /* Enable G-Stage translation */
+
+    /* IOMMU Internal State */
+    uint64_t ddtp;        /* Validated Device Directory Tree Root Pointer */
+
+    dma_addr_t cq_addr;   /* Command queue base physical address */
+    dma_addr_t fq_addr;   /* Fault/event queue base physical address */
+    dma_addr_t pq_addr;   /* Page request queue base physical address */
+
+    uint32_t cq_mask;     /* Command queue index bit mask */
+    uint32_t fq_mask;     /* Fault/event queue index bit mask */
+    uint32_t pq_mask;     /* Page request queue index bit mask */
+
+    /* interrupt notifier */
+    void (*notify)(RISCVIOMMUState *iommu, unsigned vector);
+    /* register icvec interrupts */
+    void (*icvec_update)(RISCVIOMMUState *iommu,
+                         uint32_t civ, uint32_t fiv,
+                         uint32_t pmiv, uint32_t piv);
+
+    /* IOMMU State Machine */
+    QemuThread core_proc; /* Background processing thread */
+    QemuMutex core_lock;  /* Global IOMMU lock, used for cache/regs updates */
+    QemuCond core_cond;   /* Background processing wake up signal */
+    unsigned core_exec;   /* Processing thread execution actions */
+
+    /*
+     * IOMMU target address space.
+     * target_mr is the optional "downstream-mr" link property; target_as is
+     * the address space built on it at realize time, or the global system
+     * memory address space when no region was given.
+     */
+    AddressSpace *target_as;
+    MemoryRegion *target_mr;
+
+    /* MSI / MRIF access trap: region for untranslated MRIF/MSI writes */
+    AddressSpace trap_as;
+    MemoryRegion trap_mr;
+
+    GHashTable *ctx_cache;          /* Device translation Context Cache */
+    QemuMutex ctx_lock;      /* Device translation Cache update lock */
+
+    /* MMIO Hardware Interface */
+    MemoryRegion regs_mr;
+    QemuSpin regs_lock; /* held by the riscv_iommu_reg_{mod,set}* helpers */
+    uint8_t *regs_rw;  /* register state (user write) */
+    uint8_t *regs_wc;  /* write-1-to-clear mask */
+    uint8_t *regs_ro;  /* read-only mask */
+
+    /* Link to the other IOMMUs attached to the same PCI bus */
+    QLIST_ENTRY(RISCVIOMMUState) iommus;
+    /* List of RISCVIOMMUSpace entries (presumably one per device - verify) */
+    QLIST_HEAD(, RISCVIOMMUSpace) spaces;
+};
+
+/*
+ * Attach @iommu to PCI bus @bus. An error is reported through @errp when
+ * the bus is already served by an IOMMU of a different kind.
+ */
+void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
+         Error **errp);
+
+/* private helpers */
+
+/* Register helper functions */
+/*
+ * Read-modify-write of the 32-bit little-endian register at byte offset
+ * @idx of regs_rw, performed while holding regs_lock: the bits in @clr are
+ * cleared, then the bits in @set are set.
+ *
+ * Returns the register value from before the update.
+ */
+static inline uint32_t riscv_iommu_reg_mod32(RISCVIOMMUState *s,
+    unsigned idx, uint32_t set, uint32_t clr)
+{
+    uint32_t prev;
+
+    qemu_spin_lock(&s->regs_lock);
+    prev = ldl_le_p(&s->regs_rw[idx]);
+    stl_le_p(&s->regs_rw[idx], (prev & ~clr) | set);
+    qemu_spin_unlock(&s->regs_lock);
+
+    return prev;
+}
+
+/*
+ * Store @set into the 32-bit little-endian register at byte offset @idx of
+ * regs_rw while holding regs_lock.
+ */
+static inline void riscv_iommu_reg_set32(RISCVIOMMUState *s,
+    unsigned idx, uint32_t set)
+{
+    qemu_spin_lock(&s->regs_lock);
+    stl_le_p(&s->regs_rw[idx], set);
+    qemu_spin_unlock(&s->regs_lock);
+}
+
+/*
+ * Load the 32-bit little-endian register at byte offset @idx of regs_rw.
+ * The read is done without taking regs_lock.
+ */
+static inline uint32_t riscv_iommu_reg_get32(RISCVIOMMUState *s,
+    unsigned idx)
+{
+    return ldl_le_p(&s->regs_rw[idx]);
+}
+
+/*
+ * Read-modify-write of the 64-bit little-endian register at byte offset
+ * @idx of regs_rw, performed while holding regs_lock: the bits in @clr are
+ * cleared, then the bits in @set are set.
+ *
+ * Returns the register value from before the update.
+ */
+static inline uint64_t riscv_iommu_reg_mod64(RISCVIOMMUState *s,
+    unsigned idx, uint64_t set, uint64_t clr)
+{
+    uint64_t prev;
+
+    qemu_spin_lock(&s->regs_lock);
+    prev = ldq_le_p(&s->regs_rw[idx]);
+    stq_le_p(&s->regs_rw[idx], (prev & ~clr) | set);
+    qemu_spin_unlock(&s->regs_lock);
+
+    return prev;
+}
+
+/*
+ * Store @set into the 64-bit little-endian register at byte offset @idx of
+ * regs_rw while holding regs_lock.
+ */
+static inline void riscv_iommu_reg_set64(RISCVIOMMUState *s,
+    unsigned idx, uint64_t set)
+{
+    qemu_spin_lock(&s->regs_lock);
+    stq_le_p(&s->regs_rw[idx], set);
+    qemu_spin_unlock(&s->regs_lock);
+}
+
+/*
+ * Load the 64-bit little-endian register at byte offset @idx of regs_rw.
+ * The read is done without taking regs_lock.
+ */
+static inline uint64_t riscv_iommu_reg_get64(RISCVIOMMUState *s,
+    unsigned idx)
+{
+    return ldq_le_p(&s->regs_rw[idx]);
+}
+
+
+
+#endif
diff --git a/hw/riscv/trace-events b/hw/riscv/trace-events
new file mode 100644
index 0000000000..bdd8b657a6
--- /dev/null
+++ b/hw/riscv/trace-events
@@ -0,0 +1,12 @@ 
+# See documentation at docs/devel/tracing.rst
+
+# riscv-iommu.c
+# b/d/f are the PCI bus, slot and function numbers; all are unsigned, so the
+# function number is printed with %u in every event.
+riscv_iommu_new(const char *id, unsigned b, unsigned d, unsigned f) "%s: device attached %04x:%02x.%u"
+riscv_iommu_flt(const char *id, unsigned b, unsigned d, unsigned f, uint64_t reason, uint64_t iova) "%s: fault %04x:%02x.%u reason: 0x%"PRIx64" iova: 0x%"PRIx64
+riscv_iommu_pri(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova) "%s: page request %04x:%02x.%u iova: 0x%"PRIx64
+riscv_iommu_dma(const char *id, unsigned b, unsigned d, unsigned f, unsigned pasid, const char *dir, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u #%u %s 0x%"PRIx64" -> 0x%"PRIx64
+riscv_iommu_msi(const char *id, unsigned b, unsigned d, unsigned f, uint64_t iova, uint64_t phys) "%s: translate %04x:%02x.%u MSI 0x%"PRIx64" -> 0x%"PRIx64
+riscv_iommu_mrif_notification(const char *id, uint32_t nid, uint64_t phys) "%s: sent MRIF notification 0x%x to 0x%"PRIx64
+riscv_iommu_cmd(const char *id, uint64_t l, uint64_t u) "%s: command 0x%"PRIx64" 0x%"PRIx64
+riscv_iommu_notifier_add(const char *id) "%s: dev-iotlb notifier added"
+riscv_iommu_notifier_del(const char *id) "%s: dev-iotlb notifier removed"
diff --git a/hw/riscv/trace.h b/hw/riscv/trace.h
new file mode 100644
index 0000000000..8c0e3ca1f3
--- /dev/null
+++ b/hw/riscv/trace.h
@@ -0,0 +1 @@ 
+#include "trace/trace-hw_riscv.h"
diff --git a/include/hw/riscv/iommu.h b/include/hw/riscv/iommu.h
new file mode 100644
index 0000000000..070ee69973
--- /dev/null
+++ b/include/hw/riscv/iommu.h
@@ -0,0 +1,36 @@ 
+/*
+ * QEMU emulation of an RISC-V IOMMU
+ *
+ * Copyright (C) 2022-2023 Rivos Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HW_RISCV_IOMMU_H
+#define HW_RISCV_IOMMU_H
+
+#include "qemu/osdep.h"
+#include "qom/object.h"
+
+/* QOM type name and state type of the base RISC-V IOMMU device. */
+#define TYPE_RISCV_IOMMU "riscv-iommu"
+OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUState, RISCV_IOMMU)
+typedef struct RISCVIOMMUState RISCVIOMMUState;
+
+/* QOM type name of the IOMMU memory region; RISCVIOMMUSpace is opaque here. */
+#define TYPE_RISCV_IOMMU_MEMORY_REGION "riscv-iommu-mr"
+typedef struct RISCVIOMMUSpace RISCVIOMMUSpace;
+
+/* QOM type name and state type of the "riscv-iommu-pci" device. */
+#define TYPE_RISCV_IOMMU_PCI "riscv-iommu-pci"
+OBJECT_DECLARE_SIMPLE_TYPE(RISCVIOMMUStatePci, RISCV_IOMMU_PCI)
+typedef struct RISCVIOMMUStatePci RISCVIOMMUStatePci;
+
+#endif
diff --git a/meson.build b/meson.build
index a1e51277b0..359f836d8e 100644
--- a/meson.build
+++ b/meson.build
@@ -3373,6 +3373,7 @@  if have_system
     'hw/pci-host',
     'hw/ppc',
     'hw/rtc',
+    'hw/riscv',
     'hw/s390x',
     'hw/scsi',
     'hw/sd',