diff mbox series

[v3,09/13] hw/riscv/riscv-iommu: add s-stage and g-stage support

Message ID 20240523173955.1940072-10-dbarboza@ventanamicro.com
State New
Headers show
Series riscv: QEMU RISC-V IOMMU Support | expand

Commit Message

Daniel Henrique Barboza May 23, 2024, 5:39 p.m. UTC
From: Tomasz Jeznach <tjeznach@rivosinc.com>

Add support for s-stage (sv32, sv39, sv48, sv57 caps) and g-stage
(sv32x4, sv39x4, sv48x4, sv57x4 caps). Most of the work is done in the
riscv_iommu_spa_fetch() function that now has to consider how many
translation stages we need to walk the page table.

Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
---
 hw/riscv/riscv-iommu-bits.h |  11 ++
 hw/riscv/riscv-iommu.c      | 331 +++++++++++++++++++++++++++++++++++-
 hw/riscv/riscv-iommu.h      |   2 +
 3 files changed, 336 insertions(+), 8 deletions(-)

Comments

Jason Chien June 18, 2024, 10:30 a.m. UTC | #1
Hi Daniel,

On 2024/5/24 上午 01:39, Daniel Henrique Barboza wrote:
> From: Tomasz Jeznach <tjeznach@rivosinc.com>
>
> Add support for s-stage (sv32, sv39, sv48, sv57 caps) and g-stage
> (sv32x4, sv39x4, sv48x4, sv57x4 caps). Most of the work is done in the
> riscv_iommu_spa_fetch() function that now has to consider how many
> translation stages we need to walk the page table.
>
> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
> Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
> ---
>   hw/riscv/riscv-iommu-bits.h |  11 ++
>   hw/riscv/riscv-iommu.c      | 331 +++++++++++++++++++++++++++++++++++-
>   hw/riscv/riscv-iommu.h      |   2 +
>   3 files changed, 336 insertions(+), 8 deletions(-)
>
> diff --git a/hw/riscv/riscv-iommu-bits.h b/hw/riscv/riscv-iommu-bits.h
> index f29b916acb..a4def7b8ec 100644
> --- a/hw/riscv/riscv-iommu-bits.h
> +++ b/hw/riscv/riscv-iommu-bits.h
> @@ -71,6 +71,14 @@ struct riscv_iommu_pq_record {
>   /* 5.3 IOMMU Capabilities (64bits) */
>   #define RISCV_IOMMU_REG_CAP             0x0000
>   #define RISCV_IOMMU_CAP_VERSION         GENMASK_ULL(7, 0)
> +#define RISCV_IOMMU_CAP_SV32            BIT_ULL(8)
> +#define RISCV_IOMMU_CAP_SV39            BIT_ULL(9)
> +#define RISCV_IOMMU_CAP_SV48            BIT_ULL(10)
> +#define RISCV_IOMMU_CAP_SV57            BIT_ULL(11)
> +#define RISCV_IOMMU_CAP_SV32X4          BIT_ULL(16)
> +#define RISCV_IOMMU_CAP_SV39X4          BIT_ULL(17)
> +#define RISCV_IOMMU_CAP_SV48X4          BIT_ULL(18)
> +#define RISCV_IOMMU_CAP_SV57X4          BIT_ULL(19)
>   #define RISCV_IOMMU_CAP_MSI_FLAT        BIT_ULL(22)
>   #define RISCV_IOMMU_CAP_MSI_MRIF        BIT_ULL(23)
>   #define RISCV_IOMMU_CAP_T2GPA           BIT_ULL(26)
> @@ -83,6 +91,7 @@ struct riscv_iommu_pq_record {
>   /* 5.4 Features control register (32bits) */
>   #define RISCV_IOMMU_REG_FCTL            0x0008
>   #define RISCV_IOMMU_FCTL_WSI            BIT(1)
> +#define RISCV_IOMMU_FCTL_GXL            BIT(2)
>   
>   /* 5.5 Device-directory-table pointer (64bits) */
>   #define RISCV_IOMMU_REG_DDTP            0x0010
> @@ -205,6 +214,8 @@ struct riscv_iommu_dc {
>   #define RISCV_IOMMU_DC_TC_DTF           BIT_ULL(4)
>   #define RISCV_IOMMU_DC_TC_PDTV          BIT_ULL(5)
>   #define RISCV_IOMMU_DC_TC_PRPR          BIT_ULL(6)
> +#define RISCV_IOMMU_DC_TC_GADE          BIT_ULL(7)
> +#define RISCV_IOMMU_DC_TC_SADE          BIT_ULL(8)
>   #define RISCV_IOMMU_DC_TC_DPE           BIT_ULL(9)
>   #define RISCV_IOMMU_DC_TC_SBE           BIT_ULL(10)
>   #define RISCV_IOMMU_DC_TC_SXL           BIT_ULL(11)
> diff --git a/hw/riscv/riscv-iommu.c b/hw/riscv/riscv-iommu.c
> index abf6ae7726..11c418b548 100644
> --- a/hw/riscv/riscv-iommu.c
> +++ b/hw/riscv/riscv-iommu.c
> @@ -58,6 +58,8 @@ struct RISCVIOMMUContext {
>       uint64_t __rfu:20;          /* reserved */
>       uint64_t tc;                /* Translation Control */
>       uint64_t ta;                /* Translation Attributes */
> +    uint64_t satp;              /* S-Stage address translation and protection */
> +    uint64_t gatp;              /* G-Stage address translation and protection */
>       uint64_t msi_addr_mask;     /* MSI filtering - address mask */
>       uint64_t msi_addr_pattern;  /* MSI filtering - address pattern */
>       uint64_t msiptp;            /* MSI redirection page table pointer */
> @@ -201,12 +203,45 @@ static bool riscv_iommu_msi_check(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
>       return true;
>   }
>   
> -/* RISCV IOMMU Address Translation Lookup - Page Table Walk */
> +/*
> + * RISCV IOMMU Address Translation Lookup - Page Table Walk
> + *
> + * Note: Code is based on get_physical_address() from target/riscv/cpu_helper.c
> + * Both implementation can be merged into single helper function in future.
> + * Keeping them separate for now, as error reporting and flow specifics are
> + * sufficiently different for separate implementation.
> + *
> + * @s        : IOMMU Device State
> + * @ctx      : Translation context for device id and process address space id.
> + * @iotlb    : translation data: physical address and access mode.
> + * @return   : success or fault cause code.
> + */
>   static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
>       IOMMUTLBEntry *iotlb)
>   {
> +    dma_addr_t addr, base;
> +    uint64_t satp, gatp, pte;
> +    bool en_s, en_g;
> +    struct {
> +        unsigned char step;
> +        unsigned char levels;
> +        unsigned char ptidxbits;
> +        unsigned char ptesize;
> +    } sc[2];
> +    /* Translation stage phase */
> +    enum {
> +        S_STAGE = 0,
> +        G_STAGE = 1,
> +    } pass;
> +
> +    satp = get_field(ctx->satp, RISCV_IOMMU_ATP_MODE_FIELD);
> +    gatp = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);
> +
> +    en_s = satp != RISCV_IOMMU_DC_FSC_MODE_BARE;
> +    en_g = gatp != RISCV_IOMMU_DC_IOHGATP_MODE_BARE;
> +
>       /* Early check for MSI address match when IOVA == GPA */
> -    if (iotlb->perm & IOMMU_WO &&
> +    if ((iotlb->perm & IOMMU_WO) &&
>           riscv_iommu_msi_check(s, ctx, iotlb->iova)) {
>           iotlb->target_as = &s->trap_as;
>           iotlb->translated_addr = iotlb->iova;
> @@ -215,11 +250,196 @@ static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
>       }
>   
>       /* Exit early for pass-through mode. */
> -    iotlb->translated_addr = iotlb->iova;
> -    iotlb->addr_mask = ~TARGET_PAGE_MASK;
> -    /* Allow R/W in pass-through mode */
> -    iotlb->perm = IOMMU_RW;
> -    return 0;
> +    if (!(en_s || en_g)) {
> +        iotlb->translated_addr = iotlb->iova;
> +        iotlb->addr_mask = ~TARGET_PAGE_MASK;
> +        /* Allow R/W in pass-through mode */
> +        iotlb->perm = IOMMU_RW;
> +        return 0;
> +    }
> +
> +    /* S/G translation parameters. */
> +    for (pass = 0; pass < 2; pass++) {
> +        uint32_t sv_mode;
> +
> +        sc[pass].step = 0;
> +        if (pass ? (s->fctl & RISCV_IOMMU_FCTL_GXL) :
> +            (ctx->tc & RISCV_IOMMU_DC_TC_SXL)) {
> +            /* 32bit mode for GXL/SXL == 1 */
> +            switch (pass ? gatp : satp) {
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
> +                sc[pass].levels    = 0;
> +                sc[pass].ptidxbits = 0;
> +                sc[pass].ptesize   = 0;
> +                break;
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4:
> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV32X4 : RISCV_IOMMU_CAP_SV32;
> +                if (!(s->cap & sv_mode)) {
> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +                }
> +                sc[pass].levels    = 2;
> +                sc[pass].ptidxbits = 10;
> +                sc[pass].ptesize   = 4;
> +                break;
> +            default:
> +                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +            }
> +        } else {
> +            /* 64bit mode for GXL/SXL == 0 */
> +            switch (pass ? gatp : satp) {
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
> +                sc[pass].levels    = 0;
> +                sc[pass].ptidxbits = 0;
> +                sc[pass].ptesize   = 0;
> +                break;
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4:
> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV39X4 : RISCV_IOMMU_CAP_SV39;
> +                if (!(s->cap & sv_mode)) {
> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +                }
> +                sc[pass].levels    = 3;
> +                sc[pass].ptidxbits = 9;
> +                sc[pass].ptesize   = 8;
> +                break;
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4:
> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV48X4 : RISCV_IOMMU_CAP_SV48;
> +                if (!(s->cap & sv_mode)) {
> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +                }
> +                sc[pass].levels    = 4;
> +                sc[pass].ptidxbits = 9;
> +                sc[pass].ptesize   = 8;
> +                break;
> +            case RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4:
> +                sv_mode = pass ? RISCV_IOMMU_CAP_SV57X4 : RISCV_IOMMU_CAP_SV57;
> +                if (!(s->cap & sv_mode)) {
> +                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +                }
> +                sc[pass].levels    = 5;
> +                sc[pass].ptidxbits = 9;
> +                sc[pass].ptesize   = 8;
> +                break;
> +            default:
> +                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
> +            }
> +        }
> +    };
> +
> +    /* S/G stages translation tables root pointers */
> +    gatp = PPN_PHYS(get_field(ctx->gatp, RISCV_IOMMU_ATP_PPN_FIELD));
> +    satp = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_ATP_PPN_FIELD));
> +    addr = (en_s && en_g) ? satp : iotlb->iova;
> +    base = en_g ? gatp : satp;
> +    pass = en_g ? G_STAGE : S_STAGE;
> +
> +    do {
> +        const unsigned widened = (pass && !sc[pass].step) ? 2 : 0;
> +        const unsigned va_bits = widened + sc[pass].ptidxbits;
> +        const unsigned va_skip = TARGET_PAGE_BITS + sc[pass].ptidxbits *
> +                                 (sc[pass].levels - 1 - sc[pass].step);
> +        const unsigned idx = (addr >> va_skip) & ((1 << va_bits) - 1);
> +        const dma_addr_t pte_addr = base + idx * sc[pass].ptesize;
> +        const bool ade =
> +            ctx->tc & (pass ? RISCV_IOMMU_DC_TC_GADE : RISCV_IOMMU_DC_TC_SADE);
> +
> +        /* Address range check before first level lookup */
> +        if (!sc[pass].step) {
> +            const uint64_t va_mask = (1ULL << (va_skip + va_bits)) - 1;
> +            if ((addr & va_mask) != addr) {
> +                return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
> +            }
> +        }
> +
> +        /* Read page table entry */
> +        if (dma_memory_read(s->target_as, pte_addr, &pte,
> +                sc[pass].ptesize, MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
> +            return (iotlb->perm & IOMMU_WO) ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT
> +                                            : RISCV_IOMMU_FQ_CAUSE_RD_FAULT;
> +        }
> +
> +        if (sc[pass].ptesize == 4) {
> +            pte = (uint64_t) le32_to_cpu(*((uint32_t *)&pte));
> +        } else {
> +            pte = le64_to_cpu(pte);
> +        }
> +
> +        sc[pass].step++;
> +        hwaddr ppn = pte >> PTE_PPN_SHIFT;
> +
> +        if (!(pte & PTE_V)) {
> +            break;                /* Invalid PTE */
> +        } else if (!(pte & (PTE_R | PTE_W | PTE_X))) {
> +            base = PPN_PHYS(ppn); /* Inner PTE, continue walking */
> +        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == PTE_W) {
> +            break;                /* Reserved leaf PTE flags: PTE_W */
> +        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == (PTE_W | PTE_X)) {
> +            break;                /* Reserved leaf PTE flags: PTE_W + PTE_X */
> +        } else if (ppn & ((1ULL << (va_skip - TARGET_PAGE_BITS)) - 1)) {
> +            break;                /* Misaligned PPN */
> +        } else if ((iotlb->perm & IOMMU_RO) && !(pte & PTE_R)) {
> +            break;                /* Read access check failed */
> +        } else if ((iotlb->perm & IOMMU_WO) && !(pte & PTE_W)) {
> +            break;                /* Write access check failed */
> +        } else if ((iotlb->perm & IOMMU_RO) && !ade && !(pte & PTE_A)) {
> +            break;                /* Access bit not set */
> +        } else if ((iotlb->perm & IOMMU_WO) && !ade && !(pte & PTE_D)) {
> +            break;                /* Dirty bit not set */
> +        } else {
> +            /* Leaf PTE, translation completed. */
> +            sc[pass].step = sc[pass].levels;
> +            base = PPN_PHYS(ppn) | (addr & ((1ULL << va_skip) - 1));
> +            /* Update address mask based on smallest translation granularity */
> +            iotlb->addr_mask &= (1ULL << va_skip) - 1;
> +            /* Continue with S-Stage translation? */
> +            if (pass && sc[0].step != sc[0].levels) {
> +                pass = S_STAGE;
> +                addr = iotlb->iova;
> +                continue;
> +            }
> +            /* Translation phase completed (GPA or SPA) */
> +            iotlb->translated_addr = base;
> +            iotlb->perm = (pte & PTE_W) ? ((pte & PTE_R) ? IOMMU_RW : IOMMU_WO)
> +                                                         : IOMMU_RO;
> +
> +            /* Check MSI GPA address match */
> +            if (pass == S_STAGE && (iotlb->perm & IOMMU_WO) &&
> +                riscv_iommu_msi_check(s, ctx, base)) {
> +                /* Trap MSI writes and return GPA address. */
> +                iotlb->target_as = &s->trap_as;
> +                iotlb->addr_mask = ~TARGET_PAGE_MASK;
> +                return 0;
> +            }
> +
> +            /* Continue with G-Stage translation? */
> +            if (!pass && en_g) {
> +                pass = G_STAGE;
> +                addr = base;
> +                base = gatp;
> +                sc[pass].step = 0;
> +                continue;
> +            }
> +
> +            return 0;
> +        }
> +
> +        if (sc[pass].step == sc[pass].levels) {
> +            break; /* Can't find leaf PTE */
> +        }
> +
> +        /* Continue with G-Stage translation? */
> +        if (!pass && en_g) {
> +            pass = G_STAGE;
> +            addr = base;
> +            base = gatp;
> +            sc[pass].step = 0;
> +        }
> +    } while (1);
> +
> +    return (iotlb->perm & IOMMU_WO) ?
> +                (pass ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS :
> +                        RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S) :
> +                (pass ? RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS :
> +                        RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S);
>   }
>   
>   static void riscv_iommu_report_fault(RISCVIOMMUState *s,
> @@ -420,7 +640,7 @@ err:
>   static bool riscv_iommu_validate_device_ctx(RISCVIOMMUState *s,
>                                               RISCVIOMMUContext *ctx)
>   {
> -    uint32_t msi_mode;
> +    uint32_t fsc_mode, msi_mode;
>   
>       if (!(ctx->tc & RISCV_IOMMU_DC_TC_EN_PRI) &&
>           ctx->tc & RISCV_IOMMU_DC_TC_PRPR) {
> @@ -441,6 +661,58 @@ static bool riscv_iommu_validate_device_ctx(RISCVIOMMUState *s,
>           }
>       }
>   
> +    fsc_mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
> +
> +    if (ctx->tc & RISCV_IOMMU_DC_TC_PDTV) {
> +        switch (fsc_mode) {
> +        case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8:
> +            if (!(s->cap & RISCV_IOMMU_CAP_PD8)) {
> +                return false;
> +            }
> +            break;
> +        case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD17:
> +            if (!(s->cap & RISCV_IOMMU_CAP_PD17)) {
> +                return false;
> +            }
> +            break;
> +        case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20:
> +            if (!(s->cap & RISCV_IOMMU_CAP_PD20)) {
> +                return false;
> +            }
> +            break;
> +        }
> +    } else {
> +        /* DC.tc.PDTV is 0 */
> +        if (ctx->tc & RISCV_IOMMU_DC_TC_DPE) {
> +            return false;
> +        }
> +
> +        if (ctx->tc & RISCV_IOMMU_DC_TC_SXL) {
> +            if (fsc_mode == RISCV_IOMMU_CAP_SV32 &&
> +                !(s->cap & RISCV_IOMMU_CAP_SV32)) {
> +                return false;
> +            }
> +        } else {
> +            switch (fsc_mode) {
> +            case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
> +                if (!(s->cap & RISCV_IOMMU_CAP_SV39)) {
> +                    return false;
> +                }
> +                break;
> +            case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
> +                if (!(s->cap & RISCV_IOMMU_CAP_SV48)) {
> +                    return false;
> +                }
> +            break;
> +            case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
> +                if (!(s->cap & RISCV_IOMMU_CAP_SV57)) {
> +                    return false;
> +                }
> +                break;
> +            }
> +        }
> +    }
> +
>       /*
>        * CAP_END is always zero (only one endianess). FCTL_BE is
>        * always zero (little-endian accesses). Thus TC_SBE must
> @@ -478,6 +750,10 @@ static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
>   
>       case RISCV_IOMMU_DDTP_MODE_BARE:
>           /* mock up pass-through translation context */
> +        ctx->gatp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
> +            RISCV_IOMMU_DC_IOHGATP_MODE_BARE);
> +        ctx->satp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
> +            RISCV_IOMMU_DC_FSC_MODE_BARE);
>           ctx->tc = RISCV_IOMMU_DC_TC_V;
>           ctx->ta = 0;
>           ctx->msiptp = 0;
> @@ -551,6 +827,8 @@ static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
>   
>       /* Set translation context. */
>       ctx->tc = le64_to_cpu(dc.tc);
> +    ctx->gatp = le64_to_cpu(dc.iohgatp);
> +    ctx->satp = le64_to_cpu(dc.fsc);
>       ctx->ta = le64_to_cpu(dc.ta);
>       ctx->msiptp = le64_to_cpu(dc.msiptp);
>       ctx->msi_addr_mask = le64_to_cpu(dc.msi_addr_mask);
> @@ -564,14 +842,38 @@ static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
>           return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
>       }
>   
> +    /* FSC field checks */
> +    mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
> +    addr = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_DC_FSC_PPN));
> +
> +    if (mode == RISCV_IOMMU_DC_FSC_MODE_BARE) {
According to section 2.3, if the function returns here, some necessary 
checks are skipped. I think this if scope should be moved down to after 
"if (ctx->pasid == RISCV_IOMMU_NOPASID) {...}".
> +        /* No S-Stage translation, done. */
> +        return 0;
> +    }
> +
>       if (!(ctx->tc & RISCV_IOMMU_DC_TC_PDTV)) {
>           if (ctx->pasid != RISCV_IOMMU_NOPASID) {
>               /* PASID is disabled */
>               return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
>           }
> +        if (mode > RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57) {
> +            /* Invalid translation mode */
> +            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
> +        }
>           return 0;
>       }
>   
> +    if (ctx->pasid == RISCV_IOMMU_NOPASID) {
> +        if (!(ctx->tc & RISCV_IOMMU_DC_TC_DPE)) {
> +            /* No default PASID enabled, set BARE mode */
> +            ctx->satp = 0ULL;
> +            return 0;
> +        } else {
> +            /* Use default PASID #0 */
> +            ctx->pasid = 0;
> +        }
> +    }
> +
return if mode is bare.
>       /* FSC.TC.PDTV enabled */
>       if (mode > RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20) {
>           /* Invalid PDTP.MODE */
> @@ -605,6 +907,7 @@ static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
>   
>       /* Use FSC and TA from process directory entry. */
>       ctx->ta = le64_to_cpu(dc.ta);
> +    ctx->satp = le64_to_cpu(dc.fsc);
>   
>       return 0;
>   }
> @@ -832,6 +1135,7 @@ static RISCVIOMMUEntry *riscv_iommu_iot_lookup(RISCVIOMMUContext *ctx,
>       GHashTable *iot_cache, hwaddr iova)
>   {
>       RISCVIOMMUEntry key = {
> +        .gscid = get_field(ctx->gatp, RISCV_IOMMU_DC_IOHGATP_GSCID),
>           .pscid = get_field(ctx->ta, RISCV_IOMMU_DC_TA_PSCID),
>           .iova  = PPN_DOWN(iova),
>       };
> @@ -909,6 +1213,7 @@ static int riscv_iommu_translate(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
>           iot = g_new0(RISCVIOMMUEntry, 1);
>           iot->iova = PPN_DOWN(iotlb->iova);
>           iot->phys = PPN_DOWN(iotlb->translated_addr);
> +        iot->gscid = get_field(ctx->gatp, RISCV_IOMMU_DC_IOHGATP_GSCID);
>           iot->pscid = get_field(ctx->ta, RISCV_IOMMU_DC_TA_PSCID);
>           iot->perm = iotlb->perm;
>           riscv_iommu_iot_update(s, iot_cache, iot);
> @@ -1513,6 +1818,14 @@ static void riscv_iommu_realize(DeviceState *dev, Error **errp)
>       if (s->enable_msi) {
>           s->cap |= RISCV_IOMMU_CAP_MSI_FLAT | RISCV_IOMMU_CAP_MSI_MRIF;
>       }
> +    if (s->enable_s_stage) {
> +        s->cap |= RISCV_IOMMU_CAP_SV32 | RISCV_IOMMU_CAP_SV39 |
> +                  RISCV_IOMMU_CAP_SV48 | RISCV_IOMMU_CAP_SV57;
> +    }
> +    if (s->enable_g_stage) {
> +        s->cap |= RISCV_IOMMU_CAP_SV32X4 | RISCV_IOMMU_CAP_SV39X4 |
> +                  RISCV_IOMMU_CAP_SV48X4 | RISCV_IOMMU_CAP_SV57X4;
> +    }
>       /* Report QEMU target physical address space limits */
>       s->cap = set_field(s->cap, RISCV_IOMMU_CAP_PAS,
>                          TARGET_PHYS_ADDR_SPACE_BITS);
> @@ -1613,6 +1926,8 @@ static Property riscv_iommu_properties[] = {
>           LIMIT_CACHE_IOT),
>       DEFINE_PROP_BOOL("intremap", RISCVIOMMUState, enable_msi, TRUE),
>       DEFINE_PROP_BOOL("off", RISCVIOMMUState, enable_off, TRUE),
> +    DEFINE_PROP_BOOL("s-stage", RISCVIOMMUState, enable_s_stage, TRUE),
> +    DEFINE_PROP_BOOL("g-stage", RISCVIOMMUState, enable_g_stage, TRUE),
>       DEFINE_PROP_LINK("downstream-mr", RISCVIOMMUState, target_mr,
>           TYPE_MEMORY_REGION, MemoryRegion *),
>       DEFINE_PROP_END_OF_LIST(),
> diff --git a/hw/riscv/riscv-iommu.h b/hw/riscv/riscv-iommu.h
> index 3afee9f3e8..c24e3e4c16 100644
> --- a/hw/riscv/riscv-iommu.h
> +++ b/hw/riscv/riscv-iommu.h
> @@ -38,6 +38,8 @@ struct RISCVIOMMUState {
>   
>       bool enable_off;      /* Enable out-of-reset OFF mode (DMA disabled) */
>       bool enable_msi;      /* Enable MSI remapping */
> +    bool enable_s_stage;  /* Enable S/VS-Stage translation */
> +    bool enable_g_stage;  /* Enable G-Stage translation */
>   
>       /* IOMMU Internal State */
>       uint64_t ddtp;        /* Validated Device Directory Tree Root Pointer */
Daniel Henrique Barboza June 21, 2024, 11:58 a.m. UTC | #2
On 6/18/24 7:30 AM, Jason Chien wrote:
> Hi Daniel,
> 
> On 2024/5/24 上午 01:39, Daniel Henrique Barboza wrote:
>> From: Tomasz Jeznach <tjeznach@rivosinc.com>
>>
>> Add support for s-stage (sv32, sv39, sv48, sv57 caps) and g-stage
>> (sv32x4, sv39x4, sv48x4, sv57x4 caps). Most of the work is done in the
>> riscv_iommu_spa_fetch() function that now has to consider how many
>> translation stages we need to walk the page table.
>>
>> Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
>> Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
>> ---
>>   hw/riscv/riscv-iommu-bits.h |  11 ++
>>   hw/riscv/riscv-iommu.c      | 331 +++++++++++++++++++++++++++++++++++-
>>   hw/riscv/riscv-iommu.h      |   2 +
>>   3 files changed, 336 insertions(+), 8 deletions(-)
>>

(...)

>>       /* Set translation context. */
>>       ctx->tc = le64_to_cpu(dc.tc);
>> +    ctx->gatp = le64_to_cpu(dc.iohgatp);
>> +    ctx->satp = le64_to_cpu(dc.fsc);
>>       ctx->ta = le64_to_cpu(dc.ta);
>>       ctx->msiptp = le64_to_cpu(dc.msiptp);
>>       ctx->msi_addr_mask = le64_to_cpu(dc.msi_addr_mask);
>> @@ -564,14 +842,38 @@ static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
>>           return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
>>       }
>> +    /* FSC field checks */
>> +    mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
>> +    addr = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_DC_FSC_PPN));
>> +
>> +    if (mode == RISCV_IOMMU_DC_FSC_MODE_BARE) {
> According to section 2.3, if the function returns here, some necessary checks are skipped. I think this if scope should be moved down to after "if (ctx->pasid == RISCV_IOMMU_NOPASID) {...}".
>> +        /* No S-Stage translation, done. */
>> +        return 0;
>> +    }
>> +
>>       if (!(ctx->tc & RISCV_IOMMU_DC_TC_PDTV)) {
>>           if (ctx->pasid != RISCV_IOMMU_NOPASID) {
>>               /* PASID is disabled */
>>               return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
>>           }
>> +        if (mode > RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57) {
>> +            /* Invalid translation mode */
>> +            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
>> +        }
>>           return 0;
>>       }
>> +    if (ctx->pasid == RISCV_IOMMU_NOPASID) {
>> +        if (!(ctx->tc & RISCV_IOMMU_DC_TC_DPE)) {
>> +            /* No default PASID enabled, set BARE mode */
>> +            ctx->satp = 0ULL;
>> +            return 0;
>> +        } else {
>> +            /* Use default PASID #0 */
>> +            ctx->pasid = 0;
>> +        }
>> +    }
>> +
> return if mode is bare.

I agree. Just moved the 'if (bare) return' check to this point as you
suggested.


Thanks,


Daniel
diff mbox series

Patch

diff --git a/hw/riscv/riscv-iommu-bits.h b/hw/riscv/riscv-iommu-bits.h
index f29b916acb..a4def7b8ec 100644
--- a/hw/riscv/riscv-iommu-bits.h
+++ b/hw/riscv/riscv-iommu-bits.h
@@ -71,6 +71,14 @@  struct riscv_iommu_pq_record {
 /* 5.3 IOMMU Capabilities (64bits) */
 #define RISCV_IOMMU_REG_CAP             0x0000
 #define RISCV_IOMMU_CAP_VERSION         GENMASK_ULL(7, 0)
+#define RISCV_IOMMU_CAP_SV32            BIT_ULL(8)
+#define RISCV_IOMMU_CAP_SV39            BIT_ULL(9)
+#define RISCV_IOMMU_CAP_SV48            BIT_ULL(10)
+#define RISCV_IOMMU_CAP_SV57            BIT_ULL(11)
+#define RISCV_IOMMU_CAP_SV32X4          BIT_ULL(16)
+#define RISCV_IOMMU_CAP_SV39X4          BIT_ULL(17)
+#define RISCV_IOMMU_CAP_SV48X4          BIT_ULL(18)
+#define RISCV_IOMMU_CAP_SV57X4          BIT_ULL(19)
 #define RISCV_IOMMU_CAP_MSI_FLAT        BIT_ULL(22)
 #define RISCV_IOMMU_CAP_MSI_MRIF        BIT_ULL(23)
 #define RISCV_IOMMU_CAP_T2GPA           BIT_ULL(26)
@@ -83,6 +91,7 @@  struct riscv_iommu_pq_record {
 /* 5.4 Features control register (32bits) */
 #define RISCV_IOMMU_REG_FCTL            0x0008
 #define RISCV_IOMMU_FCTL_WSI            BIT(1)
+#define RISCV_IOMMU_FCTL_GXL            BIT(2)
 
 /* 5.5 Device-directory-table pointer (64bits) */
 #define RISCV_IOMMU_REG_DDTP            0x0010
@@ -205,6 +214,8 @@  struct riscv_iommu_dc {
 #define RISCV_IOMMU_DC_TC_DTF           BIT_ULL(4)
 #define RISCV_IOMMU_DC_TC_PDTV          BIT_ULL(5)
 #define RISCV_IOMMU_DC_TC_PRPR          BIT_ULL(6)
+#define RISCV_IOMMU_DC_TC_GADE          BIT_ULL(7)
+#define RISCV_IOMMU_DC_TC_SADE          BIT_ULL(8)
 #define RISCV_IOMMU_DC_TC_DPE           BIT_ULL(9)
 #define RISCV_IOMMU_DC_TC_SBE           BIT_ULL(10)
 #define RISCV_IOMMU_DC_TC_SXL           BIT_ULL(11)
diff --git a/hw/riscv/riscv-iommu.c b/hw/riscv/riscv-iommu.c
index abf6ae7726..11c418b548 100644
--- a/hw/riscv/riscv-iommu.c
+++ b/hw/riscv/riscv-iommu.c
@@ -58,6 +58,8 @@  struct RISCVIOMMUContext {
     uint64_t __rfu:20;          /* reserved */
     uint64_t tc;                /* Translation Control */
     uint64_t ta;                /* Translation Attributes */
+    uint64_t satp;              /* S-Stage address translation and protection */
+    uint64_t gatp;              /* G-Stage address translation and protection */
     uint64_t msi_addr_mask;     /* MSI filtering - address mask */
     uint64_t msi_addr_pattern;  /* MSI filtering - address pattern */
     uint64_t msiptp;            /* MSI redirection page table pointer */
@@ -201,12 +203,45 @@  static bool riscv_iommu_msi_check(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
     return true;
 }
 
-/* RISCV IOMMU Address Translation Lookup - Page Table Walk */
+/*
+ * RISCV IOMMU Address Translation Lookup - Page Table Walk
+ *
+ * Note: Code is based on get_physical_address() from target/riscv/cpu_helper.c
+ * Both implementation can be merged into single helper function in future.
+ * Keeping them separate for now, as error reporting and flow specifics are
+ * sufficiently different for separate implementation.
+ *
+ * @s        : IOMMU Device State
+ * @ctx      : Translation context for device id and process address space id.
+ * @iotlb    : translation data: physical address and access mode.
+ * @return   : success or fault cause code.
+ */
 static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
     IOMMUTLBEntry *iotlb)
 {
+    dma_addr_t addr, base;
+    uint64_t satp, gatp, pte;
+    bool en_s, en_g;
+    struct {
+        unsigned char step;
+        unsigned char levels;
+        unsigned char ptidxbits;
+        unsigned char ptesize;
+    } sc[2];
+    /* Translation stage phase */
+    enum {
+        S_STAGE = 0,
+        G_STAGE = 1,
+    } pass;
+
+    satp = get_field(ctx->satp, RISCV_IOMMU_ATP_MODE_FIELD);
+    gatp = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);
+
+    en_s = satp != RISCV_IOMMU_DC_FSC_MODE_BARE;
+    en_g = gatp != RISCV_IOMMU_DC_IOHGATP_MODE_BARE;
+
     /* Early check for MSI address match when IOVA == GPA */
-    if (iotlb->perm & IOMMU_WO &&
+    if ((iotlb->perm & IOMMU_WO) &&
         riscv_iommu_msi_check(s, ctx, iotlb->iova)) {
         iotlb->target_as = &s->trap_as;
         iotlb->translated_addr = iotlb->iova;
@@ -215,11 +250,196 @@  static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
     }
 
     /* Exit early for pass-through mode. */
-    iotlb->translated_addr = iotlb->iova;
-    iotlb->addr_mask = ~TARGET_PAGE_MASK;
-    /* Allow R/W in pass-through mode */
-    iotlb->perm = IOMMU_RW;
-    return 0;
+    if (!(en_s || en_g)) {
+        iotlb->translated_addr = iotlb->iova;
+        iotlb->addr_mask = ~TARGET_PAGE_MASK;
+        /* Allow R/W in pass-through mode */
+        iotlb->perm = IOMMU_RW;
+        return 0;
+    }
+
+    /* S/G translation parameters. */
+    for (pass = 0; pass < 2; pass++) {
+        uint32_t sv_mode;
+
+        sc[pass].step = 0;
+        if (pass ? (s->fctl & RISCV_IOMMU_FCTL_GXL) :
+            (ctx->tc & RISCV_IOMMU_DC_TC_SXL)) {
+            /* 32bit mode for GXL/SXL == 1 */
+            switch (pass ? gatp : satp) {
+            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
+                sc[pass].levels    = 0;
+                sc[pass].ptidxbits = 0;
+                sc[pass].ptesize   = 0;
+                break;
+            case RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4:
+                sv_mode = pass ? RISCV_IOMMU_CAP_SV32X4 : RISCV_IOMMU_CAP_SV32;
+                if (!(s->cap & sv_mode)) {
+                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+                }
+                sc[pass].levels    = 2;
+                sc[pass].ptidxbits = 10;
+                sc[pass].ptesize   = 4;
+                break;
+            default:
+                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+            }
+        } else {
+            /* 64bit mode for GXL/SXL == 0 */
+            switch (pass ? gatp : satp) {
+            case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
+                sc[pass].levels    = 0;
+                sc[pass].ptidxbits = 0;
+                sc[pass].ptesize   = 0;
+                break;
+            case RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4:
+                sv_mode = pass ? RISCV_IOMMU_CAP_SV39X4 : RISCV_IOMMU_CAP_SV39;
+                if (!(s->cap & sv_mode)) {
+                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+                }
+                sc[pass].levels    = 3;
+                sc[pass].ptidxbits = 9;
+                sc[pass].ptesize   = 8;
+                break;
+            case RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4:
+                sv_mode = pass ? RISCV_IOMMU_CAP_SV48X4 : RISCV_IOMMU_CAP_SV48;
+                if (!(s->cap & sv_mode)) {
+                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+                }
+                sc[pass].levels    = 4;
+                sc[pass].ptidxbits = 9;
+                sc[pass].ptesize   = 8;
+                break;
+            case RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4:
+                sv_mode = pass ? RISCV_IOMMU_CAP_SV57X4 : RISCV_IOMMU_CAP_SV57;
+                if (!(s->cap & sv_mode)) {
+                    return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+                }
+                sc[pass].levels    = 5;
+                sc[pass].ptidxbits = 9;
+                sc[pass].ptesize   = 8;
+                break;
+            default:
+                return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
+            }
+        }
+    };
+
+    /* S/G stages translation tables root pointers */
+    gatp = PPN_PHYS(get_field(ctx->gatp, RISCV_IOMMU_ATP_PPN_FIELD));
+    satp = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_ATP_PPN_FIELD));
+    addr = (en_s && en_g) ? satp : iotlb->iova;
+    base = en_g ? gatp : satp;
+    pass = en_g ? G_STAGE : S_STAGE;
+
+    do {
+        const unsigned widened = (pass && !sc[pass].step) ? 2 : 0;
+        const unsigned va_bits = widened + sc[pass].ptidxbits;
+        const unsigned va_skip = TARGET_PAGE_BITS + sc[pass].ptidxbits *
+                                 (sc[pass].levels - 1 - sc[pass].step);
+        const unsigned idx = (addr >> va_skip) & ((1 << va_bits) - 1);
+        const dma_addr_t pte_addr = base + idx * sc[pass].ptesize;
+        const bool ade =
+            ctx->tc & (pass ? RISCV_IOMMU_DC_TC_GADE : RISCV_IOMMU_DC_TC_SADE);
+
+        /* Address range check before first level lookup */
+        if (!sc[pass].step) {
+            const uint64_t va_mask = (1ULL << (va_skip + va_bits)) - 1;
+            if ((addr & va_mask) != addr) {
+                return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
+            }
+        }
+
+        /* Read page table entry */
+        if (dma_memory_read(s->target_as, pte_addr, &pte,
+                sc[pass].ptesize, MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
+            return (iotlb->perm & IOMMU_WO) ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT
+                                            : RISCV_IOMMU_FQ_CAUSE_RD_FAULT;
+        }
+
+        if (sc[pass].ptesize == 4) {
+            pte = (uint64_t) le32_to_cpu(*((uint32_t *)&pte));
+        } else {
+            pte = le64_to_cpu(pte);
+        }
+
+        sc[pass].step++;
+        hwaddr ppn = pte >> PTE_PPN_SHIFT;
+
+        if (!(pte & PTE_V)) {
+            break;                /* Invalid PTE */
+        } else if (!(pte & (PTE_R | PTE_W | PTE_X))) {
+            base = PPN_PHYS(ppn); /* Inner PTE, continue walking */
+        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == PTE_W) {
+            break;                /* Reserved leaf PTE flags: PTE_W */
+        } else if ((pte & (PTE_R | PTE_W | PTE_X)) == (PTE_W | PTE_X)) {
+            break;                /* Reserved leaf PTE flags: PTE_W + PTE_X */
+        } else if (ppn & ((1ULL << (va_skip - TARGET_PAGE_BITS)) - 1)) {
+            break;                /* Misaligned PPN */
+        } else if ((iotlb->perm & IOMMU_RO) && !(pte & PTE_R)) {
+            break;                /* Read access check failed */
+        } else if ((iotlb->perm & IOMMU_WO) && !(pte & PTE_W)) {
+            break;                /* Write access check failed */
+        } else if ((iotlb->perm & IOMMU_RO) && !ade && !(pte & PTE_A)) {
+            break;                /* Access bit not set */
+        } else if ((iotlb->perm & IOMMU_WO) && !ade && !(pte & PTE_D)) {
+            break;                /* Dirty bit not set */
+        } else {
+            /* Leaf PTE, translation completed. */
+            sc[pass].step = sc[pass].levels;
+            base = PPN_PHYS(ppn) | (addr & ((1ULL << va_skip) - 1));
+            /* Update address mask based on smallest translation granularity */
+            iotlb->addr_mask &= (1ULL << va_skip) - 1;
+            /* Continue with S-Stage translation? */
+            if (pass && sc[0].step != sc[0].levels) {
+                pass = S_STAGE;
+                addr = iotlb->iova;
+                continue;
+            }
+            /* Translation phase completed (GPA or SPA) */
+            iotlb->translated_addr = base;
+            iotlb->perm = (pte & PTE_W) ? ((pte & PTE_R) ? IOMMU_RW : IOMMU_WO)
+                                                         : IOMMU_RO;
+
+            /* Check MSI GPA address match */
+            if (pass == S_STAGE && (iotlb->perm & IOMMU_WO) &&
+                riscv_iommu_msi_check(s, ctx, base)) {
+                /* Trap MSI writes and return GPA address. */
+                iotlb->target_as = &s->trap_as;
+                iotlb->addr_mask = ~TARGET_PAGE_MASK;
+                return 0;
+            }
+
+            /* Continue with G-Stage translation? */
+            if (!pass && en_g) {
+                pass = G_STAGE;
+                addr = base;
+                base = gatp;
+                sc[pass].step = 0;
+                continue;
+            }
+
+            return 0;
+        }
+
+        if (sc[pass].step == sc[pass].levels) {
+            break; /* Can't find leaf PTE */
+        }
+
+        /* Continue with G-Stage translation? */
+        if (!pass && en_g) {
+            pass = G_STAGE;
+            addr = base;
+            base = gatp;
+            sc[pass].step = 0;
+        }
+    } while (1);
+
+    return (iotlb->perm & IOMMU_WO) ?
+                (pass ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS :
+                        RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S) :
+                (pass ? RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS :
+                        RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S);
 }
 
 static void riscv_iommu_report_fault(RISCVIOMMUState *s,
@@ -420,7 +640,7 @@  err:
 static bool riscv_iommu_validate_device_ctx(RISCVIOMMUState *s,
                                             RISCVIOMMUContext *ctx)
 {
-    uint32_t msi_mode;
+    uint32_t fsc_mode, msi_mode;
 
     if (!(ctx->tc & RISCV_IOMMU_DC_TC_EN_PRI) &&
         ctx->tc & RISCV_IOMMU_DC_TC_PRPR) {
@@ -441,6 +661,58 @@  static bool riscv_iommu_validate_device_ctx(RISCVIOMMUState *s,
         }
     }
 
+    fsc_mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
+
+    if (ctx->tc & RISCV_IOMMU_DC_TC_PDTV) {
+        switch (fsc_mode) {
+        case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8:
+            if (!(s->cap & RISCV_IOMMU_CAP_PD8)) {
+                return false;
+            }
+            break;
+        case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD17:
+            if (!(s->cap & RISCV_IOMMU_CAP_PD17)) {
+                return false;
+            }
+            break;
+        case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20:
+            if (!(s->cap & RISCV_IOMMU_CAP_PD20)) {
+                return false;
+            }
+            break;
+        }
+    } else {
+        /* DC.tc.PDTV is 0 */
+        if (ctx->tc & RISCV_IOMMU_DC_TC_DPE) {
+            return false;
+        }
+
+        if (ctx->tc & RISCV_IOMMU_DC_TC_SXL) {
+            if (fsc_mode == RISCV_IOMMU_CAP_SV32 &&
+                !(s->cap & RISCV_IOMMU_CAP_SV32)) {
+                return false;
+            }
+        } else {
+            switch (fsc_mode) {
+            case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
+                if (!(s->cap & RISCV_IOMMU_CAP_SV39)) {
+                    return false;
+                }
+                break;
+            case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
+                if (!(s->cap & RISCV_IOMMU_CAP_SV48)) {
+                    return false;
+                }
+            break;
+            case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
+                if (!(s->cap & RISCV_IOMMU_CAP_SV57)) {
+                    return false;
+                }
+                break;
+            }
+        }
+    }
+
     /*
      * CAP_END is always zero (only one endianess). FCTL_BE is
      * always zero (little-endian accesses). Thus TC_SBE must
@@ -478,6 +750,10 @@  static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
 
     case RISCV_IOMMU_DDTP_MODE_BARE:
         /* mock up pass-through translation context */
+        ctx->gatp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
+            RISCV_IOMMU_DC_IOHGATP_MODE_BARE);
+        ctx->satp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
+            RISCV_IOMMU_DC_FSC_MODE_BARE);
         ctx->tc = RISCV_IOMMU_DC_TC_V;
         ctx->ta = 0;
         ctx->msiptp = 0;
@@ -551,6 +827,8 @@  static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
 
     /* Set translation context. */
     ctx->tc = le64_to_cpu(dc.tc);
+    ctx->gatp = le64_to_cpu(dc.iohgatp);
+    ctx->satp = le64_to_cpu(dc.fsc);
     ctx->ta = le64_to_cpu(dc.ta);
     ctx->msiptp = le64_to_cpu(dc.msiptp);
     ctx->msi_addr_mask = le64_to_cpu(dc.msi_addr_mask);
@@ -564,14 +842,38 @@  static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
         return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
     }
 
+    /* FSC field checks */
+    mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
+    addr = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_DC_FSC_PPN));
+
+    if (mode == RISCV_IOMMU_DC_FSC_MODE_BARE) {
+        /* No S-Stage translation, done. */
+        return 0;
+    }
+
     if (!(ctx->tc & RISCV_IOMMU_DC_TC_PDTV)) {
         if (ctx->pasid != RISCV_IOMMU_NOPASID) {
             /* PASID is disabled */
             return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
         }
+        if (mode > RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57) {
+            /* Invalid translation mode */
+            return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
+        }
         return 0;
     }
 
+    if (ctx->pasid == RISCV_IOMMU_NOPASID) {
+        if (!(ctx->tc & RISCV_IOMMU_DC_TC_DPE)) {
+            /* No default PASID enabled, set BARE mode */
+            ctx->satp = 0ULL;
+            return 0;
+        } else {
+            /* Use default PASID #0 */
+            ctx->pasid = 0;
+        }
+    }
+
     /* FSC.TC.PDTV enabled */
     if (mode > RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20) {
         /* Invalid PDTP.MODE */
@@ -605,6 +907,7 @@  static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
 
     /* Use FSC and TA from process directory entry. */
     ctx->ta = le64_to_cpu(dc.ta);
+    ctx->satp = le64_to_cpu(dc.fsc);
 
     return 0;
 }
@@ -832,6 +1135,7 @@  static RISCVIOMMUEntry *riscv_iommu_iot_lookup(RISCVIOMMUContext *ctx,
     GHashTable *iot_cache, hwaddr iova)
 {
     RISCVIOMMUEntry key = {
+        .gscid = get_field(ctx->gatp, RISCV_IOMMU_DC_IOHGATP_GSCID),
         .pscid = get_field(ctx->ta, RISCV_IOMMU_DC_TA_PSCID),
         .iova  = PPN_DOWN(iova),
     };
@@ -909,6 +1213,7 @@  static int riscv_iommu_translate(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
         iot = g_new0(RISCVIOMMUEntry, 1);
         iot->iova = PPN_DOWN(iotlb->iova);
         iot->phys = PPN_DOWN(iotlb->translated_addr);
+        iot->gscid = get_field(ctx->gatp, RISCV_IOMMU_DC_IOHGATP_GSCID);
         iot->pscid = get_field(ctx->ta, RISCV_IOMMU_DC_TA_PSCID);
         iot->perm = iotlb->perm;
         riscv_iommu_iot_update(s, iot_cache, iot);
@@ -1513,6 +1818,14 @@  static void riscv_iommu_realize(DeviceState *dev, Error **errp)
     if (s->enable_msi) {
         s->cap |= RISCV_IOMMU_CAP_MSI_FLAT | RISCV_IOMMU_CAP_MSI_MRIF;
     }
+    if (s->enable_s_stage) {
+        s->cap |= RISCV_IOMMU_CAP_SV32 | RISCV_IOMMU_CAP_SV39 |
+                  RISCV_IOMMU_CAP_SV48 | RISCV_IOMMU_CAP_SV57;
+    }
+    if (s->enable_g_stage) {
+        s->cap |= RISCV_IOMMU_CAP_SV32X4 | RISCV_IOMMU_CAP_SV39X4 |
+                  RISCV_IOMMU_CAP_SV48X4 | RISCV_IOMMU_CAP_SV57X4;
+    }
     /* Report QEMU target physical address space limits */
     s->cap = set_field(s->cap, RISCV_IOMMU_CAP_PAS,
                        TARGET_PHYS_ADDR_SPACE_BITS);
@@ -1613,6 +1926,8 @@  static Property riscv_iommu_properties[] = {
         LIMIT_CACHE_IOT),
     DEFINE_PROP_BOOL("intremap", RISCVIOMMUState, enable_msi, TRUE),
     DEFINE_PROP_BOOL("off", RISCVIOMMUState, enable_off, TRUE),
+    DEFINE_PROP_BOOL("s-stage", RISCVIOMMUState, enable_s_stage, TRUE),
+    DEFINE_PROP_BOOL("g-stage", RISCVIOMMUState, enable_g_stage, TRUE),
     DEFINE_PROP_LINK("downstream-mr", RISCVIOMMUState, target_mr,
         TYPE_MEMORY_REGION, MemoryRegion *),
     DEFINE_PROP_END_OF_LIST(),
diff --git a/hw/riscv/riscv-iommu.h b/hw/riscv/riscv-iommu.h
index 3afee9f3e8..c24e3e4c16 100644
--- a/hw/riscv/riscv-iommu.h
+++ b/hw/riscv/riscv-iommu.h
@@ -38,6 +38,8 @@  struct RISCVIOMMUState {
 
     bool enable_off;      /* Enable out-of-reset OFF mode (DMA disabled) */
     bool enable_msi;      /* Enable MSI remapping */
+    bool enable_s_stage;  /* Enable S/VS-Stage translation */
+    bool enable_g_stage;  /* Enable G-Stage translation */
 
     /* IOMMU Internal State */
     uint64_t ddtp;        /* Validated Device Directory Tree Root Pointer */