diff mbox series

[v6,4/7] target/riscv: rvv: Provide a fast path using direct access to host ram for unit-stride whole register load/store

Message ID 20240918171412.150107-5-max.chou@sifive.com
State New
Headers show
Series [v6,1/7] target/riscv: Set vdata.vm field for vector load/store whole register instructions | expand

Commit Message

Max Chou Sept. 18, 2024, 5:14 p.m. UTC
The vector unit-stride whole register load/store instructions are
similar to the unmasked unit-stride load/store instructions, so they are
suitable for optimization with a fast path that accesses host RAM directly.

Because the vector whole register load/store instructions do not need to
handle tail-agnostic elements, remove the vstart early-exit check.

Signed-off-by: Max Chou <max.chou@sifive.com>
---
 target/riscv/vector_helper.c | 129 +++++++++++++++++++----------------
 1 file changed, 70 insertions(+), 59 deletions(-)

Comments

Daniel Henrique Barboza Oct. 30, 2024, 4:31 p.m. UTC | #1
On 9/18/24 2:14 PM, Max Chou wrote:
> The vector unit-stride whole register load/store instructions are
> similar to unmasked unit-stride load/store instructions that is suitable
> to be optimized by using a direct access to host ram fast path.
> 
> Because the vector whole register load/store instructions do not need to
> handle the tail agnostic, so remove the vstart early exit checking.
> 
> Signed-off-by: Max Chou <max.chou@sifive.com>
> ---

Reviewed-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>

>   target/riscv/vector_helper.c | 129 +++++++++++++++++++----------------
>   1 file changed, 70 insertions(+), 59 deletions(-)
> 
> diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
> index c2fcf8b3a00..824e6401736 100644
> --- a/target/riscv/vector_helper.c
> +++ b/target/riscv/vector_helper.c
> @@ -665,80 +665,91 @@ GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb)
>    */
>   static void
>   vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
> -                vext_ldst_elem_fn_tlb *ldst_elem, uint32_t log2_esz,
> -                uintptr_t ra)
> +                vext_ldst_elem_fn_tlb *ldst_tlb,
> +                vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
> +                uintptr_t ra, bool is_load)
>   {
> -    uint32_t i, k, off, pos;
> +    target_ulong page_split, elems, addr;
>       uint32_t nf = vext_nf(desc);
>       uint32_t vlenb = riscv_cpu_cfg(env)->vlenb;
>       uint32_t max_elems = vlenb >> log2_esz;
> +    uint32_t evl = nf * max_elems;
> +    uint32_t esz = 1 << log2_esz;
> +    int mmu_index = riscv_env_mmu_index(env, false);
>   
> -    if (env->vstart >= ((vlenb * nf) >> log2_esz)) {
> -        env->vstart = 0;
> -        return;
> +    /* Calculate the page range of first page */
> +    addr = base + (env->vstart << log2_esz);
> +    page_split = -(addr | TARGET_PAGE_MASK);
> +    /* Get number of elements */
> +    elems = page_split / esz;
> +    if (unlikely(env->vstart + elems >= evl)) {
> +        elems = evl - env->vstart;
>       }
>   
> -    k = env->vstart / max_elems;
> -    off = env->vstart % max_elems;
> -
> -    if (off) {
> -        /* load/store rest of elements of current segment pointed by vstart */
> -        for (pos = off; pos < max_elems; pos++, env->vstart++) {
> -            target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
> -            ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd,
> -                      ra);
> -        }
> -        k++;
> +    /* Load/store elements in the first page */
> +    if (likely(elems)) {
> +        vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
> +                          is_load, mmu_index, ldst_tlb, ldst_host, ra);
>       }
>   
> -    /* load/store elements for rest of segments */
> -    for (; k < nf; k++) {
> -        for (i = 0; i < max_elems; i++, env->vstart++) {
> -            target_ulong addr = base + ((i + k * max_elems) << log2_esz);
> -            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
> +    /* Load/store elements in the second page */
> +    if (unlikely(env->vstart < evl)) {
> +        /* Cross page element */
> +        if (unlikely(page_split % esz)) {
> +            addr = base + (env->vstart << log2_esz);
> +            ldst_tlb(env, adjust_addr(env, addr), env->vstart, vd, ra);
> +            env->vstart++;
>           }
> +
> +        addr = base + (env->vstart << log2_esz);
> +        /* Get number of elements of second page */
> +        elems = evl - env->vstart;
> +
> +        /* Load/store elements in the second page */
> +        vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
> +                          is_load, mmu_index, ldst_tlb, ldst_host, ra);
>       }
>   
>       env->vstart = 0;
>   }
>   
> -#define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
> -void HELPER(NAME)(void *vd, target_ulong base,       \
> -                  CPURISCVState *env, uint32_t desc) \
> -{                                                    \
> -    vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
> -                    ctzl(sizeof(ETYPE)), GETPC());   \
> -}
> -
> -GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b_tlb)
> -GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h_tlb)
> -GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w_tlb)
> -GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d_tlb)
> -GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b_tlb)
> -GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h_tlb)
> -GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w_tlb)
> -GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d_tlb)
> -GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b_tlb)
> -GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h_tlb)
> -GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w_tlb)
> -GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d_tlb)
> -GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b_tlb)
> -GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h_tlb)
> -GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w_tlb)
> -GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d_tlb)
> -
> -#define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
> -void HELPER(NAME)(void *vd, target_ulong base,       \
> -                  CPURISCVState *env, uint32_t desc) \
> -{                                                    \
> -    vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
> -                    ctzl(sizeof(ETYPE)), GETPC());   \
> -}
> -
> -GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b_tlb)
> -GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b_tlb)
> -GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b_tlb)
> -GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b_tlb)
> +#define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)   \
> +void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,  \
> +                  uint32_t desc)                                    \
> +{                                                                   \
> +    vext_ldst_whole(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST, \
> +                    ctzl(sizeof(ETYPE)), GETPC(), true);            \
> +}
> +
> +GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b_tlb, lde_b_host)
> +GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h_tlb, lde_h_host)
> +GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w_tlb, lde_w_host)
> +GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d_tlb, lde_d_host)
> +GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b_tlb, lde_b_host)
> +GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h_tlb, lde_h_host)
> +GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w_tlb, lde_w_host)
> +GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d_tlb, lde_d_host)
> +GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b_tlb, lde_b_host)
> +GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h_tlb, lde_h_host)
> +GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w_tlb, lde_w_host)
> +GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d_tlb, lde_d_host)
> +GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b_tlb, lde_b_host)
> +GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h_tlb, lde_h_host)
> +GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w_tlb, lde_w_host)
> +GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d_tlb, lde_d_host)
> +
> +#define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST)     \
> +void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,      \
> +                  uint32_t desc)                                        \
> +{                                                                       \
> +    vext_ldst_whole(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST,   \
> +                    ctzl(sizeof(ETYPE)), GETPC(), false);               \
> +}
> +
> +GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b_tlb, ste_b_host)
> +GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b_tlb, ste_b_host)
> +GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b_tlb, ste_b_host)
> +GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b_tlb, ste_b_host)
>   
>   /*
>    * Vector Integer Arithmetic Instructions
diff mbox series

Patch

diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index c2fcf8b3a00..824e6401736 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -665,80 +665,91 @@  GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb)
  */
 static void
 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
-                vext_ldst_elem_fn_tlb *ldst_elem, uint32_t log2_esz,
-                uintptr_t ra)
+                vext_ldst_elem_fn_tlb *ldst_tlb,
+                vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
+                uintptr_t ra, bool is_load)
 {
-    uint32_t i, k, off, pos;
+    target_ulong page_split, elems, addr;
     uint32_t nf = vext_nf(desc);
     uint32_t vlenb = riscv_cpu_cfg(env)->vlenb;
     uint32_t max_elems = vlenb >> log2_esz;
+    uint32_t evl = nf * max_elems;
+    uint32_t esz = 1 << log2_esz;
+    int mmu_index = riscv_env_mmu_index(env, false);
 
-    if (env->vstart >= ((vlenb * nf) >> log2_esz)) {
-        env->vstart = 0;
-        return;
+    /* Calculate the page range of first page */
+    addr = base + (env->vstart << log2_esz);
+    page_split = -(addr | TARGET_PAGE_MASK);
+    /* Get number of elements */
+    elems = page_split / esz;
+    if (unlikely(env->vstart + elems >= evl)) {
+        elems = evl - env->vstart;
     }
 
-    k = env->vstart / max_elems;
-    off = env->vstart % max_elems;
-
-    if (off) {
-        /* load/store rest of elements of current segment pointed by vstart */
-        for (pos = off; pos < max_elems; pos++, env->vstart++) {
-            target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
-            ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd,
-                      ra);
-        }
-        k++;
+    /* Load/store elements in the first page */
+    if (likely(elems)) {
+        vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
+                          is_load, mmu_index, ldst_tlb, ldst_host, ra);
     }
 
-    /* load/store elements for rest of segments */
-    for (; k < nf; k++) {
-        for (i = 0; i < max_elems; i++, env->vstart++) {
-            target_ulong addr = base + ((i + k * max_elems) << log2_esz);
-            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
+    /* Load/store elements in the second page */
+    if (unlikely(env->vstart < evl)) {
+        /* Cross page element */
+        if (unlikely(page_split % esz)) {
+            addr = base + (env->vstart << log2_esz);
+            ldst_tlb(env, adjust_addr(env, addr), env->vstart, vd, ra);
+            env->vstart++;
         }
+
+        addr = base + (env->vstart << log2_esz);
+        /* Get number of elements of second page */
+        elems = evl - env->vstart;
+
+        /* Load/store elements in the second page */
+        vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
+                          is_load, mmu_index, ldst_tlb, ldst_host, ra);
     }
 
     env->vstart = 0;
 }
 
-#define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
-void HELPER(NAME)(void *vd, target_ulong base,       \
-                  CPURISCVState *env, uint32_t desc) \
-{                                                    \
-    vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
-                    ctzl(sizeof(ETYPE)), GETPC());   \
-}
-
-GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b_tlb)
-GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h_tlb)
-GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w_tlb)
-GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d_tlb)
-GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b_tlb)
-GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h_tlb)
-GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w_tlb)
-GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d_tlb)
-GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b_tlb)
-GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h_tlb)
-GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w_tlb)
-GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d_tlb)
-GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b_tlb)
-GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h_tlb)
-GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w_tlb)
-GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d_tlb)
-
-#define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
-void HELPER(NAME)(void *vd, target_ulong base,       \
-                  CPURISCVState *env, uint32_t desc) \
-{                                                    \
-    vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
-                    ctzl(sizeof(ETYPE)), GETPC());   \
-}
-
-GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b_tlb)
-GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b_tlb)
-GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b_tlb)
-GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b_tlb)
+#define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)   \
+void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,  \
+                  uint32_t desc)                                    \
+{                                                                   \
+    vext_ldst_whole(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST, \
+                    ctzl(sizeof(ETYPE)), GETPC(), true);            \
+}
+
+GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b_tlb, lde_b_host)
+GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h_tlb, lde_h_host)
+GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w_tlb, lde_w_host)
+GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d_tlb, lde_d_host)
+GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b_tlb, lde_b_host)
+GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h_tlb, lde_h_host)
+GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w_tlb, lde_w_host)
+GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d_tlb, lde_d_host)
+GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b_tlb, lde_b_host)
+GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h_tlb, lde_h_host)
+GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w_tlb, lde_w_host)
+GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d_tlb, lde_d_host)
+GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b_tlb, lde_b_host)
+GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h_tlb, lde_h_host)
+GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w_tlb, lde_w_host)
+GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d_tlb, lde_d_host)
+
+#define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST)     \
+void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,      \
+                  uint32_t desc)                                        \
+{                                                                       \
+    vext_ldst_whole(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST,   \
+                    ctzl(sizeof(ETYPE)), GETPC(), false);               \
+}
+
+GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b_tlb, ste_b_host)
+GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b_tlb, ste_b_host)
+GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b_tlb, ste_b_host)
+GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b_tlb, ste_b_host)
 
 /*
  * Vector Integer Arithmetic Instructions