Message ID: 20240918171412.150107-5-max.chou@sifive.com
State: New
Series: [v6,1/7] target/riscv: Set vdata.vm field for vector load/store whole register instructions
On 9/18/24 2:14 PM, Max Chou wrote:
> The vector unit-stride whole register load/store instructions are
> similar to the unmasked unit-stride load/store instructions, which
> makes them suitable for the same optimization: a fast path that
> accesses host RAM directly.
>
> Because the vector whole register load/store instructions do not need
> to handle the tail-agnostic policy, remove the vstart early exit check.
>
> Signed-off-by: Max Chou <max.chou@sifive.com>
> ---
> target/riscv/vector_helper.c | 129 +++++++++++++++++++----------------
> 1 file changed, 70 insertions(+), 59 deletions(-)

Reviewed-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index c2fcf8b3a00..824e6401736 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -665,80 +665,91 @@ GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb)
  */
 static void
 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
-                vext_ldst_elem_fn_tlb *ldst_elem, uint32_t log2_esz,
-                uintptr_t ra)
+                vext_ldst_elem_fn_tlb *ldst_tlb,
+                vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
+                uintptr_t ra, bool is_load)
 {
-    uint32_t i, k, off, pos;
+    target_ulong page_split, elems, addr;
     uint32_t nf = vext_nf(desc);
     uint32_t vlenb = riscv_cpu_cfg(env)->vlenb;
     uint32_t max_elems = vlenb >> log2_esz;
+    uint32_t evl = nf * max_elems;
+    uint32_t esz = 1 << log2_esz;
+    int mmu_index = riscv_env_mmu_index(env, false);
 
-    if (env->vstart >= ((vlenb * nf) >> log2_esz)) {
-        env->vstart = 0;
-        return;
+    /* Calculate the page range of first page */
+    addr = base + (env->vstart << log2_esz);
+    page_split = -(addr | TARGET_PAGE_MASK);
+    /* Get number of elements */
+    elems = page_split / esz;
+    if (unlikely(env->vstart + elems >= evl)) {
+        elems = evl - env->vstart;
     }
 
-    k = env->vstart / max_elems;
-    off = env->vstart % max_elems;
-
-    if (off) {
-        /* load/store rest of elements of current segment pointed by vstart */
-        for (pos = off; pos < max_elems; pos++, env->vstart++) {
-            target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
-            ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd,
-                      ra);
-        }
-        k++;
+    /* Load/store elements in the first page */
+    if (likely(elems)) {
+        vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
+                          is_load, mmu_index, ldst_tlb, ldst_host, ra);
     }
 
-    /* load/store elements for rest of segments */
-    for (; k < nf; k++) {
-        for (i = 0; i < max_elems; i++, env->vstart++) {
-            target_ulong addr = base + ((i + k * max_elems) << log2_esz);
-            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
+    /* Load/store elements in the second page */
+    if (unlikely(env->vstart < evl)) {
+        /* Cross page element */
+        if (unlikely(page_split % esz)) {
+            addr = base + (env->vstart << log2_esz);
+            ldst_tlb(env, adjust_addr(env, addr), env->vstart, vd, ra);
+            env->vstart++;
         }
+
+        addr = base + (env->vstart << log2_esz);
+        /* Get number of elements of second page */
+        elems = evl - env->vstart;
+
+        /* Load/store elements in the second page */
+        vext_page_ldst_us(env, vd, addr, elems, 1, max_elems, log2_esz,
+                          is_load, mmu_index, ldst_tlb, ldst_host, ra);
     }
 
     env->vstart = 0;
 }
 
-#define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
-void HELPER(NAME)(void *vd, target_ulong base,       \
-                  CPURISCVState *env, uint32_t desc) \
-{                                                    \
-    vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
-                    ctzl(sizeof(ETYPE)), GETPC());   \
-}
-
-GEN_VEXT_LD_WHOLE(vl1re8_v, int8_t, lde_b_tlb)
-GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h_tlb)
-GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w_tlb)
-GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d_tlb)
-GEN_VEXT_LD_WHOLE(vl2re8_v, int8_t, lde_b_tlb)
-GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h_tlb)
-GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w_tlb)
-GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d_tlb)
-GEN_VEXT_LD_WHOLE(vl4re8_v, int8_t, lde_b_tlb)
-GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h_tlb)
-GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w_tlb)
-GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d_tlb)
-GEN_VEXT_LD_WHOLE(vl8re8_v, int8_t, lde_b_tlb)
-GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h_tlb)
-GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w_tlb)
-GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d_tlb)
-
-#define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
-void HELPER(NAME)(void *vd, target_ulong base,       \
-                  CPURISCVState *env, uint32_t desc) \
-{                                                    \
-    vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
-                    ctzl(sizeof(ETYPE)), GETPC());   \
-}
-
-GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b_tlb)
-GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b_tlb)
-GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b_tlb)
-GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b_tlb)
+#define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)   \
+void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,  \
+                  uint32_t desc)                                    \
+{                                                                   \
+    vext_ldst_whole(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST, \
+                    ctzl(sizeof(ETYPE)), GETPC(), true);            \
+}
+
+GEN_VEXT_LD_WHOLE(vl1re8_v, int8_t, lde_b_tlb, lde_b_host)
+GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h_tlb, lde_h_host)
+GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w_tlb, lde_w_host)
+GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d_tlb, lde_d_host)
+GEN_VEXT_LD_WHOLE(vl2re8_v, int8_t, lde_b_tlb, lde_b_host)
+GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h_tlb, lde_h_host)
+GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w_tlb, lde_w_host)
+GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d_tlb, lde_d_host)
+GEN_VEXT_LD_WHOLE(vl4re8_v, int8_t, lde_b_tlb, lde_b_host)
+GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h_tlb, lde_h_host)
+GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w_tlb, lde_w_host)
+GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d_tlb, lde_d_host)
+GEN_VEXT_LD_WHOLE(vl8re8_v, int8_t, lde_b_tlb, lde_b_host)
+GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h_tlb, lde_h_host)
+GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w_tlb, lde_w_host)
+GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d_tlb, lde_d_host)
+
+#define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST)   \
+void HELPER(NAME)(void *vd, target_ulong base, CPURISCVState *env,    \
+                  uint32_t desc)                                      \
+{                                                                     \
+    vext_ldst_whole(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST, \
+                    ctzl(sizeof(ETYPE)), GETPC(), false);             \
+}
+
+GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b_tlb, ste_b_host)
+GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b_tlb, ste_b_host)
+GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b_tlb, ste_b_host)
+GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b_tlb, ste_b_host)
 
 /*
  * Vector Integer Arithmetic Instructions
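A note on the page-split idiom in the new vext_ldst_whole(): the expression -(addr | TARGET_PAGE_MASK) computes the number of bytes left in the page containing addr. The standalone sketch below (assuming 4 KiB pages; the PAGE_MASK macro and the main() harness are illustrative, not QEMU code) walks through the first-page element count calculation:

#include <stdint.h>
#include <stdio.h>

#define PAGE_BITS 12                          /* assume 4 KiB pages */
#define PAGE_MASK ((uint64_t)-1 << PAGE_BITS) /* same shape as TARGET_PAGE_MASK */

int main(void)
{
    uint64_t base = 0x10000ff8;   /* 8 bytes before a page boundary */
    uint64_t vstart = 0;
    uint32_t log2_esz = 2;        /* 4-byte elements */
    uint32_t esz = 1 << log2_esz;
    uint32_t evl = 32;            /* total number of elements */

    uint64_t addr = base + (vstart << log2_esz);
    /* Bytes remaining in the page containing addr: with a negative
     * page mask, addr | PAGE_MASK equals offset - PAGE_SIZE, so
     * negating it yields PAGE_SIZE - offset. */
    uint64_t page_split = -(addr | PAGE_MASK);
    /* Whole elements that fit before the page boundary */
    uint64_t elems = page_split / esz;
    if (vstart + elems >= evl) {
        elems = evl - vstart;     /* transfer ends within the first page */
    }

    /* Prints "page_split=8, first-page elems=2": two elements go via
     * the fast path; since page_split % esz == 0 here, no element
     * straddles the boundary. */
    printf("page_split=%llu, first-page elems=%llu\n",
           (unsigned long long)page_split, (unsigned long long)elems);
    return 0;
}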
The vector unit-stride whole register load/store instructions are
similar to the unmasked unit-stride load/store instructions, which
makes them suitable for the same optimization: a fast path that
accesses host RAM directly.

Because the vector whole register load/store instructions do not need
to handle the tail-agnostic policy, remove the vstart early exit check.

Signed-off-by: Max Chou <max.chou@sifive.com>
---
 target/riscv/vector_helper.c | 129 +++++++++++++++++++----------------
 1 file changed, 70 insertions(+), 59 deletions(-)
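To make the resulting two-pass flow concrete, here is a self-contained simulation of a whole register load whose base address sits just before a page boundary. The fast_path()/slow_path() helpers are hypothetical stand-ins for the host-page and per-element TLB accessors (the real QEMU functions have different signatures), and the page size and data are made up for the example:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096u

/* Stand-in for the host fast path: one run of elements per call. */
static void fast_path(uint8_t *dst, const uint8_t *src, size_t nbytes)
{
    memcpy(dst, src, nbytes);
}

/* Stand-in for the TLB slow path: a single element per call. */
static void slow_path(uint8_t *dst, const uint8_t *src, size_t esz)
{
    memcpy(dst, src, esz);
}

int main(void)
{
    uint8_t mem[2 * PAGE_SIZE];          /* two adjacent "pages" */
    uint8_t vreg[64];                    /* destination register group */
    uint32_t esz = 4;
    uint32_t evl = sizeof(vreg) / esz;   /* 16 elements in total */
    uint32_t vstart = 0;
    uint64_t base = PAGE_SIZE - 10;      /* element 2 straddles the boundary */

    for (size_t i = 0; i < sizeof(mem); i++) {
        mem[i] = (uint8_t)i;
    }

    /* Pass 1: whole elements that fit in the first page (2 here). */
    uint32_t page_split = PAGE_SIZE - (uint32_t)(base % PAGE_SIZE);
    uint32_t elems = page_split / esz;
    if (vstart + elems >= evl) {
        elems = evl - vstart;
    }
    if (elems) {
        fast_path(&vreg[vstart * esz], &mem[base + vstart * esz],
                  (size_t)elems * esz);
        vstart += elems;
    }

    if (vstart < evl) {
        /* The element straddling the boundary takes the slow path. */
        if (page_split % esz) {
            slow_path(&vreg[vstart * esz], &mem[base + vstart * esz], esz);
            vstart++;
        }
        /* Pass 2: the remaining elements lie entirely in the second
         * page, so one more fast-path run finishes the transfer. */
        fast_path(&vreg[vstart * esz], &mem[base + vstart * esz],
                  (size_t)(evl - vstart) * esz);
    }

    printf("vreg[0]=%u (expect %u), vreg[63]=%u (expect %u)\n",
           (unsigned)vreg[0], (unsigned)(uint8_t)base,
           (unsigned)vreg[63], (unsigned)(uint8_t)(base + 63));
    return 0;
}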