Field | Value |
---|---|
Message ID | 20241111130324.32487-2-paolo.savini@embecosm.com |
State | New |
Series | target/riscv: rvv: reduce the overhead for simple RISC-V vector. |
On 2024/11/11 9:03 PM, Paolo Savini wrote:
> This patch improves the performance of the emulation of the RVV unit-stride
> loads and stores in the following cases:
>
> - when the data being loaded/stored per iteration amounts to 8 bytes or less.
> - when the vector length is 16 bytes (VLEN=128) and there's no grouping of the
>   vector registers (LMUL=1).
>
> The optimization consists of avoiding the overhead of probing the RAM of the
> host machine and doing a loop load/store on the input data grouped in chunks
> of as many bytes as possible (8,4,2,1 bytes).
>
> Co-authored-by: Helene CHELIN <helene.chelin@embecosm.com>
> Co-authored-by: Paolo Savini <paolo.savini@embecosm.com>
>
> Signed-off-by: Helene CHELIN <helene.chelin@embecosm.com>
> Signed-off-by: Paolo Savini <paolo.savini@embecosm.com>
>
> Reviewed-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
> ---
>  target/riscv/vector_helper.c | 47 ++++++++++++++++++++++++++++++++++++
>  1 file changed, 47 insertions(+)
>
> diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
> index 4479726acf..75c24653f0 100644
> --- a/target/riscv/vector_helper.c
> +++ b/target/riscv/vector_helper.c
> @@ -635,6 +635,53 @@ vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
>
>      VSTART_CHECK_EARLY_EXIT(env);
>
> +#if defined(CONFIG_USER_ONLY) && !HOST_BIG_ENDIAN
> +    /* For data sizes <= 64 bits and for LMUL=1 with VLEN=128 bits we get a
> +     * better performance by doing a simple simulation of the load/store
> +     * without the overhead of prodding the host RAM */
> +    if ((nf == 1) && ((evl << log2_esz) <= 8 ||
> +        ((vext_lmul(desc) == 0) && (simd_maxsz(desc) == 16)))) {
> +
> +        uint32_t evl_b = evl << log2_esz;
> +
> +        for (uint32_t j = env->vstart; j < evl_b;) {
> +            addr = base + j;
> +            if ((evl_b - j) >= 8) {
> +                if (is_load)
> +                    lde_d_tlb(env, adjust_addr(env, addr), j, vd, ra);
> +                else
> +                    ste_d_tlb(env, adjust_addr(env, addr), j, vd, ra);
> +                j += 8;
> +            }
> +            else if ((evl_b - j) >= 4) {
> +                if (is_load)
> +                    lde_w_tlb(env, adjust_addr(env, addr), j, vd, ra);
> +                else
> +                    ste_w_tlb(env, adjust_addr(env, addr), j, vd, ra);
> +                j += 4;
> +            }
> +            else if ((evl_b - j) >= 2) {
> +                if (is_load)
> +                    lde_h_tlb(env, adjust_addr(env, addr), j, vd, ra);
> +                else
> +                    ste_h_tlb(env, adjust_addr(env, addr), j, vd, ra);
> +                j += 2;
> +            }
> +            else {
> +                if (is_load)
> +                    lde_b_tlb(env, adjust_addr(env, addr), j, vd, ra);
> +                else
> +                    ste_b_tlb(env, adjust_addr(env, addr), j, vd, ra);
> +                j += 1;
> +            }
> +        }
> +
> +        env->vstart = 0;
> +        vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
> +        return;
> +    }
> +#endif
> +
>      vext_cont_ldst_elements(&info, base, env->vreg, env->vstart, evl, desc,
>                              log2_esz, false);
>      /* Probe the page(s). Exit with exception for any invalid page. */

I think there is a potential issue in this patch. If an exception is raised by an element covered by this optimization, the vstart CSR will end up with an unexpected value, because this fast path does not update vstart as it goes.

Max
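To illustrate the concern above: one possible direction (an untested sketch against the hunk quoted in the reply, assuming vstart holds an element index exactly as the existing slow path treats it) is to record progress in env->vstart before each chunked access, so that a fault taken in the fast path leaves vstart pointing at the chunk being processed rather than at a stale value:

```c
/*
 * Sketch only, not a reviewed fix: keep vstart in sync inside the fast
 * path.  j is a byte offset, so the corresponding element index is
 * j >> log2_esz.  Note the bookkeeping is chunk-granular: a fault inside
 * an 8-byte chunk would report the first element of that chunk.
 */
for (uint32_t j = env->vstart << log2_esz; j < evl_b;) {
    env->vstart = j >> log2_esz;
    addr = base + j;
    if ((evl_b - j) >= 8) {
        if (is_load) {
            lde_d_tlb(env, adjust_addr(env, addr), j, vd, ra);
        } else {
            ste_d_tlb(env, adjust_addr(env, addr), j, vd, ra);
        }
        j += 8;
    }
    /* ... the 4-, 2- and 1-byte branches would be updated the same way ... */
}
env->vstart = 0;
```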
```diff
diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index 4479726acf..75c24653f0 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -635,6 +635,53 @@ vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
 
     VSTART_CHECK_EARLY_EXIT(env);
 
+#if defined(CONFIG_USER_ONLY) && !HOST_BIG_ENDIAN
+    /* For data sizes <= 64 bits and for LMUL=1 with VLEN=128 bits we get a
+     * better performance by doing a simple simulation of the load/store
+     * without the overhead of prodding the host RAM */
+    if ((nf == 1) && ((evl << log2_esz) <= 8 ||
+        ((vext_lmul(desc) == 0) && (simd_maxsz(desc) == 16)))) {
+
+        uint32_t evl_b = evl << log2_esz;
+
+        for (uint32_t j = env->vstart; j < evl_b;) {
+            addr = base + j;
+            if ((evl_b - j) >= 8) {
+                if (is_load)
+                    lde_d_tlb(env, adjust_addr(env, addr), j, vd, ra);
+                else
+                    ste_d_tlb(env, adjust_addr(env, addr), j, vd, ra);
+                j += 8;
+            }
+            else if ((evl_b - j) >= 4) {
+                if (is_load)
+                    lde_w_tlb(env, adjust_addr(env, addr), j, vd, ra);
+                else
+                    ste_w_tlb(env, adjust_addr(env, addr), j, vd, ra);
+                j += 4;
+            }
+            else if ((evl_b - j) >= 2) {
+                if (is_load)
+                    lde_h_tlb(env, adjust_addr(env, addr), j, vd, ra);
+                else
+                    ste_h_tlb(env, adjust_addr(env, addr), j, vd, ra);
+                j += 2;
+            }
+            else {
+                if (is_load)
+                    lde_b_tlb(env, adjust_addr(env, addr), j, vd, ra);
+                else
+                    ste_b_tlb(env, adjust_addr(env, addr), j, vd, ra);
+                j += 1;
+            }
+        }
+
+        env->vstart = 0;
+        vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
+        return;
+    }
+#endif
+
     vext_cont_ldst_elements(&info, base, env->vreg, env->vstart, evl, desc,
                             log2_esz, false);
     /* Probe the page(s). Exit with exception for any invalid page. */
```
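For reference, the fast path in this patch walks the byte range [vstart, evl << log2_esz) and always picks the widest access that still fits, falling back to 4, 2 and finally 1 byte near the end. A small standalone illustration of that chunk selection (plain C with made-up names, not QEMU code):

```c
#include <stdint.h>
#include <stdio.h>

/* Print the access widths the chunked loop would choose for a run of
 * evl_b bytes starting at offset 0 (illustration only, not QEMU code). */
static void show_chunks(uint32_t evl_b)
{
    printf("%u bytes:", evl_b);
    for (uint32_t j = 0; j < evl_b;) {
        uint32_t step;
        if (evl_b - j >= 8) {
            step = 8;   /* 64-bit access */
        } else if (evl_b - j >= 4) {
            step = 4;   /* 32-bit access */
        } else if (evl_b - j >= 2) {
            step = 2;   /* 16-bit access */
        } else {
            step = 1;   /* 8-bit access */
        }
        printf(" %u", step);
        j += step;
    }
    printf("\n");
}

int main(void)
{
    show_chunks(16);    /* VLEN=128, LMUL=1: two 64-bit accesses */
    show_chunks(7);     /* e.g. 7 bytes of data: 4 + 2 + 1 */
    return 0;
}
```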