[RFC,v2,5/6] target/riscv: rvv: Optimize v[l|s]e8.v with limitations

Message ID 20240531174504.281461-6-max.chou@sifive.com
State New
Series Improve the performance of RISC-V vector unit-stride/whole register ld/st instructions

Commit Message

Max Chou May 31, 2024, 5:44 p.m. UTC
The vector unit-stride load/store instructions (e.g. vle8.v/vse8.v)
perform contiguous loads/stores. We can replace the corresponding helper
functions with TCG ops that copy more data at a time, under the following
assumptions (see the sketch after the list):

* Perform virtual address resolution once for entire vector at beginning
* Without mask
* Without tail agnostic
* Both host and target are little endian
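
Roughly, the fast path is meant to behave like the following C sketch
(illustrative only; the function name is hypothetical and the patch emits
the equivalent TCG ops inline rather than calling such a function):

#include <stddef.h>
#include <string.h>

/*
 * After the guest address has been resolved once (the patch does this via
 * gen_helper_check_probe_read/write), copy vl * nf bytes into the
 * destination vector register in 16-byte chunks, finishing with
 * 8/4/2/1-byte tail steps.  Assumes no mask, no tail agnostic handling,
 * and little-endian host and target, as listed above.
 */
static void vle8_fast_path_sketch(void *vd, const void *host, size_t len)
{
    size_t off = 0;

    while (len - off >= 16) {                       /* 128-bit main loop */
        memcpy((char *)vd + off, (const char *)host + off, 16);
        off += 16;
    }
    for (size_t step = 8; step >= 1; step /= 2) {   /* tail: 8/4/2/1 bytes */
        if (len - off >= step) {
            memcpy((char *)vd + off, (const char *)host + off, step);
            off += step;
        }
    }
}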

Signed-off-by: Max Chou <max.chou@sifive.com>
---
 target/riscv/insn_trans/trans_rvv.c.inc | 197 +++++++++++++++++++++++-
 1 file changed, 195 insertions(+), 2 deletions(-)

Comments

Richard Henderson June 2, 2024, 5:45 p.m. UTC | #1
On 5/31/24 12:44, Max Chou wrote:
> The vector unit-stride load/store instructions (e.g. vle8.v/vse8.v)
> perform contiguous loads/stores. We can replace the corresponding helper
> functions with TCG ops that copy more data at a time, under the following
> assumptions:
> 
> * Perform virtual address resolution once for entire vector at beginning
> * Without mask
> * Without tail agnostic
> * Both host and target are little endian
> 
> Signed-off-by: Max Chou <max.chou@sifive.com>

Why are you generating all of this inline?  This expansion is very large.  I would expect 
you to get better performance with a helper function.

AGAIN, please see the Arm implementation.


r~
Max Chou June 3, 2024, 3:50 p.m. UTC | #2
Hi Richard,

Thank you for your feedback.
This version was created by referencing the gen_sve_ldr translation
function, with similar assumptions: no mask (predication), no tail
agnostic handling, and a contiguous load/store.
You are right, the expansion is large in this version (more than the 20
TCG instructions suggested in the tcg-op documentation).
I will provide the next version with a helper function implementation like
sve_ldN_r in the Arm target; a rough sketch of that direction follows.
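
For reference, such a helper could look roughly like the sketch below
(as it would sit in target/riscv/vector_helper.c, where the needed
declarations are visible).  This is only an illustration: everything except
probe_access, memcpy, MMU_DATA_LOAD and the basic QEMU types is
hypothetical, and it assumes the whole access stays within a single page.

static void vext_contig_ld_sketch(CPURISCVState *env, void *vd,
                                  target_ulong addr, uint32_t len,
                                  int mmu_idx, uintptr_t ra)
{
    /*
     * probe_access() faults exactly like a real access would and, for plain
     * RAM contained in one page, returns a directly usable host pointer.
     */
    void *host = probe_access(env, addr, len, MMU_DATA_LOAD, mmu_idx, ra);

    if (likely(host)) {
        /* Little-endian host and target: one bulk byte copy is enough. */
        memcpy(vd, host, len);
    } else {
        /*
         * MMIO, watchpoints, etc.: fall back to the existing per-element
         * path (hypothetical name).
         */
        vext_contig_ld_slow(env, vd, addr, len, mmu_idx, ra);
    }
}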

Thank you,
Max

On 2024/6/3 1:45 AM, Richard Henderson wrote:
> On 5/31/24 12:44, Max Chou wrote:
>> The vector unit-stride load/store instructions (e.g. vle8.v/vse8.v)
>> perform contiguous loads/stores. We can replace the corresponding helper
>> functions with TCG ops that copy more data at a time, under the following
>> assumptions:
>>
>> * Perform virtual address resolution once for entire vector at beginning
>> * Without mask
>> * Without tail agnostic
>> * Both host and target are little endian
>>
>> Signed-off-by: Max Chou <max.chou@sifive.com>
>
> Why are you generating all of this inline?  This expansion is very 
> large.  I would expect you to get better performance with a helper 
> function.
>
> AGAIN, please see the Arm implementation.
>
>
> r~
Richard Henderson June 4, 2024, 12:58 a.m. UTC | #3
On 6/3/24 10:50, Max Chou wrote:
> Hi Richard,
> 
> Thank you for your feedback.
> This version was created by referencing the gen_sve_ldr translation function, with
> similar assumptions: no mask (predication), no tail agnostic handling, and a contiguous load/store.

Except that gen_sve_ldr has a compile-time constant for the vector length, which is always 
a multiple of 16, and so has no extra special cases like you needed.


r~
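
Concretely, the difference can be sketched like this (simplified,
illustrative C; not the actual QEMU code):

#include <stddef.h>
#include <string.h>

/*
 * Arm SVE case: the byte count is a translate-time constant and a multiple
 * of 16, so the copy is just a fixed sequence of 16-byte chunks with no
 * runtime tail handling.
 */
static void copy_const_len(void *dst, const void *src, size_t len)
{
    for (size_t off = 0; off < len; off += 16) {   /* len % 16 == 0 */
        memcpy((char *)dst + off, (const char *)src + off, 16);
    }
}

/*
 * RVV case: vl (and therefore the byte count) is only known at run time, so
 * the generated code also needs the 8/4/2/1-byte remainder branches that
 * this patch emits.
 */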

Patch

diff --git a/target/riscv/insn_trans/trans_rvv.c.inc b/target/riscv/insn_trans/trans_rvv.c.inc
index 1e4fa797a86..bbac73bb12b 100644
--- a/target/riscv/insn_trans/trans_rvv.c.inc
+++ b/target/riscv/insn_trans/trans_rvv.c.inc
@@ -714,7 +714,105 @@  static bool ld_us_check(DisasContext *s, arg_r2nfvm* a, uint8_t eew)
            vext_check_load(s, a->rd, a->nf, a->vm, eew);
 }
 
-GEN_VEXT_TRANS(vle8_v,  MO_8,  r2nfvm, ld_us_op, ld_us_check)
+static bool trans_vle8_v(DisasContext *s, arg_r2nfvm * a)
+{
+
+    if (ld_us_check(s, a, MO_8)) {
+        if (!HOST_BIG_ENDIAN && s->vstart_eq_zero && s->vta == 0 && a->vm) {
+            uint32_t vofs = vreg_ofs(s, a->rd);
+            uint32_t midx = s->mem_idx;
+
+            TCGv_i64 t0, t1;
+            TCGv_i128 t16;
+            TCGv_ptr tp;
+            TCGv_ptr i = tcg_temp_new_ptr();
+            TCGv len_remain = tcg_temp_new();
+            TCGv rs1 = get_gpr(s, a->rs1, EXT_NONE);
+            TCGv addr = tcg_temp_new();
+
+            TCGLabel *loop_128 = gen_new_label();
+            TCGLabel *remain_64 = gen_new_label();
+            TCGLabel *remain_32 = gen_new_label();
+            TCGLabel *remain_16 = gen_new_label();
+            TCGLabel *remain_8 = gen_new_label();
+            TCGLabel *over = gen_new_label();
+
+            tcg_gen_mov_tl(addr, rs1);
+            tcg_gen_mov_tl(len_remain, cpu_vl);
+            tcg_gen_muli_tl(len_remain, len_remain, a->nf);
+            tcg_gen_movi_ptr(i, 0);
+
+            tcg_gen_brcond_tl(TCG_COND_GEU, cpu_vstart, cpu_vl, over);
+            gen_helper_check_probe_read(tcg_env, addr, len_remain);
+
+            tcg_gen_brcondi_tl(TCG_COND_LTU, len_remain, 16, remain_64);
+
+            gen_set_label(loop_128);
+
+            t16 = tcg_temp_new_i128();
+            tcg_gen_qemu_ld_i128(t16, addr, midx,
+                                 MO_LE | MO_128 | MO_ATOM_NONE);
+            tcg_gen_addi_tl(addr, addr, 16);
+
+            tp = tcg_temp_new_ptr();
+            tcg_gen_add_ptr(tp, tcg_env, i);
+            tcg_gen_addi_ptr(i, i, 16);
+
+            t0 = tcg_temp_new_i64();
+            t1 = tcg_temp_new_i64();
+            tcg_gen_extr_i128_i64(t0, t1, t16);
+
+            tcg_gen_st_i64(t0, tp, vofs);
+            tcg_gen_st_i64(t1, tp, vofs + 8);
+            tcg_gen_subi_tl(len_remain, len_remain, 16);
+
+            tcg_gen_brcondi_tl(TCG_COND_GEU, len_remain, 16, loop_128);
+
+            gen_set_label(remain_64);
+            tcg_gen_brcondi_tl(TCG_COND_LTU, len_remain, 8, remain_32);
+            tcg_gen_qemu_ld_i64(t0, addr, midx, MO_LEUQ | MO_ATOM_NONE);
+            tcg_gen_addi_tl(addr, addr, 8);
+            tcg_gen_add_ptr(tp, tcg_env, i);
+            tcg_gen_addi_ptr(i, i, 8);
+            tcg_gen_st_i64(t0, tp, vofs);
+            tcg_gen_subi_tl(len_remain, len_remain, 8);
+
+            gen_set_label(remain_32);
+            tcg_gen_brcondi_tl(TCG_COND_LTU, len_remain, 4, remain_16);
+            tcg_gen_qemu_ld_i64(t0, addr, midx, MO_LEUL | MO_ATOM_NONE);
+            tcg_gen_addi_tl(addr, addr, 4);
+            tcg_gen_add_ptr(tp, tcg_env, i);
+            tcg_gen_addi_ptr(i, i, 4);
+            tcg_gen_st32_i64(t0, tp, vofs);
+            tcg_gen_subi_tl(len_remain, len_remain, 4);
+
+            gen_set_label(remain_16);
+            tcg_gen_brcondi_tl(TCG_COND_LTU, len_remain, 2, remain_8);
+            tcg_gen_qemu_ld_i64(t0, addr, midx, MO_LEUW | MO_ATOM_NONE);
+            tcg_gen_addi_tl(addr, addr, 2);
+            tcg_gen_add_ptr(tp, tcg_env, i);
+            tcg_gen_addi_ptr(i, i, 2);
+            tcg_gen_st16_i64(t0, tp, vofs);
+            tcg_gen_subi_tl(len_remain, len_remain, 2);
+
+            gen_set_label(remain_8);
+            tcg_gen_brcondi_tl(TCG_COND_EQ, len_remain, 0, over);
+            tcg_gen_qemu_ld_i64(t0, addr, midx,
+                                MO_LE | MO_8 | MO_ATOM_NONE);
+            tcg_gen_add_ptr(tp, tcg_env, i);
+            tcg_gen_st8_i64(t0, tp, vofs);
+
+            gen_set_label(over);
+
+            finalize_rvv_inst(s);
+        } else {
+            return ld_us_op(s, a, MO_8);
+        }
+        return true;
+    }
+    return false;
+}
+
 GEN_VEXT_TRANS(vle16_v, MO_16, r2nfvm, ld_us_op, ld_us_check)
 GEN_VEXT_TRANS(vle32_v, MO_32, r2nfvm, ld_us_op, ld_us_check)
 GEN_VEXT_TRANS(vle64_v, MO_64, r2nfvm, ld_us_op, ld_us_check)
@@ -785,7 +883,102 @@  static bool st_us_check(DisasContext *s, arg_r2nfvm* a, uint8_t eew)
            vext_check_store(s, a->rd, a->nf, eew);
 }
 
-GEN_VEXT_TRANS(vse8_v,  MO_8,  r2nfvm, st_us_op, st_us_check)
+static bool trans_vse8_v(DisasContext *s, arg_r2nfvm * a)
+{
+    if (st_us_check(s, a, MO_8)) {
+        if (!HOST_BIG_ENDIAN && s->vstart_eq_zero && s->vta == 0 && a->vm) {
+            uint32_t vofs = vreg_ofs(s, a->rd);
+            uint32_t midx = s->mem_idx;
+
+            TCGv_i64 t0, t1;
+            TCGv_i128 t16;
+            TCGv_ptr tp;
+            TCGv_ptr i = tcg_temp_new_ptr();
+            TCGv len_remain = tcg_temp_new();
+            TCGv rs1 = get_gpr(s, a->rs1, EXT_NONE);
+            TCGv addr = tcg_temp_new();
+
+            TCGLabel *loop_128 = gen_new_label();
+            TCGLabel *remain_64 = gen_new_label();
+            TCGLabel *remain_32 = gen_new_label();
+            TCGLabel *remain_16 = gen_new_label();
+            TCGLabel *remain_8 = gen_new_label();
+            TCGLabel *over = gen_new_label();
+
+            tcg_gen_mov_tl(addr, rs1);
+            tcg_gen_mov_tl(len_remain, cpu_vl);
+            tcg_gen_muli_tl(len_remain, len_remain, a->nf);
+            tcg_gen_movi_ptr(i, 0);
+
+            tcg_gen_brcond_tl(TCG_COND_GEU, cpu_vstart, cpu_vl, over);
+            gen_helper_check_probe_write(tcg_env, addr, len_remain);
+
+            tcg_gen_brcondi_tl(TCG_COND_LTU, len_remain, 16, remain_64);
+
+            gen_set_label(loop_128);
+
+            t0 = tcg_temp_new_i64();
+            t1 = tcg_temp_new_i64();
+            tp = tcg_temp_new_ptr();
+            tcg_gen_add_ptr(tp, tcg_env, i);
+            tcg_gen_ld_i64(t0, tp, vofs);
+            tcg_gen_ld_i64(t1, tp, vofs + 8);
+            tcg_gen_addi_ptr(i, i, 16);
+
+            t16 = tcg_temp_new_i128();
+            tcg_gen_concat_i64_i128(t16, t0, t1);
+
+            tcg_gen_qemu_st_i128(t16, addr, midx,
+                                 MO_LE | MO_128 | MO_ATOM_NONE);
+            tcg_gen_addi_tl(addr, addr, 16);
+            tcg_gen_subi_tl(len_remain, len_remain, 16);
+
+            tcg_gen_brcondi_tl(TCG_COND_GEU, len_remain, 16, loop_128);
+
+            gen_set_label(remain_64);
+            tcg_gen_brcondi_tl(TCG_COND_LTU, len_remain, 8, remain_32);
+            tcg_gen_add_ptr(tp, tcg_env, i);
+            tcg_gen_ld_i64(t0, tp, vofs);
+            tcg_gen_addi_ptr(i, i, 8);
+            tcg_gen_qemu_st_i64(t0, addr, midx, MO_LEUQ | MO_ATOM_NONE);
+            tcg_gen_addi_tl(addr, addr, 8);
+            tcg_gen_subi_tl(len_remain, len_remain, 8);
+
+            gen_set_label(remain_32);
+            tcg_gen_brcondi_tl(TCG_COND_LTU, len_remain, 4, remain_16);
+            tcg_gen_add_ptr(tp, tcg_env, i);
+            tcg_gen_ld_i64(t0, tp, vofs);
+            tcg_gen_addi_ptr(i, i, 4);
+            tcg_gen_qemu_st_i64(t0, addr, midx, MO_LEUL | MO_ATOM_NONE);
+            tcg_gen_addi_tl(addr, addr, 4);
+            tcg_gen_subi_tl(len_remain, len_remain, 4);
+
+            gen_set_label(remain_16);
+            tcg_gen_brcondi_tl(TCG_COND_LTU, len_remain, 2, remain_8);
+            tcg_gen_add_ptr(tp, tcg_env, i);
+            tcg_gen_ld_i64(t0, tp, vofs);
+            tcg_gen_addi_ptr(i, i, 2);
+            tcg_gen_qemu_st_i64(t0, addr, midx, MO_LEUW | MO_ATOM_NONE);
+            tcg_gen_addi_tl(addr, addr, 2);
+            tcg_gen_subi_tl(len_remain, len_remain, 2);
+
+            gen_set_label(remain_8);
+            tcg_gen_brcondi_tl(TCG_COND_EQ, len_remain, 0, over);
+            tcg_gen_add_ptr(tp, tcg_env, i);
+            tcg_gen_ld_i64(t0, tp, vofs);
+            tcg_gen_qemu_st_i64(t0, addr, midx, MO_LE | MO_8 | MO_ATOM_NONE);
+
+            gen_set_label(over);
+
+            finalize_rvv_inst(s);
+        } else {
+            return st_us_op(s, a, MO_8);
+        }
+        return true;
+    }
+    return false;
+}
+
 GEN_VEXT_TRANS(vse16_v, MO_16, r2nfvm, st_us_op, st_us_check)
 GEN_VEXT_TRANS(vse32_v, MO_32, r2nfvm, st_us_op, st_us_check)
 GEN_VEXT_TRANS(vse64_v, MO_64, r2nfvm, st_us_op, st_us_check)