Message ID: 20240813113436.831-7-zhiwei_liu@linux.alibaba.com
State: New
Series: tcg/riscv: Add support for vector
On 8/13/24 21:34, LIU Zhiwei wrote:
> @@ -827,14 +850,59 @@ static void tcg_out_ldst(TCGContext *s, RISCVInsn opc, TCGReg data,
>  static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg arg,
>                         TCGReg arg1, intptr_t arg2)
>  {
> -    RISCVInsn insn = type == TCG_TYPE_I32 ? OPC_LW : OPC_LD;
> +    RISCVInsn insn;
> +
> +    if (type < TCG_TYPE_V64) {
> +        insn = type == TCG_TYPE_I32 ? OPC_LW : OPC_LD;
> +    } else {
> +        tcg_debug_assert(arg >= TCG_REG_V1);
> +        switch (prev_vece) {
> +        case MO_8:
> +            insn = OPC_VLE8_V;
> +            break;
> +        case MO_16:
> +            insn = OPC_VLE16_V;
> +            break;
> +        case MO_32:
> +            insn = OPC_VLE32_V;
> +            break;
> +        case MO_64:
> +            insn = OPC_VLE64_V;
> +            break;
> +        default:
> +            g_assert_not_reached();
> +        }
> +    }
>      tcg_out_ldst(s, insn, arg, arg1, arg2);

tcg_out_ld/st are called directly from register allocation spill/fill.
You'll need to set vtype here, and cannot rely on this having been done
in tcg_out_vec_op.

That said, with a little-endian host, the selected element size doesn't
matter *too* much.  A write of 8 uint16_t or a write of 2 uint64_t
produces the same bits in memory.

Therefore you can examine prev_vtype and adjust only if the vector
length changes.  But we do that -- e.g. load V256, store V256, store
V128 to perform a 384-bit store for AArch64 SVE when VQ=3.

Is there an advantage to using the vector load/store whole register
insns, if the requested length is not too small?  IIRC the NF field can
be used to store multiples, but we can't store half of a register with
these.


r~
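[The little-endian point above can be checked with a few lines of standalone host C. This is a minimal sketch, not QEMU code: it models one vector register as a union and shows that its memory image is the same whether it is "stored" as eight uint16_t lanes or as two uint64_t lanes, with each uint64_t lane being four uint16_t lanes packed least-significant-first.]

/* Standalone illustration of the element-size point above; not QEMU code. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    /* One storage area viewed as 8 x uint16_t and as 2 x uint64_t,
     * like one vector register seen under SEW=16 vs SEW=64. */
    union {
        uint16_t h[8];
        uint64_t d[2];
    } reg;
    uint8_t a[16], b[16];

    for (int i = 0; i < 8; i++) {
        reg.h[i] = (uint16_t)(0x1111 * (i + 1));
    }

    memcpy(a, reg.h, 16);               /* "store" as 8 x uint16_t */
    memcpy(b, reg.d, 16);               /* "store" as 2 x uint64_t */
    assert(memcmp(a, b, 16) == 0);      /* same bits in memory */

    /* Little-endian only: each u64 lane is four u16 lanes packed
     * least-significant-first, which is why the element size chosen
     * for a spill/fill is interchangeable there. */
    uint64_t expect = (uint64_t)reg.h[0]
                    | (uint64_t)reg.h[1] << 16
                    | (uint64_t)reg.h[2] << 32
                    | (uint64_t)reg.h[3] << 48;
    printf("lane packing matches: %s\n", reg.d[0] == expect ? "yes" : "no");
    return 0;
}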
On 2024/8/14 17:01, Richard Henderson wrote:
> On 8/13/24 21:34, LIU Zhiwei wrote:
>> @@ -827,14 +850,59 @@ static void tcg_out_ldst(TCGContext *s, RISCVInsn opc, TCGReg data,
>>  static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg arg,
>>                         TCGReg arg1, intptr_t arg2)
>>  {
>> -    RISCVInsn insn = type == TCG_TYPE_I32 ? OPC_LW : OPC_LD;
>> +    RISCVInsn insn;
>> +
>> +    if (type < TCG_TYPE_V64) {
>> +        insn = type == TCG_TYPE_I32 ? OPC_LW : OPC_LD;
>> +    } else {
>> +        tcg_debug_assert(arg >= TCG_REG_V1);
>> +        switch (prev_vece) {
>> +        case MO_8:
>> +            insn = OPC_VLE8_V;
>> +            break;
>> +        case MO_16:
>> +            insn = OPC_VLE16_V;
>> +            break;
>> +        case MO_32:
>> +            insn = OPC_VLE32_V;
>> +            break;
>> +        case MO_64:
>> +            insn = OPC_VLE64_V;
>> +            break;
>> +        default:
>> +            g_assert_not_reached();
>> +        }
>> +    }
>>      tcg_out_ldst(s, insn, arg, arg1, arg2);
>
> tcg_out_ld/st are called directly from register allocation spill/fill.
> You'll need to set vtype here, and cannot rely on this having been
> done in tcg_out_vec_op.

OK.

> That said, with a little-endian host, the selected element size
> doesn't matter *too* much.  A write of 8 uint16_t or a write of 2
> uint64_t produces the same bits in memory.
>
> Therefore you can examine prev_vtype and adjust only if the vector
> length changes.

OK.

> But we do that -- e.g. load V256, store V256, store V128 to perform
> a 384-bit store for AArch64 SVE when VQ=3.
>
> Is there an advantage to using the vector load/store whole register
> insns, if the requested length is not too small?

For vector types equal to or bigger than the host vlen, we will use the
whole-register instructions.

> IIRC the NF field can be used to store multiples, but we can't store
> half of a register with these.

I think we can still use the unit-stride instructions for them.

Thanks,
Zhiwei

>
>
> r~
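[A possible shape for the spill/fill handling agreed on above, as a rough sketch only: cache the last emitted vtype and re-emit a vsetvli only when the vector length changes, using a fixed SEW since the element size does not affect the memory image on a little-endian host. prev_vtype, encode_vtype and tcg_out_vsetvli are illustrative names, not helpers from the series as posted.]

/* Hypothetical spill path following the review above: cache the last
 * emitted vtype and only re-emit vsetvli when the vector length (i.e.
 * the TCGType) changes.  On a little-endian host the element size is
 * immaterial for the memory image, so MO_64 is used unconditionally.
 * encode_vtype(), tcg_out_vsetvli() and prev_vtype are illustrative
 * names, not part of the series as posted. */
static int prev_vtype = -1;

static void tcg_out_spill_vec(TCGContext *s, TCGReg v, TCGReg base,
                              intptr_t ofs, TCGType type)
{
    int vtype = encode_vtype(type, MO_64);

    if (vtype != prev_vtype) {
        tcg_out_vsetvli(s, type, MO_64);   /* set VL for `type`, SEW=64 */
        prev_vtype = vtype;
    }
    tcg_out_ldst(s, OPC_VSE64_V, v, base, ofs);
}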
diff --git a/tcg/riscv/tcg-target-con-set.h b/tcg/riscv/tcg-target-con-set.h
index aac5ceee2b..d73a62b0f2 100644
--- a/tcg/riscv/tcg-target-con-set.h
+++ b/tcg/riscv/tcg-target-con-set.h
@@ -21,3 +21,5 @@ C_O1_I2(r, rZ, rZ)
 C_N1_I2(r, r, rM)
 C_O1_I4(r, r, rI, rM, rM)
 C_O2_I4(r, r, rZ, rZ, rM, rM)
+C_O0_I2(v, r)
+C_O1_I1(v, r)
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc
index d17f523187..f17d679d71 100644
--- a/tcg/riscv/tcg-target.c.inc
+++ b/tcg/riscv/tcg-target.c.inc
@@ -279,6 +279,15 @@ typedef enum {
     OPC_VSETVLI = 0x57 | V_OPCFG,
     OPC_VSETIVLI = 0xc0000057 | V_OPCFG,
     OPC_VSETVL = 0x80000057 | V_OPCFG,
+
+    OPC_VLE8_V = 0x7 | V_LUMOP,
+    OPC_VLE16_V = 0x5007 | V_LUMOP,
+    OPC_VLE32_V = 0x6007 | V_LUMOP,
+    OPC_VLE64_V = 0x7007 | V_LUMOP,
+    OPC_VSE8_V = 0x27 | V_SUMOP,
+    OPC_VSE16_V = 0x5027 | V_SUMOP,
+    OPC_VSE32_V = 0x6027 | V_SUMOP,
+    OPC_VSE64_V = 0x7027 | V_SUMOP,
 } RISCVInsn;
 
 /*
@@ -810,6 +819,13 @@ static void tcg_out_ldst(TCGContext *s, RISCVInsn opc, TCGReg data,
     case OPC_SD:
         tcg_out_opc_store(s, opc, addr, data, imm12);
         break;
+    case OPC_VSE8_V:
+    case OPC_VSE16_V:
+    case OPC_VSE32_V:
+    case OPC_VSE64_V:
+        tcg_out_opc_imm(s, OPC_ADDI, TCG_REG_TMP0, addr, imm12);
+        tcg_out_opc_ldst_vec(s, opc, data, TCG_REG_TMP0, true);
+        break;
     case OPC_LB:
     case OPC_LBU:
     case OPC_LH:
@@ -819,6 +835,13 @@ static void tcg_out_ldst(TCGContext *s, RISCVInsn opc, TCGReg data,
     case OPC_LD:
         tcg_out_opc_imm(s, opc, data, addr, imm12);
         break;
+    case OPC_VLE8_V:
+    case OPC_VLE16_V:
+    case OPC_VLE32_V:
+    case OPC_VLE64_V:
+        tcg_out_opc_imm(s, OPC_ADDI, TCG_REG_TMP0, addr, imm12);
+        tcg_out_opc_ldst_vec(s, opc, data, TCG_REG_TMP0, true);
+        break;
     default:
         g_assert_not_reached();
     }
@@ -827,14 +850,59 @@ static void tcg_out_ldst(TCGContext *s, RISCVInsn opc, TCGReg data,
 static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg arg,
                        TCGReg arg1, intptr_t arg2)
 {
-    RISCVInsn insn = type == TCG_TYPE_I32 ? OPC_LW : OPC_LD;
+    RISCVInsn insn;
+
+    if (type < TCG_TYPE_V64) {
+        insn = type == TCG_TYPE_I32 ? OPC_LW : OPC_LD;
+    } else {
+        tcg_debug_assert(arg >= TCG_REG_V1);
+        switch (prev_vece) {
+        case MO_8:
+            insn = OPC_VLE8_V;
+            break;
+        case MO_16:
+            insn = OPC_VLE16_V;
+            break;
+        case MO_32:
+            insn = OPC_VLE32_V;
+            break;
+        case MO_64:
+            insn = OPC_VLE64_V;
+            break;
+        default:
+            g_assert_not_reached();
+        }
+    }
     tcg_out_ldst(s, insn, arg, arg1, arg2);
 }
 
 static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                        TCGReg arg1, intptr_t arg2)
 {
-    RISCVInsn insn = type == TCG_TYPE_I32 ? OPC_SW : OPC_SD;
+    RISCVInsn insn;
+
+    if (type < TCG_TYPE_V64) {
+        insn = type == TCG_TYPE_I32 ? OPC_SW : OPC_SD;
+        tcg_out_ldst(s, insn, arg, arg1, arg2);
+    } else {
+        tcg_debug_assert(arg >= TCG_REG_V1);
+        switch (prev_vece) {
+        case MO_8:
+            insn = OPC_VSE8_V;
+            break;
+        case MO_16:
+            insn = OPC_VSE16_V;
+            break;
+        case MO_32:
+            insn = OPC_VSE32_V;
+            break;
+        case MO_64:
+            insn = OPC_VSE64_V;
+            break;
+        default:
+            g_assert_not_reached();
+        }
+    }
     tcg_out_ldst(s, insn, arg, arg1, arg2);
 }
 
@@ -2030,11 +2098,25 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
                            const int const_args[TCG_MAX_OP_ARGS])
 {
     TCGType type = vecl + TCG_TYPE_V64;
+    TCGArg a0, a1, a2;
+
+    a0 = args[0];
+    a1 = args[1];
+    a2 = args[2];
 
-    if (vec_vtpye_init) {
+    if (!vec_vtpye_init &&
+        (opc == INDEX_op_ld_vec || opc == INDEX_op_st_vec)) {
+        tcg_target_set_vec_config(s, type, prev_vece);
+    } else {
         tcg_target_set_vec_config(s, type, vece);
     }
 
     switch (opc) {
+    case INDEX_op_ld_vec:
+        tcg_out_ld(s, type, a0, a1, a2);
+        break;
+    case INDEX_op_st_vec:
+        tcg_out_st(s, type, a0, a1, a2);
+        break;
     case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
     case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
     default:
@@ -2198,6 +2280,10 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
     case INDEX_op_qemu_st_a64_i64:
         return C_O0_I2(rZ, r);
 
+    case INDEX_op_st_vec:
+        return C_O0_I2(v, r);
+    case INDEX_op_ld_vec:
+        return C_O1_I1(v, r);
     default:
         g_assert_not_reached();
     }
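[For reference, the whole-register alternative discussed in the review might slot into opcode selection as sketched below: a whole-register store when the TCG vector type covers at least one full host vector register, the unit-stride vse forms from this patch otherwise. riscv_vlenb (host VLEN in bytes) and OPC_VS1R_V (the RVV vs1r.v whole-register store) are assumed names that do not appear in this patch.]

/* Hypothetical opcode selection for vector stores, per the discussion:
 * whole-register vs1r.v when the type spans at least a full register,
 * otherwise the unit-stride vse<eew>.v forms added by this patch.
 * riscv_vlenb and OPC_VS1R_V are assumptions, not series code. */
static RISCVInsn tcg_out_pick_vec_st(TCGType type, MemOp vece)
{
    unsigned bytes = 8u << (type - TCG_TYPE_V64);   /* V64=8, V128=16, ... */

    if (bytes >= riscv_vlenb) {
        return OPC_VS1R_V;      /* vtype-independent whole-register store */
    }
    switch (vece) {
    case MO_8:
        return OPC_VSE8_V;
    case MO_16:
        return OPC_VSE16_V;
    case MO_32:
        return OPC_VSE32_V;
    case MO_64:
        return OPC_VSE64_V;
    default:
        g_assert_not_reached();
    }
}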