@@ -708,6 +708,7 @@ bool can_be_broadcasted_p (rtx);
bool gather_scatter_valid_offset_p (machine_mode);
HOST_WIDE_INT estimated_poly_value (poly_int64, unsigned int);
bool whole_reg_to_reg_move_p (rtx *, machine_mode, int);
+bool splat_to_scalar_move_p (rtx *);
}
/* We classify builtin types into two classes:
@@ -5151,4 +5151,16 @@ whole_reg_to_reg_move_p (rtx *ops, machine_mode mode, int avl_type_index)
return false;
}
+/* Return true if we can transform vmv.v.x/vfmv.v.f to vmv.s.x/vfmv.s.f. */
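+/* OPS is assumed to follow the broadcast expander's operand layout:
+   ops[1] mask, ops[2] merge, ops[3] scalar source, ops[4] AVL and
+   ops[7] the avl_type operand.  */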
+bool
+splat_to_scalar_move_p (rtx *ops)
+{
+  /* Only an unmasked (all-ones mask) broadcast with an undefined merge
+     operand, a non-memory scalar source, AVL == 1, a non-VLMAX avl_type
+     and a scalar element no wider than Pmode can be rewritten.  */
+  return satisfies_constraint_Wc1 (ops[1])
+	 && satisfies_constraint_vu (ops[2])
+	 && !MEM_P (ops[3])
+	 && satisfies_constraint_c01 (ops[4])
+	 && INTVAL (ops[7]) == NONVLMAX
+	 && known_ge (GET_MODE_SIZE (Pmode), GET_MODE_SIZE (GET_MODE (ops[3])));
+}
+
} // namespace riscv_vector
@@ -1977,8 +1977,15 @@
(match_operand:V_VLS 2 "vector_merge_operand")))]
"TARGET_VECTOR"
{
+  /* Transform vmv.v.x/vfmv.v.f with AVL = 1 into vmv.s.x/vfmv.s.f, since the
+     scalar-move forms give the vsetvl pass better chances to fuse vsetvl
+     instructions.  */
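+  /* For example (illustrative only), an AVL 1 splat feeding an AVL 4 block:
+	 vsetivli zero,1,e32,m1,ta,ma
+	 vmv.v.x  v1,a0
+	 vsetivli zero,4,e32,m1,ta,ma
+	 ...
+     becomes
+	 vsetivli zero,4,e32,m1,ta,ma
+	 vmv.s.x  v1,a0
+	 ...
+     so the standalone AVL 1 vsetvl can be elided.  */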
+ if (riscv_vector::splat_to_scalar_move_p (operands))
+ {
+ operands[1] = riscv_vector::gen_scalar_move_mask (<VM>mode);
+ operands[3] = force_reg (<VEL>mode, operands[3]);
+ }
/* Handle vmv.s.x instruction (Wb1 mask) which has memory scalar. */
- if (satisfies_constraint_Wdm (operands[3]))
+ else if (satisfies_constraint_Wdm (operands[3]))
{
if (satisfies_constraint_Wb1 (operands[1]))
{
new file mode 100644
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
+
+#include "riscv_vector.h"
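+
+/* The AVL 1 vmv.v.x/vfmv.v.f below should be emitted as vmv.s.x/vfmv.s.f,
+   so no separate AVL 1 vsetivli is needed (see the scan-assembler checks
+   at the end of this file).  */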
+
+void
+foo (uint32_t *outputMat, uint32_t *inputMat)
+{
+ vuint32m1_t matRegIn0 = __riscv_vle32_v_u32m1 (inputMat, 4);
+ vuint32m1_t matRegIn1 = __riscv_vle32_v_u32m1 (inputMat + 4, 4);
+ vuint32m1_t matRegIn2 = __riscv_vle32_v_u32m1 (inputMat + 8, 4);
+ vuint32m1_t matRegIn3 = __riscv_vle32_v_u32m1 (inputMat + 12, 4);
+
+ vbool32_t oddMask
+ = __riscv_vreinterpret_v_u32m1_b32 (__riscv_vmv_v_x_u32m1 (0xaaaa, 1));
+
+ vuint32m1_t smallTransposeMat0
+ = __riscv_vslideup_vx_u32m1_tumu (oddMask, matRegIn0, matRegIn1, 1, 4);
+ vuint32m1_t smallTransposeMat2
+ = __riscv_vslideup_vx_u32m1_tumu (oddMask, matRegIn2, matRegIn3, 1, 4);
+
+ vuint32m1_t outMat0 = __riscv_vslideup_vx_u32m1_tu (smallTransposeMat0,
+ smallTransposeMat2, 2, 4);
+
+ __riscv_vse32_v_u32m1 (outputMat, outMat0, 4);
+}
+
+void
+foo2 (void *outputMat, void *inputMat)
+{
+ vfloat32m1_t v = __riscv_vfmv_v_f_f32m1 (0xaaaa, 1);
+ __riscv_vse32_v_f32m1 (outputMat, v, 4);
+}
+
+/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*4,\s*e32,\s*m1,\s*t[au],\s*m[au]} 2 } } */
+/* { dg-final { scan-assembler-times {vsetivli} 2 } } */
+/* { dg-final { scan-assembler-not {vsetvli} } } */
new file mode 100644
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -fno-schedule-insns -fno-schedule-insns2" } */
+
+#include "riscv_vector.h"
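+
+/* Both AVL 1 vmv.v.x mask setups below should be emitted as vmv.s.x, so no
+   extra AVL 1 vsetivli is generated (see the scan-assembler checks at the
+   end of this file).  */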
+
+void matrix_transpose_in_register(uint32_t* outputMat, uint32_t* inputMat) {
+ vuint32m1_t matRegIn0 = __riscv_vle32_v_u32m1(inputMat, 4);
+ vuint32m1_t matRegIn1 = __riscv_vle32_v_u32m1(inputMat + 4, 4);
+ vuint32m1_t matRegIn2 = __riscv_vle32_v_u32m1(inputMat + 8, 4);
+ vuint32m1_t matRegIn3 = __riscv_vle32_v_u32m1(inputMat + 12, 4);
+
+ vbool32_t oddMask = __riscv_vreinterpret_v_u32m1_b32(__riscv_vmv_v_x_u32m1(0xaaaa, 1));
+
+ vuint32m1_t smallTransposeMat0 = __riscv_vslideup_vx_u32m1_tumu(oddMask, matRegIn0, matRegIn1, 1, 4);
+ vuint32m1_t smallTransposeMat2 = __riscv_vslideup_vx_u32m1_tumu(oddMask, matRegIn2, matRegIn3, 1, 4);
+
+ vbool32_t evenMask = __riscv_vreinterpret_v_u32m1_b32(__riscv_vmv_v_x_u32m1(0x5555, 1));
+
+ vuint32m1_t smallTransposeMat1 = __riscv_vslidedown_vx_u32m1_tumu(evenMask, matRegIn1, matRegIn0, 1, 4);
+ vuint32m1_t smallTransposeMat3 = __riscv_vslidedown_vx_u32m1_tumu(evenMask, matRegIn3, matRegIn2, 1, 4);
+
+ vuint32m1_t outMat0 = __riscv_vslideup_vx_u32m1_tu(smallTransposeMat0, smallTransposeMat2, 2, 4);
+ vuint32m1_t outMat1 = __riscv_vslideup_vx_u32m1_tu(smallTransposeMat1, smallTransposeMat3, 2, 4);
+
+ vuint32m1_t outMat2 = __riscv_vslidedown_vx_u32m1_tu(smallTransposeMat2, smallTransposeMat0, 2, 2);
+ vuint32m1_t outMat3 = __riscv_vslidedown_vx_u32m1_tu(smallTransposeMat3, smallTransposeMat1, 2, 2);
+ __riscv_vse32_v_u32m1(outputMat, outMat0, 4);
+ __riscv_vse32_v_u32m1(outputMat + 4, outMat1, 4);
+ __riscv_vse32_v_u32m1(outputMat + 8, outMat2, 4);
+ __riscv_vse32_v_u32m1(outputMat + 12, outMat3, 4);
+}
+
+/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*4,\s*e32,\s*m1,\s*t[au],\s*m[au]} 2 } } */
+/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*2,\s*e32,\s*m1,\s*t[au],\s*m[au]} 1 } } */
+/* { dg-final { scan-assembler-times {vsetivli} 3 } } */
+/* { dg-final { scan-assembler-not {vsetvli} } } */