@@ -1062,6 +1062,9 @@ struct stringop_info {
MAX_EW is the maximum element width that the caller wants to use and
LENGTH_IN is the length of the stringop in bytes.
+
+   This is currently used for cpymem and setmem.  If expand_vec_cmpmem switches
+   to using it too, then check_vectorise_memory_operation can be removed.
*/
static bool
@@ -1600,41 +1603,75 @@ check_vectorise_memory_operation (rtx length_in, HOST_WIDE_INT &lmul_out)
bool
expand_vec_setmem (rtx dst_in, rtx length_in, rtx fill_value_in)
{
- HOST_WIDE_INT lmul;
+ stringop_info info;
+
/* Check we are able and allowed to vectorise this operation;
bail if not. */
- if (!check_vectorise_memory_operation (length_in, lmul))
+ if (!use_vector_stringop_p (info, 1, length_in))
return false;
- machine_mode vmode
- = riscv_vector::get_vector_mode (QImode, BYTES_PER_RISCV_VECTOR * lmul)
- .require ();
+ /* avl holds the (remaining) length of the required set.
+ cnt holds the length we set with the current store. */
+ rtx cnt = info.avl;
rtx dst_addr = copy_addr_to_reg (XEXP (dst_in, 0));
- rtx dst = change_address (dst_in, vmode, dst_addr);
+ rtx dst = change_address (dst_in, info.vmode, dst_addr);
- rtx fill_value = gen_reg_rtx (vmode);
+ rtx fill_value = gen_reg_rtx (info.vmode);
rtx broadcast_ops[] = { fill_value, fill_value_in };
- /* If the length is exactly vlmax for the selected mode, do that.
- Otherwise, use a predicated store. */
- if (known_eq (GET_MODE_SIZE (vmode), INTVAL (length_in)))
+ rtx label = NULL_RTX;
+ rtx mask = NULL_RTX;
+
+  /* If we don't need a loop and the length is exactly vlmax for the selected
+     mode, do a broadcast and store; otherwise use a predicated store.  */
+ if (!info.need_loop
+ && known_eq (GET_MODE_SIZE (info.vmode), INTVAL (length_in)))
{
- emit_vlmax_insn (code_for_pred_broadcast (vmode), UNARY_OP,
- broadcast_ops);
+ emit_vlmax_insn (code_for_pred_broadcast (info.vmode), UNARY_OP,
+ broadcast_ops);
emit_move_insn (dst, fill_value);
+ return true;
}
- else
+
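+  /* The predicated store below uses an all-ones mask.  If the length cannot
+     be encoded as a small immediate, keep it in a register.  */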
+ machine_mode mask_mode
+ = riscv_vector::get_vector_mode (BImode,
+ GET_MODE_NUNITS (info.vmode)).require ();
+ mask = CONSTM1_RTX (mask_mode);
+ if (!satisfies_constraint_K (cnt))
+ cnt = force_reg (Pmode, cnt);
+
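+  /* For the loop case, keep the remaining length in a register and use
+     vsetvl to compute the number of bytes the first iteration will set, so
+     the broadcast of the fill value only has to be emitted once.  */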
+ if (info.need_loop)
{
- if (!satisfies_constraint_K (length_in))
- length_in = force_reg (Pmode, length_in);
- emit_nonvlmax_insn (code_for_pred_broadcast (vmode), UNARY_OP,
- broadcast_ops, length_in);
- machine_mode mask_mode
- = riscv_vector::get_vector_mode (BImode, GET_MODE_NUNITS (vmode))
- .require ();
- rtx mask = CONSTM1_RTX (mask_mode);
- emit_insn (gen_pred_store (vmode, dst, mask, fill_value, length_in,
- get_avl_type_rtx (riscv_vector::NONVLMAX)));
+ info.avl = copy_to_mode_reg (Pmode, info.avl);
+ cnt = gen_reg_rtx (Pmode);
+ emit_insn (riscv_vector::gen_no_side_effects_vsetvl_rtx (info.vmode, cnt,
+ info.avl));
+ }
+
+ emit_nonvlmax_insn (code_for_pred_broadcast (info.vmode),
+ riscv_vector::UNARY_OP, broadcast_ops, cnt);
+
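+  /* For the loop case, this is the top of the loop: recompute the number of
+     bytes this iteration will set with vsetvl.  */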
+ if (info.need_loop)
+ {
+ label = gen_label_rtx ();
+
+ emit_label (label);
+ emit_insn (riscv_vector::gen_no_side_effects_vsetvl_rtx (info.vmode, cnt,
+ info.avl));
+ }
+
+ emit_insn (gen_pred_store (info.vmode, dst, mask, fill_value, cnt,
+ get_avl_type_rtx (riscv_vector::NONVLMAX)));
+
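+  /* For the loop case, step the destination pointer past the bytes just set,
+     reduce the remaining length and branch back while bytes remain.  */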
+ if (info.need_loop)
+ {
+ emit_insn (gen_rtx_SET (dst_addr, gen_rtx_PLUS (Pmode, dst_addr, cnt)));
+ emit_insn (gen_rtx_SET (info.avl, gen_rtx_MINUS (Pmode, info.avl, cnt)));
+
+ /* Emit the loop condition. */
+ rtx test = gen_rtx_NE (VOIDmode, info.avl, const0_rtx);
+ emit_jump_insn (gen_cbranch4 (Pmode, test, info.avl, const0_rtx, label));
+ emit_insn (gen_nop ());
}
return true;
@@ -1,6 +1,6 @@
/* { dg-do compile } */
/* { dg-add-options riscv_v } */
-/* { dg-additional-options "-O3 -mrvv-max-lmul=dynamic" } */
+/* { dg-additional-options "-O1 -mrvv-max-lmul=dynamic" } */
/* { dg-final { check-function-bodies "**" "" } } */
#define MIN_VECTOR_BYTES (__riscv_v_min_vlen / 8)
@@ -91,13 +91,42 @@ f6 (void *a, int const b)
return __builtin_memset (a, b, MIN_VECTOR_BYTES * 8);
}
-/* Don't vectorise if the move is too large for one operation.
+/* Vectorise with a loop for larger lengths.
** f7:
-** li\s+a2,\d+
-** tail\s+memset
+** mv\s+[ta][0-7],a0
+** li\s+[ta][0-7],129
+** vsetvli\s+zero,[ta][0-7],e8,m8,ta,ma
+** vmv\.v\.x\s+v8,a1
+XX \.L\d+:
+** vsetvli\s+[ta][0-7],[ta][0-7],e8,m8,ta,ma
+** vse8\.v\s+v8,0\(a[0-9]\)
+** add\s+[ta][0-7],[ta][0-7],[ta][0-7]
+** sub\s+[ta][0-7],[ta][0-7],[ta][0-7]
+** bne\s+[ta][0-7],zero,\.L\d+
+** ret
*/
void *
f7 (void *a, int const b)
{
return __builtin_memset (a, b, MIN_VECTOR_BYTES * 8 + 1);
}
+
+/* Vectorise with a loop for an unknown length.
+** f8:
+** mv\s+[ta][0-7],a0
+** mv\s+[ta][0-7],a2
+** vsetvli\s+zero,[ta][0-7],e8,m8,ta,ma
+** vmv\.v\.x\s+v8,a1
+XX \.L\d+:
+** vsetvli\s+[ta][0-7],[ta][0-7],e8,m8,ta,ma
+** vse8\.v\s+v8,0\(a[0-9]\)
+** add\s+[ta][0-7],[ta][0-7],[ta][0-7]
+** sub\s+[ta][0-7],[ta][0-7],[ta][0-7]
+** bne\s+[ta][0-7],zero,\.L\d+
+** ret
+*/
+void *
+f8 (void *a, int const b, int n)
+{
+ return __builtin_memset (a, b, n);
+}
@@ -1,6 +1,6 @@
/* { dg-do compile } */
/* { dg-add-options riscv_v } */
-/* { dg-additional-options "-O3 -mrvv-max-lmul=m1" } */
+/* { dg-additional-options "-O1 -mrvv-max-lmul=m1" } */
/* { dg-final { check-function-bodies "**" "" } } */
#define MIN_VECTOR_BYTES (__riscv_v_min_vlen / 8)
@@ -39,13 +39,42 @@ f2 (void *a, int const b)
return __builtin_memset (a, b, MIN_VECTOR_BYTES);
}
-/* Don't vectorise if the move is too large for requested lmul.
+/* Vectorise with a loop for larger lengths.
** f3:
-** li\s+a2,\d+
-** tail\s+memset
+** mv\s+[ta][0-7],a0
+** li\s+[ta][0-7],17
+** vsetvli\s+zero,[ta][0-7],e8,m1,ta,ma
+** vmv\.v\.x\s+v1,a1
+XX \.L\d+:
+** vsetvli\s+[ta][0-7],[ta][0-7],e8,m1,ta,ma
+** vse8\.v\s+v1,0\(a[0-9]\)
+** add\s+[ta][0-7],[ta][0-7],[ta][0-7]
+** sub\s+[ta][0-7],[ta][0-7],[ta][0-7]
+** bne\s+[ta][0-7],zero,\.L\d+
+** ret
*/
void *
f3 (void *a, int const b)
{
return __builtin_memset (a, b, MIN_VECTOR_BYTES + 1);
}
+
+/* Vectorise with a loop for an unknown length.
+** f4:
+** mv\s+[ta][0-7],a0
+** mv\s+[ta][0-7],a2
+** vsetvli\s+zero,[ta][0-7],e8,m1,ta,ma
+** vmv\.v\.x\s+v1,a1
+XX \.L\d+:
+** vsetvli\s+[ta][0-7],[ta][0-7],e8,m1,ta,ma
+** vse8\.v\s+v1,0\(a[0-9]\)
+** add\s+[ta][0-7],[ta][0-7],[ta][0-7]
+** sub\s+[ta][0-7],[ta][0-7],[ta][0-7]
+** bne\s+[ta][0-7],zero,\.L\d+
+** ret
+*/
+void *
+f4 (void *a, int const b, int n)
+{
+ return __builtin_memset (a, b, n);
+}
@@ -1,6 +1,6 @@
/* { dg-do compile } */
/* { dg-add-options riscv_v } */
-/* { dg-additional-options "-O3 -mrvv-max-lmul=m8" } */
+/* { dg-additional-options "-O1 -mrvv-max-lmul=m8" } */
/* { dg-final { check-function-bodies "**" "" } } */
#define MIN_VECTOR_BYTES (__riscv_v_min_vlen / 8)
@@ -21,13 +21,13 @@ f1 (void *a, int const b)
return __builtin_memset (a, b, MIN_VECTOR_BYTES - 1);
}
-/* Vectorise+inline minimum vector register width using requested lmul.
+/* Vectorised code should use the smallest lmul known to fit the length.
** f2:
** (
-** vsetivli\s+zero,\d+,e8,m8,ta,ma
+** vsetivli\s+zero,\d+,e8,m1,ta,ma
** |
** li\s+a\d+,\d+
-** vsetvli\s+zero,a\d+,e8,m8,ta,ma
+** vsetvli\s+zero,a\d+,e8,m1,ta,ma
** )
** vmv\.v\.x\s+v\d+,a1
** vse8\.v\s+v\d+,0\(a0\)
@@ -57,13 +57,40 @@ f3 (void *a, int const b)
return __builtin_memset (a, b, MIN_VECTOR_BYTES * 8);
}
-/* Don't vectorise if the move is too large for requested lmul.
+/* Vectorise with a loop for larger lengths.
** f4:
-** li\s+a2,\d+
-** tail\s+memset
+** mv\s+[ta][0-7],a0
+** li\s+[ta][0-7],129
+** vsetvli\s+zero,[ta][0-7],e8,m8,ta,ma
+** vmv\.v\.x\s+v8,a1
+** vsetvli\s+[ta][0-7],[ta][0-7],e8,m8,ta,ma
+** vse8\.v\s+v8,0\(a[0-9]\)
+** add\s+[ta][0-7],[ta][0-7],[ta][0-7]
+** sub\s+[ta][0-7],[ta][0-7],[ta][0-7]
+** bne\s+[ta][0-7],zero,\.L\d+
+** ret
*/
void *
f4 (void *a, int const b)
{
return __builtin_memset (a, b, MIN_VECTOR_BYTES * 8 + 1);
}
+
+/* Vectorise with a loop for an unknown length.
+** f5:
+** mv\s+[ta][0-7],a0
+** mv\s+[ta][0-7],a2
+** vsetvli\s+zero,[ta][0-7],e8,m8,ta,ma
+** vmv\.v\.x\s+v8,a1
+** vsetvli\s+[ta][0-7],[ta][0-7],e8,m8,ta,ma
+** vse8\.v\s+v8,0\(a[0-9]\)
+** add\s+[ta][0-7],[ta][0-7],[ta][0-7]
+** sub\s+[ta][0-7],[ta][0-7],[ta][0-7]
+** bne\s+[ta][0-7],zero,\.L\d+
+** ret
+*/
+void *
+f5 (void *a, int const b, int n)
+{
+ return __builtin_memset (a, b, n);
+}