@@ -1153,9 +1153,7 @@ expand_block_move (rtx dst_in, rtx src_in, rtx length_in)
Still, by choosing a lower LMUL factor that still allows
an entire transfer, we can reduce register pressure. */
for (unsigned lmul = 1; lmul <= 4; lmul <<= 1)
- if (TARGET_MIN_VLEN * lmul <= nunits * BITS_PER_UNIT
- /* Avoid loosing the option of using vsetivli . */
- && (nunits <= 31 * lmul || nunits > 31 * 8)
+ if (length * BITS_PER_UNIT <= TARGET_MIN_VLEN * lmul
&& multiple_p (BYTES_PER_RISCV_VECTOR * lmul, potential_ew)
&& (riscv_vector::get_vector_mode
(elem_mode, exact_div (BYTES_PER_RISCV_VECTOR * lmul,
@@ -1163,6 +1161,10 @@ expand_block_move (rtx dst_in, rtx src_in, rtx length_in)
break;
}
+ /* Stop searching if a suitable vmode has been found. */
+ if (vmode != VOIDmode)
+ break;
+
/* The RVVM8?I modes are notionally 8 * BYTES_PER_RISCV_VECTOR bytes
wide. BYTES_PER_RISCV_VECTOR can't be evenly divided by
the sizes of larger element types; the LMUL factor of 8 can at
new file mode 100644
@@ -0,0 +1,85 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1 -fno-schedule-insns -fno-schedule-insns2 -mrvv-max-lmul=m8" } */
+/* { dg-add-options riscv_v } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#define MIN_VECTOR_BYTES (__riscv_v_min_vlen / 8)
+
+/* Check that vector memcpy with predicated store uses smaller LMUL where
+   possible.  */
+
+/* m1
+** f1:
+** (
+** vsetivli\s+zero,\d+,e8,m1,ta,ma
+** |
+** li\s+[ta][0-7],\d+
+** vsetvli\s+zero,[ta][0-7],e8,m1,ta,ma
+** )
+** vle8.v\s+v\d+,0\(a1\)
+** vse8.v\s+v\d+,0\(a0\)
+** ret
+*/
+
+void f1 (char *d, char *s)
+{
+ __builtin_memcpy (d, s, MIN_VECTOR_BYTES - 1);
+}
+
+/* m2
+** f2:
+** (
+** vsetivli\s+zero,\d+,e8,m2,ta,ma
+** |
+** li\s+[ta][0-7],\d+
+** vsetvli\s+zero,[ta][0-7],e8,m2,ta,ma
+** )
+** vle8.v\s+v\d+,0\(a1\)
+** vse8.v\s+v\d+,0\(a0\)
+** ret
+*/
+
+void f2 (char *d, char *s)
+{
+ __builtin_memcpy (d, s, 2 * MIN_VECTOR_BYTES - 1);
+}
+
+/* m4
+** f3:
+** (
+** vsetivli\s+zero,\d+,e8,m4,ta,ma
+** |
+** li\s+[ta][0-7],\d+
+** vsetvli\s+zero,[ta][0-7],e8,m4,ta,ma
+** )
+** vle8.v\s+v\d+,0\(a1\)
+** vse8.v\s+v\d+,0\(a0\)
+** ret
+*/
+
+void f3 (char *d, char *s)
+{
+ __builtin_memcpy (d, s, 4 * MIN_VECTOR_BYTES - 1);
+}
+
+/* m8
+** f4:
+** (
+** vsetivli\s+zero,\d+,e8,m8,ta,ma
+** |
+** li\s+[ta][0-7],\d+
+** vsetvli\s+zero,[ta][0-7],e8,m8,ta,ma
+** |
+** li\s+[ta][0-7],\d+
+** addi\s+[ta][0-7],[ta][0-7],-?\d+
+** vsetvli\s+zero,[ta][0-7],e8,m8,ta,ma
+** )
+** vle8.v\s+v\d+,0\(a1\)
+** vse8.v\s+v\d+,0\(a0\)
+** ret
+*/
+
+void f4 (char *d, char *s)
+{
+ __builtin_memcpy (d, s, 8 * MIN_VECTOR_BYTES - 1);
+}
@@ -54,5 +54,5 @@ int main() {
/* { dg-final { scan-assembler-times {vsetvli} 2 { target { no-opts "-O0" no-opts "-Os" no-opts "-Oz" no-opts "-funroll-loops" no-opts "-g" } } } } */
/* { dg-final { scan-assembler-not {vsetivli} } } */
-/* { dg-final { scan-assembler-times {vsetvli\tzero,\s*[a-x0-9]+,\s*e8,\s*m8,\s*t[au],\s*m[au]} 2 { target { no-opts "-O0" no-opts "-Os" no-opts "-Oz" no-opts "-funroll-loops" no-opts "-g" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli\tzero,\s*[a-x0-9]+,\s*e8,\s*m2,\s*t[au],\s*m[au]} 2 { target { no-opts "-O0" no-opts "-Os" no-opts "-Oz" no-opts "-funroll-loops" no-opts "-g" } } } } */
/* { dg-final { scan-assembler-times {li\t[a-x0-9]+,\s*32} 2 { target { no-opts "-O0" no-opts "-Os" no-opts "-Oz" no-opts "-funroll-loops" no-opts "-g" } } } } */
@@ -64,5 +64,5 @@ int main() {
/* { dg-final { scan-assembler-times {vsetvli} 4 } } */
/* { dg-final { scan-assembler-not {vsetivli} } } */
-/* { dg-final { scan-assembler-times {vsetvli\tzero,\s*[a-x0-9]+,\s*e8,\s*m8,\s*t[au],\s*m[au]} 1 } } */
+/* { dg-final { scan-assembler-times {vsetvli\tzero,\s*[a-x0-9]+,\s*e8,\s*m2,\s*t[au],\s*m[au]} 1 } } */
/* { dg-final { scan-assembler-times {li\t[a-x0-9]+,\s*32} 1 } } */