@@ -624,6 +624,7 @@ enum mask_policy
enum tail_policy get_prefer_tail_policy ();
enum mask_policy get_prefer_mask_policy ();
rtx get_avl_type_rtx (enum avl_type);
+opt_machine_mode get_lmul_mode (scalar_mode, int);
opt_machine_mode get_vector_mode (scalar_mode, poly_uint64);
opt_machine_mode get_tuple_mode (machine_mode, unsigned int);
bool simm5_p (rtx);
@@ -672,7 +673,7 @@ bool slide1_sew64_helper (int, machine_mode, machine_mode,
machine_mode, rtx *);
rtx gen_avl_for_scalar_move (rtx);
void expand_tuple_move (rtx *);
-bool expand_block_move (rtx, rtx, rtx);
+bool expand_block_move (rtx, rtx, rtx, bool);
machine_mode preferred_simd_mode (scalar_mode);
machine_mode get_mask_mode (machine_mode);
void expand_vec_series (rtx, rtx, rtx, rtx = 0);
@@ -966,7 +966,7 @@ riscv_expand_block_move_scalar (rtx dest, rtx src, rtx length)
/* This function delegates block-move expansion to either the vector
implementation or the scalar one. Return TRUE if successful or FALSE
- otherwise. */
+ otherwise. Assume that the memory regions do not overlap. */
bool
riscv_expand_block_move (rtx dest, rtx src, rtx length)
@@ -974,7 +974,7 @@ riscv_expand_block_move (rtx dest, rtx src, rtx length)
if ((TARGET_VECTOR && !TARGET_XTHEADVECTOR)
&& stringop_strategy & STRATEGY_VECTOR)
{
- bool ok = riscv_vector::expand_block_move (dest, src, length);
+ bool ok = riscv_vector::expand_block_move (dest, src, length, false);
if (ok)
return true;
}
@@ -1054,7 +1054,7 @@ namespace riscv_vector {
/* Used by cpymemsi in riscv.md . */
bool
-expand_block_move (rtx dst_in, rtx src_in, rtx length_in)
+expand_block_move (rtx dst_in, rtx src_in, rtx length_in, bool movmem_p)
{
/*
memcpy:
@@ -1085,10 +1085,9 @@ expand_block_move (rtx dst_in, rtx src_in, rtx length_in)
{
HOST_WIDE_INT length = INTVAL (length_in);
- /* By using LMUL=8, we can copy as many bytes in one go as there
- are bits in a vector register. If the entire block thus fits,
- we don't need a loop. */
- if (length <= TARGET_MIN_VLEN)
+      /* If the VLEN and preferred LMUL allow the entire block to be copied
+	 in one go, then no loop is needed.  */
+ if (known_le (length, BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL))
{
need_loop = false;
@@ -1114,19 +1113,32 @@ expand_block_move (rtx dst_in, rtx src_in, rtx length_in)
for small element widths, we might allow larger element widths for
loops too. */
if (need_loop)
- potential_ew = 1;
+ {
+	if (movmem_p)
+	  /* Inlining a general memmove is a pessimisation: we can't avoid
+	     having to decide at runtime which direction to copy in, which
+	     is costly in instruction count.  However, when the entire move
+	     fits in a single vector operation we can do all the reads
+	     before any of the writes, so overlap is not a concern; only
+	     generate the inline vector code in that case.  */
+	  return false;
+ potential_ew = 1;
+ }
for (; potential_ew; potential_ew >>= 1)
{
scalar_int_mode elem_mode;
unsigned HOST_WIDE_INT bits = potential_ew * BITS_PER_UNIT;
- unsigned HOST_WIDE_INT per_iter;
- HOST_WIDE_INT nunits;
+ poly_uint64 per_iter;
+ poly_int64 nunits;
if (need_loop)
- per_iter = TARGET_MIN_VLEN;
+ per_iter = BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL;
else
per_iter = length;
- nunits = per_iter / potential_ew;
+      /* BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL may not be evenly divisible
+	 by this potential_ew; if not, try the next smaller element width.  */
+ if (!multiple_p (per_iter, potential_ew, &nunits))
+ continue;
/* Unless we get an implementation that's slow for small element
size / non-word-aligned accesses, we assume that the hardware
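
For illustration only (not part of the patch): the movmem_p early-out above
means a memmove is expanded inline only when the whole move fits in a single
vector operation, since that expansion performs all of its loads before any
store and so is safe for overlapping buffers without a runtime direction
check.  A hedged C sketch in the style of the memmove test updated later in
this patch, where MIN_VECTOR_BYTES is assumed to be the byte width of one
vector register:

    char *
    move_one_vector (char *a, char const *b)
    {
      /* Fits in one vector operation: every byte is read before any byte
	 is written, so a possible overlap needs no direction check and the
	 move can be expanded inline as one vector load/store pair.  */
      return __builtin_memmove (a, b, MIN_VECTOR_BYTES);
    }
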
@@ -1137,6 +1149,8 @@ expand_block_move (rtx dst_in, rtx src_in, rtx length_in)
if (length % potential_ew != 0
|| !int_mode_for_size (bits, 0).exists (&elem_mode))
continue;
+
+ poly_uint64 mode_units;
/* Find the mode to use for the copy inside the loop - or the
sole copy, if there is no loop. */
if (!need_loop)
@@ -1152,12 +1166,12 @@ expand_block_move (rtx dst_in, rtx src_in, rtx length_in)
pointless.
Still, by choosing a lower LMUL factor that still allows
an entire transfer, we can reduce register pressure. */
- for (unsigned lmul = 1; lmul <= 4; lmul <<= 1)
- if (length * BITS_PER_UNIT <= TARGET_MIN_VLEN * lmul
- && multiple_p (BYTES_PER_RISCV_VECTOR * lmul, potential_ew)
+ for (unsigned lmul = 1; lmul < TARGET_MAX_LMUL; lmul <<= 1)
+ if (known_le (length * BITS_PER_UNIT, TARGET_MIN_VLEN * lmul)
+ && multiple_p (BYTES_PER_RISCV_VECTOR * lmul, potential_ew,
+ &mode_units)
&& (riscv_vector::get_vector_mode
- (elem_mode, exact_div (BYTES_PER_RISCV_VECTOR * lmul,
- potential_ew)).exists (&vmode)))
+ (elem_mode, mode_units).exists (&vmode)))
break;
}
@@ -1165,15 +1179,12 @@ expand_block_move (rtx dst_in, rtx src_in, rtx length_in)
if (vmode != VOIDmode)
break;
- /* The RVVM8?I modes are notionally 8 * BYTES_PER_RISCV_VECTOR bytes
- wide. BYTES_PER_RISCV_VECTOR can't be evenly divided by
- the sizes of larger element types; the LMUL factor of 8 can at
- the moment be divided by the SEW, with SEW of up to 8 bytes,
- but there are reserved encodings so there might be larger
- SEW in the future. */
- if (riscv_vector::get_vector_mode
- (elem_mode, exact_div (BYTES_PER_RISCV_VECTOR * 8,
- potential_ew)).exists (&vmode))
+      /* BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL is always divisible by a
+	 potential_ew of 1, so this is guaranteed to succeed eventually.  */
+ if (multiple_p (BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL,
+ potential_ew, &mode_units)
+ && riscv_vector::get_vector_mode (elem_mode,
+ mode_units).exists (&vmode))
break;
/* We may get here if we tried an element size that's larger than
@@ -1186,7 +1197,7 @@ expand_block_move (rtx dst_in, rtx src_in, rtx length_in)
}
else
{
- vmode = E_RVVM8QImode;
+      vmode = get_lmul_mode (QImode, TARGET_MAX_LMUL).require ();
}
/* A memcpy libcall in the worst case takes 3 instructions to prepare the
@@ -1890,6 +1890,18 @@ get_mask_mode (machine_mode mode)
return get_vector_mode (BImode, nunits).require ();
}
+/* Return the vector mode of MODE elements whose size corresponds to LMUL
+   vector registers, if such a mode exists.  */
+
+opt_machine_mode
+get_lmul_mode (scalar_mode mode, int lmul)
+{
+ poly_uint64 lmul_nunits;
+ unsigned int bytes = GET_MODE_SIZE (mode);
+ if (multiple_p (BYTES_PER_RISCV_VECTOR * lmul, bytes, &lmul_nunits))
+ return get_vector_mode (mode, lmul_nunits);
+ return E_VOIDmode;
+}
+
/* Return the appropriate M1 mode for MODE. */
static opt_machine_mode
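
A hedged illustration of get_lmul_mode, assuming a minimum VLEN of 128 bits
so that BYTES_PER_RISCV_VECTOR covers 16 bytes per vector chunk (the mode
names are the standard RVV machine modes for those element/LMUL pairs):

    machine_mode vmode;
    if (get_lmul_mode (QImode, 8).exists (&vmode))
      /* bytes = 1, so lmul_nunits = 16 * 8 / 1 = 128 -> RVVM8QImode.  */
      gcc_assert (vmode == RVVM8QImode);
    if (get_lmul_mode (DImode, 8).exists (&vmode))
      /* bytes = 8, so lmul_nunits = 16 * 8 / 8 = 16 -> RVVM8DImode.  */
      gcc_assert (vmode == RVVM8DImode);
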
@@ -2745,12 +2745,6 @@
FAIL;
})
-;; Inlining general memmove is a pessimisation: we can't avoid having to decide
-;; which direction to go at runtime, which is costly in instruction count
-;; however for situations where the entire move fits in one vector operation
-;; we can do all reads before doing any writes so we don't have to worry
-;; so generate the inline vector code in such situations
-;; nb. prefer scalar path for tiny memmoves.
(define_expand "movmem<mode>"
[(parallel [(set (match_operand:BLK 0 "general_operand")
(match_operand:BLK 1 "general_operand"))
@@ -2758,10 +2752,8 @@
(use (match_operand:SI 3 "const_int_operand"))])]
"TARGET_VECTOR"
{
- if ((INTVAL (operands[2]) >= TARGET_MIN_VLEN / 8)
- && (INTVAL (operands[2]) <= TARGET_MIN_VLEN)
- && riscv_vector::expand_block_move (operands[0], operands[1],
- operands[2]))
+ if (riscv_vector::expand_block_move (operands[0], operands[1], operands[2],
+ true))
DONE;
else
FAIL;
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -mrvv-max-lmul=m8" } */
signed char e;
short f = 8;
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -frename-registers" } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -frename-registers -mrvv-max-lmul=m8" } */
signed char e;
short f = 8;
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-march=rv64gcv_zvl4096b -mrvv-vector-bits=scalable -mabi=lp64d -O3 -fno-schedule-insns2" } */
+/* { dg-options "-march=rv64gcv_zvl4096b -mrvv-vector-bits=scalable -mabi=lp64d -O3 -fno-schedule-insns2 -mrvv-max-lmul=m8" } */
#include "def.h"
@@ -143,10 +143,6 @@ DEF_RET1_ARG9 (v1024qi)
DEF_RET1_ARG9 (v2048qi)
DEF_RET1_ARG9 (v4096qi)
-// RET1_ARG0 tests
-/* { dg-final { scan-assembler-times {li\s+a[0-1],\s*0} 9 } } */
-/* { dg-final { scan-assembler-times {call\s+memset} 3 } } */
-
// v1qi tests: return value (lbu) and function prologue (sb)
// 1 lbu per test, argnum sb's when args > 1
/* { dg-final { scan-assembler-times {lbu\s+a0,\s*[0-9]+\(sp\)} 8 } } */
@@ -169,7 +165,4 @@ DEF_RET1_ARG9 (v4096qi)
/* { dg-final { scan-assembler-times {sd\s+a[0-7],\s*[0-9]+\(sp\)} 103 } } */
// v32-4096qi tests: return value (vse8.v)
-/* { dg-final { scan-assembler-times {vse8.v\s+v[0-9],\s*[0-9]+\(a0\)} 74 } } */
-// v1024-4096qi_ARG1 tests: return value (vse64.v)
-// for some reason ARG1 returns using vse64 instead of vse8
-/* { dg-final { scan-assembler-times {vse64.v\s+v[0-9],\s*[0-9]+\(a0\)\s+ret} 3 } } */
+/* { dg-final { scan-assembler-times {vse8.v\s+v[0-9],\s*[0-9]+\(a0\)} 80 } } */
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-march=rv64gcv_zvl4096b -mrvv-vector-bits=scalable -mabi=lp64d -O3 -fno-schedule-insns2" } */
+/* { dg-options "-march=rv64gcv_zvl4096b -mrvv-vector-bits=scalable -mabi=lp64d -O3 -fno-schedule-insns2 -mrvv-max-lmul=m8" } */
#include "def.h"
@@ -133,10 +133,6 @@ DEF_RET1_ARG9 (v512hi)
DEF_RET1_ARG9 (v1024hi)
DEF_RET1_ARG9 (v2048hi)
-// RET1_ARG0 tests
-/* { dg-final { scan-assembler-times {li\s+a[0-1],\s*0} 8 } } */
-/* { dg-final { scan-assembler-times {call\s+memset} 3 } } */
-
// v1hi tests: return value (lhu) and function prologue (sh)
// 1 lhu per test, argnum sh's when args > 1
/* { dg-final { scan-assembler-times {lhu\s+a0,\s*[0-9]+\(sp\)} 8 } } */
@@ -155,7 +151,4 @@ DEF_RET1_ARG9 (v2048hi)
/* { dg-final { scan-assembler-times {sd\s+a[0-7],\s*[0-9]+\(sp\)} 103 } } */
// v16-2048hi tests: return value (vse16.v)
-/* { dg-final { scan-assembler-times {vse16.v\s+v[0-9],\s*[0-9]+\(a0\)} 74 } } */
-// v512-2048qi_ARG1 tests: return value (vse64.v)
-// for some reason ARG1 returns using vse64 instead of vse16
-/* { dg-final { scan-assembler-times {vse64.v\s+v[0-9],\s*[0-9]+\(a0\)\s+ret} 3 } } */
+/* { dg-final { scan-assembler-times {vse16.v\s+v[0-9],\s*[0-9]+\(a0\)} 80 } } */
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-march=rv64gcv_zvl4096b -mrvv-vector-bits=scalable -mabi=lp64d -O3 -fno-schedule-insns2" } */
+/* { dg-options "-march=rv64gcv_zvl4096b -mrvv-vector-bits=scalable -mabi=lp64d -O3 -fno-schedule-insns2 -mrvv-max-lmul=m8" } */
#include "def.h"
@@ -123,10 +123,6 @@ DEF_RET1_ARG9 (v256si)
DEF_RET1_ARG9 (v512si)
DEF_RET1_ARG9 (v1024si)
-// RET1_ARG0 tests
-/* { dg-final { scan-assembler-times {li\s+a[0-1],\s*0} 7 } } */
-/* { dg-final { scan-assembler-times {call\s+memset} 3 } } */
-
// v1si tests: return value (lw) and function prologue (sw)
// 1 lw per test, argnum sw's when args > 1
/* { dg-final { scan-assembler-times {lw\s+a0,\s*[0-9]+\(sp\)} 8 } } */
@@ -140,7 +136,4 @@ DEF_RET1_ARG9 (v1024si)
/* { dg-final { scan-assembler-times {sd\s+a[0-7],\s*[0-9]+\(sp\)} 103 } } */
// v8-1024si tests: return value (vse32.v)
-/* { dg-final { scan-assembler-times {vse32.v\s+v[0-9],\s*[0-9]+\(a0\)} 74 } } */
-// 256-1024si tests: return value (vse64.v)
-// for some reason ARG1 returns using vse64 instead of vse32
-/* { dg-final { scan-assembler-times {vse64.v\s+v[0-9],\s*[0-9]+\(a0\)\s+ret} 3 } } */
+/* { dg-final { scan-assembler-times {vse32.v\s+v[0-9],\s*[0-9]+\(a0\)} 80 } } */
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-march=rv64gcv_zvl4096b -mrvv-vector-bits=scalable -mabi=lp64d -O3 -fno-schedule-insns2" } */
+/* { dg-options "-march=rv64gcv_zvl4096b -mrvv-vector-bits=scalable -mabi=lp64d -O3 -fno-schedule-insns2 -mrvv-max-lmul=m8" } */
#include "def.h"
@@ -113,10 +113,6 @@ DEF_RET1_ARG9 (v128di)
DEF_RET1_ARG9 (v256di)
DEF_RET1_ARG9 (v512di)
-// RET1_ARG0 tests
-/* { dg-final { scan-assembler-times {li\s+a[0-1],\s*0} 6 } } */
-/* { dg-final { scan-assembler-times {call\s+memset} 3 } } */
-
// v1di and v2di tests: return value (ld) and function prologue (sd)
// - 1 ld per v1di and 2 ld per v2di with args > 1
// - argnum sd's per v1di when argnum > 1
@@ -125,4 +121,4 @@ DEF_RET1_ARG9 (v512di)
/* { dg-final { scan-assembler-times {sd\s+a[0-7],\s*[0-9]+\(sp\)} 103 } } */
// v4-512di tests: return value (vse64.v)
-/* { dg-final { scan-assembler-times {vse64.v\s+v[0-9],\s*[0-9]+\(a0\)} 77 } } */
+/* { dg-final { scan-assembler-times {vse64.v\s+v[0-9],\s*[0-9]+\(a0\)} 80 } } */
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mrvv-vector-bits=scalable -mabi=lp64d -O3 -fno-schedule-insns2" } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mrvv-vector-bits=scalable -mabi=lp64d -O3 -fno-schedule-insns2 -mrvv-max-lmul=m8" } */
#include "def.h"
@@ -133,10 +133,6 @@ DEF_RET1_ARG9 (v512hf)
DEF_RET1_ARG9 (v1024hf)
DEF_RET1_ARG9 (v2048hf)
-// RET1_ARG0 tests
-/* { dg-final { scan-assembler-times {li\s+a[0-1],\s*0} 8 } } */
-/* { dg-final { scan-assembler-times {call\s+memset} 3 } } */
-
// v1hf tests: return value (lhu) and function prologue (sh)
// 1 lhu per test, argnum sh's when args > 1
/* { dg-final { scan-assembler-times {lhu\s+a[0-1],\s*[0-9]+\(sp\)} 8 } } */
@@ -155,7 +151,4 @@ DEF_RET1_ARG9 (v2048hf)
/* { dg-final { scan-assembler-times {sd\s+a[0-7],\s*[0-9]+\(sp\)} 103 } } */
// v16-2048hf tests: return value (vse16.v)
-/* { dg-final { scan-assembler-times {vse16.v\s+v[0-9],\s*[0-9]+\(a0\)} 74 } } */
-// v512-2048qf_ARG1 tests: return value (vse64.v)
-// for some reason ARG1 returns using vse64 instead of vse16
-/* { dg-final { scan-assembler-times {vse64.v\s+v[0-9],\s*[0-9]+\(a0\)\s+ret} 3 } } */
+/* { dg-final { scan-assembler-times {vse16.v\s+v[0-9],\s*[0-9]+\(a0\)} 80 } } */
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-march=rv64gcv_zvl4096b -mrvv-vector-bits=scalable -mabi=lp64d -O3 -fno-schedule-insns2" } */
+/* { dg-options "-march=rv64gcv_zvl4096b -mrvv-vector-bits=scalable -mabi=lp64d -O3 -fno-schedule-insns2 -mrvv-max-lmul=m8" } */
#include "def.h"
@@ -123,10 +123,6 @@ DEF_RET1_ARG9 (v256sf)
DEF_RET1_ARG9 (v512sf)
DEF_RET1_ARG9 (v1024sf)
-// RET1_ARG0 tests
-/* { dg-final { scan-assembler-times {li\s+a[0-1],\s*0} 7 } } */
-/* { dg-final { scan-assembler-times {call\s+memset} 3 } } */
-
// v1sf tests: return value (lw) and function prologue (sw)
// 1 lw per test, argnum sw's when args > 1
/* { dg-final { scan-assembler-times {lw\s+a[0-1],\s*[0-9]+\(sp\)} 8 } } */
@@ -140,7 +136,4 @@ DEF_RET1_ARG9 (v1024sf)
/* { dg-final { scan-assembler-times {sd\s+a[0-7],\s*[0-9]+\(sp\)} 103 } } */
// v8-1024sf tests: return value (vse32.v)
-/* { dg-final { scan-assembler-times {vse32.v\s+v[0-9],\s*[0-9]+\(a0\)} 74 } } */
-// 256-1024sf tests: return value (vse64.v)
-// for some reason ARG1 returns using vse64 instead of vse32
-/* { dg-final { scan-assembler-times {vse64.v\s+v[0-9],\s*[0-9]+\(a0\)\s+ret} 3 } } */
+/* { dg-final { scan-assembler-times {vse32.v\s+v[0-9],\s*[0-9]+\(a0\)} 80 } } */
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-march=rv64gcv_zvl4096b -mrvv-vector-bits=scalable -mabi=lp64d -O3 -fno-schedule-insns2" } */
+/* { dg-options "-march=rv64gcv_zvl4096b -mrvv-vector-bits=scalable -mabi=lp64d -O3 -fno-schedule-insns2 -mrvv-max-lmul=m8" } */
#include "def.h"
@@ -113,10 +113,6 @@ DEF_RET1_ARG9 (v128df)
DEF_RET1_ARG9 (v256df)
DEF_RET1_ARG9 (v512df)
-// RET1_ARG0 tests
-/* { dg-final { scan-assembler-times {li\s+a[0-1],\s*0} 6 } } */
-/* { dg-final { scan-assembler-times {call\s+memset} 3 } } */
-
// v1df and v2df tests: return value (ld) and function prologue (sd)
// - 1 ld per v1df and 2 ld per v2df with args > 1
// - argnum sd's per v1df when argnum > 1
@@ -125,4 +121,4 @@ DEF_RET1_ARG9 (v512df)
/* { dg-final { scan-assembler-times {sd\s+a[0-7],\s*[0-9]+\(sp\)} 103 } } */
// v4-512df tests: return value (vse64.v)
-/* { dg-final { scan-assembler-times {vse64.v\s+v[0-9],\s*[0-9]+\(a0\)} 77 } } */
+/* { dg-final { scan-assembler-times {vse64.v\s+v[0-9],\s*[0-9]+\(a0\)} 80 } } */
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -fno-schedule-insns -fno-schedule-insns2 -mrvv-max-lmul=m8" } */
#include "def.h"
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -fno-schedule-insns -fno-schedule-insns2 -mrvv-max-lmul=m8" } */
#include "def.h"
@@ -12,7 +12,7 @@ extern void *memcpy(void *__restrict dest, const void *__restrict src, __SIZE_TY
/* memcpy should be implemented using the cpymem pattern.
** f1:
XX \.L\d+: # local label is ignored
-** vsetvli\s+[ta][0-7],a2,e8,m8,ta,ma
+** vsetvli\s+[ta][0-7],a2,e8,m1,ta,ma
** vle8\.v\s+v\d+,0\(a1\)
** vse8\.v\s+v\d+,0\(a0\)
** add\s+a1,a1,[ta][0-7]
@@ -31,7 +31,7 @@ void f1 (void *a, void *b, __SIZE_TYPE__ l)
overflow is undefined.
** f2:
XX \.L\d+: # local label is ignored
-** vsetvli\s+[ta][0-7],a2,e8,m8,ta,ma
+** vsetvli\s+[ta][0-7],a2,e8,m1,ta,ma
** vle8\.v\s+v\d+,0\(a1\)
** vse8\.v\s+v\d+,0\(a0\)
** add\s+a1,a1,[ta][0-7]
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-additional-options "-O1 -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-additional-options "-O1 -fno-schedule-insns -fno-schedule-insns2 -mrvv-max-lmul=m8" } */
/* { dg-add-options riscv_v } */
/* { dg-final { check-function-bodies "**" "" } } */
@@ -7,13 +7,14 @@
/* Tiny memmoves should not be vectorised.
** f1:
-** li\s+a2,\d+
-** tail\s+memmove
+** lbu\s+[ta][0-7],0\(a1\)
+** sb\s+[ta][0-7],0\(a0\)
+** ret
*/
char *
f1 (char *a, char const *b)
{
- return __builtin_memmove (a, b, MIN_VECTOR_BYTES - 1);
+ return __builtin_memmove (a, b, 1);
}
/* Vectorise+inline minimum vector register width with LMUL=1
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -ftree-vectorize -mrvv-vector-bits=zvl" } */
+/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -ftree-vectorize -mrvv-vector-bits=zvl -mrvv-max-lmul=m8" } */
#include "riscv_vector.h"
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -ftree-vectorize -mrvv-vector-bits=zvl" } */
+/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -ftree-vectorize -mrvv-vector-bits=zvl -mrvv-max-lmul=m8" } */
#include "riscv_vector.h"
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -ftree-vectorize -mrvv-vector-bits=zvl" } */
+/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -ftree-vectorize -mrvv-vector-bits=zvl -mrvv-max-lmul=m8" } */
#include "riscv_vector.h"
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -ftree-vectorize -mrvv-vector-bits=zvl" } */
+/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -ftree-vectorize -mrvv-vector-bits=zvl -mrvv-max-lmul=m8" } */
#include "riscv_vector.h"
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -ftree-vectorize -mrvv-vector-bits=zvl" } */
+/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -ftree-vectorize -mrvv-vector-bits=zvl -mrvv-max-lmul=m8" } */
#include "riscv_vector.h"
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -ftree-vectorize -mrvv-vector-bits=zvl" } */
+/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -ftree-vectorize -mrvv-vector-bits=zvl -mrvv-max-lmul=m8" } */
#include "riscv_vector.h"
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -ftree-vectorize -mrvv-vector-bits=zvl" } */
+/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -ftree-vectorize -mrvv-vector-bits=zvl -mrvv-max-lmul=m8" } */
#include "riscv_vector.h"
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -ftree-vectorize -mrvv-vector-bits=zvl" } */
+/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -ftree-vectorize -mrvv-vector-bits=zvl -mrvv-max-lmul=m8" } */
#include "riscv_vector.h"
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -ftree-vectorize -mrvv-vector-bits=zvl" } */
+/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -ftree-vectorize -mrvv-vector-bits=zvl -mrvv-max-lmul=m8" } */
#include "riscv_vector.h"
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -ftree-vectorize -mrvv-vector-bits=zvl" } */
+/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -ftree-vectorize -mrvv-vector-bits=zvl -mrvv-max-lmul=m8" } */
#include "riscv_vector.h"
@@ -52,7 +52,7 @@ int main() {
printf("%d\n", m);
}
-/* { dg-final { scan-assembler-times {vsetvli} 2 { target { no-opts "-O0" no-opts "-Os" no-opts "-Oz" no-opts "-funroll-loops" no-opts "-g" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli} 3 { target { no-opts "-O0" no-opts "-Os" no-opts "-Oz" no-opts "-funroll-loops" no-opts "-g" } } } } */
/* { dg-final { scan-assembler-not {vsetivli} } } */
-/* { dg-final { scan-assembler-times {vsetvli\tzero,\s*[a-x0-9]+,\s*e8,\s*m2,\s*t[au],\s*m[au]} 2 { target { no-opts "-O0" no-opts "-Os" no-opts "-Oz" no-opts "-funroll-loops" no-opts "-g" } } } } */
-/* { dg-final { scan-assembler-times {li\t[a-x0-9]+,\s*32} 2 { target { no-opts "-O0" no-opts "-Os" no-opts "-Oz" no-opts "-funroll-loops" no-opts "-g" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli\t[a-x0-9]+,\s*[a-x0-9]+,\s*e8,\s*m1,\s*t[au],\s*m[au]} 3 { target { no-opts "-O0" no-opts "-Os" no-opts "-Oz" no-opts "-funroll-loops" no-opts "-g" } } } } */
+/* { dg-final { scan-assembler-times {li\t[a-x0-9]+,\s*32} 3 { target { no-opts "-O0" no-opts "-Os" no-opts "-Oz" no-opts "-funroll-loops" no-opts "-g" } } } } */
@@ -62,7 +62,7 @@ int main() {
return 0;
}
-/* { dg-final { scan-assembler-times {vsetvli} 4 } } */
+/* { dg-final { scan-assembler-times {vsetvli} 5 } } */
/* { dg-final { scan-assembler-not {vsetivli} } } */
-/* { dg-final { scan-assembler-times {vsetvli\tzero,\s*[a-x0-9]+,\s*e8,\s*m2,\s*t[au],\s*m[au]} 1 } } */
-/* { dg-final { scan-assembler-times {li\t[a-x0-9]+,\s*32} 1 } } */
+/* { dg-final { scan-assembler-times {vsetvli\t[a-x0-9]+,\s*[a-x0-9]+,\s*e8,\s*m1,\s*t[au],\s*m[au]} 2 } } */
+/* { dg-final { scan-assembler-times {li\t[a-x0-9]+,\s*32} 2 } } */