@@ -2822,6 +2822,89 @@ shuffle_merge_patterns (struct expand_vec_perm_d *d)
return true;
}
+/* Recognize the consecutive index that we can use a single
+   vrgather.v[x|i] to shuffle the vectors.
+
+   e.g. short[8] = VEC_PERM_EXPR <a, a, {2, 3, 2, 3, 2, 3, 2, 3}>
+   Use SEW = 32, index = 1 vrgather.vi to get the result. */
+static bool
+shuffle_consecutive_patterns (struct expand_vec_perm_d *d)
+{
+  machine_mode vmode = d->vmode;
+  scalar_mode smode = GET_MODE_INNER (vmode);
+  poly_int64 vec_len = d->perm.length ();
+  HOST_WIDE_INT elt;
+
+  if (!vec_len.is_constant () || !d->perm[0].is_constant (&elt))
+    return false;
+  int vlen = vec_len.to_constant ();
+
+  /* Compute the last element index of consecutive pattern from the leading
+     consecutive elements.  */
+  int last_consecutive_idx = -1;
+  int consecutive_num = -1;
+  for (int i = 1; i < vlen; i++)
+    {
+      if (maybe_ne (d->perm[i], d->perm[i - 1] + 1))
+	break;
+      last_consecutive_idx = i;
+      consecutive_num = last_consecutive_idx + 1;
+    }
+
+  int new_vlen = vlen / consecutive_num;
+  if (last_consecutive_idx < 0 || consecutive_num == vlen
+      || !pow2p_hwi (consecutive_num) || !pow2p_hwi (new_vlen))
+    return false;
+  /* VEC_PERM <..., (index, index + 1, ... index + consecutive_num - 1)>.
+     All elements of index, index + 1, ... index + consecutive_num - 1 should
+     locate at the same vector.  */
+  if (maybe_ge (d->perm[0], vec_len)
+      != maybe_ge (d->perm[last_consecutive_idx], vec_len))
+    return false;
+  /* If a vector has 8 elements.  We allow optimizations on consecutive
+     patterns e.g. <0, 1, 2, 3, 0, 1, 2, 3> or <4, 5, 6, 7, 4, 5, 6, 7>.
+     Other patterns like <2, 3, 4, 5, 2, 3, 4, 5> are not feasible patterns
+     to be optimized.  */
+  if (d->perm[0].to_constant () % consecutive_num != 0)
+    return false;
+  unsigned int container_bits = consecutive_num * GET_MODE_BITSIZE (smode);
+  if (container_bits > 64)
+    return false;
+  else if (container_bits == 64)
+    {
+      if (!TARGET_VECTOR_ELEN_64)
+	return false;
+      else if (FLOAT_MODE_P (smode) && !TARGET_VECTOR_ELEN_FP_64)
+	return false;
+    }
+
+  /* Check the rest of elements are the same consecutive pattern.  */
+  for (int i = consecutive_num; i < vlen; i++)
+    if (maybe_ne (d->perm[i], d->perm[i % consecutive_num]))
+      return false;
+
+  if (FLOAT_MODE_P (smode))
+    smode = float_mode_for_size (container_bits).require ();
+  else
+    smode = int_mode_for_size (container_bits, 0).require ();
+  if (!get_vector_mode (smode, new_vlen).exists (&vmode))
+    return false;
+  machine_mode sel_mode = related_int_vector_mode (vmode).require ();
+
+  /* Success! */
+  if (d->testing_p)
+    return true;
+
+  int index = elt / consecutive_num;
+  if (index >= new_vlen)
+    index = index - new_vlen;
+  rtx sel = gen_const_vector_dup (sel_mode, index);
+  rtx op = elt >= vlen ? d->op1 : d->op0; /* >= vlen selects second input.  */
+  emit_vlmax_gather_insn (gen_lowpart (vmode, d->target),
+			  gen_lowpart (vmode, op), sel);
+  return true;
+}
+
/* Recognize the patterns that we can use compress operation to shuffle the
vectors. The perm selector of compress pattern is divided into 2 part:
The first part is the random index number < NUNITS.
@@ -3174,6 +3257,8 @@ expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
if (shuffle_merge_patterns (d))
return true;
+ if (shuffle_consecutive_patterns (d))
+ return true;
if (shuffle_compress_patterns (d))
return true;
if (shuffle_decompress_patterns (d))
new file mode 100644
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param riscv-autovec-preference=fixed-vlmax -Wno-psabi" } */
+
+#include <stdint-gcc.h>
+
+typedef int8_t vnx4i __attribute__ ((vector_size (4)));   /* 4 x int8_t  */
+typedef uint8_t vnx4ui __attribute__ ((vector_size (4))); /* 4 x uint8_t  */
+
+#define MASK_4 0, 1, 0, 1 /* repeat elements 0,1 of the first operand */
+
+vnx4i __attribute__ ((noinline, noclone)) test_1 (vnx4i x, vnx4i y)
+{
+  return __builtin_shufflevector (x, y, MASK_4);
+}
+
+vnx4ui __attribute__ ((noinline, noclone)) test_2 (vnx4ui x, vnx4ui y)
+{
+  return __builtin_shufflevector (x, y, MASK_4);
+}
+
+/* { dg-final { scan-assembler-times {\tvrgather\.vi} 2 } } */
new file mode 100644
@@ -0,0 +1,45 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh -mabi=lp64d -O3 --param riscv-autovec-preference=fixed-vlmax -Wno-psabi" } */
+
+#include <stdint-gcc.h>
+
+typedef int8_t vnx8i __attribute__ ((vector_size (8)));   /* 8 x int8_t  */
+typedef int16_t vnx4i __attribute__ ((vector_size (8)));  /* 4 x int16_t  */
+typedef uint8_t vnx8ui __attribute__ ((vector_size (8)));
+typedef uint16_t vnx4ui __attribute__ ((vector_size (8)));
+typedef _Float16 vnx4f __attribute__ ((vector_size (8)));
+
+#define MASK_4 4, 5, 4, 5 /* elements 0,1 of the second operand */
+#define MASK_8 12, 13, 14, 15, 12, 13, 14, 15 /* elements 4-7 of the second operand */
+
+vnx8i __attribute__ ((noinline, noclone))
+test_1 (vnx8i x, vnx8i y)
+{
+  return __builtin_shufflevector (x, y, MASK_8);
+}
+
+vnx4i __attribute__ ((noinline, noclone))
+test_2 (vnx4i x, vnx4i y)
+{
+  return __builtin_shufflevector (x, y, MASK_4);
+}
+
+vnx8ui __attribute__ ((noinline, noclone))
+test_3 (vnx8ui x, vnx8ui y)
+{
+  return __builtin_shufflevector (x, y, MASK_8);
+}
+
+vnx4ui __attribute__ ((noinline, noclone))
+test_4 (vnx4ui x, vnx4ui y)
+{
+  return __builtin_shufflevector (x, y, MASK_4);
+}
+
+vnx4f __attribute__ ((noinline, noclone))
+test_5 (vnx4f x, vnx4f y)
+{
+  return __builtin_shufflevector (x, y, MASK_4);
+}
+
+/* { dg-final { scan-assembler-times {\tvrgather\.vi} 5 } } */
new file mode 100644
@@ -0,0 +1,27 @@
+/* { dg-do run { target { riscv_v } } } */
+/* { dg-options "-O3 --param riscv-autovec-preference=fixed-vlmax -Wno-psabi" } */
+
+#include <assert.h>
+#include "consecutive-1.c"
+
+int
+main (void)
+{
+  vnx4i a1 = {99, 111, 2, 4};
+  vnx4i b1 = {4, 5, 7, 8};
+  vnx4i expected1 = {99, 111, 99, 111};
+  vnx4i actual1;
+  actual1 = test_1 (a1, b1);
+  for (int i = 0; i < 4; i++)
+    assert (actual1[i] == expected1[i]);
+
+  vnx4ui a2 = {99, 111, 2, 4};
+  vnx4ui b2 = {4, 5, 6, 8};
+  vnx4ui expected2 = {99, 111, 99, 111};
+  vnx4ui actual2;
+  actual2 = test_2 (a2, b2);
+  for (int i = 0; i < 4; i++)
+    assert (actual2[i] == expected2[i]);
+
+  return 0;
+}
new file mode 100644
@@ -0,0 +1,51 @@
+/* { dg-do run { target { riscv_v } } } */
+/* { dg-options "-O3 --param riscv-autovec-preference=fixed-vlmax -Wno-psabi" } */
+
+#include <assert.h>
+#include "consecutive-2.c"
+
+int
+main (void)
+{
+  vnx8i a1 = {0, 1, 2, 3, 5, 6, 7, 8};
+  vnx8i b1 = {8, 9, 10, 11, 13, 14, 15, 16};
+  vnx8i expected1 = {13, 14, 15, 16, 13, 14, 15, 16};
+  vnx8i actual1;
+  actual1 = test_1 (a1, b1);
+  for (int i = 0; i < 8; i++)
+    assert (actual1[i] == expected1[i]);
+
+  vnx4i a2 = {1, 2, 3, 4};
+  vnx4i b2 = {5, 6, 7, 8};
+  vnx4i expected2 = {5, 6, 5, 6};
+  vnx4i actual2;
+  actual2 = test_2 (a2, b2);
+  for (int i = 0; i < 4; i++)
+    assert (actual2[i] == expected2[i]);
+
+  vnx8ui a3 = {0, 1, 2, 3, 4, 5, 6, 8};
+  vnx8ui b3 = {8, 9, 10, 11, 12, 13, 15, 16};
+  vnx8ui expected3 = {12, 13, 15, 16, 12, 13, 15, 16};
+  vnx8ui actual3;
+  actual3 = test_3 (a3, b3);
+  for (int i = 0; i < 8; i++)
+    assert (actual3[i] == expected3[i]);
+
+  vnx4ui a4 = {1, 2, 3, 4};
+  vnx4ui b4 = {4, 5, 6, 8};
+  vnx4ui expected4 = {4, 5, 4, 5};
+  vnx4ui actual4;
+  actual4 = test_4 (a4, b4);
+  for (int i = 0; i < 4; i++)
+    assert (actual4[i] == expected4[i]);
+
+  vnx4f a5 = {0, 1, 3, 4};
+  vnx4f b5 = {4, 5, 6, 7};
+  vnx4f expected5 = {4, 5, 4, 5};
+  vnx4f actual5;
+  actual5 = test_5 (a5, b5);
+  for (int i = 0; i < 4; i++)
+    assert (actual5[i] == expected5[i]);
+
+  return 0;
+}
new file mode 100644
@@ -0,0 +1,94 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -fdump-tree-optimized" } */
+
+#include "def.h"
+
+#define MASK_8 0, 1, 0, 1, 0, 1, 0, 1 /* aligned start 0 -> vrgather.vi index 0 */
+#define MASK_16 MASK_8, MASK_8
+#define MASK_32 MASK_16, MASK_16
+#define MASK_64 MASK_32, MASK_32
+#define MASK_64 MASK_32, MASK_32 /* NOTE(review): duplicate identical redefinition -- harmless, could be dropped */
+#define MASK_128 MASK_64, MASK_64
+#define MASK_256 MASK_128, MASK_128
+#define MASK_512 MASK_256, MASK_256
+#define MASK_1024 MASK_512, MASK_512
+#define MASK_2048 MASK_1024, MASK_1024
+#define MASK_4096 MASK_2048, MASK_2048
+
+DEF_CONSECUTIVE (v8qi, 8)
+DEF_CONSECUTIVE (v16qi, 16)
+DEF_CONSECUTIVE (v32qi, 32)
+DEF_CONSECUTIVE (v64qi, 64)
+DEF_CONSECUTIVE (v128qi, 128)
+DEF_CONSECUTIVE (v256qi, 256)
+DEF_CONSECUTIVE (v512qi, 512)
+DEF_CONSECUTIVE (v1024qi, 1024)
+DEF_CONSECUTIVE (v2048qi, 2048)
+DEF_CONSECUTIVE (v4096qi, 4096)
+DEF_CONSECUTIVE (v8uqi, 8)
+DEF_CONSECUTIVE (v16uqi, 16)
+DEF_CONSECUTIVE (v32uqi, 32)
+DEF_CONSECUTIVE (v64uqi, 64)
+DEF_CONSECUTIVE (v128uqi, 128)
+DEF_CONSECUTIVE (v256uqi, 256)
+DEF_CONSECUTIVE (v512uqi, 512)
+DEF_CONSECUTIVE (v1024uqi, 1024)
+DEF_CONSECUTIVE (v2048uqi, 2048)
+DEF_CONSECUTIVE (v4096uqi, 4096)
+
+DEF_CONSECUTIVE (v8hi, 8)
+DEF_CONSECUTIVE (v16hi, 16)
+DEF_CONSECUTIVE (v32hi, 32)
+DEF_CONSECUTIVE (v64hi, 64)
+DEF_CONSECUTIVE (v128hi, 128)
+DEF_CONSECUTIVE (v256hi, 256)
+DEF_CONSECUTIVE (v512hi, 512)
+DEF_CONSECUTIVE (v1024hi, 1024)
+DEF_CONSECUTIVE (v2048hi, 2048)
+DEF_CONSECUTIVE (v8uhi, 8)
+DEF_CONSECUTIVE (v16uhi, 16)
+DEF_CONSECUTIVE (v32uhi, 32)
+DEF_CONSECUTIVE (v64uhi, 64)
+DEF_CONSECUTIVE (v128uhi, 128)
+DEF_CONSECUTIVE (v256uhi, 256)
+DEF_CONSECUTIVE (v512uhi, 512)
+DEF_CONSECUTIVE (v1024uhi, 1024)
+DEF_CONSECUTIVE (v2048uhi, 2048)
+
+DEF_CONSECUTIVE (v8si, 8)
+DEF_CONSECUTIVE (v16si, 16)
+DEF_CONSECUTIVE (v32si, 32)
+DEF_CONSECUTIVE (v64si, 64)
+DEF_CONSECUTIVE (v128si, 128)
+DEF_CONSECUTIVE (v256si, 256)
+DEF_CONSECUTIVE (v512si, 512)
+DEF_CONSECUTIVE (v1024si, 1024)
+DEF_CONSECUTIVE (v8usi, 8)
+DEF_CONSECUTIVE (v16usi, 16)
+DEF_CONSECUTIVE (v32usi, 32)
+DEF_CONSECUTIVE (v64usi, 64)
+DEF_CONSECUTIVE (v128usi, 128)
+DEF_CONSECUTIVE (v256usi, 256)
+DEF_CONSECUTIVE (v512usi, 512)
+DEF_CONSECUTIVE (v1024usi, 1024)
+
+DEF_CONSECUTIVE (v8hf, 8)
+DEF_CONSECUTIVE (v16hf, 16)
+DEF_CONSECUTIVE (v32hf, 32)
+DEF_CONSECUTIVE (v64hf, 64)
+DEF_CONSECUTIVE (v128hf, 128)
+DEF_CONSECUTIVE (v256hf, 256)
+DEF_CONSECUTIVE (v512hf, 512)
+DEF_CONSECUTIVE (v1024hf, 1024)
+DEF_CONSECUTIVE (v2048hf, 2048)
+
+DEF_CONSECUTIVE (v8sf, 8)
+DEF_CONSECUTIVE (v16sf, 16)
+DEF_CONSECUTIVE (v32sf, 32)
+DEF_CONSECUTIVE (v64sf, 64)
+DEF_CONSECUTIVE (v128sf, 128)
+DEF_CONSECUTIVE (v256sf, 256)
+DEF_CONSECUTIVE (v512sf, 512)
+DEF_CONSECUTIVE (v1024sf, 1024)
+
+/* { dg-final { scan-assembler-times {vrgather\.vi\s+v[0-9]+,\s*v[0-9]+,\s*0} 71 } } */
new file mode 100644
@@ -0,0 +1,68 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -fdump-tree-optimized" } */
+
+#include "def.h"
+
+#define MASK_8 4, 5, 6, 7, 4, 5, 6, 7 /* aligned start 4 -> vrgather.vi index 1 */
+#define MASK_16 MASK_8, MASK_8
+#define MASK_32 MASK_16, MASK_16
+#define MASK_64 MASK_32, MASK_32
+#define MASK_64 MASK_32, MASK_32 /* NOTE(review): duplicate identical redefinition -- harmless, could be dropped */
+#define MASK_128 MASK_64, MASK_64
+#define MASK_256 MASK_128, MASK_128
+#define MASK_512 MASK_256, MASK_256
+#define MASK_1024 MASK_512, MASK_512
+#define MASK_2048 MASK_1024, MASK_1024
+#define MASK_4096 MASK_2048, MASK_2048
+
+DEF_CONSECUTIVE (v8qi, 8)
+DEF_CONSECUTIVE (v16qi, 16)
+DEF_CONSECUTIVE (v32qi, 32)
+DEF_CONSECUTIVE (v64qi, 64)
+DEF_CONSECUTIVE (v128qi, 128)
+DEF_CONSECUTIVE (v256qi, 256)
+DEF_CONSECUTIVE (v512qi, 512)
+DEF_CONSECUTIVE (v1024qi, 1024)
+DEF_CONSECUTIVE (v2048qi, 2048)
+DEF_CONSECUTIVE (v4096qi, 4096)
+DEF_CONSECUTIVE (v8uqi, 8)
+DEF_CONSECUTIVE (v16uqi, 16)
+DEF_CONSECUTIVE (v32uqi, 32)
+DEF_CONSECUTIVE (v64uqi, 64)
+DEF_CONSECUTIVE (v128uqi, 128)
+DEF_CONSECUTIVE (v256uqi, 256)
+DEF_CONSECUTIVE (v512uqi, 512)
+DEF_CONSECUTIVE (v1024uqi, 1024)
+DEF_CONSECUTIVE (v2048uqi, 2048)
+DEF_CONSECUTIVE (v4096uqi, 4096)
+
+DEF_CONSECUTIVE (v8hi, 8)
+DEF_CONSECUTIVE (v16hi, 16)
+DEF_CONSECUTIVE (v32hi, 32)
+DEF_CONSECUTIVE (v64hi, 64)
+DEF_CONSECUTIVE (v128hi, 128)
+DEF_CONSECUTIVE (v256hi, 256)
+DEF_CONSECUTIVE (v512hi, 512)
+DEF_CONSECUTIVE (v1024hi, 1024)
+DEF_CONSECUTIVE (v2048hi, 2048)
+DEF_CONSECUTIVE (v8uhi, 8)
+DEF_CONSECUTIVE (v16uhi, 16)
+DEF_CONSECUTIVE (v32uhi, 32)
+DEF_CONSECUTIVE (v64uhi, 64)
+DEF_CONSECUTIVE (v128uhi, 128)
+DEF_CONSECUTIVE (v256uhi, 256)
+DEF_CONSECUTIVE (v512uhi, 512)
+DEF_CONSECUTIVE (v1024uhi, 1024)
+DEF_CONSECUTIVE (v2048uhi, 2048)
+
+DEF_CONSECUTIVE (v8hf, 8)
+DEF_CONSECUTIVE (v16hf, 16)
+DEF_CONSECUTIVE (v32hf, 32)
+DEF_CONSECUTIVE (v64hf, 64)
+DEF_CONSECUTIVE (v128hf, 128)
+DEF_CONSECUTIVE (v256hf, 256)
+DEF_CONSECUTIVE (v512hf, 512)
+DEF_CONSECUTIVE (v1024hf, 1024)
+DEF_CONSECUTIVE (v2048hf, 2048)
+
+/* { dg-final { scan-assembler-times {vrgather\.vi\s+v[0-9]+,\s*v[0-9]+,\s*1} 47 } } */
new file mode 100644
@@ -0,0 +1,68 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 -fdump-tree-optimized" } */
+
+#include "def.h"
+
+#define MASK_8 2, 3, 4, 5, 2, 3, 4, 5 /* start 2 is not a multiple of the run length 4, so no vrgather.vi expected */
+#define MASK_16 MASK_8, MASK_8
+#define MASK_32 MASK_16, MASK_16
+#define MASK_64 MASK_32, MASK_32
+#define MASK_64 MASK_32, MASK_32 /* NOTE(review): duplicate identical redefinition -- harmless, could be dropped */
+#define MASK_128 MASK_64, MASK_64
+#define MASK_256 MASK_128, MASK_128
+#define MASK_512 MASK_256, MASK_256
+#define MASK_1024 MASK_512, MASK_512
+#define MASK_2048 MASK_1024, MASK_1024
+#define MASK_4096 MASK_2048, MASK_2048
+
+DEF_CONSECUTIVE (v8qi, 8)
+DEF_CONSECUTIVE (v16qi, 16)
+DEF_CONSECUTIVE (v32qi, 32)
+DEF_CONSECUTIVE (v64qi, 64)
+DEF_CONSECUTIVE (v128qi, 128)
+DEF_CONSECUTIVE (v256qi, 256)
+DEF_CONSECUTIVE (v512qi, 512)
+DEF_CONSECUTIVE (v1024qi, 1024)
+DEF_CONSECUTIVE (v2048qi, 2048)
+DEF_CONSECUTIVE (v4096qi, 4096)
+DEF_CONSECUTIVE (v8uqi, 8)
+DEF_CONSECUTIVE (v16uqi, 16)
+DEF_CONSECUTIVE (v32uqi, 32)
+DEF_CONSECUTIVE (v64uqi, 64)
+DEF_CONSECUTIVE (v128uqi, 128)
+DEF_CONSECUTIVE (v256uqi, 256)
+DEF_CONSECUTIVE (v512uqi, 512)
+DEF_CONSECUTIVE (v1024uqi, 1024)
+DEF_CONSECUTIVE (v2048uqi, 2048)
+DEF_CONSECUTIVE (v4096uqi, 4096)
+
+DEF_CONSECUTIVE (v8hi, 8)
+DEF_CONSECUTIVE (v16hi, 16)
+DEF_CONSECUTIVE (v32hi, 32)
+DEF_CONSECUTIVE (v64hi, 64)
+DEF_CONSECUTIVE (v128hi, 128)
+DEF_CONSECUTIVE (v256hi, 256)
+DEF_CONSECUTIVE (v512hi, 512)
+DEF_CONSECUTIVE (v1024hi, 1024)
+DEF_CONSECUTIVE (v2048hi, 2048)
+DEF_CONSECUTIVE (v8uhi, 8)
+DEF_CONSECUTIVE (v16uhi, 16)
+DEF_CONSECUTIVE (v32uhi, 32)
+DEF_CONSECUTIVE (v64uhi, 64)
+DEF_CONSECUTIVE (v128uhi, 128)
+DEF_CONSECUTIVE (v256uhi, 256)
+DEF_CONSECUTIVE (v512uhi, 512)
+DEF_CONSECUTIVE (v1024uhi, 1024)
+DEF_CONSECUTIVE (v2048uhi, 2048)
+
+DEF_CONSECUTIVE (v8hf, 8)
+DEF_CONSECUTIVE (v16hf, 16)
+DEF_CONSECUTIVE (v32hf, 32)
+DEF_CONSECUTIVE (v64hf, 64)
+DEF_CONSECUTIVE (v128hf, 128)
+DEF_CONSECUTIVE (v256hf, 256)
+DEF_CONSECUTIVE (v512hf, 512)
+DEF_CONSECUTIVE (v1024hf, 1024)
+DEF_CONSECUTIVE (v2048hf, 2048)
+
+/* { dg-final { scan-assembler-not {vrgather\.vi\s+v[0-9]+,\s*v[0-9]+,\s*1} } } */
@@ -833,3 +833,9 @@ typedef double v512df __attribute__ ((vector_size (4096)));
a[i] = cond[i] ? (TYPE3) (b[i] >> shift) : a[i]; \
return a; \
}
+
+#define DEF_CONSECUTIVE(TYPE, NUM) /* f<TYPE>: shuffle A/B by MASK_<NUM> */ \
+  TYPE f##TYPE (TYPE a, TYPE b) \
+  { \
+    return __builtin_shufflevector (a, b, MASK_##NUM); \
+  }