diff mbox series

[2/2] aarch64: Improve part-variable vector initialization with SVE INDEX instruction [PR113328]

Message ID 20240912005129.26758-2-quic_pzheng@quicinc.com
State New
Headers show
Series [1/2] aarch64: Improve vector constant generation using SVE INDEX instruction [PR113328] | expand

Commit Message

Pengxuan Zheng Sept. 12, 2024, 12:51 a.m. UTC
We can still use SVE's INDEX instruction to construct vectors even if not all
elements are constants. For example, { 0, x, 2, 3 } can be constructed by first
using "INDEX #0, #1" to generate { 0, 1, 2, 3 }, and then setting the
non-constant elements separately.

	PR target/113328

gcc/ChangeLog:

	* config/aarch64/aarch64.cc (aarch64_expand_vector_init_fallback):
	Improve part-variable vector generation with SVE's INDEX if TARGET_SVE
	is available.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/acle/general/dupq_1.c: Update test to use
	check-function-bodies.
	* gcc.target/aarch64/sve/acle/general/dupq_2.c: Likewise.
	* gcc.target/aarch64/sve/acle/general/dupq_3.c: Likewise.
	* gcc.target/aarch64/sve/acle/general/dupq_4.c: Likewise.
	* gcc.target/aarch64/sve/vec_init_4.c: New test.
	* gcc.target/aarch64/sve/vec_init_5.c: New test.

Signed-off-by: Pengxuan Zheng <quic_pzheng@quicinc.com>
---
 gcc/config/aarch64/aarch64.cc                 | 81 ++++++++++++++++++-
 .../aarch64/sve/acle/general/dupq_1.c         | 12 ++-
 .../aarch64/sve/acle/general/dupq_2.c         | 12 ++-
 .../aarch64/sve/acle/general/dupq_3.c         | 12 ++-
 .../aarch64/sve/acle/general/dupq_4.c         | 12 ++-
 .../gcc.target/aarch64/sve/vec_init_4.c       | 47 +++++++++++
 .../gcc.target/aarch64/sve/vec_init_5.c       | 12 +++
 7 files changed, 171 insertions(+), 17 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/vec_init_4.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/vec_init_5.c
diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 6b3ca57d0eb..7305a5c6375 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -23942,12 +23942,91 @@  aarch64_expand_vector_init_fallback (rtx target, rtx vals)
   if (n_var != n_elts)
     {
       rtx copy = copy_rtx (vals);
+      bool is_index_seq = false;
+
+      /* If at least half of the elements of the vector are constants and all
+	 these constant elements form a linear sequence of the form { B, B + S,
+	 B + 2 * S, B + 3 * S, ... }, we can generate the vector with SVE's
+	 INDEX instruction if SVE is available and then set the elements which
+	 are not constant separately.  More precisely, each constant element I
+	 has to be B + I * S where B and S must be valid immediate operands for
+	 an SVE INDEX instruction.
+
+	 For example, { X, 1, 2, 3 } is a vector satisfying these conditions and
+	 we can generate a vector of all constants (i.e., { 0, 1, 2, 3 }) first
+	 and then set the first element of the vector to X.  */
+
+      if (TARGET_SVE && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+	  && n_var <= n_elts / 2)
+	{
+	  int const_idx = -1;
+	  HOST_WIDE_INT const_val = 0;
+	  int base = 16;
+	  int step = 16;
+
+	  for (int i = 0; i < n_elts; ++i)
+	    {
+	      rtx x = XVECEXP (vals, 0, i);
+
+	      if (!CONST_INT_P (x))
+		continue;
+
+	      if (const_idx == -1)
+		{
+		  const_idx = i;
+		  const_val = INTVAL (x);
+		}
+	      else
+		{
+		  if ((INTVAL (x) - const_val) % (i - const_idx) == 0)
+		    {
+		      HOST_WIDE_INT s
+			  = (INTVAL (x) - const_val) / (i - const_idx);
+		      if (s >= -16 && s <= 15)
+			{
+			  int b = const_val - s * const_idx;
+			  if (b >= -16 && b <= 15)
+			    {
+			      base = b;
+			      step = s;
+			    }
+			}
+		    }
+		  break;
+		}
+	    }
+
+	  if (base != 16
+	      && (!CONST_INT_P (v0)
+		  || (CONST_INT_P (v0) && INTVAL (v0) == base)))
+	    {
+	      if (!CONST_INT_P (v0))
+		XVECEXP (copy, 0, 0) = GEN_INT (base);
+
+	      is_index_seq = true;
+	      for (int i = 1; i < n_elts; ++i)
+		{
+		  rtx x = XVECEXP (copy, 0, i);
+
+		  if (CONST_INT_P (x))
+		    {
+		      if (INTVAL (x) != base + i * step)
+			{
+			  is_index_seq = false;
+			  break;
+			}
+		    }
+		  else
+		    XVECEXP (copy, 0, i) = GEN_INT (base + i * step);
+		}
+	    }
+	}
 
       /* Load constant part of vector.  We really don't care what goes into the
 	 parts we will overwrite, but we're more likely to be able to load the
 	 constant efficiently if it has fewer, larger, repeating parts
 	 (see aarch64_simd_valid_immediate).  */
-      for (int i = 0; i < n_elts; i++)
+      for (int i = 0; !is_index_seq && i < n_elts; i++)
 	{
 	  rtx x = XVECEXP (vals, 0, i);
 	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
index 0940bedd0dd..74cfe5e7ee3 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
@@ -1,15 +1,19 @@ 
 /* { dg-do compile } */
 /* { dg-options "-O2" } */
 /* { dg-require-effective-target aarch64_little_endian } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
 
 #include <arm_sve.h>
 
+/*
+** dupq:
+**	index	z0\.s, #0, #1
+**	ins	v0\.s\[0\], w0
+**	dup	z0\.q, z0\.q\[0\]
+**	ret
+*/
 svint32_t
 dupq (int x)
 {
   return svdupq_s32 (x, 1, 2, 3);
 }
-
-/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #0, #1} } } */
-/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[0\], w0\n} } } */
-/* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c
index 218a6601337..51e380efba1 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c
@@ -1,15 +1,19 @@ 
 /* { dg-do compile } */
 /* { dg-options "-O2 -mbig-endian" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
 
 /* To avoid needing big-endian header files.  */
 #pragma GCC aarch64 "arm_sve.h"
 
+/*
+** dupq:
+**	index	z0\.s, #3, #-1
+**	ins	v0\.s\[0\], w0
+**	dup	z0\.q, z0\.q\[0\]
+**	ret
+*/
 svint32_t
 dupq (int x)
 {
   return svdupq_s32 (x, 1, 2, 3);
 }
-
-/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #3, #-1} } } */
-/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[0\], w0\n} } } */
-/* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_3.c
index 245d43b75b5..7796862a1fb 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_3.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_3.c
@@ -1,15 +1,19 @@ 
 /* { dg-do compile } */
 /* { dg-options "-O2 -mlittle-endian" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
 
 /* To avoid needing big-endian header files.  */
 #pragma GCC aarch64 "arm_sve.h"
 
+/*
+** dupq:
+**	index	z0\.s, #0, #1
+**	ins	v0\.s\[2\], w0
+**	dup	z0\.q, z0\.q\[0\]
+**	ret
+*/
 svint32_t
 dupq (int x)
 {
   return svdupq_s32 (0, 1, x, 3);
 }
-
-/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #0, #1} } } */
-/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[2\], w0\n} } } */
-/* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_4.c
index cbee6f27b62..eecfc3d363b 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_4.c
@@ -1,15 +1,19 @@ 
 /* { dg-do compile } */
 /* { dg-options "-O2 -mbig-endian" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
 
 /* To avoid needing big-endian header files.  */
 #pragma GCC aarch64 "arm_sve.h"
 
+/*
+** dupq:
+**	index	z0\.s, #3, #-1
+**	ins	v0\.s\[2\], w0
+**	dup	z0\.q, z0\.q\[0\]
+**	ret
+*/
 svint32_t
 dupq (int x)
 {
   return svdupq_s32 (0, 1, x, 3);
 }
-
-/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #3, #-1} } } */
-/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[2\], w0\n} } } */
-/* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vec_init_4.c b/gcc/testsuite/gcc.target/aarch64/sve/vec_init_4.c
new file mode 100644
index 00000000000..898168dc8ac
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/vec_init_4.c
@@ -0,0 +1,47 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+typedef short v8hi __attribute__((vector_size(16)));
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef long v2di __attribute__((vector_size(16)));
+
+/*
+** f:
+**	index	z0\.s, #0, #1
+**	ins	v0\.s\[1\], w0
+**	ret
+*/
+v4si
+f (int x)
+{
+  return (v4si){ 0, x, 2, 3 };
+}
+
+/*
+** f1:
+**	index	z0\.s, #3, #-4
+**	ins	v0\.s\[1\], w0
+**	ins	v0\.s\[2\], w1
+**	ret
+*/
+v4si
+f1 (int x, int y)
+{
+  return (v4si){ 3, x, y, -9 };
+}
+
+/*
+** f2:
+**	index	z0\.h, #4, #2
+**	ins	v0\.h\[0\], w0
+**	ins	v0\.h\[3\], w1
+**	ins	v0\.h\[7\], w2
+**	ret
+*/
+v8hi
+f2 (short x, short y, short z)
+{
+  return (v8hi){ x, 6, 8, y, 12, 14, 16, z };
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vec_init_5.c b/gcc/testsuite/gcc.target/aarch64/sve/vec_init_5.c
new file mode 100644
index 00000000000..e4a71736f5f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/vec_init_5.c
@@ -0,0 +1,12 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+v4si
+f (int x, int y)
+{
+  return (v4si){ 1, x, y, 3 };
+}
+
+/* { dg-final { scan-assembler-not {index} } } */