aarch64: Handle SVE modes in aarch64_evpc_reencode

Message ID mptjzeo6t3m.fsf@arm.com
State New
Series aarch64: Handle SVE modes in aarch64_evpc_reencode

Commit Message

Richard Sandiford Oct. 4, 2024, 10:49 a.m. UTC
For Advanced SIMD modes, aarch64_evpc_reencode tests whether
a permute in a narrow element mode can be done more cheaply
in a wider mode.  For example, { 0, 1, 8, 9, 4, 5, 12, 13 }
on V8HI is a natural TRN1 on V4SI ({ 0, 4, 2, 6 }).
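
As a standalone illustration (not part of the patch), that V8HI permute can
be written with GCC's generic vector extensions (__builtin_shufflevector
needs GCC 12 or later).  Since each pair of 16-bit lanes moves together, the
shuffle is equivalent to { 0, 4, 2, 6 } on 32-bit lanes and should already
be emitted as a single TRN1 on .4s; the function name below is just for
illustration:

/* Sketch only: the V8HI permute from the description above.  */
typedef unsigned short v8hi __attribute__ ((vector_size (16)));

v8hi
trn1_via_v4si (v8hi x, v8hi y)
{
  return __builtin_shufflevector (x, y, 0, 1, 8, 9, 4, 5, 12, 13);
}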

This patch extends the code to handle SVE data and predicate
modes as well.  This is a prerequisite to getting good results
for PR116583.

Tested on aarch64-linux-gnu (with and without SVE enabled by default).
I'll push on Monday if there are no comments before then.

Thanks,
Richard


gcc/
	PR target/116583
	* config/aarch64/aarch64.cc (aarch64_coalesce_units): New function,
	extending the Advanced SIMD handling from...
	(aarch64_evpc_reencode): ...here to SVE data and predicate modes.

gcc/testsuite/
	PR target/116583
	* gcc.target/aarch64/sve/permute_1.c: New test.
	* gcc.target/aarch64/sve/permute_2.c: Likewise.
	* gcc.target/aarch64/sve/permute_3.c: Likewise.
	* gcc.target/aarch64/sve/permute_4.c: Likewise.
---
 gcc/config/aarch64/aarch64.cc                 |  55 +++-
 .../gcc.target/aarch64/sve/permute_1.c        | 106 +++++++
 .../gcc.target/aarch64/sve/permute_2.c        | 277 ++++++++++++++++++
 .../gcc.target/aarch64/sve/permute_3.c        |  91 ++++++
 .../gcc.target/aarch64/sve/permute_4.c        | 113 +++++++
 5 files changed, 633 insertions(+), 9 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/permute_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/permute_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/permute_3.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/permute_4.c

Patch

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index e7bb3278a27..102680a0efc 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -1933,6 +1933,46 @@  aarch64_sve_int_mode (machine_mode mode)
   return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
 }
 
+/* Look for a vector mode with the same classification as VEC_MODE,
+   but with each group of FACTOR elements coalesced into a single element.
+   In other words, look for a mode in which the elements are FACTOR times
+   larger and in which the number of elements is FACTOR times smaller.
+
+   Return the mode found, if one exists.  */
+
+static opt_machine_mode
+aarch64_coalesce_units (machine_mode vec_mode, unsigned int factor)
+{
+  auto elt_bits = vector_element_size (GET_MODE_BITSIZE (vec_mode),
+				       GET_MODE_NUNITS (vec_mode));
+  auto vec_flags = aarch64_classify_vector_mode (vec_mode);
+  if (vec_flags & VEC_SVE_PRED)
+    {
+      if (known_eq (GET_MODE_SIZE (vec_mode), BYTES_PER_SVE_PRED))
+	return aarch64_sve_pred_mode (elt_bits * factor);
+      return {};
+    }
+
+  scalar_mode new_elt_mode;
+  if (!int_mode_for_size (elt_bits * factor, false).exists (&new_elt_mode))
+    return {};
+
+  if (vec_flags == VEC_ADVSIMD)
+    {
+      auto mode = aarch64_simd_container_mode (new_elt_mode,
+					       GET_MODE_BITSIZE (vec_mode));
+      if (mode != word_mode)
+	return mode;
+    }
+  else if (vec_flags & VEC_SVE_DATA)
+    {
+      poly_uint64 new_nunits;
+      if (multiple_p (GET_MODE_NUNITS (vec_mode), factor, &new_nunits))
+	return aarch64_sve_data_mode (new_elt_mode, new_nunits);
+    }
+  return {};
+}
+
 /* Implement TARGET_VECTORIZE_RELATED_MODE.  */
 
 static opt_machine_mode
@@ -25731,26 +25771,23 @@  aarch64_evpc_reencode (struct expand_vec_perm_d *d)
 {
   expand_vec_perm_d newd;
 
-  if (d->vec_flags != VEC_ADVSIMD)
+  /* The subregs that we'd create are not supported for big-endian SVE;
+     see aarch64_modes_compatible_p for details.  */
+  if (BYTES_BIG_ENDIAN && (d->vec_flags & VEC_ANY_SVE))
     return false;
 
   /* Get the new mode.  Always twice the size of the inner
      and half the elements.  */
-  poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode);
-  unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2;
-  auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require ();
-  machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits);
-
-  if (new_mode == word_mode)
+  machine_mode new_mode;
+  if (!aarch64_coalesce_units (d->vmode, 2).exists (&new_mode))
     return false;
 
   vec_perm_indices newpermindices;
-
   if (!newpermindices.new_shrunk_vector (d->perm, 2))
     return false;
 
   newd.vmode = new_mode;
-  newd.vec_flags = VEC_ADVSIMD;
+  newd.vec_flags = d->vec_flags;
   newd.op_mode = newd.vmode;
   newd.op_vec_flags = newd.vec_flags;
   newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/permute_1.c b/gcc/testsuite/gcc.target/aarch64/sve/permute_1.c
new file mode 100644
index 00000000000..90aeef32188
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/permute_1.c
@@ -0,0 +1,106 @@ 
+/* { dg-options "-O -msve-vector-bits=256" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+typedef __SVInt32_t vint32 __attribute__((arm_sve_vector_bits(256)));
+typedef __SVFloat32_t vfloat32 __attribute__((arm_sve_vector_bits(256)));
+
+#define TESTS(TYPE)							\
+  TYPE									\
+  TYPE##_zip1_d (TYPE x, TYPE y)					\
+  {									\
+    return __builtin_shufflevector (x, y, 0, 1, 8, 9, 2, 3, 10, 11);	\
+  }									\
+									\
+  TYPE									\
+  TYPE##_zip2_d (TYPE x, TYPE y)					\
+  {									\
+    return __builtin_shufflevector (x, y, 4, 5, 12, 13, 6, 7, 14, 15);	\
+  }									\
+									\
+  TYPE									\
+  TYPE##_trn1_d (TYPE x, TYPE y)					\
+  {									\
+    return __builtin_shufflevector (x, y, 0, 1, 8, 9, 4, 5, 12, 13);	\
+  }									\
+									\
+  TYPE									\
+  TYPE##_trn2_d (TYPE x, TYPE y)					\
+  {									\
+    return __builtin_shufflevector (x, y, 2, 3, 10, 11, 6, 7, 14, 15);	\
+  }									\
+									\
+  TYPE									\
+  TYPE##_uzp1_d (TYPE x, TYPE y)					\
+  {									\
+    return __builtin_shufflevector (x, y, 0, 1, 4, 5, 8, 9, 12, 13);	\
+  }									\
+									\
+  TYPE									\
+  TYPE##_uzp2_d (TYPE x, TYPE y)					\
+  {									\
+    return __builtin_shufflevector (x, y, 2, 3, 6, 7, 10, 11, 14, 15);	\
+  }
+
+/*
+** vint32_zip1_d:
+**	zip1	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+/*
+** vint32_zip2_d:
+**	zip2	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+/*
+** vint32_trn1_d:
+**	trn1	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+/*
+** vint32_trn2_d:
+**	trn2	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+/*
+** vint32_uzp1_d:
+**	uzp1	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+/*
+** vint32_uzp2_d:
+**	uzp2	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+TESTS (vint32)
+
+/*
+** vfloat32_zip1_d:
+**	zip1	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+/*
+** vfloat32_zip2_d:
+**	zip2	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+/*
+** vfloat32_trn1_d:
+**	trn1	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+/*
+** vfloat32_trn2_d:
+**	trn2	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+/*
+** vfloat32_uzp1_d:
+**	uzp1	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+/*
+** vfloat32_uzp2_d:
+**	uzp2	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+TESTS (vfloat32)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/permute_2.c b/gcc/testsuite/gcc.target/aarch64/sve/permute_2.c
new file mode 100644
index 00000000000..085e05e0f7f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/permute_2.c
@@ -0,0 +1,277 @@ 
+/* { dg-options "-O -msve-vector-bits=256" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+typedef __SVUint16_t vuint16 __attribute__((arm_sve_vector_bits(256)));
+typedef __SVFloat16_t vfloat16 __attribute__((arm_sve_vector_bits(256)));
+typedef __SVBfloat16_t vbfloat16 __attribute__((arm_sve_vector_bits(256)));
+
+#define TESTS(TYPE)							\
+  TYPE									\
+  TYPE##_zip1_d (TYPE x, TYPE y)					\
+  {									\
+    return __builtin_shufflevector (x, y, 0, 1, 2, 3, 16, 17, 18, 19,	\
+				    4, 5, 6, 7, 20, 21, 22, 23);	\
+  }									\
+									\
+  TYPE									\
+  TYPE##_zip2_d (TYPE x, TYPE y)					\
+  {									\
+    return __builtin_shufflevector (x, y, 8, 9, 10, 11, 24, 25, 26, 27,	\
+				    12, 13, 14, 15, 28, 29, 30, 31);	\
+  }									\
+									\
+  TYPE									\
+  TYPE##_trn1_d (TYPE x, TYPE y)					\
+  {									\
+    return __builtin_shufflevector (x, y, 0, 1, 2, 3, 16, 17, 18, 19,	\
+				    8, 9, 10, 11, 24, 25, 26, 27);	\
+  }									\
+									\
+  TYPE									\
+  TYPE##_trn2_d (TYPE x, TYPE y)					\
+  {									\
+    return __builtin_shufflevector (x, y, 4, 5, 6, 7, 20, 21, 22, 23,	\
+				    12, 13, 14, 15, 28, 29, 30, 31);	\
+  }									\
+									\
+  TYPE									\
+  TYPE##_uzp1_d (TYPE x, TYPE y)					\
+  {									\
+    return __builtin_shufflevector (x, y, 0, 1, 2, 3, 8, 9, 10, 11,	\
+				    16, 17, 18, 19, 24, 25, 26, 27);	\
+  }									\
+									\
+  TYPE									\
+  TYPE##_uzp2_d (TYPE x, TYPE y)					\
+  {									\
+    return __builtin_shufflevector (x, y, 4, 5, 6, 7, 12, 13, 14, 15,	\
+				    20, 21, 22, 23, 28, 29, 30, 31);	\
+  }									\
+									\
+  TYPE									\
+  TYPE##_zip1_s (TYPE x, TYPE y)					\
+  {									\
+    return __builtin_shufflevector (x, y, 0, 1, 16, 17, 2, 3, 18, 19,	\
+				    4, 5, 20, 21, 6, 7, 22, 23);	\
+  }									\
+									\
+  TYPE									\
+  TYPE##_zip2_s (TYPE x, TYPE y)					\
+  {									\
+    return __builtin_shufflevector (x, y, 8, 9, 24, 25, 10, 11, 26, 27,	\
+				    12, 13, 28, 29, 14, 15, 30, 31);	\
+  }									\
+									\
+  TYPE									\
+  TYPE##_trn1_s (TYPE x, TYPE y)					\
+  {									\
+    return __builtin_shufflevector (x, y, 0, 1, 16, 17, 4, 5, 20, 21,	\
+				    8, 9, 24, 25, 12, 13, 28, 29);	\
+  }									\
+									\
+  TYPE									\
+  TYPE##_trn2_s (TYPE x, TYPE y)					\
+  {									\
+    return __builtin_shufflevector (x, y, 2, 3, 18, 19, 6, 7, 22, 23,	\
+				    10, 11, 26, 27, 14, 15, 30, 31);	\
+  }									\
+									\
+  TYPE									\
+  TYPE##_uzp1_s (TYPE x, TYPE y)					\
+  {									\
+    return __builtin_shufflevector (x, y, 0, 1, 4, 5, 8, 9, 12, 13,	\
+				    16, 17, 20, 21, 24, 25, 28, 29);	\
+  }									\
+									\
+  TYPE									\
+  TYPE##_uzp2_s (TYPE x, TYPE y)					\
+  {									\
+    return __builtin_shufflevector (x, y, 2, 3, 6, 7, 10, 11, 14, 15,	\
+				    18, 19, 22, 23, 26, 27, 30, 31);	\
+  }
+
+/*
+** vuint16_zip1_d:
+**	zip1	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+/*
+** vuint16_zip2_d:
+**	zip2	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+/*
+** vuint16_trn1_d:
+**	trn1	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+/*
+** vuint16_trn2_d:
+**	trn2	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+/*
+** vuint16_uzp1_d:
+**	uzp1	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+/*
+** vuint16_uzp2_d:
+**	uzp2	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+/*
+** vuint16_zip1_s:
+**	zip1	z0\.s, z0\.s, z1\.s
+**	ret
+*/
+/*
+** vuint16_zip2_s:
+**	zip2	z0\.s, z0\.s, z1\.s
+**	ret
+*/
+/*
+** vuint16_trn1_s:
+**	trn1	z0\.s, z0\.s, z1\.s
+**	ret
+*/
+/*
+** vuint16_trn2_s:
+**	trn2	z0\.s, z0\.s, z1\.s
+**	ret
+*/
+/*
+** vuint16_uzp1_s:
+**	uzp1	z0\.s, z0\.s, z1\.s
+**	ret
+*/
+/*
+** vuint16_uzp2_s:
+**	uzp2	z0\.s, z0\.s, z1\.s
+**	ret
+*/
+TESTS (vuint16)
+
+/*
+** vfloat16_zip1_d:
+**	zip1	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+/*
+** vfloat16_zip2_d:
+**	zip2	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+/*
+** vfloat16_trn1_d:
+**	trn1	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+/*
+** vfloat16_trn2_d:
+**	trn2	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+/*
+** vfloat16_uzp1_d:
+**	uzp1	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+/*
+** vfloat16_uzp2_d:
+**	uzp2	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+/*
+** vfloat16_zip1_s:
+**	zip1	z0\.s, z0\.s, z1\.s
+**	ret
+*/
+/*
+** vfloat16_zip2_s:
+**	zip2	z0\.s, z0\.s, z1\.s
+**	ret
+*/
+/*
+** vfloat16_trn1_s:
+**	trn1	z0\.s, z0\.s, z1\.s
+**	ret
+*/
+/*
+** vfloat16_trn2_s:
+**	trn2	z0\.s, z0\.s, z1\.s
+**	ret
+*/
+/*
+** vfloat16_uzp1_s:
+**	uzp1	z0\.s, z0\.s, z1\.s
+**	ret
+*/
+/*
+** vfloat16_uzp2_s:
+**	uzp2	z0\.s, z0\.s, z1\.s
+**	ret
+*/
+TESTS (vfloat16)
+
+/*
+** vbfloat16_zip1_d:
+**	zip1	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+/*
+** vbfloat16_zip2_d:
+**	zip2	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+/*
+** vbfloat16_trn1_d:
+**	trn1	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+/*
+** vbfloat16_trn2_d:
+**	trn2	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+/*
+** vbfloat16_uzp1_d:
+**	uzp1	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+/*
+** vbfloat16_uzp2_d:
+**	uzp2	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+/*
+** vbfloat16_zip1_s:
+**	zip1	z0\.s, z0\.s, z1\.s
+**	ret
+*/
+/*
+** vbfloat16_zip2_s:
+**	zip2	z0\.s, z0\.s, z1\.s
+**	ret
+*/
+/*
+** vbfloat16_trn1_s:
+**	trn1	z0\.s, z0\.s, z1\.s
+**	ret
+*/
+/*
+** vbfloat16_trn2_s:
+**	trn2	z0\.s, z0\.s, z1\.s
+**	ret
+*/
+/*
+** vbfloat16_uzp1_s:
+**	uzp1	z0\.s, z0\.s, z1\.s
+**	ret
+*/
+/*
+** vbfloat16_uzp2_s:
+**	uzp2	z0\.s, z0\.s, z1\.s
+**	ret
+*/
+TESTS (vbfloat16)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/permute_3.c b/gcc/testsuite/gcc.target/aarch64/sve/permute_3.c
new file mode 100644
index 00000000000..0a88ce0e889
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/permute_3.c
@@ -0,0 +1,91 @@ 
+/* { dg-options "-O -msve-vector-bits=256" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+typedef __SVInt8_t vint8 __attribute__((arm_sve_vector_bits(256)));
+
+#define TESTS(TYPE)							\
+  TYPE									\
+  TYPE##_zip1_d (TYPE x, TYPE y)					\
+  {									\
+    return __builtin_shufflevector (x, y, 0, 1, 2, 3, 4, 5, 6, 7,	\
+				    32, 33, 34, 35, 36, 37, 38, 39,	\
+				    8, 9, 10, 11, 12, 13, 14, 15,	\
+				    40, 41, 42, 43, 44, 45, 46, 47);	\
+  }									\
+									\
+  TYPE									\
+  TYPE##_zip2_s (TYPE x, TYPE y)					\
+  {									\
+    return __builtin_shufflevector (x, y, 16, 17, 18, 19, 48, 49, 50, 51, \
+				    20, 21, 22, 23, 52, 53, 54, 55,	\
+				    24, 25, 26, 27, 56, 57, 58, 59,	\
+				    28, 29, 30, 31, 60, 61, 62, 63);	\
+  }									\
+									\
+  TYPE									\
+  TYPE##_trn1_h (TYPE x, TYPE y)					\
+  {									\
+    return __builtin_shufflevector (x, y, 0, 1, 32, 33, 4, 5, 36, 37,	\
+				    8, 9, 40, 41, 12, 13, 44, 45,	\
+				    16, 17, 48, 49, 20, 21, 52, 53,	\
+				    24, 25, 56, 57, 28, 29, 60, 61);	\
+  }									\
+									\
+  TYPE									\
+  TYPE##_trn2_d (TYPE x, TYPE y)					\
+  {									\
+    return __builtin_shufflevector (x, y, 8, 9, 10, 11, 12, 13, 14, 15,	\
+				    40, 41, 42, 43, 44, 45, 46, 47,	\
+				    24, 25, 26, 27, 28, 29, 30, 31,	\
+				    56, 57, 58, 59, 60, 61, 62, 63);	\
+  }									\
+									\
+  TYPE									\
+  TYPE##_uzp1_s (TYPE x, TYPE y)					\
+  {									\
+    return __builtin_shufflevector (x, y, 0, 1, 2, 3, 8, 9, 10, 11,	\
+				    16, 17, 18, 19, 24, 25, 26, 27,	\
+				    32, 33, 34, 35, 40, 41, 42, 43,	\
+				    48, 49, 50, 51, 56, 57, 58, 59);	\
+  }									\
+									\
+  TYPE									\
+  TYPE##_uzp2_h (TYPE x, TYPE y)					\
+  {									\
+    return __builtin_shufflevector (x, y, 2, 3, 6, 7, 10, 11, 14, 15,	\
+				    18, 19, 22, 23, 26, 27, 30, 31,	\
+				    34, 35, 38, 39, 42, 43, 46, 47,	\
+				    50, 51, 54, 55, 58, 59, 62, 63);	\
+  }
+
+/*
+** vint8_zip1_d:
+**	zip1	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+/*
+** vint8_zip2_s:
+**	zip2	z0\.s, z0\.s, z1\.s
+**	ret
+*/
+/*
+** vint8_trn1_h:
+**	trn1	z0\.h, z0\.h, z1\.h
+**	ret
+*/
+/*
+** vint8_trn2_d:
+**	trn2	z0\.d, z0\.d, z1\.d
+**	ret
+*/
+/*
+** vint8_uzp1_s:
+**	uzp1	z0\.s, z0\.s, z1\.s
+**	ret
+*/
+/*
+** vint8_uzp2_h:
+**	uzp2	z0\.h, z0\.h, z1\.h
+**	ret
+*/
+TESTS (vint8)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/permute_4.c b/gcc/testsuite/gcc.target/aarch64/sve/permute_4.c
new file mode 100644
index 00000000000..a9cad7b49fa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/permute_4.c
@@ -0,0 +1,113 @@ 
+/* { dg-options "-O -msve-vector-bits=256 -fgimple" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+typedef __SVInt8_t vint8 __attribute__((arm_sve_vector_bits(256)));
+typedef __SVBool_t vbool __attribute__((arm_sve_vector_bits(256)));
+
+/*
+** uzp1_h:
+**	uzp1	p0\.h, p0\.h, p1\.h
+**	ret
+*/
+vbool __GIMPLE
+uzp1_h (vbool x, vbool y)
+{
+  vbool z;
+
+  z = __VEC_PERM (x, y, _Literal (vint8)
+		  { 0, 1, 4, 5, 8, 9, 12, 13,
+		    16, 17, 20, 21, 24, 25, 28, 29,
+		    32, 33, 36, 37, 40, 41, 44, 45,
+		    48, 49, 52, 53, 56, 57, 60, 61 });
+  return z;
+}
+
+/*
+** uzp2_s:
+**	uzp2	p0\.s, p0\.s, p1\.s
+**	ret
+*/
+vbool __GIMPLE
+uzp2_s (vbool x, vbool y)
+{
+  vbool z;
+
+  z = __VEC_PERM (x, y, _Literal (vint8)
+		  { 4, 5, 6, 7, 12, 13, 14, 15,
+		    20, 21, 22, 23, 28, 29, 30, 31,
+		    36, 37, 38, 39, 44, 45, 46, 47,
+		    52, 53, 54, 55, 60, 61, 62, 63 });
+  return z;
+}
+
+/*
+** trn1_d:
+**	trn1	p0\.d, p0\.d, p1\.d
+**	ret
+*/
+vbool __GIMPLE
+trn1_d (vbool x, vbool y)
+{
+  vbool z;
+
+  z = __VEC_PERM (x, y, _Literal (vint8)
+		  { 0, 1, 2, 3, 4, 5, 6, 7,
+		    32, 33, 34, 35, 36, 37, 38, 39,
+		    16, 17, 18, 19, 20, 21, 22, 23,
+		    48, 49, 50, 51, 52, 53, 54, 55 });
+  return z;
+}
+
+/*
+** trn2_h:
+**	trn2	p0\.h, p0\.h, p1\.h
+**	ret
+*/
+vbool __GIMPLE
+trn2_h (vbool x, vbool y)
+{
+  vbool z;
+
+  z = __VEC_PERM (x, y, _Literal (vint8)
+		  { 2, 3, 34, 35, 6, 7, 38, 39,
+		    10, 11, 42, 43, 14, 15, 46, 47,
+		    18, 19, 50, 51, 22, 23, 54, 55,
+		    26, 27, 58, 59, 30, 31, 62, 63 });
+  return z;
+}
+
+/*
+** zip1_d:
+**	zip1	p0\.d, p0\.d, p1\.d
+**	ret
+*/
+vbool __GIMPLE
+zip1_d (vbool x, vbool y)
+{
+  vbool z;
+
+  z = __VEC_PERM (x, y, _Literal (vint8)
+		  { 0, 1, 2, 3, 4, 5, 6, 7,
+		    32, 33, 34, 35, 36, 37, 38, 39,
+		    8, 9, 10, 11, 12, 13, 14, 15,
+		    40, 41, 42, 43, 44, 45, 46, 47 });
+  return z;
+}
+
+/*
+** zip2_s:
+**	zip2	p0\.s, p0\.s, p1\.s
+**	ret
+*/
+vbool __GIMPLE
+zip2_s (vbool x, vbool y)
+{
+  vbool z;
+
+  z = __VEC_PERM (x, y, _Literal (vint8)
+		  { 16, 17, 18, 19, 48, 49, 50, 51,
+		    20, 21, 22, 23, 52, 53, 54, 55,
+		    24, 25, 26, 27, 56, 57, 58, 59,
+		    28, 29, 30, 31, 60, 61, 62, 63 });
+  return z;
+}