
[AArch64/ARM,2/3] Recognize shuffle patterns for REV instructions on AArch64, rewrite intrinsics.

Message ID 53581815.8020407@arm.com
State New

Commit Message

Alan Lawrence April 23, 2014, 7:44 p.m. UTC
This patch (borrowing heavily from the ARM backend) makes 
aarch64_expand_vec_perm_const output REV instructions when appropriate,
and then implements the vrev_XXX intrinsics in terms of __builtin_shuffle (which
now produces the same assembly instructions).
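
For illustration only (not part of the patch; the function name is made up,
and it mirrors the new vrev64_s32 implementation below), a whole-lane
reversal written directly with __builtin_shuffle, e.g.

  #include <arm_neon.h>

  int32x2_t
  reverse_pair (int32x2_t x)   /* hypothetical example */
  {
    /* Reverse the two 32-bit lanes within the 64-bit vector.  */
    return __builtin_shuffle (x, (uint32x2_t) { 1, 0 });
  }

is now matched by aarch64_evpc_rev and emitted as a single rev64
instruction rather than falling back to the generic TBL expansion.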

No regressions (and tests in previous patch 
http://gcc.gnu.org/ml/gcc-patches/2014-04/msg01468.html still passing) on 
aarch64-none-elf; also on aarch64_be-none-elf, where there are
no regressions following testsuite config changes in
http://gcc.gnu.org/ml/gcc-patches/2014-04/msg00579.html, but some "noise" (due
to unexpected success in vectorization) without that patch.

gcc/ChangeLog:
2014-04-23  Alan Lawrence  <alan.lawrence@arm.com>

	* config/aarch64/iterators.md: add a REVERSE iterator and rev_op
	attribute for REV64/32/16 insns.
	* config/aarch64/aarch64-simd.md: add corresponding define_insn
	parameterized by REVERSE iterator.
	* config/aarch64/aarch64.c (aarch64_evpc_rev): recognize REVnn patterns.
	(aarch64_expand_vec_perm_const_1): call aarch64_evpc_rev also.
	* config/aarch64/arm_neon.h (vrev{16,32,64}[q]_{s,p,u,f}{8,16,32}): rewrite to
	use __builtin_shuffle.

Comments

Marcus Shawcroft April 24, 2014, 9:55 a.m. UTC | #1
On 23 April 2014 20:44, Alan Lawrence <alan.lawrence@arm.com> wrote:
> This patch (borrowing heavily from the ARM backend) makes
> aarch64_expand_vec_perm_const output REV instructions when appropriate,
> and then implements the vrev_XXX intrinsics in terms of __builtin_shuffle
> (which
> now produces the same assembly instructions).
>
> No regressions (and tests in previous patch
> http://gcc.gnu.org/ml/gcc-patches/2014-04/msg01468.html still passing) on
> aarch64-none-elf; also on aarch64_be-none-elf, where there are
> no regressions following testsuite config changes in
> http://gcc.gnu.org/ml/gcc-patches/2014-04/msg00579.html, but some "noise"
> (due
> to unexpected success in vectorization) without that patch.
>
> gcc/ChangeLog:
> 2014-04-23  Alan Lawrence  <alan.lawrence@arm.com>

Two spaces after the date.

>         * config/aarch64/iterators.md: add a REVERSE iterator and rev_op
>         attribute for REV64/32/16 insns.

ChangeLog entries are sentences; therefore they start with a capital letter.

Identify the new definitions in parentheses.  The ChangeLog entry just
states what changed, not why; any explanation required should be in
code comments or in the submission email.  Therefore:

         * config/aarch64/iterators.md (REVERSE, rev_op): Define.


>         * config/aarch64/aarch64-simd.md: add corresponding define_insn
>         parameterized by REVERSE iterator.

... and this one should read something like:

         * config/aarch64/aarch64-simd.md
           (aarch64_rev<REVERSE:rev_op><mode>): Define.

The remaining entries should all be updated in a similar fashion...
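
For instance (just a sketch, exact wording up to you), the aarch64.c
entries might end up reading:

         * config/aarch64/aarch64.c (aarch64_evpc_rev): New function.
         (aarch64_expand_vec_perm_const_1): Call it.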


> +
> +/* vrev  */
> +
> +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
> +vrev16_p8 (poly8x8_t a)
> +{
> +  return __builtin_shuffle (a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
> +}

This has the effect of reserving the symbol 'a'; use __a instead.  We
have other breakage like this in arm_neon.h which needs fixing, but
that aside we should not be compounding the problem.
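
I.e., something along these lines:

__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
vrev16_p8 (poly8x8_t __a)
{
  /* Same shuffle as before, with the parameter renamed to __a.  */
  return __builtin_shuffle (__a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
}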

Cheers
/Marcus

Patch

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 4dffb59..d499e86 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4032,6 +4032,15 @@ 
   [(set_attr "type" "neon_permute<q>")]
 )
 
+(define_insn "aarch64_rev<REVERSE:rev_op><mode>"
+  [(set (match_operand:VALL 0 "register_operand" "=w")
+	(unspec:VALL [(match_operand:VALL 1 "register_operand" "w")]
+                    REVERSE))]
+  "TARGET_SIMD"
+  "rev<REVERSE:rev_op>\\t%0.<Vtype>, %1.<Vtype>"
+  [(set_attr "type" "neon_rev<q>")]
+)
+
 (define_insn "aarch64_st2<mode>_dreg"
   [(set (match_operand:TI 0 "aarch64_simd_struct_operand" "=Utv")
 	(unspec:TI [(match_operand:OI 1 "register_operand" "w")
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 16c51a8..5bb10a2 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -8047,6 +8047,80 @@  aarch64_evpc_zip (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* Recognize patterns for the REV insns.  */
+
+static bool
+aarch64_evpc_rev (struct expand_vec_perm_d *d)
+{
+  unsigned int i, j, diff, nelt = d->nelt;
+  rtx (*gen) (rtx, rtx);
+
+  if (!d->one_vector_p)
+    return false;
+
+  diff = d->perm[0];
+  switch (diff)
+    {
+    case 7:
+      switch (d->vmode)
+	{
+	case V16QImode: gen = gen_aarch64_rev64v16qi; break;
+	case V8QImode: gen = gen_aarch64_rev64v8qi;  break;
+	default:
+	  return false;
+	}
+      break;
+    case 3:
+      switch (d->vmode)
+	{
+	case V16QImode: gen = gen_aarch64_rev32v16qi; break;
+	case V8QImode: gen = gen_aarch64_rev32v8qi;  break;
+	case V8HImode: gen = gen_aarch64_rev64v8hi;  break;
+	case V4HImode: gen = gen_aarch64_rev64v4hi;  break;
+	default:
+	  return false;
+	}
+      break;
+    case 1:
+      switch (d->vmode)
+	{
+	case V16QImode: gen = gen_aarch64_rev16v16qi; break;
+	case V8QImode: gen = gen_aarch64_rev16v8qi;  break;
+	case V8HImode: gen = gen_aarch64_rev32v8hi;  break;
+	case V4HImode: gen = gen_aarch64_rev32v4hi;  break;
+	case V4SImode: gen = gen_aarch64_rev64v4si;  break;
+	case V2SImode: gen = gen_aarch64_rev64v2si;  break;
+	case V4SFmode: gen = gen_aarch64_rev64v4sf;  break;
+	case V2SFmode: gen = gen_aarch64_rev64v2sf;  break;
+	default:
+	  return false;
+	}
+      break;
+    default:
+      return false;
+    }
+
+  for (i = 0; i < nelt ; i += diff + 1)
+    for (j = 0; j <= diff; j += 1)
+      {
+	/* This is guaranteed to be true as the value of diff
+	   is 7, 3, 1 and we should have enough elements in the
+	   queue to generate this.  Getting a vector mask with a
+	   value of diff other than these values implies that
+	   something is wrong by the time we get here.  */
+	gcc_assert (i + j < nelt);
+	if (d->perm[i + j] != i + diff - j)
+	  return false;
+      }
+
+  /* Success! */
+  if (d->testing_p)
+    return true;
+
+  emit_insn (gen (d->target, d->op0));
+  return true;
+}
+
 static bool
 aarch64_evpc_dup (struct expand_vec_perm_d *d)
 {
@@ -8153,6 +8227,8 @@  aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 	return true;
       else if (aarch64_evpc_trn (d))
 	return true;
+      else if (aarch64_evpc_rev (d))
+        return true;
       else if (aarch64_evpc_dup (d))
 	return true;
       return aarch64_evpc_tbl (d);
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 6af99361..383ed56 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -10628,402 +10628,6 @@  vrecpeq_u32 (uint32x4_t a)
   return result;
 }
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vrev16_p8 (poly8x8_t a)
-{
-  poly8x8_t result;
-  __asm__ ("rev16 %0.8b,%1.8b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vrev16_s8 (int8x8_t a)
-{
-  int8x8_t result;
-  __asm__ ("rev16 %0.8b,%1.8b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vrev16_u8 (uint8x8_t a)
-{
-  uint8x8_t result;
-  __asm__ ("rev16 %0.8b,%1.8b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vrev16q_p8 (poly8x16_t a)
-{
-  poly8x16_t result;
-  __asm__ ("rev16 %0.16b,%1.16b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vrev16q_s8 (int8x16_t a)
-{
-  int8x16_t result;
-  __asm__ ("rev16 %0.16b,%1.16b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vrev16q_u8 (uint8x16_t a)
-{
-  uint8x16_t result;
-  __asm__ ("rev16 %0.16b,%1.16b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vrev32_p8 (poly8x8_t a)
-{
-  poly8x8_t result;
-  __asm__ ("rev32 %0.8b,%1.8b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vrev32_p16 (poly16x4_t a)
-{
-  poly16x4_t result;
-  __asm__ ("rev32 %0.4h,%1.4h"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vrev32_s8 (int8x8_t a)
-{
-  int8x8_t result;
-  __asm__ ("rev32 %0.8b,%1.8b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vrev32_s16 (int16x4_t a)
-{
-  int16x4_t result;
-  __asm__ ("rev32 %0.4h,%1.4h"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vrev32_u8 (uint8x8_t a)
-{
-  uint8x8_t result;
-  __asm__ ("rev32 %0.8b,%1.8b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vrev32_u16 (uint16x4_t a)
-{
-  uint16x4_t result;
-  __asm__ ("rev32 %0.4h,%1.4h"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vrev32q_p8 (poly8x16_t a)
-{
-  poly8x16_t result;
-  __asm__ ("rev32 %0.16b,%1.16b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vrev32q_p16 (poly16x8_t a)
-{
-  poly16x8_t result;
-  __asm__ ("rev32 %0.8h,%1.8h"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vrev32q_s8 (int8x16_t a)
-{
-  int8x16_t result;
-  __asm__ ("rev32 %0.16b,%1.16b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vrev32q_s16 (int16x8_t a)
-{
-  int16x8_t result;
-  __asm__ ("rev32 %0.8h,%1.8h"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vrev32q_u8 (uint8x16_t a)
-{
-  uint8x16_t result;
-  __asm__ ("rev32 %0.16b,%1.16b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vrev32q_u16 (uint16x8_t a)
-{
-  uint16x8_t result;
-  __asm__ ("rev32 %0.8h,%1.8h"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vrev64_f32 (float32x2_t a)
-{
-  float32x2_t result;
-  __asm__ ("rev64 %0.2s,%1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vrev64_p8 (poly8x8_t a)
-{
-  poly8x8_t result;
-  __asm__ ("rev64 %0.8b,%1.8b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vrev64_p16 (poly16x4_t a)
-{
-  poly16x4_t result;
-  __asm__ ("rev64 %0.4h,%1.4h"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vrev64_s8 (int8x8_t a)
-{
-  int8x8_t result;
-  __asm__ ("rev64 %0.8b,%1.8b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vrev64_s16 (int16x4_t a)
-{
-  int16x4_t result;
-  __asm__ ("rev64 %0.4h,%1.4h"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vrev64_s32 (int32x2_t a)
-{
-  int32x2_t result;
-  __asm__ ("rev64 %0.2s,%1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vrev64_u8 (uint8x8_t a)
-{
-  uint8x8_t result;
-  __asm__ ("rev64 %0.8b,%1.8b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vrev64_u16 (uint16x4_t a)
-{
-  uint16x4_t result;
-  __asm__ ("rev64 %0.4h,%1.4h"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vrev64_u32 (uint32x2_t a)
-{
-  uint32x2_t result;
-  __asm__ ("rev64 %0.2s,%1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vrev64q_f32 (float32x4_t a)
-{
-  float32x4_t result;
-  __asm__ ("rev64 %0.4s,%1.4s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vrev64q_p8 (poly8x16_t a)
-{
-  poly8x16_t result;
-  __asm__ ("rev64 %0.16b,%1.16b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vrev64q_p16 (poly16x8_t a)
-{
-  poly16x8_t result;
-  __asm__ ("rev64 %0.8h,%1.8h"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vrev64q_s8 (int8x16_t a)
-{
-  int8x16_t result;
-  __asm__ ("rev64 %0.16b,%1.16b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vrev64q_s16 (int16x8_t a)
-{
-  int16x8_t result;
-  __asm__ ("rev64 %0.8h,%1.8h"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vrev64q_s32 (int32x4_t a)
-{
-  int32x4_t result;
-  __asm__ ("rev64 %0.4s,%1.4s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vrev64q_u8 (uint8x16_t a)
-{
-  uint8x16_t result;
-  __asm__ ("rev64 %0.16b,%1.16b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vrev64q_u16 (uint16x8_t a)
-{
-  uint16x8_t result;
-  __asm__ ("rev64 %0.8h,%1.8h"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vrev64q_u32 (uint32x4_t a)
-{
-  uint32x4_t result;
-  __asm__ ("rev64 %0.4s,%1.4s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
 #define vrshrn_high_n_s16(a, b, c)                                      \
   __extension__                                                         \
     ({                                                                  \
@@ -22473,6 +22077,234 @@  vrecpxd_f64 (float64_t __a)
   return __builtin_aarch64_frecpxdf (__a);
 }
 
+
+/* vrev  */
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vrev16_p8 (poly8x8_t a)
+{
+  return __builtin_shuffle (a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vrev16_s8 (int8x8_t a)
+{
+  return __builtin_shuffle (a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vrev16_u8 (uint8x8_t a)
+{
+  return __builtin_shuffle (a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vrev16q_p8 (poly8x16_t a)
+{
+  return __builtin_shuffle (a,
+      (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 });
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vrev16q_s8 (int8x16_t a)
+{
+  return __builtin_shuffle (a,
+      (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 });
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vrev16q_u8 (uint8x16_t a)
+{
+  return __builtin_shuffle (a,
+      (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 });
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vrev32_p8 (poly8x8_t a)
+{
+  return __builtin_shuffle (a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vrev32_p16 (poly16x4_t a)
+{
+  return __builtin_shuffle (a, (uint16x4_t) { 1, 0, 3, 2 });
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vrev32_s8 (int8x8_t a)
+{
+  return __builtin_shuffle (a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vrev32_s16 (int16x4_t a)
+{
+  return __builtin_shuffle (a, (uint16x4_t) { 1, 0, 3, 2 });
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vrev32_u8 (uint8x8_t a)
+{
+  return __builtin_shuffle (a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vrev32_u16 (uint16x4_t a)
+{
+  return __builtin_shuffle (a, (uint16x4_t) { 1, 0, 3, 2 });
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vrev32q_p8 (poly8x16_t a)
+{
+  return __builtin_shuffle (a,
+      (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 });
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vrev32q_p16 (poly16x8_t a)
+{
+  return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vrev32q_s8 (int8x16_t a)
+{
+  return __builtin_shuffle (a,
+      (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 });
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vrev32q_s16 (int16x8_t a)
+{
+  return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vrev32q_u8 (uint8x16_t a)
+{
+  return __builtin_shuffle (a,
+      (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 });
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vrev32q_u16 (uint16x8_t a)
+{
+  return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vrev64_f32 (float32x2_t a)
+{
+  return __builtin_shuffle (a, (uint32x2_t) { 1, 0 });
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vrev64_p8 (poly8x8_t a)
+{
+  return __builtin_shuffle (a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 });
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vrev64_p16 (poly16x4_t a)
+{
+  return __builtin_shuffle (a, (uint16x4_t) { 3, 2, 1, 0 });
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vrev64_s8 (int8x8_t a)
+{
+  return __builtin_shuffle (a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 });
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vrev64_s16 (int16x4_t a)
+{
+  return __builtin_shuffle (a, (uint16x4_t) { 3, 2, 1, 0 });
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vrev64_s32 (int32x2_t a)
+{
+  return __builtin_shuffle (a, (uint32x2_t) { 1, 0 });
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vrev64_u8 (uint8x8_t a)
+{
+  return __builtin_shuffle (a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 });
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vrev64_u16 (uint16x4_t a)
+{
+  return __builtin_shuffle (a, (uint16x4_t) { 3, 2, 1, 0 });
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vrev64_u32 (uint32x2_t a)
+{
+  return __builtin_shuffle (a, (uint32x2_t) { 1, 0 });
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vrev64q_f32 (float32x4_t a)
+{
+  return __builtin_shuffle (a, (uint32x4_t) { 1, 0, 3, 2 });
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vrev64q_p8 (poly8x16_t a)
+{
+  return __builtin_shuffle (a,
+      (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 });
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vrev64q_p16 (poly16x8_t a)
+{
+  return __builtin_shuffle (a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vrev64q_s8 (int8x16_t a)
+{
+  return __builtin_shuffle (a,
+      (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 });
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vrev64q_s16 (int16x8_t a)
+{
+  return __builtin_shuffle (a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vrev64q_s32 (int32x4_t a)
+{
+  return __builtin_shuffle (a, (uint32x4_t) { 1, 0, 3, 2 });
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vrev64q_u8 (uint8x16_t a)
+{
+  return __builtin_shuffle (a,
+      (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 });
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vrev64q_u16 (uint16x8_t a)
+{
+  return __builtin_shuffle (a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vrev64q_u32 (uint32x4_t a)
+{
+  return __builtin_shuffle (a, (uint32x4_t) { 1, 0, 3, 2 });
+}
+
 /* vrnd  */
 
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index f1339b8..c1f5544 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -267,6 +267,9 @@ 
     UNSPEC_UZP2		; Used in vector permute patterns.
     UNSPEC_TRN1		; Used in vector permute patterns.
     UNSPEC_TRN2		; Used in vector permute patterns.
+    UNSPEC_REV64        ; Used in vector reverse patterns (permute).
+    UNSPEC_REV32        ; Used in vector reverse patterns (permute).
+    UNSPEC_REV16        ; Used in vector reverse patterns (permute).
     UNSPEC_AESE		; Used in aarch64-simd.md.
     UNSPEC_AESD         ; Used in aarch64-simd.md.
     UNSPEC_AESMC        ; Used in aarch64-simd.md.
@@ -855,6 +858,8 @@ 
 			      UNSPEC_TRN1 UNSPEC_TRN2
 			      UNSPEC_UZP1 UNSPEC_UZP2])
 
+(define_int_iterator REVERSE [UNSPEC_REV64 UNSPEC_REV32 UNSPEC_REV16])
+
 (define_int_iterator FRINT [UNSPEC_FRINTZ UNSPEC_FRINTP UNSPEC_FRINTM
 			     UNSPEC_FRINTN UNSPEC_FRINTI UNSPEC_FRINTX
 			     UNSPEC_FRINTA])
@@ -982,6 +987,10 @@ 
 			    (UNSPEC_TRN1 "trn") (UNSPEC_TRN2 "trn")
 			    (UNSPEC_UZP1 "uzp") (UNSPEC_UZP2 "uzp")])
 
+; op code for REV instructions (size within which elements are reversed).
+(define_int_attr rev_op [(UNSPEC_REV64 "64") (UNSPEC_REV32 "32")
+			 (UNSPEC_REV16 "16")])
+
 (define_int_attr perm_hilo [(UNSPEC_ZIP1 "1") (UNSPEC_ZIP2 "2")
 			    (UNSPEC_TRN1 "1") (UNSPEC_TRN2 "2")
 			    (UNSPEC_UZP1 "1") (UNSPEC_UZP2 "2")])