
[2/2,AArch64] Add missing support for poly64x1_t

Message ID VI1PR0801MB2031BF90D204E9F28C2A8402FFFE0@VI1PR0801MB2031.eurprd08.prod.outlook.com
State New

Commit Message

Tamar Christina Sept. 13, 2016, 12:54 p.m. UTC
Hi all,

This patch adds the following NEON intrinsics to the GCC AArch64 back end
(and fixes https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72758):

* vsli_n_p64
* vsliq_n_p64

* vld1_p64
* vld1q_p64
* vld1_dup_p64
* vld1q_dup_p64

* vst1_p64
* vst1q_p64
  
* vld2_p64
* vld3_p64
* vld4_p64
* vld2q_p64
* vld3q_p64
* vld4q_p64

* vld2_dup_p64
* vld3_dup_p64
* vld4_dup_p64

* __aarch64_vdup_lane_p64
* __aarch64_vdup_laneq_p64
* __aarch64_vdupq_lane_p64
* __aarch64_vdupq_laneq_p64

* vget_lane_p64
* vgetq_lane_p64

* vreinterpret_p8_p64
* vreinterpretq_p8_p64
* vreinterpret_p16_p64
* vreinterpretq_p16_p64

* vreinterpret_p64_f16
* vreinterpret_p64_f64
* vreinterpret_p64_s8
* vreinterpret_p64_s16
* vreinterpret_p64_s32
* vreinterpret_p64_s64
* vreinterpret_p64_f32
* vreinterpret_p64_u8
* vreinterpret_p64_u16
* vreinterpret_p64_u32
* vreinterpret_p64_u64
* vreinterpret_p64_p8

* vreinterpretq_p64_f64
* vreinterpretq_p64_s8
* vreinterpretq_p64_s16
* vreinterpretq_p64_s32
* vreinterpretq_p64_s64
* vreinterpretq_p64_f16
* vreinterpretq_p64_f32
* vreinterpretq_p64_u8
* vreinterpretq_p64_u16
* vreinterpretq_p64_u32
* vreinterpretq_p64_u64
* vreinterpretq_p64_p8

* vreinterpret_f16_p64
* vreinterpretq_f16_p64
* vreinterpret_f32_p64
* vreinterpretq_f32_p64
* vreinterpret_f64_p64
* vreinterpretq_f64_p64
* vreinterpret_s64_p64
* vreinterpretq_s64_p64
* vreinterpret_u64_p64
* vreinterpretq_u64_p64
* vreinterpret_s8_p64
* vreinterpretq_s8_p64
* vreinterpret_s16_p64
* vreinterpret_s32_p64
* vreinterpretq_s32_p64
* vreinterpret_u8_p64
* vreinterpret_u16_p64
* vreinterpretq_u16_p64
* vreinterpret_u32_p64
* vreinterpretq_u32_p64

* vset_lane_p64
* vsetq_lane_p64

* vget_low_p64
* vget_high_p64

* vcombine_p64
* vcreate_p64

* vst2_lane_p64
* vst3_lane_p64
* vst4_lane_p64
* vst2q_lane_p64
* vst3q_lane_p64
* vst4q_lane_p64

* vget_lane_p64
* vget_laneq_p64
* vset_lane_p64
* vset_laneq_p64

* vcopy_lane_p64
* vcopy_laneq_p64  

* vdup_n_p64
* vdupq_n_p64
* vdup_lane_p64
* vdup_laneq_p64

* vld1_p64
* vld1q_p64
* vld1_dup_p64
* vld1q_dup_p64
* vmov_n_p64
* vmovq_n_p64
* vst3q_p64
* vst4q_p64

* vld1_lane_p64
* vld1q_lane_p64
* vst1_lane_p64
* vst1q_lane_p64
* vcopy_laneq_p64
* vcopyq_laneq_p64
* vdupq_laneq_p64

Added new tests for these and ran regression tests on aarch64-none-linux-gnu.
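
For illustration only (this snippet is not part of the patch or of the new
tests, and the helper name "example" is made up), the new intrinsics let
poly64 data be built up and moved around like the other element types:

    #include <arm_neon.h>

    /* Combine two poly64 scalars into a 128-bit vector and read a lane
       back.  Assumes an AArch64 target; on the 32-bit ARM port the poly64
       types are guarded by __ARM_FEATURE_CRYPTO.  */
    poly64_t
    example (uint64_t a, uint64_t b)
    {
      poly64x1_t lo = vcreate_p64 (a);        /* uint64_t -> poly64x1_t  */
      poly64x1_t hi = vdup_n_p64 ((poly64_t) b);
      poly64x2_t v  = vcombine_p64 (lo, hi);  /* -> poly64x2_t           */
      return vgetq_lane_p64 (v, 1);           /* read the high lane      */
    }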

Ok for trunk?

Thanks,
Tamar

gcc/
2016-08-17  Tamar Christina  <tamar.christina@arm.com>

	* config/aarch64/aarch64-builtins.c (TYPES_SETREGP): Added poly type.
	(TYPES_GETREGP): Likewise.
	(TYPES_SHIFTINSERTP): Likewise.
	(TYPES_COMBINEP): Likewise.
	(TYPES_STORE1P): Likewise.
	* config/aarch64/aarch64-simd-builtins.def
	(combine): Added poly generator.
	(get_dregoi): Likewise.
	(get_dregci): Likewise.
	(get_dregxi): Likewise.
	(ssli_n): Likewise.
	(ld1): Likewise.
	(st1): Likewise.
	* config/aarch64/aarch64-simd.md
	(aarch64_ld1<VALL_F16:mode>): Rename to...
	(aarch64_ld1<VALLDI_F16:mode>): ...this.
	(aarch64_get_dreg<VSTRUCT:mode><VDC:mode>): Rename to...
	(aarch64_get_dreg<VSTRUCT:mode><VDC_PS:mode>): ...this.
	(aarch64_set_qreg<VSTRUCT:mode><VQ:mode>): Rename to...
	(aarch64_set_qreg<VSTRUCT:mode><VQ_DI:mode>): ...this.
	(aarch64_set_qpreg<VSTRUCT:mode><PS:mode>): New.
	* config/aarch64/arm_neon.h
	(poly64x1x2_t, poly64x1x3_t): New.
	(poly64x1x4_t, poly64x2x2_t): Likewise.
	(poly64x2x3_t, poly64x2x4_t): Likewise.
	(poly64x1_t): Likewise.
	(vcreate_p64, vcombine_p64): Likewise.
	(vdup_n_p64, vdupq_n_p64): Likewise.
	(vld2_p64, vld2q_p64): Likewise.
	(vld3_p64, vld3q_p64): Likewise.
	(vld4_p64, vld4q_p64): Likewise.
	(vld2_dup_p64, vld3_dup_p64): Likewise.
	(vld4_dup_p64, vsli_n_p64): Likewise.
	(vsliq_n_p64, vst1_p64): Likewise.
	(vst1q_p64, vst2_p64): Likewise.
	(vst3_p64, vst4_p64): Likewise.
	(__aarch64_vdup_lane_p64, __aarch64_vdup_laneq_p64): Likewise.
	(__aarch64_vdupq_lane_p64, __aarch64_vdupq_laneq_p64): Likewise.
	(vget_lane_p64, vgetq_lane_p64): Likewise.
	(vreinterpret_p8_p64, vreinterpretq_p8_p64): Likewise.
	(vreinterpret_p16_p64, vreinterpretq_p16_p64): Likewise.
	(vreinterpret_p64_f16, vreinterpret_p64_f64): Likewise.
	(vreinterpret_p64_s8, vreinterpret_p64_s16): Likewise.
	(vreinterpret_p64_s32, vreinterpret_p64_s64): Likewise.
	(vreinterpret_p64_f32, vreinterpret_p64_u8): Likewise.
	(vreinterpret_p64_u16, vreinterpret_p64_u32): Likewise.
	(vreinterpret_p64_u64, vreinterpret_p64_p8): Likewise.
	(vreinterpretq_p64_f64, vreinterpretq_p64_s8): Likewise.
	(vreinterpretq_p64_s16, vreinterpretq_p64_s32): Likewise.
	(vreinterpretq_p64_s64, vreinterpretq_p64_f16): Likewise.
	(vreinterpretq_p64_f32, vreinterpretq_p64_u8): Likewise.
	(vreinterpretq_p64_u16, vreinterpretq_p64_u32): Likewise.
	(vreinterpretq_p64_u64, vreinterpretq_p64_p8): Likewise.
	(vreinterpret_f16_p64, vreinterpretq_f16_p64): Likewise.
	(vreinterpret_f32_p64, vreinterpretq_f32_p64): Likewise.
	(vreinterpret_f64_p64, vreinterpretq_f64_p64): Likewise.
	(vreinterpret_s64_p64, vreinterpretq_s64_p64): Likewise.
	(vreinterpret_u64_p64, vreinterpretq_u64_p64): Likewise.
	(vreinterpret_s8_p64, vreinterpretq_s8_p64): Likewise.
	(vreinterpret_s16_p64, vreinterpret_s32_p64): Likewise.
	(vreinterpretq_s32_p64, vreinterpret_u8_p64): Likewise.
	(vreinterpret_u16_p64, vreinterpretq_u16_p64): Likewise.
	(vreinterpret_u32_p64, vreinterpretq_u32_p64): Likewise.
	(vset_lane_p64, vsetq_lane_p64): Likewise.
	(vget_low_p64, vget_high_p64): Likewise.
	(vcombine_p64, vst2_lane_p64): Likewise.
	(vst3_lane_p64, vst4_lane_p64): Likewise.
	(vst2q_lane_p64, vst3q_lane_p64): Likewise.
	(vst4q_lane_p64, vget_lane_p64): Likewise.
	(vget_laneq_p64, vset_lane_p64): Likewise.
	(vset_laneq_p64, vcopy_lane_p64): Likewise.
	(vcopy_laneq_p64, vdup_n_p64): Likewise.
	(vdupq_n_p64, vdup_lane_p64): Likewise.
	(vdup_laneq_p64, vld1_p64): Likewise.
	(vld1q_p64, vld1_dup_p64): Likewise.
	(vld1q_dup_p64): Likewise.
	(vmov_n_p64, vmovq_n_p64): Likewise.
	(vst3q_p64, vst4q_p64): Likewise.
	(vld1_lane_p64, vld1q_lane_p64): Likewise.
	(vst1_lane_p64, vst1q_lane_p64): Likewise.
	(vcopy_laneq_p64, vcopyq_laneq_p64): Likewise.
	(vdupq_laneq_p64): Likewise.
	* config/aarch64/iterators.md
	(VQ_DI): Added DI to VQ.

gcc/testsuite/
2016-08-17  Tamar Christina  <tamar.christina@arm.com>

	* gcc.target/aarch64/advsimd-intrinsics/p64.c: New.
	* gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h (Poly64x1_t): Added type.
	(Poly64x2_t): Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vcombine.c: Added test for Poly64.
	* gcc.target/aarch64/advsimd-intrinsics/vcreate.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vdup_lane.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vget_high.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vget_lane.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vget_low.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vldX.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vldX_dup.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vldX_lane.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vstX_lane.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vst1_lane.c: Likewise.
	* gcc.target/aarch64/advsimd-intrinsics/vld1.c: Likewise.

Comments

James Greenhalgh Sept. 17, 2016, 6:20 a.m. UTC | #1
On Tue, Sep 13, 2016 at 01:54:15PM +0100, Tamar Christina wrote:
> Hi all,
> 
> This patch adds the following NEON intrinsics to the GCC AArch64 back end
> (and fixes https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72758):
> 

<snip>

> Added new tests for these and ran regression tests on aarch64-none-linux-gnu.
> 
> Ok for trunk?

Hi Tamar,

Thanks for the patch! If I'm not mistaken, this also fixes PR target/58693.

The convention when fixing a bugzilla is to place a line in your ChangeLog
which looks like:

	PR component/xxxxx

This then gets picked up by a little robot which scrapes the commits, and
updates bugzilla with a link to the commit that fixes the bug. See
gcc/ChangeLog for examples.
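
For instance, applied to this patch the gcc/ ChangeLog header would look
something like this (assuming PR 72758 is filed under the target component):

    2016-08-17  Tamar Christina  <tamar.christina@arm.com>

            PR target/72758
            * config/aarch64/aarch64-builtins.c (TYPES_SETREGP): Added poly type.
            ...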

This patch looks OK to me functionally, but I'd appreciate a second set
of eyes on the testsuite changes. I've CC'ed Christophe Lyon who wrote
the testsuite. Do remember that these testcases are shared with the ARM
port, so it is appropriate to test an ARM build against your changes. In
this case, I see your patch introduces some errors when testing the ARM
target. Presumably the check against __ARM_FEATURE_CRYPTO is not strong
enough to ensure that the poly64_t types are available.

    FAIL: gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c   -O0  (test for excess errors)
    Excess errors:
    .../gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c:171:5: warning: implicit declaration of function 'vmov_n_p64'; did you mean 'vmov_n_u64'? [-Wimplicit-function-declaration]
    .../gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c:171:5: warning: implicit declaration of function 'vmovq_n_p64'; did you mean 'vmovq_n_u64'? [-Wimplicit-function-declaration]
    .../gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c:170:30: error: incompatible types when assigning to type 'poly64x2_t {aka __vector(2) long long unsigned int}' from type 'int'

    FAIL: gcc.target/aarch64/advsimd-intrinsics/vget_lane.c   -O0  (test for excess errors)
    FAIL: gcc.target/aarch64/advsimd-intrinsics/vldX.c   -O0  (test for excess errors)
    FAIL: gcc.target/aarch64/advsimd-intrinsics/vldX_lane.c   -O0  (test for excess errors)
    FAIL: gcc.target/aarch64/advsimd-intrinsics/vreinterpret_p128.c   -O0  (test for excess errors)
    FAIL: gcc.target/aarch64/advsimd-intrinsics/vreinterpret_p64.c   -O0  (test for excess errors)
    FAIL: gcc.target/aarch64/advsimd-intrinsics/vstX_lane.c   -O0  (test for excess errors)

Could you take a look at this?

Thanks,
James
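
(A minimal sketch of the guard issue, for illustration only; the helper name
"guard_example" is made up.  With crypto enabled the 32-bit ARM arm_neon.h
does define the poly64 vector types, as the error log above shows, so an
__ARM_FEATURE_CRYPTO check alone does not guarantee that every poly64
intrinsic used by the shared tests exists on that port.)

    #include <arm_neon.h>

    void
    guard_example (void)
    {
    #if defined (__ARM_FEATURE_CRYPTO)
      /* poly64x1_t is available on both ports when crypto is enabled, but
         vmov_n_p64 is (with this patch) only provided by the AArch64
         header, hence the implicit-declaration warnings when the shared
         tests are built for ARM.  */
      poly64x1_t v = vmov_n_p64 (0);
      (void) v;
    #endif
    }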
Tamar Christina Sept. 19, 2016, 8:39 a.m. UTC | #2
On 17/09/16 07:20, James Greenhalgh wrote:


Hi James,

> The convention when fixing a bugzilla is to place a line in your ChangeLog
> which looks like:
>
> 	PR component/xxxxx
Oh, I wasn't aware of this, thanks! I'll add it.
>      FAIL: gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c   -O0  (test for excess errors)
>      Excess errors:
>      .../gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c:171:5: warning: implicit declaration of function 'vmov_n_p64'; did you mean 'vmov_n_u64'? [-Wimplicit-function

It looks like ARM is missing some of the poly64 functions. I can add 
these since they don't seem to be too many.

Regards,
Tamar
Christophe Lyon Sept. 19, 2016, 1:22 p.m. UTC | #3
Hi,


On 19 September 2016 at 10:39, Tamar Christina <tamar.christina@arm.com> wrote:
>
>
> On 17/09/16 07:20, James Greenhalgh wrote:
>
>
> Hi James,
>
>> The convention when fixing a bugzilla is to place a line in your ChangeLog
>> which looks like:
>>
>>         PR component/xxxxx
>
> Oh, I wasn't aware of this, thanks! I'll add it.
>>
>>      FAIL: gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c   -O0  (test
>> for excess errors)
>>      Excess errors:
>>
>> .../gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c:171:5:
>> warning: implicit declaration of function 'vmov_n_p64'; did you mean
>> 'vmov_n_u64'? [-Wimplicit-function
>
>
> It looks like ARM is missing some of the poly64 functions. I can add these
> since they don't seem to be too many.
>
I had opened https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71233 to track missing
poly64 intrinsics for the ARM target. Not sure why vmov_n_p64 isn't in the list?

> Regards,
> Tamar
>
James Greenhalgh Sept. 19, 2016, 1:30 p.m. UTC | #4
On Mon, Sep 19, 2016 at 03:22:39PM +0200, Christophe Lyon wrote:
> Hi,
> 
> 
> On 19 September 2016 at 10:39, Tamar Christina <tamar.christina@arm.com> wrote:
> >
> >
> > On 17/09/16 07:20, James Greenhalgh wrote:
> >
> >
> > Hi James,
> >
> >> The convention when fixing a bugzilla is to place a line in your ChangeLog
> >> which looks like:
> >>
> >>         PR component/xxxxx
> >
> > Oh, I wasn't aware of this, thanks! I'll add it.
> >>
> >>      FAIL: gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c   -O0  (test
> >> for excess errors)
> >>      Excess errors:
> >>
> >> .../gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c:171:5:
> >> warning: implicit declaration of function 'vmov_n_p64'; did you mean
> >> 'vmov_n_u64'? [-Wimplicit-function
> >
> >
> > It looks like ARM is missing some of the poly64 functions. I can add these
> > since they don't seem to be too many.
> >
> I had opened https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71233 to track missing
> poly64 intrinsics for the ARM target. Not sure why vmov_n_p64 isn't in the list?

Now you mention it, I don't see vmov_n_p64 in the list of intrinsics at:

  http://infocenter.arm.com/help/topic/com.arm.doc.ihi0073a/IHI0073A_arm_neon_intrinsics_ref.pdf

So I wonder whether this patch would actually add some extra intrinsics
that we don't require?

Thanks,
James
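
(The diff below is truncated just as the poly64 variant of vmov_n is being
defined; judging from the vmov_n_p16 wrapper visible in the same hunk, it
would presumably just forward to vdup_n_p64.  A sketch only, not the patch's
actual text:)

    __extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
    vmov_n_p64 (poly64_t __a)
    {
      return vdup_n_p64 (__a);
    }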

Patch

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 9136910cd324a391de929ea9d1a13419dbcfb8bc..1356d5a934176c939c0f75d95f2aaf05b0ff9645 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -169,6 +169,10 @@  aarch64_types_quadop_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 #define TYPES_QUADOP_LANE (aarch64_types_quadop_lane_qualifiers)
 
 static enum aarch64_type_qualifiers
+aarch64_types_binop_imm_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+  = { qualifier_poly, qualifier_none, qualifier_immediate };
+#define TYPES_GETREGP (aarch64_types_binop_imm_p_qualifiers)
+static enum aarch64_type_qualifiers
 aarch64_types_binop_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_none, qualifier_none, qualifier_immediate };
 #define TYPES_GETREG (aarch64_types_binop_imm_qualifiers)
@@ -187,11 +191,20 @@  aarch64_types_unsigned_shift_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 #define TYPES_USHIFTIMM (aarch64_types_unsigned_shift_qualifiers)
 
 static enum aarch64_type_qualifiers
-aarch64_types_ternop_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
-  = { qualifier_none, qualifier_none, qualifier_none, qualifier_immediate };
-#define TYPES_SETREG (aarch64_types_ternop_imm_qualifiers)
-#define TYPES_SHIFTINSERT (aarch64_types_ternop_imm_qualifiers)
-#define TYPES_SHIFTACC (aarch64_types_ternop_imm_qualifiers)
+aarch64_types_ternop_s_imm_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+  = { qualifier_none, qualifier_none, qualifier_poly, qualifier_immediate};
+#define TYPES_SETREGP (aarch64_types_ternop_s_imm_p_qualifiers)
+static enum aarch64_type_qualifiers
+aarch64_types_ternop_s_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+  = { qualifier_none, qualifier_none, qualifier_none, qualifier_immediate};
+#define TYPES_SETREG (aarch64_types_ternop_s_imm_qualifiers)
+#define TYPES_SHIFTINSERT (aarch64_types_ternop_s_imm_qualifiers)
+#define TYPES_SHIFTACC (aarch64_types_ternop_s_imm_qualifiers)
+
+static enum aarch64_type_qualifiers
+aarch64_types_ternop_p_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+  = { qualifier_poly, qualifier_poly, qualifier_poly, qualifier_immediate};
+#define TYPES_SHIFTINSERTP (aarch64_types_ternop_p_imm_qualifiers)
 
 static enum aarch64_type_qualifiers
 aarch64_types_unsigned_shiftacc_qualifiers[SIMD_MAX_BUILTIN_ARGS]
@@ -206,6 +219,11 @@  aarch64_types_combine_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 #define TYPES_COMBINE (aarch64_types_combine_qualifiers)
 
 static enum aarch64_type_qualifiers
+aarch64_types_combine_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+  = { qualifier_poly, qualifier_poly, qualifier_poly };
+#define TYPES_COMBINEP (aarch64_types_combine_p_qualifiers)
+
+static enum aarch64_type_qualifiers
 aarch64_types_load1_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_none, qualifier_const_pointer_map_mode };
 #define TYPES_LOAD1 (aarch64_types_load1_qualifiers)
@@ -238,6 +256,10 @@  aarch64_types_bsl_u_qualifiers[SIMD_MAX_BUILTIN_ARGS]
    qualifier_map_mode | qualifier_pointer to build a pointer to the
    element type of the vector.  */
 static enum aarch64_type_qualifiers
+aarch64_types_store1_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+  = { qualifier_void, qualifier_pointer_map_mode, qualifier_poly };
+#define TYPES_STORE1P (aarch64_types_store1_p_qualifiers)
+static enum aarch64_type_qualifiers
 aarch64_types_store1_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_void, qualifier_pointer_map_mode, qualifier_none };
 #define TYPES_STORE1 (aarch64_types_store1_qualifiers)
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index e1154b4b27820c0075d9a9edb4f8b48ef4f06b07..bc8a85dcf03cc5e52891ae4300ec721e7a533b9b 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -40,6 +40,7 @@ 
    10 - CODE_FOR_<name><mode>.  */
 
   BUILTIN_VDC (COMBINE, combine, 0)
+  VAR1 (COMBINEP, combine, 0, di)
   BUILTIN_VB (BINOP, pmul, 0)
   BUILTIN_VHSDF_HSDF (BINOP, fmulx, 0)
   BUILTIN_VHSDF_DF (UNOP, sqrt, 2)
@@ -68,14 +69,23 @@ 
   BUILTIN_VDC (GETREG, get_dregoi, 0)
   BUILTIN_VDC (GETREG, get_dregci, 0)
   BUILTIN_VDC (GETREG, get_dregxi, 0)
+  VAR1 (GETREGP, get_dregoi, 0, di)
+  VAR1 (GETREGP, get_dregci, 0, di)
+  VAR1 (GETREGP, get_dregxi, 0, di)
   /* Implemented by aarch64_get_qreg<VSTRUCT:mode><VQ:mode>.  */
   BUILTIN_VQ (GETREG, get_qregoi, 0)
   BUILTIN_VQ (GETREG, get_qregci, 0)
   BUILTIN_VQ (GETREG, get_qregxi, 0)
+  VAR1 (GETREGP, get_qregoi, 0, v2di)
+  VAR1 (GETREGP, get_qregci, 0, v2di)
+  VAR1 (GETREGP, get_qregxi, 0, v2di)
   /* Implemented by aarch64_set_qreg<VSTRUCT:mode><VQ:mode>.  */
   BUILTIN_VQ (SETREG, set_qregoi, 0)
   BUILTIN_VQ (SETREG, set_qregci, 0)
   BUILTIN_VQ (SETREG, set_qregxi, 0)
+  VAR1 (SETREGP, set_qregoi, 0, v2di)
+  VAR1 (SETREGP, set_qregci, 0, v2di)
+  VAR1 (SETREGP, set_qregxi, 0, v2di)
   /* Implemented by aarch64_ld<VSTRUCT:nregs><VDC:mode>.  */
   BUILTIN_VDC (LOADSTRUCT, ld2, 0)
   BUILTIN_VDC (LOADSTRUCT, ld3, 0)
@@ -224,6 +234,7 @@ 
   BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssri_n, 0)
   BUILTIN_VSDQ_I_DI (USHIFTACC, usri_n, 0)
   BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssli_n, 0)
+  VAR2 (SHIFTINSERTP, ssli_n, 0, di, v2di)
   BUILTIN_VSDQ_I_DI (USHIFTACC, usli_n, 0)
   /* Implemented by aarch64_<sur>qshl<u>_n<mode>.  */
   BUILTIN_VSDQ_I (SHIFTIMM_USS, sqshlu_n, 0)
@@ -416,9 +427,11 @@ 
 
   /* Implemented by aarch64_ld1<VALL_F16:mode>.  */
   BUILTIN_VALL_F16 (LOAD1, ld1, 0)
+  VAR1(STORE1P, ld1, 0, v2di)
 
   /* Implemented by aarch64_st1<VALL_F16:mode>.  */
   BUILTIN_VALL_F16 (STORE1, st1, 0)
+  VAR1(STORE1P, st1, 0, v2di)
 
   /* Implemented by fma<mode>4.  */
   BUILTIN_VHSDF (TERNOP, fma, 4)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 70140744bb5a7b6dded304c16291cfdce4375c10..3cceb5132c3362f0b8edddee1836efdf19caec81 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -5115,7 +5115,8 @@ 
   rtx temp = gen_reg_rtx (<VDC:VDBL>mode);
   int offset = part * 16;
 
-  emit_move_insn (temp, gen_rtx_SUBREG (<VDC:VDBL>mode, operands[1], offset));
+  emit_move_insn (temp,
+		  gen_rtx_SUBREG (<VDC:VDBL>mode, operands[1], offset));
   emit_move_insn (operands[0], gen_lowpart (<VDC:MODE>mode, temp));
   DONE;
 })
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index d0e1fe7e1b113c2d9f2b7966c9b3bd6956e4152f..1bf0b15b92535258a2e87396d1c8056716ff34f8 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -58,6 +58,7 @@  typedef __Float64x2_t float64x2_t;
 typedef __Poly8x16_t poly8x16_t;
 typedef __Poly16x8_t poly16x8_t;
 typedef __Poly64x2_t poly64x2_t;
+typedef __Poly64x1_t poly64x1_t;
 typedef __Uint8x16_t uint8x16_t;
 typedef __Uint16x8_t uint16x8_t;
 typedef __Uint32x4_t uint32x4_t;
@@ -202,6 +203,36 @@  typedef struct poly16x8x2_t
   poly16x8_t val[2];
 } poly16x8x2_t;
 
+typedef struct poly64x1x2_t
+{
+  poly64x1_t val[2];
+} poly64x1x2_t;
+
+typedef struct poly64x1x3_t
+{
+  poly64x1_t val[3];
+} poly64x1x3_t;
+
+typedef struct poly64x1x4_t
+{
+  poly64x1_t val[4];
+} poly64x1x4_t;
+
+typedef struct poly64x2x2_t
+{
+  poly64x2_t val[2];
+} poly64x2x2_t;
+
+typedef struct poly64x2x3_t
+{
+  poly64x2_t val[3];
+} poly64x2x3_t;
+
+typedef struct poly64x2x4_t
+{
+  poly64x2_t val[4];
+} poly64x2x4_t;
+
 typedef struct int8x8x3_t
 {
   int8x8_t val[3];
@@ -476,6 +507,8 @@  typedef struct poly16x8x4_t
    __aarch64_vdup_lane_any (p8, , __a, __b)
 #define __aarch64_vdup_lane_p16(__a, __b) \
    __aarch64_vdup_lane_any (p16, , __a, __b)
+#define __aarch64_vdup_lane_p64(__a, __b) \
+   __aarch64_vdup_lane_any (p64, , __a, __b)
 #define __aarch64_vdup_lane_s8(__a, __b) \
    __aarch64_vdup_lane_any (s8, , __a, __b)
 #define __aarch64_vdup_lane_s16(__a, __b) \
@@ -504,6 +537,8 @@  typedef struct poly16x8x4_t
    __aarch64_vdup_lane_any (p8, , __a, __b)
 #define __aarch64_vdup_laneq_p16(__a, __b) \
    __aarch64_vdup_lane_any (p16, , __a, __b)
+#define __aarch64_vdup_laneq_p64(__a, __b) \
+   __aarch64_vdup_lane_any (p64, , __a, __b)
 #define __aarch64_vdup_laneq_s8(__a, __b) \
    __aarch64_vdup_lane_any (s8, , __a, __b)
 #define __aarch64_vdup_laneq_s16(__a, __b) \
@@ -532,6 +567,8 @@  typedef struct poly16x8x4_t
    __aarch64_vdup_lane_any (p8, q, __a, __b)
 #define __aarch64_vdupq_lane_p16(__a, __b) \
    __aarch64_vdup_lane_any (p16, q, __a, __b)
+#define __aarch64_vdupq_lane_p64(__a, __b) \
+   __aarch64_vdup_lane_any (p64, q, __a, __b)
 #define __aarch64_vdupq_lane_s8(__a, __b) \
    __aarch64_vdup_lane_any (s8, q, __a, __b)
 #define __aarch64_vdupq_lane_s16(__a, __b) \
@@ -560,6 +597,8 @@  typedef struct poly16x8x4_t
    __aarch64_vdup_lane_any (p8, q, __a, __b)
 #define __aarch64_vdupq_laneq_p16(__a, __b) \
    __aarch64_vdup_lane_any (p16, q, __a, __b)
+#define __aarch64_vdupq_laneq_p64(__a, __b) \
+   __aarch64_vdup_lane_any (p64, q, __a, __b)
 #define __aarch64_vdupq_laneq_s8(__a, __b) \
    __aarch64_vdup_lane_any (s8, q, __a, __b)
 #define __aarch64_vdupq_laneq_s16(__a, __b) \
@@ -2735,6 +2774,12 @@  vcreate_p16 (uint64_t __a)
   return (poly16x4_t) __a;
 }
 
+__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+vcreate_p64 (uint64_t __a)
+{
+  return (poly64x1_t) __a;
+}
+
 /* vget_lane  */
 
 __extension__ static __inline float16_t __attribute__ ((__always_inline__))
@@ -2767,6 +2812,12 @@  vget_lane_p16 (poly16x4_t __a, const int __b)
   return __aarch64_vget_lane_any (__a, __b);
 }
 
+__extension__ static __inline poly64_t __attribute__ ((__always_inline__))
+vget_lane_p64 (poly64x1_t __a, const int __b)
+{
+  return __aarch64_vget_lane_any (__a, __b);
+}
+
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vget_lane_s8 (int8x8_t __a, const int __b)
 {
@@ -2847,6 +2898,12 @@  vgetq_lane_p16 (poly16x8_t __a, const int __b)
   return __aarch64_vget_lane_any (__a, __b);
 }
 
+__extension__ static __inline poly64_t __attribute__ ((__always_inline__))
+vgetq_lane_p64 (poly64x2_t __a, const int __b)
+{
+  return __aarch64_vget_lane_any (__a, __b);
+}
+
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vgetq_lane_s8 (int8x16_t __a, const int __b)
 {
@@ -2969,6 +3026,12 @@  vreinterpret_p8_p16 (poly16x4_t __a)
   return (poly8x8_t) __a;
 }
 
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vreinterpret_p8_p64 (poly64x1_t __a)
+{
+  return (poly8x8_t) __a;
+}
+
 __extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
 vreinterpretq_p8_f64 (float64x2_t __a)
 {
@@ -3041,6 +3104,12 @@  vreinterpretq_p8_p16 (poly16x8_t __a)
   return (poly8x16_t) __a;
 }
 
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_p8_p64 (poly64x2_t __a)
+{
+  return (poly8x16_t) __a;
+}
+
 __extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
 vreinterpret_p16_f16 (float16x4_t __a)
 {
@@ -3113,6 +3182,12 @@  vreinterpret_p16_p8 (poly8x8_t __a)
   return (poly16x4_t) __a;
 }
 
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vreinterpret_p16_p64 (poly64x1_t __a)
+{
+  return (poly16x4_t) __a;
+}
+
 __extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
 vreinterpretq_p16_f64 (float64x2_t __a)
 {
@@ -3185,6 +3260,156 @@  vreinterpretq_p16_p8 (poly8x16_t __a)
   return (poly16x8_t) __a;
 }
 
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_p16_p64 (poly64x2_t __a)
+{
+  return (poly16x8_t) __a;
+}
+
+__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+vreinterpret_p64_f16 (float16x4_t __a)
+{
+  return (poly64x1_t) __a;
+}
+
+__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+vreinterpret_p64_f64 (float64x1_t __a)
+{
+  return (poly64x1_t) __a;
+}
+
+__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+vreinterpret_p64_s8 (int8x8_t __a)
+{
+  return (poly64x1_t) __a;
+}
+
+__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+vreinterpret_p64_s16 (int16x4_t __a)
+{
+  return (poly64x1_t) __a;
+}
+
+__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+vreinterpret_p64_s32 (int32x2_t __a)
+{
+  return (poly64x1_t) __a;
+}
+
+__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+vreinterpret_p64_s64 (int64x1_t __a)
+{
+  return (poly64x1_t) __a;
+}
+
+__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+vreinterpret_p64_f32 (float32x2_t __a)
+{
+  return (poly64x1_t) __a;
+}
+
+__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+vreinterpret_p64_u8 (uint8x8_t __a)
+{
+  return (poly64x1_t) __a;
+}
+
+__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+vreinterpret_p64_u16 (uint16x4_t __a)
+{
+  return (poly64x1_t) __a;
+}
+
+__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+vreinterpret_p64_u32 (uint32x2_t __a)
+{
+  return (poly64x1_t) __a;
+}
+
+__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+vreinterpret_p64_u64 (uint64x1_t __a)
+{
+  return (poly64x1_t) __a;
+}
+
+__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+vreinterpret_p64_p8 (poly8x8_t __a)
+{
+  return (poly64x1_t) __a;
+}
+
+__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_p64_f64 (float64x2_t __a)
+{
+  return (poly64x2_t) __a;
+}
+
+__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_p64_s8 (int8x16_t __a)
+{
+  return (poly64x2_t) __a;
+}
+
+__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_p64_s16 (int16x8_t __a)
+{
+  return (poly64x2_t) __a;
+}
+
+__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_p64_s32 (int32x4_t __a)
+{
+  return (poly64x2_t) __a;
+}
+
+__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_p64_s64 (int64x2_t __a)
+{
+  return (poly64x2_t) __a;
+}
+
+__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_p64_f16 (float16x8_t __a)
+{
+  return (poly64x2_t) __a;
+}
+
+__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_p64_f32 (float32x4_t __a)
+{
+  return (poly64x2_t) __a;
+}
+
+__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_p64_u8 (uint8x16_t __a)
+{
+  return (poly64x2_t) __a;
+}
+
+__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_p64_u16 (uint16x8_t __a)
+{
+  return (poly64x2_t) __a;
+}
+
+__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_p64_u32 (uint32x4_t __a)
+{
+  return (poly64x2_t) __a;
+}
+
+__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_p64_u64 (uint64x2_t __a)
+{
+  return (poly64x2_t) __a;
+}
+
+__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_p64_p8 (poly8x16_t __a)
+{
+  return (poly64x2_t) __a;
+}
+
 __extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
 vreinterpret_f16_f64 (float64x1_t __a)
 {
@@ -3257,6 +3482,12 @@  vreinterpret_f16_p16 (poly16x4_t __a)
   return (float16x4_t) __a;
 }
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_p64 (poly64x1_t __a)
+{
+  return (float16x4_t) __a;
+}
+
 __extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
 vreinterpretq_f16_f64 (float64x2_t __a)
 {
@@ -3329,6 +3560,12 @@  vreinterpretq_f16_p16 (poly16x8_t __a)
   return (float16x8_t) __a;
 }
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_p64 (poly64x2_t __a)
+{
+  return (float16x8_t) __a;
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vreinterpret_f32_f16 (float16x4_t __a)
 {
@@ -3401,6 +3638,12 @@  vreinterpret_f32_p16 (poly16x4_t __a)
   return (float32x2_t) __a;
 }
 
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vreinterpret_f32_p64 (poly64x1_t __a)
+{
+  return (float32x2_t) __a;
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vreinterpretq_f32_f16 (float16x8_t __a)
 {
@@ -3473,6 +3716,12 @@  vreinterpretq_f32_p16 (poly16x8_t __a)
   return (float32x4_t) __a;
 }
 
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_f32_p64 (poly64x2_t __a)
+{
+  return (float32x4_t) __a;
+}
+
 __extension__ static __inline float64x1_t __attribute__((__always_inline__))
 vreinterpret_f64_f16 (float16x4_t __a)
 {
@@ -3498,6 +3747,12 @@  vreinterpret_f64_p16 (poly16x4_t __a)
 }
 
 __extension__ static __inline float64x1_t __attribute__((__always_inline__))
+vreinterpret_f64_p64 (poly64x1_t __a)
+{
+  return (float64x1_t) __a;
+}
+
+__extension__ static __inline float64x1_t __attribute__((__always_inline__))
 vreinterpret_f64_s8 (int8x8_t __a)
 {
   return (float64x1_t) __a;
@@ -3570,6 +3825,12 @@  vreinterpretq_f64_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline float64x2_t __attribute__((__always_inline__))
+vreinterpretq_f64_p64 (poly64x2_t __a)
+{
+  return (float64x2_t) __a;
+}
+
+__extension__ static __inline float64x2_t __attribute__((__always_inline__))
 vreinterpretq_f64_s8 (int8x16_t __a)
 {
   return (float64x2_t) __a;
@@ -3689,6 +3950,12 @@  vreinterpret_s64_p16 (poly16x4_t __a)
   return (int64x1_t) __a;
 }
 
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vreinterpret_s64_p64 (poly64x1_t __a)
+{
+  return (int64x1_t) __a;
+}
+
 __extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
 vreinterpretq_s64_f64 (float64x2_t __a)
 {
@@ -3761,6 +4028,12 @@  vreinterpretq_s64_p16 (poly16x8_t __a)
   return (int64x2_t) __a;
 }
 
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_s64_p64 (poly64x2_t __a)
+{
+  return (int64x2_t) __a;
+}
+
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vreinterpret_u64_f16 (float16x4_t __a)
 {
@@ -3833,6 +4106,12 @@  vreinterpret_u64_p16 (poly16x4_t __a)
   return (uint64x1_t) __a;
 }
 
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vreinterpret_u64_p64 (poly64x1_t __a)
+{
+  return (uint64x1_t) __a;
+}
+
 __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
 vreinterpretq_u64_f64 (float64x2_t __a)
 {
@@ -3905,6 +4184,12 @@  vreinterpretq_u64_p16 (poly16x8_t __a)
   return (uint64x2_t) __a;
 }
 
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_u64_p64 (poly64x2_t __a)
+{
+  return (uint64x2_t) __a;
+}
+
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vreinterpret_s8_f16 (float16x4_t __a)
 {
@@ -3977,6 +4262,12 @@  vreinterpret_s8_p16 (poly16x4_t __a)
   return (int8x8_t) __a;
 }
 
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vreinterpret_s8_p64 (poly64x1_t __a)
+{
+  return (int8x8_t) __a;
+}
+
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vreinterpretq_s8_f64 (float64x2_t __a)
 {
@@ -4049,6 +4340,12 @@  vreinterpretq_s8_p16 (poly16x8_t __a)
   return (int8x16_t) __a;
 }
 
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_s8_p64 (poly64x2_t __a)
+{
+  return (int8x16_t) __a;
+}
+
 __extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
 vreinterpret_s16_f16 (float16x4_t __a)
 {
@@ -4121,6 +4418,12 @@  vreinterpret_s16_p16 (poly16x4_t __a)
   return (int16x4_t) __a;
 }
 
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vreinterpret_s16_p64 (poly64x1_t __a)
+{
+  return (int16x4_t) __a;
+}
+
 __extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
 vreinterpretq_s16_f64 (float64x2_t __a)
 {
@@ -4193,6 +4496,12 @@  vreinterpretq_s16_p16 (poly16x8_t __a)
   return (int16x8_t) __a;
 }
 
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_s16_p64 (poly64x2_t __a)
+{
+  return (int16x8_t) __a;
+}
+
 __extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
 vreinterpret_s32_f16 (float16x4_t __a)
 {
@@ -4265,6 +4574,12 @@  vreinterpret_s32_p16 (poly16x4_t __a)
   return (int32x2_t) __a;
 }
 
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vreinterpret_s32_p64 (poly64x1_t __a)
+{
+  return (int32x2_t) __a;
+}
+
 __extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
 vreinterpretq_s32_f64 (float64x2_t __a)
 {
@@ -4337,6 +4652,12 @@  vreinterpretq_s32_p16 (poly16x8_t __a)
   return (int32x4_t) __a;
 }
 
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_s32_p64 (poly64x2_t __a)
+{
+  return (int32x4_t) __a;
+}
+
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
 vreinterpret_u8_f16 (float16x4_t __a)
 {
@@ -4409,6 +4730,12 @@  vreinterpret_u8_p16 (poly16x4_t __a)
   return (uint8x8_t) __a;
 }
 
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vreinterpret_u8_p64 (poly64x1_t __a)
+{
+  return (uint8x8_t) __a;
+}
+
 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
 vreinterpretq_u8_f64 (float64x2_t __a)
 {
@@ -4481,6 +4808,12 @@  vreinterpretq_u8_p16 (poly16x8_t __a)
   return (uint8x16_t) __a;
 }
 
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_u8_p64 (poly64x2_t __a)
+{
+  return (uint8x16_t) __a;
+}
+
 __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
 vreinterpret_u16_f16 (float16x4_t __a)
 {
@@ -4553,6 +4886,12 @@  vreinterpret_u16_p16 (poly16x4_t __a)
   return (uint16x4_t) __a;
 }
 
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vreinterpret_u16_p64 (poly64x1_t __a)
+{
+  return (uint16x4_t) __a;
+}
+
 __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
 vreinterpretq_u16_f64 (float64x2_t __a)
 {
@@ -4625,6 +4964,12 @@  vreinterpretq_u16_p16 (poly16x8_t __a)
   return (uint16x8_t) __a;
 }
 
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_u16_p64 (poly64x2_t __a)
+{
+  return (uint16x8_t) __a;
+}
+
 __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
 vreinterpret_u32_f16 (float16x4_t __a)
 {
@@ -4697,6 +5042,12 @@  vreinterpret_u32_p16 (poly16x4_t __a)
   return (uint32x2_t) __a;
 }
 
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vreinterpret_u32_p64 (poly64x1_t __a)
+{
+  return (uint32x2_t) __a;
+}
+
 __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
 vreinterpretq_u32_f64 (float64x2_t __a)
 {
@@ -4769,6 +5120,12 @@  vreinterpretq_u32_p16 (poly16x8_t __a)
   return (uint32x4_t) __a;
 }
 
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_u32_p64 (poly64x2_t __a)
+{
+  return (uint32x4_t) __a;
+}
+
 /* vset_lane  */
 
 __extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
@@ -4801,6 +5158,12 @@  vset_lane_p16 (poly16_t __elem, poly16x4_t __vec, const int __index)
   return __aarch64_vset_lane_any (__elem, __vec, __index);
 }
 
+__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+vset_lane_p64 (poly64_t __elem, poly64x1_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vset_lane_s8 (int8_t __elem, int8x8_t __vec, const int __index)
 {
@@ -4881,6 +5244,12 @@  vsetq_lane_p16 (poly16_t __elem, poly16x8_t __vec, const int __index)
   return __aarch64_vset_lane_any (__elem, __vec, __index);
 }
 
+__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+vsetq_lane_p64 (poly64_t __elem, poly64x2_t __vec, const int __index)
+{
+  return __aarch64_vset_lane_any (__elem, __vec, __index);
+}
+
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vsetq_lane_s8 (int8_t __elem, int8x16_t __vec, const int __index)
 {
@@ -4964,6 +5333,12 @@  vget_low_p16 (poly16x8_t __a)
   __GET_LOW (p16);
 }
 
+__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+vget_low_p64 (poly64x2_t __a)
+{
+  __GET_LOW (p64);
+}
+
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vget_low_s8 (int8x16_t __a)
 {
@@ -5049,6 +5424,12 @@  vget_high_p16 (poly16x8_t __a)
   __GET_HIGH (p16);
 }
 
+__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+vget_high_p64 (poly64x2_t __a)
+{
+  __GET_HIGH (p64);
+}
+
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vget_high_s8 (int8x16_t __a)
 {
@@ -5182,6 +5563,12 @@  vcombine_p16 (poly16x4_t __a, poly16x4_t __b)
 						     (int16x4_t) __b);
 }
 
+__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+vcombine_p64 (poly64x1_t __a, poly64x1_t __b)
+{
+  return (poly64x2_t) __builtin_aarch64_combinedi_ppp (__a[0], __b[0]);
+}
+
 /* Start of temporary inline asm implementations.  */
 
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
@@ -9367,6 +9754,8 @@  __ST2_LANE_FUNC (poly8x8x2_t, poly8x16x2_t, poly8_t, v8qi, v16qi, qi, p8,
 		 int8x16_t)
 __ST2_LANE_FUNC (poly16x4x2_t, poly16x8x2_t, poly16_t, v4hi, v8hi, hi, p16,
 		 int16x8_t)
+__ST2_LANE_FUNC (poly64x1x2_t, poly64x2x2_t, poly64_t, di, v2di_ssps, di, p64,
+		 poly64x2_t)
 __ST2_LANE_FUNC (int8x8x2_t, int8x16x2_t, int8_t, v8qi, v16qi, qi, s8,
 		 int8x16_t)
 __ST2_LANE_FUNC (int16x4x2_t, int16x8x2_t, int16_t, v4hi, v8hi, hi, s16,
@@ -9402,6 +9791,7 @@  __ST2_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32)
 __ST2_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64)
 __ST2_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8)
 __ST2_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16)
+__ST2_LANE_FUNC (poly64x2x2_t, poly64_t, v2di, di, p64)
 __ST2_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8)
 __ST2_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16)
 __ST2_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32)
@@ -9449,6 +9839,8 @@  __ST3_LANE_FUNC (poly8x8x3_t, poly8x16x3_t, poly8_t, v8qi, v16qi, qi, p8,
 		 int8x16_t)
 __ST3_LANE_FUNC (poly16x4x3_t, poly16x8x3_t, poly16_t, v4hi, v8hi, hi, p16,
 		 int16x8_t)
+__ST3_LANE_FUNC (poly64x1x3_t, poly64x2x3_t, poly64_t, di, v2di_ssps, di, p64,
+		 poly64x2_t)
 __ST3_LANE_FUNC (int8x8x3_t, int8x16x3_t, int8_t, v8qi, v16qi, qi, s8,
 		 int8x16_t)
 __ST3_LANE_FUNC (int16x4x3_t, int16x8x3_t, int16_t, v4hi, v8hi, hi, s16,
@@ -9484,6 +9876,7 @@  __ST3_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32)
 __ST3_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64)
 __ST3_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8)
 __ST3_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16)
+__ST3_LANE_FUNC (poly64x2x3_t, poly64_t, v2di, di, p64)
 __ST3_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8)
 __ST3_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16)
 __ST3_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32)
@@ -9536,6 +9929,8 @@  __ST4_LANE_FUNC (poly8x8x4_t, poly8x16x4_t, poly8_t, v8qi, v16qi, qi, p8,
 		 int8x16_t)
 __ST4_LANE_FUNC (poly16x4x4_t, poly16x8x4_t, poly16_t, v4hi, v8hi, hi, p16,
 		 int16x8_t)
+__ST4_LANE_FUNC (poly64x1x4_t, poly64x2x4_t, poly64_t, di, v2di_ssps, di, p64,
+		 poly64x2_t)
 __ST4_LANE_FUNC (int8x8x4_t, int8x16x4_t, int8_t, v8qi, v16qi, qi, s8,
 		 int8x16_t)
 __ST4_LANE_FUNC (int16x4x4_t, int16x8x4_t, int16_t, v4hi, v8hi, hi, s16,
@@ -9571,6 +9966,7 @@  __ST4_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32)
 __ST4_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64)
 __ST4_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8)
 __ST4_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16)
+__ST4_LANE_FUNC (poly64x2x4_t, poly64_t, v2di, di, p64)
 __ST4_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8)
 __ST4_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16)
 __ST4_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32)
@@ -12254,6 +12650,14 @@  vcopy_lane_p16 (poly16x4_t __a, const int __lane1,
 				  __a, __lane1);
 }
 
+__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+vcopy_lane_p64 (poly64x1_t __a, const int __lane1,
+		poly64x1_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
+				  __a, __lane1);
+}
+
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vcopy_lane_s8 (int8x8_t __a, const int __lane1,
 	       int8x8_t __b, const int __lane2)
@@ -12352,6 +12756,14 @@  vcopy_laneq_p16 (poly16x4_t __a, const int __lane1,
 				  __a, __lane1);
 }
 
+__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+vcopy_laneq_p64 (poly64x1_t __a, const int __lane1,
+		 poly64x2_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
+				  __a, __lane1);
+}
+
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vcopy_laneq_s8 (int8x8_t __a, const int __lane1,
 		int8x16_t __b, const int __lane2)
@@ -12450,6 +12862,14 @@  vcopyq_lane_p16 (poly16x8_t __a, const int __lane1,
 				   __a, __lane1);
 }
 
+__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+vcopyq_lane_p64 (poly64x2_t __a, const int __lane1,
+		 poly64x1_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
+				   __a, __lane1);
+}
+
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vcopyq_lane_s8 (int8x16_t __a, const int __lane1,
 		int8x8_t __b, const int __lane2)
@@ -12548,6 +12968,14 @@  vcopyq_laneq_p16 (poly16x8_t __a, const int __lane1,
 				   __a, __lane1);
 }
 
+__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+vcopyq_laneq_p64 (poly64x2_t __a, const int __lane1,
+		  poly64x2_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
+				   __a, __lane1);
+}
+
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vcopyq_laneq_s8 (int8x16_t __a, const int __lane1,
 		 int8x16_t __b, const int __lane2)
@@ -13293,6 +13721,12 @@  vdup_n_p16 (poly16_t __a)
   return (poly16x4_t) {__a, __a, __a, __a};
 }
 
+__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+vdup_n_p64 (poly64_t __a)
+{
+  return (poly64x1_t) {__a};
+}
+
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vdup_n_s8 (int8_t __a)
 {
@@ -13374,6 +13808,12 @@  vdupq_n_p16 (uint32_t __a)
   return (poly16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
 }
 
+__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+vdupq_n_p64 (poly64_t __a)
+{
+  return (poly64x2_t) {__a, __a};
+}
+
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vdupq_n_s8 (int32_t __a)
 {
@@ -13456,6 +13896,12 @@  vdup_lane_p16 (poly16x4_t __a, const int __b)
   return __aarch64_vdup_lane_p16 (__a, __b);
 }
 
+__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+vdup_lane_p64 (poly64x1_t __a, const int __b)
+{
+  return __aarch64_vdup_lane_p64 (__a, __b);
+}
+
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vdup_lane_s8 (int8x8_t __a, const int __b)
 {
@@ -13536,6 +13982,12 @@  vdup_laneq_p16 (poly16x8_t __a, const int __b)
   return __aarch64_vdup_laneq_p16 (__a, __b);
 }
 
+__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+vdup_laneq_p64 (poly64x2_t __a, const int __b)
+{
+  return __aarch64_vdup_laneq_p64 (__a, __b);
+}
+
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vdup_laneq_s8 (int8x16_t __a, const int __b)
 {
@@ -13616,6 +14068,12 @@  vdupq_lane_p16 (poly16x4_t __a, const int __b)
   return __aarch64_vdupq_lane_p16 (__a, __b);
 }
 
+__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+vdupq_lane_p64 (poly64x1_t __a, const int __b)
+{
+  return __aarch64_vdupq_lane_p64 (__a, __b);
+}
+
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vdupq_lane_s8 (int8x8_t __a, const int __b)
 {
@@ -13696,6 +14154,12 @@  vdupq_laneq_p16 (poly16x8_t __a, const int __b)
   return __aarch64_vdupq_laneq_p16 (__a, __b);
 }
 
+__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+vdupq_laneq_p64 (poly64x2_t __a, const int __b)
+{
+  return __aarch64_vdupq_laneq_p64 (__a, __b);
+}
+
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vdupq_laneq_s8 (int8x16_t __a, const int __b)
 {
@@ -14570,6 +15034,12 @@  vld1_p16 (const poly16_t *a)
     __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a);
 }
 
+__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+vld1_p64 (const poly64_t *a)
+{
+  return (poly64x1_t) {*a};
+}
+
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vld1_s8 (const int8_t *a)
 {
@@ -14655,6 +15125,13 @@  vld1q_p16 (const poly16_t *a)
     __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a);
 }
 
+__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+vld1q_p64 (const poly64_t *a)
+{
+  return (poly64x2_t)
+    __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a);
+}
+
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vld1q_s8 (const int8_t *a)
 {
@@ -14739,6 +15216,12 @@  vld1_dup_p16 (const poly16_t* __a)
   return vdup_n_p16 (*__a);
 }
 
+__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+vld1_dup_p64 (const poly64_t* __a)
+{
+  return vdup_n_p64 (*__a);
+}
+
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vld1_dup_s8 (const int8_t* __a)
 {
@@ -14819,6 +15302,12 @@  vld1q_dup_p16 (const poly16_t* __a)
   return vdupq_n_p16 (*__a);
 }
 
+__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+vld1q_dup_p64 (const poly64_t* __a)
+{
+  return vdupq_n_p64 (*__a);
+}
+
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vld1q_dup_s8 (const int8_t* __a)
 {
@@ -14899,6 +15388,12 @@  vld1_lane_p16 (const poly16_t *__src, poly16x4_t __vec, const int __lane)
   return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
 
+__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+vld1_lane_p64 (const poly64_t *__src, poly64x1_t __vec, const int __lane)
+{
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
+}
+
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vld1_lane_s8 (const int8_t *__src, int8x8_t __vec, const int __lane)
 {
@@ -14979,6 +15474,12 @@  vld1q_lane_p16 (const poly16_t *__src, poly16x8_t __vec, const int __lane)
   return __aarch64_vset_lane_any (*__src, __vec, __lane);
 }
 
+__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+vld1q_lane_p64 (const poly64_t *__src, poly64x2_t __vec, const int __lane)
+{
+  return __aarch64_vset_lane_any (*__src, __vec, __lane);
+}
+
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vld1q_lane_s8 (const int8_t *__src, int8x16_t __vec, const int __lane)
 {
@@ -15084,6 +15585,17 @@  vld2_p8 (const poly8_t * __a)
   return ret;
 }
 
+__extension__ static __inline poly64x1x2_t __attribute__ ((__always_inline__))
+vld2_p64 (const poly64_t * __a)
+{
+  poly64x1x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregoidi_pss (__o, 0);
+  ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregoidi_pss (__o, 1);
+  return ret;
+}
+
 __extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
 vld2_s16 (const int16_t * __a)
 {
@@ -15216,6 +15728,17 @@  vld2q_p16 (const poly16_t * __a)
   return ret;
 }
 
+__extension__ static __inline poly64x2x2_t __attribute__ ((__always_inline__))
+vld2q_p64 (const poly64_t * __a)
+{
+  poly64x2x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregoiv2di_pss (__o, 0);
+  ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregoiv2di_pss (__o, 1);
+  return ret;
+}
+
 __extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
 vld2q_s32 (const int32_t * __a)
 {
@@ -15471,6 +15994,18 @@  vld3_f32 (const float32_t * __a)
   return ret;
 }
 
+__extension__ static __inline poly64x1x3_t __attribute__ ((__always_inline__))
+vld3_p64 (const poly64_t * __a)
+{
+  poly64x1x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 0);
+  ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 1);
+  ret.val[2] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 2);
+  return ret;
+}
+
 __extension__ static __inline int8x16x3_t __attribute__ ((__always_inline__))
 vld3q_s8 (const int8_t * __a)
 {
@@ -15627,6 +16162,18 @@  vld3q_f64 (const float64_t * __a)
   return ret;
 }
 
+__extension__ static __inline poly64x2x3_t __attribute__ ((__always_inline__))
+vld3q_p64 (const poly64_t * __a)
+{
+  poly64x2x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 0);
+  ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 1);
+  ret.val[2] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 2);
+  return ret;
+}
+
 __extension__ static __inline int64x1x4_t __attribute__ ((__always_inline__))
 vld4_s64 (const int64_t * __a)
 {
@@ -15796,6 +16343,19 @@  vld4_f32 (const float32_t * __a)
   return ret;
 }
 
+__extension__ static __inline poly64x1x4_t  __attribute__ ((__always_inline__))
+vld4_p64 (const poly64_t * __a)
+{
+  poly64x1x4_t  ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 0);
+  ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 1);
+  ret.val[2] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 2);
+  ret.val[3] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 3);
+  return ret;
+}
+
 __extension__ static __inline int8x16x4_t __attribute__ ((__always_inline__))
 vld4q_s8 (const int8_t * __a)
 {
@@ -15965,6 +16525,19 @@  vld4q_f64 (const float64_t * __a)
   return ret;
 }
 
+__extension__ static __inline poly64x2x4_t  __attribute__ ((__always_inline__))
+vld4q_p64 (const poly64_t * __a)
+{
+  poly64x2x4_t  ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 0);
+  ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 1);
+  ret.val[2] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 2);
+  ret.val[3] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 3);
+  return ret;
+}
+
 /* vldn_dup */
 
 __extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
@@ -16088,6 +16661,18 @@  vld2_dup_p16 (const poly16_t * __a)
   return ret;
 }
 
+__extension__ static __inline poly64x1x2_t __attribute__ ((__always_inline__))
+vld2_dup_p64 (const poly64_t * __a)
+{
+  poly64x1x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rv2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregoidi_pss (__o, 0);
+  ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregoidi_pss (__o, 1);
+  return ret;
+}
+
+
 __extension__ static __inline int64x1x2_t __attribute__ ((__always_inline__))
 vld2_dup_s64 (const int64_t * __a)
 {
@@ -16253,6 +16838,17 @@  vld2q_dup_f64 (const float64_t * __a)
   return ret;
 }
 
+__extension__ static __inline poly64x2x2_t __attribute__ ((__always_inline__))
+vld2q_dup_p64 (const poly64_t * __a)
+{
+  poly64x2x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rv2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregoiv2di_pss (__o, 0);
+  ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregoiv2di_pss (__o, 1);
+  return ret;
+}
+
 __extension__ static __inline int64x1x3_t __attribute__ ((__always_inline__))
 vld3_dup_s64 (const int64_t * __a)
 {
@@ -16409,6 +17005,18 @@  vld3_dup_f32 (const float32_t * __a)
   return ret;
 }
 
+__extension__ static __inline poly64x1x3_t __attribute__ ((__always_inline__))
+vld3_dup_p64 (const poly64_t * __a)
+{
+  poly64x1x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rv2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 0);
+  ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 1);
+  ret.val[2] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 2);
+  return ret;
+}
+
 __extension__ static __inline int8x16x3_t __attribute__ ((__always_inline__))
 vld3q_dup_s8 (const int8_t * __a)
 {
@@ -16565,6 +17173,18 @@  vld3q_dup_f64 (const float64_t * __a)
   return ret;
 }
 
+__extension__ static __inline poly64x2x3_t __attribute__ ((__always_inline__))
+vld3q_dup_p64 (const poly64_t * __a)
+{
+  poly64x2x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rv2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 0);
+  ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 1);
+  ret.val[2] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 2);
+  return ret;
+}
+
 __extension__ static __inline int64x1x4_t __attribute__ ((__always_inline__))
 vld4_dup_s64 (const int64_t * __a)
 {
@@ -16734,6 +17354,19 @@  vld4_dup_f32 (const float32_t * __a)
   return ret;
 }
 
+__extension__ static __inline poly64x1x4_t __attribute__ ((__always_inline__))
+vld4_dup_p64 (const poly64_t * __a)
+{
+  poly64x1x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rv2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 0);
+  ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 1);
+  ret.val[2] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 2);
+  ret.val[3] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 3);
+  return ret;
+}
+
 __extension__ static __inline int8x16x4_t __attribute__ ((__always_inline__))
 vld4q_dup_s8 (const int8_t * __a)
 {
@@ -16903,6 +17536,19 @@  vld4q_dup_f64 (const float64_t * __a)
   return ret;
 }
 
+__extension__ static __inline poly64x2x4_t __attribute__ ((__always_inline__))
+vld4q_dup_p64 (const poly64_t * __a)
+{
+  poly64x2x4_t ret;
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_ld4rv2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 0);
+  ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 1);
+  ret.val[2] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 2);
+  ret.val[3] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 3);
+  return ret;
+}
+
 /* vld2_lane */
 
 #define __LD2_LANE_FUNC(intype, vectype, largetype, ptrtype, mode,	   \
@@ -16939,6 +17585,8 @@  __LD2_LANE_FUNC (poly8x8x2_t, poly8x8_t, poly8x16x2_t, poly8_t, v8qi, v16qi, qi,
 		 int8x16_t)
 __LD2_LANE_FUNC (poly16x4x2_t, poly16x4_t, poly16x8x2_t, poly16_t, v4hi, v8hi, hi,
 		 p16, int16x8_t)
+__LD2_LANE_FUNC (poly64x1x2_t, poly64x1_t, poly64x2x2_t, poly64_t, di,
+		 v2di_ssps, di, p64, poly64x2_t)
 __LD2_LANE_FUNC (int8x8x2_t, int8x8_t, int8x16x2_t, int8_t, v8qi, v16qi, qi, s8,
 		 int8x16_t)
 __LD2_LANE_FUNC (int16x4x2_t, int16x4_t, int16x8x2_t, int16_t, v4hi, v8hi, hi, s16,
@@ -16980,6 +17628,7 @@  __LD2_LANE_FUNC (float32x4x2_t, float32x4_t, float32_t, v4sf, sf, f32)
 __LD2_LANE_FUNC (float64x2x2_t, float64x2_t, float64_t, v2df, df, f64)
 __LD2_LANE_FUNC (poly8x16x2_t, poly8x16_t, poly8_t, v16qi, qi, p8)
 __LD2_LANE_FUNC (poly16x8x2_t, poly16x8_t, poly16_t, v8hi, hi, p16)
+__LD2_LANE_FUNC (poly64x2x2_t, poly64x2_t, poly64_t, v2di, di, p64)
 __LD2_LANE_FUNC (int8x16x2_t, int8x16_t, int8_t, v16qi, qi, s8)
 __LD2_LANE_FUNC (int16x8x2_t, int16x8_t, int16_t, v8hi, hi, s16)
 __LD2_LANE_FUNC (int32x4x2_t, int32x4_t, int32_t, v4si, si, s32)
@@ -17033,6 +17682,8 @@  __LD3_LANE_FUNC (poly8x8x3_t, poly8x8_t, poly8x16x3_t, poly8_t, v8qi, v16qi, qi,
 		 int8x16_t)
 __LD3_LANE_FUNC (poly16x4x3_t, poly16x4_t, poly16x8x3_t, poly16_t, v4hi, v8hi, hi,
 		 p16, int16x8_t)
+__LD3_LANE_FUNC (poly64x1x3_t, poly64x1_t, poly64x2x3_t, poly64_t, di,
+		 v2di_ssps, di, p64, poly64x2_t)
 __LD3_LANE_FUNC (int8x8x3_t, int8x8_t, int8x16x3_t, int8_t, v8qi, v16qi, qi, s8,
 		 int8x16_t)
 __LD3_LANE_FUNC (int16x4x3_t, int16x4_t, int16x8x3_t, int16_t, v4hi, v8hi, hi, s16,
@@ -17076,6 +17727,7 @@  __LD3_LANE_FUNC (float32x4x3_t, float32x4_t, float32_t, v4sf, sf, f32)
 __LD3_LANE_FUNC (float64x2x3_t, float64x2_t, float64_t, v2df, df, f64)
 __LD3_LANE_FUNC (poly8x16x3_t, poly8x16_t, poly8_t, v16qi, qi, p8)
 __LD3_LANE_FUNC (poly16x8x3_t, poly16x8_t, poly16_t, v8hi, hi, p16)
+__LD3_LANE_FUNC (poly64x2x3_t, poly64x2_t, poly64_t, v2di, di, p64)
 __LD3_LANE_FUNC (int8x16x3_t, int8x16_t, int8_t, v16qi, qi, s8)
 __LD3_LANE_FUNC (int16x8x3_t, int16x8_t, int16_t, v8hi, hi, s16)
 __LD3_LANE_FUNC (int32x4x3_t, int32x4_t, int32_t, v4si, si, s32)
@@ -17137,6 +17789,8 @@  __LD4_LANE_FUNC (poly8x8x4_t, poly8x8_t, poly8x16x4_t, poly8_t, v8qi, v16qi, qi,
 		 int8x16_t)
 __LD4_LANE_FUNC (poly16x4x4_t, poly16x4_t, poly16x8x4_t, poly16_t, v4hi, v8hi, hi,
 		 p16, int16x8_t)
+__LD4_LANE_FUNC (poly64x1x4_t, poly64x1_t, poly64x2x4_t, poly64_t, di,
+		 v2di_ssps, di, p64, poly64x2_t)
 __LD4_LANE_FUNC (int8x8x4_t, int8x8_t, int8x16x4_t, int8_t, v8qi, v16qi, qi, s8,
 		 int8x16_t)
 __LD4_LANE_FUNC (int16x4x4_t, int16x4_t, int16x8x4_t, int16_t, v4hi, v8hi, hi, s16,
@@ -17182,6 +17836,7 @@  __LD4_LANE_FUNC (float32x4x4_t, float32x4_t, float32_t, v4sf, sf, f32)
 __LD4_LANE_FUNC (float64x2x4_t, float64x2_t, float64_t, v2df, df, f64)
 __LD4_LANE_FUNC (poly8x16x4_t, poly8x16_t, poly8_t, v16qi, qi, p8)
 __LD4_LANE_FUNC (poly16x8x4_t, poly16x8_t, poly16_t, v8hi, hi, p16)
+__LD4_LANE_FUNC (poly64x2x4_t, poly64x2_t, poly64_t, v2di, di, p64)
 __LD4_LANE_FUNC (int8x16x4_t, int8x16_t, int8_t, v16qi, qi, s8)
 __LD4_LANE_FUNC (int16x8x4_t, int16x8_t, int16_t, v8hi, hi, s16)
 __LD4_LANE_FUNC (int32x4x4_t, int32x4_t, int32_t, v4si, si, s32)
@@ -18457,6 +19112,12 @@  vmov_n_p16 (poly16_t __a)
   return vdup_n_p16 (__a);
 }
 
+__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+vmov_n_p64 (poly64_t __a)
+{
+  return vdup_n_p64 (__a);
+}
+
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vmov_n_s8 (int8_t __a)
 {
@@ -18535,6 +19196,12 @@  vmovq_n_p16 (poly16_t __a)
   return vdupq_n_p16 (__a);
 }
 
+__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+vmovq_n_p64 (poly64_t __a)
+{
+  return vdupq_n_p64 (__a);
+}
+
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vmovq_n_s8 (int8_t __a)
 {
@@ -22510,6 +23177,12 @@  vsli_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
   return (uint64x1_t) {__builtin_aarch64_usli_ndi_uuus (__a[0], __b[0], __c)};
 }
 
+__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__))
+vsli_n_p64 (poly64x1_t __a, poly64x1_t __b, const int __c)
+{
+  return (poly64x1_t) {__builtin_aarch64_ssli_ndi_ppps (__a[0], __b[0], __c)};
+}
+
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vsliq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
 {
@@ -22558,6 +23231,12 @@  vsliq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
   return __builtin_aarch64_usli_nv2di_uuus (__a, __b, __c);
 }
 
+__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__))
+vsliq_n_p64 (poly64x2_t __a, poly64x2_t __b, const int __c)
+{
+  return __builtin_aarch64_ssli_nv2di_ppps (__a, __b, __c);
+}
+
 __extension__ static __inline int64_t __attribute__ ((__always_inline__))
 vslid_n_s64 (int64_t __a, int64_t __b, const int __c)
 {
@@ -22924,6 +23603,12 @@  vst1_p16 (poly16_t *a, poly16x4_t b)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_p64 (poly64_t *a, poly64x1_t b)
+{
+  *a = b[0];
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst1_s8 (int8_t *a, int8x8_t b)
 {
   __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a, b);
@@ -23009,6 +23694,13 @@  vst1q_p16 (poly16_t *a, poly16x8_t b)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_p64 (poly64_t *a, poly64x2_t b)
+{
+  __builtin_aarch64_st1v2di_sp ((__builtin_aarch64_simd_di *) a,
+				(poly64x2_t) b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst1q_s8 (int8_t *a, int8x16_t b)
 {
   __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a, b);
@@ -23093,6 +23785,12 @@  vst1_lane_p16 (poly16_t *__a, poly16x4_t __b, const int __lane)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_lane_p64 (poly64_t *__a, poly64x1_t __b, const int __lane)
+{
+  *__a = __aarch64_vget_lane_any (__b, __lane);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst1_lane_s8 (int8_t *__a, int8x8_t __b, const int __lane)
 {
   *__a = __aarch64_vget_lane_any (__b, __lane);
@@ -23173,6 +23871,12 @@  vst1q_lane_p16 (poly16_t *__a, poly16x8_t __b, const int __lane)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_lane_p64 (poly64_t *__a, poly64x2_t __b, const int __lane)
+{
+  *__a = __aarch64_vget_lane_any (__b, __lane);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst1q_lane_s8 (int8_t *__a, int8x16_t __b, const int __lane)
 {
   *__a = __aarch64_vget_lane_any (__b, __lane);
@@ -23379,6 +24083,20 @@  vst2_f32 (float32_t * __a, float32x2x2_t val)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_p64 (poly64_t * __a, poly64x1x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  poly64x2x2_t temp;
+  temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) temp.val[1], 1);
+  __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst2q_s8 (int8_t * __a, int8x16x2_t val)
 {
   __builtin_aarch64_simd_oi __o;
@@ -23495,6 +24213,17 @@  vst2q_f64 (float64_t * __a, float64x2x2_t val)
   __builtin_aarch64_st2v2df ((__builtin_aarch64_simd_df *) __a, __o);
 }
 
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_p64 (poly64_t * __a, poly64x2x2_t val)
+{
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv2di_ssps (__o,
+					       (poly64x2_t) val.val[1], 1);
+  __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
 __extension__ static __inline void
 vst3_s64 (int64_t * __a, int64x1x3_t val)
 {
@@ -23678,6 +24407,23 @@  vst3_f32 (float32_t * __a, float32x2x3_t val)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_p64 (poly64_t * __a, poly64x1x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  poly64x2x3_t temp;
+  temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_p64 (val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) temp.val[2], 2);
+  __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst3q_s8 (int8_t * __a, int8x16x3_t val)
 {
   __builtin_aarch64_simd_ci __o;
@@ -23807,6 +24553,19 @@  vst3q_f64 (float64_t * __a, float64x2x3_t val)
   __builtin_aarch64_st3v2df ((__builtin_aarch64_simd_df *) __a, __o);
 }
 
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_p64 (poly64_t * __a, poly64x2x3_t val)
+{
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv2di_ssps (__o,
+					       (poly64x2_t) val.val[2], 2);
+  __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
 __extension__ static __inline void
 vst4_s64 (int64_t * __a, int64x1x4_t val)
 {
@@ -24016,6 +24775,26 @@  vst4_f32 (float32_t * __a, float32x2x4_t val)
 }
 
 __extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_p64 (poly64_t * __a, poly64x1x4_t val)
+{
+  __builtin_aarch64_simd_xi __o;
+  poly64x2x4_t temp;
+  temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  temp.val[2] = vcombine_p64 (val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  temp.val[3] = vcombine_p64 (val.val[3], vcreate_p64 (__AARCH64_UINT64_C (0)));
+  __o = __builtin_aarch64_set_qregxiv2di_ssps (__o,
+					       (poly64x2_t) temp.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv2di_ssps (__o,
+					       (poly64x2_t) temp.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv2di_ssps (__o,
+					       (poly64x2_t) temp.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv2di_ssps (__o,
+					       (poly64x2_t) temp.val[3], 3);
+  __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
 vst4q_s8 (int8_t * __a, int8x16x4_t val)
 {
   __builtin_aarch64_simd_xi __o;
@@ -24158,6 +24937,21 @@  vst4q_f64 (float64_t * __a, float64x2x4_t val)
   __builtin_aarch64_st4v2df ((__builtin_aarch64_simd_df *) __a, __o);
 }
 
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_p64 (poly64_t * __a, poly64x2x4_t val)
+{
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_set_qregxiv2di_ssps (__o,
+					       (poly64x2_t) val.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv2di_ssps (__o,
+					       (poly64x2_t) val.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv2di_ssps (__o,
+					       (poly64x2_t) val.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv2di_ssps (__o,
+					       (poly64x2_t) val.val[3], 3);
+  __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o);
+}
+
 /* vsub */
 
 __extension__ static __inline int64_t __attribute__ ((__always_inline__))
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
index 3363a7208625175d0a9cbb79954902c80aeedabc..19fbe835e8c7af34320c333e0c3f74230368291a 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/arm-neon-ref.h
@@ -84,6 +84,13 @@  extern size_t strlen(const char *);
     fprintf(stderr, "CHECKED %s %s\n", STR(VECT_TYPE(T, W, N)), MSG);	\
   }
 
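+/* Poly64 variant.  Expands to CHECK only when the crypto extension, which
+   provides the poly64 types, is available, otherwise to nothing.  */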
+#if defined (__ARM_FEATURE_CRYPTO)
+#define CHECK_CRYPTO(MSG,T,W,N,FMT,EXPECTED,COMMENT) \
+	       CHECK(MSG,T,W,N,FMT,EXPECTED,COMMENT)
+#else
+#define CHECK_CRYPTO(MSG,T,W,N,FMT,EXPECTED,COMMENT)
+#endif
+
 /* Floating-point variant.  */
 #define CHECK_FP(MSG,T,W,N,FMT,EXPECTED,COMMENT)			\
   {									\
@@ -176,6 +183,9 @@  extern ARRAY(expected, uint, 32, 2);
 extern ARRAY(expected, uint, 64, 1);
 extern ARRAY(expected, poly, 8, 8);
 extern ARRAY(expected, poly, 16, 4);
+#if defined (__ARM_FEATURE_CRYPTO)
+extern ARRAY(expected, poly, 64, 1);
+#endif
 extern ARRAY(expected, hfloat, 16, 4);
 extern ARRAY(expected, hfloat, 32, 2);
 extern ARRAY(expected, hfloat, 64, 1);
@@ -189,6 +199,9 @@  extern ARRAY(expected, uint, 32, 4);
 extern ARRAY(expected, uint, 64, 2);
 extern ARRAY(expected, poly, 8, 16);
 extern ARRAY(expected, poly, 16, 8);
+#if defined (__ARM_FEATURE_CRYPTO)
+extern ARRAY(expected, poly, 64, 2);
+#endif
 extern ARRAY(expected, hfloat, 16, 8);
 extern ARRAY(expected, hfloat, 32, 4);
 extern ARRAY(expected, hfloat, 64, 2);
@@ -205,6 +218,7 @@  extern ARRAY(expected, hfloat, 64, 2);
     CHECK(test_name, uint, 64, 1, PRIx64, EXPECTED, comment);		\
     CHECK(test_name, poly, 8, 8, PRIx8, EXPECTED, comment);		\
     CHECK(test_name, poly, 16, 4, PRIx16, EXPECTED, comment);		\
+    CHECK_CRYPTO(test_name, poly, 64, 1, PRIx64, EXPECTED, comment);	\
     CHECK_FP(test_name, float, 32, 2, PRIx32, EXPECTED, comment);	\
 									\
     CHECK(test_name, int, 8, 16, PRIx8, EXPECTED, comment);		\
@@ -217,6 +231,7 @@  extern ARRAY(expected, hfloat, 64, 2);
     CHECK(test_name, uint, 64, 2, PRIx64, EXPECTED, comment);		\
     CHECK(test_name, poly, 8, 16, PRIx8, EXPECTED, comment);		\
     CHECK(test_name, poly, 16, 8, PRIx16, EXPECTED, comment);		\
+    CHECK_CRYPTO(test_name, poly, 64, 2, PRIx64, EXPECTED, comment);	\
     CHECK_FP(test_name, float, 32, 4, PRIx32, EXPECTED, comment);	\
   }									\
 
@@ -390,6 +405,9 @@  static void clean_results (void)
   CLEAN(result, uint, 64, 1);
   CLEAN(result, poly, 8, 8);
   CLEAN(result, poly, 16, 4);
+#if defined (__ARM_FEATURE_CRYPTO)
+  CLEAN(result, poly, 64, 1);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   CLEAN(result, float, 16, 4);
 #endif
@@ -405,6 +423,9 @@  static void clean_results (void)
   CLEAN(result, uint, 64, 2);
   CLEAN(result, poly, 8, 16);
   CLEAN(result, poly, 16, 8);
+#if defined (__ARM_FEATURE_CRYPTO)
+  CLEAN(result, poly, 64, 2);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   CLEAN(result, float, 16, 8);
 #endif
@@ -430,6 +451,13 @@  static void clean_results (void)
 #define DECL_VARIABLE(VAR, T1, W, N)		\
   VECT_TYPE(T1, W, N) VECT_VAR(VAR, T1, W, N)
 
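+/* Poly64 variant.  Only declares the variable when the crypto extension,
+   which provides the poly64 types, is available.  */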
+#if defined (__ARM_FEATURE_CRYPTO)
+#define DECL_VARIABLE_CRYPTO(VAR, T1, W, N) \
+  DECL_VARIABLE(VAR, T1, W, N)
+#else
+#define DECL_VARIABLE_CRYPTO(VAR, T1, W, N)
+#endif
+
 /* Declare only 64 bits signed variants.  */
 #define DECL_VARIABLE_64BITS_SIGNED_VARIANTS(VAR)	\
   DECL_VARIABLE(VAR, int, 8, 8);			\
@@ -465,6 +493,7 @@  static void clean_results (void)
   DECL_VARIABLE_64BITS_UNSIGNED_VARIANTS(VAR);	\
   DECL_VARIABLE(VAR, poly, 8, 8);		\
   DECL_VARIABLE(VAR, poly, 16, 4);		\
+  DECL_VARIABLE_CRYPTO(VAR, poly, 64, 1);	\
   DECL_VARIABLE(VAR, float, 16, 4);		\
   DECL_VARIABLE(VAR, float, 32, 2)
 #else
@@ -473,6 +502,7 @@  static void clean_results (void)
   DECL_VARIABLE_64BITS_UNSIGNED_VARIANTS(VAR);	\
   DECL_VARIABLE(VAR, poly, 8, 8);		\
   DECL_VARIABLE(VAR, poly, 16, 4);		\
+  DECL_VARIABLE_CRYPTO(VAR, poly, 64, 1);	\
   DECL_VARIABLE(VAR, float, 32, 2)
 #endif
 
@@ -483,6 +513,7 @@  static void clean_results (void)
   DECL_VARIABLE_128BITS_UNSIGNED_VARIANTS(VAR);	\
   DECL_VARIABLE(VAR, poly, 8, 16);		\
   DECL_VARIABLE(VAR, poly, 16, 8);		\
+  DECL_VARIABLE_CRYPTO(VAR, poly, 64, 2);	\
   DECL_VARIABLE(VAR, float, 16, 8);		\
   DECL_VARIABLE(VAR, float, 32, 4)
 #else
@@ -491,6 +522,7 @@  static void clean_results (void)
   DECL_VARIABLE_128BITS_UNSIGNED_VARIANTS(VAR);	\
   DECL_VARIABLE(VAR, poly, 8, 16);		\
   DECL_VARIABLE(VAR, poly, 16, 8);		\
+  DECL_VARIABLE_CRYPTO(VAR, poly, 64, 2);	\
   DECL_VARIABLE(VAR, float, 32, 4)
 #endif
 /* Declare all variants.  */
@@ -532,6 +564,13 @@  static void clean_results (void)
 
 /* Helpers to call macros with 1 constant and 5 variable
    arguments.  */
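+/* Expand MACRO for the poly64 variants only when the crypto extension,
+   which provides the poly64 types, is available.  */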
+#if defined (__ARM_FEATURE_CRYPTO)
+#define MACRO_CRYPTO(MACRO, VAR1, VAR2, T1, T2, T3, W, N) \
+  MACRO(VAR1, VAR2, T1, T2, T3, W, N)
+#else
+#define MACRO_CRYPTO(MACRO, VAR1, VAR2, T1, T2, T3, W, N)
+#endif
+
 #define TEST_MACRO_64BITS_SIGNED_VARIANTS_1_5(MACRO, VAR)	\
   MACRO(VAR, , int, s, 8, 8);					\
   MACRO(VAR, , int, s, 16, 4);					\
@@ -602,13 +641,15 @@  static void clean_results (void)
   TEST_MACRO_64BITS_SIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2);	\
   TEST_MACRO_64BITS_UNSIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2);	\
   MACRO(VAR1, VAR2, , poly, p, 8, 8);				\
-  MACRO(VAR1, VAR2, , poly, p, 16, 4)
+  MACRO(VAR1, VAR2, , poly, p, 16, 4);				\
+  MACRO_CRYPTO(MACRO, VAR1, VAR2, , poly, p, 64, 1)
 
 #define TEST_MACRO_128BITS_VARIANTS_2_5(MACRO, VAR1, VAR2)	\
   TEST_MACRO_128BITS_SIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2);	\
   TEST_MACRO_128BITS_UNSIGNED_VARIANTS_2_5(MACRO, VAR1, VAR2);	\
   MACRO(VAR1, VAR2, q, poly, p, 8, 16);				\
-  MACRO(VAR1, VAR2, q, poly, p, 16, 8)
+  MACRO(VAR1, VAR2, q, poly, p, 16, 8);				\
+  MACRO_CRYPTO(MACRO, VAR1, VAR2, q, poly, p, 64, 2)
 
 #define TEST_MACRO_ALL_VARIANTS_2_5(MACRO, VAR1, VAR2)	\
   TEST_MACRO_64BITS_VARIANTS_2_5(MACRO, VAR1, VAR2);	\
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64.c
new file mode 100644
index 0000000000000000000000000000000000000000..df66eaa749f7a1a34011d6d169d1262ba976c6ad
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64.c
@@ -0,0 +1,302 @@ 
+/* This file contains tests for the poly64 variants of the VLD{X},
+   VLD{X}_DUP and VSLI intrinsics.  */
+
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+/* { dg-options "-march=armv8-a+crypto" } */
+/* { dg-skip-if "" { arm*-*-* } } */
+
+/* Expected results: vld1.  */
+VECT_VAR_DECL (vld1_expected,poly,64,1) [] = { 0xfffffffffffffff0 };
+VECT_VAR_DECL (vld1_expected,poly,64,2) [] = { 0xfffffffffffffff0,
+					       0xfffffffffffffff1 };
+
+/* Expected results: vld1_dup.  */
+VECT_VAR_DECL (vld1_dup_expected0,poly,64,1) [] = { 0xfffffffffffffff0 };
+VECT_VAR_DECL (vld1_dup_expected0,poly,64,2) [] = { 0xfffffffffffffff0,
+						    0xfffffffffffffff0 };
+VECT_VAR_DECL (vld1_dup_expected1,poly,64,1) [] = { 0xfffffffffffffff1 };
+VECT_VAR_DECL (vld1_dup_expected1,poly,64,2) [] = { 0xfffffffffffffff1,
+						    0xfffffffffffffff1 };
+VECT_VAR_DECL (vld1_dup_expected2,poly,64,1) [] = { 0xfffffffffffffff2 };
+VECT_VAR_DECL (vld1_dup_expected2,poly,64,2) [] = { 0xfffffffffffffff2,
+						    0xfffffffffffffff2 };
+
+/* Expected results: vldX.  */
+VECT_VAR_DECL (vld2_expected_0,poly,64,1) [] = { 0xfffffffffffffff0 };
+VECT_VAR_DECL (vld2_expected_1,poly,64,1) [] = { 0xfffffffffffffff1 };
+VECT_VAR_DECL (vld3_expected_0,poly,64,1) [] = { 0xfffffffffffffff0 };
+VECT_VAR_DECL (vld3_expected_1,poly,64,1) [] = { 0xfffffffffffffff1 };
+VECT_VAR_DECL (vld3_expected_2,poly,64,1) [] = { 0xfffffffffffffff2 };
+VECT_VAR_DECL (vld4_expected_0,poly,64,1) [] = { 0xfffffffffffffff0 };
+VECT_VAR_DECL (vld4_expected_1,poly,64,1) [] = { 0xfffffffffffffff1 };
+VECT_VAR_DECL (vld4_expected_2,poly,64,1) [] = { 0xfffffffffffffff2 };
+VECT_VAR_DECL (vld4_expected_3,poly,64,1) [] = { 0xfffffffffffffff3 };
+
+/* Expected results: vldX_dup.  */
+VECT_VAR_DECL (vld2_dup_expected_0,poly,64,1) [] = { 0xfffffffffffffff0 };
+VECT_VAR_DECL (vld2_dup_expected_1,poly,64,1) [] = { 0xfffffffffffffff1 };
+VECT_VAR_DECL (vld3_dup_expected_0,poly,64,1) [] = { 0xfffffffffffffff0 };
+VECT_VAR_DECL (vld3_dup_expected_1,poly,64,1) [] = { 0xfffffffffffffff1 };
+VECT_VAR_DECL (vld3_dup_expected_2,poly,64,1) [] = { 0xfffffffffffffff2 };
+VECT_VAR_DECL (vld4_dup_expected_0,poly,64,1) [] = { 0xfffffffffffffff0 };
+VECT_VAR_DECL (vld4_dup_expected_1,poly,64,1) [] = { 0xfffffffffffffff1 };
+VECT_VAR_DECL (vld4_dup_expected_2,poly,64,1) [] = { 0xfffffffffffffff2 };
+VECT_VAR_DECL (vld4_dup_expected_3,poly,64,1) [] = { 0xfffffffffffffff3 };
+
+/* Expected results: vsli.  */
+VECT_VAR_DECL (vsli_expected,poly,64,1) [] = { 0x10 };
+VECT_VAR_DECL (vsli_expected,poly,64,2) [] = { 0x7ffffffffffff0,
+					       0x7ffffffffffff1 };
+VECT_VAR_DECL (vsli_expected_max_shift,poly,64,1) [] = { 0x7ffffffffffffff0 };
+VECT_VAR_DECL (vsli_expected_max_shift,poly,64,2) [] = { 0xfffffffffffffff0,
+							 0xfffffffffffffff1 };
+
+int main (void)
+{
+  int i;
+
+  /* vld1_p64 tests.  */
+#undef TEST_MSG
+#define TEST_MSG "VLD1/VLD1Q"
+
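+/* Load a vector from BUF with vld1, then store it into the result buffer
+   with vst1 so it can be checked against the expected values.  */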
+#define TEST_VLD1(VAR, BUF, Q, T1, T2, W, N)				\
+  VECT_VAR (VAR, T1, W, N) = vld1##Q##_##T2##W (VECT_VAR (BUF, T1, W, N)); \
+  vst1##Q##_##T2##W (VECT_VAR (result, T1, W, N), VECT_VAR (VAR, T1, W, N))
+
+  DECL_VARIABLE (vld1_vector, poly, 64, 1);
+  DECL_VARIABLE (vld1_vector, poly, 64, 2);
+
+  CLEAN (result, poly, 64, 1);
+  CLEAN (result, poly, 64, 2);
+
+  VLOAD (vld1_vector, buffer, , poly, p, 64, 1);
+  VLOAD (vld1_vector, buffer, q, poly, p, 64, 2);
+
+  TEST_VLD1 (vld1_vector, buffer, , poly, p, 64, 1);
+  TEST_VLD1 (vld1_vector, buffer, q, poly, p, 64, 2);
+
+  CHECK (TEST_MSG, poly, 64, 1, PRIx64, vld1_expected, "");
+  CHECK (TEST_MSG, poly, 64, 2, PRIx64, vld1_expected, "");
+
+  /* vld1_dup_p64 tests.  */
+#undef TEST_MSG
+#define TEST_MSG "VLD1_DUP/VLD1_DUPQ"
+
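+/* Load element i of BUF with vld1_dup, replicating it across all lanes,
+   then store the whole vector for checking.  */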
+#define TEST_VLD1_DUP(VAR, BUF, Q, T1, T2, W, N)			\
+  VECT_VAR (VAR, T1, W, N) =						\
+    vld1##Q##_dup_##T2##W (&VECT_VAR (BUF, T1, W, N)[i]); \
+  vst1##Q##_##T2##W (VECT_VAR (result, T1, W, N), VECT_VAR (VAR, T1, W, N))
+
+  DECL_VARIABLE (vld1_dup_vector, poly, 64, 1);
+  DECL_VARIABLE (vld1_dup_vector, poly, 64, 2);
+
+  /* Try to read different places from the input buffer.  */
+  for (i=0; i<3; i++)
+  {
+    CLEAN (result, poly, 64, 1);
+    CLEAN (result, poly, 64, 2);
+
+    TEST_VLD1_DUP (vld1_dup_vector, buffer_dup, , poly, p, 64, 1);
+    TEST_VLD1_DUP (vld1_dup_vector, buffer_dup, q, poly, p, 64, 2);
+
+    switch (i)
+    {
+    case 0:
+      CHECK (TEST_MSG, poly, 64, 1, PRIx64, vld1_dup_expected0, "");
+      CHECK (TEST_MSG, poly, 64, 2, PRIx64, vld1_dup_expected0, "");
+      break;
+    case 1:
+      CHECK (TEST_MSG, poly, 64, 1, PRIx64, vld1_dup_expected1, "");
+      CHECK (TEST_MSG, poly, 64, 2, PRIx64, vld1_dup_expected1, "");
+      break;
+    case 2:
+      CHECK (TEST_MSG, poly, 64, 1, PRIx64, vld1_dup_expected2, "");
+      CHECK (TEST_MSG, poly, 64, 2, PRIx64, vld1_dup_expected2, "");
+      break;
+    default:
+      abort ();
+    }
+  }
+
+  /* vldX_p64 tests.  */
+#define DECL_VLDX(T1, W, N, X)						\
+  VECT_ARRAY_TYPE (T1, W, N, X) VECT_ARRAY_VAR (vldX_vector, T1, W, N, X); \
+  VECT_VAR_DECL (vldX_result_bis_##X, T1, W, N)[X * N]
+
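+/* Load X interleaved vectors with vldX, store them back with vstX into the
+   vldX_result_bis array, then copy chunk 0 into "result" for checking.  */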
+#define TEST_VLDX(Q, T1, T2, W, N, X)					\
+  VECT_ARRAY_VAR (vldX_vector, T1, W, N, X) =				\
+    /* Use dedicated init buffer, of size X.  */	\
+    vld##X##Q##_##T2##W (VECT_ARRAY_VAR (buffer_vld##X, T1, W, N, X));	\
+  vst##X##Q##_##T2##W (VECT_VAR (vldX_result_bis_##X, T1, W, N), \
+		      VECT_ARRAY_VAR (vldX_vector, T1, W, N, X));	\
+  memcpy (VECT_VAR (result, T1, W, N), \
+		 VECT_VAR (vldX_result_bis_##X, T1, W, N), \
+		 sizeof (VECT_VAR (result, T1, W, N)));
+
+  /* Overwrite "result" with the contents of "result_bis"[Y].  */
+#define TEST_EXTRA_CHUNK(T1, W, N, X,Y)				\
+  memcpy (VECT_VAR (result, T1, W, N),				\
+	 &(VECT_VAR (vldX_result_bis_##X, T1, W, N)[Y*N]),	\
+	 sizeof (VECT_VAR (result, T1, W, N)));
+
+  DECL_VLDX (poly, 64, 1, 2);
+  DECL_VLDX (poly, 64, 1, 3);
+  DECL_VLDX (poly, 64, 1, 4);
+
+  VECT_ARRAY_INIT2 (buffer_vld2, poly, 64, 1);
+  PAD (buffer_vld2_pad, poly, 64, 1);
+  VECT_ARRAY_INIT3 (buffer_vld3, poly, 64, 1);
+  PAD (buffer_vld3_pad, poly, 64, 1);
+  VECT_ARRAY_INIT4 (buffer_vld4, poly, 64, 1);
+  PAD (buffer_vld4_pad, poly, 64, 1);
+
+#undef TEST_MSG
+#define TEST_MSG "VLD2/VLD2Q"
+  CLEAN (result, poly, 64, 1);
+  TEST_VLDX (, poly, p, 64, 1, 2);
+  CHECK (TEST_MSG, poly, 64, 1, PRIx64, vld2_expected_0, "chunk 0");
+  CLEAN (result, poly, 64, 1);
+  TEST_EXTRA_CHUNK (poly, 64, 1, 2, 1);
+  CHECK (TEST_MSG, poly, 64, 1, PRIx64, vld2_expected_1, "chunk 1");
+
+#undef TEST_MSG
+#define TEST_MSG "VLD3/VLD3Q"
+  CLEAN (result, poly, 64, 1);
+  TEST_VLDX (, poly, p, 64, 1, 3);
+  CHECK (TEST_MSG, poly, 64, 1, PRIx64, vld3_expected_0, "chunk 0");
+  CLEAN (result, poly, 64, 1);
+  TEST_EXTRA_CHUNK (poly, 64, 1, 3, 1);
+  CHECK (TEST_MSG, poly, 64, 1, PRIx64, vld3_expected_1, "chunk 1");
+  CLEAN (result, poly, 64, 1);
+  TEST_EXTRA_CHUNK (poly, 64, 1, 3, 2);
+  CHECK (TEST_MSG, poly, 64, 1, PRIx64, vld3_expected_2, "chunk 2");
+
+#undef TEST_MSG
+#define TEST_MSG "VLD4/VLD4Q"
+  CLEAN (result, poly, 64, 1);
+  TEST_VLDX (, poly, p, 64, 1, 4);
+  CHECK (TEST_MSG, poly, 64, 1, PRIx64, vld4_expected_0, "chunk 0");
+  CLEAN (result, poly, 64, 1);
+  TEST_EXTRA_CHUNK (poly, 64, 1, 4, 1);
+  CHECK (TEST_MSG, poly, 64, 1, PRIx64, vld4_expected_1, "chunk 1");
+  CLEAN (result, poly, 64, 1);
+  TEST_EXTRA_CHUNK (poly, 64, 1, 4, 2);
+  CHECK (TEST_MSG, poly, 64, 1, PRIx64, vld4_expected_2, "chunk 2");
+  CLEAN (result, poly, 64, 1);
+  TEST_EXTRA_CHUNK (poly, 64, 1, 4, 3);
+  CHECK (TEST_MSG, poly, 64, 1, PRIx64, vld4_expected_3, "chunk 3");
+
+  /* vldX_dup_p64 tests.  */
+#define DECL_VLDX_DUP(T1, W, N, X)					\
+  VECT_ARRAY_TYPE (T1, W, N, X) VECT_ARRAY_VAR (vldX_dup_vector, T1, W, N, X); \
+  VECT_VAR_DECL (vldX_dup_result_bis_##X, T1, W, N)[X * N]
+
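+/* Same as TEST_VLDX, but use the vldX_dup forms, which load X consecutive
+   elements and replicate each one across all lanes of its vector.  */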
+#define TEST_VLDX_DUP(Q, T1, T2, W, N, X)				\
+  VECT_ARRAY_VAR (vldX_dup_vector, T1, W, N, X) =			\
+    vld##X##Q##_dup_##T2##W (&VECT_VAR (buffer_dup, T1, W, N)[0]);	\
+									\
+  vst##X##Q##_##T2##W (VECT_VAR (vldX_dup_result_bis_##X, T1, W, N),	\
+		      VECT_ARRAY_VAR (vldX_dup_vector, T1, W, N, X));	\
+  memcpy (VECT_VAR (result, T1, W, N), \
+	 VECT_VAR (vldX_dup_result_bis_##X, T1, W, N), \
+	 sizeof (VECT_VAR (result, T1, W, N)));
+
+  /* Overwrite "result" with the contents of "result_bis"[Y].  */
+#define TEST_VLDX_DUP_EXTRA_CHUNK(T1, W, N, X,Y)		\
+  memcpy (VECT_VAR (result, T1, W, N),				\
+	 &(VECT_VAR (vldX_dup_result_bis_##X, T1, W, N)[Y*N]),	\
+	 sizeof (VECT_VAR (result, T1, W, N)));
+
+  DECL_VLDX_DUP (poly, 64, 1, 2);
+  DECL_VLDX_DUP (poly, 64, 1, 3);
+  DECL_VLDX_DUP (poly, 64, 1, 4);
+
+#undef TEST_MSG
+#define TEST_MSG "VLD2_DUP/VLD2Q_DUP"
+  CLEAN (result, poly, 64, 1);
+  TEST_VLDX_DUP (, poly, p, 64, 1, 2);
+  CHECK (TEST_MSG, poly, 64, 1, PRIx64, vld2_dup_expected_0, "chunk 0");
+  CLEAN (result, poly, 64, 1);
+  TEST_VLDX_DUP_EXTRA_CHUNK (poly, 64, 1, 2, 1);
+  CHECK (TEST_MSG, poly, 64, 1, PRIx64, vld2_dup_expected_1, "chunk 1");
+
+#undef TEST_MSG
+#define TEST_MSG "VLD3_DUP/VLD3Q_DUP"
+  CLEAN (result, poly, 64, 1);
+  TEST_VLDX_DUP (, poly, p, 64, 1, 3);
+  CHECK (TEST_MSG, poly, 64, 1, PRIx64, vld3_dup_expected_0, "chunk 0");
+  CLEAN (result, poly, 64, 1);
+  TEST_VLDX_DUP_EXTRA_CHUNK (poly, 64, 1, 3, 1);
+  CHECK (TEST_MSG, poly, 64, 1, PRIx64, vld3_dup_expected_1, "chunk 1");
+  CLEAN (result, poly, 64, 1);
+  TEST_VLDX_DUP_EXTRA_CHUNK (poly, 64, 1, 3, 2);
+  CHECK (TEST_MSG, poly, 64, 1, PRIx64, vld3_dup_expected_2, "chunk 2");
+
+#undef TEST_MSG
+#define TEST_MSG "VLD4_DUP/VLD4Q_DUP"
+  CLEAN (result, poly, 64, 1);
+  TEST_VLDX_DUP (, poly, p, 64, 1, 4);
+  CHECK (TEST_MSG, poly, 64, 1, PRIx64, vld4_dup_expected_0, "chunk 0");
+  CLEAN (result, poly, 64, 1);
+  TEST_VLDX_DUP_EXTRA_CHUNK (poly, 64, 1, 4, 1);
+  CHECK (TEST_MSG, poly, 64, 1, PRIx64, vld4_dup_expected_1, "chunk 1");
+  CLEAN (result, poly, 64, 1);
+  TEST_VLDX_DUP_EXTRA_CHUNK (poly, 64, 1, 4, 2);
+  CHECK (TEST_MSG, poly, 64, 1, PRIx64, vld4_dup_expected_2, "chunk 2");
+  CLEAN (result, poly, 64, 1);
+  TEST_VLDX_DUP_EXTRA_CHUNK (poly, 64, 1, 4, 3);
+  CHECK (TEST_MSG, poly, 64, 1, PRIx64, vld4_dup_expected_3, "chunk 3");
+
+  /* vsli_p64 tests.  */
+#undef TEST_MSG
+#define TEST_MSG "VSLI"
+
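+/* VSLI shifts each element of the second operand left by V and inserts it
+   into the first operand, preserving the V least significant bits of the
+   first operand.  */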
+#define TEST_VSXI1(INSN, Q, T1, T2, W, N, V)				\
+  VECT_VAR (vsXi_vector_res, T1, W, N) = \
+    INSN##Q##_n_##T2##W (VECT_VAR (vsXi_vector, T1, W, N),		\
+		      VECT_VAR (vsXi_vector2, T1, W, N), \
+		      V);						\
+  vst1##Q##_##T2##W (VECT_VAR (result, T1, W, N), \
+		     VECT_VAR (vsXi_vector_res, T1, W, N))
+
+#define TEST_VSXI(INSN, Q, T1, T2, W, N, V)	\
+  TEST_VSXI1 (INSN, Q, T1, T2, W, N, V)
+
+  DECL_VARIABLE (vsXi_vector, poly, 64, 1);
+  DECL_VARIABLE (vsXi_vector, poly, 64, 2);
+  DECL_VARIABLE (vsXi_vector2, poly, 64, 1);
+  DECL_VARIABLE (vsXi_vector2, poly, 64, 2);
+  DECL_VARIABLE (vsXi_vector_res, poly, 64, 1);
+  DECL_VARIABLE (vsXi_vector_res, poly, 64, 2);
+
+  CLEAN (result, poly, 64, 1);
+  CLEAN (result, poly, 64, 2);
+
+  VLOAD (vsXi_vector, buffer, , poly, p, 64, 1);
+  VLOAD (vsXi_vector, buffer, q, poly, p, 64, 2);
+
+  VDUP (vsXi_vector2, , poly, p, 64, 1, 2);
+  VDUP (vsXi_vector2, q, poly, p, 64, 2, 3);
+
+  TEST_VSXI (vsli, , poly, p, 64, 1, 3);
+  TEST_VSXI (vsli, q, poly, p, 64, 2, 53);
+
+  CHECK (TEST_MSG, poly, 64, 1, PRIx64, vsli_expected, "");
+  CHECK (TEST_MSG, poly, 64, 2, PRIx64, vsli_expected, "");
+
+  /* Test cases with maximum shift amount.  */
+  CLEAN (result, poly, 64, 1);
+  CLEAN (result, poly, 64, 2);
+
+  TEST_VSXI (vsli, , poly, p, 64, 1, 63);
+  TEST_VSXI (vsli, q, poly, p, 64, 2, 63);
+
+#define COMMENT "(max shift amount)"
+  CHECK (TEST_MSG, poly, 64, 1, PRIx64, vsli_expected_max_shift, COMMENT);
+  CHECK (TEST_MSG, poly, 64, 2, PRIx64, vsli_expected_max_shift, COMMENT);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcombine.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcombine.c
index 5100375e5fe0c1f1f6b1e0cbff549990d73948e3..0c6b25d578102f042c669d9bdeaa15e5a1292267 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcombine.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcombine.c
@@ -1,6 +1,7 @@ 
 #include <arm_neon.h>
 #include "arm-neon-ref.h"
 #include "compute-ref-data.h"
+/* { dg-additional-options "-march=armv8-a+crypto" { target aarch64*-*-* } } */
 
 /* Expected results.  */
 VECT_VAR_DECL(expected,int,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
@@ -25,6 +26,9 @@  VECT_VAR_DECL(expected,poly,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					 0x55, 0x55, 0x55, 0x55 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
 					 0x66, 0x66, 0x66, 0x66 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected,poly,64,2) [] = { 0xfffffffffffffff0, 0x77 };
+#endif
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
 					   0x40533333, 0x40533333 };
 VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80,
@@ -62,6 +66,9 @@  void exec_vcombine (void)
   VDUP(vector64_b, , uint, u, 64, 1, 0x88);
   VDUP(vector64_b, , poly, p, 8, 8, 0x55);
   VDUP(vector64_b, , poly, p, 16, 4, 0x66);
+#if defined (__ARM_FEATURE_CRYPTO)
+  VDUP(vector64_b, , poly, p, 64, 1, 0x77);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   VDUP(vector64_b, , float, f, 16, 4, 2.25);
 #endif
@@ -80,6 +87,9 @@  void exec_vcombine (void)
   TEST_VCOMBINE(uint, u, 64, 1, 2);
   TEST_VCOMBINE(poly, p, 8, 8, 16);
   TEST_VCOMBINE(poly, p, 16, 4, 8);
+#if defined (__ARM_FEATURE_CRYPTO)
+  TEST_VCOMBINE(poly, p, 64, 1, 2);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   TEST_VCOMBINE(float, f, 16, 4, 8);
 #endif
@@ -95,6 +105,9 @@  void exec_vcombine (void)
   CHECK(TEST_MSG, uint, 64, 2, PRIx64, expected, "");
   CHECK(TEST_MSG, poly, 8, 16, PRIx8, expected, "");
   CHECK(TEST_MSG, poly, 16, 8, PRIx16, expected, "");
+#if defined (__ARM_FEATURE_CRYPTO)
+  CHECK(TEST_MSG, poly, 64, 2, PRIx64, expected, "");
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   CHECK_FP(TEST_MSG, float, 16, 8, PRIx16, expected, "");
 #endif
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcreate.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcreate.c
index b8b338ef3c06ff2489b525e22760cbaed1fda335..d6d3bba39523e9e9f91b4fe80065682ea01bd6b8 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcreate.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vcreate.c
@@ -1,6 +1,7 @@ 
 #include <arm_neon.h>
 #include "arm-neon-ref.h"
 #include "compute-ref-data.h"
+/* { dg-additional-options "-march=armv8-a+crypto" { target aarch64*-*-* } } */
 
 /* Expected results.  */
 VECT_VAR_DECL(expected,int,8,8) [] = { 0xf0, 0xde, 0xbc, 0x9a,
@@ -16,6 +17,9 @@  VECT_VAR_DECL(expected,uint,64,1) [] = { 0x123456789abcdef0 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf0, 0xde, 0xbc, 0x9a,
 					0x78, 0x56, 0x34, 0x12 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0xdef0, 0x9abc, 0x5678, 0x1234 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected,poly,64,1) [] = { 0x123456789abcdef0 };
+#endif
 VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0xdef0, 0x9abc, 0x5678, 0x1234 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x9abcdef0, 0x12345678 };
 
@@ -49,6 +53,9 @@  FNNAME (INSN_NAME)
   DECL_VAL(val, uint, 64, 1);
   DECL_VAL(val, poly, 8, 8);
   DECL_VAL(val, poly, 16, 4);
+#if defined (__ARM_FEATURE_CRYPTO)
+  DECL_VAL(val, poly, 64, 1);
+#endif
 
   DECL_VARIABLE(vector_res, int, 8, 8);
   DECL_VARIABLE(vector_res, int, 16, 4);
@@ -64,6 +71,9 @@  FNNAME (INSN_NAME)
   DECL_VARIABLE(vector_res, uint, 64, 1);
   DECL_VARIABLE(vector_res, poly, 8, 8);
   DECL_VARIABLE(vector_res, poly, 16, 4);
+#if defined (__ARM_FEATURE_CRYPTO)
+  DECL_VARIABLE(vector_res, poly, 64, 1);
+#endif
 
   clean_results ();
 
@@ -82,6 +92,9 @@  FNNAME (INSN_NAME)
   VECT_VAR(val, uint, 64, 1) = 0x123456789abcdef0ULL;
   VECT_VAR(val, poly, 8, 8) = 0x123456789abcdef0ULL;
   VECT_VAR(val, poly, 16, 4) = 0x123456789abcdef0ULL;
+#if defined (__ARM_FEATURE_CRYPTO)
+  VECT_VAR(val, poly, 64, 1) = 0x123456789abcdef0ULL;
+#endif
 
   TEST_VCREATE(int, s, 8, 8);
   TEST_VCREATE(int, s, 16, 4);
@@ -97,6 +110,9 @@  FNNAME (INSN_NAME)
   TEST_VCREATE(uint, u, 64, 1);
   TEST_VCREATE(poly, p, 8, 8);
   TEST_VCREATE(poly, p, 16, 4);
+#if defined (__ARM_FEATURE_CRYPTO)
+  TEST_VCREATE(poly, p, 64, 1);
+#endif
 
   CHECK(TEST_MSG, int, 8, 8, PRIx8, expected, "");
   CHECK(TEST_MSG, int, 16, 4, PRIx16, expected, "");
@@ -108,6 +124,9 @@  FNNAME (INSN_NAME)
   CHECK(TEST_MSG, uint, 64, 1, PRIx64, expected, "");
   CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
   CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected, "");
+#if defined (__ARM_FEATURE_CRYPTO)
+  CHECK(TEST_MSG, poly, 64, 1, PRIx64, expected, "");
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, "");
 #endif
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c
index 22d45d56c8e3c517da8cc595f767f8034aeabde8..29b7c7dd016a596482b64008b384ac5c1d50a69c 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup-vmov.c
@@ -1,6 +1,7 @@ 
 #include <arm_neon.h>
 #include "arm-neon-ref.h"
 #include "compute-ref-data.h"
+/* { dg-additional-options "-march=armv8-a+crypto" { target aarch64*-*-* } } */
 
 /* We test vdup and vmov in the same place since they are aliases.  */
 
@@ -19,6 +20,11 @@  VECT_VAR_DECL(expected0,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected0,poly,8,8) [] = { 0xf0, 0xf0, 0xf0, 0xf0,
 					 0xf0, 0xf0, 0xf0, 0xf0 };
 VECT_VAR_DECL(expected0,poly,16,4) [] = { 0xfff0, 0xfff0, 0xfff0, 0xfff0 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected0,poly,64,1) [] = { 0xfffffffffffffff0 };
+VECT_VAR_DECL(expected0,poly,64,2) [] = { 0xfffffffffffffff0,
+					  0xfffffffffffffff0 };
+#endif
 VECT_VAR_DECL(expected0,hfloat,32,2) [] = { 0xc1800000, 0xc1800000 };
 VECT_VAR_DECL(expected0,int,8,16) [] = { 0xf0, 0xf0, 0xf0, 0xf0,
 					 0xf0, 0xf0, 0xf0, 0xf0,
@@ -63,6 +69,11 @@  VECT_VAR_DECL(expected1,uint,64,1) [] = { 0xfffffffffffffff1 };
 VECT_VAR_DECL(expected1,poly,8,8) [] = { 0xf1, 0xf1, 0xf1, 0xf1,
 					 0xf1, 0xf1, 0xf1, 0xf1 };
 VECT_VAR_DECL(expected1,poly,16,4) [] = { 0xfff1, 0xfff1, 0xfff1, 0xfff1 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected1,poly,64,1) [] = { 0xfffffffffffffff1 };
+VECT_VAR_DECL(expected1,poly,64,2) [] = { 0xfffffffffffffff1,
+					  0xfffffffffffffff1 };
+#endif
 VECT_VAR_DECL(expected1,hfloat,32,2) [] = { 0xc1700000, 0xc1700000 };
 VECT_VAR_DECL(expected1,int,8,16) [] = { 0xf1, 0xf1, 0xf1, 0xf1,
 					 0xf1, 0xf1, 0xf1, 0xf1,
@@ -107,6 +118,11 @@  VECT_VAR_DECL(expected2,uint,64,1) [] = { 0xfffffffffffffff2 };
 VECT_VAR_DECL(expected2,poly,8,8) [] = { 0xf2, 0xf2, 0xf2, 0xf2,
 					 0xf2, 0xf2, 0xf2, 0xf2 };
 VECT_VAR_DECL(expected2,poly,16,4) [] = { 0xfff2, 0xfff2, 0xfff2, 0xfff2 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected2,poly,64,1) [] = { 0xfffffffffffffff2 };
+VECT_VAR_DECL(expected2,poly,64,2) [] = { 0xfffffffffffffff2,
+					  0xfffffffffffffff2 };
+#endif
 VECT_VAR_DECL(expected2,hfloat,32,2) [] = { 0xc1600000, 0xc1600000 };
 VECT_VAR_DECL(expected2,int,8,16) [] = { 0xf2, 0xf2, 0xf2, 0xf2,
 					 0xf2, 0xf2, 0xf2, 0xf2,
@@ -171,6 +187,9 @@  void exec_vdup_vmov (void)
     TEST_VDUP(, uint, u, 64, 1);
     TEST_VDUP(, poly, p, 8, 8);
     TEST_VDUP(, poly, p, 16, 4);
+#if defined (__ARM_FEATURE_CRYPTO)
+    TEST_VDUP(, poly, p, 64, 1);
+#endif
     TEST_VDUP(, float, f, 32, 2);
 
     TEST_VDUP(q, int, s, 8, 16);
@@ -183,9 +202,13 @@  void exec_vdup_vmov (void)
     TEST_VDUP(q, uint, u, 64, 2);
     TEST_VDUP(q, poly, p, 8, 16);
     TEST_VDUP(q, poly, p, 16, 8);
+#if defined (__ARM_FEATURE_CRYPTO)
+    TEST_VDUP(q, poly, p, 64, 2);
+#endif
     TEST_VDUP(q, float, f, 32, 4);
 
-    switch (i) {
+    switch (i)
+    {
     case 0:
       CHECK_RESULTS_NAMED_NO_FP16 (TEST_MSG, expected0, "");
       break;
@@ -203,7 +226,8 @@  void exec_vdup_vmov (void)
   /* Do the same tests with vmov. Use the same expected results.  */
 #undef TEST_MSG
 #define TEST_MSG "VMOV/VMOVQ"
-  for (i=0; i< 3; i++) {
+  for (i=0; i< 3; i++)
+  {
     clean_results ();
 
     TEST_VMOV(, int, s, 8, 8);
@@ -216,6 +240,9 @@  void exec_vdup_vmov (void)
     TEST_VMOV(, uint, u, 64, 1);
     TEST_VMOV(, poly, p, 8, 8);
     TEST_VMOV(, poly, p, 16, 4);
+#if defined (__ARM_FEATURE_CRYPTO)
+    TEST_VMOV(, poly, p, 64, 1);
+#endif
     TEST_VMOV(, float, f, 32, 2);
 
     TEST_VMOV(q, int, s, 8, 16);
@@ -228,9 +255,13 @@  void exec_vdup_vmov (void)
     TEST_VMOV(q, uint, u, 64, 2);
     TEST_VMOV(q, poly, p, 8, 16);
     TEST_VMOV(q, poly, p, 16, 8);
+#if defined (__ARM_FEATURE_CRYPTO)
+    TEST_VMOV(q, poly, p, 64, 2);
+#endif
     TEST_VMOV(q, float, f, 32, 4);
 
-    switch (i) {
+    switch (i)
+    {
     case 0:
       CHECK_RESULTS_NAMED_NO_FP16 (TEST_MSG, expected0, "");
       break;
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup_lane.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup_lane.c
index ef708dcba17e68ccc0e3540c52bb6507d0562fad..38e98f0a97668e4aa6102b203129f76952717967 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup_lane.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdup_lane.c
@@ -1,6 +1,7 @@ 
 #include <arm_neon.h>
 #include "arm-neon-ref.h"
 #include "compute-ref-data.h"
+/* { dg-additional-options "-march=armv8-a+crypto" { target aarch64*-*-* } } */
 
 /* Expected results.  */
 VECT_VAR_DECL(expected,int,8,8) [] = { 0xf1, 0xf1, 0xf1, 0xf1,
@@ -27,6 +28,10 @@  VECT_VAR_DECL(expected,int,32,4) [] = { 0xfffffff1, 0xfffffff1,
 					0xfffffff1, 0xfffffff1 };
 VECT_VAR_DECL(expected,int,64,2) [] = { 0xfffffffffffffff0,
 					0xfffffffffffffff0 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected,poly,64,2) [] = { 0xfffffffffffffff0,
+					0xfffffffffffffff0 };
+#endif
 VECT_VAR_DECL(expected,uint,8,16) [] = { 0xf5, 0xf5, 0xf5, 0xf5,
 					 0xf5, 0xf5, 0xf5, 0xf5,
 					 0xf5, 0xf5, 0xf5, 0xf5,
@@ -43,6 +48,9 @@  VECT_VAR_DECL(expected,poly,8,16) [] = { 0xf5, 0xf5, 0xf5, 0xf5,
 					 0xf5, 0xf5, 0xf5, 0xf5 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff1, 0xfff1, 0xfff1, 0xfff1,
 					 0xfff1, 0xfff1, 0xfff1, 0xfff1 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected,poly,64,1) [] = { 0xfffffffffffffff0 };
+#endif
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1700000, 0xc1700000,
 					   0xc1700000, 0xc1700000 };
 
@@ -76,6 +84,9 @@  void exec_vdup_lane (void)
   TEST_VDUP_LANE(, uint, u, 64, 1, 1, 0);
   TEST_VDUP_LANE(, poly, p, 8, 8, 8, 7);
   TEST_VDUP_LANE(, poly, p, 16, 4, 4, 3);
+#if defined (__ARM_FEATURE_CRYPTO)
+  TEST_VDUP_LANE(, poly, p, 64, 1, 1, 0);
+#endif
   TEST_VDUP_LANE(, float, f, 32, 2, 2, 1);
 
   TEST_VDUP_LANE(q, int, s, 8, 16, 8, 2);
@@ -88,6 +99,9 @@  void exec_vdup_lane (void)
   TEST_VDUP_LANE(q, uint, u, 64, 2, 1, 0);
   TEST_VDUP_LANE(q, poly, p, 8, 16, 8, 5);
   TEST_VDUP_LANE(q, poly, p, 16, 8, 4, 1);
+#if defined (__ARM_FEATURE_CRYPTO)
+  TEST_VDUP_LANE(q, poly, p, 64, 2, 1, 0);
+#endif
   TEST_VDUP_LANE(q, float, f, 32, 4, 2, 1);
 
   CHECK_RESULTS_NO_FP16 (TEST_MSG, "");
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_high.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_high.c
index 9f0a1687f189bc2dfbe111c7f5c3b96c9acecd52..8c9f52e39aadcfb37fed3c3cefc5fef941ca5314 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_high.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_high.c
@@ -1,6 +1,7 @@ 
 #include <arm_neon.h>
 #include "arm-neon-ref.h"
 #include "compute-ref-data.h"
+/* { dg-additional-options "-march=armv8-a+crypto" { target aarch64*-*-* } } */
 
 /* Expected results.  */
 VECT_VAR_DECL(expected,int,8,8) [] = { 0xf8, 0xf9, 0xfa, 0xfb,
@@ -16,6 +17,9 @@  VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff1 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf8, 0xf9, 0xfa, 0xfb,
 					0xfc, 0xfd, 0xfe, 0xff };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff4, 0xfff5, 0xfff6, 0xfff7 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected,poly,64,1) [] = { 0xfffffffffffffff1 };
+#endif
 VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0xca00, 0xc980, 0xc900, 0xc880 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 };
 
@@ -50,6 +54,9 @@  void exec_vget_high (void)
   TEST_VGET_HIGH(uint, u, 64, 1, 2);
   TEST_VGET_HIGH(poly, p, 8, 8, 16);
   TEST_VGET_HIGH(poly, p, 16, 4, 8);
+#if defined (__ARM_FEATURE_CRYPTO)
+  TEST_VGET_HIGH(poly, p, 64, 1, 2);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   TEST_VGET_HIGH(float, f, 16, 4, 8);
 #endif
@@ -65,6 +72,7 @@  void exec_vget_high (void)
   CHECK(TEST_MSG, uint, 64, 1, PRIx64, expected, "");
   CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
   CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected, "");
+  CHECK_CRYPTO(TEST_MSG, poly, 64, 1, PRIx64, expected, "");
   CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected, "");
 }
 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_lane.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_lane.c
index ee6d6503ad622c936be2f6c7468db845398a6c2e..7cd8955e1439efbc6ca8df35a0e1532d2ef6d8fc 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_lane.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_lane.c
@@ -1,6 +1,7 @@ 
 #include <arm_neon.h>
 #include "arm-neon-ref.h"
 #include "compute-ref-data.h"
+/* { dg-additional-options "-march=armv8-a+crypto" { target aarch64*-*-* } } */
 
 /* Expected results.  */
 int8_t     expected_s8   = 0xf7;
@@ -13,6 +14,9 @@  uint32_t   expected_u32  = 0xfffffff1;
 uint64_t   expected_u64  = 0xfffffffffffffff0;
 poly8_t    expected_p8   = 0xf6;
 poly16_t   expected_p16  = 0xfff2;
+#if defined (__ARM_FEATURE_CRYPTO)
+poly64_t   expected_p64  = 0xfffffffffffffff0;
+#endif
 hfloat16_t expected_f16  = 0xcb80;
 hfloat32_t expected_f32  = 0xc1700000;
 
@@ -26,6 +30,9 @@  uint32_t   expectedq_u32 = 0xfffffff2;
 uint64_t   expectedq_u64 = 0xfffffffffffffff1;
 poly8_t    expectedq_p8  = 0xfe;
 poly16_t   expectedq_p16 = 0xfff6;
+#if defined (__ARM_FEATURE_CRYPTO)
+poly64_t   expectedq_p64 = 0xfffffffffffffff1;
+#endif
 hfloat16_t expectedq_f16 = 0xca80;
 hfloat32_t expectedq_f32 = 0xc1500000;
 
@@ -89,6 +96,9 @@  void exec_vget_lane (void)
   VAR_DECL(var, uint, 64);
   VAR_DECL(var, poly, 8);
   VAR_DECL(var, poly, 16);
+#if defined (__ARM_FEATURE_CRYPTO)
+  VAR_DECL(var, poly, 64);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   VAR_DECL(var, float, 16);
 #endif
@@ -114,6 +124,9 @@  void exec_vget_lane (void)
   TEST_VGET_LANE(, uint, u, 64, 1, 0);
   TEST_VGET_LANE(, poly, p, 8, 8, 6);
   TEST_VGET_LANE(, poly, p, 16, 4, 2);
+#if defined (__ARM_FEATURE_CRYPTO)
+  TEST_VGET_LANE(, poly, p, 64, 1, 0);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   TEST_VGET_LANE_FP(, float, f, 16, 4, 1);
 #endif
@@ -129,6 +142,9 @@  void exec_vget_lane (void)
   TEST_VGET_LANE(q, uint, u, 64, 2, 1);
   TEST_VGET_LANE(q, poly, p, 8, 16, 14);
   TEST_VGET_LANE(q, poly, p, 16, 8, 6);
+#if defined (__ARM_FEATURE_CRYPTO)
+  TEST_VGET_LANE(q, poly, p, 64, 2, 1);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   TEST_VGET_LANE_FP(q, float, f, 16, 8, 3);
 #endif
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_low.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_low.c
index 2b875b9b7b86d850647cf977086f336b932cfb0b..6a67baa6c64ae59b6d454d6c97b9a219e2610490 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_low.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vget_low.c
@@ -1,6 +1,7 @@ 
 #include <arm_neon.h>
 #include "arm-neon-ref.h"
 #include "compute-ref-data.h"
+/* { dg-additional-options "-march=armv8-a+crypto" { target aarch64*-*-* } } */
 
 /* Expected results.  */
 VECT_VAR_DECL(expected,int,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
@@ -16,6 +17,9 @@  VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					0xf4, 0xf5, 0xf6, 0xf7 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected,poly,64,1) [] = { 0xfffffffffffffff0 };
+#endif
 VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 
@@ -50,6 +54,9 @@  void exec_vget_low (void)
   TEST_VGET_LOW(uint, u, 64, 1, 2);
   TEST_VGET_LOW(poly, p, 8, 8, 16);
   TEST_VGET_LOW(poly, p, 16, 4, 8);
+#if defined (__ARM_FEATURE_CRYPTO)
+  TEST_VGET_LOW(poly, p, 64, 1, 2);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   TEST_VGET_LOW(float, f, 16, 4, 8);
 #endif
@@ -65,6 +72,9 @@  void exec_vget_low (void)
   CHECK(TEST_MSG, uint, 64, 1, PRIx64, expected, "");
   CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected, "");
   CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected, "");
+#if defined (__ARM_FEATURE_CRYPTO)
+  CHECK(TEST_MSG, poly, 64, 1, PRIx64, expected, "");
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   CHECK_FP(TEST_MSG, float, 16, 4, PRIx16, expected, "");
 #endif
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1.c
index 4ed0e464f9ce6b0f599a6a72d3f49db5ac9a0374..96cf06be923efa47e7977d02a8ad63ce2e6cba1f 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1.c
@@ -1,6 +1,7 @@ 
 #include <arm_neon.h>
 #include "arm-neon-ref.h"
 #include "compute-ref-data.h"
+/* { dg-additional-options "-march=armv8-a+crypto" { target aarch64*-*-* } } */
 
 /* Expected results.  */
 VECT_VAR_DECL(expected,int,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
@@ -16,6 +17,9 @@  VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					0xf4, 0xf5, 0xf6, 0xf7 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected,poly,64,1) [] = { 0xfffffffffffffff0 };
+#endif
 VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
@@ -33,7 +37,7 @@  VECT_VAR_DECL(expected,uint,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					 0xf8, 0xf9, 0xfa, 0xfb,
 					 0xfc, 0xfd, 0xfe, 0xff };
 VECT_VAR_DECL(expected,uint,16,8) [] = { 0xfff0, 0xfff1, 0xfff2,
-					 0xfff3, 0xfff4, 0xfff5,
+					 0xfff3, 0xfff4, 0xfff5,
 					 0xfff6, 0xfff7 };
 VECT_VAR_DECL(expected,uint,32,4) [] = { 0xfffffff0, 0xfffffff1,
 					 0xfffffff2, 0xfffffff3 };
@@ -45,6 +49,10 @@  VECT_VAR_DECL(expected,poly,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					 0xfc, 0xfd, 0xfe, 0xff };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
 					 0xfff4, 0xfff5, 0xfff6, 0xfff7 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected,poly,64,2) [] = { 0xfffffffffffffff0,
+					 0xfffffffffffffff1 };
+#endif
 VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80,
 					   0xca00, 0xc980, 0xc900, 0xc880 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1800000, 0xc1700000,
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1_dup.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1_dup.c
index 34be214e9122c5701a341b09479443fdb5f2716b..62585e8371fd1c738d0285a7997cc3bb1ab30948 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1_dup.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld1_dup.c
@@ -1,6 +1,7 @@ 
 #include <arm_neon.h>
 #include "arm-neon-ref.h"
 #include "compute-ref-data.h"
+/* { dg-additional-options "-march=armv8-a+crypto" { target aarch64*-*-* } } */
 
 /* Expected results.  */
 /* Chunk 0.  */
@@ -17,6 +18,9 @@  VECT_VAR_DECL(expected0,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected0,poly,8,8) [] = { 0xf0, 0xf0, 0xf0, 0xf0,
 					 0xf0, 0xf0, 0xf0, 0xf0 };
 VECT_VAR_DECL(expected0,poly,16,4) [] = { 0xfff0, 0xfff0, 0xfff0, 0xfff0 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected0,poly,64,1) [] = { 0xfffffffffffffff0 };
+#endif
 VECT_VAR_DECL(expected0,hfloat,16,4) [] = { 0xcc00, 0xcc00, 0xcc00, 0xcc00 };
 VECT_VAR_DECL(expected0,hfloat,32,2) [] = { 0xc1800000, 0xc1800000 };
 VECT_VAR_DECL(expected0,int,8,16) [] = { 0xf0, 0xf0, 0xf0, 0xf0,
@@ -45,6 +49,10 @@  VECT_VAR_DECL(expected0,poly,8,16) [] = { 0xf0, 0xf0, 0xf0, 0xf0,
 					  0xf0, 0xf0, 0xf0, 0xf0 };
 VECT_VAR_DECL(expected0,poly,16,8) [] = { 0xfff0, 0xfff0, 0xfff0, 0xfff0,
 					  0xfff0, 0xfff0, 0xfff0, 0xfff0 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected0,poly,64,2) [] = { 0xfffffffffffffff0,
+					  0xfffffffffffffff0 };
+#endif
 VECT_VAR_DECL(expected0,hfloat,16,8) [] = { 0xcc00, 0xcc00, 0xcc00, 0xcc00,
 					    0xcc00, 0xcc00, 0xcc00, 0xcc00 };
 VECT_VAR_DECL(expected0,hfloat,32,4) [] = { 0xc1800000, 0xc1800000,
@@ -64,6 +72,9 @@  VECT_VAR_DECL(expected1,uint,64,1) [] = { 0xfffffffffffffff1 };
 VECT_VAR_DECL(expected1,poly,8,8) [] = { 0xf1, 0xf1, 0xf1, 0xf1,
 					 0xf1, 0xf1, 0xf1, 0xf1 };
 VECT_VAR_DECL(expected1,poly,16,4) [] = { 0xfff1, 0xfff1, 0xfff1, 0xfff1 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected1,poly,64,1) [] = { 0xfffffffffffffff1 };
+#endif
 VECT_VAR_DECL(expected1,hfloat,16,4) [] = { 0xcb80, 0xcb80, 0xcb80, 0xcb80 };
 VECT_VAR_DECL(expected1,hfloat,32,2) [] = { 0xc1700000, 0xc1700000 };
 VECT_VAR_DECL(expected1,int,8,16) [] = { 0xf1, 0xf1, 0xf1, 0xf1,
@@ -92,6 +103,10 @@  VECT_VAR_DECL(expected1,poly,8,16) [] = { 0xf1, 0xf1, 0xf1, 0xf1,
 					  0xf1, 0xf1, 0xf1, 0xf1 };
 VECT_VAR_DECL(expected1,poly,16,8) [] = { 0xfff1, 0xfff1, 0xfff1, 0xfff1,
 					  0xfff1, 0xfff1, 0xfff1, 0xfff1 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected1,poly,64,2) [] = { 0xfffffffffffffff1,
+					  0xfffffffffffffff1 };
+#endif
 VECT_VAR_DECL(expected1,hfloat,16,8) [] = { 0xcb80, 0xcb80, 0xcb80, 0xcb80,
 					    0xcb80, 0xcb80, 0xcb80, 0xcb80 };
 VECT_VAR_DECL(expected1,hfloat,32,4) [] = { 0xc1700000, 0xc1700000,
@@ -111,6 +126,9 @@  VECT_VAR_DECL(expected2,uint,64,1) [] = { 0xfffffffffffffff2 };
 VECT_VAR_DECL(expected2,poly,8,8) [] = { 0xf2, 0xf2, 0xf2, 0xf2,
 					 0xf2, 0xf2, 0xf2, 0xf2 };
 VECT_VAR_DECL(expected2,poly,16,4) [] = { 0xfff2, 0xfff2, 0xfff2, 0xfff2 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected2,poly,64,1) [] = { 0xfffffffffffffff2 };
+#endif
 VECT_VAR_DECL(expected2,hfloat,16,4) [] = { 0xcb00, 0xcb00, 0xcb00, 0xcb00 };
 VECT_VAR_DECL(expected2,hfloat,32,2) [] = { 0xc1600000, 0xc1600000 };
 VECT_VAR_DECL(expected2,int,8,16) [] = { 0xf2, 0xf2, 0xf2, 0xf2,
@@ -139,6 +157,10 @@  VECT_VAR_DECL(expected2,poly,8,16) [] = { 0xf2, 0xf2, 0xf2, 0xf2,
 					  0xf2, 0xf2, 0xf2, 0xf2 };
 VECT_VAR_DECL(expected2,poly,16,8) [] = { 0xfff2, 0xfff2, 0xfff2, 0xfff2,
 					  0xfff2, 0xfff2, 0xfff2, 0xfff2 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected2,poly,64,2) [] = { 0xfffffffffffffff2,
+					  0xfffffffffffffff2 };
+#endif
 VECT_VAR_DECL(expected2,hfloat,16,8) [] = { 0xcb00, 0xcb00, 0xcb00, 0xcb00,
 					    0xcb00, 0xcb00, 0xcb00, 0xcb00 };
 VECT_VAR_DECL(expected2,hfloat,32,4) [] = { 0xc1600000, 0xc1600000,
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX.c
index e1e8562ac6be424e638d11a90aeb406116abca24..383ff67f9666479c4e9378fd7234737fd3f3a763 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX.c
@@ -1,6 +1,7 @@ 
 #include <arm_neon.h>
 #include "arm-neon-ref.h"
 #include "compute-ref-data.h"
+/* { dg-additional-options "-march=armv8-a+crypto" { target aarch64*-*-* } } */
 
 /* Expected results.  */
 
@@ -18,6 +19,11 @@  VECT_VAR_DECL(expected_vld2_0,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected_vld2_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					       0xf4, 0xf5, 0xf6, 0xf7 };
 VECT_VAR_DECL(expected_vld2_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_vld2_0,poly,64,1) [] = { 0xfffffffffffffff0 };
+VECT_VAR_DECL(expected_vld2_0,poly,64,2) [] = { 0xfffffffffffffff0,
+						0xfffffffffffffff1 };
+#endif
 VECT_VAR_DECL(expected_vld2_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 VECT_VAR_DECL(expected_vld2_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected_vld2_0,int,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
@@ -61,6 +67,11 @@  VECT_VAR_DECL(expected_vld2_1,uint,64,1) [] = { 0xfffffffffffffff1 };
 VECT_VAR_DECL(expected_vld2_1,poly,8,8) [] = { 0xf8, 0xf9, 0xfa, 0xfb,
 					       0xfc, 0xfd, 0xfe, 0xff };
 VECT_VAR_DECL(expected_vld2_1,poly,16,4) [] = { 0xfff4, 0xfff5, 0xfff6, 0xfff7 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_vld2_1,poly,64,1) [] = { 0xfffffffffffffff1 };
+VECT_VAR_DECL(expected_vld2_1,poly,64,2) [] = { 0xfffffffffffffff2,
+						0xfffffffffffffff3 };
+#endif
 VECT_VAR_DECL(expected_vld2_1,hfloat,16,4) [] = { 0xca00, 0xc980, 0xc900, 0xc880 };
 VECT_VAR_DECL(expected_vld2_1,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 };
 VECT_VAR_DECL(expected_vld2_1,int,8,16) [] = { 0x0, 0x1, 0x2, 0x3,
@@ -104,6 +115,11 @@  VECT_VAR_DECL(expected_vld3_0,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected_vld3_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					       0xf4, 0xf5, 0xf6, 0xf7 };
 VECT_VAR_DECL(expected_vld3_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_vld3_0,poly,64,1) [] = { 0xfffffffffffffff0 };
+VECT_VAR_DECL(expected_vld3_0,poly,64,2) [] = { 0xfffffffffffffff0,
+						0xfffffffffffffff1 };
+#endif
 VECT_VAR_DECL(expected_vld3_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 VECT_VAR_DECL(expected_vld3_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected_vld3_0,int,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
@@ -147,6 +163,11 @@  VECT_VAR_DECL(expected_vld3_1,uint,64,1) [] = { 0xfffffffffffffff1 };
 VECT_VAR_DECL(expected_vld3_1,poly,8,8) [] = { 0xf8, 0xf9, 0xfa, 0xfb,
 					       0xfc, 0xfd, 0xfe, 0xff };
 VECT_VAR_DECL(expected_vld3_1,poly,16,4) [] = { 0xfff4, 0xfff5, 0xfff6, 0xfff7 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_vld3_1,poly,64,1) [] = { 0xfffffffffffffff1 };
+VECT_VAR_DECL(expected_vld3_1,poly,64,2) [] = { 0xfffffffffffffff2,
+						0xfffffffffffffff3 };
+#endif
 VECT_VAR_DECL(expected_vld3_1,hfloat,16,4) [] = { 0xca00, 0xc980, 0xc900, 0xc880 };
 VECT_VAR_DECL(expected_vld3_1,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 };
 VECT_VAR_DECL(expected_vld3_1,int,8,16) [] = { 0x0, 0x1, 0x2, 0x3,
@@ -193,6 +214,11 @@  VECT_VAR_DECL(expected_vld3_2,poly,8,8) [] = { 0x0, 0x1, 0x2, 0x3,
 					       0x4, 0x5, 0x6, 0x7 };
 VECT_VAR_DECL(expected_vld3_2,poly,16,4) [] = { 0xfff8, 0xfff9,
 						0xfffa, 0xfffb };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_vld3_2,poly,64,1) [] = { 0xfffffffffffffff2 };
+VECT_VAR_DECL(expected_vld3_2,poly,64,2) [] = { 0xfffffffffffffff4,
+						0xfffffffffffffff5 };
+#endif
 VECT_VAR_DECL(expected_vld3_2,hfloat,16,4) [] = { 0xc800, 0xc700, 0xc600, 0xc500 };
 VECT_VAR_DECL(expected_vld3_2,hfloat,32,2) [] = { 0xc1400000, 0xc1300000 };
 VECT_VAR_DECL(expected_vld3_2,int,8,16) [] = { 0x10, 0x11, 0x12, 0x13,
@@ -238,6 +264,11 @@  VECT_VAR_DECL(expected_vld4_0,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected_vld4_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					       0xf4, 0xf5, 0xf6, 0xf7 };
 VECT_VAR_DECL(expected_vld4_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_vld4_0,poly,64,1) [] = { 0xfffffffffffffff0 };
+VECT_VAR_DECL(expected_vld4_0,poly,64,2) [] = { 0xfffffffffffffff0,
+						0xfffffffffffffff1 };
+#endif
 VECT_VAR_DECL(expected_vld4_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 VECT_VAR_DECL(expected_vld4_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected_vld4_0,int,8,16) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
@@ -281,6 +312,11 @@  VECT_VAR_DECL(expected_vld4_1,uint,64,1) [] = { 0xfffffffffffffff1 };
 VECT_VAR_DECL(expected_vld4_1,poly,8,8) [] = { 0xf8, 0xf9, 0xfa, 0xfb,
 					       0xfc, 0xfd, 0xfe, 0xff };
 VECT_VAR_DECL(expected_vld4_1,poly,16,4) [] = { 0xfff4, 0xfff5, 0xfff6, 0xfff7 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_vld4_1,poly,64,1) [] = { 0xfffffffffffffff1 };
+VECT_VAR_DECL(expected_vld4_1,poly,64,2) [] = { 0xfffffffffffffff2,
+						0xfffffffffffffff3 };
+#endif
 VECT_VAR_DECL(expected_vld4_1,hfloat,16,4) [] = { 0xca00, 0xc980, 0xc900, 0xc880 };
 VECT_VAR_DECL(expected_vld4_1,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 };
 VECT_VAR_DECL(expected_vld4_1,int,8,16) [] = { 0x0, 0x1, 0x2, 0x3,
@@ -324,6 +360,11 @@  VECT_VAR_DECL(expected_vld4_2,uint,64,1) [] = { 0xfffffffffffffff2 };
 VECT_VAR_DECL(expected_vld4_2,poly,8,8) [] = { 0x0, 0x1, 0x2, 0x3,
 					       0x4, 0x5, 0x6, 0x7 };
 VECT_VAR_DECL(expected_vld4_2,poly,16,4) [] = { 0xfff8, 0xfff9, 0xfffa, 0xfffb };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_vld4_2,poly,64,1) [] = { 0xfffffffffffffff2 };
+VECT_VAR_DECL(expected_vld4_2,poly,64,2) [] = { 0xfffffffffffffff4,
+						0xfffffffffffffff5 };
+#endif
 VECT_VAR_DECL(expected_vld4_2,hfloat,16,4) [] = { 0xc800, 0xc700, 0xc600, 0xc500 };
 VECT_VAR_DECL(expected_vld4_2,hfloat,32,2) [] = { 0xc1400000, 0xc1300000 };
 VECT_VAR_DECL(expected_vld4_2,int,8,16) [] = { 0x10, 0x11, 0x12, 0x13,
@@ -367,6 +408,11 @@  VECT_VAR_DECL(expected_vld4_3,uint,64,1) [] = { 0xfffffffffffffff3 };
 VECT_VAR_DECL(expected_vld4_3,poly,8,8) [] = { 0x8, 0x9, 0xa, 0xb,
 					       0xc, 0xd, 0xe, 0xf };
 VECT_VAR_DECL(expected_vld4_3,poly,16,4) [] = { 0xfffc, 0xfffd, 0xfffe, 0xffff };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_vld4_3,poly,64,1) [] = { 0xfffffffffffffff3 };
+VECT_VAR_DECL(expected_vld4_3,poly,64,2) [] = { 0xfffffffffffffff6,
+						0xfffffffffffffff7 };
+#endif
 VECT_VAR_DECL(expected_vld4_3,hfloat,16,4) [] = { 0xc400, 0xc200, 0xc000, 0xbc00 };
 VECT_VAR_DECL(expected_vld4_3,hfloat,32,2) [] = { 0xc1200000, 0xc1100000 };
 VECT_VAR_DECL(expected_vld4_3,int,8,16) [] = { 0x20, 0x21, 0x22, 0x23,
@@ -424,6 +470,19 @@  void exec_vldX (void)
 	 &(VECT_VAR(result_bis_##X, T1, W, N)[Y*N]),	\
 	 sizeof(VECT_VAR(result, T1, W, N)));
 
+#if defined (__ARM_FEATURE_CRYPTO)
+#define DECL_VLDX_CRYPTO(T1, W, N, X) \
+  DECL_VLDX(T1, W, N, X)
+#define TEST_VLDX_CRYPTO(Q, T1, T2, W, N, X) \
+  TEST_VLDX(Q, T1, T2, W, N, X)
+#define TEST_EXTRA_CHUNK_CRYPTO(T1, W, N, X, Y) \
+  TEST_EXTRA_CHUNK(T1, W, N, X, Y)
+#else
+#define DECL_VLDX_CRYPTO(T1, W, N, X)
+#define TEST_VLDX_CRYPTO(Q, T1, T2, W, N, X)
+#define TEST_EXTRA_CHUNK_CRYPTO(T1, W, N, X, Y)
+#endif
+
   /* We need all variants in 64 bits, but there is no 64x2 variant.  */
 #define DECL_ALL_VLDX_NO_FP16(X)		\
   DECL_VLDX(int, 8, 8, X);			\
@@ -436,6 +495,7 @@  void exec_vldX (void)
   DECL_VLDX(uint, 64, 1, X);			\
   DECL_VLDX(poly, 8, 8, X);			\
   DECL_VLDX(poly, 16, 4, X);			\
+  DECL_VLDX_CRYPTO(poly, 64, 1, X);		\
   DECL_VLDX(float, 32, 2, X);			\
   DECL_VLDX(int, 8, 16, X);			\
   DECL_VLDX(int, 16, 8, X);			\
@@ -445,6 +505,7 @@  void exec_vldX (void)
   DECL_VLDX(uint, 32, 4, X);			\
   DECL_VLDX(poly, 8, 16, X);			\
   DECL_VLDX(poly, 16, 8, X);			\
+  DECL_VLDX_CRYPTO(poly, 64, 2, X);		\
   DECL_VLDX(float, 32, 4, X)
 
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
@@ -467,6 +528,7 @@  void exec_vldX (void)
   TEST_VLDX(, uint, u, 64, 1, X);		\
   TEST_VLDX(, poly, p, 8, 8, X);		\
   TEST_VLDX(, poly, p, 16, 4, X);		\
+  TEST_VLDX_CRYPTO(, poly, p, 64, 1, X);	\
   TEST_VLDX(, float, f, 32, 2, X);		\
   TEST_VLDX(q, int, s, 8, 16, X);		\
   TEST_VLDX(q, int, s, 16, 8, X);		\
@@ -476,6 +538,7 @@  void exec_vldX (void)
   TEST_VLDX(q, uint, u, 32, 4, X);		\
   TEST_VLDX(q, poly, p, 8, 16, X);		\
   TEST_VLDX(q, poly, p, 16, 8, X);		\
+  TEST_VLDX_CRYPTO(q, poly, p, 64, 2, X);	\
   TEST_VLDX(q, float, f, 32, 4, X)
 
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
@@ -498,6 +561,7 @@  void exec_vldX (void)
   TEST_EXTRA_CHUNK(uint, 64, 1, X, Y);		\
   TEST_EXTRA_CHUNK(poly, 8, 8, X, Y);		\
   TEST_EXTRA_CHUNK(poly, 16, 4, X, Y);		\
+  TEST_EXTRA_CHUNK_CRYPTO(poly, 64, 1, X, Y);	\
   TEST_EXTRA_CHUNK(float, 32, 2, X, Y);		\
   TEST_EXTRA_CHUNK(int, 8, 16, X, Y);		\
   TEST_EXTRA_CHUNK(int, 16, 8, X, Y);		\
@@ -507,6 +571,7 @@  void exec_vldX (void)
   TEST_EXTRA_CHUNK(uint, 32, 4, X, Y);		\
   TEST_EXTRA_CHUNK(poly, 8, 16, X, Y);		\
   TEST_EXTRA_CHUNK(poly, 16, 8, X, Y);		\
+  TEST_EXTRA_CHUNK_CRYPTO(poly, 64, 2, X, Y);	\
   TEST_EXTRA_CHUNK(float, 32, 4, X, Y)
 
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
@@ -530,6 +595,7 @@  void exec_vldX (void)
     CHECK(test_name, uint, 64, 1, PRIx64, EXPECTED, comment);		\
     CHECK(test_name, poly, 8, 8, PRIx8, EXPECTED, comment);		\
     CHECK(test_name, poly, 16, 4, PRIx16, EXPECTED, comment);		\
+    CHECK_CRYPTO(test_name, poly, 64, 1, PRIx64, EXPECTED, comment);	\
     CHECK_FP(test_name, float, 32, 2, PRIx32, EXPECTED, comment);	\
 									\
     CHECK(test_name, int, 8, 16, PRIx8, EXPECTED, comment);		\
@@ -540,6 +606,7 @@  void exec_vldX (void)
     CHECK(test_name, uint, 32, 4, PRIx32, EXPECTED, comment);		\
     CHECK(test_name, poly, 8, 16, PRIx8, EXPECTED, comment);		\
     CHECK(test_name, poly, 16, 8, PRIx16, EXPECTED, comment);		\
+    CHECK_CRYPTO(test_name, poly, 64, 2, PRIx64, EXPECTED, comment);	\
     CHECK_FP(test_name, float, 32, 4, PRIx32, EXPECTED, comment)
 
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
@@ -580,6 +647,10 @@  void exec_vldX (void)
   PAD(buffer_vld2_pad, poly, 8, 8);
   VECT_ARRAY_INIT2(buffer_vld2, poly, 16, 4);
   PAD(buffer_vld2_pad, poly, 16, 4);
+#if defined (__ARM_FEATURE_CRYPTO)
+  VECT_ARRAY_INIT2(buffer_vld2, poly, 64, 1);
+  PAD(buffer_vld2_pad, poly, 64, 1);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   VECT_ARRAY_INIT2(buffer_vld2, float, 16, 4);
   PAD(buffer_vld2_pad, float, 16, 4);
@@ -607,6 +678,10 @@  void exec_vldX (void)
   PAD(buffer_vld2_pad, poly, 8, 16);
   VECT_ARRAY_INIT2(buffer_vld2, poly, 16, 8);
   PAD(buffer_vld2_pad, poly, 16, 8);
+#if defined (__ARM_FEATURE_CRYPTO)
+  VECT_ARRAY_INIT2(buffer_vld2, poly, 64, 2);
+  PAD(buffer_vld2_pad, poly, 64, 2);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   VECT_ARRAY_INIT2(buffer_vld2, float, 16, 8);
   PAD(buffer_vld2_pad, float, 16, 8);
@@ -635,6 +710,10 @@  void exec_vldX (void)
   PAD(buffer_vld3_pad, poly, 8, 8);
   VECT_ARRAY_INIT3(buffer_vld3, poly, 16, 4);
   PAD(buffer_vld3_pad, poly, 16, 4);
+#if defined (__ARM_FEATURE_CRYPTO)
+  VECT_ARRAY_INIT3(buffer_vld3, poly, 64, 1);
+  PAD(buffer_vld3_pad, poly, 64, 1);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   VECT_ARRAY_INIT3(buffer_vld3, float, 16, 4);
   PAD(buffer_vld3_pad, float, 16, 4);
@@ -662,6 +741,10 @@  void exec_vldX (void)
   PAD(buffer_vld3_pad, poly, 8, 16);
   VECT_ARRAY_INIT3(buffer_vld3, poly, 16, 8);
   PAD(buffer_vld3_pad, poly, 16, 8);
+#if defined (__ARM_FEATURE_CRYPTO)
+  VECT_ARRAY_INIT3(buffer_vld3, poly, 64, 2);
+  PAD(buffer_vld3_pad, poly, 64, 2);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   VECT_ARRAY_INIT3(buffer_vld3, float, 16, 8);
   PAD(buffer_vld3_pad, float, 16, 8);
@@ -690,6 +773,10 @@  void exec_vldX (void)
   PAD(buffer_vld4_pad, poly, 8, 8);
   VECT_ARRAY_INIT4(buffer_vld4, poly, 16, 4);
   PAD(buffer_vld4_pad, poly, 16, 4);
+#if defined (__ARM_FEATURE_CRYPTO)
+  VECT_ARRAY_INIT4(buffer_vld4, poly, 64, 1);
+  PAD(buffer_vld4_pad, poly, 64, 1);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   VECT_ARRAY_INIT4(buffer_vld4, float, 16, 4);
   PAD(buffer_vld4_pad, float, 16, 4);
@@ -717,6 +804,10 @@  void exec_vldX (void)
   PAD(buffer_vld4_pad, poly, 8, 16);
   VECT_ARRAY_INIT4(buffer_vld4, poly, 16, 8);
   PAD(buffer_vld4_pad, poly, 16, 8);
+#if defined (__ARM_FEATURE_CRYPTO)
+  VECT_ARRAY_INIT4(buffer_vld4, poly, 64, 2);
+  PAD(buffer_vld4_pad, poly, 64, 2);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   VECT_ARRAY_INIT4(buffer_vld4, float, 16, 8);
   PAD(buffer_vld4_pad, float, 16, 8);
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_dup.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_dup.c
index b44a987cb5d2f169b633d9c1e862fb782bd65d39..60fdd20f42a19862684c28c6f44db3f6f5642c98 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_dup.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_dup.c
@@ -1,6 +1,7 @@ 
 #include <arm_neon.h>
 #include "arm-neon-ref.h"
 #include "compute-ref-data.h"
+/* { dg-additional-options "-march=armv8-a+crypto" { target aarch64*-*-* } } */
 
 /* Expected results.  */
 
@@ -18,6 +19,9 @@  VECT_VAR_DECL(expected_vld2_0,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected_vld2_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf0, 0xf1,
 					0xf0, 0xf1, 0xf0, 0xf1 };
 VECT_VAR_DECL(expected_vld2_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff0, 0xfff1 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_vld2_0,poly,64,1) [] = { 0xfffffffffffffff0 };
+#endif
 VECT_VAR_DECL(expected_vld2_0,hfloat,16,4) [] = {0xcc00, 0xcb80, 0xcc00, 0xcb80 };
 VECT_VAR_DECL(expected_vld2_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 
@@ -36,6 +40,9 @@  VECT_VAR_DECL(expected_vld2_1,poly,8,8) [] = { 0xf0, 0xf1, 0xf0, 0xf1,
 					       0xf0, 0xf1, 0xf0, 0xf1 };
 VECT_VAR_DECL(expected_vld2_1,poly,16,4) [] = { 0xfff0, 0xfff1,
 						0xfff0, 0xfff1 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_vld2_1,poly,64,1) [] = { 0xfffffffffffffff1 };
+#endif
 VECT_VAR_DECL(expected_vld2_1,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcc00, 0xcb80 };
 VECT_VAR_DECL(expected_vld2_1,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 
@@ -56,6 +63,9 @@  VECT_VAR_DECL(expected_vld3_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf0,
 					       0xf1, 0xf2, 0xf0, 0xf1 };
 VECT_VAR_DECL(expected_vld3_0,poly,16,4) [] = { 0xfff0, 0xfff1,
 						0xfff2, 0xfff0 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_vld3_0,poly,64,1) [] = { 0xfffffffffffffff0 };
+#endif
 VECT_VAR_DECL(expected_vld3_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xcc00 };
 VECT_VAR_DECL(expected_vld3_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 
@@ -76,6 +86,9 @@  VECT_VAR_DECL(expected_vld3_1,poly,8,8) [] = { 0xf2, 0xf0, 0xf1, 0xf2,
 					       0xf0, 0xf1, 0xf2, 0xf0 };
 VECT_VAR_DECL(expected_vld3_1,poly,16,4) [] = { 0xfff1, 0xfff2,
 						0xfff0, 0xfff1 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_vld3_1,poly,64,1) [] = { 0xfffffffffffffff1 };
+#endif
 VECT_VAR_DECL(expected_vld3_1,hfloat,16,4) [] = { 0xcb80, 0xcb00, 0xcc00, 0xcb80 };
 VECT_VAR_DECL(expected_vld3_1,hfloat,32,2) [] = { 0xc1600000, 0xc1800000 };
 
@@ -96,6 +109,9 @@  VECT_VAR_DECL(expected_vld3_2,poly,8,8) [] = { 0xf1, 0xf2, 0xf0, 0xf1,
 					       0xf2, 0xf0, 0xf1, 0xf2 };
 VECT_VAR_DECL(expected_vld3_2,poly,16,4) [] = { 0xfff2, 0xfff0,
 						0xfff1, 0xfff2 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_vld3_2,poly,64,1) [] = { 0xfffffffffffffff2 };
+#endif
 VECT_VAR_DECL(expected_vld3_2,hfloat,16,4) [] = { 0xcb00, 0xcc00, 0xcb80, 0xcb00 };
 VECT_VAR_DECL(expected_vld3_2,hfloat,32,2) [] = { 0xc1700000, 0xc1600000 };
 
@@ -114,6 +130,9 @@  VECT_VAR_DECL(expected_vld4_0,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected_vld4_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					       0xf0, 0xf1, 0xf2, 0xf3 };
 VECT_VAR_DECL(expected_vld4_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_vld4_0,poly,64,1) [] = { 0xfffffffffffffff0 };
+#endif
 VECT_VAR_DECL(expected_vld4_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 VECT_VAR_DECL(expected_vld4_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 
@@ -131,6 +150,9 @@  VECT_VAR_DECL(expected_vld4_1,uint,64,1) [] = { 0xfffffffffffffff1 };
 VECT_VAR_DECL(expected_vld4_1,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					       0xf0, 0xf1, 0xf2, 0xf3 };
 VECT_VAR_DECL(expected_vld4_1,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_vld4_1,poly,64,1) [] = { 0xfffffffffffffff1 };
+#endif
 VECT_VAR_DECL(expected_vld4_1,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 VECT_VAR_DECL(expected_vld4_1,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 };
 
@@ -148,6 +170,9 @@  VECT_VAR_DECL(expected_vld4_2,uint,64,1) [] = { 0xfffffffffffffff2 };
 VECT_VAR_DECL(expected_vld4_2,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					       0xf0, 0xf1, 0xf2, 0xf3 };
 VECT_VAR_DECL(expected_vld4_2,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_vld4_2,poly,64,1) [] = { 0xfffffffffffffff2 };
+#endif
 VECT_VAR_DECL(expected_vld4_2,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 VECT_VAR_DECL(expected_vld4_2,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 
@@ -165,6 +190,9 @@  VECT_VAR_DECL(expected_vld4_3,uint,64,1) [] = { 0xfffffffffffffff3 };
 VECT_VAR_DECL(expected_vld4_3,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					       0xf0, 0xf1, 0xf2, 0xf3 };
 VECT_VAR_DECL(expected_vld4_3,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_vld4_3,poly,64,1) [] = { 0xfffffffffffffff3 };
+#endif
 VECT_VAR_DECL(expected_vld4_3,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 VECT_VAR_DECL(expected_vld4_3,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 };
 
@@ -197,6 +225,16 @@  void exec_vldX_dup (void)
 	 &(VECT_VAR(result_bis_##X, T1, W, N)[Y*N]),	\
 	 sizeof(VECT_VAR(result, T1, W, N)));
 
+#if defined (__ARM_FEATURE_CRYPTO)
+#define TEST_VLDX_DUP_CRYPTO(Q, T1, T2, W, N, X) TEST_VLDX_DUP(Q, T1, T2, W, N, X)
+#define TEST_EXTRA_CHUNK_CRYPTO(T1, W, N, X, Y) TEST_EXTRA_CHUNK(T1, W, N, X, Y)
+#define DECL_VLDX_DUP_CRYPTO(T1, W, N, X) DECL_VLDX_DUP(T1, W, N, X)
+#else
+#define TEST_VLDX_DUP_CRYPTO(Q, T1, T2, W, N, X)
+#define TEST_EXTRA_CHUNK_CRYPTO(T1, W, N, X, Y)
+#define DECL_VLDX_DUP_CRYPTO(T1, W, N, X)
+#endif
+
 #define DECL_ALL_VLDX_DUP_NO_FP16(X)		\
   DECL_VLDX_DUP(int, 8, 8, X);			\
   DECL_VLDX_DUP(int, 16, 4, X);			\
@@ -208,6 +246,7 @@  void exec_vldX_dup (void)
   DECL_VLDX_DUP(uint, 64, 1, X);		\
   DECL_VLDX_DUP(poly, 8, 8, X);			\
   DECL_VLDX_DUP(poly, 16, 4, X);		\
+  DECL_VLDX_DUP_CRYPTO(poly, 64, 1, X);		\
   DECL_VLDX_DUP(float, 32, 2, X)
 
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
@@ -229,6 +268,7 @@  void exec_vldX_dup (void)
   TEST_VLDX_DUP(, uint, u, 64, 1, X);		\
   TEST_VLDX_DUP(, poly, p, 8, 8, X);		\
   TEST_VLDX_DUP(, poly, p, 16, 4, X);		\
+  TEST_VLDX_DUP_CRYPTO(, poly, p, 64, 1, X);	\
   TEST_VLDX_DUP(, float, f, 32, 2, X)
 
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
@@ -250,6 +290,7 @@  void exec_vldX_dup (void)
   TEST_EXTRA_CHUNK(uint, 64, 1, X, Y);		\
   TEST_EXTRA_CHUNK(poly, 8, 8, X, Y);		\
   TEST_EXTRA_CHUNK(poly, 16, 4, X, Y);		\
+  TEST_EXTRA_CHUNK_CRYPTO(poly, 64, 1, X, Y);	\
   TEST_EXTRA_CHUNK(float, 32, 2, X, Y)
 
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
@@ -272,6 +313,7 @@  void exec_vldX_dup (void)
     CHECK(test_name, uint, 64, 1, PRIx64, EXPECTED, comment);		\
     CHECK(test_name, poly, 8, 8, PRIx8, EXPECTED, comment);		\
     CHECK(test_name, poly, 16, 4, PRIx16, EXPECTED, comment);		\
+    CHECK_CRYPTO(test_name, poly, 64, 1, PRIx64, EXPECTED, comment);	\
     CHECK_FP(test_name, float, 32, 2, PRIx32, EXPECTED, comment)
 
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
@@ -313,6 +355,10 @@  void exec_vldX_dup (void)
   PAD(buffer_vld2_pad, poly, 8, 8);
   VECT_ARRAY_INIT2(buffer_vld2, poly, 16, 4);
   PAD(buffer_vld2_pad, poly, 16, 4);
+#if defined (__ARM_FEATURE_CRYPTO)
+  VECT_ARRAY_INIT2(buffer_vld2, poly, 64, 1);
+  PAD(buffer_vld2_pad, poly, 64, 1);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   VECT_ARRAY_INIT2(buffer_vld2, float, 16, 4);
   PAD(buffer_vld2_pad, float, 16, 4);
@@ -340,6 +386,10 @@  void exec_vldX_dup (void)
   PAD(buffer_vld2_pad, poly, 8, 16);
   VECT_ARRAY_INIT2(buffer_vld2, poly, 16, 8);
   PAD(buffer_vld2_pad, poly, 16, 8);
+#if defined (__ARM_FEATURE_CRYPTO)
+  VECT_ARRAY_INIT2(buffer_vld2, poly, 64, 2);
+  PAD(buffer_vld2_pad, poly, 64, 2);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   VECT_ARRAY_INIT2(buffer_vld2, float, 16, 8);
   PAD(buffer_vld2_pad, float, 16, 8);
@@ -368,6 +418,10 @@  void exec_vldX_dup (void)
   PAD(buffer_vld3_pad, poly, 8, 8);
   VECT_ARRAY_INIT3(buffer_vld3, poly, 16, 4);
   PAD(buffer_vld3_pad, poly, 16, 4);
+#if defined (__ARM_FEATURE_CRYPTO)
+  VECT_ARRAY_INIT3(buffer_vld3, poly, 64, 1);
+  PAD(buffer_vld3_pad, poly, 64, 1);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   VECT_ARRAY_INIT3(buffer_vld3, float, 16, 4);
   PAD(buffer_vld3_pad, float, 16, 4);
@@ -395,6 +449,10 @@  void exec_vldX_dup (void)
   PAD(buffer_vld3_pad, poly, 8, 16);
   VECT_ARRAY_INIT3(buffer_vld3, poly, 16, 8);
   PAD(buffer_vld3_pad, poly, 16, 8);
+#if defined (__ARM_FEATURE_CRYPTO)
+  VECT_ARRAY_INIT3(buffer_vld3, poly, 64, 2);
+  PAD(buffer_vld3_pad, poly, 64, 2);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   VECT_ARRAY_INIT3(buffer_vld3, float, 16, 8);
   PAD(buffer_vld3_pad, float, 16, 8);
@@ -423,6 +481,10 @@  void exec_vldX_dup (void)
   PAD(buffer_vld4_pad, poly, 8, 8);
   VECT_ARRAY_INIT4(buffer_vld4, poly, 16, 4);
   PAD(buffer_vld4_pad, poly, 16, 4);
+#if defined (__ARM_FEATURE_CRYPTO)
+  VECT_ARRAY_INIT4(buffer_vld4, poly, 64, 1);
+  PAD(buffer_vld4_pad, poly, 64, 1);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   VECT_ARRAY_INIT4(buffer_vld4, float, 16, 4);
   PAD(buffer_vld4_pad, float, 16, 4);
@@ -450,6 +512,10 @@  void exec_vldX_dup (void)
   PAD(buffer_vld4_pad, poly, 8, 16);
   VECT_ARRAY_INIT4(buffer_vld4, poly, 16, 8);
   PAD(buffer_vld4_pad, poly, 16, 8);
+#if defined (__ARM_FEATURE_CRYPTO)
+  VECT_ARRAY_INIT4(buffer_vld4, poly, 64, 2);
+  PAD(buffer_vld4_pad, poly, 64, 2);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   VECT_ARRAY_INIT4(buffer_vld4, float, 16, 8);
   PAD(buffer_vld4_pad, float, 16, 8);
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_lane.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_lane.c
index cda76abfe0a18f648331ec9cffc030368b2a7c70..74c4b4226c3fd8932b7ada3185aa54a667ff48ba 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_lane.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vldX_lane.c
@@ -1,6 +1,7 @@ 
 #include <arm_neon.h>
 #include "arm-neon-ref.h"
 #include "compute-ref-data.h"
+/* { dg-additional-options "-march=armv8-a+crypto" { target aarch64*-*-* } } */
 
 /* Expected results.  */
 
@@ -18,6 +19,11 @@  VECT_VAR_DECL(expected_vld2_0,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
 					       0xaa, 0xaa, 0xaa, 0xaa };
 VECT_VAR_DECL(expected_vld2_0,poly,16,4) [] = { 0xaaaa, 0xaaaa,
 						0xaaaa, 0xaaaa };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_vld2_0,poly,64,1) [] = { 0xfffffffffffffff0 };
+VECT_VAR_DECL(expected_vld2_0,poly,64,2) [] = { 0xfffffffffffffff0,
+						0xfffffffffffffff1 };
+#endif
 VECT_VAR_DECL(expected_vld2_0,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld2_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected_vld2_0,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
@@ -47,6 +53,11 @@  VECT_VAR_DECL(expected_vld2_1,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 };
 VECT_VAR_DECL(expected_vld2_1,poly,8,8) [] = { 0xf0, 0xf1, 0xaa, 0xaa,
 					       0xaa, 0xaa, 0xaa, 0xaa };
 VECT_VAR_DECL(expected_vld2_1,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xfff0, 0xfff1 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_vld2_1,poly,64,1) [] = { 0xfffffffffffffff1 };
+VECT_VAR_DECL(expected_vld2_1,poly,64,2) [] = { 0xaaaaaaaaaaaaaaaa,
+						0xaaaaaaaaaaaaaaaa };
+#endif
 VECT_VAR_DECL(expected_vld2_1,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld2_1,hfloat,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
 VECT_VAR_DECL(expected_vld2_1,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
@@ -76,6 +87,11 @@  VECT_VAR_DECL(expected_vld3_0,uint,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
 VECT_VAR_DECL(expected_vld3_0,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
 					       0xaa, 0xaa, 0xaa, 0xaa };
 VECT_VAR_DECL(expected_vld3_0,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_vld3_0,poly,64,1) [] = { 0xfffffffffffffff0 };
+VECT_VAR_DECL(expected_vld3_0,poly,64,2) [] = { 0xfffffffffffffff0,
+						0xfffffffffffffff1 };
+#endif
 VECT_VAR_DECL(expected_vld3_0,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld3_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected_vld3_0,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
@@ -105,6 +121,11 @@  VECT_VAR_DECL(expected_vld3_1,uint,32,2) [] = { 0xaaaaaaaa, 0xfffffff0 };
 VECT_VAR_DECL(expected_vld3_1,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
 					       0xf0, 0xf1, 0xf2, 0xaa };
 VECT_VAR_DECL(expected_vld3_1,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_vld3_1,poly,64,1) [] = { 0xfffffffffffffff1 };
+VECT_VAR_DECL(expected_vld3_1,poly,64,2) [] = { 0xfffffffffffffff2,
+						0xaaaaaaaaaaaaaaaa };
+#endif
 VECT_VAR_DECL(expected_vld3_1,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xcc00, 0xcb80 };
 VECT_VAR_DECL(expected_vld3_1,hfloat,32,2) [] = { 0xc1600000, 0xaaaaaaaa };
 VECT_VAR_DECL(expected_vld3_1,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
@@ -134,6 +155,11 @@  VECT_VAR_DECL(expected_vld3_2,uint,32,2) [] = { 0xfffffff1, 0xfffffff2 };
 VECT_VAR_DECL(expected_vld3_2,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
 					       0xaa, 0xaa, 0xaa, 0xaa };
 VECT_VAR_DECL(expected_vld3_2,poly,16,4) [] = { 0xaaaa, 0xfff0, 0xfff1, 0xfff2 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_vld3_2,poly,64,1) [] = { 0xfffffffffffffff2 };
+VECT_VAR_DECL(expected_vld3_2,poly,64,2) [] = { 0xaaaaaaaaaaaaaaaa,
+						0xaaaaaaaaaaaaaaaa };
+#endif
 VECT_VAR_DECL(expected_vld3_2,hfloat,16,4) [] = { 0xcb00, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld3_2,hfloat,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
 VECT_VAR_DECL(expected_vld3_2,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xfff0, 0xfff1,
@@ -163,6 +189,11 @@  VECT_VAR_DECL(expected_vld4_0,uint,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
 VECT_VAR_DECL(expected_vld4_0,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
 					       0xaa, 0xaa, 0xaa, 0xaa };
 VECT_VAR_DECL(expected_vld4_0,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_vld4_0,poly,64,1) [] = { 0xfffffffffffffff0 };
+VECT_VAR_DECL(expected_vld4_0,poly,64,2) [] = { 0xfffffffffffffff0,
+						0xfffffffffffffff1 };
+#endif
 VECT_VAR_DECL(expected_vld4_0,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld4_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected_vld4_0,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
@@ -192,6 +223,11 @@  VECT_VAR_DECL(expected_vld4_1,uint,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
 VECT_VAR_DECL(expected_vld4_1,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
 					       0xaa, 0xaa, 0xaa, 0xaa };
 VECT_VAR_DECL(expected_vld4_1,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_vld4_1,poly,64,1) [] = { 0xfffffffffffffff1 };
+VECT_VAR_DECL(expected_vld4_1,poly,64,2) [] = { 0xfffffffffffffff2,
+						0xfffffffffffffff3 };
+#endif
 VECT_VAR_DECL(expected_vld4_1,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld4_1,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 };
 VECT_VAR_DECL(expected_vld4_1,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
@@ -221,6 +257,11 @@  VECT_VAR_DECL(expected_vld4_2,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 };
 VECT_VAR_DECL(expected_vld4_2,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					       0xaa, 0xaa, 0xaa, 0xaa };
 VECT_VAR_DECL(expected_vld4_2,poly,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_vld4_2,poly,64,1) [] = { 0xfffffffffffffff2 };
+VECT_VAR_DECL(expected_vld4_2,poly,64,2) [] = { 0xaaaaaaaaaaaaaaaa,
+						0xaaaaaaaaaaaaaaaa };
+#endif
 VECT_VAR_DECL(expected_vld4_2,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 VECT_VAR_DECL(expected_vld4_2,hfloat,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
 VECT_VAR_DECL(expected_vld4_2,int,16,8) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa,
@@ -250,6 +291,11 @@  VECT_VAR_DECL(expected_vld4_3,uint,32,2) [] = { 0xfffffff2, 0xfffffff3 };
 VECT_VAR_DECL(expected_vld4_3,poly,8,8) [] = { 0xaa, 0xaa, 0xaa, 0xaa,
 					       0xaa, 0xaa, 0xaa, 0xaa };
 VECT_VAR_DECL(expected_vld4_3,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_vld4_3,poly,64,1) [] = { 0xfffffffffffffff3 };
+VECT_VAR_DECL(expected_vld4_3,poly,64,2) [] = { 0xaaaaaaaaaaaaaaaa,
+						0xaaaaaaaaaaaaaaaa };
+#endif
 VECT_VAR_DECL(expected_vld4_3,hfloat,16,4) [] = { 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa };
 VECT_VAR_DECL(expected_vld4_3,hfloat,32,2) [] = { 0xaaaaaaaa, 0xaaaaaaaa };
 VECT_VAR_DECL(expected_vld4_3,int,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
@@ -279,6 +325,9 @@  VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 32, 2);
 VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 64, 2);
 VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 8, 2);
 VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 16, 2);
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 64, 2);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
 VECT_VAR_DECL_INIT(buffer_vld2_lane, float, 16, 2);
 #endif
@@ -295,6 +344,9 @@  VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 32, 3);
 VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 64, 3);
 VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 8, 3);
 VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 16, 3);
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 64, 3);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
 VECT_VAR_DECL_INIT(buffer_vld3_lane, float, 16, 3);
 #endif
@@ -311,6 +363,9 @@  VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 32, 4);
 VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 64, 4);
 VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 8, 4);
 VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 16, 4);
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 64, 4);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
 VECT_VAR_DECL_INIT(buffer_vld4_lane, float, 16, 4);
 #endif
@@ -356,6 +411,16 @@  void exec_vldX_lane (void)
 	 &(VECT_VAR(result_bis_##X, T1, W, N)[Y*N]),	\
 	 sizeof(VECT_VAR(result, T1, W, N)));
 
+#if defined (__ARM_FEATURE_CRYPTO)
+#define DECL_VLDX_LANE_CRYPTO(T1, W, N, X) DECL_VLDX_LANE(T1, W, N, X)
+#define TEST_VLDX_LANE_CRYPTO(Q, T1, T2, W, N, X, L) TEST_VLDX_LANE(Q, T1, T2, W, N, X, L)
+#define TEST_EXTRA_CHUNK_CRYPTO(T1, W, N, X, Y) TEST_EXTRA_CHUNK(T1, W, N, X, Y)
+#else
+#define DECL_VLDX_LANE_CRYPTO(T1, W, N, X)
+#define TEST_VLDX_LANE_CRYPTO(Q, T1, T2, W, N, X, L)
+#define TEST_EXTRA_CHUNK_CRYPTO(T1, W, N, X, Y)
+#endif
+
   /* We need all variants in 64 bits, but there is no 64x2 variant.  */
 #define DECL_ALL_VLDX_LANE_NO_FP16(X)		\
   DECL_VLDX_LANE(int, 8, 8, X);			\
@@ -366,11 +431,13 @@  void exec_vldX_lane (void)
   DECL_VLDX_LANE(uint, 32, 2, X);		\
   DECL_VLDX_LANE(poly, 8, 8, X);		\
   DECL_VLDX_LANE(poly, 16, 4, X);		\
+  DECL_VLDX_LANE_CRYPTO(poly, 64, 1, X);	\
   DECL_VLDX_LANE(int, 16, 8, X);		\
   DECL_VLDX_LANE(int, 32, 4, X);		\
   DECL_VLDX_LANE(uint, 16, 8, X);		\
   DECL_VLDX_LANE(uint, 32, 4, X);		\
   DECL_VLDX_LANE(poly, 16, 8, X);		\
+  DECL_VLDX_LANE_CRYPTO(poly, 64, 2, X);	\
   DECL_VLDX_LANE(float, 32, 2, X);		\
   DECL_VLDX_LANE(float, 32, 4, X)
 
@@ -400,11 +467,13 @@  void exec_vldX_lane (void)
   TEST_VLDX_LANE(, uint, u, 32, 2, X, 1);	\
   TEST_VLDX_LANE(, poly, p, 8, 8, X, 4);	\
   TEST_VLDX_LANE(, poly, p, 16, 4, X, 3);	\
+  TEST_VLDX_LANE_CRYPTO(, poly, p, 64, 1, X, 0);\
   TEST_VLDX_LANE(q, int, s, 16, 8, X, 6);	\
   TEST_VLDX_LANE(q, int, s, 32, 4, X, 2);	\
   TEST_VLDX_LANE(q, uint, u, 16, 8, X, 5);	\
   TEST_VLDX_LANE(q, uint, u, 32, 4, X, 0);	\
   TEST_VLDX_LANE(q, poly, p, 16, 8, X, 5);	\
+  TEST_VLDX_LANE_CRYPTO(q, poly, p, 64, 2, X, 0);\
   TEST_VLDX_LANE(, float, f, 32, 2, X, 0);	\
   TEST_VLDX_LANE(q, float, f, 32, 4, X, 2)
 
@@ -426,11 +495,13 @@  void exec_vldX_lane (void)
   TEST_EXTRA_CHUNK(uint, 32, 2, X, Y);		\
   TEST_EXTRA_CHUNK(poly, 8, 8, X, Y);		\
   TEST_EXTRA_CHUNK(poly, 16, 4, X, Y);		\
+  TEST_EXTRA_CHUNK_CRYPTO(poly, 64, 1, X, Y);	\
   TEST_EXTRA_CHUNK(int, 16, 8, X, Y);		\
   TEST_EXTRA_CHUNK(int, 32, 4, X, Y);		\
   TEST_EXTRA_CHUNK(uint, 16, 8, X, Y);		\
   TEST_EXTRA_CHUNK(uint, 32, 4, X, Y);		\
   TEST_EXTRA_CHUNK(poly, 16, 8, X, Y);		\
+  TEST_EXTRA_CHUNK_CRYPTO(poly, 64, 2, X, Y);	\
   TEST_EXTRA_CHUNK(float, 32, 2, X, Y);		\
   TEST_EXTRA_CHUNK(float, 32, 4, X, Y)
 
@@ -453,12 +524,14 @@  void exec_vldX_lane (void)
     CHECK(test_name, uint, 32, 2, PRIx32, EXPECTED, comment);		\
     CHECK(test_name, poly, 8, 8, PRIx8, EXPECTED, comment);		\
     CHECK(test_name, poly, 16, 4, PRIx16, EXPECTED, comment);		\
+    CHECK_CRYPTO(test_name, poly, 64, 1, PRIx64, EXPECTED, comment);	\
     CHECK_FP(test_name, float, 32, 2, PRIx32, EXPECTED, comment);	\
     CHECK(test_name, int, 16, 8, PRIx16, EXPECTED, comment);		\
     CHECK(test_name, int, 32, 4, PRIx32, EXPECTED, comment);		\
     CHECK(test_name, uint, 16, 8, PRIx16, EXPECTED, comment);		\
     CHECK(test_name, uint, 32, 4, PRIx32, EXPECTED, comment);		\
     CHECK(test_name, poly, 16, 8, PRIx16, EXPECTED, comment);		\
+    CHECK_CRYPTO(test_name, poly, 64, 2, PRIx64, EXPECTED, comment);	\
     CHECK_FP(test_name, float, 32, 4, PRIx32, EXPECTED, comment)
 
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
@@ -489,11 +562,17 @@  void exec_vldX_lane (void)
   DUMMY_ARRAY(buffer_src, uint, 32, 2, 4);
   DUMMY_ARRAY(buffer_src, poly, 8, 8, 4);
   DUMMY_ARRAY(buffer_src, poly, 16, 4, 4);
+#if defined (__ARM_FEATURE_CRYPTO)
+  DUMMY_ARRAY(buffer_src, poly, 64, 1, 4);
+#endif
   DUMMY_ARRAY(buffer_src, int, 16, 8, 4);
   DUMMY_ARRAY(buffer_src, int, 32, 4, 4);
   DUMMY_ARRAY(buffer_src, uint, 16, 8, 4);
   DUMMY_ARRAY(buffer_src, uint, 32, 4, 4);
   DUMMY_ARRAY(buffer_src, poly, 16, 8, 4);
+#if defined (__ARM_FEATURE_CRYPTO)
+  DUMMY_ARRAY(buffer_src, poly, 64, 2, 4);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   DUMMY_ARRAY(buffer_src, float, 16, 4, 4);
   DUMMY_ARRAY(buffer_src, float, 16, 8, 4);
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1_lane.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1_lane.c
index 825d07dbf77fc54b5ef796b57a42c81d6dd6d611..047ee1fa80be89083315505c6c228a03df290047 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1_lane.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst1_lane.c
@@ -1,6 +1,7 @@ 
 #include <arm_neon.h>
 #include "arm-neon-ref.h"
 #include "compute-ref-data.h"
+/* { dg-additional-options "-march=armv8-a+crypto" { target aarch64*-*-* } } */
 
 /* Expected results.  */
 VECT_VAR_DECL(expected,int,8,8) [] = { 0xf7, 0x33, 0x33, 0x33,
@@ -16,6 +17,9 @@  VECT_VAR_DECL(expected,uint,64,1) [] = { 0xfffffffffffffff0 };
 VECT_VAR_DECL(expected,poly,8,8) [] = { 0xf6, 0x33, 0x33, 0x33,
 					0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,4) [] = { 0xfff2, 0x3333, 0x3333, 0x3333 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected,poly,64,1) [] = { 0xfffffffffffffff0 };
+#endif
 VECT_VAR_DECL(expected,hfloat,16,4) [] = { 0xcb80, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc1700000, 0x33333333 };
 VECT_VAR_DECL(expected,int,8,16) [] = { 0xff, 0x33, 0x33, 0x33,
@@ -25,7 +29,7 @@  VECT_VAR_DECL(expected,int,8,16) [] = { 0xff, 0x33, 0x33, 0x33,
 VECT_VAR_DECL(expected,int,16,8) [] = { 0xfff5, 0x3333, 0x3333, 0x3333,
 					0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,int,32,4) [] = { 0xfffffff1, 0x33333333,
 					0x33333333, 0x33333333 };
 VECT_VAR_DECL(expected,int,64,2) [] = { 0xfffffffffffffff1, 0x3333333333333333 };
 VECT_VAR_DECL(expected,uint,8,16) [] = { 0xfa, 0x33, 0x33, 0x33,
 					 0x33, 0x33, 0x33, 0x33,
@@ -43,6 +47,10 @@  VECT_VAR_DECL(expected,poly,8,16) [] = { 0xfa, 0x33, 0x33, 0x33,
 					 0x33, 0x33, 0x33, 0x33 };
 VECT_VAR_DECL(expected,poly,16,8) [] = { 0xfff4, 0x3333, 0x3333, 0x3333,
 					 0x3333, 0x3333, 0x3333, 0x3333 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected,poly,64,2) [] = { 0xfffffffffffffff0,
+					 0x3333333333333333 };
+#endif
 VECT_VAR_DECL(expected,hfloat,16,8) [] = { 0xc900, 0x3333, 0x3333, 0x3333,
 					   0x3333, 0x3333, 0x3333, 0x3333 };
 VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc1700000, 0x33333333,
@@ -72,6 +80,9 @@  void exec_vst1_lane (void)
   TEST_VST1_LANE(, uint, u, 64, 1, 0);
   TEST_VST1_LANE(, poly, p, 8, 8, 6);
   TEST_VST1_LANE(, poly, p, 16, 4, 2);
+#if defined (__ARM_FEATURE_CRYPTO)
+  TEST_VST1_LANE(, poly, p, 64, 1, 0);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   TEST_VST1_LANE(, float, f, 16, 4, 1);
 #endif
@@ -87,6 +98,9 @@  void exec_vst1_lane (void)
   TEST_VST1_LANE(q, uint, u, 64, 2, 0);
   TEST_VST1_LANE(q, poly, p, 8, 16, 10);
   TEST_VST1_LANE(q, poly, p, 16, 8, 4);
+#if defined (__ARM_FEATURE_CRYPTO)
+  TEST_VST1_LANE(q, poly, p, 64, 2, 0);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   TEST_VST1_LANE(q, float, f, 16, 8, 6);
 #endif
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vstX_lane.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vstX_lane.c
index f5bf3bd325fa05e330d766dc0a93582d6c12e8c8..19a7485249baa1d93fd2cf501bcc198ddd4b6b2c 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vstX_lane.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vstX_lane.c
@@ -1,6 +1,7 @@ 
 #include <arm_neon.h>
 #include "arm-neon-ref.h"
 #include "compute-ref-data.h"
+/* { dg-additional-options "-march=armv8-a+crypto" { target aarch64*-*-* } } */
 
 /* Expected results for vst2, chunk 0.  */
 VECT_VAR_DECL(expected_st2_0,int,8,8) [] = { 0xf0, 0xf1, 0x0, 0x0,
@@ -14,6 +15,9 @@  VECT_VAR_DECL(expected_st2_0,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 };
 VECT_VAR_DECL(expected_st2_0,poly,8,8) [] = { 0xf0, 0xf1, 0x0, 0x0,
 					      0x0, 0x0, 0x0, 0x0 };
 VECT_VAR_DECL(expected_st2_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0x0, 0x0 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_st2_0,poly,64,1) [] = { 0xfffffffffffffff0 };
+#endif
 VECT_VAR_DECL(expected_st2_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0x0, 0x0 };
 VECT_VAR_DECL(expected_st2_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected_st2_0,int,16,8) [] = { 0xfff0, 0xfff1, 0x0, 0x0,
@@ -42,6 +46,9 @@  VECT_VAR_DECL(expected_st2_1,uint,32,2) [] = { 0x0, 0x0 };
 VECT_VAR_DECL(expected_st2_1,poly,8,8) [] = { 0x0, 0x0, 0x0, 0x0,
 					      0x0, 0x0, 0x0, 0x0 };
 VECT_VAR_DECL(expected_st2_1,poly,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_st2_1,poly,64,1) [] = { 0xfffffffffffffff1 };
+#endif
 VECT_VAR_DECL(expected_st2_1,hfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
 VECT_VAR_DECL(expected_st2_1,hfloat,32,2) [] = { 0x0, 0x0 };
 VECT_VAR_DECL(expected_st2_1,int,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
@@ -68,6 +75,9 @@  VECT_VAR_DECL(expected_st3_0,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 };
 VECT_VAR_DECL(expected_st3_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0x0,
 					      0x0, 0x0, 0x0, 0x0 };
 VECT_VAR_DECL(expected_st3_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0x0 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_st3_0,poly,64,1) [] = { 0xfffffffffffffff0 };
+#endif
 VECT_VAR_DECL(expected_st3_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0x0 };
 VECT_VAR_DECL(expected_st3_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected_st3_0,int,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0x0,
@@ -97,6 +107,9 @@  VECT_VAR_DECL(expected_st3_1,uint,32,2) [] = { 0xfffffff2, 0x0 };
 VECT_VAR_DECL(expected_st3_1,poly,8,8) [] = { 0x0, 0x0, 0x0, 0x0,
 					      0x0, 0x0, 0x0, 0x0 };
 VECT_VAR_DECL(expected_st3_1,poly,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_st3_1,poly,64,1) [] = { 0xfffffffffffffff1 };
+#endif
 VECT_VAR_DECL(expected_st3_1,hfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
 VECT_VAR_DECL(expected_st3_1,hfloat,32,2) [] = { 0xc1600000, 0x0 };
 VECT_VAR_DECL(expected_st3_1,int,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
@@ -123,6 +136,9 @@  VECT_VAR_DECL(expected_st3_2,uint,32,2) [] = { 0x0, 0x0 };
 VECT_VAR_DECL(expected_st3_2,poly,8,8) [] = { 0x0, 0x0, 0x0, 0x0,
 					      0x0, 0x0, 0x0, 0x0 };
 VECT_VAR_DECL(expected_st3_2,poly,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_st3_2,poly,64,1) [] = { 0xfffffffffffffff2 };
+#endif
 VECT_VAR_DECL(expected_st3_2,hfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
 VECT_VAR_DECL(expected_st3_2,hfloat,32,2) [] = { 0x0, 0x0 };
 VECT_VAR_DECL(expected_st3_2,int,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
@@ -149,6 +165,9 @@  VECT_VAR_DECL(expected_st4_0,uint,32,2) [] = { 0xfffffff0, 0xfffffff1 };
 VECT_VAR_DECL(expected_st4_0,poly,8,8) [] = { 0xf0, 0xf1, 0xf2, 0xf3,
 					      0x0, 0x0, 0x0, 0x0 };
 VECT_VAR_DECL(expected_st4_0,poly,16,4) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_st4_0,poly,64,1) [] = { 0xfffffffffffffff0 };
+#endif
 VECT_VAR_DECL(expected_st4_0,hfloat,16,4) [] = { 0xcc00, 0xcb80, 0xcb00, 0xca80 };
 VECT_VAR_DECL(expected_st4_0,hfloat,32,2) [] = { 0xc1800000, 0xc1700000 };
 VECT_VAR_DECL(expected_st4_0,int,16,8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
@@ -178,6 +197,9 @@  VECT_VAR_DECL(expected_st4_1,uint,32,2) [] = { 0xfffffff2, 0xfffffff3 };
 VECT_VAR_DECL(expected_st4_1,poly,8,8) [] = { 0x0, 0x0, 0x0, 0x0,
 					      0x0, 0x0, 0x0, 0x0 };
 VECT_VAR_DECL(expected_st4_1,poly,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_st4_1,poly,64,1) [] = { 0xfffffffffffffff1 };
+#endif
 VECT_VAR_DECL(expected_st4_1,hfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
 VECT_VAR_DECL(expected_st4_1,hfloat,32,2) [] = { 0xc1600000, 0xc1500000 };
 VECT_VAR_DECL(expected_st4_1,int,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
@@ -204,6 +226,9 @@  VECT_VAR_DECL(expected_st4_2,uint,32,2) [] = { 0x0, 0x0 };
 VECT_VAR_DECL(expected_st4_2,poly,8,8) [] = { 0x0, 0x0, 0x0, 0x0,
 					      0x0, 0x0, 0x0, 0x0 };
 VECT_VAR_DECL(expected_st4_2,poly,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_st4_2,poly,64,1) [] = { 0xfffffffffffffff2 };
+#endif
 VECT_VAR_DECL(expected_st4_2,hfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
 VECT_VAR_DECL(expected_st4_2,hfloat,32,2) [] = { 0x0, 0x0 };
 VECT_VAR_DECL(expected_st4_2,int,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
@@ -230,6 +255,9 @@  VECT_VAR_DECL(expected_st4_3,uint,32,2) [] = { 0x0, 0x0 };
 VECT_VAR_DECL(expected_st4_3,poly,8,8) [] = { 0x0, 0x0, 0x0, 0x0,
 					      0x0, 0x0, 0x0, 0x0 };
 VECT_VAR_DECL(expected_st4_3,poly,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL(expected_st4_3,poly,64,1) [] = { 0xfffffffffffffff3 };
+#endif
 VECT_VAR_DECL(expected_st4_3,hfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
 VECT_VAR_DECL(expected_st4_3,hfloat,32,2) [] = { 0x0, 0x0 };
 VECT_VAR_DECL(expected_st4_3,int,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
@@ -256,6 +284,9 @@  VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 32, 2);
 VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 64, 2);
 VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 8, 2);
 VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 16, 2);
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL_INIT(buffer_vld2_lane, poly, 64, 2);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
 VECT_VAR_DECL_INIT(buffer_vld2_lane, float, 16, 2);
 #endif
@@ -272,6 +303,9 @@  VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 32, 3);
 VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 64, 3);
 VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 8, 3);
 VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 16, 3);
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL_INIT(buffer_vld3_lane, poly, 64, 3);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
 VECT_VAR_DECL_INIT(buffer_vld3_lane, float, 16, 3);
 #endif
@@ -288,6 +322,9 @@  VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 32, 4);
 VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 64, 4);
 VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 8, 4);
 VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 16, 4);
+#if defined (__ARM_FEATURE_CRYPTO)
+VECT_VAR_DECL_INIT(buffer_vld4_lane, poly, 64, 4);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
 VECT_VAR_DECL_INIT(buffer_vld4_lane, float, 16, 4);
 #endif
@@ -336,6 +373,19 @@  void exec_vstX_lane (void)
 	 &(VECT_VAR(result_bis_##X, T1, W, N)[Y*N]),	\
 	 sizeof(VECT_VAR(result, T1, W, N)));
 
+#if defined (__ARM_FEATURE_CRYPTO)
+#define TEST_EXTRA_CHUNK_CRYPTO(T1, W, N, X, Y) \
+		TEST_EXTRA_CHUNK(T1, W, N, X, Y)
+#define TEST_VSTX_LANE_CRYPTO(Q, T1, T2, W, N, X, L) \
+		TEST_VSTX_LANE(Q, T1, T2, W, N, X, L)
+#define DECL_VSTX_LANE_CRYPTO(T1, W, N, X) \
+		DECL_VSTX_LANE(T1, W, N, X)
+#else
+#define TEST_EXTRA_CHUNK_CRYPTO(T1, W, N, X, Y)
+#define TEST_VSTX_LANE_CRYPTO(Q, T1, T2, W, N, X, L)
+#define DECL_VSTX_LANE_CRYPTO(T1, W, N, X)
+#endif
+
   /* We need all variants in 64 bits, but there is no 64x2 variant,
      nor 128 bits vectors of int8/uint8/poly8.  */
 #define DECL_ALL_VSTX_LANE_NO_FP16(X)		\
@@ -347,12 +397,14 @@  void exec_vstX_lane (void)
   DECL_VSTX_LANE(uint, 32, 2, X);		\
   DECL_VSTX_LANE(poly, 8, 8, X);		\
   DECL_VSTX_LANE(poly, 16, 4, X);		\
+  DECL_VSTX_LANE_CRYPTO(poly, 64, 1, X);	\
   DECL_VSTX_LANE(float, 32, 2, X);		\
   DECL_VSTX_LANE(int, 16, 8, X);		\
   DECL_VSTX_LANE(int, 32, 4, X);		\
   DECL_VSTX_LANE(uint, 16, 8, X);		\
   DECL_VSTX_LANE(uint, 32, 4, X);		\
   DECL_VSTX_LANE(poly, 16, 8, X);		\
+  DECL_VSTX_LANE_CRYPTO(poly, 64, 2, X);	\
   DECL_VSTX_LANE(float, 32, 4, X)
 
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
@@ -378,6 +430,7 @@  void exec_vstX_lane (void)
   TEST_VSTX_LANE(, uint, u, 32, 2, X, 1);	\
   TEST_VSTX_LANE(, poly, p, 8, 8, X, 4);	\
   TEST_VSTX_LANE(, poly, p, 16, 4, X, 3);	\
+  TEST_VSTX_LANE_CRYPTO(, poly, p, 64, 1, X, 0);\
   TEST_VSTX_LANE(q, int, s, 16, 8, X, 6);	\
   TEST_VSTX_LANE(q, int, s, 32, 4, X, 2);	\
   TEST_VSTX_LANE(q, uint, u, 16, 8, X, 5);	\
@@ -403,6 +456,7 @@  void exec_vstX_lane (void)
   TEST_EXTRA_CHUNK(uint, 32, 2, X, Y);		\
   TEST_EXTRA_CHUNK(poly, 8, 8, X, Y);		\
   TEST_EXTRA_CHUNK(poly, 16, 4, X, Y);		\
+  TEST_EXTRA_CHUNK_CRYPTO(poly, 64, 1, X, Y);	\
   TEST_EXTRA_CHUNK(float, 32, 2, X, Y);		\
   TEST_EXTRA_CHUNK(int, 16, 8, X, Y);		\
   TEST_EXTRA_CHUNK(int, 32, 4, X, Y);		\
@@ -434,6 +488,9 @@  void exec_vstX_lane (void)
   DUMMY_ARRAY(buffer_src, uint, 32, 2, 4);
   DUMMY_ARRAY(buffer_src, poly, 8, 8, 4);
   DUMMY_ARRAY(buffer_src, poly, 16, 4, 4);
+#if defined (__ARM_FEATURE_CRYPTO)
+  DUMMY_ARRAY(buffer_src, poly, 64, 1, 4);
+#endif
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
   DUMMY_ARRAY(buffer_src, float, 16, 4, 4);
 #endif
@@ -462,6 +519,7 @@  void exec_vstX_lane (void)
   CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st2_0, CMT);
   CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st2_0, CMT);
   CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st2_0, CMT);
+  CHECK_CRYPTO(TEST_MSG, poly, 64, 1, PRIx64, expected_st2_0, CMT);
   CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st2_0, CMT);
   CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st2_0, CMT);
   CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st2_0, CMT);
@@ -485,6 +543,7 @@  void exec_vstX_lane (void)
   CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st2_1, CMT);
   CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st2_1, CMT);
   CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st2_1, CMT);
+  CHECK_CRYPTO(TEST_MSG, poly, 64, 1, PRIx64, expected_st2_1, CMT);
   CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st2_1, CMT);
   CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st2_1, CMT);
   CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st2_1, CMT);
@@ -514,6 +573,7 @@  void exec_vstX_lane (void)
   CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st3_0, CMT);
   CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st3_0, CMT);
   CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st3_0, CMT);
+  CHECK_CRYPTO(TEST_MSG, poly, 64, 1, PRIx64, expected_st3_0, CMT);
   CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st3_0, CMT);
   CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st3_0, CMT);
   CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st3_0, CMT);
@@ -538,6 +598,7 @@  void exec_vstX_lane (void)
   CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st3_1, CMT);
   CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st3_1, CMT);
   CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st3_1, CMT);
+  CHECK_CRYPTO(TEST_MSG, poly, 64, 1, PRIx64, expected_st3_1, CMT);
   CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st3_1, CMT);
   CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st3_1, CMT);
   CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st3_1, CMT);
@@ -562,6 +623,7 @@  void exec_vstX_lane (void)
   CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st3_2, CMT);
   CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st3_2, CMT);
   CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st3_2, CMT);
+  CHECK_CRYPTO(TEST_MSG, poly, 64, 1, PRIx64, expected_st3_2, CMT);
   CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st3_2, CMT);
   CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st3_2, CMT);
   CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st3_2, CMT);
@@ -591,6 +653,7 @@  void exec_vstX_lane (void)
   CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st4_0, CMT);
   CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_0, CMT);
   CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_0, CMT);
+  CHECK_CRYPTO(TEST_MSG, poly, 64, 1, PRIx64, expected_st4_0, CMT);
   CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st4_0, CMT);
   CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st4_0, CMT);
   CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st4_0, CMT);
@@ -615,6 +678,7 @@  void exec_vstX_lane (void)
   CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st4_1, CMT);
   CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_1, CMT);
   CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_1, CMT);
+  CHECK_CRYPTO(TEST_MSG, poly, 64, 1, PRIx64, expected_st4_1, CMT);
   CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st4_1, CMT);
   CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st4_1, CMT);
   CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st4_1, CMT);
@@ -639,6 +703,7 @@  void exec_vstX_lane (void)
   CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st4_2, CMT);
   CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_2, CMT);
   CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_2, CMT);
+  CHECK_CRYPTO(TEST_MSG, poly, 64, 1, PRIx64, expected_st4_2, CMT);
   CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st4_2, CMT);
   CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st4_2, CMT);
   CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st4_2, CMT);
@@ -663,6 +728,7 @@  void exec_vstX_lane (void)
   CHECK(TEST_MSG, uint, 32, 2, PRIx32, expected_st4_3, CMT);
   CHECK(TEST_MSG, poly, 8, 8, PRIx8, expected_st4_3, CMT);
   CHECK(TEST_MSG, poly, 16, 4, PRIx16, expected_st4_3, CMT);
+  CHECK_CRYPTO(TEST_MSG, poly, 64, 1, PRIx64, expected_st4_3, CMT);
   CHECK_FP(TEST_MSG, float, 32, 2, PRIx32, expected_st4_3, CMT);
   CHECK(TEST_MSG, int, 16, 8, PRIx16, expected_st4_3, CMT);
   CHECK(TEST_MSG, int, 32, 4, PRIx32, expected_st4_3, CMT);