@@ -3849,6 +3849,19 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt,
new_stmt = gimple_build_assign (gimple_call_lhs (stmt),
LSHIFT_EXPR, args[0], args[1]);
break;
+
+ /* Lower saturating add/sub NEON builtins to gimple.  */
+ BUILTIN_VSDQ_I (BINOP, ssadd, 3, NONE)
+ BUILTIN_VSDQ_I (BINOPU, usadd, 3, NONE)
+ new_stmt = gimple_build_call_internal (IFN_SAT_ADD, 2, args[0], args[1]);
+ gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
+ break;
+ BUILTIN_VSDQ_I (BINOP, sssub, 3, NONE)
+ BUILTIN_VSDQ_I (BINOPU, ussub, 3, NONE)
+ new_stmt = gimple_build_call_internal (IFN_SAT_SUB, 2, args[0], args[1]);
+ gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
+ break;
+
BUILTIN_VSDQ_I_DI (BINOP, sshl, 0, NONE)
BUILTIN_VSDQ_I_DI (BINOP_UUS, ushl, 0, NONE)
{
@@ -71,10 +71,10 @@
BUILTIN_VSDQ_I (BINOP, sqrshl, 0, NONE)
BUILTIN_VSDQ_I (BINOP_UUS, uqrshl, 0, NONE)
/* Implemented by aarch64_<su_optab><optab><mode>. */
- BUILTIN_VSDQ_I (BINOP, sqadd, 0, NONE)
- BUILTIN_VSDQ_I (BINOPU, uqadd, 0, NONE)
- BUILTIN_VSDQ_I (BINOP, sqsub, 0, NONE)
- BUILTIN_VSDQ_I (BINOPU, uqsub, 0, NONE)
+ BUILTIN_VSDQ_I (BINOP, ssadd, 3, NONE)
+ BUILTIN_VSDQ_I (BINOPU, usadd, 3, NONE)
+ BUILTIN_VSDQ_I (BINOP, sssub, 3, NONE)
+ BUILTIN_VSDQ_I (BINOPU, ussub, 3, NONE)
/* Implemented by aarch64_<sur>qadd<mode>. */
BUILTIN_VSDQ_I (BINOP_SSU, suqadd, 0, NONE)
BUILTIN_VSDQ_I (BINOP_UUS, usqadd, 0, NONE)
@@ -5221,15 +5221,100 @@
)
;; <su>q<addsub>
-(define_insn "aarch64_<su_optab>q<addsub><mode><vczle><vczbe>"
- [(set (match_operand:VSDQ_I 0 "register_operand" "=w")
- (BINQOPS:VSDQ_I (match_operand:VSDQ_I 1 "register_operand" "w")
- (match_operand:VSDQ_I 2 "register_operand" "w")))]
+(define_insn "<su_optab>s<addsub><mode>3<vczle><vczbe>"
+ [(set (match_operand:VSDQ_I_QI_HI 0 "register_operand" "=w")
+ (BINQOPS:VSDQ_I_QI_HI (match_operand:VSDQ_I_QI_HI 1 "register_operand" "w")
+ (match_operand:VSDQ_I_QI_HI 2 "register_operand" "w")))]
"TARGET_SIMD"
"<su_optab>q<addsub>\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
[(set_attr "type" "neon_q<addsub><q>")]
)
+(define_insn "<su_optab>s<addsub><mode>3<vczle><vczbe>"
+ [(set (match_operand:GPI 0 "register_operand" "=w")
+ (SBINQOPS:GPI (match_operand:GPI 1 "register_operand" "w")
+ (match_operand:GPI 2 "register_operand" "w")))]
+ "TARGET_SIMD"
+ "<su_optab>q<addsub>\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
+ [(set_attr "type" "neon_q<addsub><q>")]
+)
+
+;; If this is an unsigned saturating operation and the operands arrive in GP
+;; registers, then it is possible to perform this arithmetic without using the
+;; NEON instructions. This avoids using unnecessary fmov instructions to move
+;; either the operands or the result to and from GP regs to FP regs. This is
+;; only possible with SImode and DImode.
+
+(define_insn_and_split "<su_optab>s<addsub><mode>3<vczle><vczbe>"
+ [(set (match_operand:GPI 0 "register_operand")
+ (UBINQOPS:GPI (match_operand:GPI 1 "register_operand")
+ (match_operand:GPI 2 "aarch64_plus_operand")))]
+ ""
+ {@ [ cons: =0, 1 , 2 ; attrs: type, arch, length ]
+ [ w , w , w ; neon_q<addsub><q>, *, 4 ] <su_optab>q<addsub>\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>
+ [ r , r , JIr ; * , *, 8 ] #
+ }
+ "&& reload_completed && GP_REGNUM_P (REGNO (operands[0]))"
+ [(set (match_dup 0)
+ (if_then_else:GPI
+ (match_operator 3 "comparison_operator" [(reg:CC CC_REGNUM) (const_int 0)])
+ (match_dup 0)
+ (match_operand:GPI 4 "immediate_operand" "i")))]
+ {
+
+ if (REG_P (operands[2]))
+ {
+ switch (<CODE>)
+ {
+ case US_MINUS:
+ emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1],
+ operands[2]));
+ break;
+ case US_PLUS:
+ emit_insn (gen_add<mode>3_compare0 (operands[0], operands[1],
+ operands[2]));
+ break;
+ default:
+ break;
+ }
+ }
+ else
+ {
+ unsigned HOST_WIDE_INT imm = UINTVAL (operands[2]);
+ gcc_assert (imm != 0);
+ rtx neg_imm = gen_int_mode (-imm, <MODE>mode);
+ switch (<CODE>)
+ {
+ case US_MINUS:
+ emit_insn (gen_sub<mode>3_compare1_imm (operands[0], operands[1],
+ operands[2], neg_imm));
+ break;
+ case US_PLUS:
+ emit_insn (gen_sub<mode>3_compare1_imm (operands[0], operands[1],
+ neg_imm, operands[2]));
+ break;
+ default:
+ break;
+ }
+ }
+
+ rtx ccin = gen_rtx_REG (CC_Cmode, CC_REGNUM);
+ switch (<CODE>)
+ {
+ case US_PLUS:
+ operands[3] = gen_rtx_LTU (<MODE>mode, ccin, const0_rtx);
+ operands[4] = gen_int_mode (-1, <MODE>mode);
+ break;
+ case US_MINUS:
+ operands[3] = gen_rtx_GEU (<MODE>mode, ccin, const0_rtx);
+ operands[4] = const0_rtx;
+ break;
+ default:
+ break;
+ }
+ }
+)
+
;; suqadd and usqadd
(define_insn "aarch64_<sur>qadd<mode><vczle><vczbe>"
@@ -1904,35 +1904,35 @@ __extension__ extern __inline int8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqadd_s8 (int8x8_t __a, int8x8_t __b)
{
- return (int8x8_t) __builtin_aarch64_sqaddv8qi (__a, __b);
+ return (int8x8_t) __builtin_aarch64_ssaddv8qi (__a, __b);
}
__extension__ extern __inline int16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqadd_s16 (int16x4_t __a, int16x4_t __b)
{
- return (int16x4_t) __builtin_aarch64_sqaddv4hi (__a, __b);
+ return (int16x4_t) __builtin_aarch64_ssaddv4hi (__a, __b);
}
__extension__ extern __inline int32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqadd_s32 (int32x2_t __a, int32x2_t __b)
{
- return (int32x2_t) __builtin_aarch64_sqaddv2si (__a, __b);
+ return (int32x2_t) __builtin_aarch64_ssaddv2si (__a, __b);
}
__extension__ extern __inline int64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqadd_s64 (int64x1_t __a, int64x1_t __b)
{
- return (int64x1_t) {__builtin_aarch64_sqadddi (__a[0], __b[0])};
+ return (int64x1_t) {__builtin_aarch64_ssadddi (__a[0], __b[0])};
}
__extension__ extern __inline uint8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqadd_u8 (uint8x8_t __a, uint8x8_t __b)
{
- return __builtin_aarch64_uqaddv8qi_uuu (__a, __b);
+ return __builtin_aarch64_usaddv8qi_uuu (__a, __b);
}
__extension__ extern __inline int8x8_t
@@ -2191,189 +2191,189 @@ __extension__ extern __inline uint16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqadd_u16 (uint16x4_t __a, uint16x4_t __b)
{
- return __builtin_aarch64_uqaddv4hi_uuu (__a, __b);
+ return __builtin_aarch64_usaddv4hi_uuu (__a, __b);
}
__extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqadd_u32 (uint32x2_t __a, uint32x2_t __b)
{
- return __builtin_aarch64_uqaddv2si_uuu (__a, __b);
+ return __builtin_aarch64_usaddv2si_uuu (__a, __b);
}
__extension__ extern __inline uint64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqadd_u64 (uint64x1_t __a, uint64x1_t __b)
{
- return (uint64x1_t) {__builtin_aarch64_uqadddi_uuu (__a[0], __b[0])};
+ return (uint64x1_t) {__builtin_aarch64_usadddi_uuu (__a[0], __b[0])};
}
__extension__ extern __inline int8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqaddq_s8 (int8x16_t __a, int8x16_t __b)
{
- return (int8x16_t) __builtin_aarch64_sqaddv16qi (__a, __b);
+ return (int8x16_t) __builtin_aarch64_ssaddv16qi (__a, __b);
}
__extension__ extern __inline int16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqaddq_s16 (int16x8_t __a, int16x8_t __b)
{
- return (int16x8_t) __builtin_aarch64_sqaddv8hi (__a, __b);
+ return (int16x8_t) __builtin_aarch64_ssaddv8hi (__a, __b);
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqaddq_s32 (int32x4_t __a, int32x4_t __b)
{
- return (int32x4_t) __builtin_aarch64_sqaddv4si (__a, __b);
+ return (int32x4_t) __builtin_aarch64_ssaddv4si (__a, __b);
}
__extension__ extern __inline int64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqaddq_s64 (int64x2_t __a, int64x2_t __b)
{
- return (int64x2_t) __builtin_aarch64_sqaddv2di (__a, __b);
+ return (int64x2_t) __builtin_aarch64_ssaddv2di (__a, __b);
}
__extension__ extern __inline uint8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqaddq_u8 (uint8x16_t __a, uint8x16_t __b)
{
- return __builtin_aarch64_uqaddv16qi_uuu (__a, __b);
+ return __builtin_aarch64_usaddv16qi_uuu (__a, __b);
}
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqaddq_u16 (uint16x8_t __a, uint16x8_t __b)
{
- return __builtin_aarch64_uqaddv8hi_uuu (__a, __b);
+ return __builtin_aarch64_usaddv8hi_uuu (__a, __b);
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqaddq_u32 (uint32x4_t __a, uint32x4_t __b)
{
- return __builtin_aarch64_uqaddv4si_uuu (__a, __b);
+ return __builtin_aarch64_usaddv4si_uuu (__a, __b);
}
__extension__ extern __inline uint64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqaddq_u64 (uint64x2_t __a, uint64x2_t __b)
{
- return __builtin_aarch64_uqaddv2di_uuu (__a, __b);
+ return __builtin_aarch64_usaddv2di_uuu (__a, __b);
}
__extension__ extern __inline int8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqsub_s8 (int8x8_t __a, int8x8_t __b)
{
- return (int8x8_t) __builtin_aarch64_sqsubv8qi (__a, __b);
+ return (int8x8_t) __builtin_aarch64_sssubv8qi (__a, __b);
}
__extension__ extern __inline int16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqsub_s16 (int16x4_t __a, int16x4_t __b)
{
- return (int16x4_t) __builtin_aarch64_sqsubv4hi (__a, __b);
+ return (int16x4_t) __builtin_aarch64_sssubv4hi (__a, __b);
}
__extension__ extern __inline int32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqsub_s32 (int32x2_t __a, int32x2_t __b)
{
- return (int32x2_t) __builtin_aarch64_sqsubv2si (__a, __b);
+ return (int32x2_t) __builtin_aarch64_sssubv2si (__a, __b);
}
__extension__ extern __inline int64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqsub_s64 (int64x1_t __a, int64x1_t __b)
{
- return (int64x1_t) {__builtin_aarch64_sqsubdi (__a[0], __b[0])};
+ return (int64x1_t) {__builtin_aarch64_sssubdi (__a[0], __b[0])};
}
__extension__ extern __inline uint8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqsub_u8 (uint8x8_t __a, uint8x8_t __b)
{
- return __builtin_aarch64_uqsubv8qi_uuu (__a, __b);
+ return __builtin_aarch64_ussubv8qi_uuu (__a, __b);
}
__extension__ extern __inline uint16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqsub_u16 (uint16x4_t __a, uint16x4_t __b)
{
- return __builtin_aarch64_uqsubv4hi_uuu (__a, __b);
+ return __builtin_aarch64_ussubv4hi_uuu (__a, __b);
}
__extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqsub_u32 (uint32x2_t __a, uint32x2_t __b)
{
- return __builtin_aarch64_uqsubv2si_uuu (__a, __b);
+ return __builtin_aarch64_ussubv2si_uuu (__a, __b);
}
__extension__ extern __inline uint64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqsub_u64 (uint64x1_t __a, uint64x1_t __b)
{
- return (uint64x1_t) {__builtin_aarch64_uqsubdi_uuu (__a[0], __b[0])};
+ return (uint64x1_t) {__builtin_aarch64_ussubdi_uuu (__a[0], __b[0])};
}
__extension__ extern __inline int8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqsubq_s8 (int8x16_t __a, int8x16_t __b)
{
- return (int8x16_t) __builtin_aarch64_sqsubv16qi (__a, __b);
+ return (int8x16_t) __builtin_aarch64_sssubv16qi (__a, __b);
}
__extension__ extern __inline int16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqsubq_s16 (int16x8_t __a, int16x8_t __b)
{
- return (int16x8_t) __builtin_aarch64_sqsubv8hi (__a, __b);
+ return (int16x8_t) __builtin_aarch64_sssubv8hi (__a, __b);
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqsubq_s32 (int32x4_t __a, int32x4_t __b)
{
- return (int32x4_t) __builtin_aarch64_sqsubv4si (__a, __b);
+ return (int32x4_t) __builtin_aarch64_sssubv4si (__a, __b);
}
__extension__ extern __inline int64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqsubq_s64 (int64x2_t __a, int64x2_t __b)
{
- return (int64x2_t) __builtin_aarch64_sqsubv2di (__a, __b);
+ return (int64x2_t) __builtin_aarch64_sssubv2di (__a, __b);
}
__extension__ extern __inline uint8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqsubq_u8 (uint8x16_t __a, uint8x16_t __b)
{
- return __builtin_aarch64_uqsubv16qi_uuu (__a, __b);
+ return __builtin_aarch64_ussubv16qi_uuu (__a, __b);
}
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqsubq_u16 (uint16x8_t __a, uint16x8_t __b)
{
- return __builtin_aarch64_uqsubv8hi_uuu (__a, __b);
+ return __builtin_aarch64_ussubv8hi_uuu (__a, __b);
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqsubq_u32 (uint32x4_t __a, uint32x4_t __b)
{
- return __builtin_aarch64_uqsubv4si_uuu (__a, __b);
+ return __builtin_aarch64_ussubv4si_uuu (__a, __b);
}
__extension__ extern __inline uint64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqsubq_u64 (uint64x2_t __a, uint64x2_t __b)
{
- return __builtin_aarch64_uqsubv2di_uuu (__a, __b);
+ return __builtin_aarch64_ussubv2di_uuu (__a, __b);
}
__extension__ extern __inline int8x8_t
@@ -17583,56 +17583,56 @@ __extension__ extern __inline int8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqaddb_s8 (int8_t __a, int8_t __b)
{
- return (int8_t) __builtin_aarch64_sqaddqi (__a, __b);
+ return (int8_t) __builtin_aarch64_ssaddqi (__a, __b);
}
__extension__ extern __inline int16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqaddh_s16 (int16_t __a, int16_t __b)
{
- return (int16_t) __builtin_aarch64_sqaddhi (__a, __b);
+ return (int16_t) __builtin_aarch64_ssaddhi (__a, __b);
}
__extension__ extern __inline int32_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqadds_s32 (int32_t __a, int32_t __b)
{
- return (int32_t) __builtin_aarch64_sqaddsi (__a, __b);
+ return (int32_t) __builtin_aarch64_ssaddsi (__a, __b);
}
__extension__ extern __inline int64_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqaddd_s64 (int64_t __a, int64_t __b)
{
- return __builtin_aarch64_sqadddi (__a, __b);
+ return __builtin_aarch64_ssadddi (__a, __b);
}
__extension__ extern __inline uint8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqaddb_u8 (uint8_t __a, uint8_t __b)
{
- return (uint8_t) __builtin_aarch64_uqaddqi_uuu (__a, __b);
+ return (uint8_t) __builtin_aarch64_usaddqi_uuu (__a, __b);
}
__extension__ extern __inline uint16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqaddh_u16 (uint16_t __a, uint16_t __b)
{
- return (uint16_t) __builtin_aarch64_uqaddhi_uuu (__a, __b);
+ return (uint16_t) __builtin_aarch64_usaddhi_uuu (__a, __b);
}
__extension__ extern __inline uint32_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqadds_u32 (uint32_t __a, uint32_t __b)
{
- return (uint32_t) __builtin_aarch64_uqaddsi_uuu (__a, __b);
+ return (uint32_t) __builtin_aarch64_usaddsi_uuu (__a, __b);
}
__extension__ extern __inline uint64_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqaddd_u64 (uint64_t __a, uint64_t __b)
{
- return __builtin_aarch64_uqadddi_uuu (__a, __b);
+ return __builtin_aarch64_usadddi_uuu (__a, __b);
}
/* vqdmlal */
@@ -19282,56 +19282,56 @@ __extension__ extern __inline int8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqsubb_s8 (int8_t __a, int8_t __b)
{
- return (int8_t) __builtin_aarch64_sqsubqi (__a, __b);
+ return (int8_t) __builtin_aarch64_sssubqi (__a, __b);
}
__extension__ extern __inline int16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqsubh_s16 (int16_t __a, int16_t __b)
{
- return (int16_t) __builtin_aarch64_sqsubhi (__a, __b);
+ return (int16_t) __builtin_aarch64_sssubhi (__a, __b);
}
__extension__ extern __inline int32_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqsubs_s32 (int32_t __a, int32_t __b)
{
- return (int32_t) __builtin_aarch64_sqsubsi (__a, __b);
+ return (int32_t) __builtin_aarch64_sssubsi (__a, __b);
}
__extension__ extern __inline int64_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqsubd_s64 (int64_t __a, int64_t __b)
{
- return __builtin_aarch64_sqsubdi (__a, __b);
+ return __builtin_aarch64_sssubdi (__a, __b);
}
__extension__ extern __inline uint8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqsubb_u8 (uint8_t __a, uint8_t __b)
{
- return (uint8_t) __builtin_aarch64_uqsubqi_uuu (__a, __b);
+ return (uint8_t) __builtin_aarch64_ussubqi_uuu (__a, __b);
}
__extension__ extern __inline uint16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqsubh_u16 (uint16_t __a, uint16_t __b)
{
- return (uint16_t) __builtin_aarch64_uqsubhi_uuu (__a, __b);
+ return (uint16_t) __builtin_aarch64_ussubhi_uuu (__a, __b);
}
__extension__ extern __inline uint32_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqsubs_u32 (uint32_t __a, uint32_t __b)
{
- return (uint32_t) __builtin_aarch64_uqsubsi_uuu (__a, __b);
+ return (uint32_t) __builtin_aarch64_ussubsi_uuu (__a, __b);
}
__extension__ extern __inline uint64_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqsubd_u64 (uint64_t __a, uint64_t __b)
{
- return __builtin_aarch64_uqsubdi_uuu (__a, __b);
+ return __builtin_aarch64_ussubdi_uuu (__a, __b);
}
/* vqtbl2 */
@@ -93,6 +93,10 @@
;; integer modes; 64-bit scalar integer mode.
(define_mode_iterator VSDQ_I_DI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI DI])
+;; Advanced SIMD and scalar, 64 & 128-bit container; 8 and 16-bit scalar
+;; integer modes.
+(define_mode_iterator VSDQ_I_QI_HI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI HI QI])
+
;; Double vector modes.
(define_mode_iterator VD [V8QI V4HI V4HF V2SI V2SF V4BF])
new file mode 100644
@@ -0,0 +1,58 @@
+/* Template file for vector saturating arithmetic validation.
+
+ This file defines saturating addition and subtraction functions for a given
+ scalar type, testing the auto-vectorization of these two operators. This
+ type, along with the corresponding minimum and maximum values for that type,
+ must be defined by any test file which includes this template file. */
+
+#ifndef SAT_ARIT_AUTOVEC_INC
+#define SAT_ARIT_AUTOVEC_INC
+
+#include <limits.h>
+#include <arm_neon.h>
+
+#ifndef UT
+#define UT unsigned int
+#define VT uint32x4_t
+#define UMAX UINT_MAX
+#define UMIN 0
+#endif
+
+
+UT uadd_lane (UT a, VT b)
+{
+ UT sum = a + b[0];
+ return sum < a ? UMAX : sum;
+}
+
+void uaddq (UT *out, UT *a, UT *b, int n)
+{
+ for (int i = 0; i < n; i++)
+ {
+ UT sum = a[i] + b[i];
+ out[i] = sum < a[i] ? UMAX : sum;
+ }
+}
+
+void uaddq2 (UT *out, UT *a, UT *b, int n)
+{
+ for (int i = 0; i < n; i++)
+ {
+ UT sum;
+ if (!__builtin_add_overflow(a[i], b[i], &sum))
+ out[i] = sum;
+ else
+ out[i] = UMAX;
+ }
+}
+
+void usubq (UT *out, UT *a, UT *b, int n)
+{
+ for (int i = 0; i < n; i++)
+ {
+ UT sum = a[i] - b[i];
+ out[i] = sum > a[i] ? UMIN : sum;
+ }
+}
+
+#endif
\ No newline at end of file
new file mode 100644
@@ -0,0 +1,79 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-options "-O2 --save-temps -ftree-vectorize" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+/*
+** uadd_lane: { xfail *-*-* }
+** dup\tv([0-9]+).8b, w0
+** uqadd\tb([0-9]+), b\1, b0
+** umov\tw0, v\2.b\[0]
+** ret
+*/
+/*
+** uaddq:
+** ...
+** ldr\tq([0-9]+), .*
+** ldr\tq([0-9]+), .*
+** uqadd\tv\2.16b, v\1.16b, v\2.16b
+** ...
+** ldr\td([0-9]+), .*
+** ldr\td([0-9]+), .*
+** uqadd\tv\4.8b, v\3.8b, v\4.8b
+** ...
+** ldr\tb([0-9]+), .*
+** ldr\tb([0-9]+), .*
+** uqadd\tb\6, b\5, b\6
+** ...
+** ldr\tb([0-9]+), .*
+** ldr\tb([0-9]+), .*
+** uqadd\tb\8, b\7, b\8
+** ...
+*/
+/*
+** uaddq2:
+** ...
+** ldr\tq([0-9]+), .*
+** ldr\tq([0-9]+), .*
+** uqadd\tv\2.16b, v\1.16b, v\2.16b
+** ...
+** ldr\td([0-9]+), .*
+** ldr\td([0-9]+), .*
+** uqadd\tv\4.8b, v\3.8b, v\4.8b
+** ...
+** ldr\tb([0-9]+), .*
+** ldr\tb([0-9]+), .*
+** uqadd\tb\6, b\5, b\6
+** ...
+** uqadd\tb([0-9]+), b([0-9]+), b\7
+** ...
+*/
+/*
+** usubq: { xfail *-*-* }
+** ...
+** ldr\tq([0-9]+), .*
+** ldr\tq([0-9]+), .*
+** uqsub\tv\2.16b, v\1.16b, v\2.16b
+** ...
+** ldr\td([0-9]+), .*
+** ldr\td([0-9]+), .*
+** uqsub\tv\4.8b, v\3.8b, v\4.8b
+** ...
+** ldr\tb([0-9]+), .*
+** ldr\tb([0-9]+), .*
+** uqsub\tb\6, b\5, b\6
+** ...
+** ldr\tb([0-9]+), .*
+** ldr\tb([0-9]+), .*
+** uqsub\tb\8, b\7, b\8
+** ...
+*/
+
+#include <limits.h>
+#include <arm_neon.h>
+
+#define UT unsigned char
+#define VT uint8x8_t
+#define UMAX UCHAR_MAX
+#define UMIN 0
+
+#include "saturating_arithmetic_autovect.inc"
\ No newline at end of file
new file mode 100644
@@ -0,0 +1,79 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-options "-O2 --save-temps -ftree-vectorize" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+/*
+** uadd_lane: { xfail *-*-* }
+** dup\tv([0-9]+).4h, w0
+** uqadd\th([0-9]+), h\1, h0
+** umov\tw0, v\2.h\[0]
+** ret
+*/
+/*
+** uaddq:
+** ...
+** ldr\tq([0-9]+), .*
+** ldr\tq([0-9]+), .*
+** uqadd\tv\2.8h, v\1.8h, v\2.8h
+** ...
+** ldr\td([0-9]+), .*
+** ldr\td([0-9]+), .*
+** uqadd\tv\4.4h, v\3.4h, v\4.4h
+** ...
+** ldr\th([0-9]+), .*
+** ldr\th([0-9]+), .*
+** uqadd\th\6, h\5, h\6
+** ...
+** ldr\th([0-9]+), .*
+** ldr\th([0-9]+), .*
+** uqadd\th\8, h\7, h\8
+** ...
+*/
+/*
+** uaddq2:
+** ...
+** ldr\tq([0-9]+), .*
+** ldr\tq([0-9]+), .*
+** uqadd\tv\2.8h, v\1.8h, v\2.8h
+** ...
+** ldr\td([0-9]+), .*
+** ldr\td([0-9]+), .*
+** uqadd\tv\4.4h, v\3.4h, v\4.4h
+** ...
+** ldr\th([0-9]+), .*
+** ldr\th([0-9]+), .*
+** uqadd\th\6, h\5, h\6
+** ...
+** uqadd\th([0-9]+), h([0-9]+), h\7
+** ...
+*/
+/*
+** usubq: { xfail *-*-* }
+** ...
+** ldr\tq([0-9]+), .*
+** ldr\tq([0-9]+), .*
+** uqsub\tv\2.8h, v\1.8h, v\2.8h
+** ...
+** ldr\td([0-9]+), .*
+** ldr\td([0-9]+), .*
+** uqsub\tv\4.4h, v\3.4h, v\4.4h
+** ...
+** ldr\th([0-9]+), .*
+** ldr\th([0-9]+), .*
+** uqsub\th\6, h\5, h\6
+** ...
+** ldr\th([0-9]+), .*
+** ldr\th([0-9]+), .*
+** uqsub\th\8, h\7, h\8
+** ...
+*/
+
+#include <limits.h>
+#include <arm_neon.h>
+
+#define UT unsigned short
+#define VT uint16x4_t
+#define UMAX USHRT_MAX
+#define UMIN 0
+
+#include "saturating_arithmetic_autovect.inc"
\ No newline at end of file
new file mode 100644
@@ -0,0 +1,75 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-options "-O2 --save-temps -ftree-vectorize" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+/*
+** uadd_lane:
+** fmov\tw([0-9]+), s0
+** adds\tw([0-9]+), (?:w\1, w0|w0, w\1)
+** csinv\tw0, w\2, wzr, cc
+** ret
+*/
+/*
+** uaddq:
+** ...
+** ldr\tq([0-9]+), .*
+** ldr\tq([0-9]+), .*
+** uqadd\tv\2.4s, v\1.4s, v\2.4s
+** ...
+** ldr\tw([0-9]+), .*
+** ldr\tw([0-9]+), .*
+** adds\tw\3, w\3, w\4
+** csinv\tw\3, w\3, wzr, cc
+** ...
+** ldr\tw([0-9]+), .*
+** ldr\tw([0-9]+), .*
+** adds\tw\5, w\5, w\6
+** csinv\tw\5, w\5, wzr, cc
+** ...
+*/
+/*
+** uaddq2:
+** ...
+** ldr\tq([0-9]+), .*
+** ldr\tq([0-9]+), .*
+** uqadd\tv\2.4s, v\1.4s, v\2.4s
+** ...
+** ldr\tw([0-9]+), .*
+** ldr\tw([0-9]+), .*
+** adds\tw\3, w\3, w\4
+** csinv\tw\3, w\3, wzr, cc
+** ...
+** ldr\tw([0-9]+), .*
+** ldr\tw([0-9]+), .*
+** adds\tw\5, w\5, w\6
+** csinv\tw\5, w\5, wzr, cc
+** ...
+*/
+/*
+** usubq: { xfail *-*-* }
+** ...
+** ldr\tq([0-9]+), .*
+** ldr\tq([0-9]+), .*
+** uqsub\tv\2.4s, v\1.4s, v\2.4s
+** ...
+** ldr\tw([0-9]+), .*
+** ldr\tw([0-9]+), .*
+** subs\tw\3, w\3, w\4
+** csel\tw\3, w\3, wzr, cs
+** ...
+** ldr\tw([0-9]+), .*
+** ldr\tw([0-9]+), .*
+** subs\tw\5, w\5, w\6
+** csel\tw\5, w\5, wzr, cs
+** ...
+*/
+
+#include <limits.h>
+#include <arm_neon.h>
+
+#define UT unsigned int
+#define VT uint32x2_t
+#define UMAX UINT_MAX
+#define UMIN 0
+
+#include "saturating_arithmetic_autovect.inc"
\ No newline at end of file
new file mode 100644
@@ -0,0 +1,77 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-options "-O2 --save-temps -ftree-vectorize" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+/*
+** uadd_lane:
+** ...
+** (?:fmov|ldr)\tx([0-9]+), .*
+** ...
+** adds\tx([0-9]+), (?:x\1, x0|x0, x\1)
+** csinv\tx0, x\2, xzr, cc
+** ret
+*/
+/*
+** uaddq:
+** ...
+** ldr\tq([0-9]+), .*
+** ldr\tq([0-9]+), .*
+** uqadd\tv\2.2d, v\1.2d, v\2.2d
+** ...
+** ldr\tx([0-9]+), .*
+** ldr\tx([0-9]+), .*
+** adds\tx\3, x\3, x\4
+** csinv\tx\3, x\3, xzr, cc
+** ...
+** ldr\tx([0-9]+), .*
+** ldr\tx([0-9]+), .*
+** adds\tx\5, x\5, x\6
+** csinv\tx\5, x\5, xzr, cc
+** ...
+*/
+/*
+** uaddq2:
+** ...
+** ldr\tq([0-9]+), .*
+** ldr\tq([0-9]+), .*
+** uqadd\tv\2.2d, v\1.2d, v\2.2d
+** ...
+** ldr\tx([0-9]+), .*
+** ldr\tx([0-9]+), .*
+** adds\tx\3, x\3, x\4
+** csinv\tx\3, x\3, xzr, cc
+** ...
+** ldr\tx([0-9]+), .*
+** ldr\tx([0-9]+), .*
+** adds\tx\5, x\5, x\6
+** csinv\tx\5, x\5, xzr, cc
+** ...
+*/
+/*
+** usubq: { xfail *-*-* }
+** ...
+** ldr\tq([0-9]+), .*
+** ldr\tq([0-9]+), .*
+** uqsub\tv\2.2d, v\1.2d, v\2.2d
+** ...
+** ldr\tx([0-9]+), .*
+** ldr\tx([0-9]+), .*
+** subs\tx\3, x\3, x\4
+** csel\tx\3, x\3, xzr, cs
+** ...
+** ldr\tx([0-9]+), .*
+** ldr\tx([0-9]+), .*
+** subs\tx\5, x\5, x\6
+** csel\tx\5, x\5, xzr, cs
+** ...
+*/
+
+#include <limits.h>
+#include <arm_neon.h>
+
+#define UT unsigned long
+#define VT uint64x2_t
+#define UMAX ULONG_MAX
+#define UMIN 0
+
+#include "saturating_arithmetic_autovect.inc"
\ No newline at end of file
new file mode 100644
@@ -0,0 +1,39 @@
+/* Template file for scalar saturating arithmetic validation.
+
+ This file defines scalar saturating addition and subtraction functions for a
+ given type. This type, along with the corresponding minimum and maximum
+ values for that type, must be defined by any test file which includes this
+ template file. */
+
+#ifndef SAT_ARIT_INC
+#define SAT_ARIT_INC
+
+#include <limits.h>
+
+#ifndef UT
+#define UT unsigned int
+#define UMAX UINT_MAX
+#define UMIN 0
+#endif
+
+UT uadd (UT a, UT b)
+{
+ UT sum = a + b;
+ return sum < a ? UMAX : sum;
+}
+
+UT uadd2 (UT a, UT b)
+{
+ UT c;
+ if (!__builtin_add_overflow(a, b, &c))
+ return c;
+ return UMAX;
+}
+
+UT usub (UT a, UT b)
+{
+ UT sum = a - b;
+ return sum > a ? UMIN : sum;
+}
+
+#endif
\ No newline at end of file
new file mode 100644
@@ -0,0 +1,41 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/*
+** uadd:
+** dup v([0-9]+).8b, w0
+** dup v([0-9]+).8b, w1
+** uqadd b\2, b\2, b\1
+** umov w0, v\2.b\[0\]
+** ret
+*/
+/*
+** uadd2:
+** dup v([0-9]+).8b, w0
+** dup v([0-9]+).8b, w1
+** uqadd b\2, b\2, b\1
+** umov w0, v\2.b\[0\]
+** ret
+*/
+/*
+** usub: { xfail *-*-* }
+** dup v([0-9]+).8b, w0
+** dup v([0-9]+).8b, w1
+** (
+** uqsub b\2, (?:b\2, b\1|b\1, b\2)
+** umov w0, v\2.b\[0\]
+** |
+** uqsub b\1, (?:b\2, b\1|b\1, b\2)
+** umov w0, v\1.b\[0\]
+** )
+** ret
+*/
+
+#include <limits.h>
+
+#define UT unsigned char
+#define UMAX UCHAR_MAX
+#define UMIN 0
+
+#include "saturating_arithmetic.inc"
\ No newline at end of file
new file mode 100644
@@ -0,0 +1,41 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/*
+** uadd:
+** dup v([0-9]+).4h, w0
+** dup v([0-9]+).4h, w1
+** uqadd h\2, h\2, h\1
+** umov w0, v\2.h\[0\]
+** ret
+*/
+/*
+** uadd2:
+** dup v([0-9]+).4h, w0
+** dup v([0-9]+).4h, w1
+** uqadd h\2, h\2, h\1
+** umov w0, v\2.h\[0\]
+** ret
+*/
+/*
+** usub: { xfail *-*-* }
+** dup v([0-9]+).4h, w0
+** dup v([0-9]+).4h, w1
+** (
+** uqsub h\2, (?:h\2, h\1|h\1, h\2)
+** umov w0, v\2.h\[0\]
+** |
+** uqsub h\1, (?:h\2, h\1|h\1, h\2)
+** umov w0, v\1.h\[0\]
+** )
+** ret
+*/
+
+#include <limits.h>
+
+#define UT unsigned short
+#define UMAX USHRT_MAX
+#define UMIN 0
+
+#include "saturating_arithmetic.inc"
\ No newline at end of file
new file mode 100644
@@ -0,0 +1,30 @@
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-options "-O2 --save-temps -ftree-vectorize" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/*
+** uadd:
+** adds\tw([0-9]+), w([0-9]+), w([0-9]+)
+** csinv\tw\1, w\1, wzr, cc
+** ret
+*/
+/*
+** uadd2:
+** adds\tw([0-9]+), w([0-9]+), w([0-9]+)
+** csinv\tw\1, w\1, wzr, cc
+** ret
+*/
+/*
+** usub:
+** subs\tw([0-9]+), w([0-9]+), w([0-9]+)
+** csel\tw\1, w\1, wzr, cs
+** ret
+*/
+
+#include <limits.h>
+
+#define UT unsigned int
+#define UMAX UINT_MAX
+#define UMIN 0
+
+#include "saturating_arithmetic.inc"
\ No newline at end of file
new file mode 100644
@@ -0,0 +1,30 @@
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-options "-O2 --save-temps -ftree-vectorize" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/*
+** uadd:
+** adds\tx([0-9]+), x([0-9]+), x([0-9]+)
+** csinv\tx\1, x\1, xzr, cc
+** ret
+*/
+/*
+** uadd2:
+** adds\tx([0-9]+), x([0-9]+), x([0-9]+)
+** csinv\tx\1, x\1, xzr, cc
+** ret
+*/
+/*
+** usub:
+** subs\tx([0-9]+), x([0-9]+), x([0-9]+)
+** csel\tx\1, x\1, xzr, cs
+** ret
+*/
+
+#include <limits.h>
+
+#define UT unsigned long
+#define UMAX ULONG_MAX
+#define UMIN 0
+
+#include "saturating_arithmetic.inc"
\ No newline at end of file