@@ -1396,21 +1396,21 @@ aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
switch (fcode)
{
- BUILTIN_VALL (UNOP, reduc_splus_, 10)
+ BUILTIN_VALL (UNOP, reduc_splus_, 0)
new_stmt = gimple_build_assign_with_ops (
REDUC_PLUS_EXPR,
gimple_call_lhs (stmt),
args[0],
NULL_TREE);
break;
- BUILTIN_VDQIF (UNOP, reduc_smax_, 10)
+ BUILTIN_VDQIF (UNOP, reduc_smax_, 0)
new_stmt = gimple_build_assign_with_ops (
REDUC_MAX_EXPR,
gimple_call_lhs (stmt),
args[0],
NULL_TREE);
break;
- BUILTIN_VDQIF (UNOP, reduc_smin_, 10)
+ BUILTIN_VDQIF (UNOP, reduc_smin_, 0)
new_stmt = gimple_build_assign_with_ops (
REDUC_MIN_EXPR,
gimple_call_lhs (stmt),
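With the fold above in place, a reduction intrinsic collapses to a single
REDUC_*_EXPR statement. A minimal sketch of a caller (our own test function,
not part of the patch; only the intrinsic comes from arm_neon.h):

  #include <arm_neon.h>

  int32_t
  sum_lanes (int32x4_t __x)
  {
    /* gimple_fold turns the underlying builtin call into
         _1 = REDUC_PLUS_EXPR <__x>;
       which should emit a single across-lanes ADDV.  */
    return vaddvq_s32 (__x);
  }
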
@@ -251,13 +251,19 @@
BUILTIN_VSDQ_I_DI (BINOP, cmgtu, 0)
BUILTIN_VSDQ_I_DI (BINOP, cmtst, 0)
+ /* Implemented by aarch64_reduc_splus_<mode>. */
+ BUILTIN_VALL (UNOP, reduc_splus_, 0)
+
/* Implemented by reduc_<sur>plus_<mode>. */
- BUILTIN_VALL (UNOP, reduc_splus_, 10)
BUILTIN_VDQ (UNOP, reduc_uplus_, 10)
+ /* Implemented by aarch64_reduc_smax_<mode>. */
+ BUILTIN_VDQIF (UNOP, reduc_smax_, 0)
+
+ /* Implemented by aarch64_reduc_smin_<mode>. */
+ BUILTIN_VDQIF (UNOP, reduc_smin_, 0)
+
/* Implemented by reduc_<maxmin_uns>_<mode>. */
- BUILTIN_VDQIF (UNOP, reduc_smax_, 10)
- BUILTIN_VDQIF (UNOP, reduc_smin_, 10)
BUILTIN_VDQ_BHSI (UNOP, reduc_umax_, 10)
BUILTIN_VDQ_BHSI (UNOP, reduc_umin_, 10)
BUILTIN_VDQF (UNOP, reduc_smax_nan_, 10)
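The trailing integer in each entry selects how the builtin's insn code is
derived from the pattern name. A sketch of the mapping, paraphrasing the
CF<n> macros in aarch64-builtins.c of this vintage:

  #define CF0(N, X)  CODE_FOR_aarch64_##N##X   /* 0: "aarch64_"-prefixed pattern.  */
  #define CF10(N, X) CODE_FOR_##N##X           /* 10: bare pattern name.  */

Switching reduc_splus_ and friends from 10 to 0 therefore repoints them at the
new aarch64_reduc_*_<mode> expanders added below, leaving the bare
reduc_*_<mode> patterns to the vectorizer.
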
@@ -1719,6 +1719,19 @@
;; 'across lanes' add.
+;; Template for outputting a scalar, so we can create __builtins which can be
+;; gimple_fold'd to the REDUC_PLUS_EXPR tree code.
+(define_expand "aarch64_reduc_splus_<mode>"
+ [(set (match_operand:<VEL> 0 "register_operand")
+ (match_operand:VALL 1 "register_operand"))]
+ "TARGET_SIMD"
+ {
+ /* Must be handled by aarch64_gimple_fold_builtin. */
+ gcc_unreachable ();
+ FAIL;
+ }
+)
+
(define_insn "reduc_<sur>plus_<mode>"
[(set (match_operand:VDQV 0 "register_operand" "=w")
(unspec:VDQV [(match_operand:VDQV 1 "register_operand" "w")]
@@ -1776,6 +1789,31 @@
;; 'across lanes' max and min ops.
+;; Template for outputting a scalar, so we can create __builtins which can be
+;; gimple_fold'd to the REDUC_MAX_EXPR tree code. (The V2DI variant is unused.)
+(define_expand "aarch64_reduc_smax_<mode>"
+ [(set (match_operand:<VEL> 0 "register_operand")
+ (match_operand:VALL 1 "register_operand"))]
+ "TARGET_SIMD"
+ {
+ /* Must be handled by aarch64_gimple_fold_builtin. */
+ gcc_unreachable ();
+ FAIL;
+ }
+)
+
+;; Likewise for REDUC_MIN_EXPR tree code.
+(define_expand "aarch64_reduc_smin_<mode>"
+ [(set (match_operand:<VEL> 0 "register_operand")
+ (match_operand:VALL 1 "register_operand"))]
+ "TARGET_SIMD"
+ {
+ /* Must be handled by aarch64_gimple_fold_builtin. */
+ gcc_unreachable ();
+ FAIL;
+ }
+)
+
(define_insn "reduc_<maxmin_uns>_<mode>"
[(set (match_operand:VDQV_S 0 "register_operand" "=w")
(unspec:VDQV_S [(match_operand:VDQV_S 1 "register_operand" "w")]
@@ -13532,19 +13532,19 @@ vaddd_u64 (uint64_t __a, uint64_t __b)
__extension__ static __inline int8_t __attribute__ ((__always_inline__))
vaddv_s8 (int8x8_t __a)
{
- return vget_lane_s8 (__builtin_aarch64_reduc_splus_v8qi (__a), 0);
+ return __builtin_aarch64_reduc_splus_v8qi (__a);
}
__extension__ static __inline int16_t __attribute__ ((__always_inline__))
vaddv_s16 (int16x4_t __a)
{
- return vget_lane_s16 (__builtin_aarch64_reduc_splus_v4hi (__a), 0);
+ return __builtin_aarch64_reduc_splus_v4hi (__a);
}
__extension__ static __inline int32_t __attribute__ ((__always_inline__))
vaddv_s32 (int32x2_t __a)
{
- return vget_lane_s32 (__builtin_aarch64_reduc_splus_v2si (__a), 0);
+ return __builtin_aarch64_reduc_splus_v2si (__a);
}
__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
@@ -13574,26 +13574,25 @@ vaddv_u32 (uint32x2_t __a)
__extension__ static __inline int8_t __attribute__ ((__always_inline__))
vaddvq_s8 (int8x16_t __a)
{
- return vgetq_lane_s8 (__builtin_aarch64_reduc_splus_v16qi (__a),
- 0);
+ return __builtin_aarch64_reduc_splus_v16qi (__a);
}
__extension__ static __inline int16_t __attribute__ ((__always_inline__))
vaddvq_s16 (int16x8_t __a)
{
- return vgetq_lane_s16 (__builtin_aarch64_reduc_splus_v8hi (__a), 0);
+ return __builtin_aarch64_reduc_splus_v8hi (__a);
}
__extension__ static __inline int32_t __attribute__ ((__always_inline__))
vaddvq_s32 (int32x4_t __a)
{
- return vgetq_lane_s32 (__builtin_aarch64_reduc_splus_v4si (__a), 0);
+ return __builtin_aarch64_reduc_splus_v4si (__a);
}
__extension__ static __inline int64_t __attribute__ ((__always_inline__))
vaddvq_s64 (int64x2_t __a)
{
- return vgetq_lane_s64 (__builtin_aarch64_reduc_splus_v2di (__a), 0);
+ return __builtin_aarch64_reduc_splus_v2di (__a);
}
__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
@@ -13631,22 +13630,19 @@ vaddvq_u64 (uint64x2_t __a)
__extension__ static __inline float32_t __attribute__ ((__always_inline__))
vaddv_f32 (float32x2_t __a)
{
- float32x2_t __t = __builtin_aarch64_reduc_splus_v2sf (__a);
- return vget_lane_f32 (__t, 0);
+ return __builtin_aarch64_reduc_splus_v2sf (__a);
}
__extension__ static __inline float32_t __attribute__ ((__always_inline__))
vaddvq_f32 (float32x4_t __a)
{
- float32x4_t __t = __builtin_aarch64_reduc_splus_v4sf (__a);
- return vgetq_lane_f32 (__t, 0);
+ return __builtin_aarch64_reduc_splus_v4sf (__a);
}
__extension__ static __inline float64_t __attribute__ ((__always_inline__))
vaddvq_f64 (float64x2_t __a)
{
- float64x2_t __t = __builtin_aarch64_reduc_splus_v2df (__a);
- return vgetq_lane_f64 (__t, 0);
+ return __builtin_aarch64_reduc_splus_v2df (__a);
}
/* vbsl */
@@ -18125,19 +18121,19 @@ vmaxv_f32 (float32x2_t __a)
__extension__ static __inline int8_t __attribute__ ((__always_inline__))
vmaxv_s8 (int8x8_t __a)
{
- return vget_lane_s8 (__builtin_aarch64_reduc_smax_v8qi (__a), 0);
+ return __builtin_aarch64_reduc_smax_v8qi (__a);
}
__extension__ static __inline int16_t __attribute__ ((__always_inline__))
vmaxv_s16 (int16x4_t __a)
{
- return vget_lane_s16 (__builtin_aarch64_reduc_smax_v4hi (__a), 0);
+ return __builtin_aarch64_reduc_smax_v4hi (__a);
}
__extension__ static __inline int32_t __attribute__ ((__always_inline__))
vmaxv_s32 (int32x2_t __a)
{
- return vget_lane_s32 (__builtin_aarch64_reduc_smax_v2si (__a), 0);
+ return __builtin_aarch64_reduc_smax_v2si (__a);
}
__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
@@ -18181,19 +18177,19 @@ vmaxvq_f64 (float64x2_t __a)
__extension__ static __inline int8_t __attribute__ ((__always_inline__))
vmaxvq_s8 (int8x16_t __a)
{
- return vgetq_lane_s8 (__builtin_aarch64_reduc_smax_v16qi (__a), 0);
+ return __builtin_aarch64_reduc_smax_v16qi (__a);
}
__extension__ static __inline int16_t __attribute__ ((__always_inline__))
vmaxvq_s16 (int16x8_t __a)
{
- return vgetq_lane_s16 (__builtin_aarch64_reduc_smax_v8hi (__a), 0);
+ return __builtin_aarch64_reduc_smax_v8hi (__a);
}
__extension__ static __inline int32_t __attribute__ ((__always_inline__))
vmaxvq_s32 (int32x4_t __a)
{
- return vgetq_lane_s32 (__builtin_aarch64_reduc_smax_v4si (__a), 0);
+ return __builtin_aarch64_reduc_smax_v4si (__a);
}
__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
@@ -18225,20 +18221,19 @@ vmaxvq_u32 (uint32x4_t __a)
__extension__ static __inline float32_t __attribute__ ((__always_inline__))
vmaxnmv_f32 (float32x2_t __a)
{
- return vget_lane_f32 (__builtin_aarch64_reduc_smax_v2sf (__a),
- 0);
+ return __builtin_aarch64_reduc_smax_v2sf (__a);
}
__extension__ static __inline float32_t __attribute__ ((__always_inline__))
vmaxnmvq_f32 (float32x4_t __a)
{
- return vgetq_lane_f32 (__builtin_aarch64_reduc_smax_v4sf (__a), 0);
+ return __builtin_aarch64_reduc_smax_v4sf (__a);
}
__extension__ static __inline float64_t __attribute__ ((__always_inline__))
vmaxnmvq_f64 (float64x2_t __a)
{
- return vgetq_lane_f64 (__builtin_aarch64_reduc_smax_v2df (__a), 0);
+ return __builtin_aarch64_reduc_smax_v2df (__a);
}
/* vmin */
@@ -18371,20 +18366,19 @@ vminv_f32 (float32x2_t __a)
__extension__ static __inline int8_t __attribute__ ((__always_inline__))
vminv_s8 (int8x8_t __a)
{
- return vget_lane_s8 (__builtin_aarch64_reduc_smin_v8qi (__a),
- 0);
+ return __builtin_aarch64_reduc_smin_v8qi (__a);
}
__extension__ static __inline int16_t __attribute__ ((__always_inline__))
vminv_s16 (int16x4_t __a)
{
- return vget_lane_s16 (__builtin_aarch64_reduc_smin_v4hi (__a), 0);
+ return __builtin_aarch64_reduc_smin_v4hi (__a);
}
__extension__ static __inline int32_t __attribute__ ((__always_inline__))
vminv_s32 (int32x2_t __a)
{
- return vget_lane_s32 (__builtin_aarch64_reduc_smin_v2si (__a), 0);
+ return __builtin_aarch64_reduc_smin_v2si (__a);
}
__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
@@ -18428,19 +18422,19 @@ vminvq_f64 (float64x2_t __a)
__extension__ static __inline int8_t __attribute__ ((__always_inline__))
vminvq_s8 (int8x16_t __a)
{
- return vgetq_lane_s8 (__builtin_aarch64_reduc_smin_v16qi (__a), 0);
+ return __builtin_aarch64_reduc_smin_v16qi (__a);
}
__extension__ static __inline int16_t __attribute__ ((__always_inline__))
vminvq_s16 (int16x8_t __a)
{
- return vgetq_lane_s16 (__builtin_aarch64_reduc_smin_v8hi (__a), 0);
+ return __builtin_aarch64_reduc_smin_v8hi (__a);
}
__extension__ static __inline int32_t __attribute__ ((__always_inline__))
vminvq_s32 (int32x4_t __a)
{
- return vgetq_lane_s32 (__builtin_aarch64_reduc_smin_v4si (__a), 0);
+ return __builtin_aarch64_reduc_smin_v4si (__a);
}
__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
@@ -18472,19 +18466,19 @@ vminvq_u32 (uint32x4_t __a)
__extension__ static __inline float32_t __attribute__ ((__always_inline__))
vminnmv_f32 (float32x2_t __a)
{
- return vget_lane_f32 (__builtin_aarch64_reduc_smin_v2sf (__a), 0);
+ return __builtin_aarch64_reduc_smin_v2sf (__a);
}
__extension__ static __inline float32_t __attribute__ ((__always_inline__))
vminnmvq_f32 (float32x4_t __a)
{
- return vgetq_lane_f32 (__builtin_aarch64_reduc_smin_v4sf (__a), 0);
+ return __builtin_aarch64_reduc_smin_v4sf (__a);
}
__extension__ static __inline float64_t __attribute__ ((__always_inline__))
vminnmvq_f64 (float64x2_t __a)
{
- return vgetq_lane_f64 (__builtin_aarch64_reduc_smin_v2df (__a), 0);
+ return __builtin_aarch64_reduc_smin_v2df (__a);
}
/* vmla */
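Since the builtins now return the scalar directly, every wrapper above loses
its vector temporary and lane extract. An illustrative caller (the function
mean2 is ours, purely for demonstration):

  #include <arm_neon.h>

  float32_t
  mean2 (float32x2_t __x)
  {
    /* No float32x2_t temporary and no vget_lane_f32 remain in the
       intrinsic's body; the builtin itself yields the float32_t.  */
    return vaddv_f32 (__x) * 0.5f;
  }
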
@@ -9019,7 +9019,17 @@ expand_expr_real_2 (sepops ops, rtx target, enum machine_mode tmode,
{
op0 = expand_normal (treeop0);
this_optab = optab_for_tree_code (code, type, optab_default);
- temp = expand_unop (mode, this_optab, op0, target, unsignedp);
+ enum machine_mode vec_mode = TYPE_MODE (TREE_TYPE (treeop0));
+ temp = expand_unop (vec_mode, this_optab, op0, NULL_RTX, unsignedp);
+ gcc_assert (temp);
+ /* The tree code produces a scalar result, but (somewhat by convention)
+ the optab produces a vector with the result in element 0 if
+ little-endian, or element N-1 if big-endian. So pull the scalar
+ result out of that element. */
+ int index = BYTES_BIG_ENDIAN ? GET_MODE_NUNITS (vec_mode) - 1 : 0;
+ int bitsize = GET_MODE_BITSIZE (GET_MODE_INNER (vec_mode));
+ temp = extract_bit_field (temp, bitsize, bitsize * index, unsignedp,
+ target, mode, mode);
gcc_assert (temp);
return temp;
}
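A stand-alone sketch of the element-selection arithmetic above, in plain C
with illustrative names (not the GCC internals themselves):

  #include <stdio.h>

  /* The optab leaves the scalar result in lane 0 on little-endian targets
     and in lane N-1 on big-endian targets; return the bit offset of that
     lane within the vector.  */
  static unsigned
  reduc_result_bitpos (unsigned nunits, unsigned elem_bits, int big_endian)
  {
    unsigned index = big_endian ? nunits - 1 : 0;
    return elem_bits * index;
  }

  int
  main (void)
  {
    /* V4SImode: 4 lanes of 32 bits each.  */
    printf ("LE: bit %u\n", reduc_result_bitpos (4, 32, 0));  /* 0  */
    printf ("BE: bit %u\n", reduc_result_bitpos (4, 32, 1));  /* 96 */
    return 0;
  }
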
@@ -8439,12 +8439,13 @@ fold_unary_loc (location_t loc, enum tree_code code, tree type, tree op0)
case REDUC_MAX_EXPR:
case REDUC_PLUS_EXPR:
{
- unsigned int nelts = TYPE_VECTOR_SUBPARTS (type), i;
+ unsigned int nelts, i;
tree *elts;
enum tree_code subcode;
if (TREE_CODE (op0) != VECTOR_CST)
return NULL_TREE;
+ nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (op0));
elts = XALLOCAVEC (tree, nelts);
if (!vec_cst_ctor_to_array (op0, elts))
@@ -8463,10 +8464,9 @@ fold_unary_loc (location_t loc, enum tree_code code, tree type, tree op0)
elts[0] = const_binop (subcode, elts[0], elts[i]);
if (elts[0] == NULL_TREE || !CONSTANT_CLASS_P (elts[0]))
return NULL_TREE;
- elts[i] = build_zero_cst (TREE_TYPE (type));
}
- return build_vector (type, elts);
+ return elts[0];
}
default:
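The fold now left-folds the constant vector's elements with the scalar
operation and returns element 0 itself, rather than rebuilding a vector.
A behavioural sketch in plain C (illustrative, not the tree-level code):

  /* REDUC_PLUS_EXPR <{1, 2, 3, 4}> folds to the scalar 10.  */
  static int
  fold_reduc_plus (const int *elts, unsigned nelts)
  {
    int acc = elts[0];
    for (unsigned i = 1; i < nelts; i++)
      acc += elts[i];
    return acc;
  }
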
@@ -3531,12 +3531,21 @@ verify_gimple_assign_unary (gimple stmt)
return false;
}
-
- case VEC_UNPACK_HI_EXPR:
- case VEC_UNPACK_LO_EXPR:
case REDUC_MAX_EXPR:
case REDUC_MIN_EXPR:
case REDUC_PLUS_EXPR:
+ if (!VECTOR_TYPE_P (rhs1_type)
+ || !useless_type_conversion_p (lhs_type, TREE_TYPE (rhs1_type)))
+ {
+ error ("reduction should convert from vector to element type");
+ debug_generic_expr (lhs_type);
+ debug_generic_expr (rhs1_type);
+ return true;
+ }
+ return false;
+
+ case VEC_UNPACK_HI_EXPR:
+ case VEC_UNPACK_LO_EXPR:
case VEC_UNPACK_FLOAT_HI_EXPR:
case VEC_UNPACK_FLOAT_LO_EXPR:
/* FIXME. */
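The verifier thus requires the lhs to be the element type of the rhs vector.
In pseudo-gimple (types written inline for clarity; illustrative SSA names):

  int _5 = REDUC_PLUS_EXPR <v_3(D)>;            /* accepted: vector(4) int -> int  */
  vector(4) int _6 = REDUC_PLUS_EXPR <v_3(D)>;  /* rejected: lhs must be the
                                                   element type  */
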
@@ -1892,9 +1892,9 @@ vect_analyze_loop (struct loop *loop)
Output:
REDUC_CODE - the corresponding tree-code to be used to reduce the
- vector of partial results into a single scalar result (which
- will also reside in a vector) or ERROR_MARK if the operation is
- a supported reduction operation, but does not have such tree-code.
+ vector of partial results into a single scalar result, or ERROR_MARK
+ if the operation is a supported reduction operation, but does not have
+ such a tree code.
Return FALSE if CODE currently cannot be vectorized as reduction. */
@@ -4175,14 +4175,12 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
dump_printf_loc (MSG_NOTE, vect_location,
"Reduce using direct vector reduction.\n");
- vec_dest = vect_create_destination_var (scalar_dest, vectype);
- tmp = build1 (reduc_code, vectype, new_phi_result);
- epilog_stmt = gimple_build_assign (vec_dest, tmp);
- new_temp = make_ssa_name (vec_dest, epilog_stmt);
+ tmp = build1 (reduc_code, scalar_type, new_phi_result);
+ epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
+ new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
gimple_assign_set_lhs (epilog_stmt, new_temp);
gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
-
- extract_scalar_result = true;
+ scalar_results.safe_push (new_temp);
}
else
{
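In pseudo-gimple, the sum-reduction epilog changes roughly as follows
(illustrative SSA names; the BIT_FIELD_REF offset shown is the little-endian
case):

  before:  vect_sum.9_20 = REDUC_PLUS_EXPR <vect_sum.8_19>;   /* vector temp   */
           stmp_21 = BIT_FIELD_REF <vect_sum.9_20, 32, 0>;    /* extract lane  */
  after:   stmp_21 = REDUC_PLUS_EXPR <vect_sum.8_19>;         /* scalar result */
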
@@ -1157,10 +1157,8 @@ DEFTREECODE (TRANSACTION_EXPR, "transaction_expr", tcc_expression, 1)
result (e.g. summing the elements of the vector, finding the minimum over
the vector elements, etc).
Operand 0 is a vector.
- The expression returns a vector of the same type, with the first
- element in the vector holding the result of the reduction of all elements
- of the operand. The content of the other elements in the returned vector
- is undefined. */
+ The expression returns a scalar, with type the same as the elements of the
+ vector, holding the result of the reduction of all elements of the operand. */
DEFTREECODE (REDUC_MAX_EXPR, "reduc_max_expr", tcc_unary, 1)
DEFTREECODE (REDUC_MIN_EXPR, "reduc_min_expr", tcc_unary, 1)
DEFTREECODE (REDUC_PLUS_EXPR, "reduc_plus_expr", tcc_unary, 1)
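Worked example of the revised semantics: REDUC_MAX_EXPR applied to the
constant vector {1, 5, 3, 2} now yields the scalar 5; previously it yielded a
vector whose first element was 5 and whose other elements were undefined.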