@@ -194,22 +194,46 @@ struct cpu_regmove_cost
struct simd_vec_cost
{
- const int int_stmt_cost; /* Cost of any int vector operation,
- excluding load, store, permute,
- vector-to-scalar and
- scalar-to-vector operation. */
- const int fp_stmt_cost; /* Cost of any fp vector operation,
- excluding load, store, permute,
- vector-to-scalar and
- scalar-to-vector operation. */
- const int permute_cost; /* Cost of permute operation. */
- const int vec_to_scalar_cost; /* Cost of vec-to-scalar operation. */
- const int scalar_to_vec_cost; /* Cost of scalar-to-vector
- operation. */
- const int align_load_cost; /* Cost of aligned vector load. */
- const int unalign_load_cost; /* Cost of unaligned vector load. */
- const int unalign_store_cost; /* Cost of unaligned vector store. */
- const int store_cost; /* Cost of vector store. */
+ /* Cost of any integer vector operation, excluding the ones handled
+ specially below. */
+ const int int_stmt_cost;
+
+  /* Cost of any floating-point vector operation, excluding the ones
+     handled specially below. */
+  const int fp_stmt_cost;
+
+ /* Cost of a permute operation. */
+ const int permute_cost;
+
+ /* Cost of reductions for various vector types: iN is for N-bit
+ integer elements and fN is for N-bit floating-point elements.
+ We need to single out the element type because it affects the
+ depth of the reduction. */
+ const int reduc_i8_cost;
+ const int reduc_i16_cost;
+ const int reduc_i32_cost;
+ const int reduc_i64_cost;
+ const int reduc_f16_cost;
+ const int reduc_f32_cost;
+ const int reduc_f64_cost;
+
+ /* Cost of a vector-to-scalar operation. */
+ const int vec_to_scalar_cost;
+
+ /* Cost of a scalar-to-vector operation. */
+ const int scalar_to_vec_cost;
+
+ /* Cost of an aligned vector load. */
+ const int align_load_cost;
+
+ /* Cost of an unaligned vector load. */
+ const int unalign_load_cost;
+
+ /* Cost of an unaligned vector store. */
+ const int unalign_store_cost;
+
+ /* Cost of a vector store. */
+ const int store_cost;
};
typedef struct simd_vec_cost advsimd_vec_cost;
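To see why the element type needs singling out, note that a pairwise
reduction of a fixed-width vector takes one step per halving of the lane
count, so narrower elements mean a deeper reduction tree.  A minimal
standalone sketch (illustration only, not part of the patch, assuming
128-bit Advanced SIMD vectors):

/* Number of pairwise reduction steps needed to collapse a 128-bit
   vector with ELEMENT_BITS-wide lanes into a single scalar.  */
static int
reduction_steps (int element_bits)
{
  int lanes = 128 / element_bits;	/* e.g. 16 lanes for i8 */
  int steps = 0;
  while (lanes > 1)
    {
      lanes /= 2;			/* one pairwise step */
      steps++;
    }
  return steps;
}

Here reduction_steps (8) == 4 while reduction_steps (64) == 1, which is
why a CPU might reasonably make reduc_i8_cost higher than
reduc_i64_cost.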
@@ -48,4 +48,6 @@ AARCH64_EXTRA_TUNING_OPTION ("rename_load_regs", RENAME_LOAD_REGS)
AARCH64_EXTRA_TUNING_OPTION ("cse_sve_vl_constants", CSE_SVE_VL_CONSTANTS)
+AARCH64_EXTRA_TUNING_OPTION ("use_new_vector_costs", USE_NEW_VECTOR_COSTS)
+
#undef AARCH64_EXTRA_TUNING_OPTION
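A CPU opts in to the new costs by setting
AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS in its tune_params
extra_tuning_flags.  Assuming the usual handling of extra tuning
options applies (that is the standard -moverride mechanism, not
something this patch adds), the flag should also be selectable from the
command line for experimentation, e.g.:

  gcc -O3 -mcpu=neoverse-n1 -moverride=tune=use_new_vector_costs test.c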
@@ -591,6 +591,13 @@ static const advsimd_vec_cost generic_advsimd_vector_cost =
1, /* int_stmt_cost */
1, /* fp_stmt_cost */
2, /* permute_cost */
+ 2, /* reduc_i8_cost */
+ 2, /* reduc_i16_cost */
+ 2, /* reduc_i32_cost */
+ 2, /* reduc_i64_cost */
+ 2, /* reduc_f16_cost */
+ 2, /* reduc_f32_cost */
+ 2, /* reduc_f64_cost */
2, /* vec_to_scalar_cost */
1, /* scalar_to_vec_cost */
1, /* align_load_cost */
@@ -605,6 +612,13 @@ static const sve_vec_cost generic_sve_vector_cost =
1, /* int_stmt_cost */
1, /* fp_stmt_cost */
2, /* permute_cost */
+ 2, /* reduc_i8_cost */
+ 2, /* reduc_i16_cost */
+ 2, /* reduc_i32_cost */
+ 2, /* reduc_i64_cost */
+ 2, /* reduc_f16_cost */
+ 2, /* reduc_f32_cost */
+ 2, /* reduc_f64_cost */
2, /* vec_to_scalar_cost */
1, /* scalar_to_vec_cost */
1, /* align_load_cost */
@@ -631,6 +645,13 @@ static const advsimd_vec_cost a64fx_advsimd_vector_cost =
2, /* int_stmt_cost */
5, /* fp_stmt_cost */
3, /* permute_cost */
+ 13, /* reduc_i8_cost */
+ 13, /* reduc_i16_cost */
+ 13, /* reduc_i32_cost */
+ 13, /* reduc_i64_cost */
+ 13, /* reduc_f16_cost */
+ 13, /* reduc_f32_cost */
+ 13, /* reduc_f64_cost */
13, /* vec_to_scalar_cost */
4, /* scalar_to_vec_cost */
6, /* align_load_cost */
@@ -644,6 +665,13 @@ static const sve_vec_cost a64fx_sve_vector_cost =
2, /* int_stmt_cost */
5, /* fp_stmt_cost */
3, /* permute_cost */
+ 13, /* reduc_i8_cost */
+ 13, /* reduc_i16_cost */
+ 13, /* reduc_i32_cost */
+ 13, /* reduc_i64_cost */
+ 13, /* reduc_f16_cost */
+ 13, /* reduc_f32_cost */
+ 13, /* reduc_f64_cost */
13, /* vec_to_scalar_cost */
4, /* scalar_to_vec_cost */
6, /* align_load_cost */
@@ -669,6 +697,13 @@ static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
1, /* int_stmt_cost */
3, /* fp_stmt_cost */
2, /* permute_cost */
+ 1, /* reduc_i8_cost */
+ 1, /* reduc_i16_cost */
+ 1, /* reduc_i32_cost */
+ 1, /* reduc_i64_cost */
+ 1, /* reduc_f16_cost */
+ 1, /* reduc_f32_cost */
+ 1, /* reduc_f64_cost */
1, /* vec_to_scalar_cost */
1, /* scalar_to_vec_cost */
1, /* align_load_cost */
@@ -696,6 +731,13 @@ static const advsimd_vec_cost thunderx_advsimd_vector_cost =
4, /* int_stmt_cost */
1, /* fp_stmt_cost */
4, /* permute_cost */
+ 2, /* reduc_i8_cost */
+ 2, /* reduc_i16_cost */
+ 2, /* reduc_i32_cost */
+ 2, /* reduc_i64_cost */
+ 2, /* reduc_f16_cost */
+ 2, /* reduc_f32_cost */
+ 2, /* reduc_f64_cost */
2, /* vec_to_scalar_cost */
2, /* scalar_to_vec_cost */
3, /* align_load_cost */
@@ -722,6 +764,13 @@ static const advsimd_vec_cost tsv110_advsimd_vector_cost =
2, /* int_stmt_cost */
2, /* fp_stmt_cost */
2, /* permute_cost */
+ 3, /* reduc_i8_cost */
+ 3, /* reduc_i16_cost */
+ 3, /* reduc_i32_cost */
+ 3, /* reduc_i64_cost */
+ 3, /* reduc_f16_cost */
+ 3, /* reduc_f32_cost */
+ 3, /* reduc_f64_cost */
3, /* vec_to_scalar_cost */
2, /* scalar_to_vec_cost */
5, /* align_load_cost */
@@ -747,6 +796,13 @@ static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
2, /* int_stmt_cost */
2, /* fp_stmt_cost */
3, /* permute_cost */
+ 8, /* reduc_i8_cost */
+ 8, /* reduc_i16_cost */
+ 8, /* reduc_i32_cost */
+ 8, /* reduc_i64_cost */
+ 8, /* reduc_f16_cost */
+ 8, /* reduc_f32_cost */
+ 8, /* reduc_f64_cost */
8, /* vec_to_scalar_cost */
8, /* scalar_to_vec_cost */
4, /* align_load_cost */
@@ -773,6 +829,13 @@ static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
3, /* int_stmt_cost */
3, /* fp_stmt_cost */
3, /* permute_cost */
+ 3, /* reduc_i8_cost */
+ 3, /* reduc_i16_cost */
+ 3, /* reduc_i32_cost */
+ 3, /* reduc_i64_cost */
+ 3, /* reduc_f16_cost */
+ 3, /* reduc_f32_cost */
+ 3, /* reduc_f64_cost */
3, /* vec_to_scalar_cost */
3, /* scalar_to_vec_cost */
5, /* align_load_cost */
@@ -798,6 +861,13 @@ static const advsimd_vec_cost xgene1_advsimd_vector_cost =
2, /* int_stmt_cost */
2, /* fp_stmt_cost */
2, /* permute_cost */
+ 4, /* reduc_i8_cost */
+ 4, /* reduc_i16_cost */
+ 4, /* reduc_i32_cost */
+ 4, /* reduc_i64_cost */
+ 4, /* reduc_f16_cost */
+ 4, /* reduc_f32_cost */
+ 4, /* reduc_f64_cost */
4, /* vec_to_scalar_cost */
4, /* scalar_to_vec_cost */
10, /* align_load_cost */
@@ -824,6 +894,13 @@ static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
4, /* int_stmt_cost */
5, /* fp_stmt_cost */
10, /* permute_cost */
+ 6, /* reduc_i8_cost */
+ 6, /* reduc_i16_cost */
+ 6, /* reduc_i32_cost */
+ 6, /* reduc_i64_cost */
+ 6, /* reduc_f16_cost */
+ 6, /* reduc_f32_cost */
+ 6, /* reduc_f64_cost */
6, /* vec_to_scalar_cost */
5, /* scalar_to_vec_cost */
4, /* align_load_cost */
@@ -850,6 +927,13 @@ static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
5, /* int_stmt_cost */
5, /* fp_stmt_cost */
10, /* permute_cost */
+ 5, /* reduc_i8_cost */
+ 5, /* reduc_i16_cost */
+ 5, /* reduc_i32_cost */
+ 5, /* reduc_i64_cost */
+ 5, /* reduc_f16_cost */
+ 5, /* reduc_f32_cost */
+ 5, /* reduc_f64_cost */
5, /* vec_to_scalar_cost */
5, /* scalar_to_vec_cost */
4, /* align_load_cost */
@@ -13874,6 +13958,28 @@ aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
/* Vectorizer cost model target hooks. */
+/* Return true if the current CPU should use the new costs defined
+ in GCC 11. This should be removed for GCC 12 and above, with the
+ costs applying to all CPUs instead. */
+static bool
+aarch64_use_new_vector_costs_p ()
+{
+ return (aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
+}
+
+/* Return the appropriate SIMD costs for vectors of type VECTYPE. */
+static const simd_vec_cost *
+aarch64_simd_vec_costs (tree vectype)
+{
+ const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
+ if (vectype != NULL
+ && aarch64_sve_mode_p (TYPE_MODE (vectype))
+ && costs->sve != NULL)
+ return costs->sve;
+ return costs->advsimd;
+}
+
/* Implement targetm.vectorize.builtin_vectorization_cost. */
static int
aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
@@ -13887,12 +13993,7 @@ aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
if (vectype != NULL)
fp = FLOAT_TYPE_P (vectype);
- const simd_vec_cost *simd_costs;
- if (vectype != NULL && aarch64_sve_mode_p (TYPE_MODE (vectype))
- && costs->sve != NULL)
- simd_costs = costs->sve;
- else
- simd_costs = costs->advsimd;
+ const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
switch (type_of_cost)
{
@@ -13951,6 +14052,14 @@ aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
}
}
+/* Return true if STMT_INFO represents part of a reduction. */
+static bool
+aarch64_is_reduction (stmt_vec_info stmt_info)
+{
+ return (STMT_VINFO_REDUC_DEF (stmt_info)
+ || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)));
+}
+
/* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
vectors would produce a series of LDP or STP operations. KIND is the
kind of statement that STMT_INFO represents. */
@@ -14014,6 +14123,57 @@ aarch64_integer_truncation_p (stmt_vec_info stmt_info)
&& TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));
}
+/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
+   for STMT_INFO, which has cost kind KIND and which when vectorized
+   would operate on vector type VECTYPE. Try to subdivide
+ the target-independent categorization provided by KIND to get a more
+ accurate cost. WHERE specifies where the cost associated with KIND
+ occurs. */
+static unsigned int
+aarch64_detect_vector_stmt_subtype (vect_cost_for_stmt kind,
+ stmt_vec_info stmt_info, tree vectype,
+ enum vect_cost_model_location where,
+ unsigned int stmt_cost)
+{
+ const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
+
+ /* Detect cases in which vec_to_scalar represents a single reduction
+ instruction like FADDP or MAXV. */
+ if (kind == vec_to_scalar
+ && where == vect_epilogue
+ && aarch64_is_reduction (stmt_info))
+ switch (GET_MODE_INNER (TYPE_MODE (vectype)))
+ {
+ case E_QImode:
+ return simd_costs->reduc_i8_cost;
+
+ case E_HImode:
+ return simd_costs->reduc_i16_cost;
+
+ case E_SImode:
+ return simd_costs->reduc_i32_cost;
+
+ case E_DImode:
+ return simd_costs->reduc_i64_cost;
+
+ case E_HFmode:
+ case E_BFmode:
+ return simd_costs->reduc_f16_cost;
+
+ case E_SFmode:
+ return simd_costs->reduc_f32_cost;
+
+ case E_DFmode:
+ return simd_costs->reduc_f64_cost;
+
+ default:
+ break;
+ }
+
+ /* Otherwise stick with the original categorization. */
+ return stmt_cost;
+}
+
/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
for STMT_INFO, which has cost kind KIND and which when vectorized would
operate on vector type VECTYPE. Adjust the cost as necessary for SVE
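Regarding aarch64_detect_vector_stmt_subtype above: the switch keys off
the element mode alone.  A standalone analogue of the same dispatch,
keyed by element width and FP-ness (hypothetical, for illustration
only; the real code must use machine modes because that is what the
vectorizer hands the backend):

struct reduc_costs { int i8, i16, i32, i64, f16, f32, f64; };

static int
reduction_cost (const struct reduc_costs *c, int element_bits, int is_fp)
{
  switch (element_bits)
    {
    case 8:
      return c->i8;			/* QImode; there is no 8-bit FP mode */
    case 16:
      return is_fp ? c->f16 : c->i16;	/* HFmode/BFmode : HImode */
    case 32:
      return is_fp ? c->f32 : c->i32;	/* SFmode : SImode */
    case 64:
      return is_fp ? c->f64 : c->i64;	/* DFmode : DImode */
    default:
      return -1;			/* caller keeps STMT_COST */
    }
}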
@@ -14097,6 +14257,14 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count,
int stmt_cost =
aarch64_builtin_vectorization_cost (kind, vectype, misalign);
+ /* Try to get a more accurate cost by looking at STMT_INFO instead
+ of just looking at KIND. */
+ if (stmt_info && vectype && aarch64_use_new_vector_costs_p ())
+ stmt_cost = aarch64_detect_vector_stmt_subtype (kind, stmt_info,
+ vectype, where,
+ stmt_cost);
+
+ /* Do any SVE-specific adjustments to the cost. */
if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
stmt_cost = aarch64_sve_adjust_stmt_cost (vinfo, kind, stmt_info,
vectype, stmt_cost);