@@ -194,22 +194,46 @@ struct cpu_regmove_cost
struct simd_vec_cost
{
- const int int_stmt_cost; /* Cost of any int vector operation,
- excluding load, store, permute,
- vector-to-scalar and
- scalar-to-vector operation. */
- const int fp_stmt_cost; /* Cost of any fp vector operation,
- excluding load, store, permute,
- vector-to-scalar and
- scalar-to-vector operation. */
- const int permute_cost; /* Cost of permute operation. */
- const int vec_to_scalar_cost; /* Cost of vec-to-scalar operation. */
- const int scalar_to_vec_cost; /* Cost of scalar-to-vector
- operation. */
- const int align_load_cost; /* Cost of aligned vector load. */
- const int unalign_load_cost; /* Cost of unaligned vector load. */
- const int unalign_store_cost; /* Cost of unaligned vector store. */
- const int store_cost; /* Cost of vector store. */
+ /* Cost of any integer vector operation, excluding the ones handled
+ specially below. */
+ const int int_stmt_cost;
+
+  /* Cost of any floating-point vector operation, excluding the ones
+     handled specially below. */
+  const int fp_stmt_cost;
+
+ /* Cost of a permute operation. */
+ const int permute_cost;
+
+ /* Cost of reductions for various vector types: iN is for N-bit
+ integer elements and fN is for N-bit floating-point elements.
+ We need to single out the element type because it affects the
+ depth of the reduction. */
+ const int reduc_i8_cost;
+ const int reduc_i16_cost;
+ const int reduc_i32_cost;
+ const int reduc_i64_cost;
+ const int reduc_f16_cost;
+ const int reduc_f32_cost;
+ const int reduc_f64_cost;
+
+ /* Cost of a vector-to-scalar operation. */
+ const int vec_to_scalar_cost;
+
+ /* Cost of a scalar-to-vector operation. */
+ const int scalar_to_vec_cost;
+
+ /* Cost of an aligned vector load. */
+ const int align_load_cost;
+
+ /* Cost of an unaligned vector load. */
+ const int unalign_load_cost;
+
+ /* Cost of an unaligned vector store. */
+ const int unalign_store_cost;
+
+ /* Cost of a vector store. */
+ const int store_cost;
};
typedef struct simd_vec_cost advsimd_vec_cost;
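To see why the element type needs singling out, note that a pairwise
reduction of a fixed-width vector takes one step per halving of the lane
count, so narrower elements mean a deeper reduction tree.  A minimal
standalone sketch (illustration only, not part of the patch, assuming
128-bit Advanced SIMD vectors):

/* Number of pairwise reduction steps needed to collapse a 128-bit
   vector with ELEMENT_BITS-wide lanes into a single scalar.  */
static int
reduction_steps (int element_bits)
{
  int lanes = 128 / element_bits;	/* e.g. 16 lanes for i8 */
  int steps = 0;
  while (lanes > 1)
    {
      lanes /= 2;			/* one pairwise step */
      steps++;
    }
  return steps;
}

Here reduction_steps (8) == 4 while reduction_steps (64) == 1, which is
why a CPU might reasonably make reduc_i8_cost higher than
reduc_i64_cost.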
@@ -48,4 +48,6 @@ AARCH64_EXTRA_TUNING_OPTION ("rename_load_regs", RENAME_LOAD_REGS)
AARCH64_EXTRA_TUNING_OPTION ("cse_sve_vl_constants", CSE_SVE_VL_CONSTANTS)
+AARCH64_EXTRA_TUNING_OPTION ("use_new_vector_costs", USE_NEW_VECTOR_COSTS)
+
#undef AARCH64_EXTRA_TUNING_OPTION
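A CPU opts in to the new costs by setting
AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS in its tune_params
extra_tuning_flags.  Assuming the usual handling of extra tuning
options applies (that is the standard -moverride mechanism, not
something this patch adds), the flag should also be selectable from the
command line for experimentation, e.g.:

  gcc -O3 -mcpu=neoverse-n1 -moverride=tune=use_new_vector_costs test.c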
@@ -591,6 +591,13 @@ static const advsimd_vec_cost generic_advsimd_vector_cost =
1, /* int_stmt_cost */
1, /* fp_stmt_cost */
2, /* permute_cost */
+ 2, /* reduc_i8_cost */
+ 2, /* reduc_i16_cost */
+ 2, /* reduc_i32_cost */
+ 2, /* reduc_i64_cost */
+ 2, /* reduc_f16_cost */
+ 2, /* reduc_f32_cost */
+ 2, /* reduc_f64_cost */
2, /* vec_to_scalar_cost */
1, /* scalar_to_vec_cost */
1, /* align_load_cost */
@@ -605,6 +612,13 @@ static const sve_vec_cost generic_sve_vector_cost =
1, /* int_stmt_cost */
1, /* fp_stmt_cost */
2, /* permute_cost */
+ 2, /* reduc_i8_cost */
+ 2, /* reduc_i16_cost */
+ 2, /* reduc_i32_cost */
+ 2, /* reduc_i64_cost */
+ 2, /* reduc_f16_cost */
+ 2, /* reduc_f32_cost */
+ 2, /* reduc_f64_cost */
2, /* vec_to_scalar_cost */
1, /* scalar_to_vec_cost */
1, /* align_load_cost */
@@ -631,6 +645,13 @@ static const advsimd_vec_cost a64fx_advsimd_vector_cost =
2, /* int_stmt_cost */
5, /* fp_stmt_cost */
3, /* permute_cost */
+ 13, /* reduc_i8_cost */
+ 13, /* reduc_i16_cost */
+ 13, /* reduc_i32_cost */
+ 13, /* reduc_i64_cost */
+ 13, /* reduc_f16_cost */
+ 13, /* reduc_f32_cost */
+ 13, /* reduc_f64_cost */
13, /* vec_to_scalar_cost */
4, /* scalar_to_vec_cost */
6, /* align_load_cost */
@@ -644,6 +665,13 @@ static const sve_vec_cost a64fx_sve_vector_cost =
2, /* int_stmt_cost */
5, /* fp_stmt_cost */
3, /* permute_cost */
+ 13, /* reduc_i8_cost */
+ 13, /* reduc_i16_cost */
+ 13, /* reduc_i32_cost */
+ 13, /* reduc_i64_cost */
+ 13, /* reduc_f16_cost */
+ 13, /* reduc_f32_cost */
+ 13, /* reduc_f64_cost */
13, /* vec_to_scalar_cost */
4, /* scalar_to_vec_cost */
6, /* align_load_cost */
@@ -669,6 +697,13 @@ static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
1, /* int_stmt_cost */
3, /* fp_stmt_cost */
2, /* permute_cost */
+ 1, /* reduc_i8_cost */
+ 1, /* reduc_i16_cost */
+ 1, /* reduc_i32_cost */
+ 1, /* reduc_i64_cost */
+ 1, /* reduc_f16_cost */
+ 1, /* reduc_f32_cost */
+ 1, /* reduc_f64_cost */
1, /* vec_to_scalar_cost */
1, /* scalar_to_vec_cost */
1, /* align_load_cost */
@@ -696,6 +731,13 @@ static const advsimd_vec_cost thunderx_advsimd_vector_cost =
4, /* int_stmt_cost */
1, /* fp_stmt_cost */
4, /* permute_cost */
+ 2, /* reduc_i8_cost */
+ 2, /* reduc_i16_cost */
+ 2, /* reduc_i32_cost */
+ 2, /* reduc_i64_cost */
+ 2, /* reduc_f16_cost */
+ 2, /* reduc_f32_cost */
+ 2, /* reduc_f64_cost */
2, /* vec_to_scalar_cost */
2, /* scalar_to_vec_cost */
3, /* align_load_cost */
@@ -722,6 +764,13 @@ static const advsimd_vec_cost tsv110_advsimd_vector_cost =
2, /* int_stmt_cost */
2, /* fp_stmt_cost */
2, /* permute_cost */
+ 3, /* reduc_i8_cost */
+ 3, /* reduc_i16_cost */
+ 3, /* reduc_i32_cost */
+ 3, /* reduc_i64_cost */
+ 3, /* reduc_f16_cost */
+ 3, /* reduc_f32_cost */
+ 3, /* reduc_f64_cost */
3, /* vec_to_scalar_cost */
2, /* scalar_to_vec_cost */
5, /* align_load_cost */
@@ -747,6 +796,13 @@ static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
2, /* int_stmt_cost */
2, /* fp_stmt_cost */
3, /* permute_cost */
+ 8, /* reduc_i8_cost */
+ 8, /* reduc_i16_cost */
+ 8, /* reduc_i32_cost */
+ 8, /* reduc_i64_cost */
+ 8, /* reduc_f16_cost */
+ 8, /* reduc_f32_cost */
+ 8, /* reduc_f64_cost */
8, /* vec_to_scalar_cost */
8, /* scalar_to_vec_cost */
4, /* align_load_cost */
@@ -773,6 +829,13 @@ static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
3, /* int_stmt_cost */
3, /* fp_stmt_cost */
3, /* permute_cost */
+ 3, /* reduc_i8_cost */
+ 3, /* reduc_i16_cost */
+ 3, /* reduc_i32_cost */
+ 3, /* reduc_i64_cost */
+ 3, /* reduc_f16_cost */
+ 3, /* reduc_f32_cost */
+ 3, /* reduc_f64_cost */
3, /* vec_to_scalar_cost */
3, /* scalar_to_vec_cost */
5, /* align_load_cost */
@@ -798,6 +861,13 @@ static const advsimd_vec_cost xgene1_advsimd_vector_cost =
2, /* int_stmt_cost */
2, /* fp_stmt_cost */
2, /* permute_cost */
+ 4, /* reduc_i8_cost */
+ 4, /* reduc_i16_cost */
+ 4, /* reduc_i32_cost */
+ 4, /* reduc_i64_cost */
+ 4, /* reduc_f16_cost */
+ 4, /* reduc_f32_cost */
+ 4, /* reduc_f64_cost */
4, /* vec_to_scalar_cost */
4, /* scalar_to_vec_cost */
10, /* align_load_cost */
@@ -824,6 +894,13 @@ static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
4, /* int_stmt_cost */
5, /* fp_stmt_cost */
10, /* permute_cost */
+ 6, /* reduc_i8_cost */
+ 6, /* reduc_i16_cost */
+ 6, /* reduc_i32_cost */
+ 6, /* reduc_i64_cost */
+ 6, /* reduc_f16_cost */
+ 6, /* reduc_f32_cost */
+ 6, /* reduc_f64_cost */
6, /* vec_to_scalar_cost */
5, /* scalar_to_vec_cost */
4, /* align_load_cost */
@@ -850,6 +927,13 @@ static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
5, /* int_stmt_cost */
5, /* fp_stmt_cost */
10, /* permute_cost */
+ 5, /* reduc_i8_cost */
+ 5, /* reduc_i16_cost */
+ 5, /* reduc_i32_cost */
+ 5, /* reduc_i64_cost */
+ 5, /* reduc_f16_cost */
+ 5, /* reduc_f32_cost */
+ 5, /* reduc_f64_cost */
5, /* vec_to_scalar_cost */
5, /* scalar_to_vec_cost */
4, /* align_load_cost */
@@ -13874,6 +13958,28 @@ aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
/* Vectorizer cost model target hooks. */
+/* Return true if the current CPU should use the new costs defined
+ in GCC 11. This should be removed for GCC 12 and above, with the
+ costs applying to all CPUs instead. */
+static bool
+aarch64_use_new_vector_costs_p ()
+{
+ return (aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
+}
+
+/* Return the appropriate SIMD costs for vectors of type VECTYPE. */
+static const simd_vec_cost *
+aarch64_simd_vec_costs (tree vectype)
+{
+ const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
+ if (vectype != NULL
+ && aarch64_sve_mode_p (TYPE_MODE (vectype))
+ && costs->sve != NULL)
+ return costs->sve;
+ return costs->advsimd;
+}
+
/* Implement targetm.vectorize.builtin_vectorization_cost. */
static int
aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
@@ -13887,12 +13993,7 @@ aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
if (vectype != NULL)
fp = FLOAT_TYPE_P (vectype);
- const simd_vec_cost *simd_costs;
- if (vectype != NULL && aarch64_sve_mode_p (TYPE_MODE (vectype))
- && costs->sve != NULL)
- simd_costs = costs->sve;
- else
- simd_costs = costs->advsimd;
+ const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
switch (type_of_cost)
{
@@ -13951,6 +14052,14 @@ aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
}
}
+/* Return true if STMT_INFO represents part of a reduction. */
+static bool
+aarch64_is_reduction (stmt_vec_info stmt_info)
+{
+ return (STMT_VINFO_REDUC_DEF (stmt_info)
+ || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)));
+}
+
/* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
vectors would produce a series of LDP or STP operations. KIND is the
kind of statement that STMT_INFO represents. */
@@ -14014,6 +14123,57 @@ aarch64_integer_truncation_p (stmt_vec_info stmt_info)
&& TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));
}
+/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
+   for STMT_INFO, which has cost kind KIND and which when vectorized
+   would operate on vector type VECTYPE. Try to subdivide
+ the target-independent categorization provided by KIND to get a more
+ accurate cost. WHERE specifies where the cost associated with KIND
+ occurs. */
+static unsigned int
+aarch64_detect_vector_stmt_subtype (vect_cost_for_stmt kind,
+ stmt_vec_info stmt_info, tree vectype,
+ enum vect_cost_model_location where,
+ unsigned int stmt_cost)
+{
+ const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
+
+ /* Detect cases in which vec_to_scalar represents a single reduction
+ instruction like FADDP or MAXV. */
+ if (kind == vec_to_scalar
+ && where == vect_epilogue
+ && aarch64_is_reduction (stmt_info))
+ switch (GET_MODE_INNER (TYPE_MODE (vectype)))
+ {
+ case E_QImode:
+ return simd_costs->reduc_i8_cost;
+
+ case E_HImode:
+ return simd_costs->reduc_i16_cost;
+
+ case E_SImode:
+ return simd_costs->reduc_i32_cost;
+
+ case E_DImode:
+ return simd_costs->reduc_i64_cost;
+
+ case E_HFmode:
+ case E_BFmode:
+ return simd_costs->reduc_f16_cost;
+
+ case E_SFmode:
+ return simd_costs->reduc_f32_cost;
+
+ case E_DFmode:
+ return simd_costs->reduc_f64_cost;
+
+ default:
+ break;
+ }
+
+ /* Otherwise stick with the original categorization. */
+ return stmt_cost;
+}
+
/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
for STMT_INFO, which has cost kind KIND and which when vectorized would
operate on vector type VECTYPE. Adjust the cost as necessary for SVE
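Regarding aarch64_detect_vector_stmt_subtype above: the switch keys off
the element mode alone.  A standalone analogue of the same dispatch,
keyed by element width and FP-ness (hypothetical, for illustration
only; the real code must use machine modes because that is what the
vectorizer hands the backend):

struct reduc_costs { int i8, i16, i32, i64, f16, f32, f64; };

static int
reduction_cost (const struct reduc_costs *c, int element_bits, int is_fp)
{
  switch (element_bits)
    {
    case 8:
      return c->i8;			/* QImode; there is no 8-bit FP mode */
    case 16:
      return is_fp ? c->f16 : c->i16;	/* HFmode/BFmode : HImode */
    case 32:
      return is_fp ? c->f32 : c->i32;	/* SFmode : SImode */
    case 64:
      return is_fp ? c->f64 : c->i64;	/* DFmode : DImode */
    default:
      return -1;			/* caller keeps STMT_COST */
    }
}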
@@ -14097,6 +14257,14 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count,
int stmt_cost =
aarch64_builtin_vectorization_cost (kind, vectype, misalign);
+ /* Try to get a more accurate cost by looking at STMT_INFO instead
+ of just looking at KIND. */
+ if (stmt_info && vectype && aarch64_use_new_vector_costs_p ())
+ stmt_cost = aarch64_detect_vector_stmt_subtype (kind, stmt_info,
+ vectype, where,
+ stmt_cost);
+
+ /* Do any SVE-specific adjustments to the cost. */
if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
stmt_cost = aarch64_sve_adjust_stmt_cost (vinfo, kind, stmt_info,
vectype, stmt_cost);