@@ -50,4 +50,6 @@ AARCH64_EXTRA_TUNING_OPTION ("cse_sve_vl_constants", CSE_SVE_VL_CONSTANTS)
AARCH64_EXTRA_TUNING_OPTION ("use_new_vector_costs", USE_NEW_VECTOR_COSTS)
+AARCH64_EXTRA_TUNING_OPTION ("matched_vector_throughput", MATCHED_VECTOR_THROUGHPUT)
+
#undef AARCH64_EXTRA_TUNING_OPTION
@@ -1732,7 +1732,8 @@ static const struct tune_params neoversev1_tunings =
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
- | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS), /* tune_flags. */
+ | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
+ | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
&generic_prefetch_tune
};
@@ -2539,6 +2540,14 @@ aarch64_bit_representation (rtx x)
return x;
}
+/* Return an estimate for the number of quadwords in an SVE vector. This is
+ equivalent to the number of Advanced SIMD vectors in an SVE vector. */
+static unsigned int
+aarch64_estimated_sve_vq ()
+{
+ return estimated_poly_value (BITS_PER_SVE_VECTOR) / 128;
+}
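+/* For example (illustrative note, not part of the original patch): if the
+   tuning information estimates a 256-bit SVE vector length, then
+   estimated_poly_value (BITS_PER_SVE_VECTOR) is 256 and this function
+   returns 256 / 128 = 2, i.e. one SVE vector does the work of two
+   Advanced SIMD vectors.  */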
+
/* Return true if MODE is any of the Advanced SIMD structure modes. */
static bool
aarch64_advsimd_struct_mode_p (machine_mode mode)
@@ -14117,6 +14126,39 @@ struct aarch64_vector_costs
/* The normal latency-based costs for each region (prologue, body and
epilogue), indexed by vect_cost_model_location. */
unsigned int region[3] = {};
+
+ /* True if we have performed one-time initialization based on the vec_info.
+
+ This variable exists because the vec_info is not passed to the
+ init_cost hook. We therefore have to defer initialization based on
+ it till later. */
+ bool analyzed_vinfo = false;
+
+ /* True if we're costing a vector loop, false if we're costing block-level
+ vectorization. */
+ bool is_loop = false;
+
+ /* - If VEC_FLAGS is zero then we're costing the original scalar code.
+ - If VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
+ SIMD code.
+ - If VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code. */
+ unsigned int vec_flags = 0;
+
+ /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector
+ throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE. In those
+ situations, we try to predict whether an Advanced SIMD implementation
+ of the loop could be completely unrolled and become straight-line code.
+ If so, it is generally better to use the Advanced SIMD version rather
+ than length-agnostic SVE, since the SVE loop would execute an unknown
+ number of times and so could not be completely unrolled in the same way.
+
+ If we're applying this heuristic, UNROLLED_ADVSIMD_NITERS is the
+ number of Advanced SIMD loop iterations that would be unrolled and
+ UNROLLED_ADVSIMD_STMTS estimates the total number of statements
+ in the unrolled loop. Both values are zero if we're not applying
+ the heuristic. */
+ unsigned HOST_WIDE_INT unrolled_advsimd_niters = 0;
+ unsigned HOST_WIDE_INT unrolled_advsimd_stmts = 0;
};
/* Implement TARGET_VECTORIZE_INIT_COST. */
@@ -14148,6 +14190,94 @@ aarch64_simd_vec_costs (tree vectype)
return costs->advsimd;
}
+/* Decide whether to use the unrolling heuristic described above
+ aarch64_vector_costs::unrolled_advsimd_niters, updating that
+ field if so. LOOP_VINFO describes the loop that we're vectorizing
+ and COSTS are the costs that we're calculating for it. */
+static void
+aarch64_record_potential_advsimd_unrolling (loop_vec_info loop_vinfo,
+ aarch64_vector_costs *costs)
+{
+ /* The heuristic only makes sense on targets that have the same
+ vector throughput for SVE and Advanced SIMD. */
+ if (!(aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT))
+ return;
+
+ /* We only want to apply the heuristic if LOOP_VINFO is being
+ vectorized for SVE. */
+ if (!(costs->vec_flags & VEC_ANY_SVE))
+ return;
+
+  /* Check whether it is possible in principle to use Advanced SIMD
+     instead.  An aarch64-autovec-preference of 2 means "use SVE only",
+     in which case Advanced SIMD is not an option. */
+ if (aarch64_autovec_preference == 2)
+ return;
+
+ /* We don't want to apply the heuristic to outer loops, since it's
+ harder to track two levels of unrolling. */
+ if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
+ return;
+
+ /* Only handle cases in which the number of Advanced SIMD iterations
+ would be known at compile time but the number of SVE iterations
+ would not. */
+ if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ || aarch64_sve_vg.is_constant ())
+ return;
+
+ /* Guess how many times the Advanced SIMD loop would iterate and make
+ sure that it is within the complete unrolling limit. Even if the
+ number of iterations is small enough, the number of statements might
+ not be, which is why we need to estimate the number of statements too. */
+ unsigned int estimated_vq = aarch64_estimated_sve_vq ();
+ unsigned int advsimd_vf = CEIL (vect_vf_for_cost (loop_vinfo), estimated_vq);
+ unsigned HOST_WIDE_INT unrolled_advsimd_niters
+ = LOOP_VINFO_INT_NITERS (loop_vinfo) / advsimd_vf;
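+
+  /* Worked example (illustrative, not from the original patch): if the
+     SVE loop has a vectorization factor of 8 and the estimated VQ is 2,
+     the Advanced SIMD VF would be CEIL (8, 2) = 4, so a loop with 64
+     known iterations would give unrolled_advsimd_niters = 64 / 4 = 16.  */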
+ if (unrolled_advsimd_niters > (unsigned int) param_max_completely_peel_times)
+ return;
+
+ /* Record that we're applying the heuristic and should try to estimate
+ the number of statements in the Advanced SIMD loop. */
+ costs->unrolled_advsimd_niters = unrolled_advsimd_niters;
+}
+
+/* Do one-time initialization of COSTS given that we're costing the loop
+ vectorization described by LOOP_VINFO. */
+static void
+aarch64_analyze_loop_vinfo (loop_vec_info loop_vinfo,
+ aarch64_vector_costs *costs)
+{
+ costs->is_loop = true;
+
+ /* Detect whether we're costing the scalar code or the vector code.
+ This is a bit hacky: it would be better if the vectorizer told
+ us directly.
+
+ If we're costing the vector code, record whether we're vectorizing
+ for Advanced SIMD or SVE. */
+ if (costs == LOOP_VINFO_TARGET_COST_DATA (loop_vinfo))
+ costs->vec_flags = aarch64_classify_vector_mode (loop_vinfo->vector_mode);
+ else
+ costs->vec_flags = 0;
+
+ /* Detect whether we're vectorizing for SVE and should
+ apply the unrolling heuristic described above
+ aarch64_vector_costs::unrolled_advsimd_niters. */
+ aarch64_record_potential_advsimd_unrolling (loop_vinfo, costs);
+}
+
+/* Do one-time initialization of COSTS given that we're costing the block
+ vectorization described by BB_VINFO. */
+static void
+aarch64_analyze_bb_vinfo (bb_vec_info bb_vinfo, aarch64_vector_costs *costs)
+{
+ /* Unfortunately, there's no easy way of telling whether we're costing
+ the vector code or the scalar code, so just assume that we're costing
+ the vector code. */
+ costs->vec_flags = aarch64_classify_vector_mode (bb_vinfo->vector_mode);
+}
+
/* Implement targetm.vectorize.builtin_vectorization_cost. */
static int
aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
@@ -14555,8 +14685,20 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count,
if (flag_vect_cost_model)
{
- int stmt_cost =
- aarch64_builtin_vectorization_cost (kind, vectype, misalign);
+ int stmt_cost
+ = aarch64_builtin_vectorization_cost (kind, vectype, misalign);
+
+ /* Do one-time initialization based on the vinfo. */
+ loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
+ bb_vec_info bb_vinfo = dyn_cast<bb_vec_info> (vinfo);
+ if (!costs->analyzed_vinfo && aarch64_use_new_vector_costs_p ())
+ {
+ if (loop_vinfo)
+ aarch64_analyze_loop_vinfo (loop_vinfo, costs);
+ else
+ aarch64_analyze_bb_vinfo (bb_vinfo, costs);
+ costs->analyzed_vinfo = true;
+ }
/* Try to get a more accurate cost by looking at STMT_INFO instead
of just looking at KIND. */
@@ -14571,10 +14713,21 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count,
vectype, stmt_cost);
if (stmt_info && aarch64_use_new_vector_costs_p ())
- /* Account for any extra "embedded" costs that apply additively
- to the base cost calculated above. */
- stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype,
- stmt_cost);
+ {
+ /* Account for any extra "embedded" costs that apply additively
+ to the base cost calculated above. */
+ stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype,
+ stmt_cost);
+
+ /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
+ estimate the number of statements in the unrolled Advanced SIMD
+	     loop. For simplicity, we assume that one iteration of the
+ Advanced SIMD loop would need the same number of statements
+ as one iteration of the SVE loop. */
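+	  /* For example (illustrative): a statement recorded with
+	     count == 2 when unrolled_advsimd_niters == 16 would add
+	     2 * 16 = 32 statements to the unrolled-loop estimate below.  */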
+ if (where == vect_body && costs->unrolled_advsimd_niters)
+ costs->unrolled_advsimd_stmts
+ += count * costs->unrolled_advsimd_niters;
+ }
/* Statements in an inner loop relative to the loop being
vectorized are weighted more heavily. The value here is
@@ -14590,6 +14743,49 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count,
return retval;
}
+/* BODY_COST is the cost of a vector loop body recorded in COSTS.
+ Adjust the cost as necessary and return the new cost. */
+static unsigned int
+aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost)
+{
+ unsigned int orig_body_cost = body_cost;
+
+ if (costs->unrolled_advsimd_stmts)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location, "Number of insns in"
+			 " unrolled Advanced SIMD loop = "
+			 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
+ costs->unrolled_advsimd_stmts);
+
+ /* Apply the Advanced SIMD vs. SVE unrolling heuristic described above
+ aarch64_vector_costs::unrolled_advsimd_niters.
+
+ The balance here is tricky. On the one hand, we can't be sure whether
+ the code is vectorizable with Advanced SIMD or not. However, even if
+ it isn't vectorizable with Advanced SIMD, there's a possibility that
+ the scalar code could also be unrolled. Some of the code might then
+ benefit from SLP, or from using LDP and STP. We therefore apply
+ the heuristic regardless of can_use_advsimd_p. */
+      if (costs->unrolled_advsimd_stmts
+	  <= (unsigned int) param_max_completely_peeled_insns)
+ {
+ unsigned int estimated_vq = aarch64_estimated_sve_vq ();
+ unsigned int min_cost = (orig_body_cost * estimated_vq) + 1;
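+	  /* Worked example (illustrative, not from the original patch):
+	     with orig_body_cost == 100 and estimated_vq == 2, min_cost is
+	     100 * 2 + 1 = 201.  One SVE iteration does the work of two
+	     Advanced SIMD iterations here, so raising the SVE body cost
+	     just above that point makes the fully-unrollable Advanced SIMD
+	     version look at least as cheap per scalar iteration.  */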
+ if (body_cost < min_cost)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Increasing body cost to %d to account for"
+ " unrolling\n", min_cost);
+ body_cost = min_cost;
+ }
+ }
+ }
+
+ return body_cost;
+}
+
/* Implement TARGET_VECTORIZE_FINISH_COST. */
static void
aarch64_finish_cost (void *data, unsigned *prologue_cost,
@@ -14599,6 +14795,11 @@ aarch64_finish_cost (void *data, unsigned *prologue_cost,
*prologue_cost = costs->region[vect_prologue];
*body_cost = costs->region[vect_body];
*epilogue_cost = costs->region[vect_epilogue];
+
+ if (costs->is_loop
+ && costs->vec_flags
+ && aarch64_use_new_vector_costs_p ())
+ *body_cost = aarch64_adjust_body_cost (costs, *body_cost);
}
/* Implement TARGET_VECTORIZE_DESTROY_COST_DATA. */