@@ -14813,7 +14813,8 @@ private:
fractional_cost, fractional_cost,
bool, unsigned int, unsigned int *,
bool *);
- unsigned int adjust_body_cost (unsigned int);
+ unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
+ unsigned int);
/* True if we have performed one-time initialization based on the
vec_info. */
@@ -14850,22 +14851,16 @@ private:
iterate, otherwise it is zero. */
uint64_t m_num_vector_iterations = 0;
- /* Used only when vectorizing loops. Estimates the number and kind of scalar
- operations that would be needed to perform the same work as one iteration
- of the vector loop. */
- aarch64_vec_op_count m_scalar_ops;
+ /* Used only when vectorizing loops. Estimates the number and kind of
+ operations that would be needed by one iteration of the scalar
+ or vector loop. */
+ aarch64_vec_op_count m_ops;
- /* Used only when vectorizing loops. If M_VEC_FLAGS & VEC_ADVSIMD,
- this structure estimates the number and kind of operations that the
- vector loop would contain. If M_VEC_FLAGS & VEC_SVE, the structure
- estimates what the equivalent Advanced SIMD-only code would need in
- order to perform the same work as one iteration of the SVE loop. */
+ /* Used only when vectorizing loops for SVE. It estimates what the
+ equivalent Advanced SIMD-only code would need in order to perform
+ the same work as one iteration of the SVE loop. */
aarch64_vec_op_count m_advsimd_ops;
- /* Used only when vectorizing loops with SVE. It estimates the number and
- kind of operations that the SVE loop would contain. */
- aarch64_vec_op_count m_sve_ops;
-
/* Used to detect cases in which we end up costing the same load twice,
once to account for results that are actually used and once to account
for unused results. */
@@ -14875,9 +14870,10 @@ private:
aarch64_vector_costs::aarch64_vector_costs (vec_info *vinfo,
bool costing_for_scalar)
: vector_costs (vinfo, costing_for_scalar),
- m_scalar_ops (aarch64_tune_params.vec_costs->issue_info, 0),
- m_advsimd_ops (aarch64_tune_params.vec_costs->issue_info, VEC_ADVSIMD),
- m_sve_ops (aarch64_tune_params.vec_costs->issue_info, VEC_ANY_SVE)
+ m_vec_flags (costing_for_scalar ? 0
+ : aarch64_classify_vector_mode (vinfo->vector_mode)),
+ m_ops (aarch64_tune_params.vec_costs->issue_info, m_vec_flags),
+ m_advsimd_ops (aarch64_tune_params.vec_costs->issue_info, VEC_ADVSIMD)
{
}
@@ -15016,7 +15012,7 @@ aarch64_vector_costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
if (rgm->type)
num_masks += num_vectors_m1 + 1;
- m_sve_ops.pred_ops += num_masks * issue_info->sve->while_pred_ops;
+ m_ops.pred_ops += num_masks * issue_info->sve->while_pred_ops;
}
}
@@ -15550,8 +15546,8 @@ aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
/* COUNT, KIND, STMT_INFO and VECTYPE are the same as for
vector_costs::add_stmt_cost and they describe an operation in the
body of a vector loop. Record issue information relating to the vector
- operation in OPS, where OPS is one of m_scalar_ops, m_advsimd_ops
- or m_sve_ops; see the comments above those variables for details.
+ operation in OPS, where OPS is one of m_ops or m_advsimd_ops; see the
+ comments above those variables for details.
FACTOR says how many iterations of the loop described by VEC_FLAGS would be
needed to match one iteration of the vector loop in VINFO. */
@@ -15570,14 +15566,14 @@ aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
/* Calculate the minimum cycles per iteration imposed by a reduction
operation. */
- if ((kind == vector_stmt || kind == vec_to_scalar)
+ if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
&& vect_is_reduction (stmt_info))
{
unsigned int base
= aarch64_in_loop_reduction_latency (m_vinfo, stmt_info, vec_flags);
if (vect_reduc_type (m_vinfo, stmt_info) == FOLD_LEFT_REDUCTION)
{
- if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
+ if (vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
{
/* When costing an SVE FADDA, the vectorizer treats vec_to_scalar
as a single operation, whereas for Advanced SIMD it is a
@@ -15744,11 +15740,6 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
if (!m_analyzed_vinfo && aarch64_use_new_vector_costs_p ())
{
- /* If we're costing the vector code, record whether we're vectorizing
- for Advanced SIMD or SVE. */
- if (!m_costing_for_scalar)
- m_vec_flags = aarch64_classify_vector_mode (m_vinfo->vector_mode);
-
if (loop_vinfo)
analyze_loop_vinfo (loop_vinfo);
@@ -15793,31 +15784,16 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
innermost loop, also estimate the operations that would need
to be issued by all relevant implementations of the loop. */
if (loop_vinfo
- && m_vec_flags
- && where == vect_body
+ && (m_costing_for_scalar || where == vect_body)
&& (!LOOP_VINFO_LOOP (loop_vinfo)->inner || in_inner_loop_p)
- && vectype
&& stmt_cost != 0)
{
- /* Record estimates for the scalar code. */
- count_ops (count, kind, stmt_info, vectype, &m_scalar_ops,
- vect_nunits_for_cost (vectype));
-
- if (aarch64_sve_mode_p (m_vinfo->vector_mode)
- && m_sve_ops.base_issue_info ())
- {
- /* Record estimates for a possible Advanced SIMD version
- of the SVE code. */
- count_ops (count, kind, stmt_info, vectype, &m_advsimd_ops,
- aarch64_estimated_sve_vq ());
-
- /* Record estimates for the SVE code itself. */
- count_ops (count, kind, stmt_info, vectype, &m_sve_ops, 1);
- }
- else
- /* Record estimates for the Advanced SIMD code. Treat SVE like
- Advanced SIMD if the CPU has no specific SVE costs. */
- count_ops (count, kind, stmt_info, vectype, &m_advsimd_ops, 1);
+ count_ops (count, kind, stmt_info, vectype, &m_ops, 1);
+ if (aarch64_sve_mode_p (m_vinfo->vector_mode))
+ /* Record estimates for a possible Advanced SIMD version
+ of the SVE code. */
+ count_ops (count, kind, stmt_info, vectype,
+ &m_advsimd_ops, aarch64_estimated_sve_vq ());
}
/* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
@@ -15885,7 +15861,7 @@ adjust_body_cost_sve (const aarch64_vec_issue_info *issue_info,
/* Estimate the minimum number of cycles per iteration needed to issue
non-predicate operations. */
fractional_cost sve_nonpred_issue_cycles_per_iter
- = aarch64_estimate_min_cycles_per_iter (&m_sve_ops, issue_info->sve);
+ = aarch64_estimate_min_cycles_per_iter (&m_ops, issue_info->sve);
/* Estimate the minimum number of cycles per iteration needed to rename
SVE instructions.
@@ -15901,9 +15877,7 @@ adjust_body_cost_sve (const aarch64_vec_issue_info *issue_info,
??? This value is very much on the pessimistic side, but seems to work
pretty well in practice. */
sve_rename_cycles_per_iter
- = { m_sve_ops.general_ops
- + m_sve_ops.loads
- + m_sve_ops.pred_ops + 1, 5 };
+ = { m_ops.general_ops + m_ops.loads + m_ops.pred_ops + 1, 5 };
/* Combine the rename and non-predicate issue limits into a single value. */
fractional_cost sve_nonpred_cycles_per_iter
@@ -15912,7 +15886,7 @@ adjust_body_cost_sve (const aarch64_vec_issue_info *issue_info,
/* Separately estimate the minimum number of cycles per iteration needed
to issue the predicate operations. */
fractional_cost sve_pred_issue_cycles_per_iter
- = { m_sve_ops.pred_ops, issue_info->sve->pred_ops_per_cycle };
+ = { m_ops.pred_ops, issue_info->sve->pred_ops_per_cycle };
/* Calculate the overall limit on the number of cycles per iteration. */
fractional_cost sve_cycles_per_iter
@@ -15920,15 +15894,15 @@ adjust_body_cost_sve (const aarch64_vec_issue_info *issue_info,
if (dump_enabled_p ())
{
- m_sve_ops.dump ();
+ m_ops.dump ();
dump_printf_loc (MSG_NOTE, vect_location,
" estimated cycles per iteration = %f\n",
sve_cycles_per_iter.as_double ());
- if (m_sve_ops.pred_ops)
+ if (m_ops.pred_ops)
dump_printf_loc (MSG_NOTE, vect_location,
" predicate issue = %f\n",
sve_pred_issue_cycles_per_iter.as_double ());
- if (m_sve_ops.pred_ops || sve_rename_cycles_per_iter)
+ if (m_ops.pred_ops || sve_rename_cycles_per_iter)
dump_printf_loc (MSG_NOTE, vect_location,
" non-predicate issue = %f\n",
sve_nonpred_issue_cycles_per_iter.as_double ());
@@ -16008,8 +15982,13 @@ adjust_body_cost_sve (const aarch64_vec_issue_info *issue_info,
/* BODY_COST is the cost of a vector loop body. Adjust the cost as necessary
and return the new cost. */
unsigned int
-aarch64_vector_costs::adjust_body_cost (unsigned int body_cost)
+aarch64_vector_costs::
+adjust_body_cost (loop_vec_info loop_vinfo,
+ const aarch64_vector_costs *scalar_costs,
+ unsigned int body_cost)
{
+ const auto &scalar_ops = scalar_costs->m_ops;
+ unsigned int estimated_vf = vect_vf_for_cost (loop_vinfo);
unsigned int orig_body_cost = body_cost;
bool should_disparage = false;
@@ -16056,19 +16035,11 @@ aarch64_vector_costs::adjust_body_cost (unsigned int body_cost)
return body_cost;
fractional_cost scalar_cycles_per_iter
- = aarch64_estimate_min_cycles_per_iter (&m_scalar_ops,
- issue_info->scalar);
-
- fractional_cost advsimd_cycles_per_iter
- = aarch64_estimate_min_cycles_per_iter (&m_advsimd_ops,
- issue_info->advsimd);
+ = aarch64_estimate_min_cycles_per_iter (&scalar_ops, issue_info->scalar);
+ scalar_cycles_per_iter *= estimated_vf;
- bool could_use_advsimd
- = ((m_vec_flags & VEC_ADVSIMD)
- || (aarch64_autovec_preference != 2
- && (aarch64_tune_params.extra_tuning_flags
- & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT)
- && !m_saw_sve_only_op));
+ fractional_cost vector_cycles_per_iter
+ = aarch64_estimate_min_cycles_per_iter (&m_ops, m_ops.base_issue_info ());
if (dump_enabled_p ())
{
@@ -16077,32 +16048,40 @@ aarch64_vector_costs::adjust_body_cost (unsigned int body_cost)
"Vector loop iterates at most %wd times\n",
m_num_vector_iterations);
dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n");
- m_scalar_ops.dump ();
+ scalar_ops.dump ();
dump_printf_loc (MSG_NOTE, vect_location,
- " estimated cycles per iteration = %f\n",
- scalar_cycles_per_iter.as_double ());
- if (could_use_advsimd)
- {
- dump_printf_loc (MSG_NOTE, vect_location,
- "Advanced SIMD issue estimate:\n");
- m_advsimd_ops.dump ();
- dump_printf_loc (MSG_NOTE, vect_location,
- " estimated cycles per iteration = %f\n",
- advsimd_cycles_per_iter.as_double ());
- }
- else
- dump_printf_loc (MSG_NOTE, vect_location,
- "Loop could not use Advanced SIMD\n");
+ " estimated cycles per vector iteration"
+ " (for VF %d) = %f\n",
+ estimated_vf, scalar_cycles_per_iter.as_double ());
}
- fractional_cost vector_cycles_per_iter = advsimd_cycles_per_iter;
- unsigned int vector_reduction_latency = m_advsimd_ops.reduction_latency;
-
if ((m_vec_flags & VEC_ANY_SVE) && issue_info->sve)
{
+ bool could_use_advsimd
+ = (aarch64_autovec_preference != 2
+ && (aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT)
+ && !m_saw_sve_only_op);
+
+ fractional_cost advsimd_cycles_per_iter
+ = aarch64_estimate_min_cycles_per_iter (&m_advsimd_ops,
+ issue_info->advsimd);
if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
- vector_reduction_latency = m_sve_ops.reduction_latency;
+ {
+ if (could_use_advsimd)
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Advanced SIMD issue estimate:\n");
+ m_advsimd_ops.dump ();
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " estimated cycles per iteration = %f\n",
+ advsimd_cycles_per_iter.as_double ());
+ }
+ else
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Loop could not use Advanced SIMD\n");
+ dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
+ }
vector_cycles_per_iter
= adjust_body_cost_sve (issue_info, scalar_cycles_per_iter,
advsimd_cycles_per_iter, could_use_advsimd,
@@ -16123,6 +16102,18 @@ aarch64_vector_costs::adjust_body_cost (unsigned int body_cost)
&body_cost, &should_disparage);
}
}
+ else
+ {
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Vector issue estimate:\n");
+ m_ops.dump ();
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " estimated cycles per iteration = %f\n",
+ vector_cycles_per_iter.as_double ());
+ }
+ }
/* Decide whether to stick to latency-based costs or whether to try to
take issue rates into account. */
@@ -16164,8 +16155,8 @@ aarch64_vector_costs::adjust_body_cost (unsigned int body_cost)
vector code is an improvement, even if adding the other (non-loop-carried)
latencies tends to hide this saving. We therefore reduce the cost of the
vector loop body in proportion to the saving. */
- else if (m_scalar_ops.reduction_latency > vector_reduction_latency
- && m_scalar_ops.reduction_latency == scalar_cycles_per_iter
+ else if (scalar_ops.reduction_latency > m_ops.reduction_latency
+ && scalar_ops.reduction_latency == scalar_cycles_per_iter
&& scalar_cycles_per_iter > vector_cycles_per_iter
&& !should_disparage)
{
@@ -16181,13 +16172,16 @@ aarch64_vector_costs::adjust_body_cost (unsigned int body_cost)
}
void
-aarch64_vector_costs::finish_cost (const vector_costs *scalar_costs)
+aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
{
+ auto *scalar_costs
+ = static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
if (loop_vinfo
&& m_vec_flags
&& aarch64_use_new_vector_costs_p ())
- m_costs[vect_body] = adjust_body_cost (m_costs[vect_body]);
+ m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
+ m_costs[vect_body]);
vector_costs::finish_cost (scalar_costs);
}