@@ -202,6 +202,13 @@ struct simd_vec_cost
specially below. */
const int fp_stmt_cost;
+ /* Per-vector cost of permuting vectors after an LD2, LD3 or LD4,
+ as well as the per-vector cost of permuting vectors before
+ an ST2, ST3 or ST4. */
+ const int ld2_st2_permute_cost;
+ const int ld3_st3_permute_cost;
+ const int ld4_st4_permute_cost;
+
/* Cost of a permute operation. */
const int permute_cost;
@@ -590,6 +590,9 @@ static const advsimd_vec_cost generic_advsimd_vector_cost =
{
1, /* int_stmt_cost */
1, /* fp_stmt_cost */
+ 0, /* ld2_st2_permute_cost */
+ 0, /* ld3_st3_permute_cost */
+ 0, /* ld4_st4_permute_cost */
2, /* permute_cost */
2, /* reduc_i8_cost */
2, /* reduc_i16_cost */
@@ -612,6 +615,9 @@ static const sve_vec_cost generic_sve_vector_cost =
{
1, /* int_stmt_cost */
1, /* fp_stmt_cost */
+ 0, /* ld2_st2_permute_cost */
+ 0, /* ld3_st3_permute_cost */
+ 0, /* ld4_st4_permute_cost */
2, /* permute_cost */
2, /* reduc_i8_cost */
2, /* reduc_i16_cost */
@@ -650,6 +656,9 @@ static const advsimd_vec_cost a64fx_advsimd_vector_cost =
{
2, /* int_stmt_cost */
5, /* fp_stmt_cost */
+ 0, /* ld2_st2_permute_cost */
+ 0, /* ld3_st3_permute_cost */
+ 0, /* ld4_st4_permute_cost */
3, /* permute_cost */
13, /* reduc_i8_cost */
13, /* reduc_i16_cost */
@@ -671,6 +680,9 @@ static const sve_vec_cost a64fx_sve_vector_cost =
{
2, /* int_stmt_cost */
5, /* fp_stmt_cost */
+ 0, /* ld2_st2_permute_cost */
+ 0, /* ld3_st3_permute_cost */
+ 0, /* ld4_st4_permute_cost */
3, /* permute_cost */
13, /* reduc_i8_cost */
13, /* reduc_i16_cost */
@@ -708,6 +720,9 @@ static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
{
1, /* int_stmt_cost */
3, /* fp_stmt_cost */
+ 0, /* ld2_st2_permute_cost */
+ 0, /* ld3_st3_permute_cost */
+ 0, /* ld4_st4_permute_cost */
2, /* permute_cost */
1, /* reduc_i8_cost */
1, /* reduc_i16_cost */
@@ -742,6 +757,9 @@ static const advsimd_vec_cost thunderx_advsimd_vector_cost =
{
4, /* int_stmt_cost */
1, /* fp_stmt_cost */
+ 0, /* ld2_st2_permute_cost */
+ 0, /* ld3_st3_permute_cost */
+ 0, /* ld4_st4_permute_cost */
4, /* permute_cost */
2, /* reduc_i8_cost */
2, /* reduc_i16_cost */
@@ -775,6 +793,9 @@ static const advsimd_vec_cost tsv110_advsimd_vector_cost =
{
2, /* int_stmt_cost */
2, /* fp_stmt_cost */
+ 0, /* ld2_st2_permute_cost */
+ 0, /* ld3_st3_permute_cost */
+ 0, /* ld4_st4_permute_cost */
2, /* permute_cost */
3, /* reduc_i8_cost */
3, /* reduc_i16_cost */
@@ -807,6 +828,9 @@ static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
{
2, /* int_stmt_cost */
2, /* fp_stmt_cost */
+ 0, /* ld2_st2_permute_cost */
+ 0, /* ld3_st3_permute_cost */
+ 0, /* ld4_st4_permute_cost */
3, /* permute_cost */
8, /* reduc_i8_cost */
8, /* reduc_i16_cost */
@@ -840,6 +864,9 @@ static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
{
3, /* int_stmt_cost */
3, /* fp_stmt_cost */
+ 0, /* ld2_st2_permute_cost */
+ 0, /* ld3_st3_permute_cost */
+ 0, /* ld4_st4_permute_cost */
3, /* permute_cost */
3, /* reduc_i8_cost */
3, /* reduc_i16_cost */
@@ -872,6 +899,9 @@ static const advsimd_vec_cost xgene1_advsimd_vector_cost =
{
2, /* int_stmt_cost */
2, /* fp_stmt_cost */
+ 0, /* ld2_st2_permute_cost */
+ 0, /* ld3_st3_permute_cost */
+ 0, /* ld4_st4_permute_cost */
2, /* permute_cost */
4, /* reduc_i8_cost */
4, /* reduc_i16_cost */
@@ -905,6 +935,9 @@ static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
{
4, /* int_stmt_cost */
5, /* fp_stmt_cost */
+ 0, /* ld2_st2_permute_cost */
+ 0, /* ld3_st3_permute_cost */
+ 0, /* ld4_st4_permute_cost */
10, /* permute_cost */
6, /* reduc_i8_cost */
6, /* reduc_i16_cost */
@@ -938,6 +971,9 @@ static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
{
5, /* int_stmt_cost */
5, /* fp_stmt_cost */
+ 0, /* ld2_st2_permute_cost */
+ 0, /* ld3_st3_permute_cost */
+ 0, /* ld4_st4_permute_cost */
10, /* permute_cost */
5, /* reduc_i8_cost */
5, /* reduc_i16_cost */
@@ -14086,6 +14122,26 @@ aarch64_reduc_type (vec_info *vinfo, stmt_vec_info stmt_info)
return -1;
}
+/* Check whether an access of kind KIND for STMT_INFO represents one
+   vector of an LD[234] or ST[234] operation.  Return the total number of
+   vectors (2, 3 or 4) if so, otherwise return 0.  */
+static int
+aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info)
+{
+ if ((kind == vector_load
+ || kind == unaligned_load
+ || kind == vector_store
+ || kind == unaligned_store)
+ && STMT_VINFO_DATA_REF (stmt_info))
+ {
+ stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+ if (stmt_info
+ && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
+ return DR_GROUP_SIZE (stmt_info);
+ }
+ return 0;
+}
+
/* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
vectors would produce a series of LDP or STP operations. KIND is the
kind of statement that STMT_INFO represents. */
@@ -14320,6 +14376,38 @@ aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
return stmt_cost;
}
+/* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND
+ and which when vectorized would operate on vector type VECTYPE. Add the
+ cost of any embedded operations. */
+static unsigned int
+aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
+ tree vectype, unsigned int stmt_cost)
+{
+ if (vectype)
+ {
+ const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
+
+ /* Detect cases in which a vector load or store represents an
+ LD[234] or ST[234] instruction. */
+ switch (aarch64_ld234_st234_vectors (kind, stmt_info))
+ {
+ case 2:
+ stmt_cost += simd_costs->ld2_st2_permute_cost;
+ break;
+
+ case 3:
+ stmt_cost += simd_costs->ld3_st3_permute_cost;
+ break;
+
+ case 4:
+ stmt_cost += simd_costs->ld4_st4_permute_cost;
+ break;
+ }
+ }
+
+ return stmt_cost;
+}
+
/* Implement targetm.vectorize.add_stmt_cost. */
static unsigned
aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count,
@@ -14347,6 +14435,12 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count,
stmt_cost = aarch64_sve_adjust_stmt_cost (vinfo, kind, stmt_info,
vectype, stmt_cost);
+ if (stmt_info && aarch64_use_new_vector_costs_p ())
+ /* Account for any extra "embedded" costs that apply additively
+ to the base cost calculated above. */
+ stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype,
+ stmt_cost);
+
/* Statements in an inner loop relative to the loop being
vectorized are weighted more heavily. The value here is
arbitrary and could potentially be improved with analysis. */