new file mode 100644
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_int_mult } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
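+/* foo is an interleaved (even/odd) store fed from two contiguous loads,
+   bar an interleaved load feeding a reduction and baz an interleaved
+   load combined with an interleaved store.  All three loops should be
+   SLP vectorized, using store-lanes and/or load-lanes for the
+   interleaved accesses on targets that support them.  */
+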
+void foo (int * __restrict a, int *b, int *c)
+{
+ for (int i = 0; i < 1024; ++i)
+ {
+ a[2*i] = b[i] + 7;
+ a[2*i+1] = c[i] * 3;
+ }
+}
+
+int bar (int *b)
+{
+ int res = 0;
+ for (int i = 0; i < 1024; ++i)
+ {
+ res += b[2*i] + 7;
+ res += b[2*i+1] * 3;
+ }
+ return res;
+}
+
+void baz (int * __restrict a, int *b)
+{
+ for (int i = 0; i < 1024; ++i)
+ {
+ a[2*i] = b[2*i] + 7;
+ a[2*i+1] = b[2*i+1] * 3;
+ }
+}
+
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" } } */
+/* { dg-final { scan-tree-dump-times "LOAD_LANES" 2 "optimized" { target vect_load_lanes } } } */
+/* { dg-final { scan-tree-dump-times "STORE_LANES" 2 "optimized" { target vect_load_lanes } } } */
@@ -2957,82 +2957,6 @@ start_over:
"unsupported SLP instances\n");
goto again;
}
-
- /* Check whether any load in ALL SLP instances is possibly permuted. */
- slp_tree load_node, slp_root;
- unsigned i, x;
- slp_instance instance;
- bool can_use_lanes = true;
- FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
- {
- slp_root = SLP_INSTANCE_TREE (instance);
- int group_size = SLP_TREE_LANES (slp_root);
- tree vectype = SLP_TREE_VECTYPE (slp_root);
- bool loads_permuted = false;
- FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
- {
- if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
- continue;
- unsigned j;
- stmt_vec_info load_info;
- FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
- if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
- {
- loads_permuted = true;
- break;
- }
- }
-
- /* If the loads and stores can be handled with load/store-lane
- instructions record it and move on to the next instance. */
- if (loads_permuted
- && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
- && vect_store_lanes_supported (vectype, group_size, false)
- != IFN_LAST)
- {
- FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
- if (STMT_VINFO_GROUPED_ACCESS
- (SLP_TREE_REPRESENTATIVE (load_node)))
- {
- stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
- (SLP_TREE_REPRESENTATIVE (load_node));
- /* Use SLP for strided accesses (or if we can't
- load-lanes). */
- if (STMT_VINFO_STRIDED_P (stmt_vinfo)
- || vect_load_lanes_supported
- (STMT_VINFO_VECTYPE (stmt_vinfo),
- DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
- break;
- }
-
- can_use_lanes
- = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
-
- if (can_use_lanes && dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, vect_location,
- "SLP instance %p can use load/store-lanes\n",
- (void *) instance);
- }
- else
- {
- can_use_lanes = false;
- break;
- }
- }
-
- /* If all SLP instances can use load/store-lanes abort SLP and try again
- with SLP disabled. */
- if (can_use_lanes)
- {
- ok = opt_result::failure_at (vect_location,
- "Built SLP cancelled: can use "
- "load/store-lanes\n");
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "Built SLP cancelled: all SLP instances support "
- "load/store-lanes\n");
- goto again;
- }
}
/* Dissolve SLP-only groups. */
@@ -120,6 +120,7 @@ _slp_tree::_slp_tree ()
SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
SLP_TREE_CODE (this) = ERROR_MARK;
+ this->ldst_lanes = false;
SLP_TREE_VECTYPE (this) = NULL_TREE;
SLP_TREE_REPRESENTATIVE (this) = NULL;
SLP_TREE_REF_COUNT (this) = 1;
@@ -3600,10 +3601,24 @@ vect_build_slp_instance (vec_info *vinfo,
/* For loop vectorization split the RHS into arbitrary pieces of
size >= 1. */
else if (is_a <loop_vec_info> (vinfo)
- && (i > 0 && i < group_size)
- && !vect_slp_prefer_store_lanes_p (vinfo,
- stmt_info, group_size, i))
- {
+ && (i > 0 && i < group_size))
+ {
+ /* Some targets cannot do even/odd interleaving schemes and thus
+ absolutely need load/store-lanes.  For now force single-lane SLP
+ for them - they would be happy with uniform power-of-two lane
+ counts (depending on the element size), but even if we could use
+ 'i' as an indicator we would need to backtrack when later lanes
+ fail to be discovered with the same granularity.  We cannot turn
+ .MASK_STORE or scatter stores into store-lanes.  */
+ bool want_store_lanes
+ = (! is_a <gcall *> (stmt_info->stmt)
+ && ! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
+ && vect_slp_prefer_store_lanes_p (vinfo, stmt_info,
+ group_size, 1));
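+ /* Forcing the split position to 1 peels off single-lane pieces so
+ each lane of the store group ends up in its own RHS node feeding
+ the store-lanes store node built below.  */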
+ if (want_store_lanes)
+ i = 1;
+
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"Splitting SLP group at stmt %u\n", i);
@@ -3637,7 +3652,10 @@ vect_build_slp_instance (vec_info *vinfo,
(max_nunits, end - start));
rhs_nodes.safe_push (node);
start = end;
- end = group_size;
+ if (want_store_lanes)
+ end = start + 1;
+ else
+ end = group_size;
}
else
{
@@ -3676,6 +3694,18 @@ vect_build_slp_instance (vec_info *vinfo,
SLP_TREE_CHILDREN
(rhs_nodes[0]).length ());
SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
+ if (want_store_lanes)
+ {
+ /* For store-lanes feed the store node with all RHS nodes
+ in order. We cannot handle .MASK_STORE here. */
+ gcc_assert (SLP_TREE_CHILDREN (rhs_nodes[0]).length () == 1);
+ node->ldst_lanes = true;
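+ /* Child J then provides the vector defs for member J of each
+ store-lanes store.  */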
+ SLP_TREE_CHILDREN (node).reserve_exact (rhs_nodes.length ());
+ for (unsigned j = 0; j < rhs_nodes.length (); ++j)
+ SLP_TREE_CHILDREN (node)
+ .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
+ }
+ else
for (unsigned l = 0;
l < SLP_TREE_CHILDREN (rhs_nodes[0]).length (); ++l)
{
@@ -4057,6 +4087,42 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
if (exact_log2 (group_lanes) == -1 && group_lanes != 3)
return;
+ /* Check whether all load permutations can be implemented with a
+ suitably large element load-lanes operation.  */
+ unsigned ld_lanes_lanes = SLP_TREE_LANES (loads[0]);
+ if (exact_log2 (ld_lanes_lanes) == -1
+ /* ??? For now only support the single-lane case as there is
+ missing support on the store-lane side and code generation
+ isn't up to the task yet. */
+ || ld_lanes_lanes != 1
+ || vect_load_lanes_supported (SLP_TREE_VECTYPE (loads[0]),
+ group_lanes / ld_lanes_lanes,
+ false) == IFN_LAST)
+ ld_lanes_lanes = 0;
+ else
+ /* Verify the loads access the same number of lanes aligned to
+ ld_lanes_lanes. */
+ for (slp_tree load : loads)
+ {
+ if (SLP_TREE_LANES (load) != ld_lanes_lanes)
+ {
+ ld_lanes_lanes = 0;
+ break;
+ }
+ unsigned first = SLP_TREE_LOAD_PERMUTATION (load)[0];
+ if (first % ld_lanes_lanes != 0)
+ {
+ ld_lanes_lanes = 0;
+ break;
+ }
+ for (unsigned i = 1; i < SLP_TREE_LANES (load); ++i)
+ if (SLP_TREE_LOAD_PERMUTATION (load)[i] != first + i)
+ {
+ ld_lanes_lanes = 0;
+ break;
+ }
+ }
+
for (slp_tree load : loads)
{
/* Leave masked or gather loads alone for now. */
@@ -4071,7 +4137,8 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
with a non-1:1 load permutation around instead of canonicalizing
those into a load and a permute node. Removing this early
check would do such canonicalization. */
- if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
+ if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2
+ && ld_lanes_lanes == 0)
continue;
/* First build (and possibly re-use) a load node for the
@@ -4104,6 +4171,12 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
final_perm.quick_push
(std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
+ if (ld_lanes_lanes != 0)
+ {
+ l0->ldst_lanes = true;
+ load->ldst_lanes = true;
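+ /* L0 is then code generated as a load-lanes; the lane permute
+ built from FINAL_PERM below merely forwards the vector defs
+ of the desired member (see vectorizable_slp_permutation_1).  */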
+ }
+ else
while (1)
{
unsigned group_lanes = SLP_TREE_LANES (l0);
@@ -9758,6 +9831,28 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
gcc_assert (perm.length () == SLP_TREE_LANES (node));
+ /* Load-lanes permute.  This permute only acts as a forwarder to
+ select the correct vector def of the load-lanes load, which has
+ the permuted vectors in its vector defs like
+ { v0, w0, r0, v1, w1, r1 ... } for a ld3.  */
+ if (node->ldst_lanes)
+ {
+ gcc_assert (children.length () == 1);
+ if (!gsi)
+ /* This is a trivial op always supported. */
+ return 1;
+ slp_tree child = children[0];
+ unsigned vec_idx = (SLP_TREE_LANE_PERMUTATION (node)[0].second
+ / SLP_TREE_LANES (node));
+ unsigned vec_num = SLP_TREE_LANES (child) / SLP_TREE_LANES (node);
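+ /* The vector defs of the child are grouped per load-lanes
+ instruction; select the VEC_IDX'th def out of each group of
+ VEC_NUM defs.  */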
+ for (unsigned i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
+ {
+ tree def = SLP_TREE_VEC_DEFS (child)[i * vec_num + vec_idx];
+ node->push_vec_def (def);
+ }
+ return 1;
+ }
+
/* REPEATING_P is true if every output vector is guaranteed to use the
same permute vector. We can handle that case for both variable-length
and constant-length vectors, but we only handle other cases for
@@ -1508,7 +1508,8 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
unsigned int nvectors;
if (slp_node)
- nvectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
+ /* ??? Incorrect for multi-lane load/store-lanes members. */
+ nvectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) / group_size;
else
nvectors = vect_get_num_copies (loop_vinfo, vectype);
@@ -2069,6 +2070,14 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
is irrelevant for them. */
*alignment_support_scheme = dr_unaligned_supported;
}
+ /* Try using LOAD/STORE_LANES. */
+ else if (slp_node->ldst_lanes
+ && (*lanes_ifn
+ = (vls_type == VLS_LOAD
+ ? vect_load_lanes_supported (vectype, group_size, masked_p)
+ : vect_store_lanes_supported (vectype, group_size,
+ masked_p))) != IFN_LAST)
+ *memory_access_type = VMAT_LOAD_STORE_LANES;
else
*memory_access_type = VMAT_CONTIGUOUS;
@@ -8705,7 +8714,7 @@ vectorizable_store (vec_info *vinfo,
else
{
if (memory_access_type == VMAT_LOAD_STORE_LANES)
- aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
+ aggr_type = build_array_type_nelts (elem_type, group_size * nunits);
else
aggr_type = vectype;
bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info, aggr_type,
@@ -8762,11 +8771,12 @@ vectorizable_store (vec_info *vinfo,
if (memory_access_type == VMAT_LOAD_STORE_LANES)
{
- gcc_assert (!slp && grouped_store);
unsigned inside_cost = 0, prologue_cost = 0;
/* For costing some adjacent vector stores, we'd like to cost with
the total number of them once instead of cost each one by one. */
unsigned int n_adjacent_stores = 0;
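+ /* Each copy emits one store-lanes storing GROUP_SIZE vectors, so
+ with SLP the number of copies is the store node's number of
+ vector stmts divided by the group size.  */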
+ if (slp)
+ ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) / group_size;
for (j = 0; j < ncopies; j++)
{
gimple *new_stmt;
@@ -8784,7 +8794,7 @@ vectorizable_store (vec_info *vinfo,
op = vect_get_store_rhs (next_stmt_info);
if (costing_p)
update_prologue_cost (&prologue_cost, op);
- else
+ else if (!slp)
{
vect_get_vec_defs_for_operand (vinfo, next_stmt_info,
ncopies, op,
@@ -8799,15 +8809,15 @@ vectorizable_store (vec_info *vinfo,
{
if (mask)
{
- vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies,
- mask, &vec_masks,
- mask_vectype);
+ if (slp_node)
+ vect_get_slp_defs (mask_node, &vec_masks);
+ else
+ vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies,
+ mask, &vec_masks,
+ mask_vectype);
vec_mask = vec_masks[0];
}
- /* We should have catched mismatched types earlier. */
- gcc_assert (
- useless_type_conversion_p (vectype, TREE_TYPE (vec_oprnd)));
dataref_ptr
= vect_create_data_ref_ptr (vinfo, first_stmt_info,
aggr_type, NULL, offset, &dummy,
@@ -8819,10 +8829,16 @@ vectorizable_store (vec_info *vinfo,
gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
/* DR_CHAIN is then used as an input to
vect_permute_store_chain(). */
- for (i = 0; i < group_size; i++)
+ if (!slp)
{
- vec_oprnd = (*gvec_oprnds[i])[j];
- dr_chain[i] = vec_oprnd;
+ /* We should have caught mismatched types earlier. */
+ gcc_assert (
+ useless_type_conversion_p (vectype, TREE_TYPE (vec_oprnd)));
+ for (i = 0; i < group_size; i++)
+ {
+ vec_oprnd = (*gvec_oprnds[i])[j];
+ dr_chain[i] = vec_oprnd;
+ }
}
if (mask)
vec_mask = vec_masks[j];
@@ -8832,12 +8848,12 @@ vectorizable_store (vec_info *vinfo,
if (costing_p)
{
- n_adjacent_stores += vec_num;
+ n_adjacent_stores += group_size;
continue;
}
/* Get an array into which we can store the individual vectors. */
- tree vec_array = create_vector_array (vectype, vec_num);
+ tree vec_array = create_vector_array (vectype, group_size);
/* Invalidate the current contents of VEC_ARRAY. This should
become an RTL clobber too, which prevents the vector registers
@@ -8845,9 +8861,13 @@ vectorizable_store (vec_info *vinfo,
vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
/* Store the individual vectors into the array. */
- for (i = 0; i < vec_num; i++)
+ for (i = 0; i < group_size; i++)
{
- vec_oprnd = dr_chain[i];
+ if (slp)
+ vec_oprnd
+ = SLP_TREE_VEC_DEFS (SLP_TREE_CHILDREN (slp_node)[i])[j];
+ else
+ vec_oprnd = dr_chain[i];
write_vector_array (vinfo, stmt_info, gsi, vec_oprnd, vec_array,
i);
}
@@ -8917,9 +8937,10 @@ vectorizable_store (vec_info *vinfo,
/* Record that VEC_ARRAY is now dead. */
vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
- if (j == 0)
+ if (j == 0 && !slp)
*vec_stmt = new_stmt;
- STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+ if (!slp)
+ STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
}
if (costing_p)
@@ -10765,12 +10786,13 @@ vectorizable_load (vec_info *vinfo,
{
gcc_assert (alignment_support_scheme == dr_aligned
|| alignment_support_scheme == dr_unaligned_supported);
- gcc_assert (grouped_load && !slp);
unsigned int inside_cost = 0, prologue_cost = 0;
/* For costing some adjacent vector loads, we'd like to cost with
the total number of them once instead of cost each one by one. */
unsigned int n_adjacent_loads = 0;
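+ /* Each copy emits one load-lanes from which VEC_NUM vectors are
+ extracted, so with SLP derive the number of copies from the
+ node's total number of vector defs.  */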
+ if (slp_node)
+ ncopies = slp_node->vec_stmts_size / vec_num;
for (j = 0; j < ncopies; j++)
{
if (costing_p)
@@ -10884,24 +10906,31 @@ vectorizable_load (vec_info *vinfo,
gimple_call_set_nothrow (call, true);
vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
- dr_chain.create (vec_num);
+ if (!slp)
+ dr_chain.create (vec_num);
/* Extract each vector into an SSA_NAME. */
for (i = 0; i < vec_num; i++)
{
new_temp = read_vector_array (vinfo, stmt_info, gsi, scalar_dest,
vec_array, i);
- dr_chain.quick_push (new_temp);
+ if (slp)
+ slp_node->push_vec_def (new_temp);
+ else
+ dr_chain.quick_push (new_temp);
}
- /* Record the mapping between SSA_NAMEs and statements. */
- vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain);
+ if (!slp)
+ /* Record the mapping between SSA_NAMEs and statements. */
+ vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain);
/* Record that VEC_ARRAY is now dead. */
vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
- dr_chain.release ();
+ if (!slp)
+ dr_chain.release ();
- *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
+ if (!slp_node)
+ *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
}
if (costing_p)
@@ -222,6 +222,9 @@ struct _slp_tree {
unsigned int lanes;
/* The operation of this node. */
enum tree_code code;
+ /* Whether uses of this load or feeders of this store are suitable
+ for load/store-lanes. */
+ bool ldst_lanes;
int vertex;