@@ -3889,6 +3889,287 @@ vect_analyze_slp_instance (vec_info *vinfo,
return res;
}
+/* qsort comparator ordering SLP load nodes. */
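+/* Illustrative ordering: loads of the same group sort with larger lane
+ counts first and, for equal counts, unpermuted or in-order lanes before
+ permuted ones; different groups sort by the UID of their first element. */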
+
+static int
+vllp_cmp (const void *a_, const void *b_)
+{
+ const slp_tree a = *(const slp_tree *)a_;
+ const slp_tree b = *(const slp_tree *)b_;
+ stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (a)[0];
+ stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (b)[0];
+ if (STMT_VINFO_GROUPED_ACCESS (a0)
+ && STMT_VINFO_GROUPED_ACCESS (b0)
+ && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
+ {
+ /* Same group, order by the number of lanes used, more lanes first. */
+ if (SLP_TREE_LANES (a) < SLP_TREE_LANES (b))
+ return 1;
+ else if (SLP_TREE_LANES (a) > SLP_TREE_LANES (b))
+ return -1;
+ else
+ {
+ /* Try to order loads using the same lanes next to each other,
+ breaking the tie at the first lane that differs. */
+ if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
+ && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
+ return 0;
+ else if (SLP_TREE_LOAD_PERMUTATION (a).exists ()
+ && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
+ return 1;
+ else if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
+ && SLP_TREE_LOAD_PERMUTATION (b).exists ())
+ return -1;
+ else
+ {
+ for (unsigned i = 0; i < SLP_TREE_LANES (a); ++i)
+ if (SLP_TREE_LOAD_PERMUTATION (a)[i]
+ != SLP_TREE_LOAD_PERMUTATION (b)[i])
+ {
+ /* An in-order lane sorts first, matching what the case above
+ does for no permutation. */
+ if (SLP_TREE_LOAD_PERMUTATION (a)[i] == i)
+ return -1;
+ else if (SLP_TREE_LOAD_PERMUTATION (b)[i] == i)
+ return 1;
+ else if (SLP_TREE_LOAD_PERMUTATION (a)[i]
+ < SLP_TREE_LOAD_PERMUTATION (b)[i])
+ return -1;
+ else
+ return 1;
+ }
+ return 0;
+ }
+ }
+ }
+ else /* Different groups or non-groups. */
+ {
+ /* Order groups by their first element to keep them together. */
+ if (STMT_VINFO_GROUPED_ACCESS (a0))
+ a0 = DR_GROUP_FIRST_ELEMENT (a0);
+ if (STMT_VINFO_GROUPED_ACCESS (b0))
+ b0 = DR_GROUP_FIRST_ELEMENT (b0);
+ if (a0 == b0)
+ return 0;
+ /* Otherwise break the tie using the stmt UID. */
+ else if (gimple_uid (STMT_VINFO_STMT (a0))
+ < gimple_uid (STMT_VINFO_STMT (b0)))
+ return -1;
+ else
+ {
+ gcc_assert (gimple_uid (STMT_VINFO_STMT (a0))
+ != gimple_uid (STMT_VINFO_STMT (b0)));
+ return 1;
+ }
+ }
+}
+
+/* Process the set of LOADS that are all from the same dataref group. */
+
+static void
+vect_lower_load_permutations (loop_vec_info loop_vinfo,
+ scalar_stmts_to_slp_tree_map_t *bst_map,
+ const array_slice<slp_tree> &loads)
+{
+ /* At this point we want to lower without a fixed VF or vector
+ size in mind, which means we cannot actually compute whether we
+ need three or more vectors for a load permutation yet. So always
+ lower. */
+ stmt_vec_info first
+ = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (loads[0])[0]);
+
+ /* ??? In principle we have to consider a gap up to the next full
+ vector, but we'd have to actually represent a scalar stmt for the
+ gap's value, so delay handling this. The same is true for gaps
+ in between loads, which the positions in the load permutation
+ represent. It's probably not worth trying an intermediate packing
+ to vectors without gaps even if that might handle some more cases.
+ Instead get the gap case correct in some way. */
+ unsigned group_lanes = 0;
+ for (stmt_vec_info s = first; s; s = DR_GROUP_NEXT_ELEMENT (s))
+ {
+ if ((s == first && DR_GROUP_GAP (s) != 0)
+ || (s != first && DR_GROUP_GAP (s) != 1))
+ return;
+ group_lanes++;
+ }
+ /* Only a power-of-two number of lanes matches interleaving. */
+ if (exact_log2 (group_lanes) == -1)
+ return;
+
+ for (slp_tree load : loads)
+ {
+ /* Leave masked or gather loads alone for now. */
+ if (!SLP_TREE_CHILDREN (load).is_empty ())
+ continue;
+
+ /* We only need to lower loads of fewer than half of the group's
+ lanes, duplicate lanes included. */
+ if (SLP_TREE_LANES (load) >= group_lanes / 2)
+ continue;
+
+ /* Lower by reducing the group to half its size using an
+ interleaving scheme. For this try to compute whether all
+ elements needed for this load are in even or odd elements of
+ an even/odd decomposition with N consecutive elements.
+ Thus { e, e, o, o, e, e, o, o } would be an even/odd decomposition
+ with N == 2. */
+ unsigned even = (1 << ceil_log2 (DR_GROUP_SIZE (first))) - 1;
+ unsigned odd = even;
+ for (unsigned l : SLP_TREE_LOAD_PERMUTATION (load))
+ {
+ even &= ~l;
+ odd &= l;
+ }
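+ /* E.g. for a group of 8 lanes and a load permutation { 1, 5 } this
+ computes even == 2 (bit 1 is clear in all used lanes, so the pairs
+ starting at 0 and 4 cover them) and odd == 1 (bit 0 is set in all
+ used lanes). */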
+ /* Give up when this doesn't match up with an interleaving scheme. */
+ if (!even && !odd)
+ continue;
+
+ /* First build (and possibly re-use) a load node for the
+ unpermuted group. */
+ vec<stmt_vec_info> stmts;
+ stmts.create (group_lanes);
+ for (stmt_vec_info s = first; s; s = DR_GROUP_NEXT_ELEMENT (s))
+ stmts.quick_push (s);
+ poly_uint64 max_nunits;
+ bool *matches = XALLOCAVEC (bool, group_lanes);
+ unsigned limit = 1;
+ unsigned tree_size = 0;
+ slp_tree l0 = vect_build_slp_tree (loop_vinfo, stmts,
+ group_lanes,
+ &max_nunits, matches, &limit,
+ &tree_size, bst_map);
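+ /* Via bst_map this should CSE to a single unpermuted load node
+ shared by all loads of the same group. */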
+
+ /* Build the permute to get the original load permutation order. */
+ lane_permutation_t final_perm;
+ final_perm.create (SLP_TREE_LANES (load));
+ for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
+ final_perm.quick_push
+ (std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
+
+ /* Now build an even or odd extraction from the unpermuted load. */
+ lane_permutation_t perm;
+ perm.create (group_lanes / 2);
+ unsigned level;
+ if (even
+ && ((level = 1 << ctz_hwi (even)), true)
+ && group_lanes % (2 * level) == 0)
+ {
+ /* { 0, 1, ..., 4, 5, ... } */
+ for (unsigned i = 0; i < group_lanes / 2 / level; ++i)
+ for (unsigned j = 0; j < level; ++j)
+ perm.quick_push (std::make_pair (0, 2 * i * level + j));
+ }
+ else if (odd)
+ {
+ /* { ..., 2, 3, ..., 6, 7 } */
+ level = 1 << ctz_hwi (odd);
+ gcc_assert (group_lanes % (2 * level) == 0);
+ for (unsigned i = 0; i < group_lanes / 2 / level; ++i)
+ for (unsigned j = 0; j < level; ++j)
+ perm.quick_push (std::make_pair (0, (2 * i + 1) * level + j));
+ }
+ else
+ gcc_unreachable ();
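+ /* For the { 1, 5 } example above the even scheme applies with
+ level == 2 and yields perm = { 0, 1, 4, 5 }. */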
+
+ /* And update final_perm. */
+ for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
+ {
+ unsigned l = final_perm[i].second;
+ unsigned j;
+ for (j = 0; j < perm.length (); ++j)
+ if (perm[j].second == l)
+ {
+ final_perm[i].second = j;
+ break;
+ }
+ gcc_assert (j < perm.length ());
+ }
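+ /* This turns the example's final_perm from lanes { 1, 5 } into
+ positions { 1, 3 } within the extraction { 0, 1, 4, 5 }. */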
+
+ /* And create scalar stmts. */
+ vec<stmt_vec_info> perm_stmts;
+ perm_stmts.create (perm.length ());
+ for (unsigned i = 0; i < perm.length (); ++i)
+ perm_stmts.quick_push (SLP_TREE_SCALAR_STMTS (l0)[perm[i].second]);
+
+ slp_tree p = vect_create_new_slp_node (1, VEC_PERM_EXPR);
+ SLP_TREE_CHILDREN (p).quick_push (l0);
+ SLP_TREE_LANE_PERMUTATION (p) = perm;
+ SLP_TREE_VECTYPE (p) = SLP_TREE_VECTYPE (load);
+ SLP_TREE_LANES (p) = perm.length ();
+ SLP_TREE_REPRESENTATIVE (p) = SLP_TREE_REPRESENTATIVE (load);
+ /* ??? As we have scalar stmts for this intermediate permute we
+ could CSE it via bst_map but we do not want to pick up
+ another SLP node with a load permutation. We instead should
+ have a "local" CSE map here. */
+ SLP_TREE_SCALAR_STMTS (p) = perm_stmts;
+ /* ??? We need to iterate if group_lanes / 2 is still too large. */
+ /* ??? Ideally pick the best even/odd scheme usable for
+ most of the loads. -> do a multi-step scheme? */
+
+ /* And finally from the group-reducing permute node create the
+ permute to shuffle the lanes into the original load-permutation
+ order. We replace the original load node with this. */
+ SLP_TREE_CODE (load) = VEC_PERM_EXPR;
+ SLP_TREE_LOAD_PERMUTATION (load).release ();
+ SLP_TREE_LANE_PERMUTATION (load) = final_perm;
+ SLP_TREE_CHILDREN (load).create (1);
+ SLP_TREE_CHILDREN (load).quick_push (p);
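+ /* The original load node now computes
+ VEC_PERM <final_perm> (p), p = VEC_PERM <perm> (l0)
+ with l0 the unpermuted load of the whole group. */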
+ }
+}
+
+/* Transform SLP loads in the SLP graph created by SLP discovery:
+ group loads from the same dataref group and lower load permutations
+ that are unlikely to be supported into a series of permutes.
+ In the degenerate case of having only single-lane SLP instances
+ this should result in a series of permute nodes emulating an
+ interleaving scheme. */
+
+static void
+vect_lower_load_permutations (loop_vec_info loop_vinfo,
+ scalar_stmts_to_slp_tree_map_t *bst_map)
+{
+ /* Gather and sort loads across all instances. */
+ hash_set<slp_tree> visited;
+ auto_vec<slp_tree> loads;
+ for (auto inst : loop_vinfo->slp_instances)
+ vect_gather_slp_loads (loads, SLP_INSTANCE_TREE (inst), visited);
+ if (loads.is_empty ())
+ return;
+ loads.qsort (vllp_cmp);
+
+ /* Now process each dataref group separately. */
+ unsigned firsti = 0;
+ for (unsigned i = 1; i < loads.length (); ++i)
+ {
+ slp_tree first = loads[firsti];
+ slp_tree next = loads[i];
+ stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (first)[0];
+ stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (next)[0];
+ if (STMT_VINFO_GROUPED_ACCESS (a0)
+ && STMT_VINFO_GROUPED_ACCESS (b0)
+ && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
+ continue;
+ /* Just one SLP load of a possible group, leave those alone. */
+ if (i == firsti + 1)
+ {
+ firsti = i;
+ continue;
+ }
+ /* Now we have multiple SLP loads of the same group from
+ firsti to i - 1. */
+ vect_lower_load_permutations (loop_vinfo, bst_map,
+ make_array_slice (&loads[firsti],
+ i - firsti));
+ firsti = i;
+ }
+ if (firsti < loads.length () - 1)
+ vect_lower_load_permutations (loop_vinfo, bst_map,
+ make_array_slice (&loads[firsti],
+ loads.length () - firsti));
+}
+
/* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
trees of packed scalar stmts if SLP is possible. */
@@ -4030,6 +4311,12 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
}
}
+ /* When we end up with load permutations that we cannot possibly handle,
+ like those requiring three vector inputs, lower them using
+ interleaving-like schemes. */
+ if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
+ vect_lower_load_permutations (loop_vinfo, bst_map);
+
hash_set<slp_tree> visited_patterns;
slp_tree_to_load_perm_map_t perm_cache;
slp_compat_nodes_map_t compat_cache;