diff mbox series

[1/3] lower SLP load permutation to interleaving

Message ID 20240830133923.3FD2F13A44@imap1.dmz-prg2.suse.org
State New

Commit Message

Richard Biener Aug. 30, 2024, 1:39 p.m. UTC
The following emulates classical interleaving for SLP load permutes
that we are unlikely to handle natively.  This is to handle cases
where interleaving (or load/store-lanes) is the optimal choice for
vectorizing even when we are doing that within SLP.  An example
would be

void foo (int * __restrict a, int * b)
{
  for (int i = 0; i < 16; ++i)
    {
      a[4*i + 0] = b[4*i + 0] * 3;
      a[4*i + 1] = b[4*i + 1] + 3;
      a[4*i + 2] = (b[4*i + 2] * 3 + 3);
      a[4*i + 3] = b[4*i + 3] * 3;
    }
}

where currently the SLP store is merging four single-lane SLP
sub-graphs but none of the loads in it can be code-generated
with V4SImode vectors and a VF of four as the permutes would need
three vectors.

The patch introduces a lowering phase after SLP discovery but
before SLP pattern recognition or permute optimization that
analyzes all loads from the same dataref group and creates an
interleaving scheme starting from an unpermuted load.

What can be handled are power-of-two group sizes and a group size of
three.  Doing the interleaving with a load-lanes like instruction is
left for a followup.
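
For a power-of-two group the lowering repeatedly halves the group
with an even/odd style extraction until the load needs at least half
of the remaining lanes and then applies one final permute.  As a
rough scalar model (the function and names below are made up for
illustration, this is not what the vectorizer emits), a single-lane
load of lane 2 (the third element) of a group of four at VF == 4
becomes:

void even_odd_model (int *b, int *z)
{
  /* b[] holds the interleaved group-of-4 data
     { x0, y0, z0, w0, x1, y1, z1, w1, ... }.  */
  int half[8];
  /* The used lane 2 sits in the even positions of an even/odd
     decomposition at level 1, so extract the even elements,
     halving the group to two lanes per scalar iteration:
     { x0, z0, x1, z1, ... }.  */
  for (int i = 0; i < 8; ++i)
    half[i] = b[2*i];
  /* The final permute selects lane 1 of each remaining pair,
     giving { z0, z1, z2, z3 }.  */
  for (int i = 0; i < 4; ++i)
    z[i] = half[2*i + 1];
}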

For a group size of three this is done by using the non-interleaving
fallback code which, at VF == 4, creates from
{ { a0, b0, c0 }, { a1, b1, c1 }, { a2, b2, c2 }, { a3, b3, c3 } }
the intermediate vectors { c0, c0, c1, c1 } and { c2, c2, c3, c3 }
to produce { c0, c1, c2, c3 }.  This turns out to be more effective
than the scheme implemented for non-SLP on SSE, only slightly worse
on AVX512 and somewhat worse on AVX2.  It seems to me that this
would extend to other non-power-of-two group sizes as well (but the
patch does not implement that).  Optimal schemes are likely difficult
to lay out in a VF-agnostic form.
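
Concretely, as a scalar model of the group-of-three case just
described (again with made-up names, not the vectorizer's output), a
single-lane load of the third element of the group becomes:

void group3_fallback_model (int *b, int *c)
{
  /* b[] holds the interleaved group-of-3 data
     { a0, b0, c0, a1, b1, c1, a2, b2, c2, a3, b3, c3 }.  */
  int inter[8];
  /* The fallback builds a two-lane intermediate node repeating the
     used lane, giving { c0, c0, c1, c1 } and { c2, c2, c3, c3 }.  */
  for (int i = 0; i < 4; ++i)
    {
      inter[2*i] = b[3*i + 2];
      inter[2*i + 1] = b[3*i + 2];
    }
  /* The final permute compacts this to { c0, c1, c2, c3 }.  */
  for (int i = 0; i < 4; ++i)
    c[i] = inter[2*i];
}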

I'll note that while the lowering assumes even/odd extract is
generally available for all vector element sizes (which is probably
a good assumption), it doesn't in any way constrain the other
permutes it generates based on target availability.  Again, that is
difficult to do in a VF-agnostic way (but at least currently the
vector type is fixed).

I'll also note that the SLP store side merges lanes in a way
producing three-vector permutes for store group-size of three, so
the testcase uses a store group-size of four.

The patch has a fallback for when there are multi-lane groups
and the resulting permutes do not fit interleaving.  Code
generation is not optimal when this triggers and might be
worse than doing single-lane group interleaving.

The patch handles gaps by representing them with NULL
entries in SLP_TREE_SCALAR_STMTS for the unpermuted load node.
The SLP discovery changes could be elided if we manually build the
load node instead.

SLP load nodes covering enough lanes to not need intermediate
permutes are retained as having a load-permutation and do not
use the single SLP load node for each dataref group.  That's
something we might want to change, making load-permutation
something purely local to SLP discovery (but then SLP discovery
could do part of the lowering).

The patch misses CSEing intermediate generated permutes and
registering them with the bst_map which is possibly required
for SLP pattern detection in some cases - this re-spin of the
patch moves the lowering after SLP pattern detection.

	* tree-vect-slp.cc (vect_build_slp_tree_1): Handle NULL stmt.
	(vect_build_slp_tree_2): Likewise.  Release load permutation
	when there's a NULL in SLP_TREE_SCALAR_STMTS and assert there's
	no actual permutation in that case.
	(vllp_cmp): New function.
	(vect_lower_load_permutations): Likewise.
	(vect_analyze_slp): Call it.

	* gcc.dg/vect/slp-11a.c: Expect SLP.
	* gcc.dg/vect/slp-12a.c: Likewise.
	* gcc.dg/vect/slp-51.c: New testcase.
	* gcc.dg/vect/slp-52.c: New testcase.
---
 gcc/testsuite/gcc.dg/vect/slp-11a.c |   2 +-
 gcc/testsuite/gcc.dg/vect/slp-12a.c |   2 +-
 gcc/testsuite/gcc.dg/vect/slp-51.c  |  17 ++
 gcc/testsuite/gcc.dg/vect/slp-52.c  |  14 ++
 gcc/tree-vect-slp.cc                | 347 +++++++++++++++++++++++++++-
 5 files changed, 378 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/slp-51.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/slp-52.c

Patch

diff --git a/gcc/testsuite/gcc.dg/vect/slp-11a.c b/gcc/testsuite/gcc.dg/vect/slp-11a.c
index fcb7cf6c7a2..2efa1796757 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-11a.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-11a.c
@@ -72,4 +72,4 @@  int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_strided8 && vect_int_mult } } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" { target { ! { vect_strided8 && vect_int_mult } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-12a.c b/gcc/testsuite/gcc.dg/vect/slp-12a.c
index 2f98dc9da0b..fedf27b69d2 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-12a.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-12a.c
@@ -80,5 +80,5 @@  int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_strided8 && vect_int_mult } } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" { target { ! { vect_strided8 && vect_int_mult } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { { vect_strided8 && {! vect_load_lanes } } && vect_int_mult } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { { vect_strided8 && {! vect_load_lanes } } && vect_int_mult } } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { ! { vect_strided8 && vect_int_mult } } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-51.c b/gcc/testsuite/gcc.dg/vect/slp-51.c
new file mode 100644
index 00000000000..91ae763be30
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-51.c
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+
+void foo (int * __restrict x, int *y)
+{
+  x = __builtin_assume_aligned (x, __BIGGEST_ALIGNMENT__);
+  y = __builtin_assume_aligned (y, __BIGGEST_ALIGNMENT__);
+  for (int i = 0; i < 1024; ++i)
+    {
+      x[4*i+0] = y[4*i+0];
+      x[4*i+1] = y[4*i+2] * 2;
+      x[4*i+2] = y[4*i+0] + 3;
+      x[4*i+3] = y[4*i+2] * 2 - 5;
+    }
+}
+
+/* Check we can handle SLP with gaps and an interleaving scheme.  */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { target { vect_int && vect_int_mult } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-52.c b/gcc/testsuite/gcc.dg/vect/slp-52.c
new file mode 100644
index 00000000000..ba49f0046e2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-52.c
@@ -0,0 +1,14 @@ 
+/* { dg-do compile } */
+
+void foo (int * __restrict x, int *y)
+{
+  for (int i = 0; i < 1024; ++i)
+    {
+      x[4*i+0] = y[3*i+0];
+      x[4*i+1] = y[3*i+1] * 2;
+      x[4*i+2] = y[3*i+2] + 3;
+      x[4*i+3] = y[3*i+2] * 2 - 5;
+    }
+}
+
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { target { vect_int && vect_int_mult } } } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 621ebf670c7..2304cdac583 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -1081,10 +1081,15 @@  vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
   stmt_vec_info stmt_info;
   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
     {
-      gimple *stmt = stmt_info->stmt;
       swap[i] = 0;
       matches[i] = false;
+      if (!stmt_info)
+	{
+	  matches[i] = true;
+	  continue;
+	}
 
+      gimple *stmt = stmt_info->stmt;
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
 
@@ -1979,10 +1984,16 @@  vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
 	  stmt_vec_info first_stmt_info
 	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
 	  bool any_permute = false;
+	  bool any_null = false;
 	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
 	    {
 	      int load_place;
-	      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
+	      if (! load_info)
+		{
+		  load_place = j;
+		  any_null = true;
+		}
+	      else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 		load_place = vect_get_place_in_interleaving_chain
 		    (load_info, first_stmt_info);
 	      else
@@ -1991,6 +2002,11 @@  vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
 	      any_permute |= load_place != j;
 	      load_permutation.quick_push (load_place);
 	    }
+	  if (any_null)
+	    {
+	      gcc_assert (!any_permute);
+	      load_permutation.release ();
+	    }
 
 	  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
 	    {
@@ -4080,6 +4096,316 @@  vect_analyze_slp_instance (vec_info *vinfo,
   return res;
 }
 
+/* qsort comparator ordering SLP load nodes.  */
+
+static int
+vllp_cmp (const void *a_, const void *b_)
+{
+  const slp_tree a = *(const slp_tree *)a_;
+  const slp_tree b = *(const slp_tree *)b_;
+  stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (a)[0];
+  stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (b)[0];
+  if (STMT_VINFO_GROUPED_ACCESS (a0)
+      && STMT_VINFO_GROUPED_ACCESS (b0)
+      && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
+    {
+      /* Same group, order after lanes used.  */
+      if (SLP_TREE_LANES (a) < SLP_TREE_LANES (b))
+	return 1;
+      else if (SLP_TREE_LANES (a) > SLP_TREE_LANES (b))
+	return -1;
+      else
+	{
+	  /* Try to order loads using the same lanes together, breaking
+	     the tie with the lane number that first differs.  */
+	  if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
+	      && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
+	    return 0;
+	  else if (SLP_TREE_LOAD_PERMUTATION (a).exists ()
+		   && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
+	    return 1;
+	  else if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
+		   && SLP_TREE_LOAD_PERMUTATION (b).exists ())
+	    return -1;
+	  else
+	    {
+	      for (unsigned i = 0; i < SLP_TREE_LANES (a); ++i)
+		if (SLP_TREE_LOAD_PERMUTATION (a)[i]
+		    != SLP_TREE_LOAD_PERMUTATION (b)[i])
+		  {
+		    /* In-order lane first, that's what the above case for
+		       no permutation does.  */
+		    if (SLP_TREE_LOAD_PERMUTATION (a)[i] == i)
+		      return -1;
+		    else if (SLP_TREE_LOAD_PERMUTATION (b)[i] == i)
+		      return 1;
+		    else if (SLP_TREE_LOAD_PERMUTATION (a)[i]
+			     < SLP_TREE_LOAD_PERMUTATION (b)[i])
+		      return -1;
+		    else
+		      return 1;
+		  }
+	      return 0;
+	    }
+	}
+    }
+  else /* Different groups or non-groups.  */
+    {
+      /* Order groups as their first element to keep them together.  */
+      if (STMT_VINFO_GROUPED_ACCESS (a0))
+	a0 = DR_GROUP_FIRST_ELEMENT (a0);
+      if (STMT_VINFO_GROUPED_ACCESS (b0))
+	b0 = DR_GROUP_FIRST_ELEMENT (b0);
+      if (a0 == b0)
+	return 0;
+      /* Tie using UID.  */
+      else if (gimple_uid (STMT_VINFO_STMT (a0))
+	       < gimple_uid (STMT_VINFO_STMT (b0)))
+	return -1;
+      else
+	{
+	  gcc_assert (gimple_uid (STMT_VINFO_STMT (a0))
+		      != gimple_uid (STMT_VINFO_STMT (b0)));
+	  return 1;
+	}
+    }
+}
+
+/* Process the set of LOADS that are all from the same dataref group.  */
+
+static void
+vect_lower_load_permutations (loop_vec_info loop_vinfo,
+			      scalar_stmts_to_slp_tree_map_t *bst_map,
+			      const array_slice<slp_tree> &loads)
+{
+  /* We at this point want to lower without a fixed VF or vector
+     size in mind which means we cannot actually compute whether we
+     need three or more vectors for a load permutation yet.  So always
+     lower.  */
+  stmt_vec_info first
+    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (loads[0])[0]);
+
+  /* Only a power-of-two number of lanes matches interleaving with N levels.
+     ???  An even number of lanes could be reduced to 1<<ceil_log2(N)-1 lanes
+     at each step.  */
+  unsigned group_lanes = DR_GROUP_SIZE (first);
+  if (exact_log2 (group_lanes) == -1 && group_lanes != 3)
+    return;
+
+  for (slp_tree load : loads)
+    {
+      /* Leave masked or gather loads alone for now.  */
+      if (!SLP_TREE_CHILDREN (load).is_empty ())
+	continue;
+
+      /* We want to pattern-match special cases here and keep those
+	 alone.  Candidates are splats and load-lane.  */
+
+      /* We need to lower only loads of less than half of the group's
+	 lanes, including duplicate lanes.  Note this leaves nodes
+	 with a non-1:1 load permutation around instead of canonicalizing
+	 those into a load and a permute node.  Removing this early
+	 check would do such canonicalization.  */
+      if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
+	continue;
+
+      /* First build (and possibly re-use) a load node for the
+	 unpermuted group.  Gaps in the middle and on the end are
+	 represented with NULL stmts.  */
+      vec<stmt_vec_info> stmts;
+      stmts.create (group_lanes);
+      for (stmt_vec_info s = first; s; s = DR_GROUP_NEXT_ELEMENT (s))
+	{
+	  if (s != first)
+	    for (unsigned i = 1; i < DR_GROUP_GAP (s); ++i)
+	      stmts.quick_push (NULL);
+	  stmts.quick_push (s);
+	}
+      for (unsigned i = 0; i < DR_GROUP_GAP (first); ++i)
+	stmts.quick_push (NULL);
+      poly_uint64 max_nunits = 1;
+      bool *matches = XALLOCAVEC (bool, group_lanes);
+      unsigned limit = 1;
+      unsigned tree_size = 0;
+      slp_tree l0 = vect_build_slp_tree (loop_vinfo, stmts,
+					 group_lanes,
+					 &max_nunits, matches, &limit,
+					 &tree_size, bst_map);
+
+      /* Build the permute to get the original load permutation order.  */
+      lane_permutation_t final_perm;
+      final_perm.create (SLP_TREE_LANES (load));
+      for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
+	final_perm.quick_push
+	  (std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
+
+      while (1)
+	{
+	  unsigned group_lanes = SLP_TREE_LANES (l0);
+	  if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
+	    break;
+
+	  /* Try to lower by reducing the group to half its size using an
+	     interleaving scheme.  For this try to compute whether all
+	     elements needed for this load are in even or odd elements of
+	     an even/odd decomposition with N consecutive elements.
+	     Thus { e, e, o, o, e, e, o, o } would be an even/odd decomposition
+	     with N == 2.  */
+	  /* ???  Only an even number of lanes can be handled this way, but the
+	     fallback below could work for any number.  We have to make sure
+	     to round up in that case.  */
+	  gcc_assert ((group_lanes & 1) == 0 || group_lanes == 3);
+	  unsigned even = 0, odd = 0;
+	  if ((group_lanes & 1) == 0)
+	    {
+	      even = (1 << ceil_log2 (group_lanes)) - 1;
+	      odd = even;
+	      for (auto l : final_perm)
+		{
+		  even &= ~l.second;
+		  odd &= l.second;
+		}
+	    }
+
+	  /* Now build an even or odd extraction from the unpermuted load.  */
+	  lane_permutation_t perm;
+	  perm.create ((group_lanes + 1) / 2);
+	  unsigned level;
+	  if (even
+	      && ((level = 1 << ctz_hwi (even)), true)
+	      && group_lanes % (2 * level) == 0)
+	    {
+	      /* { 0, 1, ... 4, 5 ..., } */
+	      unsigned level = 1 << ctz_hwi (even);
+	      for (unsigned i = 0; i < group_lanes / 2 / level; ++i)
+		for (unsigned j = 0; j < level; ++j)
+		  perm.quick_push (std::make_pair (0, 2 * i * level + j));
+	    }
+	  else if (odd)
+	    {
+	      /* { ..., 2, 3, ... 6, 7 } */
+	      unsigned level = 1 << ctz_hwi (odd);
+	      gcc_assert (group_lanes % (2 * level) == 0);
+	      for (unsigned i = 0; i < group_lanes / 2 / level; ++i)
+		for (unsigned j = 0; j < level; ++j)
+		  perm.quick_push (std::make_pair (0, (2 * i + 1) * level + j));
+	    }
+	  else
+	    {
+	      /* As fallback extract all used lanes and fill to half the
+		 group size by repeating the last element.
+		 ???  This is quite a bad strategy for re-use - we could
+		 brute force our way to find more optimal filling lanes to
+		 maximize re-use when looking at all loads from the group.  */
+	      auto_bitmap l;
+	      for (auto p : final_perm)
+		bitmap_set_bit (l, p.second);
+	      unsigned i = 0;
+	      bitmap_iterator bi;
+	      EXECUTE_IF_SET_IN_BITMAP (l, 0, i, bi)
+		  perm.quick_push (std::make_pair (0, i));
+	      while (perm.length () < (group_lanes + 1) / 2)
+		perm.quick_push (perm.last ());
+	    }
+
+	  /* Update final_perm with the intermediate permute.  */
+	  for (unsigned i = 0; i < final_perm.length (); ++i)
+	    {
+	      unsigned l = final_perm[i].second;
+	      unsigned j;
+	      for (j = 0; j < perm.length (); ++j)
+		if (perm[j].second == l)
+		  {
+		    final_perm[i].second = j;
+		    break;
+		  }
+	      gcc_assert (j < perm.length ());
+	    }
+
+	  /* And create scalar stmts.  */
+	  vec<stmt_vec_info> perm_stmts;
+	  perm_stmts.create (perm.length ());
+	  for (unsigned i = 0; i < perm.length (); ++i)
+	    perm_stmts.quick_push (SLP_TREE_SCALAR_STMTS (l0)[perm[i].second]);
+
+	  slp_tree p = vect_create_new_slp_node (1, VEC_PERM_EXPR);
+	  SLP_TREE_CHILDREN (p).quick_push (l0);
+	  SLP_TREE_LANE_PERMUTATION (p) = perm;
+	  SLP_TREE_VECTYPE (p) = SLP_TREE_VECTYPE (load);
+	  SLP_TREE_LANES (p) = perm.length ();
+	  SLP_TREE_REPRESENTATIVE (p) = SLP_TREE_REPRESENTATIVE (load);
+	  /* ???  As we have scalar stmts for this intermediate permute we
+	     could CSE it via bst_map but we do not want to pick up
+	     another SLP node with a load permutation.  We instead should
+	     have a "local" CSE map here.  */
+	  SLP_TREE_SCALAR_STMTS (p) = perm_stmts;
+
+	  /* We now have a node for (group_lanes + 1) / 2 lanes.  */
+	  l0 = p;
+	}
+
+      /* And finally from the ordered reduction node create the
+	 permute to shuffle the lanes into the original load-permutation
+	 order.  We replace the original load node with this.  */
+      SLP_TREE_CODE (load) = VEC_PERM_EXPR;
+      SLP_TREE_LOAD_PERMUTATION (load).release ();
+      SLP_TREE_LANE_PERMUTATION (load) = final_perm;
+      SLP_TREE_CHILDREN (load).create (1);
+      SLP_TREE_CHILDREN (load).quick_push (l0);
+    }
+}
+
+/* Transform SLP loads in the SLP graph created by SLP discovery to
+   group loads from the same group and lower load permutations that
+   are unlikely to be supported into a series of permutes.
+   In the degenerate case of having only single-lane SLP instances
+   this should result in a series of permute nodes emulating an
+   interleaving scheme.  */
+
+static void
+vect_lower_load_permutations (loop_vec_info loop_vinfo,
+			      scalar_stmts_to_slp_tree_map_t *bst_map)
+{
+  /* Gather and sort loads across all instances.  */
+  hash_set<slp_tree> visited;
+  auto_vec<slp_tree> loads;
+  for (auto inst : loop_vinfo->slp_instances)
+    vect_gather_slp_loads (loads, SLP_INSTANCE_TREE (inst), visited);
+  if (loads.is_empty ())
+    return;
+  loads.qsort (vllp_cmp);
+
+  /* Now process each dataref group separately.  */
+  unsigned firsti = 0;
+  for (unsigned i = 1; i < loads.length (); ++i)
+    {
+      slp_tree first = loads[firsti];
+      slp_tree next = loads[i];
+      stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (first)[0];
+      stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (next)[0];
+      if (STMT_VINFO_GROUPED_ACCESS (a0)
+	  && STMT_VINFO_GROUPED_ACCESS (b0)
+	  && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
+	continue;
+      /* Just one SLP load of a possible group, leave those alone.  */
+      if (i == firsti + 1)
+	{
+	  firsti = i;
+	  continue;
+	}
+      /* Now we have multiple SLP loads of the same group from
+	 firsti to i - 1.  */
+      vect_lower_load_permutations (loop_vinfo, bst_map,
+				    make_array_slice (&loads[firsti],
+						      i - firsti));
+      firsti = i;
+    }
+  if (firsti < loads.length () - 1)
+    vect_lower_load_permutations (loop_vinfo, bst_map,
+				  make_array_slice (&loads[firsti],
+						    loads.length () - firsti));
+}
+
 /* Check if there are stmts in the loop can be vectorized using SLP.  Build SLP
    trees of packed scalar stmts if SLP is possible.  */
 
@@ -4244,6 +4570,23 @@  vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
 	}
     }
 
+  /* When we end up with load permutations that we cannot possibly handle,
+     like those requiring three vector inputs, lower them using interleaving
+     like schemes.  */
+  if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
+    {
+      vect_lower_load_permutations (loop_vinfo, bst_map);
+      if (dump_enabled_p ())
+	{
+	  dump_printf_loc (MSG_NOTE, vect_location,
+			   "SLP graph after lowering permutations:\n");
+	  hash_set<slp_tree> visited;
+	  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
+	    vect_print_slp_graph (MSG_NOTE, vect_location,
+				  SLP_INSTANCE_TREE (instance), visited);
+	}
+    }
+
   release_scalar_stmts_to_slp_tree_map (bst_map);
 
   if (pattern_found && dump_enabled_p ())