Message ID | patch-13785-tamar@arm.com |
---|---|
State | New |
Headers | show |
Series | [1/2] middle-end : Initial scaffolding and definitions for SLP patttern matches | expand |
On Sat, 14 Nov 2020, Tamar Christina wrote: > Hi All, > > This patch adds the pre-requisites and general scaffolding for supporting doing > SLP pattern matching. > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. > > Ok for master? Comments below. > Thanks, > Tamar > > gcc/ChangeLog: > > * tree-vect-loop.c (vect_dissolve_slp_only_patterns): New. > (vect_dissolve_slp_only_groups): Call here. > * tree-vect-slp.c (vect_free_slp_tree, vect_create_new_slp_node): Export > from file. > (vect_build_slp_tree_2): Set vectype for externals. > (vect_print_slp_tree): Print SLP only patterns. > (optimize_load_redistribution_1, optimize_load_redistribution, > vect_match_slp_patterns_2, vect_match_slp_patterns): New. > (vect_analyze_slp): Call matcher. > * tree-vectorizer.c (vec_info::add_pattern_stmt): Save relevancy. > * tree-vectorizer.h (STMT_VINFO_SAVED_RELEVANT, vect_pop_relevancy, > vect_dissolve_pattern_relevancy, vect_save_relevancy, > vect_push_relevancy, vect_free_slp_tree, enum _complex_operation, > class vect_pattern): New. > > --- inline copy of patch -- > > diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c > index 39b7319e8253c351a4f6fbdd8c154330f08f2b1b..791d9c6cb0649862a84fd3c80efc89fefedbb085 100644 > --- a/gcc/tree-vect-loop.c > +++ b/gcc/tree-vect-loop.c > @@ -1979,6 +1979,61 @@ vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs, > return opt_result::success (); > } > > +/* For every SLP only pattern created by the pattern matched rooted in ROOT > + restore the relevancy of the original statements over those of the pattern > + and destroy the pattern relationship. This restores the SLP tree to a state > + where it can be used when SLP build is cancelled or re-tried. */ > + > +static void > +vect_dissolve_slp_only_patterns (loop_vec_info loop_vinfo, > + hash_set<slp_tree> *visited, slp_tree root) > +{ > + if (!root || visited->add (root)) > + return; > + > + unsigned int i; > + slp_tree node; > + stmt_vec_info related_stmt_info; > + stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (root); > + > + if (stmt_info && STMT_VINFO_SLP_VECT_ONLY (stmt_info)) > + { > + vect_pop_relevancy (stmt_info); > + if ((related_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info)) != NULL) > + { > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_NOTE, vect_location, > + "dissolving relevancy of %G", > + STMT_VINFO_STMT (stmt_info)); > + vect_dissolve_pattern_relevancy (related_stmt_info); > + } > + } > + > + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node) > + vect_dissolve_slp_only_patterns (loop_vinfo, visited, node); > +} > + > +/* Lookup any SLP Only Pattern statements created by the SLP pattern matcher in > + all slp_instances in LOOP_VINFO and undo the relevancy of statements such > + that the original SLP tree before the pattern matching is used. */ > + > +static void > +vect_dissolve_slp_only_patterns (loop_vec_info loop_vinfo) > +{ > + > + unsigned int i; > + hash_set<slp_tree> visited; > + > + DUMP_VECT_SCOPE ("vect_dissolve_slp_only_patterns"); > + > + /* Unmark any SLP only patterns as relevant and restore the STMT_INFO of the > + related instruction. */ > + slp_instance instance; > + FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance) > + vect_dissolve_slp_only_patterns (loop_vinfo, &visited, > + SLP_INSTANCE_TREE (instance)); > +} > + > /* Look for SLP-only access groups and turn each individual access into its own > group. */ > static void > @@ -2583,6 +2638,9 @@ again: > /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */ > gcc_assert (!ok); > > + /* Dissolve any SLP patterns created by the SLP pattern matcher. */ > + vect_dissolve_slp_only_patterns (loop_vinfo); > + > /* Try again with SLP forced off but if we didn't do any SLP there is > no point in re-trying. */ > if (!slp) > diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c > index 0c065e835ad13ad32d222e2590e05ef56849c411..3be565a2e566e09a9e42d6c77ba402b9499b06b6 100644 > --- a/gcc/tree-vect-slp.c > +++ b/gcc/tree-vect-slp.c > @@ -105,7 +105,7 @@ _slp_tree::~_slp_tree () > > /* Recursively free the memory allocated for the SLP tree rooted at NODE. */ > > -static void > +void > vect_free_slp_tree (slp_tree node) > { > int i; > @@ -148,7 +148,7 @@ vect_free_slp_instance (slp_instance instance) > > /* Create an SLP node for SCALAR_STMTS. */ > > -slp_tree > +static slp_tree > vect_create_new_slp_node (slp_tree node, > vec<stmt_vec_info> scalar_stmts, unsigned nops) > { > @@ -165,7 +165,7 @@ vect_create_new_slp_node (slp_tree node, > > /* Create an SLP node for SCALAR_STMTS. */ > > -static slp_tree > +slp_tree > vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops) > { > return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops); > @@ -1646,6 +1646,7 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, > { > slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops); > SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt; > + SLP_TREE_VECTYPE (invnode) = vectype; This is wrong - the vector type of invariants is determined by the consuming SLP stmts in vectorizable_* > oprnd_info->ops = vNULL; > children.safe_push (invnode); > continue; > @@ -1929,6 +1930,13 @@ vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc, > dump_printf (dump_kind, " %u", j); > dump_printf (dump_kind, " }\n"); > } > + if (SLP_TREE_REPRESENTATIVE (node) > + && STMT_VINFO_SLP_VECT_ONLY (SLP_TREE_REPRESENTATIVE (node))) > + { > + dump_printf_loc (metadata, user_loc, "\tSLP Only pattern:\n"); > + dump_printf_loc (dump_kind, user_loc, "\t %G", > + STMT_VINFO_STMT (SLP_TREE_REPRESENTATIVE (node))); > + } > if (SLP_TREE_LANE_PERMUTATION (node).exists ()) > { > dump_printf_loc (metadata, user_loc, "\tlane permutation {"); > @@ -2174,6 +2182,219 @@ calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size) > return exact_div (common_multiple (nunits, group_size), group_size); > } > > +/* Helper function of optimize_load_redistribution that performs the operation > + recursively. */ > + > +bool optimize_load_redistribution_1 (hash_set<slp_tree> *loads, > + scalar_stmts_to_slp_tree_map_t *bst_map, > + hash_set<slp_tree> *visited, > + slp_tree parent, unsigned idx, > + slp_tree root) > +{ > + if (visited->contains (root)) > + return true; > + visited->add (root); if (visited->add (root)) return true; > + > + slp_tree node; > + unsigned i; > + stmt_vec_info dr_stmt = NULL; > + > + /* For now, we don't know anything about externals so do not do anything. */ > + if (SLP_TREE_DEF_TYPE (root) == vect_external_def > + || SLP_TREE_DEF_TYPE (root) == vect_constant_def) > + return false; > + > + if (gimple_assign_load_p (STMT_VINFO_STMT (SLP_TREE_REPRESENTATIVE (root)))) The appropriate check is STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)) && DR_IS_READ (STMT_VINFO_DATA_REF (...)) let's not mix gimple & SLP checks when not necessary. > + loads->add (root); > + > + if (SLP_TREE_CODE (root) == VEC_PERM_EXPR else if > + && SLP_TREE_LANE_PERMUTATION (root).exists () > + && !SLP_TREE_SCALAR_STMTS (root).exists ()) why do !SLP_TREE_SCALAR_STMTS matter? > + { > + extra vertical space > + /* First convert this node into a load node and add it to the leaves > + list and flatten the permute from a lane to a load one. If it's > + unneeded it will be elided later. */ > + auto_vec<stmt_vec_info> stmts; > + stmts.create (SLP_TREE_LANES (root)); > + load_permutation_t load_perm; > + load_perm.create (SLP_TREE_LANES (root)); > + lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root); > + for (unsigned j = 0; j < lane_perm.length (); j++) > + { > + std::pair<unsigned, unsigned> perm = lane_perm[j]; > + /* This isn't strictly needed, but this function is a temporary > + one for specifically pattern matching, so don't want it to > + optimize things the remainder of the pipeline will. */ > + if (perm.first != j) > + goto next; > + node = SLP_TREE_CHILDREN (root)[perm.first]; > + if (!SLP_TREE_LOAD_PERMUTATION (node).exists ()) > + { > + load_perm.release (); > + return false; you're possibly prematurely exiting the DFS walk on a two-operator permute. Ah, guess that's what the above check was for? I guess it's better to pre-check all children of the VEC_PERM node to be proper, two(?) leafs with load permutation. > + } > + > + stmt_vec_info rep_stmt = SLP_TREE_REPRESENTATIVE (node); > + if (!STMT_VINFO_GROUPED_ACCESS (rep_stmt)) > + goto next; I think for a node with a load permutation this never triggers. > + > + if (!dr_stmt) > + dr_stmt = DR_GROUP_FIRST_ELEMENT (rep_stmt); > + > + if (dr_stmt != DR_GROUP_FIRST_ELEMENT (rep_stmt)) > + goto next; So all of the above could be done on the children w/o allocating and the rest on the loop iterating over the lanes. > + stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]); > + load_perm.safe_push (SLP_TREE_LOAD_PERMUTATION (node)[perm.second]); > + } > + > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_NOTE, vect_location, > + "converting loads on permute node %p\n", root); > + > + slp_tree *value = bst_map->get (stmts); > + if (value) > + node = *value; > + else > + { > + /* One last iteration to free the nodes. */ > + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node) > + { > + /* If we are the only reference to the node, remove the vertex. > + We don't have to modify the graph since vertices lead the > + graph traversal. */ > + vect_free_slp_tree (node); > + } You should do this unconditionally (also if the bst_map lookup succeeded), but after bumping the refcount for the use in this node. > + > + vec<stmt_vec_info> stmts_cpy = stmts.copy (); > + node = vect_create_new_slp_node (stmts_cpy.copy (), 0); > + bst_map->put (stmts_cpy, node); > + } > + SLP_TREE_CHILDREN (parent)[idx] = node; hmm, what is 'parent' - shouldn't this be 'root'? Ah, I see what you are doing. I think you should do what optimize_slp does, namely replace 'root' in-place so you do not have to adjust parents (or even care for the case of multiple parents refering to 'root'). I see that doesn't easily work when attempting to CSE so the actual CSE needs to happen below (*) > + SLP_TREE_REF_COUNT (node)++; > + SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (root); > + SLP_TREE_LOAD_PERMUTATION (node) = load_perm; > + loads->add (node); Note with the load_lane check delayed we no longer need to vect_gather_slp_loads so early so I suggest to simply remove the existing early call and do it after pattern matching instead. > + //do this up the recursive call. > + //vect_free_slp_tree (root); > + > + return true; > + } > + > +next: > + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node) > + optimize_load_redistribution_1 (loads, bst_map, visited, root, i, node); (*) so here after recursing (and maybe returning that 'node' was changed) try to CSE 'node' like FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node) if (optimize_load_redistribution_1 (loads, bst_map, visited, root, i, node)) { if (slp_tree *value = bst_map->get (stmts)) { SLP_TREE_CHILDREN (root)[i] = value; SLP_TREE_REFCNT (value)++; slp_tree_free (root); } } or so. Note that the lane configuration of the result of you transform doesn't change, we're merely propagating the lanes from the permuted loads to the VEC_PERM node (where we "missed" them) and then we're doing a re-lookup to do the CSE. Then of course there's the part turning a VEC_PERM node to a load itself. So I wonder if we can make sure SLP_TREE_SCALAR_STMTS (the output lane config) for the VEC_PERM node is set appropriately by pattern matching? Or if we should make this a two-step thing, propagate the lane setup and CSE and then turn VEC_PERMs into permuted loads? It might make sense to split out this optimization from the patch. > + > + return true; > +} > + > +/* Temporary workaround for loads not being CSEd during SLP build. This > + function will traverse the SLP tree rooted in ROOT for INSTANCE and find > + VEC_PERM nodes that blend vectors from multiple nodes that all read from the > + same DR such that the final operation is equal to a permuted load. Such > + NODES are then directly converted into LOADS themselves. The nodes are > + CSEd using BST_MAP. > + > + Finally the LOADS in INSTANCE are updated with the current set of loads. */ > + > +bool optimize_load_redistribution (slp_instance instance, > + scalar_stmts_to_slp_tree_map_t *bst_map, > + slp_tree root) > +{ > + slp_tree node; > + unsigned i; > + hash_set<slp_tree> visited; > + hash_set<slp_tree> loads; > + > + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node) > + optimize_load_redistribution_1 (&loads, bst_map, &visited, root, i, node); > + > + SLP_INSTANCE_LOADS (instance).truncate (0); > + for (hash_set<slp_tree>::iterator it = loads.begin (); > + it != loads.end (); ++it) > + SLP_INSTANCE_LOADS (instance).safe_push (*it); > + > + return true; > +} > + > +/* Helper function of vect_match_slp_patterns. > + > + Attempts to match patterns against the slp tree rooted in REF_NODE using > + VINFO. Patterns are matched in post-order traversal. > + > + If matching is successful the value in REF_NODE is updated and returned, if > + not then it is returned unchanged. */ > + > +static bool > +vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo, > + slp_tree_to_load_perm_map_t *perm_cache, > + hash_set<slp_tree> *visited) > +{ > + unsigned i; > + slp_tree node = *ref_node; > + bool found_p = false, found_rec_p = false; > + if (!node || visited->add (node)) > + return false; > + > + slp_tree child; > + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) > + found_rec_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i], > + vinfo, perm_cache, visited); > + > + for (unsigned x = 0; x < num__slp_patterns; x++) > + { > + vect_pattern *pattern = slp_patterns[x] (ref_node); > + found_p = pattern->recognize (perm_cache, vinfo); > + delete pattern; > + found_rec_p = found_p | found_rec_p; > + } > + > + return found_rec_p; > +} > + > +/* Applies pattern matching to the given SLP tree rooted in REF_NODE using > + vec_info VINFO. > + > + The modified tree is returned. Patterns are tried in order and multiple > + patterns may match. */ > + > +static bool > +vect_match_slp_patterns (slp_instance instance, vec_info *vinfo, > + hash_set<slp_tree> *visited, > + slp_tree_to_load_perm_map_t *perm_cache, > + scalar_stmts_to_slp_tree_map_t *bst_map) > +{ > + DUMP_VECT_SCOPE ("vect_match_slp_patterns"); > + slp_tree *ref_node = &SLP_INSTANCE_TREE (instance); > + > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_NOTE, vect_location, > + "Analyzing SLP tree %p for patterns\n", > + SLP_INSTANCE_TREE (instance)); > + > + bool found_p > + = vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, visited); > + > + if (found_p) > + { > + optimize_load_redistribution (instance, bst_map, *ref_node); > + > + if (dump_enabled_p ()) > + { > + dump_printf_loc (MSG_NOTE, vect_location, > + "Pattern matched SLP tree\n"); > + vect_print_slp_graph (MSG_NOTE, vect_location, *ref_node); > + } > + } > + > + return found_p; > +} > + > +/* Analyze an SLP instance starting from a group of grouped stores. Call > + vect_build_slp_tree to build a tree of packed stmts if possible. > + Return FALSE if it's impossible to SLP any stmt in the loop. */ > + > static bool > vect_analyze_slp_instance (vec_info *vinfo, > scalar_stmts_to_slp_tree_map_t *bst_map, > @@ -2540,6 +2761,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size) > { > unsigned int i; > stmt_vec_info first_element; > + slp_instance instance; > > DUMP_VECT_SCOPE ("vect_analyze_slp"); > > @@ -2586,6 +2808,13 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size) > slp_inst_kind_reduc_group, max_tree_size); > } > > + hash_set<slp_tree> visited_patterns; > + slp_tree_to_load_perm_map_t perm_cache; > + /* See if any patterns can be found in the SLP tree. */ > + FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance) > + vect_match_slp_patterns (instance, vinfo, &visited_patterns, &perm_cache, > + bst_map); > + > /* The map keeps a reference on SLP nodes built, release that. */ > for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin (); > it != bst_map->end (); ++it) > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h > index fa211b95c0e54be1d51ed949d7a06c31b7b50802..3b49ce22f7aae0465dbd0b24cbf48ae054c31d22 100644 > --- a/gcc/tree-vectorizer.h > +++ b/gcc/tree-vectorizer.h > @@ -27,6 +27,7 @@ typedef class _stmt_vec_info *stmt_vec_info; > #include "tree-hash-traits.h" > #include "target.h" > #include "alloc-pool.h" > +#include "internal-fn.h" > > > /* Used for naming of new temporaries. */ > @@ -1118,6 +1119,11 @@ public: > indicates whether the stmt needs to be vectorized. */ > enum vect_relevant relevant; > > + /* During SLP vectorization we may need to change the relevancy of a statement > + but restore it during dissolving of SLP nodes. This field contains a copy > + of the original relevancy analysis. */ > + enum vect_relevant saved_relevant; > + > /* For loads if this is a gather, for stores if this is a scatter. */ > bool gather_scatter_p; > > @@ -1240,6 +1246,7 @@ struct gather_scatter_info { > #define STMT_VINFO_TYPE(S) (S)->type > #define STMT_VINFO_STMT(S) (S)->stmt > #define STMT_VINFO_RELEVANT(S) (S)->relevant > +#define STMT_VINFO_SAVED_RELEVANT(S) (S)->saved_relevant > #define STMT_VINFO_LIVE_P(S) (S)->live > #define STMT_VINFO_VECTYPE(S) (S)->vectype > #define STMT_VINFO_VEC_STMTS(S) (S)->vec_stmts > @@ -1368,6 +1375,46 @@ vect_orig_stmt (stmt_vec_info stmt_info) > return stmt_info; > } > > +/* If restore the saved relevancy information STMT_INFO from the copy made > + during SLP pattern detection. */ > + > +static inline void > +vect_pop_relevancy (stmt_vec_info stmt_info) > +{ > + STMT_VINFO_RELEVANT (stmt_info) = STMT_VINFO_SAVED_RELEVANT (stmt_info); > +} Calling the single-level "stack" a stack is a bit odd. Since you already have vect_save_relevancy () just call ig vect_restore_relevancy ()? > + > +/* Restores the saved relevancy of STMT_INFO and marks it as not being inside a > + pattern. Lastly the SLP_TYPE is set to loop_vect. */ > + > +static inline void > +vect_dissolve_pattern_relevancy (stmt_vec_info stmt_info) > +{ > + vect_pop_relevancy (stmt_info); > + STMT_VINFO_IN_PATTERN_P (stmt_info) = false; > + STMT_SLP_TYPE (stmt_info) = loop_vect; > +} > + > +/* Save the current relevancy of STMT_INFO such that it can be restored by > + vect_pop_relevancy. */ > + > +static inline void > +vect_save_relevancy (stmt_vec_info stmt_info) > +{ > + STMT_VINFO_SAVED_RELEVANT (stmt_info) > + = STMT_VINFO_RELEVANT (stmt_info); > +} > + > +/* Save the current relevancy of STMT_INFO before changing it to REL. */ > + > +static inline void > +vect_push_relevancy (stmt_vec_info stmt_info, enum vect_relevant rel) > +{ > + vect_save_relevancy (stmt_info); > + STMT_VINFO_RELEVANT (stmt_info) = rel; > +} See above - maybe combine save and push by doing vect_set_slp_relevancy (...) and vect_restore_loop_relevancy (...) rather than the four separate functions. It would be nice if the vect_dissolve_slp_only_patterns work could be done fully from the IL walk we do here: /* Reset SLP type to loop_vect on all stmts. */ for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i) { basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i]; for (gimple_stmt_iterator si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si)) { ... where we also reset STMT_VINFO_SLP_TYPE. > + > /* Return the later statement between STMT1_INFO and STMT2_INFO. */ > > static inline stmt_vec_info > @@ -1993,6 +2040,7 @@ extern void duplicate_and_interleave (vec_info *, gimple_seq *, tree, > extern int vect_get_place_in_interleaving_chain (stmt_vec_info, stmt_vec_info); > extern bool vect_update_shared_vectype (stmt_vec_info, tree); > extern slp_tree vect_create_new_slp_node (vec<stmt_vec_info>, unsigned); > +extern void vect_free_slp_tree (slp_tree); > > /* In tree-vect-patterns.c. */ > extern void > @@ -2009,4 +2057,108 @@ void vect_free_loop_info_assumptions (class loop *); > gimple *vect_loop_vectorized_call (class loop *, gcond **cond = NULL); > bool vect_stmt_dominates_stmt_p (gimple *, gimple *); > > +/* SLP Pattern matcher types, tree-vect-slp-patterns.c. */ > + > +/* Forward declaration of possible two operands operation that can be matched > + by the complex numbers pattern matchers. */ > +enum _complex_operation : unsigned; > + > +/* Cache from nodes to the load permutation they represent. */ > +typedef hash_map <slp_tree, load_permutation_t > > + slp_tree_to_load_perm_map_t; > + > +/* Vector pattern matcher base class. All SLP pattern matchers must inherit > + from this type. */ > + > +class vect_pattern > +{ > + protected: > + /* The number of arguments that the IFN requires. */ > + unsigned m_num_args; > + > + /* The internal function that will be used when a pattern is created. */ > + internal_fn m_ifn; > + > + /* The current node being inspected. */ > + slp_tree *m_node; > + > + /* The list of operands to be the children for the node produced when the > + internal function is created. */ > + vec<slp_tree> m_ops; > + > + /* Default constructor where NODE is the root of the tree to inspect. */ > + vect_pattern (slp_tree *node) > + { > + this->m_ifn = IFN_LAST; > + this->m_node = node; > + this->m_ops.create (0); > + } > + > + public: > + /* Attempt to recognize a pattern, validate and update the tree rooted in > + M_NODE. */ > + virtual bool recognize (slp_tree_to_load_perm_map_t *, vec_info *); > + > + /* Only perform the pattern creation part of the matcher. This creates and > + returns the new pattern statement. */ > + virtual gcall *build (vec_info *) = 0; > + > + /* Performs a check to see if the matched IFN is supported by the current > + target. */ > + virtual bool is_optab_supported_p (tree vectype, optimization_type opt_type) > + { > + if (!vectype) > + return false; > + > + return direct_internal_fn_supported_p (this->m_ifn, vectype, opt_type); > + } > + > + /* Create a new instance of the pattern matcher class of the given type. */ > + static vect_pattern* create (slp_tree *); > + > + /* Match but do not perform any additional operations on the SLP tree. */ > + virtual bool matches (slp_tree_to_load_perm_map_t *) = 0; > + > + /* Match but use for the first operation the supplied COMPLEX_OPERATION. No > + additional operations or modification of the SLP tree are performed. */ > + virtual bool matches (enum _complex_operation, > + slp_tree_to_load_perm_map_t *, vec<slp_tree>) > + { > + return false; > + } > + > + /* Friendly name of the operation the pattern matches. */ > + virtual const char* get_name () = 0; > + > + /* Default destructor. */ > + virtual ~vect_pattern () > + { > + this->m_ops.release (); > + } > + > + /* Check to see if the matched tree is valid for the operation the matcher > + wants. If the operation is valid then the tree is reshaped in the final > + format that build () requires. */ > + virtual bool validate_p (slp_tree_to_load_perm_map_t *) > + { > + return true; > + } > + > + /* Return the matched internal function. If no match was done this is set > + to LAST_IFN. */ > + virtual internal_fn get_ifn () > + { > + return this->m_ifn; > + } > +}; > + > +/* Function pointer to create a new pattern matcher from a generic type. */ > +typedef vect_pattern* (*vect_pattern_decl_t) (slp_tree *); > + > +/* List of supported pattern matchers. */ > +extern vect_pattern_decl_t slp_patterns[]; > + > +/* Number of supported pattern matchers. */ > +extern size_t num__slp_patterns; > + > #endif /* GCC_TREE_VECTORIZER_H */ > diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c > index d81774b242569262a51b7be02815acd6d1a6bfd0..2a6ddd685922f6b60ae1305974335fb863a2af39 100644 > --- a/gcc/tree-vectorizer.c > +++ b/gcc/tree-vectorizer.c > @@ -535,6 +535,8 @@ vec_info::add_pattern_stmt (gimple *stmt, stmt_vec_info stmt_info) > stmt_vec_info res = new_stmt_vec_info (stmt); > set_vinfo_for_stmt (stmt, res, false); > STMT_VINFO_RELATED_STMT (res) = stmt_info; > + vect_save_relevancy (stmt_info); > + vect_push_relevancy (res, STMT_VINFO_RELEVANT (stmt_info)); Hmmm, that looks like an odd place to do this. I suspect it's not the "final" modification of either relevancy? Can we get rid of this hunk somehow? Thanks, Richard. > return res; > } > > > >
On Mon, 16 Nov 2020, Richard Biener wrote: > On Sat, 14 Nov 2020, Tamar Christina wrote: > > > Hi All, > > > > This patch adds the pre-requisites and general scaffolding for supporting doing > > SLP pattern matching. > > > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. > > > > Ok for master? > > Comments below. > > > Thanks, > > Tamar > > > > gcc/ChangeLog: > > > > * tree-vect-loop.c (vect_dissolve_slp_only_patterns): New. > > (vect_dissolve_slp_only_groups): Call here. > > * tree-vect-slp.c (vect_free_slp_tree, vect_create_new_slp_node): Export > > from file. > > (vect_build_slp_tree_2): Set vectype for externals. > > (vect_print_slp_tree): Print SLP only patterns. > > (optimize_load_redistribution_1, optimize_load_redistribution, > > vect_match_slp_patterns_2, vect_match_slp_patterns): New. > > (vect_analyze_slp): Call matcher. > > * tree-vectorizer.c (vec_info::add_pattern_stmt): Save relevancy. > > * tree-vectorizer.h (STMT_VINFO_SAVED_RELEVANT, vect_pop_relevancy, > > vect_dissolve_pattern_relevancy, vect_save_relevancy, > > vect_push_relevancy, vect_free_slp_tree, enum _complex_operation, > > class vect_pattern): New. > > > > --- inline copy of patch -- > > > > diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c > > index 39b7319e8253c351a4f6fbdd8c154330f08f2b1b..791d9c6cb0649862a84fd3c80efc89fefedbb085 100644 > > --- a/gcc/tree-vect-loop.c > > +++ b/gcc/tree-vect-loop.c > > @@ -1979,6 +1979,61 @@ vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs, > > return opt_result::success (); > > } > > > > +/* For every SLP only pattern created by the pattern matched rooted in ROOT > > + restore the relevancy of the original statements over those of the pattern > > + and destroy the pattern relationship. This restores the SLP tree to a state > > + where it can be used when SLP build is cancelled or re-tried. */ > > + > > +static void > > +vect_dissolve_slp_only_patterns (loop_vec_info loop_vinfo, > > + hash_set<slp_tree> *visited, slp_tree root) > > +{ > > + if (!root || visited->add (root)) > > + return; > > + > > + unsigned int i; > > + slp_tree node; > > + stmt_vec_info related_stmt_info; > > + stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (root); > > + > > + if (stmt_info && STMT_VINFO_SLP_VECT_ONLY (stmt_info)) > > + { > > + vect_pop_relevancy (stmt_info); > > + if ((related_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info)) != NULL) > > + { > > + if (dump_enabled_p ()) > > + dump_printf_loc (MSG_NOTE, vect_location, > > + "dissolving relevancy of %G", > > + STMT_VINFO_STMT (stmt_info)); > > + vect_dissolve_pattern_relevancy (related_stmt_info); > > + } > > + } > > + > > + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node) > > + vect_dissolve_slp_only_patterns (loop_vinfo, visited, node); > > +} > > + > > +/* Lookup any SLP Only Pattern statements created by the SLP pattern matcher in > > + all slp_instances in LOOP_VINFO and undo the relevancy of statements such > > + that the original SLP tree before the pattern matching is used. */ > > + > > +static void > > +vect_dissolve_slp_only_patterns (loop_vec_info loop_vinfo) > > +{ > > + > > + unsigned int i; > > + hash_set<slp_tree> visited; > > + > > + DUMP_VECT_SCOPE ("vect_dissolve_slp_only_patterns"); > > + > > + /* Unmark any SLP only patterns as relevant and restore the STMT_INFO of the > > + related instruction. */ > > + slp_instance instance; > > + FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance) > > + vect_dissolve_slp_only_patterns (loop_vinfo, &visited, > > + SLP_INSTANCE_TREE (instance)); > > +} > > + > > /* Look for SLP-only access groups and turn each individual access into its own > > group. */ > > static void > > @@ -2583,6 +2638,9 @@ again: > > /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */ > > gcc_assert (!ok); > > > > + /* Dissolve any SLP patterns created by the SLP pattern matcher. */ > > + vect_dissolve_slp_only_patterns (loop_vinfo); > > + > > /* Try again with SLP forced off but if we didn't do any SLP there is > > no point in re-trying. */ > > if (!slp) > > diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c > > index 0c065e835ad13ad32d222e2590e05ef56849c411..3be565a2e566e09a9e42d6c77ba402b9499b06b6 100644 > > --- a/gcc/tree-vect-slp.c > > +++ b/gcc/tree-vect-slp.c > > @@ -105,7 +105,7 @@ _slp_tree::~_slp_tree () > > > > /* Recursively free the memory allocated for the SLP tree rooted at NODE. */ > > > > -static void > > +void > > vect_free_slp_tree (slp_tree node) > > { > > int i; > > @@ -148,7 +148,7 @@ vect_free_slp_instance (slp_instance instance) > > > > /* Create an SLP node for SCALAR_STMTS. */ > > > > -slp_tree > > +static slp_tree > > vect_create_new_slp_node (slp_tree node, > > vec<stmt_vec_info> scalar_stmts, unsigned nops) > > { > > @@ -165,7 +165,7 @@ vect_create_new_slp_node (slp_tree node, > > > > /* Create an SLP node for SCALAR_STMTS. */ > > > > -static slp_tree > > +slp_tree > > vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops) > > { > > return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops); > > @@ -1646,6 +1646,7 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, > > { > > slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops); > > SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt; > > + SLP_TREE_VECTYPE (invnode) = vectype; > > This is wrong - the vector type of invariants is determined by > the consuming SLP stmts in vectorizable_* > > > oprnd_info->ops = vNULL; > > children.safe_push (invnode); > > continue; > > @@ -1929,6 +1930,13 @@ vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc, > > dump_printf (dump_kind, " %u", j); > > dump_printf (dump_kind, " }\n"); > > } > > + if (SLP_TREE_REPRESENTATIVE (node) > > + && STMT_VINFO_SLP_VECT_ONLY (SLP_TREE_REPRESENTATIVE (node))) > > + { > > + dump_printf_loc (metadata, user_loc, "\tSLP Only pattern:\n"); > > + dump_printf_loc (dump_kind, user_loc, "\t %G", > > + STMT_VINFO_STMT (SLP_TREE_REPRESENTATIVE (node))); > > + } > > if (SLP_TREE_LANE_PERMUTATION (node).exists ()) > > { > > dump_printf_loc (metadata, user_loc, "\tlane permutation {"); > > @@ -2174,6 +2182,219 @@ calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size) > > return exact_div (common_multiple (nunits, group_size), group_size); > > } > > > > +/* Helper function of optimize_load_redistribution that performs the operation > > + recursively. */ > > + > > +bool optimize_load_redistribution_1 (hash_set<slp_tree> *loads, > > + scalar_stmts_to_slp_tree_map_t *bst_map, > > + hash_set<slp_tree> *visited, > > + slp_tree parent, unsigned idx, > > + slp_tree root) > > +{ > > + if (visited->contains (root)) > > + return true; > > + visited->add (root); > > if (visited->add (root)) > return true; > > > + > > + slp_tree node; > > + unsigned i; > > + stmt_vec_info dr_stmt = NULL; > > + > > + /* For now, we don't know anything about externals so do not do anything. */ > > + if (SLP_TREE_DEF_TYPE (root) == vect_external_def > > + || SLP_TREE_DEF_TYPE (root) == vect_constant_def) > > + return false; > > + > > + if (gimple_assign_load_p (STMT_VINFO_STMT (SLP_TREE_REPRESENTATIVE (root)))) > > The appropriate check is STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE > (root)) && DR_IS_READ (STMT_VINFO_DATA_REF (...)) > > let's not mix gimple & SLP checks when not necessary. > > > + loads->add (root); > > + > > + if (SLP_TREE_CODE (root) == VEC_PERM_EXPR > > else if > > > + && SLP_TREE_LANE_PERMUTATION (root).exists () > > + && !SLP_TREE_SCALAR_STMTS (root).exists ()) > > why do !SLP_TREE_SCALAR_STMTS matter? > > > + { > > + > > extra vertical space > > > + /* First convert this node into a load node and add it to the leaves > > + list and flatten the permute from a lane to a load one. If it's > > + unneeded it will be elided later. */ > > + auto_vec<stmt_vec_info> stmts; > > + stmts.create (SLP_TREE_LANES (root)); > > + load_permutation_t load_perm; > > + load_perm.create (SLP_TREE_LANES (root)); > > + lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root); > > + for (unsigned j = 0; j < lane_perm.length (); j++) > > + { > > + std::pair<unsigned, unsigned> perm = lane_perm[j]; > > + /* This isn't strictly needed, but this function is a temporary > > + one for specifically pattern matching, so don't want it to > > + optimize things the remainder of the pipeline will. */ > > + if (perm.first != j) > > + goto next; > > + node = SLP_TREE_CHILDREN (root)[perm.first]; > > + if (!SLP_TREE_LOAD_PERMUTATION (node).exists ()) > > + { > > + load_perm.release (); > > + return false; > > you're possibly prematurely exiting the DFS walk on a two-operator > permute. Ah, guess that's what the above check was for? I guess > it's better to pre-check all children of the VEC_PERM node > to be proper, two(?) leafs with load permutation. > > > + } > > + > > + stmt_vec_info rep_stmt = SLP_TREE_REPRESENTATIVE (node); > > + if (!STMT_VINFO_GROUPED_ACCESS (rep_stmt)) > > + goto next; > > I think for a node with a load permutation this never triggers. > > > + > > + if (!dr_stmt) > > + dr_stmt = DR_GROUP_FIRST_ELEMENT (rep_stmt); > > + > > + if (dr_stmt != DR_GROUP_FIRST_ELEMENT (rep_stmt)) > > + goto next; > > So all of the above could be done on the children w/o allocating > and the rest on the loop iterating over the lanes. > > > + stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]); > > + load_perm.safe_push (SLP_TREE_LOAD_PERMUTATION (node)[perm.second]); > > + } > > + > > + if (dump_enabled_p ()) > > + dump_printf_loc (MSG_NOTE, vect_location, > > + "converting loads on permute node %p\n", root); > > + > > + slp_tree *value = bst_map->get (stmts); > > + if (value) > > + node = *value; > > + else > > + { > > + /* One last iteration to free the nodes. */ > > + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node) > > + { > > + /* If we are the only reference to the node, remove the vertex. > > + We don't have to modify the graph since vertices lead the > > + graph traversal. */ > > + vect_free_slp_tree (node); > > + } > > You should do this unconditionally (also if the bst_map lookup > succeeded), but after bumping the refcount for the use in this > node. > > > + > > + vec<stmt_vec_info> stmts_cpy = stmts.copy (); > > + node = vect_create_new_slp_node (stmts_cpy.copy (), 0); > > + bst_map->put (stmts_cpy, node); > > + } > > + SLP_TREE_CHILDREN (parent)[idx] = node; > > hmm, what is 'parent' - shouldn't this be 'root'? Ah, I see what > you are doing. I think you should do what optimize_slp does, > namely replace 'root' in-place so you do not have to adjust > parents (or even care for the case of multiple parents refering to > 'root'). I see that doesn't easily work when attempting to CSE so > the actual CSE needs to happen below (*) > > > + SLP_TREE_REF_COUNT (node)++; > > + SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (root); > > + SLP_TREE_LOAD_PERMUTATION (node) = load_perm; > > + loads->add (node); > > Note with the load_lane check delayed we no longer need > to vect_gather_slp_loads so early so I suggest to simply > remove the existing early call and do it after > pattern matching instead. I'm testing the patch below for this. Richard. From 51b89070fcf8eacb188439e6c1b777fd9db4b2ae Mon Sep 17 00:00:00 2001 From: Richard Biener <rguenther@suse.de> Date: Mon, 16 Nov 2020 14:26:20 +0100 Subject: [PATCH] Delay SLP instance loads gathering To: gcc-patches@gcc.gnu.org This delays filling SLP_INSTANCE_LOADS. 2020-11-16 Richard Biener <rguenther@suse.de> * tree-vectorizer.h (vect_gather_slp_loads): Declare. * tree-vect-loop.c (vect_analyze_loop_2): Call vect_gather_slp_loads. * tree-vect-slp.c (vect_build_slp_instance): Do not gather SLP loads here. (vect_gather_slp_loads): Remove wrapper, new function. (vect_slp_analyze_bb_1): Call it. --- gcc/tree-vect-loop.c | 3 +++ gcc/tree-vect-slp.c | 26 ++++++++++++++++++-------- gcc/tree-vectorizer.h | 1 + 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index 4d5532f71d0..ecaaf0116d3 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -2298,6 +2298,9 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts) /* Optimize the SLP graph with the vectorization factor fixed. */ vect_optimize_slp (loop_vinfo); + + /* Gather the loads reachable from the SLP graph entries. */ + vect_gather_slp_loads (loop_vinfo); } bool saved_can_use_partial_vectors_p diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index b98d5db9f76..d2f2407ac92 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -2071,13 +2071,6 @@ vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node, } } -static void -vect_gather_slp_loads (slp_instance inst, slp_tree node) -{ - hash_set<slp_tree> visited; - vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst), node, visited); -} - /* Find the last store in SLP INSTANCE. */ @@ -2252,7 +2245,6 @@ vect_build_slp_instance (vec_info *vinfo, new_instance->cost_vec = vNULL; new_instance->subgraph_entries = vNULL; - vect_gather_slp_loads (new_instance, node); if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "SLP size %u vs. limit %u.\n", @@ -3071,6 +3063,21 @@ vect_optimize_slp (vec_info *vinfo) } } +/* Gather loads reachable from the individual SLP graph entries. */ + +void +vect_gather_slp_loads (vec_info *vinfo) +{ + unsigned i; + slp_instance instance; + FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance) + { + hash_set<slp_tree> visited; + vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance), + SLP_INSTANCE_TREE (instance), visited); + } +} + /* For each possible SLP instance decide whether to SLP it and calculate overall unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at @@ -4152,6 +4159,9 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal, /* Optimize permutations. */ vect_optimize_slp (bb_vinfo); + /* Gather the loads reachable from the SLP graph entries. */ + vect_gather_slp_loads (bb_vinfo); + vect_record_base_alignments (bb_vinfo); /* Analyze and verify the alignment of data references and the diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 3ccd0fd552d..0ee4ef32eb2 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -1974,6 +1974,7 @@ extern opt_result vect_analyze_slp (vec_info *, unsigned); extern bool vect_make_slp_decision (loop_vec_info); extern void vect_detect_hybrid_slp (loop_vec_info); extern void vect_optimize_slp (vec_info *); +extern void vect_gather_slp_loads (vec_info *); extern void vect_get_slp_defs (slp_tree, vec<tree> *); extern void vect_get_slp_defs (vec_info *, slp_tree, vec<vec<tree> > *, unsigned n = -1U);
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index 39b7319e8253c351a4f6fbdd8c154330f08f2b1b..791d9c6cb0649862a84fd3c80efc89fefedbb085 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -1979,6 +1979,61 @@ vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs, return opt_result::success (); } +/* For every SLP only pattern created by the pattern matched rooted in ROOT + restore the relevancy of the original statements over those of the pattern + and destroy the pattern relationship. This restores the SLP tree to a state + where it can be used when SLP build is cancelled or re-tried. */ + +static void +vect_dissolve_slp_only_patterns (loop_vec_info loop_vinfo, + hash_set<slp_tree> *visited, slp_tree root) +{ + if (!root || visited->add (root)) + return; + + unsigned int i; + slp_tree node; + stmt_vec_info related_stmt_info; + stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (root); + + if (stmt_info && STMT_VINFO_SLP_VECT_ONLY (stmt_info)) + { + vect_pop_relevancy (stmt_info); + if ((related_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info)) != NULL) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "dissolving relevancy of %G", + STMT_VINFO_STMT (stmt_info)); + vect_dissolve_pattern_relevancy (related_stmt_info); + } + } + + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node) + vect_dissolve_slp_only_patterns (loop_vinfo, visited, node); +} + +/* Lookup any SLP Only Pattern statements created by the SLP pattern matcher in + all slp_instances in LOOP_VINFO and undo the relevancy of statements such + that the original SLP tree before the pattern matching is used. */ + +static void +vect_dissolve_slp_only_patterns (loop_vec_info loop_vinfo) +{ + + unsigned int i; + hash_set<slp_tree> visited; + + DUMP_VECT_SCOPE ("vect_dissolve_slp_only_patterns"); + + /* Unmark any SLP only patterns as relevant and restore the STMT_INFO of the + related instruction. */ + slp_instance instance; + FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance) + vect_dissolve_slp_only_patterns (loop_vinfo, &visited, + SLP_INSTANCE_TREE (instance)); +} + /* Look for SLP-only access groups and turn each individual access into its own group. */ static void @@ -2583,6 +2638,9 @@ again: /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */ gcc_assert (!ok); + /* Dissolve any SLP patterns created by the SLP pattern matcher. */ + vect_dissolve_slp_only_patterns (loop_vinfo); + /* Try again with SLP forced off but if we didn't do any SLP there is no point in re-trying. */ if (!slp) diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index 0c065e835ad13ad32d222e2590e05ef56849c411..3be565a2e566e09a9e42d6c77ba402b9499b06b6 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -105,7 +105,7 @@ _slp_tree::~_slp_tree () /* Recursively free the memory allocated for the SLP tree rooted at NODE. */ -static void +void vect_free_slp_tree (slp_tree node) { int i; @@ -148,7 +148,7 @@ vect_free_slp_instance (slp_instance instance) /* Create an SLP node for SCALAR_STMTS. */ -slp_tree +static slp_tree vect_create_new_slp_node (slp_tree node, vec<stmt_vec_info> scalar_stmts, unsigned nops) { @@ -165,7 +165,7 @@ vect_create_new_slp_node (slp_tree node, /* Create an SLP node for SCALAR_STMTS. */ -static slp_tree +slp_tree vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops) { return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops); @@ -1646,6 +1646,7 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, { slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops); SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt; + SLP_TREE_VECTYPE (invnode) = vectype; oprnd_info->ops = vNULL; children.safe_push (invnode); continue; @@ -1929,6 +1930,13 @@ vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc, dump_printf (dump_kind, " %u", j); dump_printf (dump_kind, " }\n"); } + if (SLP_TREE_REPRESENTATIVE (node) + && STMT_VINFO_SLP_VECT_ONLY (SLP_TREE_REPRESENTATIVE (node))) + { + dump_printf_loc (metadata, user_loc, "\tSLP Only pattern:\n"); + dump_printf_loc (dump_kind, user_loc, "\t %G", + STMT_VINFO_STMT (SLP_TREE_REPRESENTATIVE (node))); + } if (SLP_TREE_LANE_PERMUTATION (node).exists ()) { dump_printf_loc (metadata, user_loc, "\tlane permutation {"); @@ -2174,6 +2182,219 @@ calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size) return exact_div (common_multiple (nunits, group_size), group_size); } +/* Helper function of optimize_load_redistribution that performs the operation + recursively. */ + +bool optimize_load_redistribution_1 (hash_set<slp_tree> *loads, + scalar_stmts_to_slp_tree_map_t *bst_map, + hash_set<slp_tree> *visited, + slp_tree parent, unsigned idx, + slp_tree root) +{ + if (visited->contains (root)) + return true; + visited->add (root); + + slp_tree node; + unsigned i; + stmt_vec_info dr_stmt = NULL; + + /* For now, we don't know anything about externals so do not do anything. */ + if (SLP_TREE_DEF_TYPE (root) == vect_external_def + || SLP_TREE_DEF_TYPE (root) == vect_constant_def) + return false; + + if (gimple_assign_load_p (STMT_VINFO_STMT (SLP_TREE_REPRESENTATIVE (root)))) + loads->add (root); + + if (SLP_TREE_CODE (root) == VEC_PERM_EXPR + && SLP_TREE_LANE_PERMUTATION (root).exists () + && !SLP_TREE_SCALAR_STMTS (root).exists ()) + { + + /* First convert this node into a load node and add it to the leaves + list and flatten the permute from a lane to a load one. If it's + unneeded it will be elided later. */ + auto_vec<stmt_vec_info> stmts; + stmts.create (SLP_TREE_LANES (root)); + load_permutation_t load_perm; + load_perm.create (SLP_TREE_LANES (root)); + lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root); + for (unsigned j = 0; j < lane_perm.length (); j++) + { + std::pair<unsigned, unsigned> perm = lane_perm[j]; + /* This isn't strictly needed, but this function is a temporary + one for specifically pattern matching, so don't want it to + optimize things the remainder of the pipeline will. */ + if (perm.first != j) + goto next; + node = SLP_TREE_CHILDREN (root)[perm.first]; + if (!SLP_TREE_LOAD_PERMUTATION (node).exists ()) + { + load_perm.release (); + return false; + } + + stmt_vec_info rep_stmt = SLP_TREE_REPRESENTATIVE (node); + if (!STMT_VINFO_GROUPED_ACCESS (rep_stmt)) + goto next; + + if (!dr_stmt) + dr_stmt = DR_GROUP_FIRST_ELEMENT (rep_stmt); + + if (dr_stmt != DR_GROUP_FIRST_ELEMENT (rep_stmt)) + goto next; + + stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]); + load_perm.safe_push (SLP_TREE_LOAD_PERMUTATION (node)[perm.second]); + } + + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "converting loads on permute node %p\n", root); + + slp_tree *value = bst_map->get (stmts); + if (value) + node = *value; + else + { + /* One last iteration to free the nodes. */ + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node) + { + /* If we are the only reference to the node, remove the vertex. + We don't have to modify the graph since vertices lead the + graph traversal. */ + vect_free_slp_tree (node); + } + + vec<stmt_vec_info> stmts_cpy = stmts.copy (); + node = vect_create_new_slp_node (stmts_cpy.copy (), 0); + bst_map->put (stmts_cpy, node); + } + SLP_TREE_CHILDREN (parent)[idx] = node; + SLP_TREE_REF_COUNT (node)++; + SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (root); + SLP_TREE_LOAD_PERMUTATION (node) = load_perm; + loads->add (node); + //do this up the recursive call. + //vect_free_slp_tree (root); + + return true; + } + +next: + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node) + optimize_load_redistribution_1 (loads, bst_map, visited, root, i, node); + + return true; +} + +/* Temporary workaround for loads not being CSEd during SLP build. This + function will traverse the SLP tree rooted in ROOT for INSTANCE and find + VEC_PERM nodes that blend vectors from multiple nodes that all read from the + same DR such that the final operation is equal to a permuted load. Such + NODES are then directly converted into LOADS themselves. The nodes are + CSEd using BST_MAP. + + Finally the LOADS in INSTANCE are updated with the current set of loads. */ + +bool optimize_load_redistribution (slp_instance instance, + scalar_stmts_to_slp_tree_map_t *bst_map, + slp_tree root) +{ + slp_tree node; + unsigned i; + hash_set<slp_tree> visited; + hash_set<slp_tree> loads; + + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node) + optimize_load_redistribution_1 (&loads, bst_map, &visited, root, i, node); + + SLP_INSTANCE_LOADS (instance).truncate (0); + for (hash_set<slp_tree>::iterator it = loads.begin (); + it != loads.end (); ++it) + SLP_INSTANCE_LOADS (instance).safe_push (*it); + + return true; +} + +/* Helper function of vect_match_slp_patterns. + + Attempts to match patterns against the slp tree rooted in REF_NODE using + VINFO. Patterns are matched in post-order traversal. + + If matching is successful the value in REF_NODE is updated and returned, if + not then it is returned unchanged. */ + +static bool +vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo, + slp_tree_to_load_perm_map_t *perm_cache, + hash_set<slp_tree> *visited) +{ + unsigned i; + slp_tree node = *ref_node; + bool found_p = false, found_rec_p = false; + if (!node || visited->add (node)) + return false; + + slp_tree child; + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) + found_rec_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i], + vinfo, perm_cache, visited); + + for (unsigned x = 0; x < num__slp_patterns; x++) + { + vect_pattern *pattern = slp_patterns[x] (ref_node); + found_p = pattern->recognize (perm_cache, vinfo); + delete pattern; + found_rec_p = found_p | found_rec_p; + } + + return found_rec_p; +} + +/* Applies pattern matching to the given SLP tree rooted in REF_NODE using + vec_info VINFO. + + The modified tree is returned. Patterns are tried in order and multiple + patterns may match. */ + +static bool +vect_match_slp_patterns (slp_instance instance, vec_info *vinfo, + hash_set<slp_tree> *visited, + slp_tree_to_load_perm_map_t *perm_cache, + scalar_stmts_to_slp_tree_map_t *bst_map) +{ + DUMP_VECT_SCOPE ("vect_match_slp_patterns"); + slp_tree *ref_node = &SLP_INSTANCE_TREE (instance); + + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Analyzing SLP tree %p for patterns\n", + SLP_INSTANCE_TREE (instance)); + + bool found_p + = vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, visited); + + if (found_p) + { + optimize_load_redistribution (instance, bst_map, *ref_node); + + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, + "Pattern matched SLP tree\n"); + vect_print_slp_graph (MSG_NOTE, vect_location, *ref_node); + } + } + + return found_p; +} + +/* Analyze an SLP instance starting from a group of grouped stores. Call + vect_build_slp_tree to build a tree of packed stmts if possible. + Return FALSE if it's impossible to SLP any stmt in the loop. */ + static bool vect_analyze_slp_instance (vec_info *vinfo, scalar_stmts_to_slp_tree_map_t *bst_map, @@ -2540,6 +2761,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size) { unsigned int i; stmt_vec_info first_element; + slp_instance instance; DUMP_VECT_SCOPE ("vect_analyze_slp"); @@ -2586,6 +2808,13 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size) slp_inst_kind_reduc_group, max_tree_size); } + hash_set<slp_tree> visited_patterns; + slp_tree_to_load_perm_map_t perm_cache; + /* See if any patterns can be found in the SLP tree. */ + FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance) + vect_match_slp_patterns (instance, vinfo, &visited_patterns, &perm_cache, + bst_map); + /* The map keeps a reference on SLP nodes built, release that. */ for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin (); it != bst_map->end (); ++it) diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index fa211b95c0e54be1d51ed949d7a06c31b7b50802..3b49ce22f7aae0465dbd0b24cbf48ae054c31d22 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -27,6 +27,7 @@ typedef class _stmt_vec_info *stmt_vec_info; #include "tree-hash-traits.h" #include "target.h" #include "alloc-pool.h" +#include "internal-fn.h" /* Used for naming of new temporaries. */ @@ -1118,6 +1119,11 @@ public: indicates whether the stmt needs to be vectorized. */ enum vect_relevant relevant; + /* During SLP vectorization we may need to change the relevancy of a statement + but restore it during dissolving of SLP nodes. This field contains a copy + of the original relevancy analysis. */ + enum vect_relevant saved_relevant; + /* For loads if this is a gather, for stores if this is a scatter. */ bool gather_scatter_p; @@ -1240,6 +1246,7 @@ struct gather_scatter_info { #define STMT_VINFO_TYPE(S) (S)->type #define STMT_VINFO_STMT(S) (S)->stmt #define STMT_VINFO_RELEVANT(S) (S)->relevant +#define STMT_VINFO_SAVED_RELEVANT(S) (S)->saved_relevant #define STMT_VINFO_LIVE_P(S) (S)->live #define STMT_VINFO_VECTYPE(S) (S)->vectype #define STMT_VINFO_VEC_STMTS(S) (S)->vec_stmts @@ -1368,6 +1375,46 @@ vect_orig_stmt (stmt_vec_info stmt_info) return stmt_info; } +/* If restore the saved relevancy information STMT_INFO from the copy made + during SLP pattern detection. */ + +static inline void +vect_pop_relevancy (stmt_vec_info stmt_info) +{ + STMT_VINFO_RELEVANT (stmt_info) = STMT_VINFO_SAVED_RELEVANT (stmt_info); +} + +/* Restores the saved relevancy of STMT_INFO and marks it as not being inside a + pattern. Lastly the SLP_TYPE is set to loop_vect. */ + +static inline void +vect_dissolve_pattern_relevancy (stmt_vec_info stmt_info) +{ + vect_pop_relevancy (stmt_info); + STMT_VINFO_IN_PATTERN_P (stmt_info) = false; + STMT_SLP_TYPE (stmt_info) = loop_vect; +} + +/* Save the current relevancy of STMT_INFO such that it can be restored by + vect_pop_relevancy. */ + +static inline void +vect_save_relevancy (stmt_vec_info stmt_info) +{ + STMT_VINFO_SAVED_RELEVANT (stmt_info) + = STMT_VINFO_RELEVANT (stmt_info); +} + +/* Save the current relevancy of STMT_INFO before changing it to REL. */ + +static inline void +vect_push_relevancy (stmt_vec_info stmt_info, enum vect_relevant rel) +{ + vect_save_relevancy (stmt_info); + STMT_VINFO_RELEVANT (stmt_info) = rel; +} + + /* Return the later statement between STMT1_INFO and STMT2_INFO. */ static inline stmt_vec_info @@ -1993,6 +2040,7 @@ extern void duplicate_and_interleave (vec_info *, gimple_seq *, tree, extern int vect_get_place_in_interleaving_chain (stmt_vec_info, stmt_vec_info); extern bool vect_update_shared_vectype (stmt_vec_info, tree); extern slp_tree vect_create_new_slp_node (vec<stmt_vec_info>, unsigned); +extern void vect_free_slp_tree (slp_tree); /* In tree-vect-patterns.c. */ extern void @@ -2009,4 +2057,108 @@ void vect_free_loop_info_assumptions (class loop *); gimple *vect_loop_vectorized_call (class loop *, gcond **cond = NULL); bool vect_stmt_dominates_stmt_p (gimple *, gimple *); +/* SLP Pattern matcher types, tree-vect-slp-patterns.c. */ + +/* Forward declaration of possible two operands operation that can be matched + by the complex numbers pattern matchers. */ +enum _complex_operation : unsigned; + +/* Cache from nodes to the load permutation they represent. */ +typedef hash_map <slp_tree, load_permutation_t > + slp_tree_to_load_perm_map_t; + +/* Vector pattern matcher base class. All SLP pattern matchers must inherit + from this type. */ + +class vect_pattern +{ + protected: + /* The number of arguments that the IFN requires. */ + unsigned m_num_args; + + /* The internal function that will be used when a pattern is created. */ + internal_fn m_ifn; + + /* The current node being inspected. */ + slp_tree *m_node; + + /* The list of operands to be the children for the node produced when the + internal function is created. */ + vec<slp_tree> m_ops; + + /* Default constructor where NODE is the root of the tree to inspect. */ + vect_pattern (slp_tree *node) + { + this->m_ifn = IFN_LAST; + this->m_node = node; + this->m_ops.create (0); + } + + public: + /* Attempt to recognize a pattern, validate and update the tree rooted in + M_NODE. */ + virtual bool recognize (slp_tree_to_load_perm_map_t *, vec_info *); + + /* Only perform the pattern creation part of the matcher. This creates and + returns the new pattern statement. */ + virtual gcall *build (vec_info *) = 0; + + /* Performs a check to see if the matched IFN is supported by the current + target. */ + virtual bool is_optab_supported_p (tree vectype, optimization_type opt_type) + { + if (!vectype) + return false; + + return direct_internal_fn_supported_p (this->m_ifn, vectype, opt_type); + } + + /* Create a new instance of the pattern matcher class of the given type. */ + static vect_pattern* create (slp_tree *); + + /* Match but do not perform any additional operations on the SLP tree. */ + virtual bool matches (slp_tree_to_load_perm_map_t *) = 0; + + /* Match but use for the first operation the supplied COMPLEX_OPERATION. No + additional operations or modification of the SLP tree are performed. */ + virtual bool matches (enum _complex_operation, + slp_tree_to_load_perm_map_t *, vec<slp_tree>) + { + return false; + } + + /* Friendly name of the operation the pattern matches. */ + virtual const char* get_name () = 0; + + /* Default destructor. */ + virtual ~vect_pattern () + { + this->m_ops.release (); + } + + /* Check to see if the matched tree is valid for the operation the matcher + wants. If the operation is valid then the tree is reshaped in the final + format that build () requires. */ + virtual bool validate_p (slp_tree_to_load_perm_map_t *) + { + return true; + } + + /* Return the matched internal function. If no match was done this is set + to LAST_IFN. */ + virtual internal_fn get_ifn () + { + return this->m_ifn; + } +}; + +/* Function pointer to create a new pattern matcher from a generic type. */ +typedef vect_pattern* (*vect_pattern_decl_t) (slp_tree *); + +/* List of supported pattern matchers. */ +extern vect_pattern_decl_t slp_patterns[]; + +/* Number of supported pattern matchers. */ +extern size_t num__slp_patterns; + #endif /* GCC_TREE_VECTORIZER_H */ diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c index d81774b242569262a51b7be02815acd6d1a6bfd0..2a6ddd685922f6b60ae1305974335fb863a2af39 100644 --- a/gcc/tree-vectorizer.c +++ b/gcc/tree-vectorizer.c @@ -535,6 +535,8 @@ vec_info::add_pattern_stmt (gimple *stmt, stmt_vec_info stmt_info) stmt_vec_info res = new_stmt_vec_info (stmt); set_vinfo_for_stmt (stmt, res, false); STMT_VINFO_RELATED_STMT (res) = stmt_info; + vect_save_relevancy (stmt_info); + vect_push_relevancy (res, STMT_VINFO_RELEVANT (stmt_info)); return res; }