Message ID | 20240527062949.1942207-1-pan2.li@intel.com |
---|---|
State | New |
Headers | show |
Series | [v3] Match: Support more form for scalar unsigned SAT_ADD | expand |
On Mon, May 27, 2024 at 8:29 AM <pan2.li@intel.com> wrote: > > From: Pan Li <pan2.li@intel.com> > > After we support one gassign form of the unsigned .SAT_ADD, we > would like to support more forms including both the branch and > branchless. There are 5 other forms of .SAT_ADD, list as below: > > Form 1: > #define SAT_ADD_U_1(T) \ > T sat_add_u_1_##T(T x, T y) \ > { \ > return (T)(x + y) >= x ? (x + y) : -1; \ > } > > Form 2: > #define SAT_ADD_U_2(T) \ > T sat_add_u_2_##T(T x, T y) \ > { \ > T ret; \ > T overflow = __builtin_add_overflow (x, y, &ret); \ > return (T)(-overflow) | ret; \ > } > > Form 3: > #define SAT_ADD_U_3(T) \ > T sat_add_u_3_##T (T x, T y) \ > { \ > T ret; \ > return __builtin_add_overflow (x, y, &ret) ? -1 : ret; \ > } > > Form 4: > #define SAT_ADD_U_4(T) \ > T sat_add_u_4_##T (T x, T y) \ > { \ > T ret; \ > return __builtin_add_overflow (x, y, &ret) == 0 ? ret : -1; \ > } > > Form 5: > #define SAT_ADD_U_5(T) \ > T sat_add_u_5_##T(T x, T y) \ > { \ > return (T)(x + y) < x ? -1 : (x + y); \ > } > > Take the forms 3 of above as example: > > uint64_t > sat_add (uint64_t x, uint64_t y) > { > uint64_t ret; > return __builtin_add_overflow (x, y, &ret) ? -1 : ret; > } > > Before this patch: > uint64_t sat_add (uint64_t x, uint64_t y) > { > long unsigned int _1; > long unsigned int _2; > uint64_t _3; > __complex__ long unsigned int _6; > > ;; basic block 2, loop depth 0 > ;; pred: ENTRY > _6 = .ADD_OVERFLOW (x_4(D), y_5(D)); > _2 = IMAGPART_EXPR <_6>; > if (_2 != 0) > goto <bb 4>; [35.00%] > else > goto <bb 3>; [65.00%] > ;; succ: 4 > ;; 3 > > ;; basic block 3, loop depth 0 > ;; pred: 2 > _1 = REALPART_EXPR <_6>; > ;; succ: 4 > > ;; basic block 4, loop depth 0 > ;; pred: 3 > ;; 2 > # _3 = PHI <_1(3), 18446744073709551615(2)> > return _3; > ;; succ: EXIT > } > > After this patch: > uint64_t sat_add (uint64_t x, uint64_t y) > { > long unsigned int _12; > > ;; basic block 2, loop depth 0 > ;; pred: ENTRY > _12 = .SAT_ADD (x_4(D), y_5(D)); [tail call] > return _12; > ;; succ: EXIT > } > > The below test suites are still running, will update it later. > * The x86 bootstrap test. > * The x86 fully regression test. > * The riscv fully regression test. > > gcc/ChangeLog: > > * genmatch.cc (dt_node::gen_kids): Add new arg of predicate id. > (allow_phi_predicate_p): New func impl to check the phi > predicate is allowed or not. > (dt_node::gen_kids_1): Add COND_EXPR gen for phi node if allowed. > (dt_operand::gen_phi_on_cond): > (write_predicate): Init the predicate id before gen_kids. > * match.pd: Add more forms of unsigned_integer_sat_add and > comments. > * tree-ssa-math-opts.cc (match_saturation_arith): Rename from. > (match_assign_saturation_arith): Rename to. > (match_phi_saturation_arith): New func impl to match phi. > (math_opts_dom_walker::after_dom_children): Add phi match for > echo bb. > > Signed-off-by: Pan Li <pan2.li@intel.com> > --- > gcc/genmatch.cc | 123 ++++++++++++++++++++++++++++++++++++-- > gcc/match.pd | 43 ++++++++++++- > gcc/tree-ssa-math-opts.cc | 51 +++++++++++++++- > 3 files changed, 210 insertions(+), 7 deletions(-) > > diff --git a/gcc/genmatch.cc b/gcc/genmatch.cc > index f1e0e7abe0c..816d2dafd23 100644 > --- a/gcc/genmatch.cc > +++ b/gcc/genmatch.cc > @@ -1767,6 +1767,7 @@ public: > unsigned level; > dt_node *parent; > vec<dt_node *> kids; > + const char *id; > > /* Statistics. */ > unsigned num_leafs; > @@ -1786,7 +1787,7 @@ public: > virtual void gen (FILE *, int, bool, int) {} > > void gen_kids (FILE *, int, bool, int); > - void gen_kids_1 (FILE *, int, bool, int, > + void gen_kids_1 (FILE *, const char *, int, bool, int, > const vec<dt_operand *> &, const vec<dt_operand *> &, > const vec<dt_operand *> &, const vec<dt_operand *> &, > const vec<dt_operand *> &, const vec<dt_node *> &); > @@ -1819,6 +1820,7 @@ public: > > char *get_name (char *); > void gen_opname (char *, unsigned); > + void gen_phi_on_cond (FILE *, int, bool, int); > }; > > /* Leaf node of the decision tree, used for DT_SIMPLIFY. */ > @@ -3173,7 +3175,7 @@ dt_node::gen_kids (FILE *f, int indent, bool gimple, int depth) > for what we have collected sofar. */ > fns.qsort (fns_cmp); > generic_fns.qsort (fns_cmp); > - gen_kids_1 (f, indent, gimple, depth, gimple_exprs, generic_exprs, > + gen_kids_1 (f, id, indent, gimple, depth, gimple_exprs, generic_exprs, > fns, generic_fns, preds, others); > /* And output the true operand itself. */ > kids[i]->gen (f, indent, gimple, depth); > @@ -3191,14 +3193,21 @@ dt_node::gen_kids (FILE *f, int indent, bool gimple, int depth) > /* Generate code for the remains. */ > fns.qsort (fns_cmp); > generic_fns.qsort (fns_cmp); > - gen_kids_1 (f, indent, gimple, depth, gimple_exprs, generic_exprs, > + gen_kids_1 (f, id, indent, gimple, depth, gimple_exprs, generic_exprs, > fns, generic_fns, preds, others); > } > > +static bool > +allow_phi_predicate_p (const char *id) > +{ > + return id && strcmp (id, "unsigned_integer_sat_add") == 0; Of course that's not OK. I suggest to either check ... > +} > + > /* Generate matching code for the children of the decision tree node. */ > > void > -dt_node::gen_kids_1 (FILE *f, int indent, bool gimple, int depth, > +dt_node::gen_kids_1 (FILE *f, const char *id, int indent, bool gimple, > + int depth, > const vec<dt_operand *> &gimple_exprs, > const vec<dt_operand *> &generic_exprs, > const vec<dt_operand *> &fns, > @@ -3251,6 +3260,7 @@ dt_node::gen_kids_1 (FILE *f, int indent, bool gimple, int depth, > depth); > indent += 4; > fprintf_indent (f, indent, "{\n"); > + bool cond_expr_p = false; > for (unsigned i = 0; i < exprs_len; ++i) > { > expr *e = as_a <expr *> (gimple_exprs[i]->op); > @@ -3262,6 +3272,7 @@ dt_node::gen_kids_1 (FILE *f, int indent, bool gimple, int depth, > else > { > id_base *op = e->operation; > + cond_expr_p |= *op == COND_EXPR; > if (*op == CONVERT_EXPR || *op == NOP_EXPR) > fprintf_indent (f, indent, "CASE_CONVERT:\n"); > else > @@ -3275,6 +3286,27 @@ dt_node::gen_kids_1 (FILE *f, int indent, bool gimple, int depth, > fprintf_indent (f, indent, "default:;\n"); > fprintf_indent (f, indent, "}\n"); > indent -= 4; > + > + if (cond_expr_p && allow_phi_predicate_p (id)) ... this->parent == nullptr restricting PHI handling to the outermost COND_EXPR, which is probably a good idea (but there's no way to see we're in a match or in a simplify). Or even better, either mark the expression node with a new flag (make sure to not merge cond with/without the flag during decision tree insert), add a new operation code like (cond_or_phi ...) and either handle or lower it, with lowering introducing yet another operation, (phi ..). From all of the above simply doing this->parent == nullptr and adding a global flag "writing_predicate" to make sure to only cover (match ...) is the simplest. I somewhat dislike adding fake operation codes so a flag, like '^' (cond^ ...), would be my second choice where I'd limit that to (match and the outermost cond within that unless we see need for more cases. There are a few existing (match (cond ..)) patterns so the flag looks like the safest option - not all users might expect PHIs. > + { > + fprintf_indent (f, indent, > + "else if (gphi *_a%d = dyn_cast <gphi *> (_d%d))\n", > + depth, depth); > + indent += 2; > + fprintf_indent (f, indent, "{\n"); > + indent += 2; > + > + for (unsigned i = 0; i < exprs_len; i++) > + { > + expr *e = as_a <expr *> (gimple_exprs[i]->op); > + if (*e->operation == COND_EXPR) > + gimple_exprs[i]->gen_phi_on_cond (f, indent, true, depth); > + } > + > + indent -= 2; > + fprintf_indent (f, indent, "}\n"); > + indent -= 2; > + } > } > > if (fns_len) > @@ -3483,6 +3515,87 @@ dt_operand::gen (FILE *f, int indent, bool gimple, int depth) > } > } > > +/* Generate matching code for the phi when meet COND_EXPR. */ > + > +void > +dt_operand::gen_phi_on_cond (FILE *f, int indent, bool gimple, int depth) Note this is implicitly 'gimple', no need to pass that flag. > +{ > + fprintf_indent (f, indent, > + "basic_block _b%d = gimple_bb (_a%d);\n", depth, depth); > + > + fprintf_indent (f, indent, > + "if (EDGE_COUNT (_b%d->preds) == 2 && gimple_phi_num_args (_a%d) == 2)\n", > + depth, depth); Either is redundant - the number of preds always matches the number of PHI args. > + > + indent += 2; > + fprintf_indent (f, indent, "{\n"); > + indent += 2; > + > + fprintf_indent (f, indent, > + "basic_block _pb_0_%d = EDGE_PRED (_b%d, 0)->src;\n", depth, depth); > + fprintf_indent (f, indent, > + "basic_block _pb_1_%d = EDGE_PRED (_b%d, 1)->src;\n", depth, depth); > + fprintf_indent (f, indent, > + "basic_block _db_%d = safe_dyn_cast <gcond *> (*gsi_last_bb (_pb_0_%d)) ? " > + "_pb_0_%d : _pb_1_%d;\n", depth, depth, depth, depth); > + fprintf_indent (f, indent, > + "basic_block _other_db_%d = safe_dyn_cast <gcond *> " > + "(*gsi_last_bb (_pb_0_%d)) ? _pb_1_%d : _pb_0_%d;\n", > + depth, depth, depth, depth); > + > + fprintf_indent (f, indent, > + "gcond *_ct_%d = safe_dyn_cast <gcond *> (*gsi_last_bb (_db_%d));\n ", > + depth, depth); > + fprintf_indent (f, indent, "if (_ct_%d " > + "&& EDGE_COUNT (_other_db_%d->preds) == 1 " > + "&& EDGE_COUNT (_other_db_%d->succs) == 1 " > + "&& EDGE_SUCC (_other_db_%d, 0)->dest == _b%d)\n", > + depth, depth, depth, depth, depth); > + > + indent += 2; > + fprintf_indent (f, indent, "{\n"); > + indent += 2; I'm quoting generated code here: else if (gphi *_a1 = dyn_cast <gphi *> (_d1)) { basic_block _b1 = gimple_bb (_a1); if (EDGE_COUNT (_b1->preds) == 2 && gimple_phi_num_args (_a1) == 2) { basic_block _pb_0_1 = EDGE_PRED (_b1, 0)->src; basic_block _pb_1_1 = EDGE_PRED (_b1, 1)->src; basic_block _db_1 = safe_dyn_cast <gcond *> (*gsi_last_bb (_pb_0_1)) ? _pb_0_1 : _pb_1_1; basic_block _other_db_1 = safe_dyn_cast <gcond *> (*gsi_last_bb (_pb_0_1)) ? _pb_1_1 : _pb_0_1; _other_db_1 = _db_1 == _pb_0_1 : _pb_1_1 : _pb_0_1; gcond *_ct_1 = safe_dyn_cast <gcond *> (*gsi_last_bb (_db_1)); if (_ct_1 && EDGE_COUNT (_other_db_1->preds) == 1 && EDGE_COUNT (_other_db_1->succs) == 1 && EDGE_SUCC (_other_db_1, 0)->dest == _b1) new lines with indenting && EDGE_COUNT checks please. I think EDGE_SUCC (_other_db_1, 0)->dest == _b1 is pointless. Instead you want to check EDGE_PRED (_other_db_1, 0)->src == _db_1? { tree _cond_lhs_1 = gimple_cond_lhs (_ct_1); tree _cond_rhs_1 = gimple_cond_rhs (_ct_1); tree _p0 = build2 (gimple_cond_code (_ct_1), boolean_type_node, _cond_lhs_1, _cond_rhs_1); bool _arg_0_is_true_1 = gimple_phi_arg_edge (_a1, 0)->flags & EDGE_TRUE_VALUE; Hmm, you don't know this PHI arg edge comes from the cond block, it could come from the middle-bb. Otherwise I think this will work, of course it depends on us handling GENERIC conditions in COND_EXPRs. Not building the _p0 GENERIC tree would be nice but that's quite difficult to pull off here (impossible even). Overall I like it. Can you please try making the '^' flag work instead (also document it in doc/match-and-simplify.texi)? Thanks for working on this. Richard. tree _p1 = gimple_phi_arg_def (_a1, _arg_0_is_true_1 ? 0 : 1); tree _p2 = gimple_phi_arg_def (_a1, _arg_0_is_true_1 ? 1 : 0); > + fprintf_indent (f, indent, > + "tree _cond_lhs_%d = gimple_cond_lhs (_ct_%d);\n", depth, depth); > + fprintf_indent (f, indent, > + "tree _cond_rhs_%d = gimple_cond_rhs (_ct_%d);\n", depth, depth); > + > + char opname_0[20]; > + char opname_1[20]; > + char opname_2[20]; > + gen_opname (opname_0, 0); > + > + fprintf_indent (f, indent, > + "tree %s = build2 (gimple_cond_code (_ct_%d), " > + "boolean_type_node, _cond_lhs_%d, _cond_rhs_%d);\n", > + opname_0, depth, depth, depth); > + > + fprintf_indent (f, indent, > + "bool _arg_0_is_true_%d = gimple_phi_arg_edge (_a%d, 0)->flags " > + " & EDGE_TRUE_VALUE;\n", depth, depth); > + > + gen_opname (opname_1, 1); > + fprintf_indent (f, indent, > + "tree %s = gimple_phi_arg_def (_a%d, _arg_0_is_true_%d ? 0 : 1);\n", > + opname_1, depth, depth); > + > + gen_opname (opname_2, 2); > + fprintf_indent (f, indent, > + "tree %s = gimple_phi_arg_def (_a%d, _arg_0_is_true_%d ? 1 : 0);\n", > + opname_2, depth, depth); > + > + gen_kids (f, indent, gimple, depth); > + > + indent -= 2; > + fprintf_indent (f, indent, "}\n"); > + indent -= 2; > + > + indent -= 2; > + fprintf_indent (f, indent, "}\n"); > + indent -= 2; > +} > + > /* Emit a logging call to the debug file to the file F, with the INDENT from > either the RESULT location or the S's match location if RESULT is null. */ > static void > @@ -4263,6 +4376,8 @@ write_predicate (FILE *f, predicate_id *p, decision_tree &dt, bool gimple) > > if (!gimple) > fprintf_indent (f, 2, "if (TREE_SIDE_EFFECTS (t)) return false;\n"); > + > + dt.root->id = p->id; > dt.root->gen_kids (f, 2, gimple, 0); > > fprintf_indent (f, 2, "return false;\n" > diff --git a/gcc/match.pd b/gcc/match.pd > index 024e3350465..ddbdb7f7638 100644 > --- a/gcc/match.pd > +++ b/gcc/match.pd > @@ -3046,31 +3046,48 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > && wi::eq_p (wi::to_wide (int_cst), wi::max_value (itype)))))) > > /* Unsigned Saturation Add */ > +/* SAT_ADD = usadd_left_part_1 | usadd_right_part_1, aka: > + SAT_ADD = (X + Y) | -((X + Y) < X) */ > (match (usadd_left_part_1 @0 @1) > (plus:c @0 @1) > (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type) > && types_match (type, @0, @1)))) > > +/* SAT_ADD = usadd_left_part_2 | usadd_right_part_2, aka: > + SAT_ADD = REALPART_EXPR <.ADD_OVERFLOW> | (IMAGPART_EXPR <.ADD_OVERFLOW> != 0) */ > (match (usadd_left_part_2 @0 @1) > (realpart (IFN_ADD_OVERFLOW:c @0 @1)) > (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type) > && types_match (type, @0, @1)))) > > +/* SAT_ADD = usadd_left_part_1 | usadd_right_part_1, aka: > + SAT_ADD = (X + Y) | -((type)(X + Y) < X) */ > (match (usadd_right_part_1 @0 @1) > (negate (convert (lt (plus:c @0 @1) @0))) > (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type) > && types_match (type, @0, @1)))) > > +/* SAT_ADD = usadd_left_part_1 | usadd_right_part_1, aka: > + SAT_ADD = (X + Y) | -(X > (X + Y)) */ > (match (usadd_right_part_1 @0 @1) > (negate (convert (gt @0 (plus:c @0 @1)))) > (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type) > && types_match (type, @0, @1)))) > > +/* SAT_ADD = usadd_left_part_2 | usadd_right_part_2, aka: > + SAT_ADD = REALPART_EXPR <.ADD_OVERFLOW> | (IMAGPART_EXPR <.ADD_OVERFLOW> != 0) */ > (match (usadd_right_part_2 @0 @1) > (negate (convert (ne (imagpart (IFN_ADD_OVERFLOW:c @0 @1)) integer_zerop))) > (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type) > && types_match (type, @0, @1)))) > > +/* SAT_ADD = usadd_left_part_2 | usadd_right_part_2, aka: > + SAT_ADD = REALPART_EXPR <.ADD_OVERFLOW> | -IMAGPART_EXPR <.ADD_OVERFLOW> */ > +(match (usadd_right_part_2 @0 @1) > + (negate (imagpart (IFN_ADD_OVERFLOW:c @0 @1))) > + (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type) > + && types_match (type, @0, @1)))) > + > /* We cannot merge or overload usadd_left_part_1 and usadd_left_part_2 > because the sub part of left_part_2 cannot work with right_part_1. > For example, left_part_2 pattern focus one .ADD_OVERFLOW but the > @@ -3082,10 +3099,34 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > (match (unsigned_integer_sat_add @0 @1) > (bit_ior:c (usadd_left_part_1 @0 @1) (usadd_right_part_1 @0 @1))) > > -/* Unsigned saturation add, case 2 (branchless with .ADD_OVERFLOW). */ > +/* Unsigned saturation add, case 2 (branchless with .ADD_OVERFLOW): > + SAT_ADD = REALPART_EXPR <.ADD_OVERFLOW> | -IMAGPART_EXPR <.ADD_OVERFLOW> or > + SAT_ADD = REALPART_EXPR <.ADD_OVERFLOW> | (IMAGPART_EXPR <.ADD_OVERFLOW> != 0) */ > (match (unsigned_integer_sat_add @0 @1) > (bit_ior:c (usadd_left_part_2 @0 @1) (usadd_right_part_2 @0 @1))) > > +/* Unsigned saturation add, case 3 (branch with ge): > + SAT_U_ADD = (X + Y) >= x ? (X + Y) : -1. */ > +(match (unsigned_integer_sat_add @0 @1) > + (cond (ge (usadd_left_part_1@2 @0 @1) @0) @2 integer_minus_onep)) > + > +/* Unsigned saturation add, case 4 (branch with lt): > + SAT_U_ADD = (X + Y) < x ? -1 : (X + Y). */ > +(match (unsigned_integer_sat_add @0 @1) > + (cond (lt (usadd_left_part_1@2 @0 @1) @0) integer_minus_onep @2)) > + > +/* Unsigned saturation add, case 5 (branch with eq .ADD_OVERFLOW): > + SAT_U_ADD = REALPART_EXPR <.ADD_OVERFLOW> == 0 ? .ADD_OVERFLOW : -1. */ > +(match (unsigned_integer_sat_add @0 @1) > + (cond (eq (imagpart (IFN_ADD_OVERFLOW:c @0 @1)) integer_zerop) > + (usadd_left_part_2 @0 @1) integer_minus_onep)) > + > +/* Unsigned saturation add, case 6 (branch with ne .ADD_OVERFLOW): > + SAT_U_ADD = REALPART_EXPR <.ADD_OVERFLOW> != 0 ? -1 : .ADD_OVERFLOW. */ > +(match (unsigned_integer_sat_add @0 @1) > + (cond (ne (imagpart (IFN_ADD_OVERFLOW:c @0 @1)) integer_zerop) > + integer_minus_onep (usadd_left_part_2 @0 @1))) > + > /* x > y && x != XXX_MIN --> x > y > x > y && x == XXX_MIN --> false . */ > (for eqne (eq ne) > diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc > index 62da1c5ee08..8624d414347 100644 > --- a/gcc/tree-ssa-math-opts.cc > +++ b/gcc/tree-ssa-math-opts.cc > @@ -4099,7 +4099,7 @@ extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree)); > * => > * _12 = .SAT_ADD (_4, _6); */ > static void > -match_saturation_arith (gimple_stmt_iterator *gsi, gassign *stmt) > +match_assign_saturation_arith (gimple_stmt_iterator *gsi, gassign *stmt) > { > gcall *call = NULL; > > @@ -4116,6 +4116,47 @@ match_saturation_arith (gimple_stmt_iterator *gsi, gassign *stmt) > } > } > > +/* > + * Try to match saturation arith pattern(s). > + * From: > + * <bb 2> : > + * _1 = x_3(D) + y_4(D); > + * if (_1 >= x_3(D)) > + * goto <bb 3>; [INV] > + * else > + * goto <bb 4>; [INV] > + * > + * <bb 3> : > + * > + * <bb 4> : > + * # _2 = PHI <255(2), _1(3)> > + * > + * To: > + * <bb 4> [local count: 1073741824]: > + * _2 = .SAT_ADD (x_4(D), y_5(D)); */ > + > +static void > +match_phi_saturation_arith (gimple_stmt_iterator *gsi, gphi *phi) > +{ > + if (gimple_phi_num_args (phi) != 2) > + return; > + > + tree ops[2]; > + tree phi_result = gimple_phi_result (phi); > + > + if (gimple_unsigned_integer_sat_add (phi_result, ops, NULL) > + && direct_internal_fn_supported_p (IFN_SAT_ADD, TREE_TYPE (phi_result), > + OPTIMIZE_FOR_BOTH)) > + { > + gcall *call = gimple_build_call_internal (IFN_SAT_ADD, 2, ops[0], ops[1]); > + gimple_call_set_lhs (call, phi_result); > + gsi_insert_before (gsi, call, GSI_SAME_STMT); > + > + gimple_stmt_iterator psi = gsi_for_stmt (phi); > + remove_phi_node (&psi, false); > + } > +} > + > /* Recognize for unsigned x > x = y - z; > if (x > y) > @@ -6029,6 +6070,12 @@ math_opts_dom_walker::after_dom_children (basic_block bb) > > fma_deferring_state fma_state (param_avoid_fma_max_bits > 0); > > + for (gphi_iterator psi = gsi_start_phis (bb); !gsi_end_p (psi); gsi_next (&psi)) > + { > + gimple_stmt_iterator gsi = gsi_last_bb (bb); > + match_phi_saturation_arith (&gsi, psi.phi ()); > + } > + > for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);) > { > gimple *stmt = gsi_stmt (gsi); > @@ -6078,7 +6125,7 @@ math_opts_dom_walker::after_dom_children (basic_block bb) > break; > > case BIT_IOR_EXPR: > - match_saturation_arith (&gsi, as_a<gassign *> (stmt)); > + match_assign_saturation_arith (&gsi, as_a<gassign *> (stmt)); > /* fall-through */ > case BIT_XOR_EXPR: > match_uaddc_usubc (&gsi, stmt, code); > -- > 2.34.1 >
Thanks Richard for suggestion and review. Did some tricky/ugly restrictions v3 for the phi gen as there are sorts of (cond in match.pd, will have a try with your proposal in v4. Thanks again for help. Pan -----Original Message----- From: Richard Biener <richard.guenther@gmail.com> Sent: Wednesday, May 29, 2024 8:36 PM To: Li, Pan2 <pan2.li@intel.com> Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; tamar.christina@arm.com Subject: Re: [PATCH v3] Match: Support more form for scalar unsigned SAT_ADD On Mon, May 27, 2024 at 8:29 AM <pan2.li@intel.com> wrote: > > From: Pan Li <pan2.li@intel.com> > > After we support one gassign form of the unsigned .SAT_ADD, we > would like to support more forms including both the branch and > branchless. There are 5 other forms of .SAT_ADD, list as below: > > Form 1: > #define SAT_ADD_U_1(T) \ > T sat_add_u_1_##T(T x, T y) \ > { \ > return (T)(x + y) >= x ? (x + y) : -1; \ > } > > Form 2: > #define SAT_ADD_U_2(T) \ > T sat_add_u_2_##T(T x, T y) \ > { \ > T ret; \ > T overflow = __builtin_add_overflow (x, y, &ret); \ > return (T)(-overflow) | ret; \ > } > > Form 3: > #define SAT_ADD_U_3(T) \ > T sat_add_u_3_##T (T x, T y) \ > { \ > T ret; \ > return __builtin_add_overflow (x, y, &ret) ? -1 : ret; \ > } > > Form 4: > #define SAT_ADD_U_4(T) \ > T sat_add_u_4_##T (T x, T y) \ > { \ > T ret; \ > return __builtin_add_overflow (x, y, &ret) == 0 ? ret : -1; \ > } > > Form 5: > #define SAT_ADD_U_5(T) \ > T sat_add_u_5_##T(T x, T y) \ > { \ > return (T)(x + y) < x ? -1 : (x + y); \ > } > > Take the forms 3 of above as example: > > uint64_t > sat_add (uint64_t x, uint64_t y) > { > uint64_t ret; > return __builtin_add_overflow (x, y, &ret) ? -1 : ret; > } > > Before this patch: > uint64_t sat_add (uint64_t x, uint64_t y) > { > long unsigned int _1; > long unsigned int _2; > uint64_t _3; > __complex__ long unsigned int _6; > > ;; basic block 2, loop depth 0 > ;; pred: ENTRY > _6 = .ADD_OVERFLOW (x_4(D), y_5(D)); > _2 = IMAGPART_EXPR <_6>; > if (_2 != 0) > goto <bb 4>; [35.00%] > else > goto <bb 3>; [65.00%] > ;; succ: 4 > ;; 3 > > ;; basic block 3, loop depth 0 > ;; pred: 2 > _1 = REALPART_EXPR <_6>; > ;; succ: 4 > > ;; basic block 4, loop depth 0 > ;; pred: 3 > ;; 2 > # _3 = PHI <_1(3), 18446744073709551615(2)> > return _3; > ;; succ: EXIT > } > > After this patch: > uint64_t sat_add (uint64_t x, uint64_t y) > { > long unsigned int _12; > > ;; basic block 2, loop depth 0 > ;; pred: ENTRY > _12 = .SAT_ADD (x_4(D), y_5(D)); [tail call] > return _12; > ;; succ: EXIT > } > > The below test suites are still running, will update it later. > * The x86 bootstrap test. > * The x86 fully regression test. > * The riscv fully regression test. > > gcc/ChangeLog: > > * genmatch.cc (dt_node::gen_kids): Add new arg of predicate id. > (allow_phi_predicate_p): New func impl to check the phi > predicate is allowed or not. > (dt_node::gen_kids_1): Add COND_EXPR gen for phi node if allowed. > (dt_operand::gen_phi_on_cond): > (write_predicate): Init the predicate id before gen_kids. > * match.pd: Add more forms of unsigned_integer_sat_add and > comments. > * tree-ssa-math-opts.cc (match_saturation_arith): Rename from. > (match_assign_saturation_arith): Rename to. > (match_phi_saturation_arith): New func impl to match phi. > (math_opts_dom_walker::after_dom_children): Add phi match for > echo bb. > > Signed-off-by: Pan Li <pan2.li@intel.com> > --- > gcc/genmatch.cc | 123 ++++++++++++++++++++++++++++++++++++-- > gcc/match.pd | 43 ++++++++++++- > gcc/tree-ssa-math-opts.cc | 51 +++++++++++++++- > 3 files changed, 210 insertions(+), 7 deletions(-) > > diff --git a/gcc/genmatch.cc b/gcc/genmatch.cc > index f1e0e7abe0c..816d2dafd23 100644 > --- a/gcc/genmatch.cc > +++ b/gcc/genmatch.cc > @@ -1767,6 +1767,7 @@ public: > unsigned level; > dt_node *parent; > vec<dt_node *> kids; > + const char *id; > > /* Statistics. */ > unsigned num_leafs; > @@ -1786,7 +1787,7 @@ public: > virtual void gen (FILE *, int, bool, int) {} > > void gen_kids (FILE *, int, bool, int); > - void gen_kids_1 (FILE *, int, bool, int, > + void gen_kids_1 (FILE *, const char *, int, bool, int, > const vec<dt_operand *> &, const vec<dt_operand *> &, > const vec<dt_operand *> &, const vec<dt_operand *> &, > const vec<dt_operand *> &, const vec<dt_node *> &); > @@ -1819,6 +1820,7 @@ public: > > char *get_name (char *); > void gen_opname (char *, unsigned); > + void gen_phi_on_cond (FILE *, int, bool, int); > }; > > /* Leaf node of the decision tree, used for DT_SIMPLIFY. */ > @@ -3173,7 +3175,7 @@ dt_node::gen_kids (FILE *f, int indent, bool gimple, int depth) > for what we have collected sofar. */ > fns.qsort (fns_cmp); > generic_fns.qsort (fns_cmp); > - gen_kids_1 (f, indent, gimple, depth, gimple_exprs, generic_exprs, > + gen_kids_1 (f, id, indent, gimple, depth, gimple_exprs, generic_exprs, > fns, generic_fns, preds, others); > /* And output the true operand itself. */ > kids[i]->gen (f, indent, gimple, depth); > @@ -3191,14 +3193,21 @@ dt_node::gen_kids (FILE *f, int indent, bool gimple, int depth) > /* Generate code for the remains. */ > fns.qsort (fns_cmp); > generic_fns.qsort (fns_cmp); > - gen_kids_1 (f, indent, gimple, depth, gimple_exprs, generic_exprs, > + gen_kids_1 (f, id, indent, gimple, depth, gimple_exprs, generic_exprs, > fns, generic_fns, preds, others); > } > > +static bool > +allow_phi_predicate_p (const char *id) > +{ > + return id && strcmp (id, "unsigned_integer_sat_add") == 0; Of course that's not OK. I suggest to either check ... > +} > + > /* Generate matching code for the children of the decision tree node. */ > > void > -dt_node::gen_kids_1 (FILE *f, int indent, bool gimple, int depth, > +dt_node::gen_kids_1 (FILE *f, const char *id, int indent, bool gimple, > + int depth, > const vec<dt_operand *> &gimple_exprs, > const vec<dt_operand *> &generic_exprs, > const vec<dt_operand *> &fns, > @@ -3251,6 +3260,7 @@ dt_node::gen_kids_1 (FILE *f, int indent, bool gimple, int depth, > depth); > indent += 4; > fprintf_indent (f, indent, "{\n"); > + bool cond_expr_p = false; > for (unsigned i = 0; i < exprs_len; ++i) > { > expr *e = as_a <expr *> (gimple_exprs[i]->op); > @@ -3262,6 +3272,7 @@ dt_node::gen_kids_1 (FILE *f, int indent, bool gimple, int depth, > else > { > id_base *op = e->operation; > + cond_expr_p |= *op == COND_EXPR; > if (*op == CONVERT_EXPR || *op == NOP_EXPR) > fprintf_indent (f, indent, "CASE_CONVERT:\n"); > else > @@ -3275,6 +3286,27 @@ dt_node::gen_kids_1 (FILE *f, int indent, bool gimple, int depth, > fprintf_indent (f, indent, "default:;\n"); > fprintf_indent (f, indent, "}\n"); > indent -= 4; > + > + if (cond_expr_p && allow_phi_predicate_p (id)) ... this->parent == nullptr restricting PHI handling to the outermost COND_EXPR, which is probably a good idea (but there's no way to see we're in a match or in a simplify). Or even better, either mark the expression node with a new flag (make sure to not merge cond with/without the flag during decision tree insert), add a new operation code like (cond_or_phi ...) and either handle or lower it, with lowering introducing yet another operation, (phi ..). From all of the above simply doing this->parent == nullptr and adding a global flag "writing_predicate" to make sure to only cover (match ...) is the simplest. I somewhat dislike adding fake operation codes so a flag, like '^' (cond^ ...), would be my second choice where I'd limit that to (match and the outermost cond within that unless we see need for more cases. There are a few existing (match (cond ..)) patterns so the flag looks like the safest option - not all users might expect PHIs. > + { > + fprintf_indent (f, indent, > + "else if (gphi *_a%d = dyn_cast <gphi *> (_d%d))\n", > + depth, depth); > + indent += 2; > + fprintf_indent (f, indent, "{\n"); > + indent += 2; > + > + for (unsigned i = 0; i < exprs_len; i++) > + { > + expr *e = as_a <expr *> (gimple_exprs[i]->op); > + if (*e->operation == COND_EXPR) > + gimple_exprs[i]->gen_phi_on_cond (f, indent, true, depth); > + } > + > + indent -= 2; > + fprintf_indent (f, indent, "}\n"); > + indent -= 2; > + } > } > > if (fns_len) > @@ -3483,6 +3515,87 @@ dt_operand::gen (FILE *f, int indent, bool gimple, int depth) > } > } > > +/* Generate matching code for the phi when meet COND_EXPR. */ > + > +void > +dt_operand::gen_phi_on_cond (FILE *f, int indent, bool gimple, int depth) Note this is implicitly 'gimple', no need to pass that flag. > +{ > + fprintf_indent (f, indent, > + "basic_block _b%d = gimple_bb (_a%d);\n", depth, depth); > + > + fprintf_indent (f, indent, > + "if (EDGE_COUNT (_b%d->preds) == 2 && gimple_phi_num_args (_a%d) == 2)\n", > + depth, depth); Either is redundant - the number of preds always matches the number of PHI args. > + > + indent += 2; > + fprintf_indent (f, indent, "{\n"); > + indent += 2; > + > + fprintf_indent (f, indent, > + "basic_block _pb_0_%d = EDGE_PRED (_b%d, 0)->src;\n", depth, depth); > + fprintf_indent (f, indent, > + "basic_block _pb_1_%d = EDGE_PRED (_b%d, 1)->src;\n", depth, depth); > + fprintf_indent (f, indent, > + "basic_block _db_%d = safe_dyn_cast <gcond *> (*gsi_last_bb (_pb_0_%d)) ? " > + "_pb_0_%d : _pb_1_%d;\n", depth, depth, depth, depth); > + fprintf_indent (f, indent, > + "basic_block _other_db_%d = safe_dyn_cast <gcond *> " > + "(*gsi_last_bb (_pb_0_%d)) ? _pb_1_%d : _pb_0_%d;\n", > + depth, depth, depth, depth); > + > + fprintf_indent (f, indent, > + "gcond *_ct_%d = safe_dyn_cast <gcond *> (*gsi_last_bb (_db_%d));\n ", > + depth, depth); > + fprintf_indent (f, indent, "if (_ct_%d " > + "&& EDGE_COUNT (_other_db_%d->preds) == 1 " > + "&& EDGE_COUNT (_other_db_%d->succs) == 1 " > + "&& EDGE_SUCC (_other_db_%d, 0)->dest == _b%d)\n", > + depth, depth, depth, depth, depth); > + > + indent += 2; > + fprintf_indent (f, indent, "{\n"); > + indent += 2; I'm quoting generated code here: else if (gphi *_a1 = dyn_cast <gphi *> (_d1)) { basic_block _b1 = gimple_bb (_a1); if (EDGE_COUNT (_b1->preds) == 2 && gimple_phi_num_args (_a1) == 2) { basic_block _pb_0_1 = EDGE_PRED (_b1, 0)->src; basic_block _pb_1_1 = EDGE_PRED (_b1, 1)->src; basic_block _db_1 = safe_dyn_cast <gcond *> (*gsi_last_bb (_pb_0_1)) ? _pb_0_1 : _pb_1_1; basic_block _other_db_1 = safe_dyn_cast <gcond *> (*gsi_last_bb (_pb_0_1)) ? _pb_1_1 : _pb_0_1; _other_db_1 = _db_1 == _pb_0_1 : _pb_1_1 : _pb_0_1; gcond *_ct_1 = safe_dyn_cast <gcond *> (*gsi_last_bb (_db_1)); if (_ct_1 && EDGE_COUNT (_other_db_1->preds) == 1 && EDGE_COUNT (_other_db_1->succs) == 1 && EDGE_SUCC (_other_db_1, 0)->dest == _b1) new lines with indenting && EDGE_COUNT checks please. I think EDGE_SUCC (_other_db_1, 0)->dest == _b1 is pointless. Instead you want to check EDGE_PRED (_other_db_1, 0)->src == _db_1? { tree _cond_lhs_1 = gimple_cond_lhs (_ct_1); tree _cond_rhs_1 = gimple_cond_rhs (_ct_1); tree _p0 = build2 (gimple_cond_code (_ct_1), boolean_type_node, _cond_lhs_1, _cond_rhs_1); bool _arg_0_is_true_1 = gimple_phi_arg_edge (_a1, 0)->flags & EDGE_TRUE_VALUE; Hmm, you don't know this PHI arg edge comes from the cond block, it could come from the middle-bb. Otherwise I think this will work, of course it depends on us handling GENERIC conditions in COND_EXPRs. Not building the _p0 GENERIC tree would be nice but that's quite difficult to pull off here (impossible even). Overall I like it. Can you please try making the '^' flag work instead (also document it in doc/match-and-simplify.texi)? Thanks for working on this. Richard. tree _p1 = gimple_phi_arg_def (_a1, _arg_0_is_true_1 ? 0 : 1); tree _p2 = gimple_phi_arg_def (_a1, _arg_0_is_true_1 ? 1 : 0); > + fprintf_indent (f, indent, > + "tree _cond_lhs_%d = gimple_cond_lhs (_ct_%d);\n", depth, depth); > + fprintf_indent (f, indent, > + "tree _cond_rhs_%d = gimple_cond_rhs (_ct_%d);\n", depth, depth); > + > + char opname_0[20]; > + char opname_1[20]; > + char opname_2[20]; > + gen_opname (opname_0, 0); > + > + fprintf_indent (f, indent, > + "tree %s = build2 (gimple_cond_code (_ct_%d), " > + "boolean_type_node, _cond_lhs_%d, _cond_rhs_%d);\n", > + opname_0, depth, depth, depth); > + > + fprintf_indent (f, indent, > + "bool _arg_0_is_true_%d = gimple_phi_arg_edge (_a%d, 0)->flags " > + " & EDGE_TRUE_VALUE;\n", depth, depth); > + > + gen_opname (opname_1, 1); > + fprintf_indent (f, indent, > + "tree %s = gimple_phi_arg_def (_a%d, _arg_0_is_true_%d ? 0 : 1);\n", > + opname_1, depth, depth); > + > + gen_opname (opname_2, 2); > + fprintf_indent (f, indent, > + "tree %s = gimple_phi_arg_def (_a%d, _arg_0_is_true_%d ? 1 : 0);\n", > + opname_2, depth, depth); > + > + gen_kids (f, indent, gimple, depth); > + > + indent -= 2; > + fprintf_indent (f, indent, "}\n"); > + indent -= 2; > + > + indent -= 2; > + fprintf_indent (f, indent, "}\n"); > + indent -= 2; > +} > + > /* Emit a logging call to the debug file to the file F, with the INDENT from > either the RESULT location or the S's match location if RESULT is null. */ > static void > @@ -4263,6 +4376,8 @@ write_predicate (FILE *f, predicate_id *p, decision_tree &dt, bool gimple) > > if (!gimple) > fprintf_indent (f, 2, "if (TREE_SIDE_EFFECTS (t)) return false;\n"); > + > + dt.root->id = p->id; > dt.root->gen_kids (f, 2, gimple, 0); > > fprintf_indent (f, 2, "return false;\n" > diff --git a/gcc/match.pd b/gcc/match.pd > index 024e3350465..ddbdb7f7638 100644 > --- a/gcc/match.pd > +++ b/gcc/match.pd > @@ -3046,31 +3046,48 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > && wi::eq_p (wi::to_wide (int_cst), wi::max_value (itype)))))) > > /* Unsigned Saturation Add */ > +/* SAT_ADD = usadd_left_part_1 | usadd_right_part_1, aka: > + SAT_ADD = (X + Y) | -((X + Y) < X) */ > (match (usadd_left_part_1 @0 @1) > (plus:c @0 @1) > (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type) > && types_match (type, @0, @1)))) > > +/* SAT_ADD = usadd_left_part_2 | usadd_right_part_2, aka: > + SAT_ADD = REALPART_EXPR <.ADD_OVERFLOW> | (IMAGPART_EXPR <.ADD_OVERFLOW> != 0) */ > (match (usadd_left_part_2 @0 @1) > (realpart (IFN_ADD_OVERFLOW:c @0 @1)) > (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type) > && types_match (type, @0, @1)))) > > +/* SAT_ADD = usadd_left_part_1 | usadd_right_part_1, aka: > + SAT_ADD = (X + Y) | -((type)(X + Y) < X) */ > (match (usadd_right_part_1 @0 @1) > (negate (convert (lt (plus:c @0 @1) @0))) > (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type) > && types_match (type, @0, @1)))) > > +/* SAT_ADD = usadd_left_part_1 | usadd_right_part_1, aka: > + SAT_ADD = (X + Y) | -(X > (X + Y)) */ > (match (usadd_right_part_1 @0 @1) > (negate (convert (gt @0 (plus:c @0 @1)))) > (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type) > && types_match (type, @0, @1)))) > > +/* SAT_ADD = usadd_left_part_2 | usadd_right_part_2, aka: > + SAT_ADD = REALPART_EXPR <.ADD_OVERFLOW> | (IMAGPART_EXPR <.ADD_OVERFLOW> != 0) */ > (match (usadd_right_part_2 @0 @1) > (negate (convert (ne (imagpart (IFN_ADD_OVERFLOW:c @0 @1)) integer_zerop))) > (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type) > && types_match (type, @0, @1)))) > > +/* SAT_ADD = usadd_left_part_2 | usadd_right_part_2, aka: > + SAT_ADD = REALPART_EXPR <.ADD_OVERFLOW> | -IMAGPART_EXPR <.ADD_OVERFLOW> */ > +(match (usadd_right_part_2 @0 @1) > + (negate (imagpart (IFN_ADD_OVERFLOW:c @0 @1))) > + (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type) > + && types_match (type, @0, @1)))) > + > /* We cannot merge or overload usadd_left_part_1 and usadd_left_part_2 > because the sub part of left_part_2 cannot work with right_part_1. > For example, left_part_2 pattern focus one .ADD_OVERFLOW but the > @@ -3082,10 +3099,34 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > (match (unsigned_integer_sat_add @0 @1) > (bit_ior:c (usadd_left_part_1 @0 @1) (usadd_right_part_1 @0 @1))) > > -/* Unsigned saturation add, case 2 (branchless with .ADD_OVERFLOW). */ > +/* Unsigned saturation add, case 2 (branchless with .ADD_OVERFLOW): > + SAT_ADD = REALPART_EXPR <.ADD_OVERFLOW> | -IMAGPART_EXPR <.ADD_OVERFLOW> or > + SAT_ADD = REALPART_EXPR <.ADD_OVERFLOW> | (IMAGPART_EXPR <.ADD_OVERFLOW> != 0) */ > (match (unsigned_integer_sat_add @0 @1) > (bit_ior:c (usadd_left_part_2 @0 @1) (usadd_right_part_2 @0 @1))) > > +/* Unsigned saturation add, case 3 (branch with ge): > + SAT_U_ADD = (X + Y) >= x ? (X + Y) : -1. */ > +(match (unsigned_integer_sat_add @0 @1) > + (cond (ge (usadd_left_part_1@2 @0 @1) @0) @2 integer_minus_onep)) > + > +/* Unsigned saturation add, case 4 (branch with lt): > + SAT_U_ADD = (X + Y) < x ? -1 : (X + Y). */ > +(match (unsigned_integer_sat_add @0 @1) > + (cond (lt (usadd_left_part_1@2 @0 @1) @0) integer_minus_onep @2)) > + > +/* Unsigned saturation add, case 5 (branch with eq .ADD_OVERFLOW): > + SAT_U_ADD = REALPART_EXPR <.ADD_OVERFLOW> == 0 ? .ADD_OVERFLOW : -1. */ > +(match (unsigned_integer_sat_add @0 @1) > + (cond (eq (imagpart (IFN_ADD_OVERFLOW:c @0 @1)) integer_zerop) > + (usadd_left_part_2 @0 @1) integer_minus_onep)) > + > +/* Unsigned saturation add, case 6 (branch with ne .ADD_OVERFLOW): > + SAT_U_ADD = REALPART_EXPR <.ADD_OVERFLOW> != 0 ? -1 : .ADD_OVERFLOW. */ > +(match (unsigned_integer_sat_add @0 @1) > + (cond (ne (imagpart (IFN_ADD_OVERFLOW:c @0 @1)) integer_zerop) > + integer_minus_onep (usadd_left_part_2 @0 @1))) > + > /* x > y && x != XXX_MIN --> x > y > x > y && x == XXX_MIN --> false . */ > (for eqne (eq ne) > diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc > index 62da1c5ee08..8624d414347 100644 > --- a/gcc/tree-ssa-math-opts.cc > +++ b/gcc/tree-ssa-math-opts.cc > @@ -4099,7 +4099,7 @@ extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree)); > * => > * _12 = .SAT_ADD (_4, _6); */ > static void > -match_saturation_arith (gimple_stmt_iterator *gsi, gassign *stmt) > +match_assign_saturation_arith (gimple_stmt_iterator *gsi, gassign *stmt) > { > gcall *call = NULL; > > @@ -4116,6 +4116,47 @@ match_saturation_arith (gimple_stmt_iterator *gsi, gassign *stmt) > } > } > > +/* > + * Try to match saturation arith pattern(s). > + * From: > + * <bb 2> : > + * _1 = x_3(D) + y_4(D); > + * if (_1 >= x_3(D)) > + * goto <bb 3>; [INV] > + * else > + * goto <bb 4>; [INV] > + * > + * <bb 3> : > + * > + * <bb 4> : > + * # _2 = PHI <255(2), _1(3)> > + * > + * To: > + * <bb 4> [local count: 1073741824]: > + * _2 = .SAT_ADD (x_4(D), y_5(D)); */ > + > +static void > +match_phi_saturation_arith (gimple_stmt_iterator *gsi, gphi *phi) > +{ > + if (gimple_phi_num_args (phi) != 2) > + return; > + > + tree ops[2]; > + tree phi_result = gimple_phi_result (phi); > + > + if (gimple_unsigned_integer_sat_add (phi_result, ops, NULL) > + && direct_internal_fn_supported_p (IFN_SAT_ADD, TREE_TYPE (phi_result), > + OPTIMIZE_FOR_BOTH)) > + { > + gcall *call = gimple_build_call_internal (IFN_SAT_ADD, 2, ops[0], ops[1]); > + gimple_call_set_lhs (call, phi_result); > + gsi_insert_before (gsi, call, GSI_SAME_STMT); > + > + gimple_stmt_iterator psi = gsi_for_stmt (phi); > + remove_phi_node (&psi, false); > + } > +} > + > /* Recognize for unsigned x > x = y - z; > if (x > y) > @@ -6029,6 +6070,12 @@ math_opts_dom_walker::after_dom_children (basic_block bb) > > fma_deferring_state fma_state (param_avoid_fma_max_bits > 0); > > + for (gphi_iterator psi = gsi_start_phis (bb); !gsi_end_p (psi); gsi_next (&psi)) > + { > + gimple_stmt_iterator gsi = gsi_last_bb (bb); > + match_phi_saturation_arith (&gsi, psi.phi ()); > + } > + > for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);) > { > gimple *stmt = gsi_stmt (gsi); > @@ -6078,7 +6125,7 @@ math_opts_dom_walker::after_dom_children (basic_block bb) > break; > > case BIT_IOR_EXPR: > - match_saturation_arith (&gsi, as_a<gassign *> (stmt)); > + match_assign_saturation_arith (&gsi, as_a<gassign *> (stmt)); > /* fall-through */ > case BIT_XOR_EXPR: > match_uaddc_usubc (&gsi, stmt, code); > -- > 2.34.1 >
diff --git a/gcc/genmatch.cc b/gcc/genmatch.cc index f1e0e7abe0c..816d2dafd23 100644 --- a/gcc/genmatch.cc +++ b/gcc/genmatch.cc @@ -1767,6 +1767,7 @@ public: unsigned level; dt_node *parent; vec<dt_node *> kids; + const char *id; /* Statistics. */ unsigned num_leafs; @@ -1786,7 +1787,7 @@ public: virtual void gen (FILE *, int, bool, int) {} void gen_kids (FILE *, int, bool, int); - void gen_kids_1 (FILE *, int, bool, int, + void gen_kids_1 (FILE *, const char *, int, bool, int, const vec<dt_operand *> &, const vec<dt_operand *> &, const vec<dt_operand *> &, const vec<dt_operand *> &, const vec<dt_operand *> &, const vec<dt_node *> &); @@ -1819,6 +1820,7 @@ public: char *get_name (char *); void gen_opname (char *, unsigned); + void gen_phi_on_cond (FILE *, int, bool, int); }; /* Leaf node of the decision tree, used for DT_SIMPLIFY. */ @@ -3173,7 +3175,7 @@ dt_node::gen_kids (FILE *f, int indent, bool gimple, int depth) for what we have collected sofar. */ fns.qsort (fns_cmp); generic_fns.qsort (fns_cmp); - gen_kids_1 (f, indent, gimple, depth, gimple_exprs, generic_exprs, + gen_kids_1 (f, id, indent, gimple, depth, gimple_exprs, generic_exprs, fns, generic_fns, preds, others); /* And output the true operand itself. */ kids[i]->gen (f, indent, gimple, depth); @@ -3191,14 +3193,21 @@ dt_node::gen_kids (FILE *f, int indent, bool gimple, int depth) /* Generate code for the remains. */ fns.qsort (fns_cmp); generic_fns.qsort (fns_cmp); - gen_kids_1 (f, indent, gimple, depth, gimple_exprs, generic_exprs, + gen_kids_1 (f, id, indent, gimple, depth, gimple_exprs, generic_exprs, fns, generic_fns, preds, others); } +static bool +allow_phi_predicate_p (const char *id) +{ + return id && strcmp (id, "unsigned_integer_sat_add") == 0; +} + /* Generate matching code for the children of the decision tree node. */ void -dt_node::gen_kids_1 (FILE *f, int indent, bool gimple, int depth, +dt_node::gen_kids_1 (FILE *f, const char *id, int indent, bool gimple, + int depth, const vec<dt_operand *> &gimple_exprs, const vec<dt_operand *> &generic_exprs, const vec<dt_operand *> &fns, @@ -3251,6 +3260,7 @@ dt_node::gen_kids_1 (FILE *f, int indent, bool gimple, int depth, depth); indent += 4; fprintf_indent (f, indent, "{\n"); + bool cond_expr_p = false; for (unsigned i = 0; i < exprs_len; ++i) { expr *e = as_a <expr *> (gimple_exprs[i]->op); @@ -3262,6 +3272,7 @@ dt_node::gen_kids_1 (FILE *f, int indent, bool gimple, int depth, else { id_base *op = e->operation; + cond_expr_p |= *op == COND_EXPR; if (*op == CONVERT_EXPR || *op == NOP_EXPR) fprintf_indent (f, indent, "CASE_CONVERT:\n"); else @@ -3275,6 +3286,27 @@ dt_node::gen_kids_1 (FILE *f, int indent, bool gimple, int depth, fprintf_indent (f, indent, "default:;\n"); fprintf_indent (f, indent, "}\n"); indent -= 4; + + if (cond_expr_p && allow_phi_predicate_p (id)) + { + fprintf_indent (f, indent, + "else if (gphi *_a%d = dyn_cast <gphi *> (_d%d))\n", + depth, depth); + indent += 2; + fprintf_indent (f, indent, "{\n"); + indent += 2; + + for (unsigned i = 0; i < exprs_len; i++) + { + expr *e = as_a <expr *> (gimple_exprs[i]->op); + if (*e->operation == COND_EXPR) + gimple_exprs[i]->gen_phi_on_cond (f, indent, true, depth); + } + + indent -= 2; + fprintf_indent (f, indent, "}\n"); + indent -= 2; + } } if (fns_len) @@ -3483,6 +3515,87 @@ dt_operand::gen (FILE *f, int indent, bool gimple, int depth) } } +/* Generate matching code for the phi when meet COND_EXPR. */ + +void +dt_operand::gen_phi_on_cond (FILE *f, int indent, bool gimple, int depth) +{ + fprintf_indent (f, indent, + "basic_block _b%d = gimple_bb (_a%d);\n", depth, depth); + + fprintf_indent (f, indent, + "if (EDGE_COUNT (_b%d->preds) == 2 && gimple_phi_num_args (_a%d) == 2)\n", + depth, depth); + + indent += 2; + fprintf_indent (f, indent, "{\n"); + indent += 2; + + fprintf_indent (f, indent, + "basic_block _pb_0_%d = EDGE_PRED (_b%d, 0)->src;\n", depth, depth); + fprintf_indent (f, indent, + "basic_block _pb_1_%d = EDGE_PRED (_b%d, 1)->src;\n", depth, depth); + fprintf_indent (f, indent, + "basic_block _db_%d = safe_dyn_cast <gcond *> (*gsi_last_bb (_pb_0_%d)) ? " + "_pb_0_%d : _pb_1_%d;\n", depth, depth, depth, depth); + fprintf_indent (f, indent, + "basic_block _other_db_%d = safe_dyn_cast <gcond *> " + "(*gsi_last_bb (_pb_0_%d)) ? _pb_1_%d : _pb_0_%d;\n", + depth, depth, depth, depth); + + fprintf_indent (f, indent, + "gcond *_ct_%d = safe_dyn_cast <gcond *> (*gsi_last_bb (_db_%d));\n ", + depth, depth); + fprintf_indent (f, indent, "if (_ct_%d " + "&& EDGE_COUNT (_other_db_%d->preds) == 1 " + "&& EDGE_COUNT (_other_db_%d->succs) == 1 " + "&& EDGE_SUCC (_other_db_%d, 0)->dest == _b%d)\n", + depth, depth, depth, depth, depth); + + indent += 2; + fprintf_indent (f, indent, "{\n"); + indent += 2; + + fprintf_indent (f, indent, + "tree _cond_lhs_%d = gimple_cond_lhs (_ct_%d);\n", depth, depth); + fprintf_indent (f, indent, + "tree _cond_rhs_%d = gimple_cond_rhs (_ct_%d);\n", depth, depth); + + char opname_0[20]; + char opname_1[20]; + char opname_2[20]; + gen_opname (opname_0, 0); + + fprintf_indent (f, indent, + "tree %s = build2 (gimple_cond_code (_ct_%d), " + "boolean_type_node, _cond_lhs_%d, _cond_rhs_%d);\n", + opname_0, depth, depth, depth); + + fprintf_indent (f, indent, + "bool _arg_0_is_true_%d = gimple_phi_arg_edge (_a%d, 0)->flags " + " & EDGE_TRUE_VALUE;\n", depth, depth); + + gen_opname (opname_1, 1); + fprintf_indent (f, indent, + "tree %s = gimple_phi_arg_def (_a%d, _arg_0_is_true_%d ? 0 : 1);\n", + opname_1, depth, depth); + + gen_opname (opname_2, 2); + fprintf_indent (f, indent, + "tree %s = gimple_phi_arg_def (_a%d, _arg_0_is_true_%d ? 1 : 0);\n", + opname_2, depth, depth); + + gen_kids (f, indent, gimple, depth); + + indent -= 2; + fprintf_indent (f, indent, "}\n"); + indent -= 2; + + indent -= 2; + fprintf_indent (f, indent, "}\n"); + indent -= 2; +} + /* Emit a logging call to the debug file to the file F, with the INDENT from either the RESULT location or the S's match location if RESULT is null. */ static void @@ -4263,6 +4376,8 @@ write_predicate (FILE *f, predicate_id *p, decision_tree &dt, bool gimple) if (!gimple) fprintf_indent (f, 2, "if (TREE_SIDE_EFFECTS (t)) return false;\n"); + + dt.root->id = p->id; dt.root->gen_kids (f, 2, gimple, 0); fprintf_indent (f, 2, "return false;\n" diff --git a/gcc/match.pd b/gcc/match.pd index 024e3350465..ddbdb7f7638 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -3046,31 +3046,48 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) && wi::eq_p (wi::to_wide (int_cst), wi::max_value (itype)))))) /* Unsigned Saturation Add */ +/* SAT_ADD = usadd_left_part_1 | usadd_right_part_1, aka: + SAT_ADD = (X + Y) | -((X + Y) < X) */ (match (usadd_left_part_1 @0 @1) (plus:c @0 @1) (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type) && types_match (type, @0, @1)))) +/* SAT_ADD = usadd_left_part_2 | usadd_right_part_2, aka: + SAT_ADD = REALPART_EXPR <.ADD_OVERFLOW> | (IMAGPART_EXPR <.ADD_OVERFLOW> != 0) */ (match (usadd_left_part_2 @0 @1) (realpart (IFN_ADD_OVERFLOW:c @0 @1)) (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type) && types_match (type, @0, @1)))) +/* SAT_ADD = usadd_left_part_1 | usadd_right_part_1, aka: + SAT_ADD = (X + Y) | -((type)(X + Y) < X) */ (match (usadd_right_part_1 @0 @1) (negate (convert (lt (plus:c @0 @1) @0))) (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type) && types_match (type, @0, @1)))) +/* SAT_ADD = usadd_left_part_1 | usadd_right_part_1, aka: + SAT_ADD = (X + Y) | -(X > (X + Y)) */ (match (usadd_right_part_1 @0 @1) (negate (convert (gt @0 (plus:c @0 @1)))) (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type) && types_match (type, @0, @1)))) +/* SAT_ADD = usadd_left_part_2 | usadd_right_part_2, aka: + SAT_ADD = REALPART_EXPR <.ADD_OVERFLOW> | (IMAGPART_EXPR <.ADD_OVERFLOW> != 0) */ (match (usadd_right_part_2 @0 @1) (negate (convert (ne (imagpart (IFN_ADD_OVERFLOW:c @0 @1)) integer_zerop))) (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type) && types_match (type, @0, @1)))) +/* SAT_ADD = usadd_left_part_2 | usadd_right_part_2, aka: + SAT_ADD = REALPART_EXPR <.ADD_OVERFLOW> | -IMAGPART_EXPR <.ADD_OVERFLOW> */ +(match (usadd_right_part_2 @0 @1) + (negate (imagpart (IFN_ADD_OVERFLOW:c @0 @1))) + (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type) + && types_match (type, @0, @1)))) + /* We cannot merge or overload usadd_left_part_1 and usadd_left_part_2 because the sub part of left_part_2 cannot work with right_part_1. For example, left_part_2 pattern focus one .ADD_OVERFLOW but the @@ -3082,10 +3099,34 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) (match (unsigned_integer_sat_add @0 @1) (bit_ior:c (usadd_left_part_1 @0 @1) (usadd_right_part_1 @0 @1))) -/* Unsigned saturation add, case 2 (branchless with .ADD_OVERFLOW). */ +/* Unsigned saturation add, case 2 (branchless with .ADD_OVERFLOW): + SAT_ADD = REALPART_EXPR <.ADD_OVERFLOW> | -IMAGPART_EXPR <.ADD_OVERFLOW> or + SAT_ADD = REALPART_EXPR <.ADD_OVERFLOW> | (IMAGPART_EXPR <.ADD_OVERFLOW> != 0) */ (match (unsigned_integer_sat_add @0 @1) (bit_ior:c (usadd_left_part_2 @0 @1) (usadd_right_part_2 @0 @1))) +/* Unsigned saturation add, case 3 (branch with ge): + SAT_U_ADD = (X + Y) >= x ? (X + Y) : -1. */ +(match (unsigned_integer_sat_add @0 @1) + (cond (ge (usadd_left_part_1@2 @0 @1) @0) @2 integer_minus_onep)) + +/* Unsigned saturation add, case 4 (branch with lt): + SAT_U_ADD = (X + Y) < x ? -1 : (X + Y). */ +(match (unsigned_integer_sat_add @0 @1) + (cond (lt (usadd_left_part_1@2 @0 @1) @0) integer_minus_onep @2)) + +/* Unsigned saturation add, case 5 (branch with eq .ADD_OVERFLOW): + SAT_U_ADD = REALPART_EXPR <.ADD_OVERFLOW> == 0 ? .ADD_OVERFLOW : -1. */ +(match (unsigned_integer_sat_add @0 @1) + (cond (eq (imagpart (IFN_ADD_OVERFLOW:c @0 @1)) integer_zerop) + (usadd_left_part_2 @0 @1) integer_minus_onep)) + +/* Unsigned saturation add, case 6 (branch with ne .ADD_OVERFLOW): + SAT_U_ADD = REALPART_EXPR <.ADD_OVERFLOW> != 0 ? -1 : .ADD_OVERFLOW. */ +(match (unsigned_integer_sat_add @0 @1) + (cond (ne (imagpart (IFN_ADD_OVERFLOW:c @0 @1)) integer_zerop) + integer_minus_onep (usadd_left_part_2 @0 @1))) + /* x > y && x != XXX_MIN --> x > y x > y && x == XXX_MIN --> false . */ (for eqne (eq ne) diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc index 62da1c5ee08..8624d414347 100644 --- a/gcc/tree-ssa-math-opts.cc +++ b/gcc/tree-ssa-math-opts.cc @@ -4099,7 +4099,7 @@ extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree)); * => * _12 = .SAT_ADD (_4, _6); */ static void -match_saturation_arith (gimple_stmt_iterator *gsi, gassign *stmt) +match_assign_saturation_arith (gimple_stmt_iterator *gsi, gassign *stmt) { gcall *call = NULL; @@ -4116,6 +4116,47 @@ match_saturation_arith (gimple_stmt_iterator *gsi, gassign *stmt) } } +/* + * Try to match saturation arith pattern(s). + * From: + * <bb 2> : + * _1 = x_3(D) + y_4(D); + * if (_1 >= x_3(D)) + * goto <bb 3>; [INV] + * else + * goto <bb 4>; [INV] + * + * <bb 3> : + * + * <bb 4> : + * # _2 = PHI <255(2), _1(3)> + * + * To: + * <bb 4> [local count: 1073741824]: + * _2 = .SAT_ADD (x_4(D), y_5(D)); */ + +static void +match_phi_saturation_arith (gimple_stmt_iterator *gsi, gphi *phi) +{ + if (gimple_phi_num_args (phi) != 2) + return; + + tree ops[2]; + tree phi_result = gimple_phi_result (phi); + + if (gimple_unsigned_integer_sat_add (phi_result, ops, NULL) + && direct_internal_fn_supported_p (IFN_SAT_ADD, TREE_TYPE (phi_result), + OPTIMIZE_FOR_BOTH)) + { + gcall *call = gimple_build_call_internal (IFN_SAT_ADD, 2, ops[0], ops[1]); + gimple_call_set_lhs (call, phi_result); + gsi_insert_before (gsi, call, GSI_SAME_STMT); + + gimple_stmt_iterator psi = gsi_for_stmt (phi); + remove_phi_node (&psi, false); + } +} + /* Recognize for unsigned x x = y - z; if (x > y) @@ -6029,6 +6070,12 @@ math_opts_dom_walker::after_dom_children (basic_block bb) fma_deferring_state fma_state (param_avoid_fma_max_bits > 0); + for (gphi_iterator psi = gsi_start_phis (bb); !gsi_end_p (psi); gsi_next (&psi)) + { + gimple_stmt_iterator gsi = gsi_last_bb (bb); + match_phi_saturation_arith (&gsi, psi.phi ()); + } + for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);) { gimple *stmt = gsi_stmt (gsi); @@ -6078,7 +6125,7 @@ math_opts_dom_walker::after_dom_children (basic_block bb) break; case BIT_IOR_EXPR: - match_saturation_arith (&gsi, as_a<gassign *> (stmt)); + match_assign_saturation_arith (&gsi, as_a<gassign *> (stmt)); /* fall-through */ case BIT_XOR_EXPR: match_uaddc_usubc (&gsi, stmt, code);