Message ID | 20240612123753.201660-1-pan2.li@intel.com
---|---
State | New
Series | [v1] Match: Support more forms for the scalar unsigned .SAT_SUB
On Wed, Jun 12, 2024 at 2:38 PM <pan2.li@intel.com> wrote:
>
> From: Pan Li <pan2.li@intel.com>
>
> After supporting the scalar unsigned forms 1 and 2, we would like to
> introduce more forms, including both branch and branchless variants.
> Forms 3-10 are listed below:
>
> Form 3:
>   #define SAT_SUB_U_3(T) \
>   T sat_sub_u_3_##T (T x, T y) \
>   { \
>     return x > y ? x - y : 0; \
>   }
>
> Form 4:
>   #define SAT_SUB_U_4(T) \
>   T sat_sub_u_4_##T (T x, T y) \
>   { \
>     return x >= y ? x - y : 0; \
>   }
>
> Form 5:
>   #define SAT_SUB_U_5(T) \
>   T sat_sub_u_5_##T (T x, T y) \
>   { \
>     return x < y ? 0 : x - y; \
>   }
>
> Form 6:
>   #define SAT_SUB_U_6(T) \
>   T sat_sub_u_6_##T (T x, T y) \
>   { \
>     return x <= y ? 0 : x - y; \
>   }
>
> Form 7:
>   #define SAT_SUB_U_7(T) \
>   T sat_sub_u_7_##T (T x, T y) \
>   { \
>     T ret; \
>     T overflow = __builtin_sub_overflow (x, y, &ret); \
>     return ret & (T)(overflow - 1); \
>   }
>
> Form 8:
>   #define SAT_SUB_U_8(T) \
>   T sat_sub_u_8_##T (T x, T y) \
>   { \
>     T ret; \
>     T overflow = __builtin_sub_overflow (x, y, &ret); \
>     return ret & (T)-(!overflow); \
>   }
>
> Form 9:
>   #define SAT_SUB_U_9(T) \
>   T sat_sub_u_9_##T (T x, T y) \
>   { \
>     T ret; \
>     T overflow = __builtin_sub_overflow (x, y, &ret); \
>     return overflow ? 0 : ret; \
>   }
>
> Form 10:
>   #define SAT_SUB_U_10(T) \
>   T sat_sub_u_10_##T (T x, T y) \
>   { \
>     T ret; \
>     T overflow = __builtin_sub_overflow (x, y, &ret); \
>     return !overflow ? ret : 0; \
>   }
>
> Take form 10 as an example:
>
>   SAT_SUB_U_10(uint8_t);
>
> Before this patch:
> uint8_t sat_sub_u_10_uint8_t (uint8_t x, uint8_t y)
> {
>   unsigned char _1;
>   unsigned char _2;
>   uint8_t _3;
>   __complex__ unsigned char _6;
>
>   ;; basic block 2, loop depth 0
>   ;;  pred: ENTRY
>   _6 = .SUB_OVERFLOW (x_4(D), y_5(D));
>   _2 = IMAGPART_EXPR <_6>;
>   if (_2 == 0)
>     goto <bb 3>; [50.00%]
>   else
>     goto <bb 4>; [50.00%]
>   ;;  succ: 3
>   ;;        4
>
>   ;; basic block 3, loop depth 0
>   ;;  pred: 2
>   _1 = REALPART_EXPR <_6>;
>   ;;  succ: 4
>
>   ;; basic block 4, loop depth 0
>   ;;  pred: 2
>   ;;        3
>   # _3 = PHI <0(2), _1(3)>
>   return _3;
>   ;;  succ: EXIT
>
> }
>
> After this patch:
> uint8_t sat_sub_u_10_uint8_t (uint8_t x, uint8_t y)
> {
>   uint8_t _3;
>
>   ;; basic block 2, loop depth 0
>   ;;  pred: ENTRY
>   _3 = .SAT_SUB (x_4(D), y_5(D)); [tail call]
>   return _3;
>   ;;  succ: EXIT
>
> }
>
> This patch passed the following test suites:
> 1. The rv64gcv fully regression test with newlib.
> 2. The rv64gcv build with glibc.
> 3. The x86 bootstrap test.
> 4. The x86 fully regression test.
>
> gcc/ChangeLog:
>
>         * match.pd: Add more match for unsigned sat_sub.
>         * tree-ssa-math-opts.cc (match_unsigned_saturation_sub): Add new
>         func impl to match phi node for .SAT_SUB.
>         (math_opts_dom_walker::after_dom_children): Try match .SAT_SUB
>         for the phi node, MULT_EXPR, BIT_XOR_EXPR and BIT_AND_EXPR.
>
> Signed-off-by: Pan Li <pan2.li@intel.com>
> ---
>  gcc/match.pd              | 25 +++++++++++++++++++++++--
>  gcc/tree-ssa-math-opts.cc | 33 +++++++++++++++++++++++++++++++++
>  2 files changed, 56 insertions(+), 2 deletions(-)
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 5cfe81e80b3..66e411b3359 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -3140,14 +3140,14 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>  /* Unsigned saturation sub, case 1 (branch with gt):
>     SAT_U_SUB = X > Y ? X - Y : 0  */
>  (match (unsigned_integer_sat_sub @0 @1)
> - (cond (gt @0 @1) (minus @0 @1) integer_zerop)
> + (cond^ (gt @0 @1) (minus @0 @1) integer_zerop)
>   (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
>        && types_match (type, @0, @1))))
>
>  /* Unsigned saturation sub, case 2 (branch with ge):
>     SAT_U_SUB = X >= Y ? X - Y : 0.  */
>  (match (unsigned_integer_sat_sub @0 @1)
> - (cond (ge @0 @1) (minus @0 @1) integer_zerop)
> + (cond^ (ge @0 @1) (minus @0 @1) integer_zerop)
>   (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
>        && types_match (type, @0, @1))))
>
> @@ -3165,6 +3165,27 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>   (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
>        && types_match (type, @0, @1))))
>
> +/* Unsigned saturation sub, case 5 (branchless bit_and with .SUB_OVERFLOW.  */
> +(match (unsigned_integer_sat_sub @0 @1)
> + (bit_and:c (realpart (IFN_SUB_OVERFLOW@2 @0 @1))
> +  (plus:c (imagpart @2) integer_minus_onep))

:c shouldn't be necessary on the plus

> + (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> +      && types_match (type, @0, @1))))
> +
> +/* Unsigned saturation sub, case 6 (branchless mult with .SUB_OVERFLOW.  */
> +(match (unsigned_integer_sat_sub @0 @1)
> + (mult:c (realpart (IFN_SUB_OVERFLOW@2 @0 @1))
> +  (bit_xor:c (imagpart @2) integer_onep))

or on the bit_xor

OK with those changes.

Richard.

> + (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> +      && types_match (type, @0, @1))))
> +
> +/* Unsigned saturation sub, case 7 (branch with .SUB_OVERFLOW.  */
> +(match (unsigned_integer_sat_sub @0 @1)
> + (cond^ (eq (imagpart (IFN_SUB_OVERFLOW@2 @0 @1)) integer_zerop)
> +  (realpart @2) integer_zerop)
> + (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> +      && types_match (type, @0, @1))))
> +
>  /* x > y && x != XXX_MIN --> x > y
>     x > y && x == XXX_MIN --> false . */
>  (for eqne (eq ne)
> diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc
> index fbb8e0ea306..05aa157611b 100644
> --- a/gcc/tree-ssa-math-opts.cc
> +++ b/gcc/tree-ssa-math-opts.cc
> @@ -4186,6 +4186,36 @@ match_unsigned_saturation_sub (gimple_stmt_iterator *gsi, gassign *stmt)
>    build_saturation_binary_arith_call (gsi, IFN_SAT_SUB, lhs, ops[0], ops[1]);
>  }
>
> +/*
> + * Try to match saturation unsigned sub.
> + *   <bb 2> [local count: 1073741824]:
> + *   if (x_2(D) > y_3(D))
> + *     goto <bb 3>; [50.00%]
> + *   else
> + *     goto <bb 4>; [50.00%]
> + *
> + *   <bb 3> [local count: 536870912]:
> + *   _4 = x_2(D) - y_3(D);
> + *
> + *   <bb 4> [local count: 1073741824]:
> + *   # _1 = PHI <0(2), _4(3)>
> + *   =>
> + *   <bb 4> [local count: 1073741824]:
> + *   _1 = .SAT_SUB (x_2(D), y_3(D));  */
> +static void
> +match_unsigned_saturation_sub (gimple_stmt_iterator *gsi, gphi *phi)
> +{
> +  if (gimple_phi_num_args (phi) != 2)
> +    return;
> +
> +  tree ops[2];
> +  tree phi_result = gimple_phi_result (phi);
> +
> +  if (gimple_unsigned_integer_sat_sub (phi_result, ops, NULL))
> +    build_saturation_binary_arith_call (gsi, phi, IFN_SAT_SUB, phi_result,
> +                                        ops[0], ops[1]);
> +}
> +
>  /* Recognize for unsigned x
>     x = y - z;
>     if (x > y)
> @@ -6104,6 +6134,7 @@ math_opts_dom_walker::after_dom_children (basic_block bb)
>      {
>        gimple_stmt_iterator gsi = gsi_start_bb (bb);
>        match_unsigned_saturation_add (&gsi, psi.phi ());
> +      match_unsigned_saturation_sub (&gsi, psi.phi ());
>      }
>
>    for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
> @@ -6129,6 +6160,7 @@ math_opts_dom_walker::after_dom_children (basic_block bb)
>              continue;
>            }
>          match_arith_overflow (&gsi, stmt, code, m_cfg_changed_p);
> +        match_unsigned_saturation_sub (&gsi, as_a<gassign *> (stmt));
>          break;
>
>        case PLUS_EXPR:
> @@ -6167,6 +6199,7 @@ math_opts_dom_walker::after_dom_children (basic_block bb)
>          break;
>
>        case COND_EXPR:
> +      case BIT_AND_EXPR:
>          match_unsigned_saturation_sub (&gsi, as_a<gassign *> (stmt));
>          break;
>
> --
> 2.34.1
>
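[Editorial note for readers skimming the thread: all ten forms above are different spellings of the same operation, unsigned saturating subtraction, i.e. x - y clamped at 0. A minimal sketch checking two of the branchless forms against that reference is shown below; the harness and its names are illustrative only and are not part of the patch.]

```c
#include <stdint.h>
#include <assert.h>

/* Reference semantics for unsigned saturating subtraction.  */
static uint8_t sat_sub_ref (uint8_t x, uint8_t y)
{
  return x > y ? x - y : 0;
}

/* Forms 7 and 10 from the patch, instantiated for uint8_t.  */
#define SAT_SUB_U_7(T) \
T sat_sub_u_7_##T (T x, T y) \
{ \
  T ret; \
  T overflow = __builtin_sub_overflow (x, y, &ret); \
  return ret & (T)(overflow - 1); \
}

#define SAT_SUB_U_10(T) \
T sat_sub_u_10_##T (T x, T y) \
{ \
  T ret; \
  T overflow = __builtin_sub_overflow (x, y, &ret); \
  return !overflow ? ret : 0; \
}

SAT_SUB_U_7 (uint8_t)
SAT_SUB_U_10 (uint8_t)

int main (void)
{
  /* Exhaustive check over all uint8_t pairs.  */
  for (unsigned x = 0; x <= 255; x++)
    for (unsigned y = 0; y <= 255; y++)
      {
        uint8_t r = sat_sub_ref ((uint8_t) x, (uint8_t) y);
        assert (sat_sub_u_7_uint8_t ((uint8_t) x, (uint8_t) y) == r);
        assert (sat_sub_u_10_uint8_t ((uint8_t) x, (uint8_t) y) == r);
      }
  return 0;
}
```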
Thanks Richard for comments.

> :c shouldn't be necessary on the plus
> or on the bit_xor
> OK with those changes.

Will remove the :c and commit it if there is no surprise from the test suites.

Pan

-----Original Message-----
From: Richard Biener <richard.guenther@gmail.com>
Sent: Friday, June 14, 2024 4:05 PM
To: Li, Pan2 <pan2.li@intel.com>
Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
Subject: Re: [PATCH v1] Match: Support more forms for the scalar unsigned .SAT_SUB

[full quote of the review and patch above trimmed]
Committed with those changes and test suites passed.

Pan

-----Original Message-----
From: Li, Pan2
Sent: Friday, June 14, 2024 4:15 PM
To: Richard Biener <richard.guenther@gmail.com>
Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
Subject: RE: [PATCH v1] Match: Support more forms for the scalar unsigned .SAT_SUB

[earlier messages quoted in full trimmed]
Hi Richard,

Given that almost all of the unsigned SAT_ADD/SAT_SUB patches are merged, I
revisited the original code pattern from the zip benchmark.  It looks like
this:

void test (uint16_t *x, uint16_t *y, unsigned wsize, unsigned count)
{
  unsigned m = 0, n = count;
  register uint16_t *p;

  p = x;

  do {
    m = *--p;

    *p = (uint16_t)(m >= wsize ? m - wsize : 0); // There will be a conversion here.
  } while (--n);
}

And the 179 tree pass dump looks like:

  <bb 3> [local count: 1073741824]:
  # n_3 = PHI <n_15(7), count_7(D)(15)>
  # p_4 = PHI <p_10(7), x_8(D)(15)>
  p_10 = p_4 + 18446744073709551614;
  _1 = *p_10;
  m_11 = (unsigned int) _1;
  _2 = m_11 - wsize_12(D);
  iftmp.0_13 = (short unsigned int) _2;
  _18 = m_11 >= wsize_12(D);
  iftmp.0_5 = _18 ? iftmp.0_13 : 0;
  *p_10 = iftmp.0_5;

This form doesn't hit any of the forms we support in match.pd.  One idea is
to convert

  uint16 d, tmp;
  uint32 a, b, m;

  m = a - b;
  tmp = (uint16)m;
  d = a >= b ? tmp : 0;

into

  d = (uint16)(.SAT_SUB (a, b));

I am not very sure it is reasonable to make this work; it would produce a
gimple assignment with a conversion, similar to the below (and may require
the help of vectorizable_conversion?):

  patt_34 = .SAT_SUB (m_11, wsize_12(D));
  patt_35 = (vector([8,8]) short unsigned int) patt_34;

I would like to get some hints from you before taking the next step.  Thanks
a lot.

Pan

-----Original Message-----
From: Richard Biener <richard.guenther@gmail.com>
Sent: Friday, June 14, 2024 4:05 PM
To: Li, Pan2 <pan2.li@intel.com>
Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
Subject: Re: [PATCH v1] Match: Support more forms for the scalar unsigned .SAT_SUB

[full quote of the review and patch trimmed]
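[Editorial note: a small, hedged way to see that the proposed recognition is sound is that truncating after the select gives the same uint16_t value as truncating a 32-bit saturating subtraction. The sketch below only illustrates that equivalence; the helper names are invented and nothing here is proposed for the tree.]

```c
#include <stdint.h>
#include <assert.h>

/* 32-bit unsigned saturating subtraction, the value .SAT_SUB would compute.  */
static uint32_t sat_sub_u32 (uint32_t a, uint32_t b)
{
  return a >= b ? a - b : 0;
}

/* The zip-benchmark shape: subtract in 32 bits, truncate inside the selected
   arm, then pick between the truncated value and 0.  */
static uint16_t zip_shape (uint32_t a, uint32_t b)
{
  uint32_t m = a - b;          /* wrapping subtraction */
  uint16_t tmp = (uint16_t) m; /* truncation in the arm */
  return a >= b ? tmp : 0;
}

int main (void)
{
  /* Both sides agree because the truncation commutes with the select
     (the 0 arm truncates to 0).  */
  uint32_t samples[] = { 0, 1, 2, 65535, 65536, 70000, UINT32_MAX };
  for (unsigned i = 0; i < sizeof samples / sizeof samples[0]; i++)
    for (unsigned j = 0; j < sizeof samples / sizeof samples[0]; j++)
      assert (zip_shape (samples[i], samples[j])
              == (uint16_t) sat_sub_u32 (samples[i], samples[j]));
  return 0;
}
```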
On Wed, Jun 19, 2024 at 9:37 AM Li, Pan2 <pan2.li@intel.com> wrote:
>
> Hi Richard,
>
> [zip benchmark example and 179 tree pass dump quoted above trimmed]
>
> This form doesn't hit any of the forms we support in match.pd.  One idea is
> to convert
>
>   uint16 d, tmp;
>   uint32 a, b, m;
>
>   m = a - b;
>   tmp = (uint16)m;
>   d = a >= b ? tmp : 0;
>
> into
>
>   d = (uint16)(.SAT_SUB (a, b));

The key here is to turn this into

  m = a - b;
  tmp = a >= b ? m : 0;
  d = (uint16) tmp;

I guess?  We probably have the reverse transform, turning

  (uint16) a ? b : c;

into

  a ? (uint16)b : (uint16)c

if any of the arms simplifies.

OTOH, if you figure out the correct rules for the allowed conversions,
adjusting the pattern matching to allow a conversion on the subtract would
work as well.

> I am not very sure it is reasonable to make this work; it would produce a
> gimple assignment with a conversion, similar to the below (and may require
> the help of vectorizable_conversion?):
>
>   patt_34 = .SAT_SUB (m_11, wsize_12(D));
>   patt_35 = (vector([8,8]) short unsigned int) patt_34;
>
> Pan
>
> [rest of the quoted thread trimmed]
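[Editorial note: a hedged illustration of the reshaping Richard suggests — keep the select in the wide type, so the compare/select pair is exactly the plain 32-bit "X >= Y ? X - Y : 0" shape already matched by the cond^/ge pattern, and truncate once at the end. The function names below are illustrative only.]

```c
#include <stdint.h>
#include <assert.h>

/* Original zip-benchmark shape: the truncation happens inside the selected arm.  */
static uint16_t before (uint32_t m, uint32_t wsize)
{
  uint32_t d = m - wsize;        /* wrapping subtraction */
  uint16_t t = (uint16_t) d;     /* truncating conversion in the arm */
  return m >= wsize ? t : 0;
}

/* Reshaped form: select in the wide type first, truncate once afterwards.  */
static uint16_t after (uint32_t m, uint32_t wsize)
{
  uint32_t tmp = m >= wsize ? m - wsize : 0;  /* 32-bit saturating sub shape */
  return (uint16_t) tmp;
}

int main (void)
{
  uint32_t v[] = { 0, 1, 65535, 65536, 100000, UINT32_MAX };
  for (unsigned i = 0; i < sizeof v / sizeof v[0]; i++)
    for (unsigned j = 0; j < sizeof v / sizeof v[0]; j++)
      assert (before (v[i], v[j]) == after (v[i], v[j]));
  return 0;
}
```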
Got it.  Thanks Richard for the suggestion.

Pan

-----Original Message-----
From: Richard Biener <richard.guenther@gmail.com>
Sent: Wednesday, June 19, 2024 4:00 PM
To: Li, Pan2 <pan2.li@intel.com>
Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
Subject: Re: [PATCH v1] Match: Support more forms for the scalar unsigned .SAT_SUB

[full quote of the previous message and patch trimmed]
On Wed, Jun 19, 2024 at 12:37 AM Li, Pan2 <pan2.li@intel.com> wrote: > > Hi Richard, > > Given almost all unsigned SAT_ADD/SAT_SUB patches are merged, I revisit the original code pattern aka zip benchmark. > It may look like below: > > void test (uint16_t *x, uint16_t *y, unsigned wsize, unsigned count) > { > unsigned m = 0, n = count; > register uint16_t *p; > > p = x; > > do { > m = *--p; > > *p = (uint16_t)(m >= wsize ? m-wsize : 0); // There will be a conversion here. > } while (--n); > } > > And we can have 179 tree pass as below: > > <bb 3> [local count: 1073741824]: > # n_3 = PHI <n_15(7), count_7(D)(15)> > # p_4 = PHI <p_10(7), x_8(D)(15)> > p_10 = p_4 + 18446744073709551614; > _1 = *p_10; > m_11 = (unsigned int) _1; > _2 = m_11 - wsize_12(D); > iftmp.0_13 = (short unsigned int) _2; > _18 = m_11 >= wsize_12(D); > iftmp.0_5 = _18 ? iftmp.0_13 : 0; > *p_10 = iftmp.0_5; > > The above form doesn't hit any form we have supported in match.pd. Then I have one idea that to convert > > uint16 d, tmp; > uint32 a, b, m; > > m = a - b; > tmp = (uint16)m; > d = a >= b ? tmp : 0; > > to > > d = (uint16)(.SAT_SUB (a, b)); > > I am not very sure it is reasonable to make it work, it may have gimple assignment with convert similar as below (may require the help of vectorize_conversion?). > Would like to get some hint from you before the next step, thanks a lot. > > patt_34 = .SAT_SUB (m_11, wsize_12(D)); > patt_35 = (vector([8,8]) short unsigned int) patt_34; I am not sure if this is related to the above but we also miss: ``` uint32_t saturation_add(uint32_t a, uint32_t b) { const uint64_t tmp = (uint64_t)a + b; if (tmp > UINT32_MAX) { return UINT32_MAX; } return tmp; } ``` This comes from https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88603 . I thought you might be interested in that form too. Thanks, Andrew > > Pan > > -----Original Message----- > From: Richard Biener <richard.guenther@gmail.com> > Sent: Friday, June 14, 2024 4:05 PM > To: Li, Pan2 <pan2.li@intel.com> > Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com; rdapp.gcc@gmail.com > Subject: Re: [PATCH v1] Match: Support more forms for the scalar unsigned .SAT_SUB > > On Wed, Jun 12, 2024 at 2:38 PM <pan2.li@intel.com> wrote: > > > > From: Pan Li <pan2.li@intel.com> > > > > After we support the scalar unsigned form 1 and 2, we would like > > to introduce more forms include the branch and branchless. There > > are forms 3-10 list as below: > > > > Form 3: > > #define SAT_SUB_U_3(T) \ > > T sat_sub_u_3_##T (T x, T y) \ > > { \ > > return x > y ? x - y : 0; \ > > } > > > > Form 4: > > #define SAT_SUB_U_4(T) \ > > T sat_sub_u_4_##T (T x, T y) \ > > { \ > > return x >= y ? x - y : 0; \ > > } > > > > Form 5: > > #define SAT_SUB_U_5(T) \ > > T sat_sub_u_5_##T (T x, T y) \ > > { \ > > return x < y ? 0 : x - y; \ > > } > > > > Form 6: > > #define SAT_SUB_U_6(T) \ > > T sat_sub_u_6_##T (T x, T y) \ > > { \ > > return x <= y ? 
Yes, you are right.  The forms supported so far don't involve any widening ops, so the uint64_t form below is not detected yet; I will add it to my backlog.

uint32_t saturation_add(uint32_t a, uint32_t b)
{
  const uint64_t tmp = (uint64_t)a + b;
  if (tmp > UINT32_MAX) {
    return UINT32_MAX;
  }
  return tmp;
}

Pan
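For comparison, the widening form Andrew quotes can also be written without the uint64_t temporary: an unsigned add wraps on overflow, and the wrapped sum is smaller than either operand exactly when the addition overflowed. The sketch below shows two single-type spellings of the same function; it makes no claim about which of these spellings the already-merged SAT_ADD patterns recognize.

```
#include <stdint.h>

/* Same saturating add, with the overflow test done on the wrapped sum
   instead of through a uint64_t temporary.  */
uint32_t
saturation_add_single_type (uint32_t a, uint32_t b)
{
  uint32_t sum = a + b;              /* wraps on overflow  */
  return sum < a ? UINT32_MAX : sum;
}

/* Branchless variant: (sum < a) is 1 exactly when the add wrapped, so
   negating it yields an all-ones mask that forces UINT32_MAX.  */
uint32_t
saturation_add_branchless (uint32_t a, uint32_t b)
{
  uint32_t sum = a + b;
  uint32_t mask = - (uint32_t) (sum < a);
  return sum | mask;
}
```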
diff --git a/gcc/match.pd b/gcc/match.pd
index 5cfe81e80b3..66e411b3359 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -3140,14 +3140,14 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 /* Unsigned saturation sub, case 1 (branch with gt):
    SAT_U_SUB = X > Y ? X - Y : 0  */
 (match (unsigned_integer_sat_sub @0 @1)
- (cond (gt @0 @1) (minus @0 @1) integer_zerop)
+ (cond^ (gt @0 @1) (minus @0 @1) integer_zerop)
  (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
      && types_match (type, @0, @1))))
 
 /* Unsigned saturation sub, case 2 (branch with ge):
    SAT_U_SUB = X >= Y ? X - Y : 0.  */
 (match (unsigned_integer_sat_sub @0 @1)
- (cond (ge @0 @1) (minus @0 @1) integer_zerop)
+ (cond^ (ge @0 @1) (minus @0 @1) integer_zerop)
  (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
      && types_match (type, @0, @1))))
 
@@ -3165,6 +3165,27 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
      && types_match (type, @0, @1))))
 
+/* Unsigned saturation sub, case 5 (branchless bit_and with .SUB_OVERFLOW. */
+(match (unsigned_integer_sat_sub @0 @1)
+ (bit_and:c (realpart (IFN_SUB_OVERFLOW@2 @0 @1))
+  (plus:c (imagpart @2) integer_minus_onep))
+ (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
+     && types_match (type, @0, @1))))
+
+/* Unsigned saturation sub, case 6 (branchless mult with .SUB_OVERFLOW. */
+(match (unsigned_integer_sat_sub @0 @1)
+ (mult:c (realpart (IFN_SUB_OVERFLOW@2 @0 @1))
+  (bit_xor:c (imagpart @2) integer_onep))
+ (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
+     && types_match (type, @0, @1))))
+
+/* Unsigned saturation sub, case 7 (branch with .SUB_OVERFLOW. */
+(match (unsigned_integer_sat_sub @0 @1)
+ (cond^ (eq (imagpart (IFN_SUB_OVERFLOW@2 @0 @1)) integer_zerop)
+  (realpart @2) integer_zerop)
+ (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
+     && types_match (type, @0, @1))))
+
 /* x > y && x != XXX_MIN --> x > y
    x > y && x == XXX_MIN --> false . */
 (for eqne (eq ne)
diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc
index fbb8e0ea306..05aa157611b 100644
--- a/gcc/tree-ssa-math-opts.cc
+++ b/gcc/tree-ssa-math-opts.cc
@@ -4186,6 +4186,36 @@ match_unsigned_saturation_sub (gimple_stmt_iterator *gsi, gassign *stmt)
   build_saturation_binary_arith_call (gsi, IFN_SAT_SUB, lhs, ops[0], ops[1]);
 }
 
+/*
+ * Try to match saturation unsigned sub.
+ *   <bb 2> [local count: 1073741824]:
+ *   if (x_2(D) > y_3(D))
+ *     goto <bb 3>; [50.00%]
+ *   else
+ *     goto <bb 4>; [50.00%]
+ *
+ *   <bb 3> [local count: 536870912]:
+ *   _4 = x_2(D) - y_3(D);
+ *
+ *   <bb 4> [local count: 1073741824]:
+ *   # _1 = PHI <0(2), _4(3)>
+ *   =>
+ *   <bb 4> [local count: 1073741824]:
+ *   _1 = .SAT_SUB (x_2(D), y_3(D));  */
+static void
+match_unsigned_saturation_sub (gimple_stmt_iterator *gsi, gphi *phi)
+{
+  if (gimple_phi_num_args (phi) != 2)
+    return;
+
+  tree ops[2];
+  tree phi_result = gimple_phi_result (phi);
+
+  if (gimple_unsigned_integer_sat_sub (phi_result, ops, NULL))
+    build_saturation_binary_arith_call (gsi, phi, IFN_SAT_SUB, phi_result,
+                                        ops[0], ops[1]);
+}
+
 /* Recognize for unsigned x
    x = y - z;
    if (x > y)
@@ -6104,6 +6134,7 @@ math_opts_dom_walker::after_dom_children (basic_block bb)
         {
           gimple_stmt_iterator gsi = gsi_start_bb (bb);
           match_unsigned_saturation_add (&gsi, psi.phi ());
+          match_unsigned_saturation_sub (&gsi, psi.phi ());
         }
 
       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
@@ -6129,6 +6160,7 @@ math_opts_dom_walker::after_dom_children (basic_block bb)
               continue;
             }
           match_arith_overflow (&gsi, stmt, code, m_cfg_changed_p);
+          match_unsigned_saturation_sub (&gsi, as_a<gassign *> (stmt));
           break;
 
         case PLUS_EXPR:
@@ -6167,6 +6199,7 @@ math_opts_dom_walker::after_dom_children (basic_block bb)
           break;
 
         case COND_EXPR:
+        case BIT_AND_EXPR:
           match_unsigned_saturation_sub (&gsi, as_a<gassign *> (stmt));
           break;
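As the comment on the new gphi overload above shows, the PHI-based matcher targets the shape where the subtraction sits in its own basic block and is merged with zero in a PHI node. The C function below is a sketch of a source shape that typically gimplifies to that CFG; depending on what earlier passes (phiopt, for instance) do, the same function may instead be caught through the COND_EXPR path, so this only illustrates the intended input rather than guaranteeing which matcher fires.

```
#include <stdint.h>

/* Branchy unsigned saturating subtraction: the subtraction is executed
   only on the x > y path, and the result is merged with 0 in a PHI,
   which is the shape the gphi overload looks for.  */
uint32_t
sat_sub_branchy (uint32_t x, uint32_t y)
{
  uint32_t res = 0;
  if (x > y)
    res = x - y;
  return res;
}
```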