| Message ID | 20240506144805.725379-1-pan2.li@intel.com |
|---|---|
| State | New |
| Series | [v4,1/3] Internal-fn: Support new IFN SAT_ADD for unsigned scalar int |
Hi Pan, > -----Original Message----- > From: pan2.li@intel.com <pan2.li@intel.com> > Sent: Monday, May 6, 2024 3:48 PM > To: gcc-patches@gcc.gnu.org > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; Tamar Christina > <Tamar.Christina@arm.com>; richard.guenther@gmail.com; > hongtao.liu@intel.com; Pan Li <pan2.li@intel.com> > Subject: [PATCH v4 1/3] Internal-fn: Support new IFN SAT_ADD for unsigned scalar > int > > From: Pan Li <pan2.li@intel.com> > > This patch would like to add the middle-end presentation for the > saturation add. Aka set the result of add to the max when overflow. > It will take the pattern similar as below. > > SAT_ADD (x, y) => (x + y) | (-(TYPE)((TYPE)(x + y) < x)) > > Take uint8_t as example, we will have: > > * SAT_ADD (1, 254) => 255. > * SAT_ADD (1, 255) => 255. > * SAT_ADD (2, 255) => 255. > * SAT_ADD (255, 255) => 255. > > Given below example for the unsigned scalar integer uint64_t: > > uint64_t sat_add_u64 (uint64_t x, uint64_t y) > { > return (x + y) | (- (uint64_t)((uint64_t)(x + y) < x)); > } > > Before this patch: > uint64_t sat_add_uint64_t (uint64_t x, uint64_t y) > { > long unsigned int _1; > _Bool _2; > long unsigned int _3; > long unsigned int _4; > uint64_t _7; > long unsigned int _10; > __complex__ long unsigned int _11; > > ;; basic block 2, loop depth 0 > ;; pred: ENTRY > _11 = .ADD_OVERFLOW (x_5(D), y_6(D)); > _1 = REALPART_EXPR <_11>; > _10 = IMAGPART_EXPR <_11>; > _2 = _10 != 0; > _3 = (long unsigned int) _2; > _4 = -_3; > _7 = _1 | _4; > return _7; > ;; succ: EXIT > > } > > After this patch: > uint64_t sat_add_uint64_t (uint64_t x, uint64_t y) > { > uint64_t _7; > > ;; basic block 2, loop depth 0 > ;; pred: ENTRY > _7 = .SAT_ADD (x_5(D), y_6(D)); [tail call] > return _7; > ;; succ: EXIT > } > > We perform the tranform during widen_mult because that the sub-expr of > SAT_ADD will be optimized to .ADD_OVERFLOW. We need to try the .SAT_ADD > pattern first and then .ADD_OVERFLOW, or we may never catch the pattern > .SAT_ADD. Meanwhile, the isel pass is after widen_mult and then we > cannot perform the .SAT_ADD pattern match as the sub-expr will be > optmized to .ADD_OVERFLOW first. > > The below tests are passed for this patch: > 1. The riscv fully regression tests. > 2. The aarch64 fully regression tests. > 3. The x86 bootstrap tests. > 4. The x86 fully regression tests. > > PR target/51492 > PR target/112600 > > gcc/ChangeLog: > > * internal-fn.cc (commutative_binary_fn_p): Add type IFN_SAT_ADD > to the return true switch case(s). > * internal-fn.def (SAT_ADD): Add new signed optab SAT_ADD. > * match.pd: Add unsigned SAT_ADD match. > * optabs.def (OPTAB_NL): Remove fixed-point limitation for us/ssadd. > * tree-ssa-math-opts.cc (gimple_unsigned_integer_sat_add): New extern > func decl generated in match.pd match. > (match_saturation_arith): New func impl to match the saturation arith. > (math_opts_dom_walker::after_dom_children): Try match saturation > arith. 
> > Signed-off-by: Pan Li <pan2.li@intel.com> > --- > gcc/internal-fn.cc | 1 + > gcc/internal-fn.def | 2 ++ > gcc/match.pd | 28 ++++++++++++++++++++++++ > gcc/optabs.def | 4 ++-- > gcc/tree-ssa-math-opts.cc | 46 > +++++++++++++++++++++++++++++++++++++++ > 5 files changed, 79 insertions(+), 2 deletions(-) > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc > index 0a7053c2286..73045ca8c8c 100644 > --- a/gcc/internal-fn.cc > +++ b/gcc/internal-fn.cc > @@ -4202,6 +4202,7 @@ commutative_binary_fn_p (internal_fn fn) > case IFN_UBSAN_CHECK_MUL: > case IFN_ADD_OVERFLOW: > case IFN_MUL_OVERFLOW: > + case IFN_SAT_ADD: > case IFN_VEC_WIDEN_PLUS: > case IFN_VEC_WIDEN_PLUS_LO: > case IFN_VEC_WIDEN_PLUS_HI: > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def > index 848bb9dbff3..25badbb86e5 100644 > --- a/gcc/internal-fn.def > +++ b/gcc/internal-fn.def > @@ -275,6 +275,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, ECF_CONST > | ECF_NOTHROW, first, > DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, > first, > smulhrs, umulhrs, binary) > > +DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_ADD, ECF_CONST, first, ssadd, usadd, > binary) > + > DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary) > DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary) > DEF_INTERNAL_COND_FN (MUL, ECF_CONST, smul, binary) > diff --git a/gcc/match.pd b/gcc/match.pd > index d401e7503e6..7058e4cbe29 100644 > --- a/gcc/match.pd > +++ b/gcc/match.pd > @@ -3043,6 +3043,34 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > || POINTER_TYPE_P (itype)) > && wi::eq_p (wi::to_wide (int_cst), wi::max_value (itype)))))) > > +/* Unsigned Saturation Add */ > +(match (usadd_left_part @0 @1) > + (plus:c @0 @1) > + (if (INTEGRAL_TYPE_P (type) > + && TYPE_UNSIGNED (TREE_TYPE (@0)) > + && types_match (type, TREE_TYPE (@0)) > + && types_match (type, TREE_TYPE (@1))))) > + > +(match (usadd_right_part @0 @1) > + (negate (convert (lt (plus:c @0 @1) @0))) > + (if (INTEGRAL_TYPE_P (type) > + && TYPE_UNSIGNED (TREE_TYPE (@0)) > + && types_match (type, TREE_TYPE (@0)) > + && types_match (type, TREE_TYPE (@1))))) > + > +(match (usadd_right_part @0 @1) > + (negate (convert (gt @0 (plus:c @0 @1)))) > + (if (INTEGRAL_TYPE_P (type) > + && TYPE_UNSIGNED (TREE_TYPE (@0)) > + && types_match (type, TREE_TYPE (@0)) > + && types_match (type, TREE_TYPE (@1))))) > + > +/* Unsigned saturation add, case 1 (branchless): > + SAT_U_ADD = (X + Y) | - ((X + Y) < X) or > + SAT_U_ADD = (X + Y) | - (X > (X + Y)). */ > +(match (unsigned_integer_sat_add @0 @1) > + (bit_ior:c (usadd_left_part @0 @1) (usadd_right_part @0 @1))) > + > /* x > y && x != XXX_MIN --> x > y > x > y && x == XXX_MIN --> false . 
*/ > (for eqne (eq ne) > diff --git a/gcc/optabs.def b/gcc/optabs.def > index ad14f9328b9..3f2cb46aff8 100644 > --- a/gcc/optabs.def > +++ b/gcc/optabs.def > @@ -111,8 +111,8 @@ OPTAB_NX(add_optab, "add$F$a3") > OPTAB_NX(add_optab, "add$Q$a3") > OPTAB_VL(addv_optab, "addv$I$a3", PLUS, "add", '3', gen_intv_fp_libfunc) > OPTAB_VX(addv_optab, "add$F$a3") > -OPTAB_NL(ssadd_optab, "ssadd$Q$a3", SS_PLUS, "ssadd", '3', > gen_signed_fixed_libfunc) > -OPTAB_NL(usadd_optab, "usadd$Q$a3", US_PLUS, "usadd", '3', > gen_unsigned_fixed_libfunc) > +OPTAB_NL(ssadd_optab, "ssadd$a3", SS_PLUS, "ssadd", '3', > gen_signed_fixed_libfunc) > +OPTAB_NL(usadd_optab, "usadd$a3", US_PLUS, "usadd", '3', > gen_unsigned_fixed_libfunc) > OPTAB_NL(sub_optab, "sub$P$a3", MINUS, "sub", '3', gen_int_fp_fixed_libfunc) > OPTAB_NX(sub_optab, "sub$F$a3") > OPTAB_NX(sub_optab, "sub$Q$a3") > diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc > index 705f4a4695a..35a46edc9f6 100644 > --- a/gcc/tree-ssa-math-opts.cc > +++ b/gcc/tree-ssa-math-opts.cc > @@ -4026,6 +4026,44 @@ arith_overflow_check_p (gimple *stmt, gimple > *cast_stmt, gimple *&use_stmt, > return 0; > } > > +extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree)); > + > +/* > + * Try to match saturation arith pattern(s). > + * 1. SAT_ADD (unsigned) > + * _7 = _4 + _6; > + * _8 = _4 > _7; > + * _9 = (long unsigned int) _8; > + * _10 = -_9; > + * _12 = _7 | _10; > + * => > + * _12 = .SAT_ADD (_4, _6); */ > +static bool > +match_saturation_arith (gimple_stmt_iterator *gsi, gimple *stmt, > + bool *cfg_changed_p) > +{ > + gcall *call = NULL; > + bool changed_p = false; > + > + gcc_assert (is_gimple_assign (stmt)); > + > + tree ops[2]; > + tree lhs = gimple_assign_lhs (stmt); > + > + if (gimple_unsigned_integer_sat_add (lhs, ops, NULL) > + && direct_internal_fn_supported_p (IFN_SAT_ADD, TREE_TYPE (lhs), > + OPTIMIZE_FOR_SPEED)) I think OPTIMIZE_FOR_BOTH is better here, since this is a win also when optimizing for size. > + { > + call = gimple_build_call_internal (IFN_SAT_ADD, 2, ops[0], ops[1]); > + gimple_call_set_lhs (call, lhs); > + gsi_replace (gsi, call, true); > + changed_p = true; > + *cfg_changed_p = changed_p; > + } > + > + return changed_p; > +} > + > /* Recognize for unsigned x > x = y - z; > if (x > y) > @@ -5886,6 +5924,14 @@ math_opts_dom_walker::after_dom_children > (basic_block bb) > > fma_deferring_state fma_state (param_avoid_fma_max_bits > 0); > > + for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi)) > + { > + gimple *stmt = gsi_stmt (gsi); > + > + if (is_gimple_assign (stmt)) > + match_saturation_arith (&gsi, stmt, m_cfg_changed_p); > + } > + Hmm why do you iterate independently over the statements? The block below already visits Every statement doesn't it? The root of your match is a BIT_IOR_EXPR expression, so I think you just need to change the entry below to: case BIT_IOR_EXPR: match_saturation_arith (&gsi, stmt, m_cfg_changed_p); /* fall-through */ case BIT_XOR_EXPR: match_uaddc_usubc (&gsi, stmt, code); break; Patch is looking good! Thanks again for working on this. Regards, Tamar > for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);) > { > gimple *stmt = gsi_stmt (gsi); > -- > 2.34.1
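As a quick illustration of the semantics being matched, the branchless expression from the commit message can be checked in isolation with a small standalone C program. This is a demo of the pattern only, not part of the patch; the helper name sat_add_u8 is invented for the example, and the assertions restate the uint8_t cases listed in the commit message.

#include <assert.h>
#include <stdint.h>

/* (x + y) | (-(TYPE)((TYPE)(x + y) < x)): when the add wraps, the
   comparison yields 1, negation turns it into an all-ones mask, and the
   OR forces the result to the maximum value of the type.  */
static uint8_t
sat_add_u8 (uint8_t x, uint8_t y)
{
  return (uint8_t) (x + y) | (uint8_t) -(uint8_t) ((uint8_t) (x + y) < x);
}

int
main (void)
{
  assert (sat_add_u8 (1, 254) == 255);   /* exact sum, no overflow */
  assert (sat_add_u8 (1, 255) == 255);   /* wraps, saturates to max */
  assert (sat_add_u8 (2, 255) == 255);
  assert (sat_add_u8 (255, 255) == 255);
  assert (sat_add_u8 (100, 100) == 200); /* non-saturating case */
  return 0;
}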
Thanks Tamer for comments. > I think OPTIMIZE_FOR_BOTH is better here, since this is a win also when optimizing for size. Sure thing, let me update it in v5. > Hmm why do you iterate independently over the statements? The block below already visits > Every statement doesn't it? Because it will hit .ADD_OVERFLOW first, then it will never hit SAT_ADD as the shape changed, or shall we put it to the previous pass ? > The root of your match is a BIT_IOR_EXPR expression, so I think you just need to change the entry below to: > > case BIT_IOR_EXPR: > match_saturation_arith (&gsi, stmt, m_cfg_changed_p); > /* fall-through */ > case BIT_XOR_EXPR: > match_uaddc_usubc (&gsi, stmt, code); > break; There are other shapes (not covered in this patch) of SAT_ADD like below branch version, the IOR should be one of the ROOT. Thus doesn't add case here. Then, shall we take case for each shape here ? Both works for me. #define SAT_ADD_U_1(T) \ T sat_add_u_1_##T(T x, T y) \ { \ return (T)(x + y) >= x ? (x + y) : -1; \ } SAT_ADD_U_1(uint32_t) Pan -----Original Message----- From: Tamar Christina <Tamar.Christina@arm.com> Sent: Monday, May 13, 2024 5:10 PM To: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; Liu, Hongtao <hongtao.liu@intel.com> Subject: RE: [PATCH v4 1/3] Internal-fn: Support new IFN SAT_ADD for unsigned scalar int Hi Pan, > -----Original Message----- > From: pan2.li@intel.com <pan2.li@intel.com> > Sent: Monday, May 6, 2024 3:48 PM > To: gcc-patches@gcc.gnu.org > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; Tamar Christina > <Tamar.Christina@arm.com>; richard.guenther@gmail.com; > hongtao.liu@intel.com; Pan Li <pan2.li@intel.com> > Subject: [PATCH v4 1/3] Internal-fn: Support new IFN SAT_ADD for unsigned scalar > int > > From: Pan Li <pan2.li@intel.com> > > This patch would like to add the middle-end presentation for the > saturation add. Aka set the result of add to the max when overflow. > It will take the pattern similar as below. > > SAT_ADD (x, y) => (x + y) | (-(TYPE)((TYPE)(x + y) < x)) > > Take uint8_t as example, we will have: > > * SAT_ADD (1, 254) => 255. > * SAT_ADD (1, 255) => 255. > * SAT_ADD (2, 255) => 255. > * SAT_ADD (255, 255) => 255. > > Given below example for the unsigned scalar integer uint64_t: > > uint64_t sat_add_u64 (uint64_t x, uint64_t y) > { > return (x + y) | (- (uint64_t)((uint64_t)(x + y) < x)); > } > > Before this patch: > uint64_t sat_add_uint64_t (uint64_t x, uint64_t y) > { > long unsigned int _1; > _Bool _2; > long unsigned int _3; > long unsigned int _4; > uint64_t _7; > long unsigned int _10; > __complex__ long unsigned int _11; > > ;; basic block 2, loop depth 0 > ;; pred: ENTRY > _11 = .ADD_OVERFLOW (x_5(D), y_6(D)); > _1 = REALPART_EXPR <_11>; > _10 = IMAGPART_EXPR <_11>; > _2 = _10 != 0; > _3 = (long unsigned int) _2; > _4 = -_3; > _7 = _1 | _4; > return _7; > ;; succ: EXIT > > } > > After this patch: > uint64_t sat_add_uint64_t (uint64_t x, uint64_t y) > { > uint64_t _7; > > ;; basic block 2, loop depth 0 > ;; pred: ENTRY > _7 = .SAT_ADD (x_5(D), y_6(D)); [tail call] > return _7; > ;; succ: EXIT > } > > We perform the tranform during widen_mult because that the sub-expr of > SAT_ADD will be optimized to .ADD_OVERFLOW. We need to try the .SAT_ADD > pattern first and then .ADD_OVERFLOW, or we may never catch the pattern > .SAT_ADD. 
Meanwhile, the isel pass is after widen_mult and then we > cannot perform the .SAT_ADD pattern match as the sub-expr will be > optmized to .ADD_OVERFLOW first. > > The below tests are passed for this patch: > 1. The riscv fully regression tests. > 2. The aarch64 fully regression tests. > 3. The x86 bootstrap tests. > 4. The x86 fully regression tests. > > PR target/51492 > PR target/112600 > > gcc/ChangeLog: > > * internal-fn.cc (commutative_binary_fn_p): Add type IFN_SAT_ADD > to the return true switch case(s). > * internal-fn.def (SAT_ADD): Add new signed optab SAT_ADD. > * match.pd: Add unsigned SAT_ADD match. > * optabs.def (OPTAB_NL): Remove fixed-point limitation for us/ssadd. > * tree-ssa-math-opts.cc (gimple_unsigned_integer_sat_add): New extern > func decl generated in match.pd match. > (match_saturation_arith): New func impl to match the saturation arith. > (math_opts_dom_walker::after_dom_children): Try match saturation > arith. > > Signed-off-by: Pan Li <pan2.li@intel.com> > --- > gcc/internal-fn.cc | 1 + > gcc/internal-fn.def | 2 ++ > gcc/match.pd | 28 ++++++++++++++++++++++++ > gcc/optabs.def | 4 ++-- > gcc/tree-ssa-math-opts.cc | 46 > +++++++++++++++++++++++++++++++++++++++ > 5 files changed, 79 insertions(+), 2 deletions(-) > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc > index 0a7053c2286..73045ca8c8c 100644 > --- a/gcc/internal-fn.cc > +++ b/gcc/internal-fn.cc > @@ -4202,6 +4202,7 @@ commutative_binary_fn_p (internal_fn fn) > case IFN_UBSAN_CHECK_MUL: > case IFN_ADD_OVERFLOW: > case IFN_MUL_OVERFLOW: > + case IFN_SAT_ADD: > case IFN_VEC_WIDEN_PLUS: > case IFN_VEC_WIDEN_PLUS_LO: > case IFN_VEC_WIDEN_PLUS_HI: > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def > index 848bb9dbff3..25badbb86e5 100644 > --- a/gcc/internal-fn.def > +++ b/gcc/internal-fn.def > @@ -275,6 +275,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, ECF_CONST > | ECF_NOTHROW, first, > DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, > first, > smulhrs, umulhrs, binary) > > +DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_ADD, ECF_CONST, first, ssadd, usadd, > binary) > + > DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary) > DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary) > DEF_INTERNAL_COND_FN (MUL, ECF_CONST, smul, binary) > diff --git a/gcc/match.pd b/gcc/match.pd > index d401e7503e6..7058e4cbe29 100644 > --- a/gcc/match.pd > +++ b/gcc/match.pd > @@ -3043,6 +3043,34 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > || POINTER_TYPE_P (itype)) > && wi::eq_p (wi::to_wide (int_cst), wi::max_value (itype)))))) > > +/* Unsigned Saturation Add */ > +(match (usadd_left_part @0 @1) > + (plus:c @0 @1) > + (if (INTEGRAL_TYPE_P (type) > + && TYPE_UNSIGNED (TREE_TYPE (@0)) > + && types_match (type, TREE_TYPE (@0)) > + && types_match (type, TREE_TYPE (@1))))) > + > +(match (usadd_right_part @0 @1) > + (negate (convert (lt (plus:c @0 @1) @0))) > + (if (INTEGRAL_TYPE_P (type) > + && TYPE_UNSIGNED (TREE_TYPE (@0)) > + && types_match (type, TREE_TYPE (@0)) > + && types_match (type, TREE_TYPE (@1))))) > + > +(match (usadd_right_part @0 @1) > + (negate (convert (gt @0 (plus:c @0 @1)))) > + (if (INTEGRAL_TYPE_P (type) > + && TYPE_UNSIGNED (TREE_TYPE (@0)) > + && types_match (type, TREE_TYPE (@0)) > + && types_match (type, TREE_TYPE (@1))))) > + > +/* Unsigned saturation add, case 1 (branchless): > + SAT_U_ADD = (X + Y) | - ((X + Y) < X) or > + SAT_U_ADD = (X + Y) | - (X > (X + Y)). 
*/ > +(match (unsigned_integer_sat_add @0 @1) > + (bit_ior:c (usadd_left_part @0 @1) (usadd_right_part @0 @1))) > + > /* x > y && x != XXX_MIN --> x > y > x > y && x == XXX_MIN --> false . */ > (for eqne (eq ne) > diff --git a/gcc/optabs.def b/gcc/optabs.def > index ad14f9328b9..3f2cb46aff8 100644 > --- a/gcc/optabs.def > +++ b/gcc/optabs.def > @@ -111,8 +111,8 @@ OPTAB_NX(add_optab, "add$F$a3") > OPTAB_NX(add_optab, "add$Q$a3") > OPTAB_VL(addv_optab, "addv$I$a3", PLUS, "add", '3', gen_intv_fp_libfunc) > OPTAB_VX(addv_optab, "add$F$a3") > -OPTAB_NL(ssadd_optab, "ssadd$Q$a3", SS_PLUS, "ssadd", '3', > gen_signed_fixed_libfunc) > -OPTAB_NL(usadd_optab, "usadd$Q$a3", US_PLUS, "usadd", '3', > gen_unsigned_fixed_libfunc) > +OPTAB_NL(ssadd_optab, "ssadd$a3", SS_PLUS, "ssadd", '3', > gen_signed_fixed_libfunc) > +OPTAB_NL(usadd_optab, "usadd$a3", US_PLUS, "usadd", '3', > gen_unsigned_fixed_libfunc) > OPTAB_NL(sub_optab, "sub$P$a3", MINUS, "sub", '3', gen_int_fp_fixed_libfunc) > OPTAB_NX(sub_optab, "sub$F$a3") > OPTAB_NX(sub_optab, "sub$Q$a3") > diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc > index 705f4a4695a..35a46edc9f6 100644 > --- a/gcc/tree-ssa-math-opts.cc > +++ b/gcc/tree-ssa-math-opts.cc > @@ -4026,6 +4026,44 @@ arith_overflow_check_p (gimple *stmt, gimple > *cast_stmt, gimple *&use_stmt, > return 0; > } > > +extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree)); > + > +/* > + * Try to match saturation arith pattern(s). > + * 1. SAT_ADD (unsigned) > + * _7 = _4 + _6; > + * _8 = _4 > _7; > + * _9 = (long unsigned int) _8; > + * _10 = -_9; > + * _12 = _7 | _10; > + * => > + * _12 = .SAT_ADD (_4, _6); */ > +static bool > +match_saturation_arith (gimple_stmt_iterator *gsi, gimple *stmt, > + bool *cfg_changed_p) > +{ > + gcall *call = NULL; > + bool changed_p = false; > + > + gcc_assert (is_gimple_assign (stmt)); > + > + tree ops[2]; > + tree lhs = gimple_assign_lhs (stmt); > + > + if (gimple_unsigned_integer_sat_add (lhs, ops, NULL) > + && direct_internal_fn_supported_p (IFN_SAT_ADD, TREE_TYPE (lhs), > + OPTIMIZE_FOR_SPEED)) I think OPTIMIZE_FOR_BOTH is better here, since this is a win also when optimizing for size. > + { > + call = gimple_build_call_internal (IFN_SAT_ADD, 2, ops[0], ops[1]); > + gimple_call_set_lhs (call, lhs); > + gsi_replace (gsi, call, true); > + changed_p = true; > + *cfg_changed_p = changed_p; > + } > + > + return changed_p; > +} > + > /* Recognize for unsigned x > x = y - z; > if (x > y) > @@ -5886,6 +5924,14 @@ math_opts_dom_walker::after_dom_children > (basic_block bb) > > fma_deferring_state fma_state (param_avoid_fma_max_bits > 0); > > + for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi)) > + { > + gimple *stmt = gsi_stmt (gsi); > + > + if (is_gimple_assign (stmt)) > + match_saturation_arith (&gsi, stmt, m_cfg_changed_p); > + } > + Hmm why do you iterate independently over the statements? The block below already visits Every statement doesn't it? The root of your match is a BIT_IOR_EXPR expression, so I think you just need to change the entry below to: case BIT_IOR_EXPR: match_saturation_arith (&gsi, stmt, m_cfg_changed_p); /* fall-through */ case BIT_XOR_EXPR: match_uaddc_usubc (&gsi, stmt, code); break; Patch is looking good! Thanks again for working on this. Regards, Tamar > for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);) > { > gimple *stmt = gsi_stmt (gsi); > -- > 2.34.1
> > Thanks Tamer for comments. > > > I think OPTIMIZE_FOR_BOTH is better here, since this is a win also when > optimizing for size. > > Sure thing, let me update it in v5. > > > Hmm why do you iterate independently over the statements? The block below > already visits > > Every statement doesn't it? > > Because it will hit .ADD_OVERFLOW first, then it will never hit SAT_ADD as the > shape changed, or shall we put it to the previous pass ? > That's just a matter of matching the overflow as an additional case no? i.e. you can add an overload for unsigned_integer_sat_add matching the IFN_ ADD_OVERFLOW and using the realpart and imagpart helpers. I think that would be better as it avoid visiting all the statements twice but also extends the matching to some __builtin_add_overflow uses and should be fairly simple. > > The root of your match is a BIT_IOR_EXPR expression, so I think you just need to > change the entry below to: > > > > case BIT_IOR_EXPR: > > match_saturation_arith (&gsi, stmt, m_cfg_changed_p); > > /* fall-through */ > > case BIT_XOR_EXPR: > > match_uaddc_usubc (&gsi, stmt, code); > > break; > > There are other shapes (not covered in this patch) of SAT_ADD like below branch > version, the IOR should be one of the ROOT. Thus doesn't > add case here. Then, shall we take case for each shape here ? Both works for me. > Yeah, I think that's better than iterating over the statements twice. It also fits better In the existing code. Tamar. > #define SAT_ADD_U_1(T) \ > T sat_add_u_1_##T(T x, T y) \ > { \ > return (T)(x + y) >= x ? (x + y) : -1; \ > } > > SAT_ADD_U_1(uint32_t) > > Pan > > > -----Original Message----- > From: Tamar Christina <Tamar.Christina@arm.com> > Sent: Monday, May 13, 2024 5:10 PM > To: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; > Liu, Hongtao <hongtao.liu@intel.com> > Subject: RE: [PATCH v4 1/3] Internal-fn: Support new IFN SAT_ADD for unsigned > scalar int > > Hi Pan, > > > -----Original Message----- > > From: pan2.li@intel.com <pan2.li@intel.com> > > Sent: Monday, May 6, 2024 3:48 PM > > To: gcc-patches@gcc.gnu.org > > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; Tamar Christina > > <Tamar.Christina@arm.com>; richard.guenther@gmail.com; > > hongtao.liu@intel.com; Pan Li <pan2.li@intel.com> > > Subject: [PATCH v4 1/3] Internal-fn: Support new IFN SAT_ADD for unsigned > scalar > > int > > > > From: Pan Li <pan2.li@intel.com> > > > > This patch would like to add the middle-end presentation for the > > saturation add. Aka set the result of add to the max when overflow. > > It will take the pattern similar as below. > > > > SAT_ADD (x, y) => (x + y) | (-(TYPE)((TYPE)(x + y) < x)) > > > > Take uint8_t as example, we will have: > > > > * SAT_ADD (1, 254) => 255. > > * SAT_ADD (1, 255) => 255. > > * SAT_ADD (2, 255) => 255. > > * SAT_ADD (255, 255) => 255. 
> > > > Given below example for the unsigned scalar integer uint64_t: > > > > uint64_t sat_add_u64 (uint64_t x, uint64_t y) > > { > > return (x + y) | (- (uint64_t)((uint64_t)(x + y) < x)); > > } > > > > Before this patch: > > uint64_t sat_add_uint64_t (uint64_t x, uint64_t y) > > { > > long unsigned int _1; > > _Bool _2; > > long unsigned int _3; > > long unsigned int _4; > > uint64_t _7; > > long unsigned int _10; > > __complex__ long unsigned int _11; > > > > ;; basic block 2, loop depth 0 > > ;; pred: ENTRY > > _11 = .ADD_OVERFLOW (x_5(D), y_6(D)); > > _1 = REALPART_EXPR <_11>; > > _10 = IMAGPART_EXPR <_11>; > > _2 = _10 != 0; > > _3 = (long unsigned int) _2; > > _4 = -_3; > > _7 = _1 | _4; > > return _7; > > ;; succ: EXIT > > > > } > > > > After this patch: > > uint64_t sat_add_uint64_t (uint64_t x, uint64_t y) > > { > > uint64_t _7; > > > > ;; basic block 2, loop depth 0 > > ;; pred: ENTRY > > _7 = .SAT_ADD (x_5(D), y_6(D)); [tail call] > > return _7; > > ;; succ: EXIT > > } > > > > We perform the tranform during widen_mult because that the sub-expr of > > SAT_ADD will be optimized to .ADD_OVERFLOW. We need to try the .SAT_ADD > > pattern first and then .ADD_OVERFLOW, or we may never catch the pattern > > .SAT_ADD. Meanwhile, the isel pass is after widen_mult and then we > > cannot perform the .SAT_ADD pattern match as the sub-expr will be > > optmized to .ADD_OVERFLOW first. > > > > The below tests are passed for this patch: > > 1. The riscv fully regression tests. > > 2. The aarch64 fully regression tests. > > 3. The x86 bootstrap tests. > > 4. The x86 fully regression tests. > > > > PR target/51492 > > PR target/112600 > > > > gcc/ChangeLog: > > > > * internal-fn.cc (commutative_binary_fn_p): Add type IFN_SAT_ADD > > to the return true switch case(s). > > * internal-fn.def (SAT_ADD): Add new signed optab SAT_ADD. > > * match.pd: Add unsigned SAT_ADD match. > > * optabs.def (OPTAB_NL): Remove fixed-point limitation for us/ssadd. > > * tree-ssa-math-opts.cc (gimple_unsigned_integer_sat_add): New extern > > func decl generated in match.pd match. > > (match_saturation_arith): New func impl to match the saturation arith. > > (math_opts_dom_walker::after_dom_children): Try match saturation > > arith. 
> > > > Signed-off-by: Pan Li <pan2.li@intel.com> > > --- > > gcc/internal-fn.cc | 1 + > > gcc/internal-fn.def | 2 ++ > > gcc/match.pd | 28 ++++++++++++++++++++++++ > > gcc/optabs.def | 4 ++-- > > gcc/tree-ssa-math-opts.cc | 46 > > +++++++++++++++++++++++++++++++++++++++ > > 5 files changed, 79 insertions(+), 2 deletions(-) > > > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc > > index 0a7053c2286..73045ca8c8c 100644 > > --- a/gcc/internal-fn.cc > > +++ b/gcc/internal-fn.cc > > @@ -4202,6 +4202,7 @@ commutative_binary_fn_p (internal_fn fn) > > case IFN_UBSAN_CHECK_MUL: > > case IFN_ADD_OVERFLOW: > > case IFN_MUL_OVERFLOW: > > + case IFN_SAT_ADD: > > case IFN_VEC_WIDEN_PLUS: > > case IFN_VEC_WIDEN_PLUS_LO: > > case IFN_VEC_WIDEN_PLUS_HI: > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def > > index 848bb9dbff3..25badbb86e5 100644 > > --- a/gcc/internal-fn.def > > +++ b/gcc/internal-fn.def > > @@ -275,6 +275,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, > ECF_CONST > > | ECF_NOTHROW, first, > > DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, > > first, > > smulhrs, umulhrs, binary) > > > > +DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_ADD, ECF_CONST, first, ssadd, usadd, > > binary) > > + > > DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary) > > DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary) > > DEF_INTERNAL_COND_FN (MUL, ECF_CONST, smul, binary) > > diff --git a/gcc/match.pd b/gcc/match.pd > > index d401e7503e6..7058e4cbe29 100644 > > --- a/gcc/match.pd > > +++ b/gcc/match.pd > > @@ -3043,6 +3043,34 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > > || POINTER_TYPE_P (itype)) > > && wi::eq_p (wi::to_wide (int_cst), wi::max_value (itype)))))) > > > > +/* Unsigned Saturation Add */ > > +(match (usadd_left_part @0 @1) > > + (plus:c @0 @1) > > + (if (INTEGRAL_TYPE_P (type) > > + && TYPE_UNSIGNED (TREE_TYPE (@0)) > > + && types_match (type, TREE_TYPE (@0)) > > + && types_match (type, TREE_TYPE (@1))))) > > + > > +(match (usadd_right_part @0 @1) > > + (negate (convert (lt (plus:c @0 @1) @0))) > > + (if (INTEGRAL_TYPE_P (type) > > + && TYPE_UNSIGNED (TREE_TYPE (@0)) > > + && types_match (type, TREE_TYPE (@0)) > > + && types_match (type, TREE_TYPE (@1))))) > > + > > +(match (usadd_right_part @0 @1) > > + (negate (convert (gt @0 (plus:c @0 @1)))) > > + (if (INTEGRAL_TYPE_P (type) > > + && TYPE_UNSIGNED (TREE_TYPE (@0)) > > + && types_match (type, TREE_TYPE (@0)) > > + && types_match (type, TREE_TYPE (@1))))) > > + > > +/* Unsigned saturation add, case 1 (branchless): > > + SAT_U_ADD = (X + Y) | - ((X + Y) < X) or > > + SAT_U_ADD = (X + Y) | - (X > (X + Y)). */ > > +(match (unsigned_integer_sat_add @0 @1) > > + (bit_ior:c (usadd_left_part @0 @1) (usadd_right_part @0 @1))) > > + > > /* x > y && x != XXX_MIN --> x > y > > x > y && x == XXX_MIN --> false . 
*/ > > (for eqne (eq ne) > > diff --git a/gcc/optabs.def b/gcc/optabs.def > > index ad14f9328b9..3f2cb46aff8 100644 > > --- a/gcc/optabs.def > > +++ b/gcc/optabs.def > > @@ -111,8 +111,8 @@ OPTAB_NX(add_optab, "add$F$a3") > > OPTAB_NX(add_optab, "add$Q$a3") > > OPTAB_VL(addv_optab, "addv$I$a3", PLUS, "add", '3', gen_intv_fp_libfunc) > > OPTAB_VX(addv_optab, "add$F$a3") > > -OPTAB_NL(ssadd_optab, "ssadd$Q$a3", SS_PLUS, "ssadd", '3', > > gen_signed_fixed_libfunc) > > -OPTAB_NL(usadd_optab, "usadd$Q$a3", US_PLUS, "usadd", '3', > > gen_unsigned_fixed_libfunc) > > +OPTAB_NL(ssadd_optab, "ssadd$a3", SS_PLUS, "ssadd", '3', > > gen_signed_fixed_libfunc) > > +OPTAB_NL(usadd_optab, "usadd$a3", US_PLUS, "usadd", '3', > > gen_unsigned_fixed_libfunc) > > OPTAB_NL(sub_optab, "sub$P$a3", MINUS, "sub", '3', > gen_int_fp_fixed_libfunc) > > OPTAB_NX(sub_optab, "sub$F$a3") > > OPTAB_NX(sub_optab, "sub$Q$a3") > > diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc > > index 705f4a4695a..35a46edc9f6 100644 > > --- a/gcc/tree-ssa-math-opts.cc > > +++ b/gcc/tree-ssa-math-opts.cc > > @@ -4026,6 +4026,44 @@ arith_overflow_check_p (gimple *stmt, gimple > > *cast_stmt, gimple *&use_stmt, > > return 0; > > } > > > > +extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree)); > > + > > +/* > > + * Try to match saturation arith pattern(s). > > + * 1. SAT_ADD (unsigned) > > + * _7 = _4 + _6; > > + * _8 = _4 > _7; > > + * _9 = (long unsigned int) _8; > > + * _10 = -_9; > > + * _12 = _7 | _10; > > + * => > > + * _12 = .SAT_ADD (_4, _6); */ > > +static bool > > +match_saturation_arith (gimple_stmt_iterator *gsi, gimple *stmt, > > + bool *cfg_changed_p) > > +{ > > + gcall *call = NULL; > > + bool changed_p = false; > > + > > + gcc_assert (is_gimple_assign (stmt)); > > + > > + tree ops[2]; > > + tree lhs = gimple_assign_lhs (stmt); > > + > > + if (gimple_unsigned_integer_sat_add (lhs, ops, NULL) > > + && direct_internal_fn_supported_p (IFN_SAT_ADD, TREE_TYPE (lhs), > > + OPTIMIZE_FOR_SPEED)) > > I think OPTIMIZE_FOR_BOTH is better here, since this is a win also when optimizing > for size. > > + { > > + call = gimple_build_call_internal (IFN_SAT_ADD, 2, ops[0], ops[1]); > > + gimple_call_set_lhs (call, lhs); > > + gsi_replace (gsi, call, true); > > + changed_p = true; > > + *cfg_changed_p = changed_p; > > + } > > + > > + return changed_p; > > +} > > + > > /* Recognize for unsigned x > > x = y - z; > > if (x > y) > > @@ -5886,6 +5924,14 @@ math_opts_dom_walker::after_dom_children > > (basic_block bb) > > > > fma_deferring_state fma_state (param_avoid_fma_max_bits > 0); > > > > + for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi)) > > + { > > + gimple *stmt = gsi_stmt (gsi); > > + > > + if (is_gimple_assign (stmt)) > > + match_saturation_arith (&gsi, stmt, m_cfg_changed_p); > > + } > > + > > Hmm why do you iterate independently over the statements? The block below > already visits > Every statement doesn't it? > > The root of your match is a BIT_IOR_EXPR expression, so I think you just need to > change the entry below to: > > case BIT_IOR_EXPR: > match_saturation_arith (&gsi, stmt, m_cfg_changed_p); > /* fall-through */ > case BIT_XOR_EXPR: > match_uaddc_usubc (&gsi, stmt, code); > break; > > Patch is looking good! Thanks again for working on this. > > Regards, > Tamar > > > for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);) > > { > > gimple *stmt = gsi_stmt (gsi); > > -- > > 2.34.1
> That's just a matter of matching the overflow as an additional case no? > i.e. you can add an overload for unsigned_integer_sat_add matching the > IFN_ ADD_OVERFLOW and using the realpart and imagpart helpers. > I think that would be better as it avoid visiting all the statements twice but also > extends the matching to some __builtin_add_overflow uses and should be fairly > simple. Thanks Tamar, got the point here, will have a try with overload unsigned_integer_sat_add for that. > Yeah, I think that's better than iterating over the statements twice. It also fits better > In the existing code. Ack, will follow the existing code. Pan -----Original Message----- From: Tamar Christina <Tamar.Christina@arm.com> Sent: Monday, May 13, 2024 11:03 PM To: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; Liu, Hongtao <hongtao.liu@intel.com> Subject: RE: [PATCH v4 1/3] Internal-fn: Support new IFN SAT_ADD for unsigned scalar int > > Thanks Tamer for comments. > > > I think OPTIMIZE_FOR_BOTH is better here, since this is a win also when > optimizing for size. > > Sure thing, let me update it in v5. > > > Hmm why do you iterate independently over the statements? The block below > already visits > > Every statement doesn't it? > > Because it will hit .ADD_OVERFLOW first, then it will never hit SAT_ADD as the > shape changed, or shall we put it to the previous pass ? > That's just a matter of matching the overflow as an additional case no? i.e. you can add an overload for unsigned_integer_sat_add matching the IFN_ ADD_OVERFLOW and using the realpart and imagpart helpers. I think that would be better as it avoid visiting all the statements twice but also extends the matching to some __builtin_add_overflow uses and should be fairly simple. > > The root of your match is a BIT_IOR_EXPR expression, so I think you just need to > change the entry below to: > > > > case BIT_IOR_EXPR: > > match_saturation_arith (&gsi, stmt, m_cfg_changed_p); > > /* fall-through */ > > case BIT_XOR_EXPR: > > match_uaddc_usubc (&gsi, stmt, code); > > break; > > There are other shapes (not covered in this patch) of SAT_ADD like below branch > version, the IOR should be one of the ROOT. Thus doesn't > add case here. Then, shall we take case for each shape here ? Both works for me. > Yeah, I think that's better than iterating over the statements twice. It also fits better In the existing code. Tamar. > #define SAT_ADD_U_1(T) \ > T sat_add_u_1_##T(T x, T y) \ > { \ > return (T)(x + y) >= x ? 
(x + y) : -1; \ > } > > SAT_ADD_U_1(uint32_t) > > Pan > > > -----Original Message----- > From: Tamar Christina <Tamar.Christina@arm.com> > Sent: Monday, May 13, 2024 5:10 PM > To: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; > Liu, Hongtao <hongtao.liu@intel.com> > Subject: RE: [PATCH v4 1/3] Internal-fn: Support new IFN SAT_ADD for unsigned > scalar int > > Hi Pan, > > > -----Original Message----- > > From: pan2.li@intel.com <pan2.li@intel.com> > > Sent: Monday, May 6, 2024 3:48 PM > > To: gcc-patches@gcc.gnu.org > > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; Tamar Christina > > <Tamar.Christina@arm.com>; richard.guenther@gmail.com; > > hongtao.liu@intel.com; Pan Li <pan2.li@intel.com> > > Subject: [PATCH v4 1/3] Internal-fn: Support new IFN SAT_ADD for unsigned > scalar > > int > > > > From: Pan Li <pan2.li@intel.com> > > > > This patch would like to add the middle-end presentation for the > > saturation add. Aka set the result of add to the max when overflow. > > It will take the pattern similar as below. > > > > SAT_ADD (x, y) => (x + y) | (-(TYPE)((TYPE)(x + y) < x)) > > > > Take uint8_t as example, we will have: > > > > * SAT_ADD (1, 254) => 255. > > * SAT_ADD (1, 255) => 255. > > * SAT_ADD (2, 255) => 255. > > * SAT_ADD (255, 255) => 255. > > > > Given below example for the unsigned scalar integer uint64_t: > > > > uint64_t sat_add_u64 (uint64_t x, uint64_t y) > > { > > return (x + y) | (- (uint64_t)((uint64_t)(x + y) < x)); > > } > > > > Before this patch: > > uint64_t sat_add_uint64_t (uint64_t x, uint64_t y) > > { > > long unsigned int _1; > > _Bool _2; > > long unsigned int _3; > > long unsigned int _4; > > uint64_t _7; > > long unsigned int _10; > > __complex__ long unsigned int _11; > > > > ;; basic block 2, loop depth 0 > > ;; pred: ENTRY > > _11 = .ADD_OVERFLOW (x_5(D), y_6(D)); > > _1 = REALPART_EXPR <_11>; > > _10 = IMAGPART_EXPR <_11>; > > _2 = _10 != 0; > > _3 = (long unsigned int) _2; > > _4 = -_3; > > _7 = _1 | _4; > > return _7; > > ;; succ: EXIT > > > > } > > > > After this patch: > > uint64_t sat_add_uint64_t (uint64_t x, uint64_t y) > > { > > uint64_t _7; > > > > ;; basic block 2, loop depth 0 > > ;; pred: ENTRY > > _7 = .SAT_ADD (x_5(D), y_6(D)); [tail call] > > return _7; > > ;; succ: EXIT > > } > > > > We perform the tranform during widen_mult because that the sub-expr of > > SAT_ADD will be optimized to .ADD_OVERFLOW. We need to try the .SAT_ADD > > pattern first and then .ADD_OVERFLOW, or we may never catch the pattern > > .SAT_ADD. Meanwhile, the isel pass is after widen_mult and then we > > cannot perform the .SAT_ADD pattern match as the sub-expr will be > > optmized to .ADD_OVERFLOW first. > > > > The below tests are passed for this patch: > > 1. The riscv fully regression tests. > > 2. The aarch64 fully regression tests. > > 3. The x86 bootstrap tests. > > 4. The x86 fully regression tests. > > > > PR target/51492 > > PR target/112600 > > > > gcc/ChangeLog: > > > > * internal-fn.cc (commutative_binary_fn_p): Add type IFN_SAT_ADD > > to the return true switch case(s). > > * internal-fn.def (SAT_ADD): Add new signed optab SAT_ADD. > > * match.pd: Add unsigned SAT_ADD match. > > * optabs.def (OPTAB_NL): Remove fixed-point limitation for us/ssadd. > > * tree-ssa-math-opts.cc (gimple_unsigned_integer_sat_add): New extern > > func decl generated in match.pd match. > > (match_saturation_arith): New func impl to match the saturation arith. 
> > (math_opts_dom_walker::after_dom_children): Try match saturation > > arith. > > > > Signed-off-by: Pan Li <pan2.li@intel.com> > > --- > > gcc/internal-fn.cc | 1 + > > gcc/internal-fn.def | 2 ++ > > gcc/match.pd | 28 ++++++++++++++++++++++++ > > gcc/optabs.def | 4 ++-- > > gcc/tree-ssa-math-opts.cc | 46 > > +++++++++++++++++++++++++++++++++++++++ > > 5 files changed, 79 insertions(+), 2 deletions(-) > > > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc > > index 0a7053c2286..73045ca8c8c 100644 > > --- a/gcc/internal-fn.cc > > +++ b/gcc/internal-fn.cc > > @@ -4202,6 +4202,7 @@ commutative_binary_fn_p (internal_fn fn) > > case IFN_UBSAN_CHECK_MUL: > > case IFN_ADD_OVERFLOW: > > case IFN_MUL_OVERFLOW: > > + case IFN_SAT_ADD: > > case IFN_VEC_WIDEN_PLUS: > > case IFN_VEC_WIDEN_PLUS_LO: > > case IFN_VEC_WIDEN_PLUS_HI: > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def > > index 848bb9dbff3..25badbb86e5 100644 > > --- a/gcc/internal-fn.def > > +++ b/gcc/internal-fn.def > > @@ -275,6 +275,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, > ECF_CONST > > | ECF_NOTHROW, first, > > DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, > > first, > > smulhrs, umulhrs, binary) > > > > +DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_ADD, ECF_CONST, first, ssadd, usadd, > > binary) > > + > > DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary) > > DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary) > > DEF_INTERNAL_COND_FN (MUL, ECF_CONST, smul, binary) > > diff --git a/gcc/match.pd b/gcc/match.pd > > index d401e7503e6..7058e4cbe29 100644 > > --- a/gcc/match.pd > > +++ b/gcc/match.pd > > @@ -3043,6 +3043,34 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > > || POINTER_TYPE_P (itype)) > > && wi::eq_p (wi::to_wide (int_cst), wi::max_value (itype)))))) > > > > +/* Unsigned Saturation Add */ > > +(match (usadd_left_part @0 @1) > > + (plus:c @0 @1) > > + (if (INTEGRAL_TYPE_P (type) > > + && TYPE_UNSIGNED (TREE_TYPE (@0)) > > + && types_match (type, TREE_TYPE (@0)) > > + && types_match (type, TREE_TYPE (@1))))) > > + > > +(match (usadd_right_part @0 @1) > > + (negate (convert (lt (plus:c @0 @1) @0))) > > + (if (INTEGRAL_TYPE_P (type) > > + && TYPE_UNSIGNED (TREE_TYPE (@0)) > > + && types_match (type, TREE_TYPE (@0)) > > + && types_match (type, TREE_TYPE (@1))))) > > + > > +(match (usadd_right_part @0 @1) > > + (negate (convert (gt @0 (plus:c @0 @1)))) > > + (if (INTEGRAL_TYPE_P (type) > > + && TYPE_UNSIGNED (TREE_TYPE (@0)) > > + && types_match (type, TREE_TYPE (@0)) > > + && types_match (type, TREE_TYPE (@1))))) > > + > > +/* Unsigned saturation add, case 1 (branchless): > > + SAT_U_ADD = (X + Y) | - ((X + Y) < X) or > > + SAT_U_ADD = (X + Y) | - (X > (X + Y)). */ > > +(match (unsigned_integer_sat_add @0 @1) > > + (bit_ior:c (usadd_left_part @0 @1) (usadd_right_part @0 @1))) > > + > > /* x > y && x != XXX_MIN --> x > y > > x > y && x == XXX_MIN --> false . 
*/ > > (for eqne (eq ne) > > diff --git a/gcc/optabs.def b/gcc/optabs.def > > index ad14f9328b9..3f2cb46aff8 100644 > > --- a/gcc/optabs.def > > +++ b/gcc/optabs.def > > @@ -111,8 +111,8 @@ OPTAB_NX(add_optab, "add$F$a3") > > OPTAB_NX(add_optab, "add$Q$a3") > > OPTAB_VL(addv_optab, "addv$I$a3", PLUS, "add", '3', gen_intv_fp_libfunc) > > OPTAB_VX(addv_optab, "add$F$a3") > > -OPTAB_NL(ssadd_optab, "ssadd$Q$a3", SS_PLUS, "ssadd", '3', > > gen_signed_fixed_libfunc) > > -OPTAB_NL(usadd_optab, "usadd$Q$a3", US_PLUS, "usadd", '3', > > gen_unsigned_fixed_libfunc) > > +OPTAB_NL(ssadd_optab, "ssadd$a3", SS_PLUS, "ssadd", '3', > > gen_signed_fixed_libfunc) > > +OPTAB_NL(usadd_optab, "usadd$a3", US_PLUS, "usadd", '3', > > gen_unsigned_fixed_libfunc) > > OPTAB_NL(sub_optab, "sub$P$a3", MINUS, "sub", '3', > gen_int_fp_fixed_libfunc) > > OPTAB_NX(sub_optab, "sub$F$a3") > > OPTAB_NX(sub_optab, "sub$Q$a3") > > diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc > > index 705f4a4695a..35a46edc9f6 100644 > > --- a/gcc/tree-ssa-math-opts.cc > > +++ b/gcc/tree-ssa-math-opts.cc > > @@ -4026,6 +4026,44 @@ arith_overflow_check_p (gimple *stmt, gimple > > *cast_stmt, gimple *&use_stmt, > > return 0; > > } > > > > +extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree)); > > + > > +/* > > + * Try to match saturation arith pattern(s). > > + * 1. SAT_ADD (unsigned) > > + * _7 = _4 + _6; > > + * _8 = _4 > _7; > > + * _9 = (long unsigned int) _8; > > + * _10 = -_9; > > + * _12 = _7 | _10; > > + * => > > + * _12 = .SAT_ADD (_4, _6); */ > > +static bool > > +match_saturation_arith (gimple_stmt_iterator *gsi, gimple *stmt, > > + bool *cfg_changed_p) > > +{ > > + gcall *call = NULL; > > + bool changed_p = false; > > + > > + gcc_assert (is_gimple_assign (stmt)); > > + > > + tree ops[2]; > > + tree lhs = gimple_assign_lhs (stmt); > > + > > + if (gimple_unsigned_integer_sat_add (lhs, ops, NULL) > > + && direct_internal_fn_supported_p (IFN_SAT_ADD, TREE_TYPE (lhs), > > + OPTIMIZE_FOR_SPEED)) > > I think OPTIMIZE_FOR_BOTH is better here, since this is a win also when optimizing > for size. > > + { > > + call = gimple_build_call_internal (IFN_SAT_ADD, 2, ops[0], ops[1]); > > + gimple_call_set_lhs (call, lhs); > > + gsi_replace (gsi, call, true); > > + changed_p = true; > > + *cfg_changed_p = changed_p; > > + } > > + > > + return changed_p; > > +} > > + > > /* Recognize for unsigned x > > x = y - z; > > if (x > y) > > @@ -5886,6 +5924,14 @@ math_opts_dom_walker::after_dom_children > > (basic_block bb) > > > > fma_deferring_state fma_state (param_avoid_fma_max_bits > 0); > > > > + for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi)) > > + { > > + gimple *stmt = gsi_stmt (gsi); > > + > > + if (is_gimple_assign (stmt)) > > + match_saturation_arith (&gsi, stmt, m_cfg_changed_p); > > + } > > + > > Hmm why do you iterate independently over the statements? The block below > already visits > Every statement doesn't it? > > The root of your match is a BIT_IOR_EXPR expression, so I think you just need to > change the entry below to: > > case BIT_IOR_EXPR: > match_saturation_arith (&gsi, stmt, m_cfg_changed_p); > /* fall-through */ > case BIT_XOR_EXPR: > match_uaddc_usubc (&gsi, stmt, code); > break; > > Patch is looking good! Thanks again for working on this. > > Regards, > Tamar > > > for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);) > > { > > gimple *stmt = gsi_stmt (gsi); > > -- > > 2.34.1
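The overload agreed on above, matching the form that has already been rewritten to .ADD_OVERFLOW, could plausibly be written directly in match.pd along the lines below. This is an untested sketch of the idea only: the helper names usadd_left_part_2 and usadd_right_part_2 are invented for illustration, and the type checks are simply copied from the existing usadd_left_part/usadd_right_part patterns in the posted patch.

/* Unsigned saturation add, matching the .ADD_OVERFLOW form directly
   (illustrative sketch, not part of the posted patch).  */
(match (usadd_left_part_2 @0 @1)
 (realpart (IFN_ADD_OVERFLOW:c @0 @1))
 (if (INTEGRAL_TYPE_P (type)
      && TYPE_UNSIGNED (TREE_TYPE (@0))
      && types_match (type, TREE_TYPE (@0))
      && types_match (type, TREE_TYPE (@1)))))

(match (usadd_right_part_2 @0 @1)
 (negate (convert (ne (imagpart (IFN_ADD_OVERFLOW:c @0 @1)) integer_zerop)))
 (if (INTEGRAL_TYPE_P (type)
      && TYPE_UNSIGNED (TREE_TYPE (@0))
      && types_match (type, TREE_TYPE (@0))
      && types_match (type, TREE_TYPE (@1)))))

(match (unsigned_integer_sat_add @0 @1)
 (bit_ior:c (usadd_left_part_2 @0 @1) (usadd_right_part_2 @0 @1)))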
On Mon, May 6, 2024 at 4:48 PM <pan2.li@intel.com> wrote: > > From: Pan Li <pan2.li@intel.com> > > This patch would like to add the middle-end presentation for the > saturation add. Aka set the result of add to the max when overflow. > It will take the pattern similar as below. > > SAT_ADD (x, y) => (x + y) | (-(TYPE)((TYPE)(x + y) < x)) > > Take uint8_t as example, we will have: > > * SAT_ADD (1, 254) => 255. > * SAT_ADD (1, 255) => 255. > * SAT_ADD (2, 255) => 255. > * SAT_ADD (255, 255) => 255. > > Given below example for the unsigned scalar integer uint64_t: > > uint64_t sat_add_u64 (uint64_t x, uint64_t y) > { > return (x + y) | (- (uint64_t)((uint64_t)(x + y) < x)); > } > > Before this patch: > uint64_t sat_add_uint64_t (uint64_t x, uint64_t y) > { > long unsigned int _1; > _Bool _2; > long unsigned int _3; > long unsigned int _4; > uint64_t _7; > long unsigned int _10; > __complex__ long unsigned int _11; > > ;; basic block 2, loop depth 0 > ;; pred: ENTRY > _11 = .ADD_OVERFLOW (x_5(D), y_6(D)); > _1 = REALPART_EXPR <_11>; > _10 = IMAGPART_EXPR <_11>; > _2 = _10 != 0; > _3 = (long unsigned int) _2; > _4 = -_3; > _7 = _1 | _4; > return _7; > ;; succ: EXIT > > } > > After this patch: > uint64_t sat_add_uint64_t (uint64_t x, uint64_t y) > { > uint64_t _7; > > ;; basic block 2, loop depth 0 > ;; pred: ENTRY > _7 = .SAT_ADD (x_5(D), y_6(D)); [tail call] > return _7; > ;; succ: EXIT > } > > We perform the tranform during widen_mult because that the sub-expr of > SAT_ADD will be optimized to .ADD_OVERFLOW. We need to try the .SAT_ADD > pattern first and then .ADD_OVERFLOW, or we may never catch the pattern > .SAT_ADD. Meanwhile, the isel pass is after widen_mult and then we > cannot perform the .SAT_ADD pattern match as the sub-expr will be > optmized to .ADD_OVERFLOW first. > > The below tests are passed for this patch: > 1. The riscv fully regression tests. > 2. The aarch64 fully regression tests. > 3. The x86 bootstrap tests. > 4. The x86 fully regression tests. > > PR target/51492 > PR target/112600 > > gcc/ChangeLog: > > * internal-fn.cc (commutative_binary_fn_p): Add type IFN_SAT_ADD > to the return true switch case(s). > * internal-fn.def (SAT_ADD): Add new signed optab SAT_ADD. > * match.pd: Add unsigned SAT_ADD match. > * optabs.def (OPTAB_NL): Remove fixed-point limitation for us/ssadd. > * tree-ssa-math-opts.cc (gimple_unsigned_integer_sat_add): New extern > func decl generated in match.pd match. > (match_saturation_arith): New func impl to match the saturation arith. > (math_opts_dom_walker::after_dom_children): Try match saturation > arith. 
> > Signed-off-by: Pan Li <pan2.li@intel.com> > --- > gcc/internal-fn.cc | 1 + > gcc/internal-fn.def | 2 ++ > gcc/match.pd | 28 ++++++++++++++++++++++++ > gcc/optabs.def | 4 ++-- > gcc/tree-ssa-math-opts.cc | 46 +++++++++++++++++++++++++++++++++++++++ > 5 files changed, 79 insertions(+), 2 deletions(-) > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc > index 0a7053c2286..73045ca8c8c 100644 > --- a/gcc/internal-fn.cc > +++ b/gcc/internal-fn.cc > @@ -4202,6 +4202,7 @@ commutative_binary_fn_p (internal_fn fn) > case IFN_UBSAN_CHECK_MUL: > case IFN_ADD_OVERFLOW: > case IFN_MUL_OVERFLOW: > + case IFN_SAT_ADD: > case IFN_VEC_WIDEN_PLUS: > case IFN_VEC_WIDEN_PLUS_LO: > case IFN_VEC_WIDEN_PLUS_HI: > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def > index 848bb9dbff3..25badbb86e5 100644 > --- a/gcc/internal-fn.def > +++ b/gcc/internal-fn.def > @@ -275,6 +275,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, ECF_CONST | ECF_NOTHROW, first, > DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, first, > smulhrs, umulhrs, binary) > > +DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_ADD, ECF_CONST, first, ssadd, usadd, binary) > + > DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary) > DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary) > DEF_INTERNAL_COND_FN (MUL, ECF_CONST, smul, binary) > diff --git a/gcc/match.pd b/gcc/match.pd > index d401e7503e6..7058e4cbe29 100644 > --- a/gcc/match.pd > +++ b/gcc/match.pd > @@ -3043,6 +3043,34 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > || POINTER_TYPE_P (itype)) > && wi::eq_p (wi::to_wide (int_cst), wi::max_value (itype)))))) > > +/* Unsigned Saturation Add */ > +(match (usadd_left_part @0 @1) > + (plus:c @0 @1) > + (if (INTEGRAL_TYPE_P (type) > + && TYPE_UNSIGNED (TREE_TYPE (@0)) > + && types_match (type, TREE_TYPE (@0)) > + && types_match (type, TREE_TYPE (@1))))) > + > +(match (usadd_right_part @0 @1) > + (negate (convert (lt (plus:c @0 @1) @0))) > + (if (INTEGRAL_TYPE_P (type) > + && TYPE_UNSIGNED (TREE_TYPE (@0)) > + && types_match (type, TREE_TYPE (@0)) > + && types_match (type, TREE_TYPE (@1))))) > + > +(match (usadd_right_part @0 @1) > + (negate (convert (gt @0 (plus:c @0 @1)))) > + (if (INTEGRAL_TYPE_P (type) > + && TYPE_UNSIGNED (TREE_TYPE (@0)) > + && types_match (type, TREE_TYPE (@0)) > + && types_match (type, TREE_TYPE (@1))))) > + > +/* Unsigned saturation add, case 1 (branchless): > + SAT_U_ADD = (X + Y) | - ((X + Y) < X) or > + SAT_U_ADD = (X + Y) | - (X > (X + Y)). */ > +(match (unsigned_integer_sat_add @0 @1) > + (bit_ior:c (usadd_left_part @0 @1) (usadd_right_part @0 @1))) > + > /* x > y && x != XXX_MIN --> x > y > x > y && x == XXX_MIN --> false . 
*/ > (for eqne (eq ne) > diff --git a/gcc/optabs.def b/gcc/optabs.def > index ad14f9328b9..3f2cb46aff8 100644 > --- a/gcc/optabs.def > +++ b/gcc/optabs.def > @@ -111,8 +111,8 @@ OPTAB_NX(add_optab, "add$F$a3") > OPTAB_NX(add_optab, "add$Q$a3") > OPTAB_VL(addv_optab, "addv$I$a3", PLUS, "add", '3', gen_intv_fp_libfunc) > OPTAB_VX(addv_optab, "add$F$a3") > -OPTAB_NL(ssadd_optab, "ssadd$Q$a3", SS_PLUS, "ssadd", '3', gen_signed_fixed_libfunc) > -OPTAB_NL(usadd_optab, "usadd$Q$a3", US_PLUS, "usadd", '3', gen_unsigned_fixed_libfunc) > +OPTAB_NL(ssadd_optab, "ssadd$a3", SS_PLUS, "ssadd", '3', gen_signed_fixed_libfunc) > +OPTAB_NL(usadd_optab, "usadd$a3", US_PLUS, "usadd", '3', gen_unsigned_fixed_libfunc) > OPTAB_NL(sub_optab, "sub$P$a3", MINUS, "sub", '3', gen_int_fp_fixed_libfunc) > OPTAB_NX(sub_optab, "sub$F$a3") > OPTAB_NX(sub_optab, "sub$Q$a3") > diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc > index 705f4a4695a..35a46edc9f6 100644 > --- a/gcc/tree-ssa-math-opts.cc > +++ b/gcc/tree-ssa-math-opts.cc > @@ -4026,6 +4026,44 @@ arith_overflow_check_p (gimple *stmt, gimple *cast_stmt, gimple *&use_stmt, > return 0; > } > > +extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree)); > + > +/* > + * Try to match saturation arith pattern(s). > + * 1. SAT_ADD (unsigned) > + * _7 = _4 + _6; > + * _8 = _4 > _7; > + * _9 = (long unsigned int) _8; > + * _10 = -_9; > + * _12 = _7 | _10; > + * => > + * _12 = .SAT_ADD (_4, _6); */ > +static bool > +match_saturation_arith (gimple_stmt_iterator *gsi, gimple *stmt, > + bool *cfg_changed_p) > +{ > + gcall *call = NULL; > + bool changed_p = false; > + > + gcc_assert (is_gimple_assign (stmt)); If you require a gassign please statically type your function argument as gassign * instead and remove this assert. > + > + tree ops[2]; > + tree lhs = gimple_assign_lhs (stmt); > + > + if (gimple_unsigned_integer_sat_add (lhs, ops, NULL) > + && direct_internal_fn_supported_p (IFN_SAT_ADD, TREE_TYPE (lhs), > + OPTIMIZE_FOR_SPEED)) > + { > + call = gimple_build_call_internal (IFN_SAT_ADD, 2, ops[0], ops[1]); > + gimple_call_set_lhs (call, lhs); > + gsi_replace (gsi, call, true); > + changed_p = true; > + *cfg_changed_p = changed_p; As addition to Tamars good comments why do you set *cfg_changed_p to true? You are not changing the CFG afer all? > + } > + > + return changed_p; > +} > + > /* Recognize for unsigned x > x = y - z; > if (x > y) > @@ -5886,6 +5924,14 @@ math_opts_dom_walker::after_dom_children (basic_block bb) > > fma_deferring_state fma_state (param_avoid_fma_max_bits > 0); > > + for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi)) > + { > + gimple *stmt = gsi_stmt (gsi); > + > + if (is_gimple_assign (stmt)) > + match_saturation_arith (&gsi, stmt, m_cfg_changed_p); > + } > + > for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);) > { > gimple *stmt = gsi_stmt (gsi); > -- > 2.34.1 >
Thanks Richard for comments. > If you require a gassign please statically type your function > argument as gassign * instead and remove this assert. Sure > As addition to Tamars good comments why do you set *cfg_changed_p to > true? You are > not changing the CFG afer all? Yes, we can add it back in future if we really changed cfg, will update in v5 (include vect patch 2/3) after all test passed. Pan -----Original Message----- From: Richard Biener <richard.guenther@gmail.com> Sent: Tuesday, May 14, 2024 9:18 PM To: Li, Pan2 <pan2.li@intel.com> Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; tamar.christina@arm.com; Liu, Hongtao <hongtao.liu@intel.com> Subject: Re: [PATCH v4 1/3] Internal-fn: Support new IFN SAT_ADD for unsigned scalar int On Mon, May 6, 2024 at 4:48 PM <pan2.li@intel.com> wrote: > > From: Pan Li <pan2.li@intel.com> > > This patch would like to add the middle-end presentation for the > saturation add. Aka set the result of add to the max when overflow. > It will take the pattern similar as below. > > SAT_ADD (x, y) => (x + y) | (-(TYPE)((TYPE)(x + y) < x)) > > Take uint8_t as example, we will have: > > * SAT_ADD (1, 254) => 255. > * SAT_ADD (1, 255) => 255. > * SAT_ADD (2, 255) => 255. > * SAT_ADD (255, 255) => 255. > > Given below example for the unsigned scalar integer uint64_t: > > uint64_t sat_add_u64 (uint64_t x, uint64_t y) > { > return (x + y) | (- (uint64_t)((uint64_t)(x + y) < x)); > } > > Before this patch: > uint64_t sat_add_uint64_t (uint64_t x, uint64_t y) > { > long unsigned int _1; > _Bool _2; > long unsigned int _3; > long unsigned int _4; > uint64_t _7; > long unsigned int _10; > __complex__ long unsigned int _11; > > ;; basic block 2, loop depth 0 > ;; pred: ENTRY > _11 = .ADD_OVERFLOW (x_5(D), y_6(D)); > _1 = REALPART_EXPR <_11>; > _10 = IMAGPART_EXPR <_11>; > _2 = _10 != 0; > _3 = (long unsigned int) _2; > _4 = -_3; > _7 = _1 | _4; > return _7; > ;; succ: EXIT > > } > > After this patch: > uint64_t sat_add_uint64_t (uint64_t x, uint64_t y) > { > uint64_t _7; > > ;; basic block 2, loop depth 0 > ;; pred: ENTRY > _7 = .SAT_ADD (x_5(D), y_6(D)); [tail call] > return _7; > ;; succ: EXIT > } > > We perform the tranform during widen_mult because that the sub-expr of > SAT_ADD will be optimized to .ADD_OVERFLOW. We need to try the .SAT_ADD > pattern first and then .ADD_OVERFLOW, or we may never catch the pattern > .SAT_ADD. Meanwhile, the isel pass is after widen_mult and then we > cannot perform the .SAT_ADD pattern match as the sub-expr will be > optmized to .ADD_OVERFLOW first. > > The below tests are passed for this patch: > 1. The riscv fully regression tests. > 2. The aarch64 fully regression tests. > 3. The x86 bootstrap tests. > 4. The x86 fully regression tests. > > PR target/51492 > PR target/112600 > > gcc/ChangeLog: > > * internal-fn.cc (commutative_binary_fn_p): Add type IFN_SAT_ADD > to the return true switch case(s). > * internal-fn.def (SAT_ADD): Add new signed optab SAT_ADD. > * match.pd: Add unsigned SAT_ADD match. > * optabs.def (OPTAB_NL): Remove fixed-point limitation for us/ssadd. > * tree-ssa-math-opts.cc (gimple_unsigned_integer_sat_add): New extern > func decl generated in match.pd match. > (match_saturation_arith): New func impl to match the saturation arith. > (math_opts_dom_walker::after_dom_children): Try match saturation > arith. 
> > Signed-off-by: Pan Li <pan2.li@intel.com> > --- > gcc/internal-fn.cc | 1 + > gcc/internal-fn.def | 2 ++ > gcc/match.pd | 28 ++++++++++++++++++++++++ > gcc/optabs.def | 4 ++-- > gcc/tree-ssa-math-opts.cc | 46 +++++++++++++++++++++++++++++++++++++++ > 5 files changed, 79 insertions(+), 2 deletions(-) > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc > index 0a7053c2286..73045ca8c8c 100644 > --- a/gcc/internal-fn.cc > +++ b/gcc/internal-fn.cc > @@ -4202,6 +4202,7 @@ commutative_binary_fn_p (internal_fn fn) > case IFN_UBSAN_CHECK_MUL: > case IFN_ADD_OVERFLOW: > case IFN_MUL_OVERFLOW: > + case IFN_SAT_ADD: > case IFN_VEC_WIDEN_PLUS: > case IFN_VEC_WIDEN_PLUS_LO: > case IFN_VEC_WIDEN_PLUS_HI: > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def > index 848bb9dbff3..25badbb86e5 100644 > --- a/gcc/internal-fn.def > +++ b/gcc/internal-fn.def > @@ -275,6 +275,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, ECF_CONST | ECF_NOTHROW, first, > DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, first, > smulhrs, umulhrs, binary) > > +DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_ADD, ECF_CONST, first, ssadd, usadd, binary) > + > DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary) > DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary) > DEF_INTERNAL_COND_FN (MUL, ECF_CONST, smul, binary) > diff --git a/gcc/match.pd b/gcc/match.pd > index d401e7503e6..7058e4cbe29 100644 > --- a/gcc/match.pd > +++ b/gcc/match.pd > @@ -3043,6 +3043,34 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > || POINTER_TYPE_P (itype)) > && wi::eq_p (wi::to_wide (int_cst), wi::max_value (itype)))))) > > +/* Unsigned Saturation Add */ > +(match (usadd_left_part @0 @1) > + (plus:c @0 @1) > + (if (INTEGRAL_TYPE_P (type) > + && TYPE_UNSIGNED (TREE_TYPE (@0)) > + && types_match (type, TREE_TYPE (@0)) > + && types_match (type, TREE_TYPE (@1))))) > + > +(match (usadd_right_part @0 @1) > + (negate (convert (lt (plus:c @0 @1) @0))) > + (if (INTEGRAL_TYPE_P (type) > + && TYPE_UNSIGNED (TREE_TYPE (@0)) > + && types_match (type, TREE_TYPE (@0)) > + && types_match (type, TREE_TYPE (@1))))) > + > +(match (usadd_right_part @0 @1) > + (negate (convert (gt @0 (plus:c @0 @1)))) > + (if (INTEGRAL_TYPE_P (type) > + && TYPE_UNSIGNED (TREE_TYPE (@0)) > + && types_match (type, TREE_TYPE (@0)) > + && types_match (type, TREE_TYPE (@1))))) > + > +/* Unsigned saturation add, case 1 (branchless): > + SAT_U_ADD = (X + Y) | - ((X + Y) < X) or > + SAT_U_ADD = (X + Y) | - (X > (X + Y)). */ > +(match (unsigned_integer_sat_add @0 @1) > + (bit_ior:c (usadd_left_part @0 @1) (usadd_right_part @0 @1))) > + > /* x > y && x != XXX_MIN --> x > y > x > y && x == XXX_MIN --> false . 
*/ > (for eqne (eq ne) > diff --git a/gcc/optabs.def b/gcc/optabs.def > index ad14f9328b9..3f2cb46aff8 100644 > --- a/gcc/optabs.def > +++ b/gcc/optabs.def > @@ -111,8 +111,8 @@ OPTAB_NX(add_optab, "add$F$a3") > OPTAB_NX(add_optab, "add$Q$a3") > OPTAB_VL(addv_optab, "addv$I$a3", PLUS, "add", '3', gen_intv_fp_libfunc) > OPTAB_VX(addv_optab, "add$F$a3") > -OPTAB_NL(ssadd_optab, "ssadd$Q$a3", SS_PLUS, "ssadd", '3', gen_signed_fixed_libfunc) > -OPTAB_NL(usadd_optab, "usadd$Q$a3", US_PLUS, "usadd", '3', gen_unsigned_fixed_libfunc) > +OPTAB_NL(ssadd_optab, "ssadd$a3", SS_PLUS, "ssadd", '3', gen_signed_fixed_libfunc) > +OPTAB_NL(usadd_optab, "usadd$a3", US_PLUS, "usadd", '3', gen_unsigned_fixed_libfunc) > OPTAB_NL(sub_optab, "sub$P$a3", MINUS, "sub", '3', gen_int_fp_fixed_libfunc) > OPTAB_NX(sub_optab, "sub$F$a3") > OPTAB_NX(sub_optab, "sub$Q$a3") > diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc > index 705f4a4695a..35a46edc9f6 100644 > --- a/gcc/tree-ssa-math-opts.cc > +++ b/gcc/tree-ssa-math-opts.cc > @@ -4026,6 +4026,44 @@ arith_overflow_check_p (gimple *stmt, gimple *cast_stmt, gimple *&use_stmt, > return 0; > } > > +extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree)); > + > +/* > + * Try to match saturation arith pattern(s). > + * 1. SAT_ADD (unsigned) > + * _7 = _4 + _6; > + * _8 = _4 > _7; > + * _9 = (long unsigned int) _8; > + * _10 = -_9; > + * _12 = _7 | _10; > + * => > + * _12 = .SAT_ADD (_4, _6); */ > +static bool > +match_saturation_arith (gimple_stmt_iterator *gsi, gimple *stmt, > + bool *cfg_changed_p) > +{ > + gcall *call = NULL; > + bool changed_p = false; > + > + gcc_assert (is_gimple_assign (stmt)); If you require a gassign please statically type your function argument as gassign * instead and remove this assert. > + > + tree ops[2]; > + tree lhs = gimple_assign_lhs (stmt); > + > + if (gimple_unsigned_integer_sat_add (lhs, ops, NULL) > + && direct_internal_fn_supported_p (IFN_SAT_ADD, TREE_TYPE (lhs), > + OPTIMIZE_FOR_SPEED)) > + { > + call = gimple_build_call_internal (IFN_SAT_ADD, 2, ops[0], ops[1]); > + gimple_call_set_lhs (call, lhs); > + gsi_replace (gsi, call, true); > + changed_p = true; > + *cfg_changed_p = changed_p; As addition to Tamars good comments why do you set *cfg_changed_p to true? You are not changing the CFG afer all? > + } > + > + return changed_p; > +} > + > /* Recognize for unsigned x > x = y - z; > if (x > y) > @@ -5886,6 +5924,14 @@ math_opts_dom_walker::after_dom_children (basic_block bb) > > fma_deferring_state fma_state (param_avoid_fma_max_bits > 0); > > + for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi)) > + { > + gimple *stmt = gsi_stmt (gsi); > + > + if (is_gimple_assign (stmt)) > + match_saturation_arith (&gsi, stmt, m_cfg_changed_p); > + } > + > for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);) > { > gimple *stmt = gsi_stmt (gsi); > -- > 2.34.1 >
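For reference, the two review comments above amount to a small reshaping of the new helper: take a gassign * directly (so the gcc_assert can be dropped) and remove the *cfg_changed_p out-parameter, since replacing a single statement does not change the CFG. The following is only a rough sketch of that shape, reusing the calls already present in the posted patch; it is not the actual v5 code.

/* Sketch only: match_saturation_arith after applying the review
   comments (statically typed gassign *, no assert, no *cfg_changed_p).  */
static bool
match_saturation_arith (gimple_stmt_iterator *gsi, gassign *stmt)
{
  tree ops[2];
  tree lhs = gimple_assign_lhs (stmt);

  if (gimple_unsigned_integer_sat_add (lhs, ops, NULL)
      && direct_internal_fn_supported_p (IFN_SAT_ADD, TREE_TYPE (lhs),
					 OPTIMIZE_FOR_SPEED))
    {
      /* Replace the final bit_ior statement with a call to .SAT_ADD;
	 only one statement is rewritten, so the CFG stays intact.  */
      gcall *call = gimple_build_call_internal (IFN_SAT_ADD, 2,
						ops[0], ops[1]);
      gimple_call_set_lhs (call, lhs);
      gsi_replace (gsi, call, /* update_eh_info */ true);
      return true;
    }

  return false;
}

Whether v5 keeps the boolean return value or structures the loop differently is of course up to the author; the sketch only illustrates the two requested changes.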
diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc index 0a7053c2286..73045ca8c8c 100644 --- a/gcc/internal-fn.cc +++ b/gcc/internal-fn.cc @@ -4202,6 +4202,7 @@ commutative_binary_fn_p (internal_fn fn) case IFN_UBSAN_CHECK_MUL: case IFN_ADD_OVERFLOW: case IFN_MUL_OVERFLOW: + case IFN_SAT_ADD: case IFN_VEC_WIDEN_PLUS: case IFN_VEC_WIDEN_PLUS_LO: case IFN_VEC_WIDEN_PLUS_HI: diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index 848bb9dbff3..25badbb86e5 100644 --- a/gcc/internal-fn.def +++ b/gcc/internal-fn.def @@ -275,6 +275,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, ECF_CONST | ECF_NOTHROW, first, DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, first, smulhrs, umulhrs, binary) +DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_ADD, ECF_CONST, first, ssadd, usadd, binary) + DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary) DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary) DEF_INTERNAL_COND_FN (MUL, ECF_CONST, smul, binary) diff --git a/gcc/match.pd b/gcc/match.pd index d401e7503e6..7058e4cbe29 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -3043,6 +3043,34 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) || POINTER_TYPE_P (itype)) && wi::eq_p (wi::to_wide (int_cst), wi::max_value (itype)))))) +/* Unsigned Saturation Add */ +(match (usadd_left_part @0 @1) + (plus:c @0 @1) + (if (INTEGRAL_TYPE_P (type) + && TYPE_UNSIGNED (TREE_TYPE (@0)) + && types_match (type, TREE_TYPE (@0)) + && types_match (type, TREE_TYPE (@1))))) + +(match (usadd_right_part @0 @1) + (negate (convert (lt (plus:c @0 @1) @0))) + (if (INTEGRAL_TYPE_P (type) + && TYPE_UNSIGNED (TREE_TYPE (@0)) + && types_match (type, TREE_TYPE (@0)) + && types_match (type, TREE_TYPE (@1))))) + +(match (usadd_right_part @0 @1) + (negate (convert (gt @0 (plus:c @0 @1)))) + (if (INTEGRAL_TYPE_P (type) + && TYPE_UNSIGNED (TREE_TYPE (@0)) + && types_match (type, TREE_TYPE (@0)) + && types_match (type, TREE_TYPE (@1))))) + +/* Unsigned saturation add, case 1 (branchless): + SAT_U_ADD = (X + Y) | - ((X + Y) < X) or + SAT_U_ADD = (X + Y) | - (X > (X + Y)). */ +(match (unsigned_integer_sat_add @0 @1) + (bit_ior:c (usadd_left_part @0 @1) (usadd_right_part @0 @1))) + /* x > y && x != XXX_MIN --> x > y x > y && x == XXX_MIN --> false . */ (for eqne (eq ne) diff --git a/gcc/optabs.def b/gcc/optabs.def index ad14f9328b9..3f2cb46aff8 100644 --- a/gcc/optabs.def +++ b/gcc/optabs.def @@ -111,8 +111,8 @@ OPTAB_NX(add_optab, "add$F$a3") OPTAB_NX(add_optab, "add$Q$a3") OPTAB_VL(addv_optab, "addv$I$a3", PLUS, "add", '3', gen_intv_fp_libfunc) OPTAB_VX(addv_optab, "add$F$a3") -OPTAB_NL(ssadd_optab, "ssadd$Q$a3", SS_PLUS, "ssadd", '3', gen_signed_fixed_libfunc) -OPTAB_NL(usadd_optab, "usadd$Q$a3", US_PLUS, "usadd", '3', gen_unsigned_fixed_libfunc) +OPTAB_NL(ssadd_optab, "ssadd$a3", SS_PLUS, "ssadd", '3', gen_signed_fixed_libfunc) +OPTAB_NL(usadd_optab, "usadd$a3", US_PLUS, "usadd", '3', gen_unsigned_fixed_libfunc) OPTAB_NL(sub_optab, "sub$P$a3", MINUS, "sub", '3', gen_int_fp_fixed_libfunc) OPTAB_NX(sub_optab, "sub$F$a3") OPTAB_NX(sub_optab, "sub$Q$a3") diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc index 705f4a4695a..35a46edc9f6 100644 --- a/gcc/tree-ssa-math-opts.cc +++ b/gcc/tree-ssa-math-opts.cc @@ -4026,6 +4026,44 @@ arith_overflow_check_p (gimple *stmt, gimple *cast_stmt, gimple *&use_stmt, return 0; } +extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree)); + +/* + * Try to match saturation arith pattern(s). + * 1. 
SAT_ADD (unsigned) + * _7 = _4 + _6; + * _8 = _4 > _7; + * _9 = (long unsigned int) _8; + * _10 = -_9; + * _12 = _7 | _10; + * => + * _12 = .SAT_ADD (_4, _6); */ +static bool +match_saturation_arith (gimple_stmt_iterator *gsi, gimple *stmt, + bool *cfg_changed_p) +{ + gcall *call = NULL; + bool changed_p = false; + + gcc_assert (is_gimple_assign (stmt)); + + tree ops[2]; + tree lhs = gimple_assign_lhs (stmt); + + if (gimple_unsigned_integer_sat_add (lhs, ops, NULL) + && direct_internal_fn_supported_p (IFN_SAT_ADD, TREE_TYPE (lhs), + OPTIMIZE_FOR_SPEED)) + { + call = gimple_build_call_internal (IFN_SAT_ADD, 2, ops[0], ops[1]); + gimple_call_set_lhs (call, lhs); + gsi_replace (gsi, call, true); + changed_p = true; + *cfg_changed_p = changed_p; + } + + return changed_p; +} + /* Recognize for unsigned x x = y - z; if (x > y) @@ -5886,6 +5924,14 @@ math_opts_dom_walker::after_dom_children (basic_block bb) fma_deferring_state fma_state (param_avoid_fma_max_bits > 0); + for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + + if (is_gimple_assign (stmt)) + match_saturation_arith (&gsi, stmt, m_cfg_changed_p); + } + for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);) { gimple *stmt = gsi_stmt (gsi);
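On the call-site side in math_opts_dom_walker::after_dom_children, a statically typed helper like the sketch above would typically be driven through dyn_cast rather than a separate is_gimple_assign check. Again purely as an illustration of the suggested direction (the posted patch uses is_gimple_assign, and the final form is the author's call):

  /* Illustrative only: walk the block and hand each assignment to a
     gassign *-typed match_saturation_arith.  */
  for (gimple_stmt_iterator gsi = gsi_after_labels (bb);
       !gsi_end_p (gsi); gsi_next (&gsi))
    if (gassign *assign = dyn_cast <gassign *> (gsi_stmt (gsi)))
      match_saturation_arith (&gsi, assign);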