
[v4,1/3] Internal-fn: Support new IFN SAT_ADD for unsigned scalar int

Message ID 20240506144805.725379-1-pan2.li@intel.com
State New
Series [v4,1/3] Internal-fn: Support new IFN SAT_ADD for unsigned scalar int

Commit Message

Li, Pan2 May 6, 2024, 2:48 p.m. UTC
From: Pan Li <pan2.li@intel.com>

This patch would like to add the middle-end representation for the
saturation add, i.e. set the result of the add to the maximum value
when it overflows.  It matches a pattern similar to the one below.

SAT_ADD (x, y) => (x + y) | (-(TYPE)((TYPE)(x + y) < x))

Taking uint8_t as an example, we will have:

* SAT_ADD (1, 254)   => 255.
* SAT_ADD (1, 255)   => 255.
* SAT_ADD (2, 255)   => 255.
* SAT_ADD (255, 255) => 255.

Given the example below for the unsigned scalar integer uint64_t:

uint64_t sat_add_u64 (uint64_t x, uint64_t y)
{
  return (x + y) | (- (uint64_t)((uint64_t)(x + y) < x));
}
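
For reference, these semantics can be sanity-checked with a small
standalone program (illustrative only, not part of the patch):

#include <stdint.h>
#include <assert.h>

/* The same branchless idiom as above, instantiated for uint8_t.  */
static uint8_t
sat_add_u8 (uint8_t x, uint8_t y)
{
  uint8_t sum = x + y;	/* Wraps around on overflow.  */
  /* (sum < x) is 1 iff the addition wrapped; negating it yields the
     all-ones mask 0xff, which saturates the result via the OR.  */
  return sum | (uint8_t) -(uint8_t) (sum < x);
}

int
main (void)
{
  assert (sat_add_u8 (1, 254) == 255);	/* No overflow, exactly max.  */
  assert (sat_add_u8 (1, 255) == 255);	/* Overflow, saturates.  */
  assert (sat_add_u8 (2, 255) == 255);
  assert (sat_add_u8 (255, 255) == 255);
  assert (sat_add_u8 (100, 100) == 200);	/* Plain addition.  */
  return 0;
}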

Before this patch:
uint64_t sat_add_u64 (uint64_t x, uint64_t y)
{
  long unsigned int _1;
  _Bool _2;
  long unsigned int _3;
  long unsigned int _4;
  uint64_t _7;
  long unsigned int _10;
  __complex__ long unsigned int _11;

;;   basic block 2, loop depth 0
;;    pred:       ENTRY
  _11 = .ADD_OVERFLOW (x_5(D), y_6(D));
  _1 = REALPART_EXPR <_11>;
  _10 = IMAGPART_EXPR <_11>;
  _2 = _10 != 0;
  _3 = (long unsigned int) _2;
  _4 = -_3;
  _7 = _1 | _4;
  return _7;
;;    succ:       EXIT

}

After this patch:
uint64_t sat_add_u64 (uint64_t x, uint64_t y)
{
  uint64_t _7;

;;   basic block 2, loop depth 0
;;    pred:       ENTRY
  _7 = .SAT_ADD (x_5(D), y_6(D)); [tail call]
  return _7;
;;    succ:       EXIT
}

We perform the transform during widen_mult because the sub-expression
of SAT_ADD would otherwise be optimized to .ADD_OVERFLOW.  We need to
try the .SAT_ADD pattern first and only then .ADD_OVERFLOW, or we may
never match .SAT_ADD.  Meanwhile, the isel pass runs after widen_mult,
so by that point we can no longer match the .SAT_ADD pattern, as the
sub-expression has already been optimized to .ADD_OVERFLOW.

This patch passes the below tests:
1. The riscv full regression tests.
2. The aarch64 full regression tests.
3. The x86 bootstrap tests.
4. The x86 full regression tests.

	PR target/51492
	PR target/112600

gcc/ChangeLog:

	* internal-fn.cc (commutative_binary_fn_p): Add IFN_SAT_ADD
	to the switch cases that return true.
	* internal-fn.def (SAT_ADD): Add new signed optab SAT_ADD.
	* match.pd: Add unsigned SAT_ADD match.
	* optabs.def (OPTAB_NL): Remove fixed-point limitation for us/ssadd.
	* tree-ssa-math-opts.cc (gimple_unsigned_integer_sat_add): New
	extern decl for the function generated from the match.pd pattern.
	(match_saturation_arith): New function to match the saturation
	arithmetic.
	(math_opts_dom_walker::after_dom_children): Try to match
	saturation arithmetic.

Signed-off-by: Pan Li <pan2.li@intel.com>
---
 gcc/internal-fn.cc        |  1 +
 gcc/internal-fn.def       |  2 ++
 gcc/match.pd              | 28 ++++++++++++++++++++++++
 gcc/optabs.def            |  4 ++--
 gcc/tree-ssa-math-opts.cc | 46 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 79 insertions(+), 2 deletions(-)

Comments

Tamar Christina May 13, 2024, 9:09 a.m. UTC | #1
Hi Pan,

> [quoted mail headers, commit message and diffstat trimmed; identical to
> the message above]
> 
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index 0a7053c2286..73045ca8c8c 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -4202,6 +4202,7 @@ commutative_binary_fn_p (internal_fn fn)
>      case IFN_UBSAN_CHECK_MUL:
>      case IFN_ADD_OVERFLOW:
>      case IFN_MUL_OVERFLOW:
> +    case IFN_SAT_ADD:
>      case IFN_VEC_WIDEN_PLUS:
>      case IFN_VEC_WIDEN_PLUS_LO:
>      case IFN_VEC_WIDEN_PLUS_HI:
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index 848bb9dbff3..25badbb86e5 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -275,6 +275,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, ECF_CONST | ECF_NOTHROW, first,
>  DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, first,
>  			      smulhrs, umulhrs, binary)
> 
> +DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_ADD, ECF_CONST, first, ssadd, usadd, binary)
> +
>  DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
>  DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
>  DEF_INTERNAL_COND_FN (MUL, ECF_CONST, smul, binary)
> diff --git a/gcc/match.pd b/gcc/match.pd
> index d401e7503e6..7058e4cbe29 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -3043,6 +3043,34 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>         || POINTER_TYPE_P (itype))
>        && wi::eq_p (wi::to_wide (int_cst), wi::max_value (itype))))))
> 
> +/* Unsigned Saturation Add */
> +(match (usadd_left_part @0 @1)
> + (plus:c @0 @1)
> + (if (INTEGRAL_TYPE_P (type)
> +      && TYPE_UNSIGNED (TREE_TYPE (@0))
> +      && types_match (type, TREE_TYPE (@0))
> +      && types_match (type, TREE_TYPE (@1)))))
> +
> +(match (usadd_right_part @0 @1)
> + (negate (convert (lt (plus:c @0 @1) @0)))
> + (if (INTEGRAL_TYPE_P (type)
> +      && TYPE_UNSIGNED (TREE_TYPE (@0))
> +      && types_match (type, TREE_TYPE (@0))
> +      && types_match (type, TREE_TYPE (@1)))))
> +
> +(match (usadd_right_part @0 @1)
> + (negate (convert (gt @0 (plus:c @0 @1))))
> + (if (INTEGRAL_TYPE_P (type)
> +      && TYPE_UNSIGNED (TREE_TYPE (@0))
> +      && types_match (type, TREE_TYPE (@0))
> +      && types_match (type, TREE_TYPE (@1)))))
> +
> +/* Unsigned saturation add, case 1 (branchless):
> +   SAT_U_ADD = (X + Y) | - ((X + Y) < X) or
> +   SAT_U_ADD = (X + Y) | - (X > (X + Y)).  */
> +(match (unsigned_integer_sat_add @0 @1)
> + (bit_ior:c (usadd_left_part @0 @1) (usadd_right_part @0 @1)))
> +
>  /* x >  y  &&  x != XXX_MIN  -->  x > y
>     x >  y  &&  x == XXX_MIN  -->  false . */
>  (for eqne (eq ne)
> diff --git a/gcc/optabs.def b/gcc/optabs.def
> index ad14f9328b9..3f2cb46aff8 100644
> --- a/gcc/optabs.def
> +++ b/gcc/optabs.def
> @@ -111,8 +111,8 @@ OPTAB_NX(add_optab, "add$F$a3")
>  OPTAB_NX(add_optab, "add$Q$a3")
>  OPTAB_VL(addv_optab, "addv$I$a3", PLUS, "add", '3', gen_intv_fp_libfunc)
>  OPTAB_VX(addv_optab, "add$F$a3")
> -OPTAB_NL(ssadd_optab, "ssadd$Q$a3", SS_PLUS, "ssadd", '3', gen_signed_fixed_libfunc)
> -OPTAB_NL(usadd_optab, "usadd$Q$a3", US_PLUS, "usadd", '3', gen_unsigned_fixed_libfunc)
> +OPTAB_NL(ssadd_optab, "ssadd$a3", SS_PLUS, "ssadd", '3', gen_signed_fixed_libfunc)
> +OPTAB_NL(usadd_optab, "usadd$a3", US_PLUS, "usadd", '3', gen_unsigned_fixed_libfunc)
>  OPTAB_NL(sub_optab, "sub$P$a3", MINUS, "sub", '3', gen_int_fp_fixed_libfunc)
>  OPTAB_NX(sub_optab, "sub$F$a3")
>  OPTAB_NX(sub_optab, "sub$Q$a3")
> diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc
> index 705f4a4695a..35a46edc9f6 100644
> --- a/gcc/tree-ssa-math-opts.cc
> +++ b/gcc/tree-ssa-math-opts.cc
> @@ -4026,6 +4026,44 @@ arith_overflow_check_p (gimple *stmt, gimple *cast_stmt, gimple *&use_stmt,
>    return 0;
>  }
> 
> +extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree));
> +
> +/*
> + * Try to match saturation arith pattern(s).
> + *   1. SAT_ADD (unsigned)
> + *      _7 = _4 + _6;
> + *      _8 = _4 > _7;
> + *      _9 = (long unsigned int) _8;
> + *      _10 = -_9;
> + *      _12 = _7 | _10;
> + *      =>
> + *      _12 = .SAT_ADD (_4, _6);  */
> +static bool
> +match_saturation_arith (gimple_stmt_iterator *gsi, gimple *stmt,
> +			bool *cfg_changed_p)
> +{
> +  gcall *call = NULL;
> +  bool changed_p = false;
> +
> +  gcc_assert (is_gimple_assign (stmt));
> +
> +  tree ops[2];
> +  tree lhs = gimple_assign_lhs (stmt);
> +
> +  if (gimple_unsigned_integer_sat_add (lhs, ops, NULL)
> +      && direct_internal_fn_supported_p (IFN_SAT_ADD, TREE_TYPE (lhs),
> +					OPTIMIZE_FOR_SPEED))

I think OPTIMIZE_FOR_BOTH is better here, since this is a win also when optimizing for size.
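
That is, roughly (sketch of the suggested one-constant change):

  if (gimple_unsigned_integer_sat_add (lhs, ops, NULL)
      && direct_internal_fn_supported_p (IFN_SAT_ADD, TREE_TYPE (lhs),
					 OPTIMIZE_FOR_BOTH))
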
> +    {
> +      call = gimple_build_call_internal (IFN_SAT_ADD, 2, ops[0], ops[1]);
> +      gimple_call_set_lhs (call, lhs);
> +      gsi_replace (gsi, call, true);
> +      changed_p = true;
> +      *cfg_changed_p = changed_p;
> +    }
> +
> +  return changed_p;
> +}
> +
>  /* Recognize for unsigned x
>     x = y - z;
>     if (x > y)
> @@ -5886,6 +5924,14 @@ math_opts_dom_walker::after_dom_children (basic_block bb)
> 
>    fma_deferring_state fma_state (param_avoid_fma_max_bits > 0);
> 
> +  for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
> +    {
> +      gimple *stmt = gsi_stmt (gsi);
> +
> +      if (is_gimple_assign (stmt))
> +	match_saturation_arith (&gsi, stmt, m_cfg_changed_p);
> +    }
> +

Hmm, why do you iterate independently over the statements?  The block below already
visits every statement, doesn't it?

The root of your match is a BIT_IOR_EXPR expression, so I think you just need to change the entry below to:

	    case BIT_IOR_EXPR:
	      match_saturation_arith (&gsi, stmt, m_cfg_changed_p);
	      /* fall-through */
	    case BIT_XOR_EXPR:
	      match_uaddc_usubc (&gsi, stmt, code);
	      break;

Patch is looking good! Thanks again for working on this.

Regards,
Tamar

>    for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
>      {
>        gimple *stmt = gsi_stmt (gsi);
> --
> 2.34.1
Li, Pan2 May 13, 2024, 1:36 p.m. UTC | #2
Thanks Tamar for the comments.

> I think OPTIMIZE_FOR_BOTH is better here, since this is a win also when optimizing for size.

Sure thing, let me update it in v5.

> Hmm, why do you iterate independently over the statements?  The block below already
> visits every statement, doesn't it?

Because it will hit .ADD_OVERFLOW first, and then it will never match SAT_ADD as the shape has changed.  Or shall we move it to the previous pass?

> The root of your match is a BIT_IOR_EXPR expression, so I think you just need to change the entry below to:
>
> 	    case BIT_IOR_EXPR:
> 	      match_saturation_arith (&gsi, stmt, m_cfg_changed_p);
> 	      /* fall-through */
> 	    case BIT_XOR_EXPR:
> 	      match_uaddc_usubc (&gsi, stmt, code);
> 	      break;

There are other shapes of SAT_ADD (not covered in this patch), like the branch
version below, where the IOR is only one possible root.  Thus no case is added here.  Shall we add a case for each shape, then?  Both work for me.

#define SAT_ADD_U_1(T) \
T sat_add_u_1_##T(T x, T y) \
{ \
  return (T)(x + y) >= x ? (x + y) : -1; \
}

SAT_ADD_U_1(uint32_t)

Pan


Tamar Christina May 13, 2024, 3:03 p.m. UTC | #3
> 
> Thanks Tamar for the comments.
> 
> > I think OPTIMIZE_FOR_BOTH is better here, since this is a win also when
> > optimizing for size.
> 
> Sure thing, let me update it in v5.
> 
> > Hmm, why do you iterate independently over the statements?  The block below
> > already visits every statement, doesn't it?
> 
> Because it will hit .ADD_OVERFLOW first, and then it will never match SAT_ADD
> as the shape has changed.  Or shall we move it to the previous pass?
> 

That's just a matter of matching the overflow as an additional case, no?
i.e. you can add an overload for unsigned_integer_sat_add matching
IFN_ADD_OVERFLOW and using the realpart and imagpart helpers.

I think that would be better, as it avoids visiting all the statements twice
but also extends the matching to some __builtin_add_overflow uses, and it
should be fairly simple.
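
For illustration, an untested sketch of such an overload in match.pd,
mirroring the .ADD_OVERFLOW dump shown earlier (exact genmatch syntax
would need double-checking):

  /* Sketch: unsigned saturation add, branchless with .ADD_OVERFLOW:
     SAT_U_ADD = REALPART (.ADD_OVERFLOW (X, Y)) | -(IMAGPART (...) != 0).  */
  (match (unsigned_integer_sat_add @0 @1)
   (bit_ior:c (realpart (IFN_ADD_OVERFLOW:c@2 @0 @1))
    (negate (convert (ne (imagpart @2) integer_zerop)))))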

> > The root of your match is a BIT_IOR_EXPR expression, so I think you just need
> > to change the entry below to:
> >
> > 	    case BIT_IOR_EXPR:
> > 	      match_saturation_arith (&gsi, stmt, m_cfg_changed_p);
> > 	      /* fall-through */
> > 	    case BIT_XOR_EXPR:
> > 	      match_uaddc_usubc (&gsi, stmt, code);
> > 	      break;
> 
> There are other shapes of SAT_ADD (not covered in this patch), like the branch
> version below, where the IOR is only one possible root.  Thus no case is added
> here.  Shall we add a case for each shape, then?  Both work for me.
> 

Yeah, I think that's better than iterating over the statements twice.  It also fits
better in the existing code.

Tamar.

> #define SAT_ADD_U_1(T) \
> T sat_add_u_1_##T(T x, T y) \
> { \
>   return (T)(x + y) >= x ? (x + y) : -1; \
> }
> 
> SAT_ADD_U_1(uint32_t)
> 
> Pan
Li, Pan2 May 14, 2024, 1:50 a.m. UTC | #4
> That's just a matter of matching the overflow as an additional case, no?
> i.e. you can add an overload for unsigned_integer_sat_add matching
> IFN_ADD_OVERFLOW and using the realpart and imagpart helpers.

> I think that would be better, as it avoids visiting all the statements twice
> but also extends the matching to some __builtin_add_overflow uses, and it
> should be fairly simple.

Thanks Tamar, got the point here, will have a try with an overload of unsigned_integer_sat_add for that.

> Yeah, I think that's better than iterating over the statements twice.  It also fits better
> in the existing code.

Ack, will follow the existing code.

Pan


Richard Biener May 14, 2024, 1:18 p.m. UTC | #5
On Mon, May 6, 2024 at 4:48 PM <pan2.li@intel.com> wrote:
> [quoted commit message and earlier hunks trimmed; see the patch above]
> +static bool
> +match_saturation_arith (gimple_stmt_iterator *gsi, gimple *stmt,
> +                       bool *cfg_changed_p)
> +{
> +  gcall *call = NULL;
> +  bool changed_p = false;
> +
> +  gcc_assert (is_gimple_assign (stmt));

If you require a gassign, please statically type your function
argument as gassign * instead and remove this assert.

> +
> +  tree ops[2];
> +  tree lhs = gimple_assign_lhs (stmt);
> +
> +  if (gimple_unsigned_integer_sat_add (lhs, ops, NULL)
> +      && direct_internal_fn_supported_p (IFN_SAT_ADD, TREE_TYPE (lhs),
> +                                       OPTIMIZE_FOR_SPEED))
> +    {
> +      call = gimple_build_call_internal (IFN_SAT_ADD, 2, ops[0], ops[1]);
> +      gimple_call_set_lhs (call, lhs);
> +      gsi_replace (gsi, call, true);
> +      changed_p = true;
> +      *cfg_changed_p = changed_p;

In addition to Tamar's good comments: why do you set *cfg_changed_p to
true?  You are not changing the CFG after all?

> +    }
> +
> +  return changed_p;
> +}
> [remainder of the quoted patch trimmed]
Li, Pan2 May 14, 2024, 2:14 p.m. UTC | #6
Thanks Richard for the comments.

> If you require a gassign, please statically type your function
> argument as gassign * instead and remove this assert.

Sure.

> In addition to Tamar's good comments: why do you set *cfg_changed_p to
> true?  You are not changing the CFG after all?

Yes, we can add it back in the future if we really change the CFG; I will update in v5 (including the vect patch 2/3) after all tests pass.

Pan


Patch

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 0a7053c2286..73045ca8c8c 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -4202,6 +4202,7 @@  commutative_binary_fn_p (internal_fn fn)
     case IFN_UBSAN_CHECK_MUL:
     case IFN_ADD_OVERFLOW:
     case IFN_MUL_OVERFLOW:
+    case IFN_SAT_ADD:
     case IFN_VEC_WIDEN_PLUS:
     case IFN_VEC_WIDEN_PLUS_LO:
     case IFN_VEC_WIDEN_PLUS_HI:
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 848bb9dbff3..25badbb86e5 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -275,6 +275,8 @@  DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, ECF_CONST | ECF_NOTHROW, first,
 DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, first,
 			      smulhrs, umulhrs, binary)
 
+DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_ADD, ECF_CONST, first, ssadd, usadd, binary)
+
 DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
 DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
 DEF_INTERNAL_COND_FN (MUL, ECF_CONST, smul, binary)
diff --git a/gcc/match.pd b/gcc/match.pd
index d401e7503e6..7058e4cbe29 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -3043,6 +3043,34 @@  DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
        || POINTER_TYPE_P (itype))
       && wi::eq_p (wi::to_wide (int_cst), wi::max_value (itype))))))
 
+/* Unsigned Saturation Add */
+(match (usadd_left_part @0 @1)
+ (plus:c @0 @1)
+ (if (INTEGRAL_TYPE_P (type)
+      && TYPE_UNSIGNED (TREE_TYPE (@0))
+      && types_match (type, TREE_TYPE (@0))
+      && types_match (type, TREE_TYPE (@1)))))
+
+(match (usadd_right_part @0 @1)
+ (negate (convert (lt (plus:c @0 @1) @0)))
+ (if (INTEGRAL_TYPE_P (type)
+      && TYPE_UNSIGNED (TREE_TYPE (@0))
+      && types_match (type, TREE_TYPE (@0))
+      && types_match (type, TREE_TYPE (@1)))))
+
+(match (usadd_right_part @0 @1)
+ (negate (convert (gt @0 (plus:c @0 @1))))
+ (if (INTEGRAL_TYPE_P (type)
+      && TYPE_UNSIGNED (TREE_TYPE (@0))
+      && types_match (type, TREE_TYPE (@0))
+      && types_match (type, TREE_TYPE (@1)))))
+
+/* Unsigned saturation add, case 1 (branchless):
+   SAT_U_ADD = (X + Y) | - ((X + Y) < X) or
+   SAT_U_ADD = (X + Y) | - (X > (X + Y)).  */
+(match (unsigned_integer_sat_add @0 @1)
+ (bit_ior:c (usadd_left_part @0 @1) (usadd_right_part @0 @1)))
+
 /* x >  y  &&  x != XXX_MIN  -->  x > y
    x >  y  &&  x == XXX_MIN  -->  false . */
 (for eqne (eq ne)
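
For illustration only (my own example, not part of the patch): the first
usadd_right_part variant above matches the (X + Y) < X spelling and the
second the X > (X + Y) spelling, so with a promotion-free type such as
uint32_t both of these source forms should reach unsigned_integer_sat_add:

uint32_t sat_add_lt (uint32_t x, uint32_t y)
{
  return (x + y) | -(uint32_t)((x + y) < x);  /* lt form  */
}

uint32_t sat_add_gt (uint32_t x, uint32_t y)
{
  return (x + y) | -(uint32_t)(x > (x + y));  /* gt form  */
}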
diff --git a/gcc/optabs.def b/gcc/optabs.def
index ad14f9328b9..3f2cb46aff8 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -111,8 +111,8 @@  OPTAB_NX(add_optab, "add$F$a3")
 OPTAB_NX(add_optab, "add$Q$a3")
 OPTAB_VL(addv_optab, "addv$I$a3", PLUS, "add", '3', gen_intv_fp_libfunc)
 OPTAB_VX(addv_optab, "add$F$a3")
-OPTAB_NL(ssadd_optab, "ssadd$Q$a3", SS_PLUS, "ssadd", '3', gen_signed_fixed_libfunc)
-OPTAB_NL(usadd_optab, "usadd$Q$a3", US_PLUS, "usadd", '3', gen_unsigned_fixed_libfunc)
+OPTAB_NL(ssadd_optab, "ssadd$a3", SS_PLUS, "ssadd", '3', gen_signed_fixed_libfunc)
+OPTAB_NL(usadd_optab, "usadd$a3", US_PLUS, "usadd", '3', gen_unsigned_fixed_libfunc)
 OPTAB_NL(sub_optab, "sub$P$a3", MINUS, "sub", '3', gen_int_fp_fixed_libfunc)
 OPTAB_NX(sub_optab, "sub$F$a3")
 OPTAB_NX(sub_optab, "sub$Q$a3")
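
As I read this optabs.def change (a sketch of the effect, not from the
patch): the "$Q" in the pattern-name template restricted ssadd/usadd to
fixed-point modes, so dropping it lets integer modes provide
ssadd<mode>3 / usadd<mode>3 as well, which is then visible through the
usual optab query, roughly:

/* Hypothetical check; any caller of the usadd optab could do this.  */
if (optab_handler (usadd_optab, SImode) != CODE_FOR_nothing)
  {
    /* The target defines usaddsi3, so IFN_SAT_ADD is directly
       supported for 32-bit unsigned integers.  */
  }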
diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc
index 705f4a4695a..35a46edc9f6 100644
--- a/gcc/tree-ssa-math-opts.cc
+++ b/gcc/tree-ssa-math-opts.cc
@@ -4026,6 +4026,44 @@  arith_overflow_check_p (gimple *stmt, gimple *cast_stmt, gimple *&use_stmt,
   return 0;
 }
 
+extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree));
+
+/*
+ * Try to match saturation arith pattern(s).
+ *   1. SAT_ADD (unsigned)
+ *      _7 = _4 + _6;
+ *      _8 = _4 > _7;
+ *      _9 = (long unsigned int) _8;
+ *      _10 = -_9;
+ *      _12 = _7 | _10;
+ *      =>
+ *      _12 = .SAT_ADD (_4, _6);  */
+static bool
+match_saturation_arith (gimple_stmt_iterator *gsi, gimple *stmt,
+			bool *cfg_changed_p)
+{
+  gcall *call = NULL;
+  bool changed_p = false;
+
+  gcc_assert (is_gimple_assign (stmt));
+
+  tree ops[2];
+  tree lhs = gimple_assign_lhs (stmt);
+
+  if (gimple_unsigned_integer_sat_add (lhs, ops, NULL)
+      && direct_internal_fn_supported_p (IFN_SAT_ADD, TREE_TYPE (lhs),
+					OPTIMIZE_FOR_SPEED))
+    {
+      call = gimple_build_call_internal (IFN_SAT_ADD, 2, ops[0], ops[1]);
+      gimple_call_set_lhs (call, lhs);
+      gsi_replace (gsi, call, true);
+      changed_p = true;
+      *cfg_changed_p = changed_p;
+    }
+
+  return changed_p;
+}
+
 /* Recognize for unsigned x
    x = y - z;
    if (x > y)
@@ -5886,6 +5924,14 @@  math_opts_dom_walker::after_dom_children (basic_block bb)
 
   fma_deferring_state fma_state (param_avoid_fma_max_bits > 0);
 
+  for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gimple *stmt = gsi_stmt (gsi);
+
+      if (is_gimple_assign (stmt))
+	match_saturation_arith (&gsi, stmt, m_cfg_changed_p);
+    }
+
   for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
     {
       gimple *stmt = gsi_stmt (gsi);
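
For completeness, a sketch of the kind of testcase this enables.  The
dejagnu directives and the "widening_mul" dump name are my assumptions,
and the scan only succeeds on targets that actually implement the usadd
optab for the mode:

/* { dg-do compile } */
/* { dg-options "-O2 -fdump-tree-widening_mul" } */

typedef unsigned int uint32_t;

uint32_t
sat_add_u32 (uint32_t x, uint32_t y)
{
  /* Branchless saturating add: ORs in an all-ones mask on overflow.  */
  return (x + y) | -(uint32_t)((x + y) < x);
}

/* { dg-final { scan-tree-dump ".SAT_ADD " "widening_mul" } } */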