diff mbox series

[v2,01/10] Match: Simplify branch form 4 of unsigned SAT_ADD into branchless

Message ID 20241031062742.2709845-1-pan2.li@intel.com
State New
Headers show
Series [v2,01/10] Match: Simplify branch form 4 of unsigned SAT_ADD into branchless | expand

Commit Message

Li, Pan2 Oct. 31, 2024, 6:27 a.m. UTC
From: Pan Li <pan2.li@intel.com>

There are several forms of the unsigned SAT_ADD.  Some of them are
complicated while others are cheap.  This patch simplifies the
complicated forms into the cheap one.  For example:

From form 4 (branch):
  SAT_U_ADD = (X + Y) < X ? -1 : (X + Y).

To (branchless):
  SAT_U_ADD = (X + Y) | - ((X + Y) < X).

The comparison (X + Y) < X holds exactly when the addition wraps around,
so its negation is an all-ones mask and the bit_ior saturates the result
to -1.

  #define T uint8_t

  T sat_add_u_1 (T x, T y)
  {
    return (T)(x + y) < x ? -1 : (x + y);
  }

Before this patch:
   1   │ uint8_t sat_add_u_1 (uint8_t x, uint8_t y)
   2   │ {
   3   │   uint8_t D.2809;
   4   │
   5   │   _1 = x + y;
   6   │   if (x <= _1) goto <D.2810>; else goto <D.2811>;
   7   │   <D.2810>:
   8   │   D.2809 = x + y;
   9   │   goto <D.2812>;
  10   │   <D.2811>:
  11   │   D.2809 = 255;
  12   │   <D.2812>:
  13   │   return D.2809;
  14   │ }

After this patch:
   1   │ uint8_t sat_add_u_1 (uint8_t x, uint8_t y)
   2   │ {
   3   │   uint8_t D.2809;
   4   │
   5   │   _1 = x + y;
   6   │   _2 = x + y;
   7   │   _3 = x > _2;
   8   │   _4 = (unsigned char) _3;
   9   │   _5 = -_4;
  10   │   D.2809 = _1 | _5;
  11   │   return D.2809;
  12   │ }
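
As a standalone sanity check (not part of the patch), the equivalence of
the two forms can be verified exhaustively for uint8_t:

#include <stdint.h>
#include <stdio.h>

static uint8_t sat_add_branch (uint8_t x, uint8_t y)
{
  return (uint8_t)(x + y) < x ? -1 : (x + y);
}

static uint8_t sat_add_branchless (uint8_t x, uint8_t y)
{
  uint8_t sum = x + y;
  /* sum < x iff the addition wrapped; negating the 0/1 result gives an
     all-zeros or all-ones mask, so the bit_ior saturates to 0xff.  */
  return sum | -(uint8_t)(sum < x);
}

int main (void)
{
  for (int x = 0; x < 256; x++)
    for (int y = 0; y < 256; y++)
      if (sat_add_branch (x, y) != sat_add_branchless (x, y))
	{
	  printf ("mismatch: %d + %d\n", x, y);
	  return 1;
	}
  return 0;
}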

The test suites below have passed for this patch.
* The rv64gcv full regression test.
* The x86 bootstrap test.
* The x86 full regression test.

gcc/ChangeLog:

	* match.pd: Remove unsigned branch form 4 for SAT_ADD, and
	add simplify to branchless instead.

gcc/testsuite/ChangeLog:

	* gcc.dg/sat_arith_simplify.h: New test.
	* gcc.dg/sat_u_add-simplify-2-u16.c: New test.
	* gcc.dg/sat_u_add-simplify-2-u32.c: New test.
	* gcc.dg/sat_u_add-simplify-2-u64.c: New test.
	* gcc.dg/sat_u_add-simplify-2-u8.c: New test.

Signed-off-by: Pan Li <pan2.li@intel.com>
---
 gcc/match.pd                                  | 23 +++++++++----------
 gcc/testsuite/gcc.dg/sat_arith_simplify.h     | 10 ++++++++
 .../gcc.dg/sat_u_add-simplify-2-u16.c         | 11 +++++++++
 .../gcc.dg/sat_u_add-simplify-2-u32.c         | 11 +++++++++
 .../gcc.dg/sat_u_add-simplify-2-u64.c         | 11 +++++++++
 .../gcc.dg/sat_u_add-simplify-2-u8.c          | 11 +++++++++
 6 files changed, 65 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/sat_arith_simplify.h
 create mode 100644 gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u16.c
 create mode 100644 gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u32.c
 create mode 100644 gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u64.c
 create mode 100644 gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u8.c

Comments

Richard Biener Nov. 6, 2024, 12:43 p.m. UTC | #1
On Thu, Oct 31, 2024 at 7:29 AM <pan2.li@intel.com> wrote:
>
> From: Pan Li <pan2.li@intel.com>
>
> There are sorts of forms for the unsigned SAT_ADD.  Some of them are
> complicated while others are cheap.  This patch would like to simplify
> the complicated form into the cheap ones.  For example as below:
>
> From the form 4 (branch):
>   SAT_U_ADD = (X + Y) < x ? -1 : (X + Y).
>
> To (branchless):
>   SAT_U_ADD = (X + Y) | - ((X + Y) < X).
>
>   #define T uint8_t
>
>   T sat_add_u_1 (T x, T y)
>   {
>     return (T)(x + y) < x ? -1 : (x + y);
>   }
>
> Before this patch:
>    1   │ uint8_t sat_add_u_1 (uint8_t x, uint8_t y)
>    2   │ {
>    3   │   uint8_t D.2809;
>    4   │
>    5   │   _1 = x + y;
>    6   │   if (x <= _1) goto <D.2810>; else goto <D.2811>;
>    7   │   <D.2810>:
>    8   │   D.2809 = x + y;
>    9   │   goto <D.2812>;
>   10   │   <D.2811>:
>   11   │   D.2809 = 255;
>   12   │   <D.2812>:
>   13   │   return D.2809;
>   14   │ }
>
> After this patch:
>    1   │ uint8_t sat_add_u_1 (uint8_t x, uint8_t y)
>    2   │ {
>    3   │   uint8_t D.2809;
>    4   │
>    5   │   _1 = x + y;
>    6   │   _2 = x + y;
>    7   │   _3 = x > _2;
>    8   │   _4 = (unsigned char) _3;
>    9   │   _5 = -_4;
>   10   │   D.2809 = _1 | _5;
>   11   │   return D.2809;
>   12   │ }
>
> The below test suites are passed for this patch.
> * The rv64gcv fully regression test.
> * The x86 bootstrap test.
> * The x86 fully regression test.
>
> gcc/ChangeLog:
>
>         * match.pd: Remove unsigned branch form 4 for SAT_ADD, and
>         add simplify to branchless instead.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.dg/sat_arith_simplify.h: New test.
>         * gcc.dg/sat_u_add-simplify-2-u16.c: New test.
>         * gcc.dg/sat_u_add-simplify-2-u32.c: New test.
>         * gcc.dg/sat_u_add-simplify-2-u64.c: New test.
>         * gcc.dg/sat_u_add-simplify-2-u8.c: New test.
>
> Signed-off-by: Pan Li <pan2.li@intel.com>
> ---
>  gcc/match.pd                                  | 23 +++++++++----------
>  gcc/testsuite/gcc.dg/sat_arith_simplify.h     | 10 ++++++++
>  .../gcc.dg/sat_u_add-simplify-2-u16.c         | 11 +++++++++
>  .../gcc.dg/sat_u_add-simplify-2-u32.c         | 11 +++++++++
>  .../gcc.dg/sat_u_add-simplify-2-u64.c         | 11 +++++++++
>  .../gcc.dg/sat_u_add-simplify-2-u8.c          | 11 +++++++++
>  6 files changed, 65 insertions(+), 12 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/sat_arith_simplify.h
>  create mode 100644 gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u16.c
>  create mode 100644 gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u32.c
>  create mode 100644 gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u64.c
>  create mode 100644 gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u8.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index c851ac56e37..8425d7c4f20 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -3146,18 +3146,17 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>  (match (unsigned_integer_sat_add @0 @1)
>   (bit_ior:c (usadd_left_part_2 @0 @1) (usadd_right_part_2 @0 @1)))
>
> -/* Simplify SAT_U_ADD to the cheap form
> -   From: SAT_U_ADD = (X + Y) >= x ? (X + Y) : -1.
> -   To:   SAT_U_ADD = (X + Y) | - ((X + Y) < X).  */
> -(simplify (cond (ge (plus:c@2 @0 @1) @0) @2 integer_minus_onep)
> - (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> -      && types_match (type, @0, @1))
> -  (bit_ior @2 (negate (convert (lt @2 @0))))))
> -
> -/* Unsigned saturation add, case 4 (branch with lt):
> -   SAT_U_ADD = (X + Y) < x ? -1 : (X + Y).  */
> -(match (unsigned_integer_sat_add @0 @1)
> - (cond^ (lt (usadd_left_part_1@2 @0 @1) @0) integer_minus_onep @2))
> +/* Simplify sorts of SAT_U_ADD forms to the cheap one.
> +   The cheap form: SAT_U_ADD = (X + Y) | - ((X + Y) < X).  */
> +(if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type))
> + /* From SAT_U_ADD = (X + Y) >= x ? (X + Y) : -1.  */
> + (simplify (cond (ge (plus:c@2 @0 @1) @0) @2 integer_minus_onep)
> +  (if (types_match (type, @0, @1))
> +   (bit_ior (plus@2 @0 @1) (negate (convert (lt @2 @0))))))
> + /* From SAT_U_ADD = (X + Y) < x ? -1 : (X + Y).  */
> + (simplify (cond (lt (plus:c@2 @0 @1) @0) integer_minus_onep @2)
> +  (if (types_match (type, @0, @1))
> +   (bit_ior (plus@2 @0 @1) (negate (convert (lt @2 @0)))))))

Sorry for falling back in reviewing - it's not exactly clear the "cheap" form is
cheaper.  When I count the number of gimple statements (sub-expressions)
the original appears as 3 while the result looks to have 5.  The catch is of
course that the original might involve control flow and a PHI.

The pattern will apply during late PHI-OPT or during GENERIC folding
done by the frontends or gimplification - you scan the gimplification dump
below, so having it apply there means it will influence inlining heuristics
for example.  I wonder which variant is considered larger by its heuristic.

What do others think of the early canonicalization of these to
straight-line code?

Thanks,
Richard.

>  /* Unsigned saturation add, case 5 (branch with eq .ADD_OVERFLOW):
>     SAT_U_ADD = REALPART_EXPR <.ADD_OVERFLOW> == 0 ? .ADD_OVERFLOW : -1.  */
> diff --git a/gcc/testsuite/gcc.dg/sat_arith_simplify.h b/gcc/testsuite/gcc.dg/sat_arith_simplify.h
> new file mode 100644
> index 00000000000..46ac00426b2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/sat_arith_simplify.h
> @@ -0,0 +1,10 @@
> +#ifndef HAVE_DEFINED_SAT_ARITH_SIMPLIFY_H
> +#define HAVE_DEFINED_SAT_ARITH_SIMPLIFY_H
> +
> +#define DEF_SAT_U_ADD_2(T)              \
> +T sat_u_add_##T##_2 (T x, T y)          \
> +{                                       \
> +  return (T)(x + y) < x ? -1 : (x + y); \
> +}
> +
> +#endif
> diff --git a/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u16.c b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u16.c
> new file mode 100644
> index 00000000000..b170b35096c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u16.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-gimple-details" } */
> +
> +#include <stdint.h>
> +#include "sat_arith_simplify.h"
> +
> +DEF_SAT_U_ADD_2 (uint16_t)
> +
> +/* { dg-final { scan-tree-dump-not " if " "gimple" } } */
> +/* { dg-final { scan-tree-dump-not " else " "gimple" } } */
> +/* { dg-final { scan-tree-dump-not " goto " "gimple" } } */
> diff --git a/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u32.c b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u32.c
> new file mode 100644
> index 00000000000..8830ed7b878
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u32.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-gimple-details" } */
> +
> +#include <stdint.h>
> +#include "sat_arith_simplify.h"
> +
> +DEF_SAT_U_ADD_2 (uint32_t)
> +
> +/* { dg-final { scan-tree-dump-not " if " "gimple" } } */
> +/* { dg-final { scan-tree-dump-not " else " "gimple" } } */
> +/* { dg-final { scan-tree-dump-not " goto " "gimple" } } */
> diff --git a/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u64.c b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u64.c
> new file mode 100644
> index 00000000000..76c4d4bddaa
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u64.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-gimple-details" } */
> +
> +#include <stdint.h>
> +#include "sat_arith_simplify.h"
> +
> +DEF_SAT_U_ADD_2 (uint64_t)
> +
> +/* { dg-final { scan-tree-dump-not " if " "gimple" } } */
> +/* { dg-final { scan-tree-dump-not " else " "gimple" } } */
> +/* { dg-final { scan-tree-dump-not " goto " "gimple" } } */
> diff --git a/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u8.c b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u8.c
> new file mode 100644
> index 00000000000..b034b8eedb1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u8.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-gimple-details" } */
> +
> +#include <stdint.h>
> +#include "sat_arith_simplify.h"
> +
> +DEF_SAT_U_ADD_2 (uint8_t)
> +
> +/* { dg-final { scan-tree-dump-not " if " "gimple" } } */
> +/* { dg-final { scan-tree-dump-not " else " "gimple" } } */
> +/* { dg-final { scan-tree-dump-not " goto " "gimple" } } */
> --
> 2.43.0
>
Li, Pan2 Nov. 6, 2024, 1:31 p.m. UTC | #2
Never mind, and thanks Richard for the comments.

> Sorry for falling back in reviewing - it's not exactly clear the "cheap" form is
> cheaper.  When I count the number of gimple statements (sub-expressions)
> the original appears as 3 while the result looks to have 5.

I may have a question about how we count the stmts: not sure whether the x <= _1 and
the goto should be counted as 1 or 2 stmts in the example below. I tried to count it
by myself but am not very sure my understanding is correct.

Before this patch:
   1   │ uint8_t sat_add_u_1 (uint8_t x, uint8_t y)
   2   │ {
   3   │   uint8_t D.2809;
   4   │
   5   │   _1 = x + y;  // 1
   6   │   if (x <= _1) goto <D.2810>; else goto <D.2811>; // 2 for x <= _1, 3 for goto ???
   7   │   <D.2810>:
   8   │   D.2809 = x + y; // 4 (taken)
   9   │   goto <D.2812>; // 5 (taken) for goto ???
  10   │   <D.2811>:
  11   │   D.2809 = 255; // 4 (not taken)
  12   │   <D.2812>:
  13   │   return D.2809; // 6 for taken, 5 for not taken
  14   │ }

After this patch:
   1   │ uint8_t sat_add_u_1 (uint8_t x, uint8_t y)
   2   │ {
   3   │   uint8_t D.2809;
   4   │
   5   │   _1 = x + y; // 1
   6   │   _2 = x + y; // 2
   7   │   _3 = x > _2; // 3
   8   │   _4 = (unsigned char) _3; // 4
   9   │   _5 = -_4; // 5
  10   │   D.2809 = _1 | _5; // 6
  11   │   return D.2809; // 7
  12   │ }

> The catch is of
> course that the original might involve control flow and a PHI.

> The pattern will apply during late PHI-OPT or during GENERIC folding
> done by the frontends or gimplification - you scan the gimplification dump
> below, so having it apply there means it will influence inlining heuristics
> for example.  I wonder which variant is considered larger by its heuristic.

> What do others think of the early canonicalization of these to
> straight-line code?

Makes sense, we could find one form that most of us consider to be the "cheapest".

Pan

Tamar Christina Nov. 6, 2024, 4:24 p.m. UTC | #3
> -----Original Message-----
> From: Li, Pan2 <pan2.li@intel.com>
> Sent: Wednesday, November 6, 2024 1:31 PM
> To: Richard Biener <richard.guenther@gmail.com>
> Cc: gcc-patches@gcc.gnu.org; Tamar Christina <Tamar.Christina@arm.com>;
> juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com;
> rdapp.gcc@gmail.com
> Subject: RE: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned
> SAT_ADD into branchless
> 
> Never mind and thanks Richard for comments.
> 
> > Sorry for falling back in reviewing - it's not exactly clear the "cheap" form is
> > cheaper.  When I count the number of gimple statements (sub-expressions)
> > the original appears as 3 while the result looks to have 5.
> 
> I may have a question about how we count the stmts, not sure the x <= _1 and
> then goto should be counted

They should, because they both become instructions that need to be executed in
the execution chain.

> as 1 or 2 stmts in below example. I try to count it by myself but not very sure the
> understanding is correct.
> 
> Before this patch:
>    1   │ uint8_t sat_add_u_1 (uint8_t x, uint8_t y)
>    2   │ {
>    3   │   uint8_t D.2809;
>    4   │
>    5   │   _1 = x + y;  // 1
>    6   │   if (x <= _1) goto <D.2810>; else goto <D.2811>; // 2 for x <= _1, 3 for goto
> ???
>    7   │   <D.2810>:
>    8   │   D.2809 = x + y; // 4 (token)
>    9   │   goto <D.2812>; // 5 (token) for goto ???
>   10   │   <D.2811>:
>   11   │   D.2809 = 255; // 4 (not token)
>   12   │   <D.2812>:
>   13   │   return D.2809; // 6 for token, 5 for not token
>   14   │ }
> 
> After this patch:
>    1   │ uint8_t sat_add_u_1 (uint8_t x, uint8_t y)
>    2   │ {
>    3   │   uint8_t D.2809;
>    4   │
>    5   │   _1 = x + y; // 1
>    6   │   _2 = x + y; // 2
>    7   │   _3 = x > _2; // 3
>    8   │   _4 = (unsigned char) _3; // 4
>    9   │   _5 = -_4; // 5
>   10   │   D.2809 = _1 | _5; // 6
>   11   │   return D.2809; // 7
>   12   │ }
> 
> > The catch is of
> > course that the original might involve control flow and a PHI.

_1 and _2 are counted as one, as they'll be CSE'd.
After the patch your dependency chain is 5 instructions; that is,
to create the result you must execute 5 sequential instructions.

   1   │ uint8_t sat_add_u_1 (uint8_t x, uint8_t y)
   2   │ {
   3   │   uint8_t D.2809;
   4   │
   5   │   _1 = x + y;
   6   │   _2 = x + y; // 1
   7   │   _3 = x > _2; // 2
   8   │   _4 = (unsigned char) _3; // 3
   9   │   _5 = -_4; // 4
  10   │   D.2809 = _1 | _5; //5
  11   │   return D.2809;
  12   │ }

Before your change you had 3, because you always do
compare + branch + set.

The problem with the rewrite is that it pessimizes the code if the saturating
instructions are not recognized afterwards.

Note this is also the case for vector codegen.  The original code in vector would be:

1. duplicate 255
1. x + y
2. compare
3. select.

The first two instructions are independent and execute in parallel, so the original
vector codegen also had a dependency chain of 3.
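
For illustration, a loop of roughly this shape (a sketch, not code from the
patch) is what produces that splat/add/compare/select vector sequence:

#include <stdint.h>

/* Saturating add over arrays; the vectorizer turns the ternary into
   a splat of 255, an add, a compare and a select.  */
void
sat_add_u8_vec (uint8_t *restrict r, const uint8_t *restrict a,
		const uint8_t *restrict b, int n)
{
  for (int i = 0; i < n; i++)
    {
      uint8_t sum = a[i] + b[i];
      r[i] = sum < a[i] ? 255 : sum;
    }
}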

Additionally the branch version will get the benefit of branch prediction when run
inside a loop.

> 
> > The pattern will apply during late PHI-OPT or during GENERIC folding
> > done by the frontends or gimplification - you scan the gimplification dump
> > below, so having it apply there means it will influence inlining heuristics
> > for example.  I wonder which variant is considered larger by its heuristic.
> 
> > What do others think of the early canonicalization of these to
> > straight-line code?
> 
> Make sense, we could find one form that most of us consider to be the "cheapest".

I would disagree with Richard here in that this would be pessimistic codegen for
targets that don't later implement the IFN_SAT.

Thanks,
Tamar

Li, Pan2 Nov. 7, 2024, 1:44 a.m. UTC | #4
I see, thanks Tamar for the explanation.

> The problem with the rewrite is that it pessimizes the code if the saturating
> instructions are not recognized afterwards.

The original idea is somewhat independent of whether the backend supports IFN_SAT_* or not.
Given we have several forms of IFN_SAT_*, some of them cheap while others heavy
from the perspective of gimple, it is possible to do some canonicalization here.

> Additionally the branch version will get the benefit of branch prediction when run
> inside a loop.

Not very sure if my understanding is correct, but the branchless version is preferred in most cases IMO if
they have nearly the same stmt count. Not sure if it is still true during vectorization.

Pan

Tamar Christina Nov. 7, 2024, 11:43 a.m. UTC | #5
> -----Original Message-----
> From: Li, Pan2 <pan2.li@intel.com>
> Sent: Thursday, November 7, 2024 1:45 AM
> To: Tamar Christina <Tamar.Christina@arm.com>; Richard Biener
> <richard.guenther@gmail.com>
> Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com;
> jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
> Subject: RE: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned
> SAT_ADD into branchless
> 
> I see, thanks Tamar for the explanation.
> 
> > The problem with the rewrite is that it pessimizes the code if the saturating
> > instructions are not recognized afterwards.
> 
> The original idea is somewhat independent of whether the backend supports
> IFN_SAT_* or not.
> Given we have several forms of IFN_SAT_*, some of them cheap while others
> heavy
> from the perspective of gimple, it is possible to do some canonicalization here.
> 
> > Additionally the branch version will get the benefit of branch prediction when run
> > inside a loop.
> 
> Not very sure if my understanding is correct, but the branchless version is
> preferred in most cases IMO if
> they have nearly the same stmt count. Not sure if it is still true during
> vectorization.

I don't think this is true.  The branchless version must be either fewer instructions or
cheaper instructions.  The branches, especially in a loop, become mostly free due to
branch prediction.  Modern branch predictors are really good at idioms like this.

That said, the other issue with doing this in gimple is that it interferes with the RTL
conditional execution pass.

For instance see https://godbolt.org/z/fvrq3aq6K

On ISAs with conditional operations the branch version gets if-converted.

On AArch64 we get:

sat_add_u_1(unsigned int, unsigned int):
        adds    w0, w0, w1
        csinv   w0, w0, wzr, cc
        ret

so just 2 instructions, and also branchless. On x86_64 we get:

sat_add_u_1(unsigned int, unsigned int):
        add     edi, esi
        mov     eax, -1
        cmovnc  eax, edi
        ret

so 3 instructions but a dependency chain of 2.

also branchless.  This patch would regress both of these.
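
For contrast, the branchless form would need something like the following on
AArch64 (a hand-written sketch, not actual GCC output):

sat_add_u_1(unsigned int, unsigned int):
        add     w2, w0, w1
        cmp     w0, w2
        csetm   w0, hi          // mask = (x > sum) ? -1 : 0
        orr     w0, w0, w2
        ret

i.e. 4 instructions forming a fully sequential dependency chain, against the
2-instruction csinv sequence above.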

Thanks,
Tamar

> 
> Pan
> 
> -----Original Message-----
> From: Tamar Christina <Tamar.Christina@arm.com>
> Sent: Thursday, November 7, 2024 12:25 AM
> To: Li, Pan2 <pan2.li@intel.com>; Richard Biener <richard.guenther@gmail.com>
> Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com;
> jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
> Subject: RE: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned
> SAT_ADD into branchless
> 
> > -----Original Message-----
> > From: Li, Pan2 <pan2.li@intel.com>
> > Sent: Wednesday, November 6, 2024 1:31 PM
> > To: Richard Biener <richard.guenther@gmail.com>
> > Cc: gcc-patches@gcc.gnu.org; Tamar Christina <Tamar.Christina@arm.com>;
> > juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com;
> > rdapp.gcc@gmail.com
> > Subject: RE: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned
> > SAT_ADD into branchless
> >
> > Never mind and thanks Richard for comments.
> >
> > > Sorry for falling back in reviewing - it's not exactly clear the "cheap" form is
> > > cheaper.  When I count the number of gimple statements (sub-expressions)
> > > the original appears as 3 while the result looks to have 5.
> >
> > I may have a question about how we count the stmts, not sure the x <= _1 and
> > then goto should be counted
> 
> They should, because they both become a instructions that need to be executed in
> the execution chain.
> 
> > as 1 or 2 stmts in below example. I try to count it by myself but not very sure the
> > understanding is correct.
> >
> > Before this patch:
> >    1   │ uint8_t sat_add_u_1 (uint8_t x, uint8_t y)
> >    2   │ {
> >    3   │   uint8_t D.2809;
> >    4   │
> >    5   │   _1 = x + y;  // 1
> >    6   │   if (x <= _1) goto <D.2810>; else goto <D.2811>; // 2 for x <= _1, 3 for goto
> > ???
> >    7   │   <D.2810>:
> >    8   │   D.2809 = x + y; // 4 (token)
> >    9   │   goto <D.2812>; // 5 (token) for goto ???
> >   10   │   <D.2811>:
> >   11   │   D.2809 = 255; // 4 (not token)
> >   12   │   <D.2812>:
> >   13   │   return D.2809; // 6 for token, 5 for not token
> >   14   │ }
> >
> > After this patch:
> >    1   │ uint8_t sat_add_u_1 (uint8_t x, uint8_t y)
> >    2   │ {
> >    3   │   uint8_t D.2809;
> >    4   │
> >    5   │   _1 = x + y; // 1
> >    6   │   _2 = x + y; // 2
> >    7   │   _3 = x > _2; // 3
> >    8   │   _4 = (unsigned char) _3; // 4
> >    9   │   _5 = -_4; // 5
> >   10   │   D.2809 = _1 | _5; // 6
> >   11   │   return D.2809; // 7
> >   12   │ }
> >
> > > The catch is of
> > > course that the original might involve control flow and a PHI.
> 
> _1 and _2 are counted as one, as they'll be CSE'd.
> After the patch your dependency chain is 5 instructions, as in,
> to create the result you must execute 5 sequential instructions.
> 
>    1   │ uint8_t sat_add_u_1 (uint8_t x, uint8_t y)
>    2   │ {
>    3   │   uint8_t D.2809;
>    4   │
>    5   │   _1 = x + y;
>    6   │   _2 = x + y; // 1
>    7   │   _3 = x > _2; // 2
>    8   │   _4 = (unsigned char) _3; // 3
>    9   │   _5 = -_4; // 4
>   10   │   D.2809 = _1 | _5; //5
>   11   │   return D.2809;
>   12   │ }
> 
> Before your change you had 3, because you always do
> Compare + branch + set.
> 
> The problem with the rewrite is that it pessimists the code if the saturating
> instructions are not recognized afterwards.
> 
> Note this is also the case for vector codegen, The original code in vector would be
> 
> 1. duplicate 255
> 1. x + y
> 2. compare
> 3. select.
> 
> The first two instructions are independent and execute in parallel, so the original
> vector codegen also had a dependency chain of 3.
> 
> Additionally the branch version will get the benefit of branch prediction when ran
> inside a loop.
> 
> >
> > > The pattern will apply during late PHI-OPT or during GENERIC folding
> > > done by the frontends or gimplfiication - you scan the gimplification dump
> > > below, so having it apply there means it will influence inlining heuristics
> > > for example.  I wonder which variant is considered larger by its heuristic.
> >
> > > What do others think of the early canonicalization of these to
> > > straight-line code?
> >
> > Make sense, we could find one form that most of us consider to be the
> "cheapest".
> 
> I would disagree with Richard here in that this would be a pessimistic codegen for
> targets that don't later implement the IFN_SAT.
> 
> Thanks,
> Tamar
> 
> >
> > Pan
> >
> > -----Original Message-----
> > From: Richard Biener <richard.guenther@gmail.com>
> > Sent: Wednesday, November 6, 2024 8:43 PM
> > To: Li, Pan2 <pan2.li@intel.com>
> > Cc: gcc-patches@gcc.gnu.org; Tamar.Christina@arm.com; juzhe.zhong@rivai.ai;
> > kito.cheng@gmail.com; jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
> > Subject: Re: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned
> > SAT_ADD into branchless
> >
> > On Thu, Oct 31, 2024 at 7:29 AM <pan2.li@intel.com> wrote:
> > >
> > > From: Pan Li <pan2.li@intel.com>
> > >
> > > There are sorts of forms for the unsigned SAT_ADD.  Some of them are
> > > complicated while others are cheap.  This patch would like to simplify
> > > the complicated form into the cheap ones.  For example as below:
> > >
> > > From the form 4 (branch):
> > >   SAT_U_ADD = (X + Y) < x ? -1 : (X + Y).
> > >
> > > To (branchless):
> > >   SAT_U_ADD = (X + Y) | - ((X + Y) < X).
> > >
> > >   #define T uint8_t
> > >
> > >   T sat_add_u_1 (T x, T y)
> > >   {
> > >     return (T)(x + y) < x ? -1 : (x + y);
> > >   }
> > >
> > > Before this patch:
> > >    1   │ uint8_t sat_add_u_1 (uint8_t x, uint8_t y)
> > >    2   │ {
> > >    3   │   uint8_t D.2809;
> > >    4   │
> > >    5   │   _1 = x + y;
> > >    6   │   if (x <= _1) goto <D.2810>; else goto <D.2811>;
> > >    7   │   <D.2810>:
> > >    8   │   D.2809 = x + y;
> > >    9   │   goto <D.2812>;
> > >   10   │   <D.2811>:
> > >   11   │   D.2809 = 255;
> > >   12   │   <D.2812>:
> > >   13   │   return D.2809;
> > >   14   │ }
> > >
> > > After this patch:
> > >    1   │ uint8_t sat_add_u_1 (uint8_t x, uint8_t y)
> > >    2   │ {
> > >    3   │   uint8_t D.2809;
> > >    4   │
> > >    5   │   _1 = x + y;
> > >    6   │   _2 = x + y;
> > >    7   │   _3 = x > _2;
> > >    8   │   _4 = (unsigned char) _3;
> > >    9   │   _5 = -_4;
> > >   10   │   D.2809 = _1 | _5;
> > >   11   │   return D.2809;
> > >   12   │ }
> > >
> > > The below test suites are passed for this patch.
> > > * The rv64gcv fully regression test.
> > > * The x86 bootstrap test.
> > > * The x86 fully regression test.
> > >
> > > gcc/ChangeLog:
> > >
> > >         * match.pd: Remove unsigned branch form 4 for SAT_ADD, and
> > >         add simplify to branchless instead.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >         * gcc.dg/sat_arith_simplify.h: New test.
> > >         * gcc.dg/sat_u_add-simplify-2-u16.c: New test.
> > >         * gcc.dg/sat_u_add-simplify-2-u32.c: New test.
> > >         * gcc.dg/sat_u_add-simplify-2-u64.c: New test.
> > >         * gcc.dg/sat_u_add-simplify-2-u8.c: New test.
> > >
> > > Signed-off-by: Pan Li <pan2.li@intel.com>
> > > ---
> > >  gcc/match.pd                                  | 23 +++++++++----------
> > >  gcc/testsuite/gcc.dg/sat_arith_simplify.h     | 10 ++++++++
> > >  .../gcc.dg/sat_u_add-simplify-2-u16.c         | 11 +++++++++
> > >  .../gcc.dg/sat_u_add-simplify-2-u32.c         | 11 +++++++++
> > >  .../gcc.dg/sat_u_add-simplify-2-u64.c         | 11 +++++++++
> > >  .../gcc.dg/sat_u_add-simplify-2-u8.c          | 11 +++++++++
> > >  6 files changed, 65 insertions(+), 12 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.dg/sat_arith_simplify.h
> > >  create mode 100644 gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u16.c
> > >  create mode 100644 gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u32.c
> > >  create mode 100644 gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u64.c
> > >  create mode 100644 gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u8.c
> > >
> > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > index c851ac56e37..8425d7c4f20 100644
> > > --- a/gcc/match.pd
> > > +++ b/gcc/match.pd
> > > @@ -3146,18 +3146,17 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> > >  (match (unsigned_integer_sat_add @0 @1)
> > >   (bit_ior:c (usadd_left_part_2 @0 @1) (usadd_right_part_2 @0 @1)))
> > >
> > > -/* Simplify SAT_U_ADD to the cheap form
> > > -   From: SAT_U_ADD = (X + Y) >= x ? (X + Y) : -1.
> > > -   To:   SAT_U_ADD = (X + Y) | - ((X + Y) < X).  */
> > > -(simplify (cond (ge (plus:c@2 @0 @1) @0) @2 integer_minus_onep)
> > > - (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> > > -      && types_match (type, @0, @1))
> > > -  (bit_ior @2 (negate (convert (lt @2 @0))))))
> > > -
> > > -/* Unsigned saturation add, case 4 (branch with lt):
> > > -   SAT_U_ADD = (X + Y) < x ? -1 : (X + Y).  */
> > > -(match (unsigned_integer_sat_add @0 @1)
> > > - (cond^ (lt (usadd_left_part_1@2 @0 @1) @0) integer_minus_onep @2))
> > > +/* Simplify sorts of SAT_U_ADD forms to the cheap one.
> > > +   The cheap form: SAT_U_ADD = (X + Y) | - ((X + Y) < X).  */
> > > +(if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type))
> > > + /* From SAT_U_ADD = (X + Y) >= x ? (X + Y) : -1.  */
> > > + (simplify (cond (ge (plus:c@2 @0 @1) @0) @2 integer_minus_onep)
> > > +  (if (types_match (type, @0, @1))
> > > +   (bit_ior (plus@2 @0 @1) (negate (convert (lt @2 @0))))))
> > > + /* From SAT_U_ADD = (X + Y) < x ? -1 : (X + Y).  */
> > > + (simplify (cond (lt (plus:c@2 @0 @1) @0) integer_minus_onep @2)
> > > +  (if (types_match (type, @0, @1))
> > > +   (bit_ior (plus@2 @0 @1) (negate (convert (lt @2 @0)))))))
> >
> > Sorry for falling behind in reviewing - it's not exactly clear the "cheap" form is
> > cheaper.  When I count the number of gimple statements (sub-expressions)
> > the original appears as 3 while the result looks to have 5.  The catch is of
> > course that the original might involve control flow and a PHI.
> >
> > The pattern will apply during late PHI-OPT or during GENERIC folding
> > done by the frontends or gimplification - you scan the gimplification dump
> > below, so having it apply there means it will influence inlining heuristics
> > for example.  I wonder which variant is considered larger by its heuristic.
> >
> > What do others think of the early canonicalization of these to
> > straight-line code?
> >
> > Thanks,
> > Richard.
> >
> > >  /* Unsigned saturation add, case 5 (branch with eq .ADD_OVERFLOW):
> > >     SAT_U_ADD = REALPART_EXPR <.ADD_OVERFLOW> == 0 ? .ADD_OVERFLOW : -1.  */
> > > diff --git a/gcc/testsuite/gcc.dg/sat_arith_simplify.h
> > b/gcc/testsuite/gcc.dg/sat_arith_simplify.h
> > > new file mode 100644
> > > index 00000000000..46ac00426b2
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.dg/sat_arith_simplify.h
> > > @@ -0,0 +1,10 @@
> > > +#ifndef HAVE_DEFINED_SAT_ARITH_SIMPLIFY_H
> > > +#define HAVE_DEFINED_SAT_ARITH_SIMPLIFY_H
> > > +
> > > +#define DEF_SAT_U_ADD_2(T)              \
> > > +T sat_u_add_##T##_2 (T x, T y)          \
> > > +{                                       \
> > > +  return (T)(x + y) < x ? -1 : (x + y); \
> > > +}
> > > +
> > > +#endif
> > > diff --git a/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u16.c
> > b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u16.c
> > > new file mode 100644
> > > index 00000000000..b170b35096c
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u16.c
> > > @@ -0,0 +1,11 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -fdump-tree-gimple-details" } */
> > > +
> > > +#include <stdint.h>
> > > +#include "sat_arith_simplify.h"
> > > +
> > > +DEF_SAT_U_ADD_2 (uint16_t)
> > > +
> > > +/* { dg-final { scan-tree-dump-not " if " "gimple" } } */
> > > +/* { dg-final { scan-tree-dump-not " else " "gimple" } } */
> > > +/* { dg-final { scan-tree-dump-not " goto " "gimple" } } */
> > > diff --git a/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u32.c
> > b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u32.c
> > > new file mode 100644
> > > index 00000000000..8830ed7b878
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u32.c
> > > @@ -0,0 +1,11 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -fdump-tree-gimple-details" } */
> > > +
> > > +#include <stdint.h>
> > > +#include "sat_arith_simplify.h"
> > > +
> > > +DEF_SAT_U_ADD_2 (uint32_t)
> > > +
> > > +/* { dg-final { scan-tree-dump-not " if " "gimple" } } */
> > > +/* { dg-final { scan-tree-dump-not " else " "gimple" } } */
> > > +/* { dg-final { scan-tree-dump-not " goto " "gimple" } } */
> > > diff --git a/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u64.c
> > b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u64.c
> > > new file mode 100644
> > > index 00000000000..76c4d4bddaa
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u64.c
> > > @@ -0,0 +1,11 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -fdump-tree-gimple-details" } */
> > > +
> > > +#include <stdint.h>
> > > +#include "sat_arith_simplify.h"
> > > +
> > > +DEF_SAT_U_ADD_2 (uint64_t)
> > > +
> > > +/* { dg-final { scan-tree-dump-not " if " "gimple" } } */
> > > +/* { dg-final { scan-tree-dump-not " else " "gimple" } } */
> > > +/* { dg-final { scan-tree-dump-not " goto " "gimple" } } */
> > > diff --git a/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u8.c
> > b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u8.c
> > > new file mode 100644
> > > index 00000000000..b034b8eedb1
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u8.c
> > > @@ -0,0 +1,11 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O2 -fdump-tree-gimple-details" } */
> > > +
> > > +#include <stdint.h>
> > > +#include "sat_arith_simplify.h"
> > > +
> > > +DEF_SAT_U_ADD_2 (uint8_t)
> > > +
> > > +/* { dg-final { scan-tree-dump-not " if " "gimple" } } */
> > > +/* { dg-final { scan-tree-dump-not " else " "gimple" } } */
> > > +/* { dg-final { scan-tree-dump-not " goto " "gimple" } } */
> > > --
> > > 2.43.0
> > >
Li, Pan2 Nov. 7, 2024, 12:57 p.m. UTC | #6
I see your point that the backend can leverage a conditional move when emitting the branch code.

> For instance see https://godbolt.org/z/fvrq3aq6K
> On ISAs with conditional operations the branch version gets ifconverted.
> On AArch64 we get:
> sat_add_u_1(unsigned int, unsigned int):
>         adds    w0, w0, w1
>         csinv   w0, w0, wzr, cc
>         ret
> so just 2 instructions, and also branchless. On x86_64 we get:
> sat_add_u_1(unsigned int, unsigned int):
>         add     edi, esi
>         mov     eax, -1
>         cmovnc  eax, edi
>         ret
> so 3 instructions but a dependency chain of 2.
> also branchless.  This patch would regress both of these.

But the above Godbolt may not be good evidence, because both x86_64 and aarch64 already
implement usadd.
Thus, they both go through the usadd<mode> pattern. For example, as below, sat_add_u_1 and
sat_add_u_2 are almost the same when the backend implements usadd.

#include <stdint.h>

#define T uint32_t

  T sat_add_u_1 (T x, T y)
  {
    return (T)(x + y) < x ? -1 : (x + y);
  }

  T sat_add_u_2 (T x, T y)
  {
    return (x + y) | -((x + y) < x);
  }
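
As a quick sanity check that the two forms really agree on every input, below is
a minimal throwaway harness (hypothetical; f1/f2 are made-up names), narrowed to
uint8_t so the loops are exhaustive:

  #include <stdint.h>
  #include <assert.h>

  typedef uint8_t T8;

  /* Branch form.  */
  static T8 f1 (T8 x, T8 y) { return (T8)(x + y) < x ? -1 : (x + y); }
  /* Branchless form; the inner cast keeps the compare in uint8_t.  */
  static T8 f2 (T8 x, T8 y) { return (x + y) | -(T8)((T8)(x + y) < x); }

  int
  main (void)
  {
    for (unsigned x = 0; x < 256; x++)
      for (unsigned y = 0; y < 256; y++)
        assert (f1 (x, y) == f2 (x, y));
    return 0;
  }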

It becomes different with gcc 14.2 (which doesn’t have the .SAT_ADD GIMPLE IR); the x86_64
asm dump at -O3 is as below. Looks like there is no obvious difference here.

sat_add_u_1(unsigned int, unsigned int):
        add     edi, esi
        mov     eax, -1
        cmovnc  eax, edi
        ret

sat_add_u_2(unsigned int, unsigned int):
        add     edi, esi
        sbb     eax, eax
        or      eax, edi
        ret

The same holds for the vector mode on x86_64 with gcc 14.2, as below.

  void vec_sat_add_u_1 (T *a, T *b, T *out, int n)
  {
    for (int i = 0; i < n; i++) {
        T x = a[i];
        T y = b[i];
        out[i] = (T)(x + y) < x ? -1 : (x + y);
    }
  }

  void vec_sat_add_u_2 (T * __restrict a, T * __restrict b,
          T * __restrict out, int n)
  {
    for (int i = 0; i < n; i++) {
        T x = a[i];
        T y = b[i];
        out[i] = (x + y) | -((x + y) < x);
    }
  }

vec_sat_add_u_1(unsigned int*, unsigned int*, unsigned int*, int):
....
.L15:
        movdqu  xmm0, XMMWORD PTR [rdi+rax]
        movdqu  xmm1, XMMWORD PTR [rsi+rax]
        paddd   xmm1, xmm0
        psubd   xmm0, xmm2
        movdqa  xmm3, xmm1
        psubd   xmm3, xmm2
        pcmpgtd xmm0, xmm3
        por     xmm0, xmm1
        movups  XMMWORD PTR [rdx+rax], xmm0
        add     rax, 16
        cmp     r8, rax
        jne     .L15
        mov     eax, ecx
        and     eax, -4
        mov     r8d, eax
        cmp     ecx, eax
        je      .L11
        sub     ecx, eax
        mov     r9d, ecx
        cmp     ecx, 1
        je      .L17
.L14:
        movq    xmm2, QWORD PTR .LC2[rip]
        mov     ecx, r8d
        movq    xmm0, QWORD PTR [rdi+rcx*4]
        movq    xmm1, QWORD PTR [rsi+rcx*4]
        paddd   xmm1, xmm0
        psubd   xmm0, xmm2
        movdqa  xmm3, xmm1
        psubd   xmm3, xmm2
        pcmpgtd xmm0, xmm3
        movdqa  xmm2, xmm0
        pandn   xmm2, xmm1
        por     xmm0, xmm2
        movq    QWORD PTR [rdx+rcx*4], xmm0
....

vec_sat_add_u_2(unsigned int*, unsigned int*, unsigned int*, int):
...
.L50:
        movdqu  xmm0, XMMWORD PTR [rdi+rax]
        movdqu  xmm1, XMMWORD PTR [rsi+rax]
        paddd   xmm1, xmm0
        psubd   xmm0, xmm2
        movdqa  xmm3, xmm1
        psubd   xmm3, xmm2
        pcmpgtd xmm0, xmm3
        por     xmm0, xmm1
        movups  XMMWORD PTR [rdx+rax], xmm0
        add     rax, 16
        cmp     rax, r8
        jne     .L50
        mov     eax, ecx
        and     eax, -4
        mov     r8d, eax
        cmp     ecx, eax
        je      .L64
.L49:
        sub     ecx, r8d
        cmp     ecx, 1
        je      .L52
        movq    xmm0, QWORD PTR [rdi+r8*4]
        movq    xmm1, QWORD PTR [rsi+r8*4]
        movq    xmm2, QWORD PTR .LC2[rip]
        paddd   xmm1, xmm0
        psubd   xmm0, xmm2
        movdqa  xmm3, xmm1
        psubd   xmm3, xmm2
        pcmpgtd xmm0, xmm3
        movdqa  xmm2, xmm0
        pandn   xmm2, xmm1
        por     xmm0, xmm2
        movq    QWORD PTR [rdx+r8*4], xmm0
...

Pan

-----Original Message-----
From: Tamar Christina <Tamar.Christina@arm.com> 
Sent: Thursday, November 7, 2024 7:43 PM
To: Li, Pan2 <pan2.li@intel.com>; Richard Biener <richard.guenther@gmail.com>
Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
Subject: RE: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned SAT_ADD into branchless

> -----Original Message-----
> From: Li, Pan2 <pan2.li@intel.com>
> Sent: Thursday, November 7, 2024 1:45 AM
> To: Tamar Christina <Tamar.Christina@arm.com>; Richard Biener
> <richard.guenther@gmail.com>
> Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com;
> jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
> Subject: RE: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned
> SAT_ADD into branchless
> 
> I see, thanks Tamar for the explanation.
> 
> > The problem with the rewrite is that it pessimizes the code if the saturating
> > instructions are not recognized afterwards.
> 
> The original idea is somewhat independent of whether the backend supports IFN_SAT_*
> or not.
> Given we have many forms of IFN_SAT_*, some of them are cheap while others
> are heavy
> from the perspective of gimple. It is possible to do some canonicalization here.
> 
> > Additionally the branch version will get the benefit of branch prediction when run
> > inside a loop.
> 
> Not very sure if my understanding is correct, but the branchless version is
> preferred in most cases IMO if
> they have nearly the same count of stmts. Not sure if it is still true during vectorization.

I don't think this is true.  The branchless version must be either fewer instructions or
cheaper instructions.  The branches, especially in a loop, become mostly free due to
branch prediction. Modern branch predictors are really good at idioms like this.

That said, the other issue with doing this in gimple is that it interferes with the RTL
conditional execution pass.

For instance see https://godbolt.org/z/fvrq3aq6K

On ISAs with conditional operations the branch version gets ifconverted.

On AArch64 we get:

sat_add_u_1(unsigned int, unsigned int):
        adds    w0, w0, w1
        csinv   w0, w0, wzr, cc
        ret

so just 2 instructions, and also branchless. On x86_64 we get:

sat_add_u_1(unsigned int, unsigned int):
        add     edi, esi
        mov     eax, -1
        cmovnc  eax, edi
        ret

so 3 instructions but a dependency chain of 2.

also branchless.  This patch would regress both of these.

Thanks,
Tamar

> 
> Pan
> 
> -----Original Message-----
> From: Tamar Christina <Tamar.Christina@arm.com>
> Sent: Thursday, November 7, 2024 12:25 AM
> To: Li, Pan2 <pan2.li@intel.com>; Richard Biener <richard.guenther@gmail.com>
> Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com;
> jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
> Subject: RE: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned
> SAT_ADD into branchless
> 
> > -----Original Message-----
> > From: Li, Pan2 <pan2.li@intel.com>
> > Sent: Wednesday, November 6, 2024 1:31 PM
> > To: Richard Biener <richard.guenther@gmail.com>
> > Cc: gcc-patches@gcc.gnu.org; Tamar Christina <Tamar.Christina@arm.com>;
> > juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com;
> > rdapp.gcc@gmail.com
> > Subject: RE: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned
> > SAT_ADD into branchless
> >
> > Never mind and thanks Richard for comments.
> >
> > > Sorry for falling back in reviewing - it's not exactly clear the "cheap" form is
> > > cheaper.  When I count the number of gimple statements (sub-expressions)
> > > the original appears as 3 while the result looks to have 5.
> >
> > I may have a question about how we count the stmts; not sure whether the x <= _1 and
> > the goto should be counted
> 
> They should, because they both become instructions that need to be executed in
> the execution chain.
> 
> > as 1 or 2 stmts in the below example. I tried to count it myself but am not very sure the
> > understanding is correct.
> >
> > Before this patch:
> >    1   │ uint8_t sat_add_u_1 (uint8_t x, uint8_t y)
> >    2   │ {
> >    3   │   uint8_t D.2809;
> >    4   │
> >    5   │   _1 = x + y;  // 1
> >    6   │   if (x <= _1) goto <D.2810>; else goto <D.2811>; // 2 for x <= _1, 3 for goto
> > ???
> >    7   │   <D.2810>:
> >    8   │   D.2809 = x + y; // 4 (token)
> >    9   │   goto <D.2812>; // 5 (token) for goto ???
> >   10   │   <D.2811>:
> >   11   │   D.2809 = 255; // 4 (not token)
> >   12   │   <D.2812>:
> >   13   │   return D.2809; // 6 for token, 5 for not token
> >   14   │ }
> >
> > After this patch:
> >    1   │ uint8_t sat_add_u_1 (uint8_t x, uint8_t y)
> >    2   │ {
> >    3   │   uint8_t D.2809;
> >    4   │
> >    5   │   _1 = x + y; // 1
> >    6   │   _2 = x + y; // 2
> >    7   │   _3 = x > _2; // 3
> >    8   │   _4 = (unsigned char) _3; // 4
> >    9   │   _5 = -_4; // 5
> >   10   │   D.2809 = _1 | _5; // 6
> >   11   │   return D.2809; // 7
> >   12   │ }
> >
> > > The catch is of
> > > course that the original might involve control flow and a PHI.
> 
> _1 and _2 are counted as one, as they'll be CSE'd.
> After the patch your dependency chain is 5 instructions, as in,
> to create the result you must execute 5 sequential instructions.
> 
>    1   │ uint8_t sat_add_u_1 (uint8_t x, uint8_t y)
>    2   │ {
>    3   │   uint8_t D.2809;
>    4   │
>    5   │   _1 = x + y;
>    6   │   _2 = x + y; // 1
>    7   │   _3 = x > _2; // 2
>    8   │   _4 = (unsigned char) _3; // 3
>    9   │   _5 = -_4; // 4
>   10   │   D.2809 = _1 | _5; //5
>   11   │   return D.2809;
>   12   │ }
> 
> Before your change you had 3, because you always do
> Compare + branch + set.
> 
> > The problem with the rewrite is that it pessimizes the code if the saturating
> instructions are not recognized afterwards.
> 
> > Note this is also the case for vector codegen.  The original code in vector would be
> 
> 1. duplicate 255
> 1. x + y
> 2. compare
> 3. select.
> 
> The first two instructions are independent and execute in parallel, so the original
> vector codegen also had a dependency chain of 3.
> 
> > Additionally the branch version will get the benefit of branch prediction when run
> inside a loop.
> 
> >
> > > The pattern will apply during late PHI-OPT or during GENERIC folding
> > > done by the frontends or gimplification - you scan the gimplification dump
> > > below, so having it apply there means it will influence inlining heuristics
> > > for example.  I wonder which variant is considered larger by its heuristic.
> >
> > > What do others think of the early canonicalization of these to
> > > straight-line code?
> >
> > Makes sense, we could find one form that most of us consider to be the
> "cheapest".
> 
> I would disagree with Richard here in that this would result in pessimistic codegen for
> targets that don't later implement the IFN_SAT.
> 
> Thanks,
> Tamar
> 
> >
> > Pan
> >
> > [snip: Richard's review mail and the fully quoted v2 patch, already quoted
> > in full earlier in the thread.]
Tamar Christina Nov. 7, 2024, 3:07 p.m. UTC | #7
> -----Original Message-----
> From: Li, Pan2 <pan2.li@intel.com>
> Sent: Thursday, November 7, 2024 12:57 PM
> To: Tamar Christina <Tamar.Christina@arm.com>; Richard Biener
> <richard.guenther@gmail.com>
> Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com;
> jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
> Subject: RE: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned
> SAT_ADD into branchless
> 
> I see your point that the backend can leverage a conditional move when emitting
> the branch code.
> 
> > For instance see https://godbolt.org/z/fvrq3aq6K
> > On ISAs with conditional operations the branch version gets ifconverted.
> > On AArch64 we get:
> > sat_add_u_1(unsigned int, unsigned int):
> >         adds    w0, w0, w1
> >         csinv   w0, w0, wzr, cc
> >         ret
> > so just 2 instructions, and also branchless. On x86_64 we get:
> > sat_add_u_1(unsigned int, unsigned int):
> >         add     edi, esi
> >         mov     eax, -1
> >         cmovnc  eax, edi
> >         ret
> > so 3 instructions but a dependency chain of 2.
> > also branchless.  This patch would regress both of these.
> 
> But the above Godbolt may not be good evidence, because both
> x86_64 and aarch64 already
> implement usadd.
> Thus, they both go through the usadd<mode> pattern. For example, as below, sat_add_u_1
> and sat_add_u_2 are almost the
> same when the backend implements usadd.
> 
> #include <stdint.h>
> 
> #define T uint32_t
> 
>   T sat_add_u_1 (T x, T y)
>   {
>     return (T)(x + y) < x ? -1 : (x + y);
>   }
> 
>   T sat_add_u_2 (T x, T y)
>   {
>     return (x + y) | -((x + y) < x);
>   }
> 
> It becomes different with gcc 14.2 (which doesn’t have the .SAT_ADD GIMPLE
> IR); the x86_64
> asm dump at -O3 is as below. Looks like there is no obvious difference here.
> 
> sat_add_u_1(unsigned int, unsigned int):
>         add     edi, esi
>         mov     eax, -1
>         cmovnc  eax, edi
>         ret
> 
> sat_add_u_2(unsigned int, unsigned int):
>         add     edi, esi
>         sbb     eax, eax
>         or      eax, edi
>         ret
> 

Because CE is able to recognize the idiom and turn it back into a conditional move.
Pick a target that doesn't have conditional instructions, like PowerPC
https://godbolt.org/z/4bTv18WMv

You'll see that this canonicalization has made codegen worse.

After:

.L.sat_add_u_1(unsigned int, unsigned int):
        add 4,3,4
        rldicl 9,4,0,32
        subf 3,3,9
        sradi 3,3,63
        or 3,3,4
        rldicl 3,3,0,32
        blr

and before

.L.sat_add_u_1(unsigned int, unsigned int):
        add 4,3,4
        cmplw 0,4,3
        bge 0,.L2
        li 4,-1
.L2:
        rldicl 3,4,0,32
        blr

It means now it always has to execute 6 instructions, whereas before it was 4 or 5 depending
on the order of the branch. So for those architectures, it's always slower.
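
Spelling the branchless form out as single-operation steps makes the longer
dependency chain visible; this is a sketch only, and it lines up just roughly
with the PowerPC sequence above, which additionally needs the two rldicl
zero-extends:

  #include <stdint.h>

  uint32_t
  sat_add_u_steps (uint32_t x, uint32_t y)
  {
    uint32_t sum  = x + y;   /* add             */
    uint32_t lt   = sum < x; /* compare         */
    uint32_t mask = -lt;     /* negate -> mask  */
    return sum | mask;       /* or              */
  }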

Tamar

> The same holds for the vector mode on x86_64 with gcc 14.2, as below.
> 
>   void vec_sat_add_u_1 (T *a, T *b, T *out, int n)
>   {
>     for (int i = 0; i < n; i++) {
>         T x = a[i];
>         T y = b[i];
>         out[i] = (T)(x + y) < x ? -1 : (x + y);
>     }
>   }
> 
>   void vec_sat_add_u_2 (T * __restrict a, T * __restrict b,
>           T * __restrict out, int n)
>   {
>     for (int i = 0; i < n; i++) {
>         T x = a[i];
>         T y = b[i];
>         out[i] = (x + y) | -((x + y) < x);
>     }
>   }
> 
> vec_sat_add_u_1(unsigned int*, unsigned int*, unsigned int*, int):
> ....
> .L15:
>         movdqu  xmm0, XMMWORD PTR [rdi+rax]
>         movdqu  xmm1, XMMWORD PTR [rsi+rax]
>         paddd   xmm1, xmm0
>         psubd   xmm0, xmm2
>         movdqa  xmm3, xmm1
>         psubd   xmm3, xmm2
>         pcmpgtd xmm0, xmm3
>         por     xmm0, xmm1
>         movups  XMMWORD PTR [rdx+rax], xmm0
>         add     rax, 16
>         cmp     r8, rax
>         jne     .L15
>         mov     eax, ecx
>         and     eax, -4
>         mov     r8d, eax
>         cmp     ecx, eax
>         je      .L11
>         sub     ecx, eax
>         mov     r9d, ecx
>         cmp     ecx, 1
>         je      .L17
> .L14:
>         movq    xmm2, QWORD PTR .LC2[rip]
>         mov     ecx, r8d
>         movq    xmm0, QWORD PTR [rdi+rcx*4]
>         movq    xmm1, QWORD PTR [rsi+rcx*4]
>         paddd   xmm1, xmm0
>         psubd   xmm0, xmm2
>         movdqa  xmm3, xmm1
>         psubd   xmm3, xmm2
>         pcmpgtd xmm0, xmm3
>         movdqa  xmm2, xmm0
>         pandn   xmm2, xmm1
>         por     xmm0, xmm2
>         movq    QWORD PTR [rdx+rcx*4], xmm0
> ....
> 
> vec_sat_add_u_2(unsigned int*, unsigned int*, unsigned int*, int):
> ...
> .L50:
>         movdqu  xmm0, XMMWORD PTR [rdi+rax]
>         movdqu  xmm1, XMMWORD PTR [rsi+rax]
>         paddd   xmm1, xmm0
>         psubd   xmm0, xmm2
>         movdqa  xmm3, xmm1
>         psubd   xmm3, xmm2
>         pcmpgtd xmm0, xmm3
>         por     xmm0, xmm1
>         movups  XMMWORD PTR [rdx+rax], xmm0
>         add     rax, 16
>         cmp     rax, r8
>         jne     .L50
>         mov     eax, ecx
>         and     eax, -4
>         mov     r8d, eax
>         cmp     ecx, eax
>         je      .L64
> .L49:
>         sub     ecx, r8d
>         cmp     ecx, 1
>         je      .L52
>         movq    xmm0, QWORD PTR [rdi+r8*4]
>         movq    xmm1, QWORD PTR [rsi+r8*4]
>         movq    xmm2, QWORD PTR .LC2[rip]
>         paddd   xmm1, xmm0
>         psubd   xmm0, xmm2
>         movdqa  xmm3, xmm1
>         psubd   xmm3, xmm2
>         pcmpgtd xmm0, xmm3
>         movdqa  xmm2, xmm0
>         pandn   xmm2, xmm1
>         por     xmm0, xmm2
>         movq    QWORD PTR [rdx+r8*4], xmm0
> ...

Because x86 is not a fully masked architecture.


> 
> Pan
> 
> [snip: remainder of the quoted thread history, already quoted in full in the
> previous messages.]
Jeff Law Nov. 7, 2024, 8:07 p.m. UTC | #8
On 11/7/24 8:07 AM, Tamar Christina wrote:
> 
>> -----Original Message-----
>> From: Li, Pan2 <pan2.li@intel.com>
>> Sent: Thursday, November 7, 2024 12:57 PM
>> To: Tamar Christina <Tamar.Christina@arm.com>; Richard Biener
>> <richard.guenther@gmail.com>
>> Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com;
>> jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
>> Subject: RE: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned
>> SAT_ADD into branchless
>>
>> I see your point that the backend can leverage a conditional move when emitting
>> the branch code.
>>
>>> For instance see https://godbolt.org/z/fvrq3aq6K
>>> On ISAs with conditional operations the branch version gets ifconverted.
>>> On AArch64 we get:
>>> sat_add_u_1(unsigned int, unsigned int):
>>>          adds    w0, w0, w1
>>>          csinv   w0, w0, wzr, cc
>>>          ret
>>> so just 2 instructions, and also branchless. On x86_64 we get:
>>> sat_add_u_1(unsigned int, unsigned int):
>>>          add     edi, esi
>>>          mov     eax, -1
>>>          cmovnc  eax, edi
>>>          ret
>>> so 3 instructions but a dependency chain of 2.
>>> also branchless.  This patch would regress both of these.
>>
>> But the above Godbolt may not be good evidence, because both
>> x86_64 and aarch64 already
>> implement usadd.
>> Thus, they both go through the usadd<mode> pattern. For example, as below, sat_add_u_1
>> and sat_add_u_2 are almost the
>> same when the backend implements usadd.
>>
>> #include <stdint.h>
>>
>> #define T uint32_t
>>
>>    T sat_add_u_1 (T x, T y)
>>    {
>>      return (T)(x + y) < x ? -1 : (x + y);
>>    }
>>
>>    T sat_add_u_2 (T x, T y)
>>    {
>>      return (x + y) | -((x + y) < x);
>>    }
>>
>> It becomes different with gcc 14.2 (which doesn’t have the .SAT_ADD GIMPLE
>> IR); the x86_64
>> asm dump at -O3 is as below. Looks like there is no obvious difference here.
>>
>> sat_add_u_1(unsigned int, unsigned int):
>>          add     edi, esi
>>          mov     eax, -1
>>          cmovnc  eax, edi
>>          ret
>>
>> sat_add_u_2(unsigned int, unsigned int):
>>          add     edi, esi
>>          sbb     eax, eax
>>          or      eax, edi
>>          ret
>>
> 
> Because CE is able to recognize the idiom and turn it back into a conditional move.
> Pick a target that doesn't have conditional instructions, like PowerPC
> https://godbolt.org/z/4bTv18WMv
> 
> You'll see that this canonicalization has made codegen worse.
> 
> After:
> 
> .L.sat_add_u_1(unsigned int, unsigned int):
>          add 4,3,4
>          rldicl 9,4,0,32
>          subf 3,3,9
>          sradi 3,3,63
>          or 3,3,4
>          rldicl 3,3,0,32
>          blr
> 
> and before
> 
> .L.sat_add_u_1(unsigned int, unsigned int):
>          add 4,3,4
>          cmplw 0,4,3
>          bge 0,.L2
>          li 4,-1
> .L2:
>          rldicl 3,4,0,32
>          blr
> 
> It means now it always has to execute 6 instructions, whereas before it was 4 or 5 depending
> on the order of the branch. So for those architectures, it's always slower.
I'm not sure it's that simple.  It'll depend on the micro-architecture:
things like the strength of the branch predictors and how fetch blocks are
handled (can you have embedded not-taken branches, short-forward-branch
optimizations, etc.).

Jeff
Li, Pan2 Nov. 7, 2024, 11:34 p.m. UTC | #9
Thanks Tamar and Jeff for the comments.

> I'm not sure it's that simple.  It'll depend on the micro-architecture:
> things like the strength of the branch predictors and how fetch blocks are
> handled (can you have embedded not-taken branches, short-forward-branch
> optimizations, etc.).

> After:
> 
> .L.sat_add_u_1(unsigned int, unsigned int):
>          add 4,3,4
>          rldicl 9,4,0,32
>          subf 3,3,9
>          sradi 3,3,63
>          or 3,3,4
>          rldicl 3,3,0,32
>          blr
> 
> and before
> 
> .L.sat_add_u_1(unsigned int, unsigned int):
>          add 4,3,4
>          cmplw 0,4,3
>          bge 0,.L2
>          li 4,-1
> .L2:
>          rldicl 3,4,0,32
>          blr

I am not familiar with branch prediction, but the branch should be 50% taken and 50% not-taken
given the range of sat add inputs. Is that the worst case for branch prediction? I mean, if we call it
100 times with a taken, not-taken, taken, not-taken... sequence, will the branch version still be faster?
Feel free to correct me if I'm wrong.
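
A hypothetical harness for the alternating pattern described above could look
like the below (all names made up; the inputs are picked so the saturation
condition flips on every iteration):

  #include <stdint.h>

  static uint32_t
  sat_add_u (uint32_t x, uint32_t y)
  {
    return (uint32_t)(x + y) < x ? -1 : (x + y);
  }

  uint64_t
  run_alternating (void)
  {
    uint64_t acc = 0;
    for (int i = 0; i < 100; i++)
      /* even i: 1 + 2 does not overflow (branch one way); odd i:
         UINT32_MAX + 2 overflows (branch the other way).  */
      acc += sat_add_u ((i & 1) ? UINT32_MAX : 1u, 2u);
    return acc;
  }

(That said, a strictly alternating taken/not-taken pattern is typically easy
for local-history predictors, so truly random overflow inputs would likely be
the harder case.)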

Back to the 16 forms of sat add below: is there any suggestion as to which one or two form(s) may be
cheaper than the others from the perspective of gimple IR, independent of whether the backend
implements SAT_ADD? (A small smoke test over two of these forms is sketched after the list.)

#define DEF_SAT_U_ADD_1(T)               \
T sat_u_add_##T##_1 (T x, T y)           \
{                                        \
  return (T)(x + y) >= x ? (x + y) : -1; \
}

#define DEF_SAT_U_ADD_2(T)              \
T sat_u_add_##T##_2 (T x, T y)          \
{                                       \
  return (T)(x + y) < x ? -1 : (x + y); \
}

#define DEF_SAT_U_ADD_3(T)               \
T sat_u_add_##T##_3 (T x, T y)           \
{                                        \
  return x <= (T)(x + y) ? (x + y) : -1; \
}

#define DEF_SAT_U_ADD_4(T)              \
T sat_u_add_##T##_4 (T x, T y)          \
{                                       \
  return x > (T)(x + y) ? -1 : (x + y); \
}

#define DEF_SAT_U_ADD_5(T)      \
T sat_u_add_##T##_5 (T x, T y)  \
{                               \
  if ((T)(x + y) >= x)          \
    return x + y;               \
  else                          \
    return -1;                  \
}

#define DEF_SAT_U_ADD_6(T)      \
T sat_u_add_##T##_6 (T x, T y)  \
{                               \
  if ((T)(x + y) < x)           \
    return -1;                  \
  else                          \
    return x + y;               \
}

#define DEF_SAT_U_ADD_7(T)      \
T sat_u_add_##T##_7 (T x, T y)  \
{                               \
  if (x <= (T)(x + y))          \
    return x + y;               \
  else                          \
    return -1;                  \
}

#define DEF_SAT_U_ADD_8(T)      \
T sat_u_add_##T##_8 (T x, T y)  \
{                               \
  if (x > (T)(x + y))           \
    return -1;                  \
  else                          \
    return x + y;               \
}

#define DEF_SAT_U_ADD_9(T)                                     \
T sat_u_add_##T##_9 (T x, T y)                                 \
{                                                              \
  T ret;                                                       \
  return __builtin_add_overflow (x, y, &ret) == 0 ? ret : - 1; \
}

#define DEF_SAT_U_ADD_10(T)                                \
T sat_u_add_##T##_10 (T x, T y)                            \
{                                                          \
  T ret;                                                   \
  return !__builtin_add_overflow (x, y, &ret) ? ret : - 1; \
}

#define DEF_SAT_U_ADD_11(T)                     \
T sat_u_add_##T##_11 (T x, T y)                 \
{                                               \
  T ret;                                        \
  if (__builtin_add_overflow (x, y, &ret) == 0) \
    return ret;                                 \
  else                                          \
    return -1;                                  \
}

#define DEF_SAT_U_ADD_12(T)                 \
T sat_u_add_##T##_12 (T x, T y)             \
{                                           \
  T ret;                                    \
  if (!__builtin_add_overflow (x, y, &ret)) \
    return ret;                             \
  else                                      \
    return -1;                              \
}

#define DEF_SAT_U_ADD_13(T)                                   \
T sat_u_add_##T##_13 (T x, T y)                               \
{                                                             \
  T ret;                                                      \
  return __builtin_add_overflow (x, y, &ret) != 0 ? -1 : ret; \
}

#define DEF_SAT_U_ADD_14(T)                              \
T sat_u_add_##T##_14 (T x, T y)                          \
{                                                        \
  T ret;                                                 \
  return __builtin_add_overflow (x, y, &ret) ? -1 : ret; \
}

#define DEF_SAT_U_ADD_15(T)                     \
T sat_u_add_##T##_15 (T x, T y)                 \
{                                               \
  T ret;                                        \
  if (__builtin_add_overflow (x, y, &ret) != 0) \
    return -1;                                  \
  else                                          \
    return ret;                                 \
}

#define DEF_SAT_U_ADD_16(T)                \
T sat_u_add_##T##_16 (T x, T y)            \
{                                          \
  T ret;                                   \
  if (__builtin_add_overflow (x, y, &ret)) \
    return -1;                             \
  else                                     \
    return ret;                            \
}
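
For a quick sanity reference, here is a minimal sketch that instantiates two of
the forms above and checks one wrapping and one non-wrapping case (the type and
test values are illustrative assumptions, and it presumes the macro definitions
above are in scope):

  #include <stdint.h>
  #include <assert.h>

  DEF_SAT_U_ADD_1 (uint8_t) /* defines sat_u_add_uint8_t_1 */
  DEF_SAT_U_ADD_2 (uint8_t) /* defines sat_u_add_uint8_t_2 */

  int main (void)
  {
    /* Non-saturating: 10 + 20 fits in uint8_t.  */
    assert (sat_u_add_uint8_t_1 (10, 20) == 30);
    assert (sat_u_add_uint8_t_2 (10, 20) == 30);

    /* Saturating: 200 + 100 wraps to 44, so both forms return 255.  */
    assert (sat_u_add_uint8_t_1 (200, 100) == 255);
    assert (sat_u_add_uint8_t_2 (200, 100) == 255);

    return 0;
  }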

Pan

-----Original Message-----
From: Jeff Law <jeffreyalaw@gmail.com> 
Sent: Friday, November 8, 2024 4:08 AM
To: Tamar Christina <Tamar.Christina@arm.com>; Li, Pan2 <pan2.li@intel.com>; Richard Biener <richard.guenther@gmail.com>
Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; rdapp.gcc@gmail.com
Subject: Re: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned SAT_ADD into branchless



On 11/7/24 8:07 AM, Tamar Christina wrote:
> 
>> -----Original Message-----
>> From: Li, Pan2 <pan2.li@intel.com>
>> Sent: Thursday, November 7, 2024 12:57 PM
>> To: Tamar Christina <Tamar.Christina@arm.com>; Richard Biener
>> <richard.guenther@gmail.com>
>> Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com;
>> jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
>> Subject: RE: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned
>> SAT_ADD into branchless
>>
>> I see your point that the backend can leverage condition move to emit the branch
>> code.
>>
>>> For instance see https://godbolt.org/z/fvrq3aq6K
>>> On ISAs with conditional operations the branch version gets ifconverted.
>>> On AArch64 we get:
>>> sat_add_u_1(unsigned int, unsigned int):
>>>          adds    w0, w0, w1
>>>          csinv   w0, w0, wzr, cc
>>>          ret
>>> so just 2 instructions, and also branchless. On x86_64 we get:
>>> sat_add_u_1(unsigned int, unsigned int):
>>>          add     edi, esi
>>>          mov     eax, -1
>>>          cmovnc  eax, edi
>>>          ret
>>> so 3 instructions but a dependency chain of 2.
>>> also branchless.  This patch would regress both of these.
>>
>> But the above Godbolt may not be a good example for evidence, because both
>> x86_64 and aarch64 implemented usadd
>> already.
>> Thus, they all go to usadd<QImode>. For example as below, the sat_add_u_1 and
>> sat_add_u_2 are almost the
>> same when the backend implemented usadd.
>>
>> #include <stdint.h>
>>
>> #define T uint32_t
>>
>>    T sat_add_u_1 (T x, T y)
>>    {
>>      return (T)(x + y) < x ? -1 : (x + y);
>>    }
>>
>>    T sat_add_u_2 (T x, T y)
>>    {
>>      return (x + y) | -((x + y) < x);
>>    }
>>
>> It will become different when take gcc 14.2 (which doesn’t have .SAT_ADD GIMPLE
>> IR), the x86_64
>> will have below asm dump for -O3. Looks like no obvious difference here.
>>
>> sat_add_u_1(unsigned int, unsigned int):
>>          add     edi, esi
>>          mov     eax, -1
>>          cmovnc  eax, edi
>>          ret
>>
>> sat_add_u_2(unsigned int, unsigned int):
>>          add     edi, esi
>>          sbb     eax, eax
>>          or      eax, edi
>>          ret
>>
> 
> Because CE is able to recognize the idiom back into a conditional move.
> Pick a target that doesn't have conditional instructions, like PowerPC
> https://godbolt.org/z/4bTv18WMv
> 
> You'll see that this canonicalization has made codegen worse.
> 
> After:
> 
> .L.sat_add_u_1(unsigned int, unsigned int):
>          add 4,3,4
>          rldicl 9,4,0,32
>          subf 3,3,9
>          sradi 3,3,63
>          or 3,3,4
>          rldicl 3,3,0,32
>          blr
> 
> and before
> 
> .L.sat_add_u_1(unsigned int, unsigned int):
>          add 4,3,4
>          cmplw 0,4,3
>          bge 0,.L2
>          li 4,-1
> .L2:
>          rldicl 3,4,0,32
>          blr
> 
> It means now it always has to execute 6 instructions, whereas before it was 4 or 5 depending
> on the order of the branch. So for those architectures, it's always slower.
I'm not sure it's that simple.  It'll depend on the micro-architecture. 
So things like strength of the branch predictors, how fetch blocks are 
handled (can you have embedded not-taken branches, short-forward-branch 
optimizations, etc).

Jeff
Tamar Christina Nov. 8, 2024, 7:34 a.m. UTC | #10
> -----Original Message-----
> From: Jeff Law <jeffreyalaw@gmail.com>
> Sent: Thursday, November 7, 2024 8:08 PM
> To: Tamar Christina <Tamar.Christina@arm.com>; Li, Pan2 <pan2.li@intel.com>;
> Richard Biener <richard.guenther@gmail.com>
> Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com;
> rdapp.gcc@gmail.com
> Subject: Re: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned
> SAT_ADD into branchless
> 
> 
> 
> On 11/7/24 8:07 AM, Tamar Christina wrote:
> >
> >> -----Original Message-----
> >> From: Li, Pan2 <pan2.li@intel.com>
> >> Sent: Thursday, November 7, 2024 12:57 PM
> >> To: Tamar Christina <Tamar.Christina@arm.com>; Richard Biener
> >> <richard.guenther@gmail.com>
> >> Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com;
> >> jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
> >> Subject: RE: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned
> >> SAT_ADD into branchless
> >>
> >> I see your point that the backend can leverage condition move to emit the
> branch
> >> code.
> >>
> >>> For instance see https://godbolt.org/z/fvrq3aq6K
> >>> On ISAs with conditional operations the branch version gets ifconverted.
> >>> On AArch64 we get:
> >>> sat_add_u_1(unsigned int, unsigned int):
> >>>          adds    w0, w0, w1
> >>>          csinv   w0, w0, wzr, cc
> >>>          ret
> >>> so just 2 instructions, and also branchless. On x86_64 we get:
> >>> sat_add_u_1(unsigned int, unsigned int):
> >>>          add     edi, esi
> >>>          mov     eax, -1
> >>>          cmovnc  eax, edi
> >>>          ret
> >>> so 3 instructions but a dependency chain of 2.
> >>> also branchless.  This patch would regress both of these.
> >>
> >> But the above Godbolt may not be a good example for evidence, because both
> >> x86_64 and aarch64 implemented usadd
> >> already.
> >> Thus, they all go to usadd<QImode>. For example as below, the sat_add_u_1
> and
> >> sat_add_u_2 are almost the
> >> same when the backend implemented usadd.
> >>
> >> #include <stdint.h>
> >>
> >> #define T uint32_t
> >>
> >>    T sat_add_u_1 (T x, T y)
> >>    {
> >>      return (T)(x + y) < x ? -1 : (x + y);
> >>    }
> >>
> >>    T sat_add_u_2 (T x, T y)
> >>    {
> >>      return (x + y) | -((x + y) < x);
> >>    }
> >>
> >> It will become different when take gcc 14.2 (which doesn’t have .SAT_ADD
> GIMPLE
> >> IR), the x86_64
> >> will have below asm dump for -O3. Looks like no obvious difference here.
> >>
> >> sat_add_u_1(unsigned int, unsigned int):
> >>          add     edi, esi
> >>          mov     eax, -1
> >>          cmovnc  eax, edi
> >>          ret
> >>
> >> sat_add_u_2(unsigned int, unsigned int):
> >>          add     edi, esi
> >>          sbb     eax, eax
> >>          or      eax, edi
> >>          ret
> >>
> >
> > Because CE is able to recognize the idiom back into a conditional move.
> > Pick a target that doesn't have conditional instructions, like PowerPC
> > https://godbolt.org/z/4bTv18WMv
> >
> > You'll see that this canonicalization has made codegen worse.
> >
> > After:
> >
> > .L.sat_add_u_1(unsigned int, unsigned int):
> >          add 4,3,4
> >          rldicl 9,4,0,32
> >          subf 3,3,9
> >          sradi 3,3,63
> >          or 3,3,4
> >          rldicl 3,3,0,32
> >          blr
> >
> > and before
> >
> > .L.sat_add_u_1(unsigned int, unsigned int):
> >          add 4,3,4
> >          cmplw 0,4,3
> >          bge 0,.L2
> >          li 4,-1
> > .L2:
> >          rldicl 3,4,0,32
> >          blr
> >
> > It means now it always has to execute 6 instructions, whereas before it was 4 or
> 5 depending
> > on the order of the branch. So for those architectures, it's always slower.
> I'm not sure it's that simple.  It'll depend on the micro-architecture.
> So things like strength of the branch predictors, how fetch blocks are
> handled (can you have embedded not-taken branches, short-forward-branch
> optimizations, etc).

It's a pretty safe assumption that most modern branch predictors don't have an issue with
a simple diamond node like this. But since you brought up fetch blocks: at 5 instructions you're
far more likely to fit the entire thing in a fetch block, especially within a loop, where most targets
align loops.

Whereas the more instructions you add, the larger the chance that you need multiple fetches for the
loop. You then weigh the cost of a misprediction (which will get better as the loop runs) against the cost of an
underutilized fetch, which will not get better.  You are also assuming that the logical operations in the
longer sequence are cheap on the particular core.

I would agree that removing the branch is universally a good idea if it produced the same number of
instructions or fewer, or cheaper instructions, as I said before.  I disagree that replacing it with more
instructions is universally better, and you yourself said "it'll depend".

Tamar

> 
> Jeff
Richard Biener Nov. 8, 2024, 8:03 a.m. UTC | #11
On Fri, Nov 8, 2024 at 12:34 AM Li, Pan2 <pan2.li@intel.com> wrote:
>
> Thanks Tamar and Jeff for comments.
>
> > I'm not sure it's that simple.  It'll depend on the micro-architecture.
> > So things like strength of the branch predictors, how fetch blocks are
> > handled (can you have embedded not-taken branches, short-forward-branch
> > optimizations, etc).
>
> > After:
> >
> > .L.sat_add_u_1(unsigned int, unsigned int):
> >          add 4,3,4
> >          rldicl 9,4,0,32
> >          subf 3,3,9
> >          sradi 3,3,63
> >          or 3,3,4
> >          rldicl 3,3,0,32
> >          blr
> >
> > and before
> >
> > .L.sat_add_u_1(unsigned int, unsigned int):
> >          add 4,3,4
> >          cmplw 0,4,3
> >          bge 0,.L2
> >          li 4,-1
> > .L2:
> >          rldicl 3,4,0,32
> >          blr
>
> I am not familiar with branch prediction, but the branch should be 50% taken and 50% not-taken
> given the input range of sat add. Isn't that the worst case for branch prediction? I mean, if we call
> it 100 times with a taken, not-taken, taken, not-taken... sequence, will the branch version still be faster?
> Feel free to correct me if I'm wrong.
>
> Back to these 16 forms of sat add as below, is there any suggestion as to which one or two form(s) may be
> cheaper than the others from the perspective of gimple IR, independent of whether the backend implements SAT_ADD or not?
>
> #define DEF_SAT_U_ADD_1(T)               \
> T sat_u_add_##T##_1 (T x, T y)           \
> {                                        \
>   return (T)(x + y) >= x ? (x + y) : -1; \
> }
>
> #define DEF_SAT_U_ADD_2(T)              \
> T sat_u_add_##T##_2 (T x, T y)          \
> {                                       \
>   return (T)(x + y) < x ? -1 : (x + y); \
> }
>
> #define DEF_SAT_U_ADD_3(T)               \
> T sat_u_add_##T##_3 (T x, T y)           \
> {                                        \
>   return x <= (T)(x + y) ? (x + y) : -1; \
> }
>
> #define DEF_SAT_U_ADD_4(T)              \
> T sat_u_add_##T##_4 (T x, T y)          \
> {                                       \
>   return x > (T)(x + y) ? -1 : (x + y); \
> }
>
> #define DEF_SAT_U_ADD_5(T)      \
> T sat_u_add_##T##_5 (T x, T y)  \
> {                               \
>   if ((T)(x + y) >= x)          \
>     return x + y;               \
>   else                          \
>     return -1;                  \
> }
>
> #define DEF_SAT_U_ADD_6(T)      \
> T sat_u_add_##T##_6 (T x, T y)  \
> {                               \
>   if ((T)(x + y) < x)           \
>     return -1;                  \
>   else                          \
>     return x + y;               \
> }
>
> #define DEF_SAT_U_ADD_7(T)      \
> T sat_u_add_##T##_7 (T x, T y)  \
> {                               \
>   if (x <= (T)(x + y))          \
>     return x + y;               \
>   else                          \
>     return -1;                  \
> }
>
> #define DEF_SAT_U_ADD_8(T)      \
> T sat_u_add_##T##_8 (T x, T y)  \
> {                               \
>   if (x > (T)(x + y))           \
>     return -1;                  \
>   else                          \
>     return x + y;               \
> }
>
> #define DEF_SAT_U_ADD_9(T)                                     \
> T sat_u_add_##T##_9 (T x, T y)                                 \
> {                                                              \
>   T ret;                                                       \
>   return __builtin_add_overflow (x, y, &ret) == 0 ? ret : - 1; \
> }
>
> #define DEF_SAT_U_ADD_10(T)                                \
> T sat_u_add_##T##_10 (T x, T y)                            \
> {                                                          \
>   T ret;                                                   \
>   return !__builtin_add_overflow (x, y, &ret) ? ret : - 1; \
> }
>
> #define DEF_SAT_U_ADD_11(T)                     \
> T sat_u_add_##T##_11 (T x, T y)                 \
> {                                               \
>   T ret;                                        \
>   if (__builtin_add_overflow (x, y, &ret) == 0) \
>     return ret;                                 \
>   else                                          \
>     return -1;                                  \
> }
>
> #define DEF_SAT_U_ADD_12(T)                 \
> T sat_u_add_##T##_12 (T x, T y)             \
> {                                           \
>   T ret;                                    \
>   if (!__builtin_add_overflow (x, y, &ret)) \
>     return ret;                             \
>   else                                      \
>     return -1;                              \
> }
>
> #define DEF_SAT_U_ADD_13(T)                                   \
> T sat_u_add_##T##_13 (T x, T y)                               \
> {                                                             \
>   T ret;                                                      \
>   return __builtin_add_overflow (x, y, &ret) != 0 ? -1 : ret; \
> }
>
> #define DEF_SAT_U_ADD_14(T)                              \
> T sat_u_add_##T##_14 (T x, T y)                          \
> {                                                        \
>   T ret;                                                 \
>   return __builtin_add_overflow (x, y, &ret) ? -1 : ret; \
> }
>
> #define DEF_SAT_U_ADD_15(T)                     \
> T sat_u_add_##T##_15 (T x, T y)                 \
> {                                               \
>   T ret;                                        \
>   if (__builtin_add_overflow (x, y, &ret) != 0) \
>     return -1;                                  \
>   else                                          \
>     return ret;                                 \
> }
>
> #define DEF_SAT_U_ADD_16(T)                \
> T sat_u_add_##T##_16 (T x, T y)            \
> {                                          \
>   T ret;                                   \
>   if (__builtin_add_overflow (x, y, &ret)) \
>     return -1;                             \
>   else                                     \
>     return ret;                            \
> }

So for the late .SAT_* introduction in the widen_mul pass, the
__builtin_add_overflow variants make sense to be used iff the target
has optabs for those.  The IL for the builtin variant should appear
cheap as well:

  _6 = .ADD_OVERFLOW (x_4(D), y_5(D));
  _2 = IMAGPART_EXPR <_6>;
  if (_2 != 0)
    goto <bb 4>; [21.72%]
  else
    goto <bb 3>; [78.28%]

  <bb 3> [local count: 840525096]:
  _1 = REALPART_EXPR <_6>;

  <bb 4> [local count: 1073741824]:
  # _3 = PHI <65535(2), _1(3)>

It's possibly also the best canonicalization for the overflow check,
though I fear code generation for targets that do not support
__builtin_*_overflow might end up with two branches.

That said, I'd avoid canonicalizing this via match.pd, given that it
inevitably will if-convert.  Instead I'd see it as a way to provide a
generic .SAT_* expansion, though one could say we should then simply
implement fallback expansion for the internal function.  It's also not
necessarily your responsibility to implement this, since risc-v has
.SAT_* expanders, and so does x86.
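
For illustration, a minimal sketch of the branchless computation such a generic
fallback could be based on, written in C for a single type (the function name is
an assumption, and an actual fallback would of course emit the equivalent IL in
the expander rather than source code):

  #include <stdint.h>

  /* Branchless unsigned saturating add:
     SAT_U_ADD (x, y) = (x + y) | -((x + y) < x).
     E.g. 200 + 100 wraps to 44, and 44 < 200 holds, so the mask is
     0xff and the result saturates to 255.  */
  static inline uint8_t
  sat_u_add_fallback (uint8_t x, uint8_t y)
  {
    uint8_t sum = x + y;                 /* may wrap around */
    uint8_t mask = -(uint8_t)(sum < x);  /* all-ones iff it wrapped */
    return sum | mask;
  }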

Richard.

> Pan
>
> -----Original Message-----
> From: Jeff Law <jeffreyalaw@gmail.com>
> Sent: Friday, November 8, 2024 4:08 AM
> To: Tamar Christina <Tamar.Christina@arm.com>; Li, Pan2 <pan2.li@intel.com>; Richard Biener <richard.guenther@gmail.com>
> Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; rdapp.gcc@gmail.com
> Subject: Re: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned SAT_ADD into branchless
>
>
>
> On 11/7/24 8:07 AM, Tamar Christina wrote:
> >
> >> -----Original Message-----
> >> From: Li, Pan2 <pan2.li@intel.com>
> >> Sent: Thursday, November 7, 2024 12:57 PM
> >> To: Tamar Christina <Tamar.Christina@arm.com>; Richard Biener
> >> <richard.guenther@gmail.com>
> >> Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com;
> >> jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
> >> Subject: RE: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned
> >> SAT_ADD into branchless
> >>
> >> I see your point that the backend can leverage condition move to emit the branch
> >> code.
> >>
> >>> For instance see https://godbolt.org/z/fvrq3aq6K
> >>> On ISAs with conditional operations the branch version gets ifconverted.
> >>> On AArch64 we get:
> >>> sat_add_u_1(unsigned int, unsigned int):
> >>>          adds    w0, w0, w1
> >>>          csinv   w0, w0, wzr, cc
> >>>          ret
> >>> so just 2 instructions, and also branchless. On x86_64 we get:
> >>> sat_add_u_1(unsigned int, unsigned int):
> >>>          add     edi, esi
> >>>          mov     eax, -1
> >>>          cmovnc  eax, edi
> >>>          ret
> >>> so 3 instructions but a dependency chain of 2.
> >>> also branchless.  This patch would regress both of these.
> >>
> >> But the above Godbolt may not be a good example for evidence, because both
> >> x86_64 and aarch64 implemented usadd
> >> already.
> >> Thus, they all go to usadd<QImode>. For example as below, the sat_add_u_1 and
> >> sat_add_u_2 are almost the
> >> same when the backend implemented usadd.
> >>
> >> #include <stdint.h>
> >>
> >> #define T uint32_t
> >>
> >>    T sat_add_u_1 (T x, T y)
> >>    {
> >>      return (T)(x + y) < x ? -1 : (x + y);
> >>    }
> >>
> >>    T sat_add_u_2 (T x, T y)
> >>    {
> >>      return (x + y) | -((x + y) < x);
> >>    }
> >>
> >> It will become different when take gcc 14.2 (which doesn’t have .SAT_ADD GIMPLE
> >> IR), the x86_64
> >> will have below asm dump for -O3. Looks like no obvious difference here.
> >>
> >> sat_add_u_1(unsigned int, unsigned int):
> >>          add     edi, esi
> >>          mov     eax, -1
> >>          cmovnc  eax, edi
> >>          ret
> >>
> >> sat_add_u_2(unsigned int, unsigned int):
> >>          add     edi, esi
> >>          sbb     eax, eax
> >>          or      eax, edi
> >>          ret
> >>
> >
> > Because CE is able to recognize the idiom back into a conditional move.
> > Pick a target that doesn't have conditional instructions, like PowerPC
> > https://godbolt.org/z/4bTv18WMv
> >
> > You'll see that this canonicalization has made codegen worse.
> >
> > After:
> >
> > .L.sat_add_u_1(unsigned int, unsigned int):
> >          add 4,3,4
> >          rldicl 9,4,0,32
> >          subf 3,3,9
> >          sradi 3,3,63
> >          or 3,3,4
> >          rldicl 3,3,0,32
> >          blr
> >
> > and before
> >
> > .L.sat_add_u_1(unsigned int, unsigned int):
> >          add 4,3,4
> >          cmplw 0,4,3
> >          bge 0,.L2
> >          li 4,-1
> > .L2:
> >          rldicl 3,4,0,32
> >          blr
> >
> > It means now it always has to execute 6 instructions, whereas before it was 4 or 5 depending
> > on the order of the branch. So for those architectures, it's always slower.
> I'm not sure it's that simple.  It'll depend on the micro-architecture.
> So things like strength of the branch predictors, how fetch blocks are
> handled (can you have embedded not-taken branches, short-forward-branch
> optimizations, etc).
>
> Jeff
diff mbox series

Patch

diff --git a/gcc/match.pd b/gcc/match.pd
index c851ac56e37..8425d7c4f20 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -3146,18 +3146,17 @@  DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 (match (unsigned_integer_sat_add @0 @1)
  (bit_ior:c (usadd_left_part_2 @0 @1) (usadd_right_part_2 @0 @1)))
 
-/* Simplify SAT_U_ADD to the cheap form
-   From: SAT_U_ADD = (X + Y) >= x ? (X + Y) : -1.
-   To:   SAT_U_ADD = (X + Y) | - ((X + Y) < X).  */
-(simplify (cond (ge (plus:c@2 @0 @1) @0) @2 integer_minus_onep)
- (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
-      && types_match (type, @0, @1))
-  (bit_ior @2 (negate (convert (lt @2 @0))))))
-
-/* Unsigned saturation add, case 4 (branch with lt):
-   SAT_U_ADD = (X + Y) < x ? -1 : (X + Y).  */
-(match (unsigned_integer_sat_add @0 @1)
- (cond^ (lt (usadd_left_part_1@2 @0 @1) @0) integer_minus_onep @2))
+/* Simplify sorts of SAT_U_ADD forms to the cheap one.
+   The cheap form: SAT_U_ADD = (X + Y) | - ((X + Y) < X).  */
+(if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type))
+ /* From SAT_U_ADD = (X + Y) >= x ? (X + Y) : -1.  */
+ (simplify (cond (ge (plus:c@2 @0 @1) @0) @2 integer_minus_onep)
+  (if (types_match (type, @0, @1))
+   (bit_ior (plus@2 @0 @1) (negate (convert (lt @2 @0))))))
+ /* From SAT_U_ADD = (X + Y) < x ? -1 : (X + Y).  */
+ (simplify (cond (lt (plus:c@2 @0 @1) @0) integer_minus_onep @2)
+  (if (types_match (type, @0, @1))
+   (bit_ior (plus@2 @0 @1) (negate (convert (lt @2 @0)))))))
 
 /* Unsigned saturation add, case 5 (branch with eq .ADD_OVERFLOW):
    SAT_U_ADD = REALPART_EXPR <.ADD_OVERFLOW> == 0 ? .ADD_OVERFLOW : -1.  */
diff --git a/gcc/testsuite/gcc.dg/sat_arith_simplify.h b/gcc/testsuite/gcc.dg/sat_arith_simplify.h
new file mode 100644
index 00000000000..46ac00426b2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/sat_arith_simplify.h
@@ -0,0 +1,10 @@ 
+#ifndef HAVE_DEFINED_SAT_ARITH_SIMPLIFY_H
+#define HAVE_DEFINED_SAT_ARITH_SIMPLIFY_H
+
+#define DEF_SAT_U_ADD_2(T)              \
+T sat_u_add_##T##_2 (T x, T y)          \
+{                                       \
+  return (T)(x + y) < x ? -1 : (x + y); \
+}
+
+#endif
diff --git a/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u16.c b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u16.c
new file mode 100644
index 00000000000..b170b35096c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u16.c
@@ -0,0 +1,11 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-gimple-details" } */
+
+#include <stdint.h>
+#include "sat_arith_simplify.h"
+
+DEF_SAT_U_ADD_2 (uint16_t)
+
+/* { dg-final { scan-tree-dump-not " if " "gimple" } } */
+/* { dg-final { scan-tree-dump-not " else " "gimple" } } */
+/* { dg-final { scan-tree-dump-not " goto " "gimple" } } */
diff --git a/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u32.c b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u32.c
new file mode 100644
index 00000000000..8830ed7b878
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u32.c
@@ -0,0 +1,11 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-gimple-details" } */
+
+#include <stdint.h>
+#include "sat_arith_simplify.h"
+
+DEF_SAT_U_ADD_2 (uint32_t)
+
+/* { dg-final { scan-tree-dump-not " if " "gimple" } } */
+/* { dg-final { scan-tree-dump-not " else " "gimple" } } */
+/* { dg-final { scan-tree-dump-not " goto " "gimple" } } */
diff --git a/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u64.c b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u64.c
new file mode 100644
index 00000000000..76c4d4bddaa
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u64.c
@@ -0,0 +1,11 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-gimple-details" } */
+
+#include <stdint.h>
+#include "sat_arith_simplify.h"
+
+DEF_SAT_U_ADD_2 (uint64_t)
+
+/* { dg-final { scan-tree-dump-not " if " "gimple" } } */
+/* { dg-final { scan-tree-dump-not " else " "gimple" } } */
+/* { dg-final { scan-tree-dump-not " goto " "gimple" } } */
diff --git a/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u8.c b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u8.c
new file mode 100644
index 00000000000..b034b8eedb1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u8.c
@@ -0,0 +1,11 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-gimple-details" } */
+
+#include <stdint.h>
+#include "sat_arith_simplify.h"
+
+DEF_SAT_U_ADD_2 (uint8_t)
+
+/* { dg-final { scan-tree-dump-not " if " "gimple" } } */
+/* { dg-final { scan-tree-dump-not " else " "gimple" } } */
+/* { dg-final { scan-tree-dump-not " goto " "gimple" } } */