Message ID | 20241031062742.2709845-1-pan2.li@intel.com
State      | New
Series     | [v2,01/10] Match: Simplify branch form 4 of unsigned SAT_ADD into branchless
On Thu, Oct 31, 2024 at 7:29 AM <pan2.li@intel.com> wrote:
>
> From: Pan Li <pan2.li@intel.com>
>
> There are several forms of the unsigned SAT_ADD.  Some of them are
> complicated while others are cheap.  This patch would like to simplify
> the complicated forms into the cheap one.  For example as below:
>
> From the form 4 (branch):
> SAT_U_ADD = (X + Y) < x ? -1 : (X + Y).
>
> To (branchless):
> SAT_U_ADD = (X + Y) | - ((X + Y) < X).
>
> #define T uint8_t
>
> T sat_add_u_1 (T x, T y)
> {
>   return (T)(x + y) < x ? -1 : (x + y);
> }
>
> Before this patch:
>    1 │ uint8_t sat_add_u_1 (uint8_t x, uint8_t y)
>    2 │ {
>    3 │   uint8_t D.2809;
>    4 │
>    5 │   _1 = x + y;
>    6 │   if (x <= _1) goto <D.2810>; else goto <D.2811>;
>    7 │   <D.2810>:
>    8 │   D.2809 = x + y;
>    9 │   goto <D.2812>;
>   10 │   <D.2811>:
>   11 │   D.2809 = 255;
>   12 │   <D.2812>:
>   13 │   return D.2809;
>   14 │ }
>
> After this patch:
>    1 │ uint8_t sat_add_u_1 (uint8_t x, uint8_t y)
>    2 │ {
>    3 │   uint8_t D.2809;
>    4 │
>    5 │   _1 = x + y;
>    6 │   _2 = x + y;
>    7 │   _3 = x > _2;
>    8 │   _4 = (unsigned char) _3;
>    9 │   _5 = -_4;
>   10 │   D.2809 = _1 | _5;
>   11 │   return D.2809;
>   12 │ }
>
> The below test suites are passed for this patch.
> * The rv64gcv fully regression test.
> * The x86 bootstrap test.
> * The x86 fully regression test.
>
> gcc/ChangeLog:
>
>         * match.pd: Remove unsigned branch form 4 for SAT_ADD, and
>         add simplify to branchless instead.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.dg/sat_arith_simplify.h: New test.
>         * gcc.dg/sat_u_add-simplify-2-u16.c: New test.
>         * gcc.dg/sat_u_add-simplify-2-u32.c: New test.
>         * gcc.dg/sat_u_add-simplify-2-u64.c: New test.
>         * gcc.dg/sat_u_add-simplify-2-u8.c: New test.
>
> Signed-off-by: Pan Li <pan2.li@intel.com>
> ---
>  gcc/match.pd                              | 23 +++++++++----------
>  gcc/testsuite/gcc.dg/sat_arith_simplify.h | 10 ++++++++
>  .../gcc.dg/sat_u_add-simplify-2-u16.c     | 11 +++++++++
>  .../gcc.dg/sat_u_add-simplify-2-u32.c     | 11 +++++++++
>  .../gcc.dg/sat_u_add-simplify-2-u64.c     | 11 +++++++++
>  .../gcc.dg/sat_u_add-simplify-2-u8.c      | 11 +++++++++
>  6 files changed, 65 insertions(+), 12 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/sat_arith_simplify.h
>  create mode 100644 gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u16.c
>  create mode 100644 gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u32.c
>  create mode 100644 gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u64.c
>  create mode 100644 gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u8.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index c851ac56e37..8425d7c4f20 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -3146,18 +3146,17 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>  (match (unsigned_integer_sat_add @0 @1)
>   (bit_ior:c (usadd_left_part_2 @0 @1) (usadd_right_part_2 @0 @1)))
>
> -/* Simplify SAT_U_ADD to the cheap form
> -   From: SAT_U_ADD = (X + Y) >= x ? (X + Y) : -1.
> -   To:   SAT_U_ADD = (X + Y) | - ((X + Y) < X).  */
> -(simplify (cond (ge (plus:c@2 @0 @1) @0) @2 integer_minus_onep)
> - (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
> -      && types_match (type, @0, @1))
> -  (bit_ior @2 (negate (convert (lt @2 @0))))))
> -
> -/* Unsigned saturation add, case 4 (branch with lt):
> -   SAT_U_ADD = (X + Y) < x ? -1 : (X + Y).  */
> -(match (unsigned_integer_sat_add @0 @1)
> - (cond^ (lt (usadd_left_part_1@2 @0 @1) @0) integer_minus_onep @2))
> +/* Simplify sorts of SAT_U_ADD forms to the cheap one.
> +   The cheap form: SAT_U_ADD = (X + Y) | - ((X + Y) < X).  */
> +(if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type))
> + /* From SAT_U_ADD = (X + Y) >= x ? (X + Y) : -1.  */
> + (simplify (cond (ge (plus:c@2 @0 @1) @0) @2 integer_minus_onep)
> +  (if (types_match (type, @0, @1))
> +   (bit_ior (plus@2 @0 @1) (negate (convert (lt @2 @0))))))
> + /* From SAT_U_ADD = (X + Y) < x ? -1 : (X + Y).  */
> + (simplify (cond (lt (plus:c@2 @0 @1) @0) integer_minus_onep @2)
> +  (if (types_match (type, @0, @1))
> +   (bit_ior (plus@2 @0 @1) (negate (convert (lt @2 @0)))))))

Sorry for falling back in reviewing - it's not exactly clear the "cheap"
form is cheaper.  When I count the number of gimple statements
(sub-expressions) the original appears as 3 while the result looks to
have 5.  The catch is of course that the original might involve control
flow and a PHI.

The pattern will apply during late PHI-OPT or during GENERIC folding
done by the frontends or gimplification - you scan the gimplification
dump below, so having it apply there means it will influence inlining
heuristics for example.  I wonder which variant is considered larger
by its heuristic.

What do others think of the early canonicalization of these to
straight-line code?

Thanks,
Richard.

>  /* Unsigned saturation add, case 5 (branch with eq .ADD_OVERFLOW):
>     SAT_U_ADD = REALPART_EXPR <.ADD_OVERFLOW> == 0 ? .ADD_OVERFLOW : -1.  */
> diff --git a/gcc/testsuite/gcc.dg/sat_arith_simplify.h b/gcc/testsuite/gcc.dg/sat_arith_simplify.h
> new file mode 100644
> index 00000000000..46ac00426b2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/sat_arith_simplify.h
> @@ -0,0 +1,10 @@
> +#ifndef HAVE_DEFINED_SAT_ARITH_SIMPLIFY_H
> +#define HAVE_DEFINED_SAT_ARITH_SIMPLIFY_H
> +
> +#define DEF_SAT_U_ADD_2(T)               \
> +T sat_u_add_##T##_2 (T x, T y)           \
> +{                                        \
> +  return (T)(x + y) < x ? -1 : (x + y);  \
> +}
> +
> +#endif
> diff --git a/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u16.c b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u16.c
> new file mode 100644
> index 00000000000..b170b35096c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u16.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-gimple-details" } */
> +
> +#include <stdint.h>
> +#include "sat_arith_simplify.h"
> +
> +DEF_SAT_U_ADD_2 (uint16_t)
> +
> +/* { dg-final { scan-tree-dump-not " if " "gimple" } } */
> +/* { dg-final { scan-tree-dump-not " else " "gimple" } } */
> +/* { dg-final { scan-tree-dump-not " goto " "gimple" } } */
> diff --git a/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u32.c b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u32.c
> new file mode 100644
> index 00000000000..8830ed7b878
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u32.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-gimple-details" } */
> +
> +#include <stdint.h>
> +#include "sat_arith_simplify.h"
> +
> +DEF_SAT_U_ADD_2 (uint32_t)
> +
> +/* { dg-final { scan-tree-dump-not " if " "gimple" } } */
> +/* { dg-final { scan-tree-dump-not " else " "gimple" } } */
> +/* { dg-final { scan-tree-dump-not " goto " "gimple" } } */
> diff --git a/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u64.c b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u64.c
> new file mode 100644
> index 00000000000..76c4d4bddaa
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u64.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-gimple-details" } */
> +
> +#include <stdint.h>
> +#include "sat_arith_simplify.h"
> +
> +DEF_SAT_U_ADD_2 (uint64_t)
> +
> +/* { dg-final { scan-tree-dump-not " if " "gimple" } } */
> +/* { dg-final { scan-tree-dump-not " else " "gimple" } } */
> +/* { dg-final { scan-tree-dump-not " goto " "gimple" } } */
> diff --git a/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u8.c b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u8.c
> new file mode 100644
> index 00000000000..b034b8eedb1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u8.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-gimple-details" } */
> +
> +#include <stdint.h>
> +#include "sat_arith_simplify.h"
> +
> +DEF_SAT_U_ADD_2 (uint8_t)
> +
> +/* { dg-final { scan-tree-dump-not " if " "gimple" } } */
> +/* { dg-final { scan-tree-dump-not " else " "gimple" } } */
> +/* { dg-final { scan-tree-dump-not " goto " "gimple" } } */
> --
> 2.43.0
>
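(Editor's aside, not part of the thread: the claim that the branch and
branchless forms compute the same value is easy to verify exhaustively for a
narrow type.  The harness below is a sketch; the function names are invented
for illustration.)

#include <stdint.h>
#include <stdio.h>

/* Branch form, as in the patch's sat_add_u_1.  */
static uint8_t sat_branch (uint8_t x, uint8_t y)
{
  return (uint8_t)(x + y) < x ? -1 : (x + y);
}

/* Branchless "cheap" form: -(sum < x) converts to 0xff exactly when the
   8-bit addition wrapped, so the OR saturates the result to 255.  */
static uint8_t sat_branchless (uint8_t x, uint8_t y)
{
  uint8_t sum = x + y;
  return sum | (uint8_t)-(uint8_t)(sum < x);
}

int main (void)
{
  for (unsigned x = 0; x < 256; x++)
    for (unsigned y = 0; y < 256; y++)
      if (sat_branch (x, y) != sat_branchless (x, y))
        {
          printf ("mismatch at x=%u y=%u\n", x, y);
          return 1;
        }
  printf ("all 65536 cases agree\n");
  return 0;
}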
Never mind and thanks Richard for comments.

> Sorry for falling back in reviewing - it's not exactly clear the "cheap"
> form is cheaper.  When I count the number of gimple statements
> (sub-expressions) the original appears as 3 while the result looks to
> have 5.

I may have a question about how we count the stmts, not sure the x <= _1 and
then goto should be counted as 1 or 2 stmts in below example.  I try to count
it by myself but not very sure the understanding is correct.

Before this patch:
   1 │ uint8_t sat_add_u_1 (uint8_t x, uint8_t y)
   2 │ {
   3 │   uint8_t D.2809;
   4 │
   5 │   _1 = x + y;                                      // 1
   6 │   if (x <= _1) goto <D.2810>; else goto <D.2811>;  // 2 for x <= _1, 3 for goto ???
   7 │   <D.2810>:
   8 │   D.2809 = x + y;                                  // 4 (taken)
   9 │   goto <D.2812>;                                   // 5 (taken) for goto ???
  10 │   <D.2811>:
  11 │   D.2809 = 255;                                    // 4 (not taken)
  12 │   <D.2812>:
  13 │   return D.2809;                                   // 6 for taken, 5 for not taken
  14 │ }

After this patch:
   1 │ uint8_t sat_add_u_1 (uint8_t x, uint8_t y)
   2 │ {
   3 │   uint8_t D.2809;
   4 │
   5 │   _1 = x + y;               // 1
   6 │   _2 = x + y;               // 2
   7 │   _3 = x > _2;              // 3
   8 │   _4 = (unsigned char) _3;  // 4
   9 │   _5 = -_4;                 // 5
  10 │   D.2809 = _1 | _5;         // 6
  11 │   return D.2809;            // 7
  12 │ }

> The catch is of course that the original might involve control flow and
> a PHI.
> The pattern will apply during late PHI-OPT or during GENERIC folding
> done by the frontends or gimplification - you scan the gimplification
> dump below, so having it apply there means it will influence inlining
> heuristics for example.  I wonder which variant is considered larger
> by its heuristic.

> What do others think of the early canonicalization of these to
> straight-line code?

Make sense, we could find one form that most of us consider to be the
"cheapest".

Pan
On Wednesday, November 6, 2024 1:31 PM, Li, Pan2 <pan2.li@intel.com> wrote:
> Never mind and thanks Richard for comments.
>
> > Sorry for falling back in reviewing - it's not exactly clear the "cheap"
> > form is cheaper.  When I count the number of gimple statements
> > (sub-expressions) the original appears as 3 while the result looks to
> > have 5.
>
> I may have a question about how we count the stmts, not sure the x <= _1 and
> then goto should be counted

They should, because they both become instructions that need to be executed
in the execution chain.

> as 1 or 2 stmts in below example.  I try to count it by myself but not very
> sure the understanding is correct.
>
> [...]
>
> > The catch is of course that the original might involve control flow and
> > a PHI.

_1 and _2 are counted as one, as they'll be CSE'd.  After the patch your
dependency chain is 5 instructions, as in, to create the result you must
execute 5 sequential instructions.

   1 │ uint8_t sat_add_u_1 (uint8_t x, uint8_t y)
   2 │ {
   3 │   uint8_t D.2809;
   4 │
   5 │   _1 = x + y;
   6 │   _2 = x + y;               // 1
   7 │   _3 = x > _2;              // 2
   8 │   _4 = (unsigned char) _3;  // 3
   9 │   _5 = -_4;                 // 4
  10 │   D.2809 = _1 | _5;         // 5
  11 │   return D.2809;
  12 │ }

Before your change you had 3, because you always do compare + branch + set.

The problem with the rewrite is that it pessimizes the code if the saturating
instructions are not recognized afterwards.

Note this is also the case for vector codegen.  The original code in vector
would be:

1. duplicate 255
2. x + y
3. compare
4. select

The first two instructions are independent and execute in parallel, so the
original vector codegen also had a dependency chain of 3.

Additionally the branch version will get the benefit of branch prediction
when run inside a loop.

> > The pattern will apply during late PHI-OPT or during GENERIC folding
> > done by the frontends or gimplification - you scan the gimplification
> > dump below, so having it apply there means it will influence inlining
> > heuristics for example.  I wonder which variant is considered larger
> > by its heuristic.
>
> > What do others think of the early canonicalization of these to
> > straight-line code?
>
> Make sense, we could find one form that most of us consider to be the
> "cheapest".

I would disagree with Richard here in that this would be pessimistic codegen
for targets that don't later implement the IFN_SAT.

Thanks,
Tamar
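(Editor's aside, not part of the thread: Tamar's two counts can be restated
as straight-line C, which may help in following the arithmetic.  This is a
sketch; the depth comments assume each statement must wait for the previous
result, and the function names are invented.)

#include <stdint.h>

/* Branchless form: five serially dependent steps.  */
uint8_t chain_branchless (uint8_t x, uint8_t y)
{
  uint8_t sum  = x + y;           /* depth 1 (the duplicate add is CSE'd) */
  _Bool   ovf  = x > sum;         /* depth 2: needs sum */
  uint8_t flag = (uint8_t) ovf;   /* depth 3 */
  uint8_t mask = -flag;           /* depth 4: 0x00 or 0xff */
  return sum | mask;              /* depth 5 */
}

/* Branch form: add, compare, conditional set -> chain depth 3.  */
uint8_t chain_branch (uint8_t x, uint8_t y)
{
  uint8_t sum = x + y;            /* depth 1 */
  if (x <= sum)                   /* depth 2: compare + branch */
    return sum;                   /* depth 3: select the result */
  return 255;
}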
I see, thanks Tamar for the explanation.

> The problem with the rewrite is that it pessimizes the code if the
> saturating instructions are not recognized afterwards.

The original idea is somehow independent of whether the backend supports
IFN_SAT_* or not.  Given we have sorts of forms of IFN_SAT_*, some of them
are cheap while others are heavy from the perspective of gimple.  It is
possible to do some canonicalization here.

> Additionally the branch version will get the benefit of branch prediction
> when run inside a loop.

Not very sure if my understanding is correct, but the branchless version is
preferred in most cases IMO if they have nearly the same count of stmts.
Not sure if that is still true during vectorization.

Pan
On Thursday, November 7, 2024 1:45 AM, Li, Pan2 <pan2.li@intel.com> wrote:
> The original idea is somehow independent of whether the backend supports
> IFN_SAT_* or not.  Given we have sorts of forms of IFN_SAT_*, some of them
> are cheap while others are heavy from the perspective of gimple.  It is
> possible to do some canonicalization here.
>
> Not very sure if my understanding is correct, but the branchless version is
> preferred in most cases IMO if they have nearly the same count of stmts.
> Not sure if that is still true during vectorization.

I don't think this is true.  The branchless version must use either fewer
instructions or cheaper instructions.  The branches, especially in a loop,
become mostly free due to branch prediction.  Modern branch predictors are
really good at idioms like this.

That said, the other issue with doing this in gimple is that it interferes
with the RTL conditional execution pass.

For instance see https://godbolt.org/z/fvrq3aq6K

On ISAs with conditional operations the branch version gets ifconverted.

On AArch64 we get:

sat_add_u_1(unsigned int, unsigned int):
        adds    w0, w0, w1
        csinv   w0, w0, wzr, cc
        ret

so just 2 instructions, and also branchless.  On x86_64 we get:

sat_add_u_1(unsigned int, unsigned int):
        add     edi, esi
        mov     eax, -1
        cmovnc  eax, edi
        ret

so 3 instructions but a dependency chain of 2, also branchless.

This patch would regress both of these.

Thanks,
Tamar
I see your point that the backend can leverage condition move to emit the branch code. > For instance see https://godbolt.org/z/fvrq3aq6K > On ISAs with conditional operations the branch version gets ifconverted. > On AArch64 we get: > sat_add_u_1(unsigned int, unsigned int): > adds w0, w0, w1 > csinv w0, w0, wzr, cc > ret > so just 2 instructions, and also branchless. On x86_64 we get: > sat_add_u_1(unsigned int, unsigned int): > add edi, esi > mov eax, -1 > cmovnc eax, edi > ret > so 3 instructions but a dependency chain of 2. > also branchless. This patch would regress both of these. But the above Godbolt may not be a good example for evidence, because both x86_64 and aarch64 implemented usadd already. Thus, they all go to usadd<QImode>. For example as below, the sat_add_u_1 and sat_add_u_2 are almost the same when the backend implemented usadd. #include <stdint.h> #define T uint32_t T sat_add_u_1 (T x, T y) { return (T)(x + y) < x ? -1 : (x + y); } T sat_add_u_2 (T x, T y) { return (x + y) | -((x + y) < x); } It will become different when take gcc 14.2 (which doesn’t have .SAT_ADD GIMPLE IR), the x86_64 will have below asm dump for -O3. Looks like no obvious difference here. sat_add_u_1(unsigned int, unsigned int): add edi, esi mov eax, -1 cmovnc eax, edi ret sat_add_u_2(unsigned int, unsigned int): add edi, esi sbb eax, eax or eax, edi ret As well as for the vector mode for x86_64 gcc 14.2 as below. void vec_sat_add_u_1 (T *a, T *b, T *out, int n) { for (int i = 0; i < n; i++) { T x = a[i]; T y = b[i]; out[i] = (T)(x + y) < x ? -1 : (x + y); } } void vec_sat_add_u_2 (T * __restrict a, T * __restrict b, T * __restrict out, int n) { for (int i = 0; i < n; i++) { T x = a[i]; T y = b[i]; out[i] = (x + y) | -((x + y) < x); } } vec_sat_add_u_1(unsigned int*, unsigned int*, unsigned int*, int): .... .L15: movdqu xmm0, XMMWORD PTR [rdi+rax] movdqu xmm1, XMMWORD PTR [rsi+rax] paddd xmm1, xmm0 psubd xmm0, xmm2 movdqa xmm3, xmm1 psubd xmm3, xmm2 pcmpgtd xmm0, xmm3 por xmm0, xmm1 movups XMMWORD PTR [rdx+rax], xmm0 add rax, 16 cmp r8, rax jne .L15 mov eax, ecx and eax, -4 mov r8d, eax cmp ecx, eax je .L11 sub ecx, eax mov r9d, ecx cmp ecx, 1 je .L17 .L14: movq xmm2, QWORD PTR .LC2[rip] mov ecx, r8d movq xmm0, QWORD PTR [rdi+rcx*4] movq xmm1, QWORD PTR [rsi+rcx*4] paddd xmm1, xmm0 psubd xmm0, xmm2 movdqa xmm3, xmm1 psubd xmm3, xmm2 pcmpgtd xmm0, xmm3 movdqa xmm2, xmm0 pandn xmm2, xmm1 por xmm0, xmm2 movq QWORD PTR [rdx+rcx*4], xmm0 .... vec_sat_add_u_2(unsigned int*, unsigned int*, unsigned int*, int): ... .L50: movdqu xmm0, XMMWORD PTR [rdi+rax] movdqu xmm1, XMMWORD PTR [rsi+rax] paddd xmm1, xmm0 psubd xmm0, xmm2 movdqa xmm3, xmm1 psubd xmm3, xmm2 pcmpgtd xmm0, xmm3 por xmm0, xmm1 movups XMMWORD PTR [rdx+rax], xmm0 add rax, 16 cmp rax, r8 jne .L50 mov eax, ecx and eax, -4 mov r8d, eax cmp ecx, eax je .L64 .L49: sub ecx, r8d cmp ecx, 1 je .L52 movq xmm0, QWORD PTR [rdi+r8*4] movq xmm1, QWORD PTR [rsi+r8*4] movq xmm2, QWORD PTR .LC2[rip] paddd xmm1, xmm0 psubd xmm0, xmm2 movdqa xmm3, xmm1 psubd xmm3, xmm2 pcmpgtd xmm0, xmm3 movdqa xmm2, xmm0 pandn xmm2, xmm1 por xmm0, xmm2 movq QWORD PTR [rdx+r8*4], xmm0 ... 
Pan -----Original Message----- From: Tamar Christina <Tamar.Christina@arm.com> Sent: Thursday, November 7, 2024 7:43 PM To: Li, Pan2 <pan2.li@intel.com>; Richard Biener <richard.guenther@gmail.com> Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com; rdapp.gcc@gmail.com Subject: RE: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned SAT_ADD into branchless > -----Original Message----- > From: Li, Pan2 <pan2.li@intel.com> > Sent: Thursday, November 7, 2024 1:45 AM > To: Tamar Christina <Tamar.Christina@arm.com>; Richard Biener > <richard.guenther@gmail.com> > Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; > jeffreyalaw@gmail.com; rdapp.gcc@gmail.com > Subject: RE: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned > SAT_ADD into branchless > > I see, thanks Tamar for the explanation. > > > The problem with the rewrite is that it pessimizes the code if the saturating > > instructions are not recognized afterwards. > > The original idea is somehow independent of whether the backend supports IFN_SAT_* or > not. > Given we have sorts of forms of IFN_SAT_*, some of them are cheap while others > are heavy > from the perspective of gimple. It is possible to do some canonicalization here. > > > Additionally the branch version will get the benefit of branch prediction when run > > inside a loop. > > Not very sure if my understanding is correct, but the branchless version is > preferred in most cases IMO if > they have nearly the same count of stmts. Not sure if it is still true during the vectorization. I don't think this is true. The branchless version must either be fewer instructions or cheaper instructions. The branches, especially in a loop, become mostly free due to branch prediction. Modern branch predictors are really good at idioms like this. That said, the other issue with doing this in gimple is that it interferes with the RTL conditional execution pass. For instance see https://godbolt.org/z/fvrq3aq6K On ISAs with conditional operations the branch version gets ifconverted. On AArch64 we get: sat_add_u_1(unsigned int, unsigned int): adds w0, w0, w1 csinv w0, w0, wzr, cc ret so just 2 instructions, and also branchless. On x86_64 we get: sat_add_u_1(unsigned int, unsigned int): add edi, esi mov eax, -1 cmovnc eax, edi ret so 3 instructions but a dependency chain of 2. also branchless. This patch would regress both of these. Thanks, Tamar > > Pan > -----Original Message----- > From: Tamar Christina <Tamar.Christina@arm.com> > Sent: Thursday, November 7, 2024 12:25 AM > To: Li, Pan2 <pan2.li@intel.com>; Richard Biener <richard.guenther@gmail.com> > Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; > jeffreyalaw@gmail.com; rdapp.gcc@gmail.com > Subject: RE: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned > SAT_ADD into branchless > > > -----Original Message----- > > From: Li, Pan2 <pan2.li@intel.com> > > Sent: Wednesday, November 6, 2024 1:31 PM > > To: Richard Biener <richard.guenther@gmail.com> > > Cc: gcc-patches@gcc.gnu.org; Tamar Christina <Tamar.Christina@arm.com>; > > juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com; > > rdapp.gcc@gmail.com > > Subject: RE: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned > > SAT_ADD into branchless > > > > Never mind and thanks Richard for comments. > > > > > Sorry for falling behind in reviewing - it's not exactly clear the "cheap" form is > > > cheaper.
When I count the number of gimple statements (sub-expressions) > > > the original appears as 3 while the result looks to have 5. > > > > I may have a question about how we count the stmts, not sure whether the x <= _1 and > > then goto should be counted > > They should, because they both become instructions that need to be executed in > the execution chain. > > > as 1 or 2 stmts in the below example. I tried to count it by myself but am not very sure the > > understanding is correct. > > > > Before this patch: > > 1 │ uint8_t sat_add_u_1 (uint8_t x, uint8_t y) > > 2 │ { > > 3 │ uint8_t D.2809; > > 4 │ > > 5 │ _1 = x + y; // 1 > > 6 │ if (x <= _1) goto <D.2810>; else goto <D.2811>; // 2 for x <= _1, 3 for goto > > ??? > > 7 │ <D.2810>: > > 8 │ D.2809 = x + y; // 4 (taken) > > 9 │ goto <D.2812>; // 5 (taken) for goto ??? > > 10 │ <D.2811>: > > 11 │ D.2809 = 255; // 4 (not taken) > > 12 │ <D.2812>: > > 13 │ return D.2809; // 6 for taken, 5 for not taken > > 14 │ } > > > > After this patch: > > 1 │ uint8_t sat_add_u_1 (uint8_t x, uint8_t y) > > 2 │ { > > 3 │ uint8_t D.2809; > > 4 │ > > 5 │ _1 = x + y; // 1 > > 6 │ _2 = x + y; // 2 > > 7 │ _3 = x > _2; // 3 > > 8 │ _4 = (unsigned char) _3; // 4 > > 9 │ _5 = -_4; // 5 > > 10 │ D.2809 = _1 | _5; // 6 > > 11 │ return D.2809; // 7 > > 12 │ } > > > > > The catch is of > > > course that the original might involve control flow and a PHI. > > _1 and _2 are counted as one, as they'll be CSE'd. > After the patch your dependency chain is 5 instructions, as in, > to create the result you must execute 5 sequential instructions. > > 1 │ uint8_t sat_add_u_1 (uint8_t x, uint8_t y) > 2 │ { > 3 │ uint8_t D.2809; > 4 │ > 5 │ _1 = x + y; > 6 │ _2 = x + y; // 1 > 7 │ _3 = x > _2; // 2 > 8 │ _4 = (unsigned char) _3; // 3 > 9 │ _5 = -_4; // 4 > 10 │ D.2809 = _1 | _5; // 5 > 11 │ return D.2809; > 12 │ } > > Before your change you had 3, because you always do > compare + branch + set. > > The problem with the rewrite is that it pessimizes the code if the saturating > instructions are not recognized afterwards. > > Note this is also the case for vector codegen. The original code in vector form would be > > 1. duplicate 255 > 2. x + y > 3. compare > 4. select. > > The first two instructions are independent and execute in parallel, so the original > vector codegen also had a dependency chain of 3. > > Additionally the branch version will get the benefit of branch prediction when run > inside a loop. > > > > > > The pattern will apply during late PHI-OPT or during GENERIC folding > > > done by the frontends or gimplification - you scan the gimplification dump > > > below, so having it apply there means it will influence inlining heuristics > > > for example. I wonder which variant is considered larger by its heuristic. > > > > > What do others think of the early canonicalization of these to > > > straight-line code? > > > > Makes sense, we could find one form that most of us consider to be the > "cheapest". > > I would disagree with Richard here in that this would pessimize codegen for > targets that don't later implement the IFN_SAT.
> > Thanks, > Tamar > > > > > Pan
> -----Original Message----- > From: Li, Pan2 <pan2.li@intel.com> > Sent: Thursday, November 7, 2024 12:57 PM > To: Tamar Christina <Tamar.Christina@arm.com>; Richard Biener > <richard.guenther@gmail.com> > Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; > jeffreyalaw@gmail.com; rdapp.gcc@gmail.com > Subject: RE: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned > SAT_ADD into branchless > > I see your point that the backend can leverage condition move to emit the branch > code. > > > For instance see https://godbolt.org/z/fvrq3aq6K > > On ISAs with conditional operations the branch version gets ifconverted. > > On AArch64 we get: > > sat_add_u_1(unsigned int, unsigned int): > > adds w0, w0, w1 > > csinv w0, w0, wzr, cc > > ret > > so just 2 instructions, and also branchless. On x86_64 we get: > > sat_add_u_1(unsigned int, unsigned int): > > add edi, esi > > mov eax, -1 > > cmovnc eax, edi > > ret > > so 3 instructions but a dependency chain of 2. > > also branchless. This patch would regress both of these. > > But the above Godbolt may not be a good example for evidence, because both > x86_64 and aarch64 implemented usadd > already. > Thus, they all go to usadd<QImode>. For example as below, the sat_add_u_1 and > sat_add_u_2 are almost the > same when the backend implemented usadd. > > #include <stdint.h> > > #define T uint32_t > > T sat_add_u_1 (T x, T y) > { > return (T)(x + y) < x ? -1 : (x + y); > } > > T sat_add_u_2 (T x, T y) > { > return (x + y) | -((x + y) < x); > } > > It will become different when take gcc 14.2 (which doesn’t have .SAT_ADD GIMPLE > IR), the x86_64 > will have below asm dump for -O3. Looks like no obvious difference here. > > sat_add_u_1(unsigned int, unsigned int): > add edi, esi > mov eax, -1 > cmovnc eax, edi > ret > > sat_add_u_2(unsigned int, unsigned int): > add edi, esi > sbb eax, eax > or eax, edi > ret > Because CE is able to recognize the idiom back into a conditional move. Pick a target that doesn't have conditional instructions, like PowerPC https://godbolt.org/z/4bTv18WMv You'll see that this canonicalization has made codegen worse. After: .L.sat_add_u_1(unsigned int, unsigned int): add 4,3,4 rldicl 9,4,0,32 subf 3,3,9 sradi 3,3,63 or 3,3,4 rldicl 3,3,0,32 blr and before .L.sat_add_u_1(unsigned int, unsigned int): add 4,3,4 cmplw 0,4,3 bge 0,.L2 li 4,-1 .L2: rldicl 3,4,0,32 blr It means now it always has to execute 6 instructions, whereas before it was 4 or 5 depending on the order of the branch. So for those architectures, it's always slower. Tamar > As well as for the vector mode for x86_64 gcc 14.2 as below. > > void vec_sat_add_u_1 (T *a, T *b, T *out, int n) > { > for (int i = 0; i < n; i++) { > T x = a[i]; > T y = b[i]; > out[i] = (T)(x + y) < x ? -1 : (x + y); > } > } > > void vec_sat_add_u_2 (T * __restrict a, T * __restrict b, > T * __restrict out, int n) > { > for (int i = 0; i < n; i++) { > T x = a[i]; > T y = b[i]; > out[i] = (x + y) | -((x + y) < x); > } > } > > vec_sat_add_u_1(unsigned int*, unsigned int*, unsigned int*, int): > .... 
> .L15: > movdqu xmm0, XMMWORD PTR [rdi+rax] > movdqu xmm1, XMMWORD PTR [rsi+rax] > paddd xmm1, xmm0 > psubd xmm0, xmm2 > movdqa xmm3, xmm1 > psubd xmm3, xmm2 > pcmpgtd xmm0, xmm3 > por xmm0, xmm1 > movups XMMWORD PTR [rdx+rax], xmm0 > add rax, 16 > cmp r8, rax > jne .L15 > mov eax, ecx > and eax, -4 > mov r8d, eax > cmp ecx, eax > je .L11 > sub ecx, eax > mov r9d, ecx > cmp ecx, 1 > je .L17 > .L14: > movq xmm2, QWORD PTR .LC2[rip] > mov ecx, r8d > movq xmm0, QWORD PTR [rdi+rcx*4] > movq xmm1, QWORD PTR [rsi+rcx*4] > paddd xmm1, xmm0 > psubd xmm0, xmm2 > movdqa xmm3, xmm1 > psubd xmm3, xmm2 > pcmpgtd xmm0, xmm3 > movdqa xmm2, xmm0 > pandn xmm2, xmm1 > por xmm0, xmm2 > movq QWORD PTR [rdx+rcx*4], xmm0 > .... > > vec_sat_add_u_2(unsigned int*, unsigned int*, unsigned int*, int): > ... > .L50: > movdqu xmm0, XMMWORD PTR [rdi+rax] > movdqu xmm1, XMMWORD PTR [rsi+rax] > paddd xmm1, xmm0 > psubd xmm0, xmm2 > movdqa xmm3, xmm1 > psubd xmm3, xmm2 > pcmpgtd xmm0, xmm3 > por xmm0, xmm1 > movups XMMWORD PTR [rdx+rax], xmm0 > add rax, 16 > cmp rax, r8 > jne .L50 > mov eax, ecx > and eax, -4 > mov r8d, eax > cmp ecx, eax > je .L64 > .L49: > sub ecx, r8d > cmp ecx, 1 > je .L52 > movq xmm0, QWORD PTR [rdi+r8*4] > movq xmm1, QWORD PTR [rsi+r8*4] > movq xmm2, QWORD PTR .LC2[rip] > paddd xmm1, xmm0 > psubd xmm0, xmm2 > movdqa xmm3, xmm1 > psubd xmm3, xmm2 > pcmpgtd xmm0, xmm3 > movdqa xmm2, xmm0 > pandn xmm2, xmm1 > por xmm0, xmm2 > movq QWORD PTR [rdx+r8*4], xmm0 > ... Because x86 is not a fully masked architecture. > > Pan > > -----Original Message----- > From: Tamar Christina <Tamar.Christina@arm.com> > Sent: Thursday, November 7, 2024 7:43 PM > To: Li, Pan2 <pan2.li@intel.com>; Richard Biener <richard.guenther@gmail.com> > Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; > jeffreyalaw@gmail.com; rdapp.gcc@gmail.com > Subject: RE: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned > SAT_ADD into branchless > > > -----Original Message----- > > From: Li, Pan2 <pan2.li@intel.com> > > Sent: Thursday, November 7, 2024 1:45 AM > > To: Tamar Christina <Tamar.Christina@arm.com>; Richard Biener > > <richard.guenther@gmail.com> > > Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; > > jeffreyalaw@gmail.com; rdapp.gcc@gmail.com > > Subject: RE: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned > > SAT_ADD into branchless > > > > I see, thanks Tamar for the explanation. > > > > > The problem with the rewrite is that it pessimists the code if the saturating > > > instructions are not recognized afterwards. > > > > The original idea is somehow independent with the backend support IFN_SAT_* > or > > not. > > Given we have sorts of form of IFN_SAT_*, some of them are cheap while others > > are heavy > > from the perspective of gimple. It is possible to do some canonicalization here. > > > > > Additionally the branch version will get the benefit of branch prediction when > ran > > > inside a loop. > > > > Not very sure if my understanding is correct, but the branchless version is > > preferred in most case IMO if > > they have nearly count of stmt. Not sure if it is still true during the vectorization. > > I don't think this is true. The branchless version must be either less instructions or > cheaper instruction. The branches, especially in a loop become mostly free due to > branch prediction. Modern branch predictors are really good at idoms like this. 
On 11/7/24 8:07 AM, Tamar Christina wrote: > >> -----Original Message----- >> From: Li, Pan2 <pan2.li@intel.com> >> Sent: Thursday, November 7, 2024 12:57 PM >> To: Tamar Christina <Tamar.Christina@arm.com>; Richard Biener >> <richard.guenther@gmail.com> >> Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; >> jeffreyalaw@gmail.com; rdapp.gcc@gmail.com >> Subject: RE: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned >> SAT_ADD into branchless >> >> I see your point that the backend can leverage condition move to emit the branch >> code. >> >>> For instance see https://godbolt.org/z/fvrq3aq6K >>> On ISAs with conditional operations the branch version gets ifconverted. >>> On AArch64 we get: >>> sat_add_u_1(unsigned int, unsigned int): >>> adds w0, w0, w1 >>> csinv w0, w0, wzr, cc >>> ret >>> so just 2 instructions, and also branchless. On x86_64 we get: >>> sat_add_u_1(unsigned int, unsigned int): >>> add edi, esi >>> mov eax, -1 >>> cmovnc eax, edi >>> ret >>> so 3 instructions but a dependency chain of 2. >>> also branchless. This patch would regress both of these. >> >> But the above Godbolt may not be a good example for evidence, because both >> x86_64 and aarch64 implemented usadd >> already. >> Thus, they all go to usadd<QImode>. For example as below, the sat_add_u_1 and >> sat_add_u_2 are almost the >> same when the backend implemented usadd. >> >> #include <stdint.h> >> >> #define T uint32_t >> >> T sat_add_u_1 (T x, T y) >> { >> return (T)(x + y) < x ? -1 : (x + y); >> } >> >> T sat_add_u_2 (T x, T y) >> { >> return (x + y) | -((x + y) < x); >> } >> >> It will become different when take gcc 14.2 (which doesn’t have .SAT_ADD GIMPLE >> IR), the x86_64 >> will have below asm dump for -O3. Looks like no obvious difference here. >> >> sat_add_u_1(unsigned int, unsigned int): >> add edi, esi >> mov eax, -1 >> cmovnc eax, edi >> ret >> >> sat_add_u_2(unsigned int, unsigned int): >> add edi, esi >> sbb eax, eax >> or eax, edi >> ret >> > > Because CE is able to recognize the idiom back into a conditional move. > Pick a target that doesn't have conditional instructions, like PowerPC > https://godbolt.org/z/4bTv18WMv > > You'll see that this canonicalization has made codegen worse. > > After: > > .L.sat_add_u_1(unsigned int, unsigned int): > add 4,3,4 > rldicl 9,4,0,32 > subf 3,3,9 > sradi 3,3,63 > or 3,3,4 > rldicl 3,3,0,32 > blr > > and before > > .L.sat_add_u_1(unsigned int, unsigned int): > add 4,3,4 > cmplw 0,4,3 > bge 0,.L2 > li 4,-1 > .L2: > rldicl 3,4,0,32 > blr > > It means now it always has to execute 6 instructions, whereas before it was 4 or 5 depending > on the order of the branch. So for those architectures, it's always slower. I'm not sure it's that simple. It'll depend on the micro-architecture. So things like strength of the branch predictors, how fetch blocks are handled (can you have embedded not-taken branches, short-forward-branch optimizations, etc). Jeff
Thanks Tamar and Jeff for comments. > I'm not sure it's that simple. It'll depend on the micro-architecture. > So things like strength of the branch predictors, how fetch blocks are > handled (can you have embedded not-taken branches, short-forward-branch > optimizations, etc). > After: > > .L.sat_add_u_1(unsigned int, unsigned int): > add 4,3,4 > rldicl 9,4,0,32 > subf 3,3,9 > sradi 3,3,63 > or 3,3,4 > rldicl 3,3,0,32 > blr > > and before > > .L.sat_add_u_1(unsigned int, unsigned int): > add 4,3,4 > cmplw 0,4,3 > bge 0,.L2 > li 4,-1 > .L2: > rldicl 3,4,0,32 > blr I am not familiar with branch prediction, but the branch should be 50% taken and 50% not-taken according to the range of the sat add input. Is that the worst case for branch prediction? I mean, if we call it 100 times with a taken, not-taken, taken, not-taken... sequence, will the branch version still be faster? Feel free to correct me if I'm wrong. Back to these 16 forms of sat add as below, is there any suggestion which one or two form(s) may be cheaper than others from the perspective of gimple IR? Independent of whether the backend implements SAT_ADD or not. #define DEF_SAT_U_ADD_1(T) \ T sat_u_add_##T##_1 (T x, T y) \ { \ return (T)(x + y) >= x ? (x + y) : -1; \ } #define DEF_SAT_U_ADD_2(T) \ T sat_u_add_##T##_2 (T x, T y) \ { \ return (T)(x + y) < x ? -1 : (x + y); \ } #define DEF_SAT_U_ADD_3(T) \ T sat_u_add_##T##_3 (T x, T y) \ { \ return x <= (T)(x + y) ? (x + y) : -1; \ } #define DEF_SAT_U_ADD_4(T) \ T sat_u_add_##T##_4 (T x, T y) \ { \ return x > (T)(x + y) ? -1 : (x + y); \ } #define DEF_SAT_U_ADD_5(T) \ T sat_u_add_##T##_5 (T x, T y) \ { \ if ((T)(x + y) >= x) \ return x + y; \ else \ return -1; \ } #define DEF_SAT_U_ADD_6(T) \ T sat_u_add_##T##_6 (T x, T y) \ { \ if ((T)(x + y) < x) \ return -1; \ else \ return x + y; \ } #define DEF_SAT_U_ADD_7(T) \ T sat_u_add_##T##_7 (T x, T y) \ { \ if (x <= (T)(x + y)) \ return x + y; \ else \ return -1; \ } #define DEF_SAT_U_ADD_8(T) \ T sat_u_add_##T##_8 (T x, T y) \ { \ if (x > (T)(x + y)) \ return -1; \ else \ return x + y; \ } #define DEF_SAT_U_ADD_9(T) \ T sat_u_add_##T##_9 (T x, T y) \ { \ T ret; \ return __builtin_add_overflow (x, y, &ret) == 0 ? ret : - 1; \ } #define DEF_SAT_U_ADD_10(T) \ T sat_u_add_##T##_10 (T x, T y) \ { \ T ret; \ return !__builtin_add_overflow (x, y, &ret) ? ret : - 1; \ } #define DEF_SAT_U_ADD_11(T) \ T sat_u_add_##T##_11 (T x, T y) \ { \ T ret; \ if (__builtin_add_overflow (x, y, &ret) == 0) \ return ret; \ else \ return -1; \ } #define DEF_SAT_U_ADD_12(T) \ T sat_u_add_##T##_12 (T x, T y) \ { \ T ret; \ if (!__builtin_add_overflow (x, y, &ret)) \ return ret; \ else \ return -1; \ } #define DEF_SAT_U_ADD_13(T) \ T sat_u_add_##T##_13 (T x, T y) \ { \ T ret; \ return __builtin_add_overflow (x, y, &ret) != 0 ? -1 : ret; \ } #define DEF_SAT_U_ADD_14(T) \ T sat_u_add_##T##_14 (T x, T y) \ { \ T ret; \ return __builtin_add_overflow (x, y, &ret) ?
-1 : ret; \ } #define DEF_SAT_U_ADD_15(T) \ T sat_u_add_##T##_15 (T x, T y) \ { \ T ret; \ if (__builtin_add_overflow (x, y, &ret) != 0) \ return -1; \ else \ return ret; \ } #define DEF_SAT_U_ADD_16(T) \ T sat_u_add_##T##_16 (T x, T y) \ { \ T ret; \ if (__builtin_add_overflow (x, y, &ret)) \ return -1; \ else \ return ret; \ } Pan
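The 50%-taken question above is measurable. Below is a minimal sketch of such an experiment; the harness, function names, iteration count, and clock_gettime timing are illustrative choices, not from this thread, and on targets with conditional moves the compiler may if-convert the branch form anyway, which is exactly the point under debate:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

static uint32_t sat_add_branch (uint32_t x, uint32_t y)
{
  return (uint32_t) (x + y) < x ? (uint32_t) -1 : x + y;
}

static uint32_t sat_add_branchless (uint32_t x, uint32_t y)
{
  uint32_t sum = x + y;
  return sum | -(uint32_t) (sum < x);
}

static double elapsed (struct timespec a, struct timespec b)
{
  return (b.tv_sec - a.tv_sec) + (b.tv_nsec - a.tv_nsec) * 1e-9;
}

int main (void)
{
  enum { N = 100000000 };
  volatile uint32_t sink = 0;
  struct timespec t0, t1;

  /* Even iterations overflow (branch taken), odd ones do not.  Note a
     strict period-2 pattern is one that most predictors learn quickly;
     a pseudo-random taken/not-taken pattern would be the real worst
     case.  */
  clock_gettime (CLOCK_MONOTONIC, &t0);
  for (uint32_t i = 0; i < N; i++)
    sink = sat_add_branch ((i & 1) ? 1u : UINT32_MAX, 2u);
  clock_gettime (CLOCK_MONOTONIC, &t1);
  printf ("branch:     %f s\n", elapsed (t0, t1));

  clock_gettime (CLOCK_MONOTONIC, &t0);
  for (uint32_t i = 0; i < N; i++)
    sink = sat_add_branchless ((i & 1) ? 1u : UINT32_MAX, 2u);
  clock_gettime (CLOCK_MONOTONIC, &t1);
  printf ("branchless: %f s\n", elapsed (t0, t1));

  return sink != 0 ? 0 : 1;
}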
> -----Original Message----- > From: Jeff Law <jeffreyalaw@gmail.com> > Sent: Thursday, November 7, 2024 8:08 PM > To: Tamar Christina <Tamar.Christina@arm.com>; Li, Pan2 <pan2.li@intel.com>; > Richard Biener <richard.guenther@gmail.com> > Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; > rdapp.gcc@gmail.com > Subject: Re: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned > SAT_ADD into branchless > > I'm not sure it's that simple. It'll depend on the micro-architecture.
> So things like strength of the branch predictors, how fetch blocks are > handled (can you have embedded not-taken branches, short-forward-branch > optimizations, etc). It's a pretty safe assumption that most modern branch predictors don't have an issue with a simple diamond node like this. But since you brought up fetch blocks, at 5 instructions you're far more likely to fit the entire thing in a fetch block, especially if within a loop where most targets align loops. Whereas the more instructions you add, the larger the chance of you needing multiple fetches for the loop. You then weigh the cost of a misprediction (which will get better as the loop runs) vs the cost of an underutilized fetch, which will not get better. You are also making an assumption that logical operations are cheap on the particular core in the longer sequence. I would agree that removing the branch is universally a good idea if you produce the same or fewer instructions, or cheaper instructions, as I said before. I disagree that replacing it with more instructions is a universally better thing, and you yourself said "it'll depend". Tamar
On Fri, Nov 8, 2024 at 12:34 AM Li, Pan2 <pan2.li@intel.com> wrote:
>
> Thanks Tamar and Jeff for comments.
>
> > I'm not sure it's that simple.  It'll depend on the micro-architecture.
> > So things like strength of the branch predictors, how fetch blocks are
> > handled (can you have embedded not-taken branches, short-forward-branch
> > optimizations, etc).
>
> > After:
> >
> > .L.sat_add_u_1(unsigned int, unsigned int):
> >         add 4,3,4
> >         rldicl 9,4,0,32
> >         subf 3,3,9
> >         sradi 3,3,63
> >         or 3,3,4
> >         rldicl 3,3,0,32
> >         blr
> >
> > and before
> >
> > .L.sat_add_u_1(unsigned int, unsigned int):
> >         add 4,3,4
> >         cmplw 0,4,3
> >         bge 0,.L2
> >         li 4,-1
> > .L2:
> >         rldicl 3,4,0,32
> >         blr
>
> I am not familiar with branch prediction, but the branch should be 50%
> taken and 50% not-taken given the range of the sat-add inputs.  Is that
> the worst case for branch prediction?  I mean, if we call it 100 times
> with a taken, not-taken, taken, not-taken... sequence, will the branch
> version still be faster?  Feel free to correct me if I'm wrong.
>
> Back to these 16 forms of sat-add below: is there any suggestion as to
> which one or two form(s) may be cheaper than the others from the
> perspective of GIMPLE IR, independent of whether the backend implements
> SAT_ADD?
>
> #define DEF_SAT_U_ADD_1(T)                 \
> T sat_u_add_##T##_1 (T x, T y)             \
> {                                          \
>   return (T)(x + y) >= x ? (x + y) : -1;   \
> }
>
> #define DEF_SAT_U_ADD_2(T)                 \
> T sat_u_add_##T##_2 (T x, T y)             \
> {                                          \
>   return (T)(x + y) < x ? -1 : (x + y);    \
> }
>
> #define DEF_SAT_U_ADD_3(T)                 \
> T sat_u_add_##T##_3 (T x, T y)             \
> {                                          \
>   return x <= (T)(x + y) ? (x + y) : -1;   \
> }
>
> #define DEF_SAT_U_ADD_4(T)                 \
> T sat_u_add_##T##_4 (T x, T y)             \
> {                                          \
>   return x > (T)(x + y) ? -1 : (x + y);    \
> }
>
> #define DEF_SAT_U_ADD_5(T)                 \
> T sat_u_add_##T##_5 (T x, T y)             \
> {                                          \
>   if ((T)(x + y) >= x)                     \
>     return x + y;                          \
>   else                                     \
>     return -1;                             \
> }
>
> #define DEF_SAT_U_ADD_6(T)                 \
> T sat_u_add_##T##_6 (T x, T y)             \
> {                                          \
>   if ((T)(x + y) < x)                      \
>     return -1;                             \
>   else                                     \
>     return x + y;                          \
> }
>
> #define DEF_SAT_U_ADD_7(T)                 \
> T sat_u_add_##T##_7 (T x, T y)             \
> {                                          \
>   if (x <= (T)(x + y))                     \
>     return x + y;                          \
>   else                                     \
>     return -1;                             \
> }
>
> #define DEF_SAT_U_ADD_8(T)                 \
> T sat_u_add_##T##_8 (T x, T y)             \
> {                                          \
>   if (x > (T)(x + y))                      \
>     return -1;                             \
>   else                                     \
>     return x + y;                          \
> }
>
> #define DEF_SAT_U_ADD_9(T)                                     \
> T sat_u_add_##T##_9 (T x, T y)                                 \
> {                                                              \
>   T ret;                                                       \
>   return __builtin_add_overflow (x, y, &ret) == 0 ? ret : -1;  \
> }
>
> #define DEF_SAT_U_ADD_10(T)                                    \
> T sat_u_add_##T##_10 (T x, T y)                                \
> {                                                              \
>   T ret;                                                       \
>   return !__builtin_add_overflow (x, y, &ret) ? ret : -1;      \
> }
>
> #define DEF_SAT_U_ADD_11(T)                      \
> T sat_u_add_##T##_11 (T x, T y)                  \
> {                                                \
>   T ret;                                         \
>   if (__builtin_add_overflow (x, y, &ret) == 0)  \
>     return ret;                                  \
>   else                                           \
>     return -1;                                   \
> }
>
> #define DEF_SAT_U_ADD_12(T)                      \
> T sat_u_add_##T##_12 (T x, T y)                  \
> {                                                \
>   T ret;                                         \
>   if (!__builtin_add_overflow (x, y, &ret))      \
>     return ret;                                  \
>   else                                           \
>     return -1;                                   \
> }
>
> #define DEF_SAT_U_ADD_13(T)                                    \
> T sat_u_add_##T##_13 (T x, T y)                                \
> {                                                              \
>   T ret;                                                       \
>   return __builtin_add_overflow (x, y, &ret) != 0 ? -1 : ret;  \
> }
>
> #define DEF_SAT_U_ADD_14(T)                                    \
> T sat_u_add_##T##_14 (T x, T y)                                \
> {                                                              \
>   T ret;                                                       \
>   return __builtin_add_overflow (x, y, &ret) ? -1 : ret;       \
> }
>
> #define DEF_SAT_U_ADD_15(T)                      \
> T sat_u_add_##T##_15 (T x, T y)                  \
> {                                                \
>   T ret;                                         \
>   if (__builtin_add_overflow (x, y, &ret) != 0)  \
>     return -1;                                   \
>   else                                           \
>     return ret;                                  \
> }
>
> #define DEF_SAT_U_ADD_16(T)                      \
> T sat_u_add_##T##_16 (T x, T y)                  \
> {                                                \
>   T ret;                                         \
>   if (__builtin_add_overflow (x, y, &ret))       \
>     return -1;                                   \
>   else                                           \
>     return ret;                                  \
> }

So for the late .SAT_* introduction in the widen_mul pass, the
__builtin_add_overflow variants make sense to use iff the target has
optabs for those.  The IL for the builtin variant should appear cheap
as well:

  _6 = .ADD_OVERFLOW (x_4(D), y_5(D));
  _2 = IMAGPART_EXPR <_6>;
  if (_2 != 0)
    goto <bb 4>; [21.72%]
  else
    goto <bb 3>; [78.28%]

  <bb 3> [local count: 840525096]:
  _1 = REALPART_EXPR <_6>;

  <bb 4> [local count: 1073741824]:
  # _3 = PHI <65535(2), _1(3)>

It's possibly also the best canonicalization for the overflow check,
though I fear code generation for targets that do not support
__builtin_*_overflow might end up with two branches.

That said - I'd avoid canonicalizing this via match.pd, given that it
inevitably will if-convert.  Instead I'd see it as a way to provide a
generic .SAT_* expansion, though one could say we should then simply
implement fallback expansion for the internal function.  It's also not
necessarily your responsibility to implement this, since risc-v does
have .SAT_* expanders, and so does x86.

Richard.

> Pan
>
> -----Original Message-----
> From: Jeff Law <jeffreyalaw@gmail.com>
> Sent: Friday, November 8, 2024 4:08 AM
> To: Tamar Christina <Tamar.Christina@arm.com>; Li, Pan2 <pan2.li@intel.com>; Richard Biener <richard.guenther@gmail.com>
> Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; rdapp.gcc@gmail.com
> Subject: Re: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned SAT_ADD into branchless
>
> On 11/7/24 8:07 AM, Tamar Christina wrote:
> >
> >> -----Original Message-----
> >> From: Li, Pan2 <pan2.li@intel.com>
> >> Sent: Thursday, November 7, 2024 12:57 PM
> >> To: Tamar Christina <Tamar.Christina@arm.com>; Richard Biener
> >> <richard.guenther@gmail.com>
> >> Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com;
> >> jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
> >> Subject: RE: [PATCH v2 01/10] Match: Simplify branch form 4 of unsigned
> >> SAT_ADD into branchless
> >>
> >> I see your point that the backend can leverage conditional moves to emit
> >> the branch code.
> >>
> >>> For instance see https://godbolt.org/z/fvrq3aq6K
> >>> On ISAs with conditional operations the branch version gets if-converted.
> >>> On AArch64 we get:
> >>>
> >>> sat_add_u_1(unsigned int, unsigned int):
> >>>         adds w0, w0, w1
> >>>         csinv w0, w0, wzr, cc
> >>>         ret
> >>>
> >>> so just 2 instructions, and also branchless.  On x86_64 we get:
> >>>
> >>> sat_add_u_1(unsigned int, unsigned int):
> >>>         add edi, esi
> >>>         mov eax, -1
> >>>         cmovnc eax, edi
> >>>         ret
> >>>
> >>> so 3 instructions but a dependency chain of 2, also branchless.
> >>> This patch would regress both of these.
> >>
> >> But the above godbolt may not be good evidence, because both x86_64 and
> >> aarch64 already implement usadd.  Thus, they both go to usadd<QImode>.
> >> For example, as below, sat_add_u_1 and sat_add_u_2 are almost the same
> >> when the backend implements usadd.
> >>
> >> #include <stdint.h>
> >>
> >> #define T uint32_t
> >>
> >> T sat_add_u_1 (T x, T y)
> >> {
> >>   return (T)(x + y) < x ? -1 : (x + y);
> >> }
> >>
> >> T sat_add_u_2 (T x, T y)
> >> {
> >>   return (x + y) | -((x + y) < x);
> >> }
> >>
> >> It becomes different with gcc 14.2 (which doesn't have the .SAT_ADD GIMPLE
> >> IR); the x86_64 asm dump for -O3 is below.  Looks like no obvious
> >> difference here.
> >>
> >> sat_add_u_1(unsigned int, unsigned int):
> >>         add edi, esi
> >>         mov eax, -1
> >>         cmovnc eax, edi
> >>         ret
> >>
> >> sat_add_u_2(unsigned int, unsigned int):
> >>         add edi, esi
> >>         sbb eax, eax
> >>         or eax, edi
> >>         ret
> >>
> >
> > Because CE is able to recognize the idiom back into a conditional move.
> > Pick a target that doesn't have conditional instructions, like PowerPC:
> > https://godbolt.org/z/4bTv18WMv
> >
> > You'll see that this canonicalization has made codegen worse.
> >
> > After:
> >
> > .L.sat_add_u_1(unsigned int, unsigned int):
> >         add 4,3,4
> >         rldicl 9,4,0,32
> >         subf 3,3,9
> >         sradi 3,3,63
> >         or 3,3,4
> >         rldicl 3,3,0,32
> >         blr
> >
> > and before
> >
> > .L.sat_add_u_1(unsigned int, unsigned int):
> >         add 4,3,4
> >         cmplw 0,4,3
> >         bge 0,.L2
> >         li 4,-1
> > .L2:
> >         rldicl 3,4,0,32
> >         blr
> >
> > It means now it always has to execute 6 instructions, whereas before it
> > was 4 or 5 depending on the order of the branch.  So for those
> > architectures, it's always slower.
> I'm not sure it's that simple.  It'll depend on the micro-architecture.
> So things like strength of the branch predictors, how fetch blocks are
> handled (can you have embedded not-taken branches, short-forward-branch
> optimizations, etc).
>
> Jeff
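[To make the fallback-expansion idea above concrete, here is a hedged
C-level sketch of the value a branchless generic .SAT_U_ADD expansion
built on .ADD_OVERFLOW could compute.  The function name is made up for
illustration; a real fallback would emit GIMPLE/RTL from the internal
function's expander, not C.]

#include <stdint.h>

/* Illustrative only: OR the .ADD_OVERFLOW sum with an all-ones mask
   derived from the overflow flag, avoiding the two-branch code
   generation feared above.  */
static inline uint32_t
sat_u_add_fallback_sketch (uint32_t x, uint32_t y)
{
  uint32_t sum;
  /* ovf is 1 on overflow, 0 otherwise; -ovf is then 0xffffffff or 0.  */
  uint32_t ovf = (uint32_t) __builtin_add_overflow (x, y, &sum);
  return sum | -ovf;
}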
diff --git a/gcc/match.pd b/gcc/match.pd
index c851ac56e37..8425d7c4f20 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -3146,18 +3146,17 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 (match (unsigned_integer_sat_add @0 @1)
  (bit_ior:c (usadd_left_part_2 @0 @1) (usadd_right_part_2 @0 @1)))
 
-/* Simplify SAT_U_ADD to the cheap form
-   From: SAT_U_ADD = (X + Y) >= x ? (X + Y) : -1.
-   To:   SAT_U_ADD = (X + Y) | - ((X + Y) < X).  */
-(simplify (cond (ge (plus:c@2 @0 @1) @0) @2 integer_minus_onep)
- (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
-      && types_match (type, @0, @1))
-  (bit_ior @2 (negate (convert (lt @2 @0))))))
-
-/* Unsigned saturation add, case 4 (branch with lt):
-   SAT_U_ADD = (X + Y) < x ? -1 : (X + Y).  */
-(match (unsigned_integer_sat_add @0 @1)
- (cond^ (lt (usadd_left_part_1@2 @0 @1) @0) integer_minus_onep @2))
+/* Simplify sorts of SAT_U_ADD forms to the cheap one.
+   The cheap form: SAT_U_ADD = (X + Y) | - ((X + Y) < X).  */
+(if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type))
+ /* From SAT_U_ADD = (X + Y) >= x ? (X + Y) : -1.  */
+ (simplify (cond (ge (plus:c@2 @0 @1) @0) @2 integer_minus_onep)
+  (if (types_match (type, @0, @1))
+   (bit_ior (plus@2 @0 @1) (negate (convert (lt @2 @0))))))
+ /* From SAT_U_ADD = (X + Y) < x ? -1 : (X + Y).  */
+ (simplify (cond (lt (plus:c@2 @0 @1) @0) integer_minus_onep @2)
+  (if (types_match (type, @0, @1))
+   (bit_ior (plus@2 @0 @1) (negate (convert (lt @2 @0)))))))
 
 /* Unsigned saturation add, case 5 (branch with eq .ADD_OVERFLOW):
    SAT_U_ADD = REALPART_EXPR <.ADD_OVERFLOW> == 0 ? .ADD_OVERFLOW : -1.  */
diff --git a/gcc/testsuite/gcc.dg/sat_arith_simplify.h b/gcc/testsuite/gcc.dg/sat_arith_simplify.h
new file mode 100644
index 00000000000..46ac00426b2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/sat_arith_simplify.h
@@ -0,0 +1,10 @@
+#ifndef HAVE_DEFINED_SAT_ARITH_SIMPLIFY_H
+#define HAVE_DEFINED_SAT_ARITH_SIMPLIFY_H
+
+#define DEF_SAT_U_ADD_2(T)                \
+T sat_u_add_##T##_2 (T x, T y)            \
+{                                         \
+  return (T)(x + y) < x ? -1 : (x + y);   \
+}
+
+#endif
diff --git a/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u16.c b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u16.c
new file mode 100644
index 00000000000..b170b35096c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u16.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-gimple-details" } */
+
+#include <stdint.h>
+#include "sat_arith_simplify.h"
+
+DEF_SAT_U_ADD_2 (uint16_t)
+
+/* { dg-final { scan-tree-dump-not " if " "gimple" } } */
+/* { dg-final { scan-tree-dump-not " else " "gimple" } } */
+/* { dg-final { scan-tree-dump-not " goto " "gimple" } } */
diff --git a/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u32.c b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u32.c
new file mode 100644
index 00000000000..8830ed7b878
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u32.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-gimple-details" } */
+
+#include <stdint.h>
+#include "sat_arith_simplify.h"
+
+DEF_SAT_U_ADD_2 (uint32_t)
+
+/* { dg-final { scan-tree-dump-not " if " "gimple" } } */
+/* { dg-final { scan-tree-dump-not " else " "gimple" } } */
+/* { dg-final { scan-tree-dump-not " goto " "gimple" } } */
diff --git a/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u64.c b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u64.c
new file mode 100644
index 00000000000..76c4d4bddaa
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u64.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-gimple-details" } */
+
+#include <stdint.h>
+#include "sat_arith_simplify.h"
+
+DEF_SAT_U_ADD_2 (uint64_t)
+
+/* { dg-final { scan-tree-dump-not " if " "gimple" } } */
+/* { dg-final { scan-tree-dump-not " else " "gimple" } } */
+/* { dg-final { scan-tree-dump-not " goto " "gimple" } } */
diff --git a/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u8.c b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u8.c
new file mode 100644
index 00000000000..b034b8eedb1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/sat_u_add-simplify-2-u8.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-gimple-details" } */
+
+#include <stdint.h>
+#include "sat_arith_simplify.h"
+
+DEF_SAT_U_ADD_2 (uint8_t)
+
+/* { dg-final { scan-tree-dump-not " if " "gimple" } } */
+/* { dg-final { scan-tree-dump-not " else " "gimple" } } */
+/* { dg-final { scan-tree-dump-not " goto " "gimple" } } */
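[A closing sanity-check sketch, not part of the patch or its testsuite:
the new match.pd rules treat the conditional form and the bit_ior form as
equivalent, and for uint8_t that equivalence is cheap to verify
exhaustively.  The function names below are illustrative.]

#include <stdint.h>
#include <assert.h>

static uint8_t
sat_branch (uint8_t x, uint8_t y)
{
  return (uint8_t) (x + y) < x ? -1 : (uint8_t) (x + y);
}

static uint8_t
sat_branchless (uint8_t x, uint8_t y)
{
  return ((uint8_t) (x + y)) | - (uint8_t) ((uint8_t) (x + y) < x);
}

int
main (void)
{
  /* Exhaust all 65536 input pairs; both forms must agree everywhere.  */
  for (unsigned x = 0; x < 256; x++)
    for (unsigned y = 0; y < 256; y++)
      assert (sat_branch (x, y) == sat_branchless (x, y));
  return 0;
}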