diff mbox series

match.pd: Add std::pow folding optimizations.

Message ID BEA21137-FDCB-478A-82C0-C88C42FF74B0@nvidia.com
State New
Headers show
Series match.pd: Add std::pow folding optimizations. | expand

Commit Message

Jennifer Schmitz Oct. 18, 2024, 12:08 p.m. UTC
This patch adds the following two simplifications in match.pd:
- pow (1.0/x, y) to pow (x, -y), avoiding the division
- pow (0.0, x) to 0.0, avoiding the call to pow.
The patterns are guarded by flag_unsafe_math_optimizations,
!flag_trapping_math, !flag_errno_math, !HONOR_SIGNED_ZEROS,
and !HONOR_INFINITIES.

Tests were added to confirm the application of the transform for float,
double, and long double.

The patch was bootstrapped and regtested on aarch64-linux-gnu and
x86_64-linux-gnu, no regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>

gcc/
	* match.pd: Fold pow (1.0/x, y) -> pow (x, -y) and
	pow (0.0, x) -> 0.0.

gcc/testsuite/
	* gcc.dg/tree-ssa/pow_fold_1.c: New test.
---
 gcc/match.pd                               | 14 +++++++++
 gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c | 34 ++++++++++++++++++++++
 2 files changed, 48 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c

Comments

Andrew Pinski Oct. 18, 2024, 7:46 p.m. UTC | #1
On Fri, Oct 18, 2024 at 5:09 AM Jennifer Schmitz <jschmitz@nvidia.com> wrote:
>
> This patch adds the following two simplifications in match.pd:
> - pow (1.0/x, y) to pow (x, -y), avoiding the division
> - pow (0.0, x) to 0.0, avoiding the call to pow.
> The patterns are guarded by flag_unsafe_math_optimizations,
> !flag_trapping_math, !flag_errno_math, !HONOR_SIGNED_ZEROS,
> and !HONOR_INFINITIES.
>
> Tests were added to confirm the application of the transform for float,
> double, and long double.
>
> The patch was bootstrapped and regtested on aarch64-linux-gnu and
> x86_64-linux-gnu, no regression.
> OK for mainline?
>
> Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>
>
> gcc/
>         * match.pd: Fold pow (1.0/x, y) -> pow (x, -y) and
>         pow (0.0, x) -> 0.0.

Note this is not to block this patch; it looks good to me.
We have __builtin_powi too and these seem like these simplifications
should apply for that builtin also.

Also I do think you should add a testcase for powf16 too.

Thanks,
Andrew Pinski

>
> gcc/testsuite/
>         * gcc.dg/tree-ssa/pow_fold_1.c: New test.
> ---
>  gcc/match.pd                               | 14 +++++++++
>  gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c | 34 ++++++++++++++++++++++
>  2 files changed, 48 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 12d81fcac0d..ba100b117e7 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -8203,6 +8203,20 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>     (rdiv @0 (exps:s @1))
>      (mult @0 (exps (negate @1)))))
>
> + /* Simplify pow(1.0/x, y) into pow(x, -y).  */
> + (if (! HONOR_INFINITIES (type)
> +      && ! HONOR_SIGNED_ZEROS (type)
> +      && ! flag_trapping_math
> +      && ! flag_errno_math)
> +  (simplify
> +   (POW (rdiv:s real_onep@0 @1) @2)
> +    (POW @1 (negate @2)))
> +
> +  /* Simplify pow(0.0, x) into 0.0.  */
> +  (simplify
> +   (POW real_zerop@0 @1)
> +    @0))
> +
>   (if (! HONOR_SIGN_DEPENDENT_ROUNDING (type)
>        && ! HONOR_NANS (type) && ! HONOR_INFINITIES (type)
>        && ! flag_trapping_math
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c b/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
> new file mode 100644
> index 00000000000..113df572661
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
> @@ -0,0 +1,34 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ffast-math" } */
> +/* { dg-require-effective-target c99_runtime } */
> +
> +extern void link_error (void);
> +
> +#define POW1OVER(TYPE, C_TY, TY)                       \
> +  void                                                 \
> +  pow1over_##TY (TYPE x, TYPE y)                       \
> +  {                                                    \
> +    TYPE t1 = 1.0##C_TY / x;                           \
> +    TYPE t2 = __builtin_pow##TY (t1, y);               \
> +    TYPE t3 = -y;                                      \
> +    TYPE t4 = __builtin_pow##TY (x, t3);               \
> +    if (t2 != t4)                                      \
> +      link_error ();                                   \
> +  }                                                    \
> +
> +#define POW0(TYPE, C_TY, TY)                           \
> +  void                                                 \
> +  pow0_##TY (TYPE x)                                   \
> +  {                                                    \
> +    TYPE t1 = __builtin_pow##TY (0.0##C_TY, x);                \
> +    if (t1 != 0.0##C_TY)                               \
> +      link_error ();                                   \
> +  }                                                    \
> +
> +#define TEST_ALL(TYPE, C_TY, TY)                       \
> +  POW1OVER (TYPE, C_TY, TY)                            \
> +  POW0 (TYPE, C_TY, TY)
> +
> +TEST_ALL (double, , )
> +TEST_ALL (float, f, f)
> +TEST_ALL (long double, L, l)
> --
> 2.34.1
Richard Biener Oct. 21, 2024, 8:51 a.m. UTC | #2
On Fri, 18 Oct 2024, Jennifer Schmitz wrote:

> This patch adds the following two simplifications in match.pd:
> - pow (1.0/x, y) to pow (x, -y), avoiding the division
> - pow (0.0, x) to 0.0, avoiding the call to pow.
> The patterns are guarded by flag_unsafe_math_optimizations,
> !flag_trapping_math, !flag_errno_math, !HONOR_SIGNED_ZEROS,
> and !HONOR_INFINITIES.
> 
> Tests were added to confirm the application of the transform for float,
> double, and long double.
> 
> The patch was bootstrapped and regtested on aarch64-linux-gnu and
> x86_64-linux-gnu, no regression.
> OK for mainline?
> 
> Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>
> 
> gcc/
> 	* match.pd: Fold pow (1.0/x, y) -> pow (x, -y) and
> 	pow (0.0, x) -> 0.0.
> 
> gcc/testsuite/
> 	* gcc.dg/tree-ssa/pow_fold_1.c: New test.
> ---
>  gcc/match.pd                               | 14 +++++++++
>  gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c | 34 ++++++++++++++++++++++
>  2 files changed, 48 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
> 
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 12d81fcac0d..ba100b117e7 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -8203,6 +8203,20 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>     (rdiv @0 (exps:s @1))
>      (mult @0 (exps (negate @1)))))
>  
> + /* Simplify pow(1.0/x, y) into pow(x, -y).  */
> + (if (! HONOR_INFINITIES (type)
> +      && ! HONOR_SIGNED_ZEROS (type)
> +      && ! flag_trapping_math
> +      && ! flag_errno_math)
> +  (simplify
> +   (POW (rdiv:s real_onep@0 @1) @2)
> +    (POW @1 (negate @2)))

This one shouldn't need HONOR_SIGNED_ZEROS?

> +
> +  /* Simplify pow(0.0, x) into 0.0.  */
> +  (simplify
> +   (POW real_zerop@0 @1)

I think this needs !HONOR_NANS (type)?

Otherwise OK.

Richard.

> +    @0))
> +
>   (if (! HONOR_SIGN_DEPENDENT_ROUNDING (type)
>        && ! HONOR_NANS (type) && ! HONOR_INFINITIES (type)
>        && ! flag_trapping_math
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c b/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
> new file mode 100644
> index 00000000000..113df572661
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
> @@ -0,0 +1,34 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ffast-math" } */
> +/* { dg-require-effective-target c99_runtime } */
> +
> +extern void link_error (void);
> +
> +#define POW1OVER(TYPE, C_TY, TY)			\
> +  void							\
> +  pow1over_##TY (TYPE x, TYPE y)			\
> +  {							\
> +    TYPE t1 = 1.0##C_TY / x;				\
> +    TYPE t2 = __builtin_pow##TY (t1, y);		\
> +    TYPE t3 = -y;					\
> +    TYPE t4 = __builtin_pow##TY (x, t3);		\
> +    if (t2 != t4)					\
> +      link_error ();					\
> +  }							\
> +
> +#define POW0(TYPE, C_TY, TY)				\
> +  void							\
> +  pow0_##TY (TYPE x)					\
> +  {							\
> +    TYPE t1 = __builtin_pow##TY (0.0##C_TY, x);		\
> +    if (t1 != 0.0##C_TY)				\
> +      link_error ();					\
> +  }							\
> +
> +#define TEST_ALL(TYPE, C_TY, TY)			\
> +  POW1OVER (TYPE, C_TY, TY)				\
> +  POW0 (TYPE, C_TY, TY)
> +
> +TEST_ALL (double, , )
> +TEST_ALL (float, f, f)
> +TEST_ALL (long double, L, l)
>
Jennifer Schmitz Oct. 22, 2024, 8:46 a.m. UTC | #3
> On 21 Oct 2024, at 10:51, Richard Biener <rguenther@suse.de> wrote:
> 
> External email: Use caution opening links or attachments
> 
> 
> On Fri, 18 Oct 2024, Jennifer Schmitz wrote:
> 
>> This patch adds the following two simplifications in match.pd:
>> - pow (1.0/x, y) to pow (x, -y), avoiding the division
>> - pow (0.0, x) to 0.0, avoiding the call to pow.
>> The patterns are guarded by flag_unsafe_math_optimizations,
>> !flag_trapping_math, !flag_errno_math, !HONOR_SIGNED_ZEROS,
>> and !HONOR_INFINITIES.
>> 
>> Tests were added to confirm the application of the transform for float,
>> double, and long double.
>> 
>> The patch was bootstrapped and regtested on aarch64-linux-gnu and
>> x86_64-linux-gnu, no regression.
>> OK for mainline?
>> 
>> Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>
>> 
>> gcc/
>>      * match.pd: Fold pow (1.0/x, y) -> pow (x, -y) and
>>      pow (0.0, x) -> 0.0.
>> 
>> gcc/testsuite/
>>      * gcc.dg/tree-ssa/pow_fold_1.c: New test.
>> ---
>> gcc/match.pd                               | 14 +++++++++
>> gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c | 34 ++++++++++++++++++++++
>> 2 files changed, 48 insertions(+)
>> create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
>> 
>> diff --git a/gcc/match.pd b/gcc/match.pd
>> index 12d81fcac0d..ba100b117e7 100644
>> --- a/gcc/match.pd
>> +++ b/gcc/match.pd
>> @@ -8203,6 +8203,20 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>>    (rdiv @0 (exps:s @1))
>>     (mult @0 (exps (negate @1)))))
>> 
>> + /* Simplify pow(1.0/x, y) into pow(x, -y).  */
>> + (if (! HONOR_INFINITIES (type)
>> +      && ! HONOR_SIGNED_ZEROS (type)
>> +      && ! flag_trapping_math
>> +      && ! flag_errno_math)
>> +  (simplify
>> +   (POW (rdiv:s real_onep@0 @1) @2)
>> +    (POW @1 (negate @2)))
> 
> This one shouldn't need HONOR_SIGNED_ZEROS?
> 
>> +
>> +  /* Simplify pow(0.0, x) into 0.0.  */
>> +  (simplify
>> +   (POW real_zerop@0 @1)
> 
> I think this needs !HONOR_NANS (type)?
> 
> Otherwise OK.
Thanks for the feedback, Richard and Andrew. I made the following changes to the patch (current version of the patch below):
- also applied the pattern to POWI and added tests for powi, powif, powil
- not gate first pattern under !HONOR_SIGNED_ZEROS, but second one additionally under !HONOR_NANS (type)
- added tests for powf16

Now, I am encountering two problems:

First, the transform is not applied for float16 (even if -fexcess-precision=16). Do you know what the problem could be?

Second, validation on aarch64 shows a regression in tests
  - gcc.dg/recip_sqrt_mult_1.c and 
  - gcc.dg/recip_sqrt_mult_5.c,
because the pattern (POWI(1/x, y) -> POWI(x, -y)) is applied before the recip pass and prevents application of the recip-patterns. The reason for this might be that the single-use restriction only works if the integer argument is non-constant, but in the failing test cases, the integer argument is 2 and the pattern is applied despite the :s flag.
For example, my pattern is **not** applied (single-use restriction works) for:
double res, res2;
void foo (double a, int b)
{
  double f (double);
  double t1 = 1.0 / a;
  res = __builtin_powi (t1, b);
res2 = f (t1);
}

But the pattern **is** applied and single-use restriction does **not** work for:
double res, res2;
void foo (double a)
{
  double f (double);
  double t1 = 1.0 / a;
  res = __builtin_powi (t1, 2);
  res2 = f (t1);
}

Possible options to resolve this are:
  - gate pattern to run after recip pass
  - do not apply pattern for POWI
What are your thoughts on this?
Thanks,
Jennifer

This patch adds the following two simplifications in match.pd for POW
and POWI:
- pow (1.0/x, y) to pow (x, -y), avoiding the division
- pow (0.0, x) to 0.0, avoiding the call to pow.
The patterns are guarded by flag_unsafe_math_optimizations,
!flag_trapping_math, !flag_errno_math, and !HONOR_INFINITIES.
The second pattern is also guarded by !HONOR_NANS and
!HONOR_SIGNED_ZEROS.

Tests were added to confirm the application of the transform for
builtins pow, powf, powl, powi, powif, powil, and powf16.

The patch was bootstrapped and regtested on aarch64-linux-gnu and
x86_64-linux-gnu, no regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>

gcc/
	* match.pd: Fold pow (1.0/x, y) -> pow (x, -y) and
	pow (0.0, x) -> 0.0.

gcc/testsuite/
	* gcc.dg/tree-ssa/pow_fold_1.c: New test.
---
 gcc/match.pd                               | 15 ++++++++
 gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c | 42 ++++++++++++++++++++++
 2 files changed, 57 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c

diff --git a/gcc/match.pd b/gcc/match.pd
index 12d81fcac0d..b061ef9dc91 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -8203,6 +8203,21 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
    (rdiv @0 (exps:s @1))
     (mult @0 (exps (negate @1)))))
 
+ (for pow (POW POWI)
+  (if (! HONOR_INFINITIES (type)
+       && ! flag_trapping_math
+       && ! flag_errno_math)
+   /* Simplify pow(1.0/x, y) into pow(x, -y).  */
+   (simplify
+    (pow (rdiv:s real_onep@0 @1) @2)
+     (pow @1 (negate @2)))
+
+   /* Simplify pow(0.0, x) into 0.0.  */
+   (if (! HONOR_NANS (type) && ! HONOR_SIGNED_ZEROS (type))
+    (simplify
+     (pow real_zerop@0 @1)
+      @0))))
+
  (if (! HONOR_SIGN_DEPENDENT_ROUNDING (type)
       && ! HONOR_NANS (type) && ! HONOR_INFINITIES (type)
       && ! flag_trapping_math
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c b/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
new file mode 100644
index 00000000000..c38b7390478
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
@@ -0,0 +1,42 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ffast-math -fdump-tree-optimized" } */
+/* { dg-add-options float16 } */
+/* { dg-require-effective-target float16_runtime } */
+/* { dg-require-effective-target c99_runtime } */
+
+extern void link_error (void);
+
+#define POW1OVER(TYPE1, TYPE2, CTY, TY)			\
+  void							\
+  pow1over_##TY (TYPE1 x, TYPE2 y)			\
+  {							\
+    TYPE1 t1 = 1.0##CTY / x;				\
+    TYPE1 t2 = __builtin_pow##TY (t1, y);		\
+    TYPE2 t3 = -y;					\
+    TYPE1 t4 = __builtin_pow##TY (x, t3);		\
+    if (t2 != t4)					\
+      link_error ();					\
+  }							\
+
+#define POW0(TYPE1, TYPE2, CTY, TY)			\
+  void							\
+  pow0_##TY (TYPE2 x)					\
+  {							\
+    TYPE1 t1 = __builtin_pow##TY (0.0##CTY, x);		\
+    if (t1 != 0.0##CTY)					\
+      link_error ();					\
+  }							\
+
+#define TEST_ALL(TYPE1, TYPE2, CTY, TY)			\
+  POW1OVER (TYPE1, TYPE2, CTY, TY)			\
+  POW0 (TYPE1, TYPE2, CTY, TY)
+
+TEST_ALL (double, double, , )
+TEST_ALL (float, float, f, f)
+TEST_ALL (_Float16, _Float16, f16, f16)
+TEST_ALL (long double, long double, L, l)
+TEST_ALL (double, int, , i)
+TEST_ALL (float, int, f, if)
+TEST_ALL (long double, int, L, il)
+
+/* { dg-final { scan-tree-dump-not "link_error" "optimized" } } */
Richard Biener Oct. 22, 2024, 9:05 a.m. UTC | #4
On Tue, 22 Oct 2024, Jennifer Schmitz wrote:

> 
> 
> > On 21 Oct 2024, at 10:51, Richard Biener <rguenther@suse.de> wrote:
> > 
> > External email: Use caution opening links or attachments
> > 
> > 
> > On Fri, 18 Oct 2024, Jennifer Schmitz wrote:
> > 
> >> This patch adds the following two simplifications in match.pd:
> >> - pow (1.0/x, y) to pow (x, -y), avoiding the division
> >> - pow (0.0, x) to 0.0, avoiding the call to pow.
> >> The patterns are guarded by flag_unsafe_math_optimizations,
> >> !flag_trapping_math, !flag_errno_math, !HONOR_SIGNED_ZEROS,
> >> and !HONOR_INFINITIES.
> >> 
> >> Tests were added to confirm the application of the transform for float,
> >> double, and long double.
> >> 
> >> The patch was bootstrapped and regtested on aarch64-linux-gnu and
> >> x86_64-linux-gnu, no regression.
> >> OK for mainline?
> >> 
> >> Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>
> >> 
> >> gcc/
> >>      * match.pd: Fold pow (1.0/x, y) -> pow (x, -y) and
> >>      pow (0.0, x) -> 0.0.
> >> 
> >> gcc/testsuite/
> >>      * gcc.dg/tree-ssa/pow_fold_1.c: New test.
> >> ---
> >> gcc/match.pd                               | 14 +++++++++
> >> gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c | 34 ++++++++++++++++++++++
> >> 2 files changed, 48 insertions(+)
> >> create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
> >> 
> >> diff --git a/gcc/match.pd b/gcc/match.pd
> >> index 12d81fcac0d..ba100b117e7 100644
> >> --- a/gcc/match.pd
> >> +++ b/gcc/match.pd
> >> @@ -8203,6 +8203,20 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >>    (rdiv @0 (exps:s @1))
> >>     (mult @0 (exps (negate @1)))))
> >> 
> >> + /* Simplify pow(1.0/x, y) into pow(x, -y).  */
> >> + (if (! HONOR_INFINITIES (type)
> >> +      && ! HONOR_SIGNED_ZEROS (type)
> >> +      && ! flag_trapping_math
> >> +      && ! flag_errno_math)
> >> +  (simplify
> >> +   (POW (rdiv:s real_onep@0 @1) @2)
> >> +    (POW @1 (negate @2)))
> > 
> > This one shouldn't need HONOR_SIGNED_ZEROS?
> > 
> >> +
> >> +  /* Simplify pow(0.0, x) into 0.0.  */
> >> +  (simplify
> >> +   (POW real_zerop@0 @1)
> > 
> > I think this needs !HONOR_NANS (type)?
> > 
> > Otherwise OK.
> Thanks for the feedback, Richard and Andrew. I made the following changes to the patch (current version of the patch below):
> - also applied the pattern to POWI and added tests for powi, powif, powil
> - not gate first pattern under !HONOR_SIGNED_ZEROS, but second one additionally under !HONOR_NANS (type)
> - added tests for powf16

Note powi is GCC internal, it doesn't set errno and it should be subject
to different rules - I'd rather have patterns working on powi separate.

> Now, I am encountering two problems:
> 
> First, the transform is not applied for float16 (even if -fexcess-precision=16). Do you know what the problem could be?

I think you want to use POW_ALL instead of POW.  The generated
cfn-operators.pd shows

(define_operator_list POW
    BUILT_IN_POWF
    BUILT_IN_POW
    BUILT_IN_POWL
    IFN_POW)
(define_operator_list POW_FN
    BUILT_IN_POWF16
    BUILT_IN_POWF32
    BUILT_IN_POWF64
    BUILT_IN_POWF128
    BUILT_IN_POWF32X
    BUILT_IN_POWF64X
    BUILT_IN_POWF128X
    null)
(define_operator_list POW_ALL
    BUILT_IN_POWF
    BUILT_IN_POW
    BUILT_IN_POWL
    BUILT_IN_POWF16
...

note this comes at expense of more generated code (in 
gimple/generic-match.pd).

> Second, validation on aarch64 shows a regression in tests
>   - gcc.dg/recip_sqrt_mult_1.c and 
>   - gcc.dg/recip_sqrt_mult_5.c,
> because the pattern (POWI(1/x, y) -> POWI(x, -y)) is applied before the recip pass and prevents application of the recip-patterns. The reason for this might be that the single-use restriction only works if the integer argument is non-constant, but in the failing test cases, the integer argument is 2 and the pattern is applied despite the :s flag.
> For example, my pattern is **not** applied (single-use restriction works) for:
> double res, res2;
> void foo (double a, int b)
> {
>   double f (double);
>   double t1 = 1.0 / a;
>   res = __builtin_powi (t1, b);
> res2 = f (t1);
> }
> 
> But the pattern **is** applied and single-use restriction does **not** work for:
> double res, res2;
> void foo (double a)
> {
>   double f (double);
>   double t1 = 1.0 / a;
>   res = __builtin_powi (t1, 2);
>   res2 = f (t1);
> }

This must be because the result is a single operation.  :s only applies
when the result has sub-expressions.  This is to make CSE work.
The "fix" is to add explicit && single_use (@n) to override that
behavior.  Note that I think the transform is good even when the
division is used because the result reduces the dependence chain length.
It's only when @2 is non-constant that we're introducing another
stmt for the negation that re-introduces this latency (even if in
practice it would be smaller).

> Possible options to resolve this are:
>   - gate pattern to run after recip pass
>   - do not apply pattern for POWI

 - adjust the testcase (is the final outcome still good?)

> What are your thoughts on this?
> Thanks,
> Jennifer
> 
> This patch adds the following two simplifications in match.pd for POW
> and POWI:
> - pow (1.0/x, y) to pow (x, -y), avoiding the division
> - pow (0.0, x) to 0.0, avoiding the call to pow.
> The patterns are guarded by flag_unsafe_math_optimizations,
> !flag_trapping_math, !flag_errno_math, and !HONOR_INFINITIES.
> The second pattern is also guarded by !HONOR_NANS and
> !HONOR_SIGNED_ZEROS.
> 
> Tests were added to confirm the application of the transform for
> builtins pow, powf, powl, powi, powif, powil, and powf16.
> 
> The patch was bootstrapped and regtested on aarch64-linux-gnu and
> x86_64-linux-gnu, no regression.
> OK for mainline?
> 
> Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>
> 
> gcc/
> 	* match.pd: Fold pow (1.0/x, y) -> pow (x, -y) and
> 	pow (0.0, x) -> 0.0.
> 
> gcc/testsuite/
> 	* gcc.dg/tree-ssa/pow_fold_1.c: New test.
> ---
>  gcc/match.pd                               | 15 ++++++++
>  gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c | 42 ++++++++++++++++++++++
>  2 files changed, 57 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
> 
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 12d81fcac0d..b061ef9dc91 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -8203,6 +8203,21 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>     (rdiv @0 (exps:s @1))
>      (mult @0 (exps (negate @1)))))
>  
> + (for pow (POW POWI)
> +  (if (! HONOR_INFINITIES (type)
> +       && ! flag_trapping_math
> +       && ! flag_errno_math)
> +   /* Simplify pow(1.0/x, y) into pow(x, -y).  */
> +   (simplify
> +    (pow (rdiv:s real_onep@0 @1) @2)
> +     (pow @1 (negate @2)))
> +
> +   /* Simplify pow(0.0, x) into 0.0.  */
> +   (if (! HONOR_NANS (type) && ! HONOR_SIGNED_ZEROS (type))
> +    (simplify
> +     (pow real_zerop@0 @1)
> +      @0))))
> +
>   (if (! HONOR_SIGN_DEPENDENT_ROUNDING (type)
>        && ! HONOR_NANS (type) && ! HONOR_INFINITIES (type)
>        && ! flag_trapping_math
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c b/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
> new file mode 100644
> index 00000000000..c38b7390478
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
> @@ -0,0 +1,42 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ffast-math -fdump-tree-optimized" } */
> +/* { dg-add-options float16 } */
> +/* { dg-require-effective-target float16_runtime } */
> +/* { dg-require-effective-target c99_runtime } */
> +
> +extern void link_error (void);
> +
> +#define POW1OVER(TYPE1, TYPE2, CTY, TY)			\
> +  void							\
> +  pow1over_##TY (TYPE1 x, TYPE2 y)			\
> +  {							\
> +    TYPE1 t1 = 1.0##CTY / x;				\
> +    TYPE1 t2 = __builtin_pow##TY (t1, y);		\
> +    TYPE2 t3 = -y;					\
> +    TYPE1 t4 = __builtin_pow##TY (x, t3);		\
> +    if (t2 != t4)					\
> +      link_error ();					\
> +  }							\
> +
> +#define POW0(TYPE1, TYPE2, CTY, TY)			\
> +  void							\
> +  pow0_##TY (TYPE2 x)					\
> +  {							\
> +    TYPE1 t1 = __builtin_pow##TY (0.0##CTY, x);		\
> +    if (t1 != 0.0##CTY)					\
> +      link_error ();					\
> +  }							\
> +
> +#define TEST_ALL(TYPE1, TYPE2, CTY, TY)			\
> +  POW1OVER (TYPE1, TYPE2, CTY, TY)			\
> +  POW0 (TYPE1, TYPE2, CTY, TY)
> +
> +TEST_ALL (double, double, , )
> +TEST_ALL (float, float, f, f)
> +TEST_ALL (_Float16, _Float16, f16, f16)
> +TEST_ALL (long double, long double, L, l)
> +TEST_ALL (double, int, , i)
> +TEST_ALL (float, int, f, if)
> +TEST_ALL (long double, int, L, il)
> +
> +/* { dg-final { scan-tree-dump-not "link_error" "optimized" } } */
>
Jennifer Schmitz Oct. 22, 2024, 10:43 a.m. UTC | #5
> On 22 Oct 2024, at 11:05, Richard Biener <rguenther@suse.de> wrote:
> 
> External email: Use caution opening links or attachments
> 
> 
> On Tue, 22 Oct 2024, Jennifer Schmitz wrote:
> 
>> 
>> 
>>> On 21 Oct 2024, at 10:51, Richard Biener <rguenther@suse.de> wrote:
>>> 
>>> External email: Use caution opening links or attachments
>>> 
>>> 
>>> On Fri, 18 Oct 2024, Jennifer Schmitz wrote:
>>> 
>>>> This patch adds the following two simplifications in match.pd:
>>>> - pow (1.0/x, y) to pow (x, -y), avoiding the division
>>>> - pow (0.0, x) to 0.0, avoiding the call to pow.
>>>> The patterns are guarded by flag_unsafe_math_optimizations,
>>>> !flag_trapping_math, !flag_errno_math, !HONOR_SIGNED_ZEROS,
>>>> and !HONOR_INFINITIES.
>>>> 
>>>> Tests were added to confirm the application of the transform for float,
>>>> double, and long double.
>>>> 
>>>> The patch was bootstrapped and regtested on aarch64-linux-gnu and
>>>> x86_64-linux-gnu, no regression.
>>>> OK for mainline?
>>>> 
>>>> Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>
>>>> 
>>>> gcc/
>>>>     * match.pd: Fold pow (1.0/x, y) -> pow (x, -y) and
>>>>     pow (0.0, x) -> 0.0.
>>>> 
>>>> gcc/testsuite/
>>>>     * gcc.dg/tree-ssa/pow_fold_1.c: New test.
>>>> ---
>>>> gcc/match.pd                               | 14 +++++++++
>>>> gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c | 34 ++++++++++++++++++++++
>>>> 2 files changed, 48 insertions(+)
>>>> create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
>>>> 
>>>> diff --git a/gcc/match.pd b/gcc/match.pd
>>>> index 12d81fcac0d..ba100b117e7 100644
>>>> --- a/gcc/match.pd
>>>> +++ b/gcc/match.pd
>>>> @@ -8203,6 +8203,20 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>>>>   (rdiv @0 (exps:s @1))
>>>>    (mult @0 (exps (negate @1)))))
>>>> 
>>>> + /* Simplify pow(1.0/x, y) into pow(x, -y).  */
>>>> + (if (! HONOR_INFINITIES (type)
>>>> +      && ! HONOR_SIGNED_ZEROS (type)
>>>> +      && ! flag_trapping_math
>>>> +      && ! flag_errno_math)
>>>> +  (simplify
>>>> +   (POW (rdiv:s real_onep@0 @1) @2)
>>>> +    (POW @1 (negate @2)))
>>> 
>>> This one shouldn't need HONOR_SIGNED_ZEROS?
>>> 
>>>> +
>>>> +  /* Simplify pow(0.0, x) into 0.0.  */
>>>> +  (simplify
>>>> +   (POW real_zerop@0 @1)
>>> 
>>> I think this needs !HONOR_NANS (type)?
>>> 
>>> Otherwise OK.
>> Thanks for the feedback, Richard and Andrew. I made the following changes to the patch (current version of the patch below):
>> - also applied the pattern to POWI and added tests for pow, powif, powil
>> - not gate first pattern under !HONOR_SIGNED_ZEROS, but second one additionally under !HONOR_NANS (type)
>> - added tests for powf16
> 
> Note powi is GCC internal, it doesn't set errno and it should be subject
> to different rules - I'd rather have patterns working on powi separate.
How about moving the patterns for POWI into the section flag_unsafe_math_optimizations && canonicalize_math_p () and not use (!flag_errno_math)?
> 
>> Now, I am encountering two problems:
>> 
>> First, the transform is not applied for float16 (even if -fexcess-precision=16). Do you know what the problem could be?
> 
> I think you want to use POW_ALL instead of POW.  The generated
> cfn-operators.pd shows
> 
> (define_operator_list POW
>    BUILT_IN_POWF
>    BUILT_IN_POW
>    BUILT_IN_POWL
>    IFN_POW)
> (define_operator_list POW_FN
>    BUILT_IN_POWF16
>    BUILT_IN_POWF32
>    BUILT_IN_POWF64
>    BUILT_IN_POWF128
>    BUILT_IN_POWF32X
>    BUILT_IN_POWF64X
>    BUILT_IN_POWF128X
>    null)
> (define_operator_list POW_ALL
>    BUILT_IN_POWF
>    BUILT_IN_POW
>    BUILT_IN_POWL
>    BUILT_IN_POWF16
> ...
> 
> note this comes at expense of more generated code (in
> gimple/generic-match.pd).
Thanks, that solved the Float16 issue.
> 
>> Second, validation on aarch64 shows a regression in tests
>>  - gcc.dg/recip_sqrt_mult_1.c and
>>  - gcc.dg/recip_sqrt_mult_5.c,
>> because the pattern (POWI(1/x, y) -> POWI(x, -y)) is applied before the recip pass and prevents application of the recip-patterns. The reason for this might be that the single-use restriction only works if the integer argument is non-constant, but in the failing test cases, the integer argument is 2 and the pattern is applied despite the :s flag.
>> For example, my pattern is **not** applied (single-use restriction works) for:
>> double res, res2;
>> void foo (double a, int b)
>> {
>>  double f (double);
>>  double t1 = 1.0 / a;
>>  res = __builtin_powi (t1, b);
>> res2 = f (t1);
>> }
>> 
>> But the pattern **is** applied and single-use restriction does **not** work for:
>> double res, res2;
>> void foo (double a)
>> {
>>  double f (double);
>>  double t1 = 1.0 / a;
>>  res = __builtin_powi (t1, 2);
>>  res2 = f (t1);
>> }
> 
> This must be because the result is a single operation.  :s only applies
> when the result has sub-expressions.  This is to make CSE work.
> The "fix" is to add explicit && single_use (@n) to override that
> behavior.  Note that I think the transform is good even when the
> division is used because the result reduces the dependence chain length.
> It's only when @2 is non-constant that we're introducing another
> stmt for the negation that re-introduces this latency (even if in
> practice it would be smaller).
> 
>> Possible options to resolve this are:
>>  - gate pattern to run after recip pass
>>  - do not apply pattern for POWI
> 
> - adjust the testcase (is the final outcome still good?)
Without the patch, there is one fdiv instruction less (below is the assembly for recip_sqrt_mult_1.c, but for _5.c it’s analogous):
No patch or with single_use of the result of the division:
foo:
	fmov d30, 1.0e+0
	fsqrt d31, d0
	adrp x0, .LANCHOR0
	add x1, x0, :lo12:.LANCHOR0
	fdiv d30, d30, d0
	fmul d0, d31, d30
	str d0, [x0, #:lo12:.LANCHOR0]
	stp d30, d31, [x1, 8]
	ret

With patch:
foo:
        fsqrt   d31, d0
        fmov    d30, 1.0e+0
        adrp    x1, .LANCHOR0
        add     x0, x1, :lo12:.LANCHOR0
        fdiv    d31, d30, d31
        fdiv    d30, d30, d0
        str     d31, [x1, #:lo12:.LANCHOR0]
        fmul    d31, d31, d0
        stp     d30, d31, [x0, 8]
        ret
So, we might want to use the single_use guard.
Best,
Jennifer

> 
>> What are your thoughts on this?
>> Thanks,
>> Jennifer
>> 
>> This patch adds the following two simplifications in match.pd for POW
>> and POWI:
>> - pow (1.0/x, y) to pow (x, -y), avoiding the division
>> - pow (0.0, x) to 0.0, avoiding the call to pow.
>> The patterns are guarded by flag_unsafe_math_optimizations,
>> !flag_trapping_math, !flag_errno_math, and !HONOR_INFINITIES.
>> The second pattern is also guarded by !HONOR_NANS and
>> !HONOR_SIGNED_ZEROS.
>> 
>> Tests were added to confirm the application of the transform for
>> builtins pow, powf, powl, powi, powif, powil, and powf16.
>> 
>> The patch was bootstrapped and regtested on aarch64-linux-gnu and
>> x86_64-linux-gnu, no regression.
>> OK for mainline?
>> 
>> Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>
>> 
>> gcc/
>>      * match.pd: Fold pow (1.0/x, y) -> pow (x, -y) and
>>      pow (0.0, x) -> 0.0.
>> 
>> gcc/testsuite/
>>      * gcc.dg/tree-ssa/pow_fold_1.c: New test.
>> ---
>> gcc/match.pd                               | 15 ++++++++
>> gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c | 42 ++++++++++++++++++++++
>> 2 files changed, 57 insertions(+)
>> create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
>> 
>> diff --git a/gcc/match.pd b/gcc/match.pd
>> index 12d81fcac0d..b061ef9dc91 100644
>> --- a/gcc/match.pd
>> +++ b/gcc/match.pd
>> @@ -8203,6 +8203,21 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>>    (rdiv @0 (exps:s @1))
>>     (mult @0 (exps (negate @1)))))
>> 
>> + (for pow (POW POWI)
>> +  (if (! HONOR_INFINITIES (type)
>> +       && ! flag_trapping_math
>> +       && ! flag_errno_math)
>> +   /* Simplify pow(1.0/x, y) into pow(x, -y).  */
>> +   (simplify
>> +    (pow (rdiv:s real_onep@0 @1) @2)
>> +     (pow @1 (negate @2)))
>> +
>> +   /* Simplify pow(0.0, x) into 0.0.  */
>> +   (if (! HONOR_NANS (type) && ! HONOR_SIGNED_ZEROS (type))
>> +    (simplify
>> +     (pow real_zerop@0 @1)
>> +      @0))))
>> +
>>  (if (! HONOR_SIGN_DEPENDENT_ROUNDING (type)
>>       && ! HONOR_NANS (type) && ! HONOR_INFINITIES (type)
>>       && ! flag_trapping_math
>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c b/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
>> new file mode 100644
>> index 00000000000..c38b7390478
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
>> @@ -0,0 +1,42 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2 -ffast-math -fdump-tree-optimized" } */
>> +/* { dg-add-options float16 } */
>> +/* { dg-require-effective-target float16_runtime } */
>> +/* { dg-require-effective-target c99_runtime } */
>> +
>> +extern void link_error (void);
>> +
>> +#define POW1OVER(TYPE1, TYPE2, CTY, TY)                      \
>> +  void                                                       \
>> +  pow1over_##TY (TYPE1 x, TYPE2 y)                   \
>> +  {                                                  \
>> +    TYPE1 t1 = 1.0##CTY / x;                         \
>> +    TYPE1 t2 = __builtin_pow##TY (t1, y);            \
>> +    TYPE2 t3 = -y;                                   \
>> +    TYPE1 t4 = __builtin_pow##TY (x, t3);            \
>> +    if (t2 != t4)                                    \
>> +      link_error ();                                 \
>> +  }                                                  \
>> +
>> +#define POW0(TYPE1, TYPE2, CTY, TY)                  \
>> +  void                                                       \
>> +  pow0_##TY (TYPE2 x)                                        \
>> +  {                                                  \
>> +    TYPE1 t1 = __builtin_pow##TY (0.0##CTY, x);              \
>> +    if (t1 != 0.0##CTY)                                      \
>> +      link_error ();                                 \
>> +  }                                                  \
>> +
>> +#define TEST_ALL(TYPE1, TYPE2, CTY, TY)                      \
>> +  POW1OVER (TYPE1, TYPE2, CTY, TY)                   \
>> +  POW0 (TYPE1, TYPE2, CTY, TY)
>> +
>> +TEST_ALL (double, double, , )
>> +TEST_ALL (float, float, f, f)
>> +TEST_ALL (_Float16, _Float16, f16, f16)
>> +TEST_ALL (long double, long double, L, l)
>> +TEST_ALL (double, int, , i)
>> +TEST_ALL (float, int, f, if)
>> +TEST_ALL (long double, int, L, il)
>> +
>> +/* { dg-final { scan-tree-dump-not "link_error" "optimized" } } */
>> 
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
Richard Biener Oct. 22, 2024, 11:14 a.m. UTC | #6
On Tue, 22 Oct 2024, Jennifer Schmitz wrote:

> 
> 
> > On 22 Oct 2024, at 11:05, Richard Biener <rguenther@suse.de> wrote:
> > 
> > External email: Use caution opening links or attachments
> > 
> > 
> > On Tue, 22 Oct 2024, Jennifer Schmitz wrote:
> > 
> >> 
> >> 
> >>> On 21 Oct 2024, at 10:51, Richard Biener <rguenther@suse.de> wrote:
> >>> 
> >>> External email: Use caution opening links or attachments
> >>> 
> >>> 
> >>> On Fri, 18 Oct 2024, Jennifer Schmitz wrote:
> >>> 
> >>>> This patch adds the following two simplifications in match.pd:
> >>>> - pow (1.0/x, y) to pow (x, -y), avoiding the division
> >>>> - pow (0.0, x) to 0.0, avoiding the call to pow.
> >>>> The patterns are guarded by flag_unsafe_math_optimizations,
> >>>> !flag_trapping_math, !flag_errno_math, !HONOR_SIGNED_ZEROS,
> >>>> and !HONOR_INFINITIES.
> >>>> 
> >>>> Tests were added to confirm the application of the transform for float,
> >>>> double, and long double.
> >>>> 
> >>>> The patch was bootstrapped and regtested on aarch64-linux-gnu and
> >>>> x86_64-linux-gnu, no regression.
> >>>> OK for mainline?
> >>>> 
> >>>> Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>
> >>>> 
> >>>> gcc/
> >>>>     * match.pd: Fold pow (1.0/x, y) -> pow (x, -y) and
> >>>>     pow (0.0, x) -> 0.0.
> >>>> 
> >>>> gcc/testsuite/
> >>>>     * gcc.dg/tree-ssa/pow_fold_1.c: New test.
> >>>> ---
> >>>> gcc/match.pd                               | 14 +++++++++
> >>>> gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c | 34 ++++++++++++++++++++++
> >>>> 2 files changed, 48 insertions(+)
> >>>> create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
> >>>> 
> >>>> diff --git a/gcc/match.pd b/gcc/match.pd
> >>>> index 12d81fcac0d..ba100b117e7 100644
> >>>> --- a/gcc/match.pd
> >>>> +++ b/gcc/match.pd
> >>>> @@ -8203,6 +8203,20 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >>>>   (rdiv @0 (exps:s @1))
> >>>>    (mult @0 (exps (negate @1)))))
> >>>> 
> >>>> + /* Simplify pow(1.0/x, y) into pow(x, -y).  */
> >>>> + (if (! HONOR_INFINITIES (type)
> >>>> +      && ! HONOR_SIGNED_ZEROS (type)
> >>>> +      && ! flag_trapping_math
> >>>> +      && ! flag_errno_math)
> >>>> +  (simplify
> >>>> +   (POW (rdiv:s real_onep@0 @1) @2)
> >>>> +    (POW @1 (negate @2)))
> >>> 
> >>> This one shouldn't need HONOR_SIGNED_ZEROS?
> >>> 
> >>>> +
> >>>> +  /* Simplify pow(0.0, x) into 0.0.  */
> >>>> +  (simplify
> >>>> +   (POW real_zerop@0 @1)
> >>> 
> >>> I think this needs !HONOR_NANS (type)?
> >>> 
> >>> Otherwise OK.
> >> Thanks for the feedback, Richard and Andrew. I made the following changes to the patch (current version of the patch below):
> >> - also applied the pattern to POWI and added tests for pow, powif, powil
> >> - not gate first pattern under !HONOR_SIGNED_ZEROS, but second one additionally under !HONOR_NANS (type)
> >> - added tests for powf16
> > 
> > Note powi is GCC internal, it doesn't set errno and it should be subject
> > to different rules - I'd rather have patterns working on powi separate.
> How about moving the patterns for POWI into the section flag_unsafe_math_optimizations && canonicalize_math_p () and not use (!flag_errno_math)?

Sounds good.

> > 
> >> Now, I am encountering two problems:
> >> 
> >> First, the transform is not applied for float16 (even if -fexcess-precision=16). Do you know what the problem could be?
> > 
> > I think you want to use POW_ALL instead of POW.  The generated
> > cfn-operators.pd shows
> > 
> > (define_operator_list POW
> >    BUILT_IN_POWF
> >    BUILT_IN_POW
> >    BUILT_IN_POWL
> >    IFN_POW)
> > (define_operator_list POW_FN
> >    BUILT_IN_POWF16
> >    BUILT_IN_POWF32
> >    BUILT_IN_POWF64
> >    BUILT_IN_POWF128
> >    BUILT_IN_POWF32X
> >    BUILT_IN_POWF64X
> >    BUILT_IN_POWF128X
> >    null)
> > (define_operator_list POW_ALL
> >    BUILT_IN_POWF
> >    BUILT_IN_POW
> >    BUILT_IN_POWL
> >    BUILT_IN_POWF16
> > ...
> > 
> > note this comes at expense of more generated code (in
> > gimple/generic-match.pd).
> Thanks, that solved the Float16 issue.
> > 
> >> Second, validation on aarch64 shows a regression in tests
> >>  - gcc.dg/recip_sqrt_mult_1.c and
> >>  - gcc.dg/recip_sqrt_mult_5.c,
> >> because the pattern (POWI(1/x, y) -> POWI(x, -y)) is applied before the recip pass and prevents application of the recip-patterns. The reason for this might be that the single-use restriction only works if the integer argument is non-constant, but in the failing test cases, the integer argument is 2 and the pattern is applied despite the :s flag.
> >> For example, my pattern is **not** applied (single-use restriction works) for:
> >> double res, res2;
> >> void foo (double a, int b)
> >> {
> >>  double f (double);
> >>  double t1 = 1.0 / a;
> >>  res = __builtin_powi (t1, b);
> >> res2 = f (t1);
> >> }
> >> 
> >> But the pattern **is** applied and single-use restriction does **not** work for:
> >> double res, res2;
> >> void foo (double a)
> >> {
> >>  double f (double);
> >>  double t1 = 1.0 / a;
> >>  res = __builtin_powi (t1, 2);
> >>  res2 = f (t1);
> >> }
> > 
> > This must be because the result is a single operation.  :s only applies
> > when the result has sub-expressions.  This is to make CSE work.
> > The "fix" is to add explicit && single_use (@n) to override that
> > behavior.  Note that I think the transform is good even when the
> > division is used because the result reduces the dependence chain length.
> > It's only when @2 is non-constant that we're introducing another
> > stmt for the negation that re-introduces this latency (even if in
> > practice it would be smaller).
> > 
> >> Possible options to resolve this are:
> >>  - gate pattern to run after recip pass
> >>  - do not apply pattern for POWI
> > 
> > - adjust the testcase (is the final outcome still good?)
> Without the patch, there is one fdiv instruction less (below is the assembly for recip_sqrt_mult_1.c, but for _5.c it’s analogous):
> No patch or with single_use of the result of the division:
> foo:
> 	fmov d30, 1.0e+0
> 	fsqrt d31, d0
> 	adrp x0, .LANCHOR0
> 	add x1, x0, :lo12:.LANCHOR0
> 	fdiv d30, d30, d0
> 	fmul d0, d31, d30
> 	str d0, [x0, #:lo12:.LANCHOR0]
> 	stp d30, d31, [x1, 8]
> 	ret
> 
> With patch:
> foo:
>         fsqrt   d31, d0
>         fmov    d30, 1.0e+0
>         adrp    x1, .LANCHOR0
>         add     x0, x1, :lo12:.LANCHOR0
>         fdiv    d31, d30, d31
>         fdiv    d30, d30, d0
>         str     d31, [x1, #:lo12:.LANCHOR0]
>         fmul    d31, d31, d0
>         stp     d30, d31, [x0, 8]
>         ret
> So, we might want to use the single_use guard.

Yeah, this is because the powi inline expansion will add back
the division.

Richard.

> Best,
> Jennifer
> 
> > 
> >> What are your thoughts on this?
> >> Thanks,
> >> Jennifer
> >> 
> >> This patch adds the following two simplifications in match.pd for POW
> >> and POWI:
> >> - pow (1.0/x, y) to pow (x, -y), avoiding the division
> >> - pow (0.0, x) to 0.0, avoiding the call to pow.
> >> The patterns are guarded by flag_unsafe_math_optimizations,
> >> !flag_trapping_math, !flag_errno_math, and !HONOR_INFINITIES.
> >> The second pattern is also guarded by !HONOR_NANS and
> >> !HONOR_SIGNED_ZEROS.
> >> 
> >> Tests were added to confirm the application of the transform for
> >> builtins pow, powf, powl, powi, powif, powil, and powf16.
> >> 
> >> The patch was bootstrapped and regtested on aarch64-linux-gnu and
> >> x86_64-linux-gnu, no regression.
> >> OK for mainline?
> >> 
> >> Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>
> >> 
> >> gcc/
> >>      * match.pd: Fold pow (1.0/x, y) -> pow (x, -y) and
> >>      pow (0.0, x) -> 0.0.
> >> 
> >> gcc/testsuite/
> >>      * gcc.dg/tree-ssa/pow_fold_1.c: New test.
> >> ---
> >> gcc/match.pd                               | 15 ++++++++
> >> gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c | 42 ++++++++++++++++++++++
> >> 2 files changed, 57 insertions(+)
> >> create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
> >> 
> >> diff --git a/gcc/match.pd b/gcc/match.pd
> >> index 12d81fcac0d..b061ef9dc91 100644
> >> --- a/gcc/match.pd
> >> +++ b/gcc/match.pd
> >> @@ -8203,6 +8203,21 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >>    (rdiv @0 (exps:s @1))
> >>     (mult @0 (exps (negate @1)))))
> >> 
> >> + (for pow (POW POWI)
> >> +  (if (! HONOR_INFINITIES (type)
> >> +       && ! flag_trapping_math
> >> +       && ! flag_errno_math)
> >> +   /* Simplify pow(1.0/x, y) into pow(x, -y).  */
> >> +   (simplify
> >> +    (pow (rdiv:s real_onep@0 @1) @2)
> >> +     (pow @1 (negate @2)))
> >> +
> >> +   /* Simplify pow(0.0, x) into 0.0.  */
> >> +   (if (! HONOR_NANS (type) && ! HONOR_SIGNED_ZEROS (type))
> >> +    (simplify
> >> +     (pow real_zerop@0 @1)
> >> +      @0))))
> >> +
> >>  (if (! HONOR_SIGN_DEPENDENT_ROUNDING (type)
> >>       && ! HONOR_NANS (type) && ! HONOR_INFINITIES (type)
> >>       && ! flag_trapping_math
> >> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c b/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
> >> new file mode 100644
> >> index 00000000000..c38b7390478
> >> --- /dev/null
> >> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
> >> @@ -0,0 +1,42 @@
> >> +/* { dg-do compile } */
> >> +/* { dg-options "-O2 -ffast-math -fdump-tree-optimized" } */
> >> +/* { dg-add-options float16 } */
> >> +/* { dg-require-effective-target float16_runtime } */
> >> +/* { dg-require-effective-target c99_runtime } */
> >> +
> >> +extern void link_error (void);
> >> +
> >> +#define POW1OVER(TYPE1, TYPE2, CTY, TY)                      \
> >> +  void                                                       \
> >> +  pow1over_##TY (TYPE1 x, TYPE2 y)                   \
> >> +  {                                                  \
> >> +    TYPE1 t1 = 1.0##CTY / x;                         \
> >> +    TYPE1 t2 = __builtin_pow##TY (t1, y);            \
> >> +    TYPE2 t3 = -y;                                   \
> >> +    TYPE1 t4 = __builtin_pow##TY (x, t3);            \
> >> +    if (t2 != t4)                                    \
> >> +      link_error ();                                 \
> >> +  }                                                  \
> >> +
> >> +#define POW0(TYPE1, TYPE2, CTY, TY)                  \
> >> +  void                                                       \
> >> +  pow0_##TY (TYPE2 x)                                        \
> >> +  {                                                  \
> >> +    TYPE1 t1 = __builtin_pow##TY (0.0##CTY, x);              \
> >> +    if (t1 != 0.0##CTY)                                      \
> >> +      link_error ();                                 \
> >> +  }                                                  \
> >> +
> >> +#define TEST_ALL(TYPE1, TYPE2, CTY, TY)                      \
> >> +  POW1OVER (TYPE1, TYPE2, CTY, TY)                   \
> >> +  POW0 (TYPE1, TYPE2, CTY, TY)
> >> +
> >> +TEST_ALL (double, double, , )
> >> +TEST_ALL (float, float, f, f)
> >> +TEST_ALL (_Float16, _Float16, f16, f16)
> >> +TEST_ALL (long double, long double, L, l)
> >> +TEST_ALL (double, int, , i)
> >> +TEST_ALL (float, int, f, if)
> >> +TEST_ALL (long double, int, L, il)
> >> +
> >> +/* { dg-final { scan-tree-dump-not "link_error" "optimized" } } */
> >> 
> > 
> > --
> > Richard Biener <rguenther@suse.de>
> > SUSE Software Solutions Germany GmbH,
> > Frankenstrasse 146, 90461 Nuernberg, Germany;
> > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
> 
> 
>
Jennifer Schmitz Oct. 23, 2024, 8:55 a.m. UTC | #7
> On 22 Oct 2024, at 13:14, Richard Biener <rguenther@suse.de> wrote:
> 
> External email: Use caution opening links or attachments
> 
> 
> On Tue, 22 Oct 2024, Jennifer Schmitz wrote:
> 
>> 
>> 
>>> On 22 Oct 2024, at 11:05, Richard Biener <rguenther@suse.de> wrote:
>>> 
>>> External email: Use caution opening links or attachments
>>> 
>>> 
>>> On Tue, 22 Oct 2024, Jennifer Schmitz wrote:
>>> 
>>>> 
>>>> 
>>>>> On 21 Oct 2024, at 10:51, Richard Biener <rguenther@suse.de> wrote:
>>>>> 
>>>>> External email: Use caution opening links or attachments
>>>>> 
>>>>> 
>>>>> On Fri, 18 Oct 2024, Jennifer Schmitz wrote:
>>>>> 
>>>>>> This patch adds the following two simplifications in match.pd:
>>>>>> - pow (1.0/x, y) to pow (x, -y), avoiding the division
>>>>>> - pow (0.0, x) to 0.0, avoiding the call to pow.
>>>>>> The patterns are guarded by flag_unsafe_math_optimizations,
>>>>>> !flag_trapping_math, !flag_errno_math, !HONOR_SIGNED_ZEROS,
>>>>>> and !HONOR_INFINITIES.
>>>>>> 
>>>>>> Tests were added to confirm the application of the transform for float,
>>>>>> double, and long double.
>>>>>> 
>>>>>> The patch was bootstrapped and regtested on aarch64-linux-gnu and
>>>>>> x86_64-linux-gnu, no regression.
>>>>>> OK for mainline?
>>>>>> 
>>>>>> Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>
>>>>>> 
>>>>>> gcc/
>>>>>>    * match.pd: Fold pow (1.0/x, y) -> pow (x, -y) and
>>>>>>    pow (0.0, x) -> 0.0.
>>>>>> 
>>>>>> gcc/testsuite/
>>>>>>    * gcc.dg/tree-ssa/pow_fold_1.c: New test.
>>>>>> ---
>>>>>> gcc/match.pd                               | 14 +++++++++
>>>>>> gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c | 34 ++++++++++++++++++++++
>>>>>> 2 files changed, 48 insertions(+)
>>>>>> create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
>>>>>> 
>>>>>> diff --git a/gcc/match.pd b/gcc/match.pd
>>>>>> index 12d81fcac0d..ba100b117e7 100644
>>>>>> --- a/gcc/match.pd
>>>>>> +++ b/gcc/match.pd
>>>>>> @@ -8203,6 +8203,20 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>>>>>>  (rdiv @0 (exps:s @1))
>>>>>>   (mult @0 (exps (negate @1)))))
>>>>>> 
>>>>>> + /* Simplify pow(1.0/x, y) into pow(x, -y).  */
>>>>>> + (if (! HONOR_INFINITIES (type)
>>>>>> +      && ! HONOR_SIGNED_ZEROS (type)
>>>>>> +      && ! flag_trapping_math
>>>>>> +      && ! flag_errno_math)
>>>>>> +  (simplify
>>>>>> +   (POW (rdiv:s real_onep@0 @1) @2)
>>>>>> +    (POW @1 (negate @2)))
>>>>> 
>>>>> This one shouldn't need HONOR_SIGNED_ZEROS?
>>>>> 
>>>>>> +
>>>>>> +  /* Simplify pow(0.0, x) into 0.0.  */
>>>>>> +  (simplify
>>>>>> +   (POW real_zerop@0 @1)
>>>>> 
>>>>> I think this needs !HONOR_NANS (type)?
>>>>> 
>>>>> Otherwise OK.
>>>> Thanks for the feedback, Richard and Andrew. I made the following changes to the patch (current version of the patch below):
>>>> - also applied the pattern to POWI and added tests for pow, powif, powil
>>>> - not gate first pattern under !HONOR_SIGNED_ZEROS, but second one additionally under !HONOR_NANS (type)
>>>> - added tests for powf16
>>> 
>>> Note powi is GCC internal, it doesn't set errno and it should be subject
>>> to different rules - I'd rather have patterns working on powi separate.
>> How about moving the patterns for POWI into the section flag_unsafe_math_optimizations && canonicalize_math_p () and not use (!flag_errno_math)?
> 
> Sounds good.
> 
>>> 
>>>> Now, I am encountering two problems:
>>>> 
>>>> First, the transform is not applied for float16 (even if -fexcess-precision=16). Do you know what the problem could be?
>>> 
>>> I think you want to use POW_ALL instead of POW.  The generated
>>> cfn-operators.pd shows
>>> 
>>> (define_operator_list POW
>>>   BUILT_IN_POWF
>>>   BUILT_IN_POW
>>>   BUILT_IN_POWL
>>>   IFN_POW)
>>> (define_operator_list POW_FN
>>>   BUILT_IN_POWF16
>>>   BUILT_IN_POWF32
>>>   BUILT_IN_POWF64
>>>   BUILT_IN_POWF128
>>>   BUILT_IN_POWF32X
>>>   BUILT_IN_POWF64X
>>>   BUILT_IN_POWF128X
>>>   null)
>>> (define_operator_list POW_ALL
>>>   BUILT_IN_POWF
>>>   BUILT_IN_POW
>>>   BUILT_IN_POWL
>>>   BUILT_IN_POWF16
>>> ...
>>> 
>>> note this comes at expense of more generated code (in
>>> gimple/generic-match.pd).
>> Thanks, that solved the Float16 issue.
>>> 
>>>> Second, validation on aarch64 shows a regression in tests
>>>> - gcc.dg/recip_sqrt_mult_1.c and
>>>> - gcc.dg/recip_sqrt_mult_5.c,
>>>> because the pattern (POWI(1/x, y) -> POWI(x, -y)) is applied before the recip pass and prevents application of the recip-patterns. The reason for this might be that the single-use restriction only works if the integer argument is non-constant, but in the failing test cases, the integer argument is 2 and the pattern is applied despite the :s flag.
>>>> For example, my pattern is **not** applied (single-use restriction works) for:
>>>> double res, res2;
>>>> void foo (double a, int b)
>>>> {
>>>> double f (double);
>>>> double t1 = 1.0 / a;
>>>> res = __builtin_powi (t1, b);
>>>> res2 = f (t1);
>>>> }
>>>> 
>>>> But the pattern **is** applied and single-use restriction does **not** work for:
>>>> double res, res2;
>>>> void foo (double a)
>>>> {
>>>> double f (double);
>>>> double t1 = 1.0 / a;
>>>> res = __builtin_powi (t1, 2);
>>>> res2 = f (t1);
>>>> }
>>> 
>>> This must be because the result is a single operation.  :s only applies
>>> when the result has sub-expressions.  This is to make CSE work.
>>> The "fix" is to add explicit && single_use (@n) to override that
>>> behavior.  Note that I think the transform is good even when the
>>> division is used because the result reduces the dependence chain length.
>>> It's only when @2 is non-constant that we're introducing another
>>> stmt for the negation that re-introduces this latency (even if in
>>> practice it would be smaller).
>>> 
>>>> Possible options to resolve this are:
>>>> - gate pattern to run after recip pass
>>>> - do not apply pattern for POWI
>>> 
>>> - adjust the testcase (is the final outcome still good?)
>> Without the patch, there is one fdiv instruction less (below is the assembly for recip_sqrt_mult_1.c, but for _5.c it’s analogous):
>> No patch or with single_use of the result of the division:
>> foo:
>>      fmov d30, 1.0e+0
>>      fsqrt d31, d0
>>      adrp x0, .LANCHOR0
>>      add x1, x0, :lo12:.LANCHOR0
>>      fdiv d30, d30, d0
>>      fmul d0, d31, d30
>>      str d0, [x0, #:lo12:.LANCHOR0]
>>      stp d30, d31, [x1, 8]
>>      ret
>> 
>> With patch:
>> foo:
>>        fsqrt   d31, d0
>>        fmov    d30, 1.0e+0
>>        adrp    x1, .LANCHOR0
>>        add     x0, x1, :lo12:.LANCHOR0
>>        fdiv    d31, d30, d31
>>        fdiv    d30, d30, d0
>>        str     d31, [x1, #:lo12:.LANCHOR0]
>>        fmul    d31, d31, d0
>>        stp     d30, d31, [x0, 8]
>>        ret
>> So, we might want to use the single_use guard.
> 
> Yeah, this is because the powi inline expansion will add back
> the division.
Below is the updated patch, I re-validated with no regression on aarch64 and x86_64.
Thanks,
Jenni

This patch adds the following two simplifications in match.pd for
POW_ALL and POWI:
- pow (1.0/x, y) to pow (x, -y), avoiding the division
- pow (0.0, x) to 0.0, avoiding the call to pow.
The patterns are guarded by flag_unsafe_math_optimizations,
!flag_trapping_math, and !HONOR_INFINITIES.
The POW_ALL patterns are also gated under !flag_errno_math.
The second pattern is also guarded by !HONOR_NANS and
!HONOR_SIGNED_ZEROS.

Tests were added to confirm the application of the transform for
builtins pow, powf, powl, powi, powif, powil, and powf16.

The patch was bootstrapped and regtested on aarch64-linux-gnu and
x86_64-linux-gnu, no regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>

gcc/
	* match.pd: Fold pow (1.0/x, y) -> pow (x, -y) and
	pow (0.0, x) -> 0.0.

gcc/testsuite/
	* gcc.dg/tree-ssa/pow_fold_1.c: New test.
---
 gcc/match.pd                               | 28 +++++++++++++++
 gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c | 42 ++++++++++++++++++++++
 2 files changed, 70 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c

diff --git a/gcc/match.pd b/gcc/match.pd
index 12d81fcac0d..6d9868d2bb1 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -8203,6 +8203,21 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
    (rdiv @0 (exps:s @1))
     (mult @0 (exps (negate @1)))))
 
+ (for pow (POW_ALL)
+  (if (! HONOR_INFINITIES (type)
+       && ! flag_trapping_math
+       && ! flag_errno_math)
+   /* Simplify pow(1.0/x, y) into pow(x, -y).  */
+   (simplify
+    (pow (rdiv:s real_onep@0 @1) @2)
+     (pow @1 (negate @2)))
+
+   /* Simplify pow(0.0, x) into 0.0.  */
+   (if (! HONOR_NANS (type) && ! HONOR_SIGNED_ZEROS (type))
+    (simplify
+     (pow real_zerop@0 @1)
+      @0))))
+
  (if (! HONOR_SIGN_DEPENDENT_ROUNDING (type)
       && ! HONOR_NANS (type) && ! HONOR_INFINITIES (type)
       && ! flag_trapping_math
@@ -8561,6 +8576,19 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
   (mult (POW:s @0 @1) (POW:s @2 @1))
    (POW (mult @0 @2) @1))
 
+ (if (! HONOR_INFINITIES (type) && ! flag_trapping_math)
+  /* Simplify powi(1.0/x, y) into powi(x, -y).  */
+  (simplify
+   (POWI (rdiv@3 real_onep@0 @1) @2)
+   (if (single_use (@3))
+    (POWI @1 (negate @2))))
+
+  /* Simplify powi(0.0, x) into 0.0.  */
+  (if (! HONOR_NANS (type) && ! HONOR_SIGNED_ZEROS (type))
+   (simplify
+    (POWI real_zerop@0 @1)
+     @0)))
+
  /* Simplify powi(x,y) * powi(z,y) -> powi(x*z,y). */
  (simplify
   (mult (POWI:s @0 @1) (POWI:s @2 @1))
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c b/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
new file mode 100644
index 00000000000..d98bcb0827e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
@@ -0,0 +1,42 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-optimized -fexcess-precision=16" } */
+/* { dg-add-options float16 } */
+/* { dg-require-effective-target float16_runtime } */
+/* { dg-require-effective-target c99_runtime } */
+
+extern void link_error (void);
+
+#define POW1OVER(TYPE1, TYPE2, CTY, TY)			\
+  void							\
+  pow1over_##TY (TYPE1 x, TYPE2 y)			\
+  {							\
+    TYPE1 t1 = 1.0##CTY / x;				\
+    TYPE1 t2 = __builtin_pow##TY (t1, y);		\
+    TYPE2 t3 = -y;					\
+    TYPE1 t4 = __builtin_pow##TY (x, t3);		\
+    if (t2 != t4)					\
+      link_error ();					\
+  }							\
+
+#define POW0(TYPE1, TYPE2, CTY, TY)			\
+  void							\
+  pow0_##TY (TYPE2 x)					\
+  {							\
+    TYPE1 t1 = __builtin_pow##TY (0.0##CTY, x);		\
+    if (t1 != 0.0##CTY)					\
+      link_error ();					\
+  }							\
+
+#define TEST_ALL(TYPE1, TYPE2, CTY, TY)			\
+  POW1OVER (TYPE1, TYPE2, CTY, TY)			\
+  POW0 (TYPE1, TYPE2, CTY, TY)
+
+TEST_ALL (double, double, , )
+TEST_ALL (float, float, f, f)
+TEST_ALL (_Float16, _Float16, f16, f16)
+TEST_ALL (long double, long double, L, l)
+TEST_ALL (double, int, , i)
+TEST_ALL (float, int, f, if)
+TEST_ALL (long double, int, L, il)
+
+/* { dg-final { scan-tree-dump-not "link_error" "optimized" } } */
Richard Biener Oct. 25, 2024, 12:39 p.m. UTC | #8
On Wed, 23 Oct 2024, Jennifer Schmitz wrote:

> 
> 
> > On 22 Oct 2024, at 13:14, Richard Biener <rguenther@suse.de> wrote:
> > 
> > External email: Use caution opening links or attachments
> > 
> > 
> > On Tue, 22 Oct 2024, Jennifer Schmitz wrote:
> > 
> >> 
> >> 
> >>> On 22 Oct 2024, at 11:05, Richard Biener <rguenther@suse.de> wrote:
> >>> 
> >>> External email: Use caution opening links or attachments
> >>> 
> >>> 
> >>> On Tue, 22 Oct 2024, Jennifer Schmitz wrote:
> >>> 
> >>>> 
> >>>> 
> >>>>> On 21 Oct 2024, at 10:51, Richard Biener <rguenther@suse.de> wrote:
> >>>>> 
> >>>>> External email: Use caution opening links or attachments
> >>>>> 
> >>>>> 
> >>>>> On Fri, 18 Oct 2024, Jennifer Schmitz wrote:
> >>>>> 
> >>>>>> This patch adds the following two simplifications in match.pd:
> >>>>>> - pow (1.0/x, y) to pow (x, -y), avoiding the division
> >>>>>> - pow (0.0, x) to 0.0, avoiding the call to pow.
> >>>>>> The patterns are guarded by flag_unsafe_math_optimizations,
> >>>>>> !flag_trapping_math, !flag_errno_math, !HONOR_SIGNED_ZEROS,
> >>>>>> and !HONOR_INFINITIES.
> >>>>>> 
> >>>>>> Tests were added to confirm the application of the transform for float,
> >>>>>> double, and long double.
> >>>>>> 
> >>>>>> The patch was bootstrapped and regtested on aarch64-linux-gnu and
> >>>>>> x86_64-linux-gnu, no regression.
> >>>>>> OK for mainline?
> >>>>>> 
> >>>>>> Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>
> >>>>>> 
> >>>>>> gcc/
> >>>>>>    * match.pd: Fold pow (1.0/x, y) -> pow (x, -y) and
> >>>>>>    pow (0.0, x) -> 0.0.
> >>>>>> 
> >>>>>> gcc/testsuite/
> >>>>>>    * gcc.dg/tree-ssa/pow_fold_1.c: New test.
> >>>>>> ---
> >>>>>> gcc/match.pd                               | 14 +++++++++
> >>>>>> gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c | 34 ++++++++++++++++++++++
> >>>>>> 2 files changed, 48 insertions(+)
> >>>>>> create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
> >>>>>> 
> >>>>>> diff --git a/gcc/match.pd b/gcc/match.pd
> >>>>>> index 12d81fcac0d..ba100b117e7 100644
> >>>>>> --- a/gcc/match.pd
> >>>>>> +++ b/gcc/match.pd
> >>>>>> @@ -8203,6 +8203,20 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >>>>>>  (rdiv @0 (exps:s @1))
> >>>>>>   (mult @0 (exps (negate @1)))))
> >>>>>> 
> >>>>>> + /* Simplify pow(1.0/x, y) into pow(x, -y).  */
> >>>>>> + (if (! HONOR_INFINITIES (type)
> >>>>>> +      && ! HONOR_SIGNED_ZEROS (type)
> >>>>>> +      && ! flag_trapping_math
> >>>>>> +      && ! flag_errno_math)
> >>>>>> +  (simplify
> >>>>>> +   (POW (rdiv:s real_onep@0 @1) @2)
> >>>>>> +    (POW @1 (negate @2)))
> >>>>> 
> >>>>> This one shouldn't need HONOR_SIGNED_ZEROS?
> >>>>> 
> >>>>>> +
> >>>>>> +  /* Simplify pow(0.0, x) into 0.0.  */
> >>>>>> +  (simplify
> >>>>>> +   (POW real_zerop@0 @1)
> >>>>> 
> >>>>> I think this needs !HONOR_NANS (type)?
> >>>>> 
> >>>>> Otherwise OK.
> >>>> Thanks for the feedback, Richard and Andrew. I made the following changes to the patch (current version of the patch below):
> >>>> - also applied the pattern to POWI and added tests for pow, powif, powil
> >>>> - not gate first pattern under !HONOR_SIGNED_ZEROS, but second one additionally under !HONOR_NANS (type)
> >>>> - added tests for powf16
> >>> 
> >>> Note powi is GCC internal, it doesn't set errno and it should be subject
> >>> to different rules - I'd rather have patterns working on powi separate.
> >> How about moving the patterns for POWI into the section flag_unsafe_math_optimizations && canonicalize_math_p () and not use (!flag_errno_math)?
> > 
> > Sounds good.
> > 
> >>> 
> >>>> Now, I am encountering two problems:
> >>>> 
> >>>> First, the transform is not applied for float16 (even if -fexcess-precision=16). Do you know what the problem could be?
> >>> 
> >>> I think you want to use POW_ALL instead of POW.  The generated
> >>> cfn-operators.pd shows
> >>> 
> >>> (define_operator_list POW
> >>>   BUILT_IN_POWF
> >>>   BUILT_IN_POW
> >>>   BUILT_IN_POWL
> >>>   IFN_POW)
> >>> (define_operator_list POW_FN
> >>>   BUILT_IN_POWF16
> >>>   BUILT_IN_POWF32
> >>>   BUILT_IN_POWF64
> >>>   BUILT_IN_POWF128
> >>>   BUILT_IN_POWF32X
> >>>   BUILT_IN_POWF64X
> >>>   BUILT_IN_POWF128X
> >>>   null)
> >>> (define_operator_list POW_ALL
> >>>   BUILT_IN_POWF
> >>>   BUILT_IN_POW
> >>>   BUILT_IN_POWL
> >>>   BUILT_IN_POWF16
> >>> ...
> >>> 
> >>> note this comes at expense of more generated code (in
> >>> gimple/generic-match.pd).
> >> Thanks, that solved the Float16 issue.
> >>> 
> >>>> Second, validation on aarch64 shows a regression in tests
> >>>> - gcc.dg/recip_sqrt_mult_1.c and
> >>>> - gcc.dg/recip_sqrt_mult_5.c,
> >>>> because the pattern (POWI(1/x, y) -> POWI(x, -y)) is applied before the recip pass and prevents application of the recip-patterns. The reason for this might be that the single-use restriction only works if the integer argument is non-constant, but in the failing test cases, the integer argument is 2 and the pattern is applied despite the :s flag.
> >>>> For example, my pattern is **not** applied (single-use restriction works) for:
> >>>> double res, res2;
> >>>> void foo (double a, int b)
> >>>> {
> >>>> double f (double);
> >>>> double t1 = 1.0 / a;
> >>>> res = __builtin_powi (t1, b);
> >>>> res2 = f (t1);
> >>>> }
> >>>> 
> >>>> But the pattern **is** applied and single-use restriction does **not** work for:
> >>>> double res, res2;
> >>>> void foo (double a)
> >>>> {
> >>>> double f (double);
> >>>> double t1 = 1.0 / a;
> >>>> res = __builtin_powi (t1, 2);
> >>>> res2 = f (t1);
> >>>> }
> >>> 
> >>> This must be because the result is a single operation.  :s only applies
> >>> when the result has sub-expressions.  This is to make CSE work.
> >>> The "fix" is to add explicit && single_use (@n) to override that
> >>> behavior.  Note that I think the transform is good even when the
> >>> division is used because the result reduces the dependence chain length.
> >>> It's only when @2 is non-constant that we're introducing another
> >>> stmt for the negation that re-introduces this latency (even if in
> >>> practice it would be smaller).
> >>> 
> >>>> Possible options to resolve this are:
> >>>> - gate pattern to run after recip pass
> >>>> - do not apply pattern for POWI
> >>> 
> >>> - adjust the testcase (is the final outcome still good?)
> >> Without the patch, there is one fdiv instruction less (below is the assembly for recip_sqrt_mult_1.c, but for _5.c it’s analogous):
> >> No patch or with single_use of the result of the division:
> >> foo:
> >>      fmov d30, 1.0e+0
> >>      fsqrt d31, d0
> >>      adrp x0, .LANCHOR0
> >>      add x1, x0, :lo12:.LANCHOR0
> >>      fdiv d30, d30, d0
> >>      fmul d0, d31, d30
> >>      str d0, [x0, #:lo12:.LANCHOR0]
> >>      stp d30, d31, [x1, 8]
> >>      ret
> >> 
> >> With patch:
> >> foo:
> >>        fsqrt   d31, d0
> >>        fmov    d30, 1.0e+0
> >>        adrp    x1, .LANCHOR0
> >>        add     x0, x1, :lo12:.LANCHOR0
> >>        fdiv    d31, d30, d31
> >>        fdiv    d30, d30, d0
> >>        str     d31, [x1, #:lo12:.LANCHOR0]
> >>        fmul    d31, d31, d0
> >>        stp     d30, d31, [x0, 8]
> >>        ret
> >> So, we might want to use the single_use guard.
> > 
> > Yeah, this is because the powi inline expansion will add back
> > the division.
> Below is the updated patch, I re-validated with no regression on aarch64 and x86_64.
> Thanks,
> Jenni
> 
> This patch adds the following two simplifications in match.pd for
> POW_ALL and POWI:
> - pow (1.0/x, y) to pow (x, -y), avoiding the division
> - pow (0.0, x) to 0.0, avoiding the call to pow.
> The patterns are guarded by flag_unsafe_math_optimizations,
> !flag_trapping_math, and !HONOR_INFINITIES.
> The POW_ALL patterns are also gated under !flag_errno_math.
> The second pattern is also guarded by !HONOR_NANS and
> !HONOR_SIGNED_ZEROS.
> 
> Tests were added to confirm the application of the transform for
> builtins pow, powf, powl, powi, powif, powil, and powf16.
> 
> The patch was bootstrapped and regtested on aarch64-linux-gnu and
> x86_64-linux-gnu, no regression.
> OK for mainline?

OK.

Thanks,
Richard.

> Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>
> 
> gcc/
> 	* match.pd: Fold pow (1.0/x, y) -> pow (x, -y) and
> 	pow (0.0, x) -> 0.0.
> 
> gcc/testsuite/
> 	* gcc.dg/tree-ssa/pow_fold_1.c: New test.
> ---
>  gcc/match.pd                               | 28 +++++++++++++++
>  gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c | 42 ++++++++++++++++++++++
>  2 files changed, 70 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
> 
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 12d81fcac0d..6d9868d2bb1 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -8203,6 +8203,21 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>     (rdiv @0 (exps:s @1))
>      (mult @0 (exps (negate @1)))))
>  
> + (for pow (POW_ALL)
> +  (if (! HONOR_INFINITIES (type)
> +       && ! flag_trapping_math
> +       && ! flag_errno_math)
> +   /* Simplify pow(1.0/x, y) into pow(x, -y).  */
> +   (simplify
> +    (pow (rdiv:s real_onep@0 @1) @2)
> +     (pow @1 (negate @2)))
> +
> +   /* Simplify pow(0.0, x) into 0.0.  */
> +   (if (! HONOR_NANS (type) && ! HONOR_SIGNED_ZEROS (type))
> +    (simplify
> +     (pow real_zerop@0 @1)
> +      @0))))
> +
>   (if (! HONOR_SIGN_DEPENDENT_ROUNDING (type)
>        && ! HONOR_NANS (type) && ! HONOR_INFINITIES (type)
>        && ! flag_trapping_math
> @@ -8561,6 +8576,19 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>    (mult (POW:s @0 @1) (POW:s @2 @1))
>     (POW (mult @0 @2) @1))
>  
> + (if (! HONOR_INFINITIES (type) && ! flag_trapping_math)
> +  /* Simplify powi(1.0/x, y) into powi(x, -y).  */
> +  (simplify
> +   (POWI (rdiv@3 real_onep@0 @1) @2)
> +   (if (single_use (@3))
> +    (POWI @1 (negate @2))))
> +
> +  /* Simplify powi(0.0, x) into 0.0.  */
> +  (if (! HONOR_NANS (type) && ! HONOR_SIGNED_ZEROS (type))
> +   (simplify
> +    (POWI real_zerop@0 @1)
> +     @0)))
> +
>   /* Simplify powi(x,y) * powi(z,y) -> powi(x*z,y). */
>   (simplify
>    (mult (POWI:s @0 @1) (POWI:s @2 @1))
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c b/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
> new file mode 100644
> index 00000000000..d98bcb0827e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
> @@ -0,0 +1,42 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -fdump-tree-optimized -fexcess-precision=16" } */
> +/* { dg-add-options float16 } */
> +/* { dg-require-effective-target float16_runtime } */
> +/* { dg-require-effective-target c99_runtime } */
> +
> +extern void link_error (void);
> +
> +#define POW1OVER(TYPE1, TYPE2, CTY, TY)			\
> +  void							\
> +  pow1over_##TY (TYPE1 x, TYPE2 y)			\
> +  {							\
> +    TYPE1 t1 = 1.0##CTY / x;				\
> +    TYPE1 t2 = __builtin_pow##TY (t1, y);		\
> +    TYPE2 t3 = -y;					\
> +    TYPE1 t4 = __builtin_pow##TY (x, t3);		\
> +    if (t2 != t4)					\
> +      link_error ();					\
> +  }							\
> +
> +#define POW0(TYPE1, TYPE2, CTY, TY)			\
> +  void							\
> +  pow0_##TY (TYPE2 x)					\
> +  {							\
> +    TYPE1 t1 = __builtin_pow##TY (0.0##CTY, x);		\
> +    if (t1 != 0.0##CTY)					\
> +      link_error ();					\
> +  }							\
> +
> +#define TEST_ALL(TYPE1, TYPE2, CTY, TY)			\
> +  POW1OVER (TYPE1, TYPE2, CTY, TY)			\
> +  POW0 (TYPE1, TYPE2, CTY, TY)
> +
> +TEST_ALL (double, double, , )
> +TEST_ALL (float, float, f, f)
> +TEST_ALL (_Float16, _Float16, f16, f16)
> +TEST_ALL (long double, long double, L, l)
> +TEST_ALL (double, int, , i)
> +TEST_ALL (float, int, f, if)
> +TEST_ALL (long double, int, L, il)
> +
> +/* { dg-final { scan-tree-dump-not "link_error" "optimized" } } */
>
Jennifer Schmitz Oct. 25, 2024, 1:58 p.m. UTC | #9
> On 25 Oct 2024, at 14:39, Richard Biener <rguenther@suse.de> wrote:
> 
> External email: Use caution opening links or attachments
> 
> 
> On Wed, 23 Oct 2024, Jennifer Schmitz wrote:
> 
>> 
>> 
>>> On 22 Oct 2024, at 13:14, Richard Biener <rguenther@suse.de> wrote:
>>> 
>>> External email: Use caution opening links or attachments
>>> 
>>> 
>>> On Tue, 22 Oct 2024, Jennifer Schmitz wrote:
>>> 
>>>> 
>>>> 
>>>>> On 22 Oct 2024, at 11:05, Richard Biener <rguenther@suse.de> wrote:
>>>>> 
>>>>> External email: Use caution opening links or attachments
>>>>> 
>>>>> 
>>>>> On Tue, 22 Oct 2024, Jennifer Schmitz wrote:
>>>>> 
>>>>>> 
>>>>>> 
>>>>>>> On 21 Oct 2024, at 10:51, Richard Biener <rguenther@suse.de> wrote:
>>>>>>> 
>>>>>>> External email: Use caution opening links or attachments
>>>>>>> 
>>>>>>> 
>>>>>>> On Fri, 18 Oct 2024, Jennifer Schmitz wrote:
>>>>>>> 
>>>>>>>> This patch adds the following two simplifications in match.pd:
>>>>>>>> - pow (1.0/x, y) to pow (x, -y), avoiding the division
>>>>>>>> - pow (0.0, x) to 0.0, avoiding the call to pow.
>>>>>>>> The patterns are guarded by flag_unsafe_math_optimizations,
>>>>>>>> !flag_trapping_math, !flag_errno_math, !HONOR_SIGNED_ZEROS,
>>>>>>>> and !HONOR_INFINITIES.
>>>>>>>> 
>>>>>>>> Tests were added to confirm the application of the transform for float,
>>>>>>>> double, and long double.
>>>>>>>> 
>>>>>>>> The patch was bootstrapped and regtested on aarch64-linux-gnu and
>>>>>>>> x86_64-linux-gnu, no regression.
>>>>>>>> OK for mainline?
>>>>>>>> 
>>>>>>>> Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>
>>>>>>>> 
>>>>>>>> gcc/
>>>>>>>>   * match.pd: Fold pow (1.0/x, y) -> pow (x, -y) and
>>>>>>>>   pow (0.0, x) -> 0.0.
>>>>>>>> 
>>>>>>>> gcc/testsuite/
>>>>>>>>   * gcc.dg/tree-ssa/pow_fold_1.c: New test.
>>>>>>>> ---
>>>>>>>> gcc/match.pd                               | 14 +++++++++
>>>>>>>> gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c | 34 ++++++++++++++++++++++
>>>>>>>> 2 files changed, 48 insertions(+)
>>>>>>>> create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
>>>>>>>> 
>>>>>>>> diff --git a/gcc/match.pd b/gcc/match.pd
>>>>>>>> index 12d81fcac0d..ba100b117e7 100644
>>>>>>>> --- a/gcc/match.pd
>>>>>>>> +++ b/gcc/match.pd
>>>>>>>> @@ -8203,6 +8203,20 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>>>>>>>> (rdiv @0 (exps:s @1))
>>>>>>>>  (mult @0 (exps (negate @1)))))
>>>>>>>> 
>>>>>>>> + /* Simplify pow(1.0/x, y) into pow(x, -y).  */
>>>>>>>> + (if (! HONOR_INFINITIES (type)
>>>>>>>> +      && ! HONOR_SIGNED_ZEROS (type)
>>>>>>>> +      && ! flag_trapping_math
>>>>>>>> +      && ! flag_errno_math)
>>>>>>>> +  (simplify
>>>>>>>> +   (POW (rdiv:s real_onep@0 @1) @2)
>>>>>>>> +    (POW @1 (negate @2)))
>>>>>>> 
>>>>>>> This one shouldn't need HONOR_SIGNED_ZEROS?
>>>>>>> 
>>>>>>>> +
>>>>>>>> +  /* Simplify pow(0.0, x) into 0.0.  */
>>>>>>>> +  (simplify
>>>>>>>> +   (POW real_zerop@0 @1)
>>>>>>> 
>>>>>>> I think this needs !HONOR_NANS (type)?
>>>>>>> 
>>>>>>> Otherwise OK.
>>>>>> Thanks for the feedback, Richard and Andrew. I made the following changes to the patch (current version of the patch below):
>>>>>> - also applied the pattern to POWI and added tests for pow, powif, powil
>>>>>> - not gate first pattern under !HONOR_SIGNED_ZEROS, but second one additionally under !HONOR_NANS (type)
>>>>>> - added tests for powf16
>>>>> 
>>>>> Note powi is GCC internal, it doesn't set errno and it should be subject
>>>>> to different rules - I'd rather have patterns working on powi separate.
>>>> How about moving the patterns for POWI into the section flag_unsafe_math_optimizations && canonicalize_math_p () and not use (!flag_errno_math)?
>>> 
>>> Sounds good.
>>> 
>>>>> 
>>>>>> Now, I am encountering two problems:
>>>>>> 
>>>>>> First, the transform is not applied for float16 (even if -fexcess-precision=16). Do you know what the problem could be?
>>>>> 
>>>>> I think you want to use POW_ALL instead of POW.  The generated
>>>>> cfn-operators.pd shows
>>>>> 
>>>>> (define_operator_list POW
>>>>>  BUILT_IN_POWF
>>>>>  BUILT_IN_POW
>>>>>  BUILT_IN_POWL
>>>>>  IFN_POW)
>>>>> (define_operator_list POW_FN
>>>>>  BUILT_IN_POWF16
>>>>>  BUILT_IN_POWF32
>>>>>  BUILT_IN_POWF64
>>>>>  BUILT_IN_POWF128
>>>>>  BUILT_IN_POWF32X
>>>>>  BUILT_IN_POWF64X
>>>>>  BUILT_IN_POWF128X
>>>>>  null)
>>>>> (define_operator_list POW_ALL
>>>>>  BUILT_IN_POWF
>>>>>  BUILT_IN_POW
>>>>>  BUILT_IN_POWL
>>>>>  BUILT_IN_POWF16
>>>>> ...
>>>>> 
>>>>> note this comes at expense of more generated code (in
>>>>> gimple/generic-match.pd).
>>>> Thanks, that solved the Float16 issue.
>>>>> 
>>>>>> Second, validation on aarch64 shows a regression in tests
>>>>>> - gcc.dg/recip_sqrt_mult_1.c and
>>>>>> - gcc.dg/recip_sqrt_mult_5.c,
>>>>>> because the pattern (POWI(1/x, y) -> POWI(x, -y)) is applied before the recip pass and prevents application of the recip-patterns. The reason for this might be that the single-use restriction only works if the integer argument is non-constant, but in the failing test cases, the integer argument is 2 and the pattern is applied despite the :s flag.
>>>>>> For example, my pattern is **not** applied (single-use restriction works) for:
>>>>>> double res, res2;
>>>>>> void foo (double a, int b)
>>>>>> {
>>>>>> double f (double);
>>>>>> double t1 = 1.0 / a;
>>>>>> res = __builtin_powi (t1, b);
>>>>>> res2 = f (t1);
>>>>>> }
>>>>>> 
>>>>>> But the pattern **is** applied and single-use restriction does **not** work for:
>>>>>> double res, res2;
>>>>>> void foo (double a)
>>>>>> {
>>>>>> double f (double);
>>>>>> double t1 = 1.0 / a;
>>>>>> res = __builtin_powi (t1, 2);
>>>>>> res2 = f (t1);
>>>>>> }
>>>>> 
>>>>> This must be because the result is a single operation.  :s only applies
>>>>> when the result has sub-expressions.  This is to make CSE work.
>>>>> The "fix" is to add explicit && single_use (@n) to override that
>>>>> behavior.  Note that I think the transform is good even when the
>>>>> division is used because the result reduces the dependence chain length.
>>>>> It's only when @2 is non-constant that we're introducing another
>>>>> stmt for the negation that re-introduces this latency (even if in
>>>>> practice it would be smaller).
>>>>> 
>>>>>> Possible options to resolve this are:
>>>>>> - gate pattern to run after recip pass
>>>>>> - do not apply pattern for POWI
>>>>> 
>>>>> - adjust the testcase (is the final outcome still good?)
>>>> Without the patch, there is one fdiv instruction less (below is the assembly for recip_sqrt_mult_1.c, but for _5.c it’s analogous):
>>>> No patch or with single_use of the result of the division:
>>>> foo:
>>>>     fmov d30, 1.0e+0
>>>>     fsqrt d31, d0
>>>>     adrp x0, .LANCHOR0
>>>>     add x1, x0, :lo12:.LANCHOR0
>>>>     fdiv d30, d30, d0
>>>>     fmul d0, d31, d30
>>>>     str d0, [x0, #:lo12:.LANCHOR0]
>>>>     stp d30, d31, [x1, 8]
>>>>     ret
>>>> 
>>>> With patch:
>>>> foo:
>>>>       fsqrt   d31, d0
>>>>       fmov    d30, 1.0e+0
>>>>       adrp    x1, .LANCHOR0
>>>>       add     x0, x1, :lo12:.LANCHOR0
>>>>       fdiv    d31, d30, d31
>>>>       fdiv    d30, d30, d0
>>>>       str     d31, [x1, #:lo12:.LANCHOR0]
>>>>       fmul    d31, d31, d0
>>>>       stp     d30, d31, [x0, 8]
>>>>       ret
>>>> So, we might want to use the single_use guard.
>>> 
>>> Yeah, this is because the powi inline expansion will add back
>>> the division.
>> Below is the updated patch, I re-validated with no regression on aarch64 and x86_64.
>> Thanks,
>> Jenni
>> 
>> This patch adds the following two simplifications in match.pd for
>> POW_ALL and POWI:
>> - pow (1.0/x, y) to pow (x, -y), avoiding the division
>> - pow (0.0, x) to 0.0, avoiding the call to pow.
>> The patterns are guarded by flag_unsafe_math_optimizations,
>> !flag_trapping_math, and !HONOR_INFINITIES.
>> The POW_ALL patterns are also gated under !flag_errno_math.
>> The second pattern is also guarded by !HONOR_NANS and
>> !HONOR_SIGNED_ZEROS.
>> 
>> Tests were added to confirm the application of the transform for
>> builtins pow, powf, powl, powi, powif, powil, and powf16.
>> 
>> The patch was bootstrapped and regtested on aarch64-linux-gnu and
>> x86_64-linux-gnu, no regression.
>> OK for mainline?
> 
> OK.
Thanks, pushed to trunk: 07a8538d90763f0ae640dea822bdeb63ea17ec44
Jennifer
> 
> Thanks,
> Richard.
> 
>> Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>
>> 
>> gcc/
>>      * match.pd: Fold pow (1.0/x, y) -> pow (x, -y) and
>>      pow (0.0, x) -> 0.0.
>> 
>> gcc/testsuite/
>>      * gcc.dg/tree-ssa/pow_fold_1.c: New test.
>> ---
>> gcc/match.pd                               | 28 +++++++++++++++
>> gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c | 42 ++++++++++++++++++++++
>> 2 files changed, 70 insertions(+)
>> create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
>> 
>> diff --git a/gcc/match.pd b/gcc/match.pd
>> index 12d81fcac0d..6d9868d2bb1 100644
>> --- a/gcc/match.pd
>> +++ b/gcc/match.pd
>> @@ -8203,6 +8203,21 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>>    (rdiv @0 (exps:s @1))
>>     (mult @0 (exps (negate @1)))))
>> 
>> + (for pow (POW_ALL)
>> +  (if (! HONOR_INFINITIES (type)
>> +       && ! flag_trapping_math
>> +       && ! flag_errno_math)
>> +   /* Simplify pow(1.0/x, y) into pow(x, -y).  */
>> +   (simplify
>> +    (pow (rdiv:s real_onep@0 @1) @2)
>> +     (pow @1 (negate @2)))
>> +
>> +   /* Simplify pow(0.0, x) into 0.0.  */
>> +   (if (! HONOR_NANS (type) && ! HONOR_SIGNED_ZEROS (type))
>> +    (simplify
>> +     (pow real_zerop@0 @1)
>> +      @0))))
>> +
>>  (if (! HONOR_SIGN_DEPENDENT_ROUNDING (type)
>>       && ! HONOR_NANS (type) && ! HONOR_INFINITIES (type)
>>       && ! flag_trapping_math
>> @@ -8561,6 +8576,19 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>>   (mult (POW:s @0 @1) (POW:s @2 @1))
>>    (POW (mult @0 @2) @1))
>> 
>> + (if (! HONOR_INFINITIES (type) && ! flag_trapping_math)
>> +  /* Simplify powi(1.0/x, y) into powi(x, -y).  */
>> +  (simplify
>> +   (POWI (rdiv@3 real_onep@0 @1) @2)
>> +   (if (single_use (@3))
>> +    (POWI @1 (negate @2))))
>> +
>> +  /* Simplify powi(0.0, x) into 0.0.  */
>> +  (if (! HONOR_NANS (type) && ! HONOR_SIGNED_ZEROS (type))
>> +   (simplify
>> +    (POWI real_zerop@0 @1)
>> +     @0)))
>> +
>>  /* Simplify powi(x,y) * powi(z,y) -> powi(x*z,y). */
>>  (simplify
>>   (mult (POWI:s @0 @1) (POWI:s @2 @1))
>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c b/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
>> new file mode 100644
>> index 00000000000..d98bcb0827e
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
>> @@ -0,0 +1,42 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-Ofast -fdump-tree-optimized -fexcess-precision=16" } */
>> +/* { dg-add-options float16 } */
>> +/* { dg-require-effective-target float16_runtime } */
>> +/* { dg-require-effective-target c99_runtime } */
>> +
>> +extern void link_error (void);
>> +
>> +#define POW1OVER(TYPE1, TYPE2, CTY, TY)                      \
>> +  void                                                       \
>> +  pow1over_##TY (TYPE1 x, TYPE2 y)                   \
>> +  {                                                  \
>> +    TYPE1 t1 = 1.0##CTY / x;                         \
>> +    TYPE1 t2 = __builtin_pow##TY (t1, y);            \
>> +    TYPE2 t3 = -y;                                   \
>> +    TYPE1 t4 = __builtin_pow##TY (x, t3);            \
>> +    if (t2 != t4)                                    \
>> +      link_error ();                                 \
>> +  }                                                  \
>> +
>> +#define POW0(TYPE1, TYPE2, CTY, TY)                  \
>> +  void                                                       \
>> +  pow0_##TY (TYPE2 x)                                        \
>> +  {                                                  \
>> +    TYPE1 t1 = __builtin_pow##TY (0.0##CTY, x);              \
>> +    if (t1 != 0.0##CTY)                                      \
>> +      link_error ();                                 \
>> +  }                                                  \
>> +
>> +#define TEST_ALL(TYPE1, TYPE2, CTY, TY)                      \
>> +  POW1OVER (TYPE1, TYPE2, CTY, TY)                   \
>> +  POW0 (TYPE1, TYPE2, CTY, TY)
>> +
>> +TEST_ALL (double, double, , )
>> +TEST_ALL (float, float, f, f)
>> +TEST_ALL (_Float16, _Float16, f16, f16)
>> +TEST_ALL (long double, long double, L, l)
>> +TEST_ALL (double, int, , i)
>> +TEST_ALL (float, int, f, if)
>> +TEST_ALL (long double, int, L, il)
>> +
>> +/* { dg-final { scan-tree-dump-not "link_error" "optimized" } } */
>> 
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
diff mbox series

Patch

diff --git a/gcc/match.pd b/gcc/match.pd
index 12d81fcac0d..ba100b117e7 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -8203,6 +8203,20 @@  DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
    (rdiv @0 (exps:s @1))
     (mult @0 (exps (negate @1)))))
 
+ /* Simplify pow(1.0/x, y) into pow(x, -y).  */
+ (if (! HONOR_INFINITIES (type)
+      && ! HONOR_SIGNED_ZEROS (type)
+      && ! flag_trapping_math
+      && ! flag_errno_math)
+  (simplify
+   (POW (rdiv:s real_onep@0 @1) @2)
+    (POW @1 (negate @2)))
+
+  /* Simplify pow(0.0, x) into 0.0.  */
+  (simplify
+   (POW real_zerop@0 @1)
+    @0))
+
  (if (! HONOR_SIGN_DEPENDENT_ROUNDING (type)
       && ! HONOR_NANS (type) && ! HONOR_INFINITIES (type)
       && ! flag_trapping_math
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c b/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
new file mode 100644
index 00000000000..113df572661
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
@@ -0,0 +1,34 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -ffast-math" } */
+/* { dg-require-effective-target c99_runtime } */
+
+extern void link_error (void);
+
+#define POW1OVER(TYPE, C_TY, TY)			\
+  void							\
+  pow1over_##TY (TYPE x, TYPE y)			\
+  {							\
+    TYPE t1 = 1.0##C_TY / x;				\
+    TYPE t2 = __builtin_pow##TY (t1, y);		\
+    TYPE t3 = -y;					\
+    TYPE t4 = __builtin_pow##TY (x, t3);		\
+    if (t2 != t4)					\
+      link_error ();					\
+  }							\
+
+#define POW0(TYPE, C_TY, TY)				\
+  void							\
+  pow0_##TY (TYPE x)					\
+  {							\
+    TYPE t1 = __builtin_pow##TY (0.0##C_TY, x);		\
+    if (t1 != 0.0##C_TY)				\
+      link_error ();					\
+  }							\
+
+#define TEST_ALL(TYPE, C_TY, TY)			\
+  POW1OVER (TYPE, C_TY, TY)				\
+  POW0 (TYPE, C_TY, TY)
+
+TEST_ALL (double, , )
+TEST_ALL (float, f, f)
+TEST_ALL (long double, L, l)