diff mbox series

[match.pd] PR tree-optimization/114661: Generalize MULT_EXPR recognition.

Message ID 007301dad24f$40fe7c00$c2fb7400$@nextmovesoftware.com
State New
Headers show
Series [match.pd] PR tree-optimization/114661: Generalize MULT_EXPR recognition. | expand

Commit Message

Roger Sayle July 9, 2024, 10:27 p.m. UTC
This patch resolves PR tree-optimization/114661, by generalizing the set
of expressions that we canonicalize to multiplication.  This extends the
optimization(s) contributed (by me) back in July 2021.
https://gcc.gnu.org/pipermail/gcc-patches/2021-July/575999.html

The existing transformation folds (X*C1)^(X<<C2) into X*C3 when
allowed.  A subtlety is that for non-wrapping integer types, we
actually fold this into (int)((unsigned)X*C3) so that we don't
introduce an undefined overflow that wasn't in the original.
Unfortunately, this transformation confuses itself, as the type-safe
multiplication isn't recognized when further combining bit operations.
Fixed here by adding transforms to turn (int)((unsigned)X*C1)^(X<<C2)
into (int)((unsigned)X*C3) so that match.pd and EVRP can continue
to construct multiplications.

For the example given in the PR:

unsigned mul(unsigned char c) {
    if (c > 3) __builtin_unreachable();
    return c << 18 | c << 15 |
           c << 12 | c << 9 |
           c << 6 | c << 3 | c;
}

GCC on x86_64 with -O2 previously generated:

mul:    movzbl  %dil, %edi
        leal    (%rdi,%rdi,8), %edx
        leal    0(,%rdx,8), %eax
        movl    %edx, %ecx
        sall    $15, %edx
        orl     %edi, %eax
        sall    $9, %ecx
        orl     %ecx, %eax
        orl     %edx, %eax
        ret

with this patch we now generate:

mul:    movzbl  %dil, %eax
        imull   $299593, %eax, %eax
        ret

This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32}
with no new failures.  Ok for mainline?


2024-07-09  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
        PR tree-optimization/114661
        * match.pd ((X*C1)|(X*C2) to X*(C1+C2)): Additionally recognize
        multiplications surrounded by casts to an unsigned type and back
        such as those generated by these transformations.

gcc/testsuite/ChangeLog
        PR tree-optimization/114661
        * gcc.dg/pr114661.c: New test case.


Thanks in advance,
Roger
--

Comments

Richard Biener July 10, 2024, 11:33 a.m. UTC | #1
On Wed, Jul 10, 2024 at 12:28 AM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> This patch resolves PR tree-optimization/114661, by generalizing the set
> of expressions that we canonicalize to multiplication.  This extends the
> optimization(s) contributed (by me) back in July 2021.
> https://gcc.gnu.org/pipermail/gcc-patches/2021-July/575999.html
>
> The existing transformation folds (X*C1)^(X<<C2) into X*C3 when
> allowed.  A subtlety is that for non-wrapping integer types, we
> actually fold this into (int)((unsigned)X*C3) so that we don't
> introduce an undefined overflow that wasn't in the original.
> Unfortunately, this transformation confuses itself, as the type-safe
> multiplication isn't recognized when further combining bit operations.
> Fixed here by adding transforms to turn (int)((unsigned)X*C1)^(X<<C2)
> into (int)((unsigned)X*C3) so that match.pd and EVRP can continue
> to construct multiplications.
>
> For the example given in the PR:
>
> unsigned mul(unsigned char c) {
>     if (c > 3) __builtin_unreachable();
>     return c << 18 | c << 15 |
>            c << 12 | c << 9 |
>            c << 6 | c << 3 | c;
> }
>
> GCC on x86_64 with -O2 previously generated:
>
> mul:    movzbl  %dil, %edi
>         leal    (%rdi,%rdi,8), %edx
>         leal    0(,%rdx,8), %eax
>         movl    %edx, %ecx
>         sall    $15, %edx
>         orl     %edi, %eax
>         sall    $9, %ecx
>         orl     %ecx, %eax
>         orl     %edx, %eax
>         ret
>
> with this patch we now generate:
>
> mul:    movzbl  %dil, %eax
>         imull   $299593, %eax, %eax
>         ret
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check, both with and without --target_board=unix{-m32}
> with no new failures.  Ok for mainline?

I'm looking at the difference between the existing

 (simplify
  (op:c (mult:s@0 @1 INTEGER_CST@2)
        (lshift:s@3 @1 INTEGER_CST@4))
  (if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_WRAPS (type)
       && tree_int_cst_sgn (@4) > 0
       && (tree_nonzero_bits (@0) & tree_nonzero_bits (@3)) == 0)
   (with { wide_int wone = wi::one (TYPE_PRECISION (type));
           wide_int c = wi::add (wi::to_wide (@2),
                                 wi::lshift (wone, wi::to_wide (@4))); }
    (mult @1 { wide_int_to_tree (type, c); }))))

and

+ (simplify
+  (op:c (convert:s@0 (mult:s@1 (convert @2) INTEGER_CST@3))
+       (lshift:s@4 @2 INTEGER_CST@5))
+  (if (INTEGRAL_TYPE_P (type)
+       && INTEGRAL_TYPE_P (TREE_TYPE (@1))
+       && TREE_TYPE (@2) == type
+       && TYPE_UNSIGNED (TREE_TYPE (@1))
+       && TYPE_PRECISION (type) == TYPE_PRECISION (TREE_TYPE (@1))
+       && tree_int_cst_sgn (@5) > 0
+       && (tree_nonzero_bits (@0) & tree_nonzero_bits (@4)) == 0)
+   (with { tree t = TREE_TYPE (@1);
+          wide_int wone = wi::one (TYPE_PRECISION (t));
+          wide_int c = wi::add (wi::to_wide (@3),
+                                wi::lshift (wone, wi::to_wide (@5))); }
+    (convert (mult:t (convert:t @2) { wide_int_to_tree (t, c); })))))

and wonder whether wrapping of the multiplication is required for correctness,
specifically the former seems to allow signed types with -fwrapv while the
latter won't.  It also looks the patterns could be merged doing

 (simplify
  (op:c (nop_convert:s? (mult:s@0 (nop_convert? @1) INTEGER_CST@2)
        (lshift:s@3 @1 INTEGER_CST@4))

and by using nop_convert instead of convert simplify the condition?

Richard.

>
> 2024-07-09  Roger Sayle  <roger@nextmovesoftware.com>
>
> gcc/ChangeLog
>         PR tree-optimization/114661
>         * match.pd ((X*C1)|(X*C2) to X*(C1+C2)): Additionally recognize
>         multiplications surrounded by casts to an unsigned type and back
>         such as those generated by these transformations.
>
> gcc/testsuite/ChangeLog
>         PR tree-optimization/114661
>         * gcc.dg/pr114661.c: New test case.
>
>
> Thanks in advance,
> Roger
> --
>
diff mbox series

Patch

diff --git a/gcc/match.pd b/gcc/match.pd
index 4edfa2a..228387f 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -4205,7 +4205,35 @@  DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 	     t = unsigned_type_for (t);
 	   wide_int wone = wi::one (TYPE_PRECISION (t));
 	   wide_int c = wi::add (wi::lshift (wone, wi::to_wide (@2)), wone); }
-    (convert (mult:t (convert:t @1) { wide_int_to_tree (t, c); }))))))
+    (convert (mult:t (convert:t @1) { wide_int_to_tree (t, c); })))))
+ (simplify
+  (op:c (convert:s@0 (mult:s@1 (convert @2) INTEGER_CST@3))
+	(lshift:s@4 @2 INTEGER_CST@5))
+  (if (INTEGRAL_TYPE_P (type)
+       && INTEGRAL_TYPE_P (TREE_TYPE (@1))
+       && TREE_TYPE (@2) == type
+       && TYPE_UNSIGNED (TREE_TYPE (@1))
+       && TYPE_PRECISION (type) == TYPE_PRECISION (TREE_TYPE (@1))
+       && tree_int_cst_sgn (@5) > 0
+       && (tree_nonzero_bits (@0) & tree_nonzero_bits (@4)) == 0)
+   (with { tree t = TREE_TYPE (@1);
+	   wide_int wone = wi::one (TYPE_PRECISION (t));
+	   wide_int c = wi::add (wi::to_wide (@3),
+				 wi::lshift (wone, wi::to_wide (@5))); }
+    (convert (mult:t (convert:t @2) { wide_int_to_tree (t, c); })))))
+ (simplify
+  (op:c (convert:s@0 (mult:s@1 (convert @2) INTEGER_CST@3))
+	@2)
+  (if (INTEGRAL_TYPE_P (type)
+       && INTEGRAL_TYPE_P (TREE_TYPE (@1))
+       && TREE_TYPE (@2) == type
+       && TYPE_UNSIGNED (TREE_TYPE (@1))
+       && TYPE_PRECISION (type) == TYPE_PRECISION (TREE_TYPE (@1))
+       && (tree_nonzero_bits (@0) & tree_nonzero_bits (@2)) == 0)
+   (with { tree t = TREE_TYPE (@1);
+	   wide_int wone = wi::one (TYPE_PRECISION (t));
+	   wide_int c = wi::add (wi::to_wide (@3), wone); }
+    (convert (mult:t (convert:t @2) { wide_int_to_tree (t, c); }))))))
 
 /* Simplifications of MIN_EXPR, MAX_EXPR, fmin() and fmax().  */
 
diff --git a/gcc/testsuite/gcc.dg/pr114661.c b/gcc/testsuite/gcc.dg/pr114661.c
new file mode 100644
index 0000000..e6b5c69
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr114661.c
@@ -0,0 +1,10 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-evrp" } */
+
+unsigned mul(unsigned char c) {
+    if (c > 3) __builtin_unreachable();
+    return c << 18 | c << 15 |
+        c << 12 | c << 9 |
+        c << 6 | c << 3 | c;
+}
+/* { dg-final { scan-tree-dump-times " \\* 299593" 1 "evrp" } } */