Message ID | 20220509051904.80399-1-hongtao.liu@intel.com |
---|---|
State | New |
Headers | show |
Series | [Middle-end] Enhance final_value_replacement_loop to handle bitwise induction. | expand |
On Mon, May 9, 2022 at 7:19 AM liuhongt <hongtao.liu@intel.com> wrote: > > This patch will enable below optimization: > > { > - int bit; > - long long unsigned int _1; > - long long unsigned int _2; > - > <bb 2> [local count: 46707768]: > - > - <bb 3> [local count: 1027034057]: > - # tmp_11 = PHI <tmp_8(3), tmp_6(D)(2)> > - # bit_13 = PHI <bit_9(3), 63(2)> > - _1 = 1 << bit_13; > - _2 = ~_1; > - tmp_8 = _2 & tmp_11; > - bit_9 = bit_13 + -3; > - if (bit_9 != -3(OVF)) > - goto <bb 3>; [95.65%] > - else > - goto <bb 4>; [4.35%] > - > - <bb 4> [local count: 46707768]: > - return tmp_8; > + tmp_12 = tmp_6(D) & 7905747460161236406; > + return tmp_12; > > } > > > Boostrapped and regtested on x86_64-pc-linux-gnu{-m32,} > Ok for trunk? > > gcc/ChangeLog: > > PR middle-end/103462 > * match.pd (bitwise_induction_p): New match. > * tree-scalar-evolution.c (gimple_bitwise_induction_p): > Declare. > (analyze_and_compute_bitwise_induction_effect): New function. > (enum bit_op_kind): New enum. > (final_value_replacement_loop): Enhanced to handle bitwise > induction. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/pr103462-1.c: New test. > * gcc.target/i386/pr103462-2.c: New test. > * gcc.target/i386/pr103462-3.c: New test. > * gcc.target/i386/pr103462-4.c: New test. > * gcc.target/i386/pr103462-5.c: New test. > * gcc.target/i386/pr103462-6.c: New test. > --- > gcc/match.pd | 7 + > gcc/testsuite/gcc.target/i386/pr103462-1.c | 111 +++++++++++++ > gcc/testsuite/gcc.target/i386/pr103462-2.c | 45 ++++++ > gcc/testsuite/gcc.target/i386/pr103462-3.c | 111 +++++++++++++ > gcc/testsuite/gcc.target/i386/pr103462-4.c | 46 ++++++ > gcc/testsuite/gcc.target/i386/pr103462-5.c | 111 +++++++++++++ > gcc/testsuite/gcc.target/i386/pr103462-6.c | 46 ++++++ > gcc/tree-scalar-evolution.cc | 178 ++++++++++++++++++++- > 8 files changed, 654 insertions(+), 1 deletion(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-3.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-4.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-5.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-6.c > > diff --git a/gcc/match.pd b/gcc/match.pd > index 6d691d302b3..24ff5f9e6a8 100644 > --- a/gcc/match.pd > +++ b/gcc/match.pd > @@ -7746,3 +7746,10 @@ and, > == TYPE_UNSIGNED (TREE_TYPE (@3)))) > && single_use (@4) > && single_use (@5)))) > + > +(for bit_op (bit_and bit_ior bit_xor) > + (match (bitwise_induction_p @0 @2 @3) > + (bit_op:c (nop_convert1? (bit_not2?@0 (convert3? (lshift integer_onep@1 @2)))) @3))) > + > +(match (bitwise_induction_p @0 @2 @3) > + (bit_not (nop_convert1? (bit_xor@0 (convert2? (lshift integer_onep@1 @2)) @3)))) > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-1.c b/gcc/testsuite/gcc.target/i386/pr103462-1.c > new file mode 100644 > index 00000000000..1dc4c2acad6 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr103462-1.c > @@ -0,0 +1,111 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O1 -fdump-tree-sccp-details" } */ > +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */ > + > +unsigned long long > +__attribute__((noipa)) > +foo (unsigned long long tmp) > +{ > + for (int bit = 0; bit < 64; bit += 3) > + tmp &= ~(1ULL << bit); > + return tmp; > +} > + > +unsigned long long > +__attribute__((noipa)) > +foo1 (unsigned long long tmp) > +{ > + for (int bit = 63; bit >= 0; bit -= 3) > + tmp &= ~(1ULL << bit); > + return tmp; > +} > + > +unsigned long long > +__attribute__((noipa)) > +foo2 (unsigned long long tmp) > +{ > + for (int bit = 0; bit < 64; bit += 3) > + tmp &= (1ULL << bit); > + return tmp; > +} > + > +unsigned long long > +__attribute__((noipa)) > +foo3 (unsigned long long tmp) > +{ > + for (int bit = 63; bit >= 0; bit -= 3) > + tmp &= (1ULL << bit); > + return tmp; > +} > + > +unsigned long long > +__attribute__((noipa)) > +foo4 (unsigned long long tmp) > +{ > + for (int bit = 0; bit < 64; bit += 3) > + tmp |= ~(1ULL << bit); > + return tmp; > +} > + > +unsigned long long > +__attribute__((noipa)) > +foo5 (unsigned long long tmp) > +{ > + for (int bit = 63; bit >= 0; bit -= 3) > + tmp |= ~(1ULL << bit); > + return tmp; > +} > + > +unsigned long long > +__attribute__((noipa)) > +foo6 (unsigned long long tmp) > +{ > + for (int bit = 0; bit < 64; bit += 3) > + tmp |= (1ULL << bit); > + return tmp; > +} > + > +unsigned long long > +__attribute__((noipa)) > +foo7 (unsigned long long tmp) > +{ > + for (int bit = 63; bit >= 0; bit -= 3) > + tmp |= (1ULL << bit); > + return tmp; > +} > + > +unsigned long long > +__attribute__((noipa)) > +foo8 (unsigned long long tmp) > +{ > + for (int bit = 0; bit < 64; bit += 3) > + tmp ^= ~(1ULL << bit); > + return tmp; > +} > + > +unsigned long long > +__attribute__((noipa)) > +foo9 (unsigned long long tmp) > +{ > + for (int bit = 63; bit >= 0; bit -= 3) > + tmp ^= ~(1ULL << bit); > + return tmp; > +} > + > +unsigned long long > +__attribute__((noipa)) > +foo10 (unsigned long long tmp) > +{ > + for (int bit = 0; bit < 64; bit += 3) > + tmp ^= (1ULL << bit); > + return tmp; > +} > + > +unsigned long long > +__attribute__((noipa)) > +foo11 (unsigned long long tmp) > +{ > + for (int bit = 63; bit >= 0; bit -= 3) > + tmp ^= (1ULL << bit); > + return tmp; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-2.c b/gcc/testsuite/gcc.target/i386/pr103462-2.c > new file mode 100644 > index 00000000000..bc375cb78d4 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr103462-2.c > @@ -0,0 +1,45 @@ > +/* { dg-do run } */ > +/* { dg-options "-O1" } */ > + > +#include "pr103462-1.c" > + > +int main() > +{ > + unsigned long long tmp = 0x1111111111111111ULL; > + if (foo (tmp) != 0x110110110110110ULL) > + __builtin_abort (); > + > + if (foo1 (tmp) != 0x110110110110110ULL) > + __builtin_abort (); > + > + if (foo2 (tmp) != 0x0ULL) > + __builtin_abort (); > + > + if (foo3 (tmp) != 0x0ULL) > + __builtin_abort (); > + > + if (foo4 (tmp) != 0xffffffffffffffffULL) > + __builtin_abort (); > + > + if (foo5 (tmp) != 0xffffffffffffffffULL) > + __builtin_abort (); > + > + if (foo6 (tmp) != 0x9359359359359359ULL) > + __builtin_abort (); > + > + if (foo7 (tmp) != 0x9359359359359359ULL) > + __builtin_abort (); > + > + if (foo8 (tmp) != 0x8358358358358358ULL) > + __builtin_abort (); > + > + if (foo9 (tmp) != 0x8358358358358358ULL) > + __builtin_abort (); > + > + if (foo10 (tmp) != 0x8358358358358358ULL) > + __builtin_abort (); > + > + if (foo11 (tmp) != 0x8358358358358358ULL) > + __builtin_abort (); > +} > + > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-3.c b/gcc/testsuite/gcc.target/i386/pr103462-3.c > new file mode 100644 > index 00000000000..4ba248a4872 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr103462-3.c > @@ -0,0 +1,111 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O1 -fdump-tree-sccp-details" } */ > +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */ > + > +unsigned int > +__attribute__((noipa)) > +foo (unsigned int tmp) > +{ > + for (int bit = 0; bit < 32; bit += 3) > + tmp &= ~(1U << bit); > + return tmp; > +} > + > +unsigned int > +__attribute__((noipa)) > +foo1 (unsigned int tmp) > +{ > + for (int bit = 31; bit >= 0; bit -= 3) > + tmp &= ~(1U << bit); > + return tmp; > +} > + > +unsigned int > +__attribute__((noipa)) > +foo2 (unsigned int tmp) > +{ > + for (int bit = 0; bit < 32; bit += 3) > + tmp &= (1U << bit); > + return tmp; > +} > + > +unsigned int > +__attribute__((noipa)) > +foo3 (unsigned int tmp) > +{ > + for (int bit = 31; bit >= 0; bit -= 3) > + tmp &= (1U << bit); > + return tmp; > +} > + > +unsigned int > +__attribute__((noipa)) > +foo4 (unsigned int tmp) > +{ > + for (int bit = 0; bit < 32; bit += 3) > + tmp |= ~(1U << bit); > + return tmp; > +} > + > +unsigned int > +__attribute__((noipa)) > +foo5 (unsigned int tmp) > +{ > + for (int bit = 31; bit >= 0; bit -= 3) > + tmp |= ~(1U << bit); > + return tmp; > +} > + > +unsigned int > +__attribute__((noipa)) > +foo6 (unsigned int tmp) > +{ > + for (int bit = 0; bit < 32; bit += 3) > + tmp |= (1U << bit); > + return tmp; > +} > + > +unsigned int > +__attribute__((noipa)) > +foo7 (unsigned int tmp) > +{ > + for (int bit = 31; bit >= 0; bit -= 3) > + tmp |= (1U << bit); > + return tmp; > +} > + > +unsigned int > +__attribute__((noipa)) > +foo8 (unsigned int tmp) > +{ > + for (int bit = 0; bit < 32; bit += 3) > + tmp ^= ~(1U << bit); > + return tmp; > +} > + > +unsigned int > +__attribute__((noipa)) > +foo9 (unsigned int tmp) > +{ > + for (int bit = 31; bit >= 0; bit -= 3) > + tmp ^= ~(1U << bit); > + return tmp; > +} > + > +unsigned int > +__attribute__((noipa)) > +foo10 (unsigned int tmp) > +{ > + for (int bit = 0; bit < 32; bit += 3) > + tmp ^= (1U << bit); > + return tmp; > +} > + > +unsigned int > +__attribute__((noipa)) > +foo11 (unsigned int tmp) > +{ > + for (int bit = 31; bit >= 0; bit -= 3) > + tmp ^= (1U << bit); > + return tmp; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-4.c b/gcc/testsuite/gcc.target/i386/pr103462-4.c > new file mode 100644 > index 00000000000..e2f4056f525 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr103462-4.c > @@ -0,0 +1,46 @@ > +/* { dg-do run } */ > +/* { dg-options "-O1" } */ > + > +#include "pr103462-3.c" > + > +int main() > +{ > + unsigned int tmp = 0x11111111U; > + > + if (foo (tmp) != 0x10110110U) > + __builtin_abort (); > + > + if (foo1 (tmp) != 0x1101101U) > + __builtin_abort (); > + > + if (foo2 (tmp) != 0x0U) > + __builtin_abort (); > + > + if (foo3 (tmp) != 0x0U) > + __builtin_abort (); > + > + if (foo4 (tmp) != 0xffffffffU) > + __builtin_abort (); > + > + if (foo5 (tmp) != 0xffffffffU) > + __builtin_abort (); > + > + if (foo6 (tmp) != 0x59359359U) > + __builtin_abort (); > + > + if (foo7 (tmp) != 0x93593593U) > + __builtin_abort (); > + > + if (foo8 (tmp) != 0xa7ca7ca7U) > + __builtin_abort (); > + > + if (foo9 (tmp) != 0x7ca7ca7cU) > + __builtin_abort (); > + > + if (foo10 (tmp) != 0x58358358U) > + __builtin_abort (); > + > + if (foo11 (tmp) != 0x83583583U) > + __builtin_abort (); > +} > + > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-5.c b/gcc/testsuite/gcc.target/i386/pr103462-5.c > new file mode 100644 > index 00000000000..1f4ffa34b48 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr103462-5.c > @@ -0,0 +1,111 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O1 -fdump-tree-sccp-details" } */ > +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */ > + > +unsigned short > +__attribute__((noipa)) > +foo (unsigned short tmp) > +{ > + for (int bit = 0; bit < 16; bit += 3) > + tmp &= ~(1U << bit); > + return tmp; > +} > + > +unsigned short > +__attribute__((noipa)) > +foo1 (unsigned short tmp) > +{ > + for (int bit = 15; bit >= 0; bit -= 3) > + tmp &= ~(1U << bit); > + return tmp; > +} > + > +unsigned short > +__attribute__((noipa)) > +foo2 (unsigned short tmp) > +{ > + for (int bit = 0; bit < 16; bit += 3) > + tmp &= (1U << bit); > + return tmp; > +} > + > +unsigned short > +__attribute__((noipa)) > +foo3 (unsigned short tmp) > +{ > + for (int bit = 15; bit >= 0; bit -= 3) > + tmp &= (1U << bit); > + return tmp; > +} > + > +unsigned short > +__attribute__((noipa)) > +foo4 (unsigned short tmp) > +{ > + for (int bit = 0; bit < 16; bit += 3) > + tmp |= ~(1U << bit); > + return tmp; > +} > + > +unsigned short > +__attribute__((noipa)) > +foo5 (unsigned short tmp) > +{ > + for (int bit = 15; bit >= 0; bit -= 3) > + tmp |= ~(1U << bit); > + return tmp; > +} > + > +unsigned short > +__attribute__((noipa)) > +foo6 (unsigned short tmp) > +{ > + for (int bit = 0; bit < 16; bit += 3) > + tmp |= (1U << bit); > + return tmp; > +} > + > +unsigned short > +__attribute__((noipa)) > +foo7 (unsigned short tmp) > +{ > + for (int bit = 15; bit >= 0; bit -= 3) > + tmp |= (1U << bit); > + return tmp; > +} > + > +unsigned short > +__attribute__((noipa)) > +foo8 (unsigned short tmp) > +{ > + for (int bit = 0; bit < 16; bit += 3) > + tmp ^= ~(1U << bit); > + return tmp; > +} > + > +unsigned short > +__attribute__((noipa)) > +foo9 (unsigned short tmp) > +{ > + for (int bit = 15; bit >= 0; bit -= 3) > + tmp ^= ~(1U << bit); > + return tmp; > +} > + > +unsigned short > +__attribute__((noipa)) > +foo10 (unsigned short tmp) > +{ > + for (int bit = 0; bit < 16; bit += 3) > + tmp ^= (1U << bit); > + return tmp; > +} > + > +unsigned short > +__attribute__((noipa)) > +foo11 (unsigned short tmp) > +{ > + for (int bit = 15; bit >= 0; bit -= 3) > + tmp ^= (1U << bit); > + return tmp; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-6.c b/gcc/testsuite/gcc.target/i386/pr103462-6.c > new file mode 100644 > index 00000000000..65426d71efe > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr103462-6.c > @@ -0,0 +1,46 @@ > +/* { dg-do run } */ > +/* { dg-options "-O1" } */ > + > +#include "pr103462-5.c" > + > +int main() > +{ > + unsigned short tmp = 0x1111U; > + > + if (foo (tmp) != 0x110) > + __builtin_abort (); > + > + if (foo1 (tmp) != 0x110) > + __builtin_abort (); > + > + if (foo2 (tmp) != 0x0) > + __builtin_abort (); > + > + if (foo3 (tmp) != 0x0) > + __builtin_abort (); > + > + if (foo4 (tmp) != 0xffff) > + __builtin_abort (); > + > + if (foo5 (tmp) != 0xffff) > + __builtin_abort (); > + > + if (foo6 (tmp) != 0x9359) > + __builtin_abort (); > + > + if (foo7 (tmp) != 0x9359) > + __builtin_abort (); > + > + if (foo8 (tmp) != 0x8358) > + __builtin_abort (); > + > + if (foo9 (tmp) != 0x8358) > + __builtin_abort (); > + > + if (foo10 (tmp) != 0x8358) > + __builtin_abort (); > + > + if (foo11 (tmp) != 0x8358) > + __builtin_abort (); > +} > + > diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc > index 72ceb4001e3..9b0aec4fd09 100644 > --- a/gcc/tree-scalar-evolution.cc > +++ b/gcc/tree-scalar-evolution.cc > @@ -3487,6 +3487,164 @@ expression_expensive_p (tree expr) > || expanded_size > cache.elements ()); > } > > +/* Match.pd function to match bitwise inductive expression. > + .i.e. > + _2 = 1 << _1; > + _3 = ~_2; > + tmp_9 = _3 & tmp_12; */ > +extern bool gimple_bitwise_induction_p (tree, tree *, tree (*)(tree)); > + > +/* Return the inductive expression of bitwise operation if possible, > + otherwise returns DEF. */ > +static tree > +analyze_and_compute_bitwise_induction_effect (class loop* ex_loop, > + class loop* loop, > + tree phidef, > + tree def, > + tree niter) > +{ > + tree match_op[3],inv, bitwise_scev, bitwise_res; > + bool folded_casts = false; > + tree type = TREE_TYPE (phidef); > + gimple* header_phi = NULL; use a gphi * > + > + /* Match things like op2(MATCH_OP[2]), op1(MATCH_OP[1]), phidef(PHIDEF) > + > + op2 = PHI <phidef, inv> > + _1 = (int) bit_17; > + _3 = 1 << _1; > + op1 = ~_3; > + phidef = op1 & op2; */ > + if (!gimple_bitwise_induction_p (phidef, &match_op[0], NULL) > + || TREE_CODE (match_op[2]) != SSA_NAME > + || gimple_code (SSA_NAME_DEF_STMT (match_op[2])) != GIMPLE_PHI || !(header_phi = dyn_cast <gphi *> (SSA_NAME_DEF_STMT (match_op[2]))) > + || gimple_phi_num_args (SSA_NAME_DEF_STMT (match_op[2])) != 2) and then use header_phi here, if you pass in the PHI to use you'll always have 2 args > + return def; > + > + header_phi = SSA_NAME_DEF_STMT (match_op[2]); > + if (gimple_bb (header_phi) != loop->header) > + return def; I think passing in the PHI node and the exit edge instead of phidef would make the above more clear > + if (PHI_ARG_DEF_FROM_EDGE (header_phi, loop_latch_edge (loop)) != phidef) > + return def; > + > + inv = gimple_phi_arg_def (header_phi, 0); > + if (inv == phidef) > + inv = gimple_phi_arg_def (header_phi, 1); inv = gimple_phi_arg_def_from_edge (header_phi, loop_preheader_edge (loop)); where is this used? > + > + bitwise_scev = analyze_scalar_evolution_in_loop (ex_loop, loop, > + match_op[1], > + &folded_casts); you want bitwise_scev = analyze_scalar_evolution (loop, match_op[1]); bitwise_scev = instantiate_parameters (loop, bitwise_scev); instead since you are interested in in-loop behavior? > + > + /* Make sure bits is in range of type precision. */ > + if (TREE_CODE (bitwise_scev) != POLYNOMIAL_CHREC I think you want to test || !INTEGRAL_TYPE_P (bitwise_scev) since you do not rule out pointers or vector types. > + || !tree_fits_uhwi_p (CHREC_LEFT (bitwise_scev)) > + || tree_to_uhwi (CHREC_LEFT (bitwise_scev)) >= TYPE_PRECISION (type) > + || !tree_fits_shwi_p (CHREC_RIGHT (bitwise_scev))) > + return def; > + > + bitwise_res = compute_overall_effect_of_inner_loop (ex_loop, bitwise_scev); > + if (!tree_fits_uhwi_p (bitwise_res) > + || tree_to_uhwi (bitwise_res) >= TYPE_PRECISION (type)) > + return def; Hmm, so I think that bitwise_res is the final value of the 'bit' IV, correct? So name it bit_final? > + > +enum bit_op_kind > + { > + INDUCTION_BIT_CLEAR, > + INDUCTION_BIT_IOR, > + INDUCTION_BIT_XOR, > + INDUCTION_BIT_RESET, > + INDUCTION_ZERO, > + INDUCTION_ALL > + }; > + > + enum bit_op_kind induction_kind; > + enum tree_code code1 > + = gimple_assign_rhs_code (SSA_NAME_DEF_STMT (phidef)); > + enum tree_code code2 > + = gimple_assign_rhs_code (SSA_NAME_DEF_STMT (match_op[0])); > + > + /* BIT_CLEAR: A &= ~(1 << bit) > + BIT_RESET: A ^= (1 << bit). > + BIT_IOR: A |= (1 << bit) > + BIT_ZERO: A &= (1 << bit) > + BIT_ALL: A |= ~(1 << bit) > + BIT_XOR: A ^= ~(1 << bit). > + bit is induction variable. */ > + switch (code1) > + { > + case BIT_AND_EXPR: > + induction_kind = code2 == BIT_NOT_EXPR > + ? INDUCTION_BIT_CLEAR > + : INDUCTION_ZERO; > + break; > + case BIT_IOR_EXPR: > + induction_kind = code2 == BIT_NOT_EXPR > + ? INDUCTION_ALL > + : INDUCTION_BIT_IOR; > + break; > + case BIT_XOR_EXPR: > + induction_kind = code2 == BIT_NOT_EXPR > + ? INDUCTION_BIT_XOR > + : INDUCTION_BIT_RESET; > + break; > + /* A ^ ~(1 << bit) is equal to ~(A ^ (1 << bit)). */ > + case BIT_NOT_EXPR: > + gcc_assert (code2 == BIT_XOR_EXPR); > + induction_kind = INDUCTION_BIT_XOR; > + break; > + default: > + gcc_unreachable(); > + } > + > + if (induction_kind == INDUCTION_ZERO) > + return build_zero_cst (type); is that true even when the loop doesn't iterate? > + if (induction_kind == INDUCTION_ALL) > + return build_all_ones_cst (type); Likewise? > + > + wide_int bits = wi::zero (TYPE_PRECISION (type)); > + HOST_WIDE_INT start = tree_to_shwi (CHREC_LEFT (bitwise_scev)); you checked fits_uhwi above, name it bit_start? > + HOST_WIDE_INT step = tree_to_shwi (CHREC_RIGHT (bitwise_scev)); > + unsigned HOST_WIDE_INT num_iter = tree_to_uhwi (niter); > + > + /* Loop tripcount should be num_iter + 1. */ > + for (unsigned i = 0; i != num_iter + 1; i++) > + { > + bits = wi::bit_or (bits, > + wi::lshift (wi::one (TYPE_PRECISION (type)), > + start)); you can use bits = wi::set_bit (bits, start); > + start += step; > + } Uh. You don't limit 'num_iter' but you limit bit_final. So supposed that the IV wraps and is unsigned char and you have a 256 bit integer the above could run quite a long time. Please limit num_iter to TYPE_PRECISION as well (do you have a testcase with a wrapping IV? try some != CST exit condition and a large increment) > + > + bool inverted = false; > + switch (induction_kind) > + { > + case INDUCTION_BIT_CLEAR: > + code1 = BIT_AND_EXPR; > + inverted = true; > + break; > + case INDUCTION_BIT_IOR: > + code1 = BIT_IOR_EXPR; > + break; > + case INDUCTION_BIT_RESET: > + code1 = BIT_XOR_EXPR; > + break; > + /* A ^= ~(1 << bit) is special, when loop tripcount is even, > + it's equal to A ^= bits, else A ^= ~bits. */ > + case INDUCTION_BIT_XOR: > + code1 = BIT_XOR_EXPR; > + if (num_iter % 2 == 0) > + inverted = true; > + break; > + default: > + gcc_unreachable (); > + } > + > + if (inverted) > + bits = wi::bit_not (bits); > + return fold_build2 (code1, type, inv, wide_int_to_tree (type, bits)); > +} > + > /* Do final value replacement for LOOP, return true if we did anything. */ > > bool > @@ -3519,7 +3677,8 @@ final_value_replacement_loop (class loop *loop) > { > gphi *phi = psi.phi (); > tree rslt = PHI_RESULT (phi); > - tree def = PHI_ARG_DEF_FROM_EDGE (phi, exit); > + tree phidef = PHI_ARG_DEF_FROM_EDGE (phi, exit); > + tree def = phidef; > if (virtual_operand_p (def)) > { > gsi_next (&psi); > @@ -3537,6 +3696,23 @@ final_value_replacement_loop (class loop *loop) > def = analyze_scalar_evolution_in_loop (ex_loop, loop, def, > &folded_casts); > def = compute_overall_effect_of_inner_loop (ex_loop, def); > + > + /* Handle bitwise induction expression. > + > + .i.e. > + for (int i = 0; i != 64; i+=3) > + res &= ~(1UL << i); > + > + RES can't be analyzed out by SCEV because it is not polynomially > + expressible, but in fact final value of RES can be replaced by > + RES & CONSTANT where CONSTANT all ones with bit {0,3,6,9,... ,63} > + being cleared, similar for BIT_IOR_EXPR/BIT_XOR_EXPR. */ > + if (tree_fits_uhwi_p (niter) > + && tree_to_uhwi (niter) that's redundant - ah, it's the not iterating case that's excluded. please write it as tree_to_uhwi (niter) != 0, also consider assigning to an unsigned HOST_WIDE_INT and pass it that way to analyze_and_compute_bitwise_induction_effect > + && tree_to_uhwi (niter) != HOST_WIDE_INT_M1U) > + def = analyze_and_compute_bitwise_induction_effect (ex_loop, loop, > + phidef, def, niter); > + it seems you overwrite the SCEV analysis result unconditionally here, please move this inside of the following if () so it is done as fallback only (or add it as && !(def = ...) to the condition) > if (!tree_does_not_contain_chrecs (def) > || chrec_contains_symbols_defined_in_loop (def, ex_loop->num) > /* Moving the computation from the loop may prolong life range > -- > 2.18.1 >
On Wed, May 11, 2022 at 4:45 PM Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > On Mon, May 9, 2022 at 7:19 AM liuhongt <hongtao.liu@intel.com> wrote: > > > > This patch will enable below optimization: > > > > { > > - int bit; > > - long long unsigned int _1; > > - long long unsigned int _2; > > - > > <bb 2> [local count: 46707768]: > > - > > - <bb 3> [local count: 1027034057]: > > - # tmp_11 = PHI <tmp_8(3), tmp_6(D)(2)> > > - # bit_13 = PHI <bit_9(3), 63(2)> > > - _1 = 1 << bit_13; > > - _2 = ~_1; > > - tmp_8 = _2 & tmp_11; > > - bit_9 = bit_13 + -3; > > - if (bit_9 != -3(OVF)) > > - goto <bb 3>; [95.65%] > > - else > > - goto <bb 4>; [4.35%] > > - > > - <bb 4> [local count: 46707768]: > > - return tmp_8; > > + tmp_12 = tmp_6(D) & 7905747460161236406; > > + return tmp_12; > > > > } > > > > > > Boostrapped and regtested on x86_64-pc-linux-gnu{-m32,} > > Ok for trunk? > > > > gcc/ChangeLog: > > > > PR middle-end/103462 > > * match.pd (bitwise_induction_p): New match. > > * tree-scalar-evolution.c (gimple_bitwise_induction_p): > > Declare. > > (analyze_and_compute_bitwise_induction_effect): New function. > > (enum bit_op_kind): New enum. > > (final_value_replacement_loop): Enhanced to handle bitwise > > induction. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/i386/pr103462-1.c: New test. > > * gcc.target/i386/pr103462-2.c: New test. > > * gcc.target/i386/pr103462-3.c: New test. > > * gcc.target/i386/pr103462-4.c: New test. > > * gcc.target/i386/pr103462-5.c: New test. > > * gcc.target/i386/pr103462-6.c: New test. > > --- > > gcc/match.pd | 7 + > > gcc/testsuite/gcc.target/i386/pr103462-1.c | 111 +++++++++++++ > > gcc/testsuite/gcc.target/i386/pr103462-2.c | 45 ++++++ > > gcc/testsuite/gcc.target/i386/pr103462-3.c | 111 +++++++++++++ > > gcc/testsuite/gcc.target/i386/pr103462-4.c | 46 ++++++ > > gcc/testsuite/gcc.target/i386/pr103462-5.c | 111 +++++++++++++ > > gcc/testsuite/gcc.target/i386/pr103462-6.c | 46 ++++++ > > gcc/tree-scalar-evolution.cc | 178 ++++++++++++++++++++- > > 8 files changed, 654 insertions(+), 1 deletion(-) > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-1.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-2.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-3.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-4.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-5.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-6.c > > > > diff --git a/gcc/match.pd b/gcc/match.pd > > index 6d691d302b3..24ff5f9e6a8 100644 > > --- a/gcc/match.pd > > +++ b/gcc/match.pd > > @@ -7746,3 +7746,10 @@ and, > > == TYPE_UNSIGNED (TREE_TYPE (@3)))) > > && single_use (@4) > > && single_use (@5)))) > > + > > +(for bit_op (bit_and bit_ior bit_xor) > > + (match (bitwise_induction_p @0 @2 @3) > > + (bit_op:c (nop_convert1? (bit_not2?@0 (convert3? (lshift integer_onep@1 @2)))) @3))) > > + > > +(match (bitwise_induction_p @0 @2 @3) > > + (bit_not (nop_convert1? (bit_xor@0 (convert2? (lshift integer_onep@1 @2)) @3)))) > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-1.c b/gcc/testsuite/gcc.target/i386/pr103462-1.c > > new file mode 100644 > > index 00000000000..1dc4c2acad6 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-1.c > > @@ -0,0 +1,111 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O1 -fdump-tree-sccp-details" } */ > > +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */ > > + > > +unsigned long long > > +__attribute__((noipa)) > > +foo (unsigned long long tmp) > > +{ > > + for (int bit = 0; bit < 64; bit += 3) > > + tmp &= ~(1ULL << bit); > > + return tmp; > > +} > > + > > +unsigned long long > > +__attribute__((noipa)) > > +foo1 (unsigned long long tmp) > > +{ > > + for (int bit = 63; bit >= 0; bit -= 3) > > + tmp &= ~(1ULL << bit); > > + return tmp; > > +} > > + > > +unsigned long long > > +__attribute__((noipa)) > > +foo2 (unsigned long long tmp) > > +{ > > + for (int bit = 0; bit < 64; bit += 3) > > + tmp &= (1ULL << bit); > > + return tmp; > > +} > > + > > +unsigned long long > > +__attribute__((noipa)) > > +foo3 (unsigned long long tmp) > > +{ > > + for (int bit = 63; bit >= 0; bit -= 3) > > + tmp &= (1ULL << bit); > > + return tmp; > > +} > > + > > +unsigned long long > > +__attribute__((noipa)) > > +foo4 (unsigned long long tmp) > > +{ > > + for (int bit = 0; bit < 64; bit += 3) > > + tmp |= ~(1ULL << bit); > > + return tmp; > > +} > > + > > +unsigned long long > > +__attribute__((noipa)) > > +foo5 (unsigned long long tmp) > > +{ > > + for (int bit = 63; bit >= 0; bit -= 3) > > + tmp |= ~(1ULL << bit); > > + return tmp; > > +} > > + > > +unsigned long long > > +__attribute__((noipa)) > > +foo6 (unsigned long long tmp) > > +{ > > + for (int bit = 0; bit < 64; bit += 3) > > + tmp |= (1ULL << bit); > > + return tmp; > > +} > > + > > +unsigned long long > > +__attribute__((noipa)) > > +foo7 (unsigned long long tmp) > > +{ > > + for (int bit = 63; bit >= 0; bit -= 3) > > + tmp |= (1ULL << bit); > > + return tmp; > > +} > > + > > +unsigned long long > > +__attribute__((noipa)) > > +foo8 (unsigned long long tmp) > > +{ > > + for (int bit = 0; bit < 64; bit += 3) > > + tmp ^= ~(1ULL << bit); > > + return tmp; > > +} > > + > > +unsigned long long > > +__attribute__((noipa)) > > +foo9 (unsigned long long tmp) > > +{ > > + for (int bit = 63; bit >= 0; bit -= 3) > > + tmp ^= ~(1ULL << bit); > > + return tmp; > > +} > > + > > +unsigned long long > > +__attribute__((noipa)) > > +foo10 (unsigned long long tmp) > > +{ > > + for (int bit = 0; bit < 64; bit += 3) > > + tmp ^= (1ULL << bit); > > + return tmp; > > +} > > + > > +unsigned long long > > +__attribute__((noipa)) > > +foo11 (unsigned long long tmp) > > +{ > > + for (int bit = 63; bit >= 0; bit -= 3) > > + tmp ^= (1ULL << bit); > > + return tmp; > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-2.c b/gcc/testsuite/gcc.target/i386/pr103462-2.c > > new file mode 100644 > > index 00000000000..bc375cb78d4 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-2.c > > @@ -0,0 +1,45 @@ > > +/* { dg-do run } */ > > +/* { dg-options "-O1" } */ > > + > > +#include "pr103462-1.c" > > + > > +int main() > > +{ > > + unsigned long long tmp = 0x1111111111111111ULL; > > + if (foo (tmp) != 0x110110110110110ULL) > > + __builtin_abort (); > > + > > + if (foo1 (tmp) != 0x110110110110110ULL) > > + __builtin_abort (); > > + > > + if (foo2 (tmp) != 0x0ULL) > > + __builtin_abort (); > > + > > + if (foo3 (tmp) != 0x0ULL) > > + __builtin_abort (); > > + > > + if (foo4 (tmp) != 0xffffffffffffffffULL) > > + __builtin_abort (); > > + > > + if (foo5 (tmp) != 0xffffffffffffffffULL) > > + __builtin_abort (); > > + > > + if (foo6 (tmp) != 0x9359359359359359ULL) > > + __builtin_abort (); > > + > > + if (foo7 (tmp) != 0x9359359359359359ULL) > > + __builtin_abort (); > > + > > + if (foo8 (tmp) != 0x8358358358358358ULL) > > + __builtin_abort (); > > + > > + if (foo9 (tmp) != 0x8358358358358358ULL) > > + __builtin_abort (); > > + > > + if (foo10 (tmp) != 0x8358358358358358ULL) > > + __builtin_abort (); > > + > > + if (foo11 (tmp) != 0x8358358358358358ULL) > > + __builtin_abort (); > > +} > > + > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-3.c b/gcc/testsuite/gcc.target/i386/pr103462-3.c > > new file mode 100644 > > index 00000000000..4ba248a4872 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-3.c > > @@ -0,0 +1,111 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O1 -fdump-tree-sccp-details" } */ > > +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */ > > + > > +unsigned int > > +__attribute__((noipa)) > > +foo (unsigned int tmp) > > +{ > > + for (int bit = 0; bit < 32; bit += 3) > > + tmp &= ~(1U << bit); > > + return tmp; > > +} > > + > > +unsigned int > > +__attribute__((noipa)) > > +foo1 (unsigned int tmp) > > +{ > > + for (int bit = 31; bit >= 0; bit -= 3) > > + tmp &= ~(1U << bit); > > + return tmp; > > +} > > + > > +unsigned int > > +__attribute__((noipa)) > > +foo2 (unsigned int tmp) > > +{ > > + for (int bit = 0; bit < 32; bit += 3) > > + tmp &= (1U << bit); > > + return tmp; > > +} > > + > > +unsigned int > > +__attribute__((noipa)) > > +foo3 (unsigned int tmp) > > +{ > > + for (int bit = 31; bit >= 0; bit -= 3) > > + tmp &= (1U << bit); > > + return tmp; > > +} > > + > > +unsigned int > > +__attribute__((noipa)) > > +foo4 (unsigned int tmp) > > +{ > > + for (int bit = 0; bit < 32; bit += 3) > > + tmp |= ~(1U << bit); > > + return tmp; > > +} > > + > > +unsigned int > > +__attribute__((noipa)) > > +foo5 (unsigned int tmp) > > +{ > > + for (int bit = 31; bit >= 0; bit -= 3) > > + tmp |= ~(1U << bit); > > + return tmp; > > +} > > + > > +unsigned int > > +__attribute__((noipa)) > > +foo6 (unsigned int tmp) > > +{ > > + for (int bit = 0; bit < 32; bit += 3) > > + tmp |= (1U << bit); > > + return tmp; > > +} > > + > > +unsigned int > > +__attribute__((noipa)) > > +foo7 (unsigned int tmp) > > +{ > > + for (int bit = 31; bit >= 0; bit -= 3) > > + tmp |= (1U << bit); > > + return tmp; > > +} > > + > > +unsigned int > > +__attribute__((noipa)) > > +foo8 (unsigned int tmp) > > +{ > > + for (int bit = 0; bit < 32; bit += 3) > > + tmp ^= ~(1U << bit); > > + return tmp; > > +} > > + > > +unsigned int > > +__attribute__((noipa)) > > +foo9 (unsigned int tmp) > > +{ > > + for (int bit = 31; bit >= 0; bit -= 3) > > + tmp ^= ~(1U << bit); > > + return tmp; > > +} > > + > > +unsigned int > > +__attribute__((noipa)) > > +foo10 (unsigned int tmp) > > +{ > > + for (int bit = 0; bit < 32; bit += 3) > > + tmp ^= (1U << bit); > > + return tmp; > > +} > > + > > +unsigned int > > +__attribute__((noipa)) > > +foo11 (unsigned int tmp) > > +{ > > + for (int bit = 31; bit >= 0; bit -= 3) > > + tmp ^= (1U << bit); > > + return tmp; > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-4.c b/gcc/testsuite/gcc.target/i386/pr103462-4.c > > new file mode 100644 > > index 00000000000..e2f4056f525 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-4.c > > @@ -0,0 +1,46 @@ > > +/* { dg-do run } */ > > +/* { dg-options "-O1" } */ > > + > > +#include "pr103462-3.c" > > + > > +int main() > > +{ > > + unsigned int tmp = 0x11111111U; > > + > > + if (foo (tmp) != 0x10110110U) > > + __builtin_abort (); > > + > > + if (foo1 (tmp) != 0x1101101U) > > + __builtin_abort (); > > + > > + if (foo2 (tmp) != 0x0U) > > + __builtin_abort (); > > + > > + if (foo3 (tmp) != 0x0U) > > + __builtin_abort (); > > + > > + if (foo4 (tmp) != 0xffffffffU) > > + __builtin_abort (); > > + > > + if (foo5 (tmp) != 0xffffffffU) > > + __builtin_abort (); > > + > > + if (foo6 (tmp) != 0x59359359U) > > + __builtin_abort (); > > + > > + if (foo7 (tmp) != 0x93593593U) > > + __builtin_abort (); > > + > > + if (foo8 (tmp) != 0xa7ca7ca7U) > > + __builtin_abort (); > > + > > + if (foo9 (tmp) != 0x7ca7ca7cU) > > + __builtin_abort (); > > + > > + if (foo10 (tmp) != 0x58358358U) > > + __builtin_abort (); > > + > > + if (foo11 (tmp) != 0x83583583U) > > + __builtin_abort (); > > +} > > + > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-5.c b/gcc/testsuite/gcc.target/i386/pr103462-5.c > > new file mode 100644 > > index 00000000000..1f4ffa34b48 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-5.c > > @@ -0,0 +1,111 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O1 -fdump-tree-sccp-details" } */ > > +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */ > > + > > +unsigned short > > +__attribute__((noipa)) > > +foo (unsigned short tmp) > > +{ > > + for (int bit = 0; bit < 16; bit += 3) > > + tmp &= ~(1U << bit); > > + return tmp; > > +} > > + > > +unsigned short > > +__attribute__((noipa)) > > +foo1 (unsigned short tmp) > > +{ > > + for (int bit = 15; bit >= 0; bit -= 3) > > + tmp &= ~(1U << bit); > > + return tmp; > > +} > > + > > +unsigned short > > +__attribute__((noipa)) > > +foo2 (unsigned short tmp) > > +{ > > + for (int bit = 0; bit < 16; bit += 3) > > + tmp &= (1U << bit); > > + return tmp; > > +} > > + > > +unsigned short > > +__attribute__((noipa)) > > +foo3 (unsigned short tmp) > > +{ > > + for (int bit = 15; bit >= 0; bit -= 3) > > + tmp &= (1U << bit); > > + return tmp; > > +} > > + > > +unsigned short > > +__attribute__((noipa)) > > +foo4 (unsigned short tmp) > > +{ > > + for (int bit = 0; bit < 16; bit += 3) > > + tmp |= ~(1U << bit); > > + return tmp; > > +} > > + > > +unsigned short > > +__attribute__((noipa)) > > +foo5 (unsigned short tmp) > > +{ > > + for (int bit = 15; bit >= 0; bit -= 3) > > + tmp |= ~(1U << bit); > > + return tmp; > > +} > > + > > +unsigned short > > +__attribute__((noipa)) > > +foo6 (unsigned short tmp) > > +{ > > + for (int bit = 0; bit < 16; bit += 3) > > + tmp |= (1U << bit); > > + return tmp; > > +} > > + > > +unsigned short > > +__attribute__((noipa)) > > +foo7 (unsigned short tmp) > > +{ > > + for (int bit = 15; bit >= 0; bit -= 3) > > + tmp |= (1U << bit); > > + return tmp; > > +} > > + > > +unsigned short > > +__attribute__((noipa)) > > +foo8 (unsigned short tmp) > > +{ > > + for (int bit = 0; bit < 16; bit += 3) > > + tmp ^= ~(1U << bit); > > + return tmp; > > +} > > + > > +unsigned short > > +__attribute__((noipa)) > > +foo9 (unsigned short tmp) > > +{ > > + for (int bit = 15; bit >= 0; bit -= 3) > > + tmp ^= ~(1U << bit); > > + return tmp; > > +} > > + > > +unsigned short > > +__attribute__((noipa)) > > +foo10 (unsigned short tmp) > > +{ > > + for (int bit = 0; bit < 16; bit += 3) > > + tmp ^= (1U << bit); > > + return tmp; > > +} > > + > > +unsigned short > > +__attribute__((noipa)) > > +foo11 (unsigned short tmp) > > +{ > > + for (int bit = 15; bit >= 0; bit -= 3) > > + tmp ^= (1U << bit); > > + return tmp; > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-6.c b/gcc/testsuite/gcc.target/i386/pr103462-6.c > > new file mode 100644 > > index 00000000000..65426d71efe > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-6.c > > @@ -0,0 +1,46 @@ > > +/* { dg-do run } */ > > +/* { dg-options "-O1" } */ > > + > > +#include "pr103462-5.c" > > + > > +int main() > > +{ > > + unsigned short tmp = 0x1111U; > > + > > + if (foo (tmp) != 0x110) > > + __builtin_abort (); > > + > > + if (foo1 (tmp) != 0x110) > > + __builtin_abort (); > > + > > + if (foo2 (tmp) != 0x0) > > + __builtin_abort (); > > + > > + if (foo3 (tmp) != 0x0) > > + __builtin_abort (); > > + > > + if (foo4 (tmp) != 0xffff) > > + __builtin_abort (); > > + > > + if (foo5 (tmp) != 0xffff) > > + __builtin_abort (); > > + > > + if (foo6 (tmp) != 0x9359) > > + __builtin_abort (); > > + > > + if (foo7 (tmp) != 0x9359) > > + __builtin_abort (); > > + > > + if (foo8 (tmp) != 0x8358) > > + __builtin_abort (); > > + > > + if (foo9 (tmp) != 0x8358) > > + __builtin_abort (); > > + > > + if (foo10 (tmp) != 0x8358) > > + __builtin_abort (); > > + > > + if (foo11 (tmp) != 0x8358) > > + __builtin_abort (); > > +} > > + > > diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc > > index 72ceb4001e3..9b0aec4fd09 100644 > > --- a/gcc/tree-scalar-evolution.cc > > +++ b/gcc/tree-scalar-evolution.cc > > @@ -3487,6 +3487,164 @@ expression_expensive_p (tree expr) > > || expanded_size > cache.elements ()); > > } > > > > +/* Match.pd function to match bitwise inductive expression. > > + .i.e. > > + _2 = 1 << _1; > > + _3 = ~_2; > > + tmp_9 = _3 & tmp_12; */ > > +extern bool gimple_bitwise_induction_p (tree, tree *, tree (*)(tree)); > > + > > +/* Return the inductive expression of bitwise operation if possible, > > + otherwise returns DEF. */ > > +static tree > > +analyze_and_compute_bitwise_induction_effect (class loop* ex_loop, > > + class loop* loop, > > + tree phidef, > > + tree def, > > + tree niter) > > +{ > > + tree match_op[3],inv, bitwise_scev, bitwise_res; > > + bool folded_casts = false; > > + tree type = TREE_TYPE (phidef); > > + gimple* header_phi = NULL; > > use a gphi * > Changed. > > + > > + /* Match things like op2(MATCH_OP[2]), op1(MATCH_OP[1]), phidef(PHIDEF) > > + > > + op2 = PHI <phidef, inv> > > + _1 = (int) bit_17; > > + _3 = 1 << _1; > > + op1 = ~_3; > > + phidef = op1 & op2; */ > > + if (!gimple_bitwise_induction_p (phidef, &match_op[0], NULL) > > + || TREE_CODE (match_op[2]) != SSA_NAME > > + || gimple_code (SSA_NAME_DEF_STMT (match_op[2])) != GIMPLE_PHI > > || !(header_phi = dyn_cast <gphi *> (SSA_NAME_DEF_STMT (match_op[2]))) > > > + || gimple_phi_num_args (SSA_NAME_DEF_STMT (match_op[2])) != 2) > > and then use header_phi here, if you pass in the PHI to use you'll > always have 2 args changed. > > > + return def; > > + > > + header_phi = SSA_NAME_DEF_STMT (match_op[2]); > > + if (gimple_bb (header_phi) != loop->header) > > + return def; > > I think passing in the PHI node and the exit edge instead of phidef > would make the above more clear Remove the upper condition. > > > + if (PHI_ARG_DEF_FROM_EDGE (header_phi, loop_latch_edge (loop)) != phidef) > > + return def; > > + > > + inv = gimple_phi_arg_def (header_phi, 0); > > + if (inv == phidef) > > + inv = gimple_phi_arg_def (header_phi, 1); > > inv = gimple_phi_arg_def_from_edge (header_phi, loop_preheader_edge (loop)); > > where is this used? Move to the place where it's used. > > > + > > + bitwise_scev = analyze_scalar_evolution_in_loop (ex_loop, loop, > > + match_op[1], > > + &folded_casts); > > you want > > bitwise_scev = analyze_scalar_evolution (loop, match_op[1]); > bitwise_scev = instantiate_parameters (loop, bitwise_scev); > > instead since you are interested in in-loop behavior? Yes, changed. > > > + > > + /* Make sure bits is in range of type precision. */ > > + if (TREE_CODE (bitwise_scev) != POLYNOMIAL_CHREC > > I think you want to test || !INTEGRAL_TYPE_P (bitwise_scev) since you > do not rule out pointers or vector types. Changed. > > > + || !tree_fits_uhwi_p (CHREC_LEFT (bitwise_scev)) > > + || tree_to_uhwi (CHREC_LEFT (bitwise_scev)) >= TYPE_PRECISION (type) > > + || !tree_fits_shwi_p (CHREC_RIGHT (bitwise_scev))) > > + return def; > > + > > + bitwise_res = compute_overall_effect_of_inner_loop (ex_loop, bitwise_scev); > > + if (!tree_fits_uhwi_p (bitwise_res) > > + || tree_to_uhwi (bitwise_res) >= TYPE_PRECISION (type)) > > + return def; > > Hmm, so I think that bitwise_res is the final value of the 'bit' IV, correct? > So name it bit_final? Changed. > > > + > > +enum bit_op_kind > > + { > > + INDUCTION_BIT_CLEAR, > > + INDUCTION_BIT_IOR, > > + INDUCTION_BIT_XOR, > > + INDUCTION_BIT_RESET, > > + INDUCTION_ZERO, > > + INDUCTION_ALL > > + }; > > + > > + enum bit_op_kind induction_kind; > > + enum tree_code code1 > > + = gimple_assign_rhs_code (SSA_NAME_DEF_STMT (phidef)); > > + enum tree_code code2 > > + = gimple_assign_rhs_code (SSA_NAME_DEF_STMT (match_op[0])); > > + > > + /* BIT_CLEAR: A &= ~(1 << bit) > > + BIT_RESET: A ^= (1 << bit). > > + BIT_IOR: A |= (1 << bit) > > + BIT_ZERO: A &= (1 << bit) > > + BIT_ALL: A |= ~(1 << bit) > > + BIT_XOR: A ^= ~(1 << bit). > > + bit is induction variable. */ > > + switch (code1) > > + { > > + case BIT_AND_EXPR: > > + induction_kind = code2 == BIT_NOT_EXPR > > + ? INDUCTION_BIT_CLEAR > > + : INDUCTION_ZERO; > > + break; > > + case BIT_IOR_EXPR: > > + induction_kind = code2 == BIT_NOT_EXPR > > + ? INDUCTION_ALL > > + : INDUCTION_BIT_IOR; > > + break; > > + case BIT_XOR_EXPR: > > + induction_kind = code2 == BIT_NOT_EXPR > > + ? INDUCTION_BIT_XOR > > + : INDUCTION_BIT_RESET; > > + break; > > + /* A ^ ~(1 << bit) is equal to ~(A ^ (1 << bit)). */ > > + case BIT_NOT_EXPR: > > + gcc_assert (code2 == BIT_XOR_EXPR); > > + induction_kind = INDUCTION_BIT_XOR; > > + break; > > + default: > > + gcc_unreachable(); > > + } > > + > > + if (induction_kind == INDUCTION_ZERO) > > + return build_zero_cst (type); > > is that true even when the loop doesn't iterate? > > > + if (induction_kind == INDUCTION_ALL) > > + return build_all_ones_cst (type); > > Likewise? the niter is known > 0 and < TYPE_PRECISION (TYPE_SIZE (type), so it must be iterated. > > > + > > + wide_int bits = wi::zero (TYPE_PRECISION (type)); > > + HOST_WIDE_INT start = tree_to_shwi (CHREC_LEFT (bitwise_scev)); > > you checked fits_uhwi above, name it bit_start? > > > + HOST_WIDE_INT step = tree_to_shwi (CHREC_RIGHT (bitwise_scev)); > > + unsigned HOST_WIDE_INT num_iter = tree_to_uhwi (niter); > > + > > + /* Loop tripcount should be num_iter + 1. */ > > + for (unsigned i = 0; i != num_iter + 1; i++) > > + { > > + bits = wi::bit_or (bits, > > + wi::lshift (wi::one (TYPE_PRECISION (type)), > > + start)); > > you can use bits = wi::set_bit (bits, start); Changed. > > > + start += step; > > + } > > Uh. You don't limit 'num_iter' but you limit bit_final. So supposed that > the IV wraps and is unsigned char and you have a 256 bit integer the > above could run quite a long time. Please limit num_iter to TYPE_PRECISION > as well (do you have a testcase with a wrapping IV? try some != CST exit > condition and a large increment) Limit to > 0 and < TYPE_SIZE. > > > + > > + bool inverted = false; > > + switch (induction_kind) > > + { > > + case INDUCTION_BIT_CLEAR: > > + code1 = BIT_AND_EXPR; > > + inverted = true; > > + break; > > + case INDUCTION_BIT_IOR: > > + code1 = BIT_IOR_EXPR; > > + break; > > + case INDUCTION_BIT_RESET: > > + code1 = BIT_XOR_EXPR; > > + break; > > + /* A ^= ~(1 << bit) is special, when loop tripcount is even, > > + it's equal to A ^= bits, else A ^= ~bits. */ > > + case INDUCTION_BIT_XOR: > > + code1 = BIT_XOR_EXPR; > > + if (num_iter % 2 == 0) > > + inverted = true; > > + break; > > + default: > > + gcc_unreachable (); > > + } > > + > > + if (inverted) > > + bits = wi::bit_not (bits); > > + return fold_build2 (code1, type, inv, wide_int_to_tree (type, bits)); > > +} > > + > > /* Do final value replacement for LOOP, return true if we did anything. */ > > > > bool > > @@ -3519,7 +3677,8 @@ final_value_replacement_loop (class loop *loop) > > { > > gphi *phi = psi.phi (); > > tree rslt = PHI_RESULT (phi); > > - tree def = PHI_ARG_DEF_FROM_EDGE (phi, exit); > > + tree phidef = PHI_ARG_DEF_FROM_EDGE (phi, exit); > > + tree def = phidef; > > if (virtual_operand_p (def)) > > { > > gsi_next (&psi); > > @@ -3537,6 +3696,23 @@ final_value_replacement_loop (class loop *loop) > > def = analyze_scalar_evolution_in_loop (ex_loop, loop, def, > > &folded_casts); > > def = compute_overall_effect_of_inner_loop (ex_loop, def); > > + > > + /* Handle bitwise induction expression. > > + > > + .i.e. > > + for (int i = 0; i != 64; i+=3) > > + res &= ~(1UL << i); > > + > > + RES can't be analyzed out by SCEV because it is not polynomially > > + expressible, but in fact final value of RES can be replaced by > > + RES & CONSTANT where CONSTANT all ones with bit {0,3,6,9,... ,63} > > + being cleared, similar for BIT_IOR_EXPR/BIT_XOR_EXPR. */ > > + if (tree_fits_uhwi_p (niter) > > + && tree_to_uhwi (niter) > > that's redundant - ah, it's the not iterating case that's excluded. > please write it > as tree_to_uhwi (niter) != 0, also consider assigning to an unsigned > HOST_WIDE_INT > and pass it that way to analyze_and_compute_bitwise_induction_effect Changed. > > > + && tree_to_uhwi (niter) != HOST_WIDE_INT_M1U) > > + def = analyze_and_compute_bitwise_induction_effect (ex_loop, loop, > > + phidef, def, niter); > > + > > it seems you overwrite the SCEV analysis result unconditionally here, please > move this inside of the following if () so it is done as fallback only > (or add it as && !(def = ...) to the condition) original def will be returned when bitwise induction fails which means rewriting is ok, but i think it's better to not pass original def and use your suggestion. Changed. > > > if (!tree_does_not_contain_chrecs (def) > > || chrec_contains_symbols_defined_in_loop (def, ex_loop->num) > > /* Moving the computation from the loop may prolong life range > > -- > > 2.18.1 > > Here's an updated V2 patch.
On Fri, May 13, 2022 at 5:37 AM Hongtao Liu <crazylht@gmail.com> wrote: > > On Wed, May 11, 2022 at 4:45 PM Richard Biener via Gcc-patches > <gcc-patches@gcc.gnu.org> wrote: > > > > On Mon, May 9, 2022 at 7:19 AM liuhongt <hongtao.liu@intel.com> wrote: > > > > > > This patch will enable below optimization: > > > > > > { > > > - int bit; > > > - long long unsigned int _1; > > > - long long unsigned int _2; > > > - > > > <bb 2> [local count: 46707768]: > > > - > > > - <bb 3> [local count: 1027034057]: > > > - # tmp_11 = PHI <tmp_8(3), tmp_6(D)(2)> > > > - # bit_13 = PHI <bit_9(3), 63(2)> > > > - _1 = 1 << bit_13; > > > - _2 = ~_1; > > > - tmp_8 = _2 & tmp_11; > > > - bit_9 = bit_13 + -3; > > > - if (bit_9 != -3(OVF)) > > > - goto <bb 3>; [95.65%] > > > - else > > > - goto <bb 4>; [4.35%] > > > - > > > - <bb 4> [local count: 46707768]: > > > - return tmp_8; > > > + tmp_12 = tmp_6(D) & 7905747460161236406; > > > + return tmp_12; > > > > > > } > > > > > > > > > Boostrapped and regtested on x86_64-pc-linux-gnu{-m32,} > > > Ok for trunk? > > > > > > gcc/ChangeLog: > > > > > > PR middle-end/103462 > > > * match.pd (bitwise_induction_p): New match. > > > * tree-scalar-evolution.c (gimple_bitwise_induction_p): > > > Declare. > > > (analyze_and_compute_bitwise_induction_effect): New function. > > > (enum bit_op_kind): New enum. > > > (final_value_replacement_loop): Enhanced to handle bitwise > > > induction. > > > > > > gcc/testsuite/ChangeLog: > > > > > > * gcc.target/i386/pr103462-1.c: New test. > > > * gcc.target/i386/pr103462-2.c: New test. > > > * gcc.target/i386/pr103462-3.c: New test. > > > * gcc.target/i386/pr103462-4.c: New test. > > > * gcc.target/i386/pr103462-5.c: New test. > > > * gcc.target/i386/pr103462-6.c: New test. > > > --- > > > gcc/match.pd | 7 + > > > gcc/testsuite/gcc.target/i386/pr103462-1.c | 111 +++++++++++++ > > > gcc/testsuite/gcc.target/i386/pr103462-2.c | 45 ++++++ > > > gcc/testsuite/gcc.target/i386/pr103462-3.c | 111 +++++++++++++ > > > gcc/testsuite/gcc.target/i386/pr103462-4.c | 46 ++++++ > > > gcc/testsuite/gcc.target/i386/pr103462-5.c | 111 +++++++++++++ > > > gcc/testsuite/gcc.target/i386/pr103462-6.c | 46 ++++++ > > > gcc/tree-scalar-evolution.cc | 178 ++++++++++++++++++++- > > > 8 files changed, 654 insertions(+), 1 deletion(-) > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-1.c > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-2.c > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-3.c > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-4.c > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-5.c > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-6.c > > > > > > diff --git a/gcc/match.pd b/gcc/match.pd > > > index 6d691d302b3..24ff5f9e6a8 100644 > > > --- a/gcc/match.pd > > > +++ b/gcc/match.pd > > > @@ -7746,3 +7746,10 @@ and, > > > == TYPE_UNSIGNED (TREE_TYPE (@3)))) > > > && single_use (@4) > > > && single_use (@5)))) > > > + > > > +(for bit_op (bit_and bit_ior bit_xor) > > > + (match (bitwise_induction_p @0 @2 @3) > > > + (bit_op:c (nop_convert1? (bit_not2?@0 (convert3? (lshift integer_onep@1 @2)))) @3))) > > > + > > > +(match (bitwise_induction_p @0 @2 @3) > > > + (bit_not (nop_convert1? (bit_xor@0 (convert2? (lshift integer_onep@1 @2)) @3)))) > > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-1.c b/gcc/testsuite/gcc.target/i386/pr103462-1.c > > > new file mode 100644 > > > index 00000000000..1dc4c2acad6 > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-1.c > > > @@ -0,0 +1,111 @@ > > > +/* { dg-do compile } */ > > > +/* { dg-options "-O1 -fdump-tree-sccp-details" } */ > > > +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */ > > > + > > > +unsigned long long > > > +__attribute__((noipa)) > > > +foo (unsigned long long tmp) > > > +{ > > > + for (int bit = 0; bit < 64; bit += 3) > > > + tmp &= ~(1ULL << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned long long > > > +__attribute__((noipa)) > > > +foo1 (unsigned long long tmp) > > > +{ > > > + for (int bit = 63; bit >= 0; bit -= 3) > > > + tmp &= ~(1ULL << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned long long > > > +__attribute__((noipa)) > > > +foo2 (unsigned long long tmp) > > > +{ > > > + for (int bit = 0; bit < 64; bit += 3) > > > + tmp &= (1ULL << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned long long > > > +__attribute__((noipa)) > > > +foo3 (unsigned long long tmp) > > > +{ > > > + for (int bit = 63; bit >= 0; bit -= 3) > > > + tmp &= (1ULL << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned long long > > > +__attribute__((noipa)) > > > +foo4 (unsigned long long tmp) > > > +{ > > > + for (int bit = 0; bit < 64; bit += 3) > > > + tmp |= ~(1ULL << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned long long > > > +__attribute__((noipa)) > > > +foo5 (unsigned long long tmp) > > > +{ > > > + for (int bit = 63; bit >= 0; bit -= 3) > > > + tmp |= ~(1ULL << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned long long > > > +__attribute__((noipa)) > > > +foo6 (unsigned long long tmp) > > > +{ > > > + for (int bit = 0; bit < 64; bit += 3) > > > + tmp |= (1ULL << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned long long > > > +__attribute__((noipa)) > > > +foo7 (unsigned long long tmp) > > > +{ > > > + for (int bit = 63; bit >= 0; bit -= 3) > > > + tmp |= (1ULL << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned long long > > > +__attribute__((noipa)) > > > +foo8 (unsigned long long tmp) > > > +{ > > > + for (int bit = 0; bit < 64; bit += 3) > > > + tmp ^= ~(1ULL << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned long long > > > +__attribute__((noipa)) > > > +foo9 (unsigned long long tmp) > > > +{ > > > + for (int bit = 63; bit >= 0; bit -= 3) > > > + tmp ^= ~(1ULL << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned long long > > > +__attribute__((noipa)) > > > +foo10 (unsigned long long tmp) > > > +{ > > > + for (int bit = 0; bit < 64; bit += 3) > > > + tmp ^= (1ULL << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned long long > > > +__attribute__((noipa)) > > > +foo11 (unsigned long long tmp) > > > +{ > > > + for (int bit = 63; bit >= 0; bit -= 3) > > > + tmp ^= (1ULL << bit); > > > + return tmp; > > > +} > > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-2.c b/gcc/testsuite/gcc.target/i386/pr103462-2.c > > > new file mode 100644 > > > index 00000000000..bc375cb78d4 > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-2.c > > > @@ -0,0 +1,45 @@ > > > +/* { dg-do run } */ > > > +/* { dg-options "-O1" } */ > > > + > > > +#include "pr103462-1.c" > > > + > > > +int main() > > > +{ > > > + unsigned long long tmp = 0x1111111111111111ULL; > > > + if (foo (tmp) != 0x110110110110110ULL) > > > + __builtin_abort (); > > > + > > > + if (foo1 (tmp) != 0x110110110110110ULL) > > > + __builtin_abort (); > > > + > > > + if (foo2 (tmp) != 0x0ULL) > > > + __builtin_abort (); > > > + > > > + if (foo3 (tmp) != 0x0ULL) > > > + __builtin_abort (); > > > + > > > + if (foo4 (tmp) != 0xffffffffffffffffULL) > > > + __builtin_abort (); > > > + > > > + if (foo5 (tmp) != 0xffffffffffffffffULL) > > > + __builtin_abort (); > > > + > > > + if (foo6 (tmp) != 0x9359359359359359ULL) > > > + __builtin_abort (); > > > + > > > + if (foo7 (tmp) != 0x9359359359359359ULL) > > > + __builtin_abort (); > > > + > > > + if (foo8 (tmp) != 0x8358358358358358ULL) > > > + __builtin_abort (); > > > + > > > + if (foo9 (tmp) != 0x8358358358358358ULL) > > > + __builtin_abort (); > > > + > > > + if (foo10 (tmp) != 0x8358358358358358ULL) > > > + __builtin_abort (); > > > + > > > + if (foo11 (tmp) != 0x8358358358358358ULL) > > > + __builtin_abort (); > > > +} > > > + > > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-3.c b/gcc/testsuite/gcc.target/i386/pr103462-3.c > > > new file mode 100644 > > > index 00000000000..4ba248a4872 > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-3.c > > > @@ -0,0 +1,111 @@ > > > +/* { dg-do compile } */ > > > +/* { dg-options "-O1 -fdump-tree-sccp-details" } */ > > > +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */ > > > + > > > +unsigned int > > > +__attribute__((noipa)) > > > +foo (unsigned int tmp) > > > +{ > > > + for (int bit = 0; bit < 32; bit += 3) > > > + tmp &= ~(1U << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned int > > > +__attribute__((noipa)) > > > +foo1 (unsigned int tmp) > > > +{ > > > + for (int bit = 31; bit >= 0; bit -= 3) > > > + tmp &= ~(1U << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned int > > > +__attribute__((noipa)) > > > +foo2 (unsigned int tmp) > > > +{ > > > + for (int bit = 0; bit < 32; bit += 3) > > > + tmp &= (1U << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned int > > > +__attribute__((noipa)) > > > +foo3 (unsigned int tmp) > > > +{ > > > + for (int bit = 31; bit >= 0; bit -= 3) > > > + tmp &= (1U << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned int > > > +__attribute__((noipa)) > > > +foo4 (unsigned int tmp) > > > +{ > > > + for (int bit = 0; bit < 32; bit += 3) > > > + tmp |= ~(1U << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned int > > > +__attribute__((noipa)) > > > +foo5 (unsigned int tmp) > > > +{ > > > + for (int bit = 31; bit >= 0; bit -= 3) > > > + tmp |= ~(1U << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned int > > > +__attribute__((noipa)) > > > +foo6 (unsigned int tmp) > > > +{ > > > + for (int bit = 0; bit < 32; bit += 3) > > > + tmp |= (1U << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned int > > > +__attribute__((noipa)) > > > +foo7 (unsigned int tmp) > > > +{ > > > + for (int bit = 31; bit >= 0; bit -= 3) > > > + tmp |= (1U << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned int > > > +__attribute__((noipa)) > > > +foo8 (unsigned int tmp) > > > +{ > > > + for (int bit = 0; bit < 32; bit += 3) > > > + tmp ^= ~(1U << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned int > > > +__attribute__((noipa)) > > > +foo9 (unsigned int tmp) > > > +{ > > > + for (int bit = 31; bit >= 0; bit -= 3) > > > + tmp ^= ~(1U << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned int > > > +__attribute__((noipa)) > > > +foo10 (unsigned int tmp) > > > +{ > > > + for (int bit = 0; bit < 32; bit += 3) > > > + tmp ^= (1U << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned int > > > +__attribute__((noipa)) > > > +foo11 (unsigned int tmp) > > > +{ > > > + for (int bit = 31; bit >= 0; bit -= 3) > > > + tmp ^= (1U << bit); > > > + return tmp; > > > +} > > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-4.c b/gcc/testsuite/gcc.target/i386/pr103462-4.c > > > new file mode 100644 > > > index 00000000000..e2f4056f525 > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-4.c > > > @@ -0,0 +1,46 @@ > > > +/* { dg-do run } */ > > > +/* { dg-options "-O1" } */ > > > + > > > +#include "pr103462-3.c" > > > + > > > +int main() > > > +{ > > > + unsigned int tmp = 0x11111111U; > > > + > > > + if (foo (tmp) != 0x10110110U) > > > + __builtin_abort (); > > > + > > > + if (foo1 (tmp) != 0x1101101U) > > > + __builtin_abort (); > > > + > > > + if (foo2 (tmp) != 0x0U) > > > + __builtin_abort (); > > > + > > > + if (foo3 (tmp) != 0x0U) > > > + __builtin_abort (); > > > + > > > + if (foo4 (tmp) != 0xffffffffU) > > > + __builtin_abort (); > > > + > > > + if (foo5 (tmp) != 0xffffffffU) > > > + __builtin_abort (); > > > + > > > + if (foo6 (tmp) != 0x59359359U) > > > + __builtin_abort (); > > > + > > > + if (foo7 (tmp) != 0x93593593U) > > > + __builtin_abort (); > > > + > > > + if (foo8 (tmp) != 0xa7ca7ca7U) > > > + __builtin_abort (); > > > + > > > + if (foo9 (tmp) != 0x7ca7ca7cU) > > > + __builtin_abort (); > > > + > > > + if (foo10 (tmp) != 0x58358358U) > > > + __builtin_abort (); > > > + > > > + if (foo11 (tmp) != 0x83583583U) > > > + __builtin_abort (); > > > +} > > > + > > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-5.c b/gcc/testsuite/gcc.target/i386/pr103462-5.c > > > new file mode 100644 > > > index 00000000000..1f4ffa34b48 > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-5.c > > > @@ -0,0 +1,111 @@ > > > +/* { dg-do compile } */ > > > +/* { dg-options "-O1 -fdump-tree-sccp-details" } */ > > > +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */ > > > + > > > +unsigned short > > > +__attribute__((noipa)) > > > +foo (unsigned short tmp) > > > +{ > > > + for (int bit = 0; bit < 16; bit += 3) > > > + tmp &= ~(1U << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned short > > > +__attribute__((noipa)) > > > +foo1 (unsigned short tmp) > > > +{ > > > + for (int bit = 15; bit >= 0; bit -= 3) > > > + tmp &= ~(1U << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned short > > > +__attribute__((noipa)) > > > +foo2 (unsigned short tmp) > > > +{ > > > + for (int bit = 0; bit < 16; bit += 3) > > > + tmp &= (1U << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned short > > > +__attribute__((noipa)) > > > +foo3 (unsigned short tmp) > > > +{ > > > + for (int bit = 15; bit >= 0; bit -= 3) > > > + tmp &= (1U << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned short > > > +__attribute__((noipa)) > > > +foo4 (unsigned short tmp) > > > +{ > > > + for (int bit = 0; bit < 16; bit += 3) > > > + tmp |= ~(1U << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned short > > > +__attribute__((noipa)) > > > +foo5 (unsigned short tmp) > > > +{ > > > + for (int bit = 15; bit >= 0; bit -= 3) > > > + tmp |= ~(1U << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned short > > > +__attribute__((noipa)) > > > +foo6 (unsigned short tmp) > > > +{ > > > + for (int bit = 0; bit < 16; bit += 3) > > > + tmp |= (1U << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned short > > > +__attribute__((noipa)) > > > +foo7 (unsigned short tmp) > > > +{ > > > + for (int bit = 15; bit >= 0; bit -= 3) > > > + tmp |= (1U << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned short > > > +__attribute__((noipa)) > > > +foo8 (unsigned short tmp) > > > +{ > > > + for (int bit = 0; bit < 16; bit += 3) > > > + tmp ^= ~(1U << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned short > > > +__attribute__((noipa)) > > > +foo9 (unsigned short tmp) > > > +{ > > > + for (int bit = 15; bit >= 0; bit -= 3) > > > + tmp ^= ~(1U << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned short > > > +__attribute__((noipa)) > > > +foo10 (unsigned short tmp) > > > +{ > > > + for (int bit = 0; bit < 16; bit += 3) > > > + tmp ^= (1U << bit); > > > + return tmp; > > > +} > > > + > > > +unsigned short > > > +__attribute__((noipa)) > > > +foo11 (unsigned short tmp) > > > +{ > > > + for (int bit = 15; bit >= 0; bit -= 3) > > > + tmp ^= (1U << bit); > > > + return tmp; > > > +} > > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-6.c b/gcc/testsuite/gcc.target/i386/pr103462-6.c > > > new file mode 100644 > > > index 00000000000..65426d71efe > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-6.c > > > @@ -0,0 +1,46 @@ > > > +/* { dg-do run } */ > > > +/* { dg-options "-O1" } */ > > > + > > > +#include "pr103462-5.c" > > > + > > > +int main() > > > +{ > > > + unsigned short tmp = 0x1111U; > > > + > > > + if (foo (tmp) != 0x110) > > > + __builtin_abort (); > > > + > > > + if (foo1 (tmp) != 0x110) > > > + __builtin_abort (); > > > + > > > + if (foo2 (tmp) != 0x0) > > > + __builtin_abort (); > > > + > > > + if (foo3 (tmp) != 0x0) > > > + __builtin_abort (); > > > + > > > + if (foo4 (tmp) != 0xffff) > > > + __builtin_abort (); > > > + > > > + if (foo5 (tmp) != 0xffff) > > > + __builtin_abort (); > > > + > > > + if (foo6 (tmp) != 0x9359) > > > + __builtin_abort (); > > > + > > > + if (foo7 (tmp) != 0x9359) > > > + __builtin_abort (); > > > + > > > + if (foo8 (tmp) != 0x8358) > > > + __builtin_abort (); > > > + > > > + if (foo9 (tmp) != 0x8358) > > > + __builtin_abort (); > > > + > > > + if (foo10 (tmp) != 0x8358) > > > + __builtin_abort (); > > > + > > > + if (foo11 (tmp) != 0x8358) > > > + __builtin_abort (); > > > +} > > > + > > > diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc > > > index 72ceb4001e3..9b0aec4fd09 100644 > > > --- a/gcc/tree-scalar-evolution.cc > > > +++ b/gcc/tree-scalar-evolution.cc > > > @@ -3487,6 +3487,164 @@ expression_expensive_p (tree expr) > > > || expanded_size > cache.elements ()); > > > } > > > > > > +/* Match.pd function to match bitwise inductive expression. > > > + .i.e. > > > + _2 = 1 << _1; > > > + _3 = ~_2; > > > + tmp_9 = _3 & tmp_12; */ > > > +extern bool gimple_bitwise_induction_p (tree, tree *, tree (*)(tree)); > > > + > > > +/* Return the inductive expression of bitwise operation if possible, > > > + otherwise returns DEF. */ > > > +static tree > > > +analyze_and_compute_bitwise_induction_effect (class loop* ex_loop, > > > + class loop* loop, > > > + tree phidef, > > > + tree def, > > > + tree niter) > > > +{ > > > + tree match_op[3],inv, bitwise_scev, bitwise_res; > > > + bool folded_casts = false; > > > + tree type = TREE_TYPE (phidef); > > > + gimple* header_phi = NULL; > > > > use a gphi * > > > Changed. > > > + > > > + /* Match things like op2(MATCH_OP[2]), op1(MATCH_OP[1]), phidef(PHIDEF) > > > + > > > + op2 = PHI <phidef, inv> > > > + _1 = (int) bit_17; > > > + _3 = 1 << _1; > > > + op1 = ~_3; > > > + phidef = op1 & op2; */ > > > + if (!gimple_bitwise_induction_p (phidef, &match_op[0], NULL) > > > + || TREE_CODE (match_op[2]) != SSA_NAME > > > + || gimple_code (SSA_NAME_DEF_STMT (match_op[2])) != GIMPLE_PHI > > > > || !(header_phi = dyn_cast <gphi *> (SSA_NAME_DEF_STMT (match_op[2]))) > > > > > + || gimple_phi_num_args (SSA_NAME_DEF_STMT (match_op[2])) != 2) > > > > and then use header_phi here, if you pass in the PHI to use you'll > > always have 2 args > changed. > > > > > + return def; > > > + > > > + header_phi = SSA_NAME_DEF_STMT (match_op[2]); > > > + if (gimple_bb (header_phi) != loop->header) > > > + return def; > > > > I think passing in the PHI node and the exit edge instead of phidef > > would make the above more clear > Remove the upper condition. > > > > > + if (PHI_ARG_DEF_FROM_EDGE (header_phi, loop_latch_edge (loop)) != phidef) > > > + return def; > > > + > > > + inv = gimple_phi_arg_def (header_phi, 0); > > > + if (inv == phidef) > > > + inv = gimple_phi_arg_def (header_phi, 1); > > > > inv = gimple_phi_arg_def_from_edge (header_phi, loop_preheader_edge (loop)); > > > > where is this used? > Move to the place where it's used. > > > > > + > > > + bitwise_scev = analyze_scalar_evolution_in_loop (ex_loop, loop, > > > + match_op[1], > > > + &folded_casts); > > > > you want > > > > bitwise_scev = analyze_scalar_evolution (loop, match_op[1]); > > bitwise_scev = instantiate_parameters (loop, bitwise_scev); > > > > instead since you are interested in in-loop behavior? > Yes, changed. > > > > > + > > > + /* Make sure bits is in range of type precision. */ > > > + if (TREE_CODE (bitwise_scev) != POLYNOMIAL_CHREC > > > > I think you want to test || !INTEGRAL_TYPE_P (bitwise_scev) since you > > do not rule out pointers or vector types. > Changed. > > > > > + || !tree_fits_uhwi_p (CHREC_LEFT (bitwise_scev)) > > > + || tree_to_uhwi (CHREC_LEFT (bitwise_scev)) >= TYPE_PRECISION (type) > > > + || !tree_fits_shwi_p (CHREC_RIGHT (bitwise_scev))) > > > + return def; > > > + > > > + bitwise_res = compute_overall_effect_of_inner_loop (ex_loop, bitwise_scev); > > > + if (!tree_fits_uhwi_p (bitwise_res) > > > + || tree_to_uhwi (bitwise_res) >= TYPE_PRECISION (type)) > > > + return def; > > > > Hmm, so I think that bitwise_res is the final value of the 'bit' IV, correct? > > So name it bit_final? > Changed. > > > > > + > > > +enum bit_op_kind > > > + { > > > + INDUCTION_BIT_CLEAR, > > > + INDUCTION_BIT_IOR, > > > + INDUCTION_BIT_XOR, > > > + INDUCTION_BIT_RESET, > > > + INDUCTION_ZERO, > > > + INDUCTION_ALL > > > + }; > > > + > > > + enum bit_op_kind induction_kind; > > > + enum tree_code code1 > > > + = gimple_assign_rhs_code (SSA_NAME_DEF_STMT (phidef)); > > > + enum tree_code code2 > > > + = gimple_assign_rhs_code (SSA_NAME_DEF_STMT (match_op[0])); > > > + > > > + /* BIT_CLEAR: A &= ~(1 << bit) > > > + BIT_RESET: A ^= (1 << bit). > > > + BIT_IOR: A |= (1 << bit) > > > + BIT_ZERO: A &= (1 << bit) > > > + BIT_ALL: A |= ~(1 << bit) > > > + BIT_XOR: A ^= ~(1 << bit). > > > + bit is induction variable. */ > > > + switch (code1) > > > + { > > > + case BIT_AND_EXPR: > > > + induction_kind = code2 == BIT_NOT_EXPR > > > + ? INDUCTION_BIT_CLEAR > > > + : INDUCTION_ZERO; > > > + break; > > > + case BIT_IOR_EXPR: > > > + induction_kind = code2 == BIT_NOT_EXPR > > > + ? INDUCTION_ALL > > > + : INDUCTION_BIT_IOR; > > > + break; > > > + case BIT_XOR_EXPR: > > > + induction_kind = code2 == BIT_NOT_EXPR > > > + ? INDUCTION_BIT_XOR > > > + : INDUCTION_BIT_RESET; > > > + break; > > > + /* A ^ ~(1 << bit) is equal to ~(A ^ (1 << bit)). */ > > > + case BIT_NOT_EXPR: > > > + gcc_assert (code2 == BIT_XOR_EXPR); > > > + induction_kind = INDUCTION_BIT_XOR; > > > + break; > > > + default: > > > + gcc_unreachable(); > > > + } > > > + > > > + if (induction_kind == INDUCTION_ZERO) > > > + return build_zero_cst (type); > > > > is that true even when the loop doesn't iterate? > > > > > + if (induction_kind == INDUCTION_ALL) > > > + return build_all_ones_cst (type); > > > > Likewise? > the niter is known > 0 and < TYPE_PRECISION (TYPE_SIZE (type), so it > must be iterated. > > > > > + > > > + wide_int bits = wi::zero (TYPE_PRECISION (type)); > > > + HOST_WIDE_INT start = tree_to_shwi (CHREC_LEFT (bitwise_scev)); > > > > you checked fits_uhwi above, name it bit_start? > > > > > + HOST_WIDE_INT step = tree_to_shwi (CHREC_RIGHT (bitwise_scev)); > > > + unsigned HOST_WIDE_INT num_iter = tree_to_uhwi (niter); > > > + > > > + /* Loop tripcount should be num_iter + 1. */ > > > + for (unsigned i = 0; i != num_iter + 1; i++) > > > + { > > > + bits = wi::bit_or (bits, > > > + wi::lshift (wi::one (TYPE_PRECISION (type)), > > > + start)); > > > > you can use bits = wi::set_bit (bits, start); > Changed. > > > > > + start += step; > > > + } > > > > Uh. You don't limit 'num_iter' but you limit bit_final. So supposed that > > the IV wraps and is unsigned char and you have a 256 bit integer the > > above could run quite a long time. Please limit num_iter to TYPE_PRECISION > > as well (do you have a testcase with a wrapping IV? try some != CST exit > > condition and a large increment) > Limit to > 0 and < TYPE_SIZE. > > > > > + > > > + bool inverted = false; > > > + switch (induction_kind) > > > + { > > > + case INDUCTION_BIT_CLEAR: > > > + code1 = BIT_AND_EXPR; > > > + inverted = true; > > > + break; > > > + case INDUCTION_BIT_IOR: > > > + code1 = BIT_IOR_EXPR; > > > + break; > > > + case INDUCTION_BIT_RESET: > > > + code1 = BIT_XOR_EXPR; > > > + break; > > > + /* A ^= ~(1 << bit) is special, when loop tripcount is even, > > > + it's equal to A ^= bits, else A ^= ~bits. */ > > > + case INDUCTION_BIT_XOR: > > > + code1 = BIT_XOR_EXPR; > > > + if (num_iter % 2 == 0) > > > + inverted = true; > > > + break; > > > + default: > > > + gcc_unreachable (); > > > + } > > > + > > > + if (inverted) > > > + bits = wi::bit_not (bits); > > > + return fold_build2 (code1, type, inv, wide_int_to_tree (type, bits)); > > > +} > > > + > > > /* Do final value replacement for LOOP, return true if we did anything. */ > > > > > > bool > > > @@ -3519,7 +3677,8 @@ final_value_replacement_loop (class loop *loop) > > > { > > > gphi *phi = psi.phi (); > > > tree rslt = PHI_RESULT (phi); > > > - tree def = PHI_ARG_DEF_FROM_EDGE (phi, exit); > > > + tree phidef = PHI_ARG_DEF_FROM_EDGE (phi, exit); > > > + tree def = phidef; > > > if (virtual_operand_p (def)) > > > { > > > gsi_next (&psi); > > > @@ -3537,6 +3696,23 @@ final_value_replacement_loop (class loop *loop) > > > def = analyze_scalar_evolution_in_loop (ex_loop, loop, def, > > > &folded_casts); > > > def = compute_overall_effect_of_inner_loop (ex_loop, def); > > > + > > > + /* Handle bitwise induction expression. > > > + > > > + .i.e. > > > + for (int i = 0; i != 64; i+=3) > > > + res &= ~(1UL << i); > > > + > > > + RES can't be analyzed out by SCEV because it is not polynomially > > > + expressible, but in fact final value of RES can be replaced by > > > + RES & CONSTANT where CONSTANT all ones with bit {0,3,6,9,... ,63} > > > + being cleared, similar for BIT_IOR_EXPR/BIT_XOR_EXPR. */ > > > + if (tree_fits_uhwi_p (niter) > > > + && tree_to_uhwi (niter) > > > > that's redundant - ah, it's the not iterating case that's excluded. > > please write it > > as tree_to_uhwi (niter) != 0, also consider assigning to an unsigned > > HOST_WIDE_INT > > and pass it that way to analyze_and_compute_bitwise_induction_effect > Changed. > > > > > + && tree_to_uhwi (niter) != HOST_WIDE_INT_M1U) > > > + def = analyze_and_compute_bitwise_induction_effect (ex_loop, loop, > > > + phidef, def, niter); > > > + > > > > it seems you overwrite the SCEV analysis result unconditionally here, please > > move this inside of the following if () so it is done as fallback only > > (or add it as && !(def = ...) to the condition) > original def will be returned when bitwise induction fails which means > rewriting is ok, but i think it's better to not pass original def and > use your suggestion. > Changed. > > > > > if (!tree_does_not_contain_chrecs (def) > > > || chrec_contains_symbols_defined_in_loop (def, ex_loop->num) > > > /* Moving the computation from the loop may prolong life range > > > -- > > > 2.18.1 > > > > > Here's an updated V2 patch. + || !(header_phi = dyn_cast <gphi *> (SSA_NAME_DEF_STMT (match_op[2]))) + || header_phi->nargs != 2) please use gimple_phi_num_args (header_phi) != 2 + bitwise_scev = analyze_scalar_evolution (loop, match_op[1]); + bitwise_scev = instantiate_parameters (loop, bitwise_scev); + so that's now OK to get the evolution of the bit IV + bit_final = compute_overall_effect_of_inner_loop (ex_loop, bitwise_scev); but this might not, for the final value you probably want to re-analyze the SCEV wrt the outer loop using your original analyze_scalar_evolution_in_loop query. Of course the final value could be also computed with CHREC_LEFT + niter * CHREC_RIGHT, so it might be more convenient to remove the above and re-formulate the check it is used in after + wide_int bits = wi::zero (TYPE_PRECISION (type)); + HOST_WIDE_INT bit_start = tree_to_shwi (CHREC_LEFT (bitwise_scev)); + HOST_WIDE_INT step = tree_to_shwi (CHREC_RIGHT (bitwise_scev)); ? OK with that changes. Thanks, Richard. > > > -- > BR, > Hongtao
On Fri, May 13, 2022 at 7:16 PM Richard Biener <richard.guenther@gmail.com> wrote: > > On Fri, May 13, 2022 at 5:37 AM Hongtao Liu <crazylht@gmail.com> wrote: > > > > On Wed, May 11, 2022 at 4:45 PM Richard Biener via Gcc-patches > > <gcc-patches@gcc.gnu.org> wrote: > > > > > > On Mon, May 9, 2022 at 7:19 AM liuhongt <hongtao.liu@intel.com> wrote: > > > > > > > > This patch will enable below optimization: > > > > > > > > { > > > > - int bit; > > > > - long long unsigned int _1; > > > > - long long unsigned int _2; > > > > - > > > > <bb 2> [local count: 46707768]: > > > > - > > > > - <bb 3> [local count: 1027034057]: > > > > - # tmp_11 = PHI <tmp_8(3), tmp_6(D)(2)> > > > > - # bit_13 = PHI <bit_9(3), 63(2)> > > > > - _1 = 1 << bit_13; > > > > - _2 = ~_1; > > > > - tmp_8 = _2 & tmp_11; > > > > - bit_9 = bit_13 + -3; > > > > - if (bit_9 != -3(OVF)) > > > > - goto <bb 3>; [95.65%] > > > > - else > > > > - goto <bb 4>; [4.35%] > > > > - > > > > - <bb 4> [local count: 46707768]: > > > > - return tmp_8; > > > > + tmp_12 = tmp_6(D) & 7905747460161236406; > > > > + return tmp_12; > > > > > > > > } > > > > > > > > > > > > Boostrapped and regtested on x86_64-pc-linux-gnu{-m32,} > > > > Ok for trunk? > > > > > > > > gcc/ChangeLog: > > > > > > > > PR middle-end/103462 > > > > * match.pd (bitwise_induction_p): New match. > > > > * tree-scalar-evolution.c (gimple_bitwise_induction_p): > > > > Declare. > > > > (analyze_and_compute_bitwise_induction_effect): New function. > > > > (enum bit_op_kind): New enum. > > > > (final_value_replacement_loop): Enhanced to handle bitwise > > > > induction. > > > > > > > > gcc/testsuite/ChangeLog: > > > > > > > > * gcc.target/i386/pr103462-1.c: New test. > > > > * gcc.target/i386/pr103462-2.c: New test. > > > > * gcc.target/i386/pr103462-3.c: New test. > > > > * gcc.target/i386/pr103462-4.c: New test. > > > > * gcc.target/i386/pr103462-5.c: New test. > > > > * gcc.target/i386/pr103462-6.c: New test. > > > > --- > > > > gcc/match.pd | 7 + > > > > gcc/testsuite/gcc.target/i386/pr103462-1.c | 111 +++++++++++++ > > > > gcc/testsuite/gcc.target/i386/pr103462-2.c | 45 ++++++ > > > > gcc/testsuite/gcc.target/i386/pr103462-3.c | 111 +++++++++++++ > > > > gcc/testsuite/gcc.target/i386/pr103462-4.c | 46 ++++++ > > > > gcc/testsuite/gcc.target/i386/pr103462-5.c | 111 +++++++++++++ > > > > gcc/testsuite/gcc.target/i386/pr103462-6.c | 46 ++++++ > > > > gcc/tree-scalar-evolution.cc | 178 ++++++++++++++++++++- > > > > 8 files changed, 654 insertions(+), 1 deletion(-) > > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-1.c > > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-2.c > > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-3.c > > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-4.c > > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-5.c > > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-6.c > > > > > > > > diff --git a/gcc/match.pd b/gcc/match.pd > > > > index 6d691d302b3..24ff5f9e6a8 100644 > > > > --- a/gcc/match.pd > > > > +++ b/gcc/match.pd > > > > @@ -7746,3 +7746,10 @@ and, > > > > == TYPE_UNSIGNED (TREE_TYPE (@3)))) > > > > && single_use (@4) > > > > && single_use (@5)))) > > > > + > > > > +(for bit_op (bit_and bit_ior bit_xor) > > > > + (match (bitwise_induction_p @0 @2 @3) > > > > + (bit_op:c (nop_convert1? (bit_not2?@0 (convert3? (lshift integer_onep@1 @2)))) @3))) > > > > + > > > > +(match (bitwise_induction_p @0 @2 @3) > > > > + (bit_not (nop_convert1? (bit_xor@0 (convert2? (lshift integer_onep@1 @2)) @3)))) > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-1.c b/gcc/testsuite/gcc.target/i386/pr103462-1.c > > > > new file mode 100644 > > > > index 00000000000..1dc4c2acad6 > > > > --- /dev/null > > > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-1.c > > > > @@ -0,0 +1,111 @@ > > > > +/* { dg-do compile } */ > > > > +/* { dg-options "-O1 -fdump-tree-sccp-details" } */ > > > > +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */ > > > > + > > > > +unsigned long long > > > > +__attribute__((noipa)) > > > > +foo (unsigned long long tmp) > > > > +{ > > > > + for (int bit = 0; bit < 64; bit += 3) > > > > + tmp &= ~(1ULL << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned long long > > > > +__attribute__((noipa)) > > > > +foo1 (unsigned long long tmp) > > > > +{ > > > > + for (int bit = 63; bit >= 0; bit -= 3) > > > > + tmp &= ~(1ULL << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned long long > > > > +__attribute__((noipa)) > > > > +foo2 (unsigned long long tmp) > > > > +{ > > > > + for (int bit = 0; bit < 64; bit += 3) > > > > + tmp &= (1ULL << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned long long > > > > +__attribute__((noipa)) > > > > +foo3 (unsigned long long tmp) > > > > +{ > > > > + for (int bit = 63; bit >= 0; bit -= 3) > > > > + tmp &= (1ULL << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned long long > > > > +__attribute__((noipa)) > > > > +foo4 (unsigned long long tmp) > > > > +{ > > > > + for (int bit = 0; bit < 64; bit += 3) > > > > + tmp |= ~(1ULL << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned long long > > > > +__attribute__((noipa)) > > > > +foo5 (unsigned long long tmp) > > > > +{ > > > > + for (int bit = 63; bit >= 0; bit -= 3) > > > > + tmp |= ~(1ULL << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned long long > > > > +__attribute__((noipa)) > > > > +foo6 (unsigned long long tmp) > > > > +{ > > > > + for (int bit = 0; bit < 64; bit += 3) > > > > + tmp |= (1ULL << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned long long > > > > +__attribute__((noipa)) > > > > +foo7 (unsigned long long tmp) > > > > +{ > > > > + for (int bit = 63; bit >= 0; bit -= 3) > > > > + tmp |= (1ULL << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned long long > > > > +__attribute__((noipa)) > > > > +foo8 (unsigned long long tmp) > > > > +{ > > > > + for (int bit = 0; bit < 64; bit += 3) > > > > + tmp ^= ~(1ULL << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned long long > > > > +__attribute__((noipa)) > > > > +foo9 (unsigned long long tmp) > > > > +{ > > > > + for (int bit = 63; bit >= 0; bit -= 3) > > > > + tmp ^= ~(1ULL << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned long long > > > > +__attribute__((noipa)) > > > > +foo10 (unsigned long long tmp) > > > > +{ > > > > + for (int bit = 0; bit < 64; bit += 3) > > > > + tmp ^= (1ULL << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned long long > > > > +__attribute__((noipa)) > > > > +foo11 (unsigned long long tmp) > > > > +{ > > > > + for (int bit = 63; bit >= 0; bit -= 3) > > > > + tmp ^= (1ULL << bit); > > > > + return tmp; > > > > +} > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-2.c b/gcc/testsuite/gcc.target/i386/pr103462-2.c > > > > new file mode 100644 > > > > index 00000000000..bc375cb78d4 > > > > --- /dev/null > > > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-2.c > > > > @@ -0,0 +1,45 @@ > > > > +/* { dg-do run } */ > > > > +/* { dg-options "-O1" } */ > > > > + > > > > +#include "pr103462-1.c" > > > > + > > > > +int main() > > > > +{ > > > > + unsigned long long tmp = 0x1111111111111111ULL; > > > > + if (foo (tmp) != 0x110110110110110ULL) > > > > + __builtin_abort (); > > > > + > > > > + if (foo1 (tmp) != 0x110110110110110ULL) > > > > + __builtin_abort (); > > > > + > > > > + if (foo2 (tmp) != 0x0ULL) > > > > + __builtin_abort (); > > > > + > > > > + if (foo3 (tmp) != 0x0ULL) > > > > + __builtin_abort (); > > > > + > > > > + if (foo4 (tmp) != 0xffffffffffffffffULL) > > > > + __builtin_abort (); > > > > + > > > > + if (foo5 (tmp) != 0xffffffffffffffffULL) > > > > + __builtin_abort (); > > > > + > > > > + if (foo6 (tmp) != 0x9359359359359359ULL) > > > > + __builtin_abort (); > > > > + > > > > + if (foo7 (tmp) != 0x9359359359359359ULL) > > > > + __builtin_abort (); > > > > + > > > > + if (foo8 (tmp) != 0x8358358358358358ULL) > > > > + __builtin_abort (); > > > > + > > > > + if (foo9 (tmp) != 0x8358358358358358ULL) > > > > + __builtin_abort (); > > > > + > > > > + if (foo10 (tmp) != 0x8358358358358358ULL) > > > > + __builtin_abort (); > > > > + > > > > + if (foo11 (tmp) != 0x8358358358358358ULL) > > > > + __builtin_abort (); > > > > +} > > > > + > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-3.c b/gcc/testsuite/gcc.target/i386/pr103462-3.c > > > > new file mode 100644 > > > > index 00000000000..4ba248a4872 > > > > --- /dev/null > > > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-3.c > > > > @@ -0,0 +1,111 @@ > > > > +/* { dg-do compile } */ > > > > +/* { dg-options "-O1 -fdump-tree-sccp-details" } */ > > > > +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */ > > > > + > > > > +unsigned int > > > > +__attribute__((noipa)) > > > > +foo (unsigned int tmp) > > > > +{ > > > > + for (int bit = 0; bit < 32; bit += 3) > > > > + tmp &= ~(1U << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned int > > > > +__attribute__((noipa)) > > > > +foo1 (unsigned int tmp) > > > > +{ > > > > + for (int bit = 31; bit >= 0; bit -= 3) > > > > + tmp &= ~(1U << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned int > > > > +__attribute__((noipa)) > > > > +foo2 (unsigned int tmp) > > > > +{ > > > > + for (int bit = 0; bit < 32; bit += 3) > > > > + tmp &= (1U << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned int > > > > +__attribute__((noipa)) > > > > +foo3 (unsigned int tmp) > > > > +{ > > > > + for (int bit = 31; bit >= 0; bit -= 3) > > > > + tmp &= (1U << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned int > > > > +__attribute__((noipa)) > > > > +foo4 (unsigned int tmp) > > > > +{ > > > > + for (int bit = 0; bit < 32; bit += 3) > > > > + tmp |= ~(1U << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned int > > > > +__attribute__((noipa)) > > > > +foo5 (unsigned int tmp) > > > > +{ > > > > + for (int bit = 31; bit >= 0; bit -= 3) > > > > + tmp |= ~(1U << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned int > > > > +__attribute__((noipa)) > > > > +foo6 (unsigned int tmp) > > > > +{ > > > > + for (int bit = 0; bit < 32; bit += 3) > > > > + tmp |= (1U << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned int > > > > +__attribute__((noipa)) > > > > +foo7 (unsigned int tmp) > > > > +{ > > > > + for (int bit = 31; bit >= 0; bit -= 3) > > > > + tmp |= (1U << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned int > > > > +__attribute__((noipa)) > > > > +foo8 (unsigned int tmp) > > > > +{ > > > > + for (int bit = 0; bit < 32; bit += 3) > > > > + tmp ^= ~(1U << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned int > > > > +__attribute__((noipa)) > > > > +foo9 (unsigned int tmp) > > > > +{ > > > > + for (int bit = 31; bit >= 0; bit -= 3) > > > > + tmp ^= ~(1U << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned int > > > > +__attribute__((noipa)) > > > > +foo10 (unsigned int tmp) > > > > +{ > > > > + for (int bit = 0; bit < 32; bit += 3) > > > > + tmp ^= (1U << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned int > > > > +__attribute__((noipa)) > > > > +foo11 (unsigned int tmp) > > > > +{ > > > > + for (int bit = 31; bit >= 0; bit -= 3) > > > > + tmp ^= (1U << bit); > > > > + return tmp; > > > > +} > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-4.c b/gcc/testsuite/gcc.target/i386/pr103462-4.c > > > > new file mode 100644 > > > > index 00000000000..e2f4056f525 > > > > --- /dev/null > > > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-4.c > > > > @@ -0,0 +1,46 @@ > > > > +/* { dg-do run } */ > > > > +/* { dg-options "-O1" } */ > > > > + > > > > +#include "pr103462-3.c" > > > > + > > > > +int main() > > > > +{ > > > > + unsigned int tmp = 0x11111111U; > > > > + > > > > + if (foo (tmp) != 0x10110110U) > > > > + __builtin_abort (); > > > > + > > > > + if (foo1 (tmp) != 0x1101101U) > > > > + __builtin_abort (); > > > > + > > > > + if (foo2 (tmp) != 0x0U) > > > > + __builtin_abort (); > > > > + > > > > + if (foo3 (tmp) != 0x0U) > > > > + __builtin_abort (); > > > > + > > > > + if (foo4 (tmp) != 0xffffffffU) > > > > + __builtin_abort (); > > > > + > > > > + if (foo5 (tmp) != 0xffffffffU) > > > > + __builtin_abort (); > > > > + > > > > + if (foo6 (tmp) != 0x59359359U) > > > > + __builtin_abort (); > > > > + > > > > + if (foo7 (tmp) != 0x93593593U) > > > > + __builtin_abort (); > > > > + > > > > + if (foo8 (tmp) != 0xa7ca7ca7U) > > > > + __builtin_abort (); > > > > + > > > > + if (foo9 (tmp) != 0x7ca7ca7cU) > > > > + __builtin_abort (); > > > > + > > > > + if (foo10 (tmp) != 0x58358358U) > > > > + __builtin_abort (); > > > > + > > > > + if (foo11 (tmp) != 0x83583583U) > > > > + __builtin_abort (); > > > > +} > > > > + > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-5.c b/gcc/testsuite/gcc.target/i386/pr103462-5.c > > > > new file mode 100644 > > > > index 00000000000..1f4ffa34b48 > > > > --- /dev/null > > > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-5.c > > > > @@ -0,0 +1,111 @@ > > > > +/* { dg-do compile } */ > > > > +/* { dg-options "-O1 -fdump-tree-sccp-details" } */ > > > > +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */ > > > > + > > > > +unsigned short > > > > +__attribute__((noipa)) > > > > +foo (unsigned short tmp) > > > > +{ > > > > + for (int bit = 0; bit < 16; bit += 3) > > > > + tmp &= ~(1U << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned short > > > > +__attribute__((noipa)) > > > > +foo1 (unsigned short tmp) > > > > +{ > > > > + for (int bit = 15; bit >= 0; bit -= 3) > > > > + tmp &= ~(1U << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned short > > > > +__attribute__((noipa)) > > > > +foo2 (unsigned short tmp) > > > > +{ > > > > + for (int bit = 0; bit < 16; bit += 3) > > > > + tmp &= (1U << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned short > > > > +__attribute__((noipa)) > > > > +foo3 (unsigned short tmp) > > > > +{ > > > > + for (int bit = 15; bit >= 0; bit -= 3) > > > > + tmp &= (1U << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned short > > > > +__attribute__((noipa)) > > > > +foo4 (unsigned short tmp) > > > > +{ > > > > + for (int bit = 0; bit < 16; bit += 3) > > > > + tmp |= ~(1U << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned short > > > > +__attribute__((noipa)) > > > > +foo5 (unsigned short tmp) > > > > +{ > > > > + for (int bit = 15; bit >= 0; bit -= 3) > > > > + tmp |= ~(1U << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned short > > > > +__attribute__((noipa)) > > > > +foo6 (unsigned short tmp) > > > > +{ > > > > + for (int bit = 0; bit < 16; bit += 3) > > > > + tmp |= (1U << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned short > > > > +__attribute__((noipa)) > > > > +foo7 (unsigned short tmp) > > > > +{ > > > > + for (int bit = 15; bit >= 0; bit -= 3) > > > > + tmp |= (1U << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned short > > > > +__attribute__((noipa)) > > > > +foo8 (unsigned short tmp) > > > > +{ > > > > + for (int bit = 0; bit < 16; bit += 3) > > > > + tmp ^= ~(1U << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned short > > > > +__attribute__((noipa)) > > > > +foo9 (unsigned short tmp) > > > > +{ > > > > + for (int bit = 15; bit >= 0; bit -= 3) > > > > + tmp ^= ~(1U << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned short > > > > +__attribute__((noipa)) > > > > +foo10 (unsigned short tmp) > > > > +{ > > > > + for (int bit = 0; bit < 16; bit += 3) > > > > + tmp ^= (1U << bit); > > > > + return tmp; > > > > +} > > > > + > > > > +unsigned short > > > > +__attribute__((noipa)) > > > > +foo11 (unsigned short tmp) > > > > +{ > > > > + for (int bit = 15; bit >= 0; bit -= 3) > > > > + tmp ^= (1U << bit); > > > > + return tmp; > > > > +} > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-6.c b/gcc/testsuite/gcc.target/i386/pr103462-6.c > > > > new file mode 100644 > > > > index 00000000000..65426d71efe > > > > --- /dev/null > > > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-6.c > > > > @@ -0,0 +1,46 @@ > > > > +/* { dg-do run } */ > > > > +/* { dg-options "-O1" } */ > > > > + > > > > +#include "pr103462-5.c" > > > > + > > > > +int main() > > > > +{ > > > > + unsigned short tmp = 0x1111U; > > > > + > > > > + if (foo (tmp) != 0x110) > > > > + __builtin_abort (); > > > > + > > > > + if (foo1 (tmp) != 0x110) > > > > + __builtin_abort (); > > > > + > > > > + if (foo2 (tmp) != 0x0) > > > > + __builtin_abort (); > > > > + > > > > + if (foo3 (tmp) != 0x0) > > > > + __builtin_abort (); > > > > + > > > > + if (foo4 (tmp) != 0xffff) > > > > + __builtin_abort (); > > > > + > > > > + if (foo5 (tmp) != 0xffff) > > > > + __builtin_abort (); > > > > + > > > > + if (foo6 (tmp) != 0x9359) > > > > + __builtin_abort (); > > > > + > > > > + if (foo7 (tmp) != 0x9359) > > > > + __builtin_abort (); > > > > + > > > > + if (foo8 (tmp) != 0x8358) > > > > + __builtin_abort (); > > > > + > > > > + if (foo9 (tmp) != 0x8358) > > > > + __builtin_abort (); > > > > + > > > > + if (foo10 (tmp) != 0x8358) > > > > + __builtin_abort (); > > > > + > > > > + if (foo11 (tmp) != 0x8358) > > > > + __builtin_abort (); > > > > +} > > > > + > > > > diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc > > > > index 72ceb4001e3..9b0aec4fd09 100644 > > > > --- a/gcc/tree-scalar-evolution.cc > > > > +++ b/gcc/tree-scalar-evolution.cc > > > > @@ -3487,6 +3487,164 @@ expression_expensive_p (tree expr) > > > > || expanded_size > cache.elements ()); > > > > } > > > > > > > > +/* Match.pd function to match bitwise inductive expression. > > > > + .i.e. > > > > + _2 = 1 << _1; > > > > + _3 = ~_2; > > > > + tmp_9 = _3 & tmp_12; */ > > > > +extern bool gimple_bitwise_induction_p (tree, tree *, tree (*)(tree)); > > > > + > > > > +/* Return the inductive expression of bitwise operation if possible, > > > > + otherwise returns DEF. */ > > > > +static tree > > > > +analyze_and_compute_bitwise_induction_effect (class loop* ex_loop, > > > > + class loop* loop, > > > > + tree phidef, > > > > + tree def, > > > > + tree niter) > > > > +{ > > > > + tree match_op[3],inv, bitwise_scev, bitwise_res; > > > > + bool folded_casts = false; > > > > + tree type = TREE_TYPE (phidef); > > > > + gimple* header_phi = NULL; > > > > > > use a gphi * > > > > > Changed. > > > > + > > > > + /* Match things like op2(MATCH_OP[2]), op1(MATCH_OP[1]), phidef(PHIDEF) > > > > + > > > > + op2 = PHI <phidef, inv> > > > > + _1 = (int) bit_17; > > > > + _3 = 1 << _1; > > > > + op1 = ~_3; > > > > + phidef = op1 & op2; */ > > > > + if (!gimple_bitwise_induction_p (phidef, &match_op[0], NULL) > > > > + || TREE_CODE (match_op[2]) != SSA_NAME > > > > + || gimple_code (SSA_NAME_DEF_STMT (match_op[2])) != GIMPLE_PHI > > > > > > || !(header_phi = dyn_cast <gphi *> (SSA_NAME_DEF_STMT (match_op[2]))) > > > > > > > + || gimple_phi_num_args (SSA_NAME_DEF_STMT (match_op[2])) != 2) > > > > > > and then use header_phi here, if you pass in the PHI to use you'll > > > always have 2 args > > changed. > > > > > > > + return def; > > > > + > > > > + header_phi = SSA_NAME_DEF_STMT (match_op[2]); > > > > + if (gimple_bb (header_phi) != loop->header) > > > > + return def; > > > > > > I think passing in the PHI node and the exit edge instead of phidef > > > would make the above more clear > > Remove the upper condition. > > > > > > > + if (PHI_ARG_DEF_FROM_EDGE (header_phi, loop_latch_edge (loop)) != phidef) > > > > + return def; > > > > + > > > > + inv = gimple_phi_arg_def (header_phi, 0); > > > > + if (inv == phidef) > > > > + inv = gimple_phi_arg_def (header_phi, 1); > > > > > > inv = gimple_phi_arg_def_from_edge (header_phi, loop_preheader_edge (loop)); > > > > > > where is this used? > > Move to the place where it's used. > > > > > > > + > > > > + bitwise_scev = analyze_scalar_evolution_in_loop (ex_loop, loop, > > > > + match_op[1], > > > > + &folded_casts); > > > > > > you want > > > > > > bitwise_scev = analyze_scalar_evolution (loop, match_op[1]); > > > bitwise_scev = instantiate_parameters (loop, bitwise_scev); > > > > > > instead since you are interested in in-loop behavior? > > Yes, changed. > > > > > > > + > > > > + /* Make sure bits is in range of type precision. */ > > > > + if (TREE_CODE (bitwise_scev) != POLYNOMIAL_CHREC > > > > > > I think you want to test || !INTEGRAL_TYPE_P (bitwise_scev) since you > > > do not rule out pointers or vector types. > > Changed. > > > > > > > + || !tree_fits_uhwi_p (CHREC_LEFT (bitwise_scev)) > > > > + || tree_to_uhwi (CHREC_LEFT (bitwise_scev)) >= TYPE_PRECISION (type) > > > > + || !tree_fits_shwi_p (CHREC_RIGHT (bitwise_scev))) > > > > + return def; > > > > + > > > > + bitwise_res = compute_overall_effect_of_inner_loop (ex_loop, bitwise_scev); > > > > + if (!tree_fits_uhwi_p (bitwise_res) > > > > + || tree_to_uhwi (bitwise_res) >= TYPE_PRECISION (type)) > > > > + return def; > > > > > > Hmm, so I think that bitwise_res is the final value of the 'bit' IV, correct? > > > So name it bit_final? > > Changed. > > > > > > > + > > > > +enum bit_op_kind > > > > + { > > > > + INDUCTION_BIT_CLEAR, > > > > + INDUCTION_BIT_IOR, > > > > + INDUCTION_BIT_XOR, > > > > + INDUCTION_BIT_RESET, > > > > + INDUCTION_ZERO, > > > > + INDUCTION_ALL > > > > + }; > > > > + > > > > + enum bit_op_kind induction_kind; > > > > + enum tree_code code1 > > > > + = gimple_assign_rhs_code (SSA_NAME_DEF_STMT (phidef)); > > > > + enum tree_code code2 > > > > + = gimple_assign_rhs_code (SSA_NAME_DEF_STMT (match_op[0])); > > > > + > > > > + /* BIT_CLEAR: A &= ~(1 << bit) > > > > + BIT_RESET: A ^= (1 << bit). > > > > + BIT_IOR: A |= (1 << bit) > > > > + BIT_ZERO: A &= (1 << bit) > > > > + BIT_ALL: A |= ~(1 << bit) > > > > + BIT_XOR: A ^= ~(1 << bit). > > > > + bit is induction variable. */ > > > > + switch (code1) > > > > + { > > > > + case BIT_AND_EXPR: > > > > + induction_kind = code2 == BIT_NOT_EXPR > > > > + ? INDUCTION_BIT_CLEAR > > > > + : INDUCTION_ZERO; > > > > + break; > > > > + case BIT_IOR_EXPR: > > > > + induction_kind = code2 == BIT_NOT_EXPR > > > > + ? INDUCTION_ALL > > > > + : INDUCTION_BIT_IOR; > > > > + break; > > > > + case BIT_XOR_EXPR: > > > > + induction_kind = code2 == BIT_NOT_EXPR > > > > + ? INDUCTION_BIT_XOR > > > > + : INDUCTION_BIT_RESET; > > > > + break; > > > > + /* A ^ ~(1 << bit) is equal to ~(A ^ (1 << bit)). */ > > > > + case BIT_NOT_EXPR: > > > > + gcc_assert (code2 == BIT_XOR_EXPR); > > > > + induction_kind = INDUCTION_BIT_XOR; > > > > + break; > > > > + default: > > > > + gcc_unreachable(); > > > > + } > > > > + > > > > + if (induction_kind == INDUCTION_ZERO) > > > > + return build_zero_cst (type); > > > > > > is that true even when the loop doesn't iterate? > > > > > > > + if (induction_kind == INDUCTION_ALL) > > > > + return build_all_ones_cst (type); > > > > > > Likewise? > > the niter is known > 0 and < TYPE_PRECISION (TYPE_SIZE (type), so it > > must be iterated. > > > > > > > + > > > > + wide_int bits = wi::zero (TYPE_PRECISION (type)); > > > > + HOST_WIDE_INT start = tree_to_shwi (CHREC_LEFT (bitwise_scev)); > > > > > > you checked fits_uhwi above, name it bit_start? > > > > > > > + HOST_WIDE_INT step = tree_to_shwi (CHREC_RIGHT (bitwise_scev)); > > > > + unsigned HOST_WIDE_INT num_iter = tree_to_uhwi (niter); > > > > + > > > > + /* Loop tripcount should be num_iter + 1. */ > > > > + for (unsigned i = 0; i != num_iter + 1; i++) > > > > + { > > > > + bits = wi::bit_or (bits, > > > > + wi::lshift (wi::one (TYPE_PRECISION (type)), > > > > + start)); > > > > > > you can use bits = wi::set_bit (bits, start); > > Changed. > > > > > > > + start += step; > > > > + } > > > > > > Uh. You don't limit 'num_iter' but you limit bit_final. So supposed that > > > the IV wraps and is unsigned char and you have a 256 bit integer the > > > above could run quite a long time. Please limit num_iter to TYPE_PRECISION > > > as well (do you have a testcase with a wrapping IV? try some != CST exit > > > condition and a large increment) > > Limit to > 0 and < TYPE_SIZE. > > > > > > > + > > > > + bool inverted = false; > > > > + switch (induction_kind) > > > > + { > > > > + case INDUCTION_BIT_CLEAR: > > > > + code1 = BIT_AND_EXPR; > > > > + inverted = true; > > > > + break; > > > > + case INDUCTION_BIT_IOR: > > > > + code1 = BIT_IOR_EXPR; > > > > + break; > > > > + case INDUCTION_BIT_RESET: > > > > + code1 = BIT_XOR_EXPR; > > > > + break; > > > > + /* A ^= ~(1 << bit) is special, when loop tripcount is even, > > > > + it's equal to A ^= bits, else A ^= ~bits. */ > > > > + case INDUCTION_BIT_XOR: > > > > + code1 = BIT_XOR_EXPR; > > > > + if (num_iter % 2 == 0) > > > > + inverted = true; > > > > + break; > > > > + default: > > > > + gcc_unreachable (); > > > > + } > > > > + > > > > + if (inverted) > > > > + bits = wi::bit_not (bits); > > > > + return fold_build2 (code1, type, inv, wide_int_to_tree (type, bits)); > > > > +} > > > > + > > > > /* Do final value replacement for LOOP, return true if we did anything. */ > > > > > > > > bool > > > > @@ -3519,7 +3677,8 @@ final_value_replacement_loop (class loop *loop) > > > > { > > > > gphi *phi = psi.phi (); > > > > tree rslt = PHI_RESULT (phi); > > > > - tree def = PHI_ARG_DEF_FROM_EDGE (phi, exit); > > > > + tree phidef = PHI_ARG_DEF_FROM_EDGE (phi, exit); > > > > + tree def = phidef; > > > > if (virtual_operand_p (def)) > > > > { > > > > gsi_next (&psi); > > > > @@ -3537,6 +3696,23 @@ final_value_replacement_loop (class loop *loop) > > > > def = analyze_scalar_evolution_in_loop (ex_loop, loop, def, > > > > &folded_casts); > > > > def = compute_overall_effect_of_inner_loop (ex_loop, def); > > > > + > > > > + /* Handle bitwise induction expression. > > > > + > > > > + .i.e. > > > > + for (int i = 0; i != 64; i+=3) > > > > + res &= ~(1UL << i); > > > > + > > > > + RES can't be analyzed out by SCEV because it is not polynomially > > > > + expressible, but in fact final value of RES can be replaced by > > > > + RES & CONSTANT where CONSTANT all ones with bit {0,3,6,9,... ,63} > > > > + being cleared, similar for BIT_IOR_EXPR/BIT_XOR_EXPR. */ > > > > + if (tree_fits_uhwi_p (niter) > > > > + && tree_to_uhwi (niter) > > > > > > that's redundant - ah, it's the not iterating case that's excluded. > > > please write it > > > as tree_to_uhwi (niter) != 0, also consider assigning to an unsigned > > > HOST_WIDE_INT > > > and pass it that way to analyze_and_compute_bitwise_induction_effect > > Changed. > > > > > > > + && tree_to_uhwi (niter) != HOST_WIDE_INT_M1U) > > > > + def = analyze_and_compute_bitwise_induction_effect (ex_loop, loop, > > > > + phidef, def, niter); > > > > + > > > > > > it seems you overwrite the SCEV analysis result unconditionally here, please > > > move this inside of the following if () so it is done as fallback only > > > (or add it as && !(def = ...) to the condition) > > original def will be returned when bitwise induction fails which means > > rewriting is ok, but i think it's better to not pass original def and > > use your suggestion. > > Changed. > > > > > > > if (!tree_does_not_contain_chrecs (def) > > > > || chrec_contains_symbols_defined_in_loop (def, ex_loop->num) > > > > /* Moving the computation from the loop may prolong life range > > > > -- > > > > 2.18.1 > > > > > > > > Here's an updated V2 patch. > > + || !(header_phi = dyn_cast <gphi *> (SSA_NAME_DEF_STMT (match_op[2]))) > + || header_phi->nargs != 2) > > please use gimple_phi_num_args (header_phi) != 2 Changed. > > + bitwise_scev = analyze_scalar_evolution (loop, match_op[1]); > + bitwise_scev = instantiate_parameters (loop, bitwise_scev); > + > > so that's now OK to get the evolution of the bit IV > > + bit_final = compute_overall_effect_of_inner_loop (ex_loop, bitwise_scev); > > but this might not, for the final value you probably want to re-analyze the > SCEV wrt the outer loop using your original analyze_scalar_evolution_in_loop > query. Of course the final value could be also computed with > CHREC_LEFT + niter * CHREC_RIGHT, so it might be more convenient > to remove the above and re-formulate the check it is used in after > > + wide_int bits = wi::zero (TYPE_PRECISION (type)); > + HOST_WIDE_INT bit_start = tree_to_shwi (CHREC_LEFT (bitwise_scev)); > + HOST_WIDE_INT step = tree_to_shwi (CHREC_RIGHT (bitwise_scev)); > > ? > How about + HOST_WIDE_INT bit_final = bit_start + step * niter; + + /* bit_start, bit_final in range of [0,TYPE_PRECISION) + implies all bits are set in range. */ + if (bit_final >= TYPE_PRECISION (type) + || bit_final < 0) + return NULL_TREE; Also remove ex_loop from the parameter. Updated patch. > OK with that changes. > > Thanks, > Richard. > > > > > > > -- > > BR, > > Hongtao
On Wed, May 18, 2022 at 4:45 AM Hongtao Liu <crazylht@gmail.com> wrote: > > On Fri, May 13, 2022 at 7:16 PM Richard Biener > <richard.guenther@gmail.com> wrote: > > > > On Fri, May 13, 2022 at 5:37 AM Hongtao Liu <crazylht@gmail.com> wrote: > > > > > > On Wed, May 11, 2022 at 4:45 PM Richard Biener via Gcc-patches > > > <gcc-patches@gcc.gnu.org> wrote: > > > > > > > > On Mon, May 9, 2022 at 7:19 AM liuhongt <hongtao.liu@intel.com> wrote: > > > > > > > > > > This patch will enable below optimization: > > > > > > > > > > { > > > > > - int bit; > > > > > - long long unsigned int _1; > > > > > - long long unsigned int _2; > > > > > - > > > > > <bb 2> [local count: 46707768]: > > > > > - > > > > > - <bb 3> [local count: 1027034057]: > > > > > - # tmp_11 = PHI <tmp_8(3), tmp_6(D)(2)> > > > > > - # bit_13 = PHI <bit_9(3), 63(2)> > > > > > - _1 = 1 << bit_13; > > > > > - _2 = ~_1; > > > > > - tmp_8 = _2 & tmp_11; > > > > > - bit_9 = bit_13 + -3; > > > > > - if (bit_9 != -3(OVF)) > > > > > - goto <bb 3>; [95.65%] > > > > > - else > > > > > - goto <bb 4>; [4.35%] > > > > > - > > > > > - <bb 4> [local count: 46707768]: > > > > > - return tmp_8; > > > > > + tmp_12 = tmp_6(D) & 7905747460161236406; > > > > > + return tmp_12; > > > > > > > > > > } > > > > > > > > > > > > > > > Boostrapped and regtested on x86_64-pc-linux-gnu{-m32,} > > > > > Ok for trunk? > > > > > > > > > > gcc/ChangeLog: > > > > > > > > > > PR middle-end/103462 > > > > > * match.pd (bitwise_induction_p): New match. > > > > > * tree-scalar-evolution.c (gimple_bitwise_induction_p): > > > > > Declare. > > > > > (analyze_and_compute_bitwise_induction_effect): New function. > > > > > (enum bit_op_kind): New enum. > > > > > (final_value_replacement_loop): Enhanced to handle bitwise > > > > > induction. > > > > > > > > > > gcc/testsuite/ChangeLog: > > > > > > > > > > * gcc.target/i386/pr103462-1.c: New test. > > > > > * gcc.target/i386/pr103462-2.c: New test. > > > > > * gcc.target/i386/pr103462-3.c: New test. > > > > > * gcc.target/i386/pr103462-4.c: New test. > > > > > * gcc.target/i386/pr103462-5.c: New test. > > > > > * gcc.target/i386/pr103462-6.c: New test. > > > > > --- > > > > > gcc/match.pd | 7 + > > > > > gcc/testsuite/gcc.target/i386/pr103462-1.c | 111 +++++++++++++ > > > > > gcc/testsuite/gcc.target/i386/pr103462-2.c | 45 ++++++ > > > > > gcc/testsuite/gcc.target/i386/pr103462-3.c | 111 +++++++++++++ > > > > > gcc/testsuite/gcc.target/i386/pr103462-4.c | 46 ++++++ > > > > > gcc/testsuite/gcc.target/i386/pr103462-5.c | 111 +++++++++++++ > > > > > gcc/testsuite/gcc.target/i386/pr103462-6.c | 46 ++++++ > > > > > gcc/tree-scalar-evolution.cc | 178 ++++++++++++++++++++- > > > > > 8 files changed, 654 insertions(+), 1 deletion(-) > > > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-1.c > > > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-2.c > > > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-3.c > > > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-4.c > > > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-5.c > > > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-6.c > > > > > > > > > > diff --git a/gcc/match.pd b/gcc/match.pd > > > > > index 6d691d302b3..24ff5f9e6a8 100644 > > > > > --- a/gcc/match.pd > > > > > +++ b/gcc/match.pd > > > > > @@ -7746,3 +7746,10 @@ and, > > > > > == TYPE_UNSIGNED (TREE_TYPE (@3)))) > > > > > && single_use (@4) > > > > > && single_use (@5)))) > > > > > + > > > > > +(for bit_op (bit_and bit_ior bit_xor) > > > > > + (match (bitwise_induction_p @0 @2 @3) > > > > > + (bit_op:c (nop_convert1? (bit_not2?@0 (convert3? (lshift integer_onep@1 @2)))) @3))) > > > > > + > > > > > +(match (bitwise_induction_p @0 @2 @3) > > > > > + (bit_not (nop_convert1? (bit_xor@0 (convert2? (lshift integer_onep@1 @2)) @3)))) > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-1.c b/gcc/testsuite/gcc.target/i386/pr103462-1.c > > > > > new file mode 100644 > > > > > index 00000000000..1dc4c2acad6 > > > > > --- /dev/null > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-1.c > > > > > @@ -0,0 +1,111 @@ > > > > > +/* { dg-do compile } */ > > > > > +/* { dg-options "-O1 -fdump-tree-sccp-details" } */ > > > > > +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */ > > > > > + > > > > > +unsigned long long > > > > > +__attribute__((noipa)) > > > > > +foo (unsigned long long tmp) > > > > > +{ > > > > > + for (int bit = 0; bit < 64; bit += 3) > > > > > + tmp &= ~(1ULL << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned long long > > > > > +__attribute__((noipa)) > > > > > +foo1 (unsigned long long tmp) > > > > > +{ > > > > > + for (int bit = 63; bit >= 0; bit -= 3) > > > > > + tmp &= ~(1ULL << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned long long > > > > > +__attribute__((noipa)) > > > > > +foo2 (unsigned long long tmp) > > > > > +{ > > > > > + for (int bit = 0; bit < 64; bit += 3) > > > > > + tmp &= (1ULL << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned long long > > > > > +__attribute__((noipa)) > > > > > +foo3 (unsigned long long tmp) > > > > > +{ > > > > > + for (int bit = 63; bit >= 0; bit -= 3) > > > > > + tmp &= (1ULL << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned long long > > > > > +__attribute__((noipa)) > > > > > +foo4 (unsigned long long tmp) > > > > > +{ > > > > > + for (int bit = 0; bit < 64; bit += 3) > > > > > + tmp |= ~(1ULL << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned long long > > > > > +__attribute__((noipa)) > > > > > +foo5 (unsigned long long tmp) > > > > > +{ > > > > > + for (int bit = 63; bit >= 0; bit -= 3) > > > > > + tmp |= ~(1ULL << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned long long > > > > > +__attribute__((noipa)) > > > > > +foo6 (unsigned long long tmp) > > > > > +{ > > > > > + for (int bit = 0; bit < 64; bit += 3) > > > > > + tmp |= (1ULL << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned long long > > > > > +__attribute__((noipa)) > > > > > +foo7 (unsigned long long tmp) > > > > > +{ > > > > > + for (int bit = 63; bit >= 0; bit -= 3) > > > > > + tmp |= (1ULL << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned long long > > > > > +__attribute__((noipa)) > > > > > +foo8 (unsigned long long tmp) > > > > > +{ > > > > > + for (int bit = 0; bit < 64; bit += 3) > > > > > + tmp ^= ~(1ULL << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned long long > > > > > +__attribute__((noipa)) > > > > > +foo9 (unsigned long long tmp) > > > > > +{ > > > > > + for (int bit = 63; bit >= 0; bit -= 3) > > > > > + tmp ^= ~(1ULL << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned long long > > > > > +__attribute__((noipa)) > > > > > +foo10 (unsigned long long tmp) > > > > > +{ > > > > > + for (int bit = 0; bit < 64; bit += 3) > > > > > + tmp ^= (1ULL << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned long long > > > > > +__attribute__((noipa)) > > > > > +foo11 (unsigned long long tmp) > > > > > +{ > > > > > + for (int bit = 63; bit >= 0; bit -= 3) > > > > > + tmp ^= (1ULL << bit); > > > > > + return tmp; > > > > > +} > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-2.c b/gcc/testsuite/gcc.target/i386/pr103462-2.c > > > > > new file mode 100644 > > > > > index 00000000000..bc375cb78d4 > > > > > --- /dev/null > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-2.c > > > > > @@ -0,0 +1,45 @@ > > > > > +/* { dg-do run } */ > > > > > +/* { dg-options "-O1" } */ > > > > > + > > > > > +#include "pr103462-1.c" > > > > > + > > > > > +int main() > > > > > +{ > > > > > + unsigned long long tmp = 0x1111111111111111ULL; > > > > > + if (foo (tmp) != 0x110110110110110ULL) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo1 (tmp) != 0x110110110110110ULL) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo2 (tmp) != 0x0ULL) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo3 (tmp) != 0x0ULL) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo4 (tmp) != 0xffffffffffffffffULL) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo5 (tmp) != 0xffffffffffffffffULL) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo6 (tmp) != 0x9359359359359359ULL) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo7 (tmp) != 0x9359359359359359ULL) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo8 (tmp) != 0x8358358358358358ULL) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo9 (tmp) != 0x8358358358358358ULL) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo10 (tmp) != 0x8358358358358358ULL) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo11 (tmp) != 0x8358358358358358ULL) > > > > > + __builtin_abort (); > > > > > +} > > > > > + > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-3.c b/gcc/testsuite/gcc.target/i386/pr103462-3.c > > > > > new file mode 100644 > > > > > index 00000000000..4ba248a4872 > > > > > --- /dev/null > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-3.c > > > > > @@ -0,0 +1,111 @@ > > > > > +/* { dg-do compile } */ > > > > > +/* { dg-options "-O1 -fdump-tree-sccp-details" } */ > > > > > +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */ > > > > > + > > > > > +unsigned int > > > > > +__attribute__((noipa)) > > > > > +foo (unsigned int tmp) > > > > > +{ > > > > > + for (int bit = 0; bit < 32; bit += 3) > > > > > + tmp &= ~(1U << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned int > > > > > +__attribute__((noipa)) > > > > > +foo1 (unsigned int tmp) > > > > > +{ > > > > > + for (int bit = 31; bit >= 0; bit -= 3) > > > > > + tmp &= ~(1U << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned int > > > > > +__attribute__((noipa)) > > > > > +foo2 (unsigned int tmp) > > > > > +{ > > > > > + for (int bit = 0; bit < 32; bit += 3) > > > > > + tmp &= (1U << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned int > > > > > +__attribute__((noipa)) > > > > > +foo3 (unsigned int tmp) > > > > > +{ > > > > > + for (int bit = 31; bit >= 0; bit -= 3) > > > > > + tmp &= (1U << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned int > > > > > +__attribute__((noipa)) > > > > > +foo4 (unsigned int tmp) > > > > > +{ > > > > > + for (int bit = 0; bit < 32; bit += 3) > > > > > + tmp |= ~(1U << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned int > > > > > +__attribute__((noipa)) > > > > > +foo5 (unsigned int tmp) > > > > > +{ > > > > > + for (int bit = 31; bit >= 0; bit -= 3) > > > > > + tmp |= ~(1U << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned int > > > > > +__attribute__((noipa)) > > > > > +foo6 (unsigned int tmp) > > > > > +{ > > > > > + for (int bit = 0; bit < 32; bit += 3) > > > > > + tmp |= (1U << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned int > > > > > +__attribute__((noipa)) > > > > > +foo7 (unsigned int tmp) > > > > > +{ > > > > > + for (int bit = 31; bit >= 0; bit -= 3) > > > > > + tmp |= (1U << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned int > > > > > +__attribute__((noipa)) > > > > > +foo8 (unsigned int tmp) > > > > > +{ > > > > > + for (int bit = 0; bit < 32; bit += 3) > > > > > + tmp ^= ~(1U << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned int > > > > > +__attribute__((noipa)) > > > > > +foo9 (unsigned int tmp) > > > > > +{ > > > > > + for (int bit = 31; bit >= 0; bit -= 3) > > > > > + tmp ^= ~(1U << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned int > > > > > +__attribute__((noipa)) > > > > > +foo10 (unsigned int tmp) > > > > > +{ > > > > > + for (int bit = 0; bit < 32; bit += 3) > > > > > + tmp ^= (1U << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned int > > > > > +__attribute__((noipa)) > > > > > +foo11 (unsigned int tmp) > > > > > +{ > > > > > + for (int bit = 31; bit >= 0; bit -= 3) > > > > > + tmp ^= (1U << bit); > > > > > + return tmp; > > > > > +} > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-4.c b/gcc/testsuite/gcc.target/i386/pr103462-4.c > > > > > new file mode 100644 > > > > > index 00000000000..e2f4056f525 > > > > > --- /dev/null > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-4.c > > > > > @@ -0,0 +1,46 @@ > > > > > +/* { dg-do run } */ > > > > > +/* { dg-options "-O1" } */ > > > > > + > > > > > +#include "pr103462-3.c" > > > > > + > > > > > +int main() > > > > > +{ > > > > > + unsigned int tmp = 0x11111111U; > > > > > + > > > > > + if (foo (tmp) != 0x10110110U) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo1 (tmp) != 0x1101101U) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo2 (tmp) != 0x0U) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo3 (tmp) != 0x0U) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo4 (tmp) != 0xffffffffU) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo5 (tmp) != 0xffffffffU) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo6 (tmp) != 0x59359359U) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo7 (tmp) != 0x93593593U) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo8 (tmp) != 0xa7ca7ca7U) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo9 (tmp) != 0x7ca7ca7cU) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo10 (tmp) != 0x58358358U) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo11 (tmp) != 0x83583583U) > > > > > + __builtin_abort (); > > > > > +} > > > > > + > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-5.c b/gcc/testsuite/gcc.target/i386/pr103462-5.c > > > > > new file mode 100644 > > > > > index 00000000000..1f4ffa34b48 > > > > > --- /dev/null > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-5.c > > > > > @@ -0,0 +1,111 @@ > > > > > +/* { dg-do compile } */ > > > > > +/* { dg-options "-O1 -fdump-tree-sccp-details" } */ > > > > > +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */ > > > > > + > > > > > +unsigned short > > > > > +__attribute__((noipa)) > > > > > +foo (unsigned short tmp) > > > > > +{ > > > > > + for (int bit = 0; bit < 16; bit += 3) > > > > > + tmp &= ~(1U << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned short > > > > > +__attribute__((noipa)) > > > > > +foo1 (unsigned short tmp) > > > > > +{ > > > > > + for (int bit = 15; bit >= 0; bit -= 3) > > > > > + tmp &= ~(1U << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned short > > > > > +__attribute__((noipa)) > > > > > +foo2 (unsigned short tmp) > > > > > +{ > > > > > + for (int bit = 0; bit < 16; bit += 3) > > > > > + tmp &= (1U << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned short > > > > > +__attribute__((noipa)) > > > > > +foo3 (unsigned short tmp) > > > > > +{ > > > > > + for (int bit = 15; bit >= 0; bit -= 3) > > > > > + tmp &= (1U << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned short > > > > > +__attribute__((noipa)) > > > > > +foo4 (unsigned short tmp) > > > > > +{ > > > > > + for (int bit = 0; bit < 16; bit += 3) > > > > > + tmp |= ~(1U << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned short > > > > > +__attribute__((noipa)) > > > > > +foo5 (unsigned short tmp) > > > > > +{ > > > > > + for (int bit = 15; bit >= 0; bit -= 3) > > > > > + tmp |= ~(1U << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned short > > > > > +__attribute__((noipa)) > > > > > +foo6 (unsigned short tmp) > > > > > +{ > > > > > + for (int bit = 0; bit < 16; bit += 3) > > > > > + tmp |= (1U << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned short > > > > > +__attribute__((noipa)) > > > > > +foo7 (unsigned short tmp) > > > > > +{ > > > > > + for (int bit = 15; bit >= 0; bit -= 3) > > > > > + tmp |= (1U << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned short > > > > > +__attribute__((noipa)) > > > > > +foo8 (unsigned short tmp) > > > > > +{ > > > > > + for (int bit = 0; bit < 16; bit += 3) > > > > > + tmp ^= ~(1U << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned short > > > > > +__attribute__((noipa)) > > > > > +foo9 (unsigned short tmp) > > > > > +{ > > > > > + for (int bit = 15; bit >= 0; bit -= 3) > > > > > + tmp ^= ~(1U << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned short > > > > > +__attribute__((noipa)) > > > > > +foo10 (unsigned short tmp) > > > > > +{ > > > > > + for (int bit = 0; bit < 16; bit += 3) > > > > > + tmp ^= (1U << bit); > > > > > + return tmp; > > > > > +} > > > > > + > > > > > +unsigned short > > > > > +__attribute__((noipa)) > > > > > +foo11 (unsigned short tmp) > > > > > +{ > > > > > + for (int bit = 15; bit >= 0; bit -= 3) > > > > > + tmp ^= (1U << bit); > > > > > + return tmp; > > > > > +} > > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-6.c b/gcc/testsuite/gcc.target/i386/pr103462-6.c > > > > > new file mode 100644 > > > > > index 00000000000..65426d71efe > > > > > --- /dev/null > > > > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-6.c > > > > > @@ -0,0 +1,46 @@ > > > > > +/* { dg-do run } */ > > > > > +/* { dg-options "-O1" } */ > > > > > + > > > > > +#include "pr103462-5.c" > > > > > + > > > > > +int main() > > > > > +{ > > > > > + unsigned short tmp = 0x1111U; > > > > > + > > > > > + if (foo (tmp) != 0x110) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo1 (tmp) != 0x110) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo2 (tmp) != 0x0) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo3 (tmp) != 0x0) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo4 (tmp) != 0xffff) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo5 (tmp) != 0xffff) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo6 (tmp) != 0x9359) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo7 (tmp) != 0x9359) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo8 (tmp) != 0x8358) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo9 (tmp) != 0x8358) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo10 (tmp) != 0x8358) > > > > > + __builtin_abort (); > > > > > + > > > > > + if (foo11 (tmp) != 0x8358) > > > > > + __builtin_abort (); > > > > > +} > > > > > + > > > > > diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc > > > > > index 72ceb4001e3..9b0aec4fd09 100644 > > > > > --- a/gcc/tree-scalar-evolution.cc > > > > > +++ b/gcc/tree-scalar-evolution.cc > > > > > @@ -3487,6 +3487,164 @@ expression_expensive_p (tree expr) > > > > > || expanded_size > cache.elements ()); > > > > > } > > > > > > > > > > +/* Match.pd function to match bitwise inductive expression. > > > > > + .i.e. > > > > > + _2 = 1 << _1; > > > > > + _3 = ~_2; > > > > > + tmp_9 = _3 & tmp_12; */ > > > > > +extern bool gimple_bitwise_induction_p (tree, tree *, tree (*)(tree)); > > > > > + > > > > > +/* Return the inductive expression of bitwise operation if possible, > > > > > + otherwise returns DEF. */ > > > > > +static tree > > > > > +analyze_and_compute_bitwise_induction_effect (class loop* ex_loop, > > > > > + class loop* loop, > > > > > + tree phidef, > > > > > + tree def, > > > > > + tree niter) > > > > > +{ > > > > > + tree match_op[3],inv, bitwise_scev, bitwise_res; > > > > > + bool folded_casts = false; > > > > > + tree type = TREE_TYPE (phidef); > > > > > + gimple* header_phi = NULL; > > > > > > > > use a gphi * > > > > > > > Changed. > > > > > + > > > > > + /* Match things like op2(MATCH_OP[2]), op1(MATCH_OP[1]), phidef(PHIDEF) > > > > > + > > > > > + op2 = PHI <phidef, inv> > > > > > + _1 = (int) bit_17; > > > > > + _3 = 1 << _1; > > > > > + op1 = ~_3; > > > > > + phidef = op1 & op2; */ > > > > > + if (!gimple_bitwise_induction_p (phidef, &match_op[0], NULL) > > > > > + || TREE_CODE (match_op[2]) != SSA_NAME > > > > > + || gimple_code (SSA_NAME_DEF_STMT (match_op[2])) != GIMPLE_PHI > > > > > > > > || !(header_phi = dyn_cast <gphi *> (SSA_NAME_DEF_STMT (match_op[2]))) > > > > > > > > > + || gimple_phi_num_args (SSA_NAME_DEF_STMT (match_op[2])) != 2) > > > > > > > > and then use header_phi here, if you pass in the PHI to use you'll > > > > always have 2 args > > > changed. > > > > > > > > > + return def; > > > > > + > > > > > + header_phi = SSA_NAME_DEF_STMT (match_op[2]); > > > > > + if (gimple_bb (header_phi) != loop->header) > > > > > + return def; > > > > > > > > I think passing in the PHI node and the exit edge instead of phidef > > > > would make the above more clear > > > Remove the upper condition. > > > > > > > > > + if (PHI_ARG_DEF_FROM_EDGE (header_phi, loop_latch_edge (loop)) != phidef) > > > > > + return def; > > > > > + > > > > > + inv = gimple_phi_arg_def (header_phi, 0); > > > > > + if (inv == phidef) > > > > > + inv = gimple_phi_arg_def (header_phi, 1); > > > > > > > > inv = gimple_phi_arg_def_from_edge (header_phi, loop_preheader_edge (loop)); > > > > > > > > where is this used? > > > Move to the place where it's used. > > > > > > > > > + > > > > > + bitwise_scev = analyze_scalar_evolution_in_loop (ex_loop, loop, > > > > > + match_op[1], > > > > > + &folded_casts); > > > > > > > > you want > > > > > > > > bitwise_scev = analyze_scalar_evolution (loop, match_op[1]); > > > > bitwise_scev = instantiate_parameters (loop, bitwise_scev); > > > > > > > > instead since you are interested in in-loop behavior? > > > Yes, changed. > > > > > > > > > + > > > > > + /* Make sure bits is in range of type precision. */ > > > > > + if (TREE_CODE (bitwise_scev) != POLYNOMIAL_CHREC > > > > > > > > I think you want to test || !INTEGRAL_TYPE_P (bitwise_scev) since you > > > > do not rule out pointers or vector types. > > > Changed. > > > > > > > > > + || !tree_fits_uhwi_p (CHREC_LEFT (bitwise_scev)) > > > > > + || tree_to_uhwi (CHREC_LEFT (bitwise_scev)) >= TYPE_PRECISION (type) > > > > > + || !tree_fits_shwi_p (CHREC_RIGHT (bitwise_scev))) > > > > > + return def; > > > > > + > > > > > + bitwise_res = compute_overall_effect_of_inner_loop (ex_loop, bitwise_scev); > > > > > + if (!tree_fits_uhwi_p (bitwise_res) > > > > > + || tree_to_uhwi (bitwise_res) >= TYPE_PRECISION (type)) > > > > > + return def; > > > > > > > > Hmm, so I think that bitwise_res is the final value of the 'bit' IV, correct? > > > > So name it bit_final? > > > Changed. > > > > > > > > > + > > > > > +enum bit_op_kind > > > > > + { > > > > > + INDUCTION_BIT_CLEAR, > > > > > + INDUCTION_BIT_IOR, > > > > > + INDUCTION_BIT_XOR, > > > > > + INDUCTION_BIT_RESET, > > > > > + INDUCTION_ZERO, > > > > > + INDUCTION_ALL > > > > > + }; > > > > > + > > > > > + enum bit_op_kind induction_kind; > > > > > + enum tree_code code1 > > > > > + = gimple_assign_rhs_code (SSA_NAME_DEF_STMT (phidef)); > > > > > + enum tree_code code2 > > > > > + = gimple_assign_rhs_code (SSA_NAME_DEF_STMT (match_op[0])); > > > > > + > > > > > + /* BIT_CLEAR: A &= ~(1 << bit) > > > > > + BIT_RESET: A ^= (1 << bit). > > > > > + BIT_IOR: A |= (1 << bit) > > > > > + BIT_ZERO: A &= (1 << bit) > > > > > + BIT_ALL: A |= ~(1 << bit) > > > > > + BIT_XOR: A ^= ~(1 << bit). > > > > > + bit is induction variable. */ > > > > > + switch (code1) > > > > > + { > > > > > + case BIT_AND_EXPR: > > > > > + induction_kind = code2 == BIT_NOT_EXPR > > > > > + ? INDUCTION_BIT_CLEAR > > > > > + : INDUCTION_ZERO; > > > > > + break; > > > > > + case BIT_IOR_EXPR: > > > > > + induction_kind = code2 == BIT_NOT_EXPR > > > > > + ? INDUCTION_ALL > > > > > + : INDUCTION_BIT_IOR; > > > > > + break; > > > > > + case BIT_XOR_EXPR: > > > > > + induction_kind = code2 == BIT_NOT_EXPR > > > > > + ? INDUCTION_BIT_XOR > > > > > + : INDUCTION_BIT_RESET; > > > > > + break; > > > > > + /* A ^ ~(1 << bit) is equal to ~(A ^ (1 << bit)). */ > > > > > + case BIT_NOT_EXPR: > > > > > + gcc_assert (code2 == BIT_XOR_EXPR); > > > > > + induction_kind = INDUCTION_BIT_XOR; > > > > > + break; > > > > > + default: > > > > > + gcc_unreachable(); > > > > > + } > > > > > + > > > > > + if (induction_kind == INDUCTION_ZERO) > > > > > + return build_zero_cst (type); > > > > > > > > is that true even when the loop doesn't iterate? > > > > > > > > > + if (induction_kind == INDUCTION_ALL) > > > > > + return build_all_ones_cst (type); > > > > > > > > Likewise? > > > the niter is known > 0 and < TYPE_PRECISION (TYPE_SIZE (type), so it > > > must be iterated. > > > > > > > > > + > > > > > + wide_int bits = wi::zero (TYPE_PRECISION (type)); > > > > > + HOST_WIDE_INT start = tree_to_shwi (CHREC_LEFT (bitwise_scev)); > > > > > > > > you checked fits_uhwi above, name it bit_start? > > > > > > > > > + HOST_WIDE_INT step = tree_to_shwi (CHREC_RIGHT (bitwise_scev)); > > > > > + unsigned HOST_WIDE_INT num_iter = tree_to_uhwi (niter); > > > > > + > > > > > + /* Loop tripcount should be num_iter + 1. */ > > > > > + for (unsigned i = 0; i != num_iter + 1; i++) > > > > > + { > > > > > + bits = wi::bit_or (bits, > > > > > + wi::lshift (wi::one (TYPE_PRECISION (type)), > > > > > + start)); > > > > > > > > you can use bits = wi::set_bit (bits, start); > > > Changed. > > > > > > > > > + start += step; > > > > > + } > > > > > > > > Uh. You don't limit 'num_iter' but you limit bit_final. So supposed that > > > > the IV wraps and is unsigned char and you have a 256 bit integer the > > > > above could run quite a long time. Please limit num_iter to TYPE_PRECISION > > > > as well (do you have a testcase with a wrapping IV? try some != CST exit > > > > condition and a large increment) > > > Limit to > 0 and < TYPE_SIZE. > > > > > > > > > + > > > > > + bool inverted = false; > > > > > + switch (induction_kind) > > > > > + { > > > > > + case INDUCTION_BIT_CLEAR: > > > > > + code1 = BIT_AND_EXPR; > > > > > + inverted = true; > > > > > + break; > > > > > + case INDUCTION_BIT_IOR: > > > > > + code1 = BIT_IOR_EXPR; > > > > > + break; > > > > > + case INDUCTION_BIT_RESET: > > > > > + code1 = BIT_XOR_EXPR; > > > > > + break; > > > > > + /* A ^= ~(1 << bit) is special, when loop tripcount is even, > > > > > + it's equal to A ^= bits, else A ^= ~bits. */ > > > > > + case INDUCTION_BIT_XOR: > > > > > + code1 = BIT_XOR_EXPR; > > > > > + if (num_iter % 2 == 0) > > > > > + inverted = true; > > > > > + break; > > > > > + default: > > > > > + gcc_unreachable (); > > > > > + } > > > > > + > > > > > + if (inverted) > > > > > + bits = wi::bit_not (bits); > > > > > + return fold_build2 (code1, type, inv, wide_int_to_tree (type, bits)); > > > > > +} > > > > > + > > > > > /* Do final value replacement for LOOP, return true if we did anything. */ > > > > > > > > > > bool > > > > > @@ -3519,7 +3677,8 @@ final_value_replacement_loop (class loop *loop) > > > > > { > > > > > gphi *phi = psi.phi (); > > > > > tree rslt = PHI_RESULT (phi); > > > > > - tree def = PHI_ARG_DEF_FROM_EDGE (phi, exit); > > > > > + tree phidef = PHI_ARG_DEF_FROM_EDGE (phi, exit); > > > > > + tree def = phidef; > > > > > if (virtual_operand_p (def)) > > > > > { > > > > > gsi_next (&psi); > > > > > @@ -3537,6 +3696,23 @@ final_value_replacement_loop (class loop *loop) > > > > > def = analyze_scalar_evolution_in_loop (ex_loop, loop, def, > > > > > &folded_casts); > > > > > def = compute_overall_effect_of_inner_loop (ex_loop, def); > > > > > + > > > > > + /* Handle bitwise induction expression. > > > > > + > > > > > + .i.e. > > > > > + for (int i = 0; i != 64; i+=3) > > > > > + res &= ~(1UL << i); > > > > > + > > > > > + RES can't be analyzed out by SCEV because it is not polynomially > > > > > + expressible, but in fact final value of RES can be replaced by > > > > > + RES & CONSTANT where CONSTANT all ones with bit {0,3,6,9,... ,63} > > > > > + being cleared, similar for BIT_IOR_EXPR/BIT_XOR_EXPR. */ > > > > > + if (tree_fits_uhwi_p (niter) > > > > > + && tree_to_uhwi (niter) > > > > > > > > that's redundant - ah, it's the not iterating case that's excluded. > > > > please write it > > > > as tree_to_uhwi (niter) != 0, also consider assigning to an unsigned > > > > HOST_WIDE_INT > > > > and pass it that way to analyze_and_compute_bitwise_induction_effect > > > Changed. > > > > > > > > > + && tree_to_uhwi (niter) != HOST_WIDE_INT_M1U) > > > > > + def = analyze_and_compute_bitwise_induction_effect (ex_loop, loop, > > > > > + phidef, def, niter); > > > > > + > > > > > > > > it seems you overwrite the SCEV analysis result unconditionally here, please > > > > move this inside of the following if () so it is done as fallback only > > > > (or add it as && !(def = ...) to the condition) > > > original def will be returned when bitwise induction fails which means > > > rewriting is ok, but i think it's better to not pass original def and > > > use your suggestion. > > > Changed. > > > > > > > > > if (!tree_does_not_contain_chrecs (def) > > > > > || chrec_contains_symbols_defined_in_loop (def, ex_loop->num) > > > > > /* Moving the computation from the loop may prolong life range > > > > > -- > > > > > 2.18.1 > > > > > > > > > > > Here's an updated V2 patch. > > > > + || !(header_phi = dyn_cast <gphi *> (SSA_NAME_DEF_STMT (match_op[2]))) > > + || header_phi->nargs != 2) > > > > please use gimple_phi_num_args (header_phi) != 2 > Changed. > > > > + bitwise_scev = analyze_scalar_evolution (loop, match_op[1]); > > + bitwise_scev = instantiate_parameters (loop, bitwise_scev); > > + > > > > so that's now OK to get the evolution of the bit IV > > > > + bit_final = compute_overall_effect_of_inner_loop (ex_loop, bitwise_scev); > > > > but this might not, for the final value you probably want to re-analyze the > > SCEV wrt the outer loop using your original analyze_scalar_evolution_in_loop > > query. Of course the final value could be also computed with > > CHREC_LEFT + niter * CHREC_RIGHT, so it might be more convenient > > to remove the above and re-formulate the check it is used in after > > > > + wide_int bits = wi::zero (TYPE_PRECISION (type)); > > + HOST_WIDE_INT bit_start = tree_to_shwi (CHREC_LEFT (bitwise_scev)); > > + HOST_WIDE_INT step = tree_to_shwi (CHREC_RIGHT (bitwise_scev)); > > > > ? > > > How about > + HOST_WIDE_INT bit_final = bit_start + step * niter; > + > + /* bit_start, bit_final in range of [0,TYPE_PRECISION) > + implies all bits are set in range. */ > + if (bit_final >= TYPE_PRECISION (type) > + || bit_final < 0) > + return NULL_TREE; > > Also remove ex_loop from the parameter. OK. Thanks, Richard. > Updated patch. > > OK with that changes. > > > > Thanks, > > Richard. > > > > > > > > > > > -- > > > BR, > > > Hongtao > > > > -- > BR, > Hongtao
diff --git a/gcc/match.pd b/gcc/match.pd index 6d691d302b3..24ff5f9e6a8 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -7746,3 +7746,10 @@ and, == TYPE_UNSIGNED (TREE_TYPE (@3)))) && single_use (@4) && single_use (@5)))) + +(for bit_op (bit_and bit_ior bit_xor) + (match (bitwise_induction_p @0 @2 @3) + (bit_op:c (nop_convert1? (bit_not2?@0 (convert3? (lshift integer_onep@1 @2)))) @3))) + +(match (bitwise_induction_p @0 @2 @3) + (bit_not (nop_convert1? (bit_xor@0 (convert2? (lshift integer_onep@1 @2)) @3)))) diff --git a/gcc/testsuite/gcc.target/i386/pr103462-1.c b/gcc/testsuite/gcc.target/i386/pr103462-1.c new file mode 100644 index 00000000000..1dc4c2acad6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr103462-1.c @@ -0,0 +1,111 @@ +/* { dg-do compile } */ +/* { dg-options "-O1 -fdump-tree-sccp-details" } */ +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */ + +unsigned long long +__attribute__((noipa)) +foo (unsigned long long tmp) +{ + for (int bit = 0; bit < 64; bit += 3) + tmp &= ~(1ULL << bit); + return tmp; +} + +unsigned long long +__attribute__((noipa)) +foo1 (unsigned long long tmp) +{ + for (int bit = 63; bit >= 0; bit -= 3) + tmp &= ~(1ULL << bit); + return tmp; +} + +unsigned long long +__attribute__((noipa)) +foo2 (unsigned long long tmp) +{ + for (int bit = 0; bit < 64; bit += 3) + tmp &= (1ULL << bit); + return tmp; +} + +unsigned long long +__attribute__((noipa)) +foo3 (unsigned long long tmp) +{ + for (int bit = 63; bit >= 0; bit -= 3) + tmp &= (1ULL << bit); + return tmp; +} + +unsigned long long +__attribute__((noipa)) +foo4 (unsigned long long tmp) +{ + for (int bit = 0; bit < 64; bit += 3) + tmp |= ~(1ULL << bit); + return tmp; +} + +unsigned long long +__attribute__((noipa)) +foo5 (unsigned long long tmp) +{ + for (int bit = 63; bit >= 0; bit -= 3) + tmp |= ~(1ULL << bit); + return tmp; +} + +unsigned long long +__attribute__((noipa)) +foo6 (unsigned long long tmp) +{ + for (int bit = 0; bit < 64; bit += 3) + tmp |= (1ULL << bit); + return tmp; +} + +unsigned long long +__attribute__((noipa)) +foo7 (unsigned long long tmp) +{ + for (int bit = 63; bit >= 0; bit -= 3) + tmp |= (1ULL << bit); + return tmp; +} + +unsigned long long +__attribute__((noipa)) +foo8 (unsigned long long tmp) +{ + for (int bit = 0; bit < 64; bit += 3) + tmp ^= ~(1ULL << bit); + return tmp; +} + +unsigned long long +__attribute__((noipa)) +foo9 (unsigned long long tmp) +{ + for (int bit = 63; bit >= 0; bit -= 3) + tmp ^= ~(1ULL << bit); + return tmp; +} + +unsigned long long +__attribute__((noipa)) +foo10 (unsigned long long tmp) +{ + for (int bit = 0; bit < 64; bit += 3) + tmp ^= (1ULL << bit); + return tmp; +} + +unsigned long long +__attribute__((noipa)) +foo11 (unsigned long long tmp) +{ + for (int bit = 63; bit >= 0; bit -= 3) + tmp ^= (1ULL << bit); + return tmp; +} diff --git a/gcc/testsuite/gcc.target/i386/pr103462-2.c b/gcc/testsuite/gcc.target/i386/pr103462-2.c new file mode 100644 index 00000000000..bc375cb78d4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr103462-2.c @@ -0,0 +1,45 @@ +/* { dg-do run } */ +/* { dg-options "-O1" } */ + +#include "pr103462-1.c" + +int main() +{ + unsigned long long tmp = 0x1111111111111111ULL; + if (foo (tmp) != 0x110110110110110ULL) + __builtin_abort (); + + if (foo1 (tmp) != 0x110110110110110ULL) + __builtin_abort (); + + if (foo2 (tmp) != 0x0ULL) + __builtin_abort (); + + if (foo3 (tmp) != 0x0ULL) + __builtin_abort (); + + if (foo4 (tmp) != 0xffffffffffffffffULL) + __builtin_abort (); + + if (foo5 (tmp) != 0xffffffffffffffffULL) + __builtin_abort (); + + if (foo6 (tmp) != 0x9359359359359359ULL) + __builtin_abort (); + + if (foo7 (tmp) != 0x9359359359359359ULL) + __builtin_abort (); + + if (foo8 (tmp) != 0x8358358358358358ULL) + __builtin_abort (); + + if (foo9 (tmp) != 0x8358358358358358ULL) + __builtin_abort (); + + if (foo10 (tmp) != 0x8358358358358358ULL) + __builtin_abort (); + + if (foo11 (tmp) != 0x8358358358358358ULL) + __builtin_abort (); +} + diff --git a/gcc/testsuite/gcc.target/i386/pr103462-3.c b/gcc/testsuite/gcc.target/i386/pr103462-3.c new file mode 100644 index 00000000000..4ba248a4872 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr103462-3.c @@ -0,0 +1,111 @@ +/* { dg-do compile } */ +/* { dg-options "-O1 -fdump-tree-sccp-details" } */ +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */ + +unsigned int +__attribute__((noipa)) +foo (unsigned int tmp) +{ + for (int bit = 0; bit < 32; bit += 3) + tmp &= ~(1U << bit); + return tmp; +} + +unsigned int +__attribute__((noipa)) +foo1 (unsigned int tmp) +{ + for (int bit = 31; bit >= 0; bit -= 3) + tmp &= ~(1U << bit); + return tmp; +} + +unsigned int +__attribute__((noipa)) +foo2 (unsigned int tmp) +{ + for (int bit = 0; bit < 32; bit += 3) + tmp &= (1U << bit); + return tmp; +} + +unsigned int +__attribute__((noipa)) +foo3 (unsigned int tmp) +{ + for (int bit = 31; bit >= 0; bit -= 3) + tmp &= (1U << bit); + return tmp; +} + +unsigned int +__attribute__((noipa)) +foo4 (unsigned int tmp) +{ + for (int bit = 0; bit < 32; bit += 3) + tmp |= ~(1U << bit); + return tmp; +} + +unsigned int +__attribute__((noipa)) +foo5 (unsigned int tmp) +{ + for (int bit = 31; bit >= 0; bit -= 3) + tmp |= ~(1U << bit); + return tmp; +} + +unsigned int +__attribute__((noipa)) +foo6 (unsigned int tmp) +{ + for (int bit = 0; bit < 32; bit += 3) + tmp |= (1U << bit); + return tmp; +} + +unsigned int +__attribute__((noipa)) +foo7 (unsigned int tmp) +{ + for (int bit = 31; bit >= 0; bit -= 3) + tmp |= (1U << bit); + return tmp; +} + +unsigned int +__attribute__((noipa)) +foo8 (unsigned int tmp) +{ + for (int bit = 0; bit < 32; bit += 3) + tmp ^= ~(1U << bit); + return tmp; +} + +unsigned int +__attribute__((noipa)) +foo9 (unsigned int tmp) +{ + for (int bit = 31; bit >= 0; bit -= 3) + tmp ^= ~(1U << bit); + return tmp; +} + +unsigned int +__attribute__((noipa)) +foo10 (unsigned int tmp) +{ + for (int bit = 0; bit < 32; bit += 3) + tmp ^= (1U << bit); + return tmp; +} + +unsigned int +__attribute__((noipa)) +foo11 (unsigned int tmp) +{ + for (int bit = 31; bit >= 0; bit -= 3) + tmp ^= (1U << bit); + return tmp; +} diff --git a/gcc/testsuite/gcc.target/i386/pr103462-4.c b/gcc/testsuite/gcc.target/i386/pr103462-4.c new file mode 100644 index 00000000000..e2f4056f525 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr103462-4.c @@ -0,0 +1,46 @@ +/* { dg-do run } */ +/* { dg-options "-O1" } */ + +#include "pr103462-3.c" + +int main() +{ + unsigned int tmp = 0x11111111U; + + if (foo (tmp) != 0x10110110U) + __builtin_abort (); + + if (foo1 (tmp) != 0x1101101U) + __builtin_abort (); + + if (foo2 (tmp) != 0x0U) + __builtin_abort (); + + if (foo3 (tmp) != 0x0U) + __builtin_abort (); + + if (foo4 (tmp) != 0xffffffffU) + __builtin_abort (); + + if (foo5 (tmp) != 0xffffffffU) + __builtin_abort (); + + if (foo6 (tmp) != 0x59359359U) + __builtin_abort (); + + if (foo7 (tmp) != 0x93593593U) + __builtin_abort (); + + if (foo8 (tmp) != 0xa7ca7ca7U) + __builtin_abort (); + + if (foo9 (tmp) != 0x7ca7ca7cU) + __builtin_abort (); + + if (foo10 (tmp) != 0x58358358U) + __builtin_abort (); + + if (foo11 (tmp) != 0x83583583U) + __builtin_abort (); +} + diff --git a/gcc/testsuite/gcc.target/i386/pr103462-5.c b/gcc/testsuite/gcc.target/i386/pr103462-5.c new file mode 100644 index 00000000000..1f4ffa34b48 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr103462-5.c @@ -0,0 +1,111 @@ +/* { dg-do compile } */ +/* { dg-options "-O1 -fdump-tree-sccp-details" } */ +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */ + +unsigned short +__attribute__((noipa)) +foo (unsigned short tmp) +{ + for (int bit = 0; bit < 16; bit += 3) + tmp &= ~(1U << bit); + return tmp; +} + +unsigned short +__attribute__((noipa)) +foo1 (unsigned short tmp) +{ + for (int bit = 15; bit >= 0; bit -= 3) + tmp &= ~(1U << bit); + return tmp; +} + +unsigned short +__attribute__((noipa)) +foo2 (unsigned short tmp) +{ + for (int bit = 0; bit < 16; bit += 3) + tmp &= (1U << bit); + return tmp; +} + +unsigned short +__attribute__((noipa)) +foo3 (unsigned short tmp) +{ + for (int bit = 15; bit >= 0; bit -= 3) + tmp &= (1U << bit); + return tmp; +} + +unsigned short +__attribute__((noipa)) +foo4 (unsigned short tmp) +{ + for (int bit = 0; bit < 16; bit += 3) + tmp |= ~(1U << bit); + return tmp; +} + +unsigned short +__attribute__((noipa)) +foo5 (unsigned short tmp) +{ + for (int bit = 15; bit >= 0; bit -= 3) + tmp |= ~(1U << bit); + return tmp; +} + +unsigned short +__attribute__((noipa)) +foo6 (unsigned short tmp) +{ + for (int bit = 0; bit < 16; bit += 3) + tmp |= (1U << bit); + return tmp; +} + +unsigned short +__attribute__((noipa)) +foo7 (unsigned short tmp) +{ + for (int bit = 15; bit >= 0; bit -= 3) + tmp |= (1U << bit); + return tmp; +} + +unsigned short +__attribute__((noipa)) +foo8 (unsigned short tmp) +{ + for (int bit = 0; bit < 16; bit += 3) + tmp ^= ~(1U << bit); + return tmp; +} + +unsigned short +__attribute__((noipa)) +foo9 (unsigned short tmp) +{ + for (int bit = 15; bit >= 0; bit -= 3) + tmp ^= ~(1U << bit); + return tmp; +} + +unsigned short +__attribute__((noipa)) +foo10 (unsigned short tmp) +{ + for (int bit = 0; bit < 16; bit += 3) + tmp ^= (1U << bit); + return tmp; +} + +unsigned short +__attribute__((noipa)) +foo11 (unsigned short tmp) +{ + for (int bit = 15; bit >= 0; bit -= 3) + tmp ^= (1U << bit); + return tmp; +} diff --git a/gcc/testsuite/gcc.target/i386/pr103462-6.c b/gcc/testsuite/gcc.target/i386/pr103462-6.c new file mode 100644 index 00000000000..65426d71efe --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr103462-6.c @@ -0,0 +1,46 @@ +/* { dg-do run } */ +/* { dg-options "-O1" } */ + +#include "pr103462-5.c" + +int main() +{ + unsigned short tmp = 0x1111U; + + if (foo (tmp) != 0x110) + __builtin_abort (); + + if (foo1 (tmp) != 0x110) + __builtin_abort (); + + if (foo2 (tmp) != 0x0) + __builtin_abort (); + + if (foo3 (tmp) != 0x0) + __builtin_abort (); + + if (foo4 (tmp) != 0xffff) + __builtin_abort (); + + if (foo5 (tmp) != 0xffff) + __builtin_abort (); + + if (foo6 (tmp) != 0x9359) + __builtin_abort (); + + if (foo7 (tmp) != 0x9359) + __builtin_abort (); + + if (foo8 (tmp) != 0x8358) + __builtin_abort (); + + if (foo9 (tmp) != 0x8358) + __builtin_abort (); + + if (foo10 (tmp) != 0x8358) + __builtin_abort (); + + if (foo11 (tmp) != 0x8358) + __builtin_abort (); +} + diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc index 72ceb4001e3..9b0aec4fd09 100644 --- a/gcc/tree-scalar-evolution.cc +++ b/gcc/tree-scalar-evolution.cc @@ -3487,6 +3487,164 @@ expression_expensive_p (tree expr) || expanded_size > cache.elements ()); } +/* Match.pd function to match bitwise inductive expression. + .i.e. + _2 = 1 << _1; + _3 = ~_2; + tmp_9 = _3 & tmp_12; */ +extern bool gimple_bitwise_induction_p (tree, tree *, tree (*)(tree)); + +/* Return the inductive expression of bitwise operation if possible, + otherwise returns DEF. */ +static tree +analyze_and_compute_bitwise_induction_effect (class loop* ex_loop, + class loop* loop, + tree phidef, + tree def, + tree niter) +{ + tree match_op[3],inv, bitwise_scev, bitwise_res; + bool folded_casts = false; + tree type = TREE_TYPE (phidef); + gimple* header_phi = NULL; + + /* Match things like op2(MATCH_OP[2]), op1(MATCH_OP[1]), phidef(PHIDEF) + + op2 = PHI <phidef, inv> + _1 = (int) bit_17; + _3 = 1 << _1; + op1 = ~_3; + phidef = op1 & op2; */ + if (!gimple_bitwise_induction_p (phidef, &match_op[0], NULL) + || TREE_CODE (match_op[2]) != SSA_NAME + || gimple_code (SSA_NAME_DEF_STMT (match_op[2])) != GIMPLE_PHI + || gimple_phi_num_args (SSA_NAME_DEF_STMT (match_op[2])) != 2) + return def; + + header_phi = SSA_NAME_DEF_STMT (match_op[2]); + if (gimple_bb (header_phi) != loop->header) + return def; + + if (PHI_ARG_DEF_FROM_EDGE (header_phi, loop_latch_edge (loop)) != phidef) + return def; + + inv = gimple_phi_arg_def (header_phi, 0); + if (inv == phidef) + inv = gimple_phi_arg_def (header_phi, 1); + + bitwise_scev = analyze_scalar_evolution_in_loop (ex_loop, loop, + match_op[1], + &folded_casts); + + /* Make sure bits is in range of type precision. */ + if (TREE_CODE (bitwise_scev) != POLYNOMIAL_CHREC + || !tree_fits_uhwi_p (CHREC_LEFT (bitwise_scev)) + || tree_to_uhwi (CHREC_LEFT (bitwise_scev)) >= TYPE_PRECISION (type) + || !tree_fits_shwi_p (CHREC_RIGHT (bitwise_scev))) + return def; + + bitwise_res = compute_overall_effect_of_inner_loop (ex_loop, bitwise_scev); + if (!tree_fits_uhwi_p (bitwise_res) + || tree_to_uhwi (bitwise_res) >= TYPE_PRECISION (type)) + return def; + +enum bit_op_kind + { + INDUCTION_BIT_CLEAR, + INDUCTION_BIT_IOR, + INDUCTION_BIT_XOR, + INDUCTION_BIT_RESET, + INDUCTION_ZERO, + INDUCTION_ALL + }; + + enum bit_op_kind induction_kind; + enum tree_code code1 + = gimple_assign_rhs_code (SSA_NAME_DEF_STMT (phidef)); + enum tree_code code2 + = gimple_assign_rhs_code (SSA_NAME_DEF_STMT (match_op[0])); + + /* BIT_CLEAR: A &= ~(1 << bit) + BIT_RESET: A ^= (1 << bit). + BIT_IOR: A |= (1 << bit) + BIT_ZERO: A &= (1 << bit) + BIT_ALL: A |= ~(1 << bit) + BIT_XOR: A ^= ~(1 << bit). + bit is induction variable. */ + switch (code1) + { + case BIT_AND_EXPR: + induction_kind = code2 == BIT_NOT_EXPR + ? INDUCTION_BIT_CLEAR + : INDUCTION_ZERO; + break; + case BIT_IOR_EXPR: + induction_kind = code2 == BIT_NOT_EXPR + ? INDUCTION_ALL + : INDUCTION_BIT_IOR; + break; + case BIT_XOR_EXPR: + induction_kind = code2 == BIT_NOT_EXPR + ? INDUCTION_BIT_XOR + : INDUCTION_BIT_RESET; + break; + /* A ^ ~(1 << bit) is equal to ~(A ^ (1 << bit)). */ + case BIT_NOT_EXPR: + gcc_assert (code2 == BIT_XOR_EXPR); + induction_kind = INDUCTION_BIT_XOR; + break; + default: + gcc_unreachable(); + } + + if (induction_kind == INDUCTION_ZERO) + return build_zero_cst (type); + if (induction_kind == INDUCTION_ALL) + return build_all_ones_cst (type); + + wide_int bits = wi::zero (TYPE_PRECISION (type)); + HOST_WIDE_INT start = tree_to_shwi (CHREC_LEFT (bitwise_scev)); + HOST_WIDE_INT step = tree_to_shwi (CHREC_RIGHT (bitwise_scev)); + unsigned HOST_WIDE_INT num_iter = tree_to_uhwi (niter); + + /* Loop tripcount should be num_iter + 1. */ + for (unsigned i = 0; i != num_iter + 1; i++) + { + bits = wi::bit_or (bits, + wi::lshift (wi::one (TYPE_PRECISION (type)), + start)); + start += step; + } + + bool inverted = false; + switch (induction_kind) + { + case INDUCTION_BIT_CLEAR: + code1 = BIT_AND_EXPR; + inverted = true; + break; + case INDUCTION_BIT_IOR: + code1 = BIT_IOR_EXPR; + break; + case INDUCTION_BIT_RESET: + code1 = BIT_XOR_EXPR; + break; + /* A ^= ~(1 << bit) is special, when loop tripcount is even, + it's equal to A ^= bits, else A ^= ~bits. */ + case INDUCTION_BIT_XOR: + code1 = BIT_XOR_EXPR; + if (num_iter % 2 == 0) + inverted = true; + break; + default: + gcc_unreachable (); + } + + if (inverted) + bits = wi::bit_not (bits); + return fold_build2 (code1, type, inv, wide_int_to_tree (type, bits)); +} + /* Do final value replacement for LOOP, return true if we did anything. */ bool @@ -3519,7 +3677,8 @@ final_value_replacement_loop (class loop *loop) { gphi *phi = psi.phi (); tree rslt = PHI_RESULT (phi); - tree def = PHI_ARG_DEF_FROM_EDGE (phi, exit); + tree phidef = PHI_ARG_DEF_FROM_EDGE (phi, exit); + tree def = phidef; if (virtual_operand_p (def)) { gsi_next (&psi); @@ -3537,6 +3696,23 @@ final_value_replacement_loop (class loop *loop) def = analyze_scalar_evolution_in_loop (ex_loop, loop, def, &folded_casts); def = compute_overall_effect_of_inner_loop (ex_loop, def); + + /* Handle bitwise induction expression. + + .i.e. + for (int i = 0; i != 64; i+=3) + res &= ~(1UL << i); + + RES can't be analyzed out by SCEV because it is not polynomially + expressible, but in fact final value of RES can be replaced by + RES & CONSTANT where CONSTANT all ones with bit {0,3,6,9,... ,63} + being cleared, similar for BIT_IOR_EXPR/BIT_XOR_EXPR. */ + if (tree_fits_uhwi_p (niter) + && tree_to_uhwi (niter) + && tree_to_uhwi (niter) != HOST_WIDE_INT_M1U) + def = analyze_and_compute_bitwise_induction_effect (ex_loop, loop, + phidef, def, niter); + if (!tree_does_not_contain_chrecs (def) || chrec_contains_symbols_defined_in_loop (def, ex_loop->num) /* Moving the computation from the loop may prolong life range