Message ID | 20220106065148.64387-1-hongtao.liu@intel.com |
---|---|
State | New |
Headers | show |
Series | [RTL/fwprop] Allow propagations from inner loop to outer loop. | expand |
On January 6, 2022 7:51:48 AM GMT+01:00, liuhongt <hongtao.liu@intel.com> wrote: >> that's flow_loop_nested_p (loop *outer, loop *inner) which >> is implemented in O(1). Note behavior for outer == inner >> might be different (didn't check your implementation too hard) >> >Thanks, it seems flow_loop_nested_p assumes outer and inner not to be >NULL. So I add some conditions to check NULL which is considered as an outer > loop of any other loop. Huh, loop_father should never be NULL. Maybe when fwprop is run after RTL loop opts you instead want to add a check for current_loops or alternatively initialize loops in fwprop. > > >gcc/ChangeLog: > > PR rtl/103750 > * fwprop.c (forward_propagate_into): Allow propagations from > inner loop to outer loop. > >gcc/testsuite/ChangeLog: > > * g++.target/i386/pr103750-fwprop-1.C: New test. >--- > gcc/fwprop.c | 7 +++-- > .../g++.target/i386/pr103750-fwprop-1.C | 26 +++++++++++++++++++ > 2 files changed, 31 insertions(+), 2 deletions(-) > create mode 100644 gcc/testsuite/g++.target/i386/pr103750-fwprop-1.C > >diff --git a/gcc/fwprop.c b/gcc/fwprop.c >index 2eab4fd4614..af2e9d1c189 100644 >--- a/gcc/fwprop.c >+++ b/gcc/fwprop.c >@@ -866,10 +866,13 @@ forward_propagate_into (use_info *use, bool reg_prop_only = false) > rtx src = SET_SRC (def_set); > > /* Allow propagations into a loop only for reg-to-reg copies, since >- replacing one register by another shouldn't increase the cost. */ >+ replacing one register by another shouldn't increase the cost. >+ Propagations from inner loop to outer loop should be also ok. 
*/ > struct loop *def_loop = def_insn->bb ()->cfg_bb ()->loop_father; > struct loop *use_loop = use->bb ()->cfg_bb ()->loop_father; >- if ((reg_prop_only || def_loop != use_loop) >+ if ((reg_prop_only >+ || (use_loop && def_loop != use_loop >+ &&(!def_loop || !flow_loop_nested_p (use_loop, def_loop)))) > && (!reg_single_def_p (dest) || !reg_single_def_p (src))) > return false; > >diff --git a/gcc/testsuite/g++.target/i386/pr103750-fwprop-1.C b/gcc/testsuite/g++.target/i386/pr103750-fwprop-1.C >new file mode 100644 >index 00000000000..26987d307aa >--- /dev/null >+++ b/gcc/testsuite/g++.target/i386/pr103750-fwprop-1.C >@@ -0,0 +1,26 @@ >+/* PR target/103750. */ >+/* { dg-do compile } */ >+/* { dg-options "-O2 -std=c++1y -march=cannonlake -fdump-rtl-fwprop1" } */ >+/* { dg-final { scan-rtl-dump-not "subreg:HI\[ \\\(\]*reg:SI\[^\n]*\n\[^\n]*UNSPEC_TZCNT" "fwprop1" } } */ >+ >+#include<immintrin.h> >+const char16_t *qustrchr(char16_t *n, char16_t *e, char16_t c) noexcept >+{ >+ __m256i mch256 = _mm256_set1_epi16(c); >+ for ( ; n < e; n += 32) { >+ __m256i data1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n)); >+ __m256i data2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n) + 1); >+ __mmask16 mask1 = _mm256_cmpeq_epu16_mask(data1, mch256); >+ __mmask16 mask2 = _mm256_cmpeq_epu16_mask(data2, mch256); >+ if (_kortestz_mask16_u8(mask1, mask2)) >+ continue; >+ >+ unsigned idx = _tzcnt_u32(mask1); >+ if (mask1 == 0) { >+ idx = __tzcnt_u16(mask2); >+ n += 16; >+ } >+ return n + idx; >+ } >+ return e; >+}
diff --git a/gcc/fwprop.c b/gcc/fwprop.c index 2eab4fd4614..af2e9d1c189 100644 --- a/gcc/fwprop.c +++ b/gcc/fwprop.c @@ -866,10 +866,13 @@ forward_propagate_into (use_info *use, bool reg_prop_only = false) rtx src = SET_SRC (def_set); /* Allow propagations into a loop only for reg-to-reg copies, since - replacing one register by another shouldn't increase the cost. */ + replacing one register by another shouldn't increase the cost. + Propagations from inner loop to outer loop should be also ok. */ struct loop *def_loop = def_insn->bb ()->cfg_bb ()->loop_father; struct loop *use_loop = use->bb ()->cfg_bb ()->loop_father; - if ((reg_prop_only || def_loop != use_loop) + if ((reg_prop_only + || (use_loop && def_loop != use_loop + &&(!def_loop || !flow_loop_nested_p (use_loop, def_loop)))) && (!reg_single_def_p (dest) || !reg_single_def_p (src))) return false; diff --git a/gcc/testsuite/g++.target/i386/pr103750-fwprop-1.C b/gcc/testsuite/g++.target/i386/pr103750-fwprop-1.C new file mode 100644 index 00000000000..26987d307aa --- /dev/null +++ b/gcc/testsuite/g++.target/i386/pr103750-fwprop-1.C @@ -0,0 +1,26 @@ +/* PR target/103750. */ +/* { dg-do compile } */ +/* { dg-options "-O2 -std=c++1y -march=cannonlake -fdump-rtl-fwprop1" } */ +/* { dg-final { scan-rtl-dump-not "subreg:HI\[ \\\(\]*reg:SI\[^\n]*\n\[^\n]*UNSPEC_TZCNT" "fwprop1" } } */ + +#include<immintrin.h> +const char16_t *qustrchr(char16_t *n, char16_t *e, char16_t c) noexcept +{ + __m256i mch256 = _mm256_set1_epi16(c); + for ( ; n < e; n += 32) { + __m256i data1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n)); + __m256i data2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n) + 1); + __mmask16 mask1 = _mm256_cmpeq_epu16_mask(data1, mch256); + __mmask16 mask2 = _mm256_cmpeq_epu16_mask(data2, mch256); + if (_kortestz_mask16_u8(mask1, mask2)) + continue; + + unsigned idx = _tzcnt_u32(mask1); + if (mask1 == 0) { + idx = __tzcnt_u16(mask2); + n += 16; + } + return n + idx; + } + return e; +}