Message ID | 20230509070604.3466556-1-hongtao.liu@intel.com |
---|---|
State | New |
Headers | show |
Series | Detect bswap + rotate for byte permutation in pass_bswap. | expand |
On Tue, May 9, 2023 at 9:06 AM liuhongt via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > The patch doesn't handle: > 1. cast64_to_32, > 2. memory source with rsize < range. > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > Ok for trunk? OK and sorry for the delay. Richard. > gcc/ChangeLog: > > PR middle-end/108938 > * gimple-ssa-store-merging.cc (is_bswap_or_nop_p): New > function, cut from original find_bswap_or_nop function. > (find_bswap_or_nop): Add a new parameter, detect bswap + > rotate and save rotate result in the new parameter. > (bswap_replace): Add a new parameter to indicate rotate and > generate rotate stmt if needed. > (maybe_optimize_vector_constructor): Adjust for new rotate > parameter in the upper 2 functions. > (pass_optimize_bswap::execute): Ditto. > (imm_store_chain_info::output_merged_store): Ditto. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/pr108938-1.c: New test. > * gcc.target/i386/pr108938-2.c: New test. > * gcc.target/i386/pr108938-3.c: New test. > * gcc.target/i386/pr108938-load-1.c: New test. > * gcc.target/i386/pr108938-load-2.c: New test. > --- > gcc/gimple-ssa-store-merging.cc | 130 ++++++++++++++---- > gcc/testsuite/gcc.target/i386/pr108938-1.c | 79 +++++++++++ > gcc/testsuite/gcc.target/i386/pr108938-2.c | 35 +++++ > gcc/testsuite/gcc.target/i386/pr108938-3.c | 26 ++++ > .../gcc.target/i386/pr108938-load-1.c | 69 ++++++++++ > .../gcc.target/i386/pr108938-load-2.c | 30 ++++ > 6 files changed, 342 insertions(+), 27 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-3.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-load-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-load-2.c > > diff --git a/gcc/gimple-ssa-store-merging.cc b/gcc/gimple-ssa-store-merging.cc > index df7afd2fd78..9cb574fa315 100644 > --- a/gcc/gimple-ssa-store-merging.cc > +++ b/gcc/gimple-ssa-store-merging.cc > @@ -893,6 +893,37 @@ find_bswap_or_nop_finalize (struct symbolic_number *n, uint64_t *cmpxchg, > n->range *= BITS_PER_UNIT; > } > > +/* Helper function for find_bswap_or_nop, > + Return true if N is a swap or nop with MASK. */ > +static bool > +is_bswap_or_nop_p (uint64_t n, uint64_t cmpxchg, > + uint64_t cmpnop, uint64_t* mask, > + bool* bswap) > +{ > + *mask = ~(uint64_t) 0; > + if (n == cmpnop) > + *bswap = false; > + else if (n == cmpxchg) > + *bswap = true; > + else > + { > + int set = 0; > + for (uint64_t msk = MARKER_MASK; msk; msk <<= BITS_PER_MARKER) > + if ((n & msk) == 0) > + *mask &= ~msk; > + else if ((n & msk) == (cmpxchg & msk)) > + set++; > + else > + return false; > + > + if (set < 2) > + return false; > + *bswap = true; > + } > + return true; > +} > + > + > /* Check if STMT completes a bswap implementation or a read in a given > endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP > accordingly. It also sets N to represent the kind of operations > @@ -903,7 +934,7 @@ find_bswap_or_nop_finalize (struct symbolic_number *n, uint64_t *cmpxchg, > > gimple * > find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap, > - bool *cast64_to_32, uint64_t *mask) > + bool *cast64_to_32, uint64_t *mask, uint64_t* l_rotate) > { > tree type_size = TYPE_SIZE_UNIT (TREE_TYPE (gimple_get_lhs (stmt))); > if (!tree_fits_uhwi_p (type_size)) > @@ -984,29 +1015,57 @@ find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap, > } > > uint64_t cmpxchg, cmpnop; > + uint64_t orig_range = n->range * BITS_PER_UNIT; > find_bswap_or_nop_finalize (n, &cmpxchg, &cmpnop, cast64_to_32); > > /* A complete byte swap should make the symbolic number to start with > the largest digit in the highest order byte. Unchanged symbolic > number indicates a read with same endianness as target architecture. */ > - *mask = ~(uint64_t) 0; > - if (n->n == cmpnop) > - *bswap = false; > - else if (n->n == cmpxchg) > - *bswap = true; > - else > + *l_rotate = 0; > + uint64_t tmp_n = n->n; > + if (!is_bswap_or_nop_p (tmp_n, cmpxchg, cmpnop, mask, bswap)) > { > - int set = 0; > - for (uint64_t msk = MARKER_MASK; msk; msk <<= BITS_PER_MARKER) > - if ((n->n & msk) == 0) > - *mask &= ~msk; > - else if ((n->n & msk) == (cmpxchg & msk)) > - set++; > - else > - return NULL; > - if (set < 2) > + /* Try bswap + lrotate. */ > + /* TODO, handle cast64_to_32 and big/litte_endian memory > + source when rsize < range. */ > + if (n->range == orig_range > + && ((orig_range == 32 > + && optab_handler (rotl_optab, SImode) != CODE_FOR_nothing) > + || (orig_range == 64 > + && optab_handler (rotl_optab, DImode) != CODE_FOR_nothing)) > + && (tmp_n & MARKER_MASK) < orig_range / BITS_PER_UNIT) > + { > + uint64_t range = (orig_range / BITS_PER_UNIT) * BITS_PER_MARKER; > + uint64_t count = (tmp_n & MARKER_MASK) * BITS_PER_MARKER; > + /* .i.e. hanlde 0x203040506070800 when lower byte is zero. */ > + if (!count) > + { > + for (uint64_t i = 1; i != range / BITS_PER_MARKER; i++) > + { > + count = (tmp_n >> i * BITS_PER_MARKER) & MARKER_MASK; > + if (count) > + { > + /* Count should be meaningful not 0xff. */ > + if (count <= range / BITS_PER_MARKER) > + { > + count = (count + i) * BITS_PER_MARKER % range; > + break; > + } > + else > + return NULL; > + } > + } > + } > + tmp_n = tmp_n >> count | tmp_n << (range - count); > + if (orig_range == 32) > + tmp_n &= (1ULL << 32) - 1; > + if (!is_bswap_or_nop_p (tmp_n, cmpxchg, cmpnop, mask, bswap)) > + return NULL; > + *l_rotate = count / BITS_PER_MARKER * BITS_PER_UNIT; > + gcc_assert (*bswap); > + } > + else > return NULL; > - *bswap = true; > } > > /* Useless bit manipulation performed by code. */ > @@ -1099,10 +1158,10 @@ bswap_view_convert (gimple_stmt_iterator *gsi, tree type, tree val, > tree > bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl, > tree bswap_type, tree load_type, struct symbolic_number *n, > - bool bswap, uint64_t mask) > + bool bswap, uint64_t mask, uint64_t l_rotate) > { > tree src, tmp, tgt = NULL_TREE; > - gimple *bswap_stmt, *mask_stmt = NULL; > + gimple *bswap_stmt, *mask_stmt = NULL, *rotl_stmt = NULL; > tree_code conv_code = NOP_EXPR; > > gimple *cur_stmt = gsi_stmt (gsi); > @@ -1332,6 +1391,16 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl, > tmp = tgt; > } > > + if (l_rotate) > + { > + tree m = build_int_cst (bswap_type, l_rotate); > + tmp = make_temp_ssa_name (bswap_type, NULL, > + mask_stmt ? "bswapmaskdst" : "bswapdst"); > + gimple_set_lhs (mask_stmt ? mask_stmt : bswap_stmt, tmp); > + rotl_stmt = gimple_build_assign (tgt, LROTATE_EXPR, tmp, m); > + tmp = tgt; > + } > + > /* Convert the result if necessary. */ > if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type)) > { > @@ -1344,7 +1413,8 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl, > gsi_insert_after (&gsi2, convert_stmt, GSI_SAME_STMT); > } > > - gimple_set_lhs (mask_stmt ? mask_stmt : bswap_stmt, tmp); > + gimple_set_lhs (rotl_stmt ? rotl_stmt > + : mask_stmt ? mask_stmt : bswap_stmt, tmp); > > if (dump_file) > { > @@ -1361,6 +1431,8 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl, > > if (cur_stmt) > { > + if (rotl_stmt) > + gsi_insert_after (&gsi, rotl_stmt, GSI_SAME_STMT); > if (mask_stmt) > gsi_insert_after (&gsi, mask_stmt, GSI_SAME_STMT); > gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT); > @@ -1371,6 +1443,8 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl, > gsi_insert_before (&gsi, bswap_stmt, GSI_SAME_STMT); > if (mask_stmt) > gsi_insert_before (&gsi, mask_stmt, GSI_SAME_STMT); > + if (rotl_stmt) > + gsi_insert_after (&gsi, rotl_stmt, GSI_SAME_STMT); > } > return tgt; > } > @@ -1432,9 +1506,9 @@ maybe_optimize_vector_constructor (gimple *cur_stmt) > } > > bool cast64_to_32; > - uint64_t mask; > + uint64_t mask, l_rotate; > gimple *ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap, > - &cast64_to_32, &mask); > + &cast64_to_32, &mask, &l_rotate); > if (!ins_stmt > || n.range != (unsigned HOST_WIDE_INT) sz > || cast64_to_32 > @@ -1447,7 +1521,8 @@ maybe_optimize_vector_constructor (gimple *cur_stmt) > memset (&nop_stats, 0, sizeof (nop_stats)); > memset (&bswap_stats, 0, sizeof (bswap_stats)); > return bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl, > - bswap_type, load_type, &n, bswap, mask) != NULL_TREE; > + bswap_type, load_type, &n, bswap, mask, > + l_rotate) != NULL_TREE; > } > > /* Find manual byte swap implementations as well as load in a given > @@ -1502,7 +1577,7 @@ pass_optimize_bswap::execute (function *fun) > enum tree_code code; > struct symbolic_number n; > bool bswap, cast64_to_32; > - uint64_t mask; > + uint64_t mask, l_rotate; > > /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt > might be moved to a different basic block by bswap_replace and gsi > @@ -1542,7 +1617,7 @@ pass_optimize_bswap::execute (function *fun) > } > > ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap, > - &cast64_to_32, &mask); > + &cast64_to_32, &mask, &l_rotate); > > if (!ins_stmt) > continue; > @@ -1579,7 +1654,8 @@ pass_optimize_bswap::execute (function *fun) > continue; > > if (bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl, > - bswap_type, load_type, &n, bswap, mask)) > + bswap_type, load_type, &n, bswap, mask, > + l_rotate)) > changed = true; > } > } > @@ -4271,7 +4347,7 @@ imm_store_chain_info::output_merged_store (merged_store_group *group) > } > bswap_res = bswap_replace (gsi_start (seq), ins_stmt, fndecl, > bswap_type, load_type, n, bswap, > - ~(uint64_t) 0); > + ~(uint64_t) 0, 0); > gcc_assert (bswap_res); > } > > diff --git a/gcc/testsuite/gcc.target/i386/pr108938-1.c b/gcc/testsuite/gcc.target/i386/pr108938-1.c > new file mode 100644 > index 00000000000..67f245d1441 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr108938-1.c > @@ -0,0 +1,79 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mno-movbe" } */ > +/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 6 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 9 { target ia32 } } } */ > + > +#include<stdint.h> > + > +uint64_t > +__attribute__((noipa)) > +swap_rotate_64 (uint64_t x) > +{ > + return ((((uint64_t)(x) & (uint64_t)0x00000000000000ffULL) << 0) | > + (((uint64_t)(x) & (uint64_t)0x000000000000ff00ULL) << 48) | > + (((uint64_t)(x) & (uint64_t)0x0000000000ff0000ULL) << 32) | > + (((uint64_t)(x) & (uint64_t)0x00000000ff000000ULL) << 16) | > + (((uint64_t)(x) & (uint64_t)0x000000ff00000000ULL) >> 0) | > + (((uint64_t)(x) & (uint64_t)0x0000ff0000000000ULL) >> 16) | > + (((uint64_t)(x) & (uint64_t)0x00ff000000000000ULL) >> 32) | > + (((uint64_t)(x) & (uint64_t)0xff00000000000000ULL) >> 48)); > +} > + > +uint64_t > +__attribute__((noipa)) > +swap_rotate_64_mask_1 (uint64_t x) > +{ > + return ((((uint64_t)(0) & (uint64_t)0x00000000000000ffULL) << 0) | > + (((uint64_t)(x) & (uint64_t)0x000000000000ff00ULL) << 48) | > + (((uint64_t)(x) & (uint64_t)0x0000000000ff0000ULL) << 32) | > + (((uint64_t)(x) & (uint64_t)0x00000000ff000000ULL) << 16) | > + (((uint64_t)(x) & (uint64_t)0x000000ff00000000ULL) >> 0) | > + (((uint64_t)(x) & (uint64_t)0x0000ff0000000000ULL) >> 16) | > + (((uint64_t)(x) & (uint64_t)0x00ff000000000000ULL) >> 32) | > + (((uint64_t)(x) & (uint64_t)0xff00000000000000ULL) >> 48)); > +} > + > +uint64_t > +__attribute__((noipa)) > +swap_rotate_64_mask_2 (uint64_t x) > +{ > + return ((((uint64_t)(x) & (uint64_t)0x00000000000000ffULL) << 0) | > + (((uint64_t)(x) & (uint64_t)0x000000000000ff00ULL) << 48) | > + (((uint64_t)(x) & (uint64_t)0x0000000000ff0000ULL) << 32) | > + (((uint64_t)(x) & (uint64_t)0x00000000ff000000ULL) << 16) | > + (((uint64_t)(x) & (uint64_t)0x000000ff00000000ULL) >> 0) | > + (((uint64_t)(x) & (uint64_t)0x0000ff0000000000ULL) >> 16) | > + (((uint64_t)(x) & (uint64_t)0x00ff000000000000ULL) >> 32) | > + (((uint64_t)(0) & (uint64_t)0xff00000000000000ULL) >> 48)); > +} > + > + > +uint32_t > +__attribute__((noipa)) > +swap_rotate_32 (uint32_t x) > +{ > + return ((((uint32_t)(x) & (uint32_t)0x00000000000000ffULL) << 8) | > + (((uint32_t)(x) & (uint32_t)0x000000000000ff00ULL) >> 8) | > + (((uint32_t)(x) & (uint32_t)0x0000000000ff0000ULL) << 8) | > + (((uint32_t)(x) & (uint32_t)0x00000000ff000000ULL) >> 8)); > +} > + > +uint32_t > +__attribute__((noipa)) > +swap_rotate_32_mask_1 (uint32_t x) > +{ > + return ((((uint32_t)(0) & (uint32_t)0x00000000000000ffULL) << 8) | > + (((uint32_t)(x) & (uint32_t)0x000000000000ff00ULL) >> 8) | > + (((uint32_t)(x) & (uint32_t)0x0000000000ff0000ULL) << 8) | > + (((uint32_t)(x) & (uint32_t)0x00000000ff000000ULL) >> 8)); > +} > + > +uint32_t > +__attribute__((noipa)) > +swap_rotate_32_mask_2 (uint32_t x) > +{ > + return ((((uint32_t)(x) & (uint32_t)0x00000000000000ffULL) << 8) | > + (((uint32_t)(0) & (uint32_t)0x000000000000ff00ULL) >> 8) | > + (((uint32_t)(x) & (uint32_t)0x0000000000ff0000ULL) << 8) | > + (((uint32_t)(x) & (uint32_t)0x00000000ff000000ULL) >> 8)); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr108938-2.c b/gcc/testsuite/gcc.target/i386/pr108938-2.c > new file mode 100644 > index 00000000000..47a2c89f1c4 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr108938-2.c > @@ -0,0 +1,35 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2" } */ > + > +#include "pr108938-1.c" > + > +int main () > +{ > + uint64_t a = 0x0807060504030201ULL; > + uint64_t res = swap_rotate_64 (a); > + if (res != 0x0203040506070801ULL) > + __builtin_abort (); > + > + res = swap_rotate_64_mask_1 (a); > + if (res != 0x0203040506070800ULL) > + __builtin_abort (); > + > + res = swap_rotate_64_mask_2 (a); > + if (res != 0x0203040506070001ULL) > + __builtin_abort (); > + > + uint32_t b = 0x04030201; > + uint32_t res2 = swap_rotate_32 (b); > + if (res2 != 0x03040102) > + __builtin_abort (); > + > + res2 = swap_rotate_32_mask_1 (b); > + if (res2 != 0x03040002) > + __builtin_abort (); > + > + res2 = swap_rotate_32_mask_2 (b); > + if (res2 != 0x03040100) > + __builtin_abort (); > + > + return 0; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr108938-3.c b/gcc/testsuite/gcc.target/i386/pr108938-3.c > new file mode 100644 > index 00000000000..32ac544c7ed > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr108938-3.c > @@ -0,0 +1,26 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -ftree-vectorize -mno-movbe" } */ > +/* { dg-final { scan-assembler-times "bswap\[\t ]+" 2 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "bswap\[\t ]+" 3 { target ia32 } } } */ > + > +void > +foo1 (char* a, unsigned int* __restrict b) > +{ > + a[0] = b[0] >> 24; > + a[1] = b[0] >> 16; > + a[2] = b[0] >> 8; > + a[3] = b[0]; > + a[4] = b[1] >> 24; > + a[5] = b[1] >> 16; > + a[6] = b[1] >> 8; > + a[7] = b[1]; > +} > + > +void > +foo2 (char* a, short* __restrict b) > +{ > + a[0] = b[0] >> 8; > + a[1] = b[0]; > + a[2] = b[1] >> 8; > + a[3] = b[1]; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr108938-load-1.c b/gcc/testsuite/gcc.target/i386/pr108938-load-1.c > new file mode 100644 > index 00000000000..50d3a505c81 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr108938-load-1.c > @@ -0,0 +1,69 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mno-movbe" } */ > +/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 5 { target { ! ia32 } } } } */ > +/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 8 { target ia32 } } } */ > + > +#include<stdint.h> > + > +uint64_t > +__attribute__((noipa)) > +swap_rotate_64 (unsigned char* x) > +{ > + return ((uint64_t)(x[0]) << 0 | > + (uint64_t)(x[1]) << 56 | > + (uint64_t)(x[2]) << 48 | > + (uint64_t)(x[3]) << 40 | > + (uint64_t)(x[4]) << 32 | > + (uint64_t)(x[5]) << 24 | > + (uint64_t)(x[6]) << 16 | > + (uint64_t)(x[7]) << 8); > +} > + > +uint64_t > +__attribute__((noipa)) > +swap_rotate_64_mask_1 (unsigned char* x) > +{ > + return ((uint64_t)(x[0]) << 24 | > + (uint64_t)(x[1]) << 16 | > + (uint64_t)(x[2]) << 8 | > + (uint64_t)(0) << 0 | > + (uint64_t)(x[4]) << 56 | > + (uint64_t)(x[5]) << 48 | > + (uint64_t)(x[6]) << 40 | > + (uint64_t)(x[7]) << 32); > +} > + > +uint64_t > +__attribute__((noipa)) > +swap_rotate_64_mask_2 (unsigned char* x) > +{ > + return ((uint64_t)(x[0]) << 0 | > + (uint64_t)(x[1]) << 56 | > + (uint64_t)(x[2]) << 48 | > + (uint64_t)(0) << 40 | > + (uint64_t)(x[4]) << 32 | > + (uint64_t)(x[5]) << 24 | > + (uint64_t)(x[6]) << 16 | > + (uint64_t)(x[7]) << 8); > +} > + > + > +uint32_t > +__attribute__((noipa)) > +swap_rotate_32 (unsigned char* x) > +{ > + return ((uint64_t)(x[0]) << 8 | > + (uint64_t)(x[1]) << 0 | > + (uint64_t)(x[2]) << 24 | > + (uint64_t)(x[3]) << 16); > +} > + > +uint32_t > +__attribute__((noipa)) > +swap_rotate_32_mask_1 (unsigned char* x) > +{ > + return ((uint64_t)(x[0]) << 8 | > + (uint64_t)(0) << 0 | > + (uint64_t)(x[2]) << 24 | > + (uint64_t)(x[3]) << 16); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr108938-load-2.c b/gcc/testsuite/gcc.target/i386/pr108938-load-2.c > new file mode 100644 > index 00000000000..51a8102f995 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr108938-load-2.c > @@ -0,0 +1,30 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2" } */ > + > +#include "pr108938-load-1.c" > + > +int main () > +{ > + unsigned char a[8] = {1, 2, 3, 4, 5, 6, 7, 8}; > + uint64_t res = swap_rotate_64 (a); > + if (res != 0x0203040506070801ULL) > + __builtin_abort (); > + > + res = swap_rotate_64_mask_1 (a); > + if (res != 0x0506070801020300ULL) > + __builtin_abort (); > + > + res = swap_rotate_64_mask_2 (a); > + if (res != 0x0203000506070801ULL) > + __builtin_abort (); > + > + uint32_t res2 = swap_rotate_32 (a); > + if (res2 != 0x03040102) > + __builtin_abort (); > + > + res2 = swap_rotate_32_mask_1 (a); > + if (res2 != 0x03040100) > + __builtin_abort (); > + > + return 0; > +} > -- > 2.39.1.388.g2fc9e9ca3c >
diff --git a/gcc/gimple-ssa-store-merging.cc b/gcc/gimple-ssa-store-merging.cc index df7afd2fd78..9cb574fa315 100644 --- a/gcc/gimple-ssa-store-merging.cc +++ b/gcc/gimple-ssa-store-merging.cc @@ -893,6 +893,37 @@ find_bswap_or_nop_finalize (struct symbolic_number *n, uint64_t *cmpxchg, n->range *= BITS_PER_UNIT; } +/* Helper function for find_bswap_or_nop, + Return true if N is a swap or nop with MASK. */ +static bool +is_bswap_or_nop_p (uint64_t n, uint64_t cmpxchg, + uint64_t cmpnop, uint64_t* mask, + bool* bswap) +{ + *mask = ~(uint64_t) 0; + if (n == cmpnop) + *bswap = false; + else if (n == cmpxchg) + *bswap = true; + else + { + int set = 0; + for (uint64_t msk = MARKER_MASK; msk; msk <<= BITS_PER_MARKER) + if ((n & msk) == 0) + *mask &= ~msk; + else if ((n & msk) == (cmpxchg & msk)) + set++; + else + return false; + + if (set < 2) + return false; + *bswap = true; + } + return true; +} + + /* Check if STMT completes a bswap implementation or a read in a given endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP accordingly. It also sets N to represent the kind of operations @@ -903,7 +934,7 @@ find_bswap_or_nop_finalize (struct symbolic_number *n, uint64_t *cmpxchg, gimple * find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap, - bool *cast64_to_32, uint64_t *mask) + bool *cast64_to_32, uint64_t *mask, uint64_t* l_rotate) { tree type_size = TYPE_SIZE_UNIT (TREE_TYPE (gimple_get_lhs (stmt))); if (!tree_fits_uhwi_p (type_size)) @@ -984,29 +1015,57 @@ find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap, } uint64_t cmpxchg, cmpnop; + uint64_t orig_range = n->range * BITS_PER_UNIT; find_bswap_or_nop_finalize (n, &cmpxchg, &cmpnop, cast64_to_32); /* A complete byte swap should make the symbolic number to start with the largest digit in the highest order byte. Unchanged symbolic number indicates a read with same endianness as target architecture. */ - *mask = ~(uint64_t) 0; - if (n->n == cmpnop) - *bswap = false; - else if (n->n == cmpxchg) - *bswap = true; - else + *l_rotate = 0; + uint64_t tmp_n = n->n; + if (!is_bswap_or_nop_p (tmp_n, cmpxchg, cmpnop, mask, bswap)) { - int set = 0; - for (uint64_t msk = MARKER_MASK; msk; msk <<= BITS_PER_MARKER) - if ((n->n & msk) == 0) - *mask &= ~msk; - else if ((n->n & msk) == (cmpxchg & msk)) - set++; - else - return NULL; - if (set < 2) + /* Try bswap + lrotate. */ + /* TODO, handle cast64_to_32 and big/litte_endian memory + source when rsize < range. */ + if (n->range == orig_range + && ((orig_range == 32 + && optab_handler (rotl_optab, SImode) != CODE_FOR_nothing) + || (orig_range == 64 + && optab_handler (rotl_optab, DImode) != CODE_FOR_nothing)) + && (tmp_n & MARKER_MASK) < orig_range / BITS_PER_UNIT) + { + uint64_t range = (orig_range / BITS_PER_UNIT) * BITS_PER_MARKER; + uint64_t count = (tmp_n & MARKER_MASK) * BITS_PER_MARKER; + /* .i.e. hanlde 0x203040506070800 when lower byte is zero. */ + if (!count) + { + for (uint64_t i = 1; i != range / BITS_PER_MARKER; i++) + { + count = (tmp_n >> i * BITS_PER_MARKER) & MARKER_MASK; + if (count) + { + /* Count should be meaningful not 0xff. */ + if (count <= range / BITS_PER_MARKER) + { + count = (count + i) * BITS_PER_MARKER % range; + break; + } + else + return NULL; + } + } + } + tmp_n = tmp_n >> count | tmp_n << (range - count); + if (orig_range == 32) + tmp_n &= (1ULL << 32) - 1; + if (!is_bswap_or_nop_p (tmp_n, cmpxchg, cmpnop, mask, bswap)) + return NULL; + *l_rotate = count / BITS_PER_MARKER * BITS_PER_UNIT; + gcc_assert (*bswap); + } + else return NULL; - *bswap = true; } /* Useless bit manipulation performed by code. */ @@ -1099,10 +1158,10 @@ bswap_view_convert (gimple_stmt_iterator *gsi, tree type, tree val, tree bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl, tree bswap_type, tree load_type, struct symbolic_number *n, - bool bswap, uint64_t mask) + bool bswap, uint64_t mask, uint64_t l_rotate) { tree src, tmp, tgt = NULL_TREE; - gimple *bswap_stmt, *mask_stmt = NULL; + gimple *bswap_stmt, *mask_stmt = NULL, *rotl_stmt = NULL; tree_code conv_code = NOP_EXPR; gimple *cur_stmt = gsi_stmt (gsi); @@ -1332,6 +1391,16 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl, tmp = tgt; } + if (l_rotate) + { + tree m = build_int_cst (bswap_type, l_rotate); + tmp = make_temp_ssa_name (bswap_type, NULL, + mask_stmt ? "bswapmaskdst" : "bswapdst"); + gimple_set_lhs (mask_stmt ? mask_stmt : bswap_stmt, tmp); + rotl_stmt = gimple_build_assign (tgt, LROTATE_EXPR, tmp, m); + tmp = tgt; + } + /* Convert the result if necessary. */ if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type)) { @@ -1344,7 +1413,8 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl, gsi_insert_after (&gsi2, convert_stmt, GSI_SAME_STMT); } - gimple_set_lhs (mask_stmt ? mask_stmt : bswap_stmt, tmp); + gimple_set_lhs (rotl_stmt ? rotl_stmt + : mask_stmt ? mask_stmt : bswap_stmt, tmp); if (dump_file) { @@ -1361,6 +1431,8 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl, if (cur_stmt) { + if (rotl_stmt) + gsi_insert_after (&gsi, rotl_stmt, GSI_SAME_STMT); if (mask_stmt) gsi_insert_after (&gsi, mask_stmt, GSI_SAME_STMT); gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT); @@ -1371,6 +1443,8 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl, gsi_insert_before (&gsi, bswap_stmt, GSI_SAME_STMT); if (mask_stmt) gsi_insert_before (&gsi, mask_stmt, GSI_SAME_STMT); + if (rotl_stmt) + gsi_insert_after (&gsi, rotl_stmt, GSI_SAME_STMT); } return tgt; } @@ -1432,9 +1506,9 @@ maybe_optimize_vector_constructor (gimple *cur_stmt) } bool cast64_to_32; - uint64_t mask; + uint64_t mask, l_rotate; gimple *ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap, - &cast64_to_32, &mask); + &cast64_to_32, &mask, &l_rotate); if (!ins_stmt || n.range != (unsigned HOST_WIDE_INT) sz || cast64_to_32 @@ -1447,7 +1521,8 @@ maybe_optimize_vector_constructor (gimple *cur_stmt) memset (&nop_stats, 0, sizeof (nop_stats)); memset (&bswap_stats, 0, sizeof (bswap_stats)); return bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl, - bswap_type, load_type, &n, bswap, mask) != NULL_TREE; + bswap_type, load_type, &n, bswap, mask, + l_rotate) != NULL_TREE; } /* Find manual byte swap implementations as well as load in a given @@ -1502,7 +1577,7 @@ pass_optimize_bswap::execute (function *fun) enum tree_code code; struct symbolic_number n; bool bswap, cast64_to_32; - uint64_t mask; + uint64_t mask, l_rotate; /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt might be moved to a different basic block by bswap_replace and gsi @@ -1542,7 +1617,7 @@ pass_optimize_bswap::execute (function *fun) } ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap, - &cast64_to_32, &mask); + &cast64_to_32, &mask, &l_rotate); if (!ins_stmt) continue; @@ -1579,7 +1654,8 @@ pass_optimize_bswap::execute (function *fun) continue; if (bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl, - bswap_type, load_type, &n, bswap, mask)) + bswap_type, load_type, &n, bswap, mask, + l_rotate)) changed = true; } } @@ -4271,7 +4347,7 @@ imm_store_chain_info::output_merged_store (merged_store_group *group) } bswap_res = bswap_replace (gsi_start (seq), ins_stmt, fndecl, bswap_type, load_type, n, bswap, - ~(uint64_t) 0); + ~(uint64_t) 0, 0); gcc_assert (bswap_res); } diff --git a/gcc/testsuite/gcc.target/i386/pr108938-1.c b/gcc/testsuite/gcc.target/i386/pr108938-1.c new file mode 100644 index 00000000000..67f245d1441 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr108938-1.c @@ -0,0 +1,79 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-movbe" } */ +/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 6 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 9 { target ia32 } } } */ + +#include<stdint.h> + +uint64_t +__attribute__((noipa)) +swap_rotate_64 (uint64_t x) +{ + return ((((uint64_t)(x) & (uint64_t)0x00000000000000ffULL) << 0) | + (((uint64_t)(x) & (uint64_t)0x000000000000ff00ULL) << 48) | + (((uint64_t)(x) & (uint64_t)0x0000000000ff0000ULL) << 32) | + (((uint64_t)(x) & (uint64_t)0x00000000ff000000ULL) << 16) | + (((uint64_t)(x) & (uint64_t)0x000000ff00000000ULL) >> 0) | + (((uint64_t)(x) & (uint64_t)0x0000ff0000000000ULL) >> 16) | + (((uint64_t)(x) & (uint64_t)0x00ff000000000000ULL) >> 32) | + (((uint64_t)(x) & (uint64_t)0xff00000000000000ULL) >> 48)); +} + +uint64_t +__attribute__((noipa)) +swap_rotate_64_mask_1 (uint64_t x) +{ + return ((((uint64_t)(0) & (uint64_t)0x00000000000000ffULL) << 0) | + (((uint64_t)(x) & (uint64_t)0x000000000000ff00ULL) << 48) | + (((uint64_t)(x) & (uint64_t)0x0000000000ff0000ULL) << 32) | + (((uint64_t)(x) & (uint64_t)0x00000000ff000000ULL) << 16) | + (((uint64_t)(x) & (uint64_t)0x000000ff00000000ULL) >> 0) | + (((uint64_t)(x) & (uint64_t)0x0000ff0000000000ULL) >> 16) | + (((uint64_t)(x) & (uint64_t)0x00ff000000000000ULL) >> 32) | + (((uint64_t)(x) & (uint64_t)0xff00000000000000ULL) >> 48)); +} + +uint64_t +__attribute__((noipa)) +swap_rotate_64_mask_2 (uint64_t x) +{ + return ((((uint64_t)(x) & (uint64_t)0x00000000000000ffULL) << 0) | + (((uint64_t)(x) & (uint64_t)0x000000000000ff00ULL) << 48) | + (((uint64_t)(x) & (uint64_t)0x0000000000ff0000ULL) << 32) | + (((uint64_t)(x) & (uint64_t)0x00000000ff000000ULL) << 16) | + (((uint64_t)(x) & (uint64_t)0x000000ff00000000ULL) >> 0) | + (((uint64_t)(x) & (uint64_t)0x0000ff0000000000ULL) >> 16) | + (((uint64_t)(x) & (uint64_t)0x00ff000000000000ULL) >> 32) | + (((uint64_t)(0) & (uint64_t)0xff00000000000000ULL) >> 48)); +} + + +uint32_t +__attribute__((noipa)) +swap_rotate_32 (uint32_t x) +{ + return ((((uint32_t)(x) & (uint32_t)0x00000000000000ffULL) << 8) | + (((uint32_t)(x) & (uint32_t)0x000000000000ff00ULL) >> 8) | + (((uint32_t)(x) & (uint32_t)0x0000000000ff0000ULL) << 8) | + (((uint32_t)(x) & (uint32_t)0x00000000ff000000ULL) >> 8)); +} + +uint32_t +__attribute__((noipa)) +swap_rotate_32_mask_1 (uint32_t x) +{ + return ((((uint32_t)(0) & (uint32_t)0x00000000000000ffULL) << 8) | + (((uint32_t)(x) & (uint32_t)0x000000000000ff00ULL) >> 8) | + (((uint32_t)(x) & (uint32_t)0x0000000000ff0000ULL) << 8) | + (((uint32_t)(x) & (uint32_t)0x00000000ff000000ULL) >> 8)); +} + +uint32_t +__attribute__((noipa)) +swap_rotate_32_mask_2 (uint32_t x) +{ + return ((((uint32_t)(x) & (uint32_t)0x00000000000000ffULL) << 8) | + (((uint32_t)(0) & (uint32_t)0x000000000000ff00ULL) >> 8) | + (((uint32_t)(x) & (uint32_t)0x0000000000ff0000ULL) << 8) | + (((uint32_t)(x) & (uint32_t)0x00000000ff000000ULL) >> 8)); +} diff --git a/gcc/testsuite/gcc.target/i386/pr108938-2.c b/gcc/testsuite/gcc.target/i386/pr108938-2.c new file mode 100644 index 00000000000..47a2c89f1c4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr108938-2.c @@ -0,0 +1,35 @@ +/* { dg-do run } */ +/* { dg-options "-O2" } */ + +#include "pr108938-1.c" + +int main () +{ + uint64_t a = 0x0807060504030201ULL; + uint64_t res = swap_rotate_64 (a); + if (res != 0x0203040506070801ULL) + __builtin_abort (); + + res = swap_rotate_64_mask_1 (a); + if (res != 0x0203040506070800ULL) + __builtin_abort (); + + res = swap_rotate_64_mask_2 (a); + if (res != 0x0203040506070001ULL) + __builtin_abort (); + + uint32_t b = 0x04030201; + uint32_t res2 = swap_rotate_32 (b); + if (res2 != 0x03040102) + __builtin_abort (); + + res2 = swap_rotate_32_mask_1 (b); + if (res2 != 0x03040002) + __builtin_abort (); + + res2 = swap_rotate_32_mask_2 (b); + if (res2 != 0x03040100) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/pr108938-3.c b/gcc/testsuite/gcc.target/i386/pr108938-3.c new file mode 100644 index 00000000000..32ac544c7ed --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr108938-3.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -mno-movbe" } */ +/* { dg-final { scan-assembler-times "bswap\[\t ]+" 2 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "bswap\[\t ]+" 3 { target ia32 } } } */ + +void +foo1 (char* a, unsigned int* __restrict b) +{ + a[0] = b[0] >> 24; + a[1] = b[0] >> 16; + a[2] = b[0] >> 8; + a[3] = b[0]; + a[4] = b[1] >> 24; + a[5] = b[1] >> 16; + a[6] = b[1] >> 8; + a[7] = b[1]; +} + +void +foo2 (char* a, short* __restrict b) +{ + a[0] = b[0] >> 8; + a[1] = b[0]; + a[2] = b[1] >> 8; + a[3] = b[1]; +} diff --git a/gcc/testsuite/gcc.target/i386/pr108938-load-1.c b/gcc/testsuite/gcc.target/i386/pr108938-load-1.c new file mode 100644 index 00000000000..50d3a505c81 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr108938-load-1.c @@ -0,0 +1,69 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-movbe" } */ +/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 5 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 8 { target ia32 } } } */ + +#include<stdint.h> + +uint64_t +__attribute__((noipa)) +swap_rotate_64 (unsigned char* x) +{ + return ((uint64_t)(x[0]) << 0 | + (uint64_t)(x[1]) << 56 | + (uint64_t)(x[2]) << 48 | + (uint64_t)(x[3]) << 40 | + (uint64_t)(x[4]) << 32 | + (uint64_t)(x[5]) << 24 | + (uint64_t)(x[6]) << 16 | + (uint64_t)(x[7]) << 8); +} + +uint64_t +__attribute__((noipa)) +swap_rotate_64_mask_1 (unsigned char* x) +{ + return ((uint64_t)(x[0]) << 24 | + (uint64_t)(x[1]) << 16 | + (uint64_t)(x[2]) << 8 | + (uint64_t)(0) << 0 | + (uint64_t)(x[4]) << 56 | + (uint64_t)(x[5]) << 48 | + (uint64_t)(x[6]) << 40 | + (uint64_t)(x[7]) << 32); +} + +uint64_t +__attribute__((noipa)) +swap_rotate_64_mask_2 (unsigned char* x) +{ + return ((uint64_t)(x[0]) << 0 | + (uint64_t)(x[1]) << 56 | + (uint64_t)(x[2]) << 48 | + (uint64_t)(0) << 40 | + (uint64_t)(x[4]) << 32 | + (uint64_t)(x[5]) << 24 | + (uint64_t)(x[6]) << 16 | + (uint64_t)(x[7]) << 8); +} + + +uint32_t +__attribute__((noipa)) +swap_rotate_32 (unsigned char* x) +{ + return ((uint64_t)(x[0]) << 8 | + (uint64_t)(x[1]) << 0 | + (uint64_t)(x[2]) << 24 | + (uint64_t)(x[3]) << 16); +} + +uint32_t +__attribute__((noipa)) +swap_rotate_32_mask_1 (unsigned char* x) +{ + return ((uint64_t)(x[0]) << 8 | + (uint64_t)(0) << 0 | + (uint64_t)(x[2]) << 24 | + (uint64_t)(x[3]) << 16); +} diff --git a/gcc/testsuite/gcc.target/i386/pr108938-load-2.c b/gcc/testsuite/gcc.target/i386/pr108938-load-2.c new file mode 100644 index 00000000000..51a8102f995 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr108938-load-2.c @@ -0,0 +1,30 @@ +/* { dg-do run } */ +/* { dg-options "-O2" } */ + +#include "pr108938-load-1.c" + +int main () +{ + unsigned char a[8] = {1, 2, 3, 4, 5, 6, 7, 8}; + uint64_t res = swap_rotate_64 (a); + if (res != 0x0203040506070801ULL) + __builtin_abort (); + + res = swap_rotate_64_mask_1 (a); + if (res != 0x0506070801020300ULL) + __builtin_abort (); + + res = swap_rotate_64_mask_2 (a); + if (res != 0x0203000506070801ULL) + __builtin_abort (); + + uint32_t res2 = swap_rotate_32 (a); + if (res2 != 0x03040102) + __builtin_abort (); + + res2 = swap_rotate_32_mask_1 (a); + if (res2 != 0x03040100) + __builtin_abort (); + + return 0; +}