Message ID | 20211118061632.1762685-1-hongtao.liu@intel.com |
---|---|
State | New |
Headers | show |
Series | Reduce cost of aligned sse register store. | expand |
On Thu, Nov 18, 2021 at 7:17 AM liuhongt via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
>
> Make them be equal to cost of unaligned ones to avoid odd alignment
> peeling.
>
> Impact for SPEC2017 on CLX:
> fprate:
> 503.bwaves_r BuildSame
> 507.cactuBSSN_r -0.22
> 508.namd_r -0.02
> 510.parest_r -0.28
> 511.povray_r -0.20
> 519.lbm_r BuildSame
> 521.wrf_r -0.58
> 526.blender_r -0.30
> 527.cam4_r 1.07
> 538.imagick_r 0.01
> 544.nab_r -0.09
> 549.fotonik3d_r BuildSame
> 554.roms_r BuildSame
> intrate:
> 500.perlbench_r -0.25
> 502.gcc_r -0.15
> 505.mcf_r BuildSame
> 520.omnetpp_r 1.03
> 523.xalancbmk_r -0.13
> 525.x264_r -0.05
> 531.deepsjeng_r -0.27
> 541.leela_r -0.24
> 548.exchange2_r -0.06
> 557.xz_r -0.10
> 999.specrand_ir 2.69
>
> Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
> Ready to push to trunk.

OK.

> gcc/ChangeLog:
>
> PR target/102543
> * config/i386/x86-tune-costs.h (skylake_cost): Reduce cost of
> storing 256/512-bit SSE register to be equal to cost of
> unaligned store to avoid odd alignment peeling.
> (icelake_cost): Ditto.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/pr102543.c: New test.
> --- > gcc/config/i386/x86-tune-costs.h | 4 +-- > gcc/testsuite/gcc.target/i386/pr102543.c | 35 ++++++++++++++++++++++++ > 2 files changed, 37 insertions(+), 2 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr102543.c > > diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h > index dd5563d2e64..60d50c97fca 100644 > --- a/gcc/config/i386/x86-tune-costs.h > +++ b/gcc/config/i386/x86-tune-costs.h > @@ -1903,7 +1903,7 @@ struct processor_costs skylake_cost = { > {6, 6, 6}, /* cost of storing integer registers */ > {6, 6, 6, 10, 20}, /* cost of loading SSE register > in 32bit, 64bit, 128bit, 256bit and 512bit */ > - {8, 8, 8, 12, 24}, /* cost of storing SSE register > + {8, 8, 8, 8, 16}, /* cost of storing SSE register > in 32bit, 64bit, 128bit, 256bit and 512bit */ > {6, 6, 6, 10, 20}, /* cost of unaligned loads. */ > {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ > @@ -2029,7 +2029,7 @@ struct processor_costs icelake_cost = { > {6, 6, 6}, /* cost of storing integer registers */ > {6, 6, 6, 10, 20}, /* cost of loading SSE register > in 32bit, 64bit, 128bit, 256bit and 512bit */ > - {8, 8, 8, 12, 24}, /* cost of storing SSE register > + {8, 8, 8, 8, 16}, /* cost of storing SSE register > in 32bit, 64bit, 128bit, 256bit and 512bit */ > {6, 6, 6, 10, 20}, /* cost of unaligned loads. */ > {8, 8, 8, 8, 16}, /* cost of unaligned stores. 
*/ > diff --git a/gcc/testsuite/gcc.target/i386/pr102543.c b/gcc/testsuite/gcc.target/i386/pr102543.c > new file mode 100644 > index 00000000000..893eb9a5902 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr102543.c > @@ -0,0 +1,35 @@ > +/* PR target/102543 */ > +/* { dg-do compile } */ > +/* { dg-options "-Ofast -march=skylake-avx512 -fdump-tree-optimized" } */ > +/* { dg-final { scan-tree-dump-not "MEM\\\[" "optimized" } } */ > + > +struct a > +{ > + int a[100]; > +}; > +typedef struct a misaligned_t __attribute__ ((aligned (8))); > +typedef struct a aligned_t __attribute__ ((aligned (32))); > + > +__attribute__ ((used)) > +__attribute__ ((noinline)) > +void > +t(void *a, int misaligned, aligned_t *d) > +{ > + int i,v; > + for (i=0;i<100;i++) > + { > + if (misaligned) > + v=((misaligned_t *)a)->a[i]; > + else > + v=((aligned_t *)a)->a[i]; > + d->a[i]+=v; > + } > +} > +struct b {int v; misaligned_t m;aligned_t aa;} b; > +aligned_t d; > +int > +main() > +{ > + t(&b.m, 1, &d); > + return 0; > +} > -- > 2.18.2 >
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h index dd5563d2e64..60d50c97fca 100644 --- a/gcc/config/i386/x86-tune-costs.h +++ b/gcc/config/i386/x86-tune-costs.h @@ -1903,7 +1903,7 @@ struct processor_costs skylake_cost = { {6, 6, 6}, /* cost of storing integer registers */ {6, 6, 6, 10, 20}, /* cost of loading SSE register in 32bit, 64bit, 128bit, 256bit and 512bit */ - {8, 8, 8, 12, 24}, /* cost of storing SSE register + {8, 8, 8, 8, 16}, /* cost of storing SSE register in 32bit, 64bit, 128bit, 256bit and 512bit */ {6, 6, 6, 10, 20}, /* cost of unaligned loads. */ {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ @@ -2029,7 +2029,7 @@ struct processor_costs icelake_cost = { {6, 6, 6}, /* cost of storing integer registers */ {6, 6, 6, 10, 20}, /* cost of loading SSE register in 32bit, 64bit, 128bit, 256bit and 512bit */ - {8, 8, 8, 12, 24}, /* cost of storing SSE register + {8, 8, 8, 8, 16}, /* cost of storing SSE register in 32bit, 64bit, 128bit, 256bit and 512bit */ {6, 6, 6, 10, 20}, /* cost of unaligned loads. */ {8, 8, 8, 8, 16}, /* cost of unaligned stores. 
*/ diff --git a/gcc/testsuite/gcc.target/i386/pr102543.c b/gcc/testsuite/gcc.target/i386/pr102543.c new file mode 100644 index 00000000000..893eb9a5902 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr102543.c @@ -0,0 +1,35 @@ +/* PR target/102543 */ +/* { dg-do compile } */ +/* { dg-options "-Ofast -march=skylake-avx512 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump-not "MEM\\\[" "optimized" } } */ + +struct a +{ + int a[100]; +}; +typedef struct a misaligned_t __attribute__ ((aligned (8))); +typedef struct a aligned_t __attribute__ ((aligned (32))); + +__attribute__ ((used)) +__attribute__ ((noinline)) +void +t(void *a, int misaligned, aligned_t *d) +{ + int i,v; + for (i=0;i<100;i++) + { + if (misaligned) + v=((misaligned_t *)a)->a[i]; + else + v=((aligned_t *)a)->a[i]; + d->a[i]+=v; + } +} +struct b {int v; misaligned_t m;aligned_t aa;} b; +aligned_t d; +int +main() +{ + t(&b.m, 1, &d); + return 0; +}