diff mbox series

[3/3] x86: Update memcpy/memset inline strategies for -mtune=generic

Message ID 20210322131636.58461-4-hjl.tools@gmail.com
State New
Headers show
Series x86: Update memcpy/memset inline strategies | expand

Commit Message

H.J. Lu March 22, 2021, 1:16 p.m. UTC
Simplify memcpy and memset inline strategies to avoid branches for
-mtune=generic:

1. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector
   load and store for up to 16 * 16 (256) bytes when the data size is
   fixed and known.
2. Inline only if data size is known to be <= 256.
   a. Use "rep movsb/stosb" with simple code sequence if the data size
      is a constant.
   b. Use loop if data size is not a constant.
3. Use memcpy/memset library function if data size is unknown or > 256.

With -mtune=generic -O2,

1. On Ice Lake processor,

Performance impacts on SPEC CPU 2017:

500.perlbench_r  0.51%
502.gcc_r        0.55%
505.mcf_r        0.38%
520.omnetpp_r   -0.74%
523.xalancbmk_r -0.35%
525.x264_r       2.99%
531.deepsjeng_r -0.17%
541.leela_r     -0.98%
548.exchange2_r  0.89%
557.xz_r         0.70%
Geomean          0.37%

503.bwaves_r     0.04%
507.cactuBSSN_r -0.01%
508.namd_r      -0.45%
510.parest_r    -0.09%
511.povray_r    -1.37%
519.lbm_r        0.00%
521.wrf_r       -2.56%
526.blender_r   -0.01%
527.cam4_r      -0.05%
538.imagick_r    0.36%
544.nab_r        0.08%
549.fotonik3d_r -0.06%
554.roms_r       0.05%
Geomean         -0.34%

Significant impacts on eembc benchmarks:

eembc/nnet_test      14.85%
eembc/mp2decoddata2  13.57%

2. On Cascadelake processor,

Performance impacts on SPEC CPU 2017:

500.perlbench_r -0.02%
502.gcc_r        0.10%
505.mcf_r       -1.14%
520.omnetpp_r   -0.22%
523.xalancbmk_r  0.21%
525.x264_r       0.94%
531.deepsjeng_r -0.37%
541.leela_r     -0.46%
548.exchange2_r -0.40%
557.xz_r         0.60%
Geomean         -0.08%

503.bwaves_r    -0.50%
507.cactuBSSN_r  0.05%
508.namd_r      -0.02%
510.parest_r     0.09%
511.povray_r    -1.35%
519.lbm_r        0.00%
521.wrf_r       -0.03%
526.blender_r   -0.83%
527.cam4_r       1.23%
538.imagick_r    0.97%
544.nab_r       -0.02%
549.fotonik3d_r -0.12%
554.roms_r       0.55%
Geomean          0.00%

Significant impacts on eembc benchmarks:

eembc/nnet_test      9.90%
eembc/mp2decoddata2  16.42%
eembc/textv2data3   -4.86%
eembc/qos            12.90%

3. On Znver3 processor,

Performance impacts on SPEC CPU 2017:

500.perlbench_r -0.96%
502.gcc_r       -1.06%
505.mcf_r       -0.01%
520.omnetpp_r   -1.45%
523.xalancbmk_r  2.89%
525.x264_r       4.98%
531.deepsjeng_r  0.18%
541.leela_r     -1.54%
548.exchange2_r -1.25%
557.xz_r        -0.01%
Geomean          0.16%

503.bwaves_r     0.04%
507.cactuBSSN_r  0.85%
508.namd_r      -0.13%
510.parest_r     0.39%
511.povray_r     0.00%
519.lbm_r        0.00%
521.wrf_r        0.28%
526.blender_r   -0.10%
527.cam4_r      -0.58%
538.imagick_r    0.69%
544.nab_r       -0.04%
549.fotonik3d_r -0.04%
554.roms_r       0.40%
Geomean          0.15%

Significant impacts on eembc benchmarks:

eembc/aifftr01       13.95%
eembc/idctrn01       8.41%
eembc/nnet_test      30.25%
eembc/mp2decoddata2  5.05%
eembc/textv2data3    6.43%
eembc/qos           -5.79%

gcc/

	* config/i386/x86-tune-costs.h (generic_memcpy): Updated.
	(generic_memset): Likewise.
	(generic_cost): Change CLEAR_RATIO to 17.
	* config/i386/x86-tune.def (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB):
	Add m_GENERIC.

gcc/testsuite/

	* gcc.target/i386/memcpy-strategy-12.c: New test.
	* gcc.target/i386/memcpy-strategy-13.c: Likewise.
	* gcc.target/i386/memset-strategy-10.c: Likewise.
	* gcc.target/i386/memset-strategy-11.c: Likewise.
	* gcc.target/i386/shrink_wrap_1.c: Also pass
	-mmemset-strategy=rep_8byte:-1:align.
	* gcc.target/i386/sw-1.c: Also pass -mstringop-strategy=rep_byte.
---
 gcc/config/i386/x86-tune-costs.h              | 31 ++++++++++++-------
 gcc/config/i386/x86-tune.def                  |  2 +-
 .../gcc.target/i386/memcpy-strategy-12.c      |  9 ++++++
 .../gcc.target/i386/memcpy-strategy-13.c      | 11 +++++++
 .../gcc.target/i386/memset-strategy-10.c      | 11 +++++++
 .../gcc.target/i386/memset-strategy-11.c      |  9 ++++++
 gcc/testsuite/gcc.target/i386/shrink_wrap_1.c |  2 +-
 gcc/testsuite/gcc.target/i386/sw-1.c          |  2 +-
 8 files changed, 63 insertions(+), 14 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-strategy-13.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-10.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-11.c

Comments

Richard Biener March 22, 2021, 1:29 p.m. UTC | #1
On Mon, Mar 22, 2021 at 2:19 PM H.J. Lu via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Simply memcpy and memset inline strategies to avoid branches for
> -mtune=generic:
>
> 1. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector
>    load and store for up to 16 * 16 (256) bytes when the data size is
>    fixed and known.
> 2. Inline only if data size is known to be <= 256.
>    a. Use "rep movsb/stosb" with simple code sequence if the data size
>       is a constant.
>    b. Use loop if data size is not a constant.
> 3. Use memcpy/memset libray function if data size is unknown or > 256.
>
> With -mtune=generic -O2,

Is there any visible code-size effect of increasing CLEAR_RATIO on
SPEC/eembc?  Did you play with other values of MOVE/CLEAR_RATIO?
17 memory-to-memory/memory-clear insns looks quite a lot.

> 1. On Ice Lake processor,
>
> Performance impacts on SPEC CPU 2017:
>
> 500.perlbench_r  0.51%
> 502.gcc_r        0.55%
> 505.mcf_r        0.38%
> 520.omnetpp_r   -0.74%
> 523.xalancbmk_r -0.35%
> 525.x264_r       2.99%
> 531.deepsjeng_r -0.17%
> 541.leela_r     -0.98%
> 548.exchange2_r  0.89%
> 557.xz_r         0.70%
> Geomean          0.37%
>
> 503.bwaves_r     0.04%
> 507.cactuBSSN_r -0.01%
> 508.namd_r      -0.45%
> 510.parest_r    -0.09%
> 511.povray_r    -1.37%
> 519.lbm_r        0.00%
> 521.wrf_r       -2.56%
> 526.blender_r   -0.01%
> 527.cam4_r      -0.05%
> 538.imagick_r    0.36%
> 544.nab_r        0.08%
> 549.fotonik3d_r -0.06%
> 554.roms_r       0.05%
> Geomean         -0.34%
>
> Significant impacts on eembc benchmarks:
>
> eembc/nnet_test      14.85%
> eembc/mp2decoddata2  13.57%
>
> 2. On Cascadelake processor,
>
> Performance impacts on SPEC CPU 2017:
>
> 500.perlbench_r -0.02%
> 502.gcc_r        0.10%
> 505.mcf_r       -1.14%
> 520.omnetpp_r   -0.22%
> 523.xalancbmk_r  0.21%
> 525.x264_r       0.94%
> 531.deepsjeng_r -0.37%
> 541.leela_r     -0.46%
> 548.exchange2_r -0.40%
> 557.xz_r         0.60%
> Geomean         -0.08%
>
> 503.bwaves_r    -0.50%
> 507.cactuBSSN_r  0.05%
> 508.namd_r      -0.02%
> 510.parest_r     0.09%
> 511.povray_r    -1.35%
> 519.lbm_r        0.00%
> 521.wrf_r       -0.03%
> 526.blender_r   -0.83%
> 527.cam4_r       1.23%
> 538.imagick_r    0.97%
> 544.nab_r       -0.02%
> 549.fotonik3d_r -0.12%
> 554.roms_r       0.55%
> Geomean          0.00%
>
> Significant impacts on eembc benchmarks:
>
> eembc/nnet_test      9.90%
> eembc/mp2decoddata2  16.42%
> eembc/textv2data3   -4.86%
> eembc/qos            12.90%
>
> 3. On Znver3 processor,
>
> Performance impacts on SPEC CPU 2017:
>
> 500.perlbench_r -0.96%
> 502.gcc_r       -1.06%
> 505.mcf_r       -0.01%
> 520.omnetpp_r   -1.45%
> 523.xalancbmk_r  2.89%
> 525.x264_r       4.98%
> 531.deepsjeng_r  0.18%
> 541.leela_r     -1.54%
> 548.exchange2_r -1.25%
> 557.xz_r        -0.01%
> Geomean          0.16%
>
> 503.bwaves_r     0.04%
> 507.cactuBSSN_r  0.85%
> 508.namd_r      -0.13%
> 510.parest_r     0.39%
> 511.povray_r     0.00%
> 519.lbm_r        0.00%
> 521.wrf_r        0.28%
> 526.blender_r   -0.10%
> 527.cam4_r      -0.58%
> 538.imagick_r    0.69%
> 544.nab_r       -0.04%
> 549.fotonik3d_r -0.04%
> 554.roms_r       0.40%
> Geomean          0.15%
>
> Significant impacts on eembc benchmarks:
>
> eembc/aifftr01       13.95%
> eembc/idctrn01       8.41%
> eembc/nnet_test      30.25%
> eembc/mp2decoddata2  5.05%
> eembc/textv2data3    6.43%
> eembc/qos           -5.79%
>
> gcc/
>
>         * config/i386/x86-tune-costs.h (generic_memcpy): Updated.
>         (generic_memset): Likewise.
>         (generic_cost): Change CLEAR_RATIO to 17.
>         * config/i386/x86-tune.def (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB):
>         Add m_GENERIC.
>
> gcc/testsuite/
>
>         * gcc.target/i386/memcpy-strategy-12.c: New test.
>         * gcc.target/i386/memcpy-strategy-13.c: Likewise.
>         * gcc.target/i386/memset-strategy-10.c: Likewise.
>         * gcc.target/i386/memset-strategy-11.c: Likewise.
>         * gcc.target/i386/shrink_wrap_1.c: Also pass
>         -mmemset-strategy=rep_8byte:-1:align.
>         * gcc.target/i386/sw-1.c: Also pass -mstringop-strategy=rep_byte.
> ---
>  gcc/config/i386/x86-tune-costs.h              | 31 ++++++++++++-------
>  gcc/config/i386/x86-tune.def                  |  2 +-
>  .../gcc.target/i386/memcpy-strategy-12.c      |  9 ++++++
>  .../gcc.target/i386/memcpy-strategy-13.c      | 11 +++++++
>  .../gcc.target/i386/memset-strategy-10.c      | 11 +++++++
>  .../gcc.target/i386/memset-strategy-11.c      |  9 ++++++
>  gcc/testsuite/gcc.target/i386/shrink_wrap_1.c |  2 +-
>  gcc/testsuite/gcc.target/i386/sw-1.c          |  2 +-
>  8 files changed, 63 insertions(+), 14 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-strategy-13.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-10.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-11.c
>
> diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
> index ffe810f2bcb..30e7c3e4261 100644
> --- a/gcc/config/i386/x86-tune-costs.h
> +++ b/gcc/config/i386/x86-tune-costs.h
> @@ -2844,19 +2844,28 @@ struct processor_costs intel_cost = {
>    "16",                                        /* Func alignment.  */
>  };
>
> -/* Generic should produce code tuned for Core-i7 (and newer chips)
> -   and btver1 (and newer chips).  */
> +/* Generic should produce code tuned for Haswell (and newer chips)
> +   and znver1 (and newer chips).  NB: rep_prefix_1_byte is used only
> +   for known size.  */
>
>  static stringop_algs generic_memcpy[2] = {
> -  {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
> -             {-1, libcall, false}}},
> -  {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
> -             {-1, libcall, false}}}};
> +  {libcall,
> +   {{256, rep_prefix_1_byte, true},
> +    {256, loop, false},
> +    {-1, libcall, false}}},
> +  {libcall,
> +   {{256, rep_prefix_1_byte, true},
> +    {256, loop, false},
> +    {-1, libcall, false}}}};
>  static stringop_algs generic_memset[2] = {
> -  {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
> -             {-1, libcall, false}}},
> -  {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
> -             {-1, libcall, false}}}};
> +  {libcall,
> +   {{256, rep_prefix_1_byte, true},
> +    {256, loop, false},
> +    {-1, libcall, false}}},
> +  {libcall,
> +   {{256, rep_prefix_1_byte, true},
> +    {256, loop, false},
> +    {-1, libcall, false}}}};
>  static const
>  struct processor_costs generic_cost = {
>    {
> @@ -2913,7 +2922,7 @@ struct processor_costs generic_cost = {
>    COSTS_N_INSNS (1),                   /* cost of movzx */
>    8,                                   /* "large" insn */
>    17,                                  /* MOVE_RATIO */
> -  6,                                   /* CLEAR_RATIO */
> +  17,                                  /* CLEAR_RATIO */
>    {6, 6, 6},                           /* cost of loading integer registers
>                                            in QImode, HImode and SImode.
>                                            Relative to reg-reg move (2).  */
> diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
> index eb057a67750..fd9c011a3f5 100644
> --- a/gcc/config/i386/x86-tune.def
> +++ b/gcc/config/i386/x86-tune.def
> @@ -273,7 +273,7 @@ DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)
>     move/set sequences of bytes with known size.  */
>  DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB,
>           "prefer_known_rep_movsb_stosb",
> -         m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512)
> +         m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512 | m_GENERIC)
>
>  /* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
>     compact prologues and epilogues by issuing a misaligned moves.  This
> diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c
> new file mode 100644
> index 00000000000..87f03352736
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c
> @@ -0,0 +1,9 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mtune=generic" } */
> +/* { dg-final { scan-assembler "rep movsb" } } */
> +
> +void
> +foo (char *dest, char *src)
> +{
> +  __builtin_memcpy (dest, src, 249);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-13.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-13.c
> new file mode 100644
> index 00000000000..cfc3cfba623
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-13.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mtune=generic" } */
> +/* { dg-final { scan-assembler "jmp\tmemcpy" { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler "call\tmemcpy" { target ia32 } } } */
> +/* { dg-final { scan-assembler-not "rep movsb" } } */
> +
> +void
> +foo (char *dest, char *src)
> +{
> +  __builtin_memcpy (dest, src, 257);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-10.c b/gcc/testsuite/gcc.target/i386/memset-strategy-10.c
> new file mode 100644
> index 00000000000..ade5e8da42c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-10.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mtune=generic" } */
> +/* { dg-final { scan-assembler "jmp\tmemset" { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler "call\tmemset" { target ia32 } } } */
> +/* { dg-final { scan-assembler-not "rep stosb" } } */
> +
> +void
> +foo (char *dest)
> +{
> +  __builtin_memset (dest, 0, 257);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-11.c b/gcc/testsuite/gcc.target/i386/memset-strategy-11.c
> new file mode 100644
> index 00000000000..d1b86152474
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-11.c
> @@ -0,0 +1,9 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mtune=generic" } */
> +/* { dg-final { scan-assembler "rep stosb" } } */
> +
> +void
> +foo (char *dest)
> +{
> +  __builtin_memset (dest, 0, 253);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/shrink_wrap_1.c b/gcc/testsuite/gcc.target/i386/shrink_wrap_1.c
> index 94dadd6cdbd..44fe7d2836e 100644
> --- a/gcc/testsuite/gcc.target/i386/shrink_wrap_1.c
> +++ b/gcc/testsuite/gcc.target/i386/shrink_wrap_1.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile { target { ! ia32 } } } */
> -/* { dg-options "-O2 -fdump-rtl-pro_and_epilogue" } */
> +/* { dg-options "-O2 -mmemset-strategy=rep_8byte:-1:align -fdump-rtl-pro_and_epilogue" } */
>
>  enum machine_mode
>  {
> diff --git a/gcc/testsuite/gcc.target/i386/sw-1.c b/gcc/testsuite/gcc.target/i386/sw-1.c
> index aec095eda62..f61621e42bf 100644
> --- a/gcc/testsuite/gcc.target/i386/sw-1.c
> +++ b/gcc/testsuite/gcc.target/i386/sw-1.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O2 -mtune=generic -fshrink-wrap -fdump-rtl-pro_and_epilogue" } */
> +/* { dg-options "-O2 -mtune=generic -mstringop-strategy=rep_byte -fshrink-wrap -fdump-rtl-pro_and_epilogue" } */
>  /* { dg-skip-if "No shrink-wrapping preformed" { x86_64-*-mingw* } } */
>
>  #include <string.h>
> --
> 2.30.2
>
H.J. Lu March 22, 2021, 1:38 p.m. UTC | #2
On Mon, Mar 22, 2021 at 6:29 AM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Mon, Mar 22, 2021 at 2:19 PM H.J. Lu via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > Simply memcpy and memset inline strategies to avoid branches for
> > -mtune=generic:
> >
> > 1. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector
> >    load and store for up to 16 * 16 (256) bytes when the data size is
> >    fixed and known.
> > 2. Inline only if data size is known to be <= 256.
> >    a. Use "rep movsb/stosb" with simple code sequence if the data size
> >       is a constant.
> >    b. Use loop if data size is not a constant.
> > 3. Use memcpy/memset libray function if data size is unknown or > 256.
> >
> > With -mtune=generic -O2,
>
> Is there any visible code-size effect of increasing CLEAR_RATIO on

Hongyue, please collect code size differences on SPEC CPU 2017 and
eembc.

> SPEC/eembc?  Did you play with other values of MOVE/CLEAR_RATIO?
> 17 memory-to-memory/memory-clear insns looks quite a lot.
>

Yes, we did.  256 bytes is the threshold above which memcpy/memset in libc
win. Below 256 bytes, 16 by_pieces move/store is faster.
Hongyu Wang March 23, 2021, 2:41 a.m. UTC | #3
> Hongyue, please collect code size differences on SPEC CPU 2017 and
> eembc.

Here is code size difference for this patch

SPEC CPU 2017
                                   difference             w patch      w/o patch
500.perlbench_r              0.051%             1622637          1621805
502.gcc_r                         0.039%             6930877          6928141
505.mcf_r                         0.098%             16413              16397
520.omnetpp_r               0.083%             1327757          1326653
523.xalancbmk_r            0.001%             3575709          3575677
525.x264_r                       -0.067%           769095            769607
531.deepsjeng_r             0.071%             67629              67581
541.leela_r                       -3.062%           127629            131661
548.exchange2_r            -0.338%            66141              66365
557.xz_r                            0.946%            128061            126861

503.bwaves_r                  0.534%             33117              32941
507.cactuBSSN_r            0.004%             2993645          2993517
508.namd_r                     0.006%             851677            851629
510.parest_r                    0.488%             6741277          6708557
511.povray_r                   -0.021%           849290            849466
521.wrf_r                         0.022%             29682154       29675530
526.blender_r                  0.054%             7544057          7540009
527.cam4_r                      0.043%             6102234          6099594
538.imagick_r                  -0.015%           1625770          1626010
544.nab_r                         0.155%             155453            155213
549.fotonik3d_r              0.000%             351757            351757
554.roms_r                      0.041%             735837            735533

eembc
                                    difference        w patch      w/o patch
aifftr01                              0.762%             14813            14701
aiifft01                              0.556%             14477            14397
idctrn01                            0.101%             15853            15837
cjpeg-rose7-preset         0.114%             56125              56061
nnet_test                         -0.848%           35549              35853
aes                                   0.125%             38493            38445
cjpegv2data                     0.108%             59213              59149
djpegv2data                     0.025%             63821              63805
huffde                               -0.104%           30621              30653
mp2decoddata                -0.047%           68285              68317
mp2enf32data1              0.018%             86925              86909
mp2enf32data2              0.018%             89357              89341
mp2enf32data3              0.018%             88253              88237
mp3playerfixeddata       0.103%             46877              46829
ip_pktcheckb1m              0.191%             25213              25165
nat                                   0.527%             45757             45517
ospfv2                               0.196%             24573             24525
routelookup                     0.189%             25389              25341
tcpbulk                            0.155%             30925              30877
textv2data                        0.055%             29101              29085

H.J. Lu via Gcc-patches <gcc-patches@gcc.gnu.org> 于2021年3月22日周一 下午9:39写道:
>
> On Mon, Mar 22, 2021 at 6:29 AM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> > On Mon, Mar 22, 2021 at 2:19 PM H.J. Lu via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> > >
> > > Simply memcpy and memset inline strategies to avoid branches for
> > > -mtune=generic:
> > >
> > > 1. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector
> > >    load and store for up to 16 * 16 (256) bytes when the data size is
> > >    fixed and known.
> > > 2. Inline only if data size is known to be <= 256.
> > >    a. Use "rep movsb/stosb" with simple code sequence if the data size
> > >       is a constant.
> > >    b. Use loop if data size is not a constant.
> > > 3. Use memcpy/memset libray function if data size is unknown or > 256.
> > >
> > > With -mtune=generic -O2,
> >
> > Is there any visible code-size effect of increasing CLEAR_RATIO on
>
> Hongyue, please collect code size differences on SPEC CPU 2017 and
> eembc.
>
> > SPEC/eembc?  Did you play with other values of MOVE/CLEAR_RATIO?
> > 17 memory-to-memory/memory-clear insns looks quite a lot.
> >
>
> Yes, we did.  256 bytes is the threshold above which memcpy/memset in libc
> win. Below 256 bytes, 16 by_pieces move/store is faster.
>
> --
> H.J.
Richard Biener March 23, 2021, 8:19 a.m. UTC | #4
On Tue, Mar 23, 2021 at 3:41 AM Hongyu Wang <wwwhhhyyy333@gmail.com> wrote:
>
> > Hongyue, please collect code size differences on SPEC CPU 2017 and
> > eembc.
>
> Here is code size difference for this patch

Thanks, nothing too bad although slightly larger impacts than envisioned.

> SPEC CPU 2017
>                                    difference             w patch      w/o patch
> 500.perlbench_r              0.051%             1622637          1621805
> 502.gcc_r                         0.039%             6930877          6928141
> 505.mcf_r                         0.098%             16413              16397
> 520.omnetpp_r               0.083%             1327757          1326653
> 523.xalancbmk_r            0.001%             3575709          3575677
> 525.x264_r                       -0.067%           769095            769607
> 531.deepsjeng_r             0.071%             67629              67581
> 541.leela_r                       -3.062%           127629            131661
> 548.exchange2_r            -0.338%            66141              66365
> 557.xz_r                            0.946%            128061            126861
>
> 503.bwaves_r                  0.534%             33117              32941
> 507.cactuBSSN_r            0.004%             2993645          2993517
> 508.namd_r                     0.006%             851677            851629
> 510.parest_r                    0.488%             6741277          6708557
> 511.povray_r                   -0.021%           849290            849466
> 521.wrf_r                         0.022%             29682154       29675530
> 526.blender_r                  0.054%             7544057          7540009
> 527.cam4_r                      0.043%             6102234          6099594
> 538.imagick_r                  -0.015%           1625770          1626010
> 544.nab_r                         0.155%             155453            155213
> 549.fotonik3d_r              0.000%             351757            351757
> 554.roms_r                      0.041%             735837            735533
>
> eembc
>                                     difference        w patch      w/o patch
> aifftr01                              0.762%             14813            14701
> aiifft01                              0.556%             14477            14397
> idctrn01                            0.101%             15853            15837
> cjpeg-rose7-preset         0.114%             56125              56061
> nnet_test                         -0.848%           35549              35853
> aes                                   0.125%             38493            38445
> cjpegv2data                     0.108%             59213              59149
> djpegv2data                     0.025%             63821              63805
> huffde                               -0.104%           30621              30653
> mp2decoddata                -0.047%           68285              68317
> mp2enf32data1              0.018%             86925              86909
> mp2enf32data2              0.018%             89357              89341
> mp2enf32data3              0.018%             88253              88237
> mp3playerfixeddata       0.103%             46877              46829
> ip_pktcheckb1m              0.191%             25213              25165
> nat                                   0.527%             45757             45517
> ospfv2                               0.196%             24573             24525
> routelookup                     0.189%             25389              25341
> tcpbulk                            0.155%             30925              30877
> textv2data                        0.055%             29101              29085
>
> H.J. Lu via Gcc-patches <gcc-patches@gcc.gnu.org> 于2021年3月22日周一 下午9:39写道:
> >
> > On Mon, Mar 22, 2021 at 6:29 AM Richard Biener
> > <richard.guenther@gmail.com> wrote:
> > >
> > > On Mon, Mar 22, 2021 at 2:19 PM H.J. Lu via Gcc-patches
> > > <gcc-patches@gcc.gnu.org> wrote:
> > > >
> > > > Simply memcpy and memset inline strategies to avoid branches for
> > > > -mtune=generic:
> > > >
> > > > 1. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector
> > > >    load and store for up to 16 * 16 (256) bytes when the data size is
> > > >    fixed and known.
> > > > 2. Inline only if data size is known to be <= 256.
> > > >    a. Use "rep movsb/stosb" with simple code sequence if the data size
> > > >       is a constant.
> > > >    b. Use loop if data size is not a constant.
> > > > 3. Use memcpy/memset libray function if data size is unknown or > 256.
> > > >
> > > > With -mtune=generic -O2,
> > >
> > > Is there any visible code-size effect of increasing CLEAR_RATIO on
> >
> > Hongyue, please collect code size differences on SPEC CPU 2017 and
> > eembc.
> >
> > > SPEC/eembc?  Did you play with other values of MOVE/CLEAR_RATIO?
> > > 17 memory-to-memory/memory-clear insns looks quite a lot.
> > >
> >
> > Yes, we did.  256 bytes is the threshold above which memcpy/memset in libc
> > win. Below 256 bytes, 16 by_pieces move/store is faster.
> >
> > --
> > H.J.
>
> --
> Regards,
>
> Hongyu, Wang
diff mbox series

Patch

diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index ffe810f2bcb..30e7c3e4261 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -2844,19 +2844,28 @@  struct processor_costs intel_cost = {
   "16",					/* Func alignment.  */
 };
 
-/* Generic should produce code tuned for Core-i7 (and newer chips)
-   and btver1 (and newer chips).  */
+/* Generic should produce code tuned for Haswell (and newer chips)
+   and znver1 (and newer chips).  NB: rep_prefix_1_byte is used only
+   for known size.  */
 
 static stringop_algs generic_memcpy[2] = {
-  {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
-             {-1, libcall, false}}},
-  {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
-             {-1, libcall, false}}}};
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+    {256, loop, false},
+    {-1, libcall, false}}},
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+    {256, loop, false},
+    {-1, libcall, false}}}};
 static stringop_algs generic_memset[2] = {
-  {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
-             {-1, libcall, false}}},
-  {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
-             {-1, libcall, false}}}};
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+    {256, loop, false},
+    {-1, libcall, false}}},
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+    {256, loop, false},
+    {-1, libcall, false}}}};
 static const
 struct processor_costs generic_cost = {
   {
@@ -2913,7 +2922,7 @@  struct processor_costs generic_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   17,					/* MOVE_RATIO */
-  6,					/* CLEAR_RATIO */
+  17,					/* CLEAR_RATIO */
   {6, 6, 6},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index eb057a67750..fd9c011a3f5 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -273,7 +273,7 @@  DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)
    move/set sequences of bytes with known size.  */
 DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB,
 	  "prefer_known_rep_movsb_stosb",
-	  m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512)
+	  m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512 | m_GENERIC)
 
 /* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
    compact prologues and epilogues by issuing a misaligned moves.  This
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c
new file mode 100644
index 00000000000..87f03352736
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c
@@ -0,0 +1,9 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=generic" } */
+/* { dg-final { scan-assembler "rep movsb" } } */
+
+void
+foo (char *dest, char *src)
+{
+  __builtin_memcpy (dest, src, 249);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-13.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-13.c
new file mode 100644
index 00000000000..cfc3cfba623
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-13.c
@@ -0,0 +1,11 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=generic" } */
+/* { dg-final { scan-assembler "jmp\tmemcpy" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler "call\tmemcpy" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "rep movsb" } } */
+
+void
+foo (char *dest, char *src)
+{
+  __builtin_memcpy (dest, src, 257);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-10.c b/gcc/testsuite/gcc.target/i386/memset-strategy-10.c
new file mode 100644
index 00000000000..ade5e8da42c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-10.c
@@ -0,0 +1,11 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=generic" } */
+/* { dg-final { scan-assembler "jmp\tmemset" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler "call\tmemset" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "rep stosb" } } */
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 0, 257);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-11.c b/gcc/testsuite/gcc.target/i386/memset-strategy-11.c
new file mode 100644
index 00000000000..d1b86152474
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-11.c
@@ -0,0 +1,9 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=generic" } */
+/* { dg-final { scan-assembler "rep stosb" } } */
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 0, 253);
+}
diff --git a/gcc/testsuite/gcc.target/i386/shrink_wrap_1.c b/gcc/testsuite/gcc.target/i386/shrink_wrap_1.c
index 94dadd6cdbd..44fe7d2836e 100644
--- a/gcc/testsuite/gcc.target/i386/shrink_wrap_1.c
+++ b/gcc/testsuite/gcc.target/i386/shrink_wrap_1.c
@@ -1,5 +1,5 @@ 
 /* { dg-do compile { target { ! ia32 } } } */
-/* { dg-options "-O2 -fdump-rtl-pro_and_epilogue" } */
+/* { dg-options "-O2 -mmemset-strategy=rep_8byte:-1:align -fdump-rtl-pro_and_epilogue" } */
 
 enum machine_mode
 {
diff --git a/gcc/testsuite/gcc.target/i386/sw-1.c b/gcc/testsuite/gcc.target/i386/sw-1.c
index aec095eda62..f61621e42bf 100644
--- a/gcc/testsuite/gcc.target/i386/sw-1.c
+++ b/gcc/testsuite/gcc.target/i386/sw-1.c
@@ -1,5 +1,5 @@ 
 /* { dg-do compile } */
-/* { dg-options "-O2 -mtune=generic -fshrink-wrap -fdump-rtl-pro_and_epilogue" } */
+/* { dg-options "-O2 -mtune=generic -mstringop-strategy=rep_byte -fshrink-wrap -fdump-rtl-pro_and_epilogue" } */
 /* { dg-skip-if "No shrink-wrapping preformed" { x86_64-*-mingw* } } */
 
 #include <string.h>