diff mbox series

[v2,3/3] x86: Enable non-temporal memset for Hygon processors

Message ID 1724050675-17136-4-git-send-email-wangfeifei@hygon.cn
State New
Headers show
Series x86: Add support for Hygon processors | expand

Commit Message

Feifei Wang Aug. 19, 2024, 6:57 a.m. UTC
This patch clears the 'Avoid_Non_Temporal_Memset' flag to enable
the non-temporal memset implementation for Hygon processors.

Test Results:

hygon1 arch
x86_memset_non_temporal_threshold = 8MB
size                          new performance time / old performance time
1MB                           0.994
4MB                           0.996
8MB                           0.670
16MB                          0.343
32MB                          0.355

hygon2 arch
x86_memset_non_temporal_threshold = 8MB
size                          new performance time / old performance time
1MB                           1
4MB                           1
8MB                           1.312
16MB                          0.822
32MB                          0.830

hygon3 arch
x86_memset_non_temporal_threshold = 8MB
size                          new performance time / old performance time
1MB                           1
4MB                           0.990
8MB                           0.737
16MB                          0.390
32MB                          0.401

On Hygon architectures, non-temporal stores with this patch can
improve performance by 20% - 65%.

Signed-off-by: Feifei Wang <wangfeifei@hygon.cn>
Reviewed-by: Jing Li <lijing@hygon.cn>
---
 sysdeps/x86/cpu-features.c | 9 +++++++--
 sysdeps/x86/dl-cacheinfo.h | 2 +-
 2 files changed, 8 insertions(+), 3 deletions(-)

Comments

H.J. Lu Aug. 19, 2024, 1:13 p.m. UTC | #1
On Sun, Aug 18, 2024 at 11:59 PM Feifei Wang <wangfeifei@hygon.cn> wrote:
>
> This patch uses 'Avoid_Non_Temporal_Memset' flag to access
> the non-temporal memset implementation for hygon processors.
>
> Test Results:
>
> hygon1 arch
> x86_memset_non_temporal_threshold = 8MB
> size                          new performance time / old performance time
> 1MB                           0.994
> 4MB                           0.996
> 8MB                           0.670
> 16MB                          0.343
> 32MB                          0.355
>
> hygon2 arch
> x86_memset_non_temporal_threshold = 8MB
> size                          new performance time / old performance time
> 1MB                           1
> 4MB                           1
> 8MB                           1.312
> 16MB                          0.822
> 32MB                          0.830
>
> hygon3 arch
> x86_memset_non_temporal_threshold = 8MB
> size                          new performance time / old performance time
> 1MB                           1
> 4MB                           0.990
> 8MB                           0.737
> 16MB                          0.390
> 32MB                          0.401
>
> For hygon arch with this patch, non-temporal stores can improve
> performance by 20% - 65%.
>
> Signed-off-by: Feifei Wang <wangfeifei@hygon.cn>
> Reviewed-by: Jing Li <lijing@hygon.cn>
> ---
>  sysdeps/x86/cpu-features.c | 9 +++++++--
>  sysdeps/x86/dl-cacheinfo.h | 2 +-
>  2 files changed, 8 insertions(+), 3 deletions(-)
>
> diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> index e6139e2837..1f30e237f5 100644
> --- a/sysdeps/x86/cpu-features.c
> +++ b/sysdeps/x86/cpu-features.c
> @@ -756,9 +756,9 @@ init_cpu_features (struct cpu_features *cpu_features)
>    unsigned int stepping = 0;
>    enum cpu_features_kind kind;
>
> -  /* Default is avoid non-temporal memset for non Intel/AMD hardware. This is,
> +  /* Default is avoid non-temporal memset for non Intel/AMD/Hygon hardware. This is,
>       as of writing this, we only have benchmarks indicatings it profitability
> -     on Intel/AMD.  */
> +     on Intel/AMD/Hygon.  */
>    cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
>        |= bit_arch_Avoid_Non_Temporal_Memset;
>
> @@ -1116,6 +1116,11 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
>        get_extended_indices (cpu_features);
>
>        update_active (cpu_features);
> +
> +      /* Benchmarks indicate non-temporal memset can be profitable on Hygon
> +       hardware.  */
> +      cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
> +           &= ~bit_arch_Avoid_Non_Temporal_Memset;
>      }
>    else
>      {
> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> index 8f4fe98d88..e9579505a3 100644
> --- a/sysdeps/x86/dl-cacheinfo.h
> +++ b/sysdeps/x86/dl-cacheinfo.h
> @@ -1071,7 +1071,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>
>    /* Non-temporal stores are more performant on some hardware above
>       non_temporal_threshold. Currently Prefer_Non_Temporal is set for for both
> -     Intel and AMD hardware. */
> +     Intel, AMD and Hygon hardware. */
>    unsigned long int memset_non_temporal_threshold = SIZE_MAX;
>    if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
>      memset_non_temporal_threshold = non_temporal_threshold;
> --
> 2.43.0
>

Don't you need to enable Avoid_STOSB to avoid "stosb"?
Feifei Wang Aug. 20, 2024, 2:25 a.m. UTC | #2
> -----邮件原件-----
> 发件人: H.J. Lu <hjl.tools@gmail.com>
> 发送时间: 2024年8月19日 21:14
> 收件人: Feifei Wang <wangfeifei@hygon.cn>
> 抄送: libc-alpha@sourceware.org; carlos@redhat.com; fw@deneb.enyo.de;
> goldstein.w.n@gmail.com; adhemerval.zanella@linaro.org; Jing Li
> <lijing@hygon.cn>
> 主题: Re: [v2 3/3] x86: Enable non-temporal memset for Hygon processors
> 
> On Sun, Aug 18, 2024 at 11:59 PM Feifei Wang <wangfeifei@hygon.cn> wrote:
> >
> > This patch uses 'Avoid_Non_Temporal_Memset' flag to access the
> > non-temporal memset implementation for hygon processors.
> >
> > Test Results:
> >
> > hygon1 arch
> > x86_memset_non_temporal_threshold = 8MB
> > size                          new performance time / old performance
> time
> > 1MB                           0.994
> > 4MB                           0.996
> > 8MB                           0.670
> > 16MB                          0.343
> > 32MB                          0.355
> >
> > hygon2 arch
> > x86_memset_non_temporal_threshold = 8MB
> > size                          new performance time / old performance
> time
> > 1MB                           1
> > 4MB                           1
> > 8MB                           1.312
> > 16MB                          0.822
> > 32MB                          0.830
> >
> > hygon3 arch
> > x86_memset_non_temporal_threshold = 8MB
> > size                          new performance time / old performance
> time
> > 1MB                           1
> > 4MB                           0.990
> > 8MB                           0.737
> > 16MB                          0.390
> > 32MB                          0.401
> >
> > For hygon arch with this patch, non-temporal stores can improve
> > performance by 20% - 65%.
> >
> > Signed-off-by: Feifei Wang <wangfeifei@hygon.cn>
> > Reviewed-by: Jing Li <lijing@hygon.cn>
> > ---
> >  sysdeps/x86/cpu-features.c | 9 +++++++--  sysdeps/x86/dl-cacheinfo.h
> > | 2 +-
> >  2 files changed, 8 insertions(+), 3 deletions(-)
> >
> > diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> > index e6139e2837..1f30e237f5 100644
> > --- a/sysdeps/x86/cpu-features.c
> > +++ b/sysdeps/x86/cpu-features.c
> > @@ -756,9 +756,9 @@ init_cpu_features (struct cpu_features
> *cpu_features)
> >    unsigned int stepping = 0;
> >    enum cpu_features_kind kind;
> >
> > -  /* Default is avoid non-temporal memset for non Intel/AMD hardware.
> > This is,
> > +  /* Default is avoid non-temporal memset for non Intel/AMD/Hygon
> > + hardware. This is,
> >       as of writing this, we only have benchmarks indicatings it profitability
> > -     on Intel/AMD.  */
> > +     on Intel/AMD/Hygon.  */
> >    cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
> >        |= bit_arch_Avoid_Non_Temporal_Memset;
> >
> > @@ -1116,6 +1116,11 @@
> https://www.intel.com/content/www/us/en/support/articles/000059422/proce
> ssors.ht
> >        get_extended_indices (cpu_features);
> >
> >        update_active (cpu_features);
> > +
> > +      /* Benchmarks indicate non-temporal memset can be profitable on
> Hygon
> > +       hardware.  */
> > +
> cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
> > +           &= ~bit_arch_Avoid_Non_Temporal_Memset;
> >      }
> >    else
> >      {
> > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> > index 8f4fe98d88..e9579505a3 100644
> > --- a/sysdeps/x86/dl-cacheinfo.h
> > +++ b/sysdeps/x86/dl-cacheinfo.h
> > @@ -1071,7 +1071,7 @@ dl_init_cacheinfo (struct cpu_features
> > *cpu_features)
> >
> >    /* Non-temporal stores are more performant on some hardware above
> >       non_temporal_threshold. Currently Prefer_Non_Temporal is set for
> for both
> > -     Intel and AMD hardware. */
> > +     Intel, AMD and Hygon hardware. */
> >    unsigned long int memset_non_temporal_threshold = SIZE_MAX;
> >    if (!CPU_FEATURES_ARCH_P (cpu_features,
> Avoid_Non_Temporal_Memset))
> >      memset_non_temporal_threshold = non_temporal_threshold;
> > --
> > 2.43.0
> >
> 
> Don't you need to enable Avoid_STOSB to avoid "stosb"?

Thanks for the comments.

Noah's patch 'Add `Avoid_STOSB` tunable to allow NT memset without ERMS'
sets the 'Avoid_STOSB' bit, for all CPU vendors, when the CPU does not support the ERMS feature.

Hygon processors disable the ERMS feature by default, so we don't need to set this
bit in the Hygon branch.

> 
> --
> H.J.
H.J. Lu Aug. 20, 2024, 2 p.m. UTC | #3
On Mon, Aug 19, 2024 at 7:25 PM Feifei Wang <wangfeifei@hygon.cn> wrote:
>
>
>
> > -----邮件原件-----
> > 发件人: H.J. Lu <hjl.tools@gmail.com>
> > 发送时间: 2024年8月19日 21:14
> > 收件人: Feifei Wang <wangfeifei@hygon.cn>
> > 抄送: libc-alpha@sourceware.org; carlos@redhat.com; fw@deneb.enyo.de;
> > goldstein.w.n@gmail.com; adhemerval.zanella@linaro.org; Jing Li
> > <lijing@hygon.cn>
> > 主题: Re: [v2 3/3] x86: Enable non-temporal memset for Hygon processors
> >
> > On Sun, Aug 18, 2024 at 11:59 PM Feifei Wang <wangfeifei@hygon.cn> wrote:
> > >
> > > This patch uses 'Avoid_Non_Temporal_Memset' flag to access the
> > > non-temporal memset implementation for hygon processors.
> > >
> > > Test Results:
> > >
> > > hygon1 arch
> > > x86_memset_non_temporal_threshold = 8MB
> > > size                          new performance time / old performance
> > time
> > > 1MB                           0.994
> > > 4MB                           0.996
> > > 8MB                           0.670
> > > 16MB                          0.343
> > > 32MB                          0.355
> > >
> > > hygon2 arch
> > > x86_memset_non_temporal_threshold = 8MB
> > > size                          new performance time / old performance
> > time
> > > 1MB                           1
> > > 4MB                           1
> > > 8MB                           1.312
> > > 16MB                          0.822
> > > 32MB                          0.830
> > >
> > > hygon3 arch
> > > x86_memset_non_temporal_threshold = 8MB
> > > size                          new performance time / old performance
> > time
> > > 1MB                           1
> > > 4MB                           0.990
> > > 8MB                           0.737
> > > 16MB                          0.390
> > > 32MB                          0.401
> > >
> > > For hygon arch with this patch, non-temporal stores can improve
> > > performance by 20% - 65%.
> > >
> > > Signed-off-by: Feifei Wang <wangfeifei@hygon.cn>
> > > Reviewed-by: Jing Li <lijing@hygon.cn>
> > > ---
> > >  sysdeps/x86/cpu-features.c | 9 +++++++--  sysdeps/x86/dl-cacheinfo.h
> > > | 2 +-
> > >  2 files changed, 8 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> > > index e6139e2837..1f30e237f5 100644
> > > --- a/sysdeps/x86/cpu-features.c
> > > +++ b/sysdeps/x86/cpu-features.c
> > > @@ -756,9 +756,9 @@ init_cpu_features (struct cpu_features
> > *cpu_features)
> > >    unsigned int stepping = 0;
> > >    enum cpu_features_kind kind;
> > >
> > > -  /* Default is avoid non-temporal memset for non Intel/AMD hardware.
> > > This is,
> > > +  /* Default is avoid non-temporal memset for non Intel/AMD/Hygon
> > > + hardware. This is,
> > >       as of writing this, we only have benchmarks indicatings it profitability
> > > -     on Intel/AMD.  */
> > > +     on Intel/AMD/Hygon.  */
> > >    cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
> > >        |= bit_arch_Avoid_Non_Temporal_Memset;
> > >
> > > @@ -1116,6 +1116,11 @@
> > https://www.intel.com/content/www/us/en/support/articles/000059422/proce
> > ssors.ht
> > >        get_extended_indices (cpu_features);
> > >
> > >        update_active (cpu_features);
> > > +
> > > +      /* Benchmarks indicate non-temporal memset can be profitable on
> > Hygon
> > > +       hardware.  */
> > > +
> > cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
> > > +           &= ~bit_arch_Avoid_Non_Temporal_Memset;
> > >      }
> > >    else
> > >      {
> > > diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> > > index 8f4fe98d88..e9579505a3 100644
> > > --- a/sysdeps/x86/dl-cacheinfo.h
> > > +++ b/sysdeps/x86/dl-cacheinfo.h
> > > @@ -1071,7 +1071,7 @@ dl_init_cacheinfo (struct cpu_features
> > > *cpu_features)
> > >
> > >    /* Non-temporal stores are more performant on some hardware above
> > >       non_temporal_threshold. Currently Prefer_Non_Temporal is set for
> > for both
> > > -     Intel and AMD hardware. */
> > > +     Intel, AMD and Hygon hardware. */
> > >    unsigned long int memset_non_temporal_threshold = SIZE_MAX;
> > >    if (!CPU_FEATURES_ARCH_P (cpu_features,
> > Avoid_Non_Temporal_Memset))
> > >      memset_non_temporal_threshold = non_temporal_threshold;
> > > --
> > > 2.43.0
> > >
> >
> > Don't you need to enable Avoid_STOSB to avoid "stosb"?
>
> Thanks for the comments.
>
> Noah patch: 'Add `Avoid_STOSB` tunable to allow NT memset without ERMS'
> set 'Avoid_STOSB' bit when CPU can't support ERMs feature for all CPU vendors.
>
> Hygon processors disable ERMS feature as default. So we don't need to set this
> in Hygon branch.
>

Since Hygon processors clear Avoid_Non_Temporal_Memset, memset
with ERMS is used:

static inline int
prefer_erms_nt_impl (const struct cpu_features *cpu_features)
{
  return CPU_FEATURE_USABLE_P (cpu_features, ERMS)
|| !CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset);
}

You need to set Avoid_STOSB to avoid STOSB.
Feifei Wang Aug. 21, 2024, 5:41 a.m. UTC | #4
> -----邮件原件-----
> 发件人: H.J. Lu <hjl.tools@gmail.com>
> 发送时间: 2024年8月20日 22:01
> 收件人: Feifei Wang <wangfeifei@hygon.cn>
> 抄送: libc-alpha@sourceware.org; carlos@redhat.com; fw@deneb.enyo.de;
> goldstein.w.n@gmail.com; adhemerval.zanella@linaro.org; Jing Li
> <lijing@hygon.cn>
> 主题: Re: [v2 3/3] x86: Enable non-temporal memset for Hygon processors
> 
> On Mon, Aug 19, 2024 at 7:25 PM Feifei Wang <wangfeifei@hygon.cn> wrote:
> >
> >
> >
> > > -----邮件原件-----
> > > 发件人: H.J. Lu <hjl.tools@gmail.com>
> > > 发送时间: 2024年8月19日 21:14
> > > 收件人: Feifei Wang <wangfeifei@hygon.cn>
> > > 抄送: libc-alpha@sourceware.org; carlos@redhat.com;
> fw@deneb.enyo.de;
> > > goldstein.w.n@gmail.com; adhemerval.zanella@linaro.org; Jing Li
> > > <lijing@hygon.cn>
> > > 主题: Re: [v2 3/3] x86: Enable non-temporal memset for Hygon
> > > processors
> > >
> > > On Sun, Aug 18, 2024 at 11:59 PM Feifei Wang <wangfeifei@hygon.cn>
> wrote:
> > > >
> > > > This patch uses 'Avoid_Non_Temporal_Memset' flag to access the
> > > > non-temporal memset implementation for hygon processors.
> > > >
> > > > Test Results:
> > > >
> > > > hygon1 arch
> > > > x86_memset_non_temporal_threshold = 8MB
> > > > size                          new performance time / old
> performance
> > > time
> > > > 1MB                           0.994
> > > > 4MB                           0.996
> > > > 8MB                           0.670
> > > > 16MB                          0.343
> > > > 32MB                          0.355
> > > >
> > > > hygon2 arch
> > > > x86_memset_non_temporal_threshold = 8MB
> > > > size                          new performance time / old
> performance
> > > time
> > > > 1MB                           1
> > > > 4MB                           1
> > > > 8MB                           1.312
> > > > 16MB                          0.822
> > > > 32MB                          0.830
> > > >
> > > > hygon3 arch
> > > > x86_memset_non_temporal_threshold = 8MB
> > > > size                          new performance time / old
> performance
> > > time
> > > > 1MB                           1
> > > > 4MB                           0.990
> > > > 8MB                           0.737
> > > > 16MB                          0.390
> > > > 32MB                          0.401
> > > >
> > > > For hygon arch with this patch, non-temporal stores can improve
> > > > performance by 20% - 65%.
> > > >
> > > > Signed-off-by: Feifei Wang <wangfeifei@hygon.cn>
> > > > Reviewed-by: Jing Li <lijing@hygon.cn>
> > > > ---
> > > >  sysdeps/x86/cpu-features.c | 9 +++++++--
> > > > sysdeps/x86/dl-cacheinfo.h
> > > > | 2 +-
> > > >  2 files changed, 8 insertions(+), 3 deletions(-)
> > > >
> > > > diff --git a/sysdeps/x86/cpu-features.c
> > > > b/sysdeps/x86/cpu-features.c index e6139e2837..1f30e237f5 100644
> > > > --- a/sysdeps/x86/cpu-features.c
> > > > +++ b/sysdeps/x86/cpu-features.c
> > > > @@ -756,9 +756,9 @@ init_cpu_features (struct cpu_features
> > > *cpu_features)
> > > >    unsigned int stepping = 0;
> > > >    enum cpu_features_kind kind;
> > > >
> > > > -  /* Default is avoid non-temporal memset for non Intel/AMD hardware.
> > > > This is,
> > > > +  /* Default is avoid non-temporal memset for non Intel/AMD/Hygon
> > > > + hardware. This is,
> > > >       as of writing this, we only have benchmarks indicatings it
> profitability
> > > > -     on Intel/AMD.  */
> > > > +     on Intel/AMD/Hygon.  */
> > > >    cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
> > > >        |= bit_arch_Avoid_Non_Temporal_Memset;
> > > >
> > > > @@ -1116,6 +1116,11 @@
> > > https://www.intel.com/content/www/us/en/support/articles/000059422/p
> > > roce
> > > ssors.ht
> > > >        get_extended_indices (cpu_features);
> > > >
> > > >        update_active (cpu_features);
> > > > +
> > > > +      /* Benchmarks indicate non-temporal memset can be
> > > > + profitable on
> > > Hygon
> > > > +       hardware.  */
> > > > +
> > > cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
> > > > +           &= ~bit_arch_Avoid_Non_Temporal_Memset;
> > > >      }
> > > >    else
> > > >      {
> > > > diff --git a/sysdeps/x86/dl-cacheinfo.h
> > > > b/sysdeps/x86/dl-cacheinfo.h index 8f4fe98d88..e9579505a3 100644
> > > > --- a/sysdeps/x86/dl-cacheinfo.h
> > > > +++ b/sysdeps/x86/dl-cacheinfo.h
> > > > @@ -1071,7 +1071,7 @@ dl_init_cacheinfo (struct cpu_features
> > > > *cpu_features)
> > > >
> > > >    /* Non-temporal stores are more performant on some hardware
> above
> > > >       non_temporal_threshold. Currently Prefer_Non_Temporal is set
> > > > for
> > > for both
> > > > -     Intel and AMD hardware. */
> > > > +     Intel, AMD and Hygon hardware. */
> > > >    unsigned long int memset_non_temporal_threshold = SIZE_MAX;
> > > >    if (!CPU_FEATURES_ARCH_P (cpu_features,
> > > Avoid_Non_Temporal_Memset))
> > > >      memset_non_temporal_threshold = non_temporal_threshold;
> > > > --
> > > > 2.43.0
> > > >
> > >
> > > Don't you need to enable Avoid_STOSB to avoid "stosb"?
> >
> > Thanks for the comments.
> >
> > Noah patch: 'Add `Avoid_STOSB` tunable to allow NT memset without ERMS'
> > set 'Avoid_STOSB' bit when CPU can't support ERMs feature for all CPU
> vendors.
> >
> > Hygon processors disable ERMS feature as default. So we don't need to
> > set this in Hygon branch.
> >
> 
> Since Hygon processors clears Avoid_Non_Temporal_Memset, memset with
> ERMS is used:
> 
> static inline int
> prefer_erms_nt_impl (const struct cpu_features *cpu_features) {
>   return CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> || !CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset);
> }
> 
> You need to set Avoid_STOSB to avoid STOSB.

Sorry, maybe my explanation was not clear.

Currently, Hygon CPUs do not support the ERMS feature, so the 'init_cpu_features' function
will run the following code:

  /* No ERMS, we want to avoid stosb for memset.  */
  if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS))
    cpu_features->preferred[index_arch_Avoid_STOSB] |= bit_arch_Avoid_STOSB;

On Hygon CPUs, the condition !CPU_FEATURE_USABLE_P (cpu_features, ERMS) is true,
so the 'Avoid_STOSB' bit is set here.

Then dl_init_cacheinfo sets `x86_rep_stosb_threshold` =
`x86_memset_non_temporal_threshold`.

Furthermore, Hygon CPUs will call memset_avx2_unaligned_erms:
If size < rep_stosb_threshold, it runs temporal stores.
If size > rep_stosb_threshold, it then checks whether size > memset_non_temporal_threshold
and, if so, runs non-temporal stores.

Or am I missing something?
> 
> --
> H.J.
Noah Goldstein Aug. 21, 2024, 5:29 p.m. UTC | #5
On Tue, Aug 20, 2024 at 10:42 PM Feifei Wang <wangfeifei@hygon.cn> wrote:
>
>
>
> > -----邮件原件-----
> > 发件人: H.J. Lu <hjl.tools@gmail.com>
> > 发送时间: 2024年8月20日 22:01
> > 收件人: Feifei Wang <wangfeifei@hygon.cn>
> > 抄送: libc-alpha@sourceware.org; carlos@redhat.com; fw@deneb.enyo.de;
> > goldstein.w.n@gmail.com; adhemerval.zanella@linaro.org; Jing Li
> > <lijing@hygon.cn>
> > 主题: Re: [v2 3/3] x86: Enable non-temporal memset for Hygon processors
> >
> > On Mon, Aug 19, 2024 at 7:25 PM Feifei Wang <wangfeifei@hygon.cn> wrote:
> > >
> > >
> > >
> > > > -----邮件原件-----
> > > > 发件人: H.J. Lu <hjl.tools@gmail.com>
> > > > 发送时间: 2024年8月19日 21:14
> > > > 收件人: Feifei Wang <wangfeifei@hygon.cn>
> > > > 抄送: libc-alpha@sourceware.org; carlos@redhat.com;
> > fw@deneb.enyo.de;
> > > > goldstein.w.n@gmail.com; adhemerval.zanella@linaro.org; Jing Li
> > > > <lijing@hygon.cn>
> > > > 主题: Re: [v2 3/3] x86: Enable non-temporal memset for Hygon
> > > > processors
> > > >
> > > > On Sun, Aug 18, 2024 at 11:59 PM Feifei Wang <wangfeifei@hygon.cn>
> > wrote:
> > > > >
> > > > > This patch uses 'Avoid_Non_Temporal_Memset' flag to access the
> > > > > non-temporal memset implementation for hygon processors.
> > > > >
> > > > > Test Results:
> > > > >
> > > > > hygon1 arch
> > > > > x86_memset_non_temporal_threshold = 8MB
> > > > > size                          new performance time / old
> > performance
> > > > time
> > > > > 1MB                           0.994
> > > > > 4MB                           0.996
> > > > > 8MB                           0.670
> > > > > 16MB                          0.343
> > > > > 32MB                          0.355
> > > > >
> > > > > hygon2 arch
> > > > > x86_memset_non_temporal_threshold = 8MB
> > > > > size                          new performance time / old
> > performance
> > > > time
> > > > > 1MB                           1
> > > > > 4MB                           1
> > > > > 8MB                           1.312
> > > > > 16MB                          0.822
> > > > > 32MB                          0.830
> > > > >
> > > > > hygon3 arch
> > > > > x86_memset_non_temporal_threshold = 8MB
> > > > > size                          new performance time / old
> > performance
> > > > time
> > > > > 1MB                           1
> > > > > 4MB                           0.990
> > > > > 8MB                           0.737
> > > > > 16MB                          0.390
> > > > > 32MB                          0.401
> > > > >
> > > > > For hygon arch with this patch, non-temporal stores can improve
> > > > > performance by 20% - 65%.
> > > > >
> > > > > Signed-off-by: Feifei Wang <wangfeifei@hygon.cn>
> > > > > Reviewed-by: Jing Li <lijing@hygon.cn>
> > > > > ---
> > > > >  sysdeps/x86/cpu-features.c | 9 +++++++--
> > > > > sysdeps/x86/dl-cacheinfo.h
> > > > > | 2 +-
> > > > >  2 files changed, 8 insertions(+), 3 deletions(-)
> > > > >
> > > > > diff --git a/sysdeps/x86/cpu-features.c
> > > > > b/sysdeps/x86/cpu-features.c index e6139e2837..1f30e237f5 100644
> > > > > --- a/sysdeps/x86/cpu-features.c
> > > > > +++ b/sysdeps/x86/cpu-features.c
> > > > > @@ -756,9 +756,9 @@ init_cpu_features (struct cpu_features
> > > > *cpu_features)
> > > > >    unsigned int stepping = 0;
> > > > >    enum cpu_features_kind kind;
> > > > >
> > > > > -  /* Default is avoid non-temporal memset for non Intel/AMD hardware.
> > > > > This is,
> > > > > +  /* Default is avoid non-temporal memset for non Intel/AMD/Hygon
> > > > > + hardware. This is,
> > > > >       as of writing this, we only have benchmarks indicatings it
> > profitability
> > > > > -     on Intel/AMD.  */
> > > > > +     on Intel/AMD/Hygon.  */
> > > > >    cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
> > > > >        |= bit_arch_Avoid_Non_Temporal_Memset;
> > > > >
> > > > > @@ -1116,6 +1116,11 @@
> > > > https://www.intel.com/content/www/us/en/support/articles/000059422/p
> > > > roce
> > > > ssors.ht
> > > > >        get_extended_indices (cpu_features);
> > > > >
> > > > >        update_active (cpu_features);
> > > > > +
> > > > > +      /* Benchmarks indicate non-temporal memset can be
> > > > > + profitable on
> > > > Hygon
> > > > > +       hardware.  */
> > > > > +
> > > > cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
> > > > > +           &= ~bit_arch_Avoid_Non_Temporal_Memset;
> > > > >      }
> > > > >    else
> > > > >      {
> > > > > diff --git a/sysdeps/x86/dl-cacheinfo.h
> > > > > b/sysdeps/x86/dl-cacheinfo.h index 8f4fe98d88..e9579505a3 100644
> > > > > --- a/sysdeps/x86/dl-cacheinfo.h
> > > > > +++ b/sysdeps/x86/dl-cacheinfo.h
> > > > > @@ -1071,7 +1071,7 @@ dl_init_cacheinfo (struct cpu_features
> > > > > *cpu_features)
> > > > >
> > > > >    /* Non-temporal stores are more performant on some hardware
> > above
> > > > >       non_temporal_threshold. Currently Prefer_Non_Temporal is set
> > > > > for
> > > > for both
> > > > > -     Intel and AMD hardware. */
> > > > > +     Intel, AMD and Hygon hardware. */
> > > > >    unsigned long int memset_non_temporal_threshold = SIZE_MAX;
> > > > >    if (!CPU_FEATURES_ARCH_P (cpu_features,
> > > > Avoid_Non_Temporal_Memset))
> > > > >      memset_non_temporal_threshold = non_temporal_threshold;
> > > > > --
> > > > > 2.43.0
> > > > >
> > > >
> > > > Don't you need to enable Avoid_STOSB to avoid "stosb"?
> > >
> > > Thanks for the comments.
> > >
> > > Noah patch: 'Add `Avoid_STOSB` tunable to allow NT memset without ERMS'
> > > set 'Avoid_STOSB' bit when CPU can't support ERMs feature for all CPU
> > vendors.
> > >
> > > Hygon processors disable ERMS feature as default. So we don't need to
> > > set this in Hygon branch.
> > >
> >
> > Since Hygon processors clears Avoid_Non_Temporal_Memset, memset with
> > ERMS is used:
> >
> > static inline int
> > prefer_erms_nt_impl (const struct cpu_features *cpu_features) {
> >   return CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> > || !CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset);
> > }
> >
> > You need to set Avoid_STOSB to avoid STOSB.
>
> Sorry maybe my explanation is not clear.
>
> Currently, Hygon CPU can't support ERMS feature, so in 'init_cpu_features' function,
> It will run the following code:
>
>   /* No ERMS, we want to avoid stosb for memset.  */
>   if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS))
>     cpu_features->preferred[index_arch_Avoid_STOSB] |= bit_arch_Avoid_STOSB;
>
> In Hygon cpu, if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS)) is true.
>  So, here the 'Avoid_STOSB' bit is set.
>
> Then in dl_init_cacheinfo,  set `x86_rep_stosb_threshold` =
> `x86_memset_non_temporal_threshold`.
>
> Furthermore, for hygon cpu, it will call memset_avx2_unaligned_erms:
> If size < rep_stosb_threshold, run temporal stores.
> If size > rep_stosb_threshold, and then check size > memset_non_temporal_threshold
> run non-temporal stores.
>
> Or maybe if I ignore something?

That sounds correct to me, assuming that once Hygon starts supporting
ERMS you will want to use stosb.
> >
> > --
> > H.J.
Feifei Wang Aug. 22, 2024, 2:11 a.m. UTC | #6
> -----邮件原件-----
> 发件人: Noah Goldstein <goldstein.w.n@gmail.com>
> 发送时间: 2024年8月22日 1:30
> 收件人: Feifei Wang <wangfeifei@hygon.cn>
> 抄送: H.J. Lu <hjl.tools@gmail.com>; libc-alpha@sourceware.org;
> carlos@redhat.com; fw@deneb.enyo.de; adhemerval.zanella@linaro.org; Jing
> Li <lijing@hygon.cn>
> 主题: Re: [v2 3/3] x86: Enable non-temporal memset for Hygon processors
> 
> On Tue, Aug 20, 2024 at 10:42 PM Feifei Wang <wangfeifei@hygon.cn> wrote:
> >
> >
> >
> > > -----邮件原件-----
> > > 发件人: H.J. Lu <hjl.tools@gmail.com>
> > > 发送时间: 2024年8月20日 22:01
> > > 收件人: Feifei Wang <wangfeifei@hygon.cn>
> > > 抄送: libc-alpha@sourceware.org; carlos@redhat.com;
> fw@deneb.enyo.de;
> > > goldstein.w.n@gmail.com; adhemerval.zanella@linaro.org; Jing Li
> > > <lijing@hygon.cn>
> > > 主题: Re: [v2 3/3] x86: Enable non-temporal memset for Hygon
> > > processors
> > >
> > > On Mon, Aug 19, 2024 at 7:25 PM Feifei Wang <wangfeifei@hygon.cn>
> wrote:
> > > >
> > > >
> > > >
> > > > > -----邮件原件-----
> > > > > 发件人: H.J. Lu <hjl.tools@gmail.com>
> > > > > 发送时间: 2024年8月19日 21:14
> > > > > 收件人: Feifei Wang <wangfeifei@hygon.cn>
> > > > > 抄送: libc-alpha@sourceware.org; carlos@redhat.com;
> > > fw@deneb.enyo.de;
> > > > > goldstein.w.n@gmail.com; adhemerval.zanella@linaro.org; Jing Li
> > > > > <lijing@hygon.cn>
> > > > > 主题: Re: [v2 3/3] x86: Enable non-temporal memset for Hygon
> > > > > processors
> > > > >
> > > > > On Sun, Aug 18, 2024 at 11:59 PM Feifei Wang
> > > > > <wangfeifei@hygon.cn>
> > > wrote:
> > > > > >
> > > > > > This patch uses 'Avoid_Non_Temporal_Memset' flag to access the
> > > > > > non-temporal memset implementation for hygon processors.
> > > > > >
> > > > > > Test Results:
> > > > > >
> > > > > > hygon1 arch
> > > > > > x86_memset_non_temporal_threshold = 8MB
> > > > > > size                          new performance time / old
> > > performance
> > > > > time
> > > > > > 1MB                           0.994
> > > > > > 4MB                           0.996
> > > > > > 8MB                           0.670
> > > > > > 16MB                          0.343
> > > > > > 32MB                          0.355
> > > > > >
> > > > > > hygon2 arch
> > > > > > x86_memset_non_temporal_threshold = 8MB
> > > > > > size                          new performance time / old
> > > performance
> > > > > time
> > > > > > 1MB                           1
> > > > > > 4MB                           1
> > > > > > 8MB                           1.312
> > > > > > 16MB                          0.822
> > > > > > 32MB                          0.830
> > > > > >
> > > > > > hygon3 arch
> > > > > > x86_memset_non_temporal_threshold = 8MB
> > > > > > size                          new performance time / old
> > > performance
> > > > > time
> > > > > > 1MB                           1
> > > > > > 4MB                           0.990
> > > > > > 8MB                           0.737
> > > > > > 16MB                          0.390
> > > > > > 32MB                          0.401
> > > > > >
> > > > > > For hygon arch with this patch, non-temporal stores can
> > > > > > improve performance by 20% - 65%.
> > > > > >
> > > > > > Signed-off-by: Feifei Wang <wangfeifei@hygon.cn>
> > > > > > Reviewed-by: Jing Li <lijing@hygon.cn>
> > > > > > ---
> > > > > >  sysdeps/x86/cpu-features.c | 9 +++++++--
> > > > > > sysdeps/x86/dl-cacheinfo.h
> > > > > > | 2 +-
> > > > > >  2 files changed, 8 insertions(+), 3 deletions(-)
> > > > > >
> > > > > > diff --git a/sysdeps/x86/cpu-features.c
> > > > > > b/sysdeps/x86/cpu-features.c index e6139e2837..1f30e237f5
> > > > > > 100644
> > > > > > --- a/sysdeps/x86/cpu-features.c
> > > > > > +++ b/sysdeps/x86/cpu-features.c
> > > > > > @@ -756,9 +756,9 @@ init_cpu_features (struct cpu_features
> > > > > *cpu_features)
> > > > > >    unsigned int stepping = 0;
> > > > > >    enum cpu_features_kind kind;
> > > > > >
> > > > > > -  /* Default is avoid non-temporal memset for non Intel/AMD
> hardware.
> > > > > > This is,
> > > > > > +  /* Default is avoid non-temporal memset for non
> > > > > > + Intel/AMD/Hygon hardware. This is,
> > > > > >       as of writing this, we only have benchmarks indicatings
> > > > > > it
> > > profitability
> > > > > > -     on Intel/AMD.  */
> > > > > > +     on Intel/AMD/Hygon.  */
> > > > > >
> cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
> > > > > >        |= bit_arch_Avoid_Non_Temporal_Memset;
> > > > > >
> > > > > > @@ -1116,6 +1116,11 @@
> > > > > https://www.intel.com/content/www/us/en/support/articles/0000594
> > > > > 22/p
> > > > > roce
> > > > > ssors.ht
> > > > > >        get_extended_indices (cpu_features);
> > > > > >
> > > > > >        update_active (cpu_features);
> > > > > > +
> > > > > > +      /* Benchmarks indicate non-temporal memset can be
> > > > > > + profitable on
> > > > > Hygon
> > > > > > +       hardware.  */
> > > > > > +
> > > > > cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
> > > > > > +           &= ~bit_arch_Avoid_Non_Temporal_Memset;
> > > > > >      }
> > > > > >    else
> > > > > >      {
> > > > > > diff --git a/sysdeps/x86/dl-cacheinfo.h
> > > > > > b/sysdeps/x86/dl-cacheinfo.h index 8f4fe98d88..e9579505a3
> > > > > > 100644
> > > > > > --- a/sysdeps/x86/dl-cacheinfo.h
> > > > > > +++ b/sysdeps/x86/dl-cacheinfo.h
> > > > > > @@ -1071,7 +1071,7 @@ dl_init_cacheinfo (struct cpu_features
> > > > > > *cpu_features)
> > > > > >
> > > > > >    /* Non-temporal stores are more performant on some hardware
> > > above
> > > > > >       non_temporal_threshold. Currently Prefer_Non_Temporal is
> > > > > > set for
> > > > > for both
> > > > > > -     Intel and AMD hardware. */
> > > > > > +     Intel, AMD and Hygon hardware. */
> > > > > >    unsigned long int memset_non_temporal_threshold = SIZE_MAX;
> > > > > >    if (!CPU_FEATURES_ARCH_P (cpu_features,
> > > > > Avoid_Non_Temporal_Memset))
> > > > > >      memset_non_temporal_threshold = non_temporal_threshold;
> > > > > > --
> > > > > > 2.43.0
> > > > > >
> > > > >
> > > > > Don't you need to enable Avoid_STOSB to avoid "stosb"?
> > > >
> > > > Thanks for the comments.
> > > >
> > > > Noah patch: 'Add `Avoid_STOSB` tunable to allow NT memset without
> ERMS'
> > > > set 'Avoid_STOSB' bit when CPU can't support ERMs feature for all
> > > > CPU
> > > vendors.
> > > >
> > > > Hygon processors disable ERMS feature as default. So we don't need
> > > > to set this in Hygon branch.
> > > >
> > >
> > > Since Hygon processors clears Avoid_Non_Temporal_Memset, memset with
> > > ERMS is used:
> > >
> > > static inline int
> > > prefer_erms_nt_impl (const struct cpu_features *cpu_features) {
> > >   return CPU_FEATURE_USABLE_P (cpu_features, ERMS)
> > > || !CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset);
> > > }
> > >
> > > You need to set Avoid_STOSB to avoid STOSB.
> >
> > Sorry maybe my explanation is not clear.
> >
> > Currently, Hygon CPU can't support ERMS feature, so in
> > 'init_cpu_features' function, It will run the following code:
> >
> >   /* No ERMS, we want to avoid stosb for memset.  */
> >   if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> >     cpu_features->preferred[index_arch_Avoid_STOSB] |=
> > bit_arch_Avoid_STOSB;
> >
> > In Hygon cpu, if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS)) is true.
> >  So, here the 'Avoid_STOSB' bit is set.
> >
> > Then in dl_init_cacheinfo,  set `x86_rep_stosb_threshold` =
> > `x86_memset_non_temporal_threshold`.
> >
> > Furthermore, for hygon cpu, it will call memset_avx2_unaligned_erms:
> > If size < rep_stosb_threshold, run temporal stores.
> > If size > rep_stosb_threshold, and then check size >
> > memset_non_temporal_threshold run non-temporal stores.
> >
> > Or maybe if I ignore something?
> 
> That sounds correct to me, assuming that once Hygon starts supporting ERMS
> you will want to use stosb.

That's right, and I agree with this. Currently, all Hygon processors disable the ERMS feature
because ERMS performance is worse than temporal stores.

In the future, we will fix this performance problem and enable ERMS, and then we will add
stosb for Hygon.
> > >
> > > --
> > > H.J.
H.J. Lu Aug. 24, 2024, 8:32 p.m. UTC | #7
On Sun, Aug 18, 2024 at 11:59 PM Feifei Wang <wangfeifei@hygon.cn> wrote:
>
> This patch uses 'Avoid_Non_Temporal_Memset' flag to access
> the non-temporal memset implementation for hygon processors.
>
> Test Results:
>
> hygon1 arch
> x86_memset_non_temporal_threshold = 8MB
> size                          new performance time / old performance time
> 1MB                           0.994
> 4MB                           0.996
> 8MB                           0.670
> 16MB                          0.343
> 32MB                          0.355
>
> hygon2 arch
> x86_memset_non_temporal_threshold = 8MB
> size                          new performance time / old performance time
> 1MB                           1
> 4MB                           1
> 8MB                           1.312
> 16MB                          0.822
> 32MB                          0.830
>
> hygon3 arch
> x86_memset_non_temporal_threshold = 8MB
> size                          new performance time / old performance time
> 1MB                           1
> 4MB                           0.990
> 8MB                           0.737
> 16MB                          0.390
> 32MB                          0.401
>
> For hygon arch with this patch, non-temporal stores can improve
> performance by 20% - 65%.
>
> Signed-off-by: Feifei Wang <wangfeifei@hygon.cn>
> Reviewed-by: Jing Li <lijing@hygon.cn>
> ---
>  sysdeps/x86/cpu-features.c | 9 +++++++--
>  sysdeps/x86/dl-cacheinfo.h | 2 +-
>  2 files changed, 8 insertions(+), 3 deletions(-)
>
> diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> index e6139e2837..1f30e237f5 100644
> --- a/sysdeps/x86/cpu-features.c
> +++ b/sysdeps/x86/cpu-features.c
> @@ -756,9 +756,9 @@ init_cpu_features (struct cpu_features *cpu_features)
>    unsigned int stepping = 0;
>    enum cpu_features_kind kind;
>
> -  /* Default is avoid non-temporal memset for non Intel/AMD hardware. This is,
> +  /* Default is avoid non-temporal memset for non Intel/AMD/Hygon hardware. This is,
>       as of writing this, we only have benchmarks indicatings it profitability
> -     on Intel/AMD.  */
> +     on Intel/AMD/Hygon.  */
>    cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
>        |= bit_arch_Avoid_Non_Temporal_Memset;
>
> @@ -1116,6 +1116,11 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
>        get_extended_indices (cpu_features);
>
>        update_active (cpu_features);
> +
> +      /* Benchmarks indicate non-temporal memset can be profitable on Hygon
> +       hardware.  */
> +      cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
> +           &= ~bit_arch_Avoid_Non_Temporal_Memset;
>      }
>    else
>      {
> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> index 8f4fe98d88..e9579505a3 100644
> --- a/sysdeps/x86/dl-cacheinfo.h
> +++ b/sysdeps/x86/dl-cacheinfo.h
> @@ -1071,7 +1071,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>
>    /* Non-temporal stores are more performant on some hardware above
>       non_temporal_threshold. Currently Prefer_Non_Temporal is set for for both
> -     Intel and AMD hardware. */
> +     Intel, AMD and Hygon hardware. */
>    unsigned long int memset_non_temporal_threshold = SIZE_MAX;
>    if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
>      memset_non_temporal_threshold = non_temporal_threshold;
> --
> 2.43.0
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.
diff mbox series

Patch

diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index e6139e2837..1f30e237f5 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -756,9 +756,9 @@  init_cpu_features (struct cpu_features *cpu_features)
   unsigned int stepping = 0;
   enum cpu_features_kind kind;
 
-  /* Default is avoid non-temporal memset for non Intel/AMD hardware. This is,
+  /* Default is avoid non-temporal memset for non Intel/AMD/Hygon hardware. This is,
      as of writing this, we only have benchmarks indicatings it profitability
-     on Intel/AMD.  */
+     on Intel/AMD/Hygon.  */
   cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
       |= bit_arch_Avoid_Non_Temporal_Memset;
 
@@ -1116,6 +1116,11 @@  https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
       get_extended_indices (cpu_features);
 
       update_active (cpu_features);
+
+      /* Benchmarks indicate non-temporal memset can be profitable on Hygon
+       hardware.  */
+      cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+	    &= ~bit_arch_Avoid_Non_Temporal_Memset;
     }
   else
     {
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 8f4fe98d88..e9579505a3 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -1071,7 +1071,7 @@  dl_init_cacheinfo (struct cpu_features *cpu_features)
 
   /* Non-temporal stores are more performant on some hardware above
      non_temporal_threshold. Currently Prefer_Non_Temporal is set for for both
-     Intel and AMD hardware. */
+     Intel, AMD and Hygon hardware. */
   unsigned long int memset_non_temporal_threshold = SIZE_MAX;
   if (!CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset))
     memset_non_temporal_threshold = non_temporal_threshold;