Message ID | 20211106173706.3125357-5-goldstein.w.n@gmail.com |
---|---|
State | New |
Series | [v3,1/5] string: Make tests birdirectional test-memcpy.c |
On Sat, Nov 6, 2021 at 10:39 AM Noah Goldstein via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> No bug.
>
> This patch doubles the rep_movsb_threshold when using ERMS. Based on
> benchmarks the vector copy loop, especially now that it handles 4k
> aliasing, is better for these medium ranged copies.
>
> On Skylake with ERMS:
>
> Size, Align1, Align2, dst>src, (rep movsb) / (vec copy)
> 4096, 0, 0, 0, 0.975
> 4096, 0, 0, 1, 0.953
> 4096, 12, 0, 0, 0.969
> 4096, 12, 0, 1, 0.872
> 4096, 44, 0, 0, 0.979
> 4096, 44, 0, 1, 0.83
> 4096, 0, 12, 0, 1.006
> 4096, 0, 12, 1, 0.989
> 4096, 0, 44, 0, 0.739
> 4096, 0, 44, 1, 0.942
> 4096, 12, 12, 0, 1.009
> 4096, 12, 12, 1, 0.973
> 4096, 44, 44, 0, 0.791
> 4096, 44, 44, 1, 0.961
> 4096, 2048, 0, 0, 0.978
> 4096, 2048, 0, 1, 0.951
> 4096, 2060, 0, 0, 0.986
> 4096, 2060, 0, 1, 0.963
> 4096, 2048, 12, 0, 0.971
> 4096, 2048, 12, 1, 0.941
> 4096, 2060, 12, 0, 0.977
> 4096, 2060, 12, 1, 0.949
> 8192, 0, 0, 0, 0.85
> 8192, 0, 0, 1, 0.845
> 8192, 13, 0, 0, 0.937
> 8192, 13, 0, 1, 0.939
> 8192, 45, 0, 0, 0.932
> 8192, 45, 0, 1, 0.927
> 8192, 0, 13, 0, 0.621
> 8192, 0, 13, 1, 0.62
> 8192, 0, 45, 0, 0.53
> 8192, 0, 45, 1, 0.516
> 8192, 13, 13, 0, 0.664
> 8192, 13, 13, 1, 0.659
> 8192, 45, 45, 0, 0.593
> 8192, 45, 45, 1, 0.575
> 8192, 2048, 0, 0, 0.854
> 8192, 2048, 0, 1, 0.834
> 8192, 2061, 0, 0, 0.863
> 8192, 2061, 0, 1, 0.857
> 8192, 2048, 13, 0, 0.63
> 8192, 2048, 13, 1, 0.629
> 8192, 2061, 13, 0, 0.627
> 8192, 2061, 13, 1, 0.62
> ---
>  sysdeps/x86/dl-cacheinfo.h | 8 ++++----
>  1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> index e6c94dfd02..ceb3b53828 100644
> --- a/sysdeps/x86/dl-cacheinfo.h
> +++ b/sysdeps/x86/dl-cacheinfo.h
> @@ -866,12 +866,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>    /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
>    unsigned int minimum_rep_movsb_threshold;
>  #endif
> -  /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
> +  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16).  */
>    unsigned int rep_movsb_threshold;
>    if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
>        && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
>      {
> -      rep_movsb_threshold = 2048 * (64 / 16);
> +      rep_movsb_threshold = 4096 * (64 / 16);
>  #if HAVE_TUNABLES
>        minimum_rep_movsb_threshold = 64 * 8;
>  #endif
> @@ -879,14 +879,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>    else if (CPU_FEATURE_PREFERRED_P (cpu_features,
>                                      AVX_Fast_Unaligned_Load))
>      {
> -      rep_movsb_threshold = 2048 * (32 / 16);
> +      rep_movsb_threshold = 4096 * (32 / 16);
>  #if HAVE_TUNABLES
>        minimum_rep_movsb_threshold = 32 * 8;
>  #endif
>      }
>    else
>      {
> -      rep_movsb_threshold = 2048 * (16 / 16);
> +      rep_movsb_threshold = 4096 * (16 / 16);
>  #if HAVE_TUNABLES
>        minimum_rep_movsb_threshold = 16 * 8;
>  #endif
> --
> 2.25.1
>

You need to update comments for x86_rep_movsb_threshold
in sysdeps/x86/dl-tunables.list
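[Editor's note: for reference, the arithmetic in the hunks above works out as follows. This is a standalone sketch that only recomputes the values the patch writes into dl-cacheinfo.h; it is not code from glibc, and the loop and variable names are illustrative.]

```c
#include <stdio.h>

/* Recompute the default REP MOVSB thresholds before and after the patch
   for each vector width glibc may select (SSE2 = 16, AVX = 32,
   AVX-512 = 64 bytes), plus the VEC_SIZE * 8 minimum mentioned in the
   comment in dl-cacheinfo.h.  */
int
main (void)
{
  const unsigned int vec_sizes[] = { 16, 32, 64 };
  for (unsigned int i = 0; i < 3; i++)
    {
      unsigned int vec = vec_sizes[i];
      unsigned int old_threshold = 2048 * (vec / 16); /* before the patch */
      unsigned int new_threshold = 4096 * (vec / 16); /* after the patch  */
      unsigned int minimum = vec * 8;
      printf ("VEC_SIZE %2u: old %5u, new %6u, minimum %4u\n",
              vec, old_threshold, new_threshold, minimum);
    }
  /* Prints: 16 -> 2048/4096, 32 -> 4096/8192, 64 -> 8192/16384.  */
  return 0;
}
```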
On Sat, Nov 6, 2021 at 12:57 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Sat, Nov 6, 2021 at 10:39 AM Noah Goldstein via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > [...]
>
> You need to update comments for x86_rep_movsb_threshold
> in sysdeps/x86/dl-tunables.list

Can do.

Noticing that the original values were based on comparisons with SSE2 likely on
SnB or IVB. I don't have any indication that the 2048 value is not optimal for
those processors. Should we keep 2048 / (VEC_SIZE / 16) for SSE2?

>
> --
> H.J.
On Sat, Nov 6, 2021 at 11:12 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Sat, Nov 6, 2021 at 12:57 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > [...]
> >
> > You need to update comments for x86_rep_movsb_threshold
> > in sysdeps/x86/dl-tunables.list
>
> Can do.
>
> Noticing that the original values were based on comparisons with SSE2 likely on
> SnB or IVB. I don't have any indication that the 2048 value is not optimal for
> those processors. Should we keep 2048 / (VEC_SIZE / 16) for SSE2?

Good idea. So change the threshold to 2048 * (VEC_SIZE / 16) *
(VEC_SIZE / 16)?
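[Editor's note: a quick worked example of the formula H.J. proposes. Squaring the (VEC_SIZE / 16) factor keeps the SSE2 default at 2048 while scaling the wider-vector defaults more aggressively. This is a standalone sketch of the arithmetic only, not glibc code.]

```c
#include <stdio.h>

/* Proposed default: 2048 * (VEC_SIZE / 16) * (VEC_SIZE / 16).
   SSE2 (VEC_SIZE == 16) keeps the old 2048 default; wider vectors
   scale quadratically instead of linearly.  */
int
main (void)
{
  const unsigned int vec_sizes[] = { 16, 32, 64 };
  for (unsigned int i = 0; i < 3; i++)
    {
      unsigned int scale = vec_sizes[i] / 16;
      printf ("VEC_SIZE %2u: threshold %u\n",
              vec_sizes[i], 2048 * scale * scale);
    }
  /* Prints: 16 -> 2048, 32 -> 8192, 64 -> 32768.  */
  return 0;
}
```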
On Sat, Nov 6, 2021 at 1:21 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Sat, Nov 6, 2021 at 11:12 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > [...]
> >
> > Can do.
> >
> > Noticing that the original values were based on comparisons with SSE2 likely on
> > SnB or IVB. I don't have any indication that the 2048 value is not optimal for
> > those processors. Should we keep 2048 / (VEC_SIZE / 16) for SSE2?
>
> Good idea. So change the threshold to 2048 * (VEC_SIZE / 16) *
> (VEC_SIZE / 16)?

Done and updated the comments explaining the thresholds.

>
> --
> H.J.
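[Editor's note: the ratios in the table above come from glibc's memcpy benchmark suite. As a rough way to reproduce the same kind of comparison outside that harness, the sketch below times an inline `rep movsb` against the library `memcpy`, which takes the vector-copy path for sizes below the configured threshold. It is an assumption-laden micro-benchmark for x86-64 only, not the bench-memcpy infrastructure used for the numbers in the thread, and the chosen sizes, offsets, and iteration count are illustrative.]

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/* Copy with REP MOVSB unconditionally, ignoring glibc's threshold.  */
static void
rep_movsb_copy (void *dst, const void *src, size_t n)
{
  asm volatile ("rep movsb"
                : "+D" (dst), "+S" (src), "+c" (n)
                :
                : "memory");
}

static void
memcpy_copy (void *dst, const void *src, size_t n)
{
  memcpy (dst, src, n);
}

/* Time ITERS back-to-back copies and return elapsed nanoseconds.  */
static double
time_copies (void (*copy) (void *, const void *, size_t),
             void *dst, const void *src, size_t n, int iters)
{
  struct timespec start, end;
  clock_gettime (CLOCK_MONOTONIC, &start);
  for (int i = 0; i < iters; i++)
    copy (dst, src, n);
  clock_gettime (CLOCK_MONOTONIC, &end);
  return (end.tv_sec - start.tv_sec) * 1e9 + (end.tv_nsec - start.tv_nsec);
}

int
main (void)
{
  enum { ITERS = 100000 };
  /* Misalign the source the way the table's Align2 column does,
     e.g. dst offset 0, src offset 13.  */
  char *buf = aligned_alloc (4096, 3 * 8192);
  memset (buf, 1, 3 * 8192);
  const size_t sizes[] = { 4096, 8192 };
  for (int i = 0; i < 2; i++)
    {
      size_t n = sizes[i];
      char *dst = buf;
      char *src = buf + 8192 + 13;
      double movsb = time_copies (rep_movsb_copy, dst, src, n, ITERS);
      double vec = time_copies (memcpy_copy, dst, src, n, ITERS);
      printf ("size %5zu: (rep movsb) / (memcpy) = %.3f\n", n, movsb / vec);
    }
  free (buf);
  return 0;
}
```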
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index e6c94dfd02..ceb3b53828 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -866,12 +866,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
   unsigned int minimum_rep_movsb_threshold;
 #endif
-  /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
+  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16).  */
   unsigned int rep_movsb_threshold;
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
       && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
     {
-      rep_movsb_threshold = 2048 * (64 / 16);
+      rep_movsb_threshold = 4096 * (64 / 16);
 #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 64 * 8;
 #endif
@@ -879,14 +879,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   else if (CPU_FEATURE_PREFERRED_P (cpu_features,
                                     AVX_Fast_Unaligned_Load))
     {
-      rep_movsb_threshold = 2048 * (32 / 16);
+      rep_movsb_threshold = 4096 * (32 / 16);
 #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 32 * 8;
 #endif
     }
   else
     {
-      rep_movsb_threshold = 2048 * (16 / 16);
+      rep_movsb_threshold = 4096 * (16 / 16);
 #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 16 * 8;
 #endif
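[Editor's note: for context on what this threshold controls, here is a simplified C model of the size-based dispatch in glibc's ERMS memmove/memcpy. The real logic is assembly in memmove-vec-unaligned-erms.S and also accounts for overlap, copy direction, and CPU-specific preferences; the names and placeholder values below are illustrative only. The threshold itself can also be overridden at run time with the glibc.cpu.x86_rep_movsb_threshold tunable.]

```c
#include <stddef.h>

/* Simplified model of how the thresholds computed in dl_init_cacheinfo
   are consumed.  Placeholder values; glibc fills in the real ones at
   startup based on the detected vector width and cache sizes.  */
static size_t rep_movsb_threshold = 4096 * (32 / 16);   /* e.g. AVX default */
static size_t non_temporal_threshold = 3 * 1024 * 1024; /* placeholder      */

enum copy_strategy { VEC_COPY_LOOP, REP_MOVSB, NON_TEMPORAL_STORES };

/* Pick a copy strategy for a non-overlapping forward copy of N bytes.
   Raising rep_movsb_threshold is what keeps the 4K-8K copies from the
   benchmark table on the vector-copy path.  */
static enum copy_strategy
choose_strategy (size_t n)
{
  if (n < rep_movsb_threshold)
    return VEC_COPY_LOOP;
  if (n < non_temporal_threshold)
    return REP_MOVSB;
  return NON_TEMPORAL_STORES;
}
```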