Message ID | 20150130145656.GA26219@gmail.com
---|---
State | New
On Fri, Jan 30, 2015 at 6:56 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Tue, Jan 06, 2015 at 06:54:50AM -0800, H.J. Lu wrote:
>> On Tue, Jan 6, 2015 at 6:29 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
>> > H. J., in this commit a performance regression slipped through review.
>> >
>> > commit 05f3633da4f9df870d04dd77336e793746e57ed4
>> > Author: Ling Ma <ling.ml@alibaba-inc.com>
>> > Date:   Mon Jul 14 00:02:52 2014 -0400
>> >
>> >     Improve 64bit memcpy performance for Haswell CPU with AVX
>> >     instruction
>> >
>> > I seem to recall that I mentioned something about avx being a typo
>> > that should be avx2, but did not look into it further.
>> >
>> > As I assumed it was avx2-only, I was ok with that and with Haswell-specific
>> > optimizations like using rep movsq.  However, the ifunc checks for avx,
>> > which is bad, as we already know that avx loads/stores are slow on Sandy
>> > Bridge.
>> >
>> > Testing on the affected architectures would also reveal that, especially
>> > AMD Bulldozer, where it is five times slower on the 2kb-16kb range because
>> > movsb is slow; see
>> > http://kam.mff.cuni.cz/~ondra/benchmark_string/fx10/memcpy_profile_avx/results_rand/result.html
>> >
>> > On Sandy Bridge it is only a 20% regression on the same range:
>> > http://kam.mff.cuni.cz/~ondra/benchmark_string/i7_ivy_bridge/memcpy_profile_avx/results_rand/result.html
>> >
>> > The avx loop for 128-2024 bytes is also slower there, so there is no point
>> > in using it.
>> >
>> > What about the following change?
>> >
>> >     * sysdeps/x86_64/multiarch/memcpy.S: Fix performance regression.
>> >
>> > diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
>> > index 992e40d..27f89e4 100644
>> > --- a/sysdeps/x86_64/multiarch/memcpy.S
>> > +++ b/sysdeps/x86_64/multiarch/memcpy.S
>> > @@ -32,10 +32,13 @@ ENTRY(__new_memcpy)
>> >  	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
>> >  	jne	1f
>> >  	call	__init_cpu_features
>> > +#ifdef HAVE_AVX2_SUPPORT
>> >  1:	leaq	__memcpy_avx_unaligned(%rip), %rax
>> > -	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
>> > +	testl	$bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
>> > +
>> >  	jz	1f
>> >  	ret
>> > +#endif
>> >  1:	leaq	__memcpy_sse2(%rip), %rax
>> >  	testl	$bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
>> >  	jnz	2f
>>
>> Please add a new feature bit, bit_Fast_AVX_Unaligned_Load, and turn it
>> on together with bit_AVX2_Usable.
>>
>
> I know we are in freeze.  But I'd like to fix this regression in 2.21.
> OK for master?

Since this is a serious performance regression, I will check it in
before the end of the day unless I am told otherwise.

> Thanks.
>
> H.J.
> ---
> From 56d25c11b64a97255a115901d136d753c86de24e Mon Sep 17 00:00:00 2001
> From: "H.J. Lu" <hjl.tools@gmail.com>
> Date: Fri, 30 Jan 2015 06:50:20 -0800
> Subject: [PATCH] Use AVX unaligned memcpy only if AVX2 is available
>
> memcpy with unaligned 256-bit AVX register loads/stores is slow on older
> processors like Sandy Bridge.  This patch adds bit_AVX_Fast_Unaligned_Load
> and sets it only when AVX2 is available.
>
> 	[BZ #17801]
> 	* sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features):
> 	Set the bit_AVX_Fast_Unaligned_Load bit for AVX2.
> 	* sysdeps/x86_64/multiarch/init-arch.h (bit_AVX_Fast_Unaligned_Load):
> 	New.
> 	(index_AVX_Fast_Unaligned_Load): Likewise.
> 	(HAS_AVX_FAST_UNALIGNED_LOAD): Likewise.
> 	* sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check the
> 	bit_AVX_Fast_Unaligned_Load bit instead of the bit_AVX_Usable bit.
> 	* sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Likewise.
> 	* sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Likewise.
> 	* sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Likewise.
> 	* sysdeps/x86_64/multiarch/memmove.c (__libc_memmove): Replace
> 	HAS_AVX with HAS_AVX_FAST_UNALIGNED_LOAD.
> 	* sysdeps/x86_64/multiarch/memmove_chk.c (__memmove_chk): Likewise.
> ---
>  ChangeLog                              | 18 ++++++++++++++++++
>  sysdeps/x86_64/multiarch/init-arch.c   |  9 +++++++--
>  sysdeps/x86_64/multiarch/init-arch.h   |  4 ++++
>  sysdeps/x86_64/multiarch/memcpy.S      |  2 +-
>  sysdeps/x86_64/multiarch/memcpy_chk.S  |  2 +-
>  sysdeps/x86_64/multiarch/memmove.c     |  2 +-
>  sysdeps/x86_64/multiarch/memmove_chk.c |  2 +-
>  sysdeps/x86_64/multiarch/mempcpy.S     |  2 +-
>  sysdeps/x86_64/multiarch/mempcpy_chk.S |  2 +-
>  9 files changed, 35 insertions(+), 8 deletions(-)
>
> diff --git a/ChangeLog b/ChangeLog
> index 26f7f3f..a696e39 100644
> --- a/ChangeLog
> +++ b/ChangeLog
> @@ -1,3 +1,21 @@
> +2015-01-30  H.J. Lu  <hongjiu.lu@intel.com>
> +
> +	[BZ #17801]
> +	* sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features):
> +	Set the bit_AVX_Fast_Unaligned_Load bit for AVX2.
> +	* sysdeps/x86_64/multiarch/init-arch.h (bit_AVX_Fast_Unaligned_Load):
> +	New.
> +	(index_AVX_Fast_Unaligned_Load): Likewise.
> +	(HAS_AVX_FAST_UNALIGNED_LOAD): Likewise.
> +	* sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check the
> +	bit_AVX_Fast_Unaligned_Load bit instead of the bit_AVX_Usable bit.
> +	* sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Likewise.
> +	* sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Likewise.
> +	* sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Likewise.
> +	* sysdeps/x86_64/multiarch/memmove.c (__libc_memmove): Replace
> +	HAS_AVX with HAS_AVX_FAST_UNALIGNED_LOAD.
> +	* sysdeps/x86_64/multiarch/memmove_chk.c (__memmove_chk): Likewise.
> +
>  2015-01-29  Andreas Schwab  <schwab@suse.de>
>
>  	* sysdeps/nptl/allocrtsig.c: Include <signal.h>.
> diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c
> index 9299360..7dec218 100644
> --- a/sysdeps/x86_64/multiarch/init-arch.c
> +++ b/sysdeps/x86_64/multiarch/init-arch.c
> @@ -171,9 +171,14 @@ __init_cpu_features (void)
>        /* Determine if AVX is usable.  */
>        if (CPUID_AVX)
>  	__cpu_features.feature[index_AVX_Usable] |= bit_AVX_Usable;
> -      /* Determine if AVX2 is usable.  */
> +#if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
> +# error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
> +#endif
> +      /* Determine if AVX2 is usable.  Unaligned load with 256-bit
> +	 AVX registers are faster on processors with AVX2.  */
>        if (CPUID_AVX2)
> -	__cpu_features.feature[index_AVX2_Usable] |= bit_AVX2_Usable;
> +	__cpu_features.feature[index_AVX2_Usable]
> +	  |= bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load;
>        /* Determine if FMA is usable.  */
>        if (CPUID_FMA)
>  	__cpu_features.feature[index_FMA_Usable] |= bit_FMA_Usable;
> diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h
> index 55f1c5b..e6b5ba5 100644
> --- a/sysdeps/x86_64/multiarch/init-arch.h
> +++ b/sysdeps/x86_64/multiarch/init-arch.h
> @@ -25,6 +25,7 @@
>  #define bit_FMA4_Usable			(1 << 8)
>  #define bit_Slow_SSE4_2			(1 << 9)
>  #define bit_AVX2_Usable			(1 << 10)
> +#define bit_AVX_Fast_Unaligned_Load	(1 << 11)
>
>  /* CPUID Feature flags.  */
>
> @@ -74,6 +75,7 @@
>  # define index_FMA4_Usable		FEATURE_INDEX_1*FEATURE_SIZE
>  # define index_Slow_SSE4_2		FEATURE_INDEX_1*FEATURE_SIZE
>  # define index_AVX2_Usable		FEATURE_INDEX_1*FEATURE_SIZE
> +# define index_AVX_Fast_Unaligned_Load	FEATURE_INDEX_1*FEATURE_SIZE
>
>  #else	/* __ASSEMBLER__ */
>
> @@ -169,6 +171,7 @@ extern const struct cpu_features *__get_cpu_features (void)
>  # define index_FMA4_Usable		FEATURE_INDEX_1
>  # define index_Slow_SSE4_2		FEATURE_INDEX_1
>  # define index_AVX2_Usable		FEATURE_INDEX_1
> +# define index_AVX_Fast_Unaligned_Load	FEATURE_INDEX_1
>
>  # define HAS_ARCH_FEATURE(name) \
>    ((__get_cpu_features ()->feature[index_##name] & (bit_##name)) != 0)
> @@ -181,5 +184,6 @@ extern const struct cpu_features *__get_cpu_features (void)
>  # define HAS_AVX2			HAS_ARCH_FEATURE (AVX2_Usable)
>  # define HAS_FMA			HAS_ARCH_FEATURE (FMA_Usable)
>  # define HAS_FMA4			HAS_ARCH_FEATURE (FMA4_Usable)
> +# define HAS_AVX_FAST_UNALIGNED_LOAD	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
>
>  #endif	/* __ASSEMBLER__ */
> diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
> index 992e40d..4e18cd3 100644
> --- a/sysdeps/x86_64/multiarch/memcpy.S
> +++ b/sysdeps/x86_64/multiarch/memcpy.S
> @@ -33,7 +33,7 @@ ENTRY(__new_memcpy)
>  	jne	1f
>  	call	__init_cpu_features
>  1:	leaq	__memcpy_avx_unaligned(%rip), %rax
> -	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
> +	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
>  	jz	1f
>  	ret
>  1:	leaq	__memcpy_sse2(%rip), %rax
> diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
> index 5e9cf00..1e756ea 100644
> --- a/sysdeps/x86_64/multiarch/memcpy_chk.S
> +++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
> @@ -39,7 +39,7 @@ ENTRY(__memcpy_chk)
>  	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
>  	jz	2f
>  	leaq	__memcpy_chk_ssse3_back(%rip), %rax
> -	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
> +	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
>  	jz	2f
>  	leaq	__memcpy_chk_avx_unaligned(%rip), %rax
>  2:	ret
> diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c
> index d93bfd0..dd153a3 100644
> --- a/sysdeps/x86_64/multiarch/memmove.c
> +++ b/sysdeps/x86_64/multiarch/memmove.c
> @@ -49,7 +49,7 @@ extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden;
>     ifunc symbol properly.  */
>  extern __typeof (__redirect_memmove) __libc_memmove;
>  libc_ifunc (__libc_memmove,
> -	    HAS_AVX
> +	    HAS_AVX_FAST_UNALIGNED_LOAD
>  	    ? __memmove_avx_unaligned
>  	    : (HAS_SSSE3
>  	       ? (HAS_FAST_COPY_BACKWARD
> diff --git a/sysdeps/x86_64/multiarch/memmove_chk.c b/sysdeps/x86_64/multiarch/memmove_chk.c
> index 743ca2a..8b12d00 100644
> --- a/sysdeps/x86_64/multiarch/memmove_chk.c
> +++ b/sysdeps/x86_64/multiarch/memmove_chk.c
> @@ -30,7 +30,7 @@ extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden;
>  #include "debug/memmove_chk.c"
>
>  libc_ifunc (__memmove_chk,
> -	    HAS_AVX ? __memmove_chk_avx_unaligned :
> +	    HAS_AVX_FAST_UNALIGNED_LOAD ? __memmove_chk_avx_unaligned :
>  	    (HAS_SSSE3
>  	     ? (HAS_FAST_COPY_BACKWARD
>  	     ? __memmove_chk_ssse3_back : __memmove_chk_ssse3)
> diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
> index cdf1dab..2eaacdf 100644
> --- a/sysdeps/x86_64/multiarch/mempcpy.S
> +++ b/sysdeps/x86_64/multiarch/mempcpy.S
> @@ -37,7 +37,7 @@ ENTRY(__mempcpy)
>  	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
>  	jz	2f
>  	leaq	__mempcpy_ssse3_back(%rip), %rax
> -	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
> +	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
>  	jz	2f
>  	leaq	__mempcpy_avx_unaligned(%rip), %rax
>  2:	ret
> diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
> index b7f9e89..17b8470 100644
> --- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
> +++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
> @@ -39,7 +39,7 @@ ENTRY(__mempcpy_chk)
>  	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
>  	jz	2f
>  	leaq	__mempcpy_chk_ssse3_back(%rip), %rax
> -	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
> +	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
>  	jz	2f
>  	leaq	__mempcpy_chk_avx_unaligned(%rip), %rax
>  2:	ret
> --
> 2.1.0
>
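For readers less familiar with glibc's multiarch machinery: the files touched above are IFUNC selectors, small resolvers that run once when the symbol is bound and pick a memcpy/memmove implementation based on the bits set by __init_cpu_features. The snippet below is not glibc code; it is a minimal stand-alone sketch of the same dispatch pattern using GCC's generic `ifunc` attribute, with hypothetical `my_*`/`memcpy_dispatch` names and AVX2 support standing in for the patch's bit_AVX_Fast_Unaligned_Load condition.

```c
/* Minimal stand-alone sketch of IFUNC dispatch (assumes GCC on x86-64
   Linux).  All names here are hypothetical; glibc's real selectors are the
   memcpy.S/memmove.c files in the patch above.  */
#include <stdio.h>
#include <string.h>

static void *my_memcpy_avx_unaligned (void *dst, const void *src, size_t n)
{ return memcpy (dst, src, n); }      /* stand-in for the AVX variant */

static void *my_memcpy_sse2 (void *dst, const void *src, size_t n)
{ return memcpy (dst, src, n); }      /* stand-in for the SSE2 variant */

/* Resolver: runs once at symbol-binding time and returns the implementation
   the call should be bound to.  AVX2 support approximates the patch's
   bit_AVX_Fast_Unaligned_Load condition.  */
static void *(*resolve_memcpy (void)) (void *, const void *, size_t)
{
  __builtin_cpu_init ();
  return __builtin_cpu_supports ("avx2")
	 ? my_memcpy_avx_unaligned
	 : my_memcpy_sse2;
}

void *memcpy_dispatch (void *, const void *, size_t)
     __attribute__ ((ifunc ("resolve_memcpy")));

int main (void)
{
  char buf[8];
  memcpy_dispatch (buf, "hello", 6);
  puts (buf);                         /* prints "hello" on either path */
  return 0;
}
```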
On 01/30/2015 03:04 PM, H.J. Lu wrote:
>>>
>>> Please add a new feature bit, bit_Fast_AVX_Unaligned_Load, and turn it
>>> on together with bit_AVX2_Usable.
>>>
>>
>> I know we are in freeze.  But I'd like to fix this regression in 2.21.
>> OK for master?
>
> Since this is a serious performance regression, I will check it in
> before the end of the day unless I am told otherwise.

In the future please TO: me so that I have high visibility on this change
as the release manager.  I'm testing each of the changes to make sure
things are in good shape for the release.

Could you explain in detail why this is needed?

+#if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
+# error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
+#endif

Why do they have to be on the same index in the feature array of bits?
I don't see anywhere that checks them both simultaneously.  At the very
least please add a detailed comment why the error condition exists and
how to fix it in the future if another author needs to fix it.

Cheers,
Carlos.
On Fri, Jan 30, 2015 at 10:50 PM, Carlos O'Donell <carlos@redhat.com> wrote:
> On 01/30/2015 03:04 PM, H.J. Lu wrote:
>>>>
>>>> Please add a new feature bit, bit_Fast_AVX_Unaligned_Load, and turn it
>>>> on together with bit_AVX2_Usable.
>>>>
>>>
>>> I know we are in freeze.  But I'd like to fix this regression in 2.21.
>>> OK for master?
>>
>> Since this is a serious performance regression, I will check it in
>> before the end of the day unless I am told otherwise.
>
> In the future please TO: me so that I have high visibility on this change
> as the release manager.  I'm testing each of the changes to make sure
> things are in good shape for the release.
>
> Could you explain in detail why this is needed?
>
> +#if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
> +# error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
> +#endif
>
> Why do they have to be on the same index in the feature array of bits?
> I don't see anywhere that checks them both simultaneously.  At the very
> least please add a detailed comment why the error condition exists and
> how to fix it in the future if another author needs to fix it.
>

There are already

  /* Unaligned load versions are faster than SSSE3 on Silvermont.  */
#if index_Fast_Unaligned_Load != index_Prefer_PMINUB_for_stringop
# error index_Fast_Unaligned_Load != index_Prefer_PMINUB_for_stringop
#endif
#if index_Fast_Unaligned_Load != index_Slow_SSE4_2
# error index_Fast_Unaligned_Load != index_Slow_SSE4_2
#endif
	  __cpu_features.feature[index_Fast_Unaligned_Load]
	    |= (bit_Fast_Unaligned_Load
		| bit_Prefer_PMINUB_for_stringop
		| bit_Slow_SSE4_2);

and

#if index_Fast_Rep_String != index_Fast_Copy_Backward
# error index_Fast_Rep_String != index_Fast_Copy_Backward
#endif
#if index_Fast_Rep_String != index_Fast_Unaligned_Load
# error index_Fast_Rep_String != index_Fast_Unaligned_Load
#endif
#if index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop
# error index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop
#endif
	  __cpu_features.feature[index_Fast_Rep_String]
	    |= (bit_Fast_Rep_String
		| bit_Fast_Copy_Backward
		| bit_Fast_Unaligned_Load
		| bit_Prefer_PMINUB_for_stringop);

before this change.  Since we have

extern struct cpu_features
{
  enum cpu_features_kind
    {
      arch_kind_unknown = 0,
      arch_kind_intel,
      arch_kind_amd,
      arch_kind_other
    } kind;
  int max_cpuid;
  struct cpuid_registers
  {
    unsigned int eax;
    unsigned int ebx;
    unsigned int ecx;
    unsigned int edx;
  } cpuid[COMMON_CPUID_INDEX_MAX];
  unsigned int family;
  unsigned int model;
  unsigned int feature[FEATURE_INDEX_MAX];
} __cpu_features attribute_hidden;

each feature element can hold up to 32 features.  We use

# define index_Fast_Rep_String		FEATURE_INDEX_1
# define index_Fast_Copy_Backward	FEATURE_INDEX_1
# define index_Slow_BSF			FEATURE_INDEX_1
# define index_Fast_Unaligned_Load	FEATURE_INDEX_1
# define index_Prefer_PMINUB_for_stringop FEATURE_INDEX_1
# define index_AVX_Usable		FEATURE_INDEX_1
# define index_FMA_Usable		FEATURE_INDEX_1
# define index_FMA4_Usable		FEATURE_INDEX_1
# define index_Slow_SSE4_2		FEATURE_INDEX_1
# define index_AVX2_Usable		FEATURE_INDEX_1
# define index_AVX_Fast_Unaligned_Load	FEATURE_INDEX_1

to indicate which element the feature bit is in, and use a single statement

#if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
# error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
#endif
	  /* Determine if AVX2 is usable.  Unaligned load with 256-bit
	     AVX registers are faster on processors with AVX2.  */
	  if (CPUID_AVX2)
	    __cpu_features.feature[index_AVX2_Usable]
	      |= bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load;

to update 2 features.  It works only if they have the same index_XXX.
We need this check when we update more than one feature bit
in a single statement.
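To make Carlos's question concrete: the combined `|=` picks the array element using only one of the two index_* macros, so the #error guard is what keeps the second bit from silently landing in the wrong word if the two flags ever move to different elements. The stand-alone snippet below (stand-in constants, not glibc code) shows that failure mode with the guard removed.

```c
/* Illustration (not glibc code; constants are stand-ins) of why the indices
   must match when two feature bits are OR-ed in one statement.  */
#include <assert.h>

#define FEATURE_INDEX_1 0
#define FEATURE_INDEX_2 1

#define bit_AVX2_Usable             (1u << 10)
#define bit_AVX_Fast_Unaligned_Load (1u << 11)

#define index_AVX2_Usable             FEATURE_INDEX_1
/* Imagine a future edit moved this flag to the second word...  */
#define index_AVX_Fast_Unaligned_Load FEATURE_INDEX_2

static unsigned int feature[2];

int main (void)
{
  /* ...then the combined update still writes only feature[FEATURE_INDEX_1]: */
  feature[index_AVX2_Usable] |= bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load;

  /* bit_AVX_Fast_Unaligned_Load ended up in the wrong element, so a reader
     that checks feature[index_AVX_Fast_Unaligned_Load] sees it as clear.
     The #if/#error guard turns this silent bug into a build failure.  */
  assert ((feature[index_AVX_Fast_Unaligned_Load]
	   & bit_AVX_Fast_Unaligned_Load) == 0);
  return 0;
}
```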
"H.J. Lu" <hjl.tools@gmail.com> writes: > #if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load > # error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load > #endif > /* Determine if AVX2 is usable. Unaligned load with 256-bit > AVX registers are faster on processors with AVX2. */ > if (CPUID_AVX2) > __cpu_features.feature[index_AVX2_Usable] > |= bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load; > > to update 2 features. It works only if they have the same index_XXX. > We need this check when we update more than one feature bit > in a single statement. You can use two statements, and the compiler will be able to combine them. Andreas.