@@ -30,7 +30,7 @@ CFLAGS-strspn-c.c += -msse4
endif
ifeq (yes,$(config-cflags-avx2))
-sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 memcmp-avx2
+sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 memcmp-avx2 strcmp-avx2
endif
endif
@@ -126,7 +126,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcmp.S. */
IFUNC_IMPL (i, name, strcmp,
- IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSE4_2, __strcmp_sse42)
+ IFUNC_IMPL_ADD (array, i, strcmp, HAS_AVX2, __strcmp_avx2)
IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSSE3, __strcmp_ssse3)
IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2))
new file mode 100644
@@ -0,0 +1,5 @@
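+/* __strcmp_avx2: the SSE2 unaligned strcmp with its main loop widened
+   to 32-byte AVX2 vectors (enabled by the USE_AVX2 blocks).  */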
+#define USE_AVX2
+#define __strcmp_sse2_unaligned __strcmp_avx2
+#include "strcmp-sse2-unaligned.S"
@@ -89,12 +89,42 @@ L(main_loop_header):
subq %rsi, %rcx
shrq $6, %rcx
movq %rcx, %rsi
-
+#ifdef USE_AVX2
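+ /* Zero all of ymm7; the loop compares against it to locate zero bytes.  */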
+ vpxor %xmm7, %xmm7, %xmm7
+#endif
.p2align 4
L(loop):
add $-1, %rsi
ja L(loop_cross_page)
L(back_to_loop):
+#ifdef USE_AVX2
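+ /* Compare 64 bytes per iteration.  After each vpcmpeqb/vpminub
+    pair a byte of ymm0/ymm1 is zero exactly where the two inputs
+    differ or where the byte loaded from %rax is NUL.  */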
+ vmovdqu (%rdx), %ymm0
+ vmovdqu 32(%rdx), %ymm1
+ vpcmpeqb (%rax), %ymm0, %ymm0
+ vpminub (%rax), %ymm0, %ymm0
+ vpcmpeqb 32(%rax), %ymm1, %ymm1
+ vpminub 32(%rax), %ymm1, %ymm1
+ vpminub %ymm0, %ymm1, %ymm2
+ vpcmpeqb %ymm7, %ymm2, %ymm2
+ addq $64, %rax
+ addq $64, %rdx
+ vpmovmskb %ymm2, %esi
+ test %esi, %esi
+ je L(loop)
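+ /* Found a difference or NUL.  %rcx gets the zero-byte flags of the
+    first half in its low 32 bits and of the combined halves in its
+    high 32 bits, so bsfq yields the offending offset in the block.  */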
+ shl $32, %rsi
+ vpcmpeqb %ymm7, %ymm0, %ymm0
+ vpmovmskb %ymm0, %ecx
+ or %rsi, %rcx
+ vzeroupper  /* Leave the AVX2 loop; the common tail below uses no YMM registers.  */
+#else
+
movdqu (%rdx), %xmm0
movdqu 16(%rdx), %xmm1
movdqa (%rax), %xmm2
@@ -132,14 +162,19 @@ L(back_to_loop):
orq %rdi, %rcx
sal $16, %esi
orq %rsi, %rcx
+#endif
bsfq %rcx, %rcx
movzbl -64(%rax, %rcx), %eax
movzbl -64(%rdx, %rcx), %edx
subl %edx, %eax
ret
-
.p2align 4
L(loop_cross_page):
+#ifdef USE_AVX2
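+ /* The page-cross path below uses SSE registers only; clear the
+    upper YMM state to avoid AVX/SSE transition penalties.  */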
+ vzeroupper
+#endif
xor %ecx, %ecx
movq %rdx, %r9
and $63, %r9
@@ -90,6 +90,12 @@ ENTRY(STRCMP)
call __init_cpu_features
1:
#ifdef USE_AS_STRCMP
+# ifdef HAVE_AVX2_SUPPORT
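+ /* Prefer __strcmp_avx2 when unaligned 256-bit loads are fast.  */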
+ leaq __strcmp_avx2(%rip), %rax
+ testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
+ jnz 3f
+# endif
leaq __strcmp_sse2_unaligned(%rip), %rax
testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
jnz 3f