@@ -8,4 +8,5 @@ LOCALE_T___LOCALES offsetof (struct __locale_struct, __locales)
LC_CTYPE
_NL_CTYPE_NONASCII_CASE
LOCALE_DATA_VALUES offsetof (struct __locale_data, values)
+LOCALE_TOLOWER offsetof (struct __locale_struct, __ctype_tolower)
SIZEOF_VALUES sizeof (((struct __locale_data *) 0)->values[0])
@@ -14,6 +14,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \
memmove-ssse3-back strcasecmp_l-ssse3 \
strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
+ strcasecmp_l-sse2-unaligned \
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
strcpy-sse2-unaligned strncpy-sse2-unaligned \
stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
@@ -94,20 +94,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcasecmp_l.S. */
IFUNC_IMPL (i, name, strcasecmp,
- IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_AVX,
- __strcasecmp_avx)
- IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_SSE4_2,
- __strcasecmp_sse42)
+ IFUNC_IMPL_ADD (array, i, strcasecmp, 1,
+ __strcasecmp_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_SSSE3,
__strcasecmp_ssse3)
IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2))
/* Support sysdeps/x86_64/multiarch/strcasecmp_l.S. */
IFUNC_IMPL (i, name, strcasecmp_l,
- IFUNC_IMPL_ADD (array, i, strcasecmp_l, HAS_AVX,
- __strcasecmp_l_avx)
- IFUNC_IMPL_ADD (array, i, strcasecmp_l, HAS_SSE4_2,
- __strcasecmp_l_sse42)
+ IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1,
+ __strcasecmp_sse2_unaligned_l)
IFUNC_IMPL_ADD (array, i, strcasecmp_l, HAS_SSSE3,
__strcasecmp_l_ssse3)
IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1,
new file mode 100644
@@ -0,0 +1,308 @@
+/* strcasecmp with unaligned loads
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef NOT_IN_libc
+
+#include "sysdep.h"
+
+# include "locale-defines.h"
+
+
+/* strcasecmp (s1=%rdi, s2=%rsi): load the current locale from TSD into
+   %rdx and fall through into __strcasecmp_sse2_unaligned_l, which takes
+   the locale_t argument in %rdx.  */
+ENTRY (__strcasecmp_sse2_unaligned)
+	movq	__libc_tsd_LOCALE@gottpoff(%rip), %rax
+	mov	%fs:(%rax), %rdx
+	// XXX 5 byte should be before the function
+	/* 5-byte NOP.  */
+	.byte	0x0f,0x1f,0x44,0x00,0x00
+
+END (__strcasecmp_sse2_unaligned)
+
+/* strcasecmp_l (s1=%rdi, s2=%rsi, loc=%rdx).  */
+ENTRY (__strcasecmp_sse2_unaligned_l)
+	movzbl	(%rdi), %eax
+	movzbl	(%rsi), %ecx
+	/* Cache the locale's tolower table (array of int) in %r11.  */
+	mov	LOCALE_TOLOWER(%rdx), %r11
+	movl	(%r11,%rax,4), %eax
+	subl	(%r11,%rcx,4), %eax
+	je	L(next)
+L(return):
+	ret
+L(next):
+	test	%ecx, %ecx
+	je	L(return)
+	leaq	1(%rsi), %rsi
+	leaq	1(%rdi), %rdi
+
+	pxor	%xmm7, %xmm7
+	/* Take the byte-at-a-time path when either pointer is within
+	   64 bytes of the end of a page.  */
+	movl	%edi, %eax
+	andl	$4095, %eax
+	cmpl	$4032, %eax
+	jg	L(cross_page_start)
+	movl	%esi, %eax
+	andl	$4095, %eax
+	cmpl	$4032, %eax
+	jg	L(cross_page_start)
+	/* Mask of byte positions that mismatch or are NUL.  */
+	movdqu	(%rdi), %xmm1
+	movdqu	(%rsi), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pminub	%xmm1, %xmm0
+	pxor	%xmm1, %xmm1
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb	%xmm0, %ecx
+	test	%ecx, %ecx
+	je	L(next_48_bytes)
+
+L(loop1):
+	bsf	%ecx, %r8d
+	movzbl	(%rdi,%r8), %eax
+	movzbl	(%rsi,%r8), %r8d
+	movl	(%r11,%rax,4), %eax
+	subl	(%r11,%r8,4), %eax
+	jne	L(return)
+	test	%r8d, %r8d
+	je	L(return)
+	leaq	-1(%rcx), %rax
+	andq	%rax, %rcx
+	je	L(next_48_bytes)
+	jmp	L(loop1)
+
+	.p2align 4
+L(next_48_bytes):
+	movdqu	16(%rdi), %xmm6
+	movdqu	16(%rsi), %xmm3
+	movdqu	32(%rdi), %xmm5
+	pcmpeqb	%xmm6, %xmm3
+	movdqu	32(%rsi), %xmm2
+	pminub	%xmm6, %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	movdqu	48(%rdi), %xmm4
+	pcmpeqb	%xmm5, %xmm2
+	pmovmskb	%xmm3, %edx
+	movdqu	48(%rsi), %xmm0
+	pminub	%xmm5, %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb	%xmm2, %eax
+	sal	$16, %edx
+	pminub	%xmm4, %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	salq	$32, %rax
+	orq	%rdx, %rax
+	pmovmskb	%xmm0, %ecx
+	salq	$48, %rcx
+	orq	%rax, %rcx
+	jne	L(caseloop2)
+L(main_loop_header):
+	leaq	64(%rdi), %rdx
+	movl	$4096, %ecx
+	pxor	%xmm9, %xmm9
+	andq	$-64, %rdx
+	subq	%rdi, %rdx
+	leaq	(%rdi, %rdx), %rax
+	addq	%rsi, %rdx
+	/* %rsi = number of 64-byte blocks before %rdx reaches a page end. */
+	movq	%rdx, %rsi
+	andl	$4095, %esi
+	subq	%rsi, %rcx
+	shrq	$6, %rcx
+	movq	%rcx, %rsi
+	jmp	L(loop_start)
+
+L(caseloop2):
+	bsf	%rcx, %r8
+	movzbl	(%rdi,%r8), %eax
+	movzbl	(%rsi,%r8), %r8d
+	movl	(%r11,%rax,4), %eax
+	subl	(%r11,%r8,4), %eax
+	jne	L(return)
+	test	%r8d, %r8d
+	je	L(return)
+	leaq	-1(%rcx), %rax
+	andq	%rax, %rcx
+	je	L(main_loop_header)
+	jmp	L(caseloop2)
+
+/* Like caseloop2, for the cross-page path: %rdi/%rsi hold the current
+   block pointers, %rax the mask of flagged byte positions.  */
+L(caseloop3):
+	bsfq	%rax, %rdx
+	leaq	-1(%rax), %r10
+	andq	%rax, %r10
+	movzbl	(%rdi, %rdx), %eax
+	movzbl	(%rsi, %rdx), %edx
+	movl	(%r11, %rax, 4), %eax
+	movl	(%r11, %rdx, 4), %edx
+	testl	%eax, %eax
+	je	L(zero3)
+	cmpl	%edx, %eax
+	je	L(casecnt3)
+L(zero3):
+	subl	%edx, %eax
+	ret
+L(casecnt3):
+	testq	%r10, %r10
+	je	L(cont3)
+	movq	%r10, %rax
+	jmp	L(caseloop3)
+L(cont3):
+	/* All flagged bytes were case-equal; restore the block pointers
+	   and reset the cross-page countdown before resuming.  */
+	movq	%rdi, %rax
+	movq	%rsi, %rdx
+	movq	$63, %rsi
+	jmp	L(back_to_loop)
+
+	.p2align 4
+L(loop):
+	addq	$64, %rax
+	addq	$64, %rdx
+L(loop_start):
+	testq	%rsi, %rsi
+	leaq	-1(%rsi), %rsi
+	je	L(loop_cross_page)
+L(back_to_loop):
+	movdqu	(%rdx), %xmm0
+	movdqu	16(%rdx), %xmm1
+	movdqa	(%rax), %xmm2
+	movdqa	16(%rax), %xmm3
+	pcmpeqb	%xmm2, %xmm0
+	movdqu	32(%rdx), %xmm5
+	pcmpeqb	%xmm3, %xmm1
+	pminub	%xmm2, %xmm0
+	movdqu	48(%rdx), %xmm6
+	pminub	%xmm3, %xmm1
+	movdqa	32(%rax), %xmm2
+	pminub	%xmm1, %xmm0
+	movdqa	48(%rax), %xmm3
+	pcmpeqb	%xmm2, %xmm5
+	pcmpeqb	%xmm3, %xmm6
+	pminub	%xmm2, %xmm5
+	pminub	%xmm3, %xmm6
+	pminub	%xmm5, %xmm0
+	pminub	%xmm6, %xmm0
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb	%xmm0, %ecx
+	testl	%ecx, %ecx
+	je	L(loop)
+	pcmpeqb	%xmm7, %xmm5
+	movdqu	(%rdx), %xmm0
+	pcmpeqb	%xmm7, %xmm1
+	movdqa	(%rax), %xmm2
+	pcmpeqb	%xmm2, %xmm0
+	pminub	%xmm2, %xmm0
+	pcmpeqb	%xmm7, %xmm6
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb	%xmm1, %ecx
+	pmovmskb	%xmm5, %r8d
+	pmovmskb	%xmm0, %edi
+	salq	$16, %rcx
+	salq	$32, %r8
+	pmovmskb	%xmm6, %esi
+	orq	%r8, %rcx
+	orq	%rdi, %rcx
+	salq	$48, %rsi
+	orq	%rsi, %rcx
+	movq	%rax, %rdi
+	movq	%rdx, %rsi
+	jmp	L(caseloop2)
+
+	.p2align 4
+L(loop_cross_page):
+	xor	%r10, %r10
+	movq	%rdx, %r9
+	and	$63, %r9
+	subq	%r9, %r10
+
+	movdqa	(%rdx, %r10), %xmm0
+	movdqa	16(%rdx, %r10), %xmm1
+	movdqu	(%rax, %r10), %xmm2
+	movdqu	16(%rax, %r10), %xmm3
+	pcmpeqb	%xmm2, %xmm0
+	movdqa	32(%rdx, %r10), %xmm5
+	pcmpeqb	%xmm3, %xmm1
+	pminub	%xmm2, %xmm0
+	movdqa	48(%rdx, %r10), %xmm6
+	pminub	%xmm3, %xmm1
+	movdqu	32(%rax, %r10), %xmm2
+	movdqu	48(%rax, %r10), %xmm3
+	pcmpeqb	%xmm2, %xmm5
+	pcmpeqb	%xmm3, %xmm6
+	pminub	%xmm2, %xmm5
+	pminub	%xmm3, %xmm6
+
+	pcmpeqb	%xmm7, %xmm0
+	pcmpeqb	%xmm7, %xmm1
+	pcmpeqb	%xmm7, %xmm5
+	pcmpeqb	%xmm7, %xmm6
+
+	pmovmskb	%xmm1, %ecx
+	pmovmskb	%xmm5, %r8d
+	pmovmskb	%xmm0, %edi
+	salq	$16, %rcx
+	salq	$32, %r8
+	pmovmskb	%xmm6, %esi
+	orq	%r8, %rdi
+	orq	%rcx, %rdi
+	salq	$48, %rsi
+	orq	%rsi, %rdi
+	movq	%r9, %rcx
+	/* Reset the cross-page countdown for the back_to_loop path.  */
+	movq	$63, %rsi
+	shrq	%cl, %rdi
+	test	%rdi, %rdi
+	je	L(back_to_loop)
+	movq	%rdi, %r10
+	movq	%rax, %rdi
+	movq	%rdx, %rsi
+	movq	%r10, %rax
+	jmp	L(caseloop3)
+
+L(cross_page_start):
+	xor	%edx, %edx
+	jmp	L(cross_page)
+	.p2align 4
+L(cross_page_loop):
+	addq	$1, %rdx
+	cmpq	$64, %rdx
+	je	L(main_loop_header)
+L(cross_page):
+	movzbl	(%rdi, %rdx), %eax
+	movzbl	(%rsi, %rdx), %ecx
+	cmp	%eax, %ecx
+	je	L(skip_table)
+	movl	(%r11, %rax, 4), %eax
+	/* Return the tolower difference when nonzero; otherwise the bytes
+	   are case-equal and nonzero (tolower maps only NUL to 0), so
+	   keep scanning.  */
+	subl	(%r11, %rcx, 4), %eax
+	je	L(cross_page_loop)
+	ret
+L(skip_table):
+	test	%eax, %eax
+	jne	L(cross_page_loop)
+	ret
+
+END (__strcasecmp_sse2_unaligned_l)
+#endif
@@ -1,8 +1,106 @@
-/* Multiple versions of strcasecmp and strcasecmp_l
-   All versions must be listed in ifunc-impl-list.c. */
+/* Multiple versions of strcasecmp
+   Copyright (C) 2009-2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
 #define STRCMP __strcasecmp_l
 #define USE_AS_STRCASECMP_L
-#include "strcmp.S"
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+# include "locale-defines.h"
+
+
+# define STRCMP_SSSE3	__strcasecmp_l_ssse3
+# define STRCMP_SSE2	__strcasecmp_l_sse2
+# define __GI_STRCMP	__GI___strcasecmp_l
+
+/* Define multiple versions only for the definition in libc.  The
+   USE_AS_STRNCMP test below is inherited from the strncmp selector
+   this file was copied from; that macro is never defined here.  */
+#if (defined SHARED || !defined USE_AS_STRNCMP) && IS_IN (libc)
+	.text
+ENTRY(STRCMP)
+	.type	STRCMP, @gnu_indirect_function
+	/* Manually inlined call to __get_cpu_features.  */
+	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
+	jne	1f
+	call	__init_cpu_features
+1:
+	leaq	__strcasecmp_sse2_unaligned_l(%rip), %rax	/* preferred */
+	testl	$bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
+	jnz	3f
+2:	leaq	STRCMP_SSSE3(%rip), %rax
+	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	jnz	3f
+	leaq	STRCMP_SSE2(%rip), %rax	/* baseline fallback */
+3:	ret
+END(STRCMP)
+
+ENTRY(__strcasecmp)
+	.type	__strcasecmp, @gnu_indirect_function
+	/* Manually inlined call to __get_cpu_features.  */
+	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
+	jne	1f
+	call	__init_cpu_features
+1:
+	leaq	__strcasecmp_sse2_unaligned(%rip), %rax	/* preferred */
+	testl	$bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
+	jnz	3f
+2:	leaq	__strcasecmp_ssse3(%rip), %rax
+	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	jnz	3f
+	leaq	__strcasecmp_sse2(%rip), %rax	/* baseline fallback */
+3:	ret
+
+END(__strcasecmp)
+weak_alias (__strcasecmp, strcasecmp)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type STRCMP_SSE2, @function; \
+	.align 16; \
+	.globl STRCMP_SSE2; \
+	.hidden STRCMP_SSE2; \
+	STRCMP_SSE2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size STRCMP_SSE2, .-STRCMP_SSE2
+
+# define ENTRY2(name) \
+	.type __strcasecmp_sse2, @function; \
+	.align 16; \
+	.globl __strcasecmp_sse2; \
+	.hidden __strcasecmp_sse2; \
+	__strcasecmp_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# define END2(name) \
+	cfi_endproc; .size __strcasecmp_sse2, .-__strcasecmp_sse2
+
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strcasecmp_l calls
+   through a PLT; any speedup from a multiarch variant would be eaten
+   by the indirect call, so bind __GI___strcasecmp_l to the SSE2 one.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_STRCMP; __GI_STRCMP = STRCMP_SSE2
+#endif
+
+#include "../strcmp.S"
 weak_alias (__strcasecmp_l, strcasecmp_l)
 libc_hidden_def (strcasecmp_l)