@@ -1,4 +1,4 @@
-/* SSE2 version of strlen.
+/* SSE2 version of strlen/wcslen.
Copyright (C) 2012-2017 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -18,6 +18,16 @@
#include <sysdep.h>
+#ifdef AS_WCSLEN	/* Wide build: scan 32-bit elements instead of bytes.  */
+# define PMINU pminud	/* Unsigned min of packed dwords.  NOTE(review): pminud is an SSE4.1 instruction, yet this file is labelled SSE2 -- confirm the CPU baseline.  */
+# define PCMPEQ pcmpeqd	/* Equality compare of packed 32-bit elements.  */
+# define SHIFT_RETURN shrq $2, %rax	/* Divide the byte count in %rax by 4 before ret.  */
+#else	/* Byte-string build.  */
+# define PMINU pminub	/* Unsigned min of packed bytes.  */
+# define PCMPEQ pcmpeqb	/* Equality compare of packed bytes.  */
+# define SHIFT_RETURN	/* Expands to nothing: %rax is returned unchanged.  */
+#endif
+
/* Long lived register in strlen(s), strnlen(s, n) are:
%xmm3 - zero
@@ -32,10 +42,10 @@ ENTRY(strlen)
/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
#define FIND_ZERO \
- pcmpeqb (%rax), %xmm0; \
- pcmpeqb 16(%rax), %xmm1; \
- pcmpeqb 32(%rax), %xmm2; \
- pcmpeqb 48(%rax), %xmm3; \
+ PCMPEQ (%rax), %xmm0; \
+ PCMPEQ 16(%rax), %xmm1; \
+ PCMPEQ 32(%rax), %xmm2; \
+ PCMPEQ 48(%rax), %xmm3; \
pmovmskb %xmm0, %esi; \
pmovmskb %xmm1, %edx; \
pmovmskb %xmm2, %r8d; \
@@ -54,6 +64,9 @@ ENTRY(strlen)
xor %rax, %rax
ret
L(n_nonzero):
+# ifdef AS_WCSLEN
+ shlq $2, %rsi
+# endif
/* Initialize long lived registers. */
@@ -96,6 +109,7 @@ L(n_nonzero):
test %rdx, %rdx; \
je L(lab); \
bsfq %rdx, %rax; \
+ SHIFT_RETURN; \
ret
#ifdef AS_STRNLEN
@@ -104,19 +118,20 @@ L(n_nonzero):
#else
/* Test first 16 bytes unaligned. */
movdqu (%rax), %xmm4
- pcmpeqb %xmm0, %xmm4
+ PCMPEQ %xmm0, %xmm4
pmovmskb %xmm4, %edx
test %edx, %edx
je L(next48_bytes)
bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
+ SHIFT_RETURN
ret
L(next48_bytes):
/* Same as FIND_ZERO except we do not check first 16 bytes. */
andq $-16, %rax
- pcmpeqb 16(%rax), %xmm1
- pcmpeqb 32(%rax), %xmm2
- pcmpeqb 48(%rax), %xmm3
+ PCMPEQ 16(%rax), %xmm1
+ PCMPEQ 32(%rax), %xmm2
+ PCMPEQ 48(%rax), %xmm3
pmovmskb %xmm1, %edx
pmovmskb %xmm2, %r8d
pmovmskb %xmm3, %ecx
@@ -145,6 +160,7 @@ L(strnlen_ret):
test %rdx, %rdx
je L(loop_init)
bsfq %rdx, %rax
+ SHIFT_RETURN
ret
#endif
.p2align 4
@@ -161,10 +177,10 @@ L(loop):
je L(exit_end)
movdqa (%rax), %xmm0
- pminub 16(%rax), %xmm0
- pminub 32(%rax), %xmm0
- pminub 48(%rax), %xmm0
- pcmpeqb %xmm3, %xmm0
+ PMINU 16(%rax), %xmm0
+ PMINU 32(%rax), %xmm0
+ PMINU 48(%rax), %xmm0
+ PCMPEQ %xmm3, %xmm0
pmovmskb %xmm0, %edx
testl %edx, %edx
jne L(exit)
@@ -182,6 +198,7 @@ L(first):
bsfq %rdx, %rdx
addq %rdx, %rax
subq %rdi, %rax
+ SHIFT_RETURN
ret
.p2align 4
@@ -192,6 +209,7 @@ L(exit):
bsfq %rdx, %rdx
addq %rdx, %rax
subq %rdi, %rax
+ SHIFT_RETURN
ret
#else
@@ -201,10 +219,10 @@ L(exit):
L(loop):
movdqa 64(%rax), %xmm0
- pminub 80(%rax), %xmm0
- pminub 96(%rax), %xmm0
- pminub 112(%rax), %xmm0
- pcmpeqb %xmm3, %xmm0
+ PMINU 80(%rax), %xmm0
+ PMINU 96(%rax), %xmm0
+ PMINU 112(%rax), %xmm0
+ PCMPEQ %xmm3, %xmm0
pmovmskb %xmm0, %edx
testl %edx, %edx
jne L(exit64)
@@ -212,10 +230,10 @@ L(loop):
subq $-128, %rax
movdqa (%rax), %xmm0
- pminub 16(%rax), %xmm0
- pminub 32(%rax), %xmm0
- pminub 48(%rax), %xmm0
- pcmpeqb %xmm3, %xmm0
+ PMINU 16(%rax), %xmm0
+ PMINU 32(%rax), %xmm0
+ PMINU 48(%rax), %xmm0
+ PCMPEQ %xmm3, %xmm0
pmovmskb %xmm0, %edx
testl %edx, %edx
jne L(exit0)
@@ -231,6 +249,7 @@ L(exit0):
bsfq %rdx, %rdx
addq %rdx, %rax
subq %rdi, %rax
+ SHIFT_RETURN
ret
#endif
@@ -1,238 +1,6 @@
-/* Optimized wcslen for x86-64 with SSE2.
- Copyright (C) 2011-2017 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
+#define AS_WCSLEN	/* Select the 32-bit element macros (PMINU, PCMPEQ, SHIFT_RETURN) in strlen.S.  */
+#define strlen __wcslen	/* ENTRY(strlen) in the included strlen.S then defines __wcslen.  */
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
- .text
-ENTRY (__wcslen)
- cmpl $0, (%rdi)
- jz L(exit_tail0)
- cmpl $0, 4(%rdi)
- jz L(exit_tail1)
- cmpl $0, 8(%rdi)
- jz L(exit_tail2)
- cmpl $0, 12(%rdi)
- jz L(exit_tail3)
- cmpl $0, 16(%rdi)
- jz L(exit_tail4)
- cmpl $0, 20(%rdi)
- jz L(exit_tail5)
- cmpl $0, 24(%rdi)
- jz L(exit_tail6)
- cmpl $0, 28(%rdi)
- jz L(exit_tail7)
-
- pxor %xmm0, %xmm0
-
- lea 32(%rdi), %rax
- lea 16(%rdi), %rcx
- and $-16, %rax
-
- pcmpeqd (%rax), %xmm0
- pmovmskb %xmm0, %edx
- pxor %xmm1, %xmm1
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm1
- pmovmskb %xmm1, %edx
- pxor %xmm2, %xmm2
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm2
- pmovmskb %xmm2, %edx
- pxor %xmm3, %xmm3
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- and $-0x40, %rax
-
- .p2align 4
-L(aligned_64_loop):
- movaps (%rax), %xmm0
- movaps 16(%rax), %xmm1
- movaps 32(%rax), %xmm2
- movaps 48(%rax), %xmm6
-
- pminub %xmm1, %xmm0
- pminub %xmm6, %xmm2
- pminub %xmm0, %xmm2
- pcmpeqd %xmm3, %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- lea 64(%rax), %rax
- jz L(aligned_64_loop)
-
- pcmpeqd -64(%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 48(%rcx), %rcx
- jnz L(exit)
-
- pcmpeqd %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea -16(%rcx), %rcx
- jnz L(exit)
-
- pcmpeqd -32(%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea -16(%rcx), %rcx
- jnz L(exit)
-
- pcmpeqd %xmm6, %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea -16(%rcx), %rcx
- jnz L(exit)
-
- jmp L(aligned_64_loop)
-
- .p2align 4
-L(exit):
- sub %rcx, %rax
- shr $2, %rax
- test %dl, %dl
- jz L(exit_high)
-
- mov %dl, %cl
- and $15, %cl
- jz L(exit_1)
- ret
-
- .p2align 4
-L(exit_high):
- mov %dh, %ch
- and $15, %ch
- jz L(exit_3)
- add $2, %rax
- ret
-
- .p2align 4
-L(exit_1):
- add $1, %rax
- ret
-
- .p2align 4
-L(exit_3):
- add $3, %rax
- ret
-
- .p2align 4
-L(exit_tail0):
- xor %rax, %rax
- ret
-
- .p2align 4
-L(exit_tail1):
- mov $1, %rax
- ret
-
- .p2align 4
-L(exit_tail2):
- mov $2, %rax
- ret
-
- .p2align 4
-L(exit_tail3):
- mov $3, %rax
- ret
-
- .p2align 4
-L(exit_tail4):
- mov $4, %rax
- ret
-
- .p2align 4
-L(exit_tail5):
- mov $5, %rax
- ret
-
- .p2align 4
-L(exit_tail6):
- mov $6, %rax
- ret
-
- .p2align 4
-L(exit_tail7):
- mov $7, %rax
- ret
-
-END (__wcslen)
+#include "strlen.S"
weak_alias(__wcslen, wcslen)
new file mode 100644
@@ -0,0 +1,7 @@
+#define AS_WCSLEN	/* Select the 32-bit element macros (PMINU, PCMPEQ, SHIFT_RETURN) in strlen.S.  */
+#define AS_STRNLEN	/* Enable the #ifdef AS_STRNLEN code paths in strlen.S.  */
+#define strlen __wcsnlen	/* ENTRY(strlen) in the included strlen.S then defines __wcsnlen.  */
+
+#include "strlen.S"	/* NOTE(review): with both macros set, strlen.S runs shlq $2, %rsi (presumably the maxlen argument); a limit of 2^62 or more would wrap -- confirm this is handled.  */
+
+weak_alias(__wcsnlen, wcsnlen)	/* Presumably publishes wcsnlen as a weak alias of __wcsnlen; see the weak_alias macro definition.  */