| Message ID | 20220712192910.351121-1-goldstein.w.n@gmail.com |
|---|---|
| State | New |
| Series | [v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S |
On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > This commit doesn't affect libc.so.6, its just housekeeping to prepare > for adding explicit ISA level support. > > Tested build on x86_64 and x86_32 with/without multiarch. > --- > sysdeps/x86_64/multiarch/rtld-strlen.S | 18 ++ > sysdeps/x86_64/multiarch/rtld-strnlen.S | 18 ++ > sysdeps/x86_64/multiarch/strlen-sse2.S | 260 ++++++++++++++++++++- > sysdeps/x86_64/multiarch/strlen-vec.S | 267 ---------------------- > sysdeps/x86_64/multiarch/strnlen-sse2.S | 12 +- > sysdeps/x86_64/multiarch/wcslen-sse4_1.S | 4 +- > sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S | 4 +- > sysdeps/x86_64/strlen.S | 3 +- > sysdeps/x86_64/strnlen.S | 6 +- > 9 files changed, 306 insertions(+), 286 deletions(-) > create mode 100644 sysdeps/x86_64/multiarch/rtld-strlen.S > create mode 100644 sysdeps/x86_64/multiarch/rtld-strnlen.S > delete mode 100644 sysdeps/x86_64/multiarch/strlen-vec.S > > diff --git a/sysdeps/x86_64/multiarch/rtld-strlen.S b/sysdeps/x86_64/multiarch/rtld-strlen.S > new file mode 100644 > index 0000000000..609d26256e > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/rtld-strlen.S > @@ -0,0 +1,18 @@ > +/* Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include "../strlen.S" > diff --git a/sysdeps/x86_64/multiarch/rtld-strnlen.S b/sysdeps/x86_64/multiarch/rtld-strnlen.S > new file mode 100644 > index 0000000000..ef2d64abc2 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/rtld-strnlen.S > @@ -0,0 +1,18 @@ > +/* Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include "../strnlen.S" > diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S > index 660b327ed2..5be72267d5 100644 > --- a/sysdeps/x86_64/multiarch/strlen-sse2.S > +++ b/sysdeps/x86_64/multiarch/strlen-sse2.S > @@ -16,8 +16,260 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. 
*/ > > -#if IS_IN (libc) > -# define strlen __strlen_sse2 > -#endif > +#if IS_IN (libc) || defined STRLEN > + > +# ifndef STRLEN > +# define STRLEN __strlen_sse2 > +# endif > + > + > +# include <sysdep.h> > + > +# ifdef AS_WCSLEN > +# define PMINU pminud > +# define PCMPEQ pcmpeqd > +# define SHIFT_RETURN shrq $2, %rax > +# else > +# define PMINU pminub > +# define PCMPEQ pcmpeqb > +# define SHIFT_RETURN > +# endif > + > +# ifndef SECTION > +# define SECTION(p) p > +# endif > + > +/* Long lived register in strlen(s), strnlen(s, n) are: > + > + %xmm3 - zero > + %rdi - s > + %r10 (s+n) & (~(64-1)) > + %r11 s+n > +*/ > + > + > + .section SECTION(.text),"ax",@progbits > +ENTRY(STRLEN) > + > +/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */ > +# define FIND_ZERO \ > + PCMPEQ (%rax), %xmm0; \ > + PCMPEQ 16(%rax), %xmm1; \ > + PCMPEQ 32(%rax), %xmm2; \ > + PCMPEQ 48(%rax), %xmm3; \ > + pmovmskb %xmm0, %esi; \ > + pmovmskb %xmm1, %edx; \ > + pmovmskb %xmm2, %r8d; \ > + pmovmskb %xmm3, %ecx; \ > + salq $16, %rdx; \ > + salq $16, %rcx; \ > + orq %rsi, %rdx; \ > + orq %r8, %rcx; \ > + salq $32, %rcx; \ > + orq %rcx, %rdx; > + > +# ifdef AS_STRNLEN > +/* Do not read anything when n==0. */ > + test %RSI_LP, %RSI_LP > + jne L(n_nonzero) > + xor %rax, %rax > + ret > +L(n_nonzero): > +# ifdef AS_WCSLEN > +/* Check for overflow from maxlen * sizeof(wchar_t). If it would > + overflow the only way this program doesn't have undefined behavior > + is if there is a null terminator in valid memory so wcslen will > + suffice. */ > + mov %RSI_LP, %R10_LP > + sar $62, %R10_LP > + jnz __wcslen_sse4_1 > + sal $2, %RSI_LP > +# endif > + > +/* Initialize long lived registers. */ > + add %RDI_LP, %RSI_LP > + mov %RSI_LP, %R10_LP > + and $-64, %R10_LP > + mov %RSI_LP, %R11_LP > +# endif > + > + pxor %xmm0, %xmm0 > + pxor %xmm1, %xmm1 > + pxor %xmm2, %xmm2 > + pxor %xmm3, %xmm3 > + movq %rdi, %rax > + movq %rdi, %rcx > + andq $4095, %rcx > +/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */ > + cmpq $4047, %rcx > +/* We cannot unify this branching as it would be ~6 cycles slower. */ > + ja L(cross_page) > + > +# ifdef AS_STRNLEN > +/* Test if end is among first 64 bytes. */ > +# define STRNLEN_PROLOG \ > + mov %r11, %rsi; \ > + subq %rax, %rsi; \ > + andq $-64, %rax; \ > + testq $-64, %rsi; \ > + je L(strnlen_ret) > +# else > +# define STRNLEN_PROLOG andq $-64, %rax; > +# endif > + > +/* Ignore bits in mask that come before start of string. */ > +# define PROLOG(lab) \ > + movq %rdi, %rcx; \ > + xorq %rax, %rcx; \ > + STRNLEN_PROLOG; \ > + sarq %cl, %rdx; \ > + test %rdx, %rdx; \ > + je L(lab); \ > + bsfq %rdx, %rax; \ > + SHIFT_RETURN; \ > + ret > + > +# ifdef AS_STRNLEN > + andq $-16, %rax > + FIND_ZERO > +# else > + /* Test first 16 bytes unaligned. */ > + movdqu (%rax), %xmm4 > + PCMPEQ %xmm0, %xmm4 > + pmovmskb %xmm4, %edx > + test %edx, %edx > + je L(next48_bytes) > + bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */ > + SHIFT_RETURN > + ret > + > +L(next48_bytes): > +/* Same as FIND_ZERO except we do not check first 16 bytes. */ > + andq $-16, %rax > + PCMPEQ 16(%rax), %xmm1 > + PCMPEQ 32(%rax), %xmm2 > + PCMPEQ 48(%rax), %xmm3 > + pmovmskb %xmm1, %edx > + pmovmskb %xmm2, %r8d > + pmovmskb %xmm3, %ecx > + salq $16, %rdx > + salq $16, %rcx > + orq %r8, %rcx > + salq $32, %rcx > + orq %rcx, %rdx > +# endif > > -#include "strlen-vec.S" > + /* When no zero byte is found xmm1-3 are zero so we do not have to > + zero them. 
*/ > + PROLOG(loop) > + > + .p2align 4 > +L(cross_page): > + andq $-64, %rax > + FIND_ZERO > + PROLOG(loop_init) > + > +# ifdef AS_STRNLEN > +/* We must do this check to correctly handle strnlen (s, -1). */ > +L(strnlen_ret): > + bts %rsi, %rdx > + sarq %cl, %rdx > + test %rdx, %rdx > + je L(loop_init) > + bsfq %rdx, %rax > + SHIFT_RETURN > + ret > +# endif > + .p2align 4 > +L(loop_init): > + pxor %xmm1, %xmm1 > + pxor %xmm2, %xmm2 > + pxor %xmm3, %xmm3 > +# ifdef AS_STRNLEN > + .p2align 4 > +L(loop): > + > + addq $64, %rax > + cmpq %rax, %r10 > + je L(exit_end) > + > + movdqa (%rax), %xmm0 > + PMINU 16(%rax), %xmm0 > + PMINU 32(%rax), %xmm0 > + PMINU 48(%rax), %xmm0 > + PCMPEQ %xmm3, %xmm0 > + pmovmskb %xmm0, %edx > + testl %edx, %edx > + jne L(exit) > + jmp L(loop) > + > + .p2align 4 > +L(exit_end): > + cmp %rax, %r11 > + je L(first) /* Do not read when end is at page boundary. */ > + pxor %xmm0, %xmm0 > + FIND_ZERO > + > +L(first): > + bts %r11, %rdx > + bsfq %rdx, %rdx > + addq %rdx, %rax > + subq %rdi, %rax > + SHIFT_RETURN > + ret > + > + .p2align 4 > +L(exit): > + pxor %xmm0, %xmm0 > + FIND_ZERO > + > + bsfq %rdx, %rdx > + addq %rdx, %rax > + subq %rdi, %rax > + SHIFT_RETURN > + ret > + > +# else > + > + /* Main loop. Unrolled twice to improve L2 cache performance on core2. */ > + .p2align 4 > +L(loop): > + > + movdqa 64(%rax), %xmm0 > + PMINU 80(%rax), %xmm0 > + PMINU 96(%rax), %xmm0 > + PMINU 112(%rax), %xmm0 > + PCMPEQ %xmm3, %xmm0 > + pmovmskb %xmm0, %edx > + testl %edx, %edx > + jne L(exit64) > + > + subq $-128, %rax > + > + movdqa (%rax), %xmm0 > + PMINU 16(%rax), %xmm0 > + PMINU 32(%rax), %xmm0 > + PMINU 48(%rax), %xmm0 > + PCMPEQ %xmm3, %xmm0 > + pmovmskb %xmm0, %edx > + testl %edx, %edx > + jne L(exit0) > + jmp L(loop) > + > + .p2align 4 > +L(exit64): > + addq $64, %rax > +L(exit0): > + pxor %xmm0, %xmm0 > + FIND_ZERO > + > + bsfq %rdx, %rdx > + addq %rdx, %rax > + subq %rdi, %rax > + SHIFT_RETURN > + ret > + > +# endif > + > +END(STRLEN) > +#endif > diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S > deleted file mode 100644 > index 874123d604..0000000000 > --- a/sysdeps/x86_64/multiarch/strlen-vec.S > +++ /dev/null > @@ -1,267 +0,0 @@ > -/* SSE2 version of strlen and SSE4.1 version of wcslen. > - Copyright (C) 2012-2022 Free Software Foundation, Inc. > - This file is part of the GNU C Library. > - > - The GNU C Library is free software; you can redistribute it and/or > - modify it under the terms of the GNU Lesser General Public > - License as published by the Free Software Foundation; either > - version 2.1 of the License, or (at your option) any later version. > - > - The GNU C Library is distributed in the hope that it will be useful, > - but WITHOUT ANY WARRANTY; without even the implied warranty of > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > - Lesser General Public License for more details. > - > - You should have received a copy of the GNU Lesser General Public > - License along with the GNU C Library; if not, see > - <https://www.gnu.org/licenses/>. 
*/ > - > -#include <sysdep.h> > - > -#ifdef AS_WCSLEN > -# define PMINU pminud > -# define PCMPEQ pcmpeqd > -# define SHIFT_RETURN shrq $2, %rax > -#else > -# define PMINU pminub > -# define PCMPEQ pcmpeqb > -# define SHIFT_RETURN > -#endif > - > -#ifndef SECTION > -# define SECTION(p) p > -#endif > - > -/* Long lived register in strlen(s), strnlen(s, n) are: > - > - %xmm3 - zero > - %rdi - s > - %r10 (s+n) & (~(64-1)) > - %r11 s+n > -*/ > - > - > - .section SECTION(.text),"ax",@progbits > -ENTRY(strlen) > - > -/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */ > -#define FIND_ZERO \ > - PCMPEQ (%rax), %xmm0; \ > - PCMPEQ 16(%rax), %xmm1; \ > - PCMPEQ 32(%rax), %xmm2; \ > - PCMPEQ 48(%rax), %xmm3; \ > - pmovmskb %xmm0, %esi; \ > - pmovmskb %xmm1, %edx; \ > - pmovmskb %xmm2, %r8d; \ > - pmovmskb %xmm3, %ecx; \ > - salq $16, %rdx; \ > - salq $16, %rcx; \ > - orq %rsi, %rdx; \ > - orq %r8, %rcx; \ > - salq $32, %rcx; \ > - orq %rcx, %rdx; > - > -#ifdef AS_STRNLEN > -/* Do not read anything when n==0. */ > - test %RSI_LP, %RSI_LP > - jne L(n_nonzero) > - xor %rax, %rax > - ret > -L(n_nonzero): > -# ifdef AS_WCSLEN > -/* Check for overflow from maxlen * sizeof(wchar_t). If it would > - overflow the only way this program doesn't have undefined behavior > - is if there is a null terminator in valid memory so wcslen will > - suffice. */ > - mov %RSI_LP, %R10_LP > - sar $62, %R10_LP > - jnz __wcslen_sse4_1 > - sal $2, %RSI_LP > -# endif > - > -/* Initialize long lived registers. */ > - add %RDI_LP, %RSI_LP > - mov %RSI_LP, %R10_LP > - and $-64, %R10_LP > - mov %RSI_LP, %R11_LP > -#endif > - > - pxor %xmm0, %xmm0 > - pxor %xmm1, %xmm1 > - pxor %xmm2, %xmm2 > - pxor %xmm3, %xmm3 > - movq %rdi, %rax > - movq %rdi, %rcx > - andq $4095, %rcx > -/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */ > - cmpq $4047, %rcx > -/* We cannot unify this branching as it would be ~6 cycles slower. */ > - ja L(cross_page) > - > -#ifdef AS_STRNLEN > -/* Test if end is among first 64 bytes. */ > -# define STRNLEN_PROLOG \ > - mov %r11, %rsi; \ > - subq %rax, %rsi; \ > - andq $-64, %rax; \ > - testq $-64, %rsi; \ > - je L(strnlen_ret) > -#else > -# define STRNLEN_PROLOG andq $-64, %rax; > -#endif > - > -/* Ignore bits in mask that come before start of string. */ > -#define PROLOG(lab) \ > - movq %rdi, %rcx; \ > - xorq %rax, %rcx; \ > - STRNLEN_PROLOG; \ > - sarq %cl, %rdx; \ > - test %rdx, %rdx; \ > - je L(lab); \ > - bsfq %rdx, %rax; \ > - SHIFT_RETURN; \ > - ret > - > -#ifdef AS_STRNLEN > - andq $-16, %rax > - FIND_ZERO > -#else > - /* Test first 16 bytes unaligned. */ > - movdqu (%rax), %xmm4 > - PCMPEQ %xmm0, %xmm4 > - pmovmskb %xmm4, %edx > - test %edx, %edx > - je L(next48_bytes) > - bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */ > - SHIFT_RETURN > - ret > - > -L(next48_bytes): > -/* Same as FIND_ZERO except we do not check first 16 bytes. */ > - andq $-16, %rax > - PCMPEQ 16(%rax), %xmm1 > - PCMPEQ 32(%rax), %xmm2 > - PCMPEQ 48(%rax), %xmm3 > - pmovmskb %xmm1, %edx > - pmovmskb %xmm2, %r8d > - pmovmskb %xmm3, %ecx > - salq $16, %rdx > - salq $16, %rcx > - orq %r8, %rcx > - salq $32, %rcx > - orq %rcx, %rdx > -#endif > - > - /* When no zero byte is found xmm1-3 are zero so we do not have to > - zero them. */ > - PROLOG(loop) > - > - .p2align 4 > -L(cross_page): > - andq $-64, %rax > - FIND_ZERO > - PROLOG(loop_init) > - > -#ifdef AS_STRNLEN > -/* We must do this check to correctly handle strnlen (s, -1). 
*/ > -L(strnlen_ret): > - bts %rsi, %rdx > - sarq %cl, %rdx > - test %rdx, %rdx > - je L(loop_init) > - bsfq %rdx, %rax > - SHIFT_RETURN > - ret > -#endif > - .p2align 4 > -L(loop_init): > - pxor %xmm1, %xmm1 > - pxor %xmm2, %xmm2 > - pxor %xmm3, %xmm3 > -#ifdef AS_STRNLEN > - .p2align 4 > -L(loop): > - > - addq $64, %rax > - cmpq %rax, %r10 > - je L(exit_end) > - > - movdqa (%rax), %xmm0 > - PMINU 16(%rax), %xmm0 > - PMINU 32(%rax), %xmm0 > - PMINU 48(%rax), %xmm0 > - PCMPEQ %xmm3, %xmm0 > - pmovmskb %xmm0, %edx > - testl %edx, %edx > - jne L(exit) > - jmp L(loop) > - > - .p2align 4 > -L(exit_end): > - cmp %rax, %r11 > - je L(first) /* Do not read when end is at page boundary. */ > - pxor %xmm0, %xmm0 > - FIND_ZERO > - > -L(first): > - bts %r11, %rdx > - bsfq %rdx, %rdx > - addq %rdx, %rax > - subq %rdi, %rax > - SHIFT_RETURN > - ret > - > - .p2align 4 > -L(exit): > - pxor %xmm0, %xmm0 > - FIND_ZERO > - > - bsfq %rdx, %rdx > - addq %rdx, %rax > - subq %rdi, %rax > - SHIFT_RETURN > - ret > - > -#else > - > - /* Main loop. Unrolled twice to improve L2 cache performance on core2. */ > - .p2align 4 > -L(loop): > - > - movdqa 64(%rax), %xmm0 > - PMINU 80(%rax), %xmm0 > - PMINU 96(%rax), %xmm0 > - PMINU 112(%rax), %xmm0 > - PCMPEQ %xmm3, %xmm0 > - pmovmskb %xmm0, %edx > - testl %edx, %edx > - jne L(exit64) > - > - subq $-128, %rax > - > - movdqa (%rax), %xmm0 > - PMINU 16(%rax), %xmm0 > - PMINU 32(%rax), %xmm0 > - PMINU 48(%rax), %xmm0 > - PCMPEQ %xmm3, %xmm0 > - pmovmskb %xmm0, %edx > - testl %edx, %edx > - jne L(exit0) > - jmp L(loop) > - > - .p2align 4 > -L(exit64): > - addq $64, %rax > -L(exit0): > - pxor %xmm0, %xmm0 > - FIND_ZERO > - > - bsfq %rdx, %rdx > - addq %rdx, %rax > - subq %rdi, %rax > - SHIFT_RETURN > - ret > - > -#endif > - > -END(strlen) > diff --git a/sysdeps/x86_64/multiarch/strnlen-sse2.S b/sysdeps/x86_64/multiarch/strnlen-sse2.S > index c4f395c210..a50c7d6a28 100644 > --- a/sysdeps/x86_64/multiarch/strnlen-sse2.S > +++ b/sysdeps/x86_64/multiarch/strnlen-sse2.S > @@ -17,12 +17,10 @@ > <https://www.gnu.org/licenses/>. 
*/ > > #if IS_IN (libc) > -# define __strnlen __strnlen_sse2 > - > -# undef weak_alias > -# define weak_alias(__strnlen, strnlen) > -# undef libc_hidden_builtin_def > -# define libc_hidden_builtin_def(strnlen) > +# ifndef STRLEN > +# define STRLEN __strnlen_sse2 > +# endif > #endif > > -#include "../strnlen.S" > +#define AS_STRNLEN > +#include "strlen-sse2.S" > diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S > index e306a77f51..c88e8342a1 100644 > --- a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S > +++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S > @@ -1,5 +1,5 @@ > #define AS_WCSLEN > -#define strlen __wcslen_sse4_1 > +#define STRLEN __wcslen_sse4_1 > #define SECTION(p) p##.sse4.1 > > -#include "strlen-vec.S" > +#include "strlen-sse2.S" > diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S > index d2f7dd6e22..17cdedc2a9 100644 > --- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S > +++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S > @@ -1,6 +1,6 @@ > #define AS_WCSLEN > #define AS_STRNLEN > -#define strlen __wcsnlen_sse4_1 > +#define STRLEN __wcsnlen_sse4_1 > #define SECTION(p) p##.sse4.1 > > -#include "strlen-vec.S" > +#include "strlen-sse2.S" > diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S > index e1f0b19f2f..c2f5674f8d 100644 > --- a/sysdeps/x86_64/strlen.S > +++ b/sysdeps/x86_64/strlen.S > @@ -16,6 +16,7 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. */ > > -#include "multiarch/strlen-vec.S" > +#define STRLEN strlen > +#include "multiarch/strlen-sse2.S" > > libc_hidden_builtin_def (strlen) > diff --git a/sysdeps/x86_64/strnlen.S b/sysdeps/x86_64/strnlen.S > index d3c43ac482..174970d58f 100644 > --- a/sysdeps/x86_64/strnlen.S > +++ b/sysdeps/x86_64/strnlen.S > @@ -1,6 +1,6 @@ > -#define AS_STRNLEN > -#define strlen __strnlen > -#include "strlen.S" > +#define STRLEN __strnlen > +#include "multiarch/strnlen-sse2.S" > > +libc_hidden_def (__strnlen) > weak_alias (__strnlen, strnlen); > libc_hidden_builtin_def (strnlen) > -- > 2.34.1 > LGTM. Thanks.
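For reference, the hot path of the implementation being moved is the FIND_ZERO macro: four PCMPEQ compares against an all-zero register, four pmovmskb extractions, and a few shifts and ORs produce one 64-bit mask per 64-byte block, with bit i set exactly when byte i is zero. A rough, self-contained C/SSE2 rendering of the same idea (illustrative only, not part of the patch; the function name is made up):

```c
#include <emmintrin.h>  /* SSE2 intrinsics */
#include <stdint.h>

/* C sketch of the FIND_ZERO macro in strlen-sse2.S: build a 64-bit
   mask with one bit per byte of a 64-byte block, set where that byte
   is zero.  'p' must point to 64 readable bytes; the real code uses
   aligned loads, _mm_loadu_si128 just keeps the sketch simple.  */
static inline uint64_t
find_zero_mask64 (const unsigned char *p)
{
  const __m128i zero = _mm_setzero_si128 ();
  uint64_t m0 = (uint16_t) _mm_movemask_epi8
    (_mm_cmpeq_epi8 (_mm_loadu_si128 ((const __m128i *) (p + 0)), zero));
  uint64_t m1 = (uint16_t) _mm_movemask_epi8
    (_mm_cmpeq_epi8 (_mm_loadu_si128 ((const __m128i *) (p + 16)), zero));
  uint64_t m2 = (uint16_t) _mm_movemask_epi8
    (_mm_cmpeq_epi8 (_mm_loadu_si128 ((const __m128i *) (p + 32)), zero));
  uint64_t m3 = (uint16_t) _mm_movemask_epi8
    (_mm_cmpeq_epi8 (_mm_loadu_si128 ((const __m128i *) (p + 48)), zero));
  /* Same layout as the assembly builds in %rdx.  */
  return m0 | (m1 << 16) | (m2 << 32) | (m3 << 48);
}
```

On a non-zero mask, __builtin_ctzll gives the offset of the first NUL inside the block, which is what the bsfq (plus SHIFT_RETURN in the wide-character variants) computes. The aligned main loop folds the four loads with PMINU first and only runs the full FIND_ZERO once a zero byte is known to be present.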
Carlos,

Any issue with pushing the "move <func> SSE2 implementation to multiarch/<func>-sse2.S" commits? The follow-on patches on patchwork:

https://patchwork.sourceware.org/project/glibc/patch/20220712192910.351121-8-goldstein.w.n@gmail.com/
https://patchwork.sourceware.org/project/glibc/patch/20220712192910.351121-7-goldstein.w.n@gmail.com/
https://patchwork.sourceware.org/project/glibc/patch/20220712192910.351121-6-goldstein.w.n@gmail.com/
https://patchwork.sourceware.org/project/glibc/patch/20220712192910.351121-5-goldstein.w.n@gmail.com/
https://patchwork.sourceware.org/project/glibc/patch/20220712192910.351121-4-goldstein.w.n@gmail.com/
https://patchwork.sourceware.org/project/glibc/patch/20220712192910.351121-3-goldstein.w.n@gmail.com/
https://patchwork.sourceware.org/project/glibc/patch/20220712192910.351121-2-goldstein.w.n@gmail.com/
https://patchwork.sourceware.org/project/glibc/patch/20220712192910.351121-1-goldstein.w.n@gmail.com/
https://patchwork.sourceware.org/project/glibc/patch/20220712192808.335531-4-goldstein.w.n@gmail.com/
https://patchwork.sourceware.org/project/glibc/patch/20220712192808.335531-3-goldstein.w.n@gmail.com/
https://patchwork.sourceware.org/project/glibc/patch/20220712192808.335531-2-goldstein.w.n@gmail.com/

They are necessary for an upcoming ISA-raising patch that I hope to get into 2.36, but it may be too near the release date for such large changes.

On Tue, Jul 12, 2022 at 4:30 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> LGTM.
>
> Thanks.
>
> --
> H.J.
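One detail of the moved code worth spelling out before the full diff below: the aligned paths compute the zero-byte mask over a block that starts before the string (the pointer is rounded down to 16 or 64 bytes), so the PROLOG macro discards the bits belonging to bytes in front of s by shifting the mask right by the misalignment (xorq %rax, %rcx recovers it, then sarq %cl, %rdx) before the bsfq. A self-contained C sketch of that trick for a single 16-byte block (illustrative only; reading below s is legitimate in the assembly because an aligned load cannot cross a page boundary, but this is not portable C):

```c
#include <emmintrin.h>  /* SSE2 intrinsics */
#include <stdint.h>

/* C sketch of the PROLOG trick in strlen-sse2.S: scan the aligned
   16-byte block containing 's', then shift out the match bits for
   the bytes that precede 's'.  Returns the offset of the first NUL
   from 's', or -1 if this block has none at or after 's'.  */
static inline int
first_nul_in_aligned_block (const char *s)
{
  unsigned int misalign = (unsigned int) ((uintptr_t) s & 15);
  const __m128i *block = (const __m128i *) ((uintptr_t) s - misalign);
  __m128i cmp = _mm_cmpeq_epi8 (_mm_load_si128 (block),
                                _mm_setzero_si128 ());
  unsigned int mask = (unsigned int) _mm_movemask_epi8 (cmp);
  mask >>= misalign;              /* drop bytes that precede 's' */
  if (mask == 0)
    return -1;
  return __builtin_ctz (mask);    /* bsf: offset of the NUL from 's' */
}
```

In the strnlen variants the same mask also encodes the length limit: bts effectively plants a bit at the limit's position within the block, so a single bsfq covers both "found a NUL" and "hit maxlen".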
diff --git a/sysdeps/x86_64/multiarch/rtld-strlen.S b/sysdeps/x86_64/multiarch/rtld-strlen.S new file mode 100644 index 0000000000..609d26256e --- /dev/null +++ b/sysdeps/x86_64/multiarch/rtld-strlen.S @@ -0,0 +1,18 @@ +/* Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "../strlen.S" diff --git a/sysdeps/x86_64/multiarch/rtld-strnlen.S b/sysdeps/x86_64/multiarch/rtld-strnlen.S new file mode 100644 index 0000000000..ef2d64abc2 --- /dev/null +++ b/sysdeps/x86_64/multiarch/rtld-strnlen.S @@ -0,0 +1,18 @@ +/* Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "../strnlen.S" diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S index 660b327ed2..5be72267d5 100644 --- a/sysdeps/x86_64/multiarch/strlen-sse2.S +++ b/sysdeps/x86_64/multiarch/strlen-sse2.S @@ -16,8 +16,260 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#if IS_IN (libc) -# define strlen __strlen_sse2 -#endif +#if IS_IN (libc) || defined STRLEN + +# ifndef STRLEN +# define STRLEN __strlen_sse2 +# endif + + +# include <sysdep.h> + +# ifdef AS_WCSLEN +# define PMINU pminud +# define PCMPEQ pcmpeqd +# define SHIFT_RETURN shrq $2, %rax +# else +# define PMINU pminub +# define PCMPEQ pcmpeqb +# define SHIFT_RETURN +# endif + +# ifndef SECTION +# define SECTION(p) p +# endif + +/* Long lived register in strlen(s), strnlen(s, n) are: + + %xmm3 - zero + %rdi - s + %r10 (s+n) & (~(64-1)) + %r11 s+n +*/ + + + .section SECTION(.text),"ax",@progbits +ENTRY(STRLEN) + +/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */ +# define FIND_ZERO \ + PCMPEQ (%rax), %xmm0; \ + PCMPEQ 16(%rax), %xmm1; \ + PCMPEQ 32(%rax), %xmm2; \ + PCMPEQ 48(%rax), %xmm3; \ + pmovmskb %xmm0, %esi; \ + pmovmskb %xmm1, %edx; \ + pmovmskb %xmm2, %r8d; \ + pmovmskb %xmm3, %ecx; \ + salq $16, %rdx; \ + salq $16, %rcx; \ + orq %rsi, %rdx; \ + orq %r8, %rcx; \ + salq $32, %rcx; \ + orq %rcx, %rdx; + +# ifdef AS_STRNLEN +/* Do not read anything when n==0. 
*/ + test %RSI_LP, %RSI_LP + jne L(n_nonzero) + xor %rax, %rax + ret +L(n_nonzero): +# ifdef AS_WCSLEN +/* Check for overflow from maxlen * sizeof(wchar_t). If it would + overflow the only way this program doesn't have undefined behavior + is if there is a null terminator in valid memory so wcslen will + suffice. */ + mov %RSI_LP, %R10_LP + sar $62, %R10_LP + jnz __wcslen_sse4_1 + sal $2, %RSI_LP +# endif + +/* Initialize long lived registers. */ + add %RDI_LP, %RSI_LP + mov %RSI_LP, %R10_LP + and $-64, %R10_LP + mov %RSI_LP, %R11_LP +# endif + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + movq %rdi, %rax + movq %rdi, %rcx + andq $4095, %rcx +/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */ + cmpq $4047, %rcx +/* We cannot unify this branching as it would be ~6 cycles slower. */ + ja L(cross_page) + +# ifdef AS_STRNLEN +/* Test if end is among first 64 bytes. */ +# define STRNLEN_PROLOG \ + mov %r11, %rsi; \ + subq %rax, %rsi; \ + andq $-64, %rax; \ + testq $-64, %rsi; \ + je L(strnlen_ret) +# else +# define STRNLEN_PROLOG andq $-64, %rax; +# endif + +/* Ignore bits in mask that come before start of string. */ +# define PROLOG(lab) \ + movq %rdi, %rcx; \ + xorq %rax, %rcx; \ + STRNLEN_PROLOG; \ + sarq %cl, %rdx; \ + test %rdx, %rdx; \ + je L(lab); \ + bsfq %rdx, %rax; \ + SHIFT_RETURN; \ + ret + +# ifdef AS_STRNLEN + andq $-16, %rax + FIND_ZERO +# else + /* Test first 16 bytes unaligned. */ + movdqu (%rax), %xmm4 + PCMPEQ %xmm0, %xmm4 + pmovmskb %xmm4, %edx + test %edx, %edx + je L(next48_bytes) + bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */ + SHIFT_RETURN + ret + +L(next48_bytes): +/* Same as FIND_ZERO except we do not check first 16 bytes. */ + andq $-16, %rax + PCMPEQ 16(%rax), %xmm1 + PCMPEQ 32(%rax), %xmm2 + PCMPEQ 48(%rax), %xmm3 + pmovmskb %xmm1, %edx + pmovmskb %xmm2, %r8d + pmovmskb %xmm3, %ecx + salq $16, %rdx + salq $16, %rcx + orq %r8, %rcx + salq $32, %rcx + orq %rcx, %rdx +# endif -#include "strlen-vec.S" + /* When no zero byte is found xmm1-3 are zero so we do not have to + zero them. */ + PROLOG(loop) + + .p2align 4 +L(cross_page): + andq $-64, %rax + FIND_ZERO + PROLOG(loop_init) + +# ifdef AS_STRNLEN +/* We must do this check to correctly handle strnlen (s, -1). */ +L(strnlen_ret): + bts %rsi, %rdx + sarq %cl, %rdx + test %rdx, %rdx + je L(loop_init) + bsfq %rdx, %rax + SHIFT_RETURN + ret +# endif + .p2align 4 +L(loop_init): + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 +# ifdef AS_STRNLEN + .p2align 4 +L(loop): + + addq $64, %rax + cmpq %rax, %r10 + je L(exit_end) + + movdqa (%rax), %xmm0 + PMINU 16(%rax), %xmm0 + PMINU 32(%rax), %xmm0 + PMINU 48(%rax), %xmm0 + PCMPEQ %xmm3, %xmm0 + pmovmskb %xmm0, %edx + testl %edx, %edx + jne L(exit) + jmp L(loop) + + .p2align 4 +L(exit_end): + cmp %rax, %r11 + je L(first) /* Do not read when end is at page boundary. */ + pxor %xmm0, %xmm0 + FIND_ZERO + +L(first): + bts %r11, %rdx + bsfq %rdx, %rdx + addq %rdx, %rax + subq %rdi, %rax + SHIFT_RETURN + ret + + .p2align 4 +L(exit): + pxor %xmm0, %xmm0 + FIND_ZERO + + bsfq %rdx, %rdx + addq %rdx, %rax + subq %rdi, %rax + SHIFT_RETURN + ret + +# else + + /* Main loop. Unrolled twice to improve L2 cache performance on core2. 
*/ + .p2align 4 +L(loop): + + movdqa 64(%rax), %xmm0 + PMINU 80(%rax), %xmm0 + PMINU 96(%rax), %xmm0 + PMINU 112(%rax), %xmm0 + PCMPEQ %xmm3, %xmm0 + pmovmskb %xmm0, %edx + testl %edx, %edx + jne L(exit64) + + subq $-128, %rax + + movdqa (%rax), %xmm0 + PMINU 16(%rax), %xmm0 + PMINU 32(%rax), %xmm0 + PMINU 48(%rax), %xmm0 + PCMPEQ %xmm3, %xmm0 + pmovmskb %xmm0, %edx + testl %edx, %edx + jne L(exit0) + jmp L(loop) + + .p2align 4 +L(exit64): + addq $64, %rax +L(exit0): + pxor %xmm0, %xmm0 + FIND_ZERO + + bsfq %rdx, %rdx + addq %rdx, %rax + subq %rdi, %rax + SHIFT_RETURN + ret + +# endif + +END(STRLEN) +#endif diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S deleted file mode 100644 index 874123d604..0000000000 --- a/sysdeps/x86_64/multiarch/strlen-vec.S +++ /dev/null @@ -1,267 +0,0 @@ -/* SSE2 version of strlen and SSE4.1 version of wcslen. - Copyright (C) 2012-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -#ifdef AS_WCSLEN -# define PMINU pminud -# define PCMPEQ pcmpeqd -# define SHIFT_RETURN shrq $2, %rax -#else -# define PMINU pminub -# define PCMPEQ pcmpeqb -# define SHIFT_RETURN -#endif - -#ifndef SECTION -# define SECTION(p) p -#endif - -/* Long lived register in strlen(s), strnlen(s, n) are: - - %xmm3 - zero - %rdi - s - %r10 (s+n) & (~(64-1)) - %r11 s+n -*/ - - - .section SECTION(.text),"ax",@progbits -ENTRY(strlen) - -/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */ -#define FIND_ZERO \ - PCMPEQ (%rax), %xmm0; \ - PCMPEQ 16(%rax), %xmm1; \ - PCMPEQ 32(%rax), %xmm2; \ - PCMPEQ 48(%rax), %xmm3; \ - pmovmskb %xmm0, %esi; \ - pmovmskb %xmm1, %edx; \ - pmovmskb %xmm2, %r8d; \ - pmovmskb %xmm3, %ecx; \ - salq $16, %rdx; \ - salq $16, %rcx; \ - orq %rsi, %rdx; \ - orq %r8, %rcx; \ - salq $32, %rcx; \ - orq %rcx, %rdx; - -#ifdef AS_STRNLEN -/* Do not read anything when n==0. */ - test %RSI_LP, %RSI_LP - jne L(n_nonzero) - xor %rax, %rax - ret -L(n_nonzero): -# ifdef AS_WCSLEN -/* Check for overflow from maxlen * sizeof(wchar_t). If it would - overflow the only way this program doesn't have undefined behavior - is if there is a null terminator in valid memory so wcslen will - suffice. */ - mov %RSI_LP, %R10_LP - sar $62, %R10_LP - jnz __wcslen_sse4_1 - sal $2, %RSI_LP -# endif - -/* Initialize long lived registers. */ - add %RDI_LP, %RSI_LP - mov %RSI_LP, %R10_LP - and $-64, %R10_LP - mov %RSI_LP, %R11_LP -#endif - - pxor %xmm0, %xmm0 - pxor %xmm1, %xmm1 - pxor %xmm2, %xmm2 - pxor %xmm3, %xmm3 - movq %rdi, %rax - movq %rdi, %rcx - andq $4095, %rcx -/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */ - cmpq $4047, %rcx -/* We cannot unify this branching as it would be ~6 cycles slower. */ - ja L(cross_page) - -#ifdef AS_STRNLEN -/* Test if end is among first 64 bytes. 
*/ -# define STRNLEN_PROLOG \ - mov %r11, %rsi; \ - subq %rax, %rsi; \ - andq $-64, %rax; \ - testq $-64, %rsi; \ - je L(strnlen_ret) -#else -# define STRNLEN_PROLOG andq $-64, %rax; -#endif - -/* Ignore bits in mask that come before start of string. */ -#define PROLOG(lab) \ - movq %rdi, %rcx; \ - xorq %rax, %rcx; \ - STRNLEN_PROLOG; \ - sarq %cl, %rdx; \ - test %rdx, %rdx; \ - je L(lab); \ - bsfq %rdx, %rax; \ - SHIFT_RETURN; \ - ret - -#ifdef AS_STRNLEN - andq $-16, %rax - FIND_ZERO -#else - /* Test first 16 bytes unaligned. */ - movdqu (%rax), %xmm4 - PCMPEQ %xmm0, %xmm4 - pmovmskb %xmm4, %edx - test %edx, %edx - je L(next48_bytes) - bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */ - SHIFT_RETURN - ret - -L(next48_bytes): -/* Same as FIND_ZERO except we do not check first 16 bytes. */ - andq $-16, %rax - PCMPEQ 16(%rax), %xmm1 - PCMPEQ 32(%rax), %xmm2 - PCMPEQ 48(%rax), %xmm3 - pmovmskb %xmm1, %edx - pmovmskb %xmm2, %r8d - pmovmskb %xmm3, %ecx - salq $16, %rdx - salq $16, %rcx - orq %r8, %rcx - salq $32, %rcx - orq %rcx, %rdx -#endif - - /* When no zero byte is found xmm1-3 are zero so we do not have to - zero them. */ - PROLOG(loop) - - .p2align 4 -L(cross_page): - andq $-64, %rax - FIND_ZERO - PROLOG(loop_init) - -#ifdef AS_STRNLEN -/* We must do this check to correctly handle strnlen (s, -1). */ -L(strnlen_ret): - bts %rsi, %rdx - sarq %cl, %rdx - test %rdx, %rdx - je L(loop_init) - bsfq %rdx, %rax - SHIFT_RETURN - ret -#endif - .p2align 4 -L(loop_init): - pxor %xmm1, %xmm1 - pxor %xmm2, %xmm2 - pxor %xmm3, %xmm3 -#ifdef AS_STRNLEN - .p2align 4 -L(loop): - - addq $64, %rax - cmpq %rax, %r10 - je L(exit_end) - - movdqa (%rax), %xmm0 - PMINU 16(%rax), %xmm0 - PMINU 32(%rax), %xmm0 - PMINU 48(%rax), %xmm0 - PCMPEQ %xmm3, %xmm0 - pmovmskb %xmm0, %edx - testl %edx, %edx - jne L(exit) - jmp L(loop) - - .p2align 4 -L(exit_end): - cmp %rax, %r11 - je L(first) /* Do not read when end is at page boundary. */ - pxor %xmm0, %xmm0 - FIND_ZERO - -L(first): - bts %r11, %rdx - bsfq %rdx, %rdx - addq %rdx, %rax - subq %rdi, %rax - SHIFT_RETURN - ret - - .p2align 4 -L(exit): - pxor %xmm0, %xmm0 - FIND_ZERO - - bsfq %rdx, %rdx - addq %rdx, %rax - subq %rdi, %rax - SHIFT_RETURN - ret - -#else - - /* Main loop. Unrolled twice to improve L2 cache performance on core2. */ - .p2align 4 -L(loop): - - movdqa 64(%rax), %xmm0 - PMINU 80(%rax), %xmm0 - PMINU 96(%rax), %xmm0 - PMINU 112(%rax), %xmm0 - PCMPEQ %xmm3, %xmm0 - pmovmskb %xmm0, %edx - testl %edx, %edx - jne L(exit64) - - subq $-128, %rax - - movdqa (%rax), %xmm0 - PMINU 16(%rax), %xmm0 - PMINU 32(%rax), %xmm0 - PMINU 48(%rax), %xmm0 - PCMPEQ %xmm3, %xmm0 - pmovmskb %xmm0, %edx - testl %edx, %edx - jne L(exit0) - jmp L(loop) - - .p2align 4 -L(exit64): - addq $64, %rax -L(exit0): - pxor %xmm0, %xmm0 - FIND_ZERO - - bsfq %rdx, %rdx - addq %rdx, %rax - subq %rdi, %rax - SHIFT_RETURN - ret - -#endif - -END(strlen) diff --git a/sysdeps/x86_64/multiarch/strnlen-sse2.S b/sysdeps/x86_64/multiarch/strnlen-sse2.S index c4f395c210..a50c7d6a28 100644 --- a/sysdeps/x86_64/multiarch/strnlen-sse2.S +++ b/sysdeps/x86_64/multiarch/strnlen-sse2.S @@ -17,12 +17,10 @@ <https://www.gnu.org/licenses/>. 
*/ #if IS_IN (libc) -# define __strnlen __strnlen_sse2 - -# undef weak_alias -# define weak_alias(__strnlen, strnlen) -# undef libc_hidden_builtin_def -# define libc_hidden_builtin_def(strnlen) +# ifndef STRLEN +# define STRLEN __strnlen_sse2 +# endif #endif -#include "../strnlen.S" +#define AS_STRNLEN +#include "strlen-sse2.S" diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S index e306a77f51..c88e8342a1 100644 --- a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S +++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S @@ -1,5 +1,5 @@ #define AS_WCSLEN -#define strlen __wcslen_sse4_1 +#define STRLEN __wcslen_sse4_1 #define SECTION(p) p##.sse4.1 -#include "strlen-vec.S" +#include "strlen-sse2.S" diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S index d2f7dd6e22..17cdedc2a9 100644 --- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S +++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S @@ -1,6 +1,6 @@ #define AS_WCSLEN #define AS_STRNLEN -#define strlen __wcsnlen_sse4_1 +#define STRLEN __wcsnlen_sse4_1 #define SECTION(p) p##.sse4.1 -#include "strlen-vec.S" +#include "strlen-sse2.S" diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S index e1f0b19f2f..c2f5674f8d 100644 --- a/sysdeps/x86_64/strlen.S +++ b/sysdeps/x86_64/strlen.S @@ -16,6 +16,7 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#include "multiarch/strlen-vec.S" +#define STRLEN strlen +#include "multiarch/strlen-sse2.S" libc_hidden_builtin_def (strlen) diff --git a/sysdeps/x86_64/strnlen.S b/sysdeps/x86_64/strnlen.S index d3c43ac482..174970d58f 100644 --- a/sysdeps/x86_64/strnlen.S +++ b/sysdeps/x86_64/strnlen.S @@ -1,6 +1,6 @@ -#define AS_STRNLEN -#define strlen __strnlen -#include "strlen.S" +#define STRLEN __strnlen +#include "multiarch/strnlen-sse2.S" +libc_hidden_def (__strnlen) weak_alias (__strnlen, strnlen); libc_hidden_builtin_def (strnlen)
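The strnlen/wcsnlen specializations built from this template carry the edge cases the comments above call out: maxlen == 0 must return 0 without reading the string, and strnlen (s, -1) must behave exactly like strlen. wcsnlen additionally guards against maxlen * sizeof (wchar_t) overflowing by jumping to __wcslen_sse4_1, and SHIFT_RETURN converts the byte count back to wide characters on return. A minimal scalar reference for the byte-string semantics (illustrative only, not the glibc code), which every specialization produced from strlen-sse2.S has to match wherever the call has defined behavior:

```c
#include <stddef.h>

/* Scalar reference for the semantics the SSE2 strnlen must preserve:
   never dereference 's' when maxlen == 0, and treat a huge maxlen
   such as (size_t) -1 exactly like strlen.  */
static size_t
strnlen_reference (const char *s, size_t maxlen)
{
  size_t i;
  for (i = 0; i < maxlen; i++)
    if (s[i] == '\0')
      break;
  return i;
}
```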