Message ID | 20220712192910.351121-8-goldstein.w.n@gmail.com |
---|---|
State | New |
Headers | show |
Series | [v1] x86: Move wcslen SSE2 implementation to multiarch/wcslen-sse2.S | expand |
On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > This commit doesn't affect libc.so.6, it's just housekeeping to prepare > for adding explicit ISA level support. > > Tested build on x86_64 and x86_32 with/without multiarch. > --- > sysdeps/x86_64/multiarch/wcslen-sse2.S | 221 ++++++++++++++++++++++++- > sysdeps/x86_64/wcslen.S | 216 +----------------------- > 2 files changed, 218 insertions(+), 219 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/wcslen-sse2.S b/sysdeps/x86_64/multiarch/wcslen-sse2.S > index 2b3a9efd64..944c3bd9c6 100644 > --- a/sysdeps/x86_64/multiarch/wcslen-sse2.S > +++ b/sysdeps/x86_64/multiarch/wcslen-sse2.S > @@ -17,10 +17,221 @@ > <https://www.gnu.org/licenses/>. */ > > #if IS_IN (libc) > -# define __wcslen __wcslen_sse2 > - > -# undef weak_alias > -# define weak_alias(__wcslen, wcslen) > +# ifndef WCSLEN > +# define WCSLEN __wcslen_sse2 > +# endif > #endif > > -#include "../wcslen.S" > +#include <sysdep.h> > + > + .text > +ENTRY (WCSLEN) > + cmpl $0, (%rdi) > + jz L(exit_tail0) > + cmpl $0, 4(%rdi) > + jz L(exit_tail1) > + cmpl $0, 8(%rdi) > + jz L(exit_tail2) > + cmpl $0, 12(%rdi) > + jz L(exit_tail3) > + cmpl $0, 16(%rdi) > + jz L(exit_tail4) > + cmpl $0, 20(%rdi) > + jz L(exit_tail5) > + cmpl $0, 24(%rdi) > + jz L(exit_tail6) > + cmpl $0, 28(%rdi) > + jz L(exit_tail7) > + > + pxor %xmm0, %xmm0 > + > + lea 32(%rdi), %rax > + addq $16, %rdi > + and $-16, %rax > + > + pcmpeqd (%rax), %xmm0 > + pmovmskb %xmm0, %edx > + pxor %xmm1, %xmm1 > + addq $16, %rax > + test %edx, %edx > + jnz L(exit) > + > + pcmpeqd (%rax), %xmm1 > + pmovmskb %xmm1, %edx > + pxor %xmm2, %xmm2 > + addq $16, %rax > + test %edx, %edx > + jnz L(exit) > + > + pcmpeqd (%rax), %xmm2 > + pmovmskb %xmm2, %edx > + pxor %xmm3, %xmm3 > + addq $16, %rax > + test %edx, %edx > + jnz L(exit) > + > + pcmpeqd (%rax), %xmm3 > + pmovmskb %xmm3, %edx > + addq $16, %rax > + test %edx, %edx > + jnz L(exit) > + > + pcmpeqd (%rax), %xmm0 > + pmovmskb 
%xmm0, %edx > + addq $16, %rax > + test %edx, %edx > + jnz L(exit) > + > + pcmpeqd (%rax), %xmm1 > + pmovmskb %xmm1, %edx > + addq $16, %rax > + test %edx, %edx > + jnz L(exit) > + > + pcmpeqd (%rax), %xmm2 > + pmovmskb %xmm2, %edx > + addq $16, %rax > + test %edx, %edx > + jnz L(exit) > + > + pcmpeqd (%rax), %xmm3 > + pmovmskb %xmm3, %edx > + addq $16, %rax > + test %edx, %edx > + jnz L(exit) > + > + pcmpeqd (%rax), %xmm0 > + pmovmskb %xmm0, %edx > + addq $16, %rax > + test %edx, %edx > + jnz L(exit) > + > + pcmpeqd (%rax), %xmm1 > + pmovmskb %xmm1, %edx > + addq $16, %rax > + test %edx, %edx > + jnz L(exit) > + > + pcmpeqd (%rax), %xmm2 > + pmovmskb %xmm2, %edx > + addq $16, %rax > + test %edx, %edx > + jnz L(exit) > + > + pcmpeqd (%rax), %xmm3 > + pmovmskb %xmm3, %edx > + addq $16, %rax > + test %edx, %edx > + jnz L(exit) > + > + and $-0x40, %rax > + > + .p2align 4 > +L(aligned_64_loop): > + movaps (%rax), %xmm0 > + movaps 16(%rax), %xmm1 > + movaps 32(%rax), %xmm2 > + movaps 48(%rax), %xmm6 > + > + pminub %xmm1, %xmm0 > + pminub %xmm6, %xmm2 > + pminub %xmm0, %xmm2 > + pcmpeqd %xmm3, %xmm2 > + pmovmskb %xmm2, %edx > + addq $64, %rax > + test %edx, %edx > + jz L(aligned_64_loop) > + > + pcmpeqd -64(%rax), %xmm3 > + pmovmskb %xmm3, %edx > + addq $48, %rdi > + test %edx, %edx > + jnz L(exit) > + > + pcmpeqd %xmm1, %xmm3 > + pmovmskb %xmm3, %edx > + addq $-16, %rdi > + test %edx, %edx > + jnz L(exit) > + > + pcmpeqd -32(%rax), %xmm3 > + pmovmskb %xmm3, %edx > + addq $-16, %rdi > + test %edx, %edx > + jnz L(exit) > + > + pcmpeqd %xmm6, %xmm3 > + pmovmskb %xmm3, %edx > + addq $-16, %rdi > + test %edx, %edx > + jz L(aligned_64_loop) > + > + .p2align 4 > +L(exit): > + sub %rdi, %rax > + shr $2, %rax > + test %dl, %dl > + jz L(exit_high) > + > + andl $15, %edx > + jz L(exit_1) > + ret > + > + /* No align here. Naturally aligned % 16 == 1. 
*/ > +L(exit_high): > + andl $(15 << 8), %edx > + jz L(exit_3) > + add $2, %rax > + ret > + > + .p2align 3 > +L(exit_1): > + add $1, %rax > + ret > + > + .p2align 3 > +L(exit_3): > + add $3, %rax > + ret > + > + .p2align 3 > +L(exit_tail0): > + xorl %eax, %eax > + ret > + > + .p2align 3 > +L(exit_tail1): > + movl $1, %eax > + ret > + > + .p2align 3 > +L(exit_tail2): > + movl $2, %eax > + ret > + > + .p2align 3 > +L(exit_tail3): > + movl $3, %eax > + ret > + > + .p2align 3 > +L(exit_tail4): > + movl $4, %eax > + ret > + > + .p2align 3 > +L(exit_tail5): > + movl $5, %eax > + ret > + > + .p2align 3 > +L(exit_tail6): > + movl $6, %eax > + ret > + > + .p2align 3 > +L(exit_tail7): > + movl $7, %eax > + ret > + > +END (WCSLEN) > diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S > index d641141d75..588a0fbe01 100644 > --- a/sysdeps/x86_64/wcslen.S > +++ b/sysdeps/x86_64/wcslen.S > @@ -16,218 +16,6 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. 
*/ > > -#include <sysdep.h> > - > - .text > -ENTRY (__wcslen) > - cmpl $0, (%rdi) > - jz L(exit_tail0) > - cmpl $0, 4(%rdi) > - jz L(exit_tail1) > - cmpl $0, 8(%rdi) > - jz L(exit_tail2) > - cmpl $0, 12(%rdi) > - jz L(exit_tail3) > - cmpl $0, 16(%rdi) > - jz L(exit_tail4) > - cmpl $0, 20(%rdi) > - jz L(exit_tail5) > - cmpl $0, 24(%rdi) > - jz L(exit_tail6) > - cmpl $0, 28(%rdi) > - jz L(exit_tail7) > - > - pxor %xmm0, %xmm0 > - > - lea 32(%rdi), %rax > - addq $16, %rdi > - and $-16, %rax > - > - pcmpeqd (%rax), %xmm0 > - pmovmskb %xmm0, %edx > - pxor %xmm1, %xmm1 > - addq $16, %rax > - test %edx, %edx > - jnz L(exit) > - > - pcmpeqd (%rax), %xmm1 > - pmovmskb %xmm1, %edx > - pxor %xmm2, %xmm2 > - addq $16, %rax > - test %edx, %edx > - jnz L(exit) > - > - pcmpeqd (%rax), %xmm2 > - pmovmskb %xmm2, %edx > - pxor %xmm3, %xmm3 > - addq $16, %rax > - test %edx, %edx > - jnz L(exit) > - > - pcmpeqd (%rax), %xmm3 > - pmovmskb %xmm3, %edx > - addq $16, %rax > - test %edx, %edx > - jnz L(exit) > - > - pcmpeqd (%rax), %xmm0 > - pmovmskb %xmm0, %edx > - addq $16, %rax > - test %edx, %edx > - jnz L(exit) > - > - pcmpeqd (%rax), %xmm1 > - pmovmskb %xmm1, %edx > - addq $16, %rax > - test %edx, %edx > - jnz L(exit) > - > - pcmpeqd (%rax), %xmm2 > - pmovmskb %xmm2, %edx > - addq $16, %rax > - test %edx, %edx > - jnz L(exit) > - > - pcmpeqd (%rax), %xmm3 > - pmovmskb %xmm3, %edx > - addq $16, %rax > - test %edx, %edx > - jnz L(exit) > - > - pcmpeqd (%rax), %xmm0 > - pmovmskb %xmm0, %edx > - addq $16, %rax > - test %edx, %edx > - jnz L(exit) > - > - pcmpeqd (%rax), %xmm1 > - pmovmskb %xmm1, %edx > - addq $16, %rax > - test %edx, %edx > - jnz L(exit) > - > - pcmpeqd (%rax), %xmm2 > - pmovmskb %xmm2, %edx > - addq $16, %rax > - test %edx, %edx > - jnz L(exit) > - > - pcmpeqd (%rax), %xmm3 > - pmovmskb %xmm3, %edx > - addq $16, %rax > - test %edx, %edx > - jnz L(exit) > - > - and $-0x40, %rax > - > - .p2align 4 > -L(aligned_64_loop): > - movaps (%rax), %xmm0 > - movaps 16(%rax), %xmm1 > 
- movaps 32(%rax), %xmm2 > - movaps 48(%rax), %xmm6 > - > - pminub %xmm1, %xmm0 > - pminub %xmm6, %xmm2 > - pminub %xmm0, %xmm2 > - pcmpeqd %xmm3, %xmm2 > - pmovmskb %xmm2, %edx > - addq $64, %rax > - test %edx, %edx > - jz L(aligned_64_loop) > - > - pcmpeqd -64(%rax), %xmm3 > - pmovmskb %xmm3, %edx > - addq $48, %rdi > - test %edx, %edx > - jnz L(exit) > - > - pcmpeqd %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - addq $-16, %rdi > - test %edx, %edx > - jnz L(exit) > - > - pcmpeqd -32(%rax), %xmm3 > - pmovmskb %xmm3, %edx > - addq $-16, %rdi > - test %edx, %edx > - jnz L(exit) > - > - pcmpeqd %xmm6, %xmm3 > - pmovmskb %xmm3, %edx > - addq $-16, %rdi > - test %edx, %edx > - jz L(aligned_64_loop) > - > - .p2align 4 > -L(exit): > - sub %rdi, %rax > - shr $2, %rax > - test %dl, %dl > - jz L(exit_high) > - > - andl $15, %edx > - jz L(exit_1) > - ret > - > - /* No align here. Naturally aligned % 16 == 1. */ > -L(exit_high): > - andl $(15 << 8), %edx > - jz L(exit_3) > - add $2, %rax > - ret > - > - .p2align 3 > -L(exit_1): > - add $1, %rax > - ret > - > - .p2align 3 > -L(exit_3): > - add $3, %rax > - ret > - > - .p2align 3 > -L(exit_tail0): > - xorl %eax, %eax > - ret > - > - .p2align 3 > -L(exit_tail1): > - movl $1, %eax > - ret > - > - .p2align 3 > -L(exit_tail2): > - movl $2, %eax > - ret > - > - .p2align 3 > -L(exit_tail3): > - movl $3, %eax > - ret > - > - .p2align 3 > -L(exit_tail4): > - movl $4, %eax > - ret > - > - .p2align 3 > -L(exit_tail5): > - movl $5, %eax > - ret > - > - .p2align 3 > -L(exit_tail6): > - movl $6, %eax > - ret > - > - .p2align 3 > -L(exit_tail7): > - movl $7, %eax > - ret > - > -END (__wcslen) > - > +#define WCSLEN __wcslen > +#include "multiarch/wcslen-sse2.S" > weak_alias(__wcslen, wcslen) > -- > 2.34.1 > LGTM. Thanks.
diff --git a/sysdeps/x86_64/multiarch/wcslen-sse2.S b/sysdeps/x86_64/multiarch/wcslen-sse2.S index 2b3a9efd64..944c3bd9c6 100644 --- a/sysdeps/x86_64/multiarch/wcslen-sse2.S +++ b/sysdeps/x86_64/multiarch/wcslen-sse2.S @@ -17,10 +17,221 @@ <https://www.gnu.org/licenses/>. */ #if IS_IN (libc) -# define __wcslen __wcslen_sse2 - -# undef weak_alias -# define weak_alias(__wcslen, wcslen) +# ifndef WCSLEN +# define WCSLEN __wcslen_sse2 +# endif #endif -#include "../wcslen.S" +#include <sysdep.h> + + .text +ENTRY (WCSLEN) + cmpl $0, (%rdi) + jz L(exit_tail0) + cmpl $0, 4(%rdi) + jz L(exit_tail1) + cmpl $0, 8(%rdi) + jz L(exit_tail2) + cmpl $0, 12(%rdi) + jz L(exit_tail3) + cmpl $0, 16(%rdi) + jz L(exit_tail4) + cmpl $0, 20(%rdi) + jz L(exit_tail5) + cmpl $0, 24(%rdi) + jz L(exit_tail6) + cmpl $0, 28(%rdi) + jz L(exit_tail7) + + pxor %xmm0, %xmm0 + + lea 32(%rdi), %rax + addq $16, %rdi + and $-16, %rax + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx + 
addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + and $-0x40, %rax + + .p2align 4 +L(aligned_64_loop): + movaps (%rax), %xmm0 + movaps 16(%rax), %xmm1 + movaps 32(%rax), %xmm2 + movaps 48(%rax), %xmm6 + + pminub %xmm1, %xmm0 + pminub %xmm6, %xmm2 + pminub %xmm0, %xmm2 + pcmpeqd %xmm3, %xmm2 + pmovmskb %xmm2, %edx + addq $64, %rax + test %edx, %edx + jz L(aligned_64_loop) + + pcmpeqd -64(%rax), %xmm3 + pmovmskb %xmm3, %edx + addq $48, %rdi + test %edx, %edx + jnz L(exit) + + pcmpeqd %xmm1, %xmm3 + pmovmskb %xmm3, %edx + addq $-16, %rdi + test %edx, %edx + jnz L(exit) + + pcmpeqd -32(%rax), %xmm3 + pmovmskb %xmm3, %edx + addq $-16, %rdi + test %edx, %edx + jnz L(exit) + + pcmpeqd %xmm6, %xmm3 + pmovmskb %xmm3, %edx + addq $-16, %rdi + test %edx, %edx + jz L(aligned_64_loop) + + .p2align 4 +L(exit): + sub %rdi, %rax + shr $2, %rax + test %dl, %dl + jz L(exit_high) + + andl $15, %edx + jz L(exit_1) + ret + + /* No align here. Naturally aligned % 16 == 1. */ +L(exit_high): + andl $(15 << 8), %edx + jz L(exit_3) + add $2, %rax + ret + + .p2align 3 +L(exit_1): + add $1, %rax + ret + + .p2align 3 +L(exit_3): + add $3, %rax + ret + + .p2align 3 +L(exit_tail0): + xorl %eax, %eax + ret + + .p2align 3 +L(exit_tail1): + movl $1, %eax + ret + + .p2align 3 +L(exit_tail2): + movl $2, %eax + ret + + .p2align 3 +L(exit_tail3): + movl $3, %eax + ret + + .p2align 3 +L(exit_tail4): + movl $4, %eax + ret + + .p2align 3 +L(exit_tail5): + movl $5, %eax + ret + + .p2align 3 +L(exit_tail6): + movl $6, %eax + ret + + .p2align 3 +L(exit_tail7): + movl $7, %eax + ret + +END (WCSLEN) diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S index d641141d75..588a0fbe01 100644 --- a/sysdeps/x86_64/wcslen.S +++ b/sysdeps/x86_64/wcslen.S @@ -16,218 +16,6 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. 
*/ -#include <sysdep.h> - - .text -ENTRY (__wcslen) - cmpl $0, (%rdi) - jz L(exit_tail0) - cmpl $0, 4(%rdi) - jz L(exit_tail1) - cmpl $0, 8(%rdi) - jz L(exit_tail2) - cmpl $0, 12(%rdi) - jz L(exit_tail3) - cmpl $0, 16(%rdi) - jz L(exit_tail4) - cmpl $0, 20(%rdi) - jz L(exit_tail5) - cmpl $0, 24(%rdi) - jz L(exit_tail6) - cmpl $0, 28(%rdi) - jz L(exit_tail7) - - pxor %xmm0, %xmm0 - - lea 32(%rdi), %rax - addq $16, %rdi - and $-16, %rax - - pcmpeqd (%rax), %xmm0 - pmovmskb %xmm0, %edx - pxor %xmm1, %xmm1 - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm1 - pmovmskb %xmm1, %edx - pxor %xmm2, %xmm2 - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm2 - pmovmskb %xmm2, %edx - pxor %xmm3, %xmm3 - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm3 - pmovmskb %xmm3, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm0 - pmovmskb %xmm0, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm1 - pmovmskb %xmm1, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm2 - pmovmskb %xmm2, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm3 - pmovmskb %xmm3, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm0 - pmovmskb %xmm0, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm1 - pmovmskb %xmm1, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm2 - pmovmskb %xmm2, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm3 - pmovmskb %xmm3, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - and $-0x40, %rax - - .p2align 4 -L(aligned_64_loop): - movaps (%rax), %xmm0 - movaps 16(%rax), %xmm1 - movaps 32(%rax), %xmm2 - movaps 48(%rax), %xmm6 - - pminub %xmm1, %xmm0 - pminub %xmm6, %xmm2 - pminub %xmm0, %xmm2 - pcmpeqd %xmm3, %xmm2 - pmovmskb %xmm2, %edx - addq $64, %rax - test %edx, %edx - jz 
L(aligned_64_loop) - - pcmpeqd -64(%rax), %xmm3 - pmovmskb %xmm3, %edx - addq $48, %rdi - test %edx, %edx - jnz L(exit) - - pcmpeqd %xmm1, %xmm3 - pmovmskb %xmm3, %edx - addq $-16, %rdi - test %edx, %edx - jnz L(exit) - - pcmpeqd -32(%rax), %xmm3 - pmovmskb %xmm3, %edx - addq $-16, %rdi - test %edx, %edx - jnz L(exit) - - pcmpeqd %xmm6, %xmm3 - pmovmskb %xmm3, %edx - addq $-16, %rdi - test %edx, %edx - jz L(aligned_64_loop) - - .p2align 4 -L(exit): - sub %rdi, %rax - shr $2, %rax - test %dl, %dl - jz L(exit_high) - - andl $15, %edx - jz L(exit_1) - ret - - /* No align here. Naturally aligned % 16 == 1. */ -L(exit_high): - andl $(15 << 8), %edx - jz L(exit_3) - add $2, %rax - ret - - .p2align 3 -L(exit_1): - add $1, %rax - ret - - .p2align 3 -L(exit_3): - add $3, %rax - ret - - .p2align 3 -L(exit_tail0): - xorl %eax, %eax - ret - - .p2align 3 -L(exit_tail1): - movl $1, %eax - ret - - .p2align 3 -L(exit_tail2): - movl $2, %eax - ret - - .p2align 3 -L(exit_tail3): - movl $3, %eax - ret - - .p2align 3 -L(exit_tail4): - movl $4, %eax - ret - - .p2align 3 -L(exit_tail5): - movl $5, %eax - ret - - .p2align 3 -L(exit_tail6): - movl $6, %eax - ret - - .p2align 3 -L(exit_tail7): - movl $7, %eax - ret - -END (__wcslen) - +#define WCSLEN __wcslen +#include "multiarch/wcslen-sse2.S" weak_alias(__wcslen, wcslen)