Message ID | 20220712192910.351121-7-goldstein.w.n@gmail.com |
---|---|
State | New |
Headers | show |
Series | [v1] x86: Move wcschr SSE2 implementation to multiarch/wcschr-sse2.S | expand |
On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > This commit doesn't affect libc.so.6, it's just housekeeping to prepare > for adding explicit ISA level support. > > Tested build on x86_64 and x86_32 with/without multiarch. > --- > sysdeps/x86_64/multiarch/wcschr-sse2.S | 145 +++++++++++++++++++++++-- > sysdeps/x86_64/wcschr.S | 135 +---------------------- > 2 files changed, 138 insertions(+), 142 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/wcschr-sse2.S b/sysdeps/x86_64/multiarch/wcschr-sse2.S > index 218ea609b9..c872926ba9 100644 > --- a/sysdeps/x86_64/multiarch/wcschr-sse2.S > +++ b/sysdeps/x86_64/multiarch/wcschr-sse2.S > @@ -17,14 +17,141 @@ > <https://www.gnu.org/licenses/>. */ > > #if IS_IN (libc) > -# define __wcschr __wcschr_sse2 > - > -# undef weak_alias > -# define weak_alias(__wcschr, wcschr) > -# undef libc_hidden_def > -# define libc_hidden_def(__wcschr) > -# undef libc_hidden_weak > -# define libc_hidden_weak(wcschr) > +# ifndef WCSCHR > +# define WCSCHR __wcschr_sse2 > +# endif > #endif > > -#include "../wcschr.S" > +#include <sysdep.h> > + > + .text > +ENTRY (WCSCHR) > + > + movd %rsi, %xmm1 > + pxor %xmm2, %xmm2 > + mov %rdi, %rcx > + punpckldq %xmm1, %xmm1 > + punpckldq %xmm1, %xmm1 > + > + and $63, %rcx > + cmp $48, %rcx > + ja L(cross_cache) > + > + movdqu (%rdi), %xmm0 > + pcmpeqd %xmm0, %xmm2 > + add $16, %rdi > + pcmpeqd %xmm1, %xmm0 > + pmovmskb %xmm2, %rdx > + pmovmskb %xmm0, %rax > + or %rax, %rdx > + jnz L(matches) > + > + and $-16, %rdi > + > + movdqa (%rdi), %xmm0 > + pcmpeqd %xmm0, %xmm2 > + add $16, %rdi > + pcmpeqd %xmm1, %xmm0 > + pmovmskb %xmm2, %rdx > + pmovmskb %xmm0, %rax > + or %rax, %rdx > + jnz L(matches) > + > + jmp L(loop) > + > +L(cross_cache): > + and $15, %rcx > + and $-16, %rdi > + movdqa (%rdi), %xmm0 > + pcmpeqd %xmm0, %xmm2 > + pcmpeqd %xmm1, %xmm0 > + pmovmskb %xmm2, %rdx > + pmovmskb %xmm0, %rax > + > + sar %cl, %rdx > + sar %cl, %rax > + test %rax, %rax > + je 
L(unaligned_no_match) > + > + bsf %rax, %rax > + test %rdx, %rdx > + je L(unaligned_match) > + bsf %rdx, %rdx > + cmp %rdx, %rax > + ja L(return_null) > + > +L(unaligned_match): > + add %rdi, %rax > + add %rcx, %rax > + ret > + > + .p2align 4 > +L(unaligned_no_match): > + test %rdx, %rdx > + jne L(return_null) > + pxor %xmm2, %xmm2 > + > + add $16, %rdi > + > + .p2align 4 > +/* Loop start on aligned string. */ > +L(loop): > + movdqa (%rdi), %xmm0 > + pcmpeqd %xmm0, %xmm2 > + add $16, %rdi > + pcmpeqd %xmm1, %xmm0 > + pmovmskb %xmm2, %rdx > + pmovmskb %xmm0, %rax > + or %rax, %rdx > + jnz L(matches) > + > + movdqa (%rdi), %xmm0 > + pcmpeqd %xmm0, %xmm2 > + add $16, %rdi > + pcmpeqd %xmm1, %xmm0 > + pmovmskb %xmm2, %rdx > + pmovmskb %xmm0, %rax > + or %rax, %rdx > + jnz L(matches) > + > + movdqa (%rdi), %xmm0 > + pcmpeqd %xmm0, %xmm2 > + add $16, %rdi > + pcmpeqd %xmm1, %xmm0 > + pmovmskb %xmm2, %rdx > + pmovmskb %xmm0, %rax > + or %rax, %rdx > + jnz L(matches) > + > + movdqa (%rdi), %xmm0 > + pcmpeqd %xmm0, %xmm2 > + add $16, %rdi > + pcmpeqd %xmm1, %xmm0 > + pmovmskb %xmm2, %rdx > + pmovmskb %xmm0, %rax > + or %rax, %rdx > + jnz L(matches) > + jmp L(loop) > + > + .p2align 4 > +L(matches): > + pmovmskb %xmm2, %rdx > + test %rax, %rax > + jz L(return_null) > + bsf %rax, %rax > + test %rdx, %rdx > + je L(match) > + bsf %rdx, %rcx > + cmp %rcx, %rax > + ja L(return_null) > +L(match): > + sub $16, %rdi > + add %rdi, %rax > + ret > + > + .p2align 4 > +L(return_null): > + xor %rax, %rax > + ret > + > +END (WCSCHR) > diff --git a/sysdeps/x86_64/wcschr.S b/sysdeps/x86_64/wcschr.S > index 2131220382..80b12c4286 100644 > --- a/sysdeps/x86_64/wcschr.S > +++ b/sysdeps/x86_64/wcschr.S > @@ -16,140 +16,9 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. 
*/ > > -#include <sysdep.h> > - > - .text > -ENTRY (__wcschr) > - > - movd %rsi, %xmm1 > - pxor %xmm2, %xmm2 > - mov %rdi, %rcx > - punpckldq %xmm1, %xmm1 > - punpckldq %xmm1, %xmm1 > - > - and $63, %rcx > - cmp $48, %rcx > - ja L(cross_cache) > - > - movdqu (%rdi), %xmm0 > - pcmpeqd %xmm0, %xmm2 > - add $16, %rdi > - pcmpeqd %xmm1, %xmm0 > - pmovmskb %xmm2, %rdx > - pmovmskb %xmm0, %rax > - or %rax, %rdx > - jnz L(matches) > - > - and $-16, %rdi > - > - movdqa (%rdi), %xmm0 > - pcmpeqd %xmm0, %xmm2 > - add $16, %rdi > - pcmpeqd %xmm1, %xmm0 > - pmovmskb %xmm2, %rdx > - pmovmskb %xmm0, %rax > - or %rax, %rdx > - jnz L(matches) > - > - jmp L(loop) > - > -L(cross_cache): > - and $15, %rcx > - and $-16, %rdi > - movdqa (%rdi), %xmm0 > - pcmpeqd %xmm0, %xmm2 > - pcmpeqd %xmm1, %xmm0 > - pmovmskb %xmm2, %rdx > - pmovmskb %xmm0, %rax > - > - sar %cl, %rdx > - sar %cl, %rax > - test %rax, %rax > - je L(unaligned_no_match) > - > - bsf %rax, %rax > - test %rdx, %rdx > - je L(unaligned_match) > - bsf %rdx, %rdx > - cmp %rdx, %rax > - ja L(return_null) > - > -L(unaligned_match): > - add %rdi, %rax > - add %rcx, %rax > - ret > - > - .p2align 4 > -L(unaligned_no_match): > - test %rdx, %rdx > - jne L(return_null) > - pxor %xmm2, %xmm2 > - > - add $16, %rdi > - > - .p2align 4 > -/* Loop start on aligned string. 
*/ > -L(loop): > - movdqa (%rdi), %xmm0 > - pcmpeqd %xmm0, %xmm2 > - add $16, %rdi > - pcmpeqd %xmm1, %xmm0 > - pmovmskb %xmm2, %rdx > - pmovmskb %xmm0, %rax > - or %rax, %rdx > - jnz L(matches) > - > - movdqa (%rdi), %xmm0 > - pcmpeqd %xmm0, %xmm2 > - add $16, %rdi > - pcmpeqd %xmm1, %xmm0 > - pmovmskb %xmm2, %rdx > - pmovmskb %xmm0, %rax > - or %rax, %rdx > - jnz L(matches) > - > - movdqa (%rdi), %xmm0 > - pcmpeqd %xmm0, %xmm2 > - add $16, %rdi > - pcmpeqd %xmm1, %xmm0 > - pmovmskb %xmm2, %rdx > - pmovmskb %xmm0, %rax > - or %rax, %rdx > - jnz L(matches) > - > - movdqa (%rdi), %xmm0 > - pcmpeqd %xmm0, %xmm2 > - add $16, %rdi > - pcmpeqd %xmm1, %xmm0 > - pmovmskb %xmm2, %rdx > - pmovmskb %xmm0, %rax > - or %rax, %rdx > - jnz L(matches) > - jmp L(loop) > - > - .p2align 4 > -L(matches): > - pmovmskb %xmm2, %rdx > - test %rax, %rax > - jz L(return_null) > - bsf %rax, %rax > - test %rdx, %rdx > - je L(match) > - bsf %rdx, %rcx > - cmp %rcx, %rax > - ja L(return_null) > -L(match): > - sub $16, %rdi > - add %rdi, %rax > - ret > - > - .p2align 4 > -L(return_null): > - xor %rax, %rax > - ret > - > -END (__wcschr) > > +#define WCSCHR __wcschr > +#include "multiarch/wcschr-sse2.S" > libc_hidden_def(__wcschr) > weak_alias (__wcschr, wcschr) > libc_hidden_weak (wcschr) > -- > 2.34.1 > LGTM. Thanks.
diff --git a/sysdeps/x86_64/multiarch/wcschr-sse2.S b/sysdeps/x86_64/multiarch/wcschr-sse2.S index 218ea609b9..c872926ba9 100644 --- a/sysdeps/x86_64/multiarch/wcschr-sse2.S +++ b/sysdeps/x86_64/multiarch/wcschr-sse2.S @@ -17,14 +17,141 @@ <https://www.gnu.org/licenses/>. */ #if IS_IN (libc) -# define __wcschr __wcschr_sse2 - -# undef weak_alias -# define weak_alias(__wcschr, wcschr) -# undef libc_hidden_def -# define libc_hidden_def(__wcschr) -# undef libc_hidden_weak -# define libc_hidden_weak(wcschr) +# ifndef WCSCHR +# define WCSCHR __wcschr_sse2 +# endif #endif -#include "../wcschr.S" +#include <sysdep.h> + + .text +ENTRY (WCSCHR) + + movd %rsi, %xmm1 + pxor %xmm2, %xmm2 + mov %rdi, %rcx + punpckldq %xmm1, %xmm1 + punpckldq %xmm1, %xmm1 + + and $63, %rcx + cmp $48, %rcx + ja L(cross_cache) + + movdqu (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + or %rax, %rdx + jnz L(matches) + + and $-16, %rdi + + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + or %rax, %rdx + jnz L(matches) + + jmp L(loop) + +L(cross_cache): + and $15, %rcx + and $-16, %rdi + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + + sar %cl, %rdx + sar %cl, %rax + test %rax, %rax + je L(unaligned_no_match) + + bsf %rax, %rax + test %rdx, %rdx + je L(unaligned_match) + bsf %rdx, %rdx + cmp %rdx, %rax + ja L(return_null) + +L(unaligned_match): + add %rdi, %rax + add %rcx, %rax + ret + + .p2align 4 +L(unaligned_no_match): + test %rdx, %rdx + jne L(return_null) + pxor %xmm2, %xmm2 + + add $16, %rdi + + .p2align 4 +/* Loop start on aligned string. 
*/ +L(loop): + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + or %rax, %rdx + jnz L(matches) + + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + or %rax, %rdx + jnz L(matches) + + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + or %rax, %rdx + jnz L(matches) + + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + or %rax, %rdx + jnz L(matches) + jmp L(loop) + + .p2align 4 +L(matches): + pmovmskb %xmm2, %rdx + test %rax, %rax + jz L(return_null) + bsf %rax, %rax + test %rdx, %rdx + je L(match) + bsf %rdx, %rcx + cmp %rcx, %rax + ja L(return_null) +L(match): + sub $16, %rdi + add %rdi, %rax + ret + + .p2align 4 +L(return_null): + xor %rax, %rax + ret + +END (WCSCHR) diff --git a/sysdeps/x86_64/wcschr.S b/sysdeps/x86_64/wcschr.S index 2131220382..80b12c4286 100644 --- a/sysdeps/x86_64/wcschr.S +++ b/sysdeps/x86_64/wcschr.S @@ -16,140 +16,9 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. 
*/ -#include <sysdep.h> - - .text -ENTRY (__wcschr) - - movd %rsi, %xmm1 - pxor %xmm2, %xmm2 - mov %rdi, %rcx - punpckldq %xmm1, %xmm1 - punpckldq %xmm1, %xmm1 - - and $63, %rcx - cmp $48, %rcx - ja L(cross_cache) - - movdqu (%rdi), %xmm0 - pcmpeqd %xmm0, %xmm2 - add $16, %rdi - pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm2, %rdx - pmovmskb %xmm0, %rax - or %rax, %rdx - jnz L(matches) - - and $-16, %rdi - - movdqa (%rdi), %xmm0 - pcmpeqd %xmm0, %xmm2 - add $16, %rdi - pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm2, %rdx - pmovmskb %xmm0, %rax - or %rax, %rdx - jnz L(matches) - - jmp L(loop) - -L(cross_cache): - and $15, %rcx - and $-16, %rdi - movdqa (%rdi), %xmm0 - pcmpeqd %xmm0, %xmm2 - pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm2, %rdx - pmovmskb %xmm0, %rax - - sar %cl, %rdx - sar %cl, %rax - test %rax, %rax - je L(unaligned_no_match) - - bsf %rax, %rax - test %rdx, %rdx - je L(unaligned_match) - bsf %rdx, %rdx - cmp %rdx, %rax - ja L(return_null) - -L(unaligned_match): - add %rdi, %rax - add %rcx, %rax - ret - - .p2align 4 -L(unaligned_no_match): - test %rdx, %rdx - jne L(return_null) - pxor %xmm2, %xmm2 - - add $16, %rdi - - .p2align 4 -/* Loop start on aligned string. 
*/ -L(loop): - movdqa (%rdi), %xmm0 - pcmpeqd %xmm0, %xmm2 - add $16, %rdi - pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm2, %rdx - pmovmskb %xmm0, %rax - or %rax, %rdx - jnz L(matches) - - movdqa (%rdi), %xmm0 - pcmpeqd %xmm0, %xmm2 - add $16, %rdi - pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm2, %rdx - pmovmskb %xmm0, %rax - or %rax, %rdx - jnz L(matches) - - movdqa (%rdi), %xmm0 - pcmpeqd %xmm0, %xmm2 - add $16, %rdi - pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm2, %rdx - pmovmskb %xmm0, %rax - or %rax, %rdx - jnz L(matches) - - movdqa (%rdi), %xmm0 - pcmpeqd %xmm0, %xmm2 - add $16, %rdi - pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm2, %rdx - pmovmskb %xmm0, %rax - or %rax, %rdx - jnz L(matches) - jmp L(loop) - - .p2align 4 -L(matches): - pmovmskb %xmm2, %rdx - test %rax, %rax - jz L(return_null) - bsf %rax, %rax - test %rdx, %rdx - je L(match) - bsf %rdx, %rcx - cmp %rcx, %rax - ja L(return_null) -L(match): - sub $16, %rdi - add %rdi, %rax - ret - - .p2align 4 -L(return_null): - xor %rax, %rax - ret - -END (__wcschr) +#define WCSCHR __wcschr +#include "multiarch/wcschr-sse2.S" libc_hidden_def(__wcschr) weak_alias (__wcschr, wcschr) libc_hidden_weak (wcschr)