Message ID | 20150621104732.GA16055@domone
---|---
State | New
On Sun, Jun 21, 2015 at 12:47:32PM +0200, Ondřej Bílka wrote:
> On Fri, Jun 19, 2015 at 05:53:04PM +0200, Ondřej Bílka wrote:
> > On Thu, Jun 18, 2015 at 10:09:10AM +0200, Ondřej Bílka wrote:
> > > Hi,
> > >
> > > As I submitted a memcmp improvement before in 2013, here is a new
> > > version that improves performance a bit more.
> > >
> > > Also, when I browsed the results I found that memcmp-sse4 is in fact
> > > a regression on the i7 Nehalem, Ivy Bridge and Haswell architectures.
> > > There it is beaten by the old sse2 code by more than 10%.
> > >
> Also, I tried different headers to see if I could improve the avx2
> version. It turned out that the byte-by-byte loop that I use for the
> cross-page case is best. If I always use that, it beats the sse4
> version on the gcc workload.
>
> The main problem is that branch misprediction kills performance and I
> couldn't make the decision about n fast.
>
> > > The main idea of the new implementation is the same; the performance
> > > problem is that a lot of inputs were identical with small n.
> > > For that case I found that the following approach gives the best
> > > performance when n < 64 is likely:
> > >
> > > if (!cross_page (s1) && !cross_page (s2))
> > >   {
> > >     mask = get_mask (EQ (EQ (LOAD (s1), LOAD (s2)), zero));
> > >     mask2 = mask & ((2 << (n - 1)) - 1);
> > >     if (mask2)
> > >       return s1[first_byte (mask2)] - s2[first_byte (mask2)];
> > >     if (n <= 16)
> > >       return 0;
> > >     mask |= get_mask (EQ (EQ (LOAD (s1 + 16), LOAD (s2 + 16)), zero)) << 16;
> > >     mask |= get_mask (EQ (EQ (LOAD (s1 + 32), LOAD (s2 + 32)), zero)) << 32;
> > >     mask |= get_mask (EQ (EQ (LOAD (s1 + 48), LOAD (s2 + 48)), zero)) << 48;
> > >     mask2 = mask & ((2 << (n - 1)) - 1);
> > >     if (mask2)
> > >       return s1[first_byte (mask2)] - s2[first_byte (mask2)];
> > >     if (n <= 64)
> > >       return 0;
> > >     if (mask)
> > >       return s1[first_byte (mask)] - s2[first_byte (mask)];
> > >   }
> > >
> > > I haven't checked yet whether using just registers and byteswap could
> > > eliminate the need to get the exact byte position, as I wrote in the
> > > related thread.
> > >
> > > I could improve this a bit more; I lose a lot of cycles in the loop
> > > ending conditions. The problem is that I need to handle that an
> > > unaligned s2 may read from the next page, so I would need more
> > > complicated logic to compute the number of loop iterations.
> > >
> > > That is related to avx2. I included it as an RFC, but it harms
> > > performance on Haswell.
> > >
> > > Last is wmemcmp, which I would also need to convert; for now I just
> > > moved memcmp-sse4 there.
> > >
> > > A profile is found here:
> > >
> > > http://kam.mff.cuni.cz/~ondra/benchmark_string/memcmp_profile.html
> > >
> > I updated that new version. I removed avx2 for now; I will submit it
> > when I find how it could improve performance.
> >
> > A second change is that I added wmemcmp conditionals, so now I could
> > delete memcmp-sse4 and wmemcmp-sse4.
> >
> After finding the bts trick for strncmp I also tried to use it in
> memcmp. The problem is that in memcmp my previous control flow was
> better, as for memcmp it is likely that the arguments are equal, so I
> save the cost of bsf and of comparing bytes.
>
> The only improvement was that using bts with the same control flow
> saves a few cycles, giving around a 2% improvement on the gcc workload.
>
> Also, in the cross-page case the only optimization was to unroll the
> byte-by-byte loop, as switching to a bigger comparison caused more
> overhead than it saved.
>
> So what about the following version?
>
> 	* sysdeps/x86_64/memcmp.S: New implementation.
> 	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
> 	(__libc_ifunc_impl_list): Remove memcmp-sse4.
> 	* sysdeps/x86_64/multiarch/Makefile (routines): Remove memcmp-sse4.
> * sysdeps/x86_64/multiarch/memcmp.S: Likewise. > * sysdeps/x86_64/multiarch/memcmp-sse4.S: Removed. > * sysdeps/x86_64/multiarch/wmemcmp-sse4.S: Likewise. > > > > --- > sysdeps/x86_64/memcmp.S | 512 +++---- > sysdeps/x86_64/multiarch/Makefile | 6 +- > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 9 +- > sysdeps/x86_64/multiarch/memcmp-avx2.S | 3 + > sysdeps/x86_64/multiarch/memcmp-sse4.S | 1776 ---------------------- > sysdeps/x86_64/multiarch/memcmp.S | 25 +- > sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S | 9 +- > sysdeps/x86_64/multiarch/wmemcmp-sse4.S | 4 - > sysdeps/x86_64/multiarch/wmemcmp.S | 12 +- > 9 files changed, 221 insertions(+), 2135 deletions(-) > create mode 100644 sysdeps/x86_64/multiarch/memcmp-avx2.S > delete mode 100644 sysdeps/x86_64/multiarch/memcmp-sse4.S > delete mode 100644 sysdeps/x86_64/multiarch/wmemcmp-sse4.S > > diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S > index f636716..88c0c4a 100644 > --- a/sysdeps/x86_64/memcmp.S > +++ b/sysdeps/x86_64/memcmp.S > @@ -19,340 +19,204 @@ > > #include <sysdep.h> > > +#ifndef MEMCMP > +# define MEMCMP memcmp > +#endif > + > .text > -ENTRY (memcmp) > - test %rdx, %rdx > - jz L(finz) > - cmpq $1, %rdx > - jle L(finr1b) > - subq %rdi, %rsi > - movq %rdx, %r10 > - cmpq $32, %r10 > - jge L(gt32) > - /* Handle small chunks and last block of less than 32 bytes. */ > -L(small): > - testq $1, %r10 > - jz L(s2b) > - movzbl (%rdi), %eax > - movzbl (%rdi, %rsi), %edx > - subq $1, %r10 > - je L(finz1) > - addq $1, %rdi > - subl %edx, %eax > - jnz L(exit) > -L(s2b): > - testq $2, %r10 > - jz L(s4b) > - movzwl (%rdi), %eax > - movzwl (%rdi, %rsi), %edx > - subq $2, %r10 > - je L(fin2_7) > - addq $2, %rdi > - cmpl %edx, %eax > - jnz L(fin2_7) > -L(s4b): > - testq $4, %r10 > - jz L(s8b) > - movl (%rdi), %eax > - movl (%rdi, %rsi), %edx > - subq $4, %r10 > - je L(fin2_7) > - addq $4, %rdi > - cmpl %edx, %eax > - jnz L(fin2_7) > -L(s8b): > - testq $8, %r10 > - jz L(s16b) > - movq (%rdi), %rax > - movq (%rdi, %rsi), %rdx > - subq $8, %r10 > - je L(fin2_7) > - addq $8, %rdi > - cmpq %rdx, %rax > - jnz L(fin2_7) > -L(s16b): > - movdqu (%rdi), %xmm1 > - movdqu (%rdi, %rsi), %xmm0 > - pcmpeqb %xmm0, %xmm1 > - pmovmskb %xmm1, %edx > - xorl %eax, %eax > - subl $0xffff, %edx > - jz L(finz) > - bsfl %edx, %ecx > - leaq (%rdi, %rcx), %rcx > - movzbl (%rcx), %eax > - movzbl (%rsi, %rcx), %edx > - jmp L(finz1) > +ENTRY (MEMCMP) > + testq %rdx, %rdx > + je L(return_zero) > +#ifdef AS_WMEMCMP > + shl $2, %rdx > +#endif > + pxor %xmm4, %xmm4 > + movl %edi, %eax > + andl $4095, %eax > + cmpl $4032, %eax > + ja L(cross_page_start) > +L(handle_end): > + movl %esi, %eax > + andl $4095, %eax > + cmpl $4032, %eax > + ja L(cross_page_start) > +L(back_header): > + xor %ecx, %ecx > + bts %rdx, %rcx > + sub $1, %rcx > + movdqu (%rdi), %xmm0 > + movdqu (%rsi), %xmm1 > + pcmpeqb %xmm1, %xmm0 > + pcmpeqb %xmm4, %xmm0 > + pmovmskb %xmm0, %eax > + and %ecx, %eax > + jne L(different) > + cmpq $16, %rdx > + ja L(next) > + ret > +L(next): > + pmovmskb %xmm0, %r8d > + movdqu 16(%rdi), %xmm2 > + movdqu 16(%rsi), %xmm6 > + movdqu 32(%rdi), %xmm1 > + pcmpeqb %xmm6, %xmm2 > + movdqu 32(%rsi), %xmm5 > + pcmpeqb %xmm4, %xmm2 > + pcmpeqb %xmm5, %xmm1 > + movdqu 48(%rdi), %xmm7 > + pmovmskb %xmm2, %eax > + movdqu 48(%rsi), %xmm3 > + pcmpeqb %xmm4, %xmm1 > + pmovmskb %xmm1, %r9d > + sal $16, %eax > + pcmpeqb %xmm3, %xmm7 > + salq $32, %r9 > + pcmpeqb %xmm4, %xmm7 > + orq %r9, %rax > + orq %r8, %rax > + pmovmskb %xmm7, %r8d > + salq $48, %r8 > + orq %r8, %rax > + 
movq %rax, %r8 > + andq %rcx, %rax > + jne L(different) > + cmpq $64, %rdx > + jb L(return_zero) > + movq %r8, %rax > + testq %rax, %rax > + jne L(different) > +L(align_loop): > + leaq 64(%rdi), %rax > + andq $-64, %rax > + subq %rdi, %rax > + subq %rax, %rdx > + addq %rax, %rdi > + addq %rax, %rsi > + cmpq $64, %rdx > + ja L(loop_start) > + testq %rdx, %rdx > + jne L(handle_end) > + xorl %eax, %eax > + ret > > - .p2align 4,, 4 > -L(finr1b): > - movzbl (%rdi), %eax > - movzbl (%rsi), %edx > -L(finz1): > + .p2align 4 > +L(different): > + bsfq %rax, %rdx > +#ifdef AS_WMEMCMP > + and $-4, %rdx > + mov (%rdi,%rdx), %eax > + mov (%rsi,%rdx), %edx > subl %edx, %eax > -L(exit): > + jg L(ret1) > + jl L(ret_neg_1) > ret > - > - .p2align 4,, 4 > -L(fin2_7): > - cmpq %rdx, %rax > - jz L(finz) > - movq %rax, %r11 > - subq %rdx, %r11 > - bsfq %r11, %rcx > - sarq $3, %rcx > - salq $3, %rcx > - sarq %cl, %rax > - movzbl %al, %eax > - sarq %cl, %rdx > - movzbl %dl, %edx > +L(ret1): > + mov $1, %eax > + ret > +L(ret_neg_1): > + mov $-1, %eax > + ret > +#else > + movzbl (%rdi,%rdx), %eax > + movzbl (%rsi,%rdx), %edx > subl %edx, %eax > ret > - > - .p2align 4,, 4 > -L(finz): > +#endif > +L(return_zero): > + xor %eax, %eax > + ret > + .p2align 4 > +L(loop): > + subq $64, %rdx > + addq $64, %rdi > + addq $64, %rsi > + cmpq $64, %rdx > + jbe L(less_64_bytes) > +L(loop_start): > + movdqu (%rsi), %xmm0 > + movdqu 16(%rsi), %xmm1 > + pcmpeqb (%rdi), %xmm0 > + movdqu 32(%rsi), %xmm2 > + pcmpeqb 16(%rdi), %xmm1 > + movdqu 48(%rsi), %xmm3 > + pcmpeqb 32(%rdi), %xmm2 > + pcmpeqb 48(%rdi), %xmm3 > + pminub %xmm0, %xmm3 > + pminub %xmm1, %xmm3 > + pminub %xmm2, %xmm3 > + pcmpeqb %xmm4, %xmm3 > + pmovmskb %xmm3, %eax > + testl %eax, %eax > + je L(loop) > + shl $48, %rax > + pcmpeqb %xmm4, %xmm0 > + pcmpeqb %xmm4, %xmm1 > + pcmpeqb %xmm4, %xmm2 > + pmovmskb %xmm0, %r8 > + pmovmskb %xmm1, %rcx > + pmovmskb %xmm2, %r9 > + shl $16, %ecx > + shl $32, %r9 > + or %r8, %rax > + or %r9, %rax > + or %rcx, %rax > + jmp L(different) > + > + .p2align 4 > +L(less_64_bytes): > + testq %rdx, %rdx > + jne L(handle_end) > xorl %eax, %eax > ret > > - /* For blocks bigger than 32 bytes > - 1. Advance one of the addr pointer to be 16B aligned. > - 2. Treat the case of both addr pointers aligned to 16B > - separately to avoid movdqu. > - 3. Handle any blocks of greater than 64 consecutive bytes with > - unrolling to reduce branches. > - 4. At least one addr pointer is 16B aligned, use memory version > - of pcmbeqb. > - */ > - .p2align 4,, 4 > -L(gt32): > - movq %rdx, %r11 > - addq %rdi, %r11 > - movq %rdi, %r8 > - > - andq $15, %r8 > - jz L(16am) > - /* Both pointers may be misaligned. */ > - movdqu (%rdi), %xmm1 > - movdqu (%rdi, %rsi), %xmm0 > - pcmpeqb %xmm0, %xmm1 > - pmovmskb %xmm1, %edx > - subl $0xffff, %edx > - jnz L(neq) > - neg %r8 > - leaq 16(%rdi, %r8), %rdi > -L(16am): > - /* Handle two 16B aligned pointers separately. */ > - testq $15, %rsi > - jz L(ATR) > - testq $16, %rdi > - jz L(A32) > - movdqu (%rdi, %rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > -L(A32): > - movq %r11, %r10 > - andq $-32, %r10 > - cmpq %r10, %rdi > - jge L(mt16) > - /* Pre-unroll to be ready for unrolled 64B loop. 
*/ > - testq $32, %rdi > - jz L(A64) > - movdqu (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - > - movdqu (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - > -L(A64): > - movq %r11, %r10 > - andq $-64, %r10 > - cmpq %r10, %rdi > - jge L(mt32) > - > -L(A64main): > - movdqu (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - > - movdqu (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - > - movdqu (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - > - movdqu (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - > - cmpq %rdi, %r10 > - jne L(A64main) > - > -L(mt32): > - movq %r11, %r10 > - andq $-32, %r10 > - cmpq %r10, %rdi > - jge L(mt16) > > -L(A32main): > - movdqu (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - > - movdqu (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - > - cmpq %rdi, %r10 > - jne L(A32main) > -L(mt16): > - subq %rdi, %r11 > - je L(finz) > - movq %r11, %r10 > - jmp L(small) > - > - .p2align 4,, 4 > -L(neq): > - bsfl %edx, %ecx > - movzbl (%rdi, %rcx), %eax > - addq %rdi, %rsi > - movzbl (%rsi,%rcx), %edx > - jmp L(finz1) > - > - .p2align 4,, 4 > -L(ATR): > - movq %r11, %r10 > - andq $-32, %r10 > - cmpq %r10, %rdi > - jge L(mt16) > - testq $16, %rdi > - jz L(ATR32) > - > - movdqa (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - cmpq %rdi, %r10 > - je L(mt16) > - > -L(ATR32): > - movq %r11, %r10 > - andq $-64, %r10 > - testq $32, %rdi > - jz L(ATR64) > - > - movdqa (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - > - movdqa (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - > -L(ATR64): > - cmpq %rdi, %r10 > - je L(mt32) > - > -L(ATR64main): > - movdqa (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - > - movdqa (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - > - movdqa (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - > - movdqa (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - cmpq %rdi, %r10 > - jne L(ATR64main) > - > - movq %r11, %r10 > - andq $-32, %r10 > - cmpq %r10, %rdi > - jge L(mt16) > - > -L(ATR32res): > - movdqa (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - > - movdqa (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - > - cmpq %r10, %rdi > - jne L(ATR32res) > - > - subq %rdi, %r11 > - je L(finz) > - movq 
%r11, %r10 > - jmp L(small) > - /* Align to 16byte to improve instruction fetch. */ > - .p2align 4,, 4 > -END(memcmp) > + .p2align 4 > +L(cross_page_start): > + cmp $64, %rdx > + ja L(back_header) > + > + .p2align 4 > +L(cross_page): > + test %edx, %edx > + je L(return_zero) > +#ifdef AS_WMEMCMP > + mov (%rdi), %eax > + mov (%rsi), %ecx > + subl %ecx, %eax > + jg L(ret1) > + jl L(ret_neg_1) > +#else > + movzbl (%rdi), %eax > + movzbl (%rsi), %ecx > + subl %ecx, %eax > + jne L(return) > + cmp $1, %edx > + je L(return) > + movzbl 1(%rdi), %eax > + movzbl 1(%rsi), %ecx > + subl %ecx, %eax > + jne L(return) > + cmp $2, %edx > + je L(return) > + movzbl 2(%rdi), %eax > + movzbl 2(%rsi), %ecx > + subl %ecx, %eax > + jne L(return) > + cmp $3, %edx > + je L(return) > + movzbl 3(%rdi), %eax > + movzbl 3(%rsi), %ecx > + subl %ecx, %eax > + jne L(return) > +#endif > + sub $4, %edx > + add $4, %rdi > + add $4, %rsi > + jmp L(cross_page) > +L(return): > + ret > +END(MEMCMP) > > -#undef bcmp > +#undef bcmp > weak_alias (memcmp, bcmp) > libc_hidden_builtin_def (memcmp) > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index c573744..679db2a 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -8,7 +8,7 @@ ifeq ($(subdir),string) > > sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \ > strcmp-sse2-unaligned strncmp-ssse3 \ > - memcmp-sse4 memcpy-ssse3 \ > + memcpy-ssse3 \ > memcpy-sse2-unaligned mempcpy-ssse3 \ > memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \ > memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \ > @@ -29,10 +29,10 @@ CFLAGS-strspn-c.c += -msse4 > endif > > ifeq (yes,$(config-cflags-avx2)) > -sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 > +sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 memcmp-avx2 > endif > endif > > ifeq ($(subdir),wcsmbs) > -sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c > +sysdep_routines += wmemcmp-sse2-unaligned wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c > endif > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index d398e43..b3dbe65 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -39,10 +39,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/x86_64/multiarch/memcmp.S. */ > IFUNC_IMPL (i, name, memcmp, > - IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSE4_1, > - __memcmp_sse4_1) > + IFUNC_IMPL_ADD (array, i, memcmp, HAS_AVX2, __memcmp_avx2) > IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSSE3, __memcmp_ssse3) > - IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2)) > + IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2_unaligned)) > > /* Support sysdeps/x86_64/multiarch/memmove_chk.S. */ > IFUNC_IMPL (i, name, __memmove_chk, > @@ -211,8 +210,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/x86_64/multiarch/wmemcmp.S. 
*/ > IFUNC_IMPL (i, name, wmemcmp, > - IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_SSE4_1, > - __wmemcmp_sse4_1) > + IFUNC_IMPL_ADD (array, i, wmemcmp, 1, > + __wmemcmp_sse2_unaligned) > IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_SSSE3, > __wmemcmp_ssse3) > IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2)) > diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2.S b/sysdeps/x86_64/multiarch/memcmp-avx2.S > new file mode 100644 > index 0000000..60483bf > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/memcmp-avx2.S > @@ -0,0 +1,3 @@ > +#define USE_AVX2 > +#define MEMCMP __memcmp_avx2 > +#include "../memcmp.S" > diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S > deleted file mode 100644 > index 533fece..0000000 > --- a/sysdeps/x86_64/multiarch/memcmp-sse4.S > +++ /dev/null > @@ -1,1776 +0,0 @@ > -/* memcmp with SSE4.1, wmemcmp with SSE4.1 > - Copyright (C) 2010-2015 Free Software Foundation, Inc. > - Contributed by Intel Corporation. > - This file is part of the GNU C Library. > - > - The GNU C Library is free software; you can redistribute it and/or > - modify it under the terms of the GNU Lesser General Public > - License as published by the Free Software Foundation; either > - version 2.1 of the License, or (at your option) any later version. > - > - The GNU C Library is distributed in the hope that it will be useful, > - but WITHOUT ANY WARRANTY; without even the implied warranty of > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > - Lesser General Public License for more details. > - > - You should have received a copy of the GNU Lesser General Public > - License along with the GNU C Library; if not, see > - <http://www.gnu.org/licenses/>. */ > - > -#if IS_IN (libc) > - > -# include <sysdep.h> > - > -# ifndef MEMCMP > -# define MEMCMP __memcmp_sse4_1 > -# endif > - > -# define JMPTBL(I, B) (I - B) > - > -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ > - lea TABLE(%rip), %r11; \ > - movslq (%r11, INDEX, SCALE), %rcx; \ > - add %r11, %rcx; \ > - jmp *%rcx; \ > - ud2 > - > -/* Warning! > - wmemcmp has to use SIGNED comparison for elements. > - memcmp has to use UNSIGNED comparison for elemnts. 
> -*/ > - > - .section .text.sse4.1,"ax",@progbits > -ENTRY (MEMCMP) > -# ifdef USE_AS_WMEMCMP > - shl $2, %rdx > -# endif > - pxor %xmm0, %xmm0 > - cmp $79, %rdx > - ja L(79bytesormore) > -# ifndef USE_AS_WMEMCMP > - cmp $1, %rdx > - je L(firstbyte) > -# endif > - add %rdx, %rsi > - add %rdx, %rdi > - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) > - > -# ifndef USE_AS_WMEMCMP > - .p2align 4 > -L(firstbyte): > - movzbl (%rdi), %eax > - movzbl (%rsi), %ecx > - sub %ecx, %eax > - ret > -# endif > - > - .p2align 4 > -L(79bytesormore): > - movdqu (%rsi), %xmm1 > - movdqu (%rdi), %xmm2 > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(16bytesin256) > - mov %rsi, %rcx > - and $-16, %rsi > - add $16, %rsi > - sub %rsi, %rcx > - > - sub %rcx, %rdi > - add %rcx, %rdx > - test $0xf, %rdi > - jz L(2aligned) > - > - cmp $128, %rdx > - ja L(128bytesormore) > -L(less128bytes): > - sub $64, %rdx > - > - movdqu (%rdi), %xmm2 > - pxor (%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(16bytesin256) > - > - movdqu 16(%rdi), %xmm2 > - pxor 16(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(32bytesin256) > - > - movdqu 32(%rdi), %xmm2 > - pxor 32(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(48bytesin256) > - > - movdqu 48(%rdi), %xmm2 > - pxor 48(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(64bytesin256) > - cmp $32, %rdx > - jb L(less32bytesin64) > - > - movdqu 64(%rdi), %xmm2 > - pxor 64(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(80bytesin256) > - > - movdqu 80(%rdi), %xmm2 > - pxor 80(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(96bytesin256) > - sub $32, %rdx > - add $32, %rdi > - add $32, %rsi > -L(less32bytesin64): > - add $64, %rdi > - add $64, %rsi > - add %rdx, %rsi > - add %rdx, %rdi > - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) > - > -L(128bytesormore): > - cmp $512, %rdx > - ja L(512bytesormore) > - cmp $256, %rdx > - ja L(less512bytes) > -L(less256bytes): > - sub $128, %rdx > - > - movdqu (%rdi), %xmm2 > - pxor (%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(16bytesin256) > - > - movdqu 16(%rdi), %xmm2 > - pxor 16(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(32bytesin256) > - > - movdqu 32(%rdi), %xmm2 > - pxor 32(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(48bytesin256) > - > - movdqu 48(%rdi), %xmm2 > - pxor 48(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(64bytesin256) > - > - movdqu 64(%rdi), %xmm2 > - pxor 64(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(80bytesin256) > - > - movdqu 80(%rdi), %xmm2 > - pxor 80(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(96bytesin256) > - > - movdqu 96(%rdi), %xmm2 > - pxor 96(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(112bytesin256) > - > - movdqu 112(%rdi), %xmm2 > - pxor 112(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(128bytesin256) > - > - add $128, %rsi > - add $128, %rdi > - > - cmp $64, %rdx > - jae L(less128bytes) > - > - cmp $32, %rdx > - jb L(less32bytesin128) > - > - movdqu (%rdi), %xmm2 > - pxor (%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(16bytesin256) > - > - movdqu 16(%rdi), %xmm2 > - pxor 16(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(32bytesin256) > - sub $32, %rdx > - add $32, %rdi > - add $32, %rsi > -L(less32bytesin128): > - add %rdx, %rsi > - add %rdx, %rdi > - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) > - > -L(less512bytes): > - sub $256, %rdx > - movdqu (%rdi), %xmm2 > - pxor (%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(16bytesin256) > - > - movdqu 16(%rdi), %xmm2 > - pxor 16(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(32bytesin256) > - > - movdqu 32(%rdi), %xmm2 > - pxor 32(%rsi), %xmm2 > - 
ptest %xmm2, %xmm0 > - jnc L(48bytesin256) > - > - movdqu 48(%rdi), %xmm2 > - pxor 48(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(64bytesin256) > - > - movdqu 64(%rdi), %xmm2 > - pxor 64(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(80bytesin256) > - > - movdqu 80(%rdi), %xmm2 > - pxor 80(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(96bytesin256) > - > - movdqu 96(%rdi), %xmm2 > - pxor 96(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(112bytesin256) > - > - movdqu 112(%rdi), %xmm2 > - pxor 112(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(128bytesin256) > - > - movdqu 128(%rdi), %xmm2 > - pxor 128(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(144bytesin256) > - > - movdqu 144(%rdi), %xmm2 > - pxor 144(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(160bytesin256) > - > - movdqu 160(%rdi), %xmm2 > - pxor 160(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(176bytesin256) > - > - movdqu 176(%rdi), %xmm2 > - pxor 176(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(192bytesin256) > - > - movdqu 192(%rdi), %xmm2 > - pxor 192(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(208bytesin256) > - > - movdqu 208(%rdi), %xmm2 > - pxor 208(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(224bytesin256) > - > - movdqu 224(%rdi), %xmm2 > - pxor 224(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(240bytesin256) > - > - movdqu 240(%rdi), %xmm2 > - pxor 240(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(256bytesin256) > - > - add $256, %rsi > - add $256, %rdi > - > - cmp $128, %rdx > - jae L(less256bytes) > - > - cmp $64, %rdx > - jae L(less128bytes) > - > - cmp $32, %rdx > - jb L(less32bytesin256) > - > - movdqu (%rdi), %xmm2 > - pxor (%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(16bytesin256) > - > - movdqu 16(%rdi), %xmm2 > - pxor 16(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(32bytesin256) > - sub $32, %rdx > - add $32, %rdi > - add $32, %rsi > -L(less32bytesin256): > - add %rdx, %rsi > - add %rdx, %rdi > - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) > - > - .p2align 4 > -L(512bytesormore): > -# ifdef DATA_CACHE_SIZE_HALF > - mov $DATA_CACHE_SIZE_HALF, %R8_LP > -# else > - mov __x86_data_cache_size_half(%rip), %R8_LP > -# endif > - mov %r8, %r9 > - shr $1, %r8 > - add %r9, %r8 > - cmp %r8, %rdx > - ja L(L2_L3_cache_unaglined) > - sub $64, %rdx > - .p2align 4 > -L(64bytesormore_loop): > - movdqu (%rdi), %xmm2 > - pxor (%rsi), %xmm2 > - movdqa %xmm2, %xmm1 > - > - movdqu 16(%rdi), %xmm3 > - pxor 16(%rsi), %xmm3 > - por %xmm3, %xmm1 > - > - movdqu 32(%rdi), %xmm4 > - pxor 32(%rsi), %xmm4 > - por %xmm4, %xmm1 > - > - movdqu 48(%rdi), %xmm5 > - pxor 48(%rsi), %xmm5 > - por %xmm5, %xmm1 > - > - ptest %xmm1, %xmm0 > - jnc L(64bytesormore_loop_end) > - add $64, %rsi > - add $64, %rdi > - sub $64, %rdx > - jae L(64bytesormore_loop) > - > - add $64, %rdx > - add %rdx, %rsi > - add %rdx, %rdi > - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) > - > -L(L2_L3_cache_unaglined): > - sub $64, %rdx > - .p2align 4 > -L(L2_L3_unaligned_128bytes_loop): > - prefetchnta 0x1c0(%rdi) > - prefetchnta 0x1c0(%rsi) > - movdqu (%rdi), %xmm2 > - pxor (%rsi), %xmm2 > - movdqa %xmm2, %xmm1 > - > - movdqu 16(%rdi), %xmm3 > - pxor 16(%rsi), %xmm3 > - por %xmm3, %xmm1 > - > - movdqu 32(%rdi), %xmm4 > - pxor 32(%rsi), %xmm4 > - por %xmm4, %xmm1 > - > - movdqu 48(%rdi), %xmm5 > - pxor 48(%rsi), %xmm5 > - por %xmm5, %xmm1 > - > - ptest %xmm1, %xmm0 > - jnc L(64bytesormore_loop_end) > - add $64, %rsi > - add $64, %rdi > - sub $64, %rdx > - jae L(L2_L3_unaligned_128bytes_loop) > - > - add $64, %rdx > - add %rdx, %rsi > - add %rdx, %rdi > - 
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) > - > -/* > - * This case is for machines which are sensitive for unaligned instructions. > - */ > - .p2align 4 > -L(2aligned): > - cmp $128, %rdx > - ja L(128bytesormorein2aligned) > -L(less128bytesin2aligned): > - sub $64, %rdx > - > - movdqa (%rdi), %xmm2 > - pxor (%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(16bytesin256) > - > - movdqa 16(%rdi), %xmm2 > - pxor 16(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(32bytesin256) > - > - movdqa 32(%rdi), %xmm2 > - pxor 32(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(48bytesin256) > - > - movdqa 48(%rdi), %xmm2 > - pxor 48(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(64bytesin256) > - cmp $32, %rdx > - jb L(less32bytesin64in2alinged) > - > - movdqa 64(%rdi), %xmm2 > - pxor 64(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(80bytesin256) > - > - movdqa 80(%rdi), %xmm2 > - pxor 80(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(96bytesin256) > - sub $32, %rdx > - add $32, %rdi > - add $32, %rsi > -L(less32bytesin64in2alinged): > - add $64, %rdi > - add $64, %rsi > - add %rdx, %rsi > - add %rdx, %rdi > - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) > - > - .p2align 4 > -L(128bytesormorein2aligned): > - cmp $512, %rdx > - ja L(512bytesormorein2aligned) > - cmp $256, %rdx > - ja L(256bytesormorein2aligned) > -L(less256bytesin2alinged): > - sub $128, %rdx > - > - movdqa (%rdi), %xmm2 > - pxor (%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(16bytesin256) > - > - movdqa 16(%rdi), %xmm2 > - pxor 16(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(32bytesin256) > - > - movdqa 32(%rdi), %xmm2 > - pxor 32(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(48bytesin256) > - > - movdqa 48(%rdi), %xmm2 > - pxor 48(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(64bytesin256) > - > - movdqa 64(%rdi), %xmm2 > - pxor 64(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(80bytesin256) > - > - movdqa 80(%rdi), %xmm2 > - pxor 80(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(96bytesin256) > - > - movdqa 96(%rdi), %xmm2 > - pxor 96(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(112bytesin256) > - > - movdqa 112(%rdi), %xmm2 > - pxor 112(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(128bytesin256) > - > - add $128, %rsi > - add $128, %rdi > - > - cmp $64, %rdx > - jae L(less128bytesin2aligned) > - > - cmp $32, %rdx > - jb L(less32bytesin128in2aligned) > - > - movdqu (%rdi), %xmm2 > - pxor (%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(16bytesin256) > - > - movdqu 16(%rdi), %xmm2 > - pxor 16(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(32bytesin256) > - sub $32, %rdx > - add $32, %rdi > - add $32, %rsi > -L(less32bytesin128in2aligned): > - add %rdx, %rsi > - add %rdx, %rdi > - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) > - > - .p2align 4 > -L(256bytesormorein2aligned): > - > - sub $256, %rdx > - movdqa (%rdi), %xmm2 > - pxor (%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(16bytesin256) > - > - movdqa 16(%rdi), %xmm2 > - pxor 16(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(32bytesin256) > - > - movdqa 32(%rdi), %xmm2 > - pxor 32(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(48bytesin256) > - > - movdqa 48(%rdi), %xmm2 > - pxor 48(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(64bytesin256) > - > - movdqa 64(%rdi), %xmm2 > - pxor 64(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(80bytesin256) > - > - movdqa 80(%rdi), %xmm2 > - pxor 80(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(96bytesin256) > - > - movdqa 96(%rdi), %xmm2 > - pxor 96(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(112bytesin256) > - > - movdqa 112(%rdi), %xmm2 
> - pxor 112(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(128bytesin256) > - > - movdqa 128(%rdi), %xmm2 > - pxor 128(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(144bytesin256) > - > - movdqa 144(%rdi), %xmm2 > - pxor 144(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(160bytesin256) > - > - movdqa 160(%rdi), %xmm2 > - pxor 160(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(176bytesin256) > - > - movdqa 176(%rdi), %xmm2 > - pxor 176(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(192bytesin256) > - > - movdqa 192(%rdi), %xmm2 > - pxor 192(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(208bytesin256) > - > - movdqa 208(%rdi), %xmm2 > - pxor 208(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(224bytesin256) > - > - movdqa 224(%rdi), %xmm2 > - pxor 224(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(240bytesin256) > - > - movdqa 240(%rdi), %xmm2 > - pxor 240(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(256bytesin256) > - > - add $256, %rsi > - add $256, %rdi > - > - cmp $128, %rdx > - jae L(less256bytesin2alinged) > - > - cmp $64, %rdx > - jae L(less128bytesin2aligned) > - > - cmp $32, %rdx > - jb L(less32bytesin256in2alinged) > - > - movdqa (%rdi), %xmm2 > - pxor (%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(16bytesin256) > - > - movdqa 16(%rdi), %xmm2 > - pxor 16(%rsi), %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(32bytesin256) > - sub $32, %rdx > - add $32, %rdi > - add $32, %rsi > -L(less32bytesin256in2alinged): > - add %rdx, %rsi > - add %rdx, %rdi > - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) > - > - .p2align 4 > -L(512bytesormorein2aligned): > -# ifdef DATA_CACHE_SIZE_HALF > - mov $DATA_CACHE_SIZE_HALF, %R8_LP > -# else > - mov __x86_data_cache_size_half(%rip), %R8_LP > -# endif > - mov %r8, %r9 > - shr $1, %r8 > - add %r9, %r8 > - cmp %r8, %rdx > - ja L(L2_L3_cache_aglined) > - > - sub $64, %rdx > - .p2align 4 > -L(64bytesormore_loopin2aligned): > - movdqa (%rdi), %xmm2 > - pxor (%rsi), %xmm2 > - movdqa %xmm2, %xmm1 > - > - movdqa 16(%rdi), %xmm3 > - pxor 16(%rsi), %xmm3 > - por %xmm3, %xmm1 > - > - movdqa 32(%rdi), %xmm4 > - pxor 32(%rsi), %xmm4 > - por %xmm4, %xmm1 > - > - movdqa 48(%rdi), %xmm5 > - pxor 48(%rsi), %xmm5 > - por %xmm5, %xmm1 > - > - ptest %xmm1, %xmm0 > - jnc L(64bytesormore_loop_end) > - add $64, %rsi > - add $64, %rdi > - sub $64, %rdx > - jae L(64bytesormore_loopin2aligned) > - > - add $64, %rdx > - add %rdx, %rsi > - add %rdx, %rdi > - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) > -L(L2_L3_cache_aglined): > - sub $64, %rdx > - > - .p2align 4 > -L(L2_L3_aligned_128bytes_loop): > - prefetchnta 0x1c0(%rdi) > - prefetchnta 0x1c0(%rsi) > - movdqa (%rdi), %xmm2 > - pxor (%rsi), %xmm2 > - movdqa %xmm2, %xmm1 > - > - movdqa 16(%rdi), %xmm3 > - pxor 16(%rsi), %xmm3 > - por %xmm3, %xmm1 > - > - movdqa 32(%rdi), %xmm4 > - pxor 32(%rsi), %xmm4 > - por %xmm4, %xmm1 > - > - movdqa 48(%rdi), %xmm5 > - pxor 48(%rsi), %xmm5 > - por %xmm5, %xmm1 > - > - ptest %xmm1, %xmm0 > - jnc L(64bytesormore_loop_end) > - add $64, %rsi > - add $64, %rdi > - sub $64, %rdx > - jae L(L2_L3_aligned_128bytes_loop) > - > - add $64, %rdx > - add %rdx, %rsi > - add %rdx, %rdi > - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) > - > - > - .p2align 4 > -L(64bytesormore_loop_end): > - add $16, %rdi > - add $16, %rsi > - ptest %xmm2, %xmm0 > - jnc L(16bytes) > - > - add $16, %rdi > - add $16, %rsi > - ptest %xmm3, %xmm0 > - jnc L(16bytes) > - > - add $16, %rdi > - add $16, %rsi > - ptest %xmm4, %xmm0 > - jnc L(16bytes) > - > - add $16, %rdi > - add $16, %rsi > - jmp L(16bytes) > - > 
-L(256bytesin256): > - add $256, %rdi > - add $256, %rsi > - jmp L(16bytes) > -L(240bytesin256): > - add $240, %rdi > - add $240, %rsi > - jmp L(16bytes) > -L(224bytesin256): > - add $224, %rdi > - add $224, %rsi > - jmp L(16bytes) > -L(208bytesin256): > - add $208, %rdi > - add $208, %rsi > - jmp L(16bytes) > -L(192bytesin256): > - add $192, %rdi > - add $192, %rsi > - jmp L(16bytes) > -L(176bytesin256): > - add $176, %rdi > - add $176, %rsi > - jmp L(16bytes) > -L(160bytesin256): > - add $160, %rdi > - add $160, %rsi > - jmp L(16bytes) > -L(144bytesin256): > - add $144, %rdi > - add $144, %rsi > - jmp L(16bytes) > -L(128bytesin256): > - add $128, %rdi > - add $128, %rsi > - jmp L(16bytes) > -L(112bytesin256): > - add $112, %rdi > - add $112, %rsi > - jmp L(16bytes) > -L(96bytesin256): > - add $96, %rdi > - add $96, %rsi > - jmp L(16bytes) > -L(80bytesin256): > - add $80, %rdi > - add $80, %rsi > - jmp L(16bytes) > -L(64bytesin256): > - add $64, %rdi > - add $64, %rsi > - jmp L(16bytes) > -L(48bytesin256): > - add $16, %rdi > - add $16, %rsi > -L(32bytesin256): > - add $16, %rdi > - add $16, %rsi > -L(16bytesin256): > - add $16, %rdi > - add $16, %rsi > -L(16bytes): > - mov -16(%rdi), %rax > - mov -16(%rsi), %rcx > - cmp %rax, %rcx > - jne L(diffin8bytes) > -L(8bytes): > - mov -8(%rdi), %rax > - mov -8(%rsi), %rcx > - cmp %rax, %rcx > - jne L(diffin8bytes) > - xor %eax, %eax > - ret > - > - .p2align 4 > -L(12bytes): > - mov -12(%rdi), %rax > - mov -12(%rsi), %rcx > - cmp %rax, %rcx > - jne L(diffin8bytes) > -L(4bytes): > - mov -4(%rsi), %ecx > -# ifndef USE_AS_WMEMCMP > - mov -4(%rdi), %eax > - cmp %eax, %ecx > -# else > - cmp -4(%rdi), %ecx > -# endif > - jne L(diffin4bytes) > -L(0bytes): > - xor %eax, %eax > - ret > - > -# ifndef USE_AS_WMEMCMP > -/* unreal case for wmemcmp */ > - .p2align 4 > -L(65bytes): > - movdqu -65(%rdi), %xmm1 > - movdqu -65(%rsi), %xmm2 > - mov $-65, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(49bytes): > - movdqu -49(%rdi), %xmm1 > - movdqu -49(%rsi), %xmm2 > - mov $-49, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(33bytes): > - movdqu -33(%rdi), %xmm1 > - movdqu -33(%rsi), %xmm2 > - mov $-33, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(17bytes): > - mov -17(%rdi), %rax > - mov -17(%rsi), %rcx > - cmp %rax, %rcx > - jne L(diffin8bytes) > -L(9bytes): > - mov -9(%rdi), %rax > - mov -9(%rsi), %rcx > - cmp %rax, %rcx > - jne L(diffin8bytes) > - movzbl -1(%rdi), %eax > - movzbl -1(%rsi), %edx > - sub %edx, %eax > - ret > - > - .p2align 4 > -L(13bytes): > - mov -13(%rdi), %rax > - mov -13(%rsi), %rcx > - cmp %rax, %rcx > - jne L(diffin8bytes) > - mov -8(%rdi), %rax > - mov -8(%rsi), %rcx > - cmp %rax, %rcx > - jne L(diffin8bytes) > - xor %eax, %eax > - ret > - > - .p2align 4 > -L(5bytes): > - mov -5(%rdi), %eax > - mov -5(%rsi), %ecx > - cmp %eax, %ecx > - jne L(diffin4bytes) > - movzbl -1(%rdi), %eax > - movzbl -1(%rsi), %edx > - sub %edx, %eax > - ret > - > - .p2align 4 > -L(66bytes): > - movdqu -66(%rdi), %xmm1 > - movdqu -66(%rsi), %xmm2 > - mov $-66, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(50bytes): > - movdqu -50(%rdi), %xmm1 > - movdqu -50(%rsi), %xmm2 > - mov $-50, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(34bytes): > - movdqu -34(%rdi), %xmm1 > - movdqu -34(%rsi), %xmm2 > - mov $-34, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(18bytes): > - mov -18(%rdi), 
%rax > - mov -18(%rsi), %rcx > - cmp %rax, %rcx > - jne L(diffin8bytes) > -L(10bytes): > - mov -10(%rdi), %rax > - mov -10(%rsi), %rcx > - cmp %rax, %rcx > - jne L(diffin8bytes) > - movzwl -2(%rdi), %eax > - movzwl -2(%rsi), %ecx > - cmp %cl, %al > - jne L(end) > - and $0xffff, %eax > - and $0xffff, %ecx > - sub %ecx, %eax > - ret > - > - .p2align 4 > -L(14bytes): > - mov -14(%rdi), %rax > - mov -14(%rsi), %rcx > - cmp %rax, %rcx > - jne L(diffin8bytes) > - mov -8(%rdi), %rax > - mov -8(%rsi), %rcx > - cmp %rax, %rcx > - jne L(diffin8bytes) > - xor %eax, %eax > - ret > - > - .p2align 4 > -L(6bytes): > - mov -6(%rdi), %eax > - mov -6(%rsi), %ecx > - cmp %eax, %ecx > - jne L(diffin4bytes) > -L(2bytes): > - movzwl -2(%rsi), %ecx > - movzwl -2(%rdi), %eax > - cmp %cl, %al > - jne L(end) > - and $0xffff, %eax > - and $0xffff, %ecx > - sub %ecx, %eax > - ret > - > - .p2align 4 > -L(67bytes): > - movdqu -67(%rdi), %xmm2 > - movdqu -67(%rsi), %xmm1 > - mov $-67, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(51bytes): > - movdqu -51(%rdi), %xmm2 > - movdqu -51(%rsi), %xmm1 > - mov $-51, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(35bytes): > - movdqu -35(%rsi), %xmm1 > - movdqu -35(%rdi), %xmm2 > - mov $-35, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(19bytes): > - mov -19(%rdi), %rax > - mov -19(%rsi), %rcx > - cmp %rax, %rcx > - jne L(diffin8bytes) > -L(11bytes): > - mov -11(%rdi), %rax > - mov -11(%rsi), %rcx > - cmp %rax, %rcx > - jne L(diffin8bytes) > - mov -4(%rdi), %eax > - mov -4(%rsi), %ecx > - cmp %eax, %ecx > - jne L(diffin4bytes) > - xor %eax, %eax > - ret > - > - .p2align 4 > -L(15bytes): > - mov -15(%rdi), %rax > - mov -15(%rsi), %rcx > - cmp %rax, %rcx > - jne L(diffin8bytes) > - mov -8(%rdi), %rax > - mov -8(%rsi), %rcx > - cmp %rax, %rcx > - jne L(diffin8bytes) > - xor %eax, %eax > - ret > - > - .p2align 4 > -L(7bytes): > - mov -7(%rdi), %eax > - mov -7(%rsi), %ecx > - cmp %eax, %ecx > - jne L(diffin4bytes) > - mov -4(%rdi), %eax > - mov -4(%rsi), %ecx > - cmp %eax, %ecx > - jne L(diffin4bytes) > - xor %eax, %eax > - ret > - > - .p2align 4 > -L(3bytes): > - movzwl -3(%rdi), %eax > - movzwl -3(%rsi), %ecx > - cmp %eax, %ecx > - jne L(diffin2bytes) > -L(1bytes): > - movzbl -1(%rdi), %eax > - movzbl -1(%rsi), %ecx > - sub %ecx, %eax > - ret > -# endif > - > - .p2align 4 > -L(68bytes): > - movdqu -68(%rdi), %xmm2 > - movdqu -68(%rsi), %xmm1 > - mov $-68, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(52bytes): > - movdqu -52(%rdi), %xmm2 > - movdqu -52(%rsi), %xmm1 > - mov $-52, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(36bytes): > - movdqu -36(%rdi), %xmm2 > - movdqu -36(%rsi), %xmm1 > - mov $-36, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(20bytes): > - movdqu -20(%rdi), %xmm2 > - movdqu -20(%rsi), %xmm1 > - mov $-20, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > - mov -4(%rsi), %ecx > - > -# ifndef USE_AS_WMEMCMP > - mov -4(%rdi), %eax > - cmp %eax, %ecx > -# else > - cmp -4(%rdi), %ecx > -# endif > - jne L(diffin4bytes) > - xor %eax, %eax > - ret > - > -# ifndef USE_AS_WMEMCMP > -/* unreal cases for wmemcmp */ > - .p2align 4 > -L(69bytes): > - movdqu -69(%rsi), %xmm1 > - movdqu -69(%rdi), %xmm2 > - mov $-69, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(53bytes): > - movdqu -53(%rsi), %xmm1 > - movdqu -53(%rdi), %xmm2 > - mov $-53, 
%dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(37bytes): > - movdqu -37(%rsi), %xmm1 > - movdqu -37(%rdi), %xmm2 > - mov $-37, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(21bytes): > - movdqu -21(%rsi), %xmm1 > - movdqu -21(%rdi), %xmm2 > - mov $-21, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > - mov -8(%rdi), %rax > - mov -8(%rsi), %rcx > - cmp %rax, %rcx > - jne L(diffin8bytes) > - xor %eax, %eax > - ret > - > - .p2align 4 > -L(70bytes): > - movdqu -70(%rsi), %xmm1 > - movdqu -70(%rdi), %xmm2 > - mov $-70, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(54bytes): > - movdqu -54(%rsi), %xmm1 > - movdqu -54(%rdi), %xmm2 > - mov $-54, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(38bytes): > - movdqu -38(%rsi), %xmm1 > - movdqu -38(%rdi), %xmm2 > - mov $-38, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(22bytes): > - movdqu -22(%rsi), %xmm1 > - movdqu -22(%rdi), %xmm2 > - mov $-22, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > - mov -8(%rdi), %rax > - mov -8(%rsi), %rcx > - cmp %rax, %rcx > - jne L(diffin8bytes) > - xor %eax, %eax > - ret > - > - .p2align 4 > -L(71bytes): > - movdqu -71(%rsi), %xmm1 > - movdqu -71(%rdi), %xmm2 > - mov $-71, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(55bytes): > - movdqu -55(%rdi), %xmm2 > - movdqu -55(%rsi), %xmm1 > - mov $-55, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(39bytes): > - movdqu -39(%rdi), %xmm2 > - movdqu -39(%rsi), %xmm1 > - mov $-39, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(23bytes): > - movdqu -23(%rdi), %xmm2 > - movdqu -23(%rsi), %xmm1 > - mov $-23, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > - mov -8(%rdi), %rax > - mov -8(%rsi), %rcx > - cmp %rax, %rcx > - jne L(diffin8bytes) > - xor %eax, %eax > - ret > -# endif > - > - .p2align 4 > -L(72bytes): > - movdqu -72(%rsi), %xmm1 > - movdqu -72(%rdi), %xmm2 > - mov $-72, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(56bytes): > - movdqu -56(%rdi), %xmm2 > - movdqu -56(%rsi), %xmm1 > - mov $-56, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(40bytes): > - movdqu -40(%rdi), %xmm2 > - movdqu -40(%rsi), %xmm1 > - mov $-40, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(24bytes): > - movdqu -24(%rdi), %xmm2 > - movdqu -24(%rsi), %xmm1 > - mov $-24, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > - > - mov -8(%rsi), %rcx > - mov -8(%rdi), %rax > - cmp %rax, %rcx > - jne L(diffin8bytes) > - xor %eax, %eax > - ret > - > -# ifndef USE_AS_WMEMCMP > -/* unreal cases for wmemcmp */ > - .p2align 4 > -L(73bytes): > - movdqu -73(%rsi), %xmm1 > - movdqu -73(%rdi), %xmm2 > - mov $-73, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(57bytes): > - movdqu -57(%rdi), %xmm2 > - movdqu -57(%rsi), %xmm1 > - mov $-57, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(41bytes): > - movdqu -41(%rdi), %xmm2 > - movdqu -41(%rsi), %xmm1 > - mov $-41, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(25bytes): > - movdqu -25(%rdi), %xmm2 > - movdqu -25(%rsi), %xmm1 > - mov $-25, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > - mov -9(%rdi), %rax > - mov -9(%rsi), 
%rcx > - cmp %rax, %rcx > - jne L(diffin8bytes) > - movzbl -1(%rdi), %eax > - movzbl -1(%rsi), %ecx > - sub %ecx, %eax > - ret > - > - .p2align 4 > -L(74bytes): > - movdqu -74(%rsi), %xmm1 > - movdqu -74(%rdi), %xmm2 > - mov $-74, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(58bytes): > - movdqu -58(%rdi), %xmm2 > - movdqu -58(%rsi), %xmm1 > - mov $-58, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(42bytes): > - movdqu -42(%rdi), %xmm2 > - movdqu -42(%rsi), %xmm1 > - mov $-42, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(26bytes): > - movdqu -26(%rdi), %xmm2 > - movdqu -26(%rsi), %xmm1 > - mov $-26, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > - mov -10(%rdi), %rax > - mov -10(%rsi), %rcx > - cmp %rax, %rcx > - jne L(diffin8bytes) > - movzwl -2(%rdi), %eax > - movzwl -2(%rsi), %ecx > - jmp L(diffin2bytes) > - > - .p2align 4 > -L(75bytes): > - movdqu -75(%rsi), %xmm1 > - movdqu -75(%rdi), %xmm2 > - mov $-75, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(59bytes): > - movdqu -59(%rdi), %xmm2 > - movdqu -59(%rsi), %xmm1 > - mov $-59, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(43bytes): > - movdqu -43(%rdi), %xmm2 > - movdqu -43(%rsi), %xmm1 > - mov $-43, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(27bytes): > - movdqu -27(%rdi), %xmm2 > - movdqu -27(%rsi), %xmm1 > - mov $-27, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > - mov -11(%rdi), %rax > - mov -11(%rsi), %rcx > - cmp %rax, %rcx > - jne L(diffin8bytes) > - mov -4(%rdi), %eax > - mov -4(%rsi), %ecx > - cmp %eax, %ecx > - jne L(diffin4bytes) > - xor %eax, %eax > - ret > -# endif > - .p2align 4 > -L(76bytes): > - movdqu -76(%rsi), %xmm1 > - movdqu -76(%rdi), %xmm2 > - mov $-76, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(60bytes): > - movdqu -60(%rdi), %xmm2 > - movdqu -60(%rsi), %xmm1 > - mov $-60, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(44bytes): > - movdqu -44(%rdi), %xmm2 > - movdqu -44(%rsi), %xmm1 > - mov $-44, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(28bytes): > - movdqu -28(%rdi), %xmm2 > - movdqu -28(%rsi), %xmm1 > - mov $-28, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > - mov -12(%rdi), %rax > - mov -12(%rsi), %rcx > - cmp %rax, %rcx > - jne L(diffin8bytes) > - mov -4(%rsi), %ecx > -# ifndef USE_AS_WMEMCMP > - mov -4(%rdi), %eax > - cmp %eax, %ecx > -# else > - cmp -4(%rdi), %ecx > -# endif > - jne L(diffin4bytes) > - xor %eax, %eax > - ret > - > -# ifndef USE_AS_WMEMCMP > -/* unreal cases for wmemcmp */ > - .p2align 4 > -L(77bytes): > - movdqu -77(%rsi), %xmm1 > - movdqu -77(%rdi), %xmm2 > - mov $-77, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(61bytes): > - movdqu -61(%rdi), %xmm2 > - movdqu -61(%rsi), %xmm1 > - mov $-61, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(45bytes): > - movdqu -45(%rdi), %xmm2 > - movdqu -45(%rsi), %xmm1 > - mov $-45, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(29bytes): > - movdqu -29(%rdi), %xmm2 > - movdqu -29(%rsi), %xmm1 > - mov $-29, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > - > - mov -13(%rdi), %rax > - mov -13(%rsi), %rcx > - cmp %rax, %rcx > - jne L(diffin8bytes) > - > - mov -8(%rdi), 
%rax > - mov -8(%rsi), %rcx > - cmp %rax, %rcx > - jne L(diffin8bytes) > - xor %eax, %eax > - ret > - > - .p2align 4 > -L(78bytes): > - movdqu -78(%rsi), %xmm1 > - movdqu -78(%rdi), %xmm2 > - mov $-78, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(62bytes): > - movdqu -62(%rdi), %xmm2 > - movdqu -62(%rsi), %xmm1 > - mov $-62, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(46bytes): > - movdqu -46(%rdi), %xmm2 > - movdqu -46(%rsi), %xmm1 > - mov $-46, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(30bytes): > - movdqu -30(%rdi), %xmm2 > - movdqu -30(%rsi), %xmm1 > - mov $-30, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > - mov -14(%rdi), %rax > - mov -14(%rsi), %rcx > - cmp %rax, %rcx > - jne L(diffin8bytes) > - mov -8(%rdi), %rax > - mov -8(%rsi), %rcx > - cmp %rax, %rcx > - jne L(diffin8bytes) > - xor %eax, %eax > - ret > - > - .p2align 4 > -L(79bytes): > - movdqu -79(%rsi), %xmm1 > - movdqu -79(%rdi), %xmm2 > - mov $-79, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(63bytes): > - movdqu -63(%rdi), %xmm2 > - movdqu -63(%rsi), %xmm1 > - mov $-63, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(47bytes): > - movdqu -47(%rdi), %xmm2 > - movdqu -47(%rsi), %xmm1 > - mov $-47, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(31bytes): > - movdqu -31(%rdi), %xmm2 > - movdqu -31(%rsi), %xmm1 > - mov $-31, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > - mov -15(%rdi), %rax > - mov -15(%rsi), %rcx > - cmp %rax, %rcx > - jne L(diffin8bytes) > - mov -8(%rdi), %rax > - mov -8(%rsi), %rcx > - cmp %rax, %rcx > - jne L(diffin8bytes) > - xor %eax, %eax > - ret > -# endif > - .p2align 4 > -L(64bytes): > - movdqu -64(%rdi), %xmm2 > - movdqu -64(%rsi), %xmm1 > - mov $-64, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(48bytes): > - movdqu -48(%rdi), %xmm2 > - movdqu -48(%rsi), %xmm1 > - mov $-48, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > -L(32bytes): > - movdqu -32(%rdi), %xmm2 > - movdqu -32(%rsi), %xmm1 > - mov $-32, %dl > - pxor %xmm1, %xmm2 > - ptest %xmm2, %xmm0 > - jnc L(less16bytes) > - > - mov -16(%rdi), %rax > - mov -16(%rsi), %rcx > - cmp %rax, %rcx > - jne L(diffin8bytes) > - > - mov -8(%rdi), %rax > - mov -8(%rsi), %rcx > - cmp %rax, %rcx > - jne L(diffin8bytes) > - xor %eax, %eax > - ret > - > -/* > - * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block. 
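For reviewers, here is a rough C model of the header path of the new memcmp.S below.
It is only a sketch: cross_page and first_64_bytes are names made up for this mail and
do not appear in the patch, the exact branch layout differs (the real code tests the
first 16 bytes before touching the remaining 48, and it also takes this path near a
page end whenever n > 64, since then the 64-byte loads cannot overrun the buffer), and
the cross-page case with small n is left to the byte-by-byte loop.

#include <stddef.h>
#include <stdint.h>
#include <emmintrin.h>

/* Illustrative names only; not part of the patch.  */

static inline int
cross_page (const void *p)
{
  /* A 64-byte load starting at P stays inside its 4K page iff the
     offset within the page is at most 4096 - 64 = 4032.  */
  return ((uintptr_t) p & 4095) > 4032;
}

static int
first_64_bytes (const unsigned char *s1, const unsigned char *s2, size_t n)
{
  /* Caller guarantees the 64-byte reads are safe: either neither
     pointer is near a page end, or n > 64.  Bit i of diff is set
     when s1[i] != s2[i], for i < 64.  */
  uint64_t diff = 0;
  for (int i = 0; i < 64; i += 16)
    {
      __m128i a = _mm_loadu_si128 ((const __m128i *) (s1 + i));
      __m128i b = _mm_loadu_si128 ((const __m128i *) (s2 + i));
      uint64_t eq = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, b));
      diff |= (eq ^ 0xffff) << i;
    }
  /* Mask of the first n byte positions; the xor/bts/sub sequence in
     the assembly computes the same thing without a variable-count
     shift.  */
  uint64_t first_n = n >= 64 ? ~0ULL : (1ULL << n) - 1;
  if (diff & first_n)
    {
      size_t i = __builtin_ctzll (diff & first_n);
      return s1[i] - s2[i];
    }
  return 0;   /* Equal in the first n bytes (or first 64, if n > 64).  */
}
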
diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S index f636716..88c0c4a 100644 --- a/sysdeps/x86_64/memcmp.S +++ b/sysdeps/x86_64/memcmp.S @@ -19,340 +19,204 @@ #include <sysdep.h> +#ifndef MEMCMP +# define MEMCMP memcmp +#endif + .text -ENTRY (memcmp) - test %rdx, %rdx - jz L(finz) - cmpq $1, %rdx - jle L(finr1b) - subq %rdi, %rsi - movq %rdx, %r10 - cmpq $32, %r10 - jge L(gt32) - /* Handle small chunks and last block of less than 32 bytes. */ -L(small): - testq $1, %r10 - jz L(s2b) - movzbl (%rdi), %eax - movzbl (%rdi, %rsi), %edx - subq $1, %r10 - je L(finz1) - addq $1, %rdi - subl %edx, %eax - jnz L(exit) -L(s2b): - testq $2, %r10 - jz L(s4b) - movzwl (%rdi), %eax - movzwl (%rdi, %rsi), %edx - subq $2, %r10 - je L(fin2_7) - addq $2, %rdi - cmpl %edx, %eax - jnz L(fin2_7) -L(s4b): - testq $4, %r10 - jz L(s8b) - movl (%rdi), %eax - movl (%rdi, %rsi), %edx - subq $4, %r10 - je L(fin2_7) - addq $4, %rdi - cmpl %edx, %eax - jnz L(fin2_7) -L(s8b): - testq $8, %r10 - jz L(s16b) - movq (%rdi), %rax - movq (%rdi, %rsi), %rdx - subq $8, %r10 - je L(fin2_7) - addq $8, %rdi - cmpq %rdx, %rax - jnz L(fin2_7) -L(s16b): - movdqu (%rdi), %xmm1 - movdqu (%rdi, %rsi), %xmm0 - pcmpeqb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - xorl %eax, %eax - subl $0xffff, %edx - jz L(finz) - bsfl %edx, %ecx - leaq (%rdi, %rcx), %rcx - movzbl (%rcx), %eax - movzbl (%rsi, %rcx), %edx - jmp L(finz1) +ENTRY (MEMCMP) + testq %rdx, %rdx + je L(return_zero) +#ifdef AS_WMEMCMP + shl $2, %rdx +#endif + pxor %xmm4, %xmm4 + movl %edi, %eax + andl $4095, %eax + cmpl $4032, %eax + ja L(cross_page_start) +L(handle_end): + movl %esi, %eax + andl $4095, %eax + cmpl $4032, %eax + ja L(cross_page_start) +L(back_header): + xor %ecx, %ecx + bts %rdx, %rcx + sub $1, %rcx + movdqu (%rdi), %xmm0 + movdqu (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %eax + and %ecx, %eax + jne L(different) + cmpq $16, %rdx + ja L(next) + ret +L(next): + pmovmskb %xmm0, %r8d + movdqu 16(%rdi), %xmm2 + movdqu 16(%rsi), %xmm6 + movdqu 32(%rdi), %xmm1 + pcmpeqb %xmm6, %xmm2 + movdqu 32(%rsi), %xmm5 + pcmpeqb %xmm4, %xmm2 + pcmpeqb %xmm5, %xmm1 + movdqu 48(%rdi), %xmm7 + pmovmskb %xmm2, %eax + movdqu 48(%rsi), %xmm3 + pcmpeqb %xmm4, %xmm1 + pmovmskb %xmm1, %r9d + sal $16, %eax + pcmpeqb %xmm3, %xmm7 + salq $32, %r9 + pcmpeqb %xmm4, %xmm7 + orq %r9, %rax + orq %r8, %rax + pmovmskb %xmm7, %r8d + salq $48, %r8 + orq %r8, %rax + movq %rax, %r8 + andq %rcx, %rax + jne L(different) + cmpq $64, %rdx + jb L(return_zero) + movq %r8, %rax + testq %rax, %rax + jne L(different) +L(align_loop): + leaq 64(%rdi), %rax + andq $-64, %rax + subq %rdi, %rax + subq %rax, %rdx + addq %rax, %rdi + addq %rax, %rsi + cmpq $64, %rdx + ja L(loop_start) + testq %rdx, %rdx + jne L(handle_end) + xorl %eax, %eax + ret - .p2align 4,, 4 -L(finr1b): - movzbl (%rdi), %eax - movzbl (%rsi), %edx -L(finz1): + .p2align 4 +L(different): + bsfq %rax, %rdx +#ifdef AS_WMEMCMP + and $-4, %rdx + mov (%rdi,%rdx), %eax + mov (%rsi,%rdx), %edx subl %edx, %eax -L(exit): + jg L(ret1) + jl L(ret_neg_1) ret - - .p2align 4,, 4 -L(fin2_7): - cmpq %rdx, %rax - jz L(finz) - movq %rax, %r11 - subq %rdx, %r11 - bsfq %r11, %rcx - sarq $3, %rcx - salq $3, %rcx - sarq %cl, %rax - movzbl %al, %eax - sarq %cl, %rdx - movzbl %dl, %edx +L(ret1): + mov $1, %eax + ret +L(ret_neg_1): + mov $-1, %eax + ret +#else + movzbl (%rdi,%rdx), %eax + movzbl (%rsi,%rdx), %edx subl %edx, %eax ret - - .p2align 4,, 4 -L(finz): +#endif +L(return_zero): + xor %eax, %eax + ret + .p2align 4 +L(loop): + 
subq $64, %rdx + addq $64, %rdi + addq $64, %rsi + cmpq $64, %rdx + jbe L(less_64_bytes) +L(loop_start): + movdqu (%rsi), %xmm0 + movdqu 16(%rsi), %xmm1 + pcmpeqb (%rdi), %xmm0 + movdqu 32(%rsi), %xmm2 + pcmpeqb 16(%rdi), %xmm1 + movdqu 48(%rsi), %xmm3 + pcmpeqb 32(%rdi), %xmm2 + pcmpeqb 48(%rdi), %xmm3 + pminub %xmm0, %xmm3 + pminub %xmm1, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm4, %xmm3 + pmovmskb %xmm3, %eax + testl %eax, %eax + je L(loop) + shl $48, %rax + pcmpeqb %xmm4, %xmm0 + pcmpeqb %xmm4, %xmm1 + pcmpeqb %xmm4, %xmm2 + pmovmskb %xmm0, %r8 + pmovmskb %xmm1, %rcx + pmovmskb %xmm2, %r9 + shl $16, %ecx + shl $32, %r9 + or %r8, %rax + or %r9, %rax + or %rcx, %rax + jmp L(different) + + .p2align 4 +L(less_64_bytes): + testq %rdx, %rdx + jne L(handle_end) xorl %eax, %eax ret - /* For blocks bigger than 32 bytes - 1. Advance one of the addr pointer to be 16B aligned. - 2. Treat the case of both addr pointers aligned to 16B - separately to avoid movdqu. - 3. Handle any blocks of greater than 64 consecutive bytes with - unrolling to reduce branches. - 4. At least one addr pointer is 16B aligned, use memory version - of pcmbeqb. - */ - .p2align 4,, 4 -L(gt32): - movq %rdx, %r11 - addq %rdi, %r11 - movq %rdi, %r8 - - andq $15, %r8 - jz L(16am) - /* Both pointers may be misaligned. */ - movdqu (%rdi), %xmm1 - movdqu (%rdi, %rsi), %xmm0 - pcmpeqb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - subl $0xffff, %edx - jnz L(neq) - neg %r8 - leaq 16(%rdi, %r8), %rdi -L(16am): - /* Handle two 16B aligned pointers separately. */ - testq $15, %rsi - jz L(ATR) - testq $16, %rdi - jz L(A32) - movdqu (%rdi, %rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi -L(A32): - movq %r11, %r10 - andq $-32, %r10 - cmpq %r10, %rdi - jge L(mt16) - /* Pre-unroll to be ready for unrolled 64B loop. 
*/ - testq $32, %rdi - jz L(A64) - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - -L(A64): - movq %r11, %r10 - andq $-64, %r10 - cmpq %r10, %rdi - jge L(mt32) - -L(A64main): - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - cmpq %rdi, %r10 - jne L(A64main) - -L(mt32): - movq %r11, %r10 - andq $-32, %r10 - cmpq %r10, %rdi - jge L(mt16) -L(A32main): - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - cmpq %rdi, %r10 - jne L(A32main) -L(mt16): - subq %rdi, %r11 - je L(finz) - movq %r11, %r10 - jmp L(small) - - .p2align 4,, 4 -L(neq): - bsfl %edx, %ecx - movzbl (%rdi, %rcx), %eax - addq %rdi, %rsi - movzbl (%rsi,%rcx), %edx - jmp L(finz1) - - .p2align 4,, 4 -L(ATR): - movq %r11, %r10 - andq $-32, %r10 - cmpq %r10, %rdi - jge L(mt16) - testq $16, %rdi - jz L(ATR32) - - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - cmpq %rdi, %r10 - je L(mt16) - -L(ATR32): - movq %r11, %r10 - andq $-64, %r10 - testq $32, %rdi - jz L(ATR64) - - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - -L(ATR64): - cmpq %rdi, %r10 - je L(mt32) - -L(ATR64main): - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - cmpq %rdi, %r10 - jne L(ATR64main) - - movq %r11, %r10 - andq $-32, %r10 - cmpq %r10, %rdi - jge L(mt16) - -L(ATR32res): - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - cmpq %r10, %rdi - jne L(ATR32res) - - subq %rdi, %r11 - je L(finz) - movq %r11, %r10 - jmp L(small) - /* Align to 16byte to improve instruction fetch. 
*/ - .p2align 4,, 4 -END(memcmp) + .p2align 4 +L(cross_page_start): + cmp $64, %rdx + ja L(back_header) + + .p2align 4 +L(cross_page): + test %edx, %edx + je L(return_zero) +#ifdef AS_WMEMCMP + mov (%rdi), %eax + mov (%rsi), %ecx + subl %ecx, %eax + jg L(ret1) + jl L(ret_neg_1) +#else + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + subl %ecx, %eax + jne L(return) + cmp $1, %edx + je L(return) + movzbl 1(%rdi), %eax + movzbl 1(%rsi), %ecx + subl %ecx, %eax + jne L(return) + cmp $2, %edx + je L(return) + movzbl 2(%rdi), %eax + movzbl 2(%rsi), %ecx + subl %ecx, %eax + jne L(return) + cmp $3, %edx + je L(return) + movzbl 3(%rdi), %eax + movzbl 3(%rsi), %ecx + subl %ecx, %eax + jne L(return) +#endif + sub $4, %edx + add $4, %rdi + add $4, %rsi + jmp L(cross_page) +L(return): + ret +END(MEMCMP) -#undef bcmp +#undef bcmp weak_alias (memcmp, bcmp) libc_hidden_builtin_def (memcmp) diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index c573744..679db2a 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -8,7 +8,7 @@ ifeq ($(subdir),string) sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \ strcmp-sse2-unaligned strncmp-ssse3 \ - memcmp-sse4 memcpy-ssse3 \ + memcpy-ssse3 \ memcpy-sse2-unaligned mempcpy-ssse3 \ memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \ memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \ @@ -29,10 +29,10 @@ CFLAGS-strspn-c.c += -msse4 endif ifeq (yes,$(config-cflags-avx2)) -sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 +sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 memcmp-avx2 endif endif ifeq ($(subdir),wcsmbs) -sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c +sysdep_routines += wmemcmp-sse2-unaligned wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c endif diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index d398e43..b3dbe65 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -39,10 +39,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/memcmp.S. */ IFUNC_IMPL (i, name, memcmp, - IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSE4_1, - __memcmp_sse4_1) + IFUNC_IMPL_ADD (array, i, memcmp, HAS_AVX2, __memcmp_avx2) IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSSE3, __memcmp_ssse3) - IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2)) + IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2_unaligned)) /* Support sysdeps/x86_64/multiarch/memmove_chk.S. */ IFUNC_IMPL (i, name, __memmove_chk, @@ -211,8 +210,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/wmemcmp.S. 
*/ IFUNC_IMPL (i, name, wmemcmp, - IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_SSE4_1, - __wmemcmp_sse4_1) + IFUNC_IMPL_ADD (array, i, wmemcmp, 1, + __wmemcmp_sse2_unaligned) IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_SSSE3, __wmemcmp_ssse3) IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2)) diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2.S b/sysdeps/x86_64/multiarch/memcmp-avx2.S new file mode 100644 index 0000000..60483bf --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcmp-avx2.S @@ -0,0 +1,3 @@ +#define USE_AVX2 +#define MEMCMP __memcmp_avx2 +#include "../memcmp.S" diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S deleted file mode 100644 index 533fece..0000000 --- a/sysdeps/x86_64/multiarch/memcmp-sse4.S +++ /dev/null @@ -1,1776 +0,0 @@ -/* memcmp with SSE4.1, wmemcmp with SSE4.1 - Copyright (C) 2010-2015 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#if IS_IN (libc) - -# include <sysdep.h> - -# ifndef MEMCMP -# define MEMCMP __memcmp_sse4_1 -# endif - -# define JMPTBL(I, B) (I - B) - -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - lea TABLE(%rip), %r11; \ - movslq (%r11, INDEX, SCALE), %rcx; \ - add %r11, %rcx; \ - jmp *%rcx; \ - ud2 - -/* Warning! - wmemcmp has to use SIGNED comparison for elements. - memcmp has to use UNSIGNED comparison for elemnts. 
-*/ - - .section .text.sse4.1,"ax",@progbits -ENTRY (MEMCMP) -# ifdef USE_AS_WMEMCMP - shl $2, %rdx -# endif - pxor %xmm0, %xmm0 - cmp $79, %rdx - ja L(79bytesormore) -# ifndef USE_AS_WMEMCMP - cmp $1, %rdx - je L(firstbyte) -# endif - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) - -# ifndef USE_AS_WMEMCMP - .p2align 4 -L(firstbyte): - movzbl (%rdi), %eax - movzbl (%rsi), %ecx - sub %ecx, %eax - ret -# endif - - .p2align 4 -L(79bytesormore): - movdqu (%rsi), %xmm1 - movdqu (%rdi), %xmm2 - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(16bytesin256) - mov %rsi, %rcx - and $-16, %rsi - add $16, %rsi - sub %rsi, %rcx - - sub %rcx, %rdi - add %rcx, %rdx - test $0xf, %rdi - jz L(2aligned) - - cmp $128, %rdx - ja L(128bytesormore) -L(less128bytes): - sub $64, %rdx - - movdqu (%rdi), %xmm2 - pxor (%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(16bytesin256) - - movdqu 16(%rdi), %xmm2 - pxor 16(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(32bytesin256) - - movdqu 32(%rdi), %xmm2 - pxor 32(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(48bytesin256) - - movdqu 48(%rdi), %xmm2 - pxor 48(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(64bytesin256) - cmp $32, %rdx - jb L(less32bytesin64) - - movdqu 64(%rdi), %xmm2 - pxor 64(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(80bytesin256) - - movdqu 80(%rdi), %xmm2 - pxor 80(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(96bytesin256) - sub $32, %rdx - add $32, %rdi - add $32, %rsi -L(less32bytesin64): - add $64, %rdi - add $64, %rsi - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) - -L(128bytesormore): - cmp $512, %rdx - ja L(512bytesormore) - cmp $256, %rdx - ja L(less512bytes) -L(less256bytes): - sub $128, %rdx - - movdqu (%rdi), %xmm2 - pxor (%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(16bytesin256) - - movdqu 16(%rdi), %xmm2 - pxor 16(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(32bytesin256) - - movdqu 32(%rdi), %xmm2 - pxor 32(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(48bytesin256) - - movdqu 48(%rdi), %xmm2 - pxor 48(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(64bytesin256) - - movdqu 64(%rdi), %xmm2 - pxor 64(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(80bytesin256) - - movdqu 80(%rdi), %xmm2 - pxor 80(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(96bytesin256) - - movdqu 96(%rdi), %xmm2 - pxor 96(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(112bytesin256) - - movdqu 112(%rdi), %xmm2 - pxor 112(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(128bytesin256) - - add $128, %rsi - add $128, %rdi - - cmp $64, %rdx - jae L(less128bytes) - - cmp $32, %rdx - jb L(less32bytesin128) - - movdqu (%rdi), %xmm2 - pxor (%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(16bytesin256) - - movdqu 16(%rdi), %xmm2 - pxor 16(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(32bytesin256) - sub $32, %rdx - add $32, %rdi - add $32, %rsi -L(less32bytesin128): - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) - -L(less512bytes): - sub $256, %rdx - movdqu (%rdi), %xmm2 - pxor (%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(16bytesin256) - - movdqu 16(%rdi), %xmm2 - pxor 16(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(32bytesin256) - - movdqu 32(%rdi), %xmm2 - pxor 32(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(48bytesin256) - - movdqu 48(%rdi), %xmm2 - pxor 48(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(64bytesin256) - - movdqu 64(%rdi), %xmm2 - pxor 64(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(80bytesin256) - - movdqu 80(%rdi), %xmm2 - pxor 80(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(96bytesin256) - - movdqu 96(%rdi), %xmm2 - pxor 
96(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(112bytesin256) - - movdqu 112(%rdi), %xmm2 - pxor 112(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(128bytesin256) - - movdqu 128(%rdi), %xmm2 - pxor 128(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(144bytesin256) - - movdqu 144(%rdi), %xmm2 - pxor 144(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(160bytesin256) - - movdqu 160(%rdi), %xmm2 - pxor 160(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(176bytesin256) - - movdqu 176(%rdi), %xmm2 - pxor 176(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(192bytesin256) - - movdqu 192(%rdi), %xmm2 - pxor 192(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(208bytesin256) - - movdqu 208(%rdi), %xmm2 - pxor 208(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(224bytesin256) - - movdqu 224(%rdi), %xmm2 - pxor 224(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(240bytesin256) - - movdqu 240(%rdi), %xmm2 - pxor 240(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(256bytesin256) - - add $256, %rsi - add $256, %rdi - - cmp $128, %rdx - jae L(less256bytes) - - cmp $64, %rdx - jae L(less128bytes) - - cmp $32, %rdx - jb L(less32bytesin256) - - movdqu (%rdi), %xmm2 - pxor (%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(16bytesin256) - - movdqu 16(%rdi), %xmm2 - pxor 16(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(32bytesin256) - sub $32, %rdx - add $32, %rdi - add $32, %rsi -L(less32bytesin256): - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) - - .p2align 4 -L(512bytesormore): -# ifdef DATA_CACHE_SIZE_HALF - mov $DATA_CACHE_SIZE_HALF, %R8_LP -# else - mov __x86_data_cache_size_half(%rip), %R8_LP -# endif - mov %r8, %r9 - shr $1, %r8 - add %r9, %r8 - cmp %r8, %rdx - ja L(L2_L3_cache_unaglined) - sub $64, %rdx - .p2align 4 -L(64bytesormore_loop): - movdqu (%rdi), %xmm2 - pxor (%rsi), %xmm2 - movdqa %xmm2, %xmm1 - - movdqu 16(%rdi), %xmm3 - pxor 16(%rsi), %xmm3 - por %xmm3, %xmm1 - - movdqu 32(%rdi), %xmm4 - pxor 32(%rsi), %xmm4 - por %xmm4, %xmm1 - - movdqu 48(%rdi), %xmm5 - pxor 48(%rsi), %xmm5 - por %xmm5, %xmm1 - - ptest %xmm1, %xmm0 - jnc L(64bytesormore_loop_end) - add $64, %rsi - add $64, %rdi - sub $64, %rdx - jae L(64bytesormore_loop) - - add $64, %rdx - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) - -L(L2_L3_cache_unaglined): - sub $64, %rdx - .p2align 4 -L(L2_L3_unaligned_128bytes_loop): - prefetchnta 0x1c0(%rdi) - prefetchnta 0x1c0(%rsi) - movdqu (%rdi), %xmm2 - pxor (%rsi), %xmm2 - movdqa %xmm2, %xmm1 - - movdqu 16(%rdi), %xmm3 - pxor 16(%rsi), %xmm3 - por %xmm3, %xmm1 - - movdqu 32(%rdi), %xmm4 - pxor 32(%rsi), %xmm4 - por %xmm4, %xmm1 - - movdqu 48(%rdi), %xmm5 - pxor 48(%rsi), %xmm5 - por %xmm5, %xmm1 - - ptest %xmm1, %xmm0 - jnc L(64bytesormore_loop_end) - add $64, %rsi - add $64, %rdi - sub $64, %rdx - jae L(L2_L3_unaligned_128bytes_loop) - - add $64, %rdx - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) - -/* - * This case is for machines which are sensitive for unaligned instructions. 
- */ - .p2align 4 -L(2aligned): - cmp $128, %rdx - ja L(128bytesormorein2aligned) -L(less128bytesin2aligned): - sub $64, %rdx - - movdqa (%rdi), %xmm2 - pxor (%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(16bytesin256) - - movdqa 16(%rdi), %xmm2 - pxor 16(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(32bytesin256) - - movdqa 32(%rdi), %xmm2 - pxor 32(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(48bytesin256) - - movdqa 48(%rdi), %xmm2 - pxor 48(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(64bytesin256) - cmp $32, %rdx - jb L(less32bytesin64in2alinged) - - movdqa 64(%rdi), %xmm2 - pxor 64(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(80bytesin256) - - movdqa 80(%rdi), %xmm2 - pxor 80(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(96bytesin256) - sub $32, %rdx - add $32, %rdi - add $32, %rsi -L(less32bytesin64in2alinged): - add $64, %rdi - add $64, %rsi - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) - - .p2align 4 -L(128bytesormorein2aligned): - cmp $512, %rdx - ja L(512bytesormorein2aligned) - cmp $256, %rdx - ja L(256bytesormorein2aligned) -L(less256bytesin2alinged): - sub $128, %rdx - - movdqa (%rdi), %xmm2 - pxor (%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(16bytesin256) - - movdqa 16(%rdi), %xmm2 - pxor 16(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(32bytesin256) - - movdqa 32(%rdi), %xmm2 - pxor 32(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(48bytesin256) - - movdqa 48(%rdi), %xmm2 - pxor 48(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(64bytesin256) - - movdqa 64(%rdi), %xmm2 - pxor 64(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(80bytesin256) - - movdqa 80(%rdi), %xmm2 - pxor 80(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(96bytesin256) - - movdqa 96(%rdi), %xmm2 - pxor 96(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(112bytesin256) - - movdqa 112(%rdi), %xmm2 - pxor 112(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(128bytesin256) - - add $128, %rsi - add $128, %rdi - - cmp $64, %rdx - jae L(less128bytesin2aligned) - - cmp $32, %rdx - jb L(less32bytesin128in2aligned) - - movdqu (%rdi), %xmm2 - pxor (%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(16bytesin256) - - movdqu 16(%rdi), %xmm2 - pxor 16(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(32bytesin256) - sub $32, %rdx - add $32, %rdi - add $32, %rsi -L(less32bytesin128in2aligned): - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) - - .p2align 4 -L(256bytesormorein2aligned): - - sub $256, %rdx - movdqa (%rdi), %xmm2 - pxor (%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(16bytesin256) - - movdqa 16(%rdi), %xmm2 - pxor 16(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(32bytesin256) - - movdqa 32(%rdi), %xmm2 - pxor 32(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(48bytesin256) - - movdqa 48(%rdi), %xmm2 - pxor 48(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(64bytesin256) - - movdqa 64(%rdi), %xmm2 - pxor 64(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(80bytesin256) - - movdqa 80(%rdi), %xmm2 - pxor 80(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(96bytesin256) - - movdqa 96(%rdi), %xmm2 - pxor 96(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(112bytesin256) - - movdqa 112(%rdi), %xmm2 - pxor 112(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(128bytesin256) - - movdqa 128(%rdi), %xmm2 - pxor 128(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(144bytesin256) - - movdqa 144(%rdi), %xmm2 - pxor 144(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(160bytesin256) - - movdqa 160(%rdi), %xmm2 - pxor 160(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(176bytesin256) - - movdqa 176(%rdi), %xmm2 - pxor 176(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(192bytesin256) - - movdqa 
192(%rdi), %xmm2 - pxor 192(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(208bytesin256) - - movdqa 208(%rdi), %xmm2 - pxor 208(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(224bytesin256) - - movdqa 224(%rdi), %xmm2 - pxor 224(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(240bytesin256) - - movdqa 240(%rdi), %xmm2 - pxor 240(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(256bytesin256) - - add $256, %rsi - add $256, %rdi - - cmp $128, %rdx - jae L(less256bytesin2alinged) - - cmp $64, %rdx - jae L(less128bytesin2aligned) - - cmp $32, %rdx - jb L(less32bytesin256in2alinged) - - movdqa (%rdi), %xmm2 - pxor (%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(16bytesin256) - - movdqa 16(%rdi), %xmm2 - pxor 16(%rsi), %xmm2 - ptest %xmm2, %xmm0 - jnc L(32bytesin256) - sub $32, %rdx - add $32, %rdi - add $32, %rsi -L(less32bytesin256in2alinged): - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) - - .p2align 4 -L(512bytesormorein2aligned): -# ifdef DATA_CACHE_SIZE_HALF - mov $DATA_CACHE_SIZE_HALF, %R8_LP -# else - mov __x86_data_cache_size_half(%rip), %R8_LP -# endif - mov %r8, %r9 - shr $1, %r8 - add %r9, %r8 - cmp %r8, %rdx - ja L(L2_L3_cache_aglined) - - sub $64, %rdx - .p2align 4 -L(64bytesormore_loopin2aligned): - movdqa (%rdi), %xmm2 - pxor (%rsi), %xmm2 - movdqa %xmm2, %xmm1 - - movdqa 16(%rdi), %xmm3 - pxor 16(%rsi), %xmm3 - por %xmm3, %xmm1 - - movdqa 32(%rdi), %xmm4 - pxor 32(%rsi), %xmm4 - por %xmm4, %xmm1 - - movdqa 48(%rdi), %xmm5 - pxor 48(%rsi), %xmm5 - por %xmm5, %xmm1 - - ptest %xmm1, %xmm0 - jnc L(64bytesormore_loop_end) - add $64, %rsi - add $64, %rdi - sub $64, %rdx - jae L(64bytesormore_loopin2aligned) - - add $64, %rdx - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) -L(L2_L3_cache_aglined): - sub $64, %rdx - - .p2align 4 -L(L2_L3_aligned_128bytes_loop): - prefetchnta 0x1c0(%rdi) - prefetchnta 0x1c0(%rsi) - movdqa (%rdi), %xmm2 - pxor (%rsi), %xmm2 - movdqa %xmm2, %xmm1 - - movdqa 16(%rdi), %xmm3 - pxor 16(%rsi), %xmm3 - por %xmm3, %xmm1 - - movdqa 32(%rdi), %xmm4 - pxor 32(%rsi), %xmm4 - por %xmm4, %xmm1 - - movdqa 48(%rdi), %xmm5 - pxor 48(%rsi), %xmm5 - por %xmm5, %xmm1 - - ptest %xmm1, %xmm0 - jnc L(64bytesormore_loop_end) - add $64, %rsi - add $64, %rdi - sub $64, %rdx - jae L(L2_L3_aligned_128bytes_loop) - - add $64, %rdx - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) - - - .p2align 4 -L(64bytesormore_loop_end): - add $16, %rdi - add $16, %rsi - ptest %xmm2, %xmm0 - jnc L(16bytes) - - add $16, %rdi - add $16, %rsi - ptest %xmm3, %xmm0 - jnc L(16bytes) - - add $16, %rdi - add $16, %rsi - ptest %xmm4, %xmm0 - jnc L(16bytes) - - add $16, %rdi - add $16, %rsi - jmp L(16bytes) - -L(256bytesin256): - add $256, %rdi - add $256, %rsi - jmp L(16bytes) -L(240bytesin256): - add $240, %rdi - add $240, %rsi - jmp L(16bytes) -L(224bytesin256): - add $224, %rdi - add $224, %rsi - jmp L(16bytes) -L(208bytesin256): - add $208, %rdi - add $208, %rsi - jmp L(16bytes) -L(192bytesin256): - add $192, %rdi - add $192, %rsi - jmp L(16bytes) -L(176bytesin256): - add $176, %rdi - add $176, %rsi - jmp L(16bytes) -L(160bytesin256): - add $160, %rdi - add $160, %rsi - jmp L(16bytes) -L(144bytesin256): - add $144, %rdi - add $144, %rsi - jmp L(16bytes) -L(128bytesin256): - add $128, %rdi - add $128, %rsi - jmp L(16bytes) -L(112bytesin256): - add $112, %rdi - add $112, %rsi - jmp L(16bytes) -L(96bytesin256): - add $96, %rdi - add $96, %rsi - jmp L(16bytes) -L(80bytesin256): - add $80, %rdi - add $80, %rsi - jmp 
L(16bytes) -L(64bytesin256): - add $64, %rdi - add $64, %rsi - jmp L(16bytes) -L(48bytesin256): - add $16, %rdi - add $16, %rsi -L(32bytesin256): - add $16, %rdi - add $16, %rsi -L(16bytesin256): - add $16, %rdi - add $16, %rsi -L(16bytes): - mov -16(%rdi), %rax - mov -16(%rsi), %rcx - cmp %rax, %rcx - jne L(diffin8bytes) -L(8bytes): - mov -8(%rdi), %rax - mov -8(%rsi), %rcx - cmp %rax, %rcx - jne L(diffin8bytes) - xor %eax, %eax - ret - - .p2align 4 -L(12bytes): - mov -12(%rdi), %rax - mov -12(%rsi), %rcx - cmp %rax, %rcx - jne L(diffin8bytes) -L(4bytes): - mov -4(%rsi), %ecx -# ifndef USE_AS_WMEMCMP - mov -4(%rdi), %eax - cmp %eax, %ecx -# else - cmp -4(%rdi), %ecx -# endif - jne L(diffin4bytes) -L(0bytes): - xor %eax, %eax - ret - -# ifndef USE_AS_WMEMCMP -/* unreal case for wmemcmp */ - .p2align 4 -L(65bytes): - movdqu -65(%rdi), %xmm1 - movdqu -65(%rsi), %xmm2 - mov $-65, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(49bytes): - movdqu -49(%rdi), %xmm1 - movdqu -49(%rsi), %xmm2 - mov $-49, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(33bytes): - movdqu -33(%rdi), %xmm1 - movdqu -33(%rsi), %xmm2 - mov $-33, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(17bytes): - mov -17(%rdi), %rax - mov -17(%rsi), %rcx - cmp %rax, %rcx - jne L(diffin8bytes) -L(9bytes): - mov -9(%rdi), %rax - mov -9(%rsi), %rcx - cmp %rax, %rcx - jne L(diffin8bytes) - movzbl -1(%rdi), %eax - movzbl -1(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(13bytes): - mov -13(%rdi), %rax - mov -13(%rsi), %rcx - cmp %rax, %rcx - jne L(diffin8bytes) - mov -8(%rdi), %rax - mov -8(%rsi), %rcx - cmp %rax, %rcx - jne L(diffin8bytes) - xor %eax, %eax - ret - - .p2align 4 -L(5bytes): - mov -5(%rdi), %eax - mov -5(%rsi), %ecx - cmp %eax, %ecx - jne L(diffin4bytes) - movzbl -1(%rdi), %eax - movzbl -1(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(66bytes): - movdqu -66(%rdi), %xmm1 - movdqu -66(%rsi), %xmm2 - mov $-66, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(50bytes): - movdqu -50(%rdi), %xmm1 - movdqu -50(%rsi), %xmm2 - mov $-50, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(34bytes): - movdqu -34(%rdi), %xmm1 - movdqu -34(%rsi), %xmm2 - mov $-34, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(18bytes): - mov -18(%rdi), %rax - mov -18(%rsi), %rcx - cmp %rax, %rcx - jne L(diffin8bytes) -L(10bytes): - mov -10(%rdi), %rax - mov -10(%rsi), %rcx - cmp %rax, %rcx - jne L(diffin8bytes) - movzwl -2(%rdi), %eax - movzwl -2(%rsi), %ecx - cmp %cl, %al - jne L(end) - and $0xffff, %eax - and $0xffff, %ecx - sub %ecx, %eax - ret - - .p2align 4 -L(14bytes): - mov -14(%rdi), %rax - mov -14(%rsi), %rcx - cmp %rax, %rcx - jne L(diffin8bytes) - mov -8(%rdi), %rax - mov -8(%rsi), %rcx - cmp %rax, %rcx - jne L(diffin8bytes) - xor %eax, %eax - ret - - .p2align 4 -L(6bytes): - mov -6(%rdi), %eax - mov -6(%rsi), %ecx - cmp %eax, %ecx - jne L(diffin4bytes) -L(2bytes): - movzwl -2(%rsi), %ecx - movzwl -2(%rdi), %eax - cmp %cl, %al - jne L(end) - and $0xffff, %eax - and $0xffff, %ecx - sub %ecx, %eax - ret - - .p2align 4 -L(67bytes): - movdqu -67(%rdi), %xmm2 - movdqu -67(%rsi), %xmm1 - mov $-67, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(51bytes): - movdqu -51(%rdi), %xmm2 - movdqu -51(%rsi), %xmm1 - mov $-51, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(35bytes): - movdqu -35(%rsi), %xmm1 - movdqu -35(%rdi), %xmm2 - mov $-35, %dl - pxor %xmm1, %xmm2 - ptest 
%xmm2, %xmm0 - jnc L(less16bytes) -L(19bytes): - mov -19(%rdi), %rax - mov -19(%rsi), %rcx - cmp %rax, %rcx - jne L(diffin8bytes) -L(11bytes): - mov -11(%rdi), %rax - mov -11(%rsi), %rcx - cmp %rax, %rcx - jne L(diffin8bytes) - mov -4(%rdi), %eax - mov -4(%rsi), %ecx - cmp %eax, %ecx - jne L(diffin4bytes) - xor %eax, %eax - ret - - .p2align 4 -L(15bytes): - mov -15(%rdi), %rax - mov -15(%rsi), %rcx - cmp %rax, %rcx - jne L(diffin8bytes) - mov -8(%rdi), %rax - mov -8(%rsi), %rcx - cmp %rax, %rcx - jne L(diffin8bytes) - xor %eax, %eax - ret - - .p2align 4 -L(7bytes): - mov -7(%rdi), %eax - mov -7(%rsi), %ecx - cmp %eax, %ecx - jne L(diffin4bytes) - mov -4(%rdi), %eax - mov -4(%rsi), %ecx - cmp %eax, %ecx - jne L(diffin4bytes) - xor %eax, %eax - ret - - .p2align 4 -L(3bytes): - movzwl -3(%rdi), %eax - movzwl -3(%rsi), %ecx - cmp %eax, %ecx - jne L(diffin2bytes) -L(1bytes): - movzbl -1(%rdi), %eax - movzbl -1(%rsi), %ecx - sub %ecx, %eax - ret -# endif - - .p2align 4 -L(68bytes): - movdqu -68(%rdi), %xmm2 - movdqu -68(%rsi), %xmm1 - mov $-68, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(52bytes): - movdqu -52(%rdi), %xmm2 - movdqu -52(%rsi), %xmm1 - mov $-52, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(36bytes): - movdqu -36(%rdi), %xmm2 - movdqu -36(%rsi), %xmm1 - mov $-36, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(20bytes): - movdqu -20(%rdi), %xmm2 - movdqu -20(%rsi), %xmm1 - mov $-20, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) - mov -4(%rsi), %ecx - -# ifndef USE_AS_WMEMCMP - mov -4(%rdi), %eax - cmp %eax, %ecx -# else - cmp -4(%rdi), %ecx -# endif - jne L(diffin4bytes) - xor %eax, %eax - ret - -# ifndef USE_AS_WMEMCMP -/* unreal cases for wmemcmp */ - .p2align 4 -L(69bytes): - movdqu -69(%rsi), %xmm1 - movdqu -69(%rdi), %xmm2 - mov $-69, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(53bytes): - movdqu -53(%rsi), %xmm1 - movdqu -53(%rdi), %xmm2 - mov $-53, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(37bytes): - movdqu -37(%rsi), %xmm1 - movdqu -37(%rdi), %xmm2 - mov $-37, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(21bytes): - movdqu -21(%rsi), %xmm1 - movdqu -21(%rdi), %xmm2 - mov $-21, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) - mov -8(%rdi), %rax - mov -8(%rsi), %rcx - cmp %rax, %rcx - jne L(diffin8bytes) - xor %eax, %eax - ret - - .p2align 4 -L(70bytes): - movdqu -70(%rsi), %xmm1 - movdqu -70(%rdi), %xmm2 - mov $-70, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(54bytes): - movdqu -54(%rsi), %xmm1 - movdqu -54(%rdi), %xmm2 - mov $-54, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(38bytes): - movdqu -38(%rsi), %xmm1 - movdqu -38(%rdi), %xmm2 - mov $-38, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(22bytes): - movdqu -22(%rsi), %xmm1 - movdqu -22(%rdi), %xmm2 - mov $-22, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) - mov -8(%rdi), %rax - mov -8(%rsi), %rcx - cmp %rax, %rcx - jne L(diffin8bytes) - xor %eax, %eax - ret - - .p2align 4 -L(71bytes): - movdqu -71(%rsi), %xmm1 - movdqu -71(%rdi), %xmm2 - mov $-71, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(55bytes): - movdqu -55(%rdi), %xmm2 - movdqu -55(%rsi), %xmm1 - mov $-55, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(39bytes): - movdqu -39(%rdi), %xmm2 - movdqu -39(%rsi), %xmm1 - mov $-39, %dl - pxor %xmm1, 
%xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(23bytes): - movdqu -23(%rdi), %xmm2 - movdqu -23(%rsi), %xmm1 - mov $-23, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) - mov -8(%rdi), %rax - mov -8(%rsi), %rcx - cmp %rax, %rcx - jne L(diffin8bytes) - xor %eax, %eax - ret -# endif - - .p2align 4 -L(72bytes): - movdqu -72(%rsi), %xmm1 - movdqu -72(%rdi), %xmm2 - mov $-72, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(56bytes): - movdqu -56(%rdi), %xmm2 - movdqu -56(%rsi), %xmm1 - mov $-56, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(40bytes): - movdqu -40(%rdi), %xmm2 - movdqu -40(%rsi), %xmm1 - mov $-40, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(24bytes): - movdqu -24(%rdi), %xmm2 - movdqu -24(%rsi), %xmm1 - mov $-24, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) - - mov -8(%rsi), %rcx - mov -8(%rdi), %rax - cmp %rax, %rcx - jne L(diffin8bytes) - xor %eax, %eax - ret - -# ifndef USE_AS_WMEMCMP -/* unreal cases for wmemcmp */ - .p2align 4 -L(73bytes): - movdqu -73(%rsi), %xmm1 - movdqu -73(%rdi), %xmm2 - mov $-73, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(57bytes): - movdqu -57(%rdi), %xmm2 - movdqu -57(%rsi), %xmm1 - mov $-57, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(41bytes): - movdqu -41(%rdi), %xmm2 - movdqu -41(%rsi), %xmm1 - mov $-41, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(25bytes): - movdqu -25(%rdi), %xmm2 - movdqu -25(%rsi), %xmm1 - mov $-25, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) - mov -9(%rdi), %rax - mov -9(%rsi), %rcx - cmp %rax, %rcx - jne L(diffin8bytes) - movzbl -1(%rdi), %eax - movzbl -1(%rsi), %ecx - sub %ecx, %eax - ret - - .p2align 4 -L(74bytes): - movdqu -74(%rsi), %xmm1 - movdqu -74(%rdi), %xmm2 - mov $-74, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(58bytes): - movdqu -58(%rdi), %xmm2 - movdqu -58(%rsi), %xmm1 - mov $-58, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(42bytes): - movdqu -42(%rdi), %xmm2 - movdqu -42(%rsi), %xmm1 - mov $-42, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(26bytes): - movdqu -26(%rdi), %xmm2 - movdqu -26(%rsi), %xmm1 - mov $-26, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) - mov -10(%rdi), %rax - mov -10(%rsi), %rcx - cmp %rax, %rcx - jne L(diffin8bytes) - movzwl -2(%rdi), %eax - movzwl -2(%rsi), %ecx - jmp L(diffin2bytes) - - .p2align 4 -L(75bytes): - movdqu -75(%rsi), %xmm1 - movdqu -75(%rdi), %xmm2 - mov $-75, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(59bytes): - movdqu -59(%rdi), %xmm2 - movdqu -59(%rsi), %xmm1 - mov $-59, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(43bytes): - movdqu -43(%rdi), %xmm2 - movdqu -43(%rsi), %xmm1 - mov $-43, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(27bytes): - movdqu -27(%rdi), %xmm2 - movdqu -27(%rsi), %xmm1 - mov $-27, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) - mov -11(%rdi), %rax - mov -11(%rsi), %rcx - cmp %rax, %rcx - jne L(diffin8bytes) - mov -4(%rdi), %eax - mov -4(%rsi), %ecx - cmp %eax, %ecx - jne L(diffin4bytes) - xor %eax, %eax - ret -# endif - .p2align 4 -L(76bytes): - movdqu -76(%rsi), %xmm1 - movdqu -76(%rdi), %xmm2 - mov $-76, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(60bytes): - movdqu -60(%rdi), %xmm2 - movdqu -60(%rsi), %xmm1 - mov $-60, %dl - 
pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(44bytes): - movdqu -44(%rdi), %xmm2 - movdqu -44(%rsi), %xmm1 - mov $-44, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(28bytes): - movdqu -28(%rdi), %xmm2 - movdqu -28(%rsi), %xmm1 - mov $-28, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) - mov -12(%rdi), %rax - mov -12(%rsi), %rcx - cmp %rax, %rcx - jne L(diffin8bytes) - mov -4(%rsi), %ecx -# ifndef USE_AS_WMEMCMP - mov -4(%rdi), %eax - cmp %eax, %ecx -# else - cmp -4(%rdi), %ecx -# endif - jne L(diffin4bytes) - xor %eax, %eax - ret - -# ifndef USE_AS_WMEMCMP -/* unreal cases for wmemcmp */ - .p2align 4 -L(77bytes): - movdqu -77(%rsi), %xmm1 - movdqu -77(%rdi), %xmm2 - mov $-77, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(61bytes): - movdqu -61(%rdi), %xmm2 - movdqu -61(%rsi), %xmm1 - mov $-61, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(45bytes): - movdqu -45(%rdi), %xmm2 - movdqu -45(%rsi), %xmm1 - mov $-45, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(29bytes): - movdqu -29(%rdi), %xmm2 - movdqu -29(%rsi), %xmm1 - mov $-29, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) - - mov -13(%rdi), %rax - mov -13(%rsi), %rcx - cmp %rax, %rcx - jne L(diffin8bytes) - - mov -8(%rdi), %rax - mov -8(%rsi), %rcx - cmp %rax, %rcx - jne L(diffin8bytes) - xor %eax, %eax - ret - - .p2align 4 -L(78bytes): - movdqu -78(%rsi), %xmm1 - movdqu -78(%rdi), %xmm2 - mov $-78, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(62bytes): - movdqu -62(%rdi), %xmm2 - movdqu -62(%rsi), %xmm1 - mov $-62, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(46bytes): - movdqu -46(%rdi), %xmm2 - movdqu -46(%rsi), %xmm1 - mov $-46, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(30bytes): - movdqu -30(%rdi), %xmm2 - movdqu -30(%rsi), %xmm1 - mov $-30, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) - mov -14(%rdi), %rax - mov -14(%rsi), %rcx - cmp %rax, %rcx - jne L(diffin8bytes) - mov -8(%rdi), %rax - mov -8(%rsi), %rcx - cmp %rax, %rcx - jne L(diffin8bytes) - xor %eax, %eax - ret - - .p2align 4 -L(79bytes): - movdqu -79(%rsi), %xmm1 - movdqu -79(%rdi), %xmm2 - mov $-79, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(63bytes): - movdqu -63(%rdi), %xmm2 - movdqu -63(%rsi), %xmm1 - mov $-63, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(47bytes): - movdqu -47(%rdi), %xmm2 - movdqu -47(%rsi), %xmm1 - mov $-47, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(31bytes): - movdqu -31(%rdi), %xmm2 - movdqu -31(%rsi), %xmm1 - mov $-31, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) - mov -15(%rdi), %rax - mov -15(%rsi), %rcx - cmp %rax, %rcx - jne L(diffin8bytes) - mov -8(%rdi), %rax - mov -8(%rsi), %rcx - cmp %rax, %rcx - jne L(diffin8bytes) - xor %eax, %eax - ret -# endif - .p2align 4 -L(64bytes): - movdqu -64(%rdi), %xmm2 - movdqu -64(%rsi), %xmm1 - mov $-64, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(48bytes): - movdqu -48(%rdi), %xmm2 - movdqu -48(%rsi), %xmm1 - mov $-48, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) -L(32bytes): - movdqu -32(%rdi), %xmm2 - movdqu -32(%rsi), %xmm1 - mov $-32, %dl - pxor %xmm1, %xmm2 - ptest %xmm2, %xmm0 - jnc L(less16bytes) - - mov -16(%rdi), %rax - mov -16(%rsi), %rcx - cmp %rax, %rcx - jne L(diffin8bytes) - - mov -8(%rdi), %rax - mov -8(%rsi), 
%rcx - cmp %rax, %rcx - jne L(diffin8bytes) - xor %eax, %eax - ret - -/* - * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block. - */ - .p2align 3 -L(less16bytes): - movsbq %dl, %rdx - mov (%rsi, %rdx), %rcx - mov (%rdi, %rdx), %rax - cmp %rax, %rcx - jne L(diffin8bytes) - mov 8(%rsi, %rdx), %rcx - mov 8(%rdi, %rdx), %rax -L(diffin8bytes): - cmp %eax, %ecx - jne L(diffin4bytes) - shr $32, %rcx - shr $32, %rax - -# ifdef USE_AS_WMEMCMP -/* for wmemcmp */ - cmp %eax, %ecx - jne L(diffin4bytes) - xor %eax, %eax - ret -# endif - -L(diffin4bytes): -# ifndef USE_AS_WMEMCMP - cmp %cx, %ax - jne L(diffin2bytes) - shr $16, %ecx - shr $16, %eax -L(diffin2bytes): - cmp %cl, %al - jne L(end) - and $0xffff, %eax - and $0xffff, %ecx - sub %ecx, %eax - ret - - .p2align 4 -L(end): - and $0xff, %eax - and $0xff, %ecx - sub %ecx, %eax - ret -# else - -/* for wmemcmp */ - mov $1, %eax - jl L(nequal_bigger) - neg %eax - ret - - .p2align 4 -L(nequal_bigger): - ret - -L(unreal_case): - xor %eax, %eax - ret -# endif - -END (MEMCMP) - - .section .rodata.sse4.1,"a",@progbits - .p2align 3 -# ifndef USE_AS_WMEMCMP -L(table_64bytes): - .int JMPTBL (L(0bytes), L(table_64bytes)) - .int JMPTBL (L(1bytes), L(table_64bytes)) - .int JMPTBL (L(2bytes), L(table_64bytes)) - .int JMPTBL (L(3bytes), L(table_64bytes)) - .int JMPTBL (L(4bytes), L(table_64bytes)) - .int JMPTBL (L(5bytes), L(table_64bytes)) - .int JMPTBL (L(6bytes), L(table_64bytes)) - .int JMPTBL (L(7bytes), L(table_64bytes)) - .int JMPTBL (L(8bytes), L(table_64bytes)) - .int JMPTBL (L(9bytes), L(table_64bytes)) - .int JMPTBL (L(10bytes), L(table_64bytes)) - .int JMPTBL (L(11bytes), L(table_64bytes)) - .int JMPTBL (L(12bytes), L(table_64bytes)) - .int JMPTBL (L(13bytes), L(table_64bytes)) - .int JMPTBL (L(14bytes), L(table_64bytes)) - .int JMPTBL (L(15bytes), L(table_64bytes)) - .int JMPTBL (L(16bytes), L(table_64bytes)) - .int JMPTBL (L(17bytes), L(table_64bytes)) - .int JMPTBL (L(18bytes), L(table_64bytes)) - .int JMPTBL (L(19bytes), L(table_64bytes)) - .int JMPTBL (L(20bytes), L(table_64bytes)) - .int JMPTBL (L(21bytes), L(table_64bytes)) - .int JMPTBL (L(22bytes), L(table_64bytes)) - .int JMPTBL (L(23bytes), L(table_64bytes)) - .int JMPTBL (L(24bytes), L(table_64bytes)) - .int JMPTBL (L(25bytes), L(table_64bytes)) - .int JMPTBL (L(26bytes), L(table_64bytes)) - .int JMPTBL (L(27bytes), L(table_64bytes)) - .int JMPTBL (L(28bytes), L(table_64bytes)) - .int JMPTBL (L(29bytes), L(table_64bytes)) - .int JMPTBL (L(30bytes), L(table_64bytes)) - .int JMPTBL (L(31bytes), L(table_64bytes)) - .int JMPTBL (L(32bytes), L(table_64bytes)) - .int JMPTBL (L(33bytes), L(table_64bytes)) - .int JMPTBL (L(34bytes), L(table_64bytes)) - .int JMPTBL (L(35bytes), L(table_64bytes)) - .int JMPTBL (L(36bytes), L(table_64bytes)) - .int JMPTBL (L(37bytes), L(table_64bytes)) - .int JMPTBL (L(38bytes), L(table_64bytes)) - .int JMPTBL (L(39bytes), L(table_64bytes)) - .int JMPTBL (L(40bytes), L(table_64bytes)) - .int JMPTBL (L(41bytes), L(table_64bytes)) - .int JMPTBL (L(42bytes), L(table_64bytes)) - .int JMPTBL (L(43bytes), L(table_64bytes)) - .int JMPTBL (L(44bytes), L(table_64bytes)) - .int JMPTBL (L(45bytes), L(table_64bytes)) - .int JMPTBL (L(46bytes), L(table_64bytes)) - .int JMPTBL (L(47bytes), L(table_64bytes)) - .int JMPTBL (L(48bytes), L(table_64bytes)) - .int JMPTBL (L(49bytes), L(table_64bytes)) - .int JMPTBL (L(50bytes), L(table_64bytes)) - .int JMPTBL (L(51bytes), L(table_64bytes)) - .int JMPTBL (L(52bytes), L(table_64bytes)) - .int JMPTBL (L(53bytes), 
L(table_64bytes)) - .int JMPTBL (L(54bytes), L(table_64bytes)) - .int JMPTBL (L(55bytes), L(table_64bytes)) - .int JMPTBL (L(56bytes), L(table_64bytes)) - .int JMPTBL (L(57bytes), L(table_64bytes)) - .int JMPTBL (L(58bytes), L(table_64bytes)) - .int JMPTBL (L(59bytes), L(table_64bytes)) - .int JMPTBL (L(60bytes), L(table_64bytes)) - .int JMPTBL (L(61bytes), L(table_64bytes)) - .int JMPTBL (L(62bytes), L(table_64bytes)) - .int JMPTBL (L(63bytes), L(table_64bytes)) - .int JMPTBL (L(64bytes), L(table_64bytes)) - .int JMPTBL (L(65bytes), L(table_64bytes)) - .int JMPTBL (L(66bytes), L(table_64bytes)) - .int JMPTBL (L(67bytes), L(table_64bytes)) - .int JMPTBL (L(68bytes), L(table_64bytes)) - .int JMPTBL (L(69bytes), L(table_64bytes)) - .int JMPTBL (L(70bytes), L(table_64bytes)) - .int JMPTBL (L(71bytes), L(table_64bytes)) - .int JMPTBL (L(72bytes), L(table_64bytes)) - .int JMPTBL (L(73bytes), L(table_64bytes)) - .int JMPTBL (L(74bytes), L(table_64bytes)) - .int JMPTBL (L(75bytes), L(table_64bytes)) - .int JMPTBL (L(76bytes), L(table_64bytes)) - .int JMPTBL (L(77bytes), L(table_64bytes)) - .int JMPTBL (L(78bytes), L(table_64bytes)) - .int JMPTBL (L(79bytes), L(table_64bytes)) -# else -L(table_64bytes): - .int JMPTBL (L(0bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(4bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(8bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(12bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(16bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(20bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(24bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(28bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(32bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(36bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(40bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(44bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(48bytes), 
L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(52bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(56bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(60bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(64bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(68bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(72bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(76bytes), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) - .int JMPTBL (L(unreal_case), L(table_64bytes)) -# endif -#endif diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S index f8b4636..5d87a17 100644 --- a/sysdeps/x86_64/multiarch/memcmp.S +++ b/sysdeps/x86_64/multiarch/memcmp.S @@ -29,33 +29,28 @@ ENTRY(memcmp) cmpl $0, KIND_OFFSET+__cpu_features(%rip) jne 1f call __init_cpu_features - -1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) + testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip) jnz 2f - leaq __memcmp_sse2(%rip), %rax - ret - -2: testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip) - jz 3f - leaq __memcmp_sse4_1(%rip), %rax +1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) + jnz 3f +2: leaq __memcmp_sse2_unaligned(%rip), %rax ret 3: leaq __memcmp_ssse3(%rip), %rax ret - END(memcmp) # undef ENTRY # define ENTRY(name) \ - .type __memcmp_sse2, @function; \ + .type __memcmp_sse2_unaligned, @function; \ .p2align 4; \ - .globl __memcmp_sse2; \ - .hidden __memcmp_sse2; \ - __memcmp_sse2: cfi_startproc; \ + .globl __memcmp_sse2_unaligned; \ + .hidden __memcmp_sse2_unaligned; \ + __memcmp_sse2_unaligned: cfi_startproc; \ CALL_MCOUNT # undef END # define END(name) \ - cfi_endproc; .size __memcmp_sse2, .-__memcmp_sse2 + cfi_endproc; .size __memcmp_sse2_unaligned, .-__memcmp_sse2_unaligned # ifdef SHARED # undef libc_hidden_builtin_def @@ -63,7 +58,7 @@ END(memcmp) they will be called without setting up EBX needed for PLT which is used by IFUNC. 
*/ # define libc_hidden_builtin_def(name) \ - .globl __GI_memcmp; __GI_memcmp = __memcmp_sse2 + .globl __GI_memcmp; __GI_memcmp = __memcmp_sse2_unaligned # endif #endif diff --git a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S index 695a236..5dd8d44 100644 --- a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S +++ b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S @@ -201,6 +201,10 @@ L(prepare_loop): movdqu %xmm2, 96(%rdi) movdqu %xmm3, 112(%rdi) +#ifdef USE_AVX2 + vpxor %xmm5, %xmm5, %xmm5 +#endif + subq %rsi, %rdi add $64, %rsi andq $-64, %rsi @@ -348,10 +352,13 @@ L(cross_loop): sub $1, %rcx ja L(cross_loop) +#ifdef USE_AVX2 + vpxor %xmm5, %xmm5, %xmm5 +#else pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 - +#endif lea -64(%rsi), %rdx andq $-64, %rdx addq %rdx, %rdi diff --git a/sysdeps/x86_64/multiarch/wmemcmp-sse4.S b/sysdeps/x86_64/multiarch/wmemcmp-sse4.S deleted file mode 100644 index b07973a..0000000 --- a/sysdeps/x86_64/multiarch/wmemcmp-sse4.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_WMEMCMP 1 -#define MEMCMP __wmemcmp_sse4_1 - -#include "memcmp-sse4.S" diff --git a/sysdeps/x86_64/multiarch/wmemcmp.S b/sysdeps/x86_64/multiarch/wmemcmp.S index 109e245..dabd3ed 100644 --- a/sysdeps/x86_64/multiarch/wmemcmp.S +++ b/sysdeps/x86_64/multiarch/wmemcmp.S @@ -30,18 +30,16 @@ ENTRY(wmemcmp) jne 1f call __init_cpu_features -1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) + testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip) jnz 2f - leaq __wmemcmp_sse2(%rip), %rax - ret - -2: testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip) - jz 3f - leaq __wmemcmp_sse4_1(%rip), %rax +1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip) + jnz 3f +2: leaq __wmemcmp_sse2_unaligned(%rip), %rax ret 3: leaq __wmemcmp_ssse3(%rip), %rax ret + END(wmemcmp) #endif
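
For reference, the aligned loop can be modeled in C roughly as below; loop_64 is again
only an illustrative name, s1 is 64-byte aligned at this point in the real code (the
sketch simply uses unaligned loads), and the tail here is a plain byte loop where the
assembly goes back through the masked header compare.  The point is the pminub fold:
one pcmpeqb against zero plus one pmovmskb per 64 bytes is enough to detect a
difference, and the exact per-chunk masks are rebuilt only on the slow path.

#include <stddef.h>
#include <stdint.h>
#include <emmintrin.h>

static int
loop_64 (const unsigned char *s1, const unsigned char *s2, size_t n)
{
  __m128i zero = _mm_setzero_si128 ();
  while (n > 64)
    {
      __m128i c0 = _mm_cmpeq_epi8 (_mm_loadu_si128 ((const __m128i *) s1),
                                   _mm_loadu_si128 ((const __m128i *) s2));
      __m128i c1 = _mm_cmpeq_epi8 (_mm_loadu_si128 ((const __m128i *) (s1 + 16)),
                                   _mm_loadu_si128 ((const __m128i *) (s2 + 16)));
      __m128i c2 = _mm_cmpeq_epi8 (_mm_loadu_si128 ((const __m128i *) (s1 + 32)),
                                   _mm_loadu_si128 ((const __m128i *) (s2 + 32)));
      __m128i c3 = _mm_cmpeq_epi8 (_mm_loadu_si128 ((const __m128i *) (s1 + 48)),
                                   _mm_loadu_si128 ((const __m128i *) (s2 + 48)));
      /* pminub keeps a zero byte wherever any of the four comparison
         results had one, i.e. wherever some byte pair differed.  */
      __m128i fold = _mm_min_epu8 (_mm_min_epu8 (c0, c1),
                                   _mm_min_epu8 (c2, c3));
      if (_mm_movemask_epi8 (_mm_cmpeq_epi8 (fold, zero)) != 0)
        {
          /* Slow path: build the exact 64-bit difference mask and
             report the first differing byte.  */
          uint64_t m = (uint64_t) (_mm_movemask_epi8 (c0) ^ 0xffff)
                       | (uint64_t) (_mm_movemask_epi8 (c1) ^ 0xffff) << 16
                       | (uint64_t) (_mm_movemask_epi8 (c2) ^ 0xffff) << 32
                       | (uint64_t) (_mm_movemask_epi8 (c3) ^ 0xffff) << 48;
          size_t i = __builtin_ctzll (m);
          return s1[i] - s2[i];
        }
      s1 += 64;
      s2 += 64;
      n -= 64;
    }
  /* At most 64 bytes remain; the assembly sends them back through the
     masked header compare, a byte loop is enough for the model.  */
  for (size_t i = 0; i < n; i++)
    if (s1[i] != s2[i])
      return s1[i] - s2[i];
  return 0;
}

The assembly additionally reuses the folded mask, shifted into bits 48..63, instead of
recomputing the mask of the last chunk; that can set spurious bits there, but never
below the first real difference, so bsf still lands on the right byte.  The sketch
recomputes all four masks to keep it readable.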