Message ID | 20220325183625.1170867-6-goldstein.w.n@gmail.com |
---|---|
State | New |
Headers | show |
Series | [v1,1/6] x86: Remove {w}memcmp-ssse3 | expand |
On Fri, Mar 25, 2022 at 11:36 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer > SSSE3. As a result it's no longer worth the code size cost. > --- > sysdeps/x86_64/multiarch/Makefile | 4 - > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 8 - > sysdeps/x86_64/multiarch/stpcpy-ssse3.S | 3 - > sysdeps/x86_64/multiarch/stpncpy-ssse3.S | 4 - > sysdeps/x86_64/multiarch/strcpy-ssse3.S | 3550 -------------------- > sysdeps/x86_64/multiarch/strncpy-ssse3.S | 3 - > 6 files changed, 3572 deletions(-) > delete mode 100644 sysdeps/x86_64/multiarch/stpcpy-ssse3.S > delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-ssse3.S > delete mode 100644 sysdeps/x86_64/multiarch/strcpy-ssse3.S > delete mode 100644 sysdeps/x86_64/multiarch/strncpy-ssse3.S > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index a2ebc06c5f..292353bad7 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -42,13 +42,11 @@ sysdep_routines += \ > stpcpy-evex \ > stpcpy-sse2 \ > stpcpy-sse2-unaligned \ > - stpcpy-ssse3 \ > stpncpy-avx2 \ > stpncpy-avx2-rtm \ > stpncpy-c \ > stpncpy-evex \ > stpncpy-sse2-unaligned \ > - stpncpy-ssse3 \ > strcasecmp_l-avx2 \ > strcasecmp_l-avx2-rtm \ > strcasecmp_l-evex \ > @@ -79,7 +77,6 @@ sysdep_routines += \ > strcpy-evex \ > strcpy-sse2 \ > strcpy-sse2-unaligned \ > - strcpy-ssse3 \ > strcspn-c \ > strcspn-sse2 \ > strlen-avx2 \ > @@ -106,7 +103,6 @@ sysdep_routines += \ > strncpy-c \ > strncpy-evex \ > strncpy-sse2-unaligned \ > - strncpy-ssse3 \ > strnlen-avx2 \ > strnlen-avx2-rtm \ > strnlen-evex \ > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index 4133ed7e43..505b8002e0 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -389,8 +389,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl 
*array, > > /* Support sysdeps/x86_64/multiarch/stpncpy.c. */ > IFUNC_IMPL (i, name, stpncpy, > - IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (SSSE3), > - __stpncpy_ssse3) > IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2), > __stpncpy_avx2) > IFUNC_IMPL_ADD (array, i, stpncpy, > @@ -407,8 +405,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/x86_64/multiarch/stpcpy.c. */ > IFUNC_IMPL (i, name, stpcpy, > - IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (SSSE3), > - __stpcpy_ssse3) > IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2), > __stpcpy_avx2) > IFUNC_IMPL_ADD (array, i, stpcpy, > @@ -557,8 +553,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW)), > __strcpy_evex) > - IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3), > - __strcpy_ssse3) > IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned) > IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2)) > > @@ -634,8 +628,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW)), > __strncpy_evex) > - IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3), > - __strncpy_ssse3) > IFUNC_IMPL_ADD (array, i, strncpy, 1, > __strncpy_sse2_unaligned) > IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2)) > diff --git a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S > deleted file mode 100644 > index d971c2da38..0000000000 > --- a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S > +++ /dev/null > @@ -1,3 +0,0 @@ > -#define USE_AS_STPCPY > -#define STRCPY __stpcpy_ssse3 > -#include "strcpy-ssse3.S" > diff --git a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S > deleted file mode 100644 > index 14ed16f6b5..0000000000 > --- a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S > 
+++ /dev/null > @@ -1,4 +0,0 @@ > -#define USE_AS_STPCPY > -#define USE_AS_STRNCPY > -#define STRCPY __stpncpy_ssse3 > -#include "strcpy-ssse3.S" > diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S > deleted file mode 100644 > index f617a535cf..0000000000 > --- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S > +++ /dev/null > @@ -1,3550 +0,0 @@ > -/* strcpy with SSSE3 > - Copyright (C) 2011-2022 Free Software Foundation, Inc. > - This file is part of the GNU C Library. > - > - The GNU C Library is free software; you can redistribute it and/or > - modify it under the terms of the GNU Lesser General Public > - License as published by the Free Software Foundation; either > - version 2.1 of the License, or (at your option) any later version. > - > - The GNU C Library is distributed in the hope that it will be useful, > - but WITHOUT ANY WARRANTY; without even the implied warranty of > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > - Lesser General Public License for more details. > - > - You should have received a copy of the GNU Lesser General Public > - License along with the GNU C Library; if not, see > - <https://www.gnu.org/licenses/>. 
*/ > - > -#if IS_IN (libc) > - > -# ifndef USE_AS_STRCAT > -# include <sysdep.h> > - > -# ifndef STRCPY > -# define STRCPY __strcpy_ssse3 > -# endif > - > - .section .text.ssse3,"ax",@progbits > -ENTRY (STRCPY) > - > - mov %rsi, %rcx > -# ifdef USE_AS_STRNCPY > - mov %RDX_LP, %R8_LP > -# endif > - mov %rdi, %rdx > -# ifdef USE_AS_STRNCPY > - test %R8_LP, %R8_LP > - jz L(Exit0) > - cmp $8, %R8_LP > - jbe L(StrncpyExit8Bytes) > -# endif > - cmpb $0, (%rcx) > - jz L(Exit1) > - cmpb $0, 1(%rcx) > - jz L(Exit2) > - cmpb $0, 2(%rcx) > - jz L(Exit3) > - cmpb $0, 3(%rcx) > - jz L(Exit4) > - cmpb $0, 4(%rcx) > - jz L(Exit5) > - cmpb $0, 5(%rcx) > - jz L(Exit6) > - cmpb $0, 6(%rcx) > - jz L(Exit7) > - cmpb $0, 7(%rcx) > - jz L(Exit8) > -# ifdef USE_AS_STRNCPY > - cmp $16, %r8 > - jb L(StrncpyExit15Bytes) > -# endif > - cmpb $0, 8(%rcx) > - jz L(Exit9) > - cmpb $0, 9(%rcx) > - jz L(Exit10) > - cmpb $0, 10(%rcx) > - jz L(Exit11) > - cmpb $0, 11(%rcx) > - jz L(Exit12) > - cmpb $0, 12(%rcx) > - jz L(Exit13) > - cmpb $0, 13(%rcx) > - jz L(Exit14) > - cmpb $0, 14(%rcx) > - jz L(Exit15) > -# ifdef USE_AS_STRNCPY > - cmp $16, %r8 > - je L(Exit16) > -# endif > - cmpb $0, 15(%rcx) > - jz L(Exit16) > -# endif > - > -# ifdef USE_AS_STRNCPY > - mov %rcx, %rsi > - sub $16, %r8 > - and $0xf, %rsi > - > -/* add 16 bytes rcx_offset to r8 */ > - > - add %rsi, %r8 > -# endif > - lea 16(%rcx), %rsi > - and $-16, %rsi > - pxor %xmm0, %xmm0 > - mov (%rcx), %r9 > - mov %r9, (%rdx) > - pcmpeqb (%rsi), %xmm0 > - mov 8(%rcx), %r9 > - mov %r9, 8(%rdx) > - > -/* convert byte mask in xmm0 to bit mask */ > - > - pmovmskb %xmm0, %rax > - sub %rcx, %rsi > - > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > -# endif > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - mov %rdx, %rax > - lea 16(%rdx), %rdx > - and $-16, %rdx > - sub %rdx, %rax > - > -# ifdef USE_AS_STRNCPY > - add %rax, %rsi > - lea -1(%rsi), %rsi > - and $1<<31, %esi > - test %rsi, %rsi > - jnz 
L(ContinueCopy) > - lea 16(%r8), %r8 > - > -L(ContinueCopy): > -# endif > - sub %rax, %rcx > - mov %rcx, %rax > - and $0xf, %rax > - mov $0, %rsi > - > -/* case: rcx_offset == rdx_offset */ > - > - jz L(Align16Both) > - > - cmp $8, %rax > - jae L(ShlHigh8) > - cmp $1, %rax > - je L(Shl1) > - cmp $2, %rax > - je L(Shl2) > - cmp $3, %rax > - je L(Shl3) > - cmp $4, %rax > - je L(Shl4) > - cmp $5, %rax > - je L(Shl5) > - cmp $6, %rax > - je L(Shl6) > - jmp L(Shl7) > - > -L(ShlHigh8): > - je L(Shl8) > - cmp $9, %rax > - je L(Shl9) > - cmp $10, %rax > - je L(Shl10) > - cmp $11, %rax > - je L(Shl11) > - cmp $12, %rax > - je L(Shl12) > - cmp $13, %rax > - je L(Shl13) > - cmp $14, %rax > - je L(Shl14) > - jmp L(Shl15) > - > -L(Align16Both): > - movaps (%rcx), %xmm1 > - movaps 16(%rcx), %xmm2 > - movaps %xmm1, (%rdx) > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > -# endif > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - movaps 16(%rcx, %rsi), %xmm3 > - movaps %xmm2, (%rdx, %rsi) > - pcmpeqb %xmm3, %xmm0 > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > -# endif > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - movaps 16(%rcx, %rsi), %xmm4 > - movaps %xmm3, (%rdx, %rsi) > - pcmpeqb %xmm4, %xmm0 > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > -# endif > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - movaps 16(%rcx, %rsi), %xmm1 > - movaps %xmm4, (%rdx, %rsi) > - pcmpeqb %xmm1, %xmm0 > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > -# endif > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - movaps 16(%rcx, %rsi), %xmm2 > - movaps %xmm1, (%rdx, %rsi) > - pcmpeqb %xmm2, %xmm0 > - 
pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > -# endif > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - movaps 16(%rcx, %rsi), %xmm3 > - movaps %xmm2, (%rdx, %rsi) > - pcmpeqb %xmm3, %xmm0 > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > -# endif > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - movaps %xmm3, (%rdx, %rsi) > - mov %rcx, %rax > - lea 16(%rcx, %rsi), %rcx > - and $-0x40, %rcx > - sub %rcx, %rax > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - lea 112(%r8, %rax), %r8 > -# endif > - mov $-0x40, %rsi > - > - .p2align 4 > -L(Aligned64Loop): > - movaps (%rcx), %xmm2 > - movaps %xmm2, %xmm4 > - movaps 16(%rcx), %xmm5 > - movaps 32(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 48(%rcx), %xmm7 > - pminub %xmm5, %xmm2 > - pminub %xmm7, %xmm3 > - pminub %xmm2, %xmm3 > - pcmpeqb %xmm0, %xmm3 > - pmovmskb %xmm3, %rax > - lea 64(%rdx), %rdx > - lea 64(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeaveCase2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Aligned64Leave) > - movaps %xmm4, -64(%rdx) > - movaps %xmm5, -48(%rdx) > - movaps %xmm6, -32(%rdx) > - movaps %xmm7, -16(%rdx) > - jmp L(Aligned64Loop) > - > -L(Aligned64Leave): > -# ifdef USE_AS_STRNCPY > - lea 48(%r8), %r8 > -# endif > - pcmpeqb %xmm4, %xmm0 > - pmovmskb %xmm0, %rax > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - pcmpeqb %xmm5, %xmm0 > -# ifdef USE_AS_STRNCPY > - lea -16(%r8), %r8 > -# endif > - pmovmskb %xmm0, %rax > - movaps %xmm4, -64(%rdx) > - test %rax, %rax > - lea 16(%rsi), %rsi > - jnz L(CopyFrom1To16Bytes) > - > - pcmpeqb %xmm6, %xmm0 > -# ifdef USE_AS_STRNCPY > - lea -16(%r8), %r8 > -# endif > - pmovmskb %xmm0, %rax > - movaps %xmm5, -48(%rdx) > - test %rax, %rax > - lea 16(%rsi), %rsi > - jnz L(CopyFrom1To16Bytes) > - > - movaps %xmm6, -32(%rdx) > - pcmpeqb 
%xmm7, %xmm0 > -# ifdef USE_AS_STRNCPY > - lea -16(%r8), %r8 > -# endif > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl1): > - movaps -1(%rcx), %xmm1 > - movaps 15(%rcx), %xmm2 > -L(Shl1Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit1Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl1LoopExit) > - > - palignr $1, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 31(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit1Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl1LoopExit) > - > - palignr $1, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 31(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit1Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl1LoopExit) > - > - palignr $1, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 31(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit1Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl1LoopExit) > - > - palignr $1, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 31(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -15(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -1(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl1LoopStart): > - movaps 15(%rcx), %xmm2 > - movaps 31(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 47(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 63(%rcx), %xmm5 > - pminub %xmm2, %xmm6 
> - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $1, %xmm4, %xmm5 > - test %rax, %rax > - palignr $1, %xmm3, %xmm4 > - jnz L(Shl1Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave1) > -# endif > - palignr $1, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $1, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl1LoopStart) > - > -L(Shl1LoopExit): > - movdqu -1(%rcx), %xmm1 > - mov $15, %rsi > - movdqu %xmm1, -1(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl2): > - movaps -2(%rcx), %xmm1 > - movaps 14(%rcx), %xmm2 > -L(Shl2Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit2Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl2LoopExit) > - > - palignr $2, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 30(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit2Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl2LoopExit) > - > - palignr $2, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 30(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit2Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl2LoopExit) > - > - palignr $2, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 30(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit2Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl2LoopExit) 
> - > - palignr $2, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 30(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -14(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -2(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl2LoopStart): > - movaps 14(%rcx), %xmm2 > - movaps 30(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 46(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 62(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $2, %xmm4, %xmm5 > - test %rax, %rax > - palignr $2, %xmm3, %xmm4 > - jnz L(Shl2Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave2) > -# endif > - palignr $2, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $2, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl2LoopStart) > - > -L(Shl2LoopExit): > - movdqu -2(%rcx), %xmm1 > - mov $14, %rsi > - movdqu %xmm1, -2(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl3): > - movaps -3(%rcx), %xmm1 > - movaps 13(%rcx), %xmm2 > -L(Shl3Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit3Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl3LoopExit) > - > - palignr $3, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 29(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit3Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl3LoopExit) > - > - palignr $3, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 29(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 
16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit3Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl3LoopExit) > - > - palignr $3, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 29(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit3Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl3LoopExit) > - > - palignr $3, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 29(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -13(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -3(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl3LoopStart): > - movaps 13(%rcx), %xmm2 > - movaps 29(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 45(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 61(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $3, %xmm4, %xmm5 > - test %rax, %rax > - palignr $3, %xmm3, %xmm4 > - jnz L(Shl3Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave3) > -# endif > - palignr $3, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $3, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl3LoopStart) > - > -L(Shl3LoopExit): > - movdqu -3(%rcx), %xmm1 > - mov $13, %rsi > - movdqu %xmm1, -3(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl4): > - movaps -4(%rcx), %xmm1 > - movaps 12(%rcx), %xmm2 > -L(Shl4Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe 
L(StrncpyExit4Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl4LoopExit) > - > - palignr $4, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 28(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit4Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl4LoopExit) > - > - palignr $4, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 28(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit4Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl4LoopExit) > - > - palignr $4, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 28(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit4Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl4LoopExit) > - > - palignr $4, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 28(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -12(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -4(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl4LoopStart): > - movaps 12(%rcx), %xmm2 > - movaps 28(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 44(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 60(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $4, %xmm4, %xmm5 > - test %rax, %rax > - palignr $4, %xmm3, %xmm4 > - jnz L(Shl4Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave4) > -# endif > - palignr $4, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $4, %xmm1, 
%xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl4LoopStart) > - > -L(Shl4LoopExit): > - movdqu -4(%rcx), %xmm1 > - mov $12, %rsi > - movdqu %xmm1, -4(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl5): > - movaps -5(%rcx), %xmm1 > - movaps 11(%rcx), %xmm2 > -L(Shl5Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit5Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl5LoopExit) > - > - palignr $5, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 27(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit5Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl5LoopExit) > - > - palignr $5, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 27(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit5Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl5LoopExit) > - > - palignr $5, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 27(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit5Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl5LoopExit) > - > - palignr $5, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 27(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -11(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -5(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl5LoopStart): > - movaps 
11(%rcx), %xmm2 > - movaps 27(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 43(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 59(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $5, %xmm4, %xmm5 > - test %rax, %rax > - palignr $5, %xmm3, %xmm4 > - jnz L(Shl5Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave5) > -# endif > - palignr $5, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $5, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl5LoopStart) > - > -L(Shl5LoopExit): > - movdqu -5(%rcx), %xmm1 > - mov $11, %rsi > - movdqu %xmm1, -5(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl6): > - movaps -6(%rcx), %xmm1 > - movaps 10(%rcx), %xmm2 > -L(Shl6Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit6Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl6LoopExit) > - > - palignr $6, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 26(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit6Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl6LoopExit) > - > - palignr $6, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 26(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit6Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl6LoopExit) > - > - palignr $6, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 26(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb 
%xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit6Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl6LoopExit) > - > - palignr $6, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 26(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -10(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -6(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl6LoopStart): > - movaps 10(%rcx), %xmm2 > - movaps 26(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 42(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 58(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $6, %xmm4, %xmm5 > - test %rax, %rax > - palignr $6, %xmm3, %xmm4 > - jnz L(Shl6Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave6) > -# endif > - palignr $6, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $6, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl6LoopStart) > - > -L(Shl6LoopExit): > - mov (%rcx), %r9 > - mov 6(%rcx), %esi > - mov %r9, (%rdx) > - mov %esi, 6(%rdx) > - mov $10, %rsi > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl7): > - movaps -7(%rcx), %xmm1 > - movaps 9(%rcx), %xmm2 > -L(Shl7Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit7Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl7LoopExit) > - > - palignr $7, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 25(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe 
L(StrncpyExit7Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl7LoopExit) > - > - palignr $7, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 25(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit7Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl7LoopExit) > - > - palignr $7, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 25(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit7Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl7LoopExit) > - > - palignr $7, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 25(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -9(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -7(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl7LoopStart): > - movaps 9(%rcx), %xmm2 > - movaps 25(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 41(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 57(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $7, %xmm4, %xmm5 > - test %rax, %rax > - palignr $7, %xmm3, %xmm4 > - jnz L(Shl7Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave7) > -# endif > - palignr $7, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $7, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl7LoopStart) > - > -L(Shl7LoopExit): > - mov (%rcx), %r9 > - mov 5(%rcx), %esi > - mov %r9, (%rdx) > - mov %esi, 5(%rdx) > - mov $9, %rsi > - jmp L(CopyFrom1To16Bytes) > - > - 
.p2align 4 > -L(Shl8): > - movaps -8(%rcx), %xmm1 > - movaps 8(%rcx), %xmm2 > -L(Shl8Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit8Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl8LoopExit) > - > - palignr $8, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 24(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit8Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl8LoopExit) > - > - palignr $8, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 24(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit8Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl8LoopExit) > - > - palignr $8, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 24(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit8Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl8LoopExit) > - > - palignr $8, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 24(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -8(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -8(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl8LoopStart): > - movaps 8(%rcx), %xmm2 > - movaps 24(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 40(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 56(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $8, %xmm4, %xmm5 > - test 
%rax, %rax > - palignr $8, %xmm3, %xmm4 > - jnz L(Shl8Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave8) > -# endif > - palignr $8, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $8, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl8LoopStart) > - > -L(Shl8LoopExit): > - mov (%rcx), %r9 > - mov $8, %rsi > - mov %r9, (%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl9): > - movaps -9(%rcx), %xmm1 > - movaps 7(%rcx), %xmm2 > -L(Shl9Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit9Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl9LoopExit) > - > - palignr $9, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 23(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit9Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl9LoopExit) > - > - palignr $9, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 23(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit9Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl9LoopExit) > - > - palignr $9, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 23(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit9Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl9LoopExit) > - > - palignr $9, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 23(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea 
-7(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -9(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl9LoopStart): > - movaps 7(%rcx), %xmm2 > - movaps 23(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 39(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 55(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $9, %xmm4, %xmm5 > - test %rax, %rax > - palignr $9, %xmm3, %xmm4 > - jnz L(Shl9Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave9) > -# endif > - palignr $9, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $9, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl9LoopStart) > - > -L(Shl9LoopExit): > - mov -1(%rcx), %r9 > - mov $7, %rsi > - mov %r9, -1(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl10): > - movaps -10(%rcx), %xmm1 > - movaps 6(%rcx), %xmm2 > -L(Shl10Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit10Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl10LoopExit) > - > - palignr $10, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 22(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit10Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl10LoopExit) > - > - palignr $10, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 22(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit10Case2OrCase3) > -# endif > - 
test %rax, %rax > - jnz L(Shl10LoopExit) > - > - palignr $10, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 22(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit10Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl10LoopExit) > - > - palignr $10, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 22(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -6(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -10(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl10LoopStart): > - movaps 6(%rcx), %xmm2 > - movaps 22(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 38(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 54(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $10, %xmm4, %xmm5 > - test %rax, %rax > - palignr $10, %xmm3, %xmm4 > - jnz L(Shl10Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave10) > -# endif > - palignr $10, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $10, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl10LoopStart) > - > -L(Shl10LoopExit): > - mov -2(%rcx), %r9 > - mov $6, %rsi > - mov %r9, -2(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl11): > - movaps -11(%rcx), %xmm1 > - movaps 5(%rcx), %xmm2 > -L(Shl11Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit11Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl11LoopExit) > - > - palignr $11, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 21(%rcx), %xmm2 > - > - pcmpeqb 
%xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit11Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl11LoopExit) > - > - palignr $11, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 21(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit11Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl11LoopExit) > - > - palignr $11, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 21(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit11Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl11LoopExit) > - > - palignr $11, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 21(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -5(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -11(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl11LoopStart): > - movaps 5(%rcx), %xmm2 > - movaps 21(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 37(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 53(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $11, %xmm4, %xmm5 > - test %rax, %rax > - palignr $11, %xmm3, %xmm4 > - jnz L(Shl11Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave11) > -# endif > - palignr $11, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $11, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp 
L(Shl11LoopStart) > - > -L(Shl11LoopExit): > - mov -3(%rcx), %r9 > - mov $5, %rsi > - mov %r9, -3(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl12): > - movaps -12(%rcx), %xmm1 > - movaps 4(%rcx), %xmm2 > -L(Shl12Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit12Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl12LoopExit) > - > - palignr $12, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 20(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit12Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl12LoopExit) > - > - palignr $12, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 20(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit12Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl12LoopExit) > - > - palignr $12, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 20(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit12Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl12LoopExit) > - > - palignr $12, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 20(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -4(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -12(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl12LoopStart): > - movaps 4(%rcx), %xmm2 > - movaps 20(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 36(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 52(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > 
- pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $12, %xmm4, %xmm5 > - test %rax, %rax > - palignr $12, %xmm3, %xmm4 > - jnz L(Shl12Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave12) > -# endif > - palignr $12, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $12, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl12LoopStart) > - > -L(Shl12LoopExit): > - mov (%rcx), %r9d > - mov $4, %rsi > - mov %r9d, (%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl13): > - movaps -13(%rcx), %xmm1 > - movaps 3(%rcx), %xmm2 > -L(Shl13Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit13Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl13LoopExit) > - > - palignr $13, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 19(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit13Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl13LoopExit) > - > - palignr $13, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 19(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit13Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl13LoopExit) > - > - palignr $13, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 19(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit13Case2OrCase3) > -# endif > - test %rax, %rax > - jnz 
L(Shl13LoopExit) > - > - palignr $13, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 19(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -3(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -13(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl13LoopStart): > - movaps 3(%rcx), %xmm2 > - movaps 19(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 35(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 51(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $13, %xmm4, %xmm5 > - test %rax, %rax > - palignr $13, %xmm3, %xmm4 > - jnz L(Shl13Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave13) > -# endif > - palignr $13, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $13, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl13LoopStart) > - > -L(Shl13LoopExit): > - mov -1(%rcx), %r9d > - mov $3, %rsi > - mov %r9d, -1(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl14): > - movaps -14(%rcx), %xmm1 > - movaps 2(%rcx), %xmm2 > -L(Shl14Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit14Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl14LoopExit) > - > - palignr $14, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 18(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit14Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl14LoopExit) > - > - palignr $14, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 18(%rcx), %xmm2 > - > - pcmpeqb 
%xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit14Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl14LoopExit) > - > - palignr $14, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 18(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit14Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl14LoopExit) > - > - palignr $14, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 18(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -2(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -14(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl14LoopStart): > - movaps 2(%rcx), %xmm2 > - movaps 18(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 34(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 50(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $14, %xmm4, %xmm5 > - test %rax, %rax > - palignr $14, %xmm3, %xmm4 > - jnz L(Shl14Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave14) > -# endif > - palignr $14, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $14, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl14LoopStart) > - > -L(Shl14LoopExit): > - mov -2(%rcx), %r9d > - mov $2, %rsi > - mov %r9d, -2(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl15): > - movaps -15(%rcx), %xmm1 > - movaps 1(%rcx), %xmm2 > -L(Shl15Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub 
$16, %r8 > - jbe L(StrncpyExit15Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl15LoopExit) > - > - palignr $15, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 17(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit15Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl15LoopExit) > - > - palignr $15, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 17(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit15Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl15LoopExit) > - > - palignr $15, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 17(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit15Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl15LoopExit) > - > - palignr $15, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 17(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -1(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -15(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl15LoopStart): > - movaps 1(%rcx), %xmm2 > - movaps 17(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 33(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 49(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $15, %xmm4, %xmm5 > - test %rax, %rax > - palignr $15, %xmm3, %xmm4 > - jnz L(Shl15Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave15) > -# endif > - palignr $15, %xmm2, %xmm3 > - lea 
64(%rcx), %rcx > - palignr $15, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl15LoopStart) > - > -L(Shl15LoopExit): > - mov -3(%rcx), %r9d > - mov $1, %rsi > - mov %r9d, -3(%rdx) > -# ifdef USE_AS_STRCAT > - jmp L(CopyFrom1To16Bytes) > -# endif > - > -# ifndef USE_AS_STRCAT > - > - .p2align 4 > -L(CopyFrom1To16Bytes): > -# ifdef USE_AS_STRNCPY > - add $16, %r8 > -# endif > - add %rsi, %rdx > - add %rsi, %rcx > - > - test %al, %al > - jz L(ExitHigh) > - test $0x01, %al > - jnz L(Exit1) > - test $0x02, %al > - jnz L(Exit2) > - test $0x04, %al > - jnz L(Exit3) > - test $0x08, %al > - jnz L(Exit4) > - test $0x10, %al > - jnz L(Exit5) > - test $0x20, %al > - jnz L(Exit6) > - test $0x40, %al > - jnz L(Exit7) > - > - .p2align 4 > -L(Exit8): > - mov (%rcx), %rax > - mov %rax, (%rdx) > -# ifdef USE_AS_STPCPY > - lea 7(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $8, %r8 > - lea 8(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(ExitHigh): > - test $0x01, %ah > - jnz L(Exit9) > - test $0x02, %ah > - jnz L(Exit10) > - test $0x04, %ah > - jnz L(Exit11) > - test $0x08, %ah > - jnz L(Exit12) > - test $0x10, %ah > - jnz L(Exit13) > - test $0x20, %ah > - jnz L(Exit14) > - test $0x40, %ah > - jnz L(Exit15) > - > - .p2align 4 > -L(Exit16): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 8(%rcx), %rax > - mov %rax, 8(%rdx) > -# ifdef USE_AS_STPCPY > - lea 15(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - lea 16(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > -# ifdef USE_AS_STRNCPY > - > - .p2align 4 > -L(CopyFrom1To16BytesCase2): > - add $16, %r8 > 
- add %rsi, %rcx > - lea (%rsi, %rdx), %rsi > - lea -9(%r8), %rdx > - and $1<<7, %dh > - or %al, %dh > - test %dh, %dh > - lea (%rsi), %rdx > - jz L(ExitHighCase2) > - > - cmp $1, %r8 > - je L(Exit1) > - test $0x01, %al > - jnz L(Exit1) > - cmp $2, %r8 > - je L(Exit2) > - test $0x02, %al > - jnz L(Exit2) > - cmp $3, %r8 > - je L(Exit3) > - test $0x04, %al > - jnz L(Exit3) > - cmp $4, %r8 > - je L(Exit4) > - test $0x08, %al > - jnz L(Exit4) > - cmp $5, %r8 > - je L(Exit5) > - test $0x10, %al > - jnz L(Exit5) > - cmp $6, %r8 > - je L(Exit6) > - test $0x20, %al > - jnz L(Exit6) > - cmp $7, %r8 > - je L(Exit7) > - test $0x40, %al > - jnz L(Exit7) > - jmp L(Exit8) > - > - .p2align 4 > -L(ExitHighCase2): > - cmp $9, %r8 > - je L(Exit9) > - test $0x01, %ah > - jnz L(Exit9) > - cmp $10, %r8 > - je L(Exit10) > - test $0x02, %ah > - jnz L(Exit10) > - cmp $11, %r8 > - je L(Exit11) > - test $0x04, %ah > - jnz L(Exit11) > - cmp $12, %r8 > - je L(Exit12) > - test $0x8, %ah > - jnz L(Exit12) > - cmp $13, %r8 > - je L(Exit13) > - test $0x10, %ah > - jnz L(Exit13) > - cmp $14, %r8 > - je L(Exit14) > - test $0x20, %ah > - jnz L(Exit14) > - cmp $15, %r8 > - je L(Exit15) > - test $0x40, %ah > - jnz L(Exit15) > - jmp L(Exit16) > - > -L(CopyFrom1To16BytesCase2OrCase3): > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - > - .p2align 4 > -L(CopyFrom1To16BytesCase3): > - add $16, %r8 > - add %rsi, %rdx > - add %rsi, %rcx > - > - cmp $16, %r8 > - je L(Exit16) > - cmp $8, %r8 > - je L(Exit8) > - jg L(More8Case3) > - cmp $4, %r8 > - je L(Exit4) > - jg L(More4Case3) > - cmp $2, %r8 > - jl L(Exit1) > - je L(Exit2) > - jg L(Exit3) > -L(More8Case3): /* but less than 16 */ > - cmp $12, %r8 > - je L(Exit12) > - jl L(Less12Case3) > - cmp $14, %r8 > - jl L(Exit13) > - je L(Exit14) > - jg L(Exit15) > -L(More4Case3): /* but less than 8 */ > - cmp $6, %r8 > - jl L(Exit5) > - je L(Exit6) > - jg L(Exit7) > -L(Less12Case3): /* but more than 8 */ > - cmp $10, %r8 > - jl L(Exit9) > - je L(Exit10) > - 
jg L(Exit11) > -# endif > - > - .p2align 4 > -L(Exit1): > - movb (%rcx), %al > - movb %al, (%rdx) > -# ifdef USE_AS_STPCPY > - lea (%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $1, %r8 > - lea 1(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit2): > - movw (%rcx), %ax > - movw %ax, (%rdx) > -# ifdef USE_AS_STPCPY > - lea 1(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $2, %r8 > - lea 2(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit3): > - movw (%rcx), %ax > - movw %ax, (%rdx) > - movb 2(%rcx), %al > - movb %al, 2(%rdx) > -# ifdef USE_AS_STPCPY > - lea 2(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $3, %r8 > - lea 3(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit4): > - movl (%rcx), %eax > - movl %eax, (%rdx) > -# ifdef USE_AS_STPCPY > - lea 3(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $4, %r8 > - lea 4(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit5): > - movl (%rcx), %eax > - movl %eax, (%rdx) > - movb 4(%rcx), %al > - movb %al, 4(%rdx) > -# ifdef USE_AS_STPCPY > - lea 4(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $5, %r8 > - lea 5(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit6): > - movl (%rcx), %eax > - movl %eax, (%rdx) > - movw 4(%rcx), %ax > - movw %ax, 
4(%rdx) > -# ifdef USE_AS_STPCPY > - lea 5(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $6, %r8 > - lea 6(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit7): > - movl (%rcx), %eax > - movl %eax, (%rdx) > - movl 3(%rcx), %eax > - movl %eax, 3(%rdx) > -# ifdef USE_AS_STPCPY > - lea 6(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $7, %r8 > - lea 7(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit9): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 5(%rcx), %eax > - mov %eax, 5(%rdx) > -# ifdef USE_AS_STPCPY > - lea 8(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $9, %r8 > - lea 9(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit10): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 6(%rcx), %eax > - mov %eax, 6(%rdx) > -# ifdef USE_AS_STPCPY > - lea 9(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $10, %r8 > - lea 10(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit11): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 7(%rcx), %eax > - mov %eax, 7(%rdx) > -# ifdef USE_AS_STPCPY > - lea 10(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $11, %r8 > - lea 11(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit12): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 8(%rcx), %eax > - mov 
%eax, 8(%rdx) > -# ifdef USE_AS_STPCPY > - lea 11(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $12, %r8 > - lea 12(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit13): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 5(%rcx), %rax > - mov %rax, 5(%rdx) > -# ifdef USE_AS_STPCPY > - lea 12(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $13, %r8 > - lea 13(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit14): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 6(%rcx), %rax > - mov %rax, 6(%rdx) > -# ifdef USE_AS_STPCPY > - lea 13(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $14, %r8 > - lea 14(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit15): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 7(%rcx), %rax > - mov %rax, 7(%rdx) > -# ifdef USE_AS_STPCPY > - lea 14(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $15, %r8 > - lea 15(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > -# ifdef USE_AS_STRNCPY > - .p2align 4 > -L(Fill0): > - ret > - > - .p2align 4 > -L(Fill1): > - movb %dl, (%rcx) > - ret > - > - .p2align 4 > -L(Fill2): > - movw %dx, (%rcx) > - ret > - > - .p2align 4 > -L(Fill3): > - movw %dx, (%rcx) > - movb %dl, 2(%rcx) > - ret > - > - .p2align 4 > -L(Fill4): > - movl %edx, (%rcx) > - ret > - > - .p2align 4 > -L(Fill5): > - movl %edx, (%rcx) > - movb %dl, 4(%rcx) > - ret > - > - .p2align 4 > -L(Fill6): > - movl %edx, (%rcx) > - movw %dx, 4(%rcx) > - 
ret > - > - .p2align 4 > -L(Fill7): > - movl %edx, (%rcx) > - movl %edx, 3(%rcx) > - ret > - > - .p2align 4 > -L(Fill8): > - mov %rdx, (%rcx) > - ret > - > - .p2align 4 > -L(Fill9): > - mov %rdx, (%rcx) > - movb %dl, 8(%rcx) > - ret > - > - .p2align 4 > -L(Fill10): > - mov %rdx, (%rcx) > - movw %dx, 8(%rcx) > - ret > - > - .p2align 4 > -L(Fill11): > - mov %rdx, (%rcx) > - movl %edx, 7(%rcx) > - ret > - > - .p2align 4 > -L(Fill12): > - mov %rdx, (%rcx) > - movl %edx, 8(%rcx) > - ret > - > - .p2align 4 > -L(Fill13): > - mov %rdx, (%rcx) > - mov %rdx, 5(%rcx) > - ret > - > - .p2align 4 > -L(Fill14): > - mov %rdx, (%rcx) > - mov %rdx, 6(%rcx) > - ret > - > - .p2align 4 > -L(Fill15): > - mov %rdx, (%rcx) > - mov %rdx, 7(%rcx) > - ret > - > - .p2align 4 > -L(Fill16): > - mov %rdx, (%rcx) > - mov %rdx, 8(%rcx) > - ret > - > - .p2align 4 > -L(StrncpyFillExit1): > - lea 16(%r8), %r8 > -L(FillFrom1To16Bytes): > - test %r8, %r8 > - jz L(Fill0) > - cmp $16, %r8 > - je L(Fill16) > - cmp $8, %r8 > - je L(Fill8) > - jg L(FillMore8) > - cmp $4, %r8 > - je L(Fill4) > - jg L(FillMore4) > - cmp $2, %r8 > - jl L(Fill1) > - je L(Fill2) > - jg L(Fill3) > -L(FillMore8): /* but less than 16 */ > - cmp $12, %r8 > - je L(Fill12) > - jl L(FillLess12) > - cmp $14, %r8 > - jl L(Fill13) > - je L(Fill14) > - jg L(Fill15) > -L(FillMore4): /* but less than 8 */ > - cmp $6, %r8 > - jl L(Fill5) > - je L(Fill6) > - jg L(Fill7) > -L(FillLess12): /* but more than 8 */ > - cmp $10, %r8 > - jl L(Fill9) > - je L(Fill10) > - jmp L(Fill11) > - > - .p2align 4 > -L(StrncpyFillTailWithZero1): > - xor %rdx, %rdx > - sub $16, %r8 > - jbe L(StrncpyFillExit1) > - > - pxor %xmm0, %xmm0 > - mov %rdx, (%rcx) > - mov %rdx, 8(%rcx) > - > - lea 16(%rcx), %rcx > - > - mov %rcx, %rdx > - and $0xf, %rdx > - sub %rdx, %rcx > - add %rdx, %r8 > - xor %rdx, %rdx > - sub $64, %r8 > - jb L(StrncpyFillLess64) > - > -L(StrncpyFillLoopMovdqa): > - movdqa %xmm0, (%rcx) > - movdqa %xmm0, 16(%rcx) > - movdqa %xmm0, 32(%rcx) > - movdqa 
%xmm0, 48(%rcx) > - lea 64(%rcx), %rcx > - sub $64, %r8 > - jae L(StrncpyFillLoopMovdqa) > - > -L(StrncpyFillLess64): > - add $32, %r8 > - jl L(StrncpyFillLess32) > - movdqa %xmm0, (%rcx) > - movdqa %xmm0, 16(%rcx) > - lea 32(%rcx), %rcx > - sub $16, %r8 > - jl L(StrncpyFillExit1) > - movdqa %xmm0, (%rcx) > - lea 16(%rcx), %rcx > - jmp L(FillFrom1To16Bytes) > - > -L(StrncpyFillLess32): > - add $16, %r8 > - jl L(StrncpyFillExit1) > - movdqa %xmm0, (%rcx) > - lea 16(%rcx), %rcx > - jmp L(FillFrom1To16Bytes) > - > - .p2align 4 > -L(Exit0): > - mov %rdx, %rax > - ret > - > - .p2align 4 > -L(StrncpyExit15Bytes): > - cmp $9, %r8 > - je L(Exit9) > - cmpb $0, 8(%rcx) > - jz L(Exit9) > - cmp $10, %r8 > - je L(Exit10) > - cmpb $0, 9(%rcx) > - jz L(Exit10) > - cmp $11, %r8 > - je L(Exit11) > - cmpb $0, 10(%rcx) > - jz L(Exit11) > - cmp $12, %r8 > - je L(Exit12) > - cmpb $0, 11(%rcx) > - jz L(Exit12) > - cmp $13, %r8 > - je L(Exit13) > - cmpb $0, 12(%rcx) > - jz L(Exit13) > - cmp $14, %r8 > - je L(Exit14) > - cmpb $0, 13(%rcx) > - jz L(Exit14) > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 7(%rcx), %rax > - mov %rax, 7(%rdx) > -# ifdef USE_AS_STPCPY > - lea 14(%rdx), %rax > - cmpb $1, (%rax) > - sbb $-1, %rax > -# else > - mov %rdi, %rax > -# endif > - ret > - > - .p2align 4 > -L(StrncpyExit8Bytes): > - cmp $1, %r8 > - je L(Exit1) > - cmpb $0, (%rcx) > - jz L(Exit1) > - cmp $2, %r8 > - je L(Exit2) > - cmpb $0, 1(%rcx) > - jz L(Exit2) > - cmp $3, %r8 > - je L(Exit3) > - cmpb $0, 2(%rcx) > - jz L(Exit3) > - cmp $4, %r8 > - je L(Exit4) > - cmpb $0, 3(%rcx) > - jz L(Exit4) > - cmp $5, %r8 > - je L(Exit5) > - cmpb $0, 4(%rcx) > - jz L(Exit5) > - cmp $6, %r8 > - je L(Exit6) > - cmpb $0, 5(%rcx) > - jz L(Exit6) > - cmp $7, %r8 > - je L(Exit7) > - cmpb $0, 6(%rcx) > - jz L(Exit7) > - mov (%rcx), %rax > - mov %rax, (%rdx) > -# ifdef USE_AS_STPCPY > - lea 7(%rdx), %rax > - cmpb $1, (%rax) > - sbb $-1, %rax > -# else > - mov %rdi, %rax > -# endif > - ret > - > -# endif > -# endif > - 
> -# ifdef USE_AS_STRNCPY > - .p2align 4 > -L(StrncpyLeaveCase2OrCase3): > - test %rax, %rax > - jnz L(Aligned64LeaveCase2) > - > -L(Aligned64LeaveCase3): > - lea 64(%r8), %r8 > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase3) > - movaps %xmm4, -64(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase3) > - movaps %xmm5, -48(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase3) > - movaps %xmm6, -32(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - jmp L(CopyFrom1To16BytesCase3) > - > -L(Aligned64LeaveCase2): > - pcmpeqb %xmm4, %xmm0 > - pmovmskb %xmm0, %rax > - add $48, %r8 > - jle L(CopyFrom1To16BytesCase2OrCase3) > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - pcmpeqb %xmm5, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm4, -64(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - pcmpeqb %xmm6, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm5, -48(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - pcmpeqb %xmm7, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm6, -32(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - jmp L(CopyFrom1To16BytesCase2) > -/*--------------------------------------------------*/ > - .p2align 4 > -L(StrncpyExit1Case2OrCase3): > - movdqu -1(%rcx), %xmm0 > - movdqu %xmm0, -1(%rdx) > - mov $15, %rsi > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit2Case2OrCase3): > - movdqu -2(%rcx), %xmm0 > - movdqu %xmm0, -2(%rdx) > - mov $14, %rsi > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit3Case2OrCase3): > - movdqu -3(%rcx), %xmm0 > - movdqu %xmm0, -3(%rdx) > - mov $13, %rsi > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp 
L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit4Case2OrCase3): > - movdqu -4(%rcx), %xmm0 > - movdqu %xmm0, -4(%rdx) > - mov $12, %rsi > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit5Case2OrCase3): > - movdqu -5(%rcx), %xmm0 > - movdqu %xmm0, -5(%rdx) > - mov $11, %rsi > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit6Case2OrCase3): > - mov (%rcx), %rsi > - mov 6(%rcx), %r9d > - mov %r9d, 6(%rdx) > - mov %rsi, (%rdx) > - test %rax, %rax > - mov $10, %rsi > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit7Case2OrCase3): > - mov (%rcx), %rsi > - mov 5(%rcx), %r9d > - mov %r9d, 5(%rdx) > - mov %rsi, (%rdx) > - test %rax, %rax > - mov $9, %rsi > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit8Case2OrCase3): > - mov (%rcx), %r9 > - mov $8, %rsi > - mov %r9, (%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit9Case2OrCase3): > - mov -1(%rcx), %r9 > - mov $7, %rsi > - mov %r9, -1(%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit10Case2OrCase3): > - mov -2(%rcx), %r9 > - mov $6, %rsi > - mov %r9, -2(%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit11Case2OrCase3): > - mov -3(%rcx), %r9 > - mov $5, %rsi > - mov %r9, -3(%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit12Case2OrCase3): > - mov (%rcx), %r9d > - mov $4, %rsi > - mov %r9d, (%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > 
-L(StrncpyExit13Case2OrCase3): > - mov -1(%rcx), %r9d > - mov $3, %rsi > - mov %r9d, -1(%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit14Case2OrCase3): > - mov -2(%rcx), %r9d > - mov $2, %rsi > - mov %r9d, -2(%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit15Case2OrCase3): > - mov -3(%rcx), %r9d > - mov $1, %rsi > - mov %r9d, -3(%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave1): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit1) > - palignr $1, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 31(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit1) > - palignr $1, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit1) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit1) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit1): > - lea 15(%rdx, %rsi), %rdx > - lea 15(%rcx, %rsi), %rcx > - mov -15(%rcx), %rsi > - mov -8(%rcx), %rax > - mov %rsi, -15(%rdx) > - mov %rax, -8(%rdx) > - xor %rsi, %rsi > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave2): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit2) > - palignr $2, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 30(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit2) > - palignr $2, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit2) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit2) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit2): > - lea 14(%rdx, %rsi), %rdx > - lea 14(%rcx, %rsi), %rcx > - mov -14(%rcx), %rsi > - 
mov -8(%rcx), %rax > - mov %rsi, -14(%rdx) > - mov %rax, -8(%rdx) > - xor %rsi, %rsi > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave3): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit3) > - palignr $3, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 29(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit3) > - palignr $3, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit3) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit3) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit3): > - lea 13(%rdx, %rsi), %rdx > - lea 13(%rcx, %rsi), %rcx > - mov -13(%rcx), %rsi > - mov -8(%rcx), %rax > - mov %rsi, -13(%rdx) > - mov %rax, -8(%rdx) > - xor %rsi, %rsi > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave4): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit4) > - palignr $4, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 28(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit4) > - palignr $4, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit4) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit4) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit4): > - lea 12(%rdx, %rsi), %rdx > - lea 12(%rcx, %rsi), %rcx > - mov -12(%rcx), %rsi > - mov -4(%rcx), %eax > - mov %rsi, -12(%rdx) > - mov %eax, -4(%rdx) > - xor %rsi, %rsi > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave5): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit5) > - palignr $5, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 27(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit5) > - palignr $5, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe 
L(StrncpyExit5) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit5) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit5): > - lea 11(%rdx, %rsi), %rdx > - lea 11(%rcx, %rsi), %rcx > - mov -11(%rcx), %rsi > - mov -4(%rcx), %eax > - mov %rsi, -11(%rdx) > - mov %eax, -4(%rdx) > - xor %rsi, %rsi > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave6): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit6) > - palignr $6, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 26(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit6) > - palignr $6, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit6) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit6) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit6): > - lea 10(%rdx, %rsi), %rdx > - lea 10(%rcx, %rsi), %rcx > - mov -10(%rcx), %rsi > - movw -2(%rcx), %ax > - mov %rsi, -10(%rdx) > - movw %ax, -2(%rdx) > - xor %rsi, %rsi > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave7): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit7) > - palignr $7, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 25(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit7) > - palignr $7, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit7) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit7) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit7): > - lea 9(%rdx, %rsi), %rdx > - lea 9(%rcx, %rsi), %rcx > - mov -9(%rcx), %rsi > - movb -1(%rcx), %ah > - mov %rsi, -9(%rdx) > - movb %ah, -1(%rdx) > - xor %rsi, %rsi > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave8): > - movaps %xmm2, %xmm3 > - 
add $48, %r8 > - jle L(StrncpyExit8) > - palignr $8, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 24(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit8) > - palignr $8, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit8) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit8) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit8): > - lea 8(%rdx, %rsi), %rdx > - lea 8(%rcx, %rsi), %rcx > - mov -8(%rcx), %rax > - xor %rsi, %rsi > - mov %rax, -8(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave9): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit9) > - palignr $9, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 23(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit9) > - palignr $9, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit9) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit9) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit9): > - lea 7(%rdx, %rsi), %rdx > - lea 7(%rcx, %rsi), %rcx > - mov -8(%rcx), %rax > - xor %rsi, %rsi > - mov %rax, -8(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave10): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit10) > - palignr $10, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 22(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit10) > - palignr $10, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit10) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit10) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit10): > - lea 6(%rdx, %rsi), %rdx > - lea 6(%rcx, %rsi), %rcx > - mov -8(%rcx), 
%rax > - xor %rsi, %rsi > - mov %rax, -8(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave11): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit11) > - palignr $11, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 21(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit11) > - palignr $11, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit11) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit11) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit11): > - lea 5(%rdx, %rsi), %rdx > - lea 5(%rcx, %rsi), %rcx > - mov -8(%rcx), %rax > - xor %rsi, %rsi > - mov %rax, -8(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave12): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit12) > - palignr $12, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 20(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit12) > - palignr $12, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit12) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit12) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit12): > - lea 4(%rdx, %rsi), %rdx > - lea 4(%rcx, %rsi), %rcx > - mov -4(%rcx), %eax > - xor %rsi, %rsi > - mov %eax, -4(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave13): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit13) > - palignr $13, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 19(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit13) > - palignr $13, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit13) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit13) > - 
movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit13): > - lea 3(%rdx, %rsi), %rdx > - lea 3(%rcx, %rsi), %rcx > - mov -4(%rcx), %eax > - xor %rsi, %rsi > - mov %eax, -4(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave14): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit14) > - palignr $14, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 18(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit14) > - palignr $14, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit14) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit14) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit14): > - lea 2(%rdx, %rsi), %rdx > - lea 2(%rcx, %rsi), %rcx > - movw -2(%rcx), %ax > - xor %rsi, %rsi > - movw %ax, -2(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave15): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit15) > - palignr $15, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 17(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit15) > - palignr $15, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit15) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit15) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit15): > - lea 1(%rdx, %rsi), %rdx > - lea 1(%rcx, %rsi), %rcx > - movb -1(%rcx), %ah > - xor %rsi, %rsi > - movb %ah, -1(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > -# endif > -# ifndef USE_AS_STRCAT > -END (STRCPY) > -# endif > -#endif > diff --git a/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/sysdeps/x86_64/multiarch/strncpy-ssse3.S > deleted file mode 100644 > index bf82ee447d..0000000000 > --- a/sysdeps/x86_64/multiarch/strncpy-ssse3.S > +++ /dev/null > @@ 
-1,3 +0,0 @@ > -#define USE_AS_STRNCPY > -#define STRCPY __strncpy_ssse3 > -#include "strcpy-ssse3.S" > -- > 2.25.1 > LGTM. Reviewed-by: H.J. Lu <hjl.tools@gmail.com> Thanks.
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index a2ebc06c5f..292353bad7 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -42,13 +42,11 @@ sysdep_routines += \ stpcpy-evex \ stpcpy-sse2 \ stpcpy-sse2-unaligned \ - stpcpy-ssse3 \ stpncpy-avx2 \ stpncpy-avx2-rtm \ stpncpy-c \ stpncpy-evex \ stpncpy-sse2-unaligned \ - stpncpy-ssse3 \ strcasecmp_l-avx2 \ strcasecmp_l-avx2-rtm \ strcasecmp_l-evex \ @@ -79,7 +77,6 @@ sysdep_routines += \ strcpy-evex \ strcpy-sse2 \ strcpy-sse2-unaligned \ - strcpy-ssse3 \ strcspn-c \ strcspn-sse2 \ strlen-avx2 \ @@ -106,7 +103,6 @@ sysdep_routines += \ strncpy-c \ strncpy-evex \ strncpy-sse2-unaligned \ - strncpy-ssse3 \ strnlen-avx2 \ strnlen-avx2-rtm \ strnlen-evex \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 4133ed7e43..505b8002e0 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -389,8 +389,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/stpncpy.c. */ IFUNC_IMPL (i, name, stpncpy, - IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (SSSE3), - __stpncpy_ssse3) IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2), __stpncpy_avx2) IFUNC_IMPL_ADD (array, i, stpncpy, @@ -407,8 +405,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/stpcpy.c. 
*/ IFUNC_IMPL (i, name, stpcpy, - IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (SSSE3), - __stpcpy_ssse3) IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2), __stpcpy_avx2) IFUNC_IMPL_ADD (array, i, stpcpy, @@ -557,8 +553,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __strcpy_evex) - IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3), - __strcpy_ssse3) IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2)) @@ -634,8 +628,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __strncpy_evex) - IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3), - __strncpy_ssse3) IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2)) diff --git a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S deleted file mode 100644 index d971c2da38..0000000000 --- a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S +++ /dev/null @@ -1,3 +0,0 @@ -#define USE_AS_STPCPY -#define STRCPY __stpcpy_ssse3 -#include "strcpy-ssse3.S" diff --git a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S deleted file mode 100644 index 14ed16f6b5..0000000000 --- a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_STPCPY -#define USE_AS_STRNCPY -#define STRCPY __stpncpy_ssse3 -#include "strcpy-ssse3.S" diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S deleted file mode 100644 index f617a535cf..0000000000 --- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S +++ /dev/null @@ -1,3550 +0,0 @@ -/* strcpy with SSSE3 - Copyright (C) 2011-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#if IS_IN (libc) - -# ifndef USE_AS_STRCAT -# include <sysdep.h> - -# ifndef STRCPY -# define STRCPY __strcpy_ssse3 -# endif - - .section .text.ssse3,"ax",@progbits -ENTRY (STRCPY) - - mov %rsi, %rcx -# ifdef USE_AS_STRNCPY - mov %RDX_LP, %R8_LP -# endif - mov %rdi, %rdx -# ifdef USE_AS_STRNCPY - test %R8_LP, %R8_LP - jz L(Exit0) - cmp $8, %R8_LP - jbe L(StrncpyExit8Bytes) -# endif - cmpb $0, (%rcx) - jz L(Exit1) - cmpb $0, 1(%rcx) - jz L(Exit2) - cmpb $0, 2(%rcx) - jz L(Exit3) - cmpb $0, 3(%rcx) - jz L(Exit4) - cmpb $0, 4(%rcx) - jz L(Exit5) - cmpb $0, 5(%rcx) - jz L(Exit6) - cmpb $0, 6(%rcx) - jz L(Exit7) - cmpb $0, 7(%rcx) - jz L(Exit8) -# ifdef USE_AS_STRNCPY - cmp $16, %r8 - jb L(StrncpyExit15Bytes) -# endif - cmpb $0, 8(%rcx) - jz L(Exit9) - cmpb $0, 9(%rcx) - jz L(Exit10) - cmpb $0, 10(%rcx) - jz L(Exit11) - cmpb $0, 11(%rcx) - jz L(Exit12) - cmpb $0, 12(%rcx) - jz L(Exit13) - cmpb $0, 13(%rcx) - jz L(Exit14) - cmpb $0, 14(%rcx) - jz L(Exit15) -# ifdef USE_AS_STRNCPY - cmp $16, %r8 - je L(Exit16) -# endif - cmpb $0, 15(%rcx) - jz L(Exit16) -# endif - -# ifdef USE_AS_STRNCPY - mov %rcx, %rsi - sub $16, %r8 - and $0xf, %rsi - -/* add 16 bytes rcx_offset to r8 */ - - add %rsi, %r8 -# endif - lea 16(%rcx), %rsi - and $-16, %rsi - pxor %xmm0, %xmm0 - mov (%rcx), %r9 - mov %r9, (%rdx) - pcmpeqb 
(%rsi), %xmm0 - mov 8(%rcx), %r9 - mov %r9, 8(%rdx) - -/* convert byte mask in xmm0 to bit mask */ - - pmovmskb %xmm0, %rax - sub %rcx, %rsi - -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - mov %rdx, %rax - lea 16(%rdx), %rdx - and $-16, %rdx - sub %rdx, %rax - -# ifdef USE_AS_STRNCPY - add %rax, %rsi - lea -1(%rsi), %rsi - and $1<<31, %esi - test %rsi, %rsi - jnz L(ContinueCopy) - lea 16(%r8), %r8 - -L(ContinueCopy): -# endif - sub %rax, %rcx - mov %rcx, %rax - and $0xf, %rax - mov $0, %rsi - -/* case: rcx_offset == rdx_offset */ - - jz L(Align16Both) - - cmp $8, %rax - jae L(ShlHigh8) - cmp $1, %rax - je L(Shl1) - cmp $2, %rax - je L(Shl2) - cmp $3, %rax - je L(Shl3) - cmp $4, %rax - je L(Shl4) - cmp $5, %rax - je L(Shl5) - cmp $6, %rax - je L(Shl6) - jmp L(Shl7) - -L(ShlHigh8): - je L(Shl8) - cmp $9, %rax - je L(Shl9) - cmp $10, %rax - je L(Shl10) - cmp $11, %rax - je L(Shl11) - cmp $12, %rax - je L(Shl12) - cmp $13, %rax - je L(Shl13) - cmp $14, %rax - je L(Shl14) - jmp L(Shl15) - -L(Align16Both): - movaps (%rcx), %xmm1 - movaps 16(%rcx), %xmm2 - movaps %xmm1, (%rdx) - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm3 - movaps %xmm2, (%rdx, %rsi) - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm4 - movaps %xmm3, (%rdx, %rsi) - pcmpeqb %xmm4, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm1 - movaps %xmm4, (%rdx, %rsi) - pcmpeqb %xmm1, %xmm0 - 
pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm2 - movaps %xmm1, (%rdx, %rsi) - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm3 - movaps %xmm2, (%rdx, %rsi) - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps %xmm3, (%rdx, %rsi) - mov %rcx, %rax - lea 16(%rcx, %rsi), %rcx - and $-0x40, %rcx - sub %rcx, %rax - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - lea 112(%r8, %rax), %r8 -# endif - mov $-0x40, %rsi - - .p2align 4 -L(Aligned64Loop): - movaps (%rcx), %xmm2 - movaps %xmm2, %xmm4 - movaps 16(%rcx), %xmm5 - movaps 32(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 48(%rcx), %xmm7 - pminub %xmm5, %xmm2 - pminub %xmm7, %xmm3 - pminub %xmm2, %xmm3 - pcmpeqb %xmm0, %xmm3 - pmovmskb %xmm3, %rax - lea 64(%rdx), %rdx - lea 64(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeaveCase2OrCase3) -# endif - test %rax, %rax - jnz L(Aligned64Leave) - movaps %xmm4, -64(%rdx) - movaps %xmm5, -48(%rdx) - movaps %xmm6, -32(%rdx) - movaps %xmm7, -16(%rdx) - jmp L(Aligned64Loop) - -L(Aligned64Leave): -# ifdef USE_AS_STRNCPY - lea 48(%r8), %r8 -# endif - pcmpeqb %xmm4, %xmm0 - pmovmskb %xmm0, %rax - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - pcmpeqb %xmm5, %xmm0 -# ifdef USE_AS_STRNCPY - lea -16(%r8), %r8 -# endif - pmovmskb %xmm0, %rax - movaps %xmm4, -64(%rdx) - test %rax, %rax - lea 16(%rsi), %rsi - jnz L(CopyFrom1To16Bytes) - - pcmpeqb %xmm6, %xmm0 -# ifdef USE_AS_STRNCPY - lea -16(%r8), %r8 -# endif - pmovmskb %xmm0, %rax - movaps %xmm5, -48(%rdx) - test %rax, %rax - lea 
16(%rsi), %rsi - jnz L(CopyFrom1To16Bytes) - - movaps %xmm6, -32(%rdx) - pcmpeqb %xmm7, %xmm0 -# ifdef USE_AS_STRNCPY - lea -16(%r8), %r8 -# endif - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl1): - movaps -1(%rcx), %xmm1 - movaps 15(%rcx), %xmm2 -L(Shl1Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit1Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl1LoopExit) - - palignr $1, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 31(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit1Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl1LoopExit) - - palignr $1, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 31(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit1Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl1LoopExit) - - palignr $1, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 31(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit1Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl1LoopExit) - - palignr $1, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 31(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -15(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -1(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl1LoopStart): - movaps 15(%rcx), %xmm2 - movaps 31(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 47(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 63(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, 
%xmm7 - palignr $1, %xmm4, %xmm5 - test %rax, %rax - palignr $1, %xmm3, %xmm4 - jnz L(Shl1Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave1) -# endif - palignr $1, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $1, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl1LoopStart) - -L(Shl1LoopExit): - movdqu -1(%rcx), %xmm1 - mov $15, %rsi - movdqu %xmm1, -1(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl2): - movaps -2(%rcx), %xmm1 - movaps 14(%rcx), %xmm2 -L(Shl2Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit2Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl2LoopExit) - - palignr $2, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 30(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit2Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl2LoopExit) - - palignr $2, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 30(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit2Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl2LoopExit) - - palignr $2, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 30(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit2Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl2LoopExit) - - palignr $2, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 30(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -14(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -2(%rcx), %xmm1 - -/* 64 bytes loop */ - 
.p2align 4 -L(Shl2LoopStart): - movaps 14(%rcx), %xmm2 - movaps 30(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 46(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 62(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $2, %xmm4, %xmm5 - test %rax, %rax - palignr $2, %xmm3, %xmm4 - jnz L(Shl2Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave2) -# endif - palignr $2, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $2, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl2LoopStart) - -L(Shl2LoopExit): - movdqu -2(%rcx), %xmm1 - mov $14, %rsi - movdqu %xmm1, -2(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl3): - movaps -3(%rcx), %xmm1 - movaps 13(%rcx), %xmm2 -L(Shl3Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit3Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl3LoopExit) - - palignr $3, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 29(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit3Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl3LoopExit) - - palignr $3, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 29(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit3Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl3LoopExit) - - palignr $3, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 29(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit3Case2OrCase3) -# endif - test %rax, %rax - jnz 
L(Shl3LoopExit) - - palignr $3, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 29(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -13(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -3(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl3LoopStart): - movaps 13(%rcx), %xmm2 - movaps 29(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 45(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 61(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $3, %xmm4, %xmm5 - test %rax, %rax - palignr $3, %xmm3, %xmm4 - jnz L(Shl3Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave3) -# endif - palignr $3, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $3, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl3LoopStart) - -L(Shl3LoopExit): - movdqu -3(%rcx), %xmm1 - mov $13, %rsi - movdqu %xmm1, -3(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl4): - movaps -4(%rcx), %xmm1 - movaps 12(%rcx), %xmm2 -L(Shl4Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit4Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl4LoopExit) - - palignr $4, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 28(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit4Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl4LoopExit) - - palignr $4, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 28(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit4Case2OrCase3) -# endif - test 
%rax, %rax - jnz L(Shl4LoopExit) - - palignr $4, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 28(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit4Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl4LoopExit) - - palignr $4, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 28(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -12(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -4(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl4LoopStart): - movaps 12(%rcx), %xmm2 - movaps 28(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 44(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 60(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $4, %xmm4, %xmm5 - test %rax, %rax - palignr $4, %xmm3, %xmm4 - jnz L(Shl4Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave4) -# endif - palignr $4, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $4, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl4LoopStart) - -L(Shl4LoopExit): - movdqu -4(%rcx), %xmm1 - mov $12, %rsi - movdqu %xmm1, -4(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl5): - movaps -5(%rcx), %xmm1 - movaps 11(%rcx), %xmm2 -L(Shl5Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit5Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl5LoopExit) - - palignr $5, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 27(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit5Case2OrCase3) -# endif - test %rax, 
%rax - jnz L(Shl5LoopExit) - - palignr $5, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 27(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit5Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl5LoopExit) - - palignr $5, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 27(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit5Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl5LoopExit) - - palignr $5, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 27(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -11(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -5(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl5LoopStart): - movaps 11(%rcx), %xmm2 - movaps 27(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 43(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 59(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $5, %xmm4, %xmm5 - test %rax, %rax - palignr $5, %xmm3, %xmm4 - jnz L(Shl5Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave5) -# endif - palignr $5, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $5, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl5LoopStart) - -L(Shl5LoopExit): - movdqu -5(%rcx), %xmm1 - mov $11, %rsi - movdqu %xmm1, -5(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl6): - movaps -6(%rcx), %xmm1 - movaps 10(%rcx), %xmm2 -L(Shl6Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit6Case2OrCase3) -# endif - test %rax, %rax - 
jnz L(Shl6LoopExit) - - palignr $6, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 26(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit6Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl6LoopExit) - - palignr $6, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 26(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit6Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl6LoopExit) - - palignr $6, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 26(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit6Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl6LoopExit) - - palignr $6, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 26(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -10(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -6(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl6LoopStart): - movaps 10(%rcx), %xmm2 - movaps 26(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 42(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 58(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $6, %xmm4, %xmm5 - test %rax, %rax - palignr $6, %xmm3, %xmm4 - jnz L(Shl6Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave6) -# endif - palignr $6, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $6, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl6LoopStart) - -L(Shl6LoopExit): - mov (%rcx), %r9 - mov 6(%rcx), %esi - mov %r9, 
(%rdx) - mov %esi, 6(%rdx) - mov $10, %rsi - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl7): - movaps -7(%rcx), %xmm1 - movaps 9(%rcx), %xmm2 -L(Shl7Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit7Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl7LoopExit) - - palignr $7, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 25(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit7Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl7LoopExit) - - palignr $7, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 25(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit7Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl7LoopExit) - - palignr $7, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 25(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit7Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl7LoopExit) - - palignr $7, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 25(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -9(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -7(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl7LoopStart): - movaps 9(%rcx), %xmm2 - movaps 25(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 41(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 57(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $7, %xmm4, %xmm5 - test %rax, %rax - palignr $7, %xmm3, %xmm4 - jnz L(Shl7Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe 
L(StrncpyLeave7) -# endif - palignr $7, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $7, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl7LoopStart) - -L(Shl7LoopExit): - mov (%rcx), %r9 - mov 5(%rcx), %esi - mov %r9, (%rdx) - mov %esi, 5(%rdx) - mov $9, %rsi - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl8): - movaps -8(%rcx), %xmm1 - movaps 8(%rcx), %xmm2 -L(Shl8Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit8Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl8LoopExit) - - palignr $8, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 24(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit8Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl8LoopExit) - - palignr $8, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 24(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit8Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl8LoopExit) - - palignr $8, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 24(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit8Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl8LoopExit) - - palignr $8, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 24(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -8(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -8(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl8LoopStart): - movaps 8(%rcx), %xmm2 - movaps 24(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 40(%rcx), 
%xmm4 - movaps %xmm4, %xmm7 - movaps 56(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $8, %xmm4, %xmm5 - test %rax, %rax - palignr $8, %xmm3, %xmm4 - jnz L(Shl8Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave8) -# endif - palignr $8, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $8, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl8LoopStart) - -L(Shl8LoopExit): - mov (%rcx), %r9 - mov $8, %rsi - mov %r9, (%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl9): - movaps -9(%rcx), %xmm1 - movaps 7(%rcx), %xmm2 -L(Shl9Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit9Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl9LoopExit) - - palignr $9, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 23(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit9Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl9LoopExit) - - palignr $9, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 23(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit9Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl9LoopExit) - - palignr $9, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 23(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit9Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl9LoopExit) - - palignr $9, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 23(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and 
$-0x40, %rcx - sub %rcx, %rax - lea -7(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -9(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl9LoopStart): - movaps 7(%rcx), %xmm2 - movaps 23(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 39(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 55(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $9, %xmm4, %xmm5 - test %rax, %rax - palignr $9, %xmm3, %xmm4 - jnz L(Shl9Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave9) -# endif - palignr $9, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $9, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl9LoopStart) - -L(Shl9LoopExit): - mov -1(%rcx), %r9 - mov $7, %rsi - mov %r9, -1(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl10): - movaps -10(%rcx), %xmm1 - movaps 6(%rcx), %xmm2 -L(Shl10Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit10Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl10LoopExit) - - palignr $10, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 22(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit10Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl10LoopExit) - - palignr $10, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 22(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit10Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl10LoopExit) - - palignr $10, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 22(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 
16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit10Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl10LoopExit) - - palignr $10, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 22(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -6(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -10(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl10LoopStart): - movaps 6(%rcx), %xmm2 - movaps 22(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 38(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 54(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $10, %xmm4, %xmm5 - test %rax, %rax - palignr $10, %xmm3, %xmm4 - jnz L(Shl10Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave10) -# endif - palignr $10, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $10, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl10LoopStart) - -L(Shl10LoopExit): - mov -2(%rcx), %r9 - mov $6, %rsi - mov %r9, -2(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl11): - movaps -11(%rcx), %xmm1 - movaps 5(%rcx), %xmm2 -L(Shl11Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit11Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl11LoopExit) - - palignr $11, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 21(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit11Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl11LoopExit) - - palignr $11, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 21(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 
16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit11Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl11LoopExit) - - palignr $11, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 21(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit11Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl11LoopExit) - - palignr $11, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 21(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -5(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -11(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl11LoopStart): - movaps 5(%rcx), %xmm2 - movaps 21(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 37(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 53(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $11, %xmm4, %xmm5 - test %rax, %rax - palignr $11, %xmm3, %xmm4 - jnz L(Shl11Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave11) -# endif - palignr $11, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $11, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl11LoopStart) - -L(Shl11LoopExit): - mov -3(%rcx), %r9 - mov $5, %rsi - mov %r9, -3(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl12): - movaps -12(%rcx), %xmm1 - movaps 4(%rcx), %xmm2 -L(Shl12Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit12Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl12LoopExit) - - palignr $12, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 20(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 
16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit12Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl12LoopExit) - - palignr $12, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 20(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit12Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl12LoopExit) - - palignr $12, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 20(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit12Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl12LoopExit) - - palignr $12, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 20(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -4(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -12(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl12LoopStart): - movaps 4(%rcx), %xmm2 - movaps 20(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 36(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 52(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $12, %xmm4, %xmm5 - test %rax, %rax - palignr $12, %xmm3, %xmm4 - jnz L(Shl12Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave12) -# endif - palignr $12, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $12, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl12LoopStart) - -L(Shl12LoopExit): - mov (%rcx), %r9d - mov $4, %rsi - mov %r9d, (%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl13): - movaps -13(%rcx), %xmm1 - movaps 3(%rcx), %xmm2 
-L(Shl13Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit13Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl13LoopExit) - - palignr $13, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 19(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit13Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl13LoopExit) - - palignr $13, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 19(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit13Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl13LoopExit) - - palignr $13, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 19(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit13Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl13LoopExit) - - palignr $13, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 19(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -3(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -13(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl13LoopStart): - movaps 3(%rcx), %xmm2 - movaps 19(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 35(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 51(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $13, %xmm4, %xmm5 - test %rax, %rax - palignr $13, %xmm3, %xmm4 - jnz L(Shl13Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave13) -# endif - palignr $13, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $13, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps 
%xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl13LoopStart) - -L(Shl13LoopExit): - mov -1(%rcx), %r9d - mov $3, %rsi - mov %r9d, -1(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl14): - movaps -14(%rcx), %xmm1 - movaps 2(%rcx), %xmm2 -L(Shl14Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit14Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl14LoopExit) - - palignr $14, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 18(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit14Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl14LoopExit) - - palignr $14, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 18(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit14Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl14LoopExit) - - palignr $14, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 18(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit14Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl14LoopExit) - - palignr $14, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 18(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -2(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -14(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl14LoopStart): - movaps 2(%rcx), %xmm2 - movaps 18(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 34(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 50(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - 
pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $14, %xmm4, %xmm5 - test %rax, %rax - palignr $14, %xmm3, %xmm4 - jnz L(Shl14Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave14) -# endif - palignr $14, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $14, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl14LoopStart) - -L(Shl14LoopExit): - mov -2(%rcx), %r9d - mov $2, %rsi - mov %r9d, -2(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl15): - movaps -15(%rcx), %xmm1 - movaps 1(%rcx), %xmm2 -L(Shl15Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit15Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl15LoopExit) - - palignr $15, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 17(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit15Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl15LoopExit) - - palignr $15, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 17(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit15Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl15LoopExit) - - palignr $15, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 17(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit15Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl15LoopExit) - - palignr $15, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 17(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -1(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - 
movaps -15(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl15LoopStart): - movaps 1(%rcx), %xmm2 - movaps 17(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 33(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 49(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $15, %xmm4, %xmm5 - test %rax, %rax - palignr $15, %xmm3, %xmm4 - jnz L(Shl15Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave15) -# endif - palignr $15, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $15, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl15LoopStart) - -L(Shl15LoopExit): - mov -3(%rcx), %r9d - mov $1, %rsi - mov %r9d, -3(%rdx) -# ifdef USE_AS_STRCAT - jmp L(CopyFrom1To16Bytes) -# endif - -# ifndef USE_AS_STRCAT - - .p2align 4 -L(CopyFrom1To16Bytes): -# ifdef USE_AS_STRNCPY - add $16, %r8 -# endif - add %rsi, %rdx - add %rsi, %rcx - - test %al, %al - jz L(ExitHigh) - test $0x01, %al - jnz L(Exit1) - test $0x02, %al - jnz L(Exit2) - test $0x04, %al - jnz L(Exit3) - test $0x08, %al - jnz L(Exit4) - test $0x10, %al - jnz L(Exit5) - test $0x20, %al - jnz L(Exit6) - test $0x40, %al - jnz L(Exit7) - - .p2align 4 -L(Exit8): - mov (%rcx), %rax - mov %rax, (%rdx) -# ifdef USE_AS_STPCPY - lea 7(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $8, %r8 - lea 8(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(ExitHigh): - test $0x01, %ah - jnz L(Exit9) - test $0x02, %ah - jnz L(Exit10) - test $0x04, %ah - jnz L(Exit11) - test $0x08, %ah - jnz L(Exit12) - test $0x10, %ah - jnz L(Exit13) - test $0x20, %ah - jnz L(Exit14) - test $0x40, %ah - jnz L(Exit15) - - .p2align 4 -L(Exit16): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 8(%rcx), %rax - mov %rax, 
8(%rdx) -# ifdef USE_AS_STPCPY - lea 15(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $16, %r8 - lea 16(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - -# ifdef USE_AS_STRNCPY - - .p2align 4 -L(CopyFrom1To16BytesCase2): - add $16, %r8 - add %rsi, %rcx - lea (%rsi, %rdx), %rsi - lea -9(%r8), %rdx - and $1<<7, %dh - or %al, %dh - test %dh, %dh - lea (%rsi), %rdx - jz L(ExitHighCase2) - - cmp $1, %r8 - je L(Exit1) - test $0x01, %al - jnz L(Exit1) - cmp $2, %r8 - je L(Exit2) - test $0x02, %al - jnz L(Exit2) - cmp $3, %r8 - je L(Exit3) - test $0x04, %al - jnz L(Exit3) - cmp $4, %r8 - je L(Exit4) - test $0x08, %al - jnz L(Exit4) - cmp $5, %r8 - je L(Exit5) - test $0x10, %al - jnz L(Exit5) - cmp $6, %r8 - je L(Exit6) - test $0x20, %al - jnz L(Exit6) - cmp $7, %r8 - je L(Exit7) - test $0x40, %al - jnz L(Exit7) - jmp L(Exit8) - - .p2align 4 -L(ExitHighCase2): - cmp $9, %r8 - je L(Exit9) - test $0x01, %ah - jnz L(Exit9) - cmp $10, %r8 - je L(Exit10) - test $0x02, %ah - jnz L(Exit10) - cmp $11, %r8 - je L(Exit11) - test $0x04, %ah - jnz L(Exit11) - cmp $12, %r8 - je L(Exit12) - test $0x8, %ah - jnz L(Exit12) - cmp $13, %r8 - je L(Exit13) - test $0x10, %ah - jnz L(Exit13) - cmp $14, %r8 - je L(Exit14) - test $0x20, %ah - jnz L(Exit14) - cmp $15, %r8 - je L(Exit15) - test $0x40, %ah - jnz L(Exit15) - jmp L(Exit16) - -L(CopyFrom1To16BytesCase2OrCase3): - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - - .p2align 4 -L(CopyFrom1To16BytesCase3): - add $16, %r8 - add %rsi, %rdx - add %rsi, %rcx - - cmp $16, %r8 - je L(Exit16) - cmp $8, %r8 - je L(Exit8) - jg L(More8Case3) - cmp $4, %r8 - je L(Exit4) - jg L(More4Case3) - cmp $2, %r8 - jl L(Exit1) - je L(Exit2) - jg L(Exit3) -L(More8Case3): /* but less than 16 */ - cmp $12, %r8 - je L(Exit12) - jl L(Less12Case3) - cmp $14, %r8 - jl L(Exit13) - je L(Exit14) - jg L(Exit15) -L(More4Case3): /* but less than 8 */ - cmp $6, %r8 
- jl L(Exit5) - je L(Exit6) - jg L(Exit7) -L(Less12Case3): /* but more than 8 */ - cmp $10, %r8 - jl L(Exit9) - je L(Exit10) - jg L(Exit11) -# endif - - .p2align 4 -L(Exit1): - movb (%rcx), %al - movb %al, (%rdx) -# ifdef USE_AS_STPCPY - lea (%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $1, %r8 - lea 1(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit2): - movw (%rcx), %ax - movw %ax, (%rdx) -# ifdef USE_AS_STPCPY - lea 1(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $2, %r8 - lea 2(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit3): - movw (%rcx), %ax - movw %ax, (%rdx) - movb 2(%rcx), %al - movb %al, 2(%rdx) -# ifdef USE_AS_STPCPY - lea 2(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $3, %r8 - lea 3(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit4): - movl (%rcx), %eax - movl %eax, (%rdx) -# ifdef USE_AS_STPCPY - lea 3(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $4, %r8 - lea 4(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit5): - movl (%rcx), %eax - movl %eax, (%rdx) - movb 4(%rcx), %al - movb %al, 4(%rdx) -# ifdef USE_AS_STPCPY - lea 4(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $5, %r8 - lea 5(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit6): - movl (%rcx), %eax - movl %eax, (%rdx) - movw 4(%rcx), %ax - movw %ax, 4(%rdx) -# ifdef USE_AS_STPCPY - lea 5(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef 
USE_AS_STRNCPY - sub $6, %r8 - lea 6(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit7): - movl (%rcx), %eax - movl %eax, (%rdx) - movl 3(%rcx), %eax - movl %eax, 3(%rdx) -# ifdef USE_AS_STPCPY - lea 6(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $7, %r8 - lea 7(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit9): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 5(%rcx), %eax - mov %eax, 5(%rdx) -# ifdef USE_AS_STPCPY - lea 8(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $9, %r8 - lea 9(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit10): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 6(%rcx), %eax - mov %eax, 6(%rdx) -# ifdef USE_AS_STPCPY - lea 9(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $10, %r8 - lea 10(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit11): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 7(%rcx), %eax - mov %eax, 7(%rdx) -# ifdef USE_AS_STPCPY - lea 10(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $11, %r8 - lea 11(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit12): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 8(%rcx), %eax - mov %eax, 8(%rdx) -# ifdef USE_AS_STPCPY - lea 11(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $12, %r8 - lea 12(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit13): - mov (%rcx), %rax - 
mov %rax, (%rdx) - mov 5(%rcx), %rax - mov %rax, 5(%rdx) -# ifdef USE_AS_STPCPY - lea 12(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $13, %r8 - lea 13(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit14): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 6(%rcx), %rax - mov %rax, 6(%rdx) -# ifdef USE_AS_STPCPY - lea 13(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $14, %r8 - lea 14(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit15): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 7(%rcx), %rax - mov %rax, 7(%rdx) -# ifdef USE_AS_STPCPY - lea 14(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $15, %r8 - lea 15(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - -# ifdef USE_AS_STRNCPY - .p2align 4 -L(Fill0): - ret - - .p2align 4 -L(Fill1): - movb %dl, (%rcx) - ret - - .p2align 4 -L(Fill2): - movw %dx, (%rcx) - ret - - .p2align 4 -L(Fill3): - movw %dx, (%rcx) - movb %dl, 2(%rcx) - ret - - .p2align 4 -L(Fill4): - movl %edx, (%rcx) - ret - - .p2align 4 -L(Fill5): - movl %edx, (%rcx) - movb %dl, 4(%rcx) - ret - - .p2align 4 -L(Fill6): - movl %edx, (%rcx) - movw %dx, 4(%rcx) - ret - - .p2align 4 -L(Fill7): - movl %edx, (%rcx) - movl %edx, 3(%rcx) - ret - - .p2align 4 -L(Fill8): - mov %rdx, (%rcx) - ret - - .p2align 4 -L(Fill9): - mov %rdx, (%rcx) - movb %dl, 8(%rcx) - ret - - .p2align 4 -L(Fill10): - mov %rdx, (%rcx) - movw %dx, 8(%rcx) - ret - - .p2align 4 -L(Fill11): - mov %rdx, (%rcx) - movl %edx, 7(%rcx) - ret - - .p2align 4 -L(Fill12): - mov %rdx, (%rcx) - movl %edx, 8(%rcx) - ret - - .p2align 4 -L(Fill13): - mov %rdx, (%rcx) - mov %rdx, 5(%rcx) - ret - - .p2align 4 -L(Fill14): - mov %rdx, (%rcx) - mov %rdx, 
6(%rcx) - ret - - .p2align 4 -L(Fill15): - mov %rdx, (%rcx) - mov %rdx, 7(%rcx) - ret - - .p2align 4 -L(Fill16): - mov %rdx, (%rcx) - mov %rdx, 8(%rcx) - ret - - .p2align 4 -L(StrncpyFillExit1): - lea 16(%r8), %r8 -L(FillFrom1To16Bytes): - test %r8, %r8 - jz L(Fill0) - cmp $16, %r8 - je L(Fill16) - cmp $8, %r8 - je L(Fill8) - jg L(FillMore8) - cmp $4, %r8 - je L(Fill4) - jg L(FillMore4) - cmp $2, %r8 - jl L(Fill1) - je L(Fill2) - jg L(Fill3) -L(FillMore8): /* but less than 16 */ - cmp $12, %r8 - je L(Fill12) - jl L(FillLess12) - cmp $14, %r8 - jl L(Fill13) - je L(Fill14) - jg L(Fill15) -L(FillMore4): /* but less than 8 */ - cmp $6, %r8 - jl L(Fill5) - je L(Fill6) - jg L(Fill7) -L(FillLess12): /* but more than 8 */ - cmp $10, %r8 - jl L(Fill9) - je L(Fill10) - jmp L(Fill11) - - .p2align 4 -L(StrncpyFillTailWithZero1): - xor %rdx, %rdx - sub $16, %r8 - jbe L(StrncpyFillExit1) - - pxor %xmm0, %xmm0 - mov %rdx, (%rcx) - mov %rdx, 8(%rcx) - - lea 16(%rcx), %rcx - - mov %rcx, %rdx - and $0xf, %rdx - sub %rdx, %rcx - add %rdx, %r8 - xor %rdx, %rdx - sub $64, %r8 - jb L(StrncpyFillLess64) - -L(StrncpyFillLoopMovdqa): - movdqa %xmm0, (%rcx) - movdqa %xmm0, 16(%rcx) - movdqa %xmm0, 32(%rcx) - movdqa %xmm0, 48(%rcx) - lea 64(%rcx), %rcx - sub $64, %r8 - jae L(StrncpyFillLoopMovdqa) - -L(StrncpyFillLess64): - add $32, %r8 - jl L(StrncpyFillLess32) - movdqa %xmm0, (%rcx) - movdqa %xmm0, 16(%rcx) - lea 32(%rcx), %rcx - sub $16, %r8 - jl L(StrncpyFillExit1) - movdqa %xmm0, (%rcx) - lea 16(%rcx), %rcx - jmp L(FillFrom1To16Bytes) - -L(StrncpyFillLess32): - add $16, %r8 - jl L(StrncpyFillExit1) - movdqa %xmm0, (%rcx) - lea 16(%rcx), %rcx - jmp L(FillFrom1To16Bytes) - - .p2align 4 -L(Exit0): - mov %rdx, %rax - ret - - .p2align 4 -L(StrncpyExit15Bytes): - cmp $9, %r8 - je L(Exit9) - cmpb $0, 8(%rcx) - jz L(Exit9) - cmp $10, %r8 - je L(Exit10) - cmpb $0, 9(%rcx) - jz L(Exit10) - cmp $11, %r8 - je L(Exit11) - cmpb $0, 10(%rcx) - jz L(Exit11) - cmp $12, %r8 - je L(Exit12) - cmpb $0, 
11(%rcx) - jz L(Exit12) - cmp $13, %r8 - je L(Exit13) - cmpb $0, 12(%rcx) - jz L(Exit13) - cmp $14, %r8 - je L(Exit14) - cmpb $0, 13(%rcx) - jz L(Exit14) - mov (%rcx), %rax - mov %rax, (%rdx) - mov 7(%rcx), %rax - mov %rax, 7(%rdx) -# ifdef USE_AS_STPCPY - lea 14(%rdx), %rax - cmpb $1, (%rax) - sbb $-1, %rax -# else - mov %rdi, %rax -# endif - ret - - .p2align 4 -L(StrncpyExit8Bytes): - cmp $1, %r8 - je L(Exit1) - cmpb $0, (%rcx) - jz L(Exit1) - cmp $2, %r8 - je L(Exit2) - cmpb $0, 1(%rcx) - jz L(Exit2) - cmp $3, %r8 - je L(Exit3) - cmpb $0, 2(%rcx) - jz L(Exit3) - cmp $4, %r8 - je L(Exit4) - cmpb $0, 3(%rcx) - jz L(Exit4) - cmp $5, %r8 - je L(Exit5) - cmpb $0, 4(%rcx) - jz L(Exit5) - cmp $6, %r8 - je L(Exit6) - cmpb $0, 5(%rcx) - jz L(Exit6) - cmp $7, %r8 - je L(Exit7) - cmpb $0, 6(%rcx) - jz L(Exit7) - mov (%rcx), %rax - mov %rax, (%rdx) -# ifdef USE_AS_STPCPY - lea 7(%rdx), %rax - cmpb $1, (%rax) - sbb $-1, %rax -# else - mov %rdi, %rax -# endif - ret - -# endif -# endif - -# ifdef USE_AS_STRNCPY - .p2align 4 -L(StrncpyLeaveCase2OrCase3): - test %rax, %rax - jnz L(Aligned64LeaveCase2) - -L(Aligned64LeaveCase3): - lea 64(%r8), %r8 - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase3) - movaps %xmm4, -64(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase3) - movaps %xmm5, -48(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase3) - movaps %xmm6, -32(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - jmp L(CopyFrom1To16BytesCase3) - -L(Aligned64LeaveCase2): - pcmpeqb %xmm4, %xmm0 - pmovmskb %xmm0, %rax - add $48, %r8 - jle L(CopyFrom1To16BytesCase2OrCase3) - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - pcmpeqb %xmm5, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm4, -64(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - pcmpeqb %xmm6, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm5, -48(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe 
L(CopyFrom1To16BytesCase2OrCase3) - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - pcmpeqb %xmm7, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm6, -32(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - jmp L(CopyFrom1To16BytesCase2) -/*--------------------------------------------------*/ - .p2align 4 -L(StrncpyExit1Case2OrCase3): - movdqu -1(%rcx), %xmm0 - movdqu %xmm0, -1(%rdx) - mov $15, %rsi - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit2Case2OrCase3): - movdqu -2(%rcx), %xmm0 - movdqu %xmm0, -2(%rdx) - mov $14, %rsi - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit3Case2OrCase3): - movdqu -3(%rcx), %xmm0 - movdqu %xmm0, -3(%rdx) - mov $13, %rsi - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit4Case2OrCase3): - movdqu -4(%rcx), %xmm0 - movdqu %xmm0, -4(%rdx) - mov $12, %rsi - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit5Case2OrCase3): - movdqu -5(%rcx), %xmm0 - movdqu %xmm0, -5(%rdx) - mov $11, %rsi - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit6Case2OrCase3): - mov (%rcx), %rsi - mov 6(%rcx), %r9d - mov %r9d, 6(%rdx) - mov %rsi, (%rdx) - test %rax, %rax - mov $10, %rsi - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit7Case2OrCase3): - mov (%rcx), %rsi - mov 5(%rcx), %r9d - mov %r9d, 5(%rdx) - mov %rsi, (%rdx) - test %rax, %rax - mov $9, %rsi - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit8Case2OrCase3): - mov (%rcx), %r9 - mov $8, %rsi - mov %r9, (%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit9Case2OrCase3): - mov -1(%rcx), %r9 - mov $7, %rsi - mov %r9, -1(%rdx) - test %rax, %rax 
- jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit10Case2OrCase3): - mov -2(%rcx), %r9 - mov $6, %rsi - mov %r9, -2(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit11Case2OrCase3): - mov -3(%rcx), %r9 - mov $5, %rsi - mov %r9, -3(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit12Case2OrCase3): - mov (%rcx), %r9d - mov $4, %rsi - mov %r9d, (%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit13Case2OrCase3): - mov -1(%rcx), %r9d - mov $3, %rsi - mov %r9d, -1(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit14Case2OrCase3): - mov -2(%rcx), %r9d - mov $2, %rsi - mov %r9d, -2(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit15Case2OrCase3): - mov -3(%rcx), %r9d - mov $1, %rsi - mov %r9d, -3(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave1): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit1) - palignr $1, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 31(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit1) - palignr $1, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit1) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit1) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit1): - lea 15(%rdx, %rsi), %rdx - lea 15(%rcx, %rsi), %rcx - mov -15(%rcx), %rsi - mov -8(%rcx), %rax - mov %rsi, -15(%rdx) - mov %rax, -8(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave2): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit2) - palignr $2, %xmm1, %xmm2 - 
movaps %xmm2, (%rdx) - movaps 30(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit2) - palignr $2, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit2) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit2) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit2): - lea 14(%rdx, %rsi), %rdx - lea 14(%rcx, %rsi), %rcx - mov -14(%rcx), %rsi - mov -8(%rcx), %rax - mov %rsi, -14(%rdx) - mov %rax, -8(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave3): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit3) - palignr $3, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 29(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit3) - palignr $3, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit3) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit3) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit3): - lea 13(%rdx, %rsi), %rdx - lea 13(%rcx, %rsi), %rcx - mov -13(%rcx), %rsi - mov -8(%rcx), %rax - mov %rsi, -13(%rdx) - mov %rax, -8(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave4): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit4) - palignr $4, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 28(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit4) - palignr $4, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit4) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit4) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit4): - lea 12(%rdx, %rsi), %rdx - lea 12(%rcx, %rsi), %rcx - mov -12(%rcx), %rsi - mov -4(%rcx), %eax - mov %rsi, -12(%rdx) - mov %eax, -4(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave5): - 
movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit5) - palignr $5, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 27(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit5) - palignr $5, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit5) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit5) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit5): - lea 11(%rdx, %rsi), %rdx - lea 11(%rcx, %rsi), %rcx - mov -11(%rcx), %rsi - mov -4(%rcx), %eax - mov %rsi, -11(%rdx) - mov %eax, -4(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave6): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit6) - palignr $6, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 26(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit6) - palignr $6, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit6) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit6) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit6): - lea 10(%rdx, %rsi), %rdx - lea 10(%rcx, %rsi), %rcx - mov -10(%rcx), %rsi - movw -2(%rcx), %ax - mov %rsi, -10(%rdx) - movw %ax, -2(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave7): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit7) - palignr $7, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 25(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit7) - palignr $7, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit7) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit7) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit7): - lea 9(%rdx, %rsi), %rdx - lea 9(%rcx, %rsi), %rcx - mov -9(%rcx), %rsi - movb -1(%rcx), %ah - mov %rsi, -9(%rdx) - movb %ah, -1(%rdx) - 
xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave8): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit8) - palignr $8, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 24(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit8) - palignr $8, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit8) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit8) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit8): - lea 8(%rdx, %rsi), %rdx - lea 8(%rcx, %rsi), %rcx - mov -8(%rcx), %rax - xor %rsi, %rsi - mov %rax, -8(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave9): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit9) - palignr $9, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 23(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit9) - palignr $9, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit9) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit9) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit9): - lea 7(%rdx, %rsi), %rdx - lea 7(%rcx, %rsi), %rcx - mov -8(%rcx), %rax - xor %rsi, %rsi - mov %rax, -8(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave10): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit10) - palignr $10, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 22(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit10) - palignr $10, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit10) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit10) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit10): - lea 6(%rdx, %rsi), %rdx - lea 6(%rcx, %rsi), %rcx - mov -8(%rcx), %rax - xor %rsi, %rsi - mov %rax, -8(%rdx) - jmp 
L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave11): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit11) - palignr $11, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 21(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit11) - palignr $11, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit11) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit11) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit11): - lea 5(%rdx, %rsi), %rdx - lea 5(%rcx, %rsi), %rcx - mov -8(%rcx), %rax - xor %rsi, %rsi - mov %rax, -8(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave12): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit12) - palignr $12, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 20(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit12) - palignr $12, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit12) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit12) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit12): - lea 4(%rdx, %rsi), %rdx - lea 4(%rcx, %rsi), %rcx - mov -4(%rcx), %eax - xor %rsi, %rsi - mov %eax, -4(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave13): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit13) - palignr $13, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 19(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit13) - palignr $13, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit13) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit13) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit13): - lea 3(%rdx, %rsi), %rdx - lea 3(%rcx, %rsi), %rcx - mov -4(%rcx), %eax - xor %rsi, %rsi - mov %eax, -4(%rdx) - jmp L(CopyFrom1To16BytesCase3) - 
- .p2align 4 -L(StrncpyLeave14): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit14) - palignr $14, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 18(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit14) - palignr $14, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit14) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit14) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit14): - lea 2(%rdx, %rsi), %rdx - lea 2(%rcx, %rsi), %rcx - movw -2(%rcx), %ax - xor %rsi, %rsi - movw %ax, -2(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave15): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit15) - palignr $15, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 17(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit15) - palignr $15, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit15) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit15) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit15): - lea 1(%rdx, %rsi), %rdx - lea 1(%rcx, %rsi), %rcx - movb -1(%rcx), %ah - xor %rsi, %rsi - movb %ah, -1(%rdx) - jmp L(CopyFrom1To16BytesCase3) - -# endif -# ifndef USE_AS_STRCAT -END (STRCPY) -# endif -#endif diff --git a/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/sysdeps/x86_64/multiarch/strncpy-ssse3.S deleted file mode 100644 index bf82ee447d..0000000000 --- a/sysdeps/x86_64/multiarch/strncpy-ssse3.S +++ /dev/null @@ -1,3 +0,0 @@ -#define USE_AS_STRNCPY -#define STRCPY __strncpy_ssse3 -#include "strcpy-ssse3.S"