diff mbox series

[v1,6/6] x86: Remove str{p}{n}cpy-ssse3

Message ID 20220325183625.1170867-6-goldstein.w.n@gmail.com
State New
Headers show
Series [v1,1/6] x86: Remove {w}memcmp-ssse3 | expand

Commit Message

Noah Goldstein March 25, 2022, 6:36 p.m. UTC
With SSE2, SSE4.1, AVX2, and EVEX versions available, very few targets
prefer SSSE3. As a result it's no longer worth the code size cost.
---
 sysdeps/x86_64/multiarch/Makefile          |    4 -
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |    8 -
 sysdeps/x86_64/multiarch/stpcpy-ssse3.S    |    3 -
 sysdeps/x86_64/multiarch/stpncpy-ssse3.S   |    4 -
 sysdeps/x86_64/multiarch/strcpy-ssse3.S    | 3550 --------------------
 sysdeps/x86_64/multiarch/strncpy-ssse3.S   |    3 -
 6 files changed, 3572 deletions(-)
 delete mode 100644 sysdeps/x86_64/multiarch/stpcpy-ssse3.S
 delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-ssse3.S
 delete mode 100644 sysdeps/x86_64/multiarch/strcpy-ssse3.S
 delete mode 100644 sysdeps/x86_64/multiarch/strncpy-ssse3.S

Comments

H.J. Lu March 25, 2022, 7:57 p.m. UTC | #1
On Fri, Mar 25, 2022 at 11:36 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> With SSE2, SSE4.1, AVX2, and EVEX versions available, very few targets
> prefer SSSE3. As a result it's no longer worth the code size cost.
> ---
>  sysdeps/x86_64/multiarch/Makefile          |    4 -
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c |    8 -
>  sysdeps/x86_64/multiarch/stpcpy-ssse3.S    |    3 -
>  sysdeps/x86_64/multiarch/stpncpy-ssse3.S   |    4 -
>  sysdeps/x86_64/multiarch/strcpy-ssse3.S    | 3550 --------------------
>  sysdeps/x86_64/multiarch/strncpy-ssse3.S   |    3 -
>  6 files changed, 3572 deletions(-)
>  delete mode 100644 sysdeps/x86_64/multiarch/stpcpy-ssse3.S
>  delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-ssse3.S
>  delete mode 100644 sysdeps/x86_64/multiarch/strcpy-ssse3.S
>  delete mode 100644 sysdeps/x86_64/multiarch/strncpy-ssse3.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index a2ebc06c5f..292353bad7 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -42,13 +42,11 @@ sysdep_routines += \
>    stpcpy-evex \
>    stpcpy-sse2 \
>    stpcpy-sse2-unaligned \
> -  stpcpy-ssse3 \
>    stpncpy-avx2 \
>    stpncpy-avx2-rtm \
>    stpncpy-c \
>    stpncpy-evex \
>    stpncpy-sse2-unaligned \
> -  stpncpy-ssse3 \
>    strcasecmp_l-avx2 \
>    strcasecmp_l-avx2-rtm \
>    strcasecmp_l-evex \
> @@ -79,7 +77,6 @@ sysdep_routines += \
>    strcpy-evex \
>    strcpy-sse2 \
>    strcpy-sse2-unaligned \
> -  strcpy-ssse3 \
>    strcspn-c \
>    strcspn-sse2 \
>    strlen-avx2 \
> @@ -106,7 +103,6 @@ sysdep_routines += \
>    strncpy-c \
>    strncpy-evex \
>    strncpy-sse2-unaligned \
> -  strncpy-ssse3 \
>    strnlen-avx2 \
>    strnlen-avx2-rtm \
>    strnlen-evex \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 4133ed7e43..505b8002e0 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -389,8 +389,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
>    /* Support sysdeps/x86_64/multiarch/stpncpy.c.  */
>    IFUNC_IMPL (i, name, stpncpy,
> -             IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (SSSE3),
> -                             __stpncpy_ssse3)
>               IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2),
>                               __stpncpy_avx2)
>               IFUNC_IMPL_ADD (array, i, stpncpy,
> @@ -407,8 +405,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
>    /* Support sysdeps/x86_64/multiarch/stpcpy.c.  */
>    IFUNC_IMPL (i, name, stpcpy,
> -             IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (SSSE3),
> -                             __stpcpy_ssse3)
>               IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2),
>                               __stpcpy_avx2)
>               IFUNC_IMPL_ADD (array, i, stpcpy,
> @@ -557,8 +553,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                               (CPU_FEATURE_USABLE (AVX512VL)
>                                && CPU_FEATURE_USABLE (AVX512BW)),
>                               __strcpy_evex)
> -             IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3),
> -                             __strcpy_ssse3)
>               IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
>               IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2))
>
> @@ -634,8 +628,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                               (CPU_FEATURE_USABLE (AVX512VL)
>                                && CPU_FEATURE_USABLE (AVX512BW)),
>                               __strncpy_evex)
> -             IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3),
> -                             __strncpy_ssse3)
>               IFUNC_IMPL_ADD (array, i, strncpy, 1,
>                               __strncpy_sse2_unaligned)
>               IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2))
> diff --git a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
> deleted file mode 100644
> index d971c2da38..0000000000
> --- a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
> +++ /dev/null
> @@ -1,3 +0,0 @@
> -#define USE_AS_STPCPY
> -#define STRCPY __stpcpy_ssse3
> -#include "strcpy-ssse3.S"
> diff --git a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
> deleted file mode 100644
> index 14ed16f6b5..0000000000
> --- a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
> +++ /dev/null
> @@ -1,4 +0,0 @@
> -#define USE_AS_STPCPY
> -#define USE_AS_STRNCPY
> -#define STRCPY __stpncpy_ssse3
> -#include "strcpy-ssse3.S"
> diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
> deleted file mode 100644
> index f617a535cf..0000000000
> --- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
> +++ /dev/null
> @@ -1,3550 +0,0 @@
> -/* strcpy with SSSE3
> -   Copyright (C) 2011-2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#if IS_IN (libc)
> -
> -# ifndef USE_AS_STRCAT
> -#  include <sysdep.h>
> -
> -#  ifndef STRCPY
> -#   define STRCPY  __strcpy_ssse3
> -#  endif
> -
> -       .section .text.ssse3,"ax",@progbits
> -ENTRY (STRCPY)
> -
> -       mov     %rsi, %rcx
> -#  ifdef USE_AS_STRNCPY
> -       mov     %RDX_LP, %R8_LP
> -#  endif
> -       mov     %rdi, %rdx
> -#  ifdef USE_AS_STRNCPY
> -       test    %R8_LP, %R8_LP
> -       jz      L(Exit0)
> -       cmp     $8, %R8_LP
> -       jbe     L(StrncpyExit8Bytes)
> -# endif
> -       cmpb    $0, (%rcx)
> -       jz      L(Exit1)
> -       cmpb    $0, 1(%rcx)
> -       jz      L(Exit2)
> -       cmpb    $0, 2(%rcx)
> -       jz      L(Exit3)
> -       cmpb    $0, 3(%rcx)
> -       jz      L(Exit4)
> -       cmpb    $0, 4(%rcx)
> -       jz      L(Exit5)
> -       cmpb    $0, 5(%rcx)
> -       jz      L(Exit6)
> -       cmpb    $0, 6(%rcx)
> -       jz      L(Exit7)
> -       cmpb    $0, 7(%rcx)
> -       jz      L(Exit8)
> -# ifdef USE_AS_STRNCPY
> -       cmp     $16, %r8
> -       jb      L(StrncpyExit15Bytes)
> -# endif
> -       cmpb    $0, 8(%rcx)
> -       jz      L(Exit9)
> -       cmpb    $0, 9(%rcx)
> -       jz      L(Exit10)
> -       cmpb    $0, 10(%rcx)
> -       jz      L(Exit11)
> -       cmpb    $0, 11(%rcx)
> -       jz      L(Exit12)
> -       cmpb    $0, 12(%rcx)
> -       jz      L(Exit13)
> -       cmpb    $0, 13(%rcx)
> -       jz      L(Exit14)
> -       cmpb    $0, 14(%rcx)
> -       jz      L(Exit15)
> -# ifdef USE_AS_STRNCPY
> -       cmp     $16, %r8
> -       je      L(Exit16)
> -# endif
> -       cmpb    $0, 15(%rcx)
> -       jz      L(Exit16)
> -# endif
> -
> -# ifdef USE_AS_STRNCPY
> -       mov     %rcx, %rsi
> -       sub     $16, %r8
> -       and     $0xf, %rsi
> -
> -/* add 16 bytes rcx_offset to r8 */
> -
> -       add     %rsi, %r8
> -# endif
> -       lea     16(%rcx), %rsi
> -       and     $-16, %rsi
> -       pxor    %xmm0, %xmm0
> -       mov     (%rcx), %r9
> -       mov     %r9, (%rdx)
> -       pcmpeqb (%rsi), %xmm0
> -       mov     8(%rcx), %r9
> -       mov     %r9, 8(%rdx)
> -
> -/* convert byte mask in xmm0 to bit mask */
> -
> -       pmovmskb %xmm0, %rax
> -       sub     %rcx, %rsi
> -
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(CopyFrom1To16Bytes)
> -
> -       mov     %rdx, %rax
> -       lea     16(%rdx), %rdx
> -       and     $-16, %rdx
> -       sub     %rdx, %rax
> -
> -# ifdef USE_AS_STRNCPY
> -       add     %rax, %rsi
> -       lea     -1(%rsi), %rsi
> -       and     $1<<31, %esi
> -       test    %rsi, %rsi
> -       jnz     L(ContinueCopy)
> -       lea     16(%r8), %r8
> -
> -L(ContinueCopy):
> -# endif
> -       sub     %rax, %rcx
> -       mov     %rcx, %rax
> -       and     $0xf, %rax
> -       mov     $0, %rsi
> -
> -/* case: rcx_offset == rdx_offset */
> -
> -       jz      L(Align16Both)
> -
> -       cmp     $8, %rax
> -       jae     L(ShlHigh8)
> -       cmp     $1, %rax
> -       je      L(Shl1)
> -       cmp     $2, %rax
> -       je      L(Shl2)
> -       cmp     $3, %rax
> -       je      L(Shl3)
> -       cmp     $4, %rax
> -       je      L(Shl4)
> -       cmp     $5, %rax
> -       je      L(Shl5)
> -       cmp     $6, %rax
> -       je      L(Shl6)
> -       jmp     L(Shl7)
> -
> -L(ShlHigh8):
> -       je      L(Shl8)
> -       cmp     $9, %rax
> -       je      L(Shl9)
> -       cmp     $10, %rax
> -       je      L(Shl10)
> -       cmp     $11, %rax
> -       je      L(Shl11)
> -       cmp     $12, %rax
> -       je      L(Shl12)
> -       cmp     $13, %rax
> -       je      L(Shl13)
> -       cmp     $14, %rax
> -       je      L(Shl14)
> -       jmp     L(Shl15)
> -
> -L(Align16Both):
> -       movaps  (%rcx), %xmm1
> -       movaps  16(%rcx), %xmm2
> -       movaps  %xmm1, (%rdx)
> -       pcmpeqb %xmm2, %xmm0
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rsi), %rsi
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(CopyFrom1To16Bytes)
> -
> -       movaps  16(%rcx, %rsi), %xmm3
> -       movaps  %xmm2, (%rdx, %rsi)
> -       pcmpeqb %xmm3, %xmm0
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rsi), %rsi
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(CopyFrom1To16Bytes)
> -
> -       movaps  16(%rcx, %rsi), %xmm4
> -       movaps  %xmm3, (%rdx, %rsi)
> -       pcmpeqb %xmm4, %xmm0
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rsi), %rsi
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(CopyFrom1To16Bytes)
> -
> -       movaps  16(%rcx, %rsi), %xmm1
> -       movaps  %xmm4, (%rdx, %rsi)
> -       pcmpeqb %xmm1, %xmm0
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rsi), %rsi
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(CopyFrom1To16Bytes)
> -
> -       movaps  16(%rcx, %rsi), %xmm2
> -       movaps  %xmm1, (%rdx, %rsi)
> -       pcmpeqb %xmm2, %xmm0
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rsi), %rsi
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(CopyFrom1To16Bytes)
> -
> -       movaps  16(%rcx, %rsi), %xmm3
> -       movaps  %xmm2, (%rdx, %rsi)
> -       pcmpeqb %xmm3, %xmm0
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rsi), %rsi
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(CopyFrom1To16Bytes)
> -
> -       movaps  %xmm3, (%rdx, %rsi)
> -       mov     %rcx, %rax
> -       lea     16(%rcx, %rsi), %rcx
> -       and     $-0x40, %rcx
> -       sub     %rcx, %rax
> -       sub     %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> -       lea     112(%r8, %rax), %r8
> -# endif
> -       mov     $-0x40, %rsi
> -
> -       .p2align 4
> -L(Aligned64Loop):
> -       movaps  (%rcx), %xmm2
> -       movaps  %xmm2, %xmm4
> -       movaps  16(%rcx), %xmm5
> -       movaps  32(%rcx), %xmm3
> -       movaps  %xmm3, %xmm6
> -       movaps  48(%rcx), %xmm7
> -       pminub  %xmm5, %xmm2
> -       pminub  %xmm7, %xmm3
> -       pminub  %xmm2, %xmm3
> -       pcmpeqb %xmm0, %xmm3
> -       pmovmskb %xmm3, %rax
> -       lea     64(%rdx), %rdx
> -       lea     64(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $64, %r8
> -       jbe     L(StrncpyLeaveCase2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Aligned64Leave)
> -       movaps  %xmm4, -64(%rdx)
> -       movaps  %xmm5, -48(%rdx)
> -       movaps  %xmm6, -32(%rdx)
> -       movaps  %xmm7, -16(%rdx)
> -       jmp     L(Aligned64Loop)
> -
> -L(Aligned64Leave):
> -# ifdef USE_AS_STRNCPY
> -       lea     48(%r8), %r8
> -# endif
> -       pcmpeqb %xmm4, %xmm0
> -       pmovmskb %xmm0, %rax
> -       test    %rax, %rax
> -       jnz     L(CopyFrom1To16Bytes)
> -
> -       pcmpeqb %xmm5, %xmm0
> -# ifdef USE_AS_STRNCPY
> -       lea     -16(%r8), %r8
> -# endif
> -       pmovmskb %xmm0, %rax
> -       movaps  %xmm4, -64(%rdx)
> -       test    %rax, %rax
> -       lea     16(%rsi), %rsi
> -       jnz     L(CopyFrom1To16Bytes)
> -
> -       pcmpeqb %xmm6, %xmm0
> -# ifdef USE_AS_STRNCPY
> -       lea     -16(%r8), %r8
> -# endif
> -       pmovmskb %xmm0, %rax
> -       movaps  %xmm5, -48(%rdx)
> -       test    %rax, %rax
> -       lea     16(%rsi), %rsi
> -       jnz     L(CopyFrom1To16Bytes)
> -
> -       movaps  %xmm6, -32(%rdx)
> -       pcmpeqb %xmm7, %xmm0
> -# ifdef USE_AS_STRNCPY
> -       lea     -16(%r8), %r8
> -# endif
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rsi), %rsi
> -       jmp     L(CopyFrom1To16Bytes)
> -
> -       .p2align 4
> -L(Shl1):
> -       movaps  -1(%rcx), %xmm1
> -       movaps  15(%rcx), %xmm2
> -L(Shl1Start):
> -       pcmpeqb %xmm2, %xmm0
> -       pmovmskb %xmm0, %rax
> -       movaps  %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit1Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl1LoopExit)
> -
> -       palignr $1, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  31(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -       movaps  %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit1Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl1LoopExit)
> -
> -       palignr $1, %xmm3, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  31(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -       movaps  %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit1Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl1LoopExit)
> -
> -       palignr $1, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  31(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit1Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl1LoopExit)
> -
> -       palignr $1, %xmm3, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       lea     31(%rcx), %rcx
> -       lea     16(%rdx), %rdx
> -
> -       mov     %rcx, %rax
> -       and     $-0x40, %rcx
> -       sub     %rcx, %rax
> -       lea     -15(%rcx), %rcx
> -       sub     %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> -       add     %rax, %r8
> -# endif
> -       movaps  -1(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> -       .p2align 4
> -L(Shl1LoopStart):
> -       movaps  15(%rcx), %xmm2
> -       movaps  31(%rcx), %xmm3
> -       movaps  %xmm3, %xmm6
> -       movaps  47(%rcx), %xmm4
> -       movaps  %xmm4, %xmm7
> -       movaps  63(%rcx), %xmm5
> -       pminub  %xmm2, %xmm6
> -       pminub  %xmm5, %xmm7
> -       pminub  %xmm6, %xmm7
> -       pcmpeqb %xmm0, %xmm7
> -       pmovmskb %xmm7, %rax
> -       movaps  %xmm5, %xmm7
> -       palignr $1, %xmm4, %xmm5
> -       test    %rax, %rax
> -       palignr $1, %xmm3, %xmm4
> -       jnz     L(Shl1Start)
> -# ifdef USE_AS_STRNCPY
> -       sub     $64, %r8
> -       jbe     L(StrncpyLeave1)
> -# endif
> -       palignr $1, %xmm2, %xmm3
> -       lea     64(%rcx), %rcx
> -       palignr $1, %xmm1, %xmm2
> -       movaps  %xmm7, %xmm1
> -       movaps  %xmm5, 48(%rdx)
> -       movaps  %xmm4, 32(%rdx)
> -       movaps  %xmm3, 16(%rdx)
> -       movaps  %xmm2, (%rdx)
> -       lea     64(%rdx), %rdx
> -       jmp     L(Shl1LoopStart)
> -
> -L(Shl1LoopExit):
> -       movdqu  -1(%rcx), %xmm1
> -       mov     $15, %rsi
> -       movdqu  %xmm1, -1(%rdx)
> -       jmp     L(CopyFrom1To16Bytes)
> -
> -       .p2align 4
> -L(Shl2):
> -       movaps  -2(%rcx), %xmm1
> -       movaps  14(%rcx), %xmm2
> -L(Shl2Start):
> -       pcmpeqb %xmm2, %xmm0
> -       pmovmskb %xmm0, %rax
> -       movaps  %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit2Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl2LoopExit)
> -
> -       palignr $2, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  30(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -       movaps  %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit2Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl2LoopExit)
> -
> -       palignr $2, %xmm3, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  30(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -       movaps  %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit2Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl2LoopExit)
> -
> -       palignr $2, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  30(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit2Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl2LoopExit)
> -
> -       palignr $2, %xmm3, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       lea     30(%rcx), %rcx
> -       lea     16(%rdx), %rdx
> -
> -       mov     %rcx, %rax
> -       and     $-0x40, %rcx
> -       sub     %rcx, %rax
> -       lea     -14(%rcx), %rcx
> -       sub     %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> -       add     %rax, %r8
> -# endif
> -       movaps  -2(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> -       .p2align 4
> -L(Shl2LoopStart):
> -       movaps  14(%rcx), %xmm2
> -       movaps  30(%rcx), %xmm3
> -       movaps  %xmm3, %xmm6
> -       movaps  46(%rcx), %xmm4
> -       movaps  %xmm4, %xmm7
> -       movaps  62(%rcx), %xmm5
> -       pminub  %xmm2, %xmm6
> -       pminub  %xmm5, %xmm7
> -       pminub  %xmm6, %xmm7
> -       pcmpeqb %xmm0, %xmm7
> -       pmovmskb %xmm7, %rax
> -       movaps  %xmm5, %xmm7
> -       palignr $2, %xmm4, %xmm5
> -       test    %rax, %rax
> -       palignr $2, %xmm3, %xmm4
> -       jnz     L(Shl2Start)
> -# ifdef USE_AS_STRNCPY
> -       sub     $64, %r8
> -       jbe     L(StrncpyLeave2)
> -# endif
> -       palignr $2, %xmm2, %xmm3
> -       lea     64(%rcx), %rcx
> -       palignr $2, %xmm1, %xmm2
> -       movaps  %xmm7, %xmm1
> -       movaps  %xmm5, 48(%rdx)
> -       movaps  %xmm4, 32(%rdx)
> -       movaps  %xmm3, 16(%rdx)
> -       movaps  %xmm2, (%rdx)
> -       lea     64(%rdx), %rdx
> -       jmp     L(Shl2LoopStart)
> -
> -L(Shl2LoopExit):
> -       movdqu  -2(%rcx), %xmm1
> -       mov     $14, %rsi
> -       movdqu  %xmm1, -2(%rdx)
> -       jmp     L(CopyFrom1To16Bytes)
> -
> -       .p2align 4
> -L(Shl3):
> -       movaps  -3(%rcx), %xmm1
> -       movaps  13(%rcx), %xmm2
> -L(Shl3Start):
> -       pcmpeqb %xmm2, %xmm0
> -       pmovmskb %xmm0, %rax
> -       movaps  %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit3Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl3LoopExit)
> -
> -       palignr $3, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  29(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -       movaps  %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit3Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl3LoopExit)
> -
> -       palignr $3, %xmm3, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  29(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -       movaps  %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit3Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl3LoopExit)
> -
> -       palignr $3, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  29(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit3Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl3LoopExit)
> -
> -       palignr $3, %xmm3, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       lea     29(%rcx), %rcx
> -       lea     16(%rdx), %rdx
> -
> -       mov     %rcx, %rax
> -       and     $-0x40, %rcx
> -       sub     %rcx, %rax
> -       lea     -13(%rcx), %rcx
> -       sub     %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> -       add     %rax, %r8
> -# endif
> -       movaps  -3(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> -       .p2align 4
> -L(Shl3LoopStart):
> -       movaps  13(%rcx), %xmm2
> -       movaps  29(%rcx), %xmm3
> -       movaps  %xmm3, %xmm6
> -       movaps  45(%rcx), %xmm4
> -       movaps  %xmm4, %xmm7
> -       movaps  61(%rcx), %xmm5
> -       pminub  %xmm2, %xmm6
> -       pminub  %xmm5, %xmm7
> -       pminub  %xmm6, %xmm7
> -       pcmpeqb %xmm0, %xmm7
> -       pmovmskb %xmm7, %rax
> -       movaps  %xmm5, %xmm7
> -       palignr $3, %xmm4, %xmm5
> -       test    %rax, %rax
> -       palignr $3, %xmm3, %xmm4
> -       jnz     L(Shl3Start)
> -# ifdef USE_AS_STRNCPY
> -       sub     $64, %r8
> -       jbe     L(StrncpyLeave3)
> -# endif
> -       palignr $3, %xmm2, %xmm3
> -       lea     64(%rcx), %rcx
> -       palignr $3, %xmm1, %xmm2
> -       movaps  %xmm7, %xmm1
> -       movaps  %xmm5, 48(%rdx)
> -       movaps  %xmm4, 32(%rdx)
> -       movaps  %xmm3, 16(%rdx)
> -       movaps  %xmm2, (%rdx)
> -       lea     64(%rdx), %rdx
> -       jmp     L(Shl3LoopStart)
> -
> -L(Shl3LoopExit):
> -       movdqu  -3(%rcx), %xmm1
> -       mov     $13, %rsi
> -       movdqu  %xmm1, -3(%rdx)
> -       jmp     L(CopyFrom1To16Bytes)
> -
> -       .p2align 4
> -L(Shl4):
> -       movaps  -4(%rcx), %xmm1
> -       movaps  12(%rcx), %xmm2
> -L(Shl4Start):
> -       pcmpeqb %xmm2, %xmm0
> -       pmovmskb %xmm0, %rax
> -       movaps  %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit4Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl4LoopExit)
> -
> -       palignr $4, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  28(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -       movaps  %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit4Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl4LoopExit)
> -
> -       palignr $4, %xmm3, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  28(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -       movaps  %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit4Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl4LoopExit)
> -
> -       palignr $4, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  28(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit4Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl4LoopExit)
> -
> -       palignr $4, %xmm3, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       lea     28(%rcx), %rcx
> -       lea     16(%rdx), %rdx
> -
> -       mov     %rcx, %rax
> -       and     $-0x40, %rcx
> -       sub     %rcx, %rax
> -       lea     -12(%rcx), %rcx
> -       sub     %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> -       add     %rax, %r8
> -# endif
> -       movaps  -4(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> -       .p2align 4
> -L(Shl4LoopStart):
> -       movaps  12(%rcx), %xmm2
> -       movaps  28(%rcx), %xmm3
> -       movaps  %xmm3, %xmm6
> -       movaps  44(%rcx), %xmm4
> -       movaps  %xmm4, %xmm7
> -       movaps  60(%rcx), %xmm5
> -       pminub  %xmm2, %xmm6
> -       pminub  %xmm5, %xmm7
> -       pminub  %xmm6, %xmm7
> -       pcmpeqb %xmm0, %xmm7
> -       pmovmskb %xmm7, %rax
> -       movaps  %xmm5, %xmm7
> -       palignr $4, %xmm4, %xmm5
> -       test    %rax, %rax
> -       palignr $4, %xmm3, %xmm4
> -       jnz     L(Shl4Start)
> -# ifdef USE_AS_STRNCPY
> -       sub     $64, %r8
> -       jbe     L(StrncpyLeave4)
> -# endif
> -       palignr $4, %xmm2, %xmm3
> -       lea     64(%rcx), %rcx
> -       palignr $4, %xmm1, %xmm2
> -       movaps  %xmm7, %xmm1
> -       movaps  %xmm5, 48(%rdx)
> -       movaps  %xmm4, 32(%rdx)
> -       movaps  %xmm3, 16(%rdx)
> -       movaps  %xmm2, (%rdx)
> -       lea     64(%rdx), %rdx
> -       jmp     L(Shl4LoopStart)
> -
> -L(Shl4LoopExit):
> -       movdqu  -4(%rcx), %xmm1
> -       mov     $12, %rsi
> -       movdqu  %xmm1, -4(%rdx)
> -       jmp     L(CopyFrom1To16Bytes)
> -
> -       .p2align 4
> -L(Shl5):
> -       movaps  -5(%rcx), %xmm1
> -       movaps  11(%rcx), %xmm2
> -L(Shl5Start):
> -       pcmpeqb %xmm2, %xmm0
> -       pmovmskb %xmm0, %rax
> -       movaps  %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit5Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl5LoopExit)
> -
> -       palignr $5, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  27(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -       movaps  %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit5Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl5LoopExit)
> -
> -       palignr $5, %xmm3, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  27(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -       movaps  %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit5Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl5LoopExit)
> -
> -       palignr $5, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  27(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit5Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl5LoopExit)
> -
> -       palignr $5, %xmm3, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       lea     27(%rcx), %rcx
> -       lea     16(%rdx), %rdx
> -
> -       mov     %rcx, %rax
> -       and     $-0x40, %rcx
> -       sub     %rcx, %rax
> -       lea     -11(%rcx), %rcx
> -       sub     %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> -       add     %rax, %r8
> -# endif
> -       movaps  -5(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> -       .p2align 4
> -L(Shl5LoopStart):
> -       movaps  11(%rcx), %xmm2
> -       movaps  27(%rcx), %xmm3
> -       movaps  %xmm3, %xmm6
> -       movaps  43(%rcx), %xmm4
> -       movaps  %xmm4, %xmm7
> -       movaps  59(%rcx), %xmm5
> -       pminub  %xmm2, %xmm6
> -       pminub  %xmm5, %xmm7
> -       pminub  %xmm6, %xmm7
> -       pcmpeqb %xmm0, %xmm7
> -       pmovmskb %xmm7, %rax
> -       movaps  %xmm5, %xmm7
> -       palignr $5, %xmm4, %xmm5
> -       test    %rax, %rax
> -       palignr $5, %xmm3, %xmm4
> -       jnz     L(Shl5Start)
> -# ifdef USE_AS_STRNCPY
> -       sub     $64, %r8
> -       jbe     L(StrncpyLeave5)
> -# endif
> -       palignr $5, %xmm2, %xmm3
> -       lea     64(%rcx), %rcx
> -       palignr $5, %xmm1, %xmm2
> -       movaps  %xmm7, %xmm1
> -       movaps  %xmm5, 48(%rdx)
> -       movaps  %xmm4, 32(%rdx)
> -       movaps  %xmm3, 16(%rdx)
> -       movaps  %xmm2, (%rdx)
> -       lea     64(%rdx), %rdx
> -       jmp     L(Shl5LoopStart)
> -
> -L(Shl5LoopExit):
> -       movdqu  -5(%rcx), %xmm1
> -       mov     $11, %rsi
> -       movdqu  %xmm1, -5(%rdx)
> -       jmp     L(CopyFrom1To16Bytes)
> -
> -       .p2align 4
> -L(Shl6):
> -       movaps  -6(%rcx), %xmm1
> -       movaps  10(%rcx), %xmm2
> -L(Shl6Start):
> -       pcmpeqb %xmm2, %xmm0
> -       pmovmskb %xmm0, %rax
> -       movaps  %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit6Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl6LoopExit)
> -
> -       palignr $6, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  26(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -       movaps  %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit6Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl6LoopExit)
> -
> -       palignr $6, %xmm3, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  26(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -       movaps  %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit6Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl6LoopExit)
> -
> -       palignr $6, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  26(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit6Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl6LoopExit)
> -
> -       palignr $6, %xmm3, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       lea     26(%rcx), %rcx
> -       lea     16(%rdx), %rdx
> -
> -       mov     %rcx, %rax
> -       and     $-0x40, %rcx
> -       sub     %rcx, %rax
> -       lea     -10(%rcx), %rcx
> -       sub     %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> -       add     %rax, %r8
> -# endif
> -       movaps  -6(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> -       .p2align 4
> -L(Shl6LoopStart):
> -       movaps  10(%rcx), %xmm2
> -       movaps  26(%rcx), %xmm3
> -       movaps  %xmm3, %xmm6
> -       movaps  42(%rcx), %xmm4
> -       movaps  %xmm4, %xmm7
> -       movaps  58(%rcx), %xmm5
> -       pminub  %xmm2, %xmm6
> -       pminub  %xmm5, %xmm7
> -       pminub  %xmm6, %xmm7
> -       pcmpeqb %xmm0, %xmm7
> -       pmovmskb %xmm7, %rax
> -       movaps  %xmm5, %xmm7
> -       palignr $6, %xmm4, %xmm5
> -       test    %rax, %rax
> -       palignr $6, %xmm3, %xmm4
> -       jnz     L(Shl6Start)
> -# ifdef USE_AS_STRNCPY
> -       sub     $64, %r8
> -       jbe     L(StrncpyLeave6)
> -# endif
> -       palignr $6, %xmm2, %xmm3
> -       lea     64(%rcx), %rcx
> -       palignr $6, %xmm1, %xmm2
> -       movaps  %xmm7, %xmm1
> -       movaps  %xmm5, 48(%rdx)
> -       movaps  %xmm4, 32(%rdx)
> -       movaps  %xmm3, 16(%rdx)
> -       movaps  %xmm2, (%rdx)
> -       lea     64(%rdx), %rdx
> -       jmp     L(Shl6LoopStart)
> -
> -L(Shl6LoopExit):
> -       mov     (%rcx), %r9
> -       mov     6(%rcx), %esi
> -       mov     %r9, (%rdx)
> -       mov     %esi, 6(%rdx)
> -       mov     $10, %rsi
> -       jmp     L(CopyFrom1To16Bytes)
> -
> -       .p2align 4
> -L(Shl7):
> -       movaps  -7(%rcx), %xmm1
> -       movaps  9(%rcx), %xmm2
> -L(Shl7Start):
> -       pcmpeqb %xmm2, %xmm0
> -       pmovmskb %xmm0, %rax
> -       movaps  %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit7Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl7LoopExit)
> -
> -       palignr $7, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  25(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -       movaps  %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit7Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl7LoopExit)
> -
> -       palignr $7, %xmm3, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  25(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -       movaps  %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit7Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl7LoopExit)
> -
> -       palignr $7, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  25(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit7Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl7LoopExit)
> -
> -       palignr $7, %xmm3, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       lea     25(%rcx), %rcx
> -       lea     16(%rdx), %rdx
> -
> -       mov     %rcx, %rax
> -       and     $-0x40, %rcx
> -       sub     %rcx, %rax
> -       lea     -9(%rcx), %rcx
> -       sub     %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> -       add     %rax, %r8
> -# endif
> -       movaps  -7(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> -       .p2align 4
> -L(Shl7LoopStart):
> -       movaps  9(%rcx), %xmm2
> -       movaps  25(%rcx), %xmm3
> -       movaps  %xmm3, %xmm6
> -       movaps  41(%rcx), %xmm4
> -       movaps  %xmm4, %xmm7
> -       movaps  57(%rcx), %xmm5
> -       pminub  %xmm2, %xmm6
> -       pminub  %xmm5, %xmm7
> -       pminub  %xmm6, %xmm7
> -       pcmpeqb %xmm0, %xmm7
> -       pmovmskb %xmm7, %rax
> -       movaps  %xmm5, %xmm7
> -       palignr $7, %xmm4, %xmm5
> -       test    %rax, %rax
> -       palignr $7, %xmm3, %xmm4
> -       jnz     L(Shl7Start)
> -# ifdef USE_AS_STRNCPY
> -       sub     $64, %r8
> -       jbe     L(StrncpyLeave7)
> -# endif
> -       palignr $7, %xmm2, %xmm3
> -       lea     64(%rcx), %rcx
> -       palignr $7, %xmm1, %xmm2
> -       movaps  %xmm7, %xmm1
> -       movaps  %xmm5, 48(%rdx)
> -       movaps  %xmm4, 32(%rdx)
> -       movaps  %xmm3, 16(%rdx)
> -       movaps  %xmm2, (%rdx)
> -       lea     64(%rdx), %rdx
> -       jmp     L(Shl7LoopStart)
> -
> -L(Shl7LoopExit):
> -       mov     (%rcx), %r9
> -       mov     5(%rcx), %esi
> -       mov     %r9, (%rdx)
> -       mov     %esi, 5(%rdx)
> -       mov     $9, %rsi
> -       jmp     L(CopyFrom1To16Bytes)
> -
> -       .p2align 4
> -L(Shl8):
> -       movaps  -8(%rcx), %xmm1
> -       movaps  8(%rcx), %xmm2
> -L(Shl8Start):
> -       pcmpeqb %xmm2, %xmm0
> -       pmovmskb %xmm0, %rax
> -       movaps  %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit8Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl8LoopExit)
> -
> -       palignr $8, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  24(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -       movaps  %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit8Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl8LoopExit)
> -
> -       palignr $8, %xmm3, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  24(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -       movaps  %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit8Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl8LoopExit)
> -
> -       palignr $8, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  24(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit8Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl8LoopExit)
> -
> -       palignr $8, %xmm3, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       lea     24(%rcx), %rcx
> -       lea     16(%rdx), %rdx
> -
> -       mov     %rcx, %rax
> -       and     $-0x40, %rcx
> -       sub     %rcx, %rax
> -       lea     -8(%rcx), %rcx
> -       sub     %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> -       add     %rax, %r8
> -# endif
> -       movaps  -8(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> -       .p2align 4
> -L(Shl8LoopStart):
> -       movaps  8(%rcx), %xmm2
> -       movaps  24(%rcx), %xmm3
> -       movaps  %xmm3, %xmm6
> -       movaps  40(%rcx), %xmm4
> -       movaps  %xmm4, %xmm7
> -       movaps  56(%rcx), %xmm5
> -       pminub  %xmm2, %xmm6
> -       pminub  %xmm5, %xmm7
> -       pminub  %xmm6, %xmm7
> -       pcmpeqb %xmm0, %xmm7
> -       pmovmskb %xmm7, %rax
> -       movaps  %xmm5, %xmm7
> -       palignr $8, %xmm4, %xmm5
> -       test    %rax, %rax
> -       palignr $8, %xmm3, %xmm4
> -       jnz     L(Shl8Start)
> -# ifdef USE_AS_STRNCPY
> -       sub     $64, %r8
> -       jbe     L(StrncpyLeave8)
> -# endif
> -       palignr $8, %xmm2, %xmm3
> -       lea     64(%rcx), %rcx
> -       palignr $8, %xmm1, %xmm2
> -       movaps  %xmm7, %xmm1
> -       movaps  %xmm5, 48(%rdx)
> -       movaps  %xmm4, 32(%rdx)
> -       movaps  %xmm3, 16(%rdx)
> -       movaps  %xmm2, (%rdx)
> -       lea     64(%rdx), %rdx
> -       jmp     L(Shl8LoopStart)
> -
> -L(Shl8LoopExit):
> -       mov     (%rcx), %r9
> -       mov     $8, %rsi
> -       mov     %r9, (%rdx)
> -       jmp     L(CopyFrom1To16Bytes)
> -
> -       .p2align 4
> -L(Shl9):
> -       movaps  -9(%rcx), %xmm1
> -       movaps  7(%rcx), %xmm2
> -L(Shl9Start):
> -       pcmpeqb %xmm2, %xmm0
> -       pmovmskb %xmm0, %rax
> -       movaps  %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit9Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl9LoopExit)
> -
> -       palignr $9, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  23(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -       movaps  %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit9Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl9LoopExit)
> -
> -       palignr $9, %xmm3, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  23(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -       movaps  %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit9Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl9LoopExit)
> -
> -       palignr $9, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  23(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit9Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl9LoopExit)
> -
> -       palignr $9, %xmm3, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       lea     23(%rcx), %rcx
> -       lea     16(%rdx), %rdx
> -
> -       mov     %rcx, %rax
> -       and     $-0x40, %rcx
> -       sub     %rcx, %rax
> -       lea     -7(%rcx), %rcx
> -       sub     %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> -       add     %rax, %r8
> -# endif
> -       movaps  -9(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> -       .p2align 4
> -L(Shl9LoopStart):
> -       movaps  7(%rcx), %xmm2
> -       movaps  23(%rcx), %xmm3
> -       movaps  %xmm3, %xmm6
> -       movaps  39(%rcx), %xmm4
> -       movaps  %xmm4, %xmm7
> -       movaps  55(%rcx), %xmm5
> -       pminub  %xmm2, %xmm6
> -       pminub  %xmm5, %xmm7
> -       pminub  %xmm6, %xmm7
> -       pcmpeqb %xmm0, %xmm7
> -       pmovmskb %xmm7, %rax
> -       movaps  %xmm5, %xmm7
> -       palignr $9, %xmm4, %xmm5
> -       test    %rax, %rax
> -       palignr $9, %xmm3, %xmm4
> -       jnz     L(Shl9Start)
> -# ifdef USE_AS_STRNCPY
> -       sub     $64, %r8
> -       jbe     L(StrncpyLeave9)
> -# endif
> -       palignr $9, %xmm2, %xmm3
> -       lea     64(%rcx), %rcx
> -       palignr $9, %xmm1, %xmm2
> -       movaps  %xmm7, %xmm1
> -       movaps  %xmm5, 48(%rdx)
> -       movaps  %xmm4, 32(%rdx)
> -       movaps  %xmm3, 16(%rdx)
> -       movaps  %xmm2, (%rdx)
> -       lea     64(%rdx), %rdx
> -       jmp     L(Shl9LoopStart)
> -
> -L(Shl9LoopExit):
> -       mov     -1(%rcx), %r9
> -       mov     $7, %rsi
> -       mov     %r9, -1(%rdx)
> -       jmp     L(CopyFrom1To16Bytes)
> -
> -       .p2align 4
> -L(Shl10):
> -       movaps  -10(%rcx), %xmm1
> -       movaps  6(%rcx), %xmm2
> -L(Shl10Start):
> -       pcmpeqb %xmm2, %xmm0
> -       pmovmskb %xmm0, %rax
> -       movaps  %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit10Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl10LoopExit)
> -
> -       palignr $10, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  22(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -       movaps  %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit10Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl10LoopExit)
> -
> -       palignr $10, %xmm3, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  22(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -       movaps  %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit10Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl10LoopExit)
> -
> -       palignr $10, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  22(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit10Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl10LoopExit)
> -
> -       palignr $10, %xmm3, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       lea     22(%rcx), %rcx
> -       lea     16(%rdx), %rdx
> -
> -       mov     %rcx, %rax
> -       and     $-0x40, %rcx
> -       sub     %rcx, %rax
> -       lea     -6(%rcx), %rcx
> -       sub     %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> -       add     %rax, %r8
> -# endif
> -       movaps  -10(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> -       .p2align 4
> -L(Shl10LoopStart):
> -       movaps  6(%rcx), %xmm2
> -       movaps  22(%rcx), %xmm3
> -       movaps  %xmm3, %xmm6
> -       movaps  38(%rcx), %xmm4
> -       movaps  %xmm4, %xmm7
> -       movaps  54(%rcx), %xmm5
> -       pminub  %xmm2, %xmm6
> -       pminub  %xmm5, %xmm7
> -       pminub  %xmm6, %xmm7
> -       pcmpeqb %xmm0, %xmm7
> -       pmovmskb %xmm7, %rax
> -       movaps  %xmm5, %xmm7
> -       palignr $10, %xmm4, %xmm5
> -       test    %rax, %rax
> -       palignr $10, %xmm3, %xmm4
> -       jnz     L(Shl10Start)
> -# ifdef USE_AS_STRNCPY
> -       sub     $64, %r8
> -       jbe     L(StrncpyLeave10)
> -# endif
> -       palignr $10, %xmm2, %xmm3
> -       lea     64(%rcx), %rcx
> -       palignr $10, %xmm1, %xmm2
> -       movaps  %xmm7, %xmm1
> -       movaps  %xmm5, 48(%rdx)
> -       movaps  %xmm4, 32(%rdx)
> -       movaps  %xmm3, 16(%rdx)
> -       movaps  %xmm2, (%rdx)
> -       lea     64(%rdx), %rdx
> -       jmp     L(Shl10LoopStart)
> -
> -L(Shl10LoopExit):
> -       mov     -2(%rcx), %r9
> -       mov     $6, %rsi
> -       mov     %r9, -2(%rdx)
> -       jmp     L(CopyFrom1To16Bytes)
> -
> -       .p2align 4
> -L(Shl11):
> -       movaps  -11(%rcx), %xmm1
> -       movaps  5(%rcx), %xmm2
> -L(Shl11Start):
> -       pcmpeqb %xmm2, %xmm0
> -       pmovmskb %xmm0, %rax
> -       movaps  %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit11Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl11LoopExit)
> -
> -       palignr $11, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  21(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -       movaps  %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit11Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl11LoopExit)
> -
> -       palignr $11, %xmm3, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  21(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -       movaps  %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit11Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl11LoopExit)
> -
> -       palignr $11, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  21(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit11Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl11LoopExit)
> -
> -       palignr $11, %xmm3, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       lea     21(%rcx), %rcx
> -       lea     16(%rdx), %rdx
> -
> -       mov     %rcx, %rax
> -       and     $-0x40, %rcx
> -       sub     %rcx, %rax
> -       lea     -5(%rcx), %rcx
> -       sub     %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> -       add     %rax, %r8
> -# endif
> -       movaps  -11(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> -       .p2align 4
> -L(Shl11LoopStart):
> -       movaps  5(%rcx), %xmm2
> -       movaps  21(%rcx), %xmm3
> -       movaps  %xmm3, %xmm6
> -       movaps  37(%rcx), %xmm4
> -       movaps  %xmm4, %xmm7
> -       movaps  53(%rcx), %xmm5
> -       pminub  %xmm2, %xmm6
> -       pminub  %xmm5, %xmm7
> -       pminub  %xmm6, %xmm7
> -       pcmpeqb %xmm0, %xmm7
> -       pmovmskb %xmm7, %rax
> -       movaps  %xmm5, %xmm7
> -       palignr $11, %xmm4, %xmm5
> -       test    %rax, %rax
> -       palignr $11, %xmm3, %xmm4
> -       jnz     L(Shl11Start)
> -# ifdef USE_AS_STRNCPY
> -       sub     $64, %r8
> -       jbe     L(StrncpyLeave11)
> -# endif
> -       palignr $11, %xmm2, %xmm3
> -       lea     64(%rcx), %rcx
> -       palignr $11, %xmm1, %xmm2
> -       movaps  %xmm7, %xmm1
> -       movaps  %xmm5, 48(%rdx)
> -       movaps  %xmm4, 32(%rdx)
> -       movaps  %xmm3, 16(%rdx)
> -       movaps  %xmm2, (%rdx)
> -       lea     64(%rdx), %rdx
> -       jmp     L(Shl11LoopStart)
> -
> -L(Shl11LoopExit):
> -       mov     -3(%rcx), %r9
> -       mov     $5, %rsi
> -       mov     %r9, -3(%rdx)
> -       jmp     L(CopyFrom1To16Bytes)
> -
> -       .p2align 4
> -L(Shl12):
> -       movaps  -12(%rcx), %xmm1
> -       movaps  4(%rcx), %xmm2
> -L(Shl12Start):
> -       pcmpeqb %xmm2, %xmm0
> -       pmovmskb %xmm0, %rax
> -       movaps  %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit12Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl12LoopExit)
> -
> -       palignr $12, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  20(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -       movaps  %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit12Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl12LoopExit)
> -
> -       palignr $12, %xmm3, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  20(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -       movaps  %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit12Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl12LoopExit)
> -
> -       palignr $12, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  20(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit12Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl12LoopExit)
> -
> -       palignr $12, %xmm3, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       lea     20(%rcx), %rcx
> -       lea     16(%rdx), %rdx
> -
> -       mov     %rcx, %rax
> -       and     $-0x40, %rcx
> -       sub     %rcx, %rax
> -       lea     -4(%rcx), %rcx
> -       sub     %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> -       add     %rax, %r8
> -# endif
> -       movaps  -12(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> -       .p2align 4
> -L(Shl12LoopStart):
> -       movaps  4(%rcx), %xmm2
> -       movaps  20(%rcx), %xmm3
> -       movaps  %xmm3, %xmm6
> -       movaps  36(%rcx), %xmm4
> -       movaps  %xmm4, %xmm7
> -       movaps  52(%rcx), %xmm5
> -       pminub  %xmm2, %xmm6
> -       pminub  %xmm5, %xmm7
> -       pminub  %xmm6, %xmm7
> -       pcmpeqb %xmm0, %xmm7
> -       pmovmskb %xmm7, %rax
> -       movaps  %xmm5, %xmm7
> -       palignr $12, %xmm4, %xmm5
> -       test    %rax, %rax
> -       palignr $12, %xmm3, %xmm4
> -       jnz     L(Shl12Start)
> -# ifdef USE_AS_STRNCPY
> -       sub     $64, %r8
> -       jbe     L(StrncpyLeave12)
> -# endif
> -       palignr $12, %xmm2, %xmm3
> -       lea     64(%rcx), %rcx
> -       palignr $12, %xmm1, %xmm2
> -       movaps  %xmm7, %xmm1
> -       movaps  %xmm5, 48(%rdx)
> -       movaps  %xmm4, 32(%rdx)
> -       movaps  %xmm3, 16(%rdx)
> -       movaps  %xmm2, (%rdx)
> -       lea     64(%rdx), %rdx
> -       jmp     L(Shl12LoopStart)
> -
> -L(Shl12LoopExit):
> -       mov     (%rcx), %r9d
> -       mov     $4, %rsi
> -       mov     %r9d, (%rdx)
> -       jmp     L(CopyFrom1To16Bytes)
> -
> -       .p2align 4
> -L(Shl13):
> -       movaps  -13(%rcx), %xmm1
> -       movaps  3(%rcx), %xmm2
> -L(Shl13Start):
> -       pcmpeqb %xmm2, %xmm0
> -       pmovmskb %xmm0, %rax
> -       movaps  %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit13Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl13LoopExit)
> -
> -       palignr $13, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  19(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -       movaps  %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit13Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl13LoopExit)
> -
> -       palignr $13, %xmm3, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  19(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -       movaps  %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit13Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl13LoopExit)
> -
> -       palignr $13, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  19(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit13Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl13LoopExit)
> -
> -       palignr $13, %xmm3, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       lea     19(%rcx), %rcx
> -       lea     16(%rdx), %rdx
> -
> -       mov     %rcx, %rax
> -       and     $-0x40, %rcx
> -       sub     %rcx, %rax
> -       lea     -3(%rcx), %rcx
> -       sub     %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> -       add     %rax, %r8
> -# endif
> -       movaps  -13(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> -       .p2align 4
> -L(Shl13LoopStart):
> -       movaps  3(%rcx), %xmm2
> -       movaps  19(%rcx), %xmm3
> -       movaps  %xmm3, %xmm6
> -       movaps  35(%rcx), %xmm4
> -       movaps  %xmm4, %xmm7
> -       movaps  51(%rcx), %xmm5
> -       pminub  %xmm2, %xmm6
> -       pminub  %xmm5, %xmm7
> -       pminub  %xmm6, %xmm7
> -       pcmpeqb %xmm0, %xmm7
> -       pmovmskb %xmm7, %rax
> -       movaps  %xmm5, %xmm7
> -       palignr $13, %xmm4, %xmm5
> -       test    %rax, %rax
> -       palignr $13, %xmm3, %xmm4
> -       jnz     L(Shl13Start)
> -# ifdef USE_AS_STRNCPY
> -       sub     $64, %r8
> -       jbe     L(StrncpyLeave13)
> -# endif
> -       palignr $13, %xmm2, %xmm3
> -       lea     64(%rcx), %rcx
> -       palignr $13, %xmm1, %xmm2
> -       movaps  %xmm7, %xmm1
> -       movaps  %xmm5, 48(%rdx)
> -       movaps  %xmm4, 32(%rdx)
> -       movaps  %xmm3, 16(%rdx)
> -       movaps  %xmm2, (%rdx)
> -       lea     64(%rdx), %rdx
> -       jmp     L(Shl13LoopStart)
> -
> -L(Shl13LoopExit):
> -       mov     -1(%rcx), %r9d
> -       mov     $3, %rsi
> -       mov     %r9d, -1(%rdx)
> -       jmp     L(CopyFrom1To16Bytes)
> -
> -       .p2align 4
> -L(Shl14):
> -       movaps  -14(%rcx), %xmm1
> -       movaps  2(%rcx), %xmm2
> -L(Shl14Start):
> -       pcmpeqb %xmm2, %xmm0
> -       pmovmskb %xmm0, %rax
> -       movaps  %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit14Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl14LoopExit)
> -
> -       palignr $14, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  18(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -       movaps  %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit14Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl14LoopExit)
> -
> -       palignr $14, %xmm3, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  18(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -       movaps  %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit14Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl14LoopExit)
> -
> -       palignr $14, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  18(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit14Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl14LoopExit)
> -
> -       palignr $14, %xmm3, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       lea     18(%rcx), %rcx
> -       lea     16(%rdx), %rdx
> -
> -       mov     %rcx, %rax
> -       and     $-0x40, %rcx
> -       sub     %rcx, %rax
> -       lea     -2(%rcx), %rcx
> -       sub     %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> -       add     %rax, %r8
> -# endif
> -       movaps  -14(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> -       .p2align 4
> -L(Shl14LoopStart):
> -       movaps  2(%rcx), %xmm2
> -       movaps  18(%rcx), %xmm3
> -       movaps  %xmm3, %xmm6
> -       movaps  34(%rcx), %xmm4
> -       movaps  %xmm4, %xmm7
> -       movaps  50(%rcx), %xmm5
> -       pminub  %xmm2, %xmm6
> -       pminub  %xmm5, %xmm7
> -       pminub  %xmm6, %xmm7
> -       pcmpeqb %xmm0, %xmm7
> -       pmovmskb %xmm7, %rax
> -       movaps  %xmm5, %xmm7
> -       palignr $14, %xmm4, %xmm5
> -       test    %rax, %rax
> -       palignr $14, %xmm3, %xmm4
> -       jnz     L(Shl14Start)
> -# ifdef USE_AS_STRNCPY
> -       sub     $64, %r8
> -       jbe     L(StrncpyLeave14)
> -# endif
> -       palignr $14, %xmm2, %xmm3
> -       lea     64(%rcx), %rcx
> -       palignr $14, %xmm1, %xmm2
> -       movaps  %xmm7, %xmm1
> -       movaps  %xmm5, 48(%rdx)
> -       movaps  %xmm4, 32(%rdx)
> -       movaps  %xmm3, 16(%rdx)
> -       movaps  %xmm2, (%rdx)
> -       lea     64(%rdx), %rdx
> -       jmp     L(Shl14LoopStart)
> -
> -L(Shl14LoopExit):
> -       mov     -2(%rcx), %r9d
> -       mov     $2, %rsi
> -       mov     %r9d, -2(%rdx)
> -       jmp     L(CopyFrom1To16Bytes)
> -
> -       .p2align 4
> -L(Shl15):
> -       movaps  -15(%rcx), %xmm1
> -       movaps  1(%rcx), %xmm2
> -L(Shl15Start):
> -       pcmpeqb %xmm2, %xmm0
> -       pmovmskb %xmm0, %rax
> -       movaps  %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit15Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl15LoopExit)
> -
> -       palignr $15, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  17(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -       movaps  %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit15Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl15LoopExit)
> -
> -       palignr $15, %xmm3, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  17(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -       movaps  %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit15Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl15LoopExit)
> -
> -       palignr $15, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  17(%rcx), %xmm2
> -
> -       pcmpeqb %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit15Case2OrCase3)
> -# endif
> -       test    %rax, %rax
> -       jnz     L(Shl15LoopExit)
> -
> -       palignr $15, %xmm3, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       lea     17(%rcx), %rcx
> -       lea     16(%rdx), %rdx
> -
> -       mov     %rcx, %rax
> -       and     $-0x40, %rcx
> -       sub     %rcx, %rax
> -       lea     -1(%rcx), %rcx
> -       sub     %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> -       add     %rax, %r8
> -# endif
> -       movaps  -15(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> -       .p2align 4
> -L(Shl15LoopStart):
> -       movaps  1(%rcx), %xmm2
> -       movaps  17(%rcx), %xmm3
> -       movaps  %xmm3, %xmm6
> -       movaps  33(%rcx), %xmm4
> -       movaps  %xmm4, %xmm7
> -       movaps  49(%rcx), %xmm5
> -       pminub  %xmm2, %xmm6
> -       pminub  %xmm5, %xmm7
> -       pminub  %xmm6, %xmm7
> -       pcmpeqb %xmm0, %xmm7
> -       pmovmskb %xmm7, %rax
> -       movaps  %xmm5, %xmm7
> -       palignr $15, %xmm4, %xmm5
> -       test    %rax, %rax
> -       palignr $15, %xmm3, %xmm4
> -       jnz     L(Shl15Start)
> -# ifdef USE_AS_STRNCPY
> -       sub     $64, %r8
> -       jbe     L(StrncpyLeave15)
> -# endif
> -       palignr $15, %xmm2, %xmm3
> -       lea     64(%rcx), %rcx
> -       palignr $15, %xmm1, %xmm2
> -       movaps  %xmm7, %xmm1
> -       movaps  %xmm5, 48(%rdx)
> -       movaps  %xmm4, 32(%rdx)
> -       movaps  %xmm3, 16(%rdx)
> -       movaps  %xmm2, (%rdx)
> -       lea     64(%rdx), %rdx
> -       jmp     L(Shl15LoopStart)
> -
> -L(Shl15LoopExit):
> -       mov     -3(%rcx), %r9d
> -       mov     $1, %rsi
> -       mov     %r9d, -3(%rdx)
> -# ifdef USE_AS_STRCAT
> -       jmp     L(CopyFrom1To16Bytes)
> -# endif
> -
> -# ifndef USE_AS_STRCAT
> -
> -       .p2align 4
> -L(CopyFrom1To16Bytes):
> -#  ifdef USE_AS_STRNCPY
> -       add     $16, %r8
> -#  endif
> -       add     %rsi, %rdx
> -       add     %rsi, %rcx
> -
> -       test    %al, %al
> -       jz      L(ExitHigh)
> -       test    $0x01, %al
> -       jnz     L(Exit1)
> -       test    $0x02, %al
> -       jnz     L(Exit2)
> -       test    $0x04, %al
> -       jnz     L(Exit3)
> -       test    $0x08, %al
> -       jnz     L(Exit4)
> -       test    $0x10, %al
> -       jnz     L(Exit5)
> -       test    $0x20, %al
> -       jnz     L(Exit6)
> -       test    $0x40, %al
> -       jnz     L(Exit7)
> -
> -       .p2align 4
> -L(Exit8):
> -       mov     (%rcx), %rax
> -       mov     %rax, (%rdx)
> -#  ifdef USE_AS_STPCPY
> -       lea     7(%rdx), %rax
> -#  else
> -       mov     %rdi, %rax
> -#  endif
> -#  ifdef USE_AS_STRNCPY
> -       sub     $8, %r8
> -       lea     8(%rdx), %rcx
> -       jnz     L(StrncpyFillTailWithZero1)
> -#   ifdef USE_AS_STPCPY
> -       cmpb    $1, (%rax)
> -       sbb     $-1, %rax
> -#   endif
> -#  endif
> -       ret
> -
> -       .p2align 4
> -L(ExitHigh):
> -       test    $0x01, %ah
> -       jnz     L(Exit9)
> -       test    $0x02, %ah
> -       jnz     L(Exit10)
> -       test    $0x04, %ah
> -       jnz     L(Exit11)
> -       test    $0x08, %ah
> -       jnz     L(Exit12)
> -       test    $0x10, %ah
> -       jnz     L(Exit13)
> -       test    $0x20, %ah
> -       jnz     L(Exit14)
> -       test    $0x40, %ah
> -       jnz     L(Exit15)
> -
> -       .p2align 4
> -L(Exit16):
> -       mov     (%rcx), %rax
> -       mov     %rax, (%rdx)
> -       mov     8(%rcx), %rax
> -       mov     %rax, 8(%rdx)
> -#  ifdef USE_AS_STPCPY
> -       lea     15(%rdx), %rax
> -#  else
> -       mov     %rdi, %rax
> -#  endif
> -#  ifdef USE_AS_STRNCPY
> -       sub     $16, %r8
> -       lea     16(%rdx), %rcx
> -       jnz     L(StrncpyFillTailWithZero1)
> -#   ifdef USE_AS_STPCPY
> -       cmpb    $1, (%rax)
> -       sbb     $-1, %rax
> -#   endif
> -#  endif
> -       ret
> -
> -#  ifdef USE_AS_STRNCPY
> -
> -       .p2align 4
> -L(CopyFrom1To16BytesCase2):
> -       add     $16, %r8
> -       add     %rsi, %rcx
> -       lea     (%rsi, %rdx), %rsi
> -       lea     -9(%r8), %rdx
> -       and     $1<<7, %dh
> -       or      %al, %dh
> -       test    %dh, %dh
> -       lea     (%rsi), %rdx
> -       jz      L(ExitHighCase2)
> -
> -       cmp     $1, %r8
> -       je      L(Exit1)
> -       test    $0x01, %al
> -       jnz     L(Exit1)
> -       cmp     $2, %r8
> -       je      L(Exit2)
> -       test    $0x02, %al
> -       jnz     L(Exit2)
> -       cmp     $3, %r8
> -       je      L(Exit3)
> -       test    $0x04, %al
> -       jnz     L(Exit3)
> -       cmp     $4, %r8
> -       je      L(Exit4)
> -       test    $0x08, %al
> -       jnz     L(Exit4)
> -       cmp     $5, %r8
> -       je      L(Exit5)
> -       test    $0x10, %al
> -       jnz     L(Exit5)
> -       cmp     $6, %r8
> -       je      L(Exit6)
> -       test    $0x20, %al
> -       jnz     L(Exit6)
> -       cmp     $7, %r8
> -       je      L(Exit7)
> -       test    $0x40, %al
> -       jnz     L(Exit7)
> -       jmp     L(Exit8)
> -
> -       .p2align 4
> -L(ExitHighCase2):
> -       cmp     $9, %r8
> -       je      L(Exit9)
> -       test    $0x01, %ah
> -       jnz     L(Exit9)
> -       cmp     $10, %r8
> -       je      L(Exit10)
> -       test    $0x02, %ah
> -       jnz     L(Exit10)
> -       cmp     $11, %r8
> -       je      L(Exit11)
> -       test    $0x04, %ah
> -       jnz     L(Exit11)
> -       cmp     $12, %r8
> -       je      L(Exit12)
> -       test    $0x8, %ah
> -       jnz     L(Exit12)
> -       cmp     $13, %r8
> -       je      L(Exit13)
> -       test    $0x10, %ah
> -       jnz     L(Exit13)
> -       cmp     $14, %r8
> -       je      L(Exit14)
> -       test    $0x20, %ah
> -       jnz     L(Exit14)
> -       cmp     $15, %r8
> -       je      L(Exit15)
> -       test    $0x40, %ah
> -       jnz     L(Exit15)
> -       jmp     L(Exit16)
> -
> -L(CopyFrom1To16BytesCase2OrCase3):
> -       test    %rax, %rax
> -       jnz     L(CopyFrom1To16BytesCase2)
> -
> -       .p2align 4
> -L(CopyFrom1To16BytesCase3):
> -       add     $16, %r8
> -       add     %rsi, %rdx
> -       add     %rsi, %rcx
> -
> -       cmp     $16, %r8
> -       je      L(Exit16)
> -       cmp     $8, %r8
> -       je      L(Exit8)
> -       jg      L(More8Case3)
> -       cmp     $4, %r8
> -       je      L(Exit4)
> -       jg      L(More4Case3)
> -       cmp     $2, %r8
> -       jl      L(Exit1)
> -       je      L(Exit2)
> -       jg      L(Exit3)
> -L(More8Case3): /* but less than 16 */
> -       cmp     $12, %r8
> -       je      L(Exit12)
> -       jl      L(Less12Case3)
> -       cmp     $14, %r8
> -       jl      L(Exit13)
> -       je      L(Exit14)
> -       jg      L(Exit15)
> -L(More4Case3): /* but less than 8 */
> -       cmp     $6, %r8
> -       jl      L(Exit5)
> -       je      L(Exit6)
> -       jg      L(Exit7)
> -L(Less12Case3): /* but more than 8 */
> -       cmp     $10, %r8
> -       jl      L(Exit9)
> -       je      L(Exit10)
> -       jg      L(Exit11)
> -#  endif
> -
> -       .p2align 4
> -L(Exit1):
> -       movb    (%rcx), %al
> -       movb    %al, (%rdx)
> -#  ifdef USE_AS_STPCPY
> -       lea     (%rdx), %rax
> -#  else
> -       mov     %rdi, %rax
> -#  endif
> -#  ifdef USE_AS_STRNCPY
> -       sub     $1, %r8
> -       lea     1(%rdx), %rcx
> -       jnz     L(StrncpyFillTailWithZero1)
> -#   ifdef USE_AS_STPCPY
> -       cmpb    $1, (%rax)
> -       sbb     $-1, %rax
> -#   endif
> -#  endif
> -       ret
> -
> -       .p2align 4
> -L(Exit2):
> -       movw    (%rcx), %ax
> -       movw    %ax, (%rdx)
> -#  ifdef USE_AS_STPCPY
> -       lea     1(%rdx), %rax
> -#  else
> -       mov     %rdi, %rax
> -#  endif
> -#  ifdef USE_AS_STRNCPY
> -       sub     $2, %r8
> -       lea     2(%rdx), %rcx
> -       jnz     L(StrncpyFillTailWithZero1)
> -#   ifdef USE_AS_STPCPY
> -       cmpb    $1, (%rax)
> -       sbb     $-1, %rax
> -#   endif
> -#  endif
> -       ret
> -
> -       .p2align 4
> -L(Exit3):
> -       movw    (%rcx), %ax
> -       movw    %ax, (%rdx)
> -       movb    2(%rcx), %al
> -       movb    %al, 2(%rdx)
> -#  ifdef USE_AS_STPCPY
> -       lea     2(%rdx), %rax
> -#  else
> -       mov     %rdi, %rax
> -#  endif
> -#  ifdef USE_AS_STRNCPY
> -       sub     $3, %r8
> -       lea     3(%rdx), %rcx
> -       jnz     L(StrncpyFillTailWithZero1)
> -#   ifdef USE_AS_STPCPY
> -       cmpb    $1, (%rax)
> -       sbb     $-1, %rax
> -#   endif
> -#  endif
> -       ret
> -
> -       .p2align 4
> -L(Exit4):
> -       movl    (%rcx), %eax
> -       movl    %eax, (%rdx)
> -#  ifdef USE_AS_STPCPY
> -       lea     3(%rdx), %rax
> -#  else
> -       mov     %rdi, %rax
> -#  endif
> -#  ifdef USE_AS_STRNCPY
> -       sub     $4, %r8
> -       lea     4(%rdx), %rcx
> -       jnz     L(StrncpyFillTailWithZero1)
> -#   ifdef USE_AS_STPCPY
> -       cmpb    $1, (%rax)
> -       sbb     $-1, %rax
> -#   endif
> -#  endif
> -       ret
> -
> -       .p2align 4
> -L(Exit5):
> -       movl    (%rcx), %eax
> -       movl    %eax, (%rdx)
> -       movb    4(%rcx), %al
> -       movb    %al, 4(%rdx)
> -#  ifdef USE_AS_STPCPY
> -       lea     4(%rdx), %rax
> -#  else
> -       mov     %rdi, %rax
> -#  endif
> -#  ifdef USE_AS_STRNCPY
> -       sub     $5, %r8
> -       lea     5(%rdx), %rcx
> -       jnz     L(StrncpyFillTailWithZero1)
> -#   ifdef USE_AS_STPCPY
> -       cmpb    $1, (%rax)
> -       sbb     $-1, %rax
> -#  endif
> -#  endif
> -       ret
> -
> -       .p2align 4
> -L(Exit6):
> -       movl    (%rcx), %eax
> -       movl    %eax, (%rdx)
> -       movw    4(%rcx), %ax
> -       movw    %ax, 4(%rdx)
> -#  ifdef USE_AS_STPCPY
> -       lea     5(%rdx), %rax
> -#  else
> -       mov     %rdi, %rax
> -#  endif
> -#  ifdef USE_AS_STRNCPY
> -       sub     $6, %r8
> -       lea     6(%rdx), %rcx
> -       jnz     L(StrncpyFillTailWithZero1)
> -#   ifdef USE_AS_STPCPY
> -       cmpb    $1, (%rax)
> -       sbb     $-1, %rax
> -#  endif
> -#  endif
> -       ret
> -
> -       .p2align 4
> -L(Exit7):
> -       movl    (%rcx), %eax
> -       movl    %eax, (%rdx)
> -       movl    3(%rcx), %eax
> -       movl    %eax, 3(%rdx)
> -#  ifdef USE_AS_STPCPY
> -       lea     6(%rdx), %rax
> -#  else
> -       mov     %rdi, %rax
> -#  endif
> -#  ifdef USE_AS_STRNCPY
> -       sub     $7, %r8
> -       lea     7(%rdx), %rcx
> -       jnz     L(StrncpyFillTailWithZero1)
> -#   ifdef USE_AS_STPCPY
> -       cmpb    $1, (%rax)
> -       sbb     $-1, %rax
> -#   endif
> -#  endif
> -       ret
> -
> -       .p2align 4
> -L(Exit9):
> -       mov     (%rcx), %rax
> -       mov     %rax, (%rdx)
> -       mov     5(%rcx), %eax
> -       mov     %eax, 5(%rdx)
> -#  ifdef USE_AS_STPCPY
> -       lea     8(%rdx), %rax
> -#  else
> -       mov     %rdi, %rax
> -#  endif
> -#  ifdef USE_AS_STRNCPY
> -       sub     $9, %r8
> -       lea     9(%rdx), %rcx
> -       jnz     L(StrncpyFillTailWithZero1)
> -#   ifdef USE_AS_STPCPY
> -       cmpb    $1, (%rax)
> -       sbb     $-1, %rax
> -#   endif
> -#  endif
> -       ret
> -
> -       .p2align 4
> -L(Exit10):
> -       mov     (%rcx), %rax
> -       mov     %rax, (%rdx)
> -       mov     6(%rcx), %eax
> -       mov     %eax, 6(%rdx)
> -#  ifdef USE_AS_STPCPY
> -       lea     9(%rdx), %rax
> -#  else
> -       mov     %rdi, %rax
> -#  endif
> -#  ifdef USE_AS_STRNCPY
> -       sub     $10, %r8
> -       lea     10(%rdx), %rcx
> -       jnz     L(StrncpyFillTailWithZero1)
> -#   ifdef USE_AS_STPCPY
> -       cmpb    $1, (%rax)
> -       sbb     $-1, %rax
> -#   endif
> -#  endif
> -       ret
> -
> -       .p2align 4
> -L(Exit11):
> -       mov     (%rcx), %rax
> -       mov     %rax, (%rdx)
> -       mov     7(%rcx), %eax
> -       mov     %eax, 7(%rdx)
> -#  ifdef USE_AS_STPCPY
> -       lea     10(%rdx), %rax
> -#  else
> -       mov     %rdi, %rax
> -#  endif
> -#  ifdef USE_AS_STRNCPY
> -       sub     $11, %r8
> -       lea     11(%rdx), %rcx
> -       jnz     L(StrncpyFillTailWithZero1)
> -#   ifdef USE_AS_STPCPY
> -       cmpb    $1, (%rax)
> -       sbb     $-1, %rax
> -#   endif
> -#  endif
> -       ret
> -
> -       .p2align 4
> -L(Exit12):
> -       mov     (%rcx), %rax
> -       mov     %rax, (%rdx)
> -       mov     8(%rcx), %eax
> -       mov     %eax, 8(%rdx)
> -#  ifdef USE_AS_STPCPY
> -       lea     11(%rdx), %rax
> -#  else
> -       mov     %rdi, %rax
> -#  endif
> -#  ifdef USE_AS_STRNCPY
> -       sub     $12, %r8
> -       lea     12(%rdx), %rcx
> -       jnz     L(StrncpyFillTailWithZero1)
> -#   ifdef USE_AS_STPCPY
> -       cmpb    $1, (%rax)
> -       sbb     $-1, %rax
> -#  endif
> -#  endif
> -       ret
> -
> -       .p2align 4
> -L(Exit13):
> -       mov     (%rcx), %rax
> -       mov     %rax, (%rdx)
> -       mov     5(%rcx), %rax
> -       mov     %rax, 5(%rdx)
> -#  ifdef USE_AS_STPCPY
> -       lea     12(%rdx), %rax
> -#  else
> -       mov     %rdi, %rax
> -#  endif
> -#  ifdef USE_AS_STRNCPY
> -       sub     $13, %r8
> -       lea     13(%rdx), %rcx
> -       jnz     L(StrncpyFillTailWithZero1)
> -#   ifdef USE_AS_STPCPY
> -       cmpb    $1, (%rax)
> -       sbb     $-1, %rax
> -#   endif
> -#  endif
> -       ret
> -
> -       .p2align 4
> -L(Exit14):
> -       mov     (%rcx), %rax
> -       mov     %rax, (%rdx)
> -       mov     6(%rcx), %rax
> -       mov     %rax, 6(%rdx)
> -#  ifdef USE_AS_STPCPY
> -       lea     13(%rdx), %rax
> -#  else
> -       mov     %rdi, %rax
> -#  endif
> -#  ifdef USE_AS_STRNCPY
> -       sub     $14, %r8
> -       lea     14(%rdx), %rcx
> -       jnz     L(StrncpyFillTailWithZero1)
> -#   ifdef USE_AS_STPCPY
> -       cmpb    $1, (%rax)
> -       sbb     $-1, %rax
> -#   endif
> -#  endif
> -       ret
> -
> -       .p2align 4
> -L(Exit15):
> -       mov     (%rcx), %rax
> -       mov     %rax, (%rdx)
> -       mov     7(%rcx), %rax
> -       mov     %rax, 7(%rdx)
> -#  ifdef USE_AS_STPCPY
> -       lea     14(%rdx), %rax
> -#  else
> -       mov     %rdi, %rax
> -#  endif
> -#  ifdef USE_AS_STRNCPY
> -       sub     $15, %r8
> -       lea     15(%rdx), %rcx
> -       jnz     L(StrncpyFillTailWithZero1)
> -#   ifdef USE_AS_STPCPY
> -       cmpb    $1, (%rax)
> -       sbb     $-1, %rax
> -#   endif
> -#  endif
> -       ret
> -
> -#  ifdef USE_AS_STRNCPY
> -       .p2align 4
> -L(Fill0):
> -       ret
> -
> -       .p2align 4
> -L(Fill1):
> -       movb    %dl, (%rcx)
> -       ret
> -
> -       .p2align 4
> -L(Fill2):
> -       movw    %dx, (%rcx)
> -       ret
> -
> -       .p2align 4
> -L(Fill3):
> -       movw    %dx, (%rcx)
> -       movb    %dl, 2(%rcx)
> -       ret
> -
> -       .p2align 4
> -L(Fill4):
> -       movl    %edx, (%rcx)
> -       ret
> -
> -       .p2align 4
> -L(Fill5):
> -       movl    %edx, (%rcx)
> -       movb    %dl, 4(%rcx)
> -       ret
> -
> -       .p2align 4
> -L(Fill6):
> -       movl    %edx, (%rcx)
> -       movw    %dx, 4(%rcx)
> -       ret
> -
> -       .p2align 4
> -L(Fill7):
> -       movl    %edx, (%rcx)
> -       movl    %edx, 3(%rcx)
> -       ret
> -
> -       .p2align 4
> -L(Fill8):
> -       mov     %rdx, (%rcx)
> -       ret
> -
> -       .p2align 4
> -L(Fill9):
> -       mov     %rdx, (%rcx)
> -       movb    %dl, 8(%rcx)
> -       ret
> -
> -       .p2align 4
> -L(Fill10):
> -       mov     %rdx, (%rcx)
> -       movw    %dx, 8(%rcx)
> -       ret
> -
> -       .p2align 4
> -L(Fill11):
> -       mov     %rdx, (%rcx)
> -       movl    %edx, 7(%rcx)
> -       ret
> -
> -       .p2align 4
> -L(Fill12):
> -       mov     %rdx, (%rcx)
> -       movl    %edx, 8(%rcx)
> -       ret
> -
> -       .p2align 4
> -L(Fill13):
> -       mov     %rdx, (%rcx)
> -       mov     %rdx, 5(%rcx)
> -       ret
> -
> -       .p2align 4
> -L(Fill14):
> -       mov     %rdx, (%rcx)
> -       mov     %rdx, 6(%rcx)
> -       ret
> -
> -       .p2align 4
> -L(Fill15):
> -       mov     %rdx, (%rcx)
> -       mov     %rdx, 7(%rcx)
> -       ret
> -
> -       .p2align 4
> -L(Fill16):
> -       mov     %rdx, (%rcx)
> -       mov     %rdx, 8(%rcx)
> -       ret
> -
> -       .p2align 4
> -L(StrncpyFillExit1):
> -       lea     16(%r8), %r8
> -L(FillFrom1To16Bytes):
> -       test    %r8, %r8
> -       jz      L(Fill0)
> -       cmp     $16, %r8
> -       je      L(Fill16)
> -       cmp     $8, %r8
> -       je      L(Fill8)
> -       jg      L(FillMore8)
> -       cmp     $4, %r8
> -       je      L(Fill4)
> -       jg      L(FillMore4)
> -       cmp     $2, %r8
> -       jl      L(Fill1)
> -       je      L(Fill2)
> -       jg      L(Fill3)
> -L(FillMore8): /* but less than 16 */
> -       cmp     $12, %r8
> -       je      L(Fill12)
> -       jl      L(FillLess12)
> -       cmp     $14, %r8
> -       jl      L(Fill13)
> -       je      L(Fill14)
> -       jg      L(Fill15)
> -L(FillMore4): /* but less than 8 */
> -       cmp     $6, %r8
> -       jl      L(Fill5)
> -       je      L(Fill6)
> -       jg      L(Fill7)
> -L(FillLess12): /* but more than 8 */
> -       cmp     $10, %r8
> -       jl      L(Fill9)
> -       je      L(Fill10)
> -       jmp     L(Fill11)
> -
> -       .p2align 4
> -L(StrncpyFillTailWithZero1):
> -       xor     %rdx, %rdx
> -       sub     $16, %r8
> -       jbe     L(StrncpyFillExit1)
> -
> -       pxor    %xmm0, %xmm0
> -       mov     %rdx, (%rcx)
> -       mov     %rdx, 8(%rcx)
> -
> -       lea     16(%rcx), %rcx
> -
> -       mov     %rcx, %rdx
> -       and     $0xf, %rdx
> -       sub     %rdx, %rcx
> -       add     %rdx, %r8
> -       xor     %rdx, %rdx
> -       sub     $64, %r8
> -       jb      L(StrncpyFillLess64)
> -
> -L(StrncpyFillLoopMovdqa):
> -       movdqa  %xmm0, (%rcx)
> -       movdqa  %xmm0, 16(%rcx)
> -       movdqa  %xmm0, 32(%rcx)
> -       movdqa  %xmm0, 48(%rcx)
> -       lea     64(%rcx), %rcx
> -       sub     $64, %r8
> -       jae     L(StrncpyFillLoopMovdqa)
> -
> -L(StrncpyFillLess64):
> -       add     $32, %r8
> -       jl      L(StrncpyFillLess32)
> -       movdqa  %xmm0, (%rcx)
> -       movdqa  %xmm0, 16(%rcx)
> -       lea     32(%rcx), %rcx
> -       sub     $16, %r8
> -       jl      L(StrncpyFillExit1)
> -       movdqa  %xmm0, (%rcx)
> -       lea     16(%rcx), %rcx
> -       jmp     L(FillFrom1To16Bytes)
> -
> -L(StrncpyFillLess32):
> -       add     $16, %r8
> -       jl      L(StrncpyFillExit1)
> -       movdqa  %xmm0, (%rcx)
> -       lea     16(%rcx), %rcx
> -       jmp     L(FillFrom1To16Bytes)
> -
> -       .p2align 4
> -L(Exit0):
> -       mov     %rdx, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncpyExit15Bytes):
> -       cmp     $9, %r8
> -       je      L(Exit9)
> -       cmpb    $0, 8(%rcx)
> -       jz      L(Exit9)
> -       cmp     $10, %r8
> -       je      L(Exit10)
> -       cmpb    $0, 9(%rcx)
> -       jz      L(Exit10)
> -       cmp     $11, %r8
> -       je      L(Exit11)
> -       cmpb    $0, 10(%rcx)
> -       jz      L(Exit11)
> -       cmp     $12, %r8
> -       je      L(Exit12)
> -       cmpb    $0, 11(%rcx)
> -       jz      L(Exit12)
> -       cmp     $13, %r8
> -       je      L(Exit13)
> -       cmpb    $0, 12(%rcx)
> -       jz      L(Exit13)
> -       cmp     $14, %r8
> -       je      L(Exit14)
> -       cmpb    $0, 13(%rcx)
> -       jz      L(Exit14)
> -       mov     (%rcx), %rax
> -       mov     %rax, (%rdx)
> -       mov     7(%rcx), %rax
> -       mov     %rax, 7(%rdx)
> -#   ifdef USE_AS_STPCPY
> -       lea     14(%rdx), %rax
> -       cmpb    $1, (%rax)
> -       sbb     $-1, %rax
> -#   else
> -       mov     %rdi, %rax
> -#   endif
> -       ret
> -
> -       .p2align 4
> -L(StrncpyExit8Bytes):
> -       cmp     $1, %r8
> -       je      L(Exit1)
> -       cmpb    $0, (%rcx)
> -       jz      L(Exit1)
> -       cmp     $2, %r8
> -       je      L(Exit2)
> -       cmpb    $0, 1(%rcx)
> -       jz      L(Exit2)
> -       cmp     $3, %r8
> -       je      L(Exit3)
> -       cmpb    $0, 2(%rcx)
> -       jz      L(Exit3)
> -       cmp     $4, %r8
> -       je      L(Exit4)
> -       cmpb    $0, 3(%rcx)
> -       jz      L(Exit4)
> -       cmp     $5, %r8
> -       je      L(Exit5)
> -       cmpb    $0, 4(%rcx)
> -       jz      L(Exit5)
> -       cmp     $6, %r8
> -       je      L(Exit6)
> -       cmpb    $0, 5(%rcx)
> -       jz      L(Exit6)
> -       cmp     $7, %r8
> -       je      L(Exit7)
> -       cmpb    $0, 6(%rcx)
> -       jz      L(Exit7)
> -       mov     (%rcx), %rax
> -       mov     %rax, (%rdx)
> -#   ifdef USE_AS_STPCPY
> -       lea     7(%rdx), %rax
> -       cmpb    $1, (%rax)
> -       sbb     $-1, %rax
> -#   else
> -       mov     %rdi, %rax
> -#   endif
> -       ret
> -
> -#  endif
> -# endif
> -
> -# ifdef USE_AS_STRNCPY
> -       .p2align 4
> -L(StrncpyLeaveCase2OrCase3):
> -       test    %rax, %rax
> -       jnz     L(Aligned64LeaveCase2)
> -
> -L(Aligned64LeaveCase3):
> -       lea     64(%r8), %r8
> -       sub     $16, %r8
> -       jbe     L(CopyFrom1To16BytesCase3)
> -       movaps  %xmm4, -64(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(CopyFrom1To16BytesCase3)
> -       movaps  %xmm5, -48(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(CopyFrom1To16BytesCase3)
> -       movaps  %xmm6, -32(%rdx)
> -       lea     16(%rsi), %rsi
> -       lea     -16(%r8), %r8
> -       jmp     L(CopyFrom1To16BytesCase3)
> -
> -L(Aligned64LeaveCase2):
> -       pcmpeqb %xmm4, %xmm0
> -       pmovmskb %xmm0, %rax
> -       add     $48, %r8
> -       jle     L(CopyFrom1To16BytesCase2OrCase3)
> -       test    %rax, %rax
> -       jnz     L(CopyFrom1To16Bytes)
> -
> -       pcmpeqb %xmm5, %xmm0
> -       pmovmskb %xmm0, %rax
> -       movaps  %xmm4, -64(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(CopyFrom1To16BytesCase2OrCase3)
> -       test    %rax, %rax
> -       jnz     L(CopyFrom1To16Bytes)
> -
> -       pcmpeqb %xmm6, %xmm0
> -       pmovmskb %xmm0, %rax
> -       movaps  %xmm5, -48(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(CopyFrom1To16BytesCase2OrCase3)
> -       test    %rax, %rax
> -       jnz     L(CopyFrom1To16Bytes)
> -
> -       pcmpeqb %xmm7, %xmm0
> -       pmovmskb %xmm0, %rax
> -       movaps  %xmm6, -32(%rdx)
> -       lea     16(%rsi), %rsi
> -       lea     -16(%r8), %r8
> -       jmp     L(CopyFrom1To16BytesCase2)
> -/*--------------------------------------------------*/
> -       .p2align 4
> -L(StrncpyExit1Case2OrCase3):
> -       movdqu  -1(%rcx), %xmm0
> -       movdqu  %xmm0, -1(%rdx)
> -       mov     $15, %rsi
> -       test    %rax, %rax
> -       jnz     L(CopyFrom1To16BytesCase2)
> -       jmp     L(CopyFrom1To16BytesCase3)
> -
> -       .p2align 4
> -L(StrncpyExit2Case2OrCase3):
> -       movdqu  -2(%rcx), %xmm0
> -       movdqu  %xmm0, -2(%rdx)
> -       mov     $14, %rsi
> -       test    %rax, %rax
> -       jnz     L(CopyFrom1To16BytesCase2)
> -       jmp     L(CopyFrom1To16BytesCase3)
> -
> -       .p2align 4
> -L(StrncpyExit3Case2OrCase3):
> -       movdqu  -3(%rcx), %xmm0
> -       movdqu  %xmm0, -3(%rdx)
> -       mov     $13, %rsi
> -       test    %rax, %rax
> -       jnz     L(CopyFrom1To16BytesCase2)
> -       jmp     L(CopyFrom1To16BytesCase3)
> -
> -       .p2align 4
> -L(StrncpyExit4Case2OrCase3):
> -       movdqu  -4(%rcx), %xmm0
> -       movdqu  %xmm0, -4(%rdx)
> -       mov     $12, %rsi
> -       test    %rax, %rax
> -       jnz     L(CopyFrom1To16BytesCase2)
> -       jmp     L(CopyFrom1To16BytesCase3)
> -
> -       .p2align 4
> -L(StrncpyExit5Case2OrCase3):
> -       movdqu  -5(%rcx), %xmm0
> -       movdqu  %xmm0, -5(%rdx)
> -       mov     $11, %rsi
> -       test    %rax, %rax
> -       jnz     L(CopyFrom1To16BytesCase2)
> -       jmp     L(CopyFrom1To16BytesCase3)
> -
> -       .p2align 4
> -L(StrncpyExit6Case2OrCase3):
> -       mov     (%rcx), %rsi
> -       mov     6(%rcx), %r9d
> -       mov     %r9d, 6(%rdx)
> -       mov     %rsi, (%rdx)
> -       test    %rax, %rax
> -       mov     $10, %rsi
> -       jnz     L(CopyFrom1To16BytesCase2)
> -       jmp     L(CopyFrom1To16BytesCase3)
> -
> -       .p2align 4
> -L(StrncpyExit7Case2OrCase3):
> -       mov     (%rcx), %rsi
> -       mov     5(%rcx), %r9d
> -       mov     %r9d, 5(%rdx)
> -       mov     %rsi, (%rdx)
> -       test    %rax, %rax
> -       mov     $9, %rsi
> -       jnz     L(CopyFrom1To16BytesCase2)
> -       jmp     L(CopyFrom1To16BytesCase3)
> -
> -       .p2align 4
> -L(StrncpyExit8Case2OrCase3):
> -       mov     (%rcx), %r9
> -       mov     $8, %rsi
> -       mov     %r9, (%rdx)
> -       test    %rax, %rax
> -       jnz     L(CopyFrom1To16BytesCase2)
> -       jmp     L(CopyFrom1To16BytesCase3)
> -
> -       .p2align 4
> -L(StrncpyExit9Case2OrCase3):
> -       mov     -1(%rcx), %r9
> -       mov     $7, %rsi
> -       mov     %r9, -1(%rdx)
> -       test    %rax, %rax
> -       jnz     L(CopyFrom1To16BytesCase2)
> -       jmp     L(CopyFrom1To16BytesCase3)
> -
> -       .p2align 4
> -L(StrncpyExit10Case2OrCase3):
> -       mov     -2(%rcx), %r9
> -       mov     $6, %rsi
> -       mov     %r9, -2(%rdx)
> -       test    %rax, %rax
> -       jnz     L(CopyFrom1To16BytesCase2)
> -       jmp     L(CopyFrom1To16BytesCase3)
> -
> -       .p2align 4
> -L(StrncpyExit11Case2OrCase3):
> -       mov     -3(%rcx), %r9
> -       mov     $5, %rsi
> -       mov     %r9, -3(%rdx)
> -       test    %rax, %rax
> -       jnz     L(CopyFrom1To16BytesCase2)
> -       jmp     L(CopyFrom1To16BytesCase3)
> -
> -       .p2align 4
> -L(StrncpyExit12Case2OrCase3):
> -       mov     (%rcx), %r9d
> -       mov     $4, %rsi
> -       mov     %r9d, (%rdx)
> -       test    %rax, %rax
> -       jnz     L(CopyFrom1To16BytesCase2)
> -       jmp     L(CopyFrom1To16BytesCase3)
> -
> -       .p2align 4
> -L(StrncpyExit13Case2OrCase3):
> -       mov     -1(%rcx), %r9d
> -       mov     $3, %rsi
> -       mov     %r9d, -1(%rdx)
> -       test    %rax, %rax
> -       jnz     L(CopyFrom1To16BytesCase2)
> -       jmp     L(CopyFrom1To16BytesCase3)
> -
> -       .p2align 4
> -L(StrncpyExit14Case2OrCase3):
> -       mov     -2(%rcx), %r9d
> -       mov     $2, %rsi
> -       mov     %r9d, -2(%rdx)
> -       test    %rax, %rax
> -       jnz     L(CopyFrom1To16BytesCase2)
> -       jmp     L(CopyFrom1To16BytesCase3)
> -
> -       .p2align 4
> -L(StrncpyExit15Case2OrCase3):
> -       mov     -3(%rcx), %r9d
> -       mov     $1, %rsi
> -       mov     %r9d, -3(%rdx)
> -       test    %rax, %rax
> -       jnz     L(CopyFrom1To16BytesCase2)
> -       jmp     L(CopyFrom1To16BytesCase3)
> -
> -       .p2align 4
> -L(StrncpyLeave1):
> -       movaps  %xmm2, %xmm3
> -       add     $48, %r8
> -       jle     L(StrncpyExit1)
> -       palignr $1, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  31(%rcx), %xmm2
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit1)
> -       palignr $1, %xmm3, %xmm2
> -       movaps  %xmm2, 16(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit1)
> -       movaps  %xmm4, 32(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit1)
> -       movaps  %xmm5, 48(%rdx)
> -       lea     16(%rsi), %rsi
> -       lea     -16(%r8), %r8
> -
> -L(StrncpyExit1):
> -       lea     15(%rdx, %rsi), %rdx
> -       lea     15(%rcx, %rsi), %rcx
> -       mov     -15(%rcx), %rsi
> -       mov     -8(%rcx), %rax
> -       mov     %rsi, -15(%rdx)
> -       mov     %rax, -8(%rdx)
> -       xor     %rsi, %rsi
> -       jmp     L(CopyFrom1To16BytesCase3)
> -
> -       .p2align 4
> -L(StrncpyLeave2):
> -       movaps  %xmm2, %xmm3
> -       add     $48, %r8
> -       jle     L(StrncpyExit2)
> -       palignr $2, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  30(%rcx), %xmm2
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit2)
> -       palignr $2, %xmm3, %xmm2
> -       movaps  %xmm2, 16(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit2)
> -       movaps  %xmm4, 32(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit2)
> -       movaps  %xmm5, 48(%rdx)
> -       lea     16(%rsi), %rsi
> -       lea     -16(%r8), %r8
> -
> -L(StrncpyExit2):
> -       lea     14(%rdx, %rsi), %rdx
> -       lea     14(%rcx, %rsi), %rcx
> -       mov     -14(%rcx), %rsi
> -       mov     -8(%rcx), %rax
> -       mov     %rsi, -14(%rdx)
> -       mov     %rax, -8(%rdx)
> -       xor     %rsi, %rsi
> -       jmp     L(CopyFrom1To16BytesCase3)
> -
> -       .p2align 4
> -L(StrncpyLeave3):
> -       movaps  %xmm2, %xmm3
> -       add     $48, %r8
> -       jle     L(StrncpyExit3)
> -       palignr $3, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  29(%rcx), %xmm2
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit3)
> -       palignr $3, %xmm3, %xmm2
> -       movaps  %xmm2, 16(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit3)
> -       movaps  %xmm4, 32(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit3)
> -       movaps  %xmm5, 48(%rdx)
> -       lea     16(%rsi), %rsi
> -       lea     -16(%r8), %r8
> -
> -L(StrncpyExit3):
> -       lea     13(%rdx, %rsi), %rdx
> -       lea     13(%rcx, %rsi), %rcx
> -       mov     -13(%rcx), %rsi
> -       mov     -8(%rcx), %rax
> -       mov     %rsi, -13(%rdx)
> -       mov     %rax, -8(%rdx)
> -       xor     %rsi, %rsi
> -       jmp     L(CopyFrom1To16BytesCase3)
> -
> -       .p2align 4
> -L(StrncpyLeave4):
> -       movaps  %xmm2, %xmm3
> -       add     $48, %r8
> -       jle     L(StrncpyExit4)
> -       palignr $4, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  28(%rcx), %xmm2
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit4)
> -       palignr $4, %xmm3, %xmm2
> -       movaps  %xmm2, 16(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit4)
> -       movaps  %xmm4, 32(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit4)
> -       movaps  %xmm5, 48(%rdx)
> -       lea     16(%rsi), %rsi
> -       lea     -16(%r8), %r8
> -
> -L(StrncpyExit4):
> -       lea     12(%rdx, %rsi), %rdx
> -       lea     12(%rcx, %rsi), %rcx
> -       mov     -12(%rcx), %rsi
> -       mov     -4(%rcx), %eax
> -       mov     %rsi, -12(%rdx)
> -       mov     %eax, -4(%rdx)
> -       xor     %rsi, %rsi
> -       jmp     L(CopyFrom1To16BytesCase3)
> -
> -       .p2align 4
> -L(StrncpyLeave5):
> -       movaps  %xmm2, %xmm3
> -       add     $48, %r8
> -       jle     L(StrncpyExit5)
> -       palignr $5, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  27(%rcx), %xmm2
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit5)
> -       palignr $5, %xmm3, %xmm2
> -       movaps  %xmm2, 16(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit5)
> -       movaps  %xmm4, 32(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit5)
> -       movaps  %xmm5, 48(%rdx)
> -       lea     16(%rsi), %rsi
> -       lea     -16(%r8), %r8
> -
> -L(StrncpyExit5):
> -       lea     11(%rdx, %rsi), %rdx
> -       lea     11(%rcx, %rsi), %rcx
> -       mov     -11(%rcx), %rsi
> -       mov     -4(%rcx), %eax
> -       mov     %rsi, -11(%rdx)
> -       mov     %eax, -4(%rdx)
> -       xor     %rsi, %rsi
> -       jmp     L(CopyFrom1To16BytesCase3)
> -
> -       .p2align 4
> -L(StrncpyLeave6):
> -       movaps  %xmm2, %xmm3
> -       add     $48, %r8
> -       jle     L(StrncpyExit6)
> -       palignr $6, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  26(%rcx), %xmm2
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit6)
> -       palignr $6, %xmm3, %xmm2
> -       movaps  %xmm2, 16(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit6)
> -       movaps  %xmm4, 32(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit6)
> -       movaps  %xmm5, 48(%rdx)
> -       lea     16(%rsi), %rsi
> -       lea     -16(%r8), %r8
> -
> -L(StrncpyExit6):
> -       lea     10(%rdx, %rsi), %rdx
> -       lea     10(%rcx, %rsi), %rcx
> -       mov     -10(%rcx), %rsi
> -       movw    -2(%rcx), %ax
> -       mov     %rsi, -10(%rdx)
> -       movw    %ax, -2(%rdx)
> -       xor     %rsi, %rsi
> -       jmp     L(CopyFrom1To16BytesCase3)
> -
> -       .p2align 4
> -L(StrncpyLeave7):
> -       movaps  %xmm2, %xmm3
> -       add     $48, %r8
> -       jle     L(StrncpyExit7)
> -       palignr $7, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  25(%rcx), %xmm2
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit7)
> -       palignr $7, %xmm3, %xmm2
> -       movaps  %xmm2, 16(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit7)
> -       movaps  %xmm4, 32(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit7)
> -       movaps  %xmm5, 48(%rdx)
> -       lea     16(%rsi), %rsi
> -       lea     -16(%r8), %r8
> -
> -L(StrncpyExit7):
> -       lea     9(%rdx, %rsi), %rdx
> -       lea     9(%rcx, %rsi), %rcx
> -       mov     -9(%rcx), %rsi
> -       movb    -1(%rcx), %ah
> -       mov     %rsi, -9(%rdx)
> -       movb    %ah, -1(%rdx)
> -       xor     %rsi, %rsi
> -       jmp     L(CopyFrom1To16BytesCase3)
> -
> -       .p2align 4
> -L(StrncpyLeave8):
> -       movaps  %xmm2, %xmm3
> -       add     $48, %r8
> -       jle     L(StrncpyExit8)
> -       palignr $8, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  24(%rcx), %xmm2
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit8)
> -       palignr $8, %xmm3, %xmm2
> -       movaps  %xmm2, 16(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit8)
> -       movaps  %xmm4, 32(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit8)
> -       movaps  %xmm5, 48(%rdx)
> -       lea     16(%rsi), %rsi
> -       lea     -16(%r8), %r8
> -
> -L(StrncpyExit8):
> -       lea     8(%rdx, %rsi), %rdx
> -       lea     8(%rcx, %rsi), %rcx
> -       mov     -8(%rcx), %rax
> -       xor     %rsi, %rsi
> -       mov     %rax, -8(%rdx)
> -       jmp     L(CopyFrom1To16BytesCase3)
> -
> -       .p2align 4
> -L(StrncpyLeave9):
> -       movaps  %xmm2, %xmm3
> -       add     $48, %r8
> -       jle     L(StrncpyExit9)
> -       palignr $9, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  23(%rcx), %xmm2
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit9)
> -       palignr $9, %xmm3, %xmm2
> -       movaps  %xmm2, 16(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit9)
> -       movaps  %xmm4, 32(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit9)
> -       movaps  %xmm5, 48(%rdx)
> -       lea     16(%rsi), %rsi
> -       lea     -16(%r8), %r8
> -
> -L(StrncpyExit9):
> -       lea     7(%rdx, %rsi), %rdx
> -       lea     7(%rcx, %rsi), %rcx
> -       mov     -8(%rcx), %rax
> -       xor     %rsi, %rsi
> -       mov     %rax, -8(%rdx)
> -       jmp     L(CopyFrom1To16BytesCase3)
> -
> -       .p2align 4
> -L(StrncpyLeave10):
> -       movaps  %xmm2, %xmm3
> -       add     $48, %r8
> -       jle     L(StrncpyExit10)
> -       palignr $10, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  22(%rcx), %xmm2
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit10)
> -       palignr $10, %xmm3, %xmm2
> -       movaps  %xmm2, 16(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit10)
> -       movaps  %xmm4, 32(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit10)
> -       movaps  %xmm5, 48(%rdx)
> -       lea     16(%rsi), %rsi
> -       lea     -16(%r8), %r8
> -
> -L(StrncpyExit10):
> -       lea     6(%rdx, %rsi), %rdx
> -       lea     6(%rcx, %rsi), %rcx
> -       mov     -8(%rcx), %rax
> -       xor     %rsi, %rsi
> -       mov     %rax, -8(%rdx)
> -       jmp     L(CopyFrom1To16BytesCase3)
> -
> -       .p2align 4
> -L(StrncpyLeave11):
> -       movaps  %xmm2, %xmm3
> -       add     $48, %r8
> -       jle     L(StrncpyExit11)
> -       palignr $11, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  21(%rcx), %xmm2
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit11)
> -       palignr $11, %xmm3, %xmm2
> -       movaps  %xmm2, 16(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit11)
> -       movaps  %xmm4, 32(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit11)
> -       movaps  %xmm5, 48(%rdx)
> -       lea     16(%rsi), %rsi
> -       lea     -16(%r8), %r8
> -
> -L(StrncpyExit11):
> -       lea     5(%rdx, %rsi), %rdx
> -       lea     5(%rcx, %rsi), %rcx
> -       mov     -8(%rcx), %rax
> -       xor     %rsi, %rsi
> -       mov     %rax, -8(%rdx)
> -       jmp     L(CopyFrom1To16BytesCase3)
> -
> -       .p2align 4
> -L(StrncpyLeave12):
> -       movaps  %xmm2, %xmm3
> -       add     $48, %r8
> -       jle     L(StrncpyExit12)
> -       palignr $12, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  20(%rcx), %xmm2
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit12)
> -       palignr $12, %xmm3, %xmm2
> -       movaps  %xmm2, 16(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit12)
> -       movaps  %xmm4, 32(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit12)
> -       movaps  %xmm5, 48(%rdx)
> -       lea     16(%rsi), %rsi
> -       lea     -16(%r8), %r8
> -
> -L(StrncpyExit12):
> -       lea     4(%rdx, %rsi), %rdx
> -       lea     4(%rcx, %rsi), %rcx
> -       mov     -4(%rcx), %eax
> -       xor     %rsi, %rsi
> -       mov     %eax, -4(%rdx)
> -       jmp     L(CopyFrom1To16BytesCase3)
> -
> -       .p2align 4
> -L(StrncpyLeave13):
> -       movaps  %xmm2, %xmm3
> -       add     $48, %r8
> -       jle     L(StrncpyExit13)
> -       palignr $13, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  19(%rcx), %xmm2
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit13)
> -       palignr $13, %xmm3, %xmm2
> -       movaps  %xmm2, 16(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit13)
> -       movaps  %xmm4, 32(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit13)
> -       movaps  %xmm5, 48(%rdx)
> -       lea     16(%rsi), %rsi
> -       lea     -16(%r8), %r8
> -
> -L(StrncpyExit13):
> -       lea     3(%rdx, %rsi), %rdx
> -       lea     3(%rcx, %rsi), %rcx
> -       mov     -4(%rcx), %eax
> -       xor     %rsi, %rsi
> -       mov     %eax, -4(%rdx)
> -       jmp     L(CopyFrom1To16BytesCase3)
> -
> -       .p2align 4
> -L(StrncpyLeave14):
> -       movaps  %xmm2, %xmm3
> -       add     $48, %r8
> -       jle     L(StrncpyExit14)
> -       palignr $14, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  18(%rcx), %xmm2
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit14)
> -       palignr $14, %xmm3, %xmm2
> -       movaps  %xmm2, 16(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit14)
> -       movaps  %xmm4, 32(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit14)
> -       movaps  %xmm5, 48(%rdx)
> -       lea     16(%rsi), %rsi
> -       lea     -16(%r8), %r8
> -
> -L(StrncpyExit14):
> -       lea     2(%rdx, %rsi), %rdx
> -       lea     2(%rcx, %rsi), %rcx
> -       movw    -2(%rcx), %ax
> -       xor     %rsi, %rsi
> -       movw    %ax, -2(%rdx)
> -       jmp     L(CopyFrom1To16BytesCase3)
> -
> -       .p2align 4
> -L(StrncpyLeave15):
> -       movaps  %xmm2, %xmm3
> -       add     $48, %r8
> -       jle     L(StrncpyExit15)
> -       palignr $15, %xmm1, %xmm2
> -       movaps  %xmm2, (%rdx)
> -       movaps  17(%rcx), %xmm2
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit15)
> -       palignr $15, %xmm3, %xmm2
> -       movaps  %xmm2, 16(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit15)
> -       movaps  %xmm4, 32(%rdx)
> -       lea     16(%rsi), %rsi
> -       sub     $16, %r8
> -       jbe     L(StrncpyExit15)
> -       movaps  %xmm5, 48(%rdx)
> -       lea     16(%rsi), %rsi
> -       lea     -16(%r8), %r8
> -
> -L(StrncpyExit15):
> -       lea     1(%rdx, %rsi), %rdx
> -       lea     1(%rcx, %rsi), %rcx
> -       movb    -1(%rcx), %ah
> -       xor     %rsi, %rsi
> -       movb    %ah, -1(%rdx)
> -       jmp     L(CopyFrom1To16BytesCase3)
> -
> -# endif
> -# ifndef USE_AS_STRCAT
> -END (STRCPY)
> -# endif
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/sysdeps/x86_64/multiarch/strncpy-ssse3.S
> deleted file mode 100644
> index bf82ee447d..0000000000
> --- a/sysdeps/x86_64/multiarch/strncpy-ssse3.S
> +++ /dev/null
> @@ -1,3 +0,0 @@
> -#define USE_AS_STRNCPY
> -#define STRCPY __strncpy_ssse3
> -#include "strcpy-ssse3.S"
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.
diff mbox series

Patch

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index a2ebc06c5f..292353bad7 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -42,13 +42,11 @@  sysdep_routines += \
   stpcpy-evex \
   stpcpy-sse2 \
   stpcpy-sse2-unaligned \
-  stpcpy-ssse3 \
   stpncpy-avx2 \
   stpncpy-avx2-rtm \
   stpncpy-c \
   stpncpy-evex \
   stpncpy-sse2-unaligned \
-  stpncpy-ssse3 \
   strcasecmp_l-avx2 \
   strcasecmp_l-avx2-rtm \
   strcasecmp_l-evex \
@@ -79,7 +77,6 @@  sysdep_routines += \
   strcpy-evex \
   strcpy-sse2 \
   strcpy-sse2-unaligned \
-  strcpy-ssse3 \
   strcspn-c \
   strcspn-sse2 \
   strlen-avx2 \
@@ -106,7 +103,6 @@  sysdep_routines += \
   strncpy-c \
   strncpy-evex \
   strncpy-sse2-unaligned \
-  strncpy-ssse3 \
   strnlen-avx2 \
   strnlen-avx2-rtm \
   strnlen-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 4133ed7e43..505b8002e0 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -389,8 +389,6 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/stpncpy.c.  */
   IFUNC_IMPL (i, name, stpncpy,
-	      IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (SSSE3),
-			      __stpncpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2),
 			      __stpncpy_avx2)
 	      IFUNC_IMPL_ADD (array, i, stpncpy,
@@ -407,8 +405,6 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/stpcpy.c.  */
   IFUNC_IMPL (i, name, stpcpy,
-	      IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (SSSE3),
-			      __stpcpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2),
 			      __stpcpy_avx2)
 	      IFUNC_IMPL_ADD (array, i, stpcpy,
@@ -557,8 +553,6 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      (CPU_FEATURE_USABLE (AVX512VL)
 			       && CPU_FEATURE_USABLE (AVX512BW)),
 			      __strcpy_evex)
-	      IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3),
-			      __strcpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2))
 
@@ -634,8 +628,6 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      (CPU_FEATURE_USABLE (AVX512VL)
 			       && CPU_FEATURE_USABLE (AVX512BW)),
 			      __strncpy_evex)
-	      IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3),
-			      __strncpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncpy, 1,
 			      __strncpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2))
diff --git a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
deleted file mode 100644
index d971c2da38..0000000000
--- a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
+++ /dev/null
@@ -1,3 +0,0 @@ 
-#define USE_AS_STPCPY
-#define STRCPY __stpcpy_ssse3
-#include "strcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
deleted file mode 100644
index 14ed16f6b5..0000000000
--- a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
+++ /dev/null
@@ -1,4 +0,0 @@ 
-#define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY __stpncpy_ssse3
-#include "strcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
deleted file mode 100644
index f617a535cf..0000000000
--- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
+++ /dev/null
@@ -1,3550 +0,0 @@ 
-/* strcpy with SSSE3
-   Copyright (C) 2011-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#if IS_IN (libc)
-
-# ifndef USE_AS_STRCAT
-#  include <sysdep.h>
-
-#  ifndef STRCPY
-#   define STRCPY  __strcpy_ssse3
-#  endif
-
-	.section .text.ssse3,"ax",@progbits
-ENTRY (STRCPY)
-
-	mov	%rsi, %rcx
-#  ifdef USE_AS_STRNCPY
-	mov	%RDX_LP, %R8_LP
-#  endif
-	mov	%rdi, %rdx
-#  ifdef USE_AS_STRNCPY
-	test	%R8_LP, %R8_LP
-	jz	L(Exit0)
-	cmp	$8, %R8_LP
-	jbe	L(StrncpyExit8Bytes)
-# endif
-	cmpb	$0, (%rcx)
-	jz	L(Exit1)
-	cmpb	$0, 1(%rcx)
-	jz	L(Exit2)
-	cmpb	$0, 2(%rcx)
-	jz	L(Exit3)
-	cmpb	$0, 3(%rcx)
-	jz	L(Exit4)
-	cmpb	$0, 4(%rcx)
-	jz	L(Exit5)
-	cmpb	$0, 5(%rcx)
-	jz	L(Exit6)
-	cmpb	$0, 6(%rcx)
-	jz	L(Exit7)
-	cmpb	$0, 7(%rcx)
-	jz	L(Exit8)
-# ifdef USE_AS_STRNCPY
-	cmp	$16, %r8
-	jb	L(StrncpyExit15Bytes)
-# endif
-	cmpb	$0, 8(%rcx)
-	jz	L(Exit9)
-	cmpb	$0, 9(%rcx)
-	jz	L(Exit10)
-	cmpb	$0, 10(%rcx)
-	jz	L(Exit11)
-	cmpb	$0, 11(%rcx)
-	jz	L(Exit12)
-	cmpb	$0, 12(%rcx)
-	jz	L(Exit13)
-	cmpb	$0, 13(%rcx)
-	jz	L(Exit14)
-	cmpb	$0, 14(%rcx)
-	jz	L(Exit15)
-# ifdef USE_AS_STRNCPY
-	cmp	$16, %r8
-	je	L(Exit16)
-# endif
-	cmpb	$0, 15(%rcx)
-	jz	L(Exit16)
-# endif
-
-# ifdef USE_AS_STRNCPY
-	mov	%rcx, %rsi
-	sub	$16, %r8
-	and	$0xf, %rsi
-
-/* add 16 bytes rcx_offset to r8 */
-
-	add	%rsi, %r8
-# endif
-	lea	16(%rcx), %rsi
-	and	$-16, %rsi
-	pxor	%xmm0, %xmm0
-	mov	(%rcx), %r9
-	mov	%r9, (%rdx)
-	pcmpeqb	(%rsi), %xmm0
-	mov	8(%rcx), %r9
-	mov	%r9, 8(%rdx)
-
-/* convert byte mask in xmm0 to bit mask */
-
-	pmovmskb %xmm0, %rax
-	sub	%rcx, %rsi
-
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(CopyFrom1To16BytesCase2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16Bytes)
-
-	mov	%rdx, %rax
-	lea	16(%rdx), %rdx
-	and	$-16, %rdx
-	sub	%rdx, %rax
-
-# ifdef USE_AS_STRNCPY
-	add	%rax, %rsi
-	lea	-1(%rsi), %rsi
-	and	$1<<31, %esi
-	test	%rsi, %rsi
-	jnz	L(ContinueCopy)
-	lea	16(%r8), %r8
-
-L(ContinueCopy):
-# endif
-	sub	%rax, %rcx
-	mov	%rcx, %rax
-	and	$0xf, %rax
-	mov	$0, %rsi
-
-/* case: rcx_offset == rdx_offset */
-
-	jz	L(Align16Both)
-
-	cmp	$8, %rax
-	jae	L(ShlHigh8)
-	cmp	$1, %rax
-	je	L(Shl1)
-	cmp	$2, %rax
-	je	L(Shl2)
-	cmp	$3, %rax
-	je	L(Shl3)
-	cmp	$4, %rax
-	je	L(Shl4)
-	cmp	$5, %rax
-	je	L(Shl5)
-	cmp	$6, %rax
-	je	L(Shl6)
-	jmp	L(Shl7)
-
-L(ShlHigh8):
-	je	L(Shl8)
-	cmp	$9, %rax
-	je	L(Shl9)
-	cmp	$10, %rax
-	je	L(Shl10)
-	cmp	$11, %rax
-	je	L(Shl11)
-	cmp	$12, %rax
-	je	L(Shl12)
-	cmp	$13, %rax
-	je	L(Shl13)
-	cmp	$14, %rax
-	je	L(Shl14)
-	jmp	L(Shl15)
-
-L(Align16Both):
-	movaps	(%rcx), %xmm1
-	movaps	16(%rcx), %xmm2
-	movaps	%xmm1, (%rdx)
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax
-	lea	16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(CopyFrom1To16BytesCase2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16Bytes)
-
-	movaps	16(%rcx, %rsi), %xmm3
-	movaps	%xmm2, (%rdx, %rsi)
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %rax
-	lea	16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(CopyFrom1To16BytesCase2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16Bytes)
-
-	movaps	16(%rcx, %rsi), %xmm4
-	movaps	%xmm3, (%rdx, %rsi)
-	pcmpeqb	%xmm4, %xmm0
-	pmovmskb %xmm0, %rax
-	lea	16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(CopyFrom1To16BytesCase2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16Bytes)
-
-	movaps	16(%rcx, %rsi), %xmm1
-	movaps	%xmm4, (%rdx, %rsi)
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm0, %rax
-	lea	16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(CopyFrom1To16BytesCase2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16Bytes)
-
-	movaps	16(%rcx, %rsi), %xmm2
-	movaps	%xmm1, (%rdx, %rsi)
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax
-	lea	16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(CopyFrom1To16BytesCase2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16Bytes)
-
-	movaps	16(%rcx, %rsi), %xmm3
-	movaps	%xmm2, (%rdx, %rsi)
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %rax
-	lea	16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(CopyFrom1To16BytesCase2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16Bytes)
-
-	movaps	%xmm3, (%rdx, %rsi)
-	mov	%rcx, %rax
-	lea	16(%rcx, %rsi), %rcx
-	and	$-0x40, %rcx
-	sub	%rcx, %rax
-	sub	%rax, %rdx
-# ifdef USE_AS_STRNCPY
-	lea	112(%r8, %rax), %r8
-# endif
-	mov	$-0x40, %rsi
-
-	.p2align 4
-L(Aligned64Loop):
-	movaps	(%rcx), %xmm2
-	movaps	%xmm2, %xmm4
-	movaps	16(%rcx), %xmm5
-	movaps	32(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	48(%rcx), %xmm7
-	pminub	%xmm5, %xmm2
-	pminub	%xmm7, %xmm3
-	pminub	%xmm2, %xmm3
-	pcmpeqb	%xmm0, %xmm3
-	pmovmskb %xmm3, %rax
-	lea	64(%rdx), %rdx
-	lea	64(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeaveCase2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Aligned64Leave)
-	movaps	%xmm4, -64(%rdx)
-	movaps	%xmm5, -48(%rdx)
-	movaps	%xmm6, -32(%rdx)
-	movaps	%xmm7, -16(%rdx)
-	jmp	L(Aligned64Loop)
-
-L(Aligned64Leave):
-# ifdef USE_AS_STRNCPY
-	lea	48(%r8), %r8
-# endif
-	pcmpeqb	%xmm4, %xmm0
-	pmovmskb %xmm0, %rax
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16Bytes)
-
-	pcmpeqb	%xmm5, %xmm0
-# ifdef USE_AS_STRNCPY
-	lea	-16(%r8), %r8
-# endif
-	pmovmskb %xmm0, %rax
-	movaps	%xmm4, -64(%rdx)
-	test	%rax, %rax
-	lea	16(%rsi), %rsi
-	jnz	L(CopyFrom1To16Bytes)
-
-	pcmpeqb	%xmm6, %xmm0
-# ifdef USE_AS_STRNCPY
-	lea	-16(%r8), %r8
-# endif
-	pmovmskb %xmm0, %rax
-	movaps	%xmm5, -48(%rdx)
-	test	%rax, %rax
-	lea	16(%rsi), %rsi
-	jnz	L(CopyFrom1To16Bytes)
-
-	movaps	%xmm6, -32(%rdx)
-	pcmpeqb	%xmm7, %xmm0
-# ifdef USE_AS_STRNCPY
-	lea	-16(%r8), %r8
-# endif
-	pmovmskb %xmm0, %rax
-	lea	16(%rsi), %rsi
-	jmp	L(CopyFrom1To16Bytes)
-
-	.p2align 4
-L(Shl1):
-	movaps	-1(%rcx), %xmm1
-	movaps	15(%rcx), %xmm2
-L(Shl1Start):
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit1Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl1LoopExit)
-
-	palignr	$1, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	31(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit1Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl1LoopExit)
-
-	palignr	$1, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	31(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit1Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl1LoopExit)
-
-	palignr	$1, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	31(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit1Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl1LoopExit)
-
-	palignr	$1, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	lea	31(%rcx), %rcx
-	lea	16(%rdx), %rdx
-
-	mov	%rcx, %rax
-	and	$-0x40, %rcx
-	sub	%rcx, %rax
-	lea	-15(%rcx), %rcx
-	sub	%rax, %rdx
-# ifdef USE_AS_STRNCPY
-	add	%rax, %r8
-# endif
-	movaps	-1(%rcx), %xmm1
-
-/* 64 bytes loop */
-	.p2align 4
-L(Shl1LoopStart):
-	movaps	15(%rcx), %xmm2
-	movaps	31(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	47(%rcx), %xmm4
-	movaps	%xmm4, %xmm7
-	movaps	63(%rcx), %xmm5
-	pminub	%xmm2, %xmm6
-	pminub	%xmm5, %xmm7
-	pminub	%xmm6, %xmm7
-	pcmpeqb	%xmm0, %xmm7
-	pmovmskb %xmm7, %rax
-	movaps	%xmm5, %xmm7
-	palignr	$1, %xmm4, %xmm5
-	test	%rax, %rax
-	palignr	$1, %xmm3, %xmm4
-	jnz	L(Shl1Start)
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeave1)
-# endif
-	palignr	$1, %xmm2, %xmm3
-	lea	64(%rcx), %rcx
-	palignr	$1, %xmm1, %xmm2
-	movaps	%xmm7, %xmm1
-	movaps	%xmm5, 48(%rdx)
-	movaps	%xmm4, 32(%rdx)
-	movaps	%xmm3, 16(%rdx)
-	movaps	%xmm2, (%rdx)
-	lea	64(%rdx), %rdx
-	jmp	L(Shl1LoopStart)
-
-L(Shl1LoopExit):
-	movdqu	-1(%rcx), %xmm1
-	mov	$15, %rsi
-	movdqu	%xmm1, -1(%rdx)
-	jmp	L(CopyFrom1To16Bytes)
-
-	.p2align 4
-L(Shl2):
-	movaps	-2(%rcx), %xmm1
-	movaps	14(%rcx), %xmm2
-L(Shl2Start):
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit2Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl2LoopExit)
-
-	palignr	$2, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	30(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit2Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl2LoopExit)
-
-	palignr	$2, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	30(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit2Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl2LoopExit)
-
-	palignr	$2, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	30(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit2Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl2LoopExit)
-
-	palignr	$2, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	lea	30(%rcx), %rcx
-	lea	16(%rdx), %rdx
-
-	mov	%rcx, %rax
-	and	$-0x40, %rcx
-	sub	%rcx, %rax
-	lea	-14(%rcx), %rcx
-	sub	%rax, %rdx
-# ifdef USE_AS_STRNCPY
-	add	%rax, %r8
-# endif
-	movaps	-2(%rcx), %xmm1
-
-/* 64 bytes loop */
-	.p2align 4
-L(Shl2LoopStart):
-	movaps	14(%rcx), %xmm2
-	movaps	30(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	46(%rcx), %xmm4
-	movaps	%xmm4, %xmm7
-	movaps	62(%rcx), %xmm5
-	pminub	%xmm2, %xmm6
-	pminub	%xmm5, %xmm7
-	pminub	%xmm6, %xmm7
-	pcmpeqb	%xmm0, %xmm7
-	pmovmskb %xmm7, %rax
-	movaps	%xmm5, %xmm7
-	palignr	$2, %xmm4, %xmm5
-	test	%rax, %rax
-	palignr	$2, %xmm3, %xmm4
-	jnz	L(Shl2Start)
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeave2)
-# endif
-	palignr	$2, %xmm2, %xmm3
-	lea	64(%rcx), %rcx
-	palignr	$2, %xmm1, %xmm2
-	movaps	%xmm7, %xmm1
-	movaps	%xmm5, 48(%rdx)
-	movaps	%xmm4, 32(%rdx)
-	movaps	%xmm3, 16(%rdx)
-	movaps	%xmm2, (%rdx)
-	lea	64(%rdx), %rdx
-	jmp	L(Shl2LoopStart)
-
-L(Shl2LoopExit):
-	movdqu	-2(%rcx), %xmm1
-	mov	$14, %rsi
-	movdqu	%xmm1, -2(%rdx)
-	jmp	L(CopyFrom1To16Bytes)
-
-	.p2align 4
-L(Shl3):
-	movaps	-3(%rcx), %xmm1
-	movaps	13(%rcx), %xmm2
-L(Shl3Start):
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit3Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl3LoopExit)
-
-	palignr	$3, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	29(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit3Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl3LoopExit)
-
-	palignr	$3, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	29(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit3Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl3LoopExit)
-
-	palignr	$3, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	29(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit3Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl3LoopExit)
-
-	palignr	$3, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	lea	29(%rcx), %rcx
-	lea	16(%rdx), %rdx
-
-	mov	%rcx, %rax
-	and	$-0x40, %rcx
-	sub	%rcx, %rax
-	lea	-13(%rcx), %rcx
-	sub	%rax, %rdx
-# ifdef USE_AS_STRNCPY
-	add	%rax, %r8
-# endif
-	movaps	-3(%rcx), %xmm1
-
-/* 64 bytes loop */
-	.p2align 4
-L(Shl3LoopStart):
-	movaps	13(%rcx), %xmm2
-	movaps	29(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	45(%rcx), %xmm4
-	movaps	%xmm4, %xmm7
-	movaps	61(%rcx), %xmm5
-	pminub	%xmm2, %xmm6
-	pminub	%xmm5, %xmm7
-	pminub	%xmm6, %xmm7
-	pcmpeqb	%xmm0, %xmm7
-	pmovmskb %xmm7, %rax
-	movaps	%xmm5, %xmm7
-	palignr	$3, %xmm4, %xmm5
-	test	%rax, %rax
-	palignr	$3, %xmm3, %xmm4
-	jnz	L(Shl3Start)
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeave3)
-# endif
-	palignr	$3, %xmm2, %xmm3
-	lea	64(%rcx), %rcx
-	palignr	$3, %xmm1, %xmm2
-	movaps	%xmm7, %xmm1
-	movaps	%xmm5, 48(%rdx)
-	movaps	%xmm4, 32(%rdx)
-	movaps	%xmm3, 16(%rdx)
-	movaps	%xmm2, (%rdx)
-	lea	64(%rdx), %rdx
-	jmp	L(Shl3LoopStart)
-
-L(Shl3LoopExit):
-	movdqu	-3(%rcx), %xmm1
-	mov	$13, %rsi
-	movdqu	%xmm1, -3(%rdx)
-	jmp	L(CopyFrom1To16Bytes)
-
-	.p2align 4
-L(Shl4):
-	movaps	-4(%rcx), %xmm1
-	movaps	12(%rcx), %xmm2
-L(Shl4Start):
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit4Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl4LoopExit)
-
-	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	28(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit4Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl4LoopExit)
-
-	palignr	$4, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	28(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit4Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl4LoopExit)
-
-	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	28(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit4Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl4LoopExit)
-
-	palignr	$4, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	lea	28(%rcx), %rcx
-	lea	16(%rdx), %rdx
-
-	mov	%rcx, %rax
-	and	$-0x40, %rcx
-	sub	%rcx, %rax
-	lea	-12(%rcx), %rcx
-	sub	%rax, %rdx
-# ifdef USE_AS_STRNCPY
-	add	%rax, %r8
-# endif
-	movaps	-4(%rcx), %xmm1
-
-/* 64 bytes loop */
-	.p2align 4
-L(Shl4LoopStart):
-	movaps	12(%rcx), %xmm2
-	movaps	28(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	44(%rcx), %xmm4
-	movaps	%xmm4, %xmm7
-	movaps	60(%rcx), %xmm5
-	pminub	%xmm2, %xmm6
-	pminub	%xmm5, %xmm7
-	pminub	%xmm6, %xmm7
-	pcmpeqb	%xmm0, %xmm7
-	pmovmskb %xmm7, %rax
-	movaps	%xmm5, %xmm7
-	palignr	$4, %xmm4, %xmm5
-	test	%rax, %rax
-	palignr	$4, %xmm3, %xmm4
-	jnz	L(Shl4Start)
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeave4)
-# endif
-	palignr	$4, %xmm2, %xmm3
-	lea	64(%rcx), %rcx
-	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm7, %xmm1
-	movaps	%xmm5, 48(%rdx)
-	movaps	%xmm4, 32(%rdx)
-	movaps	%xmm3, 16(%rdx)
-	movaps	%xmm2, (%rdx)
-	lea	64(%rdx), %rdx
-	jmp	L(Shl4LoopStart)
-
-L(Shl4LoopExit):
-	movdqu	-4(%rcx), %xmm1
-	mov	$12, %rsi
-	movdqu	%xmm1, -4(%rdx)
-	jmp	L(CopyFrom1To16Bytes)
-
-	.p2align 4
-L(Shl5):
-	movaps	-5(%rcx), %xmm1
-	movaps	11(%rcx), %xmm2
-L(Shl5Start):
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit5Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl5LoopExit)
-
-	palignr	$5, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	27(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit5Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl5LoopExit)
-
-	palignr	$5, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	27(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit5Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl5LoopExit)
-
-	palignr	$5, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	27(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit5Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl5LoopExit)
-
-	palignr	$5, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	lea	27(%rcx), %rcx
-	lea	16(%rdx), %rdx
-
-	mov	%rcx, %rax
-	and	$-0x40, %rcx
-	sub	%rcx, %rax
-	lea	-11(%rcx), %rcx
-	sub	%rax, %rdx
-# ifdef USE_AS_STRNCPY
-	add	%rax, %r8
-# endif
-	movaps	-5(%rcx), %xmm1
-
-/* 64 bytes loop */
-	.p2align 4
-L(Shl5LoopStart):
-	movaps	11(%rcx), %xmm2
-	movaps	27(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	43(%rcx), %xmm4
-	movaps	%xmm4, %xmm7
-	movaps	59(%rcx), %xmm5
-	pminub	%xmm2, %xmm6
-	pminub	%xmm5, %xmm7
-	pminub	%xmm6, %xmm7
-	pcmpeqb	%xmm0, %xmm7
-	pmovmskb %xmm7, %rax
-	movaps	%xmm5, %xmm7
-	palignr	$5, %xmm4, %xmm5
-	test	%rax, %rax
-	palignr	$5, %xmm3, %xmm4
-	jnz	L(Shl5Start)
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeave5)
-# endif
-	palignr	$5, %xmm2, %xmm3
-	lea	64(%rcx), %rcx
-	palignr	$5, %xmm1, %xmm2
-	movaps	%xmm7, %xmm1
-	movaps	%xmm5, 48(%rdx)
-	movaps	%xmm4, 32(%rdx)
-	movaps	%xmm3, 16(%rdx)
-	movaps	%xmm2, (%rdx)
-	lea	64(%rdx), %rdx
-	jmp	L(Shl5LoopStart)
-
-L(Shl5LoopExit):
-	movdqu	-5(%rcx), %xmm1
-	mov	$11, %rsi
-	movdqu	%xmm1, -5(%rdx)
-	jmp	L(CopyFrom1To16Bytes)
-
-	.p2align 4
-L(Shl6):
-	movaps	-6(%rcx), %xmm1
-	movaps	10(%rcx), %xmm2
-L(Shl6Start):
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit6Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl6LoopExit)
-
-	palignr	$6, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	26(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit6Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl6LoopExit)
-
-	palignr	$6, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	26(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit6Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl6LoopExit)
-
-	palignr	$6, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	26(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit6Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl6LoopExit)
-
-	palignr	$6, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	lea	26(%rcx), %rcx
-	lea	16(%rdx), %rdx
-
-	mov	%rcx, %rax
-	and	$-0x40, %rcx
-	sub	%rcx, %rax
-	lea	-10(%rcx), %rcx
-	sub	%rax, %rdx
-# ifdef USE_AS_STRNCPY
-	add	%rax, %r8
-# endif
-	movaps	-6(%rcx), %xmm1
-
-/* 64 bytes loop */
-	.p2align 4
-L(Shl6LoopStart):
-	movaps	10(%rcx), %xmm2
-	movaps	26(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	42(%rcx), %xmm4
-	movaps	%xmm4, %xmm7
-	movaps	58(%rcx), %xmm5
-	pminub	%xmm2, %xmm6
-	pminub	%xmm5, %xmm7
-	pminub	%xmm6, %xmm7
-	pcmpeqb	%xmm0, %xmm7
-	pmovmskb %xmm7, %rax
-	movaps	%xmm5, %xmm7
-	palignr	$6, %xmm4, %xmm5
-	test	%rax, %rax
-	palignr	$6, %xmm3, %xmm4
-	jnz	L(Shl6Start)
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeave6)
-# endif
-	palignr	$6, %xmm2, %xmm3
-	lea	64(%rcx), %rcx
-	palignr	$6, %xmm1, %xmm2
-	movaps	%xmm7, %xmm1
-	movaps	%xmm5, 48(%rdx)
-	movaps	%xmm4, 32(%rdx)
-	movaps	%xmm3, 16(%rdx)
-	movaps	%xmm2, (%rdx)
-	lea	64(%rdx), %rdx
-	jmp	L(Shl6LoopStart)
-
-L(Shl6LoopExit):
-	mov	(%rcx), %r9
-	mov	6(%rcx), %esi
-	mov	%r9, (%rdx)
-	mov	%esi, 6(%rdx)
-	mov	$10, %rsi
-	jmp	L(CopyFrom1To16Bytes)
-
-	.p2align 4
-L(Shl7):	/* Copy path for a source pointer %rcx that is 7 bytes past a
-	   16-byte boundary: all movaps loads use offsets that are -7 mod 16
-	   and all stores are movaps to %rdx, so those addresses must be
-	   16-byte aligned.  Each 16-byte output block is assembled with
-	   palignr $7 from two neighbouring aligned source blocks.
-	   NOTE(review): %xmm0 is presumably all-zero on entry (set outside
-	   this chunk), which makes the pmovmskb result a NUL-byte mask.  */
-	movaps	-7(%rcx), %xmm1	/* aligned block that contains the byte at (%rcx) */
-	movaps	9(%rcx), %xmm2	/* the aligned block after it */
-L(Shl7Start):	/* also the target of the jnz in the 64-byte loop below */
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax	/* one mask bit per byte of %xmm2 that matched */
-	movaps	%xmm2, %xmm3	/* keep the unshifted block for the step after next */
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit7Case2OrCase3)	/* taken when %r8 was <= 16 */
-# endif
-	test	%rax, %rax
-	jnz	L(Shl7LoopExit)	/* a byte of %xmm2 matched */
-
-	palignr	$7, %xmm1, %xmm2	/* %xmm2 = the 16 source bytes starting at (%rcx) */
-	movaps	%xmm2, (%rdx)
-	movaps	25(%rcx), %xmm2	/* next aligned source block */
-
-	pcmpeqb	%xmm2, %xmm0	/* %xmm0 is all-zero here: the previous mask was 0 */
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm1	/* %xmm1 and %xmm3 alternate as the "previous block" */
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit7Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl7LoopExit)
-
-	palignr	$7, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	25(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit7Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl7LoopExit)
-
-	palignr	$7, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	25(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit7Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl7LoopExit)
-
-	palignr	$7, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	lea	25(%rcx), %rcx	/* %rcx = address just past the last aligned block loaded */
-	lea	16(%rdx), %rdx
-
-	mov	%rcx, %rax
-	and	$-0x40, %rcx	/* round that address down to a 64-byte boundary */
-	sub	%rcx, %rax	/* %rax = distance stepped back (0, 16, 32 or 48) */
-	lea	-9(%rcx), %rcx	/* bias so that 9(%rcx) is the 64-byte-aligned block */
-	sub	%rax, %rdx	/* step the destination back by the same distance */
-# ifdef USE_AS_STRNCPY
-	add	%rax, %r8	/* and add that distance back to %r8 */
-# endif
-	movaps	-7(%rcx), %xmm1	/* aligned block preceding the first loop block */
-
-/* 64 bytes loop: test four aligned source blocks at once, then store four
-   palignr-merged blocks.  */
-	.p2align 4
-L(Shl7LoopStart):
-	movaps	9(%rcx), %xmm2
-	movaps	25(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	41(%rcx), %xmm4
-	movaps	%xmm4, %xmm7
-	movaps	57(%rcx), %xmm5
-	pminub	%xmm2, %xmm6
-	pminub	%xmm5, %xmm7
-	pminub	%xmm6, %xmm7	/* %xmm7 = bytewise unsigned minimum of the four blocks */
-	pcmpeqb	%xmm0, %xmm7	/* compare the minimum with %xmm0; %xmm0 is not written */
-	pmovmskb %xmm7, %rax
-	movaps	%xmm5, %xmm7	/* unshifted last block; becomes %xmm1 for the next round */
-	palignr	$7, %xmm4, %xmm5
-	test	%rax, %rax
-	palignr	$7, %xmm3, %xmm4	/* SSE op leaves EFLAGS alone, jnz still sees the test */
-	jnz	L(Shl7Start)	/* redo these 64 bytes in 16-byte steps; %xmm1/%xmm2 are set up */
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeave7)	/* taken when %r8 was <= 64 */
-# endif
-	palignr	$7, %xmm2, %xmm3
-	lea	64(%rcx), %rcx
-	palignr	$7, %xmm1, %xmm2
-	movaps	%xmm7, %xmm1
-	movaps	%xmm5, 48(%rdx)
-	movaps	%xmm4, 32(%rdx)
-	movaps	%xmm3, 16(%rdx)
-	movaps	%xmm2, (%rdx)
-	lea	64(%rdx), %rdx
-	jmp	L(Shl7LoopStart)
-
-L(Shl7LoopExit):	/* Store the 9 source bytes that lie before the block held in
-	   %xmm2, then hand over to L(CopyFrom1To16Bytes) with the mask in %rax.  */
-	mov	(%rcx), %r9
-	mov	5(%rcx), %esi	/* overlapping 4-byte move covers bytes 5..8 */
-	mov	%r9, (%rdx)
-	mov	%esi, 5(%rdx)
-	mov	$9, %rsi	/* amount L(CopyFrom1To16Bytes) adds to %rdx and %rcx */
-	jmp	L(CopyFrom1To16Bytes)
-
-	.p2align 4
-L(Shl8):	/* Copy path for a source pointer %rcx that is 8 bytes past a
-	   16-byte boundary: all movaps loads use offsets that are -8 mod 16
-	   and all stores are movaps to %rdx, so those addresses must be
-	   16-byte aligned.  Each 16-byte output block is assembled with
-	   palignr $8 from two neighbouring aligned source blocks.
-	   NOTE(review): %xmm0 is presumably all-zero on entry (set outside
-	   this chunk), which makes the pmovmskb result a NUL-byte mask.  */
-	movaps	-8(%rcx), %xmm1	/* aligned block that contains the byte at (%rcx) */
-	movaps	8(%rcx), %xmm2	/* the aligned block after it */
-L(Shl8Start):	/* also the target of the jnz in the 64-byte loop below */
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax	/* one mask bit per byte of %xmm2 that matched */
-	movaps	%xmm2, %xmm3	/* keep the unshifted block for the step after next */
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit8Case2OrCase3)	/* taken when %r8 was <= 16 */
-# endif
-	test	%rax, %rax
-	jnz	L(Shl8LoopExit)	/* a byte of %xmm2 matched */
-
-	palignr	$8, %xmm1, %xmm2	/* %xmm2 = the 16 source bytes starting at (%rcx) */
-	movaps	%xmm2, (%rdx)
-	movaps	24(%rcx), %xmm2	/* next aligned source block */
-
-	pcmpeqb	%xmm2, %xmm0	/* %xmm0 is all-zero here: the previous mask was 0 */
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm1	/* %xmm1 and %xmm3 alternate as the "previous block" */
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit8Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl8LoopExit)
-
-	palignr	$8, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	24(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit8Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl8LoopExit)
-
-	palignr	$8, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	24(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit8Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl8LoopExit)
-
-	palignr	$8, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	lea	24(%rcx), %rcx	/* %rcx = address just past the last aligned block loaded */
-	lea	16(%rdx), %rdx
-
-	mov	%rcx, %rax
-	and	$-0x40, %rcx	/* round that address down to a 64-byte boundary */
-	sub	%rcx, %rax	/* %rax = distance stepped back (0, 16, 32 or 48) */
-	lea	-8(%rcx), %rcx	/* bias so that 8(%rcx) is the 64-byte-aligned block */
-	sub	%rax, %rdx	/* step the destination back by the same distance */
-# ifdef USE_AS_STRNCPY
-	add	%rax, %r8	/* and add that distance back to %r8 */
-# endif
-	movaps	-8(%rcx), %xmm1	/* aligned block preceding the first loop block */
-
-/* 64 bytes loop: test four aligned source blocks at once, then store four
-   palignr-merged blocks.  */
-	.p2align 4
-L(Shl8LoopStart):
-	movaps	8(%rcx), %xmm2
-	movaps	24(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	40(%rcx), %xmm4
-	movaps	%xmm4, %xmm7
-	movaps	56(%rcx), %xmm5
-	pminub	%xmm2, %xmm6
-	pminub	%xmm5, %xmm7
-	pminub	%xmm6, %xmm7	/* %xmm7 = bytewise unsigned minimum of the four blocks */
-	pcmpeqb	%xmm0, %xmm7	/* compare the minimum with %xmm0; %xmm0 is not written */
-	pmovmskb %xmm7, %rax
-	movaps	%xmm5, %xmm7	/* unshifted last block; becomes %xmm1 for the next round */
-	palignr	$8, %xmm4, %xmm5
-	test	%rax, %rax
-	palignr	$8, %xmm3, %xmm4	/* SSE op leaves EFLAGS alone, jnz still sees the test */
-	jnz	L(Shl8Start)	/* redo these 64 bytes in 16-byte steps; %xmm1/%xmm2 are set up */
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeave8)	/* taken when %r8 was <= 64 */
-# endif
-	palignr	$8, %xmm2, %xmm3
-	lea	64(%rcx), %rcx
-	palignr	$8, %xmm1, %xmm2
-	movaps	%xmm7, %xmm1
-	movaps	%xmm5, 48(%rdx)
-	movaps	%xmm4, 32(%rdx)
-	movaps	%xmm3, 16(%rdx)
-	movaps	%xmm2, (%rdx)
-	lea	64(%rdx), %rdx
-	jmp	L(Shl8LoopStart)
-
-L(Shl8LoopExit):	/* Store the 8 source bytes that lie before the block held in
-	   %xmm2, then hand over to L(CopyFrom1To16Bytes) with the mask in %rax.  */
-	mov	(%rcx), %r9
-	mov	$8, %rsi	/* amount L(CopyFrom1To16Bytes) adds to %rdx and %rcx */
-	mov	%r9, (%rdx)
-	jmp	L(CopyFrom1To16Bytes)
-
-	.p2align 4
-L(Shl9):	/* Copy path for a source pointer %rcx that is 9 bytes past a
-	   16-byte boundary: all movaps loads use offsets that are -9 mod 16
-	   and all stores are movaps to %rdx, so those addresses must be
-	   16-byte aligned.  Each 16-byte output block is assembled with
-	   palignr $9 from two neighbouring aligned source blocks.
-	   NOTE(review): %xmm0 is presumably all-zero on entry (set outside
-	   this chunk), which makes the pmovmskb result a NUL-byte mask.  */
-	movaps	-9(%rcx), %xmm1	/* aligned block that contains the byte at (%rcx) */
-	movaps	7(%rcx), %xmm2	/* the aligned block after it */
-L(Shl9Start):	/* also the target of the jnz in the 64-byte loop below */
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax	/* one mask bit per byte of %xmm2 that matched */
-	movaps	%xmm2, %xmm3	/* keep the unshifted block for the step after next */
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit9Case2OrCase3)	/* taken when %r8 was <= 16 */
-# endif
-	test	%rax, %rax
-	jnz	L(Shl9LoopExit)	/* a byte of %xmm2 matched */
-
-	palignr	$9, %xmm1, %xmm2	/* %xmm2 = the 16 source bytes starting at (%rcx) */
-	movaps	%xmm2, (%rdx)
-	movaps	23(%rcx), %xmm2	/* next aligned source block */
-
-	pcmpeqb	%xmm2, %xmm0	/* %xmm0 is all-zero here: the previous mask was 0 */
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm1	/* %xmm1 and %xmm3 alternate as the "previous block" */
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit9Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl9LoopExit)
-
-	palignr	$9, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	23(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit9Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl9LoopExit)
-
-	palignr	$9, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	23(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit9Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl9LoopExit)
-
-	palignr	$9, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	lea	23(%rcx), %rcx	/* %rcx = address just past the last aligned block loaded */
-	lea	16(%rdx), %rdx
-
-	mov	%rcx, %rax
-	and	$-0x40, %rcx	/* round that address down to a 64-byte boundary */
-	sub	%rcx, %rax	/* %rax = distance stepped back (0, 16, 32 or 48) */
-	lea	-7(%rcx), %rcx	/* bias so that 7(%rcx) is the 64-byte-aligned block */
-	sub	%rax, %rdx	/* step the destination back by the same distance */
-# ifdef USE_AS_STRNCPY
-	add	%rax, %r8	/* and add that distance back to %r8 */
-# endif
-	movaps	-9(%rcx), %xmm1	/* aligned block preceding the first loop block */
-
-/* 64 bytes loop: test four aligned source blocks at once, then store four
-   palignr-merged blocks.  */
-	.p2align 4
-L(Shl9LoopStart):
-	movaps	7(%rcx), %xmm2
-	movaps	23(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	39(%rcx), %xmm4
-	movaps	%xmm4, %xmm7
-	movaps	55(%rcx), %xmm5
-	pminub	%xmm2, %xmm6
-	pminub	%xmm5, %xmm7
-	pminub	%xmm6, %xmm7	/* %xmm7 = bytewise unsigned minimum of the four blocks */
-	pcmpeqb	%xmm0, %xmm7	/* compare the minimum with %xmm0; %xmm0 is not written */
-	pmovmskb %xmm7, %rax
-	movaps	%xmm5, %xmm7	/* unshifted last block; becomes %xmm1 for the next round */
-	palignr	$9, %xmm4, %xmm5
-	test	%rax, %rax
-	palignr	$9, %xmm3, %xmm4	/* SSE op leaves EFLAGS alone, jnz still sees the test */
-	jnz	L(Shl9Start)	/* redo these 64 bytes in 16-byte steps; %xmm1/%xmm2 are set up */
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeave9)	/* taken when %r8 was <= 64 */
-# endif
-	palignr	$9, %xmm2, %xmm3
-	lea	64(%rcx), %rcx
-	palignr	$9, %xmm1, %xmm2
-	movaps	%xmm7, %xmm1
-	movaps	%xmm5, 48(%rdx)
-	movaps	%xmm4, 32(%rdx)
-	movaps	%xmm3, 16(%rdx)
-	movaps	%xmm2, (%rdx)
-	lea	64(%rdx), %rdx
-	jmp	L(Shl9LoopStart)
-
-L(Shl9LoopExit):	/* Store the 7 source bytes that lie before the block held in
-	   %xmm2, then hand over to L(CopyFrom1To16Bytes) with the mask in %rax.  */
-	mov	-1(%rcx), %r9	/* one 8-byte move starting 1 byte early covers bytes 0..6 */
-	mov	$7, %rsi	/* amount L(CopyFrom1To16Bytes) adds to %rdx and %rcx */
-	mov	%r9, -1(%rdx)
-	jmp	L(CopyFrom1To16Bytes)
-
-	.p2align 4
-L(Shl10):	/* Copy path for a source pointer %rcx that is 10 bytes past a
-	   16-byte boundary: all movaps loads use offsets that are -10 mod 16
-	   and all stores are movaps to %rdx, so those addresses must be
-	   16-byte aligned.  Each 16-byte output block is assembled with
-	   palignr $10 from two neighbouring aligned source blocks.
-	   NOTE(review): %xmm0 is presumably all-zero on entry (set outside
-	   this chunk), which makes the pmovmskb result a NUL-byte mask.  */
-	movaps	-10(%rcx), %xmm1	/* aligned block that contains the byte at (%rcx) */
-	movaps	6(%rcx), %xmm2	/* the aligned block after it */
-L(Shl10Start):	/* also the target of the jnz in the 64-byte loop below */
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax	/* one mask bit per byte of %xmm2 that matched */
-	movaps	%xmm2, %xmm3	/* keep the unshifted block for the step after next */
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit10Case2OrCase3)	/* taken when %r8 was <= 16 */
-# endif
-	test	%rax, %rax
-	jnz	L(Shl10LoopExit)	/* a byte of %xmm2 matched */
-
-	palignr	$10, %xmm1, %xmm2	/* %xmm2 = the 16 source bytes starting at (%rcx) */
-	movaps	%xmm2, (%rdx)
-	movaps	22(%rcx), %xmm2	/* next aligned source block */
-
-	pcmpeqb	%xmm2, %xmm0	/* %xmm0 is all-zero here: the previous mask was 0 */
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm1	/* %xmm1 and %xmm3 alternate as the "previous block" */
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit10Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl10LoopExit)
-
-	palignr	$10, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	22(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit10Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl10LoopExit)
-
-	palignr	$10, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	22(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit10Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl10LoopExit)
-
-	palignr	$10, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	lea	22(%rcx), %rcx	/* %rcx = address just past the last aligned block loaded */
-	lea	16(%rdx), %rdx
-
-	mov	%rcx, %rax
-	and	$-0x40, %rcx	/* round that address down to a 64-byte boundary */
-	sub	%rcx, %rax	/* %rax = distance stepped back (0, 16, 32 or 48) */
-	lea	-6(%rcx), %rcx	/* bias so that 6(%rcx) is the 64-byte-aligned block */
-	sub	%rax, %rdx	/* step the destination back by the same distance */
-# ifdef USE_AS_STRNCPY
-	add	%rax, %r8	/* and add that distance back to %r8 */
-# endif
-	movaps	-10(%rcx), %xmm1	/* aligned block preceding the first loop block */
-
-/* 64 bytes loop: test four aligned source blocks at once, then store four
-   palignr-merged blocks.  */
-	.p2align 4
-L(Shl10LoopStart):
-	movaps	6(%rcx), %xmm2
-	movaps	22(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	38(%rcx), %xmm4
-	movaps	%xmm4, %xmm7
-	movaps	54(%rcx), %xmm5
-	pminub	%xmm2, %xmm6
-	pminub	%xmm5, %xmm7
-	pminub	%xmm6, %xmm7	/* %xmm7 = bytewise unsigned minimum of the four blocks */
-	pcmpeqb	%xmm0, %xmm7	/* compare the minimum with %xmm0; %xmm0 is not written */
-	pmovmskb %xmm7, %rax
-	movaps	%xmm5, %xmm7	/* unshifted last block; becomes %xmm1 for the next round */
-	palignr	$10, %xmm4, %xmm5
-	test	%rax, %rax
-	palignr	$10, %xmm3, %xmm4	/* SSE op leaves EFLAGS alone, jnz still sees the test */
-	jnz	L(Shl10Start)	/* redo these 64 bytes in 16-byte steps; %xmm1/%xmm2 are set up */
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeave10)	/* taken when %r8 was <= 64 */
-# endif
-	palignr	$10, %xmm2, %xmm3
-	lea	64(%rcx), %rcx
-	palignr	$10, %xmm1, %xmm2
-	movaps	%xmm7, %xmm1
-	movaps	%xmm5, 48(%rdx)
-	movaps	%xmm4, 32(%rdx)
-	movaps	%xmm3, 16(%rdx)
-	movaps	%xmm2, (%rdx)
-	lea	64(%rdx), %rdx
-	jmp	L(Shl10LoopStart)
-
-L(Shl10LoopExit):	/* Store the 6 source bytes that lie before the block held in
-	   %xmm2, then hand over to L(CopyFrom1To16Bytes) with the mask in %rax.  */
-	mov	-2(%rcx), %r9	/* one 8-byte move starting 2 bytes early covers bytes 0..5 */
-	mov	$6, %rsi	/* amount L(CopyFrom1To16Bytes) adds to %rdx and %rcx */
-	mov	%r9, -2(%rdx)
-	jmp	L(CopyFrom1To16Bytes)
-
-	.p2align 4
-L(Shl11):	/* Copy path for a source pointer %rcx that is 11 bytes past a
-	   16-byte boundary: all movaps loads use offsets that are -11 mod 16
-	   and all stores are movaps to %rdx, so those addresses must be
-	   16-byte aligned.  Each 16-byte output block is assembled with
-	   palignr $11 from two neighbouring aligned source blocks.
-	   NOTE(review): %xmm0 is presumably all-zero on entry (set outside
-	   this chunk), which makes the pmovmskb result a NUL-byte mask.  */
-	movaps	-11(%rcx), %xmm1	/* aligned block that contains the byte at (%rcx) */
-	movaps	5(%rcx), %xmm2	/* the aligned block after it */
-L(Shl11Start):	/* also the target of the jnz in the 64-byte loop below */
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax	/* one mask bit per byte of %xmm2 that matched */
-	movaps	%xmm2, %xmm3	/* keep the unshifted block for the step after next */
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit11Case2OrCase3)	/* taken when %r8 was <= 16 */
-# endif
-	test	%rax, %rax
-	jnz	L(Shl11LoopExit)	/* a byte of %xmm2 matched */
-
-	palignr	$11, %xmm1, %xmm2	/* %xmm2 = the 16 source bytes starting at (%rcx) */
-	movaps	%xmm2, (%rdx)
-	movaps	21(%rcx), %xmm2	/* next aligned source block */
-
-	pcmpeqb	%xmm2, %xmm0	/* %xmm0 is all-zero here: the previous mask was 0 */
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm1	/* %xmm1 and %xmm3 alternate as the "previous block" */
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit11Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl11LoopExit)
-
-	palignr	$11, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	21(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit11Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl11LoopExit)
-
-	palignr	$11, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	21(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit11Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl11LoopExit)
-
-	palignr	$11, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	lea	21(%rcx), %rcx	/* %rcx = address just past the last aligned block loaded */
-	lea	16(%rdx), %rdx
-
-	mov	%rcx, %rax
-	and	$-0x40, %rcx	/* round that address down to a 64-byte boundary */
-	sub	%rcx, %rax	/* %rax = distance stepped back (0, 16, 32 or 48) */
-	lea	-5(%rcx), %rcx	/* bias so that 5(%rcx) is the 64-byte-aligned block */
-	sub	%rax, %rdx	/* step the destination back by the same distance */
-# ifdef USE_AS_STRNCPY
-	add	%rax, %r8	/* and add that distance back to %r8 */
-# endif
-	movaps	-11(%rcx), %xmm1	/* aligned block preceding the first loop block */
-
-/* 64 bytes loop: test four aligned source blocks at once, then store four
-   palignr-merged blocks.  */
-	.p2align 4
-L(Shl11LoopStart):
-	movaps	5(%rcx), %xmm2
-	movaps	21(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	37(%rcx), %xmm4
-	movaps	%xmm4, %xmm7
-	movaps	53(%rcx), %xmm5
-	pminub	%xmm2, %xmm6
-	pminub	%xmm5, %xmm7
-	pminub	%xmm6, %xmm7	/* %xmm7 = bytewise unsigned minimum of the four blocks */
-	pcmpeqb	%xmm0, %xmm7	/* compare the minimum with %xmm0; %xmm0 is not written */
-	pmovmskb %xmm7, %rax
-	movaps	%xmm5, %xmm7	/* unshifted last block; becomes %xmm1 for the next round */
-	palignr	$11, %xmm4, %xmm5
-	test	%rax, %rax
-	palignr	$11, %xmm3, %xmm4	/* SSE op leaves EFLAGS alone, jnz still sees the test */
-	jnz	L(Shl11Start)	/* redo these 64 bytes in 16-byte steps; %xmm1/%xmm2 are set up */
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeave11)	/* taken when %r8 was <= 64 */
-# endif
-	palignr	$11, %xmm2, %xmm3
-	lea	64(%rcx), %rcx
-	palignr	$11, %xmm1, %xmm2
-	movaps	%xmm7, %xmm1
-	movaps	%xmm5, 48(%rdx)
-	movaps	%xmm4, 32(%rdx)
-	movaps	%xmm3, 16(%rdx)
-	movaps	%xmm2, (%rdx)
-	lea	64(%rdx), %rdx
-	jmp	L(Shl11LoopStart)
-
-L(Shl11LoopExit):	/* Store the 5 source bytes that lie before the block held in
-	   %xmm2, then hand over to L(CopyFrom1To16Bytes) with the mask in %rax.  */
-	mov	-3(%rcx), %r9	/* one 8-byte move starting 3 bytes early covers bytes 0..4 */
-	mov	$5, %rsi	/* amount L(CopyFrom1To16Bytes) adds to %rdx and %rcx */
-	mov	%r9, -3(%rdx)
-	jmp	L(CopyFrom1To16Bytes)
-
-	.p2align 4
-L(Shl12):	/* Copy path for a source pointer %rcx that is 12 bytes past a
-	   16-byte boundary: all movaps loads use offsets that are -12 mod 16
-	   and all stores are movaps to %rdx, so those addresses must be
-	   16-byte aligned.  Each 16-byte output block is assembled with
-	   palignr $12 from two neighbouring aligned source blocks.
-	   NOTE(review): %xmm0 is presumably all-zero on entry (set outside
-	   this chunk), which makes the pmovmskb result a NUL-byte mask.  */
-	movaps	-12(%rcx), %xmm1	/* aligned block that contains the byte at (%rcx) */
-	movaps	4(%rcx), %xmm2	/* the aligned block after it */
-L(Shl12Start):	/* also the target of the jnz in the 64-byte loop below */
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax	/* one mask bit per byte of %xmm2 that matched */
-	movaps	%xmm2, %xmm3	/* keep the unshifted block for the step after next */
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit12Case2OrCase3)	/* taken when %r8 was <= 16 */
-# endif
-	test	%rax, %rax
-	jnz	L(Shl12LoopExit)	/* a byte of %xmm2 matched */
-
-	palignr	$12, %xmm1, %xmm2	/* %xmm2 = the 16 source bytes starting at (%rcx) */
-	movaps	%xmm2, (%rdx)
-	movaps	20(%rcx), %xmm2	/* next aligned source block */
-
-	pcmpeqb	%xmm2, %xmm0	/* %xmm0 is all-zero here: the previous mask was 0 */
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm1	/* %xmm1 and %xmm3 alternate as the "previous block" */
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit12Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl12LoopExit)
-
-	palignr	$12, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	20(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit12Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl12LoopExit)
-
-	palignr	$12, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	20(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit12Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl12LoopExit)
-
-	palignr	$12, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	lea	20(%rcx), %rcx	/* %rcx = address just past the last aligned block loaded */
-	lea	16(%rdx), %rdx
-
-	mov	%rcx, %rax
-	and	$-0x40, %rcx	/* round that address down to a 64-byte boundary */
-	sub	%rcx, %rax	/* %rax = distance stepped back (0, 16, 32 or 48) */
-	lea	-4(%rcx), %rcx	/* bias so that 4(%rcx) is the 64-byte-aligned block */
-	sub	%rax, %rdx	/* step the destination back by the same distance */
-# ifdef USE_AS_STRNCPY
-	add	%rax, %r8	/* and add that distance back to %r8 */
-# endif
-	movaps	-12(%rcx), %xmm1	/* aligned block preceding the first loop block */
-
-/* 64 bytes loop: test four aligned source blocks at once, then store four
-   palignr-merged blocks.  */
-	.p2align 4
-L(Shl12LoopStart):
-	movaps	4(%rcx), %xmm2
-	movaps	20(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	36(%rcx), %xmm4
-	movaps	%xmm4, %xmm7
-	movaps	52(%rcx), %xmm5
-	pminub	%xmm2, %xmm6
-	pminub	%xmm5, %xmm7
-	pminub	%xmm6, %xmm7	/* %xmm7 = bytewise unsigned minimum of the four blocks */
-	pcmpeqb	%xmm0, %xmm7	/* compare the minimum with %xmm0; %xmm0 is not written */
-	pmovmskb %xmm7, %rax
-	movaps	%xmm5, %xmm7	/* unshifted last block; becomes %xmm1 for the next round */
-	palignr	$12, %xmm4, %xmm5
-	test	%rax, %rax
-	palignr	$12, %xmm3, %xmm4	/* SSE op leaves EFLAGS alone, jnz still sees the test */
-	jnz	L(Shl12Start)	/* redo these 64 bytes in 16-byte steps; %xmm1/%xmm2 are set up */
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeave12)	/* taken when %r8 was <= 64 */
-# endif
-	palignr	$12, %xmm2, %xmm3
-	lea	64(%rcx), %rcx
-	palignr	$12, %xmm1, %xmm2
-	movaps	%xmm7, %xmm1
-	movaps	%xmm5, 48(%rdx)
-	movaps	%xmm4, 32(%rdx)
-	movaps	%xmm3, 16(%rdx)
-	movaps	%xmm2, (%rdx)
-	lea	64(%rdx), %rdx
-	jmp	L(Shl12LoopStart)
-
-L(Shl12LoopExit):	/* Store the 4 source bytes that lie before the block held in
-	   %xmm2, then hand over to L(CopyFrom1To16Bytes) with the mask in %rax.  */
-	mov	(%rcx), %r9d
-	mov	$4, %rsi	/* amount L(CopyFrom1To16Bytes) adds to %rdx and %rcx */
-	mov	%r9d, (%rdx)
-	jmp	L(CopyFrom1To16Bytes)
-
-	.p2align 4
-L(Shl13):	/* Copy path for a source pointer %rcx that is 13 bytes past a
-	   16-byte boundary: all movaps loads use offsets that are -13 mod 16
-	   and all stores are movaps to %rdx, so those addresses must be
-	   16-byte aligned.  Each 16-byte output block is assembled with
-	   palignr $13 from two neighbouring aligned source blocks.
-	   NOTE(review): %xmm0 is presumably all-zero on entry (set outside
-	   this chunk), which makes the pmovmskb result a NUL-byte mask.  */
-	movaps	-13(%rcx), %xmm1	/* aligned block that contains the byte at (%rcx) */
-	movaps	3(%rcx), %xmm2	/* the aligned block after it */
-L(Shl13Start):	/* also the target of the jnz in the 64-byte loop below */
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax	/* one mask bit per byte of %xmm2 that matched */
-	movaps	%xmm2, %xmm3	/* keep the unshifted block for the step after next */
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit13Case2OrCase3)	/* taken when %r8 was <= 16 */
-# endif
-	test	%rax, %rax
-	jnz	L(Shl13LoopExit)	/* a byte of %xmm2 matched */
-
-	palignr	$13, %xmm1, %xmm2	/* %xmm2 = the 16 source bytes starting at (%rcx) */
-	movaps	%xmm2, (%rdx)
-	movaps	19(%rcx), %xmm2	/* next aligned source block */
-
-	pcmpeqb	%xmm2, %xmm0	/* %xmm0 is all-zero here: the previous mask was 0 */
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm1	/* %xmm1 and %xmm3 alternate as the "previous block" */
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit13Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl13LoopExit)
-
-	palignr	$13, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	19(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit13Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl13LoopExit)
-
-	palignr	$13, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	19(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit13Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl13LoopExit)
-
-	palignr	$13, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	lea	19(%rcx), %rcx	/* %rcx = address just past the last aligned block loaded */
-	lea	16(%rdx), %rdx
-
-	mov	%rcx, %rax
-	and	$-0x40, %rcx	/* round that address down to a 64-byte boundary */
-	sub	%rcx, %rax	/* %rax = distance stepped back (0, 16, 32 or 48) */
-	lea	-3(%rcx), %rcx	/* bias so that 3(%rcx) is the 64-byte-aligned block */
-	sub	%rax, %rdx	/* step the destination back by the same distance */
-# ifdef USE_AS_STRNCPY
-	add	%rax, %r8	/* and add that distance back to %r8 */
-# endif
-	movaps	-13(%rcx), %xmm1	/* aligned block preceding the first loop block */
-
-/* 64 bytes loop: test four aligned source blocks at once, then store four
-   palignr-merged blocks.  */
-	.p2align 4
-L(Shl13LoopStart):
-	movaps	3(%rcx), %xmm2
-	movaps	19(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	35(%rcx), %xmm4
-	movaps	%xmm4, %xmm7
-	movaps	51(%rcx), %xmm5
-	pminub	%xmm2, %xmm6
-	pminub	%xmm5, %xmm7
-	pminub	%xmm6, %xmm7	/* %xmm7 = bytewise unsigned minimum of the four blocks */
-	pcmpeqb	%xmm0, %xmm7	/* compare the minimum with %xmm0; %xmm0 is not written */
-	pmovmskb %xmm7, %rax
-	movaps	%xmm5, %xmm7	/* unshifted last block; becomes %xmm1 for the next round */
-	palignr	$13, %xmm4, %xmm5
-	test	%rax, %rax
-	palignr	$13, %xmm3, %xmm4	/* SSE op leaves EFLAGS alone, jnz still sees the test */
-	jnz	L(Shl13Start)	/* redo these 64 bytes in 16-byte steps; %xmm1/%xmm2 are set up */
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeave13)	/* taken when %r8 was <= 64 */
-# endif
-	palignr	$13, %xmm2, %xmm3
-	lea	64(%rcx), %rcx
-	palignr	$13, %xmm1, %xmm2
-	movaps	%xmm7, %xmm1
-	movaps	%xmm5, 48(%rdx)
-	movaps	%xmm4, 32(%rdx)
-	movaps	%xmm3, 16(%rdx)
-	movaps	%xmm2, (%rdx)
-	lea	64(%rdx), %rdx
-	jmp	L(Shl13LoopStart)
-
-L(Shl13LoopExit):	/* Store the 3 source bytes that lie before the block held in
-	   %xmm2, then hand over to L(CopyFrom1To16Bytes) with the mask in %rax.  */
-	mov	-1(%rcx), %r9d	/* one 4-byte move starting 1 byte early covers bytes 0..2 */
-	mov	$3, %rsi	/* amount L(CopyFrom1To16Bytes) adds to %rdx and %rcx */
-	mov	%r9d, -1(%rdx)
-	jmp	L(CopyFrom1To16Bytes)
-
-	.p2align 4
-L(Shl14):	/* Copy path for a source pointer %rcx that is 14 bytes past a
-	   16-byte boundary: all movaps loads use offsets that are -14 mod 16
-	   and all stores are movaps to %rdx, so those addresses must be
-	   16-byte aligned.  Each 16-byte output block is assembled with
-	   palignr $14 from two neighbouring aligned source blocks.
-	   NOTE(review): %xmm0 is presumably all-zero on entry (set outside
-	   this chunk), which makes the pmovmskb result a NUL-byte mask.  */
-	movaps	-14(%rcx), %xmm1	/* aligned block that contains the byte at (%rcx) */
-	movaps	2(%rcx), %xmm2	/* the aligned block after it */
-L(Shl14Start):	/* also the target of the jnz in the 64-byte loop below */
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax	/* one mask bit per byte of %xmm2 that matched */
-	movaps	%xmm2, %xmm3	/* keep the unshifted block for the step after next */
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit14Case2OrCase3)	/* taken when %r8 was <= 16 */
-# endif
-	test	%rax, %rax
-	jnz	L(Shl14LoopExit)	/* a byte of %xmm2 matched */
-
-	palignr	$14, %xmm1, %xmm2	/* %xmm2 = the 16 source bytes starting at (%rcx) */
-	movaps	%xmm2, (%rdx)
-	movaps	18(%rcx), %xmm2	/* next aligned source block */
-
-	pcmpeqb	%xmm2, %xmm0	/* %xmm0 is all-zero here: the previous mask was 0 */
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm1	/* %xmm1 and %xmm3 alternate as the "previous block" */
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit14Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl14LoopExit)
-
-	palignr	$14, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	18(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit14Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl14LoopExit)
-
-	palignr	$14, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	18(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit14Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl14LoopExit)
-
-	palignr	$14, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	lea	18(%rcx), %rcx	/* %rcx = address just past the last aligned block loaded */
-	lea	16(%rdx), %rdx
-
-	mov	%rcx, %rax
-	and	$-0x40, %rcx	/* round that address down to a 64-byte boundary */
-	sub	%rcx, %rax	/* %rax = distance stepped back (0, 16, 32 or 48) */
-	lea	-2(%rcx), %rcx	/* bias so that 2(%rcx) is the 64-byte-aligned block */
-	sub	%rax, %rdx	/* step the destination back by the same distance */
-# ifdef USE_AS_STRNCPY
-	add	%rax, %r8	/* and add that distance back to %r8 */
-# endif
-	movaps	-14(%rcx), %xmm1	/* aligned block preceding the first loop block */
-
-/* 64 bytes loop: test four aligned source blocks at once, then store four
-   palignr-merged blocks.  */
-	.p2align 4
-L(Shl14LoopStart):
-	movaps	2(%rcx), %xmm2
-	movaps	18(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	34(%rcx), %xmm4
-	movaps	%xmm4, %xmm7
-	movaps	50(%rcx), %xmm5
-	pminub	%xmm2, %xmm6
-	pminub	%xmm5, %xmm7
-	pminub	%xmm6, %xmm7	/* %xmm7 = bytewise unsigned minimum of the four blocks */
-	pcmpeqb	%xmm0, %xmm7	/* compare the minimum with %xmm0; %xmm0 is not written */
-	pmovmskb %xmm7, %rax
-	movaps	%xmm5, %xmm7	/* unshifted last block; becomes %xmm1 for the next round */
-	palignr	$14, %xmm4, %xmm5
-	test	%rax, %rax
-	palignr	$14, %xmm3, %xmm4	/* SSE op leaves EFLAGS alone, jnz still sees the test */
-	jnz	L(Shl14Start)	/* redo these 64 bytes in 16-byte steps; %xmm1/%xmm2 are set up */
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeave14)	/* taken when %r8 was <= 64 */
-# endif
-	palignr	$14, %xmm2, %xmm3
-	lea	64(%rcx), %rcx
-	palignr	$14, %xmm1, %xmm2
-	movaps	%xmm7, %xmm1
-	movaps	%xmm5, 48(%rdx)
-	movaps	%xmm4, 32(%rdx)
-	movaps	%xmm3, 16(%rdx)
-	movaps	%xmm2, (%rdx)
-	lea	64(%rdx), %rdx
-	jmp	L(Shl14LoopStart)
-
-L(Shl14LoopExit):	/* Store the 2 source bytes that lie before the block held in
-	   %xmm2, then hand over to L(CopyFrom1To16Bytes) with the mask in %rax.  */
-	mov	-2(%rcx), %r9d	/* one 4-byte move starting 2 bytes early covers bytes 0..1 */
-	mov	$2, %rsi	/* amount L(CopyFrom1To16Bytes) adds to %rdx and %rcx */
-	mov	%r9d, -2(%rdx)
-	jmp	L(CopyFrom1To16Bytes)
-
-	.p2align 4
-L(Shl15):	/* Copy path for a source pointer %rcx that is 15 bytes past a
-	   16-byte boundary: all movaps loads use offsets that are -15 mod 16
-	   and all stores are movaps to %rdx, so those addresses must be
-	   16-byte aligned.  Each 16-byte output block is assembled with
-	   palignr $15 from two neighbouring aligned source blocks.
-	   NOTE(review): %xmm0 is presumably all-zero on entry (set outside
-	   this chunk), which makes the pmovmskb result a NUL-byte mask.  */
-	movaps	-15(%rcx), %xmm1	/* aligned block that contains the byte at (%rcx) */
-	movaps	1(%rcx), %xmm2	/* the aligned block after it */
-L(Shl15Start):	/* also the target of the jnz in the 64-byte loop below */
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax	/* one mask bit per byte of %xmm2 that matched */
-	movaps	%xmm2, %xmm3	/* keep the unshifted block for the step after next */
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit15Case2OrCase3)	/* taken when %r8 was <= 16 */
-# endif
-	test	%rax, %rax
-	jnz	L(Shl15LoopExit)	/* a byte of %xmm2 matched */
-
-	palignr	$15, %xmm1, %xmm2	/* %xmm2 = the 16 source bytes starting at (%rcx) */
-	movaps	%xmm2, (%rdx)
-	movaps	17(%rcx), %xmm2	/* next aligned source block */
-
-	pcmpeqb	%xmm2, %xmm0	/* %xmm0 is all-zero here: the previous mask was 0 */
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm1	/* %xmm1 and %xmm3 alternate as the "previous block" */
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit15Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl15LoopExit)
-
-	palignr	$15, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	17(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit15Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl15LoopExit)
-
-	palignr	$15, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	17(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit15Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl15LoopExit)
-
-	palignr	$15, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	lea	17(%rcx), %rcx	/* %rcx = address just past the last aligned block loaded */
-	lea	16(%rdx), %rdx
-
-	mov	%rcx, %rax
-	and	$-0x40, %rcx	/* round that address down to a 64-byte boundary */
-	sub	%rcx, %rax	/* %rax = distance stepped back (0, 16, 32 or 48) */
-	lea	-1(%rcx), %rcx	/* bias so that 1(%rcx) is the 64-byte-aligned block */
-	sub	%rax, %rdx	/* step the destination back by the same distance */
-# ifdef USE_AS_STRNCPY
-	add	%rax, %r8	/* and add that distance back to %r8 */
-# endif
-	movaps	-15(%rcx), %xmm1	/* aligned block preceding the first loop block */
-
-/* 64 bytes loop: test four aligned source blocks at once, then store four
-   palignr-merged blocks.  */
-	.p2align 4
-L(Shl15LoopStart):
-	movaps	1(%rcx), %xmm2
-	movaps	17(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	33(%rcx), %xmm4
-	movaps	%xmm4, %xmm7
-	movaps	49(%rcx), %xmm5
-	pminub	%xmm2, %xmm6
-	pminub	%xmm5, %xmm7
-	pminub	%xmm6, %xmm7	/* %xmm7 = bytewise unsigned minimum of the four blocks */
-	pcmpeqb	%xmm0, %xmm7	/* compare the minimum with %xmm0; %xmm0 is not written */
-	pmovmskb %xmm7, %rax
-	movaps	%xmm5, %xmm7	/* unshifted last block; becomes %xmm1 for the next round */
-	palignr	$15, %xmm4, %xmm5
-	test	%rax, %rax
-	palignr	$15, %xmm3, %xmm4	/* SSE op leaves EFLAGS alone, jnz still sees the test */
-	jnz	L(Shl15Start)	/* redo these 64 bytes in 16-byte steps; %xmm1/%xmm2 are set up */
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeave15)	/* taken when %r8 was <= 64 */
-# endif
-	palignr	$15, %xmm2, %xmm3
-	lea	64(%rcx), %rcx
-	palignr	$15, %xmm1, %xmm2
-	movaps	%xmm7, %xmm1
-	movaps	%xmm5, 48(%rdx)
-	movaps	%xmm4, 32(%rdx)
-	movaps	%xmm3, 16(%rdx)
-	movaps	%xmm2, (%rdx)
-	lea	64(%rdx), %rdx
-	jmp	L(Shl15LoopStart)
-
-L(Shl15LoopExit):	/* Store the 1 source byte that lies before the block held in
-	   %xmm2, then continue at L(CopyFrom1To16Bytes) with the mask in %rax.  */
-	mov	-3(%rcx), %r9d	/* one 4-byte move starting 3 bytes early covers byte 0 */
-	mov	$1, %rsi	/* amount L(CopyFrom1To16Bytes) adds to %rdx and %rcx */
-	mov	%r9d, -3(%rdx)
-# ifdef USE_AS_STRCAT
-	jmp	L(CopyFrom1To16Bytes)	/* without USE_AS_STRCAT the label follows directly, so execution falls through */
-# endif
-
-# ifndef USE_AS_STRCAT
-
-	.p2align 4
-L(CopyFrom1To16Bytes):	/* Tail dispatcher.  In: %rsi = byte count to skip,
-	   %rax = match mask whose bit i refers to byte i after the skip.
-	   Advances %rdx and %rcx by %rsi, then jumps to L(ExitN) where
-	   N - 1 is the index of the lowest set bit of the low 16 mask bits;
-	   L(ExitN) copies N bytes from (%rcx) to (%rdx).  */
-#  ifdef USE_AS_STRNCPY
-	add	$16, %r8	/* re-add the 16 that the Shl paths subtract before jumping here */
-#  endif
-	add	%rsi, %rdx
-	add	%rsi, %rcx
-
-	test	%al, %al
-	jz	L(ExitHigh)	/* no bit set in mask bits 0..7 */
-	test	$0x01, %al
-	jnz	L(Exit1)
-	test	$0x02, %al
-	jnz	L(Exit2)
-	test	$0x04, %al
-	jnz	L(Exit3)
-	test	$0x08, %al
-	jnz	L(Exit4)
-	test	$0x10, %al
-	jnz	L(Exit5)
-	test	$0x20, %al
-	jnz	L(Exit6)
-	test	$0x40, %al
-	jnz	L(Exit7)	/* otherwise only bit 7 is set: fall through into L(Exit8) */
-
-	.p2align 4
-L(Exit8):	/* Copy exactly 8 bytes from (%rcx) to (%rdx) and return.  */
-	mov	(%rcx), %rax
-	mov	%rax, (%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	7(%rdx), %rax	/* return value: address of the last byte written */
-#  else
-	mov	%rdi, %rax	/* return value: %rdi (NOTE(review): presumably the original destination) */
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$8, %r8
-	lea	8(%rdx), %rcx	/* first byte after the copy; lea keeps the flags of the sub */
-	jnz	L(StrncpyFillTailWithZero1)	/* %r8 still nonzero: zero-fill %r8 bytes at %rcx */
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)	/* CF = 1 iff the last byte written is 0 */
-	sbb	$-1, %rax	/* %rax += 1 - CF: step past a non-zero last byte */
-#   endif
-#  endif
-	ret
-
-	.p2align 4
-L(ExitHigh):	/* Second half of the L(CopyFrom1To16Bytes) dispatch: mask bits
-	   0..7 are clear, so pick L(Exit9)..L(Exit15) from the lowest set bit
-	   of %ah (mask bits 8..14).  */
-	test	$0x01, %ah
-	jnz	L(Exit9)
-	test	$0x02, %ah
-	jnz	L(Exit10)
-	test	$0x04, %ah
-	jnz	L(Exit11)
-	test	$0x08, %ah
-	jnz	L(Exit12)
-	test	$0x10, %ah
-	jnz	L(Exit13)
-	test	$0x20, %ah
-	jnz	L(Exit14)
-	test	$0x40, %ah
-	jnz	L(Exit15)	/* none of bits 8..14 set: fall through into L(Exit16) */
-
-	.p2align 4
-L(Exit16):	/* Copy exactly 16 bytes from (%rcx) to (%rdx) and return.  */
-	mov	(%rcx), %rax
-	mov	%rax, (%rdx)
-	mov	8(%rcx), %rax
-	mov	%rax, 8(%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	15(%rdx), %rax	/* return value: address of the last byte written */
-#  else
-	mov	%rdi, %rax	/* return value: %rdi (NOTE(review): presumably the original destination) */
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	lea	16(%rdx), %rcx	/* first byte after the copy; lea keeps the flags of the sub */
-	jnz	L(StrncpyFillTailWithZero1)	/* %r8 still nonzero: zero-fill %r8 bytes at %rcx */
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)	/* CF = 1 iff the last byte written is 0 */
-	sbb	$-1, %rax	/* %rax += 1 - CF: step past a non-zero last byte */
-#   endif
-#  endif
-	ret
-
-#  ifdef USE_AS_STRNCPY
-
-	.p2align 4
-L(CopyFrom1To16BytesCase2):	/* Tail dispatcher used when both limits apply:
-	   a match mask in %rax and a byte count in %r8 (after the +16 below).
-	   Jumps to L(ExitN) for the smallest N with N == %r8 or mask bit
-	   N - 1 set.  In: %rsi = byte count to skip in %rcx/%rdx.
-	   NOTE(review): the logic assumes 1 <= %r8 <= 16 after the add -
-	   confirm against the callers.  */
-	add	$16, %r8
-	add	%rsi, %rcx
-	lea	(%rsi, %rdx), %rsi	/* park the advanced destination in %rsi; %rdx is scratch next */
-	lea	-9(%r8), %rdx
-	and	$1<<7, %dh	/* keep bit 15 of %r8 - 9, which is set when %r8 < 9 */
-	or	%al, %dh	/* ... and merge in mask bits 0..7 */
-	test	%dh, %dh
-	lea	(%rsi), %rdx	/* restore the destination pointer; lea keeps the flags */
-	jz	L(ExitHighCase2)	/* %r8 >= 9 and no match in bytes 0..7 */
-
-	cmp	$1, %r8
-	je	L(Exit1)
-	test	$0x01, %al
-	jnz	L(Exit1)
-	cmp	$2, %r8
-	je	L(Exit2)
-	test	$0x02, %al
-	jnz	L(Exit2)
-	cmp	$3, %r8
-	je	L(Exit3)
-	test	$0x04, %al
-	jnz	L(Exit3)
-	cmp	$4, %r8
-	je	L(Exit4)
-	test	$0x08, %al
-	jnz	L(Exit4)
-	cmp	$5, %r8
-	je	L(Exit5)
-	test	$0x10, %al
-	jnz	L(Exit5)
-	cmp	$6, %r8
-	je	L(Exit6)
-	test	$0x20, %al
-	jnz	L(Exit6)
-	cmp	$7, %r8
-	je	L(Exit7)
-	test	$0x40, %al
-	jnz	L(Exit7)
-	jmp	L(Exit8)
-
-	.p2align 4
-L(ExitHighCase2):	/* Continuation of L(CopyFrom1To16BytesCase2) for N = 9..16:
-	   jump to L(ExitN) for the smallest N with N == %r8 or bit N - 9 of
-	   %ah set; L(Exit16) when none of the checks hits.  */
-	cmp	$9, %r8
-	je	L(Exit9)
-	test	$0x01, %ah
-	jnz	L(Exit9)
-	cmp	$10, %r8
-	je	L(Exit10)
-	test	$0x02, %ah
-	jnz	L(Exit10)
-	cmp	$11, %r8
-	je	L(Exit11)
-	test	$0x04, %ah
-	jnz	L(Exit11)
-	cmp	$12, %r8
-	je	L(Exit12)
-	test	$0x8, %ah
-	jnz	L(Exit12)
-	cmp	$13, %r8
-	je	L(Exit13)
-	test	$0x10, %ah
-	jnz	L(Exit13)
-	cmp	$14, %r8
-	je	L(Exit14)
-	test	$0x20, %ah
-	jnz	L(Exit14)
-	cmp	$15, %r8
-	je	L(Exit15)
-	test	$0x40, %ah
-	jnz	L(Exit15)
-	jmp	L(Exit16)
-
-L(CopyFrom1To16BytesCase2OrCase3):	/* Select the tail dispatcher from the mask in %rax.  */
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16BytesCase2)	/* mask nonzero: Case2; mask zero: fall into Case3 below */
-
-	.p2align 4
-L(CopyFrom1To16BytesCase3):	/* Tail dispatcher that looks only at the count:
-	   after adding 16 to %r8 and %rsi to both pointers, jump to L(ExitN)
-	   with N = %r8 via a compare tree.  NOTE(review): assumes
-	   1 <= %r8 <= 16 here; values below 2 all go to L(Exit1).  */
-	add	$16, %r8
-	add	%rsi, %rdx
-	add	%rsi, %rcx
-
-	cmp	$16, %r8
-	je	L(Exit16)
-	cmp	$8, %r8
-	je	L(Exit8)
-	jg	L(More8Case3)
-	cmp	$4, %r8
-	je	L(Exit4)
-	jg	L(More4Case3)
-	cmp	$2, %r8
-	jl	L(Exit1)
-	je	L(Exit2)
-	jg	L(Exit3)
-L(More8Case3): /* but less than 16 */
-	cmp	$12, %r8
-	je	L(Exit12)
-	jl	L(Less12Case3)
-	cmp	$14, %r8
-	jl	L(Exit13)
-	je	L(Exit14)
-	jg	L(Exit15)
-L(More4Case3): /* but less than 8 */
-	cmp	$6, %r8
-	jl	L(Exit5)
-	je	L(Exit6)
-	jg	L(Exit7)
-L(Less12Case3): /* but more than 8 */
-	cmp	$10, %r8
-	jl	L(Exit9)
-	je	L(Exit10)
-	jg	L(Exit11)
-#  endif
-
-	.p2align 4
-L(Exit1):	/* Copy exactly 1 byte from (%rcx) to (%rdx) and return.  */
-	movb	(%rcx), %al
-	movb	%al, (%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdx), %rax	/* return value: address of the last byte written */
-#  else
-	mov	%rdi, %rax	/* return value: %rdi (NOTE(review): presumably the original destination) */
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$1, %r8
-	lea	1(%rdx), %rcx	/* first byte after the copy; lea keeps the flags of the sub */
-	jnz	L(StrncpyFillTailWithZero1)	/* %r8 still nonzero: zero-fill %r8 bytes at %rcx */
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)	/* CF = 1 iff the last byte written is 0 */
-	sbb	$-1, %rax	/* %rax += 1 - CF: step past a non-zero last byte */
-#   endif
-#  endif
-	ret
-
-	.p2align 4
-L(Exit2):	/* Copy exactly 2 bytes from (%rcx) to (%rdx) and return.  */
-	movw	(%rcx), %ax
-	movw	%ax, (%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	1(%rdx), %rax	/* return value: address of the last byte written */
-#  else
-	mov	%rdi, %rax	/* return value: %rdi (NOTE(review): presumably the original destination) */
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$2, %r8
-	lea	2(%rdx), %rcx	/* first byte after the copy; lea keeps the flags of the sub */
-	jnz	L(StrncpyFillTailWithZero1)	/* %r8 still nonzero: zero-fill %r8 bytes at %rcx */
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)	/* CF = 1 iff the last byte written is 0 */
-	sbb	$-1, %rax	/* %rax += 1 - CF: step past a non-zero last byte */
-#   endif
-#  endif
-	ret
-
-	.p2align 4
-L(Exit3):	/* Copy exactly 3 bytes from (%rcx) to (%rdx) and return.  */
-	movw	(%rcx), %ax
-	movw	%ax, (%rdx)
-	movb	2(%rcx), %al
-	movb	%al, 2(%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	2(%rdx), %rax	/* return value: address of the last byte written */
-#  else
-	mov	%rdi, %rax	/* return value: %rdi (NOTE(review): presumably the original destination) */
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$3, %r8
-	lea	3(%rdx), %rcx	/* first byte after the copy; lea keeps the flags of the sub */
-	jnz	L(StrncpyFillTailWithZero1)	/* %r8 still nonzero: zero-fill %r8 bytes at %rcx */
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)	/* CF = 1 iff the last byte written is 0 */
-	sbb	$-1, %rax	/* %rax += 1 - CF: step past a non-zero last byte */
-#   endif
-#  endif
-	ret
-
-	.p2align 4
-L(Exit4):	/* Copy exactly 4 bytes from (%rcx) to (%rdx) and return.  */
-	movl	(%rcx), %eax
-	movl	%eax, (%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	3(%rdx), %rax	/* return value: address of the last byte written */
-#  else
-	mov	%rdi, %rax	/* return value: %rdi (NOTE(review): presumably the original destination) */
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$4, %r8
-	lea	4(%rdx), %rcx	/* first byte after the copy; lea keeps the flags of the sub */
-	jnz	L(StrncpyFillTailWithZero1)	/* %r8 still nonzero: zero-fill %r8 bytes at %rcx */
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)	/* CF = 1 iff the last byte written is 0 */
-	sbb	$-1, %rax	/* %rax += 1 - CF: step past a non-zero last byte */
-#   endif
-#  endif
-	ret
-
-	.p2align 4
-L(Exit5):	/* Copy exactly 5 bytes from (%rcx) to (%rdx) and return.  */
-	movl	(%rcx), %eax
-	movl	%eax, (%rdx)
-	movb	4(%rcx), %al
-	movb	%al, 4(%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	4(%rdx), %rax	/* return value: address of the last byte written */
-#  else
-	mov	%rdi, %rax	/* return value: %rdi (NOTE(review): presumably the original destination) */
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$5, %r8
-	lea	5(%rdx), %rcx	/* first byte after the copy; lea keeps the flags of the sub */
-	jnz	L(StrncpyFillTailWithZero1)	/* %r8 still nonzero: zero-fill %r8 bytes at %rcx */
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)	/* CF = 1 iff the last byte written is 0 */
-	sbb	$-1, %rax	/* %rax += 1 - CF: step past a non-zero last byte */
-#   endif
-#  endif
-	ret
-
-	.p2align 4
-L(Exit6):	/* Copy exactly 6 bytes from (%rcx) to (%rdx) and return.  */
-	movl	(%rcx), %eax
-	movl	%eax, (%rdx)
-	movw	4(%rcx), %ax
-	movw	%ax, 4(%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	5(%rdx), %rax	/* return value: address of the last byte written */
-#  else
-	mov	%rdi, %rax	/* return value: %rdi (NOTE(review): presumably the original destination) */
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$6, %r8
-	lea	6(%rdx), %rcx	/* first byte after the copy; lea keeps the flags of the sub */
-	jnz	L(StrncpyFillTailWithZero1)	/* %r8 still nonzero: zero-fill %r8 bytes at %rcx */
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)	/* CF = 1 iff the last byte written is 0 */
-	sbb	$-1, %rax	/* %rax += 1 - CF: step past a non-zero last byte */
-#   endif
-#  endif
-	ret
-
-	.p2align 4
-L(Exit7):	/* Copy exactly 7 bytes from (%rcx) to (%rdx) and return.  */
-	movl	(%rcx), %eax
-	movl	%eax, (%rdx)
-	movl	3(%rcx), %eax	/* overlapping 4-byte move covers bytes 3..6 */
-	movl	%eax, 3(%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	6(%rdx), %rax	/* return value: address of the last byte written */
-#  else
-	mov	%rdi, %rax	/* return value: %rdi (NOTE(review): presumably the original destination) */
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$7, %r8
-	lea	7(%rdx), %rcx	/* first byte after the copy; lea keeps the flags of the sub */
-	jnz	L(StrncpyFillTailWithZero1)	/* %r8 still nonzero: zero-fill %r8 bytes at %rcx */
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)	/* CF = 1 iff the last byte written is 0 */
-	sbb	$-1, %rax	/* %rax += 1 - CF: step past a non-zero last byte */
-#   endif
-#  endif
-	ret
-
-	.p2align 4
-L(Exit9):	/* Copy exactly 9 bytes from (%rcx) to (%rdx) and return.  */
-	mov	(%rcx), %rax
-	mov	%rax, (%rdx)
-	mov	5(%rcx), %eax	/* overlapping 4-byte move covers bytes 5..8 */
-	mov	%eax, 5(%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	8(%rdx), %rax	/* return value: address of the last byte written */
-#  else
-	mov	%rdi, %rax	/* return value: %rdi (NOTE(review): presumably the original destination) */
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$9, %r8
-	lea	9(%rdx), %rcx	/* first byte after the copy; lea keeps the flags of the sub */
-	jnz	L(StrncpyFillTailWithZero1)	/* %r8 still nonzero: zero-fill %r8 bytes at %rcx */
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)	/* CF = 1 iff the last byte written is 0 */
-	sbb	$-1, %rax	/* %rax += 1 - CF: step past a non-zero last byte */
-#   endif
-#  endif
-	ret
-
-	.p2align 4
-L(Exit10):	/* Copy exactly 10 bytes from (%rcx) to (%rdx) and return.  */
-	mov	(%rcx), %rax
-	mov	%rax, (%rdx)
-	mov	6(%rcx), %eax	/* overlapping 4-byte move covers bytes 6..9 */
-	mov	%eax, 6(%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	9(%rdx), %rax	/* return value: address of the last byte written */
-#  else
-	mov	%rdi, %rax	/* return value: %rdi (NOTE(review): presumably the original destination) */
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$10, %r8
-	lea	10(%rdx), %rcx	/* first byte after the copy; lea keeps the flags of the sub */
-	jnz	L(StrncpyFillTailWithZero1)	/* %r8 still nonzero: zero-fill %r8 bytes at %rcx */
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)	/* CF = 1 iff the last byte written is 0 */
-	sbb	$-1, %rax	/* %rax += 1 - CF: step past a non-zero last byte */
-#   endif
-#  endif
-	ret
-
-	.p2align 4
-L(Exit11):	/* Copy exactly 11 bytes from (%rcx) to (%rdx) and return.  */
-	mov	(%rcx), %rax
-	mov	%rax, (%rdx)
-	mov	7(%rcx), %eax	/* overlapping 4-byte move covers bytes 7..10 */
-	mov	%eax, 7(%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	10(%rdx), %rax	/* return value: address of the last byte written */
-#  else
-	mov	%rdi, %rax	/* return value: %rdi (NOTE(review): presumably the original destination) */
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$11, %r8
-	lea	11(%rdx), %rcx	/* first byte after the copy; lea keeps the flags of the sub */
-	jnz	L(StrncpyFillTailWithZero1)	/* %r8 still nonzero: zero-fill %r8 bytes at %rcx */
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)	/* CF = 1 iff the last byte written is 0 */
-	sbb	$-1, %rax	/* %rax += 1 - CF: step past a non-zero last byte */
-#   endif
-#  endif
-	ret
-
-	.p2align 4
-L(Exit12):	/* Copy exactly 12 bytes from (%rcx) to (%rdx) and return.  */
-	mov	(%rcx), %rax
-	mov	%rax, (%rdx)
-	mov	8(%rcx), %eax
-	mov	%eax, 8(%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	11(%rdx), %rax	/* return value: address of the last byte written */
-#  else
-	mov	%rdi, %rax	/* return value: %rdi (NOTE(review): presumably the original destination) */
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$12, %r8
-	lea	12(%rdx), %rcx	/* first byte after the copy; lea keeps the flags of the sub */
-	jnz	L(StrncpyFillTailWithZero1)	/* %r8 still nonzero: zero-fill %r8 bytes at %rcx */
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)	/* CF = 1 iff the last byte written is 0 */
-	sbb	$-1, %rax	/* %rax += 1 - CF: step past a non-zero last byte */
-#   endif
-#  endif
-	ret
-
-	.p2align 4
-L(Exit13):	/* Copy exactly 13 bytes from (%rcx) to (%rdx) and return.  */
-	mov	(%rcx), %rax
-	mov	%rax, (%rdx)
-	mov	5(%rcx), %rax	/* overlapping 8-byte move covers bytes 5..12 */
-	mov	%rax, 5(%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	12(%rdx), %rax	/* return value: address of the last byte written */
-#  else
-	mov	%rdi, %rax	/* return value: %rdi (NOTE(review): presumably the original destination) */
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$13, %r8
-	lea	13(%rdx), %rcx	/* first byte after the copy; lea keeps the flags of the sub */
-	jnz	L(StrncpyFillTailWithZero1)	/* %r8 still nonzero: zero-fill %r8 bytes at %rcx */
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)	/* CF = 1 iff the last byte written is 0 */
-	sbb	$-1, %rax	/* %rax += 1 - CF: step past a non-zero last byte */
-#   endif
-#  endif
-	ret
-
-	.p2align 4
-L(Exit14):	/* Copy exactly 14 bytes from (%rcx) to (%rdx) and return.  */
-	mov	(%rcx), %rax
-	mov	%rax, (%rdx)
-	mov	6(%rcx), %rax	/* overlapping 8-byte move covers bytes 6..13 */
-	mov	%rax, 6(%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	13(%rdx), %rax	/* return value: address of the last byte written */
-#  else
-	mov	%rdi, %rax	/* return value: %rdi (NOTE(review): presumably the original destination) */
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$14, %r8
-	lea	14(%rdx), %rcx	/* first byte after the copy; lea keeps the flags of the sub */
-	jnz	L(StrncpyFillTailWithZero1)	/* %r8 still nonzero: zero-fill %r8 bytes at %rcx */
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)	/* CF = 1 iff the last byte written is 0 */
-	sbb	$-1, %rax	/* %rax += 1 - CF: step past a non-zero last byte */
-#   endif
-#  endif
-	ret
-
-	.p2align 4
-L(Exit15):	/* Copy exactly 15 bytes from (%rcx) to (%rdx) and return.  */
-	mov	(%rcx), %rax
-	mov	%rax, (%rdx)
-	mov	7(%rcx), %rax	/* overlapping 8-byte move covers bytes 7..14 */
-	mov	%rax, 7(%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	14(%rdx), %rax	/* return value: address of the last byte written */
-#  else
-	mov	%rdi, %rax	/* return value: %rdi (NOTE(review): presumably the original destination) */
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$15, %r8
-	lea	15(%rdx), %rcx	/* first byte after the copy; lea keeps the flags of the sub */
-	jnz	L(StrncpyFillTailWithZero1)	/* %r8 still nonzero: zero-fill %r8 bytes at %rcx */
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)	/* CF = 1 iff the last byte written is 0 */
-	sbb	$-1, %rax	/* %rax += 1 - CF: step past a non-zero last byte */
-#   endif
-#  endif
-	ret
-
-#  ifdef USE_AS_STRNCPY
-	.p2align 4
-L(Fill0):	/* L(Fill0)..L(Fill16): L(FillN) stores N bytes taken from the low
-	   end of %rdx at (%rcx) and returns.  Both paths in this file that
-	   reach them via L(FillFrom1To16Bytes) clear %rdx first, so the
-	   bytes written are zero.  Sizes that are not a single move use two
-	   moves, overlapping where that saves an instruction.  */
-	ret
-
-	.p2align 4
-L(Fill1):
-	movb	%dl, (%rcx)
-	ret
-
-	.p2align 4
-L(Fill2):
-	movw	%dx, (%rcx)
-	ret
-
-	.p2align 4
-L(Fill3):
-	movw	%dx, (%rcx)
-	movb	%dl, 2(%rcx)
-	ret
-
-	.p2align 4
-L(Fill4):
-	movl	%edx, (%rcx)
-	ret
-
-	.p2align 4
-L(Fill5):
-	movl	%edx, (%rcx)
-	movb	%dl, 4(%rcx)
-	ret
-
-	.p2align 4
-L(Fill6):
-	movl	%edx, (%rcx)
-	movw	%dx, 4(%rcx)
-	ret
-
-	.p2align 4
-L(Fill7):
-	movl	%edx, (%rcx)
-	movl	%edx, 3(%rcx)	/* overlaps byte 3 of the first store */
-	ret
-
-	.p2align 4
-L(Fill8):
-	mov	%rdx, (%rcx)
-	ret
-
-	.p2align 4
-L(Fill9):
-	mov	%rdx, (%rcx)
-	movb	%dl, 8(%rcx)
-	ret
-
-	.p2align 4
-L(Fill10):
-	mov	%rdx, (%rcx)
-	movw	%dx, 8(%rcx)
-	ret
-
-	.p2align 4
-L(Fill11):
-	mov	%rdx, (%rcx)
-	movl	%edx, 7(%rcx)	/* overlaps byte 7 of the first store */
-	ret
-
-	.p2align 4
-L(Fill12):
-	mov	%rdx, (%rcx)
-	movl	%edx, 8(%rcx)
-	ret
-
-	.p2align 4
-L(Fill13):
-	mov	%rdx, (%rcx)
-	mov	%rdx, 5(%rcx)	/* overlaps bytes 5..7 of the first store */
-	ret
-
-	.p2align 4
-L(Fill14):
-	mov	%rdx, (%rcx)
-	mov	%rdx, 6(%rcx)	/* overlaps bytes 6..7 of the first store */
-	ret
-
-	.p2align 4
-L(Fill15):
-	mov	%rdx, (%rcx)
-	mov	%rdx, 7(%rcx)	/* overlaps byte 7 of the first store */
-	ret
-
-	.p2align 4
-L(Fill16):
-	mov	%rdx, (%rcx)
-	mov	%rdx, 8(%rcx)
-	ret
-
-	.p2align 4
-L(StrncpyFillExit1):	/* Entered with %r8 16 below the real remaining count.  */
-	lea	16(%r8), %r8	/* restore the count */
-L(FillFrom1To16Bytes):	/* Jump to L(FillN) with N = %r8 via a compare tree.
-	   NOTE(review): assumes 0 <= %r8 <= 16.  */
-	test	%r8, %r8
-	jz	L(Fill0)
-	cmp	$16, %r8
-	je	L(Fill16)
-	cmp	$8, %r8
-	je	L(Fill8)
-	jg	L(FillMore8)
-	cmp	$4, %r8
-	je	L(Fill4)
-	jg	L(FillMore4)
-	cmp	$2, %r8
-	jl	L(Fill1)
-	je	L(Fill2)
-	jg	L(Fill3)
-L(FillMore8): /* but less than 16 */
-	cmp	$12, %r8
-	je	L(Fill12)
-	jl	L(FillLess12)
-	cmp	$14, %r8
-	jl	L(Fill13)
-	je	L(Fill14)
-	jg	L(Fill15)
-L(FillMore4): /* but less than 8 */
-	cmp	$6, %r8
-	jl	L(Fill5)
-	je	L(Fill6)
-	jg	L(Fill7)
-L(FillLess12): /* but more than 8 */
-	cmp	$10, %r8
-	jl	L(Fill9)
-	je	L(Fill10)
-	jmp	L(Fill11)
-
-	.p2align 4
-L(StrncpyFillTailWithZero1):	/* Write %r8 zero bytes starting at %rcx, then
-	   return (through one of the L(FillN) stubs).  %rax is not touched
-	   here, so the return value set by the caller is preserved.
-	   Clobbers %rdx, %rcx, %r8 and %xmm0.  */
-	xor	%rdx, %rdx	/* zero source for the scalar stores and the L(FillN) stubs */
-	sub	$16, %r8
-	jbe	L(StrncpyFillExit1)	/* 16 bytes or fewer: L(StrncpyFillExit1) re-adds the 16 */
-
-	pxor	%xmm0, %xmm0
-	mov	%rdx, (%rcx)	/* first 16 zero bytes with scalar stores, no alignment needed */
-	mov	%rdx, 8(%rcx)
-
-	lea	16(%rcx), %rcx
-
-	mov	%rcx, %rdx
-	and	$0xf, %rdx	/* %rdx = %rcx mod 16 */
-	sub	%rdx, %rcx	/* align %rcx down so movdqa can be used */
-	add	%rdx, %r8	/* the bytes stepped back over are counted again */
-	xor	%rdx, %rdx
-	sub	$64, %r8
-	jb	L(StrncpyFillLess64)
-
-L(StrncpyFillLoopMovdqa):	/* 64 zero bytes per iteration while at least 64 remain */
-	movdqa	%xmm0, (%rcx)
-	movdqa	%xmm0, 16(%rcx)
-	movdqa	%xmm0, 32(%rcx)
-	movdqa	%xmm0, 48(%rcx)
-	lea	64(%rcx), %rcx
-	sub	$64, %r8
-	jae	L(StrncpyFillLoopMovdqa)
-
-L(StrncpyFillLess64):	/* here %r8 = remaining - 64, with remaining in 0..63 */
-	add	$32, %r8	/* %r8 = remaining - 32 */
-	jl	L(StrncpyFillLess32)
-	movdqa	%xmm0, (%rcx)
-	movdqa	%xmm0, 16(%rcx)
-	lea	32(%rcx), %rcx
-	sub	$16, %r8	/* %r8 = (remaining after those 32) - 16 */
-	jl	L(StrncpyFillExit1)	/* fewer than 16 left */
-	movdqa	%xmm0, (%rcx)
-	lea	16(%rcx), %rcx
-	jmp	L(FillFrom1To16Bytes)	/* %r8 is now the exact remainder (0..15) */
-
-L(StrncpyFillLess32):	/* here %r8 = remaining - 32, with remaining in 0..31 */
-	add	$16, %r8	/* %r8 = remaining - 16 */
-	jl	L(StrncpyFillExit1)	/* fewer than 16 left */
-	movdqa	%xmm0, (%rcx)
-	lea	16(%rcx), %rcx
-	jmp	L(FillFrom1To16Bytes)	/* %r8 is now the exact remainder (0..15) */
-
-	.p2align 4
-L(Exit0):	/* Return without copying anything; the return value is %rdx.  */
-	mov	%rdx, %rax
-	ret
-
-	.p2align 4
-L(StrncpyExit15Bytes):	/* Byte-wise tail for 9..15 bytes: for N = 9..14 jump
-	   to L(ExitN) as soon as %r8 == N or source byte N - 1 is zero;
-	   otherwise copy 15 bytes here and return directly.
-	   NOTE(review): nothing here handles %r8 < 9 or a zero among source
-	   bytes 0..7 - presumably excluded by the caller; confirm.  */
-	cmp	$9, %r8
-	je	L(Exit9)
-	cmpb	$0, 8(%rcx)
-	jz	L(Exit9)
-	cmp	$10, %r8
-	je	L(Exit10)
-	cmpb	$0, 9(%rcx)
-	jz	L(Exit10)
-	cmp	$11, %r8
-	je	L(Exit11)
-	cmpb	$0, 10(%rcx)
-	jz	L(Exit11)
-	cmp	$12, %r8
-	je	L(Exit12)
-	cmpb	$0, 11(%rcx)
-	jz	L(Exit12)
-	cmp	$13, %r8
-	je	L(Exit13)
-	cmpb	$0, 12(%rcx)
-	jz	L(Exit13)
-	cmp	$14, %r8
-	je	L(Exit14)
-	cmpb	$0, 13(%rcx)
-	jz	L(Exit14)
-	mov	(%rcx), %rax
-	mov	%rax, (%rdx)
-	mov	7(%rcx), %rax	/* overlapping 8-byte move covers bytes 7..14 */
-	mov	%rax, 7(%rdx)
-#   ifdef USE_AS_STPCPY
-	lea	14(%rdx), %rax	/* address of the last byte written */
-	cmpb	$1, (%rax)	/* CF = 1 iff that byte is 0 */
-	sbb	$-1, %rax	/* %rax += 1 - CF */
-#   else
-	mov	%rdi, %rax
-#   endif
-	ret
-
-	.p2align 4
-L(StrncpyExit8Bytes):	/* For N = 1..7: go to L(ExitN) when %r8 == N or source byte N-1 at (%rcx) is NUL; otherwise copy 8 bytes from (%rcx) to (%rdx) here.  */
-	cmp	$1, %r8
-	je	L(Exit1)
-	cmpb	$0, (%rcx)
-	jz	L(Exit1)
-	cmp	$2, %r8
-	je	L(Exit2)
-	cmpb	$0, 1(%rcx)
-	jz	L(Exit2)
-	cmp	$3, %r8
-	je	L(Exit3)
-	cmpb	$0, 2(%rcx)
-	jz	L(Exit3)
-	cmp	$4, %r8
-	je	L(Exit4)
-	cmpb	$0, 3(%rcx)
-	jz	L(Exit4)
-	cmp	$5, %r8
-	je	L(Exit5)
-	cmpb	$0, 4(%rcx)
-	jz	L(Exit5)
-	cmp	$6, %r8
-	je	L(Exit6)
-	cmpb	$0, 5(%rcx)
-	jz	L(Exit6)
-	cmp	$7, %r8
-	je	L(Exit7)
-	cmpb	$0, 6(%rcx)
-	jz	L(Exit7)
-	mov	(%rcx), %rax	/* single 8-byte copy */
-	mov	%rax, (%rdx)
-#   ifdef USE_AS_STPCPY
-	lea	7(%rdx), %rax	/* address of the 8th byte just written */
-	cmpb	$1, (%rax)	/* CF = 1 exactly when that byte is 0 */
-	sbb	$-1, %rax	/* %rax += 1 - CF: stay on a NUL, else step past the byte */
-#   else
-	mov	%rdi, %rax	/* return %rdi (first-argument register; presumably the original destination) */
-#   endif
-	ret
-
-#  endif
-# endif
-
-# ifdef USE_AS_STRNCPY
-	.p2align 4
-L(StrncpyLeaveCase2OrCase3):	/* Finish a 64-byte group held in %xmm4..%xmm7: %rax != 0 takes the Case2 path (NUL scan per block), %rax == 0 the Case3 path (length only).  */
-	test	%rax, %rax
-	jnz	L(Aligned64LeaveCase2)
-
-L(Aligned64LeaveCase3):	/* store whole blocks while more than 16 bytes of %r8 remain */
-	lea	64(%r8), %r8
-	sub	$16, %r8
-	jbe	L(CopyFrom1To16BytesCase3)
-	movaps	%xmm4, -64(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(CopyFrom1To16BytesCase3)
-	movaps	%xmm5, -48(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(CopyFrom1To16BytesCase3)
-	movaps	%xmm6, -32(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8	/* %r8 -= 16 without touching flags */
-	jmp	L(CopyFrom1To16BytesCase3)
-
-L(Aligned64LeaveCase2):	/* assumes %xmm0 is all-zero on entry (set outside this chunk) - TODO confirm */
-	pcmpeqb	%xmm4, %xmm0	/* per-byte compare of block 0 against %xmm0 */
-	pmovmskb %xmm0, %rax	/* %rax = bit mask of matching bytes */
-	add	$48, %r8
-	jle	L(CopyFrom1To16BytesCase2OrCase3)
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16Bytes)
-
-	pcmpeqb	%xmm5, %xmm0	/* %xmm0 is the previous compare result, which was all-zero since %rax == 0 */
-	pmovmskb %xmm0, %rax
-	movaps	%xmm4, -64(%rdx)	/* block 0 had no match: store it whole */
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(CopyFrom1To16BytesCase2OrCase3)
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16Bytes)
-
-	pcmpeqb	%xmm6, %xmm0
-	pmovmskb %xmm0, %rax
-	movaps	%xmm5, -48(%rdx)	/* block 1 had no match: store it whole */
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(CopyFrom1To16BytesCase2OrCase3)
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16Bytes)
-
-	pcmpeqb	%xmm7, %xmm0
-	pmovmskb %xmm0, %rax
-	movaps	%xmm6, -32(%rdx)	/* block 2 had no match: store it whole */
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8	/* %r8 -= 16 without touching flags */
-	jmp	L(CopyFrom1To16BytesCase2)
-/*--------------------------------------------------*/
-	.p2align 4
-L(StrncpyExit1Case2OrCase3):	/* Copy the 16 bytes starting 1 byte before (%rcx) to the same offset from (%rdx), set %rsi = 15, then pick Case2 or Case3 on %rax.  */
-	movdqu	-1(%rcx), %xmm0
-	movdqu	%xmm0, -1(%rdx)
-	mov	$15, %rsi
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16BytesCase2)	/* %rax != 0 */
-	jmp	L(CopyFrom1To16BytesCase3)	/* %rax == 0 */
-
-	.p2align 4
-L(StrncpyExit2Case2OrCase3):	/* Copy the 16 bytes starting 2 bytes before (%rcx) to the same offset from (%rdx), set %rsi = 14, then pick Case2 or Case3 on %rax.  */
-	movdqu	-2(%rcx), %xmm0
-	movdqu	%xmm0, -2(%rdx)
-	mov	$14, %rsi
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16BytesCase2)	/* %rax != 0 */
-	jmp	L(CopyFrom1To16BytesCase3)	/* %rax == 0 */
-
-	.p2align 4
-L(StrncpyExit3Case2OrCase3):	/* Copy the 16 bytes starting 3 bytes before (%rcx) to the same offset from (%rdx), set %rsi = 13, then pick Case2 or Case3 on %rax.  */
-	movdqu	-3(%rcx), %xmm0
-	movdqu	%xmm0, -3(%rdx)
-	mov	$13, %rsi
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16BytesCase2)	/* %rax != 0 */
-	jmp	L(CopyFrom1To16BytesCase3)	/* %rax == 0 */
-
-	.p2align 4
-L(StrncpyExit4Case2OrCase3):	/* Copy the 16 bytes starting 4 bytes before (%rcx) to the same offset from (%rdx), set %rsi = 12, then pick Case2 or Case3 on %rax.  */
-	movdqu	-4(%rcx), %xmm0
-	movdqu	%xmm0, -4(%rdx)
-	mov	$12, %rsi
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16BytesCase2)	/* %rax != 0 */
-	jmp	L(CopyFrom1To16BytesCase3)	/* %rax == 0 */
-
-	.p2align 4
-L(StrncpyExit5Case2OrCase3):	/* Copy the 16 bytes starting 5 bytes before (%rcx) to the same offset from (%rdx), set %rsi = 11, then pick Case2 or Case3 on %rax.  */
-	movdqu	-5(%rcx), %xmm0
-	movdqu	%xmm0, -5(%rdx)
-	mov	$11, %rsi
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16BytesCase2)	/* %rax != 0 */
-	jmp	L(CopyFrom1To16BytesCase3)	/* %rax == 0 */
-
-	.p2align 4
-L(StrncpyExit6Case2OrCase3):	/* Copy 10 bytes from (%rcx) to (%rdx), set %rsi = 10, then pick Case2 or Case3 on %rax.  */
-	mov	(%rcx), %rsi	/* bytes 0..7 (%rsi is only a temporary here) */
-	mov	6(%rcx), %r9d	/* bytes 6..9, overlapping by two bytes */
-	mov	%r9d, 6(%rdx)
-	mov	%rsi, (%rdx)
-	test	%rax, %rax
-	mov	$10, %rsi	/* mov leaves the flags from test intact */
-	jnz	L(CopyFrom1To16BytesCase2)	/* %rax != 0 */
-	jmp	L(CopyFrom1To16BytesCase3)	/* %rax == 0 */
-
-	.p2align 4
-L(StrncpyExit7Case2OrCase3):	/* Copy 9 bytes from (%rcx) to (%rdx), set %rsi = 9, then pick Case2 or Case3 on %rax.  */
-	mov	(%rcx), %rsi	/* bytes 0..7 (%rsi is only a temporary here) */
-	mov	5(%rcx), %r9d	/* bytes 5..8, overlapping by three bytes */
-	mov	%r9d, 5(%rdx)
-	mov	%rsi, (%rdx)
-	test	%rax, %rax
-	mov	$9, %rsi	/* mov leaves the flags from test intact */
-	jnz	L(CopyFrom1To16BytesCase2)	/* %rax != 0 */
-	jmp	L(CopyFrom1To16BytesCase3)	/* %rax == 0 */
-
-	.p2align 4
-L(StrncpyExit8Case2OrCase3):	/* Copy 8 bytes from (%rcx) to (%rdx), set %rsi = 8, then pick Case2 or Case3 on %rax.  */
-	mov	(%rcx), %r9
-	mov	$8, %rsi
-	mov	%r9, (%rdx)
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16BytesCase2)	/* %rax != 0 */
-	jmp	L(CopyFrom1To16BytesCase3)	/* %rax == 0 */
-
-	.p2align 4
-L(StrncpyExit9Case2OrCase3):	/* Copy the 8 bytes starting 1 byte before (%rcx) to the same offset from (%rdx), set %rsi = 7, then pick Case2 or Case3 on %rax.  */
-	mov	-1(%rcx), %r9
-	mov	$7, %rsi
-	mov	%r9, -1(%rdx)
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16BytesCase2)	/* %rax != 0 */
-	jmp	L(CopyFrom1To16BytesCase3)	/* %rax == 0 */
-
-	.p2align 4
-L(StrncpyExit10Case2OrCase3):	/* Copy the 8 bytes starting 2 bytes before (%rcx) to the same offset from (%rdx), set %rsi = 6, then pick Case2 or Case3 on %rax.  */
-	mov	-2(%rcx), %r9
-	mov	$6, %rsi
-	mov	%r9, -2(%rdx)
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16BytesCase2)	/* %rax != 0 */
-	jmp	L(CopyFrom1To16BytesCase3)	/* %rax == 0 */
-
-	.p2align 4
-L(StrncpyExit11Case2OrCase3):	/* Copy the 8 bytes starting 3 bytes before (%rcx) to the same offset from (%rdx), set %rsi = 5, then pick Case2 or Case3 on %rax.  */
-	mov	-3(%rcx), %r9
-	mov	$5, %rsi
-	mov	%r9, -3(%rdx)
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16BytesCase2)	/* %rax != 0 */
-	jmp	L(CopyFrom1To16BytesCase3)	/* %rax == 0 */
-
-	.p2align 4
-L(StrncpyExit12Case2OrCase3):	/* Copy 4 bytes from (%rcx) to (%rdx), set %rsi = 4, then pick Case2 or Case3 on %rax.  */
-	mov	(%rcx), %r9d
-	mov	$4, %rsi
-	mov	%r9d, (%rdx)
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16BytesCase2)	/* %rax != 0 */
-	jmp	L(CopyFrom1To16BytesCase3)	/* %rax == 0 */
-
-	.p2align 4
-L(StrncpyExit13Case2OrCase3):	/* Copy the 4 bytes starting 1 byte before (%rcx) to the same offset from (%rdx), set %rsi = 3, then pick Case2 or Case3 on %rax.  */
-	mov	-1(%rcx), %r9d
-	mov	$3, %rsi
-	mov	%r9d, -1(%rdx)
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16BytesCase2)	/* %rax != 0 */
-	jmp	L(CopyFrom1To16BytesCase3)	/* %rax == 0 */
-
-	.p2align 4
-L(StrncpyExit14Case2OrCase3):	/* Copy the 4 bytes starting 2 bytes before (%rcx) to the same offset from (%rdx), set %rsi = 2, then pick Case2 or Case3 on %rax.  */
-	mov	-2(%rcx), %r9d
-	mov	$2, %rsi
-	mov	%r9d, -2(%rdx)
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16BytesCase2)	/* %rax != 0 */
-	jmp	L(CopyFrom1To16BytesCase3)	/* %rax == 0 */
-
-	.p2align 4
-L(StrncpyExit15Case2OrCase3):	/* Copy the 4 bytes starting 3 bytes before (%rcx) to the same offset from (%rdx), set %rsi = 1, then pick Case2 or Case3 on %rax.  */
-	mov	-3(%rcx), %r9d
-	mov	$1, %rsi
-	mov	%r9d, -3(%rdx)
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16BytesCase2)	/* %rax != 0 */
-	jmp	L(CopyFrom1To16BytesCase3)	/* %rax == 0 */
-
-	.p2align 4
-L(StrncpyLeave1):	/* Source offset 1: add 48 to %r8, then store up to four more 16-byte blocks at (%rdx), leaving for L(StrncpyExit1) as soon as %r8 runs out.  */
-	movaps	%xmm2, %xmm3	/* keep the unshifted block for the second palignr */
-	add	$48, %r8
-	jle	L(StrncpyExit1)
-	palignr	$1, %xmm1, %xmm2	/* %xmm2 = bytes 1..16 of the pair %xmm2:%xmm1 */
-	movaps	%xmm2, (%rdx)
-	movaps	31(%rcx), %xmm2	/* aligned load: %rcx + 31 must be 16-byte aligned */
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit1)
-	palignr	$1, %xmm3, %xmm2	/* same shift across the saved block and the new one */
-	movaps	%xmm2, 16(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit1)
-	movaps	%xmm4, 32(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit1)
-	movaps	%xmm5, 48(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8	/* %r8 -= 16 without touching flags */
-
-L(StrncpyExit1):	/* Advance both pointers by %rsi + 15 and copy the 15 bytes just before the new positions.  */
-	lea	15(%rdx, %rsi), %rdx
-	lea	15(%rcx, %rsi), %rcx
-	mov	-15(%rcx), %rsi	/* first 8 of the 15 bytes (%rsi reused as a temporary) */
-	mov	-8(%rcx), %rax	/* last 8 bytes, overlapping by one */
-	mov	%rsi, -15(%rdx)
-	mov	%rax, -8(%rdx)
-	xor	%rsi, %rsi
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave2):	/* Source offset 2: add 48 to %r8, then store up to four more 16-byte blocks at (%rdx), leaving for L(StrncpyExit2) as soon as %r8 runs out.  */
-	movaps	%xmm2, %xmm3	/* keep the unshifted block for the second palignr */
-	add	$48, %r8
-	jle	L(StrncpyExit2)
-	palignr	$2, %xmm1, %xmm2	/* %xmm2 = bytes 2..17 of the pair %xmm2:%xmm1 */
-	movaps	%xmm2, (%rdx)
-	movaps	30(%rcx), %xmm2	/* aligned load: %rcx + 30 must be 16-byte aligned */
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit2)
-	palignr	$2, %xmm3, %xmm2	/* same shift across the saved block and the new one */
-	movaps	%xmm2, 16(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit2)
-	movaps	%xmm4, 32(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit2)
-	movaps	%xmm5, 48(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8	/* %r8 -= 16 without touching flags */
-
-L(StrncpyExit2):	/* Advance both pointers by %rsi + 14 and copy the 14 bytes just before the new positions.  */
-	lea	14(%rdx, %rsi), %rdx
-	lea	14(%rcx, %rsi), %rcx
-	mov	-14(%rcx), %rsi	/* first 8 of the 14 bytes (%rsi reused as a temporary) */
-	mov	-8(%rcx), %rax	/* last 8 bytes, overlapping by two */
-	mov	%rsi, -14(%rdx)
-	mov	%rax, -8(%rdx)
-	xor	%rsi, %rsi
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave3):	/* Source offset 3: add 48 to %r8, then store up to four more 16-byte blocks at (%rdx), leaving for L(StrncpyExit3) as soon as %r8 runs out.  */
-	movaps	%xmm2, %xmm3	/* keep the unshifted block for the second palignr */
-	add	$48, %r8
-	jle	L(StrncpyExit3)
-	palignr	$3, %xmm1, %xmm2	/* %xmm2 = bytes 3..18 of the pair %xmm2:%xmm1 */
-	movaps	%xmm2, (%rdx)
-	movaps	29(%rcx), %xmm2	/* aligned load: %rcx + 29 must be 16-byte aligned */
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit3)
-	palignr	$3, %xmm3, %xmm2	/* same shift across the saved block and the new one */
-	movaps	%xmm2, 16(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit3)
-	movaps	%xmm4, 32(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit3)
-	movaps	%xmm5, 48(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8	/* %r8 -= 16 without touching flags */
-
-L(StrncpyExit3):	/* Advance both pointers by %rsi + 13 and copy the 13 bytes just before the new positions.  */
-	lea	13(%rdx, %rsi), %rdx
-	lea	13(%rcx, %rsi), %rcx
-	mov	-13(%rcx), %rsi	/* first 8 of the 13 bytes (%rsi reused as a temporary) */
-	mov	-8(%rcx), %rax	/* last 8 bytes, overlapping by three */
-	mov	%rsi, -13(%rdx)
-	mov	%rax, -8(%rdx)
-	xor	%rsi, %rsi
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave4):	/* Source offset 4: add 48 to %r8, then store up to four more 16-byte blocks at (%rdx), leaving for L(StrncpyExit4) as soon as %r8 runs out.  */
-	movaps	%xmm2, %xmm3	/* keep the unshifted block for the second palignr */
-	add	$48, %r8
-	jle	L(StrncpyExit4)
-	palignr	$4, %xmm1, %xmm2	/* %xmm2 = bytes 4..19 of the pair %xmm2:%xmm1 */
-	movaps	%xmm2, (%rdx)
-	movaps	28(%rcx), %xmm2	/* aligned load: %rcx + 28 must be 16-byte aligned */
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit4)
-	palignr	$4, %xmm3, %xmm2	/* same shift across the saved block and the new one */
-	movaps	%xmm2, 16(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit4)
-	movaps	%xmm4, 32(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit4)
-	movaps	%xmm5, 48(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8	/* %r8 -= 16 without touching flags */
-
-L(StrncpyExit4):	/* Advance both pointers by %rsi + 12 and copy the 12 bytes just before the new positions.  */
-	lea	12(%rdx, %rsi), %rdx
-	lea	12(%rcx, %rsi), %rcx
-	mov	-12(%rcx), %rsi	/* first 8 of the 12 bytes (%rsi reused as a temporary) */
-	mov	-4(%rcx), %eax	/* last 4 bytes */
-	mov	%rsi, -12(%rdx)
-	mov	%eax, -4(%rdx)
-	xor	%rsi, %rsi
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave5):	/* Source offset 5: add 48 to %r8, then store up to four more 16-byte blocks at (%rdx), leaving for L(StrncpyExit5) as soon as %r8 runs out.  */
-	movaps	%xmm2, %xmm3	/* keep the unshifted block for the second palignr */
-	add	$48, %r8
-	jle	L(StrncpyExit5)
-	palignr	$5, %xmm1, %xmm2	/* %xmm2 = bytes 5..20 of the pair %xmm2:%xmm1 */
-	movaps	%xmm2, (%rdx)
-	movaps	27(%rcx), %xmm2	/* aligned load: %rcx + 27 must be 16-byte aligned */
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit5)
-	palignr	$5, %xmm3, %xmm2	/* same shift across the saved block and the new one */
-	movaps	%xmm2, 16(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit5)
-	movaps	%xmm4, 32(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit5)
-	movaps	%xmm5, 48(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8	/* %r8 -= 16 without touching flags */
-
-L(StrncpyExit5):	/* Advance both pointers by %rsi + 11 and copy the 11 bytes just before the new positions.  */
-	lea	11(%rdx, %rsi), %rdx
-	lea	11(%rcx, %rsi), %rcx
-	mov	-11(%rcx), %rsi	/* first 8 of the 11 bytes (%rsi reused as a temporary) */
-	mov	-4(%rcx), %eax	/* last 4 bytes, overlapping by one */
-	mov	%rsi, -11(%rdx)
-	mov	%eax, -4(%rdx)
-	xor	%rsi, %rsi
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave6):	/* Source offset 6: add 48 to %r8, then store up to four more 16-byte blocks at (%rdx), leaving for L(StrncpyExit6) as soon as %r8 runs out.  */
-	movaps	%xmm2, %xmm3	/* keep the unshifted block for the second palignr */
-	add	$48, %r8
-	jle	L(StrncpyExit6)
-	palignr	$6, %xmm1, %xmm2	/* %xmm2 = bytes 6..21 of the pair %xmm2:%xmm1 */
-	movaps	%xmm2, (%rdx)
-	movaps	26(%rcx), %xmm2	/* aligned load: %rcx + 26 must be 16-byte aligned */
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit6)
-	palignr	$6, %xmm3, %xmm2	/* same shift across the saved block and the new one */
-	movaps	%xmm2, 16(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit6)
-	movaps	%xmm4, 32(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit6)
-	movaps	%xmm5, 48(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8	/* %r8 -= 16 without touching flags */
-
-L(StrncpyExit6):	/* Advance both pointers by %rsi + 10 and copy the 10 bytes just before the new positions.  */
-	lea	10(%rdx, %rsi), %rdx
-	lea	10(%rcx, %rsi), %rcx
-	mov	-10(%rcx), %rsi	/* first 8 of the 10 bytes (%rsi reused as a temporary) */
-	movw	-2(%rcx), %ax	/* last 2 bytes */
-	mov	%rsi, -10(%rdx)
-	movw	%ax, -2(%rdx)
-	xor	%rsi, %rsi
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave7):	/* Source offset 7: add 48 to %r8, then store up to four more 16-byte blocks at (%rdx), leaving for L(StrncpyExit7) as soon as %r8 runs out.  */
-	movaps	%xmm2, %xmm3	/* keep the unshifted block for the second palignr */
-	add	$48, %r8
-	jle	L(StrncpyExit7)
-	palignr	$7, %xmm1, %xmm2	/* %xmm2 = bytes 7..22 of the pair %xmm2:%xmm1 */
-	movaps	%xmm2, (%rdx)
-	movaps	25(%rcx), %xmm2	/* aligned load: %rcx + 25 must be 16-byte aligned */
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit7)
-	palignr	$7, %xmm3, %xmm2	/* same shift across the saved block and the new one */
-	movaps	%xmm2, 16(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit7)
-	movaps	%xmm4, 32(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit7)
-	movaps	%xmm5, 48(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8	/* %r8 -= 16 without touching flags */
-
-L(StrncpyExit7):	/* Advance both pointers by %rsi + 9 and copy the 9 bytes just before the new positions.  */
-	lea	9(%rdx, %rsi), %rdx
-	lea	9(%rcx, %rsi), %rcx
-	mov	-9(%rcx), %rsi	/* first 8 of the 9 bytes (%rsi reused as a temporary) */
-	movb	-1(%rcx), %ah	/* last byte, carried in %ah */
-	mov	%rsi, -9(%rdx)
-	movb	%ah, -1(%rdx)
-	xor	%rsi, %rsi
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave8):	/* Source offset 8: add 48 to %r8, then store up to four more 16-byte blocks at (%rdx), leaving for L(StrncpyExit8) as soon as %r8 runs out.  */
-	movaps	%xmm2, %xmm3	/* keep the unshifted block for the second palignr */
-	add	$48, %r8
-	jle	L(StrncpyExit8)
-	palignr	$8, %xmm1, %xmm2	/* %xmm2 = bytes 8..23 of the pair %xmm2:%xmm1 */
-	movaps	%xmm2, (%rdx)
-	movaps	24(%rcx), %xmm2	/* aligned load: %rcx + 24 must be 16-byte aligned */
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit8)
-	palignr	$8, %xmm3, %xmm2	/* same shift across the saved block and the new one */
-	movaps	%xmm2, 16(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit8)
-	movaps	%xmm4, 32(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit8)
-	movaps	%xmm5, 48(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8	/* %r8 -= 16 without touching flags */
-
-L(StrncpyExit8):	/* Advance both pointers by %rsi + 8 and copy the 8 bytes just before the new positions.  */
-	lea	8(%rdx, %rsi), %rdx
-	lea	8(%rcx, %rsi), %rcx
-	mov	-8(%rcx), %rax
-	xor	%rsi, %rsi
-	mov	%rax, -8(%rdx)
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave9):	/* Source offset 9: add 48 to %r8, then store up to four more 16-byte blocks at (%rdx), leaving for L(StrncpyExit9) as soon as %r8 runs out.  */
-	movaps	%xmm2, %xmm3	/* keep the unshifted block for the second palignr */
-	add	$48, %r8
-	jle	L(StrncpyExit9)
-	palignr	$9, %xmm1, %xmm2	/* %xmm2 = bytes 9..24 of the pair %xmm2:%xmm1 */
-	movaps	%xmm2, (%rdx)
-	movaps	23(%rcx), %xmm2	/* aligned load: %rcx + 23 must be 16-byte aligned */
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit9)
-	palignr	$9, %xmm3, %xmm2	/* same shift across the saved block and the new one */
-	movaps	%xmm2, 16(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit9)
-	movaps	%xmm4, 32(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit9)
-	movaps	%xmm5, 48(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8	/* %r8 -= 16 without touching flags */
-
-L(StrncpyExit9):	/* Advance both pointers by %rsi + 7 and copy the 8 bytes ending at the new positions.  */
-	lea	7(%rdx, %rsi), %rdx
-	lea	7(%rcx, %rsi), %rcx
-	mov	-8(%rcx), %rax	/* 8-byte move reaches one byte before the 7 just skipped */
-	xor	%rsi, %rsi
-	mov	%rax, -8(%rdx)
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave10):	/* Source offset 10: add 48 to %r8, then store up to four more 16-byte blocks at (%rdx), leaving for L(StrncpyExit10) as soon as %r8 runs out.  */
-	movaps	%xmm2, %xmm3	/* keep the unshifted block for the second palignr */
-	add	$48, %r8
-	jle	L(StrncpyExit10)
-	palignr	$10, %xmm1, %xmm2	/* %xmm2 = bytes 10..25 of the pair %xmm2:%xmm1 */
-	movaps	%xmm2, (%rdx)
-	movaps	22(%rcx), %xmm2	/* aligned load: %rcx + 22 must be 16-byte aligned */
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit10)
-	palignr	$10, %xmm3, %xmm2	/* same shift across the saved block and the new one */
-	movaps	%xmm2, 16(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit10)
-	movaps	%xmm4, 32(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit10)
-	movaps	%xmm5, 48(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8	/* %r8 -= 16 without touching flags */
-
-L(StrncpyExit10):	/* Advance both pointers by %rsi + 6 and copy the 8 bytes ending at the new positions.  */
-	lea	6(%rdx, %rsi), %rdx
-	lea	6(%rcx, %rsi), %rcx
-	mov	-8(%rcx), %rax	/* 8-byte move reaches two bytes before the 6 just skipped */
-	xor	%rsi, %rsi
-	mov	%rax, -8(%rdx)
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave11):	/* Source offset 11: add 48 to %r8, then store up to four more 16-byte blocks at (%rdx), leaving for L(StrncpyExit11) as soon as %r8 runs out.  */
-	movaps	%xmm2, %xmm3	/* keep the unshifted block for the second palignr */
-	add	$48, %r8
-	jle	L(StrncpyExit11)
-	palignr	$11, %xmm1, %xmm2	/* %xmm2 = bytes 11..26 of the pair %xmm2:%xmm1 */
-	movaps	%xmm2, (%rdx)
-	movaps	21(%rcx), %xmm2	/* aligned load: %rcx + 21 must be 16-byte aligned */
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit11)
-	palignr	$11, %xmm3, %xmm2	/* same shift across the saved block and the new one */
-	movaps	%xmm2, 16(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit11)
-	movaps	%xmm4, 32(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit11)
-	movaps	%xmm5, 48(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8	/* %r8 -= 16 without touching flags */
-
-L(StrncpyExit11):	/* Advance both pointers by %rsi + 5 and copy the 8 bytes ending at the new positions.  */
-	lea	5(%rdx, %rsi), %rdx
-	lea	5(%rcx, %rsi), %rcx
-	mov	-8(%rcx), %rax	/* 8-byte move reaches three bytes before the 5 just skipped */
-	xor	%rsi, %rsi
-	mov	%rax, -8(%rdx)
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave12):	/* Source offset 12: add 48 to %r8, then store up to four more 16-byte blocks at (%rdx), leaving for L(StrncpyExit12) as soon as %r8 runs out.  */
-	movaps	%xmm2, %xmm3	/* keep the unshifted block for the second palignr */
-	add	$48, %r8
-	jle	L(StrncpyExit12)
-	palignr	$12, %xmm1, %xmm2	/* %xmm2 = bytes 12..27 of the pair %xmm2:%xmm1 */
-	movaps	%xmm2, (%rdx)
-	movaps	20(%rcx), %xmm2	/* aligned load: %rcx + 20 must be 16-byte aligned */
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit12)
-	palignr	$12, %xmm3, %xmm2	/* same shift across the saved block and the new one */
-	movaps	%xmm2, 16(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit12)
-	movaps	%xmm4, 32(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit12)
-	movaps	%xmm5, 48(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8	/* %r8 -= 16 without touching flags */
-
-L(StrncpyExit12):	/* Advance both pointers by %rsi + 4 and copy the 4 bytes just before the new positions.  */
-	lea	4(%rdx, %rsi), %rdx
-	lea	4(%rcx, %rsi), %rcx
-	mov	-4(%rcx), %eax
-	xor	%rsi, %rsi
-	mov	%eax, -4(%rdx)
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave13):	/* Source offset 13: add 48 to %r8, then store up to four more 16-byte blocks at (%rdx), leaving for L(StrncpyExit13) as soon as %r8 runs out.  */
-	movaps	%xmm2, %xmm3	/* keep the unshifted block for the second palignr */
-	add	$48, %r8
-	jle	L(StrncpyExit13)
-	palignr	$13, %xmm1, %xmm2	/* %xmm2 = bytes 13..28 of the pair %xmm2:%xmm1 */
-	movaps	%xmm2, (%rdx)
-	movaps	19(%rcx), %xmm2	/* aligned load: %rcx + 19 must be 16-byte aligned */
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit13)
-	palignr	$13, %xmm3, %xmm2	/* same shift across the saved block and the new one */
-	movaps	%xmm2, 16(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit13)
-	movaps	%xmm4, 32(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit13)
-	movaps	%xmm5, 48(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8	/* %r8 -= 16 without touching flags */
-
-L(StrncpyExit13):	/* Advance both pointers by %rsi + 3 and copy the 4 bytes ending at the new positions.  */
-	lea	3(%rdx, %rsi), %rdx
-	lea	3(%rcx, %rsi), %rcx
-	mov	-4(%rcx), %eax	/* 4-byte move reaches one byte before the 3 just skipped */
-	xor	%rsi, %rsi
-	mov	%eax, -4(%rdx)
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave14):	/* Source offset 14: add 48 to %r8, then store up to four more 16-byte blocks at (%rdx), leaving for L(StrncpyExit14) as soon as %r8 runs out.  */
-	movaps	%xmm2, %xmm3	/* keep the unshifted block for the second palignr */
-	add	$48, %r8
-	jle	L(StrncpyExit14)
-	palignr	$14, %xmm1, %xmm2	/* %xmm2 = bytes 14..29 of the pair %xmm2:%xmm1 */
-	movaps	%xmm2, (%rdx)
-	movaps	18(%rcx), %xmm2	/* aligned load: %rcx + 18 must be 16-byte aligned */
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit14)
-	palignr	$14, %xmm3, %xmm2	/* same shift across the saved block and the new one */
-	movaps	%xmm2, 16(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit14)
-	movaps	%xmm4, 32(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit14)
-	movaps	%xmm5, 48(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8	/* %r8 -= 16 without touching flags */
-
-L(StrncpyExit14):	/* Advance both pointers by %rsi + 2 and copy the 2 bytes just before the new positions.  */
-	lea	2(%rdx, %rsi), %rdx
-	lea	2(%rcx, %rsi), %rcx
-	movw	-2(%rcx), %ax
-	xor	%rsi, %rsi
-	movw	%ax, -2(%rdx)
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave15):	/* Source offset 15: add 48 to %r8, then store up to four more 16-byte blocks at (%rdx), leaving for L(StrncpyExit15) as soon as %r8 runs out.  */
-	movaps	%xmm2, %xmm3	/* keep the unshifted block for the second palignr */
-	add	$48, %r8
-	jle	L(StrncpyExit15)
-	palignr	$15, %xmm1, %xmm2	/* %xmm2 = bytes 15..30 of the pair %xmm2:%xmm1 */
-	movaps	%xmm2, (%rdx)
-	movaps	17(%rcx), %xmm2	/* aligned load: %rcx + 17 must be 16-byte aligned */
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit15)
-	palignr	$15, %xmm3, %xmm2	/* same shift across the saved block and the new one */
-	movaps	%xmm2, 16(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit15)
-	movaps	%xmm4, 32(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit15)
-	movaps	%xmm5, 48(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8	/* %r8 -= 16 without touching flags */
-
-L(StrncpyExit15):	/* Advance both pointers by %rsi + 1 and copy the single byte just before the new positions.  */
-	lea	1(%rdx, %rsi), %rdx
-	lea	1(%rcx, %rsi), %rcx
-	movb	-1(%rcx), %ah	/* the byte is carried in %ah */
-	xor	%rsi, %rsi
-	movb	%ah, -1(%rdx)
-	jmp	L(CopyFrom1To16BytesCase3)
-
-# endif
-# ifndef USE_AS_STRCAT
-END (STRCPY)
-# endif
-#endif
diff --git a/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/sysdeps/x86_64/multiarch/strncpy-ssse3.S
deleted file mode 100644
index bf82ee447d..0000000000
--- a/sysdeps/x86_64/multiarch/strncpy-ssse3.S
+++ /dev/null
@@ -1,3 +0,0 @@ 
-#define USE_AS_STRNCPY	/* enables the "# ifdef USE_AS_STRNCPY" sections of strcpy-ssse3.S */
-#define STRCPY __strncpy_ssse3	/* symbol name consumed by END (STRCPY) in the included file */
-#include "strcpy-ssse3.S"	/* the shared implementation is assembled under the two macros above */