diff mbox series

[v1] x86: Move strcpy SSE2 implementation to multiarch/strcpy-sse2.S

Message ID 20220712192910.351121-2-goldstein.w.n@gmail.com
State New
Headers show
Series [v1] x86: Move strcpy SSE2 implementation to multiarch/strcpy-sse2.S | expand

Commit Message

Noah Goldstein July 12, 2022, 7:29 p.m. UTC
This commit doesn't affect libc.so.6; it's just housekeeping to prepare
for adding explicit ISA level support.

Tested build on x86_64 and x86_32 with/without multiarch.
---
 sysdeps/x86_64/multiarch/rtld-stpcpy.S |  18 ++++
 sysdeps/x86_64/multiarch/stpcpy-sse2.S |  15 +--
 sysdeps/x86_64/multiarch/strcpy-sse2.S | 137 ++++++++++++++++++++++--
 sysdeps/x86_64/stpcpy.S                |   3 +-
 sysdeps/x86_64/strcpy.S                | 138 +------------------------
 5 files changed, 156 insertions(+), 155 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/rtld-stpcpy.S

Comments

H.J. Lu July 12, 2022, 11:23 p.m. UTC | #1
On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit doesn't affect libc.so.6, its just housekeeping to prepare
> for adding explicit ISA level support.
>
> Tested build on x86_64 and x86_32 with/without multiarch.
> ---
>  sysdeps/x86_64/multiarch/rtld-stpcpy.S |  18 ++++
>  sysdeps/x86_64/multiarch/stpcpy-sse2.S |  15 +--
>  sysdeps/x86_64/multiarch/strcpy-sse2.S | 137 ++++++++++++++++++++++--
>  sysdeps/x86_64/stpcpy.S                |   3 +-
>  sysdeps/x86_64/strcpy.S                | 138 +------------------------
>  5 files changed, 156 insertions(+), 155 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/rtld-stpcpy.S
>
> diff --git a/sysdeps/x86_64/multiarch/rtld-stpcpy.S b/sysdeps/x86_64/multiarch/rtld-stpcpy.S
> new file mode 100644
> index 0000000000..914141f07f
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/rtld-stpcpy.S
> @@ -0,0 +1,18 @@
> +/* Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include "../stpcpy.S"
> diff --git a/sysdeps/x86_64/multiarch/stpcpy-sse2.S b/sysdeps/x86_64/multiarch/stpcpy-sse2.S
> index 078504a44e..ea9f973af3 100644
> --- a/sysdeps/x86_64/multiarch/stpcpy-sse2.S
> +++ b/sysdeps/x86_64/multiarch/stpcpy-sse2.S
> @@ -17,17 +17,10 @@
>     <https://www.gnu.org/licenses/>.  */
>
>  #if IS_IN (libc)
> -
> -# include <sysdep.h>
> -# define __stpcpy __stpcpy_sse2
> -
> -# undef weak_alias
> -# define weak_alias(ignored1, ignored2)
> -# undef libc_hidden_def
> -# define libc_hidden_def(__stpcpy)
> -# undef libc_hidden_builtin_def
> -# define libc_hidden_builtin_def(stpcpy)
> +# ifndef STRCPY
> +#  define STRCPY       __stpcpy_sse2
> +# endif
>  #endif
>
>  #define USE_AS_STPCPY
> -#include <sysdeps/x86_64/stpcpy.S>
> +#include "strcpy-sse2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2.S b/sysdeps/x86_64/multiarch/strcpy-sse2.S
> index f37967c441..8b5db8b13d 100644
> --- a/sysdeps/x86_64/multiarch/strcpy-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strcpy-sse2.S
> @@ -17,12 +17,137 @@
>     <https://www.gnu.org/licenses/>.  */
>
>  #if IS_IN (libc)
> +# ifndef STRCPY
> +#  define STRCPY __strcpy_sse2
> +# endif
> +#endif
>
> -# include <sysdep.h>
> -# define strcpy __strcpy_sse2
> +#include <sysdep.h>
>
> -# undef libc_hidden_builtin_def
> -# define libc_hidden_builtin_def(strcpy)
> -#endif
> +       .text
> +ENTRY (STRCPY)
> +       movq %rsi, %rcx         /* Source register. */
> +       andl $7, %ecx           /* mask alignment bits */
> +       movq %rdi, %rdx         /* Duplicate destination pointer.  */
> +
> +       jz 5f                   /* aligned => start loop */
> +
> +       neg %ecx                /* We need to align to 8 bytes.  */
> +       addl $8,%ecx
> +       /* Search the first bytes directly.  */
> +0:
> +       movb    (%rsi), %al     /* Fetch a byte */
> +       testb   %al, %al        /* Is it NUL? */
> +       movb    %al, (%rdx)     /* Store it */
> +       jz      4f              /* If it was NUL, done! */
> +       incq    %rsi
> +       incq    %rdx
> +       decl    %ecx
> +       jnz     0b
> +
> +5:
> +       movq $0xfefefefefefefeff,%r8
> +
> +       /* Now the sources is aligned.  Unfortunatly we cannot force
> +          to have both source and destination aligned, so ignore the
> +          alignment of the destination.  */
> +       .p2align 4
> +1:
> +       /* 1st unroll.  */
> +       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> +       addq    $8, %rsi        /* Adjust pointer for next word.  */
> +       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> +       addq    %r8, %r9        /* add the magic value to the word.  We get
> +                                  carry bits reported for each byte which
> +                                  is *not* 0 */
> +       jnc     3f              /* highest byte is NUL => return pointer */
> +       xorq    %rax, %r9       /* (word+magic)^word */
> +       orq     %r8, %r9        /* set all non-carry bits */
> +       incq    %r9             /* add 1: if one carry bit was *not* set
> +                                  the addition will not result in 0.  */
> +
> +       jnz     3f              /* found NUL => return pointer */
> +
> +       movq    %rax, (%rdx)    /* Write value to destination.  */
> +       addq    $8, %rdx        /* Adjust pointer.  */
> +
> +       /* 2nd unroll.  */
> +       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> +       addq    $8, %rsi        /* Adjust pointer for next word.  */
> +       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> +       addq    %r8, %r9        /* add the magic value to the word.  We get
> +                                  carry bits reported for each byte which
> +                                  is *not* 0 */
> +       jnc     3f              /* highest byte is NUL => return pointer */
> +       xorq    %rax, %r9       /* (word+magic)^word */
> +       orq     %r8, %r9        /* set all non-carry bits */
> +       incq    %r9             /* add 1: if one carry bit was *not* set
> +                                  the addition will not result in 0.  */
> +
> +       jnz     3f              /* found NUL => return pointer */
>
> -#include <sysdeps/x86_64/strcpy.S>
> +       movq    %rax, (%rdx)    /* Write value to destination.  */
> +       addq    $8, %rdx        /* Adjust pointer.  */
> +
> +       /* 3rd unroll.  */
> +       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> +       addq    $8, %rsi        /* Adjust pointer for next word.  */
> +       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> +       addq    %r8, %r9        /* add the magic value to the word.  We get
> +                                  carry bits reported for each byte which
> +                                  is *not* 0 */
> +       jnc     3f              /* highest byte is NUL => return pointer */
> +       xorq    %rax, %r9       /* (word+magic)^word */
> +       orq     %r8, %r9        /* set all non-carry bits */
> +       incq    %r9             /* add 1: if one carry bit was *not* set
> +                                  the addition will not result in 0.  */
> +
> +       jnz     3f              /* found NUL => return pointer */
> +
> +       movq    %rax, (%rdx)    /* Write value to destination.  */
> +       addq    $8, %rdx        /* Adjust pointer.  */
> +
> +       /* 4th unroll.  */
> +       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> +       addq    $8, %rsi        /* Adjust pointer for next word.  */
> +       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> +       addq    %r8, %r9        /* add the magic value to the word.  We get
> +                                  carry bits reported for each byte which
> +                                  is *not* 0 */
> +       jnc     3f              /* highest byte is NUL => return pointer */
> +       xorq    %rax, %r9       /* (word+magic)^word */
> +       orq     %r8, %r9        /* set all non-carry bits */
> +       incq    %r9             /* add 1: if one carry bit was *not* set
> +                                  the addition will not result in 0.  */
> +
> +       jnz     3f              /* found NUL => return pointer */
> +
> +       movq    %rax, (%rdx)    /* Write value to destination.  */
> +       addq    $8, %rdx        /* Adjust pointer.  */
> +       jmp     1b              /* Next iteration.  */
> +
> +       /* Do the last few bytes. %rax contains the value to write.
> +          The loop is unrolled twice.  */
> +       .p2align 4
> +3:
> +       /* Note that stpcpy needs to return with the value of the NUL
> +          byte.  */
> +       movb    %al, (%rdx)     /* 1st byte.  */
> +       testb   %al, %al        /* Is it NUL.  */
> +       jz      4f              /* yes, finish.  */
> +       incq    %rdx            /* Increment destination.  */
> +       movb    %ah, (%rdx)     /* 2nd byte.  */
> +       testb   %ah, %ah        /* Is it NUL?.  */
> +       jz      4f              /* yes, finish.  */
> +       incq    %rdx            /* Increment destination.  */
> +       shrq    $16, %rax       /* Shift...  */
> +       jmp     3b              /* and look at next two bytes in %rax.  */
> +
> +4:
> +#ifdef USE_AS_STPCPY
> +       movq    %rdx, %rax      /* Destination is return value.  */
> +#else
> +       movq    %rdi, %rax      /* Source is return value.  */
> +#endif
> +       retq
> +END (STRCPY)
> diff --git a/sysdeps/x86_64/stpcpy.S b/sysdeps/x86_64/stpcpy.S
> index ec23de1416..b097c203dd 100644
> --- a/sysdeps/x86_64/stpcpy.S
> +++ b/sysdeps/x86_64/stpcpy.S
> @@ -1,7 +1,6 @@
> -#define USE_AS_STPCPY
>  #define STRCPY __stpcpy
>
> -#include <sysdeps/x86_64/strcpy.S>
> +#include "multiarch/stpcpy-sse2.S"
>
>  weak_alias (__stpcpy, stpcpy)
>  libc_hidden_def (__stpcpy)
> diff --git a/sysdeps/x86_64/strcpy.S b/sysdeps/x86_64/strcpy.S
> index 17e8073550..05f19e6e94 100644
> --- a/sysdeps/x86_64/strcpy.S
> +++ b/sysdeps/x86_64/strcpy.S
> @@ -16,140 +16,6 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#include <sysdep.h>
> -#include "asm-syntax.h"
> -
> -#ifndef USE_AS_STPCPY
> -# define STRCPY strcpy
> -#endif
> -
> -       .text
> -ENTRY (STRCPY)
> -       movq %rsi, %rcx         /* Source register. */
> -       andl $7, %ecx           /* mask alignment bits */
> -       movq %rdi, %rdx         /* Duplicate destination pointer.  */
> -
> -       jz 5f                   /* aligned => start loop */
> -
> -       neg %ecx                /* We need to align to 8 bytes.  */
> -       addl $8,%ecx
> -       /* Search the first bytes directly.  */
> -0:
> -       movb    (%rsi), %al     /* Fetch a byte */
> -       testb   %al, %al        /* Is it NUL? */
> -       movb    %al, (%rdx)     /* Store it */
> -       jz      4f              /* If it was NUL, done! */
> -       incq    %rsi
> -       incq    %rdx
> -       decl    %ecx
> -       jnz     0b
> -
> -5:
> -       movq $0xfefefefefefefeff,%r8
> -
> -       /* Now the sources is aligned.  Unfortunatly we cannot force
> -          to have both source and destination aligned, so ignore the
> -          alignment of the destination.  */
> -       .p2align 4
> -1:
> -       /* 1st unroll.  */
> -       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> -       addq    $8, %rsi        /* Adjust pointer for next word.  */
> -       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> -       addq    %r8, %r9        /* add the magic value to the word.  We get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc     3f              /* highest byte is NUL => return pointer */
> -       xorq    %rax, %r9       /* (word+magic)^word */
> -       orq     %r8, %r9        /* set all non-carry bits */
> -       incq    %r9             /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -
> -       jnz     3f              /* found NUL => return pointer */
> -
> -       movq    %rax, (%rdx)    /* Write value to destination.  */
> -       addq    $8, %rdx        /* Adjust pointer.  */
> -
> -       /* 2nd unroll.  */
> -       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> -       addq    $8, %rsi        /* Adjust pointer for next word.  */
> -       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> -       addq    %r8, %r9        /* add the magic value to the word.  We get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc     3f              /* highest byte is NUL => return pointer */
> -       xorq    %rax, %r9       /* (word+magic)^word */
> -       orq     %r8, %r9        /* set all non-carry bits */
> -       incq    %r9             /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -
> -       jnz     3f              /* found NUL => return pointer */
> -
> -       movq    %rax, (%rdx)    /* Write value to destination.  */
> -       addq    $8, %rdx        /* Adjust pointer.  */
> -
> -       /* 3rd unroll.  */
> -       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> -       addq    $8, %rsi        /* Adjust pointer for next word.  */
> -       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> -       addq    %r8, %r9        /* add the magic value to the word.  We get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc     3f              /* highest byte is NUL => return pointer */
> -       xorq    %rax, %r9       /* (word+magic)^word */
> -       orq     %r8, %r9        /* set all non-carry bits */
> -       incq    %r9             /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -
> -       jnz     3f              /* found NUL => return pointer */
> -
> -       movq    %rax, (%rdx)    /* Write value to destination.  */
> -       addq    $8, %rdx        /* Adjust pointer.  */
> -
> -       /* 4th unroll.  */
> -       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> -       addq    $8, %rsi        /* Adjust pointer for next word.  */
> -       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> -       addq    %r8, %r9        /* add the magic value to the word.  We get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc     3f              /* highest byte is NUL => return pointer */
> -       xorq    %rax, %r9       /* (word+magic)^word */
> -       orq     %r8, %r9        /* set all non-carry bits */
> -       incq    %r9             /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -
> -       jnz     3f              /* found NUL => return pointer */
> -
> -       movq    %rax, (%rdx)    /* Write value to destination.  */
> -       addq    $8, %rdx        /* Adjust pointer.  */
> -       jmp     1b              /* Next iteration.  */
> -
> -       /* Do the last few bytes. %rax contains the value to write.
> -          The loop is unrolled twice.  */
> -       .p2align 4
> -3:
> -       /* Note that stpcpy needs to return with the value of the NUL
> -          byte.  */
> -       movb    %al, (%rdx)     /* 1st byte.  */
> -       testb   %al, %al        /* Is it NUL.  */
> -       jz      4f              /* yes, finish.  */
> -       incq    %rdx            /* Increment destination.  */
> -       movb    %ah, (%rdx)     /* 2nd byte.  */
> -       testb   %ah, %ah        /* Is it NUL?.  */
> -       jz      4f              /* yes, finish.  */
> -       incq    %rdx            /* Increment destination.  */
> -       shrq    $16, %rax       /* Shift...  */
> -       jmp     3b              /* and look at next two bytes in %rax.  */
> -
> -4:
> -#ifdef USE_AS_STPCPY
> -       movq    %rdx, %rax      /* Destination is return value.  */
> -#else
> -       movq    %rdi, %rax      /* Source is return value.  */
> -#endif
> -       retq
> -END (STRCPY)
> -#ifndef USE_AS_STPCPY
> +#define STRCPY strcpy
> +#include "multiarch/strcpy-sse2.S"
>  libc_hidden_builtin_def (strcpy)
> -#endif
> --
> 2.34.1
>

LGTM.

Thanks.
diff mbox series

Patch

diff --git a/sysdeps/x86_64/multiarch/rtld-stpcpy.S b/sysdeps/x86_64/multiarch/rtld-stpcpy.S
new file mode 100644
index 0000000000..914141f07f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rtld-stpcpy.S
@@ -0,0 +1,18 @@ 
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "../stpcpy.S"
diff --git a/sysdeps/x86_64/multiarch/stpcpy-sse2.S b/sysdeps/x86_64/multiarch/stpcpy-sse2.S
index 078504a44e..ea9f973af3 100644
--- a/sysdeps/x86_64/multiarch/stpcpy-sse2.S
+++ b/sysdeps/x86_64/multiarch/stpcpy-sse2.S
@@ -17,17 +17,10 @@ 
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-
-# include <sysdep.h>
-# define __stpcpy __stpcpy_sse2
-
-# undef weak_alias
-# define weak_alias(ignored1, ignored2)
-# undef libc_hidden_def
-# define libc_hidden_def(__stpcpy)
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(stpcpy)
+# ifndef STRCPY
+#  define STRCPY	__stpcpy_sse2
+# endif
 #endif
 
 #define USE_AS_STPCPY
-#include <sysdeps/x86_64/stpcpy.S>
+#include "strcpy-sse2.S"
diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2.S b/sysdeps/x86_64/multiarch/strcpy-sse2.S
index f37967c441..8b5db8b13d 100644
--- a/sysdeps/x86_64/multiarch/strcpy-sse2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-sse2.S
@@ -17,12 +17,137 @@ 
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
+# ifndef STRCPY
+#  define STRCPY __strcpy_sse2
+# endif
+#endif
 
-# include <sysdep.h>
-# define strcpy __strcpy_sse2
+#include <sysdep.h>
 
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strcpy)
-#endif
+	.text
+ENTRY (STRCPY)
+	movq %rsi, %rcx		/* Source register. */
+	andl $7, %ecx		/* mask alignment bits */
+	movq %rdi, %rdx		/* Duplicate destination pointer.  */
+
+	jz 5f			/* aligned => start loop */
+
+	neg %ecx		/* We need to align to 8 bytes.  */
+	addl $8,%ecx
+	/* Search the first bytes directly.  */
+0:
+	movb	(%rsi), %al	/* Fetch a byte */
+	testb	%al, %al	/* Is it NUL? */
+	movb	%al, (%rdx)	/* Store it */
+	jz	4f		/* If it was NUL, done! */
+	incq	%rsi
+	incq	%rdx
+	decl	%ecx
+	jnz	0b
+
+5:
+	movq $0xfefefefefefefeff,%r8
+
+	/* Now the source is aligned.  Unfortunately we cannot force
+	   both source and destination to be aligned, so ignore the
+	   alignment of the destination.  */
+	.p2align 4
+1:
+	/* 1st unroll.  */
+	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
+	addq	$8, %rsi	/* Adjust pointer for next word.  */
+	movq	%rax, %r9	/* Save a copy for NUL finding.  */
+	addq	%r8, %r9	/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc	3f		/* highest byte is NUL => return pointer */
+	xorq	%rax, %r9	/* (word+magic)^word */
+	orq	%r8, %r9	/* set all non-carry bits */
+	incq	%r9		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	jnz	3f		/* found NUL => return pointer */
+
+	movq	%rax, (%rdx)	/* Write value to destination.  */
+	addq	$8, %rdx	/* Adjust pointer.  */
+
+	/* 2nd unroll.  */
+	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
+	addq	$8, %rsi	/* Adjust pointer for next word.  */
+	movq	%rax, %r9	/* Save a copy for NUL finding.  */
+	addq	%r8, %r9	/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc	3f		/* highest byte is NUL => return pointer */
+	xorq	%rax, %r9	/* (word+magic)^word */
+	orq	%r8, %r9	/* set all non-carry bits */
+	incq	%r9		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	jnz	3f		/* found NUL => return pointer */
 
-#include <sysdeps/x86_64/strcpy.S>
+	movq	%rax, (%rdx)	/* Write value to destination.  */
+	addq	$8, %rdx	/* Adjust pointer.  */
+
+	/* 3rd unroll.  */
+	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
+	addq	$8, %rsi	/* Adjust pointer for next word.  */
+	movq	%rax, %r9	/* Save a copy for NUL finding.  */
+	addq	%r8, %r9	/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc	3f		/* highest byte is NUL => return pointer */
+	xorq	%rax, %r9	/* (word+magic)^word */
+	orq	%r8, %r9	/* set all non-carry bits */
+	incq	%r9		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	jnz	3f		/* found NUL => return pointer */
+
+	movq	%rax, (%rdx)	/* Write value to destination.  */
+	addq	$8, %rdx	/* Adjust pointer.  */
+
+	/* 4th unroll.  */
+	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
+	addq	$8, %rsi	/* Adjust pointer for next word.  */
+	movq	%rax, %r9	/* Save a copy for NUL finding.  */
+	addq	%r8, %r9	/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc	3f		/* highest byte is NUL => return pointer */
+	xorq	%rax, %r9	/* (word+magic)^word */
+	orq	%r8, %r9	/* set all non-carry bits */
+	incq	%r9		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	jnz	3f		/* found NUL => return pointer */
+
+	movq	%rax, (%rdx)	/* Write value to destination.  */
+	addq	$8, %rdx	/* Adjust pointer.  */
+	jmp	1b		/* Next iteration.  */
+
+	/* Do the last few bytes. %rax contains the value to write.
+	   The loop is unrolled twice.  */
+	.p2align 4
+3:
+	/* Note that stpcpy needs to return the address of the NUL
+	   byte, so %rdx is left pointing at it.  */
+	movb	%al, (%rdx)	/* 1st byte.  */
+	testb	%al, %al	/* Is it NUL.  */
+	jz	4f		/* yes, finish.  */
+	incq	%rdx		/* Increment destination.  */
+	movb	%ah, (%rdx)	/* 2nd byte.  */
+	testb	%ah, %ah	/* Is it NUL?.  */
+	jz	4f		/* yes, finish.  */
+	incq	%rdx		/* Increment destination.  */
+	shrq	$16, %rax	/* Shift...  */
+	jmp	3b		/* and look at next two bytes in %rax.  */
+
+4:
+#ifdef USE_AS_STPCPY
+	movq	%rdx, %rax	/* Address of NUL in destination is return value.  */
+#else
+	movq	%rdi, %rax	/* Original destination is return value.  */
+#endif
+	retq
+END (STRCPY)
diff --git a/sysdeps/x86_64/stpcpy.S b/sysdeps/x86_64/stpcpy.S
index ec23de1416..b097c203dd 100644
--- a/sysdeps/x86_64/stpcpy.S
+++ b/sysdeps/x86_64/stpcpy.S
@@ -1,7 +1,6 @@ 
-#define USE_AS_STPCPY
 #define STRCPY __stpcpy
 
-#include <sysdeps/x86_64/strcpy.S>
+#include "multiarch/stpcpy-sse2.S"
 
 weak_alias (__stpcpy, stpcpy)
 libc_hidden_def (__stpcpy)
diff --git a/sysdeps/x86_64/strcpy.S b/sysdeps/x86_64/strcpy.S
index 17e8073550..05f19e6e94 100644
--- a/sysdeps/x86_64/strcpy.S
+++ b/sysdeps/x86_64/strcpy.S
@@ -16,140 +16,6 @@ 
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-#ifndef USE_AS_STPCPY
-# define STRCPY strcpy
-#endif
-
-	.text
-ENTRY (STRCPY)
-	movq %rsi, %rcx		/* Source register. */
-	andl $7, %ecx		/* mask alignment bits */
-	movq %rdi, %rdx		/* Duplicate destination pointer.  */
-
-	jz 5f			/* aligned => start loop */
-
-	neg %ecx		/* We need to align to 8 bytes.  */
-	addl $8,%ecx
-	/* Search the first bytes directly.  */
-0:
-	movb	(%rsi), %al	/* Fetch a byte */
-	testb	%al, %al	/* Is it NUL? */
-	movb	%al, (%rdx)	/* Store it */
-	jz	4f		/* If it was NUL, done! */
-	incq	%rsi
-	incq	%rdx
-	decl	%ecx
-	jnz	0b
-
-5:
-	movq $0xfefefefefefefeff,%r8
-
-	/* Now the sources is aligned.  Unfortunatly we cannot force
-	   to have both source and destination aligned, so ignore the
-	   alignment of the destination.  */
-	.p2align 4
-1:
-	/* 1st unroll.  */
-	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
-	addq	$8, %rsi	/* Adjust pointer for next word.  */
-	movq	%rax, %r9	/* Save a copy for NUL finding.  */
-	addq	%r8, %r9	/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc	3f		/* highest byte is NUL => return pointer */
-	xorq	%rax, %r9	/* (word+magic)^word */
-	orq	%r8, %r9	/* set all non-carry bits */
-	incq	%r9		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-
-	jnz	3f		/* found NUL => return pointer */
-
-	movq	%rax, (%rdx)	/* Write value to destination.  */
-	addq	$8, %rdx	/* Adjust pointer.  */
-
-	/* 2nd unroll.  */
-	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
-	addq	$8, %rsi	/* Adjust pointer for next word.  */
-	movq	%rax, %r9	/* Save a copy for NUL finding.  */
-	addq	%r8, %r9	/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc	3f		/* highest byte is NUL => return pointer */
-	xorq	%rax, %r9	/* (word+magic)^word */
-	orq	%r8, %r9	/* set all non-carry bits */
-	incq	%r9		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-
-	jnz	3f		/* found NUL => return pointer */
-
-	movq	%rax, (%rdx)	/* Write value to destination.  */
-	addq	$8, %rdx	/* Adjust pointer.  */
-
-	/* 3rd unroll.  */
-	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
-	addq	$8, %rsi	/* Adjust pointer for next word.  */
-	movq	%rax, %r9	/* Save a copy for NUL finding.  */
-	addq	%r8, %r9	/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc	3f		/* highest byte is NUL => return pointer */
-	xorq	%rax, %r9	/* (word+magic)^word */
-	orq	%r8, %r9	/* set all non-carry bits */
-	incq	%r9		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-
-	jnz	3f		/* found NUL => return pointer */
-
-	movq	%rax, (%rdx)	/* Write value to destination.  */
-	addq	$8, %rdx	/* Adjust pointer.  */
-
-	/* 4th unroll.  */
-	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
-	addq	$8, %rsi	/* Adjust pointer for next word.  */
-	movq	%rax, %r9	/* Save a copy for NUL finding.  */
-	addq	%r8, %r9	/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc	3f		/* highest byte is NUL => return pointer */
-	xorq	%rax, %r9	/* (word+magic)^word */
-	orq	%r8, %r9	/* set all non-carry bits */
-	incq	%r9		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-
-	jnz	3f		/* found NUL => return pointer */
-
-	movq	%rax, (%rdx)	/* Write value to destination.  */
-	addq	$8, %rdx	/* Adjust pointer.  */
-	jmp	1b		/* Next iteration.  */
-
-	/* Do the last few bytes. %rax contains the value to write.
-	   The loop is unrolled twice.  */
-	.p2align 4
-3:
-	/* Note that stpcpy needs to return with the value of the NUL
-	   byte.  */
-	movb	%al, (%rdx)	/* 1st byte.  */
-	testb	%al, %al	/* Is it NUL.  */
-	jz	4f		/* yes, finish.  */
-	incq	%rdx		/* Increment destination.  */
-	movb	%ah, (%rdx)	/* 2nd byte.  */
-	testb	%ah, %ah	/* Is it NUL?.  */
-	jz	4f		/* yes, finish.  */
-	incq	%rdx		/* Increment destination.  */
-	shrq	$16, %rax	/* Shift...  */
-	jmp	3b		/* and look at next two bytes in %rax.  */
-
-4:
-#ifdef USE_AS_STPCPY
-	movq	%rdx, %rax	/* Destination is return value.  */
-#else
-	movq	%rdi, %rax	/* Source is return value.  */
-#endif
-	retq
-END (STRCPY)
-#ifndef USE_AS_STPCPY
+#define STRCPY	strcpy
+#include "multiarch/strcpy-sse2.S"
 libc_hidden_builtin_def (strcpy)
-#endif