diff mbox series

[v1] x86: Move strchr SSE2 implementation to multiarch/strchr-sse2.S

Message ID 20220712192910.351121-5-goldstein.w.n@gmail.com
State New
Headers show
Series [v1] x86: Move strchr SSE2 implementation to multiarch/strchr-sse2.S | expand

Commit Message

Noah Goldstein July 12, 2022, 7:29 p.m. UTC
This commit doesn't affect libc.so.6, it's just housekeeping to prepare
for adding explicit ISA level support.

Tested build on x86_64 and x86_32 with/without multiarch.
---
 sysdeps/x86_64/multiarch/rtld-strchr.S    |  18 +++
 sysdeps/x86_64/multiarch/rtld-strchrnul.S |  18 +++
 sysdeps/x86_64/multiarch/strchr-sse2.S    | 175 +++++++++++++++++++++-
 sysdeps/x86_64/multiarch/strchrnul-sse2.S |  11 +-
 sysdeps/x86_64/strchr.S                   | 167 +--------------------
 sysdeps/x86_64/strchrnul.S                |   7 +-
 6 files changed, 213 insertions(+), 183 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/rtld-strchr.S
 create mode 100644 sysdeps/x86_64/multiarch/rtld-strchrnul.S

Comments

H.J. Lu July 12, 2022, 9:27 p.m. UTC | #1
On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit doesn't affect libc.so.6, its just housekeeping to prepare
> for adding explicit ISA level support.
>
> Tested build on x86_64 and x86_32 with/without multiarch.
> ---
>  sysdeps/x86_64/multiarch/rtld-strchr.S    |  18 +++
>  sysdeps/x86_64/multiarch/rtld-strchrnul.S |  18 +++
>  sysdeps/x86_64/multiarch/strchr-sse2.S    | 175 +++++++++++++++++++++-
>  sysdeps/x86_64/multiarch/strchrnul-sse2.S |  11 +-
>  sysdeps/x86_64/strchr.S                   | 167 +--------------------
>  sysdeps/x86_64/strchrnul.S                |   7 +-
>  6 files changed, 213 insertions(+), 183 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/rtld-strchr.S
>  create mode 100644 sysdeps/x86_64/multiarch/rtld-strchrnul.S
>
> diff --git a/sysdeps/x86_64/multiarch/rtld-strchr.S b/sysdeps/x86_64/multiarch/rtld-strchr.S
> new file mode 100644
> index 0000000000..2b7b879e37
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/rtld-strchr.S
> @@ -0,0 +1,18 @@
> +/* Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include "../strchr.S"
> diff --git a/sysdeps/x86_64/multiarch/rtld-strchrnul.S b/sysdeps/x86_64/multiarch/rtld-strchrnul.S
> new file mode 100644
> index 0000000000..0cc5becc88
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/rtld-strchrnul.S
> @@ -0,0 +1,18 @@
> +/* Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include "../strchrnul.S"
> diff --git a/sysdeps/x86_64/multiarch/strchr-sse2.S b/sysdeps/x86_64/multiarch/strchr-sse2.S
> index 992f700077..f7767ca543 100644
> --- a/sysdeps/x86_64/multiarch/strchr-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strchr-sse2.S
> @@ -16,13 +16,172 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#if IS_IN (libc)
> -# define strchr __strchr_sse2
> +#if IS_IN (libc) || defined STRCHR
> +# ifndef STRCHR
> +#  define STRCHR __strchr_sse2
> +# endif
>
> -# undef weak_alias
> -# define weak_alias(strchr, index)
> -# undef libc_hidden_builtin_def
> -# define libc_hidden_builtin_def(strchr)
> -#endif
> +# include <sysdep.h>
> +
> +       .text
> +ENTRY (STRCHR)
> +       movd    %esi, %xmm1
> +       movl    %edi, %eax
> +       andl    $4095, %eax
> +       punpcklbw %xmm1, %xmm1
> +       cmpl    $4032, %eax
> +       punpcklwd %xmm1, %xmm1
> +       pshufd  $0, %xmm1, %xmm1
> +       jg      L(cross_page)
> +       movdqu  (%rdi), %xmm0
> +       pxor    %xmm3, %xmm3
> +       movdqa  %xmm0, %xmm4
> +       pcmpeqb %xmm1, %xmm0
> +       pcmpeqb %xmm3, %xmm4
> +       por     %xmm4, %xmm0
> +       pmovmskb %xmm0, %eax
> +       test    %eax, %eax
> +       je      L(next_48_bytes)
> +       bsf     %eax, %eax
> +# ifdef AS_STRCHRNUL
> +       leaq    (%rdi,%rax), %rax
> +# else
> +       movl    $0, %edx
> +       leaq    (%rdi,%rax), %rax
> +       cmpb    %sil, (%rax)
> +       cmovne  %rdx, %rax
> +# endif
> +       ret
> +
> +       .p2align 3
> +L(next_48_bytes):
> +       movdqu  16(%rdi), %xmm0
> +       movdqa  %xmm0, %xmm4
> +       pcmpeqb %xmm1, %xmm0
> +       pcmpeqb %xmm3, %xmm4
> +       por     %xmm4, %xmm0
> +       pmovmskb %xmm0, %ecx
> +       movdqu  32(%rdi), %xmm0
> +       movdqa  %xmm0, %xmm4
> +       pcmpeqb %xmm1, %xmm0
> +       salq    $16, %rcx
> +       pcmpeqb %xmm3, %xmm4
> +       por     %xmm4, %xmm0
> +       pmovmskb %xmm0, %eax
> +       movdqu  48(%rdi), %xmm0
> +       pcmpeqb %xmm0, %xmm3
> +       salq    $32, %rax
> +       pcmpeqb %xmm1, %xmm0
> +       orq     %rcx, %rax
> +       por     %xmm3, %xmm0
> +       pmovmskb %xmm0, %ecx
> +       salq    $48, %rcx
> +       orq     %rcx, %rax
> +       testq   %rax, %rax
> +       jne     L(return)
> +L(loop_start):
> +       /* We use this alignment to force loop be aligned to 8 but not
> +          16 bytes.  This gives better sheduling on AMD processors.  */
> +       .p2align 4
> +       pxor    %xmm6, %xmm6
> +       andq    $-64, %rdi
> +       .p2align 3
> +L(loop64):
> +       addq    $64, %rdi
> +       movdqa  (%rdi), %xmm5
> +       movdqa  16(%rdi), %xmm2
> +       movdqa  32(%rdi), %xmm3
> +       pxor    %xmm1, %xmm5
> +       movdqa  48(%rdi), %xmm4
> +       pxor    %xmm1, %xmm2
> +       pxor    %xmm1, %xmm3
> +       pminub  (%rdi), %xmm5
> +       pxor    %xmm1, %xmm4
> +       pminub  16(%rdi), %xmm2
> +       pminub  32(%rdi), %xmm3
> +       pminub  %xmm2, %xmm5
> +       pminub  48(%rdi), %xmm4
> +       pminub  %xmm3, %xmm5
> +       pminub  %xmm4, %xmm5
> +       pcmpeqb %xmm6, %xmm5
> +       pmovmskb %xmm5, %eax
> +
> +       testl   %eax, %eax
> +       je      L(loop64)
>
> -#include "../strchr.S"
> +       movdqa  (%rdi), %xmm5
> +       movdqa  %xmm5, %xmm0
> +       pcmpeqb %xmm1, %xmm5
> +       pcmpeqb %xmm6, %xmm0
> +       por     %xmm0, %xmm5
> +       pcmpeqb %xmm6, %xmm2
> +       pcmpeqb %xmm6, %xmm3
> +       pcmpeqb %xmm6, %xmm4
> +
> +       pmovmskb %xmm5, %ecx
> +       pmovmskb %xmm2, %eax
> +       salq    $16, %rax
> +       pmovmskb %xmm3, %r8d
> +       pmovmskb %xmm4, %edx
> +       salq    $32, %r8
> +       orq     %r8, %rax
> +       orq     %rcx, %rax
> +       salq    $48, %rdx
> +       orq     %rdx, %rax
> +       .p2align 3
> +L(return):
> +       bsfq    %rax, %rax
> +# ifdef AS_STRCHRNUL
> +       leaq    (%rdi,%rax), %rax
> +# else
> +       movl    $0, %edx
> +       leaq    (%rdi,%rax), %rax
> +       cmpb    %sil, (%rax)
> +       cmovne  %rdx, %rax
> +# endif
> +       ret
> +       .p2align 4
> +
> +L(cross_page):
> +       movq    %rdi, %rdx
> +       pxor    %xmm2, %xmm2
> +       andq    $-64, %rdx
> +       movdqa  %xmm1, %xmm0
> +       movdqa  (%rdx), %xmm3
> +       movdqa  %xmm3, %xmm4
> +       pcmpeqb %xmm1, %xmm3
> +       pcmpeqb %xmm2, %xmm4
> +       por     %xmm4, %xmm3
> +       pmovmskb %xmm3, %r8d
> +       movdqa  16(%rdx), %xmm3
> +       movdqa  %xmm3, %xmm4
> +       pcmpeqb %xmm1, %xmm3
> +       pcmpeqb %xmm2, %xmm4
> +       por     %xmm4, %xmm3
> +       pmovmskb %xmm3, %eax
> +       movdqa  32(%rdx), %xmm3
> +       movdqa  %xmm3, %xmm4
> +       pcmpeqb %xmm1, %xmm3
> +       salq    $16, %rax
> +       pcmpeqb %xmm2, %xmm4
> +       por     %xmm4, %xmm3
> +       pmovmskb %xmm3, %r9d
> +       movdqa  48(%rdx), %xmm3
> +       pcmpeqb %xmm3, %xmm2
> +       salq    $32, %r9
> +       pcmpeqb %xmm3, %xmm0
> +       orq     %r9, %rax
> +       orq     %r8, %rax
> +       por     %xmm2, %xmm0
> +       pmovmskb %xmm0, %ecx
> +       salq    $48, %rcx
> +       orq     %rcx, %rax
> +       movl    %edi, %ecx
> +       subb    %dl, %cl
> +       shrq    %cl, %rax
> +       testq   %rax, %rax
> +       jne     L(return)
> +       jmp     L(loop_start)
> +
> +END (STRCHR)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strchrnul-sse2.S b/sysdeps/x86_64/multiarch/strchrnul-sse2.S
> index f91c670369..7238977a21 100644
> --- a/sysdeps/x86_64/multiarch/strchrnul-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strchrnul-sse2.S
> @@ -17,10 +17,11 @@
>     <https://www.gnu.org/licenses/>.  */
>
>  #if IS_IN (libc)
> -# define __strchrnul __strchrnul_sse2
> -
> -# undef weak_alias
> -# define weak_alias(__strchrnul, strchrnul)
> +# ifndef STRCHR
> +#  define STRCHR       __strchrnul_sse2
> +# endif
>  #endif
>
> -#include "../strchrnul.S"
> +#define AS_STRCHRNUL
> +
> +#include "strchr-sse2.S"
> diff --git a/sysdeps/x86_64/strchr.S b/sysdeps/x86_64/strchr.S
> index dda7c0431d..77c956c92c 100644
> --- a/sysdeps/x86_64/strchr.S
> +++ b/sysdeps/x86_64/strchr.S
> @@ -17,171 +17,8 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#include <sysdep.h>
>
> -       .text
> -ENTRY (strchr)
> -       movd    %esi, %xmm1
> -       movl    %edi, %eax
> -       andl    $4095, %eax
> -       punpcklbw %xmm1, %xmm1
> -       cmpl    $4032, %eax
> -       punpcklwd %xmm1, %xmm1
> -       pshufd  $0, %xmm1, %xmm1
> -       jg      L(cross_page)
> -       movdqu  (%rdi), %xmm0
> -       pxor    %xmm3, %xmm3
> -       movdqa  %xmm0, %xmm4
> -       pcmpeqb %xmm1, %xmm0
> -       pcmpeqb %xmm3, %xmm4
> -       por     %xmm4, %xmm0
> -       pmovmskb %xmm0, %eax
> -       test    %eax, %eax
> -       je      L(next_48_bytes)
> -       bsf     %eax, %eax
> -#ifdef AS_STRCHRNUL
> -       leaq    (%rdi,%rax), %rax
> -#else
> -       movl    $0, %edx
> -       leaq    (%rdi,%rax), %rax
> -       cmpb    %sil, (%rax)
> -       cmovne  %rdx, %rax
> -#endif
> -       ret
> -
> -       .p2align 3
> -       L(next_48_bytes):
> -       movdqu  16(%rdi), %xmm0
> -       movdqa  %xmm0, %xmm4
> -       pcmpeqb %xmm1, %xmm0
> -       pcmpeqb %xmm3, %xmm4
> -       por     %xmm4, %xmm0
> -       pmovmskb %xmm0, %ecx
> -       movdqu  32(%rdi), %xmm0
> -       movdqa  %xmm0, %xmm4
> -       pcmpeqb %xmm1, %xmm0
> -       salq    $16, %rcx
> -       pcmpeqb %xmm3, %xmm4
> -       por     %xmm4, %xmm0
> -       pmovmskb %xmm0, %eax
> -       movdqu  48(%rdi), %xmm0
> -       pcmpeqb %xmm0, %xmm3
> -       salq    $32, %rax
> -       pcmpeqb %xmm1, %xmm0
> -       orq     %rcx, %rax
> -       por     %xmm3, %xmm0
> -       pmovmskb %xmm0, %ecx
> -       salq    $48, %rcx
> -       orq     %rcx, %rax
> -       testq   %rax, %rax
> -       jne     L(return)
> -L(loop_start):
> -       /* We use this alignment to force loop be aligned to 8 but not
> -          16 bytes.  This gives better sheduling on AMD processors.  */
> -       .p2align 4
> -       pxor    %xmm6, %xmm6
> -       andq    $-64, %rdi
> -       .p2align 3
> -L(loop64):
> -       addq    $64, %rdi
> -       movdqa  (%rdi), %xmm5
> -       movdqa  16(%rdi), %xmm2
> -       movdqa  32(%rdi), %xmm3
> -       pxor    %xmm1, %xmm5
> -       movdqa  48(%rdi), %xmm4
> -       pxor    %xmm1, %xmm2
> -       pxor    %xmm1, %xmm3
> -       pminub  (%rdi), %xmm5
> -       pxor    %xmm1, %xmm4
> -       pminub  16(%rdi), %xmm2
> -       pminub  32(%rdi), %xmm3
> -       pminub  %xmm2, %xmm5
> -       pminub  48(%rdi), %xmm4
> -       pminub  %xmm3, %xmm5
> -       pminub  %xmm4, %xmm5
> -       pcmpeqb %xmm6, %xmm5
> -       pmovmskb %xmm5, %eax
> -
> -       testl   %eax, %eax
> -       je      L(loop64)
> -
> -       movdqa  (%rdi), %xmm5
> -       movdqa  %xmm5, %xmm0
> -       pcmpeqb %xmm1, %xmm5
> -       pcmpeqb %xmm6, %xmm0
> -       por     %xmm0, %xmm5
> -       pcmpeqb %xmm6, %xmm2
> -       pcmpeqb %xmm6, %xmm3
> -       pcmpeqb %xmm6, %xmm4
> -
> -       pmovmskb %xmm5, %ecx
> -       pmovmskb %xmm2, %eax
> -       salq    $16, %rax
> -       pmovmskb %xmm3, %r8d
> -       pmovmskb %xmm4, %edx
> -       salq    $32, %r8
> -       orq     %r8, %rax
> -       orq     %rcx, %rax
> -       salq    $48, %rdx
> -       orq     %rdx, %rax
> -       .p2align 3
> -L(return):
> -       bsfq    %rax, %rax
> -#ifdef AS_STRCHRNUL
> -       leaq    (%rdi,%rax), %rax
> -#else
> -       movl    $0, %edx
> -       leaq    (%rdi,%rax), %rax
> -       cmpb    %sil, (%rax)
> -       cmovne  %rdx, %rax
> -#endif
> -       ret
> -       .p2align 4
> -
> -L(cross_page):
> -       movq    %rdi, %rdx
> -       pxor    %xmm2, %xmm2
> -       andq    $-64, %rdx
> -       movdqa  %xmm1, %xmm0
> -       movdqa  (%rdx), %xmm3
> -       movdqa  %xmm3, %xmm4
> -       pcmpeqb %xmm1, %xmm3
> -       pcmpeqb %xmm2, %xmm4
> -       por     %xmm4, %xmm3
> -       pmovmskb %xmm3, %r8d
> -       movdqa  16(%rdx), %xmm3
> -       movdqa  %xmm3, %xmm4
> -       pcmpeqb %xmm1, %xmm3
> -       pcmpeqb %xmm2, %xmm4
> -       por     %xmm4, %xmm3
> -       pmovmskb %xmm3, %eax
> -       movdqa  32(%rdx), %xmm3
> -       movdqa  %xmm3, %xmm4
> -       pcmpeqb %xmm1, %xmm3
> -       salq    $16, %rax
> -       pcmpeqb %xmm2, %xmm4
> -       por     %xmm4, %xmm3
> -       pmovmskb %xmm3, %r9d
> -       movdqa  48(%rdx), %xmm3
> -       pcmpeqb %xmm3, %xmm2
> -       salq    $32, %r9
> -       pcmpeqb %xmm3, %xmm0
> -       orq     %r9, %rax
> -       orq     %r8, %rax
> -       por     %xmm2, %xmm0
> -       pmovmskb %xmm0, %ecx
> -       salq    $48, %rcx
> -       orq     %rcx, %rax
> -       movl    %edi, %ecx
> -       subb    %dl, %cl
> -       shrq    %cl, %rax
> -       testq   %rax, %rax
> -       jne     L(return)
> -       jmp     L(loop_start)
> -
> -END (strchr)
> -
> -#ifndef AS_STRCHRNUL
> +#define STRCHR strchr
> +#include "multiarch/strchr-sse2.S"
>  weak_alias (strchr, index)
>  libc_hidden_builtin_def (strchr)
> -#endif
> diff --git a/sysdeps/x86_64/strchrnul.S b/sysdeps/x86_64/strchrnul.S
> index ec2e652e25..508e42db26 100644
> --- a/sysdeps/x86_64/strchrnul.S
> +++ b/sysdeps/x86_64/strchrnul.S
> @@ -18,10 +18,7 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#include <sysdep.h>
> -
> -#define strchr __strchrnul
> -#define AS_STRCHRNUL
> -#include "strchr.S"
> +#define STRCHR __strchrnul
> +#include "multiarch/strchrnul-sse2.S"
>
>  weak_alias (__strchrnul, strchrnul)
> --
> 2.34.1
>

LGTM.

Thanks.
diff mbox series

Patch

diff --git a/sysdeps/x86_64/multiarch/rtld-strchr.S b/sysdeps/x86_64/multiarch/rtld-strchr.S
new file mode 100644
index 0000000000..2b7b879e37
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rtld-strchr.S
@@ -0,0 +1,18 @@ 
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "../strchr.S"
diff --git a/sysdeps/x86_64/multiarch/rtld-strchrnul.S b/sysdeps/x86_64/multiarch/rtld-strchrnul.S
new file mode 100644
index 0000000000..0cc5becc88
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rtld-strchrnul.S
@@ -0,0 +1,18 @@ 
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "../strchrnul.S"
diff --git a/sysdeps/x86_64/multiarch/strchr-sse2.S b/sysdeps/x86_64/multiarch/strchr-sse2.S
index 992f700077..f7767ca543 100644
--- a/sysdeps/x86_64/multiarch/strchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/strchr-sse2.S
@@ -16,13 +16,172 @@ 
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#if IS_IN (libc)
-# define strchr __strchr_sse2
+#if IS_IN (libc) || defined STRCHR
+# ifndef STRCHR
+#  define STRCHR __strchr_sse2
+# endif
 
-# undef weak_alias
-# define weak_alias(strchr, index)
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strchr)
-#endif
+# include <sysdep.h>
+
+	.text
+ENTRY (STRCHR)
+	movd	%esi, %xmm1
+	movl	%edi, %eax
+	andl	$4095, %eax
+	punpcklbw %xmm1, %xmm1
+	cmpl	$4032, %eax
+	punpcklwd %xmm1, %xmm1
+	pshufd	$0, %xmm1, %xmm1
+	jg	L(cross_page)
+	movdqu	(%rdi), %xmm0
+	pxor	%xmm3, %xmm3
+	movdqa	%xmm0, %xmm4
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm3, %xmm4
+	por	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	je	L(next_48_bytes)
+	bsf	%eax, %eax
+# ifdef AS_STRCHRNUL
+	leaq	(%rdi,%rax), %rax
+# else
+	movl	$0, %edx
+	leaq	(%rdi,%rax), %rax
+	cmpb	%sil, (%rax)
+	cmovne	%rdx, %rax
+# endif
+	ret
+
+	.p2align 3
+L(next_48_bytes):
+	movdqu	16(%rdi), %xmm0
+	movdqa	%xmm0, %xmm4
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm3, %xmm4
+	por	%xmm4, %xmm0
+	pmovmskb %xmm0, %ecx
+	movdqu	32(%rdi), %xmm0
+	movdqa	%xmm0, %xmm4
+	pcmpeqb	%xmm1, %xmm0
+	salq	$16, %rcx
+	pcmpeqb	%xmm3, %xmm4
+	por	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	movdqu	48(%rdi), %xmm0
+	pcmpeqb	%xmm0, %xmm3
+	salq	$32, %rax
+	pcmpeqb	%xmm1, %xmm0
+	orq	%rcx, %rax
+	por	%xmm3, %xmm0
+	pmovmskb %xmm0, %ecx
+	salq	$48, %rcx
+	orq	%rcx, %rax
+	testq	%rax, %rax
+	jne	L(return)
+L(loop_start):
+	/* We use this alignment to force loop be aligned to 8 but not
+	   16 bytes.  This gives better sheduling on AMD processors.  */
+	.p2align 4
+	pxor	%xmm6, %xmm6
+	andq	$-64, %rdi
+	.p2align 3
+L(loop64):
+	addq	$64, %rdi
+	movdqa	(%rdi), %xmm5
+	movdqa	16(%rdi), %xmm2
+	movdqa	32(%rdi), %xmm3
+	pxor	%xmm1, %xmm5
+	movdqa	48(%rdi), %xmm4
+	pxor	%xmm1, %xmm2
+	pxor	%xmm1, %xmm3
+	pminub	(%rdi), %xmm5
+	pxor	%xmm1, %xmm4
+	pminub	16(%rdi), %xmm2
+	pminub	32(%rdi), %xmm3
+	pminub	%xmm2, %xmm5
+	pminub	48(%rdi), %xmm4
+	pminub	%xmm3, %xmm5
+	pminub	%xmm4, %xmm5
+	pcmpeqb %xmm6, %xmm5
+	pmovmskb %xmm5, %eax
+
+	testl	%eax, %eax
+	je	L(loop64)
 
-#include "../strchr.S"
+	movdqa	(%rdi), %xmm5
+	movdqa	%xmm5, %xmm0
+	pcmpeqb	%xmm1, %xmm5
+	pcmpeqb	%xmm6, %xmm0
+	por	%xmm0, %xmm5
+	pcmpeqb %xmm6, %xmm2
+	pcmpeqb %xmm6, %xmm3
+	pcmpeqb %xmm6, %xmm4
+
+	pmovmskb %xmm5, %ecx
+	pmovmskb %xmm2, %eax
+	salq	$16, %rax
+	pmovmskb %xmm3, %r8d
+	pmovmskb %xmm4, %edx
+	salq	$32, %r8
+	orq	%r8, %rax
+	orq	%rcx, %rax
+	salq	$48, %rdx
+	orq	%rdx, %rax
+	.p2align 3
+L(return):
+	bsfq	%rax, %rax
+# ifdef AS_STRCHRNUL
+	leaq	(%rdi,%rax), %rax
+# else
+	movl	$0, %edx
+	leaq	(%rdi,%rax), %rax
+	cmpb	%sil, (%rax)
+	cmovne	%rdx, %rax
+# endif
+	ret
+	.p2align 4
+
+L(cross_page):
+	movq	%rdi, %rdx
+	pxor	%xmm2, %xmm2
+	andq	$-64, %rdx
+	movdqa	%xmm1, %xmm0
+	movdqa	(%rdx), %xmm3
+	movdqa	%xmm3, %xmm4
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm2, %xmm4
+	por	%xmm4, %xmm3
+	pmovmskb %xmm3, %r8d
+	movdqa	16(%rdx), %xmm3
+	movdqa	%xmm3, %xmm4
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm2, %xmm4
+	por	%xmm4, %xmm3
+	pmovmskb %xmm3, %eax
+	movdqa	32(%rdx), %xmm3
+	movdqa	%xmm3, %xmm4
+	pcmpeqb	%xmm1, %xmm3
+	salq	$16, %rax
+	pcmpeqb	%xmm2, %xmm4
+	por	%xmm4, %xmm3
+	pmovmskb %xmm3, %r9d
+	movdqa	48(%rdx), %xmm3
+	pcmpeqb	%xmm3, %xmm2
+	salq	$32, %r9
+	pcmpeqb	%xmm3, %xmm0
+	orq	%r9, %rax
+	orq	%r8, %rax
+	por	%xmm2, %xmm0
+	pmovmskb %xmm0, %ecx
+	salq	$48, %rcx
+	orq	%rcx, %rax
+	movl	%edi, %ecx
+	subb	%dl, %cl
+	shrq	%cl, %rax
+	testq	%rax, %rax
+	jne	L(return)
+	jmp	L(loop_start)
+
+END (STRCHR)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strchrnul-sse2.S b/sysdeps/x86_64/multiarch/strchrnul-sse2.S
index f91c670369..7238977a21 100644
--- a/sysdeps/x86_64/multiarch/strchrnul-sse2.S
+++ b/sysdeps/x86_64/multiarch/strchrnul-sse2.S
@@ -17,10 +17,11 @@ 
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define __strchrnul __strchrnul_sse2
-
-# undef weak_alias
-# define weak_alias(__strchrnul, strchrnul)
+# ifndef STRCHR
+#  define STRCHR	__strchrnul_sse2
+# endif
 #endif
 
-#include "../strchrnul.S"
+#define AS_STRCHRNUL
+
+#include "strchr-sse2.S"
diff --git a/sysdeps/x86_64/strchr.S b/sysdeps/x86_64/strchr.S
index dda7c0431d..77c956c92c 100644
--- a/sysdeps/x86_64/strchr.S
+++ b/sysdeps/x86_64/strchr.S
@@ -17,171 +17,8 @@ 
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
 
-	.text
-ENTRY (strchr)
-	movd	%esi, %xmm1
-	movl	%edi, %eax
-	andl	$4095, %eax
-	punpcklbw %xmm1, %xmm1
-	cmpl	$4032, %eax
-	punpcklwd %xmm1, %xmm1
-	pshufd	$0, %xmm1, %xmm1
-	jg	L(cross_page)
-	movdqu	(%rdi), %xmm0
-	pxor	%xmm3, %xmm3
-	movdqa	%xmm0, %xmm4
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm3, %xmm4
-	por	%xmm4, %xmm0
-	pmovmskb %xmm0, %eax
-	test	%eax, %eax
-	je	L(next_48_bytes)
-	bsf	%eax, %eax
-#ifdef AS_STRCHRNUL
-	leaq	(%rdi,%rax), %rax
-#else
-	movl	$0, %edx
-	leaq	(%rdi,%rax), %rax
-	cmpb	%sil, (%rax)
-	cmovne	%rdx, %rax
-#endif
-	ret
-
-	.p2align 3
-	L(next_48_bytes):
-	movdqu	16(%rdi), %xmm0
-	movdqa	%xmm0, %xmm4
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm3, %xmm4
-	por	%xmm4, %xmm0
-	pmovmskb %xmm0, %ecx
-	movdqu	32(%rdi), %xmm0
-	movdqa	%xmm0, %xmm4
-	pcmpeqb	%xmm1, %xmm0
-	salq	$16, %rcx
-	pcmpeqb	%xmm3, %xmm4
-	por	%xmm4, %xmm0
-	pmovmskb %xmm0, %eax
-	movdqu	48(%rdi), %xmm0
-	pcmpeqb	%xmm0, %xmm3
-	salq	$32, %rax
-	pcmpeqb	%xmm1, %xmm0
-	orq	%rcx, %rax
-	por	%xmm3, %xmm0
-	pmovmskb %xmm0, %ecx
-	salq	$48, %rcx
-	orq	%rcx, %rax
-	testq	%rax, %rax
-	jne	L(return)
-L(loop_start):
-	/* We use this alignment to force loop be aligned to 8 but not
-	   16 bytes.  This gives better sheduling on AMD processors.  */
-	.p2align 4
-	pxor	%xmm6, %xmm6
-	andq	$-64, %rdi
-	.p2align 3
-L(loop64):
-	addq	$64, %rdi
-	movdqa	(%rdi), %xmm5
-	movdqa	16(%rdi), %xmm2
-	movdqa	32(%rdi), %xmm3
-	pxor	%xmm1, %xmm5
-	movdqa	48(%rdi), %xmm4
-	pxor	%xmm1, %xmm2
-	pxor	%xmm1, %xmm3
-	pminub	(%rdi), %xmm5
-	pxor	%xmm1, %xmm4
-	pminub	16(%rdi), %xmm2
-	pminub	32(%rdi), %xmm3
-	pminub	%xmm2, %xmm5
-	pminub	48(%rdi), %xmm4
-	pminub	%xmm3, %xmm5
-	pminub	%xmm4, %xmm5
-	pcmpeqb %xmm6, %xmm5
-	pmovmskb %xmm5, %eax
-
-	testl	%eax, %eax
-	je	L(loop64)
-
-	movdqa	(%rdi), %xmm5
-	movdqa	%xmm5, %xmm0
-	pcmpeqb	%xmm1, %xmm5
-	pcmpeqb	%xmm6, %xmm0
-	por	%xmm0, %xmm5
-	pcmpeqb %xmm6, %xmm2
-	pcmpeqb %xmm6, %xmm3
-	pcmpeqb %xmm6, %xmm4
-
-	pmovmskb %xmm5, %ecx
-	pmovmskb %xmm2, %eax
-	salq	$16, %rax
-	pmovmskb %xmm3, %r8d
-	pmovmskb %xmm4, %edx
-	salq	$32, %r8
-	orq	%r8, %rax
-	orq	%rcx, %rax
-	salq	$48, %rdx
-	orq	%rdx, %rax
-	.p2align 3
-L(return):
-	bsfq	%rax, %rax
-#ifdef AS_STRCHRNUL
-	leaq	(%rdi,%rax), %rax
-#else
-	movl	$0, %edx
-	leaq	(%rdi,%rax), %rax
-	cmpb	%sil, (%rax)
-	cmovne	%rdx, %rax
-#endif
-	ret
-	.p2align 4
-
-L(cross_page):
-	movq	%rdi, %rdx
-	pxor	%xmm2, %xmm2
-	andq	$-64, %rdx
-	movdqa	%xmm1, %xmm0
-	movdqa	(%rdx), %xmm3
-	movdqa	%xmm3, %xmm4
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm2, %xmm4
-	por	%xmm4, %xmm3
-	pmovmskb %xmm3, %r8d
-	movdqa	16(%rdx), %xmm3
-	movdqa	%xmm3, %xmm4
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm2, %xmm4
-	por	%xmm4, %xmm3
-	pmovmskb %xmm3, %eax
-	movdqa	32(%rdx), %xmm3
-	movdqa	%xmm3, %xmm4
-	pcmpeqb	%xmm1, %xmm3
-	salq	$16, %rax
-	pcmpeqb	%xmm2, %xmm4
-	por	%xmm4, %xmm3
-	pmovmskb %xmm3, %r9d
-	movdqa	48(%rdx), %xmm3
-	pcmpeqb	%xmm3, %xmm2
-	salq	$32, %r9
-	pcmpeqb	%xmm3, %xmm0
-	orq	%r9, %rax
-	orq	%r8, %rax
-	por	%xmm2, %xmm0
-	pmovmskb %xmm0, %ecx
-	salq	$48, %rcx
-	orq	%rcx, %rax
-	movl	%edi, %ecx
-	subb	%dl, %cl
-	shrq	%cl, %rax
-	testq	%rax, %rax
-	jne	L(return)
-	jmp	L(loop_start)
-
-END (strchr)
-
-#ifndef AS_STRCHRNUL
+#define STRCHR strchr
+#include "multiarch/strchr-sse2.S"
 weak_alias (strchr, index)
 libc_hidden_builtin_def (strchr)
-#endif
diff --git a/sysdeps/x86_64/strchrnul.S b/sysdeps/x86_64/strchrnul.S
index ec2e652e25..508e42db26 100644
--- a/sysdeps/x86_64/strchrnul.S
+++ b/sysdeps/x86_64/strchrnul.S
@@ -18,10 +18,7 @@ 
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-
-#define strchr __strchrnul
-#define AS_STRCHRNUL
-#include "strchr.S"
+#define STRCHR __strchrnul
+#include "multiarch/strchrnul-sse2.S"
 
 weak_alias (__strchrnul, strchrnul)