Message ID | 20180813114411.28186-1-siddhesh@sourceware.org |
---|---|
State | New |
Headers | show |
Series | [aarch64] Add a falkor variant for strlen | expand |
On Mon, Aug 13, 2018 at 4:44 AM Siddhesh Poyarekar <siddhesh@sourceware.org> wrote: > > This variant of strlen uses vector loads and operations to reduce the > size of the code and also eliminate the non-ascii fallback. This > works very well for falkor because of its two vector units and > efficient vector ops. In the best case it reduces latency of cases in > bench-strlen by 48%, with gains throughout the benchmark. > strlen-walk also sees uniform gains in the 5%-15% range. > > Overall the routine appears to work better than the stock one for falkor > regardless of the benchmark, length of string or cache state. > > The same cannot be said of a53 and a72 though. a53 performance was > greatly reduced and for a72 it was a bit of a mixed bag, slightly on the > negative side but I reckon it might be fast in some situations. > > * sysdeps/aarch64/strlen.S (__strlen): Rename to STRLEN. > [!STRLEN](STRLEN): Set to __strlen. > * sysdeps/aarch64/multiarch/strlen.c: New file. > * sysdeps/aarch64/multiarch/strlen_generic.S: Likewise. > * sysdeps/aarch64/multiarch/strlen_falkor.S: Likewise. > * sysdeps/aarch64/multiarch/ifunc-impl-list.c > (__libc_ifunc_impl_list): Add strlen. > * sysdeps/aarch64/multiarch/Makefile (sysdep_routines): Add > strlen_generic and strlen_falkor. Maybe change the name of strlen_falkor to strlen_simd so it can be used by another processor and not be so confusing to them. 
Thanks, Andrew > > CC: szabolcs.nagy@arm.com > --- > sysdeps/aarch64/multiarch/Makefile | 3 +- > sysdeps/aarch64/multiarch/ifunc-impl-list.c | 4 + > sysdeps/aarch64/multiarch/strlen.c | 39 +++++ > sysdeps/aarch64/multiarch/strlen_falkor.S | 167 ++++++++++++++++++++ > sysdeps/aarch64/multiarch/strlen_generic.S | 42 +++++ > sysdeps/aarch64/strlen.S | 10 +- > 6 files changed, 261 insertions(+), 4 deletions(-) > create mode 100644 sysdeps/aarch64/multiarch/strlen.c > create mode 100644 sysdeps/aarch64/multiarch/strlen_falkor.S > create mode 100644 sysdeps/aarch64/multiarch/strlen_generic.S > > diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile > index 57ffdf7238..746d3aedb1 100644 > --- a/sysdeps/aarch64/multiarch/Makefile > +++ b/sysdeps/aarch64/multiarch/Makefile > @@ -1,4 +1,5 @@ > ifeq ($(subdir),string) > sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \ > - memcpy_falkor memmove_falkor memset_generic memset_falkor > + memcpy_falkor memmove_falkor memset_generic memset_falkor \ > + strlen_generic strlen_falkor > endif > diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c > index e55be80103..fbe3a38a76 100644 > --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c > @@ -53,5 +53,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor) > IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic)) > > + IFUNC_IMPL (i, name, strlen, > + IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_falkor) > + IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_generic)) > + > return i; > } > diff --git a/sysdeps/aarch64/multiarch/strlen.c b/sysdeps/aarch64/multiarch/strlen.c > new file mode 100644 > index 0000000000..4de3437662 > --- /dev/null > +++ b/sysdeps/aarch64/multiarch/strlen.c > @@ -0,0 +1,39 @@ > +/* Multiple versions of strlen. 
AARCH64 version. > + Copyright (C) 2018 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <http://www.gnu.org/licenses/>. */ > + > +/* Define multiple versions only for the definition in libc. */ > + > +#if IS_IN (libc) > +/* Redefine strlen so that the compiler won't complain about the type > + mismatch with the IFUNC selector in strong_alias, below. */ > +# undef strlen > +# define strlen __redirect_strlen > +# include <string.h> > +# include <init-arch.h> > + > +extern __typeof (__redirect_strlen) __strlen; > + > +extern __typeof (__redirect_strlen) __strlen_generic attribute_hidden; > +extern __typeof (__redirect_strlen) __strlen_falkor attribute_hidden; > + > +libc_ifunc (__strlen, > + (IS_FALKOR (midr) ? __strlen_falkor : __strlen_generic)); > + > +# undef strlen > +strong_alias (__strlen, strlen); > +#endif > diff --git a/sysdeps/aarch64/multiarch/strlen_falkor.S b/sysdeps/aarch64/multiarch/strlen_falkor.S > new file mode 100644 > index 0000000000..fed4dcd46f > --- /dev/null > +++ b/sysdeps/aarch64/multiarch/strlen_falkor.S > @@ -0,0 +1,167 @@ > +/* Copyright (C) 2018 Free Software Foundation, Inc. > + > + This file is part of the GNU C Library. 
> + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. If not, see > + <http://www.gnu.org/licenses/>. */ > + > +#include <sysdep.h> > + > +/* Assumptions: > + > + ARMv8-a, AArch64, Falkor, unaligned accesses, min page size 4k. */ > + > +/* To test the page crossing code path more thoroughly, compile with > + -DTEST_PAGE_CROSS - this will force all calls through the slower > + entry path. This option is not intended for production use. */ > + > +/* Arguments and results. */ > +#define srcin x0 > +#define len x0 > + > +/* Locals and temporaries. */ > +#define src x1 > +#define data1 x2 > +#define data2 x3 > +#define has_nul1 x4 > +#define has_nul2 x5 > +#define tmp1 x4 > +#define tmp2 x5 > +#define tmp3 x6 > +#define tmp4 x7 > +#define zeroones x8 > +#define dataq q2 > +#define datav v2 > +#define datab2 b3 > +#define dataq2 q3 > +#define datav2 v3 > + > +#ifdef TEST_PAGE_CROSS > +# define MIN_PAGE_SIZE 16 > +#else > +# define MIN_PAGE_SIZE 4096 > +#endif > + > + /* Since strings are short on average, we check the first 16 bytes > + of the string for a NUL character. In order to do an unaligned load > + safely we have to do a page cross check first. If there is a NUL > + byte we calculate the length from the 2 8-byte words using > + conditional select to reduce branch mispredictions (it is unlikely > + strlen_falkor will be repeatedly called on strings with the same > + length). 
> + > + If the string is longer than 16 bytes, we align src so don't need > + further page cross checks, and process 16 bytes per iteration. > + > + If the page cross check fails, we read 16 bytes from an aligned > + address, remove any characters before the string, and continue > + in the main loop using aligned loads. Since strings crossing a > + page in the first 16 bytes are rare (probability of > + 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized. > + > + AArch64 systems have a minimum page size of 4k. We don't bother > + checking for larger page sizes - the cost of setting up the correct > + page size is just not worth the extra gain from a small reduction in > + the cases taking the slow path. Note that we only care about > + whether the first fetch, which may be misaligned, crosses a page > + boundary. */ > + > +ENTRY_ALIGN (__strlen_falkor, 6) > + DELOUSE (0) > + DELOUSE (1) > + and tmp1, srcin, MIN_PAGE_SIZE - 1 > + cmp tmp1, MIN_PAGE_SIZE - 16 > + b.gt L(page_cross) > + ldr dataq, [srcin] > +#ifdef __AARCH64EB__ > + rev64 datav.16b, datav.16b > +#endif > + > + /* Get the minimum value and keep going if it is not zero. */ > + uminv datab2, datav.16b > + mov tmp1, datav2.d[0] > + cbnz tmp1, L(main_loop_entry) > + > + cmeq datav.16b, datav.16b, #0 > + mov data1, datav.d[0] > + mov data2, datav.d[1] > + cmp data1, 0 > + csel data1, data1, data2, ne > + mov len, 8 > + rev data1, data1 > + clz tmp1, data1 > + csel len, xzr, len, ne > + add len, len, tmp1, lsr 3 > + ret > + > +L(main_loop_entry): > + bic src, srcin, 15 > + > +L(main_loop): > + ldr dataq, [src, 16]! > +L(page_cross_entry): > + /* Get the minimum value and keep going if it is not zero. 
*/ > + uminv datab2, datav.16b > + mov tmp1, datav2.d[0] > + cbnz tmp1, L(main_loop) > + > +L(tail): > +#ifdef __AARCH64EB__ > + rev64 datav.16b, datav.16b > +#endif > + /* Set the NUL byte as 0xff and the rest as 0x00, move the data into a > + pair of scalars and then compute the length from the earliest NUL > + byte. */ > + cmeq datav.16b, datav.16b, #0 > + mov data1, datav.d[0] > + mov data2, datav.d[1] > + cmp data1, 0 > + csel data1, data1, data2, ne > + sub len, src, srcin > + rev data1, data1 > + add tmp2, len, 8 > + clz tmp1, data1 > + csel len, len, tmp2, ne > + add len, len, tmp1, lsr 3 > + ret > + > + /* Load 16 bytes from [srcin & ~15] and force the bytes that precede > + srcin to 0xff, so we ignore any NUL bytes before the string. > + Then continue in the aligned loop. */ > +L(page_cross): > + mov tmp3, 63 > + bic src, srcin, 15 > + and tmp1, srcin, 7 > + ands tmp2, srcin, 8 > + ldr dataq, [src] > + lsl tmp1, tmp1, 3 > + csel tmp2, tmp2, tmp1, eq > + csel tmp1, tmp1, tmp3, eq > + mov tmp4, -1 > +#ifdef __AARCH64EB__ > + /* Big-endian. Early bytes are at MSB. */ > + lsr tmp1, tmp4, tmp1 > + lsr tmp2, tmp4, tmp2 > +#else > + /* Little-endian. Early bytes are at LSB. */ > + lsl tmp1, tmp4, tmp1 > + lsl tmp2, tmp4, tmp2 > +#endif > + mov datav2.d[0], tmp1 > + mov datav2.d[1], tmp2 > + orn datav.16b, datav.16b, datav2.16b > + b L(page_cross_entry) > +END (__strlen_falkor) > +weak_alias (__strlen_falkor, strlen_falkor) > +libc_hidden_builtin_def (strlen_falkor) > diff --git a/sysdeps/aarch64/multiarch/strlen_generic.S b/sysdeps/aarch64/multiarch/strlen_generic.S > new file mode 100644 > index 0000000000..a74b0877dc > --- /dev/null > +++ b/sysdeps/aarch64/multiarch/strlen_generic.S > @@ -0,0 +1,42 @@ > +/* A Generic Optimized strlen implementation for AARCH64. > + Copyright (C) 2018 Free Software Foundation, Inc. > + This file is part of the GNU C Library. 
> + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <http://www.gnu.org/licenses/>. */ > + > +/* The actual strlen code is in ../strlen.S. If we are building > + libc this file defines __strlen_generic. Otherwise the include > + of ../strlen.S will define the normal __strlen entry > + point. */ > + > +#include <sysdep.h> > + > +#if IS_IN (libc) > + > +# define STRLEN __strlen_generic > + > +/* Do not hide the generic version of strlen, we use it > + internally. */ > +# undef libc_hidden_builtin_def > +# define libc_hidden_builtin_def(name) > + > +# ifdef SHARED > +/* It doesn't make sense to send libc-internal strlen calls through a PLT. */ > + .globl __GI_strlen; __GI_strlen = __strlen_generic > +# endif > + > +#endif > + > +#include "../strlen.S" > diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S > index eb773ef532..521ebc3b75 100644 > --- a/sysdeps/aarch64/strlen.S > +++ b/sysdeps/aarch64/strlen.S > @@ -23,6 +23,10 @@ > * ARMv8-a, AArch64, unaligned accesses, min page size 4k. > */ > > +#ifndef STRLEN > +# define STRLEN __strlen > +#endif > + > /* To test the page crossing code path more thoroughly, compile with > -DTEST_PAGE_CROSS - this will force all calls through the slower > entry path. This option is not intended for production use. 
*/ > @@ -84,7 +88,7 @@ > whether the first fetch, which may be misaligned, crosses a page > boundary. */ > > -ENTRY_ALIGN (__strlen, 6) > +ENTRY_ALIGN (STRLEN, 6) > DELOUSE (0) > DELOUSE (1) > and tmp1, srcin, MIN_PAGE_SIZE - 1 > @@ -215,6 +219,6 @@ L(page_cross): > csel data1, data1, tmp4, eq > csel data2, data2, tmp2, eq > b L(page_cross_entry) > -END (__strlen) > -weak_alias (__strlen, strlen) > +END (STRLEN) > +weak_alias (STRLEN, strlen) > libc_hidden_builtin_def (strlen) > -- > 2.17.1 >
On 08/13/2018 08:53 PM, Andrew Pinski wrote: >> * sysdeps/aarch64/strlen.S (__strlen): Rename to STRLEN. >> [!STRLEN](STRLEN): Set to __strlen. >> * sysdeps/aarch64/multiarch/strlen.c: New file. >> * sysdeps/aarch64/multiarch/strlen_generic.S: Likewise. >> * sysdeps/aarch64/multiarch/strlen_falkor.S: Likewise. >> * sysdeps/aarch64/multiarch/ifunc-impl-list.c >> (__libc_ifunc_impl_list): Add strlen. >> * sysdeps/aarch64/multiarch/Makefile (sysdep_routines): Add >> strlen_generic and strlen_falkor. > > Maybe change the name of strlen_falkor to strlen_simd so it can be > used by another processor and not be so confusing to them. Sure thing. Siddhesh
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile index 57ffdf7238..746d3aedb1 100644 --- a/sysdeps/aarch64/multiarch/Makefile +++ b/sysdeps/aarch64/multiarch/Makefile @@ -1,4 +1,5 @@ ifeq ($(subdir),string) sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \ - memcpy_falkor memmove_falkor memset_generic memset_falkor + memcpy_falkor memmove_falkor memset_generic memset_falkor \ + strlen_generic strlen_falkor endif diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c index e55be80103..fbe3a38a76 100644 --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c @@ -53,5 +53,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor) IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic)) + IFUNC_IMPL (i, name, strlen, + IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_falkor) + IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_generic)) + return i; } diff --git a/sysdeps/aarch64/multiarch/strlen.c b/sysdeps/aarch64/multiarch/strlen.c new file mode 100644 index 0000000000..4de3437662 --- /dev/null +++ b/sysdeps/aarch64/multiarch/strlen.c @@ -0,0 +1,39 @@ +/* Multiple versions of strlen. AARCH64 version. + Copyright (C) 2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ + +#if IS_IN (libc) +/* Redefine strlen so that the compiler won't complain about the type + mismatch with the IFUNC selector in strong_alias, below. */ +# undef strlen +# define strlen __redirect_strlen +# include <string.h> +# include <init-arch.h> + +extern __typeof (__redirect_strlen) __strlen; + +extern __typeof (__redirect_strlen) __strlen_generic attribute_hidden; +extern __typeof (__redirect_strlen) __strlen_falkor attribute_hidden; + +libc_ifunc (__strlen, + (IS_FALKOR (midr) ? __strlen_falkor : __strlen_generic)); + +# undef strlen +strong_alias (__strlen, strlen); +#endif diff --git a/sysdeps/aarch64/multiarch/strlen_falkor.S b/sysdeps/aarch64/multiarch/strlen_falkor.S new file mode 100644 index 0000000000..fed4dcd46f --- /dev/null +++ b/sysdeps/aarch64/multiarch/strlen_falkor.S @@ -0,0 +1,167 @@ +/* Copyright (C) 2018 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* Assumptions: + + ARMv8-a, AArch64, Falkor, unaligned accesses, min page size 4k. 
*/ + +/* To test the page crossing code path more thoroughly, compile with + -DTEST_PAGE_CROSS - this will force all calls through the slower + entry path. This option is not intended for production use. */ + +/* Arguments and results. */ +#define srcin x0 +#define len x0 + +/* Locals and temporaries. */ +#define src x1 +#define data1 x2 +#define data2 x3 +#define has_nul1 x4 +#define has_nul2 x5 +#define tmp1 x4 +#define tmp2 x5 +#define tmp3 x6 +#define tmp4 x7 +#define zeroones x8 +#define dataq q2 +#define datav v2 +#define datab2 b3 +#define dataq2 q3 +#define datav2 v3 + +#ifdef TEST_PAGE_CROSS +# define MIN_PAGE_SIZE 16 +#else +# define MIN_PAGE_SIZE 4096 +#endif + + /* Since strings are short on average, we check the first 16 bytes + of the string for a NUL character. In order to do an unaligned load + safely we have to do a page cross check first. If there is a NUL + byte we calculate the length from the 2 8-byte words using + conditional select to reduce branch mispredictions (it is unlikely + strlen_falkor will be repeatedly called on strings with the same + length). + + If the string is longer than 16 bytes, we align src so don't need + further page cross checks, and process 16 bytes per iteration. + + If the page cross check fails, we read 16 bytes from an aligned + address, remove any characters before the string, and continue + in the main loop using aligned loads. Since strings crossing a + page in the first 16 bytes are rare (probability of + 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized. + + AArch64 systems have a minimum page size of 4k. We don't bother + checking for larger page sizes - the cost of setting up the correct + page size is just not worth the extra gain from a small reduction in + the cases taking the slow path. Note that we only care about + whether the first fetch, which may be misaligned, crosses a page + boundary. 
*/ + +ENTRY_ALIGN (__strlen_falkor, 6) + DELOUSE (0) + DELOUSE (1) + and tmp1, srcin, MIN_PAGE_SIZE - 1 + cmp tmp1, MIN_PAGE_SIZE - 16 + b.gt L(page_cross) + ldr dataq, [srcin] +#ifdef __AARCH64EB__ + rev64 datav.16b, datav.16b +#endif + + /* Get the minimum value and keep going if it is not zero. */ + uminv datab2, datav.16b + mov tmp1, datav2.d[0] + cbnz tmp1, L(main_loop_entry) + + cmeq datav.16b, datav.16b, #0 + mov data1, datav.d[0] + mov data2, datav.d[1] + cmp data1, 0 + csel data1, data1, data2, ne + mov len, 8 + rev data1, data1 + clz tmp1, data1 + csel len, xzr, len, ne + add len, len, tmp1, lsr 3 + ret + +L(main_loop_entry): + bic src, srcin, 15 + +L(main_loop): + ldr dataq, [src, 16]! +L(page_cross_entry): + /* Get the minimum value and keep going if it is not zero. */ + uminv datab2, datav.16b + mov tmp1, datav2.d[0] + cbnz tmp1, L(main_loop) + +L(tail): +#ifdef __AARCH64EB__ + rev64 datav.16b, datav.16b +#endif + /* Set the NUL byte as 0xff and the rest as 0x00, move the data into a + pair of scalars and then compute the length from the earliest NUL + byte. */ + cmeq datav.16b, datav.16b, #0 + mov data1, datav.d[0] + mov data2, datav.d[1] + cmp data1, 0 + csel data1, data1, data2, ne + sub len, src, srcin + rev data1, data1 + add tmp2, len, 8 + clz tmp1, data1 + csel len, len, tmp2, ne + add len, len, tmp1, lsr 3 + ret + + /* Load 16 bytes from [srcin & ~15] and force the bytes that precede + srcin to 0xff, so we ignore any NUL bytes before the string. + Then continue in the aligned loop. */ +L(page_cross): + mov tmp3, 63 + bic src, srcin, 15 + and tmp1, srcin, 7 + ands tmp2, srcin, 8 + ldr dataq, [src] + lsl tmp1, tmp1, 3 + csel tmp2, tmp2, tmp1, eq + csel tmp1, tmp1, tmp3, eq + mov tmp4, -1 +#ifdef __AARCH64EB__ + /* Big-endian. Early bytes are at MSB. */ + lsr tmp1, tmp4, tmp1 + lsr tmp2, tmp4, tmp2 +#else + /* Little-endian. Early bytes are at LSB. 
*/ + lsl tmp1, tmp4, tmp1 + lsl tmp2, tmp4, tmp2 +#endif + mov datav2.d[0], tmp1 + mov datav2.d[1], tmp2 + orn datav.16b, datav.16b, datav2.16b + b L(page_cross_entry) +END (__strlen_falkor) +weak_alias (__strlen_falkor, strlen_falkor) +libc_hidden_builtin_def (strlen_falkor) diff --git a/sysdeps/aarch64/multiarch/strlen_generic.S b/sysdeps/aarch64/multiarch/strlen_generic.S new file mode 100644 index 0000000000..a74b0877dc --- /dev/null +++ b/sysdeps/aarch64/multiarch/strlen_generic.S @@ -0,0 +1,42 @@ +/* A Generic Optimized strlen implementation for AARCH64. + Copyright (C) 2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* The actual strlen code is in ../strlen.S. If we are building + libc this file defines __strlen_generic. Otherwise the include + of ../strlen.S will define the normal __strlen entry + point. */ + +#include <sysdep.h> + +#if IS_IN (libc) + +# define STRLEN __strlen_generic + +/* Do not hide the generic version of strlen, we use it + internally. */ +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) + +# ifdef SHARED +/* It doesn't make sense to send libc-internal strlen calls through a PLT. 
*/ + .globl __GI_strlen; __GI_strlen = __strlen_generic +# endif + +#endif + +#include "../strlen.S" diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S index eb773ef532..521ebc3b75 100644 --- a/sysdeps/aarch64/strlen.S +++ b/sysdeps/aarch64/strlen.S @@ -23,6 +23,10 @@ * ARMv8-a, AArch64, unaligned accesses, min page size 4k. */ +#ifndef STRLEN +# define STRLEN __strlen +#endif + /* To test the page crossing code path more thoroughly, compile with -DTEST_PAGE_CROSS - this will force all calls through the slower entry path. This option is not intended for production use. */ @@ -84,7 +88,7 @@ whether the first fetch, which may be misaligned, crosses a page boundary. */ -ENTRY_ALIGN (__strlen, 6) +ENTRY_ALIGN (STRLEN, 6) DELOUSE (0) DELOUSE (1) and tmp1, srcin, MIN_PAGE_SIZE - 1 @@ -215,6 +219,6 @@ L(page_cross): csel data1, data1, tmp4, eq csel data2, data2, tmp2, eq b L(page_cross_entry) -END (__strlen) -weak_alias (__strlen, strlen) +END (STRLEN) +weak_alias (STRLEN, strlen) libc_hidden_builtin_def (strlen)