Message ID | 20230801070902.1385953-3-dengjianbo@loongson.cn |
---|---|
State | New |
Headers | show |
Series | Add ifunc support and different versions of strlen | expand |
On 01/08/23 04:09, dengjianbo wrote: > 1. strlen-lasx is implemeted by LASX simd instructions(256bit) > 2. strlen-lsx is implemeted by LSX simd instructions(128bit) > 3. strlen-align is implemented by LA basic instructions and never use unaligned memory acess Usually optimization routines are added along benchmarks number to show the expected improvements over different sizes and alignment. > --- > sysdeps/loongarch/lp64/multiarch/Makefile | 3 + > .../lp64/multiarch/ifunc-impl-list.c | 39 +++++++ > .../loongarch/lp64/multiarch/ifunc-strlen.h | 36 +++++++ > .../loongarch/lp64/multiarch/strlen-aligned.S | 101 ++++++++++++++++++ > .../loongarch/lp64/multiarch/strlen-lasx.S | 65 +++++++++++ > sysdeps/loongarch/lp64/multiarch/strlen-lsx.S | 73 +++++++++++++ > sysdeps/loongarch/lp64/multiarch/strlen.c | 37 +++++++ > sysdeps/loongarch/sys/regdef.h | 57 ++++++++++ > .../unix/sysv/linux/loongarch/cpu-features.h | 2 + > 9 files changed, 413 insertions(+) > create mode 100644 sysdeps/loongarch/lp64/multiarch/Makefile > create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c > create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h > create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-aligned.S > create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-lasx.S > create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-lsx.S > create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen.c > > diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile > new file mode 100644 > index 0000000000..529a8b6bab > --- /dev/null > +++ b/sysdeps/loongarch/lp64/multiarch/Makefile > @@ -0,0 +1,3 @@ > +ifeq ($(subdir),string) > +sysdep_routines += strlen-aligned strlen-lsx strlen-lasx > +endif One entry per line: sysdep_routines += \ strlen-aligned \ strlen-lsx \ strlen-lasx \ # sysdep_routines > diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c > new 
file mode 100644 > index 0000000000..b35e41127e > --- /dev/null > +++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c > @@ -0,0 +1,39 @@ > +/* Enumerate available IFUNC implementations of a function. LoongArch64 version. > + Copyright (C) 2017-2023 Free Software Foundation, Inc. I think it should be only 2023 here and for other new file as well. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <http://www.gnu.org/licenses/>. */ > + > +#include <assert.h> > +#include <string.h> > +#include <wchar.h> > +#include <ldsodefs.h> > +#include <ifunc-impl-list.h> > +#include <stdio.h> > + > +size_t > +__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > + size_t max) > +{ > + > + size_t i = max; > + > + IFUNC_IMPL (i, name, strlen, > + IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_LASX, __strlen_lasx) > + IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_LSX, __strlen_lsx) > + IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_aligned) > + ) > + return i; > +} > diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h b/sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h > new file mode 100644 > index 0000000000..e2b3490f39 > --- /dev/null > +++ b/sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h > @@ -0,0 +1,36 @@ > +/* Common definition for strlen implementation. 
> + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2017-2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <ldsodefs.h> > +#include <ifunc-init.h> > + > +extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden; > +extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden; > +extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden; > + > +static inline void * > +IFUNC_SELECTOR (void) > +{ > + if (SUPPORT_LASX) > + return OPTIMIZE (lasx); > + else if (SUPPORT_LSX) > + return OPTIMIZE (lsx); > + else > + return OPTIMIZE (aligned); > +} > diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S > new file mode 100644 > index 0000000000..b379e978a7 > --- /dev/null > +++ b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S > @@ -0,0 +1,101 @@ > +/* Copyright (C) 2017-2023 Free Software Foundation, Inc. > + > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. 
> + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. If not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <sysdep.h> > +#include <sys/regdef.h> > +#include <sys/asm.h> > + > +#if IS_IN (libc) > +# define STRLEN __strlen_aligned > +#else > +# define STRLEN strlen > +#endif Is this really an improvement over the generic implementation? It seems to use a quite similar strategy. > + > +LEAF(STRLEN, 6) > + move a1, a0 > + bstrins.d a0, zero, 2, 0 > + lu12i.w a2, 0x01010 > + li.w t0, -1 > + > + ld.d t2, a0, 0 > + andi t1, a1, 0x7 > + ori a2, a2, 0x101 > + slli.d t1, t1, 3 > + > + bstrins.d a2, a2, 63, 32 > + sll.d t1, t0, t1 > + slli.d t3, a2, 7 > + nor a3, zero, t3 > + > + orn t2, t2, t1 > + sub.d t0, t2, a2 > + nor t1, t2, a3 > + and t0, t0, t1 > + > + > + bnez t0, L(count_pos) > + addi.d a0, a0, 8 > +L(loop_16_7bit): > + ld.d t2, a0, 0 > + sub.d t1, t2, a2 > + > + and t0, t1, t3 > + bnez t0, L(more_check) > + ld.d t2, a0, 8 > + sub.d t1, t2, a2 > + > + and t0, t1, t3 > + addi.d a0, a0, 16 > + beqz t0, L(loop_16_7bit) > + addi.d a0, a0, -8 > + > +L(more_check): > + nor t0, t2, a3 > + and t0, t1, t0 > + bnez t0, L(count_pos) > + addi.d a0, a0, 8 > + > + > +L(loop_16_8bit): > + ld.d t2, a0, 0 > + sub.d t1, t2, a2 > + nor t0, t2, a3 > + and t0, t0, t1 > + > + bnez t0, L(count_pos) > + ld.d t2, a0, 8 > + addi.d a0, a0, 16 > + sub.d t1, t2, a2 > + > + nor t0, t2, a3 > + and t0, t0, t1 > + beqz t0, L(loop_16_8bit) > + addi.d a0, a0, -8 > + > +L(count_pos): > + ctz.d t1, t0 > + sub.d a0, a0, a1 > + srli.d t1, t1, 3 > + add.d a0, a0, t1 > + > + jr ra > +END(STRLEN) > + > +#ifdef _LIBC > +libc_hidden_builtin_def (STRLEN) > +#endif > diff 
--git a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S > new file mode 100644 > index 0000000000..56ac6403d3 > --- /dev/null > +++ b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S > @@ -0,0 +1,65 @@ Missing one line comment. > +/* Copyright (C) 2017-2023 Free Software Foundation, Inc. > + > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. If not, see > + <https://www.gnu.org/licenses/>. 
*/ > + > +#include <sysdep.h> > +#include <sys/regdef.h> > +#include <sys/asm.h> > + > +#if IS_IN (libc) > + > +# define STRLEN __strlen_lasx > + > +LEAF(STRLEN, 6) > + move a1, a0 > + bstrins.d a0, zero, 4, 0 > + li.d t1, -1 > + xvld xr0, a0, 0 > + > + xvmsknz.b xr0, xr0 > + xvpickve.w xr1, xr0, 4 > + vilvl.h vr0, vr1, vr0 > + movfr2gr.s t0, fa0 # sign extend > + > + sra.w t0, t0, a1 > + beq t0, t1, L(loop) > + cto.w a0, t0 > + jr ra > + > +L(loop): > + xvld xr0, a0, 32 > + addi.d a0, a0, 32 > + xvsetanyeqz.b fcc0, xr0 > + bceqz fcc0, L(loop) > + > + > + xvmsknz.b xr0, xr0 > + sub.d a0, a0, a1 > + xvpickve.w xr1, xr0, 4 > + vilvl.h vr0, vr1, vr0 > + > + movfr2gr.s t0, fa0 > + cto.w t0, t0 > + add.d a0, a0, t0 > + jr ra > +END(STRLEN) > + > +#ifdef _LIBC > +libc_hidden_builtin_def (STRLEN) > +#endif > + > +#endif This implementation fails to assemble with binutils 2.40.0.20230525: ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:31: Error: no match insn: xvld $xr0,$r4,0 ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:33: Error: no match insn: xvmsknz.b $xr0,$xr0 ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:34: Error: no match insn: xvpickve.w $xr1,$xr0,4 ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:35: Error: no match insn: vilvl.h $vr0,$vr1,$vr0 ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:44: Error: no match insn: xvld $xr0,$r4,32 ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:46: Error: no match insn: xvsetanyeqz.b $fcc0,$xr0 ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:50: Error: no match insn: xvmsknz.b $xr0,$xr0 ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:52: Error: no match insn: xvpickve.w $xr1,$xr0,4 ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:53: Error: no match insn: vilvl.h $vr0,$vr1,$vr0 You need to either add a configure option to increase the minimum required binutils or add a macro to synthesize the instructions on older binutils (similar to what sysdeps/powerpc/powerpc64/le/power9/strncmp.S does). 
> diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S > new file mode 100644 > index 0000000000..1c19c98b5b > --- /dev/null > +++ b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S > @@ -0,0 +1,73 @@ > +/* Copyright (C) 2017-2023 Free Software Foundation, Inc. > + > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. If not, see > + <https://www.gnu.org/licenses/>. 
*/ > + > +#include <sysdep.h> > +#include <sys/regdef.h> > +#include <sys/asm.h> > + > +#if IS_IN (libc) > + > +# define STRLEN __strlen_lsx > + > +LEAF(STRLEN, 6) > + move a1, a0 > + bstrins.d a0, zero, 4, 0 > + vld vr0, a0, 0 > + vld vr1, a0, 16 > + > + li.d t1, -1 > + vmsknz.b vr0, vr0 > + vmsknz.b vr1, vr1 > + vilvl.h vr0, vr1, vr0 > + > + movfr2gr.s t0, fa0 > + sra.w t0, t0, a1 > + beq t0, t1, L(loop) > + cto.w a0, t0 > + > + jr ra > + nop > + nop > + nop > + > + > +L(loop): > + vld vr0, a0, 32 > + vld vr1, a0, 48 > + addi.d a0, a0, 32 > + vmin.bu vr2, vr0, vr1 > + > + vsetanyeqz.b fcc0, vr2 > + bceqz fcc0, L(loop) > + vmsknz.b vr0, vr0 > + vmsknz.b vr1, vr1 > + > + vilvl.h vr0, vr1, vr0 > + sub.d a0, a0, a1 > + movfr2gr.s t0, fa0 > + cto.w t0, t0 > + > + add.d a0, a0, t0 > + jr ra > +END(STRLEN) > + > +#ifdef _LIBC > +libc_hidden_builtin_def (STRLEN) > +#endif > + > +#endif This implementation fails to assembler with binutils 2.40.0.20230525: ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S: Assembler messages: ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:30: Error: no match insn: vld $vr0,$r4,0 ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:31: Error: no match insn: vld $vr1,$r4,16 ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:34: Error: no match insn: vmsknz.b $vr0,$vr0 ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:35: Error: no match insn: vmsknz.b $vr1,$vr1 ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:36: Error: no match insn: vilvl.h $vr0,$vr1,$vr0 ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:50: Error: no match insn: vld $vr0,$r4,32 ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:51: Error: no match insn: vld $vr1,$r4,48 ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:53: Error: no match insn: vmin.bu $vr2,$vr0,$vr1 ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:55: Error: no match insn: vsetanyeqz.b $fcc0,$vr2 ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:57: Error: no match insn: vmsknz.b $vr0,$vr0 
../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:58: Error: no match insn: vmsknz.b $vr1,$vr1 ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:60: Error: no match insn: vilvl.h $vr0,$vr1,$vr0 > diff --git a/sysdeps/loongarch/lp64/multiarch/strlen.c b/sysdeps/loongarch/lp64/multiarch/strlen.c > new file mode 100644 > index 0000000000..416ed0d9e2 > --- /dev/null > +++ b/sysdeps/loongarch/lp64/multiarch/strlen.c > @@ -0,0 +1,37 @@ > +/* Multiple versions of strlen. > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2017-2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +/* Define multiple versions only for the definition in libc. 
*/ > + > +#if IS_IN (libc) > +# define strlen __redirect_strlen > +# include <string.h> > +# undef strlen > + > +# define SYMBOL_NAME strlen > +# include "ifunc-strlen.h" > + > +libc_ifunc_redirected (__redirect_strlen, strlen, IFUNC_SELECTOR ()); > + > +# ifdef SHARED > +__hidden_ver1 (strlen, __GI_strlen, __redirect_strlen) > + __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlen); > +# endif > + > +#endif > diff --git a/sysdeps/loongarch/sys/regdef.h b/sysdeps/loongarch/sys/regdef.h > index 5100f36d24..524d2e3277 100644 > --- a/sysdeps/loongarch/sys/regdef.h > +++ b/sysdeps/loongarch/sys/regdef.h > @@ -89,6 +89,14 @@ > #define fs5 $f29 > #define fs6 $f30 > #define fs7 $f31 > +#define fcc0 $fcc0 > +#define fcc1 $fcc1 > +#define fcc2 $fcc2 > +#define fcc3 $fcc3 > +#define fcc4 $fcc4 > +#define fcc5 $fcc5 > +#define fcc6 $fcc6 > +#define fcc7 $fcc7 > > #define vr0 $vr0 > #define vr1 $vr1 > @@ -98,6 +106,30 @@ > #define vr5 $vr5 > #define vr6 $vr6 > #define vr7 $vr7 > +#define vr8 $vr8 > +#define vr9 $vr9 > +#define vr10 $vr10 > +#define vr11 $vr11 > +#define vr12 $vr12 > +#define vr13 $vr13 > +#define vr14 $vr14 > +#define vr15 $vr15 > +#define vr16 $vr16 > +#define vr17 $vr17 > +#define vr18 $vr18 > +#define vr19 $vr19 > +#define vr20 $vr20 > +#define vr21 $vr21 > +#define vr22 $vr22 > +#define vr23 $vr23 > +#define vr24 $vr24 > +#define vr25 $vr25 > +#define vr26 $vr26 > +#define vr27 $vr27 > +#define vr28 $vr28 > +#define vr29 $vr29 > +#define vr30 $vr30 > +#define vr31 $vr31 > > #define xr0 $xr0 > #define xr1 $xr1 > @@ -107,5 +139,30 @@ > #define xr5 $xr5 > #define xr6 $xr6 > #define xr7 $xr7 > +#define xr7 $xr7 > +#define xr8 $xr8 > +#define xr9 $xr9 > +#define xr10 $xr10 > +#define xr11 $xr11 > +#define xr12 $xr12 > +#define xr13 $xr13 > +#define xr14 $xr14 > +#define xr15 $xr15 > +#define xr16 $xr16 > +#define xr17 $xr17 > +#define xr18 $xr18 > +#define xr19 $xr19 > +#define xr20 $xr20 > +#define xr21 $xr21 > +#define xr22 $xr22 > +#define xr23 
$xr23 > +#define xr24 $xr24 > +#define xr25 $xr25 > +#define xr26 $xr26 > +#define xr27 $xr27 > +#define xr28 $xr28 > +#define xr29 $xr29 > +#define xr30 $xr30 > +#define xr31 $xr31 > > #endif /* _SYS_REGDEF_H */ > diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h > index e371e13b15..d1a280a5ee 100644 > --- a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h > +++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h > @@ -25,5 +25,7 @@ > #define SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX) > #define SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX) > > +#define INIT_ARCH() > + > #endif /* _CPU_FEATURES_LOONGARCH64_H */ >
On Tue, 2023-08-01 at 15:09 +0800, dengjianbo wrote: /* snip */ > diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile > b/sysdeps/loongarch/lp64/multiarch/Makefile > new file mode 100644 > index 0000000000..529a8b6bab > --- /dev/null > +++ b/sysdeps/loongarch/lp64/multiarch/Makefile > @@ -0,0 +1,3 @@ > +ifeq ($(subdir),string) > +sysdep_routines += strlen-aligned strlen-lsx strlen-lasx > +endif Please check whether the assembler supports LSX/LASX; if it does not, strlen-lsx and strlen-lasx should not be added here. We don't want to disallow building Glibc for LoongArch with an old assembler.
在 2023/8/1 下午10:31, Adhemerval Zanella Netto 写道: > > On 01/08/23 04:09, dengjianbo wrote: >> 1. strlen-lasx is implemeted by LASX simd instructions(256bit) >> 2. strlen-lsx is implemeted by LSX simd instructions(128bit) >> 3. strlen-align is implemented by LA basic instructions and never use unaligned memory acess > Usually optimization routines are added along benchmarks number to show > the expected improvements over different sizes and alignment. The performance test plots for these functions over different sizes and alignment are here: https://github.com/jiadengx/glibc_test/blob/main/strlen/strlen_align.png https://github.com/jiadengx/glibc_test/blob/main/strlen/strlen_lasx.png https://github.com/jiadengx/glibc_test/blob/main/strlen/strlen_lsx.png > >> --- >> sysdeps/loongarch/lp64/multiarch/Makefile | 3 + >> .../lp64/multiarch/ifunc-impl-list.c | 39 +++++++ >> .../loongarch/lp64/multiarch/ifunc-strlen.h | 36 +++++++ >> .../loongarch/lp64/multiarch/strlen-aligned.S | 101 ++++++++++++++++++ >> .../loongarch/lp64/multiarch/strlen-lasx.S | 65 +++++++++++ >> sysdeps/loongarch/lp64/multiarch/strlen-lsx.S | 73 +++++++++++++ >> sysdeps/loongarch/lp64/multiarch/strlen.c | 37 +++++++ >> sysdeps/loongarch/sys/regdef.h | 57 ++++++++++ >> .../unix/sysv/linux/loongarch/cpu-features.h | 2 + >> 9 files changed, 413 insertions(+) >> create mode 100644 sysdeps/loongarch/lp64/multiarch/Makefile >> create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c >> create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h >> create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-aligned.S >> create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-lasx.S >> create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-lsx.S >> create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen.c >> >> diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile >> new file mode 100644 >> index 0000000000..529a8b6bab >> --- /dev/null 
>> +++ b/sysdeps/loongarch/lp64/multiarch/Makefile >> @@ -0,0 +1,3 @@ >> +ifeq ($(subdir),string) >> +sysdep_routines += strlen-aligned strlen-lsx strlen-lasx >> +endif > One entry per line: > > sysdep_routines += \ > strlen-aligned \ > strlen-lsx \ > strlen-lasx \ > # sysdep_routines > >> diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c >> new file mode 100644 >> index 0000000000..b35e41127e >> --- /dev/null >> +++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c >> @@ -0,0 +1,39 @@ >> +/* Enumerate available IFUNC implementations of a function. LoongArch64 version. >> + Copyright (C) 2017-2023 Free Software Foundation, Inc. > I think it should be only 2023 here and for other new file as well. > >> + This file is part of the GNU C Library. >> + >> + The GNU C Library is free software; you can redistribute it and/or >> + modify it under the terms of the GNU Lesser General Public >> + License as published by the Free Software Foundation; either >> + version 2.1 of the License, or (at your option) any later version. >> + >> + The GNU C Library is distributed in the hope that it will be useful, >> + but WITHOUT ANY WARRANTY; without even the implied warranty of >> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> + Lesser General Public License for more details. >> + >> + You should have received a copy of the GNU Lesser General Public >> + License along with the GNU C Library; if not, see >> + <http://www.gnu.org/licenses/>. 
*/ >> + >> +#include <assert.h> >> +#include <string.h> >> +#include <wchar.h> >> +#include <ldsodefs.h> >> +#include <ifunc-impl-list.h> >> +#include <stdio.h> >> + >> +size_t >> +__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, >> + size_t max) >> +{ >> + >> + size_t i = max; >> + >> + IFUNC_IMPL (i, name, strlen, >> + IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_LASX, __strlen_lasx) >> + IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_LSX, __strlen_lsx) >> + IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_aligned) >> + ) >> + return i; >> +} >> diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h b/sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h >> new file mode 100644 >> index 0000000000..e2b3490f39 >> --- /dev/null >> +++ b/sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h >> @@ -0,0 +1,36 @@ >> +/* Common definition for strlen implementation. >> + All versions must be listed in ifunc-impl-list.c. >> + Copyright (C) 2017-2023 Free Software Foundation, Inc. >> + This file is part of the GNU C Library. >> + >> + The GNU C Library is free software; you can redistribute it and/or >> + modify it under the terms of the GNU Lesser General Public >> + License as published by the Free Software Foundation; either >> + version 2.1 of the License, or (at your option) any later version. >> + >> + The GNU C Library is distributed in the hope that it will be useful, >> + but WITHOUT ANY WARRANTY; without even the implied warranty of >> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> + Lesser General Public License for more details. >> + >> + You should have received a copy of the GNU Lesser General Public >> + License along with the GNU C Library; if not, see >> + <https://www.gnu.org/licenses/>. 
*/ >> + >> +#include <ldsodefs.h> >> +#include <ifunc-init.h> >> + >> +extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden; >> +extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden; >> +extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden; >> + >> +static inline void * >> +IFUNC_SELECTOR (void) >> +{ >> + if (SUPPORT_LASX) >> + return OPTIMIZE (lasx); >> + else if (SUPPORT_LSX) >> + return OPTIMIZE (lsx); >> + else >> + return OPTIMIZE (aligned); >> +} >> diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S >> new file mode 100644 >> index 0000000000..b379e978a7 >> --- /dev/null >> +++ b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S >> @@ -0,0 +1,101 @@ >> +/* Copyright (C) 2017-2023 Free Software Foundation, Inc. >> + >> + This file is part of the GNU C Library. >> + >> + The GNU C Library is free software; you can redistribute it and/or >> + modify it under the terms of the GNU Lesser General Public >> + License as published by the Free Software Foundation; either >> + version 2.1 of the License, or (at your option) any later version. >> + >> + The GNU C Library is distributed in the hope that it will be useful, >> + but WITHOUT ANY WARRANTY; without even the implied warranty of >> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> + Lesser General Public License for more details. >> + >> + You should have received a copy of the GNU Lesser General Public >> + License along with the GNU C Library. If not, see >> + <https://www.gnu.org/licenses/>. */ >> + >> +#include <sysdep.h> >> +#include <sys/regdef.h> >> +#include <sys/asm.h> >> + >> +#if IS_IN (libc) >> +# define STRLEN __strlen_aligned >> +#else >> +# define STRLEN strlen >> +#endif > Is this really an improvement over the generic implementation? It seems to > use a quite similar strategy. 
> >> + >> +LEAF(STRLEN, 6) >> + move a1, a0 >> + bstrins.d a0, zero, 2, 0 >> + lu12i.w a2, 0x01010 >> + li.w t0, -1 >> + >> + ld.d t2, a0, 0 >> + andi t1, a1, 0x7 >> + ori a2, a2, 0x101 >> + slli.d t1, t1, 3 >> + >> + bstrins.d a2, a2, 63, 32 >> + sll.d t1, t0, t1 >> + slli.d t3, a2, 7 >> + nor a3, zero, t3 >> + >> + orn t2, t2, t1 >> + sub.d t0, t2, a2 >> + nor t1, t2, a3 >> + and t0, t0, t1 >> + >> + >> + bnez t0, L(count_pos) >> + addi.d a0, a0, 8 >> +L(loop_16_7bit): >> + ld.d t2, a0, 0 >> + sub.d t1, t2, a2 >> + >> + and t0, t1, t3 >> + bnez t0, L(more_check) >> + ld.d t2, a0, 8 >> + sub.d t1, t2, a2 >> + >> + and t0, t1, t3 >> + addi.d a0, a0, 16 >> + beqz t0, L(loop_16_7bit) >> + addi.d a0, a0, -8 >> + >> +L(more_check): >> + nor t0, t2, a3 >> + and t0, t1, t0 >> + bnez t0, L(count_pos) >> + addi.d a0, a0, 8 >> + >> + >> +L(loop_16_8bit): >> + ld.d t2, a0, 0 >> + sub.d t1, t2, a2 >> + nor t0, t2, a3 >> + and t0, t0, t1 >> + >> + bnez t0, L(count_pos) >> + ld.d t2, a0, 8 >> + addi.d a0, a0, 16 >> + sub.d t1, t2, a2 >> + >> + nor t0, t2, a3 >> + and t0, t0, t1 >> + beqz t0, L(loop_16_8bit) >> + addi.d a0, a0, -8 >> + >> +L(count_pos): >> + ctz.d t1, t0 >> + sub.d a0, a0, a1 >> + srli.d t1, t1, 3 >> + add.d a0, a0, t1 >> + >> + jr ra >> +END(STRLEN) >> + >> +#ifdef _LIBC >> +libc_hidden_builtin_def (STRLEN) >> +#endif >> diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S >> new file mode 100644 >> index 0000000000..56ac6403d3 >> --- /dev/null >> +++ b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S >> @@ -0,0 +1,65 @@ > Missing one line comment. > >> +/* Copyright (C) 2017-2023 Free Software Foundation, Inc. >> + >> + This file is part of the GNU C Library. 
>> + >> + The GNU C Library is free software; you can redistribute it and/or >> + modify it under the terms of the GNU Lesser General Public >> + License as published by the Free Software Foundation; either >> + version 2.1 of the License, or (at your option) any later version. >> + >> + The GNU C Library is distributed in the hope that it will be useful, >> + but WITHOUT ANY WARRANTY; without even the implied warranty of >> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> + Lesser General Public License for more details. >> + >> + You should have received a copy of the GNU Lesser General Public >> + License along with the GNU C Library. If not, see >> + <https://www.gnu.org/licenses/>. */ >> + >> +#include <sysdep.h> >> +#include <sys/regdef.h> >> +#include <sys/asm.h> >> + >> +#if IS_IN (libc) >> + >> +# define STRLEN __strlen_lasx >> + >> +LEAF(STRLEN, 6) >> + move a1, a0 >> + bstrins.d a0, zero, 4, 0 >> + li.d t1, -1 >> + xvld xr0, a0, 0 >> + >> + xvmsknz.b xr0, xr0 >> + xvpickve.w xr1, xr0, 4 >> + vilvl.h vr0, vr1, vr0 >> + movfr2gr.s t0, fa0 # sign extend >> + >> + sra.w t0, t0, a1 >> + beq t0, t1, L(loop) >> + cto.w a0, t0 >> + jr ra >> + >> +L(loop): >> + xvld xr0, a0, 32 >> + addi.d a0, a0, 32 >> + xvsetanyeqz.b fcc0, xr0 >> + bceqz fcc0, L(loop) >> + >> + >> + xvmsknz.b xr0, xr0 >> + sub.d a0, a0, a1 >> + xvpickve.w xr1, xr0, 4 >> + vilvl.h vr0, vr1, vr0 >> + >> + movfr2gr.s t0, fa0 >> + cto.w t0, t0 >> + add.d a0, a0, t0 >> + jr ra >> +END(STRLEN) >> + >> +#ifdef _LIBC >> +libc_hidden_builtin_def (STRLEN) >> +#endif >> + >> +#endif > This implementation fails to assembler with binutils 2.40.0.20230525: > > ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:31: Error: no match insn: xvld $xr0,$r4,0 > ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:33: Error: no match insn: xvmsknz.b $xr0,$xr0 > ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:34: Error: no match insn: xvpickve.w $xr1,$xr0,4 > 
../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:35: Error: no match insn: vilvl.h $vr0,$vr1,$vr0 > ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:44: Error: no match insn: xvld $xr0,$r4,32 > ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:46: Error: no match insn: xvsetanyeqz.b $fcc0,$xr0 > ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:50: Error: no match insn: xvmsknz.b $xr0,$xr0 > ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:52: Error: no match insn: xvpickve.w $xr1,$xr0,4 > ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:53: Error: no match insn: vilvl.h $vr0,$vr1,$vr0 > > You need to either add a configure option to increase the minimum required > binutils or add a macro to synthetize the instruction on older binutils > (similar to what sysdeps/powerpc/powerpc64/le/power9/strncmp.S does). > > >> diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S >> new file mode 100644 >> index 0000000000..1c19c98b5b >> --- /dev/null >> +++ b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S >> @@ -0,0 +1,73 @@ >> +/* Copyright (C) 2017-2023 Free Software Foundation, Inc. >> + >> + This file is part of the GNU C Library. >> + >> + The GNU C Library is free software; you can redistribute it and/or >> + modify it under the terms of the GNU Lesser General Public >> + License as published by the Free Software Foundation; either >> + version 2.1 of the License, or (at your option) any later version. >> + >> + The GNU C Library is distributed in the hope that it will be useful, >> + but WITHOUT ANY WARRANTY; without even the implied warranty of >> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> + Lesser General Public License for more details. >> + >> + You should have received a copy of the GNU Lesser General Public >> + License along with the GNU C Library. If not, see >> + <https://www.gnu.org/licenses/>. 
*/ >> + >> +#include <sysdep.h> >> +#include <sys/regdef.h> >> +#include <sys/asm.h> >> + >> +#if IS_IN (libc) >> + >> +# define STRLEN __strlen_lsx >> + >> +LEAF(STRLEN, 6) >> + move a1, a0 >> + bstrins.d a0, zero, 4, 0 >> + vld vr0, a0, 0 >> + vld vr1, a0, 16 >> + >> + li.d t1, -1 >> + vmsknz.b vr0, vr0 >> + vmsknz.b vr1, vr1 >> + vilvl.h vr0, vr1, vr0 >> + >> + movfr2gr.s t0, fa0 >> + sra.w t0, t0, a1 >> + beq t0, t1, L(loop) >> + cto.w a0, t0 >> + >> + jr ra >> + nop >> + nop >> + nop >> + >> + >> +L(loop): >> + vld vr0, a0, 32 >> + vld vr1, a0, 48 >> + addi.d a0, a0, 32 >> + vmin.bu vr2, vr0, vr1 >> + >> + vsetanyeqz.b fcc0, vr2 >> + bceqz fcc0, L(loop) >> + vmsknz.b vr0, vr0 >> + vmsknz.b vr1, vr1 >> + >> + vilvl.h vr0, vr1, vr0 >> + sub.d a0, a0, a1 >> + movfr2gr.s t0, fa0 >> + cto.w t0, t0 >> + >> + add.d a0, a0, t0 >> + jr ra >> +END(STRLEN) >> + >> +#ifdef _LIBC >> +libc_hidden_builtin_def (STRLEN) >> +#endif >> + >> +#endif > This implementation fails to assembler with binutils 2.40.0.20230525: > > ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S: Assembler messages: > ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:30: Error: no match insn: vld $vr0,$r4,0 > ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:31: Error: no match insn: vld $vr1,$r4,16 > ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:34: Error: no match insn: vmsknz.b $vr0,$vr0 > ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:35: Error: no match insn: vmsknz.b $vr1,$vr1 > ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:36: Error: no match insn: vilvl.h $vr0,$vr1,$vr0 > ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:50: Error: no match insn: vld $vr0,$r4,32 > ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:51: Error: no match insn: vld $vr1,$r4,48 > ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:53: Error: no match insn: vmin.bu $vr2,$vr0,$vr1 > ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:55: Error: no match insn: vsetanyeqz.b $fcc0,$vr2 > 
../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:57: Error: no match insn: vmsknz.b $vr0,$vr0 > ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:58: Error: no match insn: vmsknz.b $vr1,$vr1 > ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:60: Error: no match insn: vilvl.h $vr0,$vr1,$vr0 > >> diff --git a/sysdeps/loongarch/lp64/multiarch/strlen.c b/sysdeps/loongarch/lp64/multiarch/strlen.c >> new file mode 100644 >> index 0000000000..416ed0d9e2 >> --- /dev/null >> +++ b/sysdeps/loongarch/lp64/multiarch/strlen.c >> @@ -0,0 +1,37 @@ >> +/* Multiple versions of strlen. >> + All versions must be listed in ifunc-impl-list.c. >> + Copyright (C) 2017-2023 Free Software Foundation, Inc. >> + This file is part of the GNU C Library. >> + >> + The GNU C Library is free software; you can redistribute it and/or >> + modify it under the terms of the GNU Lesser General Public >> + License as published by the Free Software Foundation; either >> + version 2.1 of the License, or (at your option) any later version. >> + >> + The GNU C Library is distributed in the hope that it will be useful, >> + but WITHOUT ANY WARRANTY; without even the implied warranty of >> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> + Lesser General Public License for more details. >> + >> + You should have received a copy of the GNU Lesser General Public >> + License along with the GNU C Library; if not, see >> + <https://www.gnu.org/licenses/>. */ >> + >> +/* Define multiple versions only for the definition in libc. 
*/ >> + >> +#if IS_IN (libc) >> +# define strlen __redirect_strlen >> +# include <string.h> >> +# undef strlen >> + >> +# define SYMBOL_NAME strlen >> +# include "ifunc-strlen.h" >> + >> +libc_ifunc_redirected (__redirect_strlen, strlen, IFUNC_SELECTOR ()); >> + >> +# ifdef SHARED >> +__hidden_ver1 (strlen, __GI_strlen, __redirect_strlen) >> + __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlen); >> +# endif >> + >> +#endif >> diff --git a/sysdeps/loongarch/sys/regdef.h b/sysdeps/loongarch/sys/regdef.h >> index 5100f36d24..524d2e3277 100644 >> --- a/sysdeps/loongarch/sys/regdef.h >> +++ b/sysdeps/loongarch/sys/regdef.h >> @@ -89,6 +89,14 @@ >> #define fs5 $f29 >> #define fs6 $f30 >> #define fs7 $f31 >> +#define fcc0 $fcc0 >> +#define fcc1 $fcc1 >> +#define fcc2 $fcc2 >> +#define fcc3 $fcc3 >> +#define fcc4 $fcc4 >> +#define fcc5 $fcc5 >> +#define fcc6 $fcc6 >> +#define fcc7 $fcc7 >> >> #define vr0 $vr0 >> #define vr1 $vr1 >> @@ -98,6 +106,30 @@ >> #define vr5 $vr5 >> #define vr6 $vr6 >> #define vr7 $vr7 >> +#define vr8 $vr8 >> +#define vr9 $vr9 >> +#define vr10 $vr10 >> +#define vr11 $vr11 >> +#define vr12 $vr12 >> +#define vr13 $vr13 >> +#define vr14 $vr14 >> +#define vr15 $vr15 >> +#define vr16 $vr16 >> +#define vr17 $vr17 >> +#define vr18 $vr18 >> +#define vr19 $vr19 >> +#define vr20 $vr20 >> +#define vr21 $vr21 >> +#define vr22 $vr22 >> +#define vr23 $vr23 >> +#define vr24 $vr24 >> +#define vr25 $vr25 >> +#define vr26 $vr26 >> +#define vr27 $vr27 >> +#define vr28 $vr28 >> +#define vr29 $vr29 >> +#define vr30 $vr30 >> +#define vr31 $vr31 >> >> #define xr0 $xr0 >> #define xr1 $xr1 >> @@ -107,5 +139,30 @@ >> #define xr5 $xr5 >> #define xr6 $xr6 >> #define xr7 $xr7 >> +#define xr7 $xr7 >> +#define xr8 $xr8 >> +#define xr9 $xr9 >> +#define xr10 $xr10 >> +#define xr11 $xr11 >> +#define xr12 $xr12 >> +#define xr13 $xr13 >> +#define xr14 $xr14 >> +#define xr15 $xr15 >> +#define xr16 $xr16 >> +#define xr17 $xr17 >> +#define xr18 $xr18 >> +#define xr19 
$xr19 >> +#define xr20 $xr20 >> +#define xr21 $xr21 >> +#define xr22 $xr22 >> +#define xr23 $xr23 >> +#define xr24 $xr24 >> +#define xr25 $xr25 >> +#define xr26 $xr26 >> +#define xr27 $xr27 >> +#define xr28 $xr28 >> +#define xr29 $xr29 >> +#define xr30 $xr30 >> +#define xr31 $xr31 >> >> #endif /* _SYS_REGDEF_H */ >> diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h >> index e371e13b15..d1a280a5ee 100644 >> --- a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h >> +++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h >> @@ -25,5 +25,7 @@ >> #define SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX) >> #define SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX) >> >> +#define INIT_ARCH() >> + >> #endif /* _CPU_FEATURES_LOONGARCH64_H */ >>
 >>> >>> On 2023-08-02 10:31, Adhemerval Zanella Netto wrote: >>> +#if IS_IN (libc) >>> +# define STRLEN __strlen_aligned >>> +#else >>> +# define STRLEN strlen >>> +#endif >> Is this really an improvement over the generic implementation? It >> seems to >> use a quite similar strategy. Compared with the code generated by the compiler, the assembly code unrolls the loop to 16 bytes per iteration and handles ASCII and non-ASCII data separately, which takes fewer instructions to calculate the length of ASCII data. Besides, the assembly code uses fewer instructions to start the loop. I think the performance improvement comes from this. Please also check the benchmark results at: https://github.com/jiadengx/glibc_test/blob/main/strlen/bench-strlen.out >> This implementation fails to assembler with binutils 2.40.0.20230525: >> >> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:31: Error: no match >> insn: xvld $xr0,$r4,0 >> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:33: Error: no match >> insn: xvmsknz.b $xr0,$xr0 >> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:34: Error: no match >> insn: xvpickve.w $xr1,$xr0,4 >> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:35: Error: no match >> insn: vilvl.h $vr0,$vr1,$vr0 >> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:44: Error: no match >> insn: xvld $xr0,$r4,32 >> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:46: Error: no match >> insn: xvsetanyeqz.b $fcc0,$xr0 >> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:50: Error: no match >> insn: xvmsknz.b $xr0,$xr0 >> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:52: Error: no match >> insn: xvpickve.w $xr1,$xr0,4 >> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:53: Error: no match >> insn: vilvl.h $vr0,$vr1,$vr0 >> >> You need to either add a configure option to increase the minimum >> required >> binutils or add a macro to synthetize the instruction on older binutils >> (similar to what sysdeps/powerpc/powerpc64/le/power9/strncmp.S does). 
A configuration variable, loongarch_vec_asm, has been added in patch v2. During configuration, it checks whether the assembler supports LSX/LASX and decides whether the strlen LSX/LASX code gets compiled. diff --git a/sysdeps/loongarch/configure.ac b/sysdeps/loongarch/configure.ac index 39efccfd8f..9fadf7bb9d 100644 --- a/sysdeps/loongarch/configure.ac +++ b/sysdeps/loongarch/configure.ac @@ -74,6 +74,8 @@ else libc_cv_loongarch_vec_asm=no fi rm -f conftest*]) +LIBC_CONFIG_VAR([loongarch_vec_asm], [$libc_cv_loongarch_vec_asm]) + if test $libc_cv_loongarch_vec_asm = yes; then AC_DEFINE(HAVE_LOONGARCH_VEC_ASM) fi diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile new file mode 100644 index 0000000000..73b7f61969 --- /dev/null +++ b/sysdeps/loongarch/lp64/multiarch/Makefile @@ -0,0 +1,11 @@ +ifeq ($(subdir),string) +sysdep_routines += strlen-aligned \ + # sysdep_routines + +ifeq ($(loongarch_vec_asm), yes) +sysdep_routines += strlen-lsx \ + strlen-lasx \ + # sysdep_routines +endif + +endif Detailed info can be found at: https://sourceware.org/pipermail/libc-alpha/2023-August/150566.html >> This implementation fails to assembler with binutils 2.40.0.20230525: >> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S: Assembler messages: >> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:30: Error: no match >> insn: vld $vr0,$r4,0 >> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:31: Error: no match >> insn: vld $vr1,$r4,16 >> Sorry, it's my mistake for the wrong version of binutils. Could you please try the latest release version 2.41? The following issues are also fixed in patch v2: 1. Missing one line comment. 2. I think it should be only 2023 here and for other new file as well. (Copyright) On 2023-08-02 09:25, caiyinyu wrote: > > 在 2023/8/1 下午10:31, Adhemerval Zanella Netto 写道: >> >> On 01/08/23 04:09, dengjianbo wrote: >>> 1. strlen-lasx is implemeted by LASX simd instructions(256bit) >>> 2. 
strlen-lsx is implemeted by LSX simd instructions(128bit) >>> 3. strlen-align is implemented by LA basic instructions and never >>> use unaligned memory acess >> Usually optimization routines are added along benchmarks number to show >> the expected improvements over different sizes and alignment. > > The performance test plots for these functions over different sizes > and alignment are here: > https://github.com/jiadengx/glibc_test/blob/main/strlen/strlen_align.png > https://github.com/jiadengx/glibc_test/blob/main/strlen/strlen_lasx.png > https://github.com/jiadengx/glibc_test/blob/main/strlen/strlen_lsx.png > > >> >>> --- >>> sysdeps/loongarch/lp64/multiarch/Makefile | 3 + >>> .../lp64/multiarch/ifunc-impl-list.c | 39 +++++++ >>> .../loongarch/lp64/multiarch/ifunc-strlen.h | 36 +++++++ >>> .../loongarch/lp64/multiarch/strlen-aligned.S | 101 >>> ++++++++++++++++++ >>> .../loongarch/lp64/multiarch/strlen-lasx.S | 65 +++++++++++ >>> sysdeps/loongarch/lp64/multiarch/strlen-lsx.S | 73 +++++++++++++ >>> sysdeps/loongarch/lp64/multiarch/strlen.c | 37 +++++++ >>> sysdeps/loongarch/sys/regdef.h | 57 ++++++++++ >>> .../unix/sysv/linux/loongarch/cpu-features.h | 2 + >>> 9 files changed, 413 insertions(+) >>> create mode 100644 sysdeps/loongarch/lp64/multiarch/Makefile >>> create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c >>> create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h >>> create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-aligned.S >>> create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-lasx.S >>> create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen-lsx.S >>> create mode 100644 sysdeps/loongarch/lp64/multiarch/strlen.c >>> >>> diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile >>> b/sysdeps/loongarch/lp64/multiarch/Makefile >>> new file mode 100644 >>> index 0000000000..529a8b6bab >>> --- /dev/null >>> +++ b/sysdeps/loongarch/lp64/multiarch/Makefile >>> @@ -0,0 +1,3 @@ >>> +ifeq ($(subdir),string) >>> 
+sysdep_routines += strlen-aligned strlen-lsx strlen-lasx >>> +endif >> One entry per line: >> >> sysdep_routines += \ >> strlen-aligned \ >> strlen-lsx \ >> strlen-lasx \ >> # sysdep_routines >> >>> diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c >>> b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c >>> new file mode 100644 >>> index 0000000000..b35e41127e >>> --- /dev/null >>> +++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c >>> @@ -0,0 +1,39 @@ >>> +/* Enumerate available IFUNC implementations of a function. >>> LoongArch64 version. >>> + Copyright (C) 2017-2023 Free Software Foundation, Inc. >> I think it should be only 2023 here and for other new file as well. >> >>> + This file is part of the GNU C Library. >>> + >>> + The GNU C Library is free software; you can redistribute it and/or >>> + modify it under the terms of the GNU Lesser General Public >>> + License as published by the Free Software Foundation; either >>> + version 2.1 of the License, or (at your option) any later version. >>> + >>> + The GNU C Library is distributed in the hope that it will be >>> useful, >>> + but WITHOUT ANY WARRANTY; without even the implied warranty of >>> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >>> + Lesser General Public License for more details. >>> + >>> + You should have received a copy of the GNU Lesser General Public >>> + License along with the GNU C Library; if not, see >>> + <http://www.gnu.org/licenses/>. 
*/ >>> + >>> +#include <assert.h> >>> +#include <string.h> >>> +#include <wchar.h> >>> +#include <ldsodefs.h> >>> +#include <ifunc-impl-list.h> >>> +#include <stdio.h> >>> + >>> +size_t >>> +__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl >>> *array, >>> + size_t max) >>> +{ >>> + >>> + size_t i = max; >>> + >>> + IFUNC_IMPL (i, name, strlen, >>> + IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_LASX, >>> __strlen_lasx) >>> + IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_LSX, __strlen_lsx) >>> + IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_aligned) >>> + ) >>> + return i; >>> +} >>> diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h >>> b/sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h >>> new file mode 100644 >>> index 0000000000..e2b3490f39 >>> --- /dev/null >>> +++ b/sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h >>> @@ -0,0 +1,36 @@ >>> +/* Common definition for strlen implementation. >>> + All versions must be listed in ifunc-impl-list.c. >>> + Copyright (C) 2017-2023 Free Software Foundation, Inc. >>> + This file is part of the GNU C Library. >>> + >>> + The GNU C Library is free software; you can redistribute it and/or >>> + modify it under the terms of the GNU Lesser General Public >>> + License as published by the Free Software Foundation; either >>> + version 2.1 of the License, or (at your option) any later version. >>> + >>> + The GNU C Library is distributed in the hope that it will be >>> useful, >>> + but WITHOUT ANY WARRANTY; without even the implied warranty of >>> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >>> + Lesser General Public License for more details. >>> + >>> + You should have received a copy of the GNU Lesser General Public >>> + License along with the GNU C Library; if not, see >>> + <https://www.gnu.org/licenses/>. 
*/ >>> + >>> +#include <ldsodefs.h> >>> +#include <ifunc-init.h> >>> + >>> +extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden; >>> +extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden; >>> +extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden; >>> + >>> +static inline void * >>> +IFUNC_SELECTOR (void) >>> +{ >>> + if (SUPPORT_LASX) >>> + return OPTIMIZE (lasx); >>> + else if (SUPPORT_LSX) >>> + return OPTIMIZE (lsx); >>> + else >>> + return OPTIMIZE (aligned); >>> +} >>> diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S >>> b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S >>> new file mode 100644 >>> index 0000000000..b379e978a7 >>> --- /dev/null >>> +++ b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S >>> @@ -0,0 +1,101 @@ >>> +/* Copyright (C) 2017-2023 Free Software Foundation, Inc. >>> + >>> + This file is part of the GNU C Library. >>> + >>> + The GNU C Library is free software; you can redistribute it and/or >>> + modify it under the terms of the GNU Lesser General Public >>> + License as published by the Free Software Foundation; either >>> + version 2.1 of the License, or (at your option) any later version. >>> + >>> + The GNU C Library is distributed in the hope that it will be >>> useful, >>> + but WITHOUT ANY WARRANTY; without even the implied warranty of >>> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >>> + Lesser General Public License for more details. >>> + >>> + You should have received a copy of the GNU Lesser General Public >>> + License along with the GNU C Library. If not, see >>> + <https://www.gnu.org/licenses/>. */ >>> + >>> +#include <sysdep.h> >>> +#include <sys/regdef.h> >>> +#include <sys/asm.h> >>> + >>> +#if IS_IN (libc) >>> +# define STRLEN __strlen_aligned >>> +#else >>> +# define STRLEN strlen >>> +#endif >> Is this really an improvement over the generic implementation? It >> seems to >> use a quite similar strategy. 
>> >>> + >>> +LEAF(STRLEN, 6) >>> + move a1, a0 >>> + bstrins.d a0, zero, 2, 0 >>> + lu12i.w a2, 0x01010 >>> + li.w t0, -1 >>> + >>> + ld.d t2, a0, 0 >>> + andi t1, a1, 0x7 >>> + ori a2, a2, 0x101 >>> + slli.d t1, t1, 3 >>> + >>> + bstrins.d a2, a2, 63, 32 >>> + sll.d t1, t0, t1 >>> + slli.d t3, a2, 7 >>> + nor a3, zero, t3 >>> + >>> + orn t2, t2, t1 >>> + sub.d t0, t2, a2 >>> + nor t1, t2, a3 >>> + and t0, t0, t1 >>> + >>> + >>> + bnez t0, L(count_pos) >>> + addi.d a0, a0, 8 >>> +L(loop_16_7bit): >>> + ld.d t2, a0, 0 >>> + sub.d t1, t2, a2 >>> + >>> + and t0, t1, t3 >>> + bnez t0, L(more_check) >>> + ld.d t2, a0, 8 >>> + sub.d t1, t2, a2 >>> + >>> + and t0, t1, t3 >>> + addi.d a0, a0, 16 >>> + beqz t0, L(loop_16_7bit) >>> + addi.d a0, a0, -8 >>> + >>> +L(more_check): >>> + nor t0, t2, a3 >>> + and t0, t1, t0 >>> + bnez t0, L(count_pos) >>> + addi.d a0, a0, 8 >>> + >>> + >>> +L(loop_16_8bit): >>> + ld.d t2, a0, 0 >>> + sub.d t1, t2, a2 >>> + nor t0, t2, a3 >>> + and t0, t0, t1 >>> + >>> + bnez t0, L(count_pos) >>> + ld.d t2, a0, 8 >>> + addi.d a0, a0, 16 >>> + sub.d t1, t2, a2 >>> + >>> + nor t0, t2, a3 >>> + and t0, t0, t1 >>> + beqz t0, L(loop_16_8bit) >>> + addi.d a0, a0, -8 >>> + >>> +L(count_pos): >>> + ctz.d t1, t0 >>> + sub.d a0, a0, a1 >>> + srli.d t1, t1, 3 >>> + add.d a0, a0, t1 >>> + >>> + jr ra >>> +END(STRLEN) >>> + >>> +#ifdef _LIBC >>> +libc_hidden_builtin_def (STRLEN) >>> +#endif >>> diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S >>> b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S >>> new file mode 100644 >>> index 0000000000..56ac6403d3 >>> --- /dev/null >>> +++ b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S >>> @@ -0,0 +1,65 @@ >> Missing one line comment. >> >>> +/* Copyright (C) 2017-2023 Free Software Foundation, Inc. >>> + >>> + This file is part of the GNU C Library. 
>>> + >>> + The GNU C Library is free software; you can redistribute it and/or >>> + modify it under the terms of the GNU Lesser General Public >>> + License as published by the Free Software Foundation; either >>> + version 2.1 of the License, or (at your option) any later version. >>> + >>> + The GNU C Library is distributed in the hope that it will be >>> useful, >>> + but WITHOUT ANY WARRANTY; without even the implied warranty of >>> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >>> + Lesser General Public License for more details. >>> + >>> + You should have received a copy of the GNU Lesser General Public >>> + License along with the GNU C Library. If not, see >>> + <https://www.gnu.org/licenses/>. */ >>> + >>> +#include <sysdep.h> >>> +#include <sys/regdef.h> >>> +#include <sys/asm.h> >>> + >>> +#if IS_IN (libc) >>> + >>> +# define STRLEN __strlen_lasx >>> + >>> +LEAF(STRLEN, 6) >>> + move a1, a0 >>> + bstrins.d a0, zero, 4, 0 >>> + li.d t1, -1 >>> + xvld xr0, a0, 0 >>> + >>> + xvmsknz.b xr0, xr0 >>> + xvpickve.w xr1, xr0, 4 >>> + vilvl.h vr0, vr1, vr0 >>> + movfr2gr.s t0, fa0 # sign extend >>> + >>> + sra.w t0, t0, a1 >>> + beq t0, t1, L(loop) >>> + cto.w a0, t0 >>> + jr ra >>> + >>> +L(loop): >>> + xvld xr0, a0, 32 >>> + addi.d a0, a0, 32 >>> + xvsetanyeqz.b fcc0, xr0 >>> + bceqz fcc0, L(loop) >>> + >>> + >>> + xvmsknz.b xr0, xr0 >>> + sub.d a0, a0, a1 >>> + xvpickve.w xr1, xr0, 4 >>> + vilvl.h vr0, vr1, vr0 >>> + >>> + movfr2gr.s t0, fa0 >>> + cto.w t0, t0 >>> + add.d a0, a0, t0 >>> + jr ra >>> +END(STRLEN) >>> + >>> +#ifdef _LIBC >>> +libc_hidden_builtin_def (STRLEN) >>> +#endif >>> + >>> +#endif >> This implementation fails to assembler with binutils 2.40.0.20230525: >> >> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:31: Error: no match >> insn: xvld $xr0,$r4,0 >> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:33: Error: no match >> insn: xvmsknz.b $xr0,$xr0 >> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:34: Error: no 
match >> insn: xvpickve.w $xr1,$xr0,4 >> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:35: Error: no match >> insn: vilvl.h $vr0,$vr1,$vr0 >> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:44: Error: no match >> insn: xvld $xr0,$r4,32 >> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:46: Error: no match >> insn: xvsetanyeqz.b $fcc0,$xr0 >> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:50: Error: no match >> insn: xvmsknz.b $xr0,$xr0 >> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:52: Error: no match >> insn: xvpickve.w $xr1,$xr0,4 >> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:53: Error: no match >> insn: vilvl.h $vr0,$vr1,$vr0 >> >> You need to either add a configure option to increase the minimum >> required >> binutils or add a macro to synthetize the instruction on older binutils >> (similar to what sysdeps/powerpc/powerpc64/le/power9/strncmp.S does). >> >> >>> diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S >>> b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S >>> new file mode 100644 >>> index 0000000000..1c19c98b5b >>> --- /dev/null >>> +++ b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S >>> @@ -0,0 +1,73 @@ >>> +/* Copyright (C) 2017-2023 Free Software Foundation, Inc. >>> + >>> + This file is part of the GNU C Library. >>> + >>> + The GNU C Library is free software; you can redistribute it and/or >>> + modify it under the terms of the GNU Lesser General Public >>> + License as published by the Free Software Foundation; either >>> + version 2.1 of the License, or (at your option) any later version. >>> + >>> + The GNU C Library is distributed in the hope that it will be >>> useful, >>> + but WITHOUT ANY WARRANTY; without even the implied warranty of >>> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >>> + Lesser General Public License for more details. >>> + >>> + You should have received a copy of the GNU Lesser General Public >>> + License along with the GNU C Library. 
If not, see >>> + <https://www.gnu.org/licenses/>. */ >>> + >>> +#include <sysdep.h> >>> +#include <sys/regdef.h> >>> +#include <sys/asm.h> >>> + >>> +#if IS_IN (libc) >>> + >>> +# define STRLEN __strlen_lsx >>> + >>> +LEAF(STRLEN, 6) >>> + move a1, a0 >>> + bstrins.d a0, zero, 4, 0 >>> + vld vr0, a0, 0 >>> + vld vr1, a0, 16 >>> + >>> + li.d t1, -1 >>> + vmsknz.b vr0, vr0 >>> + vmsknz.b vr1, vr1 >>> + vilvl.h vr0, vr1, vr0 >>> + >>> + movfr2gr.s t0, fa0 >>> + sra.w t0, t0, a1 >>> + beq t0, t1, L(loop) >>> + cto.w a0, t0 >>> + >>> + jr ra >>> + nop >>> + nop >>> + nop >>> + >>> + >>> +L(loop): >>> + vld vr0, a0, 32 >>> + vld vr1, a0, 48 >>> + addi.d a0, a0, 32 >>> + vmin.bu vr2, vr0, vr1 >>> + >>> + vsetanyeqz.b fcc0, vr2 >>> + bceqz fcc0, L(loop) >>> + vmsknz.b vr0, vr0 >>> + vmsknz.b vr1, vr1 >>> + >>> + vilvl.h vr0, vr1, vr0 >>> + sub.d a0, a0, a1 >>> + movfr2gr.s t0, fa0 >>> + cto.w t0, t0 >>> + >>> + add.d a0, a0, t0 >>> + jr ra >>> +END(STRLEN) >>> + >>> +#ifdef _LIBC >>> +libc_hidden_builtin_def (STRLEN) >>> +#endif >>> + >>> +#endif >> This implementation fails to assembler with binutils 2.40.0.20230525: >> >> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S: Assembler messages: >> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:30: Error: no match >> insn: vld $vr0,$r4,0 >> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:31: Error: no match >> insn: vld $vr1,$r4,16 >> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:34: Error: no match >> insn: vmsknz.b $vr0,$vr0 >> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:35: Error: no match >> insn: vmsknz.b $vr1,$vr1 >> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:36: Error: no match >> insn: vilvl.h $vr0,$vr1,$vr0 >> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:50: Error: no match >> insn: vld $vr0,$r4,32 >> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:51: Error: no match >> insn: vld $vr1,$r4,48 >> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:53: Error: no match >> insn: vmin.bu $vr2,$vr0,$vr1 
>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:55: Error: no match >> insn: vsetanyeqz.b $fcc0,$vr2 >> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:57: Error: no match >> insn: vmsknz.b $vr0,$vr0 >> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:58: Error: no match >> insn: vmsknz.b $vr1,$vr1 >> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:60: Error: no match >> insn: vilvl.h $vr0,$vr1,$vr0 >> >>> diff --git a/sysdeps/loongarch/lp64/multiarch/strlen.c >>> b/sysdeps/loongarch/lp64/multiarch/strlen.c >>> new file mode 100644 >>> index 0000000000..416ed0d9e2 >>> --- /dev/null >>> +++ b/sysdeps/loongarch/lp64/multiarch/strlen.c >>> @@ -0,0 +1,37 @@ >>> +/* Multiple versions of strlen. >>> + All versions must be listed in ifunc-impl-list.c. >>> + Copyright (C) 2017-2023 Free Software Foundation, Inc. >>> + This file is part of the GNU C Library. >>> + >>> + The GNU C Library is free software; you can redistribute it and/or >>> + modify it under the terms of the GNU Lesser General Public >>> + License as published by the Free Software Foundation; either >>> + version 2.1 of the License, or (at your option) any later version. >>> + >>> + The GNU C Library is distributed in the hope that it will be >>> useful, >>> + but WITHOUT ANY WARRANTY; without even the implied warranty of >>> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >>> + Lesser General Public License for more details. >>> + >>> + You should have received a copy of the GNU Lesser General Public >>> + License along with the GNU C Library; if not, see >>> + <https://www.gnu.org/licenses/>. */ >>> + >>> +/* Define multiple versions only for the definition in libc. 
*/ >>> + >>> +#if IS_IN (libc) >>> +# define strlen __redirect_strlen >>> +# include <string.h> >>> +# undef strlen >>> + >>> +# define SYMBOL_NAME strlen >>> +# include "ifunc-strlen.h" >>> + >>> +libc_ifunc_redirected (__redirect_strlen, strlen, IFUNC_SELECTOR ()); >>> + >>> +# ifdef SHARED >>> +__hidden_ver1 (strlen, __GI_strlen, __redirect_strlen) >>> + __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlen); >>> +# endif >>> + >>> +#endif >>> diff --git a/sysdeps/loongarch/sys/regdef.h >>> b/sysdeps/loongarch/sys/regdef.h >>> index 5100f36d24..524d2e3277 100644 >>> --- a/sysdeps/loongarch/sys/regdef.h >>> +++ b/sysdeps/loongarch/sys/regdef.h >>> @@ -89,6 +89,14 @@ >>> #define fs5 $f29 >>> #define fs6 $f30 >>> #define fs7 $f31 >>> +#define fcc0 $fcc0 >>> +#define fcc1 $fcc1 >>> +#define fcc2 $fcc2 >>> +#define fcc3 $fcc3 >>> +#define fcc4 $fcc4 >>> +#define fcc5 $fcc5 >>> +#define fcc6 $fcc6 >>> +#define fcc7 $fcc7 >>> #define vr0 $vr0 >>> #define vr1 $vr1 >>> @@ -98,6 +106,30 @@ >>> #define vr5 $vr5 >>> #define vr6 $vr6 >>> #define vr7 $vr7 >>> +#define vr8 $vr8 >>> +#define vr9 $vr9 >>> +#define vr10 $vr10 >>> +#define vr11 $vr11 >>> +#define vr12 $vr12 >>> +#define vr13 $vr13 >>> +#define vr14 $vr14 >>> +#define vr15 $vr15 >>> +#define vr16 $vr16 >>> +#define vr17 $vr17 >>> +#define vr18 $vr18 >>> +#define vr19 $vr19 >>> +#define vr20 $vr20 >>> +#define vr21 $vr21 >>> +#define vr22 $vr22 >>> +#define vr23 $vr23 >>> +#define vr24 $vr24 >>> +#define vr25 $vr25 >>> +#define vr26 $vr26 >>> +#define vr27 $vr27 >>> +#define vr28 $vr28 >>> +#define vr29 $vr29 >>> +#define vr30 $vr30 >>> +#define vr31 $vr31 >>> #define xr0 $xr0 >>> #define xr1 $xr1 >>> @@ -107,5 +139,30 @@ >>> #define xr5 $xr5 >>> #define xr6 $xr6 >>> #define xr7 $xr7 >>> +#define xr7 $xr7 >>> +#define xr8 $xr8 >>> +#define xr9 $xr9 >>> +#define xr10 $xr10 >>> +#define xr11 $xr11 >>> +#define xr12 $xr12 >>> +#define xr13 $xr13 >>> +#define xr14 $xr14 >>> +#define xr15 $xr15 >>> +#define 
xr16 $xr16 >>> +#define xr17 $xr17 >>> +#define xr18 $xr18 >>> +#define xr19 $xr19 >>> +#define xr20 $xr20 >>> +#define xr21 $xr21 >>> +#define xr22 $xr22 >>> +#define xr23 $xr23 >>> +#define xr24 $xr24 >>> +#define xr25 $xr25 >>> +#define xr26 $xr26 >>> +#define xr27 $xr27 >>> +#define xr28 $xr28 >>> +#define xr29 $xr29 >>> +#define xr30 $xr30 >>> +#define xr31 $xr31 >>> #endif /* _SYS_REGDEF_H */ >>> diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h >>> b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h >>> index e371e13b15..d1a280a5ee 100644 >>> --- a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h >>> +++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h >>> @@ -25,5 +25,7 @@ >>> #define SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX) >>> #define SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX) >>> +#define INIT_ARCH() >>> + >>> #endif /* _CPU_FEATURES_LOONGARCH64_H */
On 2023-08-01 22:44, Xi Ruoyao wrote: > On Tue, 2023-08-01 at 15:09 +0800, dengjianbo wrote: > > /* snip */ > >> diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile >> b/sysdeps/loongarch/lp64/multiarch/Makefile >> new file mode 100644 >> index 0000000000..529a8b6bab >> --- /dev/null >> +++ b/sysdeps/loongarch/lp64/multiarch/Makefile >> @@ -0,0 +1,3 @@ >> +ifeq ($(subdir),string) >> +sysdep_routines += strlen-aligned strlen-lsx strlen-lasx >> +endif > Please check if the assembler supports LSX/LASX, if not you should not > add strlen-lsx and strlen-lasx here. We don't want to disallow building > Glibc for LoongArch with old assembler. > A new configuration variable, loongarch_vec_asm, has been added in patch v2. During configuration, it checks whether the assembler supports LSX/LASX and sets the corresponding value. The Makefile then checks this variable to decide whether the strlen LASX/LSX code can be compiled. +LIBC_CONFIG_VAR([loongarch_vec_asm], [$libc_cv_loongarch_vec_asm]) +++ b/sysdeps/loongarch/lp64/multiarch/Makefile @@ -0,0 +1,11 @@ +ifeq ($(subdir),string) +sysdep_routines += strlen-aligned \ + # sysdep_routines + +ifeq ($(loongarch_vec_asm), yes) +sysdep_routines += strlen-lsx \ + strlen-lasx \ + # sysdep_routines +endif + +endif For details, please see: https://sourceware.org/pipermail/libc-alpha/2023-August/150566.html
On 02/08/23 09:25, dengjianbo wrote: >>>> >>>> On 2023-08-02 10:31, Adhemerval Zanella Netto wrote: >>>> +#if IS_IN (libc) >>>> +# define STRLEN __strlen_aligned >>>> +#else >>>> +# define STRLEN strlen >>>> +#endif >>> Is this really an improvement over the generic implementation? It seems to >>> use a quite similar strategy. > Comparing with the code generated by compiler, the assembly code does an 16bytes loop > unrolling, and handles ascii data and non-ascii data separately which could take less > instructions to calculate the length of ascii data. besides, the assembly code using > fewer instructions to start the loop. I think the performance improvement benefits from > this. Please kindly check bench result also from: > https://github.com/jiadengx/glibc_test/blob/main/strlen/bench-strlen.out From the summarized results [1], it seems that the initial start to mask off unaligned inputs is slightly better. The __strlen_aligned only seems better for sizes larger than 32 (the results for length 16 seem strange). Maybe you could improve shift_find/find_zero_all/index_first on loongarch. Does it improve by explicitly instructing the compiler to unroll the loop? 
diff --git a/sysdeps/loongarch/Makefile b/sysdeps/loongarch/Makefile index 43d2f583cd..d807a5e0d2 100644 --- a/sysdeps/loongarch/Makefile +++ b/sysdeps/loongarch/Makefile @@ -15,3 +15,7 @@ ASFLAGS-.os += $(pic-ccflag) ifeq (yes,$(have-cmodel-medium)) CFLAGS-.oS += -mcmodel=medium endif + +ifeq ($(subdir),string) +CFLAGS-strlen.c += -funroll-all-loops --param max-variable-expansions-in-unroller=2 +endif [1] https://github.com/jiadengx/glibc_test/blob/main/strlen/strlen_align.png >>> This implementation fails to assembler with binutils 2.40.0.20230525: >>> >>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:31: Error: no match insn: xvld $xr0,$r4,0 >>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:33: Error: no match insn: xvmsknz.b $xr0,$xr0 >>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:34: Error: no match insn: xvpickve.w $xr1,$xr0,4 >>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:35: Error: no match insn: vilvl.h $vr0,$vr1,$vr0 >>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:44: Error: no match insn: xvld $xr0,$r4,32 >>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:46: Error: no match insn: xvsetanyeqz.b $fcc0,$xr0 >>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:50: Error: no match insn: xvmsknz.b $xr0,$xr0 >>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:52: Error: no match insn: xvpickve.w $xr1,$xr0,4 >>> ../sysdeps/loongarch/lp64/multiarch/strlen-lasx.S:53: Error: no match insn: vilvl.h $vr0,$vr1,$vr0 >>> >>> You need to either add a configure option to increase the minimum required >>> binutils or add a macro to synthetize the instruction on older binutils >>> (similar to what sysdeps/powerpc/powerpc64/le/power9/strncmp.S does). > Configuration variable loongarch_vec_asm has been added in patch v2, when doing the configuration, > it will check if the assembler supports LSX/LASX and decides whether strlen LSX/LASX code get compiled. 
> > diff --git a/sysdeps/loongarch/configure.ac b/sysdeps/loongarch/configure.ac > index 39efccfd8f..9fadf7bb9d 100644 > --- a/sysdeps/loongarch/configure.ac > +++ b/sysdeps/loongarch/configure.ac > @@ -74,6 +74,8 @@ else > libc_cv_loongarch_vec_asm=no > fi > rm -f conftest*]) > +LIBC_CONFIG_VAR([loongarch_vec_asm], [$libc_cv_loongarch_vec_asm]) > + > if test $libc_cv_loongarch_vec_asm = yes; then > AC_DEFINE(HAVE_LOONGARCH_VEC_ASM) > fi > diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile > new file mode 100644 > index 0000000000..73b7f61969 > --- /dev/null > +++ b/sysdeps/loongarch/lp64/multiarch/Makefile > @@ -0,0 +1,11 @@ > +ifeq ($(subdir),string) > +sysdep_routines += strlen-aligned \ > + # sysdep_routines > + > +ifeq ($(loongarch_vec_asm), yes) > +sysdep_routines += strlen-lsx \ > + strlen-lasx \ > + # sysdep_routines > +endif > + > +endif > > Detailed info can be find from: > https://sourceware.org/pipermail/libc-alpha/2023-August/150566.html >>> This implementation fails to assembler with binutils 2.40.0.20230525: >>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S: Assembler messages: >>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:30: Error: no match insn: vld $vr0,$r4,0 >>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:31: Error: no match insn: vld $vr1,$r4,16 >>> > Sorry, it's my mistake for the wrong version of binutils. Could you please try the latest release > version 2.41? Although it should work, it is unexpected that depending of the assembler used some optimized routines are not enabled. > > Following issues is also fixed in the patch v2: > 1. Missing one line comment. > 2. I think it should be only 2023 here and for other new file as well. (Copyright)
On 2023-08-02 20:59, Adhemerval Zanella Netto wrote: >>>>> On 2023-08-02 10:31, Adhemerval Zanella Netto wrote: >>>>> +#if IS_IN (libc) >>>>> +# define STRLEN __strlen_aligned >>>>> +#else >>>>> +# define STRLEN strlen >>>>> +#endif >>>> Is this really an improvement over the generic implementation? It seems to >>>> use a quite similar strategy. >> Comparing with the code generated by compiler, the assembly code does an 16bytes loop >> unrolling, and handles ascii data and non-ascii data separately which could take less >> instructions to calculate the length of ascii data. besides, the assembly code using >> fewer instructions to start the loop. I think the performance improvement benefits from >> this. Please kindly check bench result also from: >> https://github.com/jiadengx/glibc_test/blob/main/strlen/bench-strlen.out > From the summarized results [1], it seems that the initial start to mask > off unaligned inputs are slight better. The __strlen_aligned onl seems > better to sizes larger than 32 (the 16 lenght results seems strange). > Maybe you coult improve shift_find/find_zero_all/index_first on loongarch. > > Does it improve by explicit instructing compiler to unroll the loop? As you know, the assembly versions of strlen uses the same strategy to calculate string length, if assembly code only calculate 8 bytes in the loop and don't separate ascii and non-ascii data, the code of loop and loop end part should be the same as the compiler generated code base on generic strlen. Loongarch doesn't provide instructions like alpha cmpbge, so there is no much optimizations could be done on find_zero_all/index_first/has_zero except we can remove some BIG_ENDIAN codes. Refer to the latest test results in the chart: The assembly implementation vs. 
generic strlen implementation (compiled using CFLAGS-strlen.c += -funroll-all-loops --param max-variable-expansions-in-unroller=2), the performance improvement of the assembly implementation is evident (30% ~ 40%), especially in cases when the length is greater than 64 bytes. Please kindly see the results via: https://github.com/jiadengx/glibc_test/blob/main/strlen2/bench1/generic_strlen_with_loop_unrolling.png >>>> This implementation fails to assembler with binutils 2.40.0.20230525: >>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S: Assembler messages: >>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:30: Error: no match insn: vld $vr0,$r4,0 >>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:31: Error: no match insn: vld $vr1,$r4,16 >>>> >> Sorry, it's my mistake for the wrong version of binutils. Could you please try the latest release >> version 2.41? > Although it should work, it is unexpected that depending of the assembler used > some optimized routines are not enabled. In patch v2, a new configuration variable has been added to control whether the LASX/LSX versions are compiled, depending on whether the assembler supports LASX/LSX, so glibc can still be built with old versions of binutils.
On 03/08/23 10:27, dengjianbo wrote: > > On 2023-08-02 20:59, Adhemerval Zanella Netto wrote: >>>>>> On 2023-08-02 10:31, Adhemerval Zanella Netto wrote: >>>>>> +#if IS_IN (libc) >>>>>> +# define STRLEN __strlen_aligned >>>>>> +#else >>>>>> +# define STRLEN strlen >>>>>> +#endif >>>>> Is this really an improvement over the generic implementation? It seems to >>>>> use a quite similar strategy. >>> Comparing with the code generated by compiler, the assembly code does an 16bytes loop >>> unrolling, and handles ascii data and non-ascii data separately which could take less >>> instructions to calculate the length of ascii data. besides, the assembly code using >>> fewer instructions to start the loop. I think the performance improvement benefits from >>> this. Please kindly check bench result also from: >>> https://github.com/jiadengx/glibc_test/blob/main/strlen/bench-strlen.out >> From the summarized results [1], it seems that the initial start to mask >> off unaligned inputs are slight better. The __strlen_aligned onl seems >> better to sizes larger than 32 (the 16 lenght results seems strange). >> Maybe you coult improve shift_find/find_zero_all/index_first on loongarch. >> >> Does it improve by explicit instructing compiler to unroll the loop? > As you know, the assembly versions of strlen uses the same strategy to > calculate string length, if assembly code only calculate 8 bytes in the > loop and don't separate ascii and non-ascii data, the code of loop and > loop end part should be the same as the compiler generated code base on > generic strlen. Loongarch doesn't provide instructions like alpha > cmpbge, so there is no much optimizations could be done on > find_zero_all/index_first/has_zero except we can remove some BIG_ENDIAN > codes. > > Refer to the latest test results in the chart: The assembly > implementation vs. 
generic strlen implementation(compiled by using > CFLAGS-strlen.c += -funroll-all-loops --param > max-variable-expandsions-in-unroller=2) the performance > improvement of the assembly implementation is evident(30% ~ 40%), > especially in cases when the length is greater than 64 bytes. > Please kindly see the results via: > https://github.com/jiadengx/glibc_test/blob/main/strlen2/bench1/generic_strlen_with_loop_unrolling.png So maybe use the generic implementation plus the compiler flags to loop unrolling instead of asm optimization? >>>>> This implementation fails to assembler with binutils 2.40.0.20230525: >>>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S: Assembler messages: >>>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:30: Error: no match insn: vld $vr0,$r4,0 >>>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:31: Error: no match insn: vld $vr1,$r4,16 >>>>> >>> Sorry, it's my mistake for the wrong version of binutils. Could you please try the latest release >>> version 2.41? >> Although it should work, it is unexpected that depending of the assembler used >> some optimized routines are not enabled. > > In patch v2, an new configuration variable has been added to control > whether the LASX/LSX will be compiled according to assembler support > LASX/LSX or not, so it can be compiled with old versions of binutils. Yes I am aware and this seems odd, albeit not really wrong. It means that you will get less code coverage and optimizations depending of the used binutils. I would advise to follow what other architecture did to provide arch-specific optimization, which is either setup a minimum gcc/binutils version (for instance aarch64 libmvec), or encode the instructions in a binutils neutral mode (as the powerpc implementation I pointed out).
On Thu, 2023-08-03 at 10:48 -0300, Adhemerval Zanella Netto wrote: > On 03/08/23 10:27, dengjianbo wrote: > > On 2023-08-02 20:59, Adhemerval Zanella Netto wrote: > > > > > > > On 2023-08-02 10:31, Adhemerval Zanella Netto wrote: > > > > > > > +#if IS_IN (libc) > > > > > > > +# define STRLEN __strlen_aligned > > > > > > > +#else > > > > > > > +# define STRLEN strlen > > > > > > > +#endif > > > > > > Is this really an improvement over the generic implementation? It seems to > > > > > > use a quite similar strategy. > > > > Comparing with the code generated by compiler, the assembly code does an 16bytes loop > > > > unrolling, and handles ascii data and non-ascii data separately which could take less > > > > instructions to calculate the length of ascii data. besides, the assembly code using > > > > fewer instructions to start the loop. I think the performance improvement benefits from > > > > this. Please kindly check bench result also from: > > > > https://github.com/jiadengx/glibc_test/blob/main/strlen/bench-strlen.out > > > From the summarized results [1], it seems that the initial start to mask > > > off unaligned inputs are slight better. The __strlen_aligned onl seems > > > better to sizes larger than 32 (the 16 lenght results seems strange). > > > Maybe you coult improve shift_find/find_zero_all/index_first on loongarch. > > > > > > Does it improve by explicit instructing compiler to unroll the loop? > > As you know, the assembly versions of strlen uses the same strategy to > > calculate string length, if assembly code only calculate 8 bytes in the > > loop and don't separate ascii and non-ascii data, the code of loop and > > loop end part should be the same as the compiler generated code base on > > generic strlen. Loongarch doesn't provide instructions like alpha > > cmpbge, so there is no much optimizations could be done on > > find_zero_all/index_first/has_zero except we can remove some BIG_ENDIAN > > codes. 
Removing them will not make any difference because the compiler will optimized the BIG_ENDIAN paths away. > > Refer to the latest test results in the chart: The assembly > > implementation vs. generic strlen implementation(compiled by using > > CFLAGS-strlen.c += -funroll-all-loops --param > > max-variable-expandsions-in-unroller=2) the performance > > improvement of the assembly implementation is evident(30% ~ 40%), > > especially in cases when the length is greater than 64 bytes. > > Please kindly see the results via: > > https://github.com/jiadengx/glibc_test/blob/main/strlen2/bench1/generic_strlen_with_loop_unrolling.png > > So maybe use the generic implementation plus the compiler flags to loop > unrolling instead of asm optimization? This is strange... I remember I'd attempted to add #pragma GCC unroll for the main loop of strlen and I observed no performance gain on my Loongson-3A5000-HV, at all. Maybe a different test environment (hardware, compiler version, or something)? > > > > > > This implementation fails to assembler with binutils 2.40.0.20230525: > > > > > > ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S: Assembler messages: > > > > > > ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:30: Error: no match insn: vld $vr0,$r4,0 > > > > > > ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:31: Error: no match insn: vld $vr1,$r4,16 > > > > > > > > > > Sorry, it's my mistake for the wrong version of binutils. Could you please try the latest release > > > > version 2.41? > > > Although it should work, it is unexpected that depending of the assembler used > > > some optimized routines are not enabled. > > > > In patch v2, an new configuration variable has been added to control > > whether the LASX/LSX will be compiled according to assembler support > > LASX/LSX or not, so it can be compiled with old versions of binutils. > > Yes I am aware and this seems odd, albeit not really wrong. 
It means that > you will get less code coverage and optimizations depending of the used > binutils. > > I would advise to follow what other architecture did to provide arch-specific > optimization, which is either setup a minimum gcc/binutils version (for > instance aarch64 libmvec), or encode the instructions in a binutils neutral > mode (as the powerpc implementation I pointed out). Hmm, this policy seems different from $OTHER_PROJECTS.
On Thu, 2023-08-03 at 22:53 +0800, Xi Ruoyao wrote: > > > > I would advise to follow what other architecture did to provide > > arch-specific > > optimization, which is either setup a minimum gcc/binutils version > > (for > > instance aarch64 libmvec), or encode the instructions in a binutils > > neutral > > mode (as the powerpc implementation I pointed out). > > Hmm, this policy seems different from $OTHER_PROJECTS. BTW I guess we should start to document some general rules about machine-specific optimizations in https://sourceware.org/glibc/wiki/Consensus or somewhere.
On 03/08/23 11:59, Xi Ruoyao wrote: > On Thu, 2023-08-03 at 22:53 +0800, Xi Ruoyao wrote: >>> >>> I would advise to follow what other architecture did to provide >>> arch-specific >>> optimization, which is either setup a minimum gcc/binutils version >>> (for >>> instance aarch64 libmvec), or encode the instructions in a binutils >>> neutral >>> mode (as the powerpc implementation I pointed out). >> >> Hmm, this policy seems different from $OTHER_PROJECTS. We don't have a strict policy regarding it, but I think having fewer configuration permutations to test helps maintainability. For instance, with --enable-fortify-source I had to test 2/3 *times* more build permutations to see whether every architecture built with all supported gcc versions. > > BTW I guess we should start to document some general rules about > machine-specific optimizations in > https://sourceware.org/glibc/wiki/Consensus or somewhere. > In fact, my understanding is that each arch maintainer may define how to proceed in this regard.
..... >>> Refer to the latest test results in the chart: The assembly >>> implementation vs. generic strlen implementation(compiled by using >>> CFLAGS-strlen.c += -funroll-all-loops --param >>> max-variable-expandsions-in-unroller=2) the performance >>> improvement of the assembly implementation is evident(30% ~ 40%), >>> especially in cases when the length is greater than 64 bytes. >>> Please kindly see the results via: >>> https://github.com/jiadengx/glibc_test/blob/main/strlen2/bench1/generic_strlen_with_loop_unrolling.png >> So maybe use the generic implementation plus the compiler flags to loop >> unrolling instead of asm optimization? > This is strange... I remember I'd attempted to add #pragma GCC unroll > for the main loop of strlen and I observed no performance gain on my > Loongson-3A5000-HV, at all. Maybe a different test environment > (hardware, compiler version, or something)? The name of his graph is ambiguous. What he means is that our assembly implementation performs better than the generic code implementation (plus the compiler flags for loop unrolling), and our assembly implementation improves performance by 30% to 40%, especially in cases where the length is greater than 64 bytes. https://github.com/jiadengx/glibc_test/blob/main/strlen2/bench1/generic_strlen_with_loop_unrolling.png > >>>>>>> This implementation fails to assembler with binutils 2.40.0.20230525: >>>>>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S: Assembler messages: >>>>>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:30: Error: no match insn: vld $vr0,$r4,0 >>>>>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:31: Error: no match insn: vld $vr1,$r4,16 >>>>>>> >>>>> Sorry, it's my mistake for the wrong version of binutils. Could you please try the latest release >>>>> version 2.41? >>>> Although it should work, it is unexpected that depending of the assembler used >>>> some optimized routines are not enabled. 
>>> In patch v2, an new configuration variable has been added to control >>> whether the LASX/LSX will be compiled according to assembler support >>> LASX/LSX or not, so it can be compiled with old versions of binutils. >> Yes I am aware and this seems odd, albeit not really wrong. It means that >> you will get less code coverage and optimizations depending of the used >> binutils. >> >> I would advise to follow what other architecture did to provide arch-specific >> optimization, which is either setup a minimum gcc/binutils version (for >> instance aarch64 libmvec), or encode the instructions in a binutils neutral >> mode (as the powerpc implementation I pointed out). > Hmm, this policy seems different from $OTHER_PROJECTS. I prefer the first plan: setting a minimum version limit for gcc/binutils. >
On 2023-08-03 21:48, Adhemerval Zanella Netto wrote: >>>>>> This implementation fails to assembler with binutils 2.40.0.20230525: >>>>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S: Assembler messages: >>>>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:30: Error: no match insn: vld $vr0,$r4,0 >>>>>> ../sysdeps/loongarch/lp64/multiarch/strlen-lsx.S:31: Error: no match insn: vld $vr1,$r4,16 >>>>>> >>>> Sorry, it's my mistake for the wrong version of binutils. Could you please try the latest release >>>> version 2.41? >>> Although it should work, it is unexpected that depending of the assembler used >>> some optimized routines are not enabled. >> In patch v2, an new configuration variable has been added to control >> whether the LASX/LSX will be compiled according to assembler support >> LASX/LSX or not, so it can be compiled with old versions of binutils. > Yes I am aware and this seems odd, albeit not really wrong. It means that > you will get less code coverage and optimizations depending of the used > binutils. > > I would advise to follow what other architecture did to provide arch-specific > optimization, which is either setup a minimum gcc/binutils version (for > instance aarch64 libmvec), or encode the instructions in a binutils neutral > mode (as the powerpc implementation I pointed out). We have set up a minimum binutils version in patch v3; please kindly find it via: https://sourceware.org/pipermail/libc-alpha/2023-August/150670.html
diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile new file mode 100644 index 0000000000..529a8b6bab --- /dev/null +++ b/sysdeps/loongarch/lp64/multiarch/Makefile @@ -0,0 +1,3 @@ +ifeq ($(subdir),string) +sysdep_routines += strlen-aligned strlen-lsx strlen-lasx +endif diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c new file mode 100644 index 0000000000..b35e41127e --- /dev/null +++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c @@ -0,0 +1,39 @@ +/* Enumerate available IFUNC implementations of a function. LoongArch64 version. + Copyright (C) 2017-2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <assert.h> +#include <string.h> +#include <wchar.h> +#include <ldsodefs.h> +#include <ifunc-impl-list.h> +#include <stdio.h> + +size_t +__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + size_t max) +{ + + size_t i = max; + + IFUNC_IMPL (i, name, strlen, + IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_LASX, __strlen_lasx) + IFUNC_IMPL_ADD (array, i, strlen, SUPPORT_LSX, __strlen_lsx) + IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_aligned) + ) + return i; +} diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h b/sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h new file mode 100644 index 0000000000..e2b3490f39 --- /dev/null +++ b/sysdeps/loongarch/lp64/multiarch/ifunc-strlen.h @@ -0,0 +1,36 @@ +/* Common definition for strlen implementation. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <ldsodefs.h> +#include <ifunc-init.h> + +extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + if (SUPPORT_LASX) + return OPTIMIZE (lasx); + else if (SUPPORT_LSX) + return OPTIMIZE (lsx); + else + return OPTIMIZE (aligned); +} diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S new file mode 100644 index 0000000000..b379e978a7 --- /dev/null +++ b/sysdeps/loongarch/lp64/multiarch/strlen-aligned.S @@ -0,0 +1,101 @@ +/* Copyright (C) 2017-2023 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include <sys/regdef.h> +#include <sys/asm.h> + +#if IS_IN (libc) +# define STRLEN __strlen_aligned +#else +# define STRLEN strlen +#endif + +LEAF(STRLEN, 6) + move a1, a0 + bstrins.d a0, zero, 2, 0 + lu12i.w a2, 0x01010 + li.w t0, -1 + + ld.d t2, a0, 0 + andi t1, a1, 0x7 + ori a2, a2, 0x101 + slli.d t1, t1, 3 + + bstrins.d a2, a2, 63, 32 + sll.d t1, t0, t1 + slli.d t3, a2, 7 + nor a3, zero, t3 + + orn t2, t2, t1 + sub.d t0, t2, a2 + nor t1, t2, a3 + and t0, t0, t1 + + + bnez t0, L(count_pos) + addi.d a0, a0, 8 +L(loop_16_7bit): + ld.d t2, a0, 0 + sub.d t1, t2, a2 + + and t0, t1, t3 + bnez t0, L(more_check) + ld.d t2, a0, 8 + sub.d t1, t2, a2 + + and t0, t1, t3 + addi.d a0, a0, 16 + beqz t0, L(loop_16_7bit) + addi.d a0, a0, -8 + +L(more_check): + nor t0, t2, a3 + and t0, t1, t0 + bnez t0, L(count_pos) + addi.d a0, a0, 8 + + +L(loop_16_8bit): + ld.d t2, a0, 0 + sub.d t1, t2, a2 + nor t0, t2, a3 + and t0, t0, t1 + + bnez t0, L(count_pos) + ld.d t2, a0, 8 + addi.d a0, a0, 16 + sub.d t1, t2, a2 + + nor t0, t2, a3 + and t0, t0, t1 + beqz t0, L(loop_16_8bit) + addi.d a0, a0, -8 + +L(count_pos): + ctz.d t1, t0 + sub.d a0, a0, a1 + srli.d t1, t1, 3 + add.d a0, a0, t1 + + jr ra +END(STRLEN) + +#ifdef _LIBC +libc_hidden_builtin_def (STRLEN) +#endif diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S new file mode 100644 index 0000000000..56ac6403d3 --- /dev/null +++ b/sysdeps/loongarch/lp64/multiarch/strlen-lasx.S @@ -0,0 +1,65 @@ +/* Copyright (C) 2017-2023 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <sys/regdef.h> +#include <sys/asm.h> + +#if IS_IN (libc) + +# define STRLEN __strlen_lasx + +LEAF(STRLEN, 6) + move a1, a0 + bstrins.d a0, zero, 4, 0 + li.d t1, -1 + xvld xr0, a0, 0 + + xvmsknz.b xr0, xr0 + xvpickve.w xr1, xr0, 4 + vilvl.h vr0, vr1, vr0 + movfr2gr.s t0, fa0 # sign extend + + sra.w t0, t0, a1 + beq t0, t1, L(loop) + cto.w a0, t0 + jr ra + +L(loop): + xvld xr0, a0, 32 + addi.d a0, a0, 32 + xvsetanyeqz.b fcc0, xr0 + bceqz fcc0, L(loop) + + + xvmsknz.b xr0, xr0 + sub.d a0, a0, a1 + xvpickve.w xr1, xr0, 4 + vilvl.h vr0, vr1, vr0 + + movfr2gr.s t0, fa0 + cto.w t0, t0 + add.d a0, a0, t0 + jr ra +END(STRLEN) + +#ifdef _LIBC +libc_hidden_builtin_def (STRLEN) +#endif + +#endif diff --git a/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S new file mode 100644 index 0000000000..1c19c98b5b --- /dev/null +++ b/sysdeps/loongarch/lp64/multiarch/strlen-lsx.S @@ -0,0 +1,73 @@ +/* Copyright (C) 2017-2023 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <sys/regdef.h> +#include <sys/asm.h> + +#if IS_IN (libc) + +# define STRLEN __strlen_lsx + +LEAF(STRLEN, 6) + move a1, a0 + bstrins.d a0, zero, 4, 0 + vld vr0, a0, 0 + vld vr1, a0, 16 + + li.d t1, -1 + vmsknz.b vr0, vr0 + vmsknz.b vr1, vr1 + vilvl.h vr0, vr1, vr0 + + movfr2gr.s t0, fa0 + sra.w t0, t0, a1 + beq t0, t1, L(loop) + cto.w a0, t0 + + jr ra + nop + nop + nop + + +L(loop): + vld vr0, a0, 32 + vld vr1, a0, 48 + addi.d a0, a0, 32 + vmin.bu vr2, vr0, vr1 + + vsetanyeqz.b fcc0, vr2 + bceqz fcc0, L(loop) + vmsknz.b vr0, vr0 + vmsknz.b vr1, vr1 + + vilvl.h vr0, vr1, vr0 + sub.d a0, a0, a1 + movfr2gr.s t0, fa0 + cto.w t0, t0 + + add.d a0, a0, t0 + jr ra +END(STRLEN) + +#ifdef _LIBC +libc_hidden_builtin_def (STRLEN) +#endif + +#endif diff --git a/sysdeps/loongarch/lp64/multiarch/strlen.c b/sysdeps/loongarch/lp64/multiarch/strlen.c new file mode 100644 index 0000000000..416ed0d9e2 --- /dev/null +++ b/sysdeps/loongarch/lp64/multiarch/strlen.c @@ -0,0 +1,37 @@ +/* Multiple versions of strlen. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ + +#if IS_IN (libc) +# define strlen __redirect_strlen +# include <string.h> +# undef strlen + +# define SYMBOL_NAME strlen +# include "ifunc-strlen.h" + +libc_ifunc_redirected (__redirect_strlen, strlen, IFUNC_SELECTOR ()); + +# ifdef SHARED +__hidden_ver1 (strlen, __GI_strlen, __redirect_strlen) + __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlen); +# endif + +#endif diff --git a/sysdeps/loongarch/sys/regdef.h b/sysdeps/loongarch/sys/regdef.h index 5100f36d24..524d2e3277 100644 --- a/sysdeps/loongarch/sys/regdef.h +++ b/sysdeps/loongarch/sys/regdef.h @@ -89,6 +89,14 @@ #define fs5 $f29 #define fs6 $f30 #define fs7 $f31 +#define fcc0 $fcc0 +#define fcc1 $fcc1 +#define fcc2 $fcc2 +#define fcc3 $fcc3 +#define fcc4 $fcc4 +#define fcc5 $fcc5 +#define fcc6 $fcc6 +#define fcc7 $fcc7 #define vr0 $vr0 #define vr1 $vr1 @@ -98,6 +106,30 @@ #define vr5 $vr5 #define vr6 $vr6 #define vr7 $vr7 +#define vr8 $vr8 +#define vr9 $vr9 +#define vr10 $vr10 +#define vr11 $vr11 +#define vr12 $vr12 +#define vr13 $vr13 +#define vr14 $vr14 +#define vr15 $vr15 +#define vr16 $vr16 +#define vr17 $vr17 +#define vr18 $vr18 +#define vr19 $vr19 +#define vr20 $vr20 +#define vr21 $vr21 +#define vr22 $vr22 +#define vr23 $vr23 +#define vr24 $vr24 +#define vr25 $vr25 +#define vr26 $vr26 +#define vr27 $vr27 +#define vr28 $vr28 +#define vr29 $vr29 +#define vr30 $vr30 +#define vr31 $vr31 #define xr0 $xr0 #define xr1 $xr1 @@ -107,5 +139,30 @@ #define xr5 $xr5 #define xr6 $xr6 #define xr7 $xr7 +#define xr7 $xr7 +#define xr8 $xr8 +#define xr9 $xr9 +#define xr10 $xr10 +#define xr11 $xr11 +#define xr12 $xr12 +#define xr13 $xr13 +#define xr14 $xr14 +#define xr15 $xr15 +#define xr16 $xr16 +#define xr17 $xr17 +#define xr18 $xr18 +#define xr19 $xr19 +#define xr20 $xr20 
+#define xr21 $xr21 +#define xr22 $xr22 +#define xr23 $xr23 +#define xr24 $xr24 +#define xr25 $xr25 +#define xr26 $xr26 +#define xr27 $xr27 +#define xr28 $xr28 +#define xr29 $xr29 +#define xr30 $xr30 +#define xr31 $xr31 #endif /* _SYS_REGDEF_H */ diff --git a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h index e371e13b15..d1a280a5ee 100644 --- a/sysdeps/unix/sysv/linux/loongarch/cpu-features.h +++ b/sysdeps/unix/sysv/linux/loongarch/cpu-features.h @@ -25,5 +25,7 @@ #define SUPPORT_LSX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LSX) #define SUPPORT_LASX (GLRO (dl_hwcap) & HWCAP_LOONGARCH_LASX) +#define INIT_ARCH() + #endif /* _CPU_FEATURES_LOONGARCH64_H */