Message ID | PAWPR08MB89822C10A1EEF917E99C85FC83AEA@PAWPR08MB8982.eurprd08.prod.outlook.com |
---|---|
State | New |
Headers | show |
Series | [1/3] AArch64: Cleanup emag memset | expand |
On 10/11/23 14:35, Wilco Dijkstra wrote: > > Add a specialized memset for the common ZVA size of 64. Since the code is > identical to __memset_falkor, remove the latter. > > OK for commit? LGTM, thanks. Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org> > > --- > > diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S > index bf3cf85c8a95fd8c03ae13c4173fe507040ee8cd..bbfb7184c3e4277f59178ccf4f9b92814dd7a48d 100644 > --- a/sysdeps/aarch64/memset.S > +++ b/sysdeps/aarch64/memset.S > @@ -101,19 +101,19 @@ L(tail64): > ret > > L(try_zva): > -#ifdef ZVA_MACRO > - zva_macro > -#else > +#ifndef ZVA64_ONLY > .p2align 3 > mrs tmp1, dczid_el0 > tbnz tmp1w, 4, L(no_zva) > and tmp1w, tmp1w, 15 > cmp tmp1w, 4 /* ZVA size is 64 bytes. */ > b.ne L(zva_128) > - > + nop > +#endif > /* Write the first and last 64 byte aligned block using stp rather > than using DC ZVA. This is faster on some cores. > */ > + .p2align 4 > L(zva_64): > str q0, [dst, 16] > stp q0, q0, [dst, 32] > @@ -123,7 +123,6 @@ L(zva_64): > sub count, dstend, dst /* Count is now 128 too large. */ > sub count, count, 128+64+64 /* Adjust count and bias for loop. */ > add dst, dst, 128 > - nop > 1: dc zva, dst > add dst, dst, 64 > subs count, count, 64 > @@ -134,6 +133,7 @@ L(zva_64): > stp q0, q0, [dstend, -32] > ret > > +#ifndef ZVA64_ONLY > .p2align 3 > L(zva_128): > cmp tmp1w, 5 /* ZVA size is 128 bytes. */ Ok. > diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile > index a1a4de3cd93c48db6e47eebc9c111186efca53fb..171ca5e4cf9a87fc7df5896f21c2e5b94ea218ba 100644 > --- a/sysdeps/aarch64/multiarch/Makefile > +++ b/sysdeps/aarch64/multiarch/Makefile > @@ -12,10 +12,10 @@ sysdep_routines += \ > memmove_mops \ > memset_a64fx \ > memset_emag \ > - memset_falkor \ > memset_generic \ > memset_kunpeng \ > memset_mops \ > + memset_zva64 \ > strlen_asimd \ > strlen_generic \ > # sysdep_routines Ok. > diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c > index 3596d3c8d3403b4ea07d80d9a8877e2908a9883e..fdd9ea92463123df213dec27f6f0598f8ce54d6e 100644 > --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c > @@ -54,9 +54,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, memmove, mops, __memmove_mops) > IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic)) > IFUNC_IMPL (i, name, memset, > - /* Enable this on non-falkor processors too so that other cores > - can do a comparative analysis with __memset_generic. */ > - IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor) > + IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_zva64) > IFUNC_IMPL_ADD (array, i, memset, 1, __memset_emag) > IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng) Ok. > #if HAVE_AARCH64_SVE_ASM > diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c > index 9193b197ddc3a647768184a6a639d6635cfea96e..6deb6865e5154f129922dca673cf069f72f46d79 100644 > --- a/sysdeps/aarch64/multiarch/memset.c > +++ b/sysdeps/aarch64/multiarch/memset.c > @@ -28,7 +28,7 @@ > > extern __typeof (__redirect_memset) __libc_memset; > > -extern __typeof (__redirect_memset) __memset_falkor attribute_hidden; > +extern __typeof (__redirect_memset) __memset_zva64 attribute_hidden; > extern __typeof (__redirect_memset) __memset_emag attribute_hidden; > extern __typeof (__redirect_memset) __memset_kunpeng attribute_hidden; > extern __typeof (__redirect_memset) __memset_a64fx attribute_hidden; > @@ -47,18 +47,17 @@ select_memset_ifunc (void) > { > if (IS_A64FX (midr) && zva_size == 256) > return __memset_a64fx; > - return __memset_generic; > } > > if (IS_KUNPENG920 (midr)) > return __memset_kunpeng; > > - if ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64) > - return __memset_falkor; > - > if (IS_EMAG (midr)) > return __memset_emag; > > + if (zva_size == 64) > + return __memset_zva64; > + > return __memset_generic; > } > Ok. > diff --git a/sysdeps/aarch64/multiarch/memset_falkor.S b/sysdeps/aarch64/multiarch/memset_falkor.S > deleted file mode 100644 > index c6946a8072ce60099f9c3da0cf4ca54785e6a520..0000000000000000000000000000000000000000 > --- a/sysdeps/aarch64/multiarch/memset_falkor.S > +++ /dev/null > @@ -1,54 +0,0 @@ > -/* Memset for falkor. > - Copyright (C) 2017-2023 Free Software Foundation, Inc. > - > - This file is part of the GNU C Library. > - > - The GNU C Library is free software; you can redistribute it and/or > - modify it under the terms of the GNU Lesser General Public > - License as published by the Free Software Foundation; either > - version 2.1 of the License, or (at your option) any later version. > - > - The GNU C Library is distributed in the hope that it will be useful, > - but WITHOUT ANY WARRANTY; without even the implied warranty of > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > - Lesser General Public License for more details. > - > - You should have received a copy of the GNU Lesser General Public > - License along with the GNU C Library. If not, see > - <https://www.gnu.org/licenses/>. */ > - > -#include <sysdep.h> > -#include <memset-reg.h> > - > -/* Reading dczid_el0 is expensive on falkor so move it into the ifunc > - resolver and assume ZVA size of 64 bytes. The IFUNC resolver takes care to > - use this function only when ZVA is enabled. */ > - > -#if IS_IN (libc) > -.macro zva_macro > - .p2align 4 > - /* Write the first and last 64 byte aligned block using stp rather > - than using DC ZVA. This is faster on some cores. */ > - str q0, [dst, 16] > - stp q0, q0, [dst, 32] > - bic dst, dst, 63 > - stp q0, q0, [dst, 64] > - stp q0, q0, [dst, 96] > - sub count, dstend, dst /* Count is now 128 too large. */ > - sub count, count, 128+64+64 /* Adjust count and bias for loop. */ > - add dst, dst, 128 > -1: dc zva, dst > - add dst, dst, 64 > - subs count, count, 64 > - b.hi 1b > - stp q0, q0, [dst, 0] > - stp q0, q0, [dst, 32] > - stp q0, q0, [dstend, -64] > - stp q0, q0, [dstend, -32] > - ret > -.endm > - > -# define ZVA_MACRO zva_macro > -# define MEMSET __memset_falkor > -# include <sysdeps/aarch64/memset.S> > -#endif > diff --git a/sysdeps/aarch64/multiarch/memset_zva64.S b/sysdeps/aarch64/multiarch/memset_zva64.S > new file mode 100644 > index 0000000000000000000000000000000000000000..13f45fd3d882c756f18a1679d758e2eb688f9c3d > --- /dev/null > +++ b/sysdeps/aarch64/multiarch/memset_zva64.S > @@ -0,0 +1,27 @@ > +/* Optimized memset for zva size = 64. > + Copyright (C) 2023 Free Software Foundation, Inc. > + > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. If not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <sysdep.h> > + > +#define ZVA64_ONLY 1 > +#define MEMSET __memset_zva64 > +#undef libc_hidden_builtin_def > +#define libc_hidden_builtin_def(X) > + > +#include "../memset.S" > Ok.
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S index bf3cf85c8a95fd8c03ae13c4173fe507040ee8cd..bbfb7184c3e4277f59178ccf4f9b92814dd7a48d 100644 --- a/sysdeps/aarch64/memset.S +++ b/sysdeps/aarch64/memset.S @@ -101,19 +101,19 @@ L(tail64): ret L(try_zva): -#ifdef ZVA_MACRO - zva_macro -#else +#ifndef ZVA64_ONLY .p2align 3 mrs tmp1, dczid_el0 tbnz tmp1w, 4, L(no_zva) and tmp1w, tmp1w, 15 cmp tmp1w, 4 /* ZVA size is 64 bytes. */ b.ne L(zva_128) - + nop +#endif /* Write the first and last 64 byte aligned block using stp rather than using DC ZVA. This is faster on some cores. */ + .p2align 4 L(zva_64): str q0, [dst, 16] stp q0, q0, [dst, 32] @@ -123,7 +123,6 @@ L(zva_64): sub count, dstend, dst /* Count is now 128 too large. */ sub count, count, 128+64+64 /* Adjust count and bias for loop. */ add dst, dst, 128 - nop 1: dc zva, dst add dst, dst, 64 subs count, count, 64 @@ -134,6 +133,7 @@ L(zva_64): stp q0, q0, [dstend, -32] ret +#ifndef ZVA64_ONLY .p2align 3 L(zva_128): cmp tmp1w, 5 /* ZVA size is 128 bytes. */ diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile index a1a4de3cd93c48db6e47eebc9c111186efca53fb..171ca5e4cf9a87fc7df5896f21c2e5b94ea218ba 100644 --- a/sysdeps/aarch64/multiarch/Makefile +++ b/sysdeps/aarch64/multiarch/Makefile @@ -12,10 +12,10 @@ sysdep_routines += \ memmove_mops \ memset_a64fx \ memset_emag \ - memset_falkor \ memset_generic \ memset_kunpeng \ memset_mops \ + memset_zva64 \ strlen_asimd \ strlen_generic \ # sysdep_routines diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c index 3596d3c8d3403b4ea07d80d9a8877e2908a9883e..fdd9ea92463123df213dec27f6f0598f8ce54d6e 100644 --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c @@ -54,9 +54,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memmove, mops, __memmove_mops) IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic)) IFUNC_IMPL (i, name, memset, - /* Enable this on non-falkor processors too so that other cores - can do a comparative analysis with __memset_generic. */ - IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor) + IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_zva64) IFUNC_IMPL_ADD (array, i, memset, 1, __memset_emag) IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng) #if HAVE_AARCH64_SVE_ASM diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c index 9193b197ddc3a647768184a6a639d6635cfea96e..6deb6865e5154f129922dca673cf069f72f46d79 100644 --- a/sysdeps/aarch64/multiarch/memset.c +++ b/sysdeps/aarch64/multiarch/memset.c @@ -28,7 +28,7 @@ extern __typeof (__redirect_memset) __libc_memset; -extern __typeof (__redirect_memset) __memset_falkor attribute_hidden; +extern __typeof (__redirect_memset) __memset_zva64 attribute_hidden; extern __typeof (__redirect_memset) __memset_emag attribute_hidden; extern __typeof (__redirect_memset) __memset_kunpeng attribute_hidden; extern __typeof (__redirect_memset) __memset_a64fx attribute_hidden; @@ -47,18 +47,17 @@ select_memset_ifunc (void) { if (IS_A64FX (midr) && zva_size == 256) return __memset_a64fx; - return __memset_generic; } if (IS_KUNPENG920 (midr)) return __memset_kunpeng; - if ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64) - return __memset_falkor; - if (IS_EMAG (midr)) return __memset_emag; + if (zva_size == 64) + return __memset_zva64; + return __memset_generic; } diff --git a/sysdeps/aarch64/multiarch/memset_falkor.S b/sysdeps/aarch64/multiarch/memset_falkor.S deleted file mode 100644 index c6946a8072ce60099f9c3da0cf4ca54785e6a520..0000000000000000000000000000000000000000 --- a/sysdeps/aarch64/multiarch/memset_falkor.S +++ /dev/null @@ -1,54 +0,0 @@ -/* Memset for falkor. - Copyright (C) 2017-2023 Free Software Foundation, Inc. - - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library. If not, see - <https://www.gnu.org/licenses/>. */ - -#include <sysdep.h> -#include <memset-reg.h> - -/* Reading dczid_el0 is expensive on falkor so move it into the ifunc - resolver and assume ZVA size of 64 bytes. The IFUNC resolver takes care to - use this function only when ZVA is enabled. */ - -#if IS_IN (libc) -.macro zva_macro - .p2align 4 - /* Write the first and last 64 byte aligned block using stp rather - than using DC ZVA. This is faster on some cores. */ - str q0, [dst, 16] - stp q0, q0, [dst, 32] - bic dst, dst, 63 - stp q0, q0, [dst, 64] - stp q0, q0, [dst, 96] - sub count, dstend, dst /* Count is now 128 too large. */ - sub count, count, 128+64+64 /* Adjust count and bias for loop. */ - add dst, dst, 128 -1: dc zva, dst - add dst, dst, 64 - subs count, count, 64 - b.hi 1b - stp q0, q0, [dst, 0] - stp q0, q0, [dst, 32] - stp q0, q0, [dstend, -64] - stp q0, q0, [dstend, -32] - ret -.endm - -# define ZVA_MACRO zva_macro -# define MEMSET __memset_falkor -# include <sysdeps/aarch64/memset.S> -#endif diff --git a/sysdeps/aarch64/multiarch/memset_zva64.S b/sysdeps/aarch64/multiarch/memset_zva64.S new file mode 100644 index 0000000000000000000000000000000000000000..13f45fd3d882c756f18a1679d758e2eb688f9c3d --- /dev/null +++ b/sysdeps/aarch64/multiarch/memset_zva64.S @@ -0,0 +1,27 @@ +/* Optimized memset for zva size = 64. + Copyright (C) 2023 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#define ZVA64_ONLY 1 +#define MEMSET __memset_zva64 +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(X) + +#include "../memset.S"