Message ID | 1507223795-4893-1-git-send-email-siddhesh@sourceware.org |
---|---|
State | New |
Headers | show |
Series | [PATCHv3,1/2] aarch64: Hoist ZVA check out of the memset function | expand |
Ping! On Thursday 05 October 2017 10:46 PM, Siddhesh Poyarekar wrote: > The DZP bit in the dczid_el0 register does not change dynamically, so > it is safe to read once during program startup. Hoist the zva check > into an ifunc resolver and store the result into a static variable, > which can be read in case of non-standard zva sizes. This effectively > adds 3 ifunc variants for memset - one for cases where zva is > disabled, one for 64 byte zva and another for 128 byte zva. I have > retained the older memset as __memset_generic for internal libc.so use > so that the change impact is minimal. We should eventually have a > discussion on what is more expensive, reading dczid_el0 on every > memset invocation or the indirection due to PLT. > > The gains due to this are significant for falkor, with run time > reductions as high as 42% in some cases. Likewise for mustang, > although the numbers are slightly lower. Here's a sample from the > falkor tests: > > Function: memset > Variant: walk > simple_memset __memset_nozva __memset_zva_64 __memset_generic > ======================================================================================================================== > length=256, char=0: 35936.10 (-706.66%) 2429.88 ( 45.46%) 2571.85 ( 42.27%) 4454.92 > length=257, char=0: 36209.50 (-710.17%) 2436.12 ( 45.49%) 2564.25 ( 42.63%) 4469.36 > length=258, char=0: 36507.90 (-710.21%) 2522.06 ( 44.03%) 2578.89 ( 42.77%) 4505.99 > length=259, char=0: 36764.30 (-711.99%) 2611.61 ( 42.32%) 2593.52 ( 42.72%) 4527.69 > length=260, char=0: 36943.30 (-712.62%) 2639.06 ( 41.95%) 2608.24 ( 42.63%) 4546.19 > length=261, char=0: 37287.50 (-717.27%) 2623.07 ( 42.51%) 2623.17 ( 42.51%) 4562.47 > length=262, char=0: 37573.70 (-722.44%) 2665.51 ( 41.66%) 2637.28 ( 42.27%) 4568.56 > length=263, char=0: 37833.70 (-724.30%) 2692.70 ( 41.33%) 2668.38 ( 41.86%) 4589.79 > length=264, char=0: 38136.00 (-727.49%) 2737.30 ( 40.61%) 2685.48 ( 41.73%) 4608.66 > length=265, char=0: 38403.10 
(-730.30%) 2778.70 ( 39.92%) 2695.10 ( 41.73%) 4625.23 > length=266, char=0: 38684.50 (-729.88%) 2822.16 ( 39.46%) 2692.91 ( 42.23%) 4661.47 > length=267, char=0: 38954.10 (-732.30%) 2867.41 ( 38.73%) 2706.28 ( 42.18%) 4680.31 > length=268, char=0: 39155.00 (-733.08%) 2968.76 ( 36.84%) 2721.89 ( 42.09%) 4700.03 > length=269, char=0: 39559.30 (-737.49%) 3057.49 ( 35.27%) 2737.61 ( 42.04%) 4723.54 > length=270, char=0: 39813.80 (-742.51%) 3073.64 ( 34.96%) 2751.70 ( 41.77%) 4725.60 > length=271, char=0: 40070.60 (-744.40%) 3103.55 ( 34.60%) 2784.25 ( 41.33%) 4745.43 > length=512, char=0: 137515.00 (-1275.48%) 8971.95 ( 10.26%) 7168.66 ( 28.30%) 9997.61 > length=513, char=0: 138015.00 (-1284.40%) 8987.07 ( 9.85%) 7242.59 ( 27.35%) 9969.29 > length=514, char=0: 138556.00 (-1286.76%) 9200.17 ( 7.92%) 7211.49 ( 27.82%) 9991.38 > length=515, char=0: 139182.00 (-1277.21%) 9223.64 ( 8.73%) 7232.78 ( 28.43%) 10106.10 > length=516, char=0: 139512.00 (-1288.41%) 9306.80 ( 7.38%) 7312.15 ( 27.23%) 10048.30 > length=517, char=0: 140117.00 (-1292.65%) 9429.22 ( 6.28%) 7273.52 ( 27.71%) 10061.20 > length=518, char=0: 140706.00 (-1294.63%) 9463.83 ( 6.20%) 7292.57 ( 27.72%) 10089.10 > length=519, char=0: 141221.00 (-1289.12%) 9548.99 ( 6.07%) 7312.75 ( 28.07%) 10166.20 > length=520, char=0: 141696.00 (-1297.00%) 9713.49 ( 4.27%) 7386.44 ( 27.21%) 10147.00 > length=521, char=0: 142309.00 (-1298.82%) 9888.41 ( 2.80%) 7361.91 ( 27.64%) 10173.50 > length=522, char=0: 142878.00 (-1292.34%) 9909.30 ( 3.43%) 7381.22 ( 28.07%) 10261.70 > length=523, char=0: 143327.00 (-1300.69%) 9918.78 ( 3.07%) 7462.93 ( 27.07%) 10232.60 > length=524, char=0: 143776.00 (-1301.67%) 10055.40 ( 1.97%) 7428.56 ( 27.58%) 10257.50 > length=525, char=0: 144429.00 (-1296.79%) 10090.80 ( 2.41%) 7449.84 ( 27.95%) 10340.10 > length=526, char=0: 144976.00 (-1305.05%) 10178.80 ( 1.35%) 7530.66 ( 27.02%) 10318.20 > length=527, char=0: 145551.00 (-1306.63%) 10314.40 ( 0.32%) 7498.48 ( 27.53%) 10347.50 > length=1024, 
char=0: 537600.00 (-2116.32%) 34541.10 (-42.40%) 22541.00 ( 7.07%) 24256.40 > length=1025, char=0: 538490.00 (-2117.66%) 34560.10 (-42.33%) 22574.10 ( 7.03%) 24281.90 > length=1026, char=0: 539596.00 (-2118.30%) 34869.20 (-43.35%) 22615.10 ( 7.03%) 24324.70 > length=1027, char=0: 540544.00 (-2118.30%) 35020.60 (-43.72%) 22654.60 ( 7.03%) 24367.50 > length=1028, char=0: 541355.00 (-2119.44%) 35407.20 (-45.16%) 22702.00 ( 6.93%) 24391.50 > length=1029, char=0: 542678.00 (-2121.52%) 35806.10 (-46.58%) 22751.10 ( 6.87%) 24428.20 > length=1030, char=0: 543843.00 (-2122.73%) 35761.20 (-46.16%) 22771.20 ( 6.93%) 24467.30 > length=1031, char=0: 544725.00 (-2123.15%) 35927.70 (-46.63%) 22814.10 ( 6.89%) 24502.40 > length=1032, char=0: 545744.00 (-2124.10%) 35882.10 (-46.23%) 22844.50 ( 6.90%) 24537.70 > length=1033, char=0: 546968.00 (-2125.25%) 36080.00 (-46.79%) 22885.20 ( 6.90%) 24580.10 > length=1034, char=0: 548042.00 (-2126.35%) 36208.30 (-47.09%) 22922.90 ( 6.88%) 24616.20 > length=1035, char=0: 549066.00 (-2127.30%) 36398.80 (-47.65%) 22961.30 ( 6.86%) 24651.60 > length=1036, char=0: 550138.00 (-2127.95%) 36558.40 (-48.05%) 23008.70 ( 6.82%) 24692.60 > length=1037, char=0: 551170.00 (-2129.86%) 36732.90 (-48.61%) 23043.40 ( 6.77%) 24717.70 > length=1038, char=0: 552268.00 (-2130.95%) 36722.80 (-48.35%) 23078.80 ( 6.77%) 24754.80 > length=1039, char=0: 553270.00 (-2131.58%) 36891.60 (-48.80%) 23116.80 ( 6.76%) 24792.80 > > * sysdeps/aarch64/memset.S (do_no_zva): New macro. > (do_zva_64): Likewise. > (do_zva_128): Likewise. > (__memset): Rename to MEMSET macro. > (MEMSET): Use the new macros. > * sysdeps/aarch64/multiarch/Makefile (sysdep_routines): > Add memset_generic, memset_nozva, memset_zva_64, > memset_zva_128 and memset_generic. > * sysdeps/aarch64/multiarch/ifunc-impl-list.c > (__libc_ifunc_impl_list): Add memset ifuncs. > * sysdeps/aarch64/multiarch/init-arch.h (INIT_ARCH): New > local variable zva_size. > * sysdeps/aarch64/multiarch/memset.c: New file. 
> * sysdeps/aarch64/multiarch/memset_generic.S: New file. > * sysdeps/aarch64/multiarch/memset_nozva.S: New file. > * sysdeps/aarch64/multiarch/memset_zva_64.S: New file. > * sysdeps/aarch64/multiarch/memset_zva_128.S: New file. > * sysdeps/aarch64/multiarch/rtld-memset.S: New file. > * sysdeps/unix/sysv/linux/aarch64/cpu-features.c > (DCZID_DZP_MASK): New macro. > (DCZID_BS_MASK): Likewise. > (init_cpu_features): Read and set zva_size. > * sysdeps/unix/sysv/linux/aarch64/cpu-features.h > (struct cpu_features): New member zva_size. > --- > sysdeps/aarch64/memset.S | 200 +++++++++++++++---------- > sysdeps/aarch64/multiarch/Makefile | 3 +- > sysdeps/aarch64/multiarch/ifunc-impl-list.c | 5 + > sysdeps/aarch64/multiarch/init-arch.h | 8 +- > sysdeps/aarch64/multiarch/memset.c | 43 ++++++ > sysdeps/aarch64/multiarch/memset_generic.S | 28 ++++ > sysdeps/aarch64/multiarch/memset_nozva.S | 23 +++ > sysdeps/aarch64/multiarch/memset_zva_128.S | 24 +++ > sysdeps/aarch64/multiarch/memset_zva_64.S | 24 +++ > sysdeps/aarch64/multiarch/rtld-memset.S | 24 +++ > sysdeps/unix/sysv/linux/aarch64/cpu-features.c | 10 ++ > sysdeps/unix/sysv/linux/aarch64/cpu-features.h | 1 + > 12 files changed, 307 insertions(+), 86 deletions(-) > create mode 100644 sysdeps/aarch64/multiarch/memset.c > create mode 100644 sysdeps/aarch64/multiarch/memset_generic.S > create mode 100644 sysdeps/aarch64/multiarch/memset_nozva.S > create mode 100644 sysdeps/aarch64/multiarch/memset_zva_128.S > create mode 100644 sysdeps/aarch64/multiarch/memset_zva_64.S > create mode 100644 sysdeps/aarch64/multiarch/rtld-memset.S > > diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S > index 110fd22..9fea4c2 100644 > --- a/sysdeps/aarch64/memset.S > +++ b/sysdeps/aarch64/memset.S > @@ -37,7 +37,105 @@ > #define zva_len x7 > #define zva_lenw w7 > > -ENTRY_ALIGN (__memset, 6) > +/* Macros that do the critical loops for either no zva or zva of 64 bytes, 128 > + bytes and higher sizes. */ > + > +/* No ZVA. 
*/ > +.macro do_no_zva > + sub count, dstend, dst /* Count is 16 too large. */ > + add dst, dst, 16 > + sub count, count, 64 + 16 /* Adjust count and bias for loop. */ > +1: stp q0, q0, [dst], 64 > + stp q0, q0, [dst, -32] > + subs count, count, 64 > + b.hi 1b > + stp q0, q0, [dstend, -64] > + stp q0, q0, [dstend, -32] > + ret > +.endm > + > +/* Write the first and last 64 byte aligned block using stp rather > + than using DC ZVA. This is faster on some cores. */ > +.macro do_zva_64 > + str q0, [dst, 16] > + stp q0, q0, [dst, 32] > + bic dst, dst, 63 > + stp q0, q0, [dst, 64] > + stp q0, q0, [dst, 96] > + sub count, dstend, dst /* Count is now 128 too large. */ > + sub count, count, 128+64+64 /* Adjust count and bias for loop. */ > + add dst, dst, 128 > + nop > +1: dc zva, dst > + add dst, dst, 64 > + subs count, count, 64 > + b.hi 1b > + stp q0, q0, [dst, 0] > + stp q0, q0, [dst, 32] > + stp q0, q0, [dstend, -64] > + stp q0, q0, [dstend, -32] > + ret > +.endm > + > +/* ZVA size of 128 bytes. */ > +.macro do_zva_128 > + str q0, [dst, 16] > + stp q0, q0, [dst, 32] > + stp q0, q0, [dst, 64] > + stp q0, q0, [dst, 96] > + bic dst, dst, 127 > + sub count, dstend, dst /* Count is now 128 too large. */ > + sub count, count, 128+128 /* Adjust count and bias for loop. */ > + add dst, dst, 128 > +1: dc zva, dst > + add dst, dst, 128 > + subs count, count, 128 > + b.hi 1b > + stp q0, q0, [dstend, -128] > + stp q0, q0, [dstend, -96] > + stp q0, q0, [dstend, -64] > + stp q0, q0, [dstend, -32] > + ret > +.endm > + > +/* ZVA size of more than 128 bytes. */ > +.macro do_zva_default > + add tmp1, zva_len, 64 /* Max alignment bytes written. */ > + cmp count, tmp1 > + blo L(no_zva) > + > + sub tmp2, zva_len, 1 > + add tmp1, dst, zva_len > + add dst, dst, 16 > + subs count, tmp1, dst /* Actual alignment bytes to write. */ > + bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. 
*/ > + beq 2f > +1: stp q0, q0, [dst], 64 > + stp q0, q0, [dst, -32] > + subs count, count, 64 > + b.hi 1b > +2: mov dst, tmp1 > + sub count, dstend, tmp1 /* Remaining bytes to write. */ > + subs count, count, zva_len > + b.lo 4f > +3: dc zva, dst > + add dst, dst, zva_len > + subs count, count, zva_len > + b.hs 3b > +4: add count, count, zva_len > + subs count, count, 64 > + b.ls 6f > +5: stp q0, q0, [dst], 64 > + stp q0, q0, [dst, -32] > + subs count, count, 64 > + b.hi 5b > +6: stp q0, q0, [dstend, -64] > + stp q0, q0, [dstend, -32] > + ret > +.endm > + > +/* Memset entry point. */ > +ENTRY_ALIGN (MEMSET, 6) > > DELOUSE (0) > DELOUSE (2) > @@ -89,107 +187,45 @@ L(set96): > .p2align 3 > nop > L(set_long): > +#ifdef MEMSET_ZVA > and valw, valw, 255 > +#endif > bic dst, dstin, 15 > str q0, [dstin] > +#ifdef MEMSET_ZVA > cmp count, 256 > ccmp valw, 0, 0, cs > b.eq L(try_zva) > +#endif > L(no_zva): > - sub count, dstend, dst /* Count is 16 too large. */ > - add dst, dst, 16 > - sub count, count, 64 + 16 /* Adjust count and bias for loop. */ > -1: stp q0, q0, [dst], 64 > - stp q0, q0, [dst, -32] > -L(tail64): > - subs count, count, 64 > - b.hi 1b > -2: stp q0, q0, [dstend, -64] > - stp q0, q0, [dstend, -32] > - ret > + do_no_zva > > - .p2align 3 > +#ifdef MEMSET_ZVA > + .p2align 4 > L(try_zva): > +# if MEMSET_ZVA == 64 > + do_zva_64 > +# elif MEMSET_ZVA == 128 > + do_zva_128 > +# else > mrs tmp1, dczid_el0 > tbnz tmp1w, 4, L(no_zva) > and tmp1w, tmp1w, 15 > cmp tmp1w, 4 /* ZVA size is 64 bytes. */ > b.ne L(zva_128) > + do_zva_64 > > - /* Write the first and last 64 byte aligned block using stp rather > - than using DC ZVA. This is faster on some cores. > - */ > -L(zva_64): > - str q0, [dst, 16] > - stp q0, q0, [dst, 32] > - bic dst, dst, 63 > - stp q0, q0, [dst, 64] > - stp q0, q0, [dst, 96] > - sub count, dstend, dst /* Count is now 128 too large. */ > - sub count, count, 128+64+64 /* Adjust count and bias for loop. 
*/ > - add dst, dst, 128 > - nop > -1: dc zva, dst > - add dst, dst, 64 > - subs count, count, 64 > - b.hi 1b > - stp q0, q0, [dst, 0] > - stp q0, q0, [dst, 32] > - stp q0, q0, [dstend, -64] > - stp q0, q0, [dstend, -32] > - ret > - > - .p2align 3 > L(zva_128): > cmp tmp1w, 5 /* ZVA size is 128 bytes. */ > b.ne L(zva_other) > - > - str q0, [dst, 16] > - stp q0, q0, [dst, 32] > - stp q0, q0, [dst, 64] > - stp q0, q0, [dst, 96] > - bic dst, dst, 127 > - sub count, dstend, dst /* Count is now 128 too large. */ > - sub count, count, 128+128 /* Adjust count and bias for loop. */ > - add dst, dst, 128 > -1: dc zva, dst > - add dst, dst, 128 > - subs count, count, 128 > - b.hi 1b > - stp q0, q0, [dstend, -128] > - stp q0, q0, [dstend, -96] > - stp q0, q0, [dstend, -64] > - stp q0, q0, [dstend, -32] > - ret > + do_zva_128 > > L(zva_other): > mov tmp2w, 4 > lsl zva_lenw, tmp2w, tmp1w > - add tmp1, zva_len, 64 /* Max alignment bytes written. */ > - cmp count, tmp1 > - blo L(no_zva) > + do_zva_default > +# endif > +#endif > > - sub tmp2, zva_len, 1 > - add tmp1, dst, zva_len > - add dst, dst, 16 > - subs count, tmp1, dst /* Actual alignment bytes to write. */ > - bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */ > - beq 2f > -1: stp q0, q0, [dst], 64 > - stp q0, q0, [dst, -32] > - subs count, count, 64 > - b.hi 1b > -2: mov dst, tmp1 > - sub count, dstend, tmp1 /* Remaining bytes to write. 
*/ > - subs count, count, zva_len > - b.lo 4f > -3: dc zva, dst > - add dst, dst, zva_len > - subs count, count, zva_len > - b.hs 3b > -4: add count, count, zva_len > - b L(tail64) > - > -END (__memset) > -weak_alias (__memset, memset) > -libc_hidden_builtin_def (memset) > +END (MEMSET) > +libc_hidden_builtin_def (MEMSET) > diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile > index 9aa1e79..c1e17e8 100644 > --- a/sysdeps/aarch64/multiarch/Makefile > +++ b/sysdeps/aarch64/multiarch/Makefile > @@ -1,4 +1,5 @@ > ifeq ($(subdir),string) > sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor \ > - memmove_falkor > + memmove_falkor memset_generic memset_nozva memset_zva_64 \ > + memset_zva_128 > endif > diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c > index 2cb74d5..fb695ce 100644 > --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c > @@ -46,6 +46,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx) > IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor) > IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic)) > + IFUNC_IMPL (i, name, memset, > + IFUNC_IMPL_ADD (array, i, memset, 1, __memset_nozva) > + IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_zva_64) > + IFUNC_IMPL_ADD (array, i, memset, (zva_size == 128), __memset_zva_128) > + IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic)) > > return i; > } > diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h > index 3af442c..a756dad 100644 > --- a/sysdeps/aarch64/multiarch/init-arch.h > +++ b/sysdeps/aarch64/multiarch/init-arch.h > @@ -18,6 +18,8 @@ > > #include <ldsodefs.h> > > -#define INIT_ARCH() \ > - uint64_t __attribute__((unused)) midr = \ > - GLRO(dl_aarch64_cpu_features).midr_el1; > +#define INIT_ARCH() \ > + uint64_t 
__attribute__((unused)) midr = \ > + GLRO(dl_aarch64_cpu_features).midr_el1; \ > + unsigned __attribute__((unused)) zva_size = \ > + GLRO(dl_aarch64_cpu_features).zva_size; > diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c > new file mode 100644 > index 0000000..a7e34c0 > --- /dev/null > +++ b/sysdeps/aarch64/multiarch/memset.c > @@ -0,0 +1,43 @@ > +/* Multiple versions of memset. AARCH64 version. > + Copyright (C) 2017 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <http://www.gnu.org/licenses/>. */ > + > +/* Define multiple versions only for the definition in libc. */ > + > +#if IS_IN (libc) > +/* Redefine memset so that the compiler won't complain about the type > + mismatch with the IFUNC selector in strong_alias, below. */ > +# undef memset > +# define memset __redirect_memset > +# include <string.h> > +# include <init-arch.h> > + > +extern __typeof (__redirect_memset) __libc_memset; > + > +extern __typeof (__redirect_memset) __memset_nozva attribute_hidden; > +extern __typeof (__redirect_memset) __memset_zva_64 attribute_hidden; > +extern __typeof (__redirect_memset) __memset_zva_128 attribute_hidden; > +extern __typeof (__redirect_memset) __memset_generic attribute_hidden; > + > +libc_ifunc (__libc_memset, (zva_size == 0 ? 
__memset_nozva > + : (zva_size == 64 ? __memset_zva_64 > + : (zva_size == 128 ? __memset_zva_128 > + : __memset_generic)))); > + > +# undef memset > +strong_alias (__libc_memset, memset); > +#endif > diff --git a/sysdeps/aarch64/multiarch/memset_generic.S b/sysdeps/aarch64/multiarch/memset_generic.S > new file mode 100644 > index 0000000..8871600 > --- /dev/null > +++ b/sysdeps/aarch64/multiarch/memset_generic.S > @@ -0,0 +1,28 @@ > +/* Memset for aarch64, default version for internal use. > + Copyright (C) 2017 Free Software Foundation, Inc. > + > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. If not, see > + <http://www.gnu.org/licenses/>. */ > + > +#if IS_IN (libc) > +# define MEMSET __memset_generic > +# define MEMSET_ZVA 1 > +/* Add a hidden definition for use within libc.so. */ > +# ifdef SHARED > + .globl __GI_memset; __GI_memset = __memset_generic > +# endif > +# include <sysdeps/aarch64/memset.S> > +#endif > diff --git a/sysdeps/aarch64/multiarch/memset_nozva.S b/sysdeps/aarch64/multiarch/memset_nozva.S > new file mode 100644 > index 0000000..2d4fc42 > --- /dev/null > +++ b/sysdeps/aarch64/multiarch/memset_nozva.S > @@ -0,0 +1,23 @@ > +/* Memset for aarch64, ZVA disabled. > + Copyright (C) 2017 Free Software Foundation, Inc. > + > + This file is part of the GNU C Library. 
> + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. If not, see > + <http://www.gnu.org/licenses/>. */ > + > +#if IS_IN (libc) > +# define MEMSET __memset_nozva > +# include <sysdeps/aarch64/memset.S> > +#endif > diff --git a/sysdeps/aarch64/multiarch/memset_zva_128.S b/sysdeps/aarch64/multiarch/memset_zva_128.S > new file mode 100644 > index 0000000..2c68127 > --- /dev/null > +++ b/sysdeps/aarch64/multiarch/memset_zva_128.S > @@ -0,0 +1,24 @@ > +/* Memset for aarch64, ZVA enabled and == 128 bytes. > + Copyright (C) 2017 Free Software Foundation, Inc. > + > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. If not, see > + <http://www.gnu.org/licenses/>. 
*/ > + > +#if IS_IN (libc) > +# define MEMSET __memset_zva_128 > +# define MEMSET_ZVA 128 > +# include <sysdeps/aarch64/memset.S> > +#endif > diff --git a/sysdeps/aarch64/multiarch/memset_zva_64.S b/sysdeps/aarch64/multiarch/memset_zva_64.S > new file mode 100644 > index 0000000..ff895f9 > --- /dev/null > +++ b/sysdeps/aarch64/multiarch/memset_zva_64.S > @@ -0,0 +1,24 @@ > +/* Memset for aarch64, ZVA enabled and == 64 bytes. > + Copyright (C) 2017 Free Software Foundation, Inc. > + > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. If not, see > + <http://www.gnu.org/licenses/>. */ > + > +#if IS_IN (libc) > +# define MEMSET __memset_zva_64 > +# define MEMSET_ZVA 64 > +# include <sysdeps/aarch64/memset.S> > +#endif > diff --git a/sysdeps/aarch64/multiarch/rtld-memset.S b/sysdeps/aarch64/multiarch/rtld-memset.S > new file mode 100644 > index 0000000..172df42 > --- /dev/null > +++ b/sysdeps/aarch64/multiarch/rtld-memset.S > @@ -0,0 +1,24 @@ > +/* Memset for aarch64, for the dynamic linker. > + Copyright (C) 2017 Free Software Foundation, Inc. > + > + This file is part of the GNU C Library. 
> + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library. If not, see > + <http://www.gnu.org/licenses/>. */ > + > +#if IS_IN (rtld) > +# define MEMSET memset > +# define MEMSET_ZVA 1 > +# include <sysdeps/aarch64/memset.S> > +#endif > diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c > index e769eeb..092ee81 100644 > --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c > +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c > @@ -20,6 +20,9 @@ > #include <sys/auxv.h> > #include <elf/dl-hwcaps.h> > > +#define DCZID_DZP_MASK (1 << 4) > +#define DCZID_BS_MASK (0xf) > + > #if HAVE_TUNABLES > struct cpu_list > { > @@ -72,4 +75,11 @@ init_cpu_features (struct cpu_features *cpu_features) > } > > cpu_features->midr_el1 = midr; > + > + /* Check if ZVA is enabled. */ > + unsigned dczid; > + asm volatile ("mrs %0, dczid_el0" : "=r"(dczid)); > + > + if ((dczid & DCZID_DZP_MASK) == 0) > + cpu_features->zva_size = 4 << (dczid & DCZID_BS_MASK); > } > diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h > index 73cb53d..f2b6afd 100644 > --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h > +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h > @@ -47,6 +47,7 @@ > struct cpu_features > { > uint64_t midr_el1; > + unsigned zva_size; > }; > > #endif /* _CPU_FEATURES_AARCH64_H */ >
LGTM, for the pointer brought by Szabolcs: - I still prefer to have the macros in one file instead of multiple ones. It makes checks for all the possible code paths easier and one can theoretically build the desirable memset by just defining the required macros instead of pulling source code from different files. On 10/10/2017 07:05, Siddhesh Poyarekar wrote: > Ping! > > On Thursday 05 October 2017 10:46 PM, Siddhesh Poyarekar wrote: >> The DZP bit in the dczid_el0 register does not change dynamically, so >> it is safe to read once during program startup. Hoist the zva check >> into an ifunc resolver and store the result into a static variable, >> which can be read in case of non-standard zva sizes. This effectively >> adds 3 ifunc variants for memset - one for cases where zva is >> disabled, one for 64 byte zva and another for 128 byte zva. I have >> retained the older memset as __memset_generic for internal libc.so use >> so that the change impact is minimal. We should eventually have a >> discussion on what is more expensive, reading dczid_el0 on every >> memset invocation or the indirection due to PLT. >> >> The gains due to this are significant for falkor, with run time >> reductions as high as 42% in some cases. Likewise for mustang, >> although the numbers are slightly lower. 
Here's a sample from the >> falkor tests: >> >> Function: memset >> Variant: walk >> simple_memset __memset_nozva __memset_zva_64 __memset_generic >> ======================================================================================================================== >> length=256, char=0: 35936.10 (-706.66%) 2429.88 ( 45.46%) 2571.85 ( 42.27%) 4454.92 >> length=257, char=0: 36209.50 (-710.17%) 2436.12 ( 45.49%) 2564.25 ( 42.63%) 4469.36 >> length=258, char=0: 36507.90 (-710.21%) 2522.06 ( 44.03%) 2578.89 ( 42.77%) 4505.99 >> length=259, char=0: 36764.30 (-711.99%) 2611.61 ( 42.32%) 2593.52 ( 42.72%) 4527.69 >> length=260, char=0: 36943.30 (-712.62%) 2639.06 ( 41.95%) 2608.24 ( 42.63%) 4546.19 >> length=261, char=0: 37287.50 (-717.27%) 2623.07 ( 42.51%) 2623.17 ( 42.51%) 4562.47 >> length=262, char=0: 37573.70 (-722.44%) 2665.51 ( 41.66%) 2637.28 ( 42.27%) 4568.56 >> length=263, char=0: 37833.70 (-724.30%) 2692.70 ( 41.33%) 2668.38 ( 41.86%) 4589.79 >> length=264, char=0: 38136.00 (-727.49%) 2737.30 ( 40.61%) 2685.48 ( 41.73%) 4608.66 >> length=265, char=0: 38403.10 (-730.30%) 2778.70 ( 39.92%) 2695.10 ( 41.73%) 4625.23 >> length=266, char=0: 38684.50 (-729.88%) 2822.16 ( 39.46%) 2692.91 ( 42.23%) 4661.47 >> length=267, char=0: 38954.10 (-732.30%) 2867.41 ( 38.73%) 2706.28 ( 42.18%) 4680.31 >> length=268, char=0: 39155.00 (-733.08%) 2968.76 ( 36.84%) 2721.89 ( 42.09%) 4700.03 >> length=269, char=0: 39559.30 (-737.49%) 3057.49 ( 35.27%) 2737.61 ( 42.04%) 4723.54 >> length=270, char=0: 39813.80 (-742.51%) 3073.64 ( 34.96%) 2751.70 ( 41.77%) 4725.60 >> length=271, char=0: 40070.60 (-744.40%) 3103.55 ( 34.60%) 2784.25 ( 41.33%) 4745.43 >> length=512, char=0: 137515.00 (-1275.48%) 8971.95 ( 10.26%) 7168.66 ( 28.30%) 9997.61 >> length=513, char=0: 138015.00 (-1284.40%) 8987.07 ( 9.85%) 7242.59 ( 27.35%) 9969.29 >> length=514, char=0: 138556.00 (-1286.76%) 9200.17 ( 7.92%) 7211.49 ( 27.82%) 9991.38 >> length=515, char=0: 139182.00 (-1277.21%) 9223.64 ( 8.73%) 7232.78 ( 
28.43%) 10106.10 >> length=516, char=0: 139512.00 (-1288.41%) 9306.80 ( 7.38%) 7312.15 ( 27.23%) 10048.30 >> length=517, char=0: 140117.00 (-1292.65%) 9429.22 ( 6.28%) 7273.52 ( 27.71%) 10061.20 >> length=518, char=0: 140706.00 (-1294.63%) 9463.83 ( 6.20%) 7292.57 ( 27.72%) 10089.10 >> length=519, char=0: 141221.00 (-1289.12%) 9548.99 ( 6.07%) 7312.75 ( 28.07%) 10166.20 >> length=520, char=0: 141696.00 (-1297.00%) 9713.49 ( 4.27%) 7386.44 ( 27.21%) 10147.00 >> length=521, char=0: 142309.00 (-1298.82%) 9888.41 ( 2.80%) 7361.91 ( 27.64%) 10173.50 >> length=522, char=0: 142878.00 (-1292.34%) 9909.30 ( 3.43%) 7381.22 ( 28.07%) 10261.70 >> length=523, char=0: 143327.00 (-1300.69%) 9918.78 ( 3.07%) 7462.93 ( 27.07%) 10232.60 >> length=524, char=0: 143776.00 (-1301.67%) 10055.40 ( 1.97%) 7428.56 ( 27.58%) 10257.50 >> length=525, char=0: 144429.00 (-1296.79%) 10090.80 ( 2.41%) 7449.84 ( 27.95%) 10340.10 >> length=526, char=0: 144976.00 (-1305.05%) 10178.80 ( 1.35%) 7530.66 ( 27.02%) 10318.20 >> length=527, char=0: 145551.00 (-1306.63%) 10314.40 ( 0.32%) 7498.48 ( 27.53%) 10347.50 >> length=1024, char=0: 537600.00 (-2116.32%) 34541.10 (-42.40%) 22541.00 ( 7.07%) 24256.40 >> length=1025, char=0: 538490.00 (-2117.66%) 34560.10 (-42.33%) 22574.10 ( 7.03%) 24281.90 >> length=1026, char=0: 539596.00 (-2118.30%) 34869.20 (-43.35%) 22615.10 ( 7.03%) 24324.70 >> length=1027, char=0: 540544.00 (-2118.30%) 35020.60 (-43.72%) 22654.60 ( 7.03%) 24367.50 >> length=1028, char=0: 541355.00 (-2119.44%) 35407.20 (-45.16%) 22702.00 ( 6.93%) 24391.50 >> length=1029, char=0: 542678.00 (-2121.52%) 35806.10 (-46.58%) 22751.10 ( 6.87%) 24428.20 >> length=1030, char=0: 543843.00 (-2122.73%) 35761.20 (-46.16%) 22771.20 ( 6.93%) 24467.30 >> length=1031, char=0: 544725.00 (-2123.15%) 35927.70 (-46.63%) 22814.10 ( 6.89%) 24502.40 >> length=1032, char=0: 545744.00 (-2124.10%) 35882.10 (-46.23%) 22844.50 ( 6.90%) 24537.70 >> length=1033, char=0: 546968.00 (-2125.25%) 36080.00 (-46.79%) 22885.20 ( 6.90%) 
24580.10 >> length=1034, char=0: 548042.00 (-2126.35%) 36208.30 (-47.09%) 22922.90 ( 6.88%) 24616.20 >> length=1035, char=0: 549066.00 (-2127.30%) 36398.80 (-47.65%) 22961.30 ( 6.86%) 24651.60 >> length=1036, char=0: 550138.00 (-2127.95%) 36558.40 (-48.05%) 23008.70 ( 6.82%) 24692.60 >> length=1037, char=0: 551170.00 (-2129.86%) 36732.90 (-48.61%) 23043.40 ( 6.77%) 24717.70 >> length=1038, char=0: 552268.00 (-2130.95%) 36722.80 (-48.35%) 23078.80 ( 6.77%) 24754.80 >> length=1039, char=0: 553270.00 (-2131.58%) 36891.60 (-48.80%) 23116.80 ( 6.76%) 24792.80 >> >> * sysdeps/aarch64/memset.S (do_no_zva): New macro. >> (do_zva_64): Likewise. >> (do_zva_128): Likewise. >> (__memset): Rename to MEMSET macro. >> (MEMSET): Use the new macros. >> * sysdeps/aarch64/multiarch/Makefile (sysdep_routines): >> Add memset_generic, memset_nozva, memset_zva_64, >> memset_zva_128 and memset_generic. >> * sysdeps/aarch64/multiarch/ifunc-impl-list.c >> (__libc_ifunc_impl_list): Add memset ifuncs. >> * sysdeps/aarch64/multiarch/init-arch.h (INIT_ARCH): New >> local variable zva_size. >> * sysdeps/aarch64/multiarch/memset.c: New file. >> * sysdeps/aarch64/multiarch/memset_generic.S: New file. >> * sysdeps/aarch64/multiarch/memset_nozva.S: New file. >> * sysdeps/aarch64/multiarch/memset_zva_64.S: New file. >> * sysdeps/aarch64/multiarch/memset_zva_128.S: New file. >> * sysdeps/aarch64/multiarch/rtld-memset.S: New file. >> * sysdeps/unix/sysv/linux/aarch64/cpu-features.c >> (DCZID_DZP_MASK): New macro. >> (DCZID_BS_MASK): Likewise. >> (init_cpu_features): Read and set zva_size. >> * sysdeps/unix/sysv/linux/aarch64/cpu-features.h >> (struct cpu_features): New member zva_size. 
>> --- >> sysdeps/aarch64/memset.S | 200 +++++++++++++++---------- >> sysdeps/aarch64/multiarch/Makefile | 3 +- >> sysdeps/aarch64/multiarch/ifunc-impl-list.c | 5 + >> sysdeps/aarch64/multiarch/init-arch.h | 8 +- >> sysdeps/aarch64/multiarch/memset.c | 43 ++++++ >> sysdeps/aarch64/multiarch/memset_generic.S | 28 ++++ >> sysdeps/aarch64/multiarch/memset_nozva.S | 23 +++ >> sysdeps/aarch64/multiarch/memset_zva_128.S | 24 +++ >> sysdeps/aarch64/multiarch/memset_zva_64.S | 24 +++ >> sysdeps/aarch64/multiarch/rtld-memset.S | 24 +++ >> sysdeps/unix/sysv/linux/aarch64/cpu-features.c | 10 ++ >> sysdeps/unix/sysv/linux/aarch64/cpu-features.h | 1 + >> 12 files changed, 307 insertions(+), 86 deletions(-) >> create mode 100644 sysdeps/aarch64/multiarch/memset.c >> create mode 100644 sysdeps/aarch64/multiarch/memset_generic.S >> create mode 100644 sysdeps/aarch64/multiarch/memset_nozva.S >> create mode 100644 sysdeps/aarch64/multiarch/memset_zva_128.S >> create mode 100644 sysdeps/aarch64/multiarch/memset_zva_64.S >> create mode 100644 sysdeps/aarch64/multiarch/rtld-memset.S >> >> diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S >> index 110fd22..9fea4c2 100644 >> --- a/sysdeps/aarch64/memset.S >> +++ b/sysdeps/aarch64/memset.S >> @@ -37,7 +37,105 @@ >> #define zva_len x7 >> #define zva_lenw w7 >> >> -ENTRY_ALIGN (__memset, 6) >> +/* Macros that do the critical loops for either no zva or zva of 64 bytes, 128 >> + bytes and higher sizes. */ >> + >> +/* No ZVA. */ >> +.macro do_no_zva >> + sub count, dstend, dst /* Count is 16 too large. */ >> + add dst, dst, 16 >> + sub count, count, 64 + 16 /* Adjust count and bias for loop. */ >> +1: stp q0, q0, [dst], 64 >> + stp q0, q0, [dst, -32] >> + subs count, count, 64 >> + b.hi 1b >> + stp q0, q0, [dstend, -64] >> + stp q0, q0, [dstend, -32] >> + ret >> +.endm >> + >> +/* Write the first and last 64 byte aligned block using stp rather >> + than using DC ZVA. This is faster on some cores. 
*/ >> +.macro do_zva_64 >> + str q0, [dst, 16] >> + stp q0, q0, [dst, 32] >> + bic dst, dst, 63 >> + stp q0, q0, [dst, 64] >> + stp q0, q0, [dst, 96] >> + sub count, dstend, dst /* Count is now 128 too large. */ >> + sub count, count, 128+64+64 /* Adjust count and bias for loop. */ >> + add dst, dst, 128 >> + nop >> +1: dc zva, dst >> + add dst, dst, 64 >> + subs count, count, 64 >> + b.hi 1b >> + stp q0, q0, [dst, 0] >> + stp q0, q0, [dst, 32] >> + stp q0, q0, [dstend, -64] >> + stp q0, q0, [dstend, -32] >> + ret >> +.endm >> + >> +/* ZVA size of 128 bytes. */ >> +.macro do_zva_128 >> + str q0, [dst, 16] >> + stp q0, q0, [dst, 32] >> + stp q0, q0, [dst, 64] >> + stp q0, q0, [dst, 96] >> + bic dst, dst, 127 >> + sub count, dstend, dst /* Count is now 128 too large. */ >> + sub count, count, 128+128 /* Adjust count and bias for loop. */ >> + add dst, dst, 128 >> +1: dc zva, dst >> + add dst, dst, 128 >> + subs count, count, 128 >> + b.hi 1b >> + stp q0, q0, [dstend, -128] >> + stp q0, q0, [dstend, -96] >> + stp q0, q0, [dstend, -64] >> + stp q0, q0, [dstend, -32] >> + ret >> +.endm >> + >> +/* ZVA size of more than 128 bytes. */ >> +.macro do_zva_default >> + add tmp1, zva_len, 64 /* Max alignment bytes written. */ >> + cmp count, tmp1 >> + blo L(no_zva) >> + >> + sub tmp2, zva_len, 1 >> + add tmp1, dst, zva_len >> + add dst, dst, 16 >> + subs count, tmp1, dst /* Actual alignment bytes to write. */ >> + bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */ >> + beq 2f >> +1: stp q0, q0, [dst], 64 >> + stp q0, q0, [dst, -32] >> + subs count, count, 64 >> + b.hi 1b >> +2: mov dst, tmp1 >> + sub count, dstend, tmp1 /* Remaining bytes to write. 
*/ >> + subs count, count, zva_len >> + b.lo 4f >> +3: dc zva, dst >> + add dst, dst, zva_len >> + subs count, count, zva_len >> + b.hs 3b >> +4: add count, count, zva_len >> + subs count, count, 64 >> + b.ls 6f >> +5: stp q0, q0, [dst], 64 >> + stp q0, q0, [dst, -32] >> + subs count, count, 64 >> + b.hi 5b >> +6: stp q0, q0, [dstend, -64] >> + stp q0, q0, [dstend, -32] >> + ret >> +.endm >> + >> +/* Memset entry point. */ >> +ENTRY_ALIGN (MEMSET, 6) >> >> DELOUSE (0) >> DELOUSE (2) >> @@ -89,107 +187,45 @@ L(set96): >> .p2align 3 >> nop >> L(set_long): >> +#ifdef MEMSET_ZVA >> and valw, valw, 255 >> +#endif >> bic dst, dstin, 15 >> str q0, [dstin] >> +#ifdef MEMSET_ZVA >> cmp count, 256 >> ccmp valw, 0, 0, cs >> b.eq L(try_zva) >> +#endif >> L(no_zva): >> - sub count, dstend, dst /* Count is 16 too large. */ >> - add dst, dst, 16 >> - sub count, count, 64 + 16 /* Adjust count and bias for loop. */ >> -1: stp q0, q0, [dst], 64 >> - stp q0, q0, [dst, -32] >> -L(tail64): >> - subs count, count, 64 >> - b.hi 1b >> -2: stp q0, q0, [dstend, -64] >> - stp q0, q0, [dstend, -32] >> - ret >> + do_no_zva >> >> - .p2align 3 >> +#ifdef MEMSET_ZVA >> + .p2align 4 >> L(try_zva): >> +# if MEMSET_ZVA == 64 >> + do_zva_64 >> +# elif MEMSET_ZVA == 128 >> + do_zva_128 >> +# else >> mrs tmp1, dczid_el0 >> tbnz tmp1w, 4, L(no_zva) >> and tmp1w, tmp1w, 15 >> cmp tmp1w, 4 /* ZVA size is 64 bytes. */ >> b.ne L(zva_128) >> + do_zva_64 >> >> - /* Write the first and last 64 byte aligned block using stp rather >> - than using DC ZVA. This is faster on some cores. >> - */ >> -L(zva_64): >> - str q0, [dst, 16] >> - stp q0, q0, [dst, 32] >> - bic dst, dst, 63 >> - stp q0, q0, [dst, 64] >> - stp q0, q0, [dst, 96] >> - sub count, dstend, dst /* Count is now 128 too large. */ >> - sub count, count, 128+64+64 /* Adjust count and bias for loop. 
*/ >> - add dst, dst, 128 >> - nop >> -1: dc zva, dst >> - add dst, dst, 64 >> - subs count, count, 64 >> - b.hi 1b >> - stp q0, q0, [dst, 0] >> - stp q0, q0, [dst, 32] >> - stp q0, q0, [dstend, -64] >> - stp q0, q0, [dstend, -32] >> - ret >> - >> - .p2align 3 >> L(zva_128): >> cmp tmp1w, 5 /* ZVA size is 128 bytes. */ >> b.ne L(zva_other) >> - >> - str q0, [dst, 16] >> - stp q0, q0, [dst, 32] >> - stp q0, q0, [dst, 64] >> - stp q0, q0, [dst, 96] >> - bic dst, dst, 127 >> - sub count, dstend, dst /* Count is now 128 too large. */ >> - sub count, count, 128+128 /* Adjust count and bias for loop. */ >> - add dst, dst, 128 >> -1: dc zva, dst >> - add dst, dst, 128 >> - subs count, count, 128 >> - b.hi 1b >> - stp q0, q0, [dstend, -128] >> - stp q0, q0, [dstend, -96] >> - stp q0, q0, [dstend, -64] >> - stp q0, q0, [dstend, -32] >> - ret >> + do_zva_128 >> >> L(zva_other): >> mov tmp2w, 4 >> lsl zva_lenw, tmp2w, tmp1w >> - add tmp1, zva_len, 64 /* Max alignment bytes written. */ >> - cmp count, tmp1 >> - blo L(no_zva) >> + do_zva_default >> +# endif >> +#endif >> >> - sub tmp2, zva_len, 1 >> - add tmp1, dst, zva_len >> - add dst, dst, 16 >> - subs count, tmp1, dst /* Actual alignment bytes to write. */ >> - bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */ >> - beq 2f >> -1: stp q0, q0, [dst], 64 >> - stp q0, q0, [dst, -32] >> - subs count, count, 64 >> - b.hi 1b >> -2: mov dst, tmp1 >> - sub count, dstend, tmp1 /* Remaining bytes to write. 
*/ >> - subs count, count, zva_len >> - b.lo 4f >> -3: dc zva, dst >> - add dst, dst, zva_len >> - subs count, count, zva_len >> - b.hs 3b >> -4: add count, count, zva_len >> - b L(tail64) >> - >> -END (__memset) >> -weak_alias (__memset, memset) >> -libc_hidden_builtin_def (memset) >> +END (MEMSET) >> +libc_hidden_builtin_def (MEMSET) >> diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile >> index 9aa1e79..c1e17e8 100644 >> --- a/sysdeps/aarch64/multiarch/Makefile >> +++ b/sysdeps/aarch64/multiarch/Makefile >> @@ -1,4 +1,5 @@ >> ifeq ($(subdir),string) >> sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor \ >> - memmove_falkor >> + memmove_falkor memset_generic memset_nozva memset_zva_64 \ >> + memset_zva_128 >> endif >> diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c >> index 2cb74d5..fb695ce 100644 >> --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c >> +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c >> @@ -46,6 +46,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, >> IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx) >> IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor) >> IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic)) >> + IFUNC_IMPL (i, name, memset, >> + IFUNC_IMPL_ADD (array, i, memset, 1, __memset_nozva) >> + IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_zva_64) >> + IFUNC_IMPL_ADD (array, i, memset, (zva_size == 128), __memset_zva_128) >> + IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic)) >> >> return i; >> } >> diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h >> index 3af442c..a756dad 100644 >> --- a/sysdeps/aarch64/multiarch/init-arch.h >> +++ b/sysdeps/aarch64/multiarch/init-arch.h >> @@ -18,6 +18,8 @@ >> >> #include <ldsodefs.h> >> >> -#define INIT_ARCH() \ >> - uint64_t __attribute__((unused)) midr = \ >> - 
GLRO(dl_aarch64_cpu_features).midr_el1; >> +#define INIT_ARCH() \ >> + uint64_t __attribute__((unused)) midr = \ >> + GLRO(dl_aarch64_cpu_features).midr_el1; \ >> + unsigned __attribute__((unused)) zva_size = \ >> + GLRO(dl_aarch64_cpu_features).zva_size; >> diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c >> new file mode 100644 >> index 0000000..a7e34c0 >> --- /dev/null >> +++ b/sysdeps/aarch64/multiarch/memset.c >> @@ -0,0 +1,43 @@ >> +/* Multiple versions of memset. AARCH64 version. >> + Copyright (C) 2017 Free Software Foundation, Inc. >> + This file is part of the GNU C Library. >> + >> + The GNU C Library is free software; you can redistribute it and/or >> + modify it under the terms of the GNU Lesser General Public >> + License as published by the Free Software Foundation; either >> + version 2.1 of the License, or (at your option) any later version. >> + >> + The GNU C Library is distributed in the hope that it will be useful, >> + but WITHOUT ANY WARRANTY; without even the implied warranty of >> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> + Lesser General Public License for more details. >> + >> + You should have received a copy of the GNU Lesser General Public >> + License along with the GNU C Library; if not, see >> + <http://www.gnu.org/licenses/>. */ >> + >> +/* Define multiple versions only for the definition in libc. */ >> + >> +#if IS_IN (libc) >> +/* Redefine memset so that the compiler won't complain about the type >> + mismatch with the IFUNC selector in strong_alias, below. 
*/ >> +# undef memset >> +# define memset __redirect_memset >> +# include <string.h> >> +# include <init-arch.h> >> + >> +extern __typeof (__redirect_memset) __libc_memset; >> + >> +extern __typeof (__redirect_memset) __memset_nozva attribute_hidden; >> +extern __typeof (__redirect_memset) __memset_zva_64 attribute_hidden; >> +extern __typeof (__redirect_memset) __memset_zva_128 attribute_hidden; >> +extern __typeof (__redirect_memset) __memset_generic attribute_hidden; >> + >> +libc_ifunc (__libc_memset, (zva_size == 0 ? __memset_nozva >> + : (zva_size == 64 ? __memset_zva_64 >> + : (zva_size == 128 ? __memset_zva_128 >> + : __memset_generic)))); >> + >> +# undef memset >> +strong_alias (__libc_memset, memset); >> +#endif >> diff --git a/sysdeps/aarch64/multiarch/memset_generic.S b/sysdeps/aarch64/multiarch/memset_generic.S >> new file mode 100644 >> index 0000000..8871600 >> --- /dev/null >> +++ b/sysdeps/aarch64/multiarch/memset_generic.S >> @@ -0,0 +1,28 @@ >> +/* Memset for aarch64, default version for internal use. >> + Copyright (C) 2017 Free Software Foundation, Inc. >> + >> + This file is part of the GNU C Library. >> + >> + The GNU C Library is free software; you can redistribute it and/or >> + modify it under the terms of the GNU Lesser General Public >> + License as published by the Free Software Foundation; either >> + version 2.1 of the License, or (at your option) any later version. >> + >> + The GNU C Library is distributed in the hope that it will be useful, >> + but WITHOUT ANY WARRANTY; without even the implied warranty of >> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> + Lesser General Public License for more details. >> + >> + You should have received a copy of the GNU Lesser General Public >> + License along with the GNU C Library. If not, see >> + <http://www.gnu.org/licenses/>. 
*/ >> + >> +#if IS_IN (libc) >> +# define MEMSET __memset_generic >> +# define MEMSET_ZVA 1 >> +/* Add a hidden definition for use within libc.so. */ >> +# ifdef SHARED >> + .globl __GI_memset; __GI_memset = __memset_generic >> +# endif >> +# include <sysdeps/aarch64/memset.S> >> +#endif >> diff --git a/sysdeps/aarch64/multiarch/memset_nozva.S b/sysdeps/aarch64/multiarch/memset_nozva.S >> new file mode 100644 >> index 0000000..2d4fc42 >> --- /dev/null >> +++ b/sysdeps/aarch64/multiarch/memset_nozva.S >> @@ -0,0 +1,23 @@ >> +/* Memset for aarch64, ZVA disabled. >> + Copyright (C) 2017 Free Software Foundation, Inc. >> + >> + This file is part of the GNU C Library. >> + >> + The GNU C Library is free software; you can redistribute it and/or >> + modify it under the terms of the GNU Lesser General Public >> + License as published by the Free Software Foundation; either >> + version 2.1 of the License, or (at your option) any later version. >> + >> + The GNU C Library is distributed in the hope that it will be useful, >> + but WITHOUT ANY WARRANTY; without even the implied warranty of >> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> + Lesser General Public License for more details. >> + >> + You should have received a copy of the GNU Lesser General Public >> + License along with the GNU C Library. If not, see >> + <http://www.gnu.org/licenses/>. */ >> + >> +#if IS_IN (libc) >> +# define MEMSET __memset_nozva >> +# include <sysdeps/aarch64/memset.S> >> +#endif >> diff --git a/sysdeps/aarch64/multiarch/memset_zva_128.S b/sysdeps/aarch64/multiarch/memset_zva_128.S >> new file mode 100644 >> index 0000000..2c68127 >> --- /dev/null >> +++ b/sysdeps/aarch64/multiarch/memset_zva_128.S >> @@ -0,0 +1,24 @@ >> +/* Memset for aarch64, ZVA enabled and == 128 bytes. >> + Copyright (C) 2017 Free Software Foundation, Inc. >> + >> + This file is part of the GNU C Library. 
>> + >> + The GNU C Library is free software; you can redistribute it and/or >> + modify it under the terms of the GNU Lesser General Public >> + License as published by the Free Software Foundation; either >> + version 2.1 of the License, or (at your option) any later version. >> + >> + The GNU C Library is distributed in the hope that it will be useful, >> + but WITHOUT ANY WARRANTY; without even the implied warranty of >> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> + Lesser General Public License for more details. >> + >> + You should have received a copy of the GNU Lesser General Public >> + License along with the GNU C Library. If not, see >> + <http://www.gnu.org/licenses/>. */ >> + >> +#if IS_IN (libc) >> +# define MEMSET __memset_zva_128 >> +# define MEMSET_ZVA 128 >> +# include <sysdeps/aarch64/memset.S> >> +#endif >> diff --git a/sysdeps/aarch64/multiarch/memset_zva_64.S b/sysdeps/aarch64/multiarch/memset_zva_64.S >> new file mode 100644 >> index 0000000..ff895f9 >> --- /dev/null >> +++ b/sysdeps/aarch64/multiarch/memset_zva_64.S >> @@ -0,0 +1,24 @@ >> +/* Memset for aarch64, ZVA enabled and == 64 bytes. >> + Copyright (C) 2017 Free Software Foundation, Inc. >> + >> + This file is part of the GNU C Library. >> + >> + The GNU C Library is free software; you can redistribute it and/or >> + modify it under the terms of the GNU Lesser General Public >> + License as published by the Free Software Foundation; either >> + version 2.1 of the License, or (at your option) any later version. >> + >> + The GNU C Library is distributed in the hope that it will be useful, >> + but WITHOUT ANY WARRANTY; without even the implied warranty of >> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> + Lesser General Public License for more details. >> + >> + You should have received a copy of the GNU Lesser General Public >> + License along with the GNU C Library. If not, see >> + <http://www.gnu.org/licenses/>. 
*/ >> + >> +#if IS_IN (libc) >> +# define MEMSET __memset_zva_64 >> +# define MEMSET_ZVA 64 >> +# include <sysdeps/aarch64/memset.S> >> +#endif >> diff --git a/sysdeps/aarch64/multiarch/rtld-memset.S b/sysdeps/aarch64/multiarch/rtld-memset.S >> new file mode 100644 >> index 0000000..172df42 >> --- /dev/null >> +++ b/sysdeps/aarch64/multiarch/rtld-memset.S >> @@ -0,0 +1,24 @@ >> +/* Memset for aarch64, for the dynamic linker. >> + Copyright (C) 2017 Free Software Foundation, Inc. >> + >> + This file is part of the GNU C Library. >> + >> + The GNU C Library is free software; you can redistribute it and/or >> + modify it under the terms of the GNU Lesser General Public >> + License as published by the Free Software Foundation; either >> + version 2.1 of the License, or (at your option) any later version. >> + >> + The GNU C Library is distributed in the hope that it will be useful, >> + but WITHOUT ANY WARRANTY; without even the implied warranty of >> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> + Lesser General Public License for more details. >> + >> + You should have received a copy of the GNU Lesser General Public >> + License along with the GNU C Library. If not, see >> + <http://www.gnu.org/licenses/>. */ >> + >> +#if IS_IN (rtld) >> +# define MEMSET memset >> +# define MEMSET_ZVA 1 >> +# include <sysdeps/aarch64/memset.S> >> +#endif >> diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c >> index e769eeb..092ee81 100644 >> --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c >> +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c >> @@ -20,6 +20,9 @@ >> #include <sys/auxv.h> >> #include <elf/dl-hwcaps.h> >> >> +#define DCZID_DZP_MASK (1 << 4) >> +#define DCZID_BS_MASK (0xf) >> + >> #if HAVE_TUNABLES >> struct cpu_list >> { >> @@ -72,4 +75,11 @@ init_cpu_features (struct cpu_features *cpu_features) >> } >> >> cpu_features->midr_el1 = midr; >> + >> + /* Check if ZVA is enabled. 
*/ >> + unsigned dczid; >> + asm volatile ("mrs %0, dczid_el0" : "=r"(dczid)); >> + >> + if ((dczid & DCZID_DZP_MASK) == 0) >> + cpu_features->zva_size = 4 << (dczid & DCZID_BS_MASK); >> } >> diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h >> index 73cb53d..f2b6afd 100644 >> --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h >> +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h >> @@ -47,6 +47,7 @@ >> struct cpu_features >> { >> uint64_t midr_el1; >> + unsigned zva_size; >> }; >> >> #endif /* _CPU_FEATURES_AARCH64_H */ >> >
======================================================================================================================== length=256, char=0: 35936.10 (-706.66%) 2429.88 ( 45.46%) 2571.85 ( 42.27%) 4454.92 length=257, char=0: 36209.50 (-710.17%) 2436.12 ( 45.49%) 2564.25 ( 42.63%) 4469.36 length=258, char=0: 36507.90 (-710.21%) 2522.06 ( 44.03%) 2578.89 ( 42.77%) 4505.99 length=259, char=0: 36764.30 (-711.99%) 2611.61 ( 42.32%) 2593.52 ( 42.72%) 4527.69 length=260, char=0: 36943.30 (-712.62%) 2639.06 ( 41.95%) 2608.24 ( 42.63%) 4546.19 length=261, char=0: 37287.50 (-717.27%) 2623.07 ( 42.51%) 2623.17 ( 42.51%) 4562.47 length=262, char=0: 37573.70 (-722.44%) 2665.51 ( 41.66%) 2637.28 ( 42.27%) 4568.56 length=263, char=0: 37833.70 (-724.30%) 2692.70 ( 41.33%) 2668.38 ( 41.86%) 4589.79 length=264, char=0: 38136.00 (-727.49%) 2737.30 ( 40.61%) 2685.48 ( 41.73%) 4608.66 length=265, char=0: 38403.10 (-730.30%) 2778.70 ( 39.92%) 2695.10 ( 41.73%) 4625.23 length=266, char=0: 38684.50 (-729.88%) 2822.16 ( 39.46%) 2692.91 ( 42.23%) 4661.47 length=267, char=0: 38954.10 (-732.30%) 2867.41 ( 38.73%) 2706.28 ( 42.18%) 4680.31 length=268, char=0: 39155.00 (-733.08%) 2968.76 ( 36.84%) 2721.89 ( 42.09%) 4700.03 length=269, char=0: 39559.30 (-737.49%) 3057.49 ( 35.27%) 2737.61 ( 42.04%) 4723.54 length=270, char=0: 39813.80 (-742.51%) 3073.64 ( 34.96%) 2751.70 ( 41.77%) 4725.60 length=271, char=0: 40070.60 (-744.40%) 3103.55 ( 34.60%) 2784.25 ( 41.33%) 4745.43 length=512, char=0: 137515.00 (-1275.48%) 8971.95 ( 10.26%) 7168.66 ( 28.30%) 9997.61 length=513, char=0: 138015.00 (-1284.40%) 8987.07 ( 9.85%) 7242.59 ( 27.35%) 9969.29 length=514, char=0: 138556.00 (-1286.76%) 9200.17 ( 7.92%) 7211.49 ( 27.82%) 9991.38 length=515, char=0: 139182.00 (-1277.21%) 9223.64 ( 8.73%) 7232.78 ( 28.43%) 10106.10 length=516, char=0: 139512.00 (-1288.41%) 9306.80 ( 7.38%) 7312.15 ( 27.23%) 10048.30 length=517, char=0: 140117.00 (-1292.65%) 9429.22 ( 6.28%) 7273.52 ( 27.71%) 10061.20 length=518, char=0: 
140706.00 (-1294.63%) 9463.83 ( 6.20%) 7292.57 ( 27.72%) 10089.10 length=519, char=0: 141221.00 (-1289.12%) 9548.99 ( 6.07%) 7312.75 ( 28.07%) 10166.20 length=520, char=0: 141696.00 (-1297.00%) 9713.49 ( 4.27%) 7386.44 ( 27.21%) 10147.00 length=521, char=0: 142309.00 (-1298.82%) 9888.41 ( 2.80%) 7361.91 ( 27.64%) 10173.50 length=522, char=0: 142878.00 (-1292.34%) 9909.30 ( 3.43%) 7381.22 ( 28.07%) 10261.70 length=523, char=0: 143327.00 (-1300.69%) 9918.78 ( 3.07%) 7462.93 ( 27.07%) 10232.60 length=524, char=0: 143776.00 (-1301.67%) 10055.40 ( 1.97%) 7428.56 ( 27.58%) 10257.50 length=525, char=0: 144429.00 (-1296.79%) 10090.80 ( 2.41%) 7449.84 ( 27.95%) 10340.10 length=526, char=0: 144976.00 (-1305.05%) 10178.80 ( 1.35%) 7530.66 ( 27.02%) 10318.20 length=527, char=0: 145551.00 (-1306.63%) 10314.40 ( 0.32%) 7498.48 ( 27.53%) 10347.50 length=1024, char=0: 537600.00 (-2116.32%) 34541.10 (-42.40%) 22541.00 ( 7.07%) 24256.40 length=1025, char=0: 538490.00 (-2117.66%) 34560.10 (-42.33%) 22574.10 ( 7.03%) 24281.90 length=1026, char=0: 539596.00 (-2118.30%) 34869.20 (-43.35%) 22615.10 ( 7.03%) 24324.70 length=1027, char=0: 540544.00 (-2118.30%) 35020.60 (-43.72%) 22654.60 ( 7.03%) 24367.50 length=1028, char=0: 541355.00 (-2119.44%) 35407.20 (-45.16%) 22702.00 ( 6.93%) 24391.50 length=1029, char=0: 542678.00 (-2121.52%) 35806.10 (-46.58%) 22751.10 ( 6.87%) 24428.20 length=1030, char=0: 543843.00 (-2122.73%) 35761.20 (-46.16%) 22771.20 ( 6.93%) 24467.30 length=1031, char=0: 544725.00 (-2123.15%) 35927.70 (-46.63%) 22814.10 ( 6.89%) 24502.40 length=1032, char=0: 545744.00 (-2124.10%) 35882.10 (-46.23%) 22844.50 ( 6.90%) 24537.70 length=1033, char=0: 546968.00 (-2125.25%) 36080.00 (-46.79%) 22885.20 ( 6.90%) 24580.10 length=1034, char=0: 548042.00 (-2126.35%) 36208.30 (-47.09%) 22922.90 ( 6.88%) 24616.20 length=1035, char=0: 549066.00 (-2127.30%) 36398.80 (-47.65%) 22961.30 ( 6.86%) 24651.60 length=1036, char=0: 550138.00 (-2127.95%) 36558.40 (-48.05%) 23008.70 ( 6.82%) 
24692.60 length=1037, char=0: 551170.00 (-2129.86%) 36732.90 (-48.61%) 23043.40 ( 6.77%) 24717.70 length=1038, char=0: 552268.00 (-2130.95%) 36722.80 (-48.35%) 23078.80 ( 6.77%) 24754.80 length=1039, char=0: 553270.00 (-2131.58%) 36891.60 (-48.80%) 23116.80 ( 6.76%) 24792.80 * sysdeps/aarch64/memset.S (do_no_zva): New macro. (do_zva_64): Likewise. (do_zva_128): Likewise. (__memset): Rename to MEMSET macro. (MEMSET): Use the new macros. * sysdeps/aarch64/multiarch/Makefile (sysdep_routines): Add memset_generic, memset_nozva, memset_zva_64, memset_zva_128 and memset_generic. * sysdeps/aarch64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list): Add memset ifuncs. * sysdeps/aarch64/multiarch/init-arch.h (INIT_ARCH): New local variable zva_size. * sysdeps/aarch64/multiarch/memset.c: New file. * sysdeps/aarch64/multiarch/memset_generic.S: New file. * sysdeps/aarch64/multiarch/memset_nozva.S: New file. * sysdeps/aarch64/multiarch/memset_zva_64.S: New file. * sysdeps/aarch64/multiarch/memset_zva_128.S: New file. * sysdeps/aarch64/multiarch/rtld-memset.S: New file. * sysdeps/unix/sysv/linux/aarch64/cpu-features.c (DCZID_DZP_MASK): New macro. (DCZID_BS_MASK): Likewise. (init_cpu_features): Read and set zva_size. * sysdeps/unix/sysv/linux/aarch64/cpu-features.h (struct cpu_features): New member zva_size. 
--- sysdeps/aarch64/memset.S | 200 +++++++++++++++---------- sysdeps/aarch64/multiarch/Makefile | 3 +- sysdeps/aarch64/multiarch/ifunc-impl-list.c | 5 + sysdeps/aarch64/multiarch/init-arch.h | 8 +- sysdeps/aarch64/multiarch/memset.c | 43 ++++++ sysdeps/aarch64/multiarch/memset_generic.S | 28 ++++ sysdeps/aarch64/multiarch/memset_nozva.S | 23 +++ sysdeps/aarch64/multiarch/memset_zva_128.S | 24 +++ sysdeps/aarch64/multiarch/memset_zva_64.S | 24 +++ sysdeps/aarch64/multiarch/rtld-memset.S | 24 +++ sysdeps/unix/sysv/linux/aarch64/cpu-features.c | 10 ++ sysdeps/unix/sysv/linux/aarch64/cpu-features.h | 1 + 12 files changed, 307 insertions(+), 86 deletions(-) create mode 100644 sysdeps/aarch64/multiarch/memset.c create mode 100644 sysdeps/aarch64/multiarch/memset_generic.S create mode 100644 sysdeps/aarch64/multiarch/memset_nozva.S create mode 100644 sysdeps/aarch64/multiarch/memset_zva_128.S create mode 100644 sysdeps/aarch64/multiarch/memset_zva_64.S create mode 100644 sysdeps/aarch64/multiarch/rtld-memset.S diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S index 110fd22..9fea4c2 100644 --- a/sysdeps/aarch64/memset.S +++ b/sysdeps/aarch64/memset.S @@ -37,7 +37,105 @@ #define zva_len x7 #define zva_lenw w7 -ENTRY_ALIGN (__memset, 6) +/* Macros that do the critical loops for either no zva or zva of 64 bytes, 128 + bytes and higher sizes. */ + +/* No ZVA. */ +.macro do_no_zva + sub count, dstend, dst /* Count is 16 too large. */ + add dst, dst, 16 + sub count, count, 64 + 16 /* Adjust count and bias for loop. */ +1: stp q0, q0, [dst], 64 + stp q0, q0, [dst, -32] + subs count, count, 64 + b.hi 1b + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret +.endm + +/* Write the first and last 64 byte aligned block using stp rather + than using DC ZVA. This is faster on some cores. 
*/ +.macro do_zva_64 + str q0, [dst, 16] + stp q0, q0, [dst, 32] + bic dst, dst, 63 + stp q0, q0, [dst, 64] + stp q0, q0, [dst, 96] + sub count, dstend, dst /* Count is now 128 too large. */ + sub count, count, 128+64+64 /* Adjust count and bias for loop. */ + add dst, dst, 128 + nop +1: dc zva, dst + add dst, dst, 64 + subs count, count, 64 + b.hi 1b + stp q0, q0, [dst, 0] + stp q0, q0, [dst, 32] + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret +.endm + +/* ZVA size of 128 bytes. */ +.macro do_zva_128 + str q0, [dst, 16] + stp q0, q0, [dst, 32] + stp q0, q0, [dst, 64] + stp q0, q0, [dst, 96] + bic dst, dst, 127 + sub count, dstend, dst /* Count is now 128 too large. */ + sub count, count, 128+128 /* Adjust count and bias for loop. */ + add dst, dst, 128 +1: dc zva, dst + add dst, dst, 128 + subs count, count, 128 + b.hi 1b + stp q0, q0, [dstend, -128] + stp q0, q0, [dstend, -96] + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret +.endm + +/* ZVA size of more than 128 bytes. */ +.macro do_zva_default + add tmp1, zva_len, 64 /* Max alignment bytes written. */ + cmp count, tmp1 + blo L(no_zva) + + sub tmp2, zva_len, 1 + add tmp1, dst, zva_len + add dst, dst, 16 + subs count, tmp1, dst /* Actual alignment bytes to write. */ + bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */ + beq 2f +1: stp q0, q0, [dst], 64 + stp q0, q0, [dst, -32] + subs count, count, 64 + b.hi 1b +2: mov dst, tmp1 + sub count, dstend, tmp1 /* Remaining bytes to write. */ + subs count, count, zva_len + b.lo 4f +3: dc zva, dst + add dst, dst, zva_len + subs count, count, zva_len + b.hs 3b +4: add count, count, zva_len + subs count, count, 64 + b.ls 6f +5: stp q0, q0, [dst], 64 + stp q0, q0, [dst, -32] + subs count, count, 64 + b.hi 5b +6: stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret +.endm + +/* Memset entry point. 
*/ +ENTRY_ALIGN (MEMSET, 6) DELOUSE (0) DELOUSE (2) @@ -89,107 +187,45 @@ L(set96): .p2align 3 nop L(set_long): +#ifdef MEMSET_ZVA and valw, valw, 255 +#endif bic dst, dstin, 15 str q0, [dstin] +#ifdef MEMSET_ZVA cmp count, 256 ccmp valw, 0, 0, cs b.eq L(try_zva) +#endif L(no_zva): - sub count, dstend, dst /* Count is 16 too large. */ - add dst, dst, 16 - sub count, count, 64 + 16 /* Adjust count and bias for loop. */ -1: stp q0, q0, [dst], 64 - stp q0, q0, [dst, -32] -L(tail64): - subs count, count, 64 - b.hi 1b -2: stp q0, q0, [dstend, -64] - stp q0, q0, [dstend, -32] - ret + do_no_zva - .p2align 3 +#ifdef MEMSET_ZVA + .p2align 4 L(try_zva): +# if MEMSET_ZVA == 64 + do_zva_64 +# elif MEMSET_ZVA == 128 + do_zva_128 +# else mrs tmp1, dczid_el0 tbnz tmp1w, 4, L(no_zva) and tmp1w, tmp1w, 15 cmp tmp1w, 4 /* ZVA size is 64 bytes. */ b.ne L(zva_128) + do_zva_64 - /* Write the first and last 64 byte aligned block using stp rather - than using DC ZVA. This is faster on some cores. - */ -L(zva_64): - str q0, [dst, 16] - stp q0, q0, [dst, 32] - bic dst, dst, 63 - stp q0, q0, [dst, 64] - stp q0, q0, [dst, 96] - sub count, dstend, dst /* Count is now 128 too large. */ - sub count, count, 128+64+64 /* Adjust count and bias for loop. */ - add dst, dst, 128 - nop -1: dc zva, dst - add dst, dst, 64 - subs count, count, 64 - b.hi 1b - stp q0, q0, [dst, 0] - stp q0, q0, [dst, 32] - stp q0, q0, [dstend, -64] - stp q0, q0, [dstend, -32] - ret - - .p2align 3 L(zva_128): cmp tmp1w, 5 /* ZVA size is 128 bytes. */ b.ne L(zva_other) - - str q0, [dst, 16] - stp q0, q0, [dst, 32] - stp q0, q0, [dst, 64] - stp q0, q0, [dst, 96] - bic dst, dst, 127 - sub count, dstend, dst /* Count is now 128 too large. */ - sub count, count, 128+128 /* Adjust count and bias for loop. 
*/ - add dst, dst, 128 -1: dc zva, dst - add dst, dst, 128 - subs count, count, 128 - b.hi 1b - stp q0, q0, [dstend, -128] - stp q0, q0, [dstend, -96] - stp q0, q0, [dstend, -64] - stp q0, q0, [dstend, -32] - ret + do_zva_128 L(zva_other): mov tmp2w, 4 lsl zva_lenw, tmp2w, tmp1w - add tmp1, zva_len, 64 /* Max alignment bytes written. */ - cmp count, tmp1 - blo L(no_zva) + do_zva_default +# endif +#endif - sub tmp2, zva_len, 1 - add tmp1, dst, zva_len - add dst, dst, 16 - subs count, tmp1, dst /* Actual alignment bytes to write. */ - bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */ - beq 2f -1: stp q0, q0, [dst], 64 - stp q0, q0, [dst, -32] - subs count, count, 64 - b.hi 1b -2: mov dst, tmp1 - sub count, dstend, tmp1 /* Remaining bytes to write. */ - subs count, count, zva_len - b.lo 4f -3: dc zva, dst - add dst, dst, zva_len - subs count, count, zva_len - b.hs 3b -4: add count, count, zva_len - b L(tail64) - -END (__memset) -weak_alias (__memset, memset) -libc_hidden_builtin_def (memset) +END (MEMSET) +libc_hidden_builtin_def (MEMSET) diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile index 9aa1e79..c1e17e8 100644 --- a/sysdeps/aarch64/multiarch/Makefile +++ b/sysdeps/aarch64/multiarch/Makefile @@ -1,4 +1,5 @@ ifeq ($(subdir),string) sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor \ - memmove_falkor + memmove_falkor memset_generic memset_nozva memset_zva_64 \ + memset_zva_128 endif diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c index 2cb74d5..fb695ce 100644 --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c @@ -46,6 +46,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx) IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor) IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic)) + IFUNC_IMPL (i, name, memset, + 
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_nozva) + IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_zva_64) + IFUNC_IMPL_ADD (array, i, memset, (zva_size == 128), __memset_zva_128) + IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic)) return i; } diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h index 3af442c..a756dad 100644 --- a/sysdeps/aarch64/multiarch/init-arch.h +++ b/sysdeps/aarch64/multiarch/init-arch.h @@ -18,6 +18,8 @@ #include <ldsodefs.h> -#define INIT_ARCH() \ - uint64_t __attribute__((unused)) midr = \ - GLRO(dl_aarch64_cpu_features).midr_el1; +#define INIT_ARCH() \ + uint64_t __attribute__((unused)) midr = \ + GLRO(dl_aarch64_cpu_features).midr_el1; \ + unsigned __attribute__((unused)) zva_size = \ + GLRO(dl_aarch64_cpu_features).zva_size; diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c new file mode 100644 index 0000000..a7e34c0 --- /dev/null +++ b/sysdeps/aarch64/multiarch/memset.c @@ -0,0 +1,43 @@ +/* Multiple versions of memset. AARCH64 version. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. 
*/ + +#if IS_IN (libc) +/* Redefine memset so that the compiler won't complain about the type + mismatch with the IFUNC selector in strong_alias, below. */ +# undef memset +# define memset __redirect_memset +# include <string.h> +# include <init-arch.h> + +extern __typeof (__redirect_memset) __libc_memset; + +extern __typeof (__redirect_memset) __memset_nozva attribute_hidden; +extern __typeof (__redirect_memset) __memset_zva_64 attribute_hidden; +extern __typeof (__redirect_memset) __memset_zva_128 attribute_hidden; +extern __typeof (__redirect_memset) __memset_generic attribute_hidden; + +libc_ifunc (__libc_memset, (zva_size == 0 ? __memset_nozva + : (zva_size == 64 ? __memset_zva_64 + : (zva_size == 128 ? __memset_zva_128 + : __memset_generic)))); + +# undef memset +strong_alias (__libc_memset, memset); +#endif diff --git a/sysdeps/aarch64/multiarch/memset_generic.S b/sysdeps/aarch64/multiarch/memset_generic.S new file mode 100644 index 0000000..8871600 --- /dev/null +++ b/sysdeps/aarch64/multiarch/memset_generic.S @@ -0,0 +1,28 @@ +/* Memset for aarch64, default version for internal use. + Copyright (C) 2017 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. 
*/ + +#if IS_IN (libc) +# define MEMSET __memset_generic +# define MEMSET_ZVA 1 +/* Add a hidden definition for use within libc.so. */ +# ifdef SHARED + .globl __GI_memset; __GI_memset = __memset_generic +# endif +# include <sysdeps/aarch64/memset.S> +#endif diff --git a/sysdeps/aarch64/multiarch/memset_nozva.S b/sysdeps/aarch64/multiarch/memset_nozva.S new file mode 100644 index 0000000..2d4fc42 --- /dev/null +++ b/sysdeps/aarch64/multiarch/memset_nozva.S @@ -0,0 +1,23 @@ +/* Memset for aarch64, ZVA disabled. + Copyright (C) 2017 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) +# define MEMSET __memset_nozva +# include <sysdeps/aarch64/memset.S> +#endif diff --git a/sysdeps/aarch64/multiarch/memset_zva_128.S b/sysdeps/aarch64/multiarch/memset_zva_128.S new file mode 100644 index 0000000..2c68127 --- /dev/null +++ b/sysdeps/aarch64/multiarch/memset_zva_128.S @@ -0,0 +1,24 @@ +/* Memset for aarch64, ZVA enabled and == 128 bytes. + Copyright (C) 2017 Free Software Foundation, Inc. + + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) +# define MEMSET __memset_zva_128 +# define MEMSET_ZVA 128 +# include <sysdeps/aarch64/memset.S> +#endif diff --git a/sysdeps/aarch64/multiarch/memset_zva_64.S b/sysdeps/aarch64/multiarch/memset_zva_64.S new file mode 100644 index 0000000..ff895f9 --- /dev/null +++ b/sysdeps/aarch64/multiarch/memset_zva_64.S @@ -0,0 +1,24 @@ +/* Memset for aarch64, ZVA enabled and == 64 bytes. + Copyright (C) 2017 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. 
*/ + +#if IS_IN (libc) +# define MEMSET __memset_zva_64 +# define MEMSET_ZVA 64 +# include <sysdeps/aarch64/memset.S> +#endif diff --git a/sysdeps/aarch64/multiarch/rtld-memset.S b/sysdeps/aarch64/multiarch/rtld-memset.S new file mode 100644 index 0000000..172df42 --- /dev/null +++ b/sysdeps/aarch64/multiarch/rtld-memset.S @@ -0,0 +1,24 @@ +/* Memset for aarch64, for the dynamic linker. + Copyright (C) 2017 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (rtld) +# define MEMSET memset +# define MEMSET_ZVA 1 +# include <sysdeps/aarch64/memset.S> +#endif diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c index e769eeb..092ee81 100644 --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c @@ -20,6 +20,9 @@ #include <sys/auxv.h> #include <elf/dl-hwcaps.h> +#define DCZID_DZP_MASK (1 << 4) +#define DCZID_BS_MASK (0xf) + #if HAVE_TUNABLES struct cpu_list { @@ -72,4 +75,11 @@ init_cpu_features (struct cpu_features *cpu_features) } cpu_features->midr_el1 = midr; + + /* Check if ZVA is enabled. 
*/ + unsigned dczid; + asm volatile ("mrs %0, dczid_el0" : "=r"(dczid)); + + if ((dczid & DCZID_DZP_MASK) == 0) + cpu_features->zva_size = 4 << (dczid & DCZID_BS_MASK); } diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h index 73cb53d..f2b6afd 100644 --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h @@ -47,6 +47,7 @@ struct cpu_features { uint64_t midr_el1; + unsigned zva_size; }; #endif /* _CPU_FEATURES_AARCH64_H */