Message ID | 1505834596-21641-1-git-send-email-siddhesh@sourceware.org
---|---
State | New
Series | [1/2] Hoist ZVA check out of the function
Ping.

On Tuesday 19 September 2017 08:53 PM, Siddhesh Poyarekar wrote:
> The DZP bit in the dczid_el0 register does not change dynamically, so
> it is safe to read once during program startup.  Hoist the zva check
> into an ifunc resolver and store the result into a static variable,
> which can be read in case of non-standard zva sizes.  This effectively
> adds 3 ifunc variants for memset - one for cases where zva is
> disabled, one for 64 byte zva and another for 128 byte zva.  I have
> retained the older memset as __memset_generic for internal libc.so use
> so that the change impact is minimal.  We should eventually have a
> discussion on what is more expensive, reading dczid_el0 on every
> memset invocation or the indirection due to PLT.
>
> The gains due to this are significant for falkor, with gains as high
> as 80% in some cases.  Likewise for mustang, although the numbers are
> slightly lower.  Here's a sample from the falkor tests:
>
> Function: memset
> Variant: walk
>                      simple_memset    __memset_nozva   __memset_zva_64  __memset_zva_default  __memset_generic
> ========================================================================================================================
> length=256, char=0:  1.82 (-87.26%)  26.99 ( 89.30%)  25.49 ( 78.76%)  23.48 ( 64.65%)  14.26
> length=257, char=0:  1.82 (-87.29%)  26.97 ( 88.44%)  25.77 ( 80.12%)  24.41 ( 70.57%)  14.31
> length=258, char=0:  1.82 (-87.38%)  26.27 ( 82.29%)  25.84 ( 79.28%)  24.33 ( 68.80%)  14.41
> length=259, char=0:  1.82 (-87.36%)  26.06 ( 81.15%)  25.72 ( 78.84%)  24.57 ( 70.80%)  14.38
> length=260, char=0:  1.82 (-87.44%)  25.35 ( 75.23%)  25.93 ( 79.23%)  24.34 ( 68.24%)  14.47
> length=261, char=0:  1.82 (-87.49%)  26.15 ( 79.70%)  26.01 ( 78.72%)  24.44 ( 67.97%)  14.55
> length=262, char=0:  1.82 (-87.54%)  25.91 ( 77.31%)  26.06 ( 78.35%)  24.33 ( 66.49%)  14.61
> length=263, char=0:  1.82 (-87.54%)  25.69 ( 75.80%)  25.96 ( 77.63%)  24.54 ( 67.90%)  14.61
> length=264, char=0:  1.82 (-87.57%)  25.31 ( 72.69%)  26.16 ( 78.43%)  24.63 ( 68.00%)  14.66
> length=265, char=0:  1.82 (-87.65%)  25.29 ( 71.35%)  26.25 ( 77.84%)  24.58 ( 66.53%)  14.76
> length=266, char=0:  1.82 (-87.69%)  25.10 ( 69.40%)  26.15 ( 76.48%)  24.77 ( 67.22%)  14.82
> length=267, char=0:  1.82 (-87.69%)  24.89 ( 68.02%)  26.20 ( 76.90%)  24.87 ( 67.91%)  14.81
> length=268, char=0:  1.82 (-87.74%)  24.07 ( 62.04%)  26.40 ( 77.74%)  24.95 ( 67.93%)  14.85
> length=269, char=0:  1.82 (-87.80%)  23.82 ( 59.29%)  26.47 ( 77.00%)  24.89 ( 66.43%)  14.96
> length=270, char=0:  1.82 (-87.84%)  23.65 ( 57.61%)  26.35 ( 75.58%)  25.07 ( 67.07%)  15.01
> length=271, char=0:  1.83 (-87.82%)  23.48 ( 56.53%)  26.39 ( 75.93%)  25.15 ( 67.66%)  15.00
> length=512, char=0:  1.90 (-92.59%)  29.25 ( 13.81%)  36.30 ( 41.27%)  40.95 ( 59.36%)  25.70
> length=513, char=0:  1.90 (-92.57%)  29.29 ( 14.35%)  36.63 ( 43.01%)  40.80 ( 59.28%)  25.61
> length=514, char=0:  1.90 (-92.62%)  28.61 ( 10.91%)  36.64 ( 42.05%)  40.89 ( 58.52%)  25.80
> length=515, char=0:  1.90 (-92.63%)  28.74 ( 11.29%)  36.68 ( 42.06%)  40.56 ( 57.08%)  25.82
> length=516, char=0:  1.90 (-92.65%)  28.33 (  9.54%)  36.72 ( 41.96%)  40.09 ( 55.01%)  25.87
> length=517, char=0:  1.90 (-92.66%)  28.41 (  9.60%)  36.80 ( 41.97%)  39.43 ( 52.13%)  25.92
> length=518, char=0:  1.90 (-92.66%)  28.16 (  8.45%)  36.84 ( 41.89%)  39.40 ( 51.77%)  25.96
> length=519, char=0:  1.90 (-92.67%)  28.21 (  8.58%)  36.86 ( 41.86%)  40.39 ( 55.46%)  25.98
> length=520, char=0:  1.90 (-92.65%)  27.53 (  6.32%)  36.90 ( 42.49%)  40.80 ( 57.58%)  25.89
> length=521, char=0:  1.90 (-92.69%)  27.53 (  5.65%)  36.61 ( 40.50%)  40.86 ( 56.81%)  26.05
> length=522, char=0:  1.90 (-92.66%)  27.40 (  5.59%)  36.95 ( 42.35%)  40.92 ( 57.64%)  25.95
> length=523, char=0:  1.91 (-92.71%)  27.50 (  5.29%)  36.69 ( 40.45%)  40.97 ( 56.84%)  26.12
> length=524, char=0:  1.90 (-92.69%)  27.33 (  5.02%)  37.00 ( 42.18%)  40.98 ( 57.50%)  26.02
> length=525, char=0:  1.91 (-92.72%)  27.24 (  4.04%)  36.70 ( 40.13%)  41.04 ( 56.72%)  26.19
> length=526, char=0:  1.90 (-92.70%)  27.06 (  3.73%)  37.06 ( 42.05%)  41.08 ( 57.44%)  26.09
> length=527, char=0:  1.91 (-92.74%)  26.82 (  2.17%)  37.06 ( 41.17%)  41.11 ( 56.62%)  26.25
> length=1024, char=0: 1.95 (-95.35%)  30.55 (-27.12%)  46.52 ( 10.99%)  49.89 ( 19.02%)  41.91
> length=1025, char=0: 1.95 (-95.31%)  30.58 (-26.47%)  46.57 ( 11.98%)  49.92 ( 20.05%)  41.59
> length=1026, char=0: 1.95 (-95.36%)  30.35 (-27.70%)  46.56 ( 10.92%)  49.45 ( 17.79%)  41.98
> length=1027, char=0: 1.95 (-95.36%)  30.24 (-28.02%)  46.20 (  9.98%)  49.93 ( 18.88%)  42.00
> length=1028, char=0: 1.95 (-95.37%)  29.75 (-29.25%)  46.58 ( 10.76%)  49.92 ( 18.71%)  42.05
> length=1029, char=0: 1.95 (-95.37%)  29.78 (-29.24%)  46.57 ( 10.65%)  49.96 ( 18.72%)  42.08
> length=1030, char=0: 1.95 (-95.33%)  29.77 (-28.73%)  46.63 ( 11.63%)  49.97 ( 19.64%)  41.77
> length=1031, char=0: 1.95 (-95.37%)  29.64 (-29.68%)  46.62 ( 10.59%)  49.51 ( 17.46%)  42.15
> length=1032, char=0: 1.95 (-95.38%)  29.60 (-29.80%)  46.22 (  9.63%)  49.99 ( 18.58%)  42.16
> length=1033, char=0: 1.95 (-95.38%)  29.32 (-30.55%)  46.65 ( 10.49%)  49.95 ( 18.32%)  42.22
> length=1034, char=0: 1.95 (-95.39%)  29.45 (-30.31%)  46.67 ( 10.44%)  50.01 ( 18.36%)  42.25
> length=1035, char=0: 1.95 (-95.35%)  29.31 (-30.09%)  46.68 ( 11.34%)  50.02 ( 19.31%)  41.92
> length=1036, char=0: 1.95 (-95.40%)  29.30 (-30.75%)  46.66 ( 10.27%)  49.56 ( 17.12%)  42.32
> length=1037, char=0: 1.95 (-95.39%)  29.17 (-31.08%)  46.30 (  9.38%)  50.04 ( 18.22%)  42.33
> length=1038, char=0: 1.95 (-95.40%)  29.12 (-31.30%)  46.71 ( 10.19%)  50.02 ( 18.01%)  42.39
> length=1039, char=0: 1.95 (-95.40%)  29.19 (-31.20%)  46.73 ( 10.14%)  50.06 ( 18.00%)  42.43
>
> 	* sysdeps/aarch64/memset.S (do_no_zva): New macro.
> 	(do_zva_64): Likewise.
> 	(do_zva_128): Likewise.
> 	(do_zva_default): Likewise.
> 	(__memset): Rename to MEMSET macro.
> 	(MEMSET): Use the new macros.
> 	(MEMSET)[INTERNAL_MEMSET]: Retain old memset.
> 	(MEMSET)[!INTERNAL_MEMSET]: Remove zva check.
> 	* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
> 	Add memset_generic, memset_nozva and memset_zva.
> 	* sysdeps/aarch64/multiarch/ifunc-impl-list.c
> 	(__libc_ifunc_impl_list): Add memset ifuncs.
> 	* sysdeps/aarch64/multiarch/init-arch.h (INIT_ARCH): New
> 	static variable __aarch64_zva_size and local variable
> 	zva_size.
> 	* sysdeps/aarch64/multiarch/memset.c: New file.
> 	* sysdeps/aarch64/multiarch/memset_generic.S: New file.
> 	* sysdeps/aarch64/multiarch/memset_nozva.S: New file.
> 	* sysdeps/aarch64/multiarch/memset_zva.S: New file.
> 	* sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> 	(DCZID_DZP_MASK): New macro.
> 	(DCZID_BS_MASK): Likewise.
> 	(init_cpu_features): Read and set zva_size.
> 	* sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> 	(struct cpu_features): New member zva_size.
> ---
>  sysdeps/aarch64/memset.S                       | 248 +++++++++++++++----------
>  sysdeps/aarch64/multiarch/Makefile             |   2 +-
>  sysdeps/aarch64/multiarch/ifunc-impl-list.c    |   6 +
>  sysdeps/aarch64/multiarch/init-arch.h          |   9 +-
>  sysdeps/aarch64/multiarch/memset.c             |  47 +++++
>  sysdeps/aarch64/multiarch/memset_generic.S     |  27 +++
>  sysdeps/aarch64/multiarch/memset_nozva.S       |  22 +++
>  sysdeps/aarch64/multiarch/memset_zva.S         |  41 ++++
>  sysdeps/unix/sysv/linux/aarch64/cpu-features.c |  10 +
>  sysdeps/unix/sysv/linux/aarch64/cpu-features.h |   1 +
>  10 files changed, 313 insertions(+), 100 deletions(-)
>  create mode 100644 sysdeps/aarch64/multiarch/memset.c
>  create mode 100644 sysdeps/aarch64/multiarch/memset_generic.S
>  create mode 100644 sysdeps/aarch64/multiarch/memset_nozva.S
>  create mode 100644 sysdeps/aarch64/multiarch/memset_zva.S
>
> diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
> index 110fd22..8cff3a4 100644
> --- a/sysdeps/aarch64/memset.S
> +++ b/sysdeps/aarch64/memset.S
> @@ -37,7 +37,108 @@
>  #define zva_len x7
>  #define zva_lenw w7
>  
> -ENTRY_ALIGN (__memset, 6)
> +/* Macros that do the critical loops for either no zva or zva of 64 bytes, 128
> +   bytes and higher sizes.  */
> +
> +#ifndef ZVA_MACROS
> +# define ZVA_MACROS
> +/* No ZVA.  */
> +.macro do_no_zva
> +	sub	count, dstend, dst	/* Count is 16 too large.  */
> +	add	dst, dst, 16
> +	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
> +1:	stp	q0, q0, [dst], 64
> +	stp	q0, q0, [dst, -32]
> +	subs	count, count, 64
> +	b.hi	1b
> +	stp	q0, q0, [dstend, -64]
> +	stp	q0, q0, [dstend, -32]
> +	ret
> +.endm
> +
> +/* Write the first and last 64 byte aligned block using stp rather
> +   than using DC ZVA.  This is faster on some cores.  */
> +.macro do_zva_64
> +	str	q0, [dst, 16]
> +	stp	q0, q0, [dst, 32]
> +	bic	dst, dst, 63
> +	stp	q0, q0, [dst, 64]
> +	stp	q0, q0, [dst, 96]
> +	sub	count, dstend, dst	/* Count is now 128 too large.  */
> +	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
> +	add	dst, dst, 128
> +	nop
> +1:	dc	zva, dst
> +	add	dst, dst, 64
> +	subs	count, count, 64
> +	b.hi	1b
> +	stp	q0, q0, [dst, 0]
> +	stp	q0, q0, [dst, 32]
> +	stp	q0, q0, [dstend, -64]
> +	stp	q0, q0, [dstend, -32]
> +	ret
> +.endm
> +
> +/* ZVA size of 128 bytes.  */
> +.macro do_zva_128
> +	str	q0, [dst, 16]
> +	stp	q0, q0, [dst, 32]
> +	stp	q0, q0, [dst, 64]
> +	stp	q0, q0, [dst, 96]
> +	bic	dst, dst, 127
> +	sub	count, dstend, dst	/* Count is now 128 too large.  */
> +	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
> +	add	dst, dst, 128
> +1:	dc	zva, dst
> +	add	dst, dst, 128
> +	subs	count, count, 128
> +	b.hi	1b
> +	stp	q0, q0, [dstend, -128]
> +	stp	q0, q0, [dstend, -96]
> +	stp	q0, q0, [dstend, -64]
> +	stp	q0, q0, [dstend, -32]
> +	ret
> +.endm
> +
> +/* ZVA size of more than 128 bytes.  */
> +.macro do_zva_default
> +	add	tmp1, zva_len, 64	/* Max alignment bytes written.  */
> +	cmp	count, tmp1
> +	blo	MEMSET_L(no_zva)
> +
> +	sub	tmp2, zva_len, 1
> +	add	tmp1, dst, zva_len
> +	add	dst, dst, 16
> +	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
> +	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
> +	beq	2f
> +1:	stp	q0, q0, [dst], 64
> +	stp	q0, q0, [dst, -32]
> +	subs	count, count, 64
> +	b.hi	1b
> +2:	mov	dst, tmp1
> +	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
> +	subs	count, count, zva_len
> +	b.lo	4f
> +3:	dc	zva, dst
> +	add	dst, dst, zva_len
> +	subs	count, count, zva_len
> +	b.hs	3b
> +4:	add	count, count, zva_len
> +	subs	count, count, 64
> +	b.ls	6f
> +5:	stp	q0, q0, [dst], 64
> +	stp	q0, q0, [dst, -32]
> +	subs	count, count, 64
> +	b.hi	5b
> +6:	stp	q0, q0, [dstend, -64]
> +	stp	q0, q0, [dstend, -32]
> +	ret
> +.endm
> +#endif
> +
> +/* Memset entry point.  */
> +ENTRY_ALIGN (MEMSET, 6)
>  
>  	DELOUSE (0)
>  	DELOUSE (2)
> @@ -46,9 +147,9 @@ ENTRY_ALIGN (__memset, 6)
>  	add	dstend, dstin, count
>  
>  	cmp	count, 96
> -	b.hi	L(set_long)
> +	b.hi	MEMSET_L(set_long)
>  	cmp	count, 16
> -	b.hs	L(set_medium)
> +	b.hs	MEMSET_L(set_medium)
>  	mov	val, v0.D[0]
>  
>  	/* Set 0..15 bytes.  */
> @@ -68,9 +169,9 @@ ENTRY_ALIGN (__memset, 6)
>  3:	ret
>  
>  	/* Set 17..96 bytes.  */
> -L(set_medium):
> +MEMSET_L(set_medium):
>  	str	q0, [dstin]
> -	tbnz	count, 6, L(set96)
> +	tbnz	count, 6, MEMSET_L(set96)
>  	str	q0, [dstend, -16]
>  	tbz	count, 5, 1f
>  	str	q0, [dstin, 16]
> @@ -80,7 +181,7 @@ L(set_medium):
>  	.p2align 4
>  	/* Set 64..96 bytes.  Write 64 bytes from the start and
>  	   32 bytes from the end.  */
> -L(set96):
> +MEMSET_L(set96):
>  	str	q0, [dstin, 16]
>  	stp	q0, q0, [dstin, 32]
>  	stp	q0, q0, [dstend, -32]
> @@ -88,108 +189,63 @@ L(set96):
>  
>  	.p2align 3
>  	nop
> -L(set_long):
> +MEMSET_L(set_long):
> +#ifdef INTERNAL_MEMSET
>  	and	valw, valw, 255
>  	bic	dst, dstin, 15
>  	str	q0, [dstin]
>  	cmp	count, 256
>  	ccmp	valw, 0, 0, cs
> -	b.eq	L(try_zva)
> -L(no_zva):
> -	sub	count, dstend, dst	/* Count is 16 too large.  */
> -	add	dst, dst, 16
> -	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
> -1:	stp	q0, q0, [dst], 64
> -	stp	q0, q0, [dst, -32]
> -L(tail64):
> -	subs	count, count, 64
> -	b.hi	1b
> -2:	stp	q0, q0, [dstend, -64]
> -	stp	q0, q0, [dstend, -32]
> -	ret
> +	b.eq	MEMSET_L(try_zva)
>  
> -	.p2align 3
> -L(try_zva):
> +MEMSET_L(no_zva):
> +	do_no_zva
> +
> +	.p2align 4
> +MEMSET_L(try_zva):
>  	mrs	tmp1, dczid_el0
> -	tbnz	tmp1w, 4, L(no_zva)
>  	and	tmp1w, tmp1w, 15
>  	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
> -	b.ne	L(zva_128)
> +	b.ne	MEMSET_L(zva_128)
> +	do_zva_64
>  
> -	/* Write the first and last 64 byte aligned block using stp rather
> -	   than using DC ZVA.  This is faster on some cores.
> -	 */
> -L(zva_64):
> -	str	q0, [dst, 16]
> -	stp	q0, q0, [dst, 32]
> -	bic	dst, dst, 63
> -	stp	q0, q0, [dst, 64]
> -	stp	q0, q0, [dst, 96]
> -	sub	count, dstend, dst	/* Count is now 128 too large.  */
> -	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
> -	add	dst, dst, 128
> -	nop
> -1:	dc	zva, dst
> -	add	dst, dst, 64
> -	subs	count, count, 64
> -	b.hi	1b
> -	stp	q0, q0, [dst, 0]
> -	stp	q0, q0, [dst, 32]
> -	stp	q0, q0, [dstend, -64]
> -	stp	q0, q0, [dstend, -32]
> -	ret
> -
> -	.p2align 3
> -L(zva_128):
> +MEMSET_L(zva_128):
>  	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
> -	b.ne	L(zva_other)
> +	b.ne	MEMSET_L(zva_other)
> +	do_zva_128
>  
> -	str	q0, [dst, 16]
> -	stp	q0, q0, [dst, 32]
> -	stp	q0, q0, [dst, 64]
> -	stp	q0, q0, [dst, 96]
> -	bic	dst, dst, 127
> -	sub	count, dstend, dst	/* Count is now 128 too large.  */
> -	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
> -	add	dst, dst, 128
> -1:	dc	zva, dst
> -	add	dst, dst, 128
> -	subs	count, count, 128
> -	b.hi	1b
> -	stp	q0, q0, [dstend, -128]
> -	stp	q0, q0, [dstend, -96]
> -	stp	q0, q0, [dstend, -64]
> -	stp	q0, q0, [dstend, -32]
> -	ret
> -
> -L(zva_other):
> +MEMSET_L(zva_other):
>  	mov	tmp2w, 4
>  	lsl	zva_lenw, tmp2w, tmp1w
> -	add	tmp1, zva_len, 64	/* Max alignment bytes written.  */
> -	cmp	count, tmp1
> -	blo	L(no_zva)
> +	do_zva_default
> +#else
> +	/* Memset called through PLT, so we need only one of the ZVA
> +	   variants.  */
> +# ifdef MEMSET_ZVA
> +	and	valw, valw, 255
> +# endif
> +	bic	dst, dstin, 15
> +	str	q0, [dstin]
> +# ifdef MEMSET_ZVA
> +	cmp	count, 256
> +	ccmp	valw, 0, 0, cs
> +	b.eq	MEMSET_L(try_zva)
> +# endif
> +MEMSET_L(no_zva):
> +	do_no_zva
> +# if defined MEMSET_ZVA
> +MEMSET_L(try_zva):
> +#  if MEMSET_ZVA == 64
> +	do_zva_64
> +#  elif MEMSET_ZVA == 128
> +	do_zva_128
> +#  else
> +	adrp	zva_len, __aarch64_zva_size
> +	ldr	zva_len, [zva_len, #:lo12:__aarch64_zva_size]
> +	do_zva_default
> +#  endif
> +# endif
> +#endif
>  
> -	sub	tmp2, zva_len, 1
> -	add	tmp1, dst, zva_len
> -	add	dst, dst, 16
> -	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
> -	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
> -	beq	2f
> -1:	stp	q0, q0, [dst], 64
> -	stp	q0, q0, [dst, -32]
> -	subs	count, count, 64
> -	b.hi	1b
> -2:	mov	dst, tmp1
> -	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
> -	subs	count, count, zva_len
> -	b.lo	4f
> -3:	dc	zva, dst
> -	add	dst, dst, zva_len
> -	subs	count, count, zva_len
> -	b.hs	3b
> -4:	add	count, count, zva_len
> -	b	L(tail64)
> -
> -END (__memset)
> -weak_alias (__memset, memset)
> -libc_hidden_builtin_def (memset)
> +END (MEMSET)
> +libc_hidden_builtin_def (MEMSET)
> diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
> index 9aa1e79..f611182 100644
> --- a/sysdeps/aarch64/multiarch/Makefile
> +++ b/sysdeps/aarch64/multiarch/Makefile
> @@ -1,4 +1,4 @@
>  ifeq ($(subdir),string)
>  sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor \
> -		   memmove_falkor
> +		   memmove_falkor memset_generic memset_nozva memset_zva
>  endif
> diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> index 2cb74d5..29148ac 100644
> --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> @@ -46,6 +46,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
>  	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
>  	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
> +  IFUNC_IMPL (i, name, memset,
> +	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_nozva)
> +	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_zva_64)
> +	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 128), __memset_zva_128)
> +	      IFUNC_IMPL_ADD (array, i, memset, (zva_size > 0), __memset_zva_default)
> +	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
>  
>    return i;
>  }
> diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
> index 3af442c..541c27e 100644
> --- a/sysdeps/aarch64/multiarch/init-arch.h
> +++ b/sysdeps/aarch64/multiarch/init-arch.h
> @@ -18,6 +18,9 @@
>  
>  #include <ldsodefs.h>
>  
> -#define INIT_ARCH()				\
> -  uint64_t __attribute__((unused)) midr =	\
> -    GLRO(dl_aarch64_cpu_features).midr_el1;
> +#define INIT_ARCH()							\
> +  uint64_t __attribute__((unused)) midr =				\
> +    GLRO(dl_aarch64_cpu_features).midr_el1;				\
> +  extern unsigned __aarch64_zva_size;					\
> +  unsigned __attribute__((unused)) zva_size = __aarch64_zva_size =	\
> +    GLRO(dl_aarch64_cpu_features).zva_size;
> diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
> new file mode 100644
> index 0000000..58e669a
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset.c
> @@ -0,0 +1,47 @@
> +/* Multiple versions of memset.  AARCH64 version.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +/* Define multiple versions only for the definition in libc.  */
> +
> +#if IS_IN (libc)
> +/* Redefine memset so that the compiler won't complain about the type
> +   mismatch with the IFUNC selector in strong_alias, below.  */
> +# undef memset
> +# define memset __redirect_memset
> +# include <string.h>
> +# include <init-arch.h>
> +
> +unsigned __aarch64_zva_size;
> +
> +extern __typeof (__redirect_memset) __libc_memset;
> +
> +extern __typeof (__redirect_memset) __memset_nozva attribute_hidden;
> +extern __typeof (__redirect_memset) __memset_zva_64 attribute_hidden;
> +extern __typeof (__redirect_memset) __memset_zva_128 attribute_hidden;
> +extern __typeof (__redirect_memset) __memset_zva_default attribute_hidden;
> +
> +libc_ifunc (__libc_memset, (zva_size == 0 ? __memset_nozva
> +			    : (zva_size == 64 ? __memset_zva_64
> +			       : (zva_size == 128 ? __memset_zva_128
> +				  : __memset_zva_default))));
> +
> +# undef memset
> +strong_alias (__libc_memset, memset);
> +#else
> +#include <string/memset.c>
> +#endif
> diff --git a/sysdeps/aarch64/multiarch/memset_generic.S b/sysdeps/aarch64/multiarch/memset_generic.S
> new file mode 100644
> index 0000000..56f1e02
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset_generic.S
> @@ -0,0 +1,27 @@
> +/* Memset for aarch64, default version for internal use.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#define MEMSET __memset_generic
> +#define INTERNAL_MEMSET
> +#define MEMSET_L(label) L(label)
> +#ifdef SHARED
> +	.globl __GI_memset; __GI_memset = __memset_generic
> +#endif
> +
> +#include <sysdeps/aarch64/memset.S>
> diff --git a/sysdeps/aarch64/multiarch/memset_nozva.S b/sysdeps/aarch64/multiarch/memset_nozva.S
> new file mode 100644
> index 0000000..98045ac
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset_nozva.S
> @@ -0,0 +1,22 @@
> +/* Memset for aarch64, ZVA disabled.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#define MEMSET __memset_nozva
> +#define MEMSET_L(label) L(label)
> +#include <sysdeps/aarch64/memset.S>
> diff --git a/sysdeps/aarch64/multiarch/memset_zva.S b/sysdeps/aarch64/multiarch/memset_zva.S
> new file mode 100644
> index 0000000..5d02b89
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset_zva.S
> @@ -0,0 +1,41 @@
> +/* Memset for aarch64, ZVA enabled.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#if IS_IN (libc)
> +# define MEMSET __memset_zva_64
> +# define MEMSET_ZVA 64
> +# define MEMSET_L(label) L(label ## _zva64)
> +# include <sysdeps/aarch64/memset.S>
> +
> +# undef MEMSET
> +# undef MEMSET_ZVA
> +# undef MEMSET_L
> +# define MEMSET __memset_zva_128
> +# define MEMSET_ZVA 128
> +# define MEMSET_L(label) L(label ## _zva128)
> +# include <sysdeps/aarch64/memset.S>
> +
> +# undef MEMSET
> +# undef MEMSET_ZVA
> +# undef MEMSET_L
> +# define MEMSET __memset_zva_default
> +# define MEMSET_ZVA 1
> +# define MEMSET_L(label) L(label ## _zvadef)
> +# include <sysdeps/aarch64/memset.S>
> +#endif
> diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> index e769eeb..092ee81 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> @@ -20,6 +20,9 @@
>  #include <sys/auxv.h>
>  #include <elf/dl-hwcaps.h>
>  
> +#define DCZID_DZP_MASK (1 << 4)
> +#define DCZID_BS_MASK (0xf)
> +
>  #if HAVE_TUNABLES
>  struct cpu_list
>  {
> @@ -72,4 +75,11 @@ init_cpu_features (struct cpu_features *cpu_features)
>      }
>  
>    cpu_features->midr_el1 = midr;
> +
> +  /* Check if ZVA is enabled.  */
> +  unsigned dczid;
> +  asm volatile ("mrs %0, dczid_el0" : "=r"(dczid));
> +
> +  if ((dczid & DCZID_DZP_MASK) == 0)
> +    cpu_features->zva_size = 4 << (dczid & DCZID_BS_MASK);
>  }
> diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> index 73cb53d..f2b6afd 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> @@ -47,6 +47,7 @@
>  struct cpu_features
>  {
>    uint64_t midr_el1;
> +  unsigned zva_size;
>  };
>  
>  #endif /* _CPU_FEATURES_AARCH64_H */
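For readers following along, the dczid_el0 decode that the patch adds to
init_cpu_features can be tried standalone.  The snippet below is a
minimal sketch, not part of the patch, and assumes GCC-style inline asm
on an AArch64 target; it mirrors the DCZID_DZP_MASK/DCZID_BS_MASK logic
quoted above:

/* Sketch: decode dczid_el0 the way init_cpu_features above does.
   DZP (bit 4) set means DC ZVA is prohibited; BS (bits 0-3) is the
   log2 of the zeroing block size in 4-byte words, so the size in
   bytes is 4 << BS (BS = 4 gives the common 64-byte block).  */
#include <stdio.h>

#define DCZID_DZP_MASK (1 << 4)
#define DCZID_BS_MASK  (0xf)

int
main (void)
{
  unsigned long dczid;
  asm volatile ("mrs %0, dczid_el0" : "=r" (dczid));

  unsigned zva_size = 0;
  if ((dczid & DCZID_DZP_MASK) == 0)
    zva_size = 4 << (dczid & DCZID_BS_MASK);

  printf ("DC ZVA %s, zva_size = %u bytes\n",
          zva_size ? "enabled" : "disabled", zva_size);
  return 0;
}

On a core with a 64-byte ZVA block this prints zva_size = 64, which is
exactly the value the ifunc selector in memset.c keys on.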
On 19/09/2017 08:23, Siddhesh Poyarekar wrote:
> The DZP bit in the dczid_el0 register does not change dynamically, so
> it is safe to read once during program startup.  Hoist the zva check
> into an ifunc resolver and store the result into a static variable,
> which can be read in case of non-standard zva sizes.  This effectively
> adds 3 ifunc variants for memset - one for cases where zva is
> disabled, one for 64 byte zva and another for 128 byte zva.  I have
> retained the older memset as __memset_generic for internal libc.so use
> so that the change impact is minimal.  We should eventually have a
> discussion on what is more expensive, reading dczid_el0 on every
> memset invocation or the indirection due to PLT.
>
> The gains due to this are significant for falkor, with gains as high
> as 80% in some cases.  Likewise for mustang, although the numbers are
> slightly lower.  Here's a sample from the falkor tests:

I would use a more compact ChangeLog entry as:

	* sysdeps/aarch64/memset.S (do_no_zva, do_zva_64,
	do_zva_128, do_zva_default): New macro.

Same for the other entries where it applies.

[... full patch quoted; the unchanged hunks are identical to the diff
in the posting above and are trimmed here ...]

> +libc_ifunc (__libc_memset, (zva_size == 0 ? __memset_nozva
> +			    : (zva_size == 64 ? __memset_zva_64
> +			       : (zva_size == 128 ? __memset_zva_128
> +				  : __memset_zva_default))));
> +
> +# undef memset
> +strong_alias (__libc_memset, memset);
> +#else
> +#include <string/memset.c>
> +#endif

You don't need to use the default version for the loader; you can use
the generic sysdeps/aarch64/memset.S by creating a rtld-memset.S in
multiarch and defining the required macros.

> +#define MEMSET __memset_generic
> +#define INTERNAL_MEMSET
> +#define MEMSET_L(label) L(label)
> +#ifdef SHARED
> +	.globl __GI_memset; __GI_memset = __memset_generic
> +#endif

I would add a comment stating that this is essentially doing
libc_hidden_def (memset) and redirecting the internal implementation
to __memset_generic.

> +#define MEMSET __memset_nozva
> +#define MEMSET_L(label) L(label)
> +#include <sysdeps/aarch64/memset.S>

Although not strictly required, I think it should avoid building these
for !IS_IN(libc), as for memset_zva.S.  The same applies to
memset_generic.S.

[... remaining quoted hunks trimmed ...]

Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
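The rtld-memset.S suggested above would presumably follow the same
pattern as the other wrapper files in the patch.  The following is a
hypothetical sketch only; the actual file appears in the updated patch,
which is not shown in this thread:

/* Hypothetical sysdeps/aarch64/multiarch/rtld-memset.S: build the
   generic memset directly for the loader, which cannot go through
   ifunc dispatch.  INTERNAL_MEMSET keeps the runtime dczid_el0
   check from the pre-ifunc code path.  */
#define MEMSET memset
#define INTERNAL_MEMSET
#define MEMSET_L(label) L(label)
#include <sysdeps/aarch64/memset.S>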
On Friday 29 September 2017 12:42 AM, Adhemerval Zanella wrote:
> I would use a more compact ChangeLog entry as:
>
> 	* sysdeps/aarch64/memset.S (do_no_zva, do_zva_64,
> 	do_zva_128, do_zva_default): New macro.
>
> Same for the other entries where it applies.

Done.

> You don't need to use the default version for the loader; you can use
> the generic sysdeps/aarch64/memset.S by creating a rtld-memset.S in
> multiarch and defining the required macros.

Done.

> I would add a comment stating that this is essentially doing
> libc_hidden_def (memset) and redirecting the internal implementation
> to __memset_generic.

Done.

> Although not strictly required, I think it should avoid building these
> for !IS_IN(libc), as for memset_zva.S.  The same applies to
> memset_generic.S.

Done.

Posting updated patch shortly.

Thanks,
Siddhesh
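For reference, the comment requested for memset_generic.S might read
along the lines below.  The wording is an assumption, since the updated
patch is not part of this thread:

/* Hypothetical wording for the requested comment: the alias below is
   libc_hidden_def (memset) spelled out by hand.  It binds internal
   libc.so references to memset directly to __memset_generic, avoiding
   the PLT and the ifunc resolver.  */
#ifdef SHARED
	.globl __GI_memset; __GI_memset = __memset_generic
#endif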
======================================================================================================================== length=256, char=0: 1.82 (-87.26%) 26.99 ( 89.30%) 25.49 ( 78.76%) 23.48 ( 64.65%) 14.26 length=257, char=0: 1.82 (-87.29%) 26.97 ( 88.44%) 25.77 ( 80.12%) 24.41 ( 70.57%) 14.31 length=258, char=0: 1.82 (-87.38%) 26.27 ( 82.29%) 25.84 ( 79.28%) 24.33 ( 68.80%) 14.41 length=259, char=0: 1.82 (-87.36%) 26.06 ( 81.15%) 25.72 ( 78.84%) 24.57 ( 70.80%) 14.38 length=260, char=0: 1.82 (-87.44%) 25.35 ( 75.23%) 25.93 ( 79.23%) 24.34 ( 68.24%) 14.47 length=261, char=0: 1.82 (-87.49%) 26.15 ( 79.70%) 26.01 ( 78.72%) 24.44 ( 67.97%) 14.55 length=262, char=0: 1.82 (-87.54%) 25.91 ( 77.31%) 26.06 ( 78.35%) 24.33 ( 66.49%) 14.61 length=263, char=0: 1.82 (-87.54%) 25.69 ( 75.80%) 25.96 ( 77.63%) 24.54 ( 67.90%) 14.61 length=264, char=0: 1.82 (-87.57%) 25.31 ( 72.69%) 26.16 ( 78.43%) 24.63 ( 68.00%) 14.66 length=265, char=0: 1.82 (-87.65%) 25.29 ( 71.35%) 26.25 ( 77.84%) 24.58 ( 66.53%) 14.76 length=266, char=0: 1.82 (-87.69%) 25.10 ( 69.40%) 26.15 ( 76.48%) 24.77 ( 67.22%) 14.82 length=267, char=0: 1.82 (-87.69%) 24.89 ( 68.02%) 26.20 ( 76.90%) 24.87 ( 67.91%) 14.81 length=268, char=0: 1.82 (-87.74%) 24.07 ( 62.04%) 26.40 ( 77.74%) 24.95 ( 67.93%) 14.85 length=269, char=0: 1.82 (-87.80%) 23.82 ( 59.29%) 26.47 ( 77.00%) 24.89 ( 66.43%) 14.96 length=270, char=0: 1.82 (-87.84%) 23.65 ( 57.61%) 26.35 ( 75.58%) 25.07 ( 67.07%) 15.01 length=271, char=0: 1.83 (-87.82%) 23.48 ( 56.53%) 26.39 ( 75.93%) 25.15 ( 67.66%) 15.00 length=512, char=0: 1.90 (-92.59%) 29.25 ( 13.81%) 36.30 ( 41.27%) 40.95 ( 59.36%) 25.70 length=513, char=0: 1.90 (-92.57%) 29.29 ( 14.35%) 36.63 ( 43.01%) 40.80 ( 59.28%) 25.61 length=514, char=0: 1.90 (-92.62%) 28.61 ( 10.91%) 36.64 ( 42.05%) 40.89 ( 58.52%) 25.80 length=515, char=0: 1.90 (-92.63%) 28.74 ( 11.29%) 36.68 ( 42.06%) 40.56 ( 57.08%) 25.82 length=516, char=0: 1.90 (-92.65%) 28.33 ( 9.54%) 36.72 ( 41.96%) 40.09 ( 55.01%) 25.87 length=517, char=0: 1.90 (-92.66%) 28.41 ( 9.60%) 36.80 ( 41.97%) 39.43 ( 52.13%) 25.92 length=518, char=0: 1.90 (-92.66%) 28.16 ( 8.45%) 36.84 ( 41.89%) 39.40 ( 51.77%) 25.96 length=519, char=0: 1.90 (-92.67%) 28.21 ( 8.58%) 36.86 ( 41.86%) 40.39 ( 55.46%) 25.98 length=520, char=0: 1.90 (-92.65%) 27.53 ( 6.32%) 36.90 ( 42.49%) 40.80 ( 57.58%) 25.89 length=521, char=0: 1.90 (-92.69%) 27.53 ( 5.65%) 36.61 ( 40.50%) 40.86 ( 56.81%) 26.05 length=522, char=0: 1.90 (-92.66%) 27.40 ( 5.59%) 36.95 ( 42.35%) 40.92 ( 57.64%) 25.95 length=523, char=0: 1.91 (-92.71%) 27.50 ( 5.29%) 36.69 ( 40.45%) 40.97 ( 56.84%) 26.12 length=524, char=0: 1.90 (-92.69%) 27.33 ( 5.02%) 37.00 ( 42.18%) 40.98 ( 57.50%) 26.02 length=525, char=0: 1.91 (-92.72%) 27.24 ( 4.04%) 36.70 ( 40.13%) 41.04 ( 56.72%) 26.19 length=526, char=0: 1.90 (-92.70%) 27.06 ( 3.73%) 37.06 ( 42.05%) 41.08 ( 57.44%) 26.09 length=527, char=0: 1.91 (-92.74%) 26.82 ( 2.17%) 37.06 ( 41.17%) 41.11 ( 56.62%) 26.25 length=1024, char=0: 1.95 (-95.35%) 30.55 (-27.12%) 46.52 ( 10.99%) 49.89 ( 19.02%) 41.91 length=1025, char=0: 1.95 (-95.31%) 30.58 (-26.47%) 46.57 ( 11.98%) 49.92 ( 20.05%) 41.59 length=1026, char=0: 1.95 (-95.36%) 30.35 (-27.70%) 46.56 ( 10.92%) 49.45 ( 17.79%) 41.98 length=1027, char=0: 1.95 (-95.36%) 30.24 (-28.02%) 46.20 ( 9.98%) 49.93 ( 18.88%) 42.00 length=1028, char=0: 1.95 (-95.37%) 29.75 (-29.25%) 46.58 ( 10.76%) 49.92 ( 18.71%) 42.05 length=1029, char=0: 1.95 (-95.37%) 29.78 (-29.24%) 46.57 ( 10.65%) 49.96 ( 18.72%) 42.08 length=1030, char=0: 1.95 (-95.33%) 29.77 (-28.73%) 46.63 
( 11.63%) 49.97 ( 19.64%) 41.77 length=1031, char=0: 1.95 (-95.37%) 29.64 (-29.68%) 46.62 ( 10.59%) 49.51 ( 17.46%) 42.15 length=1032, char=0: 1.95 (-95.38%) 29.60 (-29.80%) 46.22 ( 9.63%) 49.99 ( 18.58%) 42.16 length=1033, char=0: 1.95 (-95.38%) 29.32 (-30.55%) 46.65 ( 10.49%) 49.95 ( 18.32%) 42.22 length=1034, char=0: 1.95 (-95.39%) 29.45 (-30.31%) 46.67 ( 10.44%) 50.01 ( 18.36%) 42.25 length=1035, char=0: 1.95 (-95.35%) 29.31 (-30.09%) 46.68 ( 11.34%) 50.02 ( 19.31%) 41.92 length=1036, char=0: 1.95 (-95.40%) 29.30 (-30.75%) 46.66 ( 10.27%) 49.56 ( 17.12%) 42.32 length=1037, char=0: 1.95 (-95.39%) 29.17 (-31.08%) 46.30 ( 9.38%) 50.04 ( 18.22%) 42.33 length=1038, char=0: 1.95 (-95.40%) 29.12 (-31.30%) 46.71 ( 10.19%) 50.02 ( 18.01%) 42.39 length=1039, char=0: 1.95 (-95.40%) 29.19 (-31.20%) 46.73 ( 10.14%) 50.06 ( 18.00%) 42.43 * sysdeps/aarch64/memset.S (do_no_zva): New macro. (do_zva_64): Likewise. (do_zva_128): Likewise. (do_zva_default): Likewise. (__memset): Rename to MEMSET macro. (MEMSET): Use the new macros. (MEMSET)[INTERNAL_MEMSET]: Retain old memset. (MEMSET)[!INTERNAL_MEMSET]: Remove zva check. * sysdeps/aarch64/multiarch/Makefile (sysdep_routines): Add memset_generic, memset_nozva and memset_zva. * sysdeps/aarch64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list): Add memset ifuncs. * sysdeps/aarch64/multiarch/init-arch.h (INIT_ARCH): New static variable __aarch64_zva_size and local variable zva_size. * sysdeps/aarch64/multiarch/memset.c: New file. * sysdeps/aarch64/multiarch/memset_generic.S: New file. * sysdeps/aarch64/multiarch/memset_nozva.S: New file. * sysdeps/aarch64/multiarch/memset_zva.S: New file. * sysdeps/unix/sysv/linux/aarch64/cpu-features.c (DCZID_DZP_MASK): New macro. (DCZID_BS_MASK): Likewise. (init_cpu_features): Read and set zva_size. * sysdeps/unix/sysv/linux/aarch64/cpu-features.h (struct cpu_features): New member zva_size. --- sysdeps/aarch64/memset.S | 248 +++++++++++++++---------- sysdeps/aarch64/multiarch/Makefile | 2 +- sysdeps/aarch64/multiarch/ifunc-impl-list.c | 6 + sysdeps/aarch64/multiarch/init-arch.h | 9 +- sysdeps/aarch64/multiarch/memset.c | 47 +++++ sysdeps/aarch64/multiarch/memset_generic.S | 27 +++ sysdeps/aarch64/multiarch/memset_nozva.S | 22 +++ sysdeps/aarch64/multiarch/memset_zva.S | 41 ++++ sysdeps/unix/sysv/linux/aarch64/cpu-features.c | 10 + sysdeps/unix/sysv/linux/aarch64/cpu-features.h | 1 + 10 files changed, 313 insertions(+), 100 deletions(-) create mode 100644 sysdeps/aarch64/multiarch/memset.c create mode 100644 sysdeps/aarch64/multiarch/memset_generic.S create mode 100644 sysdeps/aarch64/multiarch/memset_nozva.S create mode 100644 sysdeps/aarch64/multiarch/memset_zva.S diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S index 110fd22..8cff3a4 100644 --- a/sysdeps/aarch64/memset.S +++ b/sysdeps/aarch64/memset.S @@ -37,7 +37,108 @@ #define zva_len x7 #define zva_lenw w7 -ENTRY_ALIGN (__memset, 6) +/* Macros that do the critical loops for either no zva or zva of 64 bytes, 128 + bytes and higher sizes. */ + +#ifndef ZVA_MACROS +# define ZVA_MACROS +/* No ZVA. */ +.macro do_no_zva + sub count, dstend, dst /* Count is 16 too large. */ + add dst, dst, 16 + sub count, count, 64 + 16 /* Adjust count and bias for loop. */ +1: stp q0, q0, [dst], 64 + stp q0, q0, [dst, -32] + subs count, count, 64 + b.hi 1b + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret +.endm + +/* Write the first and last 64 byte aligned block using stp rather + than using DC ZVA. This is faster on some cores. 
*/ +.macro do_zva_64 + str q0, [dst, 16] + stp q0, q0, [dst, 32] + bic dst, dst, 63 + stp q0, q0, [dst, 64] + stp q0, q0, [dst, 96] + sub count, dstend, dst /* Count is now 128 too large. */ + sub count, count, 128+64+64 /* Adjust count and bias for loop. */ + add dst, dst, 128 + nop +1: dc zva, dst + add dst, dst, 64 + subs count, count, 64 + b.hi 1b + stp q0, q0, [dst, 0] + stp q0, q0, [dst, 32] + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret +.endm + +/* ZVA size of 128 bytes. */ +.macro do_zva_128 + str q0, [dst, 16] + stp q0, q0, [dst, 32] + stp q0, q0, [dst, 64] + stp q0, q0, [dst, 96] + bic dst, dst, 127 + sub count, dstend, dst /* Count is now 128 too large. */ + sub count, count, 128+128 /* Adjust count and bias for loop. */ + add dst, dst, 128 +1: dc zva, dst + add dst, dst, 128 + subs count, count, 128 + b.hi 1b + stp q0, q0, [dstend, -128] + stp q0, q0, [dstend, -96] + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret +.endm + +/* ZVA size of more than 128 bytes. */ +.macro do_zva_default + add tmp1, zva_len, 64 /* Max alignment bytes written. */ + cmp count, tmp1 + blo MEMSET_L(no_zva) + + sub tmp2, zva_len, 1 + add tmp1, dst, zva_len + add dst, dst, 16 + subs count, tmp1, dst /* Actual alignment bytes to write. */ + bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */ + beq 2f +1: stp q0, q0, [dst], 64 + stp q0, q0, [dst, -32] + subs count, count, 64 + b.hi 1b +2: mov dst, tmp1 + sub count, dstend, tmp1 /* Remaining bytes to write. */ + subs count, count, zva_len + b.lo 4f +3: dc zva, dst + add dst, dst, zva_len + subs count, count, zva_len + b.hs 3b +4: add count, count, zva_len + subs count, count, 64 + b.ls 6f +5: stp q0, q0, [dst], 64 + stp q0, q0, [dst, -32] + subs count, count, 64 + b.hi 5b +6: stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret +.endm +#endif + +/* Memset entry point. */ +ENTRY_ALIGN (MEMSET, 6) DELOUSE (0) DELOUSE (2) @@ -46,9 +147,9 @@ ENTRY_ALIGN (__memset, 6) add dstend, dstin, count cmp count, 96 - b.hi L(set_long) + b.hi MEMSET_L(set_long) cmp count, 16 - b.hs L(set_medium) + b.hs MEMSET_L(set_medium) mov val, v0.D[0] /* Set 0..15 bytes. */ @@ -68,9 +169,9 @@ ENTRY_ALIGN (__memset, 6) 3: ret /* Set 17..96 bytes. */ -L(set_medium): +MEMSET_L(set_medium): str q0, [dstin] - tbnz count, 6, L(set96) + tbnz count, 6, MEMSET_L(set96) str q0, [dstend, -16] tbz count, 5, 1f str q0, [dstin, 16] @@ -80,7 +181,7 @@ L(set_medium): .p2align 4 /* Set 64..96 bytes. Write 64 bytes from the start and 32 bytes from the end. */ -L(set96): +MEMSET_L(set96): str q0, [dstin, 16] stp q0, q0, [dstin, 32] stp q0, q0, [dstend, -32] @@ -88,108 +189,63 @@ L(set96): .p2align 3 nop -L(set_long): +MEMSET_L(set_long): +#ifdef INTERNAL_MEMSET and valw, valw, 255 bic dst, dstin, 15 str q0, [dstin] cmp count, 256 ccmp valw, 0, 0, cs - b.eq L(try_zva) -L(no_zva): - sub count, dstend, dst /* Count is 16 too large. */ - add dst, dst, 16 - sub count, count, 64 + 16 /* Adjust count and bias for loop. */ -1: stp q0, q0, [dst], 64 - stp q0, q0, [dst, -32] -L(tail64): - subs count, count, 64 - b.hi 1b -2: stp q0, q0, [dstend, -64] - stp q0, q0, [dstend, -32] - ret + b.eq MEMSET_L(try_zva) - .p2align 3 -L(try_zva): +MEMSET_L(no_zva): + do_no_zva + + .p2align 4 +MEMSET_L(try_zva): mrs tmp1, dczid_el0 - tbnz tmp1w, 4, L(no_zva) and tmp1w, tmp1w, 15 cmp tmp1w, 4 /* ZVA size is 64 bytes. */ - b.ne L(zva_128) + b.ne MEMSET_L(zva_128) + do_zva_64 - /* Write the first and last 64 byte aligned block using stp rather - than using DC ZVA. 
This is faster on some cores. - */ -L(zva_64): - str q0, [dst, 16] - stp q0, q0, [dst, 32] - bic dst, dst, 63 - stp q0, q0, [dst, 64] - stp q0, q0, [dst, 96] - sub count, dstend, dst /* Count is now 128 too large. */ - sub count, count, 128+64+64 /* Adjust count and bias for loop. */ - add dst, dst, 128 - nop -1: dc zva, dst - add dst, dst, 64 - subs count, count, 64 - b.hi 1b - stp q0, q0, [dst, 0] - stp q0, q0, [dst, 32] - stp q0, q0, [dstend, -64] - stp q0, q0, [dstend, -32] - ret - - .p2align 3 -L(zva_128): +MEMSET_L(zva_128): cmp tmp1w, 5 /* ZVA size is 128 bytes. */ - b.ne L(zva_other) + b.ne MEMSET_L(zva_other) + do_zva_128 - str q0, [dst, 16] - stp q0, q0, [dst, 32] - stp q0, q0, [dst, 64] - stp q0, q0, [dst, 96] - bic dst, dst, 127 - sub count, dstend, dst /* Count is now 128 too large. */ - sub count, count, 128+128 /* Adjust count and bias for loop. */ - add dst, dst, 128 -1: dc zva, dst - add dst, dst, 128 - subs count, count, 128 - b.hi 1b - stp q0, q0, [dstend, -128] - stp q0, q0, [dstend, -96] - stp q0, q0, [dstend, -64] - stp q0, q0, [dstend, -32] - ret - -L(zva_other): +MEMSET_L(zva_other): mov tmp2w, 4 lsl zva_lenw, tmp2w, tmp1w - add tmp1, zva_len, 64 /* Max alignment bytes written. */ - cmp count, tmp1 - blo L(no_zva) + do_zva_default +#else + /* Memset called through PLT, so we need only one of the ZVA + variants. */ +# ifdef MEMSET_ZVA + and valw, valw, 255 +# endif + bic dst, dstin, 15 + str q0, [dstin] +# ifdef MEMSET_ZVA + cmp count, 256 + ccmp valw, 0, 0, cs + b.eq MEMSET_L(try_zva) +# endif +MEMSET_L(no_zva): + do_no_zva +# if defined MEMSET_ZVA +MEMSET_L(try_zva): +# if MEMSET_ZVA == 64 + do_zva_64 +# elif MEMSET_ZVA == 128 + do_zva_128 +# else + adrp zva_len, __aarch64_zva_size + ldr zva_len, [zva_len, #:lo12:__aarch64_zva_size] + do_zva_default +# endif +# endif +#endif - sub tmp2, zva_len, 1 - add tmp1, dst, zva_len - add dst, dst, 16 - subs count, tmp1, dst /* Actual alignment bytes to write. */ - bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */ - beq 2f -1: stp q0, q0, [dst], 64 - stp q0, q0, [dst, -32] - subs count, count, 64 - b.hi 1b -2: mov dst, tmp1 - sub count, dstend, tmp1 /* Remaining bytes to write. 
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 9aa1e79..f611182 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,4 +1,4 @@
 ifeq ($(subdir),string)
 sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor \
-		   memmove_falkor
+		   memmove_falkor memset_generic memset_nozva memset_zva
 endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index 2cb74d5..29148ac 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -46,6 +46,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
+  IFUNC_IMPL (i, name, memset,
+	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_nozva)
+	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_zva_64)
+	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 128), __memset_zva_128)
+	      IFUNC_IMPL_ADD (array, i, memset, (zva_size > 0), __memset_zva_default)
+	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
 
   return i;
 }
diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
index 3af442c..541c27e 100644
--- a/sysdeps/aarch64/multiarch/init-arch.h
+++ b/sysdeps/aarch64/multiarch/init-arch.h
@@ -18,6 +18,9 @@
 
 #include <ldsodefs.h>
 
-#define INIT_ARCH()				\
-  uint64_t __attribute__((unused)) midr =	\
-    GLRO(dl_aarch64_cpu_features).midr_el1;
+#define INIT_ARCH()					\
+  uint64_t __attribute__((unused)) midr =		\
+    GLRO(dl_aarch64_cpu_features).midr_el1;		\
+  extern unsigned __aarch64_zva_size;			\
+  unsigned __attribute__((unused)) zva_size = __aarch64_zva_size = \
+    GLRO(dl_aarch64_cpu_features).zva_size;
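The zva_size guards in the impl-list mirror the resolver that memset.c (next
file) installs.  Spelled out as plain C, the selection order is (illustrative
only; the real code expands the libc_ifunc macro shown below):

    #include <stddef.h>

    extern void *__memset_nozva (void *, int, size_t);
    extern void *__memset_zva_64 (void *, int, size_t);
    extern void *__memset_zva_128 (void *, int, size_t);
    extern void *__memset_zva_default (void *, int, size_t);

    typedef void *(*memset_fn) (void *, int, size_t);

    /* Pick an implementation from the cached ZVA size: 0 means DC ZVA
       is prohibited (DZP set), so the nozva variant is chosen.  */
    static memset_fn
    select_memset (unsigned zva_size)
    {
      return zva_size == 0 ? __memset_nozva
             : zva_size == 64 ? __memset_zva_64
             : zva_size == 128 ? __memset_zva_128
             : __memset_zva_default;
    }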
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
new file mode 100644
index 0000000..58e669a
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -0,0 +1,47 @@
+/* Multiple versions of memset. AARCH64 version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+
+#if IS_IN (libc)
+/* Redefine memset so that the compiler won't complain about the type
+   mismatch with the IFUNC selector in strong_alias, below.  */
+# undef memset
+# define memset __redirect_memset
+# include <string.h>
+# include <init-arch.h>
+
+unsigned __aarch64_zva_size;
+
+extern __typeof (__redirect_memset) __libc_memset;
+
+extern __typeof (__redirect_memset) __memset_nozva attribute_hidden;
+extern __typeof (__redirect_memset) __memset_zva_64 attribute_hidden;
+extern __typeof (__redirect_memset) __memset_zva_128 attribute_hidden;
+extern __typeof (__redirect_memset) __memset_zva_default attribute_hidden;
+
+libc_ifunc (__libc_memset, (zva_size == 0 ? __memset_nozva
+			    : (zva_size == 64 ? __memset_zva_64
+			       : (zva_size == 128 ? __memset_zva_128
+				  : __memset_zva_default))));
+
+# undef memset
+strong_alias (__libc_memset, memset);
+#else
+#include <string/memset.c>
+#endif
diff --git a/sysdeps/aarch64/multiarch/memset_generic.S b/sysdeps/aarch64/multiarch/memset_generic.S
new file mode 100644
index 0000000..56f1e02
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_generic.S
@@ -0,0 +1,27 @@
+/* Memset for aarch64, default version for internal use.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define MEMSET __memset_generic
+#define INTERNAL_MEMSET
+#define MEMSET_L(label) L(label)
+#ifdef SHARED
+	.globl __GI_memset; __GI_memset = __memset_generic
+#endif
+
+#include <sysdeps/aarch64/memset.S>
diff --git a/sysdeps/aarch64/multiarch/memset_nozva.S b/sysdeps/aarch64/multiarch/memset_nozva.S
new file mode 100644
index 0000000..98045ac
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_nozva.S
@@ -0,0 +1,22 @@
+/* Memset for aarch64, ZVA disabled.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define MEMSET __memset_nozva
+#define MEMSET_L(label) L(label)
+#include <sysdeps/aarch64/memset.S>
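The file that follows is where the MEMSET_L indirection pays off: memset.S is
included three times into one object, so each instantiation pastes a distinct
suffix onto its local labels, while the single-variant files above map
MEMSET_L(label) to a plain L(label).  A standalone illustration of the token
pasting (the L() stand-in here is mine; glibc's real macro lives in sysdep.h):

    #include <stdio.h>

    #define L(label) ".L" #label        /* Stand-in for glibc's L().  */
    #define MEMSET_L(label) L(label ## _zva64)

    int
    main (void)
    {
      /* Prints ".Lset_long_zva64": the suffix keeps the three copies
         of memset.S from colliding on label names.  */
      puts (MEMSET_L (set_long));
      return 0;
    }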
diff --git a/sysdeps/aarch64/multiarch/memset_zva.S b/sysdeps/aarch64/multiarch/memset_zva.S
new file mode 100644
index 0000000..5d02b89
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_zva.S
@@ -0,0 +1,41 @@
+/* Memset for aarch64, ZVA enabled.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# define MEMSET __memset_zva_64
+# define MEMSET_ZVA 64
+# define MEMSET_L(label) L(label ## _zva64)
+# include <sysdeps/aarch64/memset.S>
+
+# undef MEMSET
+# undef MEMSET_ZVA
+# undef MEMSET_L
+# define MEMSET __memset_zva_128
+# define MEMSET_ZVA 128
+# define MEMSET_L(label) L(label ## _zva128)
+# include <sysdeps/aarch64/memset.S>
+
+# undef MEMSET
+# undef MEMSET_ZVA
+# undef MEMSET_L
+# define MEMSET __memset_zva_default
+# define MEMSET_ZVA 1
+# define MEMSET_L(label) L(label ## _zvadef)
+# include <sysdeps/aarch64/memset.S>
+#endif
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
index e769eeb..092ee81 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
@@ -20,6 +20,9 @@
 #include <sys/auxv.h>
 #include <elf/dl-hwcaps.h>
 
+#define DCZID_DZP_MASK (1 << 4)
+#define DCZID_BS_MASK (0xf)
+
 #if HAVE_TUNABLES
 struct cpu_list
 {
@@ -72,4 +75,11 @@ init_cpu_features (struct cpu_features *cpu_features)
     }
 
   cpu_features->midr_el1 = midr;
+
+  /* Check if ZVA is enabled.  */
+  unsigned dczid;
+  asm volatile ("mrs %0, dczid_el0" : "=r"(dczid));
+
+  if ((dczid & DCZID_DZP_MASK) == 0)
+    cpu_features->zva_size = 4 << (dczid & DCZID_BS_MASK);
 }
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
index 73cb53d..f2b6afd 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
@@ -47,6 +47,7 @@
 struct cpu_features
 {
   uint64_t midr_el1;
+  unsigned zva_size;
 };
 
 #endif /* _CPU_FEATURES_AARCH64_H */
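Finally, the dczid_el0 decoding added to init_cpu_features can be
sanity-checked on hardware with a small standalone program (aarch64 only;
note I read the register into a uint64_t since mrs targets a 64-bit
register -- an assumption on my part, the patch reads it into unsigned):

    #include <stdint.h>
    #include <stdio.h>

    int
    main (void)
    {
      uint64_t dczid;
      asm volatile ("mrs %0, dczid_el0" : "=r" (dczid));

      if (dczid & (1 << 4))        /* DCZID_DZP_MASK: ZVA prohibited.  */
        puts ("DC ZVA disabled; zva_size stays 0");
      else                         /* DCZID_BS_MASK: block size field.  */
        printf ("zva_size = %u bytes\n", 4u << (dczid & 0xf));
      return 0;
    }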