@@ -20,23 +20,14 @@
*
* ARMv8-a, AArch64
* Unaligned accesses
- *
*/
#include <sysdep.h>
-/* By default we assume that the DC instruction can be used to zero
- data blocks more efficiently. In some circumstances this might be
- unsafe, for example in an asymmetric multiprocessor environment with
- different DC clear lengths (neither the upper nor lower lengths are
- safe to use). The feature can be disabled by defining DONT_USE_DC.
-
- If code may be run in a virtualized environment, then define
- MAYBE_VIRT. This will cause the code to cache the system register
- values rather than re-reading them each call. */
-
#define dstin x0
-#define val w1
+#define dstin_w w0
+#define val x1
+#define valw w1
#define count x2
#define tmp1 x3
#define tmp1w w3
@@ -44,186 +35,186 @@
#define tmp2w w4
#define zva_len_x x5
#define zva_len w5
-#define zva_bits_x x6
-
-#define A_l x7
-#define A_lw w7
+#define zva_mask_x x6
+#define zva_mask w6
#define dst x8
-#define tmp3w w9
-
-ENTRY_ALIGN (__memset, 6)
-
- mov dst, dstin /* Preserve return value. */
- ands A_lw, val, #255
-#ifndef DONT_USE_DC
- b.eq L(zero_mem)
-#endif
- orr A_lw, A_lw, A_lw, lsl #8
- orr A_lw, A_lw, A_lw, lsl #16
- orr A_l, A_l, A_l, lsl #32
-L(tail_maybe_long):
- cmp count, #64
- b.ge L(not_short)
-L(tail_maybe_tiny):
- cmp count, #15
- b.le L(tail15tiny)
-L(tail63):
- ands tmp1, count, #0x30
- b.eq L(tail15)
- add dst, dst, tmp1
- cmp tmp1w, #0x20
- b.eq 1f
- b.lt 2f
- stp A_l, A_l, [dst, #-48]
-1:
- stp A_l, A_l, [dst, #-32]
-2:
- stp A_l, A_l, [dst, #-16]
-
-L(tail15):
- and count, count, #15
- add dst, dst, count
- stp A_l, A_l, [dst, #-16] /* Repeat some/all of last store. */
+#define dst_w w8
+#define dstend x9
+
+ .globl memset
+ cfi_startproc
+
+#if HAVE_IFUNC && !defined (IS_IN_rtld)
+/* Rather than decode dczid_el0 on every call, checking whether ZVA is
+   disabled and unpacking the line size, do this once in the indirect
+   function and choose an appropriate entry point which encodes these
+   values as constants.  */
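+
+/* In dczid_el0, bits [3:0] give log2 of the block size in 4-byte words
+   and bit [4] is set when DC ZVA is prohibited.  So the value masked
+   below is 4 for an enabled 64-byte line (16 words) and 5 for an
+   enabled 128-byte line (32 words).  */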
+
+ .type memset, %gnu_indirect_function
+memset:
+ mrs x1, dczid_el0
+ and x1, x1, #31 /* isolate line size + disable bit */
+
+ cmp x1, #4 /* 64 byte line size, enabled */
+ b.ne 1f
+ adr x0, memset_zva_64
RET
-L(tail15tiny):
- /* Set up to 15 bytes. Does not assume earlier memory
- being set. */
- tbz count, #3, 1f
- str A_l, [dst], #8
-1:
- tbz count, #2, 1f
- str A_lw, [dst], #4
-1:
- tbz count, #1, 1f
- strh A_lw, [dst], #2
-1:
- tbz count, #0, 1f
- strb A_lw, [dst]
-1:
+1: cmp x1, #5 /* 128 byte line size, enabled */
+ b.ne 1f
+ adr x0, memset_zva_128
RET
- /* Critical loop. Start at a new cache line boundary. Assuming
- * 64 bytes per line, this ensures the entire loop is in one line. */
- .p2align 6
-L(not_short):
- neg tmp2, dst
- ands tmp2, tmp2, #15
- b.eq 2f
- /* Bring DST to 128-bit (16-byte) alignment. We know that there's
- * more than that to set, so we simply store 16 bytes and advance by
- * the amount required to reach alignment. */
- sub count, count, tmp2
- stp A_l, A_l, [dst]
- add dst, dst, tmp2
- /* There may be less than 63 bytes to go now. */
- cmp count, #63
- b.le L(tail63)
-2:
- sub dst, dst, #16 /* Pre-bias. */
- sub count, count, #64
-1:
- stp A_l, A_l, [dst, #16]
- stp A_l, A_l, [dst, #32]
- stp A_l, A_l, [dst, #48]
- stp A_l, A_l, [dst, #64]!
- subs count, count, #64
- b.ge 1b
- tst count, #0x3f
- add dst, dst, #16
- b.ne L(tail63)
+1: adr x0, memset_nozva /* Don't use ZVA at all.  */
+ RET
+ .size memset, .-memset
+
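+/* ZVA entry points, parameterized by line size.  Zero fills of at
+   least 4*SIZE bytes use DC ZVA; smaller or nonzero fills branch to
+   the generic path at L(nz_or_small).  */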
+.macro do_zva size
+ .balign 64
+ .type memset_zva_\size, %function
+memset_zva_\size:
+ CALL_MCOUNT
+ and valw, valw, #255
+ cmp count, #4*\size
+ ccmp valw, #0, #0, hs /* if hs, test valw == 0; else force ne */
+ b.ne L(nz_or_small)
+
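+ /* Zero the unaligned head with stp stores while dst is rounded
+    up to the next SIZE boundary.  count must then be recomputed
+    from dstend, since the round-up consumed an alignment-dependent
+    number of bytes, and is pre-biased by 2*SIZE because each loop
+    iteration clears two blocks.  */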
+ stp xzr, xzr, [dstin] /* first 16 aligned 1. */
+ and tmp2, dstin, #-16
+ and dst, dstin, #-\size
+
+ stp xzr, xzr, [tmp2, #16] /* first 64 aligned 16. */
+ add dstend, dstin, count
+ add dst, dst, #\size
+
+ stp xzr, xzr, [tmp2, #32]
+ sub count, dstend, dst /* recompute for misalign */
+ add tmp1, dst, #\size
+
+ stp xzr, xzr, [tmp2, #48]
+ sub count, count, #2*\size /* pre-bias */
+
+ stp xzr, xzr, [tmp2, #64]
+
+ /* For SIZE > 64, continue zeroing out to the first SIZE bytes,
+    still in 16-aligned pairs.  */
+.ifgt \size - 64
+ stp xzr, xzr, [tmp2, #80]
+ stp xzr, xzr, [tmp2, #96]
+ stp xzr, xzr, [tmp2, #112]
+ stp xzr, xzr, [tmp2, #128]
+.ifgt \size - 128
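+ /* Line sizes beyond 128 bytes are not implemented.  */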
+.err
+.endif
+.endif
+
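+ /* Main loop: two DC ZVA operations per iteration, on dst and
+    tmp1 = dst + SIZE, each advancing by 2*SIZE.  */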
+ .balign 64,,24
+0: dc zva, dst
+ subs count, count, #2*\size
+ dc zva, tmp1
+ add dst, dst, #2*\size
+ add tmp1, tmp1, #2*\size
+ b.hs 0b
+
+ adds count, count, #2*\size /* undo pre-bias */
+ b.ne L(zva_tail)
RET
-#ifndef DONT_USE_DC
- /* For zeroing memory, check to see if we can use the ZVA feature to
- * zero entire 'cache' lines. */
-L(zero_mem):
- mov A_l, #0
- cmp count, #63
- b.le L(tail_maybe_tiny)
- neg tmp2, dst
- ands tmp2, tmp2, #15
- b.eq 1f
- sub count, count, tmp2
- stp A_l, A_l, [dst]
- add dst, dst, tmp2
- cmp count, #63
- b.le L(tail63)
-1:
- /* For zeroing small amounts of memory, it's not worth setting up
- * the line-clear code. */
- cmp count, #128
- b.lt L(not_short)
-#ifdef MAYBE_VIRT
- /* For efficiency when virtualized, we cache the ZVA capability. */
- adrp tmp2, L(cache_clear)
- ldr zva_len, [tmp2, #:lo12:L(cache_clear)]
- tbnz zva_len, #31, L(not_short)
- cbnz zva_len, L(zero_by_line)
- mrs tmp1, dczid_el0
- tbz tmp1, #4, 1f
- /* ZVA not available. Remember this for next time. */
- mov zva_len, #~0
- str zva_len, [tmp2, #:lo12:L(cache_clear)]
- b L(not_short)
-1:
- mov tmp3w, #4
- and zva_len, tmp1w, #15 /* Safety: other bits reserved. */
- lsl zva_len, tmp3w, zva_len
- str zva_len, [tmp2, #:lo12:L(cache_clear)]
+ .size memset_zva_\size, . - memset_zva_\size
+.endm
+
+do_zva 64
+do_zva 128
#else
- mrs tmp1, dczid_el0
- tbnz tmp1, #4, L(not_short)
- mov tmp3w, #4
- and zva_len, tmp1w, #15 /* Safety: other bits reserved. */
- lsl zva_len, tmp3w, zva_len
-#endif
-
-L(zero_by_line):
- /* Compute how far we need to go to become suitably aligned. We're
- * already at quad-word alignment. */
- cmp count, zva_len_x
- b.lt L(not_short) /* Not enough to reach alignment. */
- sub zva_bits_x, zva_len_x, #1
- neg tmp2, dst
- ands tmp2, tmp2, zva_bits_x
- b.eq 1f /* Already aligned. */
- /* Not aligned, check that there's enough to copy after alignment. */
- sub tmp1, count, tmp2
- cmp tmp1, #64
- ccmp tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */
- b.lt L(not_short)
- /* We know that there's at least 64 bytes to zero and that it's safe
- * to overrun by 64 bytes. */
- mov count, tmp1
-2:
- stp A_l, A_l, [dst]
- stp A_l, A_l, [dst, #16]
- stp A_l, A_l, [dst, #32]
- subs tmp2, tmp2, #64
- stp A_l, A_l, [dst, #48]
- add dst, dst, #64
- b.ge 2b
- /* We've overrun a bit, so adjust dst downwards. */
- add dst, dst, tmp2
-1:
- sub count, count, zva_len_x
-3:
- dc zva, dst
- add dst, dst, zva_len_x
- subs count, count, zva_len_x
- b.ge 3b
- ands count, count, zva_bits_x
- b.ne L(tail_maybe_long)
+/* If we don't have ifunc (e.g. in ld.so), don't bother with ZVA.  */
+# define memset_nozva memset
+#endif /* IFUNC */
+
+/* The non-ZVA path: used when ZVA is unavailable or disabled, and for
+   nonzero or small fills via L(nz_or_small).  */
+
+ .balign 64
+ .type memset_nozva, %function
+memset_nozva:
+ CALL_MCOUNT
+ and valw, valw, #255
+L(nz_or_small):
+ orr valw, valw, valw, lsl #8 /* replicate the byte */
+ cmp count, #64
+ orr valw, valw, valw, lsl #16
+ add dstend, dstin, count /* remember end of buffer */
+ orr val, val, val, lsl #32
+ b.hs L(ge_64)
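+
+ /* The replication above turns e.g. valw = 0xab into 0xabab, then
+    0xabababab, and finally val = 0xabababababababab.  */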
+
+ /* Small data -- original count is less than 64 bytes.  Stores
+    are issued from both ends of the buffer and may overlap.  */
+L(le_63):
+ cmp count, #16
+ b.lo L(le_15)
+ stp val, val, [dstin]
+ tbz count, #5, L(le_31)
+ stp val, val, [dstin, #16]
+ stp val, val, [dstend, #-32]
+L(le_31):
+ stp val, val, [dstend, #-16]
+ RET
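+
+ /* e.g. count = 40 writes [0,16) and [16,32) from dstin, then
+    [8,24) and [24,40) back from dstend; the overlap is harmless.  */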
+ .balign 64,,16
+L(le_15):
+ tbz count, #3, L(le_7)
+ str val, [dstin]
+ str val, [dstend, #-8]
+ RET
+ .balign 64,,16
+L(le_7):
+ tbz count, #2, L(le_3)
+ str valw, [dstin]
+ str valw, [dstend, #-4]
+ RET
+ .balign 64,,20
+L(le_3):
+ tbz count, #1, L(le_1)
+ strh valw, [dstend, #-2]
+L(le_1):
+ tbz count, #0, L(le_0)
+ strb valw, [dstin]
+L(le_0):
+ RET
+
+ .balign 64
+L(ge_64):
+ and dst, dstin, #-16 /* align the pointer / pre-bias. */
+ stp val, val, [dstin] /* first 16 align 1 */
+ sub count, dstend, dst /* begin misalign recompute */
+ subs count, count, #16+64 /* finish recompute + pre-bias */
+ b.ls L(loop_tail)
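+
+ /* The loop stores begin at dst + 16, and the head stp above
+    already covered [dstin, dst + 16); the extra 64 in the bias is
+    undone by the adds after the loop.  */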
+
+ .balign 64,,24
+L(loop):
+ stp val, val, [dst, #16]
+ stp val, val, [dst, #32]
+ subs count, count, #64
+ stp val, val, [dst, #48]
+ stp val, val, [dst, #64]!
+ b.hs L(loop)
+
+ adds count, count, #64 /* undo pre-bias */
+ b.ne L(loop_tail)
+ RET
+
+ /* Tail of the zva loop.  Fewer than 2*SIZE bytes remain, but
+    possibly far more than 64.  Note that dst is aligned but
+    unbiased.  */
+L(zva_tail):
+ subs count, count, #64 /* pre-bias */
+ sub dst, dst, #16 /* pre-bias */
+ b.hi L(loop)
+
+ /* Tail of the stp loop; less than 64 bytes left (from loop)
+    or less-than-or-equal to 64 bytes left (from ge_64/zva_tail).
+    The final 64 bytes are written back from dstend and may
+    overlap earlier stores harmlessly.  */
+L(loop_tail):
+ stp val, val, [dstend, #-64]
+ stp val, val, [dstend, #-48]
+ stp val, val, [dstend, #-32]
+ stp val, val, [dstend, #-16]
RET
-#ifdef MAYBE_VIRT
- .bss
- .p2align 2
-L(cache_clear):
- .space 4
-#endif
-#endif /* DONT_USE_DC */
-
-END (__memset)
-weak_alias (__memset, memset)
+
+ .size memset_nozva, . - memset_nozva
+ cfi_endproc
+
+strong_alias (memset, __memset)
libc_hidden_builtin_def (memset)