@@ -20,23 +20,14 @@
*
* ARMv8-a, AArch64
* Unaligned accesses
- *
*/
#include <sysdep.h>
-/* By default we assume that the DC instruction can be used to zero
- data blocks more efficiently. In some circumstances this might be
- unsafe, for example in an asymmetric multiprocessor environment with
- different DC clear lengths (neither the upper nor lower lengths are
- safe to use). The feature can be disabled by defining DONT_USE_DC.
-
- If code may be run in a virtualized environment, then define
- MAYBE_VIRT. This will cause the code to cache the system register
- values rather than re-reading them each call. */
-
#define dstin x0
-#define val w1
+#define dstin_w w0
+#define val x1
+#define valw w1
#define count x2
#define tmp1 x3
#define tmp1w w3
@@ -44,186 +35,186 @@
#define tmp2w w4
#define zva_len_x x5
#define zva_len w5
-#define zva_bits_x x6
-
-#define A_l x7
-#define A_lw w7
+#define zva_mask_x x6
+#define zva_mask w6
#define dst x8
-#define tmp3w w9
-
-ENTRY_ALIGN (__memset, 6)
-
- mov dst, dstin /* Preserve return value. */
- ands A_lw, val, #255
-#ifndef DONT_USE_DC
- b.eq L(zero_mem)
-#endif
- orr A_lw, A_lw, A_lw, lsl #8
- orr A_lw, A_lw, A_lw, lsl #16
- orr A_l, A_l, A_l, lsl #32
-L(tail_maybe_long):
- cmp count, #64
- b.ge L(not_short)
-L(tail_maybe_tiny):
- cmp count, #15
- b.le L(tail15tiny)
-L(tail63):
- ands tmp1, count, #0x30
- b.eq L(tail15)
- add dst, dst, tmp1
- cmp tmp1w, #0x20
- b.eq 1f
- b.lt 2f
- stp A_l, A_l, [dst, #-48]
-1:
- stp A_l, A_l, [dst, #-32]
-2:
- stp A_l, A_l, [dst, #-16]
-
-L(tail15):
- and count, count, #15
- add dst, dst, count
- stp A_l, A_l, [dst, #-16] /* Repeat some/all of last store. */
+#define dst_w w8
+#define dstend x9
+
+ .globl memset
+ cfi_startproc
+
+#if HAVE_IFUNC && !defined (IS_IN_rtld)
+/* Rather than decode dczid_el0 on every call, checking whether ZVA is
+   disabled and unpacking the line size, do this once in the indirect
+   function and choose an appropriate entry point which encodes these
+   values as constants.  */
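+
+/* In dczid_el0, bits [3:0] give log2 of the block size in 4-byte words
+   and bit [4] is set when DC ZVA is prohibited.  So the value masked
+   below is 4 for an enabled 64-byte line (16 words) and 5 for an
+   enabled 128-byte line (32 words).  */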
+
+ .type memset, %gnu_indirect_function
+memset:
+ mrs x1, dczid_el0
+ and x1, x1, #31 /* isolate line size + disable bit */
+
+ cmp x1, #4 /* 64 byte line size, enabled */
+ b.ne 1f
+ adr x0, memset_zva_64
RET
-L(tail15tiny):
- /* Set up to 15 bytes. Does not assume earlier memory
- being set. */
- tbz count, #3, 1f
- str A_l, [dst], #8
-1:
- tbz count, #2, 1f
- str A_lw, [dst], #4
-1:
- tbz count, #1, 1f
- strh A_lw, [dst], #2
-1:
- tbz count, #0, 1f
- strb A_lw, [dst]
-1:
+1: cmp x1, #5 /* 128 byte line size, enabled */
+ b.ne 1f
+ adr x0, memset_zva_128
RET
- /* Critical loop. Start at a new cache line boundary. Assuming
- * 64 bytes per line, this ensures the entire loop is in one line. */
- .p2align 6
-L(not_short):
- neg tmp2, dst
- ands tmp2, tmp2, #15
- b.eq 2f
- /* Bring DST to 128-bit (16-byte) alignment. We know that there's
- * more than that to set, so we simply store 16 bytes and advance by
- * the amount required to reach alignment. */
- sub count, count, tmp2
- stp A_l, A_l, [dst]
- add dst, dst, tmp2
- /* There may be less than 63 bytes to go now. */
- cmp count, #63
- b.le L(tail63)
-2:
- sub dst, dst, #16 /* Pre-bias. */
- sub count, count, #64
-1:
- stp A_l, A_l, [dst, #16]
- stp A_l, A_l, [dst, #32]
- stp A_l, A_l, [dst, #48]
- stp A_l, A_l, [dst, #64]!
- subs count, count, #64
- b.ge 1b
- tst count, #0x3f
- add dst, dst, #16
- b.ne L(tail63)
+1: adr x0, memset_nozva /* Don't use ZVA at all.  */
+ RET
+ .size memset, .-memset
+
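+/* ZVA entry points, parameterized by line size.  Zero fills of at
+   least 4*SIZE bytes use DC ZVA; smaller or nonzero fills branch to
+   the generic path at L(nz_or_small).  */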
+.macro do_zva size
+ .balign 64
+ .type memset_zva_\size, %function
+memset_zva_\size:
+ CALL_MCOUNT
+ and valw, valw, #255
+ cmp count, #4*\size
+ ccmp valw, #0, #0, hs /* if hs, test valw == 0; else force ne */
+ b.ne L(nz_or_small)
+
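+ /* Zero the unaligned head with stp stores while dst is rounded
+    up to the next SIZE boundary.  count must then be recomputed
+    from dstend, since the round-up consumed an alignment-dependent
+    number of bytes, and is pre-biased by 2*SIZE because each loop
+    iteration clears two blocks.  */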
+ stp xzr, xzr, [dstin] /* first 16 aligned 1. */
+ and tmp2, dstin, #-16
+ and dst, dstin, #-\size
+
+ stp xzr, xzr, [tmp2, #16] /* first 64 aligned 16. */
+ add dstend, dstin, count
+ add dst, dst, #\size
+
+ stp xzr, xzr, [tmp2, #32]
+ sub count, dstend, dst /* recompute for misalign */
+ add tmp1, dst, #\size
+
+ stp xzr, xzr, [tmp2, #48]
+ sub count, count, #2*\size /* pre-bias */
+
+ stp xzr, xzr, [tmp2, #64]
+
+ /* For SIZE > 64, continue zeroing out to the first SIZE bytes,
+    still in 16-aligned pairs.  */
+.ifgt \size - 64
+ stp xzr, xzr, [tmp2, #80]
+ stp xzr, xzr, [tmp2, #96]
+ stp xzr, xzr, [tmp2, #112]
+ stp xzr, xzr, [tmp2, #128]
+.ifgt \size - 128
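+ /* Line sizes beyond 128 bytes are not implemented.  */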
+.err
+.endif
+.endif
+
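+ /* Main loop: two DC ZVA operations per iteration, on dst and
+    tmp1 = dst + SIZE, each advancing by 2*SIZE.  */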
+ .balign 64,,24
+0: dc zva, dst
+ subs count, count, #2*\size
+ dc zva, tmp1
+ add dst, dst, #2*\size
+ add tmp1, tmp1, #2*\size
+ b.hs 0b
+
+ adds count, count, #2*\size /* undo pre-bias */
+ b.ne L(zva_tail)
RET
-#ifndef DONT_USE_DC
- /* For zeroing memory, check to see if we can use the ZVA feature to
- * zero entire 'cache' lines. */
-L(zero_mem):
- mov A_l, #0
- cmp count, #63
- b.le L(tail_maybe_tiny)
- neg tmp2, dst
- ands tmp2, tmp2, #15
- b.eq 1f
- sub count, count, tmp2
- stp A_l, A_l, [dst]
- add dst, dst, tmp2
- cmp count, #63
- b.le L(tail63)
-1:
- /* For zeroing small amounts of memory, it's not worth setting up
- * the line-clear code. */
- cmp count, #128
- b.lt L(not_short)
-#ifdef MAYBE_VIRT
- /* For efficiency when virtualized, we cache the ZVA capability. */
- adrp tmp2, L(cache_clear)
- ldr zva_len, [tmp2, #:lo12:L(cache_clear)]
- tbnz zva_len, #31, L(not_short)
- cbnz zva_len, L(zero_by_line)
- mrs tmp1, dczid_el0
- tbz tmp1, #4, 1f
- /* ZVA not available. Remember this for next time. */
- mov zva_len, #~0
- str zva_len, [tmp2, #:lo12:L(cache_clear)]
- b L(not_short)
-1:
- mov tmp3w, #4
- and zva_len, tmp1w, #15 /* Safety: other bits reserved. */
- lsl zva_len, tmp3w, zva_len
- str zva_len, [tmp2, #:lo12:L(cache_clear)]
+ .size memset_zva_\size, . - memset_zva_\size
+.endm
+
+do_zva 64
+do_zva 128
#else
- mrs tmp1, dczid_el0
- tbnz tmp1, #4, L(not_short)
- mov tmp3w, #4
- and zva_len, tmp1w, #15 /* Safety: other bits reserved. */
- lsl zva_len, tmp3w, zva_len
-#endif
-
-L(zero_by_line):
- /* Compute how far we need to go to become suitably aligned. We're
- * already at quad-word alignment. */
- cmp count, zva_len_x
- b.lt L(not_short) /* Not enough to reach alignment. */
- sub zva_bits_x, zva_len_x, #1
- neg tmp2, dst
- ands tmp2, tmp2, zva_bits_x
- b.eq 1f /* Already aligned. */
- /* Not aligned, check that there's enough to copy after alignment. */
- sub tmp1, count, tmp2
- cmp tmp1, #64
- ccmp tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */
- b.lt L(not_short)
- /* We know that there's at least 64 bytes to zero and that it's safe
- * to overrun by 64 bytes. */
- mov count, tmp1
-2:
- stp A_l, A_l, [dst]
- stp A_l, A_l, [dst, #16]
- stp A_l, A_l, [dst, #32]
- subs tmp2, tmp2, #64
- stp A_l, A_l, [dst, #48]
- add dst, dst, #64
- b.ge 2b
- /* We've overrun a bit, so adjust dst downwards. */
- add dst, dst, tmp2
-1:
- sub count, count, zva_len_x
-3:
- dc zva, dst
- add dst, dst, zva_len_x
- subs count, count, zva_len_x
- b.ge 3b
- ands count, count, zva_bits_x
- b.ne L(tail_maybe_long)
+/* If we don't have ifunc (e.g. in ld.so), don't bother with ZVA.  */
+# define memset_nozva memset
+#endif /* IFUNC */
+
+/* The non-ZVA path: used when ZVA is unavailable or disabled, and for
+   nonzero or small fills via L(nz_or_small).  */
+
+ .balign 64
+ .type memset_nozva, %function
+memset_nozva:
+ CALL_MCOUNT
+ and valw, valw, #255
+L(nz_or_small):
+ orr valw, valw, valw, lsl #8 /* replicate the byte */
+ cmp count, #64
+ orr valw, valw, valw, lsl #16
+ add dstend, dstin, count /* remember end of buffer */
+ orr val, val, val, lsl #32
+ b.hs L(ge_64)
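+
+ /* The replication above turns e.g. valw = 0xab into 0xabab, then
+    0xabababab, and finally val = 0xabababababababab.  */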
+
+ /* Small data -- original count is less than 64 bytes.  Stores
+    are issued from both ends of the buffer and may overlap.  */
+L(le_63):
+ cmp count, #16
+ b.lo L(le_15)
+ stp val, val, [dstin]
+ tbz count, #5, L(le_31)
+ stp val, val, [dstin, #16]
+ stp val, val, [dstend, #-32]
+L(le_31):
+ stp val, val, [dstend, #-16]
+ RET
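+
+ /* e.g. count = 40 writes [0,16) and [16,32) from dstin, then
+    [8,24) and [24,40) back from dstend; the overlap is harmless.  */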
+ .balign 64,,16
+L(le_15):
+ tbz count, #3, L(le_7)
+ str val, [dstin]
+ str val, [dstend, #-8]
+ RET
+ .balign 64,,16
+L(le_7):
+ tbz count, #2, L(le_3)
+ str valw, [dstin]
+ str valw, [dstend, #-4]
+ RET
+ .balign 64,,20
+L(le_3):
+ tbz count, #1, L(le_1)
+ strh valw, [dstend, #-2]
+L(le_1):
+ tbz count, #0, L(le_0)
+ strb valw, [dstin]
+L(le_0):
+ RET
+
+ .balign 64
+L(ge_64):
+ and dst, dstin, #-16 /* align the pointer / pre-bias. */
+ stp val, val, [dstin] /* first 16 align 1 */
+ sub count, dstend, dst /* begin misalign recompute */
+ subs count, count, #16+64 /* finish recompute + pre-bias */
+ b.ls L(loop_tail)
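+
+ /* The loop stores begin at dst + 16, and the head stp above
+    already covered [dstin, dst + 16); the extra 64 in the bias is
+    undone by the adds after the loop.  */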
+
+ .balign 64,,24
+L(loop):
+ stp val, val, [dst, #16]
+ stp val, val, [dst, #32]
+ subs count, count, #64
+ stp val, val, [dst, #48]
+ stp val, val, [dst, #64]!
+ b.hs L(loop)
+
+ adds count, count, #64 /* undo pre-bias */
+ b.ne L(loop_tail)
+ RET
+
+ /* Tail of the zva loop.  Fewer than 2*SIZE bytes remain, but
+    possibly far more than 64.  Note that dst is aligned but
+    unbiased.  */
+L(zva_tail):
+ subs count, count, #64 /* pre-bias */
+ sub dst, dst, #16 /* pre-bias */
+ b.hi L(loop)
+
+ /* Tail of the stp loop; less than 64 bytes left (from loop)
+    or less-than-or-equal to 64 bytes left (from ge_64/zva_tail).
+    The final 64 bytes are written back from dstend and may
+    overlap earlier stores harmlessly.  */
+L(loop_tail):
+ stp val, val, [dstend, #-64]
+ stp val, val, [dstend, #-48]
+ stp val, val, [dstend, #-32]
+ stp val, val, [dstend, #-16]
RET
-#ifdef MAYBE_VIRT
- .bss
- .p2align 2
-L(cache_clear):
- .space 4
-#endif
-#endif /* DONT_USE_DC */
-
-END (__memset)
-weak_alias (__memset, memset)
+
+ .size memset_nozva, . - memset_nozva
+ cfi_endproc
+
+strong_alias (memset, __memset)
libc_hidden_builtin_def (memset)