Message ID | PAWPR08MB89827E63CC0EAD1774E391A3839E2@PAWPR08MB8982.eurprd08.prod.outlook.com
---|---
State | New |
Series | AArch64: Optimize memset
On 06/09/24 10:21, Wilco Dijkstra wrote:
>
> Improve small memsets by avoiding branches and using overlapping stores.
> Use DC ZVA for copies over 128 bytes.  Remove unnecessary code for ZVA
> sizes other than 64 and 128.  Performance of the random memset benchmark
> improves by 24% on Neoverse N1.
>
> Passes regress, OK for commit?
>
> ---
>
> diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
> index 7ef77ee8c926de211a2c6d193a23f49808481a82..a7a5fba66712a82c2b571cc0fda1655fd1178186 100644
> --- a/sysdeps/aarch64/memset.S
> +++ b/sysdeps/aarch64/memset.S
> @@ -1,4 +1,5 @@
> -/* Copyright (C) 2012-2024 Free Software Foundation, Inc.
> +/* Generic optimized memset using SIMD.
> +   Copyright (C) 2012-2024 Free Software Foundation, Inc.
>
>     This file is part of the GNU C Library.
>
> @@ -17,7 +18,6 @@
>     <https://www.gnu.org/licenses/>.  */
>
>  #include <sysdep.h>
> -#include "memset-reg.h"

I think we should eventually just remove this file; it does not make much
sense to add register alias definitions for an assembly implementation.

>
>  #ifndef MEMSET
>  # define MEMSET memset
> @@ -25,130 +25,131 @@
>
>  /* Assumptions:
>   *
> - * ARMv8-a, AArch64, unaligned accesses
> + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
>   *
>   */
>
> -ENTRY (MEMSET)
> +#define dstin	x0
> +#define val	x1
> +#define valw	w1
> +#define count	x2
> +#define dst	x3
> +#define dstend	x4
> +#define zva_val	x5
> +#define off	x3
> +#define dstend2	x5
>
> +ENTRY (MEMSET)
>  	PTR_ARG (0)
>  	SIZE_ARG (2)
> -

Spurious line removal.  The rest looks ok.

>  	dup	v0.16B, valw
> +	cmp	count, 16
> +	b.lo	L(set_small)
> +
>  	add	dstend, dstin, count
> +	cmp	count, 64
> +	b.hs	L(set_128)
>
> -	cmp	count, 96
> -	b.hi	L(set_long)
> -	cmp	count, 16
> -	b.hs	L(set_medium)
> -	mov	val, v0.D[0]
> +	/* Set 16..63 bytes.  */
> +	mov	off, 16
> +	and	off, off, count, lsr 1
> +	sub	dstend2, dstend, off
> +	str	q0, [dstin]
> +	str	q0, [dstin, off]
> +	str	q0, [dstend2, -16]
> +	str	q0, [dstend, -16]
> +	ret
>
> +	.p2align 4
>  	/* Set 0..15 bytes.  */
> -	tbz	count, 3, 1f
> -	str	val, [dstin]
> -	str	val, [dstend, -8]
> -	ret
> -	nop
> -1:	tbz	count, 2, 2f
> -	str	valw, [dstin]
> -	str	valw, [dstend, -4]
> +L(set_small):
> +	add	dstend, dstin, count
> +	cmp	count, 4
> +	b.lo	2f
> +	lsr	off, count, 3
> +	sub	dstend2, dstend, off, lsl 2
> +	str	s0, [dstin]
> +	str	s0, [dstin, off, lsl 2]
> +	str	s0, [dstend2, -4]
> +	str	s0, [dstend, -4]
>  	ret
> +
> +	/* Set 0..3 bytes.  */
>  2:	cbz	count, 3f
> +	lsr	off, count, 1
>  	strb	valw, [dstin]
> -	tbz	count, 1, 3f
> -	strh	valw, [dstend, -2]
> +	strb	valw, [dstin, off]
> +	strb	valw, [dstend, -1]
>  3:	ret
>
> -	/* Set 17..96 bytes.  */
> -L(set_medium):
> -	str	q0, [dstin]
> -	tbnz	count, 6, L(set96)
> -	str	q0, [dstend, -16]
> -	tbz	count, 5, 1f
> -	str	q0, [dstin, 16]
> -	str	q0, [dstend, -32]
> -1:	ret
> -
>  	.p2align 4
> -	/* Set 64..96 bytes.  Write 64 bytes from the start and
> -	   32 bytes from the end.  */
> -L(set96):
> -	str	q0, [dstin, 16]
> +L(set_128):
> +	bic	dst, dstin, 15
> +	cmp	count, 128
> +	b.hi	L(set_long)
> +	stp	q0, q0, [dstin]
>  	stp	q0, q0, [dstin, 32]
> +	stp	q0, q0, [dstend, -64]
>  	stp	q0, q0, [dstend, -32]
>  	ret
>
> -	.p2align 3
> -	nop
> +	.p2align 4
>  L(set_long):
> -	and	valw, valw, 255
> -	bic	dst, dstin, 15
>  	str	q0, [dstin]
> -	cmp	count, 256
> -	ccmp	valw, 0, 0, cs
> -	b.eq	L(try_zva)
> -L(no_zva):
> -	sub	count, dstend, dst	/* Count is 16 too large.  */
> -	sub	dst, dst, 16		/* Dst is biased by -32.  */
> -	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
> -1:	stp	q0, q0, [dst, 32]
> -	stp	q0, q0, [dst, 64]!
> -L(tail64):
> -	subs	count, count, 64
> -	b.hi	1b
> -2:	stp	q0, q0, [dstend, -64]
> +	str	q0, [dst, 16]
> +	tst	valw, 255
> +	b.ne	L(no_zva)
> +#ifndef ZVA64_ONLY
> +	mrs	zva_val, dczid_el0
> +	and	zva_val, zva_val, 31
> +	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
> +	b.ne	L(zva_128)
> +#endif
> +	stp	q0, q0, [dst, 32]
> +	bic	dst, dstin, 63
> +	sub	count, dstend, dst	/* Count is now 64 too large.  */
> +	sub	count, count, 64 + 64	/* Adjust count and bias for loop.  */
> +
> +	/* Write last bytes before ZVA loop.  */
> +	stp	q0, q0, [dstend, -64]
>  	stp	q0, q0, [dstend, -32]
> +
> +	.p2align 4
> +L(zva64_loop):
> +	add	dst, dst, 64
> +	dc	zva, dst
> +	subs	count, count, 64
> +	b.hi	L(zva64_loop)
>  	ret
>
> -L(try_zva):
> -#ifndef ZVA64_ONLY
>  	.p2align 3
> -	mrs	tmp1, dczid_el0
> -	tbnz	tmp1w, 4, L(no_zva)
> -	and	tmp1w, tmp1w, 15
> -	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
> -	b.ne	L(zva_128)
> -	nop
> -#endif
> -	/* Write the first and last 64 byte aligned block using stp rather
> -	   than using DC ZVA.  This is faster on some cores.
> -	 */
> -	.p2align 4
> -L(zva_64):
> -	str	q0, [dst, 16]
> +L(no_zva):
> +	sub	count, dstend, dst	/* Count is 32 too large.  */
> +	sub	count, count, 64 + 32	/* Adjust count and bias for loop.  */
> +L(no_zva_loop):
>  	stp	q0, q0, [dst, 32]
> -	bic	dst, dst, 63
>  	stp	q0, q0, [dst, 64]
> -	stp	q0, q0, [dst, 96]
> -	sub	count, dstend, dst	/* Count is now 128 too large.  */
> -	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
> -	add	dst, dst, 128
> -1:	dc	zva, dst
>  	add	dst, dst, 64
>  	subs	count, count, 64
> -	b.hi	1b
> -	stp	q0, q0, [dst, 0]
> -	stp	q0, q0, [dst, 32]
> +	b.hi	L(no_zva_loop)
>  	stp	q0, q0, [dstend, -64]
>  	stp	q0, q0, [dstend, -32]
>  	ret
>
>  #ifndef ZVA64_ONLY
> -	.p2align 3
> +	.p2align 4
>  L(zva_128):
> -	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
> -	b.ne	L(zva_other)
> +	cmp	zva_val, 5	/* ZVA size is 128 bytes.  */
> +	b.ne	L(no_zva)
>
> -	str	q0, [dst, 16]
>  	stp	q0, q0, [dst, 32]
>  	stp	q0, q0, [dst, 64]
>  	stp	q0, q0, [dst, 96]
>  	bic	dst, dst, 127
>  	sub	count, dstend, dst	/* Count is now 128 too large.  */
> -	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
> -	add	dst, dst, 128
> -1:	dc	zva, dst
> -	add	dst, dst, 128
> +	sub	count, count, 128 + 128	/* Adjust count and bias for loop.  */
> +1:	add	dst, dst, 128
> +	dc	zva, dst
>  	subs	count, count, 128
>  	b.hi	1b
>  	stp	q0, q0, [dstend, -128]
> @@ -156,35 +157,6 @@ L(zva_128):
>  	stp	q0, q0, [dstend, -64]
>  	stp	q0, q0, [dstend, -32]
>  	ret
> -
> -L(zva_other):
> -	mov	tmp2w, 4
> -	lsl	zva_lenw, tmp2w, tmp1w
> -	add	tmp1, zva_len, 64	/* Max alignment bytes written.  */
> -	cmp	count, tmp1
> -	blo	L(no_zva)
> -
> -	sub	tmp2, zva_len, 1
> -	add	tmp1, dst, zva_len
> -	add	dst, dst, 16
> -	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
> -	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
> -	beq	2f
> -1:	stp	q0, q0, [dst], 64
> -	stp	q0, q0, [dst, -32]
> -	subs	count, count, 64
> -	b.hi	1b
> -2:	mov	dst, tmp1
> -	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
> -	subs	count, count, zva_len
> -	b.lo	4f
> -3:	dc	zva, dst
> -	add	dst, dst, zva_len
> -	subs	count, count, zva_len
> -	b.hs	3b
> -4:	add	count, count, zva_len
> -	sub	dst, dst, 32	/* Bias dst for tail loop.  */
> -	b	L(tail64)
>  #endif
>
>  END (MEMSET)
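[Editor's note: as a reading aid, here is a rough C model of the patch's
branchless 16..63-byte path.  This sketch is not part of the patch; the
function name set_16_to_63 and the use of memcpy to stand in for 16-byte
q-register stores are my own illustrative choices.]

#include <string.h>
#include <stddef.h>

/* The four stores cover any count in [16, 63]:
   - count 16..31: off is 0, so the start pair and end pair each
     collapse to one store, and the two overlap in the middle;
   - count 32..63: off is 16, so the stores land at offsets 0, 16,
     count-32 and count-16, again overlapping as needed.  */
static void
set_16_to_63 (unsigned char *dstin, const unsigned char q0[16], size_t count)
{
  unsigned char *dstend = dstin + count;
  /* "mov off, 16; and off, off, count, lsr 1": bit 4 of (count >> 1)
     is bit 5 of count, so off = (count >= 32) ? 16 : 0 with no branch.  */
  size_t off = 16 & (count >> 1);
  memcpy (dstin, q0, 16);              /* str q0, [dstin]        */
  memcpy (dstin + off, q0, 16);        /* str q0, [dstin, off]   */
  memcpy (dstend - off - 16, q0, 16);  /* str q0, [dstend2, -16] */
  memcpy (dstend - 16, q0, 16);        /* str q0, [dstend, -16]  */
}

int
main (void)
{
  unsigned char buf[64], q0[16];
  memset (q0, 0xAB, 16);        /* replicated value, like dup v0.16B */
  set_16_to_63 (buf, q0, 40);   /* any count in [16, 63] works */
  return 0;
}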
* Wilco Dijkstra:

> Improve small memsets by avoiding branches and using overlapping
> stores.  Use DC ZVA for copies over 128 bytes.  Remove unnecessary
> code for ZVA sizes other than 64 and 128.  Performance of the random
> memset benchmark improves by 24% on Neoverse N1.
>
> Passes regress, OK for commit?

We have received a report that this breaks certain virtualized AArch64
environments:

  Bug 2327564 - glibc-2.40.9000-4.fc42 update causes kernel panic
  during aarch64 Fedora CoreOS boot on OpenStack
  <https://bugzilla.redhat.com/show_bug.cgi?id=2327564>

Output from ld.so --list-diagnostics is below.  The crash is in PID 1,
so there is not much data to go by.  I can probably get a login on such
a system running glibc 2.40 and try to run the glibc testsuite from the
current development branch there, but I'm not sure if I could make
sense of the results.

One issue I see is that the dczid_el0 masking in
sysdeps/aarch64/memset.S is inconsistent with
sysdeps/unix/sysv/linux/aarch64/cpu-features.c: the former uses 31,
while the latter uses 0xf.  But that does not make a difference here
because dczid_el0 is 5.

Thanks,
Florian

dl_dst_lib="lib64"
dl_hwcap=0x8ff
dl_hwcap2=0x0
dl_hwcap3=0x0
dl_hwcap4=0x0
dl_hwcaps_subdirs=""
dl_hwcaps_subdirs_active=0x0
dl_pagesize=0x1000
dl_platform="aarch64"
dl_profile_output="/var/tmp"
dso.ld="ld-linux-aarch64.so.1"
dso.libc="libc.so.6"
env_filtered[0x0]="SHELL"
env_filtered[0x1]="HISTCONTROL"
env_filtered[0x2]="HISTSIZE"
env_filtered[0x3]="HOSTNAME"
env_filtered[0x4]="GPG_TTY"
env_filtered[0x5]="EDITOR"
env_filtered[0x6]="PWD"
env_filtered[0x7]="LOGNAME"
env_filtered[0x8]="XDG_SESSION_TYPE"
env_filtered[0x9]="MOTD_SHOWN"
env_filtered[0xa]="HOME"
env[0xb]="LANG=C.UTF-8"
env_filtered[0xc]="LS_COLORS"
env_filtered[0xd]="SSH_CONNECTION"
env_filtered[0xe]="XDG_SESSION_CLASS"
env_filtered[0xf]="SELINUX_ROLE_REQUESTED"
env_filtered[0x10]="TERM"
env_filtered[0x11]="LESSOPEN"
env_filtered[0x12]="USER"
env_filtered[0x13]="SELINUX_USE_CURRENT_RANGE"
env_filtered[0x14]="SHLVL"
env_filtered[0x15]="XDG_SESSION_ID"
env_filtered[0x16]="XDG_RUNTIME_DIR"
env_filtered[0x17]="SSH_CLIENT"
env[0x18]="PATH=/var/home/core/.local/bin:/var/home/core/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin"
env_filtered[0x19]="SELINUX_LEVEL_REQUESTED"
env_filtered[0x1a]="DBUS_SESSION_BUS_ADDRESS"
env_filtered[0x1b]="MAIL"
env_filtered[0x1c]="SSH_TTY"
env_filtered[0x1d]="_"
path.prefix="/usr"
path.rtld="/lib/ld-linux-aarch64.so.1"
path.sysconfdir="/etc"
path.system_dirs[0x0]="/lib64/"
path.system_dirs[0x1]="/usr/lib64/"
version.release="development"
version.version="2.40.9000"
auxv[0x0].a_type=0x21
auxv[0x0].a_val=0xffffa482b000
auxv[0x1].a_type=0x33
auxv[0x1].a_val=0x1270
auxv[0x2].a_type=0x10
auxv[0x2].a_val=0x8ff
auxv[0x3].a_type=0x6
auxv[0x3].a_val=0x1000
auxv[0x4].a_type=0x11
auxv[0x4].a_val=0x64
auxv[0x5].a_type=0x3
auxv[0x5].a_val=0xffffa47e0040
auxv[0x6].a_type=0x4
auxv[0x6].a_val=0x38
auxv[0x7].a_type=0x5
auxv[0x7].a_val=0x9
auxv[0x8].a_type=0x7
auxv[0x8].a_val=0x0
auxv[0x9].a_type=0x8
auxv[0x9].a_val=0x0
auxv[0xa].a_type=0x9
auxv[0xa].a_val=0xffffa47faa80
auxv[0xb].a_type=0xb
auxv[0xb].a_val=0x3e8
auxv[0xc].a_type=0xc
auxv[0xc].a_val=0x3e8
auxv[0xd].a_type=0xd
auxv[0xd].a_val=0x3e8
auxv[0xe].a_type=0xe
auxv[0xe].a_val=0x3e8
auxv[0xf].a_type=0x17
auxv[0xf].a_val=0x0
auxv[0x10].a_type=0x19
auxv[0x10].a_val=0xffffee238298
auxv[0x11].a_type=0x1a
auxv[0x11].a_val=0x0
auxv[0x12].a_type=0x1d
auxv[0x12].a_val=0x0
auxv[0x13].a_type=0x1f
auxv[0x13].a_val_string="/usr/bin/ld.so"
auxv[0x14].a_type=0xf
auxv[0x14].a_val_string="aarch64"
auxv[0x15].a_type=0x1b
auxv[0x15].a_val=0x1c
auxv[0x16].a_type=0x1c
auxv[0x16].a_val=0x20
uname.sysname="Linux"
uname.nodename="host-192-168-40-30"
uname.release="6.13.0-0.rc0.20241119git158f238aa69d.2.fc42.aarch64"
uname.version="#1 SMP PREEMPT_DYNAMIC Tue Nov 19 16:59:12 UTC 2024"
uname.machine="aarch64"
uname.domainname="(none)"
aarch64.cpu_features.bti=0x0
aarch64.cpu_features.midr_el1=0x431f0a11
aarch64.cpu_features.mops=0x0
aarch64.cpu_features.mte_state=0x0
aarch64.cpu_features.prefer_sve_ifuncs=0x0
aarch64.cpu_features.sve=0x0
aarch64.cpu_features.zva_size=0x80
aarch64.processor[0x0].requested=0x0
aarch64.processor[0x0].observed=0x0
aarch64.processor[0x0].observed_node=0x0
aarch64.processor[0x0].midr_el1=0x431f0a11
aarch64.processor[0x0].dczid_el0=0x5
aarch64.processor[0x1].requested=0x1
aarch64.processor[0x1].observed=0x1
aarch64.processor[0x1].observed_node=0x0
aarch64.processor[0x1].midr_el1=0x431f0a11
aarch64.processor[0x1].dczid_el0=0x5
aarch64.processor[0x2].requested=0x2
aarch64.processor[0x2].observed=0x2
aarch64.processor[0x2].observed_node=0x0
aarch64.processor[0x2].midr_el1=0x431f0a11
aarch64.processor[0x2].dczid_el0=0x5
aarch64.processor[0x3].requested=0x3
aarch64.processor[0x3].observed=0x3
aarch64.processor[0x3].observed_node=0x0
aarch64.processor[0x3].midr_el1=0x431f0a11
aarch64.processor[0x3].dczid_el0=0x5
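[Editor's note: for readers unfamiliar with the register, a small sketch
(mine, not from the thread) of how DCZID_EL0 decodes.  It also shows why
the 31 vs. 0xf masks end up equivalent when DZP is folded into the same
comparison, as the follow-up explains.]

#include <stdint.h>
#include <stdio.h>

/* DCZID_EL0 layout per the Arm ARM: bits [3:0] are BS, the log2 of the
   DC ZVA block size in 4-byte words; bit 4 is DZP, which when set
   prohibits DC ZVA.  "and 31" keeps DZP plus BS, while "and 0xf" keeps
   BS only, so comparing (dczid & 31) against 4 or 5 checks DZP == 0
   and the block size in a single comparison.  */
static unsigned int
zva_block_bytes (uint64_t dczid)
{
  if (dczid & 16)               /* DZP set: DC ZVA unusable.  */
    return 0;
  return 4U << (dczid & 15);    /* 4-byte words, so 4 << BS bytes.  */
}

int
main (void)
{
  /* The affected system reports dczid_el0 == 5: DZP clear, BS == 5,
     i.e. a 128-byte block, matching zva_size=0x80 above.  */
  printf ("%u\n", zva_block_bytes (5));   /* prints 128 */
  return 0;
}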
Hi Florian,

> The crash is in PID 1, so there is not much data to go by.  I can
> probably get a login on such a system running glibc 2.40 and try to
> run the glibc testsuite from the current development branch there,
> but I'm not sure if I could make sense of the results.

> One issue I see is that the dczid_el0 masking in
> sysdeps/aarch64/memset.S is inconsistent with
> sysdeps/unix/sysv/linux/aarch64/cpu-features.c: the former uses 31,
> while the latter uses 0xf.  But that does not make a difference here
> because dczid_el0 is 5.

Basically I'm checking DZP (bit 4) at the same time as the ZVA size, so
that's fine.  The value 5 means it is a 128-byte ZVA, and that is
enough of a clue to figure it out!

I kept the code for zva_128 pretty much as is.  It requires a minimum
size of 256 since it doesn't use SUBS count, count, 128+128; b.ls ...
before the zva_128 loop.  The new version changes the code in set_long
to no longer check count >= 256.  That means count can underflow in
zva_128, and then it will clear an infinite amount of memory...

So it needs an extra check for count > 256 (or maybe larger, since the
cost of 128-byte alignment is quite high).  I'll post a patch.

Cheers,
Wilco
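[Editor's note: to make the failure mode concrete, a C model (my
sketch; the helper name zva128_adjusted_count is made up) of the size
accounting Wilco describes, done in signed arithmetic so the
wrap-around is visible.]

#include <stdint.h>
#include <stdio.h>

/* Model of the zva_128 count adjustment.  In the assembly the adjusted
   count is unsigned and driven by "subs ...; b.hi", so a negative
   value here corresponds to a wrapped count and a DC ZVA loop that
   runs nearly forever, zeroing memory until something faults.  */
static long long
zva128_adjusted_count (uint64_t dstin, uint64_t count)
{
  uint64_t dstend = dstin + count;
  uint64_t dst = dstin & ~127ULL;   /* bic dst, dst, 127 */
  /* "Count is now 128 too large", then biased by another 128.  */
  return (long long) (dstend - dst) - (128 + 128);
}

int
main (void)
{
  /* Sizes 129..~255 now reach set_long, but zva_128 still bakes in
     the old count >= 256 assumption from set_long.  */
  printf ("%lld\n", zva128_adjusted_count (0x1000, 192));   /* -64: wraps */
  printf ("%lld\n", zva128_adjusted_count (0x1000, 4096));  /* 3840: fine */
  return 0;
}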
Hi Florian,

I've now committed the fix as a08d9a52f967531a77e1824c23b5368c6434a72d.
Let me know if that fixes the issue you were seeing.

Cheers,
Wilco