
AArch64: Optimize memset

Message ID PAWPR08MB89827E63CC0EAD1774E391A3839E2@PAWPR08MB8982.eurprd08.prod.outlook.com
State New
Series AArch64: Optimize memset

Commit Message

Wilco Dijkstra Sept. 6, 2024, 1:21 p.m. UTC
Improve small memsets by avoiding branches and using overlapping stores.
Use DC ZVA for sets over 128 bytes.  Remove unnecessary code for ZVA sizes
other than 64 and 128.  Performance of the random memset benchmark improves
by 24% on Neoverse N1.
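
As background, the branchless small-size trick computes an offset that is
16 only for counts of 32 and up and lets the four stores overlap.  A
minimal C sketch of the idea behind the new 16..63-byte path (illustrative
names, not the actual glibc code):

#include <stddef.h>
#include <string.h>

/* Branchless 16..63-byte set using four possibly overlapping 16-byte
   stores.  off is 16 when count >= 32 (bit 4 of count >> 1) and 0
   otherwise, mirroring "and off, off, count, lsr 1" in the patch.  */
static void
set_16_to_63 (unsigned char *dst, int c, size_t count)
{
  unsigned char v[16];
  memset (v, c, sizeof v);             /* stands in for the q0 vector */
  size_t off = (count >> 1) & 16;
  memcpy (dst, v, 16);                 /* bytes [0, 16)             */
  memcpy (dst + off, v, 16);           /* bytes [off, off + 16)     */
  memcpy (dst + count - off - 16, v, 16);
  memcpy (dst + count - 16, v, 16);    /* bytes [count - 16, count) */
}

For count in [16,32) off is 0 and the first and last stores already cover
the whole range; for count in [32,64) off is 16 and the four stores cover
[0, count) with overlap, so no further branching on count is needed.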

Passes regress, OK for commit?

---

Comments

Adhemerval Zanella Netto Sept. 9, 2024, 1:09 p.m. UTC | #1
On 06/09/24 10:21, Wilco Dijkstra wrote:
> 
> Improve small memsets by avoiding branches and using overlapping stores.
> Use DC ZVA for sets over 128 bytes.  Remove unnecessary code for ZVA sizes
> other than 64 and 128.  Performance of the random memset benchmark improves
> by 24% on Neoverse N1.
> 
> Passes regress, OK for commit?
> 
> ---
> 
> diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
> index 7ef77ee8c926de211a2c6d193a23f49808481a82..a7a5fba66712a82c2b571cc0fda1655fd1178186 100644
> --- a/sysdeps/aarch64/memset.S
> +++ b/sysdeps/aarch64/memset.S
> @@ -1,4 +1,5 @@
> -/* Copyright (C) 2012-2024 Free Software Foundation, Inc.
> +/* Generic optimized memset using SIMD.
> +   Copyright (C) 2012-2024 Free Software Foundation, Inc.
>  
>     This file is part of the GNU C Library.
>  
> @@ -17,7 +18,6 @@
>     <https://www.gnu.org/licenses/>.  */
>  
>  #include <sysdep.h>
> -#include "memset-reg.h"

I think we should eventually just remove this file; it does not make
much sense to add register alias definitions for an assembly implementation.

>  
>  #ifndef MEMSET
>  # define MEMSET memset
> @@ -25,130 +25,131 @@
>  
>  /* Assumptions:
>   *
> - * ARMv8-a, AArch64, unaligned accesses
> + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
>   *
>   */
>  
> -ENTRY (MEMSET)
> +#define dstin	x0
> +#define val	x1
> +#define valw	w1
> +#define count	x2
> +#define dst	x3
> +#define dstend	x4
> +#define zva_val	x5
> +#define off	x3
> +#define dstend2	x5
>  
> +ENTRY (MEMSET)
>  	PTR_ARG (0)
>  	SIZE_ARG (2)
> -

Spurious line removal.

The rest looks ok.

>  	dup	v0.16B, valw
> +	cmp	count, 16
> +	b.lo	L(set_small)
> +
>  	add	dstend, dstin, count
> +	cmp	count, 64
> +	b.hs	L(set_128)
>  
> -	cmp	count, 96
> -	b.hi	L(set_long)
> -	cmp	count, 16
> -	b.hs	L(set_medium)
> -	mov	val, v0.D[0]
> +	/* Set 16..63 bytes.  */
> +	mov	off, 16
> +	and	off, off, count, lsr 1
> +	sub	dstend2, dstend, off
> +	str	q0, [dstin]
> +	str	q0, [dstin, off]
> +	str	q0, [dstend2, -16]
> +	str	q0, [dstend, -16]
> +	ret
>  
> +	.p2align 4
>  	/* Set 0..15 bytes.  */
> -	tbz	count, 3, 1f
> -	str	val, [dstin]
> -	str	val, [dstend, -8]
> -	ret
> -	nop
> -1:	tbz	count, 2, 2f
> -	str	valw, [dstin]
> -	str	valw, [dstend, -4]
> +L(set_small):
> +	add	dstend, dstin, count
> +	cmp	count, 4
> +	b.lo	2f
> +	lsr	off, count, 3
> +	sub	dstend2, dstend, off, lsl 2
> +	str	s0, [dstin]
> +	str	s0, [dstin, off, lsl 2]
> +	str	s0, [dstend2, -4]
> +	str	s0, [dstend, -4]
>  	ret
> +
> +	/* Set 0..3 bytes.  */
>  2:	cbz	count, 3f
> +	lsr	off, count, 1
>  	strb	valw, [dstin]
> -	tbz	count, 1, 3f
> -	strh	valw, [dstend, -2]
> +	strb	valw, [dstin, off]
> +	strb	valw, [dstend, -1]
>  3:	ret
>  
> -	/* Set 17..96 bytes.  */
> -L(set_medium):
> -	str	q0, [dstin]
> -	tbnz	count, 6, L(set96)
> -	str	q0, [dstend, -16]
> -	tbz	count, 5, 1f
> -	str	q0, [dstin, 16]
> -	str	q0, [dstend, -32]
> -1:	ret
> -
>  	.p2align 4
> -	/* Set 64..96 bytes.  Write 64 bytes from the start and
> -	   32 bytes from the end.  */
> -L(set96):
> -	str	q0, [dstin, 16]
> +L(set_128):
> +	bic	dst, dstin, 15
> +	cmp	count, 128
> +	b.hi	L(set_long)
> +	stp	q0, q0, [dstin]
>  	stp	q0, q0, [dstin, 32]
> +	stp	q0, q0, [dstend, -64]
>  	stp	q0, q0, [dstend, -32]
>  	ret
>  
> -	.p2align 3
> -	nop
> +	.p2align 4
>  L(set_long):
> -	and	valw, valw, 255
> -	bic	dst, dstin, 15
>  	str	q0, [dstin]
> -	cmp	count, 256
> -	ccmp	valw, 0, 0, cs
> -	b.eq	L(try_zva)
> -L(no_zva):
> -	sub	count, dstend, dst	/* Count is 16 too large.  */
> -	sub	dst, dst, 16		/* Dst is biased by -32.  */
> -	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
> -1:	stp	q0, q0, [dst, 32]
> -	stp	q0, q0, [dst, 64]!
> -L(tail64):
> -	subs	count, count, 64
> -	b.hi	1b
> -2:	stp	q0, q0, [dstend, -64]
> +	str	q0, [dst, 16]
> +	tst	valw, 255
> +	b.ne	L(no_zva)
> +#ifndef ZVA64_ONLY
> +	mrs	zva_val, dczid_el0
> +	and	zva_val, zva_val, 31
> +	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
> +	b.ne	L(zva_128)
> +#endif
> +	stp	q0, q0, [dst, 32]
> +	bic	dst, dstin, 63
> +	sub	count, dstend, dst	/* Count is now 64 too large.  */
> +	sub	count, count, 64 + 64	/* Adjust count and bias for loop.  */
> +
> +	/* Write last bytes before ZVA loop.  */
> +	stp	q0, q0, [dstend, -64]
>  	stp	q0, q0, [dstend, -32]
> +
> +	.p2align 4
> +L(zva64_loop):
> +	add	dst, dst, 64
> +	dc	zva, dst
> +	subs	count, count, 64
> +	b.hi	L(zva64_loop)
>  	ret
>  
> -L(try_zva):
> -#ifndef ZVA64_ONLY
>  	.p2align 3
> -	mrs	tmp1, dczid_el0
> -	tbnz	tmp1w, 4, L(no_zva)
> -	and	tmp1w, tmp1w, 15
> -	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
> -	b.ne	 L(zva_128)
> -	nop
> -#endif
> -	/* Write the first and last 64 byte aligned block using stp rather
> -	   than using DC ZVA.  This is faster on some cores.
> -	 */
> -	.p2align 4
> -L(zva_64):
> -	str	q0, [dst, 16]
> +L(no_zva):
> +	sub	count, dstend, dst	/* Count is 32 too large.  */
> +	sub	count, count, 64 + 32	/* Adjust count and bias for loop.  */
> +L(no_zva_loop):
>  	stp	q0, q0, [dst, 32]
> -	bic	dst, dst, 63
>  	stp	q0, q0, [dst, 64]
> -	stp	q0, q0, [dst, 96]
> -	sub	count, dstend, dst	/* Count is now 128 too large.	*/
> -	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
> -	add	dst, dst, 128
> -1:	dc	zva, dst
>  	add	dst, dst, 64
>  	subs	count, count, 64
> -	b.hi	1b
> -	stp	q0, q0, [dst, 0]
> -	stp	q0, q0, [dst, 32]
> +	b.hi	L(no_zva_loop)
>  	stp	q0, q0, [dstend, -64]
>  	stp	q0, q0, [dstend, -32]
>  	ret
>  
>  #ifndef ZVA64_ONLY
> -	.p2align 3
> +	.p2align 4
>  L(zva_128):
> -	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
> -	b.ne	L(zva_other)
> +	cmp	zva_val, 5		/* ZVA size is 128 bytes.  */
> +	b.ne	L(no_zva)
>  
> -	str	q0, [dst, 16]
>  	stp	q0, q0, [dst, 32]
>  	stp	q0, q0, [dst, 64]
>  	stp	q0, q0, [dst, 96]
>  	bic	dst, dst, 127
>  	sub	count, dstend, dst	/* Count is now 128 too large.	*/
> -	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
> -	add	dst, dst, 128
> -1:	dc	zva, dst
> -	add	dst, dst, 128
> +	sub	count, count, 128 + 128	/* Adjust count and bias for loop.  */
> +1:	add	dst, dst, 128
> +	dc	zva, dst
>  	subs	count, count, 128
>  	b.hi	1b
>  	stp	q0, q0, [dstend, -128]
> @@ -156,35 +157,6 @@ L(zva_128):
>  	stp	q0, q0, [dstend, -64]
>  	stp	q0, q0, [dstend, -32]
>  	ret
> -
> -L(zva_other):
> -	mov	tmp2w, 4
> -	lsl	zva_lenw, tmp2w, tmp1w
> -	add	tmp1, zva_len, 64	/* Max alignment bytes written.	 */
> -	cmp	count, tmp1
> -	blo	L(no_zva)
> -
> -	sub	tmp2, zva_len, 1
> -	add	tmp1, dst, zva_len
> -	add	dst, dst, 16
> -	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
> -	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
> -	beq	2f
> -1:	stp	q0, q0, [dst], 64
> -	stp	q0, q0, [dst, -32]
> -	subs	count, count, 64
> -	b.hi	1b
> -2:	mov	dst, tmp1
> -	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
> -	subs	count, count, zva_len
> -	b.lo	4f
> -3:	dc	zva, dst
> -	add	dst, dst, zva_len
> -	subs	count, count, zva_len
> -	b.hs	3b
> -4:	add	count, count, zva_len
> -	sub	dst, dst, 32		/* Bias dst for tail loop.  */
> -	b	L(tail64)
>  #endif
>  
>  END (MEMSET)
>
Florian Weimer Nov. 21, 2024, 6:51 p.m. UTC | #2
* Wilco Dijkstra:

> Improve small memsets by avoiding branches and using overlapping stores.
> Use DC ZVA for sets over 128 bytes.  Remove unnecessary code for ZVA
> sizes other than 64 and 128.  Performance of the random memset benchmark
> improves by 24% on Neoverse N1.
>
> Passes regress, OK for commit?

We have received a report that this breaks certain virtualized AArch64
environments:

  Bug 2327564 - glibc-2.40.9000-4.fc42 update causes kernel panic during
  aarch64 Fedora CoreOS boot on OpenStack
  <https://bugzilla.redhat.com/show_bug.cgi?id=2327564>

Output from ld.so --list-diagnostics is below.

The crash is in PID 1, so there is not much data to go by.  I can
probably get a login on such a system running glibc 2.40 and try running the
glibc testsuite from the current development branch there, but I'm not
sure if I could make sense of the results.

One issue I see is that the dczid_el0 masking in
sysdeps/aarch64/memset.S is inconsistent with
sysdeps/unix/sysv/linux/aarch64/cpu-features.c: the former uses 31, while
the latter uses 0xf.  But that does not make a difference here because
dczid_el0 is 5.
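
For reference, a rough C sketch of the DCZID_EL0 decoding (per the Arm
ARM: bits [3:0] hold BS, log2 of the block size in 4-byte words, and bit
4 is DZP, "DC ZVA prohibited"; illustrative code, not the actual
cpu-features.c implementation):

#include <stdint.h>

/* cpu-features.c style: test DZP, then mask only bits [3:0].  */
static unsigned int
zva_size (uint64_t dczid)
{
  if (dczid & 16)                /* DZP set: DC ZVA is prohibited.  */
    return 0;
  return 4U << (dczid & 0xf);    /* 2^BS words of 4 bytes each.  */
}

memset.S instead masks with 31, which keeps DZP in the compared value:
if DZP is set the result can never equal 4 (64-byte ZVA) or 5 (128-byte
ZVA), so the same comparisons also route prohibited-ZVA systems to the
non-ZVA path.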

Thanks,
Florian

dl_dst_lib="lib64"
dl_hwcap=0x8ff
dl_hwcap2=0x0
dl_hwcap3=0x0
dl_hwcap4=0x0
dl_hwcaps_subdirs=""
dl_hwcaps_subdirs_active=0x0
dl_pagesize=0x1000
dl_platform="aarch64"
dl_profile_output="/var/tmp"
dso.ld="ld-linux-aarch64.so.1"
dso.libc="libc.so.6"
env_filtered[0x0]="SHELL"
env_filtered[0x1]="HISTCONTROL"
env_filtered[0x2]="HISTSIZE"
env_filtered[0x3]="HOSTNAME"
env_filtered[0x4]="GPG_TTY"
env_filtered[0x5]="EDITOR"
env_filtered[0x6]="PWD"
env_filtered[0x7]="LOGNAME"
env_filtered[0x8]="XDG_SESSION_TYPE"
env_filtered[0x9]="MOTD_SHOWN"
env_filtered[0xa]="HOME"
env[0xb]="LANG=C.UTF-8"
env_filtered[0xc]="LS_COLORS"
env_filtered[0xd]="SSH_CONNECTION"
env_filtered[0xe]="XDG_SESSION_CLASS"
env_filtered[0xf]="SELINUX_ROLE_REQUESTED"
env_filtered[0x10]="TERM"
env_filtered[0x11]="LESSOPEN"
env_filtered[0x12]="USER"
env_filtered[0x13]="SELINUX_USE_CURRENT_RANGE"
env_filtered[0x14]="SHLVL"
env_filtered[0x15]="XDG_SESSION_ID"
env_filtered[0x16]="XDG_RUNTIME_DIR"
env_filtered[0x17]="SSH_CLIENT"
env[0x18]="PATH=/var/home/core/.local/bin:/var/home/core/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin"
env_filtered[0x19]="SELINUX_LEVEL_REQUESTED"
env_filtered[0x1a]="DBUS_SESSION_BUS_ADDRESS"
env_filtered[0x1b]="MAIL"
env_filtered[0x1c]="SSH_TTY"
env_filtered[0x1d]="_"
path.prefix="/usr"
path.rtld="/lib/ld-linux-aarch64.so.1"
path.sysconfdir="/etc"
path.system_dirs[0x0]="/lib64/"
path.system_dirs[0x1]="/usr/lib64/"
version.release="development"
version.version="2.40.9000"
auxv[0x0].a_type=0x21
auxv[0x0].a_val=0xffffa482b000
auxv[0x1].a_type=0x33
auxv[0x1].a_val=0x1270
auxv[0x2].a_type=0x10
auxv[0x2].a_val=0x8ff
auxv[0x3].a_type=0x6
auxv[0x3].a_val=0x1000
auxv[0x4].a_type=0x11
auxv[0x4].a_val=0x64
auxv[0x5].a_type=0x3
auxv[0x5].a_val=0xffffa47e0040
auxv[0x6].a_type=0x4
auxv[0x6].a_val=0x38
auxv[0x7].a_type=0x5
auxv[0x7].a_val=0x9
auxv[0x8].a_type=0x7
auxv[0x8].a_val=0x0
auxv[0x9].a_type=0x8
auxv[0x9].a_val=0x0
auxv[0xa].a_type=0x9
auxv[0xa].a_val=0xffffa47faa80
auxv[0xb].a_type=0xb
auxv[0xb].a_val=0x3e8
auxv[0xc].a_type=0xc
auxv[0xc].a_val=0x3e8
auxv[0xd].a_type=0xd
auxv[0xd].a_val=0x3e8
auxv[0xe].a_type=0xe
auxv[0xe].a_val=0x3e8
auxv[0xf].a_type=0x17
auxv[0xf].a_val=0x0
auxv[0x10].a_type=0x19
auxv[0x10].a_val=0xffffee238298
auxv[0x11].a_type=0x1a
auxv[0x11].a_val=0x0
auxv[0x12].a_type=0x1d
auxv[0x12].a_val=0x0
auxv[0x13].a_type=0x1f
auxv[0x13].a_val_string="/usr/bin/ld.so"
auxv[0x14].a_type=0xf
auxv[0x14].a_val_string="aarch64"
auxv[0x15].a_type=0x1b
auxv[0x15].a_val=0x1c
auxv[0x16].a_type=0x1c
auxv[0x16].a_val=0x20
uname.sysname="Linux"
uname.nodename="host-192-168-40-30"
uname.release="6.13.0-0.rc0.20241119git158f238aa69d.2.fc42.aarch64"
uname.version="#1 SMP PREEMPT_DYNAMIC Tue Nov 19 16:59:12 UTC 2024"
uname.machine="aarch64"
uname.domainname="(none)"
aarch64.cpu_features.bti=0x0
aarch64.cpu_features.midr_el1=0x431f0a11
aarch64.cpu_features.mops=0x0
aarch64.cpu_features.mte_state=0x0
aarch64.cpu_features.prefer_sve_ifuncs=0x0
aarch64.cpu_features.sve=0x0
aarch64.cpu_features.zva_size=0x80
aarch64.processor[0x0].requested=0x0
aarch64.processor[0x0].observed=0x0
aarch64.processor[0x0].observed_node=0x0
aarch64.processor[0x0].midr_el1=0x431f0a11
aarch64.processor[0x0].dczid_el0=0x5
aarch64.processor[0x1].requested=0x1
aarch64.processor[0x1].observed=0x1
aarch64.processor[0x1].observed_node=0x0
aarch64.processor[0x1].midr_el1=0x431f0a11
aarch64.processor[0x1].dczid_el0=0x5
aarch64.processor[0x2].requested=0x2
aarch64.processor[0x2].observed=0x2
aarch64.processor[0x2].observed_node=0x0
aarch64.processor[0x2].midr_el1=0x431f0a11
aarch64.processor[0x2].dczid_el0=0x5
aarch64.processor[0x3].requested=0x3
aarch64.processor[0x3].observed=0x3
aarch64.processor[0x3].observed_node=0x0
aarch64.processor[0x3].midr_el1=0x431f0a11
aarch64.processor[0x3].dczid_el0=0x5
Wilco Dijkstra Nov. 21, 2024, 9:29 p.m. UTC | #3
Hi Florian,

> The crash is in PID 1, so there is not much data to go by.  I can
> probably get a login on such a system running glibc 2.40 and try running the
> glibc testsuite from the current development branch there, but I'm not
> sure if I could make sense of the results.

> One issue I see is that the dczid_el0 masking in
> sysdeps/aarch64/memset.S is inconsistent with
> sysdeps/unix/sysv/linux/aarch64/cpu-features.c: the former uses 31, while
> the latter uses 0xf.  But that does not make a difference here because
> dczid_el0 is 5.

Basically I'm checking DZP (bit 4) at the same time as the ZVA size, so that's fine.
The value 5 means it is a 128-byte ZVA, and that is enough of a clue to figure it out!
I kept the code for zva_128 pretty much as-is.  It requires a minimum size of 256,
since it doesn't use SUBS count, count, 128+128; b.ls ... before the zva_128 loop.

The new version changes the code in set_long so that it no longer checks
count >= 256.  That means count can underflow in zva_128, and the loop will
then clear an effectively unbounded amount of memory...  So it needs an extra
check for count > 256 (or maybe larger, since the cost of 128-byte alignment
is quite high).
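
To make the failure concrete, here is a small standalone C program
mimicking the count arithmetic of the zva_128 path (illustrative
numbers: a zero memset of 160 bytes with dstin already 128-byte
aligned, so "bic dst, dst, 127" leaves dst unchanged):

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  uint64_t count = 160;      /* 128 < count < 256: set_long, ZVA path.  */
  /* sub count, dstend, dst: still 160 for an aligned destination.  */
  count -= 128 + 128;        /* Loop bias: wraps to 0xffffffffffffff60.  */

  /* "subs count, count, 128; b.hi" is an unsigned test, so the wrapped
     value keeps the loop alive and every "dc zva, dst" zeroes another
     128 bytes past the end of the buffer until something faults.  */
  printf ("biased count = %#jx\n", (uintmax_t) count);
  printf ("dc zva iterations ~= %ju\n", (uintmax_t) (count / 128));
  return 0;
}

The old code could not hit this because set_long only reached try_zva
when count >= 256 (the cmp count, 256 / ccmp / b.eq sequence that the
patch removes).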

I'll post a patch.

Cheers,
Wilco
Wilco Dijkstra Nov. 29, 2024, 1:42 p.m. UTC | #4
Hi Florian,

I've now committed the fix as a08d9a52f967531a77e1824c23b5368c6434a72d.
Let me know if that fixes the issue you were seeing.

Cheers,
Wilco

Patch

diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index 7ef77ee8c926de211a2c6d193a23f49808481a82..a7a5fba66712a82c2b571cc0fda1655fd1178186 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -1,4 +1,5 @@ 
-/* Copyright (C) 2012-2024 Free Software Foundation, Inc.
+/* Generic optimized memset using SIMD.
+   Copyright (C) 2012-2024 Free Software Foundation, Inc.
 
    This file is part of the GNU C Library.
 
@@ -17,7 +18,6 @@ 
    <https://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
-#include "memset-reg.h"
 
 #ifndef MEMSET
 # define MEMSET memset
@@ -25,130 +25,131 @@ 
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64, unaligned accesses
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
  *
  */
 
-ENTRY (MEMSET)
+#define dstin	x0
+#define val	x1
+#define valw	w1
+#define count	x2
+#define dst	x3
+#define dstend	x4
+#define zva_val	x5
+#define off	x3
+#define dstend2	x5
 
+ENTRY (MEMSET)
 	PTR_ARG (0)
 	SIZE_ARG (2)
-
 	dup	v0.16B, valw
+	cmp	count, 16
+	b.lo	L(set_small)
+
 	add	dstend, dstin, count
+	cmp	count, 64
+	b.hs	L(set_128)
 
-	cmp	count, 96
-	b.hi	L(set_long)
-	cmp	count, 16
-	b.hs	L(set_medium)
-	mov	val, v0.D[0]
+	/* Set 16..63 bytes.  */
+	mov	off, 16
+	and	off, off, count, lsr 1
+	sub	dstend2, dstend, off
+	str	q0, [dstin]
+	str	q0, [dstin, off]
+	str	q0, [dstend2, -16]
+	str	q0, [dstend, -16]
+	ret
 
+	.p2align 4
 	/* Set 0..15 bytes.  */
-	tbz	count, 3, 1f
-	str	val, [dstin]
-	str	val, [dstend, -8]
-	ret
-	nop
-1:	tbz	count, 2, 2f
-	str	valw, [dstin]
-	str	valw, [dstend, -4]
+L(set_small):
+	add	dstend, dstin, count
+	cmp	count, 4
+	b.lo	2f
+	lsr	off, count, 3
+	sub	dstend2, dstend, off, lsl 2
+	str	s0, [dstin]
+	str	s0, [dstin, off, lsl 2]
+	str	s0, [dstend2, -4]
+	str	s0, [dstend, -4]
 	ret
+
+	/* Set 0..3 bytes.  */
 2:	cbz	count, 3f
+	lsr	off, count, 1
 	strb	valw, [dstin]
-	tbz	count, 1, 3f
-	strh	valw, [dstend, -2]
+	strb	valw, [dstin, off]
+	strb	valw, [dstend, -1]
 3:	ret
 
-	/* Set 17..96 bytes.  */
-L(set_medium):
-	str	q0, [dstin]
-	tbnz	count, 6, L(set96)
-	str	q0, [dstend, -16]
-	tbz	count, 5, 1f
-	str	q0, [dstin, 16]
-	str	q0, [dstend, -32]
-1:	ret
-
 	.p2align 4
-	/* Set 64..96 bytes.  Write 64 bytes from the start and
-	   32 bytes from the end.  */
-L(set96):
-	str	q0, [dstin, 16]
+L(set_128):
+	bic	dst, dstin, 15
+	cmp	count, 128
+	b.hi	L(set_long)
+	stp	q0, q0, [dstin]
 	stp	q0, q0, [dstin, 32]
+	stp	q0, q0, [dstend, -64]
 	stp	q0, q0, [dstend, -32]
 	ret
 
-	.p2align 3
-	nop
+	.p2align 4
 L(set_long):
-	and	valw, valw, 255
-	bic	dst, dstin, 15
 	str	q0, [dstin]
-	cmp	count, 256
-	ccmp	valw, 0, 0, cs
-	b.eq	L(try_zva)
-L(no_zva):
-	sub	count, dstend, dst	/* Count is 16 too large.  */
-	sub	dst, dst, 16		/* Dst is biased by -32.  */
-	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
-1:	stp	q0, q0, [dst, 32]
-	stp	q0, q0, [dst, 64]!
-L(tail64):
-	subs	count, count, 64
-	b.hi	1b
-2:	stp	q0, q0, [dstend, -64]
+	str	q0, [dst, 16]
+	tst	valw, 255
+	b.ne	L(no_zva)
+#ifndef ZVA64_ONLY
+	mrs	zva_val, dczid_el0
+	and	zva_val, zva_val, 31
+	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
+	b.ne	L(zva_128)
+#endif
+	stp	q0, q0, [dst, 32]
+	bic	dst, dstin, 63
+	sub	count, dstend, dst	/* Count is now 64 too large.  */
+	sub	count, count, 64 + 64	/* Adjust count and bias for loop.  */
+
+	/* Write last bytes before ZVA loop.  */
+	stp	q0, q0, [dstend, -64]
 	stp	q0, q0, [dstend, -32]
+
+	.p2align 4
+L(zva64_loop):
+	add	dst, dst, 64
+	dc	zva, dst
+	subs	count, count, 64
+	b.hi	L(zva64_loop)
 	ret
 
-L(try_zva):
-#ifndef ZVA64_ONLY
 	.p2align 3
-	mrs	tmp1, dczid_el0
-	tbnz	tmp1w, 4, L(no_zva)
-	and	tmp1w, tmp1w, 15
-	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
-	b.ne	 L(zva_128)
-	nop
-#endif
-	/* Write the first and last 64 byte aligned block using stp rather
-	   than using DC ZVA.  This is faster on some cores.
-	 */
-	.p2align 4
-L(zva_64):
-	str	q0, [dst, 16]
+L(no_zva):
+	sub	count, dstend, dst	/* Count is 32 too large.  */
+	sub	count, count, 64 + 32	/* Adjust count and bias for loop.  */
+L(no_zva_loop):
 	stp	q0, q0, [dst, 32]
-	bic	dst, dst, 63
 	stp	q0, q0, [dst, 64]
-	stp	q0, q0, [dst, 96]
-	sub	count, dstend, dst	/* Count is now 128 too large.	*/
-	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
-	add	dst, dst, 128
-1:	dc	zva, dst
 	add	dst, dst, 64
 	subs	count, count, 64
-	b.hi	1b
-	stp	q0, q0, [dst, 0]
-	stp	q0, q0, [dst, 32]
+	b.hi	L(no_zva_loop)
 	stp	q0, q0, [dstend, -64]
 	stp	q0, q0, [dstend, -32]
 	ret
 
 #ifndef ZVA64_ONLY
-	.p2align 3
+	.p2align 4
 L(zva_128):
-	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
-	b.ne	L(zva_other)
+	cmp	zva_val, 5		/* ZVA size is 128 bytes.  */
+	b.ne	L(no_zva)
 
-	str	q0, [dst, 16]
 	stp	q0, q0, [dst, 32]
 	stp	q0, q0, [dst, 64]
 	stp	q0, q0, [dst, 96]
 	bic	dst, dst, 127
 	sub	count, dstend, dst	/* Count is now 128 too large.	*/
-	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
-	add	dst, dst, 128
-1:	dc	zva, dst
-	add	dst, dst, 128
+	sub	count, count, 128 + 128	/* Adjust count and bias for loop.  */
+1:	add	dst, dst, 128
+	dc	zva, dst
 	subs	count, count, 128
 	b.hi	1b
 	stp	q0, q0, [dstend, -128]
@@ -156,35 +157,6 @@  L(zva_128):
 	stp	q0, q0, [dstend, -64]
 	stp	q0, q0, [dstend, -32]
 	ret
-
-L(zva_other):
-	mov	tmp2w, 4
-	lsl	zva_lenw, tmp2w, tmp1w
-	add	tmp1, zva_len, 64	/* Max alignment bytes written.	 */
-	cmp	count, tmp1
-	blo	L(no_zva)
-
-	sub	tmp2, zva_len, 1
-	add	tmp1, dst, zva_len
-	add	dst, dst, 16
-	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
-	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
-	beq	2f
-1:	stp	q0, q0, [dst], 64
-	stp	q0, q0, [dst, -32]
-	subs	count, count, 64
-	b.hi	1b
-2:	mov	dst, tmp1
-	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
-	subs	count, count, zva_len
-	b.lo	4f
-3:	dc	zva, dst
-	add	dst, dst, zva_len
-	subs	count, count, zva_len
-	b.hs	3b
-4:	add	count, count, zva_len
-	sub	dst, dst, 32		/* Bias dst for tail loop.  */
-	b	L(tail64)
 #endif
 
 END (MEMSET)