
AArch64: Improve generic strlen

Message ID PAWPR08MB89822A40A121C666B4A6AADE83B12@PAWPR08MB8982.eurprd08.prod.outlook.com
State New
Series AArch64: Improve generic strlen

Commit Message

Wilco Dijkstra July 31, 2024, 12:21 p.m. UTC
Improve performance by handling another 16 bytes before entering the loop.
Use ADDHN in the loop to avoid SHRN+FMOV when it terminates.  Change final
size computation to avoid increasing latency.  On Neoverse V1 performance
of the random strlen benchmark improves by 4.6%.
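
The entry path and the new L(next16) block both rely on the same syndrome
trick: CMEQ marks each NUL byte with 0xff, SHRN #4 packs the 128-bit mask
into a 64-bit value with 4 bits per input byte, and RBIT+CLZ (a trailing
zero count on little-endian) divided by 4 gives the index of the first NUL.
A minimal C model using NEON intrinsics (little-endian only, made-up helper
name, not the glibc code itself):

/* Rough model of the syndrome trick (little-endian only; helper name
   is made up).  CMEQ sets a byte to 0xff for every NUL, SHRN #4 packs
   the 128-bit mask into 64 bits (4 bits per input byte), and the index
   of the first NUL is ctz (syndrome) / 4 -- RBIT+CLZ in the asm.  */
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

static int
first_nul_index_16 (const uint8_t *p)
{
  uint8x16_t data = vld1q_u8 (p);
  uint8x16_t has_nul = vceqq_u8 (data, vdupq_n_u8 (0));	           /* CMEQ  */
  uint8x8_t end = vshrn_n_u16 (vreinterpretq_u16_u8 (has_nul), 4); /* SHRN  */
  uint64_t synd = vget_lane_u64 (vreinterpret_u64_u8 (end), 0);    /* FMOV  */
  if (synd == 0)
    return -1;				/* No NUL in these 16 bytes.  */
  return __builtin_ctzll (synd) >> 2;
}

int
main (void)
{
  uint8_t buf[16] = "hello";			/* NUL at index 5.  */
  printf ("%d\n", first_nul_index_16 (buf));	/* Prints 5.  */
  return 0;
}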

OK for commit?

---

Comments

Adhemerval Zanella Netto Aug. 6, 2024, 6:15 p.m. UTC | #1
On 31/07/24 09:21, Wilco Dijkstra wrote:
> Improve performance by handling another 16 bytes before entering the loop.
> Use ADDHN in the loop to avoid SHRN+FMOV when it terminates.  Change final
> size computation to avoid increasing latency.  On Neoverse V1 performance
> of the random strlen benchmark improves by 4.6%.
> 
> OK for commit?

Looks OK, although the Arm blog states that umaxp is faster for
strings larger than 128 bytes [1].  That means the implementation
now assumes most input strings are shorter than 128 bytes, which
seems to be the outcome of the random strlen benchmark based
on speccpu2017 [2].

I think we should make the rationale for this change clear in the
commit message.

[1] https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
[2] https://patchwork.sourceware.org/project/glibc/patch/PAWPR08MB8982471B55A3A03B78C1DF8083B02@PAWPR08MB8982.eurprd08.prod.outlook.com/

Wilco Dijkstra Aug. 6, 2024, 10:49 p.m. UTC | #2
Hi Adhemerval,

> On 31/07/24 09:21, Wilco Dijkstra wrote:
>> Improve performance by handling another 16 bytes before entering the loop.
>> Use ADDHN in the loop to avoid SHRN+FMOV when it terminates.  Change final
>> size computation to avoid increasing latency.  On Neoverse V1 performance
>> of the random strlen benchmark improves by 4.6%.

> Looks OK, although the Arm blog states that umaxp is faster for
> strings larger than 128 bytes [1].  That means the implementation

I believe that refers to using SHRN vs UMAXP inside the inner loop - using the
former is faster for small loops, but the latter is faster for large loops.
ADDHN compresses the mask in a similar way to SHRN and is as fast as UMAXP,
so you get the best of both worlds.
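
A minimal sketch of that point (little-endian only, made-up helper name):
ADDHN of the CMEQ mask with itself takes the high byte of each doubled
16-bit element, giving 0x00, 0x01, 0xfe or 0xff per byte pair, so per-byte
position survives and a trailing zero count still locates the first NUL -
the index simply becomes (ctz + 3) >> 2, which is the arithmetic the new
L(loop_end) performs.

/* Model of the ADDHN-based syndrome (little-endian only; helper name
   is made up).  Adding the CMEQ mask to itself and taking the high
   half of each 16-bit element gives 0x00 (no NUL in the byte pair),
   0x01 (even byte), 0xfe (odd byte) or 0xff (both), so per-byte
   position survives and the first NUL is at (ctz (syndrome) + 3) / 4,
   matching the new L(loop_end) arithmetic.  */
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

static int
first_nul_index_addhn (const uint8_t *p)
{
  uint8x16_t data = vld1q_u8 (p);
  uint16x8_t has_nul
    = vreinterpretq_u16_u8 (vceqq_u8 (data, vdupq_n_u8 (0)));	/* CMEQ  */
  uint8x8_t end = vaddhn_u16 (has_nul, has_nul);		/* ADDHN */
  uint64_t synd = vget_lane_u64 (vreinterpret_u64_u8 (end), 0);	/* FMOV  */
  if (synd == 0)
    return -1;
  return (__builtin_ctzll (synd) + 3) >> 2;
}

int
main (void)
{
  uint8_t buf[16] = "strlen";			  /* NUL at index 6.  */
  printf ("%d\n", first_nul_index_addhn (buf));	  /* Prints 6.  */
  return 0;
}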

> now assumes most input strings are shorter than 128 bytes, which
> seems to be the outcome of the random strlen benchmark based
> on speccpu2017 [2].

Of course all string functions should be optimized for the common case - and
that means very short strings.  The new next16 case helps with that.  However,
this doesn't mean longer strings become slower: the new loop is now as fast or
faster for both short and long strings.

Cheers,
Wilco
Adhemerval Zanella Netto Aug. 6, 2024, 11:08 p.m. UTC | #3
> On 6 Aug 2024, at 19:49, Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
> Hi Adhemerval,
> 
>> On 31/07/24 09:21, Wilco Dijkstra wrote:
>>> Improve performance by handling another 16 bytes before entering the loop.
>>> Use ADDHN in the loop to avoid SHRN+FMOV when it terminates.  Change final
>>> size computation to avoid increasing latency.  On Neoverse V1 performance
>>> of the random strlen benchmark improves by 4.6%.
> 
>> Looks OK, although the Arm blog states that umaxp is faster for
>> strings larger than 128 bytes [1].  That means the implementation
> 
> I believe that refers to using SHRN vs UMAXP inside the inner loop - using the
> former is faster for small loops, but the latter is faster for large loops.
> ADDHN compresses the mask in a similar way to SHRN and is as fast as UMAXP,
> so you get the best of both worlds.
> 

I had the impression that using umaxp would saturate more vector ports, but it seems that is not the case for strlen.

>> now assumes most input strings are shorter than 128 bytes, which
>> seems to be the outcome of the random strlen benchmark based
>> on speccpu2017 [2].
> 
> Of course all string functions should be optimized for the common case - and
> that means very short strings.  The new next16 case helps with that.  However,
> this doesn't mean longer strings become slower: the new loop is now as fast or
> faster for both short and long strings.

Fair enough.

Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>

> 
> Cheers,
> Wilco
Wilco Dijkstra Aug. 7, 2024, 2:22 p.m. UTC | #4
Hi Adhemerval,

> I had the impression that using umaxp would saturate more vector ports, but it seems that is not the case for strlen.

UMINP works out well when you need to combine 2 or 4 vectors, as in __strlen_asimd.
However, for compressing 128-bit masks to 64 bits, ADDHN is better since it preserves
each element like SHRN and so can be fed to RBIT+CLZ, while UMINP/UMAXP merge
pairs of elements (and thus require an extra FMOV and SHRN or ADDHN after the loop).
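
A small sketch of that difference (little-endian only, illustrative): the
pairwise max of the CMEQ mask is 0xff whether the NUL is the even or the
odd byte of a pair, so the merged syndrome alone cannot pinpoint the byte,
which is why the old code re-ran SHRN+FMOV on vhas_nul after the loop.

/* Why UMAXP needs a second pass (little-endian only, illustrative):
   the pairwise max of the CMEQ mask is 0xff whether the NUL is the
   even or the odd byte of a pair, so the merged syndrome alone cannot
   pinpoint the byte.  */
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  uint8_t a[16] = { 'x', 0 };		/* First NUL at index 1.  */
  uint8_t b[16] = { 0, 'x' };		/* First NUL at index 0.  */
  const uint8_t *bufs[2] = { a, b };

  for (int i = 0; i < 2; i++)
    {
      uint8x16_t mask = vceqq_u8 (vld1q_u8 (bufs[i]), vdupq_n_u8 (0));
      uint8x16_t merged = vpmaxq_u8 (mask, mask);		/* UMAXP */
      uint64_t synd = vgetq_lane_u64 (vreinterpretq_u64_u8 (merged), 0);
      /* Both buffers produce the same low syndrome byte (0xff).  */
      printf ("syndrome low byte: 0x%02x\n", (unsigned) (synd & 0xff));
    }
  return 0;
}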

Cheers,
Wilco

Patch

diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
index ab2a576cdb5665e596b791299af3f4abecb73c0e..352fb40d3abbb44bc1cc604cfd1c86fdc8f8e251 100644
--- a/sysdeps/aarch64/strlen.S
+++ b/sysdeps/aarch64/strlen.S
@@ -1,4 +1,5 @@ 
-/* Copyright (C) 2012-2024 Free Software Foundation, Inc.
+/* Generic optimized strlen using SIMD.
+   Copyright (C) 2012-2024 Free Software Foundation, Inc.
 
    This file is part of the GNU C Library.
 
@@ -56,36 +57,50 @@  ENTRY (STRLEN)
 	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	synd, dend
 	lsr	synd, synd, shift
-	cbz	synd, L(loop)
+	cbz	synd, L(next16)
 
 	rbit	synd, synd
 	clz	result, synd
 	lsr	result, result, 2
 	ret
 
+L(next16):
+	ldr	data, [src, 16]
+	cmeq	vhas_nul.16b, vdata.16b, 0
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
+	fmov	synd, dend
+	cbz	synd, L(loop)
+	add	src, src, 16
+#ifndef __AARCH64EB__
+	rbit	synd, synd
+#endif
+	sub	result, src, srcin
+	clz	tmp, synd
+	add	result, result, tmp, lsr 2
+	ret
+
 	.p2align 5
 L(loop):
-	ldr	data, [src, 16]
+	ldr	data, [src, 32]!
 	cmeq	vhas_nul.16b, vdata.16b, 0
-	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	addhn	vend.8b, vhas_nul.8h, vhas_nul.8h
 	fmov	synd, dend
 	cbnz	synd, L(loop_end)
-	ldr	data, [src, 32]!
+	ldr	data, [src, 16]
 	cmeq	vhas_nul.16b, vdata.16b, 0
-	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	addhn	vend.8b, vhas_nul.8h, vhas_nul.8h
 	fmov	synd, dend
 	cbz	synd, L(loop)
-	sub	src, src, 16
+	add	src, src, 16
 L(loop_end):
-	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
-	sub	result, src, srcin
-	fmov	synd, dend
+	sub	result, shift, src, lsl 2	/* (srcin - src) << 2.  */
 #ifndef __AARCH64EB__
 	rbit	synd, synd
+	sub	result, result, 3
 #endif
-	add	result, result, 16
 	clz	tmp, synd
-	add	result, result, tmp, lsr 2
+	sub	result, tmp, result
+	lsr	result, result, 2
 	ret
 
 END (STRLEN)