[2/2] Ignore prefetcher tagging for smaller copies

Message ID	20180503175209.2943-3-siddhesh@sourceware.org
State	New
Headers	show Return-Path: <libc-alpha-return-92021-incoming=patchwork.ozlabs.org@sourceware.org> DomainKey-Signature: a=rsa-sha1; c=nofws; d=sourceware.org; h=list-id :list-unsubscribe:list-subscribe:list-archive:list-post :list-help:sender:from:to:subject:date:message-id:in-reply-to :references; q=dns; s=default; b=oOlrEYpZssxlRfXOK74Iic0GYJSBHAt 7HNd2zNJk0Y/IxW17DbqqTNqG6PgXj7ij/TdrCraTp4JBhPReio7B2xQ7skXNelH 6Cs4H8BHfFISGlm9GeEypkVhSp0iNzm7FtOEXUN9VBFFe0GBvC/lXNBZaMHxjhLf xlGOUo0XG5dw= Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk Sender: libc-alpha-owner@sourceware.org From: Siddhesh Poyarekar <siddhesh@sourceware.org> To: libc-alpha@sourceware.org Subject: [PATCH 2/2] Ignore prefetcher tagging for smaller copies Date: Thu, 3 May 2018 23:22:09 +0530 Message-Id: <20180503175209.2943-3-siddhesh@sourceware.org> In-Reply-To: <20180503175209.2943-1-siddhesh@sourceware.org> References: <20180503175209.2943-1-siddhesh@sourceware.org>
Series	aarch64,falkor: memcpy/memmove performance improvements \| expand [0/2] aarch64,falkor: memcpy/memmove performance improvements [1/2] aarch64,falkor: Ignore prefetcher hints for memmove tail [2/2] Ignore prefetcher tagging for smaller copies

Message ID

20180503175209.2943-3-siddhesh@sourceware.org

State

New

Headers

DomainKey-Signature: a=rsa-sha1; c=nofws; d=sourceware.org; h=list-id
	:list-unsubscribe:list-subscribe:list-archive:list-post
	:list-help:sender:from:to:subject:date:message-id:in-reply-to
	:references; q=dns; s=default; b=oOlrEYpZssxlRfXOK74Iic0GYJSBHAt
	7HNd2zNJk0Y/IxW17DbqqTNqG6PgXj7ij/TdrCraTp4JBhPReio7B2xQ7skXNelH
	6Cs4H8BHfFISGlm9GeEypkVhSp0iNzm7FtOEXUN9VBFFe0GBvC/lXNBZaMHxjhLf
	xlGOUo0XG5dw=
Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm
Precedence: bulk
Sender: libc-alpha-owner@sourceware.org
From: Siddhesh Poyarekar <siddhesh@sourceware.org>
To: libc-alpha@sourceware.org
Subject: [PATCH 2/2] Ignore prefetcher tagging for smaller copies
Date: Thu,  3 May 2018 23:22:09 +0530
Message-Id: <20180503175209.2943-3-siddhesh@sourceware.org>
In-Reply-To: <20180503175209.2943-1-siddhesh@sourceware.org>
References: <20180503175209.2943-1-siddhesh@sourceware.org>

Series

aarch64,falkor: memcpy/memmove performance improvements | expand

Commit Message

Siddhesh Poyarekar May 3, 2018, 5:52 p.m. UTC

For smaller and medium sized copies, the effect of hardware
prefetching are not as dominant as instruction level parallelism.
Hence it makes more sense to load data into multiple registers than to
try and route them to the same prefetch unit.  This is also the case
for the loop exit where we are unable to latch on to the same prefetch
unit anyway so it makes more sense to have data loaded in parallel.

The performance results are a bit mixed with memcpy-random, with
numbers jumping between -1% and +3%, i.e. the numbers don't seem
repeatable.  memcpy-walk sees a 70% improvement (i.e. > 2x) for 128
bytes and that improvement reduces down as the impact of the tail copy
decreases in comparison to the loop.

	* sysdeps/aarch64/multiarch/memcpy_falkor.S (B_l, B_lw, C_l,
	D_l, E_l, F_l, G_l, A_h, B_h, C_h, D_h, E_h, F_h, G_h): New
	macros.
	(__memcpy_falkor): Use multiple registers to copy data in loop
	tail.

 sysdeps/aarch64/multiarch/memcpy_falkor.S | 68 +++++++++++++++++++------------
 1 file changed, 41 insertions(+), 27 deletions(-)

Comments

Szabolcs Nagy May 10, 2018, 10:29 a.m. UTC | #1

On 03/05/18 18:52, Siddhesh Poyarekar wrote:
> For smaller and medium sized copies, the effect of hardware
> prefetching are not as dominant as instruction level parallelism.
> Hence it makes more sense to load data into multiple registers than to
> try and route them to the same prefetch unit.  This is also the case
> for the loop exit where we are unable to latch on to the same prefetch
> unit anyway so it makes more sense to have data loaded in parallel.
> 
> The performance results are a bit mixed with memcpy-random, with
> numbers jumping between -1% and +3%, i.e. the numbers don't seem
> repeatable.  memcpy-walk sees a 70% improvement (i.e. > 2x) for 128
> bytes and that improvement reduces down as the impact of the tail copy
> decreases in comparison to the loop.
> 
> 	* sysdeps/aarch64/multiarch/memcpy_falkor.S (B_l, B_lw, C_l,
> 	D_l, E_l, F_l, G_l, A_h, B_h, C_h, D_h, E_h, F_h, G_h): New
> 	macros.
> 	(__memcpy_falkor): Use multiple registers to copy data in loop
> 	tail.

OK to commit.

diff --git a/sysdeps/aarch64/multiarch/memcpy_falkor.S b/sysdeps/aarch64/multiarch/memcpy_falkor.S
index 8dd8c1e03a..2fe9937f11 100644
--- a/sysdeps/aarch64/multiarch/memcpy_falkor.S
+++ b/sysdeps/aarch64/multiarch/memcpy_falkor.S
@@ -35,6 +35,20 @@ 
 #define A_hw	w7
 #define tmp1	x14
 
+#define B_l	x8
+#define B_lw	w8
+#define B_h	x9
+#define C_l	x10
+#define C_h	x11
+#define D_l	x12
+#define D_h	x13
+#define E_l	dst
+#define E_h	tmp1
+#define F_l	src
+#define F_h	count
+#define G_l	srcend
+#define G_h	x15
+
 /* Copies are split into 3 main cases:
 
    1. Small copies of up to 32 bytes
@@ -74,21 +88,21 @@  ENTRY_ALIGN (__memcpy_falkor, 6)
 	/* Medium copies: 33..128 bytes.  */
 	sub	tmp1, count, 1
 	ldp	A_l, A_h, [src, 16]
-	stp	A_l, A_h, [dstin, 16]
+	ldp	B_l, B_h, [srcend, -32]
+	ldp	C_l, C_h, [srcend, -16]
 	tbz	tmp1, 6, 1f
-	ldp	A_l, A_h, [src, 32]
-	stp	A_l, A_h, [dstin, 32]
-	ldp	A_l, A_h, [src, 48]
-	stp	A_l, A_h, [dstin, 48]
-	ldp	A_l, A_h, [srcend, -64]
-	stp	A_l, A_h, [dstend, -64]
-	ldp	A_l, A_h, [srcend, -48]
-	stp	A_l, A_h, [dstend, -48]
+	ldp	D_l, D_h, [src, 32]
+	ldp	E_l, E_h, [src, 48]
+	stp	D_l, D_h, [dstin, 32]
+	stp	E_l, E_h, [dstin, 48]
+	ldp	F_l, F_h, [srcend, -64]
+	ldp	G_l, G_h, [srcend, -48]
+	stp	F_l, F_h, [dstend, -64]
+	stp	G_l, G_h, [dstend, -48]
 1:
-	ldp	A_l, A_h, [srcend, -32]
-	stp	A_l, A_h, [dstend, -32]
-	ldp	A_l, A_h, [srcend, -16]
-	stp	A_l, A_h, [dstend, -16]
+	stp	A_l, A_h, [dstin, 16]
+	stp	B_l, B_h, [dstend, -32]
+	stp	C_l, C_h, [dstend, -16]
 	ret
 
 	.p2align 4
@@ -98,36 +112,36 @@  L(copy32):
 	cmp	count, 16
 	b.lo	1f
 	ldp	A_l, A_h, [src]
+	ldp	B_l, B_h, [srcend, -16]
 	stp	A_l, A_h, [dstin]
-	ldp	A_l, A_h, [srcend, -16]
-	stp	A_l, A_h, [dstend, -16]
+	stp	B_l, B_h, [dstend, -16]
 	ret
 	.p2align 4
 1:
 	/* 8-15 */
 	tbz	count, 3, 1f
 	ldr	A_l, [src]
+	ldr	B_l, [srcend, -8]
 	str	A_l, [dstin]
-	ldr	A_l, [srcend, -8]
-	str	A_l, [dstend, -8]
+	str	B_l, [dstend, -8]
 	ret
 	.p2align 4
 1:
 	/* 4-7 */
 	tbz	count, 2, 1f
 	ldr	A_lw, [src]
+	ldr	B_lw, [srcend, -4]
 	str	A_lw, [dstin]
-	ldr	A_lw, [srcend, -4]
-	str	A_lw, [dstend, -4]
+	str	B_lw, [dstend, -4]
 	ret
 	.p2align 4
 1:
 	/* 2-3 */
 	tbz	count, 1, 1f
 	ldrh	A_lw, [src]
+	ldrh	B_lw, [srcend, -2]
 	strh	A_lw, [dstin]
-	ldrh	A_lw, [srcend, -2]
-	strh	A_lw, [dstend, -2]
+	strh	B_lw, [dstend, -2]
 	ret
 	.p2align 4
 1:
@@ -171,12 +185,12 @@  L(loop64):
 L(last64):
 	ldp	A_l, A_h, [srcend, -64]
 	stnp	A_l, A_h, [dstend, -64]
-	ldp	A_l, A_h, [srcend, -48]
-	stnp	A_l, A_h, [dstend, -48]
-	ldp	A_l, A_h, [srcend, -32]
-	stnp	A_l, A_h, [dstend, -32]
-	ldp	A_l, A_h, [srcend, -16]
-	stnp	A_l, A_h, [dstend, -16]
+	ldp	B_l, B_h, [srcend, -48]
+	stnp	B_l, B_h, [dstend, -48]
+	ldp	C_l, C_h, [srcend, -32]
+	stnp	C_l, C_h, [dstend, -32]
+	ldp	D_l, D_h, [srcend, -16]
+	stnp	D_l, D_h, [dstend, -16]
 	ret
 
 END (__memcpy_falkor)

[2/2] Ignore prefetcher tagging for smaller copies

Commit Message

Comments

Patch