From patchwork Fri Feb 2 04:50:56 2018
X-Patchwork-Submitter: Siddhesh Poyarekar
X-Patchwork-Id: 868504
From: Siddhesh Poyarekar
To: libc-alpha@sourceware.org
Cc: szabolcs.nagy@arm.com
Subject: [PATCH] aarch64: Optimized memcmp for medium to large sizes
Date: Fri, 2 Feb 2018 10:20:56 +0530
Message-Id: <20180202045056.3121-1-siddhesh@sourceware.org>

This improved memcmp provides a fast path for compares of up to 16 bytes
and then compares 16 bytes at a time, thus optimizing loads from both
sources.  On the glibc memcmp microbenchmark it retains performance
(within ~1ns) for smaller compare sizes and reduces execution time by up
to 31% for compares of up to 4K on the APM Mustang.  On Qualcomm Falkor
the reduction is almost 48%, i.e. an almost 2x improvement for sizes of
2K and above.

	* sysdeps/aarch64/memcmp.S: Widen comparison to 16 bytes at a time.
---
 sysdeps/aarch64/memcmp.S | 66 ++++++++++++++++++++++++++++++++++++------------
 1 file changed, 50 insertions(+), 16 deletions(-)
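In C terms, the new strategy is roughly the sketch below.  It is for
illustration only and is not the code being added: memcmp_sketch, load64
and bytediff16 are invented names, the sketch skips the src1 alignment
done for large sizes, and it computes the result with a byte loop rather
than from the differing 8-byte words as the assembly does.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Unaligned 8-byte load, like the ldr/ldp instructions used below.  */
static uint64_t
load64 (const unsigned char *p)
{
  uint64_t v;
  memcpy (&v, p, sizeof v);
  return v;
}

/* Byte-wise result for a 16-byte block known to contain a difference.  */
static int
bytediff16 (const unsigned char *p1, const unsigned char *p2)
{
  for (int i = 0; i < 16; i++)
    if (p1[i] != p2[i])
      return p1[i] < p2[i] ? -1 : 1;
  return 0;
}

int
memcmp_sketch (const void *a, const void *b, size_t n)
{
  const unsigned char *s1 = a;
  const unsigned char *s2 = b;

  /* Small sizes: a plain byte loop here; the assembly has separate fast
     paths for fewer than 8 and for 8-16 bytes.  */
  if (n < 16)
    {
      for (size_t i = 0; i < n; i++)
        if (s1[i] != s2[i])
          return s1[i] < s2[i] ? -1 : 1;
      return 0;
    }

  /* Main loop: 16 bytes per iteration as two 8-byte words, mirroring the
     ldp/ccmp loop.  */
  size_t i = 0;
  while (n - i > 16)
    {
      if (load64 (s1 + i) != load64 (s2 + i)
          || load64 (s1 + i + 8) != load64 (s2 + i + 8))
        return bytediff16 (s1 + i, s2 + i);
      i += 16;
    }

  /* Tail: compare the final 16 bytes, overlapping bytes that the loop has
     already compared (cf. L(last_bytes) below).  */
  if (load64 (s1 + n - 16) != load64 (s2 + n - 16)
      || load64 (s1 + n - 8) != load64 (s2 + n - 8))
    return bytediff16 (s1 + n - 16, s2 + n - 16);
  return 0;
}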
diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
index ecd12061b2..0804e75d99 100644
--- a/sysdeps/aarch64/memcmp.S
+++ b/sysdeps/aarch64/memcmp.S
@@ -34,9 +34,12 @@
 /* Internal variables.  */
 #define data1		x3
 #define data1w		w3
-#define data2		x4
-#define data2w		w4
-#define tmp1		x5
+#define data1h		x4
+#define data2		x5
+#define data2w		w5
+#define data2h		x6
+#define tmp1		x7
+#define tmp2		x8
 
 ENTRY_ALIGN (memcmp, 6)
 	DELOUSE (0)
@@ -46,39 +49,70 @@ ENTRY_ALIGN (memcmp, 6)
 	subs	limit, limit, 8
 	b.lo	L(less8)
 
-	/* Limit >= 8, so check first 8 bytes using unaligned loads.  */
 	ldr	data1, [src1], 8
 	ldr	data2, [src2], 8
-	and	tmp1, src1, 7
-	add	limit, limit, tmp1
+	cmp	data1, data2
+	b.ne	L(return)
+
+	subs	limit, limit, 8
+	b.gt	L(more16)
+
+	ldr	data1, [src1, limit]
+	ldr	data2, [src2, limit]
+	b	L(return)
+
+L(more16):
+	ldr	data1, [src1], 8
+	ldr	data2, [src2], 8
 	cmp	data1, data2
 	bne	L(return)
 
+	/* Jump directly to copying the last 16 bytes for 32 byte (or less)
+	   strings.  */
+	subs	limit, limit, 16
+	b.ls	L(last_bytes)
+
+	/* We overlap loads between 0-32 bytes at either side of SRC1 when we
+	   try to align, so limit it only to strings larger than 128 bytes.  */
+	cmp	limit, 96
+	b.ls	L(loop8)
+
 	/* Align src1 and adjust src2 with bytes not yet done.  */
+	and	tmp1, src1, 15
+	add	limit, limit, tmp1
 	sub	src1, src1, tmp1
 	sub	src2, src2, tmp1
-	subs	limit, limit, 8
-	b.ls	L(last_bytes)
-
 	/* Loop performing 8 bytes per iteration using aligned src1.
 	   Limit is pre-decremented by 8 and must be larger than zero.
 	   Exit if <= 8 bytes left to do or if the data is not equal.  */
 	.p2align 4
 L(loop8):
-	ldr	data1, [src1], 8
-	ldr	data2, [src2], 8
-	subs	limit, limit, 8
-	ccmp	data1, data2, 0, hi  /* NZCV = 0b0000.  */
+	ldp	data1, data1h, [src1], 16
+	ldp	data2, data2h, [src2], 16
+	subs	limit, limit, 16
+	ccmp	data1, data2, 0, hi
+	ccmp	data1h, data2h, 0, eq
 	b.eq	L(loop8)
 
+	cmp	data1, data2
+	bne	L(return)
+	mov	data1, data1h
+	mov	data2, data2h
 	cmp	data1, data2
 	bne	L(return)
 
-	/* Compare last 1-8 bytes using unaligned access.  */
+	/* Compare last 1-16 bytes using unaligned access.  */
 L(last_bytes):
-	ldr	data1, [src1, limit]
-	ldr	data2, [src2, limit]
+	add	src1, src1, limit
+	add	src2, src2, limit
+	ldp	data1, data1h, [src1]
+	ldp	data2, data2h, [src2]
+	cmp	data1, data2
+	bne	L(return)
+	mov	data1, data1h
+	mov	data2, data2h
+	cmp	data1, data2
 
 	/* Compare data bytes and set return value to 0, -1 or 1.  */
 L(return):
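A note on the widened loop for anyone decoding the condition flags: the
subs/ccmp/ccmp/b.eq sequence in L(loop8) folds the remaining-size check and
both 8-byte comparisons into a single conditional branch.  Roughly (my
paraphrase, not text from the patch), the loop keeps iterating while

	limit was > 16 before the subs  &&  data1 == data2  &&  data1h == data2h

Each ccmp writes NZCV = 0 (an "is not equal" result) when its condition
fails, so either exhausting the count or hitting a mismatch ends the loop;
the code after it then picks out the differing word, or falls through to
L(last_bytes) when only the count ran out.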