@@ -19,10 +19,13 @@
#include "sysdep.h"
ENTRY ( __strcmp_sse2_unaligned)
- movl %edi, %eax
- xorl %edx, %edx
pxor %xmm7, %xmm7
- orl %esi, %eax
+ /* Use the byte loop if reading 64 bytes from either string could
+    cross a page boundary (page offset greater than 4096 - 64).  */
+ movl %esi, %eax
+ andl $4095, %eax
+ cmpl $4032, %eax
+ jg L(cross_page)
+
+ movl %edi, %eax
andl $4095, %eax
cmpl $4032, %eax
jg L(cross_page)
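
The prologue now tests each pointer's page offset exactly, instead of OR-ing the two addresses together; the OR could exceed the threshold even when both 64-byte reads were safe. A minimal C sketch of the predicate being computed (assuming 4 KiB pages; the helper name is mine):

#include <stdint.h>

/* Illustrative helper: nonzero iff a 64-byte load from P would cross
   a 4096-byte page boundary, i.e. its page offset exceeds 4096 - 64.  */
static int
may_cross_page (const void *p)
{
  return ((uintptr_t) p & 4095) > 4032;
}

Offsets up to and including 4032 keep all four 16-byte loads inside the page, which is why the branch is jg rather than jge.
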
@@ -30,13 +33,11 @@ ENTRY ( __strcmp_sse2_unaligned)
movdqu (%rsi), %xmm0
pcmpeqb %xmm1, %xmm0
pminub %xmm1, %xmm0
- pxor %xmm1, %xmm1
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %eax
- testq %rax, %rax
+ /* A byte of %xmm0 is now zero exactly where the strings differ or
+    both terminate; comparing with the zeroed %xmm7 exposes that as
+    a bitmask.  */
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
je L(next_48_bytes)
-L(return):
- bsfq %rax, %rdx
+ bsf %eax, %edx /* Index of the first difference or NUL.  */
movzbl (%rdi, %rdx), %eax
movzbl (%rsi, %rdx), %edx
subl %edx, %eax
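
This first block is the classic SSE2 "difference or NUL" test: pcmpeqb marks equal bytes, pminub against one source turns every interesting lane into zero, and a final compare against the all-zero %xmm7 makes the result movemask-able. A hedged intrinsics rendering of the chunk test (illustrative helper, not part of the patch):

#include <emmintrin.h>

/* Bit i of the result is set iff a[i] != b[i], or a[i] == b[i] == 0.
   Mirrors the pcmpeqb + pminub + pcmpeqb-with-zero sequence.  */
static unsigned int
chunk_mask (__m128i a, __m128i b)
{
  __m128i eq = _mm_cmpeq_epi8 (a, b);   /* 0xff where the bytes match.  */
  __m128i t = _mm_min_epu8 (eq, a);     /* 0 where mismatch or NUL.  */
  return _mm_movemask_epi8 (_mm_cmpeq_epi8 (t, _mm_setzero_si128 ()));
}
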
@@ -50,29 +51,35 @@ L(next_48_bytes):
pcmpeqb %xmm6, %xmm3
movdqu 32(%rsi), %xmm2
pminub %xmm6, %xmm3
- pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm7, %xmm3
movdqu 48(%rdi), %xmm4
pcmpeqb %xmm5, %xmm2
- pmovmskb %xmm3, %edx
+ pmovmskb %xmm3, %edx
movdqu 48(%rsi), %xmm0
pminub %xmm5, %xmm2
- pcmpeqb %xmm1, %xmm2
+ pcmpeqb %xmm7, %xmm2
pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm2, %eax
- salq $16, %rdx
+ pmovmskb %xmm2, %eax
+ sal $16, %edx /* 32-bit shift suffices: pmovmskb zeroed the upper half of %rdx.  */
pminub %xmm4, %xmm0
- pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm7, %xmm0
salq $32, %rax
orq %rdx, %rax
- pmovmskb %xmm0, %ecx
- movq %rcx, %rdx
- salq $48, %rdx
- orq %rdx, %rax
- jne L(return)
+ pmovmskb %xmm0, %ecx
+ salq $48, %rcx
+ orq %rcx, %rax /* Bit i of %rax is set iff byte i of the first 64 differs or is NUL.  */
+ je L(main_loop_header)
+L(return):
+ bsf %rax, %rdx
+ movzbl (%rdi, %rdx), %eax
+ movzbl (%rsi, %rdx), %edx
+ subl %edx, %eax
+ ret
+
+
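
The relocated L(return) operates on a single 64-bit word: the three chunk masks are shifted to bit positions 16, 32 and 48 (bits 0-15 are clear by construction, since the first 16 bytes were handled above), and one bsf finds the first hit. Roughly, in C, with a GCC builtin standing in for bsf:

/* Illustrative: m16, m32, m48 are the pmovmskb results for bytes 16-31,
   32-47 and 48-63.  The caller guarantees at least one bit is set,
   because the je to L(main_loop_header) was not taken.  */
static int
first_hit (unsigned int m16, unsigned int m32, unsigned int m48)
{
  unsigned long long m = ((unsigned long long) m16 << 16)
                         | ((unsigned long long) m32 << 32)
                         | ((unsigned long long) m48 << 48);
  return __builtin_ctzll (m);   /* bsf: index of the lowest set bit.  */
}

The two movzbl loads then turn that index into the signed byte difference that strcmp returns.
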
L(main_loop_header):
leaq 64(%rdi), %rdx
movl $4096, %ecx
- pxor %xmm9, %xmm9
andq $-64, %rdx
subq %rdi, %rdx
leaq (%rdi, %rdx), %rax
@@ -82,16 +89,11 @@ L(main_loop_header):
subq %rsi, %rcx
shrq $6, %rcx
movq %rcx, %rsi
- jmp L(loop_start)
.p2align 4
L(loop):
- addq $64, %rax
- addq $64, %rdx
-L(loop_start):
- testq %rsi, %rsi
- leaq -1(%rsi), %rsi
- je L(loop_cross_page)
+ /* add $-1 clears CF only when %rsi was zero, so ja (CF=0 and ZF=0)
+    is taken exactly when the page-crossing countdown has run out.  */
+ add $-1, %rsi
+ ja L(loop_cross_page)
L(back_to_loop):
movdqu (%rdx), %xmm0
movdqu 16(%rdx), %xmm1
@@ -104,61 +106,57 @@ L(back_to_loop):
movdqu 48(%rdx), %xmm6
pminub %xmm3, %xmm1
movdqa 32(%rax), %xmm2
- pminub %xmm1, %xmm0
movdqa 48(%rax), %xmm3
pcmpeqb %xmm2, %xmm5
pcmpeqb %xmm3, %xmm6
+ addq $64, %rax /* Advance early; the mismatch tail below compensates with -64 offsets.  */
pminub %xmm2, %xmm5
pminub %xmm3, %xmm6
- pminub %xmm5, %xmm0
- pminub %xmm6, %xmm0
- pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm0, %ecx
+ addq $64, %rdx
+ /* Fold the four "difference or NUL" results: a hit anywhere in the
+    64 bytes leaves a zero byte in %xmm6.  */
+ pminub %xmm5, %xmm6
+ pminub %xmm1, %xmm6
+ pminub %xmm0, %xmm6
+ pcmpeqb %xmm7, %xmm6
+ pmovmskb %xmm6, %ecx
testl %ecx, %ecx
je L(loop)
- pcmpeqb %xmm7, %xmm5
- movdqu (%rdx), %xmm0
- pcmpeqb %xmm7, %xmm1
- movdqa (%rax), %xmm2
- pcmpeqb %xmm2, %xmm0
- pminub %xmm2, %xmm0
- pcmpeqb %xmm7, %xmm6
pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm1, %ecx
- pmovmskb %xmm5, %r8d
- pmovmskb %xmm0, %edi
- salq $16, %rcx
+ pcmpeqb %xmm7, %xmm1
+ pcmpeqb %xmm7, %xmm5
+ pmovmskb %xmm0, %edi
+ pmovmskb %xmm1, %esi
+ pmovmskb %xmm5, %r8d
+ /* %rcx is the folded mask; bits that chunks 0-2 contribute to it sit
+    strictly above those chunks' exact bits, so the bsf below still
+    finds the first real hit.  */
+ salq $48, %rcx
salq $32, %r8
- pmovmskb %xmm6, %esi
orq %r8, %rcx
orq %rdi, %rcx
- salq $48, %rsi
+ sal $16, %esi
orq %rsi, %rcx
bsfq %rcx, %rcx
- movzbl (%rax, %rcx), %eax
- movzbl (%rdx, %rcx), %edx
+ movzbl -64(%rax, %rcx), %eax /* Base pointers were already advanced by 64.  */
+ movzbl -64(%rdx, %rcx), %edx
subl %edx, %eax
ret
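
One main-loop iteration now folds the four per-chunk results into %xmm6 with a pminub tree before the single pcmpeqb/pmovmskb, and advances both pointers inside the body so the loop falls through without the old jmp L(loop_start); the %xmm9 scratch zeroing is dropped as well. A rough intrinsics model of the fast path of one iteration (a sketch under the same alignment assumption as the movdqa loads; not the patch's exact instruction schedule):

#include <emmintrin.h>

/* Nonzero iff some byte in the 64-byte block differs or is NUL.
   A must be 16-byte aligned, as with the movdqa loads above.  */
static int
block64_hit (const unsigned char *a, const unsigned char *b)
{
  __m128i acc = _mm_set1_epi8 ((char) 0xff);
  for (int i = 0; i < 64; i += 16)
    {
      __m128i va = _mm_load_si128 ((const __m128i *) (a + i));
      __m128i vb = _mm_loadu_si128 ((const __m128i *) (b + i));
      __m128i eq = _mm_cmpeq_epi8 (va, vb);
      acc = _mm_min_epu8 (acc, _mm_min_epu8 (eq, va));
    }
  return _mm_movemask_epi8 (_mm_cmpeq_epi8 (acc, _mm_setzero_si128 ()));
}
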
.p2align 4
L(loop_cross_page):
- xor %r10, %r10
+ xor %ecx, %ecx
movq %rdx, %r9
and $63, %r9
- subq %r9, %r10
- movdqa (%rdx, %r10), %xmm0
- movdqa 16(%rdx, %r10), %xmm1
- movdqu (%rax, %r10), %xmm2
- movdqu 16(%rax, %r10), %xmm3
+ subq %r9, %rcx /* %rcx = -(%rdx & 63): back up to the 64-byte block start.  */
+ movdqa (%rdx, %rcx), %xmm0
+ movdqa 16(%rdx, %rcx), %xmm1
+ movdqu (%rax, %rcx), %xmm2
+ movdqu 16(%rax, %rcx), %xmm3
pcmpeqb %xmm2, %xmm0
- movdqa 32(%rdx, %r10), %xmm5
+ movdqa 32(%rdx, %rcx), %xmm5
pcmpeqb %xmm3, %xmm1
pminub %xmm2, %xmm0
- movdqa 48(%rdx, %r10), %xmm6
+ movdqa 48(%rdx, %rcx), %xmm6
pminub %xmm3, %xmm1
- movdqu 32(%rax, %r10), %xmm2
- movdqu 48(%rax, %r10), %xmm3
+ movdqu 32(%rax, %rcx), %xmm2
+ movdqu 48(%rax, %rcx), %xmm3
pcmpeqb %xmm2, %xmm5
pcmpeqb %xmm3, %xmm6
pminub %xmm2, %xmm5
@@ -169,12 +167,12 @@ L(loop_cross_page):
pcmpeqb %xmm7, %xmm5
pcmpeqb %xmm7, %xmm6
- pmovmskb %xmm1, %ecx
- pmovmskb %xmm5, %r8d
- pmovmskb %xmm0, %edi
- salq $16, %rcx
+ pmovmskb %xmm1, %ecx
+ pmovmskb %xmm5, %r8d
+ pmovmskb %xmm0, %edi
+ sal $16, %ecx
salq $32, %r8
- pmovmskb %xmm6, %esi
+ pmovmskb %xmm6, %esi
orq %r8, %rdi
orq %rcx, %rdi
salq $48, %rsi
@@ -190,20 +188,21 @@ L(loop_cross_page):
subl %edx, %eax
ret
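
L(loop_cross_page) re-reads the block from the previous 64-byte boundary of %rdx so the movdqa loads stay aligned; the xor/sub pair just materializes the negative displacement. As a small illustrative helper:

#include <stdint.h>

/* The negative offset that backs P up to the start of its 64-byte
   block, as built into %rcx by the xor/sub above.  */
static intptr_t
block_backoff (uintptr_t p)
{
  return -(intptr_t) (p & 63);
}
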
+L(cross_page):
+ xorl %edx, %edx /* %rdx counts bytes compared so far.  */
+ jmp L(cross_page_loop_start)
.p2align 4
L(cross_page_loop):
- cmpb %cl, %al
- jne L(different)
- addq $1, %rdx
- cmpq $64, %rdx
+ add $1, %edx
+ cmp $64, %edx
je L(main_loop_header)
-L(cross_page):
+L(cross_page_loop_start):
movzbl (%rdi, %rdx), %eax
movzbl (%rsi, %rdx), %ecx
- testb %al, %al
+ subl %ecx, %eax /* The difference doubles as the return value.  */
+ jne L(different)
+ test %ecx, %ecx /* Bytes were equal; stop at NUL with %eax already 0.  */
+ jne L(cross_page_loop)
- xorl %eax, %eax
L(different):
- subl %ecx, %eax
ret
END (__strcmp_sse2_unaligned)
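
For reference, the restructured byte loop computes the difference first, letting one subl serve as both the exit test and the return value, and only then checks for NUL. A hedged C equivalent (the helper name and the done flag are mine):

/* Compare at most 64 bytes one at a time, as in L(cross_page).
   *done distinguishes a final answer from "re-enter the vector
   loop at L(main_loop_header)".  */
static int
cross_page_cmp (const unsigned char *a, const unsigned char *b, int *done)
{
  *done = 1;
  for (int i = 0; i < 64; i++)
    {
      int d = a[i] - b[i];
      if (d != 0)
        return d;               /* subl/jne: the difference is the result.  */
      if (b[i] == 0)
        return 0;               /* test fell through: both strings ended.  */
    }
  *done = 0;
  return 0;
}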