From patchwork Sat Jun 20 11:15:20 2015
X-Patchwork-Id: 486977
Date: Sat, 20 Jun 2015 13:15:20 +0200
From: Ondřej Bílka
To: libc-alpha@sourceware.org
Subject: Re: [PATCH 2/2 neleai/string-x64] Add strcmp with avx2
Message-ID: <20150620111520.GA12420@domone>
In-Reply-To: <20150620083525.GA31992@domone>

On Sat, Jun 20, 2015 at 10:35:25AM +0200, Ondřej Bílka wrote:
>
> Hi,
>
> When I read strcmp again to improve strncmp and add an avx2 strcmp,
> I found that I had made several mistakes, mainly caused by first
> optimizing the C template and then fixing the assembly.
>
> The first was my idea to simplify the cross-page check by ORing
> src and dest.  I recall that I first did complex cross-page handling
> where false positives were cheap.  Then I found that due to its size
> it had big overhead and a simple loop was faster when testing with
> firefox.  That turned the original decision into a bad one.
>
> The second is to reorganize the loop instructions so that after the
> loop ends I can find the last byte without recalculating much, using
> the trick that the last 16-bit mask can simply be ORed with the
> previous three, as it is only relevant when the previous three were
> zero.
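
In C the trick looks roughly like this (just a sketch of the idea with
made-up names, not the code in the patch; mask0..mask2 are the 16-bit
pmovmskb masks of the first three chunks, and mask3 is the last mask,
possibly with bits of the earlier chunks folded in, which is harmless
because its bits only matter when the lower 48 bits are zero):

#include <stdint.h>

/* Combine the four 16-bit end-of-comparison masks into one word and
   locate the first terminating byte with a single bit scan.  At least
   one bit is assumed to be set, otherwise we would still be looping.  */
static inline int
first_end_byte (uint64_t mask0, uint64_t mask1, uint64_t mask2,
                uint64_t mask3)
{
  uint64_t mask = mask0 | (mask1 << 16) | (mask2 << 32) | (mask3 << 48);
  return __builtin_ctzll (mask);
}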
>
> The final one is that gcc generates bad loops with regard to where
> pointers are incremented.  You should place the increments after the
> loads that use them, not at the start of the loop like gcc does.  That
> change is responsible for a 10% improvement for large sizes.
>
> The last are microoptimizations that save a few bytes without
> measurable performance impact, like using eax instead of rax to save a
> byte, or moving unnecessary zeroing instructions when they are not
> needed.
>
> Profile data are here; shortly they will also cover the avx2 variant
> for haswell that I will submit next.
>
> http://kam.mff.cuni.cz/~ondra/benchmark_string/strcmp_profile.html
>
> OK to commit this?
>

Here is the avx2 loop that I promised earlier.  Luckily it gives a small
2% practical benefit on the gcc workload, and it also roughly doubles
performance on longer inputs.

	* sysdeps/x86_64/multiarch/Makefile: Add strcmp-avx2.
	* sysdeps/x86_64/multiarch/ifunc-impl-list.c: Add __strcmp_avx2.
	* sysdeps/x86_64/multiarch/strcmp-avx2.S: New file.
	* sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S: Add avx2 loop.
	* sysdeps/x86_64/multiarch/strcmp.S: Add ifunc.
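
Roughly, one 64-byte iteration of the new loop computes the following
(a C intrinsics sketch of the idea only, not the generated code; it
assumes both strings are readable for the whole 64-byte block, which the
cross-page check in the real loop guarantees):

#include <immintrin.h>
#include <stdint.h>

/* Return 0 when the next 64 bytes contain neither a difference nor a
   NUL; otherwise return a mask whose lowest set bit gives the offset
   of the byte that ends the comparison.  The low 32 bits cover the
   first half exactly; the high 32 bits hold both halves folded
   together and are only consulted when the low half is clean.  */
static inline uint64_t
block_end_mask (const unsigned char *s1, const unsigned char *s2)
{
  __m256i zero = _mm256_setzero_si256 ();
  __m256i a0 = _mm256_loadu_si256 ((const __m256i *) s1);
  __m256i a1 = _mm256_loadu_si256 ((const __m256i *) (s1 + 32));
  __m256i b0 = _mm256_loadu_si256 ((const __m256i *) s2);
  __m256i b1 = _mm256_loadu_si256 ((const __m256i *) (s2 + 32));

  /* A byte becomes zero where the strings differ or both contain NUL.  */
  __m256i r0 = _mm256_min_epu8 (_mm256_cmpeq_epi8 (a0, b0), a0);
  __m256i r1 = _mm256_min_epu8 (_mm256_cmpeq_epi8 (a1, b1), a1);

  uint32_t m_any
    = _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (_mm256_min_epu8 (r0, r1),
                                               zero));
  if (m_any == 0)
    return 0;                       /* Keep looping.  */

  uint32_t m_lo = _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (r0, zero));
  return ((uint64_t) m_any << 32) | m_lo;
}

The caller advances both pointers by 64, does a bsf on the mask and
compares the two bytes at that offset, exactly as the sse2 version does.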
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index d01bbbe..bf48283 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -30,7 +30,7 @@ CFLAGS-strspn-c.c += -msse4
 endif
 
 ifeq (yes,$(config-cflags-avx2))
-sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 memcmp-avx2
+sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 memcmp-avx2 strcmp-avx2
 endif
 endif
 
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index cc6f9f2..57ce237 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -126,7 +126,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcmp.S.  */
   IFUNC_IMPL (i, name, strcmp,
-	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSE4_2, __strcmp_sse42)
+	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_AVX2, __strcmp_avx2)
	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSSE3, __strcmp_ssse3)
	      IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned)
	      IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2))
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
new file mode 100644
index 0000000..b2f8478
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -0,0 +1,3 @@
+#define USE_AVX2
+#define __strcmp_sse2_unaligned __strcmp_avx2
+#include "strcmp-sse2-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
index 03d1b11..10bed9a 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
@@ -89,12 +89,35 @@ L(main_loop_header):
	subq	%rsi, %rcx
	shrq	$6, %rcx
	movq	%rcx, %rsi
-
+#ifdef USE_AVX2
+	vpxor	%xmm7, %xmm7, %xmm7
+#endif
	.p2align 4
 L(loop):
	add	$-1, %rsi
	ja	L(loop_cross_page)
 L(back_to_loop):
+#ifdef USE_AVX2
+	vmovdqu	(%rdx), %ymm0
+	vmovdqu	32(%rdx), %ymm1
+	vpcmpeqb (%rax), %ymm0, %ymm0
+	vpminub	(%rax), %ymm0, %ymm0
+	vpcmpeqb 32(%rax), %ymm1, %ymm1
+	vpminub	32(%rax), %ymm1, %ymm1
+	vpminub	%ymm0, %ymm1, %ymm2
+	vpcmpeqb %ymm7, %ymm2, %ymm2
+	addq	$64, %rax
+	addq	$64, %rdx
+	vpmovmskb %ymm2, %ecx
+	test	%ecx, %ecx
+	je	L(loop)
+	shl	$32, %rcx
+	vpcmpeqb %ymm7, %ymm0, %ymm0
+	vpmovmskb %ymm0, %esi
+	or	%rsi, %rcx
+	vzeroupper
+#else
+
	movdqu	(%rdx), %xmm0
	movdqu	16(%rdx), %xmm1
	movdqa	(%rax), %xmm2
@@ -132,14 +155,17 @@ L(back_to_loop):
	orq	%rdi, %rcx
	sal	$16, %esi
	orq	%rsi, %rcx
+#endif
	bsfq	%rcx, %rcx
	movzbl	-64(%rax, %rcx), %eax
	movzbl	-64(%rdx, %rcx), %edx
	subl	%edx, %eax
	ret
-
	.p2align 4
 L(loop_cross_page):
+#ifdef USE_AVX2
+	vzeroupper
+#endif
	xor	%ecx, %ecx
	movq	%rdx, %r9
	and	$63, %r9
diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S
index f50f26c..867e9d4 100644
--- a/sysdeps/x86_64/multiarch/strcmp.S
+++ b/sysdeps/x86_64/multiarch/strcmp.S
@@ -90,6 +90,12 @@ ENTRY(STRCMP)
	call	__init_cpu_features
 1:
 #ifdef USE_AS_STRCMP
+# ifdef HAVE_AVX2_SUPPORT
+
+	leaq	__strcmp_avx2(%rip), %rax
+	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
+	jnz	3f
+# endif
	leaq	__strcmp_sse2_unaligned(%rip), %rax
	testl	$bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
	jnz	3f
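
The selection order the ifunc ends up implementing is then roughly the
following (a plain C sketch for illustration only; the real selector is
the assembly above, it tests bits in __cpu_features, and the remaining
fallbacks later in the file are left out here):

/* Hypothetical illustration of the dispatch order after this patch.  */
typedef int (*strcmp_fn) (const char *, const char *);

extern int __strcmp_avx2 (const char *, const char *);
extern int __strcmp_sse2_unaligned (const char *, const char *);
extern int __strcmp_sse2 (const char *, const char *);

static strcmp_fn
select_strcmp (int avx_fast_unaligned_load, int fast_unaligned_load)
{
  if (avx_fast_unaligned_load)      /* New check added by this patch.  */
    return __strcmp_avx2;
  if (fast_unaligned_load)
    return __strcmp_sse2_unaligned;
  return __strcmp_sse2;             /* Stand-in for the other fallbacks.  */
}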