From patchwork Mon Mar 28 19:44:14 2016 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: cseo X-Patchwork-Id: 602611 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from sourceware.org (server1.sourceware.org [209.132.180.131]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 3qYkrX1pldz9sB6 for ; Tue, 29 Mar 2016 06:44:43 +1100 (AEDT) Authentication-Results: ozlabs.org; dkim=pass (1024-bit key; secure) header.d=sourceware.org header.i=@sourceware.org header.b=HhQNAq3R; dkim-atps=neutral DomainKey-Signature: a=rsa-sha1; c=nofws; d=sourceware.org; h=list-id :list-unsubscribe:list-subscribe:list-archive:list-post :list-help:sender:to:cc:from:subject:message-id:date :mime-version:content-type; q=dns; s=default; b=aMHaCTZM+y5dO79S 3EJ23hGRhSfe+R6u8bm2bAbGdjCr2UbNtYxg1gWrtHUqjEPZhyG71pYI/l4/p/SE oBhlo25pP7mKe0LCA4ay9gZ3LmDY3tS9EQRT8HOLRxM4ggTIg46bdN7An+OPkWRc hKc/qm7D4wNX0bLPqRw6NvengdM= DKIM-Signature: v=1; a=rsa-sha1; c=relaxed; d=sourceware.org; h=list-id :list-unsubscribe:list-subscribe:list-archive:list-post :list-help:sender:to:cc:from:subject:message-id:date :mime-version:content-type; s=default; bh=94LgBoNVZcKo0nQUEzGHl+ pj2aE=; b=HhQNAq3RETtGsPEEBaUZ2Rg+JTD4vtWLiu5sVQfI+leCjJw6IkZj5e EFoE/Uai8AA0/oTyW5YHP994hVenadFv86mWdkAO5c2x9wEPxdS1t0UPZFPU1bun 0CKdDBsDbaSLsSeP7F3Qx+umaE1Nobs63sdnJvxeuHBDNHZmiS/60= Received: (qmail 75584 invoked by alias); 28 Mar 2016 19:44:35 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 75017 invoked by uid 89); 28 Mar 2016 19:44:34 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: Yes, score=5.2 required=5.0 tests=AWL, BAYES_50, KAM_LAZY_DOMAIN_SECURITY, LIKELY_SPAM_BODY, RP_MATCHES_RCVD autolearn=no version=3.3.2 spammy=seo, Nov, Engineer, p2align X-HELO: e24smtp02.br.ibm.com X-IBM-Helo: d24dlp02.br.ibm.com X-IBM-MailFrom: cseo@linux.vnet.ibm.com X-IBM-RcptTo: libc-alpha@sourceware.org To: GLIBC Cc: Tulio Magno Quites Machado Filho , Steve Munroe From: Carlos Eduardo Seo Subject: [PATCH] powerpc: Optimized POWER8 strlen Message-ID: <56F9898E.8010307@linux.vnet.ibm.com> Date: Mon, 28 Mar 2016 16:44:14 -0300 User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:38.0) Gecko/20100101 Thunderbird/38.7.1 MIME-Version: 1.0 X-TM-AS-MML: disable X-Content-Scanned: Fidelis XPS MAILER x-cbid: 16032819-0021-0000-0000-0000072E2A4E Vectorized implementation of strlen for POWER8. This adds significant improvement for long strings (~3x). There will be a trade-off around the 64-byte length due to the alignment checks required to jump into the vectorized loop. Benchmark results are attached. diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile index 3b0e3a0..f160120 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -19,7 +19,7 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \ strcmp-power8 strcmp-power7 strcmp-ppc64 \ strcat-power8 strcat-power7 strcat-ppc64 \ memmove-power7 memmove-ppc64 wordcopy-ppc64 bcopy-ppc64 \ - strncpy-power8 strstr-power7 strstr-ppc64 + strncpy-power8 strstr-power7 strstr-ppc64 strlen-power8 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index 11a8215..f1d44c7 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -101,6 +101,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/powerpc/powerpc64/multiarch/strlen.c. */ IFUNC_IMPL (i, name, strlen, + IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_2_07, + __strlen_power8) IFUNC_IMPL_ADD (array, i, strlen, hwcap & PPC_FEATURE_HAS_VSX, __strlen_power7) IFUNC_IMPL_ADD (array, i, strlen, 1, diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S new file mode 100644 index 0000000..686dc3d --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/strlen-power8.S @@ -0,0 +1,39 @@ +/* Optimized strlen implementation for POWER8. + Copyright (C) 2016 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include + +#undef EALIGN +#define EALIGN(name, alignt, words) \ + .section ".text"; \ + ENTRY_2(__strlen_power8) \ + .align ALIGNARG(alignt); \ + EALIGN_W_##words; \ + BODY_LABEL(__strlen_power8): \ + cfi_startproc; \ + LOCALENTRY(__strlen_power8) +#undef END +#define END(name) \ + cfi_endproc; \ + TRACEBACK(__strlen_power8) \ + END_2(__strlen_power8) + +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) + +#include diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen.c b/sysdeps/powerpc/powerpc64/multiarch/strlen.c index 94501fd..609a87e 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strlen.c +++ b/sysdeps/powerpc/powerpc64/multiarch/strlen.c @@ -29,11 +29,14 @@ extern __typeof (__redirect_strlen) __libc_strlen; extern __typeof (__redirect_strlen) __strlen_ppc attribute_hidden; extern __typeof (__redirect_strlen) __strlen_power7 attribute_hidden; +extern __typeof (__redirect_strlen) __strlen_power8 attribute_hidden; libc_ifunc (__libc_strlen, - (hwcap & PPC_FEATURE_HAS_VSX) - ? __strlen_power7 - : __strlen_ppc); + (hwcap2 & PPC_FEATURE2_ARCH_2_07) + ? __strlen_power8 : + (hwcap & PPC_FEATURE_HAS_VSX) + ? __strlen_power7 + : __strlen_ppc); #undef strlen strong_alias (__libc_strlen, strlen) diff --git a/sysdeps/powerpc/powerpc64/power8/strlen.S b/sysdeps/powerpc/powerpc64/power8/strlen.S new file mode 100644 index 0000000..0142747 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power8/strlen.S @@ -0,0 +1,297 @@ +/* Optimized strlen implementation for PowerPC64/POWER8 using a vectorized + loop. + Copyright (C) 2016 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include + +/* TODO: change these to the actual instructions when the minimum required + binutils allows it. */ +#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16))) +#define VBPERMQ(t,a,b) .long (0x1000054c \ + | ((t)<<(32-11)) \ + | ((a)<<(32-16)) \ + | ((b)<<(32-21)) ) + +/* int [r3] strlen (char *s [r3]) */ + +/* TODO: change this to .machine power8 when the minimum required binutils + allows it. */ + .machine power7 +EALIGN (strlen, 4, 0) + CALL_MCOUNT 1 + dcbt 0,r3 + clrrdi r4,r3,3 /* Align the address to doubleword boundary. */ + rlwinm r6,r3,3,26,28 /* Calculate padding. */ + li r0,0 /* Doubleword with null chars to use + with cmpb. */ + li r5,-1 /* MASK = 0xffffffffffffffff. */ + ld r12,0(r4) /* Load doubleword from memory. */ +#ifdef __LITTLE_ENDIAN__ + sld r5,r5,r6 +#else + srd r5,r5,r6 /* MASK = MASK >> padding. */ +#endif + orc r9,r12,r5 /* Mask bits that are not part of the string. */ + cmpb r10,r9,r0 /* Check for null bytes in DWORD1. */ + cmpdi cr7,r10,0 /* If r10 == 0, no null's have been found. */ + bne cr7,L(done) + + /* For shorter strings (< 64 bytes), we will not use vector registers, + as the overhead isn't worth it. So, let's use GPRs instead. This + will be done the same way as we do in the POWER7 implementation. + Let's see if we are aligned to a quadword boundary. If so, we can + jump to the first (non-vectorized) loop. Otherwise, we have to + handle the next DWORD first. */ + mtcrf 0x01,r4 + mr r9,r4 + addi r9,r9,8 + bt 28,L(align64) + + /* Handle the next 8 bytes so we are aligned to a quadword + boundary. */ + ldu r5,8(r4) + cmpb r10,r5,r0 + cmpdi cr7,r10,0 + addi r9,r9,8 + bne cr7,L(done) + +L(align64): + /* Proceed to the old (POWER7) implementation, checking two doublewords + per iteraction. For the first 56 bytes, we will just check for null + characters. After that, we will also check if we are 64-byte aligned + so we can jump to the vectorized implementation. We will unroll + these loops to avoid excessive branching. */ + ld r6,8(r4) + ldu r5,16(r4) + cmpb r10,r6,r0 + cmpb r11,r5,r0 + or r5,r10,r11 + cmpdi cr7,r5,0 + addi r9,r9,16 + bne cr7,L(dword_zero) + + ld r6,8(r4) + ldu r5,16(r4) + cmpb r10,r6,r0 + cmpb r11,r5,r0 + or r5,r10,r11 + cmpdi cr7,r5,0 + addi r9,r9,16 + bne cr7,L(dword_zero) + + ld r6,8(r4) + ldu r5,16(r4) + cmpb r10,r6,r0 + cmpb r11,r5,r0 + or r5,r10,r11 + cmpdi cr7,r5,0 + addi r9,r9,16 + bne cr7,L(dword_zero) + + /* Are we 64-byte aligned? If so, jump to the vectorized loop. + Note: aligning to 64-byte will necessarily slow down performance for + strings around 64 bytes in length due to the extra comparisons + required to check alignment for the vectorized loop. This is a + necessary tradeoff we are willing to take in order to speed up the + calculation for larger strings. */ + andi. r10,r9,63 + beq cr0,L(preloop) + ld r6,8(r4) + ldu r5,16(r4) + cmpb r10,r6,r0 + cmpb r11,r5,r0 + or r5,r10,r11 + cmpdi cr7,r5,0 + addi r9,r9,16 + bne cr7,L(dword_zero) + + andi. r10,r9,63 + beq cr0,L(preloop) + ld r6,8(r4) + ldu r5,16(r4) + cmpb r10,r6,r0 + cmpb r11,r5,r0 + or r5,r10,r11 + cmpdi cr7,r5,0 + addi r9,r9,16 + bne cr7,L(dword_zero) + + andi. r10,r9,63 + beq cr0,L(preloop) + ld r6,8(r4) + ldu r5,16(r4) + cmpb r10,r6,r0 + cmpb r11,r5,r0 + or r5,r10,r11 + cmpdi cr7,r5,0 + addi r9,r9,16 + bne cr7,L(dword_zero) + + andi. r10,r9,63 + beq cr0,L(preloop) + ld r6,8(r4) + ldu r5,16(r4) + cmpb r10,r6,r0 + cmpb r11,r5,r0 + or r5,r10,r11 + cmpdi cr7,r5,0 + addi r9,r9,16 + + /* At this point, we are necessarily 64-byte aligned. If no zeroes were + found, jump to the vectorized loop. */ + beq cr7,L(preloop) + +L(dword_zero): + /* OK, one (or both) of the doublewords contains a null byte. Check + the first doubleword and decrement the address in case the first + doubleword really contains a null byte. */ + + cmpdi cr6,r10,0 + addi r4,r4,-8 + bne cr6,L(done) + + /* The null byte must be in the second doubleword. Adjust the address + again and move the result of cmpb to r10 so we can calculate the + length. */ + + mr r10,r11 + addi r4,r4,8 + + /* If the null byte was found in the non-vectorized code, compute the + final length. r10 has the output of the cmpb instruction, that is, + it contains 0xff in the same position as the null byte in the + original doubleword from the string. Use that to calculate the + length. */ +L(done): +#ifdef __LITTLE_ENDIAN__ + addi r9, r10,-1 /* Form a mask from trailing zeros. */ + andc r9, r9,r10 + popcntd r0, r9 /* Count the bits in the mask. */ +#else + cntlzd r0,r10 /* Count leading zeros before the match. */ +#endif + subf r5,r3,r4 + srdi r0,r0,3 /* Convert leading/trailing zeros to bytes. */ + add r3,r5,r0 /* Compute final length. */ + blr + + /* Vectorized implementation starts here. */ + .p2align 4 +L(preloop): + /* Set up for the loop. */ + mr r4,r9 + li r7, 16 /* Load required offsets. */ + li r8, 32 + li r9, 48 + li r12, 8 + vxor v0,v0,v0 /* VR with null chars to use with + vcmpequb. */ + + /* Main loop to look for the end of the string. We will read in + 64-byte chunks. Align it to 32 bytes and unroll it 3 times to + leverage the icache performance. */ + .p2align 5 +L(loop): + lvx v1,r4,r0 /* Load 4 quadwords. */ + lvx v2,r4,r7 + lvx v3,r4,r8 + lvx v4,r4,r9 + vminub v5,v1,v2 /* Compare and merge into one VR for speed. */ + vminub v6,v3,v4 + vminub v7,v5,v6 + vcmpequb. v7,v7,v0 /* Check for NULLs. */ + addi r4,r4,64 /* Adjust address for the next iteration. */ + bne cr6,L(vmx_zero) + + lvx v1,r4,r0 /* Load 4 quadwords. */ + lvx v2,r4,r7 + lvx v3,r4,r8 + lvx v4,r4,r9 + vminub v5,v1,v2 /* Compare and merge into one VR for speed. */ + vminub v6,v3,v4 + vminub v7,v5,v6 + vcmpequb. v7,v7,v0 /* Check for NULLs. */ + addi r4,r4,64 /* Adjust address for the next iteration. */ + bne cr6,L(vmx_zero) + + lvx v1,r4,r0 /* Load 4 quadwords. */ + lvx v2,r4,r7 + lvx v3,r4,r8 + lvx v4,r4,r9 + vminub v5,v1,v2 /* Compare and merge into one VR for speed. */ + vminub v6,v3,v4 + vminub v7,v5,v6 + vcmpequb. v7,v7,v0 /* Check for NULLs. */ + addi r4,r4,64 /* Adjust address for the next iteration. */ + beq cr6,L(loop) + +L(vmx_zero): + /* OK, we found a null byte. Let's look for it in the current 64-byte + block and mark it in its corresponding VR. */ + vcmpequb v1,v1,v0 + vcmpequb v2,v2,v0 + vcmpequb v3,v3,v0 + vcmpequb v4,v4,v0 + + /* We will now 'compress' the result into a single doubleword, so it + can be moved to a GPR for the final calculation. First, we + generate an appropriate mask for vbpermq, so we can permute bits into + the first halfword. */ + vspltisb v10,3 + lvsl v11,r0,r0 + vslb v10,v11,v10 + + /* Permute the first bit of each byte into bits 48-63. */ + VBPERMQ(v1,v1,v10) + VBPERMQ(v2,v2,v10) + VBPERMQ(v3,v3,v10) + VBPERMQ(v4,v4,v10) + + /* Shift each component into its correct position for merging. */ +#ifdef __LITTLE_ENDIAN__ + vsldoi v2,v2,v2,2 + vsldoi v3,v3,v3,4 + vsldoi v4,v4,v4,6 +#else + vsldoi v1,v1,v1,6 + vsldoi v2,v2,v2,4 + vsldoi v3,v3,v3,2 +#endif + + /* Merge the results and move to a GPR. */ + vor v1,v2,v1 + vor v2,v3,v4 + vor v4,v1,v2 + MFVRD(r10,v4) + + /* Adjust address to the begninning of the current 64-byte block. */ + addi r4,r4,-64 + +#ifdef __LITTLE_ENDIAN__ + addi r9, r10,-1 /* Form a mask from trailing zeros. */ + andc r9, r9,r10 + popcntd r0, r9 /* Count the bits in the mask. */ +#else + cntlzd r0,r10 /* Count leading zeros before the match. */ +#endif + subf r5,r3,r4 + add r3,r5,r0 /* Compute final length. */ + blr + +END (strlen) +libc_hidden_builtin_def (strlen)