From patchwork Wed May 27 09:07:17 2015
X-Patchwork-Submitter: Ondřej Bílka
X-Patchwork-Id: 477001
Date: Wed, 27 May 2015 11:07:17 +0200
From: Ondřej Bílka
To: libc-alpha@sourceware.org
Subject: [PATCH 1/* v2] Generic string function optimization: Add skeleton
Message-ID: <20150527090717.GA27814@domone>
References: <20150527060121.GA19105@domone>
In-Reply-To: <20150527060121.GA19105@domone>

After testing I found that I had introduced a typo in the conversion to
the skeleton. This fixes it; here is the correct version.

	* string/common.h: New file.
	* string/skeleton.h: Likewise.
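For review convenience, here is a small standalone illustration of the
per-byte zero-test expression that string/common.h below builds on. This
is an editorial sketch, not part of the patch; byte_is_zero is a
hypothetical helper name.

  /* Editorial sketch, not part of the patch: the per-byte form of the
     zero-detection expression.  Only a zero input byte yields a nonzero
     result (its high bit, 0x80).  */
  #include <stdio.h>

  static unsigned int
  byte_is_zero (unsigned int b)		/* B holds one byte, 0..255.  */
  {
    return (((b & 127) + 127) ^ 128) & 128 & ~b;
  }

  int
  main (void)
  {
    const unsigned int tests[] = { 0x00, 0x01, 0x7f, 0x80, 0xff };
    for (size_t i = 0; i < sizeof tests / sizeof tests[0]; i++)
      printf ("byte 0x%02x -> 0x%02x\n", tests[i], byte_is_zero (tests[i]));
    return 0;
  }

Only the 0x00 input prints a nonzero result; string/common.h evaluates
the same expression on every byte of a word at once.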
diff --git a/string/common.h b/string/common.h
new file mode 100644
index 0000000..3b239dd
--- /dev/null
+++ b/string/common.h
@@ -0,0 +1,35 @@
+#include <stdint.h>
+
+static const unsigned long int ones = (~0UL / 255);      /* 0x0101...01  */
+static const unsigned long int add = 127 * (~0UL / 255); /* 0x7f7f...7f  */
+static const unsigned long int high_bits = 128 * (~0UL / 255); /* 0x8080...80  */
+
+/* Use vector arithmetic tricks.  The idea is an expression that, for
+   each unsigned byte s, evaluates to zero when s is nonzero and to a
+   nonzero value (the byte's high bit) when s is zero.  Per byte that
+   expression is (((s & 127) + 127) ^ 128) & 128 & ~s.  We evaluate it
+   on all bytes of a word in parallel: the result is nonzero exactly
+   when the word contains a zero byte, and the first nonzero byte of
+   the result marks the first zero byte of the input.  */
+
+static __always_inline
+unsigned long int
+contains_zero (unsigned long int s)
+{
+  return (((s & add) + add) ^ high_bits) & high_bits & ~s;
+}
+
+#define LSIZE sizeof (unsigned long int)
+/* Nonzero when reading N bytes at X may cross a page boundary.  */
+#define CROSS_PAGE(x, n) (((uintptr_t) (x)) % 4096 > 4096 - (n))
+
+/* Index of the first nonzero byte of U, where only the high bit of
+   each byte may be set, as in contains_zero results.  Assumes a
+   little-endian word.  */
+static __always_inline
+size_t
+first_nonzero_byte (unsigned long int u)
+{
+#ifdef FAST_FFS
+  return ffsl (u) / 8 - 1;
+#else
+  u = u ^ (u - 1);	/* Set all bits up to the first set bit.  */
+  u = u & ones;		/* Keep one bit per byte below the match.  */
+  u = u * ones;		/* Top byte now counts those bytes.  */
+  return (u >> (8 * LSIZE - 8)) - 1;
+#endif
+}
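As a quick sanity check of the word-level helpers just added (an
editorial sketch, not part of the patch): the test below plants a zero
byte at each position of a word and verifies that contains_zero and
first_nonzero_byte locate it. It assumes a little-endian machine, which
first_nonzero_byte's byte arithmetic already relies on, and that it is
compiled next to string/common.h.

  /* Editorial sanity check, not part of the patch.  Little-endian only.  */
  #include <stdio.h>
  #include <string.h>
  #include "common.h"		/* The file added above.  */

  int
  main (void)
  {
    char buf[LSIZE];
    for (size_t i = 0; i < LSIZE; i++)
      {
	unsigned long int w;
	memset (buf, 'x', sizeof buf);
	buf[i] = '\0';			/* Plant a zero byte at I.  */
	memcpy (&w, buf, sizeof w);
	if (first_nonzero_byte (contains_zero (w)) != i)
	  {
	    printf ("failure at byte %zu\n", i);
	    return 1;
	  }
      }
    puts ("contains_zero/first_nonzero_byte: ok");
    return 0;
  }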
diff --git a/string/skeleton.h b/string/skeleton.h
new file mode 100644
index 0000000..563e6e4
--- /dev/null
+++ b/string/skeleton.h
@@ -0,0 +1,145 @@
+/* Skeleton of generic string functions.
+   Copyright (C) 1991-2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <string.h>
+#include <libc-internal.h>
+#include "common.h"
+
+/* BOUND (x) is nonzero when pointer X lies past the permitted end of
+   the string.  The default never stops the scan.  */
+#ifndef BOUND
+# define BOUND(x) 0
+#endif
+
+/* Check one word of string data at S; on a match store the match
+   position in *RESULT and return 1.  */
+static __always_inline
+int
+found_in_long_bytes (char *s, unsigned long int cmask, char **result)
+{
+  const unsigned long int *lptr = (const unsigned long int *) s;
+  unsigned long int mask = EXPRESSION (*lptr, cmask);
+  if (mask)
+    {
+      *result = s + first_nonzero_byte (mask);
+      return 1;
+    }
+  else
+    return 0;
+}
+
+static __always_inline
+char *
+string_skeleton (const char *s_in, int c_in, char *end)
+{
+  unsigned long int mask;
+  const unsigned long int *lptr;
+  char *s = (char *) s_in;
+  unsigned char c = (unsigned char) c_in;
+  char *r;
+  unsigned long int cmask = c * ones;
+
+#if _STRING_ARCH_unaligned
+  /* We fetch 32 bytes while not crossing a page boundary.  Most strings
+     in practice are of that size, so we avoid a loop.  This performs
+     best in practice; the alternative below uses aligned loads but is
+     slower when the string starts just a few bytes before a 32-byte
+     boundary.  The tradeoff is that we could occasionally fetch an
+     extra cache line without needing it, but this optimization pays
+     for that.  */
+  if (!CROSS_PAGE (s, 32))
+    {
+      if (found_in_long_bytes (s + 0 * LSIZE, cmask, &r))
+	return r;
+      if (found_in_long_bytes (s + 1 * LSIZE, cmask, &r))
+	return r;
+      if (found_in_long_bytes (s + 2 * LSIZE, cmask, &r))
+	return r;
+      if (found_in_long_bytes (s + 3 * LSIZE, cmask, &r))
+	return r;
+      if (sizeof (unsigned long int) == 4)
+	{
+	  /* With 4-byte words the loads above covered only 16 bytes;
+	     check the rest of the 32-byte window.  */
+	  if (found_in_long_bytes (s + 4 * LSIZE, cmask, &r))
+	    return r;
+	  if (found_in_long_bytes (s + 5 * LSIZE, cmask, &r))
+	    return r;
+	  if (found_in_long_bytes (s + 6 * LSIZE, cmask, &r))
+	    return r;
+	  if (found_in_long_bytes (s + 7 * LSIZE, cmask, &r))
+	    return r;
+	}
+
+      if (BOUND (s + 32))
+	return NULL;
+    }
+  else
+    {
+#endif
+      /* We need to use aligned loads.  For the first load we read some
+	 bytes before the start of the string and discard them by
+	 shifting the mask down.  */
+      char *s_aligned = PTR_ALIGN_DOWN (s, LSIZE);
+      lptr = (const unsigned long int *) s_aligned;
+      mask = (EXPRESSION (*lptr, cmask)) >> (8 * (s - s_aligned));
+
+      if (mask)
+	return s + first_nonzero_byte (mask);
+
+      if (BOUND (s_aligned + 1 * LSIZE))
+	return NULL;
+      if (found_in_long_bytes (s_aligned + 1 * LSIZE, cmask, &r))
+	return r;
+      if (BOUND (s_aligned + 2 * LSIZE))
+	return NULL;
+      if (found_in_long_bytes (s_aligned + 2 * LSIZE, cmask, &r))
+	return r;
+      if (BOUND (s_aligned + 3 * LSIZE))
+	return NULL;
+      if (found_in_long_bytes (s_aligned + 3 * LSIZE, cmask, &r))
+	return r;
+      if (BOUND (s_aligned + 4 * LSIZE))
+	return NULL;
+#if _STRING_ARCH_unaligned
+    }
+#endif
+  /* We have now checked enough bytes to enter a loop that processes
+     four words per iteration.  */
+
+  char *s_loop = PTR_ALIGN_DOWN (s, 4 * LSIZE);
+  while (!BOUND (s_loop + 4 * LSIZE))
+    {
+      s_loop += 4 * LSIZE;
+      lptr = (const unsigned long int *) (s_loop + 0 * LSIZE);
+      mask = EXPRESSION (*lptr, cmask);
+      lptr = (const unsigned long int *) (s_loop + 1 * LSIZE);
+      mask |= EXPRESSION (*lptr, cmask);
+      lptr = (const unsigned long int *) (s_loop + 2 * LSIZE);
+      mask |= EXPRESSION (*lptr, cmask);
+      lptr = (const unsigned long int *) (s_loop + 3 * LSIZE);
+      mask |= EXPRESSION (*lptr, cmask);
+
+      if (mask)
+	{
+	  /* Some word matched; find which one.  If it was none of the
+	     first three, the bits left in MASK belong to the fourth.  */
+	  if (found_in_long_bytes (s_loop + 0 * LSIZE, cmask, &r))
+	    return r;
+	  if (found_in_long_bytes (s_loop + 1 * LSIZE, cmask, &r))
+	    return r;
+	  if (found_in_long_bytes (s_loop + 2 * LSIZE, cmask, &r))
+	    return r;
+
+	  return s_loop + 3 * LSIZE + first_nonzero_byte (mask);
+	}
+    }
+  return NULL;
+}
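For context on how the skeleton is meant to be instantiated (the patch
itself adds no consumer), here is a hypothetical rawmemchr-style
wrapper. It is only a sketch, assuming a glibc source tree where
libc-internal.h is available; my_rawmemchr and its EXPRESSION are
illustrative names, not part of the patch.

  /* Hypothetical consumer sketch, not part of this patch.  EXPRESSION
     must be defined before including the skeleton; it marks matching
     bytes by their high bit.  A rawmemchr-style search matches bytes
     equal to C, and the default BOUND (x) == 0 keeps the scan
     unbounded.  */
  #define EXPRESSION(x, cmask) contains_zero ((x) ^ (cmask))
  #include "skeleton.h"

  char *
  my_rawmemchr (const char *s, int c)
  {
    return string_skeleton (s, c, NULL);
  }

  /* A bounded, memchr-style variant would instead define
       #define BOUND(x) ((x) >= end)
     before the include, which appears to be the intended use of the
     otherwise-unused END parameter; a strchr-style one would use
       contains_zero (x) | contains_zero ((x) ^ (cmask))
     so the terminating NUL also stops the scan.  */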