Message ID | 001c01d0a912$42357710$c6a06530$@com |
---|---|
State | New |
Headers | show |
I'll leave it to Siddhesh to review this patch, but it seems wrong to add this benchmark to string-bench (if it can be handled identically to the string benchmarks, maybe you need to change the variable name to reflect what's actually common between them).
On Wed, Jun 17, 2015 at 04:28:27PM +0100, Wilco Dijkstra wrote: > Hi, > > Due to popular demand, here is a new benchmark that tests isinf, isnan, > isnormal, isfinite and fpclassify. It uses 2 arrays with 1024 doubles, > one with 99% finite FP numbers (10% zeroes, 10% negative) and 1% inf/NaN, > the other with 50% inf, and 50% Nan. > > Results shows that using the GCC built-ins in math.h will give huge speedups > due to avoiding explict calls, PLT indirection to execute a function with > 3-4 instructions. The GCC builtins have similar performance as the existing > math_private inlines for __isnan, __finite and __isinf_ns. > > OK for commit? > > ChangeLog: > 2015-06-17 Wilco Dijkstra <wdijkstr@arm.com> > > * benchtests/Makefile: Add bench-math-inlines.c. > * benchtests/bench-math-inlines.c: New benchmark. > > --- > benchtests/Makefile | 14 +-- > benchtests/bench-math-inlines.c | 203 ++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 211 insertions(+), 6 deletions(-) > create mode 100644 benchtests/bench-math-inlines.c > > diff --git a/benchtests/Makefile b/benchtests/Makefile > index 8e615e5..3c20180 100644 > --- a/benchtests/Makefile > +++ b/benchtests/Makefile > @@ -30,12 +30,13 @@ bench-pthread := pthread_once > bench := $(bench-math) $(bench-pthread) > > # String function benchmarks. > -string-bench := bcopy bzero memccpy memchr memcmp memcpy memmem memmove \ > - mempcpy memset rawmemchr stpcpy stpncpy strcasecmp strcasestr \ > - strcat strchr strchrnul strcmp strcpy strcspn strlen \ > - strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \ > - strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \ > - strcoll > +string-bench := bcopy bzero math-inlines memccpy memchr memcmp memcpy memmem \ > + memmove mempcpy memset rawmemchr stpcpy stpncpy strcasecmp \ > + strcasestr strcat strchr strchrnul strcmp strcpy strcspn \ > + strlen strncasecmp strncat strncmp strncpy strnlen strpbrk \ > + strrchr strspn strstr strcpy_chk stpcpy_chk memrchr strsep \ > + strtok strcoll > + Putting mathinline test cases in string-bench is wrong. Make a new benchset-math or something similar. > string-bench-all := $(string-bench) > > # We have to generate locales > @@ -58,6 +59,7 @@ CFLAGS-bench-ffsll.c += -fno-builtin > bench-malloc := malloc-thread > > $(addprefix $(objpfx)bench-,$(bench-math)): $(libm) > +$(addprefix $(objpfx)bench-,math-inlines): $(libm) This doesn't need an addprefix, but if you make a benchset-math, then you can use this format for benchset-math. > $(addprefix $(objpfx)bench-,$(bench-pthread)): $(shared-thread-library) > $(objpfx)bench-malloc-thread: $(shared-thread-library) > > diff --git a/benchtests/bench-math-inlines.c b/benchtests/bench-math-inlines.c > new file mode 100644 > index 0000000..c21a3d3 > --- /dev/null > +++ b/benchtests/bench-math-inlines.c > @@ -0,0 +1,203 @@ > +/* Measure math inline functions. We've agreed to print outputs in JSON. Please see other newer benchtests to see how you can do that. The string benchtests need to be ported to the JSON output format. > + Copyright (C) 2015 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <http://www.gnu.org/licenses/>. */ > + > +#define SIZE 1024 > +#define TEST_MAIN > +#define TEST_NAME "math-inlines" > +#include "bench-string.h" > + > +#include <stdlib.h> > +#include <math.h> > +#include <stdint.h> > + > +#define BOOLTEST(func) \ > +int \ > +func ## _t (volatile double *p, size_t n, size_t iters) \ > +{ \ > + int i, j; \ > + int res = 0; \ > + for (j = 0; j < iters; j++) \ > + for (i = 0; i < n; i++) \ > + if (func (p[i] * 2.0)) res++; \ > + return res; \ > +} > + > +#define VALUETEST(func) \ > +int \ > +func ## _t (volatile double *p, size_t n, size_t iters) \ > +{ \ > + int i, j; \ > + int res = 0; \ > + for (j = 0; j < iters; j++) \ > + for (i = 0; i < n; i++) \ > + res += func (p[i] * 2.0); \ > + return res; \ > +} > + > +typedef union > +{ > + double value; > + uint64_t word; > +} ieee_double_shape_type; > + > +#define EXTRACT_WORDS64(i,d) \ > +do { \ > + ieee_double_shape_type gh_u; \ > + gh_u.value = (d); \ > + (i) = gh_u.word; \ > +} while (0) > + > +/* Explicit inlines similar to math_private.h versions. */ > + > +extern __always_inline int > +__isnan_inl (double d) > +{ > + uint64_t di; > + EXTRACT_WORDS64 (di, d); > + return (di & 0x7fffffffffffffffull) > 0x7ff0000000000000ull; > +} > + > +extern __always_inline int > +__isinf_inl (double x) > +{ > + uint64_t ix; > + EXTRACT_WORDS64 (ix,x); > + if ((ix << 1) != 0xffe0000000000000ull) > + return 0; > + return (int)(ix >> 32); > +} > + > +extern __always_inline int > +__finite_inl (double d) > +{ > + uint64_t di; > + EXTRACT_WORDS64 (di, d); > + return (di & 0x7fffffffffffffffull) < 0x7ff0000000000000ull; > +} > + > +/* Explicit inline similar to existing math.h implementation. */ > + > +#define __isnormal_inl(X) (__fpclassify (X) == FP_NORMAL) > +#define __isnormal_inl2(X) (fpclassify (X) == FP_NORMAL) > + > +/* Test fpclassify with use of only 2 of the 5 results. */ > + > +extern __always_inline int > +__fpclassify_test1 (double d) > +{ > + int cl = fpclassify (d); > + return cl == FP_NAN || cl == FP_INFINITE; > +} > + > +extern __always_inline int > +__fpclassify_test2 (double d) > +{ > + return isnan (d) || isinf (d); > +} > + > +/* Create test functions for each possibility. */ > + > +BOOLTEST (__isnan) > +BOOLTEST (isnan) > +BOOLTEST (__isnan_inl) > + > +BOOLTEST (__isinf) > +BOOLTEST (isinf) > +BOOLTEST (__isinf_inl) > + > +BOOLTEST (__finite) > +BOOLTEST (isfinite) > +BOOLTEST (__finite_inl) > + > +BOOLTEST (isnormal) > +BOOLTEST (__isnormal_inl) > +BOOLTEST (__isnormal_inl2) > + > +VALUETEST (fpclassify) > +VALUETEST (__fpclassify) > +BOOLTEST (__fpclassify_test1) > +BOOLTEST (__fpclassify_test2) > + > +IMPL (isnan_t, 0) > +IMPL (__isnan_t, 1) > +IMPL (__isnan_inl_t, 2) > +IMPL (isinf_t, 3) > +IMPL (__isinf_t, 4) > +IMPL (__isinf_inl_t, 5) > +IMPL (isfinite_t, 6) > +IMPL (__finite_t, 7) > +IMPL (__finite_inl_t, 8) > +IMPL (isnormal_t, 9) > +IMPL (__isnormal_inl_t, 10) > +IMPL (__isnormal_inl2_t, 11) > +IMPL (fpclassify_t, 12) > +IMPL (__fpclassify_t, 13) > +IMPL (__fpclassify_test1_t, 14) > +IMPL (__fpclassify_test2_t, 15) > + > +typedef int (*proto_t) (volatile double *p, size_t n, size_t iters); > + > +static void > +do_one_test (impl_t *impl, volatile double *arr, size_t len) > +{ > + size_t iters = INNER_LOOP_ITERS * 10; > + timing_t start, stop, cur; > + > + TIMING_NOW (start); > + CALL (impl, arr, len, iters); > + TIMING_NOW (stop); > + TIMING_DIFF (cur, start, stop); > + > + TIMING_PRINT_MEAN ((double) cur, (double) iters); > +} > + > +static volatile double arr1[SIZE]; > +static volatile double arr2[SIZE]; > + > +int > +test_main (void) > +{ > + size_t i; > + > + test_init (); > + > + /* Create 2 test arrays, one with 10% zeroes, 10% negative values, > + 79% positive values and 1% infinity/NaN. The other contains > + 50% inf, 50% NaN. */ > + > + for (i = 0; i < SIZE; i++) > + { > + int x = rand () & 255; > + arr1[i] = (x < 25) ? 0.0 : ((x < 50) ? -1 : 100); > + if (x == 255) arr1[i] = __builtin_inf (); > + if (x == 254) arr1[i] = __builtin_nan ("0"); > + arr2[i] = (x < 128) ? __builtin_inf () : __builtin_nan ("0"); > + } > + > + FOR_EACH_IMPL (impl, 0) > + { > + printf ("%20s: ", impl->name); > + do_one_test (impl, arr1, SIZE); > + do_one_test (impl, arr2, SIZE); > + putchar ('\n'); > + } > + > + return ret; > +} > + > +#include "../test-skeleton.c" > -- > 1.9.1 > >
On Wed, Jun 17, 2015 at 04:28:27PM +0100, Wilco Dijkstra wrote: > Hi, > > Due to popular demand, here is a new benchmark that tests isinf, isnan, > isnormal, isfinite and fpclassify. It uses 2 arrays with 1024 doubles, > one with 99% finite FP numbers (10% zeroes, 10% negative) and 1% inf/NaN, > the other with 50% inf, and 50% Nan. > > Results shows that using the GCC built-ins in math.h will give huge speedups > due to avoiding explict calls, PLT indirection to execute a function with > 3-4 instructions. The GCC builtins have similar performance as the existing > math_private inlines for __isnan, __finite and __isinf_ns. > > OK for commit? > Ran these, on x64 using builtins is regression even with your benchmark. Main problem here is what exactly you do measure. I don't know how much of your results were caused by measuring latency of load/multiply/move to int register chain. With OoO that latency shouldn't be problem. Original results are following, when I also inlined isfinite: __fpclassify_test2_t: 3660.24 3733.22 __fpclassify_test1_t: 3696.33 3691.3 __fpclassify_t: 14365.8 11116.5 fpclassify_t: 6045.69 3128.76 __isnormal_inl2_t: 5275.85 14562.6 __isnormal_inl_t: 14753.3 11143.5 isnormal_t: 4418.84 4411.59 __finite_inl_t: 3038.75 3038.4 __finite_t: 7712.42 7697.24 isfinite_t: 3108.91 3107.85 __isinf_inl_t: 2109.05 2817.19 __isinf_t: 8555.51 8559.36 isinf_t: 3472.62 3408.8 __isnan_inl_t: 2682.12 2691.39 __isnan_t: 7698.14 7735.29 isnan_t: 2592.58 2572.82 But with latency hiding by using argument first suddenly even isnan and isnormal become regression. for (i = 0; i < n; i++){ res += 3*sin(p[i] * 2.0); \ if (func (p[i] * 2.0)) res += 5;} \ __fpclassify_test2_t: 92929.4 37256.8 __fpclassify_test1_t: 94020.1 35512.1 __fpclassify_t: 17321.2 13325.1 fpclassify_t: 8021.29 4376.89 __isnormal_inl2_t: 93896.9 38941.8 __isnormal_inl_t: 98069.2 46140.4 isnormal_t: 94775.6 36941.8 __finite_inl_t: 84059.9 38304 __finite_t: 96052.4 45998.2 isfinite_t: 93371.5 36659.1 __isinf_inl_t: 92532.9 36050.1 __isinf_t: 95929.4 46585.2 isinf_t: 93290.1 36445.6 __isnan_inl_t: 82760.7 37452.2 __isnan_t: 98064.6 45338.8 isnan_t: 93386.7 37786.4
> Ondřej Bílka wrote: > On Wed, Jun 17, 2015 at 04:28:27PM +0100, Wilco Dijkstra wrote: > > Hi, > > > > Due to popular demand, here is a new benchmark that tests isinf, isnan, > > isnormal, isfinite and fpclassify. It uses 2 arrays with 1024 doubles, > > one with 99% finite FP numbers (10% zeroes, 10% negative) and 1% inf/NaN, > > the other with 50% inf, and 50% Nan. > > > > Results shows that using the GCC built-ins in math.h will give huge speedups > > due to avoiding explict calls, PLT indirection to execute a function with > > 3-4 instructions. The GCC builtins have similar performance as the existing > > math_private inlines for __isnan, __finite and __isinf_ns. > > > > OK for commit? > > > Ran these, on x64 using builtins is regression even with your benchmark. > > Main problem here is what exactly you do measure. I don't know how much > of your results were caused by measuring latency of load/multiply/move > to int register chain. With OoO that latency shouldn't be problem. > > Original results are following, when I also inlined isfinite: > > __fpclassify_test2_t: 3660.24 3733.22 > __fpclassify_test1_t: 3696.33 3691.3 > __fpclassify_t: 14365.8 11116.5 > fpclassify_t: 6045.69 3128.76 > __isnormal_inl2_t: 5275.85 14562.6 > __isnormal_inl_t: 14753.3 11143.5 > isnormal_t: 4418.84 4411.59 > __finite_inl_t: 3038.75 3038.4 > __finite_t: 7712.42 7697.24 > isfinite_t: 3108.91 3107.85 > __isinf_inl_t: 2109.05 2817.19 > __isinf_t: 8555.51 8559.36 > isinf_t: 3472.62 3408.8 > __isnan_inl_t: 2682.12 2691.39 > __isnan_t: 7698.14 7735.29 > isnan_t: 2592.58 2572.82 > > > But with latency hiding by using argument first suddenly even isnan and > isnormal become regression. > > for (i = 0; i < n; i++){ res += 3*sin(p[i] * 2.0); \ > if (func (p[i] * 2.0)) res += 5;} \ > > > __fpclassify_test2_t: 92929.4 37256.8 > __fpclassify_test1_t: 94020.1 35512.1 > __fpclassify_t: 17321.2 13325.1 > fpclassify_t: 8021.29 4376.89 > __isnormal_inl2_t: 93896.9 38941.8 > __isnormal_inl_t: 98069.2 46140.4 > isnormal_t: 94775.6 36941.8 > __finite_inl_t: 84059.9 38304 > __finite_t: 96052.4 45998.2 > isfinite_t: 93371.5 36659.1 > __isinf_inl_t: 92532.9 36050.1 > __isinf_t: 95929.4 46585.2 > isinf_t: 93290.1 36445.6 > __isnan_inl_t: 82760.7 37452.2 > __isnan_t: 98064.6 45338.8 > isnan_t: 93386.7 37786.4 Can you try this with: for (i = 0; i < n; i++) \ { double tmp = p[i] * 2.0; \ if (sin(tmp) < 1.0) res++; if (func (tmp)) res += 5;} \ Basically GCC does the array read and multiply twice just like you told it to (remember this is not using -ffast-math). Also avoid adding unnecessary FP operations and conversions (which may interact badly with timing the code we're trying to test). For me the fixed version still shows the expected answer: the built-ins are either faster or as fast as the inlines. So I don't think there is any regression here (remember also that previously there were no inlines at all except for a few inside GLIBC, so the real speedup is much larger). __fpclassify_test2_t: 1.07 __fpclassify_test1_t: 1.07 __fpclassify_t: 1.24 fpclassify_t: 1 __isnormal_inl2_t: 1.11 __isnormal_inl_t: 1.24 isnormal_t: 1.04 __finite_inl_t: 1.04 __finite_t: 1.19 isfinite_t: 1 __isinf_inl_t: 1.07 __isinf_t: 1.22 isinf_t: 1 __isnan_inl_t: 1.04 __isnan_t: 1.14 isnan_t: 1 Wilco
On Mon, Jul 06, 2015 at 03:50:11PM +0100, Wilco Dijkstra wrote: > > > > Ondřej Bílka wrote: > > But with latency hiding by using argument first suddenly even isnan and > > isnormal become regression. > > > > for (i = 0; i < n; i++){ res += 3*sin(p[i] * 2.0); \ > > if (func (p[i] * 2.0)) res += 5;} \ > > > > > > __fpclassify_test2_t: 92929.4 37256.8 > > __fpclassify_test1_t: 94020.1 35512.1 > > __fpclassify_t: 17321.2 13325.1 > > fpclassify_t: 8021.29 4376.89 > > __isnormal_inl2_t: 93896.9 38941.8 > > __isnormal_inl_t: 98069.2 46140.4 > > isnormal_t: 94775.6 36941.8 > > __finite_inl_t: 84059.9 38304 > > __finite_t: 96052.4 45998.2 > > isfinite_t: 93371.5 36659.1 > > __isinf_inl_t: 92532.9 36050.1 > > __isinf_t: 95929.4 46585.2 > > isinf_t: 93290.1 36445.6 > > __isnan_inl_t: 82760.7 37452.2 > > __isnan_t: 98064.6 45338.8 > > isnan_t: 93386.7 37786.4 > > Can you try this with: > > for (i = 0; i < n; i++) \ > { double tmp = p[i] * 2.0; \ > if (sin(tmp) < 1.0) res++; if (func (tmp)) res += 5;} \ > That doesn't change outcome: __fpclassify_test2_t: 99721 51051.6 __fpclassify_test1_t: 85015.2 43607.4 __fpclassify_t: 13997.3 10475.1 fpclassify_t: 13502.5 10253.6 __isnormal_inl2_t: 76479.4 41531.7 __isnormal_inl_t: 76526.9 41560.8 isnormal_t: 76458.6 41547.7 __finite_inl_t: 71108.6 33271.3 __finite_t: 73031 37452.3 isfinite_t: 73024.9 37447 __isinf_inl_t: 68599.2 32792.9 __isinf_t: 74851 40108.8 isinf_t: 74871.9 40109.9 __isnan_inl_t: 71100.8 33659.6 __isnan_t: 72914 37592.4 isnan_t: 72909.4 37635.8 > Basically GCC does the array read and multiply twice just like you told it > to (remember this is not using -ffast-math). Also avoid adding unnecessary > FP operations and conversions (which may interact badly with timing the > code we're trying to test). > And how do you know that most users don't use fp conversions in their code just before isinf? These interactions make benchtests worthless as in practice a different variant would be faster than one that you measure. > For me the fixed version still shows the expected answer: the built-ins are > either faster or as fast as the inlines. So I don't think there is any > regression here (remember also that previously there were no inlines at all > except for a few inside GLIBC, so the real speedup is much larger). Thats arm only. So it looks that we need platform-specific headers and testing. These give speedup but as internal on x64 are better as they are its natural to ask if using these in general would give same speedup. That leads to fixing gcc builtins.
> Ondřej Bílka wrote: > On Mon, Jul 06, 2015 at 03:50:11PM +0100, Wilco Dijkstra wrote: > > > > > > > Ondřej Bílka wrote: > > > But with latency hiding by using argument first suddenly even isnan and > > > isnormal become regression. > > > > > > for (i = 0; i < n; i++){ res += 3*sin(p[i] * 2.0); \ > > > if (func (p[i] * 2.0)) res += 5;} \ > > > > > > > > > __fpclassify_test2_t: 92929.4 37256.8 > > > __fpclassify_test1_t: 94020.1 35512.1 > > > __fpclassify_t: 17321.2 13325.1 > > > fpclassify_t: 8021.29 4376.89 > > > __isnormal_inl2_t: 93896.9 38941.8 > > > __isnormal_inl_t: 98069.2 46140.4 > > > isnormal_t: 94775.6 36941.8 > > > __finite_inl_t: 84059.9 38304 > > > __finite_t: 96052.4 45998.2 > > > isfinite_t: 93371.5 36659.1 > > > __isinf_inl_t: 92532.9 36050.1 > > > __isinf_t: 95929.4 46585.2 > > > isinf_t: 93290.1 36445.6 > > > __isnan_inl_t: 82760.7 37452.2 > > > __isnan_t: 98064.6 45338.8 > > > isnan_t: 93386.7 37786.4 > > > > Can you try this with: > > > > for (i = 0; i < n; i++) \ > > { double tmp = p[i] * 2.0; \ > > if (sin(tmp) < 1.0) res++; if (func (tmp)) res += 5;} \ > > > That doesn't change outcome: > > __fpclassify_test2_t: 99721 51051.6 > __fpclassify_test1_t: 85015.2 43607.4 > __fpclassify_t: 13997.3 10475.1 > fpclassify_t: 13502.5 10253.6 > __isnormal_inl2_t: 76479.4 41531.7 > __isnormal_inl_t: 76526.9 41560.8 > isnormal_t: 76458.6 41547.7 > __finite_inl_t: 71108.6 33271.3 > __finite_t: 73031 37452.3 > isfinite_t: 73024.9 37447 > __isinf_inl_t: 68599.2 32792.9 > __isinf_t: 74851 40108.8 > isinf_t: 74871.9 40109.9 > __isnan_inl_t: 71100.8 33659.6 > __isnan_t: 72914 37592.4 > isnan_t: 72909.4 37635.8 That doesn't look correct - it looks like this didn't use the built-ins at all, did you forget to apply that patch? Anyway I received a new machine so now GLIBC finally builds for x64. Since there appear large variations from run to run I repeat the same tests 4 times by copying the FOR_EACH_IMPL loop. The first 1 or 2 are bad, the last 2 converge to useable results. So I suspect frequency scaling is an issue here. Without the sin(tmp) part I get: remainder_test2_t: 40786.9 192862 remainder_test1_t: 43008.2 196311 __fpclassify_test2_t: 2856.56 3020.12 __fpclassify_test1_t: 3043.53 3135.89 __fpclassify_t: 12500.6 10152.5 fpclassify_t: 2972.54 3047.65 __isnormal_inl2_t: 4619.55 14491.1 __isnormal_inl_t: 12896.3 10306.7 isnormal_t: 4254.42 3667.87 __finite_inl_t: 3979.58 3991.6 __finite_t: 7039.61 7039.37 isfinite_t: 2992.65 2969.25 __isinf_inl_t: 2852.1 3239.23 __isinf_t: 8991.81 8813.44 isinf_t: 3241.75 3241.54 __isnan_inl_t: 4003.51 3977.73 __isnan_t: 7054.54 7054.5 isnan_t: 2819.66 2801.94 And with the sin() addition: remainder_test2_t: 105093 214635 remainder_test1_t: 106635 218012 __fpclassify_test2_t: 64290.9 32116.6 __fpclassify_test1_t: 64365.1 32310.2 __fpclassify_t: 72006.1 41607 fpclassify_t: 64190.3 33450.1 __isnormal_inl2_t: 65959.1 33672 __isnormal_inl_t: 71875.7 41727.3 isnormal_t: 65676.1 32826.1 __finite_inl_t: 69600.6 35293.3 __finite_t: 67653.8 38627.2 isfinite_t: 64435.9 34904.9 __isinf_inl_t: 68556.6 33176 __isinf_t: 69066.4 39562.7 isinf_t: 64755.5 34244.6 __isnan_inl_t: 69577.3 34776.2 __isnan_t: 67538.8 38321.3 isnan_t: 63963 33276.6 The remainder test is basically math/w_remainder.c adapted to use __isinf_inl and __isnan_inl (test1) or the isinf/isnan built-ins (test2). From this it seems that __isinf_inl is slightly better than the builtin, but it does not show up as a regression when combined with sin or in the remainder test. So I don't see any potential regression here on x64 - in fact it looks like inlining using the built-ins gives quite good speedups across the board. And besides inlining applications using GLIBC it also inlines a lot of callsites within GLIBC that weren't previously inlined. > > Basically GCC does the array read and multiply twice just like you told it > > to (remember this is not using -ffast-math). Also avoid adding unnecessary > > FP operations and conversions (which may interact badly with timing the > > code we're trying to test). > > > And how do you know that most users don't use fp conversions in their > code just before isinf? These interactions make benchtests worthless as > in practice a different variant would be faster than one that you > measure. You always get such interactions, it's unavoidable. That's why I added some actual math code that uses isinf/isnan to see how it performs in real life. > > For me the fixed version still shows the expected answer: the built-ins are > > either faster or as fast as the inlines. So I don't think there is any > > regression here (remember also that previously there were no inlines at all > > except for a few inside GLIBC, so the real speedup is much larger). > > Thats arm only. So it looks that we need platform-specific headers and testing. Well I just confirmed the same gains apply to x64. Wilco
On Fri, Jul 10, 2015 at 05:09:16PM +0100, Wilco Dijkstra wrote: > > Ondřej Bílka wrote: > > On Mon, Jul 06, 2015 at 03:50:11PM +0100, Wilco Dijkstra wrote: > > > > > > > > > > Ondřej Bílka wrote: > > > > But with latency hiding by using argument first suddenly even isnan and > > > > isnormal become regression. > > > > > > That doesn't look correct - it looks like this didn't use the built-ins at all, > did you forget to apply that patch? > No, from what you wrote I expected that patch already tests builtins which doesn't. Applied patch and got different results. When I added patch results are similar. > Anyway I received a new machine so now GLIBC finally builds for x64. Since > there appear large variations from run to run I repeat the same tests 4 times > by copying the FOR_EACH_IMPL loop. The first 1 or 2 are bad, the last 2 > converge to useable results. So I suspect frequency scaling is an issue here. > > Without the sin(tmp) part I get: > > remainder_test2_t: 40786.9 192862 > remainder_test1_t: 43008.2 196311 > __fpclassify_test2_t: 2856.56 3020.12 > __fpclassify_test1_t: 3043.53 3135.89 > __fpclassify_t: 12500.6 10152.5 > fpclassify_t: 2972.54 3047.65 > __isnormal_inl2_t: 4619.55 14491.1 > __isnormal_inl_t: 12896.3 10306.7 > isnormal_t: 4254.42 3667.87 > __finite_inl_t: 3979.58 3991.6 > __finite_t: 7039.61 7039.37 > isfinite_t: 2992.65 2969.25 > __isinf_inl_t: 2852.1 3239.23 > __isinf_t: 8991.81 8813.44 > isinf_t: 3241.75 3241.54 > __isnan_inl_t: 4003.51 3977.73 > __isnan_t: 7054.54 7054.5 > isnan_t: 2819.66 2801.94 > > And with the sin() addition: > > remainder_test2_t: 105093 214635 > remainder_test1_t: 106635 218012 > __fpclassify_test2_t: 64290.9 32116.6 > __fpclassify_test1_t: 64365.1 32310.2 > __fpclassify_t: 72006.1 41607 > fpclassify_t: 64190.3 33450.1 > __isnormal_inl2_t: 65959.1 33672 > __isnormal_inl_t: 71875.7 41727.3 > isnormal_t: 65676.1 32826.1 > __finite_inl_t: 69600.6 35293.3 > __finite_t: 67653.8 38627.2 > isfinite_t: 64435.9 34904.9 > __isinf_inl_t: 68556.6 33176 > __isinf_t: 69066.4 39562.7 > isinf_t: 64755.5 34244.6 > __isnan_inl_t: 69577.3 34776.2 > __isnan_t: 67538.8 38321.3 > isnan_t: 63963 33276.6 > > The remainder test is basically math/w_remainder.c adapted to use __isinf_inl > and __isnan_inl (test1) or the isinf/isnan built-ins (test2). > Which still doesn't have to mean anything, only if you test a application that frequently uses these you will get result without doubt. Here a simple modification produces different results. One of many objections is that by simply adding gcc will try to make branchless code like converting that to res += 5 * (isnan(tmp)). So use more difficult branch and and with following two I get __builtin_isinf lot slower. { double tmp = p[i] * 2.0; \ res += 3 * sin (tmp); if (func (tmp)) res += 3* sin (2 * tmp) ;} \ { double tmp = p[i] * 2.0; \ if (func (tmp)) res += 3 * sin (2 * tmp) ;} \ __fpclassify_test2_t: 4645.32 34549.8 __fpclassify_test1_t: 4858.35 35682 __fpclassify_t: 12959.6 9402.96 fpclassify_t: 6163.18 3573.92 __isnormal_inl2_t: 72463.8 14535.8 __isnormal_inl_t: 78154.4 10665.7 isnormal_t: 72172.7 3978.68 __finite_inl_t: 69525.5 2048.91 __finite_t: 75383.9 10260.5 isfinite_t: 71913.3 3449.38 __isinf_inl_t: 2279.96 19166.1 __isinf_t: 10462.3 32308.5 isinf_t: 3706.35 22804.2 __isnan_inl_t: 2376.56 20769.1 __isnan_t: 9558.7 23683.9 isnan_t: 2025.91 19824.9 and __fpclassify_test2_t: 69720.7 70690 __fpclassify_test1_t: 69575.2 57032 __fpclassify_t: 12839.1 9460.82 fpclassify_t: 6136.18 3355.79 __isnormal_inl2_t: 133754 34673.9 __isnormal_inl_t: 137851 41468.9 isnormal_t: 133421 37423.4 __finite_inl_t: 131153 34229.9 __finite_t: 135990 39609.5 isfinite_t: 132679 34604.8 __isinf_inl_t: 66901.3 44636.5 __isinf_t: 74330.2 53375.7 isinf_t: 68432.6 45350.5 __isnan_inl_t: 67125.3 43663.6 __isnan_t: 71864.2 49792.7 isnan_t: 67107.9 42478.3 > >From this it seems that __isinf_inl is slightly better than the builtin, but > it does not show up as a regression when combined with sin or in the remainder > test. > That doesn't hold generaly as remainder test it could be just caused by isnan being slower than isinf. > So I don't see any potential regression here on x64 - in fact it looks like > inlining using the built-ins gives quite good speedups across the board. And > besides inlining applications using GLIBC it also inlines a lot of callsites > within GLIBC that weren't previously inlined. > That doesn't mean anything as this is still poor benchmark. It only uses tight loop which doesn't measure cost of loading constants for example. > > > Basically GCC does the array read and multiply twice just like you told it > > > to (remember this is not using -ffast-math). Also avoid adding unnecessary > > > FP operations and conversions (which may interact badly with timing the > > > code we're trying to test). > > > > > And how do you know that most users don't use fp conversions in their > > code just before isinf? These interactions make benchtests worthless as > > in practice a different variant would be faster than one that you > > measure. > > You always get such interactions, it's unavoidable. That's why I added some > actual math code that uses isinf/isnan to see how it performs in real life. > > > > For me the fixed version still shows the expected answer: the built-ins are > > > either faster or as fast as the inlines. So I don't think there is any > > > regression here (remember also that previously there were no inlines at all > > > except for a few inside GLIBC, so the real speedup is much larger). > > > > Thats arm only. So it looks that we need platform-specific headers and testing. > > Well I just confirmed the same gains apply to x64. > No, that doesn't confirm anything yet. You need to do more extensive testing to get somewhat reliable answer and still you won't be sure. I asked you to run on arm my benchmark to measure results of inlining. I attached again version. You should run it to see how results will differ. test() { gcc -O3 -DBRANCHED -DI1=$1 -DI2=$2 -DT$3 ft.c 2>/dev/null echo branched time ./a.out gcc -O3 -DI1=$1 -DI2=$2 -DT$3 ft.c 2>/dev/null echo builtin time ./a.out } test2() { echo conditional add test $1 $2 1 echo branch test $1 $2 2 echo sum test $1 $2 3 } echo "don't inline" test2 '__attribute__((noinline))' '__attribute__((noinline))' echo inline outer call test2 '__attribute__((noinline))' '__attribute__((always_inline))' echo inline inner call test2 '__attribute__((always_inline))' '__attribute__((noinline))' echo tigth loop test2 '__attribute__((always_inline))' '__attribute__((always_inline))' #include <stdlib.h> #include <stdio.h> //#include <math.h> #undef isinf2 #ifdef BRANCHED int I2 isinf2 (double dx) { union u { double d; long l; }; volatile union u u; u.d = dx; long x = u.l; return 2 * x == 0xffe0000000000000 ? (x == 0x7ff0000000000000 ? 1 : -1) : 0; } #else int I2 isinf2(double x) { return __builtin_isinf (x); } #endif void I1 t1(double x,double *d) { if (isinf2 (x)) *d+=42; } void I1 t2(double x,double *d) { if (isinf2 (x)) abort(); } void I1 t3(double x,double *d) { *d += isinf2 (x); } int main() { double ret= 0.0; int i, j; double *d = malloc (800000); for (i=0;i<1000;i++) d[i]=4.2;//(rand() % 2) ? 0.0 : (rand() % 2 ? 1.3 : 1.0/0.0); for (j=0; j<300000; j++) #ifdef T1 for (i=0; i<1000; i++) t1(d[i]+1.3, &ret); #endif #ifdef T2 for (i=0; i<1000; i++) t2(d[i]+1.3, &ret); #endif #ifdef T3 for (i=0; i<1000; i++) t3(d[i]+1.3, &ret); #endif return ret; }
> Ondřej Bílka wrote: > On Fri, Jul 10, 2015 at 05:09:16PM +0100, Wilco Dijkstra wrote: > > > Ondřej Bílka wrote: > > > On Mon, Jul 06, 2015 at 03:50:11PM +0100, Wilco Dijkstra wrote: > > > > > > > > > > > > > Ondřej Bílka wrote: > > > > > But with latency hiding by using argument first suddenly even isnan and > > > > > isnormal become regression. > > > > > > > > > That doesn't look correct - it looks like this didn't use the built-ins at all, > > did you forget to apply that patch? > > > No, from what you wrote I expected that patch already tests builtins > which doesn't. Applied patch and got different results. When I added > patch results are similar. OK, I extended the benchmark to add the built-ins explicitly so that you don't need to apply the math.h inline patch first. > Which still doesn't have to mean anything, only if you test a > application that frequently uses these you will get result without > doubt. We don't have applications that uses these, but we can say without any doubt that they will show huge speedups if they do use these functions frequently or any math functions that use them a lot. Remainder() for example shows ~7% gain with the new inlines. > Here a simple modification produces different results. One of many > objections is that by simply adding gcc will try to make branchless code > like converting that to res += 5 * (isnan(tmp)). So use more difficult > branch and and with following two I get __builtin_isinf lot slower. > > { double tmp = p[i] * 2.0; \ > res += 3 * sin (tmp); if (func (tmp)) res += 3* sin (2 * tmp) ;} \ > > { double tmp = p[i] * 2.0; \ > if (func (tmp)) res += 3 * sin (2 * tmp) ;} \ So here are the results again for the original test and your 2 tests above: remainder_test2_t: 40966.3 192314 remainder_test1_t: 43697.4 196474 __fpclassify_t: 12665.2 9951.16 fpclassify_t: 2979.56 2974.35 __fpclassify_test2_t: 2889.92 2984.95 __fpclassify_test1_t: 3269.67 3199.05 isnormal_t: 4381.54 4041.78 __isnormal_builtin_t: 4586.15 4318.18 __isnormal_inl2_t: 4371.76 10737.4 __isnormal_inl_t: 12635.5 10418.4 isfinite_t: 2992.79 2979.5 __isfinite_builtin_t: 2982.96 2982.92 __finite_inl_t: 4090.2 4064.52 __finite_t: 7058.1 7039.74 isinf_t: 3274.14 3299.75 __isinf_builtin_t: 3195.79 3196.05 __isinf_ns_builtin_t: 3241.91 3241.96 __isinf_ns_t: 3500.85 3493.8 __isinf_inl_t: 2834.83 3433.89 __isinf_t: 8794.62 8812.5 isnan_t: 2801.83 2801.67 __isnan_builtin_t: 2794.7 2891.37 __isnan_inl_t: 4216.83 3980.52 __isnan_t: 7070.36 7088.15 remainder_test2_t: 105654 239008 remainder_test1_t: 107533 239310 __fpclassify_t: 12523.5 10080.2 fpclassify_t: 2974.47 2983.21 __fpclassify_test2_t: 64227.5 55564.1 __fpclassify_test1_t: 64036.1 55424 isnormal_t: 122300 34529.5 __isnormal_builtin_t: 122592 34616 __isnormal_inl2_t: 123425 35056 __isnormal_inl_t: 129589 41615.3 isfinite_t: 123254 34041.5 __isfinite_builtin_t: 123302 34093 __finite_inl_t: 123455 34631.8 __finite_t: 127298 39587.5 isinf_t: 63744 45997.6 __isinf_builtin_t: 63545.2 46100.2 __isinf_ns_builtin_t: 63570.9 46087.5 __isinf_ns_t: 63890.9 45754.5 __isinf_inl_t: 64008.5 46505.2 __isinf_t: 68915.7 51833.8 isnan_t: 62866.8 45023.5 __isnan_builtin_t: 62951.9 44956.8 __isnan_inl_t: 63855.1 45294.6 __isnan_t: 67156.5 49505.3 remainder_test2_t: 41147.4 216349 remainder_test1_t: 43860.8 220614 __fpclassify_t: 12569.1 10124.3 fpclassify_t: 3068.91 2974.31 __fpclassify_test2_t: 4048.88 32446.5 __fpclassify_test1_t: 4005.86 32783.1 isnormal_t: 63707.9 14550.8 __isnormal_builtin_t: 63672 14383 __isnormal_inl2_t: 65730.4 15059.1 __isnormal_inl_t: 73352.2 10570.7 isfinite_t: 64756.7 2719.12 __isfinite_builtin_t: 64748.8 2664.12 __finite_inl_t: 65331.4 2740.1 __finite_t: 70374.5 7944.69 isinf_t: 2927.67 20684.2 __isinf_builtin_t: 2848.58 20050.9 __isinf_ns_builtin_t: 2932.22 21809.9 __isinf_ns_t: 2908.41 18973.9 __isinf_inl_t: 2971.63 18025.5 __isinf_t: 9010.58 28392.3 isnan_t: 2841.28 16457.3 __isnan_builtin_t: 2841.34 15017.8 __isnan_inl_t: 2846.25 19736.8 __isnan_t: 8171.32 23874.3 > > >From this it seems that __isinf_inl is slightly better than the builtin, but > > it does not show up as a regression when combined with sin or in the remainder > > test. > > > That doesn't hold generaly as remainder test it could be just caused by > isnan being slower than isinf. No, the new isinf/isnan are both faster than the previous versions (some isinf calls were inlined as __isinf_ns, but even that one is clearly slower than the builtin in all the results). Remember once again this patch creates new inlines that didn't exist before as well as replacing existing inlines in GLIBC with even faster ones. The combination of these means it is simply an impossibility that anything could become slower. > > Well I just confirmed the same gains apply to x64. > > > No, that doesn't confirm anything yet. You need to do more extensive > testing to get somewhat reliable answer and still you won't be sure. No, this benchmark does give a very clear and reliable answer: everything speeds up by a huge factor. > I asked you to run on arm my benchmark to measure results of inlining. > I attached again version. You should run it to see how results will differ. I did run it but I don't understand what it's supposed to mean, and I can't share the results. So do you have something simpler that shows what point you're trying to make? Or maybe you could add your own benchmark to GLIBC? Wilco
On 07/13/2015 07:02 AM, Wilco Dijkstra wrote: > OK, I extended the benchmark to add the built-ins explicitly so that > you don't need to apply the math.h inline patch first. Please post the new version of your benchmark so we can review for inclusion in 2.23. Cheers, Carlos.
On Mon, Jul 13, 2015 at 12:02:51PM +0100, Wilco Dijkstra wrote: > > Ondřej Bílka wrote: > > On Fri, Jul 10, 2015 at 05:09:16PM +0100, Wilco Dijkstra wrote: > > > > Ondřej Bílka wrote: > > > > On Mon, Jul 06, 2015 at 03:50:11PM +0100, Wilco Dijkstra wrote: > > > > > > > > > > > > > > > > Ondřej Bílka wrote: > > > > > > But with latency hiding by using argument first suddenly even isnan and > > > > > > isnormal become regression. > > > > > > > > > > > > That doesn't look correct - it looks like this didn't use the built-ins at all, > > > did you forget to apply that patch? > > > > > No, from what you wrote I expected that patch already tests builtins > > which doesn't. Applied patch and got different results. When I added > > patch results are similar. > > OK, I extended the benchmark to add the built-ins explicitly so that > you don't need to apply the math.h inline patch first. > Thats also good general policy to include all strategies so we will see that some is optimal on some architecture. > > Which still doesn't have to mean anything, only if you test a > > application that frequently uses these you will get result without > > doubt. > > We don't have applications that uses these, but we can say without any > doubt that they will show huge speedups if they do use these functions > frequently or any math functions that use them a lot. Remainder() for > example shows ~7% gain with the new inlines. > No, this is just microbenchmark as I and Joseph are telling you ten times. There are lot factors that could change performance like need of constants which you don't measure as you inline remainder. So your speedup could be only illusory. > > Here a simple modification produces different results. One of many > > objections is that by simply adding gcc will try to make branchless code > > like converting that to res += 5 * (isnan(tmp)). So use more difficult > > branch and and with following two I get __builtin_isinf lot slower. > > > > { double tmp = p[i] * 2.0; \ > > res += 3 * sin (tmp); if (func (tmp)) res += 3* sin (2 * tmp) ;} \ > > > > { double tmp = p[i] * 2.0; \ > > if (func (tmp)) res += 3 * sin (2 * tmp) ;} \ > > So here are the results again for the original test and your 2 tests above: Also which gcc are you using? There is problem that recent gcc started optimize better libc inlines but not builtins. > > > >From this it seems that __isinf_inl is slightly better than the builtin, but > > > it does not show up as a regression when combined with sin or in the remainder > > > test. > > > > > That doesn't hold generaly as remainder test it could be just caused by > > isnan being slower than isinf. > > No, the new isinf/isnan are both faster than the previous versions (some isinf > calls were inlined as __isinf_ns, but even that one is clearly slower than the > builtin in all the results). Remember once again this patch creates new inlines > that didn't exist before as well as replacing existing inlines in GLIBC with > even faster ones. The combination of these means it is simply an impossibility > that anything could become slower. > I got that isinf libc inline is faster so how do you explain it? > > > Well I just confirmed the same gains apply to x64. > > > > > No, that doesn't confirm anything yet. You need to do more extensive > > testing to get somewhat reliable answer and still you won't be sure. > > No, this benchmark does give a very clear and reliable answer: everything > speeds up by a huge factor. > While that is true it isn't exactly what we asked. We asked what inline is best one. So you need accurate benchmark to also measure that, not rough one that could only tell that inlining helps. > > I asked you to run on arm my benchmark to measure results of inlining. > > I attached again version. You should run it to see how results will differ. > > I did run it but I don't understand what it's supposed to mean, and I can't share > the results. So do you have something simpler that shows what point you're trying > to make? Or maybe you could add your own benchmark to GLIBC? > It was previously explained in thread. Please read it again, I will reply separately whats wrong with v2 benchmark.
> Ondřej Bílka wrote: > On Mon, Jul 13, 2015 at 12:02:51PM +0100, Wilco Dijkstra wrote: > > > Ondřej Bílka wrote: > > > On Fri, Jul 10, 2015 at 05:09:16PM +0100, Wilco Dijkstra wrote: > > > > > Ondřej Bílka wrote: > > > > > On Mon, Jul 06, 2015 at 03:50:11PM +0100, Wilco Dijkstra wrote: > > > > > > > > We don't have applications that uses these, but we can say without any > > doubt that they will show huge speedups if they do use these functions > > frequently or any math functions that use them a lot. Remainder() for > > example shows ~7% gain with the new inlines. > > > No, this is just microbenchmark as I and Joseph are telling you ten > times. There are lot factors that could change performance like need of > constants which you don't measure as you inline remainder. So your > speedup could be only illusory. Of course it is a microbenchmark, that's how I wrote it. And because the new inlines show huge speedups it proves they will improve performance everywhere. Note the remainder functions are explicitly not inlined. > Also which gcc are you using? There is problem that recent gcc started > optimize better libc inlines but not builtins. I always use latest trunk. > > > > >From this it seems that __isinf_inl is slightly better than the builtin, but > > > > it does not show up as a regression when combined with sin or in the remainder > > > > test. > > > > > > > That doesn't hold generaly as remainder test it could be just caused by > > > isnan being slower than isinf. > > > > No, the new isinf/isnan are both faster than the previous versions (some isinf > > calls were inlined as __isinf_ns, but even that one is clearly slower than the > > builtin in all the results). Remember once again this patch creates new inlines > > that didn't exist before as well as replacing existing inlines in GLIBC with > > even faster ones. The combination of these means it is simply an impossibility > > that anything could become slower. > > > I got that isinf libc inline is faster so how do you explain it? Why does it matter? My patch as it stands does not change that internal inline. Again, my patch adds *new* inlines for functions which were previously not inlined. > > > > Well I just confirmed the same gains apply to x64. > > > > > > > No, that doesn't confirm anything yet. You need to do more extensive > > > testing to get somewhat reliable answer and still you won't be sure. > > > > No, this benchmark does give a very clear and reliable answer: everything > > speeds up by a huge factor. > > > While that is true it isn't exactly what we asked. We asked what inline > is best one. So you need accurate benchmark to also measure that, not > rough one that could only tell that inlining helps. No that's not what anyone asked, not what I agreed with Joseph and most definitely not what I intended. I posted a patch to speedup these functions as they are currently not inlined and thus extremely slow, plus a benchmark to prove that my patch makes a huge difference to performance. There is no doubt it is possible to improve the GCC built-ins further, so I created a bug to do so (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66462). However that is beyond the scope of this patch and is likely microarchitecture specific, so each target has to decide what code they generate for each builtin. Wilco
On 07/17/2015 09:24 AM, Wilco Dijkstra wrote: > There is no doubt it is possible to improve the GCC built-ins further, so I > created a bug to do so (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66462). > However that is beyond the scope of this patch and is likely microarchitecture > specific, so each target has to decide what code they generate for each builtin. Agreed. Thanks for the good work Wilco. c.
diff --git a/benchtests/Makefile b/benchtests/Makefile index 8e615e5..3c20180 100644 --- a/benchtests/Makefile +++ b/benchtests/Makefile @@ -30,12 +30,13 @@ bench-pthread := pthread_once bench := $(bench-math) $(bench-pthread) # String function benchmarks. -string-bench := bcopy bzero memccpy memchr memcmp memcpy memmem memmove \ - mempcpy memset rawmemchr stpcpy stpncpy strcasecmp strcasestr \ - strcat strchr strchrnul strcmp strcpy strcspn strlen \ - strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \ - strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \ - strcoll +string-bench := bcopy bzero math-inlines memccpy memchr memcmp memcpy memmem \ + memmove mempcpy memset rawmemchr stpcpy stpncpy strcasecmp \ + strcasestr strcat strchr strchrnul strcmp strcpy strcspn \ + strlen strncasecmp strncat strncmp strncpy strnlen strpbrk \ + strrchr strspn strstr strcpy_chk stpcpy_chk memrchr strsep \ + strtok strcoll + string-bench-all := $(string-bench) # We have to generate locales @@ -58,6 +59,7 @@ CFLAGS-bench-ffsll.c += -fno-builtin bench-malloc := malloc-thread $(addprefix $(objpfx)bench-,$(bench-math)): $(libm) +$(addprefix $(objpfx)bench-,math-inlines): $(libm) $(addprefix $(objpfx)bench-,$(bench-pthread)): $(shared-thread-library) $(objpfx)bench-malloc-thread: $(shared-thread-library) diff --git a/benchtests/bench-math-inlines.c b/benchtests/bench-math-inlines.c new file mode 100644 index 0000000..c21a3d3 --- /dev/null +++ b/benchtests/bench-math-inlines.c @@ -0,0 +1,203 @@ +/* Measure math inline functions. + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define SIZE 1024 +#define TEST_MAIN +#define TEST_NAME "math-inlines" +#include "bench-string.h" + +#include <stdlib.h> +#include <math.h> +#include <stdint.h> + +#define BOOLTEST(func) \ +int \ +func ## _t (volatile double *p, size_t n, size_t iters) \ +{ \ + int i, j; \ + int res = 0; \ + for (j = 0; j < iters; j++) \ + for (i = 0; i < n; i++) \ + if (func (p[i] * 2.0)) res++; \ + return res; \ +} + +#define VALUETEST(func) \ +int \ +func ## _t (volatile double *p, size_t n, size_t iters) \ +{ \ + int i, j; \ + int res = 0; \ + for (j = 0; j < iters; j++) \ + for (i = 0; i < n; i++) \ + res += func (p[i] * 2.0); \ + return res; \ +} + +typedef union +{ + double value; + uint64_t word; +} ieee_double_shape_type; + +#define EXTRACT_WORDS64(i,d) \ +do { \ + ieee_double_shape_type gh_u; \ + gh_u.value = (d); \ + (i) = gh_u.word; \ +} while (0) + +/* Explicit inlines similar to math_private.h versions. */ + +extern __always_inline int +__isnan_inl (double d) +{ + uint64_t di; + EXTRACT_WORDS64 (di, d); + return (di & 0x7fffffffffffffffull) > 0x7ff0000000000000ull; +} + +extern __always_inline int +__isinf_inl (double x) +{ + uint64_t ix; + EXTRACT_WORDS64 (ix,x); + if ((ix << 1) != 0xffe0000000000000ull) + return 0; + return (int)(ix >> 32); +} + +extern __always_inline int +__finite_inl (double d) +{ + uint64_t di; + EXTRACT_WORDS64 (di, d); + return (di & 0x7fffffffffffffffull) < 0x7ff0000000000000ull; +} + +/* Explicit inline similar to existing math.h implementation. */ + +#define __isnormal_inl(X) (__fpclassify (X) == FP_NORMAL) +#define __isnormal_inl2(X) (fpclassify (X) == FP_NORMAL) + +/* Test fpclassify with use of only 2 of the 5 results. */ + +extern __always_inline int +__fpclassify_test1 (double d) +{ + int cl = fpclassify (d); + return cl == FP_NAN || cl == FP_INFINITE; +} + +extern __always_inline int +__fpclassify_test2 (double d) +{ + return isnan (d) || isinf (d); +} + +/* Create test functions for each possibility. */ + +BOOLTEST (__isnan) +BOOLTEST (isnan) +BOOLTEST (__isnan_inl) + +BOOLTEST (__isinf) +BOOLTEST (isinf) +BOOLTEST (__isinf_inl) + +BOOLTEST (__finite) +BOOLTEST (isfinite) +BOOLTEST (__finite_inl) + +BOOLTEST (isnormal) +BOOLTEST (__isnormal_inl) +BOOLTEST (__isnormal_inl2) + +VALUETEST (fpclassify) +VALUETEST (__fpclassify) +BOOLTEST (__fpclassify_test1) +BOOLTEST (__fpclassify_test2) + +IMPL (isnan_t, 0) +IMPL (__isnan_t, 1) +IMPL (__isnan_inl_t, 2) +IMPL (isinf_t, 3) +IMPL (__isinf_t, 4) +IMPL (__isinf_inl_t, 5) +IMPL (isfinite_t, 6) +IMPL (__finite_t, 7) +IMPL (__finite_inl_t, 8) +IMPL (isnormal_t, 9) +IMPL (__isnormal_inl_t, 10) +IMPL (__isnormal_inl2_t, 11) +IMPL (fpclassify_t, 12) +IMPL (__fpclassify_t, 13) +IMPL (__fpclassify_test1_t, 14) +IMPL (__fpclassify_test2_t, 15) + +typedef int (*proto_t) (volatile double *p, size_t n, size_t iters); + +static void +do_one_test (impl_t *impl, volatile double *arr, size_t len) +{ + size_t iters = INNER_LOOP_ITERS * 10; + timing_t start, stop, cur; + + TIMING_NOW (start); + CALL (impl, arr, len, iters); + TIMING_NOW (stop); + TIMING_DIFF (cur, start, stop); + + TIMING_PRINT_MEAN ((double) cur, (double) iters); +} + +static volatile double arr1[SIZE]; +static volatile double arr2[SIZE]; + +int +test_main (void) +{ + size_t i; + + test_init (); + + /* Create 2 test arrays, one with 10% zeroes, 10% negative values, + 79% positive values and 1% infinity/NaN. The other contains + 50% inf, 50% NaN. */ + + for (i = 0; i < SIZE; i++) + { + int x = rand () & 255; + arr1[i] = (x < 25) ? 0.0 : ((x < 50) ? -1 : 100); + if (x == 255) arr1[i] = __builtin_inf (); + if (x == 254) arr1[i] = __builtin_nan ("0"); + arr2[i] = (x < 128) ? __builtin_inf () : __builtin_nan ("0"); + } + + FOR_EACH_IMPL (impl, 0) + { + printf ("%20s: ", impl->name); + do_one_test (impl, arr1, SIZE); + do_one_test (impl, arr2, SIZE); + putchar ('\n'); + } + + return ret; +} + +#include "../test-skeleton.c"