Message ID | 20230315205910.4120377-4-adhemerval.zanella@linaro.org |
---|---|
State | New |
Headers | show |
Series | Improve fmod and fmodf | expand |
On Wed, Mar 15, 2023 at 1:59 PM Adhemerval Zanella <adhemerval.zanella@linaro.org> wrote: > > This uses a new algorithm similar to already proposed earlier [1]. > With x = mx * 2^ex and y = my * 2^ey (mx, my, ex, ey being integers), > the simplest implementation is: > > mx * 2^ex == 2 * mx * 2^(ex - 1) > > while (ex > ey) > { > mx *= 2; > --ex; > mx %= my; > } > > With mx/my being mantissa of double floating pointer, on each step the > argument reduction can be improved 11 (which is sizeo of uint64_t minus > MANTISSA_WIDTH plus the signal bit): > > while (ex > ey) > { > mx << 11; > ex -= 11; > mx %= my; > } */ > > The implementation uses builtin clz and ctz, along with shifts to > convert hx/hy back to doubles. Different than the original patch, > this path assume modulo/divide operation is slow, so use multiplication > with invert values. > > I see the following performance improvements using fmod benchtests > (result only show the 'mean' result): > > Architecture | Input | master | patch > -----------------|-----------------|----------|-------- > x86_64 (Ryzen 9) | subnormals | 19.1584 | 12.5049 > x86_64 (Ryzen 9) | normal | 1016.51 | 296.939 > x86_64 (Ryzen 9) | close-exponents | 18.4428 | 16.0244 I tried it with the test in https://sourceware.org/bugzilla/show_bug.cgi?id=30179 On Intel i7-10710U, I got time ./sse 3.13user 0.00system 0:03.13elapsed 99%CPU (0avgtext+0avgdata 512maxresident)k 0inputs+0outputs (0major+37minor)pagefaults 0swaps time ./x87 0.24user 0.00system 0:00.24elapsed 100%CPU (0avgtext+0avgdata 512maxresident)k 0inputs+0outputs (0major+37minor)pagefaults 0swaps time ./generic 0.55user 0.00system 0:00.55elapsed 99%CPU (0avgtext+0avgdata 512maxresident)k 0inputs+0outputs (0major+37minor)pagefaults 0swaps The new generic is still slower than x87. 
> aarch64 (N1) | subnormal | 11.153 | 6.81778 > aarch64 (N1) | normal | 528.649 | 155.62 > aarch64 (N1) | close-exponents | 11.4517 | 8.21306 > > I also see similar improvements on arm-linux-gnueabihf when running on > the N1 aarch64 chips, where it a lot of soft-fp implementation (for > modulo, clz, ctz, and multiplication): > > Architecture | Input | master | patch > -----------------|-----------------|----------|-------- > armhf (N1) | subnormal | 15.908 | 15.1083 > armhf (N1) | normal | 837.525 | 244.833 > armhf (N1) | close-exponents | 16.2111 | 21.8182 > > Instead of using the math_private.h definitions, I used the > math_config.h instead which is used on newer math implementations. > > Co-authored-by: kirill <kirill.okhotnikov@gmail.com> > > [1] https://sourceware.org/pipermail/libc-alpha/2020-November/119794.html > --- > math/libm-test-fmod.inc | 6 + > sysdeps/ieee754/dbl-64/e_fmod.c | 232 ++++++++++++++++----------- > sysdeps/ieee754/dbl-64/math_config.h | 61 +++++++ > 3 files changed, 203 insertions(+), 96 deletions(-) > > diff --git a/math/libm-test-fmod.inc b/math/libm-test-fmod.inc > index 39fd02f9ef..76c1e0cd45 100644 > --- a/math/libm-test-fmod.inc > +++ b/math/libm-test-fmod.inc > @@ -217,6 +217,12 @@ static const struct test_ff_f_data fmod_test_data[] = > TEST_ff_f (fmod, -0x1p1023L, 0x3p-1022L, -0x1p-1021L, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED), > TEST_ff_f (fmod, -0x1p1023L, -0x3p-1022L, -0x1p-1021L, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED), > #endif > +#if TEST_COND_binary64 > + TEST_ff_f (fmod, 0x0.cded0a47373e9p-1022, 0x0.3212f5b8c8c16p-1022, 0x0.05a1336414391p-1022, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED), > + TEST_ff_f (fmod, 0x0.cded0a47373e9p-1022, -0x0.3212f5b8c8c16p-1022, 0x0.05a1336414391p-1022, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED), > + TEST_ff_f (fmod, -0x0.cded0a47373e9p-1022, 0x0.3212f5b8c8c16p-1022, -0x0.05a1336414391p-1022, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED), > + TEST_ff_f (fmod, -0x0.cded0a47373e9p-1022, -0x0.3212f5b8c8c16p-1022, 
-0x0.05a1336414391p-1022, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED), > +#endif > #if MIN_EXP <= -16381 > TEST_ff_f (fmod, 0x1p16383L, 0x3p-16445L, 0x1p-16445L, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED), > TEST_ff_f (fmod, 0x1p16383L, -0x3p-16445L, 0x1p-16445L, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED), > diff --git a/sysdeps/ieee754/dbl-64/e_fmod.c b/sysdeps/ieee754/dbl-64/e_fmod.c > index 60b8bbb9d2..d21143e0cf 100644 > --- a/sysdeps/ieee754/dbl-64/e_fmod.c > +++ b/sysdeps/ieee754/dbl-64/e_fmod.c > @@ -1,105 +1,145 @@ > -/* > - * ==================================================== > - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. > - * > - * Developed at SunPro, a Sun Microsystems, Inc. business. > - * Permission to use, copy, modify, and distribute this > - * software is freely granted, provided that this notice > - * is preserved. > - * ==================================================== > - */ > - > -/* > - * __ieee754_fmod(x,y) > - * Return x mod y in exact arithmetic > - * Method: shift and subtract > - */ > +/* Floating-point remainder function. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. 
*/ > > -#include <math.h> > -#include <math_private.h> > -#include <stdint.h> > #include <libm-alias-finite.h> > +#include <math.h> > +#include "math_config.h" > + > +/* With x = mx * 2^ex and y = my * 2^ey (mx, my, ex, ey being integers), the > + simplest implementation is: > + > + mx * 2^ex == 2 * mx * 2^(ex - 1) > > -static const double one = 1.0, Zero[] = {0.0, -0.0,}; > + while (ex > ey) > + { > + mx *= 2; > + --ex; > + mx %= my; > + } > + > + With mx/my being mantissa of double floating pointer, on each step the > + argument reduction can be improved 11 (which is sizeo of uint64_t minus > + MANTISSA_WIDTH plus the signal bit): > + > + while (ex > ey) > + { > + mx << 11; > + ex -= 11; > + mx %= my; > + } */ > > double > __ieee754_fmod (double x, double y) > { > - int32_t n,ix,iy; > - int64_t hx,hy,hz,sx,i; > - > - EXTRACT_WORDS64(hx,x); > - EXTRACT_WORDS64(hy,y); > - sx = hx&UINT64_C(0x8000000000000000); /* sign of x */ > - hx ^=sx; /* |x| */ > - hy &= UINT64_C(0x7fffffffffffffff); /* |y| */ > - > - /* purge off exception values */ > - if(__builtin_expect(hy==0 > - || hx >= UINT64_C(0x7ff0000000000000) > - || hy > UINT64_C(0x7ff0000000000000), 0)) > - /* y=0,or x not finite or y is NaN */ > - return (x*y)/(x*y); > - if(__builtin_expect(hx<=hy, 0)) { > - if(hx<hy) return x; /* |x|<|y| return x */ > - return Zero[(uint64_t)sx>>63]; /* |x|=|y| return x*0*/ > - } > - > - /* determine ix = ilogb(x) */ > - if(__builtin_expect(hx<UINT64_C(0x0010000000000000), 0)) { > - /* subnormal x */ > - for (ix = -1022,i=(hx<<11); i>0; i<<=1) ix -=1; > - } else ix = (hx>>52)-1023; > - > - /* determine iy = ilogb(y) */ > - if(__builtin_expect(hy<UINT64_C(0x0010000000000000), 0)) { /* subnormal y */ > - for (iy = -1022,i=(hy<<11); i>0; i<<=1) iy -=1; > - } else iy = (hy>>52)-1023; > - > - /* set up hx, hy and align y to x */ > - if(__builtin_expect(ix >= -1022, 1)) > - hx = UINT64_C(0x0010000000000000)|(UINT64_C(0x000fffffffffffff)&hx); > - else { /* subnormal x, shift x to normal 
*/ > - n = -1022-ix; > - hx<<=n; > - } > - if(__builtin_expect(iy >= -1022, 1)) > - hy = UINT64_C(0x0010000000000000)|(UINT64_C(0x000fffffffffffff)&hy); > - else { /* subnormal y, shift y to normal */ > - n = -1022-iy; > - hy<<=n; > - } > - > - /* fix point fmod */ > - n = ix - iy; > - while(n--) { > - hz=hx-hy; > - if(hz<0){hx = hx+hx;} > - else { > - if(hz==0) /* return sign(x)*0 */ > - return Zero[(uint64_t)sx>>63]; > - hx = hz+hz; > - } > - } > - hz=hx-hy; > - if(hz>=0) {hx=hz;} > - > - /* convert back to floating value and restore the sign */ > - if(hx==0) /* return sign(x)*0 */ > - return Zero[(uint64_t)sx>>63]; > - while(hx<UINT64_C(0x0010000000000000)) { /* normalize x */ > - hx = hx+hx; > - iy -= 1; > - } > - if(__builtin_expect(iy>= -1022, 1)) { /* normalize output */ > - hx = ((hx-UINT64_C(0x0010000000000000))|((uint64_t)(iy+1023)<<52)); > - INSERT_WORDS64(x,hx|sx); > - } else { /* subnormal output */ > - n = -1022 - iy; > - hx>>=n; > - INSERT_WORDS64(x,hx|sx); > - x *= one; /* create necessary signal */ > - } > - return x; /* exact output */ > + uint64_t hx = asuint64 (x); > + uint64_t hy = asuint64 (y); > + > + uint64_t sx = hx & SIGN_MASK; > + /* Get |x| and |y|. */ > + hx ^= sx; > + hy &= ~SIGN_MASK; > + > + /* Special cases: > + - If x or y is a Nan, NaN is returned. > + - If x is an inifinity, a NaN is returned. > + - If y is zero, Nan is returned. > + - If x is +0/-0, and y is not zero, +0/-0 is returned. 
*/ > + if (__glibc_unlikely (hy == 0 || hx >= EXPONENT_MASK || hy > EXPONENT_MASK)) > + return (x * y) / (x * y); > + > + if (__glibc_unlikely (hx <= hy)) > + { > + if (hx < hy) > + return x; > + return asdouble (sx); > + } > + > + int ex = hx >> MANTISSA_WIDTH; > + int ey = hy >> MANTISSA_WIDTH; > + > + /* Common case where exponents are close: ey >= -907 and |x/y| < 2^52, */ > + if (__glibc_likely (ey > MANTISSA_WIDTH && ex - ey <= EXPONENT_WIDTH)) > + { > + uint64_t mx = (hx & MANTISSA_MASK) | (MANTISSA_MASK + 1); > + uint64_t my = (hy & MANTISSA_MASK) | (MANTISSA_MASK + 1); > + > + uint64_t d = (ex == ey) ? (mx - my) : (mx << (ex - ey)) % my; > + return make_double (d, ey - 1, sx); > + } > + > + /* Special case, both x and y are subnormal. */ > + if (__glibc_unlikely (ex == 0 && ey == 0)) > + return asdouble (sx | hx % hy); > + > + /* Convert |x| and |y| to 'mx + 2^ex' and 'my + 2^ey'. Assume that hx is > + not subnormal by conditions above. */ > + uint64_t mx = get_mantissa (hx) | (MANTISSA_MASK + 1); > + ex--; > + uint64_t my = get_mantissa (hy) | (MANTISSA_MASK + 1); > + > + int lead_zeros_my = EXPONENT_WIDTH; > + if (__glibc_likely (ey > 0)) > + ey--; > + else > + { > + my = hy; > + lead_zeros_my = clz_uint64 (my); > + } > + > + /* Assume hy != 0 */ > + int tail_zeros_my = ctz_uint64 (my); > + int sides_zeroes = lead_zeros_my + tail_zeros_my; > + int exp_diff = ex - ey; > + > + int right_shift = exp_diff < tail_zeros_my ? exp_diff : tail_zeros_my; > + my >>= right_shift; > + exp_diff -= right_shift; > + ey += right_shift; > + > + int left_shift = exp_diff < EXPONENT_WIDTH ? exp_diff : EXPONENT_WIDTH; > + mx <<= left_shift; > + exp_diff -= left_shift; > + > + mx %= my; > + > + if (__glibc_unlikely (mx == 0)) > + return asdouble (sx); > + > + if (exp_diff == 0) > + return make_double (my, ey, sx); > + > + /* Assume modulo/divide operation is slow, so use multiplication with invert > + values. 
*/ > + uint64_t inv_hy = UINT64_MAX / my; > + while (exp_diff > sides_zeroes) { > + exp_diff -= sides_zeroes; > + uint64_t hd = (mx * inv_hy) >> (BIT_WIDTH - sides_zeroes); > + mx <<= sides_zeroes; > + mx -= hd * my; > + while (__glibc_unlikely (mx > my)) > + mx -= my; > + } > + uint64_t hd = (mx * inv_hy) >> (BIT_WIDTH - exp_diff); > + mx <<= exp_diff; > + mx -= hd * my; > + while (__glibc_unlikely (mx > my)) > + mx -= my; > + > + return make_double (mx, ey, sx); > } > libm_alias_finite (__ieee754_fmod, __fmod) > diff --git a/sysdeps/ieee754/dbl-64/math_config.h b/sysdeps/ieee754/dbl-64/math_config.h > index 3cbaeede64..d00f629531 100644 > --- a/sysdeps/ieee754/dbl-64/math_config.h > +++ b/sysdeps/ieee754/dbl-64/math_config.h > @@ -43,6 +43,24 @@ > # define TOINT_INTRINSICS 0 > #endif > > +static inline int > +clz_uint64 (uint64_t x) > +{ > + if (sizeof (uint64_t) == sizeof (unsigned long)) > + return __builtin_clzl (x); > + else > + return __builtin_clzll (x); > +} > + > +static inline int > +ctz_uint64 (uint64_t x) > +{ > + if (sizeof (uint64_t) == sizeof (unsigned long)) > + return __builtin_ctzl (x); > + else > + return __builtin_ctzll (x); > +} > + > #if TOINT_INTRINSICS > /* Round x to nearest int in all rounding modes, ties have to be rounded > consistently with converttoint so the results match. 
If the result > @@ -88,6 +106,49 @@ issignaling_inline (double x) > return 2 * (ix ^ 0x0008000000000000) > 2 * 0x7ff8000000000000ULL; > } > > +#define BIT_WIDTH 64 > +#define MANTISSA_WIDTH 52 > +#define EXPONENT_WIDTH 11 > +#define MANTISSA_MASK UINT64_C(0x000fffffffffffff) > +#define EXPONENT_MASK UINT64_C(0x7ff0000000000000) > +#define EXP_MANT_MASK UINT64_C(0x7fffffffffffffff) > +#define QUIET_NAN_MASK UINT64_C(0x0008000000000000) > +#define SIGN_MASK UINT64_C(0x8000000000000000) > + > +static inline bool > +is_nan (uint64_t x) > +{ > + return (x & EXP_MANT_MASK) > EXPONENT_MASK; > +} > + > +static inline uint64_t > +get_mantissa (uint64_t x) > +{ > + return x & MANTISSA_MASK; > +} > + > +/* Convert integer number X, unbiased exponent EP, and sign S to double: > + > + result = X * 2^(EP+1 - exponent_bias) > + > + NB: zero is not supported. */ > +static inline double > +make_double (uint64_t x, int64_t ep, uint64_t s) > +{ > + int lz = clz_uint64 (x) - EXPONENT_WIDTH; > + x <<= lz; > + ep -= lz; > + > + if (__glibc_unlikely (ep < 0)) > + { > + x >>= -ep; > + ep = 0; > + } > + > + return asdouble (s + x + (ep << MANTISSA_WIDTH)); > +} > + > + > #define NOINLINE __attribute__ ((noinline)) > > /* Error handling tail calls for special cases, with a sign argument. > -- > 2.34.1 >
On 15/03/23 21:58, H.J. Lu wrote: > On Wed, Mar 15, 2023 at 1:59 PM Adhemerval Zanella > <adhemerval.zanella@linaro.org> wrote: >> >> This uses a new algorithm similar to already proposed earlier [1]. >> With x = mx * 2^ex and y = my * 2^ey (mx, my, ex, ey being integers), >> the simplest implementation is: >> >> mx * 2^ex == 2 * mx * 2^(ex - 1) >> >> while (ex > ey) >> { >> mx *= 2; >> --ex; >> mx %= my; >> } >> >> With mx/my being mantissa of double floating pointer, on each step the >> argument reduction can be improved 11 (which is sizeo of uint64_t minus >> MANTISSA_WIDTH plus the signal bit): >> >> while (ex > ey) >> { >> mx << 11; >> ex -= 11; >> mx %= my; >> } */ >> >> The implementation uses builtin clz and ctz, along with shifts to >> convert hx/hy back to doubles. Different than the original patch, >> this path assume modulo/divide operation is slow, so use multiplication >> with invert values. >> >> I see the following performance improvements using fmod benchtests >> (result only show the 'mean' result): >> >> Architecture | Input | master | patch >> -----------------|-----------------|----------|-------- >> x86_64 (Ryzen 9) | subnormals | 19.1584 | 12.5049 >> x86_64 (Ryzen 9) | normal | 1016.51 | 296.939 >> x86_64 (Ryzen 9) | close-exponents | 18.4428 | 16.0244 > > I tried it with the test in > > https://sourceware.org/bugzilla/show_bug.cgi?id=30179 > > On Intel i7-10710U, I got > > time ./sse > 3.13user 0.00system 0:03.13elapsed 99%CPU (0avgtext+0avgdata 512maxresident)k > 0inputs+0outputs (0major+37minor)pagefaults 0swaps > time ./x87 > 0.24user 0.00system 0:00.24elapsed 100%CPU (0avgtext+0avgdata 512maxresident)k > 0inputs+0outputs (0major+37minor)pagefaults 0swaps > time ./generic > 0.55user 0.00system 0:00.55elapsed 99%CPU (0avgtext+0avgdata 512maxresident)k > 0inputs+0outputs (0major+37minor)pagefaults 0swaps > > The new generic is still slower than x87. I think it really depends of the underlying hardware and on the input range. 
Using the benchmark from the patch set and patch 66182 [1], I see: CPU | Input | patch | 66182 -----------------|-----------------|----------|-------- Ryzen 9 | subnormals | 12.5049 | 31.2822 Ryzen 9 | normal | 296.939 | 592.489 Ryzen 9 | close-exponents | 16.0244 | 33.5172 E5-2640 | subnormals | 34.5454 | 652.59 E5-2640 | normal | 473.602 | 438.836 E5-2640 | close-exponents | 39.298 | 22.2742 i7-4510U | subnormals | 25.2624 | 666.964 i7-4510U | normal | 386.489 | 454.222 i7-4510U | close-exponents | 29.463 | 22.8572 So it seems that fprem performance is not really consistent over x86 CPUs, and even for recent AMD is far from great. So I still think the generic is better for x86, and I think fprem should be used along with ifunc to select it on CPUs that really yield better numbers (and take into consideration that subnormal numbers seem to be pretty bad). You might get better x86 performance by removing the SVID wrapper as I did on the last patch; but it will increase 66182 complexity (you will need to check for NaN/INF/0.0 and set errno). And I hardly think it will close the gap on the AMD chip I use. I am also checking an algorithm change to use a simple loop for the normal inputs, where an integer modulo operation is used instead of inverse multiplication. But as far as I am testing, performance is really bad on all x86 Intel chips I test (it is not as bad on AMD). [1] https://patchwork.sourceware.org/project/glibc/patch/20230309183312.205763-1-hjl.tools@gmail.com/
Hi, It's these cases where x87 is still faster than the generic version: > E5-2640 | close-exponents | 39.298 | 22.2742 > > i7-4510U | close-exponents | 29.463 | 22.8572 Are these mostly x < y or cases where the exponent difference is just over 11 and thus we do not use the fast path? > I am also checking a algorithm change to use simple loop for the normal inputs, > where integer modulo operation is used instead of inverse multiplication. Adding another fast path for a wider range of exponent difference could be faster than the generic modulo loop. This could do 2 modulo steps and maybe handle tail zeroes (which I think is what HJ's testcase will benefit from). For really large exponent differences, the generic modulo code could process 30 or 60 bits per iteration instead of just 11. It's more complex (so would be a separate patch) but it should help CPUs with relatively high latency multipliers. Cheers, Wilco
On 16/03/23 13:13, Wilco Dijkstra wrote: > Hi, > > It's these cases where x87 is still faster than the generic version: > >> E5-2640 | close-exponents | 39.298 | 22.2742 >> >> i7-4510U | close-exponents | 29.463 | 22.8572 > > Are these mostly x < y or cases where the exponent difference is just over 11 and > thus we do not use the fast path? In fact the fast path will be used on ~83% of the cases (849 from 1024 entries). Profiling shows that the initial checks might be the culprit, since the generic compat wrapper uses compiler builtins that might map to fp instructions. But even trying to mimic it did not improve much. It seems that for some CPUs the integer operations to create the final floating-point number are what is costly. > >> I am also checking a algorithm change to use simple loop for the normal inputs, >> where integer modulo operation is used instead of inverse multiplication. > > Adding another fast path for a wider range of exponent difference could be faster > than the generic modulo loop. This could do 2 modulo steps and maybe handle > tail zeroes (which I think is what HJ's testcase will benefit from). > > For really large exponent differences, the generic modulo code could process 30 > or 60 bits per iteration instead of just 11. It's more complex (so would be a separate > patch) but it should help CPUs with relatively high latency multipliers. Yes, it might be an improvement indeed.
Hi Adhemerval, >> It's these cases where x87 is still faster than the generic version: >> >>> E5-2640 | close-exponents | 39.298 | 22.2742 >>> >>> i7-4510U | close-exponents | 29.463 | 22.8572 >> >> Are these mostly x < y or cases where the exponent difference is just over 11 and >> thus we do not use the fast path? > > In fact the fast path will be used on ~83% of the cases (849 from 1024 entries). > Profiling shows that the initial checks might be the culprit, since generic > compat wrapper uses compiler builtins that might map to fp instructions. But even > trying to mimic did not improve much. It seems that for some CPU the integer > operations to create the final floating number is what is costly. If it is mostly the fast path we could further tune it and reduce instruction counts. It takes 6 if statements to enter this fast path, we could reduce that to 3. There are several large constants which could be simplified (older x86 cores might have issues with multiple 10-byte MOVABS in the instruction stream). Also I think your results for generic above use the wrapper, so we'd still get the > 20% speedup which should make things closer. Cheers, Wilco
On Fri, Mar 17, 2023 at 7:55 AM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote: > > Hi Adhemerval, > > >> It's these cases where x87 is still faster than the generic version: > >> > >>> E5-2640 | close-exponents | 39.298 | 22.2742 > >>> > >>> i7-4510U | close-exponents | 29.463 | 22.8572 > >> > >> Are these mostly x < y or cases where the exponent difference is just over 11 and > >> thus we do not use the fast path? > > > > In fact the fast path will be used on ~83% of the cases (849 from 1024 entries). > > Profiling shows that the initial checks might be the culprit, since generic > > compat wrapper uses compiler builtins that might map to fp instructions. But even > > trying to mimic did not improve much. It seems that for some CPU the integer > > operations to create the final floating number is what is costly. > > If it is mostly the fast path we could further tune it and reduce instruction counts. > It takes 6 if statements to enter this fast path, we could reduce that to 3. There are > several large constants which could be simplified (older x86 cores might have > issues with multiple 10-byte MOVABS in the instruction stream). > > Also I think your results for generic above use the wrapper, so we'd still get the > > 20% speedup which should make things closer. > The current __ieee754_fmod doesn't set errno nor does x87 __ieee754_fmod. A wrapper will avoid setting errno.
Hi HJ, >> Also I think your results for generic above use the wrapper, so we'd still get the >> > 20% speedup which should make things closer. > > > The current __ieee754_fmod doesn't set errno nor does x87 __ieee754_fmod. > A wrapper will avoid setting errno. The new generic fmod removes the wrapper since it sets errno when needed - ie. the wrapper is unnecessary overhead. We must keep using the wrapper in implementations that don't set errno of course. Applications never call __ieee754_fmod, so the right comparison is between the generic fmod without the wrapper and the x87 fmod with the wrapper. Cheers, Wilco
diff --git a/math/libm-test-fmod.inc b/math/libm-test-fmod.inc index 39fd02f9ef..76c1e0cd45 100644 --- a/math/libm-test-fmod.inc +++ b/math/libm-test-fmod.inc @@ -217,6 +217,12 @@ static const struct test_ff_f_data fmod_test_data[] = TEST_ff_f (fmod, -0x1p1023L, 0x3p-1022L, -0x1p-1021L, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED), TEST_ff_f (fmod, -0x1p1023L, -0x3p-1022L, -0x1p-1021L, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED), #endif +#if TEST_COND_binary64 + TEST_ff_f (fmod, 0x0.cded0a47373e9p-1022, 0x0.3212f5b8c8c16p-1022, 0x0.05a1336414391p-1022, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED), + TEST_ff_f (fmod, 0x0.cded0a47373e9p-1022, -0x0.3212f5b8c8c16p-1022, 0x0.05a1336414391p-1022, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED), + TEST_ff_f (fmod, -0x0.cded0a47373e9p-1022, 0x0.3212f5b8c8c16p-1022, -0x0.05a1336414391p-1022, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED), + TEST_ff_f (fmod, -0x0.cded0a47373e9p-1022, -0x0.3212f5b8c8c16p-1022, -0x0.05a1336414391p-1022, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED), +#endif #if MIN_EXP <= -16381 TEST_ff_f (fmod, 0x1p16383L, 0x3p-16445L, 0x1p-16445L, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED), TEST_ff_f (fmod, 0x1p16383L, -0x3p-16445L, 0x1p-16445L, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED), diff --git a/sysdeps/ieee754/dbl-64/e_fmod.c b/sysdeps/ieee754/dbl-64/e_fmod.c index 60b8bbb9d2..d21143e0cf 100644 --- a/sysdeps/ieee754/dbl-64/e_fmod.c +++ b/sysdeps/ieee754/dbl-64/e_fmod.c @@ -1,105 +1,145 @@ -/* - * ==================================================== - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. - * - * Developed at SunPro, a Sun Microsystems, Inc. business. - * Permission to use, copy, modify, and distribute this - * software is freely granted, provided that this notice - * is preserved. - * ==================================================== - */ - -/* - * __ieee754_fmod(x,y) - * Return x mod y in exact arithmetic - * Method: shift and subtract - */ +/* Floating-point remainder function. 
+ Copyright (C) 2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ -#include <math.h> -#include <math_private.h> -#include <stdint.h> #include <libm-alias-finite.h> +#include <math.h> +#include "math_config.h" + +/* With x = mx * 2^ex and y = my * 2^ey (mx, my, ex, ey being integers), the + simplest implementation is: + + mx * 2^ex == 2 * mx * 2^(ex - 1) -static const double one = 1.0, Zero[] = {0.0, -0.0,}; + while (ex > ey) + { + mx *= 2; + --ex; + mx %= my; + } + + With mx/my being mantissa of double floating pointer, on each step the + argument reduction can be improved 11 (which is sizeo of uint64_t minus + MANTISSA_WIDTH plus the signal bit): + + while (ex > ey) + { + mx << 11; + ex -= 11; + mx %= my; + } */ double __ieee754_fmod (double x, double y) { - int32_t n,ix,iy; - int64_t hx,hy,hz,sx,i; - - EXTRACT_WORDS64(hx,x); - EXTRACT_WORDS64(hy,y); - sx = hx&UINT64_C(0x8000000000000000); /* sign of x */ - hx ^=sx; /* |x| */ - hy &= UINT64_C(0x7fffffffffffffff); /* |y| */ - - /* purge off exception values */ - if(__builtin_expect(hy==0 - || hx >= UINT64_C(0x7ff0000000000000) - || hy > UINT64_C(0x7ff0000000000000), 0)) - /* y=0,or x not finite or y is NaN */ - return (x*y)/(x*y); - if(__builtin_expect(hx<=hy, 0)) { - if(hx<hy) return x; /* |x|<|y| return x */ - return 
Zero[(uint64_t)sx>>63]; /* |x|=|y| return x*0*/ - } - - /* determine ix = ilogb(x) */ - if(__builtin_expect(hx<UINT64_C(0x0010000000000000), 0)) { - /* subnormal x */ - for (ix = -1022,i=(hx<<11); i>0; i<<=1) ix -=1; - } else ix = (hx>>52)-1023; - - /* determine iy = ilogb(y) */ - if(__builtin_expect(hy<UINT64_C(0x0010000000000000), 0)) { /* subnormal y */ - for (iy = -1022,i=(hy<<11); i>0; i<<=1) iy -=1; - } else iy = (hy>>52)-1023; - - /* set up hx, hy and align y to x */ - if(__builtin_expect(ix >= -1022, 1)) - hx = UINT64_C(0x0010000000000000)|(UINT64_C(0x000fffffffffffff)&hx); - else { /* subnormal x, shift x to normal */ - n = -1022-ix; - hx<<=n; - } - if(__builtin_expect(iy >= -1022, 1)) - hy = UINT64_C(0x0010000000000000)|(UINT64_C(0x000fffffffffffff)&hy); - else { /* subnormal y, shift y to normal */ - n = -1022-iy; - hy<<=n; - } - - /* fix point fmod */ - n = ix - iy; - while(n--) { - hz=hx-hy; - if(hz<0){hx = hx+hx;} - else { - if(hz==0) /* return sign(x)*0 */ - return Zero[(uint64_t)sx>>63]; - hx = hz+hz; - } - } - hz=hx-hy; - if(hz>=0) {hx=hz;} - - /* convert back to floating value and restore the sign */ - if(hx==0) /* return sign(x)*0 */ - return Zero[(uint64_t)sx>>63]; - while(hx<UINT64_C(0x0010000000000000)) { /* normalize x */ - hx = hx+hx; - iy -= 1; - } - if(__builtin_expect(iy>= -1022, 1)) { /* normalize output */ - hx = ((hx-UINT64_C(0x0010000000000000))|((uint64_t)(iy+1023)<<52)); - INSERT_WORDS64(x,hx|sx); - } else { /* subnormal output */ - n = -1022 - iy; - hx>>=n; - INSERT_WORDS64(x,hx|sx); - x *= one; /* create necessary signal */ - } - return x; /* exact output */ + uint64_t hx = asuint64 (x); + uint64_t hy = asuint64 (y); + + uint64_t sx = hx & SIGN_MASK; + /* Get |x| and |y|. */ + hx ^= sx; + hy &= ~SIGN_MASK; + + /* Special cases: + - If x or y is a Nan, NaN is returned. + - If x is an inifinity, a NaN is returned. + - If y is zero, Nan is returned. + - If x is +0/-0, and y is not zero, +0/-0 is returned. 
*/ + if (__glibc_unlikely (hy == 0 || hx >= EXPONENT_MASK || hy > EXPONENT_MASK)) + return (x * y) / (x * y); + + if (__glibc_unlikely (hx <= hy)) + { + if (hx < hy) + return x; + return asdouble (sx); + } + + int ex = hx >> MANTISSA_WIDTH; + int ey = hy >> MANTISSA_WIDTH; + + /* Common case where exponents are close: ey >= -907 and |x/y| < 2^52, */ + if (__glibc_likely (ey > MANTISSA_WIDTH && ex - ey <= EXPONENT_WIDTH)) + { + uint64_t mx = (hx & MANTISSA_MASK) | (MANTISSA_MASK + 1); + uint64_t my = (hy & MANTISSA_MASK) | (MANTISSA_MASK + 1); + + uint64_t d = (ex == ey) ? (mx - my) : (mx << (ex - ey)) % my; + return make_double (d, ey - 1, sx); + } + + /* Special case, both x and y are subnormal. */ + if (__glibc_unlikely (ex == 0 && ey == 0)) + return asdouble (sx | hx % hy); + + /* Convert |x| and |y| to 'mx + 2^ex' and 'my + 2^ey'. Assume that hx is + not subnormal by conditions above. */ + uint64_t mx = get_mantissa (hx) | (MANTISSA_MASK + 1); + ex--; + uint64_t my = get_mantissa (hy) | (MANTISSA_MASK + 1); + + int lead_zeros_my = EXPONENT_WIDTH; + if (__glibc_likely (ey > 0)) + ey--; + else + { + my = hy; + lead_zeros_my = clz_uint64 (my); + } + + /* Assume hy != 0 */ + int tail_zeros_my = ctz_uint64 (my); + int sides_zeroes = lead_zeros_my + tail_zeros_my; + int exp_diff = ex - ey; + + int right_shift = exp_diff < tail_zeros_my ? exp_diff : tail_zeros_my; + my >>= right_shift; + exp_diff -= right_shift; + ey += right_shift; + + int left_shift = exp_diff < EXPONENT_WIDTH ? exp_diff : EXPONENT_WIDTH; + mx <<= left_shift; + exp_diff -= left_shift; + + mx %= my; + + if (__glibc_unlikely (mx == 0)) + return asdouble (sx); + + if (exp_diff == 0) + return make_double (my, ey, sx); + + /* Assume modulo/divide operation is slow, so use multiplication with invert + values. 
*/ + uint64_t inv_hy = UINT64_MAX / my; + while (exp_diff > sides_zeroes) { + exp_diff -= sides_zeroes; + uint64_t hd = (mx * inv_hy) >> (BIT_WIDTH - sides_zeroes); + mx <<= sides_zeroes; + mx -= hd * my; + while (__glibc_unlikely (mx > my)) + mx -= my; + } + uint64_t hd = (mx * inv_hy) >> (BIT_WIDTH - exp_diff); + mx <<= exp_diff; + mx -= hd * my; + while (__glibc_unlikely (mx > my)) + mx -= my; + + return make_double (mx, ey, sx); } libm_alias_finite (__ieee754_fmod, __fmod) diff --git a/sysdeps/ieee754/dbl-64/math_config.h b/sysdeps/ieee754/dbl-64/math_config.h index 3cbaeede64..d00f629531 100644 --- a/sysdeps/ieee754/dbl-64/math_config.h +++ b/sysdeps/ieee754/dbl-64/math_config.h @@ -43,6 +43,24 @@ # define TOINT_INTRINSICS 0 #endif +static inline int +clz_uint64 (uint64_t x) +{ + if (sizeof (uint64_t) == sizeof (unsigned long)) + return __builtin_clzl (x); + else + return __builtin_clzll (x); +} + +static inline int +ctz_uint64 (uint64_t x) +{ + if (sizeof (uint64_t) == sizeof (unsigned long)) + return __builtin_ctzl (x); + else + return __builtin_ctzll (x); +} + #if TOINT_INTRINSICS /* Round x to nearest int in all rounding modes, ties have to be rounded consistently with converttoint so the results match. 
If the result @@ -88,6 +106,49 @@ issignaling_inline (double x) return 2 * (ix ^ 0x0008000000000000) > 2 * 0x7ff8000000000000ULL; } +#define BIT_WIDTH 64 +#define MANTISSA_WIDTH 52 +#define EXPONENT_WIDTH 11 +#define MANTISSA_MASK UINT64_C(0x000fffffffffffff) +#define EXPONENT_MASK UINT64_C(0x7ff0000000000000) +#define EXP_MANT_MASK UINT64_C(0x7fffffffffffffff) +#define QUIET_NAN_MASK UINT64_C(0x0008000000000000) +#define SIGN_MASK UINT64_C(0x8000000000000000) + +static inline bool +is_nan (uint64_t x) +{ + return (x & EXP_MANT_MASK) > EXPONENT_MASK; +} + +static inline uint64_t +get_mantissa (uint64_t x) +{ + return x & MANTISSA_MASK; +} + +/* Convert integer number X, unbiased exponent EP, and sign S to double: + + result = X * 2^(EP+1 - exponent_bias) + + NB: zero is not supported. */ +static inline double +make_double (uint64_t x, int64_t ep, uint64_t s) +{ + int lz = clz_uint64 (x) - EXPONENT_WIDTH; + x <<= lz; + ep -= lz; + + if (__glibc_unlikely (ep < 0)) + { + x >>= -ep; + ep = 0; + } + + return asdouble (s + x + (ep << MANTISSA_WIDTH)); +} + + #define NOINLINE __attribute__ ((noinline)) /* Error handling tail calls for special cases, with a sign argument.