Message ID | PAWPR08MB8982428AD5DD18A711D4F31783989@PAWPR08MB8982.eurprd08.prod.outlook.com |
---|---|
State | New |
Headers | show |
Series | math: Improve fmod(f) performance | expand |
On 13/04/23 11:29, Wilco Dijkstra wrote: > Optimize the fast paths (x < y) and (x/y < 2^12). Delay handling of special cases to reduce > amount of code executed before the fast paths. Performance of the close-exponents benchmark > improves by 18-19% on Cortex-A72 and Neoverse V1. Passes regress. I am seeing different numbers on x86_64 (Ryzen 9), below are the best 'mean' from 3 bench-fmod runs: master patch subnormals 8.77307 8.83486 normal 284.637 293.503 close-exponents 11.9665 11.3463 So at least with current 'close-exponents' from bench-fmod, which was generated from exponents between -10 and 10, the gain is more modest (and normal inputs does show a small regression). This should be ok, but I also think we need to outline that A72 gains might not show on different hardware. So maybe also add another bench-fmod set for |x/y| < 2^12 to show the potential gains. > > --- > > diff --git a/sysdeps/ieee754/dbl-64/e_fmod.c b/sysdeps/ieee754/dbl-64/e_fmod.c > index caae4e47e2358daced51a22342ec6e34a04f6fce..7c0ef11cb6c53561e64fab4f2ece17692dff3e5f 100644 > --- a/sysdeps/ieee754/dbl-64/e_fmod.c > +++ b/sysdeps/ieee754/dbl-64/e_fmod.c > @@ -40,10 +40,10 @@ > > r == x % y == (x % (N * y)) % y > > - And with mx/my being mantissa of double floating point number (which uses > + And with mx/my being mantissa of a double floating point number (which uses > less bits than the storage type), on each step the argument reduction can > be improved by 11 (which is the size of uint64_t minus MANTISSA_WIDTH plus > - the signal bit): > + the implicit one bit): > > mx * 2^ex == 2^11 * mx * 2^(ex - 11) > > @@ -54,7 +54,12 @@ > mx << 11; > ex -= 11; > mx %= my; > - } */ > + } > + > + Special cases: > + - If x or y is a NaN, a NaN is returned. > + - If x is an infinity, or y is zero, a NaN is returned and EDOM is set. > + - If x is +0/-0, and y is not zero, +0/-0 is returned. */ > > double > __fmod (double x, double y) Ok. 
> @@ -67,62 +72,70 @@ __fmod (double x, double y) > hx ^= sx; > hy &= ~SIGN_MASK; > > - /* Special cases: > - - If x or y is a Nan, NaN is returned. > - - If x is an inifinity, a NaN is returned and EDOM is set. > - - If y is zero, Nan is returned. > - - If x is +0/-0, and y is not zero, +0/-0 is returned. */ > - if (__glibc_unlikely (hy == 0 > - || hx >= EXPONENT_MASK || hy > EXPONENT_MASK)) > - { > - if (is_nan (hx) || is_nan (hy)) > - return (x * y) / (x * y); > - return __math_edom ((x * y) / (x * y)); > - } > - > - if (__glibc_unlikely (hx <= hy)) > + /* If x < y, return x (unless y is a NaN). */ > + if (__glibc_likely (hx < hy)) > { > - if (hx < hy) > - return x; > - return asdouble (sx); > + /* If y is a NaN, return a NaN. */ > + if (__glibc_unlikely (hy > EXPONENT_MASK)) > + return x * y; > + return x; > } > > int ex = hx >> MANTISSA_WIDTH; > int ey = hy >> MANTISSA_WIDTH; > + int exp_diff = ex - ey; > + > + /* Common case where exponents are close: |x/y| < 2^12, x not inf/NaN > + and |x%y| not denormal. */ > + if (__glibc_likely (ey < (EXPONENT_MASK >> MANTISSA_WIDTH) - EXPONENT_WIDTH > + && ey > MANTISSA_WIDTH > + && exp_diff <= EXPONENT_WIDTH)) > + { > + uint64_t mx = (hx << EXPONENT_WIDTH) | SIGN_MASK; > + uint64_t my = (hy << EXPONENT_WIDTH) | SIGN_MASK; > + > + mx %= (my >> exp_diff); > + > + if (__glibc_unlikely (mx == 0)) > + return asdouble (sx); > + int shift = __builtin_clzll (mx); Maybe clz_uint64 here? > + ex -= shift + 1; > + mx <<= shift; > + mx = sx | (mx >> EXPONENT_WIDTH); > + return asdouble (mx + ((uint64_t)ex << MANTISSA_WIDTH)); > + } > > - /* Common case where exponents are close: ey >= -907 and |x/y| < 2^52, */ > - if (__glibc_likely (ey > MANTISSA_WIDTH && ex - ey <= EXPONENT_WIDTH)) > + if (__glibc_unlikely (hy == 0 || hx >= EXPONENT_MASK)) > { > - uint64_t mx = (hx & MANTISSA_MASK) | (MANTISSA_MASK + 1); > - uint64_t my = (hy & MANTISSA_MASK) | (MANTISSA_MASK + 1); > + /* If x is a NaN, return a NaN. 
*/ > + if (hx > EXPONENT_MASK) > + return x * y; > > - uint64_t d = (ex == ey) ? (mx - my) : (mx << (ex - ey)) % my; > - return make_double (d, ey - 1, sx); > + /* If x is an infinity or y is zero, return a NaN and set EDOM. */ > + return __math_edom ((x * y) / (x * y)); > } > > - /* Special case, both x and y are subnormal. */ > - if (__glibc_unlikely (ex == 0 && ey == 0)) > + /* Special case, both x and y are denormal. */ > + if (__glibc_unlikely (ex == 0)) > return asdouble (sx | hx % hy); > > - /* Convert |x| and |y| to 'mx + 2^ex' and 'my + 2^ey'. Assume that hx is > - not subnormal by conditions above. */ > + /* Extract normalized mantissas - hx is not denormal and hy != 0. */ > uint64_t mx = get_mantissa (hx) | (MANTISSA_MASK + 1); > - ex--; > uint64_t my = get_mantissa (hy) | (MANTISSA_MASK + 1); > - > int lead_zeros_my = EXPONENT_WIDTH; > - if (__glibc_likely (ey > 0)) > - ey--; > - else > + > + ey--; > + /* Special case for denormal y. */ > + if (__glibc_unlikely (ey < 0)) > { > my = hy; > + ey = 0; > + exp_diff--; > lead_zeros_my = clz_uint64 (my); > } > > - /* Assume hy != 0 */ > int tail_zeros_my = ctz_uint64 (my); > int sides_zeroes = lead_zeros_my + tail_zeros_my; > - int exp_diff = ex - ey; > > int right_shift = exp_diff < tail_zeros_my ? exp_diff : tail_zeros_my; > my >>= right_shift; > @@ -141,8 +154,7 @@ __fmod (double x, double y) > if (exp_diff == 0) > return make_double (mx, ey, sx); > > - /* Assume modulo/divide operation is slow, so use multiplication with invert > - values. */ > + /* Multiplication with the inverse is faster than repeated modulo. */ > uint64_t inv_hy = UINT64_MAX / my; > while (exp_diff > sides_zeroes) { > exp_diff -= sides_zeroes; Ok. 
> diff --git a/sysdeps/ieee754/flt-32/e_fmodf.c b/sysdeps/ieee754/flt-32/e_fmodf.c > index 763900efdaa8222431e189e50a7e62baf465a54d..14f3fcae256d03ceb4bf91898a7a003bbfcb854a 100644 > --- a/sysdeps/ieee754/flt-32/e_fmodf.c > +++ b/sysdeps/ieee754/flt-32/e_fmodf.c > @@ -40,10 +40,10 @@ > > r == x % y == (x % (N * y)) % y > > - And with mx/my being mantissa of double floating point number (which uses > + And with mx/my being mantissa of a single floating point number (which uses > less bits than the storage type), on each step the argument reduction can > be improved by 8 (which is the size of uint32_t minus MANTISSA_WIDTH plus > - the signal bit): > + the implicit one bit): > > mx * 2^ex == 2^8 * mx * 2^(ex - 8) > > @@ -54,7 +54,12 @@ > mx << 8; > ex -= 8; > mx %= my; > - } */ > + } > + > + Special cases: > + - If x or y is a NaN, a NaN is returned. > + - If x is an infinity, or y is zero, a NaN is returned and EDOM is set. > + - If x is +0/-0, and y is not zero, +0/-0 is returned. */ > > float > __fmodf (float x, float y) > @@ -67,61 +72,69 @@ __fmodf (float x, float y) > hx ^= sx; > hy &= ~SIGN_MASK; > > - /* Special cases: > - - If x or y is a Nan, NaN is returned. > - - If x is an inifinity, a NaN is returned. > - - If y is zero, Nan is returned. > - - If x is +0/-0, and y is not zero, +0/-0 is returned. */ > - if (__glibc_unlikely (hy == 0 > - || hx >= EXPONENT_MASK || hy > EXPONENT_MASK)) > - { > - if (is_nan (hx) || is_nan (hy)) > - return (x * y) / (x * y); > - return __math_edomf ((x * y) / (x * y)); > - } > - > - if (__glibc_unlikely (hx <= hy)) > + if (__glibc_likely (hx < hy)) > { > - if (hx < hy) > - return x; > - return asfloat (sx); > + /* If y is a NaN, return a NaN. */ > + if (__glibc_unlikely (hy > EXPONENT_MASK)) > + return x * y; > + return x; > } > > int ex = hx >> MANTISSA_WIDTH; > int ey = hy >> MANTISSA_WIDTH; > + int exp_diff = ex - ey; > + > + /* Common case where exponents are close: |x/y| < 2^9, x not inf/NaN > + and |x%y| not denormal. 
*/ > + if (__glibc_likely (ey < (EXPONENT_MASK >> MANTISSA_WIDTH) - EXPONENT_WIDTH > + && ey > MANTISSA_WIDTH > + && exp_diff <= EXPONENT_WIDTH)) > + { > + uint32_t mx = (hx << EXPONENT_WIDTH) | SIGN_MASK; > + uint32_t my = (hy << EXPONENT_WIDTH) | SIGN_MASK; > + > + mx %= (my >> exp_diff); > + > + if (__glibc_unlikely (mx == 0)) > + return asfloat (sx); > + int shift = __builtin_clz (mx); > + ex -= shift + 1; > + mx <<= shift; > + mx = sx | (mx >> EXPONENT_WIDTH); > + return asfloat (mx + ((uint32_t)ex << MANTISSA_WIDTH)); > + } > > - /* Common case where exponents are close: ey >= -103 and |x/y| < 2^8, */ > - if (__glibc_likely (ey > MANTISSA_WIDTH && ex - ey <= EXPONENT_WIDTH)) > + if (__glibc_unlikely (hy == 0 || hx >= EXPONENT_MASK)) > { > - uint64_t mx = (hx & MANTISSA_MASK) | (MANTISSA_MASK + 1); > - uint64_t my = (hy & MANTISSA_MASK) | (MANTISSA_MASK + 1); > + /* If x is a NaN, return a NaN. */ > + if (hx > EXPONENT_MASK) > + return x * y; > > - uint32_t d = (ex == ey) ? (mx - my) : (mx << (ex - ey)) % my; > - return make_float (d, ey - 1, sx); > + /* If x is an infinity or y is zero, return a NaN and set EDOM. */ > + return __math_edomf ((x * y) / (x * y)); > } > > - /* Special case, both x and y are subnormal. */ > - if (__glibc_unlikely (ex == 0 && ey == 0)) > + /* Special case, both x and y are denormal. */ > + if (__glibc_unlikely (ex == 0)) > return asfloat (sx | hx % hy); > > - /* Convert |x| and |y| to 'mx + 2^ex' and 'my + 2^ey'. Assume that hx is > - not subnormal by conditions above. */ > + /* Extract normalized mantissas - hx is not denormal and hy != 0. */ > uint32_t mx = get_mantissa (hx) | (MANTISSA_MASK + 1); > - ex--; > - > uint32_t my = get_mantissa (hy) | (MANTISSA_MASK + 1); > int lead_zeros_my = EXPONENT_WIDTH; > - if (__glibc_likely (ey > 0)) > - ey--; > - else > + > + ey--; > + /* Special case for denormal y. 
*/ > + if (__glibc_unlikely (ey < 0)) > { > my = hy; > + ey = 0; > + exp_diff--; > lead_zeros_my = __builtin_clz (my); > } > > int tail_zeros_my = __builtin_ctz (my); > int sides_zeroes = lead_zeros_my + tail_zeros_my; > - int exp_diff = ex - ey; > > int right_shift = exp_diff < tail_zeros_my ? exp_diff : tail_zeros_my; > my >>= right_shift; > @@ -140,8 +153,7 @@ __fmodf (float x, float y) > if (exp_diff == 0) > return make_float (mx, ey, sx); > > - /* Assume modulo/divide operation is slow, so use multiplication with invert > - values. */ > + /* Multiplication with the inverse is faster than repeated modulo. */ > uint32_t inv_hy = UINT32_MAX / my; > while (exp_diff > sides_zeroes) { > exp_diff -= sides_zeroes;
Hi Adhemerval, > So at least with current 'close-exponents' from bench-fmod, which was > generated from exponents between -10 and 10, the gain is more modest > (and normal inputs does show a small regression). This should be ok, > but I also think we need to outline that A72 gains might not show on > different hardware. On a SkyLake I'm seeing this for fmod: master patch subnormals 51.34 45.92 (+11.8%) normal 436.9 420.5 (+3.9%) close-exponents 56.44 53.11 (+6.3%) And on Zen2: master patch subnormals 10.83 10.39 (+4.2%) normal 336.1 335.8 (+0.01%) close-exponents 14.90 14.11 (+5.6%) So it shows good improvements across the board. It's odd your results on AMD are worse than my Zen 2 results - are there large variations between runs? I did quite a few runs to get a fast result and increased iterations of the math benchmarks 10x. I can't explain why the gains on AArch64 are so much larger - the reduced instruction counts and branches for the common cases seem to make a big difference. On x86 there are still many MOVABS instructions which are problematic for decode. > So maybe also add another bench-fmod set for |x/y| < 2^12 to show > the potential gains. I'm not sure how that would improve things - ideally we need more realistic inputs (ie. actual traces) but we could change the existing inputs into workloads to give it a more difficult problem. Changing close-exponents into a workload shows 11.0% lower latency and 11.9% better throughput on my SkyLake. On Zen 2 I see 1% lower latency and 7.4% better throughput. Neoverse V1 shows 25.1% lower latency and 23.9% better throughput. Cheers, Wilco
On 13/04/23 17:45, Wilco Dijkstra wrote: > Hi Adhemerval, > >> So at least with current 'close-exponents' from bench-fmod, which was >> generated from exponents between -10 and 10, the gain is more modest >> (and normal inputs does show a small regression). This should be ok, >> but I also think we need to outline that A72 gains might not show on >> different hardware. > > On a SkyLake I'm seeing this for fmod: > > master patch > subnormals 51.34 45.92 (+11.8%) > normal 436.9 420.5 (+3.9%) > close-exponents 56.44 53.11 (+6.3%) > > And on Zen2: > > master patch > subnormals 10.83 10.39 (+4.2%) > normal 336.1 335.8 (+0.01%) > close-exponents 14.90 14.11 (+5.6%) > > So it shows good improvements across the board. It's odd your results on AMD are > worse than my Zen 2 results - are there large variations between runs? I did quite a > few runs to get a fast result and increased iterations of the math benchmarks 10x. I don't see much variation, but I think these numbers on multiple chips are more than enough. Could you include them in the commit message? > > I can't explain why the gains on AArch64 are so much larger - the reduced instruction > counts and branches for the common cases seem to make a big difference. On x86 > there are still many MOVABS instructions which are problematic for decode. > >> So maybe also add another bench-fmod set for |x/y| < 2^12 to show >> the potential gains. > > I'm not sure how that would improve things - ideally we need more realistic > inputs (ie. actual traces) but we could change the existing inputs into workloads > to give it a more difficult problem. Changing close-exponents into a workload > shows 11.0% lower latency and 11.9% better throughput on my SkyLake. On Zen 2 > I see 1% lower latency and 7.4% better throughput. Neoverse V1 shows 25.1% > lower latency and 23.9% better throughput. Fair enough, I think the only small nit is the clz_uint64 usage then.
diff --git a/sysdeps/ieee754/dbl-64/e_fmod.c b/sysdeps/ieee754/dbl-64/e_fmod.c index caae4e47e2358daced51a22342ec6e34a04f6fce..7c0ef11cb6c53561e64fab4f2ece17692dff3e5f 100644 --- a/sysdeps/ieee754/dbl-64/e_fmod.c +++ b/sysdeps/ieee754/dbl-64/e_fmod.c @@ -40,10 +40,10 @@ r == x % y == (x % (N * y)) % y - And with mx/my being mantissa of double floating point number (which uses + And with mx/my being mantissa of a double floating point number (which uses less bits than the storage type), on each step the argument reduction can be improved by 11 (which is the size of uint64_t minus MANTISSA_WIDTH plus - the signal bit): + the implicit one bit): mx * 2^ex == 2^11 * mx * 2^(ex - 11) @@ -54,7 +54,12 @@ mx << 11; ex -= 11; mx %= my; - } */ + } + + Special cases: + - If x or y is a NaN, a NaN is returned. + - If x is an infinity, or y is zero, a NaN is returned and EDOM is set. + - If x is +0/-0, and y is not zero, +0/-0 is returned. */ double __fmod (double x, double y) @@ -67,62 +72,70 @@ __fmod (double x, double y) hx ^= sx; hy &= ~SIGN_MASK; - /* Special cases: - - If x or y is a Nan, NaN is returned. - - If x is an inifinity, a NaN is returned and EDOM is set. - - If y is zero, Nan is returned. - - If x is +0/-0, and y is not zero, +0/-0 is returned. */ - if (__glibc_unlikely (hy == 0 - || hx >= EXPONENT_MASK || hy > EXPONENT_MASK)) - { - if (is_nan (hx) || is_nan (hy)) - return (x * y) / (x * y); - return __math_edom ((x * y) / (x * y)); - } - - if (__glibc_unlikely (hx <= hy)) + /* If x < y, return x (unless y is a NaN). */ + if (__glibc_likely (hx < hy)) { - if (hx < hy) - return x; - return asdouble (sx); + /* If y is a NaN, return a NaN. */ + if (__glibc_unlikely (hy > EXPONENT_MASK)) + return x * y; + return x; } int ex = hx >> MANTISSA_WIDTH; int ey = hy >> MANTISSA_WIDTH; + int exp_diff = ex - ey; + + /* Common case where exponents are close: |x/y| < 2^12, x not inf/NaN + and |x%y| not denormal. 
*/ + if (__glibc_likely (ey < (EXPONENT_MASK >> MANTISSA_WIDTH) - EXPONENT_WIDTH + && ey > MANTISSA_WIDTH + && exp_diff <= EXPONENT_WIDTH)) + { + uint64_t mx = (hx << EXPONENT_WIDTH) | SIGN_MASK; + uint64_t my = (hy << EXPONENT_WIDTH) | SIGN_MASK; + + mx %= (my >> exp_diff); + + if (__glibc_unlikely (mx == 0)) + return asdouble (sx); + int shift = __builtin_clzll (mx); + ex -= shift + 1; + mx <<= shift; + mx = sx | (mx >> EXPONENT_WIDTH); + return asdouble (mx + ((uint64_t)ex << MANTISSA_WIDTH)); + } - /* Common case where exponents are close: ey >= -907 and |x/y| < 2^52, */ - if (__glibc_likely (ey > MANTISSA_WIDTH && ex - ey <= EXPONENT_WIDTH)) + if (__glibc_unlikely (hy == 0 || hx >= EXPONENT_MASK)) { - uint64_t mx = (hx & MANTISSA_MASK) | (MANTISSA_MASK + 1); - uint64_t my = (hy & MANTISSA_MASK) | (MANTISSA_MASK + 1); + /* If x is a NaN, return a NaN. */ + if (hx > EXPONENT_MASK) + return x * y; - uint64_t d = (ex == ey) ? (mx - my) : (mx << (ex - ey)) % my; - return make_double (d, ey - 1, sx); + /* If x is an infinity or y is zero, return a NaN and set EDOM. */ + return __math_edom ((x * y) / (x * y)); } - /* Special case, both x and y are subnormal. */ - if (__glibc_unlikely (ex == 0 && ey == 0)) + /* Special case, both x and y are denormal. */ + if (__glibc_unlikely (ex == 0)) return asdouble (sx | hx % hy); - /* Convert |x| and |y| to 'mx + 2^ex' and 'my + 2^ey'. Assume that hx is - not subnormal by conditions above. */ + /* Extract normalized mantissas - hx is not denormal and hy != 0. */ uint64_t mx = get_mantissa (hx) | (MANTISSA_MASK + 1); - ex--; uint64_t my = get_mantissa (hy) | (MANTISSA_MASK + 1); - int lead_zeros_my = EXPONENT_WIDTH; - if (__glibc_likely (ey > 0)) - ey--; - else + + ey--; + /* Special case for denormal y. 
*/ + if (__glibc_unlikely (ey < 0)) { my = hy; + ey = 0; + exp_diff--; lead_zeros_my = clz_uint64 (my); } - /* Assume hy != 0 */ int tail_zeros_my = ctz_uint64 (my); int sides_zeroes = lead_zeros_my + tail_zeros_my; - int exp_diff = ex - ey; int right_shift = exp_diff < tail_zeros_my ? exp_diff : tail_zeros_my; my >>= right_shift; @@ -141,8 +154,7 @@ __fmod (double x, double y) if (exp_diff == 0) return make_double (mx, ey, sx); - /* Assume modulo/divide operation is slow, so use multiplication with invert - values. */ + /* Multiplication with the inverse is faster than repeated modulo. */ uint64_t inv_hy = UINT64_MAX / my; while (exp_diff > sides_zeroes) { exp_diff -= sides_zeroes; diff --git a/sysdeps/ieee754/flt-32/e_fmodf.c b/sysdeps/ieee754/flt-32/e_fmodf.c index 763900efdaa8222431e189e50a7e62baf465a54d..14f3fcae256d03ceb4bf91898a7a003bbfcb854a 100644 --- a/sysdeps/ieee754/flt-32/e_fmodf.c +++ b/sysdeps/ieee754/flt-32/e_fmodf.c @@ -40,10 +40,10 @@ r == x % y == (x % (N * y)) % y - And with mx/my being mantissa of double floating point number (which uses + And with mx/my being mantissa of a single floating point number (which uses less bits than the storage type), on each step the argument reduction can be improved by 8 (which is the size of uint32_t minus MANTISSA_WIDTH plus - the signal bit): + the implicit one bit): mx * 2^ex == 2^8 * mx * 2^(ex - 8) @@ -54,7 +54,12 @@ mx << 8; ex -= 8; mx %= my; - } */ + } + + Special cases: + - If x or y is a NaN, a NaN is returned. + - If x is an infinity, or y is zero, a NaN is returned and EDOM is set. + - If x is +0/-0, and y is not zero, +0/-0 is returned. */ float __fmodf (float x, float y) @@ -67,61 +72,69 @@ __fmodf (float x, float y) hx ^= sx; hy &= ~SIGN_MASK; - /* Special cases: - - If x or y is a Nan, NaN is returned. - - If x is an inifinity, a NaN is returned. - - If y is zero, Nan is returned. - - If x is +0/-0, and y is not zero, +0/-0 is returned. 
*/ - if (__glibc_unlikely (hy == 0 - || hx >= EXPONENT_MASK || hy > EXPONENT_MASK)) - { - if (is_nan (hx) || is_nan (hy)) - return (x * y) / (x * y); - return __math_edomf ((x * y) / (x * y)); - } - - if (__glibc_unlikely (hx <= hy)) + if (__glibc_likely (hx < hy)) { - if (hx < hy) - return x; - return asfloat (sx); + /* If y is a NaN, return a NaN. */ + if (__glibc_unlikely (hy > EXPONENT_MASK)) + return x * y; + return x; } int ex = hx >> MANTISSA_WIDTH; int ey = hy >> MANTISSA_WIDTH; + int exp_diff = ex - ey; + + /* Common case where exponents are close: |x/y| < 2^9, x not inf/NaN + and |x%y| not denormal. */ + if (__glibc_likely (ey < (EXPONENT_MASK >> MANTISSA_WIDTH) - EXPONENT_WIDTH + && ey > MANTISSA_WIDTH + && exp_diff <= EXPONENT_WIDTH)) + { + uint32_t mx = (hx << EXPONENT_WIDTH) | SIGN_MASK; + uint32_t my = (hy << EXPONENT_WIDTH) | SIGN_MASK; + + mx %= (my >> exp_diff); + + if (__glibc_unlikely (mx == 0)) + return asfloat (sx); + int shift = __builtin_clz (mx); + ex -= shift + 1; + mx <<= shift; + mx = sx | (mx >> EXPONENT_WIDTH); + return asfloat (mx + ((uint32_t)ex << MANTISSA_WIDTH)); + } - /* Common case where exponents are close: ey >= -103 and |x/y| < 2^8, */ - if (__glibc_likely (ey > MANTISSA_WIDTH && ex - ey <= EXPONENT_WIDTH)) + if (__glibc_unlikely (hy == 0 || hx >= EXPONENT_MASK)) { - uint64_t mx = (hx & MANTISSA_MASK) | (MANTISSA_MASK + 1); - uint64_t my = (hy & MANTISSA_MASK) | (MANTISSA_MASK + 1); + /* If x is a NaN, return a NaN. */ + if (hx > EXPONENT_MASK) + return x * y; - uint32_t d = (ex == ey) ? (mx - my) : (mx << (ex - ey)) % my; - return make_float (d, ey - 1, sx); + /* If x is an infinity or y is zero, return a NaN and set EDOM. */ + return __math_edomf ((x * y) / (x * y)); } - /* Special case, both x and y are subnormal. */ - if (__glibc_unlikely (ex == 0 && ey == 0)) + /* Special case, both x and y are denormal. 
*/ + if (__glibc_unlikely (ex == 0)) return asfloat (sx | hx % hy); - /* Convert |x| and |y| to 'mx + 2^ex' and 'my + 2^ey'. Assume that hx is - not subnormal by conditions above. */ + /* Extract normalized mantissas - hx is not denormal and hy != 0. */ uint32_t mx = get_mantissa (hx) | (MANTISSA_MASK + 1); - ex--; - uint32_t my = get_mantissa (hy) | (MANTISSA_MASK + 1); int lead_zeros_my = EXPONENT_WIDTH; - if (__glibc_likely (ey > 0)) - ey--; - else + + ey--; + /* Special case for denormal y. */ + if (__glibc_unlikely (ey < 0)) { my = hy; + ey = 0; + exp_diff--; lead_zeros_my = __builtin_clz (my); } int tail_zeros_my = __builtin_ctz (my); int sides_zeroes = lead_zeros_my + tail_zeros_my; - int exp_diff = ex - ey; int right_shift = exp_diff < tail_zeros_my ? exp_diff : tail_zeros_my; my >>= right_shift; @@ -140,8 +153,7 @@ __fmodf (float x, float y) if (exp_diff == 0) return make_float (mx, ey, sx); - /* Assume modulo/divide operation is slow, so use multiplication with invert - values. */ + /* Multiplication with the inverse is faster than repeated modulo. */ uint32_t inv_hy = UINT32_MAX / my; while (exp_diff > sides_zeroes) { exp_diff -= sides_zeroes;