Message ID | VE1PR08MB5599C92B47C5EE081422D0DE839A9@VE1PR08MB5599.eurprd08.prod.outlook.com |
---|---|
State | New |
Headers | show |
Series | RFC: Improve hypot performance | expand |
On Wed, Nov 17, 2021 at 03:58:53PM +0000, Wilco Dijkstra via Libc-alpha wrote: > Hi Adhemerval, > > Here is an early version of a much faster hypot implementation. It uses fma > to significantly outperform the powerpc version both in throughput and > latency. It has a worst-case ULP of ~0.949 and passes the testsuite. The > powerpc version has a worst-case ULP of ~1.21 and several test failures. > > It applies on top of your hypot patch series. I didn't optimize the non-fma > case since modern targets have fma. It'll be interesting to compare it on > Power. You'll need to correctly set FAST_FMINMAX to indicate support for > inlined fmin/fmax instructions (this will be added to math_private.h for > targets that have it), without it the code tries to use a conditional move > since using a branch here is really bad for performance. On Power10, this implementation is still has a large delta compared to the current implementation: current on Power10: "hypot": { "workload-random": { "duration": 5.27306e+08, "iterations": 4.8e+07, "reciprocal-throughput": 8.26834, "latency": 13.7027, "max-throughput": 1.20943e+08, "min-throughput": 7.29781e+07 } } with Adhemerval's patches and this patch: "hypot": { "workload-random": { "duration": 5.34629e+08, "iterations": 3.6e+07, "reciprocal-throughput": 12.5006, "latency": 17.201, "max-throughput": 7.99959e+07, "min-throughput": 5.81362e+07 } } PC > --- > > diff --git a/sysdeps/ieee754/dbl-64/e_hypot.c b/sysdeps/ieee754/dbl-64/e_hypot.c > index d20bc3e3657e350a1103a8f8477db35ee60399e0..3906711788cff5d66725e5879bb4d6c36dd24dc7 100644 > --- a/sysdeps/ieee754/dbl-64/e_hypot.c > +++ b/sysdeps/ieee754/dbl-64/e_hypot.c > @@ -37,73 +37,40 @@ > #include <math-svid-compat.h> > #include <errno.h> > > +#define FAST_FMINMAX 1 > +//#undef __FP_FAST_FMA > + > +#define SCALE 0x1p-600 > +#define LARGE_VAL 0x1p+511 > +#define TINY_VAL 0x1p-459 > +#define EPS 0x1p-54 > + > + > static inline double > handle_errno (double r) > { > + r = math_narrow_eval (r); > if (isinf (r)) > __set_errno (ERANGE); > return r; > } > > -/* sqrt (DBL_EPSILON / 2.0) */ > -#define SQRT_EPS_DIV_2 0x1.6a09e667f3bcdp-27 > -/* DBL_MIN / (sqrt (DBL_EPSILON / 2.0)) */ > -#define DBL_MIN_THRESHOLD 0x1.6a09e667f3bcdp-996 > -/* eps (double) * sqrt (DBL_MIN)) */ > -#define SCALE 0x1p-563 > -/* 1 / eps (sqrt (DBL_MIN) */ > -#define INV_SCALE 0x1p+563 > -/* sqrt (DBL_MAX) */ > -#define SQRT_DBL_MAX 0x1.6a09e667f3bccp+511 > -/* sqrt (DBL_MIN) */ > -#define SQRT_DBL_MIN 0x1p-511 > - > -double > -__hypot (double x, double y) > +static inline double > +kernel (double ax, double ay) > { > - if ((isinf (x) || isinf (y)) > - && !issignaling (x) && !issignaling (y)) > - return INFINITY; > - if (isnan (x) || isnan (y)) > - return x + y; > - > - double ax = fabs (x); > - double ay = fabs (y); > - if (ay > ax) > - { > - double tmp = ax; > - ax = ay; > - ay = tmp; > - } > - > - /* Widely varying operands. The DBL_MIN_THRESHOLD check is used to avoid > - a spurious underflow from the multiplication. */ > - if (ax >= DBL_MIN_THRESHOLD && ay <= ax * SQRT_EPS_DIV_2) > - return (ay == 0.0) > - ? ax > - : handle_errno (math_narrow_eval (ax + DBL_TRUE_MIN)); > + double t1, t2; > +#ifdef __FP_FAST_FMA > + t1 = ay + ay; > + t2 = ax - ay; > > - double scale = SCALE; > - if (ax > SQRT_DBL_MAX) > - { > - ax *= scale; > - ay *= scale; > - scale = INV_SCALE; > - } > - else if (ay < SQRT_DBL_MIN) > - { > - ax /= scale; > - ay /= scale; > - } > + if (t1 >= ax) > + return sqrt (fma (t1, ax, t2 * t2)); > else > - scale = 1.0; > - > + return sqrt (fma (ax, ax, ay * ay)); > +#else > double h = sqrt (ax * ax + ay * ay); > > - double t1, t2; > - if (h == 0.0) > - return h; > - else if (h <= 2.0 * ay) > + if (h <= 2.0 * ay) > { > double delta = h - ay; > t1 = ax * (2.0 * delta - ax); > @@ -112,14 +79,57 @@ __hypot (double x, double y) > else > { > double delta = h - ax; > - t1 = 2.0 * delta * (ax - 2 * ay); > + t1 = 2.0 * delta * (ax - 2.0 * ay); > t2 = (4.0 * delta - ay) * ay + delta * delta; > } > h -= (t1 + t2) / (2.0 * h); > - h = math_narrow_eval (h * scale); > - math_check_force_underflow_nonneg (h); > - return handle_errno (h); > + return h; > +#endif > +} > + > + > +double > +__hypot (double x, double y) > +{ > + if (!isfinite (x) || !isfinite (y)) > + { > + if ((isinf (x) || isinf (y)) > + && !issignaling_inline (x) && !issignaling_inline (y)) > + return INFINITY; > + return x + y; > + } > + > + x = fabs (x); > + y = fabs (y); > + > + double ax = FAST_FMINMAX ? fmax (x, y) : (x < y ? y : x); > + double ay = FAST_FMINMAX ? fmin (x, y) : (x < y ? x : y); > + > + if (__glibc_unlikely (ax > LARGE_VAL)) > + { > + if (__glibc_unlikely (ay <= ax * EPS)) > + return handle_errno (ax + ay); > + > + return handle_errno (kernel (ax * SCALE, ay * SCALE) / SCALE); > + } > + > + if (__glibc_unlikely (ay < TINY_VAL)) > + { > + if (__glibc_unlikely (ax >= ay / EPS)) > + return math_narrow_eval (ax + ay); > + > + ax = math_narrow_eval (kernel (ax / SCALE, ay / SCALE) * SCALE); > + math_check_force_underflow_nonneg (ax); > + return ax; > + } > + > + /* Common case: ax is not huge and ay is not tiny. */ > + if (__glibc_unlikely (ay <= ax * EPS)) > + return math_narrow_eval (ax + ay); > + > + return math_narrow_eval (kernel (ax, ay)); > } > + > strong_alias (__hypot, __ieee754_hypot) > libm_alias_finite (__ieee754_hypot, __hypot) > #if LIBM_SVID_COMPAT
Hi Paul, > On Power10, this implementation is still has a large delta compared to the > current implementation: It's obvious something went wrong here since it is slower than Adhemerval's version - did you look at the generated code? It should use fma, and no division or function calls (if you see calls to fmin/fmax you need to set FAST_FMINMAX to 0). Cheers, Wilco
On Thu, Nov 18, 2021 at 12:37:24PM +0000, Wilco Dijkstra wrote: > > On Power10, this implementation is still has a large delta compared to the > > current implementation: > > It's obvious something went wrong here since it is slower than Adhemerval's > version - did you look at the generated code? > > It should use fma, and no division or function calls (if you see calls to fmin/fmax > you need to set FAST_FMINMAX to 0). Ugh. That was it. I missed that subtlety. (Why would I not want "FAST" to be true? :-) Anyway, with FAST_FMINMAX set to 1, the results are much better, and compare favorably with the current implementation: current: "hypot": { "workload-random": { "duration": 5.26088e+08, "iterations": 4.8e+07, "reciprocal-throughput": 8.2599, "latency": 13.6604, "max-throughput": 1.21067e+08, "min-throughput": 7.32042e+07 } } with Adhemerval's patch and this patch with FAST_FMINMAX=1: "hypot": { "workload-random": { "duration": 5.30541e+08, "iterations": 5.2e+07, "reciprocal-throughput": 7.22461, "latency": 13.1808, "max-throughput": 1.38416e+08, "min-throughput": 7.58679e+07 } } PC
On Thu, Nov 18, 2021 at 09:43:12AM -0600, Paul A. Clarke via Libc-alpha wrote: > On Thu, Nov 18, 2021 at 12:37:24PM +0000, Wilco Dijkstra wrote: > > > On Power10, this implementation is still has a large delta compared to the > > > current implementation: > > > > It's obvious something went wrong here since it is slower than Adhemerval's > > version - did you look at the generated code? > > > > It should use fma, and no division or function calls (if you see calls to fmin/fmax > > you need to set FAST_FMINMAX to 0). > > Ugh. That was it. I missed that subtlety. (Why would I not want "FAST" to be > true? :-) > > Anyway, with FAST_FMINMAX set to 1, the results are much better, and compare That should say "with FAST_FMINMAX set to *0*". > favorably with the current implementation: > > current: > "hypot": { > "workload-random": { > "duration": 5.26088e+08, > "iterations": 4.8e+07, > "reciprocal-throughput": 8.2599, > "latency": 13.6604, > "max-throughput": 1.21067e+08, > "min-throughput": 7.32042e+07 > } > } > > with Adhemerval's patch and this patch with FAST_FMINMAX=1: > "hypot": { > "workload-random": { > "duration": 5.30541e+08, > "iterations": 5.2e+07, > "reciprocal-throughput": 7.22461, > "latency": 13.1808, > "max-throughput": 1.38416e+08, > "min-throughput": 7.58679e+07 > } > } > > PC
Hi Paul, Great, that's about 14% higher throughput and 3.5% lower latency! Looking in more detail, it appears AArch64 may be the only target which inlines fmin/fmax with -O2, so the default is now off unless math_private.h overrides it. Cheers, Wilco
diff --git a/sysdeps/ieee754/dbl-64/e_hypot.c b/sysdeps/ieee754/dbl-64/e_hypot.c index d20bc3e3657e350a1103a8f8477db35ee60399e0..3906711788cff5d66725e5879bb4d6c36dd24dc7 100644 --- a/sysdeps/ieee754/dbl-64/e_hypot.c +++ b/sysdeps/ieee754/dbl-64/e_hypot.c @@ -37,73 +37,40 @@ #include <math-svid-compat.h> #include <errno.h> +#define FAST_FMINMAX 1 +//#undef __FP_FAST_FMA + +#define SCALE 0x1p-600 +#define LARGE_VAL 0x1p+511 +#define TINY_VAL 0x1p-459 +#define EPS 0x1p-54 + + static inline double handle_errno (double r) { + r = math_narrow_eval (r); if (isinf (r)) __set_errno (ERANGE); return r; } -/* sqrt (DBL_EPSILON / 2.0) */ -#define SQRT_EPS_DIV_2 0x1.6a09e667f3bcdp-27 -/* DBL_MIN / (sqrt (DBL_EPSILON / 2.0)) */ -#define DBL_MIN_THRESHOLD 0x1.6a09e667f3bcdp-996 -/* eps (double) * sqrt (DBL_MIN)) */ -#define SCALE 0x1p-563 -/* 1 / eps (sqrt (DBL_MIN) */ -#define INV_SCALE 0x1p+563 -/* sqrt (DBL_MAX) */ -#define SQRT_DBL_MAX 0x1.6a09e667f3bccp+511 -/* sqrt (DBL_MIN) */ -#define SQRT_DBL_MIN 0x1p-511 - -double -__hypot (double x, double y) +static inline double +kernel (double ax, double ay) { - if ((isinf (x) || isinf (y)) - && !issignaling (x) && !issignaling (y)) - return INFINITY; - if (isnan (x) || isnan (y)) - return x + y; - - double ax = fabs (x); - double ay = fabs (y); - if (ay > ax) - { - double tmp = ax; - ax = ay; - ay = tmp; - } - - /* Widely varying operands. The DBL_MIN_THRESHOLD check is used to avoid - a spurious underflow from the multiplication. */ - if (ax >= DBL_MIN_THRESHOLD && ay <= ax * SQRT_EPS_DIV_2) - return (ay == 0.0) - ? ax - : handle_errno (math_narrow_eval (ax + DBL_TRUE_MIN)); + double t1, t2; +#ifdef __FP_FAST_FMA + t1 = ay + ay; + t2 = ax - ay; - double scale = SCALE; - if (ax > SQRT_DBL_MAX) - { - ax *= scale; - ay *= scale; - scale = INV_SCALE; - } - else if (ay < SQRT_DBL_MIN) - { - ax /= scale; - ay /= scale; - } + if (t1 >= ax) + return sqrt (fma (t1, ax, t2 * t2)); else - scale = 1.0; - + return sqrt (fma (ax, ax, ay * ay)); +#else double h = sqrt (ax * ax + ay * ay); - double t1, t2; - if (h == 0.0) - return h; - else if (h <= 2.0 * ay) + if (h <= 2.0 * ay) { double delta = h - ay; t1 = ax * (2.0 * delta - ax); @@ -112,14 +79,57 @@ __hypot (double x, double y) else { double delta = h - ax; - t1 = 2.0 * delta * (ax - 2 * ay); + t1 = 2.0 * delta * (ax - 2.0 * ay); t2 = (4.0 * delta - ay) * ay + delta * delta; } h -= (t1 + t2) / (2.0 * h); - h = math_narrow_eval (h * scale); - math_check_force_underflow_nonneg (h); - return handle_errno (h); + return h; +#endif +} + + +double +__hypot (double x, double y) +{ + if (!isfinite (x) || !isfinite (y)) + { + if ((isinf (x) || isinf (y)) + && !issignaling_inline (x) && !issignaling_inline (y)) + return INFINITY; + return x + y; + } + + x = fabs (x); + y = fabs (y); + + double ax = FAST_FMINMAX ? fmax (x, y) : (x < y ? y : x); + double ay = FAST_FMINMAX ? fmin (x, y) : (x < y ? x : y); + + if (__glibc_unlikely (ax > LARGE_VAL)) + { + if (__glibc_unlikely (ay <= ax * EPS)) + return handle_errno (ax + ay); + + return handle_errno (kernel (ax * SCALE, ay * SCALE) / SCALE); + } + + if (__glibc_unlikely (ay < TINY_VAL)) + { + if (__glibc_unlikely (ax >= ay / EPS)) + return math_narrow_eval (ax + ay); + + ax = math_narrow_eval (kernel (ax / SCALE, ay / SCALE) * SCALE); + math_check_force_underflow_nonneg (ax); + return ax; + } + + /* Common case: ax is not huge and ay is not tiny. */ + if (__glibc_unlikely (ay <= ax * EPS)) + return math_narrow_eval (ax + ay); + + return math_narrow_eval (kernel (ax, ay)); } + strong_alias (__hypot, __ieee754_hypot) libm_alias_finite (__ieee754_hypot, __hypot) #if LIBM_SVID_COMPAT