Message ID | 20210319150626.1206905-3-Paul.Zimmermann@inria.fr |
---|---|
State | New |
Headers | show |
Series | [1/9] Auxiliary function for reduction modulo 2*pi. | expand |
On 19/03/2021 12:06, Paul Zimmermann wrote: > For both j1f and y1f, the largest error for all binary32 > inputs is reduced to at most 9 ulps for all rounding modes. > > The new code is enabled only when there is a cancellation at the very end of > the j1f/y1f computation, or for very large inputs, thus should not give any > visible slowdown on average. Two different algorithms are used: > > * around the first 64 zeros of j1/y1, approximation polynomials of degree 3 > are used, computed using the Sollya tool (https://www.sollya.org/) > > * for large inputs, an asymptotic formula from [1] is used > > [1] Fast and Accurate Bessel Function Computation, > John Harrison, Proceedings of Arith 19, 2009. > > Tested on x86_64 with --disable-multi-arch and powerpc64le-linux-gnu. LGTM, with just nits below (there is no need to resend it). This patch requires some ULPs adjustments, but it is being addresses in the subsequent patch in the set. Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org> > --- > sysdeps/ieee754/flt-32/e_j1f.c | 513 +++++++++++++++++++++++++++++++-- > 1 file changed, 483 insertions(+), 30 deletions(-) > > diff --git a/sysdeps/ieee754/flt-32/e_j1f.c b/sysdeps/ieee754/flt-32/e_j1f.c > index ac5bb76828..5cedfea886 100644 > --- a/sysdeps/ieee754/flt-32/e_j1f.c > +++ b/sysdeps/ieee754/flt-32/e_j1f.c > @@ -21,6 +21,8 @@ > #include <fenv_private.h> > #include <math-underflow.h> > #include <libm-alias-finite.h> > +#include <s_sincosf.h> > +#include "reduce_aux.c" > > static float ponef(float), qonef(float); > > @@ -42,6 +44,223 @@ s05 = 1.2354227016e-11; /* 0x2d59567e */ > > static const float zero = 0.0; > > +/* This is the nearest approximation of the first positive zero of j1. */ > +#define FIRST_ZERO_J1 0x3.d4eabp+0f > + > +#define SMALL_SIZE 64 > + > +/* The following table contains successive zeros of j1 and degree-3 > + polynomial approximations of j1 around these zeros: Pj[0] for the first > + positive zero (3.831705), Pj[1] for the second one (7.015586), and so on. > + Each line contains: > + {x0, xmid, x1, p0, p1, p2, p3} > + where [x0,x1] is the interval around the zero, xmid is the binary32 number > + closest to the zero, and p0+p1*x+p2*x^2+p3*x^3 is the approximation > + polynomial. Each polynomial was generated using Sollya on the interval > + [x0,x1] around the corresponding zero where the error exceeds 9 ulps > + for the alternate code. Degree 3 is enough to get an error at most > + 9 ulps, except around the first zero. > +*/ > +static const float Pj[SMALL_SIZE][7] = { > + /* For index 0, we use a degree-4 polynomial generated by Sollya, with the > + coefficient of degree 4 hard-coded in j1f_near_root(). */ > + { 0x1.e09e5ep+1, 0x1.ea7558p+1, 0x1.ef7352p+1, -0x8.4f069p-28, > + -0x6.71b3d8p-4, 0xd.744a2p-8, 0xd.acd9p-8/*, -0x1.3e51aap-8*/ }, /* 0 */ > + { 0x1.bdb4c2p+2, 0x1.c0ff6p+2, 0x1.c27a8cp+2, 0xe.c2858p-28, > + 0x4.cd464p-4, -0x5.79b71p-8, -0xc.11124p-8 }, /* 1 */ > + { 0x1.43b214p+3, 0x1.458d0ep+3, 0x1.460ccep+3, -0x1.e7acecp-24, > + -0x3.feca9p-4, 0x3.2470f8p-8, 0xa.625b7p-8 }, /* 2 */ > + { 0x1.a9c98p+3, 0x1.aa5bbp+3, 0x1.aaa4d8p+3, 0x1.698158p-24, > + 0x3.7e666cp-4, -0x2.1900ap-8, -0x9.2755p-8 }, /* 3 */ > + { 0x1.073be4p+4, 0x1.0787b4p+4, 0x1.07aed8p+4, -0x1.f5f658p-24, > + -0x3.24b8ep-4, 0x1.86e35cp-8, 0x8.4e4bbp-8 }, /* 4 */ > + { 0x1.39ae2ap+4, 0x1.39da8ep+4, 0x1.39f3dap+4, -0x1.4e744p-24, > + 0x2.e18a24p-4, -0x1.2ccd16p-8, -0x7.a27ep-8 }, /* 5 */ > + { 0x1.6bfa46p+4, 0x1.6c294ep+4, 0x1.6c412p+4, 0xa.3fb7fp-28, > + -0x2.acc9c4p-4, 0xf.0b783p-12, 0x7.1c0d3p-8 }, /* 6 */ > + { 0x1.9e42bep+4, 0x1.9e757p+4, 0x1.9e876ep+4, -0x2.29f6f4p-24, > + 0x2.81f21p-4, -0xc.641bp-12, -0x6.a7ea58p-8 }, /* 7 */ > + { 0x1.d08a3ep+4, 0x1.d0bfdp+4, 0x1.d0cd3cp+4, -0x1.b5d196p-24, > + -0x2.5e40e4p-4, 0xa.7059fp-12, 0x6.4d6bfp-8 }, /* 8 */ > + { 0x1.017794p+5, 0x1.018476p+5, 0x1.018b8cp+5, -0x4.0e001p-24, > + 0x2.3febep-4, -0x8.f23aap-12, -0x6.0102cp-8 }, /* 9 */ > + { 0x1.1a9e78p+5, 0x1.1aa89p+5, 0x1.1aaf88p+5, 0x3.b26f2p-24, > + -0x2.25babp-4, 0x7.c6d948p-12, 0x5.a1d988p-8 }, /* 10 */ > + { 0x1.33bddep+5, 0x1.33cc52p+5, 0x1.33d2e4p+5, -0xf.c8cdap-28, > + 0x2.0ed05p-4, -0x6.d97dbp-12, -0x5.8da498p-8 }, /* 11 */ > + { 0x1.4ce7cp+5, 0x1.4cefdp+5, 0x1.4cf7d4p+5, -0x3.9940e4p-24, > + -0x1.fa8b4p-4, 0x6.16108p-12, 0x5.4355e8p-8 }, /* 12 */ > + { 0x1.6603e8p+5, 0x1.661316p+5, 0x1.66173ap+5, 0x9.da15dp-28, > + 0x1.e8727ep-4, -0x5.742468p-12, -0x5.117c28p-8 }, /* 13 */ > + { 0x1.7f2ebcp+5, 0x1.7f3632p+5, 0x1.7f3a7ep+5, -0x3.39b218p-24, > + -0x1.d8293ap-4, 0x4.ee3348p-12, 0x4.f9bep-8 }, /* 14 */ > + { 0x1.9850e6p+5, 0x1.985928p+5, 0x1.985d9ep+5, -0x3.7b5108p-24, > + 0x1.c96702p-4, -0x4.7b0d08p-12, -0x4.c784a8p-8 }, /* 15 */ > + { 0x1.b172e8p+5, 0x1.b17c04p+5, 0x1.b1805cp+5, -0x1.91e43ep-24, > + -0x1.bbf246p-4, 0x4.18ad78p-12, 0x4.9bfae8p-8 }, /* 16 */ > + { 0x1.ca955ap+5, 0x1.ca9ec6p+5, 0x1.caa2a4p+5, 0x1.28453cp-24, > + 0x1.af9cb4p-4, -0x3.c3a494p-12, -0x4.78b69p-8 }, /* 17 */ > + { 0x1.e3bc94p+5, 0x1.e3c174p+5, 0x1.e3c64p+5, -0x2.e7fef4p-24, > + -0x1.a4407ep-4, 0x3.79b228p-12, 0x4.874f7p-8 }, /* 18 */ > + { 0x1.fcdf16p+5, 0x1.fce40ep+5, 0x1.fce71p+5, -0x3.23b2fcp-24, > + 0x1.99be76p-4, -0x3.39ad7cp-12, -0x4.92a55p-8 }, /* 19 */ > + { 0x1.0afe34p+6, 0x1.0b034ep+6, 0x1.0b054ap+6, -0xd.19e93p-28, > + -0x1.8ffc9cp-4, 0x2.fee7f8p-12, 0x4.2d33b8p-8 }, /* 20 */ > + { 0x1.179344p+6, 0x1.17948ep+6, 0x1.1795bp+6, 0x1.c2ac48p-24, > + 0x1.86e51cp-4, -0x2.cc5abp-12, -0x4.866d08p-8 }, /* 21 */ > + { 0x1.24231ep+6, 0x1.2425c8p+6, 0x1.2426e2p+6, -0xd.31027p-28, > + -0x1.7e656ep-4, 0x2.9db23cp-12, 0x3.cc63c8p-8 }, /* 22 */ > + { 0x1.30b5a8p+6, 0x1.30b6fep+6, 0x1.30b84ep+6, 0x5.b5e53p-24, > + 0x1.766dc2p-4, -0x2.754cfcp-12, -0x3.c39bb4p-8 }, /* 23 */ > + { 0x1.3d46ccp+6, 0x1.3d482ep+6, 0x1.3d495ep+6, -0x1.340a8ap-24, > + -0x1.6ef07ep-4, 0x2.4ff9d4p-12, 0x3.9b36e4p-8 }, /* 24 */ > + { 0x1.49d688p+6, 0x1.49d95ap+6, 0x1.49dabep+6, -0x3.ba66p-24, > + 0x1.67e1dcp-4, -0x2.2f32b8p-12, -0x3.e2aaf4p-8 }, /* 25 */ > + { 0x1.566916p+6, 0x1.566a84p+6, 0x1.566bcp+6, 0x6.47ca5p-28, > + -0x1.61379ap-4, 0x2.1096acp-12, 0x4.2d0968p-8 }, /* 26 */ > + { 0x1.62f8dap+6, 0x1.62fbaap+6, 0x1.62fc9cp+6, -0x2.12affp-24, > + 0x1.5ae8c4p-4, -0x1.f32444p-12, -0x3.9e592p-8 }, /* 27 */ > + { 0x1.6f89e6p+6, 0x1.6f8ccep+6, 0x1.6f8e34p+6, -0x7.4853ap-28, > + -0x1.54ed76p-4, 0x1.db004ap-12, 0x3.907034p-8 }, /* 28 */ > + { 0x1.7c1c6ap+6, 0x1.7c1deep+6, 0x1.7c1f4cp+6, -0x4.f0a998p-24, > + 0x1.4f3ebcp-4, -0x1.c26808p-12, -0x2.da8df8p-8 }, /* 29 */ > + { 0x1.88adaep+6, 0x1.88af0ep+6, 0x1.88afc4p+6, -0x1.80c246p-24, > + -0x1.49d668p-4, 0x1.aebc26p-12, 0x3.af7b5cp-8 }, /* 30 */ > + { 0x1.953d7p+6, 0x1.95402ap+6, 0x1.9540ep+6, -0x2.22aff8p-24, > + 0x1.44aefap-4, -0x1.99f25p-12, -0x3.5e9198p-8 }, /* 31 */ > + { 0x1.a1d01ep+6, 0x1.a1d146p+6, 0x1.a1d20ap+6, -0x3.aad6d4p-24, > + -0x1.3fc386p-4, 0x1.892858p-12, 0x3.fe0184p-8 }, /* 32 */ > + { 0x1.ae60ecp+6, 0x1.ae625ep+6, 0x1.ae6326p+6, -0x2.010be4p-24, > + 0x1.3b0fa4p-4, -0x1.7539ap-12, -0x2.b2c9bp-8 }, /* 33 */ > + { 0x1.baf234p+6, 0x1.baf376p+6, 0x1.baf442p+6, -0xd.4fd17p-32, > + -0x1.368f5cp-4, 0x1.6734e4p-12, 0x3.59f514p-8 }, /* 34 */ > + { 0x1.c782e6p+6, 0x1.c7848cp+6, 0x1.c78516p+6, -0xa.d662dp-28, > + 0x1.323f18p-4, -0x1.571c02p-12, -0x3.2be5bp-8 }, /* 35 */ > + { 0x1.d4144ep+6, 0x1.d415ap+6, 0x1.d41622p+6, 0x4.9f217p-24, > + -0x1.2e1b9ap-4, 0x1.4a2edap-12, 0x3.a4e96p-8 }, /* 36 */ > + { 0x1.e0a5ep+6, 0x1.e0a6b4p+6, 0x1.e0a788p+6, -0x2.d167p-24, > + 0x1.2a21eep-4, -0x1.3c4b46p-12, -0x4.9e0978p-8 }, /* 37 */ > + { 0x1.ed36eep+6, 0x1.ed37c8p+6, 0x1.ed3892p+6, -0x4.15a83p-24, > + -0x1.264f66p-4, 0x1.31dea4p-12, 0x3.d125ecp-8 }, /* 38 */ > + { 0x1.f9c77p+6, 0x1.f9c8d8p+6, 0x1.f9c9acp+6, -0x2.a5bbbp-24, > + 0x1.22a192p-4, -0x1.25e59ep-12, -0x2.ef6934p-8 }, /* 39 */ > + { 0x1.032c54p+7, 0x1.032cf4p+7, 0x1.032d66p+7, 0x4.e828bp-24, > + -0x1.1f1634p-4, 0x1.1c2394p-12, 0x3.6d744cp-8 }, /* 40 */ > + { 0x1.09750cp+7, 0x1.09757cp+7, 0x1.0975b6p+7, -0x3.28a3bcp-24, > + 0x1.1bab3ep-4, -0x1.1569cep-12, -0x5.84da7p-8 }, /* 41 */ > + { 0x1.0fbd9ap+7, 0x1.0fbe04p+7, 0x1.0fbe5ep+7, -0x2.2f667p-24, > + -0x1.185eccp-4, 0x1.07f42cp-12, 0x2.87896cp-8 }, /* 42 */ > + { 0x1.160628p+7, 0x1.16068ap+7, 0x1.1606cep+7, -0x6.9097dp-24, > + 0x1.152f28p-4, -0x1.0227fep-12, -0x5.da6e6p-8 }, /* 43 */ > + { 0x1.1c4e9ap+7, 0x1.1c4f12p+7, 0x1.1c4f7cp+7, -0x5.1b408p-24, > + -0x1.121abp-4, 0xf.6efcp-16, 0x2.c5e954p-8 }, /* 44 */ > + { 0x1.2296aap+7, 0x1.229798p+7, 0x1.2297d4p+7, 0x2.70d0dp-24, > + 0x1.0f1ffp-4, -0xf.523f5p-16, -0x3.5c0568p-8 }, /* 45 */ > + { 0x1.28dfa4p+7, 0x1.28e01ep+7, 0x1.28e054p+7, -0x2.7c176p-24, > + -0x1.0c3d8ap-4, 0xe.8329ap-16, 0x3.5eb34p-8 }, /* 46 */ > + { 0x1.2f282ap+7, 0x1.2f28a4p+7, 0x1.2f28dep+7, 0x4.fd6368p-24, > + 0x1.097236p-4, -0xe.17299p-16, -0x3.73a2e4p-8 }, /* 47 */ > + { 0x1.3570bp+7, 0x1.357128p+7, 0x1.35716p+7, 0x6.b05f68p-24, > + -0x1.06bccap-4, 0xd.527b8p-16, 0x2.b8bf9cp-8 }, /* 48 */ > + { 0x1.3bb932p+7, 0x1.3bb9aep+7, 0x1.3bb9eap+7, 0x4.0d622p-28, > + 0x1.041c28p-4, -0xd.0ac11p-16, -0x1.65f2ccp-8 }, /* 49 */ > + { 0x1.4201b6p+7, 0x1.420232p+7, 0x1.42027p+7, 0x7.0d98cp-24, > + -0x1.018f52p-4, 0xc.c4d8ep-16, 0x2.8f250cp-8 }, /* 50 */ > + { 0x1.484a78p+7, 0x1.484ab8p+7, 0x1.484af4p+7, 0x3.93d10cp-24, > + 0xf.f154fp-8, -0xc.7b7fep-16, -0x3.6b6e4cp-8 }, /* 51 */ > + { 0x1.4e92c8p+7, 0x1.4e933cp+7, 0x1.4e9368p+7, 0xd.88185p-32, > + -0xf.cad3fp-8, 0xc.1462p-16, 0x2.bd66p-8 }, /* 52 */ > + { 0x1.54db84p+7, 0x1.54dbcp+7, 0x1.54dbf4p+7, -0x1.fe6b92p-24, > + 0xf.a564cp-8, -0xb.c4e6cp-16, -0x3.d51decp-8 }, /* 53 */ > + { 0x1.5b23c4p+7, 0x1.5b2444p+7, 0x1.5b2486p+7, 0x2.6137f4p-24, > + -0xf.80faep-8, 0xb.5199ep-16, 0x1.9ca85ap-8 }, /* 54 */ > + { 0x1.616c62p+7, 0x1.616cc8p+7, 0x1.616d0ap+7, -0x1.55468p-24, > + 0xf.5d8acp-8, -0xb.21d16p-16, -0x1.b8809ap-8 }, /* 55 */ > + { 0x1.67b4fp+7, 0x1.67b54cp+7, 0x1.67b588p+7, -0x1.08c6bep-24, > + -0xf.3b096p-8, 0xa.e65efp-16, 0x3.642304p-8 }, /* 56 */ > + { 0x1.6dfd8ep+7, 0x1.6dfddp+7, 0x1.6dfe0ap+7, 0x4.9ebfa8p-24, > + 0xf.196c7p-8, -0xa.ba8c8p-16, -0x5.ad6a2p-8 }, /* 57 */ > + { 0x1.74461p+7, 0x1.744652p+7, 0x1.744692p+7, 0x5.a4017p-24, > + -0xe.f8aa5p-8, 0xa.49748p-16, 0x2.a86498p-8 }, /* 58 */ > + { 0x1.7a8e5ep+7, 0x1.7a8ed6p+7, 0x1.7a8ef8p+7, 0x3.bcb2a8p-28, > + 0xe.d8b9dp-8, -0x9.c43bep-16, -0x1.e7124ap-8 }, /* 59 */ > + { 0x1.80d6cep+7, 0x1.80d75ap+7, 0x1.80d78ap+7, -0x7.1091fp-24, > + -0xe.b9925p-8, 0x9.c43dap-16, 0x1.aba86p-8 }, /* 60 */ > + { 0x1.871f58p+7, 0x1.871fdcp+7, 0x1.87201ep+7, 0x2.ca1cf4p-28, > + 0xe.9b2bep-8, -0x9.843b3p-16, -0x2.093e68p-8 }, /* 61 */ > + { 0x1.8d67e8p+7, 0x1.8d685ep+7, 0x1.8d688ep+7, 0x5.aa8908p-24, > + -0xe.7d7ecp-8, 0x9.501a8p-16, 0x2.54a754p-8 }, /* 62 */ > + { 0x1.93b09cp+7, 0x1.93b0e2p+7, 0x1.93b10ep+7, 0x3.d9cd9cp-24, > + 0xe.6083ap-8, -0x9.45dadp-16, -0x5.112908p-8 }, /* 63 */ > +}; > + > +/* Formula page 5 of https://www.cl.cam.ac.uk/~jrh13/papers/bessel.pdf: > + j1f(x) ~ sqrt(2/(pi*x))*beta1(x)*cos(x-3pi/4-alpha1(x)) > + where beta1(x) = 1 + 3/(16*x^2) - 99/(512*x^4) > + and alpha1(x) = -3/(8*x) + 21/(128*x^3) - 1899/(5120*x^5). */ > +static float > +j1f_asympt (float x) > +{ > + float cst = 0xc.c422ap-4; /* sqrt(2/pi) rounded to nearest */ Double space before '*/;'. > + if (x < 0) > + { > + x = -x; > + cst = -cst; > + } > + double y = 1.0 / (double) x; > + double y2 = y * y; > + double beta1 = 1.0f + y2 * (0x3p-4 - 0x3.18p-4 * y2); > + double alpha1; > + alpha1 = y * (-0x6p-4 + y2 * (0x2.ap-4 - 0x5.ef33333333334p-4 * y2)); > + double h; > + int n; > + h = reduce_aux (x, &n, alpha1); > + n--; /* Subtract pi/2. */ > + /* Now x - 3pi/4 - alpha1 = h + n*pi/2 mod (2*pi). */ > + float xr = (float) h; > + n = n & 3; > + float t = cst / sqrtf (x) * (float) beta1; > + if (n == 0) > + return t * __cosf (xr); > + else if (n == 2) /* cos(x+pi) = -cos(x) */ > + return -t * __cosf (xr); > + else if (n == 1) /* cos(x+pi/2) = -sin(x) */ > + return -t * __sinf (xr); > + else /* cos(x+3pi/2) = sin(x) */ Ditto. > + return t * __sinf (xr); > +} > + > +/* Special code for x near a root of j1. > + z is the value computed by the generic code. > + For small x, we use a polynomial approximating j1 around its root. > + For large x, we use an asymptotic formula (j1f_asympt). */ > +static float > +j1f_near_root (float x, float z) > +{ > + float index_f, sign = 1.0f; > + int index; > + > + if (x < 0) > + { > + x = -x; > + sign = -1.0f; > + } > + index_f = roundf ((x - FIRST_ZERO_J1) / (float) M_PI); > + if (index_f >= SMALL_SIZE) > + return sign * j1f_asympt (x); > + index = (int) index_f; > + const float *p = Pj[index]; > + float x0 = p[0]; > + float x1 = p[2]; > + /* If not in the interval [x0,x1] around xmid, return the value z. */ > + if (! (x0 <= x && x <= x1)) > + return z; > + float xmid = p[1]; > + float y = x - xmid; > + float p6 = (index > 0) ? p[6] : p[6] + y * -0x1.3e51aap-8f; > + return sign * (p[3] + y * (p[4] + y * (p[5] + y * p6))); > +} > + > float > __ieee754_j1f(float x) > { > @@ -53,25 +272,37 @@ __ieee754_j1f(float x) > if(__builtin_expect(ix>=0x7f800000, 0)) return one/x; > y = fabsf(x); > if(ix >= 0x40000000) { /* |x| >= 2.0 */ > + SET_RESTORE_ROUNDF (FE_TONEAREST); > __sincosf (y, &s, &c); > ss = -s-c; > cc = s-c; > - if(ix<0x7f000000) { /* make sure y+y not overflow */ > - z = __cosf(y+y); > - if ((s*c)>zero) cc = z/ss; > - else ss = z/cc; > - } > + if (ix >= 0x7f000000) > + /* x >= 2^127: use asymptotic expansion. */ > + return j1f_asympt (x); > + /* Now we are sure that x+x cannot overflow. */ > + z = __cosf(y+y); > + if ((s*c)>zero) cc = z/ss; > + else ss = z/cc; > /* > * j1(x) = 1/sqrt(pi) * (P(1,x)*cc - Q(1,x)*ss) / sqrt(x) > * y1(x) = 1/sqrt(pi) * (P(1,x)*ss + Q(1,x)*cc) / sqrt(x) > */ > - if(ix>0x5c000000) z = (invsqrtpi*cc)/sqrtf(y); > - else { > + if (ix <= 0x5c000000) > + { > u = ponef(y); v = qonef(y); > - z = invsqrtpi*(u*cc-v*ss)/sqrtf(y); > - } > - if(hx<0) return -z; > - else return z; > + cc = u*cc-v*ss; > + } > + z = (invsqrtpi * cc) / sqrtf(y); > + /* Adjust sign of z. */ > + z = (hx < 0) ? -z : z; > + /* The following threshold is optimal: for x=0x1.e09e5ep+1 > + and rounding upwards, cc=0x1.b79638p-4 and z is 10 ulps > + far from the correctly rounded value. */ > + float threshold = 0x1.b79638p-4; > + if (fabsf (cc) > threshold) > + return z; > + else > + return j1f_near_root (x, z); > } > if(__builtin_expect(ix<0x32000000, 0)) { /* |x|<2**-27 */ > if(huge+x>one) { /* inexact if x!=0 necessary */ > @@ -105,6 +336,218 @@ static const float V0[5] = { > 1.6655924903e-11, /* 0x2d9281cf */ > }; > > +/* This is the nearest approximation of the first zero of y1. */ > +#define FIRST_ZERO_Y1 0x2.3277dcp+0f > + > +/* The following table contains successive zeros of y1 and degree-3 > + polynomial approximations of y1 around these zeros: Py[0] for the first > + positive zero (2.197141), Py[1] for the second one (5.429681), and so on. > + Each line contains: > + {x0, xmid, x1, p0, p1, p2, p3} > + where [x0,x1] is the interval around the zero, xmid is the binary32 number > + closest to the zero, and p0+p1*x+p2*x^2+p3*x^3 is the approximation > + polynomial. Each polynomial was generated using Sollya on the interval > + [x0,x1] around the corresponding zero where the error exceeds 9 ulps > + for the alternate code. Degree 3 is enough, except for the first roots. > +*/ > +static const float Py[SMALL_SIZE][7] = { > + /* For index 0, we use a degree-5 polynomial generated by Sollya, with the > + coefficients of degree 4 and 5 hard-coded in y1f_near_root(). */ > + { 0x1.f7f16ap+0, 0x1.193beep+1, 0x1.2105dcp+1, 0xb.96749p-28, > + 0x8.55241p-4, -0x1.e570bp-4, -0x8.68b61p-8 > + /*, -0x1.28043p-8, 0x2.50e83p-8*/ }, /* 0 */ > + /* For index 1, we use a degree-4 polynomial generated by Sollya, with the > + coefficient of degree 4 hard-coded in y1f_near_root(). */ > + { 0x1.55c6d2p+2, 0x1.5b7fe4p+2, 0x1.5cf8cap+2, 0x1.3c7822p-24, > + -0x5.71f158p-4, 0x8.05cb4p-8, 0xd.0b15p-8/*, -0xf.ff6b8p-12*/ }, /* 1 */ > + { 0x1.113c6p+3, 0x1.13127ap+3, 0x1.1387dcp+3, -0x1.f3ad8ep-24, > + 0x4.57e66p-4, -0x4.0afb58p-8, -0xb.29207p-8 }, /* 2 */ > + { 0x1.76e7dep+3, 0x1.77f914p+3, 0x1.786a6ap+3, -0xd.5608fp-28, > + -0x3.b829d4p-4, 0x2.8852cp-8, 0x9.b70e3p-8 }, /* 3 */ > + { 0x1.dc2794p+3, 0x1.dcb7d8p+3, 0x1.dd032p+3, -0xe.a7c04p-28, > + 0x3.4e0458p-4, -0x1.c64b18p-8, -0x8.b0e7fp-8 }, /* 4 */ > + { 0x1.20874p+4, 0x1.20b1c6p+4, 0x1.20c71p+4, 0x1.c2626p-24, > + -0x3.00f03cp-4, 0x1.54f806p-8, 0x7.f9cf9p-8 }, /* 5 */ > + { 0x1.52d848p+4, 0x1.530254p+4, 0x1.531962p+4, -0x1.9503ecp-24, > + 0x2.c5b29cp-4, -0x1.0bf28p-8, -0x7.562e58p-8 }, /* 6 */ > + { 0x1.851e64p+4, 0x1.854fa4p+4, 0x1.85679p+4, -0x2.8d40fcp-24, > + -0x2.96547p-4, 0xd.9c38bp-12, 0x6.dcbf8p-8 }, /* 7 */ > + { 0x1.b7808ep+4, 0x1.b79acep+4, 0x1.b7b2a8p+4, -0x2.36df5cp-24, > + 0x2.6f55ap-4, -0xb.57f9fp-12, -0x6.82569p-8 }, /* 8 */ > + { 0x1.e9c8fp+4, 0x1.e9e48p+4, 0x1.e9f24p+4, 0xd.e2eb7p-28, > + -0x2.4e8104p-4, 0x9.a4be2p-12, 0x6.2541fp-8 }, /* 9 */ > + { 0x1.0e0808p+5, 0x1.0e169p+5, 0x1.0e1d92p+5, -0x2.3070f4p-24, > + 0x2.325e4cp-4, -0x8.53604p-12, -0x5.ca03a8p-8 }, /* 10 */ > + { 0x1.272e08p+5, 0x1.273a7cp+5, 0x1.2741fcp+5, -0x3.525508p-24, > + -0x2.19e7dcp-4, 0x7.49d1dp-12, 0x5.9cb02p-8 }, /* 11 */ > + { 0x1.404ec6p+5, 0x1.405e18p+5, 0x1.4065cep+5, -0xe.6e158p-28, > + 0x2.046174p-4, -0x6.71b3dp-12, -0x5.4c3c8p-8 }, /* 12 */ > + { 0x1.5971dcp+5, 0x1.598178p+5, 0x1.598592p+5, 0x1.e72698p-24, > + -0x1.f13fb2p-4, 0x5.c0f938p-12, 0x5.28ca78p-8 }, /* 13 */ > + { 0x1.729c4ep+5, 0x1.72a4a8p+5, 0x1.72a8eap+5, -0x1.5bed9cp-24, > + 0x1.e018dcp-4, -0x5.2f11e8p-12, -0x5.16ce48p-8 }, /* 14 */ > + { 0x1.8bbf4ep+5, 0x1.8bc7b2p+5, 0x1.8bcc1p+5, -0x3.6b654cp-24, > + -0x1.d09b2p-4, 0x4.b1747p-12, 0x4.bd22fp-8 }, /* 15 */ > + { 0x1.a4e272p+5, 0x1.a4ea9ap+5, 0x1.a4eef4p+5, 0x1.6f11bp-24, > + 0x1.c28612p-4, -0x4.47462p-12, -0x4.947c5p-8 }, /* 16 */ > + { 0x1.be08bep+5, 0x1.be0d68p+5, 0x1.be1088p+5, -0x2.0bc074p-24, > + -0x1.b5a622p-4, 0x3.ed52d4p-12, 0x4.b76fc8p-8 }, /* 17 */ > + { 0x1.d7272ap+5, 0x1.d7301ep+5, 0x1.d734aep+5, -0x2.87dd4p-24, > + 0x1.a9d184p-4, -0x3.9cf494p-12, -0x4.6303ep-8 }, /* 18 */ > + { 0x1.f0499ap+5, 0x1.f052c4p+5, 0x1.f05758p+5, -0x2.fb964p-24, > + -0x1.9ee5eep-4, 0x3.5800dp-12, 0x4.4e9f9p-8 }, /* 19 */ > + { 0x1.04b63ap+6, 0x1.04baacp+6, 0x1.04bc92p+6, 0x2.cf5adp-24, > + 0x1.94c6f4p-4, -0x3.1a83e4p-12, -0x4.2311fp-8 }, /* 20 */ > + { 0x1.1146dp+6, 0x1.114beep+6, 0x1.114e12p+6, 0x3.6766fp-24, > + -0x1.8b5cccp-4, 0x2.e4a4e4p-12, 0x4.20bf9p-8 }, /* 21 */ > + { 0x1.1dda8cp+6, 0x1.1ddd2cp+6, 0x1.1dde7ap+6, 0x3.501424p-24, > + 0x1.829356p-4, -0x2.b47524p-12, -0x4.04bf18p-8 }, /* 22 */ > + { 0x1.2a6bcp+6, 0x1.2a6e64p+6, 0x1.2a6faap+6, -0x5.c05808p-24, > + -0x1.7a597ep-4, 0x2.8a0498p-12, 0x4.187258p-8 }, /* 23 */ > + { 0x1.36fcd6p+6, 0x1.36ff96p+6, 0x1.3700f6p+6, 0x7.1e1478p-28, > + 0x1.72a09ap-4, -0x2.61a7fp-12, -0x3.c0b54p-8 }, /* 24 */ > + { 0x1.438f46p+6, 0x1.4390c4p+6, 0x1.4392p+6, 0x3.e36e6cp-24, > + -0x1.6b5c06p-4, 0x2.3f612p-12, 0x4.18f868p-8 }, /* 25 */ > + { 0x1.501f4cp+6, 0x1.5021fp+6, 0x1.50235p+6, 0x1.3f9e5ap-24, > + 0x1.6480c4p-4, -0x2.1f28fcp-12, -0x3.bb4e3cp-8 }, /* 26 */ > + { 0x1.5cb07cp+6, 0x1.5cb318p+6, 0x1.5cb464p+6, -0x2.39e41cp-24, > + -0x1.5e0544p-4, 0x2.0189f4p-12, 0x3.8b55acp-8 }, /* 27 */ > + { 0x1.694166p+6, 0x1.69443cp+6, 0x1.694594p+6, -0x2.912f84p-24, > + 0x1.57e12p-4, -0x1.e6fabep-12, -0x3.850174p-8 }, /* 28 */ > + { 0x1.75d27cp+6, 0x1.75d55ep+6, 0x1.75d67ep+6, 0x3.d5b00cp-24, > + -0x1.520ceep-4, 0x1.d0286ep-12, 0x3.8e7d1p-8 }, /* 29 */ > + { 0x1.82653ep+6, 0x1.82667ep+6, 0x1.82674p+6, -0x3.1726ecp-24, > + 0x1.4c8222p-4, -0x1.b98206p-12, -0x3.f34978p-8 }, /* 30 */ > + { 0x1.8ef4b4p+6, 0x1.8ef79cp+6, 0x1.8ef888p+6, 0x1.949e22p-24, > + -0x1.473ae6p-4, 0x1.a47388p-12, 0x3.69eefcp-8 }, /* 31 */ > + { 0x1.9b8728p+6, 0x1.9b88b8p+6, 0x1.9b896cp+6, -0x5.5553bp-28, > + 0x1.42320ap-4, -0x1.90f0b8p-12, -0x3.6565p-8 }, /* 32 */ > + { 0x1.a8183cp+6, 0x1.a819d2p+6, 0x1.a81aecp+6, 0x3.2df7ecp-28, > + -0x1.3d62e4p-4, 0x1.7dae28p-12, 0x2.9eb128p-8 }, /* 33 */ > + { 0x1.b4aa1cp+6, 0x1.b4aaeap+6, 0x1.b4abb8p+6, -0x1.e13fcep-24, > + 0x1.38c948p-4, -0x1.6eb0ecp-12, -0x1.f9ddf8p-8 }, /* 34 */ > + { 0x1.c13a7ap+6, 0x1.c13c02p+6, 0x1.c13cbp+6, -0x3.ad9974p-24, > + -0x1.34616ep-4, 0x1.5e36ecp-12, 0x2.a9fc5p-8 }, /* 35 */ > + { 0x1.cdcb76p+6, 0x1.cdcd16p+6, 0x1.cdcde4p+6, -0x3.6977e8p-24, > + 0x1.3027fp-4, -0x1.4f703p-12, -0x2.9817d4p-8 }, /* 36 */ > + { 0x1.da5cdep+6, 0x1.da5e2ap+6, 0x1.da5efp+6, 0x4.654cbp-24, > + -0x1.2c19b6p-4, 0x1.455982p-12, 0x3.f1c564p-8 }, /* 37 */ > + { 0x1.e6edccp+6, 0x1.e6ef3ep+6, 0x1.e6f00ap+6, 0x8.825c8p-32, > + 0x1.2833eep-4, -0x1.39097p-12, -0x3.b2646p-8 }, /* 38 */ > + { 0x1.f37f72p+6, 0x1.f3805p+6, 0x1.f3812ap+6, -0x2.0d11d8p-28, > + -0x1.24740ap-4, 0x1.2c16p-12, 0x1.fc3804p-8 }, /* 39 */ > + { 0x1.000842p+7, 0x1.0008bp+7, 0x1.000908p+7, -0x4.4e495p-24, > + 0x1.20d7b6p-4, -0x1.20816p-12, -0x2.d1ebe8p-8 }, /* 40 */ > + { 0x1.06505cp+7, 0x1.065138p+7, 0x1.06518p+7, 0x4.81c1c8p-24, > + -0x1.1d5ccap-4, 0x1.17ad5ap-12, 0x2.fda33p-8 }, /* 41 */ > + { 0x1.0c98dap+7, 0x1.0c99cp+7, 0x1.0c9a28p+7, -0xe.99386p-28, > + 0x1.1a015p-4, -0x1.0bd50ap-12, -0x2.9dfb68p-8 }, /* 42 */ > + { 0x1.12e212p+7, 0x1.12e248p+7, 0x1.12e29p+7, -0x6.16f1c8p-24, > + -0x1.16c37ap-4, 0x1.0303dcp-12, 0x4.34316p-8 }, /* 43 */ > + { 0x1.192a68p+7, 0x1.192acep+7, 0x1.192b02p+7, -0x1.129336p-24, > + 0x1.13a19ep-4, -0xf.bd247p-16, -0x3.851d18p-8 }, /* 44 */ > + { 0x1.1f727p+7, 0x1.1f7354p+7, 0x1.1f73ap+7, 0x5.19c09p-24, > + -0x1.109a32p-4, 0xf.09644p-16, 0x2.d78194p-8 }, /* 45 */ > + { 0x1.25bb8p+7, 0x1.25bbdap+7, 0x1.25bc12p+7, -0x6.497dp-24, > + 0x1.0dabc8p-4, -0xe.a1d25p-16, -0x2.3378bp-8 }, /* 46 */ > + { 0x1.2c04p+7, 0x1.2c046p+7, 0x1.2c04ap+7, 0x4.e4f338p-24, > + -0x1.0ad512p-4, 0xe.52d84p-16, 0x4.3bfa08p-8 }, /* 47 */ > + { 0x1.324cbp+7, 0x1.324ce6p+7, 0x1.324d4p+7, -0x1.287c58p-24, > + 0x1.0814d4p-4, -0xe.03a95p-16, 0x3.9930ap-12 }, /* 48 */ > + { 0x1.3894f6p+7, 0x1.38956cp+7, 0x1.3895ap+7, -0x4.b594ep-24, > + -0x1.0569fp-4, 0xd.6787ep-16, 0x4.0a5148p-8 }, /* 49 */ > + { 0x1.3edd98p+7, 0x1.3eddfp+7, 0x1.3ede2ap+7, -0x3.a8f164p-24, > + 0x1.02d354p-4, -0xd.0309dp-16, -0x3.2ebfb4p-8 }, /* 50 */ > + { 0x1.452638p+7, 0x1.452676p+7, 0x1.4526b4p+7, -0x6.12505p-24, > + -0x1.005004p-4, 0xc.a0045p-16, 0x4.87c67p-8 }, /* 51 */ > + { 0x1.4b6e8p+7, 0x1.4b6efap+7, 0x1.4b6f34p+7, 0x1.8acf4ep-24, > + 0xf.ddf16p-8, -0xc.2d207p-16, -0x1.da6c36p-8 }, /* 52 */ > + { 0x1.51b742p+7, 0x1.51b77ep+7, 0x1.51b7b2p+7, 0x1.39cf86p-24, > + -0xf.b7faep-8, 0xb.db598p-16, -0x8.945b1p-12 }, /* 53 */ > + { 0x1.57ffc4p+7, 0x1.580002p+7, 0x1.58003cp+7, -0x2.5f8de8p-24, > + 0xf.930fep-8, -0xb.91889p-16, -0xa.30df9p-12 }, /* 54 */ > + { 0x1.5e483p+7, 0x1.5e4886p+7, 0x1.5e48c8p+7, 0x2.073d64p-24, > + -0xf.6f245p-8, 0xb.4085fp-16, 0x2.128188p-8 }, /* 55 */ > + { 0x1.64908cp+7, 0x1.64910ap+7, 0x1.64912ap+7, -0x4.ed26ep-28, > + 0xf.4c2cep-8, -0xa.fe719p-16, -0x2.9374b8p-8 }, /* 56 */ > + { 0x1.6ad91ep+7, 0x1.6ad98ep+7, 0x1.6ad9cep+7, -0x2.ae5204p-24, > + -0xf.2a1efp-8, 0xa.aa585p-16, 0x2.1c0834p-8 }, /* 57 */ > + { 0x1.7121cep+7, 0x1.712212p+7, 0x1.712238p+7, 0x6.d72168p-24, > + 0xf.08f09p-8, -0xa.7da49p-16, -0x3.4f5f1cp-8 }, /* 58 */ > + { 0x1.776a0cp+7, 0x1.776a94p+7, 0x1.776accp+7, 0x2.d3f294p-24, > + -0xe.e8986p-8, 0xa.23ccdp-16, 0x2.2a6678p-8 }, /* 59 */ > + { 0x1.7db2e8p+7, 0x1.7db318p+7, 0x1.7db35ap+7, 0x3.88c0fp-24, > + 0xe.c90d7p-8, -0x9.eaeap-16, -0x2.86438cp-8 }, /* 60 */ > + { 0x1.83fb56p+7, 0x1.83fb9ap+7, 0x1.83fbep+7, 0x3.d94d34p-24, > + -0xe.aa478p-8, 0x9.abac7p-16, 0x1.ac2d84p-8 }, /* 61 */ > + { 0x1.8a43e8p+7, 0x1.8a441ep+7, 0x1.8a446p+7, 0x4.66b7ep-24, > + 0xe.8c3e9p-8, -0x9.87682p-16, -0x7.9ab4a8p-12 }, /* 62 */ > + { 0x1.908c6p+7, 0x1.908cap+7, 0x1.908ce6p+7, 0xf.f7ac9p-28, > + -0xe.6eeb6p-8, 0x9.4423p-16, 0x4.54c4d8p-8 }, /* 63 */ > +}; > + > +/* Formula page 5 of https://www.cl.cam.ac.uk/~jrh13/papers/bessel.pdf: > + y1f(x) ~ sqrt(2/(pi*x))*beta1(x)*sin(x-3pi/4-alpha1(x)) > + where beta1(x) = 1 + 3/(16*x^2) - 99/(512*x^4) > + and alpha1(x) = -3/(8*x) + 21/(128*x^3) - 1899/(5120*x^5). */ > +static float > +y1f_asympt (float x) > +{ > + float cst = 0xc.c422ap-4; /* sqrt(2/pi) rounded to nearest */ Ditto. > + double y = 1.0 / (double) x; > + double y2 = y * y; > + double beta1 = 1.0f + y2 * (0x3p-4 - 0x3.18p-4 * y2); > + double alpha1; > + alpha1 = y * (-0x6p-4 + y2 * (0x2.ap-4 - 0x5.ef33333333334p-4 * y2)); > + double h; > + int n; > + h = reduce_aux (x, &n, alpha1); > + n--; /* Subtract pi/2. */ > + /* Now x - 3pi/4 - alpha1 = h + n*pi/2 mod (2*pi). */ > + float xr = (float) h; > + n = n & 3; > + float t = cst / sqrtf (x) * (float) beta1; > + if (n == 0) > + return t * __sinf (xr); > + else if (n == 2) /* sin(x+pi) = -sin(x) */ > + return -t * __sinf (xr); > + else if (n == 1) /* sin(x+pi/2) = cos(x) */ > + return t * __cosf (xr); > + else /* sin(x+3pi/2) = -cos(x) */ Ditto. > + return -t * __cosf (xr); > +} > + > +/* Special code for x near a root of y1. > + z is the value computed by the generic code. > + For small x, we use a polynomial approximating y1 around its root. > + For large x, we use an asymptotic formula (y1f_asympt). */ > +static float > +y1f_near_root (float x, float z) > +{ > + float index_f; > + int index; > + > + index_f = roundf ((x - FIRST_ZERO_Y1) / (float) M_PI); > + if (index_f >= SMALL_SIZE) > + return y1f_asympt (x); > + index = (int) index_f; > + const float *p = Py[index]; > + float x0 = p[0]; > + float x1 = p[2]; > + /* If not in the interval [x0,x1] around xmid, return the value z. */ > + if (! (x0 <= x && x <= x1)) > + return z; > + float xmid = p[1]; > + float y = x - xmid, p6; > + if (index == 0) > + p6 = p[6] + y * (-0x1.28043p-8 + y * 0x2.50e83p-8); > + else if (index == 1) > + p6 = p[6] + y * -0xf.ff6b8p-12; > + else > + p6 = p[6]; > + return p[3] + y * (p[4] + y * (p[5] + y * p6)); > +} > + > float > __ieee754_y1f(float x) > { > @@ -118,16 +561,18 @@ __ieee754_y1f(float x) > if(__builtin_expect(ix==0, 0)) > return -1/zero; /* -inf and divide by zero exception. */ > if(__builtin_expect(hx<0, 0)) return zero/(zero*x); > - if(ix >= 0x40000000) { /* |x| >= 2.0 */ > + if (ix >= 0x3fe0dfbc) { /* |x| >= 0x1.c1bf78p+0 */ > SET_RESTORE_ROUNDF (FE_TONEAREST); > __sincosf (x, &s, &c); > ss = -s-c; > cc = s-c; > - if(ix<0x7f000000) { /* make sure x+x not overflow */ > - z = __cosf(x+x); > - if ((s*c)>zero) cc = z/ss; > - else ss = z/cc; > - } > + if (ix >= 0x7f000000) > + /* x >= 2^127: use asymptotic expansion. */ > + return y1f_asympt (x); > + /* Now we are sure that x+x cannot overflow. */ > + z = __cosf(x+x); > + if ((s*c)>zero) cc = z/ss; > + else ss = z/cc; > /* y1(x) = sqrt(2/(pi*x))*(p1(x)*sin(x0)+q1(x)*cos(x0)) > * where x0 = x-3pi/4 > * Better formula: > @@ -139,12 +584,20 @@ __ieee754_y1f(float x) > * sin(x) +- cos(x) = -cos(2x)/(sin(x) -+ cos(x)) > * to compute the worse one. > */ > - if(ix>0x5c000000) z = (invsqrtpi*ss)/sqrtf(x); > - else { > - u = ponef(x); v = qonef(x); > - z = invsqrtpi*(u*ss+v*cc)/sqrtf(x); > - } > - return z; > + if (ix <= 0x5c000000) > + { > + u = ponef(x); v = qonef(x); > + ss = u*ss+v*cc; > + } > + z = (invsqrtpi * ss) / sqrtf(x); > + float threshold = 0x1.3e014cp-2; > + /* The following threshold is optimal: for x=0x1.f7f16ap+0 > + and rounding upwards, |ss|=-0x1.3e014cp-2 and z is 11 ulps > + far from the correctly rounded value. */ > + if (fabsf (ss) > threshold) > + return z; > + else > + return y1f_near_root (x, z); > } > if(__builtin_expect(ix<=0x33000000, 0)) { /* x < 2**-25 */ > z = -tpi / x; > @@ -152,6 +605,7 @@ __ieee754_y1f(float x) > __set_errno (ERANGE); > return z; > } > + /* Now 2**-25 <= x < 0x1.c1bf78p+0. */ > z = x*x; > u = U0[0]+z*(U0[1]+z*(U0[2]+z*(U0[3]+z*U0[4]))); > v = one+z*(V0[0]+z*(V0[1]+z*(V0[2]+z*(V0[3]+z*V0[4])))); > @@ -159,7 +613,7 @@ __ieee754_y1f(float x) > } > libm_alias_finite (__ieee754_y1f, __y1f) > > -/* For x >= 8, the asymptotic expansions of pone is > +/* For x >= 8, the asymptotic expansion of pone is > * 1 + 15/128 s^2 - 4725/2^15 s^4 - ..., where s = 1/x. > * We approximate pone by > * pone(x) = 1 + (R/S) > @@ -252,8 +706,7 @@ ponef(float x) > return one+ r/s; > } > > - > -/* For x >= 8, the asymptotic expansions of qone is > +/* For x >= 8, the asymptotic expansion of qone is > * 3/8 s - 105/1024 s^3 - ..., where s = 1/x. > * We approximate pone by > * qone(x) = s*(0.375 + (R/S)) > @@ -340,10 +793,10 @@ qonef(float x) > GET_FLOAT_WORD(ix,x); > ix &= 0x7fffffff; > /* ix >= 0x40000000 for all calls to this function. */ > - if(ix>=0x40200000) {p = qr8; q= qs8;} > - else if(ix>=0x40f71c58){p = qr5; q= qs5;} > - else if(ix>=0x4036db68){p = qr3; q= qs3;} > - else {p = qr2; q= qs2;} > + if(ix>=0x41000000) {p = qr8; q= qs8;} /* x >= 8 */ > + else if(ix>=0x40f71c58){p = qr5; q= qs5;} /* x >= 7.722209930e+00 */ > + else if(ix>=0x4036db68){p = qr3; q= qs3;} /* x >= 2.857141495e+00 */ > + else {p = qr2; q= qs2;} /* x >= 2 */ > z = one/(x*x); > r = p[0]+z*(p[1]+z*(p[2]+z*(p[3]+z*(p[4]+z*p[5])))); > s = one+z*(q[0]+z*(q[1]+z*(q[2]+z*(q[3]+z*(q[4]+z*q[5]))))); >
Dear Adhemerval, > LGTM, with just nits below (there is no need to resend it). > > + float cst = 0xc.c422ap-4; /* sqrt(2/pi) rounded to nearest */ > Double space before '*/;'. I guess you mean before '*/' (i.e., after 'nearest'). > Right, so this new patch supersedes this whole set? yes it does. If/when you are done with your review, I can submit a new version with the nits above corrected. Thank you! Paul
On 30/03/2021 14:46, Paul Zimmermann wrote: > Dear Adhemerval, > >> LGTM, with just nits below (there is no need to resend it). > >>> + float cst = 0xc.c422ap-4; /* sqrt(2/pi) rounded to nearest */ > >> Double space before '*/;'. > > I guess you mean before '*/' (i.e., after 'nearest'). Yeap. > >> Right, so this new patch supersedes this whole set? > > yes it does. If/when you are done with your review, I can submit a new version > with the nits above corrected. I think besides the xfail for powerpc and comment nits the whole set seems to good. I will mark the set superseded and ack the new version (you can fix the commits nits before commit, there is no need to resend a newer version).
diff --git a/sysdeps/ieee754/flt-32/e_j1f.c b/sysdeps/ieee754/flt-32/e_j1f.c index ac5bb76828..5cedfea886 100644 --- a/sysdeps/ieee754/flt-32/e_j1f.c +++ b/sysdeps/ieee754/flt-32/e_j1f.c @@ -21,6 +21,8 @@ #include <fenv_private.h> #include <math-underflow.h> #include <libm-alias-finite.h> +#include <s_sincosf.h> +#include "reduce_aux.c" static float ponef(float), qonef(float); @@ -42,6 +44,223 @@ s05 = 1.2354227016e-11; /* 0x2d59567e */ static const float zero = 0.0; +/* This is the nearest approximation of the first positive zero of j1. */ +#define FIRST_ZERO_J1 0x3.d4eabp+0f + +#define SMALL_SIZE 64 + +/* The following table contains successive zeros of j1 and degree-3 + polynomial approximations of j1 around these zeros: Pj[0] for the first + positive zero (3.831705), Pj[1] for the second one (7.015586), and so on. + Each line contains: + {x0, xmid, x1, p0, p1, p2, p3} + where [x0,x1] is the interval around the zero, xmid is the binary32 number + closest to the zero, and p0+p1*x+p2*x^2+p3*x^3 is the approximation + polynomial. Each polynomial was generated using Sollya on the interval + [x0,x1] around the corresponding zero where the error exceeds 9 ulps + for the alternate code. Degree 3 is enough to get an error at most + 9 ulps, except around the first zero. +*/ +static const float Pj[SMALL_SIZE][7] = { + /* For index 0, we use a degree-4 polynomial generated by Sollya, with the + coefficient of degree 4 hard-coded in j1f_near_root(). */ + { 0x1.e09e5ep+1, 0x1.ea7558p+1, 0x1.ef7352p+1, -0x8.4f069p-28, + -0x6.71b3d8p-4, 0xd.744a2p-8, 0xd.acd9p-8/*, -0x1.3e51aap-8*/ }, /* 0 */ + { 0x1.bdb4c2p+2, 0x1.c0ff6p+2, 0x1.c27a8cp+2, 0xe.c2858p-28, + 0x4.cd464p-4, -0x5.79b71p-8, -0xc.11124p-8 }, /* 1 */ + { 0x1.43b214p+3, 0x1.458d0ep+3, 0x1.460ccep+3, -0x1.e7acecp-24, + -0x3.feca9p-4, 0x3.2470f8p-8, 0xa.625b7p-8 }, /* 2 */ + { 0x1.a9c98p+3, 0x1.aa5bbp+3, 0x1.aaa4d8p+3, 0x1.698158p-24, + 0x3.7e666cp-4, -0x2.1900ap-8, -0x9.2755p-8 }, /* 3 */ + { 0x1.073be4p+4, 0x1.0787b4p+4, 0x1.07aed8p+4, -0x1.f5f658p-24, + -0x3.24b8ep-4, 0x1.86e35cp-8, 0x8.4e4bbp-8 }, /* 4 */ + { 0x1.39ae2ap+4, 0x1.39da8ep+4, 0x1.39f3dap+4, -0x1.4e744p-24, + 0x2.e18a24p-4, -0x1.2ccd16p-8, -0x7.a27ep-8 }, /* 5 */ + { 0x1.6bfa46p+4, 0x1.6c294ep+4, 0x1.6c412p+4, 0xa.3fb7fp-28, + -0x2.acc9c4p-4, 0xf.0b783p-12, 0x7.1c0d3p-8 }, /* 6 */ + { 0x1.9e42bep+4, 0x1.9e757p+4, 0x1.9e876ep+4, -0x2.29f6f4p-24, + 0x2.81f21p-4, -0xc.641bp-12, -0x6.a7ea58p-8 }, /* 7 */ + { 0x1.d08a3ep+4, 0x1.d0bfdp+4, 0x1.d0cd3cp+4, -0x1.b5d196p-24, + -0x2.5e40e4p-4, 0xa.7059fp-12, 0x6.4d6bfp-8 }, /* 8 */ + { 0x1.017794p+5, 0x1.018476p+5, 0x1.018b8cp+5, -0x4.0e001p-24, + 0x2.3febep-4, -0x8.f23aap-12, -0x6.0102cp-8 }, /* 9 */ + { 0x1.1a9e78p+5, 0x1.1aa89p+5, 0x1.1aaf88p+5, 0x3.b26f2p-24, + -0x2.25babp-4, 0x7.c6d948p-12, 0x5.a1d988p-8 }, /* 10 */ + { 0x1.33bddep+5, 0x1.33cc52p+5, 0x1.33d2e4p+5, -0xf.c8cdap-28, + 0x2.0ed05p-4, -0x6.d97dbp-12, -0x5.8da498p-8 }, /* 11 */ + { 0x1.4ce7cp+5, 0x1.4cefdp+5, 0x1.4cf7d4p+5, -0x3.9940e4p-24, + -0x1.fa8b4p-4, 0x6.16108p-12, 0x5.4355e8p-8 }, /* 12 */ + { 0x1.6603e8p+5, 0x1.661316p+5, 0x1.66173ap+5, 0x9.da15dp-28, + 0x1.e8727ep-4, -0x5.742468p-12, -0x5.117c28p-8 }, /* 13 */ + { 0x1.7f2ebcp+5, 0x1.7f3632p+5, 0x1.7f3a7ep+5, -0x3.39b218p-24, + -0x1.d8293ap-4, 0x4.ee3348p-12, 0x4.f9bep-8 }, /* 14 */ + { 0x1.9850e6p+5, 0x1.985928p+5, 0x1.985d9ep+5, -0x3.7b5108p-24, + 0x1.c96702p-4, -0x4.7b0d08p-12, -0x4.c784a8p-8 }, /* 15 */ + { 0x1.b172e8p+5, 0x1.b17c04p+5, 0x1.b1805cp+5, -0x1.91e43ep-24, + -0x1.bbf246p-4, 0x4.18ad78p-12, 0x4.9bfae8p-8 }, /* 16 */ + { 0x1.ca955ap+5, 0x1.ca9ec6p+5, 0x1.caa2a4p+5, 0x1.28453cp-24, + 0x1.af9cb4p-4, -0x3.c3a494p-12, -0x4.78b69p-8 }, /* 17 */ + { 0x1.e3bc94p+5, 0x1.e3c174p+5, 0x1.e3c64p+5, -0x2.e7fef4p-24, + -0x1.a4407ep-4, 0x3.79b228p-12, 0x4.874f7p-8 }, /* 18 */ + { 0x1.fcdf16p+5, 0x1.fce40ep+5, 0x1.fce71p+5, -0x3.23b2fcp-24, + 0x1.99be76p-4, -0x3.39ad7cp-12, -0x4.92a55p-8 }, /* 19 */ + { 0x1.0afe34p+6, 0x1.0b034ep+6, 0x1.0b054ap+6, -0xd.19e93p-28, + -0x1.8ffc9cp-4, 0x2.fee7f8p-12, 0x4.2d33b8p-8 }, /* 20 */ + { 0x1.179344p+6, 0x1.17948ep+6, 0x1.1795bp+6, 0x1.c2ac48p-24, + 0x1.86e51cp-4, -0x2.cc5abp-12, -0x4.866d08p-8 }, /* 21 */ + { 0x1.24231ep+6, 0x1.2425c8p+6, 0x1.2426e2p+6, -0xd.31027p-28, + -0x1.7e656ep-4, 0x2.9db23cp-12, 0x3.cc63c8p-8 }, /* 22 */ + { 0x1.30b5a8p+6, 0x1.30b6fep+6, 0x1.30b84ep+6, 0x5.b5e53p-24, + 0x1.766dc2p-4, -0x2.754cfcp-12, -0x3.c39bb4p-8 }, /* 23 */ + { 0x1.3d46ccp+6, 0x1.3d482ep+6, 0x1.3d495ep+6, -0x1.340a8ap-24, + -0x1.6ef07ep-4, 0x2.4ff9d4p-12, 0x3.9b36e4p-8 }, /* 24 */ + { 0x1.49d688p+6, 0x1.49d95ap+6, 0x1.49dabep+6, -0x3.ba66p-24, + 0x1.67e1dcp-4, -0x2.2f32b8p-12, -0x3.e2aaf4p-8 }, /* 25 */ + { 0x1.566916p+6, 0x1.566a84p+6, 0x1.566bcp+6, 0x6.47ca5p-28, + -0x1.61379ap-4, 0x2.1096acp-12, 0x4.2d0968p-8 }, /* 26 */ + { 0x1.62f8dap+6, 0x1.62fbaap+6, 0x1.62fc9cp+6, -0x2.12affp-24, + 0x1.5ae8c4p-4, -0x1.f32444p-12, -0x3.9e592p-8 }, /* 27 */ + { 0x1.6f89e6p+6, 0x1.6f8ccep+6, 0x1.6f8e34p+6, -0x7.4853ap-28, + -0x1.54ed76p-4, 0x1.db004ap-12, 0x3.907034p-8 }, /* 28 */ + { 0x1.7c1c6ap+6, 0x1.7c1deep+6, 0x1.7c1f4cp+6, -0x4.f0a998p-24, + 0x1.4f3ebcp-4, -0x1.c26808p-12, -0x2.da8df8p-8 }, /* 29 */ + { 0x1.88adaep+6, 0x1.88af0ep+6, 0x1.88afc4p+6, -0x1.80c246p-24, + -0x1.49d668p-4, 0x1.aebc26p-12, 0x3.af7b5cp-8 }, /* 30 */ + { 0x1.953d7p+6, 0x1.95402ap+6, 0x1.9540ep+6, -0x2.22aff8p-24, + 0x1.44aefap-4, -0x1.99f25p-12, -0x3.5e9198p-8 }, /* 31 */ + { 0x1.a1d01ep+6, 0x1.a1d146p+6, 0x1.a1d20ap+6, -0x3.aad6d4p-24, + -0x1.3fc386p-4, 0x1.892858p-12, 0x3.fe0184p-8 }, /* 32 */ + { 0x1.ae60ecp+6, 0x1.ae625ep+6, 0x1.ae6326p+6, -0x2.010be4p-24, + 0x1.3b0fa4p-4, -0x1.7539ap-12, -0x2.b2c9bp-8 }, /* 33 */ + { 0x1.baf234p+6, 0x1.baf376p+6, 0x1.baf442p+6, -0xd.4fd17p-32, + -0x1.368f5cp-4, 0x1.6734e4p-12, 0x3.59f514p-8 }, /* 34 */ + { 0x1.c782e6p+6, 0x1.c7848cp+6, 0x1.c78516p+6, -0xa.d662dp-28, + 0x1.323f18p-4, -0x1.571c02p-12, -0x3.2be5bp-8 }, /* 35 */ + { 0x1.d4144ep+6, 0x1.d415ap+6, 0x1.d41622p+6, 0x4.9f217p-24, + -0x1.2e1b9ap-4, 0x1.4a2edap-12, 0x3.a4e96p-8 }, /* 36 */ + { 0x1.e0a5ep+6, 0x1.e0a6b4p+6, 0x1.e0a788p+6, -0x2.d167p-24, + 0x1.2a21eep-4, -0x1.3c4b46p-12, -0x4.9e0978p-8 }, /* 37 */ + { 0x1.ed36eep+6, 0x1.ed37c8p+6, 0x1.ed3892p+6, -0x4.15a83p-24, + -0x1.264f66p-4, 0x1.31dea4p-12, 0x3.d125ecp-8 }, /* 38 */ + { 0x1.f9c77p+6, 0x1.f9c8d8p+6, 0x1.f9c9acp+6, -0x2.a5bbbp-24, + 0x1.22a192p-4, -0x1.25e59ep-12, -0x2.ef6934p-8 }, /* 39 */ + { 0x1.032c54p+7, 0x1.032cf4p+7, 0x1.032d66p+7, 0x4.e828bp-24, + -0x1.1f1634p-4, 0x1.1c2394p-12, 0x3.6d744cp-8 }, /* 40 */ + { 0x1.09750cp+7, 0x1.09757cp+7, 0x1.0975b6p+7, -0x3.28a3bcp-24, + 0x1.1bab3ep-4, -0x1.1569cep-12, -0x5.84da7p-8 }, /* 41 */ + { 0x1.0fbd9ap+7, 0x1.0fbe04p+7, 0x1.0fbe5ep+7, -0x2.2f667p-24, + -0x1.185eccp-4, 0x1.07f42cp-12, 0x2.87896cp-8 }, /* 42 */ + { 0x1.160628p+7, 0x1.16068ap+7, 0x1.1606cep+7, -0x6.9097dp-24, + 0x1.152f28p-4, -0x1.0227fep-12, -0x5.da6e6p-8 }, /* 43 */ + { 0x1.1c4e9ap+7, 0x1.1c4f12p+7, 0x1.1c4f7cp+7, -0x5.1b408p-24, + -0x1.121abp-4, 0xf.6efcp-16, 0x2.c5e954p-8 }, /* 44 */ + { 0x1.2296aap+7, 0x1.229798p+7, 0x1.2297d4p+7, 0x2.70d0dp-24, + 0x1.0f1ffp-4, -0xf.523f5p-16, -0x3.5c0568p-8 }, /* 45 */ + { 0x1.28dfa4p+7, 0x1.28e01ep+7, 0x1.28e054p+7, -0x2.7c176p-24, + -0x1.0c3d8ap-4, 0xe.8329ap-16, 0x3.5eb34p-8 }, /* 46 */ + { 0x1.2f282ap+7, 0x1.2f28a4p+7, 0x1.2f28dep+7, 0x4.fd6368p-24, + 0x1.097236p-4, -0xe.17299p-16, -0x3.73a2e4p-8 }, /* 47 */ + { 0x1.3570bp+7, 0x1.357128p+7, 0x1.35716p+7, 0x6.b05f68p-24, + -0x1.06bccap-4, 0xd.527b8p-16, 0x2.b8bf9cp-8 }, /* 48 */ + { 0x1.3bb932p+7, 0x1.3bb9aep+7, 0x1.3bb9eap+7, 0x4.0d622p-28, + 0x1.041c28p-4, -0xd.0ac11p-16, -0x1.65f2ccp-8 }, /* 49 */ + { 0x1.4201b6p+7, 0x1.420232p+7, 0x1.42027p+7, 0x7.0d98cp-24, + -0x1.018f52p-4, 0xc.c4d8ep-16, 0x2.8f250cp-8 }, /* 50 */ + { 0x1.484a78p+7, 0x1.484ab8p+7, 0x1.484af4p+7, 0x3.93d10cp-24, + 0xf.f154fp-8, -0xc.7b7fep-16, -0x3.6b6e4cp-8 }, /* 51 */ + { 0x1.4e92c8p+7, 0x1.4e933cp+7, 0x1.4e9368p+7, 0xd.88185p-32, + -0xf.cad3fp-8, 0xc.1462p-16, 0x2.bd66p-8 }, /* 52 */ + { 0x1.54db84p+7, 0x1.54dbcp+7, 0x1.54dbf4p+7, -0x1.fe6b92p-24, + 0xf.a564cp-8, -0xb.c4e6cp-16, -0x3.d51decp-8 }, /* 53 */ + { 0x1.5b23c4p+7, 0x1.5b2444p+7, 0x1.5b2486p+7, 0x2.6137f4p-24, + -0xf.80faep-8, 0xb.5199ep-16, 0x1.9ca85ap-8 }, /* 54 */ + { 0x1.616c62p+7, 0x1.616cc8p+7, 0x1.616d0ap+7, -0x1.55468p-24, + 0xf.5d8acp-8, -0xb.21d16p-16, -0x1.b8809ap-8 }, /* 55 */ + { 0x1.67b4fp+7, 0x1.67b54cp+7, 0x1.67b588p+7, -0x1.08c6bep-24, + -0xf.3b096p-8, 0xa.e65efp-16, 0x3.642304p-8 }, /* 56 */ + { 0x1.6dfd8ep+7, 0x1.6dfddp+7, 0x1.6dfe0ap+7, 0x4.9ebfa8p-24, + 0xf.196c7p-8, -0xa.ba8c8p-16, -0x5.ad6a2p-8 }, /* 57 */ + { 0x1.74461p+7, 0x1.744652p+7, 0x1.744692p+7, 0x5.a4017p-24, + -0xe.f8aa5p-8, 0xa.49748p-16, 0x2.a86498p-8 }, /* 58 */ + { 0x1.7a8e5ep+7, 0x1.7a8ed6p+7, 0x1.7a8ef8p+7, 0x3.bcb2a8p-28, + 0xe.d8b9dp-8, -0x9.c43bep-16, -0x1.e7124ap-8 }, /* 59 */ + { 0x1.80d6cep+7, 0x1.80d75ap+7, 0x1.80d78ap+7, -0x7.1091fp-24, + -0xe.b9925p-8, 0x9.c43dap-16, 0x1.aba86p-8 }, /* 60 */ + { 0x1.871f58p+7, 0x1.871fdcp+7, 0x1.87201ep+7, 0x2.ca1cf4p-28, + 0xe.9b2bep-8, -0x9.843b3p-16, -0x2.093e68p-8 }, /* 61 */ + { 0x1.8d67e8p+7, 0x1.8d685ep+7, 0x1.8d688ep+7, 0x5.aa8908p-24, + -0xe.7d7ecp-8, 0x9.501a8p-16, 0x2.54a754p-8 }, /* 62 */ + { 0x1.93b09cp+7, 0x1.93b0e2p+7, 0x1.93b10ep+7, 0x3.d9cd9cp-24, + 0xe.6083ap-8, -0x9.45dadp-16, -0x5.112908p-8 }, /* 63 */ +}; + +/* Formula page 5 of https://www.cl.cam.ac.uk/~jrh13/papers/bessel.pdf: + j1f(x) ~ sqrt(2/(pi*x))*beta1(x)*cos(x-3pi/4-alpha1(x)) + where beta1(x) = 1 + 3/(16*x^2) - 99/(512*x^4) + and alpha1(x) = -3/(8*x) + 21/(128*x^3) - 1899/(5120*x^5). */ +static float +j1f_asympt (float x) +{ + float cst = 0xc.c422ap-4; /* sqrt(2/pi) rounded to nearest */ + if (x < 0) + { + x = -x; + cst = -cst; + } + double y = 1.0 / (double) x; + double y2 = y * y; + double beta1 = 1.0f + y2 * (0x3p-4 - 0x3.18p-4 * y2); + double alpha1; + alpha1 = y * (-0x6p-4 + y2 * (0x2.ap-4 - 0x5.ef33333333334p-4 * y2)); + double h; + int n; + h = reduce_aux (x, &n, alpha1); + n--; /* Subtract pi/2. */ + /* Now x - 3pi/4 - alpha1 = h + n*pi/2 mod (2*pi). */ + float xr = (float) h; + n = n & 3; + float t = cst / sqrtf (x) * (float) beta1; + if (n == 0) + return t * __cosf (xr); + else if (n == 2) /* cos(x+pi) = -cos(x) */ + return -t * __cosf (xr); + else if (n == 1) /* cos(x+pi/2) = -sin(x) */ + return -t * __sinf (xr); + else /* cos(x+3pi/2) = sin(x) */ + return t * __sinf (xr); +} + +/* Special code for x near a root of j1. + z is the value computed by the generic code. + For small x, we use a polynomial approximating j1 around its root. + For large x, we use an asymptotic formula (j1f_asympt). */ +static float +j1f_near_root (float x, float z) +{ + float index_f, sign = 1.0f; + int index; + + if (x < 0) + { + x = -x; + sign = -1.0f; + } + index_f = roundf ((x - FIRST_ZERO_J1) / (float) M_PI); + if (index_f >= SMALL_SIZE) + return sign * j1f_asympt (x); + index = (int) index_f; + const float *p = Pj[index]; + float x0 = p[0]; + float x1 = p[2]; + /* If not in the interval [x0,x1] around xmid, return the value z. */ + if (! (x0 <= x && x <= x1)) + return z; + float xmid = p[1]; + float y = x - xmid; + float p6 = (index > 0) ? p[6] : p[6] + y * -0x1.3e51aap-8f; + return sign * (p[3] + y * (p[4] + y * (p[5] + y * p6))); +} + float __ieee754_j1f(float x) { @@ -53,25 +272,37 @@ __ieee754_j1f(float x) if(__builtin_expect(ix>=0x7f800000, 0)) return one/x; y = fabsf(x); if(ix >= 0x40000000) { /* |x| >= 2.0 */ + SET_RESTORE_ROUNDF (FE_TONEAREST); __sincosf (y, &s, &c); ss = -s-c; cc = s-c; - if(ix<0x7f000000) { /* make sure y+y not overflow */ - z = __cosf(y+y); - if ((s*c)>zero) cc = z/ss; - else ss = z/cc; - } + if (ix >= 0x7f000000) + /* x >= 2^127: use asymptotic expansion. */ + return j1f_asympt (x); + /* Now we are sure that x+x cannot overflow. */ + z = __cosf(y+y); + if ((s*c)>zero) cc = z/ss; + else ss = z/cc; /* * j1(x) = 1/sqrt(pi) * (P(1,x)*cc - Q(1,x)*ss) / sqrt(x) * y1(x) = 1/sqrt(pi) * (P(1,x)*ss + Q(1,x)*cc) / sqrt(x) */ - if(ix>0x5c000000) z = (invsqrtpi*cc)/sqrtf(y); - else { + if (ix <= 0x5c000000) + { u = ponef(y); v = qonef(y); - z = invsqrtpi*(u*cc-v*ss)/sqrtf(y); - } - if(hx<0) return -z; - else return z; + cc = u*cc-v*ss; + } + z = (invsqrtpi * cc) / sqrtf(y); + /* Adjust sign of z. */ + z = (hx < 0) ? -z : z; + /* The following threshold is optimal: for x=0x1.e09e5ep+1 + and rounding upwards, cc=0x1.b79638p-4 and z is 10 ulps + far from the correctly rounded value. */ + float threshold = 0x1.b79638p-4; + if (fabsf (cc) > threshold) + return z; + else + return j1f_near_root (x, z); } if(__builtin_expect(ix<0x32000000, 0)) { /* |x|<2**-27 */ if(huge+x>one) { /* inexact if x!=0 necessary */ @@ -105,6 +336,218 @@ static const float V0[5] = { 1.6655924903e-11, /* 0x2d9281cf */ }; +/* This is the nearest approximation of the first zero of y1. */ +#define FIRST_ZERO_Y1 0x2.3277dcp+0f + +/* The following table contains successive zeros of y1 and degree-3 + polynomial approximations of y1 around these zeros: Py[0] for the first + positive zero (2.197141), Py[1] for the second one (5.429681), and so on. + Each line contains: + {x0, xmid, x1, p0, p1, p2, p3} + where [x0,x1] is the interval around the zero, xmid is the binary32 number + closest to the zero, and p0+p1*x+p2*x^2+p3*x^3 is the approximation + polynomial. Each polynomial was generated using Sollya on the interval + [x0,x1] around the corresponding zero where the error exceeds 9 ulps + for the alternate code. Degree 3 is enough, except for the first roots. +*/ +static const float Py[SMALL_SIZE][7] = { + /* For index 0, we use a degree-5 polynomial generated by Sollya, with the + coefficients of degree 4 and 5 hard-coded in y1f_near_root(). */ + { 0x1.f7f16ap+0, 0x1.193beep+1, 0x1.2105dcp+1, 0xb.96749p-28, + 0x8.55241p-4, -0x1.e570bp-4, -0x8.68b61p-8 + /*, -0x1.28043p-8, 0x2.50e83p-8*/ }, /* 0 */ + /* For index 1, we use a degree-4 polynomial generated by Sollya, with the + coefficient of degree 4 hard-coded in y1f_near_root(). */ + { 0x1.55c6d2p+2, 0x1.5b7fe4p+2, 0x1.5cf8cap+2, 0x1.3c7822p-24, + -0x5.71f158p-4, 0x8.05cb4p-8, 0xd.0b15p-8/*, -0xf.ff6b8p-12*/ }, /* 1 */ + { 0x1.113c6p+3, 0x1.13127ap+3, 0x1.1387dcp+3, -0x1.f3ad8ep-24, + 0x4.57e66p-4, -0x4.0afb58p-8, -0xb.29207p-8 }, /* 2 */ + { 0x1.76e7dep+3, 0x1.77f914p+3, 0x1.786a6ap+3, -0xd.5608fp-28, + -0x3.b829d4p-4, 0x2.8852cp-8, 0x9.b70e3p-8 }, /* 3 */ + { 0x1.dc2794p+3, 0x1.dcb7d8p+3, 0x1.dd032p+3, -0xe.a7c04p-28, + 0x3.4e0458p-4, -0x1.c64b18p-8, -0x8.b0e7fp-8 }, /* 4 */ + { 0x1.20874p+4, 0x1.20b1c6p+4, 0x1.20c71p+4, 0x1.c2626p-24, + -0x3.00f03cp-4, 0x1.54f806p-8, 0x7.f9cf9p-8 }, /* 5 */ + { 0x1.52d848p+4, 0x1.530254p+4, 0x1.531962p+4, -0x1.9503ecp-24, + 0x2.c5b29cp-4, -0x1.0bf28p-8, -0x7.562e58p-8 }, /* 6 */ + { 0x1.851e64p+4, 0x1.854fa4p+4, 0x1.85679p+4, -0x2.8d40fcp-24, + -0x2.96547p-4, 0xd.9c38bp-12, 0x6.dcbf8p-8 }, /* 7 */ + { 0x1.b7808ep+4, 0x1.b79acep+4, 0x1.b7b2a8p+4, -0x2.36df5cp-24, + 0x2.6f55ap-4, -0xb.57f9fp-12, -0x6.82569p-8 }, /* 8 */ + { 0x1.e9c8fp+4, 0x1.e9e48p+4, 0x1.e9f24p+4, 0xd.e2eb7p-28, + -0x2.4e8104p-4, 0x9.a4be2p-12, 0x6.2541fp-8 }, /* 9 */ + { 0x1.0e0808p+5, 0x1.0e169p+5, 0x1.0e1d92p+5, -0x2.3070f4p-24, + 0x2.325e4cp-4, -0x8.53604p-12, -0x5.ca03a8p-8 }, /* 10 */ + { 0x1.272e08p+5, 0x1.273a7cp+5, 0x1.2741fcp+5, -0x3.525508p-24, + -0x2.19e7dcp-4, 0x7.49d1dp-12, 0x5.9cb02p-8 }, /* 11 */ + { 0x1.404ec6p+5, 0x1.405e18p+5, 0x1.4065cep+5, -0xe.6e158p-28, + 0x2.046174p-4, -0x6.71b3dp-12, -0x5.4c3c8p-8 }, /* 12 */ + { 0x1.5971dcp+5, 0x1.598178p+5, 0x1.598592p+5, 0x1.e72698p-24, + -0x1.f13fb2p-4, 0x5.c0f938p-12, 0x5.28ca78p-8 }, /* 13 */ + { 0x1.729c4ep+5, 0x1.72a4a8p+5, 0x1.72a8eap+5, -0x1.5bed9cp-24, + 0x1.e018dcp-4, -0x5.2f11e8p-12, -0x5.16ce48p-8 }, /* 14 */ + { 0x1.8bbf4ep+5, 0x1.8bc7b2p+5, 0x1.8bcc1p+5, -0x3.6b654cp-24, + -0x1.d09b2p-4, 0x4.b1747p-12, 0x4.bd22fp-8 }, /* 15 */ + { 0x1.a4e272p+5, 0x1.a4ea9ap+5, 0x1.a4eef4p+5, 0x1.6f11bp-24, + 0x1.c28612p-4, -0x4.47462p-12, -0x4.947c5p-8 }, /* 16 */ + { 0x1.be08bep+5, 0x1.be0d68p+5, 0x1.be1088p+5, -0x2.0bc074p-24, + -0x1.b5a622p-4, 0x3.ed52d4p-12, 0x4.b76fc8p-8 }, /* 17 */ + { 0x1.d7272ap+5, 0x1.d7301ep+5, 0x1.d734aep+5, -0x2.87dd4p-24, + 0x1.a9d184p-4, -0x3.9cf494p-12, -0x4.6303ep-8 }, /* 18 */ + { 0x1.f0499ap+5, 0x1.f052c4p+5, 0x1.f05758p+5, -0x2.fb964p-24, + -0x1.9ee5eep-4, 0x3.5800dp-12, 0x4.4e9f9p-8 }, /* 19 */ + { 0x1.04b63ap+6, 0x1.04baacp+6, 0x1.04bc92p+6, 0x2.cf5adp-24, + 0x1.94c6f4p-4, -0x3.1a83e4p-12, -0x4.2311fp-8 }, /* 20 */ + { 0x1.1146dp+6, 0x1.114beep+6, 0x1.114e12p+6, 0x3.6766fp-24, + -0x1.8b5cccp-4, 0x2.e4a4e4p-12, 0x4.20bf9p-8 }, /* 21 */ + { 0x1.1dda8cp+6, 0x1.1ddd2cp+6, 0x1.1dde7ap+6, 0x3.501424p-24, + 0x1.829356p-4, -0x2.b47524p-12, -0x4.04bf18p-8 }, /* 22 */ + { 0x1.2a6bcp+6, 0x1.2a6e64p+6, 0x1.2a6faap+6, -0x5.c05808p-24, + -0x1.7a597ep-4, 0x2.8a0498p-12, 0x4.187258p-8 }, /* 23 */ + { 0x1.36fcd6p+6, 0x1.36ff96p+6, 0x1.3700f6p+6, 0x7.1e1478p-28, + 0x1.72a09ap-4, -0x2.61a7fp-12, -0x3.c0b54p-8 }, /* 24 */ + { 0x1.438f46p+6, 0x1.4390c4p+6, 0x1.4392p+6, 0x3.e36e6cp-24, + -0x1.6b5c06p-4, 0x2.3f612p-12, 0x4.18f868p-8 }, /* 25 */ + { 0x1.501f4cp+6, 0x1.5021fp+6, 0x1.50235p+6, 0x1.3f9e5ap-24, + 0x1.6480c4p-4, -0x2.1f28fcp-12, -0x3.bb4e3cp-8 }, /* 26 */ + { 0x1.5cb07cp+6, 0x1.5cb318p+6, 0x1.5cb464p+6, -0x2.39e41cp-24, + -0x1.5e0544p-4, 0x2.0189f4p-12, 0x3.8b55acp-8 }, /* 27 */ + { 0x1.694166p+6, 0x1.69443cp+6, 0x1.694594p+6, -0x2.912f84p-24, + 0x1.57e12p-4, -0x1.e6fabep-12, -0x3.850174p-8 }, /* 28 */ + { 0x1.75d27cp+6, 0x1.75d55ep+6, 0x1.75d67ep+6, 0x3.d5b00cp-24, + -0x1.520ceep-4, 0x1.d0286ep-12, 0x3.8e7d1p-8 }, /* 29 */ + { 0x1.82653ep+6, 0x1.82667ep+6, 0x1.82674p+6, -0x3.1726ecp-24, + 0x1.4c8222p-4, -0x1.b98206p-12, -0x3.f34978p-8 }, /* 30 */ + { 0x1.8ef4b4p+6, 0x1.8ef79cp+6, 0x1.8ef888p+6, 0x1.949e22p-24, + -0x1.473ae6p-4, 0x1.a47388p-12, 0x3.69eefcp-8 }, /* 31 */ + { 0x1.9b8728p+6, 0x1.9b88b8p+6, 0x1.9b896cp+6, -0x5.5553bp-28, + 0x1.42320ap-4, -0x1.90f0b8p-12, -0x3.6565p-8 }, /* 32 */ + { 0x1.a8183cp+6, 0x1.a819d2p+6, 0x1.a81aecp+6, 0x3.2df7ecp-28, + -0x1.3d62e4p-4, 0x1.7dae28p-12, 0x2.9eb128p-8 }, /* 33 */ + { 0x1.b4aa1cp+6, 0x1.b4aaeap+6, 0x1.b4abb8p+6, -0x1.e13fcep-24, + 0x1.38c948p-4, -0x1.6eb0ecp-12, -0x1.f9ddf8p-8 }, /* 34 */ + { 0x1.c13a7ap+6, 0x1.c13c02p+6, 0x1.c13cbp+6, -0x3.ad9974p-24, + -0x1.34616ep-4, 0x1.5e36ecp-12, 0x2.a9fc5p-8 }, /* 35 */ + { 0x1.cdcb76p+6, 0x1.cdcd16p+6, 0x1.cdcde4p+6, -0x3.6977e8p-24, + 0x1.3027fp-4, -0x1.4f703p-12, -0x2.9817d4p-8 }, /* 36 */ + { 0x1.da5cdep+6, 0x1.da5e2ap+6, 0x1.da5efp+6, 0x4.654cbp-24, + -0x1.2c19b6p-4, 0x1.455982p-12, 0x3.f1c564p-8 }, /* 37 */ + { 0x1.e6edccp+6, 0x1.e6ef3ep+6, 0x1.e6f00ap+6, 0x8.825c8p-32, + 0x1.2833eep-4, -0x1.39097p-12, -0x3.b2646p-8 }, /* 38 */ + { 0x1.f37f72p+6, 0x1.f3805p+6, 0x1.f3812ap+6, -0x2.0d11d8p-28, + -0x1.24740ap-4, 0x1.2c16p-12, 0x1.fc3804p-8 }, /* 39 */ + { 0x1.000842p+7, 0x1.0008bp+7, 0x1.000908p+7, -0x4.4e495p-24, + 0x1.20d7b6p-4, -0x1.20816p-12, -0x2.d1ebe8p-8 }, /* 40 */ + { 0x1.06505cp+7, 0x1.065138p+7, 0x1.06518p+7, 0x4.81c1c8p-24, + -0x1.1d5ccap-4, 0x1.17ad5ap-12, 0x2.fda33p-8 }, /* 41 */ + { 0x1.0c98dap+7, 0x1.0c99cp+7, 0x1.0c9a28p+7, -0xe.99386p-28, + 0x1.1a015p-4, -0x1.0bd50ap-12, -0x2.9dfb68p-8 }, /* 42 */ + { 0x1.12e212p+7, 0x1.12e248p+7, 0x1.12e29p+7, -0x6.16f1c8p-24, + -0x1.16c37ap-4, 0x1.0303dcp-12, 0x4.34316p-8 }, /* 43 */ + { 0x1.192a68p+7, 0x1.192acep+7, 0x1.192b02p+7, -0x1.129336p-24, + 0x1.13a19ep-4, -0xf.bd247p-16, -0x3.851d18p-8 }, /* 44 */ + { 0x1.1f727p+7, 0x1.1f7354p+7, 0x1.1f73ap+7, 0x5.19c09p-24, + -0x1.109a32p-4, 0xf.09644p-16, 0x2.d78194p-8 }, /* 45 */ + { 0x1.25bb8p+7, 0x1.25bbdap+7, 0x1.25bc12p+7, -0x6.497dp-24, + 0x1.0dabc8p-4, -0xe.a1d25p-16, -0x2.3378bp-8 }, /* 46 */ + { 0x1.2c04p+7, 0x1.2c046p+7, 0x1.2c04ap+7, 0x4.e4f338p-24, + -0x1.0ad512p-4, 0xe.52d84p-16, 0x4.3bfa08p-8 }, /* 47 */ + { 0x1.324cbp+7, 0x1.324ce6p+7, 0x1.324d4p+7, -0x1.287c58p-24, + 0x1.0814d4p-4, -0xe.03a95p-16, 0x3.9930ap-12 }, /* 48 */ + { 0x1.3894f6p+7, 0x1.38956cp+7, 0x1.3895ap+7, -0x4.b594ep-24, + -0x1.0569fp-4, 0xd.6787ep-16, 0x4.0a5148p-8 }, /* 49 */ + { 0x1.3edd98p+7, 0x1.3eddfp+7, 0x1.3ede2ap+7, -0x3.a8f164p-24, + 0x1.02d354p-4, -0xd.0309dp-16, -0x3.2ebfb4p-8 }, /* 50 */ + { 0x1.452638p+7, 0x1.452676p+7, 0x1.4526b4p+7, -0x6.12505p-24, + -0x1.005004p-4, 0xc.a0045p-16, 0x4.87c67p-8 }, /* 51 */ + { 0x1.4b6e8p+7, 0x1.4b6efap+7, 0x1.4b6f34p+7, 0x1.8acf4ep-24, + 0xf.ddf16p-8, -0xc.2d207p-16, -0x1.da6c36p-8 }, /* 52 */ + { 0x1.51b742p+7, 0x1.51b77ep+7, 0x1.51b7b2p+7, 0x1.39cf86p-24, + -0xf.b7faep-8, 0xb.db598p-16, -0x8.945b1p-12 }, /* 53 */ + { 0x1.57ffc4p+7, 0x1.580002p+7, 0x1.58003cp+7, -0x2.5f8de8p-24, + 0xf.930fep-8, -0xb.91889p-16, -0xa.30df9p-12 }, /* 54 */ + { 0x1.5e483p+7, 0x1.5e4886p+7, 0x1.5e48c8p+7, 0x2.073d64p-24, + -0xf.6f245p-8, 0xb.4085fp-16, 0x2.128188p-8 }, /* 55 */ + { 0x1.64908cp+7, 0x1.64910ap+7, 0x1.64912ap+7, -0x4.ed26ep-28, + 0xf.4c2cep-8, -0xa.fe719p-16, -0x2.9374b8p-8 }, /* 56 */ + { 0x1.6ad91ep+7, 0x1.6ad98ep+7, 0x1.6ad9cep+7, -0x2.ae5204p-24, + -0xf.2a1efp-8, 0xa.aa585p-16, 0x2.1c0834p-8 }, /* 57 */ + { 0x1.7121cep+7, 0x1.712212p+7, 0x1.712238p+7, 0x6.d72168p-24, + 0xf.08f09p-8, -0xa.7da49p-16, -0x3.4f5f1cp-8 }, /* 58 */ + { 0x1.776a0cp+7, 0x1.776a94p+7, 0x1.776accp+7, 0x2.d3f294p-24, + -0xe.e8986p-8, 0xa.23ccdp-16, 0x2.2a6678p-8 }, /* 59 */ + { 0x1.7db2e8p+7, 0x1.7db318p+7, 0x1.7db35ap+7, 0x3.88c0fp-24, + 0xe.c90d7p-8, -0x9.eaeap-16, -0x2.86438cp-8 }, /* 60 */ + { 0x1.83fb56p+7, 0x1.83fb9ap+7, 0x1.83fbep+7, 0x3.d94d34p-24, + -0xe.aa478p-8, 0x9.abac7p-16, 0x1.ac2d84p-8 }, /* 61 */ + { 0x1.8a43e8p+7, 0x1.8a441ep+7, 0x1.8a446p+7, 0x4.66b7ep-24, + 0xe.8c3e9p-8, -0x9.87682p-16, -0x7.9ab4a8p-12 }, /* 62 */ + { 0x1.908c6p+7, 0x1.908cap+7, 0x1.908ce6p+7, 0xf.f7ac9p-28, + -0xe.6eeb6p-8, 0x9.4423p-16, 0x4.54c4d8p-8 }, /* 63 */ +}; + +/* Formula page 5 of https://www.cl.cam.ac.uk/~jrh13/papers/bessel.pdf: + y1f(x) ~ sqrt(2/(pi*x))*beta1(x)*sin(x-3pi/4-alpha1(x)) + where beta1(x) = 1 + 3/(16*x^2) - 99/(512*x^4) + and alpha1(x) = -3/(8*x) + 21/(128*x^3) - 1899/(5120*x^5). */ +static float +y1f_asympt (float x) +{ + float cst = 0xc.c422ap-4; /* sqrt(2/pi) rounded to nearest */ + double y = 1.0 / (double) x; + double y2 = y * y; + double beta1 = 1.0f + y2 * (0x3p-4 - 0x3.18p-4 * y2); + double alpha1; + alpha1 = y * (-0x6p-4 + y2 * (0x2.ap-4 - 0x5.ef33333333334p-4 * y2)); + double h; + int n; + h = reduce_aux (x, &n, alpha1); + n--; /* Subtract pi/2. */ + /* Now x - 3pi/4 - alpha1 = h + n*pi/2 mod (2*pi). */ + float xr = (float) h; + n = n & 3; + float t = cst / sqrtf (x) * (float) beta1; + if (n == 0) + return t * __sinf (xr); + else if (n == 2) /* sin(x+pi) = -sin(x) */ + return -t * __sinf (xr); + else if (n == 1) /* sin(x+pi/2) = cos(x) */ + return t * __cosf (xr); + else /* sin(x+3pi/2) = -cos(x) */ + return -t * __cosf (xr); +} + +/* Special code for x near a root of y1. + z is the value computed by the generic code. + For small x, we use a polynomial approximating y1 around its root. + For large x, we use an asymptotic formula (y1f_asympt). */ +static float +y1f_near_root (float x, float z) +{ + float index_f; + int index; + + index_f = roundf ((x - FIRST_ZERO_Y1) / (float) M_PI); + if (index_f >= SMALL_SIZE) + return y1f_asympt (x); + index = (int) index_f; + const float *p = Py[index]; + float x0 = p[0]; + float x1 = p[2]; + /* If not in the interval [x0,x1] around xmid, return the value z. */ + if (! (x0 <= x && x <= x1)) + return z; + float xmid = p[1]; + float y = x - xmid, p6; + if (index == 0) + p6 = p[6] + y * (-0x1.28043p-8 + y * 0x2.50e83p-8); + else if (index == 1) + p6 = p[6] + y * -0xf.ff6b8p-12; + else + p6 = p[6]; + return p[3] + y * (p[4] + y * (p[5] + y * p6)); +} + float __ieee754_y1f(float x) { @@ -118,16 +561,18 @@ __ieee754_y1f(float x) if(__builtin_expect(ix==0, 0)) return -1/zero; /* -inf and divide by zero exception. */ if(__builtin_expect(hx<0, 0)) return zero/(zero*x); - if(ix >= 0x40000000) { /* |x| >= 2.0 */ + if (ix >= 0x3fe0dfbc) { /* |x| >= 0x1.c1bf78p+0 */ SET_RESTORE_ROUNDF (FE_TONEAREST); __sincosf (x, &s, &c); ss = -s-c; cc = s-c; - if(ix<0x7f000000) { /* make sure x+x not overflow */ - z = __cosf(x+x); - if ((s*c)>zero) cc = z/ss; - else ss = z/cc; - } + if (ix >= 0x7f000000) + /* x >= 2^127: use asymptotic expansion. */ + return y1f_asympt (x); + /* Now we are sure that x+x cannot overflow. */ + z = __cosf(x+x); + if ((s*c)>zero) cc = z/ss; + else ss = z/cc; /* y1(x) = sqrt(2/(pi*x))*(p1(x)*sin(x0)+q1(x)*cos(x0)) * where x0 = x-3pi/4 * Better formula: @@ -139,12 +584,20 @@ __ieee754_y1f(float x) * sin(x) +- cos(x) = -cos(2x)/(sin(x) -+ cos(x)) * to compute the worse one. */ - if(ix>0x5c000000) z = (invsqrtpi*ss)/sqrtf(x); - else { - u = ponef(x); v = qonef(x); - z = invsqrtpi*(u*ss+v*cc)/sqrtf(x); - } - return z; + if (ix <= 0x5c000000) + { + u = ponef(x); v = qonef(x); + ss = u*ss+v*cc; + } + z = (invsqrtpi * ss) / sqrtf(x); + float threshold = 0x1.3e014cp-2; + /* The following threshold is optimal: for x=0x1.f7f16ap+0 + and rounding upwards, |ss|=-0x1.3e014cp-2 and z is 11 ulps + far from the correctly rounded value. */ + if (fabsf (ss) > threshold) + return z; + else + return y1f_near_root (x, z); } if(__builtin_expect(ix<=0x33000000, 0)) { /* x < 2**-25 */ z = -tpi / x; @@ -152,6 +605,7 @@ __ieee754_y1f(float x) __set_errno (ERANGE); return z; } + /* Now 2**-25 <= x < 0x1.c1bf78p+0. */ z = x*x; u = U0[0]+z*(U0[1]+z*(U0[2]+z*(U0[3]+z*U0[4]))); v = one+z*(V0[0]+z*(V0[1]+z*(V0[2]+z*(V0[3]+z*V0[4])))); @@ -159,7 +613,7 @@ __ieee754_y1f(float x) } libm_alias_finite (__ieee754_y1f, __y1f) -/* For x >= 8, the asymptotic expansions of pone is +/* For x >= 8, the asymptotic expansion of pone is * 1 + 15/128 s^2 - 4725/2^15 s^4 - ..., where s = 1/x. * We approximate pone by * pone(x) = 1 + (R/S) @@ -252,8 +706,7 @@ ponef(float x) return one+ r/s; } - -/* For x >= 8, the asymptotic expansions of qone is +/* For x >= 8, the asymptotic expansion of qone is * 3/8 s - 105/1024 s^3 - ..., where s = 1/x. * We approximate pone by * qone(x) = s*(0.375 + (R/S)) @@ -340,10 +793,10 @@ qonef(float x) GET_FLOAT_WORD(ix,x); ix &= 0x7fffffff; /* ix >= 0x40000000 for all calls to this function. */ - if(ix>=0x40200000) {p = qr8; q= qs8;} - else if(ix>=0x40f71c58){p = qr5; q= qs5;} - else if(ix>=0x4036db68){p = qr3; q= qs3;} - else {p = qr2; q= qs2;} + if(ix>=0x41000000) {p = qr8; q= qs8;} /* x >= 8 */ + else if(ix>=0x40f71c58){p = qr5; q= qs5;} /* x >= 7.722209930e+00 */ + else if(ix>=0x4036db68){p = qr3; q= qs3;} /* x >= 2.857141495e+00 */ + else {p = qr2; q= qs2;} /* x >= 2 */ z = one/(x*x); r = p[0]+z*(p[1]+z*(p[2]+z*(p[3]+z*(p[4]+z*p[5])))); s = one+z*(q[0]+z*(q[1]+z*(q[2]+z*(q[3]+z*(q[4]+z*q[5])))));