Message ID: 20240220164413.20463-1-Joe.Ramsay@arm.com
State:      New
Series:     aarch64/fpu: Sync libmvec routines from 2.39 and before with AOR
On 20/02/24 13:44, Joe Ramsay wrote:
> This includes a fix for big-endian in AdvSIMD log, some cosmetic
> changes, and numerous small optimisations mainly around inlining and
> using indexed variants of MLA intrinsics.

LGTM, thanks.

Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>

> ---
> Thanks,
> Joe
>  sysdeps/aarch64/fpu/acos_advsimd.c   |  4 ++--
>  sysdeps/aarch64/fpu/asin_advsimd.c   |  4 ++--
>  sysdeps/aarch64/fpu/atan2_sve.c      | 29 +++++++++++++--------------
>  sysdeps/aarch64/fpu/atan2f_sve.c     | 30 +++++++++++++++-------------
>  sysdeps/aarch64/fpu/cos_advsimd.c    |  3 +--
>  sysdeps/aarch64/fpu/cosf_advsimd.c   |  3 +--
>  sysdeps/aarch64/fpu/exp10_advsimd.c  |  4 ++--
>  sysdeps/aarch64/fpu/exp10f_advsimd.c | 21 ++++++++++---------
>  sysdeps/aarch64/fpu/exp2_advsimd.c   | 20 ++++++++++---------
>  sysdeps/aarch64/fpu/exp2f_sve.c      |  4 +++-
>  sysdeps/aarch64/fpu/exp_advsimd.c    |  4 ++--
>  sysdeps/aarch64/fpu/expm1_advsimd.c  | 11 +++++-----
>  sysdeps/aarch64/fpu/expm1f_advsimd.c | 17 ++++++++--------
>  sysdeps/aarch64/fpu/log_advsimd.c    |  5 +++++
>  sysdeps/aarch64/fpu/sin_advsimd.c    |  3 +--
>  sysdeps/aarch64/fpu/sinf_advsimd.c   |  3 +--
>  sysdeps/aarch64/fpu/tan_advsimd.c    | 26 ++++++++++++------------
>  sysdeps/aarch64/fpu/tanf_advsimd.c   | 25 ++++++++++++-----------
>  18 files changed, 111 insertions(+), 105 deletions(-)
>
> diff --git a/sysdeps/aarch64/fpu/acos_advsimd.c b/sysdeps/aarch64/fpu/acos_advsimd.c
> index a8eabb5e71..0a86c9823a 100644
> --- a/sysdeps/aarch64/fpu/acos_advsimd.c
> +++ b/sysdeps/aarch64/fpu/acos_advsimd.c
> @@ -40,8 +40,8 @@ static const struct data
>  };
>
>  #define AllMask v_u64 (0xffffffffffffffff)
> -#define Oneu (0x3ff0000000000000)
> -#define Small (0x3e50000000000000) /* 2^-53. */
> +#define Oneu 0x3ff0000000000000
> +#define Small 0x3e50000000000000 /* 2^-53. */
>
>  #if WANT_SIMD_EXCEPT
>  static float64x2_t VPCS_ATTR NOINLINE
> diff --git a/sysdeps/aarch64/fpu/asin_advsimd.c b/sysdeps/aarch64/fpu/asin_advsimd.c
> index 141646e954..2de6eff407 100644
> --- a/sysdeps/aarch64/fpu/asin_advsimd.c
> +++ b/sysdeps/aarch64/fpu/asin_advsimd.c
> @@ -39,8 +39,8 @@ static const struct data
>  };
>
>  #define AllMask v_u64 (0xffffffffffffffff)
> -#define One (0x3ff0000000000000)
> -#define Small (0x3e50000000000000) /* 2^-12. */
> +#define One 0x3ff0000000000000
> +#define Small 0x3e50000000000000 /* 2^-12. */
>
>  #if WANT_SIMD_EXCEPT
>  static float64x2_t VPCS_ATTR NOINLINE
> diff --git a/sysdeps/aarch64/fpu/atan2_sve.c b/sysdeps/aarch64/fpu/atan2_sve.c
> index 09a4c559b8..04fa71fa37 100644
> --- a/sysdeps/aarch64/fpu/atan2_sve.c
> +++ b/sysdeps/aarch64/fpu/atan2_sve.c
> @@ -37,9 +37,6 @@ static const struct data
>    .pi_over_2 = 0x1.921fb54442d18p+0,
>  };
>
> -/* Useful constants. */
> -#define SignMask sv_u64 (0x8000000000000000)
> -
>  /* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */
>  static svfloat64_t NOINLINE
>  special_case (svfloat64_t y, svfloat64_t x, svfloat64_t ret,
> @@ -72,14 +69,15 @@ svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg)
>    svbool_t cmp_y = zeroinfnan (iy, pg);
>    svbool_t cmp_xy = svorr_z (pg, cmp_x, cmp_y);
>
> -  svuint64_t sign_x = svand_x (pg, ix, SignMask);
> -  svuint64_t sign_y = svand_x (pg, iy, SignMask);
> -  svuint64_t sign_xy = sveor_x (pg, sign_x, sign_y);
> -
>    svfloat64_t ax = svabs_x (pg, x);
>    svfloat64_t ay = svabs_x (pg, y);
> +  svuint64_t iax = svreinterpret_u64 (ax);
> +  svuint64_t iay = svreinterpret_u64 (ay);
> +
> +  svuint64_t sign_x = sveor_x (pg, ix, iax);
> +  svuint64_t sign_y = sveor_x (pg, iy, iay);
> +  svuint64_t sign_xy = sveor_x (pg, sign_x, sign_y);
>
> -  svbool_t pred_xlt0 = svcmplt (pg, x, 0.0);
>    svbool_t pred_aygtax = svcmpgt (pg, ay, ax);
>
>    /* Set up z for call to atan. */
> @@ -88,8 +86,9 @@ svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg)
>    svfloat64_t z = svdiv_x (pg, n, d);
>
>    /* Work out the correct shift. */
> -  svfloat64_t shift = svsel (pred_xlt0, sv_f64 (-2.0), sv_f64 (0.0));
> -  shift = svsel (pred_aygtax, svadd_x (pg, shift, 1.0), shift);
> +  svfloat64_t shift = svreinterpret_f64 (svlsr_x (pg, sign_x, 1));
> +  shift = svsel (pred_aygtax, sv_f64 (1.0), shift);
> +  shift = svreinterpret_f64 (svorr_x (pg, sign_x, svreinterpret_u64 (shift)));
>    shift = svmul_x (pg, shift, data_ptr->pi_over_2);
>
>    /* Use split Estrin scheme for P(z^2) with deg(P)=19. */
> @@ -109,10 +108,10 @@ svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg)
>    ret = svadd_m (pg, ret, shift);
>
>    /* Account for the sign of x and y. */
> -  ret = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ret), sign_xy));
> -
>    if (__glibc_unlikely (svptest_any (pg, cmp_xy)))
> -    return special_case (y, x, ret, cmp_xy);
> -
> -  return ret;
> +    return special_case (
> +      y, x,
> +      svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ret), sign_xy)),
> +      cmp_xy);
> +  return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ret), sign_xy));
>  }
> diff --git a/sysdeps/aarch64/fpu/atan2f_sve.c b/sysdeps/aarch64/fpu/atan2f_sve.c
> index b92f83cdea..9ea197147c 100644
> --- a/sysdeps/aarch64/fpu/atan2f_sve.c
> +++ b/sysdeps/aarch64/fpu/atan2f_sve.c
> @@ -32,10 +32,8 @@ static const struct data
>    .pi_over_2 = 0x1.921fb6p+0f,
>  };
>
> -#define SignMask sv_u32 (0x80000000)
> -
>  /* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */
> -static inline svfloat32_t
> +static svfloat32_t NOINLINE
>  special_case (svfloat32_t y, svfloat32_t x, svfloat32_t ret,
>                const svbool_t cmp)
>  {
> @@ -67,14 +65,15 @@ svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg)
>    svbool_t cmp_y = zeroinfnan (iy, pg);
>    svbool_t cmp_xy = svorr_z (pg, cmp_x, cmp_y);
>
> -  svuint32_t sign_x = svand_x (pg, ix, SignMask);
> -  svuint32_t sign_y = svand_x (pg, iy, SignMask);
> -  svuint32_t sign_xy = sveor_x (pg, sign_x, sign_y);
> -
>    svfloat32_t ax = svabs_x (pg, x);
>    svfloat32_t ay = svabs_x (pg, y);
> +  svuint32_t iax = svreinterpret_u32 (ax);
> +  svuint32_t iay = svreinterpret_u32 (ay);
> +
> +  svuint32_t sign_x = sveor_x (pg, ix, iax);
> +  svuint32_t sign_y = sveor_x (pg, iy, iay);
> +  svuint32_t sign_xy = sveor_x (pg, sign_x, sign_y);
>
> -  svbool_t pred_xlt0 = svcmplt (pg, x, 0.0);
>    svbool_t pred_aygtax = svcmpgt (pg, ay, ax);
>
>    /* Set up z for call to atan. */
> @@ -83,11 +82,12 @@ svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg)
>    svfloat32_t z = svdiv_x (pg, n, d);
>
>    /* Work out the correct shift. */
> -  svfloat32_t shift = svsel (pred_xlt0, sv_f32 (-2.0), sv_f32 (0.0));
> -  shift = svsel (pred_aygtax, svadd_x (pg, shift, 1.0), shift);
> +  svfloat32_t shift = svreinterpret_f32 (svlsr_x (pg, sign_x, 1));
> +  shift = svsel (pred_aygtax, sv_f32 (1.0), shift);
> +  shift = svreinterpret_f32 (svorr_x (pg, sign_x, svreinterpret_u32 (shift)));
>    shift = svmul_x (pg, shift, sv_f32 (data_ptr->pi_over_2));
>
> -  /* Use split Estrin scheme for P(z^2) with deg(P)=7. */
> +  /* Use pure Estrin scheme for P(z^2) with deg(P)=7. */
>    svfloat32_t z2 = svmul_x (pg, z, z);
>    svfloat32_t z4 = svmul_x (pg, z2, z2);
>    svfloat32_t z8 = svmul_x (pg, z4, z4);
> @@ -101,10 +101,12 @@ svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg)
>    ret = svadd_m (pg, ret, shift);
>
>    /* Account for the sign of x and y. */
> -  ret = svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy));
>
>    if (__glibc_unlikely (svptest_any (pg, cmp_xy)))
> -    return special_case (y, x, ret, cmp_xy);
> +    return special_case (
> +      y, x,
> +      svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy)),
> +      cmp_xy);
>
> -  return ret;
> +  return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy));
>  }
> diff --git a/sysdeps/aarch64/fpu/cos_advsimd.c b/sysdeps/aarch64/fpu/cos_advsimd.c
> index 2897e8b909..3924c9ce44 100644
> --- a/sysdeps/aarch64/fpu/cos_advsimd.c
> +++ b/sysdeps/aarch64/fpu/cos_advsimd.c
> @@ -63,8 +63,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x)
>       special-case handler later. */
>    r = vbslq_f64 (cmp, v_f64 (1.0), r);
>  #else
> -  cmp = vcageq_f64 (d->range_val, x);
> -  cmp = vceqzq_u64 (cmp); /* cmp = ~cmp. */
> +  cmp = vcageq_f64 (x, d->range_val);
>    r = x;
>  #endif
>
> diff --git a/sysdeps/aarch64/fpu/cosf_advsimd.c b/sysdeps/aarch64/fpu/cosf_advsimd.c
> index 60abc8dfcf..d0c285b03a 100644
> --- a/sysdeps/aarch64/fpu/cosf_advsimd.c
> +++ b/sysdeps/aarch64/fpu/cosf_advsimd.c
> @@ -64,8 +64,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cos) (float32x4_t x)
>       special-case handler later. */
>    r = vbslq_f32 (cmp, v_f32 (1.0f), r);
>  #else
> -  cmp = vcageq_f32 (d->range_val, x);
> -  cmp = vceqzq_u32 (cmp); /* cmp = ~cmp. */
> +  cmp = vcageq_f32 (x, d->range_val);
>    r = x;
>  #endif
>
> diff --git a/sysdeps/aarch64/fpu/exp10_advsimd.c b/sysdeps/aarch64/fpu/exp10_advsimd.c
> index fe7149b191..eeb31ca839 100644
> --- a/sysdeps/aarch64/fpu/exp10_advsimd.c
> +++ b/sysdeps/aarch64/fpu/exp10_advsimd.c
> @@ -57,7 +57,7 @@ const static struct data
>  # define BigBound v_u64 (0x4070000000000000) /* asuint64 (0x1p8). */
>  # define Thres v_u64 (0x2070000000000000) /* BigBound - TinyBound. */
>
> -static inline float64x2_t VPCS_ATTR
> +static float64x2_t VPCS_ATTR NOINLINE
>  special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
>  {
>    /* If fenv exceptions are to be triggered correctly, fall back to the scalar
> @@ -72,7 +72,7 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
>  # define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
>  # define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */
>
> -static float64x2_t VPCS_ATTR NOINLINE
> +static inline float64x2_t VPCS_ATTR
>  special_case (float64x2_t s, float64x2_t y, float64x2_t n,
>                const struct data *d)
>  {
> diff --git a/sysdeps/aarch64/fpu/exp10f_advsimd.c b/sysdeps/aarch64/fpu/exp10f_advsimd.c
> index 7ee0c90948..ab117b69da 100644
> --- a/sysdeps/aarch64/fpu/exp10f_advsimd.c
> +++ b/sysdeps/aarch64/fpu/exp10f_advsimd.c
> @@ -25,7 +25,8 @@
>  static const struct data
>  {
>    float32x4_t poly[5];
> -  float32x4_t shift, log10_2, log2_10_hi, log2_10_lo;
> +  float32x4_t log10_2_and_inv, shift;
> +
>  #if !WANT_SIMD_EXCEPT
>    float32x4_t scale_thresh;
>  #endif
> @@ -38,9 +39,9 @@ static const struct data
>    .poly = { V4 (0x1.26bb16p+1f), V4 (0x1.5350d2p+1f), V4 (0x1.04744ap+1f),
>              V4 (0x1.2d8176p+0f), V4 (0x1.12b41ap-1f) },
>    .shift = V4 (0x1.8p23f),
> -  .log10_2 = V4 (0x1.a934fp+1),
> -  .log2_10_hi = V4 (0x1.344136p-2),
> -  .log2_10_lo = V4 (-0x1.ec10cp-27),
> +
> +  /* Stores constants 1/log10(2), log10(2)_high, log10(2)_low, 0. */
> +  .log10_2_and_inv = { 0x1.a934fp+1, 0x1.344136p-2, -0x1.ec10cp-27, 0 },
>  #if !WANT_SIMD_EXCEPT
>    .scale_thresh = V4 (ScaleBound)
>  #endif
> @@ -98,24 +99,22 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x)
>  #if WANT_SIMD_EXCEPT
>    /* asuint(x) - TinyBound >= BigBound - TinyBound. */
>    uint32x4_t cmp = vcgeq_u32 (
> -      vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)),
> -                 TinyBound),
> -      Thres);
> +      vsubq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (x)), TinyBound), Thres);
>    float32x4_t xm = x;
>    /* If any lanes are special, mask them with 1 and retain a copy of x to allow
>       special case handler to fix special lanes later. This is only necessary if
>       fenv exceptions are to be triggered correctly. */
>    if (__glibc_unlikely (v_any_u32 (cmp)))
> -    x = vbslq_f32 (cmp, v_f32 (1), x);
> +    x = v_zerofy_f32 (x, cmp);
>  #endif
>
>    /* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)),
>       with poly(r) in [1/sqrt(2), sqrt(2)] and
>       x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */
> -  float32x4_t z = vfmaq_f32 (d->shift, x, d->log10_2);
> +  float32x4_t z = vfmaq_laneq_f32 (d->shift, x, d->log10_2_and_inv, 0);
>    float32x4_t n = vsubq_f32 (z, d->shift);
> -  float32x4_t r = vfmsq_f32 (x, n, d->log2_10_hi);
> -  r = vfmsq_f32 (r, n, d->log2_10_lo);
> +  float32x4_t r = vfmsq_laneq_f32 (x, n, d->log10_2_and_inv, 1);
> +  r = vfmsq_laneq_f32 (r, n, d->log10_2_and_inv, 2);
>    uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
>
>    float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
> diff --git a/sysdeps/aarch64/fpu/exp2_advsimd.c b/sysdeps/aarch64/fpu/exp2_advsimd.c
> index 391a93180c..ae1e63d503 100644
> --- a/sysdeps/aarch64/fpu/exp2_advsimd.c
> +++ b/sysdeps/aarch64/fpu/exp2_advsimd.c
> @@ -24,6 +24,7 @@
>  #define IndexMask (N - 1)
>  #define BigBound 1022.0
>  #define UOFlowBound 1280.0
> +#define TinyBound 0x2000000000000000 /* asuint64(0x1p-511). */
>
>  static const struct data
>  {
> @@ -48,14 +49,13 @@ lookup_sbits (uint64x2_t i)
>
>  #if WANT_SIMD_EXCEPT
>
> -# define TinyBound 0x2000000000000000 /* asuint64(0x1p-511). */
>  # define Thres 0x2080000000000000 /* asuint64(512.0) - TinyBound. */
>
>  /* Call scalar exp2 as a fallback. */
>  static float64x2_t VPCS_ATTR NOINLINE
> -special_case (float64x2_t x)
> +special_case (float64x2_t x, float64x2_t y, uint64x2_t is_special)
>  {
> -  return v_call_f64 (exp2, x, x, v_u64 (0xffffffffffffffff));
> +  return v_call_f64 (exp2, x, y, is_special);
>  }
>
>  #else
> @@ -65,7 +65,7 @@ special_case (float64x2_t x)
>  # define SpecialBias1 0x7000000000000000 /* 0x1p769. */
>  # define SpecialBias2 0x3010000000000000 /* 0x1p-254. */
>
> -static float64x2_t VPCS_ATTR
> +static inline float64x2_t VPCS_ATTR
>  special_case (float64x2_t s, float64x2_t y, float64x2_t n,
>                const struct data *d)
>  {
> @@ -94,10 +94,10 @@ float64x2_t V_NAME_D1 (exp2) (float64x2_t x)
>  #if WANT_SIMD_EXCEPT
>    uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
>    cmp = vcgeq_u64 (vsubq_u64 (ia, v_u64 (TinyBound)), v_u64 (Thres));
> -  /* If any special case (inf, nan, small and large x) is detected,
> -     fall back to scalar for all lanes. */
> -  if (__glibc_unlikely (v_any_u64 (cmp)))
> -    return special_case (x);
> +  /* Mask special lanes and retain a copy of x for passing to special-case
> +     handler. */
> +  float64x2_t xc = x;
> +  x = v_zerofy_f64 (x, cmp);
>  #else
>    cmp = vcagtq_f64 (x, d->scale_big_bound);
>  #endif
> @@ -120,9 +120,11 @@ float64x2_t V_NAME_D1 (exp2) (float64x2_t x)
>    float64x2_t y = v_pairwise_poly_3_f64 (r, r2, d->poly);
>    y = vmulq_f64 (r, y);
>
> -#if !WANT_SIMD_EXCEPT
>    if (__glibc_unlikely (v_any_u64 (cmp)))
> +#if !WANT_SIMD_EXCEPT
>      return special_case (s, y, n, d);
> +#else
> +    return special_case (xc, vfmaq_f64 (s, s, y), cmp);
>  #endif
>    return vfmaq_f64 (s, s, y);
>  }
> diff --git a/sysdeps/aarch64/fpu/exp2f_sve.c b/sysdeps/aarch64/fpu/exp2f_sve.c
> index 9a5a523a10..8a686e3e05 100644
> --- a/sysdeps/aarch64/fpu/exp2f_sve.c
> +++ b/sysdeps/aarch64/fpu/exp2f_sve.c
> @@ -20,6 +20,8 @@
>  #include "sv_math.h"
>  #include "poly_sve_f32.h"
>
> +#define Thres 0x1.5d5e2ap+6f
> +
>  static const struct data
>  {
>    float poly[5];
> @@ -33,7 +35,7 @@ static const struct data
>    .shift = 0x1.903f8p17f,
>    /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
>       correctly by FEXPA. */
> -  .thres = 0x1.5d5e2ap+6f,
> +  .thres = Thres,
>  };
>
>  static svfloat32_t NOINLINE
> diff --git a/sysdeps/aarch64/fpu/exp_advsimd.c b/sysdeps/aarch64/fpu/exp_advsimd.c
> index fd215f1d2c..5e3a9a0d44 100644
> --- a/sysdeps/aarch64/fpu/exp_advsimd.c
> +++ b/sysdeps/aarch64/fpu/exp_advsimd.c
> @@ -54,7 +54,7 @@ const static volatile struct
>  # define BigBound v_u64 (0x4080000000000000) /* asuint64 (0x1p9). */
>  # define SpecialBound v_u64 (0x2080000000000000) /* BigBound - TinyBound. */
>
> -static inline float64x2_t VPCS_ATTR
> +static float64x2_t VPCS_ATTR NOINLINE
>  special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
>  {
>    /* If fenv exceptions are to be triggered correctly, fall back to the scalar
> @@ -69,7 +69,7 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
>  # define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
>  # define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */
>
> -static float64x2_t VPCS_ATTR NOINLINE
> +static inline float64x2_t VPCS_ATTR
>  special_case (float64x2_t s, float64x2_t y, float64x2_t n)
>  {
>    /* 2^(n/N) may overflow, break it up into s1*s2. */
> diff --git a/sysdeps/aarch64/fpu/expm1_advsimd.c b/sysdeps/aarch64/fpu/expm1_advsimd.c
> index 0b85bd06f3..3628398674 100644
> --- a/sysdeps/aarch64/fpu/expm1_advsimd.c
> +++ b/sysdeps/aarch64/fpu/expm1_advsimd.c
> @@ -23,7 +23,7 @@
>  static const struct data
>  {
>    float64x2_t poly[11];
> -  float64x2_t invln2, ln2_lo, ln2_hi, shift;
> +  float64x2_t invln2, ln2, shift;
>    int64x2_t exponent_bias;
>  #if WANT_SIMD_EXCEPT
>    uint64x2_t thresh, tiny_bound;
> @@ -38,8 +38,7 @@ static const struct data
>              V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22),
>              V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29) },
>    .invln2 = V2 (0x1.71547652b82fep0),
> -  .ln2_hi = V2 (0x1.62e42fefa39efp-1),
> -  .ln2_lo = V2 (0x1.abc9e3b39803fp-56),
> +  .ln2 = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 },
>    .shift = V2 (0x1.8p52),
>    .exponent_bias = V2 (0x3ff0000000000000),
>  #if WANT_SIMD_EXCEPT
> @@ -83,7 +82,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
>    x = v_zerofy_f64 (x, special);
>  #else
>    /* Large input, NaNs and Infs. */
> -  uint64x2_t special = vceqzq_u64 (vcaltq_f64 (x, d->oflow_bound));
> +  uint64x2_t special = vcageq_f64 (x, d->oflow_bound);
>  #endif
>
>    /* Reduce argument to smaller range:
> @@ -93,8 +92,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
>       where 2^i is exact because i is an integer. */
>    float64x2_t n = vsubq_f64 (vfmaq_f64 (d->shift, d->invln2, x), d->shift);
>    int64x2_t i = vcvtq_s64_f64 (n);
> -  float64x2_t f = vfmsq_f64 (x, n, d->ln2_hi);
> -  f = vfmsq_f64 (f, n, d->ln2_lo);
> +  float64x2_t f = vfmsq_laneq_f64 (x, n, d->ln2, 0);
> +  f = vfmsq_laneq_f64 (f, n, d->ln2, 1);
>
>    /* Approximate expm1(f) using polynomial.
>       Taylor expansion for expm1(x) has the form:
> diff --git a/sysdeps/aarch64/fpu/expm1f_advsimd.c b/sysdeps/aarch64/fpu/expm1f_advsimd.c
> index 8d4c9a2193..93db200f61 100644
> --- a/sysdeps/aarch64/fpu/expm1f_advsimd.c
> +++ b/sysdeps/aarch64/fpu/expm1f_advsimd.c
> @@ -23,7 +23,8 @@
>  static const struct data
>  {
>    float32x4_t poly[5];
> -  float32x4_t invln2, ln2_lo, ln2_hi, shift;
> +  float32x4_t invln2_and_ln2;
> +  float32x4_t shift;
>    int32x4_t exponent_bias;
>  #if WANT_SIMD_EXCEPT
>    uint32x4_t thresh;
> @@ -34,9 +35,8 @@ static const struct data
>    /* Generated using fpminimax with degree=5 in [-log(2)/2, log(2)/2]. */
>    .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5),
>              V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) },
> -  .invln2 = V4 (0x1.715476p+0f),
> -  .ln2_hi = V4 (0x1.62e4p-1f),
> -  .ln2_lo = V4 (0x1.7f7d1cp-20f),
> +  /* Stores constants: invln2, ln2_hi, ln2_lo, 0. */
> +  .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 },
>    .shift = V4 (0x1.8p23f),
>    .exponent_bias = V4 (0x3f800000),
>  #if !WANT_SIMD_EXCEPT
> @@ -80,7 +80,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
>    x = v_zerofy_f32 (x, special);
>  #else
>    /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */
> -  uint32x4_t special = vceqzq_u32 (vcaltq_f32 (x, d->oflow_bound));
> +  uint32x4_t special = vcagtq_f32 (x, d->oflow_bound);
>  #endif
>
>    /* Reduce argument to smaller range:
> @@ -88,10 +88,11 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
>       and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
>       exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
>       where 2^i is exact because i is an integer. */
> -  float32x4_t j = vsubq_f32 (vfmaq_f32 (d->shift, d->invln2, x), d->shift);
> +  float32x4_t j = vsubq_f32 (
> +      vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0), d->shift);
>    int32x4_t i = vcvtq_s32_f32 (j);
> -  float32x4_t f = vfmsq_f32 (x, j, d->ln2_hi);
> -  f = vfmsq_f32 (f, j, d->ln2_lo);
> +  float32x4_t f = vfmsq_laneq_f32 (x, j, d->invln2_and_ln2, 1);
> +  f = vfmsq_laneq_f32 (f, j, d->invln2_and_ln2, 2);
>
>    /* Approximate expm1(f) using polynomial.
>       Taylor expansion for expm1(x) has the form:
> diff --git a/sysdeps/aarch64/fpu/log_advsimd.c b/sysdeps/aarch64/fpu/log_advsimd.c
> index 067ae79613..21df61728c 100644
> --- a/sysdeps/aarch64/fpu/log_advsimd.c
> +++ b/sysdeps/aarch64/fpu/log_advsimd.c
> @@ -58,8 +58,13 @@ lookup (uint64x2_t i)
>    uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
>    float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
>    float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
> +#if __BYTE_ORDER == __LITTLE_ENDIAN
>    e.invc = vuzp1q_f64 (e0, e1);
>    e.logc = vuzp2q_f64 (e0, e1);
> +#else
> +  e.invc = vuzp1q_f64 (e1, e0);
> +  e.logc = vuzp2q_f64 (e1, e0);
> +#endif
>    return e;
>  }
>
> diff --git a/sysdeps/aarch64/fpu/sin_advsimd.c b/sysdeps/aarch64/fpu/sin_advsimd.c
> index efce183e86..a0d9d3b819 100644
> --- a/sysdeps/aarch64/fpu/sin_advsimd.c
> +++ b/sysdeps/aarch64/fpu/sin_advsimd.c
> @@ -75,8 +75,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x)
>    r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x);
>  #else
>    r = x;
> -  cmp = vcageq_f64 (d->range_val, x);
> -  cmp = vceqzq_u64 (cmp); /* cmp = ~cmp. */
> +  cmp = vcageq_f64 (x, d->range_val);
>  #endif
>
>    /* n = rint(|x|/pi). */
> diff --git a/sysdeps/aarch64/fpu/sinf_advsimd.c b/sysdeps/aarch64/fpu/sinf_advsimd.c
> index 60cf3f2ca1..375dfc3331 100644
> --- a/sysdeps/aarch64/fpu/sinf_advsimd.c
> +++ b/sysdeps/aarch64/fpu/sinf_advsimd.c
> @@ -67,8 +67,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sin) (float32x4_t x)
>    r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x);
>  #else
>    r = x;
> -  cmp = vcageq_f32 (d->range_val, x);
> -  cmp = vceqzq_u32 (cmp); /* cmp = ~cmp. */
> +  cmp = vcageq_f32 (x, d->range_val);
>  #endif
>
>    /* n = rint(|x|/pi) */
> diff --git a/sysdeps/aarch64/fpu/tan_advsimd.c b/sysdeps/aarch64/fpu/tan_advsimd.c
> index d7e5ba7b1a..0459821ab2 100644
> --- a/sysdeps/aarch64/fpu/tan_advsimd.c
> +++ b/sysdeps/aarch64/fpu/tan_advsimd.c
> @@ -23,7 +23,7 @@
>  static const struct data
>  {
>    float64x2_t poly[9];
> -  float64x2_t half_pi_hi, half_pi_lo, two_over_pi, shift;
> +  float64x2_t half_pi, two_over_pi, shift;
>  #if !WANT_SIMD_EXCEPT
>    float64x2_t range_val;
>  #endif
> @@ -34,8 +34,7 @@ static const struct data
>              V2 (0x1.226e5e5ecdfa3p-7), V2 (0x1.d6c7ddbf87047p-9),
>              V2 (0x1.7ea75d05b583ep-10), V2 (0x1.289f22964a03cp-11),
>              V2 (0x1.4e4fd14147622p-12) },
> -  .half_pi_hi = V2 (0x1.921fb54442d18p0),
> -  .half_pi_lo = V2 (0x1.1a62633145c07p-54),
> +  .half_pi = { 0x1.921fb54442d18p0, 0x1.1a62633145c07p-54 },
>    .two_over_pi = V2 (0x1.45f306dc9c883p-1),
>    .shift = V2 (0x1.8p52),
>  #if !WANT_SIMD_EXCEPT
> @@ -56,15 +55,15 @@ special_case (float64x2_t x)
>
>  /* Vector approximation for double-precision tan.
>     Maximum measured error is 3.48 ULP:
> -   __v_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37
> -                                 want -0x1.f6ccd8ecf7deap+37. */
> +   _ZGVnN2v_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37
> +                                       want -0x1.f6ccd8ecf7deap+37. */
>  float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
>  {
>    const struct data *dat = ptr_barrier (&data);
> -  /* Our argument reduction cannot calculate q with sufficient accuracy for very
> -     large inputs. Fall back to scalar routine for all lanes if any are too
> -     large, or Inf/NaN. If fenv exceptions are expected, also fall back for tiny
> -     input to avoid underflow. */
> +  /* Our argument reduction cannot calculate q with sufficient accuracy for
> +     very large inputs. Fall back to scalar routine for all lanes if any are
> +     too large, or Inf/NaN. If fenv exceptions are expected, also fall back for
> +     tiny input to avoid underflow. */
>  #if WANT_SIMD_EXCEPT
>    uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
>    /* iax - tiny_bound > range_val - tiny_bound. */
> @@ -82,8 +81,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
>    /* Use q to reduce x to r in [-pi/4, pi/4], by:
>       r = x - q * pi/2, in extended precision. */
>    float64x2_t r = x;
> -  r = vfmsq_f64 (r, q, dat->half_pi_hi);
> -  r = vfmsq_f64 (r, q, dat->half_pi_lo);
> +  r = vfmsq_laneq_f64 (r, q, dat->half_pi, 0);
> +  r = vfmsq_laneq_f64 (r, q, dat->half_pi, 1);
>    /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
>       formula. */
>    r = vmulq_n_f64 (r, 0.5);
> @@ -106,14 +105,15 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
>       and reciprocity around pi/2:
>        tan(x) = 1 / (tan(pi/2 - x))
>     to assemble result using change-of-sign and conditional selection of
> -   numerator/denominator, dependent on odd/even-ness of q (hence quadrant). */
> +   numerator/denominator, dependent on odd/even-ness of q (hence quadrant).
> +  */
>    float64x2_t n = vfmaq_f64 (v_f64 (-1), p, p);
>    float64x2_t d = vaddq_f64 (p, p);
>
>    uint64x2_t no_recip = vtstq_u64 (vreinterpretq_u64_s64 (qi), v_u64 (1));
>
>  #if !WANT_SIMD_EXCEPT
> -  uint64x2_t special = vceqzq_u64 (vcaleq_f64 (x, dat->range_val));
> +  uint64x2_t special = vcageq_f64 (x, dat->range_val);
>    if (__glibc_unlikely (v_any_u64 (special)))
>      return special_case (x);
>  #endif
> diff --git a/sysdeps/aarch64/fpu/tanf_advsimd.c b/sysdeps/aarch64/fpu/tanf_advsimd.c
> index 1f16103f8a..5a7489390a 100644
> --- a/sysdeps/aarch64/fpu/tanf_advsimd.c
> +++ b/sysdeps/aarch64/fpu/tanf_advsimd.c
> @@ -23,7 +23,8 @@
>  static const struct data
>  {
>    float32x4_t poly[6];
> -  float32x4_t neg_half_pi_1, neg_half_pi_2, neg_half_pi_3, two_over_pi, shift;
> +  float32x4_t pi_consts;
> +  float32x4_t shift;
>  #if !WANT_SIMD_EXCEPT
>    float32x4_t range_val;
>  #endif
> @@ -31,10 +32,9 @@
>    /* Coefficients generated using FPMinimax. */
>    .poly = { V4 (0x1.55555p-2f), V4 (0x1.11166p-3f), V4 (0x1.b88a78p-5f),
>              V4 (0x1.7b5756p-6f), V4 (0x1.4ef4cep-8f), V4 (0x1.0e1e74p-7f) },
> -  .neg_half_pi_1 = V4 (-0x1.921fb6p+0f),
> -  .neg_half_pi_2 = V4 (0x1.777a5cp-25f),
> -  .neg_half_pi_3 = V4 (0x1.ee59dap-50f),
> -  .two_over_pi = V4 (0x1.45f306p-1f),
> +  /* Stores constants: (-pi/2)_high, (-pi/2)_mid, (-pi/2)_low, and 2/pi. */
> +  .pi_consts
> +      = { -0x1.921fb6p+0f, 0x1.777a5cp-25f, 0x1.ee59dap-50f, 0x1.45f306p-1f },
>    .shift = V4 (0x1.8p+23f),
>  #if !WANT_SIMD_EXCEPT
>    .range_val = V4 (0x1p15f),
> @@ -58,10 +58,11 @@ eval_poly (float32x4_t z, const struct data *d)
>  {
>    float32x4_t z2 = vmulq_f32 (z, z);
>  #if WANT_SIMD_EXCEPT
> -  /* Tiny z (<= 0x1p-31) will underflow when calculating z^4. If fp exceptions
> -     are to be triggered correctly, sidestep this by fixing such lanes to 0. */
> +  /* Tiny z (<= 0x1p-31) will underflow when calculating z^4.
> +     If fp exceptions are to be triggered correctly,
> +     sidestep this by fixing such lanes to 0. */
>    uint32x4_t will_uflow
> -      = vcleq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (z)), TinyBound);
> +      = vcleq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (z)), TinyBound);
>    if (__glibc_unlikely (v_any_u32 (will_uflow)))
>      z2 = vbslq_f32 (will_uflow, v_f32 (0), z2);
>  #endif
> @@ -94,16 +95,16 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tan) (float32x4_t x)
>  #endif
>
>    /* n = rint(x/(pi/2)). */
> -  float32x4_t q = vfmaq_f32 (d->shift, d->two_over_pi, x);
> +  float32x4_t q = vfmaq_laneq_f32 (d->shift, x, d->pi_consts, 3);
>    float32x4_t n = vsubq_f32 (q, d->shift);
>    /* Determine if x lives in an interval, where |tan(x)| grows to infinity. */
>    uint32x4_t pred_alt = vtstq_u32 (vreinterpretq_u32_f32 (q), v_u32 (1));
>
>    /* r = x - n * (pi/2) (range reduction into -pi./4 .. pi/4). */
>    float32x4_t r;
> -  r = vfmaq_f32 (x, d->neg_half_pi_1, n);
> -  r = vfmaq_f32 (r, d->neg_half_pi_2, n);
> -  r = vfmaq_f32 (r, d->neg_half_pi_3, n);
> +  r = vfmaq_laneq_f32 (x, n, d->pi_consts, 0);
> +  r = vfmaq_laneq_f32 (r, n, d->pi_consts, 1);
> +  r = vfmaq_laneq_f32 (r, n, d->pi_consts, 2);
>
>    /* If x lives in an interval, where |tan(x)|
>       - is finite, then use a polynomial approximation of the form
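The recurring optimisation here - replacing several broadcast constants with one packed vector plus lane-indexed FMLA/FMLS - can be sketched in isolation. The following standalone function is illustrative only (the name reduce and its scaffolding are not from the patch; the constants mirror the expm1f ones quoted above):

#include <arm_neon.h>

/* invln2, ln2_hi, ln2_lo packed into one vector, with a zero pad lane,
   so each fused multiply can index a lane instead of loading a separate
   broadcast register per constant.  */
static const float32x4_t invln2_and_ln2
    = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 };

float32x4_t
reduce (float32x4_t x)
{
  /* j = round(x * invln2), via the round-to-nearest shift trick.  */
  float32x4_t shift = vdupq_n_f32 (0x1.8p23f);
  float32x4_t j
      = vsubq_f32 (vfmaq_laneq_f32 (shift, x, invln2_and_ln2, 0), shift);
  /* f = x - j * ln2_hi - j * ln2_lo, one indexed FMLS per step.  */
  float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1);
  return vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2);
}

With vfmaq_laneq_f32/vfmsq_laneq_f32 the multiplier is selected by lane index from a single register, so the struct needs one field where it previously needed three.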
Thanks Adhemerval. I don't have commit rights - would you be able to
commit this for me?

On 23/02/2024 12:19, Adhemerval Zanella Netto wrote:
>
> On 20/02/24 13:44, Joe Ramsay wrote:
>> This includes a fix for big-endian in AdvSIMD log, some cosmetic
>> changes, and numerous small optimisations mainly around inlining and
>> using indexed variants of MLA intrinsics.
> LGTM, thanks.
>
> Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
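For anyone checking the reworked shift computation in atan2_sve.c against the old compare-and-select version, here is a scalar sketch of the sign-bit trick (a standalone test harness, not part of the patch):

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint64_t
asuint64 (double d)
{
  uint64_t i;
  memcpy (&i, &d, sizeof i);
  return i;
}

static double
asdouble (uint64_t i)
{
  double d;
  memcpy (&d, &i, sizeof d);
  return d;
}

int
main (void)
{
  for (int xneg = 0; xneg < 2; xneg++)
    for (int aygtax = 0; aygtax < 2; aygtax++)
      {
	double x = xneg ? -3.0 : 3.0;

	/* Old formulation: compare and select.  */
	double old_shift = x < 0 ? -2.0 : 0.0;
	if (aygtax)
	  old_shift += 1.0;

	/* New formulation: sign_x = ix ^ asuint(|x|) isolates the sign
	   bit; shifting it right by one yields the bit pattern of 2.0
	   when x < 0 and of 0.0 otherwise, and OR-ing the sign bit back
	   in negates the selected value.  */
	uint64_t sign_x = asuint64 (x) ^ asuint64 (fabs (x));
	double new_shift = asdouble (sign_x >> 1);
	if (aygtax)
	  new_shift = 1.0;
	new_shift = asdouble (sign_x | asuint64 (new_shift));

	printf ("x<0=%d ay>ax=%d: old=%g new=%g\n", xneg, aygtax,
		old_shift, new_shift);
      }
  return 0;
}

All four sign/quadrant combinations print matching old/new shifts (0, 1, -2, -1), which the routine then scales by pi/2 - so the predicate on x < 0 can be dropped entirely.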
diff --git a/sysdeps/aarch64/fpu/acos_advsimd.c b/sysdeps/aarch64/fpu/acos_advsimd.c index a8eabb5e71..0a86c9823a 100644 --- a/sysdeps/aarch64/fpu/acos_advsimd.c +++ b/sysdeps/aarch64/fpu/acos_advsimd.c @@ -40,8 +40,8 @@ static const struct data }; #define AllMask v_u64 (0xffffffffffffffff) -#define Oneu (0x3ff0000000000000) -#define Small (0x3e50000000000000) /* 2^-53. */ +#define Oneu 0x3ff0000000000000 +#define Small 0x3e50000000000000 /* 2^-53. */ #if WANT_SIMD_EXCEPT static float64x2_t VPCS_ATTR NOINLINE diff --git a/sysdeps/aarch64/fpu/asin_advsimd.c b/sysdeps/aarch64/fpu/asin_advsimd.c index 141646e954..2de6eff407 100644 --- a/sysdeps/aarch64/fpu/asin_advsimd.c +++ b/sysdeps/aarch64/fpu/asin_advsimd.c @@ -39,8 +39,8 @@ static const struct data }; #define AllMask v_u64 (0xffffffffffffffff) -#define One (0x3ff0000000000000) -#define Small (0x3e50000000000000) /* 2^-12. */ +#define One 0x3ff0000000000000 +#define Small 0x3e50000000000000 /* 2^-12. */ #if WANT_SIMD_EXCEPT static float64x2_t VPCS_ATTR NOINLINE diff --git a/sysdeps/aarch64/fpu/atan2_sve.c b/sysdeps/aarch64/fpu/atan2_sve.c index 09a4c559b8..04fa71fa37 100644 --- a/sysdeps/aarch64/fpu/atan2_sve.c +++ b/sysdeps/aarch64/fpu/atan2_sve.c @@ -37,9 +37,6 @@ static const struct data .pi_over_2 = 0x1.921fb54442d18p+0, }; -/* Useful constants. */ -#define SignMask sv_u64 (0x8000000000000000) - /* Special cases i.e. 0, infinity, nan (fall back to scalar calls). */ static svfloat64_t NOINLINE special_case (svfloat64_t y, svfloat64_t x, svfloat64_t ret, @@ -72,14 +69,15 @@ svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg) svbool_t cmp_y = zeroinfnan (iy, pg); svbool_t cmp_xy = svorr_z (pg, cmp_x, cmp_y); - svuint64_t sign_x = svand_x (pg, ix, SignMask); - svuint64_t sign_y = svand_x (pg, iy, SignMask); - svuint64_t sign_xy = sveor_x (pg, sign_x, sign_y); - svfloat64_t ax = svabs_x (pg, x); svfloat64_t ay = svabs_x (pg, y); + svuint64_t iax = svreinterpret_u64 (ax); + svuint64_t iay = svreinterpret_u64 (ay); + + svuint64_t sign_x = sveor_x (pg, ix, iax); + svuint64_t sign_y = sveor_x (pg, iy, iay); + svuint64_t sign_xy = sveor_x (pg, sign_x, sign_y); - svbool_t pred_xlt0 = svcmplt (pg, x, 0.0); svbool_t pred_aygtax = svcmpgt (pg, ay, ax); /* Set up z for call to atan. */ @@ -88,8 +86,9 @@ svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg) svfloat64_t z = svdiv_x (pg, n, d); /* Work out the correct shift. */ - svfloat64_t shift = svsel (pred_xlt0, sv_f64 (-2.0), sv_f64 (0.0)); - shift = svsel (pred_aygtax, svadd_x (pg, shift, 1.0), shift); + svfloat64_t shift = svreinterpret_f64 (svlsr_x (pg, sign_x, 1)); + shift = svsel (pred_aygtax, sv_f64 (1.0), shift); + shift = svreinterpret_f64 (svorr_x (pg, sign_x, svreinterpret_u64 (shift))); shift = svmul_x (pg, shift, data_ptr->pi_over_2); /* Use split Estrin scheme for P(z^2) with deg(P)=19. */ @@ -109,10 +108,10 @@ svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg) ret = svadd_m (pg, ret, shift); /* Account for the sign of x and y. 
diff --git a/sysdeps/aarch64/fpu/atan2f_sve.c b/sysdeps/aarch64/fpu/atan2f_sve.c
index b92f83cdea..9ea197147c 100644
--- a/sysdeps/aarch64/fpu/atan2f_sve.c
+++ b/sysdeps/aarch64/fpu/atan2f_sve.c
@@ -32,10 +32,8 @@ static const struct data
   .pi_over_2 = 0x1.921fb6p+0f,
 };
 
-#define SignMask sv_u32 (0x80000000)
-
 /* Special cases i.e. 0, infinity, nan (fall back to scalar calls).  */
-static inline svfloat32_t
+static svfloat32_t NOINLINE
 special_case (svfloat32_t y, svfloat32_t x, svfloat32_t ret,
               const svbool_t cmp)
 {
@@ -67,14 +65,15 @@ svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg)
   svbool_t cmp_y = zeroinfnan (iy, pg);
   svbool_t cmp_xy = svorr_z (pg, cmp_x, cmp_y);
 
-  svuint32_t sign_x = svand_x (pg, ix, SignMask);
-  svuint32_t sign_y = svand_x (pg, iy, SignMask);
-  svuint32_t sign_xy = sveor_x (pg, sign_x, sign_y);
-
   svfloat32_t ax = svabs_x (pg, x);
   svfloat32_t ay = svabs_x (pg, y);
+  svuint32_t iax = svreinterpret_u32 (ax);
+  svuint32_t iay = svreinterpret_u32 (ay);
+
+  svuint32_t sign_x = sveor_x (pg, ix, iax);
+  svuint32_t sign_y = sveor_x (pg, iy, iay);
+  svuint32_t sign_xy = sveor_x (pg, sign_x, sign_y);
 
-  svbool_t pred_xlt0 = svcmplt (pg, x, 0.0);
   svbool_t pred_aygtax = svcmpgt (pg, ay, ax);
 
   /* Set up z for call to atan.  */
@@ -83,11 +82,12 @@ svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg)
   svfloat32_t z = svdiv_x (pg, n, d);
 
   /* Work out the correct shift.  */
-  svfloat32_t shift = svsel (pred_xlt0, sv_f32 (-2.0), sv_f32 (0.0));
-  shift = svsel (pred_aygtax, svadd_x (pg, shift, 1.0), shift);
+  svfloat32_t shift = svreinterpret_f32 (svlsr_x (pg, sign_x, 1));
+  shift = svsel (pred_aygtax, sv_f32 (1.0), shift);
+  shift = svreinterpret_f32 (svorr_x (pg, sign_x, svreinterpret_u32 (shift)));
   shift = svmul_x (pg, shift, sv_f32 (data_ptr->pi_over_2));
 
-  /* Use split Estrin scheme for P(z^2) with deg(P)=7.  */
+  /* Use pure Estrin scheme for P(z^2) with deg(P)=7.  */
   svfloat32_t z2 = svmul_x (pg, z, z);
   svfloat32_t z4 = svmul_x (pg, z2, z2);
   svfloat32_t z8 = svmul_x (pg, z4, z4);
@@ -101,10 +101,12 @@ svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg)
   ret = svadd_m (pg, ret, shift);
 
   /* Account for the sign of x and y.  */
-  ret = svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy));
 
   if (__glibc_unlikely (svptest_any (pg, cmp_xy)))
-    return special_case (y, x, ret, cmp_xy);
+    return special_case (
+        y, x,
+        svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy)),
+        cmp_xy);
 
-  return ret;
+  return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (ret), sign_xy));
 }
diff --git a/sysdeps/aarch64/fpu/cos_advsimd.c b/sysdeps/aarch64/fpu/cos_advsimd.c
index 2897e8b909..3924c9ce44 100644
--- a/sysdeps/aarch64/fpu/cos_advsimd.c
+++ b/sysdeps/aarch64/fpu/cos_advsimd.c
@@ -63,8 +63,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x)
      special-case handler later.  */
   r = vbslq_f64 (cmp, v_f64 (1.0), r);
 #else
-  cmp = vcageq_f64 (d->range_val, x);
-  cmp = vceqzq_u64 (cmp); /* cmp = ~cmp.  */
+  cmp = vcageq_f64 (x, d->range_val);
   r = x;
 #endif
 
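Several routines in this patch (cos, sin, expm1, tan) make the same micro-optimisation: the compare-then-invert pair becomes a single absolute-value compare with the operands swapped. A minimal sketch of the new test (ours; the function name is illustrative):

    #include <arm_neon.h>

    /* Lane is all-ones when |x| >= range_val.  Note NaN lanes compare
       false here, whereas the old inverted compare flagged them; this
       presumably relies on NaNs propagating through the main path.  */
    static inline uint64x2_t
    out_of_range (float64x2_t x, float64x2_t range_val)
    {
      return vcageq_f64 (x, range_val);
    }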
diff --git a/sysdeps/aarch64/fpu/cosf_advsimd.c b/sysdeps/aarch64/fpu/cosf_advsimd.c
index 60abc8dfcf..d0c285b03a 100644
--- a/sysdeps/aarch64/fpu/cosf_advsimd.c
+++ b/sysdeps/aarch64/fpu/cosf_advsimd.c
@@ -64,8 +64,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cos) (float32x4_t x)
      special-case handler later.  */
   r = vbslq_f32 (cmp, v_f32 (1.0f), r);
 #else
-  cmp = vcageq_f32 (d->range_val, x);
-  cmp = vceqzq_u32 (cmp); /* cmp = ~cmp.  */
+  cmp = vcageq_f32 (x, d->range_val);
   r = x;
 #endif
 
diff --git a/sysdeps/aarch64/fpu/exp10_advsimd.c b/sysdeps/aarch64/fpu/exp10_advsimd.c
index fe7149b191..eeb31ca839 100644
--- a/sysdeps/aarch64/fpu/exp10_advsimd.c
+++ b/sysdeps/aarch64/fpu/exp10_advsimd.c
@@ -57,7 +57,7 @@ const static struct data
 # define BigBound v_u64 (0x4070000000000000) /* asuint64 (0x1p8).  */
 # define Thres v_u64 (0x2070000000000000) /* BigBound - TinyBound.  */
 
-static inline float64x2_t VPCS_ATTR
+static float64x2_t VPCS_ATTR NOINLINE
 special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
 {
   /* If fenv exceptions are to be triggered correctly, fall back to the scalar
@@ -72,7 +72,7 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
 # define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769.  */
 # define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254.  */
 
-static float64x2_t VPCS_ATTR NOINLINE
+static inline float64x2_t VPCS_ATTR
 special_case (float64x2_t s, float64x2_t y, float64x2_t n,
               const struct data *d)
 {
diff --git a/sysdeps/aarch64/fpu/exp10f_advsimd.c b/sysdeps/aarch64/fpu/exp10f_advsimd.c
index 7ee0c90948..ab117b69da 100644
--- a/sysdeps/aarch64/fpu/exp10f_advsimd.c
+++ b/sysdeps/aarch64/fpu/exp10f_advsimd.c
@@ -25,7 +25,8 @@
 static const struct data
 {
   float32x4_t poly[5];
-  float32x4_t shift, log10_2, log2_10_hi, log2_10_lo;
+  float32x4_t log10_2_and_inv, shift;
+
 #if !WANT_SIMD_EXCEPT
   float32x4_t scale_thresh;
 #endif
@@ -38,9 +39,9 @@ static const struct data
   .poly = { V4 (0x1.26bb16p+1f), V4 (0x1.5350d2p+1f), V4 (0x1.04744ap+1f),
             V4 (0x1.2d8176p+0f), V4 (0x1.12b41ap-1f) },
   .shift = V4 (0x1.8p23f),
-  .log10_2 = V4 (0x1.a934fp+1),
-  .log2_10_hi = V4 (0x1.344136p-2),
-  .log2_10_lo = V4 (-0x1.ec10cp-27),
+
+  /* Stores constants 1/log10(2), log10(2)_high, log10(2)_low, 0.  */
+  .log10_2_and_inv = { 0x1.a934fp+1, 0x1.344136p-2, -0x1.ec10cp-27, 0 },
 #if !WANT_SIMD_EXCEPT
   .scale_thresh = V4 (ScaleBound)
 #endif
@@ -98,24 +99,22 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10) (float32x4_t x)
 #if WANT_SIMD_EXCEPT
   /* asuint(x) - TinyBound >= BigBound - TinyBound.  */
   uint32x4_t cmp = vcgeq_u32 (
-      vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)),
-                 TinyBound),
-      Thres);
+      vsubq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (x)), TinyBound), Thres);
   float32x4_t xm = x;
   /* If any lanes are special, mask them with 1 and retain a copy of x to allow
      special case handler to fix special lanes later. This is only necessary if
      fenv exceptions are to be triggered correctly.  */
   if (__glibc_unlikely (v_any_u32 (cmp)))
-    x = vbslq_f32 (cmp, v_f32 (1), x);
+    x = v_zerofy_f32 (x, cmp);
 #endif
 
   /* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)),
      with poly(r) in [1/sqrt(2), sqrt(2)] and
     x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2].  */
-  float32x4_t z = vfmaq_f32 (d->shift, x, d->log10_2);
+  float32x4_t z = vfmaq_laneq_f32 (d->shift, x, d->log10_2_and_inv, 0);
   float32x4_t n = vsubq_f32 (z, d->shift);
-  float32x4_t r = vfmsq_f32 (x, n, d->log2_10_hi);
-  r = vfmsq_f32 (r, n, d->log2_10_lo);
+  float32x4_t r = vfmsq_laneq_f32 (x, n, d->log10_2_and_inv, 1);
+  r = vfmsq_laneq_f32 (r, n, d->log10_2_and_inv, 2);
   uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
 
   float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, ExponentBias));
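The recurring optimisation in this hunk (and in expm1f/tanf below) is packing related scalar constants into the lanes of a single vector so the indexed form of FMLA/FMLS can select a lane, keeping the whole set in one register. A compilable sketch of the exp10f-style reduction using this layout (ours; the names are illustrative, not from the patch):

    #include <arm_neon.h>

    /* Lane layout matches the patch: { 1/log10(2), log10(2)_hi,
       log10(2)_lo, 0 }.  */
    static const float log10_2_and_inv[4]
        = { 0x1.a934fp+1f, 0x1.344136p-2f, -0x1.ec10cp-27f, 0 };

    static float32x4_t
    reduce (float32x4_t x, float32x4_t shift, float32x4_t *n)
    {
      float32x4_t consts = vld1q_f32 (log10_2_and_inv);
      /* z = shift + x * (1/log10(2)), picking lane 0.  */
      float32x4_t z = vfmaq_laneq_f32 (shift, x, consts, 0);
      *n = vsubq_f32 (z, shift);
      /* r = x - n*log10(2)_hi - n*log10(2)_lo, lanes 1 and 2.  */
      float32x4_t r = vfmsq_laneq_f32 (x, *n, consts, 1);
      return vfmsq_laneq_f32 (r, *n, consts, 2);
    }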
diff --git a/sysdeps/aarch64/fpu/exp2_advsimd.c b/sysdeps/aarch64/fpu/exp2_advsimd.c
index 391a93180c..ae1e63d503 100644
--- a/sysdeps/aarch64/fpu/exp2_advsimd.c
+++ b/sysdeps/aarch64/fpu/exp2_advsimd.c
@@ -24,6 +24,7 @@
 #define IndexMask (N - 1)
 #define BigBound 1022.0
 #define UOFlowBound 1280.0
+#define TinyBound 0x2000000000000000 /* asuint64(0x1p-511).  */
 
 static const struct data
 {
@@ -48,14 +49,13 @@ lookup_sbits (uint64x2_t i)
 
 #if WANT_SIMD_EXCEPT
 
-# define TinyBound 0x2000000000000000 /* asuint64(0x1p-511).  */
 # define Thres 0x2080000000000000 /* asuint64(512.0) - TinyBound.  */
 
 /* Call scalar exp2 as a fallback.  */
 static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t x)
+special_case (float64x2_t x, float64x2_t y, uint64x2_t is_special)
 {
-  return v_call_f64 (exp2, x, x, v_u64 (0xffffffffffffffff));
+  return v_call_f64 (exp2, x, y, is_special);
 }
 
 #else
@@ -65,7 +65,7 @@ special_case (float64x2_t x)
 # define SpecialBias1 0x7000000000000000 /* 0x1p769.  */
 # define SpecialBias2 0x3010000000000000 /* 0x1p-254.  */
 
-static float64x2_t VPCS_ATTR
+static inline float64x2_t VPCS_ATTR
 special_case (float64x2_t s, float64x2_t y, float64x2_t n,
               const struct data *d)
 {
@@ -94,10 +94,10 @@ float64x2_t V_NAME_D1 (exp2) (float64x2_t x)
 #if WANT_SIMD_EXCEPT
   uint64x2_t ia = vreinterpretq_u64_f64 (vabsq_f64 (x));
   cmp = vcgeq_u64 (vsubq_u64 (ia, v_u64 (TinyBound)), v_u64 (Thres));
-  /* If any special case (inf, nan, small and large x) is detected,
-     fall back to scalar for all lanes.  */
-  if (__glibc_unlikely (v_any_u64 (cmp)))
-    return special_case (x);
+  /* Mask special lanes and retain a copy of x for passing to special-case
+     handler.  */
+  float64x2_t xc = x;
+  x = v_zerofy_f64 (x, cmp);
 #else
   cmp = vcagtq_f64 (x, d->scale_big_bound);
 #endif
@@ -120,9 +120,11 @@ float64x2_t V_NAME_D1 (exp2) (float64x2_t x)
   float64x2_t y = v_pairwise_poly_3_f64 (r, r2, d->poly);
   y = vmulq_f64 (r, y);
 
-#if !WANT_SIMD_EXCEPT
   if (__glibc_unlikely (v_any_u64 (cmp)))
+#if !WANT_SIMD_EXCEPT
     return special_case (s, y, n, d);
+#else
+    return special_case (xc, vfmaq_f64 (s, s, y), cmp);
 #endif
   return vfmaq_f64 (s, s, y);
 }
diff --git a/sysdeps/aarch64/fpu/exp2f_sve.c b/sysdeps/aarch64/fpu/exp2f_sve.c
index 9a5a523a10..8a686e3e05 100644
--- a/sysdeps/aarch64/fpu/exp2f_sve.c
+++ b/sysdeps/aarch64/fpu/exp2f_sve.c
@@ -20,6 +20,8 @@
 #include "sv_math.h"
 #include "poly_sve_f32.h"
 
+#define Thres 0x1.5d5e2ap+6f
+
 static const struct data
 {
   float poly[5];
@@ -33,7 +35,7 @@ static const struct data
   .shift = 0x1.903f8p17f,
   /* Roughly 87.3. For x < -Thres, the result is subnormal
      and not handled correctly by FEXPA.  */
-  .thres = 0x1.5d5e2ap+6f,
+  .thres = Thres,
 };
 
 static svfloat32_t NOINLINE
diff --git a/sysdeps/aarch64/fpu/exp_advsimd.c b/sysdeps/aarch64/fpu/exp_advsimd.c
index fd215f1d2c..5e3a9a0d44 100644
--- a/sysdeps/aarch64/fpu/exp_advsimd.c
+++ b/sysdeps/aarch64/fpu/exp_advsimd.c
@@ -54,7 +54,7 @@ const static volatile struct
 # define BigBound v_u64 (0x4080000000000000) /* asuint64 (0x1p9).  */
 # define SpecialBound v_u64 (0x2080000000000000) /* BigBound - TinyBound.  */
 
-static inline float64x2_t VPCS_ATTR
+static float64x2_t VPCS_ATTR NOINLINE
 special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
 {
   /* If fenv exceptions are to be triggered correctly, fall back to the scalar
@@ -69,7 +69,7 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
 # define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769.  */
 # define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254.  */
 
-static float64x2_t VPCS_ATTR NOINLINE
+static inline float64x2_t VPCS_ATTR
 special_case (float64x2_t s, float64x2_t y, float64x2_t n)
 {
   /* 2^(n/N) may overflow, break it up into s1*s2.  */
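exp2's WANT_SIMD_EXCEPT path now zeroes the special lanes up front instead of bailing out to scalar before any work is done, keeping a copy (xc) so the handler can recompute only the flagged lanes. v_zerofy_f64 is a glibc-internal helper; a plausible definition (our assumption, not quoted from glibc) is a lane-wise BIC:

    #include <arm_neon.h>

    /* Assumed shape of the helper: clear the lanes of x whose bits are
       set in special, i.e. x & ~special lane-wise.  Zero is a safe input
       for the rest of the computation and raises no fp exceptions.  */
    static inline float64x2_t
    zerofy_f64 (float64x2_t x, uint64x2_t special)
    {
      return vreinterpretq_f64_u64 (
          vbicq_u64 (vreinterpretq_u64_f64 (x), special));
    }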
diff --git a/sysdeps/aarch64/fpu/expm1_advsimd.c b/sysdeps/aarch64/fpu/expm1_advsimd.c
index 0b85bd06f3..3628398674 100644
--- a/sysdeps/aarch64/fpu/expm1_advsimd.c
+++ b/sysdeps/aarch64/fpu/expm1_advsimd.c
@@ -23,7 +23,7 @@
 static const struct data
 {
   float64x2_t poly[11];
-  float64x2_t invln2, ln2_lo, ln2_hi, shift;
+  float64x2_t invln2, ln2, shift;
   int64x2_t exponent_bias;
 #if WANT_SIMD_EXCEPT
   uint64x2_t thresh, tiny_bound;
@@ -38,8 +38,7 @@ static const struct data
             V2 (0x1.71ddf82db5bb4p-19), V2 (0x1.27e517fc0d54bp-22),
             V2 (0x1.af5eedae67435p-26), V2 (0x1.1f143d060a28ap-29) },
   .invln2 = V2 (0x1.71547652b82fep0),
-  .ln2_hi = V2 (0x1.62e42fefa39efp-1),
-  .ln2_lo = V2 (0x1.abc9e3b39803fp-56),
+  .ln2 = { 0x1.62e42fefa39efp-1, 0x1.abc9e3b39803fp-56 },
   .shift = V2 (0x1.8p52),
   .exponent_bias = V2 (0x3ff0000000000000),
 #if WANT_SIMD_EXCEPT
@@ -83,7 +82,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
   x = v_zerofy_f64 (x, special);
 #else
   /* Large input, NaNs and Infs.  */
-  uint64x2_t special = vceqzq_u64 (vcaltq_f64 (x, d->oflow_bound));
+  uint64x2_t special = vcageq_f64 (x, d->oflow_bound);
 #endif
 
   /* Reduce argument to smaller range:
@@ -93,8 +92,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (expm1) (float64x2_t x)
      where 2^i is exact because i is an integer.  */
   float64x2_t n = vsubq_f64 (vfmaq_f64 (d->shift, d->invln2, x), d->shift);
   int64x2_t i = vcvtq_s64_f64 (n);
-  float64x2_t f = vfmsq_f64 (x, n, d->ln2_hi);
-  f = vfmsq_f64 (f, n, d->ln2_lo);
+  float64x2_t f = vfmsq_laneq_f64 (x, n, d->ln2, 0);
+  f = vfmsq_laneq_f64 (f, n, d->ln2, 1);
 
   /* Approximate expm1(f) using polynomial.
      Taylor expansion for expm1(x) has the form:
diff --git a/sysdeps/aarch64/fpu/expm1f_advsimd.c b/sysdeps/aarch64/fpu/expm1f_advsimd.c
index 8d4c9a2193..93db200f61 100644
--- a/sysdeps/aarch64/fpu/expm1f_advsimd.c
+++ b/sysdeps/aarch64/fpu/expm1f_advsimd.c
@@ -23,7 +23,8 @@
 static const struct data
 {
   float32x4_t poly[5];
-  float32x4_t invln2, ln2_lo, ln2_hi, shift;
+  float32x4_t invln2_and_ln2;
+  float32x4_t shift;
   int32x4_t exponent_bias;
 #if WANT_SIMD_EXCEPT
   uint32x4_t thresh;
@@ -34,9 +35,8 @@ static const struct data
   /* Generated using fpminimax with degree=5 in [-log(2)/2, log(2)/2].  */
   .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5),
             V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) },
-  .invln2 = V4 (0x1.715476p+0f),
-  .ln2_hi = V4 (0x1.62e4p-1f),
-  .ln2_lo = V4 (0x1.7f7d1cp-20f),
+  /* Stores constants: invln2, ln2_hi, ln2_lo, 0.  */
+  .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 },
   .shift = V4 (0x1.8p23f),
   .exponent_bias = V4 (0x3f800000),
 #if !WANT_SIMD_EXCEPT
@@ -80,7 +80,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
   x = v_zerofy_f32 (x, special);
 #else
   /* Handles very large values (+ve and -ve), +/-NaN, +/-Inf.  */
-  uint32x4_t special = vceqzq_u32 (vcaltq_f32 (x, d->oflow_bound));
+  uint32x4_t special = vcagtq_f32 (x, d->oflow_bound);
 #endif
 
   /* Reduce argument to smaller range:
@@ -88,10 +88,11 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
      and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
      exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
      where 2^i is exact because i is an integer.  */
-  float32x4_t j = vsubq_f32 (vfmaq_f32 (d->shift, d->invln2, x), d->shift);
+  float32x4_t j = vsubq_f32 (
+      vfmaq_laneq_f32 (d->shift, x, d->invln2_and_ln2, 0), d->shift);
   int32x4_t i = vcvtq_s32_f32 (j);
-  float32x4_t f = vfmsq_f32 (x, j, d->ln2_hi);
-  f = vfmsq_f32 (f, j, d->ln2_lo);
+  float32x4_t f = vfmsq_laneq_f32 (x, j, d->invln2_and_ln2, 1);
+  f = vfmsq_laneq_f32 (f, j, d->invln2_and_ln2, 2);
 
   /* Approximate expm1(f) using polynomial.
      Taylor expansion for expm1(x) has the form:
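The ln2 hi/lo split that these reductions rely on is worth spelling out: ln2_hi carries only the leading significand bits (so j * ln2_hi is exact for moderate j), and ln2_lo folds in the discarded tail, keeping f = x - j*ln2 accurate even when the subtraction nearly cancels. A scalar demonstration with the patch's own float constants (our illustration; the nearest-int rounding shortcut assumes x > 0):

    #include <stdio.h>

    int
    main (void)
    {
      float invln2 = 0x1.715476p+0f;
      float ln2_hi = 0x1.62e4p-1f;   /* top bits of log(2) only.  */
      float ln2_lo = 0x1.7f7d1cp-20f;/* the discarded low bits.  */
      float x = 10.0f;
      float j = (float) (int) (x * invln2 + 0.5f); /* rint, x > 0 only.  */
      float f = x - j * ln2_hi; /* exact: ln2_hi has trailing zero bits.  */
      f = f - j * ln2_lo;       /* fold in the low part.  */
      printf ("j=%g f=%a\n", j, f);
      return 0;
    }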
diff --git a/sysdeps/aarch64/fpu/log_advsimd.c b/sysdeps/aarch64/fpu/log_advsimd.c
index 067ae79613..21df61728c 100644
--- a/sysdeps/aarch64/fpu/log_advsimd.c
+++ b/sysdeps/aarch64/fpu/log_advsimd.c
@@ -58,8 +58,13 @@ lookup (uint64x2_t i)
   uint64_t i1 = (i[1] >> (52 - V_LOG_TABLE_BITS)) & IndexMask;
   float64x2_t e0 = vld1q_f64 (&__v_log_data.table[i0].invc);
   float64x2_t e1 = vld1q_f64 (&__v_log_data.table[i1].invc);
+#if __BYTE_ORDER == __LITTLE_ENDIAN
   e.invc = vuzp1q_f64 (e0, e1);
   e.logc = vuzp2q_f64 (e0, e1);
+#else
+  e.invc = vuzp1q_f64 (e1, e0);
+  e.logc = vuzp2q_f64 (e1, e0);
+#endif
   return e;
 }
 
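This log hunk is the big-endian fix called out in the commit message: each load brings in a {invc, logc} table pair, and UZP1/UZP2 de-interleave two such pairs; on big-endian the patch swaps the UZP operands so invc/logc land in the lanes matching i[0] and i[1]. A little-endian sketch of what the de-interleave produces (ours; the values stand in for table entries):

    #include <arm_neon.h>
    #include <stdio.h>

    int
    main (void)
    {
      /* Two table entries, each {invc, logc}, as vld1q_f64 loads them.  */
      float64x2_t e0 = { 1.0, 10.0 }; /* { invc0, logc0 }  */
      float64x2_t e1 = { 2.0, 20.0 }; /* { invc1, logc1 }  */
      /* UZP1 gathers the even-indexed lanes, UZP2 the odd ones.  */
      float64x2_t invc = vuzp1q_f64 (e0, e1); /* { 1.0, 2.0 }   */
      float64x2_t logc = vuzp2q_f64 (e0, e1); /* { 10.0, 20.0 } */
      printf ("invc = { %g, %g }, logc = { %g, %g }\n",
              invc[0], invc[1], logc[0], logc[1]);
      return 0;
    }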
diff --git a/sysdeps/aarch64/fpu/sin_advsimd.c b/sysdeps/aarch64/fpu/sin_advsimd.c
index efce183e86..a0d9d3b819 100644
--- a/sysdeps/aarch64/fpu/sin_advsimd.c
+++ b/sysdeps/aarch64/fpu/sin_advsimd.c
@@ -75,8 +75,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x)
   r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x);
 #else
   r = x;
-  cmp = vcageq_f64 (d->range_val, x);
-  cmp = vceqzq_u64 (cmp); /* cmp = ~cmp.  */
+  cmp = vcageq_f64 (x, d->range_val);
 #endif
 
   /* n = rint(|x|/pi).  */
diff --git a/sysdeps/aarch64/fpu/sinf_advsimd.c b/sysdeps/aarch64/fpu/sinf_advsimd.c
index 60cf3f2ca1..375dfc3331 100644
--- a/sysdeps/aarch64/fpu/sinf_advsimd.c
+++ b/sysdeps/aarch64/fpu/sinf_advsimd.c
@@ -67,8 +67,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sin) (float32x4_t x)
   r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x);
 #else
   r = x;
-  cmp = vcageq_f32 (d->range_val, x);
-  cmp = vceqzq_u32 (cmp); /* cmp = ~cmp.  */
+  cmp = vcageq_f32 (x, d->range_val);
 #endif
 
   /* n = rint(|x|/pi) */
diff --git a/sysdeps/aarch64/fpu/tan_advsimd.c b/sysdeps/aarch64/fpu/tan_advsimd.c
index d7e5ba7b1a..0459821ab2 100644
--- a/sysdeps/aarch64/fpu/tan_advsimd.c
+++ b/sysdeps/aarch64/fpu/tan_advsimd.c
@@ -23,7 +23,7 @@
 static const struct data
 {
   float64x2_t poly[9];
-  float64x2_t half_pi_hi, half_pi_lo, two_over_pi, shift;
+  float64x2_t half_pi, two_over_pi, shift;
 #if !WANT_SIMD_EXCEPT
   float64x2_t range_val;
 #endif
@@ -34,8 +34,7 @@ static const struct data
             V2 (0x1.226e5e5ecdfa3p-7), V2 (0x1.d6c7ddbf87047p-9),
             V2 (0x1.7ea75d05b583ep-10), V2 (0x1.289f22964a03cp-11),
             V2 (0x1.4e4fd14147622p-12) },
-  .half_pi_hi = V2 (0x1.921fb54442d18p0),
-  .half_pi_lo = V2 (0x1.1a62633145c07p-54),
+  .half_pi = { 0x1.921fb54442d18p0, 0x1.1a62633145c07p-54 },
   .two_over_pi = V2 (0x1.45f306dc9c883p-1),
   .shift = V2 (0x1.8p52),
 #if !WANT_SIMD_EXCEPT
@@ -56,15 +55,15 @@ special_case (float64x2_t x)
 
 /* Vector approximation for double-precision tan.
    Maximum measured error is 3.48 ULP:
-   __v_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37
-                                  want -0x1.f6ccd8ecf7deap+37.  */
+   _ZGVnN2v_tan(0x1.4457047ef78d8p+20) got -0x1.f6ccd8ecf7dedp+37
+                                       want -0x1.f6ccd8ecf7deap+37.  */
 float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
 {
   const struct data *dat = ptr_barrier (&data);
-  /* Our argument reduction cannot calculate q with sufficient accuracy for very
-     large inputs. Fall back to scalar routine for all lanes if any are too
-     large, or Inf/NaN. If fenv exceptions are expected, also fall back for tiny
-     input to avoid underflow.  */
+  /* Our argument reduction cannot calculate q with sufficient accuracy for
+     very large inputs. Fall back to scalar routine for all lanes if any are
+     too large, or Inf/NaN. If fenv exceptions are expected, also fall back for
+     tiny input to avoid underflow.  */
 #if WANT_SIMD_EXCEPT
   uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
   /* iax - tiny_bound > range_val - tiny_bound.  */
@@ -82,8 +81,8 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
   /* Use q to reduce x to r in [-pi/4, pi/4], by:
      r = x - q * pi/2, in extended precision.  */
   float64x2_t r = x;
-  r = vfmsq_f64 (r, q, dat->half_pi_hi);
-  r = vfmsq_f64 (r, q, dat->half_pi_lo);
+  r = vfmsq_laneq_f64 (r, q, dat->half_pi, 0);
+  r = vfmsq_laneq_f64 (r, q, dat->half_pi, 1);
   /* Further reduce r to [-pi/8, pi/8], to be reconstructed using double angle
      formula.  */
   r = vmulq_n_f64 (r, 0.5);
@@ -106,14 +105,15 @@ float64x2_t VPCS_ATTR V_NAME_D1 (tan) (float64x2_t x)
      and reciprocity around pi/2:
        tan(x) = 1 / (tan(pi/2 - x))
      to assemble result using change-of-sign and conditional selection of
-     numerator/denominator, dependent on odd/even-ness of q (hence quadrant).  */
+     numerator/denominator, dependent on odd/even-ness of q (hence quadrant).
+  */
   float64x2_t n = vfmaq_f64 (v_f64 (-1), p, p);
   float64x2_t d = vaddq_f64 (p, p);
 
   uint64x2_t no_recip = vtstq_u64 (vreinterpretq_u64_s64 (qi), v_u64 (1));
 
 #if !WANT_SIMD_EXCEPT
-  uint64x2_t special = vceqzq_u64 (vcaleq_f64 (x, dat->range_val));
+  uint64x2_t special = vcageq_f64 (x, dat->range_val);
   if (__glibc_unlikely (v_any_u64 (special)))
     return special_case (x);
 #endif
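tan's reconstruction step is unchanged in substance here, but the comment is dense, so a worked check may help: with p = tan(r/2), the double-angle identity gives tan(r) = 2p/(1 - p^2), and for odd quadrant counts the reciprocity identity flips this to (p^2 - 1)/(2p). A scalar verification of both branches (our illustration, matching the n and d of the vector code):

    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      const double pio2 = 0x1.921fb54442d18p+0; /* pi/2, as in the patch.  */
      double x = 2.0;
      double q = round (x / pio2); /* quadrant count; 1 here (odd).  */
      double r = x - q * pio2;     /* reduced into [-pi/4, pi/4].  */
      double p = tan (r / 2);      /* half-angle tangent.  */
      double n = p * p - 1;        /* as vfmaq_f64 (v_f64 (-1), p, p).  */
      double d = p + p;            /* as vaddq_f64 (p, p).  */
      /* Even q: tan(x) = tan(r) = 2p/(1 - p^2) = -d/n.
         Odd q:  tan(x) = -1/tan(r)             =  n/d.  */
      double y = ((long) q & 1) ? n / d : -d / n;
      printf ("reconstructed %.17g, tan(x) = %.17g\n", y, tan (x));
      return 0;
    }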
diff --git a/sysdeps/aarch64/fpu/tanf_advsimd.c b/sysdeps/aarch64/fpu/tanf_advsimd.c
index 1f16103f8a..5a7489390a 100644
--- a/sysdeps/aarch64/fpu/tanf_advsimd.c
+++ b/sysdeps/aarch64/fpu/tanf_advsimd.c
@@ -23,7 +23,8 @@
 static const struct data
 {
   float32x4_t poly[6];
-  float32x4_t neg_half_pi_1, neg_half_pi_2, neg_half_pi_3, two_over_pi, shift;
+  float32x4_t pi_consts;
+  float32x4_t shift;
 #if !WANT_SIMD_EXCEPT
   float32x4_t range_val;
 #endif
@@ -31,10 +32,9 @@ static const struct data
   /* Coefficients generated using FPMinimax.  */
   .poly = { V4 (0x1.55555p-2f), V4 (0x1.11166p-3f), V4 (0x1.b88a78p-5f),
             V4 (0x1.7b5756p-6f), V4 (0x1.4ef4cep-8f), V4 (0x1.0e1e74p-7f) },
-  .neg_half_pi_1 = V4 (-0x1.921fb6p+0f),
-  .neg_half_pi_2 = V4 (0x1.777a5cp-25f),
-  .neg_half_pi_3 = V4 (0x1.ee59dap-50f),
-  .two_over_pi = V4 (0x1.45f306p-1f),
+  /* Stores constants: (-pi/2)_high, (-pi/2)_mid, (-pi/2)_low, and 2/pi.  */
+  .pi_consts
+  = { -0x1.921fb6p+0f, 0x1.777a5cp-25f, 0x1.ee59dap-50f, 0x1.45f306p-1f },
   .shift = V4 (0x1.8p+23f),
 #if !WANT_SIMD_EXCEPT
   .range_val = V4 (0x1p15f),
@@ -58,10 +58,11 @@ eval_poly (float32x4_t z, const struct data *d)
 {
   float32x4_t z2 = vmulq_f32 (z, z);
 #if WANT_SIMD_EXCEPT
-  /* Tiny z (<= 0x1p-31) will underflow when calculating z^4. If fp exceptions
-     are to be triggered correctly, sidestep this by fixing such lanes to 0.  */
+  /* Tiny z (<= 0x1p-31) will underflow when calculating z^4.
+     If fp exceptions are to be triggered correctly,
+     sidestep this by fixing such lanes to 0.  */
   uint32x4_t will_uflow
-      = vcleq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (z)), TinyBound);
+    = vcleq_u32 (vreinterpretq_u32_f32 (vabsq_f32 (z)), TinyBound);
   if (__glibc_unlikely (v_any_u32 (will_uflow)))
     z2 = vbslq_f32 (will_uflow, v_f32 (0), z2);
 #endif
@@ -94,16 +95,16 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tan) (float32x4_t x)
 #endif
 
   /* n = rint(x/(pi/2)).  */
-  float32x4_t q = vfmaq_f32 (d->shift, d->two_over_pi, x);
+  float32x4_t q = vfmaq_laneq_f32 (d->shift, x, d->pi_consts, 3);
   float32x4_t n = vsubq_f32 (q, d->shift);
   /* Determine if x lives in an interval, where |tan(x)| grows to infinity.  */
   uint32x4_t pred_alt = vtstq_u32 (vreinterpretq_u32_f32 (q), v_u32 (1));
 
   /* r = x - n * (pi/2)  (range reduction into -pi./4 .. pi/4).  */
   float32x4_t r;
-  r = vfmaq_f32 (x, d->neg_half_pi_1, n);
-  r = vfmaq_f32 (r, d->neg_half_pi_2, n);
-  r = vfmaq_f32 (r, d->neg_half_pi_3, n);
+  r = vfmaq_laneq_f32 (x, n, d->pi_consts, 0);
+  r = vfmaq_laneq_f32 (r, n, d->pi_consts, 1);
+  r = vfmaq_laneq_f32 (r, n, d->pi_consts, 2);
 
   /* If x lives in an interval, where |tan(x)|
      - is finite, then use a polynomial approximation of the form