Message ID | 20230628111939.48140-1-Joe.Ramsay@arm.com
State      | New
Series     | [v4,1/4] aarch64: Add vector implementations of cos routines
The 06/28/2023 12:19, Joe Ramsay via Libc-alpha wrote:
> Replace the loop-over-scalar placeholder routines with optimised
> implementations from Arm Optimized Routines (AOR).
>
> Also add some headers containing utilities for aarch64 libmvec
> routines, and update libm-test-ulps.
>
> Data tables for new routines are used via a pointer with a
> barrier on it, in order to prevent overly aggressive constant
> inlining in GCC. This allows a single adrp, combined with offset
> loads, to be used for every constant in the table.
>
> Special-case handlers are marked NOINLINE in order to confine the
> save/restore overhead of switching from vector to normal calling
> standard. This way we only incur the extra memory access in the
> exceptional cases. NOINLINE definitions have been moved to
> math_private.h in order to reduce duplication.
>
> AOR exposes a config option, WANT_SIMD_EXCEPT, to enable
> selective masking (and later fixing up) of invalid lanes, in
> order to trigger fp exceptions correctly (AdvSIMD only). This is
> tested and maintained in AOR, however it is configured off at
> source level here for performance reasons. We keep the
> WANT_SIMD_EXCEPT blocks in routine sources to greatly simplify
> the upstreaming process from AOR to glibc.

looks good.

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>

> ---
> Changes to v3:
> * Remove volatile from AdvSIMD data - use ptr_barrier instead
> Thanks,
> Joe
>  sysdeps/aarch64/fpu/advsimd_utils.h  |  39 -------
>  sysdeps/aarch64/fpu/cos_advsimd.c    |  82 +++++++++++++--
>  sysdeps/aarch64/fpu/cos_sve.c        |  74 +++++++++++++-
>  sysdeps/aarch64/fpu/cosf_advsimd.c   |  77 ++++++++++++--
>  sysdeps/aarch64/fpu/cosf_sve.c       |  72 ++++++++++++-
>  sysdeps/aarch64/fpu/sv_math.h        | 141 ++++++++++++++++++++++++++
>  sysdeps/aarch64/fpu/sve_utils.h      |  55 ----------
>  sysdeps/aarch64/fpu/v_math.h         | 145 +++++++++++++++++++++++++++
>  sysdeps/aarch64/fpu/vecmath_config.h |  38 +++++++
>  sysdeps/aarch64/libm-test-ulps       |   2 +-
>  sysdeps/generic/math_private.h       |   2 +-
>  sysdeps/ieee754/dbl-64/math_config.h |   3 -
>  sysdeps/ieee754/flt-32/math_config.h |   2 -
>  13 files changed, 609 insertions(+), 123 deletions(-)
>  delete mode 100644 sysdeps/aarch64/fpu/advsimd_utils.h
>  create mode 100644 sysdeps/aarch64/fpu/sv_math.h
>  delete mode 100644 sysdeps/aarch64/fpu/sve_utils.h
>  create mode 100644 sysdeps/aarch64/fpu/v_math.h
>  create mode 100644 sysdeps/aarch64/fpu/vecmath_config.h
>
> diff --git a/sysdeps/aarch64/fpu/advsimd_utils.h b/sysdeps/aarch64/fpu/advsimd_utils.h
> deleted file mode 100644
> index 8a0fcc0e06..0000000000
> --- a/sysdeps/aarch64/fpu/advsimd_utils.h
> +++ /dev/null
> @@ -1,39 +0,0 @@
> -/* Helpers for Advanced SIMD vector math functions.
> -
> -   Copyright (C) 2023 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.
*/ > - > -#include <arm_neon.h> > - > -#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs)) > - > -#define V_NAME_F1(fun) _ZGVnN4v_##fun##f > -#define V_NAME_D1(fun) _ZGVnN2v_##fun > -#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f > -#define V_NAME_D2(fun) _ZGVnN2vv_##fun > - > -static __always_inline float32x4_t > -v_call_f32 (float (*f) (float), float32x4_t x) > -{ > - return (float32x4_t){ f (x[0]), f (x[1]), f (x[2]), f (x[3]) }; > -} > - > -static __always_inline float64x2_t > -v_call_f64 (double (*f) (double), float64x2_t x) > -{ > - return (float64x2_t){ f (x[0]), f (x[1]) }; > -} > diff --git a/sysdeps/aarch64/fpu/cos_advsimd.c b/sysdeps/aarch64/fpu/cos_advsimd.c > index 40831e6b0d..e8f1d10540 100644 > --- a/sysdeps/aarch64/fpu/cos_advsimd.c > +++ b/sysdeps/aarch64/fpu/cos_advsimd.c > @@ -17,13 +17,83 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. */ > > -#include <math.h> > +#include "v_math.h" > > -#include "advsimd_utils.h" > +static const struct data > +{ > + float64x2_t poly[7]; > + float64x2_t range_val, shift, inv_pi, half_pi, pi_1, pi_2, pi_3; > +} data = { > + /* Worst-case error is 3.3 ulp in [-pi/2, pi/2]. */ > + .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), > + V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19), > + V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33), > + V2 (-0x1.9e9540300a1p-41) }, > + .inv_pi = V2 (0x1.45f306dc9c883p-2), > + .half_pi = V2 (0x1.921fb54442d18p+0), > + .pi_1 = V2 (0x1.921fb54442d18p+1), > + .pi_2 = V2 (0x1.1a62633145c06p-53), > + .pi_3 = V2 (0x1.c1cd129024e09p-106), > + .shift = V2 (0x1.8p52), > + .range_val = V2 (0x1p23) > +}; > + > +#define C(i) d->poly[i] > + > +static float64x2_t VPCS_ATTR NOINLINE > +special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp) > +{ > + y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); > + return v_call_f64 (cos, x, y, cmp); > +} > > -VPCS_ATTR > -float64x2_t > -V_NAME_D1 (cos) (float64x2_t x) > +float64x2_t VPCS_ATTR V_NAME_D1 (cos) (float64x2_t x) > { > - return v_call_f64 (cos, x); > + const struct data *d = ptr_barrier (&data); > + float64x2_t n, r, r2, r3, r4, t1, t2, t3, y; > + uint64x2_t odd, cmp; > + > +#if WANT_SIMD_EXCEPT > + r = vabsq_f64 (x); > + cmp = vcgeq_u64 (vreinterpretq_u64_f64 (r), > + vreinterpretq_u64_f64 (d->range_val)); > + if (__glibc_unlikely (v_any_u64 (cmp))) > + /* If fenv exceptions are to be triggered correctly, set any special lanes > + to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by > + special-case handler later. */ > + r = vbslq_f64 (cmp, v_f64 (1.0), r); > +#else > + cmp = vcageq_f64 (d->range_val, x); > + cmp = vceqzq_u64 (cmp); /* cmp = ~cmp. */ > + r = x; > +#endif > + > + /* n = rint((|x|+pi/2)/pi) - 0.5. */ > + n = vfmaq_f64 (d->shift, d->inv_pi, vaddq_f64 (r, d->half_pi)); > + odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63); > + n = vsubq_f64 (n, d->shift); > + n = vsubq_f64 (n, v_f64 (0.5)); > + > + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ > + r = vfmsq_f64 (r, d->pi_1, n); > + r = vfmsq_f64 (r, d->pi_2, n); > + r = vfmsq_f64 (r, d->pi_3, n); > + > + /* sin(r) poly approx. 
*/ > + r2 = vmulq_f64 (r, r); > + r3 = vmulq_f64 (r2, r); > + r4 = vmulq_f64 (r2, r2); > + > + t1 = vfmaq_f64 (C (4), C (5), r2); > + t2 = vfmaq_f64 (C (2), C (3), r2); > + t3 = vfmaq_f64 (C (0), C (1), r2); > + > + y = vfmaq_f64 (t1, C (6), r4); > + y = vfmaq_f64 (t2, y, r4); > + y = vfmaq_f64 (t3, y, r4); > + y = vfmaq_f64 (r, y, r3); > + > + if (__glibc_unlikely (v_any_u64 (cmp))) > + return special_case (x, y, odd, cmp); > + return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd)); > } > diff --git a/sysdeps/aarch64/fpu/cos_sve.c b/sysdeps/aarch64/fpu/cos_sve.c > index 55501e5000..d7a9b134da 100644 > --- a/sysdeps/aarch64/fpu/cos_sve.c > +++ b/sysdeps/aarch64/fpu/cos_sve.c > @@ -17,12 +17,76 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. */ > > -#include <math.h> > +#include "sv_math.h" > > -#include "sve_utils.h" > +static const struct data > +{ > + double inv_pio2, pio2_1, pio2_2, pio2_3, shift; > +} data = { > + /* Polynomial coefficients are hardwired in FTMAD instructions. */ > + .inv_pio2 = 0x1.45f306dc9c882p-1, > + .pio2_1 = 0x1.921fb50000000p+0, > + .pio2_2 = 0x1.110b460000000p-26, > + .pio2_3 = 0x1.1a62633145c07p-54, > + /* Original shift used in AdvSIMD cos, > + plus a contribution to set the bit #0 of q > + as expected by trigonometric instructions. */ > + .shift = 0x1.8000000000001p52 > +}; > + > +#define RangeVal 0x4160000000000000 /* asuint64 (0x1p23). */ > + > +static svfloat64_t NOINLINE > +special_case (svfloat64_t x, svfloat64_t y, svbool_t out_of_bounds) > +{ > + return sv_call_f64 (cos, x, y, out_of_bounds); > +} > > -svfloat64_t > -SV_NAME_D1 (cos) (svfloat64_t x, svbool_t pg) > +/* A fast SVE implementation of cos based on trigonometric > + instructions (FTMAD, FTSSEL, FTSMUL). > + Maximum measured error: 2.108 ULPs. > + SV_NAME_D1 (cos)(0x1.9b0ba158c98f3p+7) got -0x1.fddd4c65c7f07p-3 > + want -0x1.fddd4c65c7f05p-3. */ > +svfloat64_t SV_NAME_D1 (cos) (svfloat64_t x, const svbool_t pg) > { > - return sv_call_f64 (cos, x, svdup_n_f64 (0), pg); > + const struct data *d = ptr_barrier (&data); > + > + svfloat64_t r = svabs_f64_x (pg, x); > + svbool_t out_of_bounds > + = svcmpge_n_u64 (pg, svreinterpret_u64_f64 (r), RangeVal); > + > + /* Load some constants in quad-word chunks to minimise memory access. */ > + svbool_t ptrue = svptrue_b64 (); > + svfloat64_t invpio2_and_pio2_1 = svld1rq_f64 (ptrue, &d->inv_pio2); > + svfloat64_t pio2_23 = svld1rq_f64 (ptrue, &d->pio2_2); > + > + /* n = rint(|x|/(pi/2)). */ > + svfloat64_t q = svmla_lane_f64 (sv_f64 (d->shift), r, invpio2_and_pio2_1, 0); > + svfloat64_t n = svsub_n_f64_x (pg, q, d->shift); > + > + /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). */ > + r = svmls_lane_f64 (r, n, invpio2_and_pio2_1, 1); > + r = svmls_lane_f64 (r, n, pio2_23, 0); > + r = svmls_lane_f64 (r, n, pio2_23, 1); > + > + /* cos(r) poly approx. */ > + svfloat64_t r2 = svtsmul_f64 (r, svreinterpret_u64_f64 (q)); > + svfloat64_t y = sv_f64 (0.0); > + y = svtmad_f64 (y, r2, 7); > + y = svtmad_f64 (y, r2, 6); > + y = svtmad_f64 (y, r2, 5); > + y = svtmad_f64 (y, r2, 4); > + y = svtmad_f64 (y, r2, 3); > + y = svtmad_f64 (y, r2, 2); > + y = svtmad_f64 (y, r2, 1); > + y = svtmad_f64 (y, r2, 0); > + > + /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */ > + svfloat64_t f = svtssel_f64 (r, svreinterpret_u64_f64 (q)); > + /* Apply factor. 
*/ > + y = svmul_f64_x (pg, f, y); > + > + if (__glibc_unlikely (svptest_any (pg, out_of_bounds))) > + return special_case (x, y, out_of_bounds); > + return y; > } > diff --git a/sysdeps/aarch64/fpu/cosf_advsimd.c b/sysdeps/aarch64/fpu/cosf_advsimd.c > index 35bb81aead..f05dd2bcda 100644 > --- a/sysdeps/aarch64/fpu/cosf_advsimd.c > +++ b/sysdeps/aarch64/fpu/cosf_advsimd.c > @@ -17,13 +17,78 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. */ > > -#include <math.h> > +#include "v_math.h" > > -#include "advsimd_utils.h" > +static const struct data > +{ > + float32x4_t poly[4]; > + float32x4_t range_val, inv_pi, half_pi, shift, pi_1, pi_2, pi_3; > +} data = { > + /* 1.886 ulp error. */ > + .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f), > + V4 (0x1.5b2e76p-19f) }, > + > + .pi_1 = V4 (0x1.921fb6p+1f), > + .pi_2 = V4 (-0x1.777a5cp-24f), > + .pi_3 = V4 (-0x1.ee59dap-49f), > + > + .inv_pi = V4 (0x1.45f306p-2f), > + .shift = V4 (0x1.8p+23f), > + .half_pi = V4 (0x1.921fb6p0f), > + .range_val = V4 (0x1p20f) > +}; > + > +#define C(i) d->poly[i] > + > +static float32x4_t VPCS_ATTR NOINLINE > +special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) > +{ > + /* Fall back to scalar code. */ > + y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); > + return v_call_f32 (cosf, x, y, cmp); > +} > > -VPCS_ATTR > -float32x4_t > -V_NAME_F1 (cos) (float32x4_t x) > +float32x4_t VPCS_ATTR V_NAME_F1 (cos) (float32x4_t x) > { > - return v_call_f32 (cosf, x); > + const struct data *d = ptr_barrier (&data); > + float32x4_t n, r, r2, r3, y; > + uint32x4_t odd, cmp; > + > +#if WANT_SIMD_EXCEPT > + r = vabsq_f32 (x); > + cmp = vcgeq_u32 (vreinterpretq_u32_f32 (r), > + vreinterpretq_u32_f32 (d->range_val)); > + if (__glibc_unlikely (v_any_u32 (cmp))) > + /* If fenv exceptions are to be triggered correctly, set any special lanes > + to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by > + special-case handler later. */ > + r = vbslq_f32 (cmp, v_f32 (1.0f), r); > +#else > + cmp = vcageq_f32 (d->range_val, x); > + cmp = vceqzq_u32 (cmp); /* cmp = ~cmp. */ > + r = x; > +#endif > + > + /* n = rint((|x|+pi/2)/pi) - 0.5. */ > + n = vfmaq_f32 (d->shift, d->inv_pi, vaddq_f32 (r, d->half_pi)); > + odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31); > + n = vsubq_f32 (n, d->shift); > + n = vsubq_f32 (n, v_f32 (0.5f)); > + > + /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ > + r = vfmsq_f32 (r, d->pi_1, n); > + r = vfmsq_f32 (r, d->pi_2, n); > + r = vfmsq_f32 (r, d->pi_3, n); > + > + /* y = sin(r). */ > + r2 = vmulq_f32 (r, r); > + r3 = vmulq_f32 (r2, r); > + y = vfmaq_f32 (C (2), C (3), r2); > + y = vfmaq_f32 (C (1), y, r2); > + y = vfmaq_f32 (C (0), y, r2); > + y = vfmaq_f32 (r, y, r3); > + > + if (__glibc_unlikely (v_any_u32 (cmp))) > + return special_case (x, y, odd, cmp); > + return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd)); > } > diff --git a/sysdeps/aarch64/fpu/cosf_sve.c b/sysdeps/aarch64/fpu/cosf_sve.c > index 16c68f387b..577cbd864e 100644 > --- a/sysdeps/aarch64/fpu/cosf_sve.c > +++ b/sysdeps/aarch64/fpu/cosf_sve.c > @@ -17,12 +17,74 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. 
*/ > > -#include <math.h> > +#include "sv_math.h" > > -#include "sve_utils.h" > +static const struct data > +{ > + float neg_pio2_1, neg_pio2_2, neg_pio2_3, inv_pio2, shift; > +} data = { > + /* Polynomial coefficients are hard-wired in FTMAD instructions. */ > + .neg_pio2_1 = -0x1.921fb6p+0f, > + .neg_pio2_2 = 0x1.777a5cp-25f, > + .neg_pio2_3 = 0x1.ee59dap-50f, > + .inv_pio2 = 0x1.45f306p-1f, > + /* Original shift used in AdvSIMD cosf, > + plus a contribution to set the bit #0 of q > + as expected by trigonometric instructions. */ > + .shift = 0x1.800002p+23f > +}; > + > +#define RangeVal 0x49800000 /* asuint32(0x1p20f). */ > > -svfloat32_t > -SV_NAME_F1 (cos) (svfloat32_t x, svbool_t pg) > +static svfloat32_t NOINLINE > +special_case (svfloat32_t x, svfloat32_t y, svbool_t out_of_bounds) > { > - return sv_call_f32 (cosf, x, svdup_n_f32 (0), pg); > + return sv_call_f32 (cosf, x, y, out_of_bounds); > +} > + > +/* A fast SVE implementation of cosf based on trigonometric > + instructions (FTMAD, FTSSEL, FTSMUL). > + Maximum measured error: 2.06 ULPs. > + SV_NAME_F1 (cos)(0x1.dea2f2p+19) got 0x1.fffe7ap-6 > + want 0x1.fffe76p-6. */ > +svfloat32_t SV_NAME_F1 (cos) (svfloat32_t x, const svbool_t pg) > +{ > + const struct data *d = ptr_barrier (&data); > + > + svfloat32_t r = svabs_f32_x (pg, x); > + svbool_t out_of_bounds > + = svcmpge_n_u32 (pg, svreinterpret_u32_f32 (r), RangeVal); > + > + /* Load some constants in quad-word chunks to minimise memory access. */ > + svfloat32_t negpio2_and_invpio2 > + = svld1rq_f32 (svptrue_b32 (), &d->neg_pio2_1); > + > + /* n = rint(|x|/(pi/2)). */ > + svfloat32_t q > + = svmla_lane_f32 (sv_f32 (d->shift), r, negpio2_and_invpio2, 3); > + svfloat32_t n = svsub_n_f32_x (pg, q, d->shift); > + > + /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). */ > + r = svmla_lane_f32 (r, n, negpio2_and_invpio2, 0); > + r = svmla_lane_f32 (r, n, negpio2_and_invpio2, 1); > + r = svmla_lane_f32 (r, n, negpio2_and_invpio2, 2); > + > + /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */ > + svfloat32_t f = svtssel_f32 (r, svreinterpret_u32_f32 (q)); > + > + /* cos(r) poly approx. */ > + svfloat32_t r2 = svtsmul_f32 (r, svreinterpret_u32_f32 (q)); > + svfloat32_t y = sv_f32 (0.0f); > + y = svtmad_f32 (y, r2, 4); > + y = svtmad_f32 (y, r2, 3); > + y = svtmad_f32 (y, r2, 2); > + y = svtmad_f32 (y, r2, 1); > + y = svtmad_f32 (y, r2, 0); > + > + /* Apply factor. */ > + y = svmul_f32_x (pg, f, y); > + > + if (__glibc_unlikely (svptest_any (pg, out_of_bounds))) > + return special_case (x, y, out_of_bounds); > + return y; > } > diff --git a/sysdeps/aarch64/fpu/sv_math.h b/sysdeps/aarch64/fpu/sv_math.h > new file mode 100644 > index 0000000000..fc8e690e80 > --- /dev/null > +++ b/sysdeps/aarch64/fpu/sv_math.h > @@ -0,0 +1,141 @@ > +/* Utilities for SVE libmvec routines. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. 
> + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#ifndef SV_MATH_H > +#define SV_MATH_H > + > +#include <arm_sve.h> > +#include <stdbool.h> > + > +#include "vecmath_config.h" > + > +#define SV_NAME_F1(fun) _ZGVsMxv_##fun##f > +#define SV_NAME_D1(fun) _ZGVsMxv_##fun > +#define SV_NAME_F2(fun) _ZGVsMxvv_##fun##f > +#define SV_NAME_D2(fun) _ZGVsMxvv_##fun > + > +/* Double precision. */ > +static inline svint64_t > +sv_s64 (int64_t x) > +{ > + return svdup_n_s64 (x); > +} > + > +static inline svuint64_t > +sv_u64 (uint64_t x) > +{ > + return svdup_n_u64 (x); > +} > + > +static inline svfloat64_t > +sv_f64 (double x) > +{ > + return svdup_n_f64 (x); > +} > + > +static inline svfloat64_t > +sv_call_f64 (double (*f) (double), svfloat64_t x, svfloat64_t y, svbool_t cmp) > +{ > + svbool_t p = svpfirst (cmp, svpfalse ()); > + while (svptest_any (cmp, p)) > + { > + double elem = svclastb_n_f64 (p, 0, x); > + elem = (*f) (elem); > + svfloat64_t y2 = svdup_n_f64 (elem); > + y = svsel_f64 (p, y2, y); > + p = svpnext_b64 (cmp, p); > + } > + return y; > +} > + > +static inline svfloat64_t > +sv_call2_f64 (double (*f) (double, double), svfloat64_t x1, svfloat64_t x2, > + svfloat64_t y, svbool_t cmp) > +{ > + svbool_t p = svpfirst (cmp, svpfalse ()); > + while (svptest_any (cmp, p)) > + { > + double elem1 = svclastb_n_f64 (p, 0, x1); > + double elem2 = svclastb_n_f64 (p, 0, x2); > + double ret = (*f) (elem1, elem2); > + svfloat64_t y2 = svdup_n_f64 (ret); > + y = svsel_f64 (p, y2, y); > + p = svpnext_b64 (cmp, p); > + } > + return y; > +} > + > +static inline svuint64_t > +sv_mod_n_u64_x (svbool_t pg, svuint64_t x, uint64_t y) > +{ > + svuint64_t q = svdiv_n_u64_x (pg, x, y); > + return svmls_n_u64_x (pg, x, q, y); > +} > + > +/* Single precision. */ > +static inline svint32_t > +sv_s32 (int32_t x) > +{ > + return svdup_n_s32 (x); > +} > + > +static inline svuint32_t > +sv_u32 (uint32_t x) > +{ > + return svdup_n_u32 (x); > +} > + > +static inline svfloat32_t > +sv_f32 (float x) > +{ > + return svdup_n_f32 (x); > +} > + > +static inline svfloat32_t > +sv_call_f32 (float (*f) (float), svfloat32_t x, svfloat32_t y, svbool_t cmp) > +{ > + svbool_t p = svpfirst (cmp, svpfalse ()); > + while (svptest_any (cmp, p)) > + { > + float elem = svclastb_n_f32 (p, 0, x); > + elem = f (elem); > + svfloat32_t y2 = svdup_n_f32 (elem); > + y = svsel_f32 (p, y2, y); > + p = svpnext_b32 (cmp, p); > + } > + return y; > +} > + > +static inline svfloat32_t > +sv_call2_f32 (float (*f) (float, float), svfloat32_t x1, svfloat32_t x2, > + svfloat32_t y, svbool_t cmp) > +{ > + svbool_t p = svpfirst (cmp, svpfalse ()); > + while (svptest_any (cmp, p)) > + { > + float elem1 = svclastb_n_f32 (p, 0, x1); > + float elem2 = svclastb_n_f32 (p, 0, x2); > + float ret = f (elem1, elem2); > + svfloat32_t y2 = svdup_n_f32 (ret); > + y = svsel_f32 (p, y2, y); > + p = svpnext_b32 (cmp, p); > + } > + return y; > +} > + > +#endif > diff --git a/sysdeps/aarch64/fpu/sve_utils.h b/sysdeps/aarch64/fpu/sve_utils.h > deleted file mode 100644 > index 5ce3d2e8d6..0000000000 > --- a/sysdeps/aarch64/fpu/sve_utils.h > +++ /dev/null > @@ -1,55 +0,0 @@ > -/* Helpers for SVE vector math functions. > - > - Copyright (C) 2023 Free Software Foundation, Inc. > - This file is part of the GNU C Library. 
> - > - The GNU C Library is free software; you can redistribute it and/or > - modify it under the terms of the GNU Lesser General Public > - License as published by the Free Software Foundation; either > - version 2.1 of the License, or (at your option) any later version. > - > - The GNU C Library is distributed in the hope that it will be useful, > - but WITHOUT ANY WARRANTY; without even the implied warranty of > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > - Lesser General Public License for more details. > - > - You should have received a copy of the GNU Lesser General Public > - License along with the GNU C Library; if not, see > - <https://www.gnu.org/licenses/>. */ > - > -#include <arm_sve.h> > - > -#define SV_NAME_F1(fun) _ZGVsMxv_##fun##f > -#define SV_NAME_D1(fun) _ZGVsMxv_##fun > -#define SV_NAME_F2(fun) _ZGVsMxvv_##fun##f > -#define SV_NAME_D2(fun) _ZGVsMxvv_##fun > - > -static __always_inline svfloat32_t > -sv_call_f32 (float (*f) (float), svfloat32_t x, svfloat32_t y, svbool_t cmp) > -{ > - svbool_t p = svpfirst (cmp, svpfalse ()); > - while (svptest_any (cmp, p)) > - { > - float elem = svclastb_n_f32 (p, 0, x); > - elem = (*f) (elem); > - svfloat32_t y2 = svdup_n_f32 (elem); > - y = svsel_f32 (p, y2, y); > - p = svpnext_b32 (cmp, p); > - } > - return y; > -} > - > -static __always_inline svfloat64_t > -sv_call_f64 (double (*f) (double), svfloat64_t x, svfloat64_t y, svbool_t cmp) > -{ > - svbool_t p = svpfirst (cmp, svpfalse ()); > - while (svptest_any (cmp, p)) > - { > - double elem = svclastb_n_f64 (p, 0, x); > - elem = (*f) (elem); > - svfloat64_t y2 = svdup_n_f64 (elem); > - y = svsel_f64 (p, y2, y); > - p = svpnext_b64 (cmp, p); > - } > - return y; > -} > diff --git a/sysdeps/aarch64/fpu/v_math.h b/sysdeps/aarch64/fpu/v_math.h > new file mode 100644 > index 0000000000..43efd8f99d > --- /dev/null > +++ b/sysdeps/aarch64/fpu/v_math.h > @@ -0,0 +1,145 @@ > +/* Utilities for Advanced SIMD libmvec routines. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#ifndef _V_MATH_H > +#define _V_MATH_H > + > +#include <arm_neon.h> > +#include "vecmath_config.h" > + > +#define VPCS_ATTR __attribute__ ((aarch64_vector_pcs)) > + > +#define V_NAME_F1(fun) _ZGVnN4v_##fun##f > +#define V_NAME_D1(fun) _ZGVnN2v_##fun > +#define V_NAME_F2(fun) _ZGVnN4vv_##fun##f > +#define V_NAME_D2(fun) _ZGVnN2vv_##fun > + > +/* Shorthand helpers for declaring constants. 
*/ > +#define V2(x) \ > + { \ > + x, x \ > + } > + > +#define V4(x) \ > + { \ > + x, x, x, x \ > + } > + > +static inline float32x4_t > +v_f32 (float x) > +{ > + return (float32x4_t) V4 (x); > +} > +static inline uint32x4_t > +v_u32 (uint32_t x) > +{ > + return (uint32x4_t) V4 (x); > +} > +static inline int32x4_t > +v_s32 (int32_t x) > +{ > + return (int32x4_t) V4 (x); > +} > + > +/* true if any elements of a vector compare result is non-zero. */ > +static inline int > +v_any_u32 (uint32x4_t x) > +{ > + /* assume elements in x are either 0 or -1u. */ > + return vpaddd_u64 (vreinterpretq_u64_u32 (x)) != 0; > +} > +static inline float32x4_t > +v_lookup_f32 (const float *tab, uint32x4_t idx) > +{ > + return (float32x4_t){ tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] }; > +} > +static inline uint32x4_t > +v_lookup_u32 (const uint32_t *tab, uint32x4_t idx) > +{ > + return (uint32x4_t){ tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]] }; > +} > +static inline float32x4_t > +v_call_f32 (float (*f) (float), float32x4_t x, float32x4_t y, uint32x4_t p) > +{ > + return (float32x4_t){ p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1], > + p[2] ? f (x[2]) : y[2], p[3] ? f (x[3]) : y[3] }; > +} > +static inline float32x4_t > +v_call2_f32 (float (*f) (float, float), float32x4_t x1, float32x4_t x2, > + float32x4_t y, uint32x4_t p) > +{ > + return (float32x4_t){ p[0] ? f (x1[0], x2[0]) : y[0], > + p[1] ? f (x1[1], x2[1]) : y[1], > + p[2] ? f (x1[2], x2[2]) : y[2], > + p[3] ? f (x1[3], x2[3]) : y[3] }; > +} > + > +static inline float64x2_t > +v_f64 (double x) > +{ > + return (float64x2_t) V2 (x); > +} > +static inline uint64x2_t > +v_u64 (uint64_t x) > +{ > + return (uint64x2_t) V2 (x); > +} > +static inline int64x2_t > +v_s64 (int64_t x) > +{ > + return (int64x2_t) V2 (x); > +} > + > +/* true if any elements of a vector compare result is non-zero. */ > +static inline int > +v_any_u64 (uint64x2_t x) > +{ > + /* assume elements in x are either 0 or -1u. */ > + return vpaddd_u64 (x) != 0; > +} > +/* true if all elements of a vector compare result is 1. */ > +static inline int > +v_all_u64 (uint64x2_t x) > +{ > + /* assume elements in x are either 0 or -1u. */ > + return vpaddd_s64 (vreinterpretq_s64_u64 (x)) == -2; > +} > +static inline float64x2_t > +v_lookup_f64 (const double *tab, uint64x2_t idx) > +{ > + return (float64x2_t){ tab[idx[0]], tab[idx[1]] }; > +} > +static inline uint64x2_t > +v_lookup_u64 (const uint64_t *tab, uint64x2_t idx) > +{ > + return (uint64x2_t){ tab[idx[0]], tab[idx[1]] }; > +} > +static inline float64x2_t > +v_call_f64 (double (*f) (double), float64x2_t x, float64x2_t y, uint64x2_t p) > +{ > + return (float64x2_t){ p[0] ? f (x[0]) : y[0], p[1] ? f (x[1]) : y[1] }; > +} > +static inline float64x2_t > +v_call2_f64 (double (*f) (double, double), float64x2_t x1, float64x2_t x2, > + float64x2_t y, uint64x2_t p) > +{ > + return (float64x2_t){ p[0] ? f (x1[0], x2[0]) : y[0], > + p[1] ? f (x1[1], x2[1]) : y[1] }; > +} > + > +#endif > diff --git a/sysdeps/aarch64/fpu/vecmath_config.h b/sysdeps/aarch64/fpu/vecmath_config.h > new file mode 100644 > index 0000000000..d0bdbb4ae8 > --- /dev/null > +++ b/sysdeps/aarch64/fpu/vecmath_config.h > @@ -0,0 +1,38 @@ > +/* Configuration for libmvec routines. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. 
> + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#ifndef _VECMATH_CONFIG_H > +#define _VECMATH_CONFIG_H > + > +#include <math_private.h> > + > +/* Deprecated config option from Arm Optimized Routines which ensures > + fp exceptions are correctly triggered. This is not intended to be > + supported in GLIBC, however we keep it for ease of development. */ > +#define WANT_SIMD_EXCEPT 0 > + > +/* Return ptr but hide its value from the compiler so accesses through it > + cannot be optimized based on the contents. */ > +#define ptr_barrier(ptr) \ > + ({ \ > + __typeof (ptr) __ptr = (ptr); \ > + __asm("" : "+r"(__ptr)); \ > + __ptr; \ > + }) > + > +#endif > diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps > index da7c64942c..07da4ab843 100644 > --- a/sysdeps/aarch64/libm-test-ulps > +++ b/sysdeps/aarch64/libm-test-ulps > @@ -642,7 +642,7 @@ float: 1 > ldouble: 2 > > Function: "cos_advsimd": > -double: 1 > +double: 2 > float: 1 > > Function: "cos_downward": > diff --git a/sysdeps/generic/math_private.h b/sysdeps/generic/math_private.h > index 98b54875f5..3865dcf905 100644 > --- a/sysdeps/generic/math_private.h > +++ b/sysdeps/generic/math_private.h > @@ -193,7 +193,7 @@ do { \ > # undef _Mdouble_ > #endif > > - > +#define NOINLINE __attribute__ ((noinline)) > > /* Prototypes for functions of the IBM Accurate Mathematical Library. */ > extern double __sin (double __x); > diff --git a/sysdeps/ieee754/dbl-64/math_config.h b/sysdeps/ieee754/dbl-64/math_config.h > index 499ca7591b..19af33fd86 100644 > --- a/sysdeps/ieee754/dbl-64/math_config.h > +++ b/sysdeps/ieee754/dbl-64/math_config.h > @@ -148,9 +148,6 @@ make_double (uint64_t x, int64_t ep, uint64_t s) > return asdouble (s + x + (ep << MANTISSA_WIDTH)); > } > > - > -#define NOINLINE __attribute__ ((noinline)) > - > /* Error handling tail calls for special cases, with a sign argument. > The sign of the return value is set if the argument is non-zero. */ > > diff --git a/sysdeps/ieee754/flt-32/math_config.h b/sysdeps/ieee754/flt-32/math_config.h > index a7fb332767..d1b06a1a90 100644 > --- a/sysdeps/ieee754/flt-32/math_config.h > +++ b/sysdeps/ieee754/flt-32/math_config.h > @@ -151,8 +151,6 @@ make_float (uint32_t x, int ep, uint32_t s) > return asfloat (s + x + (ep << MANTISSA_WIDTH)); > } > > -#define NOINLINE __attribute__ ((noinline)) > - > attribute_hidden float __math_oflowf (uint32_t); > attribute_hidden float __math_uflowf (uint32_t); > attribute_hidden float __math_may_uflowf (uint32_t); > -- > 2.27.0 >
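
For context, a minimal sketch (not part of the patch) of how the AdvSIMD entry point added above can be exercised directly: the _ZGVnN2v_cos name and its vector-PCS signature follow from V_NAME_D1 (cos) and VPCS_ATTR in v_math.h, while the standalone prototype, the sample inputs and the -lmvec link step are assumptions made purely for illustration.

/* Hypothetical standalone caller; build with something like
   "gcc test.c -lmvec -lm" on an AArch64 glibc that ships these symbols.  */
#include <arm_neon.h>
#include <stdio.h>

/* Vector-PCS signature as set up by VPCS_ATTR / V_NAME_D1 (cos).  */
__attribute__ ((aarch64_vector_pcs)) float64x2_t _ZGVnN2v_cos (float64x2_t);

int
main (void)
{
  /* Arbitrary sample inputs: 0 and pi/2.  */
  float64x2_t x = { 0.0, 0x1.921fb54442d18p+0 };
  float64x2_t y = _ZGVnN2v_cos (x);
  printf ("%a %a\n", y[0], y[1]);
  return 0;
}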