Message ID | 20231004105809.50464-1-Joe.Ramsay@arm.com |
---|---|
State | New |
Headers | show |
Series | aarch64: Improve vecmath sin routines | expand |
Hi,
> * Update ULP comment reflecting a new observed max in [-pi/2, pi/2]
for the record the corresponding input is x = 0x1.921d5c6a07142p+0,
it would be nice to add it in comment so that people can reproduce
the observed error.
Also, in [-2^23, 2^23], where the new polynomial is used together with
argument reduction, the maximal observed error is 3.224395 ulps for
x = 0x1.5702447b6f17bp+22.
Paul
The 10/04/2023 11:58, Joe Ramsay wrote: > * Update ULP comment reflecting a new observed max in [-pi/2, pi/2] > * Use the same polynomial in AdvSIMD and SVE, rather than FTRIG instructions > * Improve register use near special-case branch > > Also use overloaded intrinsics for SVE. looks good. committed. > --- > Subsumes a patch from August which replaced FTRIG instructions with polynomial. > Thanks, > Joe > sysdeps/aarch64/fpu/sin_advsimd.c | 2 +- > sysdeps/aarch64/fpu/sin_sve.c | 102 +++++++++++++++--------------- > sysdeps/aarch64/fpu/sinf_sve.c | 44 +++++++------ > 3 files changed, 75 insertions(+), 73 deletions(-) > > diff --git a/sysdeps/aarch64/fpu/sin_advsimd.c b/sysdeps/aarch64/fpu/sin_advsimd.c > index 0389b334cc..55644c4cc6 100644 > --- a/sysdeps/aarch64/fpu/sin_advsimd.c > +++ b/sysdeps/aarch64/fpu/sin_advsimd.c > @@ -24,7 +24,7 @@ static const struct data > float64x2_t poly[7]; > float64x2_t range_val, inv_pi, shift, pi_1, pi_2, pi_3; > } data = { > - /* Worst-case error is 2.8 ulp in [-pi/2, pi/2]. */ > + /* Worst-case error is 2.87 ulp in [-pi/2, pi/2]. */ > .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), > V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19), > V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33), > diff --git a/sysdeps/aarch64/fpu/sin_sve.c b/sysdeps/aarch64/fpu/sin_sve.c > index c3f450d0ea..9e7f5ff684 100644 > --- a/sysdeps/aarch64/fpu/sin_sve.c > +++ b/sysdeps/aarch64/fpu/sin_sve.c > @@ -21,20 +21,23 @@ > > static const struct data > { > - double inv_pi, half_pi, inv_pi_over_2, pi_over_2_1, pi_over_2_2, pi_over_2_3, > - shift; > + double inv_pi, pi_1, pi_2, pi_3, shift, range_val; > + double poly[7]; > } data = { > - /* Polynomial coefficients are hard-wired in the FTMAD instruction. */ > + /* Worst-case error is 2.87 ulp in [-pi/2, pi/2]. */ > + .poly = { -0x1.555555555547bp-3, 0x1.1111111108a4dp-7, -0x1.a01a019936f27p-13, > + 0x1.71de37a97d93ep-19, -0x1.ae633919987c6p-26, > + 0x1.60e277ae07cecp-33, -0x1.9e9540300a1p-41, }, > + > .inv_pi = 0x1.45f306dc9c883p-2, > - .half_pi = 0x1.921fb54442d18p+0, > - .inv_pi_over_2 = 0x1.45f306dc9c882p-1, > - .pi_over_2_1 = 0x1.921fb50000000p+0, > - .pi_over_2_2 = 0x1.110b460000000p-26, > - .pi_over_2_3 = 0x1.1a62633145c07p-54, > - .shift = 0x1.8p52 > + .pi_1 = 0x1.921fb54442d18p+1, > + .pi_2 = 0x1.1a62633145c06p-53, > + .pi_3 = 0x1.c1cd129024e09p-106, > + .shift = 0x1.8p52, > + .range_val = 0x1p23, > }; > > -#define RangeVal 0x4160000000000000 /* asuint64 (0x1p23). */ > +#define C(i) sv_f64 (d->poly[i]) > > static svfloat64_t NOINLINE > special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp) > @@ -42,56 +45,53 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp) > return sv_call_f64 (sin, x, y, cmp); > } > > -/* A fast SVE implementation of sin based on trigonometric > - instructions (FTMAD, FTSSEL, FTSMUL). > - Maximum observed error in 2.52 ULP: > - SV_NAME_D1 (sin)(0x1.2d2b00df69661p+19) got 0x1.10ace8f3e786bp-40 > - want 0x1.10ace8f3e7868p-40. */ > +/* A fast SVE implementation of sin. > + Maximum observed error in 3.22 ULP: > + _ZGVsMxv_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3 > + want -0x1.ffe9537d5dbb4p-3. */ > svfloat64_t SV_NAME_D1 (sin) (svfloat64_t x, const svbool_t pg) > { > const struct data *d = ptr_barrier (&data); > > - svfloat64_t r = svabs_f64_x (pg, x); > - svuint64_t sign > - = sveor_u64_x (pg, svreinterpret_u64_f64 (x), svreinterpret_u64_f64 (r)); > - svbool_t cmp = svcmpge_n_u64 (pg, svreinterpret_u64_f64 (r), RangeVal); > + /* Load some values in quad-word chunks to minimise memory access. */ > + const svbool_t ptrue = svptrue_b64 (); > + svfloat64_t shift = sv_f64 (d->shift); > + svfloat64_t inv_pi_and_pi1 = svld1rq (ptrue, &d->inv_pi); > + svfloat64_t pi2_and_pi3 = svld1rq (ptrue, &d->pi_2); > > - /* Load first two pio2-related constants to one vector. */ > - svfloat64_t invpio2_and_pio2_1 > - = svld1rq_f64 (svptrue_b64 (), &d->inv_pi_over_2); > + /* n = rint(|x|/pi). */ > + svfloat64_t n = svmla_lane (shift, x, inv_pi_and_pi1, 0); > + svuint64_t odd = svlsl_x (pg, svreinterpret_u64 (n), 63); > + n = svsub_x (pg, n, shift); > > - /* n = rint(|x|/(pi/2)). */ > - svfloat64_t q = svmla_lane_f64 (sv_f64 (d->shift), r, invpio2_and_pio2_1, 0); > - svfloat64_t n = svsub_n_f64_x (pg, q, d->shift); > + /* r = |x| - n*(pi/2) (range reduction into -pi/2 .. pi/2). */ > + svfloat64_t r = x; > + r = svmls_lane (r, n, inv_pi_and_pi1, 1); > + r = svmls_lane (r, n, pi2_and_pi3, 0); > + r = svmls_lane (r, n, pi2_and_pi3, 1); > > - /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). */ > - r = svmls_lane_f64 (r, n, invpio2_and_pio2_1, 1); > - r = svmls_n_f64_x (pg, r, n, d->pi_over_2_2); > - r = svmls_n_f64_x (pg, r, n, d->pi_over_2_3); > + /* sin(r) poly approx. */ > + svfloat64_t r2 = svmul_x (pg, r, r); > + svfloat64_t r3 = svmul_x (pg, r2, r); > + svfloat64_t r4 = svmul_x (pg, r2, r2); > > - /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */ > - svfloat64_t f = svtssel_f64 (r, svreinterpret_u64_f64 (q)); > + svfloat64_t t1 = svmla_x (pg, C (4), C (5), r2); > + svfloat64_t t2 = svmla_x (pg, C (2), C (3), r2); > + svfloat64_t t3 = svmla_x (pg, C (0), C (1), r2); > > - /* sin(r) poly approx. */ > - svfloat64_t r2 = svtsmul_f64 (r, svreinterpret_u64_f64 (q)); > - svfloat64_t y = sv_f64 (0.0); > - y = svtmad_f64 (y, r2, 7); > - y = svtmad_f64 (y, r2, 6); > - y = svtmad_f64 (y, r2, 5); > - y = svtmad_f64 (y, r2, 4); > - y = svtmad_f64 (y, r2, 3); > - y = svtmad_f64 (y, r2, 2); > - y = svtmad_f64 (y, r2, 1); > - y = svtmad_f64 (y, r2, 0); > - > - /* Apply factor. */ > - y = svmul_f64_x (pg, f, y); > - > - /* sign = y^sign. */ > - y = svreinterpret_f64_u64 ( > - sveor_u64_x (pg, svreinterpret_u64_f64 (y), sign)); > + svfloat64_t y = svmla_x (pg, t1, C (6), r4); > + y = svmla_x (pg, t2, y, r4); > + y = svmla_x (pg, t3, y, r4); > + y = svmla_x (pg, r, y, r3); > > + svbool_t cmp = svacle (pg, x, d->range_val); > + cmp = svnot_z (pg, cmp); > if (__glibc_unlikely (svptest_any (pg, cmp))) > - return special_case (x, y, cmp); > - return y; > + return special_case (x, > + svreinterpret_f64 (sveor_z ( > + svnot_z (pg, cmp), svreinterpret_u64 (y), odd)), > + cmp); > + > + /* Copy sign. */ > + return svreinterpret_f64 (sveor_z (pg, svreinterpret_u64 (y), odd)); > } > diff --git a/sysdeps/aarch64/fpu/sinf_sve.c b/sysdeps/aarch64/fpu/sinf_sve.c > index 4d2ce7a846..590881c14b 100644 > --- a/sysdeps/aarch64/fpu/sinf_sve.c > +++ b/sysdeps/aarch64/fpu/sinf_sve.c > @@ -23,7 +23,7 @@ static const struct data > { > float poly[4]; > /* Pi-related values to be loaded as one quad-word and used with > - svmla_lane_f32. */ > + svmla_lane. */ > float negpi1, negpi2, negpi3, invpi; > float shift; > } data = { > @@ -57,40 +57,42 @@ svfloat32_t SV_NAME_F1 (sin) (svfloat32_t x, const svbool_t pg) > { > const struct data *d = ptr_barrier (&data); > > - svfloat32_t ax = svabs_f32_x (pg, x); > - svuint32_t sign = sveor_u32_x (pg, svreinterpret_u32_f32 (x), > - svreinterpret_u32_f32 (ax)); > - svbool_t cmp = svcmpge_n_u32 (pg, svreinterpret_u32_f32 (ax), RangeVal); > + svfloat32_t ax = svabs_x (pg, x); > + svuint32_t sign > + = sveor_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (ax)); > + svbool_t cmp = svcmpge (pg, svreinterpret_u32 (ax), RangeVal); > > /* pi_vals are a quad-word of helper values - the first 3 elements contain > -pi in extended precision, the last contains 1 / pi. */ > - svfloat32_t pi_vals = svld1rq_f32 (svptrue_b32 (), &d->negpi1); > + svfloat32_t pi_vals = svld1rq (svptrue_b32 (), &d->negpi1); > > /* n = rint(|x|/pi). */ > - svfloat32_t n = svmla_lane_f32 (sv_f32 (d->shift), ax, pi_vals, 3); > - svuint32_t odd = svlsl_n_u32_x (pg, svreinterpret_u32_f32 (n), 31); > - n = svsub_n_f32_x (pg, n, d->shift); > + svfloat32_t n = svmla_lane (sv_f32 (d->shift), ax, pi_vals, 3); > + svuint32_t odd = svlsl_x (pg, svreinterpret_u32 (n), 31); > + n = svsub_x (pg, n, d->shift); > > /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ > svfloat32_t r; > - r = svmla_lane_f32 (ax, n, pi_vals, 0); > - r = svmla_lane_f32 (r, n, pi_vals, 1); > - r = svmla_lane_f32 (r, n, pi_vals, 2); > + r = svmla_lane (ax, n, pi_vals, 0); > + r = svmla_lane (r, n, pi_vals, 1); > + r = svmla_lane (r, n, pi_vals, 2); > > /* sin(r) approx using a degree 9 polynomial from the Taylor series > expansion. Note that only the odd terms of this are non-zero. */ > - svfloat32_t r2 = svmul_f32_x (pg, r, r); > + svfloat32_t r2 = svmul_x (pg, r, r); > svfloat32_t y; > - y = svmla_f32_x (pg, C (2), r2, C (3)); > - y = svmla_f32_x (pg, C (1), r2, y); > - y = svmla_f32_x (pg, C (0), r2, y); > - y = svmla_f32_x (pg, r, r, svmul_f32_x (pg, y, r2)); > + y = svmla_x (pg, C (2), r2, C (3)); > + y = svmla_x (pg, C (1), r2, y); > + y = svmla_x (pg, C (0), r2, y); > + y = svmla_x (pg, r, r, svmul_x (pg, y, r2)); > > /* sign = y^sign^odd. */ > - y = svreinterpret_f32_u32 (sveor_u32_x (pg, svreinterpret_u32_f32 (y), > - sveor_u32_x (pg, sign, odd))); > + sign = sveor_x (pg, sign, odd); > > if (__glibc_unlikely (svptest_any (pg, cmp))) > - return special_case (x, y, cmp); > - return y; > + return special_case (x, > + svreinterpret_f32 (sveor_x ( > + svnot_z (pg, cmp), svreinterpret_u32 (y), sign)), > + cmp); > + return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign)); > } > -- > 2.27.0 >
The 10/05/2023 17:00, Szabolcs Nagy wrote: > The 10/04/2023 11:58, Joe Ramsay wrote: > > * Update ULP comment reflecting a new observed max in [-pi/2, pi/2] > > * Use the same polynomial in AdvSIMD and SVE, rather than FTRIG instructions > > * Improve register use near special-case branch > > > > Also use overloaded intrinsics for SVE. > > looks good. committed. sorry, not this one, but v2.
diff --git a/sysdeps/aarch64/fpu/sin_advsimd.c b/sysdeps/aarch64/fpu/sin_advsimd.c index 0389b334cc..55644c4cc6 100644 --- a/sysdeps/aarch64/fpu/sin_advsimd.c +++ b/sysdeps/aarch64/fpu/sin_advsimd.c @@ -24,7 +24,7 @@ static const struct data float64x2_t poly[7]; float64x2_t range_val, inv_pi, shift, pi_1, pi_2, pi_3; } data = { - /* Worst-case error is 2.8 ulp in [-pi/2, pi/2]. */ + /* Worst-case error is 2.87 ulp in [-pi/2, pi/2]. */ .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19), V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33), diff --git a/sysdeps/aarch64/fpu/sin_sve.c b/sysdeps/aarch64/fpu/sin_sve.c index c3f450d0ea..9e7f5ff684 100644 --- a/sysdeps/aarch64/fpu/sin_sve.c +++ b/sysdeps/aarch64/fpu/sin_sve.c @@ -21,20 +21,23 @@ static const struct data { - double inv_pi, half_pi, inv_pi_over_2, pi_over_2_1, pi_over_2_2, pi_over_2_3, - shift; + double inv_pi, pi_1, pi_2, pi_3, shift, range_val; + double poly[7]; } data = { - /* Polynomial coefficients are hard-wired in the FTMAD instruction. */ + /* Worst-case error is 2.87 ulp in [-pi/2, pi/2]. */ + .poly = { -0x1.555555555547bp-3, 0x1.1111111108a4dp-7, -0x1.a01a019936f27p-13, + 0x1.71de37a97d93ep-19, -0x1.ae633919987c6p-26, + 0x1.60e277ae07cecp-33, -0x1.9e9540300a1p-41, }, + .inv_pi = 0x1.45f306dc9c883p-2, - .half_pi = 0x1.921fb54442d18p+0, - .inv_pi_over_2 = 0x1.45f306dc9c882p-1, - .pi_over_2_1 = 0x1.921fb50000000p+0, - .pi_over_2_2 = 0x1.110b460000000p-26, - .pi_over_2_3 = 0x1.1a62633145c07p-54, - .shift = 0x1.8p52 + .pi_1 = 0x1.921fb54442d18p+1, + .pi_2 = 0x1.1a62633145c06p-53, + .pi_3 = 0x1.c1cd129024e09p-106, + .shift = 0x1.8p52, + .range_val = 0x1p23, }; -#define RangeVal 0x4160000000000000 /* asuint64 (0x1p23). */ +#define C(i) sv_f64 (d->poly[i]) static svfloat64_t NOINLINE special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp) @@ -42,56 +45,53 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp) return sv_call_f64 (sin, x, y, cmp); } -/* A fast SVE implementation of sin based on trigonometric - instructions (FTMAD, FTSSEL, FTSMUL). - Maximum observed error in 2.52 ULP: - SV_NAME_D1 (sin)(0x1.2d2b00df69661p+19) got 0x1.10ace8f3e786bp-40 - want 0x1.10ace8f3e7868p-40. */ +/* A fast SVE implementation of sin. + Maximum observed error in 3.22 ULP: + _ZGVsMxv_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3 + want -0x1.ffe9537d5dbb4p-3. */ svfloat64_t SV_NAME_D1 (sin) (svfloat64_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); - svfloat64_t r = svabs_f64_x (pg, x); - svuint64_t sign - = sveor_u64_x (pg, svreinterpret_u64_f64 (x), svreinterpret_u64_f64 (r)); - svbool_t cmp = svcmpge_n_u64 (pg, svreinterpret_u64_f64 (r), RangeVal); + /* Load some values in quad-word chunks to minimise memory access. */ + const svbool_t ptrue = svptrue_b64 (); + svfloat64_t shift = sv_f64 (d->shift); + svfloat64_t inv_pi_and_pi1 = svld1rq (ptrue, &d->inv_pi); + svfloat64_t pi2_and_pi3 = svld1rq (ptrue, &d->pi_2); - /* Load first two pio2-related constants to one vector. */ - svfloat64_t invpio2_and_pio2_1 - = svld1rq_f64 (svptrue_b64 (), &d->inv_pi_over_2); + /* n = rint(|x|/pi). */ + svfloat64_t n = svmla_lane (shift, x, inv_pi_and_pi1, 0); + svuint64_t odd = svlsl_x (pg, svreinterpret_u64 (n), 63); + n = svsub_x (pg, n, shift); - /* n = rint(|x|/(pi/2)). */ - svfloat64_t q = svmla_lane_f64 (sv_f64 (d->shift), r, invpio2_and_pio2_1, 0); - svfloat64_t n = svsub_n_f64_x (pg, q, d->shift); + /* r = |x| - n*(pi/2) (range reduction into -pi/2 .. pi/2). */ + svfloat64_t r = x; + r = svmls_lane (r, n, inv_pi_and_pi1, 1); + r = svmls_lane (r, n, pi2_and_pi3, 0); + r = svmls_lane (r, n, pi2_and_pi3, 1); - /* r = |x| - n*(pi/2) (range reduction into -pi/4 .. pi/4). */ - r = svmls_lane_f64 (r, n, invpio2_and_pio2_1, 1); - r = svmls_n_f64_x (pg, r, n, d->pi_over_2_2); - r = svmls_n_f64_x (pg, r, n, d->pi_over_2_3); + /* sin(r) poly approx. */ + svfloat64_t r2 = svmul_x (pg, r, r); + svfloat64_t r3 = svmul_x (pg, r2, r); + svfloat64_t r4 = svmul_x (pg, r2, r2); - /* Final multiplicative factor: 1.0 or x depending on bit #0 of q. */ - svfloat64_t f = svtssel_f64 (r, svreinterpret_u64_f64 (q)); + svfloat64_t t1 = svmla_x (pg, C (4), C (5), r2); + svfloat64_t t2 = svmla_x (pg, C (2), C (3), r2); + svfloat64_t t3 = svmla_x (pg, C (0), C (1), r2); - /* sin(r) poly approx. */ - svfloat64_t r2 = svtsmul_f64 (r, svreinterpret_u64_f64 (q)); - svfloat64_t y = sv_f64 (0.0); - y = svtmad_f64 (y, r2, 7); - y = svtmad_f64 (y, r2, 6); - y = svtmad_f64 (y, r2, 5); - y = svtmad_f64 (y, r2, 4); - y = svtmad_f64 (y, r2, 3); - y = svtmad_f64 (y, r2, 2); - y = svtmad_f64 (y, r2, 1); - y = svtmad_f64 (y, r2, 0); - - /* Apply factor. */ - y = svmul_f64_x (pg, f, y); - - /* sign = y^sign. */ - y = svreinterpret_f64_u64 ( - sveor_u64_x (pg, svreinterpret_u64_f64 (y), sign)); + svfloat64_t y = svmla_x (pg, t1, C (6), r4); + y = svmla_x (pg, t2, y, r4); + y = svmla_x (pg, t3, y, r4); + y = svmla_x (pg, r, y, r3); + svbool_t cmp = svacle (pg, x, d->range_val); + cmp = svnot_z (pg, cmp); if (__glibc_unlikely (svptest_any (pg, cmp))) - return special_case (x, y, cmp); - return y; + return special_case (x, + svreinterpret_f64 (sveor_z ( + svnot_z (pg, cmp), svreinterpret_u64 (y), odd)), + cmp); + + /* Copy sign. */ + return svreinterpret_f64 (sveor_z (pg, svreinterpret_u64 (y), odd)); } diff --git a/sysdeps/aarch64/fpu/sinf_sve.c b/sysdeps/aarch64/fpu/sinf_sve.c index 4d2ce7a846..590881c14b 100644 --- a/sysdeps/aarch64/fpu/sinf_sve.c +++ b/sysdeps/aarch64/fpu/sinf_sve.c @@ -23,7 +23,7 @@ static const struct data { float poly[4]; /* Pi-related values to be loaded as one quad-word and used with - svmla_lane_f32. */ + svmla_lane. */ float negpi1, negpi2, negpi3, invpi; float shift; } data = { @@ -57,40 +57,42 @@ svfloat32_t SV_NAME_F1 (sin) (svfloat32_t x, const svbool_t pg) { const struct data *d = ptr_barrier (&data); - svfloat32_t ax = svabs_f32_x (pg, x); - svuint32_t sign = sveor_u32_x (pg, svreinterpret_u32_f32 (x), - svreinterpret_u32_f32 (ax)); - svbool_t cmp = svcmpge_n_u32 (pg, svreinterpret_u32_f32 (ax), RangeVal); + svfloat32_t ax = svabs_x (pg, x); + svuint32_t sign + = sveor_x (pg, svreinterpret_u32 (x), svreinterpret_u32 (ax)); + svbool_t cmp = svcmpge (pg, svreinterpret_u32 (ax), RangeVal); /* pi_vals are a quad-word of helper values - the first 3 elements contain -pi in extended precision, the last contains 1 / pi. */ - svfloat32_t pi_vals = svld1rq_f32 (svptrue_b32 (), &d->negpi1); + svfloat32_t pi_vals = svld1rq (svptrue_b32 (), &d->negpi1); /* n = rint(|x|/pi). */ - svfloat32_t n = svmla_lane_f32 (sv_f32 (d->shift), ax, pi_vals, 3); - svuint32_t odd = svlsl_n_u32_x (pg, svreinterpret_u32_f32 (n), 31); - n = svsub_n_f32_x (pg, n, d->shift); + svfloat32_t n = svmla_lane (sv_f32 (d->shift), ax, pi_vals, 3); + svuint32_t odd = svlsl_x (pg, svreinterpret_u32 (n), 31); + n = svsub_x (pg, n, d->shift); /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */ svfloat32_t r; - r = svmla_lane_f32 (ax, n, pi_vals, 0); - r = svmla_lane_f32 (r, n, pi_vals, 1); - r = svmla_lane_f32 (r, n, pi_vals, 2); + r = svmla_lane (ax, n, pi_vals, 0); + r = svmla_lane (r, n, pi_vals, 1); + r = svmla_lane (r, n, pi_vals, 2); /* sin(r) approx using a degree 9 polynomial from the Taylor series expansion. Note that only the odd terms of this are non-zero. */ - svfloat32_t r2 = svmul_f32_x (pg, r, r); + svfloat32_t r2 = svmul_x (pg, r, r); svfloat32_t y; - y = svmla_f32_x (pg, C (2), r2, C (3)); - y = svmla_f32_x (pg, C (1), r2, y); - y = svmla_f32_x (pg, C (0), r2, y); - y = svmla_f32_x (pg, r, r, svmul_f32_x (pg, y, r2)); + y = svmla_x (pg, C (2), r2, C (3)); + y = svmla_x (pg, C (1), r2, y); + y = svmla_x (pg, C (0), r2, y); + y = svmla_x (pg, r, r, svmul_x (pg, y, r2)); /* sign = y^sign^odd. */ - y = svreinterpret_f32_u32 (sveor_u32_x (pg, svreinterpret_u32_f32 (y), - sveor_u32_x (pg, sign, odd))); + sign = sveor_x (pg, sign, odd); if (__glibc_unlikely (svptest_any (pg, cmp))) - return special_case (x, y, cmp); - return y; + return special_case (x, + svreinterpret_f32 (sveor_x ( + svnot_z (pg, cmp), svreinterpret_u32 (y), sign)), + cmp); + return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign)); }