@@ -18,27 +18,18 @@
<https://www.gnu.org/licenses/>. */
#include "v_math.h"
-#include "poly_advsimd_f32.h"
+#include "v_expm1f_inline.h"
static const struct data
{
- float32x4_t poly[5];
- float invln2_and_ln2[4];
- float32x4_t shift;
- int32x4_t exponent_bias;
+ struct v_expm1f_data d;
#if WANT_SIMD_EXCEPT
uint32x4_t thresh;
#else
float32x4_t oflow_bound;
#endif
} data = {
- /* Generated using fpminimax with degree=5 in [-log(2)/2, log(2)/2]. */
- .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5),
- V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) },
- /* Stores constants: invln2, ln2_hi, ln2_lo, 0. */
- .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 },
- .shift = V4 (0x1.8p23f),
- .exponent_bias = V4 (0x3f800000),
+ .d = V_EXPM1F_DATA,
#if !WANT_SIMD_EXCEPT
/* Value above which expm1f(x) should overflow. Absolute value of the
underflow bound is greater than this, so it catches both cases - there is
@@ -55,67 +46,38 @@ static const struct data
#define TinyBound v_u32 (0x34000000 << 1)
static float32x4_t VPCS_ATTR NOINLINE
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, uint32x4_t special, const struct data *d)
{
- return v_call_f32 (expm1f, x, y, special);
+ return v_call_f32 (
+ expm1f, x, expm1f_inline (v_zerofy_f32 (x, special), &d->d), special);
}
/* Single-precision vector exp(x) - 1 function.
- The maximum error is 1.51 ULP:
- _ZGVnN4v_expm1f (0x1.8baa96p-2) got 0x1.e2fb9p-2
- want 0x1.e2fb94p-2. */
+ The maximum error is 1.62 ULP:
+ _ZGVnN4v_expm1f(0x1.85f83p-2) got 0x1.da9f4p-2
+ want 0x1.da9f44p-2. */
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
{
const struct data *d = ptr_barrier (&data);
- uint32x4_t ix = vreinterpretq_u32_f32 (x);
#if WANT_SIMD_EXCEPT
+ uint32x4_t ix = vreinterpretq_u32_f32 (x);
/* If fp exceptions are to be triggered correctly, fall back to scalar for
|x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for
shift-left by 1, and compare with thresh which was left-shifted offline -
this is effectively an absolute compare. */
uint32x4_t special
= vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh);
- if (__glibc_unlikely (v_any_u32 (special)))
- x = v_zerofy_f32 (x, special);
#else
/* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */
uint32x4_t special = vcagtq_f32 (x, d->oflow_bound);
#endif
- /* Reduce argument to smaller range:
- Let i = round(x / ln2)
- and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
- exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
- where 2^i is exact because i is an integer. */
- float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
- float32x4_t j
- = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift);
- int32x4_t i = vcvtq_s32_f32 (j);
- float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1);
- f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2);
-
- /* Approximate expm1(f) using polynomial.
- Taylor expansion for expm1(x) has the form:
- x + ax^2 + bx^3 + cx^4 ....
- So we calculate the polynomial P(f) = a + bf + cf^2 + ...
- and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
- float32x4_t p = v_horner_4_f32 (f, d->poly);
- p = vfmaq_f32 (f, vmulq_f32 (f, f), p);
-
- /* Assemble the result.
- expm1(x) ~= 2^i * (p + 1) - 1
- Let t = 2^i. */
- int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias);
- float32x4_t t = vreinterpretq_f32_s32 (u);
-
if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (vreinterpretq_f32_u32 (ix),
- vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t),
- special);
+ return special_case (x, special, d);
/* expm1(x) ~= p * t + (t - 1). */
- return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t);
+ return expm1f_inline (x, &d->d);
}
libmvec_hidden_def (V_NAME_F1 (expm1))
HALF_WIDTH_ALIAS_F1 (expm1)
@@ -23,15 +23,13 @@
static const struct data
{
struct v_expm1f_data expm1f_consts;
- uint32x4_t halff;
#if WANT_SIMD_EXCEPT
uint32x4_t tiny_bound, thresh;
#else
- uint32x4_t oflow_bound;
+ float32x4_t oflow_bound;
#endif
} data = {
.expm1f_consts = V_EXPM1F_DATA,
- .halff = V4 (0x3f000000),
#if WANT_SIMD_EXCEPT
/* 0x1.6a09e8p-32, below which expm1f underflows. */
.tiny_bound = V4 (0x2fb504f4),
@@ -39,14 +37,15 @@ static const struct data
.thresh = V4 (0x12fbbbb3),
#else
/* 0x1.61814ep+6, above which expm1f helper overflows. */
- .oflow_bound = V4 (0x42b0c0a7),
+ .oflow_bound = V4 (0x1.61814ep+6),
#endif
};
static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, float32x4_t t, float32x4_t halfsign,
+ uint32x4_t special)
{
- return v_call_f32 (sinhf, x, y, special);
+ return v_call_f32 (sinhf, x, vmulq_f32 (t, halfsign), special);
}
/* Approximation for vector single-precision sinh(x) using expm1.
@@ -60,15 +59,15 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
uint32x4_t ix = vreinterpretq_u32_f32 (x);
float32x4_t ax = vabsq_f32 (x);
- uint32x4_t iax = vreinterpretq_u32_f32 (ax);
- uint32x4_t sign = veorq_u32 (ix, iax);
- float32x4_t halfsign = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->halff));
+ float32x4_t halfsign = vreinterpretq_f32_u32 (
+ vbslq_u32 (v_u32 (0x80000000), ix, vreinterpretq_u32_f32 (v_f32 (0.5))));
#if WANT_SIMD_EXCEPT
- uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, d->tiny_bound), d->thresh);
+ uint32x4_t special = vcgeq_u32 (
+ vsubq_u32 (vreinterpretq_u32_f32 (ax), d->tiny_bound), d->thresh);
ax = v_zerofy_f32 (ax, special);
#else
- uint32x4_t special = vcgeq_u32 (iax, d->oflow_bound);
+ uint32x4_t special = vcageq_f32 (x, d->oflow_bound);
#endif
/* Up to the point that expm1f overflows, we can use it to calculate sinhf
@@ -80,7 +79,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
/* Fall back to the scalar variant for any lanes that should trigger an
exception. */
if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (x, vmulq_f32 (t, halfsign), special);
+ return special_case (x, t, halfsign, special);
return vmulq_f32 (t, halfsign);
}
@@ -28,13 +28,16 @@ static const struct data
/* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */
.boring_bound = V4 (0x41102cb3),
.large_bound = V4 (0x7f800000),
- .onef = V4 (0x3f800000),
};
static float32x4_t NOINLINE VPCS_ATTR
-special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
+special_case (float32x4_t x, uint32x4_t is_boring, float32x4_t boring,
+ float32x4_t q, uint32x4_t special)
{
- return v_call_f32 (tanhf, x, y, special);
+ return v_call_f32 (
+ tanhf, x,
+ vbslq_f32 (is_boring, boring, vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)))),
+ special);
}
/* Approximation for single-precision vector tanh(x), using a simplified
@@ -50,7 +53,9 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
uint32x4_t iax = vreinterpretq_u32_f32 (ax);
uint32x4_t sign = veorq_u32 (ix, iax);
uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound);
- float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->onef));
+ /* expm1 exponent bias is 1.0f reinterpreted to int. */
+ float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (
+ sign, vreinterpretq_u32_s32 (d->expm1f_consts.exponent_bias)));
#if WANT_SIMD_EXCEPT
/* If fp exceptions are to be triggered properly, set all special and boring
@@ -66,10 +71,12 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts);
- float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
+
if (__glibc_unlikely (v_any_u32 (special)))
- return special_case (vreinterpretq_f32_u32 (ix),
- vbslq_f32 (is_boring, boring, y), special);
+ return special_case (vreinterpretq_f32_u32 (ix), is_boring, boring, q,
+ special);
+
+ float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
return vbslq_f32 (is_boring, boring, y);
}
libmvec_hidden_def (V_NAME_F1 (tanh))
@@ -21,48 +21,47 @@
#define AARCH64_FPU_V_EXPM1F_INLINE_H
#include "v_math.h"
-#include "poly_advsimd_f32.h"
+#include "math_config.h"
struct v_expm1f_data
{
- float32x4_t poly[5];
- float invln2_and_ln2[4];
- float32x4_t shift;
+ float32x4_t c0, c2;
int32x4_t exponent_bias;
+ float c1, c3, inv_ln2, c4;
+ float ln2_hi, ln2_lo;
};
/* Coefficients generated using fpminimax with degree=5 in [-log(2)/2,
- log(2)/2]. Exponent bias is asuint(1.0f).
- invln2_and_ln2 Stores constants: invln2, ln2_lo, ln2_hi, 0. */
+ log(2)/2]. Exponent bias is asuint(1.0f). */
#define V_EXPM1F_DATA \
{ \
- .poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5), \
- V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) }, \
- .shift = V4 (0x1.8p23f), .exponent_bias = V4 (0x3f800000), \
- .invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \
+ .c0 = V4 (0x1.fffffep-2), .c1 = 0x1.5554aep-3, .c2 = V4 (0x1.555736p-5), \
+ .c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \
+ .exponent_bias = V4 (0x3f800000), .inv_ln2 = 0x1.715476p+0f, \
+ .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
}
static inline float32x4_t
expm1f_inline (float32x4_t x, const struct v_expm1f_data *d)
{
- /* Helper routine for calculating exp(x) - 1.
- Copied from v_expm1f_1u6.c, with all special-case handling removed - the
- calling routine should handle special values if required. */
+ /* Helper routine for calculating exp(x) - 1. */
+
+ float32x2_t ln2 = vld1_f32 (&d->ln2_hi);
+ float32x4_t lane_consts = vld1q_f32 (&d->c1);
/* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
- float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
- float32x4_t j
- = vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift);
+ float32x4_t j = vrndaq_f32 (vmulq_laneq_f32 (x, lane_consts, 2));
int32x4_t i = vcvtq_s32_f32 (j);
- float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1);
- f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2);
+ float32x4_t f = vfmsq_lane_f32 (x, j, ln2, 0);
+ f = vfmsq_lane_f32 (f, j, ln2, 1);
- /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
- Uses Estrin scheme, where the main _ZGVnN4v_expm1f routine uses
- Horner. */
+ /* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). */
float32x4_t f2 = vmulq_f32 (f, f);
float32x4_t f4 = vmulq_f32 (f2, f2);
- float32x4_t p = v_estrin_4_f32 (f, f2, f4, d->poly);
+ float32x4_t p01 = vfmaq_laneq_f32 (d->c0, f, lane_consts, 0);
+ float32x4_t p23 = vfmaq_laneq_f32 (d->c2, f, lane_consts, 1);
+ float32x4_t p = vfmaq_f32 (p01, f2, p23);
+ p = vfmaq_laneq_f32 (p, f4, lane_consts, 3);
p = vfmaq_f32 (f, f2, p);
/* t = 2^i. */