diff mbox series

[v4,4/4] aarch64: Add vector implementations of exp routines

Message ID 20230623152423.1683-4-Joe.Ramsay@arm.com
State New
Headers show
Series [v3,1/4] aarch64: Add vector implementations of cos routines | expand

Commit Message

Joe Ramsay June 23, 2023, 3:24 p.m. UTC
Optimised implementations for single and double precision, Advanced
SIMD and SVE, copied from Arm Optimized Routines.

As previously, data tables are marked volatile or used via pointers to
prevent overly aggressive constant inlining. Special-case handlers are
marked NOINLINE to avoid incurring the penalty of switching call
standards unnecessarily.
---
Changes to v3:
 * Mark SVE data tables as const, and use new barrier
Thanks,
Joe
 sysdeps/aarch64/fpu/Makefile                  |   4 +-
 sysdeps/aarch64/fpu/Versions                  |   4 +
 sysdeps/aarch64/fpu/bits/math-vector.h        |   4 +
 sysdeps/aarch64/fpu/exp_advsimd.c             | 136 +++++++++++++++++
 sysdeps/aarch64/fpu/exp_sve.c                 | 142 ++++++++++++++++++
 sysdeps/aarch64/fpu/expf_advsimd.c            | 132 ++++++++++++++++
 sysdeps/aarch64/fpu/expf_sve.c                |  90 +++++++++++
 .../fpu/test-double-advsimd-wrappers.c        |   1 +
 .../aarch64/fpu/test-double-sve-wrappers.c    |   1 +
 .../aarch64/fpu/test-float-advsimd-wrappers.c |   1 +
 sysdeps/aarch64/fpu/test-float-sve-wrappers.c |   1 +
 sysdeps/aarch64/fpu/v_exp_data.c              |  66 ++++++++
 sysdeps/aarch64/fpu/vecmath_config.h          |   3 +
 sysdeps/aarch64/libm-test-ulps                |   8 +
 .../unix/sysv/linux/aarch64/libmvec.abilist   |   4 +
 15 files changed, 596 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/aarch64/fpu/exp_advsimd.c
 create mode 100644 sysdeps/aarch64/fpu/exp_sve.c
 create mode 100644 sysdeps/aarch64/fpu/expf_advsimd.c
 create mode 100644 sysdeps/aarch64/fpu/expf_sve.c
 create mode 100644 sysdeps/aarch64/fpu/v_exp_data.c
diff mbox series

Patch

diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index cc90c4cb75..04aa2e37ca 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -1,4 +1,5 @@ 
 libmvec-supported-funcs = cos \
+                          exp \
                           log \
                           sin
 
@@ -12,7 +13,8 @@  libmvec-support = $(addsuffix f_advsimd,$(float-advsimd-funcs)) \
                   $(addsuffix _advsimd,$(double-advsimd-funcs)) \
                   $(addsuffix f_sve,$(float-sve-funcs)) \
                   $(addsuffix _sve,$(double-sve-funcs)) \
-                  v_log_data
+                  v_log_data \
+                  v_exp_data
 endif
 
 sve-cflags = -march=armv8-a+sve
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index 902446f40d..c85c0f3efb 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -1,13 +1,17 @@ 
 libmvec {
   GLIBC_2.38 {
     _ZGVnN2v_cos;
+    _ZGVnN2v_exp;
     _ZGVnN2v_log;
     _ZGVnN2v_sin;
     _ZGVnN4v_cosf;
+    _ZGVnN4v_expf;
     _ZGVnN4v_logf;
     _ZGVnN4v_sinf;
     _ZGVsMxv_cos;
     _ZGVsMxv_cosf;
+    _ZGVsMxv_exp;
+    _ZGVsMxv_expf;
     _ZGVsMxv_log;
     _ZGVsMxv_logf;
     _ZGVsMxv_sin;
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index 70c737338e..7c200599c1 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -50,10 +50,12 @@  typedef __SVBool_t __sv_bool_t;
 #  define __vpcs __attribute__ ((__aarch64_vector_pcs__))
 
 __vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t);
 __vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
 
 __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
 __vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t);
 __vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t);
 
@@ -63,10 +65,12 @@  __vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t);
 #ifdef __SVE_VEC_MATH_SUPPORTED
 
 __sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_expf (__sv_f32_t, __sv_bool_t);
 __sv_f32_t _ZGVsMxv_logf (__sv_f32_t, __sv_bool_t);
 __sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t);
 
 __sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_exp (__sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxv_log (__sv_f64_t, __sv_bool_t);
 __sv_f64_t _ZGVsMxv_sin (__sv_f64_t, __sv_bool_t);
 
diff --git a/sysdeps/aarch64/fpu/exp_advsimd.c b/sysdeps/aarch64/fpu/exp_advsimd.c
new file mode 100644
index 0000000000..022e717cb3
--- /dev/null
+++ b/sysdeps/aarch64/fpu/exp_advsimd.c
@@ -0,0 +1,136 @@ 
+/* Double-precision vector (Advanced SIMD) exp function.
+
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "v_math.h"
+
+#define N (1 << V_EXP_TABLE_BITS)
+#define IndexMask (N - 1)
+
+const static volatile struct
+{
+  float64x2_t poly[3];
+  float64x2_t inv_ln2, ln2_hi, ln2_lo, shift;
+#if !WANT_SIMD_EXCEPT
+  float64x2_t special_bound, scale_thresh;
+#endif
+} data = {
+  /* maxerr: 1.88 +0.5 ulp
+     rel error: 1.4337*2^-53
+     abs error: 1.4299*2^-53 in [ -ln2/256, ln2/256 ].  */
+  .poly = { V2 (0x1.ffffffffffd43p-2), V2 (0x1.55555c75adbb2p-3),
+	    V2 (0x1.55555da646206p-5) },
+#if !WANT_SIMD_EXCEPT
+  .scale_thresh = V2 (163840.0), /* 1280.0 * N.  */
+  .special_bound = V2 (704.0),
+#endif
+  .inv_ln2 = V2 (0x1.71547652b82fep7), /* N/ln2.  */
+  .ln2_hi = V2 (0x1.62e42fefa39efp-8), /* ln2/N.  */
+  .ln2_lo = V2 (0x1.abc9e3b39803f3p-63),
+  .shift = V2 (0x1.8p+52)
+};
+
+#define C(i) data.poly[i]
+#define Tab __v_exp_data
+
+#if WANT_SIMD_EXCEPT
+
+# define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511).  */
+# define BigBound v_u64 (0x4080000000000000) /* asuint64 (0x1p9).  */
+# define SpecialBound v_u64 (0x2080000000000000) /* BigBound - TinyBound.  */
+
+static inline float64x2_t VPCS_ATTR
+special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
+{
+  /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+     routine to special lanes.  */
+  return v_call_f64 (exp, x, y, cmp);
+}
+
+#else
+
+# define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513.  */
+/* SpecialBias1 + SpecialBias1 = asuint(1.0).  */
+# define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769.  */
+# define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254.  */
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t s, float64x2_t y, float64x2_t n)
+{
+  /* 2^(n/N) may overflow, break it up into s1*s2.  */
+  uint64x2_t b = vandq_u64 (vcltzq_f64 (n), SpecialOffset);
+  float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b));
+  float64x2_t s2 = vreinterpretq_f64_u64 (
+      vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b));
+  uint64x2_t cmp = vcagtq_f64 (n, data.scale_thresh);
+  float64x2_t r1 = vmulq_f64 (s1, s1);
+  float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1);
+  return vbslq_f64 (cmp, r1, r0);
+}
+
+#endif
+
+float64x2_t VPCS_ATTR V_NAME_D1 (exp) (float64x2_t x)
+{
+  float64x2_t n, r, r2, s, y, z;
+  uint64x2_t cmp, u, e;
+
+#if WANT_SIMD_EXCEPT
+  /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+     special_case to fix special lanes later. This is only necessary if fenv
+     exceptions are to be triggered correctly.  */
+  float64x2_t xm = x;
+  uint64x2_t iax = vreinterpretq_u64_f64 (vabsq_f64 (x));
+  cmp = vcgeq_u64 (vsubq_u64 (iax, TinyBound), SpecialBound);
+  if (__glibc_unlikely (v_any_u64 (cmp)))
+    x = vbslq_f64 (cmp, v_f64 (1), x);
+#else
+  cmp = vcagtq_f64 (x, data.special_bound);
+#endif
+
+  /* n = round(x/(ln2/N)).  */
+  z = vfmaq_f64 (data.shift, x, data.inv_ln2);
+  u = vreinterpretq_u64_f64 (z);
+  n = vsubq_f64 (z, data.shift);
+
+  /* r = x - n*ln2/N.  */
+  r = x;
+  r = vfmsq_f64 (r, data.ln2_hi, n);
+  r = vfmsq_f64 (r, data.ln2_lo, n);
+
+  e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS);
+
+  /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4.  */
+  r2 = vmulq_f64 (r, r);
+  y = vfmaq_f64 (C (0), C (1), r);
+  y = vfmaq_f64 (y, C (2), r2);
+  y = vfmaq_f64 (r, y, r2);
+
+  /* s = 2^(n/N).  */
+  u = (uint64x2_t){ Tab[u[0] & IndexMask], Tab[u[1] & IndexMask] };
+  s = vreinterpretq_f64_u64 (vaddq_u64 (u, e));
+
+  if (__glibc_unlikely (v_any_u64 (cmp)))
+#if WANT_SIMD_EXCEPT
+    return special_case (xm, vfmaq_f64 (s, y, s), cmp);
+#else
+    return special_case (s, y, n);
+#endif
+
+  return vfmaq_f64 (s, y, s);
+}
diff --git a/sysdeps/aarch64/fpu/exp_sve.c b/sysdeps/aarch64/fpu/exp_sve.c
new file mode 100644
index 0000000000..ec0932bd70
--- /dev/null
+++ b/sysdeps/aarch64/fpu/exp_sve.c
@@ -0,0 +1,142 @@ 
+/* Double-precision vector (SVE) exp function.
+
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "sv_math.h"
+
+static const struct data
+{
+  double poly[4];
+  double ln2_hi, ln2_lo, inv_ln2, shift, thres;
+} data = {
+  .poly = { /* ulp error: 0.53.  */
+	    0x1.fffffffffdbcdp-2, 0x1.555555555444cp-3, 0x1.555573c6a9f7dp-5,
+	    0x1.1111266d28935p-7 },
+  .ln2_hi = 0x1.62e42fefa3800p-1,
+  .ln2_lo = 0x1.ef35793c76730p-45,
+  /* 1/ln2.  */
+  .inv_ln2 = 0x1.71547652b82fep+0,
+  /* 1.5*2^46+1023. This value is further explained below.  */
+  .shift = 0x1.800000000ffc0p+46,
+  .thres = 704.0,
+};
+
+#define C(i) sv_f64 (d->poly[i])
+#define SpecialOffset 0x6000000000000000 /* 0x1p513.  */
+/* SpecialBias1 + SpecialBias1 = asuint(1.0).  */
+#define SpecialBias1 0x7000000000000000 /* 0x1p769.  */
+#define SpecialBias2 0x3010000000000000 /* 0x1p-254.  */
+
+/* Update of both special and non-special cases, if any special case is
+   detected.  */
+static inline svfloat64_t
+special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n)
+{
+  /* s=2^n may overflow, break it up into s=s1*s2,
+     such that exp = s + s*y can be computed as s1*(s2+s2*y)
+     and s1*s1 overflows only if n>0.  */
+
+  /* If n<=0 then set b to 0x6, 0 otherwise.  */
+  svbool_t p_sign = svcmple_n_f64 (pg, n, 0.0); /* n <= 0.  */
+  svuint64_t b
+      = svdup_n_u64_z (p_sign, SpecialOffset); /* Inactive lanes set to 0.  */
+
+  /* Set s1 to generate overflow depending on sign of exponent n.  */
+  svfloat64_t s1 = svreinterpret_f64_u64 (
+      svsubr_n_u64_x (pg, b, SpecialBias1)); /* 0x70...0 - b.  */
+  /* Offset s to avoid overflow in final result if n is below threshold.  */
+  svfloat64_t s2 = svreinterpret_f64_u64 (svadd_u64_x (
+      pg, svsub_n_u64_x (pg, svreinterpret_u64_f64 (s), SpecialBias2),
+      b)); /* as_u64 (s) - 0x3010...0 + b.  */
+
+  /* |n| > 1280 => 2^(n) overflows.  */
+  svbool_t p_cmp = svacgt_n_f64 (pg, n, 1280.0);
+
+  svfloat64_t r1 = svmul_f64_x (pg, s1, s1);
+  svfloat64_t r2 = svmla_f64_x (pg, s2, s2, y);
+  svfloat64_t r0 = svmul_f64_x (pg, r2, s1);
+
+  return svsel_f64 (p_cmp, r1, r0);
+}
+
+/* SVE exp algorithm. Maximum measured error is 1.01ulps:
+   SV_NAME_D1 (exp)(0x1.4619d7b04da41p+6) got 0x1.885d9acc41da7p+117
+					 want 0x1.885d9acc41da6p+117.  */
+svfloat64_t SV_NAME_D1 (exp) (svfloat64_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  svbool_t special = svacgt_n_f64 (pg, x, d->thres);
+
+  /* Use a modifed version of the shift used for flooring, such that x/ln2 is
+     rounded to a multiple of 2^-6=1/64, shift = 1.5 * 2^52 * 2^-6 = 1.5 *
+     2^46.
+
+     n is not an integer but can be written as n = m + i/64, with i and m
+     integer, 0 <= i < 64 and m <= n.
+
+     Bits 5:0 of z will be null every time x/ln2 reaches a new integer value
+     (n=m, i=0), and is incremented every time z (or n) is incremented by 1/64.
+     FEXPA expects i in bits 5:0 of the input so it can be used as index into
+     FEXPA hardwired table T[i] = 2^(i/64) for i = 0:63, that will in turn
+     populate the mantissa of the output. Therefore, we use u=asuint(z) as
+     input to FEXPA.
+
+     We add 1023 to the modified shift value in order to set bits 16:6 of u to
+     1, such that once these bits are moved to the exponent of the output of
+     FEXPA, we get the exponent of 2^n right, i.e. we get 2^m.  */
+  svfloat64_t z = svmla_n_f64_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
+  svuint64_t u = svreinterpret_u64_f64 (z);
+  svfloat64_t n = svsub_n_f64_x (pg, z, d->shift);
+
+  /* r = x - n * ln2, r is in [-ln2/(2N), ln2/(2N)].  */
+  svfloat64_t ln2 = svld1rq_f64 (svptrue_b64 (), &d->ln2_hi);
+  svfloat64_t r = svmls_lane_f64 (x, n, ln2, 0);
+  r = svmls_lane_f64 (r, n, ln2, 1);
+
+  /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5.  */
+  svfloat64_t r2 = svmul_f64_x (pg, r, r);
+  svfloat64_t p01 = svmla_f64_x (pg, C (0), C (1), r);
+  svfloat64_t p23 = svmla_f64_x (pg, C (2), C (3), r);
+  svfloat64_t p04 = svmla_f64_x (pg, p01, p23, r2);
+  svfloat64_t y = svmla_f64_x (pg, r, p04, r2);
+
+  /* s = 2^n, computed using FEXPA. FEXPA does not propagate NaNs, so for
+     consistent NaN handling we have to manually propagate them. This comes at
+     significant performance cost.  */
+  svfloat64_t s = svexpa_f64 (u);
+
+  /* Assemble result as exp(x) = 2^n * exp(r).  If |x| > Thresh the
+     multiplication may overflow, so use special case routine.  */
+
+  if (__glibc_unlikely (svptest_any (pg, special)))
+    {
+      /* FEXPA zeroes the sign bit, however the sign is meaningful to the
+	 special case function so needs to be copied.
+	 e = sign bit of u << 46.  */
+      svuint64_t e
+	  = svand_n_u64_x (pg, svlsl_n_u64_x (pg, u, 46), 0x8000000000000000);
+      /* Copy sign to s.  */
+      s = svreinterpret_f64_u64 (
+	  svadd_u64_x (pg, e, svreinterpret_u64_f64 (s)));
+      return special_case (pg, s, y, n);
+    }
+
+  /* No special case.  */
+  return svmla_f64_x (pg, s, s, y);
+}
diff --git a/sysdeps/aarch64/fpu/expf_advsimd.c b/sysdeps/aarch64/fpu/expf_advsimd.c
new file mode 100644
index 0000000000..b1db1f0d03
--- /dev/null
+++ b/sysdeps/aarch64/fpu/expf_advsimd.c
@@ -0,0 +1,132 @@ 
+/* Single-precision vector (Advanced SIMD) exp function.
+
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "v_math.h"
+
+static const volatile struct
+{
+  float32x4_t poly[5];
+  float32x4_t shift, inv_ln2, ln2_hi, ln2_lo;
+  uint32x4_t exponent_bias;
+#if !WANT_SIMD_EXCEPT
+  float32x4_t special_bound, scale_thresh;
+#endif
+} data = {
+  /* maxerr: 1.45358 +0.5 ulp.  */
+  .poly = { V4 (0x1.0e4020p-7f), V4 (0x1.573e2ep-5f), V4 (0x1.555e66p-3f),
+	    V4 (0x1.fffdb6p-2f), V4 (0x1.ffffecp-1f) },
+  .shift = V4 (0x1.8p23f),
+  .inv_ln2 = V4 (0x1.715476p+0f),
+  .ln2_hi = V4 (0x1.62e4p-1f),
+  .ln2_lo = V4 (0x1.7f7d1cp-20f),
+  .exponent_bias = V4 (0x3f800000),
+#if !WANT_SIMD_EXCEPT
+  .special_bound = V4 (126.0f),
+  .scale_thresh = V4 (192.0f),
+#endif
+};
+
+#define C(i) data.poly[i]
+
+#if WANT_SIMD_EXCEPT
+
+# define TinyBound v_u32 (0x20000000)	/* asuint (0x1p-63).  */
+# define BigBound v_u32 (0x42800000)	/* asuint (0x1p6).  */
+# define SpecialBound v_u32 (0x22800000) /* BigBound - TinyBound.  */
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp)
+{
+  /* If fenv exceptions are to be triggered correctly, fall back to the scalar
+     routine to special lanes.  */
+  return v_call_f32 (expf, x, y, cmp);
+}
+
+#else
+
+# define SpecialOffset v_u32 (0x82000000)
+# define SpecialBias v_u32 (0x7f000000)
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
+	      float32x4_t scale)
+{
+  /* 2^n may overflow, break it up into s1*s2.  */
+  uint32x4_t b = vandq_u32 (vclezq_f32 (n), SpecialOffset);
+  float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, SpecialBias));
+  float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
+  uint32x4_t cmp2 = vcagtq_f32 (n, data.scale_thresh);
+  float32x4_t r2 = vmulq_f32 (s1, s1);
+  float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
+  /* Similar to r1 but avoids double rounding in the subnormal range.  */
+  float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
+  float32x4_t r = vbslq_f32 (cmp1, r1, r0);
+  return vbslq_f32 (cmp2, r2, r);
+}
+
+#endif
+
+float32x4_t VPCS_ATTR V_NAME_F1 (exp) (float32x4_t x)
+{
+  float32x4_t n, r, r2, scale, p, q, poly, z;
+  uint32x4_t cmp, e;
+
+#if WANT_SIMD_EXCEPT
+  /* asuint(x) - TinyBound >= BigBound - TinyBound.  */
+  cmp = vcgeq_u32 (
+      vsubq_u32 (vandq_u32 (vreinterpretq_u32_f32 (x), v_u32 (0x7fffffff)),
+		 TinyBound),
+      SpecialBound);
+  float32x4_t xm = x;
+  /* If any lanes are special, mask them with 1 and retain a copy of x to allow
+     special case handler to fix special lanes later. This is only necessary if
+     fenv exceptions are to be triggered correctly.  */
+  if (__glibc_unlikely (v_any_u32 (cmp)))
+    x = vbslq_f32 (cmp, v_f32 (1), x);
+#endif
+
+  /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+     x = ln2*n + r, with r in [-ln2/2, ln2/2].  */
+  z = vfmaq_f32 (data.shift, x, data.inv_ln2);
+  n = vsubq_f32 (z, data.shift);
+  r = vfmsq_f32 (x, n, data.ln2_hi);
+  r = vfmsq_f32 (r, n, data.ln2_lo);
+  e = vshlq_n_u32 (vreinterpretq_u32_f32 (z), 23);
+  scale = vreinterpretq_f32_u32 (vaddq_u32 (e, data.exponent_bias));
+
+#if !WANT_SIMD_EXCEPT
+  cmp = vcagtq_f32 (n, data.special_bound);
+#endif
+
+  r2 = vmulq_f32 (r, r);
+  p = vfmaq_f32 (C (1), C (0), r);
+  q = vfmaq_f32 (C (3), C (2), r);
+  q = vfmaq_f32 (q, p, r2);
+  p = vmulq_f32 (C (4), r);
+  poly = vfmaq_f32 (p, q, r2);
+
+  if (__glibc_unlikely (v_any_u32 (cmp)))
+#if WANT_SIMD_EXCEPT
+    return special_case (xm, vfmaq_f32 (scale, poly, scale), cmp);
+#else
+    return special_case (poly, n, e, cmp, scale);
+#endif
+
+  return vfmaq_f32 (scale, poly, scale);
+}
diff --git a/sysdeps/aarch64/fpu/expf_sve.c b/sysdeps/aarch64/fpu/expf_sve.c
new file mode 100644
index 0000000000..03f2e086ed
--- /dev/null
+++ b/sysdeps/aarch64/fpu/expf_sve.c
@@ -0,0 +1,90 @@ 
+/* Single-precision vector (SVE) exp function.
+
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "sv_math.h"
+
+static const struct data
+{
+  float poly[5];
+  float inv_ln2, ln2_hi, ln2_lo, shift, thres;
+} data = {
+  /* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
+     compatibility with polynomial helpers.  */
+  .poly = { 0x1.ffffecp-1f, 0x1.fffdb6p-2f, 0x1.555e66p-3f, 0x1.573e2ep-5f,
+	    0x1.0e4020p-7f },
+  .inv_ln2 = 0x1.715476p+0f,
+  .ln2_hi = 0x1.62e4p-1f,
+  .ln2_lo = 0x1.7f7d1cp-20f,
+  /* 1.5*2^17 + 127.  */
+  .shift = 0x1.903f8p17f,
+  /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
+     correctly by FEXPA.  */
+  .thres = 0x1.5d5e2ap+6f,
+};
+
+#define C(i) sv_f32 (d->poly[i])
+#define ExponentBias 0x3f800000
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
+{
+  return sv_call_f32 (expf, x, y, special);
+}
+
+/* Optimised single-precision SVE exp function.
+   Worst-case error is 1.04 ulp:
+   SV_NAME_F1 (exp)(0x1.a8eda4p+1) got 0x1.ba74bcp+4
+				  want 0x1.ba74bap+4.  */
+svfloat32_t SV_NAME_F1 (exp) (svfloat32_t x, const svbool_t pg)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
+     x = ln2*n + r, with r in [-ln2/2, ln2/2].  */
+
+  /* Load some constants in quad-word chunks to minimise memory access (last
+     lane is wasted).  */
+  svfloat32_t invln2_and_ln2 = svld1rq_f32 (svptrue_b32 (), &d->inv_ln2);
+
+  /* n = round(x/(ln2/N)).  */
+  svfloat32_t z = svmla_lane_f32 (sv_f32 (d->shift), x, invln2_and_ln2, 0);
+  svfloat32_t n = svsub_n_f32_x (pg, z, d->shift);
+
+  /* r = x - n*ln2/N.  */
+  svfloat32_t r = svmls_lane_f32 (x, n, invln2_and_ln2, 1);
+  r = svmls_lane_f32 (r, n, invln2_and_ln2, 2);
+
+/* scale = 2^(n/N).  */
+  svbool_t is_special_case = svacgt_n_f32 (pg, x, d->thres);
+  svfloat32_t scale = svexpa_f32 (svreinterpret_u32_f32 (z));
+
+  /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6.  */
+  svfloat32_t p12 = svmla_f32_x (pg, C (1), C (2), r);
+  svfloat32_t p34 = svmla_f32_x (pg, C (3), C (4), r);
+  svfloat32_t r2 = svmul_f32_x (pg, r, r);
+  svfloat32_t p14 = svmla_f32_x (pg, p12, p34, r2);
+  svfloat32_t p0 = svmul_f32_x (pg, r, C (0));
+  svfloat32_t poly = svmla_f32_x (pg, p0, r2, p14);
+
+  if (__glibc_unlikely (svptest_any (pg, is_special_case)))
+    return special_case (x, svmla_f32_x (pg, scale, scale, poly),
+			 is_special_case);
+
+  return svmla_f32_x (pg, scale, scale, poly);
+}
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index c5f6fcd7c4..3b6b1e343d 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -24,5 +24,6 @@ 
 #define VEC_TYPE float64x2_t
 
 VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos)
+VPCS_VECTOR_WRAPPER (exp_advsimd, _ZGVnN2v_exp)
 VPCS_VECTOR_WRAPPER (log_advsimd, _ZGVnN2v_log)
 VPCS_VECTOR_WRAPPER (sin_advsimd, _ZGVnN2v_sin)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index d5e2ec6dc5..d7ac47ca22 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -33,5 +33,6 @@ 
   }
 
 SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos)
+SVE_VECTOR_WRAPPER (exp_sve, _ZGVsMxv_exp)
 SVE_VECTOR_WRAPPER (log_sve, _ZGVsMxv_log)
 SVE_VECTOR_WRAPPER (sin_sve, _ZGVsMxv_sin)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index c240738837..d4a9ac7154 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -24,5 +24,6 @@ 
 #define VEC_TYPE float32x4_t
 
 VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf)
+VPCS_VECTOR_WRAPPER (expf_advsimd, _ZGVnN4v_expf)
 VPCS_VECTOR_WRAPPER (logf_advsimd, _ZGVnN4v_logf)
 VPCS_VECTOR_WRAPPER (sinf_advsimd, _ZGVnN4v_sinf)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index 5a06b75857..d44033eab0 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -33,5 +33,6 @@ 
   }
 
 SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf)
+SVE_VECTOR_WRAPPER (expf_sve, _ZGVsMxv_expf)
 SVE_VECTOR_WRAPPER (logf_sve, _ZGVsMxv_logf)
 SVE_VECTOR_WRAPPER (sinf_sve, _ZGVsMxv_sinf)
diff --git a/sysdeps/aarch64/fpu/v_exp_data.c b/sysdeps/aarch64/fpu/v_exp_data.c
new file mode 100644
index 0000000000..b857062f78
--- /dev/null
+++ b/sysdeps/aarch64/fpu/v_exp_data.c
@@ -0,0 +1,66 @@ 
+/* Scale values for vector exp and exp2
+
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "vecmath_config.h"
+
+const uint64_t __v_exp_data[] = {
+  0x3ff0000000000000, 0x3feff63da9fb3335, 0x3fefec9a3e778061,
+  0x3fefe315e86e7f85, 0x3fefd9b0d3158574, 0x3fefd06b29ddf6de,
+  0x3fefc74518759bc8, 0x3fefbe3ecac6f383, 0x3fefb5586cf9890f,
+  0x3fefac922b7247f7, 0x3fefa3ec32d3d1a2, 0x3fef9b66affed31b,
+  0x3fef9301d0125b51, 0x3fef8abdc06c31cc, 0x3fef829aaea92de0,
+  0x3fef7a98c8a58e51, 0x3fef72b83c7d517b, 0x3fef6af9388c8dea,
+  0x3fef635beb6fcb75, 0x3fef5be084045cd4, 0x3fef54873168b9aa,
+  0x3fef4d5022fcd91d, 0x3fef463b88628cd6, 0x3fef3f49917ddc96,
+  0x3fef387a6e756238, 0x3fef31ce4fb2a63f, 0x3fef2b4565e27cdd,
+  0x3fef24dfe1f56381, 0x3fef1e9df51fdee1, 0x3fef187fd0dad990,
+  0x3fef1285a6e4030b, 0x3fef0cafa93e2f56, 0x3fef06fe0a31b715,
+  0x3fef0170fc4cd831, 0x3feefc08b26416ff, 0x3feef6c55f929ff1,
+  0x3feef1a7373aa9cb, 0x3feeecae6d05d866, 0x3feee7db34e59ff7,
+  0x3feee32dc313a8e5, 0x3feedea64c123422, 0x3feeda4504ac801c,
+  0x3feed60a21f72e2a, 0x3feed1f5d950a897, 0x3feece086061892d,
+  0x3feeca41ed1d0057, 0x3feec6a2b5c13cd0, 0x3feec32af0d7d3de,
+  0x3feebfdad5362a27, 0x3feebcb299fddd0d, 0x3feeb9b2769d2ca7,
+  0x3feeb6daa2cf6642, 0x3feeb42b569d4f82, 0x3feeb1a4ca5d920f,
+  0x3feeaf4736b527da, 0x3feead12d497c7fd, 0x3feeab07dd485429,
+  0x3feea9268a5946b7, 0x3feea76f15ad2148, 0x3feea5e1b976dc09,
+  0x3feea47eb03a5585, 0x3feea34634ccc320, 0x3feea23882552225,
+  0x3feea155d44ca973, 0x3feea09e667f3bcd, 0x3feea012750bdabf,
+  0x3fee9fb23c651a2f, 0x3fee9f7df9519484, 0x3fee9f75e8ec5f74,
+  0x3fee9f9a48a58174, 0x3fee9feb564267c9, 0x3feea0694fde5d3f,
+  0x3feea11473eb0187, 0x3feea1ed0130c132, 0x3feea2f336cf4e62,
+  0x3feea427543e1a12, 0x3feea589994cce13, 0x3feea71a4623c7ad,
+  0x3feea8d99b4492ed, 0x3feeaac7d98a6699, 0x3feeace5422aa0db,
+  0x3feeaf3216b5448c, 0x3feeb1ae99157736, 0x3feeb45b0b91ffc6,
+  0x3feeb737b0cdc5e5, 0x3feeba44cbc8520f, 0x3feebd829fde4e50,
+  0x3feec0f170ca07ba, 0x3feec49182a3f090, 0x3feec86319e32323,
+  0x3feecc667b5de565, 0x3feed09bec4a2d33, 0x3feed503b23e255d,
+  0x3feed99e1330b358, 0x3feede6b5579fdbf, 0x3feee36bbfd3f37a,
+  0x3feee89f995ad3ad, 0x3feeee07298db666, 0x3feef3a2b84f15fb,
+  0x3feef9728de5593a, 0x3feeff76f2fb5e47, 0x3fef05b030a1064a,
+  0x3fef0c1e904bc1d2, 0x3fef12c25bd71e09, 0x3fef199bdd85529c,
+  0x3fef20ab5fffd07a, 0x3fef27f12e57d14b, 0x3fef2f6d9406e7b5,
+  0x3fef3720dcef9069, 0x3fef3f0b555dc3fa, 0x3fef472d4a07897c,
+  0x3fef4f87080d89f2, 0x3fef5818dcfba487, 0x3fef60e316c98398,
+  0x3fef69e603db3285, 0x3fef7321f301b460, 0x3fef7c97337b9b5f,
+  0x3fef864614f5a129, 0x3fef902ee78b3ff6, 0x3fef9a51fbc74c83,
+  0x3fefa4afa2a490da, 0x3fefaf482d8e67f1, 0x3fefba1bee615a27,
+  0x3fefc52b376bba97, 0x3fefd0765b6e4540, 0x3fefdbfdad9cbe14,
+  0x3fefe7c1819e90d8, 0x3feff3c22b8f71f1,
+};
diff --git a/sysdeps/aarch64/fpu/vecmath_config.h b/sysdeps/aarch64/fpu/vecmath_config.h
index 0920658a0c..e7d30b477f 100644
--- a/sysdeps/aarch64/fpu/vecmath_config.h
+++ b/sysdeps/aarch64/fpu/vecmath_config.h
@@ -45,4 +45,7 @@  extern const struct v_log_data
   double invc[1 << V_LOG_TABLE_BITS];
   double logc[1 << V_LOG_TABLE_BITS];
 } __v_log_data attribute_hidden;
+
+#define V_EXP_TABLE_BITS 7
+extern const uint64_t __v_exp_data[1 << V_EXP_TABLE_BITS] attribute_hidden;
 #endif
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index 7bdbf4614d..6b25ed43e0 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -1005,10 +1005,18 @@  double: 1
 float: 1
 ldouble: 2
 
+Function: "exp_advsimd":
+double: 1
+float: 1
+
 Function: "exp_downward":
 double: 1
 float: 1
 
+Function: "exp_sve":
+double: 1
+float: 1
+
 Function: "exp_towardzero":
 double: 1
 float: 1
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index 1922191886..ae46ef8c34 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -1,11 +1,15 @@ 
 GLIBC_2.38 _ZGVnN2v_cos F
+GLIBC_2.38 _ZGVnN2v_exp F
 GLIBC_2.38 _ZGVnN2v_log F
 GLIBC_2.38 _ZGVnN2v_sin F
 GLIBC_2.38 _ZGVnN4v_cosf F
+GLIBC_2.38 _ZGVnN4v_expf F
 GLIBC_2.38 _ZGVnN4v_logf F
 GLIBC_2.38 _ZGVnN4v_sinf F
 GLIBC_2.38 _ZGVsMxv_cos F
 GLIBC_2.38 _ZGVsMxv_cosf F
+GLIBC_2.38 _ZGVsMxv_exp F
+GLIBC_2.38 _ZGVsMxv_expf F
 GLIBC_2.38 _ZGVsMxv_log F
 GLIBC_2.38 _ZGVsMxv_logf F
 GLIBC_2.38 _ZGVsMxv_sin F