diff mbox series

[v2,2/4] aarch64: Add vector implementations of sin routines

Message ID 20230619145634.18801-2-Joe.Ramsay@arm.com
State New
Headers show
Series [v2,1/4] aarch64: Add vector implementations of cos routines | expand

Commit Message

Joe Ramsay June 19, 2023, 2:56 p.m. UTC
Optimised implementations for single and double precision, Advanced
SIMD and SVE, copied from Arm Optimized Routines. Also allow
certain tests to be skipped for mathvec routines, for example both
AdvSIMD algorithms discard the sign of 0.

As previously, data tables are marked volatile or writable to prevent
overly aggressive constant inlining. Special-case handlers are marked
NOINLINE to avoid incurring the penalty of switching call standards
unnecessarily.
---
Changes to v1:
 * Use __glibc_unlikely
 * Remove polynomial helper macros, as they do not improve code for small polynomials
 * Explain data table storage

 math/auto-libm-test-out-sin                   |   4 +-
 math/gen-libm-test.py                         |   3 +-
 sysdeps/aarch64/fpu/Makefile                  |   8 +-
 sysdeps/aarch64/fpu/Versions                  |   4 +
 sysdeps/aarch64/fpu/bits/math-vector.h        |   6 ++
 sysdeps/aarch64/fpu/sin_advsimd.c             | 100 ++++++++++++++++++
 sysdeps/aarch64/fpu/sin_sve.c                 |  96 +++++++++++++++++
 sysdeps/aarch64/fpu/sinf_advsimd.c            |  93 ++++++++++++++++
 sysdeps/aarch64/fpu/sinf_sve.c                |  94 ++++++++++++++++
 .../fpu/test-double-advsimd-wrappers.c        |   1 +
 .../aarch64/fpu/test-double-sve-wrappers.c    |   1 +
 .../aarch64/fpu/test-float-advsimd-wrappers.c |   1 +
 sysdeps/aarch64/fpu/test-float-sve-wrappers.c |   1 +
 sysdeps/aarch64/libm-test-ulps                |   8 ++
 .../unix/sysv/linux/aarch64/libmvec.abilist   |   4 +
 15 files changed, 417 insertions(+), 7 deletions(-)
 create mode 100644 sysdeps/aarch64/fpu/sin_advsimd.c
 create mode 100644 sysdeps/aarch64/fpu/sin_sve.c
 create mode 100644 sysdeps/aarch64/fpu/sinf_advsimd.c
 create mode 100644 sysdeps/aarch64/fpu/sinf_sve.c
diff mbox series

Patch

diff --git a/math/auto-libm-test-out-sin b/math/auto-libm-test-out-sin
index f1d21b179c..27ccaff1aa 100644
--- a/math/auto-libm-test-out-sin
+++ b/math/auto-libm-test-out-sin
@@ -25,11 +25,11 @@  sin 0
 = sin upward ibm128 0x0p+0 : 0x0p+0 : inexact-ok
 sin -0
 = sin downward binary32 -0x0p+0 : -0x0p+0 : inexact-ok
-= sin tonearest binary32 -0x0p+0 : -0x0p+0 : inexact-ok
+= sin tonearest binary32 -0x0p+0 : -0x0p+0 : inexact-ok no-mathvec
 = sin towardzero binary32 -0x0p+0 : -0x0p+0 : inexact-ok
 = sin upward binary32 -0x0p+0 : -0x0p+0 : inexact-ok
 = sin downward binary64 -0x0p+0 : -0x0p+0 : inexact-ok
-= sin tonearest binary64 -0x0p+0 : -0x0p+0 : inexact-ok
+= sin tonearest binary64 -0x0p+0 : -0x0p+0 : inexact-ok no-mathvec
 = sin towardzero binary64 -0x0p+0 : -0x0p+0 : inexact-ok
 = sin upward binary64 -0x0p+0 : -0x0p+0 : inexact-ok
 = sin downward intel96 -0x0p+0 : -0x0p+0 : inexact-ok
diff --git a/math/gen-libm-test.py b/math/gen-libm-test.py
index 6ae78beb01..a573c3b8cb 100755
--- a/math/gen-libm-test.py
+++ b/math/gen-libm-test.py
@@ -93,7 +93,8 @@  BEAUTIFY_MAP = {'minus_zero': '-0',
 
 # Flags in auto-libm-test-out that map directly to C flags.
 FLAGS_SIMPLE = {'ignore-zero-inf-sign': 'IGNORE_ZERO_INF_SIGN',
-                'xfail': 'XFAIL_TEST'}
+                'xfail': 'XFAIL_TEST',
+                'no-mathvec': 'NO_TEST_MATHVEC'}
 
 # Exceptions in auto-libm-test-out, and their corresponding C flags
 # for being required, OK or required to be absent.
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index 850cfb9012..b3285542ea 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -1,10 +1,10 @@ 
-float-advsimd-funcs = cos
+float-advsimd-funcs = cos sin
 
-double-advsimd-funcs = cos
+double-advsimd-funcs = cos sin
 
-float-sve-funcs = cos
+float-sve-funcs = cos sin
 
-double-sve-funcs = cos
+double-sve-funcs = cos sin
 
 ifeq ($(subdir),mathvec)
 libmvec-support = $(addsuffix f_advsimd,$(float-advsimd-funcs)) \
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index 5222a6f180..d26b3968a9 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -1,8 +1,12 @@ 
 libmvec {
   GLIBC_2.38 {
     _ZGVnN2v_cos;
+    _ZGVnN2v_sin;
     _ZGVnN4v_cosf;
+    _ZGVnN4v_sinf;
     _ZGVsMxv_cos;
     _ZGVsMxv_cosf;
+    _ZGVsMxv_sin;
+    _ZGVsMxv_sinf;
   }
 }
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index a2f2277591..ad9c9945e8 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -50,7 +50,10 @@  typedef __SVBool_t __sv_bool_t;
 #  define __vpcs __attribute__ ((__aarch64_vector_pcs__))
 
 __vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
+
 __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t);
 
 #  undef __ADVSIMD_VEC_MATH_SUPPORTED
 #endif /* __ADVSIMD_VEC_MATH_SUPPORTED */
@@ -58,7 +61,10 @@  __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
 #ifdef __SVE_VEC_MATH_SUPPORTED
 
 __sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t);
+
 __sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_sin (__sv_f64_t, __sv_bool_t);
 
 #  undef __SVE_VEC_MATH_SUPPORTED
 #endif /* __SVE_VEC_MATH_SUPPORTED */
diff --git a/sysdeps/aarch64/fpu/sin_advsimd.c b/sysdeps/aarch64/fpu/sin_advsimd.c
new file mode 100644
index 0000000000..b64129b57b
--- /dev/null
+++ b/sysdeps/aarch64/fpu/sin_advsimd.c
@@ -0,0 +1,100 @@ 
+/* Double-precision vector (Advanced SIMD) sin function.
+
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "v_math.h"
+
+static const volatile struct
+{
+  float64x2_t poly[7];
+  float64x2_t range_val, inv_pi, shift, pi_1, pi_2, pi_3;
+} data = {
+  /* Worst-case error is 2.8 ulp in [-pi/2, pi/2].  */
+  .poly = { V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
+	    V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
+	    V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
+	    V2 (-0x1.9e9540300a1p-41) },
+
+  .range_val = V2 (0x1p23),
+  .inv_pi = V2 (0x1.45f306dc9c883p-2),
+  .pi_1 = V2 (0x1.921fb54442d18p+1),
+  .pi_2 = V2 (0x1.1a62633145c06p-53),
+  .pi_3 = V2 (0x1.c1cd129024e09p-106),
+  .shift = V2 (0x1.8p52),
+};
+
+#if WANT_SIMD_EXCEPT
+# define TinyBound v_u64 (0x3000000000000000) /* asuint64 (0x1p-255).  */
+# define Thresh v_u64 (0x1160000000000000)    /* RangeVal - TinyBound.  */
+#endif
+
+#define C(i) data.poly[i]
+
+static float64x2_t VPCS_ATTR NOINLINE
+special_case (float64x2_t x, float64x2_t y, uint64x2_t odd, uint64x2_t cmp)
+{
+  y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+  return v_call_f64 (sin, x, y, cmp);
+}
+
+float64x2_t VPCS_ATTR V_NAME_D1 (sin) (float64x2_t x)
+{
+  float64x2_t n, r, r2, r3, r4, y, t1, t2, t3;
+  uint64x2_t odd, cmp;
+
+#if WANT_SIMD_EXCEPT
+  /* Detect |x| <= TinyBound or |x| >= RangeVal. If fenv exceptions are to be
+     triggered correctly, set any special lanes to 1 (which is neutral w.r.t.
+     fenv). These lanes will be fixed by special-case handler later.  */
+  uint64x2_t ir = vreinterpretq_u64_f64 (vabsq_f64 (x));
+  cmp = vcgeq_u64 (vsubq_u64 (ir, TinyBound), Thresh);
+  r = vbslq_f64 (cmp, vreinterpretq_f64_u64 (cmp), x);
+#else
+  r = x;
+  cmp = vcageq_f64 (data.range_val, x);
+  cmp = vceqzq_u64 (cmp); /* cmp = ~cmp.  */
+#endif
+
+  /* n = rint(|x|/pi).  */
+  n = vfmaq_f64 (data.shift, data.inv_pi, r);
+  odd = vshlq_n_u64 (vreinterpretq_u64_f64 (n), 63);
+  n = vsubq_f64 (n, data.shift);
+
+  /* r = |x| - n*pi  (range reduction into -pi/2 .. pi/2).  */
+  r = vfmsq_f64 (r, data.pi_1, n);
+  r = vfmsq_f64 (r, data.pi_2, n);
+  r = vfmsq_f64 (r, data.pi_3, n);
+
+  /* sin(r) poly approx.  */
+  r2 = vmulq_f64 (r, r);
+  r3 = vmulq_f64 (r2, r);
+  r4 = vmulq_f64 (r2, r2);
+
+  t1 = vfmaq_f64 (C (4), C (5), r2);
+  t2 = vfmaq_f64 (C (2), C (3), r2);
+  t3 = vfmaq_f64 (C (0), C (1), r2);
+
+  y = vfmaq_f64 (t1, C (6), r4);
+  y = vfmaq_f64 (t2, y, r4);
+  y = vfmaq_f64 (t3, y, r4);
+  y = vfmaq_f64 (r, y, r3);
+
+  if (__glibc_unlikely (v_any_u64 (cmp)))
+    return special_case (x, y, odd, cmp);
+  return vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), odd));
+}
diff --git a/sysdeps/aarch64/fpu/sin_sve.c b/sysdeps/aarch64/fpu/sin_sve.c
new file mode 100644
index 0000000000..482fc326ba
--- /dev/null
+++ b/sysdeps/aarch64/fpu/sin_sve.c
@@ -0,0 +1,96 @@ 
+/* Double-precision vector (SVE) sin function.
+
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "sv_math.h"
+
+static struct
+{
+  double inv_pi, half_pi, inv_pi_over_2, pi_over_2_1, pi_over_2_2, pi_over_2_3,
+      shift;
+} data = {
+  /* Polynomial coefficients are hard-wired in the FTMAD instruction.  */
+  .inv_pi = 0x1.45f306dc9c883p-2,
+  .half_pi = 0x1.921fb54442d18p+0,
+  .inv_pi_over_2 = 0x1.45f306dc9c882p-1,
+  .pi_over_2_1 = 0x1.921fb50000000p+0,
+  .pi_over_2_2 = 0x1.110b460000000p-26,
+  .pi_over_2_3 = 0x1.1a62633145c07p-54,
+  .shift = 0x1.8p52
+};
+
+#define RangeVal 0x4160000000000000 /* asuint64 (0x1p23).  */
+
+static svfloat64_t NOINLINE
+special_case (svfloat64_t x, svfloat64_t y, svbool_t cmp)
+{
+  return sv_call_f64 (sin, x, y, cmp);
+}
+
+/* A fast SVE implementation of sin based on trigonometric
+   instructions (FTMAD, FTSSEL, FTSMUL).
+   Maximum observed error in 2.52 ULP:
+   SV_NAME_D1 (sin)(0x1.2d2b00df69661p+19) got 0x1.10ace8f3e786bp-40
+					  want 0x1.10ace8f3e7868p-40.  */
+svfloat64_t SV_NAME_D1 (sin) (svfloat64_t x, const svbool_t pg)
+{
+  svfloat64_t r = svabs_f64_x (pg, x);
+  svuint64_t sign
+      = sveor_u64_x (pg, svreinterpret_u64_f64 (x), svreinterpret_u64_f64 (r));
+  svbool_t cmp = svcmpge_n_u64 (pg, svreinterpret_u64_f64 (r), RangeVal);
+
+  /* Load first two pio2-related constants to one vector.  */
+  svfloat64_t invpio2_and_pio2_1
+      = svld1rq_f64 (svptrue_b64 (), &data.inv_pi_over_2);
+
+  /* n = rint(|x|/(pi/2)).  */
+  svfloat64_t q
+      = svmla_lane_f64 (sv_f64 (data.shift), r, invpio2_and_pio2_1, 0);
+  svfloat64_t n = svsub_n_f64_x (pg, q, data.shift);
+
+  /* r = |x| - n*(pi/2)  (range reduction into -pi/4 .. pi/4).  */
+  r = svmls_lane_f64 (r, n, invpio2_and_pio2_1, 1);
+  r = svmls_n_f64_x (pg, r, n, data.pi_over_2_2);
+  r = svmls_n_f64_x (pg, r, n, data.pi_over_2_3);
+
+  /* Final multiplicative factor: 1.0 or x depending on bit #0 of q.  */
+  svfloat64_t f = svtssel_f64 (r, svreinterpret_u64_f64 (q));
+
+  /* sin(r) poly approx.  */
+  svfloat64_t r2 = svtsmul_f64 (r, svreinterpret_u64_f64 (q));
+  svfloat64_t y = sv_f64 (0.0);
+  y = svtmad_f64 (y, r2, 7);
+  y = svtmad_f64 (y, r2, 6);
+  y = svtmad_f64 (y, r2, 5);
+  y = svtmad_f64 (y, r2, 4);
+  y = svtmad_f64 (y, r2, 3);
+  y = svtmad_f64 (y, r2, 2);
+  y = svtmad_f64 (y, r2, 1);
+  y = svtmad_f64 (y, r2, 0);
+
+  /* Apply factor.  */
+  y = svmul_f64_x (pg, f, y);
+
+  /* sign = y^sign.  */
+  y = svreinterpret_f64_u64 (
+      sveor_u64_x (pg, svreinterpret_u64_f64 (y), sign));
+
+  if (__glibc_unlikely (svptest_any (pg, cmp)))
+    return special_case (x, y, cmp);
+  return y;
+}
diff --git a/sysdeps/aarch64/fpu/sinf_advsimd.c b/sysdeps/aarch64/fpu/sinf_advsimd.c
new file mode 100644
index 0000000000..df45d99737
--- /dev/null
+++ b/sysdeps/aarch64/fpu/sinf_advsimd.c
@@ -0,0 +1,93 @@ 
+/* Single-precision vector (Advanced SIMD) sin function.
+
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "v_math.h"
+
+static const volatile struct
+{
+  float32x4_t poly[4];
+  float32x4_t range_val, inv_pi, shift, pi_1, pi_2, pi_3;
+} data = {
+  /* 1.886 ulp error.  */
+  .poly = { V4 (-0x1.555548p-3f), V4 (0x1.110df4p-7f), V4 (-0x1.9f42eap-13f),
+	    V4 (0x1.5b2e76p-19f) },
+
+  .pi_1 = V4 (0x1.921fb6p+1f),
+  .pi_2 = V4 (-0x1.777a5cp-24f),
+  .pi_3 = V4 (-0x1.ee59dap-49f),
+
+  .inv_pi = V4 (0x1.45f306p-2f),
+  .shift = V4 (0x1.8p+23f),
+  .range_val = V4 (0x1p20f)
+};
+
+#if WANT_SIMD_EXCEPT
+# define TinyBound v_u32 (0x21000000) /* asuint32(0x1p-61f).  */
+# define Thresh v_u32 (0x28800000)    /* RangeVal - TinyBound.  */
+#endif
+
+#define C(i) data.poly[i]
+
+static float32x4_t VPCS_ATTR NOINLINE
+special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp)
+{
+  /* Fall back to scalar code.  */
+  y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+  return v_call_f32 (sinf, x, y, cmp);
+}
+
+float32x4_t VPCS_ATTR V_NAME_F1 (sin) (float32x4_t x)
+{
+  float32x4_t n, r, r2, y;
+  uint32x4_t odd, cmp;
+
+#if WANT_SIMD_EXCEPT
+  uint32x4_t ir = vreinterpretq_u32_f32 (vabsq_f32 (x));
+  cmp = vcgeq_u32 (vsubq_u32 (ir, TinyBound), Thresh);
+  /* If fenv exceptions are to be triggered correctly, set any special lanes
+     to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
+     special-case handler later.  */
+  r = vbslq_f32 (cmp, vreinterpretq_f32_u32 (cmp), x);
+#else
+  r = x;
+  cmp = vcageq_f32 (data.range_val, x);
+  cmp = vceqzq_u32 (cmp); /* cmp = ~cmp.  */
+#endif
+
+  /* n = rint(|x|/pi) */
+  n = vfmaq_f32 (data.shift, data.inv_pi, r);
+  odd = vshlq_n_u32 (vreinterpretq_u32_f32 (n), 31);
+  n = vsubq_f32 (n, data.shift);
+
+  /* r = |x| - n*pi  (range reduction into -pi/2 .. pi/2) */
+  r = vfmsq_f32 (r, data.pi_1, n);
+  r = vfmsq_f32 (r, data.pi_2, n);
+  r = vfmsq_f32 (r, data.pi_3, n);
+
+  /* y = sin(r) */
+  r2 = vmulq_f32 (r, r);
+  y = vfmaq_f32 (C (2), C (3), r2);
+  y = vfmaq_f32 (C (1), y, r2);
+  y = vfmaq_f32 (C (0), y, r2);
+  y = vfmaq_f32 (r, vmulq_f32 (y, r2), r);
+
+  if (__glibc_unlikely (v_any_u32 (cmp)))
+    return special_case (x, y, odd, cmp);
+  return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), odd));
+}
diff --git a/sysdeps/aarch64/fpu/sinf_sve.c b/sysdeps/aarch64/fpu/sinf_sve.c
new file mode 100644
index 0000000000..54df1aa860
--- /dev/null
+++ b/sysdeps/aarch64/fpu/sinf_sve.c
@@ -0,0 +1,94 @@ 
+/* Single-precision vector (SVE) sin function.
+
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "sv_math.h"
+
+static struct
+{
+  float poly[4];
+  /* Pi-related values to be loaded as one quad-word and used with
+     svmla_lane_f32.  */
+  float negpi1, negpi2, negpi3, invpi;
+  float shift;
+} data = {
+  .poly = {
+    /* Non-zero coefficients from the degree 9 Taylor series expansion of
+       sin.  */
+    -0x1.555548p-3f, 0x1.110df4p-7f, -0x1.9f42eap-13f, 0x1.5b2e76p-19f
+  },
+  .negpi1 = -0x1.921fb6p+1f,
+  .negpi2 = 0x1.777a5cp-24f,
+  .negpi3 = 0x1.ee59dap-49f,
+  .invpi = 0x1.45f306p-2f,
+  .shift = 0x1.8p+23f
+};
+
+#define RangeVal 0x49800000 /* asuint32 (0x1p20f).  */
+#define C(i) sv_f32 (data.poly[i])
+
+static svfloat32_t NOINLINE
+special_case (svfloat32_t x, svfloat32_t y, svbool_t cmp)
+{
+  return sv_call_f32 (sinf, x, y, cmp);
+}
+
+/* A fast SVE implementation of sinf.
+   Maximum error: 1.89 ULPs.
+   This maximum error is achieved at multiple values in [-2^18, 2^18]
+   but one example is:
+   SV_NAME_F1 (sin)(0x1.9247a4p+0) got 0x1.fffff6p-1 want 0x1.fffffap-1.  */
+svfloat32_t SV_NAME_F1 (sin) (svfloat32_t x, const svbool_t pg)
+{
+  svfloat32_t ax = svabs_f32_x (pg, x);
+  svuint32_t sign = sveor_u32_x (pg, svreinterpret_u32_f32 (x),
+				 svreinterpret_u32_f32 (ax));
+  svbool_t cmp = svcmpge_n_u32 (pg, svreinterpret_u32_f32 (ax), RangeVal);
+
+  /* pi_vals are a quad-word of helper values - the first 3 elements contain
+     -pi in extended precision, the last contains 1 / pi.  */
+  svfloat32_t pi_vals = svld1rq_f32 (svptrue_b32 (), &data.negpi1);
+
+  /* n = rint(|x|/pi).  */
+  svfloat32_t n = svmla_lane_f32 (sv_f32 (data.shift), ax, pi_vals, 3);
+  svuint32_t odd = svlsl_n_u32_x (pg, svreinterpret_u32_f32 (n), 31);
+  n = svsub_n_f32_x (pg, n, data.shift);
+
+  /* r = |x| - n*pi  (range reduction into -pi/2 .. pi/2).  */
+  svfloat32_t r;
+  r = svmla_lane_f32 (ax, n, pi_vals, 0);
+  r = svmla_lane_f32 (r, n, pi_vals, 1);
+  r = svmla_lane_f32 (r, n, pi_vals, 2);
+
+  /* sin(r) approx using a degree 9 polynomial from the Taylor series
+     expansion. Note that only the odd terms of this are non-zero.  */
+  svfloat32_t r2 = svmul_f32_x (pg, r, r);
+  svfloat32_t y;
+  y = svmla_f32_x (pg, C (2), r2, C (3));
+  y = svmla_f32_x (pg, C (1), r2, y);
+  y = svmla_f32_x (pg, C (0), r2, y);
+  y = svmla_f32_x (pg, r, r, svmul_f32_x (pg, y, r2));
+
+  /* sign = y^sign^odd.  */
+  y = svreinterpret_f32_u32 (sveor_u32_x (pg, svreinterpret_u32_f32 (y),
+					  sveor_u32_x (pg, sign, odd)));
+
+  if (__glibc_unlikely (svptest_any (pg, cmp)))
+    return special_case (x, y, cmp);
+  return y;
+}
diff --git a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
index cb45fd3298..4af97a25a2 100644
--- a/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c
@@ -24,3 +24,4 @@ 
 #define VEC_TYPE float64x2_t
 
 VPCS_VECTOR_WRAPPER (cos_advsimd, _ZGVnN2v_cos)
+VPCS_VECTOR_WRAPPER (sin_advsimd, _ZGVnN2v_sin)
diff --git a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
index cf72ef83b7..64c790adc5 100644
--- a/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-double-sve-wrappers.c
@@ -33,3 +33,4 @@ 
   }
 
 SVE_VECTOR_WRAPPER (cos_sve, _ZGVsMxv_cos)
+SVE_VECTOR_WRAPPER (sin_sve, _ZGVsMxv_sin)
diff --git a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
index fa146862b0..50e776b952 100644
--- a/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c
@@ -24,3 +24,4 @@ 
 #define VEC_TYPE float32x4_t
 
 VPCS_VECTOR_WRAPPER (cosf_advsimd, _ZGVnN4v_cosf)
+VPCS_VECTOR_WRAPPER (sinf_advsimd, _ZGVnN4v_sinf)
diff --git a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
index bc26558c62..7355032929 100644
--- a/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
+++ b/sysdeps/aarch64/fpu/test-float-sve-wrappers.c
@@ -33,3 +33,4 @@ 
   }
 
 SVE_VECTOR_WRAPPER (cosf_sve, _ZGVsMxv_cosf)
+SVE_VECTOR_WRAPPER (sinf_sve, _ZGVsMxv_sinf)
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index 07da4ab843..4145662b2d 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -1257,11 +1257,19 @@  double: 1
 float: 1
 ldouble: 2
 
+Function: "sin_advsimd":
+double: 2
+float: 1
+
 Function: "sin_downward":
 double: 1
 float: 1
 ldouble: 3
 
+Function: "sin_sve":
+double: 2
+float: 1
+
 Function: "sin_towardzero":
 double: 1
 float: 1
diff --git a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
index 13af421af2..a4c564859c 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libmvec.abilist
@@ -1,4 +1,8 @@ 
 GLIBC_2.38 _ZGVnN2v_cos F
+GLIBC_2.38 _ZGVnN2v_sin F
 GLIBC_2.38 _ZGVnN4v_cosf F
+GLIBC_2.38 _ZGVnN4v_sinf F
 GLIBC_2.38 _ZGVsMxv_cos F
 GLIBC_2.38 _ZGVsMxv_cosf F
+GLIBC_2.38 _ZGVsMxv_sin F
+GLIBC_2.38 _ZGVsMxv_sinf F