
[v5,1/1] x86-64: Add vector acos/acosf implementation to libmvec

Message ID 20211219171809.2912282-2-skpgkp2@gmail.com
State New
Series: Add vector math function acos/acosf to libmvec

Commit Message

Sunil Pandey Dec. 19, 2021, 5:18 p.m. UTC
Implement vectorized acos/acosf for libmvec, containing SSE, AVX, AVX2 and
AVX512 versions as specified by the vector ABI.  The patch also contains
accuracy and ABI tests for vector acos/acosf, with regenerated ulps.
---
 bits/libm-simd-decl-stubs.h                   |  11 +
 math/bits/mathcalls.h                         |   2 +-
 .../unix/sysv/linux/x86_64/libmvec.abilist    |   8 +
 sysdeps/x86/fpu/bits/math-vector.h            |   4 +
 .../x86/fpu/finclude/math-vector-fortran.h    |   4 +
 sysdeps/x86_64/fpu/Makeconfig                 |   1 +
 sysdeps/x86_64/fpu/Versions                   |   4 +
 sysdeps/x86_64/fpu/libm-test-ulps             |  20 ++
 .../fpu/multiarch/ifunc-mathvec-avx512-skx.h  |  39 +++
 .../fpu/multiarch/svml_d_acos2_core-sse2.S    |  20 ++
 .../x86_64/fpu/multiarch/svml_d_acos2_core.c  |  27 ++
 .../fpu/multiarch/svml_d_acos2_core_sse4.S    | 293 +++++++++++++++++
 .../fpu/multiarch/svml_d_acos4_core-sse.S     |  20 ++
 .../x86_64/fpu/multiarch/svml_d_acos4_core.c  |  27 ++
 .../fpu/multiarch/svml_d_acos4_core_avx2.S    | 273 ++++++++++++++++
 .../fpu/multiarch/svml_d_acos8_core-avx2.S    |  20 ++
 .../x86_64/fpu/multiarch/svml_d_acos8_core.c  |  27 ++
 .../fpu/multiarch/svml_d_acos8_core_avx512.S  | 298 ++++++++++++++++++
 .../fpu/multiarch/svml_s_acosf16_core-avx2.S  |  20 ++
 .../fpu/multiarch/svml_s_acosf16_core.c       |  28 ++
 .../multiarch/svml_s_acosf16_core_avx512.S    | 262 +++++++++++++++
 .../fpu/multiarch/svml_s_acosf4_core-sse2.S   |  20 ++
 .../x86_64/fpu/multiarch/svml_s_acosf4_core.c |  28 ++
 .../fpu/multiarch/svml_s_acosf4_core_sse4.S   | 260 +++++++++++++++
 .../fpu/multiarch/svml_s_acosf8_core-sse.S    |  20 ++
 .../x86_64/fpu/multiarch/svml_s_acosf8_core.c |  28 ++
 .../fpu/multiarch/svml_s_acosf8_core_avx2.S   | 252 +++++++++++++++
 sysdeps/x86_64/fpu/svml_d_acos2_core.S        |  29 ++
 sysdeps/x86_64/fpu/svml_d_acos4_core.S        |  29 ++
 sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S    |  25 ++
 sysdeps/x86_64/fpu/svml_d_acos8_core.S        |  25 ++
 sysdeps/x86_64/fpu/svml_s_acosf16_core.S      |  25 ++
 sysdeps/x86_64/fpu/svml_s_acosf4_core.S       |  29 ++
 sysdeps/x86_64/fpu/svml_s_acosf8_core.S       |  29 ++
 sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S   |  25 ++
 .../x86_64/fpu/test-double-libmvec-acos-avx.c |   1 +
 .../fpu/test-double-libmvec-acos-avx2.c       |   1 +
 .../fpu/test-double-libmvec-acos-avx512f.c    |   1 +
 sysdeps/x86_64/fpu/test-double-libmvec-acos.c |   3 +
 .../x86_64/fpu/test-double-vlen2-wrappers.c   |   1 +
 .../fpu/test-double-vlen4-avx2-wrappers.c     |   1 +
 .../x86_64/fpu/test-double-vlen4-wrappers.c   |   1 +
 .../x86_64/fpu/test-double-vlen8-wrappers.c   |   1 +
 .../x86_64/fpu/test-float-libmvec-acosf-avx.c |   1 +
 .../fpu/test-float-libmvec-acosf-avx2.c       |   1 +
 .../fpu/test-float-libmvec-acosf-avx512f.c    |   1 +
 sysdeps/x86_64/fpu/test-float-libmvec-acosf.c |   3 +
 .../x86_64/fpu/test-float-vlen16-wrappers.c   |   1 +
 .../x86_64/fpu/test-float-vlen4-wrappers.c    |   1 +
 .../fpu/test-float-vlen8-avx2-wrappers.c      |   1 +
 .../x86_64/fpu/test-float-vlen8-wrappers.c    |   1 +
 51 files changed, 2251 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c
 create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
 create mode 100644 sysdeps/x86_64/fpu/svml_d_acos2_core.S
 create mode 100644 sysdeps/x86_64/fpu/svml_d_acos4_core.S
 create mode 100644 sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S
 create mode 100644 sysdeps/x86_64/fpu/svml_d_acos8_core.S
 create mode 100644 sysdeps/x86_64/fpu/svml_s_acosf16_core.S
 create mode 100644 sysdeps/x86_64/fpu/svml_s_acosf4_core.S
 create mode 100644 sysdeps/x86_64/fpu/svml_s_acosf8_core.S
 create mode 100644 sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S
 create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c
 create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c
 create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c
 create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-acos.c
 create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c
 create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c
 create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c
 create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-acosf.c

Comments

Noah Goldstein Dec. 19, 2021, 6:29 p.m. UTC | #1
On Sun, Dec 19, 2021 at 11:19 AM Sunil K Pandey via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> Implement vectorized acos/acosf for libmvec, containing SSE, AVX, AVX2 and
> AVX512 versions as specified by the vector ABI.  The patch also contains
> accuracy and ABI tests for vector acos/acosf, with regenerated ulps.
> ---

I have a few small comments, but I'm generally okay with a patch like this
going out in 2.35.

>  bits/libm-simd-decl-stubs.h                   |  11 +
>  math/bits/mathcalls.h                         |   2 +-
>  .../unix/sysv/linux/x86_64/libmvec.abilist    |   8 +
>  sysdeps/x86/fpu/bits/math-vector.h            |   4 +
>  .../x86/fpu/finclude/math-vector-fortran.h    |   4 +
>  sysdeps/x86_64/fpu/Makeconfig                 |   1 +
>  sysdeps/x86_64/fpu/Versions                   |   4 +
>  sysdeps/x86_64/fpu/libm-test-ulps             |  20 ++
>  .../fpu/multiarch/ifunc-mathvec-avx512-skx.h  |  39 +++
>  .../fpu/multiarch/svml_d_acos2_core-sse2.S    |  20 ++
>  .../x86_64/fpu/multiarch/svml_d_acos2_core.c  |  27 ++
>  .../fpu/multiarch/svml_d_acos2_core_sse4.S    | 293 +++++++++++++++++
>  .../fpu/multiarch/svml_d_acos4_core-sse.S     |  20 ++
>  .../x86_64/fpu/multiarch/svml_d_acos4_core.c  |  27 ++
>  .../fpu/multiarch/svml_d_acos4_core_avx2.S    | 273 ++++++++++++++++
>  .../fpu/multiarch/svml_d_acos8_core-avx2.S    |  20 ++
>  .../x86_64/fpu/multiarch/svml_d_acos8_core.c  |  27 ++
>  .../fpu/multiarch/svml_d_acos8_core_avx512.S  | 298 ++++++++++++++++++
>  .../fpu/multiarch/svml_s_acosf16_core-avx2.S  |  20 ++
>  .../fpu/multiarch/svml_s_acosf16_core.c       |  28 ++
>  .../multiarch/svml_s_acosf16_core_avx512.S    | 262 +++++++++++++++
>  .../fpu/multiarch/svml_s_acosf4_core-sse2.S   |  20 ++
>  .../x86_64/fpu/multiarch/svml_s_acosf4_core.c |  28 ++
>  .../fpu/multiarch/svml_s_acosf4_core_sse4.S   | 260 +++++++++++++++
>  .../fpu/multiarch/svml_s_acosf8_core-sse.S    |  20 ++
>  .../x86_64/fpu/multiarch/svml_s_acosf8_core.c |  28 ++
>  .../fpu/multiarch/svml_s_acosf8_core_avx2.S   | 252 +++++++++++++++
>  sysdeps/x86_64/fpu/svml_d_acos2_core.S        |  29 ++
>  sysdeps/x86_64/fpu/svml_d_acos4_core.S        |  29 ++
>  sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S    |  25 ++
>  sysdeps/x86_64/fpu/svml_d_acos8_core.S        |  25 ++
>  sysdeps/x86_64/fpu/svml_s_acosf16_core.S      |  25 ++
>  sysdeps/x86_64/fpu/svml_s_acosf4_core.S       |  29 ++
>  sysdeps/x86_64/fpu/svml_s_acosf8_core.S       |  29 ++
>  sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S   |  25 ++
>  .../x86_64/fpu/test-double-libmvec-acos-avx.c |   1 +
>  .../fpu/test-double-libmvec-acos-avx2.c       |   1 +
>  .../fpu/test-double-libmvec-acos-avx512f.c    |   1 +
>  sysdeps/x86_64/fpu/test-double-libmvec-acos.c |   3 +
>  .../x86_64/fpu/test-double-vlen2-wrappers.c   |   1 +
>  .../fpu/test-double-vlen4-avx2-wrappers.c     |   1 +
>  .../x86_64/fpu/test-double-vlen4-wrappers.c   |   1 +
>  .../x86_64/fpu/test-double-vlen8-wrappers.c   |   1 +
>  .../x86_64/fpu/test-float-libmvec-acosf-avx.c |   1 +
>  .../fpu/test-float-libmvec-acosf-avx2.c       |   1 +
>  .../fpu/test-float-libmvec-acosf-avx512f.c    |   1 +
>  sysdeps/x86_64/fpu/test-float-libmvec-acosf.c |   3 +
>  .../x86_64/fpu/test-float-vlen16-wrappers.c   |   1 +
>  .../x86_64/fpu/test-float-vlen4-wrappers.c    |   1 +
>  .../fpu/test-float-vlen8-avx2-wrappers.c      |   1 +
>  .../x86_64/fpu/test-float-vlen8-wrappers.c    |   1 +
>  51 files changed, 2251 insertions(+), 1 deletion(-)
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c
>  create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
>  create mode 100644 sysdeps/x86_64/fpu/svml_d_acos2_core.S
>  create mode 100644 sysdeps/x86_64/fpu/svml_d_acos4_core.S
>  create mode 100644 sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S
>  create mode 100644 sysdeps/x86_64/fpu/svml_d_acos8_core.S
>  create mode 100644 sysdeps/x86_64/fpu/svml_s_acosf16_core.S
>  create mode 100644 sysdeps/x86_64/fpu/svml_s_acosf4_core.S
>  create mode 100644 sysdeps/x86_64/fpu/svml_s_acosf8_core.S
>  create mode 100644 sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S
>  create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c
>  create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c
>  create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c
>  create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-acos.c
>  create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c
>  create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c
>  create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c
>  create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-acosf.c
>
> diff --git a/bits/libm-simd-decl-stubs.h b/bits/libm-simd-decl-stubs.h
> index b80ff332a0..2ccdd1fc53 100644
> --- a/bits/libm-simd-decl-stubs.h
> +++ b/bits/libm-simd-decl-stubs.h
> @@ -98,4 +98,15 @@
>  #define __DECL_SIMD_powf32x
>  #define __DECL_SIMD_powf64x
>  #define __DECL_SIMD_powf128x
> +
> +#define __DECL_SIMD_acos
> +#define __DECL_SIMD_acosf
> +#define __DECL_SIMD_acosl
> +#define __DECL_SIMD_acosf16
> +#define __DECL_SIMD_acosf32
> +#define __DECL_SIMD_acosf64
> +#define __DECL_SIMD_acosf128
> +#define __DECL_SIMD_acosf32x
> +#define __DECL_SIMD_acosf64x
> +#define __DECL_SIMD_acosf128x
>  #endif
> diff --git a/math/bits/mathcalls.h b/math/bits/mathcalls.h
> index da4cf4e10c..2cc6654208 100644
> --- a/math/bits/mathcalls.h
> +++ b/math/bits/mathcalls.h
> @@ -50,7 +50,7 @@
>  /* Trigonometric functions.  */
>
>  /* Arc cosine of X.  */
> -__MATHCALL (acos,, (_Mdouble_ __x));
> +__MATHCALL_VEC (acos,, (_Mdouble_ __x));
>  /* Arc sine of X.  */
>  __MATHCALL (asin,, (_Mdouble_ __x));
>  /* Arc tangent of X.  */
> diff --git a/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist
> index 363d4ace1e..b37b55777e 100644
> --- a/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist
> +++ b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist
> @@ -46,3 +46,11 @@ GLIBC_2.22 _ZGVeN8v_log F
>  GLIBC_2.22 _ZGVeN8v_sin F
>  GLIBC_2.22 _ZGVeN8vv_pow F
>  GLIBC_2.22 _ZGVeN8vvv_sincos F
> +GLIBC_2.35 _ZGVbN2v_acos F
> +GLIBC_2.35 _ZGVbN4v_acosf F
> +GLIBC_2.35 _ZGVcN4v_acos F
> +GLIBC_2.35 _ZGVcN8v_acosf F
> +GLIBC_2.35 _ZGVdN4v_acos F
> +GLIBC_2.35 _ZGVdN8v_acosf F
> +GLIBC_2.35 _ZGVeN16v_acosf F
> +GLIBC_2.35 _ZGVeN8v_acos F
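
As a reading aid (not part of the patch): the new symbol names follow the
x86_64 vector function ABI mangling, "_ZGV", then the ISA class (b = SSE,
c = AVX, d = AVX2, e = AVX512), "N" for the unmasked variant, the vector
length, and "v" for a vector argument.  In C terms the exported entry points
correspond roughly to the following prototypes (illustrative only; the patch
never declares them in a header, and callers normally reach them through
compiler auto-vectorization):

  #include <immintrin.h>

  __m128d _ZGVbN2v_acos (__m128d x);    /* SSE,    2 doubles  */
  __m256d _ZGVcN4v_acos (__m256d x);    /* AVX,    4 doubles  */
  __m256d _ZGVdN4v_acos (__m256d x);    /* AVX2,   4 doubles  */
  __m512d _ZGVeN8v_acos (__m512d x);    /* AVX512, 8 doubles  */
  __m128  _ZGVbN4v_acosf (__m128 x);    /* SSE,    4 floats   */
  __m256  _ZGVcN8v_acosf (__m256 x);    /* AVX,    8 floats   */
  __m256  _ZGVdN8v_acosf (__m256 x);    /* AVX2,   8 floats   */
  __m512  _ZGVeN16v_acosf (__m512 x);   /* AVX512, 16 floats  */
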
> diff --git a/sysdeps/x86/fpu/bits/math-vector.h b/sysdeps/x86/fpu/bits/math-vector.h
> index dc0bfb3705..dabb74cbb9 100644
> --- a/sysdeps/x86/fpu/bits/math-vector.h
> +++ b/sysdeps/x86/fpu/bits/math-vector.h
> @@ -58,6 +58,10 @@
>  #  define __DECL_SIMD_pow __DECL_SIMD_x86_64
>  #  undef __DECL_SIMD_powf
>  #  define __DECL_SIMD_powf __DECL_SIMD_x86_64
> +#  undef __DECL_SIMD_acos
> +#  define __DECL_SIMD_acos __DECL_SIMD_x86_64
> +#  undef __DECL_SIMD_acosf
> +#  define __DECL_SIMD_acosf __DECL_SIMD_x86_64
>
>  # endif
>  #endif
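
A usage sketch (my own, assuming the usual GCC behaviour once these
__DECL_SIMD_* declarations are in place): with math.h marking acos/acosf
with the simd attribute, a plain scalar loop can be vectorized straight
into the new _ZGV* entry points, e.g.

  #include <math.h>

  void
  acos_array (double *restrict out, const double *restrict in, int n)
  {
    /* Built with something like -O3 -fopenmp-simd -mavx2 and linked with
       -lmvec -lm, this loop is expected to call _ZGVdN4v_acos instead of
       the scalar acos.  */
  #pragma omp simd
    for (int i = 0; i < n; i++)
      out[i] = acos (in[i]);
  }
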
> diff --git a/sysdeps/x86/fpu/finclude/math-vector-fortran.h b/sysdeps/x86/fpu/finclude/math-vector-fortran.h
> index 311bb4e391..4bcbd1fbce 100644
> --- a/sysdeps/x86/fpu/finclude/math-vector-fortran.h
> +++ b/sysdeps/x86/fpu/finclude/math-vector-fortran.h
> @@ -28,6 +28,8 @@
>  !GCC$ builtin (expf) attributes simd (notinbranch) if('x86_64')
>  !GCC$ builtin (pow) attributes simd (notinbranch) if('x86_64')
>  !GCC$ builtin (powf) attributes simd (notinbranch) if('x86_64')
> +!GCC$ builtin (acos) attributes simd (notinbranch) if('x86_64')
> +!GCC$ builtin (acosf) attributes simd (notinbranch) if('x86_64')
>
>  !GCC$ builtin (cos) attributes simd (notinbranch) if('x32')
>  !GCC$ builtin (cosf) attributes simd (notinbranch) if('x32')
> @@ -41,3 +43,5 @@
>  !GCC$ builtin (expf) attributes simd (notinbranch) if('x32')
>  !GCC$ builtin (pow) attributes simd (notinbranch) if('x32')
>  !GCC$ builtin (powf) attributes simd (notinbranch) if('x32')
> +!GCC$ builtin (acos) attributes simd (notinbranch) if('x32')
> +!GCC$ builtin (acosf) attributes simd (notinbranch) if('x32')
> diff --git a/sysdeps/x86_64/fpu/Makeconfig b/sysdeps/x86_64/fpu/Makeconfig
> index b0e3bf7887..7acf1f306c 100644
> --- a/sysdeps/x86_64/fpu/Makeconfig
> +++ b/sysdeps/x86_64/fpu/Makeconfig
> @@ -22,6 +22,7 @@ postclean-generated += libmvec.mk
>
>  # Define for both math and mathvec directories.
>  libmvec-funcs = \
> +  acos \
>    cos \
>    exp \
>    log \
> diff --git a/sysdeps/x86_64/fpu/Versions b/sysdeps/x86_64/fpu/Versions
> index 08132045d6..2985fe7ca7 100644
> --- a/sysdeps/x86_64/fpu/Versions
> +++ b/sysdeps/x86_64/fpu/Versions
> @@ -13,4 +13,8 @@ libmvec {
>      _ZGVbN4vv_powf; _ZGVcN8vv_powf; _ZGVdN8vv_powf; _ZGVeN16vv_powf;
>      _ZGVbN4vvv_sincosf; _ZGVcN8vvv_sincosf; _ZGVdN8vvv_sincosf; _ZGVeN16vvv_sincosf;
>    }
> +  GLIBC_2.35 {
> +    _ZGVbN2v_acos; _ZGVcN4v_acos; _ZGVdN4v_acos; _ZGVeN8v_acos;
> +    _ZGVbN4v_acosf; _ZGVcN8v_acosf; _ZGVdN8v_acosf; _ZGVeN16v_acosf;
> +  }
>  }
> diff --git a/sysdeps/x86_64/fpu/libm-test-ulps b/sysdeps/x86_64/fpu/libm-test-ulps
> index 312575f933..85a568ed29 100644
> --- a/sysdeps/x86_64/fpu/libm-test-ulps
> +++ b/sysdeps/x86_64/fpu/libm-test-ulps
> @@ -25,6 +25,26 @@ float: 1
>  float128: 1
>  ldouble: 2
>
> +Function: "acos_vlen16":
> +float: 1
> +
> +Function: "acos_vlen2":
> +double: 1
> +
> +Function: "acos_vlen4":
> +double: 1
> +float: 2
> +
> +Function: "acos_vlen4_avx2":
> +double: 1
> +
> +Function: "acos_vlen8":
> +double: 1
> +float: 2
> +
> +Function: "acos_vlen8_avx2":
> +float: 1
> +
>  Function: "acosh":
>  double: 2
>  float: 2
> diff --git a/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h b/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h
> new file mode 100644
> index 0000000000..3aed563dde
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h
> @@ -0,0 +1,39 @@
> +/* Common definition for libmathvec ifunc selections optimized with
> +   AVX512.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <init-arch.h>
> +
> +#undef PASTER2
> +#define PASTER2(x,y)   x##_##y
> +
> +extern void REDIRECT_NAME (void);
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_wrapper) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (skx) attribute_hidden;
> +
> +static inline void *
> +IFUNC_SELECTOR (void)
> +{
> +  const struct cpu_features* cpu_features = __get_cpu_features ();
> +
> +  if (!CPU_FEATURES_ARCH_P (cpu_features, MathVec_Prefer_No_AVX512)
> +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ))
> +    return OPTIMIZE (skx);
> +
> +  return OPTIMIZE (avx2_wrapper);
> +}
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S
> new file mode 100644
> index 0000000000..25fb8d0cac
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S
> @@ -0,0 +1,20 @@
> +/* SSE2 version of vectorized acos, vector length is 2.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define _ZGVbN2v_acos _ZGVbN2v_acos_sse2
> +#include "../svml_d_acos2_core.S"
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c
> new file mode 100644
> index 0000000000..5ba5d6fac2
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c
> @@ -0,0 +1,27 @@
> +/* Multiple versions of vectorized acos, vector length is 2.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define SYMBOL_NAME _ZGVbN2v_acos
> +#include "ifunc-mathvec-sse4_1.h"
> +
> +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
> +
> +#ifdef SHARED
> +__hidden_ver1 (_ZGVbN2v_acos, __GI__ZGVbN2v_acos, __redirect__ZGVbN2v_acos)
> +  __attribute__ ((visibility ("hidden")));
> +#endif
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S
> new file mode 100644
> index 0000000000..2c528c012e
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S
> @@ -0,0 +1,293 @@
> +/* Function acos vectorized with SSE4.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   https://www.gnu.org/licenses/.  */
> +
> +/*
> + * ALGORITHM DESCRIPTION:
> + *
> + *      SelMask = (|x| >= 0.5) ? 1 : 0;
> + *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
> + *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
> + *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
> + *
> + */
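
As a reading aid, the selection scheme described above transcribes to
roughly the following scalar C.  Poly is only a placeholder for the minimax
polynomial whose coefficients sit in the data table below; this is a sketch,
not the implementation:

  #include <math.h>

  static double Poly (double r);   /* placeholder for the polynomial */

  static double
  acos_sketch (double x)
  {
    double ax = fabs (x);
    int sel = ax >= 0.5;                           /* SelMask */
    double r = sel ? sqrt (0.5 - 0.5 * ax) : ax;   /* R */
    double acos_abs = sel ? 2.0 * Poly (r) : (M_PI_2 - Poly (r));
    return x < 0.0 ? M_PI - acos_abs : acos_abs;   /* restore sign */
  }
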
> +
> +/* Offsets for data table __svml_dacos_data_internal
> + */
> +#define SgnBit                         0
> +#define OneHalf                        16
> +#define SmallNorm                      32
> +#define MOne                           48
> +#define Two                            64
> +#define sqrt_coeff                     80
> +#define poly_coeff                     144
> +#define PiH                            336
> +#define Pi2H                           352
> +
> +#include <sysdep.h>
> +
> +        .text
> +       .section .text.sse4,"ax",@progbits
> +ENTRY(_ZGVbN2v_acos_sse4)
> +        subq      $72, %rsp
> +        cfi_def_cfa_offset(80)
> +        movaps    %xmm0, %xmm5
> +        movups    __svml_dacos_data_internal(%rip), %xmm3
> +        movups    OneHalf+__svml_dacos_data_internal(%rip), %xmm6
> +
> +/* x = -|arg| */
> +        movaps    %xmm3, %xmm4
> +        orps      %xmm5, %xmm4
> +
> +/* Y = 0.5 + 0.5*(-x) */
> +        movaps    %xmm6, %xmm7
> +        mulpd     %xmm4, %xmm7
> +        addpd     %xmm7, %xmm6
> +
> +/* S ~ 2*sqrt(Y) */
> +        cvtpd2ps  %xmm6, %xmm9
> +        movlhps   %xmm9, %xmm9
> +
> +/* x^2 */
> +        movaps    %xmm4, %xmm0
> +        rsqrtps   %xmm9, %xmm10
> +        mulpd     %xmm4, %xmm0
> +        cvtps2pd  %xmm10, %xmm11
> +        minpd     %xmm6, %xmm0
> +        movaps    %xmm6, %xmm1
> +        movaps    %xmm0, %xmm2
> +        cmpltpd   SmallNorm+__svml_dacos_data_internal(%rip), %xmm1
> +        cmpnltpd  %xmm6, %xmm2
> +        addpd     %xmm6, %xmm6
> +        andnps    %xmm11, %xmm1
> +        movaps    %xmm0, %xmm11
> +        movaps    %xmm1, %xmm12
> +        andps     %xmm5, %xmm3
> +        mulpd     %xmm1, %xmm12
> +        mulpd     %xmm6, %xmm1
> +        mulpd     %xmm12, %xmm6
> +        mulpd     %xmm0, %xmm11
> +        subpd     Two+__svml_dacos_data_internal(%rip), %xmm6
> +        movups    sqrt_coeff+__svml_dacos_data_internal(%rip), %xmm13
> +        movaps    %xmm6, %xmm14
> +        mulpd     %xmm6, %xmm13
> +        mulpd     %xmm1, %xmm14
> +        addpd     sqrt_coeff+16+__svml_dacos_data_internal(%rip), %xmm13
> +        mulpd     %xmm6, %xmm13
> +        addpd     sqrt_coeff+32+__svml_dacos_data_internal(%rip), %xmm13
> +        mulpd     %xmm13, %xmm6
> +
> +/* polynomial */
> +        movups    poly_coeff+__svml_dacos_data_internal(%rip), %xmm15
> +        movaps    %xmm11, %xmm7
> +        mulpd     %xmm0, %xmm15
> +        addpd     sqrt_coeff+48+__svml_dacos_data_internal(%rip), %xmm6
> +        addpd     poly_coeff+16+__svml_dacos_data_internal(%rip), %xmm15
> +        mulpd     %xmm11, %xmm7
> +        mulpd     %xmm6, %xmm14
> +        mulpd     %xmm11, %xmm15
> +        subpd     %xmm14, %xmm1
> +        movups    MOne+__svml_dacos_data_internal(%rip), %xmm8
> +        andps     %xmm2, %xmm1
> +
> +/* NaN processed in special branch (so wind test passed) */
> +        cmpnlepd  %xmm4, %xmm8
> +        movmskpd  %xmm8, %edx
> +
> +/* X<X^2 iff X<0 */
> +        movaps    %xmm5, %xmm12
> +        movups    poly_coeff+32+__svml_dacos_data_internal(%rip), %xmm8
> +        movaps    %xmm2, %xmm13
> +        movups    poly_coeff+64+__svml_dacos_data_internal(%rip), %xmm6
> +        mulpd     %xmm0, %xmm8
> +        mulpd     %xmm0, %xmm6
> +        addpd     poly_coeff+48+__svml_dacos_data_internal(%rip), %xmm8
> +        addpd     poly_coeff+80+__svml_dacos_data_internal(%rip), %xmm6
> +        cmpltpd   %xmm0, %xmm12
> +        addpd     %xmm15, %xmm8
> +        mulpd     %xmm11, %xmm6
> +        mulpd     %xmm7, %xmm8
> +        movups    poly_coeff+96+__svml_dacos_data_internal(%rip), %xmm9
> +        mulpd     %xmm0, %xmm9
> +        addpd     poly_coeff+112+__svml_dacos_data_internal(%rip), %xmm9
> +        addpd     %xmm6, %xmm9
> +        movups    poly_coeff+128+__svml_dacos_data_internal(%rip), %xmm10
> +        movaps    %xmm2, %xmm6
> +        mulpd     %xmm0, %xmm10
> +        addpd     %xmm8, %xmm9
> +        addpd     poly_coeff+144+__svml_dacos_data_internal(%rip), %xmm10
> +        mulpd     %xmm11, %xmm9
> +        movups    poly_coeff+160+__svml_dacos_data_internal(%rip), %xmm14
> +        andnps    %xmm4, %xmm6
> +        addpd     %xmm9, %xmm10
> +        mulpd     %xmm0, %xmm14
> +        mulpd     %xmm10, %xmm11
> +        addpd     poly_coeff+176+__svml_dacos_data_internal(%rip), %xmm14
> +        addpd     %xmm11, %xmm14
> +        mulpd     %xmm0, %xmm14
> +        orps      %xmm1, %xmm6
> +        pxor      %xmm3, %xmm6
> +        mulpd     %xmm6, %xmm14
> +        movups    PiH+__svml_dacos_data_internal(%rip), %xmm0
> +        andps     %xmm2, %xmm0
> +        andnps    Pi2H+__svml_dacos_data_internal(%rip), %xmm13
> +        andps     %xmm12, %xmm0
> +        addpd     %xmm13, %xmm0
> +        addpd     %xmm14, %xmm6
> +        addpd     %xmm6, %xmm0
> +        testl     %edx, %edx
> +
> +/* Go to special inputs processing branch */
> +        jne       L(SPECIAL_VALUES_BRANCH)
> +
> +/* Restore registers
> + * and exit the function
> + */
> +
> +L(EXIT):
> +        addq      $72, %rsp
> +        cfi_def_cfa_offset(8)
> +        ret
> +        cfi_def_cfa_offset(80)
> +
> +/* Branch to process
> + * special inputs
> + */
> +
> +L(SPECIAL_VALUES_BRANCH):
> +        movups    %xmm5, 32(%rsp)
> +        movups    %xmm0, 48(%rsp)
> +        xorl      %eax, %eax
> +        movq      %r12, 16(%rsp)
> +        cfi_offset(12, -64)
> +        movl      %eax, %r12d
> +        movq      %r13, 8(%rsp)
> +        cfi_offset(13, -72)
> +        movl      %edx, %r13d
> +        movq      %r14, (%rsp)
> +        cfi_offset(14, -80)
> +
> +/* Range mask
> + * bits check
> + */
> +
> +L(RANGEMASK_CHECK):
> +        btl       %r12d, %r13d
> +
> +/* Call scalar math function */
> +        jc        L(SCALAR_MATH_CALL)
> +
> +/* Special inputs
> + * processing loop
> + */
> +
> +L(SPECIAL_VALUES_LOOP):
> +        incl      %r12d
> +        cmpl      $2, %r12d
> +
> +/* Check bits in range mask */
> +        jl        L(RANGEMASK_CHECK)
> +        movq      16(%rsp), %r12
> +        cfi_restore(12)
> +        movq      8(%rsp), %r13
> +        cfi_restore(13)
> +        movq      (%rsp), %r14
> +        cfi_restore(14)
> +        movups    48(%rsp), %xmm0
> +
> +/* Go to exit */
> +        jmp       L(EXIT)
> +        cfi_offset(12, -64)
> +        cfi_offset(13, -72)
> +        cfi_offset(14, -80)
> +
> +/* Scalar math function call
> + * to process special input
> + */
> +
> +L(SCALAR_MATH_CALL):
> +        movl      %r12d, %r14d
> +        movsd     32(%rsp,%r14,8), %xmm0
> +        call      acos@PLT
> +        movsd     %xmm0, 48(%rsp,%r14,8)
> +
> +/* Process special inputs in loop */
> +        jmp       L(SPECIAL_VALUES_LOOP)
> +
> +END(_ZGVbN2v_acos_sse4)
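
In rough C terms (my paraphrase; mask is the movmskpd result, vin/vout are
the input and result lanes spilled at 32(%rsp) and 48(%rsp)), the
special-values tail above does:

  /* For every lane flagged as special (|x| > 1 or NaN), redo the work with
     the scalar acos, presumably so out-of-domain inputs get the scalar
     acos semantics (NaN result, errno/exceptions).  */
  for (int i = 0; i < 2; i++)
    if (mask & (1 << i))
      vout[i] = acos (vin[i]);
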
> +
> +        .section .rodata, "a"
> +        .align 16
> +
> +#ifdef __svml_dacos_data_internal_typedef
> +typedef unsigned int VUINT32;
> +typedef struct {
> +        __declspec(align(16)) VUINT32 SgnBit[2][2];
> +        __declspec(align(16)) VUINT32 OneHalf[2][2];
> +        __declspec(align(16)) VUINT32 SmallNorm[2][2];
> +        __declspec(align(16)) VUINT32 MOne[2][2];
> +        __declspec(align(16)) VUINT32 Two[2][2];
> +        __declspec(align(16)) VUINT32 sqrt_coeff[4][2][2];
> +        __declspec(align(16)) VUINT32 poly_coeff[12][2][2];
> +        __declspec(align(16)) VUINT32 PiH[2][2];
> +        __declspec(align(16)) VUINT32 Pi2H[2][2];
> +} __svml_dacos_data_internal;
> +#endif
> +__svml_dacos_data_internal:
> +        /*== SgnBit ==*/
> +        .quad 0x8000000000000000, 0x8000000000000000
> +        /*== OneHalf ==*/
> +        .align 16
> +        .quad 0x3fe0000000000000, 0x3fe0000000000000
> +        /*== SmallNorm ==*/
> +        .align 16
> +        .quad 0x3000000000000000, 0x3000000000000000
> +        /*== MOne ==*/
> +        .align 16
> +        .quad 0xbff0000000000000, 0xbff0000000000000
> +        /*== Two ==*/
> +        .align 16
> +        .quad 0x4000000000000000, 0x4000000000000000
> +        /*== sqrt_coeff[4] ==*/
> +        .align 16
> +        .quad 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */
> +        .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */
> +        .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */
> +        .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */
> +        /*== poly_coeff[12] ==*/
> +        .align 16
> +        .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */
> +        .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */
> +        .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */
> +        .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */
> +        .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */
> +        .quad 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */
> +        .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */
> +        .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */
> +        .quad 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */
> +        .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */
> +        .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */
> +        .quad 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */
> +        /*== PiH ==*/
> +        .align 16
> +        .quad 0x400921fb54442d18, 0x400921fb54442d18
> +        /*== Pi2H ==*/
> +        .align 16
> +        .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18
> +        .align 16
> +        .type  __svml_dacos_data_internal,@object
> +        .size  __svml_dacos_data_internal,.-__svml_dacos_data_internal
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S
> new file mode 100644
> index 0000000000..750f71c81c
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S
> @@ -0,0 +1,20 @@
> +/* SSE version of vectorized acos, vector length is 4.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define _ZGVdN4v_acos _ZGVdN4v_acos_sse_wrapper
> +#include "../svml_d_acos4_core.S"
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c
> new file mode 100644
> index 0000000000..6453e7ebe2
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c
> @@ -0,0 +1,27 @@
> +/* Multiple versions of vectorized acos, vector length is 4.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define SYMBOL_NAME _ZGVdN4v_acos
> +#include "ifunc-mathvec-avx2.h"
> +
> +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
> +
> +#ifdef SHARED
> +__hidden_ver1 (_ZGVdN4v_acos, __GI__ZGVdN4v_acos, __redirect__ZGVdN4v_acos)
> +  __attribute__ ((visibility ("hidden")));
> +#endif
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S
> new file mode 100644
> index 0000000000..172080e3ea
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S
> @@ -0,0 +1,273 @@
> +/* Function acos vectorized with AVX2.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   https://www.gnu.org/licenses/.  */
> +
> +/*
> + * ALGORITHM DESCRIPTION:
> + *
> + *      SelMask = (|x| >= 0.5) ? 1 : 0;
> + *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
> + *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
> + *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
> + *
> + */
> +
> +/* Offsets for data table __svml_dacos_data_internal
> + */
> +#define SgnBit                         0
> +#define OneHalf                        32
> +#define SmallNorm                      64
> +#define MOne                           96
> +#define Two                            128
> +#define sqrt_coeff                     160
> +#define poly_coeff                     288
> +#define PiH                            672
> +#define Pi2H                           704
> +
> +#include <sysdep.h>
> +
> +        .text
> +       .section .text.avx2,"ax",@progbits
> +ENTRY(_ZGVdN4v_acos_avx2)
> +        pushq     %rbp
> +        cfi_def_cfa_offset(16)
> +        movq      %rsp, %rbp
> +        cfi_def_cfa(6, 16)
> +        cfi_offset(6, -16)
> +        andq      $-32, %rsp
> +        subq      $96, %rsp
> +        vmovupd   __svml_dacos_data_internal(%rip), %ymm6
> +        vmovupd   OneHalf+__svml_dacos_data_internal(%rip), %ymm7
> +        vmovapd   %ymm0, %ymm5
> +
> +/* x = -|arg| */
> +        vorpd     %ymm5, %ymm6, %ymm4
> +
> +/* Y = 0.5 + 0.5*(-x) */
> +        vfmadd231pd %ymm4, %ymm7, %ymm7
> +
> +/* x^2 */
> +        vmulpd    %ymm4, %ymm4, %ymm8
> +
> +/* S ~ 2*sqrt(Y) */
> +        vmovupd   sqrt_coeff+__svml_dacos_data_internal(%rip), %ymm0
> +        vcmplt_oqpd SmallNorm+__svml_dacos_data_internal(%rip), %ymm7, %ymm12
> +        vminpd    %ymm7, %ymm8, %ymm2
> +
> +/* NaN processed in special branch (so wind test passed) */
> +        vcmpnge_uqpd MOne+__svml_dacos_data_internal(%rip), %ymm4, %ymm9
> +        vcvtpd2ps %ymm7, %xmm10
> +        vmovupd   poly_coeff+64+__svml_dacos_data_internal(%rip), %ymm8
> +        vcmpnlt_uqpd %ymm7, %ymm2, %ymm1
> +        vrsqrtps  %xmm10, %xmm11
> +        vfmadd213pd poly_coeff+96+__svml_dacos_data_internal(%rip), %ymm2, %ymm8
> +        vcvtps2pd %xmm11, %ymm13
> +        vmovupd   poly_coeff+128+__svml_dacos_data_internal(%rip), %ymm11
> +        vandnpd   %ymm13, %ymm12, %ymm14
> +        vmulpd    %ymm14, %ymm14, %ymm15
> +        vfmadd213pd poly_coeff+160+__svml_dacos_data_internal(%rip), %ymm2, %ymm11
> +        vmulpd    %ymm2, %ymm2, %ymm13
> +        vmovupd   poly_coeff+256+__svml_dacos_data_internal(%rip), %ymm12
> +        vmulpd    %ymm13, %ymm13, %ymm10
> +        vfmadd213pd poly_coeff+288+__svml_dacos_data_internal(%rip), %ymm2, %ymm12
> +        vandpd    %ymm5, %ymm6, %ymm3
> +        vaddpd    %ymm7, %ymm7, %ymm6
> +        vmulpd    %ymm6, %ymm14, %ymm7
> +        vfmsub213pd Two+__svml_dacos_data_internal(%rip), %ymm15, %ymm6
> +        vmovupd   poly_coeff+320+__svml_dacos_data_internal(%rip), %ymm14
> +        vfmadd213pd sqrt_coeff+32+__svml_dacos_data_internal(%rip), %ymm6, %ymm0
> +        vmulpd    %ymm6, %ymm7, %ymm15
> +        vfmadd213pd poly_coeff+352+__svml_dacos_data_internal(%rip), %ymm2, %ymm14
> +        vfmadd213pd sqrt_coeff+64+__svml_dacos_data_internal(%rip), %ymm6, %ymm0
> +        vfmadd213pd sqrt_coeff+96+__svml_dacos_data_internal(%rip), %ymm6, %ymm0
> +
> +/* polynomial */
> +        vmovupd   poly_coeff+__svml_dacos_data_internal(%rip), %ymm6
> +        vfnmadd213pd %ymm7, %ymm15, %ymm0
> +        vfmadd213pd poly_coeff+32+__svml_dacos_data_internal(%rip), %ymm2, %ymm6
> +        vblendvpd %ymm1, %ymm0, %ymm4, %ymm0
> +        vfmadd213pd %ymm8, %ymm13, %ymm6
> +        vmovmskpd %ymm9, %edx
> +        vmovupd   poly_coeff+192+__svml_dacos_data_internal(%rip), %ymm9
> +        vfmadd213pd poly_coeff+224+__svml_dacos_data_internal(%rip), %ymm2, %ymm9
> +        vfmadd213pd %ymm9, %ymm13, %ymm11
> +        vfmadd213pd %ymm11, %ymm10, %ymm6
> +        vfmadd213pd %ymm12, %ymm13, %ymm6
> +        vfmadd213pd %ymm14, %ymm13, %ymm6
> +        vmulpd    %ymm6, %ymm2, %ymm9
> +
> +/* X<X^2 iff X<0 */
> +        vcmplt_oqpd %ymm2, %ymm5, %ymm6
> +        vandpd    PiH+__svml_dacos_data_internal(%rip), %ymm1, %ymm2
> +        vandnpd   Pi2H+__svml_dacos_data_internal(%rip), %ymm1, %ymm7
> +        vxorpd    %ymm3, %ymm0, %ymm1
> +        vfmadd213pd %ymm1, %ymm1, %ymm9
> +        vandpd    %ymm6, %ymm2, %ymm2
> +        vaddpd    %ymm7, %ymm2, %ymm8
> +        vaddpd    %ymm9, %ymm8, %ymm0
> +        testl     %edx, %edx
> +
> +/* Go to special inputs processing branch */
> +        jne       L(SPECIAL_VALUES_BRANCH)
> +
> +/* Restore registers
> + * and exit the function
> + */
> +
> +L(EXIT):
> +        movq      %rbp, %rsp
> +        popq      %rbp
> +        cfi_def_cfa(7, 8)
> +        cfi_restore(6)
> +        ret
> +        cfi_def_cfa(6, 16)
> +        cfi_offset(6, -16)
> +
> +/* Branch to process
> + * special inputs
> + */
> +
> +L(SPECIAL_VALUES_BRANCH):
> +        vmovupd   %ymm5, 32(%rsp)
> +        vmovupd   %ymm0, 64(%rsp)
> +        xorl      %eax, %eax
> +        vzeroupper
> +        movq      %r12, 16(%rsp)
> +        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
> +        movl      %eax, %r12d
> +        movq      %r13, 8(%rsp)
> +        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
> +        movl      %edx, %r13d
> +        movq      %r14, (%rsp)
> +        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
> +
> +/* Range mask
> + * bits check
> + */
> +
> +L(RANGEMASK_CHECK):
> +        btl       %r12d, %r13d
> +
> +/* Call scalar math function */
> +        jc        L(SCALAR_MATH_CALL)
> +
> +/* Special inputs
> + * processing loop
> + */
> +
> +L(SPECIAL_VALUES_LOOP):
> +        incl      %r12d
> +        cmpl      $4, %r12d
> +
> +/* Check bits in range mask */
> +        jl        L(RANGEMASK_CHECK)
> +        movq      16(%rsp), %r12
> +        cfi_restore(12)
> +        movq      8(%rsp), %r13
> +        cfi_restore(13)
> +        movq      (%rsp), %r14
> +        cfi_restore(14)
> +        vmovupd   64(%rsp), %ymm0
> +
> +/* Go to exit */
> +        jmp       L(EXIT)
> +        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
> +        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
> +        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
> +
> +/* Scalar math function call
> + * to process special input
> + */
> +
> +L(SCALAR_MATH_CALL):
> +        movl      %r12d, %r14d
> +        movsd     32(%rsp,%r14,8), %xmm0
> +        call      acos@PLT
> +        movsd     %xmm0, 64(%rsp,%r14,8)
> +
> +/* Process special inputs in loop */
> +        jmp       L(SPECIAL_VALUES_LOOP)
> +
> +END(_ZGVdN4v_acos_avx2)
> +
> +        .section .rodata, "a"
> +        .align 32
> +
> +#ifdef __svml_dacos_data_internal_typedef
> +typedef unsigned int VUINT32;
> +typedef struct {
> +        __declspec(align(32)) VUINT32 SgnBit[4][2];
> +        __declspec(align(32)) VUINT32 OneHalf[4][2];
> +        __declspec(align(32)) VUINT32 SmallNorm[4][2];
> +        __declspec(align(32)) VUINT32 MOne[4][2];
> +        __declspec(align(32)) VUINT32 Two[4][2];
> +        __declspec(align(32)) VUINT32 sqrt_coeff[4][4][2];
> +        __declspec(align(32)) VUINT32 poly_coeff[12][4][2];
> +        __declspec(align(32)) VUINT32 PiH[4][2];
> +        __declspec(align(32)) VUINT32 Pi2H[4][2];
> +} __svml_dacos_data_internal;
> +#endif
> +__svml_dacos_data_internal:
> +        /*== SgnBit ==*/
> +        .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
> +        /*== OneHalf ==*/
> +        .align 32
> +        .quad 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000
> +        /*== SmallNorm ==*/
> +        .align 32
> +        .quad 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000
> +        /*== MOne ==*/
> +        .align 32
> +        .quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
> +        /*== Two ==*/
> +        .align 32
> +        .quad 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000
> +        /*== sqrt_coeff[4] ==*/
> +        .align 32
> +        .quad 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */
> +        .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */
> +        .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */
> +        .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */
> +        /*== poly_coeff[12] ==*/
> +        .align 32
> +        .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */
> +        .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */
> +        .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */
> +        .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */
> +        .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */
> +        .quad 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */
> +        .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */
> +        .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */
> +        .quad 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */
> +        .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */
> +        .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */
> +        .quad 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */
> +        /*== PiH ==*/
> +        .align 32
> +        .quad 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18
> +        /*== Pi2H ==*/
> +        .align 32
> +        .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
> +        .align 32
> +        .type  __svml_dacos_data_internal,@object
> +        .size  __svml_dacos_data_internal,.-__svml_dacos_data_internal
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S
> new file mode 100644
> index 0000000000..4d64fd1c00
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S
> @@ -0,0 +1,20 @@
> +/* AVX2 version of vectorized acos, vector length is 8.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define _ZGVeN8v_acos _ZGVeN8v_acos_avx2_wrapper
> +#include "../svml_d_acos8_core.S"
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c
> new file mode 100644
> index 0000000000..1e7d1865fb
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c
> @@ -0,0 +1,27 @@
> +/* Multiple versions of vectorized acos, vector length is 8.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define SYMBOL_NAME _ZGVeN8v_acos
> +#include "ifunc-mathvec-avx512-skx.h"
> +
> +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
> +
> +#ifdef SHARED
> +__hidden_ver1 (_ZGVeN8v_acos, __GI__ZGVeN8v_acos, __redirect__ZGVeN8v_acos)
> +  __attribute__ ((visibility ("hidden")));
> +#endif
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S
> new file mode 100644
> index 0000000000..76ca35ad7b
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S
> @@ -0,0 +1,298 @@
> +/* Function acos vectorized with AVX-512.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   https://www.gnu.org/licenses/.  */
> +
> +/*
> + * ALGORITHM DESCRIPTION:
> + *
> + *      SelMask = (|x| >= 0.5) ? 1 : 0;
> + *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
> + *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
> + *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
> + *
> + */
> +
> +/* Offsets for data table __svml_dacos_data_internal
> + */
> +#define SgnBit                         0
> +#define OneHalf                        64
> +#define SmallNorm                      128
> +#define MOne                           192
> +#define Two                            256
> +#define sqrt_coeff_1                   320
> +#define sqrt_coeff_2                   384
> +#define sqrt_coeff_3                   448
> +#define sqrt_coeff_4                   512
> +#define poly_coeff_1                   576
> +#define poly_coeff_2                   640
> +#define poly_coeff_3                   704
> +#define poly_coeff_4                   768
> +#define poly_coeff_5                   832
> +#define poly_coeff_6                   896
> +#define poly_coeff_7                   960
> +#define poly_coeff_8                   1024
> +#define poly_coeff_9                   1088
> +#define poly_coeff_10                  1152
> +#define poly_coeff_11                  1216
> +#define poly_coeff_12                  1280
> +#define PiH                            1344
> +#define Pi2H                           1408

There is enough data here that it may pay to lay out the table so the
accesses are sequential in memory.
> +
> +#include <sysdep.h>
> +
> +        .text
> +       .section .text.evex512,"ax",@progbits
> +ENTRY(_ZGVeN8v_acos_skx)
> +        pushq     %rbp
> +        cfi_def_cfa_offset(16)
> +        movq      %rsp, %rbp
> +        cfi_def_cfa(6, 16)
> +        cfi_offset(6, -16)
> +        andq      $-64, %rsp
> +        subq      $192, %rsp
> +        vmovups   __svml_dacos_data_internal(%rip), %zmm7
> +        vmovups   OneHalf+__svml_dacos_data_internal(%rip), %zmm8
> +
> +/* S ~ 2*sqrt(Y) */
> +        vmovups   SmallNorm+__svml_dacos_data_internal(%rip), %zmm11
> +        vmovups   Two+__svml_dacos_data_internal(%rip), %zmm14
> +        vmovups   sqrt_coeff_1+__svml_dacos_data_internal(%rip), %zmm15
> +        vmovups   sqrt_coeff_2+__svml_dacos_data_internal(%rip), %zmm2
> +        vmovups   sqrt_coeff_3+__svml_dacos_data_internal(%rip), %zmm1
> +        vmovups   MOne+__svml_dacos_data_internal(%rip), %zmm10
> +        vmovaps   %zmm0, %zmm6
> +
> +/* x = -|arg| */
> +        vorpd     %zmm6, %zmm7, %zmm5
> +        vandpd    %zmm6, %zmm7, %zmm4
> +
> +/* Y = 0.5 + 0.5*(-x) */
> +        vfmadd231pd {rn-sae}, %zmm5, %zmm8, %zmm8
> +
> +/* x^2 */
> +        vmulpd    {rn-sae}, %zmm5, %zmm5, %zmm9
> +        vrsqrt14pd %zmm8, %zmm12
> +        vcmppd    $17, {sae}, %zmm11, %zmm8, %k1
> +        vcmppd    $17, {sae}, %zmm10, %zmm5, %k0
> +        vmovups   poly_coeff_5+__svml_dacos_data_internal(%rip), %zmm10
> +        vmovups   poly_coeff_7+__svml_dacos_data_internal(%rip), %zmm11
> +        vminpd    {sae}, %zmm8, %zmm9, %zmm3
> +        vmovups   poly_coeff_3+__svml_dacos_data_internal(%rip), %zmm9
> +        vxorpd    %zmm12, %zmm12, %zmm12{%k1}
> +        vaddpd    {rn-sae}, %zmm8, %zmm8, %zmm0
> +        vcmppd    $21, {sae}, %zmm8, %zmm3, %k4
> +
> +/* X<X^2 iff X<0 */
> +        vcmppd    $17, {sae}, %zmm3, %zmm6, %k2
> +        vmulpd    {rn-sae}, %zmm12, %zmm12, %zmm13
> +        vmulpd    {rn-sae}, %zmm12, %zmm0, %zmm7
> +        vmovups   poly_coeff_4+__svml_dacos_data_internal(%rip), %zmm12
> +
> +/* polynomial */
> +        vmovups   poly_coeff_1+__svml_dacos_data_internal(%rip), %zmm8
> +        vfmsub213pd {rn-sae}, %zmm14, %zmm13, %zmm0
> +        vmovups   sqrt_coeff_4+__svml_dacos_data_internal(%rip), %zmm13
> +        vfmadd231pd {rn-sae}, %zmm3, %zmm9, %zmm12
> +        vmovups   poly_coeff_11+__svml_dacos_data_internal(%rip), %zmm9
> +        vfmadd231pd {rn-sae}, %zmm0, %zmm15, %zmm2
> +        vmovups   poly_coeff_9+__svml_dacos_data_internal(%rip), %zmm15
> +        vmulpd    {rn-sae}, %zmm0, %zmm7, %zmm14
> +        vfmadd213pd {rn-sae}, %zmm1, %zmm0, %zmm2
> +        vmovups   poly_coeff_2+__svml_dacos_data_internal(%rip), %zmm1
> +        kmovw     %k4, %eax
> +        kmovw     %k2, %ecx
> +        kmovw     %k0, %edx
> +        vfmadd213pd {rn-sae}, %zmm13, %zmm0, %zmm2
> +        vfmadd231pd {rn-sae}, %zmm3, %zmm8, %zmm1
> +        vmovups   poly_coeff_10+__svml_dacos_data_internal(%rip), %zmm8
> +        vmulpd    {rn-sae}, %zmm3, %zmm3, %zmm0
> +        vfnmadd213pd {rn-sae}, %zmm7, %zmm14, %zmm2
> +        vmovups   poly_coeff_6+__svml_dacos_data_internal(%rip), %zmm7
> +        vfmadd231pd {rn-sae}, %zmm3, %zmm15, %zmm8
> +        vfmadd213pd {rn-sae}, %zmm12, %zmm0, %zmm1
> +        vblendmpd %zmm2, %zmm5, %zmm2{%k4}
> +        vfmadd231pd {rn-sae}, %zmm3, %zmm10, %zmm7
> +        vmovups   poly_coeff_8+__svml_dacos_data_internal(%rip), %zmm10
> +        vfmadd231pd {rn-sae}, %zmm3, %zmm11, %zmm10
> +        andl      %eax, %ecx
I think this `andl' can be dropped,

> +        vmovups   poly_coeff_12+__svml_dacos_data_internal(%rip), %zmm11
> +        kmovw     %ecx, %k3
kandw %k4, %k2, %k3 would combine the two masks directly in the mask
registers and avoid the round trip through %eax/%ecx.

> +        vfmadd213pd {rn-sae}, %zmm10, %zmm0, %zmm7
> +        vfmadd231pd {rn-sae}, %zmm3, %zmm9, %zmm11
> +        vmulpd    {rn-sae}, %zmm0, %zmm0, %zmm10
> +        vfmadd213pd {rn-sae}, %zmm7, %zmm10, %zmm1
> +        vfmadd213pd {rn-sae}, %zmm8, %zmm0, %zmm1
> +        vfmadd213pd {rn-sae}, %zmm11, %zmm0, %zmm1
> +        vmovups   Pi2H+__svml_dacos_data_internal(%rip), %zmm0
> +        vmulpd    {rn-sae}, %zmm3, %zmm1, %zmm1
> +        vxorpd    %zmm4, %zmm2, %zmm3
> +        vxorpd    %zmm0, %zmm0, %zmm0{%k4}
> +        vfmadd213pd {rn-sae}, %zmm3, %zmm3, %zmm1
> +        vorpd     PiH+__svml_dacos_data_internal(%rip), %zmm0, %zmm0{%k3}
> +        vaddpd    {rn-sae}, %zmm1, %zmm0, %zmm0
> +        testl     %edx, %edx
> +
> +/* Go to special inputs processing branch */
> +        jne       L(SPECIAL_VALUES_BRANCH)
> +
> +/* Restore registers
> + * and exit the function
> + */
> +
> +L(EXIT):
> +        movq      %rbp, %rsp
> +        popq      %rbp
> +        cfi_def_cfa(7, 8)
> +        cfi_restore(6)
> +        ret
> +        cfi_def_cfa(6, 16)
> +        cfi_offset(6, -16)
> +
> +/* Branch to process
> + * special inputs
> + */
> +
> +L(SPECIAL_VALUES_BRANCH):
> +        vmovups   %zmm6, 64(%rsp)
> +        vmovups   %zmm0, 128(%rsp)
> +        xorl      %eax, %eax
> +        vzeroupper
> +        movq      %r12, 16(%rsp)
> +        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> +        movl      %eax, %r12d
> +        movq      %r13, 8(%rsp)
> +        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> +        movl      %edx, %r13d
> +        movq      %r14, (%rsp)
> +        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> +
> +/* Range mask
> + * bits check
> + */
> +
> +L(RANGEMASK_CHECK):
> +        btl       %r12d, %r13d
> +
> +/* Call scalar math function */
> +        jc        L(SCALAR_MATH_CALL)
> +
> +/* Special inputs
> + * processing loop
> + */
> +
> +L(SPECIAL_VALUES_LOOP):
> +        incl      %r12d
> +        cmpl      $8, %r12d
> +
> +/* Check bits in range mask */
> +        jl        L(RANGEMASK_CHECK)
> +        movq      16(%rsp), %r12
> +        cfi_restore(12)
> +        movq      8(%rsp), %r13
> +        cfi_restore(13)
> +        movq      (%rsp), %r14
> +        cfi_restore(14)
> +        vmovups   128(%rsp), %zmm0
> +
> +/* Go to exit */
> +        jmp       L(EXIT)
> +        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> +        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> +        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> +
> +/* Scalar math function call
> + * to process special input
> + */
> +
> +L(SCALAR_MATH_CALL):
> +        movl      %r12d, %r14d
> +        movsd     64(%rsp,%r14,8), %xmm0
> +        call      acos@PLT
> +        movsd     %xmm0, 128(%rsp,%r14,8)
> +
> +/* Process special inputs in loop */
> +        jmp       L(SPECIAL_VALUES_LOOP)
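Not a change request, just spelling out the fallback path for other
readers: %edx carries one bit per lane that failed the |x| <= 1 check,
and RANGEMASK_CHECK/SPECIAL_VALUES_LOOP walk those bits, reloading each
flagged lane from the spill area and patching it with the scalar result.
A rough C equivalent (illustrative only, names invented here):

#include <math.h>

/* Lanes flagged in the range mask are recomputed with scalar acos and
   written over the stored vector result (sketch, not part of the patch).  */
static void
fixup_special_lanes (const double in[8], double out[8], unsigned int mask)
{
  int i;
  for (i = 0; i < 8; i++)
    if (mask & (1u << i))      /* btl %r12d, %r13d */
      out[i] = acos (in[i]);   /* call acos@PLT */
}
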
> +
> +END(_ZGVeN8v_acos_skx)
> +
> +        .section .rodata, "a"
> +        .align 64
> +
> +#ifdef __svml_dacos_data_internal_typedef
> +typedef unsigned int VUINT32;
> +typedef struct {
> +        __declspec(align(64)) VUINT32 SgnBit[8][2];
> +        __declspec(align(64)) VUINT32 OneHalf[8][2];
> +        __declspec(align(64)) VUINT32 SmallNorm[8][2];
> +        __declspec(align(64)) VUINT32 MOne[8][2];
> +        __declspec(align(64)) VUINT32 Two[8][2];
> +        __declspec(align(64)) VUINT32 sqrt_coeff[4][8][2];
> +        __declspec(align(64)) VUINT32 poly_coeff[12][8][2];
> +        __declspec(align(64)) VUINT32 PiH[8][2];
> +        __declspec(align(64)) VUINT32 Pi2H[8][2];
> +} __svml_dacos_data_internal;
> +#endif
> +__svml_dacos_data_internal:
> +        /*== SgnBit ==*/
> +        .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
> +        /*== OneHalf ==*/
> +        .align 64
> +        .quad 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000
> +        /*== SmallNorm ==*/
> +        .align 64
> +        .quad 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000
> +        /*== MOne ==*/
> +        .align 64
> +        .quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
> +        /*== Two ==*/
> +        .align 64
> +        .quad 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000
> +        /*== sqrt_coeff[4] ==*/
> +        .align 64
> +        .quad 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */
> +        .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */
> +        .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */
> +        .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */
> +        /*== poly_coeff[12] ==*/
> +        .align 64
> +        .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */
> +        .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */
> +        .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */
> +        .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */
> +        .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */
> +        .quad 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */
> +        .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */
> +        .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */
> +        .quad 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */
> +        .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */
> +        .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */
> +        .quad 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */
> +        /*== PiH ==*/
> +        .align 64
> +        .quad 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18
> +        /*== Pi2H ==*/
> +        .align 64
> +        .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
> +        .align 64
> +        .type  __svml_dacos_data_internal,@object
> +        .size  __svml_dacos_data_internal,.-__svml_dacos_data_internal
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S
> new file mode 100644
> index 0000000000..1ff0cfc8d5
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S
> @@ -0,0 +1,20 @@
> +/* AVX2 version of vectorized acosf.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define _ZGVeN16v_acosf _ZGVeN16v_acosf_avx2_wrapper
> +#include "../svml_s_acosf16_core.S"
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c
> new file mode 100644
> index 0000000000..fcf05782c5
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c
> @@ -0,0 +1,28 @@
> +/* Multiple versions of vectorized acosf, vector length is 16.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define SYMBOL_NAME _ZGVeN16v_acosf
> +#include "ifunc-mathvec-avx512-skx.h"
> +
> +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
> +
> +#ifdef SHARED
> +__hidden_ver1 (_ZGVeN16v_acosf, __GI__ZGVeN16v_acosf,
> +              __redirect__ZGVeN16v_acosf)
> +  __attribute__ ((visibility ("hidden")));
> +#endif
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
> new file mode 100644
> index 0000000000..1db2969c77
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
> @@ -0,0 +1,262 @@
> +/* Function acosf vectorized with AVX-512.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   https://www.gnu.org/licenses/.  */
> +
> +/*
> + * ALGORITHM DESCRIPTION:
> + *
> + *      SelMask = (|x| >= 0.5) ? 1 : 0;
> + *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
> + *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
> + *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
> + *
> + *
> + */
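For anyone reading the algorithm comment above cold, a scalar C model of
the same steps might look like this (illustrative only, not part of the
patch; poly() stands in for the poly_coeff[] evaluation done in the
vector code):

#include <math.h>

/* Scalar sketch of the SelMask/R/Poly scheme described above.  */
static float
acosf_model (float x, float (*poly) (float))
{
  const float pi = 3.14159265358979323846f;
  float ax = fabsf (x);
  int sel = ax >= 0.5f;                                /* SelMask */
  float r = sel ? sqrtf (0.5f - 0.5f * ax) : ax;
  float acos_ax = sel ? 2.0f * poly (r) : pi / 2.0f - poly (r);
  return x < 0.0f ? pi - acos_ax : acos_ax;            /* sign(x) fixup */
}
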
> +
> +/* Offsets for data table __svml_sacos_data_internal
> + */
> +#define SgnBit                         0
> +#define OneHalf                        64
> +#define SmallNorm                      128
> +#define MOne                           192
> +#define Two                            256
> +#define sqrt_coeff_1                   320
> +#define sqrt_coeff_2                   384
> +#define poly_coeff_1                   448
> +#define poly_coeff_2                   512
> +#define poly_coeff_3                   576
> +#define poly_coeff_4                   640
> +#define poly_coeff_5                   704
> +#define Pi2H                           768
> +#define PiH                            832
> +
> +#include <sysdep.h>
> +
> +        .text
> +       .section .text.evex512,"ax",@progbits
> +ENTRY(_ZGVeN16v_acosf_skx)
> +        pushq     %rbp
> +        cfi_def_cfa_offset(16)
> +        movq      %rsp, %rbp
> +        cfi_def_cfa(6, 16)
> +        cfi_offset(6, -16)
> +        andq      $-64, %rsp
> +        subq      $192, %rsp
> +        vmovups   __svml_sacos_data_internal(%rip), %zmm5
> +        vmovups   OneHalf+__svml_sacos_data_internal(%rip), %zmm6
> +
> +/* SQ ~ 2*sqrt(Y) */
> +        vmovups   SmallNorm+__svml_sacos_data_internal(%rip), %zmm9
> +        vmovups   MOne+__svml_sacos_data_internal(%rip), %zmm8
> +        vmovups   Two+__svml_sacos_data_internal(%rip), %zmm12
> +        vmovups   sqrt_coeff_1+__svml_sacos_data_internal(%rip), %zmm13
> +        vmovaps   %zmm0, %zmm4
> +
> +/* x = -|arg| */
> +        vorps     %zmm4, %zmm5, %zmm3
> +        vandps    %zmm4, %zmm5, %zmm2
> +        vmovups   sqrt_coeff_2+__svml_sacos_data_internal(%rip), %zmm0
> +
> +/* Y = 0.5 + 0.5*(-x) */
> +        vfmadd231ps {rn-sae}, %zmm3, %zmm6, %zmm6
> +
> +/* x^2 */
> +        vmulps    {rn-sae}, %zmm3, %zmm3, %zmm7
> +        vrsqrt14ps %zmm6, %zmm10
> +        vcmpps    $17, {sae}, %zmm9, %zmm6, %k1
> +        vcmpps    $22, {sae}, %zmm3, %zmm8, %k0
> +        vmovups   poly_coeff_4+__svml_sacos_data_internal(%rip), %zmm9
> +        vminps    {sae}, %zmm6, %zmm7, %zmm1
> +        vmovups   poly_coeff_3+__svml_sacos_data_internal(%rip), %zmm7
> +        vxorps    %zmm10, %zmm10, %zmm10{%k1}
> +        vaddps    {rn-sae}, %zmm6, %zmm6, %zmm14
> +        vmulps    {rn-sae}, %zmm1, %zmm1, %zmm8
> +        vmulps    {rn-sae}, %zmm10, %zmm10, %zmm11
> +        vmulps    {rn-sae}, %zmm10, %zmm14, %zmm5
> +        vcmpps    $21, {sae}, %zmm6, %zmm1, %k4
> +
> +/* X<X^2 iff X<0 */
> +        vcmpps    $17, {sae}, %zmm1, %zmm4, %k2
> +
> +/* polynomial */
> +        vmovups   poly_coeff_1+__svml_sacos_data_internal(%rip), %zmm6
> +        vfmsub213ps {rn-sae}, %zmm12, %zmm11, %zmm14
> +        vmovups   poly_coeff_2+__svml_sacos_data_internal(%rip), %zmm11
> +        vfmadd231ps {rn-sae}, %zmm1, %zmm7, %zmm9
> +        vmovups   poly_coeff_5+__svml_sacos_data_internal(%rip), %zmm10
> +        vmovups   Pi2H+__svml_sacos_data_internal(%rip), %zmm12
> +        vfmadd231ps {rn-sae}, %zmm14, %zmm13, %zmm0
> +        vfmadd231ps {rn-sae}, %zmm1, %zmm6, %zmm11
> +        vmulps    {rn-sae}, %zmm14, %zmm5, %zmm15
> +        vfmadd213ps {rn-sae}, %zmm9, %zmm8, %zmm11
> +        vxorps    %zmm12, %zmm12, %zmm12{%k4}
> +        vfnmadd213ps {rn-sae}, %zmm5, %zmm15, %zmm0
> +        vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm11
> +        kmovw     %k4, %eax
> +        kmovw     %k2, %ecx
> +        kmovw     %k0, %edx
> +        vmulps    {rn-sae}, %zmm1, %zmm11, %zmm13
> +        vblendmps %zmm0, %zmm3, %zmm0{%k4}
> +        vxorps    %zmm2, %zmm0, %zmm1
> +        andl      %eax, %ecx
Same here: I think this andl/kmovw pair can be dropped.

> +        kmovw     %ecx, %k3
kandw %k4, %k2, %k3, as suggested above for the double version.

> +        vfmadd213ps {rn-sae}, %zmm1, %zmm1, %zmm13
> +        vorps     PiH+__svml_sacos_data_internal(%rip), %zmm12, %zmm12{%k3}
> +        vaddps    {rn-sae}, %zmm13, %zmm12, %zmm0
> +        testl     %edx, %edx
> +
> +/* Go to special inputs processing branch */
> +        jne       L(SPECIAL_VALUES_BRANCH)
> +
> +/* Restore registers
> + * and exit the function
> + */
> +
> +L(EXIT):
> +        movq      %rbp, %rsp
> +        popq      %rbp
> +        cfi_def_cfa(7, 8)
> +        cfi_restore(6)
> +        ret
> +        cfi_def_cfa(6, 16)
> +        cfi_offset(6, -16)
> +
> +/* Branch to process
> + * special inputs
> + */
> +
> +L(SPECIAL_VALUES_BRANCH):
> +        vmovups   %zmm4, 64(%rsp)
> +        vmovups   %zmm0, 128(%rsp)
> +        xorl      %eax, %eax
> +        vzeroupper
> +        movq      %r12, 16(%rsp)
> +        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> +        movl      %eax, %r12d
> +        movq      %r13, 8(%rsp)
> +        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> +        movl      %edx, %r13d
> +        movq      %r14, (%rsp)
> +        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> +
> +/* Range mask
> + * bits check
> + */
> +
> +L(RANGEMASK_CHECK):
> +        btl       %r12d, %r13d
> +
> +/* Call scalar math function */
> +        jc        L(SCALAR_MATH_CALL)
> +
> +/* Special inputs
> + * processing loop
> + */
> +
> +L(SPECIAL_VALUES_LOOP):
> +        incl      %r12d
> +        cmpl      $16, %r12d
> +
> +/* Check bits in range mask */
> +        jl        L(RANGEMASK_CHECK)
> +        movq      16(%rsp), %r12
> +        cfi_restore(12)
> +        movq      8(%rsp), %r13
> +        cfi_restore(13)
> +        movq      (%rsp), %r14
> +        cfi_restore(14)
> +        vmovups   128(%rsp), %zmm0
> +
> +/* Go to exit */
> +        jmp       L(EXIT)
> +        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> +        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> +        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> +
> +/* Scalar math function call
> + * to process special input
> + */
> +
> +L(SCALAR_MATH_CALL):
> +        movl      %r12d, %r14d
> +        movss     64(%rsp,%r14,4), %xmm0
> +        call      acosf@PLT
> +        movss     %xmm0, 128(%rsp,%r14,4)
> +
> +/* Process special inputs in loop */
> +        jmp       L(SPECIAL_VALUES_LOOP)
> +
> +END(_ZGVeN16v_acosf_skx)
> +
> +        .section .rodata, "a"
> +        .align 64
> +
> +#ifdef __svml_sacos_data_internal_typedef
> +typedef unsigned int VUINT32;
> +typedef struct {
> +        __declspec(align(64)) VUINT32 SgnBit[16][1];
> +        __declspec(align(64)) VUINT32 OneHalf[16][1];
> +        __declspec(align(64)) VUINT32 SmallNorm[16][1];
> +        __declspec(align(64)) VUINT32 MOne[16][1];
> +        __declspec(align(64)) VUINT32 Two[16][1];
> +        __declspec(align(64)) VUINT32 sqrt_coeff[2][16][1];
> +        __declspec(align(64)) VUINT32 poly_coeff[5][16][1];
> +        __declspec(align(64)) VUINT32 Pi2H[16][1];
> +        __declspec(align(64)) VUINT32 PiH[16][1];
> +} __svml_sacos_data_internal;
> +#endif
> +__svml_sacos_data_internal:
> +        /*== SgnBit ==*/
> +        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
> +        /*== OneHalf ==*/
> +        .align 64
> +        .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
> +        /*== SmallNorm ==*/
> +        .align 64
> +        .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
> +        /*== MOne ==*/
> +        .align 64
> +        .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
> +        /*== Two ==*/
> +        .align 64
> +        .long 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000
> +        /*== sqrt_coeff[2] ==*/
> +        .align 64
> +        .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */
> +        .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */
> +        /*== poly_coeff[5] ==*/
> +        .align 64
> +        .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
> +        .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
> +        .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
> +        .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
> +        .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */
> +        /*== Pi2H ==*/
> +        .align 64
> +        .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
> +        /*== PiH ==*/
> +        .align 64
> +        .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB
> +        .align 64
> +        .type  __svml_sacos_data_internal,@object
> +        .size  __svml_sacos_data_internal,.-__svml_sacos_data_internal
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S
> new file mode 100644
> index 0000000000..f94b3eb01a
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S
> @@ -0,0 +1,20 @@
> +/* SSE2 version of vectorized acosf, vector length is 4.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define _ZGVbN4v_acosf _ZGVbN4v_acosf_sse2
> +#include "../svml_s_acosf4_core.S"
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c
> new file mode 100644
> index 0000000000..6f9a5c1082
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c
> @@ -0,0 +1,28 @@
> +/* Multiple versions of vectorized acosf, vector length is 4.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define SYMBOL_NAME _ZGVbN4v_acosf
> +#include "ifunc-mathvec-sse4_1.h"
> +
> +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
> +
> +#ifdef SHARED
> +__hidden_ver1 (_ZGVbN4v_acosf, __GI__ZGVbN4v_acosf,
> +              __redirect__ZGVbN4v_acosf)
> +  __attribute__ ((visibility ("hidden")));
> +#endif
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S
> new file mode 100644
> index 0000000000..fe0c94aeb5
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S
> @@ -0,0 +1,260 @@
> +/* Function acosf vectorized with SSE4.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   https://www.gnu.org/licenses/.  */
> +
> +/*
> + * ALGORITHM DESCRIPTION:
> + *
> + *      SelMask = (|x| >= 0.5) ? 1 : 0;
> + *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
> + *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
> + *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
> + *
> + *
> + */
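The "2*sqrt(X) ~ Sh - Sl (to 24+ bits)" comment below refers to refining
the rsqrtps estimate instead of paying for a sqrtps.  In scalar terms the
rsqrtps + sqrt_coeff correction is roughly the following (illustrative
only; the decimal coefficients are my reading of sqrt_coeff2/sqrt_coeff1
from the table and may be off in the last bits):

#include <math.h>

/* Sketch: turn a low-precision reciprocal-sqrt estimate of y into
   ~2*sqrt(y) with one polynomial correction, as the vector code does.  */
static float
two_sqrt (float y)
{
  float r0 = 1.0f / sqrtf (y);              /* stands in for rsqrtps */
  float s0 = 2.0f * y * r0;                 /* ~ 2*sqrt(y) */
  float e = 2.0f * y * r0 * r0 - 2.0f;      /* residual */
  const float c1 = -0.09375f, c2 = 0.25f;   /* ~sqrt_coeff2, ~sqrt_coeff1 */
  return s0 - s0 * e * (c2 + c1 * e);       /* ~ s0*(1 - e/4 + 3e^2/32) */
}
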
> +
> +/* Offsets for data table __svml_sacos_data_internal
> + */
> +#define SgnBit                         0
> +#define OneHalf                        16
> +#define SmallNorm                      32
> +#define MOne                           48
> +#define Two                            64
> +#define sqrt_coeff                     80
> +#define poly_coeff                     112
> +#define Pi2H                           192
> +#define PiH                            208
> +
> +#include <sysdep.h>
> +
> +        .text
> +       .section .text.sse4,"ax",@progbits
> +ENTRY(_ZGVbN4v_acosf_sse4)
> +        subq      $72, %rsp
> +        cfi_def_cfa_offset(80)
> +
> +/* X<X^2 iff X<0 */
> +        movaps    %xmm0, %xmm14
> +
> +/*
> + * 2*sqrt(X) ~ Sh - Sl  (to 24+ bits)
> + * SQ ~ 2*sqrt(X)
> + */
> +        movups    __svml_sacos_data_internal(%rip), %xmm3
> +        movups    OneHalf+__svml_sacos_data_internal(%rip), %xmm5
> +
> +/* x = -|arg| */
> +        movaps    %xmm3, %xmm4
> +        orps      %xmm0, %xmm4
> +
> +/* Y = 0.5 + 0.5*(-x) */
> +        movaps    %xmm5, %xmm6
> +        mulps     %xmm4, %xmm6
> +
> +/* x^2 */
> +        movaps    %xmm4, %xmm13
> +        mulps     %xmm4, %xmm13
> +        addps     %xmm6, %xmm5
> +
> +/* SQ ~ 2*sqrt(Y) */
> +        rsqrtps   %xmm5, %xmm8
> +        minps     %xmm5, %xmm13
> +        movaps    %xmm5, %xmm2
> +        movaps    %xmm13, %xmm1
> +        cmpltps   SmallNorm+__svml_sacos_data_internal(%rip), %xmm2
> +        cmpnltps  %xmm5, %xmm1
> +        cmpltps   %xmm13, %xmm14
> +        addps     %xmm5, %xmm5
> +        andnps    %xmm8, %xmm2
> +        movaps    %xmm13, %xmm11
> +        movaps    %xmm2, %xmm9
> +        movaps    %xmm1, %xmm6
> +        mulps     %xmm2, %xmm9
> +        andnps    %xmm4, %xmm6
> +        mulps     %xmm5, %xmm2
> +        mulps     %xmm13, %xmm11
> +        mulps     %xmm9, %xmm5
> +        movups    sqrt_coeff+__svml_sacos_data_internal(%rip), %xmm10
> +        andps     %xmm0, %xmm3
> +
> +/* polynomial */
> +        movups    poly_coeff+__svml_sacos_data_internal(%rip), %xmm12
> +        movaps    %xmm1, %xmm15
> +        mulps     %xmm13, %xmm12
> +        subps     Two+__svml_sacos_data_internal(%rip), %xmm5
> +        mulps     %xmm5, %xmm10
> +        addps     poly_coeff+16+__svml_sacos_data_internal(%rip), %xmm12
> +        mulps     %xmm2, %xmm5
> +        mulps     %xmm11, %xmm12
> +        addps     sqrt_coeff+16+__svml_sacos_data_internal(%rip), %xmm10
> +        mulps     %xmm5, %xmm10
> +        movups    poly_coeff+32+__svml_sacos_data_internal(%rip), %xmm5
> +        subps     %xmm10, %xmm2
> +        mulps     %xmm13, %xmm5
> +        movups    MOne+__svml_sacos_data_internal(%rip), %xmm7
> +        andps     %xmm1, %xmm2
> +        cmpnleps  %xmm4, %xmm7
> +        addps     poly_coeff+48+__svml_sacos_data_internal(%rip), %xmm5
> +        movmskps  %xmm7, %edx
> +        orps      %xmm2, %xmm6
> +        addps     %xmm12, %xmm5
> +        mulps     %xmm13, %xmm5
> +        pxor      %xmm3, %xmm6
> +        movups    PiH+__svml_sacos_data_internal(%rip), %xmm7
> +        andps     %xmm1, %xmm7
> +        addps     poly_coeff+64+__svml_sacos_data_internal(%rip), %xmm5
> +        mulps     %xmm13, %xmm5
> +        andps     %xmm14, %xmm7
> +        mulps     %xmm6, %xmm5
> +        andnps    Pi2H+__svml_sacos_data_internal(%rip), %xmm15
> +        addps     %xmm5, %xmm6
> +        addps     %xmm15, %xmm7
> +        addps     %xmm6, %xmm7
> +        testl     %edx, %edx
> +
> +/* Go to special inputs processing branch */
> +        jne       L(SPECIAL_VALUES_BRANCH)
> +
> +/* Restore registers
> + * and exit the function
> + */
> +
> +L(EXIT):
> +        movaps    %xmm7, %xmm0
> +        addq      $72, %rsp
> +        cfi_def_cfa_offset(8)
> +        ret
> +        cfi_def_cfa_offset(80)
> +
> +/* Branch to process
> + * special inputs
> + */
> +
> +L(SPECIAL_VALUES_BRANCH):
> +        movups    %xmm0, 32(%rsp)
> +        movups    %xmm7, 48(%rsp)
> +        xorl      %eax, %eax
> +        movq      %r12, 16(%rsp)
> +        cfi_offset(12, -64)
> +        movl      %eax, %r12d
> +        movq      %r13, 8(%rsp)
> +        cfi_offset(13, -72)
> +        movl      %edx, %r13d
> +        movq      %r14, (%rsp)
> +        cfi_offset(14, -80)
> +
> +/* Range mask
> + * bits check
> + */
> +
> +L(RANGEMASK_CHECK):
> +        btl       %r12d, %r13d
> +
> +/* Call scalar math function */
> +        jc        L(SCALAR_MATH_CALL)
> +
> +/* Special inputs
> + * processing loop
> + */
> +
> +L(SPECIAL_VALUES_LOOP):
> +        incl      %r12d
> +        cmpl      $4, %r12d
> +
> +/* Check bits in range mask */
> +        jl        L(RANGEMASK_CHECK)
> +        movq      16(%rsp), %r12
> +        cfi_restore(12)
> +        movq      8(%rsp), %r13
> +        cfi_restore(13)
> +        movq      (%rsp), %r14
> +        cfi_restore(14)
> +        movups    48(%rsp), %xmm7
> +
> +/* Go to exit */
> +        jmp       L(EXIT)
> +        cfi_offset(12, -64)
> +        cfi_offset(13, -72)
> +        cfi_offset(14, -80)
> +
> +/* Scalar math function call
> + * to process special input
> + */
> +
> +L(SCALAR_MATH_CALL):
> +        movl      %r12d, %r14d
> +        movss     32(%rsp,%r14,4), %xmm0
> +        call      acosf@PLT
> +        movss     %xmm0, 48(%rsp,%r14,4)
> +
> +/* Process special inputs in loop */
> +        jmp       L(SPECIAL_VALUES_LOOP)
> +
> +END(_ZGVbN4v_acosf_sse4)
> +
> +        .section .rodata, "a"
> +        .align 16
> +
> +#ifdef __svml_sacos_data_internal_typedef
> +typedef unsigned int VUINT32;
> +typedef struct {
> +        __declspec(align(16)) VUINT32 SgnBit[4][1];
> +        __declspec(align(16)) VUINT32 OneHalf[4][1];
> +        __declspec(align(16)) VUINT32 SmallNorm[4][1];
> +        __declspec(align(16)) VUINT32 MOne[4][1];
> +        __declspec(align(16)) VUINT32 Two[4][1];
> +        __declspec(align(16)) VUINT32 sqrt_coeff[2][4][1];
> +        __declspec(align(16)) VUINT32 poly_coeff[5][4][1];
> +        __declspec(align(16)) VUINT32 Pi2H[4][1];
> +        __declspec(align(16)) VUINT32 PiH[4][1];
> +} __svml_sacos_data_internal;
> +#endif
> +__svml_sacos_data_internal:
> +        /*== SgnBit ==*/
> +        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
> +        /*== OneHalf ==*/
> +        .align 16
> +        .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
> +        /*== SmallNorm ==*/
> +        .align 16
> +        .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
> +        /*== MOne ==*/
> +        .align 16
> +        .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
> +        /*== Two ==*/
> +        .align 16
> +        .long 0x40000000, 0x40000000, 0x40000000, 0x40000000
> +        /*== sqrt_coeff[2] ==*/
> +        .align 16
> +        .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */
> +        .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */
> +        /*== poly_coeff[5] ==*/
> +        .align 16
> +        .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
> +        .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
> +        .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
> +        .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
> +        .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */
> +        /*== Pi2H ==*/
> +        .align 16
> +        .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
> +        /*== PiH ==*/
> +        .align 16
> +        .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB
> +        .align 16
> +        .type  __svml_sacos_data_internal,@object
> +        .size  __svml_sacos_data_internal,.-__svml_sacos_data_internal
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S
> new file mode 100644
> index 0000000000..583ef54fee
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S
> @@ -0,0 +1,20 @@
> +/* SSE version of vectorized acosf, vector length is 8.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define _ZGVdN8v_acosf _ZGVdN8v_acosf_sse_wrapper
> +#include "../svml_s_acosf8_core.S"
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c
> new file mode 100644
> index 0000000000..dd360a9479
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c
> @@ -0,0 +1,28 @@
> +/* Multiple versions of vectorized acosf, vector length is 8.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define SYMBOL_NAME _ZGVdN8v_acosf
> +#include "ifunc-mathvec-avx2.h"
> +
> +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
> +
> +#ifdef SHARED
> +__hidden_ver1 (_ZGVdN8v_acosf, __GI__ZGVdN8v_acosf,
> +              __redirect__ZGVdN8v_acosf)
> +  __attribute__ ((visibility ("hidden")));
> +#endif
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
> new file mode 100644
> index 0000000000..2b6dd2c2c2
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
> @@ -0,0 +1,252 @@
> +/* Function acosf vectorized with AVX2.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   https://www.gnu.org/licenses/.  */
> +
> +/*
> + * ALGORITHM DESCRIPTION:
> + *
> + *      SelMask = (|x| >= 0.5) ? 1 : 0;
> + *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
> + *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
> + *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
> + *
> + *
> + */
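Minor reading note: with no mask registers available here, the SelMask
ternaries above are realized with vblendvps/vandps/vandnps on all-ones /
all-zeros compare results.  In scalar C the bitwise select is simply
(illustrative only, not from the patch):

#include <stdint.h>

/* Per lane, m is all-ones or all-zeros, so m ? a : b becomes a
   branchless (m & a) | (~m & b).  */
static inline uint32_t
bitwise_select (uint32_t m, uint32_t a, uint32_t b)
{
  return (m & a) | (~m & b);
}
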
> +
> +/* Offsets for data table __svml_sacos_data_internal
> + */
> +#define SgnBit                         0
> +#define OneHalf                        32
> +#define SmallNorm                      64
> +#define MOne                           96
> +#define Two                            128
> +#define sqrt_coeff                     160
> +#define poly_coeff                     224
> +#define Pi2H                           384
> +#define PiH                            416
> +
> +#include <sysdep.h>
> +
> +        .text
> +       .section .text.avx2,"ax",@progbits
> +ENTRY(_ZGVdN8v_acosf_avx2)
> +        pushq     %rbp
> +        cfi_def_cfa_offset(16)
> +        movq      %rsp, %rbp
> +        cfi_def_cfa(6, 16)
> +        cfi_offset(6, -16)
> +        andq      $-32, %rsp
> +        subq      $96, %rsp
> +
> +/*
> + * 2*sqrt(X) ~ Sh - Sl  (to 24+ bits)
> + * SQ ~ 2*sqrt(X)
> + */
> +        vmovups   __svml_sacos_data_internal(%rip), %ymm6
> +        vmovups   OneHalf+__svml_sacos_data_internal(%rip), %ymm7
> +        vmovaps   %ymm0, %ymm5
> +
> +/* x = -|arg| */
> +        vorps     %ymm5, %ymm6, %ymm4
> +
> +/* Y = 0.5 + 0.5*(-x) */
> +        vfmadd231ps %ymm4, %ymm7, %ymm7
> +
> +/* x^2 */
> +        vmulps    %ymm4, %ymm4, %ymm8
> +
> +/* SQ ~ 2*sqrt(Y) */
> +        vmovups   sqrt_coeff+__svml_sacos_data_internal(%rip), %ymm0
> +        vcmpnge_uqps MOne+__svml_sacos_data_internal(%rip), %ymm4, %ymm9
> +        vcmplt_oqps SmallNorm+__svml_sacos_data_internal(%rip), %ymm7, %ymm10
> +        vminps    %ymm7, %ymm8, %ymm2
> +        vaddps    %ymm7, %ymm7, %ymm14
> +        vrsqrtps  %ymm7, %ymm11
> +        vmovups   poly_coeff+64+__svml_sacos_data_internal(%rip), %ymm8
> +        vcmpnlt_uqps %ymm7, %ymm2, %ymm1
> +        vmulps    %ymm2, %ymm2, %ymm7
> +        vfmadd213ps poly_coeff+96+__svml_sacos_data_internal(%rip), %ymm2, %ymm8
> +        vmovmskps %ymm9, %edx
> +
> +/* polynomial */
> +        vmovups   poly_coeff+__svml_sacos_data_internal(%rip), %ymm9
> +        vandnps   %ymm11, %ymm10, %ymm12
> +        vmulps    %ymm12, %ymm12, %ymm13
> +        vfmadd213ps poly_coeff+32+__svml_sacos_data_internal(%rip), %ymm2, %ymm9
> +
> +/* X<X^2 iff X<0 */
> +        vcmplt_oqps %ymm2, %ymm5, %ymm10
> +        vfmadd213ps %ymm8, %ymm7, %ymm9
> +        vandps    %ymm5, %ymm6, %ymm3
> +        vmulps    %ymm14, %ymm12, %ymm6
> +        vfmsub213ps Two+__svml_sacos_data_internal(%rip), %ymm13, %ymm14
> +        vfmadd213ps poly_coeff+128+__svml_sacos_data_internal(%rip), %ymm2, %ymm9
> +        vfmadd213ps sqrt_coeff+32+__svml_sacos_data_internal(%rip), %ymm14, %ymm0
> +        vmulps    %ymm14, %ymm6, %ymm15
> +        vmulps    %ymm9, %ymm2, %ymm14
> +        vfnmadd213ps %ymm6, %ymm15, %ymm0
> +        vblendvps %ymm1, %ymm0, %ymm4, %ymm0
> +        vandps    PiH+__svml_sacos_data_internal(%rip), %ymm1, %ymm2
> +        vandnps   Pi2H+__svml_sacos_data_internal(%rip), %ymm1, %ymm12
> +        vxorps    %ymm3, %ymm0, %ymm1
> +        vfmadd213ps %ymm1, %ymm1, %ymm14
> +        vandps    %ymm10, %ymm2, %ymm11
> +        vaddps    %ymm12, %ymm11, %ymm13
> +        vaddps    %ymm14, %ymm13, %ymm0
> +        testl     %edx, %edx
> +
> +/* Go to special inputs processing branch */
> +        jne       L(SPECIAL_VALUES_BRANCH)
> +
> +/* Restore registers
> + * and exit the function
> + */
> +
> +L(EXIT):
> +        movq      %rbp, %rsp
> +        popq      %rbp
> +        cfi_def_cfa(7, 8)
> +        cfi_restore(6)
> +        ret
> +        cfi_def_cfa(6, 16)
> +        cfi_offset(6, -16)
> +
> +/* Branch to process
> + * special inputs
> + */
> +
> +L(SPECIAL_VALUES_BRANCH):
> +        vmovups   %ymm5, 32(%rsp)
> +        vmovups   %ymm0, 64(%rsp)
> +        xorl      %eax, %eax
> +        vzeroupper
> +        movq      %r12, 16(%rsp)
> +        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
> +        movl      %eax, %r12d
> +        movq      %r13, 8(%rsp)
> +        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
> +        movl      %edx, %r13d
> +        movq      %r14, (%rsp)
> +        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
> +
> +/* Range mask
> + * bits check
> + */
> +
> +L(RANGEMASK_CHECK):
> +        btl       %r12d, %r13d
> +
> +/* Call scalar math function */
> +        jc        L(SCALAR_MATH_CALL)
> +
> +/* Special inputs
> + * processing loop
> + */
> +
> +L(SPECIAL_VALUES_LOOP):
> +        incl      %r12d
> +        cmpl      $8, %r12d
> +
> +/* Check bits in range mask */
> +        jl        L(RANGEMASK_CHECK)
> +        movq      16(%rsp), %r12
> +        cfi_restore(12)
> +        movq      8(%rsp), %r13
> +        cfi_restore(13)
> +        movq      (%rsp), %r14
> +        cfi_restore(14)
> +        vmovups   64(%rsp), %ymm0
> +
> +/* Go to exit */
> +        jmp       L(EXIT)
> +        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
> +        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
> +        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
> +        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
> +
> +/* Scalar math function call
> + * to process special input
> + */
> +
> +L(SCALAR_MATH_CALL):
> +        movl      %r12d, %r14d
> +        movss     32(%rsp,%r14,4), %xmm0
> +        call      acosf@PLT
> +        movss     %xmm0, 64(%rsp,%r14,4)
> +
> +/* Process special inputs in loop */
> +        jmp       L(SPECIAL_VALUES_LOOP)
> +
> +END(_ZGVdN8v_acosf_avx2)
> +
> +        .section .rodata, "a"
> +        .align 32
> +
> +#ifdef __svml_sacos_data_internal_typedef
> +typedef unsigned int VUINT32;
> +typedef struct {
> +        __declspec(align(32)) VUINT32 SgnBit[8][1];
> +        __declspec(align(32)) VUINT32 OneHalf[8][1];
> +        __declspec(align(32)) VUINT32 SmallNorm[8][1];
> +        __declspec(align(32)) VUINT32 MOne[8][1];
> +        __declspec(align(32)) VUINT32 Two[8][1];
> +        __declspec(align(32)) VUINT32 sqrt_coeff[2][8][1];
> +        __declspec(align(32)) VUINT32 poly_coeff[5][8][1];
> +        __declspec(align(32)) VUINT32 Pi2H[8][1];
> +        __declspec(align(32)) VUINT32 PiH[8][1];
> +} __svml_sacos_data_internal;
> +#endif
> +__svml_sacos_data_internal:
> +        /*== SgnBit ==*/
> +        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
> +        /*== OneHalf ==*/
> +        .align 32
> +        .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
> +        /*== SmallNorm ==*/
> +        .align 32
> +        .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
> +        /*== MOne ==*/
> +        .align 32
> +        .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
> +        /*== Two ==*/
> +        .align 32
> +        .long 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000
> +        /*== sqrt_coeff[2] ==*/
> +        .align 32
> +        .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */
> +        .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */
> +        /*== poly_coeff[5] ==*/
> +        .align 32
> +        .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
> +        .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
> +        .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
> +        .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
> +        .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */
> +        /*== Pi2H ==*/
> +        .align 32
> +        .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
> +        /*== PiH ==*/
> +        .align 32
> +        .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB
> +        .align 32
> +        .type  __svml_sacos_data_internal,@object
> +        .size  __svml_sacos_data_internal,.-__svml_sacos_data_internal
> diff --git a/sysdeps/x86_64/fpu/svml_d_acos2_core.S b/sysdeps/x86_64/fpu/svml_d_acos2_core.S
> new file mode 100644
> index 0000000000..9656478b2d
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_d_acos2_core.S
> @@ -0,0 +1,29 @@
> +/* Function acos vectorized with SSE2.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +#include "svml_d_wrapper_impl.h"
> +
> +       .text
> +ENTRY (_ZGVbN2v_acos)
> +WRAPPER_IMPL_SSE2 acos
> +END (_ZGVbN2v_acos)
> +
> +#ifndef USE_MULTIARCH
> + libmvec_hidden_def (_ZGVbN2v_acos)
> +#endif
> diff --git a/sysdeps/x86_64/fpu/svml_d_acos4_core.S b/sysdeps/x86_64/fpu/svml_d_acos4_core.S
> new file mode 100644
> index 0000000000..e99cb4ae78
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_d_acos4_core.S
> @@ -0,0 +1,29 @@
> +/* Function acos vectorized with AVX2, wrapper version.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +#include "svml_d_wrapper_impl.h"
> +
> +       .text
> +ENTRY (_ZGVdN4v_acos)
> +WRAPPER_IMPL_AVX _ZGVbN2v_acos
> +END (_ZGVdN4v_acos)
> +
> +#ifndef USE_MULTIARCH
> + libmvec_hidden_def (_ZGVdN4v_acos)
> +#endif
> diff --git a/sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S b/sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S
> new file mode 100644
> index 0000000000..7cbcbc965c
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S
> @@ -0,0 +1,25 @@
> +/* Function acos vectorized in AVX ISA as wrapper to SSE4 ISA version.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +#include "svml_d_wrapper_impl.h"
> +
> +       .text
> +ENTRY (_ZGVcN4v_acos)
> +WRAPPER_IMPL_AVX _ZGVbN2v_acos
> +END (_ZGVcN4v_acos)
> diff --git a/sysdeps/x86_64/fpu/svml_d_acos8_core.S b/sysdeps/x86_64/fpu/svml_d_acos8_core.S
> new file mode 100644
> index 0000000000..e26b30d81a
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_d_acos8_core.S
> @@ -0,0 +1,25 @@
> +/* Function acos vectorized with AVX-512, wrapper to AVX2.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +#include "svml_d_wrapper_impl.h"
> +
> +       .text
> +ENTRY (_ZGVeN8v_acos)
> +WRAPPER_IMPL_AVX512 _ZGVdN4v_acos
> +END (_ZGVeN8v_acos)
> diff --git a/sysdeps/x86_64/fpu/svml_s_acosf16_core.S b/sysdeps/x86_64/fpu/svml_s_acosf16_core.S
> new file mode 100644
> index 0000000000..70e046d492
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_s_acosf16_core.S
> @@ -0,0 +1,25 @@
> +/* Function acosf vectorized with AVX-512. Wrapper to AVX2 version.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +#include "svml_s_wrapper_impl.h"
> +
> +       .text
> +ENTRY (_ZGVeN16v_acosf)
> +WRAPPER_IMPL_AVX512 _ZGVdN8v_acosf
> +END (_ZGVeN16v_acosf)
> diff --git a/sysdeps/x86_64/fpu/svml_s_acosf4_core.S b/sysdeps/x86_64/fpu/svml_s_acosf4_core.S
> new file mode 100644
> index 0000000000..36354b32b5
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_s_acosf4_core.S
> @@ -0,0 +1,29 @@
> +/* Function acosf vectorized with SSE2, wrapper version.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +#include "svml_s_wrapper_impl.h"
> +
> +       .text
> +ENTRY (_ZGVbN4v_acosf)
> +WRAPPER_IMPL_SSE2 acosf
> +END (_ZGVbN4v_acosf)
> +
> +#ifndef USE_MULTIARCH
> + libmvec_hidden_def (_ZGVbN4v_acosf)
> +#endif
> diff --git a/sysdeps/x86_64/fpu/svml_s_acosf8_core.S b/sysdeps/x86_64/fpu/svml_s_acosf8_core.S
> new file mode 100644
> index 0000000000..f08864a511
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_s_acosf8_core.S
> @@ -0,0 +1,29 @@
> +/* Function acosf vectorized with AVX2, wrapper version.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +#include "svml_s_wrapper_impl.h"
> +
> +       .text
> +ENTRY (_ZGVdN8v_acosf)
> +WRAPPER_IMPL_AVX _ZGVbN4v_acosf
> +END (_ZGVdN8v_acosf)
> +
> +#ifndef USE_MULTIARCH
> + libmvec_hidden_def (_ZGVdN8v_acosf)
> +#endif
> diff --git a/sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S b/sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S
> new file mode 100644
> index 0000000000..f3ed4d8e78
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S
> @@ -0,0 +1,25 @@
> +/* Function acosf vectorized in AVX ISA as wrapper to SSE4 ISA version.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +#include "svml_s_wrapper_impl.h"
> +
> +        .text
> +ENTRY (_ZGVcN8v_acosf)
> +WRAPPER_IMPL_AVX _ZGVbN4v_acosf
> +END (_ZGVcN8v_acosf)
> diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c
> new file mode 100644
> index 0000000000..4f74b4260a
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c
> @@ -0,0 +1 @@
> +#include "test-double-libmvec-acos.c"
> diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c
> new file mode 100644
> index 0000000000..4f74b4260a
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c
> @@ -0,0 +1 @@
> +#include "test-double-libmvec-acos.c"
> diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c
> new file mode 100644
> index 0000000000..4f74b4260a
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c
> @@ -0,0 +1 @@
> +#include "test-double-libmvec-acos.c"
> diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos.c
> new file mode 100644
> index 0000000000..e38b8ce821
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos.c
> @@ -0,0 +1,3 @@
> +#define LIBMVEC_TYPE double
> +#define LIBMVEC_FUNC acos
> +#include "test-vector-abi-arg1.h"
> diff --git a/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c
> index ed932fc98d..0abc7d2021 100644
> --- a/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c
> @@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVbN2v_sin)
>  VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVbN2v_log)
>  VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVbN2v_exp)
>  VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVbN2vv_pow)
> +VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVbN2v_acos)
>
>  #define VEC_INT_TYPE __m128i
>
> diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c
> index 3a6e37044f..dda093b914 100644
> --- a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c
> @@ -30,6 +30,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVdN4v_sin)
>  VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVdN4v_log)
>  VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVdN4v_exp)
>  VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVdN4vv_pow)
> +VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVdN4v_acos)
>
>  #ifndef __ILP32__
>  # define VEC_INT_TYPE __m256i
> diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c
> index 99db4e7616..f3230463bb 100644
> --- a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c
> @@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVcN4v_sin)
>  VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVcN4v_log)
>  VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVcN4v_exp)
>  VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVcN4vv_pow)
> +VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVcN4v_acos)
>
>  #define VEC_INT_TYPE __m128i
>
> diff --git a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c
> index 251d429ac0..cf9f52faf0 100644
> --- a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c
> @@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVeN8v_sin)
>  VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVeN8v_log)
>  VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVeN8v_exp)
>  VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVeN8vv_pow)
> +VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVeN8v_acos)
>
>  #ifndef __ILP32__
>  # define VEC_INT_TYPE __m512i
> diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c
> new file mode 100644
> index 0000000000..1e6474dfa2
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c
> @@ -0,0 +1 @@
> +#include "test-float-libmvec-acosf.c"
> diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c
> new file mode 100644
> index 0000000000..1e6474dfa2
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c
> @@ -0,0 +1 @@
> +#include "test-float-libmvec-acosf.c"
> diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c
> new file mode 100644
> index 0000000000..1e6474dfa2
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c
> @@ -0,0 +1 @@
> +#include "test-float-libmvec-acosf.c"
> diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf.c
> new file mode 100644
> index 0000000000..fb47f974fd
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf.c
> @@ -0,0 +1,3 @@
> +#define LIBMVEC_TYPE float
> +#define LIBMVEC_FUNC acosf
> +#include "test-vector-abi-arg1.h"
> diff --git a/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
> index c1d14cd79e..abbd3ed870 100644
> --- a/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
> @@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVeN16v_sinf)
>  VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVeN16v_logf)
>  VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVeN16v_expf)
>  VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVeN16vv_powf)
> +VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVeN16v_acosf)
>
>  #define VEC_INT_TYPE __m512i
>
> diff --git a/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c
> index d23c372060..8a24027952 100644
> --- a/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c
> @@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVbN4v_sinf)
>  VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVbN4v_logf)
>  VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVbN4v_expf)
>  VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVbN4vv_powf)
> +VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVbN4v_acosf)
>
>  #define VEC_INT_TYPE __m128i
>
> diff --git a/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
> index 3152cffb0c..aff0442606 100644
> --- a/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
> @@ -30,6 +30,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVdN8v_sinf)
>  VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVdN8v_logf)
>  VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVdN8v_expf)
>  VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVdN8vv_powf)
> +VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVdN8v_acosf)
>
>  /* Redefinition of wrapper to be compatible with _ZGVdN8vvv_sincosf.  */
>  #undef VECTOR_WRAPPER_fFF
> diff --git a/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
> index a8492abfef..913584d111 100644
> --- a/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
> +++ b/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
> @@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVcN8v_sinf)
>  VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVcN8v_logf)
>  VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVcN8v_expf)
>  VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVcN8vv_powf)
> +VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVcN8v_acosf)
>
>  #define VEC_INT_TYPE __m128i

>
> --
> 2.31.1
>
H.J. Lu Dec. 19, 2021, 8:26 p.m. UTC | #2
On Sun, Dec 19, 2021 at 12:29:07PM -0600, GNU C Library wrote:
> On Sun, Dec 19, 2021 at 11:19 AM Sunil K Pandey via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > Implement vectorized acos/acosf containing SSE, AVX, AVX2 and
> > AVX512 versions for libmvec as per vector ABI.  It also contains
> > accuracy and ABI tests for vector acos/acosf with regenerated ulps.
> > ---
> 
> Have a few small comments but generally okay with a patch like this
> one going out in
> 2.35.

...

> 
> > +#define poly_coeff_6                   896
> > +#define poly_coeff_7                   960
> > +#define poly_coeff_8                   1024
> > +#define poly_coeff_9                   1088
> > +#define poly_coeff_10                  1152
> > +#define poly_coeff_11                  1216
> > +#define poly_coeff_12                  1280
> > +#define PiH                            1344
> > +#define Pi2H                           1408
> 
> There is enough memory here it may pay to make the accesses

Did you have enough registers?

> sequential in memory.

This is based on Intel compiler-generated code.  We will evaluate
Intel compiler changes.

...

> > +
> > +#include <sysdep.h>
> > +        vfmadd231pd {rn-sae}, %zmm3, %zmm11, %zmm10
> > +        andl      %eax, %ecx
> drop I think
> 
> > +        vmovups   poly_coeff_12+__svml_dacos_data_internal(%rip), %zmm11
> > +        kmovw     %ecx, %k3
> kandw %k4, %k2, %k3

This may not be faster since mask-register operations can only go to
port 0.  We will evaluate register allocation in the Intel compiler.


Thanks.

H.J.
Noah Goldstein Dec. 19, 2021, 8:42 p.m. UTC | #3
On Sun, Dec 19, 2021 at 2:26 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Sun, Dec 19, 2021 at 12:29:07PM -0600, GNU C Library wrote:
> > On Sun, Dec 19, 2021 at 11:19 AM Sunil K Pandey via Libc-alpha
> > <libc-alpha@sourceware.org> wrote:
> > >
> > > Implement vectorized acos/acosf containing SSE, AVX, AVX2 and
> > > AVX512 versions for libmvec as per vector ABI.  It also contains
> > > accuracy and ABI tests for vector acos/acosf with regenerated ulps.
> > > ---
> >
> > Have a few small comments but generally okay with a patch like this
> > one going out in
> > 2.35.
>
> ...
>
> >
> > > +#define poly_coeff_6                   896
> > > +#define poly_coeff_7                   960
> > > +#define poly_coeff_8                   1024
> > > +#define poly_coeff_9                   1088
> > > +#define poly_coeff_10                  1152
> > > +#define poly_coeff_11                  1216
> > > +#define poly_coeff_12                  1280
> > > +#define PiH                            1344
> > > +#define Pi2H                           1408
> >
> > There is enough memory here it may pay to make the accesses
>
> Did you enough registers?

This shouldn't affect register allocation.
It's just that if the program accesses poly_coeff_11 -> poly_coeff_6
-> poly_coeff_8,

it might be beneficial to organize the addresses of 11/6/8 so that the
accesses from the table are sequential in memory, i.e.
#define poly_coeff_11 896
#define poly_coeff_6 960
#define poly_coeff_8 1024
...

Random example and just a thought. I figure that if the code comes in
cold it might save a cache miss or two, because sequential accesses are
an easy-to-recognize pattern for the HW prefetcher. I don't think it's
make or break.
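
Just to make the intent concrete (register numbers below are only
illustrative, and the .quad rows of __svml_dacos_data_internal would
have to move along with the offsets so each name keeps its own
constants), the goal is that the loads end up looking like:

        vmovups   poly_coeff_11+__svml_dacos_data_internal(%rip), %zmm11
        vmovups   poly_coeff_6+__svml_dacos_data_internal(%rip), %zmm12
        vmovups   poly_coeff_8+__svml_dacos_data_internal(%rip), %zmm13

i.e. byte offsets 896, 960, 1024, a simple forward 64-byte stride that
the HW prefetcher can recognize.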

>
> > sequential in memory.
>
> This is based on Intel compiler generated codes.  We will evaluate
> Intel compiler changes.
>
> ...
>
> > > +
> > > +#include <sysdep.h>
> > > +        vfmadd231pd {rn-sae}, %zmm3, %zmm11, %zmm10
> > > +        andl      %eax, %ecx
> > drop I think
> >
> > > +        vmovups   poly_coeff_12+__svml_dacos_data_internal(%rip), %zmm11
> > > +        kmovw     %ecx, %k3
> > kandw %k4, %k2, %k3
>
> This may not be faster since mask register can only go to port 0.  We
> will evaluate register allocation in Intel compiler.

`kmovw` and `kandw` are both 1 uop on port 0.

`andl` + `kmovw` is 2 uops with 4c latency,
vs.
`kandw`, which is 1 uop with 1c latency.
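
Concretely, assuming %eax and %ecx were themselves produced from %k4 and
%k2 by earlier kmovw's (I'm going off the posted asm, so take the
register numbers as illustrative), the change is:

        /* current: mask -> GPR, and in GPRs, GPR -> mask */
        kmovw     %k4, %eax
        kmovw     %k2, %ecx
        andl      %eax, %ecx
        kmovw     %ecx, %k3

        /* suggested: stay entirely in mask registers */
        kandw     %k4, %k2, %k3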
>
>
> Thanks.
>
> H.J.
Sunil Pandey Dec. 20, 2021, 4:08 p.m. UTC | #4
On Sun, Dec 19, 2021 at 12:42 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Sun, Dec 19, 2021 at 2:26 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Sun, Dec 19, 2021 at 12:29:07PM -0600, GNU C Library wrote:
> > > On Sun, Dec 19, 2021 at 11:19 AM Sunil K Pandey via Libc-alpha
> > > <libc-alpha@sourceware.org> wrote:
> > > >
> > > > Implement vectorized acos/acosf containing SSE, AVX, AVX2 and
> > > > AVX512 versions for libmvec as per vector ABI.  It also contains
> > > > accuracy and ABI tests for vector acos/acosf with regenerated ulps.
> > > > ---
> > >
> > > Have a few small comments but generally okay with a patch like this
> > > one going out in
> > > 2.35.
> >
> > ...
> >
> > >
> > > > +#define poly_coeff_6                   896
> > > > +#define poly_coeff_7                   960
> > > > +#define poly_coeff_8                   1024
> > > > +#define poly_coeff_9                   1088
> > > > +#define poly_coeff_10                  1152
> > > > +#define poly_coeff_11                  1216
> > > > +#define poly_coeff_12                  1280
> > > > +#define PiH                            1344
> > > > +#define Pi2H                           1408
> > >
> > > There is enough memory here it may pay to make the accesses
> >
> > Did you enough registers?
>
> This shouldn't affect register allocation.
> It's just if in the program we access: poly_coeff_11 -> poly_coeff_6
> -> poly_coeff_8
>
> it might be beneficial to organize the addresses of 11/6/8 s.t its
> sequential memory
> accesses from the table i.e
> #define poly_coeff_11 896
> #define poly_coeff_6 960
> #define poly_coeff_8 1024
> ...
>
> Random example and just a thought. Figure if coming in cold it might
> save a cache miss or two because it has an easy to recognize pattern
> for the HW prefetcher. Don't think it's make or break.
>

Good suggestion. It's difficult to hand-modify. We will let the compiler
team know about this optimization.

> >
> > > sequential in memory.
> >
> > This is based on Intel compiler generated codes.  We will evaluate
> > Intel compiler changes.
> >
> > ...
> >
> > > > +
> > > > +#include <sysdep.h>
> > > > +        vfmadd231pd {rn-sae}, %zmm3, %zmm11, %zmm10
> > > > +        andl      %eax, %ecx
> > > drop I think
> > >
> > > > +        vmovups   poly_coeff_12+__svml_dacos_data_internal(%rip), %zmm11
> > > > +        kmovw     %ecx, %k3
> > > kandw %k4, %k2, %k3
> >
> > This may not be faster since mask register can only go to port 0.  We
> > will evaluate register allocation in Intel compiler.
>
> `kmovw` and `kandw` are both 1uop port0.
>
> `andl` + `kmovw` is 2 uops and has 4c latency
> vs
> `kandw` is 1 uop and 1c latency.

Will be fixed in v6.

> >
> >
> > Thanks.
> >
> > H.J.
Noah Goldstein Dec. 20, 2021, 7:20 p.m. UTC | #5
On Mon, Dec 20, 2021 at 10:08 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> On Sun, Dec 19, 2021 at 12:42 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Sun, Dec 19, 2021 at 2:26 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Sun, Dec 19, 2021 at 12:29:07PM -0600, GNU C Library wrote:
> > > > On Sun, Dec 19, 2021 at 11:19 AM Sunil K Pandey via Libc-alpha
> > > > <libc-alpha@sourceware.org> wrote:
> > > > >
> > > > > Implement vectorized acos/acosf containing SSE, AVX, AVX2 and
> > > > > AVX512 versions for libmvec as per vector ABI.  It also contains
> > > > > accuracy and ABI tests for vector acos/acosf with regenerated ulps.
> > > > > ---
> > > >
> > > > Have a few small comments but generally okay with a patch like this
> > > > one going out in
> > > > 2.35.
> > >
> > > ...
> > >
> > > >
> > > > > +#define poly_coeff_6                   896
> > > > > +#define poly_coeff_7                   960
> > > > > +#define poly_coeff_8                   1024
> > > > > +#define poly_coeff_9                   1088
> > > > > +#define poly_coeff_10                  1152
> > > > > +#define poly_coeff_11                  1216
> > > > > +#define poly_coeff_12                  1280
> > > > > +#define PiH                            1344
> > > > > +#define Pi2H                           1408
> > > >
> > > > There is enough memory here it may pay to make the accesses
> > >
> > > Did you enough registers?
> >
> > This shouldn't affect register allocation.
> > It's just if in the program we access: poly_coeff_11 -> poly_coeff_6
> > -> poly_coeff_8
> >
> > it might be beneficial to organize the addresses of 11/6/8 s.t its
> > sequential memory
> > accesses from the table i.e
> > #define poly_coeff_11 896
> > #define poly_coeff_6 960
> > #define poly_coeff_8 1024
> > ...
> >
> > Random example and just a thought. Figure if coming in cold it might
> > save a cache miss or two because it has an easy to recognize pattern
> > for the HW prefetcher. Don't think it's make or break.
> >
>
> Good suggestion. It's difficult to hand modify. Will let compiler team know
> about this optimization.

Like I said, I can live with or without this optimization in the first
version (mostly because I think it's unclear what the actual best scheme
is), but this patch is being submitted as asm and meant to be maintained
as asm. If the only feasible way to make future changes/optimizations is
to update the compiler and recompile some higher-level language, that's
an issue.

>
> > >
> > > > sequential in memory.
> > >
> > > This is based on Intel compiler generated codes.  We will evaluate
> > > Intel compiler changes.
> > >
> > > ...
> > >
> > > > > +
> > > > > +#include <sysdep.h>
> > > > > +        vfmadd231pd {rn-sae}, %zmm3, %zmm11, %zmm10
> > > > > +        andl      %eax, %ecx
> > > > drop I think
> > > >
> > > > > +        vmovups   poly_coeff_12+__svml_dacos_data_internal(%rip), %zmm11
> > > > > +        kmovw     %ecx, %k3
> > > > kandw %k4, %k2, %k3
> > >
> > > This may not be faster since mask register can only go to port 0.  We
> > > will evaluate register allocation in Intel compiler.
> >
> > `kmovw` and `kandw` are both 1uop port0.
> >
> > `andl` + `kmovw` is 2 uops and has 4c latency
> > vs
> > `kandw` is 1 uop and 1c latency.
>
> Will be fixed in v6.
>
> > >
> > >
> > > Thanks.
> > >
> > > H.J.
Noah Goldstein Dec. 20, 2021, 7:36 p.m. UTC | #6
On Mon, Dec 20, 2021 at 1:20 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Mon, Dec 20, 2021 at 10:08 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >
> > On Sun, Dec 19, 2021 at 12:42 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > On Sun, Dec 19, 2021 at 2:26 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Sun, Dec 19, 2021 at 12:29:07PM -0600, GNU C Library wrote:
> > > > > On Sun, Dec 19, 2021 at 11:19 AM Sunil K Pandey via Libc-alpha
> > > > > <libc-alpha@sourceware.org> wrote:
> > > > > >
> > > > > > Implement vectorized acos/acosf containing SSE, AVX, AVX2 and
> > > > > > AVX512 versions for libmvec as per vector ABI.  It also contains
> > > > > > accuracy and ABI tests for vector acos/acosf with regenerated ulps.
> > > > > > ---
> > > > >
> > > > > Have a few small comments but generally okay with a patch like this
> > > > > one going out in
> > > > > 2.35.
> > > >
> > > > ...
> > > >
> > > > >
> > > > > > +#define poly_coeff_6                   896
> > > > > > +#define poly_coeff_7                   960
> > > > > > +#define poly_coeff_8                   1024
> > > > > > +#define poly_coeff_9                   1088
> > > > > > +#define poly_coeff_10                  1152
> > > > > > +#define poly_coeff_11                  1216
> > > > > > +#define poly_coeff_12                  1280
> > > > > > +#define PiH                            1344
> > > > > > +#define Pi2H                           1408
> > > > >
> > > > > There is enough memory here it may pay to make the accesses
> > > >
> > > > Did you enough registers?
> > >
> > > This shouldn't affect register allocation.
> > > It's just if in the program we access: poly_coeff_11 -> poly_coeff_6
> > > -> poly_coeff_8
> > >
> > > it might be beneficial to organize the addresses of 11/6/8 s.t its
> > > sequential memory
> > > accesses from the table i.e
> > > #define poly_coeff_11 896
> > > #define poly_coeff_6 960
> > > #define poly_coeff_8 1024
> > > ...
> > >
> > > Random example and just a thought. Figure if coming in cold it might
> > > save a cache miss or two because it has an easy to recognize pattern
> > > for the HW prefetcher. Don't think it's make or break.
> > >
> >
> > Good suggestion. It's difficult to hand modify. Will let compiler team know
> > about this optimization.
>
> Like I said, can live with/without this optimization in the first
> version (mostly
> because I think its unclear what the actual best schema is), but this patch
> is being submitted as asm and meant to be maintained as asm. If the
> only feasible
> way to make future changes/optimizations is to update the compiler and
> recompile
> some higher level language, that's an issue.
>
> >
> > > >
> > > > > sequential in memory.
> > > >
> > > > This is based on Intel compiler generated codes.  We will evaluate
> > > > Intel compiler changes.
> > > >
> > > > ...
> > > >
> > > > > > +
> > > > > > +#include <sysdep.h>
> > > > > > +        vfmadd231pd {rn-sae}, %zmm3, %zmm11, %zmm10
> > > > > > +        andl      %eax, %ecx
> > > > > drop I think
> > > > >
> > > > > > +        vmovups   poly_coeff_12+__svml_dacos_data_internal(%rip), %zmm11
> > > > > > +        kmovw     %ecx, %k3
> > > > > kandw %k4, %k2, %k3
> > > >
> > > > This may not be faster since mask register can only go to port 0.  We
> > > > will evaluate register allocation in Intel compiler.
> > >
> > > `kmovw` and `kandw` are both 1uop port0.
> > >
> > > `andl` + `kmovw` is 2 uops and has 4c latency
> > > vs
> > > `kandw` is 1 uop and 1c latency.
> >
> > Will be fixed in v6.

In the other patches (for other functions, this one is fine) can you
have the compiler print out (maybe just as a comment at the end of the
line) the live-intervals for each register assignment?  Looking at this
code there is a perception of extreme register pressure, but a lot of
that seems forced by suspect instruction scheduling. It would be easier
to notice that for future maintenance with the comments.
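
Something like this is what I have in mind (instructions lifted from the
AVX2 path; the interval numbers are invented, purely to show the format):

        vmulpd    %ymm4, %ymm4, %ymm8    /* ymm8  live 25..29 */
        vminpd    %ymm7, %ymm8, %ymm2    /* ymm2  live 29..70, last use of ymm8 */
        vrsqrtps  %xmm10, %xmm11         /* xmm11 live 31..33 */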
> >
> > > >
> > > >
> > > > Thanks.
> > > >
> > > > H.J.
Sunil Pandey Dec. 20, 2021, 8:30 p.m. UTC | #7
On Mon, Dec 20, 2021 at 11:20 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Mon, Dec 20, 2021 at 10:08 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >
> > On Sun, Dec 19, 2021 at 12:42 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > On Sun, Dec 19, 2021 at 2:26 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Sun, Dec 19, 2021 at 12:29:07PM -0600, GNU C Library wrote:
> > > > > On Sun, Dec 19, 2021 at 11:19 AM Sunil K Pandey via Libc-alpha
> > > > > <libc-alpha@sourceware.org> wrote:
> > > > > >
> > > > > > Implement vectorized acos/acosf containing SSE, AVX, AVX2 and
> > > > > > AVX512 versions for libmvec as per vector ABI.  It also contains
> > > > > > accuracy and ABI tests for vector acos/acosf with regenerated ulps.
> > > > > > ---
> > > > >
> > > > > Have a few small comments but generally okay with a patch like this
> > > > > one going out in
> > > > > 2.35.
> > > >
> > > > ...
> > > >
> > > > >
> > > > > > +#define poly_coeff_6                   896
> > > > > > +#define poly_coeff_7                   960
> > > > > > +#define poly_coeff_8                   1024
> > > > > > +#define poly_coeff_9                   1088
> > > > > > +#define poly_coeff_10                  1152
> > > > > > +#define poly_coeff_11                  1216
> > > > > > +#define poly_coeff_12                  1280
> > > > > > +#define PiH                            1344
> > > > > > +#define Pi2H                           1408
> > > > >
> > > > > There is enough memory here it may pay to make the accesses
> > > >
> > > > Did you enough registers?
> > >
> > > This shouldn't affect register allocation.
> > > It's just if in the program we access: poly_coeff_11 -> poly_coeff_6
> > > -> poly_coeff_8
> > >
> > > it might be beneficial to organize the addresses of 11/6/8 s.t its
> > > sequential memory
> > > accesses from the table i.e
> > > #define poly_coeff_11 896
> > > #define poly_coeff_6 960
> > > #define poly_coeff_8 1024
> > > ...
> > >
> > > Random example and just a thought. Figure if coming in cold it might
> > > save a cache miss or two because it has an easy to recognize pattern
> > > for the HW prefetcher. Don't think it's make or break.
> > >
> >
> > Good suggestion. It's difficult to hand modify. Will let compiler team know
> > about this optimization.
>
> Like I said, can live with/without this optimization in the first
> version (mostly
> because I think its unclear what the actual best schema is), but this patch
> is being submitted as asm and meant to be maintained as asm. If the
> only feasible
> way to make future changes/optimizations is to update the compiler and
> recompile
> some higher level language, that's an issue.

We prefer to generate compiler-optimized code. We can certainly
hand-optimize, just as we did for other cases. For this version we want
to leave it as is.

>
> >
> > > >
> > > > > sequential in memory.
> > > >
> > > > This is based on Intel compiler generated codes.  We will evaluate
> > > > Intel compiler changes.
> > > >
> > > > ...
> > > >
> > > > > > +
> > > > > > +#include <sysdep.h>
> > > > > > +        vfmadd231pd {rn-sae}, %zmm3, %zmm11, %zmm10
> > > > > > +        andl      %eax, %ecx
> > > > > drop I think
> > > > >
> > > > > > +        vmovups   poly_coeff_12+__svml_dacos_data_internal(%rip), %zmm11
> > > > > > +        kmovw     %ecx, %k3
> > > > > kandw %k4, %k2, %k3
> > > >
> > > > This may not be faster since mask register can only go to port 0.  We
> > > > will evaluate register allocation in Intel compiler.
> > >
> > > `kmovw` and `kandw` are both 1uop port0.
> > >
> > > `andl` + `kmovw` is 2 uops and has 4c latency
> > > vs
> > > `kandw` is 1 uop and 1c latency.
> >
> > Will be fixed in v6.
> >
> > > >
> > > >
> > > > Thanks.
> > > >
> > > > H.J.
diff mbox series

Patch

diff --git a/bits/libm-simd-decl-stubs.h b/bits/libm-simd-decl-stubs.h
index b80ff332a0..2ccdd1fc53 100644
--- a/bits/libm-simd-decl-stubs.h
+++ b/bits/libm-simd-decl-stubs.h
@@ -98,4 +98,15 @@ 
 #define __DECL_SIMD_powf32x
 #define __DECL_SIMD_powf64x
 #define __DECL_SIMD_powf128x
+
+#define __DECL_SIMD_acos
+#define __DECL_SIMD_acosf
+#define __DECL_SIMD_acosl
+#define __DECL_SIMD_acosf16
+#define __DECL_SIMD_acosf32
+#define __DECL_SIMD_acosf64
+#define __DECL_SIMD_acosf128
+#define __DECL_SIMD_acosf32x
+#define __DECL_SIMD_acosf64x
+#define __DECL_SIMD_acosf128x
 #endif
diff --git a/math/bits/mathcalls.h b/math/bits/mathcalls.h
index da4cf4e10c..2cc6654208 100644
--- a/math/bits/mathcalls.h
+++ b/math/bits/mathcalls.h
@@ -50,7 +50,7 @@ 
 /* Trigonometric functions.  */
 
 /* Arc cosine of X.  */
-__MATHCALL (acos,, (_Mdouble_ __x));
+__MATHCALL_VEC (acos,, (_Mdouble_ __x));
 /* Arc sine of X.  */
 __MATHCALL (asin,, (_Mdouble_ __x));
 /* Arc tangent of X.  */
diff --git a/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist
index 363d4ace1e..b37b55777e 100644
--- a/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist
@@ -46,3 +46,11 @@  GLIBC_2.22 _ZGVeN8v_log F
 GLIBC_2.22 _ZGVeN8v_sin F
 GLIBC_2.22 _ZGVeN8vv_pow F
 GLIBC_2.22 _ZGVeN8vvv_sincos F
+GLIBC_2.35 _ZGVbN2v_acos F
+GLIBC_2.35 _ZGVbN4v_acosf F
+GLIBC_2.35 _ZGVcN4v_acos F
+GLIBC_2.35 _ZGVcN8v_acosf F
+GLIBC_2.35 _ZGVdN4v_acos F
+GLIBC_2.35 _ZGVdN8v_acosf F
+GLIBC_2.35 _ZGVeN16v_acosf F
+GLIBC_2.35 _ZGVeN8v_acos F
diff --git a/sysdeps/x86/fpu/bits/math-vector.h b/sysdeps/x86/fpu/bits/math-vector.h
index dc0bfb3705..dabb74cbb9 100644
--- a/sysdeps/x86/fpu/bits/math-vector.h
+++ b/sysdeps/x86/fpu/bits/math-vector.h
@@ -58,6 +58,10 @@ 
 #  define __DECL_SIMD_pow __DECL_SIMD_x86_64
 #  undef __DECL_SIMD_powf
 #  define __DECL_SIMD_powf __DECL_SIMD_x86_64
+#  undef __DECL_SIMD_acos
+#  define __DECL_SIMD_acos __DECL_SIMD_x86_64
+#  undef __DECL_SIMD_acosf
+#  define __DECL_SIMD_acosf __DECL_SIMD_x86_64
 
 # endif
 #endif
diff --git a/sysdeps/x86/fpu/finclude/math-vector-fortran.h b/sysdeps/x86/fpu/finclude/math-vector-fortran.h
index 311bb4e391..4bcbd1fbce 100644
--- a/sysdeps/x86/fpu/finclude/math-vector-fortran.h
+++ b/sysdeps/x86/fpu/finclude/math-vector-fortran.h
@@ -28,6 +28,8 @@ 
 !GCC$ builtin (expf) attributes simd (notinbranch) if('x86_64')
 !GCC$ builtin (pow) attributes simd (notinbranch) if('x86_64')
 !GCC$ builtin (powf) attributes simd (notinbranch) if('x86_64')
+!GCC$ builtin (acos) attributes simd (notinbranch) if('x86_64')
+!GCC$ builtin (acosf) attributes simd (notinbranch) if('x86_64')
 
 !GCC$ builtin (cos) attributes simd (notinbranch) if('x32')
 !GCC$ builtin (cosf) attributes simd (notinbranch) if('x32')
@@ -41,3 +43,5 @@ 
 !GCC$ builtin (expf) attributes simd (notinbranch) if('x32')
 !GCC$ builtin (pow) attributes simd (notinbranch) if('x32')
 !GCC$ builtin (powf) attributes simd (notinbranch) if('x32')
+!GCC$ builtin (acos) attributes simd (notinbranch) if('x32')
+!GCC$ builtin (acosf) attributes simd (notinbranch) if('x32')
diff --git a/sysdeps/x86_64/fpu/Makeconfig b/sysdeps/x86_64/fpu/Makeconfig
index b0e3bf7887..7acf1f306c 100644
--- a/sysdeps/x86_64/fpu/Makeconfig
+++ b/sysdeps/x86_64/fpu/Makeconfig
@@ -22,6 +22,7 @@  postclean-generated += libmvec.mk
 
 # Define for both math and mathvec directories.
 libmvec-funcs = \
+  acos \
   cos \
   exp \
   log \
diff --git a/sysdeps/x86_64/fpu/Versions b/sysdeps/x86_64/fpu/Versions
index 08132045d6..2985fe7ca7 100644
--- a/sysdeps/x86_64/fpu/Versions
+++ b/sysdeps/x86_64/fpu/Versions
@@ -13,4 +13,8 @@  libmvec {
     _ZGVbN4vv_powf; _ZGVcN8vv_powf; _ZGVdN8vv_powf; _ZGVeN16vv_powf;
     _ZGVbN4vvv_sincosf; _ZGVcN8vvv_sincosf; _ZGVdN8vvv_sincosf; _ZGVeN16vvv_sincosf;
   }
+  GLIBC_2.35 {
+    _ZGVbN2v_acos; _ZGVcN4v_acos; _ZGVdN4v_acos; _ZGVeN8v_acos;
+    _ZGVbN4v_acosf; _ZGVcN8v_acosf; _ZGVdN8v_acosf; _ZGVeN16v_acosf;
+  }
 }
diff --git a/sysdeps/x86_64/fpu/libm-test-ulps b/sysdeps/x86_64/fpu/libm-test-ulps
index 312575f933..85a568ed29 100644
--- a/sysdeps/x86_64/fpu/libm-test-ulps
+++ b/sysdeps/x86_64/fpu/libm-test-ulps
@@ -25,6 +25,26 @@  float: 1
 float128: 1
 ldouble: 2
 
+Function: "acos_vlen16":
+float: 1
+
+Function: "acos_vlen2":
+double: 1
+
+Function: "acos_vlen4":
+double: 1
+float: 2
+
+Function: "acos_vlen4_avx2":
+double: 1
+
+Function: "acos_vlen8":
+double: 1
+float: 2
+
+Function: "acos_vlen8_avx2":
+float: 1
+
 Function: "acosh":
 double: 2
 float: 2
diff --git a/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h b/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h
new file mode 100644
index 0000000000..3aed563dde
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h
@@ -0,0 +1,39 @@ 
+/* Common definition for libmathvec ifunc selections optimized with
+   AVX512.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <init-arch.h>
+
+#undef PASTER2
+#define PASTER2(x,y)   x##_##y
+
+extern void REDIRECT_NAME (void);
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_wrapper) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (skx) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features* cpu_features = __get_cpu_features ();
+
+  if (!CPU_FEATURES_ARCH_P (cpu_features, MathVec_Prefer_No_AVX512)
+      && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ))
+    return OPTIMIZE (skx);
+
+  return OPTIMIZE (avx2_wrapper);
+}
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S
new file mode 100644
index 0000000000..25fb8d0cac
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S
@@ -0,0 +1,20 @@ 
+/* SSE2 version of vectorized acos, vector length is 2.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define _ZGVbN2v_acos _ZGVbN2v_acos_sse2
+#include "../svml_d_acos2_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c
new file mode 100644
index 0000000000..5ba5d6fac2
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c
@@ -0,0 +1,27 @@ 
+/* Multiple versions of vectorized acos, vector length is 2.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define SYMBOL_NAME _ZGVbN2v_acos
+#include "ifunc-mathvec-sse4_1.h"
+
+libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
+
+#ifdef SHARED
+__hidden_ver1 (_ZGVbN2v_acos, __GI__ZGVbN2v_acos, __redirect__ZGVbN2v_acos)
+  __attribute__ ((visibility ("hidden")));
+#endif
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S
new file mode 100644
index 0000000000..2c528c012e
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S
@@ -0,0 +1,293 @@ 
+/* Function acos vectorized with SSE4.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   https://www.gnu.org/licenses/.  */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ *      SelMask = (|x| >= 0.5) ? 1 : 0;
+ *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
+ *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
+ *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
+ *
+ */
+
+/* Offsets for data table __svml_dacos_data_internal
+ */
+#define SgnBit                        	0
+#define OneHalf                       	16
+#define SmallNorm                     	32
+#define MOne                          	48
+#define Two                           	64
+#define sqrt_coeff                    	80
+#define poly_coeff                    	144
+#define PiH                           	336
+#define Pi2H                          	352
+
+#include <sysdep.h>
+
+        .text
+	.section .text.sse4,"ax",@progbits
+ENTRY(_ZGVbN2v_acos_sse4)
+        subq      $72, %rsp
+        cfi_def_cfa_offset(80)
+        movaps    %xmm0, %xmm5
+        movups    __svml_dacos_data_internal(%rip), %xmm3
+        movups    OneHalf+__svml_dacos_data_internal(%rip), %xmm6
+
+/* x = -|arg| */
+        movaps    %xmm3, %xmm4
+        orps      %xmm5, %xmm4
+
+/* Y = 0.5 + 0.5*(-x) */
+        movaps    %xmm6, %xmm7
+        mulpd     %xmm4, %xmm7
+        addpd     %xmm7, %xmm6
+
+/* S ~ 2*sqrt(Y) */
+        cvtpd2ps  %xmm6, %xmm9
+        movlhps   %xmm9, %xmm9
+
+/* x^2 */
+        movaps    %xmm4, %xmm0
+        rsqrtps   %xmm9, %xmm10
+        mulpd     %xmm4, %xmm0
+        cvtps2pd  %xmm10, %xmm11
+        minpd     %xmm6, %xmm0
+        movaps    %xmm6, %xmm1
+        movaps    %xmm0, %xmm2
+        cmpltpd   SmallNorm+__svml_dacos_data_internal(%rip), %xmm1
+        cmpnltpd  %xmm6, %xmm2
+        addpd     %xmm6, %xmm6
+        andnps    %xmm11, %xmm1
+        movaps    %xmm0, %xmm11
+        movaps    %xmm1, %xmm12
+        andps     %xmm5, %xmm3
+        mulpd     %xmm1, %xmm12
+        mulpd     %xmm6, %xmm1
+        mulpd     %xmm12, %xmm6
+        mulpd     %xmm0, %xmm11
+        subpd     Two+__svml_dacos_data_internal(%rip), %xmm6
+        movups    sqrt_coeff+__svml_dacos_data_internal(%rip), %xmm13
+        movaps    %xmm6, %xmm14
+        mulpd     %xmm6, %xmm13
+        mulpd     %xmm1, %xmm14
+        addpd     sqrt_coeff+16+__svml_dacos_data_internal(%rip), %xmm13
+        mulpd     %xmm6, %xmm13
+        addpd     sqrt_coeff+32+__svml_dacos_data_internal(%rip), %xmm13
+        mulpd     %xmm13, %xmm6
+
+/* polynomial */
+        movups    poly_coeff+__svml_dacos_data_internal(%rip), %xmm15
+        movaps    %xmm11, %xmm7
+        mulpd     %xmm0, %xmm15
+        addpd     sqrt_coeff+48+__svml_dacos_data_internal(%rip), %xmm6
+        addpd     poly_coeff+16+__svml_dacos_data_internal(%rip), %xmm15
+        mulpd     %xmm11, %xmm7
+        mulpd     %xmm6, %xmm14
+        mulpd     %xmm11, %xmm15
+        subpd     %xmm14, %xmm1
+        movups    MOne+__svml_dacos_data_internal(%rip), %xmm8
+        andps     %xmm2, %xmm1
+
+/* NaN processed in special branch (so wind test passed) */
+        cmpnlepd  %xmm4, %xmm8
+        movmskpd  %xmm8, %edx
+
+/* X<X^2 iff X<0 */
+        movaps    %xmm5, %xmm12
+        movups    poly_coeff+32+__svml_dacos_data_internal(%rip), %xmm8
+        movaps    %xmm2, %xmm13
+        movups    poly_coeff+64+__svml_dacos_data_internal(%rip), %xmm6
+        mulpd     %xmm0, %xmm8
+        mulpd     %xmm0, %xmm6
+        addpd     poly_coeff+48+__svml_dacos_data_internal(%rip), %xmm8
+        addpd     poly_coeff+80+__svml_dacos_data_internal(%rip), %xmm6
+        cmpltpd   %xmm0, %xmm12
+        addpd     %xmm15, %xmm8
+        mulpd     %xmm11, %xmm6
+        mulpd     %xmm7, %xmm8
+        movups    poly_coeff+96+__svml_dacos_data_internal(%rip), %xmm9
+        mulpd     %xmm0, %xmm9
+        addpd     poly_coeff+112+__svml_dacos_data_internal(%rip), %xmm9
+        addpd     %xmm6, %xmm9
+        movups    poly_coeff+128+__svml_dacos_data_internal(%rip), %xmm10
+        movaps    %xmm2, %xmm6
+        mulpd     %xmm0, %xmm10
+        addpd     %xmm8, %xmm9
+        addpd     poly_coeff+144+__svml_dacos_data_internal(%rip), %xmm10
+        mulpd     %xmm11, %xmm9
+        movups    poly_coeff+160+__svml_dacos_data_internal(%rip), %xmm14
+        andnps    %xmm4, %xmm6
+        addpd     %xmm9, %xmm10
+        mulpd     %xmm0, %xmm14
+        mulpd     %xmm10, %xmm11
+        addpd     poly_coeff+176+__svml_dacos_data_internal(%rip), %xmm14
+        addpd     %xmm11, %xmm14
+        mulpd     %xmm0, %xmm14
+        orps      %xmm1, %xmm6
+        pxor      %xmm3, %xmm6
+        mulpd     %xmm6, %xmm14
+        movups    PiH+__svml_dacos_data_internal(%rip), %xmm0
+        andps     %xmm2, %xmm0
+        andnps    Pi2H+__svml_dacos_data_internal(%rip), %xmm13
+        andps     %xmm12, %xmm0
+        addpd     %xmm13, %xmm0
+        addpd     %xmm14, %xmm6
+        addpd     %xmm6, %xmm0
+        testl     %edx, %edx
+
+/* Go to special inputs processing branch */
+        jne       L(SPECIAL_VALUES_BRANCH)
+
+/* Restore registers
+ * and exit the function
+ */
+
+L(EXIT):
+        addq      $72, %rsp
+        cfi_def_cfa_offset(8)
+        ret
+        cfi_def_cfa_offset(80)
+
+/* Branch to process
+ * special inputs
+ */
+
+L(SPECIAL_VALUES_BRANCH):
+        movups    %xmm5, 32(%rsp)
+        movups    %xmm0, 48(%rsp)
+        xorl      %eax, %eax
+        movq      %r12, 16(%rsp)
+        cfi_offset(12, -64)
+        movl      %eax, %r12d
+        movq      %r13, 8(%rsp)
+        cfi_offset(13, -72)
+        movl      %edx, %r13d
+        movq      %r14, (%rsp)
+        cfi_offset(14, -80)
+
+/* Range mask
+ * bits check
+ */
+
+L(RANGEMASK_CHECK):
+        btl       %r12d, %r13d
+
+/* Call scalar math function */
+        jc        L(SCALAR_MATH_CALL)
+
+/* Special inputs
+ * processing loop
+ */
+
+L(SPECIAL_VALUES_LOOP):
+        incl      %r12d
+        cmpl      $2, %r12d
+
+/* Check bits in range mask */
+        jl        L(RANGEMASK_CHECK)
+        movq      16(%rsp), %r12
+        cfi_restore(12)
+        movq      8(%rsp), %r13
+        cfi_restore(13)
+        movq      (%rsp), %r14
+        cfi_restore(14)
+        movups    48(%rsp), %xmm0
+
+/* Go to exit */
+        jmp       L(EXIT)
+        cfi_offset(12, -64)
+        cfi_offset(13, -72)
+        cfi_offset(14, -80)
+
+/* Scalar math function call
+ * to process special input
+ */
+
+L(SCALAR_MATH_CALL):
+        movl      %r12d, %r14d
+        movsd     32(%rsp,%r14,8), %xmm0
+        call      acos@PLT
+        movsd     %xmm0, 48(%rsp,%r14,8)
+
+/* Process special inputs in loop */
+        jmp       L(SPECIAL_VALUES_LOOP)
+
+END(_ZGVbN2v_acos_sse4)
+
+        .section .rodata, "a"
+        .align 16
+
+#ifdef __svml_dacos_data_internal_typedef
+typedef unsigned int VUINT32;
+typedef struct {
+        __declspec(align(16)) VUINT32 SgnBit[2][2];
+        __declspec(align(16)) VUINT32 OneHalf[2][2];
+        __declspec(align(16)) VUINT32 SmallNorm[2][2];
+        __declspec(align(16)) VUINT32 MOne[2][2];
+        __declspec(align(16)) VUINT32 Two[2][2];
+        __declspec(align(16)) VUINT32 sqrt_coeff[4][2][2];
+        __declspec(align(16)) VUINT32 poly_coeff[12][2][2];
+        __declspec(align(16)) VUINT32 PiH[2][2];
+        __declspec(align(16)) VUINT32 Pi2H[2][2];
+} __svml_dacos_data_internal;
+#endif
+__svml_dacos_data_internal:
+        /*== SgnBit ==*/
+        .quad 0x8000000000000000, 0x8000000000000000
+        /*== OneHalf ==*/
+        .align 16
+        .quad 0x3fe0000000000000, 0x3fe0000000000000
+        /*== SmallNorm ==*/
+        .align 16
+        .quad 0x3000000000000000, 0x3000000000000000
+        /*== MOne ==*/
+        .align 16
+        .quad 0xbff0000000000000, 0xbff0000000000000
+        /*== Two ==*/
+        .align 16
+        .quad 0x4000000000000000, 0x4000000000000000
+        /*== sqrt_coeff[4] ==*/
+        .align 16
+        .quad 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */
+        .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */
+        .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */
+        .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */
+        /*== poly_coeff[12] ==*/
+        .align 16
+        .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */
+        .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */
+        .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */
+        .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */
+        .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */
+        .quad 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */
+        .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */
+        .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */
+        .quad 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */
+        .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */
+        .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */
+        .quad 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */
+        /*== PiH ==*/
+        .align 16
+        .quad 0x400921fb54442d18, 0x400921fb54442d18
+        /*== Pi2H ==*/
+        .align 16
+        .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18
+        .align 16
+        .type	__svml_dacos_data_internal,@object
+        .size	__svml_dacos_data_internal,.-__svml_dacos_data_internal
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S
new file mode 100644
index 0000000000..750f71c81c
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S
@@ -0,0 +1,20 @@ 
+/* SSE version of vectorized acos, vector length is 4.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define _ZGVdN4v_acos _ZGVdN4v_acos_sse_wrapper
+#include "../svml_d_acos4_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c
new file mode 100644
index 0000000000..6453e7ebe2
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c
@@ -0,0 +1,27 @@ 
+/* Multiple versions of vectorized acos, vector length is 4.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define SYMBOL_NAME _ZGVdN4v_acos
+#include "ifunc-mathvec-avx2.h"
+
+libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
+
+#ifdef SHARED
+__hidden_ver1 (_ZGVdN4v_acos, __GI__ZGVdN4v_acos, __redirect__ZGVdN4v_acos)
+  __attribute__ ((visibility ("hidden")));
+#endif
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S
new file mode 100644
index 0000000000..172080e3ea
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S
@@ -0,0 +1,273 @@ 
+/* Function acos vectorized with AVX2.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   https://www.gnu.org/licenses/.  */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ *      SelMask = (|x| >= 0.5) ? 1 : 0;
+ *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
+ *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
+ *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
+ *
+ */
+
+/* Offsets for data table __svml_dacos_data_internal
+ */
+#define SgnBit                        	0
+#define OneHalf                       	32
+#define SmallNorm                     	64
+#define MOne                          	96
+#define Two                           	128
+#define sqrt_coeff                    	160
+#define poly_coeff                    	288
+#define PiH                           	672
+#define Pi2H                          	704
+
+#include <sysdep.h>
+
+        .text
+	.section .text.avx2,"ax",@progbits
+ENTRY(_ZGVdN4v_acos_avx2)
+        pushq     %rbp
+        cfi_def_cfa_offset(16)
+        movq      %rsp, %rbp
+        cfi_def_cfa(6, 16)
+        cfi_offset(6, -16)
+        andq      $-32, %rsp
+        subq      $96, %rsp
+        vmovupd   __svml_dacos_data_internal(%rip), %ymm6
+        vmovupd   OneHalf+__svml_dacos_data_internal(%rip), %ymm7
+        vmovapd   %ymm0, %ymm5
+
+/* x = -|arg| */
+        vorpd     %ymm5, %ymm6, %ymm4
+
+/* Y = 0.5 + 0.5*(-x) */
+        vfmadd231pd %ymm4, %ymm7, %ymm7
+
+/* x^2 */
+        vmulpd    %ymm4, %ymm4, %ymm8
+
+/* S ~ 2*sqrt(Y) */
+        vmovupd   sqrt_coeff+__svml_dacos_data_internal(%rip), %ymm0
+        vcmplt_oqpd SmallNorm+__svml_dacos_data_internal(%rip), %ymm7, %ymm12
+        vminpd    %ymm7, %ymm8, %ymm2
+
+/* NaN processed in special branch (so wind test passed) */
+        vcmpnge_uqpd MOne+__svml_dacos_data_internal(%rip), %ymm4, %ymm9
+        vcvtpd2ps %ymm7, %xmm10
+        vmovupd   poly_coeff+64+__svml_dacos_data_internal(%rip), %ymm8
+        vcmpnlt_uqpd %ymm7, %ymm2, %ymm1
+        vrsqrtps  %xmm10, %xmm11
+        vfmadd213pd poly_coeff+96+__svml_dacos_data_internal(%rip), %ymm2, %ymm8
+        vcvtps2pd %xmm11, %ymm13
+        vmovupd   poly_coeff+128+__svml_dacos_data_internal(%rip), %ymm11
+        vandnpd   %ymm13, %ymm12, %ymm14
+        vmulpd    %ymm14, %ymm14, %ymm15
+        vfmadd213pd poly_coeff+160+__svml_dacos_data_internal(%rip), %ymm2, %ymm11
+        vmulpd    %ymm2, %ymm2, %ymm13
+        vmovupd   poly_coeff+256+__svml_dacos_data_internal(%rip), %ymm12
+        vmulpd    %ymm13, %ymm13, %ymm10
+        vfmadd213pd poly_coeff+288+__svml_dacos_data_internal(%rip), %ymm2, %ymm12
+        vandpd    %ymm5, %ymm6, %ymm3
+        vaddpd    %ymm7, %ymm7, %ymm6
+        vmulpd    %ymm6, %ymm14, %ymm7
+        vfmsub213pd Two+__svml_dacos_data_internal(%rip), %ymm15, %ymm6
+        vmovupd   poly_coeff+320+__svml_dacos_data_internal(%rip), %ymm14
+        vfmadd213pd sqrt_coeff+32+__svml_dacos_data_internal(%rip), %ymm6, %ymm0
+        vmulpd    %ymm6, %ymm7, %ymm15
+        vfmadd213pd poly_coeff+352+__svml_dacos_data_internal(%rip), %ymm2, %ymm14
+        vfmadd213pd sqrt_coeff+64+__svml_dacos_data_internal(%rip), %ymm6, %ymm0
+        vfmadd213pd sqrt_coeff+96+__svml_dacos_data_internal(%rip), %ymm6, %ymm0
+
+/* polynomial */
+        vmovupd   poly_coeff+__svml_dacos_data_internal(%rip), %ymm6
+        vfnmadd213pd %ymm7, %ymm15, %ymm0
+        vfmadd213pd poly_coeff+32+__svml_dacos_data_internal(%rip), %ymm2, %ymm6
+        vblendvpd %ymm1, %ymm0, %ymm4, %ymm0
+        vfmadd213pd %ymm8, %ymm13, %ymm6
+        vmovmskpd %ymm9, %edx
+        vmovupd   poly_coeff+192+__svml_dacos_data_internal(%rip), %ymm9
+        vfmadd213pd poly_coeff+224+__svml_dacos_data_internal(%rip), %ymm2, %ymm9
+        vfmadd213pd %ymm9, %ymm13, %ymm11
+        vfmadd213pd %ymm11, %ymm10, %ymm6
+        vfmadd213pd %ymm12, %ymm13, %ymm6
+        vfmadd213pd %ymm14, %ymm13, %ymm6
+        vmulpd    %ymm6, %ymm2, %ymm9
+
+/* X<X^2 iff X<0 */
+        vcmplt_oqpd %ymm2, %ymm5, %ymm6
+        vandpd    PiH+__svml_dacos_data_internal(%rip), %ymm1, %ymm2
+        vandnpd   Pi2H+__svml_dacos_data_internal(%rip), %ymm1, %ymm7
+        vxorpd    %ymm3, %ymm0, %ymm1
+        vfmadd213pd %ymm1, %ymm1, %ymm9
+        vandpd    %ymm6, %ymm2, %ymm2
+        vaddpd    %ymm7, %ymm2, %ymm8
+        vaddpd    %ymm9, %ymm8, %ymm0
+        testl     %edx, %edx
+
+/* Go to special inputs processing branch */
+        jne       L(SPECIAL_VALUES_BRANCH)
+
+/* Restore registers
+ * and exit the function
+ */
+
+L(EXIT):
+        movq      %rbp, %rsp
+        popq      %rbp
+        cfi_def_cfa(7, 8)
+        cfi_restore(6)
+        ret
+        cfi_def_cfa(6, 16)
+        cfi_offset(6, -16)
+
+/* Branch to process
+ * special inputs
+ */
+
+L(SPECIAL_VALUES_BRANCH):
+        vmovupd   %ymm5, 32(%rsp)
+        vmovupd   %ymm0, 64(%rsp)
+        xorl      %eax, %eax
+        vzeroupper
+        movq      %r12, 16(%rsp)
+        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
+        movl      %eax, %r12d
+        movq      %r13, 8(%rsp)
+        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
+        movl      %edx, %r13d
+        movq      %r14, (%rsp)
+        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
+
+/* Range mask
+ * bits check
+ */
+
+L(RANGEMASK_CHECK):
+        btl       %r12d, %r13d
+
+/* Call scalar math function */
+        jc        L(SCALAR_MATH_CALL)
+
+/* Special inputs
+ * processing loop
+ */
+
+L(SPECIAL_VALUES_LOOP):
+        incl      %r12d
+        cmpl      $4, %r12d
+
+/* Check bits in range mask */
+        jl        L(RANGEMASK_CHECK)
+        movq      16(%rsp), %r12
+        cfi_restore(12)
+        movq      8(%rsp), %r13
+        cfi_restore(13)
+        movq      (%rsp), %r14
+        cfi_restore(14)
+        vmovupd   64(%rsp), %ymm0
+
+/* Go to exit */
+        jmp       L(EXIT)
+        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
+        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
+        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
+
+/* Scalar math function call
+ * to process special input
+ */
+
+L(SCALAR_MATH_CALL):
+        movl      %r12d, %r14d
+        movsd     32(%rsp,%r14,8), %xmm0
+        call      acos@PLT
+        movsd     %xmm0, 64(%rsp,%r14,8)
+
+/* Process special inputs in loop */
+        jmp       L(SPECIAL_VALUES_LOOP)
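+
+/* In effect, the special-input fallback above does the following
+ * (illustrative sketch; the live code keeps the lane index in %r12d and
+ * the range mask in %r13d):
+ *
+ *   for (int i = 0; i < 4; i++)        // 4 double lanes in this variant
+ *     if (range_mask & (1 << i))       // lane flagged as special
+ *       result[i] = acos (arg[i]);     // scalar libm call via the PLT
+ */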
+
+END(_ZGVdN4v_acos_avx2)
+
+        .section .rodata, "a"
+        .align 32
+
+#ifdef __svml_dacos_data_internal_typedef
+typedef unsigned int VUINT32;
+typedef struct {
+        __declspec(align(32)) VUINT32 SgnBit[4][2];
+        __declspec(align(32)) VUINT32 OneHalf[4][2];
+        __declspec(align(32)) VUINT32 SmallNorm[4][2];
+        __declspec(align(32)) VUINT32 MOne[4][2];
+        __declspec(align(32)) VUINT32 Two[4][2];
+        __declspec(align(32)) VUINT32 sqrt_coeff[4][4][2];
+        __declspec(align(32)) VUINT32 poly_coeff[12][4][2];
+        __declspec(align(32)) VUINT32 PiH[4][2];
+        __declspec(align(32)) VUINT32 Pi2H[4][2];
+} __svml_dacos_data_internal;
+#endif
+__svml_dacos_data_internal:
+        /*== SgnBit ==*/
+        .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
+        /*== OneHalf ==*/
+        .align 32
+        .quad 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000
+        /*== SmallNorm ==*/
+        .align 32
+        .quad 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000
+        /*== MOne ==*/
+        .align 32
+        .quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
+        /*== Two ==*/
+        .align 32
+        .quad 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000
+        /*== sqrt_coeff[4] ==*/
+        .align 32
+        .quad 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */
+        .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */
+        .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */
+        .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */
+        /*== poly_coeff[12] ==*/
+        .align 32
+        .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */
+        .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */
+        .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */
+        .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */
+        .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */
+        .quad 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */
+        .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */
+        .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */
+        .quad 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */
+        .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */
+        .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */
+        .quad 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */
+        /*== PiH ==*/
+        .align 32
+        .quad 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18
+        /*== Pi2H ==*/
+        .align 32
+        .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
+        .align 32
+        .type	__svml_dacos_data_internal,@object
+        .size	__svml_dacos_data_internal,.-__svml_dacos_data_internal
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S
new file mode 100644
index 0000000000..4d64fd1c00
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S
@@ -0,0 +1,20 @@ 
+/* AVX2 version of vectorized acos, vector length is 8.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define _ZGVeN8v_acos _ZGVeN8v_acos_avx2_wrapper
+#include "../svml_d_acos8_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c
new file mode 100644
index 0000000000..1e7d1865fb
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c
@@ -0,0 +1,27 @@ 
+/* Multiple versions of vectorized acos, vector length is 8.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define SYMBOL_NAME _ZGVeN8v_acos
+#include "ifunc-mathvec-avx512-skx.h"
+
+libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
+
+#ifdef SHARED
+__hidden_ver1 (_ZGVeN8v_acos, __GI__ZGVeN8v_acos, __redirect__ZGVeN8v_acos)
+  __attribute__ ((visibility ("hidden")));
+#endif
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S
new file mode 100644
index 0000000000..76ca35ad7b
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S
@@ -0,0 +1,298 @@ 
+/* Function acos vectorized with AVX-512.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ *      SelMask = (|x| >= 0.5) ? 1 : 0;
+ *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
+ *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
+ *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
+ *
+ */
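+
+/* Expanding the two selections above gives the per-lane result that the
+ * masked blends below choose between:
+ *
+ *   |x| >= 0.5, x >= 0:        2*Poly(R)
+ *   |x| >= 0.5, x <  0:   Pi - 2*Poly(R)
+ *   |x| <  0.5, x >= 0: Pi/2 -   Poly(R)
+ *   |x| <  0.5, x <  0: Pi/2 +   Poly(R)
+ */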
+
+/* Offsets for data table __svml_dacos_data_internal
+ */
+#define SgnBit                        	0
+#define OneHalf                       	64
+#define SmallNorm                     	128
+#define MOne                          	192
+#define Two                           	256
+#define sqrt_coeff_1                  	320
+#define sqrt_coeff_2                  	384
+#define sqrt_coeff_3                  	448
+#define sqrt_coeff_4                  	512
+#define poly_coeff_1                  	576
+#define poly_coeff_2                  	640
+#define poly_coeff_3                  	704
+#define poly_coeff_4                  	768
+#define poly_coeff_5                  	832
+#define poly_coeff_6                  	896
+#define poly_coeff_7                  	960
+#define poly_coeff_8                  	1024
+#define poly_coeff_9                  	1088
+#define poly_coeff_10                 	1152
+#define poly_coeff_11                 	1216
+#define poly_coeff_12                 	1280
+#define PiH                           	1344
+#define Pi2H                          	1408
+
+#include <sysdep.h>
+
+        .text
+	.section .text.evex512,"ax",@progbits
+ENTRY(_ZGVeN8v_acos_skx)
+        pushq     %rbp
+        cfi_def_cfa_offset(16)
+        movq      %rsp, %rbp
+        cfi_def_cfa(6, 16)
+        cfi_offset(6, -16)
+        andq      $-64, %rsp
+        subq      $192, %rsp
+        vmovups   __svml_dacos_data_internal(%rip), %zmm7
+        vmovups   OneHalf+__svml_dacos_data_internal(%rip), %zmm8
+
+/* S ~ 2*sqrt(Y) */
+        vmovups   SmallNorm+__svml_dacos_data_internal(%rip), %zmm11
+        vmovups   Two+__svml_dacos_data_internal(%rip), %zmm14
+        vmovups   sqrt_coeff_1+__svml_dacos_data_internal(%rip), %zmm15
+        vmovups   sqrt_coeff_2+__svml_dacos_data_internal(%rip), %zmm2
+        vmovups   sqrt_coeff_3+__svml_dacos_data_internal(%rip), %zmm1
+        vmovups   MOne+__svml_dacos_data_internal(%rip), %zmm10
+        vmovaps   %zmm0, %zmm6
+
+/* x = -|arg| */
+        vorpd     %zmm6, %zmm7, %zmm5
+        vandpd    %zmm6, %zmm7, %zmm4
+
+/* Y = 0.5 + 0.5*(-x) */
+        vfmadd231pd {rn-sae}, %zmm5, %zmm8, %zmm8
+
+/* x^2 */
+        vmulpd    {rn-sae}, %zmm5, %zmm5, %zmm9
+        vrsqrt14pd %zmm8, %zmm12
+        vcmppd    $17, {sae}, %zmm11, %zmm8, %k1
+        vcmppd    $17, {sae}, %zmm10, %zmm5, %k0
+        vmovups   poly_coeff_5+__svml_dacos_data_internal(%rip), %zmm10
+        vmovups   poly_coeff_7+__svml_dacos_data_internal(%rip), %zmm11
+        vminpd    {sae}, %zmm8, %zmm9, %zmm3
+        vmovups   poly_coeff_3+__svml_dacos_data_internal(%rip), %zmm9
+        vxorpd    %zmm12, %zmm12, %zmm12{%k1}
+        vaddpd    {rn-sae}, %zmm8, %zmm8, %zmm0
+        vcmppd    $21, {sae}, %zmm8, %zmm3, %k4
+
+/* X<X^2 iff X<0 */
+        vcmppd    $17, {sae}, %zmm3, %zmm6, %k2
+        vmulpd    {rn-sae}, %zmm12, %zmm12, %zmm13
+        vmulpd    {rn-sae}, %zmm12, %zmm0, %zmm7
+        vmovups   poly_coeff_4+__svml_dacos_data_internal(%rip), %zmm12
+
+/* polynomial */
+        vmovups   poly_coeff_1+__svml_dacos_data_internal(%rip), %zmm8
+        vfmsub213pd {rn-sae}, %zmm14, %zmm13, %zmm0
+        vmovups   sqrt_coeff_4+__svml_dacos_data_internal(%rip), %zmm13
+        vfmadd231pd {rn-sae}, %zmm3, %zmm9, %zmm12
+        vmovups   poly_coeff_11+__svml_dacos_data_internal(%rip), %zmm9
+        vfmadd231pd {rn-sae}, %zmm0, %zmm15, %zmm2
+        vmovups   poly_coeff_9+__svml_dacos_data_internal(%rip), %zmm15
+        vmulpd    {rn-sae}, %zmm0, %zmm7, %zmm14
+        vfmadd213pd {rn-sae}, %zmm1, %zmm0, %zmm2
+        vmovups   poly_coeff_2+__svml_dacos_data_internal(%rip), %zmm1
+        kmovw     %k4, %eax
+        kmovw     %k2, %ecx
+        kmovw     %k0, %edx
+        vfmadd213pd {rn-sae}, %zmm13, %zmm0, %zmm2
+        vfmadd231pd {rn-sae}, %zmm3, %zmm8, %zmm1
+        vmovups   poly_coeff_10+__svml_dacos_data_internal(%rip), %zmm8
+        vmulpd    {rn-sae}, %zmm3, %zmm3, %zmm0
+        vfnmadd213pd {rn-sae}, %zmm7, %zmm14, %zmm2
+        vmovups   poly_coeff_6+__svml_dacos_data_internal(%rip), %zmm7
+        vfmadd231pd {rn-sae}, %zmm3, %zmm15, %zmm8
+        vfmadd213pd {rn-sae}, %zmm12, %zmm0, %zmm1
+        vblendmpd %zmm2, %zmm5, %zmm2{%k4}
+        vfmadd231pd {rn-sae}, %zmm3, %zmm10, %zmm7
+        vmovups   poly_coeff_8+__svml_dacos_data_internal(%rip), %zmm10
+        vfmadd231pd {rn-sae}, %zmm3, %zmm11, %zmm10
+        andl      %eax, %ecx
+        vmovups   poly_coeff_12+__svml_dacos_data_internal(%rip), %zmm11
+        kmovw     %ecx, %k3
+        vfmadd213pd {rn-sae}, %zmm10, %zmm0, %zmm7
+        vfmadd231pd {rn-sae}, %zmm3, %zmm9, %zmm11
+        vmulpd    {rn-sae}, %zmm0, %zmm0, %zmm10
+        vfmadd213pd {rn-sae}, %zmm7, %zmm10, %zmm1
+        vfmadd213pd {rn-sae}, %zmm8, %zmm0, %zmm1
+        vfmadd213pd {rn-sae}, %zmm11, %zmm0, %zmm1
+        vmovups   Pi2H+__svml_dacos_data_internal(%rip), %zmm0
+        vmulpd    {rn-sae}, %zmm3, %zmm1, %zmm1
+        vxorpd    %zmm4, %zmm2, %zmm3
+        vxorpd    %zmm0, %zmm0, %zmm0{%k4}
+        vfmadd213pd {rn-sae}, %zmm3, %zmm3, %zmm1
+        vorpd     PiH+__svml_dacos_data_internal(%rip), %zmm0, %zmm0{%k3}
+        vaddpd    {rn-sae}, %zmm1, %zmm0, %zmm0
+        testl     %edx, %edx
+
+/* Go to special inputs processing branch */
+        jne       L(SPECIAL_VALUES_BRANCH)
+
+/* Restore registers
+ * and exit the function
+ */
+
+L(EXIT):
+        movq      %rbp, %rsp
+        popq      %rbp
+        cfi_def_cfa(7, 8)
+        cfi_restore(6)
+        ret
+        cfi_def_cfa(6, 16)
+        cfi_offset(6, -16)
+
+/* Branch to process
+ * special inputs
+ */
+
+L(SPECIAL_VALUES_BRANCH):
+        vmovups   %zmm6, 64(%rsp)
+        vmovups   %zmm0, 128(%rsp)
+        xorl      %eax, %eax
+        vzeroupper
+        movq      %r12, 16(%rsp)
+        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
+        movl      %eax, %r12d
+        movq      %r13, 8(%rsp)
+        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
+        movl      %edx, %r13d
+        movq      %r14, (%rsp)
+        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
+
+/* Range mask
+ * bits check
+ */
+
+L(RANGEMASK_CHECK):
+        btl       %r12d, %r13d
+
+/* Call scalar math function */
+        jc        L(SCALAR_MATH_CALL)
+
+/* Special inputs
+ * processing loop
+ */
+
+L(SPECIAL_VALUES_LOOP):
+        incl      %r12d
+        cmpl      $8, %r12d
+
+/* Check bits in range mask */
+        jl        L(RANGEMASK_CHECK)
+        movq      16(%rsp), %r12
+        cfi_restore(12)
+        movq      8(%rsp), %r13
+        cfi_restore(13)
+        movq      (%rsp), %r14
+        cfi_restore(14)
+        vmovups   128(%rsp), %zmm0
+
+/* Go to exit */
+        jmp       L(EXIT)
+        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
+        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
+        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
+
+/* Scalar math function call
+ * to process special input
+ */
+
+L(SCALAR_MATH_CALL):
+        movl      %r12d, %r14d
+        movsd     64(%rsp,%r14,8), %xmm0
+        call      acos@PLT
+        movsd     %xmm0, 128(%rsp,%r14,8)
+
+/* Process special inputs in loop */
+        jmp       L(SPECIAL_VALUES_LOOP)
+
+END(_ZGVeN8v_acos_skx)
+
+        .section .rodata, "a"
+        .align 64
+
+#ifdef __svml_dacos_data_internal_typedef
+typedef unsigned int VUINT32;
+typedef struct {
+        __declspec(align(64)) VUINT32 SgnBit[8][2];
+        __declspec(align(64)) VUINT32 OneHalf[8][2];
+        __declspec(align(64)) VUINT32 SmallNorm[8][2];
+        __declspec(align(64)) VUINT32 MOne[8][2];
+        __declspec(align(64)) VUINT32 Two[8][2];
+        __declspec(align(64)) VUINT32 sqrt_coeff[4][8][2];
+        __declspec(align(64)) VUINT32 poly_coeff[12][8][2];
+        __declspec(align(64)) VUINT32 PiH[8][2];
+        __declspec(align(64)) VUINT32 Pi2H[8][2];
+} __svml_dacos_data_internal;
+#endif
+__svml_dacos_data_internal:
+        /*== SgnBit ==*/
+        .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
+        /*== OneHalf ==*/
+        .align 64
+        .quad 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000
+        /*== SmallNorm ==*/
+        .align 64
+        .quad 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000
+        /*== MOne ==*/
+        .align 64
+        .quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
+        /*== Two ==*/
+        .align 64
+        .quad 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000
+        /*== sqrt_coeff[4] ==*/
+        .align 64
+        .quad 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */
+        .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */
+        .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */
+        .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */
+        /*== poly_coeff[12] ==*/
+        .align 64
+        .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */
+        .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */
+        .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */
+        .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */
+        .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */
+        .quad 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */
+        .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */
+        .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */
+        .quad 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */
+        .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */
+        .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */
+        .quad 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */
+        /*== PiH ==*/
+        .align 64
+        .quad 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18
+        /*== Pi2H ==*/
+        .align 64
+        .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
+        .align 64
+        .type	__svml_dacos_data_internal,@object
+        .size	__svml_dacos_data_internal,.-__svml_dacos_data_internal
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S
new file mode 100644
index 0000000000..1ff0cfc8d5
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S
@@ -0,0 +1,20 @@ 
+/* AVX2 version of vectorized acosf, vector length is 16.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define _ZGVeN16v_acosf _ZGVeN16v_acosf_avx2_wrapper
+#include "../svml_s_acosf16_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c
new file mode 100644
index 0000000000..fcf05782c5
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c
@@ -0,0 +1,28 @@ 
+/* Multiple versions of vectorized acosf, vector length is 16.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define SYMBOL_NAME _ZGVeN16v_acosf
+#include "ifunc-mathvec-avx512-skx.h"
+
+libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
+
+#ifdef SHARED
+__hidden_ver1 (_ZGVeN16v_acosf, __GI__ZGVeN16v_acosf,
+	       __redirect__ZGVeN16v_acosf)
+  __attribute__ ((visibility ("hidden")));
+#endif
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
new file mode 100644
index 0000000000..1db2969c77
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
@@ -0,0 +1,262 @@ 
+/* Function acosf vectorized with AVX-512.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ *      SelMask = (|x| >= 0.5) ? 1 : 0;
+ *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
+ *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
+ *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
+ *
+ *
+ */
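+
+/* A single-precision scalar sketch of the same split, for reference.
+ * Illustrative only: asinf () stands in for the short polynomial whose
+ * coefficients (poly_coeff[5]) are in the data table below.
+ *
+ *   #include <math.h>
+ *
+ *   static float
+ *   acosf_sketch (float x)
+ *   {
+ *     float ax = fabsf (x);
+ *     int sel = ax >= 0.5f;                           // SelMask
+ *     float r = sel ? sqrtf (0.5f - 0.5f * ax) : ax;  // R
+ *     float acos_ax = sel ? 2.0f * asinf (r)          // 2*Poly(R)
+ *                         : (float) M_PI_2 - asinf (r);
+ *     return x < 0.0f ? (float) M_PI - acos_ax : acos_ax;
+ *   }
+ */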
+
+/* Offsets for data table __svml_sacos_data_internal
+ */
+#define SgnBit                        	0
+#define OneHalf                       	64
+#define SmallNorm                     	128
+#define MOne                          	192
+#define Two                           	256
+#define sqrt_coeff_1                  	320
+#define sqrt_coeff_2                  	384
+#define poly_coeff_1                  	448
+#define poly_coeff_2                  	512
+#define poly_coeff_3                  	576
+#define poly_coeff_4                  	640
+#define poly_coeff_5                  	704
+#define Pi2H                          	768
+#define PiH                           	832
+
+#include <sysdep.h>
+
+        .text
+	.section .text.evex512,"ax",@progbits
+ENTRY(_ZGVeN16v_acosf_skx)
+        pushq     %rbp
+        cfi_def_cfa_offset(16)
+        movq      %rsp, %rbp
+        cfi_def_cfa(6, 16)
+        cfi_offset(6, -16)
+        andq      $-64, %rsp
+        subq      $192, %rsp
+        vmovups   __svml_sacos_data_internal(%rip), %zmm5
+        vmovups   OneHalf+__svml_sacos_data_internal(%rip), %zmm6
+
+/* SQ ~ 2*sqrt(Y) */
+        vmovups   SmallNorm+__svml_sacos_data_internal(%rip), %zmm9
+        vmovups   MOne+__svml_sacos_data_internal(%rip), %zmm8
+        vmovups   Two+__svml_sacos_data_internal(%rip), %zmm12
+        vmovups   sqrt_coeff_1+__svml_sacos_data_internal(%rip), %zmm13
+        vmovaps   %zmm0, %zmm4
+
+/* x = -|arg| */
+        vorps     %zmm4, %zmm5, %zmm3
+        vandps    %zmm4, %zmm5, %zmm2
+        vmovups   sqrt_coeff_2+__svml_sacos_data_internal(%rip), %zmm0
+
+/* Y = 0.5 + 0.5*(-x) */
+        vfmadd231ps {rn-sae}, %zmm3, %zmm6, %zmm6
+
+/* x^2 */
+        vmulps    {rn-sae}, %zmm3, %zmm3, %zmm7
+        vrsqrt14ps %zmm6, %zmm10
+        vcmpps    $17, {sae}, %zmm9, %zmm6, %k1
+        vcmpps    $22, {sae}, %zmm3, %zmm8, %k0
+        vmovups   poly_coeff_4+__svml_sacos_data_internal(%rip), %zmm9
+        vminps    {sae}, %zmm6, %zmm7, %zmm1
+        vmovups   poly_coeff_3+__svml_sacos_data_internal(%rip), %zmm7
+        vxorps    %zmm10, %zmm10, %zmm10{%k1}
+        vaddps    {rn-sae}, %zmm6, %zmm6, %zmm14
+        vmulps    {rn-sae}, %zmm1, %zmm1, %zmm8
+        vmulps    {rn-sae}, %zmm10, %zmm10, %zmm11
+        vmulps    {rn-sae}, %zmm10, %zmm14, %zmm5
+        vcmpps    $21, {sae}, %zmm6, %zmm1, %k4
+
+/* X<X^2 iff X<0 */
+        vcmpps    $17, {sae}, %zmm1, %zmm4, %k2
+
+/* polynomial */
+        vmovups   poly_coeff_1+__svml_sacos_data_internal(%rip), %zmm6
+        vfmsub213ps {rn-sae}, %zmm12, %zmm11, %zmm14
+        vmovups   poly_coeff_2+__svml_sacos_data_internal(%rip), %zmm11
+        vfmadd231ps {rn-sae}, %zmm1, %zmm7, %zmm9
+        vmovups   poly_coeff_5+__svml_sacos_data_internal(%rip), %zmm10
+        vmovups   Pi2H+__svml_sacos_data_internal(%rip), %zmm12
+        vfmadd231ps {rn-sae}, %zmm14, %zmm13, %zmm0
+        vfmadd231ps {rn-sae}, %zmm1, %zmm6, %zmm11
+        vmulps    {rn-sae}, %zmm14, %zmm5, %zmm15
+        vfmadd213ps {rn-sae}, %zmm9, %zmm8, %zmm11
+        vxorps    %zmm12, %zmm12, %zmm12{%k4}
+        vfnmadd213ps {rn-sae}, %zmm5, %zmm15, %zmm0
+        vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm11
+        kmovw     %k4, %eax
+        kmovw     %k2, %ecx
+        kmovw     %k0, %edx
+        vmulps    {rn-sae}, %zmm1, %zmm11, %zmm13
+        vblendmps %zmm0, %zmm3, %zmm0{%k4}
+        vxorps    %zmm2, %zmm0, %zmm1
+        andl      %eax, %ecx
+        kmovw     %ecx, %k3
+        vfmadd213ps {rn-sae}, %zmm1, %zmm1, %zmm13
+        vorps     PiH+__svml_sacos_data_internal(%rip), %zmm12, %zmm12{%k3}
+        vaddps    {rn-sae}, %zmm13, %zmm12, %zmm0
+        testl     %edx, %edx
+
+/* Go to special inputs processing branch */
+        jne       L(SPECIAL_VALUES_BRANCH)
+
+/* Restore registers
+ * and exit the function
+ */
+
+L(EXIT):
+        movq      %rbp, %rsp
+        popq      %rbp
+        cfi_def_cfa(7, 8)
+        cfi_restore(6)
+        ret
+        cfi_def_cfa(6, 16)
+        cfi_offset(6, -16)
+
+/* Branch to process
+ * special inputs
+ */
+
+L(SPECIAL_VALUES_BRANCH):
+        vmovups   %zmm4, 64(%rsp)
+        vmovups   %zmm0, 128(%rsp)
+        xorl      %eax, %eax
+        vzeroupper
+        movq      %r12, 16(%rsp)
+        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
+        movl      %eax, %r12d
+        movq      %r13, 8(%rsp)
+        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
+        movl      %edx, %r13d
+        movq      %r14, (%rsp)
+        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
+
+/* Range mask
+ * bits check
+ */
+
+L(RANGEMASK_CHECK):
+        btl       %r12d, %r13d
+
+/* Call scalar math function */
+        jc        L(SCALAR_MATH_CALL)
+
+/* Special inputs
+ * processing loop
+ */
+
+L(SPECIAL_VALUES_LOOP):
+        incl      %r12d
+        cmpl      $16, %r12d
+
+/* Check bits in range mask */
+        jl        L(RANGEMASK_CHECK)
+        movq      16(%rsp), %r12
+        cfi_restore(12)
+        movq      8(%rsp), %r13
+        cfi_restore(13)
+        movq      (%rsp), %r14
+        cfi_restore(14)
+        vmovups   128(%rsp), %zmm0
+
+/* Go to exit */
+        jmp       L(EXIT)
+        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
+        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
+        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
+
+/* Scalar math function call
+ * to process special input
+ */
+
+L(SCALAR_MATH_CALL):
+        movl      %r12d, %r14d
+        movss     64(%rsp,%r14,4), %xmm0
+        call      acosf@PLT
+        movss     %xmm0, 128(%rsp,%r14,4)
+
+/* Process special inputs in loop */
+        jmp       L(SPECIAL_VALUES_LOOP)
+
+END(_ZGVeN16v_acosf_skx)
+
+        .section .rodata, "a"
+        .align 64
+
+#ifdef __svml_sacos_data_internal_typedef
+typedef unsigned int VUINT32;
+typedef struct {
+        __declspec(align(64)) VUINT32 SgnBit[16][1];
+        __declspec(align(64)) VUINT32 OneHalf[16][1];
+        __declspec(align(64)) VUINT32 SmallNorm[16][1];
+        __declspec(align(64)) VUINT32 MOne[16][1];
+        __declspec(align(64)) VUINT32 Two[16][1];
+        __declspec(align(64)) VUINT32 sqrt_coeff[2][16][1];
+        __declspec(align(64)) VUINT32 poly_coeff[5][16][1];
+        __declspec(align(64)) VUINT32 Pi2H[16][1];
+        __declspec(align(64)) VUINT32 PiH[16][1];
+} __svml_sacos_data_internal;
+#endif
+__svml_sacos_data_internal:
+        /*== SgnBit ==*/
+        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
+        /*== OneHalf ==*/
+        .align 64
+        .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
+        /*== SmallNorm ==*/
+        .align 64
+        .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
+        /*== MOne ==*/
+        .align 64
+        .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
+        /*== Two ==*/
+        .align 64
+        .long 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000
+        /*== sqrt_coeff[2] ==*/
+        .align 64
+        .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */
+        .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */
+        /*== poly_coeff[5] ==*/
+        .align 64
+        .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
+        .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
+        .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
+        .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
+        .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */
+        /*== Pi2H ==*/
+        .align 64
+        .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
+        /*== PiH ==*/
+        .align 64
+        .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB
+        .align 64
+        .type	__svml_sacos_data_internal,@object
+        .size	__svml_sacos_data_internal,.-__svml_sacos_data_internal
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S
new file mode 100644
index 0000000000..f94b3eb01a
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S
@@ -0,0 +1,20 @@ 
+/* SSE2 version of vectorized acosf, vector length is 4.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define _ZGVbN4v_acosf _ZGVbN4v_acosf_sse2
+#include "../svml_s_acosf4_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c
new file mode 100644
index 0000000000..6f9a5c1082
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c
@@ -0,0 +1,28 @@ 
+/* Multiple versions of vectorized acosf, vector length is 4.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define SYMBOL_NAME _ZGVbN4v_acosf
+#include "ifunc-mathvec-sse4_1.h"
+
+libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
+
+#ifdef SHARED
+__hidden_ver1 (_ZGVbN4v_acosf, __GI__ZGVbN4v_acosf,
+	       __redirect__ZGVbN4v_acosf)
+  __attribute__ ((visibility ("hidden")));
+#endif
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S
new file mode 100644
index 0000000000..fe0c94aeb5
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S
@@ -0,0 +1,260 @@ 
+/* Function acosf vectorized with SSE4.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ *      SelMask = (|x| >= 0.5) ? 1 : 0;
+ *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
+ *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
+ *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
+ *
+ *
+ */
+
+/* Offsets for data table __svml_sacos_data_internal
+ */
+#define SgnBit                        	0
+#define OneHalf                       	16
+#define SmallNorm                     	32
+#define MOne                          	48
+#define Two                           	64
+#define sqrt_coeff                    	80
+#define poly_coeff                    	112
+#define Pi2H                          	192
+#define PiH                           	208
+
+#include <sysdep.h>
+
+        .text
+	.section .text.sse4,"ax",@progbits
+ENTRY(_ZGVbN4v_acosf_sse4)
+        subq      $72, %rsp
+        cfi_def_cfa_offset(80)
+
+/* X<X^2 iff X<0 */
+        movaps    %xmm0, %xmm14
+
+/*
+ * 2*sqrt(X) ~ Sh - Sl  (to 24+ bits)
+ * SQ ~ 2*sqrt(X)
+ */
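+/* In outline, sqrt (Y) is formed from the hardware reciprocal square-root
+ * estimate plus one polynomial correction in its relative error (the
+ * sqrt_coeff constants below); an illustrative sketch:
+ *
+ *   e = rsqrt (y);               // rsqrtps estimate of 1/sqrt(y)
+ *   s = (2 * y) * e;             // ~ 2*sqrt(y)
+ *   h = (2 * y) * e * e - 2;     // relative error term of the estimate
+ *   s = s - s * h * P (h);       // corrected; P uses sqrt_coeff
+ */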
+        movups    __svml_sacos_data_internal(%rip), %xmm3
+        movups    OneHalf+__svml_sacos_data_internal(%rip), %xmm5
+
+/* x = -|arg| */
+        movaps    %xmm3, %xmm4
+        orps      %xmm0, %xmm4
+
+/* Y = 0.5 + 0.5*(-x) */
+        movaps    %xmm5, %xmm6
+        mulps     %xmm4, %xmm6
+
+/* x^2 */
+        movaps    %xmm4, %xmm13
+        mulps     %xmm4, %xmm13
+        addps     %xmm6, %xmm5
+
+/* SQ ~ 2*sqrt(Y) */
+        rsqrtps   %xmm5, %xmm8
+        minps     %xmm5, %xmm13
+        movaps    %xmm5, %xmm2
+        movaps    %xmm13, %xmm1
+        cmpltps   SmallNorm+__svml_sacos_data_internal(%rip), %xmm2
+        cmpnltps  %xmm5, %xmm1
+        cmpltps   %xmm13, %xmm14
+        addps     %xmm5, %xmm5
+        andnps    %xmm8, %xmm2
+        movaps    %xmm13, %xmm11
+        movaps    %xmm2, %xmm9
+        movaps    %xmm1, %xmm6
+        mulps     %xmm2, %xmm9
+        andnps    %xmm4, %xmm6
+        mulps     %xmm5, %xmm2
+        mulps     %xmm13, %xmm11
+        mulps     %xmm9, %xmm5
+        movups    sqrt_coeff+__svml_sacos_data_internal(%rip), %xmm10
+        andps     %xmm0, %xmm3
+
+/* polynomial */
+        movups    poly_coeff+__svml_sacos_data_internal(%rip), %xmm12
+        movaps    %xmm1, %xmm15
+        mulps     %xmm13, %xmm12
+        subps     Two+__svml_sacos_data_internal(%rip), %xmm5
+        mulps     %xmm5, %xmm10
+        addps     poly_coeff+16+__svml_sacos_data_internal(%rip), %xmm12
+        mulps     %xmm2, %xmm5
+        mulps     %xmm11, %xmm12
+        addps     sqrt_coeff+16+__svml_sacos_data_internal(%rip), %xmm10
+        mulps     %xmm5, %xmm10
+        movups    poly_coeff+32+__svml_sacos_data_internal(%rip), %xmm5
+        subps     %xmm10, %xmm2
+        mulps     %xmm13, %xmm5
+        movups    MOne+__svml_sacos_data_internal(%rip), %xmm7
+        andps     %xmm1, %xmm2
+        cmpnleps  %xmm4, %xmm7
+        addps     poly_coeff+48+__svml_sacos_data_internal(%rip), %xmm5
+        movmskps  %xmm7, %edx
+        orps      %xmm2, %xmm6
+        addps     %xmm12, %xmm5
+        mulps     %xmm13, %xmm5
+        pxor      %xmm3, %xmm6
+        movups    PiH+__svml_sacos_data_internal(%rip), %xmm7
+        andps     %xmm1, %xmm7
+        addps     poly_coeff+64+__svml_sacos_data_internal(%rip), %xmm5
+        mulps     %xmm13, %xmm5
+        andps     %xmm14, %xmm7
+        mulps     %xmm6, %xmm5
+        andnps    Pi2H+__svml_sacos_data_internal(%rip), %xmm15
+        addps     %xmm5, %xmm6
+        addps     %xmm15, %xmm7
+        addps     %xmm6, %xmm7
+        testl     %edx, %edx
+
+/* Go to special inputs processing branch */
+        jne       L(SPECIAL_VALUES_BRANCH)
+
+/* Restore registers
+ * and exit the function
+ */
+
+L(EXIT):
+        movaps    %xmm7, %xmm0
+        addq      $72, %rsp
+        cfi_def_cfa_offset(8)
+        ret
+        cfi_def_cfa_offset(80)
+
+/* Branch to process
+ * special inputs
+ */
+
+L(SPECIAL_VALUES_BRANCH):
+        movups    %xmm0, 32(%rsp)
+        movups    %xmm7, 48(%rsp)
+        xorl      %eax, %eax
+        movq      %r12, 16(%rsp)
+        cfi_offset(12, -64)
+        movl      %eax, %r12d
+        movq      %r13, 8(%rsp)
+        cfi_offset(13, -72)
+        movl      %edx, %r13d
+        movq      %r14, (%rsp)
+        cfi_offset(14, -80)
+
+/* Range mask
+ * bits check
+ */
+
+L(RANGEMASK_CHECK):
+        btl       %r12d, %r13d
+
+/* Call scalar math function */
+        jc        L(SCALAR_MATH_CALL)
+
+/* Special inputs
+ * processing loop
+ */
+
+L(SPECIAL_VALUES_LOOP):
+        incl      %r12d
+        cmpl      $4, %r12d
+
+/* Check bits in range mask */
+        jl        L(RANGEMASK_CHECK)
+        movq      16(%rsp), %r12
+        cfi_restore(12)
+        movq      8(%rsp), %r13
+        cfi_restore(13)
+        movq      (%rsp), %r14
+        cfi_restore(14)
+        movups    48(%rsp), %xmm7
+
+/* Go to exit */
+        jmp       L(EXIT)
+        cfi_offset(12, -64)
+        cfi_offset(13, -72)
+        cfi_offset(14, -80)
+
+/* Scalar math function call
+ * to process special input
+ */
+
+L(SCALAR_MATH_CALL):
+        movl      %r12d, %r14d
+        movss     32(%rsp,%r14,4), %xmm0
+        call      acosf@PLT
+        movss     %xmm0, 48(%rsp,%r14,4)
+
+/* Process special inputs in loop */
+        jmp       L(SPECIAL_VALUES_LOOP)
+
+END(_ZGVbN4v_acosf_sse4)
+
+        .section .rodata, "a"
+        .align 16
+
+#ifdef __svml_sacos_data_internal_typedef
+typedef unsigned int VUINT32;
+typedef struct {
+        __declspec(align(16)) VUINT32 SgnBit[4][1];
+        __declspec(align(16)) VUINT32 OneHalf[4][1];
+        __declspec(align(16)) VUINT32 SmallNorm[4][1];
+        __declspec(align(16)) VUINT32 MOne[4][1];
+        __declspec(align(16)) VUINT32 Two[4][1];
+        __declspec(align(16)) VUINT32 sqrt_coeff[2][4][1];
+        __declspec(align(16)) VUINT32 poly_coeff[5][4][1];
+        __declspec(align(16)) VUINT32 Pi2H[4][1];
+        __declspec(align(16)) VUINT32 PiH[4][1];
+} __svml_sacos_data_internal;
+#endif
+__svml_sacos_data_internal:
+        /*== SgnBit ==*/
+        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
+        /*== OneHalf ==*/
+        .align 16
+        .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
+        /*== SmallNorm ==*/
+        .align 16
+        .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
+        /*== MOne ==*/
+        .align 16
+        .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
+        /*== Two ==*/
+        .align 16
+        .long 0x40000000, 0x40000000, 0x40000000, 0x40000000
+        /*== sqrt_coeff[2] ==*/
+        .align 16
+        .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */
+        .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */
+        /*== poly_coeff[5] ==*/
+        .align 16
+        .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
+        .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
+        .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
+        .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
+        .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */
+        /*== Pi2H ==*/
+        .align 16
+        .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
+        /*== PiH ==*/
+        .align 16
+        .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB
+        .align 16
+        .type	__svml_sacos_data_internal,@object
+        .size	__svml_sacos_data_internal,.-__svml_sacos_data_internal
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S
new file mode 100644
index 0000000000..583ef54fee
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S
@@ -0,0 +1,20 @@ 
+/* SSE version of vectorized acosf, vector length is 8.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define _ZGVdN8v_acosf _ZGVdN8v_acosf_sse_wrapper
+#include "../svml_s_acosf8_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c
new file mode 100644
index 0000000000..dd360a9479
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c
@@ -0,0 +1,28 @@ 
+/* Multiple versions of vectorized acosf, vector length is 8.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define SYMBOL_NAME _ZGVdN8v_acosf
+#include "ifunc-mathvec-avx2.h"
+
+libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ());
+
+#ifdef SHARED
+__hidden_ver1 (_ZGVdN8v_acosf, __GI__ZGVdN8v_acosf,
+	       __redirect__ZGVdN8v_acosf)
+  __attribute__ ((visibility ("hidden")));
+#endif
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
new file mode 100644
index 0000000000..2b6dd2c2c2
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
@@ -0,0 +1,252 @@ 
+/* Function acosf vectorized with AVX2.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * ALGORITHM DESCRIPTION:
+ *
+ *      SelMask = (|x| >= 0.5) ? 1 : 0;
+ *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
+ *      acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
+ *      acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
+ *
+ *
+ */
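+
+/* Note on the "x = -|arg|" step below: with mx = -|x| the square-root
+ * argument of the description above becomes a single fma,
+ *
+ *   y = 0.5 + 0.5 * mx;          // == 0.5 - 0.5*|x|
+ *
+ * and R = sqrt (y) in the |x| >= 0.5 branch.  */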
+
+/* Offsets for data table __svml_sacos_data_internal
+ */
+#define SgnBit                        	0
+#define OneHalf                       	32
+#define SmallNorm                     	64
+#define MOne                          	96
+#define Two                           	128
+#define sqrt_coeff                    	160
+#define poly_coeff                    	224
+#define Pi2H                          	384
+#define PiH                           	416
+
+#include <sysdep.h>
+
+        .text
+	.section .text.avx2,"ax",@progbits
+ENTRY(_ZGVdN8v_acosf_avx2)
+        pushq     %rbp
+        cfi_def_cfa_offset(16)
+        movq      %rsp, %rbp
+        cfi_def_cfa(6, 16)
+        cfi_offset(6, -16)
+        andq      $-32, %rsp
+        subq      $96, %rsp
+
+/*
+ * 2*sqrt(X) ~ Sh - Sl  (to 24+ bits)
+ * SQ ~ 2*sqrt(X)
+ */
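+
+/* One plausible reading of the square-root refinement used further
+ * down, with R ~ rsqrtps(Y) and c1/c2 taken from sqrt_coeff:
+ *
+ *      SQ = 2*Y*R;                      // ~ 2*sqrt(Y)
+ *      E  = 2*Y*R*R - 2;                // residual of the estimate
+ *      SQ = SQ - SQ*E*(c1 + c2*E);      // corrected 2*sqrt(Y)
+ */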
+        vmovups   __svml_sacos_data_internal(%rip), %ymm6
+        vmovups   OneHalf+__svml_sacos_data_internal(%rip), %ymm7
+        vmovaps   %ymm0, %ymm5
+
+/* x = -|arg| */
+        vorps     %ymm5, %ymm6, %ymm4
+
+/* Y = 0.5 + 0.5*(-x) */
+        vfmadd231ps %ymm4, %ymm7, %ymm7
+
+/* x^2 */
+        vmulps    %ymm4, %ymm4, %ymm8
+
+/* SQ ~ 2*sqrt(Y) */
+        vmovups   sqrt_coeff+__svml_sacos_data_internal(%rip), %ymm0
+        vcmpnge_uqps MOne+__svml_sacos_data_internal(%rip), %ymm4, %ymm9
+        vcmplt_oqps SmallNorm+__svml_sacos_data_internal(%rip), %ymm7, %ymm10
+        vminps    %ymm7, %ymm8, %ymm2
+        vaddps    %ymm7, %ymm7, %ymm14
+        vrsqrtps  %ymm7, %ymm11
+        vmovups   poly_coeff+64+__svml_sacos_data_internal(%rip), %ymm8
+        vcmpnlt_uqps %ymm7, %ymm2, %ymm1
+        vmulps    %ymm2, %ymm2, %ymm7
+        vfmadd213ps poly_coeff+96+__svml_sacos_data_internal(%rip), %ymm2, %ymm8
+        vmovmskps %ymm9, %edx
+
+/* polynomial */
+        vmovups   poly_coeff+__svml_sacos_data_internal(%rip), %ymm9
+        vandnps   %ymm11, %ymm10, %ymm12
+        vmulps    %ymm12, %ymm12, %ymm13
+        vfmadd213ps poly_coeff+32+__svml_sacos_data_internal(%rip), %ymm2, %ymm9
+
+/* X<X^2 iff X<0 */
+        vcmplt_oqps %ymm2, %ymm5, %ymm10
+        vfmadd213ps %ymm8, %ymm7, %ymm9
+        vandps    %ymm5, %ymm6, %ymm3
+        vmulps    %ymm14, %ymm12, %ymm6
+        vfmsub213ps Two+__svml_sacos_data_internal(%rip), %ymm13, %ymm14
+        vfmadd213ps poly_coeff+128+__svml_sacos_data_internal(%rip), %ymm2, %ymm9
+        vfmadd213ps sqrt_coeff+32+__svml_sacos_data_internal(%rip), %ymm14, %ymm0
+        vmulps    %ymm14, %ymm6, %ymm15
+        vmulps    %ymm9, %ymm2, %ymm14
+        vfnmadd213ps %ymm6, %ymm15, %ymm0
+        vblendvps %ymm1, %ymm0, %ymm4, %ymm0
+        vandps    PiH+__svml_sacos_data_internal(%rip), %ymm1, %ymm2
+        vandnps   Pi2H+__svml_sacos_data_internal(%rip), %ymm1, %ymm12
+        vxorps    %ymm3, %ymm0, %ymm1
+        vfmadd213ps %ymm1, %ymm1, %ymm14
+        vandps    %ymm10, %ymm2, %ymm11
+        vaddps    %ymm12, %ymm11, %ymm13
+        vaddps    %ymm14, %ymm13, %ymm0
+        testl     %edx, %edx
+
+/* Go to special inputs processing branch */
+        jne       L(SPECIAL_VALUES_BRANCH)
+
+/* Restore registers
+ * and exit the function
+ */
+
+L(EXIT):
+        movq      %rbp, %rsp
+        popq      %rbp
+        cfi_def_cfa(7, 8)
+        cfi_restore(6)
+        ret
+        cfi_def_cfa(6, 16)
+        cfi_offset(6, -16)
+
+/* Branch to process
+ * special inputs
+ */
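+/* In effect (an illustrative sketch): the input and the partial result
+ * are spilled to the stack, and every lane whose bit is set in the
+ * range mask in %edx is recomputed with the scalar routine:
+ *
+ *      for (i = 0; i < 8; i++)
+ *        if ((mask >> i) & 1)
+ *          out[i] = acosf (in[i]);
+ */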
+
+L(SPECIAL_VALUES_BRANCH):
+        vmovups   %ymm5, 32(%rsp)
+        vmovups   %ymm0, 64(%rsp)
+        xorl      %eax, %eax
+        vzeroupper
+        movq      %r12, 16(%rsp)
+        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
+        movl      %eax, %r12d
+        movq      %r13, 8(%rsp)
+        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
+        movl      %edx, %r13d
+        movq      %r14, (%rsp)
+        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
+
+/* Range mask
+ * bits check
+ */
+
+L(RANGEMASK_CHECK):
+        btl       %r12d, %r13d
+
+/* Call scalar math function */
+        jc        L(SCALAR_MATH_CALL)
+
+/* Special inputs
+ * processing loop
+ */
+
+L(SPECIAL_VALUES_LOOP):
+        incl      %r12d
+        cmpl      $8, %r12d
+
+/* Check bits in range mask */
+        jl        L(RANGEMASK_CHECK)
+        movq      16(%rsp), %r12
+        cfi_restore(12)
+        movq      8(%rsp), %r13
+        cfi_restore(13)
+        movq      (%rsp), %r14
+        cfi_restore(14)
+        vmovups   64(%rsp), %ymm0
+
+/* Go to exit */
+        jmp       L(EXIT)
+        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
+        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
+        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
+        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
+
+/* Scalar math function call
+ * to process special input
+ */
+
+L(SCALAR_MATH_CALL):
+        movl      %r12d, %r14d
+        movss     32(%rsp,%r14,4), %xmm0
+        call      acosf@PLT
+        movss     %xmm0, 64(%rsp,%r14,4)
+
+/* Process special inputs in loop */
+        jmp       L(SPECIAL_VALUES_LOOP)
+
+END(_ZGVdN8v_acosf_avx2)
+
+        .section .rodata, "a"
+        .align 32
+
+#ifdef __svml_sacos_data_internal_typedef
+typedef unsigned int VUINT32;
+typedef struct {
+        __declspec(align(32)) VUINT32 SgnBit[8][1];
+        __declspec(align(32)) VUINT32 OneHalf[8][1];
+        __declspec(align(32)) VUINT32 SmallNorm[8][1];
+        __declspec(align(32)) VUINT32 MOne[8][1];
+        __declspec(align(32)) VUINT32 Two[8][1];
+        __declspec(align(32)) VUINT32 sqrt_coeff[2][8][1];
+        __declspec(align(32)) VUINT32 poly_coeff[5][8][1];
+        __declspec(align(32)) VUINT32 Pi2H[8][1];
+        __declspec(align(32)) VUINT32 PiH[8][1];
+} __svml_sacos_data_internal;
+#endif
+__svml_sacos_data_internal:
+        /*== SgnBit ==*/
+        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
+        /*== OneHalf ==*/
+        .align 32
+        .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
+        /*== SmallNorm ==*/
+        .align 32
+        .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
+        /*== MOne ==*/
+        .align 32
+        .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
+        /*== Two ==*/
+        .align 32
+        .long 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000
+        /*== sqrt_coeff[2] ==*/
+        .align 32
+        .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */
+        .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */
+        /*== poly_coeff[5] ==*/
+        .align 32
+        .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
+        .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
+        .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
+        .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
+        .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */
+        /*== Pi2H ==*/
+        .align 32
+        .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
+        /*== PiH ==*/
+        .align 32
+        .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB
+        .align 32
+        .type	__svml_sacos_data_internal,@object
+        .size	__svml_sacos_data_internal,.-__svml_sacos_data_internal
diff --git a/sysdeps/x86_64/fpu/svml_d_acos2_core.S b/sysdeps/x86_64/fpu/svml_d_acos2_core.S
new file mode 100644
index 0000000000..9656478b2d
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_d_acos2_core.S
@@ -0,0 +1,29 @@ 
+/* Function acos vectorized with SSE2.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "svml_d_wrapper_impl.h"
+
+	.text
+ENTRY (_ZGVbN2v_acos)
+WRAPPER_IMPL_SSE2 acos
+END (_ZGVbN2v_acos)
+
+#ifndef USE_MULTIARCH
+ libmvec_hidden_def (_ZGVbN2v_acos)
+#endif
diff --git a/sysdeps/x86_64/fpu/svml_d_acos4_core.S b/sysdeps/x86_64/fpu/svml_d_acos4_core.S
new file mode 100644
index 0000000000..e99cb4ae78
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_d_acos4_core.S
@@ -0,0 +1,29 @@ 
+/* Function acos vectorized with AVX2, wrapper version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "svml_d_wrapper_impl.h"
+
+	.text
+ENTRY (_ZGVdN4v_acos)
+WRAPPER_IMPL_AVX _ZGVbN2v_acos
+END (_ZGVdN4v_acos)
+
+#ifndef USE_MULTIARCH
+ libmvec_hidden_def (_ZGVdN4v_acos)
+#endif
diff --git a/sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S b/sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S
new file mode 100644
index 0000000000..7cbcbc965c
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S
@@ -0,0 +1,25 @@ 
+/* Function acos vectorized in AVX ISA as wrapper to SSE4 ISA version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "svml_d_wrapper_impl.h"
+
+	.text
+ENTRY (_ZGVcN4v_acos)
+WRAPPER_IMPL_AVX _ZGVbN2v_acos
+END (_ZGVcN4v_acos)
diff --git a/sysdeps/x86_64/fpu/svml_d_acos8_core.S b/sysdeps/x86_64/fpu/svml_d_acos8_core.S
new file mode 100644
index 0000000000..e26b30d81a
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_d_acos8_core.S
@@ -0,0 +1,25 @@ 
+/* Function acos vectorized with AVX-512, wrapper to AVX2.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "svml_d_wrapper_impl.h"
+
+	.text
+ENTRY (_ZGVeN8v_acos)
+WRAPPER_IMPL_AVX512 _ZGVdN4v_acos
+END (_ZGVeN8v_acos)
diff --git a/sysdeps/x86_64/fpu/svml_s_acosf16_core.S b/sysdeps/x86_64/fpu/svml_s_acosf16_core.S
new file mode 100644
index 0000000000..70e046d492
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_s_acosf16_core.S
@@ -0,0 +1,25 @@ 
+/* Function acosf vectorized with AVX-512. Wrapper to AVX2 version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "svml_s_wrapper_impl.h"
+
+	.text
+ENTRY (_ZGVeN16v_acosf)
+WRAPPER_IMPL_AVX512 _ZGVdN8v_acosf
+END (_ZGVeN16v_acosf)
diff --git a/sysdeps/x86_64/fpu/svml_s_acosf4_core.S b/sysdeps/x86_64/fpu/svml_s_acosf4_core.S
new file mode 100644
index 0000000000..36354b32b5
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_s_acosf4_core.S
@@ -0,0 +1,29 @@ 
+/* Function acosf vectorized with SSE2, wrapper version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "svml_s_wrapper_impl.h"
+
+	.text
+ENTRY (_ZGVbN4v_acosf)
+WRAPPER_IMPL_SSE2 acosf
+END (_ZGVbN4v_acosf)
+
+#ifndef USE_MULTIARCH
+ libmvec_hidden_def (_ZGVbN4v_acosf)
+#endif
diff --git a/sysdeps/x86_64/fpu/svml_s_acosf8_core.S b/sysdeps/x86_64/fpu/svml_s_acosf8_core.S
new file mode 100644
index 0000000000..f08864a511
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_s_acosf8_core.S
@@ -0,0 +1,29 @@ 
+/* Function acosf vectorized with AVX2, wrapper version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "svml_s_wrapper_impl.h"
+
+	.text
+ENTRY (_ZGVdN8v_acosf)
+WRAPPER_IMPL_AVX _ZGVbN4v_acosf
+END (_ZGVdN8v_acosf)
+
+#ifndef USE_MULTIARCH
+ libmvec_hidden_def (_ZGVdN8v_acosf)
+#endif
diff --git a/sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S b/sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S
new file mode 100644
index 0000000000..f3ed4d8e78
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S
@@ -0,0 +1,25 @@ 
+/* Function acosf vectorized in AVX ISA as wrapper to SSE4 ISA version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "svml_s_wrapper_impl.h"
+
+        .text
+ENTRY (_ZGVcN8v_acosf)
+WRAPPER_IMPL_AVX _ZGVbN4v_acosf
+END (_ZGVcN8v_acosf)
diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c
new file mode 100644
index 0000000000..4f74b4260a
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c
@@ -0,0 +1 @@ 
+#include "test-double-libmvec-acos.c"
diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c
new file mode 100644
index 0000000000..4f74b4260a
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c
@@ -0,0 +1 @@ 
+#include "test-double-libmvec-acos.c"
diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c
new file mode 100644
index 0000000000..4f74b4260a
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c
@@ -0,0 +1 @@ 
+#include "test-double-libmvec-acos.c"
diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos.c
new file mode 100644
index 0000000000..e38b8ce821
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos.c
@@ -0,0 +1,3 @@ 
+#define LIBMVEC_TYPE double
+#define LIBMVEC_FUNC acos
+#include "test-vector-abi-arg1.h"
diff --git a/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c
index ed932fc98d..0abc7d2021 100644
--- a/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c
@@ -27,6 +27,7 @@  VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVbN2v_sin)
 VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVbN2v_log)
 VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVbN2v_exp)
 VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVbN2vv_pow)
+VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVbN2v_acos)
 
 #define VEC_INT_TYPE __m128i
 
diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c
index 3a6e37044f..dda093b914 100644
--- a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c
@@ -30,6 +30,7 @@  VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVdN4v_sin)
 VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVdN4v_log)
 VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVdN4v_exp)
 VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVdN4vv_pow)
+VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVdN4v_acos)
 
 #ifndef __ILP32__
 # define VEC_INT_TYPE __m256i
diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c
index 99db4e7616..f3230463bb 100644
--- a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c
@@ -27,6 +27,7 @@  VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVcN4v_sin)
 VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVcN4v_log)
 VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVcN4v_exp)
 VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVcN4vv_pow)
+VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVcN4v_acos)
 
 #define VEC_INT_TYPE __m128i
 
diff --git a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c
index 251d429ac0..cf9f52faf0 100644
--- a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c
@@ -27,6 +27,7 @@  VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVeN8v_sin)
 VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVeN8v_log)
 VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVeN8v_exp)
 VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVeN8vv_pow)
+VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVeN8v_acos)
 
 #ifndef __ILP32__
 # define VEC_INT_TYPE __m512i
diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c
new file mode 100644
index 0000000000..1e6474dfa2
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c
@@ -0,0 +1 @@ 
+#include "test-float-libmvec-acosf.c"
diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c
new file mode 100644
index 0000000000..1e6474dfa2
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c
@@ -0,0 +1 @@ 
+#include "test-float-libmvec-acosf.c"
diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c
new file mode 100644
index 0000000000..1e6474dfa2
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c
@@ -0,0 +1 @@ 
+#include "test-float-libmvec-acosf.c"
diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf.c
new file mode 100644
index 0000000000..fb47f974fd
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf.c
@@ -0,0 +1,3 @@ 
+#define LIBMVEC_TYPE float
+#define LIBMVEC_FUNC acosf
+#include "test-vector-abi-arg1.h"
diff --git a/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
index c1d14cd79e..abbd3ed870 100644
--- a/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
@@ -27,6 +27,7 @@  VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVeN16v_sinf)
 VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVeN16v_logf)
 VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVeN16v_expf)
 VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVeN16vv_powf)
+VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVeN16v_acosf)
 
 #define VEC_INT_TYPE __m512i
 
diff --git a/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c
index d23c372060..8a24027952 100644
--- a/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c
@@ -27,6 +27,7 @@  VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVbN4v_sinf)
 VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVbN4v_logf)
 VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVbN4v_expf)
 VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVbN4vv_powf)
+VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVbN4v_acosf)
 
 #define VEC_INT_TYPE __m128i
 
diff --git a/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
index 3152cffb0c..aff0442606 100644
--- a/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
@@ -30,6 +30,7 @@  VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVdN8v_sinf)
 VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVdN8v_logf)
 VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVdN8v_expf)
 VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVdN8vv_powf)
+VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVdN8v_acosf)
 
 /* Redefinition of wrapper to be compatible with _ZGVdN8vvv_sincosf.  */
 #undef VECTOR_WRAPPER_fFF
diff --git a/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
index a8492abfef..913584d111 100644
--- a/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
+++ b/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
@@ -27,6 +27,7 @@  VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVcN8v_sinf)
 VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVcN8v_logf)
 VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVcN8v_expf)
 VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVcN8vv_powf)
+VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVcN8v_acosf)
 
 #define VEC_INT_TYPE __m128i