| Message ID | 20211219171809.2912282-2-skpgkp2@gmail.com |
|---|---|
| State | New |
| Series | Add vector math function acos/acosf to libmvec |
On Sun, Dec 19, 2021 at 11:19 AM Sunil K Pandey via Libc-alpha <libc-alpha@sourceware.org> wrote: > > Implement vectorized acos/acosf containing SSE, AVX, AVX2 and > AVX512 versions for libmvec as per vector ABI. It also contains > accuracy and ABI tests for vector acos/acosf with regenerated ulps. > --- Have a few small comments but generally okay with a patch like this one going out in 2.35. > bits/libm-simd-decl-stubs.h | 11 + > math/bits/mathcalls.h | 2 +- > .../unix/sysv/linux/x86_64/libmvec.abilist | 8 + > sysdeps/x86/fpu/bits/math-vector.h | 4 + > .../x86/fpu/finclude/math-vector-fortran.h | 4 + > sysdeps/x86_64/fpu/Makeconfig | 1 + > sysdeps/x86_64/fpu/Versions | 4 + > sysdeps/x86_64/fpu/libm-test-ulps | 20 ++ > .../fpu/multiarch/ifunc-mathvec-avx512-skx.h | 39 +++ > .../fpu/multiarch/svml_d_acos2_core-sse2.S | 20 ++ > .../x86_64/fpu/multiarch/svml_d_acos2_core.c | 27 ++ > .../fpu/multiarch/svml_d_acos2_core_sse4.S | 293 +++++++++++++++++ > .../fpu/multiarch/svml_d_acos4_core-sse.S | 20 ++ > .../x86_64/fpu/multiarch/svml_d_acos4_core.c | 27 ++ > .../fpu/multiarch/svml_d_acos4_core_avx2.S | 273 ++++++++++++++++ > .../fpu/multiarch/svml_d_acos8_core-avx2.S | 20 ++ > .../x86_64/fpu/multiarch/svml_d_acos8_core.c | 27 ++ > .../fpu/multiarch/svml_d_acos8_core_avx512.S | 298 ++++++++++++++++++ > .../fpu/multiarch/svml_s_acosf16_core-avx2.S | 20 ++ > .../fpu/multiarch/svml_s_acosf16_core.c | 28 ++ > .../multiarch/svml_s_acosf16_core_avx512.S | 262 +++++++++++++++ > .../fpu/multiarch/svml_s_acosf4_core-sse2.S | 20 ++ > .../x86_64/fpu/multiarch/svml_s_acosf4_core.c | 28 ++ > .../fpu/multiarch/svml_s_acosf4_core_sse4.S | 260 +++++++++++++++ > .../fpu/multiarch/svml_s_acosf8_core-sse.S | 20 ++ > .../x86_64/fpu/multiarch/svml_s_acosf8_core.c | 28 ++ > .../fpu/multiarch/svml_s_acosf8_core_avx2.S | 252 +++++++++++++++ > sysdeps/x86_64/fpu/svml_d_acos2_core.S | 29 ++ > sysdeps/x86_64/fpu/svml_d_acos4_core.S | 29 ++ > sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S | 25 ++ > sysdeps/x86_64/fpu/svml_d_acos8_core.S | 25 ++ > sysdeps/x86_64/fpu/svml_s_acosf16_core.S | 25 ++ > sysdeps/x86_64/fpu/svml_s_acosf4_core.S | 29 ++ > sysdeps/x86_64/fpu/svml_s_acosf8_core.S | 29 ++ > sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S | 25 ++ > .../x86_64/fpu/test-double-libmvec-acos-avx.c | 1 + > .../fpu/test-double-libmvec-acos-avx2.c | 1 + > .../fpu/test-double-libmvec-acos-avx512f.c | 1 + > sysdeps/x86_64/fpu/test-double-libmvec-acos.c | 3 + > .../x86_64/fpu/test-double-vlen2-wrappers.c | 1 + > .../fpu/test-double-vlen4-avx2-wrappers.c | 1 + > .../x86_64/fpu/test-double-vlen4-wrappers.c | 1 + > .../x86_64/fpu/test-double-vlen8-wrappers.c | 1 + > .../x86_64/fpu/test-float-libmvec-acosf-avx.c | 1 + > .../fpu/test-float-libmvec-acosf-avx2.c | 1 + > .../fpu/test-float-libmvec-acosf-avx512f.c | 1 + > sysdeps/x86_64/fpu/test-float-libmvec-acosf.c | 3 + > .../x86_64/fpu/test-float-vlen16-wrappers.c | 1 + > .../x86_64/fpu/test-float-vlen4-wrappers.c | 1 + > .../fpu/test-float-vlen8-avx2-wrappers.c | 1 + > .../x86_64/fpu/test-float-vlen8-wrappers.c | 1 + > 51 files changed, 2251 insertions(+), 1 deletion(-) > create mode 100644 sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h > create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S > create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c > create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S > create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S > create mode 100644 
sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c > create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S > create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S > create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c > create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S > create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S > create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c > create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S > create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S > create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c > create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S > create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S > create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c > create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S > create mode 100644 sysdeps/x86_64/fpu/svml_d_acos2_core.S > create mode 100644 sysdeps/x86_64/fpu/svml_d_acos4_core.S > create mode 100644 sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S > create mode 100644 sysdeps/x86_64/fpu/svml_d_acos8_core.S > create mode 100644 sysdeps/x86_64/fpu/svml_s_acosf16_core.S > create mode 100644 sysdeps/x86_64/fpu/svml_s_acosf4_core.S > create mode 100644 sysdeps/x86_64/fpu/svml_s_acosf8_core.S > create mode 100644 sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S > create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c > create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c > create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c > create mode 100644 sysdeps/x86_64/fpu/test-double-libmvec-acos.c > create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c > create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c > create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c > create mode 100644 sysdeps/x86_64/fpu/test-float-libmvec-acosf.c > > diff --git a/bits/libm-simd-decl-stubs.h b/bits/libm-simd-decl-stubs.h > index b80ff332a0..2ccdd1fc53 100644 > --- a/bits/libm-simd-decl-stubs.h > +++ b/bits/libm-simd-decl-stubs.h > @@ -98,4 +98,15 @@ > #define __DECL_SIMD_powf32x > #define __DECL_SIMD_powf64x > #define __DECL_SIMD_powf128x > + > +#define __DECL_SIMD_acos > +#define __DECL_SIMD_acosf > +#define __DECL_SIMD_acosl > +#define __DECL_SIMD_acosf16 > +#define __DECL_SIMD_acosf32 > +#define __DECL_SIMD_acosf64 > +#define __DECL_SIMD_acosf128 > +#define __DECL_SIMD_acosf32x > +#define __DECL_SIMD_acosf64x > +#define __DECL_SIMD_acosf128x > #endif > diff --git a/math/bits/mathcalls.h b/math/bits/mathcalls.h > index da4cf4e10c..2cc6654208 100644 > --- a/math/bits/mathcalls.h > +++ b/math/bits/mathcalls.h > @@ -50,7 +50,7 @@ > /* Trigonometric functions. */ > > /* Arc cosine of X. */ > -__MATHCALL (acos,, (_Mdouble_ __x)); > +__MATHCALL_VEC (acos,, (_Mdouble_ __x)); > /* Arc sine of X. */ > __MATHCALL (asin,, (_Mdouble_ __x)); > /* Arc tangent of X. 
*/ > diff --git a/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist > index 363d4ace1e..b37b55777e 100644 > --- a/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist > +++ b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist > @@ -46,3 +46,11 @@ GLIBC_2.22 _ZGVeN8v_log F > GLIBC_2.22 _ZGVeN8v_sin F > GLIBC_2.22 _ZGVeN8vv_pow F > GLIBC_2.22 _ZGVeN8vvv_sincos F > +GLIBC_2.35 _ZGVbN2v_acos F > +GLIBC_2.35 _ZGVbN4v_acosf F > +GLIBC_2.35 _ZGVcN4v_acos F > +GLIBC_2.35 _ZGVcN8v_acosf F > +GLIBC_2.35 _ZGVdN4v_acos F > +GLIBC_2.35 _ZGVdN8v_acosf F > +GLIBC_2.35 _ZGVeN16v_acosf F > +GLIBC_2.35 _ZGVeN8v_acos F > diff --git a/sysdeps/x86/fpu/bits/math-vector.h b/sysdeps/x86/fpu/bits/math-vector.h > index dc0bfb3705..dabb74cbb9 100644 > --- a/sysdeps/x86/fpu/bits/math-vector.h > +++ b/sysdeps/x86/fpu/bits/math-vector.h > @@ -58,6 +58,10 @@ > # define __DECL_SIMD_pow __DECL_SIMD_x86_64 > # undef __DECL_SIMD_powf > # define __DECL_SIMD_powf __DECL_SIMD_x86_64 > +# undef __DECL_SIMD_acos > +# define __DECL_SIMD_acos __DECL_SIMD_x86_64 > +# undef __DECL_SIMD_acosf > +# define __DECL_SIMD_acosf __DECL_SIMD_x86_64 > > # endif > #endif > diff --git a/sysdeps/x86/fpu/finclude/math-vector-fortran.h b/sysdeps/x86/fpu/finclude/math-vector-fortran.h > index 311bb4e391..4bcbd1fbce 100644 > --- a/sysdeps/x86/fpu/finclude/math-vector-fortran.h > +++ b/sysdeps/x86/fpu/finclude/math-vector-fortran.h > @@ -28,6 +28,8 @@ > !GCC$ builtin (expf) attributes simd (notinbranch) if('x86_64') > !GCC$ builtin (pow) attributes simd (notinbranch) if('x86_64') > !GCC$ builtin (powf) attributes simd (notinbranch) if('x86_64') > +!GCC$ builtin (acos) attributes simd (notinbranch) if('x86_64') > +!GCC$ builtin (acosf) attributes simd (notinbranch) if('x86_64') > > !GCC$ builtin (cos) attributes simd (notinbranch) if('x32') > !GCC$ builtin (cosf) attributes simd (notinbranch) if('x32') > @@ -41,3 +43,5 @@ > !GCC$ builtin (expf) attributes simd (notinbranch) if('x32') > !GCC$ builtin (pow) attributes simd (notinbranch) if('x32') > !GCC$ builtin (powf) attributes simd (notinbranch) if('x32') > +!GCC$ builtin (acos) attributes simd (notinbranch) if('x32') > +!GCC$ builtin (acosf) attributes simd (notinbranch) if('x32') > diff --git a/sysdeps/x86_64/fpu/Makeconfig b/sysdeps/x86_64/fpu/Makeconfig > index b0e3bf7887..7acf1f306c 100644 > --- a/sysdeps/x86_64/fpu/Makeconfig > +++ b/sysdeps/x86_64/fpu/Makeconfig > @@ -22,6 +22,7 @@ postclean-generated += libmvec.mk > > # Define for both math and mathvec directories. 
> libmvec-funcs = \ > + acos \ > cos \ > exp \ > log \ > diff --git a/sysdeps/x86_64/fpu/Versions b/sysdeps/x86_64/fpu/Versions > index 08132045d6..2985fe7ca7 100644 > --- a/sysdeps/x86_64/fpu/Versions > +++ b/sysdeps/x86_64/fpu/Versions > @@ -13,4 +13,8 @@ libmvec { > _ZGVbN4vv_powf; _ZGVcN8vv_powf; _ZGVdN8vv_powf; _ZGVeN16vv_powf; > _ZGVbN4vvv_sincosf; _ZGVcN8vvv_sincosf; _ZGVdN8vvv_sincosf; _ZGVeN16vvv_sincosf; > } > + GLIBC_2.35 { > + _ZGVbN2v_acos; _ZGVcN4v_acos; _ZGVdN4v_acos; _ZGVeN8v_acos; > + _ZGVbN4v_acosf; _ZGVcN8v_acosf; _ZGVdN8v_acosf; _ZGVeN16v_acosf; > + } > } > diff --git a/sysdeps/x86_64/fpu/libm-test-ulps b/sysdeps/x86_64/fpu/libm-test-ulps > index 312575f933..85a568ed29 100644 > --- a/sysdeps/x86_64/fpu/libm-test-ulps > +++ b/sysdeps/x86_64/fpu/libm-test-ulps > @@ -25,6 +25,26 @@ float: 1 > float128: 1 > ldouble: 2 > > +Function: "acos_vlen16": > +float: 1 > + > +Function: "acos_vlen2": > +double: 1 > + > +Function: "acos_vlen4": > +double: 1 > +float: 2 > + > +Function: "acos_vlen4_avx2": > +double: 1 > + > +Function: "acos_vlen8": > +double: 1 > +float: 2 > + > +Function: "acos_vlen8_avx2": > +float: 1 > + > Function: "acosh": > double: 2 > float: 2 > diff --git a/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h b/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h > new file mode 100644 > index 0000000000..3aed563dde > --- /dev/null > +++ b/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h > @@ -0,0 +1,39 @@ > +/* Common definition for libmathvec ifunc selections optimized with > + AVX512. > + Copyright (C) 2021 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <init-arch.h> > + > +#undef PASTER2 > +#define PASTER2(x,y) x##_##y > + > +extern void REDIRECT_NAME (void); > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_wrapper) attribute_hidden; > +extern __typeof (REDIRECT_NAME) OPTIMIZE (skx) attribute_hidden; > + > +static inline void * > +IFUNC_SELECTOR (void) > +{ > + const struct cpu_features* cpu_features = __get_cpu_features (); > + > + if (!CPU_FEATURES_ARCH_P (cpu_features, MathVec_Prefer_No_AVX512) > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)) > + return OPTIMIZE (skx); > + > + return OPTIMIZE (avx2_wrapper); > +} > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S > new file mode 100644 > index 0000000000..25fb8d0cac > --- /dev/null > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S > @@ -0,0 +1,20 @@ > +/* SSE2 version of vectorized acos, vector length is 2. > + Copyright (C) 2021 Free Software Foundation, Inc. > + This file is part of the GNU C Library. 
> + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#define _ZGVbN2v_acos _ZGVbN2v_acos_sse2 > +#include "../svml_d_acos2_core.S" > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c > new file mode 100644 > index 0000000000..5ba5d6fac2 > --- /dev/null > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c > @@ -0,0 +1,27 @@ > +/* Multiple versions of vectorized acos, vector length is 2. > + Copyright (C) 2021 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#define SYMBOL_NAME _ZGVbN2v_acos > +#include "ifunc-mathvec-sse4_1.h" > + > +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); > + > +#ifdef SHARED > +__hidden_ver1 (_ZGVbN2v_acos, __GI__ZGVbN2v_acos, __redirect__ZGVbN2v_acos) > + __attribute__ ((visibility ("hidden"))); > +#endif > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S > new file mode 100644 > index 0000000000..2c528c012e > --- /dev/null > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S > @@ -0,0 +1,293 @@ > +/* Function acos vectorized with SSE4. > + Copyright (C) 2021 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + https://www.gnu.org/licenses/. */ > + > +/* > + * ALGORITHM DESCRIPTION: > + * > + * SelMask = (|x| >= 0.5) ? 1 : 0; > + * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x| > + * acos(|x|) = SelMask ? 
2*Poly(R) : (Pi/2 - Poly(R)) > + * acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|) > + * > + */ > + > +/* Offsets for data table __svml_dacos_data_internal > + */ > +#define SgnBit 0 > +#define OneHalf 16 > +#define SmallNorm 32 > +#define MOne 48 > +#define Two 64 > +#define sqrt_coeff 80 > +#define poly_coeff 144 > +#define PiH 336 > +#define Pi2H 352 > + > +#include <sysdep.h> > + > + .text > + .section .text.sse4,"ax",@progbits > +ENTRY(_ZGVbN2v_acos_sse4) > + subq $72, %rsp > + cfi_def_cfa_offset(80) > + movaps %xmm0, %xmm5 > + movups __svml_dacos_data_internal(%rip), %xmm3 > + movups OneHalf+__svml_dacos_data_internal(%rip), %xmm6 > + > +/* x = -|arg| */ > + movaps %xmm3, %xmm4 > + orps %xmm5, %xmm4 > + > +/* Y = 0.5 + 0.5*(-x) */ > + movaps %xmm6, %xmm7 > + mulpd %xmm4, %xmm7 > + addpd %xmm7, %xmm6 > + > +/* S ~ 2*sqrt(Y) */ > + cvtpd2ps %xmm6, %xmm9 > + movlhps %xmm9, %xmm9 > + > +/* x^2 */ > + movaps %xmm4, %xmm0 > + rsqrtps %xmm9, %xmm10 > + mulpd %xmm4, %xmm0 > + cvtps2pd %xmm10, %xmm11 > + minpd %xmm6, %xmm0 > + movaps %xmm6, %xmm1 > + movaps %xmm0, %xmm2 > + cmpltpd SmallNorm+__svml_dacos_data_internal(%rip), %xmm1 > + cmpnltpd %xmm6, %xmm2 > + addpd %xmm6, %xmm6 > + andnps %xmm11, %xmm1 > + movaps %xmm0, %xmm11 > + movaps %xmm1, %xmm12 > + andps %xmm5, %xmm3 > + mulpd %xmm1, %xmm12 > + mulpd %xmm6, %xmm1 > + mulpd %xmm12, %xmm6 > + mulpd %xmm0, %xmm11 > + subpd Two+__svml_dacos_data_internal(%rip), %xmm6 > + movups sqrt_coeff+__svml_dacos_data_internal(%rip), %xmm13 > + movaps %xmm6, %xmm14 > + mulpd %xmm6, %xmm13 > + mulpd %xmm1, %xmm14 > + addpd sqrt_coeff+16+__svml_dacos_data_internal(%rip), %xmm13 > + mulpd %xmm6, %xmm13 > + addpd sqrt_coeff+32+__svml_dacos_data_internal(%rip), %xmm13 > + mulpd %xmm13, %xmm6 > + > +/* polynomial */ > + movups poly_coeff+__svml_dacos_data_internal(%rip), %xmm15 > + movaps %xmm11, %xmm7 > + mulpd %xmm0, %xmm15 > + addpd sqrt_coeff+48+__svml_dacos_data_internal(%rip), %xmm6 > + addpd poly_coeff+16+__svml_dacos_data_internal(%rip), %xmm15 > + mulpd %xmm11, %xmm7 > + mulpd %xmm6, %xmm14 > + mulpd %xmm11, %xmm15 > + subpd %xmm14, %xmm1 > + movups MOne+__svml_dacos_data_internal(%rip), %xmm8 > + andps %xmm2, %xmm1 > + > +/* NaN processed in special branch (so wind test passed) */ > + cmpnlepd %xmm4, %xmm8 > + movmskpd %xmm8, %edx > + > +/* X<X^2 iff X<0 */ > + movaps %xmm5, %xmm12 > + movups poly_coeff+32+__svml_dacos_data_internal(%rip), %xmm8 > + movaps %xmm2, %xmm13 > + movups poly_coeff+64+__svml_dacos_data_internal(%rip), %xmm6 > + mulpd %xmm0, %xmm8 > + mulpd %xmm0, %xmm6 > + addpd poly_coeff+48+__svml_dacos_data_internal(%rip), %xmm8 > + addpd poly_coeff+80+__svml_dacos_data_internal(%rip), %xmm6 > + cmpltpd %xmm0, %xmm12 > + addpd %xmm15, %xmm8 > + mulpd %xmm11, %xmm6 > + mulpd %xmm7, %xmm8 > + movups poly_coeff+96+__svml_dacos_data_internal(%rip), %xmm9 > + mulpd %xmm0, %xmm9 > + addpd poly_coeff+112+__svml_dacos_data_internal(%rip), %xmm9 > + addpd %xmm6, %xmm9 > + movups poly_coeff+128+__svml_dacos_data_internal(%rip), %xmm10 > + movaps %xmm2, %xmm6 > + mulpd %xmm0, %xmm10 > + addpd %xmm8, %xmm9 > + addpd poly_coeff+144+__svml_dacos_data_internal(%rip), %xmm10 > + mulpd %xmm11, %xmm9 > + movups poly_coeff+160+__svml_dacos_data_internal(%rip), %xmm14 > + andnps %xmm4, %xmm6 > + addpd %xmm9, %xmm10 > + mulpd %xmm0, %xmm14 > + mulpd %xmm10, %xmm11 > + addpd poly_coeff+176+__svml_dacos_data_internal(%rip), %xmm14 > + addpd %xmm11, %xmm14 > + mulpd %xmm0, %xmm14 > + orps %xmm1, %xmm6 > + pxor %xmm3, %xmm6 > + mulpd %xmm6, %xmm14 > + 
movups PiH+__svml_dacos_data_internal(%rip), %xmm0 > + andps %xmm2, %xmm0 > + andnps Pi2H+__svml_dacos_data_internal(%rip), %xmm13 > + andps %xmm12, %xmm0 > + addpd %xmm13, %xmm0 > + addpd %xmm14, %xmm6 > + addpd %xmm6, %xmm0 > + testl %edx, %edx > + > +/* Go to special inputs processing branch */ > + jne L(SPECIAL_VALUES_BRANCH) > + > +/* Restore registers > + * and exit the function > + */ > + > +L(EXIT): > + addq $72, %rsp > + cfi_def_cfa_offset(8) > + ret > + cfi_def_cfa_offset(80) > + > +/* Branch to process > + * special inputs > + */ > + > +L(SPECIAL_VALUES_BRANCH): > + movups %xmm5, 32(%rsp) > + movups %xmm0, 48(%rsp) > + xorl %eax, %eax > + movq %r12, 16(%rsp) > + cfi_offset(12, -64) > + movl %eax, %r12d > + movq %r13, 8(%rsp) > + cfi_offset(13, -72) > + movl %edx, %r13d > + movq %r14, (%rsp) > + cfi_offset(14, -80) > + > +/* Range mask > + * bits check > + */ > + > +L(RANGEMASK_CHECK): > + btl %r12d, %r13d > + > +/* Call scalar math function */ > + jc L(SCALAR_MATH_CALL) > + > +/* Special inputs > + * processing loop > + */ > + > +L(SPECIAL_VALUES_LOOP): > + incl %r12d > + cmpl $2, %r12d > + > +/* Check bits in range mask */ > + jl L(RANGEMASK_CHECK) > + movq 16(%rsp), %r12 > + cfi_restore(12) > + movq 8(%rsp), %r13 > + cfi_restore(13) > + movq (%rsp), %r14 > + cfi_restore(14) > + movups 48(%rsp), %xmm0 > + > +/* Go to exit */ > + jmp L(EXIT) > + cfi_offset(12, -64) > + cfi_offset(13, -72) > + cfi_offset(14, -80) > + > +/* Scalar math fucntion call > + * to process special input > + */ > + > +L(SCALAR_MATH_CALL): > + movl %r12d, %r14d > + movsd 32(%rsp,%r14,8), %xmm0 > + call acos@PLT > + movsd %xmm0, 48(%rsp,%r14,8) > + > +/* Process special inputs in loop */ > + jmp L(SPECIAL_VALUES_LOOP) > + > +END(_ZGVbN2v_acos_sse4) > + > + .section .rodata, "a" > + .align 16 > + > +#ifdef __svml_dacos_data_internal_typedef > +typedef unsigned int VUINT32; > +typedef struct { > + __declspec(align(16)) VUINT32 SgnBit[2][2]; > + __declspec(align(16)) VUINT32 OneHalf[2][2]; > + __declspec(align(16)) VUINT32 SmallNorm[2][2]; > + __declspec(align(16)) VUINT32 MOne[2][2]; > + __declspec(align(16)) VUINT32 Two[2][2]; > + __declspec(align(16)) VUINT32 sqrt_coeff[4][2][2]; > + __declspec(align(16)) VUINT32 poly_coeff[12][2][2]; > + __declspec(align(16)) VUINT32 PiH[2][2]; > + __declspec(align(16)) VUINT32 Pi2H[2][2]; > +} __svml_dacos_data_internal; > +#endif > +__svml_dacos_data_internal: > + /*== SgnBit ==*/ > + .quad 0x8000000000000000, 0x8000000000000000 > + /*== OneHalf ==*/ > + .align 16 > + .quad 0x3fe0000000000000, 0x3fe0000000000000 > + /*== SmallNorm ==*/ > + .align 16 > + .quad 0x3000000000000000, 0x3000000000000000 > + /*== MOne ==*/ > + .align 16 > + .quad 0xbff0000000000000, 0xbff0000000000000 > + /*== Two ==*/ > + .align 16 > + .quad 0x4000000000000000, 0x4000000000000000 > + /*== sqrt_coeff[4] ==*/ > + .align 16 > + .quad 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */ > + .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */ > + .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */ > + .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */ > + /*== poly_coeff[12] ==*/ > + .align 16 > + .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */ > + .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */ > + .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */ > + .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */ > + .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */ > + .quad 0x3f8C6DBBCB88BD57, 
0x3f8C6DBBCB88BD57 /* poly_coeff7 */ > + .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */ > + .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */ > + .quad 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */ > + .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */ > + .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */ > + .quad 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */ > + /*== PiH ==*/ > + .align 16 > + .quad 0x400921fb54442d18, 0x400921fb54442d18 > + /*== Pi2H ==*/ > + .align 16 > + .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18 > + .align 16 > + .type __svml_dacos_data_internal,@object > + .size __svml_dacos_data_internal,.-__svml_dacos_data_internal > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S > new file mode 100644 > index 0000000000..750f71c81c > --- /dev/null > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S > @@ -0,0 +1,20 @@ > +/* SSE version of vectorized acos, vector length is 4. > + Copyright (C) 2021 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#define _ZGVdN4v_acos _ZGVdN4v_acos_sse_wrapper > +#include "../svml_d_acos4_core.S" > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c > new file mode 100644 > index 0000000000..6453e7ebe2 > --- /dev/null > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c > @@ -0,0 +1,27 @@ > +/* Multiple versions of vectorized acos, vector length is 4. > + Copyright (C) 2021 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. 
*/ > + > +#define SYMBOL_NAME _ZGVdN4v_acos > +#include "ifunc-mathvec-avx2.h" > + > +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); > + > +#ifdef SHARED > +__hidden_ver1 (_ZGVdN4v_acos, __GI__ZGVdN4v_acos, __redirect__ZGVdN4v_acos) > + __attribute__ ((visibility ("hidden"))); > +#endif > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S > new file mode 100644 > index 0000000000..172080e3ea > --- /dev/null > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S > @@ -0,0 +1,273 @@ > +/* Function acos vectorized with AVX2. > + Copyright (C) 2021 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + https://www.gnu.org/licenses/. */ > + > +/* > + * ALGORITHM DESCRIPTION: > + * > + * SelMask = (|x| >= 0.5) ? 1 : 0; > + * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x| > + * acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R)) > + * acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|) > + * > + */ > + > +/* Offsets for data table __svml_dacos_data_internal > + */ > +#define SgnBit 0 > +#define OneHalf 32 > +#define SmallNorm 64 > +#define MOne 96 > +#define Two 128 > +#define sqrt_coeff 160 > +#define poly_coeff 288 > +#define PiH 672 > +#define Pi2H 704 > + > +#include <sysdep.h> > + > + .text > + .section .text.avx2,"ax",@progbits > +ENTRY(_ZGVdN4v_acos_avx2) > + pushq %rbp > + cfi_def_cfa_offset(16) > + movq %rsp, %rbp > + cfi_def_cfa(6, 16) > + cfi_offset(6, -16) > + andq $-32, %rsp > + subq $96, %rsp > + vmovupd __svml_dacos_data_internal(%rip), %ymm6 > + vmovupd OneHalf+__svml_dacos_data_internal(%rip), %ymm7 > + vmovapd %ymm0, %ymm5 > + > +/* x = -|arg| */ > + vorpd %ymm5, %ymm6, %ymm4 > + > +/* Y = 0.5 + 0.5*(-x) */ > + vfmadd231pd %ymm4, %ymm7, %ymm7 > + > +/* x^2 */ > + vmulpd %ymm4, %ymm4, %ymm8 > + > +/* S ~ 2*sqrt(Y) */ > + vmovupd sqrt_coeff+__svml_dacos_data_internal(%rip), %ymm0 > + vcmplt_oqpd SmallNorm+__svml_dacos_data_internal(%rip), %ymm7, %ymm12 > + vminpd %ymm7, %ymm8, %ymm2 > + > +/* NaN processed in special branch (so wind test passed) */ > + vcmpnge_uqpd MOne+__svml_dacos_data_internal(%rip), %ymm4, %ymm9 > + vcvtpd2ps %ymm7, %xmm10 > + vmovupd poly_coeff+64+__svml_dacos_data_internal(%rip), %ymm8 > + vcmpnlt_uqpd %ymm7, %ymm2, %ymm1 > + vrsqrtps %xmm10, %xmm11 > + vfmadd213pd poly_coeff+96+__svml_dacos_data_internal(%rip), %ymm2, %ymm8 > + vcvtps2pd %xmm11, %ymm13 > + vmovupd poly_coeff+128+__svml_dacos_data_internal(%rip), %ymm11 > + vandnpd %ymm13, %ymm12, %ymm14 > + vmulpd %ymm14, %ymm14, %ymm15 > + vfmadd213pd poly_coeff+160+__svml_dacos_data_internal(%rip), %ymm2, %ymm11 > + vmulpd %ymm2, %ymm2, %ymm13 > + vmovupd poly_coeff+256+__svml_dacos_data_internal(%rip), %ymm12 > + vmulpd %ymm13, %ymm13, %ymm10 > + vfmadd213pd poly_coeff+288+__svml_dacos_data_internal(%rip), %ymm2, %ymm12 > + vandpd %ymm5, 
%ymm6, %ymm3 > + vaddpd %ymm7, %ymm7, %ymm6 > + vmulpd %ymm6, %ymm14, %ymm7 > + vfmsub213pd Two+__svml_dacos_data_internal(%rip), %ymm15, %ymm6 > + vmovupd poly_coeff+320+__svml_dacos_data_internal(%rip), %ymm14 > + vfmadd213pd sqrt_coeff+32+__svml_dacos_data_internal(%rip), %ymm6, %ymm0 > + vmulpd %ymm6, %ymm7, %ymm15 > + vfmadd213pd poly_coeff+352+__svml_dacos_data_internal(%rip), %ymm2, %ymm14 > + vfmadd213pd sqrt_coeff+64+__svml_dacos_data_internal(%rip), %ymm6, %ymm0 > + vfmadd213pd sqrt_coeff+96+__svml_dacos_data_internal(%rip), %ymm6, %ymm0 > + > +/* polynomial */ > + vmovupd poly_coeff+__svml_dacos_data_internal(%rip), %ymm6 > + vfnmadd213pd %ymm7, %ymm15, %ymm0 > + vfmadd213pd poly_coeff+32+__svml_dacos_data_internal(%rip), %ymm2, %ymm6 > + vblendvpd %ymm1, %ymm0, %ymm4, %ymm0 > + vfmadd213pd %ymm8, %ymm13, %ymm6 > + vmovmskpd %ymm9, %edx > + vmovupd poly_coeff+192+__svml_dacos_data_internal(%rip), %ymm9 > + vfmadd213pd poly_coeff+224+__svml_dacos_data_internal(%rip), %ymm2, %ymm9 > + vfmadd213pd %ymm9, %ymm13, %ymm11 > + vfmadd213pd %ymm11, %ymm10, %ymm6 > + vfmadd213pd %ymm12, %ymm13, %ymm6 > + vfmadd213pd %ymm14, %ymm13, %ymm6 > + vmulpd %ymm6, %ymm2, %ymm9 > + > +/* X<X^2 iff X<0 */ > + vcmplt_oqpd %ymm2, %ymm5, %ymm6 > + vandpd PiH+__svml_dacos_data_internal(%rip), %ymm1, %ymm2 > + vandnpd Pi2H+__svml_dacos_data_internal(%rip), %ymm1, %ymm7 > + vxorpd %ymm3, %ymm0, %ymm1 > + vfmadd213pd %ymm1, %ymm1, %ymm9 > + vandpd %ymm6, %ymm2, %ymm2 > + vaddpd %ymm7, %ymm2, %ymm8 > + vaddpd %ymm9, %ymm8, %ymm0 > + testl %edx, %edx > + > +/* Go to special inputs processing branch */ > + jne L(SPECIAL_VALUES_BRANCH) > + > +/* Restore registers > + * and exit the function > + */ > + > +L(EXIT): > + movq %rbp, %rsp > + popq %rbp > + cfi_def_cfa(7, 8) > + cfi_restore(6) > + ret > + cfi_def_cfa(6, 16) > + cfi_offset(6, -16) > + > +/* Branch to process > + * special inputs > + */ > + > +L(SPECIAL_VALUES_BRANCH): > + vmovupd %ymm5, 32(%rsp) > + vmovupd %ymm0, 64(%rsp) > + xorl %eax, %eax > + vzeroupper > + movq %r12, 16(%rsp) > + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */ > + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22 > + movl %eax, %r12d > + movq %r13, 8(%rsp) > + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */ > + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22 > + movl %edx, %r13d > + movq %r14, (%rsp) > + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */ > + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22 > + > +/* Range mask > + * bits check > + */ > + > +L(RANGEMASK_CHECK): > + btl %r12d, %r13d > + > +/* Call scalar math function */ > + jc L(SCALAR_MATH_CALL) > + > +/* Special inputs > + * processing loop > + */ > + > +L(SPECIAL_VALUES_LOOP): > + incl %r12d > + cmpl $4, %r12d > + > +/* Check bits in range mask */ > + jl L(RANGEMASK_CHECK) > + movq 16(%rsp), %r12 > + cfi_restore(12) > + movq 8(%rsp), %r13 > + cfi_restore(13) > + movq (%rsp), %r14 > + cfi_restore(14) > + vmovupd 64(%rsp), %ymm0 > + > +/* Go to exit */ > + jmp L(EXIT) > + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */ > + 
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22 > + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */ > + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22 > + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */ > + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22 > + > +/* Scalar math fucntion call > + * to process special input > + */ > + > +L(SCALAR_MATH_CALL): > + movl %r12d, %r14d > + movsd 32(%rsp,%r14,8), %xmm0 > + call acos@PLT > + movsd %xmm0, 64(%rsp,%r14,8) > + > +/* Process special inputs in loop */ > + jmp L(SPECIAL_VALUES_LOOP) > + > +END(_ZGVdN4v_acos_avx2) > + > + .section .rodata, "a" > + .align 32 > + > +#ifdef __svml_dacos_data_internal_typedef > +typedef unsigned int VUINT32; > +typedef struct { > + __declspec(align(32)) VUINT32 SgnBit[4][2]; > + __declspec(align(32)) VUINT32 OneHalf[4][2]; > + __declspec(align(32)) VUINT32 SmallNorm[4][2]; > + __declspec(align(32)) VUINT32 MOne[4][2]; > + __declspec(align(32)) VUINT32 Two[4][2]; > + __declspec(align(32)) VUINT32 sqrt_coeff[4][4][2]; > + __declspec(align(32)) VUINT32 poly_coeff[12][4][2]; > + __declspec(align(32)) VUINT32 PiH[4][2]; > + __declspec(align(32)) VUINT32 Pi2H[4][2]; > +} __svml_dacos_data_internal; > +#endif > +__svml_dacos_data_internal: > + /*== SgnBit ==*/ > + .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000 > + /*== OneHalf ==*/ > + .align 32 > + .quad 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000 > + /*== SmallNorm ==*/ > + .align 32 > + .quad 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000 > + /*== MOne ==*/ > + .align 32 > + .quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000 > + /*== Two ==*/ > + .align 32 > + .quad 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000 > + /*== sqrt_coeff[4] ==*/ > + .align 32 > + .quad 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */ > + .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */ > + .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */ > + .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */ > + /*== poly_coeff[12] ==*/ > + .align 32 > + .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */ > + .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */ > + .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */ > + .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */ > + .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */ > + .quad 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */ > + .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */ > + .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */ 
> + .quad 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */ > + .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */ > + .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */ > + .quad 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */ > + /*== PiH ==*/ > + .align 32 > + .quad 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18 > + /*== Pi2H ==*/ > + .align 32 > + .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18 > + .align 32 > + .type __svml_dacos_data_internal,@object > + .size __svml_dacos_data_internal,.-__svml_dacos_data_internal > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S > new file mode 100644 > index 0000000000..4d64fd1c00 > --- /dev/null > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S > @@ -0,0 +1,20 @@ > +/* AVX2 version of vectorized acos, vector length is 8. > + Copyright (C) 2021 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#define _ZGVeN8v_acos _ZGVeN8v_acos_avx2_wrapper > +#include "../svml_d_acos8_core.S" > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c > new file mode 100644 > index 0000000000..1e7d1865fb > --- /dev/null > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c > @@ -0,0 +1,27 @@ > +/* Multiple versions of vectorized acos, vector length is 8. > + Copyright (C) 2021 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. 
*/ > + > +#define SYMBOL_NAME _ZGVeN8v_acos > +#include "ifunc-mathvec-avx512-skx.h" > + > +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); > + > +#ifdef SHARED > +__hidden_ver1 (_ZGVeN8v_acos, __GI__ZGVeN8v_acos, __redirect__ZGVeN8v_acos) > + __attribute__ ((visibility ("hidden"))); > +#endif > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S > new file mode 100644 > index 0000000000..76ca35ad7b > --- /dev/null > +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S > @@ -0,0 +1,298 @@ > +/* Function acos vectorized with AVX-512. > + Copyright (C) 2021 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + https://www.gnu.org/licenses/. */ > + > +/* > + * ALGORITHM DESCRIPTION: > + * > + * SelMask = (|x| >= 0.5) ? 1 : 0; > + * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x| > + * acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R)) > + * acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|) > + * > + */ > + > +/* Offsets for data table __svml_dacos_data_internal > + */ > +#define SgnBit 0 > +#define OneHalf 64 > +#define SmallNorm 128 > +#define MOne 192 > +#define Two 256 > +#define sqrt_coeff_1 320 > +#define sqrt_coeff_2 384 > +#define sqrt_coeff_3 448 > +#define sqrt_coeff_4 512 > +#define poly_coeff_1 576 > +#define poly_coeff_2 640 > +#define poly_coeff_3 704 > +#define poly_coeff_4 768 > +#define poly_coeff_5 832 > +#define poly_coeff_6 896 > +#define poly_coeff_7 960 > +#define poly_coeff_8 1024 > +#define poly_coeff_9 1088 > +#define poly_coeff_10 1152 > +#define poly_coeff_11 1216 > +#define poly_coeff_12 1280 > +#define PiH 1344 > +#define Pi2H 1408 There is enough memory here it may pay to make the accesses sequential in memory. 
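As a side note for readers following the ALGORITHM DESCRIPTION comments above, the reduction all of these kernels implement is easier to see in scalar form. The sketch below is only illustrative: Poly(r) stands for the tuned minimax approximation of asin(r) whose coefficients live in __svml_dacos_data_internal, and here it is replaced by the first few Taylor terms, so it does not reproduce the kernel's accuracy.

```c
/* Scalar sketch of the acos reduction described in the ALGORITHM
   DESCRIPTION comments.  Illustrative only: the real kernels use a
   12-term minimax polynomial and refine an rsqrt estimate instead of
   calling sqrt.  */
#include <math.h>

static double
acos_reduction_sketch (double x)
{
  double ax = fabs (x);
  int sel = ax >= 0.5;                           /* SelMask */
  double r = sel ? sqrt (0.5 - 0.5 * ax) : ax;   /* R */

  /* Poly(r) ~ asin(r): leading Taylor terms standing in for the tuned
     coefficients in __svml_dacos_data_internal.  */
  double r2 = r * r;
  double poly = r * (1.0 + r2 * (1.0 / 6.0 + r2 * (3.0 / 40.0)));

  double acos_abs = sel ? 2.0 * poly : (M_PI_2 - poly);  /* acos(|x|) */
  return (x < 0.0) ? (M_PI - acos_abs) : acos_abs;       /* acos(x) */
}
```

The vector code additionally approximates sqrt(Y) from an rsqrt estimate plus the sqrt_coeff correction polynomial rather than issuing a sqrt instruction, which is why the sqrt_coeff table appears alongside poly_coeff.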
> + > +#include <sysdep.h> > + > + .text > + .section .text.evex512,"ax",@progbits > +ENTRY(_ZGVeN8v_acos_skx) > + pushq %rbp > + cfi_def_cfa_offset(16) > + movq %rsp, %rbp > + cfi_def_cfa(6, 16) > + cfi_offset(6, -16) > + andq $-64, %rsp > + subq $192, %rsp > + vmovups __svml_dacos_data_internal(%rip), %zmm7 > + vmovups OneHalf+__svml_dacos_data_internal(%rip), %zmm8 > + > +/* S ~ 2*sqrt(Y) */ > + vmovups SmallNorm+__svml_dacos_data_internal(%rip), %zmm11 > + vmovups Two+__svml_dacos_data_internal(%rip), %zmm14 > + vmovups sqrt_coeff_1+__svml_dacos_data_internal(%rip), %zmm15 > + vmovups sqrt_coeff_2+__svml_dacos_data_internal(%rip), %zmm2 > + vmovups sqrt_coeff_3+__svml_dacos_data_internal(%rip), %zmm1 > + vmovups MOne+__svml_dacos_data_internal(%rip), %zmm10 > + vmovaps %zmm0, %zmm6 > + > +/* x = -|arg| */ > + vorpd %zmm6, %zmm7, %zmm5 > + vandpd %zmm6, %zmm7, %zmm4 > + > +/* Y = 0.5 + 0.5*(-x) */ > + vfmadd231pd {rn-sae}, %zmm5, %zmm8, %zmm8 > + > +/* x^2 */ > + vmulpd {rn-sae}, %zmm5, %zmm5, %zmm9 > + vrsqrt14pd %zmm8, %zmm12 > + vcmppd $17, {sae}, %zmm11, %zmm8, %k1 > + vcmppd $17, {sae}, %zmm10, %zmm5, %k0 > + vmovups poly_coeff_5+__svml_dacos_data_internal(%rip), %zmm10 > + vmovups poly_coeff_7+__svml_dacos_data_internal(%rip), %zmm11 > + vminpd {sae}, %zmm8, %zmm9, %zmm3 > + vmovups poly_coeff_3+__svml_dacos_data_internal(%rip), %zmm9 > + vxorpd %zmm12, %zmm12, %zmm12{%k1} > + vaddpd {rn-sae}, %zmm8, %zmm8, %zmm0 > + vcmppd $21, {sae}, %zmm8, %zmm3, %k4 > + > +/* X<X^2 iff X<0 */ > + vcmppd $17, {sae}, %zmm3, %zmm6, %k2 > + vmulpd {rn-sae}, %zmm12, %zmm12, %zmm13 > + vmulpd {rn-sae}, %zmm12, %zmm0, %zmm7 > + vmovups poly_coeff_4+__svml_dacos_data_internal(%rip), %zmm12 > + > +/* polynomial */ > + vmovups poly_coeff_1+__svml_dacos_data_internal(%rip), %zmm8 > + vfmsub213pd {rn-sae}, %zmm14, %zmm13, %zmm0 > + vmovups sqrt_coeff_4+__svml_dacos_data_internal(%rip), %zmm13 > + vfmadd231pd {rn-sae}, %zmm3, %zmm9, %zmm12 > + vmovups poly_coeff_11+__svml_dacos_data_internal(%rip), %zmm9 > + vfmadd231pd {rn-sae}, %zmm0, %zmm15, %zmm2 > + vmovups poly_coeff_9+__svml_dacos_data_internal(%rip), %zmm15 > + vmulpd {rn-sae}, %zmm0, %zmm7, %zmm14 > + vfmadd213pd {rn-sae}, %zmm1, %zmm0, %zmm2 > + vmovups poly_coeff_2+__svml_dacos_data_internal(%rip), %zmm1 > + kmovw %k4, %eax > + kmovw %k2, %ecx > + kmovw %k0, %edx > + vfmadd213pd {rn-sae}, %zmm13, %zmm0, %zmm2 > + vfmadd231pd {rn-sae}, %zmm3, %zmm8, %zmm1 > + vmovups poly_coeff_10+__svml_dacos_data_internal(%rip), %zmm8 > + vmulpd {rn-sae}, %zmm3, %zmm3, %zmm0 > + vfnmadd213pd {rn-sae}, %zmm7, %zmm14, %zmm2 > + vmovups poly_coeff_6+__svml_dacos_data_internal(%rip), %zmm7 > + vfmadd231pd {rn-sae}, %zmm3, %zmm15, %zmm8 > + vfmadd213pd {rn-sae}, %zmm12, %zmm0, %zmm1 > + vblendmpd %zmm2, %zmm5, %zmm2{%k4} > + vfmadd231pd {rn-sae}, %zmm3, %zmm10, %zmm7 > + vmovups poly_coeff_8+__svml_dacos_data_internal(%rip), %zmm10 > + vfmadd231pd {rn-sae}, %zmm3, %zmm11, %zmm10 > + andl %eax, %ecx drop I think > + vmovups poly_coeff_12+__svml_dacos_data_internal(%rip), %zmm11 > + kmovw %ecx, %k3 kandw %k4, %k2, %k3 > + vfmadd213pd {rn-sae}, %zmm10, %zmm0, %zmm7 > + vfmadd231pd {rn-sae}, %zmm3, %zmm9, %zmm11 > + vmulpd {rn-sae}, %zmm0, %zmm0, %zmm10 > + vfmadd213pd {rn-sae}, %zmm7, %zmm10, %zmm1 > + vfmadd213pd {rn-sae}, %zmm8, %zmm0, %zmm1 > + vfmadd213pd {rn-sae}, %zmm11, %zmm0, %zmm1 > + vmovups Pi2H+__svml_dacos_data_internal(%rip), %zmm0 > + vmulpd {rn-sae}, %zmm3, %zmm1, %zmm1 > + vxorpd %zmm4, %zmm2, %zmm3 > + vxorpd %zmm0, %zmm0, %zmm0{%k4} > + vfmadd213pd 
{rn-sae}, %zmm3, %zmm3, %zmm1 > + vorpd PiH+__svml_dacos_data_internal(%rip), %zmm0, %zmm0{%k3} > + vaddpd {rn-sae}, %zmm1, %zmm0, %zmm0 > + testl %edx, %edx > + > +/* Go to special inputs processing branch */ > + jne L(SPECIAL_VALUES_BRANCH) > + > +/* Restore registers > + * and exit the function > + */ > + > +L(EXIT): > + movq %rbp, %rsp > + popq %rbp > + cfi_def_cfa(7, 8) > + cfi_restore(6) > + ret > + cfi_def_cfa(6, 16) > + cfi_offset(6, -16) > + > +/* Branch to process > + * special inputs > + */ > + > +L(SPECIAL_VALUES_BRANCH): > + vmovups %zmm6, 64(%rsp) > + vmovups %zmm0, 128(%rsp) > + xorl %eax, %eax > + vzeroupper > + movq %r12, 16(%rsp) > + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */ > + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22 > + movl %eax, %r12d > + movq %r13, 8(%rsp) > + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */ > + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22 > + movl %edx, %r13d > + movq %r14, (%rsp) > + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */ > + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22 > + > +/* Range mask > + * bits check > + */ > + > +L(RANGEMASK_CHECK): > + btl %r12d, %r13d > + > +/* Call scalar math function */ > + jc L(SCALAR_MATH_CALL) > + > +/* Special inputs > + * processing loop > + */ > + > +L(SPECIAL_VALUES_LOOP): > + incl %r12d > + cmpl $8, %r12d > + > +/* Check bits in range mask */ > + jl L(RANGEMASK_CHECK) > + movq 16(%rsp), %r12 > + cfi_restore(12) > + movq 8(%rsp), %r13 > + cfi_restore(13) > + movq (%rsp), %r14 > + cfi_restore(14) > + vmovups 128(%rsp), %zmm0 > + > +/* Go to exit */ > + jmp L(EXIT) > + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */ > + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22 > + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */ > + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22 > + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */ > + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22 > + > +/* Scalar math fucntion call > + * to process special input > + */ > + > +L(SCALAR_MATH_CALL): > + movl %r12d, %r14d > + movsd 64(%rsp,%r14,8), %xmm0 > + call acos@PLT > + movsd %xmm0, 128(%rsp,%r14,8) > + > +/* Process special inputs in loop */ > + jmp L(SPECIAL_VALUES_LOOP) > + > +END(_ZGVeN8v_acos_skx) > + > + .section .rodata, "a" > + .align 64 > + > +#ifdef __svml_dacos_data_internal_typedef > +typedef unsigned int VUINT32; > +typedef struct { > + __declspec(align(64)) VUINT32 SgnBit[8][2]; > + __declspec(align(64)) VUINT32 OneHalf[8][2]; > + __declspec(align(64)) VUINT32 SmallNorm[8][2]; > + __declspec(align(64)) VUINT32 MOne[8][2]; > + __declspec(align(64)) VUINT32 Two[8][2]; > + __declspec(align(64)) VUINT32 sqrt_coeff[4][8][2]; > + __declspec(align(64)) VUINT32 
poly_coeff[12][8][2]; > + __declspec(align(64)) VUINT32 PiH[8][2]; > + __declspec(align(64)) VUINT32 Pi2H[8][2]; > +} __svml_dacos_data_internal; > +#endif > +__svml_dacos_data_internal: > + /*== SgnBit ==*/ > + .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000 > + /*== OneHalf ==*/ > + .align 64 > + .quad 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000 > + /*== SmallNorm ==*/ > + .align 64 > + .quad 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000 > + /*== MOne ==*/ > + .align 64 > + .quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000 > + /*== Two ==*/ > + .align 64 > + .quad 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000 > + /*== sqrt_coeff[4] ==*/ > + .align 64 > + .quad 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */ > + .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */ > + .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */ > + .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */ > + /*== poly_coeff[12] ==*/ > + .align 64 > + .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */ > + .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */ > + .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */ > + .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */ > + .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */ > + .quad 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */ > + .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */ > + .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */ > + .quad 0x3f9F1C72E13AD8BE, 
0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */ > + .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */ > + .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */ > + .quad 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */ > + /*== PiH ==*/ > + .align 64 > + .quad 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18 > + /*== Pi2H ==*/ > + .align 64 > + .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18 > + .align 64 > + .type __svml_dacos_data_internal,@object > + .size __svml_dacos_data_internal,.-__svml_dacos_data_internal > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S > new file mode 100644 > index 0000000000..1ff0cfc8d5 > --- /dev/null > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S > @@ -0,0 +1,20 @@ > +/* AVX2 version of vectorized acosf. > + Copyright (C) 2021 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#define _ZGVeN16v_acosf _ZGVeN16v_acosf_avx2_wrapper > +#include "../svml_s_acosf16_core.S" > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c > new file mode 100644 > index 0000000000..fcf05782c5 > --- /dev/null > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c > @@ -0,0 +1,28 @@ > +/* Multiple versions of vectorized acosf, vector length is 16. > + Copyright (C) 2021 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. 
> + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#define SYMBOL_NAME _ZGVeN16v_acosf > +#include "ifunc-mathvec-avx512-skx.h" > + > +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); > + > +#ifdef SHARED > +__hidden_ver1 (_ZGVeN16v_acosf, __GI__ZGVeN16v_acosf, > + __redirect__ZGVeN16v_acosf) > + __attribute__ ((visibility ("hidden"))); > +#endif > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S > new file mode 100644 > index 0000000000..1db2969c77 > --- /dev/null > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S > @@ -0,0 +1,262 @@ > +/* Function acosf vectorized with AVX-512. > + Copyright (C) 2021 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + https://www.gnu.org/licenses/. */ > + > +/* > + * ALGORITHM DESCRIPTION: > + * > + * SelMask = (|x| >= 0.5) ? 1 : 0; > + * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x| > + * acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R)) > + * acos(x) = sign(x) ? 
(Pi - acos(|x|)) : acos(|x|) > + * > + * > + */ > + > +/* Offsets for data table __svml_sacos_data_internal > + */ > +#define SgnBit 0 > +#define OneHalf 64 > +#define SmallNorm 128 > +#define MOne 192 > +#define Two 256 > +#define sqrt_coeff_1 320 > +#define sqrt_coeff_2 384 > +#define poly_coeff_1 448 > +#define poly_coeff_2 512 > +#define poly_coeff_3 576 > +#define poly_coeff_4 640 > +#define poly_coeff_5 704 > +#define Pi2H 768 > +#define PiH 832 > + > +#include <sysdep.h> > + > + .text > + .section .text.exex512,"ax",@progbits > +ENTRY(_ZGVeN16v_acosf_skx) > + pushq %rbp > + cfi_def_cfa_offset(16) > + movq %rsp, %rbp > + cfi_def_cfa(6, 16) > + cfi_offset(6, -16) > + andq $-64, %rsp > + subq $192, %rsp > + vmovups __svml_sacos_data_internal(%rip), %zmm5 > + vmovups OneHalf+__svml_sacos_data_internal(%rip), %zmm6 > + > +/* SQ ~ 2*sqrt(Y) */ > + vmovups SmallNorm+__svml_sacos_data_internal(%rip), %zmm9 > + vmovups MOne+__svml_sacos_data_internal(%rip), %zmm8 > + vmovups Two+__svml_sacos_data_internal(%rip), %zmm12 > + vmovups sqrt_coeff_1+__svml_sacos_data_internal(%rip), %zmm13 > + vmovaps %zmm0, %zmm4 > + > +/* x = -|arg| */ > + vorps %zmm4, %zmm5, %zmm3 > + vandps %zmm4, %zmm5, %zmm2 > + vmovups sqrt_coeff_2+__svml_sacos_data_internal(%rip), %zmm0 > + > +/* Y = 0.5 + 0.5*(-x) */ > + vfmadd231ps {rn-sae}, %zmm3, %zmm6, %zmm6 > + > +/* x^2 */ > + vmulps {rn-sae}, %zmm3, %zmm3, %zmm7 > + vrsqrt14ps %zmm6, %zmm10 > + vcmpps $17, {sae}, %zmm9, %zmm6, %k1 > + vcmpps $22, {sae}, %zmm3, %zmm8, %k0 > + vmovups poly_coeff_4+__svml_sacos_data_internal(%rip), %zmm9 > + vminps {sae}, %zmm6, %zmm7, %zmm1 > + vmovups poly_coeff_3+__svml_sacos_data_internal(%rip), %zmm7 > + vxorps %zmm10, %zmm10, %zmm10{%k1} > + vaddps {rn-sae}, %zmm6, %zmm6, %zmm14 > + vmulps {rn-sae}, %zmm1, %zmm1, %zmm8 > + vmulps {rn-sae}, %zmm10, %zmm10, %zmm11 > + vmulps {rn-sae}, %zmm10, %zmm14, %zmm5 > + vcmpps $21, {sae}, %zmm6, %zmm1, %k4 > + > +/* X<X^2 iff X<0 */ > + vcmpps $17, {sae}, %zmm1, %zmm4, %k2 > + > +/* polynomial */ > + vmovups poly_coeff_1+__svml_sacos_data_internal(%rip), %zmm6 > + vfmsub213ps {rn-sae}, %zmm12, %zmm11, %zmm14 > + vmovups poly_coeff_2+__svml_sacos_data_internal(%rip), %zmm11 > + vfmadd231ps {rn-sae}, %zmm1, %zmm7, %zmm9 > + vmovups poly_coeff_5+__svml_sacos_data_internal(%rip), %zmm10 > + vmovups Pi2H+__svml_sacos_data_internal(%rip), %zmm12 > + vfmadd231ps {rn-sae}, %zmm14, %zmm13, %zmm0 > + vfmadd231ps {rn-sae}, %zmm1, %zmm6, %zmm11 > + vmulps {rn-sae}, %zmm14, %zmm5, %zmm15 > + vfmadd213ps {rn-sae}, %zmm9, %zmm8, %zmm11 > + vxorps %zmm12, %zmm12, %zmm12{%k4} > + vfnmadd213ps {rn-sae}, %zmm5, %zmm15, %zmm0 > + vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm11 > + kmovw %k4, %eax > + kmovw %k2, %ecx > + kmovw %k0, %edx > + vmulps {rn-sae}, %zmm1, %zmm11, %zmm13 > + vblendmps %zmm0, %zmm3, %zmm0{%k4} > + vxorps %zmm2, %zmm0, %zmm1 > + andl %eax, %ecx drop I think > + kmovw %ecx, %k3 kandw %k4, %k2, %k3 > + vfmadd213ps {rn-sae}, %zmm1, %zmm1, %zmm13 > + vorps PiH+__svml_sacos_data_internal(%rip), %zmm12, %zmm12{%k3} > + vaddps {rn-sae}, %zmm13, %zmm12, %zmm0 > + testl %edx, %edx > + > +/* Go to special inputs processing branch */ > + jne L(SPECIAL_VALUES_BRANCH) > + > +/* Restore registers > + * and exit the function > + */ > + > +L(EXIT): > + movq %rbp, %rsp > + popq %rbp > + cfi_def_cfa(7, 8) > + cfi_restore(6) > + ret > + cfi_def_cfa(6, 16) > + cfi_offset(6, -16) > + > +/* Branch to process > + * special inputs > + */ > + > +L(SPECIAL_VALUES_BRANCH): > + vmovups %zmm4, 64(%rsp) > + vmovups %zmm0, 
128(%rsp) > + xorl %eax, %eax > + vzeroupper > + movq %r12, 16(%rsp) > + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */ > + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22 > + movl %eax, %r12d > + movq %r13, 8(%rsp) > + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */ > + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22 > + movl %edx, %r13d > + movq %r14, (%rsp) > + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */ > + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22 > + > +/* Range mask > + * bits check > + */ > + > +L(RANGEMASK_CHECK): > + btl %r12d, %r13d > + > +/* Call scalar math function */ > + jc L(SCALAR_MATH_CALL) > + > +/* Special inputs > + * processing loop > + */ > + > +L(SPECIAL_VALUES_LOOP): > + incl %r12d > + cmpl $16, %r12d > + > +/* Check bits in range mask */ > + jl L(RANGEMASK_CHECK) > + movq 16(%rsp), %r12 > + cfi_restore(12) > + movq 8(%rsp), %r13 > + cfi_restore(13) > + movq (%rsp), %r14 > + cfi_restore(14) > + vmovups 128(%rsp), %zmm0 > + > +/* Go to exit */ > + jmp L(EXIT) > + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */ > + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22 > + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */ > + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22 > + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */ > + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22 > + > +/* Scalar math fucntion call > + * to process special input > + */ > + > +L(SCALAR_MATH_CALL): > + movl %r12d, %r14d > + movss 64(%rsp,%r14,4), %xmm0 > + call acosf@PLT > + movss %xmm0, 128(%rsp,%r14,4) > + > +/* Process special inputs in loop */ > + jmp L(SPECIAL_VALUES_LOOP) > + > +END(_ZGVeN16v_acosf_skx) > + > + .section .rodata, "a" > + .align 64 > + > +#ifdef __svml_sacos_data_internal_typedef > +typedef unsigned int VUINT32; > +typedef struct { > + __declspec(align(64)) VUINT32 SgnBit[16][1]; > + __declspec(align(64)) VUINT32 OneHalf[16][1]; > + __declspec(align(64)) VUINT32 SmallNorm[16][1]; > + __declspec(align(64)) VUINT32 MOne[16][1]; > + __declspec(align(64)) VUINT32 Two[16][1]; > + __declspec(align(64)) VUINT32 sqrt_coeff[2][16][1]; > + __declspec(align(64)) VUINT32 poly_coeff[5][16][1]; > + __declspec(align(64)) VUINT32 Pi2H[16][1]; > + __declspec(align(64)) VUINT32 PiH[16][1]; > +} __svml_sacos_data_internal; > +#endif > +__svml_sacos_data_internal: > + /*== SgnBit ==*/ > + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 > + /*== OneHalf ==*/ > + .align 64 > + .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 
0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 > + /*== SmallNorm ==*/ > + .align 64 > + .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000 > + /*== MOne ==*/ > + .align 64 > + .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000 > + /*== Two ==*/ > + .align 64 > + .long 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000 > + /*== sqrt_coeff[2] ==*/ > + .align 64 > + .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */ > + .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */ > + /*== poly_coeff[5] ==*/ > + .align 64 > + .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */ > + .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */ > + .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */ > + .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */ > + .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */ > + /*== Pi2H ==*/ > + .align 64 > + .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB > + /*== PiH ==*/ > + .align 64 > + .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB > + .align 64 > + .type __svml_sacos_data_internal,@object > + .size __svml_sacos_data_internal,.-__svml_sacos_data_internal > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S > new file mode 100644 > index 0000000000..f94b3eb01a > --- /dev/null > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S > @@ -0,0 +1,20 @@ > +/* SSE2 version of vectorized acosf, vector length is 4. > + Copyright (C) 2021 Free Software Foundation, Inc. > + This file is part of the GNU C Library. 
> + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#define _ZGVbN4v_acosf _ZGVbN4v_acosf_sse2 > +#include "../svml_s_acosf4_core.S" > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c > new file mode 100644 > index 0000000000..6f9a5c1082 > --- /dev/null > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c > @@ -0,0 +1,28 @@ > +/* Multiple versions of vectorized acosf, vector length is 4. > + Copyright (C) 2021 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#define SYMBOL_NAME _ZGVbN4v_acosf > +#include "ifunc-mathvec-sse4_1.h" > + > +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); > + > +#ifdef SHARED > +__hidden_ver1 (_ZGVbN4v_acosf, __GI__ZGVbN4v_acosf, > + __redirect__ZGVbN4v_acosf) > + __attribute__ ((visibility ("hidden"))); > +#endif > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S > new file mode 100644 > index 0000000000..fe0c94aeb5 > --- /dev/null > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S > @@ -0,0 +1,260 @@ > +/* Function acosf vectorized with SSE4. > + Copyright (C) 2021 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + https://www.gnu.org/licenses/. */ > + > +/* > + * ALGORITHM DESCRIPTION: > + * > + * SelMask = (|x| >= 0.5) ? 1 : 0; > + * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x| > + * acos(|x|) = SelMask ? 
2*Poly(R) : (Pi/2 - Poly(R)) > + * acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|) > + * > + * > + */ > + > +/* Offsets for data table __svml_sacos_data_internal > + */ > +#define SgnBit 0 > +#define OneHalf 16 > +#define SmallNorm 32 > +#define MOne 48 > +#define Two 64 > +#define sqrt_coeff 80 > +#define poly_coeff 112 > +#define Pi2H 192 > +#define PiH 208 > + > +#include <sysdep.h> > + > + .text > + .section .text.sse4,"ax",@progbits > +ENTRY(_ZGVbN4v_acosf_sse4) > + subq $72, %rsp > + cfi_def_cfa_offset(80) > + > +/* X<X^2 iff X<0 */ > + movaps %xmm0, %xmm14 > + > +/* > + * 2*sqrt(X) ~ Sh - Sl (to 24+ bits) > + * SQ ~ 2*sqrt(X) > + */ > + movups __svml_sacos_data_internal(%rip), %xmm3 > + movups OneHalf+__svml_sacos_data_internal(%rip), %xmm5 > + > +/* x = -|arg| */ > + movaps %xmm3, %xmm4 > + orps %xmm0, %xmm4 > + > +/* Y = 0.5 + 0.5*(-x) */ > + movaps %xmm5, %xmm6 > + mulps %xmm4, %xmm6 > + > +/* x^2 */ > + movaps %xmm4, %xmm13 > + mulps %xmm4, %xmm13 > + addps %xmm6, %xmm5 > + > +/* SQ ~ 2*sqrt(Y) */ > + rsqrtps %xmm5, %xmm8 > + minps %xmm5, %xmm13 > + movaps %xmm5, %xmm2 > + movaps %xmm13, %xmm1 > + cmpltps SmallNorm+__svml_sacos_data_internal(%rip), %xmm2 > + cmpnltps %xmm5, %xmm1 > + cmpltps %xmm13, %xmm14 > + addps %xmm5, %xmm5 > + andnps %xmm8, %xmm2 > + movaps %xmm13, %xmm11 > + movaps %xmm2, %xmm9 > + movaps %xmm1, %xmm6 > + mulps %xmm2, %xmm9 > + andnps %xmm4, %xmm6 > + mulps %xmm5, %xmm2 > + mulps %xmm13, %xmm11 > + mulps %xmm9, %xmm5 > + movups sqrt_coeff+__svml_sacos_data_internal(%rip), %xmm10 > + andps %xmm0, %xmm3 > + > +/* polynomial */ > + movups poly_coeff+__svml_sacos_data_internal(%rip), %xmm12 > + movaps %xmm1, %xmm15 > + mulps %xmm13, %xmm12 > + subps Two+__svml_sacos_data_internal(%rip), %xmm5 > + mulps %xmm5, %xmm10 > + addps poly_coeff+16+__svml_sacos_data_internal(%rip), %xmm12 > + mulps %xmm2, %xmm5 > + mulps %xmm11, %xmm12 > + addps sqrt_coeff+16+__svml_sacos_data_internal(%rip), %xmm10 > + mulps %xmm5, %xmm10 > + movups poly_coeff+32+__svml_sacos_data_internal(%rip), %xmm5 > + subps %xmm10, %xmm2 > + mulps %xmm13, %xmm5 > + movups MOne+__svml_sacos_data_internal(%rip), %xmm7 > + andps %xmm1, %xmm2 > + cmpnleps %xmm4, %xmm7 > + addps poly_coeff+48+__svml_sacos_data_internal(%rip), %xmm5 > + movmskps %xmm7, %edx > + orps %xmm2, %xmm6 > + addps %xmm12, %xmm5 > + mulps %xmm13, %xmm5 > + pxor %xmm3, %xmm6 > + movups PiH+__svml_sacos_data_internal(%rip), %xmm7 > + andps %xmm1, %xmm7 > + addps poly_coeff+64+__svml_sacos_data_internal(%rip), %xmm5 > + mulps %xmm13, %xmm5 > + andps %xmm14, %xmm7 > + mulps %xmm6, %xmm5 > + andnps Pi2H+__svml_sacos_data_internal(%rip), %xmm15 > + addps %xmm5, %xmm6 > + addps %xmm15, %xmm7 > + addps %xmm6, %xmm7 > + testl %edx, %edx > + > +/* Go to special inputs processing branch */ > + jne L(SPECIAL_VALUES_BRANCH) > + > +/* Restore registers > + * and exit the function > + */ > + > +L(EXIT): > + movaps %xmm7, %xmm0 > + addq $72, %rsp > + cfi_def_cfa_offset(8) > + ret > + cfi_def_cfa_offset(80) > + > +/* Branch to process > + * special inputs > + */ > + > +L(SPECIAL_VALUES_BRANCH): > + movups %xmm0, 32(%rsp) > + movups %xmm7, 48(%rsp) > + xorl %eax, %eax > + movq %r12, 16(%rsp) > + cfi_offset(12, -64) > + movl %eax, %r12d > + movq %r13, 8(%rsp) > + cfi_offset(13, -72) > + movl %edx, %r13d > + movq %r14, (%rsp) > + cfi_offset(14, -80) > + > +/* Range mask > + * bits check > + */ > + > +L(RANGEMASK_CHECK): > + btl %r12d, %r13d > + > +/* Call scalar math function */ > + jc L(SCALAR_MATH_CALL) > + > +/* Special inputs > + * processing 
loop > + */ > + > +L(SPECIAL_VALUES_LOOP): > + incl %r12d > + cmpl $4, %r12d > + > +/* Check bits in range mask */ > + jl L(RANGEMASK_CHECK) > + movq 16(%rsp), %r12 > + cfi_restore(12) > + movq 8(%rsp), %r13 > + cfi_restore(13) > + movq (%rsp), %r14 > + cfi_restore(14) > + movups 48(%rsp), %xmm7 > + > +/* Go to exit */ > + jmp L(EXIT) > + cfi_offset(12, -64) > + cfi_offset(13, -72) > + cfi_offset(14, -80) > + > +/* Scalar math fucntion call > + * to process special input > + */ > + > +L(SCALAR_MATH_CALL): > + movl %r12d, %r14d > + movss 32(%rsp,%r14,4), %xmm0 > + call acosf@PLT > + movss %xmm0, 48(%rsp,%r14,4) > + > +/* Process special inputs in loop */ > + jmp L(SPECIAL_VALUES_LOOP) > + > +END(_ZGVbN4v_acosf_sse4) > + > + .section .rodata, "a" > + .align 16 > + > +#ifdef __svml_sacos_data_internal_typedef > +typedef unsigned int VUINT32; > +typedef struct { > + __declspec(align(16)) VUINT32 SgnBit[4][1]; > + __declspec(align(16)) VUINT32 OneHalf[4][1]; > + __declspec(align(16)) VUINT32 SmallNorm[4][1]; > + __declspec(align(16)) VUINT32 MOne[4][1]; > + __declspec(align(16)) VUINT32 Two[4][1]; > + __declspec(align(16)) VUINT32 sqrt_coeff[2][4][1]; > + __declspec(align(16)) VUINT32 poly_coeff[5][4][1]; > + __declspec(align(16)) VUINT32 Pi2H[4][1]; > + __declspec(align(16)) VUINT32 PiH[4][1]; > +} __svml_sacos_data_internal; > +#endif > +__svml_sacos_data_internal: > + /*== SgnBit ==*/ > + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 > + /*== OneHalf ==*/ > + .align 16 > + .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 > + /*== SmallNorm ==*/ > + .align 16 > + .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000 > + /*== MOne ==*/ > + .align 16 > + .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000 > + /*== Two ==*/ > + .align 16 > + .long 0x40000000, 0x40000000, 0x40000000, 0x40000000 > + /*== sqrt_coeff[2] ==*/ > + .align 16 > + .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */ > + .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */ > + /*== poly_coeff[5] ==*/ > + .align 16 > + .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */ > + .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */ > + .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */ > + .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */ > + .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */ > + /*== Pi2H ==*/ > + .align 16 > + .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB > + /*== PiH ==*/ > + .align 16 > + .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB > + .align 16 > + .type __svml_sacos_data_internal,@object > + .size __svml_sacos_data_internal,.-__svml_sacos_data_internal > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S > new file mode 100644 > index 0000000000..583ef54fee > --- /dev/null > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S > @@ -0,0 +1,20 @@ > +/* SSE version of vectorized acosf, vector length is 8. > + Copyright (C) 2021 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. 
> + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#define _ZGVdN8v_acosf _ZGVdN8v_acosf_sse_wrapper > +#include "../svml_s_acosf8_core.S" > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c > new file mode 100644 > index 0000000000..dd360a9479 > --- /dev/null > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c > @@ -0,0 +1,28 @@ > +/* Multiple versions of vectorized acosf, vector length is 8. > + Copyright (C) 2021 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#define SYMBOL_NAME _ZGVdN8v_acosf > +#include "ifunc-mathvec-avx2.h" > + > +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); > + > +#ifdef SHARED > +__hidden_ver1 (_ZGVdN8v_acosf, __GI__ZGVdN8v_acosf, > + __redirect__ZGVdN8v_acosf) > + __attribute__ ((visibility ("hidden"))); > +#endif > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S > new file mode 100644 > index 0000000000..2b6dd2c2c2 > --- /dev/null > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S > @@ -0,0 +1,252 @@ > +/* Function acosf vectorized with AVX2. > + Copyright (C) 2021 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + https://www.gnu.org/licenses/. */ > + > +/* > + * ALGORITHM DESCRIPTION: > + * > + * SelMask = (|x| >= 0.5) ? 1 : 0; > + * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x| > + * acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R)) > + * acos(x) = sign(x) ? 
(Pi - acos(|x|)) : acos(|x|) > + * > + * > + */ > + > +/* Offsets for data table __svml_sacos_data_internal > + */ > +#define SgnBit 0 > +#define OneHalf 32 > +#define SmallNorm 64 > +#define MOne 96 > +#define Two 128 > +#define sqrt_coeff 160 > +#define poly_coeff 224 > +#define Pi2H 384 > +#define PiH 416 > + > +#include <sysdep.h> > + > + .text > + .section .text.avx2,"ax",@progbits > +ENTRY(_ZGVdN8v_acosf_avx2) > + pushq %rbp > + cfi_def_cfa_offset(16) > + movq %rsp, %rbp > + cfi_def_cfa(6, 16) > + cfi_offset(6, -16) > + andq $-32, %rsp > + subq $96, %rsp > + > +/* > + * 2*sqrt(X) ~ Sh - Sl (to 24+ bits) > + * SQ ~ 2*sqrt(X) > + */ > + vmovups __svml_sacos_data_internal(%rip), %ymm6 > + vmovups OneHalf+__svml_sacos_data_internal(%rip), %ymm7 > + vmovaps %ymm0, %ymm5 > + > +/* x = -|arg| */ > + vorps %ymm5, %ymm6, %ymm4 > + > +/* Y = 0.5 + 0.5*(-x) */ > + vfmadd231ps %ymm4, %ymm7, %ymm7 > + > +/* x^2 */ > + vmulps %ymm4, %ymm4, %ymm8 > + > +/* SQ ~ 2*sqrt(Y) */ > + vmovups sqrt_coeff+__svml_sacos_data_internal(%rip), %ymm0 > + vcmpnge_uqps MOne+__svml_sacos_data_internal(%rip), %ymm4, %ymm9 > + vcmplt_oqps SmallNorm+__svml_sacos_data_internal(%rip), %ymm7, %ymm10 > + vminps %ymm7, %ymm8, %ymm2 > + vaddps %ymm7, %ymm7, %ymm14 > + vrsqrtps %ymm7, %ymm11 > + vmovups poly_coeff+64+__svml_sacos_data_internal(%rip), %ymm8 > + vcmpnlt_uqps %ymm7, %ymm2, %ymm1 > + vmulps %ymm2, %ymm2, %ymm7 > + vfmadd213ps poly_coeff+96+__svml_sacos_data_internal(%rip), %ymm2, %ymm8 > + vmovmskps %ymm9, %edx > + > +/* polynomial */ > + vmovups poly_coeff+__svml_sacos_data_internal(%rip), %ymm9 > + vandnps %ymm11, %ymm10, %ymm12 > + vmulps %ymm12, %ymm12, %ymm13 > + vfmadd213ps poly_coeff+32+__svml_sacos_data_internal(%rip), %ymm2, %ymm9 > + > +/* X<X^2 iff X<0 */ > + vcmplt_oqps %ymm2, %ymm5, %ymm10 > + vfmadd213ps %ymm8, %ymm7, %ymm9 > + vandps %ymm5, %ymm6, %ymm3 > + vmulps %ymm14, %ymm12, %ymm6 > + vfmsub213ps Two+__svml_sacos_data_internal(%rip), %ymm13, %ymm14 > + vfmadd213ps poly_coeff+128+__svml_sacos_data_internal(%rip), %ymm2, %ymm9 > + vfmadd213ps sqrt_coeff+32+__svml_sacos_data_internal(%rip), %ymm14, %ymm0 > + vmulps %ymm14, %ymm6, %ymm15 > + vmulps %ymm9, %ymm2, %ymm14 > + vfnmadd213ps %ymm6, %ymm15, %ymm0 > + vblendvps %ymm1, %ymm0, %ymm4, %ymm0 > + vandps PiH+__svml_sacos_data_internal(%rip), %ymm1, %ymm2 > + vandnps Pi2H+__svml_sacos_data_internal(%rip), %ymm1, %ymm12 > + vxorps %ymm3, %ymm0, %ymm1 > + vfmadd213ps %ymm1, %ymm1, %ymm14 > + vandps %ymm10, %ymm2, %ymm11 > + vaddps %ymm12, %ymm11, %ymm13 > + vaddps %ymm14, %ymm13, %ymm0 > + testl %edx, %edx > + > +/* Go to special inputs processing branch */ > + jne L(SPECIAL_VALUES_BRANCH) > + > +/* Restore registers > + * and exit the function > + */ > + > +L(EXIT): > + movq %rbp, %rsp > + popq %rbp > + cfi_def_cfa(7, 8) > + cfi_restore(6) > + ret > + cfi_def_cfa(6, 16) > + cfi_offset(6, -16) > + > +/* Branch to process > + * special inputs > + */ > + > +L(SPECIAL_VALUES_BRANCH): > + vmovups %ymm5, 32(%rsp) > + vmovups %ymm0, 64(%rsp) > + xorl %eax, %eax > + vzeroupper > + movq %r12, 16(%rsp) > + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */ > + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22 > + movl %eax, %r12d > + movq %r13, 8(%rsp) > + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */ > + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 
0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22 > + movl %edx, %r13d > + movq %r14, (%rsp) > + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */ > + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22 > + > +/* Range mask > + * bits check > + */ > + > +L(RANGEMASK_CHECK): > + btl %r12d, %r13d > + > +/* Call scalar math function */ > + jc L(SCALAR_MATH_CALL) > + > +/* Special inputs > + * processing loop > + */ > + > +L(SPECIAL_VALUES_LOOP): > + incl %r12d > + cmpl $8, %r12d > + > +/* Check bits in range mask */ > + jl L(RANGEMASK_CHECK) > + movq 16(%rsp), %r12 > + cfi_restore(12) > + movq 8(%rsp), %r13 > + cfi_restore(13) > + movq (%rsp), %r14 > + cfi_restore(14) > + vmovups 64(%rsp), %ymm0 > + > +/* Go to exit */ > + jmp L(EXIT) > + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */ > + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22 > + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */ > + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22 > + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */ > + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22 > + > +/* Scalar math fucntion call > + * to process special input > + */ > + > +L(SCALAR_MATH_CALL): > + movl %r12d, %r14d > + movss 32(%rsp,%r14,4), %xmm0 > + call acosf@PLT > + movss %xmm0, 64(%rsp,%r14,4) > + > +/* Process special inputs in loop */ > + jmp L(SPECIAL_VALUES_LOOP) > + > +END(_ZGVdN8v_acosf_avx2) > + > + .section .rodata, "a" > + .align 32 > + > +#ifdef __svml_sacos_data_internal_typedef > +typedef unsigned int VUINT32; > +typedef struct { > + __declspec(align(32)) VUINT32 SgnBit[8][1]; > + __declspec(align(32)) VUINT32 OneHalf[8][1]; > + __declspec(align(32)) VUINT32 SmallNorm[8][1]; > + __declspec(align(32)) VUINT32 MOne[8][1]; > + __declspec(align(32)) VUINT32 Two[8][1]; > + __declspec(align(32)) VUINT32 sqrt_coeff[2][8][1]; > + __declspec(align(32)) VUINT32 poly_coeff[5][8][1]; > + __declspec(align(32)) VUINT32 Pi2H[8][1]; > + __declspec(align(32)) VUINT32 PiH[8][1]; > +} __svml_sacos_data_internal; > +#endif > +__svml_sacos_data_internal: > + /*== SgnBit ==*/ > + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 > + /*== OneHalf ==*/ > + .align 32 > + .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 > + /*== SmallNorm ==*/ > + .align 32 > + .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000 > + /*== MOne ==*/ > + .align 32 > + .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000 > + /*== Two ==*/ > + .align 32 > + .long 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000 > + /*== sqrt_coeff[2] ==*/ > + .align 32 > + .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */ > + .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 
0x3e800001 /* sqrt_coeff1 */ > + /*== poly_coeff[5] ==*/ > + .align 32 > + .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */ > + .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */ > + .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */ > + .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */ > + .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */ > + /*== Pi2H ==*/ > + .align 32 > + .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB > + /*== PiH ==*/ > + .align 32 > + .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB > + .align 32 > + .type __svml_sacos_data_internal,@object > + .size __svml_sacos_data_internal,.-__svml_sacos_data_internal > diff --git a/sysdeps/x86_64/fpu/svml_d_acos2_core.S b/sysdeps/x86_64/fpu/svml_d_acos2_core.S > new file mode 100644 > index 0000000000..9656478b2d > --- /dev/null > +++ b/sysdeps/x86_64/fpu/svml_d_acos2_core.S > @@ -0,0 +1,29 @@ > +/* Function acos vectorized with SSE2. > + Copyright (C) 2021 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <sysdep.h> > +#include "svml_d_wrapper_impl.h" > + > + .text > +ENTRY (_ZGVbN2v_acos) > +WRAPPER_IMPL_SSE2 acos > +END (_ZGVbN2v_acos) > + > +#ifndef USE_MULTIARCH > + libmvec_hidden_def (_ZGVbN2v_acos) > +#endif > diff --git a/sysdeps/x86_64/fpu/svml_d_acos4_core.S b/sysdeps/x86_64/fpu/svml_d_acos4_core.S > new file mode 100644 > index 0000000000..e99cb4ae78 > --- /dev/null > +++ b/sysdeps/x86_64/fpu/svml_d_acos4_core.S > @@ -0,0 +1,29 @@ > +/* Function acos vectorized with AVX2, wrapper version. > + Copyright (C) 2021 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. 
*/ > + > +#include <sysdep.h> > +#include "svml_d_wrapper_impl.h" > + > + .text > +ENTRY (_ZGVdN4v_acos) > +WRAPPER_IMPL_AVX _ZGVbN2v_acos > +END (_ZGVdN4v_acos) > + > +#ifndef USE_MULTIARCH > + libmvec_hidden_def (_ZGVdN4v_acos) > +#endif > diff --git a/sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S b/sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S > new file mode 100644 > index 0000000000..7cbcbc965c > --- /dev/null > +++ b/sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S > @@ -0,0 +1,25 @@ > +/* Function acos vectorized in AVX ISA as wrapper to SSE4 ISA version. > + Copyright (C) 2021 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <sysdep.h> > +#include "svml_d_wrapper_impl.h" > + > + .text > +ENTRY (_ZGVcN4v_acos) > +WRAPPER_IMPL_AVX _ZGVbN2v_acos > +END (_ZGVcN4v_acos) > diff --git a/sysdeps/x86_64/fpu/svml_d_acos8_core.S b/sysdeps/x86_64/fpu/svml_d_acos8_core.S > new file mode 100644 > index 0000000000..e26b30d81a > --- /dev/null > +++ b/sysdeps/x86_64/fpu/svml_d_acos8_core.S > @@ -0,0 +1,25 @@ > +/* Function acos vectorized with AVX-512, wrapper to AVX2. > + Copyright (C) 2021 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <sysdep.h> > +#include "svml_d_wrapper_impl.h" > + > + .text > +ENTRY (_ZGVeN8v_acos) > +WRAPPER_IMPL_AVX512 _ZGVdN4v_acos > +END (_ZGVeN8v_acos) > diff --git a/sysdeps/x86_64/fpu/svml_s_acosf16_core.S b/sysdeps/x86_64/fpu/svml_s_acosf16_core.S > new file mode 100644 > index 0000000000..70e046d492 > --- /dev/null > +++ b/sysdeps/x86_64/fpu/svml_s_acosf16_core.S > @@ -0,0 +1,25 @@ > +/* Function acosf vectorized with AVX-512. Wrapper to AVX2 version. > + Copyright (C) 2021 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. 
> + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <sysdep.h> > +#include "svml_s_wrapper_impl.h" > + > + .text > +ENTRY (_ZGVeN16v_acosf) > +WRAPPER_IMPL_AVX512 _ZGVdN8v_acosf > +END (_ZGVeN16v_acosf) > diff --git a/sysdeps/x86_64/fpu/svml_s_acosf4_core.S b/sysdeps/x86_64/fpu/svml_s_acosf4_core.S > new file mode 100644 > index 0000000000..36354b32b5 > --- /dev/null > +++ b/sysdeps/x86_64/fpu/svml_s_acosf4_core.S > @@ -0,0 +1,29 @@ > +/* Function acosf vectorized with SSE2, wrapper version. > + Copyright (C) 2021 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <sysdep.h> > +#include "svml_s_wrapper_impl.h" > + > + .text > +ENTRY (_ZGVbN4v_acosf) > +WRAPPER_IMPL_SSE2 acosf > +END (_ZGVbN4v_acosf) > + > +#ifndef USE_MULTIARCH > + libmvec_hidden_def (_ZGVbN4v_acosf) > +#endif > diff --git a/sysdeps/x86_64/fpu/svml_s_acosf8_core.S b/sysdeps/x86_64/fpu/svml_s_acosf8_core.S > new file mode 100644 > index 0000000000..f08864a511 > --- /dev/null > +++ b/sysdeps/x86_64/fpu/svml_s_acosf8_core.S > @@ -0,0 +1,29 @@ > +/* Function acosf vectorized with AVX2, wrapper version. > + Copyright (C) 2021 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. 
*/ > + > +#include <sysdep.h> > +#include "svml_s_wrapper_impl.h" > + > + .text > +ENTRY (_ZGVdN8v_acosf) > +WRAPPER_IMPL_AVX _ZGVbN4v_acosf > +END (_ZGVdN8v_acosf) > + > +#ifndef USE_MULTIARCH > + libmvec_hidden_def (_ZGVdN8v_acosf) > +#endif > diff --git a/sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S b/sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S > new file mode 100644 > index 0000000000..f3ed4d8e78 > --- /dev/null > +++ b/sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S > @@ -0,0 +1,25 @@ > +/* Function acosf vectorized in AVX ISA as wrapper to SSE4 ISA version. > + Copyright (C) 2021 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <sysdep.h> > +#include "svml_s_wrapper_impl.h" > + > + .text > +ENTRY (_ZGVcN8v_acosf) > +WRAPPER_IMPL_AVX _ZGVbN4v_acosf > +END (_ZGVcN8v_acosf) > diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c > new file mode 100644 > index 0000000000..4f74b4260a > --- /dev/null > +++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c > @@ -0,0 +1 @@ > +#include "test-double-libmvec-acos.c" > diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c > new file mode 100644 > index 0000000000..4f74b4260a > --- /dev/null > +++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c > @@ -0,0 +1 @@ > +#include "test-double-libmvec-acos.c" > diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c > new file mode 100644 > index 0000000000..4f74b4260a > --- /dev/null > +++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c > @@ -0,0 +1 @@ > +#include "test-double-libmvec-acos.c" > diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos.c > new file mode 100644 > index 0000000000..e38b8ce821 > --- /dev/null > +++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos.c > @@ -0,0 +1,3 @@ > +#define LIBMVEC_TYPE double > +#define LIBMVEC_FUNC acos > +#include "test-vector-abi-arg1.h" > diff --git a/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c > index ed932fc98d..0abc7d2021 100644 > --- a/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c > +++ b/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c > @@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVbN2v_sin) > VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVbN2v_log) > VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVbN2v_exp) > VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVbN2vv_pow) > +VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVbN2v_acos) > > #define VEC_INT_TYPE __m128i > > diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c > index 3a6e37044f..dda093b914 100644 > --- 
a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c > +++ b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c > @@ -30,6 +30,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVdN4v_sin) > VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVdN4v_log) > VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVdN4v_exp) > VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVdN4vv_pow) > +VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVdN4v_acos) > > #ifndef __ILP32__ > # define VEC_INT_TYPE __m256i > diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c > index 99db4e7616..f3230463bb 100644 > --- a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c > +++ b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c > @@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVcN4v_sin) > VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVcN4v_log) > VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVcN4v_exp) > VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVcN4vv_pow) > +VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVcN4v_acos) > > #define VEC_INT_TYPE __m128i > > diff --git a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c > index 251d429ac0..cf9f52faf0 100644 > --- a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c > +++ b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c > @@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVeN8v_sin) > VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVeN8v_log) > VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVeN8v_exp) > VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVeN8vv_pow) > +VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVeN8v_acos) > > #ifndef __ILP32__ > # define VEC_INT_TYPE __m512i > diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c > new file mode 100644 > index 0000000000..1e6474dfa2 > --- /dev/null > +++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c > @@ -0,0 +1 @@ > +#include "test-float-libmvec-acosf.c" > diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c > new file mode 100644 > index 0000000000..1e6474dfa2 > --- /dev/null > +++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c > @@ -0,0 +1 @@ > +#include "test-float-libmvec-acosf.c" > diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c > new file mode 100644 > index 0000000000..1e6474dfa2 > --- /dev/null > +++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c > @@ -0,0 +1 @@ > +#include "test-float-libmvec-acosf.c" > diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf.c > new file mode 100644 > index 0000000000..fb47f974fd > --- /dev/null > +++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf.c > @@ -0,0 +1,3 @@ > +#define LIBMVEC_TYPE float > +#define LIBMVEC_FUNC acosf > +#include "test-vector-abi-arg1.h" > diff --git a/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c > index c1d14cd79e..abbd3ed870 100644 > --- a/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c > +++ b/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c > @@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVeN16v_sinf) > VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVeN16v_logf) > VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVeN16v_expf) > VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVeN16vv_powf) > +VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVeN16v_acosf) > > #define VEC_INT_TYPE __m512i > > diff --git 
a/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c > index d23c372060..8a24027952 100644 > --- a/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c > +++ b/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c > @@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVbN4v_sinf) > VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVbN4v_logf) > VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVbN4v_expf) > VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVbN4vv_powf) > +VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVbN4v_acosf) > > #define VEC_INT_TYPE __m128i > > diff --git a/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c > index 3152cffb0c..aff0442606 100644 > --- a/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c > +++ b/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c > @@ -30,6 +30,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVdN8v_sinf) > VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVdN8v_logf) > VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVdN8v_expf) > VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVdN8vv_powf) > +VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVdN8v_acosf) > > /* Redefinition of wrapper to be compatible with _ZGVdN8vvv_sincosf. */ > #undef VECTOR_WRAPPER_fFF > diff --git a/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c > index a8492abfef..913584d111 100644 > --- a/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c > +++ b/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c > @@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVcN8v_sinf) > VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVcN8v_logf) > VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVcN8v_expf) > VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVcN8vv_powf) > +VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVcN8v_acosf) > > #define VEC_INT_TYPE __m128i > > -- > 2.31.1 >
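As a reading aid for the ALGORITHM DESCRIPTION comment repeated in the new SSE4/AVX2/AVX-512 sources, a minimal scalar sketch of the scheme it describes could look like the code below. The polynomial here is only the first terms of the asin series, not the poly_coeff/sqrt_coeff tables from the patch, so treat it as an illustration of the branch structure rather than the actual implementation.

#include <math.h>

/* Scalar sketch of:
     R = (|x| >= 0.5) ? sqrt(0.5 - 0.5*|x|) : |x|
     acos(|x|) = (|x| >= 0.5) ? 2*Poly(R) : (Pi/2 - Poly(R))
     acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)  */
static float
acosf_sketch (float x)
{
  float ax = fabsf (x);
  int sel = ax >= 0.5f;
  float r = sel ? sqrtf (0.5f - 0.5f * ax) : ax;
  /* Poly(R) approximates asin(R); only the first series terms here.  */
  float poly = r + (r * r * r) / 6.0f;
  float acos_abs = sel ? 2.0f * poly : (float) M_PI_2 - poly;
  return x < 0.0f ? (float) M_PI - acos_abs : acos_abs;
}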
On Sun, Dec 19, 2021 at 12:29:07PM -0600, GNU C Library wrote: > On Sun, Dec 19, 2021 at 11:19 AM Sunil K Pandey via Libc-alpha > <libc-alpha@sourceware.org> wrote: > > > > Implement vectorized acos/acosf containing SSE, AVX, AVX2 and > > AVX512 versions for libmvec as per vector ABI. It also contains > > accuracy and ABI tests for vector acos/acosf with regenerated ulps. > > --- > > Have a few small comments but generally okay with a patch like this > one going out in > 2.35. ... > > > +#define poly_coeff_6 896 > > +#define poly_coeff_7 960 > > +#define poly_coeff_8 1024 > > +#define poly_coeff_9 1088 > > +#define poly_coeff_10 1152 > > +#define poly_coeff_11 1216 > > +#define poly_coeff_12 1280 > > +#define PiH 1344 > > +#define Pi2H 1408 > > There is enough memory here it may pay to make the accesses Did you enough registers? > sequential in memory. This is based on Intel compiler generated codes. We will evaluate Intel compiler changes. ... > > + > > +#include <sysdep.h> > > + vfmadd231pd {rn-sae}, %zmm3, %zmm11, %zmm10 > > + andl %eax, %ecx > drop I think > > > + vmovups poly_coeff_12+__svml_dacos_data_internal(%rip), %zmm11 > > + kmovw %ecx, %k3 > kandw %k4, %k2, %k3 This may not be faster since mask register can only go to port 0. We will evaluate register allocation in Intel compiler. Thanks. H.J.
On Sun, Dec 19, 2021 at 2:26 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Sun, Dec 19, 2021 at 12:29:07PM -0600, GNU C Library wrote: > > On Sun, Dec 19, 2021 at 11:19 AM Sunil K Pandey via Libc-alpha > > <libc-alpha@sourceware.org> wrote: > > > > > > Implement vectorized acos/acosf containing SSE, AVX, AVX2 and > > > AVX512 versions for libmvec as per vector ABI. It also contains > > > accuracy and ABI tests for vector acos/acosf with regenerated ulps. > > > --- > > > > Have a few small comments but generally okay with a patch like this > > one going out in > > 2.35. > > ... > > > > > > +#define poly_coeff_6 896 > > > +#define poly_coeff_7 960 > > > +#define poly_coeff_8 1024 > > > +#define poly_coeff_9 1088 > > > +#define poly_coeff_10 1152 > > > +#define poly_coeff_11 1216 > > > +#define poly_coeff_12 1280 > > > +#define PiH 1344 > > > +#define Pi2H 1408 > > > > There is enough memory here it may pay to make the accesses > > Did you enough registers? This shouldn't affect register allocation. It's just if in the program we access: poly_coeff_11 -> poly_coeff_6 -> poly_coeff_8 it might be beneficial to organize the addresses of 11/6/8 s.t its sequential memory accesses from the table i.e #define poly_coeff_11 896 #define poly_coeff_6 960 #define poly_coeff_8 1024 ... Random example and just a thought. Figure if coming in cold it might save a cache miss or two because it has an easy to recognize pattern for the HW prefetcher. Don't think it's make or break. > > > sequential in memory. > > This is based on Intel compiler generated codes. We will evaluate > Intel compiler changes. > > ... > > > > + > > > +#include <sysdep.h> > > > + vfmadd231pd {rn-sae}, %zmm3, %zmm11, %zmm10 > > > + andl %eax, %ecx > > drop I think > > > > > + vmovups poly_coeff_12+__svml_dacos_data_internal(%rip), %zmm11 > > > + kmovw %ecx, %k3 > > kandw %k4, %k2, %k3 > > This may not be faster since mask register can only go to port 0. We > will evaluate register allocation in Intel compiler. `kmovw` and `kandw` are both 1uop port0. `andl` + `kmovw` is 2 uops and has 4c latency vs `kandw` is 1 uop and 1c latency. > > > Thanks. > > H.J.
On Sun, Dec 19, 2021 at 12:42 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Sun, Dec 19, 2021 at 2:26 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > On Sun, Dec 19, 2021 at 12:29:07PM -0600, GNU C Library wrote: > > > On Sun, Dec 19, 2021 at 11:19 AM Sunil K Pandey via Libc-alpha > > > <libc-alpha@sourceware.org> wrote: > > > > > > > > Implement vectorized acos/acosf containing SSE, AVX, AVX2 and > > > > AVX512 versions for libmvec as per vector ABI. It also contains > > > > accuracy and ABI tests for vector acos/acosf with regenerated ulps. > > > > --- > > > > > > Have a few small comments but generally okay with a patch like this > > > one going out in > > > 2.35. > > > > ... > > > > > > > > > +#define poly_coeff_6 896 > > > > +#define poly_coeff_7 960 > > > > +#define poly_coeff_8 1024 > > > > +#define poly_coeff_9 1088 > > > > +#define poly_coeff_10 1152 > > > > +#define poly_coeff_11 1216 > > > > +#define poly_coeff_12 1280 > > > > +#define PiH 1344 > > > > +#define Pi2H 1408 > > > > > > There is enough memory here it may pay to make the accesses > > > > Did you enough registers? > > This shouldn't affect register allocation. > It's just if in the program we access: poly_coeff_11 -> poly_coeff_6 > -> poly_coeff_8 > > it might be beneficial to organize the addresses of 11/6/8 s.t its > sequential memory > accesses from the table i.e > #define poly_coeff_11 896 > #define poly_coeff_6 960 > #define poly_coeff_8 1024 > ... > > Random example and just a thought. Figure if coming in cold it might > save a cache miss or two because it has an easy to recognize pattern > for the HW prefetcher. Don't think it's make or break. > Good suggestion. It's difficult to hand modify. Will let compiler team know about this optimization. > > > > > sequential in memory. > > > > This is based on Intel compiler generated codes. We will evaluate > > Intel compiler changes. > > > > ... > > > > > > + > > > > +#include <sysdep.h> > > > > + vfmadd231pd {rn-sae}, %zmm3, %zmm11, %zmm10 > > > > + andl %eax, %ecx > > > drop I think > > > > > > > + vmovups poly_coeff_12+__svml_dacos_data_internal(%rip), %zmm11 > > > > + kmovw %ecx, %k3 > > > kandw %k4, %k2, %k3 > > > > This may not be faster since mask register can only go to port 0. We > > will evaluate register allocation in Intel compiler. > > `kmovw` and `kandw` are both 1uop port0. > > `andl` + `kmovw` is 2 uops and has 4c latency > vs > `kandw` is 1 uop and 1c latency. Will be fixed in v6. > > > > > > Thanks. > > > > H.J.
On Mon, Dec 20, 2021 at 10:08 AM Sunil Pandey <skpgkp2@gmail.com> wrote: > > On Sun, Dec 19, 2021 at 12:42 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > On Sun, Dec 19, 2021 at 2:26 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > > > On Sun, Dec 19, 2021 at 12:29:07PM -0600, GNU C Library wrote: > > > > On Sun, Dec 19, 2021 at 11:19 AM Sunil K Pandey via Libc-alpha > > > > <libc-alpha@sourceware.org> wrote: > > > > > > > > > > Implement vectorized acos/acosf containing SSE, AVX, AVX2 and > > > > > AVX512 versions for libmvec as per vector ABI. It also contains > > > > > accuracy and ABI tests for vector acos/acosf with regenerated ulps. > > > > > --- > > > > > > > > Have a few small comments but generally okay with a patch like this > > > > one going out in > > > > 2.35. > > > > > > ... > > > > > > > > > > > > +#define poly_coeff_6 896 > > > > > +#define poly_coeff_7 960 > > > > > +#define poly_coeff_8 1024 > > > > > +#define poly_coeff_9 1088 > > > > > +#define poly_coeff_10 1152 > > > > > +#define poly_coeff_11 1216 > > > > > +#define poly_coeff_12 1280 > > > > > +#define PiH 1344 > > > > > +#define Pi2H 1408 > > > > > > > > There is enough memory here it may pay to make the accesses > > > > > > Did you enough registers? > > > > This shouldn't affect register allocation. > > It's just if in the program we access: poly_coeff_11 -> poly_coeff_6 > > -> poly_coeff_8 > > > > it might be beneficial to organize the addresses of 11/6/8 s.t its > > sequential memory > > accesses from the table i.e > > #define poly_coeff_11 896 > > #define poly_coeff_6 960 > > #define poly_coeff_8 1024 > > ... > > > > Random example and just a thought. Figure if coming in cold it might > > save a cache miss or two because it has an easy to recognize pattern > > for the HW prefetcher. Don't think it's make or break. > > > > Good suggestion. It's difficult to hand modify. Will let compiler team know > about this optimization. Like I said, can live with/without this optimization in the first version (mostly because I think its unclear what the actual best schema is), but this patch is being submitted as asm and meant to be maintained as asm. If the only feasible way to make future changes/optimizations is to update the compiler and recompile some higher level language, that's an issue. > > > > > > > > sequential in memory. > > > > > > This is based on Intel compiler generated codes. We will evaluate > > > Intel compiler changes. > > > > > > ... > > > > > > > > + > > > > > +#include <sysdep.h> > > > > > + vfmadd231pd {rn-sae}, %zmm3, %zmm11, %zmm10 > > > > > + andl %eax, %ecx > > > > drop I think > > > > > > > > > + vmovups poly_coeff_12+__svml_dacos_data_internal(%rip), %zmm11 > > > > > + kmovw %ecx, %k3 > > > > kandw %k4, %k2, %k3 > > > > > > This may not be faster since mask register can only go to port 0. We > > > will evaluate register allocation in Intel compiler. > > > > `kmovw` and `kandw` are both 1uop port0. > > > > `andl` + `kmovw` is 2 uops and has 4c latency > > vs > > `kandw` is 1 uop and 1c latency. > > Will be fixed in v6. > > > > > > > > > > Thanks. > > > > > > H.J.
On Mon, Dec 20, 2021 at 1:20 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Mon, Dec 20, 2021 at 10:08 AM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > On Sun, Dec 19, 2021 at 12:42 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > On Sun, Dec 19, 2021 at 2:26 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > > > > > On Sun, Dec 19, 2021 at 12:29:07PM -0600, GNU C Library wrote: > > > > > On Sun, Dec 19, 2021 at 11:19 AM Sunil K Pandey via Libc-alpha > > > > > <libc-alpha@sourceware.org> wrote: > > > > > > > > > > > > Implement vectorized acos/acosf containing SSE, AVX, AVX2 and > > > > > > AVX512 versions for libmvec as per vector ABI. It also contains > > > > > > accuracy and ABI tests for vector acos/acosf with regenerated ulps. > > > > > > --- > > > > > > > > > > Have a few small comments but generally okay with a patch like this > > > > > one going out in > > > > > 2.35. > > > > > > > > ... > > > > > > > > > > > > > > > +#define poly_coeff_6 896 > > > > > > +#define poly_coeff_7 960 > > > > > > +#define poly_coeff_8 1024 > > > > > > +#define poly_coeff_9 1088 > > > > > > +#define poly_coeff_10 1152 > > > > > > +#define poly_coeff_11 1216 > > > > > > +#define poly_coeff_12 1280 > > > > > > +#define PiH 1344 > > > > > > +#define Pi2H 1408 > > > > > > > > > > There is enough memory here it may pay to make the accesses > > > > > > > > Did you enough registers? > > > > > > This shouldn't affect register allocation. > > > It's just if in the program we access: poly_coeff_11 -> poly_coeff_6 > > > -> poly_coeff_8 > > > > > > it might be beneficial to organize the addresses of 11/6/8 s.t its > > > sequential memory > > > accesses from the table i.e > > > #define poly_coeff_11 896 > > > #define poly_coeff_6 960 > > > #define poly_coeff_8 1024 > > > ... > > > > > > Random example and just a thought. Figure if coming in cold it might > > > save a cache miss or two because it has an easy to recognize pattern > > > for the HW prefetcher. Don't think it's make or break. > > > > > > > Good suggestion. It's difficult to hand modify. Will let compiler team know > > about this optimization. > > Like I said, can live with/without this optimization in the first > version (mostly > because I think its unclear what the actual best schema is), but this patch > is being submitted as asm and meant to be maintained as asm. If the > only feasible > way to make future changes/optimizations is to update the compiler and > recompile > some higher level language, that's an issue. > > > > > > > > > > > > sequential in memory. > > > > > > > > This is based on Intel compiler generated codes. We will evaluate > > > > Intel compiler changes. > > > > > > > > ... > > > > > > > > > > + > > > > > > +#include <sysdep.h> > > > > > > + vfmadd231pd {rn-sae}, %zmm3, %zmm11, %zmm10 > > > > > > + andl %eax, %ecx > > > > > drop I think > > > > > > > > > > > + vmovups poly_coeff_12+__svml_dacos_data_internal(%rip), %zmm11 > > > > > > + kmovw %ecx, %k3 > > > > > kandw %k4, %k2, %k3 > > > > > > > > This may not be faster since mask register can only go to port 0. We > > > > will evaluate register allocation in Intel compiler. > > > > > > `kmovw` and `kandw` are both 1uop port0. > > > > > > `andl` + `kmovw` is 2 uops and has 4c latency > > > vs > > > `kandw` is 1 uop and 1c latency. > > > > Will be fixed in v6. In the other patches (for other functions, this one is fine) can you have the compiler printout (maybe just a comment at the end of the line) the the live-intervals for each register assignment. 
Looking at this code there is a perception of extreme register pressure, but a lot of that seems forced by suspect instruction scheduling. It would be easier to notice that for future maintenance with the comment. > > > > > > > > > > > > > > Thanks. > > > > > > > > H.J.
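For reference, the two ways of forming %k3 discussed above can be written with AVX-512 mask intrinsics. This is only a sketch to show the instruction-selection difference (GPR AND plus a GPR-to-mask move versus a single mask-register operation); it is not code from the patch.

```c
/* Sketch only (not from the patch); build with e.g. gcc -O2 -mavx512f.  */
#include <immintrin.h>

/* andl %eax, %ecx ; kmovw %ecx, %k3 -- two uops, with a GPR->mask move.  */
__mmask16
combine_via_gpr (unsigned int eax, unsigned int ecx)
{
  return _mm512_int2mask (eax & ecx);
}

/* kandw %k4, %k2, %k3 -- a single mask-register instruction.  */
__mmask16
combine_in_mask_reg (__mmask16 k4, __mmask16 k2)
{
  return _mm512_kand (k4, k2);
}
```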
On Mon, Dec 20, 2021 at 11:20 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Mon, Dec 20, 2021 at 10:08 AM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > On Sun, Dec 19, 2021 at 12:42 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > On Sun, Dec 19, 2021 at 2:26 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > > > > > On Sun, Dec 19, 2021 at 12:29:07PM -0600, GNU C Library wrote: > > > > > On Sun, Dec 19, 2021 at 11:19 AM Sunil K Pandey via Libc-alpha > > > > > <libc-alpha@sourceware.org> wrote: > > > > > > > > > > > > Implement vectorized acos/acosf containing SSE, AVX, AVX2 and > > > > > > AVX512 versions for libmvec as per vector ABI. It also contains > > > > > > accuracy and ABI tests for vector acos/acosf with regenerated ulps. > > > > > > --- > > > > > > > > > > Have a few small comments but generally okay with a patch like this > > > > > one going out in > > > > > 2.35. > > > > > > > > ... > > > > > > > > > > > > > > > +#define poly_coeff_6 896 > > > > > > +#define poly_coeff_7 960 > > > > > > +#define poly_coeff_8 1024 > > > > > > +#define poly_coeff_9 1088 > > > > > > +#define poly_coeff_10 1152 > > > > > > +#define poly_coeff_11 1216 > > > > > > +#define poly_coeff_12 1280 > > > > > > +#define PiH 1344 > > > > > > +#define Pi2H 1408 > > > > > > > > > > There is enough memory here it may pay to make the accesses > > > > > > > > Did you enough registers? > > > > > > This shouldn't affect register allocation. > > > It's just if in the program we access: poly_coeff_11 -> poly_coeff_6 > > > -> poly_coeff_8 > > > > > > it might be beneficial to organize the addresses of 11/6/8 s.t its > > > sequential memory > > > accesses from the table i.e > > > #define poly_coeff_11 896 > > > #define poly_coeff_6 960 > > > #define poly_coeff_8 1024 > > > ... > > > > > > Random example and just a thought. Figure if coming in cold it might > > > save a cache miss or two because it has an easy to recognize pattern > > > for the HW prefetcher. Don't think it's make or break. > > > > > > > Good suggestion. It's difficult to hand modify. Will let compiler team know > > about this optimization. > > Like I said, can live with/without this optimization in the first > version (mostly > because I think its unclear what the actual best schema is), but this patch > is being submitted as asm and meant to be maintained as asm. If the > only feasible > way to make future changes/optimizations is to update the compiler and > recompile > some higher level language, that's an issue. We prefer to generate compiler optimized code. We can certainly hand optimize just like we did for other cases. For this version we want to leave as is. > > > > > > > > > > > > sequential in memory. > > > > > > > > This is based on Intel compiler generated codes. We will evaluate > > > > Intel compiler changes. > > > > > > > > ... > > > > > > > > > > + > > > > > > +#include <sysdep.h> > > > > > > + vfmadd231pd {rn-sae}, %zmm3, %zmm11, %zmm10 > > > > > > + andl %eax, %ecx > > > > > drop I think > > > > > > > > > > > + vmovups poly_coeff_12+__svml_dacos_data_internal(%rip), %zmm11 > > > > > > + kmovw %ecx, %k3 > > > > > kandw %k4, %k2, %k3 > > > > > > > > This may not be faster since mask register can only go to port 0. We > > > > will evaluate register allocation in Intel compiler. > > > > > > `kmovw` and `kandw` are both 1uop port0. > > > > > > `andl` + `kmovw` is 2 uops and has 4c latency > > > vs > > > `kandw` is 1 uop and 1c latency. > > > > Will be fixed in v6. > > > > > > > > > > > > > > Thanks. 
> > > > > > > > H.J.
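As a minimal caller sketch (not part of the patch): once the __DECL_SIMD_acos declarations and the libmvec entry points added below are in place, a plain scalar loop can be auto-vectorized by GCC into calls such as _ZGVdN4v_acos. The exact code generation depends on the compiler and options; something along the lines of -O2 -ffast-math -march=x86-64-v3 is assumed here.

```c
/* Minimal usage sketch (not part of the patch).  With math.h picking up
   the new __DECL_SIMD_acos declaration, the loop below can be vectorized
   into libmvec calls such as _ZGVdN4v_acos.  */
#include <math.h>
#include <stddef.h>

void
acos_array (const double *in, double *out, size_t n)
{
  for (size_t i = 0; i < n; i++)
    out[i] = acos (in[i]);
}
```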
diff --git a/bits/libm-simd-decl-stubs.h b/bits/libm-simd-decl-stubs.h index b80ff332a0..2ccdd1fc53 100644 --- a/bits/libm-simd-decl-stubs.h +++ b/bits/libm-simd-decl-stubs.h @@ -98,4 +98,15 @@ #define __DECL_SIMD_powf32x #define __DECL_SIMD_powf64x #define __DECL_SIMD_powf128x + +#define __DECL_SIMD_acos +#define __DECL_SIMD_acosf +#define __DECL_SIMD_acosl +#define __DECL_SIMD_acosf16 +#define __DECL_SIMD_acosf32 +#define __DECL_SIMD_acosf64 +#define __DECL_SIMD_acosf128 +#define __DECL_SIMD_acosf32x +#define __DECL_SIMD_acosf64x +#define __DECL_SIMD_acosf128x #endif diff --git a/math/bits/mathcalls.h b/math/bits/mathcalls.h index da4cf4e10c..2cc6654208 100644 --- a/math/bits/mathcalls.h +++ b/math/bits/mathcalls.h @@ -50,7 +50,7 @@ /* Trigonometric functions. */ /* Arc cosine of X. */ -__MATHCALL (acos,, (_Mdouble_ __x)); +__MATHCALL_VEC (acos,, (_Mdouble_ __x)); /* Arc sine of X. */ __MATHCALL (asin,, (_Mdouble_ __x)); /* Arc tangent of X. */ diff --git a/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist index 363d4ace1e..b37b55777e 100644 --- a/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist +++ b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist @@ -46,3 +46,11 @@ GLIBC_2.22 _ZGVeN8v_log F GLIBC_2.22 _ZGVeN8v_sin F GLIBC_2.22 _ZGVeN8vv_pow F GLIBC_2.22 _ZGVeN8vvv_sincos F +GLIBC_2.35 _ZGVbN2v_acos F +GLIBC_2.35 _ZGVbN4v_acosf F +GLIBC_2.35 _ZGVcN4v_acos F +GLIBC_2.35 _ZGVcN8v_acosf F +GLIBC_2.35 _ZGVdN4v_acos F +GLIBC_2.35 _ZGVdN8v_acosf F +GLIBC_2.35 _ZGVeN16v_acosf F +GLIBC_2.35 _ZGVeN8v_acos F diff --git a/sysdeps/x86/fpu/bits/math-vector.h b/sysdeps/x86/fpu/bits/math-vector.h index dc0bfb3705..dabb74cbb9 100644 --- a/sysdeps/x86/fpu/bits/math-vector.h +++ b/sysdeps/x86/fpu/bits/math-vector.h @@ -58,6 +58,10 @@ # define __DECL_SIMD_pow __DECL_SIMD_x86_64 # undef __DECL_SIMD_powf # define __DECL_SIMD_powf __DECL_SIMD_x86_64 +# undef __DECL_SIMD_acos +# define __DECL_SIMD_acos __DECL_SIMD_x86_64 +# undef __DECL_SIMD_acosf +# define __DECL_SIMD_acosf __DECL_SIMD_x86_64 # endif #endif diff --git a/sysdeps/x86/fpu/finclude/math-vector-fortran.h b/sysdeps/x86/fpu/finclude/math-vector-fortran.h index 311bb4e391..4bcbd1fbce 100644 --- a/sysdeps/x86/fpu/finclude/math-vector-fortran.h +++ b/sysdeps/x86/fpu/finclude/math-vector-fortran.h @@ -28,6 +28,8 @@ !GCC$ builtin (expf) attributes simd (notinbranch) if('x86_64') !GCC$ builtin (pow) attributes simd (notinbranch) if('x86_64') !GCC$ builtin (powf) attributes simd (notinbranch) if('x86_64') +!GCC$ builtin (acos) attributes simd (notinbranch) if('x86_64') +!GCC$ builtin (acosf) attributes simd (notinbranch) if('x86_64') !GCC$ builtin (cos) attributes simd (notinbranch) if('x32') !GCC$ builtin (cosf) attributes simd (notinbranch) if('x32') @@ -41,3 +43,5 @@ !GCC$ builtin (expf) attributes simd (notinbranch) if('x32') !GCC$ builtin (pow) attributes simd (notinbranch) if('x32') !GCC$ builtin (powf) attributes simd (notinbranch) if('x32') +!GCC$ builtin (acos) attributes simd (notinbranch) if('x32') +!GCC$ builtin (acosf) attributes simd (notinbranch) if('x32') diff --git a/sysdeps/x86_64/fpu/Makeconfig b/sysdeps/x86_64/fpu/Makeconfig index b0e3bf7887..7acf1f306c 100644 --- a/sysdeps/x86_64/fpu/Makeconfig +++ b/sysdeps/x86_64/fpu/Makeconfig @@ -22,6 +22,7 @@ postclean-generated += libmvec.mk # Define for both math and mathvec directories. 
libmvec-funcs = \ + acos \ cos \ exp \ log \ diff --git a/sysdeps/x86_64/fpu/Versions b/sysdeps/x86_64/fpu/Versions index 08132045d6..2985fe7ca7 100644 --- a/sysdeps/x86_64/fpu/Versions +++ b/sysdeps/x86_64/fpu/Versions @@ -13,4 +13,8 @@ libmvec { _ZGVbN4vv_powf; _ZGVcN8vv_powf; _ZGVdN8vv_powf; _ZGVeN16vv_powf; _ZGVbN4vvv_sincosf; _ZGVcN8vvv_sincosf; _ZGVdN8vvv_sincosf; _ZGVeN16vvv_sincosf; } + GLIBC_2.35 { + _ZGVbN2v_acos; _ZGVcN4v_acos; _ZGVdN4v_acos; _ZGVeN8v_acos; + _ZGVbN4v_acosf; _ZGVcN8v_acosf; _ZGVdN8v_acosf; _ZGVeN16v_acosf; + } } diff --git a/sysdeps/x86_64/fpu/libm-test-ulps b/sysdeps/x86_64/fpu/libm-test-ulps index 312575f933..85a568ed29 100644 --- a/sysdeps/x86_64/fpu/libm-test-ulps +++ b/sysdeps/x86_64/fpu/libm-test-ulps @@ -25,6 +25,26 @@ float: 1 float128: 1 ldouble: 2 +Function: "acos_vlen16": +float: 1 + +Function: "acos_vlen2": +double: 1 + +Function: "acos_vlen4": +double: 1 +float: 2 + +Function: "acos_vlen4_avx2": +double: 1 + +Function: "acos_vlen8": +double: 1 +float: 2 + +Function: "acos_vlen8_avx2": +float: 1 + Function: "acosh": double: 2 float: 2 diff --git a/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h b/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h new file mode 100644 index 0000000000..3aed563dde --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/ifunc-mathvec-avx512-skx.h @@ -0,0 +1,39 @@ +/* Common definition for libmathvec ifunc selections optimized with + AVX512. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <init-arch.h> + +#undef PASTER2 +#define PASTER2(x,y) x##_##y + +extern void REDIRECT_NAME (void); +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_wrapper) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (skx) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (!CPU_FEATURES_ARCH_P (cpu_features, MathVec_Prefer_No_AVX512) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)) + return OPTIMIZE (skx); + + return OPTIMIZE (avx2_wrapper); +} diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S new file mode 100644 index 0000000000..25fb8d0cac --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core-sse2.S @@ -0,0 +1,20 @@ +/* SSE2 version of vectorized acos, vector length is 2. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#define _ZGVbN2v_acos _ZGVbN2v_acos_sse2 +#include "../svml_d_acos2_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c new file mode 100644 index 0000000000..5ba5d6fac2 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core.c @@ -0,0 +1,27 @@ +/* Multiple versions of vectorized acos, vector length is 2. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVbN2v_acos +#include "ifunc-mathvec-sse4_1.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVbN2v_acos, __GI__ZGVbN2v_acos, __redirect__ZGVbN2v_acos) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S new file mode 100644 index 0000000000..2c528c012e --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos2_core_sse4.S @@ -0,0 +1,293 @@ +/* Function acos vectorized with SSE4. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + https://www.gnu.org/licenses/. */ + +/* + * ALGORITHM DESCRIPTION: + * + * SelMask = (|x| >= 0.5) ? 1 : 0; + * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x| + * acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R)) + * acos(x) = sign(x) ? 
(Pi - acos(|x|)) : acos(|x|) + * + */ + +/* Offsets for data table __svml_dacos_data_internal + */ +#define SgnBit 0 +#define OneHalf 16 +#define SmallNorm 32 +#define MOne 48 +#define Two 64 +#define sqrt_coeff 80 +#define poly_coeff 144 +#define PiH 336 +#define Pi2H 352 + +#include <sysdep.h> + + .text + .section .text.sse4,"ax",@progbits +ENTRY(_ZGVbN2v_acos_sse4) + subq $72, %rsp + cfi_def_cfa_offset(80) + movaps %xmm0, %xmm5 + movups __svml_dacos_data_internal(%rip), %xmm3 + movups OneHalf+__svml_dacos_data_internal(%rip), %xmm6 + +/* x = -|arg| */ + movaps %xmm3, %xmm4 + orps %xmm5, %xmm4 + +/* Y = 0.5 + 0.5*(-x) */ + movaps %xmm6, %xmm7 + mulpd %xmm4, %xmm7 + addpd %xmm7, %xmm6 + +/* S ~ 2*sqrt(Y) */ + cvtpd2ps %xmm6, %xmm9 + movlhps %xmm9, %xmm9 + +/* x^2 */ + movaps %xmm4, %xmm0 + rsqrtps %xmm9, %xmm10 + mulpd %xmm4, %xmm0 + cvtps2pd %xmm10, %xmm11 + minpd %xmm6, %xmm0 + movaps %xmm6, %xmm1 + movaps %xmm0, %xmm2 + cmpltpd SmallNorm+__svml_dacos_data_internal(%rip), %xmm1 + cmpnltpd %xmm6, %xmm2 + addpd %xmm6, %xmm6 + andnps %xmm11, %xmm1 + movaps %xmm0, %xmm11 + movaps %xmm1, %xmm12 + andps %xmm5, %xmm3 + mulpd %xmm1, %xmm12 + mulpd %xmm6, %xmm1 + mulpd %xmm12, %xmm6 + mulpd %xmm0, %xmm11 + subpd Two+__svml_dacos_data_internal(%rip), %xmm6 + movups sqrt_coeff+__svml_dacos_data_internal(%rip), %xmm13 + movaps %xmm6, %xmm14 + mulpd %xmm6, %xmm13 + mulpd %xmm1, %xmm14 + addpd sqrt_coeff+16+__svml_dacos_data_internal(%rip), %xmm13 + mulpd %xmm6, %xmm13 + addpd sqrt_coeff+32+__svml_dacos_data_internal(%rip), %xmm13 + mulpd %xmm13, %xmm6 + +/* polynomial */ + movups poly_coeff+__svml_dacos_data_internal(%rip), %xmm15 + movaps %xmm11, %xmm7 + mulpd %xmm0, %xmm15 + addpd sqrt_coeff+48+__svml_dacos_data_internal(%rip), %xmm6 + addpd poly_coeff+16+__svml_dacos_data_internal(%rip), %xmm15 + mulpd %xmm11, %xmm7 + mulpd %xmm6, %xmm14 + mulpd %xmm11, %xmm15 + subpd %xmm14, %xmm1 + movups MOne+__svml_dacos_data_internal(%rip), %xmm8 + andps %xmm2, %xmm1 + +/* NaN processed in special branch (so wind test passed) */ + cmpnlepd %xmm4, %xmm8 + movmskpd %xmm8, %edx + +/* X<X^2 iff X<0 */ + movaps %xmm5, %xmm12 + movups poly_coeff+32+__svml_dacos_data_internal(%rip), %xmm8 + movaps %xmm2, %xmm13 + movups poly_coeff+64+__svml_dacos_data_internal(%rip), %xmm6 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm6 + addpd poly_coeff+48+__svml_dacos_data_internal(%rip), %xmm8 + addpd poly_coeff+80+__svml_dacos_data_internal(%rip), %xmm6 + cmpltpd %xmm0, %xmm12 + addpd %xmm15, %xmm8 + mulpd %xmm11, %xmm6 + mulpd %xmm7, %xmm8 + movups poly_coeff+96+__svml_dacos_data_internal(%rip), %xmm9 + mulpd %xmm0, %xmm9 + addpd poly_coeff+112+__svml_dacos_data_internal(%rip), %xmm9 + addpd %xmm6, %xmm9 + movups poly_coeff+128+__svml_dacos_data_internal(%rip), %xmm10 + movaps %xmm2, %xmm6 + mulpd %xmm0, %xmm10 + addpd %xmm8, %xmm9 + addpd poly_coeff+144+__svml_dacos_data_internal(%rip), %xmm10 + mulpd %xmm11, %xmm9 + movups poly_coeff+160+__svml_dacos_data_internal(%rip), %xmm14 + andnps %xmm4, %xmm6 + addpd %xmm9, %xmm10 + mulpd %xmm0, %xmm14 + mulpd %xmm10, %xmm11 + addpd poly_coeff+176+__svml_dacos_data_internal(%rip), %xmm14 + addpd %xmm11, %xmm14 + mulpd %xmm0, %xmm14 + orps %xmm1, %xmm6 + pxor %xmm3, %xmm6 + mulpd %xmm6, %xmm14 + movups PiH+__svml_dacos_data_internal(%rip), %xmm0 + andps %xmm2, %xmm0 + andnps Pi2H+__svml_dacos_data_internal(%rip), %xmm13 + andps %xmm12, %xmm0 + addpd %xmm13, %xmm0 + addpd %xmm14, %xmm6 + addpd %xmm6, %xmm0 + testl %edx, %edx + +/* Go to special inputs processing branch */ + jne 
L(SPECIAL_VALUES_BRANCH) + +/* Restore registers + * and exit the function + */ + +L(EXIT): + addq $72, %rsp + cfi_def_cfa_offset(8) + ret + cfi_def_cfa_offset(80) + +/* Branch to process + * special inputs + */ + +L(SPECIAL_VALUES_BRANCH): + movups %xmm5, 32(%rsp) + movups %xmm0, 48(%rsp) + xorl %eax, %eax + movq %r12, 16(%rsp) + cfi_offset(12, -64) + movl %eax, %r12d + movq %r13, 8(%rsp) + cfi_offset(13, -72) + movl %edx, %r13d + movq %r14, (%rsp) + cfi_offset(14, -80) + +/* Range mask + * bits check + */ + +L(RANGEMASK_CHECK): + btl %r12d, %r13d + +/* Call scalar math function */ + jc L(SCALAR_MATH_CALL) + +/* Special inputs + * processing loop + */ + +L(SPECIAL_VALUES_LOOP): + incl %r12d + cmpl $2, %r12d + +/* Check bits in range mask */ + jl L(RANGEMASK_CHECK) + movq 16(%rsp), %r12 + cfi_restore(12) + movq 8(%rsp), %r13 + cfi_restore(13) + movq (%rsp), %r14 + cfi_restore(14) + movups 48(%rsp), %xmm0 + +/* Go to exit */ + jmp L(EXIT) + cfi_offset(12, -64) + cfi_offset(13, -72) + cfi_offset(14, -80) + +/* Scalar math fucntion call + * to process special input + */ + +L(SCALAR_MATH_CALL): + movl %r12d, %r14d + movsd 32(%rsp,%r14,8), %xmm0 + call acos@PLT + movsd %xmm0, 48(%rsp,%r14,8) + +/* Process special inputs in loop */ + jmp L(SPECIAL_VALUES_LOOP) + +END(_ZGVbN2v_acos_sse4) + + .section .rodata, "a" + .align 16 + +#ifdef __svml_dacos_data_internal_typedef +typedef unsigned int VUINT32; +typedef struct { + __declspec(align(16)) VUINT32 SgnBit[2][2]; + __declspec(align(16)) VUINT32 OneHalf[2][2]; + __declspec(align(16)) VUINT32 SmallNorm[2][2]; + __declspec(align(16)) VUINT32 MOne[2][2]; + __declspec(align(16)) VUINT32 Two[2][2]; + __declspec(align(16)) VUINT32 sqrt_coeff[4][2][2]; + __declspec(align(16)) VUINT32 poly_coeff[12][2][2]; + __declspec(align(16)) VUINT32 PiH[2][2]; + __declspec(align(16)) VUINT32 Pi2H[2][2]; +} __svml_dacos_data_internal; +#endif +__svml_dacos_data_internal: + /*== SgnBit ==*/ + .quad 0x8000000000000000, 0x8000000000000000 + /*== OneHalf ==*/ + .align 16 + .quad 0x3fe0000000000000, 0x3fe0000000000000 + /*== SmallNorm ==*/ + .align 16 + .quad 0x3000000000000000, 0x3000000000000000 + /*== MOne ==*/ + .align 16 + .quad 0xbff0000000000000, 0xbff0000000000000 + /*== Two ==*/ + .align 16 + .quad 0x4000000000000000, 0x4000000000000000 + /*== sqrt_coeff[4] ==*/ + .align 16 + .quad 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */ + .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */ + .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */ + .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */ + /*== poly_coeff[12] ==*/ + .align 16 + .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */ + .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */ + .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */ + .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */ + .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */ + .quad 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */ + .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */ + .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */ + .quad 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */ + .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */ + .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */ + .quad 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */ + /*== PiH ==*/ + .align 16 + .quad 0x400921fb54442d18, 0x400921fb54442d18 + /*== Pi2H ==*/ + .align 16 + .quad 0x3ff921fb54442d18, 
0x3ff921fb54442d18 + .align 16 + .type __svml_dacos_data_internal,@object + .size __svml_dacos_data_internal,.-__svml_dacos_data_internal diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S new file mode 100644 index 0000000000..750f71c81c --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core-sse.S @@ -0,0 +1,20 @@ +/* SSE version of vectorized acos, vector length is 4. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#define _ZGVdN4v_acos _ZGVdN4v_acos_sse_wrapper +#include "../svml_d_acos4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c new file mode 100644 index 0000000000..6453e7ebe2 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core.c @@ -0,0 +1,27 @@ +/* Multiple versions of vectorized acos, vector length is 4. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVdN4v_acos +#include "ifunc-mathvec-avx2.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVdN4v_acos, __GI__ZGVdN4v_acos, __redirect__ZGVdN4v_acos) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S new file mode 100644 index 0000000000..172080e3ea --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S @@ -0,0 +1,273 @@ +/* Function acos vectorized with AVX2. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + https://www.gnu.org/licenses/. */ + +/* + * ALGORITHM DESCRIPTION: + * + * SelMask = (|x| >= 0.5) ? 1 : 0; + * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x| + * acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R)) + * acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|) + * + */ + +/* Offsets for data table __svml_dacos_data_internal + */ +#define SgnBit 0 +#define OneHalf 32 +#define SmallNorm 64 +#define MOne 96 +#define Two 128 +#define sqrt_coeff 160 +#define poly_coeff 288 +#define PiH 672 +#define Pi2H 704 + +#include <sysdep.h> + + .text + .section .text.avx2,"ax",@progbits +ENTRY(_ZGVdN4v_acos_avx2) + pushq %rbp + cfi_def_cfa_offset(16) + movq %rsp, %rbp + cfi_def_cfa(6, 16) + cfi_offset(6, -16) + andq $-32, %rsp + subq $96, %rsp + vmovupd __svml_dacos_data_internal(%rip), %ymm6 + vmovupd OneHalf+__svml_dacos_data_internal(%rip), %ymm7 + vmovapd %ymm0, %ymm5 + +/* x = -|arg| */ + vorpd %ymm5, %ymm6, %ymm4 + +/* Y = 0.5 + 0.5*(-x) */ + vfmadd231pd %ymm4, %ymm7, %ymm7 + +/* x^2 */ + vmulpd %ymm4, %ymm4, %ymm8 + +/* S ~ 2*sqrt(Y) */ + vmovupd sqrt_coeff+__svml_dacos_data_internal(%rip), %ymm0 + vcmplt_oqpd SmallNorm+__svml_dacos_data_internal(%rip), %ymm7, %ymm12 + vminpd %ymm7, %ymm8, %ymm2 + +/* NaN processed in special branch (so wind test passed) */ + vcmpnge_uqpd MOne+__svml_dacos_data_internal(%rip), %ymm4, %ymm9 + vcvtpd2ps %ymm7, %xmm10 + vmovupd poly_coeff+64+__svml_dacos_data_internal(%rip), %ymm8 + vcmpnlt_uqpd %ymm7, %ymm2, %ymm1 + vrsqrtps %xmm10, %xmm11 + vfmadd213pd poly_coeff+96+__svml_dacos_data_internal(%rip), %ymm2, %ymm8 + vcvtps2pd %xmm11, %ymm13 + vmovupd poly_coeff+128+__svml_dacos_data_internal(%rip), %ymm11 + vandnpd %ymm13, %ymm12, %ymm14 + vmulpd %ymm14, %ymm14, %ymm15 + vfmadd213pd poly_coeff+160+__svml_dacos_data_internal(%rip), %ymm2, %ymm11 + vmulpd %ymm2, %ymm2, %ymm13 + vmovupd poly_coeff+256+__svml_dacos_data_internal(%rip), %ymm12 + vmulpd %ymm13, %ymm13, %ymm10 + vfmadd213pd poly_coeff+288+__svml_dacos_data_internal(%rip), %ymm2, %ymm12 + vandpd %ymm5, %ymm6, %ymm3 + vaddpd %ymm7, %ymm7, %ymm6 + vmulpd %ymm6, %ymm14, %ymm7 + vfmsub213pd Two+__svml_dacos_data_internal(%rip), %ymm15, %ymm6 + vmovupd poly_coeff+320+__svml_dacos_data_internal(%rip), %ymm14 + vfmadd213pd sqrt_coeff+32+__svml_dacos_data_internal(%rip), %ymm6, %ymm0 + vmulpd %ymm6, %ymm7, %ymm15 + vfmadd213pd poly_coeff+352+__svml_dacos_data_internal(%rip), %ymm2, %ymm14 + vfmadd213pd sqrt_coeff+64+__svml_dacos_data_internal(%rip), %ymm6, %ymm0 + vfmadd213pd sqrt_coeff+96+__svml_dacos_data_internal(%rip), %ymm6, %ymm0 + +/* polynomial */ + vmovupd poly_coeff+__svml_dacos_data_internal(%rip), %ymm6 + vfnmadd213pd %ymm7, %ymm15, %ymm0 + vfmadd213pd poly_coeff+32+__svml_dacos_data_internal(%rip), %ymm2, %ymm6 + vblendvpd %ymm1, %ymm0, %ymm4, %ymm0 + vfmadd213pd %ymm8, %ymm13, %ymm6 + vmovmskpd %ymm9, %edx + vmovupd poly_coeff+192+__svml_dacos_data_internal(%rip), %ymm9 + vfmadd213pd poly_coeff+224+__svml_dacos_data_internal(%rip), %ymm2, %ymm9 + vfmadd213pd %ymm9, %ymm13, %ymm11 + vfmadd213pd %ymm11, %ymm10, %ymm6 + vfmadd213pd %ymm12, %ymm13, %ymm6 + vfmadd213pd %ymm14, %ymm13, %ymm6 + vmulpd %ymm6, %ymm2, %ymm9 + +/* X<X^2 iff X<0 */ + vcmplt_oqpd %ymm2, %ymm5, %ymm6 + vandpd PiH+__svml_dacos_data_internal(%rip), %ymm1, %ymm2 + vandnpd Pi2H+__svml_dacos_data_internal(%rip), %ymm1, %ymm7 + vxorpd %ymm3, %ymm0, %ymm1 + 
vfmadd213pd %ymm1, %ymm1, %ymm9 + vandpd %ymm6, %ymm2, %ymm2 + vaddpd %ymm7, %ymm2, %ymm8 + vaddpd %ymm9, %ymm8, %ymm0 + testl %edx, %edx + +/* Go to special inputs processing branch */ + jne L(SPECIAL_VALUES_BRANCH) + +/* Restore registers + * and exit the function + */ + +L(EXIT): + movq %rbp, %rsp + popq %rbp + cfi_def_cfa(7, 8) + cfi_restore(6) + ret + cfi_def_cfa(6, 16) + cfi_offset(6, -16) + +/* Branch to process + * special inputs + */ + +L(SPECIAL_VALUES_BRANCH): + vmovupd %ymm5, 32(%rsp) + vmovupd %ymm0, 64(%rsp) + xorl %eax, %eax + vzeroupper + movq %r12, 16(%rsp) + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */ + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22 + movl %eax, %r12d + movq %r13, 8(%rsp) + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */ + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22 + movl %edx, %r13d + movq %r14, (%rsp) + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */ + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22 + +/* Range mask + * bits check + */ + +L(RANGEMASK_CHECK): + btl %r12d, %r13d + +/* Call scalar math function */ + jc L(SCALAR_MATH_CALL) + +/* Special inputs + * processing loop + */ + +L(SPECIAL_VALUES_LOOP): + incl %r12d + cmpl $4, %r12d + +/* Check bits in range mask */ + jl L(RANGEMASK_CHECK) + movq 16(%rsp), %r12 + cfi_restore(12) + movq 8(%rsp), %r13 + cfi_restore(13) + movq (%rsp), %r14 + cfi_restore(14) + vmovupd 64(%rsp), %ymm0 + +/* Go to exit */ + jmp L(EXIT) + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */ + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22 + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */ + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22 + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */ + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22 + +/* Scalar math fucntion call + * to process special input + */ + +L(SCALAR_MATH_CALL): + movl %r12d, %r14d + movsd 32(%rsp,%r14,8), %xmm0 + call acos@PLT + movsd %xmm0, 64(%rsp,%r14,8) + +/* Process special inputs in loop */ + jmp L(SPECIAL_VALUES_LOOP) + +END(_ZGVdN4v_acos_avx2) + + .section .rodata, "a" + .align 32 + +#ifdef __svml_dacos_data_internal_typedef +typedef unsigned int VUINT32; +typedef struct { + __declspec(align(32)) VUINT32 SgnBit[4][2]; + __declspec(align(32)) VUINT32 OneHalf[4][2]; + __declspec(align(32)) VUINT32 SmallNorm[4][2]; + __declspec(align(32)) VUINT32 MOne[4][2]; + __declspec(align(32)) VUINT32 Two[4][2]; + __declspec(align(32)) VUINT32 sqrt_coeff[4][4][2]; + __declspec(align(32)) VUINT32 poly_coeff[12][4][2]; + __declspec(align(32)) VUINT32 PiH[4][2]; + __declspec(align(32)) VUINT32 Pi2H[4][2]; +} __svml_dacos_data_internal; +#endif +__svml_dacos_data_internal: + /*== SgnBit ==*/ + .quad 0x8000000000000000, 
0x8000000000000000, 0x8000000000000000, 0x8000000000000000 + /*== OneHalf ==*/ + .align 32 + .quad 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000 + /*== SmallNorm ==*/ + .align 32 + .quad 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000 + /*== MOne ==*/ + .align 32 + .quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000 + /*== Two ==*/ + .align 32 + .quad 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000 + /*== sqrt_coeff[4] ==*/ + .align 32 + .quad 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */ + .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */ + .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */ + .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */ + /*== poly_coeff[12] ==*/ + .align 32 + .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */ + .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */ + .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */ + .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */ + .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */ + .quad 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */ + .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */ + .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */ + .quad 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */ + .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */ + .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */ + .quad 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */ + /*== PiH ==*/ + .align 32 + .quad 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18 + /*== Pi2H ==*/ + .align 32 + .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18 + .align 32 + .type __svml_dacos_data_internal,@object + .size __svml_dacos_data_internal,.-__svml_dacos_data_internal diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S new file mode 100644 index 0000000000..4d64fd1c00 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core-avx2.S @@ -0,0 +1,20 @@ +/* AVX2 version of vectorized acos, vector length is 8. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#define _ZGVeN8v_acos _ZGVeN8v_acos_avx2_wrapper +#include "../svml_d_acos8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c new file mode 100644 index 0000000000..1e7d1865fb --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core.c @@ -0,0 +1,27 @@ +/* Multiple versions of vectorized acos, vector length is 8. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVeN8v_acos +#include "ifunc-mathvec-avx512-skx.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVeN8v_acos, __GI__ZGVeN8v_acos, __redirect__ZGVeN8v_acos) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S new file mode 100644 index 0000000000..76ca35ad7b --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S @@ -0,0 +1,298 @@ +/* Function acos vectorized with AVX-512. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + https://www.gnu.org/licenses/. */ + +/* + * ALGORITHM DESCRIPTION: + * + * SelMask = (|x| >= 0.5) ? 1 : 0; + * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x| + * acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R)) + * acos(x) = sign(x) ? 
(Pi - acos(|x|)) : acos(|x|) + * + */ + +/* Offsets for data table __svml_dacos_data_internal + */ +#define SgnBit 0 +#define OneHalf 64 +#define SmallNorm 128 +#define MOne 192 +#define Two 256 +#define sqrt_coeff_1 320 +#define sqrt_coeff_2 384 +#define sqrt_coeff_3 448 +#define sqrt_coeff_4 512 +#define poly_coeff_1 576 +#define poly_coeff_2 640 +#define poly_coeff_3 704 +#define poly_coeff_4 768 +#define poly_coeff_5 832 +#define poly_coeff_6 896 +#define poly_coeff_7 960 +#define poly_coeff_8 1024 +#define poly_coeff_9 1088 +#define poly_coeff_10 1152 +#define poly_coeff_11 1216 +#define poly_coeff_12 1280 +#define PiH 1344 +#define Pi2H 1408 + +#include <sysdep.h> + + .text + .section .text.evex512,"ax",@progbits +ENTRY(_ZGVeN8v_acos_skx) + pushq %rbp + cfi_def_cfa_offset(16) + movq %rsp, %rbp + cfi_def_cfa(6, 16) + cfi_offset(6, -16) + andq $-64, %rsp + subq $192, %rsp + vmovups __svml_dacos_data_internal(%rip), %zmm7 + vmovups OneHalf+__svml_dacos_data_internal(%rip), %zmm8 + +/* S ~ 2*sqrt(Y) */ + vmovups SmallNorm+__svml_dacos_data_internal(%rip), %zmm11 + vmovups Two+__svml_dacos_data_internal(%rip), %zmm14 + vmovups sqrt_coeff_1+__svml_dacos_data_internal(%rip), %zmm15 + vmovups sqrt_coeff_2+__svml_dacos_data_internal(%rip), %zmm2 + vmovups sqrt_coeff_3+__svml_dacos_data_internal(%rip), %zmm1 + vmovups MOne+__svml_dacos_data_internal(%rip), %zmm10 + vmovaps %zmm0, %zmm6 + +/* x = -|arg| */ + vorpd %zmm6, %zmm7, %zmm5 + vandpd %zmm6, %zmm7, %zmm4 + +/* Y = 0.5 + 0.5*(-x) */ + vfmadd231pd {rn-sae}, %zmm5, %zmm8, %zmm8 + +/* x^2 */ + vmulpd {rn-sae}, %zmm5, %zmm5, %zmm9 + vrsqrt14pd %zmm8, %zmm12 + vcmppd $17, {sae}, %zmm11, %zmm8, %k1 + vcmppd $17, {sae}, %zmm10, %zmm5, %k0 + vmovups poly_coeff_5+__svml_dacos_data_internal(%rip), %zmm10 + vmovups poly_coeff_7+__svml_dacos_data_internal(%rip), %zmm11 + vminpd {sae}, %zmm8, %zmm9, %zmm3 + vmovups poly_coeff_3+__svml_dacos_data_internal(%rip), %zmm9 + vxorpd %zmm12, %zmm12, %zmm12{%k1} + vaddpd {rn-sae}, %zmm8, %zmm8, %zmm0 + vcmppd $21, {sae}, %zmm8, %zmm3, %k4 + +/* X<X^2 iff X<0 */ + vcmppd $17, {sae}, %zmm3, %zmm6, %k2 + vmulpd {rn-sae}, %zmm12, %zmm12, %zmm13 + vmulpd {rn-sae}, %zmm12, %zmm0, %zmm7 + vmovups poly_coeff_4+__svml_dacos_data_internal(%rip), %zmm12 + +/* polynomial */ + vmovups poly_coeff_1+__svml_dacos_data_internal(%rip), %zmm8 + vfmsub213pd {rn-sae}, %zmm14, %zmm13, %zmm0 + vmovups sqrt_coeff_4+__svml_dacos_data_internal(%rip), %zmm13 + vfmadd231pd {rn-sae}, %zmm3, %zmm9, %zmm12 + vmovups poly_coeff_11+__svml_dacos_data_internal(%rip), %zmm9 + vfmadd231pd {rn-sae}, %zmm0, %zmm15, %zmm2 + vmovups poly_coeff_9+__svml_dacos_data_internal(%rip), %zmm15 + vmulpd {rn-sae}, %zmm0, %zmm7, %zmm14 + vfmadd213pd {rn-sae}, %zmm1, %zmm0, %zmm2 + vmovups poly_coeff_2+__svml_dacos_data_internal(%rip), %zmm1 + kmovw %k4, %eax + kmovw %k2, %ecx + kmovw %k0, %edx + vfmadd213pd {rn-sae}, %zmm13, %zmm0, %zmm2 + vfmadd231pd {rn-sae}, %zmm3, %zmm8, %zmm1 + vmovups poly_coeff_10+__svml_dacos_data_internal(%rip), %zmm8 + vmulpd {rn-sae}, %zmm3, %zmm3, %zmm0 + vfnmadd213pd {rn-sae}, %zmm7, %zmm14, %zmm2 + vmovups poly_coeff_6+__svml_dacos_data_internal(%rip), %zmm7 + vfmadd231pd {rn-sae}, %zmm3, %zmm15, %zmm8 + vfmadd213pd {rn-sae}, %zmm12, %zmm0, %zmm1 + vblendmpd %zmm2, %zmm5, %zmm2{%k4} + vfmadd231pd {rn-sae}, %zmm3, %zmm10, %zmm7 + vmovups poly_coeff_8+__svml_dacos_data_internal(%rip), %zmm10 + vfmadd231pd {rn-sae}, %zmm3, %zmm11, %zmm10 + andl %eax, %ecx + vmovups poly_coeff_12+__svml_dacos_data_internal(%rip), %zmm11 + kmovw %ecx, 
%k3 + vfmadd213pd {rn-sae}, %zmm10, %zmm0, %zmm7 + vfmadd231pd {rn-sae}, %zmm3, %zmm9, %zmm11 + vmulpd {rn-sae}, %zmm0, %zmm0, %zmm10 + vfmadd213pd {rn-sae}, %zmm7, %zmm10, %zmm1 + vfmadd213pd {rn-sae}, %zmm8, %zmm0, %zmm1 + vfmadd213pd {rn-sae}, %zmm11, %zmm0, %zmm1 + vmovups Pi2H+__svml_dacos_data_internal(%rip), %zmm0 + vmulpd {rn-sae}, %zmm3, %zmm1, %zmm1 + vxorpd %zmm4, %zmm2, %zmm3 + vxorpd %zmm0, %zmm0, %zmm0{%k4} + vfmadd213pd {rn-sae}, %zmm3, %zmm3, %zmm1 + vorpd PiH+__svml_dacos_data_internal(%rip), %zmm0, %zmm0{%k3} + vaddpd {rn-sae}, %zmm1, %zmm0, %zmm0 + testl %edx, %edx + +/* Go to special inputs processing branch */ + jne L(SPECIAL_VALUES_BRANCH) + +/* Restore registers + * and exit the function + */ + +L(EXIT): + movq %rbp, %rsp + popq %rbp + cfi_def_cfa(7, 8) + cfi_restore(6) + ret + cfi_def_cfa(6, 16) + cfi_offset(6, -16) + +/* Branch to process + * special inputs + */ + +L(SPECIAL_VALUES_BRANCH): + vmovups %zmm6, 64(%rsp) + vmovups %zmm0, 128(%rsp) + xorl %eax, %eax + vzeroupper + movq %r12, 16(%rsp) + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */ + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22 + movl %eax, %r12d + movq %r13, 8(%rsp) + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */ + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22 + movl %edx, %r13d + movq %r14, (%rsp) + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */ + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22 + +/* Range mask + * bits check + */ + +L(RANGEMASK_CHECK): + btl %r12d, %r13d + +/* Call scalar math function */ + jc L(SCALAR_MATH_CALL) + +/* Special inputs + * processing loop + */ + +L(SPECIAL_VALUES_LOOP): + incl %r12d + cmpl $8, %r12d + +/* Check bits in range mask */ + jl L(RANGEMASK_CHECK) + movq 16(%rsp), %r12 + cfi_restore(12) + movq 8(%rsp), %r13 + cfi_restore(13) + movq (%rsp), %r14 + cfi_restore(14) + vmovups 128(%rsp), %zmm0 + +/* Go to exit */ + jmp L(EXIT) + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */ + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22 + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */ + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22 + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */ + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22 + +/* Scalar math fucntion call + * to process special input + */ + +L(SCALAR_MATH_CALL): + movl %r12d, %r14d + movsd 64(%rsp,%r14,8), %xmm0 + call acos@PLT + movsd %xmm0, 128(%rsp,%r14,8) + +/* Process special inputs in loop */ + jmp L(SPECIAL_VALUES_LOOP) + +END(_ZGVeN8v_acos_skx) + + .section .rodata, "a" + .align 64 + +#ifdef __svml_dacos_data_internal_typedef +typedef unsigned int VUINT32; +typedef struct { + __declspec(align(64)) VUINT32 SgnBit[8][2]; + __declspec(align(64)) VUINT32 OneHalf[8][2]; + 
__declspec(align(64)) VUINT32 SmallNorm[8][2]; + __declspec(align(64)) VUINT32 MOne[8][2]; + __declspec(align(64)) VUINT32 Two[8][2]; + __declspec(align(64)) VUINT32 sqrt_coeff[4][8][2]; + __declspec(align(64)) VUINT32 poly_coeff[12][8][2]; + __declspec(align(64)) VUINT32 PiH[8][2]; + __declspec(align(64)) VUINT32 Pi2H[8][2]; +} __svml_dacos_data_internal; +#endif +__svml_dacos_data_internal: + /*== SgnBit ==*/ + .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000 + /*== OneHalf ==*/ + .align 64 + .quad 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000 + /*== SmallNorm ==*/ + .align 64 + .quad 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000 + /*== MOne ==*/ + .align 64 + .quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000 + /*== Two ==*/ + .align 64 + .quad 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000 + /*== sqrt_coeff[4] ==*/ + .align 64 + .quad 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */ + .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */ + .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */ + .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */ + /*== poly_coeff[12] ==*/ + .align 64 + .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */ + .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */ + .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */ + .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */ + .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */ + .quad 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */ + .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */ + .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 
0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */ + .quad 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */ + .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */ + .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */ + .quad 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */ + /*== PiH ==*/ + .align 64 + .quad 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18 + /*== Pi2H ==*/ + .align 64 + .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18 + .align 64 + .type __svml_dacos_data_internal,@object + .size __svml_dacos_data_internal,.-__svml_dacos_data_internal diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S new file mode 100644 index 0000000000..1ff0cfc8d5 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core-avx2.S @@ -0,0 +1,20 @@ +/* AVX2 version of vectorized acosf. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#define _ZGVeN16v_acosf _ZGVeN16v_acosf_avx2_wrapper +#include "../svml_s_acosf16_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c new file mode 100644 index 0000000000..fcf05782c5 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core.c @@ -0,0 +1,28 @@ +/* Multiple versions of vectorized acosf, vector length is 16. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVeN16v_acosf +#include "ifunc-mathvec-avx512-skx.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVeN16v_acosf, __GI__ZGVeN16v_acosf, + __redirect__ZGVeN16v_acosf) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S new file mode 100644 index 0000000000..1db2969c77 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S @@ -0,0 +1,262 @@ +/* Function acosf vectorized with AVX-512. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + https://www.gnu.org/licenses/. */ + +/* + * ALGORITHM DESCRIPTION: + * + * SelMask = (|x| >= 0.5) ? 1 : 0; + * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x| + * acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R)) + * acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|) + * + * + */ + +/* Offsets for data table __svml_sacos_data_internal + */ +#define SgnBit 0 +#define OneHalf 64 +#define SmallNorm 128 +#define MOne 192 +#define Two 256 +#define sqrt_coeff_1 320 +#define sqrt_coeff_2 384 +#define poly_coeff_1 448 +#define poly_coeff_2 512 +#define poly_coeff_3 576 +#define poly_coeff_4 640 +#define poly_coeff_5 704 +#define Pi2H 768 +#define PiH 832 + +#include <sysdep.h> + + .text + .section .text.exex512,"ax",@progbits +ENTRY(_ZGVeN16v_acosf_skx) + pushq %rbp + cfi_def_cfa_offset(16) + movq %rsp, %rbp + cfi_def_cfa(6, 16) + cfi_offset(6, -16) + andq $-64, %rsp + subq $192, %rsp + vmovups __svml_sacos_data_internal(%rip), %zmm5 + vmovups OneHalf+__svml_sacos_data_internal(%rip), %zmm6 + +/* SQ ~ 2*sqrt(Y) */ + vmovups SmallNorm+__svml_sacos_data_internal(%rip), %zmm9 + vmovups MOne+__svml_sacos_data_internal(%rip), %zmm8 + vmovups Two+__svml_sacos_data_internal(%rip), %zmm12 + vmovups sqrt_coeff_1+__svml_sacos_data_internal(%rip), %zmm13 + vmovaps %zmm0, %zmm4 + +/* x = -|arg| */ + vorps %zmm4, %zmm5, %zmm3 + vandps %zmm4, %zmm5, %zmm2 + vmovups sqrt_coeff_2+__svml_sacos_data_internal(%rip), %zmm0 + +/* Y = 0.5 + 0.5*(-x) */ + vfmadd231ps {rn-sae}, %zmm3, %zmm6, %zmm6 + +/* x^2 */ + vmulps {rn-sae}, %zmm3, %zmm3, %zmm7 + vrsqrt14ps %zmm6, %zmm10 + vcmpps $17, {sae}, %zmm9, %zmm6, %k1 + vcmpps $22, {sae}, %zmm3, %zmm8, %k0 + vmovups poly_coeff_4+__svml_sacos_data_internal(%rip), %zmm9 + vminps {sae}, %zmm6, %zmm7, %zmm1 + vmovups poly_coeff_3+__svml_sacos_data_internal(%rip), %zmm7 + vxorps %zmm10, %zmm10, %zmm10{%k1} + vaddps {rn-sae}, %zmm6, %zmm6, %zmm14 + vmulps {rn-sae}, %zmm1, %zmm1, %zmm8 + vmulps {rn-sae}, %zmm10, %zmm10, %zmm11 + vmulps {rn-sae}, %zmm10, %zmm14, %zmm5 + vcmpps $21, {sae}, %zmm6, 
%zmm1, %k4 + +/* X<X^2 iff X<0 */ + vcmpps $17, {sae}, %zmm1, %zmm4, %k2 + +/* polynomial */ + vmovups poly_coeff_1+__svml_sacos_data_internal(%rip), %zmm6 + vfmsub213ps {rn-sae}, %zmm12, %zmm11, %zmm14 + vmovups poly_coeff_2+__svml_sacos_data_internal(%rip), %zmm11 + vfmadd231ps {rn-sae}, %zmm1, %zmm7, %zmm9 + vmovups poly_coeff_5+__svml_sacos_data_internal(%rip), %zmm10 + vmovups Pi2H+__svml_sacos_data_internal(%rip), %zmm12 + vfmadd231ps {rn-sae}, %zmm14, %zmm13, %zmm0 + vfmadd231ps {rn-sae}, %zmm1, %zmm6, %zmm11 + vmulps {rn-sae}, %zmm14, %zmm5, %zmm15 + vfmadd213ps {rn-sae}, %zmm9, %zmm8, %zmm11 + vxorps %zmm12, %zmm12, %zmm12{%k4} + vfnmadd213ps {rn-sae}, %zmm5, %zmm15, %zmm0 + vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm11 + kmovw %k4, %eax + kmovw %k2, %ecx + kmovw %k0, %edx + vmulps {rn-sae}, %zmm1, %zmm11, %zmm13 + vblendmps %zmm0, %zmm3, %zmm0{%k4} + vxorps %zmm2, %zmm0, %zmm1 + andl %eax, %ecx + kmovw %ecx, %k3 + vfmadd213ps {rn-sae}, %zmm1, %zmm1, %zmm13 + vorps PiH+__svml_sacos_data_internal(%rip), %zmm12, %zmm12{%k3} + vaddps {rn-sae}, %zmm13, %zmm12, %zmm0 + testl %edx, %edx + +/* Go to special inputs processing branch */ + jne L(SPECIAL_VALUES_BRANCH) + +/* Restore registers + * and exit the function + */ + +L(EXIT): + movq %rbp, %rsp + popq %rbp + cfi_def_cfa(7, 8) + cfi_restore(6) + ret + cfi_def_cfa(6, 16) + cfi_offset(6, -16) + +/* Branch to process + * special inputs + */ + +L(SPECIAL_VALUES_BRANCH): + vmovups %zmm4, 64(%rsp) + vmovups %zmm0, 128(%rsp) + xorl %eax, %eax + vzeroupper + movq %r12, 16(%rsp) + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */ + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22 + movl %eax, %r12d + movq %r13, 8(%rsp) + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */ + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22 + movl %edx, %r13d + movq %r14, (%rsp) + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */ + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22 + +/* Range mask + * bits check + */ + +L(RANGEMASK_CHECK): + btl %r12d, %r13d + +/* Call scalar math function */ + jc L(SCALAR_MATH_CALL) + +/* Special inputs + * processing loop + */ + +L(SPECIAL_VALUES_LOOP): + incl %r12d + cmpl $16, %r12d + +/* Check bits in range mask */ + jl L(RANGEMASK_CHECK) + movq 16(%rsp), %r12 + cfi_restore(12) + movq 8(%rsp), %r13 + cfi_restore(13) + movq (%rsp), %r14 + cfi_restore(14) + vmovups 128(%rsp), %zmm0 + +/* Go to exit */ + jmp L(EXIT) + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */ + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22 + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */ + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22 + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */ + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 
0x22 + +/* Scalar math fucntion call + * to process special input + */ + +L(SCALAR_MATH_CALL): + movl %r12d, %r14d + movss 64(%rsp,%r14,4), %xmm0 + call acosf@PLT + movss %xmm0, 128(%rsp,%r14,4) + +/* Process special inputs in loop */ + jmp L(SPECIAL_VALUES_LOOP) + +END(_ZGVeN16v_acosf_skx) + + .section .rodata, "a" + .align 64 + +#ifdef __svml_sacos_data_internal_typedef +typedef unsigned int VUINT32; +typedef struct { + __declspec(align(64)) VUINT32 SgnBit[16][1]; + __declspec(align(64)) VUINT32 OneHalf[16][1]; + __declspec(align(64)) VUINT32 SmallNorm[16][1]; + __declspec(align(64)) VUINT32 MOne[16][1]; + __declspec(align(64)) VUINT32 Two[16][1]; + __declspec(align(64)) VUINT32 sqrt_coeff[2][16][1]; + __declspec(align(64)) VUINT32 poly_coeff[5][16][1]; + __declspec(align(64)) VUINT32 Pi2H[16][1]; + __declspec(align(64)) VUINT32 PiH[16][1]; +} __svml_sacos_data_internal; +#endif +__svml_sacos_data_internal: + /*== SgnBit ==*/ + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 + /*== OneHalf ==*/ + .align 64 + .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 + /*== SmallNorm ==*/ + .align 64 + .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000 + /*== MOne ==*/ + .align 64 + .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000 + /*== Two ==*/ + .align 64 + .long 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000 + /*== sqrt_coeff[2] ==*/ + .align 64 + .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */ + .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */ + /*== poly_coeff[5] ==*/ + .align 64 + .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */ + .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */ + .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */ + .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */ + .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 
0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */ + /*== Pi2H ==*/ + .align 64 + .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB + /*== PiH ==*/ + .align 64 + .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB + .align 64 + .type __svml_sacos_data_internal,@object + .size __svml_sacos_data_internal,.-__svml_sacos_data_internal diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S new file mode 100644 index 0000000000..f94b3eb01a --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core-sse2.S @@ -0,0 +1,20 @@ +/* SSE2 version of vectorized acosf, vector length is 4. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#define _ZGVbN4v_acosf _ZGVbN4v_acosf_sse2 +#include "../svml_s_acosf4_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c new file mode 100644 index 0000000000..6f9a5c1082 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core.c @@ -0,0 +1,28 @@ +/* Multiple versions of vectorized acosf, vector length is 4. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVbN4v_acosf +#include "ifunc-mathvec-sse4_1.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVbN4v_acosf, __GI__ZGVbN4v_acosf, + __redirect__ZGVbN4v_acosf) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S new file mode 100644 index 0000000000..fe0c94aeb5 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf4_core_sse4.S @@ -0,0 +1,260 @@ +/* Function acosf vectorized with SSE4. 
+ Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + https://www.gnu.org/licenses/. */ + +/* + * ALGORITHM DESCRIPTION: + * + * SelMask = (|x| >= 0.5) ? 1 : 0; + * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x| + * acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R)) + * acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|) + * + * + */ + +/* Offsets for data table __svml_sacos_data_internal + */ +#define SgnBit 0 +#define OneHalf 16 +#define SmallNorm 32 +#define MOne 48 +#define Two 64 +#define sqrt_coeff 80 +#define poly_coeff 112 +#define Pi2H 192 +#define PiH 208 + +#include <sysdep.h> + + .text + .section .text.sse4,"ax",@progbits +ENTRY(_ZGVbN4v_acosf_sse4) + subq $72, %rsp + cfi_def_cfa_offset(80) + +/* X<X^2 iff X<0 */ + movaps %xmm0, %xmm14 + +/* + * 2*sqrt(X) ~ Sh - Sl (to 24+ bits) + * SQ ~ 2*sqrt(X) + */ + movups __svml_sacos_data_internal(%rip), %xmm3 + movups OneHalf+__svml_sacos_data_internal(%rip), %xmm5 + +/* x = -|arg| */ + movaps %xmm3, %xmm4 + orps %xmm0, %xmm4 + +/* Y = 0.5 + 0.5*(-x) */ + movaps %xmm5, %xmm6 + mulps %xmm4, %xmm6 + +/* x^2 */ + movaps %xmm4, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm6, %xmm5 + +/* SQ ~ 2*sqrt(Y) */ + rsqrtps %xmm5, %xmm8 + minps %xmm5, %xmm13 + movaps %xmm5, %xmm2 + movaps %xmm13, %xmm1 + cmpltps SmallNorm+__svml_sacos_data_internal(%rip), %xmm2 + cmpnltps %xmm5, %xmm1 + cmpltps %xmm13, %xmm14 + addps %xmm5, %xmm5 + andnps %xmm8, %xmm2 + movaps %xmm13, %xmm11 + movaps %xmm2, %xmm9 + movaps %xmm1, %xmm6 + mulps %xmm2, %xmm9 + andnps %xmm4, %xmm6 + mulps %xmm5, %xmm2 + mulps %xmm13, %xmm11 + mulps %xmm9, %xmm5 + movups sqrt_coeff+__svml_sacos_data_internal(%rip), %xmm10 + andps %xmm0, %xmm3 + +/* polynomial */ + movups poly_coeff+__svml_sacos_data_internal(%rip), %xmm12 + movaps %xmm1, %xmm15 + mulps %xmm13, %xmm12 + subps Two+__svml_sacos_data_internal(%rip), %xmm5 + mulps %xmm5, %xmm10 + addps poly_coeff+16+__svml_sacos_data_internal(%rip), %xmm12 + mulps %xmm2, %xmm5 + mulps %xmm11, %xmm12 + addps sqrt_coeff+16+__svml_sacos_data_internal(%rip), %xmm10 + mulps %xmm5, %xmm10 + movups poly_coeff+32+__svml_sacos_data_internal(%rip), %xmm5 + subps %xmm10, %xmm2 + mulps %xmm13, %xmm5 + movups MOne+__svml_sacos_data_internal(%rip), %xmm7 + andps %xmm1, %xmm2 + cmpnleps %xmm4, %xmm7 + addps poly_coeff+48+__svml_sacos_data_internal(%rip), %xmm5 + movmskps %xmm7, %edx + orps %xmm2, %xmm6 + addps %xmm12, %xmm5 + mulps %xmm13, %xmm5 + pxor %xmm3, %xmm6 + movups PiH+__svml_sacos_data_internal(%rip), %xmm7 + andps %xmm1, %xmm7 + addps poly_coeff+64+__svml_sacos_data_internal(%rip), %xmm5 + mulps %xmm13, %xmm5 + andps %xmm14, %xmm7 + mulps %xmm6, %xmm5 + andnps Pi2H+__svml_sacos_data_internal(%rip), %xmm15 + addps %xmm5, %xmm6 + addps %xmm15, %xmm7 + addps %xmm6, %xmm7 + testl %edx, %edx + +/* Go to special inputs processing branch */ + jne L(SPECIAL_VALUES_BRANCH) + +/* Restore registers + * 
and exit the function + */ + +L(EXIT): + movaps %xmm7, %xmm0 + addq $72, %rsp + cfi_def_cfa_offset(8) + ret + cfi_def_cfa_offset(80) + +/* Branch to process + * special inputs + */ + +L(SPECIAL_VALUES_BRANCH): + movups %xmm0, 32(%rsp) + movups %xmm7, 48(%rsp) + xorl %eax, %eax + movq %r12, 16(%rsp) + cfi_offset(12, -64) + movl %eax, %r12d + movq %r13, 8(%rsp) + cfi_offset(13, -72) + movl %edx, %r13d + movq %r14, (%rsp) + cfi_offset(14, -80) + +/* Range mask + * bits check + */ + +L(RANGEMASK_CHECK): + btl %r12d, %r13d + +/* Call scalar math function */ + jc L(SCALAR_MATH_CALL) + +/* Special inputs + * processing loop + */ + +L(SPECIAL_VALUES_LOOP): + incl %r12d + cmpl $4, %r12d + +/* Check bits in range mask */ + jl L(RANGEMASK_CHECK) + movq 16(%rsp), %r12 + cfi_restore(12) + movq 8(%rsp), %r13 + cfi_restore(13) + movq (%rsp), %r14 + cfi_restore(14) + movups 48(%rsp), %xmm7 + +/* Go to exit */ + jmp L(EXIT) + cfi_offset(12, -64) + cfi_offset(13, -72) + cfi_offset(14, -80) + +/* Scalar math fucntion call + * to process special input + */ + +L(SCALAR_MATH_CALL): + movl %r12d, %r14d + movss 32(%rsp,%r14,4), %xmm0 + call acosf@PLT + movss %xmm0, 48(%rsp,%r14,4) + +/* Process special inputs in loop */ + jmp L(SPECIAL_VALUES_LOOP) + +END(_ZGVbN4v_acosf_sse4) + + .section .rodata, "a" + .align 16 + +#ifdef __svml_sacos_data_internal_typedef +typedef unsigned int VUINT32; +typedef struct { + __declspec(align(16)) VUINT32 SgnBit[4][1]; + __declspec(align(16)) VUINT32 OneHalf[4][1]; + __declspec(align(16)) VUINT32 SmallNorm[4][1]; + __declspec(align(16)) VUINT32 MOne[4][1]; + __declspec(align(16)) VUINT32 Two[4][1]; + __declspec(align(16)) VUINT32 sqrt_coeff[2][4][1]; + __declspec(align(16)) VUINT32 poly_coeff[5][4][1]; + __declspec(align(16)) VUINT32 Pi2H[4][1]; + __declspec(align(16)) VUINT32 PiH[4][1]; +} __svml_sacos_data_internal; +#endif +__svml_sacos_data_internal: + /*== SgnBit ==*/ + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 + /*== OneHalf ==*/ + .align 16 + .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 + /*== SmallNorm ==*/ + .align 16 + .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000 + /*== MOne ==*/ + .align 16 + .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000 + /*== Two ==*/ + .align 16 + .long 0x40000000, 0x40000000, 0x40000000, 0x40000000 + /*== sqrt_coeff[2] ==*/ + .align 16 + .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */ + .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */ + /*== poly_coeff[5] ==*/ + .align 16 + .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */ + .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */ + .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */ + .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */ + .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */ + /*== Pi2H ==*/ + .align 16 + .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB + /*== PiH ==*/ + .align 16 + .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB + .align 16 + .type __svml_sacos_data_internal,@object + .size __svml_sacos_data_internal,.-__svml_sacos_data_internal diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S new file mode 100644 index 0000000000..583ef54fee --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core-sse.S @@ -0,0 +1,20 @@ +/* SSE version of vectorized acosf, vector length is 8. 
+ Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#define _ZGVdN8v_acosf _ZGVdN8v_acosf_sse_wrapper +#include "../svml_s_acosf8_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c new file mode 100644 index 0000000000..dd360a9479 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core.c @@ -0,0 +1,28 @@ +/* Multiple versions of vectorized acosf, vector length is 8. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#define SYMBOL_NAME _ZGVdN8v_acosf +#include "ifunc-mathvec-avx2.h" + +libc_ifunc_redirected (REDIRECT_NAME, SYMBOL_NAME, IFUNC_SELECTOR ()); + +#ifdef SHARED +__hidden_ver1 (_ZGVdN8v_acosf, __GI__ZGVdN8v_acosf, + __redirect__ZGVdN8v_acosf) + __attribute__ ((visibility ("hidden"))); +#endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S new file mode 100644 index 0000000000..2b6dd2c2c2 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S @@ -0,0 +1,252 @@ +/* Function acosf vectorized with AVX2. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + https://www.gnu.org/licenses/. */ + +/* + * ALGORITHM DESCRIPTION: + * + * SelMask = (|x| >= 0.5) ? 1 : 0; + * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x| + * acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R)) + * acos(x) = sign(x) ? 
(Pi - acos(|x|)) : acos(|x|) + * + * + */ + +/* Offsets for data table __svml_sacos_data_internal + */ +#define SgnBit 0 +#define OneHalf 32 +#define SmallNorm 64 +#define MOne 96 +#define Two 128 +#define sqrt_coeff 160 +#define poly_coeff 224 +#define Pi2H 384 +#define PiH 416 + +#include <sysdep.h> + + .text + .section .text.avx2,"ax",@progbits +ENTRY(_ZGVdN8v_acosf_avx2) + pushq %rbp + cfi_def_cfa_offset(16) + movq %rsp, %rbp + cfi_def_cfa(6, 16) + cfi_offset(6, -16) + andq $-32, %rsp + subq $96, %rsp + +/* + * 2*sqrt(X) ~ Sh - Sl (to 24+ bits) + * SQ ~ 2*sqrt(X) + */ + vmovups __svml_sacos_data_internal(%rip), %ymm6 + vmovups OneHalf+__svml_sacos_data_internal(%rip), %ymm7 + vmovaps %ymm0, %ymm5 + +/* x = -|arg| */ + vorps %ymm5, %ymm6, %ymm4 + +/* Y = 0.5 + 0.5*(-x) */ + vfmadd231ps %ymm4, %ymm7, %ymm7 + +/* x^2 */ + vmulps %ymm4, %ymm4, %ymm8 + +/* SQ ~ 2*sqrt(Y) */ + vmovups sqrt_coeff+__svml_sacos_data_internal(%rip), %ymm0 + vcmpnge_uqps MOne+__svml_sacos_data_internal(%rip), %ymm4, %ymm9 + vcmplt_oqps SmallNorm+__svml_sacos_data_internal(%rip), %ymm7, %ymm10 + vminps %ymm7, %ymm8, %ymm2 + vaddps %ymm7, %ymm7, %ymm14 + vrsqrtps %ymm7, %ymm11 + vmovups poly_coeff+64+__svml_sacos_data_internal(%rip), %ymm8 + vcmpnlt_uqps %ymm7, %ymm2, %ymm1 + vmulps %ymm2, %ymm2, %ymm7 + vfmadd213ps poly_coeff+96+__svml_sacos_data_internal(%rip), %ymm2, %ymm8 + vmovmskps %ymm9, %edx + +/* polynomial */ + vmovups poly_coeff+__svml_sacos_data_internal(%rip), %ymm9 + vandnps %ymm11, %ymm10, %ymm12 + vmulps %ymm12, %ymm12, %ymm13 + vfmadd213ps poly_coeff+32+__svml_sacos_data_internal(%rip), %ymm2, %ymm9 + +/* X<X^2 iff X<0 */ + vcmplt_oqps %ymm2, %ymm5, %ymm10 + vfmadd213ps %ymm8, %ymm7, %ymm9 + vandps %ymm5, %ymm6, %ymm3 + vmulps %ymm14, %ymm12, %ymm6 + vfmsub213ps Two+__svml_sacos_data_internal(%rip), %ymm13, %ymm14 + vfmadd213ps poly_coeff+128+__svml_sacos_data_internal(%rip), %ymm2, %ymm9 + vfmadd213ps sqrt_coeff+32+__svml_sacos_data_internal(%rip), %ymm14, %ymm0 + vmulps %ymm14, %ymm6, %ymm15 + vmulps %ymm9, %ymm2, %ymm14 + vfnmadd213ps %ymm6, %ymm15, %ymm0 + vblendvps %ymm1, %ymm0, %ymm4, %ymm0 + vandps PiH+__svml_sacos_data_internal(%rip), %ymm1, %ymm2 + vandnps Pi2H+__svml_sacos_data_internal(%rip), %ymm1, %ymm12 + vxorps %ymm3, %ymm0, %ymm1 + vfmadd213ps %ymm1, %ymm1, %ymm14 + vandps %ymm10, %ymm2, %ymm11 + vaddps %ymm12, %ymm11, %ymm13 + vaddps %ymm14, %ymm13, %ymm0 + testl %edx, %edx + +/* Go to special inputs processing branch */ + jne L(SPECIAL_VALUES_BRANCH) + +/* Restore registers + * and exit the function + */ + +L(EXIT): + movq %rbp, %rsp + popq %rbp + cfi_def_cfa(7, 8) + cfi_restore(6) + ret + cfi_def_cfa(6, 16) + cfi_offset(6, -16) + +/* Branch to process + * special inputs + */ + +L(SPECIAL_VALUES_BRANCH): + vmovups %ymm5, 32(%rsp) + vmovups %ymm0, 64(%rsp) + xorl %eax, %eax + vzeroupper + movq %r12, 16(%rsp) + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */ + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22 + movl %eax, %r12d + movq %r13, 8(%rsp) + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */ + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22 + movl %edx, %r13d + movq %r14, (%rsp) + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */ 
+ .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22 + +/* Range mask + * bits check + */ + +L(RANGEMASK_CHECK): + btl %r12d, %r13d + +/* Call scalar math function */ + jc L(SCALAR_MATH_CALL) + +/* Special inputs + * processing loop + */ + +L(SPECIAL_VALUES_LOOP): + incl %r12d + cmpl $8, %r12d + +/* Check bits in range mask */ + jl L(RANGEMASK_CHECK) + movq 16(%rsp), %r12 + cfi_restore(12) + movq 8(%rsp), %r13 + cfi_restore(13) + movq (%rsp), %r14 + cfi_restore(14) + vmovups 64(%rsp), %ymm0 + +/* Go to exit */ + jmp L(EXIT) + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */ + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22 + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */ + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22 + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */ + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22 + +/* Scalar math fucntion call + * to process special input + */ + +L(SCALAR_MATH_CALL): + movl %r12d, %r14d + movss 32(%rsp,%r14,4), %xmm0 + call acosf@PLT + movss %xmm0, 64(%rsp,%r14,4) + +/* Process special inputs in loop */ + jmp L(SPECIAL_VALUES_LOOP) + +END(_ZGVdN8v_acosf_avx2) + + .section .rodata, "a" + .align 32 + +#ifdef __svml_sacos_data_internal_typedef +typedef unsigned int VUINT32; +typedef struct { + __declspec(align(32)) VUINT32 SgnBit[8][1]; + __declspec(align(32)) VUINT32 OneHalf[8][1]; + __declspec(align(32)) VUINT32 SmallNorm[8][1]; + __declspec(align(32)) VUINT32 MOne[8][1]; + __declspec(align(32)) VUINT32 Two[8][1]; + __declspec(align(32)) VUINT32 sqrt_coeff[2][8][1]; + __declspec(align(32)) VUINT32 poly_coeff[5][8][1]; + __declspec(align(32)) VUINT32 Pi2H[8][1]; + __declspec(align(32)) VUINT32 PiH[8][1]; +} __svml_sacos_data_internal; +#endif +__svml_sacos_data_internal: + /*== SgnBit ==*/ + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 + /*== OneHalf ==*/ + .align 32 + .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 + /*== SmallNorm ==*/ + .align 32 + .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000 + /*== MOne ==*/ + .align 32 + .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000 + /*== Two ==*/ + .align 32 + .long 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000 + /*== sqrt_coeff[2] ==*/ + .align 32 + .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */ + .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */ + /*== poly_coeff[5] ==*/ + .align 32 + .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */ + .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */ + .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* 
poly_coeff3 */ + .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */ + .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */ + /*== Pi2H ==*/ + .align 32 + .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB + /*== PiH ==*/ + .align 32 + .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB + .align 32 + .type __svml_sacos_data_internal,@object + .size __svml_sacos_data_internal,.-__svml_sacos_data_internal diff --git a/sysdeps/x86_64/fpu/svml_d_acos2_core.S b/sysdeps/x86_64/fpu/svml_d_acos2_core.S new file mode 100644 index 0000000000..9656478b2d --- /dev/null +++ b/sysdeps/x86_64/fpu/svml_d_acos2_core.S @@ -0,0 +1,29 @@ +/* Function acos vectorized with SSE2. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVbN2v_acos) +WRAPPER_IMPL_SSE2 acos +END (_ZGVbN2v_acos) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVbN2v_acos) +#endif diff --git a/sysdeps/x86_64/fpu/svml_d_acos4_core.S b/sysdeps/x86_64/fpu/svml_d_acos4_core.S new file mode 100644 index 0000000000..e99cb4ae78 --- /dev/null +++ b/sysdeps/x86_64/fpu/svml_d_acos4_core.S @@ -0,0 +1,29 @@ +/* Function acos vectorized with AVX2, wrapper version. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVdN4v_acos) +WRAPPER_IMPL_AVX _ZGVbN2v_acos +END (_ZGVdN4v_acos) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVdN4v_acos) +#endif diff --git a/sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S b/sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S new file mode 100644 index 0000000000..7cbcbc965c --- /dev/null +++ b/sysdeps/x86_64/fpu/svml_d_acos4_core_avx.S @@ -0,0 +1,25 @@ +/* Function acos vectorized in AVX ISA as wrapper to SSE4 ISA version. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVcN4v_acos) +WRAPPER_IMPL_AVX _ZGVbN2v_acos +END (_ZGVcN4v_acos) diff --git a/sysdeps/x86_64/fpu/svml_d_acos8_core.S b/sysdeps/x86_64/fpu/svml_d_acos8_core.S new file mode 100644 index 0000000000..e26b30d81a --- /dev/null +++ b/sysdeps/x86_64/fpu/svml_d_acos8_core.S @@ -0,0 +1,25 @@ +/* Function acos vectorized with AVX-512, wrapper to AVX2. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_wrapper_impl.h" + + .text +ENTRY (_ZGVeN8v_acos) +WRAPPER_IMPL_AVX512 _ZGVdN4v_acos +END (_ZGVeN8v_acos) diff --git a/sysdeps/x86_64/fpu/svml_s_acosf16_core.S b/sysdeps/x86_64/fpu/svml_s_acosf16_core.S new file mode 100644 index 0000000000..70e046d492 --- /dev/null +++ b/sysdeps/x86_64/fpu/svml_s_acosf16_core.S @@ -0,0 +1,25 @@ +/* Function acosf vectorized with AVX-512. Wrapper to AVX2 version. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVeN16v_acosf) +WRAPPER_IMPL_AVX512 _ZGVdN8v_acosf +END (_ZGVeN16v_acosf) diff --git a/sysdeps/x86_64/fpu/svml_s_acosf4_core.S b/sysdeps/x86_64/fpu/svml_s_acosf4_core.S new file mode 100644 index 0000000000..36354b32b5 --- /dev/null +++ b/sysdeps/x86_64/fpu/svml_s_acosf4_core.S @@ -0,0 +1,29 @@ +/* Function acosf vectorized with SSE2, wrapper version. 
+ Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVbN4v_acosf) +WRAPPER_IMPL_SSE2 acosf +END (_ZGVbN4v_acosf) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVbN4v_acosf) +#endif diff --git a/sysdeps/x86_64/fpu/svml_s_acosf8_core.S b/sysdeps/x86_64/fpu/svml_s_acosf8_core.S new file mode 100644 index 0000000000..f08864a511 --- /dev/null +++ b/sysdeps/x86_64/fpu/svml_s_acosf8_core.S @@ -0,0 +1,29 @@ +/* Function acosf vectorized with AVX2, wrapper version. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVdN8v_acosf) +WRAPPER_IMPL_AVX _ZGVbN4v_acosf +END (_ZGVdN8v_acosf) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVdN8v_acosf) +#endif diff --git a/sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S b/sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S new file mode 100644 index 0000000000..f3ed4d8e78 --- /dev/null +++ b/sysdeps/x86_64/fpu/svml_s_acosf8_core_avx.S @@ -0,0 +1,25 @@ +/* Function acosf vectorized in AVX ISA as wrapper to SSE4 ISA version. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "svml_s_wrapper_impl.h" + + .text +ENTRY (_ZGVcN8v_acosf) +WRAPPER_IMPL_AVX _ZGVbN4v_acosf +END (_ZGVcN8v_acosf) diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c new file mode 100644 index 0000000000..4f74b4260a --- /dev/null +++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx.c @@ -0,0 +1 @@ +#include "test-double-libmvec-acos.c" diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c new file mode 100644 index 0000000000..4f74b4260a --- /dev/null +++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx2.c @@ -0,0 +1 @@ +#include "test-double-libmvec-acos.c" diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c new file mode 100644 index 0000000000..4f74b4260a --- /dev/null +++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos-avx512f.c @@ -0,0 +1 @@ +#include "test-double-libmvec-acos.c" diff --git a/sysdeps/x86_64/fpu/test-double-libmvec-acos.c b/sysdeps/x86_64/fpu/test-double-libmvec-acos.c new file mode 100644 index 0000000000..e38b8ce821 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-double-libmvec-acos.c @@ -0,0 +1,3 @@ +#define LIBMVEC_TYPE double +#define LIBMVEC_FUNC acos +#include "test-vector-abi-arg1.h" diff --git a/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c index ed932fc98d..0abc7d2021 100644 --- a/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c +++ b/sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c @@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVbN2v_sin) VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVbN2v_log) VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVbN2v_exp) VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVbN2vv_pow) +VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVbN2v_acos) #define VEC_INT_TYPE __m128i diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c index 3a6e37044f..dda093b914 100644 --- a/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c +++ b/sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c @@ -30,6 +30,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVdN4v_sin) VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVdN4v_log) VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVdN4v_exp) VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVdN4vv_pow) +VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVdN4v_acos) #ifndef __ILP32__ # define VEC_INT_TYPE __m256i diff --git a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c index 99db4e7616..f3230463bb 100644 --- a/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c +++ b/sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c @@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVcN4v_sin) VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVcN4v_log) VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVcN4v_exp) VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVcN4vv_pow) +VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVcN4v_acos) #define VEC_INT_TYPE __m128i diff --git a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c index 251d429ac0..cf9f52faf0 100644 --- a/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c +++ b/sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c @@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sin), _ZGVeN8v_sin) VECTOR_WRAPPER (WRAPPER_NAME (log), _ZGVeN8v_log) VECTOR_WRAPPER (WRAPPER_NAME (exp), _ZGVeN8v_exp) VECTOR_WRAPPER_ff (WRAPPER_NAME (pow), _ZGVeN8vv_pow) 
+VECTOR_WRAPPER (WRAPPER_NAME (acos), _ZGVeN8v_acos) #ifndef __ILP32__ # define VEC_INT_TYPE __m512i diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c new file mode 100644 index 0000000000..1e6474dfa2 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx.c @@ -0,0 +1 @@ +#include "test-float-libmvec-acosf.c" diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c new file mode 100644 index 0000000000..1e6474dfa2 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx2.c @@ -0,0 +1 @@ +#include "test-float-libmvec-acosf.c" diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c new file mode 100644 index 0000000000..1e6474dfa2 --- /dev/null +++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf-avx512f.c @@ -0,0 +1 @@ +#include "test-float-libmvec-acosf.c" diff --git a/sysdeps/x86_64/fpu/test-float-libmvec-acosf.c b/sysdeps/x86_64/fpu/test-float-libmvec-acosf.c new file mode 100644 index 0000000000..fb47f974fd --- /dev/null +++ b/sysdeps/x86_64/fpu/test-float-libmvec-acosf.c @@ -0,0 +1,3 @@ +#define LIBMVEC_TYPE float +#define LIBMVEC_FUNC acosf +#include "test-vector-abi-arg1.h" diff --git a/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c index c1d14cd79e..abbd3ed870 100644 --- a/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c +++ b/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c @@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVeN16v_sinf) VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVeN16v_logf) VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVeN16v_expf) VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVeN16vv_powf) +VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVeN16v_acosf) #define VEC_INT_TYPE __m512i diff --git a/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c index d23c372060..8a24027952 100644 --- a/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c +++ b/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c @@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVbN4v_sinf) VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVbN4v_logf) VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVbN4v_expf) VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVbN4vv_powf) +VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVbN4v_acosf) #define VEC_INT_TYPE __m128i diff --git a/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c index 3152cffb0c..aff0442606 100644 --- a/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c +++ b/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c @@ -30,6 +30,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVdN8v_sinf) VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVdN8v_logf) VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVdN8v_expf) VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVdN8vv_powf) +VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVdN8v_acosf) /* Redefinition of wrapper to be compatible with _ZGVdN8vvv_sincosf. 
*/ #undef VECTOR_WRAPPER_fFF diff --git a/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c index a8492abfef..913584d111 100644 --- a/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c +++ b/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c @@ -27,6 +27,7 @@ VECTOR_WRAPPER (WRAPPER_NAME (sinf), _ZGVcN8v_sinf) VECTOR_WRAPPER (WRAPPER_NAME (logf), _ZGVcN8v_logf) VECTOR_WRAPPER (WRAPPER_NAME (expf), _ZGVcN8v_expf) VECTOR_WRAPPER_ff (WRAPPER_NAME (powf), _ZGVcN8vv_powf) +VECTOR_WRAPPER (WRAPPER_NAME (acosf), _ZGVcN8v_acosf) #define VEC_INT_TYPE __m128i
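
All of the new kernels implement the same argument reduction, summarized in their ALGORITHM DESCRIPTION comments. For readers skimming the assembly, here is a minimal scalar sketch of that scheme. It is illustrative only and not part of the patch: the function name is made up, and asinf () stands in for the fixed-degree minimax polynomial Poly(R) that the vector code evaluates from the __svml_sacos_data_internal tables.

#include <math.h>

/* Scalar model of the vector reduction: for |x| >= 0.5 use
   acos(|x|) = 2*asin(sqrt((1-|x|)/2)), otherwise acos(|x|) = pi/2 - asin(|x|),
   then unfold the sign with acos(x) = pi - acos(|x|) for negative x.  */
static float
acosf_reduction_sketch (float x)
{
  float absx = fabsf (x);
  int sel = absx >= 0.5f;                     /* SelMask */
  float r = sel ? sqrtf (0.5f - 0.5f * absx) : absx;
  float poly = asinf (r);                     /* Poly(R) in the vector kernels */
  float acos_abs = sel ? 2.0f * poly : (float) M_PI_2 - poly;
  return signbit (x) ? (float) M_PI - acos_abs : acos_abs;
}

The range selection is what the minps/cmpnltps and vminps/vcmpps sequences compute branch-free, and the "SQ ~ 2*sqrt(Y)" comments mark where the rsqrtps/vrsqrt14ps estimate, refined with the sqrt_coeff constants, supplies the square-root term without a full-precision sqrt.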
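The L(SPECIAL_VALUES_BRANCH)/L(RANGEMASK_CHECK) tail shared by every kernel handles lanes rejected by the MOne compare (|x| > 1 or NaN): the compare mask is moved to a general register and each set bit is reprocessed with the scalar acosf, so special cases are handled exactly as the scalar libm function would handle them. A rough C equivalent of that loop, with illustrative names (the real code keeps inputs and results in the stack slots spilled around the call):

#include <math.h>

/* One scalar fixup call per lane flagged in MASK; vlen is 4, 8 or 16.  */
static void
acosf_special_lanes_sketch (const float *src, float *dst, unsigned int mask, int vlen)
{
  for (int i = 0; i < vlen; i++)
    if (mask & (1u << i))
      dst[i] = acosf (src[i]);
}

Deferring these lanes to the scalar routine keeps the vector fast paths free of errno and exception handling.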