Message ID | 20220304181934.1556938-1-skpgkp2@gmail.com |
---|---|
State | New |
Headers | show |
Series | x86_64: Fix svml_s_acosf16_core_avx512.S code formatting | expand |
On Fri, Mar 4, 2022 at 12:19 PM Sunil K Pandey <skpgkp2@gmail.com> wrote: > > This commit contains the following formatting changes: > > 1. Instructions preceded by a tab. > 2. Instructions less than 8 characters in length have a tab > between them and the first operand. > 3. Instructions greater than 7 characters in length have a > space between them and the first operand. > 4. Tab after `#define`d names and their value. > 5. 8 spaces at the beginning of a line replaced by a tab. Was this done by hand or with a script? If with a script, is there a repo for it? > --- > .../multiarch/svml_s_acosf16_core_avx512.S | 344 +++++++++--------- > 1 file changed, 172 insertions(+), 172 deletions(-) > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S > index 7708073975..026c15830d 100644 > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S > @@ -29,243 +29,243 @@ > > /* Offsets for data table __svml_sacos_data_internal > */ > -#define SgnBit 0 > -#define OneHalf 64 > -#define SmallNorm 128 > -#define MOne 192 > -#define Two 256 > -#define sqrt_coeff_1 320 > -#define sqrt_coeff_2 384 > -#define poly_coeff_1 448 > -#define poly_coeff_2 512 > -#define poly_coeff_3 576 > -#define poly_coeff_4 640 > -#define poly_coeff_5 704 > -#define Pi2H 768 > -#define PiH 832 > +#define SgnBit 0 > +#define OneHalf 64 > +#define SmallNorm 128 > +#define MOne 192 > +#define Two 256 > +#define sqrt_coeff_1 320 > +#define sqrt_coeff_2 384 > +#define poly_coeff_1 448 > +#define poly_coeff_2 512 > +#define poly_coeff_3 576 > +#define poly_coeff_4 640 > +#define poly_coeff_5 704 > +#define Pi2H 768 > +#define PiH 832 > > #include <sysdep.h> > > - .text > + .text > .section .text.exex512,"ax",@progbits > ENTRY(_ZGVeN16v_acosf_skx) > - pushq %rbp > - cfi_def_cfa_offset(16) > - movq %rsp, %rbp > - cfi_def_cfa(6, 16) > - cfi_offset(6, -16) > - andq $-64, %rsp > - subq $192, %rsp > - vmovups
__svml_sacos_data_internal(%rip), %zmm5 > - vmovups OneHalf+__svml_sacos_data_internal(%rip), %zmm6 > + pushq %rbp > + cfi_def_cfa_offset(16) > + movq %rsp, %rbp > + cfi_def_cfa(6, 16) > + cfi_offset(6, -16) > + andq $-64, %rsp > + subq $192, %rsp > + vmovups __svml_sacos_data_internal(%rip), %zmm5 > + vmovups OneHalf+__svml_sacos_data_internal(%rip), %zmm6 > > /* SQ ~ 2*sqrt(Y) */ Can we indent comments with code? > - vmovups SmallNorm+__svml_sacos_data_internal(%rip), %zmm9 > - vmovups MOne+__svml_sacos_data_internal(%rip), %zmm8 > - vmovups Two+__svml_sacos_data_internal(%rip), %zmm12 > - vmovups sqrt_coeff_1+__svml_sacos_data_internal(%rip), %zmm13 > - vmovaps %zmm0, %zmm4 > + vmovups SmallNorm+__svml_sacos_data_internal(%rip), %zmm9 > + vmovups MOne+__svml_sacos_data_internal(%rip), %zmm8 > + vmovups Two+__svml_sacos_data_internal(%rip), %zmm12 > + vmovups sqrt_coeff_1+__svml_sacos_data_internal(%rip), %zmm13 > + vmovaps %zmm0, %zmm4 > > /* x = -|arg| */ > - vorps %zmm4, %zmm5, %zmm3 > - vandps %zmm4, %zmm5, %zmm2 > - vmovups sqrt_coeff_2+__svml_sacos_data_internal(%rip), %zmm0 > + vorps %zmm4, %zmm5, %zmm3 > + vandps %zmm4, %zmm5, %zmm2 > + vmovups sqrt_coeff_2+__svml_sacos_data_internal(%rip), %zmm0 > > /* Y = 0.5 + 0.5*(-x) */ > - vfmadd231ps {rn-sae}, %zmm3, %zmm6, %zmm6 > + vfmadd231ps {rn-sae}, %zmm3, %zmm6, %zmm6 > > /* x^2 */ > - vmulps {rn-sae}, %zmm3, %zmm3, %zmm7 > - vrsqrt14ps %zmm6, %zmm10 > - vcmpps $17, {sae}, %zmm9, %zmm6, %k1 > - vcmpps $22, {sae}, %zmm3, %zmm8, %k0 > - vmovups poly_coeff_4+__svml_sacos_data_internal(%rip), %zmm9 > - vminps {sae}, %zmm6, %zmm7, %zmm1 > - vmovups poly_coeff_3+__svml_sacos_data_internal(%rip), %zmm7 > - vxorps %zmm10, %zmm10, %zmm10{%k1} > - vaddps {rn-sae}, %zmm6, %zmm6, %zmm14 > - vmulps {rn-sae}, %zmm1, %zmm1, %zmm8 > - vmulps {rn-sae}, %zmm10, %zmm10, %zmm11 > - vmulps {rn-sae}, %zmm10, %zmm14, %zmm5 > - vcmpps $21, {sae}, %zmm6, %zmm1, %k4 > + vmulps {rn-sae}, %zmm3, %zmm3, %zmm7 > + vrsqrt14ps %zmm6, 
%zmm10 > + vcmpps $17, {sae}, %zmm9, %zmm6, %k1 > + vcmpps $22, {sae}, %zmm3, %zmm8, %k0 > + vmovups poly_coeff_4+__svml_sacos_data_internal(%rip), %zmm9 > + vminps {sae}, %zmm6, %zmm7, %zmm1 > + vmovups poly_coeff_3+__svml_sacos_data_internal(%rip), %zmm7 > + vxorps %zmm10, %zmm10, %zmm10{%k1} > + vaddps {rn-sae}, %zmm6, %zmm6, %zmm14 > + vmulps {rn-sae}, %zmm1, %zmm1, %zmm8 > + vmulps {rn-sae}, %zmm10, %zmm10, %zmm11 > + vmulps {rn-sae}, %zmm10, %zmm14, %zmm5 > + vcmpps $21, {sae}, %zmm6, %zmm1, %k4 > > /* X<X^2 iff X<0 */ > - vcmpps $17, {sae}, %zmm1, %zmm4, %k2 > + vcmpps $17, {sae}, %zmm1, %zmm4, %k2 > > /* polynomial */ > - vmovups poly_coeff_1+__svml_sacos_data_internal(%rip), %zmm6 > - vfmsub213ps {rn-sae}, %zmm12, %zmm11, %zmm14 > - vmovups poly_coeff_2+__svml_sacos_data_internal(%rip), %zmm11 > - vfmadd231ps {rn-sae}, %zmm1, %zmm7, %zmm9 > - vmovups poly_coeff_5+__svml_sacos_data_internal(%rip), %zmm10 > - vmovups Pi2H+__svml_sacos_data_internal(%rip), %zmm12 > - vfmadd231ps {rn-sae}, %zmm14, %zmm13, %zmm0 > - vfmadd231ps {rn-sae}, %zmm1, %zmm6, %zmm11 > - vmulps {rn-sae}, %zmm14, %zmm5, %zmm15 > - vfmadd213ps {rn-sae}, %zmm9, %zmm8, %zmm11 > - vxorps %zmm12, %zmm12, %zmm12{%k4} > - vfnmadd213ps {rn-sae}, %zmm5, %zmm15, %zmm0 > - vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm11 > - kmovw %k0, %edx > - vmulps {rn-sae}, %zmm1, %zmm11, %zmm13 > - vblendmps %zmm0, %zmm3, %zmm0{%k4} > - vxorps %zmm2, %zmm0, %zmm1 > - kandw %k4, %k2, %k3 > - vfmadd213ps {rn-sae}, %zmm1, %zmm1, %zmm13 > - vorps PiH+__svml_sacos_data_internal(%rip), %zmm12, %zmm12{%k3} > - vaddps {rn-sae}, %zmm13, %zmm12, %zmm0 > - testl %edx, %edx > + vmovups poly_coeff_1+__svml_sacos_data_internal(%rip), %zmm6 > + vfmsub213ps {rn-sae}, %zmm12, %zmm11, %zmm14 > + vmovups poly_coeff_2+__svml_sacos_data_internal(%rip), %zmm11 > + vfmadd231ps {rn-sae}, %zmm1, %zmm7, %zmm9 > + vmovups poly_coeff_5+__svml_sacos_data_internal(%rip), %zmm10 > + vmovups Pi2H+__svml_sacos_data_internal(%rip), %zmm12 > + 
vfmadd231ps {rn-sae}, %zmm14, %zmm13, %zmm0 > + vfmadd231ps {rn-sae}, %zmm1, %zmm6, %zmm11 > + vmulps {rn-sae}, %zmm14, %zmm5, %zmm15 > + vfmadd213ps {rn-sae}, %zmm9, %zmm8, %zmm11 > + vxorps %zmm12, %zmm12, %zmm12{%k4} > + vfnmadd213ps {rn-sae}, %zmm5, %zmm15, %zmm0 > + vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm11 > + kmovw %k0, %edx > + vmulps {rn-sae}, %zmm1, %zmm11, %zmm13 > + vblendmps %zmm0, %zmm3, %zmm0{%k4} > + vxorps %zmm2, %zmm0, %zmm1 > + kandw %k4, %k2, %k3 > + vfmadd213ps {rn-sae}, %zmm1, %zmm1, %zmm13 > + vorps PiH+__svml_sacos_data_internal(%rip), %zmm12, %zmm12{%k3} > + vaddps {rn-sae}, %zmm13, %zmm12, %zmm0 > + testl %edx, %edx > > /* Go to special inputs processing branch */ > - jne L(SPECIAL_VALUES_BRANCH) > - # LOE rbx r12 r13 r14 r15 edx zmm0 zmm4 > + jne L(SPECIAL_VALUES_BRANCH) > + # LOE rbx r12 r13 r14 r15 edx zmm0 zmm4 > > /* Restore registers > * and exit the function > */ > > L(EXIT): > - movq %rbp, %rsp > - popq %rbp > - cfi_def_cfa(7, 8) > - cfi_restore(6) > - ret > - cfi_def_cfa(6, 16) > - cfi_offset(6, -16) > + movq %rbp, %rsp > + popq %rbp > + cfi_def_cfa(7, 8) > + cfi_restore(6) > + ret > + cfi_def_cfa(6, 16) > + cfi_offset(6, -16) > > /* Branch to process > * special inputs > */ > > L(SPECIAL_VALUES_BRANCH): > - vmovups %zmm4, 64(%rsp) > - vmovups %zmm0, 128(%rsp) > - # LOE rbx r12 r13 r14 r15 edx zmm0 > - > - xorl %eax, %eax > - # LOE rbx r12 r13 r14 r15 eax edx > - > - vzeroupper > - movq %r12, 16(%rsp) > - /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */ > - .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22 > - movl %eax, %r12d > - movq %r13, 8(%rsp) > - /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */ > - .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22 > 
- movl %edx, %r13d > - movq %r14, (%rsp) > - /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */ > - .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22 > - # LOE rbx r15 r12d r13d > + vmovups %zmm4, 64(%rsp) > + vmovups %zmm0, 128(%rsp) > + # LOE rbx r12 r13 r14 r15 edx zmm0 > + > + xorl %eax, %eax > + # LOE rbx r12 r13 r14 r15 eax edx > + > + vzeroupper > + movq %r12, 16(%rsp) > + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */ > + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22 > + movl %eax, %r12d > + movq %r13, 8(%rsp) > + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */ > + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22 > + movl %edx, %r13d > + movq %r14, (%rsp) > + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */ > + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22 > + # LOE rbx r15 r12d r13d > > /* Range mask > * bits check > */ > > L(RANGEMASK_CHECK): > - btl %r12d, %r13d > + btl %r12d, %r13d > > /* Call scalar math function */ > - jc L(SCALAR_MATH_CALL) > - # LOE rbx r15 r12d r13d > + jc L(SCALAR_MATH_CALL) > + # LOE rbx r15 r12d r13d > > /* Special inputs > * processing loop > */ > > L(SPECIAL_VALUES_LOOP): > - incl %r12d > - cmpl $16, %r12d > + incl %r12d > + cmpl $16, %r12d > > /* Check bits in range mask */ > - jl L(RANGEMASK_CHECK) > - # LOE rbx r15 r12d r13d > + jl L(RANGEMASK_CHECK) > + # LOE rbx r15 r12d r13d > > - movq 16(%rsp), %r12 > - cfi_restore(12) > - movq 8(%rsp), %r13 > - cfi_restore(13) > - 
movq (%rsp), %r14 > - cfi_restore(14) > - vmovups 128(%rsp), %zmm0 > + movq 16(%rsp), %r12 > + cfi_restore(12) > + movq 8(%rsp), %r13 > + cfi_restore(13) > + movq (%rsp), %r14 > + cfi_restore(14) > + vmovups 128(%rsp), %zmm0 > > /* Go to exit */ > - jmp L(EXIT) > - /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */ > - .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22 > - /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */ > - .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22 > - /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */ > - .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22 > - # LOE rbx r12 r13 r14 r15 zmm0 > + jmp L(EXIT) > + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */ > + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22 > + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */ > + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22 > + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */ > + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22 > + # LOE rbx r12 r13 r14 r15 zmm0 > > /* Scalar math fucntion call > * to process special input > */ > > L(SCALAR_MATH_CALL): > - movl %r12d, %r14d > - movss 64(%rsp,%r14,4), %xmm0 > - call acosf@PLT > - # LOE 
rbx r14 r15 r12d r13d xmm0 > + movl %r12d, %r14d > + movss 64(%rsp,%r14,4), %xmm0 > + call acosf@PLT > + # LOE rbx r14 r15 r12d r13d xmm0 > > - movss %xmm0, 128(%rsp,%r14,4) > + movss %xmm0, 128(%rsp,%r14,4) > > /* Process special inputs in loop */ > - jmp L(SPECIAL_VALUES_LOOP) > - # LOE rbx r15 r12d r13d > + jmp L(SPECIAL_VALUES_LOOP) > + # LOE rbx r15 r12d r13d > END(_ZGVeN16v_acosf_skx) > > - .section .rodata, "a" > - .align 64 > + .section .rodata, "a" > + .align 64 > > #ifdef __svml_sacos_data_internal_typedef > typedef unsigned int VUINT32; > typedef struct { > - __declspec(align(64)) VUINT32 SgnBit[16][1]; > - __declspec(align(64)) VUINT32 OneHalf[16][1]; > - __declspec(align(64)) VUINT32 SmallNorm[16][1]; > - __declspec(align(64)) VUINT32 MOne[16][1]; > - __declspec(align(64)) VUINT32 Two[16][1]; > - __declspec(align(64)) VUINT32 sqrt_coeff[2][16][1]; > - __declspec(align(64)) VUINT32 poly_coeff[5][16][1]; > - __declspec(align(64)) VUINT32 Pi2H[16][1]; > - __declspec(align(64)) VUINT32 PiH[16][1]; > + __declspec(align(64)) VUINT32 SgnBit[16][1]; > + __declspec(align(64)) VUINT32 OneHalf[16][1]; > + __declspec(align(64)) VUINT32 SmallNorm[16][1]; > + __declspec(align(64)) VUINT32 MOne[16][1]; > + __declspec(align(64)) VUINT32 Two[16][1]; > + __declspec(align(64)) VUINT32 sqrt_coeff[2][16][1]; > + __declspec(align(64)) VUINT32 poly_coeff[5][16][1]; > + __declspec(align(64)) VUINT32 Pi2H[16][1]; > + __declspec(align(64)) VUINT32 PiH[16][1]; > } __svml_sacos_data_internal; > #endif > __svml_sacos_data_internal: > - /*== SgnBit ==*/ Or use a consistent comment indentation here. 
> - .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 > - /*== OneHalf ==*/ > - .align 64 > - .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 > - /*== SmallNorm ==*/ > - .align 64 > - .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000 > - /*== MOne ==*/ > - .align 64 > - .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000 > - /*== Two ==*/ > - .align 64 > - .long 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000 > - /*== sqrt_coeff[2] ==*/ > - .align 64 > - .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */ > - .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */ > - /*== poly_coeff[5] ==*/ > - .align 64 > - .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */ > - .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 
0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */ > - .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */ > - .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */ > - .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */ > - /*== Pi2H ==*/ > - .align 64 > - .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB > - /*== PiH ==*/ > - .align 64 > - .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB > - .align 64 > - .type __svml_sacos_data_internal,@object > - .size __svml_sacos_data_internal,.-__svml_sacos_data_internal > + /*== SgnBit ==*/ > + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 > + /*== OneHalf ==*/ > + .align 64 > + .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 > + /*== SmallNorm ==*/ > + .align 64 > + .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 
0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000 > + /*== MOne ==*/ > + .align 64 > + .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000 > + /*== Two ==*/ > + .align 64 > + .long 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000 > + /*== sqrt_coeff[2] ==*/ > + .align 64 > + .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */ > + .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */ > + /*== poly_coeff[5] ==*/ > + .align 64 > + .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */ > + .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */ > + .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */ > + .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */ > + .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 
0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */ > + /*== Pi2H ==*/ > + .align 64 > + .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB > + /*== PiH ==*/ > + .align 64 > + .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB > + .align 64 > + .type __svml_sacos_data_internal,@object > + .size __svml_sacos_data_internal,.-__svml_sacos_data_internal > -- > 2.34.1 >
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S index 7708073975..026c15830d 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S @@ -29,243 +29,243 @@ /* Offsets for data table __svml_sacos_data_internal */ -#define SgnBit 0 -#define OneHalf 64 -#define SmallNorm 128 -#define MOne 192 -#define Two 256 -#define sqrt_coeff_1 320 -#define sqrt_coeff_2 384 -#define poly_coeff_1 448 -#define poly_coeff_2 512 -#define poly_coeff_3 576 -#define poly_coeff_4 640 -#define poly_coeff_5 704 -#define Pi2H 768 -#define PiH 832 +#define SgnBit 0 +#define OneHalf 64 +#define SmallNorm 128 +#define MOne 192 +#define Two 256 +#define sqrt_coeff_1 320 +#define sqrt_coeff_2 384 +#define poly_coeff_1 448 +#define poly_coeff_2 512 +#define poly_coeff_3 576 +#define poly_coeff_4 640 +#define poly_coeff_5 704 +#define Pi2H 768 +#define PiH 832 #include <sysdep.h> - .text + .text .section .text.exex512,"ax",@progbits ENTRY(_ZGVeN16v_acosf_skx) - pushq %rbp - cfi_def_cfa_offset(16) - movq %rsp, %rbp - cfi_def_cfa(6, 16) - cfi_offset(6, -16) - andq $-64, %rsp - subq $192, %rsp - vmovups __svml_sacos_data_internal(%rip), %zmm5 - vmovups OneHalf+__svml_sacos_data_internal(%rip), %zmm6 + pushq %rbp + cfi_def_cfa_offset(16) + movq %rsp, %rbp + cfi_def_cfa(6, 16) + cfi_offset(6, -16) + andq $-64, %rsp + subq $192, %rsp + vmovups __svml_sacos_data_internal(%rip), %zmm5 + vmovups OneHalf+__svml_sacos_data_internal(%rip), %zmm6 /* SQ ~ 2*sqrt(Y) */ - vmovups SmallNorm+__svml_sacos_data_internal(%rip), %zmm9 - vmovups MOne+__svml_sacos_data_internal(%rip), %zmm8 - vmovups Two+__svml_sacos_data_internal(%rip), %zmm12 - vmovups sqrt_coeff_1+__svml_sacos_data_internal(%rip), %zmm13 - vmovaps %zmm0, %zmm4 + vmovups SmallNorm+__svml_sacos_data_internal(%rip), %zmm9 + vmovups MOne+__svml_sacos_data_internal(%rip), %zmm8 + vmovups 
Two+__svml_sacos_data_internal(%rip), %zmm12 + vmovups sqrt_coeff_1+__svml_sacos_data_internal(%rip), %zmm13 + vmovaps %zmm0, %zmm4 /* x = -|arg| */ - vorps %zmm4, %zmm5, %zmm3 - vandps %zmm4, %zmm5, %zmm2 - vmovups sqrt_coeff_2+__svml_sacos_data_internal(%rip), %zmm0 + vorps %zmm4, %zmm5, %zmm3 + vandps %zmm4, %zmm5, %zmm2 + vmovups sqrt_coeff_2+__svml_sacos_data_internal(%rip), %zmm0 /* Y = 0.5 + 0.5*(-x) */ - vfmadd231ps {rn-sae}, %zmm3, %zmm6, %zmm6 + vfmadd231ps {rn-sae}, %zmm3, %zmm6, %zmm6 /* x^2 */ - vmulps {rn-sae}, %zmm3, %zmm3, %zmm7 - vrsqrt14ps %zmm6, %zmm10 - vcmpps $17, {sae}, %zmm9, %zmm6, %k1 - vcmpps $22, {sae}, %zmm3, %zmm8, %k0 - vmovups poly_coeff_4+__svml_sacos_data_internal(%rip), %zmm9 - vminps {sae}, %zmm6, %zmm7, %zmm1 - vmovups poly_coeff_3+__svml_sacos_data_internal(%rip), %zmm7 - vxorps %zmm10, %zmm10, %zmm10{%k1} - vaddps {rn-sae}, %zmm6, %zmm6, %zmm14 - vmulps {rn-sae}, %zmm1, %zmm1, %zmm8 - vmulps {rn-sae}, %zmm10, %zmm10, %zmm11 - vmulps {rn-sae}, %zmm10, %zmm14, %zmm5 - vcmpps $21, {sae}, %zmm6, %zmm1, %k4 + vmulps {rn-sae}, %zmm3, %zmm3, %zmm7 + vrsqrt14ps %zmm6, %zmm10 + vcmpps $17, {sae}, %zmm9, %zmm6, %k1 + vcmpps $22, {sae}, %zmm3, %zmm8, %k0 + vmovups poly_coeff_4+__svml_sacos_data_internal(%rip), %zmm9 + vminps {sae}, %zmm6, %zmm7, %zmm1 + vmovups poly_coeff_3+__svml_sacos_data_internal(%rip), %zmm7 + vxorps %zmm10, %zmm10, %zmm10{%k1} + vaddps {rn-sae}, %zmm6, %zmm6, %zmm14 + vmulps {rn-sae}, %zmm1, %zmm1, %zmm8 + vmulps {rn-sae}, %zmm10, %zmm10, %zmm11 + vmulps {rn-sae}, %zmm10, %zmm14, %zmm5 + vcmpps $21, {sae}, %zmm6, %zmm1, %k4 /* X<X^2 iff X<0 */ - vcmpps $17, {sae}, %zmm1, %zmm4, %k2 + vcmpps $17, {sae}, %zmm1, %zmm4, %k2 /* polynomial */ - vmovups poly_coeff_1+__svml_sacos_data_internal(%rip), %zmm6 - vfmsub213ps {rn-sae}, %zmm12, %zmm11, %zmm14 - vmovups poly_coeff_2+__svml_sacos_data_internal(%rip), %zmm11 - vfmadd231ps {rn-sae}, %zmm1, %zmm7, %zmm9 - vmovups poly_coeff_5+__svml_sacos_data_internal(%rip), %zmm10 - 
vmovups Pi2H+__svml_sacos_data_internal(%rip), %zmm12 - vfmadd231ps {rn-sae}, %zmm14, %zmm13, %zmm0 - vfmadd231ps {rn-sae}, %zmm1, %zmm6, %zmm11 - vmulps {rn-sae}, %zmm14, %zmm5, %zmm15 - vfmadd213ps {rn-sae}, %zmm9, %zmm8, %zmm11 - vxorps %zmm12, %zmm12, %zmm12{%k4} - vfnmadd213ps {rn-sae}, %zmm5, %zmm15, %zmm0 - vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm11 - kmovw %k0, %edx - vmulps {rn-sae}, %zmm1, %zmm11, %zmm13 - vblendmps %zmm0, %zmm3, %zmm0{%k4} - vxorps %zmm2, %zmm0, %zmm1 - kandw %k4, %k2, %k3 - vfmadd213ps {rn-sae}, %zmm1, %zmm1, %zmm13 - vorps PiH+__svml_sacos_data_internal(%rip), %zmm12, %zmm12{%k3} - vaddps {rn-sae}, %zmm13, %zmm12, %zmm0 - testl %edx, %edx + vmovups poly_coeff_1+__svml_sacos_data_internal(%rip), %zmm6 + vfmsub213ps {rn-sae}, %zmm12, %zmm11, %zmm14 + vmovups poly_coeff_2+__svml_sacos_data_internal(%rip), %zmm11 + vfmadd231ps {rn-sae}, %zmm1, %zmm7, %zmm9 + vmovups poly_coeff_5+__svml_sacos_data_internal(%rip), %zmm10 + vmovups Pi2H+__svml_sacos_data_internal(%rip), %zmm12 + vfmadd231ps {rn-sae}, %zmm14, %zmm13, %zmm0 + vfmadd231ps {rn-sae}, %zmm1, %zmm6, %zmm11 + vmulps {rn-sae}, %zmm14, %zmm5, %zmm15 + vfmadd213ps {rn-sae}, %zmm9, %zmm8, %zmm11 + vxorps %zmm12, %zmm12, %zmm12{%k4} + vfnmadd213ps {rn-sae}, %zmm5, %zmm15, %zmm0 + vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm11 + kmovw %k0, %edx + vmulps {rn-sae}, %zmm1, %zmm11, %zmm13 + vblendmps %zmm0, %zmm3, %zmm0{%k4} + vxorps %zmm2, %zmm0, %zmm1 + kandw %k4, %k2, %k3 + vfmadd213ps {rn-sae}, %zmm1, %zmm1, %zmm13 + vorps PiH+__svml_sacos_data_internal(%rip), %zmm12, %zmm12{%k3} + vaddps {rn-sae}, %zmm13, %zmm12, %zmm0 + testl %edx, %edx /* Go to special inputs processing branch */ - jne L(SPECIAL_VALUES_BRANCH) - # LOE rbx r12 r13 r14 r15 edx zmm0 zmm4 + jne L(SPECIAL_VALUES_BRANCH) + # LOE rbx r12 r13 r14 r15 edx zmm0 zmm4 /* Restore registers * and exit the function */ L(EXIT): - movq %rbp, %rsp - popq %rbp - cfi_def_cfa(7, 8) - cfi_restore(6) - ret - cfi_def_cfa(6, 16) - cfi_offset(6, 
-16) + movq %rbp, %rsp + popq %rbp + cfi_def_cfa(7, 8) + cfi_restore(6) + ret + cfi_def_cfa(6, 16) + cfi_offset(6, -16) /* Branch to process * special inputs */ L(SPECIAL_VALUES_BRANCH): - vmovups %zmm4, 64(%rsp) - vmovups %zmm0, 128(%rsp) - # LOE rbx r12 r13 r14 r15 edx zmm0 - - xorl %eax, %eax - # LOE rbx r12 r13 r14 r15 eax edx - - vzeroupper - movq %r12, 16(%rsp) - /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */ - .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22 - movl %eax, %r12d - movq %r13, 8(%rsp) - /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */ - .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22 - movl %edx, %r13d - movq %r14, (%rsp) - /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */ - .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22 - # LOE rbx r15 r12d r13d + vmovups %zmm4, 64(%rsp) + vmovups %zmm0, 128(%rsp) + # LOE rbx r12 r13 r14 r15 edx zmm0 + + xorl %eax, %eax + # LOE rbx r12 r13 r14 r15 eax edx + + vzeroupper + movq %r12, 16(%rsp) + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */ + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22 + movl %eax, %r12d + movq %r13, 8(%rsp) + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */ + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22 + movl %edx, %r13d + movq %r14, (%rsp) + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; 
DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */ + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22 + # LOE rbx r15 r12d r13d /* Range mask * bits check */ L(RANGEMASK_CHECK): - btl %r12d, %r13d + btl %r12d, %r13d /* Call scalar math function */ - jc L(SCALAR_MATH_CALL) - # LOE rbx r15 r12d r13d + jc L(SCALAR_MATH_CALL) + # LOE rbx r15 r12d r13d /* Special inputs * processing loop */ L(SPECIAL_VALUES_LOOP): - incl %r12d - cmpl $16, %r12d + incl %r12d + cmpl $16, %r12d /* Check bits in range mask */ - jl L(RANGEMASK_CHECK) - # LOE rbx r15 r12d r13d + jl L(RANGEMASK_CHECK) + # LOE rbx r15 r12d r13d - movq 16(%rsp), %r12 - cfi_restore(12) - movq 8(%rsp), %r13 - cfi_restore(13) - movq (%rsp), %r14 - cfi_restore(14) - vmovups 128(%rsp), %zmm0 + movq 16(%rsp), %r12 + cfi_restore(12) + movq 8(%rsp), %r13 + cfi_restore(13) + movq (%rsp), %r14 + cfi_restore(14) + vmovups 128(%rsp), %zmm0 /* Go to exit */ - jmp L(EXIT) - /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */ - .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22 - /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */ - .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22 - /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */ - .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22 - # LOE rbx r12 r13 r14 r15 zmm0 + jmp L(EXIT) + /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */ + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 
0x0d, 0x50, 0xff, 0xff, 0xff, 0x22 + /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */ + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22 + /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */ + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22 + # LOE rbx r12 r13 r14 r15 zmm0 /* Scalar math fucntion call * to process special input */ L(SCALAR_MATH_CALL): - movl %r12d, %r14d - movss 64(%rsp,%r14,4), %xmm0 - call acosf@PLT - # LOE rbx r14 r15 r12d r13d xmm0 + movl %r12d, %r14d + movss 64(%rsp,%r14,4), %xmm0 + call acosf@PLT + # LOE rbx r14 r15 r12d r13d xmm0 - movss %xmm0, 128(%rsp,%r14,4) + movss %xmm0, 128(%rsp,%r14,4) /* Process special inputs in loop */ - jmp L(SPECIAL_VALUES_LOOP) - # LOE rbx r15 r12d r13d + jmp L(SPECIAL_VALUES_LOOP) + # LOE rbx r15 r12d r13d END(_ZGVeN16v_acosf_skx) - .section .rodata, "a" - .align 64 + .section .rodata, "a" + .align 64 #ifdef __svml_sacos_data_internal_typedef typedef unsigned int VUINT32; typedef struct { - __declspec(align(64)) VUINT32 SgnBit[16][1]; - __declspec(align(64)) VUINT32 OneHalf[16][1]; - __declspec(align(64)) VUINT32 SmallNorm[16][1]; - __declspec(align(64)) VUINT32 MOne[16][1]; - __declspec(align(64)) VUINT32 Two[16][1]; - __declspec(align(64)) VUINT32 sqrt_coeff[2][16][1]; - __declspec(align(64)) VUINT32 poly_coeff[5][16][1]; - __declspec(align(64)) VUINT32 Pi2H[16][1]; - __declspec(align(64)) VUINT32 PiH[16][1]; + __declspec(align(64)) VUINT32 SgnBit[16][1]; + __declspec(align(64)) VUINT32 OneHalf[16][1]; + __declspec(align(64)) VUINT32 SmallNorm[16][1]; + __declspec(align(64)) VUINT32 MOne[16][1]; + __declspec(align(64)) VUINT32 Two[16][1]; + __declspec(align(64)) VUINT32 sqrt_coeff[2][16][1]; + __declspec(align(64)) VUINT32 
poly_coeff[5][16][1]; + __declspec(align(64)) VUINT32 Pi2H[16][1]; + __declspec(align(64)) VUINT32 PiH[16][1]; } __svml_sacos_data_internal; #endif __svml_sacos_data_internal: - /*== SgnBit ==*/ - .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 - /*== OneHalf ==*/ - .align 64 - .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 - /*== SmallNorm ==*/ - .align 64 - .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000 - /*== MOne ==*/ - .align 64 - .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000 - /*== Two ==*/ - .align 64 - .long 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000 - /*== sqrt_coeff[2] ==*/ - .align 64 - .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */ - .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */ - /*== poly_coeff[5] ==*/ - .align 64 - .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 
0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */ - .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */ - .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */ - .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */ - .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */ - /*== Pi2H ==*/ - .align 64 - .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB - /*== PiH ==*/ - .align 64 - .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB - .align 64 - .type __svml_sacos_data_internal,@object - .size __svml_sacos_data_internal,.-__svml_sacos_data_internal + /*== SgnBit ==*/ + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 + /*== OneHalf ==*/ + .align 64 + .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 + /*== SmallNorm ==*/ + .align 64 + .long 0x2f800000, 0x2f800000, 
0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000 + /*== MOne ==*/ + .align 64 + .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000 + /*== Two ==*/ + .align 64 + .long 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000 + /*== sqrt_coeff[2] ==*/ + .align 64 + .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */ + .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */ + /*== poly_coeff[5] ==*/ + .align 64 + .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */ + .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */ + .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */ + .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */ 
+ .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */ + /*== Pi2H ==*/ + .align 64 + .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB + /*== PiH ==*/ + .align 64 + .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB + .align 64 + .type __svml_sacos_data_internal,@object + .size __svml_sacos_data_internal,.-__svml_sacos_data_internal