diff mbox series

[v1] x86: Reformat code in svml_s_atanhf16_core_avx512.S

Message ID 20220203215315.3285202-1-goldstein.w.n@gmail.com
State New
Headers show
Series [v1] x86: Reformat code in svml_s_atanhf16_core_avx512.S | expand

Commit Message

Noah Goldstein Feb. 3, 2022, 9:53 p.m. UTC
Reformats to match style of other hand coded assembly files.

The changes are:
    1. Replace 8x space with tab before instructions.
    2. After instruction len < 8 use tab.
    3. After instruction len >= 8 use space.
    4. 1 Space after comma between instruction operands.
    5. Indent comments similiar with code.
    6. Make comments complete sentences.
    7. Tab after '#define'
    8. Spaces at '#' representing the depth.

The final executable is unchanged by this commit.
---
The changes for this patch where made with the following
script: https://github.com/goldsteinn/assembly-beautifier

The goal of this patch is just to reformat the code
so it is more human friendly and try create a style
that future patches will be based on.

If this patch is accepted ensuing patches to
optimize the performance of svml_s_atanhf16_core_avx512.S
will be based on this patch.

 .../multiarch/svml_s_atanhf16_core_avx512.S   | 655 +++++++++---------
 1 file changed, 321 insertions(+), 334 deletions(-)

Comments

Sunil Pandey Feb. 5, 2022, 7:01 a.m. UTC | #1
Hi Noah,

Since this patch is about glibc assembly language formatting, it may be
helpful to everyone, if there is a section for glibc assembly coding
in manual.

https://sourceware.org/glibc/wiki/Style_and_Conventions

You may also consider putting your tool in glibc code base, so that people
can easily find/use it to check/format their assembly code.

https://github.com/goldsteinn/assembly-beautifier

Thank,
Sunil

On Thu, Feb 3, 2022 at 2:00 PM Noah Goldstein via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> Reformats to match style of other hand coded assembly files.
>
> The changes are:
>     1. Replace 8x space with tab before instructions.
>     2. After instruction len < 8 use tab.
>     3. After instruction len >= 8 use space.
>     4. 1 Space after comma between instruction operands.
>     5. Indent comments similiar with code.
>     6. Make comments complete sentences.
>     7. Tab after '#define'
>     8. Spaces at '#' representing the depth.
>
> The final executable is unchanged by this commit.
> ---
> The changes for this patch where made with the following
> script: https://github.com/goldsteinn/assembly-beautifier
>
> The goal of this patch is just to reformat the code
> so it is more human friendly and try create a style
> that future patches will be based on.
>
> If this patch is accepted ensuing patches to
> optimize the performance of svml_s_atanhf16_core_avx512.S
> will be based on this patch.
>
>  .../multiarch/svml_s_atanhf16_core_avx512.S   | 655 +++++++++---------
>  1 file changed, 321 insertions(+), 334 deletions(-)
>
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> index f863f4f959..ed90a427a6 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> @@ -33,361 +33,348 @@
>
>  /* Offsets for data table __svml_satanh_data_internal_avx512
>   */
> -#define Log_tbl_H                      0
> -#define Log_tbl_L                      128
> -#define One                            256
> -#define AbsMask                        320
> -#define AddB5                          384
> -#define RcpBitMask                     448
> -#define poly_coeff3                    512
> -#define poly_coeff2                    576
> -#define poly_coeff1                    640
> -#define poly_coeff0                    704
> -#define Half                           768
> -#define L2H                            832
> -#define L2L                            896
> +#define Log_tbl_H      0
> +#define Log_tbl_L      128
> +#define One    256
> +#define AbsMask        320
> +#define AddB5  384
> +#define RcpBitMask     448
> +#define poly_coeff3    512
> +#define poly_coeff2    576
> +#define poly_coeff1    640
> +#define poly_coeff0    704
> +#define Half   768
> +#define L2H    832
> +#define L2L    896
>
>  #include <sysdep.h>
>
> -        .text
> -       .section .text.exex512,"ax",@progbits
> +       .text
> +       .section .text.exex512, "ax", @progbits
>  ENTRY(_ZGVeN16v_atanhf_skx)
> -        pushq     %rbp
> -        cfi_def_cfa_offset(16)
> -        movq      %rsp, %rbp
> -        cfi_def_cfa(6, 16)
> -        cfi_offset(6, -16)
> -        andq      $-64, %rsp
> -        subq      $192, %rsp
> -        vmovups   One+__svml_satanh_data_internal_avx512(%rip), %zmm4
> -
> -/* round reciprocals to 1+5b mantissas */
> -        vmovups   AddB5+__svml_satanh_data_internal_avx512(%rip), %zmm14
> -        vmovups   RcpBitMask+__svml_satanh_data_internal_avx512(%rip), %zmm1
> -        vmovaps   %zmm0, %zmm11
> -        vandps    AbsMask+__svml_satanh_data_internal_avx512(%rip), %zmm11, %zmm6
> -
> -/* 1+y */
> -        vaddps    {rn-sae}, %zmm4, %zmm6, %zmm9
> -
> -/* 1-y */
> -        vsubps    {rn-sae}, %zmm6, %zmm4, %zmm8
> -        vxorps    %zmm6, %zmm11, %zmm10
> -
> -/* Yp_high */
> -        vsubps    {rn-sae}, %zmm4, %zmm9, %zmm2
> -
> -/* -Ym_high */
> -        vsubps    {rn-sae}, %zmm4, %zmm8, %zmm5
> -
> -/* RcpP ~ 1/Yp */
> -        vrcp14ps  %zmm9, %zmm12
> -
> -/* RcpM ~ 1/Ym */
> -        vrcp14ps  %zmm8, %zmm13
> -
> -/* input outside (-1, 1) ? */
> -        vcmpps    $21, {sae}, %zmm4, %zmm6, %k0
> -        vpaddd    %zmm14, %zmm12, %zmm15
> -        vpaddd    %zmm14, %zmm13, %zmm0
> -
> -/* Yp_low */
> -        vsubps    {rn-sae}, %zmm2, %zmm6, %zmm3
> -        vandps    %zmm1, %zmm15, %zmm7
> -        vandps    %zmm1, %zmm0, %zmm12
> -
> -/* Ym_low */
> -        vaddps    {rn-sae}, %zmm5, %zmm6, %zmm5
> -
> -/* Reduced argument: Rp = (RcpP*Yp - 1)+RcpP*Yp_low */
> -        vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9
> -
> -/* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low */
> -        vfmsub231ps {rn-sae}, %zmm12, %zmm8, %zmm4
> -        vmovups   Log_tbl_L+__svml_satanh_data_internal_avx512(%rip), %zmm8
> -        vmovups   Log_tbl_L+64+__svml_satanh_data_internal_avx512(%rip), %zmm13
> -
> -/* exponents */
> -        vgetexpps {sae}, %zmm7, %zmm15
> -        vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9
> -
> -/* Table lookups */
> -        vmovups   __svml_satanh_data_internal_avx512(%rip), %zmm6
> -        vgetexpps {sae}, %zmm12, %zmm14
> -        vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm4
> -
> -/* Prepare table index */
> -        vpsrld    $18, %zmm7, %zmm3
> -        vpsrld    $18, %zmm12, %zmm2
> -        vmovups   Log_tbl_H+64+__svml_satanh_data_internal_avx512(%rip), %zmm7
> -        vmovups   poly_coeff1+__svml_satanh_data_internal_avx512(%rip), %zmm12
> -
> -/* Km-Kp */
> -        vsubps    {rn-sae}, %zmm15, %zmm14, %zmm1
> -        kmovw     %k0, %edx
> -        vmovaps   %zmm3, %zmm0
> -        vpermi2ps %zmm13, %zmm8, %zmm3
> -        vpermt2ps %zmm13, %zmm2, %zmm8
> -        vpermi2ps %zmm7, %zmm6, %zmm0
> -        vpermt2ps %zmm7, %zmm2, %zmm6
> -        vsubps    {rn-sae}, %zmm3, %zmm8, %zmm5
> -
> -/* K*L2H + Th */
> -        vmovups   L2H+__svml_satanh_data_internal_avx512(%rip), %zmm2
> -
> -/* K*L2L + Tl */
> -        vmovups   L2L+__svml_satanh_data_internal_avx512(%rip), %zmm3
> -
> -/* polynomials */
> -        vmovups   poly_coeff3+__svml_satanh_data_internal_avx512(%rip), %zmm7
> -        vmovups   poly_coeff0+__svml_satanh_data_internal_avx512(%rip), %zmm13
> -
> -/* table values */
> -        vsubps    {rn-sae}, %zmm0, %zmm6, %zmm0
> -        vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm0
> -        vfmadd213ps {rn-sae}, %zmm5, %zmm3, %zmm1
> -        vmovups   poly_coeff2+__svml_satanh_data_internal_avx512(%rip), %zmm3
> -        vmovaps   %zmm3, %zmm2
> -        vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm2
> -        vfmadd231ps {rn-sae}, %zmm4, %zmm7, %zmm3
> -        vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm2
> -        vfmadd213ps {rn-sae}, %zmm12, %zmm4, %zmm3
> -        vfmadd213ps {rn-sae}, %zmm13, %zmm9, %zmm2
> -        vfmadd213ps {rn-sae}, %zmm13, %zmm4, %zmm3
> -
> -/* (K*L2L + Tl) + Rp*PolyP */
> -        vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm2
> -        vorps     Half+__svml_satanh_data_internal_avx512(%rip), %zmm10, %zmm9
> -
> -/* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM */
> -        vfnmadd213ps {rn-sae}, %zmm2, %zmm4, %zmm3
> -        vaddps    {rn-sae}, %zmm3, %zmm0, %zmm4
> -        vmulps    {rn-sae}, %zmm9, %zmm4, %zmm0
> -        testl     %edx, %edx
> -
> -/* Go to special inputs processing branch */
> -        jne       L(SPECIAL_VALUES_BRANCH)
> -                                # LOE rbx r12 r13 r14 r15 edx zmm0 zmm11
> -
> -/* Restore registers
> - * and exit the function
> - */
> +       pushq   %rbp
> +       cfi_def_cfa_offset (16)
> +       movq    %rsp, %rbp
> +       cfi_def_cfa (6, 16)
> +       cfi_offset (6, -16)
> +       andq    $-64, %rsp
> +       subq    $192, %rsp
> +       vmovups One + __svml_satanh_data_internal_avx512(%rip), %zmm4
> +
> +       /* round reciprocals to 1+5b mantissas.  */
> +       vmovups AddB5 + __svml_satanh_data_internal_avx512(%rip), %zmm14
> +       vmovups RcpBitMask + __svml_satanh_data_internal_avx512(%rip), %zmm1
> +       vmovaps %zmm0, %zmm11
> +       vandps  AbsMask + __svml_satanh_data_internal_avx512(%rip), %zmm11, %zmm6
> +
> +       /* 1+y.  */
> +       vaddps  {rn-sae}, %zmm4, %zmm6, %zmm9
> +
> +       /* 1-y.  */
> +       vsubps  {rn-sae}, %zmm6, %zmm4, %zmm8
> +       vxorps  %zmm6, %zmm11, %zmm10
> +
> +       /* Yp_high.  */
> +       vsubps  {rn-sae}, %zmm4, %zmm9, %zmm2
> +
> +       /* -Ym_high.  */
> +       vsubps  {rn-sae}, %zmm4, %zmm8, %zmm5
> +
> +       /* RcpP ~ 1/Yp.  */
> +       vrcp14ps %zmm9, %zmm12
> +
> +       /* RcpM ~ 1/Ym.  */
> +       vrcp14ps %zmm8, %zmm13
> +
> +       /* input outside (-1, 1) ?.  */
> +       vcmpps  $21, {sae}, %zmm4, %zmm6, %k0
> +       vpaddd  %zmm14, %zmm12, %zmm15
> +       vpaddd  %zmm14, %zmm13, %zmm0
> +
> +       /* Yp_low.  */
> +       vsubps  {rn-sae}, %zmm2, %zmm6, %zmm3
> +       vandps  %zmm1, %zmm15, %zmm7
> +       vandps  %zmm1, %zmm0, %zmm12
> +
> +       /* Ym_low.  */
> +       vaddps  {rn-sae}, %zmm5, %zmm6, %zmm5
> +
> +       /* Reduced argument: Rp = (RcpP*Yp - 1)+RcpP*Yp_low.  */
> +       vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9
> +
> +       /* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low.  */
> +       vfmsub231ps {rn-sae}, %zmm12, %zmm8, %zmm4
> +       vmovups Log_tbl_L + __svml_satanh_data_internal_avx512(%rip), %zmm8
> +       vmovups Log_tbl_L + 64 + __svml_satanh_data_internal_avx512(%rip), %zmm13
> +
> +       /* exponents.  */
> +       vgetexpps {sae}, %zmm7, %zmm15
> +       vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9
> +
> +       /* Table lookups.  */
> +       vmovups __svml_satanh_data_internal_avx512(%rip), %zmm6
> +       vgetexpps {sae}, %zmm12, %zmm14
> +       vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm4
> +
> +       /* Prepare table index.  */
> +       vpsrld  $18, %zmm7, %zmm3
> +       vpsrld  $18, %zmm12, %zmm2
> +       vmovups Log_tbl_H + 64 + __svml_satanh_data_internal_avx512(%rip), %zmm7
> +       vmovups poly_coeff1 + __svml_satanh_data_internal_avx512(%rip), %zmm12
> +
> +       /* Km-Kp.  */
> +       vsubps  {rn-sae}, %zmm15, %zmm14, %zmm1
> +       kmovw   %k0, %edx
> +       vmovaps %zmm3, %zmm0
> +       vpermi2ps %zmm13, %zmm8, %zmm3
> +       vpermt2ps %zmm13, %zmm2, %zmm8
> +       vpermi2ps %zmm7, %zmm6, %zmm0
> +       vpermt2ps %zmm7, %zmm2, %zmm6
> +       vsubps  {rn-sae}, %zmm3, %zmm8, %zmm5
> +
> +       /* K*L2H + Th.  */
> +       vmovups L2H + __svml_satanh_data_internal_avx512(%rip), %zmm2
> +
> +       /* K*L2L + Tl.  */
> +       vmovups L2L + __svml_satanh_data_internal_avx512(%rip), %zmm3
> +
> +       /* polynomials.  */
> +       vmovups poly_coeff3 + __svml_satanh_data_internal_avx512(%rip), %zmm7
> +       vmovups poly_coeff0 + __svml_satanh_data_internal_avx512(%rip), %zmm13
> +
> +       /* table values.  */
> +       vsubps  {rn-sae}, %zmm0, %zmm6, %zmm0
> +       vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm0
> +       vfmadd213ps {rn-sae}, %zmm5, %zmm3, %zmm1
> +       vmovups poly_coeff2 + __svml_satanh_data_internal_avx512(%rip), %zmm3
> +       vmovaps %zmm3, %zmm2
> +       vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm2
> +       vfmadd231ps {rn-sae}, %zmm4, %zmm7, %zmm3
> +       vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm2
> +       vfmadd213ps {rn-sae}, %zmm12, %zmm4, %zmm3
> +       vfmadd213ps {rn-sae}, %zmm13, %zmm9, %zmm2
> +       vfmadd213ps {rn-sae}, %zmm13, %zmm4, %zmm3
> +
> +       /* (K*L2L + Tl) + Rp*PolyP.  */
> +       vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm2
> +       vorps   Half + __svml_satanh_data_internal_avx512(%rip), %zmm10, %zmm9
> +
> +       /* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM.  */
> +       vfnmadd213ps {rn-sae}, %zmm2, %zmm4, %zmm3
> +       vaddps  {rn-sae}, %zmm3, %zmm0, %zmm4
> +       vmulps  {rn-sae}, %zmm9, %zmm4, %zmm0
> +       testl   %edx, %edx
> +
> +       /* Go to special inputs processing branch.  */
> +       jne     L(SPECIAL_VALUES_BRANCH)
> +
> +       /* Restore registers * and exit the function.  */
>
>  L(EXIT):
> -        movq      %rbp, %rsp
> -        popq      %rbp
> -        cfi_def_cfa(7, 8)
> -        cfi_restore(6)
> -        ret
> -        cfi_def_cfa(6, 16)
> -        cfi_offset(6, -16)
> -
> -/* Branch to process
> - * special inputs
> - */
> +       movq    %rbp, %rsp
> +       popq    %rbp
> +       cfi_def_cfa (7, 8)
> +       cfi_restore (6)
> +       ret
> +       cfi_def_cfa (6, 16)
> +       cfi_offset (6, -16)
> +
> +       /* Branch to process special inputs.  */
>
>  L(SPECIAL_VALUES_BRANCH):
> -        vmovups   %zmm11, 64(%rsp)
> -        vmovups   %zmm0, 128(%rsp)
> -                                # LOE rbx r12 r13 r14 r15 edx zmm0
> -
> -        xorl      %eax, %eax
> -                                # LOE rbx r12 r13 r14 r15 eax edx
> -
> -        vzeroupper
> -        movq      %r12, 16(%rsp)
> -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> -        movl      %eax, %r12d
> -        movq      %r13, 8(%rsp)
> -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> -        movl      %edx, %r13d
> -        movq      %r14, (%rsp)
> -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> -                                # LOE rbx r15 r12d r13d
> -
> -/* Range mask
> - * bits check
> - */
> +       vmovups %zmm11, 64(%rsp)
> +       vmovups %zmm0, 128(%rsp)
> +
> +       xorl    %eax, %eax
> +
> +       vzeroupper
> +       movq    %r12, 16(%rsp)
> +       /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s:
> +          -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus).  */
> +       .cfi_escape 0x10 , 0x0c , 0x0e , 0x38 , 0x1c , 0x0d , 0xc0 , 0xff , 0xff , 0xff , 0x1a , 0x0d , 0x50 , 0xff , 0xff , 0xff , 0x22
> +       movl    %eax, %r12d
> +       movq    %r13, 8(%rsp)
> +       /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s:
> +          -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus).  */
> +       .cfi_escape 0x10 , 0x0d , 0x0e , 0x38 , 0x1c , 0x0d , 0xc0 , 0xff , 0xff , 0xff , 0x1a , 0x0d , 0x48 , 0xff , 0xff , 0xff , 0x22
> +       movl    %edx, %r13d
> +       movq    %r14, (%rsp)
> +       /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s:
> +          -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus).  */
> +       .cfi_escape 0x10 , 0x0e , 0x0e , 0x38 , 0x1c , 0x0d , 0xc0 , 0xff , 0xff , 0xff , 0x1a , 0x0d , 0x40 , 0xff , 0xff , 0xff , 0x22
> +
> +       /* Range mask * bits check.  */
>
>  L(RANGEMASK_CHECK):
> -        btl       %r12d, %r13d
> +       btl     %r12d, %r13d
>
> -/* Call scalar math function */
> -        jc        L(SCALAR_MATH_CALL)
> -                                # LOE rbx r15 r12d r13d
> +       /* Call scalar math function.  */
> +       jc      L(SCALAR_MATH_CALL)
>
> -/* Special inputs
> - * processing loop
> - */
> +       /* Special inputs processing loop.  */
>
>  L(SPECIAL_VALUES_LOOP):
> -        incl      %r12d
> -        cmpl      $16, %r12d
> -
> -/* Check bits in range mask */
> -        jl        L(RANGEMASK_CHECK)
> -                                # LOE rbx r15 r12d r13d
> -
> -        movq      16(%rsp), %r12
> -        cfi_restore(12)
> -        movq      8(%rsp), %r13
> -        cfi_restore(13)
> -        movq      (%rsp), %r14
> -        cfi_restore(14)
> -        vmovups   128(%rsp), %zmm0
> -
> -/* Go to exit */
> -        jmp       L(EXIT)
> -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> -                                # LOE rbx r12 r13 r14 r15 zmm0
> -
> -/* Scalar math fucntion call
> - * to process special input
> - */
> +       incl    %r12d
> +       cmpl    $16, %r12d
> +
> +       /* Check bits in range mask.  */
> +       jl      L(RANGEMASK_CHECK)
> +
> +       movq    16(%rsp), %r12
> +       cfi_restore (12)
> +       movq    8(%rsp), %r13
> +       cfi_restore (13)
> +       movq    (%rsp), %r14
> +       cfi_restore (14)
> +       vmovups 128(%rsp), %zmm0
> +
> +       /* Go to exit.  */
> +       jmp     L(EXIT)
> +       /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s:
> +          -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus).  */
> +       .cfi_escape 0x10 , 0x0c , 0x0e , 0x38 , 0x1c , 0x0d , 0xc0 , 0xff , 0xff , 0xff , 0x1a , 0x0d , 0x50 , 0xff , 0xff , 0xff , 0x22
> +       /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s:
> +          -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus).  */
> +       .cfi_escape 0x10 , 0x0d , 0x0e , 0x38 , 0x1c , 0x0d , 0xc0 , 0xff , 0xff , 0xff , 0x1a , 0x0d , 0x48 , 0xff , 0xff , 0xff , 0x22
> +       /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s:
> +          -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus).  */
> +       .cfi_escape 0x10 , 0x0e , 0x0e , 0x38 , 0x1c , 0x0d , 0xc0 , 0xff , 0xff , 0xff , 0x1a , 0x0d , 0x40 , 0xff , 0xff , 0xff , 0x22
> +
> +       /* Scalar math fucntion call to process special input.  */
>
>  L(SCALAR_MATH_CALL):
> -        movl      %r12d, %r14d
> -        movss     64(%rsp,%r14,4), %xmm0
> -        call      atanhf@PLT
> -                                # LOE rbx r14 r15 r12d r13d xmm0
> +       movl    %r12d, %r14d
> +       movss   64(%rsp, %r14, 4), %xmm0
> +       call    atanhf@PLT
>
> -        movss     %xmm0, 128(%rsp,%r14,4)
> +       movss   %xmm0, 128(%rsp, %r14, 4)
>
> -/* Process special inputs in loop */
> -        jmp       L(SPECIAL_VALUES_LOOP)
> -                                # LOE rbx r15 r12d r13d
> +       /* Process special inputs in loop.  */
> +       jmp     L(SPECIAL_VALUES_LOOP)
>  END(_ZGVeN16v_atanhf_skx)
>
> -        .section .rodata, "a"
> -        .align 64
> +       .section .rodata, "a"
> +       .align  64
>
>  #ifdef __svml_satanh_data_internal_avx512_typedef
> -typedef unsigned int VUINT32;
> -typedef struct {
> -        __declspec(align(64)) VUINT32 Log_tbl_H[32][1];
> -        __declspec(align(64)) VUINT32 Log_tbl_L[32][1];
> -        __declspec(align(64)) VUINT32 One[16][1];
> -        __declspec(align(64)) VUINT32 AbsMask[16][1];
> -        __declspec(align(64)) VUINT32 AddB5[16][1];
> -        __declspec(align(64)) VUINT32 RcpBitMask[16][1];
> -        __declspec(align(64)) VUINT32 poly_coeff3[16][1];
> -        __declspec(align(64)) VUINT32 poly_coeff2[16][1];
> -        __declspec(align(64)) VUINT32 poly_coeff1[16][1];
> -        __declspec(align(64)) VUINT32 poly_coeff0[16][1];
> -        __declspec(align(64)) VUINT32 Half[16][1];
> -        __declspec(align(64)) VUINT32 L2H[16][1];
> -        __declspec(align(64)) VUINT32 L2L[16][1];
> -    } __svml_satanh_data_internal_avx512;
> +       typedef unsigned int VUINT32;
> +       typedef struct{
> +       __declspec (align(64))VUINT32 Log_tbl_H[32][1];
> +       __declspec (align(64))VUINT32 Log_tbl_L[32][1];
> +       __declspec (align(64))VUINT32 One[16][1];
> +       __declspec (align(64))VUINT32 AbsMask[16][1];
> +       __declspec (align(64))VUINT32 AddB5[16][1];
> +       __declspec (align(64))VUINT32 RcpBitMask[16][1];
> +       __declspec (align(64))VUINT32 poly_coeff3[16][1];
> +       __declspec (align(64))VUINT32 poly_coeff2[16][1];
> +       __declspec (align(64))VUINT32 poly_coeff1[16][1];
> +       __declspec (align(64))VUINT32 poly_coeff0[16][1];
> +       __declspec (align(64))VUINT32 Half[16][1];
> +       __declspec (align(64))VUINT32 L2H[16][1];
> +       __declspec (align(64))VUINT32 L2L[16][1];
> +       }__svml_satanh_data_internal_avx512;
>  #endif
>  __svml_satanh_data_internal_avx512:
> -        /*== Log_tbl_H ==*/
> -        .long 0x00000000
> -        .long 0x3cfc0000
> -        .long 0x3d780000
> -        .long 0x3db78000
> -        .long 0x3df10000
> -        .long 0x3e14c000
> -        .long 0x3e300000
> -        .long 0x3e4a8000
> -        .long 0x3e648000
> -        .long 0x3e7dc000
> -        .long 0x3e8b4000
> -        .long 0x3e974000
> -        .long 0x3ea30000
> -        .long 0x3eae8000
> -        .long 0x3eb9c000
> -        .long 0x3ec4e000
> -        .long 0x3ecfa000
> -        .long 0x3eda2000
> -        .long 0x3ee48000
> -        .long 0x3eeea000
> -        .long 0x3ef8a000
> -        .long 0x3f013000
> -        .long 0x3f05f000
> -        .long 0x3f0aa000
> -        .long 0x3f0f4000
> -        .long 0x3f13d000
> -        .long 0x3f184000
> -        .long 0x3f1ca000
> -        .long 0x3f20f000
> -        .long 0x3f252000
> -        .long 0x3f295000
> -        .long 0x3f2d7000
> -        /*== Log_tbl_L ==*/
> -        .align 64
> -        .long 0x00000000
> -        .long 0x3726c39e
> -        .long 0x38a30c01
> -        .long 0x37528ae5
> -        .long 0x38e0edc5
> -        .long 0xb8ab41f8
> -        .long 0xb7cf8f58
> -        .long 0x3896a73d
> -        .long 0xb5838656
> -        .long 0x380c36af
> -        .long 0xb8235454
> -        .long 0x3862bae1
> -        .long 0x38c5e10e
> -        .long 0x38dedfac
> -        .long 0x38ebfb5e
> -        .long 0xb8e63c9f
> -        .long 0xb85c1340
> -        .long 0x38777bcd
> -        .long 0xb6038656
> -        .long 0x37d40984
> -        .long 0xb8b85028
> -        .long 0xb8ad5a5a
> -        .long 0x3865c84a
> -        .long 0x38c3d2f5
> -        .long 0x383ebce1
> -        .long 0xb8a1ed76
> -        .long 0xb7a332c4
> -        .long 0xb779654f
> -        .long 0xb8602f73
> -        .long 0x38f85db0
> -        .long 0x37b4996f
> -        .long 0xb8bfb3ca
> -        /*== One ==*/
> -        .align 64
> -        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> -        /*== AbsMask ==*/
> -        .align 64
> -        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> -        /*== AddB5 ==*/
> -        .align 64
> -        .long 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000
> -        /*== RcpBitMask ==*/
> -        .align 64
> -        .long 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> -        /*== poly_coeff3 ==*/
> -        .align 64
> -        .long 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> -        /*== poly_coeff2 ==*/
> -        .align 64
> -        .long 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> -        /*== poly_coeff1 ==*/
> -        .align 64
> -        .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> -        /*== poly_coeff0 ==*/
> -        .align 64
> -        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> -        /*== Half ==*/
> -        .align 64
> -        .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
> -        /*== L2H = log(2)_high ==*/
> -        .align 64
> -        .long 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> -        /*== L2L = log(2)_low ==*/
> -        .align 64
> -        .long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> -        .align 64
> -        .type  __svml_satanh_data_internal_avx512,@object
> -        .size  __svml_satanh_data_internal_avx512,.-__svml_satanh_data_internal_avx512
> +       /* == Log_tbl_H ==.  */
> +       .long   0x00000000
> +       .long   0x3cfc0000
> +       .long   0x3d780000
> +       .long   0x3db78000
> +       .long   0x3df10000
> +       .long   0x3e14c000
> +       .long   0x3e300000
> +       .long   0x3e4a8000
> +       .long   0x3e648000
> +       .long   0x3e7dc000
> +       .long   0x3e8b4000
> +       .long   0x3e974000
> +       .long   0x3ea30000
> +       .long   0x3eae8000
> +       .long   0x3eb9c000
> +       .long   0x3ec4e000
> +       .long   0x3ecfa000
> +       .long   0x3eda2000
> +       .long   0x3ee48000
> +       .long   0x3eeea000
> +       .long   0x3ef8a000
> +       .long   0x3f013000
> +       .long   0x3f05f000
> +       .long   0x3f0aa000
> +       .long   0x3f0f4000
> +       .long   0x3f13d000
> +       .long   0x3f184000
> +       .long   0x3f1ca000
> +       .long   0x3f20f000
> +       .long   0x3f252000
> +       .long   0x3f295000
> +       .long   0x3f2d7000
> +       /* == Log_tbl_L ==.  */
> +       .align  64
> +       .long   0x00000000
> +       .long   0x3726c39e
> +       .long   0x38a30c01
> +       .long   0x37528ae5
> +       .long   0x38e0edc5
> +       .long   0xb8ab41f8
> +       .long   0xb7cf8f58
> +       .long   0x3896a73d
> +       .long   0xb5838656
> +       .long   0x380c36af
> +       .long   0xb8235454
> +       .long   0x3862bae1
> +       .long   0x38c5e10e
> +       .long   0x38dedfac
> +       .long   0x38ebfb5e
> +       .long   0xb8e63c9f
> +       .long   0xb85c1340
> +       .long   0x38777bcd
> +       .long   0xb6038656
> +       .long   0x37d40984
> +       .long   0xb8b85028
> +       .long   0xb8ad5a5a
> +       .long   0x3865c84a
> +       .long   0x38c3d2f5
> +       .long   0x383ebce1
> +       .long   0xb8a1ed76
> +       .long   0xb7a332c4
> +       .long   0xb779654f
> +       .long   0xb8602f73
> +       .long   0x38f85db0
> +       .long   0x37b4996f
> +       .long   0xb8bfb3ca
> +       /* == One ==.  */
> +       .align  64
> +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> +       /* == AbsMask ==.  */
> +       .align  64
> +       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> +       /* == AddB5 ==.  */
> +       .align  64
> +       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000
> +       /* == RcpBitMask ==.  */
> +       .align  64
> +       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> +       /* == poly_coeff3 ==.  */
> +       .align  64
> +       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> +       /* == poly_coeff2 ==.  */
> +       .align  64
> +       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> +       /* == poly_coeff1 ==.  */
> +       .align  64
> +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> +       /* == poly_coeff0 ==.  */
> +       .align  64
> +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> +       /* == Half ==.  */
> +       .align  64
> +       .long   0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
> +       /* == L2H = log(2)_high ==.  */
> +       .align  64
> +       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> +       /* == L2L = log(2)_low ==.  */
> +       .align  64
> +       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> +       .align  64
> +       .type   __svml_satanh_data_internal_avx512, @object
> +       .size   __svml_satanh_data_internal_avx512, .-__svml_satanh_data_internal_avx512
> --
> 2.25.1
>
Noah Goldstein Feb. 5, 2022, 7:46 a.m. UTC | #2
On Sat, Feb 5, 2022 at 2:01 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> Hi Noah,
>
> Since this patch is about glibc assembly language formatting, it may be
> helpful to everyone, if there is a section for glibc assembly coding
> in manual.
>
> https://sourceware.org/glibc/wiki/Style_and_Conventions
>
> You may also consider putting your tool in glibc code base, so that people
> can easily find/use it to check/format their assembly code.

Like the idea. How does the following sound:

== x86 GAS (Assembly) ==

The following are stylistic conventions for x86 GAS
contributions. Other style conventions that are relevant to x86 GAS
syntax are also included (i.e column limit, indenting preprocessor
directives, etc..).

1.  Instructions should be proceeded by a tab.
2.  Instruction less than 8 characters in length should have a tab
    between it and the first operand.
3.  Instruction greater than 7 characters in length should have a
    space between it and the first operand.
4.  There should be a space after the comma seperating operands.
    For example for rules 1, 2, 3, and 4:
    ```
    /* <tab><short instruction><tab><op0><comma><space><op1>.  */
    addl $1, %eax
    /* <tab><long instruction><space><op0><comma><space><op1>.  */
    vpmovmaskb %ymm0, %eax
    ```

5.  Comments should be indented with code:
    For example for rule 5:
    ```
    /* Function XYZ returns.  */
    ENTRY(XYZ)
    /* Return from XYZ.  */
    ret
    END(XYZ)
    ```

6.  Tab after `#define`d names and their value.
    For example:
    ```
    /* No value for 'HELLO' so no tab afterwards.  */
    #define HELLO

    /* Value for 'WORLD' so tab separated 'WORLD' from '10'.  */
    #define WORLD 10
    ```


>
> https://github.com/goldsteinn/assembly-beautifier

Thats not an official tool. Its code quality is not really up to standard
and I can't really commit to maintaining it.

If its improved to a state where its good enough for GLIBC I'll
post a patch.
>
> Thank,
> Sunil
>
> On Thu, Feb 3, 2022 at 2:00 PM Noah Goldstein via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > Reformats to match style of other hand coded assembly files.
> >
> > The changes are:
> >     1. Replace 8x space with tab before instructions.
> >     2. After instruction len < 8 use tab.
> >     3. After instruction len >= 8 use space.
> >     4. 1 Space after comma between instruction operands.
> >     5. Indent comments similiar with code.
> >     6. Make comments complete sentences.
> >     7. Tab after '#define'
> >     8. Spaces at '#' representing the depth.
> >
> > The final executable is unchanged by this commit.
> > ---
> > The changes for this patch where made with the following
> > script: https://github.com/goldsteinn/assembly-beautifier
> >
> > The goal of this patch is just to reformat the code
> > so it is more human friendly and try create a style
> > that future patches will be based on.
> >
> > If this patch is accepted ensuing patches to
> > optimize the performance of svml_s_atanhf16_core_avx512.S
> > will be based on this patch.
> >
> >  .../multiarch/svml_s_atanhf16_core_avx512.S   | 655 +++++++++---------
> >  1 file changed, 321 insertions(+), 334 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> > index f863f4f959..ed90a427a6 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> > @@ -33,361 +33,348 @@
> >
> >  /* Offsets for data table __svml_satanh_data_internal_avx512
> >   */
> > -#define Log_tbl_H                      0
> > -#define Log_tbl_L                      128
> > -#define One                            256
> > -#define AbsMask                        320
> > -#define AddB5                          384
> > -#define RcpBitMask                     448
> > -#define poly_coeff3                    512
> > -#define poly_coeff2                    576
> > -#define poly_coeff1                    640
> > -#define poly_coeff0                    704
> > -#define Half                           768
> > -#define L2H                            832
> > -#define L2L                            896
> > +#define Log_tbl_H      0
> > +#define Log_tbl_L      128
> > +#define One    256
> > +#define AbsMask        320
> > +#define AddB5  384
> > +#define RcpBitMask     448
> > +#define poly_coeff3    512
> > +#define poly_coeff2    576
> > +#define poly_coeff1    640
> > +#define poly_coeff0    704
> > +#define Half   768
> > +#define L2H    832
> > +#define L2L    896
> >
> >  #include <sysdep.h>
> >
> > -        .text
> > -       .section .text.exex512,"ax",@progbits
> > +       .text
> > +       .section .text.exex512, "ax", @progbits
> >  ENTRY(_ZGVeN16v_atanhf_skx)
> > -        pushq     %rbp
> > -        cfi_def_cfa_offset(16)
> > -        movq      %rsp, %rbp
> > -        cfi_def_cfa(6, 16)
> > -        cfi_offset(6, -16)
> > -        andq      $-64, %rsp
> > -        subq      $192, %rsp
> > -        vmovups   One+__svml_satanh_data_internal_avx512(%rip), %zmm4
> > -
> > -/* round reciprocals to 1+5b mantissas */
> > -        vmovups   AddB5+__svml_satanh_data_internal_avx512(%rip), %zmm14
> > -        vmovups   RcpBitMask+__svml_satanh_data_internal_avx512(%rip), %zmm1
> > -        vmovaps   %zmm0, %zmm11
> > -        vandps    AbsMask+__svml_satanh_data_internal_avx512(%rip), %zmm11, %zmm6
> > -
> > -/* 1+y */
> > -        vaddps    {rn-sae}, %zmm4, %zmm6, %zmm9
> > -
> > -/* 1-y */
> > -        vsubps    {rn-sae}, %zmm6, %zmm4, %zmm8
> > -        vxorps    %zmm6, %zmm11, %zmm10
> > -
> > -/* Yp_high */
> > -        vsubps    {rn-sae}, %zmm4, %zmm9, %zmm2
> > -
> > -/* -Ym_high */
> > -        vsubps    {rn-sae}, %zmm4, %zmm8, %zmm5
> > -
> > -/* RcpP ~ 1/Yp */
> > -        vrcp14ps  %zmm9, %zmm12
> > -
> > -/* RcpM ~ 1/Ym */
> > -        vrcp14ps  %zmm8, %zmm13
> > -
> > -/* input outside (-1, 1) ? */
> > -        vcmpps    $21, {sae}, %zmm4, %zmm6, %k0
> > -        vpaddd    %zmm14, %zmm12, %zmm15
> > -        vpaddd    %zmm14, %zmm13, %zmm0
> > -
> > -/* Yp_low */
> > -        vsubps    {rn-sae}, %zmm2, %zmm6, %zmm3
> > -        vandps    %zmm1, %zmm15, %zmm7
> > -        vandps    %zmm1, %zmm0, %zmm12
> > -
> > -/* Ym_low */
> > -        vaddps    {rn-sae}, %zmm5, %zmm6, %zmm5
> > -
> > -/* Reduced argument: Rp = (RcpP*Yp - 1)+RcpP*Yp_low */
> > -        vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9
> > -
> > -/* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low */
> > -        vfmsub231ps {rn-sae}, %zmm12, %zmm8, %zmm4
> > -        vmovups   Log_tbl_L+__svml_satanh_data_internal_avx512(%rip), %zmm8
> > -        vmovups   Log_tbl_L+64+__svml_satanh_data_internal_avx512(%rip), %zmm13
> > -
> > -/* exponents */
> > -        vgetexpps {sae}, %zmm7, %zmm15
> > -        vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9
> > -
> > -/* Table lookups */
> > -        vmovups   __svml_satanh_data_internal_avx512(%rip), %zmm6
> > -        vgetexpps {sae}, %zmm12, %zmm14
> > -        vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm4
> > -
> > -/* Prepare table index */
> > -        vpsrld    $18, %zmm7, %zmm3
> > -        vpsrld    $18, %zmm12, %zmm2
> > -        vmovups   Log_tbl_H+64+__svml_satanh_data_internal_avx512(%rip), %zmm7
> > -        vmovups   poly_coeff1+__svml_satanh_data_internal_avx512(%rip), %zmm12
> > -
> > -/* Km-Kp */
> > -        vsubps    {rn-sae}, %zmm15, %zmm14, %zmm1
> > -        kmovw     %k0, %edx
> > -        vmovaps   %zmm3, %zmm0
> > -        vpermi2ps %zmm13, %zmm8, %zmm3
> > -        vpermt2ps %zmm13, %zmm2, %zmm8
> > -        vpermi2ps %zmm7, %zmm6, %zmm0
> > -        vpermt2ps %zmm7, %zmm2, %zmm6
> > -        vsubps    {rn-sae}, %zmm3, %zmm8, %zmm5
> > -
> > -/* K*L2H + Th */
> > -        vmovups   L2H+__svml_satanh_data_internal_avx512(%rip), %zmm2
> > -
> > -/* K*L2L + Tl */
> > -        vmovups   L2L+__svml_satanh_data_internal_avx512(%rip), %zmm3
> > -
> > -/* polynomials */
> > -        vmovups   poly_coeff3+__svml_satanh_data_internal_avx512(%rip), %zmm7
> > -        vmovups   poly_coeff0+__svml_satanh_data_internal_avx512(%rip), %zmm13
> > -
> > -/* table values */
> > -        vsubps    {rn-sae}, %zmm0, %zmm6, %zmm0
> > -        vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm0
> > -        vfmadd213ps {rn-sae}, %zmm5, %zmm3, %zmm1
> > -        vmovups   poly_coeff2+__svml_satanh_data_internal_avx512(%rip), %zmm3
> > -        vmovaps   %zmm3, %zmm2
> > -        vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm2
> > -        vfmadd231ps {rn-sae}, %zmm4, %zmm7, %zmm3
> > -        vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm2
> > -        vfmadd213ps {rn-sae}, %zmm12, %zmm4, %zmm3
> > -        vfmadd213ps {rn-sae}, %zmm13, %zmm9, %zmm2
> > -        vfmadd213ps {rn-sae}, %zmm13, %zmm4, %zmm3
> > -
> > -/* (K*L2L + Tl) + Rp*PolyP */
> > -        vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm2
> > -        vorps     Half+__svml_satanh_data_internal_avx512(%rip), %zmm10, %zmm9
> > -
> > -/* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM */
> > -        vfnmadd213ps {rn-sae}, %zmm2, %zmm4, %zmm3
> > -        vaddps    {rn-sae}, %zmm3, %zmm0, %zmm4
> > -        vmulps    {rn-sae}, %zmm9, %zmm4, %zmm0
> > -        testl     %edx, %edx
> > -
> > -/* Go to special inputs processing branch */
> > -        jne       L(SPECIAL_VALUES_BRANCH)
> > -                                # LOE rbx r12 r13 r14 r15 edx zmm0 zmm11
> > -
> > -/* Restore registers
> > - * and exit the function
> > - */
> > +       pushq   %rbp
> > +       cfi_def_cfa_offset (16)
> > +       movq    %rsp, %rbp
> > +       cfi_def_cfa (6, 16)
> > +       cfi_offset (6, -16)
> > +       andq    $-64, %rsp
> > +       subq    $192, %rsp
> > +       vmovups One + __svml_satanh_data_internal_avx512(%rip), %zmm4
> > +
> > +       /* round reciprocals to 1+5b mantissas.  */
> > +       vmovups AddB5 + __svml_satanh_data_internal_avx512(%rip), %zmm14
> > +       vmovups RcpBitMask + __svml_satanh_data_internal_avx512(%rip), %zmm1
> > +       vmovaps %zmm0, %zmm11
> > +       vandps  AbsMask + __svml_satanh_data_internal_avx512(%rip), %zmm11, %zmm6
> > +
> > +       /* 1+y.  */
> > +       vaddps  {rn-sae}, %zmm4, %zmm6, %zmm9
> > +
> > +       /* 1-y.  */
> > +       vsubps  {rn-sae}, %zmm6, %zmm4, %zmm8
> > +       vxorps  %zmm6, %zmm11, %zmm10
> > +
> > +       /* Yp_high.  */
> > +       vsubps  {rn-sae}, %zmm4, %zmm9, %zmm2
> > +
> > +       /* -Ym_high.  */
> > +       vsubps  {rn-sae}, %zmm4, %zmm8, %zmm5
> > +
> > +       /* RcpP ~ 1/Yp.  */
> > +       vrcp14ps %zmm9, %zmm12
> > +
> > +       /* RcpM ~ 1/Ym.  */
> > +       vrcp14ps %zmm8, %zmm13
> > +
> > +       /* input outside (-1, 1) ?.  */
> > +       vcmpps  $21, {sae}, %zmm4, %zmm6, %k0
> > +       vpaddd  %zmm14, %zmm12, %zmm15
> > +       vpaddd  %zmm14, %zmm13, %zmm0
> > +
> > +       /* Yp_low.  */
> > +       vsubps  {rn-sae}, %zmm2, %zmm6, %zmm3
> > +       vandps  %zmm1, %zmm15, %zmm7
> > +       vandps  %zmm1, %zmm0, %zmm12
> > +
> > +       /* Ym_low.  */
> > +       vaddps  {rn-sae}, %zmm5, %zmm6, %zmm5
> > +
> > +       /* Reduced argument: Rp = (RcpP*Yp - 1)+RcpP*Yp_low.  */
> > +       vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9
> > +
> > +       /* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low.  */
> > +       vfmsub231ps {rn-sae}, %zmm12, %zmm8, %zmm4
> > +       vmovups Log_tbl_L + __svml_satanh_data_internal_avx512(%rip), %zmm8
> > +       vmovups Log_tbl_L + 64 + __svml_satanh_data_internal_avx512(%rip), %zmm13
> > +
> > +       /* exponents.  */
> > +       vgetexpps {sae}, %zmm7, %zmm15
> > +       vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9
> > +
> > +       /* Table lookups.  */
> > +       vmovups __svml_satanh_data_internal_avx512(%rip), %zmm6
> > +       vgetexpps {sae}, %zmm12, %zmm14
> > +       vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm4
> > +
> > +       /* Prepare table index.  */
> > +       vpsrld  $18, %zmm7, %zmm3
> > +       vpsrld  $18, %zmm12, %zmm2
> > +       vmovups Log_tbl_H + 64 + __svml_satanh_data_internal_avx512(%rip), %zmm7
> > +       vmovups poly_coeff1 + __svml_satanh_data_internal_avx512(%rip), %zmm12
> > +
> > +       /* Km-Kp.  */
> > +       vsubps  {rn-sae}, %zmm15, %zmm14, %zmm1
> > +       kmovw   %k0, %edx
> > +       vmovaps %zmm3, %zmm0
> > +       vpermi2ps %zmm13, %zmm8, %zmm3
> > +       vpermt2ps %zmm13, %zmm2, %zmm8
> > +       vpermi2ps %zmm7, %zmm6, %zmm0
> > +       vpermt2ps %zmm7, %zmm2, %zmm6
> > +       vsubps  {rn-sae}, %zmm3, %zmm8, %zmm5
> > +
> > +       /* K*L2H + Th.  */
> > +       vmovups L2H + __svml_satanh_data_internal_avx512(%rip), %zmm2
> > +
> > +       /* K*L2L + Tl.  */
> > +       vmovups L2L + __svml_satanh_data_internal_avx512(%rip), %zmm3
> > +
> > +       /* polynomials.  */
> > +       vmovups poly_coeff3 + __svml_satanh_data_internal_avx512(%rip), %zmm7
> > +       vmovups poly_coeff0 + __svml_satanh_data_internal_avx512(%rip), %zmm13
> > +
> > +       /* table values.  */
> > +       vsubps  {rn-sae}, %zmm0, %zmm6, %zmm0
> > +       vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm0
> > +       vfmadd213ps {rn-sae}, %zmm5, %zmm3, %zmm1
> > +       vmovups poly_coeff2 + __svml_satanh_data_internal_avx512(%rip), %zmm3
> > +       vmovaps %zmm3, %zmm2
> > +       vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm2
> > +       vfmadd231ps {rn-sae}, %zmm4, %zmm7, %zmm3
> > +       vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm2
> > +       vfmadd213ps {rn-sae}, %zmm12, %zmm4, %zmm3
> > +       vfmadd213ps {rn-sae}, %zmm13, %zmm9, %zmm2
> > +       vfmadd213ps {rn-sae}, %zmm13, %zmm4, %zmm3
> > +
> > +       /* (K*L2L + Tl) + Rp*PolyP.  */
> > +       vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm2
> > +       vorps   Half + __svml_satanh_data_internal_avx512(%rip), %zmm10, %zmm9
> > +
> > +       /* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM.  */
> > +       vfnmadd213ps {rn-sae}, %zmm2, %zmm4, %zmm3
> > +       vaddps  {rn-sae}, %zmm3, %zmm0, %zmm4
> > +       vmulps  {rn-sae}, %zmm9, %zmm4, %zmm0
> > +       testl   %edx, %edx
> > +
> > +       /* Go to special inputs processing branch.  */
> > +       jne     L(SPECIAL_VALUES_BRANCH)
> > +
> > +       /* Restore registers * and exit the function.  */
> >
> >  L(EXIT):
> > -        movq      %rbp, %rsp
> > -        popq      %rbp
> > -        cfi_def_cfa(7, 8)
> > -        cfi_restore(6)
> > -        ret
> > -        cfi_def_cfa(6, 16)
> > -        cfi_offset(6, -16)
> > -
> > -/* Branch to process
> > - * special inputs
> > - */
> > +       movq    %rbp, %rsp
> > +       popq    %rbp
> > +       cfi_def_cfa (7, 8)
> > +       cfi_restore (6)
> > +       ret
> > +       cfi_def_cfa (6, 16)
> > +       cfi_offset (6, -16)
> > +
> > +       /* Branch to process special inputs.  */
> >
> >  L(SPECIAL_VALUES_BRANCH):
> > -        vmovups   %zmm11, 64(%rsp)
> > -        vmovups   %zmm0, 128(%rsp)
> > -                                # LOE rbx r12 r13 r14 r15 edx zmm0
> > -
> > -        xorl      %eax, %eax
> > -                                # LOE rbx r12 r13 r14 r15 eax edx
> > -
> > -        vzeroupper
> > -        movq      %r12, 16(%rsp)
> > -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> > -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> > -        movl      %eax, %r12d
> > -        movq      %r13, 8(%rsp)
> > -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> > -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> > -        movl      %edx, %r13d
> > -        movq      %r14, (%rsp)
> > -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> > -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> > -                                # LOE rbx r15 r12d r13d
> > -
> > -/* Range mask
> > - * bits check
> > - */
> > +       vmovups %zmm11, 64(%rsp)
> > +       vmovups %zmm0, 128(%rsp)
> > +
> > +       xorl    %eax, %eax
> > +
> > +       vzeroupper
> > +       movq    %r12, 16(%rsp)
> > +       /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s:
> > +          -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus).  */
> > +       .cfi_escape 0x10 , 0x0c , 0x0e , 0x38 , 0x1c , 0x0d , 0xc0 , 0xff , 0xff , 0xff , 0x1a , 0x0d , 0x50 , 0xff , 0xff , 0xff , 0x22
> > +       movl    %eax, %r12d
> > +       movq    %r13, 8(%rsp)
> > +       /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s:
> > +          -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus).  */
> > +       .cfi_escape 0x10 , 0x0d , 0x0e , 0x38 , 0x1c , 0x0d , 0xc0 , 0xff , 0xff , 0xff , 0x1a , 0x0d , 0x48 , 0xff , 0xff , 0xff , 0x22
> > +       movl    %edx, %r13d
> > +       movq    %r14, (%rsp)
> > +       /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s:
> > +          -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus).  */
> > +       .cfi_escape 0x10 , 0x0e , 0x0e , 0x38 , 0x1c , 0x0d , 0xc0 , 0xff , 0xff , 0xff , 0x1a , 0x0d , 0x40 , 0xff , 0xff , 0xff , 0x22
> > +
> > +       /* Range mask * bits check.  */
> >
> >  L(RANGEMASK_CHECK):
> > -        btl       %r12d, %r13d
> > +       btl     %r12d, %r13d
> >
> > -/* Call scalar math function */
> > -        jc        L(SCALAR_MATH_CALL)
> > -                                # LOE rbx r15 r12d r13d
> > +       /* Call scalar math function.  */
> > +       jc      L(SCALAR_MATH_CALL)
> >
> > -/* Special inputs
> > - * processing loop
> > - */
> > +       /* Special inputs processing loop.  */
> >
> >  L(SPECIAL_VALUES_LOOP):
> > -        incl      %r12d
> > -        cmpl      $16, %r12d
> > -
> > -/* Check bits in range mask */
> > -        jl        L(RANGEMASK_CHECK)
> > -                                # LOE rbx r15 r12d r13d
> > -
> > -        movq      16(%rsp), %r12
> > -        cfi_restore(12)
> > -        movq      8(%rsp), %r13
> > -        cfi_restore(13)
> > -        movq      (%rsp), %r14
> > -        cfi_restore(14)
> > -        vmovups   128(%rsp), %zmm0
> > -
> > -/* Go to exit */
> > -        jmp       L(EXIT)
> > -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> > -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> > -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> > -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> > -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> > -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> > -                                # LOE rbx r12 r13 r14 r15 zmm0
> > -
> > -/* Scalar math fucntion call
> > - * to process special input
> > - */
> > +       incl    %r12d
> > +       cmpl    $16, %r12d
> > +
> > +       /* Check bits in range mask.  */
> > +       jl      L(RANGEMASK_CHECK)
> > +
> > +       movq    16(%rsp), %r12
> > +       cfi_restore (12)
> > +       movq    8(%rsp), %r13
> > +       cfi_restore (13)
> > +       movq    (%rsp), %r14
> > +       cfi_restore (14)
> > +       vmovups 128(%rsp), %zmm0
> > +
> > +       /* Go to exit.  */
> > +       jmp     L(EXIT)
> > +       /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s:
> > +          -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus).  */
> > +       .cfi_escape 0x10 , 0x0c , 0x0e , 0x38 , 0x1c , 0x0d , 0xc0 , 0xff , 0xff , 0xff , 0x1a , 0x0d , 0x50 , 0xff , 0xff , 0xff , 0x22
> > +       /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s:
> > +          -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus).  */
> > +       .cfi_escape 0x10 , 0x0d , 0x0e , 0x38 , 0x1c , 0x0d , 0xc0 , 0xff , 0xff , 0xff , 0x1a , 0x0d , 0x48 , 0xff , 0xff , 0xff , 0x22
> > +       /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s:
> > +          -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus).  */
> > +       .cfi_escape 0x10 , 0x0e , 0x0e , 0x38 , 0x1c , 0x0d , 0xc0 , 0xff , 0xff , 0xff , 0x1a , 0x0d , 0x40 , 0xff , 0xff , 0xff , 0x22
> > +
> > +       /* Scalar math fucntion call to process special input.  */
> >
> >  L(SCALAR_MATH_CALL):
> > -        movl      %r12d, %r14d
> > -        movss     64(%rsp,%r14,4), %xmm0
> > -        call      atanhf@PLT
> > -                                # LOE rbx r14 r15 r12d r13d xmm0
> > +       movl    %r12d, %r14d
> > +       movss   64(%rsp, %r14, 4), %xmm0
> > +       call    atanhf@PLT
> >
> > -        movss     %xmm0, 128(%rsp,%r14,4)
> > +       movss   %xmm0, 128(%rsp, %r14, 4)
> >
> > -/* Process special inputs in loop */
> > -        jmp       L(SPECIAL_VALUES_LOOP)
> > -                                # LOE rbx r15 r12d r13d
> > +       /* Process special inputs in loop.  */
> > +       jmp     L(SPECIAL_VALUES_LOOP)
> >  END(_ZGVeN16v_atanhf_skx)
> >
> > -        .section .rodata, "a"
> > -        .align 64
> > +       .section .rodata, "a"
> > +       .align  64
> >
> >  #ifdef __svml_satanh_data_internal_avx512_typedef
> > -typedef unsigned int VUINT32;
> > -typedef struct {
> > -        __declspec(align(64)) VUINT32 Log_tbl_H[32][1];
> > -        __declspec(align(64)) VUINT32 Log_tbl_L[32][1];
> > -        __declspec(align(64)) VUINT32 One[16][1];
> > -        __declspec(align(64)) VUINT32 AbsMask[16][1];
> > -        __declspec(align(64)) VUINT32 AddB5[16][1];
> > -        __declspec(align(64)) VUINT32 RcpBitMask[16][1];
> > -        __declspec(align(64)) VUINT32 poly_coeff3[16][1];
> > -        __declspec(align(64)) VUINT32 poly_coeff2[16][1];
> > -        __declspec(align(64)) VUINT32 poly_coeff1[16][1];
> > -        __declspec(align(64)) VUINT32 poly_coeff0[16][1];
> > -        __declspec(align(64)) VUINT32 Half[16][1];
> > -        __declspec(align(64)) VUINT32 L2H[16][1];
> > -        __declspec(align(64)) VUINT32 L2L[16][1];
> > -    } __svml_satanh_data_internal_avx512;
> > +       typedef unsigned int VUINT32;
> > +       typedef struct{
> > +       __declspec (align(64))VUINT32 Log_tbl_H[32][1];
> > +       __declspec (align(64))VUINT32 Log_tbl_L[32][1];
> > +       __declspec (align(64))VUINT32 One[16][1];
> > +       __declspec (align(64))VUINT32 AbsMask[16][1];
> > +       __declspec (align(64))VUINT32 AddB5[16][1];
> > +       __declspec (align(64))VUINT32 RcpBitMask[16][1];
> > +       __declspec (align(64))VUINT32 poly_coeff3[16][1];
> > +       __declspec (align(64))VUINT32 poly_coeff2[16][1];
> > +       __declspec (align(64))VUINT32 poly_coeff1[16][1];
> > +       __declspec (align(64))VUINT32 poly_coeff0[16][1];
> > +       __declspec (align(64))VUINT32 Half[16][1];
> > +       __declspec (align(64))VUINT32 L2H[16][1];
> > +       __declspec (align(64))VUINT32 L2L[16][1];
> > +       }__svml_satanh_data_internal_avx512;
> >  #endif
> >  __svml_satanh_data_internal_avx512:
> > -        /*== Log_tbl_H ==*/
> > -        .long 0x00000000
> > -        .long 0x3cfc0000
> > -        .long 0x3d780000
> > -        .long 0x3db78000
> > -        .long 0x3df10000
> > -        .long 0x3e14c000
> > -        .long 0x3e300000
> > -        .long 0x3e4a8000
> > -        .long 0x3e648000
> > -        .long 0x3e7dc000
> > -        .long 0x3e8b4000
> > -        .long 0x3e974000
> > -        .long 0x3ea30000
> > -        .long 0x3eae8000
> > -        .long 0x3eb9c000
> > -        .long 0x3ec4e000
> > -        .long 0x3ecfa000
> > -        .long 0x3eda2000
> > -        .long 0x3ee48000
> > -        .long 0x3eeea000
> > -        .long 0x3ef8a000
> > -        .long 0x3f013000
> > -        .long 0x3f05f000
> > -        .long 0x3f0aa000
> > -        .long 0x3f0f4000
> > -        .long 0x3f13d000
> > -        .long 0x3f184000
> > -        .long 0x3f1ca000
> > -        .long 0x3f20f000
> > -        .long 0x3f252000
> > -        .long 0x3f295000
> > -        .long 0x3f2d7000
> > -        /*== Log_tbl_L ==*/
> > -        .align 64
> > -        .long 0x00000000
> > -        .long 0x3726c39e
> > -        .long 0x38a30c01
> > -        .long 0x37528ae5
> > -        .long 0x38e0edc5
> > -        .long 0xb8ab41f8
> > -        .long 0xb7cf8f58
> > -        .long 0x3896a73d
> > -        .long 0xb5838656
> > -        .long 0x380c36af
> > -        .long 0xb8235454
> > -        .long 0x3862bae1
> > -        .long 0x38c5e10e
> > -        .long 0x38dedfac
> > -        .long 0x38ebfb5e
> > -        .long 0xb8e63c9f
> > -        .long 0xb85c1340
> > -        .long 0x38777bcd
> > -        .long 0xb6038656
> > -        .long 0x37d40984
> > -        .long 0xb8b85028
> > -        .long 0xb8ad5a5a
> > -        .long 0x3865c84a
> > -        .long 0x38c3d2f5
> > -        .long 0x383ebce1
> > -        .long 0xb8a1ed76
> > -        .long 0xb7a332c4
> > -        .long 0xb779654f
> > -        .long 0xb8602f73
> > -        .long 0x38f85db0
> > -        .long 0x37b4996f
> > -        .long 0xb8bfb3ca
> > -        /*== One ==*/
> > -        .align 64
> > -        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> > -        /*== AbsMask ==*/
> > -        .align 64
> > -        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> > -        /*== AddB5 ==*/
> > -        .align 64
> > -        .long 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000
> > -        /*== RcpBitMask ==*/
> > -        .align 64
> > -        .long 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> > -        /*== poly_coeff3 ==*/
> > -        .align 64
> > -        .long 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> > -        /*== poly_coeff2 ==*/
> > -        .align 64
> > -        .long 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> > -        /*== poly_coeff1 ==*/
> > -        .align 64
> > -        .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> > -        /*== poly_coeff0 ==*/
> > -        .align 64
> > -        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> > -        /*== Half ==*/
> > -        .align 64
> > -        .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
> > -        /*== L2H = log(2)_high ==*/
> > -        .align 64
> > -        .long 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> > -        /*== L2L = log(2)_low ==*/
> > -        .align 64
> > -        .long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> > -        .align 64
> > -        .type  __svml_satanh_data_internal_avx512,@object
> > -        .size  __svml_satanh_data_internal_avx512,.-__svml_satanh_data_internal_avx512
> > +       /* == Log_tbl_H ==.  */
> > +       .long   0x00000000
> > +       .long   0x3cfc0000
> > +       .long   0x3d780000
> > +       .long   0x3db78000
> > +       .long   0x3df10000
> > +       .long   0x3e14c000
> > +       .long   0x3e300000
> > +       .long   0x3e4a8000
> > +       .long   0x3e648000
> > +       .long   0x3e7dc000
> > +       .long   0x3e8b4000
> > +       .long   0x3e974000
> > +       .long   0x3ea30000
> > +       .long   0x3eae8000
> > +       .long   0x3eb9c000
> > +       .long   0x3ec4e000
> > +       .long   0x3ecfa000
> > +       .long   0x3eda2000
> > +       .long   0x3ee48000
> > +       .long   0x3eeea000
> > +       .long   0x3ef8a000
> > +       .long   0x3f013000
> > +       .long   0x3f05f000
> > +       .long   0x3f0aa000
> > +       .long   0x3f0f4000
> > +       .long   0x3f13d000
> > +       .long   0x3f184000
> > +       .long   0x3f1ca000
> > +       .long   0x3f20f000
> > +       .long   0x3f252000
> > +       .long   0x3f295000
> > +       .long   0x3f2d7000
> > +       /* == Log_tbl_L ==.  */
> > +       .align  64
> > +       .long   0x00000000
> > +       .long   0x3726c39e
> > +       .long   0x38a30c01
> > +       .long   0x37528ae5
> > +       .long   0x38e0edc5
> > +       .long   0xb8ab41f8
> > +       .long   0xb7cf8f58
> > +       .long   0x3896a73d
> > +       .long   0xb5838656
> > +       .long   0x380c36af
> > +       .long   0xb8235454
> > +       .long   0x3862bae1
> > +       .long   0x38c5e10e
> > +       .long   0x38dedfac
> > +       .long   0x38ebfb5e
> > +       .long   0xb8e63c9f
> > +       .long   0xb85c1340
> > +       .long   0x38777bcd
> > +       .long   0xb6038656
> > +       .long   0x37d40984
> > +       .long   0xb8b85028
> > +       .long   0xb8ad5a5a
> > +       .long   0x3865c84a
> > +       .long   0x38c3d2f5
> > +       .long   0x383ebce1
> > +       .long   0xb8a1ed76
> > +       .long   0xb7a332c4
> > +       .long   0xb779654f
> > +       .long   0xb8602f73
> > +       .long   0x38f85db0
> > +       .long   0x37b4996f
> > +       .long   0xb8bfb3ca
> > +       /* == One ==.  */
> > +       .align  64
> > +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> > +       /* == AbsMask ==.  */
> > +       .align  64
> > +       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> > +       /* == AddB5 ==.  */
> > +       .align  64
> > +       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000
> > +       /* == RcpBitMask ==.  */
> > +       .align  64
> > +       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> > +       /* == poly_coeff3 ==.  */
> > +       .align  64
> > +       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> > +       /* == poly_coeff2 ==.  */
> > +       .align  64
> > +       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> > +       /* == poly_coeff1 ==.  */
> > +       .align  64
> > +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> > +       /* == poly_coeff0 ==.  */
> > +       .align  64
> > +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> > +       /* == Half ==.  */
> > +       .align  64
> > +       .long   0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
> > +       /* == L2H = log(2)_high ==.  */
> > +       .align  64
> > +       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> > +       /* == L2L = log(2)_low ==.  */
> > +       .align  64
> > +       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> > +       .align  64
> > +       .type   __svml_satanh_data_internal_avx512, @object
> > +       .size   __svml_satanh_data_internal_avx512, .-__svml_satanh_data_internal_avx512
> > --
> > 2.25.1
> >
Sunil Pandey Feb. 7, 2022, 1:20 p.m. UTC | #3
Can you please also include

Nested #ifdef example.
Multiline comment example.
Single line comment starts with #.

Thank you so much.
Sunil

On Fri, Feb 4, 2022 at 11:46 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Sat, Feb 5, 2022 at 2:01 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >
> > Hi Noah,
> >
> > Since this patch is about glibc assembly language formatting, it may be
> > helpful to everyone, if there is a section for glibc assembly coding
> > in manual.
> >
> > https://sourceware.org/glibc/wiki/Style_and_Conventions
> >
> > You may also consider putting your tool in glibc code base, so that people
> > can easily find/use it to check/format their assembly code.
>
> Like the idea. How does the following sound:
>
> == x86 GAS (Assembly) ==
>
> The following are stylistic conventions for x86 GAS
> contributions. Other style conventions that are relevant to x86 GAS
> syntax are also included (i.e column limit, indenting preprocessor
> directives, etc..).
>
> 1.  Instructions should be proceeded by a tab.
> 2.  Instruction less than 8 characters in length should have a tab
>     between it and the first operand.
> 3.  Instruction greater than 7 characters in length should have a
>     space between it and the first operand.
> 4.  There should be a space after the comma seperating operands.
>     For example for rules 1, 2, 3, and 4:
>     ```
>     /* <tab><short instruction><tab><op0><comma><space><op1>.  */
>     addl $1, %eax
>     /* <tab><long instruction><space><op0><comma><space><op1>.  */
>     vpmovmaskb %ymm0, %eax
>     ```
>
> 5.  Comments should be indented with code:
>     For example for rule 5:
>     ```
>     /* Function XYZ returns.  */
>     ENTRY(XYZ)
>     /* Return from XYZ.  */
>     ret
>     END(XYZ)
>     ```
>
> 6.  Tab after `#define`d names and their value.
>     For example:
>     ```
>     /* No value for 'HELLO' so no tab afterwards.  */
>     #define HELLO
>
>     /* Value for 'WORLD' so tab separated 'WORLD' from '10'.  */
>     #define WORLD 10
>     ```
>
>
> >
> > https://github.com/goldsteinn/assembly-beautifier
>
> Thats not an official tool. Its code quality is not really up to standard
> and I can't really commit to maintaining it.
>
> If its improved to a state where its good enough for GLIBC I'll
> post a patch.
> >
> > Thank,
> > Sunil
> >
> > On Thu, Feb 3, 2022 at 2:00 PM Noah Goldstein via Libc-alpha
> > <libc-alpha@sourceware.org> wrote:
> > >
> > > Reformats to match style of other hand coded assembly files.
> > >
> > > The changes are:
> > >     1. Replace 8x space with tab before instructions.
> > >     2. After instruction len < 8 use tab.
> > >     3. After instruction len >= 8 use space.
> > >     4. 1 Space after comma between instruction operands.
> > >     5. Indent comments similiar with code.
> > >     6. Make comments complete sentences.
> > >     7. Tab after '#define'
> > >     8. Spaces at '#' representing the depth.
> > >
> > > The final executable is unchanged by this commit.
> > > ---
> > > The changes for this patch where made with the following
> > > script: https://github.com/goldsteinn/assembly-beautifier
> > >
> > > The goal of this patch is just to reformat the code
> > > so it is more human friendly and try create a style
> > > that future patches will be based on.
> > >
> > > If this patch is accepted ensuing patches to
> > > optimize the performance of svml_s_atanhf16_core_avx512.S
> > > will be based on this patch.
> > >
> > >  .../multiarch/svml_s_atanhf16_core_avx512.S   | 655 +++++++++---------
> > >  1 file changed, 321 insertions(+), 334 deletions(-)
> > >
> > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> > > index f863f4f959..ed90a427a6 100644
> > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> > > @@ -33,361 +33,348 @@
> > >
> > >  /* Offsets for data table __svml_satanh_data_internal_avx512
> > >   */
> > > -#define Log_tbl_H                      0
> > > -#define Log_tbl_L                      128
> > > -#define One                            256
> > > -#define AbsMask                        320
> > > -#define AddB5                          384
> > > -#define RcpBitMask                     448
> > > -#define poly_coeff3                    512
> > > -#define poly_coeff2                    576
> > > -#define poly_coeff1                    640
> > > -#define poly_coeff0                    704
> > > -#define Half                           768
> > > -#define L2H                            832
> > > -#define L2L                            896
> > > +#define Log_tbl_H      0
> > > +#define Log_tbl_L      128
> > > +#define One    256
> > > +#define AbsMask        320
> > > +#define AddB5  384
> > > +#define RcpBitMask     448
> > > +#define poly_coeff3    512
> > > +#define poly_coeff2    576
> > > +#define poly_coeff1    640
> > > +#define poly_coeff0    704
> > > +#define Half   768
> > > +#define L2H    832
> > > +#define L2L    896
> > >
> > >  #include <sysdep.h>
> > >
> > > -        .text
> > > -       .section .text.exex512,"ax",@progbits
> > > +       .text
> > > +       .section .text.exex512, "ax", @progbits
> > >  ENTRY(_ZGVeN16v_atanhf_skx)
> > > -        pushq     %rbp
> > > -        cfi_def_cfa_offset(16)
> > > -        movq      %rsp, %rbp
> > > -        cfi_def_cfa(6, 16)
> > > -        cfi_offset(6, -16)
> > > -        andq      $-64, %rsp
> > > -        subq      $192, %rsp
> > > -        vmovups   One+__svml_satanh_data_internal_avx512(%rip), %zmm4
> > > -
> > > -/* round reciprocals to 1+5b mantissas */
> > > -        vmovups   AddB5+__svml_satanh_data_internal_avx512(%rip), %zmm14
> > > -        vmovups   RcpBitMask+__svml_satanh_data_internal_avx512(%rip), %zmm1
> > > -        vmovaps   %zmm0, %zmm11
> > > -        vandps    AbsMask+__svml_satanh_data_internal_avx512(%rip), %zmm11, %zmm6
> > > -
> > > -/* 1+y */
> > > -        vaddps    {rn-sae}, %zmm4, %zmm6, %zmm9
> > > -
> > > -/* 1-y */
> > > -        vsubps    {rn-sae}, %zmm6, %zmm4, %zmm8
> > > -        vxorps    %zmm6, %zmm11, %zmm10
> > > -
> > > -/* Yp_high */
> > > -        vsubps    {rn-sae}, %zmm4, %zmm9, %zmm2
> > > -
> > > -/* -Ym_high */
> > > -        vsubps    {rn-sae}, %zmm4, %zmm8, %zmm5
> > > -
> > > -/* RcpP ~ 1/Yp */
> > > -        vrcp14ps  %zmm9, %zmm12
> > > -
> > > -/* RcpM ~ 1/Ym */
> > > -        vrcp14ps  %zmm8, %zmm13
> > > -
> > > -/* input outside (-1, 1) ? */
> > > -        vcmpps    $21, {sae}, %zmm4, %zmm6, %k0
> > > -        vpaddd    %zmm14, %zmm12, %zmm15
> > > -        vpaddd    %zmm14, %zmm13, %zmm0
> > > -
> > > -/* Yp_low */
> > > -        vsubps    {rn-sae}, %zmm2, %zmm6, %zmm3
> > > -        vandps    %zmm1, %zmm15, %zmm7
> > > -        vandps    %zmm1, %zmm0, %zmm12
> > > -
> > > -/* Ym_low */
> > > -        vaddps    {rn-sae}, %zmm5, %zmm6, %zmm5
> > > -
> > > -/* Reduced argument: Rp = (RcpP*Yp - 1)+RcpP*Yp_low */
> > > -        vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9
> > > -
> > > -/* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low */
> > > -        vfmsub231ps {rn-sae}, %zmm12, %zmm8, %zmm4
> > > -        vmovups   Log_tbl_L+__svml_satanh_data_internal_avx512(%rip), %zmm8
> > > -        vmovups   Log_tbl_L+64+__svml_satanh_data_internal_avx512(%rip), %zmm13
> > > -
> > > -/* exponents */
> > > -        vgetexpps {sae}, %zmm7, %zmm15
> > > -        vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9
> > > -
> > > -/* Table lookups */
> > > -        vmovups   __svml_satanh_data_internal_avx512(%rip), %zmm6
> > > -        vgetexpps {sae}, %zmm12, %zmm14
> > > -        vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm4
> > > -
> > > -/* Prepare table index */
> > > -        vpsrld    $18, %zmm7, %zmm3
> > > -        vpsrld    $18, %zmm12, %zmm2
> > > -        vmovups   Log_tbl_H+64+__svml_satanh_data_internal_avx512(%rip), %zmm7
> > > -        vmovups   poly_coeff1+__svml_satanh_data_internal_avx512(%rip), %zmm12
> > > -
> > > -/* Km-Kp */
> > > -        vsubps    {rn-sae}, %zmm15, %zmm14, %zmm1
> > > -        kmovw     %k0, %edx
> > > -        vmovaps   %zmm3, %zmm0
> > > -        vpermi2ps %zmm13, %zmm8, %zmm3
> > > -        vpermt2ps %zmm13, %zmm2, %zmm8
> > > -        vpermi2ps %zmm7, %zmm6, %zmm0
> > > -        vpermt2ps %zmm7, %zmm2, %zmm6
> > > -        vsubps    {rn-sae}, %zmm3, %zmm8, %zmm5
> > > -
> > > -/* K*L2H + Th */
> > > -        vmovups   L2H+__svml_satanh_data_internal_avx512(%rip), %zmm2
> > > -
> > > -/* K*L2L + Tl */
> > > -        vmovups   L2L+__svml_satanh_data_internal_avx512(%rip), %zmm3
> > > -
> > > -/* polynomials */
> > > -        vmovups   poly_coeff3+__svml_satanh_data_internal_avx512(%rip), %zmm7
> > > -        vmovups   poly_coeff0+__svml_satanh_data_internal_avx512(%rip), %zmm13
> > > -
> > > -/* table values */
> > > -        vsubps    {rn-sae}, %zmm0, %zmm6, %zmm0
> > > -        vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm0
> > > -        vfmadd213ps {rn-sae}, %zmm5, %zmm3, %zmm1
> > > -        vmovups   poly_coeff2+__svml_satanh_data_internal_avx512(%rip), %zmm3
> > > -        vmovaps   %zmm3, %zmm2
> > > -        vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm2
> > > -        vfmadd231ps {rn-sae}, %zmm4, %zmm7, %zmm3
> > > -        vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm2
> > > -        vfmadd213ps {rn-sae}, %zmm12, %zmm4, %zmm3
> > > -        vfmadd213ps {rn-sae}, %zmm13, %zmm9, %zmm2
> > > -        vfmadd213ps {rn-sae}, %zmm13, %zmm4, %zmm3
> > > -
> > > -/* (K*L2L + Tl) + Rp*PolyP */
> > > -        vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm2
> > > -        vorps     Half+__svml_satanh_data_internal_avx512(%rip), %zmm10, %zmm9
> > > -
> > > -/* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM */
> > > -        vfnmadd213ps {rn-sae}, %zmm2, %zmm4, %zmm3
> > > -        vaddps    {rn-sae}, %zmm3, %zmm0, %zmm4
> > > -        vmulps    {rn-sae}, %zmm9, %zmm4, %zmm0
> > > -        testl     %edx, %edx
> > > -
> > > -/* Go to special inputs processing branch */
> > > -        jne       L(SPECIAL_VALUES_BRANCH)
> > > -                                # LOE rbx r12 r13 r14 r15 edx zmm0 zmm11
> > > -
> > > -/* Restore registers
> > > - * and exit the function
> > > - */
> > > +       pushq   %rbp
> > > +       cfi_def_cfa_offset (16)
> > > +       movq    %rsp, %rbp
> > > +       cfi_def_cfa (6, 16)
> > > +       cfi_offset (6, -16)
> > > +       andq    $-64, %rsp
> > > +       subq    $192, %rsp
> > > +       vmovups One + __svml_satanh_data_internal_avx512(%rip), %zmm4
> > > +
> > > +       /* round reciprocals to 1+5b mantissas.  */
> > > +       vmovups AddB5 + __svml_satanh_data_internal_avx512(%rip), %zmm14
> > > +       vmovups RcpBitMask + __svml_satanh_data_internal_avx512(%rip), %zmm1
> > > +       vmovaps %zmm0, %zmm11
> > > +       vandps  AbsMask + __svml_satanh_data_internal_avx512(%rip), %zmm11, %zmm6
> > > +
> > > +       /* 1+y.  */
> > > +       vaddps  {rn-sae}, %zmm4, %zmm6, %zmm9
> > > +
> > > +       /* 1-y.  */
> > > +       vsubps  {rn-sae}, %zmm6, %zmm4, %zmm8
> > > +       vxorps  %zmm6, %zmm11, %zmm10
> > > +
> > > +       /* Yp_high.  */
> > > +       vsubps  {rn-sae}, %zmm4, %zmm9, %zmm2
> > > +
> > > +       /* -Ym_high.  */
> > > +       vsubps  {rn-sae}, %zmm4, %zmm8, %zmm5
> > > +
> > > +       /* RcpP ~ 1/Yp.  */
> > > +       vrcp14ps %zmm9, %zmm12
> > > +
> > > +       /* RcpM ~ 1/Ym.  */
> > > +       vrcp14ps %zmm8, %zmm13
> > > +
> > > +       /* input outside (-1, 1) ?.  */
> > > +       vcmpps  $21, {sae}, %zmm4, %zmm6, %k0
> > > +       vpaddd  %zmm14, %zmm12, %zmm15
> > > +       vpaddd  %zmm14, %zmm13, %zmm0
> > > +
> > > +       /* Yp_low.  */
> > > +       vsubps  {rn-sae}, %zmm2, %zmm6, %zmm3
> > > +       vandps  %zmm1, %zmm15, %zmm7
> > > +       vandps  %zmm1, %zmm0, %zmm12
> > > +
> > > +       /* Ym_low.  */
> > > +       vaddps  {rn-sae}, %zmm5, %zmm6, %zmm5
> > > +
> > > +       /* Reduced argument: Rp = (RcpP*Yp - 1)+RcpP*Yp_low.  */
> > > +       vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9
> > > +
> > > +       /* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low.  */
> > > +       vfmsub231ps {rn-sae}, %zmm12, %zmm8, %zmm4
> > > +       vmovups Log_tbl_L + __svml_satanh_data_internal_avx512(%rip), %zmm8
> > > +       vmovups Log_tbl_L + 64 + __svml_satanh_data_internal_avx512(%rip), %zmm13
> > > +
> > > +       /* exponents.  */
> > > +       vgetexpps {sae}, %zmm7, %zmm15
> > > +       vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9
> > > +
> > > +       /* Table lookups.  */
> > > +       vmovups __svml_satanh_data_internal_avx512(%rip), %zmm6
> > > +       vgetexpps {sae}, %zmm12, %zmm14
> > > +       vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm4
> > > +
> > > +       /* Prepare table index.  */
> > > +       vpsrld  $18, %zmm7, %zmm3
> > > +       vpsrld  $18, %zmm12, %zmm2
> > > +       vmovups Log_tbl_H + 64 + __svml_satanh_data_internal_avx512(%rip), %zmm7
> > > +       vmovups poly_coeff1 + __svml_satanh_data_internal_avx512(%rip), %zmm12
> > > +
> > > +       /* Km-Kp.  */
> > > +       vsubps  {rn-sae}, %zmm15, %zmm14, %zmm1
> > > +       kmovw   %k0, %edx
> > > +       vmovaps %zmm3, %zmm0
> > > +       vpermi2ps %zmm13, %zmm8, %zmm3
> > > +       vpermt2ps %zmm13, %zmm2, %zmm8
> > > +       vpermi2ps %zmm7, %zmm6, %zmm0
> > > +       vpermt2ps %zmm7, %zmm2, %zmm6
> > > +       vsubps  {rn-sae}, %zmm3, %zmm8, %zmm5
> > > +
> > > +       /* K*L2H + Th.  */
> > > +       vmovups L2H + __svml_satanh_data_internal_avx512(%rip), %zmm2
> > > +
> > > +       /* K*L2L + Tl.  */
> > > +       vmovups L2L + __svml_satanh_data_internal_avx512(%rip), %zmm3
> > > +
> > > +       /* polynomials.  */
> > > +       vmovups poly_coeff3 + __svml_satanh_data_internal_avx512(%rip), %zmm7
> > > +       vmovups poly_coeff0 + __svml_satanh_data_internal_avx512(%rip), %zmm13
> > > +
> > > +       /* table values.  */
> > > +       vsubps  {rn-sae}, %zmm0, %zmm6, %zmm0
> > > +       vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm0
> > > +       vfmadd213ps {rn-sae}, %zmm5, %zmm3, %zmm1
> > > +       vmovups poly_coeff2 + __svml_satanh_data_internal_avx512(%rip), %zmm3
> > > +       vmovaps %zmm3, %zmm2
> > > +       vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm2
> > > +       vfmadd231ps {rn-sae}, %zmm4, %zmm7, %zmm3
> > > +       vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm2
> > > +       vfmadd213ps {rn-sae}, %zmm12, %zmm4, %zmm3
> > > +       vfmadd213ps {rn-sae}, %zmm13, %zmm9, %zmm2
> > > +       vfmadd213ps {rn-sae}, %zmm13, %zmm4, %zmm3
> > > +
> > > +       /* (K*L2L + Tl) + Rp*PolyP.  */
> > > +       vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm2
> > > +       vorps   Half + __svml_satanh_data_internal_avx512(%rip), %zmm10, %zmm9
> > > +
> > > +       /* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM.  */
> > > +       vfnmadd213ps {rn-sae}, %zmm2, %zmm4, %zmm3
> > > +       vaddps  {rn-sae}, %zmm3, %zmm0, %zmm4
> > > +       vmulps  {rn-sae}, %zmm9, %zmm4, %zmm0
> > > +       testl   %edx, %edx
> > > +
> > > +       /* Go to special inputs processing branch.  */
> > > +       jne     L(SPECIAL_VALUES_BRANCH)
> > > +
> > > +       /* Restore registers * and exit the function.  */
> > >
> > >  L(EXIT):
> > > -        movq      %rbp, %rsp
> > > -        popq      %rbp
> > > -        cfi_def_cfa(7, 8)
> > > -        cfi_restore(6)
> > > -        ret
> > > -        cfi_def_cfa(6, 16)
> > > -        cfi_offset(6, -16)
> > > -
> > > -/* Branch to process
> > > - * special inputs
> > > - */
> > > +       movq    %rbp, %rsp
> > > +       popq    %rbp
> > > +       cfi_def_cfa (7, 8)
> > > +       cfi_restore (6)
> > > +       ret
> > > +       cfi_def_cfa (6, 16)
> > > +       cfi_offset (6, -16)
> > > +
> > > +       /* Branch to process special inputs.  */
> > >
> > >  L(SPECIAL_VALUES_BRANCH):
> > > -        vmovups   %zmm11, 64(%rsp)
> > > -        vmovups   %zmm0, 128(%rsp)
> > > -                                # LOE rbx r12 r13 r14 r15 edx zmm0
> > > -
> > > -        xorl      %eax, %eax
> > > -                                # LOE rbx r12 r13 r14 r15 eax edx
> > > -
> > > -        vzeroupper
> > > -        movq      %r12, 16(%rsp)
> > > -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> > > -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> > > -        movl      %eax, %r12d
> > > -        movq      %r13, 8(%rsp)
> > > -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> > > -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> > > -        movl      %edx, %r13d
> > > -        movq      %r14, (%rsp)
> > > -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> > > -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> > > -                                # LOE rbx r15 r12d r13d
> > > -
> > > -/* Range mask
> > > - * bits check
> > > - */
> > > +       vmovups %zmm11, 64(%rsp)
> > > +       vmovups %zmm0, 128(%rsp)
> > > +
> > > +       xorl    %eax, %eax
> > > +
> > > +       vzeroupper
> > > +       movq    %r12, 16(%rsp)
> > > +       /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s:
> > > +          -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus).  */
> > > +       .cfi_escape 0x10 , 0x0c , 0x0e , 0x38 , 0x1c , 0x0d , 0xc0 , 0xff , 0xff , 0xff , 0x1a , 0x0d , 0x50 , 0xff , 0xff , 0xff , 0x22
> > > +       movl    %eax, %r12d
> > > +       movq    %r13, 8(%rsp)
> > > +       /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s:
> > > +          -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus).  */
> > > +       .cfi_escape 0x10 , 0x0d , 0x0e , 0x38 , 0x1c , 0x0d , 0xc0 , 0xff , 0xff , 0xff , 0x1a , 0x0d , 0x48 , 0xff , 0xff , 0xff , 0x22
> > > +       movl    %edx, %r13d
> > > +       movq    %r14, (%rsp)
> > > +       /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s:
> > > +          -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus).  */
> > > +       .cfi_escape 0x10 , 0x0e , 0x0e , 0x38 , 0x1c , 0x0d , 0xc0 , 0xff , 0xff , 0xff , 0x1a , 0x0d , 0x40 , 0xff , 0xff , 0xff , 0x22
> > > +
> > > +       /* Range mask * bits check.  */
> > >
> > >  L(RANGEMASK_CHECK):
> > > -        btl       %r12d, %r13d
> > > +       btl     %r12d, %r13d
> > >
> > > -/* Call scalar math function */
> > > -        jc        L(SCALAR_MATH_CALL)
> > > -                                # LOE rbx r15 r12d r13d
> > > +       /* Call scalar math function.  */
> > > +       jc      L(SCALAR_MATH_CALL)
> > >
> > > -/* Special inputs
> > > - * processing loop
> > > - */
> > > +       /* Special inputs processing loop.  */
> > >
> > >  L(SPECIAL_VALUES_LOOP):
> > > -        incl      %r12d
> > > -        cmpl      $16, %r12d
> > > -
> > > -/* Check bits in range mask */
> > > -        jl        L(RANGEMASK_CHECK)
> > > -                                # LOE rbx r15 r12d r13d
> > > -
> > > -        movq      16(%rsp), %r12
> > > -        cfi_restore(12)
> > > -        movq      8(%rsp), %r13
> > > -        cfi_restore(13)
> > > -        movq      (%rsp), %r14
> > > -        cfi_restore(14)
> > > -        vmovups   128(%rsp), %zmm0
> > > -
> > > -/* Go to exit */
> > > -        jmp       L(EXIT)
> > > -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> > > -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> > > -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> > > -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> > > -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> > > -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> > > -                                # LOE rbx r12 r13 r14 r15 zmm0
> > > -
> > > -/* Scalar math fucntion call
> > > - * to process special input
> > > - */
> > > +       incl    %r12d
> > > +       cmpl    $16, %r12d
> > > +
> > > +       /* Check bits in range mask.  */
> > > +       jl      L(RANGEMASK_CHECK)
> > > +
> > > +       movq    16(%rsp), %r12
> > > +       cfi_restore (12)
> > > +       movq    8(%rsp), %r13
> > > +       cfi_restore (13)
> > > +       movq    (%rsp), %r14
> > > +       cfi_restore (14)
> > > +       vmovups 128(%rsp), %zmm0
> > > +
> > > +       /* Go to exit.  */
> > > +       jmp     L(EXIT)
> > > +       /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s:
> > > +          -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus).  */
> > > +       .cfi_escape 0x10 , 0x0c , 0x0e , 0x38 , 0x1c , 0x0d , 0xc0 , 0xff , 0xff , 0xff , 0x1a , 0x0d , 0x50 , 0xff , 0xff , 0xff , 0x22
> > > +       /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s:
> > > +          -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus).  */
> > > +       .cfi_escape 0x10 , 0x0d , 0x0e , 0x38 , 0x1c , 0x0d , 0xc0 , 0xff , 0xff , 0xff , 0x1a , 0x0d , 0x48 , 0xff , 0xff , 0xff , 0x22
> > > +       /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s:
> > > +          -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus).  */
> > > +       .cfi_escape 0x10 , 0x0e , 0x0e , 0x38 , 0x1c , 0x0d , 0xc0 , 0xff , 0xff , 0xff , 0x1a , 0x0d , 0x40 , 0xff , 0xff , 0xff , 0x22
> > > +
> > > +       /* Scalar math fucntion call to process special input.  */
> > >
> > >  L(SCALAR_MATH_CALL):
> > > -        movl      %r12d, %r14d
> > > -        movss     64(%rsp,%r14,4), %xmm0
> > > -        call      atanhf@PLT
> > > -                                # LOE rbx r14 r15 r12d r13d xmm0
> > > +       movl    %r12d, %r14d
> > > +       movss   64(%rsp, %r14, 4), %xmm0
> > > +       call    atanhf@PLT
> > >
> > > -        movss     %xmm0, 128(%rsp,%r14,4)
> > > +       movss   %xmm0, 128(%rsp, %r14, 4)
> > >
> > > -/* Process special inputs in loop */
> > > -        jmp       L(SPECIAL_VALUES_LOOP)
> > > -                                # LOE rbx r15 r12d r13d
> > > +       /* Process special inputs in loop.  */
> > > +       jmp     L(SPECIAL_VALUES_LOOP)
> > >  END(_ZGVeN16v_atanhf_skx)
> > >
> > > -        .section .rodata, "a"
> > > -        .align 64
> > > +       .section .rodata, "a"
> > > +       .align  64
> > >
> > >  #ifdef __svml_satanh_data_internal_avx512_typedef
> > > -typedef unsigned int VUINT32;
> > > -typedef struct {
> > > -        __declspec(align(64)) VUINT32 Log_tbl_H[32][1];
> > > -        __declspec(align(64)) VUINT32 Log_tbl_L[32][1];
> > > -        __declspec(align(64)) VUINT32 One[16][1];
> > > -        __declspec(align(64)) VUINT32 AbsMask[16][1];
> > > -        __declspec(align(64)) VUINT32 AddB5[16][1];
> > > -        __declspec(align(64)) VUINT32 RcpBitMask[16][1];
> > > -        __declspec(align(64)) VUINT32 poly_coeff3[16][1];
> > > -        __declspec(align(64)) VUINT32 poly_coeff2[16][1];
> > > -        __declspec(align(64)) VUINT32 poly_coeff1[16][1];
> > > -        __declspec(align(64)) VUINT32 poly_coeff0[16][1];
> > > -        __declspec(align(64)) VUINT32 Half[16][1];
> > > -        __declspec(align(64)) VUINT32 L2H[16][1];
> > > -        __declspec(align(64)) VUINT32 L2L[16][1];
> > > -    } __svml_satanh_data_internal_avx512;
> > > +       typedef unsigned int VUINT32;
> > > +       typedef struct{
> > > +       __declspec (align(64))VUINT32 Log_tbl_H[32][1];
> > > +       __declspec (align(64))VUINT32 Log_tbl_L[32][1];
> > > +       __declspec (align(64))VUINT32 One[16][1];
> > > +       __declspec (align(64))VUINT32 AbsMask[16][1];
> > > +       __declspec (align(64))VUINT32 AddB5[16][1];
> > > +       __declspec (align(64))VUINT32 RcpBitMask[16][1];
> > > +       __declspec (align(64))VUINT32 poly_coeff3[16][1];
> > > +       __declspec (align(64))VUINT32 poly_coeff2[16][1];
> > > +       __declspec (align(64))VUINT32 poly_coeff1[16][1];
> > > +       __declspec (align(64))VUINT32 poly_coeff0[16][1];
> > > +       __declspec (align(64))VUINT32 Half[16][1];
> > > +       __declspec (align(64))VUINT32 L2H[16][1];
> > > +       __declspec (align(64))VUINT32 L2L[16][1];
> > > +       }__svml_satanh_data_internal_avx512;
> > >  #endif
> > >  __svml_satanh_data_internal_avx512:
> > > -        /*== Log_tbl_H ==*/
> > > -        .long 0x00000000
> > > -        .long 0x3cfc0000
> > > -        .long 0x3d780000
> > > -        .long 0x3db78000
> > > -        .long 0x3df10000
> > > -        .long 0x3e14c000
> > > -        .long 0x3e300000
> > > -        .long 0x3e4a8000
> > > -        .long 0x3e648000
> > > -        .long 0x3e7dc000
> > > -        .long 0x3e8b4000
> > > -        .long 0x3e974000
> > > -        .long 0x3ea30000
> > > -        .long 0x3eae8000
> > > -        .long 0x3eb9c000
> > > -        .long 0x3ec4e000
> > > -        .long 0x3ecfa000
> > > -        .long 0x3eda2000
> > > -        .long 0x3ee48000
> > > -        .long 0x3eeea000
> > > -        .long 0x3ef8a000
> > > -        .long 0x3f013000
> > > -        .long 0x3f05f000
> > > -        .long 0x3f0aa000
> > > -        .long 0x3f0f4000
> > > -        .long 0x3f13d000
> > > -        .long 0x3f184000
> > > -        .long 0x3f1ca000
> > > -        .long 0x3f20f000
> > > -        .long 0x3f252000
> > > -        .long 0x3f295000
> > > -        .long 0x3f2d7000
> > > -        /*== Log_tbl_L ==*/
> > > -        .align 64
> > > -        .long 0x00000000
> > > -        .long 0x3726c39e
> > > -        .long 0x38a30c01
> > > -        .long 0x37528ae5
> > > -        .long 0x38e0edc5
> > > -        .long 0xb8ab41f8
> > > -        .long 0xb7cf8f58
> > > -        .long 0x3896a73d
> > > -        .long 0xb5838656
> > > -        .long 0x380c36af
> > > -        .long 0xb8235454
> > > -        .long 0x3862bae1
> > > -        .long 0x38c5e10e
> > > -        .long 0x38dedfac
> > > -        .long 0x38ebfb5e
> > > -        .long 0xb8e63c9f
> > > -        .long 0xb85c1340
> > > -        .long 0x38777bcd
> > > -        .long 0xb6038656
> > > -        .long 0x37d40984
> > > -        .long 0xb8b85028
> > > -        .long 0xb8ad5a5a
> > > -        .long 0x3865c84a
> > > -        .long 0x38c3d2f5
> > > -        .long 0x383ebce1
> > > -        .long 0xb8a1ed76
> > > -        .long 0xb7a332c4
> > > -        .long 0xb779654f
> > > -        .long 0xb8602f73
> > > -        .long 0x38f85db0
> > > -        .long 0x37b4996f
> > > -        .long 0xb8bfb3ca
> > > -        /*== One ==*/
> > > -        .align 64
> > > -        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> > > -        /*== AbsMask ==*/
> > > -        .align 64
> > > -        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> > > -        /*== AddB5 ==*/
> > > -        .align 64
> > > -        .long 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000
> > > -        /*== RcpBitMask ==*/
> > > -        .align 64
> > > -        .long 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> > > -        /*== poly_coeff3 ==*/
> > > -        .align 64
> > > -        .long 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> > > -        /*== poly_coeff2 ==*/
> > > -        .align 64
> > > -        .long 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> > > -        /*== poly_coeff1 ==*/
> > > -        .align 64
> > > -        .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> > > -        /*== poly_coeff0 ==*/
> > > -        .align 64
> > > -        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> > > -        /*== Half ==*/
> > > -        .align 64
> > > -        .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
> > > -        /*== L2H = log(2)_high ==*/
> > > -        .align 64
> > > -        .long 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> > > -        /*== L2L = log(2)_low ==*/
> > > -        .align 64
> > > -        .long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> > > -        .align 64
> > > -        .type  __svml_satanh_data_internal_avx512,@object
> > > -        .size  __svml_satanh_data_internal_avx512,.-__svml_satanh_data_internal_avx512
> > > +       /* == Log_tbl_H ==.  */
> > > +       .long   0x00000000
> > > +       .long   0x3cfc0000
> > > +       .long   0x3d780000
> > > +       .long   0x3db78000
> > > +       .long   0x3df10000
> > > +       .long   0x3e14c000
> > > +       .long   0x3e300000
> > > +       .long   0x3e4a8000
> > > +       .long   0x3e648000
> > > +       .long   0x3e7dc000
> > > +       .long   0x3e8b4000
> > > +       .long   0x3e974000
> > > +       .long   0x3ea30000
> > > +       .long   0x3eae8000
> > > +       .long   0x3eb9c000
> > > +       .long   0x3ec4e000
> > > +       .long   0x3ecfa000
> > > +       .long   0x3eda2000
> > > +       .long   0x3ee48000
> > > +       .long   0x3eeea000
> > > +       .long   0x3ef8a000
> > > +       .long   0x3f013000
> > > +       .long   0x3f05f000
> > > +       .long   0x3f0aa000
> > > +       .long   0x3f0f4000
> > > +       .long   0x3f13d000
> > > +       .long   0x3f184000
> > > +       .long   0x3f1ca000
> > > +       .long   0x3f20f000
> > > +       .long   0x3f252000
> > > +       .long   0x3f295000
> > > +       .long   0x3f2d7000
> > > +       /* == Log_tbl_L ==.  */
> > > +       .align  64
> > > +       .long   0x00000000
> > > +       .long   0x3726c39e
> > > +       .long   0x38a30c01
> > > +       .long   0x37528ae5
> > > +       .long   0x38e0edc5
> > > +       .long   0xb8ab41f8
> > > +       .long   0xb7cf8f58
> > > +       .long   0x3896a73d
> > > +       .long   0xb5838656
> > > +       .long   0x380c36af
> > > +       .long   0xb8235454
> > > +       .long   0x3862bae1
> > > +       .long   0x38c5e10e
> > > +       .long   0x38dedfac
> > > +       .long   0x38ebfb5e
> > > +       .long   0xb8e63c9f
> > > +       .long   0xb85c1340
> > > +       .long   0x38777bcd
> > > +       .long   0xb6038656
> > > +       .long   0x37d40984
> > > +       .long   0xb8b85028
> > > +       .long   0xb8ad5a5a
> > > +       .long   0x3865c84a
> > > +       .long   0x38c3d2f5
> > > +       .long   0x383ebce1
> > > +       .long   0xb8a1ed76
> > > +       .long   0xb7a332c4
> > > +       .long   0xb779654f
> > > +       .long   0xb8602f73
> > > +       .long   0x38f85db0
> > > +       .long   0x37b4996f
> > > +       .long   0xb8bfb3ca
> > > +       /* == One ==.  */
> > > +       .align  64
> > > +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> > > +       /* == AbsMask ==.  */
> > > +       .align  64
> > > +       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> > > +       /* == AddB5 ==.  */
> > > +       .align  64
> > > +       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000
> > > +       /* == RcpBitMask ==.  */
> > > +       .align  64
> > > +       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> > > +       /* == poly_coeff3 ==.  */
> > > +       .align  64
> > > +       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> > > +       /* == poly_coeff2 ==.  */
> > > +       .align  64
> > > +       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> > > +       /* == poly_coeff1 ==.  */
> > > +       .align  64
> > > +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> > > +       /* == poly_coeff0 ==.  */
> > > +       .align  64
> > > +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> > > +       /* == Half ==.  */
> > > +       .align  64
> > > +       .long   0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
> > > +       /* == L2H = log(2)_high ==.  */
> > > +       .align  64
> > > +       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> > > +       /* == L2L = log(2)_low ==.  */
> > > +       .align  64
> > > +       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> > > +       .align  64
> > > +       .type   __svml_satanh_data_internal_avx512, @object
> > > +       .size   __svml_satanh_data_internal_avx512, .-__svml_satanh_data_internal_avx512
> > > --
> > > 2.25.1
> > >
Noah Goldstein Feb. 7, 2022, 7:33 p.m. UTC | #4
On Mon, Feb 7, 2022 at 7:21 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> Can you please also include
>
> Nested #ifdef example.
> Multiline comment example.
> Single line comment starts with #.

V2 (note gmail is eating the tabs, but the examples where I mention
them will have
them when posted on the wiki).

== x86 GAS (Assembly) ==

The following are stylistic conventions for x86 GAS
contributions. Other style conventions that are relevant to x86 GAS
syntax are also included (for example
[[https://sourceware.org/glibc/wiki/Style_and_Conventions#A79-Column_Lines|Column
Limit of 79]] and
[[https://sourceware.org/glibc/wiki/Style_and_Conventions#Nested_C_Preprocessor_Directives|Nested
Preprocessor Directives]]


1.  Instructions should be proceeded by a tab.
2.  Instruction less than 8 characters in length should have a tab
    between it and the first operand.
3.  Instruction greater than 7 characters in length should have a
    space between it and the first operand.
4.  There should be a space after the comma seperating operands.
    For example for rules 1, 2, 3, and 4:
    ```
    /* <tab><short instruction><tab><op0><comma><space><op1>.  */
    addl $1, %eax
    /* <tab><long instruction><space><op0><comma><space><op1>.  */
    vpmovmaskb %ymm0, %eax
    ```

5.  Comments should be indented with code:
    For example for rule 5:
    ```
    /* Function XYZ returns.  */
    ENTRY(XYZ)
    /* Return from XYZ.  */
    ret
    END(XYZ)
    ```

6.  Tab after `#define`d names and their value.
    For example:
    ```
    /* No value for 'HELLO' so no tab afterwards.  */
    #define HELLO

    /* Value for 'WORLD' so tab separated 'WORLD' from '10'.  */
    #define WORLD 10
    ```

7.  Use C-Style comments.

8.  Comments should be filled to obey the 79 character column limit.
    For example:
    ```
    /* Bad: This comment is going to go beyond the 79 character column
limit but be on one line.  */

    /* Good: This comment is going to go beyond the 79 character
       column limit but be on one line.  */
    ```


>
> Thank you so much.
> Sunil
>
> On Fri, Feb 4, 2022 at 11:46 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Sat, Feb 5, 2022 at 2:01 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
> > >
> > > Hi Noah,
> > >
> > > Since this patch is about glibc assembly language formatting, it may be
> > > helpful to everyone, if there is a section for glibc assembly coding
> > > in manual.
> > >
> > > https://sourceware.org/glibc/wiki/Style_and_Conventions
> > >
> > > You may also consider putting your tool in glibc code base, so that people
> > > can easily find/use it to check/format their assembly code.
> >
> > Like the idea. How does the following sound:
> >
> > == x86 GAS (Assembly) ==
> >
> > The following are stylistic conventions for x86 GAS
> > contributions. Other style conventions that are relevant to x86 GAS
> > syntax are also included (i.e column limit, indenting preprocessor
> > directives, etc..).
> >
> > 1.  Instructions should be proceeded by a tab.
> > 2.  Instruction less than 8 characters in length should have a tab
> >     between it and the first operand.
> > 3.  Instruction greater than 7 characters in length should have a
> >     space between it and the first operand.
> > 4.  There should be a space after the comma seperating operands.
> >     For example for rules 1, 2, 3, and 4:
> >     ```
> >     /* <tab><short instruction><tab><op0><comma><space><op1>.  */
> >     addl $1, %eax
> >     /* <tab><long instruction><space><op0><comma><space><op1>.  */
> >     vpmovmaskb %ymm0, %eax
> >     ```
> >
> > 5.  Comments should be indented with code:
> >     For example for rule 5:
> >     ```
> >     /* Function XYZ returns.  */
> >     ENTRY(XYZ)
> >     /* Return from XYZ.  */
> >     ret
> >     END(XYZ)
> >     ```
> >
> > 6.  Tab after `#define`d names and their value.
> >     For example:
> >     ```
> >     /* No value for 'HELLO' so no tab afterwards.  */
> >     #define HELLO
> >
> >     /* Value for 'WORLD' so tab separated 'WORLD' from '10'.  */
> >     #define WORLD 10
> >     ```
> >
> >
> > >
> > > https://github.com/goldsteinn/assembly-beautifier
> >
> > Thats not an official tool. Its code quality is not really up to standard
> > and I can't really commit to maintaining it.
> >
> > If its improved to a state where its good enough for GLIBC I'll
> > post a patch.
> > >
> > > Thank,
> > > Sunil
> > >
> > > On Thu, Feb 3, 2022 at 2:00 PM Noah Goldstein via Libc-alpha
> > > <libc-alpha@sourceware.org> wrote:
> > > >
> > > > Reformats to match style of other hand coded assembly files.
> > > >
> > > > The changes are:
> > > >     1. Replace 8x space with tab before instructions.
> > > >     2. After instruction len < 8 use tab.
> > > >     3. After instruction len >= 8 use space.
> > > >     4. 1 Space after comma between instruction operands.
> > > >     5. Indent comments similiar with code.
> > > >     6. Make comments complete sentences.
> > > >     7. Tab after '#define'
> > > >     8. Spaces at '#' representing the depth.
> > > >
> > > > The final executable is unchanged by this commit.
> > > > ---
> > > > The changes for this patch where made with the following
> > > > script: https://github.com/goldsteinn/assembly-beautifier
> > > >
> > > > The goal of this patch is just to reformat the code
> > > > so it is more human friendly and try create a style
> > > > that future patches will be based on.
> > > >
> > > > If this patch is accepted ensuing patches to
> > > > optimize the performance of svml_s_atanhf16_core_avx512.S
> > > > will be based on this patch.
> > > >
> > > >  .../multiarch/svml_s_atanhf16_core_avx512.S   | 655 +++++++++---------
> > > >  1 file changed, 321 insertions(+), 334 deletions(-)
> > > >
> > > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> > > > index f863f4f959..ed90a427a6 100644
> > > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> > > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> > > > @@ -33,361 +33,348 @@
> > > >
> > > >  /* Offsets for data table __svml_satanh_data_internal_avx512
> > > >   */
> > > > -#define Log_tbl_H                      0
> > > > -#define Log_tbl_L                      128
> > > > -#define One                            256
> > > > -#define AbsMask                        320
> > > > -#define AddB5                          384
> > > > -#define RcpBitMask                     448
> > > > -#define poly_coeff3                    512
> > > > -#define poly_coeff2                    576
> > > > -#define poly_coeff1                    640
> > > > -#define poly_coeff0                    704
> > > > -#define Half                           768
> > > > -#define L2H                            832
> > > > -#define L2L                            896
> > > > +#define Log_tbl_H      0
> > > > +#define Log_tbl_L      128
> > > > +#define One    256
> > > > +#define AbsMask        320
> > > > +#define AddB5  384
> > > > +#define RcpBitMask     448
> > > > +#define poly_coeff3    512
> > > > +#define poly_coeff2    576
> > > > +#define poly_coeff1    640
> > > > +#define poly_coeff0    704
> > > > +#define Half   768
> > > > +#define L2H    832
> > > > +#define L2L    896
> > > >
> > > >  #include <sysdep.h>
> > > >
> > > > -        .text
> > > > -       .section .text.exex512,"ax",@progbits
> > > > +       .text
> > > > +       .section .text.exex512, "ax", @progbits
> > > >  ENTRY(_ZGVeN16v_atanhf_skx)
> > > > -        pushq     %rbp
> > > > -        cfi_def_cfa_offset(16)
> > > > -        movq      %rsp, %rbp
> > > > -        cfi_def_cfa(6, 16)
> > > > -        cfi_offset(6, -16)
> > > > -        andq      $-64, %rsp
> > > > -        subq      $192, %rsp
> > > > -        vmovups   One+__svml_satanh_data_internal_avx512(%rip), %zmm4
> > > > -
> > > > -/* round reciprocals to 1+5b mantissas */
> > > > -        vmovups   AddB5+__svml_satanh_data_internal_avx512(%rip), %zmm14
> > > > -        vmovups   RcpBitMask+__svml_satanh_data_internal_avx512(%rip), %zmm1
> > > > -        vmovaps   %zmm0, %zmm11
> > > > -        vandps    AbsMask+__svml_satanh_data_internal_avx512(%rip), %zmm11, %zmm6
> > > > -
> > > > -/* 1+y */
> > > > -        vaddps    {rn-sae}, %zmm4, %zmm6, %zmm9
> > > > -
> > > > -/* 1-y */
> > > > -        vsubps    {rn-sae}, %zmm6, %zmm4, %zmm8
> > > > -        vxorps    %zmm6, %zmm11, %zmm10
> > > > -
> > > > -/* Yp_high */
> > > > -        vsubps    {rn-sae}, %zmm4, %zmm9, %zmm2
> > > > -
> > > > -/* -Ym_high */
> > > > -        vsubps    {rn-sae}, %zmm4, %zmm8, %zmm5
> > > > -
> > > > -/* RcpP ~ 1/Yp */
> > > > -        vrcp14ps  %zmm9, %zmm12
> > > > -
> > > > -/* RcpM ~ 1/Ym */
> > > > -        vrcp14ps  %zmm8, %zmm13
> > > > -
> > > > -/* input outside (-1, 1) ? */
> > > > -        vcmpps    $21, {sae}, %zmm4, %zmm6, %k0
> > > > -        vpaddd    %zmm14, %zmm12, %zmm15
> > > > -        vpaddd    %zmm14, %zmm13, %zmm0
> > > > -
> > > > -/* Yp_low */
> > > > -        vsubps    {rn-sae}, %zmm2, %zmm6, %zmm3
> > > > -        vandps    %zmm1, %zmm15, %zmm7
> > > > -        vandps    %zmm1, %zmm0, %zmm12
> > > > -
> > > > -/* Ym_low */
> > > > -        vaddps    {rn-sae}, %zmm5, %zmm6, %zmm5
> > > > -
> > > > -/* Reduced argument: Rp = (RcpP*Yp - 1)+RcpP*Yp_low */
> > > > -        vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9
> > > > -
> > > > -/* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low */
> > > > -        vfmsub231ps {rn-sae}, %zmm12, %zmm8, %zmm4
> > > > -        vmovups   Log_tbl_L+__svml_satanh_data_internal_avx512(%rip), %zmm8
> > > > -        vmovups   Log_tbl_L+64+__svml_satanh_data_internal_avx512(%rip), %zmm13
> > > > -
> > > > -/* exponents */
> > > > -        vgetexpps {sae}, %zmm7, %zmm15
> > > > -        vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9
> > > > -
> > > > -/* Table lookups */
> > > > -        vmovups   __svml_satanh_data_internal_avx512(%rip), %zmm6
> > > > -        vgetexpps {sae}, %zmm12, %zmm14
> > > > -        vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm4
> > > > -
> > > > -/* Prepare table index */
> > > > -        vpsrld    $18, %zmm7, %zmm3
> > > > -        vpsrld    $18, %zmm12, %zmm2
> > > > -        vmovups   Log_tbl_H+64+__svml_satanh_data_internal_avx512(%rip), %zmm7
> > > > -        vmovups   poly_coeff1+__svml_satanh_data_internal_avx512(%rip), %zmm12
> > > > -
> > > > -/* Km-Kp */
> > > > -        vsubps    {rn-sae}, %zmm15, %zmm14, %zmm1
> > > > -        kmovw     %k0, %edx
> > > > -        vmovaps   %zmm3, %zmm0
> > > > -        vpermi2ps %zmm13, %zmm8, %zmm3
> > > > -        vpermt2ps %zmm13, %zmm2, %zmm8
> > > > -        vpermi2ps %zmm7, %zmm6, %zmm0
> > > > -        vpermt2ps %zmm7, %zmm2, %zmm6
> > > > -        vsubps    {rn-sae}, %zmm3, %zmm8, %zmm5
> > > > -
> > > > -/* K*L2H + Th */
> > > > -        vmovups   L2H+__svml_satanh_data_internal_avx512(%rip), %zmm2
> > > > -
> > > > -/* K*L2L + Tl */
> > > > -        vmovups   L2L+__svml_satanh_data_internal_avx512(%rip), %zmm3
> > > > -
> > > > -/* polynomials */
> > > > -        vmovups   poly_coeff3+__svml_satanh_data_internal_avx512(%rip), %zmm7
> > > > -        vmovups   poly_coeff0+__svml_satanh_data_internal_avx512(%rip), %zmm13
> > > > -
> > > > -/* table values */
> > > > -        vsubps    {rn-sae}, %zmm0, %zmm6, %zmm0
> > > > -        vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm0
> > > > -        vfmadd213ps {rn-sae}, %zmm5, %zmm3, %zmm1
> > > > -        vmovups   poly_coeff2+__svml_satanh_data_internal_avx512(%rip), %zmm3
> > > > -        vmovaps   %zmm3, %zmm2
> > > > -        vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm2
> > > > -        vfmadd231ps {rn-sae}, %zmm4, %zmm7, %zmm3
> > > > -        vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm2
> > > > -        vfmadd213ps {rn-sae}, %zmm12, %zmm4, %zmm3
> > > > -        vfmadd213ps {rn-sae}, %zmm13, %zmm9, %zmm2
> > > > -        vfmadd213ps {rn-sae}, %zmm13, %zmm4, %zmm3
> > > > -
> > > > -/* (K*L2L + Tl) + Rp*PolyP */
> > > > -        vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm2
> > > > -        vorps     Half+__svml_satanh_data_internal_avx512(%rip), %zmm10, %zmm9
> > > > -
> > > > -/* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM */
> > > > -        vfnmadd213ps {rn-sae}, %zmm2, %zmm4, %zmm3
> > > > -        vaddps    {rn-sae}, %zmm3, %zmm0, %zmm4
> > > > -        vmulps    {rn-sae}, %zmm9, %zmm4, %zmm0
> > > > -        testl     %edx, %edx
> > > > -
> > > > -/* Go to special inputs processing branch */
> > > > -        jne       L(SPECIAL_VALUES_BRANCH)
> > > > -                                # LOE rbx r12 r13 r14 r15 edx zmm0 zmm11
> > > > -
> > > > -/* Restore registers
> > > > - * and exit the function
> > > > - */
> > > > +       pushq   %rbp
> > > > +       cfi_def_cfa_offset (16)
> > > > +       movq    %rsp, %rbp
> > > > +       cfi_def_cfa (6, 16)
> > > > +       cfi_offset (6, -16)
> > > > +       andq    $-64, %rsp
> > > > +       subq    $192, %rsp
> > > > +       vmovups One + __svml_satanh_data_internal_avx512(%rip), %zmm4
> > > > +
> > > > +       /* round reciprocals to 1+5b mantissas.  */
> > > > +       vmovups AddB5 + __svml_satanh_data_internal_avx512(%rip), %zmm14
> > > > +       vmovups RcpBitMask + __svml_satanh_data_internal_avx512(%rip), %zmm1
> > > > +       vmovaps %zmm0, %zmm11
> > > > +       vandps  AbsMask + __svml_satanh_data_internal_avx512(%rip), %zmm11, %zmm6
> > > > +
> > > > +       /* 1+y.  */
> > > > +       vaddps  {rn-sae}, %zmm4, %zmm6, %zmm9
> > > > +
> > > > +       /* 1-y.  */
> > > > +       vsubps  {rn-sae}, %zmm6, %zmm4, %zmm8
> > > > +       vxorps  %zmm6, %zmm11, %zmm10
> > > > +
> > > > +       /* Yp_high.  */
> > > > +       vsubps  {rn-sae}, %zmm4, %zmm9, %zmm2
> > > > +
> > > > +       /* -Ym_high.  */
> > > > +       vsubps  {rn-sae}, %zmm4, %zmm8, %zmm5
> > > > +
> > > > +       /* RcpP ~ 1/Yp.  */
> > > > +       vrcp14ps %zmm9, %zmm12
> > > > +
> > > > +       /* RcpM ~ 1/Ym.  */
> > > > +       vrcp14ps %zmm8, %zmm13
> > > > +
> > > > +       /* input outside (-1, 1) ?.  */
> > > > +       vcmpps  $21, {sae}, %zmm4, %zmm6, %k0
> > > > +       vpaddd  %zmm14, %zmm12, %zmm15
> > > > +       vpaddd  %zmm14, %zmm13, %zmm0
> > > > +
> > > > +       /* Yp_low.  */
> > > > +       vsubps  {rn-sae}, %zmm2, %zmm6, %zmm3
> > > > +       vandps  %zmm1, %zmm15, %zmm7
> > > > +       vandps  %zmm1, %zmm0, %zmm12
> > > > +
> > > > +       /* Ym_low.  */
> > > > +       vaddps  {rn-sae}, %zmm5, %zmm6, %zmm5
> > > > +
> > > > +       /* Reduced argument: Rp = (RcpP*Yp - 1)+RcpP*Yp_low.  */
> > > > +       vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9
> > > > +
> > > > +       /* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low.  */
> > > > +       vfmsub231ps {rn-sae}, %zmm12, %zmm8, %zmm4
> > > > +       vmovups Log_tbl_L + __svml_satanh_data_internal_avx512(%rip), %zmm8
> > > > +       vmovups Log_tbl_L + 64 + __svml_satanh_data_internal_avx512(%rip), %zmm13
> > > > +
> > > > +       /* exponents.  */
> > > > +       vgetexpps {sae}, %zmm7, %zmm15
> > > > +       vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9
> > > > +
> > > > +       /* Table lookups.  */
> > > > +       vmovups __svml_satanh_data_internal_avx512(%rip), %zmm6
> > > > +       vgetexpps {sae}, %zmm12, %zmm14
> > > > +       vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm4
> > > > +
> > > > +       /* Prepare table index.  */
> > > > +       vpsrld  $18, %zmm7, %zmm3
> > > > +       vpsrld  $18, %zmm12, %zmm2
> > > > +       vmovups Log_tbl_H + 64 + __svml_satanh_data_internal_avx512(%rip), %zmm7
> > > > +       vmovups poly_coeff1 + __svml_satanh_data_internal_avx512(%rip), %zmm12
> > > > +
> > > > +       /* Km-Kp.  */
> > > > +       vsubps  {rn-sae}, %zmm15, %zmm14, %zmm1
> > > > +       kmovw   %k0, %edx
> > > > +       vmovaps %zmm3, %zmm0
> > > > +       vpermi2ps %zmm13, %zmm8, %zmm3
> > > > +       vpermt2ps %zmm13, %zmm2, %zmm8
> > > > +       vpermi2ps %zmm7, %zmm6, %zmm0
> > > > +       vpermt2ps %zmm7, %zmm2, %zmm6
> > > > +       vsubps  {rn-sae}, %zmm3, %zmm8, %zmm5
> > > > +
> > > > +       /* K*L2H + Th.  */
> > > > +       vmovups L2H + __svml_satanh_data_internal_avx512(%rip), %zmm2
> > > > +
> > > > +       /* K*L2L + Tl.  */
> > > > +       vmovups L2L + __svml_satanh_data_internal_avx512(%rip), %zmm3
> > > > +
> > > > +       /* polynomials.  */
> > > > +       vmovups poly_coeff3 + __svml_satanh_data_internal_avx512(%rip), %zmm7
> > > > +       vmovups poly_coeff0 + __svml_satanh_data_internal_avx512(%rip), %zmm13
> > > > +
> > > > +       /* table values.  */
> > > > +       vsubps  {rn-sae}, %zmm0, %zmm6, %zmm0
> > > > +       vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm0
> > > > +       vfmadd213ps {rn-sae}, %zmm5, %zmm3, %zmm1
> > > > +       vmovups poly_coeff2 + __svml_satanh_data_internal_avx512(%rip), %zmm3
> > > > +       vmovaps %zmm3, %zmm2
> > > > +       vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm2
> > > > +       vfmadd231ps {rn-sae}, %zmm4, %zmm7, %zmm3
> > > > +       vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm2
> > > > +       vfmadd213ps {rn-sae}, %zmm12, %zmm4, %zmm3
> > > > +       vfmadd213ps {rn-sae}, %zmm13, %zmm9, %zmm2
> > > > +       vfmadd213ps {rn-sae}, %zmm13, %zmm4, %zmm3
> > > > +
> > > > +       /* (K*L2L + Tl) + Rp*PolyP.  */
> > > > +       vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm2
> > > > +       vorps   Half + __svml_satanh_data_internal_avx512(%rip), %zmm10, %zmm9
> > > > +
> > > > +       /* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM.  */
> > > > +       vfnmadd213ps {rn-sae}, %zmm2, %zmm4, %zmm3
> > > > +       vaddps  {rn-sae}, %zmm3, %zmm0, %zmm4
> > > > +       vmulps  {rn-sae}, %zmm9, %zmm4, %zmm0
> > > > +       testl   %edx, %edx
> > > > +
> > > > +       /* Go to special inputs processing branch.  */
> > > > +       jne     L(SPECIAL_VALUES_BRANCH)
> > > > +
> > > > +       /* Restore registers * and exit the function.  */
> > > >
> > > >  L(EXIT):
> > > > -        movq      %rbp, %rsp
> > > > -        popq      %rbp
> > > > -        cfi_def_cfa(7, 8)
> > > > -        cfi_restore(6)
> > > > -        ret
> > > > -        cfi_def_cfa(6, 16)
> > > > -        cfi_offset(6, -16)
> > > > -
> > > > -/* Branch to process
> > > > - * special inputs
> > > > - */
> > > > +       movq    %rbp, %rsp
> > > > +       popq    %rbp
> > > > +       cfi_def_cfa (7, 8)
> > > > +       cfi_restore (6)
> > > > +       ret
> > > > +       cfi_def_cfa (6, 16)
> > > > +       cfi_offset (6, -16)
> > > > +
> > > > +       /* Branch to process special inputs.  */
> > > >
> > > >  L(SPECIAL_VALUES_BRANCH):
> > > > -        vmovups   %zmm11, 64(%rsp)
> > > > -        vmovups   %zmm0, 128(%rsp)
> > > > -                                # LOE rbx r12 r13 r14 r15 edx zmm0
> > > > -
> > > > -        xorl      %eax, %eax
> > > > -                                # LOE rbx r12 r13 r14 r15 eax edx
> > > > -
> > > > -        vzeroupper
> > > > -        movq      %r12, 16(%rsp)
> > > > -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> > > > -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> > > > -        movl      %eax, %r12d
> > > > -        movq      %r13, 8(%rsp)
> > > > -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> > > > -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> > > > -        movl      %edx, %r13d
> > > > -        movq      %r14, (%rsp)
> > > > -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> > > > -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> > > > -                                # LOE rbx r15 r12d r13d
> > > > -
> > > > -/* Range mask
> > > > - * bits check
> > > > - */
> > > > +       vmovups %zmm11, 64(%rsp)
> > > > +       vmovups %zmm0, 128(%rsp)
> > > > +
> > > > +       xorl    %eax, %eax
> > > > +
> > > > +       vzeroupper
> > > > +       movq    %r12, 16(%rsp)
> > > > +       /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s:
> > > > +          -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus).  */
> > > > +       .cfi_escape 0x10 , 0x0c , 0x0e , 0x38 , 0x1c , 0x0d , 0xc0 , 0xff , 0xff , 0xff , 0x1a , 0x0d , 0x50 , 0xff , 0xff , 0xff , 0x22
> > > > +       movl    %eax, %r12d
> > > > +       movq    %r13, 8(%rsp)
> > > > +       /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s:
> > > > +          -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus).  */
> > > > +       .cfi_escape 0x10 , 0x0d , 0x0e , 0x38 , 0x1c , 0x0d , 0xc0 , 0xff , 0xff , 0xff , 0x1a , 0x0d , 0x48 , 0xff , 0xff , 0xff , 0x22
> > > > +       movl    %edx, %r13d
> > > > +       movq    %r14, (%rsp)
> > > > +       /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s:
> > > > +          -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus).  */
> > > > +       .cfi_escape 0x10 , 0x0e , 0x0e , 0x38 , 0x1c , 0x0d , 0xc0 , 0xff , 0xff , 0xff , 0x1a , 0x0d , 0x40 , 0xff , 0xff , 0xff , 0x22
> > > > +
> > > > +       /* Range mask * bits check.  */
> > > >
> > > >  L(RANGEMASK_CHECK):
> > > > -        btl       %r12d, %r13d
> > > > +       btl     %r12d, %r13d
> > > >
> > > > -/* Call scalar math function */
> > > > -        jc        L(SCALAR_MATH_CALL)
> > > > -                                # LOE rbx r15 r12d r13d
> > > > +       /* Call scalar math function.  */
> > > > +       jc      L(SCALAR_MATH_CALL)
> > > >
> > > > -/* Special inputs
> > > > - * processing loop
> > > > - */
> > > > +       /* Special inputs processing loop.  */
> > > >
> > > >  L(SPECIAL_VALUES_LOOP):
> > > > -        incl      %r12d
> > > > -        cmpl      $16, %r12d
> > > > -
> > > > -/* Check bits in range mask */
> > > > -        jl        L(RANGEMASK_CHECK)
> > > > -                                # LOE rbx r15 r12d r13d
> > > > -
> > > > -        movq      16(%rsp), %r12
> > > > -        cfi_restore(12)
> > > > -        movq      8(%rsp), %r13
> > > > -        cfi_restore(13)
> > > > -        movq      (%rsp), %r14
> > > > -        cfi_restore(14)
> > > > -        vmovups   128(%rsp), %zmm0
> > > > -
> > > > -/* Go to exit */
> > > > -        jmp       L(EXIT)
> > > > -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> > > > -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> > > > -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> > > > -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> > > > -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> > > > -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> > > > -                                # LOE rbx r12 r13 r14 r15 zmm0
> > > > -
> > > > -/* Scalar math fucntion call
> > > > - * to process special input
> > > > - */
> > > > +       incl    %r12d
> > > > +       cmpl    $16, %r12d
> > > > +
> > > > +       /* Check bits in range mask.  */
> > > > +       jl      L(RANGEMASK_CHECK)
> > > > +
> > > > +       movq    16(%rsp), %r12
> > > > +       cfi_restore (12)
> > > > +       movq    8(%rsp), %r13
> > > > +       cfi_restore (13)
> > > > +       movq    (%rsp), %r14
> > > > +       cfi_restore (14)
> > > > +       vmovups 128(%rsp), %zmm0
> > > > +
> > > > +       /* Go to exit.  */
> > > > +       jmp     L(EXIT)
> > > > +       /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s:
> > > > +          -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus).  */
> > > > +       .cfi_escape 0x10 , 0x0c , 0x0e , 0x38 , 0x1c , 0x0d , 0xc0 , 0xff , 0xff , 0xff , 0x1a , 0x0d , 0x50 , 0xff , 0xff , 0xff , 0x22
> > > > +       /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s:
> > > > +          -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus).  */
> > > > +       .cfi_escape 0x10 , 0x0d , 0x0e , 0x38 , 0x1c , 0x0d , 0xc0 , 0xff , 0xff , 0xff , 0x1a , 0x0d , 0x48 , 0xff , 0xff , 0xff , 0x22
> > > > +       /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s:
> > > > +          -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus).  */
> > > > +       .cfi_escape 0x10 , 0x0e , 0x0e , 0x38 , 0x1c , 0x0d , 0xc0 , 0xff , 0xff , 0xff , 0x1a , 0x0d , 0x40 , 0xff , 0xff , 0xff , 0x22
> > > > +
> > > > +       /* Scalar math fucntion call to process special input.  */
> > > >
> > > >  L(SCALAR_MATH_CALL):
> > > > -        movl      %r12d, %r14d
> > > > -        movss     64(%rsp,%r14,4), %xmm0
> > > > -        call      atanhf@PLT
> > > > -                                # LOE rbx r14 r15 r12d r13d xmm0
> > > > +       movl    %r12d, %r14d
> > > > +       movss   64(%rsp, %r14, 4), %xmm0
> > > > +       call    atanhf@PLT
> > > >
> > > > -        movss     %xmm0, 128(%rsp,%r14,4)
> > > > +       movss   %xmm0, 128(%rsp, %r14, 4)
> > > >
> > > > -/* Process special inputs in loop */
> > > > -        jmp       L(SPECIAL_VALUES_LOOP)
> > > > -                                # LOE rbx r15 r12d r13d
> > > > +       /* Process special inputs in loop.  */
> > > > +       jmp     L(SPECIAL_VALUES_LOOP)
> > > >  END(_ZGVeN16v_atanhf_skx)
> > > >
> > > > -        .section .rodata, "a"
> > > > -        .align 64
> > > > +       .section .rodata, "a"
> > > > +       .align  64
> > > >
> > > >  #ifdef __svml_satanh_data_internal_avx512_typedef
> > > > -typedef unsigned int VUINT32;
> > > > -typedef struct {
> > > > -        __declspec(align(64)) VUINT32 Log_tbl_H[32][1];
> > > > -        __declspec(align(64)) VUINT32 Log_tbl_L[32][1];
> > > > -        __declspec(align(64)) VUINT32 One[16][1];
> > > > -        __declspec(align(64)) VUINT32 AbsMask[16][1];
> > > > -        __declspec(align(64)) VUINT32 AddB5[16][1];
> > > > -        __declspec(align(64)) VUINT32 RcpBitMask[16][1];
> > > > -        __declspec(align(64)) VUINT32 poly_coeff3[16][1];
> > > > -        __declspec(align(64)) VUINT32 poly_coeff2[16][1];
> > > > -        __declspec(align(64)) VUINT32 poly_coeff1[16][1];
> > > > -        __declspec(align(64)) VUINT32 poly_coeff0[16][1];
> > > > -        __declspec(align(64)) VUINT32 Half[16][1];
> > > > -        __declspec(align(64)) VUINT32 L2H[16][1];
> > > > -        __declspec(align(64)) VUINT32 L2L[16][1];
> > > > -    } __svml_satanh_data_internal_avx512;
> > > > +       typedef unsigned int VUINT32;
> > > > +       typedef struct{
> > > > +       __declspec (align(64))VUINT32 Log_tbl_H[32][1];
> > > > +       __declspec (align(64))VUINT32 Log_tbl_L[32][1];
> > > > +       __declspec (align(64))VUINT32 One[16][1];
> > > > +       __declspec (align(64))VUINT32 AbsMask[16][1];
> > > > +       __declspec (align(64))VUINT32 AddB5[16][1];
> > > > +       __declspec (align(64))VUINT32 RcpBitMask[16][1];
> > > > +       __declspec (align(64))VUINT32 poly_coeff3[16][1];
> > > > +       __declspec (align(64))VUINT32 poly_coeff2[16][1];
> > > > +       __declspec (align(64))VUINT32 poly_coeff1[16][1];
> > > > +       __declspec (align(64))VUINT32 poly_coeff0[16][1];
> > > > +       __declspec (align(64))VUINT32 Half[16][1];
> > > > +       __declspec (align(64))VUINT32 L2H[16][1];
> > > > +       __declspec (align(64))VUINT32 L2L[16][1];
> > > > +       }__svml_satanh_data_internal_avx512;
> > > >  #endif
> > > >  __svml_satanh_data_internal_avx512:
> > > > -        /*== Log_tbl_H ==*/
> > > > -        .long 0x00000000
> > > > -        .long 0x3cfc0000
> > > > -        .long 0x3d780000
> > > > -        .long 0x3db78000
> > > > -        .long 0x3df10000
> > > > -        .long 0x3e14c000
> > > > -        .long 0x3e300000
> > > > -        .long 0x3e4a8000
> > > > -        .long 0x3e648000
> > > > -        .long 0x3e7dc000
> > > > -        .long 0x3e8b4000
> > > > -        .long 0x3e974000
> > > > -        .long 0x3ea30000
> > > > -        .long 0x3eae8000
> > > > -        .long 0x3eb9c000
> > > > -        .long 0x3ec4e000
> > > > -        .long 0x3ecfa000
> > > > -        .long 0x3eda2000
> > > > -        .long 0x3ee48000
> > > > -        .long 0x3eeea000
> > > > -        .long 0x3ef8a000
> > > > -        .long 0x3f013000
> > > > -        .long 0x3f05f000
> > > > -        .long 0x3f0aa000
> > > > -        .long 0x3f0f4000
> > > > -        .long 0x3f13d000
> > > > -        .long 0x3f184000
> > > > -        .long 0x3f1ca000
> > > > -        .long 0x3f20f000
> > > > -        .long 0x3f252000
> > > > -        .long 0x3f295000
> > > > -        .long 0x3f2d7000
> > > > -        /*== Log_tbl_L ==*/
> > > > -        .align 64
> > > > -        .long 0x00000000
> > > > -        .long 0x3726c39e
> > > > -        .long 0x38a30c01
> > > > -        .long 0x37528ae5
> > > > -        .long 0x38e0edc5
> > > > -        .long 0xb8ab41f8
> > > > -        .long 0xb7cf8f58
> > > > -        .long 0x3896a73d
> > > > -        .long 0xb5838656
> > > > -        .long 0x380c36af
> > > > -        .long 0xb8235454
> > > > -        .long 0x3862bae1
> > > > -        .long 0x38c5e10e
> > > > -        .long 0x38dedfac
> > > > -        .long 0x38ebfb5e
> > > > -        .long 0xb8e63c9f
> > > > -        .long 0xb85c1340
> > > > -        .long 0x38777bcd
> > > > -        .long 0xb6038656
> > > > -        .long 0x37d40984
> > > > -        .long 0xb8b85028
> > > > -        .long 0xb8ad5a5a
> > > > -        .long 0x3865c84a
> > > > -        .long 0x38c3d2f5
> > > > -        .long 0x383ebce1
> > > > -        .long 0xb8a1ed76
> > > > -        .long 0xb7a332c4
> > > > -        .long 0xb779654f
> > > > -        .long 0xb8602f73
> > > > -        .long 0x38f85db0
> > > > -        .long 0x37b4996f
> > > > -        .long 0xb8bfb3ca
> > > > -        /*== One ==*/
> > > > -        .align 64
> > > > -        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> > > > -        /*== AbsMask ==*/
> > > > -        .align 64
> > > > -        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> > > > -        /*== AddB5 ==*/
> > > > -        .align 64
> > > > -        .long 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000
> > > > -        /*== RcpBitMask ==*/
> > > > -        .align 64
> > > > -        .long 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> > > > -        /*== poly_coeff3 ==*/
> > > > -        .align 64
> > > > -        .long 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> > > > -        /*== poly_coeff2 ==*/
> > > > -        .align 64
> > > > -        .long 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> > > > -        /*== poly_coeff1 ==*/
> > > > -        .align 64
> > > > -        .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> > > > -        /*== poly_coeff0 ==*/
> > > > -        .align 64
> > > > -        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> > > > -        /*== Half ==*/
> > > > -        .align 64
> > > > -        .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
> > > > -        /*== L2H = log(2)_high ==*/
> > > > -        .align 64
> > > > -        .long 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> > > > -        /*== L2L = log(2)_low ==*/
> > > > -        .align 64
> > > > -        .long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> > > > -        .align 64
> > > > -        .type  __svml_satanh_data_internal_avx512,@object
> > > > -        .size  __svml_satanh_data_internal_avx512,.-__svml_satanh_data_internal_avx512
> > > > +       /* == Log_tbl_H ==.  */
> > > > +       .long   0x00000000
> > > > +       .long   0x3cfc0000
> > > > +       .long   0x3d780000
> > > > +       .long   0x3db78000
> > > > +       .long   0x3df10000
> > > > +       .long   0x3e14c000
> > > > +       .long   0x3e300000
> > > > +       .long   0x3e4a8000
> > > > +       .long   0x3e648000
> > > > +       .long   0x3e7dc000
> > > > +       .long   0x3e8b4000
> > > > +       .long   0x3e974000
> > > > +       .long   0x3ea30000
> > > > +       .long   0x3eae8000
> > > > +       .long   0x3eb9c000
> > > > +       .long   0x3ec4e000
> > > > +       .long   0x3ecfa000
> > > > +       .long   0x3eda2000
> > > > +       .long   0x3ee48000
> > > > +       .long   0x3eeea000
> > > > +       .long   0x3ef8a000
> > > > +       .long   0x3f013000
> > > > +       .long   0x3f05f000
> > > > +       .long   0x3f0aa000
> > > > +       .long   0x3f0f4000
> > > > +       .long   0x3f13d000
> > > > +       .long   0x3f184000
> > > > +       .long   0x3f1ca000
> > > > +       .long   0x3f20f000
> > > > +       .long   0x3f252000
> > > > +       .long   0x3f295000
> > > > +       .long   0x3f2d7000
> > > > +       /* == Log_tbl_L ==.  */
> > > > +       .align  64
> > > > +       .long   0x00000000
> > > > +       .long   0x3726c39e
> > > > +       .long   0x38a30c01
> > > > +       .long   0x37528ae5
> > > > +       .long   0x38e0edc5
> > > > +       .long   0xb8ab41f8
> > > > +       .long   0xb7cf8f58
> > > > +       .long   0x3896a73d
> > > > +       .long   0xb5838656
> > > > +       .long   0x380c36af
> > > > +       .long   0xb8235454
> > > > +       .long   0x3862bae1
> > > > +       .long   0x38c5e10e
> > > > +       .long   0x38dedfac
> > > > +       .long   0x38ebfb5e
> > > > +       .long   0xb8e63c9f
> > > > +       .long   0xb85c1340
> > > > +       .long   0x38777bcd
> > > > +       .long   0xb6038656
> > > > +       .long   0x37d40984
> > > > +       .long   0xb8b85028
> > > > +       .long   0xb8ad5a5a
> > > > +       .long   0x3865c84a
> > > > +       .long   0x38c3d2f5
> > > > +       .long   0x383ebce1
> > > > +       .long   0xb8a1ed76
> > > > +       .long   0xb7a332c4
> > > > +       .long   0xb779654f
> > > > +       .long   0xb8602f73
> > > > +       .long   0x38f85db0
> > > > +       .long   0x37b4996f
> > > > +       .long   0xb8bfb3ca
> > > > +       /* == One ==.  */
> > > > +       .align  64
> > > > +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> > > > +       /* == AbsMask ==.  */
> > > > +       .align  64
> > > > +       .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
> > > > +       /* == AddB5 ==.  */
> > > > +       .align  64
> > > > +       .long   0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000
> > > > +       /* == RcpBitMask ==.  */
> > > > +       .align  64
> > > > +       .long   0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
> > > > +       /* == poly_coeff3 ==.  */
> > > > +       .align  64
> > > > +       .long   0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
> > > > +       /* == poly_coeff2 ==.  */
> > > > +       .align  64
> > > > +       .long   0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
> > > > +       /* == poly_coeff1 ==.  */
> > > > +       .align  64
> > > > +       .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
> > > > +       /* == poly_coeff0 ==.  */
> > > > +       .align  64
> > > > +       .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
> > > > +       /* == Half ==.  */
> > > > +       .align  64
> > > > +       .long   0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
> > > > +       /* == L2H = log(2)_high ==.  */
> > > > +       .align  64
> > > > +       .long   0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
> > > > +       /* == L2L = log(2)_low ==.  */
> > > > +       .align  64
> > > > +       .long   0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
> > > > +       .align  64
> > > > +       .type   __svml_satanh_data_internal_avx512, @object
> > > > +       .size   __svml_satanh_data_internal_avx512, .-__svml_satanh_data_internal_avx512
> > > > --
> > > > 2.25.1
> > > >
diff mbox series

Patch

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
index f863f4f959..ed90a427a6 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
@@ -33,361 +33,348 @@ 
 
 /* Offsets for data table __svml_satanh_data_internal_avx512
  */
-#define Log_tbl_H                     	0
-#define Log_tbl_L                     	128
-#define One                           	256
-#define AbsMask                       	320
-#define AddB5                         	384
-#define RcpBitMask                    	448
-#define poly_coeff3                   	512
-#define poly_coeff2                   	576
-#define poly_coeff1                   	640
-#define poly_coeff0                   	704
-#define Half                          	768
-#define L2H                           	832
-#define L2L                           	896
+#define Log_tbl_H	0
+#define Log_tbl_L	128
+#define One	256
+#define AbsMask	320
+#define AddB5	384
+#define RcpBitMask	448
+#define poly_coeff3	512
+#define poly_coeff2	576
+#define poly_coeff1	640
+#define poly_coeff0	704
+#define Half	768
+#define L2H	832
+#define L2L	896
 
 #include <sysdep.h>
 
-        .text
-	.section .text.exex512,"ax",@progbits
+	.text
+	.section .text.exex512, "ax", @progbits
 ENTRY(_ZGVeN16v_atanhf_skx)
-        pushq     %rbp
-        cfi_def_cfa_offset(16)
-        movq      %rsp, %rbp
-        cfi_def_cfa(6, 16)
-        cfi_offset(6, -16)
-        andq      $-64, %rsp
-        subq      $192, %rsp
-        vmovups   One+__svml_satanh_data_internal_avx512(%rip), %zmm4
-
-/* round reciprocals to 1+5b mantissas */
-        vmovups   AddB5+__svml_satanh_data_internal_avx512(%rip), %zmm14
-        vmovups   RcpBitMask+__svml_satanh_data_internal_avx512(%rip), %zmm1
-        vmovaps   %zmm0, %zmm11
-        vandps    AbsMask+__svml_satanh_data_internal_avx512(%rip), %zmm11, %zmm6
-
-/* 1+y */
-        vaddps    {rn-sae}, %zmm4, %zmm6, %zmm9
-
-/* 1-y */
-        vsubps    {rn-sae}, %zmm6, %zmm4, %zmm8
-        vxorps    %zmm6, %zmm11, %zmm10
-
-/* Yp_high */
-        vsubps    {rn-sae}, %zmm4, %zmm9, %zmm2
-
-/* -Ym_high */
-        vsubps    {rn-sae}, %zmm4, %zmm8, %zmm5
-
-/* RcpP ~ 1/Yp */
-        vrcp14ps  %zmm9, %zmm12
-
-/* RcpM ~ 1/Ym */
-        vrcp14ps  %zmm8, %zmm13
-
-/* input outside (-1, 1) ? */
-        vcmpps    $21, {sae}, %zmm4, %zmm6, %k0
-        vpaddd    %zmm14, %zmm12, %zmm15
-        vpaddd    %zmm14, %zmm13, %zmm0
-
-/* Yp_low */
-        vsubps    {rn-sae}, %zmm2, %zmm6, %zmm3
-        vandps    %zmm1, %zmm15, %zmm7
-        vandps    %zmm1, %zmm0, %zmm12
-
-/* Ym_low */
-        vaddps    {rn-sae}, %zmm5, %zmm6, %zmm5
-
-/* Reduced argument: Rp = (RcpP*Yp - 1)+RcpP*Yp_low */
-        vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9
-
-/* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low */
-        vfmsub231ps {rn-sae}, %zmm12, %zmm8, %zmm4
-        vmovups   Log_tbl_L+__svml_satanh_data_internal_avx512(%rip), %zmm8
-        vmovups   Log_tbl_L+64+__svml_satanh_data_internal_avx512(%rip), %zmm13
-
-/* exponents */
-        vgetexpps {sae}, %zmm7, %zmm15
-        vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9
-
-/* Table lookups */
-        vmovups   __svml_satanh_data_internal_avx512(%rip), %zmm6
-        vgetexpps {sae}, %zmm12, %zmm14
-        vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm4
-
-/* Prepare table index */
-        vpsrld    $18, %zmm7, %zmm3
-        vpsrld    $18, %zmm12, %zmm2
-        vmovups   Log_tbl_H+64+__svml_satanh_data_internal_avx512(%rip), %zmm7
-        vmovups   poly_coeff1+__svml_satanh_data_internal_avx512(%rip), %zmm12
-
-/* Km-Kp */
-        vsubps    {rn-sae}, %zmm15, %zmm14, %zmm1
-        kmovw     %k0, %edx
-        vmovaps   %zmm3, %zmm0
-        vpermi2ps %zmm13, %zmm8, %zmm3
-        vpermt2ps %zmm13, %zmm2, %zmm8
-        vpermi2ps %zmm7, %zmm6, %zmm0
-        vpermt2ps %zmm7, %zmm2, %zmm6
-        vsubps    {rn-sae}, %zmm3, %zmm8, %zmm5
-
-/* K*L2H + Th */
-        vmovups   L2H+__svml_satanh_data_internal_avx512(%rip), %zmm2
-
-/* K*L2L + Tl */
-        vmovups   L2L+__svml_satanh_data_internal_avx512(%rip), %zmm3
-
-/* polynomials */
-        vmovups   poly_coeff3+__svml_satanh_data_internal_avx512(%rip), %zmm7
-        vmovups   poly_coeff0+__svml_satanh_data_internal_avx512(%rip), %zmm13
-
-/* table values */
-        vsubps    {rn-sae}, %zmm0, %zmm6, %zmm0
-        vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm0
-        vfmadd213ps {rn-sae}, %zmm5, %zmm3, %zmm1
-        vmovups   poly_coeff2+__svml_satanh_data_internal_avx512(%rip), %zmm3
-        vmovaps   %zmm3, %zmm2
-        vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm2
-        vfmadd231ps {rn-sae}, %zmm4, %zmm7, %zmm3
-        vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm2
-        vfmadd213ps {rn-sae}, %zmm12, %zmm4, %zmm3
-        vfmadd213ps {rn-sae}, %zmm13, %zmm9, %zmm2
-        vfmadd213ps {rn-sae}, %zmm13, %zmm4, %zmm3
-
-/* (K*L2L + Tl) + Rp*PolyP */
-        vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm2
-        vorps     Half+__svml_satanh_data_internal_avx512(%rip), %zmm10, %zmm9
-
-/* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM */
-        vfnmadd213ps {rn-sae}, %zmm2, %zmm4, %zmm3
-        vaddps    {rn-sae}, %zmm3, %zmm0, %zmm4
-        vmulps    {rn-sae}, %zmm9, %zmm4, %zmm0
-        testl     %edx, %edx
-
-/* Go to special inputs processing branch */
-        jne       L(SPECIAL_VALUES_BRANCH)
-                                # LOE rbx r12 r13 r14 r15 edx zmm0 zmm11
-
-/* Restore registers
- * and exit the function
- */
+	pushq	%rbp
+	cfi_def_cfa_offset (16)
+	movq	%rsp, %rbp
+	cfi_def_cfa (6, 16)
+	cfi_offset (6, -16)
+	andq	$-64, %rsp
+	subq	$192, %rsp
+	vmovups	One + __svml_satanh_data_internal_avx512(%rip), %zmm4
+
+	/* round reciprocals to 1+5b mantissas.  */
+	vmovups	AddB5 + __svml_satanh_data_internal_avx512(%rip), %zmm14
+	vmovups	RcpBitMask + __svml_satanh_data_internal_avx512(%rip), %zmm1
+	vmovaps	%zmm0, %zmm11
+	vandps	AbsMask + __svml_satanh_data_internal_avx512(%rip), %zmm11, %zmm6
+
+	/* 1+y.  */
+	vaddps	{rn-sae}, %zmm4, %zmm6, %zmm9
+
+	/* 1-y.  */
+	vsubps	{rn-sae}, %zmm6, %zmm4, %zmm8
+	vxorps	%zmm6, %zmm11, %zmm10
+
+	/* Yp_high.  */
+	vsubps	{rn-sae}, %zmm4, %zmm9, %zmm2
+
+	/* -Ym_high.  */
+	vsubps	{rn-sae}, %zmm4, %zmm8, %zmm5
+
+	/* RcpP ~ 1/Yp.  */
+	vrcp14ps %zmm9, %zmm12
+
+	/* RcpM ~ 1/Ym.  */
+	vrcp14ps %zmm8, %zmm13
+
+	/* input outside (-1, 1) ?.  */
+	vcmpps	$21, {sae}, %zmm4, %zmm6, %k0
+	vpaddd	%zmm14, %zmm12, %zmm15
+	vpaddd	%zmm14, %zmm13, %zmm0
+
+	/* Yp_low.  */
+	vsubps	{rn-sae}, %zmm2, %zmm6, %zmm3
+	vandps	%zmm1, %zmm15, %zmm7
+	vandps	%zmm1, %zmm0, %zmm12
+
+	/* Ym_low.  */
+	vaddps	{rn-sae}, %zmm5, %zmm6, %zmm5
+
+	/* Reduced argument: Rp = (RcpP*Yp - 1)+RcpP*Yp_low.  */
+	vfmsub213ps {rn-sae}, %zmm4, %zmm7, %zmm9
+
+	/* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low.  */
+	vfmsub231ps {rn-sae}, %zmm12, %zmm8, %zmm4
+	vmovups	Log_tbl_L + __svml_satanh_data_internal_avx512(%rip), %zmm8
+	vmovups	Log_tbl_L + 64 + __svml_satanh_data_internal_avx512(%rip), %zmm13
+
+	/* exponents.  */
+	vgetexpps {sae}, %zmm7, %zmm15
+	vfmadd231ps {rn-sae}, %zmm7, %zmm3, %zmm9
+
+	/* Table lookups.  */
+	vmovups	__svml_satanh_data_internal_avx512(%rip), %zmm6
+	vgetexpps {sae}, %zmm12, %zmm14
+	vfnmadd231ps {rn-sae}, %zmm12, %zmm5, %zmm4
+
+	/* Prepare table index.  */
+	vpsrld	$18, %zmm7, %zmm3
+	vpsrld	$18, %zmm12, %zmm2
+	vmovups	Log_tbl_H + 64 + __svml_satanh_data_internal_avx512(%rip), %zmm7
+	vmovups	poly_coeff1 + __svml_satanh_data_internal_avx512(%rip), %zmm12
+
+	/* Km-Kp.  */
+	vsubps	{rn-sae}, %zmm15, %zmm14, %zmm1
+	kmovw	%k0, %edx
+	vmovaps	%zmm3, %zmm0
+	vpermi2ps %zmm13, %zmm8, %zmm3
+	vpermt2ps %zmm13, %zmm2, %zmm8
+	vpermi2ps %zmm7, %zmm6, %zmm0
+	vpermt2ps %zmm7, %zmm2, %zmm6
+	vsubps	{rn-sae}, %zmm3, %zmm8, %zmm5
+
+	/* K*L2H + Th.  */
+	vmovups	L2H + __svml_satanh_data_internal_avx512(%rip), %zmm2
+
+	/* K*L2L + Tl.  */
+	vmovups	L2L + __svml_satanh_data_internal_avx512(%rip), %zmm3
+
+	/* polynomials.  */
+	vmovups	poly_coeff3 + __svml_satanh_data_internal_avx512(%rip), %zmm7
+	vmovups	poly_coeff0 + __svml_satanh_data_internal_avx512(%rip), %zmm13
+
+	/* table values.  */
+	vsubps	{rn-sae}, %zmm0, %zmm6, %zmm0
+	vfmadd231ps {rn-sae}, %zmm1, %zmm2, %zmm0
+	vfmadd213ps {rn-sae}, %zmm5, %zmm3, %zmm1
+	vmovups	poly_coeff2 + __svml_satanh_data_internal_avx512(%rip), %zmm3
+	vmovaps	%zmm3, %zmm2
+	vfmadd231ps {rn-sae}, %zmm9, %zmm7, %zmm2
+	vfmadd231ps {rn-sae}, %zmm4, %zmm7, %zmm3
+	vfmadd213ps {rn-sae}, %zmm12, %zmm9, %zmm2
+	vfmadd213ps {rn-sae}, %zmm12, %zmm4, %zmm3
+	vfmadd213ps {rn-sae}, %zmm13, %zmm9, %zmm2
+	vfmadd213ps {rn-sae}, %zmm13, %zmm4, %zmm3
+
+	/* (K*L2L + Tl) + Rp*PolyP.  */
+	vfmadd213ps {rn-sae}, %zmm1, %zmm9, %zmm2
+	vorps	Half + __svml_satanh_data_internal_avx512(%rip), %zmm10, %zmm9
+
+	/* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM.  */
+	vfnmadd213ps {rn-sae}, %zmm2, %zmm4, %zmm3
+	vaddps	{rn-sae}, %zmm3, %zmm0, %zmm4
+	vmulps	{rn-sae}, %zmm9, %zmm4, %zmm0
+	testl	%edx, %edx
+
+	/* Go to special inputs processing branch.  */
+	jne	L(SPECIAL_VALUES_BRANCH)
+
+	/* Restore registers * and exit the function.  */
 
 L(EXIT):
-        movq      %rbp, %rsp
-        popq      %rbp
-        cfi_def_cfa(7, 8)
-        cfi_restore(6)
-        ret
-        cfi_def_cfa(6, 16)
-        cfi_offset(6, -16)
-
-/* Branch to process
- * special inputs
- */
+	movq	%rbp, %rsp
+	popq	%rbp
+	cfi_def_cfa (7, 8)
+	cfi_restore (6)
+	ret
+	cfi_def_cfa (6, 16)
+	cfi_offset (6, -16)
+
+	/* Branch to process special inputs.  */
 
 L(SPECIAL_VALUES_BRANCH):
-        vmovups   %zmm11, 64(%rsp)
-        vmovups   %zmm0, 128(%rsp)
-                                # LOE rbx r12 r13 r14 r15 edx zmm0
-
-        xorl      %eax, %eax
-                                # LOE rbx r12 r13 r14 r15 eax edx
-
-        vzeroupper
-        movq      %r12, 16(%rsp)
-        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
-        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
-        movl      %eax, %r12d
-        movq      %r13, 8(%rsp)
-        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
-        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-        movl      %edx, %r13d
-        movq      %r14, (%rsp)
-        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
-        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-                                # LOE rbx r15 r12d r13d
-
-/* Range mask
- * bits check
- */
+	vmovups	%zmm11, 64(%rsp)
+	vmovups	%zmm0, 128(%rsp)
+
+	xorl	%eax, %eax
+
+	vzeroupper
+	movq	%r12, 16(%rsp)
+	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s:
+	   -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus).  */
+	.cfi_escape 0x10 , 0x0c , 0x0e , 0x38 , 0x1c , 0x0d , 0xc0 , 0xff , 0xff , 0xff , 0x1a , 0x0d , 0x50 , 0xff , 0xff , 0xff , 0x22
+	movl	%eax, %r12d
+	movq	%r13, 8(%rsp)
+	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s:
+	   -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus).  */
+	.cfi_escape 0x10 , 0x0d , 0x0e , 0x38 , 0x1c , 0x0d , 0xc0 , 0xff , 0xff , 0xff , 0x1a , 0x0d , 0x48 , 0xff , 0xff , 0xff , 0x22
+	movl	%edx, %r13d
+	movq	%r14, (%rsp)
+	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s:
+	   -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus).  */
+	.cfi_escape 0x10 , 0x0e , 0x0e , 0x38 , 0x1c , 0x0d , 0xc0 , 0xff , 0xff , 0xff , 0x1a , 0x0d , 0x40 , 0xff , 0xff , 0xff , 0x22
+
+	/* Range mask * bits check.  */
 
 L(RANGEMASK_CHECK):
-        btl       %r12d, %r13d
+	btl	%r12d, %r13d
 
-/* Call scalar math function */
-        jc        L(SCALAR_MATH_CALL)
-                                # LOE rbx r15 r12d r13d
+	/* Call scalar math function.  */
+	jc	L(SCALAR_MATH_CALL)
 
-/* Special inputs
- * processing loop
- */
+	/* Special inputs processing loop.  */
 
 L(SPECIAL_VALUES_LOOP):
-        incl      %r12d
-        cmpl      $16, %r12d
-
-/* Check bits in range mask */
-        jl        L(RANGEMASK_CHECK)
-                                # LOE rbx r15 r12d r13d
-
-        movq      16(%rsp), %r12
-        cfi_restore(12)
-        movq      8(%rsp), %r13
-        cfi_restore(13)
-        movq      (%rsp), %r14
-        cfi_restore(14)
-        vmovups   128(%rsp), %zmm0
-
-/* Go to exit */
-        jmp       L(EXIT)
-        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
-        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
-        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
-        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
-        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-                                # LOE rbx r12 r13 r14 r15 zmm0
-
-/* Scalar math fucntion call
- * to process special input
- */
+	incl	%r12d
+	cmpl	$16, %r12d
+
+	/* Check bits in range mask.  */
+	jl	L(RANGEMASK_CHECK)
+
+	movq	16(%rsp), %r12
+	cfi_restore (12)
+	movq	8(%rsp), %r13
+	cfi_restore (13)
+	movq	(%rsp), %r14
+	cfi_restore (14)
+	vmovups	128(%rsp), %zmm0
+
+	/* Go to exit.  */
+	jmp	L(EXIT)
+	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s:
+	   -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus).  */
+	.cfi_escape 0x10 , 0x0c , 0x0e , 0x38 , 0x1c , 0x0d , 0xc0 , 0xff , 0xff , 0xff , 0x1a , 0x0d , 0x50 , 0xff , 0xff , 0xff , 0x22
+	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s:
+	   -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus).  */
+	.cfi_escape 0x10 , 0x0d , 0x0e , 0x38 , 0x1c , 0x0d , 0xc0 , 0xff , 0xff , 0xff , 0x1a , 0x0d , 0x48 , 0xff , 0xff , 0xff , 0x22
+	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s:
+	   -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus).  */
+	.cfi_escape 0x10 , 0x0e , 0x0e , 0x38 , 0x1c , 0x0d , 0xc0 , 0xff , 0xff , 0xff , 0x1a , 0x0d , 0x40 , 0xff , 0xff , 0xff , 0x22
+
+	/* Scalar math fucntion call to process special input.  */
 
 L(SCALAR_MATH_CALL):
-        movl      %r12d, %r14d
-        movss     64(%rsp,%r14,4), %xmm0
-        call      atanhf@PLT
-                                # LOE rbx r14 r15 r12d r13d xmm0
+	movl	%r12d, %r14d
+	movss	64(%rsp, %r14, 4), %xmm0
+	call	atanhf@PLT
 
-        movss     %xmm0, 128(%rsp,%r14,4)
+	movss	%xmm0, 128(%rsp, %r14, 4)
 
-/* Process special inputs in loop */
-        jmp       L(SPECIAL_VALUES_LOOP)
-                                # LOE rbx r15 r12d r13d
+	/* Process special inputs in loop.  */
+	jmp	L(SPECIAL_VALUES_LOOP)
 END(_ZGVeN16v_atanhf_skx)
 
-        .section .rodata, "a"
-        .align 64
+	.section .rodata, "a"
+	.align	64
 
 #ifdef __svml_satanh_data_internal_avx512_typedef
-typedef unsigned int VUINT32;
-typedef struct {
-        __declspec(align(64)) VUINT32 Log_tbl_H[32][1];
-        __declspec(align(64)) VUINT32 Log_tbl_L[32][1];
-        __declspec(align(64)) VUINT32 One[16][1];
-        __declspec(align(64)) VUINT32 AbsMask[16][1];
-        __declspec(align(64)) VUINT32 AddB5[16][1];
-        __declspec(align(64)) VUINT32 RcpBitMask[16][1];
-        __declspec(align(64)) VUINT32 poly_coeff3[16][1];
-        __declspec(align(64)) VUINT32 poly_coeff2[16][1];
-        __declspec(align(64)) VUINT32 poly_coeff1[16][1];
-        __declspec(align(64)) VUINT32 poly_coeff0[16][1];
-        __declspec(align(64)) VUINT32 Half[16][1];
-        __declspec(align(64)) VUINT32 L2H[16][1];
-        __declspec(align(64)) VUINT32 L2L[16][1];
-    } __svml_satanh_data_internal_avx512;
+	typedef	unsigned int VUINT32;
+	typedef	struct{
+	__declspec (align(64))VUINT32 Log_tbl_H[32][1];
+	__declspec (align(64))VUINT32 Log_tbl_L[32][1];
+	__declspec (align(64))VUINT32 One[16][1];
+	__declspec (align(64))VUINT32 AbsMask[16][1];
+	__declspec (align(64))VUINT32 AddB5[16][1];
+	__declspec (align(64))VUINT32 RcpBitMask[16][1];
+	__declspec (align(64))VUINT32 poly_coeff3[16][1];
+	__declspec (align(64))VUINT32 poly_coeff2[16][1];
+	__declspec (align(64))VUINT32 poly_coeff1[16][1];
+	__declspec (align(64))VUINT32 poly_coeff0[16][1];
+	__declspec (align(64))VUINT32 Half[16][1];
+	__declspec (align(64))VUINT32 L2H[16][1];
+	__declspec (align(64))VUINT32 L2L[16][1];
+	}__svml_satanh_data_internal_avx512;
 #endif
 __svml_satanh_data_internal_avx512:
-        /*== Log_tbl_H ==*/
-        .long 0x00000000
-        .long 0x3cfc0000
-        .long 0x3d780000
-        .long 0x3db78000
-        .long 0x3df10000
-        .long 0x3e14c000
-        .long 0x3e300000
-        .long 0x3e4a8000
-        .long 0x3e648000
-        .long 0x3e7dc000
-        .long 0x3e8b4000
-        .long 0x3e974000
-        .long 0x3ea30000
-        .long 0x3eae8000
-        .long 0x3eb9c000
-        .long 0x3ec4e000
-        .long 0x3ecfa000
-        .long 0x3eda2000
-        .long 0x3ee48000
-        .long 0x3eeea000
-        .long 0x3ef8a000
-        .long 0x3f013000
-        .long 0x3f05f000
-        .long 0x3f0aa000
-        .long 0x3f0f4000
-        .long 0x3f13d000
-        .long 0x3f184000
-        .long 0x3f1ca000
-        .long 0x3f20f000
-        .long 0x3f252000
-        .long 0x3f295000
-        .long 0x3f2d7000
-        /*== Log_tbl_L ==*/
-        .align 64
-        .long 0x00000000
-        .long 0x3726c39e
-        .long 0x38a30c01
-        .long 0x37528ae5
-        .long 0x38e0edc5
-        .long 0xb8ab41f8
-        .long 0xb7cf8f58
-        .long 0x3896a73d
-        .long 0xb5838656
-        .long 0x380c36af
-        .long 0xb8235454
-        .long 0x3862bae1
-        .long 0x38c5e10e
-        .long 0x38dedfac
-        .long 0x38ebfb5e
-        .long 0xb8e63c9f
-        .long 0xb85c1340
-        .long 0x38777bcd
-        .long 0xb6038656
-        .long 0x37d40984
-        .long 0xb8b85028
-        .long 0xb8ad5a5a
-        .long 0x3865c84a
-        .long 0x38c3d2f5
-        .long 0x383ebce1
-        .long 0xb8a1ed76
-        .long 0xb7a332c4
-        .long 0xb779654f
-        .long 0xb8602f73
-        .long 0x38f85db0
-        .long 0x37b4996f
-        .long 0xb8bfb3ca
-        /*== One ==*/
-        .align 64
-        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-        /*== AbsMask ==*/
-        .align 64
-        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
-        /*== AddB5 ==*/
-        .align 64
-        .long 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000
-        /*== RcpBitMask ==*/
-        .align 64
-        .long 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
-        /*== poly_coeff3 ==*/
-        .align 64
-        .long 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
-        /*== poly_coeff2 ==*/
-        .align 64
-        .long 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
-        /*== poly_coeff1 ==*/
-        .align 64
-        .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
-        /*== poly_coeff0 ==*/
-        .align 64
-        .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
-        /*== Half ==*/
-        .align 64
-        .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
-        /*== L2H = log(2)_high ==*/
-        .align 64
-        .long 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
-        /*== L2L = log(2)_low ==*/
-        .align 64
-        .long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
-        .align 64
-        .type	__svml_satanh_data_internal_avx512,@object
-        .size	__svml_satanh_data_internal_avx512,.-__svml_satanh_data_internal_avx512
+	/* == Log_tbl_H ==.  */
+	.long	0x00000000
+	.long	0x3cfc0000
+	.long	0x3d780000
+	.long	0x3db78000
+	.long	0x3df10000
+	.long	0x3e14c000
+	.long	0x3e300000
+	.long	0x3e4a8000
+	.long	0x3e648000
+	.long	0x3e7dc000
+	.long	0x3e8b4000
+	.long	0x3e974000
+	.long	0x3ea30000
+	.long	0x3eae8000
+	.long	0x3eb9c000
+	.long	0x3ec4e000
+	.long	0x3ecfa000
+	.long	0x3eda2000
+	.long	0x3ee48000
+	.long	0x3eeea000
+	.long	0x3ef8a000
+	.long	0x3f013000
+	.long	0x3f05f000
+	.long	0x3f0aa000
+	.long	0x3f0f4000
+	.long	0x3f13d000
+	.long	0x3f184000
+	.long	0x3f1ca000
+	.long	0x3f20f000
+	.long	0x3f252000
+	.long	0x3f295000
+	.long	0x3f2d7000
+	/* == Log_tbl_L ==.  */
+	.align	64
+	.long	0x00000000
+	.long	0x3726c39e
+	.long	0x38a30c01
+	.long	0x37528ae5
+	.long	0x38e0edc5
+	.long	0xb8ab41f8
+	.long	0xb7cf8f58
+	.long	0x3896a73d
+	.long	0xb5838656
+	.long	0x380c36af
+	.long	0xb8235454
+	.long	0x3862bae1
+	.long	0x38c5e10e
+	.long	0x38dedfac
+	.long	0x38ebfb5e
+	.long	0xb8e63c9f
+	.long	0xb85c1340
+	.long	0x38777bcd
+	.long	0xb6038656
+	.long	0x37d40984
+	.long	0xb8b85028
+	.long	0xb8ad5a5a
+	.long	0x3865c84a
+	.long	0x38c3d2f5
+	.long	0x383ebce1
+	.long	0xb8a1ed76
+	.long	0xb7a332c4
+	.long	0xb779654f
+	.long	0xb8602f73
+	.long	0x38f85db0
+	.long	0x37b4996f
+	.long	0xb8bfb3ca
+	/* == One ==.  */
+	.align	64
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	/* == AbsMask ==.  */
+	.align	64
+	.long	0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
+	/* == AddB5 ==.  */
+	.align	64
+	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000, 0x00020000
+	/* == RcpBitMask ==.  */
+	.align	64
+	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
+	/* == poly_coeff3 ==.  */
+	.align	64
+	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
+	/* == poly_coeff2 ==.  */
+	.align	64
+	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
+	/* == poly_coeff1 ==.  */
+	.align	64
+	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
+	/* == poly_coeff0 ==.  */
+	.align	64
+	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
+	/* == Half ==.  */
+	.align	64
+	.long	0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
+	/* == L2H = log(2)_high ==.  */
+	.align	64
+	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
+	/* == L2L = log(2)_low ==.  */
+	.align	64
+	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
+	.align	64
+	.type	__svml_satanh_data_internal_avx512, @object
+	.size	__svml_satanh_data_internal_avx512, .-__svml_satanh_data_internal_avx512