diff mbox series

[v1] x86: Optimize svml_s_tanhf_core_{sse4|avx2|avx512}.S

Message ID 20220130043635.2494108-1-goldstein.w.n@gmail.com
State New
Headers show
Series [v1] x86: Optimize svml_s_tanhf_core_{sse4|avx2|avx512}.S | expand

Commit Message

Noah Goldstein Jan. 30, 2022, 4:36 a.m. UTC
No bug.

Optimizations are:
    1. Reduce code size
        avx512: -56 bytes
        avx2:   -70 bytes
        sse4:   -106 bytes
    2. Reduce rodata size
        avx512: -448 bytes
        avx2:   -32 bytes
        sse4:   -4k+ (shares rodata with avx2)
    3. Remove register save/restores and stack adjustment from the
       fast path.
    4. Slightly better instruction selection where possible.

This results in roughly a 15% performance improvement for all
functions.

Results from geomean of 40 benchtest runs:
       Function, New Time, Old Time, New / Old
 _ZGVbN4v_tanhf,     3.28,    3.852,     0.852
 _ZGVcN8v_tanhf,    3.556,    4.192,     0.848
 _ZGVdN8v_tanhf,     2.13,    2.486,     0.857
_ZGVeN16v_tanhf,    0.658,    0.762,     0.864
---
 .../multiarch/svml_s_tanhf16_core_avx512.S    | 585 +++++------
 .../fpu/multiarch/svml_s_tanhf4_core_sse4.S   | 871 +++--------------
 .../fpu/multiarch/svml_s_tanhf8_core_avx2.S   | 908 +++---------------
 3 files changed, 581 insertions(+), 1783 deletions(-)

Comments

Sunil Pandey Feb. 1, 2022, 8:02 p.m. UTC | #1
Hi Noah,

We would like to get this patch, but it's too late for 2.35.

This patch is too big, can you please break this patch into multiple
smaller patches?

Also, it seems like this patch is incomplete. I got a build error on
the glibc master.

./sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S:77:33: fatal
error: svml_s_tanhf_rodata.S: No such file or directory
 #include "svml_s_tanhf_rodata.S"
                                 ^
compilation terminated.
../sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S:74:33: fatal
error: svml_s_tanhf_rodata.S: No such file or directory
 #include "svml_s_tanhf_rodata.S"
                                 ^
compilation terminated.

Thanks,
Sunil






On Sat, Jan 29, 2022 at 8:37 PM Noah Goldstein via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> No bug.
>
> Optimizations are:
>     1. Reduce code size
>         avx512: -56 bytes
>         avx2:   -70 bytes
>         sse4:   -106 bytes
>     2. Reduce rodata size
>         avx512: -448 bytes
>         avx2:   -32 bytes
>         sse4:   -4k+ (shares rodata with avx2)
>     3. Remove register save/restores and stack adjustment from the
>        fast path.
>     4. Slightly better instruction selection where possible.
>
> This results in roughly a 15% performance improvement for all
> functions.
>
> Results from geomean of 40 benchtest runs:
>        Function, New Time, Old Time, New / Old
>  _ZGVbN4v_tanhf,     3.28,    3.852,     0.852
>  _ZGVcN8v_tanhf,    3.556,    4.192,     0.848
>  _ZGVdN8v_tanhf,     2.13,    2.486,     0.857
> _ZGVeN16v_tanhf,    0.658,    0.762,     0.864
> ---
>  .../multiarch/svml_s_tanhf16_core_avx512.S    | 585 +++++------
>  .../fpu/multiarch/svml_s_tanhf4_core_sse4.S   | 871 +++--------------
>  .../fpu/multiarch/svml_s_tanhf8_core_avx2.S   | 908 +++---------------
>  3 files changed, 581 insertions(+), 1783 deletions(-)
>
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> index 8954a5f658..6a2f0c1392 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> @@ -70,312 +70,323 @@
>   *
>   */
>
> -/* Offsets for data table __svml_stanh_data_internal
> - */
> -#define _sC                            0
> -#define _sP0                           128
> -#define _sP2                           256
> -#define _sP3                           384
> -#define _sP4                           512
> -#define _sP5                           640
> -#define _sP6                           768
> -#define _sP7                           896
> -#define _iExpMantMask_UISA             1024
> -#define _iMinIdxOfsMask_UISA           1088
> -#define _iMaxIdxMask_UISA              1152
> -#define _sSignMask                     1216
> -#define _sAbsMask                      1280
> -#define _iExpMantMask                  1344
> -#define _iExpMask                      1408
> -#define _iMinIdxOfsMask                1472
> -#define _iMaxIdxMask                   1536
> -
>  #include <sysdep.h>
>
> +#define TANHF_DATA(offset)     ((offset) + __svml_stanh_data_internal)
> +
> +/* Offsets for data table __svml_stanh_data_internal.  */
> +#define _iExpMantMask_UISA     0
> +#define _iMinIdxOfsMask_UISA   4
> +#define _iMaxIdxMask_UISA      8
> +#define _iExpMask      12
> +#define _sSignMask     64
> +#define _sC_lo 128
> +#define _sC_hi 192
> +#define _sP7_lo        256
> +#define _sP7_hi        320
> +#define _sP6_lo        384
> +#define _sP6_hi        448
> +#define _sP5_lo        512
> +#define _sP5_hi        576
> +#define _sP4_lo        640
> +#define _sP4_hi        704
> +#define _sP3_lo        768
> +#define _sP3_hi        832
> +#define _sP2_lo        896
> +#define _sP2_hi        960
> +#define _sP0_lo        1024
> +#define _sP0_hi        1088
> +
>          .text
>         .section .text.exex512,"ax",@progbits
>  ENTRY(_ZGVeN16v_tanhf_skx)
> -        pushq     %rbp
> -        cfi_def_cfa_offset(16)
> -        movq      %rsp, %rbp
> -        cfi_def_cfa(6, 16)
> -        cfi_offset(6, -16)
> -        andq      $-64, %rsp
> -        subq      $192, %rsp
> -        vmovaps   %zmm0, %zmm1
> -        vmovups   __svml_stanh_data_internal(%rip), %zmm9
> -        vmovups   _sP6+__svml_stanh_data_internal(%rip), %zmm11
> -        vmovups   _sP5+__svml_stanh_data_internal(%rip), %zmm12
> -        vmovups   _sP4+__svml_stanh_data_internal(%rip), %zmm13
> -        vmovups   _sP3+__svml_stanh_data_internal(%rip), %zmm14
> -        vmovups   _sP2+__svml_stanh_data_internal(%rip), %zmm15
> -        vpternlogd $255, %zmm2, %zmm2, %zmm2
> -        vandps    _sAbsMask+__svml_stanh_data_internal(%rip), %zmm1, %zmm8
> -        vandps    _sSignMask+__svml_stanh_data_internal(%rip), %zmm1, %zmm0
> -
> -/* Here huge arguments, INF and NaNs are filtered out to callout. */
> -        vpandd    _iExpMantMask_UISA+__svml_stanh_data_internal(%rip), %zmm1, %zmm3
> -        vpsubd    _iMinIdxOfsMask_UISA+__svml_stanh_data_internal(%rip), %zmm3, %zmm4
> -        vpcmpd    $2, _iExpMask+__svml_stanh_data_internal(%rip), %zmm3, %k1
> +       /* Here huge arguments, INF and NaNs are filtered out to callout.  */
> +       vpandd  TANHF_DATA(_iExpMantMask_UISA)(%rip) {1to16}, %zmm0, %zmm1
> +       vpsubd  TANHF_DATA(_iMinIdxOfsMask_UISA)(%rip) {1to16}, %zmm1, %zmm2
>
> -/*
> - *  small table specific variables *
> - *  Constant loading
> - */
> -        vpxord    %zmm5, %zmm5, %zmm5
> -
> -/* if VMIN, VMAX is defined for I type */
> -        vpmaxsd   %zmm5, %zmm4, %zmm6
> -        vpminsd   _iMaxIdxMask_UISA+__svml_stanh_data_internal(%rip), %zmm6, %zmm7
> -        vpsrld    $21, %zmm7, %zmm10
> -        vmovups   _sP7+__svml_stanh_data_internal(%rip), %zmm4
> -        vpermt2ps _sC+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm9
> -        vpermt2ps _sP6+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm11
> -        vpermt2ps _sP7+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm4
> -        vpermt2ps _sP5+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm12
> -        vpermt2ps _sP4+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm13
> -        vpermt2ps _sP3+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm14
> -        vpermt2ps _sP2+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm15
> -        vpandnd   %zmm3, %zmm3, %zmm2{%k1}
> -        vptestmd  %zmm2, %zmm2, %k0
> -        vmovups   _sP0+__svml_stanh_data_internal(%rip), %zmm3
> -        vsubps    {rn-sae}, %zmm9, %zmm8, %zmm2
> -        kmovw     %k0, %edx
> -        vfmadd213ps {rn-sae}, %zmm11, %zmm2, %zmm4
> -        vpermt2ps _sP0+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm3
> -        vfmadd213ps {rn-sae}, %zmm12, %zmm2, %zmm4
> -        vfmadd213ps {rn-sae}, %zmm13, %zmm2, %zmm4
> -        vfmadd213ps {rn-sae}, %zmm14, %zmm2, %zmm4
> -        vfmadd213ps {rn-sae}, %zmm15, %zmm2, %zmm4
> -        vfmadd213ps {rn-sae}, %zmm3, %zmm2, %zmm4
> -        vorps     %zmm0, %zmm4, %zmm0
> -        testl     %edx, %edx
> -
> -/* Go to special inputs processing branch */
> -        jne       L(SPECIAL_VALUES_BRANCH)
> -                                # LOE rbx r12 r13 r14 r15 edx zmm0 zmm1
> -
> -/* Restore registers
> - * and exit the function
> - */
> +       /* Selection arguments between [0, 0x03e00000] into zmm3.  */
> +       vpxord  %zmm3, %zmm3, %zmm3
> +       vpmaxsd %zmm3, %zmm2, %zmm3
> +       vpminsd TANHF_DATA(_iMaxIdxMask_UISA)(%rip) {1to16}, %zmm3, %zmm3
>
> -L(EXIT):
> -        movq      %rbp, %rsp
> -        popq      %rbp
> -        cfi_def_cfa(7, 8)
> -        cfi_restore(6)
> -        ret
> -        cfi_def_cfa(6, 16)
> -        cfi_offset(6, -16)
> -
> -/* Branch to process
> - * special inputs
> - */
> +       /* Setup permute indices in zmm3.  */
> +       vpsrld  $21, %zmm3, %zmm3
> +
> +       /* Store if there are any special cases in k1.  */
> +       vpcmpd  $6, TANHF_DATA(_iExpMask)(%rip) {1to16}, %zmm1, %k1
> +
> +
> +       /* Store absolute values of inputs in zmm1.  */
> +       vmovaps TANHF_DATA(_sSignMask)(%rip), %zmm4
> +       vandnps %zmm0, %zmm4, %zmm1
> +
> +       vmovaps TANHF_DATA(_sC_lo)(%rip), %zmm5
> +       vpermt2ps TANHF_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> +       vsubps  {rn-sae}, %zmm5, %zmm1, %zmm1
> +
> +       vmovaps TANHF_DATA(_sP7_lo)(%rip), %zmm2
> +       vpermt2ps TANHF_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
>
> +       vmovaps TANHF_DATA(_sP6_lo)(%rip), %zmm5
> +       vpermt2ps TANHF_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> +
> +       vmovaps TANHF_DATA(_sP5_lo)(%rip), %zmm6
> +       vpermt2ps TANHF_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> +
> +       vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm2
> +       vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm2
> +
> +       vmovaps TANHF_DATA(_sP4_lo)(%rip), %zmm7
> +       vpermt2ps TANHF_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> +
> +       vmovaps TANHF_DATA(_sP3_lo)(%rip), %zmm8
> +       vpermt2ps TANHF_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> +
> +       vfmadd213ps {rn-sae}, %zmm7, %zmm1, %zmm2
> +       vfmadd213ps {rn-sae}, %zmm8, %zmm1, %zmm2
> +
> +       vmovaps TANHF_DATA(_sP2_lo)(%rip), %zmm9
> +       vpermt2ps TANHF_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> +
> +       vmovaps TANHF_DATA(_sP0_lo)(%rip), %zmm10
> +       vpermt2ps TANHF_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> +
> +       vfmadd213ps {rn-sae}, %zmm9, %zmm1, %zmm2
> +       vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm2
> +
> +       kmovw   %k1, %edx
> +       testl   %edx, %edx
> +
> +       /* Go to special inputs processing branch.  */
> +       jne     L(SPECIAL_VALUES_BRANCH)
> +       /* Wait until after the branch to write over zmm0.  */
> +       vpternlogd $0xec, %zmm4, %zmm2, %zmm0
> +
> +       /* No stack restoration on the fastpath.  */
> +       ret
> +
> +       /* Branch to process special inputs.  */
>  L(SPECIAL_VALUES_BRANCH):
> -        vmovups   %zmm1, 64(%rsp)
> -        vmovups   %zmm0, 128(%rsp)
> -                                # LOE rbx r12 r13 r14 r15 edx zmm0
> -
> -        xorl      %eax, %eax
> -                                # LOE rbx r12 r13 r14 r15 eax edx
> -
> -        vzeroupper
> -        movq      %r12, 16(%rsp)
> -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> -        movl      %eax, %r12d
> -        movq      %r13, 8(%rsp)
> -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> -        movl      %edx, %r13d
> -        movq      %r14, (%rsp)
> -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> -                                # LOE rbx r15 r12d r13d
> -
> -/* Range mask
> - * bits check
> - */
> +       pushq   %rbp
> +       /* Need to callee save registers to preserve state across tanhf calls.
> +        */
> +       pushq   %r13
> +       pushq   %r12
> +       movq    %rsp, %rbp
>
> -L(RANGEMASK_CHECK):
> -        btl       %r12d, %r13d
> +       /* Align stack and make room for 2x zmm vectors.  */
> +       andq    $-64, %rsp
> +       addq    $-128, %rsp
>
> -/* Call scalar math function */
> -        jc        L(SCALAR_MATH_CALL)
> -                                # LOE rbx r15 r12d r13d
> +       /* Save all already computed inputs.  */
> +       vpternlogd $0xec, %zmm4, %zmm2, %zmm2
> +       vmovaps %zmm2, (%rsp)
> +       /* Save original input (zmm0 unchanged up to this point).  */
> +       vmovaps %zmm0, 64(%rsp)
>
> -/* Special inputs
> - * processing loop
> - */
> +       vzeroupper
>
> +       /* edx has 1s where there was a special value that needs to be handled
> +          by a tanhf call.  */
> +       movl    %edx, %r13d
>  L(SPECIAL_VALUES_LOOP):
> -        incl      %r12d
> -        cmpl      $16, %r12d
> -
> -/* Check bits in range mask */
> -        jl        L(RANGEMASK_CHECK)
> -                                # LOE rbx r15 r12d r13d
> -
> -        movq      16(%rsp), %r12
> -        cfi_restore(12)
> -        movq      8(%rsp), %r13
> -        cfi_restore(13)
> -        movq      (%rsp), %r14
> -        cfi_restore(14)
> -        vmovups   128(%rsp), %zmm0
> -
> -/* Go to exit */
> -        jmp       L(EXIT)
> -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> -                                # LOE rbx r12 r13 r14 r15 zmm0
> -
> -/* Scalar math fucntion call
> - * to process special input
> - */
> +       /* use r12 as index for special value that is saved across calls to
> +          tanhf. We technically don't need a callee save register here as offset
> +          to rsp is always [0, 56] so we can restore rsp by realigning to 64.
> +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> +          in the loop.  */
> +       xorl    %r12d, %r12d
> +       tzcntl  %r13d, %r12d
>
> -L(SCALAR_MATH_CALL):
> -        movl      %r12d, %r14d
> -        movss     64(%rsp,%r14,4), %xmm0
> -        call      tanhf@PLT
> -                                # LOE rbx r14 r15 r12d r13d xmm0
> +       /* Scalar math function call to process special input.  */
> +       movss   64(%rsp, %r12, 4), %xmm0
> +       call    tanhf@PLT
>
> -        movss     %xmm0, 128(%rsp,%r14,4)
> +       /* No good way to avoid the store-forwarding fault this will cause on
> +          return. `lfence` avoids the SF fault but at greater cost as it
> +          serializes stack/callee save restoration.  */
> +       movss   %xmm0, (%rsp, %r12, 4)
>
> -/* Process special inputs in loop */
> -        jmp       L(SPECIAL_VALUES_LOOP)
> -                                # LOE rbx r15 r12d r13d
> -END(_ZGVeN16v_tanhf_skx)
> +       blsr    %r13d, %r13d
> +       jnz     L(SPECIAL_VALUES_LOOP)
>
> -        .section .rodata, "a"
> -        .align 64
> +       /* All results have been written to (%rsp).  */
> +       vmovaps (%rsp), %zmm0
> +       /* Restore rsp.  */
> +       movq    %rbp, %rsp
> +       /* Restore callee save registers.  */
> +       popq    %r12
> +       popq    %r13
> +       popq    %rbp
> +       ret
> +END(_ZGVeN16v_tanhf_skx)
>
> +       .section .rodata, "a"
> +       .align  16
>  #ifdef __svml_stanh_data_internal_typedef
> -typedef unsigned int VUINT32;
> -typedef struct
> -{
> -        __declspec(align(64)) VUINT32 _sC[32][1];
> -        __declspec(align(64)) VUINT32 _sP0[32][1];
> -        __declspec(align(64)) VUINT32 _sP2[32][1];
> -        __declspec(align(64)) VUINT32 _sP3[32][1];
> -        __declspec(align(64)) VUINT32 _sP4[32][1];
> -        __declspec(align(64)) VUINT32 _sP5[32][1];
> -        __declspec(align(64)) VUINT32 _sP6[32][1];
> -        __declspec(align(64)) VUINT32 _sP7[32][1];
> -        __declspec(align(64)) VUINT32 _iExpMantMask_UISA[16][1];
> -        __declspec(align(64)) VUINT32 _iMinIdxOfsMask_UISA[16][1];
> -        __declspec(align(64)) VUINT32 _iMaxIdxMask_UISA[16][1];
> -        __declspec(align(64)) VUINT32 _sSignMask[16][1];
> -        __declspec(align(64)) VUINT32 _sAbsMask[16][1];
> -        __declspec(align(64)) VUINT32 _iExpMantMask[16][1];
> -        __declspec(align(64)) VUINT32 _iExpMask[16][1];
> -        __declspec(align(64)) VUINT32 _iMinIdxOfsMask[16][1];
> -        __declspec(align(64)) VUINT32 _iMaxIdxMask[16][1];
> -} __svml_stanh_data_internal;
> +       typedef unsigned int VUINT32;
> +       typedef struct
> +       {
> +       __declspec (align(4))VUINT32 _iExpMantMask_UISA[1][1];
> +       __declspec (align(4))VUINT32 _iMinIdxOfsMask_UISA[1][1];
> +       __declspec (align(4))VUINT32 _iMaxIdxMask_UISA[1][1];
> +       __declspec (align(4))VUINT32 _iExpMask[1][1];
> +       __declspec (align(64))VUINT32 _sSignMask[16][1];
> +       __declspec (align(64))VUINT32 _sC_lo[16][1];
> +       __declspec (align(64))VUINT32 _sC_hi[16][1];
> +       __declspec (align(64))VUINT32 _sP7_lo[16][1];
> +       __declspec (align(64))VUINT32 _sP7_hi[16][1];
> +       __declspec (align(64))VUINT32 _sP6_lo[16][1];
> +       __declspec (align(64))VUINT32 _sP6_hi[16][1];
> +       __declspec (align(64))VUINT32 _sP5_lo[16][1];
> +       __declspec (align(64))VUINT32 _sP5_hi[16][1];
> +       __declspec (align(64))VUINT32 _sP4_lo[16][1];
> +       __declspec (align(64))VUINT32 _sP4_hi[16][1];
> +       __declspec (align(64))VUINT32 _sP3_lo[16][1];
> +       __declspec (align(64))VUINT32 _sP3_hi[16][1];
> +       __declspec (align(64))VUINT32 _sP2_lo[16][1];
> +       __declspec (align(64))VUINT32 _sP2_hi[16][1];
> +       __declspec (align(64))VUINT32 _sP0_lo[16][1];
> +       __declspec (align(64))VUINT32 _sP0_hi[16][1];
> +       }__svml_stanh_data_internal;
>  #endif
> +
>  __svml_stanh_data_internal:
> -        /*== _sC ==*/
> -        .long 0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
> -        .long 0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
> -        .long 0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
> -        .long 0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
> -        .long 0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
> -        .long 0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
> -        .long 0x40500000, 0x40700000, 0x40900000, 0x40b00000
> -        .long 0x40d00000, 0x40f00000, 0x41100000, 0x00000000
> -        /*== p0 ==*/
> -        .align 64
> -        .long 0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
> -        .long 0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
> -        .long 0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
> -        .long 0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
> -        .long 0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
> -        .long 0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
> -        .long 0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
> -        .long 0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
> -        /*== p2 ==*/
> -        .align 64
> -        .long 0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
> -        .long 0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
> -        .long 0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
> -        .long 0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
> -        .long 0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
> -        .long 0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
> -        .long 0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
> -        .long 0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
> -        /*== p3 ==*/
> -        .align 64
> -        .long 0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
> -        .long 0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
> -        .long 0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
> -        .long 0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
> -        .long 0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
> -        .long 0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
> -        .long 0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
> -        .long 0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
> -        /*== p4 ==*/
> -        .align 64
> -        .long 0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
> -        .long 0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
> -        .long 0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
> -        .long 0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
> -        .long 0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
> -        .long 0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
> -        .long 0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
> -        .long 0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
> -        /*== p5 ==*/
> -        .align 64
> -        .long 0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
> -        .long 0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
> -        .long 0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
> -        .long 0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
> -        .long 0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
> -        .long 0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
> -        .long 0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
> -        .long 0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
> -        /*== p6 ==*/
> -        .align 64
> -        .long 0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
> -        .long 0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
> -        .long 0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
> -        .long 0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
> -        .long 0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
> -        .long 0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
> -        .long 0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
> -        .long 0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
> -        /*== p7 ==*/
> -        .align 64
> -        .long 0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
> -        .long 0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
> -        .long 0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
> -        .long 0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
> -        .long 0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
> -        .long 0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
> -        .long 0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
> -        .long 0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
> -        .align 64
> -        .long 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000           /* _iExpMantMask_UISA     */
> -        .align 64
> -        .long 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000           /* _iMinIdxOfsMask_UISA   */
> -        .align 64
> -        .long 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000           /* _iMaxIdxMask_UISA      */
> -        .align 64
> -        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000           /* _sSignMask        */
> -        .align 64
> -        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff           /* _sAbsMask         */
> -        .align 64
> -        .long 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000           /* _iExpMantMask     */
> -        .align 64
> -        .long 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000           /* _iExpMask         */
> -        .align 64
> -        .long 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000           /* _iMinIdxOfsMask   */
> -        .align 64
> -        .long 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000           /* _iMaxIdxMask      */
> -        .align 64
> -        .type  __svml_stanh_data_internal,@object
> -        .size  __svml_stanh_data_internal,.-__svml_stanh_data_internal
> +       .align  4
> +       /* _iExpMantMask_UISA.  */
> +       .long   0x7fe00000
> +
> +       .align  4
> +       /* _iMinIdxOfsMask_UISA.  */
> +       .long   0x3d400000
> +
> +       .align  4
> +       /* _iMaxIdxMask_UISA.  */
> +       .long   0x03e00000
> +
> +       .align  4
> +       /* _iExpMask.  */
> +       .long   0x7f000000
> +
> +       .align  64
> +       /* _sSignMask.  */
> +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> +
> +       .align  64
> +       /* _sC_lo.  */
> +       .long   0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
> +       .long   0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
> +       .long   0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
> +       .long   0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
> +
> +       .align  64
> +       /* _sC_hi.  */
> +       .long   0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
> +       .long   0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
> +       .long   0x40500000, 0x40700000, 0x40900000, 0x40b00000
> +       .long   0x40d00000, 0x40f00000, 0x41100000, 0x00000000
> +
> +       .align  64
> +       /* _sP7_lo.  */
> +       .long   0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
> +       .long   0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
> +       .long   0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
> +       .long   0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
> +
> +       .align  64
> +       /* _sP7_hi.  */
> +       .long   0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
> +       .long   0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
> +       .long   0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
> +       .long   0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
> +
> +       .align  64
> +       /* _sP6_lo.  */
> +       .long   0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
> +       .long   0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
> +       .long   0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
> +       .long   0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
> +
> +       .align  64
> +       /* _sP6_hi.  */
> +       .long   0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
> +       .long   0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
> +       .long   0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
> +       .long   0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
> +
> +       .align  64
> +       /* _sP5_lo.  */
> +       .long   0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
> +       .long   0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
> +       .long   0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
> +       .long   0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
> +
> +       .align  64
> +       /* _sP5_hi.  */
> +       .long   0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
> +       .long   0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
> +       .long   0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
> +       .long   0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
> +
> +       .align  64
> +       /* _sP4_lo.  */
> +       .long   0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
> +       .long   0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
> +       .long   0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
> +       .long   0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
> +
> +       .align  64
> +       /* _sP4_hi.  */
> +       .long   0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
> +       .long   0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
> +       .long   0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
> +       .long   0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
> +
> +       .align  64
> +       /* _sP3_lo.  */
> +       .long   0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
> +       .long   0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
> +       .long   0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
> +       .long   0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
> +
> +       .align  64
> +       /* _sP3_hi.  */
> +       .long   0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
> +       .long   0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
> +       .long   0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
> +       .long   0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
> +
> +       .align  64
> +       /* _sP2_lo.  */
> +       .long   0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
> +       .long   0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
> +       .long   0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
> +       .long   0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
> +
> +       .align  64
> +       /* _sP2_hi.  */
> +       .long   0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
> +       .long   0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
> +       .long   0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
> +       .long   0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
> +
> +       .align  64
> +       /* _sP0_lo.  */
> +       .long   0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
> +       .long   0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
> +       .long   0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
> +       .long   0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
> +
> +       .align  64
> +       /* _sP0_hi.  */
> +       .long   0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
> +       .long   0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
> +       .long   0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
> +       .long   0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
> +
> +       .align  64
> +       .type   __svml_stanh_data_internal, @object
> +       .size   __svml_stanh_data_internal, .-__svml_stanh_data_internal
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
> index 50f753ffb3..716b06d640 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
> @@ -70,763 +70,154 @@
>   *
>   */
>
> -/* Offsets for data table __svml_stanh_data_internal
> - */
> -#define _dbP                           0
> -#define _sSignMask                     4288
> -#define _sAbsMask                      4304
> -#define _iExpMantMask                  4320
> -#define _iExpMask                      4336
> -#define _iMinIdxOfsMask                4352
> -#define _iMaxIdxMask                   4368
>
>  #include <sysdep.h>
>
> +#define ONLY_DECL_OFFSET
> +#include "svml_s_tanhf_rodata.S"
> +
>          .text
>         .section .text.sse4,"ax",@progbits
>  ENTRY(_ZGVbN4v_tanhf_sse4)
> -        subq      $72, %rsp
> -        cfi_def_cfa_offset(80)
> -        movaps    %xmm0, %xmm5
> +       /* Save copy of input in xmm12.  */
> +       movaps  %xmm0, %xmm12
>
> -/* Here huge arguments, INF and NaNs are filtered out to callout. */
> -        movdqu    _iExpMantMask+__svml_stanh_data_internal(%rip), %xmm9
> -        lea       _dbP+16+__svml_stanh_data_internal(%rip), %r8
> -        pand      %xmm5, %xmm9
> +       /* Filter huge arguments, INFs and NaNs out to the callout.  */
> +       movdqu  TANHF_DATA(_iExpMantMask)(%rip), %xmm3
> +       pand    %xmm0, %xmm3
>
> -/* if VMIN, VMAX is defined for I type */
> -        pxor      %xmm7, %xmm7
> -        movdqa    %xmm9, %xmm6
> -        psubd     _iMinIdxOfsMask+__svml_stanh_data_internal(%rip), %xmm9
>
> -/*
> - *  small table specific variables *
> - *  Constant loading
> - */
> -        movdqu    _iMaxIdxMask+__svml_stanh_data_internal(%rip), %xmm10
> -        movdqa    %xmm9, %xmm11
> -        movdqa    %xmm9, %xmm8
> -        pcmpgtd   %xmm10, %xmm11
> -        pcmpgtd   %xmm7, %xmm8
> -        movdqa    %xmm11, %xmm14
> -        pand      %xmm8, %xmm9
> -        andps     %xmm11, %xmm10
> -        andnps    %xmm9, %xmm14
> -        orps      %xmm10, %xmm14
> -        psrld     $14, %xmm14
> -        movd      %xmm14, %edx
> -        pshufd    $1, %xmm14, %xmm12
> -        pshufd    $2, %xmm14, %xmm13
> -        movd      %xmm12, %ecx
> -        pshufd    $3, %xmm14, %xmm15
> -        movups    _sAbsMask+__svml_stanh_data_internal(%rip), %xmm3
> -        movslq    %edx, %rdx
> -        andps     %xmm5, %xmm3
> -        movslq    %ecx, %rcx
> -        pcmpgtd   _iExpMask+__svml_stanh_data_internal(%rip), %xmm6
> -        movd      %xmm13, %esi
> -        movups    -16(%rdx,%r8), %xmm2
> -        movaps    %xmm2, %xmm0
> -        movd      %xmm15, %edi
> -        movmskps  %xmm6, %eax
> -        movups    -16(%rcx,%r8), %xmm6
> -        unpcklpd  %xmm6, %xmm0
> -        unpckhpd  %xmm6, %xmm2
> -        cvtps2pd  %xmm3, %xmm6
> -        movhlps   %xmm3, %xmm3
> -        cvtps2pd  %xmm3, %xmm3
> -        movslq    %esi, %rsi
> -        movslq    %edi, %rdi
> -        movups    (%rcx,%r8), %xmm8
> -        movups    (%rdx,%r8), %xmm12
> -        movups    (%rsi,%r8), %xmm13
> -        movaps    %xmm12, %xmm10
> -        movups    (%rdi,%r8), %xmm9
> -        movaps    %xmm13, %xmm11
> -        unpckhpd  %xmm8, %xmm12
> -        unpckhpd  %xmm9, %xmm13
> -        mulpd     %xmm6, %xmm12
> -        mulpd     %xmm3, %xmm13
> -        unpcklpd  %xmm8, %xmm10
> -        unpcklpd  %xmm9, %xmm11
> -        addpd     %xmm10, %xmm12
> -        addpd     %xmm11, %xmm13
> -        mulpd     %xmm6, %xmm12
> -        mulpd     %xmm3, %xmm13
> -        addpd     %xmm2, %xmm12
> -        movups    -16(%rsi,%r8), %xmm1
> -        movups    -16(%rdi,%r8), %xmm7
> -        movaps    %xmm1, %xmm14
> -        unpckhpd  %xmm7, %xmm1
> -        addpd     %xmm1, %xmm13
> -        mulpd     %xmm12, %xmm6
> -        mulpd     %xmm13, %xmm3
> -        addpd     %xmm0, %xmm6
> -        unpcklpd  %xmm7, %xmm14
> -        addpd     %xmm14, %xmm3
> -        cvtpd2ps  %xmm6, %xmm0
> -        cvtpd2ps  %xmm3, %xmm1
> -        movups    _sSignMask+__svml_stanh_data_internal(%rip), %xmm4
> -        movlhps   %xmm1, %xmm0
> -        andps     %xmm5, %xmm4
> -        orps      %xmm4, %xmm0
> -        testl     %eax, %eax
> -
> -/* Go to special inputs processing branch */
> -        jne       L(SPECIAL_VALUES_BRANCH)
> -                                # LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm5
> -
> -/* Restore registers
> - * and exit the function
> - */
> +       /* Clamp the pre-shift table index in xmm3 to [0, 0x04280000].  */
> +       pxor    %xmm7, %xmm7
> +       /* Save xmm3 for special values check at end.  */
> +       movdqa  %xmm3, %xmm8
> +       psubd   TANHF_DATA(_iMinIdxOfsMask)(%rip), %xmm3
> +       pmaxsd  %xmm7, %xmm3
> +       pminsd  TANHF_DATA(_iMaxIdxMask)(%rip), %xmm3
> +       psrld   $14, %xmm3
>
> -L(EXIT):
> -        addq      $72, %rsp
> -        cfi_def_cfa_offset(8)
> -        ret
> -        cfi_def_cfa_offset(80)
> +       movq    %xmm3, %rcx
> +       movl    %ecx, %edx
> +       shrq    $32, %rcx
>
> -/* Branch to process
> - * special inputs
> - */
> +       /* xmm8 contains mask of special values.  */
> +       pcmpgtd TANHF_DATA(_iExpMask)(%rip), %xmm8
>
> -L(SPECIAL_VALUES_BRANCH):
> -        movups    %xmm5, 32(%rsp)
> -        movups    %xmm0, 48(%rsp)
> -                                # LOE rbx rbp r12 r13 r14 r15 eax
> -
> -        xorl      %edx, %edx
> -        movq      %r12, 16(%rsp)
> -        cfi_offset(12, -64)
> -        movl      %edx, %r12d
> -        movq      %r13, 8(%rsp)
> -        cfi_offset(13, -72)
> -        movl      %eax, %r13d
> -        movq      %r14, (%rsp)
> -        cfi_offset(14, -80)
> -                                # LOE rbx rbp r15 r12d r13d
> -
> -/* Range mask
> - * bits check
> - */
> +       pshufd  $0x0e, %xmm3, %xmm3
> +       movq    %xmm3, %rdi
> +       movl    %edi, %esi
> +       shrq    $32, %rdi
>
> -L(RANGEMASK_CHECK):
> -        btl       %r12d, %r13d
> +       movaps  TANHF_DATA(_sAbsMask)(%rip), %xmm1
> +       andps   %xmm1, %xmm0
>
> -/* Call scalar math function */
> -        jc        L(SCALAR_MATH_CALL)
> -                                # LOE rbx rbp r15 r12d r13d
> +       leaq    TANHF_DATA(_lookupTable)(%rip), %rax
> +       movups  (%rdx, %rax), %xmm2
> +       movups  (%rcx, %rax), %xmm6
>
> -/* Special inputs
> - * processing loop
> - */
> +       movaps  %xmm2, %xmm4
> +       movlhps %xmm6, %xmm4
> +       unpckhpd %xmm6, %xmm2
>
> -L(SPECIAL_VALUES_LOOP):
> -        incl      %r12d
> -        cmpl      $4, %r12d
> -
> -/* Check bits in range mask */
> -        jl        L(RANGEMASK_CHECK)
> -                                # LOE rbx rbp r15 r12d r13d
> -
> -        movq      16(%rsp), %r12
> -        cfi_restore(12)
> -        movq      8(%rsp), %r13
> -        cfi_restore(13)
> -        movq      (%rsp), %r14
> -        cfi_restore(14)
> -        movups    48(%rsp), %xmm0
> -
> -/* Go to exit */
> -        jmp       L(EXIT)
> -        cfi_offset(12, -64)
> -        cfi_offset(13, -72)
> -        cfi_offset(14, -80)
> -                                # LOE rbx rbp r12 r13 r14 r15 xmm0
> -
> -/* Scalar math fucntion call
> - * to process special input
> - */
> +       cvtps2pd %xmm0, %xmm6
> +       movhlps %xmm0, %xmm0
> +       cvtps2pd %xmm0, %xmm0
>
> -L(SCALAR_MATH_CALL):
> -        movl      %r12d, %r14d
> -        movss     32(%rsp,%r14,4), %xmm0
> -        call      tanhf@PLT
> -                                # LOE rbx rbp r14 r15 r12d r13d xmm0
> +       movups  16(%rdx, %rax), %xmm5
> +       movups  16(%rsi, %rax), %xmm13
>
> -        movss     %xmm0, 48(%rsp,%r14,4)
> +       movaps  %xmm5, %xmm10
> +       movaps  %xmm13, %xmm11
>
> -/* Process special inputs in loop */
> -        jmp       L(SPECIAL_VALUES_LOOP)
> -                                # LOE rbx rbp r15 r12d r13d
> -END(_ZGVbN4v_tanhf_sse4)
> +       movups  16(%rcx, %rax), %xmm7
> +       movups  16(%rdi, %rax), %xmm3
> +
> +       unpckhpd %xmm7, %xmm5
> +       unpckhpd %xmm3, %xmm13
> +
> +       mulpd   %xmm6, %xmm5
> +       mulpd   %xmm0, %xmm13
> +
> +       movlhps %xmm7, %xmm10
> +       movlhps %xmm3, %xmm11
> +
> +       addpd   %xmm10, %xmm5
> +       addpd   %xmm11, %xmm13
> +
> +       mulpd   %xmm6, %xmm5
> +       mulpd   %xmm0, %xmm13
> +
> +       addpd   %xmm2, %xmm5
>
> -        .section .rodata, "a"
> -        .align 16
> -
> -#ifdef __svml_stanh_data_internal_typedef
> -typedef unsigned int VUINT32;
> -typedef struct
> -{
> -        __declspec(align(16)) VUINT32 _dbP[(134*4)][2];
> -        __declspec(align(16)) VUINT32 _sSignMask[4][1];
> -        __declspec(align(16)) VUINT32 _sAbsMask[4][1];
> -        __declspec(align(16)) VUINT32 _iExpMantMask[4][1];
> -        __declspec(align(16)) VUINT32 _iExpMask[4][1];
> -        __declspec(align(16)) VUINT32 _iMinIdxOfsMask[4][1];
> -        __declspec(align(16)) VUINT32 _iMaxIdxMask[4][1];
> -} __svml_stanh_data_internal;
> -#endif
> -__svml_stanh_data_internal:
> -        /* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500]. */
> -        .quad 0x0000000000000000  /* A00 = +0.000000000000000000000e-01 */
> -        .quad 0x3FF00000022C70EB  /* A01 = +1.000000008097283510367e+00 */
> -        .quad 0xBED00E878CFFA194  /* A02 = -3.828228912518614443549e-06 */
> -        .quad 0xBFD551766D0607A9  /* A03 = -3.330970825846813476723e-01 */
> -        .quad 0xBE53D60CE3E4C297  /* A00 = -1.847383956330407336230e-08 */
> -        .quad 0x3FF000024177CF5C  /* A01 = +1.000002151235967140508e+00 */
> -        .quad 0xBF1758BC94A51A25  /* A02 = -8.906031613262943753568e-05 */
> -        .quad 0xBFD53EAE67E0D4F0  /* A03 = -3.319507612644221339337e-01 */
> -        .quad 0xBE5A9E47EF32D6FE  /* A00 = -2.479020984039698285657e-08 */
> -        .quad 0x3FF00002DA983057  /* A01 = +1.000002721676556793895e+00 */
> -        .quad 0xBF1BD953509E94AA  /* A02 = -1.062352277175377670507e-04 */
> -        .quad 0xBFD53BDB562EEDD5  /* A03 = -3.317783681520414806876e-01 */
> -        .quad 0xBE6191BBE496D294  /* A00 = -3.272532162914017685901e-08 */
> -        .quad 0x3FF0000390492017  /* A01 = +1.000003398528866105366e+00 */
> -        .quad 0xBF20727E814A57CE  /* A02 = -1.254825043772153972919e-04 */
> -        .quad 0xBFD538DE060A6F22  /* A03 = -3.315959033004550748913e-01 */
> -        .quad 0xBE66DAFA2A893A25  /* A00 = -4.257146219278012568149e-08 */
> -        .quad 0x3FF0000465E08CD1  /* A01 = +1.000004194219219266770e+00 */
> -        .quad 0xBF2341C765EF91B6  /* A02 = -1.469188600530365522261e-04 */
> -        .quad 0xBFD535B6841FAF9E  /* A03 = -3.314033785124993469751e-01 */
> -        .quad 0xBE6D5794E361E964  /* A00 = -5.465394929765249413434e-08 */
> -        .quad 0x3FF000055EE2A0CB  /* A01 = +1.000005121846742950353e+00 */
> -        .quad 0xBF265E6C77E66C8B  /* A02 = -1.706607253709506650304e-04 */
> -        .quad 0xBFD53264DDCCEDA6  /* A03 = -3.312008062382240103361e-01 */
> -        .quad 0xBE729C844D374A6E  /* A00 = -6.933284462462096107184e-08 */
> -        .quad 0x3FF000067F019093  /* A01 = +1.000006195180536350264e+00 */
> -        .quad 0xBF29CC5348D6DCE5  /* A02 = -1.968242326435338705130e-04 */
> -        .quad 0xBFD52EE92121ED35  /* A03 = -3.309881995734998416658e-01 */
> -        .quad 0xBE775AEA17EAA872  /* A00 = -8.700465590574974405858e-08 */
> -        .quad 0x3FF00007CA1D66B8  /* A01 = +1.000007428656699559610e+00 */
> -        .quad 0xBF2D8F5EB98A2637  /* A02 = -2.255252009216044881395e-04 */
> -        .quad 0xBFD52B435CDF9128  /* A03 = -3.307655722585587376727e-01 */
> -        .quad 0xBE7D04DA28C343F0  /* A00 = -1.081040272327705484794e-07 */
> -        .quad 0x3FF000094443CCF5  /* A01 = +1.000008837375216730337e+00 */
> -        .quad 0xBF30D5B76C947AE5  /* A02 = -2.568791210978817814332e-04 */
> -        .quad 0xBFD52773A0776FAD  /* A03 = -3.305329386764651045105e-01 */
> -        .quad 0xBE81DD77A12C51C7  /* A00 = -1.331054169875768625701e-07 */
> -        .quad 0x3FF0000AF1AFD2DA  /* A01 = +1.000010437096696680470e+00 */
> -        .quad 0xBF331230624C1680  /* A02 = -2.910011410651516805537e-04 */
> -        .quad 0xBFD52379FC0B61DF  /* A03 = -3.302903138515186909352e-01 */
> -        .quad 0xBE85D04EEEB3C435  /* A00 = -1.625247628488202841012e-07 */
> -        .quad 0x3FF0000CD6C9B1F2  /* A01 = +1.000012244238970726684e+00 */
> -        .quad 0xBF357F0742FADDD4  /* A02 = -3.280060509313874068243e-04 */
> -        .quad 0xBFD51F56806D0E81  /* A03 = -3.300377134475880880338e-01 */
> -        .quad 0xBE8A6E289B59681B  /* A00 = -1.969211333326924655065e-07 */
> -        .quad 0x3FF0000EF8268F72  /* A01 = +1.000014275873550406715e+00 */
> -        .quad 0xBF381E277A1B747A  /* A02 = -3.680082682942575423093e-04 */
> -        .quad 0xBFD51B093F1D6FD4  /* A03 = -3.297751537663746734808e-01 */
> -        .quad 0xBE8FCBC40EE9ABD5  /* A00 = -2.368983653301529373887e-07 */
> -        .quad 0x3FF000115A883B6C  /* A01 = +1.000016549721943981410e+00 */
> -        .quad 0xBF3AF17AC974B3D9  /* A02 = -4.111218235774406434303e-04 */
> -        .quad 0xBFD516924A4C549C  /* A03 = -3.295026517456081105450e-01 */
> -        .quad 0xBE92FFBC60A3F956  /* A00 = -2.831066871072026054144e-07 */
> -        .quad 0x3FF0001402DCED8A  /* A01 = +1.000019084151832604590e+00 */
> -        .quad 0xBF3DFAE9390C4801  /* A02 = -4.574603454311488280083e-04 */
> -        .quad 0xBFD511F1B4D7DC3A  /* A03 = -3.292202249571719585575e-01 */
> -        .quad 0xBE9690A22F96D5AD  /* A00 = -3.362443262393081632612e-07 */
> -        .quad 0x3FF00016F63EFF5D  /* A01 = +1.000021898173108825247e+00 */
> -        .quad 0xBF409E2C839605BB  /* A02 = -5.071370461992499986334e-04 */
> -        .quad 0xBFD50D27924BEE00  /* A03 = -3.289278916051614487515e-01 */
> -        .quad 0xBE9AA56C65E72A73  /* A00 = -3.970591019557469835586e-07 */
> -        .quad 0x3FF0001A39F4A43E  /* A01 = +1.000025011433776978009e+00 */
> -        .quad 0xBF425BD74C3D6667  /* A02 = -5.602647074553602319844e-04 */
> -        .quad 0xBFD50833F6E1ABA2  /* A03 = -3.286256705238718156536e-01 */
> -        .quad 0xBE9F4BD4FF1A83B0  /* A00 = -4.663500013744687071912e-07 */
> -        .quad 0x3FF0001DD36F9EC2  /* A01 = +1.000028444215715683896e+00 */
> -        .quad 0xBF44376634149405  /* A02 = -6.169556656102642569831e-04 */
> -        .quad 0xBFD50316F77EDEE5  /* A03 = -3.283135811757190158922e-01 */
> -        .quad 0xBEA3B625387BB079  /* A00 = -5.874486399249461304297e-07 */
> -        .quad 0x3FF00023E14CFBA9  /* A01 = +1.000034217911642153709e+00 */
> -        .quad 0xBF47392F923218D2  /* A02 = -7.087213783883111826306e-04 */
> -        .quad 0xBFD4FB1FACDEB938  /* A03 = -3.278273761924483942209e-01 */
> -        .quad 0xBEAA6E24F543500A  /* A00 = -7.876828740601738750574e-07 */
> -        .quad 0x3FF0002D5C6E8412  /* A01 = +1.000043259679163742959e+00 */
> -        .quad 0xBF4BAF02BD7FDD70  /* A02 = -8.448375110664940040861e-04 */
> -        .quad 0xBFD4EFEE6527A7DE  /* A03 = -3.271442401734229177279e-01 */
> -        .quad 0xBEB16E3EBE2157D0  /* A00 = -1.038947396133402500647e-06 */
> -        .quad 0x3FF00038990FEE2F  /* A01 = +1.000053975962952312884e+00 */
> -        .quad 0xBF50569481C574CB  /* A02 = -9.972048056490652716971e-04 */
> -        .quad 0xBFD4E419278DA2B4  /* A03 = -3.264220129263251113372e-01 */
> -        .quad 0xBEB6A7B6723165D4  /* A00 = -1.350350836279403750524e-06 */
> -        .quad 0x3FF00045CAB4158E  /* A01 = +1.000066558657042303793e+00 */
> -        .quad 0xBF531D7C9C849108  /* A02 = -1.166698160951775212202e-03 */
> -        .quad 0xBFD4D7A0BB33B152  /* A03 = -3.256608799117844954552e-01 */
> -        .quad 0xBEBD0EE2A8654AFD  /* A00 = -1.732000471561702711532e-06 */
> -        .quad 0x3FF00055276F18D6  /* A01 = +1.000081209219890521211e+00 */
> -        .quad 0xBF562FDBA3FB6C6C  /* A02 = -1.354183666925102939860e-03 */
> -        .quad 0xBFD4CA85F1B93DB2  /* A03 = -3.248610363561638125773e-01 */
> -        .quad 0xBEC269D4036A207E  /* A00 = -2.195047297096822741730e-06 */
> -        .quad 0x3FF00066E7DA6E4E  /* A01 = +1.000098138500919997540e+00 */
> -        .quad 0xBF5991499FC36B3A  /* A02 = -1.560518167983372759405e-03 */
> -        .quad 0xBFD4BCC9A72283D6  /* A03 = -3.240226871658341556426e-01 */
> -        .quad 0xBEC7154B6C09CFE1  /* A00 = -2.751729738565190291276e-06 */
> -        .quad 0x3FF0007B47086B80  /* A01 = +1.000117566559055148900e+00 */
> -        .quad 0xBF5D455433B4F8F4  /* A02 = -1.786548832412968197680e-03 */
> -        .quad 0xBFD4AE6CC1BFE145  /* A03 = -3.231460468373550942722e-01 */
> -        .quad 0xBECCA68CC64A0F8A  /* A00 = -3.415415948561670285790e-06 */
> -        .quad 0x3FF00092827742F7  /* A01 = +1.000139722473418535387e+00 */
> -        .quad 0xBF60A7BF15A527AF  /* A02 = -2.033112728132522705610e-03 */
> -        .quad 0xBFD49F703214084C  /* A03 = -3.222313393636155876010e-01 */
> -        .quad 0xBED19E68676B241B  /* A00 = -4.200644630977303616698e-06 */
> -        .quad 0x3FF000ACDA037B26  /* A01 = +1.000164844146362863597e+00 */
> -        .quad 0xBF62D99F836A02F8  /* A02 = -2.301036405072284102280e-03 */
> -        .quad 0xBFD48FD4F2B91B28  /* A03 = -3.212787981359945810311e-01 */
> -        .quad 0xBED57CF4B0C7AA54  /* A00 = -5.123164339408145209103e-06 */
> -        .quad 0x3FF000CA8FD9E1A1  /* A01 = +1.000193178099017865534e+00 */
> -        .quad 0xBF653A014548E686  /* A02 = -2.591135484433962181405e-03 */
> -        .quad 0xBFD47F9C0844B38F  /* A03 = -3.202886658426046806447e-01 */
> -        .quad 0xBEDA012B1B1A41E2  /* A00 = -6.199971197454598722328e-06 */
> -        .quad 0x3FF000EBE868FDF4  /* A01 = +1.000224979259539459520e+00 */
> -        .quad 0xBF67CA9427E0A544  /* A02 = -2.904214255086275467410e-03 */
> -        .quad 0xBFD46EC6812ADB37  /* A03 = -3.192611943626845749655e-01 */
> -        .quad 0xBEDF3EAC5BF12194  /* A00 = -7.449344990702664567927e-06 */
> -        .quad 0x3FF001112A520784  /* A01 = +1.000260510744255704196e+00 */
> -        .quad 0xBF6A8D01ABDA4DC4  /* A02 = -3.241065277345108255891e-03 */
> -        .quad 0xBFD45D55759FFA4A  /* A03 = -3.181966446572103146551e-01 */
> -        .quad 0xBEE2A541BC274267  /* A00 = -8.890883582164319970972e-06 */
> -        .quad 0x3FF0013A9E5961F2  /* A01 = +1.000300043631906721231e+00 */
> -        .quad 0xBF6D82ECD080C540  /* A02 = -3.602468994380686462264e-03 */
> -        .quad 0xBFD44B4A0779C0AD  /* A03 = -3.170952866557950611259e-01 */
> -        .quad 0xBEE61D97609A27F4  /* A00 = -1.054553560499505625520e-05 */
> -        .quad 0x3FF001688F56A3AF  /* A01 = +1.000343856731187974773e+00 */
> -        .quad 0xBF7056F8EFB683EC  /* A02 = -3.989193351487490407647e-03 */
> -        .quad 0xBFD438A5620F0F74  /* A03 = -3.159573991399533543500e-01 */
> -        .quad 0xBEEA145429EDD370  /* A00 = -1.243563138839952927732e-05 */
> -        .quad 0x3FF0019B4A242A67  /* A01 = +1.000392236341804297339e+00 */
> -        .quad 0xBF7207D31CA78D9B  /* A02 = -4.401993423445739288258e-03 */
> -        .quad 0xBFD42568BA16E7CD  /* A03 = -3.147832696228050619602e-01 */
> -        .quad 0xBEEE96370D52680F  /* A00 = -1.458491207477835326165e-05 */
> -        .quad 0x3FF001D31D8E4115  /* A01 = +1.000445476009251821736e+00 */
> -        .quad 0xBF73D4CC11EDC094  /* A02 = -4.841611050196221316400e-03 */
> -        .quad 0xBFD411954D8664E7  /* A03 = -3.135731942252974469021e-01 */
> -        .quad 0xBEF338C046215EF8  /* A00 = -1.833122622260562810219e-05 */
> -        .quad 0x3FF00230C32C2EC1  /* A01 = +1.000534784691737621998e+00 */
> -        .quad 0xBF76BD019BCC5DAF  /* A02 = -5.551344188254799492943e-03 */
> -        .quad 0xBFD3F2C7156DC21E  /* A03 = -3.116929730668135389848e-01 */
> -        .quad 0xBEF9B15EAE411EAE  /* A00 = -2.450261207822986676092e-05 */
> -        .quad 0x3FF002C2DF057A4D  /* A01 = +1.000674124886830940184e+00 */
> -        .quad 0xBF7B08CCD9AC1E30  /* A02 = -6.600189396301511801646e-03 */
> -        .quad 0xBFD3C7A7A114FED8  /* A03 = -3.090609620157755976777e-01 */
> -        .quad 0xBF00E36483C373B3  /* A00 = -3.221178528332122595812e-05 */
> -        .quad 0x3FF0036F419480D7  /* A01 = +1.000838524028997644777e+00 */
> -        .quad 0xBF7FD255D1777007  /* A02 = -7.768950679260206403087e-03 */
> -        .quad 0xBFD39A453911D6CE  /* A03 = -3.062909180947429588215e-01 */
> -        .quad 0xBF05DFA04DD12059  /* A00 = -4.172046622180685472624e-05 */
> -        .quad 0x3FF00438B2A03D8D  /* A01 = +1.001030633695197069599e+00 */
> -        .quad 0xBF828F8DBB4A9D10  /* A02 = -9.062869337255224921890e-03 */
> -        .quad 0xBFD36AAB704697D9  /* A03 = -3.033856007044711255993e-01 */
> -        .quad 0xBF0BF3E0C647DEFB  /* A00 = -5.331544597092331081714e-05 */
> -        .quad 0x3FF005221063D36D  /* A01 = +1.001253189109060359741e+00 */
> -        .quad 0xBF857A2CB3C96102  /* A02 = -1.048693584122917590862e-02 */
> -        .quad 0xBFD338E65BBB4FEC  /* A03 = -3.003478904549854444639e-01 */
> -        .quad 0xBF11A506ED7C9D31  /* A00 = -6.730894835681591541979e-05 */
> -        .quad 0x3FF0062E4D0EA92A  /* A01 = +1.001508999829250345925e+00 */
> -        .quad 0xBF88AB82C2761AF3  /* A02 = -1.204588085125866091241e-02 */
> -        .quad 0xBFD305028D6BD206  /* A03 = -2.971807843271395688234e-01 */
> -        .quad 0xBF1607C0922D9BF1  /* A00 = -8.403885708006799337092e-05 */
> -        .quad 0x3FF007606C341961  /* A01 = +1.001800940198869449560e+00 */
> -        .quad 0xBF8C25E6DA487BCF  /* A02 = -1.374416688582682892494e-02 */
> -        .quad 0xBFD2CF0D0EE8F7B5  /* A03 = -2.938873906713255768075e-01 */
> -        .quad 0xBF1B3A8480A0A16D  /* A00 = -1.038688061788578038307e-04 */
> -        .quad 0x3FF008BB802D02D6  /* A01 = +1.002131939589323561535e+00 */
> -        .quad 0xBF8FEB8AE99FD100  /* A02 = -1.558598065819483124983e-02 */
> -        .quad 0xBFD297135BD0911B  /* A03 = -2.904709240558688843059e-01 */
> -        .quad 0xBF20ABB9BDB75C65  /* A00 = -1.271881327357976163798e-04 */
> -        .quad 0x3FF00A42A76D8CD1  /* A01 = +1.002504972472525901495e+00 */
> -        .quad 0xBF91FF3D752BB9E6  /* A02 = -1.757522609380570560722e-02 */
> -        .quad 0xBFD25D235C1F88B4  /* A03 = -2.869346999779154305799e-01 */
> -        .quad 0xBF243D3254425461  /* A00 = -1.544116913733432829448e-04 */
> -        .quad 0x3FF00BF909D1795E  /* A01 = +1.002923048355647051011e+00 */
> -        .quad 0xBF94304E04D44942  /* A02 = -1.971551804042204897316e-02 */
> -        .quad 0xBFD2214B5E61CFA6  /* A03 = -2.832821294498394371075e-01 */
> -        .quad 0xBF286070011B61CE  /* A00 = -1.859795307186510085994e-04 */
> -        .quad 0x3FF00DE1D5E1627E  /* A01 = +1.003389201612804537689e+00 */
> -        .quad 0xBF9689D5F4163F59  /* A02 = -2.201017668045266231780e-02 */
> -        .quad 0xBFD1E39A11C3B42C  /* A03 = -2.795167134743816728104e-01 */
> -        .quad 0xBF2D250B366A79E8  /* A00 = -2.223564326486314902259e-04 */
> -        .quad 0x3FF010003E134001  /* A01 = +1.003906481248123094829e+00 */
> -        .quad 0xBF990C9FF91F6F81  /* A02 = -2.446222265267250853271e-02 */
> -        .quad 0xBFD1A41E80084CDC  /* A03 = -2.756420374218586655246e-01 */
> -        .quad 0xBF314DB5DDC2A30E  /* A00 = -2.640313157465248123865e-04 */
> -        .quad 0x3FF012577608921B  /* A01 = +1.004477940624503018441e+00 */
> -        .quad 0xBF9BB9626875B0C9  /* A02 = -2.707437288829409385849e-02 */
> -        .quad 0xBFD162E80768A9D0  /* A03 = -2.716617653228725615122e-01 */
> -        .quad 0xBF346A6133808864  /* A00 = -3.115165050094957730625e-04 */
> -        .quad 0x3FF014EAAFCC88A3  /* A01 = +1.005106627192198898157e+00 */
> -        .quad 0xBF9E90BEF9BF7419  /* A02 = -2.984903716411588595059e-02 */
> -        .quad 0xBFD12006545F7FAD  /* A03 = -2.675796340899932457269e-01 */
> -        .quad 0xBF37F180DC3848EA  /* A00 = -3.653468704395550778821e-04 */
> -        .quad 0x3FF017BD19147861  /* A01 = +1.005795572250939295955e+00 */
> -        .quad 0xBFA0C9A14C702E07  /* A02 = -3.278831537326359207851e-02 */
> -        .quad 0xBFD0DB895B650092  /* A03 = -2.633994476818851682154e-01 */
> -        .quad 0xBF3BEC6AAC6D7635  /* A00 = -4.260788377246944457107e-04 */
> -        .quad 0x3FF01AD1D884E719  /* A01 = +1.006547780778822565040e+00 */
> -        .quad 0xBFA260B2A1B1434A  /* A02 = -3.589399551186163439542e-02 */
> -        .quad 0xBFD09581529E93D6  /* A03 = -2.591250712233067465817e-01 */
> -        .quad 0xBF4164E26167882B  /* A00 = -5.308251737086202562063e-04 */
> -        .quad 0x3FF01FEF14B62B81  /* A01 = +1.007796364693348545316e+00 */
> -        .quad 0xBFA4EB014538AA42  /* A02 = -4.085544557559163403315e-02 */
> -        .quad 0xBFD029D36FEAF41F  /* A03 = -2.525528519580024222613e-01 */
> -        .quad 0xBF46F6FFF4E53DC8  /* A00 = -7.008313930700277652464e-04 */
> -        .quad 0x3FF027CBB51CBBA0  /* A01 = +1.009715754956893363214e+00 */
> -        .quad 0xBFA89DEC9FEC112E  /* A02 = -4.807986690687680864098e-02 */
> -        .quad 0xBFCF2A99464D0DB4  /* A03 = -2.434875100390009317053e-01 */
> -        .quad 0xBF4DCC9C4F66A4D9  /* A00 = -9.094012482836712945103e-04 */
> -        .quad 0x3FF030E7CFCCD583  /* A01 = +1.011939822882909068014e+00 */
> -        .quad 0xBFACAA3B95814081  /* A02 = -5.598627281199331645611e-02 */
> -        .quad 0xBFCDF78F156BE7CF  /* A03 = -2.341173987004467604844e-01 */
> -        .quad 0xBF5308ED74E5C7A6  /* A00 = -1.161796466103906435435e-03 */
> -        .quad 0x3FF03B5986412ECB  /* A01 = +1.014489674026594512313e+00 */
> -        .quad 0xBFB087EBA88DCC3F  /* A02 = -6.457398285947223148806e-02 */
> -        .quad 0xBFCCBB9BD134862F  /* A03 = -2.244753619680052991736e-01 */
> -        .quad 0xBF57FA23C00DF4B5  /* A00 = -1.463446533505758208674e-03 */
> -        .quad 0x3FF0473558A1BCC0  /* A01 = +1.017384859292903342975e+00 */
> -        .quad 0xBFB2E702BC6360EF  /* A02 = -7.383744334527241048871e-02 */
> -        .quad 0xBFCB77D546379288  /* A03 = -2.145945160729250122955e-01 */
> -        .quad 0xBF5DD12971557F71  /* A00 = -1.819887610814388068450e-03 */
> -        .quad 0x3FF0548DDF5000A8  /* A01 = +1.020643112482540360020e+00 */
> -        .quad 0xBFB571B63DA186E1  /* A02 = -8.376635555898871710045e-02 */
> -        .quad 0xBFCA2D5202605148  /* A03 = -2.045080672838912594358e-01 */
> -        .quad 0xBF6252B1AD5D4F17  /* A00 = -2.236697221556737096709e-03 */
> -        .quad 0x3FF063738A910BF7  /* A01 = +1.024280110622155737232e+00 */
> -        .quad 0xBFB8270C8E6B601B  /* A02 = -9.434584118878357184013e-02 */
> -        .quad 0xBFC8DD27D950A07E  /* A03 = -1.942491351230763441116e-01 */
> -        .quad 0xBF66470C91730CFC  /* A00 = -2.719425723258004842786e-03 */
> -        .quad 0x3FF073F468FCF331  /* A01 = +1.028309259519300633556e+00 */
> -        .quad 0xBFBB05C2952191E4  /* A02 = -1.055566419686964629854e-01 */
> -        .quad 0xBFC7886A770DE2BD  /* A03 = -1.838505822486435070662e-01 */
> -        .quad 0xBF6AD114AC8E98EC  /* A00 = -3.273525599485007861467e-03 */
> -        .quad 0x3FF0861BF53E5226  /* A01 = +1.032741506559554434119e+00 */
> -        .quad 0xBFBE0C4F9B461507  /* A02 = -1.173753503881763554650e-01 */
> -        .quad 0xBFC6302A037CDE3A  /* A03 = -1.733448521642786954722e-01 */
> -        .quad 0xBF6FFBDE2A6C2AF8  /* A00 = -3.904279630096648551207e-03 */
> -        .quad 0x3FF099F2EB8E7DA3  /* A01 = +1.037585182326304034106e+00 */
> -        .quad 0xBFC09C74D192DDF0  /* A02 = -1.297746680554463516444e-01 */
> -        .quad 0xBFC4D571D8E3079F  /* A03 = -1.627638157861470424859e-01 */
> -        .quad 0xBF72E8FDC0B952AA  /* A00 = -4.616728994353872309042e-03 */
> -        .quad 0x3FF0AF7F273C9533  /* A01 = +1.042845872181101141152e+00 */
> -        .quad 0xBFC244C512736F10  /* A02 = -1.427236881344176033792e-01 */
> -        .quad 0xBFC379474F58B902  /* A03 = -1.521386277613104298645e-01 */
> -        .quad 0xBF762EABAF17395B  /* A00 = -5.415602341101023557701e-03 */
> -        .quad 0x3FF0C6C3886F63FB  /* A01 = +1.048526318502125631582e+00 */
> -        .quad 0xBFC3FDF9918EA12A  /* A02 = -1.561881981590514389957e-01 */
> -        .quad 0xBFC21CA89ECAB895  /* A03 = -1.414995932913753196036e-01 */
> -        .quad 0xBF79D387CE5B2BAE  /* A00 = -6.305246822828998107258e-03 */
> -        .quad 0x3FF0DFBFE2346376  /* A01 = +1.054626353847394337748e+00 */
> -        .quad 0xBFC5C6DA43602620  /* A02 = -1.701309994680721970894e-01 */
> -        .quad 0xBFC0C08BD8DB6631  /* A03 = -1.308760460731704100557e-01 */
> -        .quad 0xBF7DDBA8E8DA9060  /* A00 = -7.289562037531366334164e-03 */
> -        .quad 0x3FF0FA70F0D1B464  /* A01 = +1.061142864894713433443e+00 */
> -        .quad 0xBFC79E18D92BAA7C  /* A02 = -1.845122394946264732241e-01 */
> -        .quad 0xBFBECBBBF74C2669  /* A03 = -1.202962378266875381749e-01 */
> -        .quad 0xBF81254E76EA25DA  /* A00 = -8.371937755572145950511e-03 */
> -        .quad 0x3FF116D05835EBD0  /* A01 = +1.068069786618014660462e+00 */
> -        .quad 0xBFC982539E2ED224  /* A02 = -1.992897531869327609755e-01 */
> -        .quad 0xBFBC1B043C350159  /* A03 = -1.097872397413132278254e-01 */
> -        .quad 0xBF8391ACBA863403  /* A00 = -9.555196230190082448686e-03 */
> -        .quad 0x3FF134D4AA477FE2  /* A01 = +1.075398125794884141015e+00 */
> -        .quad 0xBFCB7218609FEAFB  /* A02 = -2.144194099235717521079e-01 */
> -        .quad 0xBFB970A16CB88329  /* A03 = -9.937485603633135211599e-02 */
> -        .quad 0xBF87935088E48E8B  /* A00 = -1.151144902957603431692e-02 */
> -        .quad 0x3FF1649892AD7DD3  /* A01 = +1.087059567413110938716e+00 */
> -        .quad 0xBFCE6971DDE75409  /* A02 = -2.375929196847723912089e-01 */
> -        .quad 0xBFB58291E88CB251  /* A03 = -8.402358939628952472223e-02 */
> -        .quad 0xBF8DB3A62C325325  /* A00 = -1.450280973794233242702e-02 */
> -        .quad 0x3FF1A9C900C6DEEA  /* A01 = +1.103951457056548068891e+00 */
> -        .quad 0xBFD13DBC65B0E08E  /* A02 = -2.693930619311765140012e-01 */
> -        .quad 0xBFB06696F62696D1  /* A03 = -6.406539449252625362252e-02 */
> -        .quad 0xBF92583699F2E27A  /* A00 = -1.791463198307716858659e-02 */
> -        .quad 0x3FF1F451B85AA9F0  /* A01 = +1.122148246892376022288e+00 */
> -        .quad 0xBFD34FD5F8288180  /* A02 = -3.017477916164565954205e-01 */
> -        .quad 0xBFA6FB692825B683  /* A03 = -4.488686194495718900788e-02 */
> -        .quad 0xBF9641C26E673D6F  /* A00 = -2.173522757385398448959e-02 */
> -        .quad 0x3FF24364DA5E2B07  /* A01 = +1.141453602790251542487e+00 */
> -        .quad 0xBFD564A5A5EF5890  /* A02 = -3.342680092295120530821e-01 */
> -        .quad 0xBF9B43712011A982  /* A03 = -2.662445791467283467968e-02 */
> -        .quad 0xBF9A901038EC2F39  /* A00 = -2.594018313816024226548e-02 */
> -        .quad 0x3FF2961356DFFEBA  /* A01 = +1.161639537196534011088e+00 */
> -        .quad 0xBFD775EBB17198C7  /* A02 = -3.665723069046972759644e-01 */
> -        .quad 0xBF833B1A926CD462  /* A03 = -9.390075295963199591975e-03 */
> -        .quad 0xBF9F396A6A461B91  /* A00 = -3.049246095317987084727e-02 */
> -        .quad 0x3FF2EB53BAEF534B  /* A01 = +1.182452898229899629357e+00 */
> -        .quad 0xBFD97DABF8AD8BBD  /* A02 = -3.982953957076310058660e-01 */
> -        .quad 0x3F7B8F6A3E0F8837  /* A03 = +6.728568086119371925713e-03 */
> -        .quad 0xBFA21878590F8BAA  /* A00 = -3.534294211546946951064e-02 */
> -        .quad 0x3FF34209790236E1  /* A01 = +1.203622315111197105253e+00 */
> -        .quad 0xBFDB764C0E71BECB  /* A02 = -4.290952817018306997277e-01 */
> -        .quad 0x3F962FE0C03F84C0  /* A03 = +2.166701482190513949888e-02 */
> -        .quad 0xBFA4B36B9AD27ECC  /* A00 = -4.043136849327097492868e-02 */
> -        .quad 0x3FF3990C5B12FC16  /* A01 = +1.224865298994477935679e+00 */
> -        .quad 0xBFDD5AABB0D01390  /* A02 = -4.586590983092770912322e-01 */
> -        .quad 0x3FA21DAF5CA162DB  /* A03 = +3.538272863142363083844e-02 */
> -        .quad 0xBFA7645E4D7BF28B  /* A00 = -4.568762489177399105378e-02 */
> -        .quad 0x3FF3EF2FD51C0D9F  /* A01 = +1.245895225962932562069e+00 */
> -        .quad 0xBFDF26377E1B686E  /* A02 = -4.867075664057044503963e-01 */
> -        .quad 0x3FA8803E756EE812  /* A03 = +4.785342391501513914509e-02 */
> -        .quad 0xBFAA210925C64413  /* A00 = -5.103329263796054643398e-02 */
> -        .quad 0x3FF44349F897D8E7  /* A01 = +1.266427966181760345066e+00 */
> -        .quad 0xBFE06A7B02C6D8E2  /* A02 = -5.129981092675530707226e-01 */
> -        .quad 0x3FAE3F194734F5D0  /* A03 = +5.907515520309980505687e-02 */
> -        .quad 0xBFACDE48F8A19BBB  /* A00 = -5.638340029764018351832e-02 */
> -        .quad 0x3FF49439D5466582  /* A01 = +1.286187966447272845727e+00 */
> -        .quad 0xBFE131C7C1063DDC  /* A02 = -5.373266954429101183166e-01 */
> -        .quad 0x3FB1ADEEC36AD805  /* A03 = +6.906025191241844940482e-02 */
> -        .quad 0xBFAF905D8F585680  /* A00 = -6.164829611604449866036e-02 */
> -        .quad 0x3FF4E0ED1FD27F99  /* A01 = +1.304913639360142818546e+00 */
> -        .quad 0xBFE1E7A859DC1D3D  /* A02 = -5.595285182070380836095e-01 */
> -        .quad 0x3FB3ED018E4642A1  /* A03 = +7.783517573831001679086e-02 */
> -        .quad 0xBFB11595104160BA  /* A00 = -6.673556944713512906198e-02 */
> -        .quad 0x3FF528650340490B  /* A01 = +1.322361958217302513319e+00 */
> -        .quad 0xBFE28B14B40BC974  /* A02 = -5.794776455425521000109e-01 */
> -        .quad 0x3FB5DF49F5BAF6D7  /* A03 = +8.543836831355676453281e-02 */
> -        .quad 0xBFB2513A97344BA4  /* A00 = -7.155195418844911836587e-02 */
> -        .quad 0x3FF569BA0DB5EE14  /* A01 = +1.338312200124055273420e+00 */
> -        .quad 0xBFE31B53A8B67B20  /* A02 = -5.970857901737396389308e-01 */
> -        .quad 0x3FB787F297BB0544  /* A03 = +9.191814617499455275507e-02 */
> -        .quad 0xBFB37512E848FAFA  /* A00 = -7.600515528700305112331e-02 */
> -        .quad 0x3FF5A41F33B403C8  /* A01 = +1.352568819013173495591e+00 */
> -        .quad 0xBFE397F6EA9A58A5  /* A02 = -6.123003561103997904880e-01 */
> -        .quad 0x3FB8EAA9FF25CA06  /* A03 = +9.733068923177520814782e-02 */
> -        .quad 0xBFB47B3E603AFC5D  /* A00 = -8.000554894805263217439e-02 */
> -        .quad 0x3FF5D6E3EDE40487  /* A01 = +1.364963464031718975988e+00 */
> -        .quad 0xBFE400D5BCA6D631  /* A02 = -6.251019177058819709103e-01 */
> -        .quad 0x3FBA0B830ED567FE  /* A03 = +1.017381583418739132707e-01 */
> -        .quad 0xBFB5BBFE8AC90496  /* A00 = -8.489981544791400103200e-02 */
> -        .quad 0x3FF612BA70107E95  /* A01 = +1.379572332145390989311e+00 */
> -        .quad 0xBFE477EAF1FA7693  /* A02 = -6.396383978023599814478e-01 */
> -        .quad 0x3FBB4784B7C08A95  /* A03 = +1.065600346196709652391e-01 */
> -        .quad 0xBFB6D5D940743939  /* A00 = -8.920057128509463473254e-02 */
> -        .quad 0x3FF644A8748F70CE  /* A01 = +1.391762214006166953340e+00 */
> -        .quad 0xBFE4D646AB07EA37  /* A02 = -6.511567440459832267763e-01 */
> -        .quad 0x3FBC354F4E1D5292  /* A03 = +1.101884427747086558913e-01 */
> -        .quad 0xBFB7223D19E4F3D1  /* A00 = -9.036619074045339206069e-02 */
> -        .quad 0x3FF6518FEB42B7FA  /* A01 = +1.394912642466350494175e+00 */
> -        .quad 0xBFE4ED86CB87498C  /* A02 = -6.539949393430091184598e-01 */
> -        .quad 0x3FBC6D29F28CCA9B  /* A03 = +1.110407082713131127205e-01 */
> -        .quad 0xBFB6878652FF6312  /* A00 = -8.800544287022329936754e-02 */
> -        .quad 0x3FF63948C302D040  /* A01 = +1.388985406648330922508e+00 */
> -        .quad 0xBFE4C4E2E7904E17  /* A02 = -6.490339777687407218920e-01 */
> -        .quad 0x3FBC127356CA1ABE  /* A03 = +1.096565329445224612481e-01 */
> -        .quad 0xBFB4F5D18B0C91D6  /* A00 = -8.187589306596207427980e-02 */
> -        .quad 0x3FF5FD27EB7DD0B8  /* A01 = +1.374305648697413673176e+00 */
> -        .quad 0xBFE464E01A2B2FC6  /* A02 = -6.373138915164353601739e-01 */
> -        .quad 0x3FBB460547674A30  /* A03 = +1.065371798825160976065e-01 */
> -        .quad 0xBFB26642FA16A685  /* A00 = -7.187288861919156890412e-02 */
> -        .quad 0x3FF59F9BEDE1C95A  /* A01 = +1.351467065073470141812e+00 */
> -        .quad 0xBFE3D67920C8FBEA  /* A02 = -6.199308052381387046381e-01 */
> -        .quad 0x3FBA24F6A8D3CBC1  /* A03 = +1.021265184570401413078e-01 */
> -        .quad 0xBFADB5294794F097  /* A00 = -5.802277563859197656582e-02 */
> -        .quad 0x3FF523EA7B9CF453  /* A01 = +1.321268542159732772845e+00 */
> -        .quad 0xBFE322A8B55E35DB  /* A02 = -5.979808370918208160205e-01 */
> -        .quad 0x3FB8C8673B1B3E37  /* A03 = +9.680791085269722928697e-02 */
> -        .quad 0xBFA4B7D661965C6A  /* A00 = -4.046506825687219699450e-02 */
> -        .quad 0x3FF48DE3E2CE3122  /* A01 = +1.284641157110919085227e+00 */
> -        .quad 0xBFE251FED1A7F445  /* A02 = -5.725092024655472622285e-01 */
> -        .quad 0x3FB745699FCABDB9  /* A03 = +9.090290213747821701507e-02 */
> -        .quad 0xBF93E60456E4EE1D  /* A00 = -1.943213253365004902773e-02 */
> -        .quad 0x3FF3E1A14E628A59  /* A01 = +1.242585474196536532432e+00 */
> -        .quad 0xBFE16C5AB660E876  /* A02 = -5.444768488007543094653e-01 */
> -        .quad 0x3FB5AD33AA8C188F  /* A03 = +8.467410005332197397987e-02 */
> -        .quad 0x3F738C17C47C7961  /* A00 = +4.772274820224659853951e-03 */
> -        .quad 0x3FF3234DDE3BD146  /* A01 = +1.196119182682268355933e+00 */
> -        .quad 0xBFE078C0D77A9D3B  /* A02 = -5.147403915952176722826e-01 */
> -        .quad 0x3FB40D74B3E276B8  /* A03 = +7.833032027925923568290e-02 */
> -        .quad 0x3FA0474BECC689C7  /* A00 = +3.179394975019849550746e-02 */
> -        .quad 0x3FF256FB4FA7D18A  /* A01 = +1.146235762743432307076e+00 */
> -        .quad 0xBFDEFA8E3FB285E2  /* A02 = -4.840427038235174395098e-01 */
> -        .quad 0x3FB270C007493D59  /* A03 = +7.203293016322244446403e-02 */
> -        .quad 0x3FAF5BD51E479BDC  /* A00 = +6.124750132203590768931e-02 */
> -        .quad 0x3FF18081D0B53BC5  /* A01 = +1.093873801484492647162e+00 */
> -        .quad 0xBFDCFE2439BD0C03  /* A02 = -4.530115665294831006626e-01 */
> -        .quad 0x3FB0DEFE5A45AFDD  /* A03 = +6.590261176978580437424e-02 */
> -        .quad 0x3FB7BD5D2806EA26  /* A00 = +9.273321368429118805032e-02 */
> -        .quad 0x3FF0A369E35B4440  /* A01 = +1.039895904647224256223e+00 */
> -        .quad 0xBFDB04BC5C9951E7  /* A02 = -4.221640495573226181669e-01 */
> -        .quad 0x3FAEBBBAA9D6DEEF  /* A03 = +6.002600978120919278380e-02 */
> -        .quad 0x3FC01BE411098DBC  /* A00 = +1.258511622610124502941e-01 */
> -        .quad 0x3FEF85BDABC031C1  /* A01 = +9.850757936961188621083e-01 */
> -        .quad 0xBFD91521375097C2  /* A02 = -3.919146576102968682065e-01 */
> -        .quad 0x3FABE26F0086D982  /* A03 = +5.446192628317005068883e-02 */
> -        .quad 0x3FC481D7FF5776B9  /* A00 = +1.602125164781023347604e-01 */
> -        .quad 0x3FEDC3506C1E7218  /* A01 = +9.300920592973538347792e-01 */
> -        .quad 0xBFD7349A88DA7D4F  /* A02 = -3.625856720409119104964e-01 */
> -        .quad 0x3FA936E2DFF8E2AE  /* A03 = +4.924687370334389358018e-02 */
> -        .quad 0x3FC90471F96FA27A  /* A00 = +1.954481571149420671141e-01 */
> -        .quad 0x3FEC0451601987A2  /* A01 = +8.755270840595026360376e-01 */
> -        .quad 0xBFD5671CD4B898DC  /* A02 = -3.344184949259110251063e-01 */
> -        .quad 0x3FA6BB9594603B67  /* A03 = +4.439990459660841243261e-02 */
> -        .quad 0x3FCFD8ADB9ED944C  /* A00 = +2.488000066615846384011e-01 */
> -        .quad 0x3FE978C073F6809A  /* A01 = +7.959902062321078108909e-01 */
> -        .quad 0xBFD2DF7E00BCD5A9  /* A02 = -2.948908812716931060471e-01 */
> -        .quad 0x3FA3614033D490B2  /* A03 = +3.785133965200894456959e-02 */
> -        .quad 0x3FD4846A12AFE5A0  /* A00 = +3.205819303981005674586e-01 */
> -        .quad 0x3FE63A1147D40472  /* A01 = +6.945883181471244061100e-01 */
> -        .quad 0xBFCFA2268AD34450  /* A02 = -2.471359422548027318101e-01 */
> -        .quad 0x3F9F150201D9FFE0  /* A03 = +3.035357605267552383310e-02 */
> -        .quad 0x3FD9018641F82BEB  /* A00 = +3.907180446846598154131e-01 */
> -        .quad 0x3FE33B7C220FFBDC  /* A01 = +6.010113396913498995389e-01 */
> -        .quad 0xBFCA4E4187E29C86  /* A02 = -2.055131829740483584423e-01 */
> -        .quad 0x3F98C30CED19F8F4  /* A03 = +2.418155858185229434287e-02 */
> -        .quad 0x3FDD4B8255BEB078  /* A00 = +4.577337109901757905561e-01 */
> -        .quad 0x3FE0858B19D3A49B  /* A01 = +5.163016800335243905451e-01 */
> -        .quad 0xBFC5BC929EACE564  /* A02 = -1.698172831327539045176e-01 */
> -        .quad 0x3F93A083CE57DE2B  /* A03 = +1.916700312537337677621e-02 */
> -        .quad 0x3FE0A8E5E039295C  /* A00 = +5.206174258576470315063e-01 */
> -        .quad 0x3FDC35E1234583FE  /* A01 = +4.407885403107342225937e-01 */
> -        .quad 0xBFC1DE034E31AEB9  /* A02 = -1.395877963835710222629e-01 */
> -        .quad 0x3F8EFDEBB3471BDC  /* A03 = +1.513275280821162888101e-02 */
> -        .quad 0x3FE2851B603CB2A5  /* A00 = +5.787484054213406503564e-01 */
> -        .quad 0x3FD7F4A44ABBB286  /* A01 = +3.743067483726821853551e-01 */
> -        .quad 0xBFBD3EEB67087DE7  /* A02 = -1.142413260026767657385e-01 */
> -        .quad 0x3F8864F38329E8BD  /* A03 = +1.191129917173260922836e-02 */
> -        .quad 0x3FE437DBE3C34AC1  /* A00 = +6.318187187665317283702e-01 */
> -        .quad 0x3FD43F6F789441B5  /* A01 = +3.163717916040938438194e-01 */
> -        .quad 0xBFB7D92E7901B9A4  /* A02 = -9.315767721429907277653e-02 */
> -        .quad 0x3F8327ED342308E1  /* A03 = +9.353497651663324544136e-03 */
> -        .quad 0x3FE5C0977766D55C  /* A00 = +6.797597248138731451661e-01 */
> -        .quad 0x3FD10B42A764D8F9  /* A01 = +2.663122782427219115142e-01 */
> -        .quad 0xBFB3633351D3D70F  /* A02 = -7.573242900602060456716e-02 */
> -        .quad 0x3F7E079E30FF899C  /* A03 = +7.331483779099558922843e-03 */
> -        .quad 0x3FE7202CE08A88C4  /* A00 = +7.226776490754436288455e-01 */
> -        .quad 0x3FCC973EB5662B01  /* A01 = +2.233656297433626314319e-01 */
> -        .quad 0xBFAF70A455F9920B  /* A02 = -6.140626477716545211782e-02 */
> -        .quad 0x3F77812411CE99B6  /* A03 = +5.738392731393584730859e-03 */
> -        .quad 0x3FE85879424095B1  /* A00 = +7.608000082006382003286e-01 */
> -        .quad 0x3FC7E73BD1674D84  /* A01 = +1.867441914060742336190e-01 */
> -        .quad 0xBFA96F84E4BF333B  /* A02 = -4.967894832916504993525e-02 */
> -        .quad 0x3F72606DDCA6E117  /* A03 = +4.486493251924870105662e-03 */
> -        .quad 0x3FE96BFE4957F4DD  /* A00 = +7.944327766887472330737e-01 */
> -        .quad 0x3FC3ED4780D25478  /* A01 = +1.556786898624158421711e-01 */
> -        .quad 0xBFA489C5F9A56B58  /* A02 = -4.011362717093075458408e-02 */
> -        .quad 0x3F6CB5DC17E9AD2A  /* A03 = +3.504686231556104931972e-03 */
> -        .quad 0x3FEA5D9CB2F41234  /* A00 = +8.239272589858672724006e-01 */
> -        .quad 0x3FC091A758374DCF  /* A01 = +1.294449978582705440555e-01 */
> -        .quad 0xBFA08E436D4B5CE0  /* A02 = -3.233538350257858517978e-02 */
> -        .quad 0x3F666997AD53E6B7  /* A03 = +2.735897297154145629133e-03 */
> -        .quad 0x3FEB3060342CB850  /* A00 = +8.496552485501158713532e-01 */
> -        .quad 0x3FBB7D30BBC7DC1B  /* A01 = +1.073790033768634993860e-01 */
> -        .quad 0xBF9AA6BA3443D9E3  /* A02 = -2.602663940430173170060e-02 */
> -        .quad 0x3F617CA764B7850B  /* A03 = +2.134634914668814050648e-03 */
> -        .quad 0x3FEBE759A6A0C7B8  /* A00 = +8.719909910635044170135e-01 */
> -        .quad 0x3FB6C10DE6A703FF  /* A01 = +8.888327485239243264115e-02 */
> -        .quad 0xBF956C566D8BE1F6  /* A02 = -2.092108768099084498138e-02 */
> -        .quad 0x3F5B46D1A4A59CF8  /* A03 = +1.664833764687232917079e-03 */
> -        .quad 0x3FEC858494887A04  /* A00 = +8.912985707318630268503e-01 */
> -        .quad 0x3FB2CC31F543394D  /* A01 = +7.342827070099140762682e-02 */
> -        .quad 0xBF9133477FF69137  /* A02 = -1.679717749142747504343e-02 */
> -        .quad 0x3F5544482FBB4DA5  /* A03 = +1.298017973501022466823e-03 */
> -        .quad 0x3FED0DB59D0E32E9  /* A00 = +9.079235141267335551518e-01 */
> -        .quad 0x3FAF006BAFFC6EF4  /* A01 = +6.055008433597022787787e-02 */
> -        .quad 0xBF8B97146FA2B97A  /* A02 = -1.347175565419144252499e-02 */
> -        .quad 0x3F5093B01F4CDC69  /* A03 = +1.011774057770665211434e-03 */
> -        .quad 0x3FEDB487C3EC457C  /* A00 = +9.282873942012623835751e-01 */
> -        .quad 0x3FA7390C09D0BD1D  /* A01 = +4.535710925881118044112e-02 */
> -        .quad 0xBF83D9F7C3181106  /* A02 = -9.693084374710735778846e-03 */
> -        .quad 0x3F46E34A0A3C0E64  /* A03 = +6.984817050299072134500e-04 */
> -        .quad 0x3FEE5FFCB4E6EB00  /* A00 = +9.492171796076434020506e-01 */
> -        .quad 0x3F9F4913ED00AADF  /* A01 = +3.055220731782070861526e-02 */
> -        .quad 0xBF79670BD0E59B5C  /* A02 = -6.201788097633133961528e-03 */
> -        .quad 0x3F3BC998EBCAF96D  /* A03 = +4.240034429975534616304e-04 */
> -        .quad 0x3FEEDBA41E9542FE  /* A00 = +9.643116566968215064293e-01 */
> -        .quad 0x3F94F5DD18D9C24D  /* A01 = +2.046914543319848858727e-02 */
> -        .quad 0xBF7034896AA122B9  /* A02 = -3.956352980886528904192e-03 */
> -        .quad 0x3F30DCCB47810B39  /* A03 = +2.573009765038273091199e-04 */
> -        .quad 0x3FEF33F2882520ED  /* A00 = +9.750912341196716903724e-01 */
> -        .quad 0x3F8BF37F2CF553FF  /* A01 = +1.364802699996836392315e-02 */
> -        .quad 0xBF649F6F05A69619  /* A02 = -2.517430152880317534986e-03 */
> -        .quad 0x3F247623C950AAC9  /* A03 = +1.561087307505231250044e-04 */
> -        .quad 0x3FEF727757751741  /* A00 = +9.827229221489021115943e-01 */
> -        .quad 0x3F828E67912C4400  /* A01 = +9.060677640748693306705e-03 */
> -        .quad 0xBF5A2F51A806CC2C  /* A02 = -1.598195784123355826789e-03 */
> -        .quad 0x3F18D35D7687E613  /* A03 = +9.470231965016282719549e-05 */
> -        .quad 0x3FEF9E6325C5942A  /* A00 = +9.880843866091073568469e-01 */
> -        .quad 0x3F788AB117618F76  /* A01 = +5.991641772286606867914e-03 */
> -        .quad 0xBF5096EAB0B1EA89  /* A02 = -1.012543859160305046233e-03 */
> -        .quad 0x3F0E1E50EC4435AB  /* A03 = +5.744633156910412119652e-05 */
> -        .quad 0x3FEFBD0784049369  /* A00 = +9.918248728250605994461e-01 */
> -        .quad 0x3F702BBD8294035F  /* A01 = +3.947963975634432264028e-03 */
> -        .quad 0xBF44FB55E0F00593  /* A02 = -6.403130845457509273330e-04 */
> -        .quad 0x3F0244DCD723230A  /* A03 = +3.484534217219031730379e-05 */
> -        .quad 0x3FEFD245E2366A43  /* A00 = +9.944180887426415926811e-01 */
> -        .quad 0x3F653D82EC088433  /* A01 = +2.592807490387838333795e-03 */
> -        .quad 0xBF3A7DF75E013CB8  /* A02 = -4.042366908878036561859e-04 */
> -        .quad 0x3EF6298E69F991CD  /* A03 = +2.113564425911141559972e-05 */
> -        .quad 0x3FEFE0EAA508BC69  /* A00 = +9.962056372950317539861e-01 */
> -        .quad 0x3F5BD0771AF3FDDA  /* A01 = +1.697651208644282514598e-03 */
> -        .quad 0xBF30B2E1254DE571  /* A02 = -2.548026725928887099328e-04 */
> -        .quad 0x3EEAE28B70EC0256  /* A03 = +1.281973848454955042307e-05 */
> -        .quad 0x3FEFEAF5303D7F96  /* A00 = +9.974313680831865536192e-01 */
> -        .quad 0x3F5229111365657E  /* A01 = +1.108423877289460134782e-03 */
> -        .quad 0xBF250572D04DFE66  /* A02 = -1.603796628408704519168e-04 */
> -        .quad 0x3EE04E89BB57C981  /* A03 = +7.775682983689149966743e-06 */
> -        .quad 0x3FEFF1CF52F1CF44  /* A00 = +9.982678051005469122003e-01 */
> -        .quad 0x3F47A71316147CEB  /* A01 = +7.218211359577819110842e-04 */
> -        .quad 0xBF1A6D7604055719  /* A02 = -1.008132248946049582547e-04 */
> -        .quad 0x3ED3C8047586A85C  /* A03 = +4.716233739913014633626e-06 */
> -        .quad 0x3FEFF6770369EF69  /* A00 = +9.988360468555416149528e-01 */
> -        .quad 0x3F3EBB261180FBF0  /* A01 = +4.689186039321105101130e-04 */
> -        .quad 0xBF1097754FE19D7F  /* A02 = -6.329206004950480057066e-05 */
> -        .quad 0x3EC7FEFF83BCA0A7  /* A03 = +2.860556404988488738366e-06 */
> -        .quad 0x3FEFF99D42371AC4  /* A00 = +9.992204945818561334647e-01 */
> -        .quad 0x3F33EB2AEC271F59  /* A01 = +3.039340773764907474054e-04 */
> -        .quad 0xBF04CF18E0FC0D79  /* A02 = -3.968996690952969588805e-05 */
> -        .quad 0x3EBD1BDBD6019BE9  /* A03 = +1.735021065507727833886e-06 */
> -        .quad 0x3FEFFBBCA32B0D91  /* A00 = +9.994795977476532700123e-01 */
> -        .quad 0x3F29C41E1615110A  /* A01 = +1.965796209707565346710e-04 */
> -        .quad 0xBEFA11F93D9DCB5A  /* A02 = -2.486248909101414873235e-05 */
> -        .quad 0x3EB1A7CA4546F7A7  /* A03 = +1.052345642723709228769e-06 */
> -        .quad 0x3FEFFD298B8E8DE2  /* A00 = +9.996535993308806045121e-01 */
> -        .quad 0x3F20A1C42D523C5B  /* A01 = +1.268913244172078754520e-04 */
> -        .quad 0xBEF0507A364AFAE4  /* A02 = -1.555859070622834605755e-05 */
> -        .quad 0x3EA56ACA17E7CDF4  /* A03 = +6.382806956848098872313e-07 */
> -        .quad 0x3FEFFE1DC82BA5A3  /* A00 = +9.997700604991915929176e-01 */
> -        .quad 0x3F156E73B90F1769  /* A01 = +8.175450626798714452801e-05 */
> -        .quad 0xBEE4663579D0A09F  /* A02 = -9.727122057226747625365e-06 */
> -        .quad 0x3E99FAF6FEC5D4C1  /* A03 = +3.871371052824002996020e-07 */
> -        .quad 0x3FEFFEF8D0BB5E81  /* A00 = +9.998745037837154514548e-01 */
> -        .quad 0x3F06686DA18D39C3  /* A01 = +4.273972098777251447726e-05 */
> -        .quad 0xBED46BC298073E90  /* A02 = -4.868731025855742842491e-06 */
> -        .quad 0x3E88E42286B9D0FD  /* A03 = +1.854535328530838170114e-07 */
> -        .quad 0x3FEFFF8DBC68DDC7  /* A00 = +9.999455146670975791423e-01 */
> -        .quad 0x3EF26B2953A80AF0  /* A01 = +1.756534514108903368909e-05 */
> -        .quad 0xBEBFC4472D580F83  /* A02 = -1.893443529411295465239e-06 */
> -        .quad 0x3E72505B4553D19F  /* A03 = +6.822456673547912277047e-08 */
> -        .quad 0x3FEFFFCED1276609  /* A00 = +9.999765477215883935358e-01 */
> -        .quad 0x3EDE1A94C7CC58F5  /* A01 = +7.177313020153979672606e-06 */
> -        .quad 0xBEA8A2C988744E57  /* A02 = -7.342066660497443762363e-07 */
> -        .quad 0x3E5AF30036BBBAF4  /* A03 = +2.509841882843541084885e-08 */
> -        .quad 0x3FEFFFEAFE70FCFC  /* A00 = +9.999899835164849370983e-01 */
> -        .quad 0x3EC879175E3549F5  /* A01 = +2.917410471128503564412e-06 */
> -        .quad 0xBE930E36677D1813  /* A02 = -2.839493400307523115929e-07 */
> -        .quad 0x3E43D4005B42D48F  /* A03 = +9.233192745401904898013e-09 */
> -        .quad 0x3ff0000000000000
> -        .quad 0x0000000000000000
> -        .quad 0x0000000000000000
> -        .quad 0x0000000000000000
> -        .align 16
> -        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000           /* _sSignMask        */
> -        .align 16
> -        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff           /* _sAbsMask         */
> -        .align 16
> -        .long 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000           /* _iExpMantMask     */
> -        .align 16
> -        .long 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000           /* _iExpMask         */
> -        .align 16
> -        .long 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000           /* _iMinIdxOfsMask   */
> -        .align 16
> -        .long 0x04280000, 0x04280000, 0x04280000, 0x04280000           /* _iMaxIdxMask      */
> -        .align 16
> -        .type  __svml_stanh_data_internal,@object
> -        .size  __svml_stanh_data_internal,.-__svml_stanh_data_internal
> +       movups  (%rsi, %rax), %xmm2
> +       movups  (%rdi, %rax), %xmm7
> +
> +       movaps  %xmm2, %xmm3
> +
> +       unpckhpd %xmm7, %xmm2
> +       movlhps %xmm7, %xmm3
> +
> +       addpd   %xmm13, %xmm2
> +
> +       mulpd   %xmm5, %xmm6
> +       addpd   %xmm4, %xmm6
> +
> +       mulpd   %xmm2, %xmm0
> +       addpd   %xmm3, %xmm0
> +
> +       cvtpd2ps %xmm0, %xmm2
> +       cvtpd2ps %xmm6, %xmm0
> +
> +       movlhps %xmm2, %xmm0
> +       andnps  %xmm12, %xmm1
> +       orps    %xmm1, %xmm0
> +
> +       movmskps %xmm8, %edx
> +       testl   %edx, %edx
> +
> +       /* Go to special inputs processing branch.  */
> +       jne     L(SPECIAL_VALUES_BRANCH)
> +
> +       /* No stack restoration on the fastpath.  */
> +       ret
> +
> +L(SPECIAL_VALUES_BRANCH):
> +       subq    $48, %rsp
> +
> +       movups  %xmm0, (%rsp)
> +       movups  %xmm12, 16(%rsp)
> +
> +       movq    %r12, 32(%rsp)
> +       movq    %r13, 40(%rsp)
> +
> +       /* edx has 1s where there was a special value that needs to be handled
> +          by a tanhf call.  */
> +       movl    %edx, %r13d
> +L(SPECIAL_VALUES_LOOP):
> +       /* use r12 as index for special value that is saved across calls to
> +          tanhf. We technically don't need a callee save register here as offset
> +          to rsp is always [0, 12] so we can restore rsp by realigning to 64.
> +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> +          in the loop.  */
> +       xorl    %r12d, %r12d
> +       bsfl    %r13d, %r12d
> +
> +       /* Scalar math function call to process special input.  */
> +       movss   16(%rsp, %r12, 4), %xmm0
> +       call    tanhf@PLT
> +       /* No good way to avoid the store-forwarding fault this will cause on
> +          return. `lfence` avoids the SF fault but at greater cost as it
> +          serializes stack/callee save restoration.  */
> +       movss   %xmm0, (%rsp, %r12, 4)
> +
> +       leal    -1(%r13), %eax
> +       andl    %eax, %r13d
> +       jnz     L(SPECIAL_VALUES_LOOP)
> +
> +       /* All results have been written to (%rsp).  */
> +       movups  (%rsp), %xmm0
> +       movq    32(%rsp), %r12
> +       movq    40(%rsp), %r13
> +       addq    $48, %rsp
> +       ret
> +END(_ZGVbN4v_tanhf_sse4)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
> index 3745db5aa4..90c3ea4cc6 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
> @@ -70,775 +70,171 @@
>   *
>   */
>
> -/* Offsets for data table __svml_stanh_data_internal
> - */
> -#define _dbP                           0
> -#define _sSignMask                     4288
> -#define _sAbsMask                      4320
> -#define _iExpMantMask                  4352
> -#define _iExpMask                      4384
> -#define _iMinIdxOfsMask                4416
> -#define _iMaxIdxMask                   4448
> -
>  #include <sysdep.h>
> +#include "svml_s_tanhf_rodata.S"
>
>          .text
>         .section .text.avx2,"ax",@progbits
>  ENTRY(_ZGVdN8v_tanhf_avx2)
> -        pushq     %rbp
> -        cfi_def_cfa_offset(16)
> -        movq      %rsp, %rbp
> -        cfi_def_cfa(6, 16)
> -        cfi_offset(6, -16)
> -        andq      $-32, %rsp
> -        pushq     %r12
> -        subq      $120, %rsp
> -        lea       _dbP+16+__svml_stanh_data_internal(%rip), %r10
> -        vmovaps   %ymm0, %ymm12
> -
> -/* Here huge arguments, INF and NaNs are filtered out to callout. */
> -        vpand     _iExpMantMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm14
> +       /* Here huge arguments, INF and NaNs are filtered out to callout.  */
> +       vpand   TANHF_DATA(_iExpMantMask)(%rip), %ymm0, %ymm4
> +       vpsubd  TANHF_DATA(_iMinIdxOfsMask)(%rip), %ymm4, %ymm2
>
> -/*
> - *  small table specific variables *
> - *  Constant loading
> - */
> -        vmovups   _iMaxIdxMask+__svml_stanh_data_internal(%rip), %ymm8
> -        vpsubd    _iMinIdxOfsMask+__svml_stanh_data_internal(%rip), %ymm14, %ymm9
> -
> -/* if VMIN, VMAX is defined for I type */
> -        vxorps    %ymm15, %ymm15, %ymm15
> -        vpcmpgtd  %ymm15, %ymm9, %ymm0
> -        vpand     %ymm0, %ymm9, %ymm7
> -        vpcmpgtd  %ymm8, %ymm9, %ymm6
> -        vblendvps %ymm6, %ymm8, %ymm7, %ymm3
> -        vpsrld    $14, %ymm3, %ymm1
> -        vpcmpgtd  _iExpMask+__svml_stanh_data_internal(%rip), %ymm14, %ymm13
> -        vmovmskps %ymm13, %r11d
> -        vandps    _sAbsMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm10
> -        vandps    _sSignMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm11
> -        vextractf128 $1, %ymm1, %xmm2
> -        vmovd     %xmm1, %r9d
> -        vmovd     %xmm2, %ecx
> -        vpextrd   $1, %xmm2, %edx
> -        vpextrd   $1, %xmm1, %r8d
> -        movslq    %r9d, %r9
> -        movslq    %edx, %rdx
> -        movslq    %r8d, %r8
> -        vpextrd   $2, %xmm1, %edi
> -        movslq    %ecx, %rcx
> -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus)  */
> -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
> -        vpextrd   $3, %xmm2, %r12d
> -        vpextrd   $3, %xmm1, %esi
> -        vpextrd   $2, %xmm2, %eax
> -        movslq    %edi, %rdi
> -        movslq    %r12d, %r12
> -        movslq    %esi, %rsi
> -        movslq    %eax, %rax
> -        vmovupd   -16(%r9,%r10), %xmm5
> -        vmovupd   -16(%rdx,%r10), %xmm14
> -        vmovupd   -16(%rcx,%r10), %xmm13
> -        vmovupd   (%r9,%r10), %xmm1
> -        vmovupd   (%r8,%r10), %xmm2
> -        vmovupd   -16(%r8,%r10), %xmm4
> -        vinsertf128 $1, -16(%rdi,%r10), %ymm5, %ymm15
> -        vinsertf128 $1, -16(%r12,%r10), %ymm14, %ymm3
> -        vinsertf128 $1, -16(%rax,%r10), %ymm13, %ymm6
> -        vinsertf128 $1, (%rdi,%r10), %ymm1, %ymm5
> -        vinsertf128 $1, (%rsi,%r10), %ymm2, %ymm14
> -        vunpcklpd %ymm3, %ymm6, %ymm8
> -        vunpckhpd %ymm3, %ymm6, %ymm6
> -        vunpcklpd %ymm14, %ymm5, %ymm3
> -        vunpckhpd %ymm14, %ymm5, %ymm2
> -        vmovupd   (%rcx,%r10), %xmm13
> -        vcvtps2pd %xmm10, %ymm5
> -        vextractf128 $1, %ymm10, %xmm10
> -        vfmadd213pd %ymm3, %ymm5, %ymm2
> -        vinsertf128 $1, -16(%rsi,%r10), %ymm4, %ymm0
> -        vmovupd   (%rdx,%r10), %xmm4
> -        vunpcklpd %ymm0, %ymm15, %ymm9
> -        vunpckhpd %ymm0, %ymm15, %ymm7
> -        vfmadd213pd %ymm7, %ymm5, %ymm2
> -        vfmadd213pd %ymm9, %ymm5, %ymm2
> -        vinsertf128 $1, (%r12,%r10), %ymm4, %ymm0
> -        vcvtps2pd %xmm10, %ymm4
> -        vinsertf128 $1, (%rax,%r10), %ymm13, %ymm15
> -        vunpcklpd %ymm0, %ymm15, %ymm1
> -        vunpckhpd %ymm0, %ymm15, %ymm0
> -        vfmadd213pd %ymm1, %ymm4, %ymm0
> -        vcvtpd2ps %ymm2, %xmm1
> -        vfmadd213pd %ymm6, %ymm4, %ymm0
> -        vfmadd213pd %ymm8, %ymm4, %ymm0
> -        vcvtpd2ps %ymm0, %xmm0
> -        vinsertf128 $1, %xmm0, %ymm1, %ymm2
> -        vorps     %ymm11, %ymm2, %ymm0
> -        testl     %r11d, %r11d
> -
> -/* Go to special inputs processing branch */
> -        jne       L(SPECIAL_VALUES_BRANCH)
> -                                # LOE rbx r13 r14 r15 r11d ymm0 ymm12
> -
> -/* Restore registers
> - * and exit the function
> - */
> +       /* Selection of arguments between [0, 0x04280000] into ymm2.  */
> +       vpxor   %ymm3, %ymm3, %ymm3
> +       vpmaxsd %ymm3, %ymm2, %ymm2
> +       vpminsd TANHF_DATA(_iMaxIdxMask)(%rip), %ymm2, %ymm2
>
> -L(EXIT):
> -        addq      $120, %rsp
> -        cfi_restore(12)
> -        popq      %r12
> -        movq      %rbp, %rsp
> -        popq      %rbp
> -        cfi_def_cfa(7, 8)
> -        cfi_restore(6)
> -        ret
> -        cfi_def_cfa(6, 16)
> -        cfi_offset(6, -16)
> -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus)  */
> -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
> -
> -/* Branch to process
> - * special inputs
> - */
> +       vpsrld  $14, %ymm2, %ymm1
>
> -L(SPECIAL_VALUES_BRANCH):
> -        vmovups   %ymm12, 32(%rsp)
> -        vmovups   %ymm0, 64(%rsp)
> -                                # LOE rbx r13 r14 r15 r11d ymm0
> -
> -        xorl      %r12d, %r12d
> -                                # LOE rbx r13 r14 r15 r11d r12d
> -
> -        vzeroupper
> -        movq      %r13, 8(%rsp)
> -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
> -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
> -        movl      %r11d, %r13d
> -        movq      %r14, (%rsp)
> -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
> -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
> -                                # LOE rbx r15 r12d r13d
> -
> -/* Range mask
> - * bits check
> - */
> +       /* Store special cases in ymm15.  */
> +       vpcmpgtd TANHF_DATA(_iExpMask)(%rip), %ymm4, %ymm15
>
> -L(RANGEMASK_CHECK):
> -        btl       %r12d, %r13d
>
> -/* Call scalar math function */
> -        jc        L(SCALAR_MATH_CALL)
> -                                # LOE rbx r15 r12d r13d
> +       /* Store base of lookup table in rax.  */
> +       leaq    TANHF_DATA(_lookupTable)(%rip), %rax
>
> -/* Special inputs
> - * processing loop
> - */
> +       /* We are splitting ymm1 into 8 GPRs. This may be faster to do with
> +          store/load as we can take advantage of store-forwarding.  */
> +       vmovq   %xmm1, %r8
> +       /* We have eliminated all negative values for ymm1 so no need to sign
> +          extend.  */
> +       movl    %r8d, %r9d
> +       shrq    $32, %r8
>
> -L(SPECIAL_VALUES_LOOP):
> -        incl      %r12d
> -        cmpl      $8, %r12d
> -
> -/* Check bits in range mask */
> -        jl        L(RANGEMASK_CHECK)
> -                                # LOE rbx r15 r12d r13d
> -
> -        movq      8(%rsp), %r13
> -        cfi_restore(13)
> -        movq      (%rsp), %r14
> -        cfi_restore(14)
> -        vmovups   64(%rsp), %ymm0
> -
> -/* Go to exit */
> -        jmp       L(EXIT)
> -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
> -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
> -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
> -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
> -                                # LOE rbx r13 r14 r15 ymm0
> -
> -/* Scalar math fucntion call
> - * to process special input
> - */
> +       /* Instead of using cross-lane permutes on ymm vectors, use vinsertf128
> +          with memory operand. This helps alleviate bottleneck on p5.  */
> +       vmovdqu 16(%r9, %rax), %xmm5
>
> -L(SCALAR_MATH_CALL):
> -        movl      %r12d, %r14d
> -        movss     32(%rsp,%r14,4), %xmm0
> -        call      tanhf@PLT
> -                                # LOE rbx r14 r15 r12d r13d xmm0
> +       vpextrq $1, %xmm1, %rsi
> +       movl    %esi, %edi
> +       shrq    $32, %rsi
>
> -        movss     %xmm0, 64(%rsp,%r14,4)
> +       vinsertf128 $1, 16(%rdi, %rax), %ymm5, %ymm5
>
> -/* Process special inputs in loop */
> -        jmp       L(SPECIAL_VALUES_LOOP)
> -                                # LOE rbx r15 r12d r13d
> -END(_ZGVdN8v_tanhf_avx2)
> +       vextracti128 $1, %ymm1, %xmm2
> +       vmovq   %xmm2, %rdx
> +       movl    %edx, %ecx
> +       shrq    $32, %rdx
> +
> +       vmovdqu (%rcx, %rax), %xmm6
> +
> +       vpextrq $1, %xmm2, %r10
> +       movl    %r10d, %r11d
> +       shrq    $32, %r10
> +
> +       vinsertf128 $1, (%r11, %rax), %ymm6, %ymm6
> +
> +       vmovupd 16(%r8, %rax), %xmm1
> +       vinsertf128 $1, 16(%rsi, %rax), %ymm1, %ymm1
> +       vmovupd (%rdx, %rax), %xmm3
> +       vinsertf128 $1, (%r10, %rax), %ymm3, %ymm3
> +
> +       vunpcklpd %ymm3, %ymm6, %ymm7
> +       vunpckhpd %ymm3, %ymm6, %ymm6
> +
> +       vunpcklpd %ymm1, %ymm5, %ymm3
> +       vunpckhpd %ymm1, %ymm5, %ymm1
> +
> +       vmovaps TANHF_DATA(_sAbsMask)(%rip), %ymm11
> +       vandps  %ymm11, %ymm0, %ymm4
>
> -        .section .rodata, "a"
> -        .align 32
> -
> -#ifdef __svml_stanh_data_internal_typedef
> -typedef unsigned int VUINT32;
> -typedef struct
> -{
> -        __declspec(align(32)) VUINT32 _dbP[(134*4)][2];
> -        __declspec(align(32)) VUINT32 _sSignMask[8][1];
> -        __declspec(align(32)) VUINT32 _sAbsMask[8][1];
> -        __declspec(align(32)) VUINT32 _iExpMantMask[8][1];
> -        __declspec(align(32)) VUINT32 _iExpMask[8][1];
> -        __declspec(align(32)) VUINT32 _iMinIdxOfsMask[8][1];
> -        __declspec(align(32)) VUINT32 _iMaxIdxMask[8][1];
> -} __svml_stanh_data_internal;
> -#endif
> -__svml_stanh_data_internal:
> -        /* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500]. */
> -        .quad 0x0000000000000000  /* A00 = +0.000000000000000000000e-01 */
> -        .quad 0x3FF00000022C70EB  /* A01 = +1.000000008097283510367e+00 */
> -        .quad 0xBED00E878CFFA194  /* A02 = -3.828228912518614443549e-06 */
> -        .quad 0xBFD551766D0607A9  /* A03 = -3.330970825846813476723e-01 */
> -        .quad 0xBE53D60CE3E4C297  /* A00 = -1.847383956330407336230e-08 */
> -        .quad 0x3FF000024177CF5C  /* A01 = +1.000002151235967140508e+00 */
> -        .quad 0xBF1758BC94A51A25  /* A02 = -8.906031613262943753568e-05 */
> -        .quad 0xBFD53EAE67E0D4F0  /* A03 = -3.319507612644221339337e-01 */
> -        .quad 0xBE5A9E47EF32D6FE  /* A00 = -2.479020984039698285657e-08 */
> -        .quad 0x3FF00002DA983057  /* A01 = +1.000002721676556793895e+00 */
> -        .quad 0xBF1BD953509E94AA  /* A02 = -1.062352277175377670507e-04 */
> -        .quad 0xBFD53BDB562EEDD5  /* A03 = -3.317783681520414806876e-01 */
> -        .quad 0xBE6191BBE496D294  /* A00 = -3.272532162914017685901e-08 */
> -        .quad 0x3FF0000390492017  /* A01 = +1.000003398528866105366e+00 */
> -        .quad 0xBF20727E814A57CE  /* A02 = -1.254825043772153972919e-04 */
> -        .quad 0xBFD538DE060A6F22  /* A03 = -3.315959033004550748913e-01 */
> -        .quad 0xBE66DAFA2A893A25  /* A00 = -4.257146219278012568149e-08 */
> -        .quad 0x3FF0000465E08CD1  /* A01 = +1.000004194219219266770e+00 */
> -        .quad 0xBF2341C765EF91B6  /* A02 = -1.469188600530365522261e-04 */
> -        .quad 0xBFD535B6841FAF9E  /* A03 = -3.314033785124993469751e-01 */
> -        .quad 0xBE6D5794E361E964  /* A00 = -5.465394929765249413434e-08 */
> -        .quad 0x3FF000055EE2A0CB  /* A01 = +1.000005121846742950353e+00 */
> -        .quad 0xBF265E6C77E66C8B  /* A02 = -1.706607253709506650304e-04 */
> -        .quad 0xBFD53264DDCCEDA6  /* A03 = -3.312008062382240103361e-01 */
> -        .quad 0xBE729C844D374A6E  /* A00 = -6.933284462462096107184e-08 */
> -        .quad 0x3FF000067F019093  /* A01 = +1.000006195180536350264e+00 */
> -        .quad 0xBF29CC5348D6DCE5  /* A02 = -1.968242326435338705130e-04 */
> -        .quad 0xBFD52EE92121ED35  /* A03 = -3.309881995734998416658e-01 */
> -        .quad 0xBE775AEA17EAA872  /* A00 = -8.700465590574974405858e-08 */
> -        .quad 0x3FF00007CA1D66B8  /* A01 = +1.000007428656699559610e+00 */
> -        .quad 0xBF2D8F5EB98A2637  /* A02 = -2.255252009216044881395e-04 */
> -        .quad 0xBFD52B435CDF9128  /* A03 = -3.307655722585587376727e-01 */
> -        .quad 0xBE7D04DA28C343F0  /* A00 = -1.081040272327705484794e-07 */
> -        .quad 0x3FF000094443CCF5  /* A01 = +1.000008837375216730337e+00 */
> -        .quad 0xBF30D5B76C947AE5  /* A02 = -2.568791210978817814332e-04 */
> -        .quad 0xBFD52773A0776FAD  /* A03 = -3.305329386764651045105e-01 */
> -        .quad 0xBE81DD77A12C51C7  /* A00 = -1.331054169875768625701e-07 */
> -        .quad 0x3FF0000AF1AFD2DA  /* A01 = +1.000010437096696680470e+00 */
> -        .quad 0xBF331230624C1680  /* A02 = -2.910011410651516805537e-04 */
> -        .quad 0xBFD52379FC0B61DF  /* A03 = -3.302903138515186909352e-01 */
> -        .quad 0xBE85D04EEEB3C435  /* A00 = -1.625247628488202841012e-07 */
> -        .quad 0x3FF0000CD6C9B1F2  /* A01 = +1.000012244238970726684e+00 */
> -        .quad 0xBF357F0742FADDD4  /* A02 = -3.280060509313874068243e-04 */
> -        .quad 0xBFD51F56806D0E81  /* A03 = -3.300377134475880880338e-01 */
> -        .quad 0xBE8A6E289B59681B  /* A00 = -1.969211333326924655065e-07 */
> -        .quad 0x3FF0000EF8268F72  /* A01 = +1.000014275873550406715e+00 */
> -        .quad 0xBF381E277A1B747A  /* A02 = -3.680082682942575423093e-04 */
> -        .quad 0xBFD51B093F1D6FD4  /* A03 = -3.297751537663746734808e-01 */
> -        .quad 0xBE8FCBC40EE9ABD5  /* A00 = -2.368983653301529373887e-07 */
> -        .quad 0x3FF000115A883B6C  /* A01 = +1.000016549721943981410e+00 */
> -        .quad 0xBF3AF17AC974B3D9  /* A02 = -4.111218235774406434303e-04 */
> -        .quad 0xBFD516924A4C549C  /* A03 = -3.295026517456081105450e-01 */
> -        .quad 0xBE92FFBC60A3F956  /* A00 = -2.831066871072026054144e-07 */
> -        .quad 0x3FF0001402DCED8A  /* A01 = +1.000019084151832604590e+00 */
> -        .quad 0xBF3DFAE9390C4801  /* A02 = -4.574603454311488280083e-04 */
> -        .quad 0xBFD511F1B4D7DC3A  /* A03 = -3.292202249571719585575e-01 */
> -        .quad 0xBE9690A22F96D5AD  /* A00 = -3.362443262393081632612e-07 */
> -        .quad 0x3FF00016F63EFF5D  /* A01 = +1.000021898173108825247e+00 */
> -        .quad 0xBF409E2C839605BB  /* A02 = -5.071370461992499986334e-04 */
> -        .quad 0xBFD50D27924BEE00  /* A03 = -3.289278916051614487515e-01 */
> -        .quad 0xBE9AA56C65E72A73  /* A00 = -3.970591019557469835586e-07 */
> -        .quad 0x3FF0001A39F4A43E  /* A01 = +1.000025011433776978009e+00 */
> -        .quad 0xBF425BD74C3D6667  /* A02 = -5.602647074553602319844e-04 */
> -        .quad 0xBFD50833F6E1ABA2  /* A03 = -3.286256705238718156536e-01 */
> -        .quad 0xBE9F4BD4FF1A83B0  /* A00 = -4.663500013744687071912e-07 */
> -        .quad 0x3FF0001DD36F9EC2  /* A01 = +1.000028444215715683896e+00 */
> -        .quad 0xBF44376634149405  /* A02 = -6.169556656102642569831e-04 */
> -        .quad 0xBFD50316F77EDEE5  /* A03 = -3.283135811757190158922e-01 */
> -        .quad 0xBEA3B625387BB079  /* A00 = -5.874486399249461304297e-07 */
> -        .quad 0x3FF00023E14CFBA9  /* A01 = +1.000034217911642153709e+00 */
> -        .quad 0xBF47392F923218D2  /* A02 = -7.087213783883111826306e-04 */
> -        .quad 0xBFD4FB1FACDEB938  /* A03 = -3.278273761924483942209e-01 */
> -        .quad 0xBEAA6E24F543500A  /* A00 = -7.876828740601738750574e-07 */
> -        .quad 0x3FF0002D5C6E8412  /* A01 = +1.000043259679163742959e+00 */
> -        .quad 0xBF4BAF02BD7FDD70  /* A02 = -8.448375110664940040861e-04 */
> -        .quad 0xBFD4EFEE6527A7DE  /* A03 = -3.271442401734229177279e-01 */
> -        .quad 0xBEB16E3EBE2157D0  /* A00 = -1.038947396133402500647e-06 */
> -        .quad 0x3FF00038990FEE2F  /* A01 = +1.000053975962952312884e+00 */
> -        .quad 0xBF50569481C574CB  /* A02 = -9.972048056490652716971e-04 */
> -        .quad 0xBFD4E419278DA2B4  /* A03 = -3.264220129263251113372e-01 */
> -        .quad 0xBEB6A7B6723165D4  /* A00 = -1.350350836279403750524e-06 */
> -        .quad 0x3FF00045CAB4158E  /* A01 = +1.000066558657042303793e+00 */
> -        .quad 0xBF531D7C9C849108  /* A02 = -1.166698160951775212202e-03 */
> -        .quad 0xBFD4D7A0BB33B152  /* A03 = -3.256608799117844954552e-01 */
> -        .quad 0xBEBD0EE2A8654AFD  /* A00 = -1.732000471561702711532e-06 */
> -        .quad 0x3FF00055276F18D6  /* A01 = +1.000081209219890521211e+00 */
> -        .quad 0xBF562FDBA3FB6C6C  /* A02 = -1.354183666925102939860e-03 */
> -        .quad 0xBFD4CA85F1B93DB2  /* A03 = -3.248610363561638125773e-01 */
> -        .quad 0xBEC269D4036A207E  /* A00 = -2.195047297096822741730e-06 */
> -        .quad 0x3FF00066E7DA6E4E  /* A01 = +1.000098138500919997540e+00 */
> -        .quad 0xBF5991499FC36B3A  /* A02 = -1.560518167983372759405e-03 */
> -        .quad 0xBFD4BCC9A72283D6  /* A03 = -3.240226871658341556426e-01 */
> -        .quad 0xBEC7154B6C09CFE1  /* A00 = -2.751729738565190291276e-06 */
> -        .quad 0x3FF0007B47086B80  /* A01 = +1.000117566559055148900e+00 */
> -        .quad 0xBF5D455433B4F8F4  /* A02 = -1.786548832412968197680e-03 */
> -        .quad 0xBFD4AE6CC1BFE145  /* A03 = -3.231460468373550942722e-01 */
> -        .quad 0xBECCA68CC64A0F8A  /* A00 = -3.415415948561670285790e-06 */
> -        .quad 0x3FF00092827742F7  /* A01 = +1.000139722473418535387e+00 */
> -        .quad 0xBF60A7BF15A527AF  /* A02 = -2.033112728132522705610e-03 */
> -        .quad 0xBFD49F703214084C  /* A03 = -3.222313393636155876010e-01 */
> -        .quad 0xBED19E68676B241B  /* A00 = -4.200644630977303616698e-06 */
> -        .quad 0x3FF000ACDA037B26  /* A01 = +1.000164844146362863597e+00 */
> -        .quad 0xBF62D99F836A02F8  /* A02 = -2.301036405072284102280e-03 */
> -        .quad 0xBFD48FD4F2B91B28  /* A03 = -3.212787981359945810311e-01 */
> -        .quad 0xBED57CF4B0C7AA54  /* A00 = -5.123164339408145209103e-06 */
> -        .quad 0x3FF000CA8FD9E1A1  /* A01 = +1.000193178099017865534e+00 */
> -        .quad 0xBF653A014548E686  /* A02 = -2.591135484433962181405e-03 */
> -        .quad 0xBFD47F9C0844B38F  /* A03 = -3.202886658426046806447e-01 */
> -        .quad 0xBEDA012B1B1A41E2  /* A00 = -6.199971197454598722328e-06 */
> -        .quad 0x3FF000EBE868FDF4  /* A01 = +1.000224979259539459520e+00 */
> -        .quad 0xBF67CA9427E0A544  /* A02 = -2.904214255086275467410e-03 */
> -        .quad 0xBFD46EC6812ADB37  /* A03 = -3.192611943626845749655e-01 */
> -        .quad 0xBEDF3EAC5BF12194  /* A00 = -7.449344990702664567927e-06 */
> -        .quad 0x3FF001112A520784  /* A01 = +1.000260510744255704196e+00 */
> -        .quad 0xBF6A8D01ABDA4DC4  /* A02 = -3.241065277345108255891e-03 */
> -        .quad 0xBFD45D55759FFA4A  /* A03 = -3.181966446572103146551e-01 */
> -        .quad 0xBEE2A541BC274267  /* A00 = -8.890883582164319970972e-06 */
> -        .quad 0x3FF0013A9E5961F2  /* A01 = +1.000300043631906721231e+00 */
> -        .quad 0xBF6D82ECD080C540  /* A02 = -3.602468994380686462264e-03 */
> -        .quad 0xBFD44B4A0779C0AD  /* A03 = -3.170952866557950611259e-01 */
> -        .quad 0xBEE61D97609A27F4  /* A00 = -1.054553560499505625520e-05 */
> -        .quad 0x3FF001688F56A3AF  /* A01 = +1.000343856731187974773e+00 */
> -        .quad 0xBF7056F8EFB683EC  /* A02 = -3.989193351487490407647e-03 */
> -        .quad 0xBFD438A5620F0F74  /* A03 = -3.159573991399533543500e-01 */
> -        .quad 0xBEEA145429EDD370  /* A00 = -1.243563138839952927732e-05 */
> -        .quad 0x3FF0019B4A242A67  /* A01 = +1.000392236341804297339e+00 */
> -        .quad 0xBF7207D31CA78D9B  /* A02 = -4.401993423445739288258e-03 */
> -        .quad 0xBFD42568BA16E7CD  /* A03 = -3.147832696228050619602e-01 */
> -        .quad 0xBEEE96370D52680F  /* A00 = -1.458491207477835326165e-05 */
> -        .quad 0x3FF001D31D8E4115  /* A01 = +1.000445476009251821736e+00 */
> -        .quad 0xBF73D4CC11EDC094  /* A02 = -4.841611050196221316400e-03 */
> -        .quad 0xBFD411954D8664E7  /* A03 = -3.135731942252974469021e-01 */
> -        .quad 0xBEF338C046215EF8  /* A00 = -1.833122622260562810219e-05 */
> -        .quad 0x3FF00230C32C2EC1  /* A01 = +1.000534784691737621998e+00 */
> -        .quad 0xBF76BD019BCC5DAF  /* A02 = -5.551344188254799492943e-03 */
> -        .quad 0xBFD3F2C7156DC21E  /* A03 = -3.116929730668135389848e-01 */
> -        .quad 0xBEF9B15EAE411EAE  /* A00 = -2.450261207822986676092e-05 */
> -        .quad 0x3FF002C2DF057A4D  /* A01 = +1.000674124886830940184e+00 */
> -        .quad 0xBF7B08CCD9AC1E30  /* A02 = -6.600189396301511801646e-03 */
> -        .quad 0xBFD3C7A7A114FED8  /* A03 = -3.090609620157755976777e-01 */
> -        .quad 0xBF00E36483C373B3  /* A00 = -3.221178528332122595812e-05 */
> -        .quad 0x3FF0036F419480D7  /* A01 = +1.000838524028997644777e+00 */
> -        .quad 0xBF7FD255D1777007  /* A02 = -7.768950679260206403087e-03 */
> -        .quad 0xBFD39A453911D6CE  /* A03 = -3.062909180947429588215e-01 */
> -        .quad 0xBF05DFA04DD12059  /* A00 = -4.172046622180685472624e-05 */
> -        .quad 0x3FF00438B2A03D8D  /* A01 = +1.001030633695197069599e+00 */
> -        .quad 0xBF828F8DBB4A9D10  /* A02 = -9.062869337255224921890e-03 */
> -        .quad 0xBFD36AAB704697D9  /* A03 = -3.033856007044711255993e-01 */
> -        .quad 0xBF0BF3E0C647DEFB  /* A00 = -5.331544597092331081714e-05 */
> -        .quad 0x3FF005221063D36D  /* A01 = +1.001253189109060359741e+00 */
> -        .quad 0xBF857A2CB3C96102  /* A02 = -1.048693584122917590862e-02 */
> -        .quad 0xBFD338E65BBB4FEC  /* A03 = -3.003478904549854444639e-01 */
> -        .quad 0xBF11A506ED7C9D31  /* A00 = -6.730894835681591541979e-05 */
> -        .quad 0x3FF0062E4D0EA92A  /* A01 = +1.001508999829250345925e+00 */
> -        .quad 0xBF88AB82C2761AF3  /* A02 = -1.204588085125866091241e-02 */
> -        .quad 0xBFD305028D6BD206  /* A03 = -2.971807843271395688234e-01 */
> -        .quad 0xBF1607C0922D9BF1  /* A00 = -8.403885708006799337092e-05 */
> -        .quad 0x3FF007606C341961  /* A01 = +1.001800940198869449560e+00 */
> -        .quad 0xBF8C25E6DA487BCF  /* A02 = -1.374416688582682892494e-02 */
> -        .quad 0xBFD2CF0D0EE8F7B5  /* A03 = -2.938873906713255768075e-01 */
> -        .quad 0xBF1B3A8480A0A16D  /* A00 = -1.038688061788578038307e-04 */
> -        .quad 0x3FF008BB802D02D6  /* A01 = +1.002131939589323561535e+00 */
> -        .quad 0xBF8FEB8AE99FD100  /* A02 = -1.558598065819483124983e-02 */
> -        .quad 0xBFD297135BD0911B  /* A03 = -2.904709240558688843059e-01 */
> -        .quad 0xBF20ABB9BDB75C65  /* A00 = -1.271881327357976163798e-04 */
> -        .quad 0x3FF00A42A76D8CD1  /* A01 = +1.002504972472525901495e+00 */
> -        .quad 0xBF91FF3D752BB9E6  /* A02 = -1.757522609380570560722e-02 */
> -        .quad 0xBFD25D235C1F88B4  /* A03 = -2.869346999779154305799e-01 */
> -        .quad 0xBF243D3254425461  /* A00 = -1.544116913733432829448e-04 */
> -        .quad 0x3FF00BF909D1795E  /* A01 = +1.002923048355647051011e+00 */
> -        .quad 0xBF94304E04D44942  /* A02 = -1.971551804042204897316e-02 */
> -        .quad 0xBFD2214B5E61CFA6  /* A03 = -2.832821294498394371075e-01 */
> -        .quad 0xBF286070011B61CE  /* A00 = -1.859795307186510085994e-04 */
> -        .quad 0x3FF00DE1D5E1627E  /* A01 = +1.003389201612804537689e+00 */
> -        .quad 0xBF9689D5F4163F59  /* A02 = -2.201017668045266231780e-02 */
> -        .quad 0xBFD1E39A11C3B42C  /* A03 = -2.795167134743816728104e-01 */
> -        .quad 0xBF2D250B366A79E8  /* A00 = -2.223564326486314902259e-04 */
> -        .quad 0x3FF010003E134001  /* A01 = +1.003906481248123094829e+00 */
> -        .quad 0xBF990C9FF91F6F81  /* A02 = -2.446222265267250853271e-02 */
> -        .quad 0xBFD1A41E80084CDC  /* A03 = -2.756420374218586655246e-01 */
> -        .quad 0xBF314DB5DDC2A30E  /* A00 = -2.640313157465248123865e-04 */
> -        .quad 0x3FF012577608921B  /* A01 = +1.004477940624503018441e+00 */
> -        .quad 0xBF9BB9626875B0C9  /* A02 = -2.707437288829409385849e-02 */
> -        .quad 0xBFD162E80768A9D0  /* A03 = -2.716617653228725615122e-01 */
> -        .quad 0xBF346A6133808864  /* A00 = -3.115165050094957730625e-04 */
> -        .quad 0x3FF014EAAFCC88A3  /* A01 = +1.005106627192198898157e+00 */
> -        .quad 0xBF9E90BEF9BF7419  /* A02 = -2.984903716411588595059e-02 */
> -        .quad 0xBFD12006545F7FAD  /* A03 = -2.675796340899932457269e-01 */
> -        .quad 0xBF37F180DC3848EA  /* A00 = -3.653468704395550778821e-04 */
> -        .quad 0x3FF017BD19147861  /* A01 = +1.005795572250939295955e+00 */
> -        .quad 0xBFA0C9A14C702E07  /* A02 = -3.278831537326359207851e-02 */
> -        .quad 0xBFD0DB895B650092  /* A03 = -2.633994476818851682154e-01 */
> -        .quad 0xBF3BEC6AAC6D7635  /* A00 = -4.260788377246944457107e-04 */
> -        .quad 0x3FF01AD1D884E719  /* A01 = +1.006547780778822565040e+00 */
> -        .quad 0xBFA260B2A1B1434A  /* A02 = -3.589399551186163439542e-02 */
> -        .quad 0xBFD09581529E93D6  /* A03 = -2.591250712233067465817e-01 */
> -        .quad 0xBF4164E26167882B  /* A00 = -5.308251737086202562063e-04 */
> -        .quad 0x3FF01FEF14B62B81  /* A01 = +1.007796364693348545316e+00 */
> -        .quad 0xBFA4EB014538AA42  /* A02 = -4.085544557559163403315e-02 */
> -        .quad 0xBFD029D36FEAF41F  /* A03 = -2.525528519580024222613e-01 */
> -        .quad 0xBF46F6FFF4E53DC8  /* A00 = -7.008313930700277652464e-04 */
> -        .quad 0x3FF027CBB51CBBA0  /* A01 = +1.009715754956893363214e+00 */
> -        .quad 0xBFA89DEC9FEC112E  /* A02 = -4.807986690687680864098e-02 */
> -        .quad 0xBFCF2A99464D0DB4  /* A03 = -2.434875100390009317053e-01 */
> -        .quad 0xBF4DCC9C4F66A4D9  /* A00 = -9.094012482836712945103e-04 */
> -        .quad 0x3FF030E7CFCCD583  /* A01 = +1.011939822882909068014e+00 */
> -        .quad 0xBFACAA3B95814081  /* A02 = -5.598627281199331645611e-02 */
> -        .quad 0xBFCDF78F156BE7CF  /* A03 = -2.341173987004467604844e-01 */
> -        .quad 0xBF5308ED74E5C7A6  /* A00 = -1.161796466103906435435e-03 */
> -        .quad 0x3FF03B5986412ECB  /* A01 = +1.014489674026594512313e+00 */
> -        .quad 0xBFB087EBA88DCC3F  /* A02 = -6.457398285947223148806e-02 */
> -        .quad 0xBFCCBB9BD134862F  /* A03 = -2.244753619680052991736e-01 */
> -        .quad 0xBF57FA23C00DF4B5  /* A00 = -1.463446533505758208674e-03 */
> -        .quad 0x3FF0473558A1BCC0  /* A01 = +1.017384859292903342975e+00 */
> -        .quad 0xBFB2E702BC6360EF  /* A02 = -7.383744334527241048871e-02 */
> -        .quad 0xBFCB77D546379288  /* A03 = -2.145945160729250122955e-01 */
> -        .quad 0xBF5DD12971557F71  /* A00 = -1.819887610814388068450e-03 */
> -        .quad 0x3FF0548DDF5000A8  /* A01 = +1.020643112482540360020e+00 */
> -        .quad 0xBFB571B63DA186E1  /* A02 = -8.376635555898871710045e-02 */
> -        .quad 0xBFCA2D5202605148  /* A03 = -2.045080672838912594358e-01 */
> -        .quad 0xBF6252B1AD5D4F17  /* A00 = -2.236697221556737096709e-03 */
> -        .quad 0x3FF063738A910BF7  /* A01 = +1.024280110622155737232e+00 */
> -        .quad 0xBFB8270C8E6B601B  /* A02 = -9.434584118878357184013e-02 */
> -        .quad 0xBFC8DD27D950A07E  /* A03 = -1.942491351230763441116e-01 */
> -        .quad 0xBF66470C91730CFC  /* A00 = -2.719425723258004842786e-03 */
> -        .quad 0x3FF073F468FCF331  /* A01 = +1.028309259519300633556e+00 */
> -        .quad 0xBFBB05C2952191E4  /* A02 = -1.055566419686964629854e-01 */
> -        .quad 0xBFC7886A770DE2BD  /* A03 = -1.838505822486435070662e-01 */
> -        .quad 0xBF6AD114AC8E98EC  /* A00 = -3.273525599485007861467e-03 */
> -        .quad 0x3FF0861BF53E5226  /* A01 = +1.032741506559554434119e+00 */
> -        .quad 0xBFBE0C4F9B461507  /* A02 = -1.173753503881763554650e-01 */
> -        .quad 0xBFC6302A037CDE3A  /* A03 = -1.733448521642786954722e-01 */
> -        .quad 0xBF6FFBDE2A6C2AF8  /* A00 = -3.904279630096648551207e-03 */
> -        .quad 0x3FF099F2EB8E7DA3  /* A01 = +1.037585182326304034106e+00 */
> -        .quad 0xBFC09C74D192DDF0  /* A02 = -1.297746680554463516444e-01 */
> -        .quad 0xBFC4D571D8E3079F  /* A03 = -1.627638157861470424859e-01 */
> -        .quad 0xBF72E8FDC0B952AA  /* A00 = -4.616728994353872309042e-03 */
> -        .quad 0x3FF0AF7F273C9533  /* A01 = +1.042845872181101141152e+00 */
> -        .quad 0xBFC244C512736F10  /* A02 = -1.427236881344176033792e-01 */
> -        .quad 0xBFC379474F58B902  /* A03 = -1.521386277613104298645e-01 */
> -        .quad 0xBF762EABAF17395B  /* A00 = -5.415602341101023557701e-03 */
> -        .quad 0x3FF0C6C3886F63FB  /* A01 = +1.048526318502125631582e+00 */
> -        .quad 0xBFC3FDF9918EA12A  /* A02 = -1.561881981590514389957e-01 */
> -        .quad 0xBFC21CA89ECAB895  /* A03 = -1.414995932913753196036e-01 */
> -        .quad 0xBF79D387CE5B2BAE  /* A00 = -6.305246822828998107258e-03 */
> -        .quad 0x3FF0DFBFE2346376  /* A01 = +1.054626353847394337748e+00 */
> -        .quad 0xBFC5C6DA43602620  /* A02 = -1.701309994680721970894e-01 */
> -        .quad 0xBFC0C08BD8DB6631  /* A03 = -1.308760460731704100557e-01 */
> -        .quad 0xBF7DDBA8E8DA9060  /* A00 = -7.289562037531366334164e-03 */
> -        .quad 0x3FF0FA70F0D1B464  /* A01 = +1.061142864894713433443e+00 */
> -        .quad 0xBFC79E18D92BAA7C  /* A02 = -1.845122394946264732241e-01 */
> -        .quad 0xBFBECBBBF74C2669  /* A03 = -1.202962378266875381749e-01 */
> -        .quad 0xBF81254E76EA25DA  /* A00 = -8.371937755572145950511e-03 */
> -        .quad 0x3FF116D05835EBD0  /* A01 = +1.068069786618014660462e+00 */
> -        .quad 0xBFC982539E2ED224  /* A02 = -1.992897531869327609755e-01 */
> -        .quad 0xBFBC1B043C350159  /* A03 = -1.097872397413132278254e-01 */
> -        .quad 0xBF8391ACBA863403  /* A00 = -9.555196230190082448686e-03 */
> -        .quad 0x3FF134D4AA477FE2  /* A01 = +1.075398125794884141015e+00 */
> -        .quad 0xBFCB7218609FEAFB  /* A02 = -2.144194099235717521079e-01 */
> -        .quad 0xBFB970A16CB88329  /* A03 = -9.937485603633135211599e-02 */
> -        .quad 0xBF87935088E48E8B  /* A00 = -1.151144902957603431692e-02 */
> -        .quad 0x3FF1649892AD7DD3  /* A01 = +1.087059567413110938716e+00 */
> -        .quad 0xBFCE6971DDE75409  /* A02 = -2.375929196847723912089e-01 */
> -        .quad 0xBFB58291E88CB251  /* A03 = -8.402358939628952472223e-02 */
> -        .quad 0xBF8DB3A62C325325  /* A00 = -1.450280973794233242702e-02 */
> -        .quad 0x3FF1A9C900C6DEEA  /* A01 = +1.103951457056548068891e+00 */
> -        .quad 0xBFD13DBC65B0E08E  /* A02 = -2.693930619311765140012e-01 */
> -        .quad 0xBFB06696F62696D1  /* A03 = -6.406539449252625362252e-02 */
> -        .quad 0xBF92583699F2E27A  /* A00 = -1.791463198307716858659e-02 */
> -        .quad 0x3FF1F451B85AA9F0  /* A01 = +1.122148246892376022288e+00 */
> -        .quad 0xBFD34FD5F8288180  /* A02 = -3.017477916164565954205e-01 */
> -        .quad 0xBFA6FB692825B683  /* A03 = -4.488686194495718900788e-02 */
> -        .quad 0xBF9641C26E673D6F  /* A00 = -2.173522757385398448959e-02 */
> -        .quad 0x3FF24364DA5E2B07  /* A01 = +1.141453602790251542487e+00 */
> -        .quad 0xBFD564A5A5EF5890  /* A02 = -3.342680092295120530821e-01 */
> -        .quad 0xBF9B43712011A982  /* A03 = -2.662445791467283467968e-02 */
> -        .quad 0xBF9A901038EC2F39  /* A00 = -2.594018313816024226548e-02 */
> -        .quad 0x3FF2961356DFFEBA  /* A01 = +1.161639537196534011088e+00 */
> -        .quad 0xBFD775EBB17198C7  /* A02 = -3.665723069046972759644e-01 */
> -        .quad 0xBF833B1A926CD462  /* A03 = -9.390075295963199591975e-03 */
> -        .quad 0xBF9F396A6A461B91  /* A00 = -3.049246095317987084727e-02 */
> -        .quad 0x3FF2EB53BAEF534B  /* A01 = +1.182452898229899629357e+00 */
> -        .quad 0xBFD97DABF8AD8BBD  /* A02 = -3.982953957076310058660e-01 */
> -        .quad 0x3F7B8F6A3E0F8837  /* A03 = +6.728568086119371925713e-03 */
> -        .quad 0xBFA21878590F8BAA  /* A00 = -3.534294211546946951064e-02 */
> -        .quad 0x3FF34209790236E1  /* A01 = +1.203622315111197105253e+00 */
> -        .quad 0xBFDB764C0E71BECB  /* A02 = -4.290952817018306997277e-01 */
> -        .quad 0x3F962FE0C03F84C0  /* A03 = +2.166701482190513949888e-02 */
> -        .quad 0xBFA4B36B9AD27ECC  /* A00 = -4.043136849327097492868e-02 */
> -        .quad 0x3FF3990C5B12FC16  /* A01 = +1.224865298994477935679e+00 */
> -        .quad 0xBFDD5AABB0D01390  /* A02 = -4.586590983092770912322e-01 */
> -        .quad 0x3FA21DAF5CA162DB  /* A03 = +3.538272863142363083844e-02 */
> -        .quad 0xBFA7645E4D7BF28B  /* A00 = -4.568762489177399105378e-02 */
> -        .quad 0x3FF3EF2FD51C0D9F  /* A01 = +1.245895225962932562069e+00 */
> -        .quad 0xBFDF26377E1B686E  /* A02 = -4.867075664057044503963e-01 */
> -        .quad 0x3FA8803E756EE812  /* A03 = +4.785342391501513914509e-02 */
> -        .quad 0xBFAA210925C64413  /* A00 = -5.103329263796054643398e-02 */
> -        .quad 0x3FF44349F897D8E7  /* A01 = +1.266427966181760345066e+00 */
> -        .quad 0xBFE06A7B02C6D8E2  /* A02 = -5.129981092675530707226e-01 */
> -        .quad 0x3FAE3F194734F5D0  /* A03 = +5.907515520309980505687e-02 */
> -        .quad 0xBFACDE48F8A19BBB  /* A00 = -5.638340029764018351832e-02 */
> -        .quad 0x3FF49439D5466582  /* A01 = +1.286187966447272845727e+00 */
> -        .quad 0xBFE131C7C1063DDC  /* A02 = -5.373266954429101183166e-01 */
> -        .quad 0x3FB1ADEEC36AD805  /* A03 = +6.906025191241844940482e-02 */
> -        .quad 0xBFAF905D8F585680  /* A00 = -6.164829611604449866036e-02 */
> -        .quad 0x3FF4E0ED1FD27F99  /* A01 = +1.304913639360142818546e+00 */
> -        .quad 0xBFE1E7A859DC1D3D  /* A02 = -5.595285182070380836095e-01 */
> -        .quad 0x3FB3ED018E4642A1  /* A03 = +7.783517573831001679086e-02 */
> -        .quad 0xBFB11595104160BA  /* A00 = -6.673556944713512906198e-02 */
> -        .quad 0x3FF528650340490B  /* A01 = +1.322361958217302513319e+00 */
> -        .quad 0xBFE28B14B40BC974  /* A02 = -5.794776455425521000109e-01 */
> -        .quad 0x3FB5DF49F5BAF6D7  /* A03 = +8.543836831355676453281e-02 */
> -        .quad 0xBFB2513A97344BA4  /* A00 = -7.155195418844911836587e-02 */
> -        .quad 0x3FF569BA0DB5EE14  /* A01 = +1.338312200124055273420e+00 */
> -        .quad 0xBFE31B53A8B67B20  /* A02 = -5.970857901737396389308e-01 */
> -        .quad 0x3FB787F297BB0544  /* A03 = +9.191814617499455275507e-02 */
> -        .quad 0xBFB37512E848FAFA  /* A00 = -7.600515528700305112331e-02 */
> -        .quad 0x3FF5A41F33B403C8  /* A01 = +1.352568819013173495591e+00 */
> -        .quad 0xBFE397F6EA9A58A5  /* A02 = -6.123003561103997904880e-01 */
> -        .quad 0x3FB8EAA9FF25CA06  /* A03 = +9.733068923177520814782e-02 */
> -        .quad 0xBFB47B3E603AFC5D  /* A00 = -8.000554894805263217439e-02 */
> -        .quad 0x3FF5D6E3EDE40487  /* A01 = +1.364963464031718975988e+00 */
> -        .quad 0xBFE400D5BCA6D631  /* A02 = -6.251019177058819709103e-01 */
> -        .quad 0x3FBA0B830ED567FE  /* A03 = +1.017381583418739132707e-01 */
> -        .quad 0xBFB5BBFE8AC90496  /* A00 = -8.489981544791400103200e-02 */
> -        .quad 0x3FF612BA70107E95  /* A01 = +1.379572332145390989311e+00 */
> -        .quad 0xBFE477EAF1FA7693  /* A02 = -6.396383978023599814478e-01 */
> -        .quad 0x3FBB4784B7C08A95  /* A03 = +1.065600346196709652391e-01 */
> -        .quad 0xBFB6D5D940743939  /* A00 = -8.920057128509463473254e-02 */
> -        .quad 0x3FF644A8748F70CE  /* A01 = +1.391762214006166953340e+00 */
> -        .quad 0xBFE4D646AB07EA37  /* A02 = -6.511567440459832267763e-01 */
> -        .quad 0x3FBC354F4E1D5292  /* A03 = +1.101884427747086558913e-01 */
> -        .quad 0xBFB7223D19E4F3D1  /* A00 = -9.036619074045339206069e-02 */
> -        .quad 0x3FF6518FEB42B7FA  /* A01 = +1.394912642466350494175e+00 */
> -        .quad 0xBFE4ED86CB87498C  /* A02 = -6.539949393430091184598e-01 */
> -        .quad 0x3FBC6D29F28CCA9B  /* A03 = +1.110407082713131127205e-01 */
> -        .quad 0xBFB6878652FF6312  /* A00 = -8.800544287022329936754e-02 */
> -        .quad 0x3FF63948C302D040  /* A01 = +1.388985406648330922508e+00 */
> -        .quad 0xBFE4C4E2E7904E17  /* A02 = -6.490339777687407218920e-01 */
> -        .quad 0x3FBC127356CA1ABE  /* A03 = +1.096565329445224612481e-01 */
> -        .quad 0xBFB4F5D18B0C91D6  /* A00 = -8.187589306596207427980e-02 */
> -        .quad 0x3FF5FD27EB7DD0B8  /* A01 = +1.374305648697413673176e+00 */
> -        .quad 0xBFE464E01A2B2FC6  /* A02 = -6.373138915164353601739e-01 */
> -        .quad 0x3FBB460547674A30  /* A03 = +1.065371798825160976065e-01 */
> -        .quad 0xBFB26642FA16A685  /* A00 = -7.187288861919156890412e-02 */
> -        .quad 0x3FF59F9BEDE1C95A  /* A01 = +1.351467065073470141812e+00 */
> -        .quad 0xBFE3D67920C8FBEA  /* A02 = -6.199308052381387046381e-01 */
> -        .quad 0x3FBA24F6A8D3CBC1  /* A03 = +1.021265184570401413078e-01 */
> -        .quad 0xBFADB5294794F097  /* A00 = -5.802277563859197656582e-02 */
> -        .quad 0x3FF523EA7B9CF453  /* A01 = +1.321268542159732772845e+00 */
> -        .quad 0xBFE322A8B55E35DB  /* A02 = -5.979808370918208160205e-01 */
> -        .quad 0x3FB8C8673B1B3E37  /* A03 = +9.680791085269722928697e-02 */
> -        .quad 0xBFA4B7D661965C6A  /* A00 = -4.046506825687219699450e-02 */
> -        .quad 0x3FF48DE3E2CE3122  /* A01 = +1.284641157110919085227e+00 */
> -        .quad 0xBFE251FED1A7F445  /* A02 = -5.725092024655472622285e-01 */
> -        .quad 0x3FB745699FCABDB9  /* A03 = +9.090290213747821701507e-02 */
> -        .quad 0xBF93E60456E4EE1D  /* A00 = -1.943213253365004902773e-02 */
> -        .quad 0x3FF3E1A14E628A59  /* A01 = +1.242585474196536532432e+00 */
> -        .quad 0xBFE16C5AB660E876  /* A02 = -5.444768488007543094653e-01 */
> -        .quad 0x3FB5AD33AA8C188F  /* A03 = +8.467410005332197397987e-02 */
> -        .quad 0x3F738C17C47C7961  /* A00 = +4.772274820224659853951e-03 */
> -        .quad 0x3FF3234DDE3BD146  /* A01 = +1.196119182682268355933e+00 */
> -        .quad 0xBFE078C0D77A9D3B  /* A02 = -5.147403915952176722826e-01 */
> -        .quad 0x3FB40D74B3E276B8  /* A03 = +7.833032027925923568290e-02 */
> -        .quad 0x3FA0474BECC689C7  /* A00 = +3.179394975019849550746e-02 */
> -        .quad 0x3FF256FB4FA7D18A  /* A01 = +1.146235762743432307076e+00 */
> -        .quad 0xBFDEFA8E3FB285E2  /* A02 = -4.840427038235174395098e-01 */
> -        .quad 0x3FB270C007493D59  /* A03 = +7.203293016322244446403e-02 */
> -        .quad 0x3FAF5BD51E479BDC  /* A00 = +6.124750132203590768931e-02 */
> -        .quad 0x3FF18081D0B53BC5  /* A01 = +1.093873801484492647162e+00 */
> -        .quad 0xBFDCFE2439BD0C03  /* A02 = -4.530115665294831006626e-01 */
> -        .quad 0x3FB0DEFE5A45AFDD  /* A03 = +6.590261176978580437424e-02 */
> -        .quad 0x3FB7BD5D2806EA26  /* A00 = +9.273321368429118805032e-02 */
> -        .quad 0x3FF0A369E35B4440  /* A01 = +1.039895904647224256223e+00 */
> -        .quad 0xBFDB04BC5C9951E7  /* A02 = -4.221640495573226181669e-01 */
> -        .quad 0x3FAEBBBAA9D6DEEF  /* A03 = +6.002600978120919278380e-02 */
> -        .quad 0x3FC01BE411098DBC  /* A00 = +1.258511622610124502941e-01 */
> -        .quad 0x3FEF85BDABC031C1  /* A01 = +9.850757936961188621083e-01 */
> -        .quad 0xBFD91521375097C2  /* A02 = -3.919146576102968682065e-01 */
> -        .quad 0x3FABE26F0086D982  /* A03 = +5.446192628317005068883e-02 */
> -        .quad 0x3FC481D7FF5776B9  /* A00 = +1.602125164781023347604e-01 */
> -        .quad 0x3FEDC3506C1E7218  /* A01 = +9.300920592973538347792e-01 */
> -        .quad 0xBFD7349A88DA7D4F  /* A02 = -3.625856720409119104964e-01 */
> -        .quad 0x3FA936E2DFF8E2AE  /* A03 = +4.924687370334389358018e-02 */
> -        .quad 0x3FC90471F96FA27A  /* A00 = +1.954481571149420671141e-01 */
> -        .quad 0x3FEC0451601987A2  /* A01 = +8.755270840595026360376e-01 */
> -        .quad 0xBFD5671CD4B898DC  /* A02 = -3.344184949259110251063e-01 */
> -        .quad 0x3FA6BB9594603B67  /* A03 = +4.439990459660841243261e-02 */
> -        .quad 0x3FCFD8ADB9ED944C  /* A00 = +2.488000066615846384011e-01 */
> -        .quad 0x3FE978C073F6809A  /* A01 = +7.959902062321078108909e-01 */
> -        .quad 0xBFD2DF7E00BCD5A9  /* A02 = -2.948908812716931060471e-01 */
> -        .quad 0x3FA3614033D490B2  /* A03 = +3.785133965200894456959e-02 */
> -        .quad 0x3FD4846A12AFE5A0  /* A00 = +3.205819303981005674586e-01 */
> -        .quad 0x3FE63A1147D40472  /* A01 = +6.945883181471244061100e-01 */
> -        .quad 0xBFCFA2268AD34450  /* A02 = -2.471359422548027318101e-01 */
> -        .quad 0x3F9F150201D9FFE0  /* A03 = +3.035357605267552383310e-02 */
> -        .quad 0x3FD9018641F82BEB  /* A00 = +3.907180446846598154131e-01 */
> -        .quad 0x3FE33B7C220FFBDC  /* A01 = +6.010113396913498995389e-01 */
> -        .quad 0xBFCA4E4187E29C86  /* A02 = -2.055131829740483584423e-01 */
> -        .quad 0x3F98C30CED19F8F4  /* A03 = +2.418155858185229434287e-02 */
> -        .quad 0x3FDD4B8255BEB078  /* A00 = +4.577337109901757905561e-01 */
> -        .quad 0x3FE0858B19D3A49B  /* A01 = +5.163016800335243905451e-01 */
> -        .quad 0xBFC5BC929EACE564  /* A02 = -1.698172831327539045176e-01 */
> -        .quad 0x3F93A083CE57DE2B  /* A03 = +1.916700312537337677621e-02 */
> -        .quad 0x3FE0A8E5E039295C  /* A00 = +5.206174258576470315063e-01 */
> -        .quad 0x3FDC35E1234583FE  /* A01 = +4.407885403107342225937e-01 */
> -        .quad 0xBFC1DE034E31AEB9  /* A02 = -1.395877963835710222629e-01 */
> -        .quad 0x3F8EFDEBB3471BDC  /* A03 = +1.513275280821162888101e-02 */
> -        .quad 0x3FE2851B603CB2A5  /* A00 = +5.787484054213406503564e-01 */
> -        .quad 0x3FD7F4A44ABBB286  /* A01 = +3.743067483726821853551e-01 */
> -        .quad 0xBFBD3EEB67087DE7  /* A02 = -1.142413260026767657385e-01 */
> -        .quad 0x3F8864F38329E8BD  /* A03 = +1.191129917173260922836e-02 */
> -        .quad 0x3FE437DBE3C34AC1  /* A00 = +6.318187187665317283702e-01 */
> -        .quad 0x3FD43F6F789441B5  /* A01 = +3.163717916040938438194e-01 */
> -        .quad 0xBFB7D92E7901B9A4  /* A02 = -9.315767721429907277653e-02 */
> -        .quad 0x3F8327ED342308E1  /* A03 = +9.353497651663324544136e-03 */
> -        .quad 0x3FE5C0977766D55C  /* A00 = +6.797597248138731451661e-01 */
> -        .quad 0x3FD10B42A764D8F9  /* A01 = +2.663122782427219115142e-01 */
> -        .quad 0xBFB3633351D3D70F  /* A02 = -7.573242900602060456716e-02 */
> -        .quad 0x3F7E079E30FF899C  /* A03 = +7.331483779099558922843e-03 */
> -        .quad 0x3FE7202CE08A88C4  /* A00 = +7.226776490754436288455e-01 */
> -        .quad 0x3FCC973EB5662B01  /* A01 = +2.233656297433626314319e-01 */
> -        .quad 0xBFAF70A455F9920B  /* A02 = -6.140626477716545211782e-02 */
> -        .quad 0x3F77812411CE99B6  /* A03 = +5.738392731393584730859e-03 */
> -        .quad 0x3FE85879424095B1  /* A00 = +7.608000082006382003286e-01 */
> -        .quad 0x3FC7E73BD1674D84  /* A01 = +1.867441914060742336190e-01 */
> -        .quad 0xBFA96F84E4BF333B  /* A02 = -4.967894832916504993525e-02 */
> -        .quad 0x3F72606DDCA6E117  /* A03 = +4.486493251924870105662e-03 */
> -        .quad 0x3FE96BFE4957F4DD  /* A00 = +7.944327766887472330737e-01 */
> -        .quad 0x3FC3ED4780D25478  /* A01 = +1.556786898624158421711e-01 */
> -        .quad 0xBFA489C5F9A56B58  /* A02 = -4.011362717093075458408e-02 */
> -        .quad 0x3F6CB5DC17E9AD2A  /* A03 = +3.504686231556104931972e-03 */
> -        .quad 0x3FEA5D9CB2F41234  /* A00 = +8.239272589858672724006e-01 */
> -        .quad 0x3FC091A758374DCF  /* A01 = +1.294449978582705440555e-01 */
> -        .quad 0xBFA08E436D4B5CE0  /* A02 = -3.233538350257858517978e-02 */
> -        .quad 0x3F666997AD53E6B7  /* A03 = +2.735897297154145629133e-03 */
> -        .quad 0x3FEB3060342CB850  /* A00 = +8.496552485501158713532e-01 */
> -        .quad 0x3FBB7D30BBC7DC1B  /* A01 = +1.073790033768634993860e-01 */
> -        .quad 0xBF9AA6BA3443D9E3  /* A02 = -2.602663940430173170060e-02 */
> -        .quad 0x3F617CA764B7850B  /* A03 = +2.134634914668814050648e-03 */
> -        .quad 0x3FEBE759A6A0C7B8  /* A00 = +8.719909910635044170135e-01 */
> -        .quad 0x3FB6C10DE6A703FF  /* A01 = +8.888327485239243264115e-02 */
> -        .quad 0xBF956C566D8BE1F6  /* A02 = -2.092108768099084498138e-02 */
> -        .quad 0x3F5B46D1A4A59CF8  /* A03 = +1.664833764687232917079e-03 */
> -        .quad 0x3FEC858494887A04  /* A00 = +8.912985707318630268503e-01 */
> -        .quad 0x3FB2CC31F543394D  /* A01 = +7.342827070099140762682e-02 */
> -        .quad 0xBF9133477FF69137  /* A02 = -1.679717749142747504343e-02 */
> -        .quad 0x3F5544482FBB4DA5  /* A03 = +1.298017973501022466823e-03 */
> -        .quad 0x3FED0DB59D0E32E9  /* A00 = +9.079235141267335551518e-01 */
> -        .quad 0x3FAF006BAFFC6EF4  /* A01 = +6.055008433597022787787e-02 */
> -        .quad 0xBF8B97146FA2B97A  /* A02 = -1.347175565419144252499e-02 */
> -        .quad 0x3F5093B01F4CDC69  /* A03 = +1.011774057770665211434e-03 */
> -        .quad 0x3FEDB487C3EC457C  /* A00 = +9.282873942012623835751e-01 */
> -        .quad 0x3FA7390C09D0BD1D  /* A01 = +4.535710925881118044112e-02 */
> -        .quad 0xBF83D9F7C3181106  /* A02 = -9.693084374710735778846e-03 */
> -        .quad 0x3F46E34A0A3C0E64  /* A03 = +6.984817050299072134500e-04 */
> -        .quad 0x3FEE5FFCB4E6EB00  /* A00 = +9.492171796076434020506e-01 */
> -        .quad 0x3F9F4913ED00AADF  /* A01 = +3.055220731782070861526e-02 */
> -        .quad 0xBF79670BD0E59B5C  /* A02 = -6.201788097633133961528e-03 */
> -        .quad 0x3F3BC998EBCAF96D  /* A03 = +4.240034429975534616304e-04 */
> -        .quad 0x3FEEDBA41E9542FE  /* A00 = +9.643116566968215064293e-01 */
> -        .quad 0x3F94F5DD18D9C24D  /* A01 = +2.046914543319848858727e-02 */
> -        .quad 0xBF7034896AA122B9  /* A02 = -3.956352980886528904192e-03 */
> -        .quad 0x3F30DCCB47810B39  /* A03 = +2.573009765038273091199e-04 */
> -        .quad 0x3FEF33F2882520ED  /* A00 = +9.750912341196716903724e-01 */
> -        .quad 0x3F8BF37F2CF553FF  /* A01 = +1.364802699996836392315e-02 */
> -        .quad 0xBF649F6F05A69619  /* A02 = -2.517430152880317534986e-03 */
> -        .quad 0x3F247623C950AAC9  /* A03 = +1.561087307505231250044e-04 */
> -        .quad 0x3FEF727757751741  /* A00 = +9.827229221489021115943e-01 */
> -        .quad 0x3F828E67912C4400  /* A01 = +9.060677640748693306705e-03 */
> -        .quad 0xBF5A2F51A806CC2C  /* A02 = -1.598195784123355826789e-03 */
> -        .quad 0x3F18D35D7687E613  /* A03 = +9.470231965016282719549e-05 */
> -        .quad 0x3FEF9E6325C5942A  /* A00 = +9.880843866091073568469e-01 */
> -        .quad 0x3F788AB117618F76  /* A01 = +5.991641772286606867914e-03 */
> -        .quad 0xBF5096EAB0B1EA89  /* A02 = -1.012543859160305046233e-03 */
> -        .quad 0x3F0E1E50EC4435AB  /* A03 = +5.744633156910412119652e-05 */
> -        .quad 0x3FEFBD0784049369  /* A00 = +9.918248728250605994461e-01 */
> -        .quad 0x3F702BBD8294035F  /* A01 = +3.947963975634432264028e-03 */
> -        .quad 0xBF44FB55E0F00593  /* A02 = -6.403130845457509273330e-04 */
> -        .quad 0x3F0244DCD723230A  /* A03 = +3.484534217219031730379e-05 */
> -        .quad 0x3FEFD245E2366A43  /* A00 = +9.944180887426415926811e-01 */
> -        .quad 0x3F653D82EC088433  /* A01 = +2.592807490387838333795e-03 */
> -        .quad 0xBF3A7DF75E013CB8  /* A02 = -4.042366908878036561859e-04 */
> -        .quad 0x3EF6298E69F991CD  /* A03 = +2.113564425911141559972e-05 */
> -        .quad 0x3FEFE0EAA508BC69  /* A00 = +9.962056372950317539861e-01 */
> -        .quad 0x3F5BD0771AF3FDDA  /* A01 = +1.697651208644282514598e-03 */
> -        .quad 0xBF30B2E1254DE571  /* A02 = -2.548026725928887099328e-04 */
> -        .quad 0x3EEAE28B70EC0256  /* A03 = +1.281973848454955042307e-05 */
> -        .quad 0x3FEFEAF5303D7F96  /* A00 = +9.974313680831865536192e-01 */
> -        .quad 0x3F5229111365657E  /* A01 = +1.108423877289460134782e-03 */
> -        .quad 0xBF250572D04DFE66  /* A02 = -1.603796628408704519168e-04 */
> -        .quad 0x3EE04E89BB57C981  /* A03 = +7.775682983689149966743e-06 */
> -        .quad 0x3FEFF1CF52F1CF44  /* A00 = +9.982678051005469122003e-01 */
> -        .quad 0x3F47A71316147CEB  /* A01 = +7.218211359577819110842e-04 */
> -        .quad 0xBF1A6D7604055719  /* A02 = -1.008132248946049582547e-04 */
> -        .quad 0x3ED3C8047586A85C  /* A03 = +4.716233739913014633626e-06 */
> -        .quad 0x3FEFF6770369EF69  /* A00 = +9.988360468555416149528e-01 */
> -        .quad 0x3F3EBB261180FBF0  /* A01 = +4.689186039321105101130e-04 */
> -        .quad 0xBF1097754FE19D7F  /* A02 = -6.329206004950480057066e-05 */
> -        .quad 0x3EC7FEFF83BCA0A7  /* A03 = +2.860556404988488738366e-06 */
> -        .quad 0x3FEFF99D42371AC4  /* A00 = +9.992204945818561334647e-01 */
> -        .quad 0x3F33EB2AEC271F59  /* A01 = +3.039340773764907474054e-04 */
> -        .quad 0xBF04CF18E0FC0D79  /* A02 = -3.968996690952969588805e-05 */
> -        .quad 0x3EBD1BDBD6019BE9  /* A03 = +1.735021065507727833886e-06 */
> -        .quad 0x3FEFFBBCA32B0D91  /* A00 = +9.994795977476532700123e-01 */
> -        .quad 0x3F29C41E1615110A  /* A01 = +1.965796209707565346710e-04 */
> -        .quad 0xBEFA11F93D9DCB5A  /* A02 = -2.486248909101414873235e-05 */
> -        .quad 0x3EB1A7CA4546F7A7  /* A03 = +1.052345642723709228769e-06 */
> -        .quad 0x3FEFFD298B8E8DE2  /* A00 = +9.996535993308806045121e-01 */
> -        .quad 0x3F20A1C42D523C5B  /* A01 = +1.268913244172078754520e-04 */
> -        .quad 0xBEF0507A364AFAE4  /* A02 = -1.555859070622834605755e-05 */
> -        .quad 0x3EA56ACA17E7CDF4  /* A03 = +6.382806956848098872313e-07 */
> -        .quad 0x3FEFFE1DC82BA5A3  /* A00 = +9.997700604991915929176e-01 */
> -        .quad 0x3F156E73B90F1769  /* A01 = +8.175450626798714452801e-05 */
> -        .quad 0xBEE4663579D0A09F  /* A02 = -9.727122057226747625365e-06 */
> -        .quad 0x3E99FAF6FEC5D4C1  /* A03 = +3.871371052824002996020e-07 */
> -        .quad 0x3FEFFEF8D0BB5E81  /* A00 = +9.998745037837154514548e-01 */
> -        .quad 0x3F06686DA18D39C3  /* A01 = +4.273972098777251447726e-05 */
> -        .quad 0xBED46BC298073E90  /* A02 = -4.868731025855742842491e-06 */
> -        .quad 0x3E88E42286B9D0FD  /* A03 = +1.854535328530838170114e-07 */
> -        .quad 0x3FEFFF8DBC68DDC7  /* A00 = +9.999455146670975791423e-01 */
> -        .quad 0x3EF26B2953A80AF0  /* A01 = +1.756534514108903368909e-05 */
> -        .quad 0xBEBFC4472D580F83  /* A02 = -1.893443529411295465239e-06 */
> -        .quad 0x3E72505B4553D19F  /* A03 = +6.822456673547912277047e-08 */
> -        .quad 0x3FEFFFCED1276609  /* A00 = +9.999765477215883935358e-01 */
> -        .quad 0x3EDE1A94C7CC58F5  /* A01 = +7.177313020153979672606e-06 */
> -        .quad 0xBEA8A2C988744E57  /* A02 = -7.342066660497443762363e-07 */
> -        .quad 0x3E5AF30036BBBAF4  /* A03 = +2.509841882843541084885e-08 */
> -        .quad 0x3FEFFFEAFE70FCFC  /* A00 = +9.999899835164849370983e-01 */
> -        .quad 0x3EC879175E3549F5  /* A01 = +2.917410471128503564412e-06 */
> -        .quad 0xBE930E36677D1813  /* A02 = -2.839493400307523115929e-07 */
> -        .quad 0x3E43D4005B42D48F  /* A03 = +9.233192745401904898013e-09 */
> -        .quad 0x3ff0000000000000
> -        .quad 0x0000000000000000
> -        .quad 0x0000000000000000
> -        .quad 0x0000000000000000
> -        .align 32
> -        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000           /* _sSignMask        */
> -        .align 32
> -        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff           /* _sAbsMask         */
> -        .align 32
> -        .long 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000           /* _iExpMantMask     */
> -        .align 32
> -        .long 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000           /* _iExpMask         */
> -        .align 32
> -        .long 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000           /* _iMinIdxOfsMask   */
> -        .align 32
> -        .long 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000           /* _iMaxIdxMask      */
> -        .align 32
> -        .type  __svml_stanh_data_internal,@object
> -        .size  __svml_stanh_data_internal,.-__svml_stanh_data_internal
> +       vcvtps2pd %xmm4, %ymm5
> +
> +       vextractf128 $1, %ymm4, %xmm4
> +       vcvtps2pd %xmm4, %ymm4
> +
> +       vmovdqu 16(%rcx, %rax), %xmm2
> +       vinsertf128 $1, 16(%r11, %rax), %ymm2, %ymm2
> +
> +       vfmadd213pd %ymm3, %ymm5, %ymm1
> +
> +       vmovupd 16(%rdx, %rax), %xmm3
> +       vinsertf128 $1, 16(%r10, %rax), %ymm3, %ymm3
> +
> +       vunpcklpd %ymm3, %ymm2, %ymm10
> +       vunpckhpd %ymm3, %ymm2, %ymm2
> +
> +       vfmadd213pd %ymm10, %ymm4, %ymm2
> +       vfmadd213pd %ymm6, %ymm4, %ymm2
> +       vfmadd213pd %ymm7, %ymm4, %ymm2
> +       vcvtpd2ps %ymm2, %xmm2
> +
> +       vmovdqu (%r9, %rax), %xmm7
> +       vinsertf128 $1, (%rdi, %rax), %ymm7, %ymm7
> +
> +       vmovupd (%r8, %rax), %xmm3
> +       vinsertf128 $1, (%rsi, %rax), %ymm3, %ymm3
> +
> +       vunpckhpd %ymm3, %ymm7, %ymm4
> +       vunpcklpd %ymm3, %ymm7, %ymm7
> +
> +       vfmadd213pd %ymm4, %ymm5, %ymm1
> +       vfmadd213pd %ymm7, %ymm5, %ymm1
> +
> +
> +       vcvtpd2ps %ymm1, %xmm1
> +       vinsertf128 $1, %xmm2, %ymm1, %ymm1
> +
> +       vmovmskps %ymm15, %edx
> +       vandnps %ymm0, %ymm11, %ymm2
> +       testl   %edx, %edx
> +       /* Go to special inputs processing branch.  */
> +       jne     L(SPECIAL_VALUES_BRANCH)
> +       /* Wait until after the branch to write over ymm0.  */
> +       vorps   %ymm2, %ymm1, %ymm0
> +       /* No stack restoration on the fastpath.  */
> +       ret
> +
> +
> +L(SPECIAL_VALUES_BRANCH):
> +       pushq   %rbp
> +       /* Need to callee save registers to preserve state across tanhf calls.
> +        */
> +       pushq   %r12
> +       pushq   %r13
> +       movq    %rsp, %rbp
> +
> +       /* Align stack and make room for 2x ymm vectors.  */
> +       andq    $-32, %rsp
> +       addq    $-64, %rsp
> +
> +       /* Save all already computed results.  */
> +       vorps   %ymm2, %ymm1, %ymm1
> +       vmovups %ymm1, (%rsp)
> +       /* Save original input (ymm0 unchanged up to this point).  */
> +       vmovups %ymm0, 32(%rsp)
> +
> +       vzeroupper
> +
> +       /* edx has 1s where there was a special value that needs to be handled
> +          by a tanhf call.  */
> +       movl    %edx, %r13d
> +L(SPECIAL_VALUES_LOOP):
> +       /* use r12 as index for special value that is saved across calls to
> +          tanhf. We technically don't need a callee save register here as offset
> +          to rsp is always [0, 28] so we can restore rsp by realigning to 64.
> +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> +          in the loop.  */
> +       xorl    %r12d, %r12d
> +       tzcntl  %r13d, %r12d
> +
> +       /* Scalar math function call to process special input.  */
> +       movss   32(%rsp, %r12, 4), %xmm0
> +       call    tanhf@PLT
> +       /* No good way to avoid the store-forwarding fault this will cause on
> +          return. `lfence` avoids the SF fault but at greater cost as it
> +          serializes stack/callee save restoration.  */
> +       movss   %xmm0, (%rsp, %r12, 4)
> +
> +       blsr    %r13d, %r13d
> +       jnz     L(SPECIAL_VALUES_LOOP)
> +
> +       /* All results have been written to (%rsp).  */
> +       vmovups (%rsp), %ymm0
> +       movq    %rbp, %rsp
> +       popq    %r13
> +       popq    %r12
> +       popq    %rbp
> +       ret
> +END(_ZGVdN8v_tanhf_avx2)
> --
> 2.25.1
>
Noah Goldstein Feb. 1, 2022, 8:20 p.m. UTC | #2
On Tue, Feb 1, 2022 at 2:03 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> Hi Noah,
>
> We would like to get this patch, but it's too late for 2.35.
>
> This patch is too big, can you please break this patch into multiple
> smaller patches?

Yeah, I'll split by file.
>
> Also, it seems like this patch is incomplete. I got a build error on
> the glibc master.

My fault, I separated the rodata for avx2/sse4 into a single file
so that the two implementations could share the lookup table.

Forgot to commit it :/

Will fix in V2.
>
> ./sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S:77:33: fatal
> error: svml_s_tanhf_rodata.S: No such file or directory
>  #include "svml_s_tanhf_rodata.S"
>                                  ^
> compilation terminated.
> ../sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S:74:33: fatal
> error: svml_s_tanhf_rodata.S: No such file or directory
>  #include "svml_s_tanhf_rodata.S"
>                                  ^
> compilation terminated.
>
> Thanks,
> Sunil
>
>
>
>
>
>
> On Sat, Jan 29, 2022 at 8:37 PM Noah Goldstein via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > No bug.
> >
> > Optimizations are:
> >     1. Reduce code size
> >         avx512: -56 bytes
> >         avx2:   -70 bytes
> >         sse4:   -106 bytes
> >     2. Reduce rodata size
> >         avx512: -448 bytes
> >         avx2:   -32 bytes
> >         sse4:   -4k+ (shares rodata with avx2)
> >     3. Remove register save/restores and stack adjustment from the
> >        fast path.
> >     4. Slightly better instruction selection where possible.
> >
> > This results in roughly a 15% performance improvement for all
> > functions.
> >
> > Results from geomean of 40 benchtest runs:
> >        Function, New Time, Old Time, New / Old
> >  _ZGVbN4v_tanhf,     3.28,    3.852,     0.852
> >  _ZGVcN8v_tanhf,    3.556,    4.192,     0.848
> >  _ZGVdN8v_tanhf,     2.13,    2.486,     0.857
> > _ZGVeN16v_tanhf,    0.658,    0.762,     0.864
> > ---
> >  .../multiarch/svml_s_tanhf16_core_avx512.S    | 585 +++++------
> >  .../fpu/multiarch/svml_s_tanhf4_core_sse4.S   | 871 +++--------------
> >  .../fpu/multiarch/svml_s_tanhf8_core_avx2.S   | 908 +++---------------
> >  3 files changed, 581 insertions(+), 1783 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > index 8954a5f658..6a2f0c1392 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > @@ -70,312 +70,323 @@
> >   *
> >   */
> >
> > -/* Offsets for data table __svml_stanh_data_internal
> > - */
> > -#define _sC                            0
> > -#define _sP0                           128
> > -#define _sP2                           256
> > -#define _sP3                           384
> > -#define _sP4                           512
> > -#define _sP5                           640
> > -#define _sP6                           768
> > -#define _sP7                           896
> > -#define _iExpMantMask_UISA             1024
> > -#define _iMinIdxOfsMask_UISA           1088
> > -#define _iMaxIdxMask_UISA              1152
> > -#define _sSignMask                     1216
> > -#define _sAbsMask                      1280
> > -#define _iExpMantMask                  1344
> > -#define _iExpMask                      1408
> > -#define _iMinIdxOfsMask                1472
> > -#define _iMaxIdxMask                   1536
> > -
> >  #include <sysdep.h>
> >
> > +#define TANHF_DATA(offset)     ((offset) + __svml_stanh_data_internal)
> > +
> > +/* Offsets for data table __svml_stanh_data_internal.  */
> > +#define _iExpMantMask_UISA     0
> > +#define _iMinIdxOfsMask_UISA   4
> > +#define _iMaxIdxMask_UISA      8
> > +#define _iExpMask      12
> > +#define _sSignMask     64
> > +#define _sC_lo 128
> > +#define _sC_hi 192
> > +#define _sP7_lo        256
> > +#define _sP7_hi        320
> > +#define _sP6_lo        384
> > +#define _sP6_hi        448
> > +#define _sP5_lo        512
> > +#define _sP5_hi        576
> > +#define _sP4_lo        640
> > +#define _sP4_hi        704
> > +#define _sP3_lo        768
> > +#define _sP3_hi        832
> > +#define _sP2_lo        896
> > +#define _sP2_hi        960
> > +#define _sP0_lo        1024
> > +#define _sP0_hi        1088
> > +
> >          .text
> >         .section .text.exex512,"ax",@progbits
> >  ENTRY(_ZGVeN16v_tanhf_skx)
> > -        pushq     %rbp
> > -        cfi_def_cfa_offset(16)
> > -        movq      %rsp, %rbp
> > -        cfi_def_cfa(6, 16)
> > -        cfi_offset(6, -16)
> > -        andq      $-64, %rsp
> > -        subq      $192, %rsp
> > -        vmovaps   %zmm0, %zmm1
> > -        vmovups   __svml_stanh_data_internal(%rip), %zmm9
> > -        vmovups   _sP6+__svml_stanh_data_internal(%rip), %zmm11
> > -        vmovups   _sP5+__svml_stanh_data_internal(%rip), %zmm12
> > -        vmovups   _sP4+__svml_stanh_data_internal(%rip), %zmm13
> > -        vmovups   _sP3+__svml_stanh_data_internal(%rip), %zmm14
> > -        vmovups   _sP2+__svml_stanh_data_internal(%rip), %zmm15
> > -        vpternlogd $255, %zmm2, %zmm2, %zmm2
> > -        vandps    _sAbsMask+__svml_stanh_data_internal(%rip), %zmm1, %zmm8
> > -        vandps    _sSignMask+__svml_stanh_data_internal(%rip), %zmm1, %zmm0
> > -
> > -/* Here huge arguments, INF and NaNs are filtered out to callout. */
> > -        vpandd    _iExpMantMask_UISA+__svml_stanh_data_internal(%rip), %zmm1, %zmm3
> > -        vpsubd    _iMinIdxOfsMask_UISA+__svml_stanh_data_internal(%rip), %zmm3, %zmm4
> > -        vpcmpd    $2, _iExpMask+__svml_stanh_data_internal(%rip), %zmm3, %k1
> > +       /* Here huge arguments, INF and NaNs are filtered out to callout.  */
> > +       vpandd  TANHF_DATA(_iExpMantMask_UISA)(%rip) {1to16}, %zmm0, %zmm1
> > +       vpsubd  TANHF_DATA(_iMinIdxOfsMask_UISA)(%rip) {1to16}, %zmm1, %zmm2
> >
> > -/*
> > - *  small table specific variables *
> > - *  Constant loading
> > - */
> > -        vpxord    %zmm5, %zmm5, %zmm5
> > -
> > -/* if VMIN, VMAX is defined for I type */
> > -        vpmaxsd   %zmm5, %zmm4, %zmm6
> > -        vpminsd   _iMaxIdxMask_UISA+__svml_stanh_data_internal(%rip), %zmm6, %zmm7
> > -        vpsrld    $21, %zmm7, %zmm10
> > -        vmovups   _sP7+__svml_stanh_data_internal(%rip), %zmm4
> > -        vpermt2ps _sC+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm9
> > -        vpermt2ps _sP6+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm11
> > -        vpermt2ps _sP7+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm4
> > -        vpermt2ps _sP5+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm12
> > -        vpermt2ps _sP4+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm13
> > -        vpermt2ps _sP3+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm14
> > -        vpermt2ps _sP2+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm15
> > -        vpandnd   %zmm3, %zmm3, %zmm2{%k1}
> > -        vptestmd  %zmm2, %zmm2, %k0
> > -        vmovups   _sP0+__svml_stanh_data_internal(%rip), %zmm3
> > -        vsubps    {rn-sae}, %zmm9, %zmm8, %zmm2
> > -        kmovw     %k0, %edx
> > -        vfmadd213ps {rn-sae}, %zmm11, %zmm2, %zmm4
> > -        vpermt2ps _sP0+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm3
> > -        vfmadd213ps {rn-sae}, %zmm12, %zmm2, %zmm4
> > -        vfmadd213ps {rn-sae}, %zmm13, %zmm2, %zmm4
> > -        vfmadd213ps {rn-sae}, %zmm14, %zmm2, %zmm4
> > -        vfmadd213ps {rn-sae}, %zmm15, %zmm2, %zmm4
> > -        vfmadd213ps {rn-sae}, %zmm3, %zmm2, %zmm4
> > -        vorps     %zmm0, %zmm4, %zmm0
> > -        testl     %edx, %edx
> > -
> > -/* Go to special inputs processing branch */
> > -        jne       L(SPECIAL_VALUES_BRANCH)
> > -                                # LOE rbx r12 r13 r14 r15 edx zmm0 zmm1
> > -
> > -/* Restore registers
> > - * and exit the function
> > - */
> > +       /* Selection arguments between [0, 0x03e00000] into zmm3.  */
> > +       vpxord  %zmm3, %zmm3, %zmm3
> > +       vpmaxsd %zmm3, %zmm2, %zmm3
> > +       vpminsd TANHF_DATA(_iMaxIdxMask_UISA)(%rip) {1to16}, %zmm3, %zmm3
> >
> > -L(EXIT):
> > -        movq      %rbp, %rsp
> > -        popq      %rbp
> > -        cfi_def_cfa(7, 8)
> > -        cfi_restore(6)
> > -        ret
> > -        cfi_def_cfa(6, 16)
> > -        cfi_offset(6, -16)
> > -
> > -/* Branch to process
> > - * special inputs
> > - */
> > +       /* Setup permute indices in zmm3.  */
> > +       vpsrld  $21, %zmm3, %zmm3
> > +
> > +       /* Store if there are any special cases in k1.  */
> > +       vpcmpd  $6, TANHF_DATA(_iExpMask)(%rip) {1to16}, %zmm1, %k1
> > +
> > +
> > +       /* Store absolute values of inputs in zmm1.  */
> > +       vmovaps TANHF_DATA(_sSignMask)(%rip), %zmm4
> > +       vandnps %zmm0, %zmm4, %zmm1
> > +
> > +       vmovaps TANHF_DATA(_sC_lo)(%rip), %zmm5
> > +       vpermt2ps TANHF_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> > +       vsubps  {rn-sae}, %zmm5, %zmm1, %zmm1
> > +
> > +       vmovaps TANHF_DATA(_sP7_lo)(%rip), %zmm2
> > +       vpermt2ps TANHF_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
> >
> > +       vmovaps TANHF_DATA(_sP6_lo)(%rip), %zmm5
> > +       vpermt2ps TANHF_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> > +
> > +       vmovaps TANHF_DATA(_sP5_lo)(%rip), %zmm6
> > +       vpermt2ps TANHF_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> > +
> > +       vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm2
> > +       vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm2
> > +
> > +       vmovaps TANHF_DATA(_sP4_lo)(%rip), %zmm7
> > +       vpermt2ps TANHF_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> > +
> > +       vmovaps TANHF_DATA(_sP3_lo)(%rip), %zmm8
> > +       vpermt2ps TANHF_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> > +
> > +       vfmadd213ps {rn-sae}, %zmm7, %zmm1, %zmm2
> > +       vfmadd213ps {rn-sae}, %zmm8, %zmm1, %zmm2
> > +
> > +       vmovaps TANHF_DATA(_sP2_lo)(%rip), %zmm9
> > +       vpermt2ps TANHF_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> > +
> > +       vmovaps TANHF_DATA(_sP0_lo)(%rip), %zmm10
> > +       vpermt2ps TANHF_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> > +
> > +       vfmadd213ps {rn-sae}, %zmm9, %zmm1, %zmm2
> > +       vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm2
> > +
> > +       kmovw   %k1, %edx
> > +       testl   %edx, %edx
> > +
> > +       /* Go to special inputs processing branch.  */
> > +       jne     L(SPECIAL_VALUES_BRANCH)
> > +       /* Wait until after the branch to write over zmm0.  */
> > +       vpternlogd $0xec, %zmm4, %zmm2, %zmm0
> > +
> > +       /* No stack restoration on the fastpath.  */
> > +       ret
> > +
> > +       /* Branch to process special inputs.  */
> >  L(SPECIAL_VALUES_BRANCH):
> > -        vmovups   %zmm1, 64(%rsp)
> > -        vmovups   %zmm0, 128(%rsp)
> > -                                # LOE rbx r12 r13 r14 r15 edx zmm0
> > -
> > -        xorl      %eax, %eax
> > -                                # LOE rbx r12 r13 r14 r15 eax edx
> > -
> > -        vzeroupper
> > -        movq      %r12, 16(%rsp)
> > -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> > -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> > -        movl      %eax, %r12d
> > -        movq      %r13, 8(%rsp)
> > -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> > -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> > -        movl      %edx, %r13d
> > -        movq      %r14, (%rsp)
> > -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> > -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> > -                                # LOE rbx r15 r12d r13d
> > -
> > -/* Range mask
> > - * bits check
> > - */
> > +       pushq   %rbp
> > +       /* Need to callee save registers to preserve state across tanhf calls.
> > +        */
> > +       pushq   %r13
> > +       pushq   %r12
> > +       movq    %rsp, %rbp
> >
> > -L(RANGEMASK_CHECK):
> > -        btl       %r12d, %r13d
> > +       /* Align stack and make room for 2x zmm vectors.  */
> > +       andq    $-64, %rsp
> > +       addq    $-128, %rsp
> >
> > -/* Call scalar math function */
> > -        jc        L(SCALAR_MATH_CALL)
> > -                                # LOE rbx r15 r12d r13d
> > +       /* Save all already computed results.  */
> > +       vpternlogd $0xec, %zmm4, %zmm2, %zmm2
> > +       vmovaps %zmm2, (%rsp)
> > +       /* Save original input (zmm0 unchanged up to this point).  */
> > +       vmovaps %zmm0, 64(%rsp)
> >
> > -/* Special inputs
> > - * processing loop
> > - */
> > +       vzeroupper
> >
> > +       /* edx has 1s where there was a special value that needs to be handled
> > +          by a tanhf call.  */
> > +       movl    %edx, %r13d
> >  L(SPECIAL_VALUES_LOOP):
> > -        incl      %r12d
> > -        cmpl      $16, %r12d
> > -
> > -/* Check bits in range mask */
> > -        jl        L(RANGEMASK_CHECK)
> > -                                # LOE rbx r15 r12d r13d
> > -
> > -        movq      16(%rsp), %r12
> > -        cfi_restore(12)
> > -        movq      8(%rsp), %r13
> > -        cfi_restore(13)
> > -        movq      (%rsp), %r14
> > -        cfi_restore(14)
> > -        vmovups   128(%rsp), %zmm0
> > -
> > -/* Go to exit */
> > -        jmp       L(EXIT)
> > -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> > -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> > -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> > -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> > -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> > -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> > -                                # LOE rbx r12 r13 r14 r15 zmm0
> > -
> > -/* Scalar math fucntion call
> > - * to process special input
> > - */
> > +       /* use r12 as index for special value that is saved across calls to
> > +          tanhf. We technically don't need a callee save register here as offset
> > +          to rsp is always [0, 56] so we can restore rsp by realigning to 64.
> > +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> > +          in the loop.  */
> > +       xorl    %r12d, %r12d
> > +       tzcntl  %r13d, %r12d
> >
> > -L(SCALAR_MATH_CALL):
> > -        movl      %r12d, %r14d
> > -        movss     64(%rsp,%r14,4), %xmm0
> > -        call      tanhf@PLT
> > -                                # LOE rbx r14 r15 r12d r13d xmm0
> > +       /* Scalar math function call to process special input.  */
> > +       movss   64(%rsp, %r12, 4), %xmm0
> > +       call    tanhf@PLT
> >
> > -        movss     %xmm0, 128(%rsp,%r14,4)
> > +       /* No good way to avoid the store-forwarding fault this will cause on
> > +          return. `lfence` avoids the SF fault but at greater cost as it
> > +          serializes stack/callee save restoration.  */
> > +       movss   %xmm0, (%rsp, %r12, 4)
> >
> > -/* Process special inputs in loop */
> > -        jmp       L(SPECIAL_VALUES_LOOP)
> > -                                # LOE rbx r15 r12d r13d
> > -END(_ZGVeN16v_tanhf_skx)
> > +       blsr    %r13d, %r13d
> > +       jnz     L(SPECIAL_VALUES_LOOP)
> >
> > -        .section .rodata, "a"
> > -        .align 64
> > +       /* All results have been written to (%rsp).  */
> > +       vmovaps (%rsp), %zmm0
> > +       /* Restore rsp.  */
> > +       movq    %rbp, %rsp
> > +       /* Restore callee save registers.  */
> > +       popq    %r12
> > +       popq    %r13
> > +       popq    %rbp
> > +       ret
> > +END(_ZGVeN16v_tanhf_skx)
> >
> > +       .section .rodata, "a"
> > +       .align  16
> >  #ifdef __svml_stanh_data_internal_typedef
> > -typedef unsigned int VUINT32;
> > -typedef struct
> > -{
> > -        __declspec(align(64)) VUINT32 _sC[32][1];
> > -        __declspec(align(64)) VUINT32 _sP0[32][1];
> > -        __declspec(align(64)) VUINT32 _sP2[32][1];
> > -        __declspec(align(64)) VUINT32 _sP3[32][1];
> > -        __declspec(align(64)) VUINT32 _sP4[32][1];
> > -        __declspec(align(64)) VUINT32 _sP5[32][1];
> > -        __declspec(align(64)) VUINT32 _sP6[32][1];
> > -        __declspec(align(64)) VUINT32 _sP7[32][1];
> > -        __declspec(align(64)) VUINT32 _iExpMantMask_UISA[16][1];
> > -        __declspec(align(64)) VUINT32 _iMinIdxOfsMask_UISA[16][1];
> > -        __declspec(align(64)) VUINT32 _iMaxIdxMask_UISA[16][1];
> > -        __declspec(align(64)) VUINT32 _sSignMask[16][1];
> > -        __declspec(align(64)) VUINT32 _sAbsMask[16][1];
> > -        __declspec(align(64)) VUINT32 _iExpMantMask[16][1];
> > -        __declspec(align(64)) VUINT32 _iExpMask[16][1];
> > -        __declspec(align(64)) VUINT32 _iMinIdxOfsMask[16][1];
> > -        __declspec(align(64)) VUINT32 _iMaxIdxMask[16][1];
> > -} __svml_stanh_data_internal;
> > +       typedef unsigned int VUINT32;
> > +       typedef struct
> > +       {
> > +       __declspec (align(4))VUINT32 _iExpMantMask_UISA[1][1];
> > +       __declspec (align(4))VUINT32 _iMinIdxOfsMask_UISA[1][1];
> > +       __declspec (align(4))VUINT32 _iMaxIdxMask_UISA[1][1];
> > +       __declspec (align(4))VUINT32 _iExpMask[1][1];
> > +       __declspec (align(64))VUINT32 _sSignMask[16][1];
> > +       __declspec (align(64))VUINT32 _sC_lo[16][1];
> > +       __declspec (align(64))VUINT32 _sC_hi[16][1];
> > +       __declspec (align(64))VUINT32 _sP7_lo[16][1];
> > +       __declspec (align(64))VUINT32 _sP7_hi[16][1];
> > +       __declspec (align(64))VUINT32 _sP6_lo[16][1];
> > +       __declspec (align(64))VUINT32 _sP6_hi[16][1];
> > +       __declspec (align(64))VUINT32 _sP5_lo[16][1];
> > +       __declspec (align(64))VUINT32 _sP5_hi[16][1];
> > +       __declspec (align(64))VUINT32 _sP4_lo[16][1];
> > +       __declspec (align(64))VUINT32 _sP4_hi[16][1];
> > +       __declspec (align(64))VUINT32 _sP3_lo[16][1];
> > +       __declspec (align(64))VUINT32 _sP3_hi[16][1];
> > +       __declspec (align(64))VUINT32 _sP2_lo[16][1];
> > +       __declspec (align(64))VUINT32 _sP2_hi[16][1];
> > +       __declspec (align(64))VUINT32 _sP0_lo[16][1];
> > +       __declspec (align(64))VUINT32 _sP0_hi[16][1];
> > +       }__svml_stanh_data_internal;
> >  #endif
> > +
> >  __svml_stanh_data_internal:
> > -        /*== _sC ==*/
> > -        .long 0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
> > -        .long 0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
> > -        .long 0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
> > -        .long 0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
> > -        .long 0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
> > -        .long 0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
> > -        .long 0x40500000, 0x40700000, 0x40900000, 0x40b00000
> > -        .long 0x40d00000, 0x40f00000, 0x41100000, 0x00000000
> > -        /*== p0 ==*/
> > -        .align 64
> > -        .long 0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
> > -        .long 0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
> > -        .long 0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
> > -        .long 0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
> > -        .long 0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
> > -        .long 0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
> > -        .long 0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
> > -        .long 0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
> > -        /*== p2 ==*/
> > -        .align 64
> > -        .long 0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
> > -        .long 0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
> > -        .long 0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
> > -        .long 0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
> > -        .long 0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
> > -        .long 0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
> > -        .long 0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
> > -        .long 0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
> > -        /*== p3 ==*/
> > -        .align 64
> > -        .long 0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
> > -        .long 0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
> > -        .long 0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
> > -        .long 0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
> > -        .long 0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
> > -        .long 0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
> > -        .long 0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
> > -        .long 0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
> > -        /*== p4 ==*/
> > -        .align 64
> > -        .long 0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
> > -        .long 0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
> > -        .long 0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
> > -        .long 0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
> > -        .long 0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
> > -        .long 0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
> > -        .long 0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
> > -        .long 0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
> > -        /*== p5 ==*/
> > -        .align 64
> > -        .long 0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
> > -        .long 0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
> > -        .long 0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
> > -        .long 0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
> > -        .long 0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
> > -        .long 0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
> > -        .long 0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
> > -        .long 0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
> > -        /*== p6 ==*/
> > -        .align 64
> > -        .long 0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
> > -        .long 0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
> > -        .long 0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
> > -        .long 0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
> > -        .long 0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
> > -        .long 0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
> > -        .long 0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
> > -        .long 0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
> > -        /*== p7 ==*/
> > -        .align 64
> > -        .long 0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
> > -        .long 0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
> > -        .long 0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
> > -        .long 0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
> > -        .long 0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
> > -        .long 0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
> > -        .long 0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
> > -        .long 0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
> > -        .align 64
> > -        .long 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000           /* _iExpMantMask_UISA     */
> > -        .align 64
> > -        .long 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000           /* _iMinIdxOfsMask_UISA   */
> > -        .align 64
> > -        .long 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000           /* _iMaxIdxMask_UISA      */
> > -        .align 64
> > -        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000           /* _sSignMask        */
> > -        .align 64
> > -        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff           /* _sAbsMask         */
> > -        .align 64
> > -        .long 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000           /* _iExpMantMask     */
> > -        .align 64
> > -        .long 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000           /* _iExpMask         */
> > -        .align 64
> > -        .long 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000           /* _iMinIdxOfsMask   */
> > -        .align 64
> > -        .long 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000           /* _iMaxIdxMask      */
> > -        .align 64
> > -        .type  __svml_stanh_data_internal,@object
> > -        .size  __svml_stanh_data_internal,.-__svml_stanh_data_internal
> > +       .align  4
> > +       /* _iExpMantMask_UISA.  */
> > +       .long   0x7fe00000
> > +
> > +       .align  4
> > +       /* _iMinIdxOfsMask_UISA.  */
> > +       .long   0x3d400000
> > +
> > +       .align  4
> > +       /* _iMaxIdxMask_UISA.  */
> > +       .long   0x03e00000
> > +
> > +       .align  4
> > +       /* _iExpMask.  */
> > +       .long   0x7f000000
> > +
> > +       .align  64
> > +       /* _sSignMask.  */
> > +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > +
> > +       .align  64
> > +       /* _sC_lo.  */
> > +       .long   0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
> > +       .long   0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
> > +       .long   0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
> > +       .long   0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
> > +
> > +       .align  64
> > +       /* _sC_hi.  */
> > +       .long   0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
> > +       .long   0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
> > +       .long   0x40500000, 0x40700000, 0x40900000, 0x40b00000
> > +       .long   0x40d00000, 0x40f00000, 0x41100000, 0x00000000
> > +
> > +       .align  64
> > +       /* _sP7_lo.  */
> > +       .long   0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
> > +       .long   0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
> > +       .long   0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
> > +       .long   0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
> > +
> > +       .align  64
> > +       /* _sP7_hi.  */
> > +       .long   0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
> > +       .long   0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
> > +       .long   0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
> > +       .long   0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
> > +
> > +       .align  64
> > +       /* _sP6_lo.  */
> > +       .long   0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
> > +       .long   0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
> > +       .long   0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
> > +       .long   0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
> > +
> > +       .align  64
> > +       /* _sP6_hi.  */
> > +       .long   0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
> > +       .long   0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
> > +       .long   0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
> > +       .long   0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
> > +
> > +       .align  64
> > +       /* _sP5_lo.  */
> > +       .long   0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
> > +       .long   0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
> > +       .long   0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
> > +       .long   0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
> > +
> > +       .align  64
> > +       /* _sP5_hi.  */
> > +       .long   0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
> > +       .long   0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
> > +       .long   0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
> > +       .long   0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
> > +
> > +       .align  64
> > +       /* _sP4_lo.  */
> > +       .long   0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
> > +       .long   0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
> > +       .long   0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
> > +       .long   0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
> > +
> > +       .align  64
> > +       /* _sP4_hi.  */
> > +       .long   0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
> > +       .long   0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
> > +       .long   0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
> > +       .long   0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
> > +
> > +       .align  64
> > +       /* _sP3_lo.  */
> > +       .long   0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
> > +       .long   0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
> > +       .long   0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
> > +       .long   0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
> > +
> > +       .align  64
> > +       /* _sP3_hi.  */
> > +       .long   0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
> > +       .long   0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
> > +       .long   0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
> > +       .long   0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
> > +
> > +       .align  64
> > +       /* _sP2_lo.  */
> > +       .long   0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
> > +       .long   0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
> > +       .long   0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
> > +       .long   0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
> > +
> > +       .align  64
> > +       /* _sP2_hi.  */
> > +       .long   0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
> > +       .long   0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
> > +       .long   0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
> > +       .long   0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
> > +
> > +       .align  64
> > +       /* _sP0_lo.  */
> > +       .long   0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
> > +       .long   0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
> > +       .long   0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
> > +       .long   0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
> > +
> > +       .align  64
> > +       /* _sP0_hi.  */
> > +       .long   0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
> > +       .long   0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
> > +       .long   0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
> > +       .long   0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
> > +
> > +       .align  64
> > +       .type   __svml_stanh_data_internal, @object
> > +       .size   __svml_stanh_data_internal, .-__svml_stanh_data_internal
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
> > index 50f753ffb3..716b06d640 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
> > @@ -70,763 +70,154 @@
> >   *
> >   */
> >
> > -/* Offsets for data table __svml_stanh_data_internal
> > - */
> > -#define _dbP                           0
> > -#define _sSignMask                     4288
> > -#define _sAbsMask                      4304
> > -#define _iExpMantMask                  4320
> > -#define _iExpMask                      4336
> > -#define _iMinIdxOfsMask                4352
> > -#define _iMaxIdxMask                   4368
> >
> >  #include <sysdep.h>
> >
> > +#define ONLY_DECL_OFFSET
> > +#include "svml_s_tanhf_rodata.S"
> > +
> >          .text
> >         .section .text.sse4,"ax",@progbits
> >  ENTRY(_ZGVbN4v_tanhf_sse4)
> > -        subq      $72, %rsp
> > -        cfi_def_cfa_offset(80)
> > -        movaps    %xmm0, %xmm5
> > +       /* Save copy of input in xmm12.  */
> > +       movaps  %xmm0, %xmm12
> >
> > -/* Here huge arguments, INF and NaNs are filtered out to callout. */
> > -        movdqu    _iExpMantMask+__svml_stanh_data_internal(%rip), %xmm9
> > -        lea       _dbP+16+__svml_stanh_data_internal(%rip), %r8
> > -        pand      %xmm5, %xmm9
> > +       /* Here huge arguments, INF and NaNs are filtered out to callout.  */
> > +       movdqu  TANHF_DATA(_iExpMantMask)(%rip), %xmm3
> > +       pand    %xmm0, %xmm3
> >
> > -/* if VMIN, VMAX is defined for I type */
> > -        pxor      %xmm7, %xmm7
> > -        movdqa    %xmm9, %xmm6
> > -        psubd     _iMinIdxOfsMask+__svml_stanh_data_internal(%rip), %xmm9
> >
> > -/*
> > - *  small table specific variables *
> > - *  Constant loading
> > - */
> > -        movdqu    _iMaxIdxMask+__svml_stanh_data_internal(%rip), %xmm10
> > -        movdqa    %xmm9, %xmm11
> > -        movdqa    %xmm9, %xmm8
> > -        pcmpgtd   %xmm10, %xmm11
> > -        pcmpgtd   %xmm7, %xmm8
> > -        movdqa    %xmm11, %xmm14
> > -        pand      %xmm8, %xmm9
> > -        andps     %xmm11, %xmm10
> > -        andnps    %xmm9, %xmm14
> > -        orps      %xmm10, %xmm14
> > -        psrld     $14, %xmm14
> > -        movd      %xmm14, %edx
> > -        pshufd    $1, %xmm14, %xmm12
> > -        pshufd    $2, %xmm14, %xmm13
> > -        movd      %xmm12, %ecx
> > -        pshufd    $3, %xmm14, %xmm15
> > -        movups    _sAbsMask+__svml_stanh_data_internal(%rip), %xmm3
> > -        movslq    %edx, %rdx
> > -        andps     %xmm5, %xmm3
> > -        movslq    %ecx, %rcx
> > -        pcmpgtd   _iExpMask+__svml_stanh_data_internal(%rip), %xmm6
> > -        movd      %xmm13, %esi
> > -        movups    -16(%rdx,%r8), %xmm2
> > -        movaps    %xmm2, %xmm0
> > -        movd      %xmm15, %edi
> > -        movmskps  %xmm6, %eax
> > -        movups    -16(%rcx,%r8), %xmm6
> > -        unpcklpd  %xmm6, %xmm0
> > -        unpckhpd  %xmm6, %xmm2
> > -        cvtps2pd  %xmm3, %xmm6
> > -        movhlps   %xmm3, %xmm3
> > -        cvtps2pd  %xmm3, %xmm3
> > -        movslq    %esi, %rsi
> > -        movslq    %edi, %rdi
> > -        movups    (%rcx,%r8), %xmm8
> > -        movups    (%rdx,%r8), %xmm12
> > -        movups    (%rsi,%r8), %xmm13
> > -        movaps    %xmm12, %xmm10
> > -        movups    (%rdi,%r8), %xmm9
> > -        movaps    %xmm13, %xmm11
> > -        unpckhpd  %xmm8, %xmm12
> > -        unpckhpd  %xmm9, %xmm13
> > -        mulpd     %xmm6, %xmm12
> > -        mulpd     %xmm3, %xmm13
> > -        unpcklpd  %xmm8, %xmm10
> > -        unpcklpd  %xmm9, %xmm11
> > -        addpd     %xmm10, %xmm12
> > -        addpd     %xmm11, %xmm13
> > -        mulpd     %xmm6, %xmm12
> > -        mulpd     %xmm3, %xmm13
> > -        addpd     %xmm2, %xmm12
> > -        movups    -16(%rsi,%r8), %xmm1
> > -        movups    -16(%rdi,%r8), %xmm7
> > -        movaps    %xmm1, %xmm14
> > -        unpckhpd  %xmm7, %xmm1
> > -        addpd     %xmm1, %xmm13
> > -        mulpd     %xmm12, %xmm6
> > -        mulpd     %xmm13, %xmm3
> > -        addpd     %xmm0, %xmm6
> > -        unpcklpd  %xmm7, %xmm14
> > -        addpd     %xmm14, %xmm3
> > -        cvtpd2ps  %xmm6, %xmm0
> > -        cvtpd2ps  %xmm3, %xmm1
> > -        movups    _sSignMask+__svml_stanh_data_internal(%rip), %xmm4
> > -        movlhps   %xmm1, %xmm0
> > -        andps     %xmm5, %xmm4
> > -        orps      %xmm4, %xmm0
> > -        testl     %eax, %eax
> > -
> > -/* Go to special inputs processing branch */
> > -        jne       L(SPECIAL_VALUES_BRANCH)
> > -                                # LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm5
> > -
> > -/* Restore registers
> > - * and exit the function
> > - */
> > +       /* Selection of arguments between [0, 0x04280000] into xmm3.  */
> > +       pxor    %xmm7, %xmm7
> > +       /* Save xmm3 for special values check at end.  */
> > +       movdqa  %xmm3, %xmm8
> > +       psubd   TANHF_DATA(_iMinIdxOfsMask)(%rip), %xmm3
> > +       pmaxsd  %xmm7, %xmm3
> > +       pminsd  TANHF_DATA(_iMaxIdxMask)(%rip), %xmm3
> > +       psrld   $14, %xmm3
> >
> > -L(EXIT):
> > -        addq      $72, %rsp
> > -        cfi_def_cfa_offset(8)
> > -        ret
> > -        cfi_def_cfa_offset(80)
> > +       movq    %xmm3, %rcx
> > +       movl    %ecx, %edx
> > +       shrq    $32, %rcx
> >
> > -/* Branch to process
> > - * special inputs
> > - */
> > +       /* xmm8 contains mask of special values.  */
> > +       pcmpgtd TANHF_DATA(_iExpMask)(%rip), %xmm8
> >
> > -L(SPECIAL_VALUES_BRANCH):
> > -        movups    %xmm5, 32(%rsp)
> > -        movups    %xmm0, 48(%rsp)
> > -                                # LOE rbx rbp r12 r13 r14 r15 eax
> > -
> > -        xorl      %edx, %edx
> > -        movq      %r12, 16(%rsp)
> > -        cfi_offset(12, -64)
> > -        movl      %edx, %r12d
> > -        movq      %r13, 8(%rsp)
> > -        cfi_offset(13, -72)
> > -        movl      %eax, %r13d
> > -        movq      %r14, (%rsp)
> > -        cfi_offset(14, -80)
> > -                                # LOE rbx rbp r15 r12d r13d
> > -
> > -/* Range mask
> > - * bits check
> > - */
> > +       pshufd  $0x0e, %xmm3, %xmm3
> > +       movq    %xmm3, %rdi
> > +       movl    %edi, %esi
> > +       shrq    $32, %rdi
> >
> > -L(RANGEMASK_CHECK):
> > -        btl       %r12d, %r13d
> > +       movaps  TANHF_DATA(_sAbsMask)(%rip), %xmm1
> > +       andps   %xmm1, %xmm0
> >
> > -/* Call scalar math function */
> > -        jc        L(SCALAR_MATH_CALL)
> > -                                # LOE rbx rbp r15 r12d r13d
> > +       leaq    TANHF_DATA(_lookupTable)(%rip), %rax
> > +       movups  (%rdx, %rax), %xmm2
> > +       movups  (%rcx, %rax), %xmm6
> >
> > -/* Special inputs
> > - * processing loop
> > - */
> > +       movaps  %xmm2, %xmm4
> > +       movlhps %xmm6, %xmm4
> > +       unpckhpd %xmm6, %xmm2
> >
> > -L(SPECIAL_VALUES_LOOP):
> > -        incl      %r12d
> > -        cmpl      $4, %r12d
> > -
> > -/* Check bits in range mask */
> > -        jl        L(RANGEMASK_CHECK)
> > -                                # LOE rbx rbp r15 r12d r13d
> > -
> > -        movq      16(%rsp), %r12
> > -        cfi_restore(12)
> > -        movq      8(%rsp), %r13
> > -        cfi_restore(13)
> > -        movq      (%rsp), %r14
> > -        cfi_restore(14)
> > -        movups    48(%rsp), %xmm0
> > -
> > -/* Go to exit */
> > -        jmp       L(EXIT)
> > -        cfi_offset(12, -64)
> > -        cfi_offset(13, -72)
> > -        cfi_offset(14, -80)
> > -                                # LOE rbx rbp r12 r13 r14 r15 xmm0
> > -
> > -/* Scalar math fucntion call
> > - * to process special input
> > - */
> > +       cvtps2pd %xmm0, %xmm6
> > +       movhlps %xmm0, %xmm0
> > +       cvtps2pd %xmm0, %xmm0
> >
> > -L(SCALAR_MATH_CALL):
> > -        movl      %r12d, %r14d
> > -        movss     32(%rsp,%r14,4), %xmm0
> > -        call      tanhf@PLT
> > -                                # LOE rbx rbp r14 r15 r12d r13d xmm0
> > +       movups  16(%rdx, %rax), %xmm5
> > +       movups  16(%rsi, %rax), %xmm13
> >
> > -        movss     %xmm0, 48(%rsp,%r14,4)
> > +       movaps  %xmm5, %xmm10
> > +       movaps  %xmm13, %xmm11
> >
> > -/* Process special inputs in loop */
> > -        jmp       L(SPECIAL_VALUES_LOOP)
> > -                                # LOE rbx rbp r15 r12d r13d
> > -END(_ZGVbN4v_tanhf_sse4)
> > +       movups  16(%rcx, %rax), %xmm7
> > +       movups  16(%rdi, %rax), %xmm3
> > +
> > +       unpckhpd %xmm7, %xmm5
> > +       unpckhpd %xmm3, %xmm13
> > +
> > +       mulpd   %xmm6, %xmm5
> > +       mulpd   %xmm0, %xmm13
> > +
> > +       movlhps %xmm7, %xmm10
> > +       movlhps %xmm3, %xmm11
> > +
> > +       addpd   %xmm10, %xmm5
> > +       addpd   %xmm11, %xmm13
> > +
> > +       mulpd   %xmm6, %xmm5
> > +       mulpd   %xmm0, %xmm13
> > +
> > +       addpd   %xmm2, %xmm5
> >
> > -        .section .rodata, "a"
> > -        .align 16
> > -
> > -#ifdef __svml_stanh_data_internal_typedef
> > -typedef unsigned int VUINT32;
> > -typedef struct
> > -{
> > -        __declspec(align(16)) VUINT32 _dbP[(134*4)][2];
> > -        __declspec(align(16)) VUINT32 _sSignMask[4][1];
> > -        __declspec(align(16)) VUINT32 _sAbsMask[4][1];
> > -        __declspec(align(16)) VUINT32 _iExpMantMask[4][1];
> > -        __declspec(align(16)) VUINT32 _iExpMask[4][1];
> > -        __declspec(align(16)) VUINT32 _iMinIdxOfsMask[4][1];
> > -        __declspec(align(16)) VUINT32 _iMaxIdxMask[4][1];
> > -} __svml_stanh_data_internal;
> > -#endif
> > -__svml_stanh_data_internal:
> > -        /* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500]. */
> > -        .quad 0x0000000000000000  /* A00 = +0.000000000000000000000e-01 */
> > -        .quad 0x3FF00000022C70EB  /* A01 = +1.000000008097283510367e+00 */
> > -        .quad 0xBED00E878CFFA194  /* A02 = -3.828228912518614443549e-06 */
> > -        .quad 0xBFD551766D0607A9  /* A03 = -3.330970825846813476723e-01 */
> > -        .quad 0xBE53D60CE3E4C297  /* A00 = -1.847383956330407336230e-08 */
> > -        .quad 0x3FF000024177CF5C  /* A01 = +1.000002151235967140508e+00 */
> > -        .quad 0xBF1758BC94A51A25  /* A02 = -8.906031613262943753568e-05 */
> > -        .quad 0xBFD53EAE67E0D4F0  /* A03 = -3.319507612644221339337e-01 */
> > -        .quad 0xBE5A9E47EF32D6FE  /* A00 = -2.479020984039698285657e-08 */
> > -        .quad 0x3FF00002DA983057  /* A01 = +1.000002721676556793895e+00 */
> > -        .quad 0xBF1BD953509E94AA  /* A02 = -1.062352277175377670507e-04 */
> > -        .quad 0xBFD53BDB562EEDD5  /* A03 = -3.317783681520414806876e-01 */
> > -        .quad 0xBE6191BBE496D294  /* A00 = -3.272532162914017685901e-08 */
> > -        .quad 0x3FF0000390492017  /* A01 = +1.000003398528866105366e+00 */
> > -        .quad 0xBF20727E814A57CE  /* A02 = -1.254825043772153972919e-04 */
> > -        .quad 0xBFD538DE060A6F22  /* A03 = -3.315959033004550748913e-01 */
> > -        .quad 0xBE66DAFA2A893A25  /* A00 = -4.257146219278012568149e-08 */
> > -        .quad 0x3FF0000465E08CD1  /* A01 = +1.000004194219219266770e+00 */
> > -        .quad 0xBF2341C765EF91B6  /* A02 = -1.469188600530365522261e-04 */
> > -        .quad 0xBFD535B6841FAF9E  /* A03 = -3.314033785124993469751e-01 */
> > -        .quad 0xBE6D5794E361E964  /* A00 = -5.465394929765249413434e-08 */
> > -        .quad 0x3FF000055EE2A0CB  /* A01 = +1.000005121846742950353e+00 */
> > -        .quad 0xBF265E6C77E66C8B  /* A02 = -1.706607253709506650304e-04 */
> > -        .quad 0xBFD53264DDCCEDA6  /* A03 = -3.312008062382240103361e-01 */
> > -        .quad 0xBE729C844D374A6E  /* A00 = -6.933284462462096107184e-08 */
> > -        .quad 0x3FF000067F019093  /* A01 = +1.000006195180536350264e+00 */
> > -        .quad 0xBF29CC5348D6DCE5  /* A02 = -1.968242326435338705130e-04 */
> > -        .quad 0xBFD52EE92121ED35  /* A03 = -3.309881995734998416658e-01 */
> > -        .quad 0xBE775AEA17EAA872  /* A00 = -8.700465590574974405858e-08 */
> > -        .quad 0x3FF00007CA1D66B8  /* A01 = +1.000007428656699559610e+00 */
> > -        .quad 0xBF2D8F5EB98A2637  /* A02 = -2.255252009216044881395e-04 */
> > -        .quad 0xBFD52B435CDF9128  /* A03 = -3.307655722585587376727e-01 */
> > -        .quad 0xBE7D04DA28C343F0  /* A00 = -1.081040272327705484794e-07 */
> > -        .quad 0x3FF000094443CCF5  /* A01 = +1.000008837375216730337e+00 */
> > -        .quad 0xBF30D5B76C947AE5  /* A02 = -2.568791210978817814332e-04 */
> > -        .quad 0xBFD52773A0776FAD  /* A03 = -3.305329386764651045105e-01 */
> > -        .quad 0xBE81DD77A12C51C7  /* A00 = -1.331054169875768625701e-07 */
> > -        .quad 0x3FF0000AF1AFD2DA  /* A01 = +1.000010437096696680470e+00 */
> > -        .quad 0xBF331230624C1680  /* A02 = -2.910011410651516805537e-04 */
> > -        .quad 0xBFD52379FC0B61DF  /* A03 = -3.302903138515186909352e-01 */
> > -        .quad 0xBE85D04EEEB3C435  /* A00 = -1.625247628488202841012e-07 */
> > -        .quad 0x3FF0000CD6C9B1F2  /* A01 = +1.000012244238970726684e+00 */
> > -        .quad 0xBF357F0742FADDD4  /* A02 = -3.280060509313874068243e-04 */
> > -        .quad 0xBFD51F56806D0E81  /* A03 = -3.300377134475880880338e-01 */
> > -        .quad 0xBE8A6E289B59681B  /* A00 = -1.969211333326924655065e-07 */
> > -        .quad 0x3FF0000EF8268F72  /* A01 = +1.000014275873550406715e+00 */
> > -        .quad 0xBF381E277A1B747A  /* A02 = -3.680082682942575423093e-04 */
> > -        .quad 0xBFD51B093F1D6FD4  /* A03 = -3.297751537663746734808e-01 */
> > -        .quad 0xBE8FCBC40EE9ABD5  /* A00 = -2.368983653301529373887e-07 */
> > -        .quad 0x3FF000115A883B6C  /* A01 = +1.000016549721943981410e+00 */
> > -        .quad 0xBF3AF17AC974B3D9  /* A02 = -4.111218235774406434303e-04 */
> > -        .quad 0xBFD516924A4C549C  /* A03 = -3.295026517456081105450e-01 */
> > -        .quad 0xBE92FFBC60A3F956  /* A00 = -2.831066871072026054144e-07 */
> > -        .quad 0x3FF0001402DCED8A  /* A01 = +1.000019084151832604590e+00 */
> > -        .quad 0xBF3DFAE9390C4801  /* A02 = -4.574603454311488280083e-04 */
> > -        .quad 0xBFD511F1B4D7DC3A  /* A03 = -3.292202249571719585575e-01 */
> > -        .quad 0xBE9690A22F96D5AD  /* A00 = -3.362443262393081632612e-07 */
> > -        .quad 0x3FF00016F63EFF5D  /* A01 = +1.000021898173108825247e+00 */
> > -        .quad 0xBF409E2C839605BB  /* A02 = -5.071370461992499986334e-04 */
> > -        .quad 0xBFD50D27924BEE00  /* A03 = -3.289278916051614487515e-01 */
> > -        .quad 0xBE9AA56C65E72A73  /* A00 = -3.970591019557469835586e-07 */
> > -        .quad 0x3FF0001A39F4A43E  /* A01 = +1.000025011433776978009e+00 */
> > -        .quad 0xBF425BD74C3D6667  /* A02 = -5.602647074553602319844e-04 */
> > -        .quad 0xBFD50833F6E1ABA2  /* A03 = -3.286256705238718156536e-01 */
> > -        .quad 0xBE9F4BD4FF1A83B0  /* A00 = -4.663500013744687071912e-07 */
> > -        .quad 0x3FF0001DD36F9EC2  /* A01 = +1.000028444215715683896e+00 */
> > -        .quad 0xBF44376634149405  /* A02 = -6.169556656102642569831e-04 */
> > -        .quad 0xBFD50316F77EDEE5  /* A03 = -3.283135811757190158922e-01 */
> > -        .quad 0xBEA3B625387BB079  /* A00 = -5.874486399249461304297e-07 */
> > -        .quad 0x3FF00023E14CFBA9  /* A01 = +1.000034217911642153709e+00 */
> > -        .quad 0xBF47392F923218D2  /* A02 = -7.087213783883111826306e-04 */
> > -        .quad 0xBFD4FB1FACDEB938  /* A03 = -3.278273761924483942209e-01 */
> > -        .quad 0xBEAA6E24F543500A  /* A00 = -7.876828740601738750574e-07 */
> > -        .quad 0x3FF0002D5C6E8412  /* A01 = +1.000043259679163742959e+00 */
> > -        .quad 0xBF4BAF02BD7FDD70  /* A02 = -8.448375110664940040861e-04 */
> > -        .quad 0xBFD4EFEE6527A7DE  /* A03 = -3.271442401734229177279e-01 */
> > -        .quad 0xBEB16E3EBE2157D0  /* A00 = -1.038947396133402500647e-06 */
> > -        .quad 0x3FF00038990FEE2F  /* A01 = +1.000053975962952312884e+00 */
> > -        .quad 0xBF50569481C574CB  /* A02 = -9.972048056490652716971e-04 */
> > -        .quad 0xBFD4E419278DA2B4  /* A03 = -3.264220129263251113372e-01 */
> > -        .quad 0xBEB6A7B6723165D4  /* A00 = -1.350350836279403750524e-06 */
> > -        .quad 0x3FF00045CAB4158E  /* A01 = +1.000066558657042303793e+00 */
> > -        .quad 0xBF531D7C9C849108  /* A02 = -1.166698160951775212202e-03 */
> > -        .quad 0xBFD4D7A0BB33B152  /* A03 = -3.256608799117844954552e-01 */
> > -        .quad 0xBEBD0EE2A8654AFD  /* A00 = -1.732000471561702711532e-06 */
> > -        .quad 0x3FF00055276F18D6  /* A01 = +1.000081209219890521211e+00 */
> > -        .quad 0xBF562FDBA3FB6C6C  /* A02 = -1.354183666925102939860e-03 */
> > -        .quad 0xBFD4CA85F1B93DB2  /* A03 = -3.248610363561638125773e-01 */
> > -        .quad 0xBEC269D4036A207E  /* A00 = -2.195047297096822741730e-06 */
> > -        .quad 0x3FF00066E7DA6E4E  /* A01 = +1.000098138500919997540e+00 */
> > -        .quad 0xBF5991499FC36B3A  /* A02 = -1.560518167983372759405e-03 */
> > -        .quad 0xBFD4BCC9A72283D6  /* A03 = -3.240226871658341556426e-01 */
> > -        .quad 0xBEC7154B6C09CFE1  /* A00 = -2.751729738565190291276e-06 */
> > -        .quad 0x3FF0007B47086B80  /* A01 = +1.000117566559055148900e+00 */
> > -        .quad 0xBF5D455433B4F8F4  /* A02 = -1.786548832412968197680e-03 */
> > -        .quad 0xBFD4AE6CC1BFE145  /* A03 = -3.231460468373550942722e-01 */
> > -        .quad 0xBECCA68CC64A0F8A  /* A00 = -3.415415948561670285790e-06 */
> > -        .quad 0x3FF00092827742F7  /* A01 = +1.000139722473418535387e+00 */
> > -        .quad 0xBF60A7BF15A527AF  /* A02 = -2.033112728132522705610e-03 */
> > -        .quad 0xBFD49F703214084C  /* A03 = -3.222313393636155876010e-01 */
> > -        .quad 0xBED19E68676B241B  /* A00 = -4.200644630977303616698e-06 */
> > -        .quad 0x3FF000ACDA037B26  /* A01 = +1.000164844146362863597e+00 */
> > -        .quad 0xBF62D99F836A02F8  /* A02 = -2.301036405072284102280e-03 */
> > -        .quad 0xBFD48FD4F2B91B28  /* A03 = -3.212787981359945810311e-01 */
> > -        .quad 0xBED57CF4B0C7AA54  /* A00 = -5.123164339408145209103e-06 */
> > -        .quad 0x3FF000CA8FD9E1A1  /* A01 = +1.000193178099017865534e+00 */
> > -        .quad 0xBF653A014548E686  /* A02 = -2.591135484433962181405e-03 */
> > -        .quad 0xBFD47F9C0844B38F  /* A03 = -3.202886658426046806447e-01 */
> > -        .quad 0xBEDA012B1B1A41E2  /* A00 = -6.199971197454598722328e-06 */
> > -        .quad 0x3FF000EBE868FDF4  /* A01 = +1.000224979259539459520e+00 */
> > -        .quad 0xBF67CA9427E0A544  /* A02 = -2.904214255086275467410e-03 */
> > -        .quad 0xBFD46EC6812ADB37  /* A03 = -3.192611943626845749655e-01 */
> > -        .quad 0xBEDF3EAC5BF12194  /* A00 = -7.449344990702664567927e-06 */
> > -        .quad 0x3FF001112A520784  /* A01 = +1.000260510744255704196e+00 */
> > -        .quad 0xBF6A8D01ABDA4DC4  /* A02 = -3.241065277345108255891e-03 */
> > -        .quad 0xBFD45D55759FFA4A  /* A03 = -3.181966446572103146551e-01 */
> > -        .quad 0xBEE2A541BC274267  /* A00 = -8.890883582164319970972e-06 */
> > -        .quad 0x3FF0013A9E5961F2  /* A01 = +1.000300043631906721231e+00 */
> > -        .quad 0xBF6D82ECD080C540  /* A02 = -3.602468994380686462264e-03 */
> > -        .quad 0xBFD44B4A0779C0AD  /* A03 = -3.170952866557950611259e-01 */
> > -        .quad 0xBEE61D97609A27F4  /* A00 = -1.054553560499505625520e-05 */
> > -        .quad 0x3FF001688F56A3AF  /* A01 = +1.000343856731187974773e+00 */
> > -        .quad 0xBF7056F8EFB683EC  /* A02 = -3.989193351487490407647e-03 */
> > -        .quad 0xBFD438A5620F0F74  /* A03 = -3.159573991399533543500e-01 */
> > -        .quad 0xBEEA145429EDD370  /* A00 = -1.243563138839952927732e-05 */
> > -        .quad 0x3FF0019B4A242A67  /* A01 = +1.000392236341804297339e+00 */
> > -        .quad 0xBF7207D31CA78D9B  /* A02 = -4.401993423445739288258e-03 */
> > -        .quad 0xBFD42568BA16E7CD  /* A03 = -3.147832696228050619602e-01 */
> > -        .quad 0xBEEE96370D52680F  /* A00 = -1.458491207477835326165e-05 */
> > -        .quad 0x3FF001D31D8E4115  /* A01 = +1.000445476009251821736e+00 */
> > -        .quad 0xBF73D4CC11EDC094  /* A02 = -4.841611050196221316400e-03 */
> > -        .quad 0xBFD411954D8664E7  /* A03 = -3.135731942252974469021e-01 */
> > -        .quad 0xBEF338C046215EF8  /* A00 = -1.833122622260562810219e-05 */
> > -        .quad 0x3FF00230C32C2EC1  /* A01 = +1.000534784691737621998e+00 */
> > -        .quad 0xBF76BD019BCC5DAF  /* A02 = -5.551344188254799492943e-03 */
> > -        .quad 0xBFD3F2C7156DC21E  /* A03 = -3.116929730668135389848e-01 */
> > -        .quad 0xBEF9B15EAE411EAE  /* A00 = -2.450261207822986676092e-05 */
> > -        .quad 0x3FF002C2DF057A4D  /* A01 = +1.000674124886830940184e+00 */
> > -        .quad 0xBF7B08CCD9AC1E30  /* A02 = -6.600189396301511801646e-03 */
> > -        .quad 0xBFD3C7A7A114FED8  /* A03 = -3.090609620157755976777e-01 */
> > -        .quad 0xBF00E36483C373B3  /* A00 = -3.221178528332122595812e-05 */
> > -        .quad 0x3FF0036F419480D7  /* A01 = +1.000838524028997644777e+00 */
> > -        .quad 0xBF7FD255D1777007  /* A02 = -7.768950679260206403087e-03 */
> > -        .quad 0xBFD39A453911D6CE  /* A03 = -3.062909180947429588215e-01 */
> > -        .quad 0xBF05DFA04DD12059  /* A00 = -4.172046622180685472624e-05 */
> > -        .quad 0x3FF00438B2A03D8D  /* A01 = +1.001030633695197069599e+00 */
> > -        .quad 0xBF828F8DBB4A9D10  /* A02 = -9.062869337255224921890e-03 */
> > -        .quad 0xBFD36AAB704697D9  /* A03 = -3.033856007044711255993e-01 */
> > -        .quad 0xBF0BF3E0C647DEFB  /* A00 = -5.331544597092331081714e-05 */
> > -        .quad 0x3FF005221063D36D  /* A01 = +1.001253189109060359741e+00 */
> > -        .quad 0xBF857A2CB3C96102  /* A02 = -1.048693584122917590862e-02 */
> > -        .quad 0xBFD338E65BBB4FEC  /* A03 = -3.003478904549854444639e-01 */
> > -        .quad 0xBF11A506ED7C9D31  /* A00 = -6.730894835681591541979e-05 */
> > -        .quad 0x3FF0062E4D0EA92A  /* A01 = +1.001508999829250345925e+00 */
> > -        .quad 0xBF88AB82C2761AF3  /* A02 = -1.204588085125866091241e-02 */
> > -        .quad 0xBFD305028D6BD206  /* A03 = -2.971807843271395688234e-01 */
> > -        .quad 0xBF1607C0922D9BF1  /* A00 = -8.403885708006799337092e-05 */
> > -        .quad 0x3FF007606C341961  /* A01 = +1.001800940198869449560e+00 */
> > -        .quad 0xBF8C25E6DA487BCF  /* A02 = -1.374416688582682892494e-02 */
> > -        .quad 0xBFD2CF0D0EE8F7B5  /* A03 = -2.938873906713255768075e-01 */
> > -        .quad 0xBF1B3A8480A0A16D  /* A00 = -1.038688061788578038307e-04 */
> > -        .quad 0x3FF008BB802D02D6  /* A01 = +1.002131939589323561535e+00 */
> > -        .quad 0xBF8FEB8AE99FD100  /* A02 = -1.558598065819483124983e-02 */
> > -        .quad 0xBFD297135BD0911B  /* A03 = -2.904709240558688843059e-01 */
> > -        .quad 0xBF20ABB9BDB75C65  /* A00 = -1.271881327357976163798e-04 */
> > -        .quad 0x3FF00A42A76D8CD1  /* A01 = +1.002504972472525901495e+00 */
> > -        .quad 0xBF91FF3D752BB9E6  /* A02 = -1.757522609380570560722e-02 */
> > -        .quad 0xBFD25D235C1F88B4  /* A03 = -2.869346999779154305799e-01 */
> > -        .quad 0xBF243D3254425461  /* A00 = -1.544116913733432829448e-04 */
> > -        .quad 0x3FF00BF909D1795E  /* A01 = +1.002923048355647051011e+00 */
> > -        .quad 0xBF94304E04D44942  /* A02 = -1.971551804042204897316e-02 */
> > -        .quad 0xBFD2214B5E61CFA6  /* A03 = -2.832821294498394371075e-01 */
> > -        .quad 0xBF286070011B61CE  /* A00 = -1.859795307186510085994e-04 */
> > -        .quad 0x3FF00DE1D5E1627E  /* A01 = +1.003389201612804537689e+00 */
> > -        .quad 0xBF9689D5F4163F59  /* A02 = -2.201017668045266231780e-02 */
> > -        .quad 0xBFD1E39A11C3B42C  /* A03 = -2.795167134743816728104e-01 */
> > -        .quad 0xBF2D250B366A79E8  /* A00 = -2.223564326486314902259e-04 */
> > -        .quad 0x3FF010003E134001  /* A01 = +1.003906481248123094829e+00 */
> > -        .quad 0xBF990C9FF91F6F81  /* A02 = -2.446222265267250853271e-02 */
> > -        .quad 0xBFD1A41E80084CDC  /* A03 = -2.756420374218586655246e-01 */
> > -        .quad 0xBF314DB5DDC2A30E  /* A00 = -2.640313157465248123865e-04 */
> > -        .quad 0x3FF012577608921B  /* A01 = +1.004477940624503018441e+00 */
> > -        .quad 0xBF9BB9626875B0C9  /* A02 = -2.707437288829409385849e-02 */
> > -        .quad 0xBFD162E80768A9D0  /* A03 = -2.716617653228725615122e-01 */
> > -        .quad 0xBF346A6133808864  /* A00 = -3.115165050094957730625e-04 */
> > -        .quad 0x3FF014EAAFCC88A3  /* A01 = +1.005106627192198898157e+00 */
> > -        .quad 0xBF9E90BEF9BF7419  /* A02 = -2.984903716411588595059e-02 */
> > -        .quad 0xBFD12006545F7FAD  /* A03 = -2.675796340899932457269e-01 */
> > -        .quad 0xBF37F180DC3848EA  /* A00 = -3.653468704395550778821e-04 */
> > -        .quad 0x3FF017BD19147861  /* A01 = +1.005795572250939295955e+00 */
> > -        .quad 0xBFA0C9A14C702E07  /* A02 = -3.278831537326359207851e-02 */
> > -        .quad 0xBFD0DB895B650092  /* A03 = -2.633994476818851682154e-01 */
> > -        .quad 0xBF3BEC6AAC6D7635  /* A00 = -4.260788377246944457107e-04 */
> > -        .quad 0x3FF01AD1D884E719  /* A01 = +1.006547780778822565040e+00 */
> > -        .quad 0xBFA260B2A1B1434A  /* A02 = -3.589399551186163439542e-02 */
> > -        .quad 0xBFD09581529E93D6  /* A03 = -2.591250712233067465817e-01 */
> > -        .quad 0xBF4164E26167882B  /* A00 = -5.308251737086202562063e-04 */
> > -        .quad 0x3FF01FEF14B62B81  /* A01 = +1.007796364693348545316e+00 */
> > -        .quad 0xBFA4EB014538AA42  /* A02 = -4.085544557559163403315e-02 */
> > -        .quad 0xBFD029D36FEAF41F  /* A03 = -2.525528519580024222613e-01 */
> > -        .quad 0xBF46F6FFF4E53DC8  /* A00 = -7.008313930700277652464e-04 */
> > -        .quad 0x3FF027CBB51CBBA0  /* A01 = +1.009715754956893363214e+00 */
> > -        .quad 0xBFA89DEC9FEC112E  /* A02 = -4.807986690687680864098e-02 */
> > -        .quad 0xBFCF2A99464D0DB4  /* A03 = -2.434875100390009317053e-01 */
> > -        .quad 0xBF4DCC9C4F66A4D9  /* A00 = -9.094012482836712945103e-04 */
> > -        .quad 0x3FF030E7CFCCD583  /* A01 = +1.011939822882909068014e+00 */
> > -        .quad 0xBFACAA3B95814081  /* A02 = -5.598627281199331645611e-02 */
> > -        .quad 0xBFCDF78F156BE7CF  /* A03 = -2.341173987004467604844e-01 */
> > -        .quad 0xBF5308ED74E5C7A6  /* A00 = -1.161796466103906435435e-03 */
> > -        .quad 0x3FF03B5986412ECB  /* A01 = +1.014489674026594512313e+00 */
> > -        .quad 0xBFB087EBA88DCC3F  /* A02 = -6.457398285947223148806e-02 */
> > -        .quad 0xBFCCBB9BD134862F  /* A03 = -2.244753619680052991736e-01 */
> > -        .quad 0xBF57FA23C00DF4B5  /* A00 = -1.463446533505758208674e-03 */
> > -        .quad 0x3FF0473558A1BCC0  /* A01 = +1.017384859292903342975e+00 */
> > -        .quad 0xBFB2E702BC6360EF  /* A02 = -7.383744334527241048871e-02 */
> > -        .quad 0xBFCB77D546379288  /* A03 = -2.145945160729250122955e-01 */
> > -        .quad 0xBF5DD12971557F71  /* A00 = -1.819887610814388068450e-03 */
> > -        .quad 0x3FF0548DDF5000A8  /* A01 = +1.020643112482540360020e+00 */
> > -        .quad 0xBFB571B63DA186E1  /* A02 = -8.376635555898871710045e-02 */
> > -        .quad 0xBFCA2D5202605148  /* A03 = -2.045080672838912594358e-01 */
> > -        .quad 0xBF6252B1AD5D4F17  /* A00 = -2.236697221556737096709e-03 */
> > -        .quad 0x3FF063738A910BF7  /* A01 = +1.024280110622155737232e+00 */
> > -        .quad 0xBFB8270C8E6B601B  /* A02 = -9.434584118878357184013e-02 */
> > -        .quad 0xBFC8DD27D950A07E  /* A03 = -1.942491351230763441116e-01 */
> > -        .quad 0xBF66470C91730CFC  /* A00 = -2.719425723258004842786e-03 */
> > -        .quad 0x3FF073F468FCF331  /* A01 = +1.028309259519300633556e+00 */
> > -        .quad 0xBFBB05C2952191E4  /* A02 = -1.055566419686964629854e-01 */
> > -        .quad 0xBFC7886A770DE2BD  /* A03 = -1.838505822486435070662e-01 */
> > -        .quad 0xBF6AD114AC8E98EC  /* A00 = -3.273525599485007861467e-03 */
> > -        .quad 0x3FF0861BF53E5226  /* A01 = +1.032741506559554434119e+00 */
> > -        .quad 0xBFBE0C4F9B461507  /* A02 = -1.173753503881763554650e-01 */
> > -        .quad 0xBFC6302A037CDE3A  /* A03 = -1.733448521642786954722e-01 */
> > -        .quad 0xBF6FFBDE2A6C2AF8  /* A00 = -3.904279630096648551207e-03 */
> > -        .quad 0x3FF099F2EB8E7DA3  /* A01 = +1.037585182326304034106e+00 */
> > -        .quad 0xBFC09C74D192DDF0  /* A02 = -1.297746680554463516444e-01 */
> > -        .quad 0xBFC4D571D8E3079F  /* A03 = -1.627638157861470424859e-01 */
> > -        .quad 0xBF72E8FDC0B952AA  /* A00 = -4.616728994353872309042e-03 */
> > -        .quad 0x3FF0AF7F273C9533  /* A01 = +1.042845872181101141152e+00 */
> > -        .quad 0xBFC244C512736F10  /* A02 = -1.427236881344176033792e-01 */
> > -        .quad 0xBFC379474F58B902  /* A03 = -1.521386277613104298645e-01 */
> > -        .quad 0xBF762EABAF17395B  /* A00 = -5.415602341101023557701e-03 */
> > -        .quad 0x3FF0C6C3886F63FB  /* A01 = +1.048526318502125631582e+00 */
> > -        .quad 0xBFC3FDF9918EA12A  /* A02 = -1.561881981590514389957e-01 */
> > -        .quad 0xBFC21CA89ECAB895  /* A03 = -1.414995932913753196036e-01 */
> > -        .quad 0xBF79D387CE5B2BAE  /* A00 = -6.305246822828998107258e-03 */
> > -        .quad 0x3FF0DFBFE2346376  /* A01 = +1.054626353847394337748e+00 */
> > -        .quad 0xBFC5C6DA43602620  /* A02 = -1.701309994680721970894e-01 */
> > -        .quad 0xBFC0C08BD8DB6631  /* A03 = -1.308760460731704100557e-01 */
> > -        .quad 0xBF7DDBA8E8DA9060  /* A00 = -7.289562037531366334164e-03 */
> > -        .quad 0x3FF0FA70F0D1B464  /* A01 = +1.061142864894713433443e+00 */
> > -        .quad 0xBFC79E18D92BAA7C  /* A02 = -1.845122394946264732241e-01 */
> > -        .quad 0xBFBECBBBF74C2669  /* A03 = -1.202962378266875381749e-01 */
> > -        .quad 0xBF81254E76EA25DA  /* A00 = -8.371937755572145950511e-03 */
> > -        .quad 0x3FF116D05835EBD0  /* A01 = +1.068069786618014660462e+00 */
> > -        .quad 0xBFC982539E2ED224  /* A02 = -1.992897531869327609755e-01 */
> > -        .quad 0xBFBC1B043C350159  /* A03 = -1.097872397413132278254e-01 */
> > -        .quad 0xBF8391ACBA863403  /* A00 = -9.555196230190082448686e-03 */
> > -        .quad 0x3FF134D4AA477FE2  /* A01 = +1.075398125794884141015e+00 */
> > -        .quad 0xBFCB7218609FEAFB  /* A02 = -2.144194099235717521079e-01 */
> > -        .quad 0xBFB970A16CB88329  /* A03 = -9.937485603633135211599e-02 */
> > -        .quad 0xBF87935088E48E8B  /* A00 = -1.151144902957603431692e-02 */
> > -        .quad 0x3FF1649892AD7DD3  /* A01 = +1.087059567413110938716e+00 */
> > -        .quad 0xBFCE6971DDE75409  /* A02 = -2.375929196847723912089e-01 */
> > -        .quad 0xBFB58291E88CB251  /* A03 = -8.402358939628952472223e-02 */
> > -        .quad 0xBF8DB3A62C325325  /* A00 = -1.450280973794233242702e-02 */
> > -        .quad 0x3FF1A9C900C6DEEA  /* A01 = +1.103951457056548068891e+00 */
> > -        .quad 0xBFD13DBC65B0E08E  /* A02 = -2.693930619311765140012e-01 */
> > -        .quad 0xBFB06696F62696D1  /* A03 = -6.406539449252625362252e-02 */
> > -        .quad 0xBF92583699F2E27A  /* A00 = -1.791463198307716858659e-02 */
> > -        .quad 0x3FF1F451B85AA9F0  /* A01 = +1.122148246892376022288e+00 */
> > -        .quad 0xBFD34FD5F8288180  /* A02 = -3.017477916164565954205e-01 */
> > -        .quad 0xBFA6FB692825B683  /* A03 = -4.488686194495718900788e-02 */
> > -        .quad 0xBF9641C26E673D6F  /* A00 = -2.173522757385398448959e-02 */
> > -        .quad 0x3FF24364DA5E2B07  /* A01 = +1.141453602790251542487e+00 */
> > -        .quad 0xBFD564A5A5EF5890  /* A02 = -3.342680092295120530821e-01 */
> > -        .quad 0xBF9B43712011A982  /* A03 = -2.662445791467283467968e-02 */
> > -        .quad 0xBF9A901038EC2F39  /* A00 = -2.594018313816024226548e-02 */
> > -        .quad 0x3FF2961356DFFEBA  /* A01 = +1.161639537196534011088e+00 */
> > -        .quad 0xBFD775EBB17198C7  /* A02 = -3.665723069046972759644e-01 */
> > -        .quad 0xBF833B1A926CD462  /* A03 = -9.390075295963199591975e-03 */
> > -        .quad 0xBF9F396A6A461B91  /* A00 = -3.049246095317987084727e-02 */
> > -        .quad 0x3FF2EB53BAEF534B  /* A01 = +1.182452898229899629357e+00 */
> > -        .quad 0xBFD97DABF8AD8BBD  /* A02 = -3.982953957076310058660e-01 */
> > -        .quad 0x3F7B8F6A3E0F8837  /* A03 = +6.728568086119371925713e-03 */
> > -        .quad 0xBFA21878590F8BAA  /* A00 = -3.534294211546946951064e-02 */
> > -        .quad 0x3FF34209790236E1  /* A01 = +1.203622315111197105253e+00 */
> > -        .quad 0xBFDB764C0E71BECB  /* A02 = -4.290952817018306997277e-01 */
> > -        .quad 0x3F962FE0C03F84C0  /* A03 = +2.166701482190513949888e-02 */
> > -        .quad 0xBFA4B36B9AD27ECC  /* A00 = -4.043136849327097492868e-02 */
> > -        .quad 0x3FF3990C5B12FC16  /* A01 = +1.224865298994477935679e+00 */
> > -        .quad 0xBFDD5AABB0D01390  /* A02 = -4.586590983092770912322e-01 */
> > -        .quad 0x3FA21DAF5CA162DB  /* A03 = +3.538272863142363083844e-02 */
> > -        .quad 0xBFA7645E4D7BF28B  /* A00 = -4.568762489177399105378e-02 */
> > -        .quad 0x3FF3EF2FD51C0D9F  /* A01 = +1.245895225962932562069e+00 */
> > -        .quad 0xBFDF26377E1B686E  /* A02 = -4.867075664057044503963e-01 */
> > -        .quad 0x3FA8803E756EE812  /* A03 = +4.785342391501513914509e-02 */
> > -        .quad 0xBFAA210925C64413  /* A00 = -5.103329263796054643398e-02 */
> > -        .quad 0x3FF44349F897D8E7  /* A01 = +1.266427966181760345066e+00 */
> > -        .quad 0xBFE06A7B02C6D8E2  /* A02 = -5.129981092675530707226e-01 */
> > -        .quad 0x3FAE3F194734F5D0  /* A03 = +5.907515520309980505687e-02 */
> > -        .quad 0xBFACDE48F8A19BBB  /* A00 = -5.638340029764018351832e-02 */
> > -        .quad 0x3FF49439D5466582  /* A01 = +1.286187966447272845727e+00 */
> > -        .quad 0xBFE131C7C1063DDC  /* A02 = -5.373266954429101183166e-01 */
> > -        .quad 0x3FB1ADEEC36AD805  /* A03 = +6.906025191241844940482e-02 */
> > -        .quad 0xBFAF905D8F585680  /* A00 = -6.164829611604449866036e-02 */
> > -        .quad 0x3FF4E0ED1FD27F99  /* A01 = +1.304913639360142818546e+00 */
> > -        .quad 0xBFE1E7A859DC1D3D  /* A02 = -5.595285182070380836095e-01 */
> > -        .quad 0x3FB3ED018E4642A1  /* A03 = +7.783517573831001679086e-02 */
> > -        .quad 0xBFB11595104160BA  /* A00 = -6.673556944713512906198e-02 */
> > -        .quad 0x3FF528650340490B  /* A01 = +1.322361958217302513319e+00 */
> > -        .quad 0xBFE28B14B40BC974  /* A02 = -5.794776455425521000109e-01 */
> > -        .quad 0x3FB5DF49F5BAF6D7  /* A03 = +8.543836831355676453281e-02 */
> > -        .quad 0xBFB2513A97344BA4  /* A00 = -7.155195418844911836587e-02 */
> > -        .quad 0x3FF569BA0DB5EE14  /* A01 = +1.338312200124055273420e+00 */
> > -        .quad 0xBFE31B53A8B67B20  /* A02 = -5.970857901737396389308e-01 */
> > -        .quad 0x3FB787F297BB0544  /* A03 = +9.191814617499455275507e-02 */
> > -        .quad 0xBFB37512E848FAFA  /* A00 = -7.600515528700305112331e-02 */
> > -        .quad 0x3FF5A41F33B403C8  /* A01 = +1.352568819013173495591e+00 */
> > -        .quad 0xBFE397F6EA9A58A5  /* A02 = -6.123003561103997904880e-01 */
> > -        .quad 0x3FB8EAA9FF25CA06  /* A03 = +9.733068923177520814782e-02 */
> > -        .quad 0xBFB47B3E603AFC5D  /* A00 = -8.000554894805263217439e-02 */
> > -        .quad 0x3FF5D6E3EDE40487  /* A01 = +1.364963464031718975988e+00 */
> > -        .quad 0xBFE400D5BCA6D631  /* A02 = -6.251019177058819709103e-01 */
> > -        .quad 0x3FBA0B830ED567FE  /* A03 = +1.017381583418739132707e-01 */
> > -        .quad 0xBFB5BBFE8AC90496  /* A00 = -8.489981544791400103200e-02 */
> > -        .quad 0x3FF612BA70107E95  /* A01 = +1.379572332145390989311e+00 */
> > -        .quad 0xBFE477EAF1FA7693  /* A02 = -6.396383978023599814478e-01 */
> > -        .quad 0x3FBB4784B7C08A95  /* A03 = +1.065600346196709652391e-01 */
> > -        .quad 0xBFB6D5D940743939  /* A00 = -8.920057128509463473254e-02 */
> > -        .quad 0x3FF644A8748F70CE  /* A01 = +1.391762214006166953340e+00 */
> > -        .quad 0xBFE4D646AB07EA37  /* A02 = -6.511567440459832267763e-01 */
> > -        .quad 0x3FBC354F4E1D5292  /* A03 = +1.101884427747086558913e-01 */
> > -        .quad 0xBFB7223D19E4F3D1  /* A00 = -9.036619074045339206069e-02 */
> > -        .quad 0x3FF6518FEB42B7FA  /* A01 = +1.394912642466350494175e+00 */
> > -        .quad 0xBFE4ED86CB87498C  /* A02 = -6.539949393430091184598e-01 */
> > -        .quad 0x3FBC6D29F28CCA9B  /* A03 = +1.110407082713131127205e-01 */
> > -        .quad 0xBFB6878652FF6312  /* A00 = -8.800544287022329936754e-02 */
> > -        .quad 0x3FF63948C302D040  /* A01 = +1.388985406648330922508e+00 */
> > -        .quad 0xBFE4C4E2E7904E17  /* A02 = -6.490339777687407218920e-01 */
> > -        .quad 0x3FBC127356CA1ABE  /* A03 = +1.096565329445224612481e-01 */
> > -        .quad 0xBFB4F5D18B0C91D6  /* A00 = -8.187589306596207427980e-02 */
> > -        .quad 0x3FF5FD27EB7DD0B8  /* A01 = +1.374305648697413673176e+00 */
> > -        .quad 0xBFE464E01A2B2FC6  /* A02 = -6.373138915164353601739e-01 */
> > -        .quad 0x3FBB460547674A30  /* A03 = +1.065371798825160976065e-01 */
> > -        .quad 0xBFB26642FA16A685  /* A00 = -7.187288861919156890412e-02 */
> > -        .quad 0x3FF59F9BEDE1C95A  /* A01 = +1.351467065073470141812e+00 */
> > -        .quad 0xBFE3D67920C8FBEA  /* A02 = -6.199308052381387046381e-01 */
> > -        .quad 0x3FBA24F6A8D3CBC1  /* A03 = +1.021265184570401413078e-01 */
> > -        .quad 0xBFADB5294794F097  /* A00 = -5.802277563859197656582e-02 */
> > -        .quad 0x3FF523EA7B9CF453  /* A01 = +1.321268542159732772845e+00 */
> > -        .quad 0xBFE322A8B55E35DB  /* A02 = -5.979808370918208160205e-01 */
> > -        .quad 0x3FB8C8673B1B3E37  /* A03 = +9.680791085269722928697e-02 */
> > -        .quad 0xBFA4B7D661965C6A  /* A00 = -4.046506825687219699450e-02 */
> > -        .quad 0x3FF48DE3E2CE3122  /* A01 = +1.284641157110919085227e+00 */
> > -        .quad 0xBFE251FED1A7F445  /* A02 = -5.725092024655472622285e-01 */
> > -        .quad 0x3FB745699FCABDB9  /* A03 = +9.090290213747821701507e-02 */
> > -        .quad 0xBF93E60456E4EE1D  /* A00 = -1.943213253365004902773e-02 */
> > -        .quad 0x3FF3E1A14E628A59  /* A01 = +1.242585474196536532432e+00 */
> > -        .quad 0xBFE16C5AB660E876  /* A02 = -5.444768488007543094653e-01 */
> > -        .quad 0x3FB5AD33AA8C188F  /* A03 = +8.467410005332197397987e-02 */
> > -        .quad 0x3F738C17C47C7961  /* A00 = +4.772274820224659853951e-03 */
> > -        .quad 0x3FF3234DDE3BD146  /* A01 = +1.196119182682268355933e+00 */
> > -        .quad 0xBFE078C0D77A9D3B  /* A02 = -5.147403915952176722826e-01 */
> > -        .quad 0x3FB40D74B3E276B8  /* A03 = +7.833032027925923568290e-02 */
> > -        .quad 0x3FA0474BECC689C7  /* A00 = +3.179394975019849550746e-02 */
> > -        .quad 0x3FF256FB4FA7D18A  /* A01 = +1.146235762743432307076e+00 */
> > -        .quad 0xBFDEFA8E3FB285E2  /* A02 = -4.840427038235174395098e-01 */
> > -        .quad 0x3FB270C007493D59  /* A03 = +7.203293016322244446403e-02 */
> > -        .quad 0x3FAF5BD51E479BDC  /* A00 = +6.124750132203590768931e-02 */
> > -        .quad 0x3FF18081D0B53BC5  /* A01 = +1.093873801484492647162e+00 */
> > -        .quad 0xBFDCFE2439BD0C03  /* A02 = -4.530115665294831006626e-01 */
> > -        .quad 0x3FB0DEFE5A45AFDD  /* A03 = +6.590261176978580437424e-02 */
> > -        .quad 0x3FB7BD5D2806EA26  /* A00 = +9.273321368429118805032e-02 */
> > -        .quad 0x3FF0A369E35B4440  /* A01 = +1.039895904647224256223e+00 */
> > -        .quad 0xBFDB04BC5C9951E7  /* A02 = -4.221640495573226181669e-01 */
> > -        .quad 0x3FAEBBBAA9D6DEEF  /* A03 = +6.002600978120919278380e-02 */
> > -        .quad 0x3FC01BE411098DBC  /* A00 = +1.258511622610124502941e-01 */
> > -        .quad 0x3FEF85BDABC031C1  /* A01 = +9.850757936961188621083e-01 */
> > -        .quad 0xBFD91521375097C2  /* A02 = -3.919146576102968682065e-01 */
> > -        .quad 0x3FABE26F0086D982  /* A03 = +5.446192628317005068883e-02 */
> > -        .quad 0x3FC481D7FF5776B9  /* A00 = +1.602125164781023347604e-01 */
> > -        .quad 0x3FEDC3506C1E7218  /* A01 = +9.300920592973538347792e-01 */
> > -        .quad 0xBFD7349A88DA7D4F  /* A02 = -3.625856720409119104964e-01 */
> > -        .quad 0x3FA936E2DFF8E2AE  /* A03 = +4.924687370334389358018e-02 */
> > -        .quad 0x3FC90471F96FA27A  /* A00 = +1.954481571149420671141e-01 */
> > -        .quad 0x3FEC0451601987A2  /* A01 = +8.755270840595026360376e-01 */
> > -        .quad 0xBFD5671CD4B898DC  /* A02 = -3.344184949259110251063e-01 */
> > -        .quad 0x3FA6BB9594603B67  /* A03 = +4.439990459660841243261e-02 */
> > -        .quad 0x3FCFD8ADB9ED944C  /* A00 = +2.488000066615846384011e-01 */
> > -        .quad 0x3FE978C073F6809A  /* A01 = +7.959902062321078108909e-01 */
> > -        .quad 0xBFD2DF7E00BCD5A9  /* A02 = -2.948908812716931060471e-01 */
> > -        .quad 0x3FA3614033D490B2  /* A03 = +3.785133965200894456959e-02 */
> > -        .quad 0x3FD4846A12AFE5A0  /* A00 = +3.205819303981005674586e-01 */
> > -        .quad 0x3FE63A1147D40472  /* A01 = +6.945883181471244061100e-01 */
> > -        .quad 0xBFCFA2268AD34450  /* A02 = -2.471359422548027318101e-01 */
> > -        .quad 0x3F9F150201D9FFE0  /* A03 = +3.035357605267552383310e-02 */
> > -        .quad 0x3FD9018641F82BEB  /* A00 = +3.907180446846598154131e-01 */
> > -        .quad 0x3FE33B7C220FFBDC  /* A01 = +6.010113396913498995389e-01 */
> > -        .quad 0xBFCA4E4187E29C86  /* A02 = -2.055131829740483584423e-01 */
> > -        .quad 0x3F98C30CED19F8F4  /* A03 = +2.418155858185229434287e-02 */
> > -        .quad 0x3FDD4B8255BEB078  /* A00 = +4.577337109901757905561e-01 */
> > -        .quad 0x3FE0858B19D3A49B  /* A01 = +5.163016800335243905451e-01 */
> > -        .quad 0xBFC5BC929EACE564  /* A02 = -1.698172831327539045176e-01 */
> > -        .quad 0x3F93A083CE57DE2B  /* A03 = +1.916700312537337677621e-02 */
> > -        .quad 0x3FE0A8E5E039295C  /* A00 = +5.206174258576470315063e-01 */
> > -        .quad 0x3FDC35E1234583FE  /* A01 = +4.407885403107342225937e-01 */
> > -        .quad 0xBFC1DE034E31AEB9  /* A02 = -1.395877963835710222629e-01 */
> > -        .quad 0x3F8EFDEBB3471BDC  /* A03 = +1.513275280821162888101e-02 */
> > -        .quad 0x3FE2851B603CB2A5  /* A00 = +5.787484054213406503564e-01 */
> > -        .quad 0x3FD7F4A44ABBB286  /* A01 = +3.743067483726821853551e-01 */
> > -        .quad 0xBFBD3EEB67087DE7  /* A02 = -1.142413260026767657385e-01 */
> > -        .quad 0x3F8864F38329E8BD  /* A03 = +1.191129917173260922836e-02 */
> > -        .quad 0x3FE437DBE3C34AC1  /* A00 = +6.318187187665317283702e-01 */
> > -        .quad 0x3FD43F6F789441B5  /* A01 = +3.163717916040938438194e-01 */
> > -        .quad 0xBFB7D92E7901B9A4  /* A02 = -9.315767721429907277653e-02 */
> > -        .quad 0x3F8327ED342308E1  /* A03 = +9.353497651663324544136e-03 */
> > -        .quad 0x3FE5C0977766D55C  /* A00 = +6.797597248138731451661e-01 */
> > -        .quad 0x3FD10B42A764D8F9  /* A01 = +2.663122782427219115142e-01 */
> > -        .quad 0xBFB3633351D3D70F  /* A02 = -7.573242900602060456716e-02 */
> > -        .quad 0x3F7E079E30FF899C  /* A03 = +7.331483779099558922843e-03 */
> > -        .quad 0x3FE7202CE08A88C4  /* A00 = +7.226776490754436288455e-01 */
> > -        .quad 0x3FCC973EB5662B01  /* A01 = +2.233656297433626314319e-01 */
> > -        .quad 0xBFAF70A455F9920B  /* A02 = -6.140626477716545211782e-02 */
> > -        .quad 0x3F77812411CE99B6  /* A03 = +5.738392731393584730859e-03 */
> > -        .quad 0x3FE85879424095B1  /* A00 = +7.608000082006382003286e-01 */
> > -        .quad 0x3FC7E73BD1674D84  /* A01 = +1.867441914060742336190e-01 */
> > -        .quad 0xBFA96F84E4BF333B  /* A02 = -4.967894832916504993525e-02 */
> > -        .quad 0x3F72606DDCA6E117  /* A03 = +4.486493251924870105662e-03 */
> > -        .quad 0x3FE96BFE4957F4DD  /* A00 = +7.944327766887472330737e-01 */
> > -        .quad 0x3FC3ED4780D25478  /* A01 = +1.556786898624158421711e-01 */
> > -        .quad 0xBFA489C5F9A56B58  /* A02 = -4.011362717093075458408e-02 */
> > -        .quad 0x3F6CB5DC17E9AD2A  /* A03 = +3.504686231556104931972e-03 */
> > -        .quad 0x3FEA5D9CB2F41234  /* A00 = +8.239272589858672724006e-01 */
> > -        .quad 0x3FC091A758374DCF  /* A01 = +1.294449978582705440555e-01 */
> > -        .quad 0xBFA08E436D4B5CE0  /* A02 = -3.233538350257858517978e-02 */
> > -        .quad 0x3F666997AD53E6B7  /* A03 = +2.735897297154145629133e-03 */
> > -        .quad 0x3FEB3060342CB850  /* A00 = +8.496552485501158713532e-01 */
> > -        .quad 0x3FBB7D30BBC7DC1B  /* A01 = +1.073790033768634993860e-01 */
> > -        .quad 0xBF9AA6BA3443D9E3  /* A02 = -2.602663940430173170060e-02 */
> > -        .quad 0x3F617CA764B7850B  /* A03 = +2.134634914668814050648e-03 */
> > -        .quad 0x3FEBE759A6A0C7B8  /* A00 = +8.719909910635044170135e-01 */
> > -        .quad 0x3FB6C10DE6A703FF  /* A01 = +8.888327485239243264115e-02 */
> > -        .quad 0xBF956C566D8BE1F6  /* A02 = -2.092108768099084498138e-02 */
> > -        .quad 0x3F5B46D1A4A59CF8  /* A03 = +1.664833764687232917079e-03 */
> > -        .quad 0x3FEC858494887A04  /* A00 = +8.912985707318630268503e-01 */
> > -        .quad 0x3FB2CC31F543394D  /* A01 = +7.342827070099140762682e-02 */
> > -        .quad 0xBF9133477FF69137  /* A02 = -1.679717749142747504343e-02 */
> > -        .quad 0x3F5544482FBB4DA5  /* A03 = +1.298017973501022466823e-03 */
> > -        .quad 0x3FED0DB59D0E32E9  /* A00 = +9.079235141267335551518e-01 */
> > -        .quad 0x3FAF006BAFFC6EF4  /* A01 = +6.055008433597022787787e-02 */
> > -        .quad 0xBF8B97146FA2B97A  /* A02 = -1.347175565419144252499e-02 */
> > -        .quad 0x3F5093B01F4CDC69  /* A03 = +1.011774057770665211434e-03 */
> > -        .quad 0x3FEDB487C3EC457C  /* A00 = +9.282873942012623835751e-01 */
> > -        .quad 0x3FA7390C09D0BD1D  /* A01 = +4.535710925881118044112e-02 */
> > -        .quad 0xBF83D9F7C3181106  /* A02 = -9.693084374710735778846e-03 */
> > -        .quad 0x3F46E34A0A3C0E64  /* A03 = +6.984817050299072134500e-04 */
> > -        .quad 0x3FEE5FFCB4E6EB00  /* A00 = +9.492171796076434020506e-01 */
> > -        .quad 0x3F9F4913ED00AADF  /* A01 = +3.055220731782070861526e-02 */
> > -        .quad 0xBF79670BD0E59B5C  /* A02 = -6.201788097633133961528e-03 */
> > -        .quad 0x3F3BC998EBCAF96D  /* A03 = +4.240034429975534616304e-04 */
> > -        .quad 0x3FEEDBA41E9542FE  /* A00 = +9.643116566968215064293e-01 */
> > -        .quad 0x3F94F5DD18D9C24D  /* A01 = +2.046914543319848858727e-02 */
> > -        .quad 0xBF7034896AA122B9  /* A02 = -3.956352980886528904192e-03 */
> > -        .quad 0x3F30DCCB47810B39  /* A03 = +2.573009765038273091199e-04 */
> > -        .quad 0x3FEF33F2882520ED  /* A00 = +9.750912341196716903724e-01 */
> > -        .quad 0x3F8BF37F2CF553FF  /* A01 = +1.364802699996836392315e-02 */
> > -        .quad 0xBF649F6F05A69619  /* A02 = -2.517430152880317534986e-03 */
> > -        .quad 0x3F247623C950AAC9  /* A03 = +1.561087307505231250044e-04 */
> > -        .quad 0x3FEF727757751741  /* A00 = +9.827229221489021115943e-01 */
> > -        .quad 0x3F828E67912C4400  /* A01 = +9.060677640748693306705e-03 */
> > -        .quad 0xBF5A2F51A806CC2C  /* A02 = -1.598195784123355826789e-03 */
> > -        .quad 0x3F18D35D7687E613  /* A03 = +9.470231965016282719549e-05 */
> > -        .quad 0x3FEF9E6325C5942A  /* A00 = +9.880843866091073568469e-01 */
> > -        .quad 0x3F788AB117618F76  /* A01 = +5.991641772286606867914e-03 */
> > -        .quad 0xBF5096EAB0B1EA89  /* A02 = -1.012543859160305046233e-03 */
> > -        .quad 0x3F0E1E50EC4435AB  /* A03 = +5.744633156910412119652e-05 */
> > -        .quad 0x3FEFBD0784049369  /* A00 = +9.918248728250605994461e-01 */
> > -        .quad 0x3F702BBD8294035F  /* A01 = +3.947963975634432264028e-03 */
> > -        .quad 0xBF44FB55E0F00593  /* A02 = -6.403130845457509273330e-04 */
> > -        .quad 0x3F0244DCD723230A  /* A03 = +3.484534217219031730379e-05 */
> > -        .quad 0x3FEFD245E2366A43  /* A00 = +9.944180887426415926811e-01 */
> > -        .quad 0x3F653D82EC088433  /* A01 = +2.592807490387838333795e-03 */
> > -        .quad 0xBF3A7DF75E013CB8  /* A02 = -4.042366908878036561859e-04 */
> > -        .quad 0x3EF6298E69F991CD  /* A03 = +2.113564425911141559972e-05 */
> > -        .quad 0x3FEFE0EAA508BC69  /* A00 = +9.962056372950317539861e-01 */
> > -        .quad 0x3F5BD0771AF3FDDA  /* A01 = +1.697651208644282514598e-03 */
> > -        .quad 0xBF30B2E1254DE571  /* A02 = -2.548026725928887099328e-04 */
> > -        .quad 0x3EEAE28B70EC0256  /* A03 = +1.281973848454955042307e-05 */
> > -        .quad 0x3FEFEAF5303D7F96  /* A00 = +9.974313680831865536192e-01 */
> > -        .quad 0x3F5229111365657E  /* A01 = +1.108423877289460134782e-03 */
> > -        .quad 0xBF250572D04DFE66  /* A02 = -1.603796628408704519168e-04 */
> > -        .quad 0x3EE04E89BB57C981  /* A03 = +7.775682983689149966743e-06 */
> > -        .quad 0x3FEFF1CF52F1CF44  /* A00 = +9.982678051005469122003e-01 */
> > -        .quad 0x3F47A71316147CEB  /* A01 = +7.218211359577819110842e-04 */
> > -        .quad 0xBF1A6D7604055719  /* A02 = -1.008132248946049582547e-04 */
> > -        .quad 0x3ED3C8047586A85C  /* A03 = +4.716233739913014633626e-06 */
> > -        .quad 0x3FEFF6770369EF69  /* A00 = +9.988360468555416149528e-01 */
> > -        .quad 0x3F3EBB261180FBF0  /* A01 = +4.689186039321105101130e-04 */
> > -        .quad 0xBF1097754FE19D7F  /* A02 = -6.329206004950480057066e-05 */
> > -        .quad 0x3EC7FEFF83BCA0A7  /* A03 = +2.860556404988488738366e-06 */
> > -        .quad 0x3FEFF99D42371AC4  /* A00 = +9.992204945818561334647e-01 */
> > -        .quad 0x3F33EB2AEC271F59  /* A01 = +3.039340773764907474054e-04 */
> > -        .quad 0xBF04CF18E0FC0D79  /* A02 = -3.968996690952969588805e-05 */
> > -        .quad 0x3EBD1BDBD6019BE9  /* A03 = +1.735021065507727833886e-06 */
> > -        .quad 0x3FEFFBBCA32B0D91  /* A00 = +9.994795977476532700123e-01 */
> > -        .quad 0x3F29C41E1615110A  /* A01 = +1.965796209707565346710e-04 */
> > -        .quad 0xBEFA11F93D9DCB5A  /* A02 = -2.486248909101414873235e-05 */
> > -        .quad 0x3EB1A7CA4546F7A7  /* A03 = +1.052345642723709228769e-06 */
> > -        .quad 0x3FEFFD298B8E8DE2  /* A00 = +9.996535993308806045121e-01 */
> > -        .quad 0x3F20A1C42D523C5B  /* A01 = +1.268913244172078754520e-04 */
> > -        .quad 0xBEF0507A364AFAE4  /* A02 = -1.555859070622834605755e-05 */
> > -        .quad 0x3EA56ACA17E7CDF4  /* A03 = +6.382806956848098872313e-07 */
> > -        .quad 0x3FEFFE1DC82BA5A3  /* A00 = +9.997700604991915929176e-01 */
> > -        .quad 0x3F156E73B90F1769  /* A01 = +8.175450626798714452801e-05 */
> > -        .quad 0xBEE4663579D0A09F  /* A02 = -9.727122057226747625365e-06 */
> > -        .quad 0x3E99FAF6FEC5D4C1  /* A03 = +3.871371052824002996020e-07 */
> > -        .quad 0x3FEFFEF8D0BB5E81  /* A00 = +9.998745037837154514548e-01 */
> > -        .quad 0x3F06686DA18D39C3  /* A01 = +4.273972098777251447726e-05 */
> > -        .quad 0xBED46BC298073E90  /* A02 = -4.868731025855742842491e-06 */
> > -        .quad 0x3E88E42286B9D0FD  /* A03 = +1.854535328530838170114e-07 */
> > -        .quad 0x3FEFFF8DBC68DDC7  /* A00 = +9.999455146670975791423e-01 */
> > -        .quad 0x3EF26B2953A80AF0  /* A01 = +1.756534514108903368909e-05 */
> > -        .quad 0xBEBFC4472D580F83  /* A02 = -1.893443529411295465239e-06 */
> > -        .quad 0x3E72505B4553D19F  /* A03 = +6.822456673547912277047e-08 */
> > -        .quad 0x3FEFFFCED1276609  /* A00 = +9.999765477215883935358e-01 */
> > -        .quad 0x3EDE1A94C7CC58F5  /* A01 = +7.177313020153979672606e-06 */
> > -        .quad 0xBEA8A2C988744E57  /* A02 = -7.342066660497443762363e-07 */
> > -        .quad 0x3E5AF30036BBBAF4  /* A03 = +2.509841882843541084885e-08 */
> > -        .quad 0x3FEFFFEAFE70FCFC  /* A00 = +9.999899835164849370983e-01 */
> > -        .quad 0x3EC879175E3549F5  /* A01 = +2.917410471128503564412e-06 */
> > -        .quad 0xBE930E36677D1813  /* A02 = -2.839493400307523115929e-07 */
> > -        .quad 0x3E43D4005B42D48F  /* A03 = +9.233192745401904898013e-09 */
> > -        .quad 0x3ff0000000000000
> > -        .quad 0x0000000000000000
> > -        .quad 0x0000000000000000
> > -        .quad 0x0000000000000000
> > -        .align 16
> > -        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000           /* _sSignMask        */
> > -        .align 16
> > -        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff           /* _sAbsMask         */
> > -        .align 16
> > -        .long 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000           /* _iExpMantMask     */
> > -        .align 16
> > -        .long 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000           /* _iExpMask         */
> > -        .align 16
> > -        .long 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000           /* _iMinIdxOfsMask   */
> > -        .align 16
> > -        .long 0x04280000, 0x04280000, 0x04280000, 0x04280000           /* _iMaxIdxMask      */
> > -        .align 16
> > -        .type  __svml_stanh_data_internal,@object
> > -        .size  __svml_stanh_data_internal,.-__svml_stanh_data_internal
> > +       movups  (%rsi, %rax), %xmm2
> > +       movups  (%rdi, %rax), %xmm7
> > +
> > +       movaps  %xmm2, %xmm3
> > +
> > +       unpckhpd %xmm7, %xmm2
> > +       movlhps %xmm7, %xmm3
> > +
> > +       addpd   %xmm13, %xmm2
> > +
> > +       mulpd   %xmm5, %xmm6
> > +       addpd   %xmm4, %xmm6
> > +
> > +       mulpd   %xmm2, %xmm0
> > +       addpd   %xmm3, %xmm0
> > +
> > +       cvtpd2ps %xmm0, %xmm2
> > +       cvtpd2ps %xmm6, %xmm0
> > +
> > +       movlhps %xmm2, %xmm0
> > +       andnps  %xmm12, %xmm1
> > +       orps    %xmm1, %xmm0
> > +
> > +       movmskps %xmm8, %edx
> > +       testl   %edx, %edx
> > +
> > +       /* Go to special inputs processing branch.  */
> > +       jne     L(SPECIAL_VALUES_BRANCH)
> > +
> > +       /* No stack restoration on the fastpath.  */
> > +       ret
> > +
> > +L(SPECIAL_VALUES_BRANCH):
> > +       subq    $48, %rsp
> > +
> > +       movups  %xmm0, (%rsp)
> > +       movups  %xmm12, 16(%rsp)
> > +
> > +       movq    %r12, 32(%rsp)
> > +       movq    %r13, 40(%rsp)
> > +
> > +       /* edx has 1s where there was a special value that needs to be handled
> > +          by a tanhf call.  */
> > +       movl    %edx, %r13d
> > +L(SPECIAL_VALUES_LOOP):
> > +       /* use r12 as index for special value that is saved across calls to
> > +          tanhf. We technically don't need a callee save register here as offset
> > +          to rsp is always [0, 12] so we can restore rsp by realigning to 64.
> > +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> > +          in the loop.  */
> > +       xorl    %r12d, %r12d
> > +       bsfl    %r13d, %r12d
> > +
> > +       /* Scalar math function call to process special input.  */
> > +       movss   16(%rsp, %r12, 4), %xmm0
> > +       call    tanhf@PLT
> > +       /* No good way to avoid the store-forwarding fault this will cause on
> > +          return. `lfence` avoids the SF fault but at greater cost as it
> > +          serializes stack/callee save restoration.  */
> > +       movss   %xmm0, (%rsp, %r12, 4)
> > +
> > +       leal    -1(%r13), %eax
> > +       andl    %eax, %r13d
> > +       jnz     L(SPECIAL_VALUES_LOOP)
> > +
> > +       /* All results have been written to (%rsp).  */
> > +       movups  (%rsp), %xmm0
> > +       movq    32(%rsp), %r12
> > +       movq    40(%rsp), %r13
> > +       addq    $48, %rsp
> > +       ret
> > +END(_ZGVbN4v_tanhf_sse4)
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
> > index 3745db5aa4..90c3ea4cc6 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
> > @@ -70,775 +70,171 @@
> >   *
> >   */
> >
> > -/* Offsets for data table __svml_stanh_data_internal
> > - */
> > -#define _dbP                           0
> > -#define _sSignMask                     4288
> > -#define _sAbsMask                      4320
> > -#define _iExpMantMask                  4352
> > -#define _iExpMask                      4384
> > -#define _iMinIdxOfsMask                4416
> > -#define _iMaxIdxMask                   4448
> > -
> >  #include <sysdep.h>
> > +#include "svml_s_tanhf_rodata.S"
> >
> >          .text
> >         .section .text.avx2,"ax",@progbits
> >  ENTRY(_ZGVdN8v_tanhf_avx2)
> > -        pushq     %rbp
> > -        cfi_def_cfa_offset(16)
> > -        movq      %rsp, %rbp
> > -        cfi_def_cfa(6, 16)
> > -        cfi_offset(6, -16)
> > -        andq      $-32, %rsp
> > -        pushq     %r12
> > -        subq      $120, %rsp
> > -        lea       _dbP+16+__svml_stanh_data_internal(%rip), %r10
> > -        vmovaps   %ymm0, %ymm12
> > -
> > -/* Here huge arguments, INF and NaNs are filtered out to callout. */
> > -        vpand     _iExpMantMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm14
> > +       /* Here huge arguments, INF and NaNs are filtered out to callout.  */
> > +       vpand   TANHF_DATA(_iExpMantMask)(%rip), %ymm0, %ymm4
> > +       vpsubd  TANHF_DATA(_iMinIdxOfsMask)(%rip), %ymm4, %ymm2
> >
> > -/*
> > - *  small table specific variables *
> > - *  Constant loading
> > - */
> > -        vmovups   _iMaxIdxMask+__svml_stanh_data_internal(%rip), %ymm8
> > -        vpsubd    _iMinIdxOfsMask+__svml_stanh_data_internal(%rip), %ymm14, %ymm9
> > -
> > -/* if VMIN, VMAX is defined for I type */
> > -        vxorps    %ymm15, %ymm15, %ymm15
> > -        vpcmpgtd  %ymm15, %ymm9, %ymm0
> > -        vpand     %ymm0, %ymm9, %ymm7
> > -        vpcmpgtd  %ymm8, %ymm9, %ymm6
> > -        vblendvps %ymm6, %ymm8, %ymm7, %ymm3
> > -        vpsrld    $14, %ymm3, %ymm1
> > -        vpcmpgtd  _iExpMask+__svml_stanh_data_internal(%rip), %ymm14, %ymm13
> > -        vmovmskps %ymm13, %r11d
> > -        vandps    _sAbsMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm10
> > -        vandps    _sSignMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm11
> > -        vextractf128 $1, %ymm1, %xmm2
> > -        vmovd     %xmm1, %r9d
> > -        vmovd     %xmm2, %ecx
> > -        vpextrd   $1, %xmm2, %edx
> > -        vpextrd   $1, %xmm1, %r8d
> > -        movslq    %r9d, %r9
> > -        movslq    %edx, %rdx
> > -        movslq    %r8d, %r8
> > -        vpextrd   $2, %xmm1, %edi
> > -        movslq    %ecx, %rcx
> > -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus)  */
> > -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
> > -        vpextrd   $3, %xmm2, %r12d
> > -        vpextrd   $3, %xmm1, %esi
> > -        vpextrd   $2, %xmm2, %eax
> > -        movslq    %edi, %rdi
> > -        movslq    %r12d, %r12
> > -        movslq    %esi, %rsi
> > -        movslq    %eax, %rax
> > -        vmovupd   -16(%r9,%r10), %xmm5
> > -        vmovupd   -16(%rdx,%r10), %xmm14
> > -        vmovupd   -16(%rcx,%r10), %xmm13
> > -        vmovupd   (%r9,%r10), %xmm1
> > -        vmovupd   (%r8,%r10), %xmm2
> > -        vmovupd   -16(%r8,%r10), %xmm4
> > -        vinsertf128 $1, -16(%rdi,%r10), %ymm5, %ymm15
> > -        vinsertf128 $1, -16(%r12,%r10), %ymm14, %ymm3
> > -        vinsertf128 $1, -16(%rax,%r10), %ymm13, %ymm6
> > -        vinsertf128 $1, (%rdi,%r10), %ymm1, %ymm5
> > -        vinsertf128 $1, (%rsi,%r10), %ymm2, %ymm14
> > -        vunpcklpd %ymm3, %ymm6, %ymm8
> > -        vunpckhpd %ymm3, %ymm6, %ymm6
> > -        vunpcklpd %ymm14, %ymm5, %ymm3
> > -        vunpckhpd %ymm14, %ymm5, %ymm2
> > -        vmovupd   (%rcx,%r10), %xmm13
> > -        vcvtps2pd %xmm10, %ymm5
> > -        vextractf128 $1, %ymm10, %xmm10
> > -        vfmadd213pd %ymm3, %ymm5, %ymm2
> > -        vinsertf128 $1, -16(%rsi,%r10), %ymm4, %ymm0
> > -        vmovupd   (%rdx,%r10), %xmm4
> > -        vunpcklpd %ymm0, %ymm15, %ymm9
> > -        vunpckhpd %ymm0, %ymm15, %ymm7
> > -        vfmadd213pd %ymm7, %ymm5, %ymm2
> > -        vfmadd213pd %ymm9, %ymm5, %ymm2
> > -        vinsertf128 $1, (%r12,%r10), %ymm4, %ymm0
> > -        vcvtps2pd %xmm10, %ymm4
> > -        vinsertf128 $1, (%rax,%r10), %ymm13, %ymm15
> > -        vunpcklpd %ymm0, %ymm15, %ymm1
> > -        vunpckhpd %ymm0, %ymm15, %ymm0
> > -        vfmadd213pd %ymm1, %ymm4, %ymm0
> > -        vcvtpd2ps %ymm2, %xmm1
> > -        vfmadd213pd %ymm6, %ymm4, %ymm0
> > -        vfmadd213pd %ymm8, %ymm4, %ymm0
> > -        vcvtpd2ps %ymm0, %xmm0
> > -        vinsertf128 $1, %xmm0, %ymm1, %ymm2
> > -        vorps     %ymm11, %ymm2, %ymm0
> > -        testl     %r11d, %r11d
> > -
> > -/* Go to special inputs processing branch */
> > -        jne       L(SPECIAL_VALUES_BRANCH)
> > -                                # LOE rbx r13 r14 r15 r11d ymm0 ymm12
> > -
> > -/* Restore registers
> > - * and exit the function
> > - */
> > +       /* Selection of arguments between [0, 0x04280000] into ymm2.  */
> > +       vpxor   %ymm3, %ymm3, %ymm3
> > +       vpmaxsd %ymm3, %ymm2, %ymm2
> > +       vpminsd TANHF_DATA(_iMaxIdxMask)(%rip), %ymm2, %ymm2
> >
> > -L(EXIT):
> > -        addq      $120, %rsp
> > -        cfi_restore(12)
> > -        popq      %r12
> > -        movq      %rbp, %rsp
> > -        popq      %rbp
> > -        cfi_def_cfa(7, 8)
> > -        cfi_restore(6)
> > -        ret
> > -        cfi_def_cfa(6, 16)
> > -        cfi_offset(6, -16)
> > -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus)  */
> > -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
> > -
> > -/* Branch to process
> > - * special inputs
> > - */
> > +       vpsrld  $14, %ymm2, %ymm1
> >
> > -L(SPECIAL_VALUES_BRANCH):
> > -        vmovups   %ymm12, 32(%rsp)
> > -        vmovups   %ymm0, 64(%rsp)
> > -                                # LOE rbx r13 r14 r15 r11d ymm0
> > -
> > -        xorl      %r12d, %r12d
> > -                                # LOE rbx r13 r14 r15 r11d r12d
> > -
> > -        vzeroupper
> > -        movq      %r13, 8(%rsp)
> > -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
> > -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
> > -        movl      %r11d, %r13d
> > -        movq      %r14, (%rsp)
> > -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
> > -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
> > -                                # LOE rbx r15 r12d r13d
> > -
> > -/* Range mask
> > - * bits check
> > - */
> > +       /* Store special cases in ymm15.  */
> > +       vpcmpgtd TANHF_DATA(_iExpMask)(%rip), %ymm4, %ymm15
> >
> > -L(RANGEMASK_CHECK):
> > -        btl       %r12d, %r13d
> >
> > -/* Call scalar math function */
> > -        jc        L(SCALAR_MATH_CALL)
> > -                                # LOE rbx r15 r12d r13d
> > +       /* Store base of lookup table in rax.  */
> > +       leaq    TANHF_DATA(_lookupTable)(%rip), %rax
> >
> > -/* Special inputs
> > - * processing loop
> > - */
> > +       /* We are splitting ymm1 into 8 GPRs. This may be faster to do with
> > +          store/load as we can take advantage of store-forwarding.  */
> > +       vmovq   %xmm1, %r8
> > +       /* We have eliminated all negative values for ymm1 so no need to sign
> > +          extend.  */
> > +       movl    %r8d, %r9d
> > +       shrq    $32, %r8
> >
> > -L(SPECIAL_VALUES_LOOP):
> > -        incl      %r12d
> > -        cmpl      $8, %r12d
> > -
> > -/* Check bits in range mask */
> > -        jl        L(RANGEMASK_CHECK)
> > -                                # LOE rbx r15 r12d r13d
> > -
> > -        movq      8(%rsp), %r13
> > -        cfi_restore(13)
> > -        movq      (%rsp), %r14
> > -        cfi_restore(14)
> > -        vmovups   64(%rsp), %ymm0
> > -
> > -/* Go to exit */
> > -        jmp       L(EXIT)
> > -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
> > -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
> > -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
> > -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
> > -                                # LOE rbx r13 r14 r15 ymm0
> > -
> > -/* Scalar math fucntion call
> > - * to process special input
> > - */
> > +       /* Instead of using cross-lane permutes on ymm vectors, use vinsertf128
> > +          with a memory operand. This helps alleviate bottleneck on p5.  */
> > +       vmovdqu 16(%r9, %rax), %xmm5
> >
> > -L(SCALAR_MATH_CALL):
> > -        movl      %r12d, %r14d
> > -        movss     32(%rsp,%r14,4), %xmm0
> > -        call      tanhf@PLT
> > -                                # LOE rbx r14 r15 r12d r13d xmm0
> > +       vpextrq $1, %xmm1, %rsi
> > +       movl    %esi, %edi
> > +       shrq    $32, %rsi
> >
> > -        movss     %xmm0, 64(%rsp,%r14,4)
> > +       vinsertf128 $1, 16(%rdi, %rax), %ymm5, %ymm5
> >
> > -/* Process special inputs in loop */
> > -        jmp       L(SPECIAL_VALUES_LOOP)
> > -                                # LOE rbx r15 r12d r13d
> > -END(_ZGVdN8v_tanhf_avx2)
> > +       vextracti128 $1, %ymm1, %xmm2
> > +       vmovq   %xmm2, %rdx
> > +       movl    %edx, %ecx
> > +       shrq    $32, %rdx
> > +
> > +       vmovdqu (%rcx, %rax), %xmm6
> > +
> > +       vpextrq $1, %xmm2, %r10
> > +       movl    %r10d, %r11d
> > +       shrq    $32, %r10
> > +
> > +       vinsertf128 $1, (%r11, %rax), %ymm6, %ymm6
> > +
> > +       vmovupd 16(%r8, %rax), %xmm1
> > +       vinsertf128 $1, 16(%rsi, %rax), %ymm1, %ymm1
> > +       vmovupd (%rdx, %rax), %xmm3
> > +       vinsertf128 $1, (%r10, %rax), %ymm3, %ymm3
> > +
> > +       vunpcklpd %ymm3, %ymm6, %ymm7
> > +       vunpckhpd %ymm3, %ymm6, %ymm6
> > +
> > +       vunpcklpd %ymm1, %ymm5, %ymm3
> > +       vunpckhpd %ymm1, %ymm5, %ymm1
> > +
> > +       vmovaps TANHF_DATA(_sAbsMask)(%rip), %ymm11
> > +       vandps  %ymm11, %ymm0, %ymm4
> >
> > -        .section .rodata, "a"
> > -        .align 32
> > -
> > -#ifdef __svml_stanh_data_internal_typedef
> > -typedef unsigned int VUINT32;
> > -typedef struct
> > -{
> > -        __declspec(align(32)) VUINT32 _dbP[(134*4)][2];
> > -        __declspec(align(32)) VUINT32 _sSignMask[8][1];
> > -        __declspec(align(32)) VUINT32 _sAbsMask[8][1];
> > -        __declspec(align(32)) VUINT32 _iExpMantMask[8][1];
> > -        __declspec(align(32)) VUINT32 _iExpMask[8][1];
> > -        __declspec(align(32)) VUINT32 _iMinIdxOfsMask[8][1];
> > -        __declspec(align(32)) VUINT32 _iMaxIdxMask[8][1];
> > -} __svml_stanh_data_internal;
> > -#endif
> > -__svml_stanh_data_internal:
> > -        /* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500]. */
> > -        .quad 0x0000000000000000  /* A00 = +0.000000000000000000000e-01 */
> > -        .quad 0x3FF00000022C70EB  /* A01 = +1.000000008097283510367e+00 */
> > -        .quad 0xBED00E878CFFA194  /* A02 = -3.828228912518614443549e-06 */
> > -        .quad 0xBFD551766D0607A9  /* A03 = -3.330970825846813476723e-01 */
> > -        .quad 0xBE53D60CE3E4C297  /* A00 = -1.847383956330407336230e-08 */
> > -        .quad 0x3FF000024177CF5C  /* A01 = +1.000002151235967140508e+00 */
> > -        .quad 0xBF1758BC94A51A25  /* A02 = -8.906031613262943753568e-05 */
> > -        .quad 0xBFD53EAE67E0D4F0  /* A03 = -3.319507612644221339337e-01 */
> > -        .quad 0xBE5A9E47EF32D6FE  /* A00 = -2.479020984039698285657e-08 */
> > -        .quad 0x3FF00002DA983057  /* A01 = +1.000002721676556793895e+00 */
> > -        .quad 0xBF1BD953509E94AA  /* A02 = -1.062352277175377670507e-04 */
> > -        .quad 0xBFD53BDB562EEDD5  /* A03 = -3.317783681520414806876e-01 */
> > -        .quad 0xBE6191BBE496D294  /* A00 = -3.272532162914017685901e-08 */
> > -        .quad 0x3FF0000390492017  /* A01 = +1.000003398528866105366e+00 */
> > -        .quad 0xBF20727E814A57CE  /* A02 = -1.254825043772153972919e-04 */
> > -        .quad 0xBFD538DE060A6F22  /* A03 = -3.315959033004550748913e-01 */
> > -        .quad 0xBE66DAFA2A893A25  /* A00 = -4.257146219278012568149e-08 */
> > -        .quad 0x3FF0000465E08CD1  /* A01 = +1.000004194219219266770e+00 */
> > -        .quad 0xBF2341C765EF91B6  /* A02 = -1.469188600530365522261e-04 */
> > -        .quad 0xBFD535B6841FAF9E  /* A03 = -3.314033785124993469751e-01 */
> > -        .quad 0xBE6D5794E361E964  /* A00 = -5.465394929765249413434e-08 */
> > -        .quad 0x3FF000055EE2A0CB  /* A01 = +1.000005121846742950353e+00 */
> > -        .quad 0xBF265E6C77E66C8B  /* A02 = -1.706607253709506650304e-04 */
> > -        .quad 0xBFD53264DDCCEDA6  /* A03 = -3.312008062382240103361e-01 */
> > -        .quad 0xBE729C844D374A6E  /* A00 = -6.933284462462096107184e-08 */
> > -        .quad 0x3FF000067F019093  /* A01 = +1.000006195180536350264e+00 */
> > -        .quad 0xBF29CC5348D6DCE5  /* A02 = -1.968242326435338705130e-04 */
> > -        .quad 0xBFD52EE92121ED35  /* A03 = -3.309881995734998416658e-01 */
> > -        .quad 0xBE775AEA17EAA872  /* A00 = -8.700465590574974405858e-08 */
> > -        .quad 0x3FF00007CA1D66B8  /* A01 = +1.000007428656699559610e+00 */
> > -        .quad 0xBF2D8F5EB98A2637  /* A02 = -2.255252009216044881395e-04 */
> > -        .quad 0xBFD52B435CDF9128  /* A03 = -3.307655722585587376727e-01 */
> > -        .quad 0xBE7D04DA28C343F0  /* A00 = -1.081040272327705484794e-07 */
> > -        .quad 0x3FF000094443CCF5  /* A01 = +1.000008837375216730337e+00 */
> > -        .quad 0xBF30D5B76C947AE5  /* A02 = -2.568791210978817814332e-04 */
> > -        .quad 0xBFD52773A0776FAD  /* A03 = -3.305329386764651045105e-01 */
> > -        .quad 0xBE81DD77A12C51C7  /* A00 = -1.331054169875768625701e-07 */
> > -        .quad 0x3FF0000AF1AFD2DA  /* A01 = +1.000010437096696680470e+00 */
> > -        .quad 0xBF331230624C1680  /* A02 = -2.910011410651516805537e-04 */
> > -        .quad 0xBFD52379FC0B61DF  /* A03 = -3.302903138515186909352e-01 */
> > -        .quad 0xBE85D04EEEB3C435  /* A00 = -1.625247628488202841012e-07 */
> > -        .quad 0x3FF0000CD6C9B1F2  /* A01 = +1.000012244238970726684e+00 */
> > -        .quad 0xBF357F0742FADDD4  /* A02 = -3.280060509313874068243e-04 */
> > -        .quad 0xBFD51F56806D0E81  /* A03 = -3.300377134475880880338e-01 */
> > -        .quad 0xBE8A6E289B59681B  /* A00 = -1.969211333326924655065e-07 */
> > -        .quad 0x3FF0000EF8268F72  /* A01 = +1.000014275873550406715e+00 */
> > -        .quad 0xBF381E277A1B747A  /* A02 = -3.680082682942575423093e-04 */
> > -        .quad 0xBFD51B093F1D6FD4  /* A03 = -3.297751537663746734808e-01 */
> > -        .quad 0xBE8FCBC40EE9ABD5  /* A00 = -2.368983653301529373887e-07 */
> > -        .quad 0x3FF000115A883B6C  /* A01 = +1.000016549721943981410e+00 */
> > -        .quad 0xBF3AF17AC974B3D9  /* A02 = -4.111218235774406434303e-04 */
> > -        .quad 0xBFD516924A4C549C  /* A03 = -3.295026517456081105450e-01 */
> > -        .quad 0xBE92FFBC60A3F956  /* A00 = -2.831066871072026054144e-07 */
> > -        .quad 0x3FF0001402DCED8A  /* A01 = +1.000019084151832604590e+00 */
> > -        .quad 0xBF3DFAE9390C4801  /* A02 = -4.574603454311488280083e-04 */
> > -        .quad 0xBFD511F1B4D7DC3A  /* A03 = -3.292202249571719585575e-01 */
> > -        .quad 0xBE9690A22F96D5AD  /* A00 = -3.362443262393081632612e-07 */
> > -        .quad 0x3FF00016F63EFF5D  /* A01 = +1.000021898173108825247e+00 */
> > -        .quad 0xBF409E2C839605BB  /* A02 = -5.071370461992499986334e-04 */
> > -        .quad 0xBFD50D27924BEE00  /* A03 = -3.289278916051614487515e-01 */
> > -        .quad 0xBE9AA56C65E72A73  /* A00 = -3.970591019557469835586e-07 */
> > -        .quad 0x3FF0001A39F4A43E  /* A01 = +1.000025011433776978009e+00 */
> > -        .quad 0xBF425BD74C3D6667  /* A02 = -5.602647074553602319844e-04 */
> > -        .quad 0xBFD50833F6E1ABA2  /* A03 = -3.286256705238718156536e-01 */
> > -        .quad 0xBE9F4BD4FF1A83B0  /* A00 = -4.663500013744687071912e-07 */
> > -        .quad 0x3FF0001DD36F9EC2  /* A01 = +1.000028444215715683896e+00 */
> > -        .quad 0xBF44376634149405  /* A02 = -6.169556656102642569831e-04 */
> > -        .quad 0xBFD50316F77EDEE5  /* A03 = -3.283135811757190158922e-01 */
> > -        .quad 0xBEA3B625387BB079  /* A00 = -5.874486399249461304297e-07 */
> > -        .quad 0x3FF00023E14CFBA9  /* A01 = +1.000034217911642153709e+00 */
> > -        .quad 0xBF47392F923218D2  /* A02 = -7.087213783883111826306e-04 */
> > -        .quad 0xBFD4FB1FACDEB938  /* A03 = -3.278273761924483942209e-01 */
> > -        .quad 0xBEAA6E24F543500A  /* A00 = -7.876828740601738750574e-07 */
> > -        .quad 0x3FF0002D5C6E8412  /* A01 = +1.000043259679163742959e+00 */
> > -        .quad 0xBF4BAF02BD7FDD70  /* A02 = -8.448375110664940040861e-04 */
> > -        .quad 0xBFD4EFEE6527A7DE  /* A03 = -3.271442401734229177279e-01 */
> > -        .quad 0xBEB16E3EBE2157D0  /* A00 = -1.038947396133402500647e-06 */
> > -        .quad 0x3FF00038990FEE2F  /* A01 = +1.000053975962952312884e+00 */
> > -        .quad 0xBF50569481C574CB  /* A02 = -9.972048056490652716971e-04 */
> > -        .quad 0xBFD4E419278DA2B4  /* A03 = -3.264220129263251113372e-01 */
> > -        .quad 0xBEB6A7B6723165D4  /* A00 = -1.350350836279403750524e-06 */
> > -        .quad 0x3FF00045CAB4158E  /* A01 = +1.000066558657042303793e+00 */
> > -        .quad 0xBF531D7C9C849108  /* A02 = -1.166698160951775212202e-03 */
> > -        .quad 0xBFD4D7A0BB33B152  /* A03 = -3.256608799117844954552e-01 */
> > -        .quad 0xBEBD0EE2A8654AFD  /* A00 = -1.732000471561702711532e-06 */
> > -        .quad 0x3FF00055276F18D6  /* A01 = +1.000081209219890521211e+00 */
> > -        .quad 0xBF562FDBA3FB6C6C  /* A02 = -1.354183666925102939860e-03 */
> > -        .quad 0xBFD4CA85F1B93DB2  /* A03 = -3.248610363561638125773e-01 */
> > -        .quad 0xBEC269D4036A207E  /* A00 = -2.195047297096822741730e-06 */
> > -        .quad 0x3FF00066E7DA6E4E  /* A01 = +1.000098138500919997540e+00 */
> > -        .quad 0xBF5991499FC36B3A  /* A02 = -1.560518167983372759405e-03 */
> > -        .quad 0xBFD4BCC9A72283D6  /* A03 = -3.240226871658341556426e-01 */
> > -        .quad 0xBEC7154B6C09CFE1  /* A00 = -2.751729738565190291276e-06 */
> > -        .quad 0x3FF0007B47086B80  /* A01 = +1.000117566559055148900e+00 */
> > -        .quad 0xBF5D455433B4F8F4  /* A02 = -1.786548832412968197680e-03 */
> > -        .quad 0xBFD4AE6CC1BFE145  /* A03 = -3.231460468373550942722e-01 */
> > -        .quad 0xBECCA68CC64A0F8A  /* A00 = -3.415415948561670285790e-06 */
> > -        .quad 0x3FF00092827742F7  /* A01 = +1.000139722473418535387e+00 */
> > -        .quad 0xBF60A7BF15A527AF  /* A02 = -2.033112728132522705610e-03 */
> > -        .quad 0xBFD49F703214084C  /* A03 = -3.222313393636155876010e-01 */
> > -        .quad 0xBED19E68676B241B  /* A00 = -4.200644630977303616698e-06 */
> > -        .quad 0x3FF000ACDA037B26  /* A01 = +1.000164844146362863597e+00 */
> > -        .quad 0xBF62D99F836A02F8  /* A02 = -2.301036405072284102280e-03 */
> > -        .quad 0xBFD48FD4F2B91B28  /* A03 = -3.212787981359945810311e-01 */
> > -        .quad 0xBED57CF4B0C7AA54  /* A00 = -5.123164339408145209103e-06 */
> > -        .quad 0x3FF000CA8FD9E1A1  /* A01 = +1.000193178099017865534e+00 */
> > -        .quad 0xBF653A014548E686  /* A02 = -2.591135484433962181405e-03 */
> > -        .quad 0xBFD47F9C0844B38F  /* A03 = -3.202886658426046806447e-01 */
> > -        .quad 0xBEDA012B1B1A41E2  /* A00 = -6.199971197454598722328e-06 */
> > -        .quad 0x3FF000EBE868FDF4  /* A01 = +1.000224979259539459520e+00 */
> > -        .quad 0xBF67CA9427E0A544  /* A02 = -2.904214255086275467410e-03 */
> > -        .quad 0xBFD46EC6812ADB37  /* A03 = -3.192611943626845749655e-01 */
> > -        .quad 0xBEDF3EAC5BF12194  /* A00 = -7.449344990702664567927e-06 */
> > -        .quad 0x3FF001112A520784  /* A01 = +1.000260510744255704196e+00 */
> > -        .quad 0xBF6A8D01ABDA4DC4  /* A02 = -3.241065277345108255891e-03 */
> > -        .quad 0xBFD45D55759FFA4A  /* A03 = -3.181966446572103146551e-01 */
> > -        .quad 0xBEE2A541BC274267  /* A00 = -8.890883582164319970972e-06 */
> > -        .quad 0x3FF0013A9E5961F2  /* A01 = +1.000300043631906721231e+00 */
> > -        .quad 0xBF6D82ECD080C540  /* A02 = -3.602468994380686462264e-03 */
> > -        .quad 0xBFD44B4A0779C0AD  /* A03 = -3.170952866557950611259e-01 */
> > -        .quad 0xBEE61D97609A27F4  /* A00 = -1.054553560499505625520e-05 */
> > -        .quad 0x3FF001688F56A3AF  /* A01 = +1.000343856731187974773e+00 */
> > -        .quad 0xBF7056F8EFB683EC  /* A02 = -3.989193351487490407647e-03 */
> > -        .quad 0xBFD438A5620F0F74  /* A03 = -3.159573991399533543500e-01 */
> > -        .quad 0xBEEA145429EDD370  /* A00 = -1.243563138839952927732e-05 */
> > -        .quad 0x3FF0019B4A242A67  /* A01 = +1.000392236341804297339e+00 */
> > -        .quad 0xBF7207D31CA78D9B  /* A02 = -4.401993423445739288258e-03 */
> > -        .quad 0xBFD42568BA16E7CD  /* A03 = -3.147832696228050619602e-01 */
> > -        .quad 0xBEEE96370D52680F  /* A00 = -1.458491207477835326165e-05 */
> > -        .quad 0x3FF001D31D8E4115  /* A01 = +1.000445476009251821736e+00 */
> > -        .quad 0xBF73D4CC11EDC094  /* A02 = -4.841611050196221316400e-03 */
> > -        .quad 0xBFD411954D8664E7  /* A03 = -3.135731942252974469021e-01 */
> > -        .quad 0xBEF338C046215EF8  /* A00 = -1.833122622260562810219e-05 */
> > -        .quad 0x3FF00230C32C2EC1  /* A01 = +1.000534784691737621998e+00 */
> > -        .quad 0xBF76BD019BCC5DAF  /* A02 = -5.551344188254799492943e-03 */
> > -        .quad 0xBFD3F2C7156DC21E  /* A03 = -3.116929730668135389848e-01 */
> > -        .quad 0xBEF9B15EAE411EAE  /* A00 = -2.450261207822986676092e-05 */
> > -        .quad 0x3FF002C2DF057A4D  /* A01 = +1.000674124886830940184e+00 */
> > -        .quad 0xBF7B08CCD9AC1E30  /* A02 = -6.600189396301511801646e-03 */
> > -        .quad 0xBFD3C7A7A114FED8  /* A03 = -3.090609620157755976777e-01 */
> > -        .quad 0xBF00E36483C373B3  /* A00 = -3.221178528332122595812e-05 */
> > -        .quad 0x3FF0036F419480D7  /* A01 = +1.000838524028997644777e+00 */
> > -        .quad 0xBF7FD255D1777007  /* A02 = -7.768950679260206403087e-03 */
> > -        .quad 0xBFD39A453911D6CE  /* A03 = -3.062909180947429588215e-01 */
> > -        .quad 0xBF05DFA04DD12059  /* A00 = -4.172046622180685472624e-05 */
> > -        .quad 0x3FF00438B2A03D8D  /* A01 = +1.001030633695197069599e+00 */
> > -        .quad 0xBF828F8DBB4A9D10  /* A02 = -9.062869337255224921890e-03 */
> > -        .quad 0xBFD36AAB704697D9  /* A03 = -3.033856007044711255993e-01 */
> > -        .quad 0xBF0BF3E0C647DEFB  /* A00 = -5.331544597092331081714e-05 */
> > -        .quad 0x3FF005221063D36D  /* A01 = +1.001253189109060359741e+00 */
> > -        .quad 0xBF857A2CB3C96102  /* A02 = -1.048693584122917590862e-02 */
> > -        .quad 0xBFD338E65BBB4FEC  /* A03 = -3.003478904549854444639e-01 */
> > -        .quad 0xBF11A506ED7C9D31  /* A00 = -6.730894835681591541979e-05 */
> > -        .quad 0x3FF0062E4D0EA92A  /* A01 = +1.001508999829250345925e+00 */
> > -        .quad 0xBF88AB82C2761AF3  /* A02 = -1.204588085125866091241e-02 */
> > -        .quad 0xBFD305028D6BD206  /* A03 = -2.971807843271395688234e-01 */
> > -        .quad 0xBF1607C0922D9BF1  /* A00 = -8.403885708006799337092e-05 */
> > -        .quad 0x3FF007606C341961  /* A01 = +1.001800940198869449560e+00 */
> > -        .quad 0xBF8C25E6DA487BCF  /* A02 = -1.374416688582682892494e-02 */
> > -        .quad 0xBFD2CF0D0EE8F7B5  /* A03 = -2.938873906713255768075e-01 */
> > -        .quad 0xBF1B3A8480A0A16D  /* A00 = -1.038688061788578038307e-04 */
> > -        .quad 0x3FF008BB802D02D6  /* A01 = +1.002131939589323561535e+00 */
> > -        .quad 0xBF8FEB8AE99FD100  /* A02 = -1.558598065819483124983e-02 */
> > -        .quad 0xBFD297135BD0911B  /* A03 = -2.904709240558688843059e-01 */
> > -        .quad 0xBF20ABB9BDB75C65  /* A00 = -1.271881327357976163798e-04 */
> > -        .quad 0x3FF00A42A76D8CD1  /* A01 = +1.002504972472525901495e+00 */
> > -        .quad 0xBF91FF3D752BB9E6  /* A02 = -1.757522609380570560722e-02 */
> > -        .quad 0xBFD25D235C1F88B4  /* A03 = -2.869346999779154305799e-01 */
> > -        .quad 0xBF243D3254425461  /* A00 = -1.544116913733432829448e-04 */
> > -        .quad 0x3FF00BF909D1795E  /* A01 = +1.002923048355647051011e+00 */
> > -        .quad 0xBF94304E04D44942  /* A02 = -1.971551804042204897316e-02 */
> > -        .quad 0xBFD2214B5E61CFA6  /* A03 = -2.832821294498394371075e-01 */
> > -        .quad 0xBF286070011B61CE  /* A00 = -1.859795307186510085994e-04 */
> > -        .quad 0x3FF00DE1D5E1627E  /* A01 = +1.003389201612804537689e+00 */
> > -        .quad 0xBF9689D5F4163F59  /* A02 = -2.201017668045266231780e-02 */
> > -        .quad 0xBFD1E39A11C3B42C  /* A03 = -2.795167134743816728104e-01 */
> > -        .quad 0xBF2D250B366A79E8  /* A00 = -2.223564326486314902259e-04 */
> > -        .quad 0x3FF010003E134001  /* A01 = +1.003906481248123094829e+00 */
> > -        .quad 0xBF990C9FF91F6F81  /* A02 = -2.446222265267250853271e-02 */
> > -        .quad 0xBFD1A41E80084CDC  /* A03 = -2.756420374218586655246e-01 */
> > -        .quad 0xBF314DB5DDC2A30E  /* A00 = -2.640313157465248123865e-04 */
> > -        .quad 0x3FF012577608921B  /* A01 = +1.004477940624503018441e+00 */
> > -        .quad 0xBF9BB9626875B0C9  /* A02 = -2.707437288829409385849e-02 */
> > -        .quad 0xBFD162E80768A9D0  /* A03 = -2.716617653228725615122e-01 */
> > -        .quad 0xBF346A6133808864  /* A00 = -3.115165050094957730625e-04 */
> > -        .quad 0x3FF014EAAFCC88A3  /* A01 = +1.005106627192198898157e+00 */
> > -        .quad 0xBF9E90BEF9BF7419  /* A02 = -2.984903716411588595059e-02 */
> > -        .quad 0xBFD12006545F7FAD  /* A03 = -2.675796340899932457269e-01 */
> > -        .quad 0xBF37F180DC3848EA  /* A00 = -3.653468704395550778821e-04 */
> > -        .quad 0x3FF017BD19147861  /* A01 = +1.005795572250939295955e+00 */
> > -        .quad 0xBFA0C9A14C702E07  /* A02 = -3.278831537326359207851e-02 */
> > -        .quad 0xBFD0DB895B650092  /* A03 = -2.633994476818851682154e-01 */
> > -        .quad 0xBF3BEC6AAC6D7635  /* A00 = -4.260788377246944457107e-04 */
> > -        .quad 0x3FF01AD1D884E719  /* A01 = +1.006547780778822565040e+00 */
> > -        .quad 0xBFA260B2A1B1434A  /* A02 = -3.589399551186163439542e-02 */
> > -        .quad 0xBFD09581529E93D6  /* A03 = -2.591250712233067465817e-01 */
> > -        .quad 0xBF4164E26167882B  /* A00 = -5.308251737086202562063e-04 */
> > -        .quad 0x3FF01FEF14B62B81  /* A01 = +1.007796364693348545316e+00 */
> > -        .quad 0xBFA4EB014538AA42  /* A02 = -4.085544557559163403315e-02 */
> > -        .quad 0xBFD029D36FEAF41F  /* A03 = -2.525528519580024222613e-01 */
> > -        .quad 0xBF46F6FFF4E53DC8  /* A00 = -7.008313930700277652464e-04 */
> > -        .quad 0x3FF027CBB51CBBA0  /* A01 = +1.009715754956893363214e+00 */
> > -        .quad 0xBFA89DEC9FEC112E  /* A02 = -4.807986690687680864098e-02 */
> > -        .quad 0xBFCF2A99464D0DB4  /* A03 = -2.434875100390009317053e-01 */
> > -        .quad 0xBF4DCC9C4F66A4D9  /* A00 = -9.094012482836712945103e-04 */
> > -        .quad 0x3FF030E7CFCCD583  /* A01 = +1.011939822882909068014e+00 */
> > -        .quad 0xBFACAA3B95814081  /* A02 = -5.598627281199331645611e-02 */
> > -        .quad 0xBFCDF78F156BE7CF  /* A03 = -2.341173987004467604844e-01 */
> > -        .quad 0xBF5308ED74E5C7A6  /* A00 = -1.161796466103906435435e-03 */
> > -        .quad 0x3FF03B5986412ECB  /* A01 = +1.014489674026594512313e+00 */
> > -        .quad 0xBFB087EBA88DCC3F  /* A02 = -6.457398285947223148806e-02 */
> > -        .quad 0xBFCCBB9BD134862F  /* A03 = -2.244753619680052991736e-01 */
> > -        .quad 0xBF57FA23C00DF4B5  /* A00 = -1.463446533505758208674e-03 */
> > -        .quad 0x3FF0473558A1BCC0  /* A01 = +1.017384859292903342975e+00 */
> > -        .quad 0xBFB2E702BC6360EF  /* A02 = -7.383744334527241048871e-02 */
> > -        .quad 0xBFCB77D546379288  /* A03 = -2.145945160729250122955e-01 */
> > -        .quad 0xBF5DD12971557F71  /* A00 = -1.819887610814388068450e-03 */
> > -        .quad 0x3FF0548DDF5000A8  /* A01 = +1.020643112482540360020e+00 */
> > -        .quad 0xBFB571B63DA186E1  /* A02 = -8.376635555898871710045e-02 */
> > -        .quad 0xBFCA2D5202605148  /* A03 = -2.045080672838912594358e-01 */
> > -        .quad 0xBF6252B1AD5D4F17  /* A00 = -2.236697221556737096709e-03 */
> > -        .quad 0x3FF063738A910BF7  /* A01 = +1.024280110622155737232e+00 */
> > -        .quad 0xBFB8270C8E6B601B  /* A02 = -9.434584118878357184013e-02 */
> > -        .quad 0xBFC8DD27D950A07E  /* A03 = -1.942491351230763441116e-01 */
> > -        .quad 0xBF66470C91730CFC  /* A00 = -2.719425723258004842786e-03 */
> > -        .quad 0x3FF073F468FCF331  /* A01 = +1.028309259519300633556e+00 */
> > -        .quad 0xBFBB05C2952191E4  /* A02 = -1.055566419686964629854e-01 */
> > -        .quad 0xBFC7886A770DE2BD  /* A03 = -1.838505822486435070662e-01 */
> > -        .quad 0xBF6AD114AC8E98EC  /* A00 = -3.273525599485007861467e-03 */
> > -        .quad 0x3FF0861BF53E5226  /* A01 = +1.032741506559554434119e+00 */
> > -        .quad 0xBFBE0C4F9B461507  /* A02 = -1.173753503881763554650e-01 */
> > -        .quad 0xBFC6302A037CDE3A  /* A03 = -1.733448521642786954722e-01 */
> > -        .quad 0xBF6FFBDE2A6C2AF8  /* A00 = -3.904279630096648551207e-03 */
> > -        .quad 0x3FF099F2EB8E7DA3  /* A01 = +1.037585182326304034106e+00 */
> > -        .quad 0xBFC09C74D192DDF0  /* A02 = -1.297746680554463516444e-01 */
> > -        .quad 0xBFC4D571D8E3079F  /* A03 = -1.627638157861470424859e-01 */
> > -        .quad 0xBF72E8FDC0B952AA  /* A00 = -4.616728994353872309042e-03 */
> > -        .quad 0x3FF0AF7F273C9533  /* A01 = +1.042845872181101141152e+00 */
> > -        .quad 0xBFC244C512736F10  /* A02 = -1.427236881344176033792e-01 */
> > -        .quad 0xBFC379474F58B902  /* A03 = -1.521386277613104298645e-01 */
> > -        .quad 0xBF762EABAF17395B  /* A00 = -5.415602341101023557701e-03 */
> > -        .quad 0x3FF0C6C3886F63FB  /* A01 = +1.048526318502125631582e+00 */
> > -        .quad 0xBFC3FDF9918EA12A  /* A02 = -1.561881981590514389957e-01 */
> > -        .quad 0xBFC21CA89ECAB895  /* A03 = -1.414995932913753196036e-01 */
> > -        .quad 0xBF79D387CE5B2BAE  /* A00 = -6.305246822828998107258e-03 */
> > -        .quad 0x3FF0DFBFE2346376  /* A01 = +1.054626353847394337748e+00 */
> > -        .quad 0xBFC5C6DA43602620  /* A02 = -1.701309994680721970894e-01 */
> > -        .quad 0xBFC0C08BD8DB6631  /* A03 = -1.308760460731704100557e-01 */
> > -        .quad 0xBF7DDBA8E8DA9060  /* A00 = -7.289562037531366334164e-03 */
> > -        .quad 0x3FF0FA70F0D1B464  /* A01 = +1.061142864894713433443e+00 */
> > -        .quad 0xBFC79E18D92BAA7C  /* A02 = -1.845122394946264732241e-01 */
> > -        .quad 0xBFBECBBBF74C2669  /* A03 = -1.202962378266875381749e-01 */
> > -        .quad 0xBF81254E76EA25DA  /* A00 = -8.371937755572145950511e-03 */
> > -        .quad 0x3FF116D05835EBD0  /* A01 = +1.068069786618014660462e+00 */
> > -        .quad 0xBFC982539E2ED224  /* A02 = -1.992897531869327609755e-01 */
> > -        .quad 0xBFBC1B043C350159  /* A03 = -1.097872397413132278254e-01 */
> > -        .quad 0xBF8391ACBA863403  /* A00 = -9.555196230190082448686e-03 */
> > -        .quad 0x3FF134D4AA477FE2  /* A01 = +1.075398125794884141015e+00 */
> > -        .quad 0xBFCB7218609FEAFB  /* A02 = -2.144194099235717521079e-01 */
> > -        .quad 0xBFB970A16CB88329  /* A03 = -9.937485603633135211599e-02 */
> > -        .quad 0xBF87935088E48E8B  /* A00 = -1.151144902957603431692e-02 */
> > -        .quad 0x3FF1649892AD7DD3  /* A01 = +1.087059567413110938716e+00 */
> > -        .quad 0xBFCE6971DDE75409  /* A02 = -2.375929196847723912089e-01 */
> > -        .quad 0xBFB58291E88CB251  /* A03 = -8.402358939628952472223e-02 */
> > -        .quad 0xBF8DB3A62C325325  /* A00 = -1.450280973794233242702e-02 */
> > -        .quad 0x3FF1A9C900C6DEEA  /* A01 = +1.103951457056548068891e+00 */
> > -        .quad 0xBFD13DBC65B0E08E  /* A02 = -2.693930619311765140012e-01 */
> > -        .quad 0xBFB06696F62696D1  /* A03 = -6.406539449252625362252e-02 */
> > -        .quad 0xBF92583699F2E27A  /* A00 = -1.791463198307716858659e-02 */
> > -        .quad 0x3FF1F451B85AA9F0  /* A01 = +1.122148246892376022288e+00 */
> > -        .quad 0xBFD34FD5F8288180  /* A02 = -3.017477916164565954205e-01 */
> > -        .quad 0xBFA6FB692825B683  /* A03 = -4.488686194495718900788e-02 */
> > -        .quad 0xBF9641C26E673D6F  /* A00 = -2.173522757385398448959e-02 */
> > -        .quad 0x3FF24364DA5E2B07  /* A01 = +1.141453602790251542487e+00 */
> > -        .quad 0xBFD564A5A5EF5890  /* A02 = -3.342680092295120530821e-01 */
> > -        .quad 0xBF9B43712011A982  /* A03 = -2.662445791467283467968e-02 */
> > -        .quad 0xBF9A901038EC2F39  /* A00 = -2.594018313816024226548e-02 */
> > -        .quad 0x3FF2961356DFFEBA  /* A01 = +1.161639537196534011088e+00 */
> > -        .quad 0xBFD775EBB17198C7  /* A02 = -3.665723069046972759644e-01 */
> > -        .quad 0xBF833B1A926CD462  /* A03 = -9.390075295963199591975e-03 */
> > -        .quad 0xBF9F396A6A461B91  /* A00 = -3.049246095317987084727e-02 */
> > -        .quad 0x3FF2EB53BAEF534B  /* A01 = +1.182452898229899629357e+00 */
> > -        .quad 0xBFD97DABF8AD8BBD  /* A02 = -3.982953957076310058660e-01 */
> > -        .quad 0x3F7B8F6A3E0F8837  /* A03 = +6.728568086119371925713e-03 */
> > -        .quad 0xBFA21878590F8BAA  /* A00 = -3.534294211546946951064e-02 */
> > -        .quad 0x3FF34209790236E1  /* A01 = +1.203622315111197105253e+00 */
> > -        .quad 0xBFDB764C0E71BECB  /* A02 = -4.290952817018306997277e-01 */
> > -        .quad 0x3F962FE0C03F84C0  /* A03 = +2.166701482190513949888e-02 */
> > -        .quad 0xBFA4B36B9AD27ECC  /* A00 = -4.043136849327097492868e-02 */
> > -        .quad 0x3FF3990C5B12FC16  /* A01 = +1.224865298994477935679e+00 */
> > -        .quad 0xBFDD5AABB0D01390  /* A02 = -4.586590983092770912322e-01 */
> > -        .quad 0x3FA21DAF5CA162DB  /* A03 = +3.538272863142363083844e-02 */
> > -        .quad 0xBFA7645E4D7BF28B  /* A00 = -4.568762489177399105378e-02 */
> > -        .quad 0x3FF3EF2FD51C0D9F  /* A01 = +1.245895225962932562069e+00 */
> > -        .quad 0xBFDF26377E1B686E  /* A02 = -4.867075664057044503963e-01 */
> > -        .quad 0x3FA8803E756EE812  /* A03 = +4.785342391501513914509e-02 */
> > -        .quad 0xBFAA210925C64413  /* A00 = -5.103329263796054643398e-02 */
> > -        .quad 0x3FF44349F897D8E7  /* A01 = +1.266427966181760345066e+00 */
> > -        .quad 0xBFE06A7B02C6D8E2  /* A02 = -5.129981092675530707226e-01 */
> > -        .quad 0x3FAE3F194734F5D0  /* A03 = +5.907515520309980505687e-02 */
> > -        .quad 0xBFACDE48F8A19BBB  /* A00 = -5.638340029764018351832e-02 */
> > -        .quad 0x3FF49439D5466582  /* A01 = +1.286187966447272845727e+00 */
> > -        .quad 0xBFE131C7C1063DDC  /* A02 = -5.373266954429101183166e-01 */
> > -        .quad 0x3FB1ADEEC36AD805  /* A03 = +6.906025191241844940482e-02 */
> > -        .quad 0xBFAF905D8F585680  /* A00 = -6.164829611604449866036e-02 */
> > -        .quad 0x3FF4E0ED1FD27F99  /* A01 = +1.304913639360142818546e+00 */
> > -        .quad 0xBFE1E7A859DC1D3D  /* A02 = -5.595285182070380836095e-01 */
> > -        .quad 0x3FB3ED018E4642A1  /* A03 = +7.783517573831001679086e-02 */
> > -        .quad 0xBFB11595104160BA  /* A00 = -6.673556944713512906198e-02 */
> > -        .quad 0x3FF528650340490B  /* A01 = +1.322361958217302513319e+00 */
> > -        .quad 0xBFE28B14B40BC974  /* A02 = -5.794776455425521000109e-01 */
> > -        .quad 0x3FB5DF49F5BAF6D7  /* A03 = +8.543836831355676453281e-02 */
> > -        .quad 0xBFB2513A97344BA4  /* A00 = -7.155195418844911836587e-02 */
> > -        .quad 0x3FF569BA0DB5EE14  /* A01 = +1.338312200124055273420e+00 */
> > -        .quad 0xBFE31B53A8B67B20  /* A02 = -5.970857901737396389308e-01 */
> > -        .quad 0x3FB787F297BB0544  /* A03 = +9.191814617499455275507e-02 */
> > -        .quad 0xBFB37512E848FAFA  /* A00 = -7.600515528700305112331e-02 */
> > -        .quad 0x3FF5A41F33B403C8  /* A01 = +1.352568819013173495591e+00 */
> > -        .quad 0xBFE397F6EA9A58A5  /* A02 = -6.123003561103997904880e-01 */
> > -        .quad 0x3FB8EAA9FF25CA06  /* A03 = +9.733068923177520814782e-02 */
> > -        .quad 0xBFB47B3E603AFC5D  /* A00 = -8.000554894805263217439e-02 */
> > -        .quad 0x3FF5D6E3EDE40487  /* A01 = +1.364963464031718975988e+00 */
> > -        .quad 0xBFE400D5BCA6D631  /* A02 = -6.251019177058819709103e-01 */
> > -        .quad 0x3FBA0B830ED567FE  /* A03 = +1.017381583418739132707e-01 */
> > -        .quad 0xBFB5BBFE8AC90496  /* A00 = -8.489981544791400103200e-02 */
> > -        .quad 0x3FF612BA70107E95  /* A01 = +1.379572332145390989311e+00 */
> > -        .quad 0xBFE477EAF1FA7693  /* A02 = -6.396383978023599814478e-01 */
> > -        .quad 0x3FBB4784B7C08A95  /* A03 = +1.065600346196709652391e-01 */
> > -        .quad 0xBFB6D5D940743939  /* A00 = -8.920057128509463473254e-02 */
> > -        .quad 0x3FF644A8748F70CE  /* A01 = +1.391762214006166953340e+00 */
> > -        .quad 0xBFE4D646AB07EA37  /* A02 = -6.511567440459832267763e-01 */
> > -        .quad 0x3FBC354F4E1D5292  /* A03 = +1.101884427747086558913e-01 */
> > -        .quad 0xBFB7223D19E4F3D1  /* A00 = -9.036619074045339206069e-02 */
> > -        .quad 0x3FF6518FEB42B7FA  /* A01 = +1.394912642466350494175e+00 */
> > -        .quad 0xBFE4ED86CB87498C  /* A02 = -6.539949393430091184598e-01 */
> > -        .quad 0x3FBC6D29F28CCA9B  /* A03 = +1.110407082713131127205e-01 */
> > -        .quad 0xBFB6878652FF6312  /* A00 = -8.800544287022329936754e-02 */
> > -        .quad 0x3FF63948C302D040  /* A01 = +1.388985406648330922508e+00 */
> > -        .quad 0xBFE4C4E2E7904E17  /* A02 = -6.490339777687407218920e-01 */
> > -        .quad 0x3FBC127356CA1ABE  /* A03 = +1.096565329445224612481e-01 */
> > -        .quad 0xBFB4F5D18B0C91D6  /* A00 = -8.187589306596207427980e-02 */
> > -        .quad 0x3FF5FD27EB7DD0B8  /* A01 = +1.374305648697413673176e+00 */
> > -        .quad 0xBFE464E01A2B2FC6  /* A02 = -6.373138915164353601739e-01 */
> > -        .quad 0x3FBB460547674A30  /* A03 = +1.065371798825160976065e-01 */
> > -        .quad 0xBFB26642FA16A685  /* A00 = -7.187288861919156890412e-02 */
> > -        .quad 0x3FF59F9BEDE1C95A  /* A01 = +1.351467065073470141812e+00 */
> > -        .quad 0xBFE3D67920C8FBEA  /* A02 = -6.199308052381387046381e-01 */
> > -        .quad 0x3FBA24F6A8D3CBC1  /* A03 = +1.021265184570401413078e-01 */
> > -        .quad 0xBFADB5294794F097  /* A00 = -5.802277563859197656582e-02 */
> > -        .quad 0x3FF523EA7B9CF453  /* A01 = +1.321268542159732772845e+00 */
> > -        .quad 0xBFE322A8B55E35DB  /* A02 = -5.979808370918208160205e-01 */
> > -        .quad 0x3FB8C8673B1B3E37  /* A03 = +9.680791085269722928697e-02 */
> > -        .quad 0xBFA4B7D661965C6A  /* A00 = -4.046506825687219699450e-02 */
> > -        .quad 0x3FF48DE3E2CE3122  /* A01 = +1.284641157110919085227e+00 */
> > -        .quad 0xBFE251FED1A7F445  /* A02 = -5.725092024655472622285e-01 */
> > -        .quad 0x3FB745699FCABDB9  /* A03 = +9.090290213747821701507e-02 */
> > -        .quad 0xBF93E60456E4EE1D  /* A00 = -1.943213253365004902773e-02 */
> > -        .quad 0x3FF3E1A14E628A59  /* A01 = +1.242585474196536532432e+00 */
> > -        .quad 0xBFE16C5AB660E876  /* A02 = -5.444768488007543094653e-01 */
> > -        .quad 0x3FB5AD33AA8C188F  /* A03 = +8.467410005332197397987e-02 */
> > -        .quad 0x3F738C17C47C7961  /* A00 = +4.772274820224659853951e-03 */
> > -        .quad 0x3FF3234DDE3BD146  /* A01 = +1.196119182682268355933e+00 */
> > -        .quad 0xBFE078C0D77A9D3B  /* A02 = -5.147403915952176722826e-01 */
> > -        .quad 0x3FB40D74B3E276B8  /* A03 = +7.833032027925923568290e-02 */
> > -        .quad 0x3FA0474BECC689C7  /* A00 = +3.179394975019849550746e-02 */
> > -        .quad 0x3FF256FB4FA7D18A  /* A01 = +1.146235762743432307076e+00 */
> > -        .quad 0xBFDEFA8E3FB285E2  /* A02 = -4.840427038235174395098e-01 */
> > -        .quad 0x3FB270C007493D59  /* A03 = +7.203293016322244446403e-02 */
> > -        .quad 0x3FAF5BD51E479BDC  /* A00 = +6.124750132203590768931e-02 */
> > -        .quad 0x3FF18081D0B53BC5  /* A01 = +1.093873801484492647162e+00 */
> > -        .quad 0xBFDCFE2439BD0C03  /* A02 = -4.530115665294831006626e-01 */
> > -        .quad 0x3FB0DEFE5A45AFDD  /* A03 = +6.590261176978580437424e-02 */
> > -        .quad 0x3FB7BD5D2806EA26  /* A00 = +9.273321368429118805032e-02 */
> > -        .quad 0x3FF0A369E35B4440  /* A01 = +1.039895904647224256223e+00 */
> > -        .quad 0xBFDB04BC5C9951E7  /* A02 = -4.221640495573226181669e-01 */
> > -        .quad 0x3FAEBBBAA9D6DEEF  /* A03 = +6.002600978120919278380e-02 */
> > -        .quad 0x3FC01BE411098DBC  /* A00 = +1.258511622610124502941e-01 */
> > -        .quad 0x3FEF85BDABC031C1  /* A01 = +9.850757936961188621083e-01 */
> > -        .quad 0xBFD91521375097C2  /* A02 = -3.919146576102968682065e-01 */
> > -        .quad 0x3FABE26F0086D982  /* A03 = +5.446192628317005068883e-02 */
> > -        .quad 0x3FC481D7FF5776B9  /* A00 = +1.602125164781023347604e-01 */
> > -        .quad 0x3FEDC3506C1E7218  /* A01 = +9.300920592973538347792e-01 */
> > -        .quad 0xBFD7349A88DA7D4F  /* A02 = -3.625856720409119104964e-01 */
> > -        .quad 0x3FA936E2DFF8E2AE  /* A03 = +4.924687370334389358018e-02 */
> > -        .quad 0x3FC90471F96FA27A  /* A00 = +1.954481571149420671141e-01 */
> > -        .quad 0x3FEC0451601987A2  /* A01 = +8.755270840595026360376e-01 */
> > -        .quad 0xBFD5671CD4B898DC  /* A02 = -3.344184949259110251063e-01 */
> > -        .quad 0x3FA6BB9594603B67  /* A03 = +4.439990459660841243261e-02 */
> > -        .quad 0x3FCFD8ADB9ED944C  /* A00 = +2.488000066615846384011e-01 */
> > -        .quad 0x3FE978C073F6809A  /* A01 = +7.959902062321078108909e-01 */
> > -        .quad 0xBFD2DF7E00BCD5A9  /* A02 = -2.948908812716931060471e-01 */
> > -        .quad 0x3FA3614033D490B2  /* A03 = +3.785133965200894456959e-02 */
> > -        .quad 0x3FD4846A12AFE5A0  /* A00 = +3.205819303981005674586e-01 */
> > -        .quad 0x3FE63A1147D40472  /* A01 = +6.945883181471244061100e-01 */
> > -        .quad 0xBFCFA2268AD34450  /* A02 = -2.471359422548027318101e-01 */
> > -        .quad 0x3F9F150201D9FFE0  /* A03 = +3.035357605267552383310e-02 */
> > -        .quad 0x3FD9018641F82BEB  /* A00 = +3.907180446846598154131e-01 */
> > -        .quad 0x3FE33B7C220FFBDC  /* A01 = +6.010113396913498995389e-01 */
> > -        .quad 0xBFCA4E4187E29C86  /* A02 = -2.055131829740483584423e-01 */
> > -        .quad 0x3F98C30CED19F8F4  /* A03 = +2.418155858185229434287e-02 */
> > -        .quad 0x3FDD4B8255BEB078  /* A00 = +4.577337109901757905561e-01 */
> > -        .quad 0x3FE0858B19D3A49B  /* A01 = +5.163016800335243905451e-01 */
> > -        .quad 0xBFC5BC929EACE564  /* A02 = -1.698172831327539045176e-01 */
> > -        .quad 0x3F93A083CE57DE2B  /* A03 = +1.916700312537337677621e-02 */
> > -        .quad 0x3FE0A8E5E039295C  /* A00 = +5.206174258576470315063e-01 */
> > -        .quad 0x3FDC35E1234583FE  /* A01 = +4.407885403107342225937e-01 */
> > -        .quad 0xBFC1DE034E31AEB9  /* A02 = -1.395877963835710222629e-01 */
> > -        .quad 0x3F8EFDEBB3471BDC  /* A03 = +1.513275280821162888101e-02 */
> > -        .quad 0x3FE2851B603CB2A5  /* A00 = +5.787484054213406503564e-01 */
> > -        .quad 0x3FD7F4A44ABBB286  /* A01 = +3.743067483726821853551e-01 */
> > -        .quad 0xBFBD3EEB67087DE7  /* A02 = -1.142413260026767657385e-01 */
> > -        .quad 0x3F8864F38329E8BD  /* A03 = +1.191129917173260922836e-02 */
> > -        .quad 0x3FE437DBE3C34AC1  /* A00 = +6.318187187665317283702e-01 */
> > -        .quad 0x3FD43F6F789441B5  /* A01 = +3.163717916040938438194e-01 */
> > -        .quad 0xBFB7D92E7901B9A4  /* A02 = -9.315767721429907277653e-02 */
> > -        .quad 0x3F8327ED342308E1  /* A03 = +9.353497651663324544136e-03 */
> > -        .quad 0x3FE5C0977766D55C  /* A00 = +6.797597248138731451661e-01 */
> > -        .quad 0x3FD10B42A764D8F9  /* A01 = +2.663122782427219115142e-01 */
> > -        .quad 0xBFB3633351D3D70F  /* A02 = -7.573242900602060456716e-02 */
> > -        .quad 0x3F7E079E30FF899C  /* A03 = +7.331483779099558922843e-03 */
> > -        .quad 0x3FE7202CE08A88C4  /* A00 = +7.226776490754436288455e-01 */
> > -        .quad 0x3FCC973EB5662B01  /* A01 = +2.233656297433626314319e-01 */
> > -        .quad 0xBFAF70A455F9920B  /* A02 = -6.140626477716545211782e-02 */
> > -        .quad 0x3F77812411CE99B6  /* A03 = +5.738392731393584730859e-03 */
> > -        .quad 0x3FE85879424095B1  /* A00 = +7.608000082006382003286e-01 */
> > -        .quad 0x3FC7E73BD1674D84  /* A01 = +1.867441914060742336190e-01 */
> > -        .quad 0xBFA96F84E4BF333B  /* A02 = -4.967894832916504993525e-02 */
> > -        .quad 0x3F72606DDCA6E117  /* A03 = +4.486493251924870105662e-03 */
> > -        .quad 0x3FE96BFE4957F4DD  /* A00 = +7.944327766887472330737e-01 */
> > -        .quad 0x3FC3ED4780D25478  /* A01 = +1.556786898624158421711e-01 */
> > -        .quad 0xBFA489C5F9A56B58  /* A02 = -4.011362717093075458408e-02 */
> > -        .quad 0x3F6CB5DC17E9AD2A  /* A03 = +3.504686231556104931972e-03 */
> > -        .quad 0x3FEA5D9CB2F41234  /* A00 = +8.239272589858672724006e-01 */
> > -        .quad 0x3FC091A758374DCF  /* A01 = +1.294449978582705440555e-01 */
> > -        .quad 0xBFA08E436D4B5CE0  /* A02 = -3.233538350257858517978e-02 */
> > -        .quad 0x3F666997AD53E6B7  /* A03 = +2.735897297154145629133e-03 */
> > -        .quad 0x3FEB3060342CB850  /* A00 = +8.496552485501158713532e-01 */
> > -        .quad 0x3FBB7D30BBC7DC1B  /* A01 = +1.073790033768634993860e-01 */
> > -        .quad 0xBF9AA6BA3443D9E3  /* A02 = -2.602663940430173170060e-02 */
> > -        .quad 0x3F617CA764B7850B  /* A03 = +2.134634914668814050648e-03 */
> > -        .quad 0x3FEBE759A6A0C7B8  /* A00 = +8.719909910635044170135e-01 */
> > -        .quad 0x3FB6C10DE6A703FF  /* A01 = +8.888327485239243264115e-02 */
> > -        .quad 0xBF956C566D8BE1F6  /* A02 = -2.092108768099084498138e-02 */
> > -        .quad 0x3F5B46D1A4A59CF8  /* A03 = +1.664833764687232917079e-03 */
> > -        .quad 0x3FEC858494887A04  /* A00 = +8.912985707318630268503e-01 */
> > -        .quad 0x3FB2CC31F543394D  /* A01 = +7.342827070099140762682e-02 */
> > -        .quad 0xBF9133477FF69137  /* A02 = -1.679717749142747504343e-02 */
> > -        .quad 0x3F5544482FBB4DA5  /* A03 = +1.298017973501022466823e-03 */
> > -        .quad 0x3FED0DB59D0E32E9  /* A00 = +9.079235141267335551518e-01 */
> > -        .quad 0x3FAF006BAFFC6EF4  /* A01 = +6.055008433597022787787e-02 */
> > -        .quad 0xBF8B97146FA2B97A  /* A02 = -1.347175565419144252499e-02 */
> > -        .quad 0x3F5093B01F4CDC69  /* A03 = +1.011774057770665211434e-03 */
> > -        .quad 0x3FEDB487C3EC457C  /* A00 = +9.282873942012623835751e-01 */
> > -        .quad 0x3FA7390C09D0BD1D  /* A01 = +4.535710925881118044112e-02 */
> > -        .quad 0xBF83D9F7C3181106  /* A02 = -9.693084374710735778846e-03 */
> > -        .quad 0x3F46E34A0A3C0E64  /* A03 = +6.984817050299072134500e-04 */
> > -        .quad 0x3FEE5FFCB4E6EB00  /* A00 = +9.492171796076434020506e-01 */
> > -        .quad 0x3F9F4913ED00AADF  /* A01 = +3.055220731782070861526e-02 */
> > -        .quad 0xBF79670BD0E59B5C  /* A02 = -6.201788097633133961528e-03 */
> > -        .quad 0x3F3BC998EBCAF96D  /* A03 = +4.240034429975534616304e-04 */
> > -        .quad 0x3FEEDBA41E9542FE  /* A00 = +9.643116566968215064293e-01 */
> > -        .quad 0x3F94F5DD18D9C24D  /* A01 = +2.046914543319848858727e-02 */
> > -        .quad 0xBF7034896AA122B9  /* A02 = -3.956352980886528904192e-03 */
> > -        .quad 0x3F30DCCB47810B39  /* A03 = +2.573009765038273091199e-04 */
> > -        .quad 0x3FEF33F2882520ED  /* A00 = +9.750912341196716903724e-01 */
> > -        .quad 0x3F8BF37F2CF553FF  /* A01 = +1.364802699996836392315e-02 */
> > -        .quad 0xBF649F6F05A69619  /* A02 = -2.517430152880317534986e-03 */
> > -        .quad 0x3F247623C950AAC9  /* A03 = +1.561087307505231250044e-04 */
> > -        .quad 0x3FEF727757751741  /* A00 = +9.827229221489021115943e-01 */
> > -        .quad 0x3F828E67912C4400  /* A01 = +9.060677640748693306705e-03 */
> > -        .quad 0xBF5A2F51A806CC2C  /* A02 = -1.598195784123355826789e-03 */
> > -        .quad 0x3F18D35D7687E613  /* A03 = +9.470231965016282719549e-05 */
> > -        .quad 0x3FEF9E6325C5942A  /* A00 = +9.880843866091073568469e-01 */
> > -        .quad 0x3F788AB117618F76  /* A01 = +5.991641772286606867914e-03 */
> > -        .quad 0xBF5096EAB0B1EA89  /* A02 = -1.012543859160305046233e-03 */
> > -        .quad 0x3F0E1E50EC4435AB  /* A03 = +5.744633156910412119652e-05 */
> > -        .quad 0x3FEFBD0784049369  /* A00 = +9.918248728250605994461e-01 */
> > -        .quad 0x3F702BBD8294035F  /* A01 = +3.947963975634432264028e-03 */
> > -        .quad 0xBF44FB55E0F00593  /* A02 = -6.403130845457509273330e-04 */
> > -        .quad 0x3F0244DCD723230A  /* A03 = +3.484534217219031730379e-05 */
> > -        .quad 0x3FEFD245E2366A43  /* A00 = +9.944180887426415926811e-01 */
> > -        .quad 0x3F653D82EC088433  /* A01 = +2.592807490387838333795e-03 */
> > -        .quad 0xBF3A7DF75E013CB8  /* A02 = -4.042366908878036561859e-04 */
> > -        .quad 0x3EF6298E69F991CD  /* A03 = +2.113564425911141559972e-05 */
> > -        .quad 0x3FEFE0EAA508BC69  /* A00 = +9.962056372950317539861e-01 */
> > -        .quad 0x3F5BD0771AF3FDDA  /* A01 = +1.697651208644282514598e-03 */
> > -        .quad 0xBF30B2E1254DE571  /* A02 = -2.548026725928887099328e-04 */
> > -        .quad 0x3EEAE28B70EC0256  /* A03 = +1.281973848454955042307e-05 */
> > -        .quad 0x3FEFEAF5303D7F96  /* A00 = +9.974313680831865536192e-01 */
> > -        .quad 0x3F5229111365657E  /* A01 = +1.108423877289460134782e-03 */
> > -        .quad 0xBF250572D04DFE66  /* A02 = -1.603796628408704519168e-04 */
> > -        .quad 0x3EE04E89BB57C981  /* A03 = +7.775682983689149966743e-06 */
> > -        .quad 0x3FEFF1CF52F1CF44  /* A00 = +9.982678051005469122003e-01 */
> > -        .quad 0x3F47A71316147CEB  /* A01 = +7.218211359577819110842e-04 */
> > -        .quad 0xBF1A6D7604055719  /* A02 = -1.008132248946049582547e-04 */
> > -        .quad 0x3ED3C8047586A85C  /* A03 = +4.716233739913014633626e-06 */
> > -        .quad 0x3FEFF6770369EF69  /* A00 = +9.988360468555416149528e-01 */
> > -        .quad 0x3F3EBB261180FBF0  /* A01 = +4.689186039321105101130e-04 */
> > -        .quad 0xBF1097754FE19D7F  /* A02 = -6.329206004950480057066e-05 */
> > -        .quad 0x3EC7FEFF83BCA0A7  /* A03 = +2.860556404988488738366e-06 */
> > -        .quad 0x3FEFF99D42371AC4  /* A00 = +9.992204945818561334647e-01 */
> > -        .quad 0x3F33EB2AEC271F59  /* A01 = +3.039340773764907474054e-04 */
> > -        .quad 0xBF04CF18E0FC0D79  /* A02 = -3.968996690952969588805e-05 */
> > -        .quad 0x3EBD1BDBD6019BE9  /* A03 = +1.735021065507727833886e-06 */
> > -        .quad 0x3FEFFBBCA32B0D91  /* A00 = +9.994795977476532700123e-01 */
> > -        .quad 0x3F29C41E1615110A  /* A01 = +1.965796209707565346710e-04 */
> > -        .quad 0xBEFA11F93D9DCB5A  /* A02 = -2.486248909101414873235e-05 */
> > -        .quad 0x3EB1A7CA4546F7A7  /* A03 = +1.052345642723709228769e-06 */
> > -        .quad 0x3FEFFD298B8E8DE2  /* A00 = +9.996535993308806045121e-01 */
> > -        .quad 0x3F20A1C42D523C5B  /* A01 = +1.268913244172078754520e-04 */
> > -        .quad 0xBEF0507A364AFAE4  /* A02 = -1.555859070622834605755e-05 */
> > -        .quad 0x3EA56ACA17E7CDF4  /* A03 = +6.382806956848098872313e-07 */
> > -        .quad 0x3FEFFE1DC82BA5A3  /* A00 = +9.997700604991915929176e-01 */
> > -        .quad 0x3F156E73B90F1769  /* A01 = +8.175450626798714452801e-05 */
> > -        .quad 0xBEE4663579D0A09F  /* A02 = -9.727122057226747625365e-06 */
> > -        .quad 0x3E99FAF6FEC5D4C1  /* A03 = +3.871371052824002996020e-07 */
> > -        .quad 0x3FEFFEF8D0BB5E81  /* A00 = +9.998745037837154514548e-01 */
> > -        .quad 0x3F06686DA18D39C3  /* A01 = +4.273972098777251447726e-05 */
> > -        .quad 0xBED46BC298073E90  /* A02 = -4.868731025855742842491e-06 */
> > -        .quad 0x3E88E42286B9D0FD  /* A03 = +1.854535328530838170114e-07 */
> > -        .quad 0x3FEFFF8DBC68DDC7  /* A00 = +9.999455146670975791423e-01 */
> > -        .quad 0x3EF26B2953A80AF0  /* A01 = +1.756534514108903368909e-05 */
> > -        .quad 0xBEBFC4472D580F83  /* A02 = -1.893443529411295465239e-06 */
> > -        .quad 0x3E72505B4553D19F  /* A03 = +6.822456673547912277047e-08 */
> > -        .quad 0x3FEFFFCED1276609  /* A00 = +9.999765477215883935358e-01 */
> > -        .quad 0x3EDE1A94C7CC58F5  /* A01 = +7.177313020153979672606e-06 */
> > -        .quad 0xBEA8A2C988744E57  /* A02 = -7.342066660497443762363e-07 */
> > -        .quad 0x3E5AF30036BBBAF4  /* A03 = +2.509841882843541084885e-08 */
> > -        .quad 0x3FEFFFEAFE70FCFC  /* A00 = +9.999899835164849370983e-01 */
> > -        .quad 0x3EC879175E3549F5  /* A01 = +2.917410471128503564412e-06 */
> > -        .quad 0xBE930E36677D1813  /* A02 = -2.839493400307523115929e-07 */
> > -        .quad 0x3E43D4005B42D48F  /* A03 = +9.233192745401904898013e-09 */
> > -        .quad 0x3ff0000000000000
> > -        .quad 0x0000000000000000
> > -        .quad 0x0000000000000000
> > -        .quad 0x0000000000000000
> > -        .align 32
> > -        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000           /* _sSignMask        */
> > -        .align 32
> > -        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff           /* _sAbsMask         */
> > -        .align 32
> > -        .long 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000           /* _iExpMantMask     */
> > -        .align 32
> > -        .long 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000           /* _iExpMask         */
> > -        .align 32
> > -        .long 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000           /* _iMinIdxOfsMask   */
> > -        .align 32
> > -        .long 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000           /* _iMaxIdxMask      */
> > -        .align 32
> > -        .type  __svml_stanh_data_internal,@object
> > -        .size  __svml_stanh_data_internal,.-__svml_stanh_data_internal
> > +       vcvtps2pd %xmm4, %ymm5
> > +
> > +       vextractf128 $1, %ymm4, %xmm4
> > +       vcvtps2pd %xmm4, %ymm4
> > +
> > +       vmovdqu 16(%rcx, %rax), %xmm2
> > +       vinsertf128 $1, 16(%r11, %rax), %ymm2, %ymm2
> > +
> > +       vfmadd213pd %ymm3, %ymm5, %ymm1
> > +
> > +       vmovupd 16(%rdx, %rax), %xmm3
> > +       vinsertf128 $1, 16(%r10, %rax), %ymm3, %ymm3
> > +
> > +       vunpcklpd %ymm3, %ymm2, %ymm10
> > +       vunpckhpd %ymm3, %ymm2, %ymm2
> > +
> > +       vfmadd213pd %ymm10, %ymm4, %ymm2
> > +       vfmadd213pd %ymm6, %ymm4, %ymm2
> > +       vfmadd213pd %ymm7, %ymm4, %ymm2
> > +       vcvtpd2ps %ymm2, %xmm2
> > +
> > +       vmovdqu (%r9, %rax), %xmm7
> > +       vinsertf128 $1, (%rdi, %rax), %ymm7, %ymm7
> > +
> > +       vmovupd (%r8, %rax), %xmm3
> > +       vinsertf128 $1, (%rsi, %rax), %ymm3, %ymm3
> > +
> > +       vunpckhpd %ymm3, %ymm7, %ymm4
> > +       vunpcklpd %ymm3, %ymm7, %ymm7
> > +
> > +       vfmadd213pd %ymm4, %ymm5, %ymm1
> > +       vfmadd213pd %ymm7, %ymm5, %ymm1
> > +
> > +
> > +       vcvtpd2ps %ymm1, %xmm1
> > +       vinsertf128 $1, %xmm2, %ymm1, %ymm1
> > +
> > +       vmovmskps %ymm15, %edx
> > +       vandnps %ymm0, %ymm11, %ymm2
> > +       testl   %edx, %edx
> > +       /* Go to special inputs processing branch.  */
> > +       jne     L(SPECIAL_VALUES_BRANCH)
> > +       /* Wait until after the branch to overwrite ymm0.  */
> > +       vorps   %ymm2, %ymm1, %ymm0
> > +       /* No stack restoration on the fastpath.  */
> > +       ret
> > +
> > +
> > +L(SPECIAL_VALUES_BRANCH):
> > +       pushq   %rbp
> > +       /* Need to callee save registers to preserve state across tanhf calls.
> > +        */
> > +       pushq   %r12
> > +       pushq   %r13
> > +       movq    %rsp, %rbp
> > +
> > +       /* Align stack and make room for 2x ymm vectors.  */
> > +       andq    $-32, %rsp
> > +       addq    $-64, %rsp
> > +
> > +       /* Save all already computed inputs.  */
> > +       vorps   %ymm2, %ymm1, %ymm1
> > +       vmovups %ymm1, (%rsp)
> > +       /* Save original input (ymm0 unchanged up to this point).  */
> > +       vmovups %ymm0, 32(%rsp)
> > +
> > +       vzeroupper
> > +
> > +       /* edx has 1s where there was a special value that needs to be handled
> > +          by a tanhf call.  */
> > +       movl    %edx, %r13d
> > +L(SPECIAL_VALUES_LOOP):
> > +       /* use r12 as index for special value that is saved across calls to
> > +          tanhf. We technically don't need a callee save register here as offset
> > +          to rsp is always [0, 28] so we can restore rsp by realigning to 64.
> > +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> > +          in the loop.  */
> > +       xorl    %r12d, %r12d
> > +       tzcntl  %r13d, %r12d
> > +
> > +       /* Scalar math function call to process special input.  */
> > +       movss   32(%rsp, %r12, 4), %xmm0
> > +       call    tanhf@PLT
> > +       /* No good way to avoid the store-forwarding fault this will cause on
> > +          return. `lfence` avoids the SF fault but at greater cost as it
> > +          serializes stack/callee save restoration.  */
> > +       movss   %xmm0, (%rsp, %r12, 4)
> > +
> > +       blsr    %r13d, %r13d
> > +       jnz     L(SPECIAL_VALUES_LOOP)
> > +
> > +       /* All results have been written to (%rsp).  */
> > +       vmovups (%rsp), %ymm0
> > +       movq    %rbp, %rsp
> > +       popq    %r13
> > +       popq    %r12
> > +       popq    %rbp
> > +       ret
> > +END(_ZGVdN8v_tanhf_avx2)
> > --
> > 2.25.1
> >
Sunil Pandey Feb. 1, 2022, 9:29 p.m. UTC | #3
Looking into v2, it is still big, with all optimizations applied at
the same time.

>Optimizations are:
>    1. Reduce code size (-70 bytes).
>    2. Reduce rodata size (-32 bytes).
>    3. Remove register save/restores and stack adjustment from the
>       fast path.
>    4. Slightly better instruction selection where possible.
>    5. Remove redundant registers moves.
>    6. Prefer registers that get smaller instruction encodings.

Can you please further split the patch according to optimization, one
optimization at a time per patch.

On Tue, Feb 1, 2022 at 12:20 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Tue, Feb 1, 2022 at 2:03 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >
> > Hi Noah,
> >
> > We would like to get this patch, but it's too late for 2.35.
> >
> > This patch is too big, can you please break this patch into multiple
> > smaller patches?
>
> Yeah, I'll split by file.
> >
> > Also, it seems like this patch is incomplete. I got a build error on
> > the glibc master.
>
> My fault, I separated the rodata for avx2/sse2 into a single file
> so that the two implementations could share the lookup table.
>
> Forgot to commit it :/
>
> Will fix in V2.
> >
> > ./sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S:77:33: fatal
> > error: svml_s_tanhf_rodata.S: No such file or directory
> >  #include "svml_s_tanhf_rodata.S"
> >                                  ^
> > compilation terminated.
> > ../sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S:74:33: fatal
> > error: svml_s_tanhf_rodata.S: No such file or directory
> >  #include "svml_s_tanhf_rodata.S"
> >                                  ^
> > compilation terminated.
> >
> > Thanks,
> > Sunil
> >
> >
> >
> >
> >
> >
> > On Sat, Jan 29, 2022 at 8:37 PM Noah Goldstein via Libc-alpha
> > <libc-alpha@sourceware.org> wrote:
> > >
> > > No bug.
> > >
> > > Optimizations are:
> > >     1. Reduce code size
> > >         avx512: -56 bytes
> > >         avx2:   -70 bytes
> > >         sse4:   -106 bytes
> > >     2. Reduce rodata size
> > >         avx512: -448 bytes
> > >         avx2:   -32 bytes
> > >         sse4:   -4k+ (shares rodata with avx2)
> > >     3. Remove register save/restores and stack adjustment from the
> > >        fast path.
> > >     4. Slightly better instruction selection where possible.
> > >
> > > This results in roughly a 15% performance improvement for all
> > > functions.
> > >
> > > Results from geomean of 40 benchtest runs:
> > >        Function, New Time, Old Time, New / Old
> > >  _ZGVbN4v_tanhf,     3.28,    3.852,     0.852
> > >  _ZGVcN8v_tanhf,    3.556,    4.192,     0.848
> > >  _ZGVdN8v_tanhf,     2.13,    2.486,     0.857
> > > _ZGVeN16v_tanhf,    0.658,    0.762,     0.864
> > > ---
> > >  .../multiarch/svml_s_tanhf16_core_avx512.S    | 585 +++++------
> > >  .../fpu/multiarch/svml_s_tanhf4_core_sse4.S   | 871 +++--------------
> > >  .../fpu/multiarch/svml_s_tanhf8_core_avx2.S   | 908 +++---------------
> > >  3 files changed, 581 insertions(+), 1783 deletions(-)
> > >
> > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > index 8954a5f658..6a2f0c1392 100644
> > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > @@ -70,312 +70,323 @@
> > >   *
> > >   */
> > >
> > > -/* Offsets for data table __svml_stanh_data_internal
> > > - */
> > > -#define _sC                            0
> > > -#define _sP0                           128
> > > -#define _sP2                           256
> > > -#define _sP3                           384
> > > -#define _sP4                           512
> > > -#define _sP5                           640
> > > -#define _sP6                           768
> > > -#define _sP7                           896
> > > -#define _iExpMantMask_UISA             1024
> > > -#define _iMinIdxOfsMask_UISA           1088
> > > -#define _iMaxIdxMask_UISA              1152
> > > -#define _sSignMask                     1216
> > > -#define _sAbsMask                      1280
> > > -#define _iExpMantMask                  1344
> > > -#define _iExpMask                      1408
> > > -#define _iMinIdxOfsMask                1472
> > > -#define _iMaxIdxMask                   1536
> > > -
> > >  #include <sysdep.h>
> > >
> > > +#define TANHF_DATA(offset)     ((offset) + __svml_stanh_data_internal)
> > > +
> > > +/* Offsets for data table __svml_stanh_data_internal.  */
> > > +#define _iExpMantMask_UISA     0
> > > +#define _iMinIdxOfsMask_UISA   4
> > > +#define _iMaxIdxMask_UISA      8
> > > +#define _iExpMask      12
> > > +#define _sSignMask     64
> > > +#define _sC_lo 128
> > > +#define _sC_hi 192
> > > +#define _sP7_lo        256
> > > +#define _sP7_hi        320
> > > +#define _sP6_lo        384
> > > +#define _sP6_hi        448
> > > +#define _sP5_lo        512
> > > +#define _sP5_hi        576
> > > +#define _sP4_lo        640
> > > +#define _sP4_hi        704
> > > +#define _sP3_lo        768
> > > +#define _sP3_hi        832
> > > +#define _sP2_lo        896
> > > +#define _sP2_hi        960
> > > +#define _sP0_lo        1024
> > > +#define _sP0_hi        1088
> > > +
> > >          .text
> > >         .section .text.exex512,"ax",@progbits
> > >  ENTRY(_ZGVeN16v_tanhf_skx)
> > > -        pushq     %rbp
> > > -        cfi_def_cfa_offset(16)
> > > -        movq      %rsp, %rbp
> > > -        cfi_def_cfa(6, 16)
> > > -        cfi_offset(6, -16)
> > > -        andq      $-64, %rsp
> > > -        subq      $192, %rsp
> > > -        vmovaps   %zmm0, %zmm1
> > > -        vmovups   __svml_stanh_data_internal(%rip), %zmm9
> > > -        vmovups   _sP6+__svml_stanh_data_internal(%rip), %zmm11
> > > -        vmovups   _sP5+__svml_stanh_data_internal(%rip), %zmm12
> > > -        vmovups   _sP4+__svml_stanh_data_internal(%rip), %zmm13
> > > -        vmovups   _sP3+__svml_stanh_data_internal(%rip), %zmm14
> > > -        vmovups   _sP2+__svml_stanh_data_internal(%rip), %zmm15
> > > -        vpternlogd $255, %zmm2, %zmm2, %zmm2
> > > -        vandps    _sAbsMask+__svml_stanh_data_internal(%rip), %zmm1, %zmm8
> > > -        vandps    _sSignMask+__svml_stanh_data_internal(%rip), %zmm1, %zmm0
> > > -
> > > -/* Here huge arguments, INF and NaNs are filtered out to callout. */
> > > -        vpandd    _iExpMantMask_UISA+__svml_stanh_data_internal(%rip), %zmm1, %zmm3
> > > -        vpsubd    _iMinIdxOfsMask_UISA+__svml_stanh_data_internal(%rip), %zmm3, %zmm4
> > > -        vpcmpd    $2, _iExpMask+__svml_stanh_data_internal(%rip), %zmm3, %k1
> > > +       /* Here huge arguments, INF and NaNs are filtered out to callout.  */
> > > +       vpandd  TANHF_DATA(_iExpMantMask_UISA)(%rip) {1to16}, %zmm0, %zmm1
> > > +       vpsubd  TANHF_DATA(_iMinIdxOfsMask_UISA)(%rip) {1to16}, %zmm1, %zmm2
> > >
> > > -/*
> > > - *  small table specific variables *
> > > - *  Constant loading
> > > - */
> > > -        vpxord    %zmm5, %zmm5, %zmm5
> > > -
> > > -/* if VMIN, VMAX is defined for I type */
> > > -        vpmaxsd   %zmm5, %zmm4, %zmm6
> > > -        vpminsd   _iMaxIdxMask_UISA+__svml_stanh_data_internal(%rip), %zmm6, %zmm7
> > > -        vpsrld    $21, %zmm7, %zmm10
> > > -        vmovups   _sP7+__svml_stanh_data_internal(%rip), %zmm4
> > > -        vpermt2ps _sC+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm9
> > > -        vpermt2ps _sP6+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm11
> > > -        vpermt2ps _sP7+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm4
> > > -        vpermt2ps _sP5+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm12
> > > -        vpermt2ps _sP4+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm13
> > > -        vpermt2ps _sP3+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm14
> > > -        vpermt2ps _sP2+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm15
> > > -        vpandnd   %zmm3, %zmm3, %zmm2{%k1}
> > > -        vptestmd  %zmm2, %zmm2, %k0
> > > -        vmovups   _sP0+__svml_stanh_data_internal(%rip), %zmm3
> > > -        vsubps    {rn-sae}, %zmm9, %zmm8, %zmm2
> > > -        kmovw     %k0, %edx
> > > -        vfmadd213ps {rn-sae}, %zmm11, %zmm2, %zmm4
> > > -        vpermt2ps _sP0+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm3
> > > -        vfmadd213ps {rn-sae}, %zmm12, %zmm2, %zmm4
> > > -        vfmadd213ps {rn-sae}, %zmm13, %zmm2, %zmm4
> > > -        vfmadd213ps {rn-sae}, %zmm14, %zmm2, %zmm4
> > > -        vfmadd213ps {rn-sae}, %zmm15, %zmm2, %zmm4
> > > -        vfmadd213ps {rn-sae}, %zmm3, %zmm2, %zmm4
> > > -        vorps     %zmm0, %zmm4, %zmm0
> > > -        testl     %edx, %edx
> > > -
> > > -/* Go to special inputs processing branch */
> > > -        jne       L(SPECIAL_VALUES_BRANCH)
> > > -                                # LOE rbx r12 r13 r14 r15 edx zmm0 zmm1
> > > -
> > > -/* Restore registers
> > > - * and exit the function
> > > - */
> > > +       /* Clamp arguments to the range [0, 0x03e00000] in zmm3.  */
> > > +       vpxord  %zmm3, %zmm3, %zmm3
> > > +       vpmaxsd %zmm3, %zmm2, %zmm3
> > > +       vpminsd TANHF_DATA(_iMaxIdxMask_UISA)(%rip) {1to16}, %zmm3, %zmm3
> > >
> > > -L(EXIT):
> > > -        movq      %rbp, %rsp
> > > -        popq      %rbp
> > > -        cfi_def_cfa(7, 8)
> > > -        cfi_restore(6)
> > > -        ret
> > > -        cfi_def_cfa(6, 16)
> > > -        cfi_offset(6, -16)
> > > -
> > > -/* Branch to process
> > > - * special inputs
> > > - */
> > > +       /* Setup permute indices in zmm3.  */
> > > +       vpsrld  $21, %zmm3, %zmm3
> > > +
> > > +       /* Store if there are any special cases in k1.  */
> > > +       vpcmpd  $6, TANHF_DATA(_iExpMask)(%rip) {1to16}, %zmm1, %k1
> > > +
> > > +
> > > +       /* Store absolute values of inputs in zmm1.  */
> > > +       vmovaps TANHF_DATA(_sSignMask)(%rip), %zmm4
> > > +       vandnps %zmm0, %zmm4, %zmm1
> > > +
> > > +       vmovaps TANHF_DATA(_sC_lo)(%rip), %zmm5
> > > +       vpermt2ps TANHF_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> > > +       vsubps  {rn-sae}, %zmm5, %zmm1, %zmm1
> > > +
> > > +       vmovaps TANHF_DATA(_sP7_lo)(%rip), %zmm2
> > > +       vpermt2ps TANHF_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
> > >
> > > +       vmovaps TANHF_DATA(_sP6_lo)(%rip), %zmm5
> > > +       vpermt2ps TANHF_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> > > +
> > > +       vmovaps TANHF_DATA(_sP5_lo)(%rip), %zmm6
> > > +       vpermt2ps TANHF_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> > > +
> > > +       vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm2
> > > +       vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm2
> > > +
> > > +       vmovaps TANHF_DATA(_sP4_lo)(%rip), %zmm7
> > > +       vpermt2ps TANHF_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> > > +
> > > +       vmovaps TANHF_DATA(_sP3_lo)(%rip), %zmm8
> > > +       vpermt2ps TANHF_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> > > +
> > > +       vfmadd213ps {rn-sae}, %zmm7, %zmm1, %zmm2
> > > +       vfmadd213ps {rn-sae}, %zmm8, %zmm1, %zmm2
> > > +
> > > +       vmovaps TANHF_DATA(_sP2_lo)(%rip), %zmm9
> > > +       vpermt2ps TANHF_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> > > +
> > > +       vmovaps TANHF_DATA(_sP0_lo)(%rip), %zmm10
> > > +       vpermt2ps TANHF_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> > > +
> > > +       vfmadd213ps {rn-sae}, %zmm9, %zmm1, %zmm2
> > > +       vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm2
> > > +
> > > +       kmovw   %k1, %edx
> > > +       testl   %edx, %edx
> > > +
> > > +       /* Go to special inputs processing branch.  */
> > > +       jne     L(SPECIAL_VALUES_BRANCH)
> > > +       /* Wait until after the branch to overwrite zmm0.  */
> > > +       vpternlogd $0xec, %zmm4, %zmm2, %zmm0
> > > +
> > > +       /* No stack restoration on the fastpath.  */
> > > +       ret
> > > +
> > > +       /* Branch to process special inputs.  */
> > >  L(SPECIAL_VALUES_BRANCH):
> > > -        vmovups   %zmm1, 64(%rsp)
> > > -        vmovups   %zmm0, 128(%rsp)
> > > -                                # LOE rbx r12 r13 r14 r15 edx zmm0
> > > -
> > > -        xorl      %eax, %eax
> > > -                                # LOE rbx r12 r13 r14 r15 eax edx
> > > -
> > > -        vzeroupper
> > > -        movq      %r12, 16(%rsp)
> > > -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> > > -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> > > -        movl      %eax, %r12d
> > > -        movq      %r13, 8(%rsp)
> > > -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> > > -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> > > -        movl      %edx, %r13d
> > > -        movq      %r14, (%rsp)
> > > -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> > > -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> > > -                                # LOE rbx r15 r12d r13d
> > > -
> > > -/* Range mask
> > > - * bits check
> > > - */
> > > +       pushq   %rbp
> > > +       /* Need to callee save registers to preserve state across tanhf calls.
> > > +        */
> > > +       pushq   %r13
> > > +       pushq   %r12
> > > +       movq    %rsp, %rbp
> > >
> > > -L(RANGEMASK_CHECK):
> > > -        btl       %r12d, %r13d
> > > +       /* Align stack and make room for 2x zmm vectors.  */
> > > +       andq    $-64, %rsp
> > > +       addq    $-128, %rsp
> > >
> > > -/* Call scalar math function */
> > > -        jc        L(SCALAR_MATH_CALL)
> > > -                                # LOE rbx r15 r12d r13d
> > > +       /* Save all already computed inputs.  */
> > > +       vpternlogd $0xec, %zmm4, %zmm2, %zmm2
> > > +       vmovaps %zmm2, (%rsp)
> > > +       /* Save original input (zmm0 unchanged up to this point).  */
> > > +       vmovaps %zmm0, 64(%rsp)
> > >
> > > -/* Special inputs
> > > - * processing loop
> > > - */
> > > +       vzeroupper
> > >
> > > +       /* edx has 1s where there was a special value that needs to be handled
> > > +          by a tanhf call.  */
> > > +       movl    %edx, %r13d
> > >  L(SPECIAL_VALUES_LOOP):
> > > -        incl      %r12d
> > > -        cmpl      $16, %r12d
> > > -
> > > -/* Check bits in range mask */
> > > -        jl        L(RANGEMASK_CHECK)
> > > -                                # LOE rbx r15 r12d r13d
> > > -
> > > -        movq      16(%rsp), %r12
> > > -        cfi_restore(12)
> > > -        movq      8(%rsp), %r13
> > > -        cfi_restore(13)
> > > -        movq      (%rsp), %r14
> > > -        cfi_restore(14)
> > > -        vmovups   128(%rsp), %zmm0
> > > -
> > > -/* Go to exit */
> > > -        jmp       L(EXIT)
> > > -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> > > -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> > > -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> > > -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> > > -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> > > -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> > > -                                # LOE rbx r12 r13 r14 r15 zmm0
> > > -
> > > -/* Scalar math fucntion call
> > > - * to process special input
> > > - */
> > > +       /* use r12 as index for special value that is saved across calls to
> > > +          tanhf. We technically don't need a callee save register here as offset
> > > +          to rsp is always [0, 56] so we can restore rsp by realigning to 64.
> > > +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> > > +          in the loop.  */
> > > +       xorl    %r12d, %r12d
> > > +       tzcntl  %r13d, %r12d
> > >
> > > -L(SCALAR_MATH_CALL):
> > > -        movl      %r12d, %r14d
> > > -        movss     64(%rsp,%r14,4), %xmm0
> > > -        call      tanhf@PLT
> > > -                                # LOE rbx r14 r15 r12d r13d xmm0
> > > +       /* Scalar math function call to process special input.  */
> > > +       movss   64(%rsp, %r12, 4), %xmm0
> > > +       call    tanhf@PLT
> > >
> > > -        movss     %xmm0, 128(%rsp,%r14,4)
> > > +       /* No good way to avoid the store-forwarding fault this will cause on
> > > +          return. `lfence` avoids the SF fault but at greater cost as it
> > > +          serializes stack/callee save restoration.  */
> > > +       movss   %xmm0, (%rsp, %r12, 4)
> > >
> > > -/* Process special inputs in loop */
> > > -        jmp       L(SPECIAL_VALUES_LOOP)
> > > -                                # LOE rbx r15 r12d r13d
> > > -END(_ZGVeN16v_tanhf_skx)
> > > +       blsr    %r13d, %r13d
> > > +       jnz     L(SPECIAL_VALUES_LOOP)
> > >
> > > -        .section .rodata, "a"
> > > -        .align 64
> > > +       /* All results have been written to (%rsp).  */
> > > +       vmovaps (%rsp), %zmm0
> > > +       /* Restore rsp.  */
> > > +       movq    %rbp, %rsp
> > > +       /* Restore callee save registers.  */
> > > +       popq    %r12
> > > +       popq    %r13
> > > +       popq    %rbp
> > > +       ret
> > > +END(_ZGVeN16v_tanhf_skx)
> > >
> > > +       .section .rodata, "a"
> > > +       .align  16
> > >  #ifdef __svml_stanh_data_internal_typedef
> > > -typedef unsigned int VUINT32;
> > > -typedef struct
> > > -{
> > > -        __declspec(align(64)) VUINT32 _sC[32][1];
> > > -        __declspec(align(64)) VUINT32 _sP0[32][1];
> > > -        __declspec(align(64)) VUINT32 _sP2[32][1];
> > > -        __declspec(align(64)) VUINT32 _sP3[32][1];
> > > -        __declspec(align(64)) VUINT32 _sP4[32][1];
> > > -        __declspec(align(64)) VUINT32 _sP5[32][1];
> > > -        __declspec(align(64)) VUINT32 _sP6[32][1];
> > > -        __declspec(align(64)) VUINT32 _sP7[32][1];
> > > -        __declspec(align(64)) VUINT32 _iExpMantMask_UISA[16][1];
> > > -        __declspec(align(64)) VUINT32 _iMinIdxOfsMask_UISA[16][1];
> > > -        __declspec(align(64)) VUINT32 _iMaxIdxMask_UISA[16][1];
> > > -        __declspec(align(64)) VUINT32 _sSignMask[16][1];
> > > -        __declspec(align(64)) VUINT32 _sAbsMask[16][1];
> > > -        __declspec(align(64)) VUINT32 _iExpMantMask[16][1];
> > > -        __declspec(align(64)) VUINT32 _iExpMask[16][1];
> > > -        __declspec(align(64)) VUINT32 _iMinIdxOfsMask[16][1];
> > > -        __declspec(align(64)) VUINT32 _iMaxIdxMask[16][1];
> > > -} __svml_stanh_data_internal;
> > > +       typedef unsigned int VUINT32;
> > > +       typedef struct
> > > +       {
> > > +       __declspec (align(4))VUINT32 _iExpMantMask_UISA[1][1];
> > > +       __declspec (align(4))VUINT32 _iMinIdxOfsMask_UISA[1][1];
> > > +       __declspec (align(4))VUINT32 _iMaxIdxMask_UISA[1][1];
> > > +       __declspec (align(4))VUINT32 _iExpMask[1][1];
> > > +       __declspec (align(64))VUINT32 _sSignMask[16][1];
> > > +       __declspec (align(64))VUINT32 _sC_lo[16][1];
> > > +       __declspec (align(64))VUINT32 _sC_hi[16][1];
> > > +       __declspec (align(64))VUINT32 _sP7_lo[16][1];
> > > +       __declspec (align(64))VUINT32 _sP7_hi[16][1];
> > > +       __declspec (align(64))VUINT32 _sP6_lo[16][1];
> > > +       __declspec (align(64))VUINT32 _sP6_hi[16][1];
> > > +       __declspec (align(64))VUINT32 _sP5_lo[16][1];
> > > +       __declspec (align(64))VUINT32 _sP5_hi[16][1];
> > > +       __declspec (align(64))VUINT32 _sP4_lo[16][1];
> > > +       __declspec (align(64))VUINT32 _sP4_hi[16][1];
> > > +       __declspec (align(64))VUINT32 _sP3_lo[16][1];
> > > +       __declspec (align(64))VUINT32 _sP3_hi[16][1];
> > > +       __declspec (align(64))VUINT32 _sP2_lo[16][1];
> > > +       __declspec (align(64))VUINT32 _sP2_hi[16][1];
> > > +       __declspec (align(64))VUINT32 _sP0_lo[16][1];
> > > +       __declspec (align(64))VUINT32 _sP0_hi[16][1];
> > > +       }__svml_stanh_data_internal;
> > >  #endif
> > > +
> > >  __svml_stanh_data_internal:
> > > -        /*== _sC ==*/
> > > -        .long 0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
> > > -        .long 0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
> > > -        .long 0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
> > > -        .long 0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
> > > -        .long 0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
> > > -        .long 0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
> > > -        .long 0x40500000, 0x40700000, 0x40900000, 0x40b00000
> > > -        .long 0x40d00000, 0x40f00000, 0x41100000, 0x00000000
> > > -        /*== p0 ==*/
> > > -        .align 64
> > > -        .long 0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
> > > -        .long 0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
> > > -        .long 0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
> > > -        .long 0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
> > > -        .long 0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
> > > -        .long 0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
> > > -        .long 0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
> > > -        .long 0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
> > > -        /*== p2 ==*/
> > > -        .align 64
> > > -        .long 0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
> > > -        .long 0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
> > > -        .long 0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
> > > -        .long 0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
> > > -        .long 0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
> > > -        .long 0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
> > > -        .long 0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
> > > -        .long 0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
> > > -        /*== p3 ==*/
> > > -        .align 64
> > > -        .long 0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
> > > -        .long 0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
> > > -        .long 0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
> > > -        .long 0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
> > > -        .long 0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
> > > -        .long 0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
> > > -        .long 0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
> > > -        .long 0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
> > > -        /*== p4 ==*/
> > > -        .align 64
> > > -        .long 0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
> > > -        .long 0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
> > > -        .long 0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
> > > -        .long 0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
> > > -        .long 0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
> > > -        .long 0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
> > > -        .long 0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
> > > -        .long 0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
> > > -        /*== p5 ==*/
> > > -        .align 64
> > > -        .long 0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
> > > -        .long 0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
> > > -        .long 0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
> > > -        .long 0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
> > > -        .long 0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
> > > -        .long 0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
> > > -        .long 0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
> > > -        .long 0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
> > > -        /*== p6 ==*/
> > > -        .align 64
> > > -        .long 0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
> > > -        .long 0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
> > > -        .long 0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
> > > -        .long 0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
> > > -        .long 0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
> > > -        .long 0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
> > > -        .long 0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
> > > -        .long 0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
> > > -        /*== p7 ==*/
> > > -        .align 64
> > > -        .long 0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
> > > -        .long 0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
> > > -        .long 0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
> > > -        .long 0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
> > > -        .long 0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
> > > -        .long 0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
> > > -        .long 0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
> > > -        .long 0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
> > > -        .align 64
> > > -        .long 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000           /* _iExpMantMask_UISA     */
> > > -        .align 64
> > > -        .long 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000           /* _iMinIdxOfsMask_UISA   */
> > > -        .align 64
> > > -        .long 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000           /* _iMaxIdxMask_UISA      */
> > > -        .align 64
> > > -        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000           /* _sSignMask        */
> > > -        .align 64
> > > -        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff           /* _sAbsMask         */
> > > -        .align 64
> > > -        .long 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000           /* _iExpMantMask     */
> > > -        .align 64
> > > -        .long 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000           /* _iExpMask         */
> > > -        .align 64
> > > -        .long 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000           /* _iMinIdxOfsMask   */
> > > -        .align 64
> > > -        .long 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000           /* _iMaxIdxMask      */
> > > -        .align 64
> > > -        .type  __svml_stanh_data_internal,@object
> > > -        .size  __svml_stanh_data_internal,.-__svml_stanh_data_internal
> > > +       .align  4
> > > +       /* _iExpMantMask_UISA.  */
> > > +       .long   0x7fe00000
> > > +
> > > +       .align  4
> > > +       /* _iMinIdxOfsMask_UISA.  */
> > > +       .long   0x3d400000
> > > +
> > > +       .align  4
> > > +       /* _iMaxIdxMask_UISA.  */
> > > +       .long   0x03e00000
> > > +
> > > +       .align  4
> > > +       /* _iExpMask.  */
> > > +       .long   0x7f000000
> > > +
> > > +       .align  64
> > > +       /* _sSignMask.  */
> > > +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > +
> > > +       .align  64
> > > +       /* _sC_lo.  */
> > > +       .long   0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
> > > +       .long   0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
> > > +       .long   0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
> > > +       .long   0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
> > > +
> > > +       .align  64
> > > +       /* _sC_hi.  */
> > > +       .long   0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
> > > +       .long   0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
> > > +       .long   0x40500000, 0x40700000, 0x40900000, 0x40b00000
> > > +       .long   0x40d00000, 0x40f00000, 0x41100000, 0x00000000
> > > +
> > > +       .align  64
> > > +       /* _sP7_lo.  */
> > > +       .long   0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
> > > +       .long   0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
> > > +       .long   0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
> > > +       .long   0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
> > > +
> > > +       .align  64
> > > +       /* _sP7_hi.  */
> > > +       .long   0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
> > > +       .long   0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
> > > +       .long   0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
> > > +       .long   0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
> > > +
> > > +       .align  64
> > > +       /* _sP6_lo.  */
> > > +       .long   0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
> > > +       .long   0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
> > > +       .long   0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
> > > +       .long   0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
> > > +
> > > +       .align  64
> > > +       /* _sP6_hi.  */
> > > +       .long   0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
> > > +       .long   0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
> > > +       .long   0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
> > > +       .long   0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
> > > +
> > > +       .align  64
> > > +       /* _sP5_lo.  */
> > > +       .long   0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
> > > +       .long   0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
> > > +       .long   0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
> > > +       .long   0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
> > > +
> > > +       .align  64
> > > +       /* _sP5_hi.  */
> > > +       .long   0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
> > > +       .long   0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
> > > +       .long   0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
> > > +       .long   0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
> > > +
> > > +       .align  64
> > > +       /* _sP4_lo.  */
> > > +       .long   0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
> > > +       .long   0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
> > > +       .long   0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
> > > +       .long   0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
> > > +
> > > +       .align  64
> > > +       /* _sP4_hi.  */
> > > +       .long   0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
> > > +       .long   0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
> > > +       .long   0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
> > > +       .long   0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
> > > +
> > > +       .align  64
> > > +       /* _sP3_lo.  */
> > > +       .long   0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
> > > +       .long   0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
> > > +       .long   0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
> > > +       .long   0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
> > > +
> > > +       .align  64
> > > +       /* _sP3_hi.  */
> > > +       .long   0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
> > > +       .long   0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
> > > +       .long   0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
> > > +       .long   0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
> > > +
> > > +       .align  64
> > > +       /* _sP2_lo.  */
> > > +       .long   0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
> > > +       .long   0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
> > > +       .long   0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
> > > +       .long   0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
> > > +
> > > +       .align  64
> > > +       /* _sP2_hi.  */
> > > +       .long   0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
> > > +       .long   0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
> > > +       .long   0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
> > > +       .long   0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
> > > +
> > > +       .align  64
> > > +       /* _sP0_lo.  */
> > > +       .long   0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
> > > +       .long   0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
> > > +       .long   0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
> > > +       .long   0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
> > > +
> > > +       .align  64
> > > +       /* _sP0_hi.  */
> > > +       .long   0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
> > > +       .long   0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
> > > +       .long   0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
> > > +       .long   0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
> > > +
> > > +       .align  64
> > > +       .type   __svml_stanh_data_internal, @object
> > > +       .size   __svml_stanh_data_internal, .-__svml_stanh_data_internal
> > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
> > > index 50f753ffb3..716b06d640 100644
> > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
> > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
> > > @@ -70,763 +70,154 @@
> > >   *
> > >   */
> > >
> > > -/* Offsets for data table __svml_stanh_data_internal
> > > - */
> > > -#define _dbP                           0
> > > -#define _sSignMask                     4288
> > > -#define _sAbsMask                      4304
> > > -#define _iExpMantMask                  4320
> > > -#define _iExpMask                      4336
> > > -#define _iMinIdxOfsMask                4352
> > > -#define _iMaxIdxMask                   4368
> > >
> > >  #include <sysdep.h>
> > >
> > > +#define ONLY_DECL_OFFSET
> > > +#include "svml_s_tanhf_rodata.S"
> > > +
> > >          .text
> > >         .section .text.sse4,"ax",@progbits
> > >  ENTRY(_ZGVbN4v_tanhf_sse4)
> > > -        subq      $72, %rsp
> > > -        cfi_def_cfa_offset(80)
> > > -        movaps    %xmm0, %xmm5
> > > +       /* Save copy of input in xmm12.  */
> > > +       movaps  %xmm0, %xmm12
> > >
> > > -/* Here huge arguments, INF and NaNs are filtered out to callout. */
> > > -        movdqu    _iExpMantMask+__svml_stanh_data_internal(%rip), %xmm9
> > > -        lea       _dbP+16+__svml_stanh_data_internal(%rip), %r8
> > > -        pand      %xmm5, %xmm9
> > > +       /* Here huge arguments, INF and NaNs are filtered out to callout.  */
> > > +       movdqu  TANHF_DATA(_iExpMantMask)(%rip), %xmm3
> > > +       pand    %xmm0, %xmm3
> > >
> > > -/* if VMIN, VMAX is defined for I type */
> > > -        pxor      %xmm7, %xmm7
> > > -        movdqa    %xmm9, %xmm6
> > > -        psubd     _iMinIdxOfsMask+__svml_stanh_data_internal(%rip), %xmm9
> > >
> > > -/*
> > > - *  small table specific variables *
> > > - *  Constant loading
> > > - */
> > > -        movdqu    _iMaxIdxMask+__svml_stanh_data_internal(%rip), %xmm10
> > > -        movdqa    %xmm9, %xmm11
> > > -        movdqa    %xmm9, %xmm8
> > > -        pcmpgtd   %xmm10, %xmm11
> > > -        pcmpgtd   %xmm7, %xmm8
> > > -        movdqa    %xmm11, %xmm14
> > > -        pand      %xmm8, %xmm9
> > > -        andps     %xmm11, %xmm10
> > > -        andnps    %xmm9, %xmm14
> > > -        orps      %xmm10, %xmm14
> > > -        psrld     $14, %xmm14
> > > -        movd      %xmm14, %edx
> > > -        pshufd    $1, %xmm14, %xmm12
> > > -        pshufd    $2, %xmm14, %xmm13
> > > -        movd      %xmm12, %ecx
> > > -        pshufd    $3, %xmm14, %xmm15
> > > -        movups    _sAbsMask+__svml_stanh_data_internal(%rip), %xmm3
> > > -        movslq    %edx, %rdx
> > > -        andps     %xmm5, %xmm3
> > > -        movslq    %ecx, %rcx
> > > -        pcmpgtd   _iExpMask+__svml_stanh_data_internal(%rip), %xmm6
> > > -        movd      %xmm13, %esi
> > > -        movups    -16(%rdx,%r8), %xmm2
> > > -        movaps    %xmm2, %xmm0
> > > -        movd      %xmm15, %edi
> > > -        movmskps  %xmm6, %eax
> > > -        movups    -16(%rcx,%r8), %xmm6
> > > -        unpcklpd  %xmm6, %xmm0
> > > -        unpckhpd  %xmm6, %xmm2
> > > -        cvtps2pd  %xmm3, %xmm6
> > > -        movhlps   %xmm3, %xmm3
> > > -        cvtps2pd  %xmm3, %xmm3
> > > -        movslq    %esi, %rsi
> > > -        movslq    %edi, %rdi
> > > -        movups    (%rcx,%r8), %xmm8
> > > -        movups    (%rdx,%r8), %xmm12
> > > -        movups    (%rsi,%r8), %xmm13
> > > -        movaps    %xmm12, %xmm10
> > > -        movups    (%rdi,%r8), %xmm9
> > > -        movaps    %xmm13, %xmm11
> > > -        unpckhpd  %xmm8, %xmm12
> > > -        unpckhpd  %xmm9, %xmm13
> > > -        mulpd     %xmm6, %xmm12
> > > -        mulpd     %xmm3, %xmm13
> > > -        unpcklpd  %xmm8, %xmm10
> > > -        unpcklpd  %xmm9, %xmm11
> > > -        addpd     %xmm10, %xmm12
> > > -        addpd     %xmm11, %xmm13
> > > -        mulpd     %xmm6, %xmm12
> > > -        mulpd     %xmm3, %xmm13
> > > -        addpd     %xmm2, %xmm12
> > > -        movups    -16(%rsi,%r8), %xmm1
> > > -        movups    -16(%rdi,%r8), %xmm7
> > > -        movaps    %xmm1, %xmm14
> > > -        unpckhpd  %xmm7, %xmm1
> > > -        addpd     %xmm1, %xmm13
> > > -        mulpd     %xmm12, %xmm6
> > > -        mulpd     %xmm13, %xmm3
> > > -        addpd     %xmm0, %xmm6
> > > -        unpcklpd  %xmm7, %xmm14
> > > -        addpd     %xmm14, %xmm3
> > > -        cvtpd2ps  %xmm6, %xmm0
> > > -        cvtpd2ps  %xmm3, %xmm1
> > > -        movups    _sSignMask+__svml_stanh_data_internal(%rip), %xmm4
> > > -        movlhps   %xmm1, %xmm0
> > > -        andps     %xmm5, %xmm4
> > > -        orps      %xmm4, %xmm0
> > > -        testl     %eax, %eax
> > > -
> > > -/* Go to special inputs processing branch */
> > > -        jne       L(SPECIAL_VALUES_BRANCH)
> > > -                                # LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm5
> > > -
> > > -/* Restore registers
> > > - * and exit the function
> > > - */
> > > +       /* Selection of arguments between [0, 0x04280000] into xmm3.  */
> > > +       pxor    %xmm7, %xmm7
> > > +       /* Save xmm3 for special values check at end.  */
> > > +       movdqa  %xmm3, %xmm8
> > > +       psubd   TANHF_DATA(_iMinIdxOfsMask)(%rip), %xmm3
> > > +       pmaxsd  %xmm7, %xmm3
> > > +       pminsd  TANHF_DATA(_iMaxIdxMask)(%rip), %xmm3
> > > +       psrld   $14, %xmm3
> > >
> > > -L(EXIT):
> > > -        addq      $72, %rsp
> > > -        cfi_def_cfa_offset(8)
> > > -        ret
> > > -        cfi_def_cfa_offset(80)
> > > +       movq    %xmm3, %rcx
> > > +       movl    %ecx, %edx
> > > +       shrq    $32, %rcx
> > >
> > > -/* Branch to process
> > > - * special inputs
> > > - */
> > > +       /* xmm8 contains mask of special values.  */
> > > +       pcmpgtd TANHF_DATA(_iExpMask)(%rip), %xmm8
> > >
> > > -L(SPECIAL_VALUES_BRANCH):
> > > -        movups    %xmm5, 32(%rsp)
> > > -        movups    %xmm0, 48(%rsp)
> > > -                                # LOE rbx rbp r12 r13 r14 r15 eax
> > > -
> > > -        xorl      %edx, %edx
> > > -        movq      %r12, 16(%rsp)
> > > -        cfi_offset(12, -64)
> > > -        movl      %edx, %r12d
> > > -        movq      %r13, 8(%rsp)
> > > -        cfi_offset(13, -72)
> > > -        movl      %eax, %r13d
> > > -        movq      %r14, (%rsp)
> > > -        cfi_offset(14, -80)
> > > -                                # LOE rbx rbp r15 r12d r13d
> > > -
> > > -/* Range mask
> > > - * bits check
> > > - */
> > > +       pshufd  $0x0e, %xmm3, %xmm3
> > > +       movq    %xmm3, %rdi
> > > +       movl    %edi, %esi
> > > +       shrq    $32, %rdi
> > >
> > > -L(RANGEMASK_CHECK):
> > > -        btl       %r12d, %r13d
> > > +       movaps  TANHF_DATA(_sAbsMask)(%rip), %xmm1
> > > +       andps   %xmm1, %xmm0
> > >
> > > -/* Call scalar math function */
> > > -        jc        L(SCALAR_MATH_CALL)
> > > -                                # LOE rbx rbp r15 r12d r13d
> > > +       leaq    TANHF_DATA(_lookupTable)(%rip), %rax
> > > +       movups  (%rdx, %rax), %xmm2
> > > +       movups  (%rcx, %rax), %xmm6
> > >
> > > -/* Special inputs
> > > - * processing loop
> > > - */
> > > +       movaps  %xmm2, %xmm4
> > > +       movlhps %xmm6, %xmm4
> > > +       unpckhpd %xmm6, %xmm2
> > >
> > > -L(SPECIAL_VALUES_LOOP):
> > > -        incl      %r12d
> > > -        cmpl      $4, %r12d
> > > -
> > > -/* Check bits in range mask */
> > > -        jl        L(RANGEMASK_CHECK)
> > > -                                # LOE rbx rbp r15 r12d r13d
> > > -
> > > -        movq      16(%rsp), %r12
> > > -        cfi_restore(12)
> > > -        movq      8(%rsp), %r13
> > > -        cfi_restore(13)
> > > -        movq      (%rsp), %r14
> > > -        cfi_restore(14)
> > > -        movups    48(%rsp), %xmm0
> > > -
> > > -/* Go to exit */
> > > -        jmp       L(EXIT)
> > > -        cfi_offset(12, -64)
> > > -        cfi_offset(13, -72)
> > > -        cfi_offset(14, -80)
> > > -                                # LOE rbx rbp r12 r13 r14 r15 xmm0
> > > -
> > > -/* Scalar math fucntion call
> > > - * to process special input
> > > - */
> > > +       cvtps2pd %xmm0, %xmm6
> > > +       movhlps %xmm0, %xmm0
> > > +       cvtps2pd %xmm0, %xmm0
> > >
> > > -L(SCALAR_MATH_CALL):
> > > -        movl      %r12d, %r14d
> > > -        movss     32(%rsp,%r14,4), %xmm0
> > > -        call      tanhf@PLT
> > > -                                # LOE rbx rbp r14 r15 r12d r13d xmm0
> > > +       movups  16(%rdx, %rax), %xmm5
> > > +       movups  16(%rsi, %rax), %xmm13
> > >
> > > -        movss     %xmm0, 48(%rsp,%r14,4)
> > > +       movaps  %xmm5, %xmm10
> > > +       movaps  %xmm13, %xmm11
> > >
> > > -/* Process special inputs in loop */
> > > -        jmp       L(SPECIAL_VALUES_LOOP)
> > > -                                # LOE rbx rbp r15 r12d r13d
> > > -END(_ZGVbN4v_tanhf_sse4)
> > > +       movups  16(%rcx, %rax), %xmm7
> > > +       movups  16(%rdi, %rax), %xmm3
> > > +
> > > +       unpckhpd %xmm7, %xmm5
> > > +       unpckhpd %xmm3, %xmm13
> > > +
> > > +       mulpd   %xmm6, %xmm5
> > > +       mulpd   %xmm0, %xmm13
> > > +
> > > +       movlhps %xmm7, %xmm10
> > > +       movlhps %xmm3, %xmm11
> > > +
> > > +       addpd   %xmm10, %xmm5
> > > +       addpd   %xmm11, %xmm13
> > > +
> > > +       mulpd   %xmm6, %xmm5
> > > +       mulpd   %xmm0, %xmm13
> > > +
> > > +       addpd   %xmm2, %xmm5
> > >
> > > -        .section .rodata, "a"
> > > -        .align 16
> > > -
> > > -#ifdef __svml_stanh_data_internal_typedef
> > > -typedef unsigned int VUINT32;
> > > -typedef struct
> > > -{
> > > -        __declspec(align(16)) VUINT32 _dbP[(134*4)][2];
> > > -        __declspec(align(16)) VUINT32 _sSignMask[4][1];
> > > -        __declspec(align(16)) VUINT32 _sAbsMask[4][1];
> > > -        __declspec(align(16)) VUINT32 _iExpMantMask[4][1];
> > > -        __declspec(align(16)) VUINT32 _iExpMask[4][1];
> > > -        __declspec(align(16)) VUINT32 _iMinIdxOfsMask[4][1];
> > > -        __declspec(align(16)) VUINT32 _iMaxIdxMask[4][1];
> > > -} __svml_stanh_data_internal;
> > > -#endif
> > > -__svml_stanh_data_internal:
> > > -        /* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500]. */
> > > -        .quad 0x0000000000000000  /* A00 = +0.000000000000000000000e-01 */
> > > -        .quad 0x3FF00000022C70EB  /* A01 = +1.000000008097283510367e+00 */
> > > -        .quad 0xBED00E878CFFA194  /* A02 = -3.828228912518614443549e-06 */
> > > -        .quad 0xBFD551766D0607A9  /* A03 = -3.330970825846813476723e-01 */
> > > -        .quad 0xBE53D60CE3E4C297  /* A00 = -1.847383956330407336230e-08 */
> > > -        .quad 0x3FF000024177CF5C  /* A01 = +1.000002151235967140508e+00 */
> > > -        .quad 0xBF1758BC94A51A25  /* A02 = -8.906031613262943753568e-05 */
> > > -        .quad 0xBFD53EAE67E0D4F0  /* A03 = -3.319507612644221339337e-01 */
> > > -        .quad 0xBE5A9E47EF32D6FE  /* A00 = -2.479020984039698285657e-08 */
> > > -        .quad 0x3FF00002DA983057  /* A01 = +1.000002721676556793895e+00 */
> > > -        .quad 0xBF1BD953509E94AA  /* A02 = -1.062352277175377670507e-04 */
> > > -        .quad 0xBFD53BDB562EEDD5  /* A03 = -3.317783681520414806876e-01 */
> > > -        .quad 0xBE6191BBE496D294  /* A00 = -3.272532162914017685901e-08 */
> > > -        .quad 0x3FF0000390492017  /* A01 = +1.000003398528866105366e+00 */
> > > -        .quad 0xBF20727E814A57CE  /* A02 = -1.254825043772153972919e-04 */
> > > -        .quad 0xBFD538DE060A6F22  /* A03 = -3.315959033004550748913e-01 */
> > > -        .quad 0xBE66DAFA2A893A25  /* A00 = -4.257146219278012568149e-08 */
> > > -        .quad 0x3FF0000465E08CD1  /* A01 = +1.000004194219219266770e+00 */
> > > -        .quad 0xBF2341C765EF91B6  /* A02 = -1.469188600530365522261e-04 */
> > > -        .quad 0xBFD535B6841FAF9E  /* A03 = -3.314033785124993469751e-01 */
> > > -        .quad 0xBE6D5794E361E964  /* A00 = -5.465394929765249413434e-08 */
> > > -        .quad 0x3FF000055EE2A0CB  /* A01 = +1.000005121846742950353e+00 */
> > > -        .quad 0xBF265E6C77E66C8B  /* A02 = -1.706607253709506650304e-04 */
> > > -        .quad 0xBFD53264DDCCEDA6  /* A03 = -3.312008062382240103361e-01 */
> > > -        .quad 0xBE729C844D374A6E  /* A00 = -6.933284462462096107184e-08 */
> > > -        .quad 0x3FF000067F019093  /* A01 = +1.000006195180536350264e+00 */
> > > -        .quad 0xBF29CC5348D6DCE5  /* A02 = -1.968242326435338705130e-04 */
> > > -        .quad 0xBFD52EE92121ED35  /* A03 = -3.309881995734998416658e-01 */
> > > -        .quad 0xBE775AEA17EAA872  /* A00 = -8.700465590574974405858e-08 */
> > > -        .quad 0x3FF00007CA1D66B8  /* A01 = +1.000007428656699559610e+00 */
> > > -        .quad 0xBF2D8F5EB98A2637  /* A02 = -2.255252009216044881395e-04 */
> > > -        .quad 0xBFD52B435CDF9128  /* A03 = -3.307655722585587376727e-01 */
> > > -        .quad 0xBE7D04DA28C343F0  /* A00 = -1.081040272327705484794e-07 */
> > > -        .quad 0x3FF000094443CCF5  /* A01 = +1.000008837375216730337e+00 */
> > > -        .quad 0xBF30D5B76C947AE5  /* A02 = -2.568791210978817814332e-04 */
> > > -        .quad 0xBFD52773A0776FAD  /* A03 = -3.305329386764651045105e-01 */
> > > -        .quad 0xBE81DD77A12C51C7  /* A00 = -1.331054169875768625701e-07 */
> > > -        .quad 0x3FF0000AF1AFD2DA  /* A01 = +1.000010437096696680470e+00 */
> > > -        .quad 0xBF331230624C1680  /* A02 = -2.910011410651516805537e-04 */
> > > -        .quad 0xBFD52379FC0B61DF  /* A03 = -3.302903138515186909352e-01 */
> > > -        .quad 0xBE85D04EEEB3C435  /* A00 = -1.625247628488202841012e-07 */
> > > -        .quad 0x3FF0000CD6C9B1F2  /* A01 = +1.000012244238970726684e+00 */
> > > -        .quad 0xBF357F0742FADDD4  /* A02 = -3.280060509313874068243e-04 */
> > > -        .quad 0xBFD51F56806D0E81  /* A03 = -3.300377134475880880338e-01 */
> > > -        .quad 0xBE8A6E289B59681B  /* A00 = -1.969211333326924655065e-07 */
> > > -        .quad 0x3FF0000EF8268F72  /* A01 = +1.000014275873550406715e+00 */
> > > -        .quad 0xBF381E277A1B747A  /* A02 = -3.680082682942575423093e-04 */
> > > -        .quad 0xBFD51B093F1D6FD4  /* A03 = -3.297751537663746734808e-01 */
> > > -        .quad 0xBE8FCBC40EE9ABD5  /* A00 = -2.368983653301529373887e-07 */
> > > -        .quad 0x3FF000115A883B6C  /* A01 = +1.000016549721943981410e+00 */
> > > -        .quad 0xBF3AF17AC974B3D9  /* A02 = -4.111218235774406434303e-04 */
> > > -        .quad 0xBFD516924A4C549C  /* A03 = -3.295026517456081105450e-01 */
> > > -        .quad 0xBE92FFBC60A3F956  /* A00 = -2.831066871072026054144e-07 */
> > > -        .quad 0x3FF0001402DCED8A  /* A01 = +1.000019084151832604590e+00 */
> > > -        .quad 0xBF3DFAE9390C4801  /* A02 = -4.574603454311488280083e-04 */
> > > -        .quad 0xBFD511F1B4D7DC3A  /* A03 = -3.292202249571719585575e-01 */
> > > -        .quad 0xBE9690A22F96D5AD  /* A00 = -3.362443262393081632612e-07 */
> > > -        .quad 0x3FF00016F63EFF5D  /* A01 = +1.000021898173108825247e+00 */
> > > -        .quad 0xBF409E2C839605BB  /* A02 = -5.071370461992499986334e-04 */
> > > -        .quad 0xBFD50D27924BEE00  /* A03 = -3.289278916051614487515e-01 */
> > > -        .quad 0xBE9AA56C65E72A73  /* A00 = -3.970591019557469835586e-07 */
> > > -        .quad 0x3FF0001A39F4A43E  /* A01 = +1.000025011433776978009e+00 */
> > > -        .quad 0xBF425BD74C3D6667  /* A02 = -5.602647074553602319844e-04 */
> > > -        .quad 0xBFD50833F6E1ABA2  /* A03 = -3.286256705238718156536e-01 */
> > > -        .quad 0xBE9F4BD4FF1A83B0  /* A00 = -4.663500013744687071912e-07 */
> > > -        .quad 0x3FF0001DD36F9EC2  /* A01 = +1.000028444215715683896e+00 */
> > > -        .quad 0xBF44376634149405  /* A02 = -6.169556656102642569831e-04 */
> > > -        .quad 0xBFD50316F77EDEE5  /* A03 = -3.283135811757190158922e-01 */
> > > -        .quad 0xBEA3B625387BB079  /* A00 = -5.874486399249461304297e-07 */
> > > -        .quad 0x3FF00023E14CFBA9  /* A01 = +1.000034217911642153709e+00 */
> > > -        .quad 0xBF47392F923218D2  /* A02 = -7.087213783883111826306e-04 */
> > > -        .quad 0xBFD4FB1FACDEB938  /* A03 = -3.278273761924483942209e-01 */
> > > -        .quad 0xBEAA6E24F543500A  /* A00 = -7.876828740601738750574e-07 */
> > > -        .quad 0x3FF0002D5C6E8412  /* A01 = +1.000043259679163742959e+00 */
> > > -        .quad 0xBF4BAF02BD7FDD70  /* A02 = -8.448375110664940040861e-04 */
> > > -        .quad 0xBFD4EFEE6527A7DE  /* A03 = -3.271442401734229177279e-01 */
> > > -        .quad 0xBEB16E3EBE2157D0  /* A00 = -1.038947396133402500647e-06 */
> > > -        .quad 0x3FF00038990FEE2F  /* A01 = +1.000053975962952312884e+00 */
> > > -        .quad 0xBF50569481C574CB  /* A02 = -9.972048056490652716971e-04 */
> > > -        .quad 0xBFD4E419278DA2B4  /* A03 = -3.264220129263251113372e-01 */
> > > -        .quad 0xBEB6A7B6723165D4  /* A00 = -1.350350836279403750524e-06 */
> > > -        .quad 0x3FF00045CAB4158E  /* A01 = +1.000066558657042303793e+00 */
> > > -        .quad 0xBF531D7C9C849108  /* A02 = -1.166698160951775212202e-03 */
> > > -        .quad 0xBFD4D7A0BB33B152  /* A03 = -3.256608799117844954552e-01 */
> > > -        .quad 0xBEBD0EE2A8654AFD  /* A00 = -1.732000471561702711532e-06 */
> > > -        .quad 0x3FF00055276F18D6  /* A01 = +1.000081209219890521211e+00 */
> > > -        .quad 0xBF562FDBA3FB6C6C  /* A02 = -1.354183666925102939860e-03 */
> > > -        .quad 0xBFD4CA85F1B93DB2  /* A03 = -3.248610363561638125773e-01 */
> > > -        .quad 0xBEC269D4036A207E  /* A00 = -2.195047297096822741730e-06 */
> > > -        .quad 0x3FF00066E7DA6E4E  /* A01 = +1.000098138500919997540e+00 */
> > > -        .quad 0xBF5991499FC36B3A  /* A02 = -1.560518167983372759405e-03 */
> > > -        .quad 0xBFD4BCC9A72283D6  /* A03 = -3.240226871658341556426e-01 */
> > > -        .quad 0xBEC7154B6C09CFE1  /* A00 = -2.751729738565190291276e-06 */
> > > -        .quad 0x3FF0007B47086B80  /* A01 = +1.000117566559055148900e+00 */
> > > -        .quad 0xBF5D455433B4F8F4  /* A02 = -1.786548832412968197680e-03 */
> > > -        .quad 0xBFD4AE6CC1BFE145  /* A03 = -3.231460468373550942722e-01 */
> > > -        .quad 0xBECCA68CC64A0F8A  /* A00 = -3.415415948561670285790e-06 */
> > > -        .quad 0x3FF00092827742F7  /* A01 = +1.000139722473418535387e+00 */
> > > -        .quad 0xBF60A7BF15A527AF  /* A02 = -2.033112728132522705610e-03 */
> > > -        .quad 0xBFD49F703214084C  /* A03 = -3.222313393636155876010e-01 */
> > > -        .quad 0xBED19E68676B241B  /* A00 = -4.200644630977303616698e-06 */
> > > -        .quad 0x3FF000ACDA037B26  /* A01 = +1.000164844146362863597e+00 */
> > > -        .quad 0xBF62D99F836A02F8  /* A02 = -2.301036405072284102280e-03 */
> > > -        .quad 0xBFD48FD4F2B91B28  /* A03 = -3.212787981359945810311e-01 */
> > > -        .quad 0xBED57CF4B0C7AA54  /* A00 = -5.123164339408145209103e-06 */
> > > -        .quad 0x3FF000CA8FD9E1A1  /* A01 = +1.000193178099017865534e+00 */
> > > -        .quad 0xBF653A014548E686  /* A02 = -2.591135484433962181405e-03 */
> > > -        .quad 0xBFD47F9C0844B38F  /* A03 = -3.202886658426046806447e-01 */
> > > -        .quad 0xBEDA012B1B1A41E2  /* A00 = -6.199971197454598722328e-06 */
> > > -        .quad 0x3FF000EBE868FDF4  /* A01 = +1.000224979259539459520e+00 */
> > > -        .quad 0xBF67CA9427E0A544  /* A02 = -2.904214255086275467410e-03 */
> > > -        .quad 0xBFD46EC6812ADB37  /* A03 = -3.192611943626845749655e-01 */
> > > -        .quad 0xBEDF3EAC5BF12194  /* A00 = -7.449344990702664567927e-06 */
> > > -        .quad 0x3FF001112A520784  /* A01 = +1.000260510744255704196e+00 */
> > > -        .quad 0xBF6A8D01ABDA4DC4  /* A02 = -3.241065277345108255891e-03 */
> > > -        .quad 0xBFD45D55759FFA4A  /* A03 = -3.181966446572103146551e-01 */
> > > -        .quad 0xBEE2A541BC274267  /* A00 = -8.890883582164319970972e-06 */
> > > -        .quad 0x3FF0013A9E5961F2  /* A01 = +1.000300043631906721231e+00 */
> > > -        .quad 0xBF6D82ECD080C540  /* A02 = -3.602468994380686462264e-03 */
> > > -        .quad 0xBFD44B4A0779C0AD  /* A03 = -3.170952866557950611259e-01 */
> > > -        .quad 0xBEE61D97609A27F4  /* A00 = -1.054553560499505625520e-05 */
> > > -        .quad 0x3FF001688F56A3AF  /* A01 = +1.000343856731187974773e+00 */
> > > -        .quad 0xBF7056F8EFB683EC  /* A02 = -3.989193351487490407647e-03 */
> > > -        .quad 0xBFD438A5620F0F74  /* A03 = -3.159573991399533543500e-01 */
> > > -        .quad 0xBEEA145429EDD370  /* A00 = -1.243563138839952927732e-05 */
> > > -        .quad 0x3FF0019B4A242A67  /* A01 = +1.000392236341804297339e+00 */
> > > -        .quad 0xBF7207D31CA78D9B  /* A02 = -4.401993423445739288258e-03 */
> > > -        .quad 0xBFD42568BA16E7CD  /* A03 = -3.147832696228050619602e-01 */
> > > -        .quad 0xBEEE96370D52680F  /* A00 = -1.458491207477835326165e-05 */
> > > -        .quad 0x3FF001D31D8E4115  /* A01 = +1.000445476009251821736e+00 */
> > > -        .quad 0xBF73D4CC11EDC094  /* A02 = -4.841611050196221316400e-03 */
> > > -        .quad 0xBFD411954D8664E7  /* A03 = -3.135731942252974469021e-01 */
> > > -        .quad 0xBEF338C046215EF8  /* A00 = -1.833122622260562810219e-05 */
> > > -        .quad 0x3FF00230C32C2EC1  /* A01 = +1.000534784691737621998e+00 */
> > > -        .quad 0xBF76BD019BCC5DAF  /* A02 = -5.551344188254799492943e-03 */
> > > -        .quad 0xBFD3F2C7156DC21E  /* A03 = -3.116929730668135389848e-01 */
> > > -        .quad 0xBEF9B15EAE411EAE  /* A00 = -2.450261207822986676092e-05 */
> > > -        .quad 0x3FF002C2DF057A4D  /* A01 = +1.000674124886830940184e+00 */
> > > -        .quad 0xBF7B08CCD9AC1E30  /* A02 = -6.600189396301511801646e-03 */
> > > -        .quad 0xBFD3C7A7A114FED8  /* A03 = -3.090609620157755976777e-01 */
> > > -        .quad 0xBF00E36483C373B3  /* A00 = -3.221178528332122595812e-05 */
> > > -        .quad 0x3FF0036F419480D7  /* A01 = +1.000838524028997644777e+00 */
> > > -        .quad 0xBF7FD255D1777007  /* A02 = -7.768950679260206403087e-03 */
> > > -        .quad 0xBFD39A453911D6CE  /* A03 = -3.062909180947429588215e-01 */
> > > -        .quad 0xBF05DFA04DD12059  /* A00 = -4.172046622180685472624e-05 */
> > > -        .quad 0x3FF00438B2A03D8D  /* A01 = +1.001030633695197069599e+00 */
> > > -        .quad 0xBF828F8DBB4A9D10  /* A02 = -9.062869337255224921890e-03 */
> > > -        .quad 0xBFD36AAB704697D9  /* A03 = -3.033856007044711255993e-01 */
> > > -        .quad 0xBF0BF3E0C647DEFB  /* A00 = -5.331544597092331081714e-05 */
> > > -        .quad 0x3FF005221063D36D  /* A01 = +1.001253189109060359741e+00 */
> > > -        .quad 0xBF857A2CB3C96102  /* A02 = -1.048693584122917590862e-02 */
> > > -        .quad 0xBFD338E65BBB4FEC  /* A03 = -3.003478904549854444639e-01 */
> > > -        .quad 0xBF11A506ED7C9D31  /* A00 = -6.730894835681591541979e-05 */
> > > -        .quad 0x3FF0062E4D0EA92A  /* A01 = +1.001508999829250345925e+00 */
> > > -        .quad 0xBF88AB82C2761AF3  /* A02 = -1.204588085125866091241e-02 */
> > > -        .quad 0xBFD305028D6BD206  /* A03 = -2.971807843271395688234e-01 */
> > > -        .quad 0xBF1607C0922D9BF1  /* A00 = -8.403885708006799337092e-05 */
> > > -        .quad 0x3FF007606C341961  /* A01 = +1.001800940198869449560e+00 */
> > > -        .quad 0xBF8C25E6DA487BCF  /* A02 = -1.374416688582682892494e-02 */
> > > -        .quad 0xBFD2CF0D0EE8F7B5  /* A03 = -2.938873906713255768075e-01 */
> > > -        .quad 0xBF1B3A8480A0A16D  /* A00 = -1.038688061788578038307e-04 */
> > > -        .quad 0x3FF008BB802D02D6  /* A01 = +1.002131939589323561535e+00 */
> > > -        .quad 0xBF8FEB8AE99FD100  /* A02 = -1.558598065819483124983e-02 */
> > > -        .quad 0xBFD297135BD0911B  /* A03 = -2.904709240558688843059e-01 */
> > > -        .quad 0xBF20ABB9BDB75C65  /* A00 = -1.271881327357976163798e-04 */
> > > -        .quad 0x3FF00A42A76D8CD1  /* A01 = +1.002504972472525901495e+00 */
> > > -        .quad 0xBF91FF3D752BB9E6  /* A02 = -1.757522609380570560722e-02 */
> > > -        .quad 0xBFD25D235C1F88B4  /* A03 = -2.869346999779154305799e-01 */
> > > -        .quad 0xBF243D3254425461  /* A00 = -1.544116913733432829448e-04 */
> > > -        .quad 0x3FF00BF909D1795E  /* A01 = +1.002923048355647051011e+00 */
> > > -        .quad 0xBF94304E04D44942  /* A02 = -1.971551804042204897316e-02 */
> > > -        .quad 0xBFD2214B5E61CFA6  /* A03 = -2.832821294498394371075e-01 */
> > > -        .quad 0xBF286070011B61CE  /* A00 = -1.859795307186510085994e-04 */
> > > -        .quad 0x3FF00DE1D5E1627E  /* A01 = +1.003389201612804537689e+00 */
> > > -        .quad 0xBF9689D5F4163F59  /* A02 = -2.201017668045266231780e-02 */
> > > -        .quad 0xBFD1E39A11C3B42C  /* A03 = -2.795167134743816728104e-01 */
> > > -        .quad 0xBF2D250B366A79E8  /* A00 = -2.223564326486314902259e-04 */
> > > -        .quad 0x3FF010003E134001  /* A01 = +1.003906481248123094829e+00 */
> > > -        .quad 0xBF990C9FF91F6F81  /* A02 = -2.446222265267250853271e-02 */
> > > -        .quad 0xBFD1A41E80084CDC  /* A03 = -2.756420374218586655246e-01 */
> > > -        .quad 0xBF314DB5DDC2A30E  /* A00 = -2.640313157465248123865e-04 */
> > > -        .quad 0x3FF012577608921B  /* A01 = +1.004477940624503018441e+00 */
> > > -        .quad 0xBF9BB9626875B0C9  /* A02 = -2.707437288829409385849e-02 */
> > > -        .quad 0xBFD162E80768A9D0  /* A03 = -2.716617653228725615122e-01 */
> > > -        .quad 0xBF346A6133808864  /* A00 = -3.115165050094957730625e-04 */
> > > -        .quad 0x3FF014EAAFCC88A3  /* A01 = +1.005106627192198898157e+00 */
> > > -        .quad 0xBF9E90BEF9BF7419  /* A02 = -2.984903716411588595059e-02 */
> > > -        .quad 0xBFD12006545F7FAD  /* A03 = -2.675796340899932457269e-01 */
> > > -        .quad 0xBF37F180DC3848EA  /* A00 = -3.653468704395550778821e-04 */
> > > -        .quad 0x3FF017BD19147861  /* A01 = +1.005795572250939295955e+00 */
> > > -        .quad 0xBFA0C9A14C702E07  /* A02 = -3.278831537326359207851e-02 */
> > > -        .quad 0xBFD0DB895B650092  /* A03 = -2.633994476818851682154e-01 */
> > > -        .quad 0xBF3BEC6AAC6D7635  /* A00 = -4.260788377246944457107e-04 */
> > > -        .quad 0x3FF01AD1D884E719  /* A01 = +1.006547780778822565040e+00 */
> > > -        .quad 0xBFA260B2A1B1434A  /* A02 = -3.589399551186163439542e-02 */
> > > -        .quad 0xBFD09581529E93D6  /* A03 = -2.591250712233067465817e-01 */
> > > -        .quad 0xBF4164E26167882B  /* A00 = -5.308251737086202562063e-04 */
> > > -        .quad 0x3FF01FEF14B62B81  /* A01 = +1.007796364693348545316e+00 */
> > > -        .quad 0xBFA4EB014538AA42  /* A02 = -4.085544557559163403315e-02 */
> > > -        .quad 0xBFD029D36FEAF41F  /* A03 = -2.525528519580024222613e-01 */
> > > -        .quad 0xBF46F6FFF4E53DC8  /* A00 = -7.008313930700277652464e-04 */
> > > -        .quad 0x3FF027CBB51CBBA0  /* A01 = +1.009715754956893363214e+00 */
> > > -        .quad 0xBFA89DEC9FEC112E  /* A02 = -4.807986690687680864098e-02 */
> > > -        .quad 0xBFCF2A99464D0DB4  /* A03 = -2.434875100390009317053e-01 */
> > > -        .quad 0xBF4DCC9C4F66A4D9  /* A00 = -9.094012482836712945103e-04 */
> > > -        .quad 0x3FF030E7CFCCD583  /* A01 = +1.011939822882909068014e+00 */
> > > -        .quad 0xBFACAA3B95814081  /* A02 = -5.598627281199331645611e-02 */
> > > -        .quad 0xBFCDF78F156BE7CF  /* A03 = -2.341173987004467604844e-01 */
> > > -        .quad 0xBF5308ED74E5C7A6  /* A00 = -1.161796466103906435435e-03 */
> > > -        .quad 0x3FF03B5986412ECB  /* A01 = +1.014489674026594512313e+00 */
> > > -        .quad 0xBFB087EBA88DCC3F  /* A02 = -6.457398285947223148806e-02 */
> > > -        .quad 0xBFCCBB9BD134862F  /* A03 = -2.244753619680052991736e-01 */
> > > -        .quad 0xBF57FA23C00DF4B5  /* A00 = -1.463446533505758208674e-03 */
> > > -        .quad 0x3FF0473558A1BCC0  /* A01 = +1.017384859292903342975e+00 */
> > > -        .quad 0xBFB2E702BC6360EF  /* A02 = -7.383744334527241048871e-02 */
> > > -        .quad 0xBFCB77D546379288  /* A03 = -2.145945160729250122955e-01 */
> > > -        .quad 0xBF5DD12971557F71  /* A00 = -1.819887610814388068450e-03 */
> > > -        .quad 0x3FF0548DDF5000A8  /* A01 = +1.020643112482540360020e+00 */
> > > -        .quad 0xBFB571B63DA186E1  /* A02 = -8.376635555898871710045e-02 */
> > > -        .quad 0xBFCA2D5202605148  /* A03 = -2.045080672838912594358e-01 */
> > > -        .quad 0xBF6252B1AD5D4F17  /* A00 = -2.236697221556737096709e-03 */
> > > -        .quad 0x3FF063738A910BF7  /* A01 = +1.024280110622155737232e+00 */
> > > -        .quad 0xBFB8270C8E6B601B  /* A02 = -9.434584118878357184013e-02 */
> > > -        .quad 0xBFC8DD27D950A07E  /* A03 = -1.942491351230763441116e-01 */
> > > -        .quad 0xBF66470C91730CFC  /* A00 = -2.719425723258004842786e-03 */
> > > -        .quad 0x3FF073F468FCF331  /* A01 = +1.028309259519300633556e+00 */
> > > -        .quad 0xBFBB05C2952191E4  /* A02 = -1.055566419686964629854e-01 */
> > > -        .quad 0xBFC7886A770DE2BD  /* A03 = -1.838505822486435070662e-01 */
> > > -        .quad 0xBF6AD114AC8E98EC  /* A00 = -3.273525599485007861467e-03 */
> > > -        .quad 0x3FF0861BF53E5226  /* A01 = +1.032741506559554434119e+00 */
> > > -        .quad 0xBFBE0C4F9B461507  /* A02 = -1.173753503881763554650e-01 */
> > > -        .quad 0xBFC6302A037CDE3A  /* A03 = -1.733448521642786954722e-01 */
> > > -        .quad 0xBF6FFBDE2A6C2AF8  /* A00 = -3.904279630096648551207e-03 */
> > > -        .quad 0x3FF099F2EB8E7DA3  /* A01 = +1.037585182326304034106e+00 */
> > > -        .quad 0xBFC09C74D192DDF0  /* A02 = -1.297746680554463516444e-01 */
> > > -        .quad 0xBFC4D571D8E3079F  /* A03 = -1.627638157861470424859e-01 */
> > > -        .quad 0xBF72E8FDC0B952AA  /* A00 = -4.616728994353872309042e-03 */
> > > -        .quad 0x3FF0AF7F273C9533  /* A01 = +1.042845872181101141152e+00 */
> > > -        .quad 0xBFC244C512736F10  /* A02 = -1.427236881344176033792e-01 */
> > > -        .quad 0xBFC379474F58B902  /* A03 = -1.521386277613104298645e-01 */
> > > -        .quad 0xBF762EABAF17395B  /* A00 = -5.415602341101023557701e-03 */
> > > -        .quad 0x3FF0C6C3886F63FB  /* A01 = +1.048526318502125631582e+00 */
> > > -        .quad 0xBFC3FDF9918EA12A  /* A02 = -1.561881981590514389957e-01 */
> > > -        .quad 0xBFC21CA89ECAB895  /* A03 = -1.414995932913753196036e-01 */
> > > -        .quad 0xBF79D387CE5B2BAE  /* A00 = -6.305246822828998107258e-03 */
> > > -        .quad 0x3FF0DFBFE2346376  /* A01 = +1.054626353847394337748e+00 */
> > > -        .quad 0xBFC5C6DA43602620  /* A02 = -1.701309994680721970894e-01 */
> > > -        .quad 0xBFC0C08BD8DB6631  /* A03 = -1.308760460731704100557e-01 */
> > > -        .quad 0xBF7DDBA8E8DA9060  /* A00 = -7.289562037531366334164e-03 */
> > > -        .quad 0x3FF0FA70F0D1B464  /* A01 = +1.061142864894713433443e+00 */
> > > -        .quad 0xBFC79E18D92BAA7C  /* A02 = -1.845122394946264732241e-01 */
> > > -        .quad 0xBFBECBBBF74C2669  /* A03 = -1.202962378266875381749e-01 */
> > > -        .quad 0xBF81254E76EA25DA  /* A00 = -8.371937755572145950511e-03 */
> > > -        .quad 0x3FF116D05835EBD0  /* A01 = +1.068069786618014660462e+00 */
> > > -        .quad 0xBFC982539E2ED224  /* A02 = -1.992897531869327609755e-01 */
> > > -        .quad 0xBFBC1B043C350159  /* A03 = -1.097872397413132278254e-01 */
> > > -        .quad 0xBF8391ACBA863403  /* A00 = -9.555196230190082448686e-03 */
> > > -        .quad 0x3FF134D4AA477FE2  /* A01 = +1.075398125794884141015e+00 */
> > > -        .quad 0xBFCB7218609FEAFB  /* A02 = -2.144194099235717521079e-01 */
> > > -        .quad 0xBFB970A16CB88329  /* A03 = -9.937485603633135211599e-02 */
> > > -        .quad 0xBF87935088E48E8B  /* A00 = -1.151144902957603431692e-02 */
> > > -        .quad 0x3FF1649892AD7DD3  /* A01 = +1.087059567413110938716e+00 */
> > > -        .quad 0xBFCE6971DDE75409  /* A02 = -2.375929196847723912089e-01 */
> > > -        .quad 0xBFB58291E88CB251  /* A03 = -8.402358939628952472223e-02 */
> > > -        .quad 0xBF8DB3A62C325325  /* A00 = -1.450280973794233242702e-02 */
> > > -        .quad 0x3FF1A9C900C6DEEA  /* A01 = +1.103951457056548068891e+00 */
> > > -        .quad 0xBFD13DBC65B0E08E  /* A02 = -2.693930619311765140012e-01 */
> > > -        .quad 0xBFB06696F62696D1  /* A03 = -6.406539449252625362252e-02 */
> > > -        .quad 0xBF92583699F2E27A  /* A00 = -1.791463198307716858659e-02 */
> > > -        .quad 0x3FF1F451B85AA9F0  /* A01 = +1.122148246892376022288e+00 */
> > > -        .quad 0xBFD34FD5F8288180  /* A02 = -3.017477916164565954205e-01 */
> > > -        .quad 0xBFA6FB692825B683  /* A03 = -4.488686194495718900788e-02 */
> > > -        .quad 0xBF9641C26E673D6F  /* A00 = -2.173522757385398448959e-02 */
> > > -        .quad 0x3FF24364DA5E2B07  /* A01 = +1.141453602790251542487e+00 */
> > > -        .quad 0xBFD564A5A5EF5890  /* A02 = -3.342680092295120530821e-01 */
> > > -        .quad 0xBF9B43712011A982  /* A03 = -2.662445791467283467968e-02 */
> > > -        .quad 0xBF9A901038EC2F39  /* A00 = -2.594018313816024226548e-02 */
> > > -        .quad 0x3FF2961356DFFEBA  /* A01 = +1.161639537196534011088e+00 */
> > > -        .quad 0xBFD775EBB17198C7  /* A02 = -3.665723069046972759644e-01 */
> > > -        .quad 0xBF833B1A926CD462  /* A03 = -9.390075295963199591975e-03 */
> > > -        .quad 0xBF9F396A6A461B91  /* A00 = -3.049246095317987084727e-02 */
> > > -        .quad 0x3FF2EB53BAEF534B  /* A01 = +1.182452898229899629357e+00 */
> > > -        .quad 0xBFD97DABF8AD8BBD  /* A02 = -3.982953957076310058660e-01 */
> > > -        .quad 0x3F7B8F6A3E0F8837  /* A03 = +6.728568086119371925713e-03 */
> > > -        .quad 0xBFA21878590F8BAA  /* A00 = -3.534294211546946951064e-02 */
> > > -        .quad 0x3FF34209790236E1  /* A01 = +1.203622315111197105253e+00 */
> > > -        .quad 0xBFDB764C0E71BECB  /* A02 = -4.290952817018306997277e-01 */
> > > -        .quad 0x3F962FE0C03F84C0  /* A03 = +2.166701482190513949888e-02 */
> > > -        .quad 0xBFA4B36B9AD27ECC  /* A00 = -4.043136849327097492868e-02 */
> > > -        .quad 0x3FF3990C5B12FC16  /* A01 = +1.224865298994477935679e+00 */
> > > -        .quad 0xBFDD5AABB0D01390  /* A02 = -4.586590983092770912322e-01 */
> > > -        .quad 0x3FA21DAF5CA162DB  /* A03 = +3.538272863142363083844e-02 */
> > > -        .quad 0xBFA7645E4D7BF28B  /* A00 = -4.568762489177399105378e-02 */
> > > -        .quad 0x3FF3EF2FD51C0D9F  /* A01 = +1.245895225962932562069e+00 */
> > > -        .quad 0xBFDF26377E1B686E  /* A02 = -4.867075664057044503963e-01 */
> > > -        .quad 0x3FA8803E756EE812  /* A03 = +4.785342391501513914509e-02 */
> > > -        .quad 0xBFAA210925C64413  /* A00 = -5.103329263796054643398e-02 */
> > > -        .quad 0x3FF44349F897D8E7  /* A01 = +1.266427966181760345066e+00 */
> > > -        .quad 0xBFE06A7B02C6D8E2  /* A02 = -5.129981092675530707226e-01 */
> > > -        .quad 0x3FAE3F194734F5D0  /* A03 = +5.907515520309980505687e-02 */
> > > -        .quad 0xBFACDE48F8A19BBB  /* A00 = -5.638340029764018351832e-02 */
> > > -        .quad 0x3FF49439D5466582  /* A01 = +1.286187966447272845727e+00 */
> > > -        .quad 0xBFE131C7C1063DDC  /* A02 = -5.373266954429101183166e-01 */
> > > -        .quad 0x3FB1ADEEC36AD805  /* A03 = +6.906025191241844940482e-02 */
> > > -        .quad 0xBFAF905D8F585680  /* A00 = -6.164829611604449866036e-02 */
> > > -        .quad 0x3FF4E0ED1FD27F99  /* A01 = +1.304913639360142818546e+00 */
> > > -        .quad 0xBFE1E7A859DC1D3D  /* A02 = -5.595285182070380836095e-01 */
> > > -        .quad 0x3FB3ED018E4642A1  /* A03 = +7.783517573831001679086e-02 */
> > > -        .quad 0xBFB11595104160BA  /* A00 = -6.673556944713512906198e-02 */
> > > -        .quad 0x3FF528650340490B  /* A01 = +1.322361958217302513319e+00 */
> > > -        .quad 0xBFE28B14B40BC974  /* A02 = -5.794776455425521000109e-01 */
> > > -        .quad 0x3FB5DF49F5BAF6D7  /* A03 = +8.543836831355676453281e-02 */
> > > -        .quad 0xBFB2513A97344BA4  /* A00 = -7.155195418844911836587e-02 */
> > > -        .quad 0x3FF569BA0DB5EE14  /* A01 = +1.338312200124055273420e+00 */
> > > -        .quad 0xBFE31B53A8B67B20  /* A02 = -5.970857901737396389308e-01 */
> > > -        .quad 0x3FB787F297BB0544  /* A03 = +9.191814617499455275507e-02 */
> > > -        .quad 0xBFB37512E848FAFA  /* A00 = -7.600515528700305112331e-02 */
> > > -        .quad 0x3FF5A41F33B403C8  /* A01 = +1.352568819013173495591e+00 */
> > > -        .quad 0xBFE397F6EA9A58A5  /* A02 = -6.123003561103997904880e-01 */
> > > -        .quad 0x3FB8EAA9FF25CA06  /* A03 = +9.733068923177520814782e-02 */
> > > -        .quad 0xBFB47B3E603AFC5D  /* A00 = -8.000554894805263217439e-02 */
> > > -        .quad 0x3FF5D6E3EDE40487  /* A01 = +1.364963464031718975988e+00 */
> > > -        .quad 0xBFE400D5BCA6D631  /* A02 = -6.251019177058819709103e-01 */
> > > -        .quad 0x3FBA0B830ED567FE  /* A03 = +1.017381583418739132707e-01 */
> > > -        .quad 0xBFB5BBFE8AC90496  /* A00 = -8.489981544791400103200e-02 */
> > > -        .quad 0x3FF612BA70107E95  /* A01 = +1.379572332145390989311e+00 */
> > > -        .quad 0xBFE477EAF1FA7693  /* A02 = -6.396383978023599814478e-01 */
> > > -        .quad 0x3FBB4784B7C08A95  /* A03 = +1.065600346196709652391e-01 */
> > > -        .quad 0xBFB6D5D940743939  /* A00 = -8.920057128509463473254e-02 */
> > > -        .quad 0x3FF644A8748F70CE  /* A01 = +1.391762214006166953340e+00 */
> > > -        .quad 0xBFE4D646AB07EA37  /* A02 = -6.511567440459832267763e-01 */
> > > -        .quad 0x3FBC354F4E1D5292  /* A03 = +1.101884427747086558913e-01 */
> > > -        .quad 0xBFB7223D19E4F3D1  /* A00 = -9.036619074045339206069e-02 */
> > > -        .quad 0x3FF6518FEB42B7FA  /* A01 = +1.394912642466350494175e+00 */
> > > -        .quad 0xBFE4ED86CB87498C  /* A02 = -6.539949393430091184598e-01 */
> > > -        .quad 0x3FBC6D29F28CCA9B  /* A03 = +1.110407082713131127205e-01 */
> > > -        .quad 0xBFB6878652FF6312  /* A00 = -8.800544287022329936754e-02 */
> > > -        .quad 0x3FF63948C302D040  /* A01 = +1.388985406648330922508e+00 */
> > > -        .quad 0xBFE4C4E2E7904E17  /* A02 = -6.490339777687407218920e-01 */
> > > -        .quad 0x3FBC127356CA1ABE  /* A03 = +1.096565329445224612481e-01 */
> > > -        .quad 0xBFB4F5D18B0C91D6  /* A00 = -8.187589306596207427980e-02 */
> > > -        .quad 0x3FF5FD27EB7DD0B8  /* A01 = +1.374305648697413673176e+00 */
> > > -        .quad 0xBFE464E01A2B2FC6  /* A02 = -6.373138915164353601739e-01 */
> > > -        .quad 0x3FBB460547674A30  /* A03 = +1.065371798825160976065e-01 */
> > > -        .quad 0xBFB26642FA16A685  /* A00 = -7.187288861919156890412e-02 */
> > > -        .quad 0x3FF59F9BEDE1C95A  /* A01 = +1.351467065073470141812e+00 */
> > > -        .quad 0xBFE3D67920C8FBEA  /* A02 = -6.199308052381387046381e-01 */
> > > -        .quad 0x3FBA24F6A8D3CBC1  /* A03 = +1.021265184570401413078e-01 */
> > > -        .quad 0xBFADB5294794F097  /* A00 = -5.802277563859197656582e-02 */
> > > -        .quad 0x3FF523EA7B9CF453  /* A01 = +1.321268542159732772845e+00 */
> > > -        .quad 0xBFE322A8B55E35DB  /* A02 = -5.979808370918208160205e-01 */
> > > -        .quad 0x3FB8C8673B1B3E37  /* A03 = +9.680791085269722928697e-02 */
> > > -        .quad 0xBFA4B7D661965C6A  /* A00 = -4.046506825687219699450e-02 */
> > > -        .quad 0x3FF48DE3E2CE3122  /* A01 = +1.284641157110919085227e+00 */
> > > -        .quad 0xBFE251FED1A7F445  /* A02 = -5.725092024655472622285e-01 */
> > > -        .quad 0x3FB745699FCABDB9  /* A03 = +9.090290213747821701507e-02 */
> > > -        .quad 0xBF93E60456E4EE1D  /* A00 = -1.943213253365004902773e-02 */
> > > -        .quad 0x3FF3E1A14E628A59  /* A01 = +1.242585474196536532432e+00 */
> > > -        .quad 0xBFE16C5AB660E876  /* A02 = -5.444768488007543094653e-01 */
> > > -        .quad 0x3FB5AD33AA8C188F  /* A03 = +8.467410005332197397987e-02 */
> > > -        .quad 0x3F738C17C47C7961  /* A00 = +4.772274820224659853951e-03 */
> > > -        .quad 0x3FF3234DDE3BD146  /* A01 = +1.196119182682268355933e+00 */
> > > -        .quad 0xBFE078C0D77A9D3B  /* A02 = -5.147403915952176722826e-01 */
> > > -        .quad 0x3FB40D74B3E276B8  /* A03 = +7.833032027925923568290e-02 */
> > > -        .quad 0x3FA0474BECC689C7  /* A00 = +3.179394975019849550746e-02 */
> > > -        .quad 0x3FF256FB4FA7D18A  /* A01 = +1.146235762743432307076e+00 */
> > > -        .quad 0xBFDEFA8E3FB285E2  /* A02 = -4.840427038235174395098e-01 */
> > > -        .quad 0x3FB270C007493D59  /* A03 = +7.203293016322244446403e-02 */
> > > -        .quad 0x3FAF5BD51E479BDC  /* A00 = +6.124750132203590768931e-02 */
> > > -        .quad 0x3FF18081D0B53BC5  /* A01 = +1.093873801484492647162e+00 */
> > > -        .quad 0xBFDCFE2439BD0C03  /* A02 = -4.530115665294831006626e-01 */
> > > -        .quad 0x3FB0DEFE5A45AFDD  /* A03 = +6.590261176978580437424e-02 */
> > > -        .quad 0x3FB7BD5D2806EA26  /* A00 = +9.273321368429118805032e-02 */
> > > -        .quad 0x3FF0A369E35B4440  /* A01 = +1.039895904647224256223e+00 */
> > > -        .quad 0xBFDB04BC5C9951E7  /* A02 = -4.221640495573226181669e-01 */
> > > -        .quad 0x3FAEBBBAA9D6DEEF  /* A03 = +6.002600978120919278380e-02 */
> > > -        .quad 0x3FC01BE411098DBC  /* A00 = +1.258511622610124502941e-01 */
> > > -        .quad 0x3FEF85BDABC031C1  /* A01 = +9.850757936961188621083e-01 */
> > > -        .quad 0xBFD91521375097C2  /* A02 = -3.919146576102968682065e-01 */
> > > -        .quad 0x3FABE26F0086D982  /* A03 = +5.446192628317005068883e-02 */
> > > -        .quad 0x3FC481D7FF5776B9  /* A00 = +1.602125164781023347604e-01 */
> > > -        .quad 0x3FEDC3506C1E7218  /* A01 = +9.300920592973538347792e-01 */
> > > -        .quad 0xBFD7349A88DA7D4F  /* A02 = -3.625856720409119104964e-01 */
> > > -        .quad 0x3FA936E2DFF8E2AE  /* A03 = +4.924687370334389358018e-02 */
> > > -        .quad 0x3FC90471F96FA27A  /* A00 = +1.954481571149420671141e-01 */
> > > -        .quad 0x3FEC0451601987A2  /* A01 = +8.755270840595026360376e-01 */
> > > -        .quad 0xBFD5671CD4B898DC  /* A02 = -3.344184949259110251063e-01 */
> > > -        .quad 0x3FA6BB9594603B67  /* A03 = +4.439990459660841243261e-02 */
> > > -        .quad 0x3FCFD8ADB9ED944C  /* A00 = +2.488000066615846384011e-01 */
> > > -        .quad 0x3FE978C073F6809A  /* A01 = +7.959902062321078108909e-01 */
> > > -        .quad 0xBFD2DF7E00BCD5A9  /* A02 = -2.948908812716931060471e-01 */
> > > -        .quad 0x3FA3614033D490B2  /* A03 = +3.785133965200894456959e-02 */
> > > -        .quad 0x3FD4846A12AFE5A0  /* A00 = +3.205819303981005674586e-01 */
> > > -        .quad 0x3FE63A1147D40472  /* A01 = +6.945883181471244061100e-01 */
> > > -        .quad 0xBFCFA2268AD34450  /* A02 = -2.471359422548027318101e-01 */
> > > -        .quad 0x3F9F150201D9FFE0  /* A03 = +3.035357605267552383310e-02 */
> > > -        .quad 0x3FD9018641F82BEB  /* A00 = +3.907180446846598154131e-01 */
> > > -        .quad 0x3FE33B7C220FFBDC  /* A01 = +6.010113396913498995389e-01 */
> > > -        .quad 0xBFCA4E4187E29C86  /* A02 = -2.055131829740483584423e-01 */
> > > -        .quad 0x3F98C30CED19F8F4  /* A03 = +2.418155858185229434287e-02 */
> > > -        .quad 0x3FDD4B8255BEB078  /* A00 = +4.577337109901757905561e-01 */
> > > -        .quad 0x3FE0858B19D3A49B  /* A01 = +5.163016800335243905451e-01 */
> > > -        .quad 0xBFC5BC929EACE564  /* A02 = -1.698172831327539045176e-01 */
> > > -        .quad 0x3F93A083CE57DE2B  /* A03 = +1.916700312537337677621e-02 */
> > > -        .quad 0x3FE0A8E5E039295C  /* A00 = +5.206174258576470315063e-01 */
> > > -        .quad 0x3FDC35E1234583FE  /* A01 = +4.407885403107342225937e-01 */
> > > -        .quad 0xBFC1DE034E31AEB9  /* A02 = -1.395877963835710222629e-01 */
> > > -        .quad 0x3F8EFDEBB3471BDC  /* A03 = +1.513275280821162888101e-02 */
> > > -        .quad 0x3FE2851B603CB2A5  /* A00 = +5.787484054213406503564e-01 */
> > > -        .quad 0x3FD7F4A44ABBB286  /* A01 = +3.743067483726821853551e-01 */
> > > -        .quad 0xBFBD3EEB67087DE7  /* A02 = -1.142413260026767657385e-01 */
> > > -        .quad 0x3F8864F38329E8BD  /* A03 = +1.191129917173260922836e-02 */
> > > -        .quad 0x3FE437DBE3C34AC1  /* A00 = +6.318187187665317283702e-01 */
> > > -        .quad 0x3FD43F6F789441B5  /* A01 = +3.163717916040938438194e-01 */
> > > -        .quad 0xBFB7D92E7901B9A4  /* A02 = -9.315767721429907277653e-02 */
> > > -        .quad 0x3F8327ED342308E1  /* A03 = +9.353497651663324544136e-03 */
> > > -        .quad 0x3FE5C0977766D55C  /* A00 = +6.797597248138731451661e-01 */
> > > -        .quad 0x3FD10B42A764D8F9  /* A01 = +2.663122782427219115142e-01 */
> > > -        .quad 0xBFB3633351D3D70F  /* A02 = -7.573242900602060456716e-02 */
> > > -        .quad 0x3F7E079E30FF899C  /* A03 = +7.331483779099558922843e-03 */
> > > -        .quad 0x3FE7202CE08A88C4  /* A00 = +7.226776490754436288455e-01 */
> > > -        .quad 0x3FCC973EB5662B01  /* A01 = +2.233656297433626314319e-01 */
> > > -        .quad 0xBFAF70A455F9920B  /* A02 = -6.140626477716545211782e-02 */
> > > -        .quad 0x3F77812411CE99B6  /* A03 = +5.738392731393584730859e-03 */
> > > -        .quad 0x3FE85879424095B1  /* A00 = +7.608000082006382003286e-01 */
> > > -        .quad 0x3FC7E73BD1674D84  /* A01 = +1.867441914060742336190e-01 */
> > > -        .quad 0xBFA96F84E4BF333B  /* A02 = -4.967894832916504993525e-02 */
> > > -        .quad 0x3F72606DDCA6E117  /* A03 = +4.486493251924870105662e-03 */
> > > -        .quad 0x3FE96BFE4957F4DD  /* A00 = +7.944327766887472330737e-01 */
> > > -        .quad 0x3FC3ED4780D25478  /* A01 = +1.556786898624158421711e-01 */
> > > -        .quad 0xBFA489C5F9A56B58  /* A02 = -4.011362717093075458408e-02 */
> > > -        .quad 0x3F6CB5DC17E9AD2A  /* A03 = +3.504686231556104931972e-03 */
> > > -        .quad 0x3FEA5D9CB2F41234  /* A00 = +8.239272589858672724006e-01 */
> > > -        .quad 0x3FC091A758374DCF  /* A01 = +1.294449978582705440555e-01 */
> > > -        .quad 0xBFA08E436D4B5CE0  /* A02 = -3.233538350257858517978e-02 */
> > > -        .quad 0x3F666997AD53E6B7  /* A03 = +2.735897297154145629133e-03 */
> > > -        .quad 0x3FEB3060342CB850  /* A00 = +8.496552485501158713532e-01 */
> > > -        .quad 0x3FBB7D30BBC7DC1B  /* A01 = +1.073790033768634993860e-01 */
> > > -        .quad 0xBF9AA6BA3443D9E3  /* A02 = -2.602663940430173170060e-02 */
> > > -        .quad 0x3F617CA764B7850B  /* A03 = +2.134634914668814050648e-03 */
> > > -        .quad 0x3FEBE759A6A0C7B8  /* A00 = +8.719909910635044170135e-01 */
> > > -        .quad 0x3FB6C10DE6A703FF  /* A01 = +8.888327485239243264115e-02 */
> > > -        .quad 0xBF956C566D8BE1F6  /* A02 = -2.092108768099084498138e-02 */
> > > -        .quad 0x3F5B46D1A4A59CF8  /* A03 = +1.664833764687232917079e-03 */
> > > -        .quad 0x3FEC858494887A04  /* A00 = +8.912985707318630268503e-01 */
> > > -        .quad 0x3FB2CC31F543394D  /* A01 = +7.342827070099140762682e-02 */
> > > -        .quad 0xBF9133477FF69137  /* A02 = -1.679717749142747504343e-02 */
> > > -        .quad 0x3F5544482FBB4DA5  /* A03 = +1.298017973501022466823e-03 */
> > > -        .quad 0x3FED0DB59D0E32E9  /* A00 = +9.079235141267335551518e-01 */
> > > -        .quad 0x3FAF006BAFFC6EF4  /* A01 = +6.055008433597022787787e-02 */
> > > -        .quad 0xBF8B97146FA2B97A  /* A02 = -1.347175565419144252499e-02 */
> > > -        .quad 0x3F5093B01F4CDC69  /* A03 = +1.011774057770665211434e-03 */
> > > -        .quad 0x3FEDB487C3EC457C  /* A00 = +9.282873942012623835751e-01 */
> > > -        .quad 0x3FA7390C09D0BD1D  /* A01 = +4.535710925881118044112e-02 */
> > > -        .quad 0xBF83D9F7C3181106  /* A02 = -9.693084374710735778846e-03 */
> > > -        .quad 0x3F46E34A0A3C0E64  /* A03 = +6.984817050299072134500e-04 */
> > > -        .quad 0x3FEE5FFCB4E6EB00  /* A00 = +9.492171796076434020506e-01 */
> > > -        .quad 0x3F9F4913ED00AADF  /* A01 = +3.055220731782070861526e-02 */
> > > -        .quad 0xBF79670BD0E59B5C  /* A02 = -6.201788097633133961528e-03 */
> > > -        .quad 0x3F3BC998EBCAF96D  /* A03 = +4.240034429975534616304e-04 */
> > > -        .quad 0x3FEEDBA41E9542FE  /* A00 = +9.643116566968215064293e-01 */
> > > -        .quad 0x3F94F5DD18D9C24D  /* A01 = +2.046914543319848858727e-02 */
> > > -        .quad 0xBF7034896AA122B9  /* A02 = -3.956352980886528904192e-03 */
> > > -        .quad 0x3F30DCCB47810B39  /* A03 = +2.573009765038273091199e-04 */
> > > -        .quad 0x3FEF33F2882520ED  /* A00 = +9.750912341196716903724e-01 */
> > > -        .quad 0x3F8BF37F2CF553FF  /* A01 = +1.364802699996836392315e-02 */
> > > -        .quad 0xBF649F6F05A69619  /* A02 = -2.517430152880317534986e-03 */
> > > -        .quad 0x3F247623C950AAC9  /* A03 = +1.561087307505231250044e-04 */
> > > -        .quad 0x3FEF727757751741  /* A00 = +9.827229221489021115943e-01 */
> > > -        .quad 0x3F828E67912C4400  /* A01 = +9.060677640748693306705e-03 */
> > > -        .quad 0xBF5A2F51A806CC2C  /* A02 = -1.598195784123355826789e-03 */
> > > -        .quad 0x3F18D35D7687E613  /* A03 = +9.470231965016282719549e-05 */
> > > -        .quad 0x3FEF9E6325C5942A  /* A00 = +9.880843866091073568469e-01 */
> > > -        .quad 0x3F788AB117618F76  /* A01 = +5.991641772286606867914e-03 */
> > > -        .quad 0xBF5096EAB0B1EA89  /* A02 = -1.012543859160305046233e-03 */
> > > -        .quad 0x3F0E1E50EC4435AB  /* A03 = +5.744633156910412119652e-05 */
> > > -        .quad 0x3FEFBD0784049369  /* A00 = +9.918248728250605994461e-01 */
> > > -        .quad 0x3F702BBD8294035F  /* A01 = +3.947963975634432264028e-03 */
> > > -        .quad 0xBF44FB55E0F00593  /* A02 = -6.403130845457509273330e-04 */
> > > -        .quad 0x3F0244DCD723230A  /* A03 = +3.484534217219031730379e-05 */
> > > -        .quad 0x3FEFD245E2366A43  /* A00 = +9.944180887426415926811e-01 */
> > > -        .quad 0x3F653D82EC088433  /* A01 = +2.592807490387838333795e-03 */
> > > -        .quad 0xBF3A7DF75E013CB8  /* A02 = -4.042366908878036561859e-04 */
> > > -        .quad 0x3EF6298E69F991CD  /* A03 = +2.113564425911141559972e-05 */
> > > -        .quad 0x3FEFE0EAA508BC69  /* A00 = +9.962056372950317539861e-01 */
> > > -        .quad 0x3F5BD0771AF3FDDA  /* A01 = +1.697651208644282514598e-03 */
> > > -        .quad 0xBF30B2E1254DE571  /* A02 = -2.548026725928887099328e-04 */
> > > -        .quad 0x3EEAE28B70EC0256  /* A03 = +1.281973848454955042307e-05 */
> > > -        .quad 0x3FEFEAF5303D7F96  /* A00 = +9.974313680831865536192e-01 */
> > > -        .quad 0x3F5229111365657E  /* A01 = +1.108423877289460134782e-03 */
> > > -        .quad 0xBF250572D04DFE66  /* A02 = -1.603796628408704519168e-04 */
> > > -        .quad 0x3EE04E89BB57C981  /* A03 = +7.775682983689149966743e-06 */
> > > -        .quad 0x3FEFF1CF52F1CF44  /* A00 = +9.982678051005469122003e-01 */
> > > -        .quad 0x3F47A71316147CEB  /* A01 = +7.218211359577819110842e-04 */
> > > -        .quad 0xBF1A6D7604055719  /* A02 = -1.008132248946049582547e-04 */
> > > -        .quad 0x3ED3C8047586A85C  /* A03 = +4.716233739913014633626e-06 */
> > > -        .quad 0x3FEFF6770369EF69  /* A00 = +9.988360468555416149528e-01 */
> > > -        .quad 0x3F3EBB261180FBF0  /* A01 = +4.689186039321105101130e-04 */
> > > -        .quad 0xBF1097754FE19D7F  /* A02 = -6.329206004950480057066e-05 */
> > > -        .quad 0x3EC7FEFF83BCA0A7  /* A03 = +2.860556404988488738366e-06 */
> > > -        .quad 0x3FEFF99D42371AC4  /* A00 = +9.992204945818561334647e-01 */
> > > -        .quad 0x3F33EB2AEC271F59  /* A01 = +3.039340773764907474054e-04 */
> > > -        .quad 0xBF04CF18E0FC0D79  /* A02 = -3.968996690952969588805e-05 */
> > > -        .quad 0x3EBD1BDBD6019BE9  /* A03 = +1.735021065507727833886e-06 */
> > > -        .quad 0x3FEFFBBCA32B0D91  /* A00 = +9.994795977476532700123e-01 */
> > > -        .quad 0x3F29C41E1615110A  /* A01 = +1.965796209707565346710e-04 */
> > > -        .quad 0xBEFA11F93D9DCB5A  /* A02 = -2.486248909101414873235e-05 */
> > > -        .quad 0x3EB1A7CA4546F7A7  /* A03 = +1.052345642723709228769e-06 */
> > > -        .quad 0x3FEFFD298B8E8DE2  /* A00 = +9.996535993308806045121e-01 */
> > > -        .quad 0x3F20A1C42D523C5B  /* A01 = +1.268913244172078754520e-04 */
> > > -        .quad 0xBEF0507A364AFAE4  /* A02 = -1.555859070622834605755e-05 */
> > > -        .quad 0x3EA56ACA17E7CDF4  /* A03 = +6.382806956848098872313e-07 */
> > > -        .quad 0x3FEFFE1DC82BA5A3  /* A00 = +9.997700604991915929176e-01 */
> > > -        .quad 0x3F156E73B90F1769  /* A01 = +8.175450626798714452801e-05 */
> > > -        .quad 0xBEE4663579D0A09F  /* A02 = -9.727122057226747625365e-06 */
> > > -        .quad 0x3E99FAF6FEC5D4C1  /* A03 = +3.871371052824002996020e-07 */
> > > -        .quad 0x3FEFFEF8D0BB5E81  /* A00 = +9.998745037837154514548e-01 */
> > > -        .quad 0x3F06686DA18D39C3  /* A01 = +4.273972098777251447726e-05 */
> > > -        .quad 0xBED46BC298073E90  /* A02 = -4.868731025855742842491e-06 */
> > > -        .quad 0x3E88E42286B9D0FD  /* A03 = +1.854535328530838170114e-07 */
> > > -        .quad 0x3FEFFF8DBC68DDC7  /* A00 = +9.999455146670975791423e-01 */
> > > -        .quad 0x3EF26B2953A80AF0  /* A01 = +1.756534514108903368909e-05 */
> > > -        .quad 0xBEBFC4472D580F83  /* A02 = -1.893443529411295465239e-06 */
> > > -        .quad 0x3E72505B4553D19F  /* A03 = +6.822456673547912277047e-08 */
> > > -        .quad 0x3FEFFFCED1276609  /* A00 = +9.999765477215883935358e-01 */
> > > -        .quad 0x3EDE1A94C7CC58F5  /* A01 = +7.177313020153979672606e-06 */
> > > -        .quad 0xBEA8A2C988744E57  /* A02 = -7.342066660497443762363e-07 */
> > > -        .quad 0x3E5AF30036BBBAF4  /* A03 = +2.509841882843541084885e-08 */
> > > -        .quad 0x3FEFFFEAFE70FCFC  /* A00 = +9.999899835164849370983e-01 */
> > > -        .quad 0x3EC879175E3549F5  /* A01 = +2.917410471128503564412e-06 */
> > > -        .quad 0xBE930E36677D1813  /* A02 = -2.839493400307523115929e-07 */
> > > -        .quad 0x3E43D4005B42D48F  /* A03 = +9.233192745401904898013e-09 */
> > > -        .quad 0x3ff0000000000000
> > > -        .quad 0x0000000000000000
> > > -        .quad 0x0000000000000000
> > > -        .quad 0x0000000000000000
> > > -        .align 16
> > > -        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000           /* _sSignMask        */
> > > -        .align 16
> > > -        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff           /* _sAbsMask         */
> > > -        .align 16
> > > -        .long 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000           /* _iExpMantMask     */
> > > -        .align 16
> > > -        .long 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000           /* _iExpMask         */
> > > -        .align 16
> > > -        .long 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000           /* _iMinIdxOfsMask   */
> > > -        .align 16
> > > -        .long 0x04280000, 0x04280000, 0x04280000, 0x04280000           /* _iMaxIdxMask      */
> > > -        .align 16
> > > -        .type  __svml_stanh_data_internal,@object
> > > -        .size  __svml_stanh_data_internal,.-__svml_stanh_data_internal
> > > +       movups  (%rsi, %rax), %xmm2
> > > +       movups  (%rdi, %rax), %xmm7
> > > +
> > > +       movaps  %xmm2, %xmm3
> > > +
> > > +       unpckhpd %xmm7, %xmm2
> > > +       movlhps %xmm7, %xmm3
> > > +
> > > +       addpd   %xmm13, %xmm2
> > > +
> > > +       mulpd   %xmm5, %xmm6
> > > +       addpd   %xmm4, %xmm6
> > > +
> > > +       mulpd   %xmm2, %xmm0
> > > +       addpd   %xmm3, %xmm0
> > > +
> > > +       cvtpd2ps %xmm0, %xmm2
> > > +       cvtpd2ps %xmm6, %xmm0
> > > +
> > > +       movlhps %xmm2, %xmm0
> > > +       andnps  %xmm12, %xmm1
> > > +       orps    %xmm1, %xmm0
> > > +
> > > +       movmskps %xmm8, %edx
> > > +       testl   %edx, %edx
> > > +
> > > +       /* Go to special inputs processing branch.  */
> > > +       jne     L(SPECIAL_VALUES_BRANCH)
> > > +
> > > +       /* No stack restoration on the fastpath.  */
> > > +       ret
> > > +
> > > +L(SPECIAL_VALUES_BRANCH):
> > > +       /* Slow path: recompute every lane flagged in edx with scalar tanhf.
> > > +          On entry xmm0 holds the vector result, xmm12 holds the values
> > > +          that are passed to tanhf below and edx holds the movmskps mask.
> > > +
> > > +          The fast path never adjusts rsp, so rsp is still 8 mod 16 here
> > > +          (return address on top of a 16-byte aligned stack).  Allocate
> > > +          56 bytes (48 used + 8 padding) so that rsp is 16-byte aligned at
> > > +          the call to tanhf as the ABI requires.
> > > +
> > > +          Frame layout:
> > > +             0(%rsp)  result vector (patched lane by lane)
> > > +            16(%rsp)  spilled xmm12 (scalar call arguments)
> > > +            32(%rsp)  saved r12
> > > +            40(%rsp)  saved r13
> > > +            48(%rsp)  alignment padding.  */
> > > +       subq    $56, %rsp
> > > +       cfi_def_cfa_offset(64)
> > > +
> > > +       movups  %xmm0, (%rsp)
> > > +       movups  %xmm12, 16(%rsp)
> > > +
> > > +       movq    %r12, 32(%rsp)
> > > +       cfi_offset(r12, -32)
> > > +       movq    %r13, 40(%rsp)
> > > +       cfi_offset(r13, -24)
> > > +
> > > +       /* edx has 1s where there was a special value that needs to be handled
> > > +          by a tanhf call.  */
> > > +       movl    %edx, %r13d
> > > +L(SPECIAL_VALUES_LOOP):
> > > +       /* use r12 as index for special value that is saved across calls to
> > > +          tanhf. We technically don't need a callee save register here as offset
> > > +          to rsp is always [0, 12] so we can restore rsp by realigning to 64.
> > > +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> > > +          in the loop.  */
> > > +       xorl    %r12d, %r12d
> > > +       bsfl    %r13d, %r12d
> > > +
> > > +       /* Scalar math function call to process special input.  */
> > > +       movss   16(%rsp, %r12, 4), %xmm0
> > > +       call    tanhf@PLT
> > > +       /* No good way to avoid the store-forwarding fault this will cause on
> > > +          return. `lfence` avoids the SF fault but at greater cost as it
> > > +          serializes stack/callee save restoration.  */
> > > +       movss   %xmm0, (%rsp, %r12, 4)
> > > +
> > > +       /* Clear the lowest set bit of the mask; loop while lanes remain.  */
> > > +       leal    -1(%r13), %eax
> > > +       andl    %eax, %r13d
> > > +       jnz     L(SPECIAL_VALUES_LOOP)
> > > +
> > > +       /* All results have been written to (%rsp).  */
> > > +       movups  (%rsp), %xmm0
> > > +       movq    32(%rsp), %r12
> > > +       cfi_restore(r12)
> > > +       movq    40(%rsp), %r13
> > > +       cfi_restore(r13)
> > > +       addq    $56, %rsp
> > > +       cfi_def_cfa_offset(8)
> > > +       ret
> > > +END(_ZGVbN4v_tanhf_sse4)
> > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
> > > index 3745db5aa4..90c3ea4cc6 100644
> > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
> > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
> > > @@ -70,775 +70,171 @@
> > >   *
> > >   */
> > >
> > > -/* Offsets for data table __svml_stanh_data_internal
> > > - */
> > > -#define _dbP                           0
> > > -#define _sSignMask                     4288
> > > -#define _sAbsMask                      4320
> > > -#define _iExpMantMask                  4352
> > > -#define _iExpMask                      4384
> > > -#define _iMinIdxOfsMask                4416
> > > -#define _iMaxIdxMask                   4448
> > > -
> > >  #include <sysdep.h>
> > > +#include "svml_s_tanhf_rodata.S"
> > >
> > >          .text
> > >         .section .text.avx2,"ax",@progbits
> > >  ENTRY(_ZGVdN8v_tanhf_avx2)
> > > -        pushq     %rbp
> > > -        cfi_def_cfa_offset(16)
> > > -        movq      %rsp, %rbp
> > > -        cfi_def_cfa(6, 16)
> > > -        cfi_offset(6, -16)
> > > -        andq      $-32, %rsp
> > > -        pushq     %r12
> > > -        subq      $120, %rsp
> > > -        lea       _dbP+16+__svml_stanh_data_internal(%rip), %r10
> > > -        vmovaps   %ymm0, %ymm12
> > > -
> > > -/* Here huge arguments, INF and NaNs are filtered out to callout. */
> > > -        vpand     _iExpMantMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm14
> > > +       /* Here huge arguments, INF and NaNs are filtered out to callout.  */
> > > +       vpand   TANHF_DATA(_iExpMantMask)(%rip), %ymm0, %ymm4
> > > +       vpsubd  TANHF_DATA(_iMinIdxOfsMask)(%rip), %ymm4, %ymm2
> > >
> > > -/*
> > > - *  small table specific variables *
> > > - *  Constant loading
> > > - */
> > > -        vmovups   _iMaxIdxMask+__svml_stanh_data_internal(%rip), %ymm8
> > > -        vpsubd    _iMinIdxOfsMask+__svml_stanh_data_internal(%rip), %ymm14, %ymm9
> > > -
> > > -/* if VMIN, VMAX is defined for I type */
> > > -        vxorps    %ymm15, %ymm15, %ymm15
> > > -        vpcmpgtd  %ymm15, %ymm9, %ymm0
> > > -        vpand     %ymm0, %ymm9, %ymm7
> > > -        vpcmpgtd  %ymm8, %ymm9, %ymm6
> > > -        vblendvps %ymm6, %ymm8, %ymm7, %ymm3
> > > -        vpsrld    $14, %ymm3, %ymm1
> > > -        vpcmpgtd  _iExpMask+__svml_stanh_data_internal(%rip), %ymm14, %ymm13
> > > -        vmovmskps %ymm13, %r11d
> > > -        vandps    _sAbsMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm10
> > > -        vandps    _sSignMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm11
> > > -        vextractf128 $1, %ymm1, %xmm2
> > > -        vmovd     %xmm1, %r9d
> > > -        vmovd     %xmm2, %ecx
> > > -        vpextrd   $1, %xmm2, %edx
> > > -        vpextrd   $1, %xmm1, %r8d
> > > -        movslq    %r9d, %r9
> > > -        movslq    %edx, %rdx
> > > -        movslq    %r8d, %r8
> > > -        vpextrd   $2, %xmm1, %edi
> > > -        movslq    %ecx, %rcx
> > > -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus)  */
> > > -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
> > > -        vpextrd   $3, %xmm2, %r12d
> > > -        vpextrd   $3, %xmm1, %esi
> > > -        vpextrd   $2, %xmm2, %eax
> > > -        movslq    %edi, %rdi
> > > -        movslq    %r12d, %r12
> > > -        movslq    %esi, %rsi
> > > -        movslq    %eax, %rax
> > > -        vmovupd   -16(%r9,%r10), %xmm5
> > > -        vmovupd   -16(%rdx,%r10), %xmm14
> > > -        vmovupd   -16(%rcx,%r10), %xmm13
> > > -        vmovupd   (%r9,%r10), %xmm1
> > > -        vmovupd   (%r8,%r10), %xmm2
> > > -        vmovupd   -16(%r8,%r10), %xmm4
> > > -        vinsertf128 $1, -16(%rdi,%r10), %ymm5, %ymm15
> > > -        vinsertf128 $1, -16(%r12,%r10), %ymm14, %ymm3
> > > -        vinsertf128 $1, -16(%rax,%r10), %ymm13, %ymm6
> > > -        vinsertf128 $1, (%rdi,%r10), %ymm1, %ymm5
> > > -        vinsertf128 $1, (%rsi,%r10), %ymm2, %ymm14
> > > -        vunpcklpd %ymm3, %ymm6, %ymm8
> > > -        vunpckhpd %ymm3, %ymm6, %ymm6
> > > -        vunpcklpd %ymm14, %ymm5, %ymm3
> > > -        vunpckhpd %ymm14, %ymm5, %ymm2
> > > -        vmovupd   (%rcx,%r10), %xmm13
> > > -        vcvtps2pd %xmm10, %ymm5
> > > -        vextractf128 $1, %ymm10, %xmm10
> > > -        vfmadd213pd %ymm3, %ymm5, %ymm2
> > > -        vinsertf128 $1, -16(%rsi,%r10), %ymm4, %ymm0
> > > -        vmovupd   (%rdx,%r10), %xmm4
> > > -        vunpcklpd %ymm0, %ymm15, %ymm9
> > > -        vunpckhpd %ymm0, %ymm15, %ymm7
> > > -        vfmadd213pd %ymm7, %ymm5, %ymm2
> > > -        vfmadd213pd %ymm9, %ymm5, %ymm2
> > > -        vinsertf128 $1, (%r12,%r10), %ymm4, %ymm0
> > > -        vcvtps2pd %xmm10, %ymm4
> > > -        vinsertf128 $1, (%rax,%r10), %ymm13, %ymm15
> > > -        vunpcklpd %ymm0, %ymm15, %ymm1
> > > -        vunpckhpd %ymm0, %ymm15, %ymm0
> > > -        vfmadd213pd %ymm1, %ymm4, %ymm0
> > > -        vcvtpd2ps %ymm2, %xmm1
> > > -        vfmadd213pd %ymm6, %ymm4, %ymm0
> > > -        vfmadd213pd %ymm8, %ymm4, %ymm0
> > > -        vcvtpd2ps %ymm0, %xmm0
> > > -        vinsertf128 $1, %xmm0, %ymm1, %ymm2
> > > -        vorps     %ymm11, %ymm2, %ymm0
> > > -        testl     %r11d, %r11d
> > > -
> > > -/* Go to special inputs processing branch */
> > > -        jne       L(SPECIAL_VALUES_BRANCH)
> > > -                                # LOE rbx r13 r14 r15 r11d ymm0 ymm12
> > > -
> > > -/* Restore registers
> > > - * and exit the function
> > > - */
> > > +       /* Selection of arguments between [0, 0x04280000] into ymm2.  */
> > > +       vpxor   %ymm3, %ymm3, %ymm3
> > > +       vpmaxsd %ymm3, %ymm2, %ymm2
> > > +       vpminsd TANHF_DATA(_iMaxIdxMask)(%rip), %ymm2, %ymm2
> > >
> > > -L(EXIT):
> > > -        addq      $120, %rsp
> > > -        cfi_restore(12)
> > > -        popq      %r12
> > > -        movq      %rbp, %rsp
> > > -        popq      %rbp
> > > -        cfi_def_cfa(7, 8)
> > > -        cfi_restore(6)
> > > -        ret
> > > -        cfi_def_cfa(6, 16)
> > > -        cfi_offset(6, -16)
> > > -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus)  */
> > > -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
> > > -
> > > -/* Branch to process
> > > - * special inputs
> > > - */
> > > +       vpsrld  $14, %ymm2, %ymm1
> > >
> > > -L(SPECIAL_VALUES_BRANCH):
> > > -        vmovups   %ymm12, 32(%rsp)
> > > -        vmovups   %ymm0, 64(%rsp)
> > > -                                # LOE rbx r13 r14 r15 r11d ymm0
> > > -
> > > -        xorl      %r12d, %r12d
> > > -                                # LOE rbx r13 r14 r15 r11d r12d
> > > -
> > > -        vzeroupper
> > > -        movq      %r13, 8(%rsp)
> > > -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
> > > -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
> > > -        movl      %r11d, %r13d
> > > -        movq      %r14, (%rsp)
> > > -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
> > > -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
> > > -                                # LOE rbx r15 r12d r13d
> > > -
> > > -/* Range mask
> > > - * bits check
> > > - */
> > > +       /* Store special cases in ymm15.  */
> > > +       vpcmpgtd TANHF_DATA(_iExpMask)(%rip), %ymm4, %ymm15
> > >
> > > -L(RANGEMASK_CHECK):
> > > -        btl       %r12d, %r13d
> > >
> > > -/* Call scalar math function */
> > > -        jc        L(SCALAR_MATH_CALL)
> > > -                                # LOE rbx r15 r12d r13d
> > > +       /* Store base of lookup table in rax.  */
> > > +       leaq    TANHF_DATA(_lookupTable)(%rip), %rax
> > >
> > > -/* Special inputs
> > > - * processing loop
> > > - */
> > > +       /* We are splitting ymm1 into 8 GPRs. This may be faster to do with
> > > +          store/load as we can take advantage of store-forwarding.  */
> > > +       vmovq   %xmm1, %r8
> > > +       /* We have eliminated all negative values for ymm1 so no need to sign
> > > +          extend.  */
> > > +       movl    %r8d, %r9d
> > > +       shrq    $32, %r8
> > >
> > > -L(SPECIAL_VALUES_LOOP):
> > > -        incl      %r12d
> > > -        cmpl      $8, %r12d
> > > -
> > > -/* Check bits in range mask */
> > > -        jl        L(RANGEMASK_CHECK)
> > > -                                # LOE rbx r15 r12d r13d
> > > -
> > > -        movq      8(%rsp), %r13
> > > -        cfi_restore(13)
> > > -        movq      (%rsp), %r14
> > > -        cfi_restore(14)
> > > -        vmovups   64(%rsp), %ymm0
> > > -
> > > -/* Go to exit */
> > > -        jmp       L(EXIT)
> > > -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
> > > -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
> > > -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
> > > -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
> > > -                                # LOE rbx r13 r14 r15 ymm0
> > > -
> > > -/* Scalar math fucntion call
> > > - * to process special input
> > > - */
> > > +       /* Instead of using cross-lane permutes on ymm vectors, use vinsertf128
> > > +          with a memory operand. This helps alleviate bottleneck on p5.  */
> > > +       vmovdqu 16(%r9, %rax), %xmm5
> > >
> > > -L(SCALAR_MATH_CALL):
> > > -        movl      %r12d, %r14d
> > > -        movss     32(%rsp,%r14,4), %xmm0
> > > -        call      tanhf@PLT
> > > -                                # LOE rbx r14 r15 r12d r13d xmm0
> > > +       vpextrq $1, %xmm1, %rsi
> > > +       movl    %esi, %edi
> > > +       shrq    $32, %rsi
> > >
> > > -        movss     %xmm0, 64(%rsp,%r14,4)
> > > +       vinsertf128 $1, 16(%rdi, %rax), %ymm5, %ymm5
> > >
> > > -/* Process special inputs in loop */
> > > -        jmp       L(SPECIAL_VALUES_LOOP)
> > > -                                # LOE rbx r15 r12d r13d
> > > -END(_ZGVdN8v_tanhf_avx2)
> > > +       vextracti128 $1, %ymm1, %xmm2
> > > +       vmovq   %xmm2, %rdx
> > > +       movl    %edx, %ecx
> > > +       shrq    $32, %rdx
> > > +
> > > +       vmovdqu (%rcx, %rax), %xmm6
> > > +
> > > +       vpextrq $1, %xmm2, %r10
> > > +       movl    %r10d, %r11d
> > > +       shrq    $32, %r10
> > > +
> > > +       vinsertf128 $1, (%r11, %rax), %ymm6, %ymm6
> > > +
> > > +       vmovupd 16(%r8, %rax), %xmm1
> > > +       vinsertf128 $1, 16(%rsi, %rax), %ymm1, %ymm1
> > > +       vmovupd (%rdx, %rax), %xmm3
> > > +       vinsertf128 $1, (%r10, %rax), %ymm3, %ymm3
> > > +
> > > +       vunpcklpd %ymm3, %ymm6, %ymm7
> > > +       vunpckhpd %ymm3, %ymm6, %ymm6
> > > +
> > > +       vunpcklpd %ymm1, %ymm5, %ymm3
> > > +       vunpckhpd %ymm1, %ymm5, %ymm1
> > > +
> > > +       vmovaps TANHF_DATA(_sAbsMask)(%rip), %ymm11
> > > +       vandps  %ymm11, %ymm0, %ymm4
> > >
> > > -        .section .rodata, "a"
> > > -        .align 32
> > > -
> > > -#ifdef __svml_stanh_data_internal_typedef
> > > -typedef unsigned int VUINT32;
> > > -typedef struct
> > > -{
> > > -        __declspec(align(32)) VUINT32 _dbP[(134*4)][2];
> > > -        __declspec(align(32)) VUINT32 _sSignMask[8][1];
> > > -        __declspec(align(32)) VUINT32 _sAbsMask[8][1];
> > > -        __declspec(align(32)) VUINT32 _iExpMantMask[8][1];
> > > -        __declspec(align(32)) VUINT32 _iExpMask[8][1];
> > > -        __declspec(align(32)) VUINT32 _iMinIdxOfsMask[8][1];
> > > -        __declspec(align(32)) VUINT32 _iMaxIdxMask[8][1];
> > > -} __svml_stanh_data_internal;
> > > -#endif
> > > -__svml_stanh_data_internal:
> > > -        /* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500]. */
> > > -        .quad 0x0000000000000000  /* A00 = +0.000000000000000000000e-01 */
> > > -        .quad 0x3FF00000022C70EB  /* A01 = +1.000000008097283510367e+00 */
> > > -        .quad 0xBED00E878CFFA194  /* A02 = -3.828228912518614443549e-06 */
> > > -        .quad 0xBFD551766D0607A9  /* A03 = -3.330970825846813476723e-01 */
> > > -        .quad 0xBE53D60CE3E4C297  /* A00 = -1.847383956330407336230e-08 */
> > > -        .quad 0x3FF000024177CF5C  /* A01 = +1.000002151235967140508e+00 */
> > > -        .quad 0xBF1758BC94A51A25  /* A02 = -8.906031613262943753568e-05 */
> > > -        .quad 0xBFD53EAE67E0D4F0  /* A03 = -3.319507612644221339337e-01 */
> > > -        .quad 0xBE5A9E47EF32D6FE  /* A00 = -2.479020984039698285657e-08 */
> > > -        .quad 0x3FF00002DA983057  /* A01 = +1.000002721676556793895e+00 */
> > > -        .quad 0xBF1BD953509E94AA  /* A02 = -1.062352277175377670507e-04 */
> > > -        .quad 0xBFD53BDB562EEDD5  /* A03 = -3.317783681520414806876e-01 */
> > > -        .quad 0xBE6191BBE496D294  /* A00 = -3.272532162914017685901e-08 */
> > > -        .quad 0x3FF0000390492017  /* A01 = +1.000003398528866105366e+00 */
> > > -        .quad 0xBF20727E814A57CE  /* A02 = -1.254825043772153972919e-04 */
> > > -        .quad 0xBFD538DE060A6F22  /* A03 = -3.315959033004550748913e-01 */
> > > -        .quad 0xBE66DAFA2A893A25  /* A00 = -4.257146219278012568149e-08 */
> > > -        .quad 0x3FF0000465E08CD1  /* A01 = +1.000004194219219266770e+00 */
> > > -        .quad 0xBF2341C765EF91B6  /* A02 = -1.469188600530365522261e-04 */
> > > -        .quad 0xBFD535B6841FAF9E  /* A03 = -3.314033785124993469751e-01 */
> > > -        .quad 0xBE6D5794E361E964  /* A00 = -5.465394929765249413434e-08 */
> > > -        .quad 0x3FF000055EE2A0CB  /* A01 = +1.000005121846742950353e+00 */
> > > -        .quad 0xBF265E6C77E66C8B  /* A02 = -1.706607253709506650304e-04 */
> > > -        .quad 0xBFD53264DDCCEDA6  /* A03 = -3.312008062382240103361e-01 */
> > > -        .quad 0xBE729C844D374A6E  /* A00 = -6.933284462462096107184e-08 */
> > > -        .quad 0x3FF000067F019093  /* A01 = +1.000006195180536350264e+00 */
> > > -        .quad 0xBF29CC5348D6DCE5  /* A02 = -1.968242326435338705130e-04 */
> > > -        .quad 0xBFD52EE92121ED35  /* A03 = -3.309881995734998416658e-01 */
> > > -        .quad 0xBE775AEA17EAA872  /* A00 = -8.700465590574974405858e-08 */
> > > -        .quad 0x3FF00007CA1D66B8  /* A01 = +1.000007428656699559610e+00 */
> > > -        .quad 0xBF2D8F5EB98A2637  /* A02 = -2.255252009216044881395e-04 */
> > > -        .quad 0xBFD52B435CDF9128  /* A03 = -3.307655722585587376727e-01 */
> > > -        .quad 0xBE7D04DA28C343F0  /* A00 = -1.081040272327705484794e-07 */
> > > -        .quad 0x3FF000094443CCF5  /* A01 = +1.000008837375216730337e+00 */
> > > -        .quad 0xBF30D5B76C947AE5  /* A02 = -2.568791210978817814332e-04 */
> > > -        .quad 0xBFD52773A0776FAD  /* A03 = -3.305329386764651045105e-01 */
> > > -        .quad 0xBE81DD77A12C51C7  /* A00 = -1.331054169875768625701e-07 */
> > > -        .quad 0x3FF0000AF1AFD2DA  /* A01 = +1.000010437096696680470e+00 */
> > > -        .quad 0xBF331230624C1680  /* A02 = -2.910011410651516805537e-04 */
> > > -        .quad 0xBFD52379FC0B61DF  /* A03 = -3.302903138515186909352e-01 */
> > > -        .quad 0xBE85D04EEEB3C435  /* A00 = -1.625247628488202841012e-07 */
> > > -        .quad 0x3FF0000CD6C9B1F2  /* A01 = +1.000012244238970726684e+00 */
> > > -        .quad 0xBF357F0742FADDD4  /* A02 = -3.280060509313874068243e-04 */
> > > -        .quad 0xBFD51F56806D0E81  /* A03 = -3.300377134475880880338e-01 */
> > > -        .quad 0xBE8A6E289B59681B  /* A00 = -1.969211333326924655065e-07 */
> > > -        .quad 0x3FF0000EF8268F72  /* A01 = +1.000014275873550406715e+00 */
> > > -        .quad 0xBF381E277A1B747A  /* A02 = -3.680082682942575423093e-04 */
> > > -        .quad 0xBFD51B093F1D6FD4  /* A03 = -3.297751537663746734808e-01 */
> > > -        .quad 0xBE8FCBC40EE9ABD5  /* A00 = -2.368983653301529373887e-07 */
> > > -        .quad 0x3FF000115A883B6C  /* A01 = +1.000016549721943981410e+00 */
> > > -        .quad 0xBF3AF17AC974B3D9  /* A02 = -4.111218235774406434303e-04 */
> > > -        .quad 0xBFD516924A4C549C  /* A03 = -3.295026517456081105450e-01 */
> > > -        .quad 0xBE92FFBC60A3F956  /* A00 = -2.831066871072026054144e-07 */
> > > -        .quad 0x3FF0001402DCED8A  /* A01 = +1.000019084151832604590e+00 */
> > > -        .quad 0xBF3DFAE9390C4801  /* A02 = -4.574603454311488280083e-04 */
> > > -        .quad 0xBFD511F1B4D7DC3A  /* A03 = -3.292202249571719585575e-01 */
> > > -        .quad 0xBE9690A22F96D5AD  /* A00 = -3.362443262393081632612e-07 */
> > > -        .quad 0x3FF00016F63EFF5D  /* A01 = +1.000021898173108825247e+00 */
> > > -        .quad 0xBF409E2C839605BB  /* A02 = -5.071370461992499986334e-04 */
> > > -        .quad 0xBFD50D27924BEE00  /* A03 = -3.289278916051614487515e-01 */
> > > -        .quad 0xBE9AA56C65E72A73  /* A00 = -3.970591019557469835586e-07 */
> > > -        .quad 0x3FF0001A39F4A43E  /* A01 = +1.000025011433776978009e+00 */
> > > -        .quad 0xBF425BD74C3D6667  /* A02 = -5.602647074553602319844e-04 */
> > > -        .quad 0xBFD50833F6E1ABA2  /* A03 = -3.286256705238718156536e-01 */
> > > -        .quad 0xBE9F4BD4FF1A83B0  /* A00 = -4.663500013744687071912e-07 */
> > > -        .quad 0x3FF0001DD36F9EC2  /* A01 = +1.000028444215715683896e+00 */
> > > -        .quad 0xBF44376634149405  /* A02 = -6.169556656102642569831e-04 */
> > > -        .quad 0xBFD50316F77EDEE5  /* A03 = -3.283135811757190158922e-01 */
> > > -        .quad 0xBEA3B625387BB079  /* A00 = -5.874486399249461304297e-07 */
> > > -        .quad 0x3FF00023E14CFBA9  /* A01 = +1.000034217911642153709e+00 */
> > > -        .quad 0xBF47392F923218D2  /* A02 = -7.087213783883111826306e-04 */
> > > -        .quad 0xBFD4FB1FACDEB938  /* A03 = -3.278273761924483942209e-01 */
> > > -        .quad 0xBEAA6E24F543500A  /* A00 = -7.876828740601738750574e-07 */
> > > -        .quad 0x3FF0002D5C6E8412  /* A01 = +1.000043259679163742959e+00 */
> > > -        .quad 0xBF4BAF02BD7FDD70  /* A02 = -8.448375110664940040861e-04 */
> > > -        .quad 0xBFD4EFEE6527A7DE  /* A03 = -3.271442401734229177279e-01 */
> > > -        .quad 0xBEB16E3EBE2157D0  /* A00 = -1.038947396133402500647e-06 */
> > > -        .quad 0x3FF00038990FEE2F  /* A01 = +1.000053975962952312884e+00 */
> > > -        .quad 0xBF50569481C574CB  /* A02 = -9.972048056490652716971e-04 */
> > > -        .quad 0xBFD4E419278DA2B4  /* A03 = -3.264220129263251113372e-01 */
> > > -        .quad 0xBEB6A7B6723165D4  /* A00 = -1.350350836279403750524e-06 */
> > > -        .quad 0x3FF00045CAB4158E  /* A01 = +1.000066558657042303793e+00 */
> > > -        .quad 0xBF531D7C9C849108  /* A02 = -1.166698160951775212202e-03 */
> > > -        .quad 0xBFD4D7A0BB33B152  /* A03 = -3.256608799117844954552e-01 */
> > > -        .quad 0xBEBD0EE2A8654AFD  /* A00 = -1.732000471561702711532e-06 */
> > > -        .quad 0x3FF00055276F18D6  /* A01 = +1.000081209219890521211e+00 */
> > > -        .quad 0xBF562FDBA3FB6C6C  /* A02 = -1.354183666925102939860e-03 */
> > > -        .quad 0xBFD4CA85F1B93DB2  /* A03 = -3.248610363561638125773e-01 */
> > > -        .quad 0xBEC269D4036A207E  /* A00 = -2.195047297096822741730e-06 */
> > > -        .quad 0x3FF00066E7DA6E4E  /* A01 = +1.000098138500919997540e+00 */
> > > -        .quad 0xBF5991499FC36B3A  /* A02 = -1.560518167983372759405e-03 */
> > > -        .quad 0xBFD4BCC9A72283D6  /* A03 = -3.240226871658341556426e-01 */
> > > -        .quad 0xBEC7154B6C09CFE1  /* A00 = -2.751729738565190291276e-06 */
> > > -        .quad 0x3FF0007B47086B80  /* A01 = +1.000117566559055148900e+00 */
> > > -        .quad 0xBF5D455433B4F8F4  /* A02 = -1.786548832412968197680e-03 */
> > > -        .quad 0xBFD4AE6CC1BFE145  /* A03 = -3.231460468373550942722e-01 */
> > > -        .quad 0xBECCA68CC64A0F8A  /* A00 = -3.415415948561670285790e-06 */
> > > -        .quad 0x3FF00092827742F7  /* A01 = +1.000139722473418535387e+00 */
> > > -        .quad 0xBF60A7BF15A527AF  /* A02 = -2.033112728132522705610e-03 */
> > > -        .quad 0xBFD49F703214084C  /* A03 = -3.222313393636155876010e-01 */
> > > -        .quad 0xBED19E68676B241B  /* A00 = -4.200644630977303616698e-06 */
> > > -        .quad 0x3FF000ACDA037B26  /* A01 = +1.000164844146362863597e+00 */
> > > -        .quad 0xBF62D99F836A02F8  /* A02 = -2.301036405072284102280e-03 */
> > > -        .quad 0xBFD48FD4F2B91B28  /* A03 = -3.212787981359945810311e-01 */
> > > -        .quad 0xBED57CF4B0C7AA54  /* A00 = -5.123164339408145209103e-06 */
> > > -        .quad 0x3FF000CA8FD9E1A1  /* A01 = +1.000193178099017865534e+00 */
> > > -        .quad 0xBF653A014548E686  /* A02 = -2.591135484433962181405e-03 */
> > > -        .quad 0xBFD47F9C0844B38F  /* A03 = -3.202886658426046806447e-01 */
> > > -        .quad 0xBEDA012B1B1A41E2  /* A00 = -6.199971197454598722328e-06 */
> > > -        .quad 0x3FF000EBE868FDF4  /* A01 = +1.000224979259539459520e+00 */
> > > -        .quad 0xBF67CA9427E0A544  /* A02 = -2.904214255086275467410e-03 */
> > > -        .quad 0xBFD46EC6812ADB37  /* A03 = -3.192611943626845749655e-01 */
> > > -        .quad 0xBEDF3EAC5BF12194  /* A00 = -7.449344990702664567927e-06 */
> > > -        .quad 0x3FF001112A520784  /* A01 = +1.000260510744255704196e+00 */
> > > -        .quad 0xBF6A8D01ABDA4DC4  /* A02 = -3.241065277345108255891e-03 */
> > > -        .quad 0xBFD45D55759FFA4A  /* A03 = -3.181966446572103146551e-01 */
> > > -        .quad 0xBEE2A541BC274267  /* A00 = -8.890883582164319970972e-06 */
> > > -        .quad 0x3FF0013A9E5961F2  /* A01 = +1.000300043631906721231e+00 */
> > > -        .quad 0xBF6D82ECD080C540  /* A02 = -3.602468994380686462264e-03 */
> > > -        .quad 0xBFD44B4A0779C0AD  /* A03 = -3.170952866557950611259e-01 */
> > > -        .quad 0xBEE61D97609A27F4  /* A00 = -1.054553560499505625520e-05 */
> > > -        .quad 0x3FF001688F56A3AF  /* A01 = +1.000343856731187974773e+00 */
> > > -        .quad 0xBF7056F8EFB683EC  /* A02 = -3.989193351487490407647e-03 */
> > > -        .quad 0xBFD438A5620F0F74  /* A03 = -3.159573991399533543500e-01 */
> > > -        .quad 0xBEEA145429EDD370  /* A00 = -1.243563138839952927732e-05 */
> > > -        .quad 0x3FF0019B4A242A67  /* A01 = +1.000392236341804297339e+00 */
> > > -        .quad 0xBF7207D31CA78D9B  /* A02 = -4.401993423445739288258e-03 */
> > > -        .quad 0xBFD42568BA16E7CD  /* A03 = -3.147832696228050619602e-01 */
> > > -        .quad 0xBEEE96370D52680F  /* A00 = -1.458491207477835326165e-05 */
> > > -        .quad 0x3FF001D31D8E4115  /* A01 = +1.000445476009251821736e+00 */
> > > -        .quad 0xBF73D4CC11EDC094  /* A02 = -4.841611050196221316400e-03 */
> > > -        .quad 0xBFD411954D8664E7  /* A03 = -3.135731942252974469021e-01 */
> > > -        .quad 0xBEF338C046215EF8  /* A00 = -1.833122622260562810219e-05 */
> > > -        .quad 0x3FF00230C32C2EC1  /* A01 = +1.000534784691737621998e+00 */
> > > -        .quad 0xBF76BD019BCC5DAF  /* A02 = -5.551344188254799492943e-03 */
> > > -        .quad 0xBFD3F2C7156DC21E  /* A03 = -3.116929730668135389848e-01 */
> > > -        .quad 0xBEF9B15EAE411EAE  /* A00 = -2.450261207822986676092e-05 */
> > > -        .quad 0x3FF002C2DF057A4D  /* A01 = +1.000674124886830940184e+00 */
> > > -        .quad 0xBF7B08CCD9AC1E30  /* A02 = -6.600189396301511801646e-03 */
> > > -        .quad 0xBFD3C7A7A114FED8  /* A03 = -3.090609620157755976777e-01 */
> > > -        .quad 0xBF00E36483C373B3  /* A00 = -3.221178528332122595812e-05 */
> > > -        .quad 0x3FF0036F419480D7  /* A01 = +1.000838524028997644777e+00 */
> > > -        .quad 0xBF7FD255D1777007  /* A02 = -7.768950679260206403087e-03 */
> > > -        .quad 0xBFD39A453911D6CE  /* A03 = -3.062909180947429588215e-01 */
> > > -        .quad 0xBF05DFA04DD12059  /* A00 = -4.172046622180685472624e-05 */
> > > -        .quad 0x3FF00438B2A03D8D  /* A01 = +1.001030633695197069599e+00 */
> > > -        .quad 0xBF828F8DBB4A9D10  /* A02 = -9.062869337255224921890e-03 */
> > > -        .quad 0xBFD36AAB704697D9  /* A03 = -3.033856007044711255993e-01 */
> > > -        .quad 0xBF0BF3E0C647DEFB  /* A00 = -5.331544597092331081714e-05 */
> > > -        .quad 0x3FF005221063D36D  /* A01 = +1.001253189109060359741e+00 */
> > > -        .quad 0xBF857A2CB3C96102  /* A02 = -1.048693584122917590862e-02 */
> > > -        .quad 0xBFD338E65BBB4FEC  /* A03 = -3.003478904549854444639e-01 */
> > > -        .quad 0xBF11A506ED7C9D31  /* A00 = -6.730894835681591541979e-05 */
> > > -        .quad 0x3FF0062E4D0EA92A  /* A01 = +1.001508999829250345925e+00 */
> > > -        .quad 0xBF88AB82C2761AF3  /* A02 = -1.204588085125866091241e-02 */
> > > -        .quad 0xBFD305028D6BD206  /* A03 = -2.971807843271395688234e-01 */
> > > -        .quad 0xBF1607C0922D9BF1  /* A00 = -8.403885708006799337092e-05 */
> > > -        .quad 0x3FF007606C341961  /* A01 = +1.001800940198869449560e+00 */
> > > -        .quad 0xBF8C25E6DA487BCF  /* A02 = -1.374416688582682892494e-02 */
> > > -        .quad 0xBFD2CF0D0EE8F7B5  /* A03 = -2.938873906713255768075e-01 */
> > > -        .quad 0xBF1B3A8480A0A16D  /* A00 = -1.038688061788578038307e-04 */
> > > -        .quad 0x3FF008BB802D02D6  /* A01 = +1.002131939589323561535e+00 */
> > > -        .quad 0xBF8FEB8AE99FD100  /* A02 = -1.558598065819483124983e-02 */
> > > -        .quad 0xBFD297135BD0911B  /* A03 = -2.904709240558688843059e-01 */
> > > -        .quad 0xBF20ABB9BDB75C65  /* A00 = -1.271881327357976163798e-04 */
> > > -        .quad 0x3FF00A42A76D8CD1  /* A01 = +1.002504972472525901495e+00 */
> > > -        .quad 0xBF91FF3D752BB9E6  /* A02 = -1.757522609380570560722e-02 */
> > > -        .quad 0xBFD25D235C1F88B4  /* A03 = -2.869346999779154305799e-01 */
> > > -        .quad 0xBF243D3254425461  /* A00 = -1.544116913733432829448e-04 */
> > > -        .quad 0x3FF00BF909D1795E  /* A01 = +1.002923048355647051011e+00 */
> > > -        .quad 0xBF94304E04D44942  /* A02 = -1.971551804042204897316e-02 */
> > > -        .quad 0xBFD2214B5E61CFA6  /* A03 = -2.832821294498394371075e-01 */
> > > -        .quad 0xBF286070011B61CE  /* A00 = -1.859795307186510085994e-04 */
> > > -        .quad 0x3FF00DE1D5E1627E  /* A01 = +1.003389201612804537689e+00 */
> > > -        .quad 0xBF9689D5F4163F59  /* A02 = -2.201017668045266231780e-02 */
> > > -        .quad 0xBFD1E39A11C3B42C  /* A03 = -2.795167134743816728104e-01 */
> > > -        .quad 0xBF2D250B366A79E8  /* A00 = -2.223564326486314902259e-04 */
> > > -        .quad 0x3FF010003E134001  /* A01 = +1.003906481248123094829e+00 */
> > > -        .quad 0xBF990C9FF91F6F81  /* A02 = -2.446222265267250853271e-02 */
> > > -        .quad 0xBFD1A41E80084CDC  /* A03 = -2.756420374218586655246e-01 */
> > > -        .quad 0xBF314DB5DDC2A30E  /* A00 = -2.640313157465248123865e-04 */
> > > -        .quad 0x3FF012577608921B  /* A01 = +1.004477940624503018441e+00 */
> > > -        .quad 0xBF9BB9626875B0C9  /* A02 = -2.707437288829409385849e-02 */
> > > -        .quad 0xBFD162E80768A9D0  /* A03 = -2.716617653228725615122e-01 */
> > > -        .quad 0xBF346A6133808864  /* A00 = -3.115165050094957730625e-04 */
> > > -        .quad 0x3FF014EAAFCC88A3  /* A01 = +1.005106627192198898157e+00 */
> > > -        .quad 0xBF9E90BEF9BF7419  /* A02 = -2.984903716411588595059e-02 */
> > > -        .quad 0xBFD12006545F7FAD  /* A03 = -2.675796340899932457269e-01 */
> > > -        .quad 0xBF37F180DC3848EA  /* A00 = -3.653468704395550778821e-04 */
> > > -        .quad 0x3FF017BD19147861  /* A01 = +1.005795572250939295955e+00 */
> > > -        .quad 0xBFA0C9A14C702E07  /* A02 = -3.278831537326359207851e-02 */
> > > -        .quad 0xBFD0DB895B650092  /* A03 = -2.633994476818851682154e-01 */
> > > -        .quad 0xBF3BEC6AAC6D7635  /* A00 = -4.260788377246944457107e-04 */
> > > -        .quad 0x3FF01AD1D884E719  /* A01 = +1.006547780778822565040e+00 */
> > > -        .quad 0xBFA260B2A1B1434A  /* A02 = -3.589399551186163439542e-02 */
> > > -        .quad 0xBFD09581529E93D6  /* A03 = -2.591250712233067465817e-01 */
> > > -        .quad 0xBF4164E26167882B  /* A00 = -5.308251737086202562063e-04 */
> > > -        .quad 0x3FF01FEF14B62B81  /* A01 = +1.007796364693348545316e+00 */
> > > -        .quad 0xBFA4EB014538AA42  /* A02 = -4.085544557559163403315e-02 */
> > > -        .quad 0xBFD029D36FEAF41F  /* A03 = -2.525528519580024222613e-01 */
> > > -        .quad 0xBF46F6FFF4E53DC8  /* A00 = -7.008313930700277652464e-04 */
> > > -        .quad 0x3FF027CBB51CBBA0  /* A01 = +1.009715754956893363214e+00 */
> > > -        .quad 0xBFA89DEC9FEC112E  /* A02 = -4.807986690687680864098e-02 */
> > > -        .quad 0xBFCF2A99464D0DB4  /* A03 = -2.434875100390009317053e-01 */
> > > -        .quad 0xBF4DCC9C4F66A4D9  /* A00 = -9.094012482836712945103e-04 */
> > > -        .quad 0x3FF030E7CFCCD583  /* A01 = +1.011939822882909068014e+00 */
> > > -        .quad 0xBFACAA3B95814081  /* A02 = -5.598627281199331645611e-02 */
> > > -        .quad 0xBFCDF78F156BE7CF  /* A03 = -2.341173987004467604844e-01 */
> > > -        .quad 0xBF5308ED74E5C7A6  /* A00 = -1.161796466103906435435e-03 */
> > > -        .quad 0x3FF03B5986412ECB  /* A01 = +1.014489674026594512313e+00 */
> > > -        .quad 0xBFB087EBA88DCC3F  /* A02 = -6.457398285947223148806e-02 */
> > > -        .quad 0xBFCCBB9BD134862F  /* A03 = -2.244753619680052991736e-01 */
> > > -        .quad 0xBF57FA23C00DF4B5  /* A00 = -1.463446533505758208674e-03 */
> > > -        .quad 0x3FF0473558A1BCC0  /* A01 = +1.017384859292903342975e+00 */
> > > -        .quad 0xBFB2E702BC6360EF  /* A02 = -7.383744334527241048871e-02 */
> > > -        .quad 0xBFCB77D546379288  /* A03 = -2.145945160729250122955e-01 */
> > > -        .quad 0xBF5DD12971557F71  /* A00 = -1.819887610814388068450e-03 */
> > > -        .quad 0x3FF0548DDF5000A8  /* A01 = +1.020643112482540360020e+00 */
> > > -        .quad 0xBFB571B63DA186E1  /* A02 = -8.376635555898871710045e-02 */
> > > -        .quad 0xBFCA2D5202605148  /* A03 = -2.045080672838912594358e-01 */
> > > -        .quad 0xBF6252B1AD5D4F17  /* A00 = -2.236697221556737096709e-03 */
> > > -        .quad 0x3FF063738A910BF7  /* A01 = +1.024280110622155737232e+00 */
> > > -        .quad 0xBFB8270C8E6B601B  /* A02 = -9.434584118878357184013e-02 */
> > > -        .quad 0xBFC8DD27D950A07E  /* A03 = -1.942491351230763441116e-01 */
> > > -        .quad 0xBF66470C91730CFC  /* A00 = -2.719425723258004842786e-03 */
> > > -        .quad 0x3FF073F468FCF331  /* A01 = +1.028309259519300633556e+00 */
> > > -        .quad 0xBFBB05C2952191E4  /* A02 = -1.055566419686964629854e-01 */
> > > -        .quad 0xBFC7886A770DE2BD  /* A03 = -1.838505822486435070662e-01 */
> > > -        .quad 0xBF6AD114AC8E98EC  /* A00 = -3.273525599485007861467e-03 */
> > > -        .quad 0x3FF0861BF53E5226  /* A01 = +1.032741506559554434119e+00 */
> > > -        .quad 0xBFBE0C4F9B461507  /* A02 = -1.173753503881763554650e-01 */
> > > -        .quad 0xBFC6302A037CDE3A  /* A03 = -1.733448521642786954722e-01 */
> > > -        .quad 0xBF6FFBDE2A6C2AF8  /* A00 = -3.904279630096648551207e-03 */
> > > -        .quad 0x3FF099F2EB8E7DA3  /* A01 = +1.037585182326304034106e+00 */
> > > -        .quad 0xBFC09C74D192DDF0  /* A02 = -1.297746680554463516444e-01 */
> > > -        .quad 0xBFC4D571D8E3079F  /* A03 = -1.627638157861470424859e-01 */
> > > -        .quad 0xBF72E8FDC0B952AA  /* A00 = -4.616728994353872309042e-03 */
> > > -        .quad 0x3FF0AF7F273C9533  /* A01 = +1.042845872181101141152e+00 */
> > > -        .quad 0xBFC244C512736F10  /* A02 = -1.427236881344176033792e-01 */
> > > -        .quad 0xBFC379474F58B902  /* A03 = -1.521386277613104298645e-01 */
> > > -        .quad 0xBF762EABAF17395B  /* A00 = -5.415602341101023557701e-03 */
> > > -        .quad 0x3FF0C6C3886F63FB  /* A01 = +1.048526318502125631582e+00 */
> > > -        .quad 0xBFC3FDF9918EA12A  /* A02 = -1.561881981590514389957e-01 */
> > > -        .quad 0xBFC21CA89ECAB895  /* A03 = -1.414995932913753196036e-01 */
> > > -        .quad 0xBF79D387CE5B2BAE  /* A00 = -6.305246822828998107258e-03 */
> > > -        .quad 0x3FF0DFBFE2346376  /* A01 = +1.054626353847394337748e+00 */
> > > -        .quad 0xBFC5C6DA43602620  /* A02 = -1.701309994680721970894e-01 */
> > > -        .quad 0xBFC0C08BD8DB6631  /* A03 = -1.308760460731704100557e-01 */
> > > -        .quad 0xBF7DDBA8E8DA9060  /* A00 = -7.289562037531366334164e-03 */
> > > -        .quad 0x3FF0FA70F0D1B464  /* A01 = +1.061142864894713433443e+00 */
> > > -        .quad 0xBFC79E18D92BAA7C  /* A02 = -1.845122394946264732241e-01 */
> > > -        .quad 0xBFBECBBBF74C2669  /* A03 = -1.202962378266875381749e-01 */
> > > -        .quad 0xBF81254E76EA25DA  /* A00 = -8.371937755572145950511e-03 */
> > > -        .quad 0x3FF116D05835EBD0  /* A01 = +1.068069786618014660462e+00 */
> > > -        .quad 0xBFC982539E2ED224  /* A02 = -1.992897531869327609755e-01 */
> > > -        .quad 0xBFBC1B043C350159  /* A03 = -1.097872397413132278254e-01 */
> > > -        .quad 0xBF8391ACBA863403  /* A00 = -9.555196230190082448686e-03 */
> > > -        .quad 0x3FF134D4AA477FE2  /* A01 = +1.075398125794884141015e+00 */
> > > -        .quad 0xBFCB7218609FEAFB  /* A02 = -2.144194099235717521079e-01 */
> > > -        .quad 0xBFB970A16CB88329  /* A03 = -9.937485603633135211599e-02 */
> > > -        .quad 0xBF87935088E48E8B  /* A00 = -1.151144902957603431692e-02 */
> > > -        .quad 0x3FF1649892AD7DD3  /* A01 = +1.087059567413110938716e+00 */
> > > -        .quad 0xBFCE6971DDE75409  /* A02 = -2.375929196847723912089e-01 */
> > > -        .quad 0xBFB58291E88CB251  /* A03 = -8.402358939628952472223e-02 */
> > > -        .quad 0xBF8DB3A62C325325  /* A00 = -1.450280973794233242702e-02 */
> > > -        .quad 0x3FF1A9C900C6DEEA  /* A01 = +1.103951457056548068891e+00 */
> > > -        .quad 0xBFD13DBC65B0E08E  /* A02 = -2.693930619311765140012e-01 */
> > > -        .quad 0xBFB06696F62696D1  /* A03 = -6.406539449252625362252e-02 */
> > > -        .quad 0xBF92583699F2E27A  /* A00 = -1.791463198307716858659e-02 */
> > > -        .quad 0x3FF1F451B85AA9F0  /* A01 = +1.122148246892376022288e+00 */
> > > -        .quad 0xBFD34FD5F8288180  /* A02 = -3.017477916164565954205e-01 */
> > > -        .quad 0xBFA6FB692825B683  /* A03 = -4.488686194495718900788e-02 */
> > > -        .quad 0xBF9641C26E673D6F  /* A00 = -2.173522757385398448959e-02 */
> > > -        .quad 0x3FF24364DA5E2B07  /* A01 = +1.141453602790251542487e+00 */
> > > -        .quad 0xBFD564A5A5EF5890  /* A02 = -3.342680092295120530821e-01 */
> > > -        .quad 0xBF9B43712011A982  /* A03 = -2.662445791467283467968e-02 */
> > > -        .quad 0xBF9A901038EC2F39  /* A00 = -2.594018313816024226548e-02 */
> > > -        .quad 0x3FF2961356DFFEBA  /* A01 = +1.161639537196534011088e+00 */
> > > -        .quad 0xBFD775EBB17198C7  /* A02 = -3.665723069046972759644e-01 */
> > > -        .quad 0xBF833B1A926CD462  /* A03 = -9.390075295963199591975e-03 */
> > > -        .quad 0xBF9F396A6A461B91  /* A00 = -3.049246095317987084727e-02 */
> > > -        .quad 0x3FF2EB53BAEF534B  /* A01 = +1.182452898229899629357e+00 */
> > > -        .quad 0xBFD97DABF8AD8BBD  /* A02 = -3.982953957076310058660e-01 */
> > > -        .quad 0x3F7B8F6A3E0F8837  /* A03 = +6.728568086119371925713e-03 */
> > > -        .quad 0xBFA21878590F8BAA  /* A00 = -3.534294211546946951064e-02 */
> > > -        .quad 0x3FF34209790236E1  /* A01 = +1.203622315111197105253e+00 */
> > > -        .quad 0xBFDB764C0E71BECB  /* A02 = -4.290952817018306997277e-01 */
> > > -        .quad 0x3F962FE0C03F84C0  /* A03 = +2.166701482190513949888e-02 */
> > > -        .quad 0xBFA4B36B9AD27ECC  /* A00 = -4.043136849327097492868e-02 */
> > > -        .quad 0x3FF3990C5B12FC16  /* A01 = +1.224865298994477935679e+00 */
> > > -        .quad 0xBFDD5AABB0D01390  /* A02 = -4.586590983092770912322e-01 */
> > > -        .quad 0x3FA21DAF5CA162DB  /* A03 = +3.538272863142363083844e-02 */
> > > -        .quad 0xBFA7645E4D7BF28B  /* A00 = -4.568762489177399105378e-02 */
> > > -        .quad 0x3FF3EF2FD51C0D9F  /* A01 = +1.245895225962932562069e+00 */
> > > -        .quad 0xBFDF26377E1B686E  /* A02 = -4.867075664057044503963e-01 */
> > > -        .quad 0x3FA8803E756EE812  /* A03 = +4.785342391501513914509e-02 */
> > > -        .quad 0xBFAA210925C64413  /* A00 = -5.103329263796054643398e-02 */
> > > -        .quad 0x3FF44349F897D8E7  /* A01 = +1.266427966181760345066e+00 */
> > > -        .quad 0xBFE06A7B02C6D8E2  /* A02 = -5.129981092675530707226e-01 */
> > > -        .quad 0x3FAE3F194734F5D0  /* A03 = +5.907515520309980505687e-02 */
> > > -        .quad 0xBFACDE48F8A19BBB  /* A00 = -5.638340029764018351832e-02 */
> > > -        .quad 0x3FF49439D5466582  /* A01 = +1.286187966447272845727e+00 */
> > > -        .quad 0xBFE131C7C1063DDC  /* A02 = -5.373266954429101183166e-01 */
> > > -        .quad 0x3FB1ADEEC36AD805  /* A03 = +6.906025191241844940482e-02 */
> > > -        .quad 0xBFAF905D8F585680  /* A00 = -6.164829611604449866036e-02 */
> > > -        .quad 0x3FF4E0ED1FD27F99  /* A01 = +1.304913639360142818546e+00 */
> > > -        .quad 0xBFE1E7A859DC1D3D  /* A02 = -5.595285182070380836095e-01 */
> > > -        .quad 0x3FB3ED018E4642A1  /* A03 = +7.783517573831001679086e-02 */
> > > -        .quad 0xBFB11595104160BA  /* A00 = -6.673556944713512906198e-02 */
> > > -        .quad 0x3FF528650340490B  /* A01 = +1.322361958217302513319e+00 */
> > > -        .quad 0xBFE28B14B40BC974  /* A02 = -5.794776455425521000109e-01 */
> > > -        .quad 0x3FB5DF49F5BAF6D7  /* A03 = +8.543836831355676453281e-02 */
> > > -        .quad 0xBFB2513A97344BA4  /* A00 = -7.155195418844911836587e-02 */
> > > -        .quad 0x3FF569BA0DB5EE14  /* A01 = +1.338312200124055273420e+00 */
> > > -        .quad 0xBFE31B53A8B67B20  /* A02 = -5.970857901737396389308e-01 */
> > > -        .quad 0x3FB787F297BB0544  /* A03 = +9.191814617499455275507e-02 */
> > > -        .quad 0xBFB37512E848FAFA  /* A00 = -7.600515528700305112331e-02 */
> > > -        .quad 0x3FF5A41F33B403C8  /* A01 = +1.352568819013173495591e+00 */
> > > -        .quad 0xBFE397F6EA9A58A5  /* A02 = -6.123003561103997904880e-01 */
> > > -        .quad 0x3FB8EAA9FF25CA06  /* A03 = +9.733068923177520814782e-02 */
> > > -        .quad 0xBFB47B3E603AFC5D  /* A00 = -8.000554894805263217439e-02 */
> > > -        .quad 0x3FF5D6E3EDE40487  /* A01 = +1.364963464031718975988e+00 */
> > > -        .quad 0xBFE400D5BCA6D631  /* A02 = -6.251019177058819709103e-01 */
> > > -        .quad 0x3FBA0B830ED567FE  /* A03 = +1.017381583418739132707e-01 */
> > > -        .quad 0xBFB5BBFE8AC90496  /* A00 = -8.489981544791400103200e-02 */
> > > -        .quad 0x3FF612BA70107E95  /* A01 = +1.379572332145390989311e+00 */
> > > -        .quad 0xBFE477EAF1FA7693  /* A02 = -6.396383978023599814478e-01 */
> > > -        .quad 0x3FBB4784B7C08A95  /* A03 = +1.065600346196709652391e-01 */
> > > -        .quad 0xBFB6D5D940743939  /* A00 = -8.920057128509463473254e-02 */
> > > -        .quad 0x3FF644A8748F70CE  /* A01 = +1.391762214006166953340e+00 */
> > > -        .quad 0xBFE4D646AB07EA37  /* A02 = -6.511567440459832267763e-01 */
> > > -        .quad 0x3FBC354F4E1D5292  /* A03 = +1.101884427747086558913e-01 */
> > > -        .quad 0xBFB7223D19E4F3D1  /* A00 = -9.036619074045339206069e-02 */
> > > -        .quad 0x3FF6518FEB42B7FA  /* A01 = +1.394912642466350494175e+00 */
> > > -        .quad 0xBFE4ED86CB87498C  /* A02 = -6.539949393430091184598e-01 */
> > > -        .quad 0x3FBC6D29F28CCA9B  /* A03 = +1.110407082713131127205e-01 */
> > > -        .quad 0xBFB6878652FF6312  /* A00 = -8.800544287022329936754e-02 */
> > > -        .quad 0x3FF63948C302D040  /* A01 = +1.388985406648330922508e+00 */
> > > -        .quad 0xBFE4C4E2E7904E17  /* A02 = -6.490339777687407218920e-01 */
> > > -        .quad 0x3FBC127356CA1ABE  /* A03 = +1.096565329445224612481e-01 */
> > > -        .quad 0xBFB4F5D18B0C91D6  /* A00 = -8.187589306596207427980e-02 */
> > > -        .quad 0x3FF5FD27EB7DD0B8  /* A01 = +1.374305648697413673176e+00 */
> > > -        .quad 0xBFE464E01A2B2FC6  /* A02 = -6.373138915164353601739e-01 */
> > > -        .quad 0x3FBB460547674A30  /* A03 = +1.065371798825160976065e-01 */
> > > -        .quad 0xBFB26642FA16A685  /* A00 = -7.187288861919156890412e-02 */
> > > -        .quad 0x3FF59F9BEDE1C95A  /* A01 = +1.351467065073470141812e+00 */
> > > -        .quad 0xBFE3D67920C8FBEA  /* A02 = -6.199308052381387046381e-01 */
> > > -        .quad 0x3FBA24F6A8D3CBC1  /* A03 = +1.021265184570401413078e-01 */
> > > -        .quad 0xBFADB5294794F097  /* A00 = -5.802277563859197656582e-02 */
> > > -        .quad 0x3FF523EA7B9CF453  /* A01 = +1.321268542159732772845e+00 */
> > > -        .quad 0xBFE322A8B55E35DB  /* A02 = -5.979808370918208160205e-01 */
> > > -        .quad 0x3FB8C8673B1B3E37  /* A03 = +9.680791085269722928697e-02 */
> > > -        .quad 0xBFA4B7D661965C6A  /* A00 = -4.046506825687219699450e-02 */
> > > -        .quad 0x3FF48DE3E2CE3122  /* A01 = +1.284641157110919085227e+00 */
> > > -        .quad 0xBFE251FED1A7F445  /* A02 = -5.725092024655472622285e-01 */
> > > -        .quad 0x3FB745699FCABDB9  /* A03 = +9.090290213747821701507e-02 */
> > > -        .quad 0xBF93E60456E4EE1D  /* A00 = -1.943213253365004902773e-02 */
> > > -        .quad 0x3FF3E1A14E628A59  /* A01 = +1.242585474196536532432e+00 */
> > > -        .quad 0xBFE16C5AB660E876  /* A02 = -5.444768488007543094653e-01 */
> > > -        .quad 0x3FB5AD33AA8C188F  /* A03 = +8.467410005332197397987e-02 */
> > > -        .quad 0x3F738C17C47C7961  /* A00 = +4.772274820224659853951e-03 */
> > > -        .quad 0x3FF3234DDE3BD146  /* A01 = +1.196119182682268355933e+00 */
> > > -        .quad 0xBFE078C0D77A9D3B  /* A02 = -5.147403915952176722826e-01 */
> > > -        .quad 0x3FB40D74B3E276B8  /* A03 = +7.833032027925923568290e-02 */
> > > -        .quad 0x3FA0474BECC689C7  /* A00 = +3.179394975019849550746e-02 */
> > > -        .quad 0x3FF256FB4FA7D18A  /* A01 = +1.146235762743432307076e+00 */
> > > -        .quad 0xBFDEFA8E3FB285E2  /* A02 = -4.840427038235174395098e-01 */
> > > -        .quad 0x3FB270C007493D59  /* A03 = +7.203293016322244446403e-02 */
> > > -        .quad 0x3FAF5BD51E479BDC  /* A00 = +6.124750132203590768931e-02 */
> > > -        .quad 0x3FF18081D0B53BC5  /* A01 = +1.093873801484492647162e+00 */
> > > -        .quad 0xBFDCFE2439BD0C03  /* A02 = -4.530115665294831006626e-01 */
> > > -        .quad 0x3FB0DEFE5A45AFDD  /* A03 = +6.590261176978580437424e-02 */
> > > -        .quad 0x3FB7BD5D2806EA26  /* A00 = +9.273321368429118805032e-02 */
> > > -        .quad 0x3FF0A369E35B4440  /* A01 = +1.039895904647224256223e+00 */
> > > -        .quad 0xBFDB04BC5C9951E7  /* A02 = -4.221640495573226181669e-01 */
> > > -        .quad 0x3FAEBBBAA9D6DEEF  /* A03 = +6.002600978120919278380e-02 */
> > > -        .quad 0x3FC01BE411098DBC  /* A00 = +1.258511622610124502941e-01 */
> > > -        .quad 0x3FEF85BDABC031C1  /* A01 = +9.850757936961188621083e-01 */
> > > -        .quad 0xBFD91521375097C2  /* A02 = -3.919146576102968682065e-01 */
> > > -        .quad 0x3FABE26F0086D982  /* A03 = +5.446192628317005068883e-02 */
> > > -        .quad 0x3FC481D7FF5776B9  /* A00 = +1.602125164781023347604e-01 */
> > > -        .quad 0x3FEDC3506C1E7218  /* A01 = +9.300920592973538347792e-01 */
> > > -        .quad 0xBFD7349A88DA7D4F  /* A02 = -3.625856720409119104964e-01 */
> > > -        .quad 0x3FA936E2DFF8E2AE  /* A03 = +4.924687370334389358018e-02 */
> > > -        .quad 0x3FC90471F96FA27A  /* A00 = +1.954481571149420671141e-01 */
> > > -        .quad 0x3FEC0451601987A2  /* A01 = +8.755270840595026360376e-01 */
> > > -        .quad 0xBFD5671CD4B898DC  /* A02 = -3.344184949259110251063e-01 */
> > > -        .quad 0x3FA6BB9594603B67  /* A03 = +4.439990459660841243261e-02 */
> > > -        .quad 0x3FCFD8ADB9ED944C  /* A00 = +2.488000066615846384011e-01 */
> > > -        .quad 0x3FE978C073F6809A  /* A01 = +7.959902062321078108909e-01 */
> > > -        .quad 0xBFD2DF7E00BCD5A9  /* A02 = -2.948908812716931060471e-01 */
> > > -        .quad 0x3FA3614033D490B2  /* A03 = +3.785133965200894456959e-02 */
> > > -        .quad 0x3FD4846A12AFE5A0  /* A00 = +3.205819303981005674586e-01 */
> > > -        .quad 0x3FE63A1147D40472  /* A01 = +6.945883181471244061100e-01 */
> > > -        .quad 0xBFCFA2268AD34450  /* A02 = -2.471359422548027318101e-01 */
> > > -        .quad 0x3F9F150201D9FFE0  /* A03 = +3.035357605267552383310e-02 */
> > > -        .quad 0x3FD9018641F82BEB  /* A00 = +3.907180446846598154131e-01 */
> > > -        .quad 0x3FE33B7C220FFBDC  /* A01 = +6.010113396913498995389e-01 */
> > > -        .quad 0xBFCA4E4187E29C86  /* A02 = -2.055131829740483584423e-01 */
> > > -        .quad 0x3F98C30CED19F8F4  /* A03 = +2.418155858185229434287e-02 */
> > > -        .quad 0x3FDD4B8255BEB078  /* A00 = +4.577337109901757905561e-01 */
> > > -        .quad 0x3FE0858B19D3A49B  /* A01 = +5.163016800335243905451e-01 */
> > > -        .quad 0xBFC5BC929EACE564  /* A02 = -1.698172831327539045176e-01 */
> > > -        .quad 0x3F93A083CE57DE2B  /* A03 = +1.916700312537337677621e-02 */
> > > -        .quad 0x3FE0A8E5E039295C  /* A00 = +5.206174258576470315063e-01 */
> > > -        .quad 0x3FDC35E1234583FE  /* A01 = +4.407885403107342225937e-01 */
> > > -        .quad 0xBFC1DE034E31AEB9  /* A02 = -1.395877963835710222629e-01 */
> > > -        .quad 0x3F8EFDEBB3471BDC  /* A03 = +1.513275280821162888101e-02 */
> > > -        .quad 0x3FE2851B603CB2A5  /* A00 = +5.787484054213406503564e-01 */
> > > -        .quad 0x3FD7F4A44ABBB286  /* A01 = +3.743067483726821853551e-01 */
> > > -        .quad 0xBFBD3EEB67087DE7  /* A02 = -1.142413260026767657385e-01 */
> > > -        .quad 0x3F8864F38329E8BD  /* A03 = +1.191129917173260922836e-02 */
> > > -        .quad 0x3FE437DBE3C34AC1  /* A00 = +6.318187187665317283702e-01 */
> > > -        .quad 0x3FD43F6F789441B5  /* A01 = +3.163717916040938438194e-01 */
> > > -        .quad 0xBFB7D92E7901B9A4  /* A02 = -9.315767721429907277653e-02 */
> > > -        .quad 0x3F8327ED342308E1  /* A03 = +9.353497651663324544136e-03 */
> > > -        .quad 0x3FE5C0977766D55C  /* A00 = +6.797597248138731451661e-01 */
> > > -        .quad 0x3FD10B42A764D8F9  /* A01 = +2.663122782427219115142e-01 */
> > > -        .quad 0xBFB3633351D3D70F  /* A02 = -7.573242900602060456716e-02 */
> > > -        .quad 0x3F7E079E30FF899C  /* A03 = +7.331483779099558922843e-03 */
> > > -        .quad 0x3FE7202CE08A88C4  /* A00 = +7.226776490754436288455e-01 */
> > > -        .quad 0x3FCC973EB5662B01  /* A01 = +2.233656297433626314319e-01 */
> > > -        .quad 0xBFAF70A455F9920B  /* A02 = -6.140626477716545211782e-02 */
> > > -        .quad 0x3F77812411CE99B6  /* A03 = +5.738392731393584730859e-03 */
> > > -        .quad 0x3FE85879424095B1  /* A00 = +7.608000082006382003286e-01 */
> > > -        .quad 0x3FC7E73BD1674D84  /* A01 = +1.867441914060742336190e-01 */
> > > -        .quad 0xBFA96F84E4BF333B  /* A02 = -4.967894832916504993525e-02 */
> > > -        .quad 0x3F72606DDCA6E117  /* A03 = +4.486493251924870105662e-03 */
> > > -        .quad 0x3FE96BFE4957F4DD  /* A00 = +7.944327766887472330737e-01 */
> > > -        .quad 0x3FC3ED4780D25478  /* A01 = +1.556786898624158421711e-01 */
> > > -        .quad 0xBFA489C5F9A56B58  /* A02 = -4.011362717093075458408e-02 */
> > > -        .quad 0x3F6CB5DC17E9AD2A  /* A03 = +3.504686231556104931972e-03 */
> > > -        .quad 0x3FEA5D9CB2F41234  /* A00 = +8.239272589858672724006e-01 */
> > > -        .quad 0x3FC091A758374DCF  /* A01 = +1.294449978582705440555e-01 */
> > > -        .quad 0xBFA08E436D4B5CE0  /* A02 = -3.233538350257858517978e-02 */
> > > -        .quad 0x3F666997AD53E6B7  /* A03 = +2.735897297154145629133e-03 */
> > > -        .quad 0x3FEB3060342CB850  /* A00 = +8.496552485501158713532e-01 */
> > > -        .quad 0x3FBB7D30BBC7DC1B  /* A01 = +1.073790033768634993860e-01 */
> > > -        .quad 0xBF9AA6BA3443D9E3  /* A02 = -2.602663940430173170060e-02 */
> > > -        .quad 0x3F617CA764B7850B  /* A03 = +2.134634914668814050648e-03 */
> > > -        .quad 0x3FEBE759A6A0C7B8  /* A00 = +8.719909910635044170135e-01 */
> > > -        .quad 0x3FB6C10DE6A703FF  /* A01 = +8.888327485239243264115e-02 */
> > > -        .quad 0xBF956C566D8BE1F6  /* A02 = -2.092108768099084498138e-02 */
> > > -        .quad 0x3F5B46D1A4A59CF8  /* A03 = +1.664833764687232917079e-03 */
> > > -        .quad 0x3FEC858494887A04  /* A00 = +8.912985707318630268503e-01 */
> > > -        .quad 0x3FB2CC31F543394D  /* A01 = +7.342827070099140762682e-02 */
> > > -        .quad 0xBF9133477FF69137  /* A02 = -1.679717749142747504343e-02 */
> > > -        .quad 0x3F5544482FBB4DA5  /* A03 = +1.298017973501022466823e-03 */
> > > -        .quad 0x3FED0DB59D0E32E9  /* A00 = +9.079235141267335551518e-01 */
> > > -        .quad 0x3FAF006BAFFC6EF4  /* A01 = +6.055008433597022787787e-02 */
> > > -        .quad 0xBF8B97146FA2B97A  /* A02 = -1.347175565419144252499e-02 */
> > > -        .quad 0x3F5093B01F4CDC69  /* A03 = +1.011774057770665211434e-03 */
> > > -        .quad 0x3FEDB487C3EC457C  /* A00 = +9.282873942012623835751e-01 */
> > > -        .quad 0x3FA7390C09D0BD1D  /* A01 = +4.535710925881118044112e-02 */
> > > -        .quad 0xBF83D9F7C3181106  /* A02 = -9.693084374710735778846e-03 */
> > > -        .quad 0x3F46E34A0A3C0E64  /* A03 = +6.984817050299072134500e-04 */
> > > -        .quad 0x3FEE5FFCB4E6EB00  /* A00 = +9.492171796076434020506e-01 */
> > > -        .quad 0x3F9F4913ED00AADF  /* A01 = +3.055220731782070861526e-02 */
> > > -        .quad 0xBF79670BD0E59B5C  /* A02 = -6.201788097633133961528e-03 */
> > > -        .quad 0x3F3BC998EBCAF96D  /* A03 = +4.240034429975534616304e-04 */
> > > -        .quad 0x3FEEDBA41E9542FE  /* A00 = +9.643116566968215064293e-01 */
> > > -        .quad 0x3F94F5DD18D9C24D  /* A01 = +2.046914543319848858727e-02 */
> > > -        .quad 0xBF7034896AA122B9  /* A02 = -3.956352980886528904192e-03 */
> > > -        .quad 0x3F30DCCB47810B39  /* A03 = +2.573009765038273091199e-04 */
> > > -        .quad 0x3FEF33F2882520ED  /* A00 = +9.750912341196716903724e-01 */
> > > -        .quad 0x3F8BF37F2CF553FF  /* A01 = +1.364802699996836392315e-02 */
> > > -        .quad 0xBF649F6F05A69619  /* A02 = -2.517430152880317534986e-03 */
> > > -        .quad 0x3F247623C950AAC9  /* A03 = +1.561087307505231250044e-04 */
> > > -        .quad 0x3FEF727757751741  /* A00 = +9.827229221489021115943e-01 */
> > > -        .quad 0x3F828E67912C4400  /* A01 = +9.060677640748693306705e-03 */
> > > -        .quad 0xBF5A2F51A806CC2C  /* A02 = -1.598195784123355826789e-03 */
> > > -        .quad 0x3F18D35D7687E613  /* A03 = +9.470231965016282719549e-05 */
> > > -        .quad 0x3FEF9E6325C5942A  /* A00 = +9.880843866091073568469e-01 */
> > > -        .quad 0x3F788AB117618F76  /* A01 = +5.991641772286606867914e-03 */
> > > -        .quad 0xBF5096EAB0B1EA89  /* A02 = -1.012543859160305046233e-03 */
> > > -        .quad 0x3F0E1E50EC4435AB  /* A03 = +5.744633156910412119652e-05 */
> > > -        .quad 0x3FEFBD0784049369  /* A00 = +9.918248728250605994461e-01 */
> > > -        .quad 0x3F702BBD8294035F  /* A01 = +3.947963975634432264028e-03 */
> > > -        .quad 0xBF44FB55E0F00593  /* A02 = -6.403130845457509273330e-04 */
> > > -        .quad 0x3F0244DCD723230A  /* A03 = +3.484534217219031730379e-05 */
> > > -        .quad 0x3FEFD245E2366A43  /* A00 = +9.944180887426415926811e-01 */
> > > -        .quad 0x3F653D82EC088433  /* A01 = +2.592807490387838333795e-03 */
> > > -        .quad 0xBF3A7DF75E013CB8  /* A02 = -4.042366908878036561859e-04 */
> > > -        .quad 0x3EF6298E69F991CD  /* A03 = +2.113564425911141559972e-05 */
> > > -        .quad 0x3FEFE0EAA508BC69  /* A00 = +9.962056372950317539861e-01 */
> > > -        .quad 0x3F5BD0771AF3FDDA  /* A01 = +1.697651208644282514598e-03 */
> > > -        .quad 0xBF30B2E1254DE571  /* A02 = -2.548026725928887099328e-04 */
> > > -        .quad 0x3EEAE28B70EC0256  /* A03 = +1.281973848454955042307e-05 */
> > > -        .quad 0x3FEFEAF5303D7F96  /* A00 = +9.974313680831865536192e-01 */
> > > -        .quad 0x3F5229111365657E  /* A01 = +1.108423877289460134782e-03 */
> > > -        .quad 0xBF250572D04DFE66  /* A02 = -1.603796628408704519168e-04 */
> > > -        .quad 0x3EE04E89BB57C981  /* A03 = +7.775682983689149966743e-06 */
> > > -        .quad 0x3FEFF1CF52F1CF44  /* A00 = +9.982678051005469122003e-01 */
> > > -        .quad 0x3F47A71316147CEB  /* A01 = +7.218211359577819110842e-04 */
> > > -        .quad 0xBF1A6D7604055719  /* A02 = -1.008132248946049582547e-04 */
> > > -        .quad 0x3ED3C8047586A85C  /* A03 = +4.716233739913014633626e-06 */
> > > -        .quad 0x3FEFF6770369EF69  /* A00 = +9.988360468555416149528e-01 */
> > > -        .quad 0x3F3EBB261180FBF0  /* A01 = +4.689186039321105101130e-04 */
> > > -        .quad 0xBF1097754FE19D7F  /* A02 = -6.329206004950480057066e-05 */
> > > -        .quad 0x3EC7FEFF83BCA0A7  /* A03 = +2.860556404988488738366e-06 */
> > > -        .quad 0x3FEFF99D42371AC4  /* A00 = +9.992204945818561334647e-01 */
> > > -        .quad 0x3F33EB2AEC271F59  /* A01 = +3.039340773764907474054e-04 */
> > > -        .quad 0xBF04CF18E0FC0D79  /* A02 = -3.968996690952969588805e-05 */
> > > -        .quad 0x3EBD1BDBD6019BE9  /* A03 = +1.735021065507727833886e-06 */
> > > -        .quad 0x3FEFFBBCA32B0D91  /* A00 = +9.994795977476532700123e-01 */
> > > -        .quad 0x3F29C41E1615110A  /* A01 = +1.965796209707565346710e-04 */
> > > -        .quad 0xBEFA11F93D9DCB5A  /* A02 = -2.486248909101414873235e-05 */
> > > -        .quad 0x3EB1A7CA4546F7A7  /* A03 = +1.052345642723709228769e-06 */
> > > -        .quad 0x3FEFFD298B8E8DE2  /* A00 = +9.996535993308806045121e-01 */
> > > -        .quad 0x3F20A1C42D523C5B  /* A01 = +1.268913244172078754520e-04 */
> > > -        .quad 0xBEF0507A364AFAE4  /* A02 = -1.555859070622834605755e-05 */
> > > -        .quad 0x3EA56ACA17E7CDF4  /* A03 = +6.382806956848098872313e-07 */
> > > -        .quad 0x3FEFFE1DC82BA5A3  /* A00 = +9.997700604991915929176e-01 */
> > > -        .quad 0x3F156E73B90F1769  /* A01 = +8.175450626798714452801e-05 */
> > > -        .quad 0xBEE4663579D0A09F  /* A02 = -9.727122057226747625365e-06 */
> > > -        .quad 0x3E99FAF6FEC5D4C1  /* A03 = +3.871371052824002996020e-07 */
> > > -        .quad 0x3FEFFEF8D0BB5E81  /* A00 = +9.998745037837154514548e-01 */
> > > -        .quad 0x3F06686DA18D39C3  /* A01 = +4.273972098777251447726e-05 */
> > > -        .quad 0xBED46BC298073E90  /* A02 = -4.868731025855742842491e-06 */
> > > -        .quad 0x3E88E42286B9D0FD  /* A03 = +1.854535328530838170114e-07 */
> > > -        .quad 0x3FEFFF8DBC68DDC7  /* A00 = +9.999455146670975791423e-01 */
> > > -        .quad 0x3EF26B2953A80AF0  /* A01 = +1.756534514108903368909e-05 */
> > > -        .quad 0xBEBFC4472D580F83  /* A02 = -1.893443529411295465239e-06 */
> > > -        .quad 0x3E72505B4553D19F  /* A03 = +6.822456673547912277047e-08 */
> > > -        .quad 0x3FEFFFCED1276609  /* A00 = +9.999765477215883935358e-01 */
> > > -        .quad 0x3EDE1A94C7CC58F5  /* A01 = +7.177313020153979672606e-06 */
> > > -        .quad 0xBEA8A2C988744E57  /* A02 = -7.342066660497443762363e-07 */
> > > -        .quad 0x3E5AF30036BBBAF4  /* A03 = +2.509841882843541084885e-08 */
> > > -        .quad 0x3FEFFFEAFE70FCFC  /* A00 = +9.999899835164849370983e-01 */
> > > -        .quad 0x3EC879175E3549F5  /* A01 = +2.917410471128503564412e-06 */
> > > -        .quad 0xBE930E36677D1813  /* A02 = -2.839493400307523115929e-07 */
> > > -        .quad 0x3E43D4005B42D48F  /* A03 = +9.233192745401904898013e-09 */
> > > -        .quad 0x3ff0000000000000
> > > -        .quad 0x0000000000000000
> > > -        .quad 0x0000000000000000
> > > -        .quad 0x0000000000000000
> > > -        .align 32
> > > -        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000           /* _sSignMask        */
> > > -        .align 32
> > > -        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff           /* _sAbsMask         */
> > > -        .align 32
> > > -        .long 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000           /* _iExpMantMask     */
> > > -        .align 32
> > > -        .long 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000           /* _iExpMask         */
> > > -        .align 32
> > > -        .long 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000           /* _iMinIdxOfsMask   */
> > > -        .align 32
> > > -        .long 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000           /* _iMaxIdxMask      */
> > > -        .align 32
> > > -        .type  __svml_stanh_data_internal,@object
> > > -        .size  __svml_stanh_data_internal,.-__svml_stanh_data_internal
> > > +       vcvtps2pd %xmm4, %ymm5
> > > +
> > > +       vextractf128 $1, %ymm4, %xmm4
> > > +       vcvtps2pd %xmm4, %ymm4
> > > +
> > > +       vmovdqu 16(%rcx, %rax), %xmm2
> > > +       vinsertf128 $1, 16(%r11, %rax), %ymm2, %ymm2
> > > +
> > > +       vfmadd213pd %ymm3, %ymm5, %ymm1
> > > +
> > > +       vmovupd 16(%rdx, %rax), %xmm3
> > > +       vinsertf128 $1, 16(%r10, %rax), %ymm3, %ymm3
> > > +
> > > +       vunpcklpd %ymm3, %ymm2, %ymm10
> > > +       vunpckhpd %ymm3, %ymm2, %ymm2
> > > +
> > > +       vfmadd213pd %ymm10, %ymm4, %ymm2
> > > +       vfmadd213pd %ymm6, %ymm4, %ymm2
> > > +       vfmadd213pd %ymm7, %ymm4, %ymm2
> > > +       vcvtpd2ps %ymm2, %xmm2
> > > +
> > > +       vmovdqu (%r9, %rax), %xmm7
> > > +       vinsertf128 $1, (%rdi, %rax), %ymm7, %ymm7
> > > +
> > > +       vmovupd (%r8, %rax), %xmm3
> > > +       vinsertf128 $1, (%rsi, %rax), %ymm3, %ymm3
> > > +
> > > +       vunpckhpd %ymm3, %ymm7, %ymm4
> > > +       vunpcklpd %ymm3, %ymm7, %ymm7
> > > +
> > > +       vfmadd213pd %ymm4, %ymm5, %ymm1
> > > +       vfmadd213pd %ymm7, %ymm5, %ymm1
> > > +
> > > +
> > > +       vcvtpd2ps %ymm1, %xmm1
> > > +       vinsertf128 $1, %xmm2, %ymm1, %ymm1
> > > +
> > > +       vmovmskps %ymm15, %edx
> > > +       vandnps %ymm0, %ymm11, %ymm2
> > > +       testl   %edx, %edx
> > > +       /* Go to special inputs processing branch.  */
> > > +       jne     L(SPECIAL_VALUES_BRANCH)
> > > +       /* Wait until after the branch to write over ymm0.  */
> > > +       vorps   %ymm2, %ymm1, %ymm0
> > > +       /* No stack restoration on the fastpath.  */
> > > +       ret
> > > +
> > > +
> > > +L(SPECIAL_VALUES_BRANCH):
> > > +       pushq   %rbp
> > > +       /* Need callee-saved registers to preserve state across tanhf calls.
> > > +        */
> > > +       pushq   %r12
> > > +       pushq   %r13
> > > +       movq    %rsp, %rbp
> > > +
> > > +       /* Align stack and make room for 2x ymm vectors.  */
> > > +       andq    $-32, %rsp
> > > +       addq    $-64, %rsp
> > > +
> > > +       /* Save all already computed inputs.  */
> > > +       vorps   %ymm2, %ymm1, %ymm1
> > > +       vmovups %ymm1, (%rsp)
> > > +       /* Save original input (ymm0 unchanged up to this point).  */
> > > +       vmovups %ymm0, 32(%rsp)
> > > +
> > > +       vzeroupper
> > > +
> > > +       /* edx has 1s where there was a special value that needs to be handled
> > > +          by a tanhf call.  */
> > > +       movl    %edx, %r13d
> > > +L(SPECIAL_VALUES_LOOP):
> > > +       /* use r12 as index for special value that is saved across calls to
> > > +          tanhf. We technically don't need a callee save register here as offset
> > > +          to rsp is always [0, 28] so we can restore rsp by realigning to 64.
> > > +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> > > +          in the loop.  */
> > > +       xorl    %r12d, %r12d
> > > +       tzcntl  %r13d, %r12d
> > > +
> > > +       /* Scalar math function call to process special input.  */
> > > +       movss   32(%rsp, %r12, 4), %xmm0
> > > +       call    tanhf@PLT
> > > +       /* No good way to avoid the store-forwarding fault this will cause on
> > > +          return. `lfence` avoids the SF fault but at greater cost as it
> > > +          serializes stack/callee save restoration.  */
> > > +       movss   %xmm0, (%rsp, %r12, 4)
> > > +
> > > +       blsr    %r13d, %r13d
> > > +       jnz     L(SPECIAL_VALUES_LOOP)
> > > +
> > > +       /* All results have been written to (%rsp).  */
> > > +       vmovups (%rsp), %ymm0
> > > +       movq    %rbp, %rsp
> > > +       popq    %r13
> > > +       popq    %r12
> > > +       popq    %rbp
> > > +       ret
> > > +END(_ZGVdN8v_tanhf_avx2)
> > > --
> > > 2.25.1
> > >
Noah Goldstein Feb. 1, 2022, 9:54 p.m. UTC | #4
On Tue, Feb 1, 2022 at 3:29 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> Looking into v2, it is still big, with all optimizations applied at
> the same time.
>
> >Optimizations are:
> >    1. Reduce code size (-70 bytes).
> >    2. Reduce rodata size (-32 bytes).
> >    3. Remove register save/restores and stack adjustment from the
>        fast path.
> >    4. Slightly better instruction selection where possible.
> >    5. Remove redundant registers moves.
> >    6. Prefer registers that get smaller instruction encodings.
>
> Can you please further split the patch according to optimization, one
> optimization at a time per patch.

I don't think the changes are independent enough from one another to
do that cleanly. As well I think 1 patch/file is within the norm.

>
> On Tue, Feb 1, 2022 at 12:20 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Tue, Feb 1, 2022 at 2:03 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> > >
> > > Hi Noah,
> > >
> > > We would like to get this patch, but it's too late for 2.35.
> > >
> > > This patch is too big, can you please break this patch into multiple
> > > smaller patches?
> >
> > Yeah, I'll split by file.
> > >
> > > Also, it seems like this patch is incomplete. I got a build error on
> > > the glibc master.
> >
> > > My fault, I separated the rodata for avx2/sse4 into a single file
> > so that the two implementations could share the lookup table.
> >
> > Forgot to commit it :/
> >
> > Will fix in V2.
> > >
> > > ./sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S:77:33: fatal
> > > error: svml_s_tanhf_rodata.S: No such file or directory
> > >  #include "svml_s_tanhf_rodata.S"
> > >                                  ^
> > > compilation terminated.
> > > ../sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S:74:33: fatal
> > > error: svml_s_tanhf_rodata.S: No such file or directory
> > >  #include "svml_s_tanhf_rodata.S"
> > >                                  ^
> > > compilation terminated.
> > >
> > > Thanks,
> > > Sunil
> > >
> > >
> > >
> > >
> > >
> > >
> > > On Sat, Jan 29, 2022 at 8:37 PM Noah Goldstein via Libc-alpha
> > > <libc-alpha@sourceware.org> wrote:
> > > >
> > > > No bug.
> > > >
> > > > Optimizations are:
> > > >     1. Reduce code size
> > > >         avx512: -56 bytes
> > > >         avx2:   -70 bytes
> > > >         sse4:   -106 bytes
> > > >     2. Reduce rodata size
> > > >         avx512: -448 bytes
> > > >         avx2:   -32 bytes
> > > >         sse4:   -4k+ (shares rodata with avx2)
> > > >     3. Remove register save/restores and stack adjustment from the
> > > >        fast path.
> > > >     4. Slightly better instruction selection where possible.
> > > >
> > > > This results in roughly a 15% performance improvement for all
> > > > functions.
> > > >
> > > > Results from geomean of 40 benchtest runs:
> > > >        Function, New Time, Old Time, New / Old
> > > >  _ZGVbN4v_tanhf,     3.28,    3.852,     0.852
> > > >  _ZGVcN8v_tanhf,    3.556,    4.192,     0.848
> > > >  _ZGVdN8v_tanhf,     2.13,    2.486,     0.857
> > > > _ZGVeN16v_tanhf,    0.658,    0.762,     0.864
> > > > ---
> > > >  .../multiarch/svml_s_tanhf16_core_avx512.S    | 585 +++++------
> > > >  .../fpu/multiarch/svml_s_tanhf4_core_sse4.S   | 871 +++--------------
> > > >  .../fpu/multiarch/svml_s_tanhf8_core_avx2.S   | 908 +++---------------
> > > >  3 files changed, 581 insertions(+), 1783 deletions(-)
> > > >
> > > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > > index 8954a5f658..6a2f0c1392 100644
> > > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > > @@ -70,312 +70,323 @@
> > > >   *
> > > >   */
> > > >
> > > > -/* Offsets for data table __svml_stanh_data_internal
> > > > - */
> > > > -#define _sC                            0
> > > > -#define _sP0                           128
> > > > -#define _sP2                           256
> > > > -#define _sP3                           384
> > > > -#define _sP4                           512
> > > > -#define _sP5                           640
> > > > -#define _sP6                           768
> > > > -#define _sP7                           896
> > > > -#define _iExpMantMask_UISA             1024
> > > > -#define _iMinIdxOfsMask_UISA           1088
> > > > -#define _iMaxIdxMask_UISA              1152
> > > > -#define _sSignMask                     1216
> > > > -#define _sAbsMask                      1280
> > > > -#define _iExpMantMask                  1344
> > > > -#define _iExpMask                      1408
> > > > -#define _iMinIdxOfsMask                1472
> > > > -#define _iMaxIdxMask                   1536
> > > > -
> > > >  #include <sysdep.h>
> > > >
> > > > +#define TANHF_DATA(offset)     ((offset) + __svml_stanh_data_internal)
> > > > +
> > > > +/* Offsets for data table __svml_stanh_data_internal.  */
> > > > +#define _iExpMantMask_UISA     0
> > > > +#define _iMinIdxOfsMask_UISA   4
> > > > +#define _iMaxIdxMask_UISA      8
> > > > +#define _iExpMask      12
> > > > +#define _sSignMask     64
> > > > +#define _sC_lo 128
> > > > +#define _sC_hi 192
> > > > +#define _sP7_lo        256
> > > > +#define _sP7_hi        320
> > > > +#define _sP6_lo        384
> > > > +#define _sP6_hi        448
> > > > +#define _sP5_lo        512
> > > > +#define _sP5_hi        576
> > > > +#define _sP4_lo        640
> > > > +#define _sP4_hi        704
> > > > +#define _sP3_lo        768
> > > > +#define _sP3_hi        832
> > > > +#define _sP2_lo        896
> > > > +#define _sP2_hi        960
> > > > +#define _sP0_lo        1024
> > > > +#define _sP0_hi        1088
> > > > +
> > > >          .text
> > > >         .section .text.exex512,"ax",@progbits
> > > >  ENTRY(_ZGVeN16v_tanhf_skx)
> > > > -        pushq     %rbp
> > > > -        cfi_def_cfa_offset(16)
> > > > -        movq      %rsp, %rbp
> > > > -        cfi_def_cfa(6, 16)
> > > > -        cfi_offset(6, -16)
> > > > -        andq      $-64, %rsp
> > > > -        subq      $192, %rsp
> > > > -        vmovaps   %zmm0, %zmm1
> > > > -        vmovups   __svml_stanh_data_internal(%rip), %zmm9
> > > > -        vmovups   _sP6+__svml_stanh_data_internal(%rip), %zmm11
> > > > -        vmovups   _sP5+__svml_stanh_data_internal(%rip), %zmm12
> > > > -        vmovups   _sP4+__svml_stanh_data_internal(%rip), %zmm13
> > > > -        vmovups   _sP3+__svml_stanh_data_internal(%rip), %zmm14
> > > > -        vmovups   _sP2+__svml_stanh_data_internal(%rip), %zmm15
> > > > -        vpternlogd $255, %zmm2, %zmm2, %zmm2
> > > > -        vandps    _sAbsMask+__svml_stanh_data_internal(%rip), %zmm1, %zmm8
> > > > -        vandps    _sSignMask+__svml_stanh_data_internal(%rip), %zmm1, %zmm0
> > > > -
> > > > -/* Here huge arguments, INF and NaNs are filtered out to callout. */
> > > > -        vpandd    _iExpMantMask_UISA+__svml_stanh_data_internal(%rip), %zmm1, %zmm3
> > > > -        vpsubd    _iMinIdxOfsMask_UISA+__svml_stanh_data_internal(%rip), %zmm3, %zmm4
> > > > -        vpcmpd    $2, _iExpMask+__svml_stanh_data_internal(%rip), %zmm3, %k1
> > > > +       /* Here huge arguments, INF and NaNs are filtered out to callout.  */
> > > > +       vpandd  TANHF_DATA(_iExpMantMask_UISA)(%rip) {1to16}, %zmm0, %zmm1
> > > > +       vpsubd  TANHF_DATA(_iMinIdxOfsMask_UISA)(%rip) {1to16}, %zmm1, %zmm2
> > > >
> > > > -/*
> > > > - *  small table specific variables *
> > > > - *  Constant loading
> > > > - */
> > > > -        vpxord    %zmm5, %zmm5, %zmm5
> > > > -
> > > > -/* if VMIN, VMAX is defined for I type */
> > > > -        vpmaxsd   %zmm5, %zmm4, %zmm6
> > > > -        vpminsd   _iMaxIdxMask_UISA+__svml_stanh_data_internal(%rip), %zmm6, %zmm7
> > > > -        vpsrld    $21, %zmm7, %zmm10
> > > > -        vmovups   _sP7+__svml_stanh_data_internal(%rip), %zmm4
> > > > -        vpermt2ps _sC+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm9
> > > > -        vpermt2ps _sP6+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm11
> > > > -        vpermt2ps _sP7+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm4
> > > > -        vpermt2ps _sP5+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm12
> > > > -        vpermt2ps _sP4+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm13
> > > > -        vpermt2ps _sP3+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm14
> > > > -        vpermt2ps _sP2+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm15
> > > > -        vpandnd   %zmm3, %zmm3, %zmm2{%k1}
> > > > -        vptestmd  %zmm2, %zmm2, %k0
> > > > -        vmovups   _sP0+__svml_stanh_data_internal(%rip), %zmm3
> > > > -        vsubps    {rn-sae}, %zmm9, %zmm8, %zmm2
> > > > -        kmovw     %k0, %edx
> > > > -        vfmadd213ps {rn-sae}, %zmm11, %zmm2, %zmm4
> > > > -        vpermt2ps _sP0+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm3
> > > > -        vfmadd213ps {rn-sae}, %zmm12, %zmm2, %zmm4
> > > > -        vfmadd213ps {rn-sae}, %zmm13, %zmm2, %zmm4
> > > > -        vfmadd213ps {rn-sae}, %zmm14, %zmm2, %zmm4
> > > > -        vfmadd213ps {rn-sae}, %zmm15, %zmm2, %zmm4
> > > > -        vfmadd213ps {rn-sae}, %zmm3, %zmm2, %zmm4
> > > > -        vorps     %zmm0, %zmm4, %zmm0
> > > > -        testl     %edx, %edx
> > > > -
> > > > -/* Go to special inputs processing branch */
> > > > -        jne       L(SPECIAL_VALUES_BRANCH)
> > > > -                                # LOE rbx r12 r13 r14 r15 edx zmm0 zmm1
> > > > -
> > > > -/* Restore registers
> > > > - * and exit the function
> > > > - */
> > > > +       /* Clamp arguments to [0, 0x03e00000] in zmm3.  */
> > > > +       vpxord  %zmm3, %zmm3, %zmm3
> > > > +       vpmaxsd %zmm3, %zmm2, %zmm3
> > > > +       vpminsd TANHF_DATA(_iMaxIdxMask_UISA)(%rip) {1to16}, %zmm3, %zmm3
> > > >
> > > > -L(EXIT):
> > > > -        movq      %rbp, %rsp
> > > > -        popq      %rbp
> > > > -        cfi_def_cfa(7, 8)
> > > > -        cfi_restore(6)
> > > > -        ret
> > > > -        cfi_def_cfa(6, 16)
> > > > -        cfi_offset(6, -16)
> > > > -
> > > > -/* Branch to process
> > > > - * special inputs
> > > > - */
> > > > +       /* Setup permute indices in zmm3.  */
> > > > +       vpsrld  $21, %zmm3, %zmm3
> > > > +
> > > > +       /* Store if there are any special cases in k1.  */
> > > > +       vpcmpd  $6, TANHF_DATA(_iExpMask)(%rip) {1to16}, %zmm1, %k1
> > > > +
> > > > +
> > > > +       /* Store absolute values of inputs in zmm1.  */
> > > > +       vmovaps TANHF_DATA(_sSignMask)(%rip), %zmm4
> > > > +       vandnps %zmm0, %zmm4, %zmm1
> > > > +
> > > > +       vmovaps TANHF_DATA(_sC_lo)(%rip), %zmm5
> > > > +       vpermt2ps TANHF_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> > > > +       vsubps  {rn-sae}, %zmm5, %zmm1, %zmm1
> > > > +
> > > > +       vmovaps TANHF_DATA(_sP7_lo)(%rip), %zmm2
> > > > +       vpermt2ps TANHF_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
> > > >
> > > > +       vmovaps TANHF_DATA(_sP6_lo)(%rip), %zmm5
> > > > +       vpermt2ps TANHF_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> > > > +
> > > > +       vmovaps TANHF_DATA(_sP5_lo)(%rip), %zmm6
> > > > +       vpermt2ps TANHF_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> > > > +
> > > > +       vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm2
> > > > +       vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm2
> > > > +
> > > > +       vmovaps TANHF_DATA(_sP4_lo)(%rip), %zmm7
> > > > +       vpermt2ps TANHF_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> > > > +
> > > > +       vmovaps TANHF_DATA(_sP3_lo)(%rip), %zmm8
> > > > +       vpermt2ps TANHF_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> > > > +
> > > > +       vfmadd213ps {rn-sae}, %zmm7, %zmm1, %zmm2
> > > > +       vfmadd213ps {rn-sae}, %zmm8, %zmm1, %zmm2
> > > > +
> > > > +       vmovaps TANHF_DATA(_sP2_lo)(%rip), %zmm9
> > > > +       vpermt2ps TANHF_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> > > > +
> > > > +       vmovaps TANHF_DATA(_sP0_lo)(%rip), %zmm10
> > > > +       vpermt2ps TANHF_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> > > > +
> > > > +       vfmadd213ps {rn-sae}, %zmm9, %zmm1, %zmm2
> > > > +       vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm2
> > > > +
> > > > +       kmovw   %k1, %edx
> > > > +       testl   %edx, %edx
> > > > +
> > > > +       /* Go to special inputs processing branch.  */
> > > > +       jne     L(SPECIAL_VALUES_BRANCH)
> > > > +       /* Wait until after the branch to write over zmm0.  */
> > > > +       vpternlogd $0xec, %zmm4, %zmm2, %zmm0
> > > > +
> > > > +       /* No stack restoration on the fastpath.  */
> > > > +       ret
> > > > +
> > > > +       /* Branch to process special inputs.  */
> > > >  L(SPECIAL_VALUES_BRANCH):
> > > > -        vmovups   %zmm1, 64(%rsp)
> > > > -        vmovups   %zmm0, 128(%rsp)
> > > > -                                # LOE rbx r12 r13 r14 r15 edx zmm0
> > > > -
> > > > -        xorl      %eax, %eax
> > > > -                                # LOE rbx r12 r13 r14 r15 eax edx
> > > > -
> > > > -        vzeroupper
> > > > -        movq      %r12, 16(%rsp)
> > > > -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> > > > -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> > > > -        movl      %eax, %r12d
> > > > -        movq      %r13, 8(%rsp)
> > > > -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> > > > -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> > > > -        movl      %edx, %r13d
> > > > -        movq      %r14, (%rsp)
> > > > -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> > > > -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> > > > -                                # LOE rbx r15 r12d r13d
> > > > -
> > > > -/* Range mask
> > > > - * bits check
> > > > - */
> > > > +       pushq   %rbp
> > > > +       /* Need callee-saved registers to preserve state across tanhf calls.
> > > > +        */
> > > > +       pushq   %r13
> > > > +       pushq   %r12
> > > > +       movq    %rsp, %rbp
> > > >
> > > > -L(RANGEMASK_CHECK):
> > > > -        btl       %r12d, %r13d
> > > > +       /* Align stack and make room for 2x zmm vectors.  */
> > > > +       andq    $-64, %rsp
> > > > +       addq    $-128, %rsp
> > > >
> > > > -/* Call scalar math function */
> > > > -        jc        L(SCALAR_MATH_CALL)
> > > > -                                # LOE rbx r15 r12d r13d
> > > > +       /* Save all already computed results.  */
> > > > +       vpternlogd $0xec, %zmm4, %zmm2, %zmm2
> > > > +       vmovaps %zmm2, (%rsp)
> > > > +       /* Save original input (zmm0 unchanged up to this point).  */
> > > > +       vmovaps %zmm0, 64(%rsp)
> > > >
> > > > -/* Special inputs
> > > > - * processing loop
> > > > - */
> > > > +       vzeroupper
> > > >
> > > > +       /* edx has 1s where there was a special value that needs to be handled
> > > > +          by a tanhf call.  */
> > > > +       movl    %edx, %r13d
> > > >  L(SPECIAL_VALUES_LOOP):
> > > > -        incl      %r12d
> > > > -        cmpl      $16, %r12d
> > > > -
> > > > -/* Check bits in range mask */
> > > > -        jl        L(RANGEMASK_CHECK)
> > > > -                                # LOE rbx r15 r12d r13d
> > > > -
> > > > -        movq      16(%rsp), %r12
> > > > -        cfi_restore(12)
> > > > -        movq      8(%rsp), %r13
> > > > -        cfi_restore(13)
> > > > -        movq      (%rsp), %r14
> > > > -        cfi_restore(14)
> > > > -        vmovups   128(%rsp), %zmm0
> > > > -
> > > > -/* Go to exit */
> > > > -        jmp       L(EXIT)
> > > > -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> > > > -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> > > > -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> > > > -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> > > > -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> > > > -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> > > > -                                # LOE rbx r12 r13 r14 r15 zmm0
> > > > -
> > > > -/* Scalar math fucntion call
> > > > - * to process special input
> > > > - */
> > > > +       /* use r12 as index for special value that is saved across calls to
> > > > +          tanhf. We technically don't need a callee save register here as offset
> > > > +          to rsp is always [0, 56] so we can restore rsp by realigning to 64.
> > > > +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> > > > +          in the loop.  */
> > > > +       xorl    %r12d, %r12d
> > > > +       tzcntl  %r13d, %r12d
> > > >
> > > > -L(SCALAR_MATH_CALL):
> > > > -        movl      %r12d, %r14d
> > > > -        movss     64(%rsp,%r14,4), %xmm0
> > > > -        call      tanhf@PLT
> > > > -                                # LOE rbx r14 r15 r12d r13d xmm0
> > > > +       /* Scalar math function call to process special input.  */
> > > > +       movss   64(%rsp, %r12, 4), %xmm0
> > > > +       call    tanhf@PLT
> > > >
> > > > -        movss     %xmm0, 128(%rsp,%r14,4)
> > > > +       /* No good way to avoid the store-forwarding fault this will cause on
> > > > +          return. `lfence` avoids the SF fault but at greater cost as it
> > > > +          serializes stack/callee save restoration.  */
> > > > +       movss   %xmm0, (%rsp, %r12, 4)
> > > >
> > > > -/* Process special inputs in loop */
> > > > -        jmp       L(SPECIAL_VALUES_LOOP)
> > > > -                                # LOE rbx r15 r12d r13d
> > > > -END(_ZGVeN16v_tanhf_skx)
> > > > +       blsr    %r13d, %r13d
> > > > +       jnz     L(SPECIAL_VALUES_LOOP)
> > > >
> > > > -        .section .rodata, "a"
> > > > -        .align 64
> > > > +       /* All results have been written to (%rsp).  */
> > > > +       vmovaps (%rsp), %zmm0
> > > > +       /* Restore rsp.  */
> > > > +       movq    %rbp, %rsp
> > > > +       /* Restore callee save registers.  */
> > > > +       popq    %r12
> > > > +       popq    %r13
> > > > +       popq    %rbp
> > > > +       ret
> > > > +END(_ZGVeN16v_tanhf_skx)
> > > >
> > > > +       .section .rodata, "a"
> > > > +       .align  16
> > > >  #ifdef __svml_stanh_data_internal_typedef
> > > > -typedef unsigned int VUINT32;
> > > > -typedef struct
> > > > -{
> > > > -        __declspec(align(64)) VUINT32 _sC[32][1];
> > > > -        __declspec(align(64)) VUINT32 _sP0[32][1];
> > > > -        __declspec(align(64)) VUINT32 _sP2[32][1];
> > > > -        __declspec(align(64)) VUINT32 _sP3[32][1];
> > > > -        __declspec(align(64)) VUINT32 _sP4[32][1];
> > > > -        __declspec(align(64)) VUINT32 _sP5[32][1];
> > > > -        __declspec(align(64)) VUINT32 _sP6[32][1];
> > > > -        __declspec(align(64)) VUINT32 _sP7[32][1];
> > > > -        __declspec(align(64)) VUINT32 _iExpMantMask_UISA[16][1];
> > > > -        __declspec(align(64)) VUINT32 _iMinIdxOfsMask_UISA[16][1];
> > > > -        __declspec(align(64)) VUINT32 _iMaxIdxMask_UISA[16][1];
> > > > -        __declspec(align(64)) VUINT32 _sSignMask[16][1];
> > > > -        __declspec(align(64)) VUINT32 _sAbsMask[16][1];
> > > > -        __declspec(align(64)) VUINT32 _iExpMantMask[16][1];
> > > > -        __declspec(align(64)) VUINT32 _iExpMask[16][1];
> > > > -        __declspec(align(64)) VUINT32 _iMinIdxOfsMask[16][1];
> > > > -        __declspec(align(64)) VUINT32 _iMaxIdxMask[16][1];
> > > > -} __svml_stanh_data_internal;
> > > > +       typedef unsigned int VUINT32;
> > > > +       typedef struct
> > > > +       {
> > > > +       __declspec (align(4))VUINT32 _iExpMantMask_UISA[1][1];
> > > > +       __declspec (align(4))VUINT32 _iMinIdxOfsMask_UISA[1][1];
> > > > +       __declspec (align(4))VUINT32 _iMaxIdxMask_UISA[1][1];
> > > > +       __declspec (align(4))VUINT32 _iExpMask[1][1];
> > > > +       __declspec (align(64))VUINT32 _sSignMask[16][1];
> > > > +       __declspec (align(64))VUINT32 _sC_lo[16][1];
> > > > +       __declspec (align(64))VUINT32 _sC_hi[16][1];
> > > > +       __declspec (align(64))VUINT32 _sP7_lo[16][1];
> > > > +       __declspec (align(64))VUINT32 _sP7_hi[16][1];
> > > > +       __declspec (align(64))VUINT32 _sP6_lo[16][1];
> > > > +       __declspec (align(64))VUINT32 _sP6_hi[16][1];
> > > > +       __declspec (align(64))VUINT32 _sP5_lo[16][1];
> > > > +       __declspec (align(64))VUINT32 _sP5_hi[16][1];
> > > > +       __declspec (align(64))VUINT32 _sP4_lo[16][1];
> > > > +       __declspec (align(64))VUINT32 _sP4_hi[16][1];
> > > > +       __declspec (align(64))VUINT32 _sP3_lo[16][1];
> > > > +       __declspec (align(64))VUINT32 _sP3_hi[16][1];
> > > > +       __declspec (align(64))VUINT32 _sP2_lo[16][1];
> > > > +       __declspec (align(64))VUINT32 _sP2_hi[16][1];
> > > > +       __declspec (align(64))VUINT32 _sP0_lo[16][1];
> > > > +       __declspec (align(64))VUINT32 _sP0_hi[16][1];
> > > > +       }__svml_stanh_data_internal;
> > > >  #endif
> > > > +
> > > >  __svml_stanh_data_internal:
> > > > -        /*== _sC ==*/
> > > > -        .long 0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
> > > > -        .long 0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
> > > > -        .long 0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
> > > > -        .long 0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
> > > > -        .long 0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
> > > > -        .long 0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
> > > > -        .long 0x40500000, 0x40700000, 0x40900000, 0x40b00000
> > > > -        .long 0x40d00000, 0x40f00000, 0x41100000, 0x00000000
> > > > -        /*== p0 ==*/
> > > > -        .align 64
> > > > -        .long 0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
> > > > -        .long 0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
> > > > -        .long 0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
> > > > -        .long 0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
> > > > -        .long 0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
> > > > -        .long 0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
> > > > -        .long 0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
> > > > -        .long 0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
> > > > -        /*== p2 ==*/
> > > > -        .align 64
> > > > -        .long 0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
> > > > -        .long 0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
> > > > -        .long 0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
> > > > -        .long 0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
> > > > -        .long 0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
> > > > -        .long 0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
> > > > -        .long 0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
> > > > -        .long 0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
> > > > -        /*== p3 ==*/
> > > > -        .align 64
> > > > -        .long 0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
> > > > -        .long 0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
> > > > -        .long 0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
> > > > -        .long 0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
> > > > -        .long 0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
> > > > -        .long 0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
> > > > -        .long 0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
> > > > -        .long 0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
> > > > -        /*== p4 ==*/
> > > > -        .align 64
> > > > -        .long 0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
> > > > -        .long 0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
> > > > -        .long 0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
> > > > -        .long 0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
> > > > -        .long 0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
> > > > -        .long 0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
> > > > -        .long 0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
> > > > -        .long 0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
> > > > -        /*== p5 ==*/
> > > > -        .align 64
> > > > -        .long 0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
> > > > -        .long 0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
> > > > -        .long 0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
> > > > -        .long 0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
> > > > -        .long 0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
> > > > -        .long 0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
> > > > -        .long 0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
> > > > -        .long 0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
> > > > -        /*== p6 ==*/
> > > > -        .align 64
> > > > -        .long 0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
> > > > -        .long 0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
> > > > -        .long 0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
> > > > -        .long 0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
> > > > -        .long 0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
> > > > -        .long 0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
> > > > -        .long 0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
> > > > -        .long 0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
> > > > -        /*== p7 ==*/
> > > > -        .align 64
> > > > -        .long 0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
> > > > -        .long 0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
> > > > -        .long 0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
> > > > -        .long 0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
> > > > -        .long 0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
> > > > -        .long 0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
> > > > -        .long 0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
> > > > -        .long 0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
> > > > -        .align 64
> > > > -        .long 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000           /* _iExpMantMask_UISA     */
> > > > -        .align 64
> > > > -        .long 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000           /* _iMinIdxOfsMask_UISA   */
> > > > -        .align 64
> > > > -        .long 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000           /* _iMaxIdxMask_UISA      */
> > > > -        .align 64
> > > > -        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000           /* _sSignMask        */
> > > > -        .align 64
> > > > -        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff           /* _sAbsMask         */
> > > > -        .align 64
> > > > -        .long 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000           /* _iExpMantMask     */
> > > > -        .align 64
> > > > -        .long 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000           /* _iExpMask         */
> > > > -        .align 64
> > > > -        .long 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000           /* _iMinIdxOfsMask   */
> > > > -        .align 64
> > > > -        .long 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000           /* _iMaxIdxMask      */
> > > > -        .align 64
> > > > -        .type  __svml_stanh_data_internal,@object
> > > > -        .size  __svml_stanh_data_internal,.-__svml_stanh_data_internal
> > > > +       .align  4
> > > > +       /* _iExpMantMask_UISA.  */
> > > > +       .long   0x7fe00000
> > > > +
> > > > +       .align  4
> > > > +       /* _iMinIdxOfsMask_UISA.  */
> > > > +       .long   0x3d400000
> > > > +
> > > > +       .align  4
> > > > +       /* _iMaxIdxMask_UISA.  */
> > > > +       .long   0x03e00000
> > > > +
> > > > +       .align  4
> > > > +       /* _iExpMask.  */
> > > > +       .long   0x7f000000
> > > > +
> > > > +       .align  64
> > > > +       /* _sSignMask.  */
> > > > +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > +
> > > > +       .align  64
> > > > +       /* _sC_lo.  */
> > > > +       .long   0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
> > > > +       .long   0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
> > > > +       .long   0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
> > > > +       .long   0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
> > > > +
> > > > +       .align  64
> > > > +       /* _sC_hi.  */
> > > > +       .long   0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
> > > > +       .long   0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
> > > > +       .long   0x40500000, 0x40700000, 0x40900000, 0x40b00000
> > > > +       .long   0x40d00000, 0x40f00000, 0x41100000, 0x00000000
> > > > +
> > > > +       .align  64
> > > > +       /* _sP7_lo.  */
> > > > +       .long   0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
> > > > +       .long   0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
> > > > +       .long   0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
> > > > +       .long   0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
> > > > +
> > > > +       .align  64
> > > > +       /* _sP7_hi.  */
> > > > +       .long   0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
> > > > +       .long   0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
> > > > +       .long   0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
> > > > +       .long   0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
> > > > +
> > > > +       .align  64
> > > > +       /* _sP6_lo.  */
> > > > +       .long   0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
> > > > +       .long   0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
> > > > +       .long   0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
> > > > +       .long   0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
> > > > +
> > > > +       .align  64
> > > > +       /* _sP6_hi.  */
> > > > +       .long   0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
> > > > +       .long   0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
> > > > +       .long   0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
> > > > +       .long   0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
> > > > +
> > > > +       .align  64
> > > > +       /* _sP5_lo.  */
> > > > +       .long   0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
> > > > +       .long   0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
> > > > +       .long   0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
> > > > +       .long   0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
> > > > +
> > > > +       .align  64
> > > > +       /* _sP5_hi.  */
> > > > +       .long   0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
> > > > +       .long   0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
> > > > +       .long   0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
> > > > +       .long   0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
> > > > +
> > > > +       .align  64
> > > > +       /* _sP4_lo.  */
> > > > +       .long   0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
> > > > +       .long   0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
> > > > +       .long   0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
> > > > +       .long   0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
> > > > +
> > > > +       .align  64
> > > > +       /* _sP4_hi.  */
> > > > +       .long   0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
> > > > +       .long   0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
> > > > +       .long   0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
> > > > +       .long   0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
> > > > +
> > > > +       .align  64
> > > > +       /* _sP3_lo.  */
> > > > +       .long   0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
> > > > +       .long   0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
> > > > +       .long   0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
> > > > +       .long   0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
> > > > +
> > > > +       .align  64
> > > > +       /* _sP3_hi.  */
> > > > +       .long   0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
> > > > +       .long   0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
> > > > +       .long   0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
> > > > +       .long   0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
> > > > +
> > > > +       .align  64
> > > > +       /* _sP2_lo.  */
> > > > +       .long   0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
> > > > +       .long   0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
> > > > +       .long   0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
> > > > +       .long   0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
> > > > +
> > > > +       .align  64
> > > > +       /* _sP2_hi.  */
> > > > +       .long   0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
> > > > +       .long   0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
> > > > +       .long   0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
> > > > +       .long   0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
> > > > +
> > > > +       .align  64
> > > > +       /* _sP0_lo.  */
> > > > +       .long   0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
> > > > +       .long   0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
> > > > +       .long   0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
> > > > +       .long   0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
> > > > +
> > > > +       .align  64
> > > > +       /* _sP0_hi.  */
> > > > +       .long   0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
> > > > +       .long   0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
> > > > +       .long   0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
> > > > +       .long   0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
> > > > +
> > > > +       .align  64
> > > > +       .type   __svml_stanh_data_internal, @object
> > > > +       .size   __svml_stanh_data_internal, .-__svml_stanh_data_internal
> > > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
> > > > index 50f753ffb3..716b06d640 100644
> > > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
> > > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
> > > > @@ -70,763 +70,154 @@
> > > >   *
> > > >   */
> > > >
> > > > -/* Offsets for data table __svml_stanh_data_internal
> > > > - */
> > > > -#define _dbP                           0
> > > > -#define _sSignMask                     4288
> > > > -#define _sAbsMask                      4304
> > > > -#define _iExpMantMask                  4320
> > > > -#define _iExpMask                      4336
> > > > -#define _iMinIdxOfsMask                4352
> > > > -#define _iMaxIdxMask                   4368
> > > >
> > > >  #include <sysdep.h>
> > > >
> > > > +#define ONLY_DECL_OFFSET
> > > > +#include "svml_s_tanhf_rodata.S"
> > > > +
> > > >          .text
> > > >         .section .text.sse4,"ax",@progbits
> > > >  ENTRY(_ZGVbN4v_tanhf_sse4)
> > > > -        subq      $72, %rsp
> > > > -        cfi_def_cfa_offset(80)
> > > > -        movaps    %xmm0, %xmm5
> > > > +       /* Save copy of input in xmm12.  */
> > > > +       movaps  %xmm0, %xmm12
> > > >
> > > > -/* Here huge arguments, INF and NaNs are filtered out to callout. */
> > > > -        movdqu    _iExpMantMask+__svml_stanh_data_internal(%rip), %xmm9
> > > > -        lea       _dbP+16+__svml_stanh_data_internal(%rip), %r8
> > > > -        pand      %xmm5, %xmm9
> > > > +       /* Here huge arguments, INF and NaNs are filtered out to callout.  */
> > > > +       movdqu  TANHF_DATA(_iExpMantMask)(%rip), %xmm3
> > > > +       pand    %xmm0, %xmm3
> > > >
> > > > -/* if VMIN, VMAX is defined for I type */
> > > > -        pxor      %xmm7, %xmm7
> > > > -        movdqa    %xmm9, %xmm6
> > > > -        psubd     _iMinIdxOfsMask+__svml_stanh_data_internal(%rip), %xmm9
> > > >
> > > > -/*
> > > > - *  small table specific variables *
> > > > - *  Constant loading
> > > > - */
> > > > -        movdqu    _iMaxIdxMask+__svml_stanh_data_internal(%rip), %xmm10
> > > > -        movdqa    %xmm9, %xmm11
> > > > -        movdqa    %xmm9, %xmm8
> > > > -        pcmpgtd   %xmm10, %xmm11
> > > > -        pcmpgtd   %xmm7, %xmm8
> > > > -        movdqa    %xmm11, %xmm14
> > > > -        pand      %xmm8, %xmm9
> > > > -        andps     %xmm11, %xmm10
> > > > -        andnps    %xmm9, %xmm14
> > > > -        orps      %xmm10, %xmm14
> > > > -        psrld     $14, %xmm14
> > > > -        movd      %xmm14, %edx
> > > > -        pshufd    $1, %xmm14, %xmm12
> > > > -        pshufd    $2, %xmm14, %xmm13
> > > > -        movd      %xmm12, %ecx
> > > > -        pshufd    $3, %xmm14, %xmm15
> > > > -        movups    _sAbsMask+__svml_stanh_data_internal(%rip), %xmm3
> > > > -        movslq    %edx, %rdx
> > > > -        andps     %xmm5, %xmm3
> > > > -        movslq    %ecx, %rcx
> > > > -        pcmpgtd   _iExpMask+__svml_stanh_data_internal(%rip), %xmm6
> > > > -        movd      %xmm13, %esi
> > > > -        movups    -16(%rdx,%r8), %xmm2
> > > > -        movaps    %xmm2, %xmm0
> > > > -        movd      %xmm15, %edi
> > > > -        movmskps  %xmm6, %eax
> > > > -        movups    -16(%rcx,%r8), %xmm6
> > > > -        unpcklpd  %xmm6, %xmm0
> > > > -        unpckhpd  %xmm6, %xmm2
> > > > -        cvtps2pd  %xmm3, %xmm6
> > > > -        movhlps   %xmm3, %xmm3
> > > > -        cvtps2pd  %xmm3, %xmm3
> > > > -        movslq    %esi, %rsi
> > > > -        movslq    %edi, %rdi
> > > > -        movups    (%rcx,%r8), %xmm8
> > > > -        movups    (%rdx,%r8), %xmm12
> > > > -        movups    (%rsi,%r8), %xmm13
> > > > -        movaps    %xmm12, %xmm10
> > > > -        movups    (%rdi,%r8), %xmm9
> > > > -        movaps    %xmm13, %xmm11
> > > > -        unpckhpd  %xmm8, %xmm12
> > > > -        unpckhpd  %xmm9, %xmm13
> > > > -        mulpd     %xmm6, %xmm12
> > > > -        mulpd     %xmm3, %xmm13
> > > > -        unpcklpd  %xmm8, %xmm10
> > > > -        unpcklpd  %xmm9, %xmm11
> > > > -        addpd     %xmm10, %xmm12
> > > > -        addpd     %xmm11, %xmm13
> > > > -        mulpd     %xmm6, %xmm12
> > > > -        mulpd     %xmm3, %xmm13
> > > > -        addpd     %xmm2, %xmm12
> > > > -        movups    -16(%rsi,%r8), %xmm1
> > > > -        movups    -16(%rdi,%r8), %xmm7
> > > > -        movaps    %xmm1, %xmm14
> > > > -        unpckhpd  %xmm7, %xmm1
> > > > -        addpd     %xmm1, %xmm13
> > > > -        mulpd     %xmm12, %xmm6
> > > > -        mulpd     %xmm13, %xmm3
> > > > -        addpd     %xmm0, %xmm6
> > > > -        unpcklpd  %xmm7, %xmm14
> > > > -        addpd     %xmm14, %xmm3
> > > > -        cvtpd2ps  %xmm6, %xmm0
> > > > -        cvtpd2ps  %xmm3, %xmm1
> > > > -        movups    _sSignMask+__svml_stanh_data_internal(%rip), %xmm4
> > > > -        movlhps   %xmm1, %xmm0
> > > > -        andps     %xmm5, %xmm4
> > > > -        orps      %xmm4, %xmm0
> > > > -        testl     %eax, %eax
> > > > -
> > > > -/* Go to special inputs processing branch */
> > > > -        jne       L(SPECIAL_VALUES_BRANCH)
> > > > -                                # LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm5
> > > > -
> > > > -/* Restore registers
> > > > - * and exit the function
> > > > - */
> > > > +       /* Selection of arguments between [0, 0x04280000] into xmm3.  */
> > > > +       pxor    %xmm7, %xmm7
> > > > +       /* Save xmm3 for special values check at end.  */
> > > > +       movdqa  %xmm3, %xmm8
> > > > +       psubd   TANHF_DATA(_iMinIdxOfsMask)(%rip), %xmm3
> > > > +       pmaxsd  %xmm7, %xmm3
> > > > +       pminsd  TANHF_DATA(_iMaxIdxMask)(%rip), %xmm3
> > > > +       psrld   $14, %xmm3
> > > >
> > > > -L(EXIT):
> > > > -        addq      $72, %rsp
> > > > -        cfi_def_cfa_offset(8)
> > > > -        ret
> > > > -        cfi_def_cfa_offset(80)
> > > > +       movq    %xmm3, %rcx
> > > > +       movl    %ecx, %edx
> > > > +       shrq    $32, %rcx
> > > >
> > > > -/* Branch to process
> > > > - * special inputs
> > > > - */
> > > > +       /* xmm8 contains mask of special values.  */
> > > > +       pcmpgtd TANHF_DATA(_iExpMask)(%rip), %xmm8
> > > >
> > > > -L(SPECIAL_VALUES_BRANCH):
> > > > -        movups    %xmm5, 32(%rsp)
> > > > -        movups    %xmm0, 48(%rsp)
> > > > -                                # LOE rbx rbp r12 r13 r14 r15 eax
> > > > -
> > > > -        xorl      %edx, %edx
> > > > -        movq      %r12, 16(%rsp)
> > > > -        cfi_offset(12, -64)
> > > > -        movl      %edx, %r12d
> > > > -        movq      %r13, 8(%rsp)
> > > > -        cfi_offset(13, -72)
> > > > -        movl      %eax, %r13d
> > > > -        movq      %r14, (%rsp)
> > > > -        cfi_offset(14, -80)
> > > > -                                # LOE rbx rbp r15 r12d r13d
> > > > -
> > > > -/* Range mask
> > > > - * bits check
> > > > - */
> > > > +       pshufd  $0x0e, %xmm3, %xmm3
> > > > +       movq    %xmm3, %rdi
> > > > +       movl    %edi, %esi
> > > > +       shrq    $32, %rdi
> > > >
> > > > -L(RANGEMASK_CHECK):
> > > > -        btl       %r12d, %r13d
> > > > +       movaps  TANHF_DATA(_sAbsMask)(%rip), %xmm1
> > > > +       andps   %xmm1, %xmm0
> > > >
> > > > -/* Call scalar math function */
> > > > -        jc        L(SCALAR_MATH_CALL)
> > > > -                                # LOE rbx rbp r15 r12d r13d
> > > > +       leaq    TANHF_DATA(_lookupTable)(%rip), %rax
> > > > +       movups  (%rdx, %rax), %xmm2
> > > > +       movups  (%rcx, %rax), %xmm6
> > > >
> > > > -/* Special inputs
> > > > - * processing loop
> > > > - */
> > > > +       movaps  %xmm2, %xmm4
> > > > +       movlhps %xmm6, %xmm4
> > > > +       unpckhpd %xmm6, %xmm2
> > > >
> > > > -L(SPECIAL_VALUES_LOOP):
> > > > -        incl      %r12d
> > > > -        cmpl      $4, %r12d
> > > > -
> > > > -/* Check bits in range mask */
> > > > -        jl        L(RANGEMASK_CHECK)
> > > > -                                # LOE rbx rbp r15 r12d r13d
> > > > -
> > > > -        movq      16(%rsp), %r12
> > > > -        cfi_restore(12)
> > > > -        movq      8(%rsp), %r13
> > > > -        cfi_restore(13)
> > > > -        movq      (%rsp), %r14
> > > > -        cfi_restore(14)
> > > > -        movups    48(%rsp), %xmm0
> > > > -
> > > > -/* Go to exit */
> > > > -        jmp       L(EXIT)
> > > > -        cfi_offset(12, -64)
> > > > -        cfi_offset(13, -72)
> > > > -        cfi_offset(14, -80)
> > > > -                                # LOE rbx rbp r12 r13 r14 r15 xmm0
> > > > -
> > > > -/* Scalar math fucntion call
> > > > - * to process special input
> > > > - */
> > > > +       cvtps2pd %xmm0, %xmm6
> > > > +       movhlps %xmm0, %xmm0
> > > > +       cvtps2pd %xmm0, %xmm0
> > > >
> > > > -L(SCALAR_MATH_CALL):
> > > > -        movl      %r12d, %r14d
> > > > -        movss     32(%rsp,%r14,4), %xmm0
> > > > -        call      tanhf@PLT
> > > > -                                # LOE rbx rbp r14 r15 r12d r13d xmm0
> > > > +       movups  16(%rdx, %rax), %xmm5
> > > > +       movups  16(%rsi, %rax), %xmm13
> > > >
> > > > -        movss     %xmm0, 48(%rsp,%r14,4)
> > > > +       movaps  %xmm5, %xmm10
> > > > +       movaps  %xmm13, %xmm11
> > > >
> > > > -/* Process special inputs in loop */
> > > > -        jmp       L(SPECIAL_VALUES_LOOP)
> > > > -                                # LOE rbx rbp r15 r12d r13d
> > > > -END(_ZGVbN4v_tanhf_sse4)
> > > > +       movups  16(%rcx, %rax), %xmm7
> > > > +       movups  16(%rdi, %rax), %xmm3
> > > > +
> > > > +       unpckhpd %xmm7, %xmm5
> > > > +       unpckhpd %xmm3, %xmm13
> > > > +
> > > > +       mulpd   %xmm6, %xmm5
> > > > +       mulpd   %xmm0, %xmm13
> > > > +
> > > > +       movlhps %xmm7, %xmm10
> > > > +       movlhps %xmm3, %xmm11
> > > > +
> > > > +       addpd   %xmm10, %xmm5
> > > > +       addpd   %xmm11, %xmm13
> > > > +
> > > > +       mulpd   %xmm6, %xmm5
> > > > +       mulpd   %xmm0, %xmm13
> > > > +
> > > > +       addpd   %xmm2, %xmm5
> > > >
> > > > -        .section .rodata, "a"
> > > > -        .align 16
> > > > -
> > > > -#ifdef __svml_stanh_data_internal_typedef
> > > > -typedef unsigned int VUINT32;
> > > > -typedef struct
> > > > -{
> > > > -        __declspec(align(16)) VUINT32 _dbP[(134*4)][2];
> > > > -        __declspec(align(16)) VUINT32 _sSignMask[4][1];
> > > > -        __declspec(align(16)) VUINT32 _sAbsMask[4][1];
> > > > -        __declspec(align(16)) VUINT32 _iExpMantMask[4][1];
> > > > -        __declspec(align(16)) VUINT32 _iExpMask[4][1];
> > > > -        __declspec(align(16)) VUINT32 _iMinIdxOfsMask[4][1];
> > > > -        __declspec(align(16)) VUINT32 _iMaxIdxMask[4][1];
> > > > -} __svml_stanh_data_internal;
> > > > -#endif
> > > > -__svml_stanh_data_internal:
> > > > -        /* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500]. */
> > > > -        .quad 0x0000000000000000  /* A00 = +0.000000000000000000000e-01 */
> > > > -        .quad 0x3FF00000022C70EB  /* A01 = +1.000000008097283510367e+00 */
> > > > -        .quad 0xBED00E878CFFA194  /* A02 = -3.828228912518614443549e-06 */
> > > > -        .quad 0xBFD551766D0607A9  /* A03 = -3.330970825846813476723e-01 */
> > > > -        .quad 0xBE53D60CE3E4C297  /* A00 = -1.847383956330407336230e-08 */
> > > > -        .quad 0x3FF000024177CF5C  /* A01 = +1.000002151235967140508e+00 */
> > > > -        .quad 0xBF1758BC94A51A25  /* A02 = -8.906031613262943753568e-05 */
> > > > -        .quad 0xBFD53EAE67E0D4F0  /* A03 = -3.319507612644221339337e-01 */
> > > > -        .quad 0xBE5A9E47EF32D6FE  /* A00 = -2.479020984039698285657e-08 */
> > > > -        .quad 0x3FF00002DA983057  /* A01 = +1.000002721676556793895e+00 */
> > > > -        .quad 0xBF1BD953509E94AA  /* A02 = -1.062352277175377670507e-04 */
> > > > -        .quad 0xBFD53BDB562EEDD5  /* A03 = -3.317783681520414806876e-01 */
> > > > -        .quad 0xBE6191BBE496D294  /* A00 = -3.272532162914017685901e-08 */
> > > > -        .quad 0x3FF0000390492017  /* A01 = +1.000003398528866105366e+00 */
> > > > -        .quad 0xBF20727E814A57CE  /* A02 = -1.254825043772153972919e-04 */
> > > > -        .quad 0xBFD538DE060A6F22  /* A03 = -3.315959033004550748913e-01 */
> > > > -        .quad 0xBE66DAFA2A893A25  /* A00 = -4.257146219278012568149e-08 */
> > > > -        .quad 0x3FF0000465E08CD1  /* A01 = +1.000004194219219266770e+00 */
> > > > -        .quad 0xBF2341C765EF91B6  /* A02 = -1.469188600530365522261e-04 */
> > > > -        .quad 0xBFD535B6841FAF9E  /* A03 = -3.314033785124993469751e-01 */
> > > > -        .quad 0xBE6D5794E361E964  /* A00 = -5.465394929765249413434e-08 */
> > > > -        .quad 0x3FF000055EE2A0CB  /* A01 = +1.000005121846742950353e+00 */
> > > > -        .quad 0xBF265E6C77E66C8B  /* A02 = -1.706607253709506650304e-04 */
> > > > -        .quad 0xBFD53264DDCCEDA6  /* A03 = -3.312008062382240103361e-01 */
> > > > -        .quad 0xBE729C844D374A6E  /* A00 = -6.933284462462096107184e-08 */
> > > > -        .quad 0x3FF000067F019093  /* A01 = +1.000006195180536350264e+00 */
> > > > -        .quad 0xBF29CC5348D6DCE5  /* A02 = -1.968242326435338705130e-04 */
> > > > -        .quad 0xBFD52EE92121ED35  /* A03 = -3.309881995734998416658e-01 */
> > > > -        .quad 0xBE775AEA17EAA872  /* A00 = -8.700465590574974405858e-08 */
> > > > -        .quad 0x3FF00007CA1D66B8  /* A01 = +1.000007428656699559610e+00 */
> > > > -        .quad 0xBF2D8F5EB98A2637  /* A02 = -2.255252009216044881395e-04 */
> > > > -        .quad 0xBFD52B435CDF9128  /* A03 = -3.307655722585587376727e-01 */
> > > > -        .quad 0xBE7D04DA28C343F0  /* A00 = -1.081040272327705484794e-07 */
> > > > -        .quad 0x3FF000094443CCF5  /* A01 = +1.000008837375216730337e+00 */
> > > > -        .quad 0xBF30D5B76C947AE5  /* A02 = -2.568791210978817814332e-04 */
> > > > -        .quad 0xBFD52773A0776FAD  /* A03 = -3.305329386764651045105e-01 */
> > > > -        .quad 0xBE81DD77A12C51C7  /* A00 = -1.331054169875768625701e-07 */
> > > > -        .quad 0x3FF0000AF1AFD2DA  /* A01 = +1.000010437096696680470e+00 */
> > > > -        .quad 0xBF331230624C1680  /* A02 = -2.910011410651516805537e-04 */
> > > > -        .quad 0xBFD52379FC0B61DF  /* A03 = -3.302903138515186909352e-01 */
> > > > -        .quad 0xBE85D04EEEB3C435  /* A00 = -1.625247628488202841012e-07 */
> > > > -        .quad 0x3FF0000CD6C9B1F2  /* A01 = +1.000012244238970726684e+00 */
> > > > -        .quad 0xBF357F0742FADDD4  /* A02 = -3.280060509313874068243e-04 */
> > > > -        .quad 0xBFD51F56806D0E81  /* A03 = -3.300377134475880880338e-01 */
> > > > -        .quad 0xBE8A6E289B59681B  /* A00 = -1.969211333326924655065e-07 */
> > > > -        .quad 0x3FF0000EF8268F72  /* A01 = +1.000014275873550406715e+00 */
> > > > -        .quad 0xBF381E277A1B747A  /* A02 = -3.680082682942575423093e-04 */
> > > > -        .quad 0xBFD51B093F1D6FD4  /* A03 = -3.297751537663746734808e-01 */
> > > > -        .quad 0xBE8FCBC40EE9ABD5  /* A00 = -2.368983653301529373887e-07 */
> > > > -        .quad 0x3FF000115A883B6C  /* A01 = +1.000016549721943981410e+00 */
> > > > -        .quad 0xBF3AF17AC974B3D9  /* A02 = -4.111218235774406434303e-04 */
> > > > -        .quad 0xBFD516924A4C549C  /* A03 = -3.295026517456081105450e-01 */
> > > > -        .quad 0xBE92FFBC60A3F956  /* A00 = -2.831066871072026054144e-07 */
> > > > -        .quad 0x3FF0001402DCED8A  /* A01 = +1.000019084151832604590e+00 */
> > > > -        .quad 0xBF3DFAE9390C4801  /* A02 = -4.574603454311488280083e-04 */
> > > > -        .quad 0xBFD511F1B4D7DC3A  /* A03 = -3.292202249571719585575e-01 */
> > > > -        .quad 0xBE9690A22F96D5AD  /* A00 = -3.362443262393081632612e-07 */
> > > > -        .quad 0x3FF00016F63EFF5D  /* A01 = +1.000021898173108825247e+00 */
> > > > -        .quad 0xBF409E2C839605BB  /* A02 = -5.071370461992499986334e-04 */
> > > > -        .quad 0xBFD50D27924BEE00  /* A03 = -3.289278916051614487515e-01 */
> > > > -        .quad 0xBE9AA56C65E72A73  /* A00 = -3.970591019557469835586e-07 */
> > > > -        .quad 0x3FF0001A39F4A43E  /* A01 = +1.000025011433776978009e+00 */
> > > > -        .quad 0xBF425BD74C3D6667  /* A02 = -5.602647074553602319844e-04 */
> > > > -        .quad 0xBFD50833F6E1ABA2  /* A03 = -3.286256705238718156536e-01 */
> > > > -        .quad 0xBE9F4BD4FF1A83B0  /* A00 = -4.663500013744687071912e-07 */
> > > > -        .quad 0x3FF0001DD36F9EC2  /* A01 = +1.000028444215715683896e+00 */
> > > > -        .quad 0xBF44376634149405  /* A02 = -6.169556656102642569831e-04 */
> > > > -        .quad 0xBFD50316F77EDEE5  /* A03 = -3.283135811757190158922e-01 */
> > > > -        .quad 0xBEA3B625387BB079  /* A00 = -5.874486399249461304297e-07 */
> > > > -        .quad 0x3FF00023E14CFBA9  /* A01 = +1.000034217911642153709e+00 */
> > > > -        .quad 0xBF47392F923218D2  /* A02 = -7.087213783883111826306e-04 */
> > > > -        .quad 0xBFD4FB1FACDEB938  /* A03 = -3.278273761924483942209e-01 */
> > > > -        .quad 0xBEAA6E24F543500A  /* A00 = -7.876828740601738750574e-07 */
> > > > -        .quad 0x3FF0002D5C6E8412  /* A01 = +1.000043259679163742959e+00 */
> > > > -        .quad 0xBF4BAF02BD7FDD70  /* A02 = -8.448375110664940040861e-04 */
> > > > -        .quad 0xBFD4EFEE6527A7DE  /* A03 = -3.271442401734229177279e-01 */
> > > > -        .quad 0xBEB16E3EBE2157D0  /* A00 = -1.038947396133402500647e-06 */
> > > > -        .quad 0x3FF00038990FEE2F  /* A01 = +1.000053975962952312884e+00 */
> > > > -        .quad 0xBF50569481C574CB  /* A02 = -9.972048056490652716971e-04 */
> > > > -        .quad 0xBFD4E419278DA2B4  /* A03 = -3.264220129263251113372e-01 */
> > > > -        .quad 0xBEB6A7B6723165D4  /* A00 = -1.350350836279403750524e-06 */
> > > > -        .quad 0x3FF00045CAB4158E  /* A01 = +1.000066558657042303793e+00 */
> > > > -        .quad 0xBF531D7C9C849108  /* A02 = -1.166698160951775212202e-03 */
> > > > -        .quad 0xBFD4D7A0BB33B152  /* A03 = -3.256608799117844954552e-01 */
> > > > -        .quad 0xBEBD0EE2A8654AFD  /* A00 = -1.732000471561702711532e-06 */
> > > > -        .quad 0x3FF00055276F18D6  /* A01 = +1.000081209219890521211e+00 */
> > > > -        .quad 0xBF562FDBA3FB6C6C  /* A02 = -1.354183666925102939860e-03 */
> > > > -        .quad 0xBFD4CA85F1B93DB2  /* A03 = -3.248610363561638125773e-01 */
> > > > -        .quad 0xBEC269D4036A207E  /* A00 = -2.195047297096822741730e-06 */
> > > > -        .quad 0x3FF00066E7DA6E4E  /* A01 = +1.000098138500919997540e+00 */
> > > > -        .quad 0xBF5991499FC36B3A  /* A02 = -1.560518167983372759405e-03 */
> > > > -        .quad 0xBFD4BCC9A72283D6  /* A03 = -3.240226871658341556426e-01 */
> > > > -        .quad 0xBEC7154B6C09CFE1  /* A00 = -2.751729738565190291276e-06 */
> > > > -        .quad 0x3FF0007B47086B80  /* A01 = +1.000117566559055148900e+00 */
> > > > -        .quad 0xBF5D455433B4F8F4  /* A02 = -1.786548832412968197680e-03 */
> > > > -        .quad 0xBFD4AE6CC1BFE145  /* A03 = -3.231460468373550942722e-01 */
> > > > -        .quad 0xBECCA68CC64A0F8A  /* A00 = -3.415415948561670285790e-06 */
> > > > -        .quad 0x3FF00092827742F7  /* A01 = +1.000139722473418535387e+00 */
> > > > -        .quad 0xBF60A7BF15A527AF  /* A02 = -2.033112728132522705610e-03 */
> > > > -        .quad 0xBFD49F703214084C  /* A03 = -3.222313393636155876010e-01 */
> > > > -        .quad 0xBED19E68676B241B  /* A00 = -4.200644630977303616698e-06 */
> > > > -        .quad 0x3FF000ACDA037B26  /* A01 = +1.000164844146362863597e+00 */
> > > > -        .quad 0xBF62D99F836A02F8  /* A02 = -2.301036405072284102280e-03 */
> > > > -        .quad 0xBFD48FD4F2B91B28  /* A03 = -3.212787981359945810311e-01 */
> > > > -        .quad 0xBED57CF4B0C7AA54  /* A00 = -5.123164339408145209103e-06 */
> > > > -        .quad 0x3FF000CA8FD9E1A1  /* A01 = +1.000193178099017865534e+00 */
> > > > -        .quad 0xBF653A014548E686  /* A02 = -2.591135484433962181405e-03 */
> > > > -        .quad 0xBFD47F9C0844B38F  /* A03 = -3.202886658426046806447e-01 */
> > > > -        .quad 0xBEDA012B1B1A41E2  /* A00 = -6.199971197454598722328e-06 */
> > > > -        .quad 0x3FF000EBE868FDF4  /* A01 = +1.000224979259539459520e+00 */
> > > > -        .quad 0xBF67CA9427E0A544  /* A02 = -2.904214255086275467410e-03 */
> > > > -        .quad 0xBFD46EC6812ADB37  /* A03 = -3.192611943626845749655e-01 */
> > > > -        .quad 0xBEDF3EAC5BF12194  /* A00 = -7.449344990702664567927e-06 */
> > > > -        .quad 0x3FF001112A520784  /* A01 = +1.000260510744255704196e+00 */
> > > > -        .quad 0xBF6A8D01ABDA4DC4  /* A02 = -3.241065277345108255891e-03 */
> > > > -        .quad 0xBFD45D55759FFA4A  /* A03 = -3.181966446572103146551e-01 */
> > > > -        .quad 0xBEE2A541BC274267  /* A00 = -8.890883582164319970972e-06 */
> > > > -        .quad 0x3FF0013A9E5961F2  /* A01 = +1.000300043631906721231e+00 */
> > > > -        .quad 0xBF6D82ECD080C540  /* A02 = -3.602468994380686462264e-03 */
> > > > -        .quad 0xBFD44B4A0779C0AD  /* A03 = -3.170952866557950611259e-01 */
> > > > -        .quad 0xBEE61D97609A27F4  /* A00 = -1.054553560499505625520e-05 */
> > > > -        .quad 0x3FF001688F56A3AF  /* A01 = +1.000343856731187974773e+00 */
> > > > -        .quad 0xBF7056F8EFB683EC  /* A02 = -3.989193351487490407647e-03 */
> > > > -        .quad 0xBFD438A5620F0F74  /* A03 = -3.159573991399533543500e-01 */
> > > > -        .quad 0xBEEA145429EDD370  /* A00 = -1.243563138839952927732e-05 */
> > > > -        .quad 0x3FF0019B4A242A67  /* A01 = +1.000392236341804297339e+00 */
> > > > -        .quad 0xBF7207D31CA78D9B  /* A02 = -4.401993423445739288258e-03 */
> > > > -        .quad 0xBFD42568BA16E7CD  /* A03 = -3.147832696228050619602e-01 */
> > > > -        .quad 0xBEEE96370D52680F  /* A00 = -1.458491207477835326165e-05 */
> > > > -        .quad 0x3FF001D31D8E4115  /* A01 = +1.000445476009251821736e+00 */
> > > > -        .quad 0xBF73D4CC11EDC094  /* A02 = -4.841611050196221316400e-03 */
> > > > -        .quad 0xBFD411954D8664E7  /* A03 = -3.135731942252974469021e-01 */
> > > > -        .quad 0xBEF338C046215EF8  /* A00 = -1.833122622260562810219e-05 */
> > > > -        .quad 0x3FF00230C32C2EC1  /* A01 = +1.000534784691737621998e+00 */
> > > > -        .quad 0xBF76BD019BCC5DAF  /* A02 = -5.551344188254799492943e-03 */
> > > > -        .quad 0xBFD3F2C7156DC21E  /* A03 = -3.116929730668135389848e-01 */
> > > > -        .quad 0xBEF9B15EAE411EAE  /* A00 = -2.450261207822986676092e-05 */
> > > > -        .quad 0x3FF002C2DF057A4D  /* A01 = +1.000674124886830940184e+00 */
> > > > -        .quad 0xBF7B08CCD9AC1E30  /* A02 = -6.600189396301511801646e-03 */
> > > > -        .quad 0xBFD3C7A7A114FED8  /* A03 = -3.090609620157755976777e-01 */
> > > > -        .quad 0xBF00E36483C373B3  /* A00 = -3.221178528332122595812e-05 */
> > > > -        .quad 0x3FF0036F419480D7  /* A01 = +1.000838524028997644777e+00 */
> > > > -        .quad 0xBF7FD255D1777007  /* A02 = -7.768950679260206403087e-03 */
> > > > -        .quad 0xBFD39A453911D6CE  /* A03 = -3.062909180947429588215e-01 */
> > > > -        .quad 0xBF05DFA04DD12059  /* A00 = -4.172046622180685472624e-05 */
> > > > -        .quad 0x3FF00438B2A03D8D  /* A01 = +1.001030633695197069599e+00 */
> > > > -        .quad 0xBF828F8DBB4A9D10  /* A02 = -9.062869337255224921890e-03 */
> > > > -        .quad 0xBFD36AAB704697D9  /* A03 = -3.033856007044711255993e-01 */
> > > > -        .quad 0xBF0BF3E0C647DEFB  /* A00 = -5.331544597092331081714e-05 */
> > > > -        .quad 0x3FF005221063D36D  /* A01 = +1.001253189109060359741e+00 */
> > > > -        .quad 0xBF857A2CB3C96102  /* A02 = -1.048693584122917590862e-02 */
> > > > -        .quad 0xBFD338E65BBB4FEC  /* A03 = -3.003478904549854444639e-01 */
> > > > -        .quad 0xBF11A506ED7C9D31  /* A00 = -6.730894835681591541979e-05 */
> > > > -        .quad 0x3FF0062E4D0EA92A  /* A01 = +1.001508999829250345925e+00 */
> > > > -        .quad 0xBF88AB82C2761AF3  /* A02 = -1.204588085125866091241e-02 */
> > > > -        .quad 0xBFD305028D6BD206  /* A03 = -2.971807843271395688234e-01 */
> > > > -        .quad 0xBF1607C0922D9BF1  /* A00 = -8.403885708006799337092e-05 */
> > > > -        .quad 0x3FF007606C341961  /* A01 = +1.001800940198869449560e+00 */
> > > > -        .quad 0xBF8C25E6DA487BCF  /* A02 = -1.374416688582682892494e-02 */
> > > > -        .quad 0xBFD2CF0D0EE8F7B5  /* A03 = -2.938873906713255768075e-01 */
> > > > -        .quad 0xBF1B3A8480A0A16D  /* A00 = -1.038688061788578038307e-04 */
> > > > -        .quad 0x3FF008BB802D02D6  /* A01 = +1.002131939589323561535e+00 */
> > > > -        .quad 0xBF8FEB8AE99FD100  /* A02 = -1.558598065819483124983e-02 */
> > > > -        .quad 0xBFD297135BD0911B  /* A03 = -2.904709240558688843059e-01 */
> > > > -        .quad 0xBF20ABB9BDB75C65  /* A00 = -1.271881327357976163798e-04 */
> > > > -        .quad 0x3FF00A42A76D8CD1  /* A01 = +1.002504972472525901495e+00 */
> > > > -        .quad 0xBF91FF3D752BB9E6  /* A02 = -1.757522609380570560722e-02 */
> > > > -        .quad 0xBFD25D235C1F88B4  /* A03 = -2.869346999779154305799e-01 */
> > > > -        .quad 0xBF243D3254425461  /* A00 = -1.544116913733432829448e-04 */
> > > > -        .quad 0x3FF00BF909D1795E  /* A01 = +1.002923048355647051011e+00 */
> > > > -        .quad 0xBF94304E04D44942  /* A02 = -1.971551804042204897316e-02 */
> > > > -        .quad 0xBFD2214B5E61CFA6  /* A03 = -2.832821294498394371075e-01 */
> > > > -        .quad 0xBF286070011B61CE  /* A00 = -1.859795307186510085994e-04 */
> > > > -        .quad 0x3FF00DE1D5E1627E  /* A01 = +1.003389201612804537689e+00 */
> > > > -        .quad 0xBF9689D5F4163F59  /* A02 = -2.201017668045266231780e-02 */
> > > > -        .quad 0xBFD1E39A11C3B42C  /* A03 = -2.795167134743816728104e-01 */
> > > > -        .quad 0xBF2D250B366A79E8  /* A00 = -2.223564326486314902259e-04 */
> > > > -        .quad 0x3FF010003E134001  /* A01 = +1.003906481248123094829e+00 */
> > > > -        .quad 0xBF990C9FF91F6F81  /* A02 = -2.446222265267250853271e-02 */
> > > > -        .quad 0xBFD1A41E80084CDC  /* A03 = -2.756420374218586655246e-01 */
> > > > -        .quad 0xBF314DB5DDC2A30E  /* A00 = -2.640313157465248123865e-04 */
> > > > -        .quad 0x3FF012577608921B  /* A01 = +1.004477940624503018441e+00 */
> > > > -        .quad 0xBF9BB9626875B0C9  /* A02 = -2.707437288829409385849e-02 */
> > > > -        .quad 0xBFD162E80768A9D0  /* A03 = -2.716617653228725615122e-01 */
> > > > -        .quad 0xBF346A6133808864  /* A00 = -3.115165050094957730625e-04 */
> > > > -        .quad 0x3FF014EAAFCC88A3  /* A01 = +1.005106627192198898157e+00 */
> > > > -        .quad 0xBF9E90BEF9BF7419  /* A02 = -2.984903716411588595059e-02 */
> > > > -        .quad 0xBFD12006545F7FAD  /* A03 = -2.675796340899932457269e-01 */
> > > > -        .quad 0xBF37F180DC3848EA  /* A00 = -3.653468704395550778821e-04 */
> > > > -        .quad 0x3FF017BD19147861  /* A01 = +1.005795572250939295955e+00 */
> > > > -        .quad 0xBFA0C9A14C702E07  /* A02 = -3.278831537326359207851e-02 */
> > > > -        .quad 0xBFD0DB895B650092  /* A03 = -2.633994476818851682154e-01 */
> > > > -        .quad 0xBF3BEC6AAC6D7635  /* A00 = -4.260788377246944457107e-04 */
> > > > -        .quad 0x3FF01AD1D884E719  /* A01 = +1.006547780778822565040e+00 */
> > > > -        .quad 0xBFA260B2A1B1434A  /* A02 = -3.589399551186163439542e-02 */
> > > > -        .quad 0xBFD09581529E93D6  /* A03 = -2.591250712233067465817e-01 */
> > > > -        .quad 0xBF4164E26167882B  /* A00 = -5.308251737086202562063e-04 */
> > > > -        .quad 0x3FF01FEF14B62B81  /* A01 = +1.007796364693348545316e+00 */
> > > > -        .quad 0xBFA4EB014538AA42  /* A02 = -4.085544557559163403315e-02 */
> > > > -        .quad 0xBFD029D36FEAF41F  /* A03 = -2.525528519580024222613e-01 */
> > > > -        .quad 0xBF46F6FFF4E53DC8  /* A00 = -7.008313930700277652464e-04 */
> > > > -        .quad 0x3FF027CBB51CBBA0  /* A01 = +1.009715754956893363214e+00 */
> > > > -        .quad 0xBFA89DEC9FEC112E  /* A02 = -4.807986690687680864098e-02 */
> > > > -        .quad 0xBFCF2A99464D0DB4  /* A03 = -2.434875100390009317053e-01 */
> > > > -        .quad 0xBF4DCC9C4F66A4D9  /* A00 = -9.094012482836712945103e-04 */
> > > > -        .quad 0x3FF030E7CFCCD583  /* A01 = +1.011939822882909068014e+00 */
> > > > -        .quad 0xBFACAA3B95814081  /* A02 = -5.598627281199331645611e-02 */
> > > > -        .quad 0xBFCDF78F156BE7CF  /* A03 = -2.341173987004467604844e-01 */
> > > > -        .quad 0xBF5308ED74E5C7A6  /* A00 = -1.161796466103906435435e-03 */
> > > > -        .quad 0x3FF03B5986412ECB  /* A01 = +1.014489674026594512313e+00 */
> > > > -        .quad 0xBFB087EBA88DCC3F  /* A02 = -6.457398285947223148806e-02 */
> > > > -        .quad 0xBFCCBB9BD134862F  /* A03 = -2.244753619680052991736e-01 */
> > > > -        .quad 0xBF57FA23C00DF4B5  /* A00 = -1.463446533505758208674e-03 */
> > > > -        .quad 0x3FF0473558A1BCC0  /* A01 = +1.017384859292903342975e+00 */
> > > > -        .quad 0xBFB2E702BC6360EF  /* A02 = -7.383744334527241048871e-02 */
> > > > -        .quad 0xBFCB77D546379288  /* A03 = -2.145945160729250122955e-01 */
> > > > -        .quad 0xBF5DD12971557F71  /* A00 = -1.819887610814388068450e-03 */
> > > > -        .quad 0x3FF0548DDF5000A8  /* A01 = +1.020643112482540360020e+00 */
> > > > -        .quad 0xBFB571B63DA186E1  /* A02 = -8.376635555898871710045e-02 */
> > > > -        .quad 0xBFCA2D5202605148  /* A03 = -2.045080672838912594358e-01 */
> > > > -        .quad 0xBF6252B1AD5D4F17  /* A00 = -2.236697221556737096709e-03 */
> > > > -        .quad 0x3FF063738A910BF7  /* A01 = +1.024280110622155737232e+00 */
> > > > -        .quad 0xBFB8270C8E6B601B  /* A02 = -9.434584118878357184013e-02 */
> > > > -        .quad 0xBFC8DD27D950A07E  /* A03 = -1.942491351230763441116e-01 */
> > > > -        .quad 0xBF66470C91730CFC  /* A00 = -2.719425723258004842786e-03 */
> > > > -        .quad 0x3FF073F468FCF331  /* A01 = +1.028309259519300633556e+00 */
> > > > -        .quad 0xBFBB05C2952191E4  /* A02 = -1.055566419686964629854e-01 */
> > > > -        .quad 0xBFC7886A770DE2BD  /* A03 = -1.838505822486435070662e-01 */
> > > > -        .quad 0xBF6AD114AC8E98EC  /* A00 = -3.273525599485007861467e-03 */
> > > > -        .quad 0x3FF0861BF53E5226  /* A01 = +1.032741506559554434119e+00 */
> > > > -        .quad 0xBFBE0C4F9B461507  /* A02 = -1.173753503881763554650e-01 */
> > > > -        .quad 0xBFC6302A037CDE3A  /* A03 = -1.733448521642786954722e-01 */
> > > > -        .quad 0xBF6FFBDE2A6C2AF8  /* A00 = -3.904279630096648551207e-03 */
> > > > -        .quad 0x3FF099F2EB8E7DA3  /* A01 = +1.037585182326304034106e+00 */
> > > > -        .quad 0xBFC09C74D192DDF0  /* A02 = -1.297746680554463516444e-01 */
> > > > -        .quad 0xBFC4D571D8E3079F  /* A03 = -1.627638157861470424859e-01 */
> > > > -        .quad 0xBF72E8FDC0B952AA  /* A00 = -4.616728994353872309042e-03 */
> > > > -        .quad 0x3FF0AF7F273C9533  /* A01 = +1.042845872181101141152e+00 */
> > > > -        .quad 0xBFC244C512736F10  /* A02 = -1.427236881344176033792e-01 */
> > > > -        .quad 0xBFC379474F58B902  /* A03 = -1.521386277613104298645e-01 */
> > > > -        .quad 0xBF762EABAF17395B  /* A00 = -5.415602341101023557701e-03 */
> > > > -        .quad 0x3FF0C6C3886F63FB  /* A01 = +1.048526318502125631582e+00 */
> > > > -        .quad 0xBFC3FDF9918EA12A  /* A02 = -1.561881981590514389957e-01 */
> > > > -        .quad 0xBFC21CA89ECAB895  /* A03 = -1.414995932913753196036e-01 */
> > > > -        .quad 0xBF79D387CE5B2BAE  /* A00 = -6.305246822828998107258e-03 */
> > > > -        .quad 0x3FF0DFBFE2346376  /* A01 = +1.054626353847394337748e+00 */
> > > > -        .quad 0xBFC5C6DA43602620  /* A02 = -1.701309994680721970894e-01 */
> > > > -        .quad 0xBFC0C08BD8DB6631  /* A03 = -1.308760460731704100557e-01 */
> > > > -        .quad 0xBF7DDBA8E8DA9060  /* A00 = -7.289562037531366334164e-03 */
> > > > -        .quad 0x3FF0FA70F0D1B464  /* A01 = +1.061142864894713433443e+00 */
> > > > -        .quad 0xBFC79E18D92BAA7C  /* A02 = -1.845122394946264732241e-01 */
> > > > -        .quad 0xBFBECBBBF74C2669  /* A03 = -1.202962378266875381749e-01 */
> > > > -        .quad 0xBF81254E76EA25DA  /* A00 = -8.371937755572145950511e-03 */
> > > > -        .quad 0x3FF116D05835EBD0  /* A01 = +1.068069786618014660462e+00 */
> > > > -        .quad 0xBFC982539E2ED224  /* A02 = -1.992897531869327609755e-01 */
> > > > -        .quad 0xBFBC1B043C350159  /* A03 = -1.097872397413132278254e-01 */
> > > > -        .quad 0xBF8391ACBA863403  /* A00 = -9.555196230190082448686e-03 */
> > > > -        .quad 0x3FF134D4AA477FE2  /* A01 = +1.075398125794884141015e+00 */
> > > > -        .quad 0xBFCB7218609FEAFB  /* A02 = -2.144194099235717521079e-01 */
> > > > -        .quad 0xBFB970A16CB88329  /* A03 = -9.937485603633135211599e-02 */
> > > > -        .quad 0xBF87935088E48E8B  /* A00 = -1.151144902957603431692e-02 */
> > > > -        .quad 0x3FF1649892AD7DD3  /* A01 = +1.087059567413110938716e+00 */
> > > > -        .quad 0xBFCE6971DDE75409  /* A02 = -2.375929196847723912089e-01 */
> > > > -        .quad 0xBFB58291E88CB251  /* A03 = -8.402358939628952472223e-02 */
> > > > -        .quad 0xBF8DB3A62C325325  /* A00 = -1.450280973794233242702e-02 */
> > > > -        .quad 0x3FF1A9C900C6DEEA  /* A01 = +1.103951457056548068891e+00 */
> > > > -        .quad 0xBFD13DBC65B0E08E  /* A02 = -2.693930619311765140012e-01 */
> > > > -        .quad 0xBFB06696F62696D1  /* A03 = -6.406539449252625362252e-02 */
> > > > -        .quad 0xBF92583699F2E27A  /* A00 = -1.791463198307716858659e-02 */
> > > > -        .quad 0x3FF1F451B85AA9F0  /* A01 = +1.122148246892376022288e+00 */
> > > > -        .quad 0xBFD34FD5F8288180  /* A02 = -3.017477916164565954205e-01 */
> > > > -        .quad 0xBFA6FB692825B683  /* A03 = -4.488686194495718900788e-02 */
> > > > -        .quad 0xBF9641C26E673D6F  /* A00 = -2.173522757385398448959e-02 */
> > > > -        .quad 0x3FF24364DA5E2B07  /* A01 = +1.141453602790251542487e+00 */
> > > > -        .quad 0xBFD564A5A5EF5890  /* A02 = -3.342680092295120530821e-01 */
> > > > -        .quad 0xBF9B43712011A982  /* A03 = -2.662445791467283467968e-02 */
> > > > -        .quad 0xBF9A901038EC2F39  /* A00 = -2.594018313816024226548e-02 */
> > > > -        .quad 0x3FF2961356DFFEBA  /* A01 = +1.161639537196534011088e+00 */
> > > > -        .quad 0xBFD775EBB17198C7  /* A02 = -3.665723069046972759644e-01 */
> > > > -        .quad 0xBF833B1A926CD462  /* A03 = -9.390075295963199591975e-03 */
> > > > -        .quad 0xBF9F396A6A461B91  /* A00 = -3.049246095317987084727e-02 */
> > > > -        .quad 0x3FF2EB53BAEF534B  /* A01 = +1.182452898229899629357e+00 */
> > > > -        .quad 0xBFD97DABF8AD8BBD  /* A02 = -3.982953957076310058660e-01 */
> > > > -        .quad 0x3F7B8F6A3E0F8837  /* A03 = +6.728568086119371925713e-03 */
> > > > -        .quad 0xBFA21878590F8BAA  /* A00 = -3.534294211546946951064e-02 */
> > > > -        .quad 0x3FF34209790236E1  /* A01 = +1.203622315111197105253e+00 */
> > > > -        .quad 0xBFDB764C0E71BECB  /* A02 = -4.290952817018306997277e-01 */
> > > > -        .quad 0x3F962FE0C03F84C0  /* A03 = +2.166701482190513949888e-02 */
> > > > -        .quad 0xBFA4B36B9AD27ECC  /* A00 = -4.043136849327097492868e-02 */
> > > > -        .quad 0x3FF3990C5B12FC16  /* A01 = +1.224865298994477935679e+00 */
> > > > -        .quad 0xBFDD5AABB0D01390  /* A02 = -4.586590983092770912322e-01 */
> > > > -        .quad 0x3FA21DAF5CA162DB  /* A03 = +3.538272863142363083844e-02 */
> > > > -        .quad 0xBFA7645E4D7BF28B  /* A00 = -4.568762489177399105378e-02 */
> > > > -        .quad 0x3FF3EF2FD51C0D9F  /* A01 = +1.245895225962932562069e+00 */
> > > > -        .quad 0xBFDF26377E1B686E  /* A02 = -4.867075664057044503963e-01 */
> > > > -        .quad 0x3FA8803E756EE812  /* A03 = +4.785342391501513914509e-02 */
> > > > -        .quad 0xBFAA210925C64413  /* A00 = -5.103329263796054643398e-02 */
> > > > -        .quad 0x3FF44349F897D8E7  /* A01 = +1.266427966181760345066e+00 */
> > > > -        .quad 0xBFE06A7B02C6D8E2  /* A02 = -5.129981092675530707226e-01 */
> > > > -        .quad 0x3FAE3F194734F5D0  /* A03 = +5.907515520309980505687e-02 */
> > > > -        .quad 0xBFACDE48F8A19BBB  /* A00 = -5.638340029764018351832e-02 */
> > > > -        .quad 0x3FF49439D5466582  /* A01 = +1.286187966447272845727e+00 */
> > > > -        .quad 0xBFE131C7C1063DDC  /* A02 = -5.373266954429101183166e-01 */
> > > > -        .quad 0x3FB1ADEEC36AD805  /* A03 = +6.906025191241844940482e-02 */
> > > > -        .quad 0xBFAF905D8F585680  /* A00 = -6.164829611604449866036e-02 */
> > > > -        .quad 0x3FF4E0ED1FD27F99  /* A01 = +1.304913639360142818546e+00 */
> > > > -        .quad 0xBFE1E7A859DC1D3D  /* A02 = -5.595285182070380836095e-01 */
> > > > -        .quad 0x3FB3ED018E4642A1  /* A03 = +7.783517573831001679086e-02 */
> > > > -        .quad 0xBFB11595104160BA  /* A00 = -6.673556944713512906198e-02 */
> > > > -        .quad 0x3FF528650340490B  /* A01 = +1.322361958217302513319e+00 */
> > > > -        .quad 0xBFE28B14B40BC974  /* A02 = -5.794776455425521000109e-01 */
> > > > -        .quad 0x3FB5DF49F5BAF6D7  /* A03 = +8.543836831355676453281e-02 */
> > > > -        .quad 0xBFB2513A97344BA4  /* A00 = -7.155195418844911836587e-02 */
> > > > -        .quad 0x3FF569BA0DB5EE14  /* A01 = +1.338312200124055273420e+00 */
> > > > -        .quad 0xBFE31B53A8B67B20  /* A02 = -5.970857901737396389308e-01 */
> > > > -        .quad 0x3FB787F297BB0544  /* A03 = +9.191814617499455275507e-02 */
> > > > -        .quad 0xBFB37512E848FAFA  /* A00 = -7.600515528700305112331e-02 */
> > > > -        .quad 0x3FF5A41F33B403C8  /* A01 = +1.352568819013173495591e+00 */
> > > > -        .quad 0xBFE397F6EA9A58A5  /* A02 = -6.123003561103997904880e-01 */
> > > > -        .quad 0x3FB8EAA9FF25CA06  /* A03 = +9.733068923177520814782e-02 */
> > > > -        .quad 0xBFB47B3E603AFC5D  /* A00 = -8.000554894805263217439e-02 */
> > > > -        .quad 0x3FF5D6E3EDE40487  /* A01 = +1.364963464031718975988e+00 */
> > > > -        .quad 0xBFE400D5BCA6D631  /* A02 = -6.251019177058819709103e-01 */
> > > > -        .quad 0x3FBA0B830ED567FE  /* A03 = +1.017381583418739132707e-01 */
> > > > -        .quad 0xBFB5BBFE8AC90496  /* A00 = -8.489981544791400103200e-02 */
> > > > -        .quad 0x3FF612BA70107E95  /* A01 = +1.379572332145390989311e+00 */
> > > > -        .quad 0xBFE477EAF1FA7693  /* A02 = -6.396383978023599814478e-01 */
> > > > -        .quad 0x3FBB4784B7C08A95  /* A03 = +1.065600346196709652391e-01 */
> > > > -        .quad 0xBFB6D5D940743939  /* A00 = -8.920057128509463473254e-02 */
> > > > -        .quad 0x3FF644A8748F70CE  /* A01 = +1.391762214006166953340e+00 */
> > > > -        .quad 0xBFE4D646AB07EA37  /* A02 = -6.511567440459832267763e-01 */
> > > > -        .quad 0x3FBC354F4E1D5292  /* A03 = +1.101884427747086558913e-01 */
> > > > -        .quad 0xBFB7223D19E4F3D1  /* A00 = -9.036619074045339206069e-02 */
> > > > -        .quad 0x3FF6518FEB42B7FA  /* A01 = +1.394912642466350494175e+00 */
> > > > -        .quad 0xBFE4ED86CB87498C  /* A02 = -6.539949393430091184598e-01 */
> > > > -        .quad 0x3FBC6D29F28CCA9B  /* A03 = +1.110407082713131127205e-01 */
> > > > -        .quad 0xBFB6878652FF6312  /* A00 = -8.800544287022329936754e-02 */
> > > > -        .quad 0x3FF63948C302D040  /* A01 = +1.388985406648330922508e+00 */
> > > > -        .quad 0xBFE4C4E2E7904E17  /* A02 = -6.490339777687407218920e-01 */
> > > > -        .quad 0x3FBC127356CA1ABE  /* A03 = +1.096565329445224612481e-01 */
> > > > -        .quad 0xBFB4F5D18B0C91D6  /* A00 = -8.187589306596207427980e-02 */
> > > > -        .quad 0x3FF5FD27EB7DD0B8  /* A01 = +1.374305648697413673176e+00 */
> > > > -        .quad 0xBFE464E01A2B2FC6  /* A02 = -6.373138915164353601739e-01 */
> > > > -        .quad 0x3FBB460547674A30  /* A03 = +1.065371798825160976065e-01 */
> > > > -        .quad 0xBFB26642FA16A685  /* A00 = -7.187288861919156890412e-02 */
> > > > -        .quad 0x3FF59F9BEDE1C95A  /* A01 = +1.351467065073470141812e+00 */
> > > > -        .quad 0xBFE3D67920C8FBEA  /* A02 = -6.199308052381387046381e-01 */
> > > > -        .quad 0x3FBA24F6A8D3CBC1  /* A03 = +1.021265184570401413078e-01 */
> > > > -        .quad 0xBFADB5294794F097  /* A00 = -5.802277563859197656582e-02 */
> > > > -        .quad 0x3FF523EA7B9CF453  /* A01 = +1.321268542159732772845e+00 */
> > > > -        .quad 0xBFE322A8B55E35DB  /* A02 = -5.979808370918208160205e-01 */
> > > > -        .quad 0x3FB8C8673B1B3E37  /* A03 = +9.680791085269722928697e-02 */
> > > > -        .quad 0xBFA4B7D661965C6A  /* A00 = -4.046506825687219699450e-02 */
> > > > -        .quad 0x3FF48DE3E2CE3122  /* A01 = +1.284641157110919085227e+00 */
> > > > -        .quad 0xBFE251FED1A7F445  /* A02 = -5.725092024655472622285e-01 */
> > > > -        .quad 0x3FB745699FCABDB9  /* A03 = +9.090290213747821701507e-02 */
> > > > -        .quad 0xBF93E60456E4EE1D  /* A00 = -1.943213253365004902773e-02 */
> > > > -        .quad 0x3FF3E1A14E628A59  /* A01 = +1.242585474196536532432e+00 */
> > > > -        .quad 0xBFE16C5AB660E876  /* A02 = -5.444768488007543094653e-01 */
> > > > -        .quad 0x3FB5AD33AA8C188F  /* A03 = +8.467410005332197397987e-02 */
> > > > -        .quad 0x3F738C17C47C7961  /* A00 = +4.772274820224659853951e-03 */
> > > > -        .quad 0x3FF3234DDE3BD146  /* A01 = +1.196119182682268355933e+00 */
> > > > -        .quad 0xBFE078C0D77A9D3B  /* A02 = -5.147403915952176722826e-01 */
> > > > -        .quad 0x3FB40D74B3E276B8  /* A03 = +7.833032027925923568290e-02 */
> > > > -        .quad 0x3FA0474BECC689C7  /* A00 = +3.179394975019849550746e-02 */
> > > > -        .quad 0x3FF256FB4FA7D18A  /* A01 = +1.146235762743432307076e+00 */
> > > > -        .quad 0xBFDEFA8E3FB285E2  /* A02 = -4.840427038235174395098e-01 */
> > > > -        .quad 0x3FB270C007493D59  /* A03 = +7.203293016322244446403e-02 */
> > > > -        .quad 0x3FAF5BD51E479BDC  /* A00 = +6.124750132203590768931e-02 */
> > > > -        .quad 0x3FF18081D0B53BC5  /* A01 = +1.093873801484492647162e+00 */
> > > > -        .quad 0xBFDCFE2439BD0C03  /* A02 = -4.530115665294831006626e-01 */
> > > > -        .quad 0x3FB0DEFE5A45AFDD  /* A03 = +6.590261176978580437424e-02 */
> > > > -        .quad 0x3FB7BD5D2806EA26  /* A00 = +9.273321368429118805032e-02 */
> > > > -        .quad 0x3FF0A369E35B4440  /* A01 = +1.039895904647224256223e+00 */
> > > > -        .quad 0xBFDB04BC5C9951E7  /* A02 = -4.221640495573226181669e-01 */
> > > > -        .quad 0x3FAEBBBAA9D6DEEF  /* A03 = +6.002600978120919278380e-02 */
> > > > -        .quad 0x3FC01BE411098DBC  /* A00 = +1.258511622610124502941e-01 */
> > > > -        .quad 0x3FEF85BDABC031C1  /* A01 = +9.850757936961188621083e-01 */
> > > > -        .quad 0xBFD91521375097C2  /* A02 = -3.919146576102968682065e-01 */
> > > > -        .quad 0x3FABE26F0086D982  /* A03 = +5.446192628317005068883e-02 */
> > > > -        .quad 0x3FC481D7FF5776B9  /* A00 = +1.602125164781023347604e-01 */
> > > > -        .quad 0x3FEDC3506C1E7218  /* A01 = +9.300920592973538347792e-01 */
> > > > -        .quad 0xBFD7349A88DA7D4F  /* A02 = -3.625856720409119104964e-01 */
> > > > -        .quad 0x3FA936E2DFF8E2AE  /* A03 = +4.924687370334389358018e-02 */
> > > > -        .quad 0x3FC90471F96FA27A  /* A00 = +1.954481571149420671141e-01 */
> > > > -        .quad 0x3FEC0451601987A2  /* A01 = +8.755270840595026360376e-01 */
> > > > -        .quad 0xBFD5671CD4B898DC  /* A02 = -3.344184949259110251063e-01 */
> > > > -        .quad 0x3FA6BB9594603B67  /* A03 = +4.439990459660841243261e-02 */
> > > > -        .quad 0x3FCFD8ADB9ED944C  /* A00 = +2.488000066615846384011e-01 */
> > > > -        .quad 0x3FE978C073F6809A  /* A01 = +7.959902062321078108909e-01 */
> > > > -        .quad 0xBFD2DF7E00BCD5A9  /* A02 = -2.948908812716931060471e-01 */
> > > > -        .quad 0x3FA3614033D490B2  /* A03 = +3.785133965200894456959e-02 */
> > > > -        .quad 0x3FD4846A12AFE5A0  /* A00 = +3.205819303981005674586e-01 */
> > > > -        .quad 0x3FE63A1147D40472  /* A01 = +6.945883181471244061100e-01 */
> > > > -        .quad 0xBFCFA2268AD34450  /* A02 = -2.471359422548027318101e-01 */
> > > > -        .quad 0x3F9F150201D9FFE0  /* A03 = +3.035357605267552383310e-02 */
> > > > -        .quad 0x3FD9018641F82BEB  /* A00 = +3.907180446846598154131e-01 */
> > > > -        .quad 0x3FE33B7C220FFBDC  /* A01 = +6.010113396913498995389e-01 */
> > > > -        .quad 0xBFCA4E4187E29C86  /* A02 = -2.055131829740483584423e-01 */
> > > > -        .quad 0x3F98C30CED19F8F4  /* A03 = +2.418155858185229434287e-02 */
> > > > -        .quad 0x3FDD4B8255BEB078  /* A00 = +4.577337109901757905561e-01 */
> > > > -        .quad 0x3FE0858B19D3A49B  /* A01 = +5.163016800335243905451e-01 */
> > > > -        .quad 0xBFC5BC929EACE564  /* A02 = -1.698172831327539045176e-01 */
> > > > -        .quad 0x3F93A083CE57DE2B  /* A03 = +1.916700312537337677621e-02 */
> > > > -        .quad 0x3FE0A8E5E039295C  /* A00 = +5.206174258576470315063e-01 */
> > > > -        .quad 0x3FDC35E1234583FE  /* A01 = +4.407885403107342225937e-01 */
> > > > -        .quad 0xBFC1DE034E31AEB9  /* A02 = -1.395877963835710222629e-01 */
> > > > -        .quad 0x3F8EFDEBB3471BDC  /* A03 = +1.513275280821162888101e-02 */
> > > > -        .quad 0x3FE2851B603CB2A5  /* A00 = +5.787484054213406503564e-01 */
> > > > -        .quad 0x3FD7F4A44ABBB286  /* A01 = +3.743067483726821853551e-01 */
> > > > -        .quad 0xBFBD3EEB67087DE7  /* A02 = -1.142413260026767657385e-01 */
> > > > -        .quad 0x3F8864F38329E8BD  /* A03 = +1.191129917173260922836e-02 */
> > > > -        .quad 0x3FE437DBE3C34AC1  /* A00 = +6.318187187665317283702e-01 */
> > > > -        .quad 0x3FD43F6F789441B5  /* A01 = +3.163717916040938438194e-01 */
> > > > -        .quad 0xBFB7D92E7901B9A4  /* A02 = -9.315767721429907277653e-02 */
> > > > -        .quad 0x3F8327ED342308E1  /* A03 = +9.353497651663324544136e-03 */
> > > > -        .quad 0x3FE5C0977766D55C  /* A00 = +6.797597248138731451661e-01 */
> > > > -        .quad 0x3FD10B42A764D8F9  /* A01 = +2.663122782427219115142e-01 */
> > > > -        .quad 0xBFB3633351D3D70F  /* A02 = -7.573242900602060456716e-02 */
> > > > -        .quad 0x3F7E079E30FF899C  /* A03 = +7.331483779099558922843e-03 */
> > > > -        .quad 0x3FE7202CE08A88C4  /* A00 = +7.226776490754436288455e-01 */
> > > > -        .quad 0x3FCC973EB5662B01  /* A01 = +2.233656297433626314319e-01 */
> > > > -        .quad 0xBFAF70A455F9920B  /* A02 = -6.140626477716545211782e-02 */
> > > > -        .quad 0x3F77812411CE99B6  /* A03 = +5.738392731393584730859e-03 */
> > > > -        .quad 0x3FE85879424095B1  /* A00 = +7.608000082006382003286e-01 */
> > > > -        .quad 0x3FC7E73BD1674D84  /* A01 = +1.867441914060742336190e-01 */
> > > > -        .quad 0xBFA96F84E4BF333B  /* A02 = -4.967894832916504993525e-02 */
> > > > -        .quad 0x3F72606DDCA6E117  /* A03 = +4.486493251924870105662e-03 */
> > > > -        .quad 0x3FE96BFE4957F4DD  /* A00 = +7.944327766887472330737e-01 */
> > > > -        .quad 0x3FC3ED4780D25478  /* A01 = +1.556786898624158421711e-01 */
> > > > -        .quad 0xBFA489C5F9A56B58  /* A02 = -4.011362717093075458408e-02 */
> > > > -        .quad 0x3F6CB5DC17E9AD2A  /* A03 = +3.504686231556104931972e-03 */
> > > > -        .quad 0x3FEA5D9CB2F41234  /* A00 = +8.239272589858672724006e-01 */
> > > > -        .quad 0x3FC091A758374DCF  /* A01 = +1.294449978582705440555e-01 */
> > > > -        .quad 0xBFA08E436D4B5CE0  /* A02 = -3.233538350257858517978e-02 */
> > > > -        .quad 0x3F666997AD53E6B7  /* A03 = +2.735897297154145629133e-03 */
> > > > -        .quad 0x3FEB3060342CB850  /* A00 = +8.496552485501158713532e-01 */
> > > > -        .quad 0x3FBB7D30BBC7DC1B  /* A01 = +1.073790033768634993860e-01 */
> > > > -        .quad 0xBF9AA6BA3443D9E3  /* A02 = -2.602663940430173170060e-02 */
> > > > -        .quad 0x3F617CA764B7850B  /* A03 = +2.134634914668814050648e-03 */
> > > > -        .quad 0x3FEBE759A6A0C7B8  /* A00 = +8.719909910635044170135e-01 */
> > > > -        .quad 0x3FB6C10DE6A703FF  /* A01 = +8.888327485239243264115e-02 */
> > > > -        .quad 0xBF956C566D8BE1F6  /* A02 = -2.092108768099084498138e-02 */
> > > > -        .quad 0x3F5B46D1A4A59CF8  /* A03 = +1.664833764687232917079e-03 */
> > > > -        .quad 0x3FEC858494887A04  /* A00 = +8.912985707318630268503e-01 */
> > > > -        .quad 0x3FB2CC31F543394D  /* A01 = +7.342827070099140762682e-02 */
> > > > -        .quad 0xBF9133477FF69137  /* A02 = -1.679717749142747504343e-02 */
> > > > -        .quad 0x3F5544482FBB4DA5  /* A03 = +1.298017973501022466823e-03 */
> > > > -        .quad 0x3FED0DB59D0E32E9  /* A00 = +9.079235141267335551518e-01 */
> > > > -        .quad 0x3FAF006BAFFC6EF4  /* A01 = +6.055008433597022787787e-02 */
> > > > -        .quad 0xBF8B97146FA2B97A  /* A02 = -1.347175565419144252499e-02 */
> > > > -        .quad 0x3F5093B01F4CDC69  /* A03 = +1.011774057770665211434e-03 */
> > > > -        .quad 0x3FEDB487C3EC457C  /* A00 = +9.282873942012623835751e-01 */
> > > > -        .quad 0x3FA7390C09D0BD1D  /* A01 = +4.535710925881118044112e-02 */
> > > > -        .quad 0xBF83D9F7C3181106  /* A02 = -9.693084374710735778846e-03 */
> > > > -        .quad 0x3F46E34A0A3C0E64  /* A03 = +6.984817050299072134500e-04 */
> > > > -        .quad 0x3FEE5FFCB4E6EB00  /* A00 = +9.492171796076434020506e-01 */
> > > > -        .quad 0x3F9F4913ED00AADF  /* A01 = +3.055220731782070861526e-02 */
> > > > -        .quad 0xBF79670BD0E59B5C  /* A02 = -6.201788097633133961528e-03 */
> > > > -        .quad 0x3F3BC998EBCAF96D  /* A03 = +4.240034429975534616304e-04 */
> > > > -        .quad 0x3FEEDBA41E9542FE  /* A00 = +9.643116566968215064293e-01 */
> > > > -        .quad 0x3F94F5DD18D9C24D  /* A01 = +2.046914543319848858727e-02 */
> > > > -        .quad 0xBF7034896AA122B9  /* A02 = -3.956352980886528904192e-03 */
> > > > -        .quad 0x3F30DCCB47810B39  /* A03 = +2.573009765038273091199e-04 */
> > > > -        .quad 0x3FEF33F2882520ED  /* A00 = +9.750912341196716903724e-01 */
> > > > -        .quad 0x3F8BF37F2CF553FF  /* A01 = +1.364802699996836392315e-02 */
> > > > -        .quad 0xBF649F6F05A69619  /* A02 = -2.517430152880317534986e-03 */
> > > > -        .quad 0x3F247623C950AAC9  /* A03 = +1.561087307505231250044e-04 */
> > > > -        .quad 0x3FEF727757751741  /* A00 = +9.827229221489021115943e-01 */
> > > > -        .quad 0x3F828E67912C4400  /* A01 = +9.060677640748693306705e-03 */
> > > > -        .quad 0xBF5A2F51A806CC2C  /* A02 = -1.598195784123355826789e-03 */
> > > > -        .quad 0x3F18D35D7687E613  /* A03 = +9.470231965016282719549e-05 */
> > > > -        .quad 0x3FEF9E6325C5942A  /* A00 = +9.880843866091073568469e-01 */
> > > > -        .quad 0x3F788AB117618F76  /* A01 = +5.991641772286606867914e-03 */
> > > > -        .quad 0xBF5096EAB0B1EA89  /* A02 = -1.012543859160305046233e-03 */
> > > > -        .quad 0x3F0E1E50EC4435AB  /* A03 = +5.744633156910412119652e-05 */
> > > > -        .quad 0x3FEFBD0784049369  /* A00 = +9.918248728250605994461e-01 */
> > > > -        .quad 0x3F702BBD8294035F  /* A01 = +3.947963975634432264028e-03 */
> > > > -        .quad 0xBF44FB55E0F00593  /* A02 = -6.403130845457509273330e-04 */
> > > > -        .quad 0x3F0244DCD723230A  /* A03 = +3.484534217219031730379e-05 */
> > > > -        .quad 0x3FEFD245E2366A43  /* A00 = +9.944180887426415926811e-01 */
> > > > -        .quad 0x3F653D82EC088433  /* A01 = +2.592807490387838333795e-03 */
> > > > -        .quad 0xBF3A7DF75E013CB8  /* A02 = -4.042366908878036561859e-04 */
> > > > -        .quad 0x3EF6298E69F991CD  /* A03 = +2.113564425911141559972e-05 */
> > > > -        .quad 0x3FEFE0EAA508BC69  /* A00 = +9.962056372950317539861e-01 */
> > > > -        .quad 0x3F5BD0771AF3FDDA  /* A01 = +1.697651208644282514598e-03 */
> > > > -        .quad 0xBF30B2E1254DE571  /* A02 = -2.548026725928887099328e-04 */
> > > > -        .quad 0x3EEAE28B70EC0256  /* A03 = +1.281973848454955042307e-05 */
> > > > -        .quad 0x3FEFEAF5303D7F96  /* A00 = +9.974313680831865536192e-01 */
> > > > -        .quad 0x3F5229111365657E  /* A01 = +1.108423877289460134782e-03 */
> > > > -        .quad 0xBF250572D04DFE66  /* A02 = -1.603796628408704519168e-04 */
> > > > -        .quad 0x3EE04E89BB57C981  /* A03 = +7.775682983689149966743e-06 */
> > > > -        .quad 0x3FEFF1CF52F1CF44  /* A00 = +9.982678051005469122003e-01 */
> > > > -        .quad 0x3F47A71316147CEB  /* A01 = +7.218211359577819110842e-04 */
> > > > -        .quad 0xBF1A6D7604055719  /* A02 = -1.008132248946049582547e-04 */
> > > > -        .quad 0x3ED3C8047586A85C  /* A03 = +4.716233739913014633626e-06 */
> > > > -        .quad 0x3FEFF6770369EF69  /* A00 = +9.988360468555416149528e-01 */
> > > > -        .quad 0x3F3EBB261180FBF0  /* A01 = +4.689186039321105101130e-04 */
> > > > -        .quad 0xBF1097754FE19D7F  /* A02 = -6.329206004950480057066e-05 */
> > > > -        .quad 0x3EC7FEFF83BCA0A7  /* A03 = +2.860556404988488738366e-06 */
> > > > -        .quad 0x3FEFF99D42371AC4  /* A00 = +9.992204945818561334647e-01 */
> > > > -        .quad 0x3F33EB2AEC271F59  /* A01 = +3.039340773764907474054e-04 */
> > > > -        .quad 0xBF04CF18E0FC0D79  /* A02 = -3.968996690952969588805e-05 */
> > > > -        .quad 0x3EBD1BDBD6019BE9  /* A03 = +1.735021065507727833886e-06 */
> > > > -        .quad 0x3FEFFBBCA32B0D91  /* A00 = +9.994795977476532700123e-01 */
> > > > -        .quad 0x3F29C41E1615110A  /* A01 = +1.965796209707565346710e-04 */
> > > > -        .quad 0xBEFA11F93D9DCB5A  /* A02 = -2.486248909101414873235e-05 */
> > > > -        .quad 0x3EB1A7CA4546F7A7  /* A03 = +1.052345642723709228769e-06 */
> > > > -        .quad 0x3FEFFD298B8E8DE2  /* A00 = +9.996535993308806045121e-01 */
> > > > -        .quad 0x3F20A1C42D523C5B  /* A01 = +1.268913244172078754520e-04 */
> > > > -        .quad 0xBEF0507A364AFAE4  /* A02 = -1.555859070622834605755e-05 */
> > > > -        .quad 0x3EA56ACA17E7CDF4  /* A03 = +6.382806956848098872313e-07 */
> > > > -        .quad 0x3FEFFE1DC82BA5A3  /* A00 = +9.997700604991915929176e-01 */
> > > > -        .quad 0x3F156E73B90F1769  /* A01 = +8.175450626798714452801e-05 */
> > > > -        .quad 0xBEE4663579D0A09F  /* A02 = -9.727122057226747625365e-06 */
> > > > -        .quad 0x3E99FAF6FEC5D4C1  /* A03 = +3.871371052824002996020e-07 */
> > > > -        .quad 0x3FEFFEF8D0BB5E81  /* A00 = +9.998745037837154514548e-01 */
> > > > -        .quad 0x3F06686DA18D39C3  /* A01 = +4.273972098777251447726e-05 */
> > > > -        .quad 0xBED46BC298073E90  /* A02 = -4.868731025855742842491e-06 */
> > > > -        .quad 0x3E88E42286B9D0FD  /* A03 = +1.854535328530838170114e-07 */
> > > > -        .quad 0x3FEFFF8DBC68DDC7  /* A00 = +9.999455146670975791423e-01 */
> > > > -        .quad 0x3EF26B2953A80AF0  /* A01 = +1.756534514108903368909e-05 */
> > > > -        .quad 0xBEBFC4472D580F83  /* A02 = -1.893443529411295465239e-06 */
> > > > -        .quad 0x3E72505B4553D19F  /* A03 = +6.822456673547912277047e-08 */
> > > > -        .quad 0x3FEFFFCED1276609  /* A00 = +9.999765477215883935358e-01 */
> > > > -        .quad 0x3EDE1A94C7CC58F5  /* A01 = +7.177313020153979672606e-06 */
> > > > -        .quad 0xBEA8A2C988744E57  /* A02 = -7.342066660497443762363e-07 */
> > > > -        .quad 0x3E5AF30036BBBAF4  /* A03 = +2.509841882843541084885e-08 */
> > > > -        .quad 0x3FEFFFEAFE70FCFC  /* A00 = +9.999899835164849370983e-01 */
> > > > -        .quad 0x3EC879175E3549F5  /* A01 = +2.917410471128503564412e-06 */
> > > > -        .quad 0xBE930E36677D1813  /* A02 = -2.839493400307523115929e-07 */
> > > > -        .quad 0x3E43D4005B42D48F  /* A03 = +9.233192745401904898013e-09 */
> > > > -        .quad 0x3ff0000000000000
> > > > -        .quad 0x0000000000000000
> > > > -        .quad 0x0000000000000000
> > > > -        .quad 0x0000000000000000
> > > > -        .align 16
> > > > -        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000           /* _sSignMask        */
> > > > -        .align 16
> > > > -        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff           /* _sAbsMask         */
> > > > -        .align 16
> > > > -        .long 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000           /* _iExpMantMask     */
> > > > -        .align 16
> > > > -        .long 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000           /* _iExpMask         */
> > > > -        .align 16
> > > > -        .long 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000           /* _iMinIdxOfsMask   */
> > > > -        .align 16
> > > > -        .long 0x04280000, 0x04280000, 0x04280000, 0x04280000           /* _iMaxIdxMask      */
> > > > -        .align 16
> > > > -        .type  __svml_stanh_data_internal,@object
> > > > -        .size  __svml_stanh_data_internal,.-__svml_stanh_data_internal
> > > > +       movups  (%rsi, %rax), %xmm2
> > > > +       movups  (%rdi, %rax), %xmm7
> > > > +
> > > > +       movaps  %xmm2, %xmm3
> > > > +
> > > > +       unpckhpd %xmm7, %xmm2
> > > > +       movlhps %xmm7, %xmm3
> > > > +
> > > > +       addpd   %xmm13, %xmm2
> > > > +
> > > > +       mulpd   %xmm5, %xmm6
> > > > +       addpd   %xmm4, %xmm6
> > > > +
> > > > +       mulpd   %xmm2, %xmm0
> > > > +       addpd   %xmm3, %xmm0
> > > > +
> > > > +       cvtpd2ps %xmm0, %xmm2
> > > > +       cvtpd2ps %xmm6, %xmm0
> > > > +
> > > > +       movlhps %xmm2, %xmm0
> > > > +       andnps  %xmm12, %xmm1
> > > > +       orps    %xmm1, %xmm0
> > > > +
> > > > +       movmskps %xmm8, %edx
> > > > +       testl   %edx, %edx
> > > > +
> > > > +       /* Go to special inputs processing branch.  */
> > > > +       jne     L(SPECIAL_VALUES_BRANCH)
> > > > +
> > > > +       /* No stack restoration on the fastpath.  */
> > > > +       ret
> > > > +
> > > > +L(SPECIAL_VALUES_BRANCH):
> > > > +       subq    $48, %rsp
> > > > +
> > > > +       movups  %xmm0, (%rsp)
> > > > +       movups  %xmm12, 16(%rsp)
> > > > +
> > > > +       movq    %r12, 32(%rsp)
> > > > +       movq    %r13, 40(%rsp)
> > > > +
> > > > +       /* edx has 1s where there was a special value that needs to be handled
> > > > +          by a tanhf call.  */
> > > > +       movl    %edx, %r13d
> > > > +L(SPECIAL_VALUES_LOOP):
> > > > +       /* use r12 as index for special value that is saved across calls to
> > > > +          tanhf. We technically don't need a callee save register here as offset
> > > > +          to rsp is always [0, 12] so we can restore rsp by realigning to 64.
> > > > +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> > > > +          in the loop.  */
> > > > +       xorl    %r12d, %r12d
> > > > +       bsfl    %r13d, %r12d
> > > > +
> > > > +       /* Scalar math function call to process special input.  */
> > > > +       movss   16(%rsp, %r12, 4), %xmm0
> > > > +       call    tanhf@PLT
> > > > +       /* No good way to avoid the store-forwarding fault this will cause on
> > > > +          return. `lfence` avoids the SF fault but at greater cost as it
> > > > +          serializes stack/callee save restoration.  */
> > > > +       movss   %xmm0, (%rsp, %r12, 4)
> > > > +
> > > > +       leal    -1(%r13), %eax
> > > > +       andl    %eax, %r13d
> > > > +       jnz     L(SPECIAL_VALUES_LOOP)
> > > > +
> > > > +       /* All results have been written to (%rsp).  */
> > > > +       movups  (%rsp), %xmm0
> > > > +       movq    32(%rsp), %r12
> > > > +       movq    40(%rsp), %r13
> > > > +       addq    $48, %rsp
> > > > +       ret
> > > > +END(_ZGVbN4v_tanhf_sse4)
> > > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
> > > > index 3745db5aa4..90c3ea4cc6 100644
> > > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
> > > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
> > > > @@ -70,775 +70,171 @@
> > > >   *
> > > >   */
> > > >
> > > > -/* Offsets for data table __svml_stanh_data_internal
> > > > - */
> > > > -#define _dbP                           0
> > > > -#define _sSignMask                     4288
> > > > -#define _sAbsMask                      4320
> > > > -#define _iExpMantMask                  4352
> > > > -#define _iExpMask                      4384
> > > > -#define _iMinIdxOfsMask                4416
> > > > -#define _iMaxIdxMask                   4448
> > > > -
> > > >  #include <sysdep.h>
> > > > +#include "svml_s_tanhf_rodata.S"
> > > >
> > > >          .text
> > > >         .section .text.avx2,"ax",@progbits
> > > >  ENTRY(_ZGVdN8v_tanhf_avx2)
> > > > -        pushq     %rbp
> > > > -        cfi_def_cfa_offset(16)
> > > > -        movq      %rsp, %rbp
> > > > -        cfi_def_cfa(6, 16)
> > > > -        cfi_offset(6, -16)
> > > > -        andq      $-32, %rsp
> > > > -        pushq     %r12
> > > > -        subq      $120, %rsp
> > > > -        lea       _dbP+16+__svml_stanh_data_internal(%rip), %r10
> > > > -        vmovaps   %ymm0, %ymm12
> > > > -
> > > > -/* Here huge arguments, INF and NaNs are filtered out to callout. */
> > > > -        vpand     _iExpMantMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm14
> > > > +       /* Here huge arguments, INF and NaNs are filtered out to callout.  */
> > > > +       vpand   TANHF_DATA(_iExpMantMask)(%rip), %ymm0, %ymm4
> > > > +       vpsubd  TANHF_DATA(_iMinIdxOfsMask)(%rip), %ymm4, %ymm2
> > > >
> > > > -/*
> > > > - *  small table specific variables *
> > > > - *  Constant loading
> > > > - */
> > > > -        vmovups   _iMaxIdxMask+__svml_stanh_data_internal(%rip), %ymm8
> > > > -        vpsubd    _iMinIdxOfsMask+__svml_stanh_data_internal(%rip), %ymm14, %ymm9
> > > > -
> > > > -/* if VMIN, VMAX is defined for I type */
> > > > -        vxorps    %ymm15, %ymm15, %ymm15
> > > > -        vpcmpgtd  %ymm15, %ymm9, %ymm0
> > > > -        vpand     %ymm0, %ymm9, %ymm7
> > > > -        vpcmpgtd  %ymm8, %ymm9, %ymm6
> > > > -        vblendvps %ymm6, %ymm8, %ymm7, %ymm3
> > > > -        vpsrld    $14, %ymm3, %ymm1
> > > > -        vpcmpgtd  _iExpMask+__svml_stanh_data_internal(%rip), %ymm14, %ymm13
> > > > -        vmovmskps %ymm13, %r11d
> > > > -        vandps    _sAbsMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm10
> > > > -        vandps    _sSignMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm11
> > > > -        vextractf128 $1, %ymm1, %xmm2
> > > > -        vmovd     %xmm1, %r9d
> > > > -        vmovd     %xmm2, %ecx
> > > > -        vpextrd   $1, %xmm2, %edx
> > > > -        vpextrd   $1, %xmm1, %r8d
> > > > -        movslq    %r9d, %r9
> > > > -        movslq    %edx, %rdx
> > > > -        movslq    %r8d, %r8
> > > > -        vpextrd   $2, %xmm1, %edi
> > > > -        movslq    %ecx, %rcx
> > > > -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus)  */
> > > > -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
> > > > -        vpextrd   $3, %xmm2, %r12d
> > > > -        vpextrd   $3, %xmm1, %esi
> > > > -        vpextrd   $2, %xmm2, %eax
> > > > -        movslq    %edi, %rdi
> > > > -        movslq    %r12d, %r12
> > > > -        movslq    %esi, %rsi
> > > > -        movslq    %eax, %rax
> > > > -        vmovupd   -16(%r9,%r10), %xmm5
> > > > -        vmovupd   -16(%rdx,%r10), %xmm14
> > > > -        vmovupd   -16(%rcx,%r10), %xmm13
> > > > -        vmovupd   (%r9,%r10), %xmm1
> > > > -        vmovupd   (%r8,%r10), %xmm2
> > > > -        vmovupd   -16(%r8,%r10), %xmm4
> > > > -        vinsertf128 $1, -16(%rdi,%r10), %ymm5, %ymm15
> > > > -        vinsertf128 $1, -16(%r12,%r10), %ymm14, %ymm3
> > > > -        vinsertf128 $1, -16(%rax,%r10), %ymm13, %ymm6
> > > > -        vinsertf128 $1, (%rdi,%r10), %ymm1, %ymm5
> > > > -        vinsertf128 $1, (%rsi,%r10), %ymm2, %ymm14
> > > > -        vunpcklpd %ymm3, %ymm6, %ymm8
> > > > -        vunpckhpd %ymm3, %ymm6, %ymm6
> > > > -        vunpcklpd %ymm14, %ymm5, %ymm3
> > > > -        vunpckhpd %ymm14, %ymm5, %ymm2
> > > > -        vmovupd   (%rcx,%r10), %xmm13
> > > > -        vcvtps2pd %xmm10, %ymm5
> > > > -        vextractf128 $1, %ymm10, %xmm10
> > > > -        vfmadd213pd %ymm3, %ymm5, %ymm2
> > > > -        vinsertf128 $1, -16(%rsi,%r10), %ymm4, %ymm0
> > > > -        vmovupd   (%rdx,%r10), %xmm4
> > > > -        vunpcklpd %ymm0, %ymm15, %ymm9
> > > > -        vunpckhpd %ymm0, %ymm15, %ymm7
> > > > -        vfmadd213pd %ymm7, %ymm5, %ymm2
> > > > -        vfmadd213pd %ymm9, %ymm5, %ymm2
> > > > -        vinsertf128 $1, (%r12,%r10), %ymm4, %ymm0
> > > > -        vcvtps2pd %xmm10, %ymm4
> > > > -        vinsertf128 $1, (%rax,%r10), %ymm13, %ymm15
> > > > -        vunpcklpd %ymm0, %ymm15, %ymm1
> > > > -        vunpckhpd %ymm0, %ymm15, %ymm0
> > > > -        vfmadd213pd %ymm1, %ymm4, %ymm0
> > > > -        vcvtpd2ps %ymm2, %xmm1
> > > > -        vfmadd213pd %ymm6, %ymm4, %ymm0
> > > > -        vfmadd213pd %ymm8, %ymm4, %ymm0
> > > > -        vcvtpd2ps %ymm0, %xmm0
> > > > -        vinsertf128 $1, %xmm0, %ymm1, %ymm2
> > > > -        vorps     %ymm11, %ymm2, %ymm0
> > > > -        testl     %r11d, %r11d
> > > > -
> > > > -/* Go to special inputs processing branch */
> > > > -        jne       L(SPECIAL_VALUES_BRANCH)
> > > > -                                # LOE rbx r13 r14 r15 r11d ymm0 ymm12
> > > > -
> > > > -/* Restore registers
> > > > - * and exit the function
> > > > - */
> > > > +       /* Selection of arguments between [0, 0x04280000] into ymm2.  */
> > > > +       vpxor   %ymm3, %ymm3, %ymm3
> > > > +       vpmaxsd %ymm3, %ymm2, %ymm2
> > > > +       vpminsd TANHF_DATA(_iMaxIdxMask)(%rip), %ymm2, %ymm2
> > > >
> > > > -L(EXIT):
> > > > -        addq      $120, %rsp
> > > > -        cfi_restore(12)
> > > > -        popq      %r12
> > > > -        movq      %rbp, %rsp
> > > > -        popq      %rbp
> > > > -        cfi_def_cfa(7, 8)
> > > > -        cfi_restore(6)
> > > > -        ret
> > > > -        cfi_def_cfa(6, 16)
> > > > -        cfi_offset(6, -16)
> > > > -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus)  */
> > > > -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
> > > > -
> > > > -/* Branch to process
> > > > - * special inputs
> > > > - */
> > > > +       vpsrld  $14, %ymm2, %ymm1
> > > >
> > > > -L(SPECIAL_VALUES_BRANCH):
> > > > -        vmovups   %ymm12, 32(%rsp)
> > > > -        vmovups   %ymm0, 64(%rsp)
> > > > -                                # LOE rbx r13 r14 r15 r11d ymm0
> > > > -
> > > > -        xorl      %r12d, %r12d
> > > > -                                # LOE rbx r13 r14 r15 r11d r12d
> > > > -
> > > > -        vzeroupper
> > > > -        movq      %r13, 8(%rsp)
> > > > -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
> > > > -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
> > > > -        movl      %r11d, %r13d
> > > > -        movq      %r14, (%rsp)
> > > > -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
> > > > -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
> > > > -                                # LOE rbx r15 r12d r13d
> > > > -
> > > > -/* Range mask
> > > > - * bits check
> > > > - */
> > > > +       /* Store special cases in ymm15.  */
> > > > +       vpcmpgtd TANHF_DATA(_iExpMask)(%rip), %ymm4, %ymm15
> > > >
> > > > -L(RANGEMASK_CHECK):
> > > > -        btl       %r12d, %r13d
> > > >
> > > > -/* Call scalar math function */
> > > > -        jc        L(SCALAR_MATH_CALL)
> > > > -                                # LOE rbx r15 r12d r13d
> > > > +       /* Store base of lookup table in rax.  */
> > > > +       leaq    TANHF_DATA(_lookupTable)(%rip), %rax
> > > >
> > > > -/* Special inputs
> > > > - * processing loop
> > > > - */
> > > > +       /* We are splitting ymm1 into 8 GPRs. This may be faster to do with
> > > > +          store/load as we can take advantage of store-forwarding.  */
> > > > +       vmovq   %xmm1, %r8
> > > > +       /* We have eliminated all negative values for ymm1 so no need to sign
> > > > +          extend.  */
> > > > +       movl    %r8d, %r9d
> > > > +       shrq    $32, %r8
> > > >
> > > > -L(SPECIAL_VALUES_LOOP):
> > > > -        incl      %r12d
> > > > -        cmpl      $8, %r12d
> > > > -
> > > > -/* Check bits in range mask */
> > > > -        jl        L(RANGEMASK_CHECK)
> > > > -                                # LOE rbx r15 r12d r13d
> > > > -
> > > > -        movq      8(%rsp), %r13
> > > > -        cfi_restore(13)
> > > > -        movq      (%rsp), %r14
> > > > -        cfi_restore(14)
> > > > -        vmovups   64(%rsp), %ymm0
> > > > -
> > > > -/* Go to exit */
> > > > -        jmp       L(EXIT)
> > > > -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
> > > > -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
> > > > -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
> > > > -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
> > > > -                                # LOE rbx r13 r14 r15 ymm0
> > > > -
> > > > -/* Scalar math function call
> > > > - * to process special input
> > > > - */
> > > > +       /* Instead of using cross-lane permutes on ymm vectors, use vinsertf128
> > > > +          with a memory operand. This helps alleviate bottleneck on p5.  */
> > > > +       vmovdqu 16(%r9, %rax), %xmm5
> > > >
> > > > -L(SCALAR_MATH_CALL):
> > > > -        movl      %r12d, %r14d
> > > > -        movss     32(%rsp,%r14,4), %xmm0
> > > > -        call      tanhf@PLT
> > > > -                                # LOE rbx r14 r15 r12d r13d xmm0
> > > > +       vpextrq $1, %xmm1, %rsi
> > > > +       movl    %esi, %edi
> > > > +       shrq    $32, %rsi
> > > >
> > > > -        movss     %xmm0, 64(%rsp,%r14,4)
> > > > +       vinsertf128 $1, 16(%rdi, %rax), %ymm5, %ymm5
> > > >
> > > > -/* Process special inputs in loop */
> > > > -        jmp       L(SPECIAL_VALUES_LOOP)
> > > > -                                # LOE rbx r15 r12d r13d
> > > > -END(_ZGVdN8v_tanhf_avx2)
> > > > +       vextracti128 $1, %ymm1, %xmm2
> > > > +       vmovq   %xmm2, %rdx
> > > > +       movl    %edx, %ecx
> > > > +       shrq    $32, %rdx
> > > > +
> > > > +       vmovdqu (%rcx, %rax), %xmm6
> > > > +
> > > > +       vpextrq $1, %xmm2, %r10
> > > > +       movl    %r10d, %r11d
> > > > +       shrq    $32, %r10
> > > > +
> > > > +       vinsertf128 $1, (%r11, %rax), %ymm6, %ymm6
> > > > +
> > > > +       vmovupd 16(%r8, %rax), %xmm1
> > > > +       vinsertf128 $1, 16(%rsi, %rax), %ymm1, %ymm1
> > > > +       vmovupd (%rdx, %rax), %xmm3
> > > > +       vinsertf128 $1, (%r10, %rax), %ymm3, %ymm3
> > > > +
> > > > +       vunpcklpd %ymm3, %ymm6, %ymm7
> > > > +       vunpckhpd %ymm3, %ymm6, %ymm6
> > > > +
> > > > +       vunpcklpd %ymm1, %ymm5, %ymm3
> > > > +       vunpckhpd %ymm1, %ymm5, %ymm1
> > > > +
> > > > +       vmovaps TANHF_DATA(_sAbsMask)(%rip), %ymm11
> > > > +       vandps  %ymm11, %ymm0, %ymm4
> > > >
> > > > -        .section .rodata, "a"
> > > > -        .align 32
> > > > -
> > > > -#ifdef __svml_stanh_data_internal_typedef
> > > > -typedef unsigned int VUINT32;
> > > > -typedef struct
> > > > -{
> > > > -        __declspec(align(32)) VUINT32 _dbP[(134*4)][2];
> > > > -        __declspec(align(32)) VUINT32 _sSignMask[8][1];
> > > > -        __declspec(align(32)) VUINT32 _sAbsMask[8][1];
> > > > -        __declspec(align(32)) VUINT32 _iExpMantMask[8][1];
> > > > -        __declspec(align(32)) VUINT32 _iExpMask[8][1];
> > > > -        __declspec(align(32)) VUINT32 _iMinIdxOfsMask[8][1];
> > > > -        __declspec(align(32)) VUINT32 _iMaxIdxMask[8][1];
> > > > -} __svml_stanh_data_internal;
> > > > -#endif
> > > > -__svml_stanh_data_internal:
> > > > -        /* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500]. */
> > > > -        .quad 0x0000000000000000  /* A00 = +0.000000000000000000000e-01 */
> > > > -        .quad 0x3FF00000022C70EB  /* A01 = +1.000000008097283510367e+00 */
> > > > -        .quad 0xBED00E878CFFA194  /* A02 = -3.828228912518614443549e-06 */
> > > > -        .quad 0xBFD551766D0607A9  /* A03 = -3.330970825846813476723e-01 */
> > > > -        .quad 0xBE53D60CE3E4C297  /* A00 = -1.847383956330407336230e-08 */
> > > > -        .quad 0x3FF000024177CF5C  /* A01 = +1.000002151235967140508e+00 */
> > > > -        .quad 0xBF1758BC94A51A25  /* A02 = -8.906031613262943753568e-05 */
> > > > -        .quad 0xBFD53EAE67E0D4F0  /* A03 = -3.319507612644221339337e-01 */
> > > > -        .quad 0xBE5A9E47EF32D6FE  /* A00 = -2.479020984039698285657e-08 */
> > > > -        .quad 0x3FF00002DA983057  /* A01 = +1.000002721676556793895e+00 */
> > > > -        .quad 0xBF1BD953509E94AA  /* A02 = -1.062352277175377670507e-04 */
> > > > -        .quad 0xBFD53BDB562EEDD5  /* A03 = -3.317783681520414806876e-01 */
> > > > -        .quad 0xBE6191BBE496D294  /* A00 = -3.272532162914017685901e-08 */
> > > > -        .quad 0x3FF0000390492017  /* A01 = +1.000003398528866105366e+00 */
> > > > -        .quad 0xBF20727E814A57CE  /* A02 = -1.254825043772153972919e-04 */
> > > > -        .quad 0xBFD538DE060A6F22  /* A03 = -3.315959033004550748913e-01 */
> > > > -        .quad 0xBE66DAFA2A893A25  /* A00 = -4.257146219278012568149e-08 */
> > > > -        .quad 0x3FF0000465E08CD1  /* A01 = +1.000004194219219266770e+00 */
> > > > -        .quad 0xBF2341C765EF91B6  /* A02 = -1.469188600530365522261e-04 */
> > > > -        .quad 0xBFD535B6841FAF9E  /* A03 = -3.314033785124993469751e-01 */
> > > > -        .quad 0xBE6D5794E361E964  /* A00 = -5.465394929765249413434e-08 */
> > > > -        .quad 0x3FF000055EE2A0CB  /* A01 = +1.000005121846742950353e+00 */
> > > > -        .quad 0xBF265E6C77E66C8B  /* A02 = -1.706607253709506650304e-04 */
> > > > -        .quad 0xBFD53264DDCCEDA6  /* A03 = -3.312008062382240103361e-01 */
> > > > -        .quad 0xBE729C844D374A6E  /* A00 = -6.933284462462096107184e-08 */
> > > > -        .quad 0x3FF000067F019093  /* A01 = +1.000006195180536350264e+00 */
> > > > -        .quad 0xBF29CC5348D6DCE5  /* A02 = -1.968242326435338705130e-04 */
> > > > -        .quad 0xBFD52EE92121ED35  /* A03 = -3.309881995734998416658e-01 */
> > > > -        .quad 0xBE775AEA17EAA872  /* A00 = -8.700465590574974405858e-08 */
> > > > -        .quad 0x3FF00007CA1D66B8  /* A01 = +1.000007428656699559610e+00 */
> > > > -        .quad 0xBF2D8F5EB98A2637  /* A02 = -2.255252009216044881395e-04 */
> > > > -        .quad 0xBFD52B435CDF9128  /* A03 = -3.307655722585587376727e-01 */
> > > > -        .quad 0xBE7D04DA28C343F0  /* A00 = -1.081040272327705484794e-07 */
> > > > -        .quad 0x3FF000094443CCF5  /* A01 = +1.000008837375216730337e+00 */
> > > > -        .quad 0xBF30D5B76C947AE5  /* A02 = -2.568791210978817814332e-04 */
> > > > -        .quad 0xBFD52773A0776FAD  /* A03 = -3.305329386764651045105e-01 */
> > > > -        .quad 0xBE81DD77A12C51C7  /* A00 = -1.331054169875768625701e-07 */
> > > > -        .quad 0x3FF0000AF1AFD2DA  /* A01 = +1.000010437096696680470e+00 */
> > > > -        .quad 0xBF331230624C1680  /* A02 = -2.910011410651516805537e-04 */
> > > > -        .quad 0xBFD52379FC0B61DF  /* A03 = -3.302903138515186909352e-01 */
> > > > -        .quad 0xBE85D04EEEB3C435  /* A00 = -1.625247628488202841012e-07 */
> > > > -        .quad 0x3FF0000CD6C9B1F2  /* A01 = +1.000012244238970726684e+00 */
> > > > -        .quad 0xBF357F0742FADDD4  /* A02 = -3.280060509313874068243e-04 */
> > > > -        .quad 0xBFD51F56806D0E81  /* A03 = -3.300377134475880880338e-01 */
> > > > -        .quad 0xBE8A6E289B59681B  /* A00 = -1.969211333326924655065e-07 */
> > > > -        .quad 0x3FF0000EF8268F72  /* A01 = +1.000014275873550406715e+00 */
> > > > -        .quad 0xBF381E277A1B747A  /* A02 = -3.680082682942575423093e-04 */
> > > > -        .quad 0xBFD51B093F1D6FD4  /* A03 = -3.297751537663746734808e-01 */
> > > > -        .quad 0xBE8FCBC40EE9ABD5  /* A00 = -2.368983653301529373887e-07 */
> > > > -        .quad 0x3FF000115A883B6C  /* A01 = +1.000016549721943981410e+00 */
> > > > -        .quad 0xBF3AF17AC974B3D9  /* A02 = -4.111218235774406434303e-04 */
> > > > -        .quad 0xBFD516924A4C549C  /* A03 = -3.295026517456081105450e-01 */
> > > > -        .quad 0xBE92FFBC60A3F956  /* A00 = -2.831066871072026054144e-07 */
> > > > -        .quad 0x3FF0001402DCED8A  /* A01 = +1.000019084151832604590e+00 */
> > > > -        .quad 0xBF3DFAE9390C4801  /* A02 = -4.574603454311488280083e-04 */
> > > > -        .quad 0xBFD511F1B4D7DC3A  /* A03 = -3.292202249571719585575e-01 */
> > > > -        .quad 0xBE9690A22F96D5AD  /* A00 = -3.362443262393081632612e-07 */
> > > > -        .quad 0x3FF00016F63EFF5D  /* A01 = +1.000021898173108825247e+00 */
> > > > -        .quad 0xBF409E2C839605BB  /* A02 = -5.071370461992499986334e-04 */
> > > > -        .quad 0xBFD50D27924BEE00  /* A03 = -3.289278916051614487515e-01 */
> > > > -        .quad 0xBE9AA56C65E72A73  /* A00 = -3.970591019557469835586e-07 */
> > > > -        .quad 0x3FF0001A39F4A43E  /* A01 = +1.000025011433776978009e+00 */
> > > > -        .quad 0xBF425BD74C3D6667  /* A02 = -5.602647074553602319844e-04 */
> > > > -        .quad 0xBFD50833F6E1ABA2  /* A03 = -3.286256705238718156536e-01 */
> > > > -        .quad 0xBE9F4BD4FF1A83B0  /* A00 = -4.663500013744687071912e-07 */
> > > > -        .quad 0x3FF0001DD36F9EC2  /* A01 = +1.000028444215715683896e+00 */
> > > > -        .quad 0xBF44376634149405  /* A02 = -6.169556656102642569831e-04 */
> > > > -        .quad 0xBFD50316F77EDEE5  /* A03 = -3.283135811757190158922e-01 */
> > > > -        .quad 0xBEA3B625387BB079  /* A00 = -5.874486399249461304297e-07 */
> > > > -        .quad 0x3FF00023E14CFBA9  /* A01 = +1.000034217911642153709e+00 */
> > > > -        .quad 0xBF47392F923218D2  /* A02 = -7.087213783883111826306e-04 */
> > > > -        .quad 0xBFD4FB1FACDEB938  /* A03 = -3.278273761924483942209e-01 */
> > > > -        .quad 0xBEAA6E24F543500A  /* A00 = -7.876828740601738750574e-07 */
> > > > -        .quad 0x3FF0002D5C6E8412  /* A01 = +1.000043259679163742959e+00 */
> > > > -        .quad 0xBF4BAF02BD7FDD70  /* A02 = -8.448375110664940040861e-04 */
> > > > -        .quad 0xBFD4EFEE6527A7DE  /* A03 = -3.271442401734229177279e-01 */
> > > > -        .quad 0xBEB16E3EBE2157D0  /* A00 = -1.038947396133402500647e-06 */
> > > > -        .quad 0x3FF00038990FEE2F  /* A01 = +1.000053975962952312884e+00 */
> > > > -        .quad 0xBF50569481C574CB  /* A02 = -9.972048056490652716971e-04 */
> > > > -        .quad 0xBFD4E419278DA2B4  /* A03 = -3.264220129263251113372e-01 */
> > > > -        .quad 0xBEB6A7B6723165D4  /* A00 = -1.350350836279403750524e-06 */
> > > > -        .quad 0x3FF00045CAB4158E  /* A01 = +1.000066558657042303793e+00 */
> > > > -        .quad 0xBF531D7C9C849108  /* A02 = -1.166698160951775212202e-03 */
> > > > -        .quad 0xBFD4D7A0BB33B152  /* A03 = -3.256608799117844954552e-01 */
> > > > -        .quad 0xBEBD0EE2A8654AFD  /* A00 = -1.732000471561702711532e-06 */
> > > > -        .quad 0x3FF00055276F18D6  /* A01 = +1.000081209219890521211e+00 */
> > > > -        .quad 0xBF562FDBA3FB6C6C  /* A02 = -1.354183666925102939860e-03 */
> > > > -        .quad 0xBFD4CA85F1B93DB2  /* A03 = -3.248610363561638125773e-01 */
> > > > -        .quad 0xBEC269D4036A207E  /* A00 = -2.195047297096822741730e-06 */
> > > > -        .quad 0x3FF00066E7DA6E4E  /* A01 = +1.000098138500919997540e+00 */
> > > > -        .quad 0xBF5991499FC36B3A  /* A02 = -1.560518167983372759405e-03 */
> > > > -        .quad 0xBFD4BCC9A72283D6  /* A03 = -3.240226871658341556426e-01 */
> > > > -        .quad 0xBEC7154B6C09CFE1  /* A00 = -2.751729738565190291276e-06 */
> > > > -        .quad 0x3FF0007B47086B80  /* A01 = +1.000117566559055148900e+00 */
> > > > -        .quad 0xBF5D455433B4F8F4  /* A02 = -1.786548832412968197680e-03 */
> > > > -        .quad 0xBFD4AE6CC1BFE145  /* A03 = -3.231460468373550942722e-01 */
> > > > -        .quad 0xBECCA68CC64A0F8A  /* A00 = -3.415415948561670285790e-06 */
> > > > -        .quad 0x3FF00092827742F7  /* A01 = +1.000139722473418535387e+00 */
> > > > -        .quad 0xBF60A7BF15A527AF  /* A02 = -2.033112728132522705610e-03 */
> > > > -        .quad 0xBFD49F703214084C  /* A03 = -3.222313393636155876010e-01 */
> > > > -        .quad 0xBED19E68676B241B  /* A00 = -4.200644630977303616698e-06 */
> > > > -        .quad 0x3FF000ACDA037B26  /* A01 = +1.000164844146362863597e+00 */
> > > > -        .quad 0xBF62D99F836A02F8  /* A02 = -2.301036405072284102280e-03 */
> > > > -        .quad 0xBFD48FD4F2B91B28  /* A03 = -3.212787981359945810311e-01 */
> > > > -        .quad 0xBED57CF4B0C7AA54  /* A00 = -5.123164339408145209103e-06 */
> > > > -        .quad 0x3FF000CA8FD9E1A1  /* A01 = +1.000193178099017865534e+00 */
> > > > -        .quad 0xBF653A014548E686  /* A02 = -2.591135484433962181405e-03 */
> > > > -        .quad 0xBFD47F9C0844B38F  /* A03 = -3.202886658426046806447e-01 */
> > > > -        .quad 0xBEDA012B1B1A41E2  /* A00 = -6.199971197454598722328e-06 */
> > > > -        .quad 0x3FF000EBE868FDF4  /* A01 = +1.000224979259539459520e+00 */
> > > > -        .quad 0xBF67CA9427E0A544  /* A02 = -2.904214255086275467410e-03 */
> > > > -        .quad 0xBFD46EC6812ADB37  /* A03 = -3.192611943626845749655e-01 */
> > > > -        .quad 0xBEDF3EAC5BF12194  /* A00 = -7.449344990702664567927e-06 */
> > > > -        .quad 0x3FF001112A520784  /* A01 = +1.000260510744255704196e+00 */
> > > > -        .quad 0xBF6A8D01ABDA4DC4  /* A02 = -3.241065277345108255891e-03 */
> > > > -        .quad 0xBFD45D55759FFA4A  /* A03 = -3.181966446572103146551e-01 */
> > > > -        .quad 0xBEE2A541BC274267  /* A00 = -8.890883582164319970972e-06 */
> > > > -        .quad 0x3FF0013A9E5961F2  /* A01 = +1.000300043631906721231e+00 */
> > > > -        .quad 0xBF6D82ECD080C540  /* A02 = -3.602468994380686462264e-03 */
> > > > -        .quad 0xBFD44B4A0779C0AD  /* A03 = -3.170952866557950611259e-01 */
> > > > -        .quad 0xBEE61D97609A27F4  /* A00 = -1.054553560499505625520e-05 */
> > > > -        .quad 0x3FF001688F56A3AF  /* A01 = +1.000343856731187974773e+00 */
> > > > -        .quad 0xBF7056F8EFB683EC  /* A02 = -3.989193351487490407647e-03 */
> > > > -        .quad 0xBFD438A5620F0F74  /* A03 = -3.159573991399533543500e-01 */
> > > > -        .quad 0xBEEA145429EDD370  /* A00 = -1.243563138839952927732e-05 */
> > > > -        .quad 0x3FF0019B4A242A67  /* A01 = +1.000392236341804297339e+00 */
> > > > -        .quad 0xBF7207D31CA78D9B  /* A02 = -4.401993423445739288258e-03 */
> > > > -        .quad 0xBFD42568BA16E7CD  /* A03 = -3.147832696228050619602e-01 */
> > > > -        .quad 0xBEEE96370D52680F  /* A00 = -1.458491207477835326165e-05 */
> > > > -        .quad 0x3FF001D31D8E4115  /* A01 = +1.000445476009251821736e+00 */
> > > > -        .quad 0xBF73D4CC11EDC094  /* A02 = -4.841611050196221316400e-03 */
> > > > -        .quad 0xBFD411954D8664E7  /* A03 = -3.135731942252974469021e-01 */
> > > > -        .quad 0xBEF338C046215EF8  /* A00 = -1.833122622260562810219e-05 */
> > > > -        .quad 0x3FF00230C32C2EC1  /* A01 = +1.000534784691737621998e+00 */
> > > > -        .quad 0xBF76BD019BCC5DAF  /* A02 = -5.551344188254799492943e-03 */
> > > > -        .quad 0xBFD3F2C7156DC21E  /* A03 = -3.116929730668135389848e-01 */
> > > > -        .quad 0xBEF9B15EAE411EAE  /* A00 = -2.450261207822986676092e-05 */
> > > > -        .quad 0x3FF002C2DF057A4D  /* A01 = +1.000674124886830940184e+00 */
> > > > -        .quad 0xBF7B08CCD9AC1E30  /* A02 = -6.600189396301511801646e-03 */
> > > > -        .quad 0xBFD3C7A7A114FED8  /* A03 = -3.090609620157755976777e-01 */
> > > > -        .quad 0xBF00E36483C373B3  /* A00 = -3.221178528332122595812e-05 */
> > > > -        .quad 0x3FF0036F419480D7  /* A01 = +1.000838524028997644777e+00 */
> > > > -        .quad 0xBF7FD255D1777007  /* A02 = -7.768950679260206403087e-03 */
> > > > -        .quad 0xBFD39A453911D6CE  /* A03 = -3.062909180947429588215e-01 */
> > > > -        .quad 0xBF05DFA04DD12059  /* A00 = -4.172046622180685472624e-05 */
> > > > -        .quad 0x3FF00438B2A03D8D  /* A01 = +1.001030633695197069599e+00 */
> > > > -        .quad 0xBF828F8DBB4A9D10  /* A02 = -9.062869337255224921890e-03 */
> > > > -        .quad 0xBFD36AAB704697D9  /* A03 = -3.033856007044711255993e-01 */
> > > > -        .quad 0xBF0BF3E0C647DEFB  /* A00 = -5.331544597092331081714e-05 */
> > > > -        .quad 0x3FF005221063D36D  /* A01 = +1.001253189109060359741e+00 */
> > > > -        .quad 0xBF857A2CB3C96102  /* A02 = -1.048693584122917590862e-02 */
> > > > -        .quad 0xBFD338E65BBB4FEC  /* A03 = -3.003478904549854444639e-01 */
> > > > -        .quad 0xBF11A506ED7C9D31  /* A00 = -6.730894835681591541979e-05 */
> > > > -        .quad 0x3FF0062E4D0EA92A  /* A01 = +1.001508999829250345925e+00 */
> > > > -        .quad 0xBF88AB82C2761AF3  /* A02 = -1.204588085125866091241e-02 */
> > > > -        .quad 0xBFD305028D6BD206  /* A03 = -2.971807843271395688234e-01 */
> > > > -        .quad 0xBF1607C0922D9BF1  /* A00 = -8.403885708006799337092e-05 */
> > > > -        .quad 0x3FF007606C341961  /* A01 = +1.001800940198869449560e+00 */
> > > > -        .quad 0xBF8C25E6DA487BCF  /* A02 = -1.374416688582682892494e-02 */
> > > > -        .quad 0xBFD2CF0D0EE8F7B5  /* A03 = -2.938873906713255768075e-01 */
> > > > -        .quad 0xBF1B3A8480A0A16D  /* A00 = -1.038688061788578038307e-04 */
> > > > -        .quad 0x3FF008BB802D02D6  /* A01 = +1.002131939589323561535e+00 */
> > > > -        .quad 0xBF8FEB8AE99FD100  /* A02 = -1.558598065819483124983e-02 */
> > > > -        .quad 0xBFD297135BD0911B  /* A03 = -2.904709240558688843059e-01 */
> > > > -        .quad 0xBF20ABB9BDB75C65  /* A00 = -1.271881327357976163798e-04 */
> > > > -        .quad 0x3FF00A42A76D8CD1  /* A01 = +1.002504972472525901495e+00 */
> > > > -        .quad 0xBF91FF3D752BB9E6  /* A02 = -1.757522609380570560722e-02 */
> > > > -        .quad 0xBFD25D235C1F88B4  /* A03 = -2.869346999779154305799e-01 */
> > > > -        .quad 0xBF243D3254425461  /* A00 = -1.544116913733432829448e-04 */
> > > > -        .quad 0x3FF00BF909D1795E  /* A01 = +1.002923048355647051011e+00 */
> > > > -        .quad 0xBF94304E04D44942  /* A02 = -1.971551804042204897316e-02 */
> > > > -        .quad 0xBFD2214B5E61CFA6  /* A03 = -2.832821294498394371075e-01 */
> > > > -        .quad 0xBF286070011B61CE  /* A00 = -1.859795307186510085994e-04 */
> > > > -        .quad 0x3FF00DE1D5E1627E  /* A01 = +1.003389201612804537689e+00 */
> > > > -        .quad 0xBF9689D5F4163F59  /* A02 = -2.201017668045266231780e-02 */
> > > > -        .quad 0xBFD1E39A11C3B42C  /* A03 = -2.795167134743816728104e-01 */
> > > > -        .quad 0xBF2D250B366A79E8  /* A00 = -2.223564326486314902259e-04 */
> > > > -        .quad 0x3FF010003E134001  /* A01 = +1.003906481248123094829e+00 */
> > > > -        .quad 0xBF990C9FF91F6F81  /* A02 = -2.446222265267250853271e-02 */
> > > > -        .quad 0xBFD1A41E80084CDC  /* A03 = -2.756420374218586655246e-01 */
> > > > -        .quad 0xBF314DB5DDC2A30E  /* A00 = -2.640313157465248123865e-04 */
> > > > -        .quad 0x3FF012577608921B  /* A01 = +1.004477940624503018441e+00 */
> > > > -        .quad 0xBF9BB9626875B0C9  /* A02 = -2.707437288829409385849e-02 */
> > > > -        .quad 0xBFD162E80768A9D0  /* A03 = -2.716617653228725615122e-01 */
> > > > -        .quad 0xBF346A6133808864  /* A00 = -3.115165050094957730625e-04 */
> > > > -        .quad 0x3FF014EAAFCC88A3  /* A01 = +1.005106627192198898157e+00 */
> > > > -        .quad 0xBF9E90BEF9BF7419  /* A02 = -2.984903716411588595059e-02 */
> > > > -        .quad 0xBFD12006545F7FAD  /* A03 = -2.675796340899932457269e-01 */
> > > > -        .quad 0xBF37F180DC3848EA  /* A00 = -3.653468704395550778821e-04 */
> > > > -        .quad 0x3FF017BD19147861  /* A01 = +1.005795572250939295955e+00 */
> > > > -        .quad 0xBFA0C9A14C702E07  /* A02 = -3.278831537326359207851e-02 */
> > > > -        .quad 0xBFD0DB895B650092  /* A03 = -2.633994476818851682154e-01 */
> > > > -        .quad 0xBF3BEC6AAC6D7635  /* A00 = -4.260788377246944457107e-04 */
> > > > -        .quad 0x3FF01AD1D884E719  /* A01 = +1.006547780778822565040e+00 */
> > > > -        .quad 0xBFA260B2A1B1434A  /* A02 = -3.589399551186163439542e-02 */
> > > > -        .quad 0xBFD09581529E93D6  /* A03 = -2.591250712233067465817e-01 */
> > > > -        .quad 0xBF4164E26167882B  /* A00 = -5.308251737086202562063e-04 */
> > > > -        .quad 0x3FF01FEF14B62B81  /* A01 = +1.007796364693348545316e+00 */
> > > > -        .quad 0xBFA4EB014538AA42  /* A02 = -4.085544557559163403315e-02 */
> > > > -        .quad 0xBFD029D36FEAF41F  /* A03 = -2.525528519580024222613e-01 */
> > > > -        .quad 0xBF46F6FFF4E53DC8  /* A00 = -7.008313930700277652464e-04 */
> > > > -        .quad 0x3FF027CBB51CBBA0  /* A01 = +1.009715754956893363214e+00 */
> > > > -        .quad 0xBFA89DEC9FEC112E  /* A02 = -4.807986690687680864098e-02 */
> > > > -        .quad 0xBFCF2A99464D0DB4  /* A03 = -2.434875100390009317053e-01 */
> > > > -        .quad 0xBF4DCC9C4F66A4D9  /* A00 = -9.094012482836712945103e-04 */
> > > > -        .quad 0x3FF030E7CFCCD583  /* A01 = +1.011939822882909068014e+00 */
> > > > -        .quad 0xBFACAA3B95814081  /* A02 = -5.598627281199331645611e-02 */
> > > > -        .quad 0xBFCDF78F156BE7CF  /* A03 = -2.341173987004467604844e-01 */
> > > > -        .quad 0xBF5308ED74E5C7A6  /* A00 = -1.161796466103906435435e-03 */
> > > > -        .quad 0x3FF03B5986412ECB  /* A01 = +1.014489674026594512313e+00 */
> > > > -        .quad 0xBFB087EBA88DCC3F  /* A02 = -6.457398285947223148806e-02 */
> > > > -        .quad 0xBFCCBB9BD134862F  /* A03 = -2.244753619680052991736e-01 */
> > > > -        .quad 0xBF57FA23C00DF4B5  /* A00 = -1.463446533505758208674e-03 */
> > > > -        .quad 0x3FF0473558A1BCC0  /* A01 = +1.017384859292903342975e+00 */
> > > > -        .quad 0xBFB2E702BC6360EF  /* A02 = -7.383744334527241048871e-02 */
> > > > -        .quad 0xBFCB77D546379288  /* A03 = -2.145945160729250122955e-01 */
> > > > -        .quad 0xBF5DD12971557F71  /* A00 = -1.819887610814388068450e-03 */
> > > > -        .quad 0x3FF0548DDF5000A8  /* A01 = +1.020643112482540360020e+00 */
> > > > -        .quad 0xBFB571B63DA186E1  /* A02 = -8.376635555898871710045e-02 */
> > > > -        .quad 0xBFCA2D5202605148  /* A03 = -2.045080672838912594358e-01 */
> > > > -        .quad 0xBF6252B1AD5D4F17  /* A00 = -2.236697221556737096709e-03 */
> > > > -        .quad 0x3FF063738A910BF7  /* A01 = +1.024280110622155737232e+00 */
> > > > -        .quad 0xBFB8270C8E6B601B  /* A02 = -9.434584118878357184013e-02 */
> > > > -        .quad 0xBFC8DD27D950A07E  /* A03 = -1.942491351230763441116e-01 */
> > > > -        .quad 0xBF66470C91730CFC  /* A00 = -2.719425723258004842786e-03 */
> > > > -        .quad 0x3FF073F468FCF331  /* A01 = +1.028309259519300633556e+00 */
> > > > -        .quad 0xBFBB05C2952191E4  /* A02 = -1.055566419686964629854e-01 */
> > > > -        .quad 0xBFC7886A770DE2BD  /* A03 = -1.838505822486435070662e-01 */
> > > > -        .quad 0xBF6AD114AC8E98EC  /* A00 = -3.273525599485007861467e-03 */
> > > > -        .quad 0x3FF0861BF53E5226  /* A01 = +1.032741506559554434119e+00 */
> > > > -        .quad 0xBFBE0C4F9B461507  /* A02 = -1.173753503881763554650e-01 */
> > > > -        .quad 0xBFC6302A037CDE3A  /* A03 = -1.733448521642786954722e-01 */
> > > > -        .quad 0xBF6FFBDE2A6C2AF8  /* A00 = -3.904279630096648551207e-03 */
> > > > -        .quad 0x3FF099F2EB8E7DA3  /* A01 = +1.037585182326304034106e+00 */
> > > > -        .quad 0xBFC09C74D192DDF0  /* A02 = -1.297746680554463516444e-01 */
> > > > -        .quad 0xBFC4D571D8E3079F  /* A03 = -1.627638157861470424859e-01 */
> > > > -        .quad 0xBF72E8FDC0B952AA  /* A00 = -4.616728994353872309042e-03 */
> > > > -        .quad 0x3FF0AF7F273C9533  /* A01 = +1.042845872181101141152e+00 */
> > > > -        .quad 0xBFC244C512736F10  /* A02 = -1.427236881344176033792e-01 */
> > > > -        .quad 0xBFC379474F58B902  /* A03 = -1.521386277613104298645e-01 */
> > > > -        .quad 0xBF762EABAF17395B  /* A00 = -5.415602341101023557701e-03 */
> > > > -        .quad 0x3FF0C6C3886F63FB  /* A01 = +1.048526318502125631582e+00 */
> > > > -        .quad 0xBFC3FDF9918EA12A  /* A02 = -1.561881981590514389957e-01 */
> > > > -        .quad 0xBFC21CA89ECAB895  /* A03 = -1.414995932913753196036e-01 */
> > > > -        .quad 0xBF79D387CE5B2BAE  /* A00 = -6.305246822828998107258e-03 */
> > > > -        .quad 0x3FF0DFBFE2346376  /* A01 = +1.054626353847394337748e+00 */
> > > > -        .quad 0xBFC5C6DA43602620  /* A02 = -1.701309994680721970894e-01 */
> > > > -        .quad 0xBFC0C08BD8DB6631  /* A03 = -1.308760460731704100557e-01 */
> > > > -        .quad 0xBF7DDBA8E8DA9060  /* A00 = -7.289562037531366334164e-03 */
> > > > -        .quad 0x3FF0FA70F0D1B464  /* A01 = +1.061142864894713433443e+00 */
> > > > -        .quad 0xBFC79E18D92BAA7C  /* A02 = -1.845122394946264732241e-01 */
> > > > -        .quad 0xBFBECBBBF74C2669  /* A03 = -1.202962378266875381749e-01 */
> > > > -        .quad 0xBF81254E76EA25DA  /* A00 = -8.371937755572145950511e-03 */
> > > > -        .quad 0x3FF116D05835EBD0  /* A01 = +1.068069786618014660462e+00 */
> > > > -        .quad 0xBFC982539E2ED224  /* A02 = -1.992897531869327609755e-01 */
> > > > -        .quad 0xBFBC1B043C350159  /* A03 = -1.097872397413132278254e-01 */
> > > > -        .quad 0xBF8391ACBA863403  /* A00 = -9.555196230190082448686e-03 */
> > > > -        .quad 0x3FF134D4AA477FE2  /* A01 = +1.075398125794884141015e+00 */
> > > > -        .quad 0xBFCB7218609FEAFB  /* A02 = -2.144194099235717521079e-01 */
> > > > -        .quad 0xBFB970A16CB88329  /* A03 = -9.937485603633135211599e-02 */
> > > > -        .quad 0xBF87935088E48E8B  /* A00 = -1.151144902957603431692e-02 */
> > > > -        .quad 0x3FF1649892AD7DD3  /* A01 = +1.087059567413110938716e+00 */
> > > > -        .quad 0xBFCE6971DDE75409  /* A02 = -2.375929196847723912089e-01 */
> > > > -        .quad 0xBFB58291E88CB251  /* A03 = -8.402358939628952472223e-02 */
> > > > -        .quad 0xBF8DB3A62C325325  /* A00 = -1.450280973794233242702e-02 */
> > > > -        .quad 0x3FF1A9C900C6DEEA  /* A01 = +1.103951457056548068891e+00 */
> > > > -        .quad 0xBFD13DBC65B0E08E  /* A02 = -2.693930619311765140012e-01 */
> > > > -        .quad 0xBFB06696F62696D1  /* A03 = -6.406539449252625362252e-02 */
> > > > -        .quad 0xBF92583699F2E27A  /* A00 = -1.791463198307716858659e-02 */
> > > > -        .quad 0x3FF1F451B85AA9F0  /* A01 = +1.122148246892376022288e+00 */
> > > > -        .quad 0xBFD34FD5F8288180  /* A02 = -3.017477916164565954205e-01 */
> > > > -        .quad 0xBFA6FB692825B683  /* A03 = -4.488686194495718900788e-02 */
> > > > -        .quad 0xBF9641C26E673D6F  /* A00 = -2.173522757385398448959e-02 */
> > > > -        .quad 0x3FF24364DA5E2B07  /* A01 = +1.141453602790251542487e+00 */
> > > > -        .quad 0xBFD564A5A5EF5890  /* A02 = -3.342680092295120530821e-01 */
> > > > -        .quad 0xBF9B43712011A982  /* A03 = -2.662445791467283467968e-02 */
> > > > -        .quad 0xBF9A901038EC2F39  /* A00 = -2.594018313816024226548e-02 */
> > > > -        .quad 0x3FF2961356DFFEBA  /* A01 = +1.161639537196534011088e+00 */
> > > > -        .quad 0xBFD775EBB17198C7  /* A02 = -3.665723069046972759644e-01 */
> > > > -        .quad 0xBF833B1A926CD462  /* A03 = -9.390075295963199591975e-03 */
> > > > -        .quad 0xBF9F396A6A461B91  /* A00 = -3.049246095317987084727e-02 */
> > > > -        .quad 0x3FF2EB53BAEF534B  /* A01 = +1.182452898229899629357e+00 */
> > > > -        .quad 0xBFD97DABF8AD8BBD  /* A02 = -3.982953957076310058660e-01 */
> > > > -        .quad 0x3F7B8F6A3E0F8837  /* A03 = +6.728568086119371925713e-03 */
> > > > -        .quad 0xBFA21878590F8BAA  /* A00 = -3.534294211546946951064e-02 */
> > > > -        .quad 0x3FF34209790236E1  /* A01 = +1.203622315111197105253e+00 */
> > > > -        .quad 0xBFDB764C0E71BECB  /* A02 = -4.290952817018306997277e-01 */
> > > > -        .quad 0x3F962FE0C03F84C0  /* A03 = +2.166701482190513949888e-02 */
> > > > -        .quad 0xBFA4B36B9AD27ECC  /* A00 = -4.043136849327097492868e-02 */
> > > > -        .quad 0x3FF3990C5B12FC16  /* A01 = +1.224865298994477935679e+00 */
> > > > -        .quad 0xBFDD5AABB0D01390  /* A02 = -4.586590983092770912322e-01 */
> > > > -        .quad 0x3FA21DAF5CA162DB  /* A03 = +3.538272863142363083844e-02 */
> > > > -        .quad 0xBFA7645E4D7BF28B  /* A00 = -4.568762489177399105378e-02 */
> > > > -        .quad 0x3FF3EF2FD51C0D9F  /* A01 = +1.245895225962932562069e+00 */
> > > > -        .quad 0xBFDF26377E1B686E  /* A02 = -4.867075664057044503963e-01 */
> > > > -        .quad 0x3FA8803E756EE812  /* A03 = +4.785342391501513914509e-02 */
> > > > -        .quad 0xBFAA210925C64413  /* A00 = -5.103329263796054643398e-02 */
> > > > -        .quad 0x3FF44349F897D8E7  /* A01 = +1.266427966181760345066e+00 */
> > > > -        .quad 0xBFE06A7B02C6D8E2  /* A02 = -5.129981092675530707226e-01 */
> > > > -        .quad 0x3FAE3F194734F5D0  /* A03 = +5.907515520309980505687e-02 */
> > > > -        .quad 0xBFACDE48F8A19BBB  /* A00 = -5.638340029764018351832e-02 */
> > > > -        .quad 0x3FF49439D5466582  /* A01 = +1.286187966447272845727e+00 */
> > > > -        .quad 0xBFE131C7C1063DDC  /* A02 = -5.373266954429101183166e-01 */
> > > > -        .quad 0x3FB1ADEEC36AD805  /* A03 = +6.906025191241844940482e-02 */
> > > > -        .quad 0xBFAF905D8F585680  /* A00 = -6.164829611604449866036e-02 */
> > > > -        .quad 0x3FF4E0ED1FD27F99  /* A01 = +1.304913639360142818546e+00 */
> > > > -        .quad 0xBFE1E7A859DC1D3D  /* A02 = -5.595285182070380836095e-01 */
> > > > -        .quad 0x3FB3ED018E4642A1  /* A03 = +7.783517573831001679086e-02 */
> > > > -        .quad 0xBFB11595104160BA  /* A00 = -6.673556944713512906198e-02 */
> > > > -        .quad 0x3FF528650340490B  /* A01 = +1.322361958217302513319e+00 */
> > > > -        .quad 0xBFE28B14B40BC974  /* A02 = -5.794776455425521000109e-01 */
> > > > -        .quad 0x3FB5DF49F5BAF6D7  /* A03 = +8.543836831355676453281e-02 */
> > > > -        .quad 0xBFB2513A97344BA4  /* A00 = -7.155195418844911836587e-02 */
> > > > -        .quad 0x3FF569BA0DB5EE14  /* A01 = +1.338312200124055273420e+00 */
> > > > -        .quad 0xBFE31B53A8B67B20  /* A02 = -5.970857901737396389308e-01 */
> > > > -        .quad 0x3FB787F297BB0544  /* A03 = +9.191814617499455275507e-02 */
> > > > -        .quad 0xBFB37512E848FAFA  /* A00 = -7.600515528700305112331e-02 */
> > > > -        .quad 0x3FF5A41F33B403C8  /* A01 = +1.352568819013173495591e+00 */
> > > > -        .quad 0xBFE397F6EA9A58A5  /* A02 = -6.123003561103997904880e-01 */
> > > > -        .quad 0x3FB8EAA9FF25CA06  /* A03 = +9.733068923177520814782e-02 */
> > > > -        .quad 0xBFB47B3E603AFC5D  /* A00 = -8.000554894805263217439e-02 */
> > > > -        .quad 0x3FF5D6E3EDE40487  /* A01 = +1.364963464031718975988e+00 */
> > > > -        .quad 0xBFE400D5BCA6D631  /* A02 = -6.251019177058819709103e-01 */
> > > > -        .quad 0x3FBA0B830ED567FE  /* A03 = +1.017381583418739132707e-01 */
> > > > -        .quad 0xBFB5BBFE8AC90496  /* A00 = -8.489981544791400103200e-02 */
> > > > -        .quad 0x3FF612BA70107E95  /* A01 = +1.379572332145390989311e+00 */
> > > > -        .quad 0xBFE477EAF1FA7693  /* A02 = -6.396383978023599814478e-01 */
> > > > -        .quad 0x3FBB4784B7C08A95  /* A03 = +1.065600346196709652391e-01 */
> > > > -        .quad 0xBFB6D5D940743939  /* A00 = -8.920057128509463473254e-02 */
> > > > -        .quad 0x3FF644A8748F70CE  /* A01 = +1.391762214006166953340e+00 */
> > > > -        .quad 0xBFE4D646AB07EA37  /* A02 = -6.511567440459832267763e-01 */
> > > > -        .quad 0x3FBC354F4E1D5292  /* A03 = +1.101884427747086558913e-01 */
> > > > -        .quad 0xBFB7223D19E4F3D1  /* A00 = -9.036619074045339206069e-02 */
> > > > -        .quad 0x3FF6518FEB42B7FA  /* A01 = +1.394912642466350494175e+00 */
> > > > -        .quad 0xBFE4ED86CB87498C  /* A02 = -6.539949393430091184598e-01 */
> > > > -        .quad 0x3FBC6D29F28CCA9B  /* A03 = +1.110407082713131127205e-01 */
> > > > -        .quad 0xBFB6878652FF6312  /* A00 = -8.800544287022329936754e-02 */
> > > > -        .quad 0x3FF63948C302D040  /* A01 = +1.388985406648330922508e+00 */
> > > > -        .quad 0xBFE4C4E2E7904E17  /* A02 = -6.490339777687407218920e-01 */
> > > > -        .quad 0x3FBC127356CA1ABE  /* A03 = +1.096565329445224612481e-01 */
> > > > -        .quad 0xBFB4F5D18B0C91D6  /* A00 = -8.187589306596207427980e-02 */
> > > > -        .quad 0x3FF5FD27EB7DD0B8  /* A01 = +1.374305648697413673176e+00 */
> > > > -        .quad 0xBFE464E01A2B2FC6  /* A02 = -6.373138915164353601739e-01 */
> > > > -        .quad 0x3FBB460547674A30  /* A03 = +1.065371798825160976065e-01 */
> > > > -        .quad 0xBFB26642FA16A685  /* A00 = -7.187288861919156890412e-02 */
> > > > -        .quad 0x3FF59F9BEDE1C95A  /* A01 = +1.351467065073470141812e+00 */
> > > > -        .quad 0xBFE3D67920C8FBEA  /* A02 = -6.199308052381387046381e-01 */
> > > > -        .quad 0x3FBA24F6A8D3CBC1  /* A03 = +1.021265184570401413078e-01 */
> > > > -        .quad 0xBFADB5294794F097  /* A00 = -5.802277563859197656582e-02 */
> > > > -        .quad 0x3FF523EA7B9CF453  /* A01 = +1.321268542159732772845e+00 */
> > > > -        .quad 0xBFE322A8B55E35DB  /* A02 = -5.979808370918208160205e-01 */
> > > > -        .quad 0x3FB8C8673B1B3E37  /* A03 = +9.680791085269722928697e-02 */
> > > > -        .quad 0xBFA4B7D661965C6A  /* A00 = -4.046506825687219699450e-02 */
> > > > -        .quad 0x3FF48DE3E2CE3122  /* A01 = +1.284641157110919085227e+00 */
> > > > -        .quad 0xBFE251FED1A7F445  /* A02 = -5.725092024655472622285e-01 */
> > > > -        .quad 0x3FB745699FCABDB9  /* A03 = +9.090290213747821701507e-02 */
> > > > -        .quad 0xBF93E60456E4EE1D  /* A00 = -1.943213253365004902773e-02 */
> > > > -        .quad 0x3FF3E1A14E628A59  /* A01 = +1.242585474196536532432e+00 */
> > > > -        .quad 0xBFE16C5AB660E876  /* A02 = -5.444768488007543094653e-01 */
> > > > -        .quad 0x3FB5AD33AA8C188F  /* A03 = +8.467410005332197397987e-02 */
> > > > -        .quad 0x3F738C17C47C7961  /* A00 = +4.772274820224659853951e-03 */
> > > > -        .quad 0x3FF3234DDE3BD146  /* A01 = +1.196119182682268355933e+00 */
> > > > -        .quad 0xBFE078C0D77A9D3B  /* A02 = -5.147403915952176722826e-01 */
> > > > -        .quad 0x3FB40D74B3E276B8  /* A03 = +7.833032027925923568290e-02 */
> > > > -        .quad 0x3FA0474BECC689C7  /* A00 = +3.179394975019849550746e-02 */
> > > > -        .quad 0x3FF256FB4FA7D18A  /* A01 = +1.146235762743432307076e+00 */
> > > > -        .quad 0xBFDEFA8E3FB285E2  /* A02 = -4.840427038235174395098e-01 */
> > > > -        .quad 0x3FB270C007493D59  /* A03 = +7.203293016322244446403e-02 */
> > > > -        .quad 0x3FAF5BD51E479BDC  /* A00 = +6.124750132203590768931e-02 */
> > > > -        .quad 0x3FF18081D0B53BC5  /* A01 = +1.093873801484492647162e+00 */
> > > > -        .quad 0xBFDCFE2439BD0C03  /* A02 = -4.530115665294831006626e-01 */
> > > > -        .quad 0x3FB0DEFE5A45AFDD  /* A03 = +6.590261176978580437424e-02 */
> > > > -        .quad 0x3FB7BD5D2806EA26  /* A00 = +9.273321368429118805032e-02 */
> > > > -        .quad 0x3FF0A369E35B4440  /* A01 = +1.039895904647224256223e+00 */
> > > > -        .quad 0xBFDB04BC5C9951E7  /* A02 = -4.221640495573226181669e-01 */
> > > > -        .quad 0x3FAEBBBAA9D6DEEF  /* A03 = +6.002600978120919278380e-02 */
> > > > -        .quad 0x3FC01BE411098DBC  /* A00 = +1.258511622610124502941e-01 */
> > > > -        .quad 0x3FEF85BDABC031C1  /* A01 = +9.850757936961188621083e-01 */
> > > > -        .quad 0xBFD91521375097C2  /* A02 = -3.919146576102968682065e-01 */
> > > > -        .quad 0x3FABE26F0086D982  /* A03 = +5.446192628317005068883e-02 */
> > > > -        .quad 0x3FC481D7FF5776B9  /* A00 = +1.602125164781023347604e-01 */
> > > > -        .quad 0x3FEDC3506C1E7218  /* A01 = +9.300920592973538347792e-01 */
> > > > -        .quad 0xBFD7349A88DA7D4F  /* A02 = -3.625856720409119104964e-01 */
> > > > -        .quad 0x3FA936E2DFF8E2AE  /* A03 = +4.924687370334389358018e-02 */
> > > > -        .quad 0x3FC90471F96FA27A  /* A00 = +1.954481571149420671141e-01 */
> > > > -        .quad 0x3FEC0451601987A2  /* A01 = +8.755270840595026360376e-01 */
> > > > -        .quad 0xBFD5671CD4B898DC  /* A02 = -3.344184949259110251063e-01 */
> > > > -        .quad 0x3FA6BB9594603B67  /* A03 = +4.439990459660841243261e-02 */
> > > > -        .quad 0x3FCFD8ADB9ED944C  /* A00 = +2.488000066615846384011e-01 */
> > > > -        .quad 0x3FE978C073F6809A  /* A01 = +7.959902062321078108909e-01 */
> > > > -        .quad 0xBFD2DF7E00BCD5A9  /* A02 = -2.948908812716931060471e-01 */
> > > > -        .quad 0x3FA3614033D490B2  /* A03 = +3.785133965200894456959e-02 */
> > > > -        .quad 0x3FD4846A12AFE5A0  /* A00 = +3.205819303981005674586e-01 */
> > > > -        .quad 0x3FE63A1147D40472  /* A01 = +6.945883181471244061100e-01 */
> > > > -        .quad 0xBFCFA2268AD34450  /* A02 = -2.471359422548027318101e-01 */
> > > > -        .quad 0x3F9F150201D9FFE0  /* A03 = +3.035357605267552383310e-02 */
> > > > -        .quad 0x3FD9018641F82BEB  /* A00 = +3.907180446846598154131e-01 */
> > > > -        .quad 0x3FE33B7C220FFBDC  /* A01 = +6.010113396913498995389e-01 */
> > > > -        .quad 0xBFCA4E4187E29C86  /* A02 = -2.055131829740483584423e-01 */
> > > > -        .quad 0x3F98C30CED19F8F4  /* A03 = +2.418155858185229434287e-02 */
> > > > -        .quad 0x3FDD4B8255BEB078  /* A00 = +4.577337109901757905561e-01 */
> > > > -        .quad 0x3FE0858B19D3A49B  /* A01 = +5.163016800335243905451e-01 */
> > > > -        .quad 0xBFC5BC929EACE564  /* A02 = -1.698172831327539045176e-01 */
> > > > -        .quad 0x3F93A083CE57DE2B  /* A03 = +1.916700312537337677621e-02 */
> > > > -        .quad 0x3FE0A8E5E039295C  /* A00 = +5.206174258576470315063e-01 */
> > > > -        .quad 0x3FDC35E1234583FE  /* A01 = +4.407885403107342225937e-01 */
> > > > -        .quad 0xBFC1DE034E31AEB9  /* A02 = -1.395877963835710222629e-01 */
> > > > -        .quad 0x3F8EFDEBB3471BDC  /* A03 = +1.513275280821162888101e-02 */
> > > > -        .quad 0x3FE2851B603CB2A5  /* A00 = +5.787484054213406503564e-01 */
> > > > -        .quad 0x3FD7F4A44ABBB286  /* A01 = +3.743067483726821853551e-01 */
> > > > -        .quad 0xBFBD3EEB67087DE7  /* A02 = -1.142413260026767657385e-01 */
> > > > -        .quad 0x3F8864F38329E8BD  /* A03 = +1.191129917173260922836e-02 */
> > > > -        .quad 0x3FE437DBE3C34AC1  /* A00 = +6.318187187665317283702e-01 */
> > > > -        .quad 0x3FD43F6F789441B5  /* A01 = +3.163717916040938438194e-01 */
> > > > -        .quad 0xBFB7D92E7901B9A4  /* A02 = -9.315767721429907277653e-02 */
> > > > -        .quad 0x3F8327ED342308E1  /* A03 = +9.353497651663324544136e-03 */
> > > > -        .quad 0x3FE5C0977766D55C  /* A00 = +6.797597248138731451661e-01 */
> > > > -        .quad 0x3FD10B42A764D8F9  /* A01 = +2.663122782427219115142e-01 */
> > > > -        .quad 0xBFB3633351D3D70F  /* A02 = -7.573242900602060456716e-02 */
> > > > -        .quad 0x3F7E079E30FF899C  /* A03 = +7.331483779099558922843e-03 */
> > > > -        .quad 0x3FE7202CE08A88C4  /* A00 = +7.226776490754436288455e-01 */
> > > > -        .quad 0x3FCC973EB5662B01  /* A01 = +2.233656297433626314319e-01 */
> > > > -        .quad 0xBFAF70A455F9920B  /* A02 = -6.140626477716545211782e-02 */
> > > > -        .quad 0x3F77812411CE99B6  /* A03 = +5.738392731393584730859e-03 */
> > > > -        .quad 0x3FE85879424095B1  /* A00 = +7.608000082006382003286e-01 */
> > > > -        .quad 0x3FC7E73BD1674D84  /* A01 = +1.867441914060742336190e-01 */
> > > > -        .quad 0xBFA96F84E4BF333B  /* A02 = -4.967894832916504993525e-02 */
> > > > -        .quad 0x3F72606DDCA6E117  /* A03 = +4.486493251924870105662e-03 */
> > > > -        .quad 0x3FE96BFE4957F4DD  /* A00 = +7.944327766887472330737e-01 */
> > > > -        .quad 0x3FC3ED4780D25478  /* A01 = +1.556786898624158421711e-01 */
> > > > -        .quad 0xBFA489C5F9A56B58  /* A02 = -4.011362717093075458408e-02 */
> > > > -        .quad 0x3F6CB5DC17E9AD2A  /* A03 = +3.504686231556104931972e-03 */
> > > > -        .quad 0x3FEA5D9CB2F41234  /* A00 = +8.239272589858672724006e-01 */
> > > > -        .quad 0x3FC091A758374DCF  /* A01 = +1.294449978582705440555e-01 */
> > > > -        .quad 0xBFA08E436D4B5CE0  /* A02 = -3.233538350257858517978e-02 */
> > > > -        .quad 0x3F666997AD53E6B7  /* A03 = +2.735897297154145629133e-03 */
> > > > -        .quad 0x3FEB3060342CB850  /* A00 = +8.496552485501158713532e-01 */
> > > > -        .quad 0x3FBB7D30BBC7DC1B  /* A01 = +1.073790033768634993860e-01 */
> > > > -        .quad 0xBF9AA6BA3443D9E3  /* A02 = -2.602663940430173170060e-02 */
> > > > -        .quad 0x3F617CA764B7850B  /* A03 = +2.134634914668814050648e-03 */
> > > > -        .quad 0x3FEBE759A6A0C7B8  /* A00 = +8.719909910635044170135e-01 */
> > > > -        .quad 0x3FB6C10DE6A703FF  /* A01 = +8.888327485239243264115e-02 */
> > > > -        .quad 0xBF956C566D8BE1F6  /* A02 = -2.092108768099084498138e-02 */
> > > > -        .quad 0x3F5B46D1A4A59CF8  /* A03 = +1.664833764687232917079e-03 */
> > > > -        .quad 0x3FEC858494887A04  /* A00 = +8.912985707318630268503e-01 */
> > > > -        .quad 0x3FB2CC31F543394D  /* A01 = +7.342827070099140762682e-02 */
> > > > -        .quad 0xBF9133477FF69137  /* A02 = -1.679717749142747504343e-02 */
> > > > -        .quad 0x3F5544482FBB4DA5  /* A03 = +1.298017973501022466823e-03 */
> > > > -        .quad 0x3FED0DB59D0E32E9  /* A00 = +9.079235141267335551518e-01 */
> > > > -        .quad 0x3FAF006BAFFC6EF4  /* A01 = +6.055008433597022787787e-02 */
> > > > -        .quad 0xBF8B97146FA2B97A  /* A02 = -1.347175565419144252499e-02 */
> > > > -        .quad 0x3F5093B01F4CDC69  /* A03 = +1.011774057770665211434e-03 */
> > > > -        .quad 0x3FEDB487C3EC457C  /* A00 = +9.282873942012623835751e-01 */
> > > > -        .quad 0x3FA7390C09D0BD1D  /* A01 = +4.535710925881118044112e-02 */
> > > > -        .quad 0xBF83D9F7C3181106  /* A02 = -9.693084374710735778846e-03 */
> > > > -        .quad 0x3F46E34A0A3C0E64  /* A03 = +6.984817050299072134500e-04 */
> > > > -        .quad 0x3FEE5FFCB4E6EB00  /* A00 = +9.492171796076434020506e-01 */
> > > > -        .quad 0x3F9F4913ED00AADF  /* A01 = +3.055220731782070861526e-02 */
> > > > -        .quad 0xBF79670BD0E59B5C  /* A02 = -6.201788097633133961528e-03 */
> > > > -        .quad 0x3F3BC998EBCAF96D  /* A03 = +4.240034429975534616304e-04 */
> > > > -        .quad 0x3FEEDBA41E9542FE  /* A00 = +9.643116566968215064293e-01 */
> > > > -        .quad 0x3F94F5DD18D9C24D  /* A01 = +2.046914543319848858727e-02 */
> > > > -        .quad 0xBF7034896AA122B9  /* A02 = -3.956352980886528904192e-03 */
> > > > -        .quad 0x3F30DCCB47810B39  /* A03 = +2.573009765038273091199e-04 */
> > > > -        .quad 0x3FEF33F2882520ED  /* A00 = +9.750912341196716903724e-01 */
> > > > -        .quad 0x3F8BF37F2CF553FF  /* A01 = +1.364802699996836392315e-02 */
> > > > -        .quad 0xBF649F6F05A69619  /* A02 = -2.517430152880317534986e-03 */
> > > > -        .quad 0x3F247623C950AAC9  /* A03 = +1.561087307505231250044e-04 */
> > > > -        .quad 0x3FEF727757751741  /* A00 = +9.827229221489021115943e-01 */
> > > > -        .quad 0x3F828E67912C4400  /* A01 = +9.060677640748693306705e-03 */
> > > > -        .quad 0xBF5A2F51A806CC2C  /* A02 = -1.598195784123355826789e-03 */
> > > > -        .quad 0x3F18D35D7687E613  /* A03 = +9.470231965016282719549e-05 */
> > > > -        .quad 0x3FEF9E6325C5942A  /* A00 = +9.880843866091073568469e-01 */
> > > > -        .quad 0x3F788AB117618F76  /* A01 = +5.991641772286606867914e-03 */
> > > > -        .quad 0xBF5096EAB0B1EA89  /* A02 = -1.012543859160305046233e-03 */
> > > > -        .quad 0x3F0E1E50EC4435AB  /* A03 = +5.744633156910412119652e-05 */
> > > > -        .quad 0x3FEFBD0784049369  /* A00 = +9.918248728250605994461e-01 */
> > > > -        .quad 0x3F702BBD8294035F  /* A01 = +3.947963975634432264028e-03 */
> > > > -        .quad 0xBF44FB55E0F00593  /* A02 = -6.403130845457509273330e-04 */
> > > > -        .quad 0x3F0244DCD723230A  /* A03 = +3.484534217219031730379e-05 */
> > > > -        .quad 0x3FEFD245E2366A43  /* A00 = +9.944180887426415926811e-01 */
> > > > -        .quad 0x3F653D82EC088433  /* A01 = +2.592807490387838333795e-03 */
> > > > -        .quad 0xBF3A7DF75E013CB8  /* A02 = -4.042366908878036561859e-04 */
> > > > -        .quad 0x3EF6298E69F991CD  /* A03 = +2.113564425911141559972e-05 */
> > > > -        .quad 0x3FEFE0EAA508BC69  /* A00 = +9.962056372950317539861e-01 */
> > > > -        .quad 0x3F5BD0771AF3FDDA  /* A01 = +1.697651208644282514598e-03 */
> > > > -        .quad 0xBF30B2E1254DE571  /* A02 = -2.548026725928887099328e-04 */
> > > > -        .quad 0x3EEAE28B70EC0256  /* A03 = +1.281973848454955042307e-05 */
> > > > -        .quad 0x3FEFEAF5303D7F96  /* A00 = +9.974313680831865536192e-01 */
> > > > -        .quad 0x3F5229111365657E  /* A01 = +1.108423877289460134782e-03 */
> > > > -        .quad 0xBF250572D04DFE66  /* A02 = -1.603796628408704519168e-04 */
> > > > -        .quad 0x3EE04E89BB57C981  /* A03 = +7.775682983689149966743e-06 */
> > > > -        .quad 0x3FEFF1CF52F1CF44  /* A00 = +9.982678051005469122003e-01 */
> > > > -        .quad 0x3F47A71316147CEB  /* A01 = +7.218211359577819110842e-04 */
> > > > -        .quad 0xBF1A6D7604055719  /* A02 = -1.008132248946049582547e-04 */
> > > > -        .quad 0x3ED3C8047586A85C  /* A03 = +4.716233739913014633626e-06 */
> > > > -        .quad 0x3FEFF6770369EF69  /* A00 = +9.988360468555416149528e-01 */
> > > > -        .quad 0x3F3EBB261180FBF0  /* A01 = +4.689186039321105101130e-04 */
> > > > -        .quad 0xBF1097754FE19D7F  /* A02 = -6.329206004950480057066e-05 */
> > > > -        .quad 0x3EC7FEFF83BCA0A7  /* A03 = +2.860556404988488738366e-06 */
> > > > -        .quad 0x3FEFF99D42371AC4  /* A00 = +9.992204945818561334647e-01 */
> > > > -        .quad 0x3F33EB2AEC271F59  /* A01 = +3.039340773764907474054e-04 */
> > > > -        .quad 0xBF04CF18E0FC0D79  /* A02 = -3.968996690952969588805e-05 */
> > > > -        .quad 0x3EBD1BDBD6019BE9  /* A03 = +1.735021065507727833886e-06 */
> > > > -        .quad 0x3FEFFBBCA32B0D91  /* A00 = +9.994795977476532700123e-01 */
> > > > -        .quad 0x3F29C41E1615110A  /* A01 = +1.965796209707565346710e-04 */
> > > > -        .quad 0xBEFA11F93D9DCB5A  /* A02 = -2.486248909101414873235e-05 */
> > > > -        .quad 0x3EB1A7CA4546F7A7  /* A03 = +1.052345642723709228769e-06 */
> > > > -        .quad 0x3FEFFD298B8E8DE2  /* A00 = +9.996535993308806045121e-01 */
> > > > -        .quad 0x3F20A1C42D523C5B  /* A01 = +1.268913244172078754520e-04 */
> > > > -        .quad 0xBEF0507A364AFAE4  /* A02 = -1.555859070622834605755e-05 */
> > > > -        .quad 0x3EA56ACA17E7CDF4  /* A03 = +6.382806956848098872313e-07 */
> > > > -        .quad 0x3FEFFE1DC82BA5A3  /* A00 = +9.997700604991915929176e-01 */
> > > > -        .quad 0x3F156E73B90F1769  /* A01 = +8.175450626798714452801e-05 */
> > > > -        .quad 0xBEE4663579D0A09F  /* A02 = -9.727122057226747625365e-06 */
> > > > -        .quad 0x3E99FAF6FEC5D4C1  /* A03 = +3.871371052824002996020e-07 */
> > > > -        .quad 0x3FEFFEF8D0BB5E81  /* A00 = +9.998745037837154514548e-01 */
> > > > -        .quad 0x3F06686DA18D39C3  /* A01 = +4.273972098777251447726e-05 */
> > > > -        .quad 0xBED46BC298073E90  /* A02 = -4.868731025855742842491e-06 */
> > > > -        .quad 0x3E88E42286B9D0FD  /* A03 = +1.854535328530838170114e-07 */
> > > > -        .quad 0x3FEFFF8DBC68DDC7  /* A00 = +9.999455146670975791423e-01 */
> > > > -        .quad 0x3EF26B2953A80AF0  /* A01 = +1.756534514108903368909e-05 */
> > > > -        .quad 0xBEBFC4472D580F83  /* A02 = -1.893443529411295465239e-06 */
> > > > -        .quad 0x3E72505B4553D19F  /* A03 = +6.822456673547912277047e-08 */
> > > > -        .quad 0x3FEFFFCED1276609  /* A00 = +9.999765477215883935358e-01 */
> > > > -        .quad 0x3EDE1A94C7CC58F5  /* A01 = +7.177313020153979672606e-06 */
> > > > -        .quad 0xBEA8A2C988744E57  /* A02 = -7.342066660497443762363e-07 */
> > > > -        .quad 0x3E5AF30036BBBAF4  /* A03 = +2.509841882843541084885e-08 */
> > > > -        .quad 0x3FEFFFEAFE70FCFC  /* A00 = +9.999899835164849370983e-01 */
> > > > -        .quad 0x3EC879175E3549F5  /* A01 = +2.917410471128503564412e-06 */
> > > > -        .quad 0xBE930E36677D1813  /* A02 = -2.839493400307523115929e-07 */
> > > > -        .quad 0x3E43D4005B42D48F  /* A03 = +9.233192745401904898013e-09 */
> > > > -        .quad 0x3ff0000000000000
> > > > -        .quad 0x0000000000000000
> > > > -        .quad 0x0000000000000000
> > > > -        .quad 0x0000000000000000
> > > > -        .align 32
> > > > -        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000           /* _sSignMask        */
> > > > -        .align 32
> > > > -        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff           /* _sAbsMask         */
> > > > -        .align 32
> > > > -        .long 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000           /* _iExpMantMask     */
> > > > -        .align 32
> > > > -        .long 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000           /* _iExpMask         */
> > > > -        .align 32
> > > > -        .long 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000           /* _iMinIdxOfsMask   */
> > > > -        .align 32
> > > > -        .long 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000           /* _iMaxIdxMask      */
> > > > -        .align 32
> > > > -        .type  __svml_stanh_data_internal,@object
> > > > -        .size  __svml_stanh_data_internal,.-__svml_stanh_data_internal
> > > > +       vcvtps2pd %xmm4, %ymm5
> > > > +
> > > > +       vextractf128 $1, %ymm4, %xmm4
> > > > +       vcvtps2pd %xmm4, %ymm4
> > > > +
> > > > +       vmovdqu 16(%rcx, %rax), %xmm2
> > > > +       vinsertf128 $1, 16(%r11, %rax), %ymm2, %ymm2
> > > > +
> > > > +       vfmadd213pd %ymm3, %ymm5, %ymm1
> > > > +
> > > > +       vmovupd 16(%rdx, %rax), %xmm3
> > > > +       vinsertf128 $1, 16(%r10, %rax), %ymm3, %ymm3
> > > > +
> > > > +       vunpcklpd %ymm3, %ymm2, %ymm10
> > > > +       vunpckhpd %ymm3, %ymm2, %ymm2
> > > > +
> > > > +       vfmadd213pd %ymm10, %ymm4, %ymm2
> > > > +       vfmadd213pd %ymm6, %ymm4, %ymm2
> > > > +       vfmadd213pd %ymm7, %ymm4, %ymm2
> > > > +       vcvtpd2ps %ymm2, %xmm2
> > > > +
> > > > +       vmovdqu (%r9, %rax), %xmm7
> > > > +       vinsertf128 $1, (%rdi, %rax), %ymm7, %ymm7
> > > > +
> > > > +       vmovupd (%r8, %rax), %xmm3
> > > > +       vinsertf128 $1, (%rsi, %rax), %ymm3, %ymm3
> > > > +
> > > > +       vunpckhpd %ymm3, %ymm7, %ymm4
> > > > +       vunpcklpd %ymm3, %ymm7, %ymm7
> > > > +
> > > > +       vfmadd213pd %ymm4, %ymm5, %ymm1
> > > > +       vfmadd213pd %ymm7, %ymm5, %ymm1
> > > > +
> > > > +
> > > > +       vcvtpd2ps %ymm1, %xmm1
> > > > +       vinsertf128 $1, %xmm2, %ymm1, %ymm1
> > > > +
> > > > +       vmovmskps %ymm15, %edx
> > > > +       vandnps %ymm0, %ymm11, %ymm2
> > > > +       testl   %edx, %edx
> > > > +       /* Go to special inputs processing branch.  */
> > > > +       jne     L(SPECIAL_VALUES_BRANCH)
> > > > +       /* Wait until after the branch to overwrite ymm0.  */
> > > > +       vorps   %ymm2, %ymm1, %ymm0
> > > > +       /* No stack restoration on the fastpath.  */
> > > > +       ret
> > > > +
> > > > +
> > > > +L(SPECIAL_VALUES_BRANCH):
> > > > +       pushq   %rbp
> > > > +       /* Need to callee save registers to preserve state across tanhf calls.
> > > > +        */
> > > > +       pushq   %r12
> > > > +       pushq   %r13
> > > > +       movq    %rsp, %rbp
> > > > +
> > > > +       /* Align stack and make room for 2x ymm vectors.  */
> > > > +       andq    $-32, %rsp
> > > > +       addq    $-64, %rsp
> > > > +
> > > > +       /* Save all already computed inputs.  */
> > > > +       vorps   %ymm2, %ymm1, %ymm1
> > > > +       vmovups %ymm1, (%rsp)
> > > > +       /* Save original input (ymm0 unchanged up to this point).  */
> > > > +       vmovups %ymm0, 32(%rsp)
> > > > +
> > > > +       vzeroupper
> > > > +
> > > > +       /* edx has 1s where there was a special value that needs to be handled
> > > > +          by a tanhf call.  */
> > > > +       movl    %edx, %r13d
> > > > +L(SPECIAL_VALUES_LOOP):
> > > > +       /* use r12 as index for special value that is saved across calls to
> > > > +          tanhf. We technically don't need a callee save register here as offset
> > > > +          to rsp is always [0, 28] so we can restore rsp by realigning to 64.
> > > > +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> > > > +          in the loop.  */
> > > > +       xorl    %r12d, %r12d
> > > > +       tzcntl  %r13d, %r12d
> > > > +
> > > > +       /* Scalar math function call to process special input.  */
> > > > +       movss   32(%rsp, %r12, 4), %xmm0
> > > > +       call    tanhf@PLT
> > > > +       /* No good way to avoid the store-forwarding fault this will cause on
> > > > +          return. `lfence` avoids the SF fault but at greater cost as it
> > > > +          serializes stack/callee save restoration.  */
> > > > +       movss   %xmm0, (%rsp, %r12, 4)
> > > > +
> > > > +       blsr    %r13d, %r13d
> > > > +       jnz     L(SPECIAL_VALUES_LOOP)
> > > > +
> > > > +       /* All results have been written to (%rsp).  */
> > > > +       vmovups (%rsp), %ymm0
> > > > +       movq    %rbp, %rsp
> > > > +       popq    %r13
> > > > +       popq    %r12
> > > > +       popq    %rbp
> > > > +       ret
> > > > +END(_ZGVdN8v_tanhf_avx2)
> > > > --
> > > > 2.25.1
> > > >
Sunil Pandey Feb. 2, 2022, 12:41 a.m. UTC | #5
On Tue, Feb 1, 2022 at 1:54 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Tue, Feb 1, 2022 at 3:29 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >
> > Looking into v2, it is still big, with all optimizations applied at
> > the same time.
> >
> > >Optimizations are:
> > >    1. Reduce code size (-70 bytes).
> > >    2. Reduce rodata size (-32 bytes).
> > >    3. Remove register save/restores and stack adjustment from the
> >        fast path.
> > >    4. Slightly better instruction selection where possible.
> > >    5. Remove redundant registers moves.
> > >    6. Prefer registers that get smaller instruction encodings.
> >
> > Can you please further split the patch according to optimization, one
> > optimization at a time per patch.
>
> I don't think the changes are independent enough from one another to
> do that cleanly. As well I think 1 patch/file is within the norm.
>

Can you please split it into independent patches, one per optimization?
It will help us review/evaluate/test the patches.

> >
> > On Tue, Feb 1, 2022 at 12:20 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > On Tue, Feb 1, 2022 at 2:03 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> > > >
> > > > Hi Noah,
> > > >
> > > > We would like to get this patch, but it's too late for 2.35.
> > > >
> > > > This patch is too big, can you please break this patch into multiple
> > > > smaller patches?
> > >
> > > Yeah, I'll split by file.
> > > >
> > > > Also, it seems like this patch is incomplete. I got a build error on
> > > > the glibc master.
> > >
> > > My fault, I separated the rodata for avx2/sse2 into a single file
> > > so that the two implementations could share the lookup table.
> > >
> > > Forgot to commit it :/
> > >
> > > Will fix in V2.
> > > >
> > > > ./sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S:77:33: fatal
> > > > error: svml_s_tanhf_rodata.S: No such file or directory
> > > >  #include "svml_s_tanhf_rodata.S"
> > > >                                  ^
> > > > compilation terminated.
> > > > ../sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S:74:33: fatal
> > > > error: svml_s_tanhf_rodata.S: No such file or directory
> > > >  #include "svml_s_tanhf_rodata.S"
> > > >                                  ^
> > > > compilation terminated.
> > > >
> > > > Thanks,
> > > > Sunil
> > > >
> > > >
> > > >
> > > >
> > > >
> > > >
> > > > On Sat, Jan 29, 2022 at 8:37 PM Noah Goldstein via Libc-alpha
> > > > <libc-alpha@sourceware.org> wrote:
> > > > >
> > > > > No bug.
> > > > >
> > > > > Optimizations are:
> > > > >     1. Reduce code size
> > > > >         avx512: -56 bytes
> > > > >         avx2:   -70 bytes
> > > > >         sse4:   -106 bytes
> > > > >     2. Reduce rodata size
> > > > >         avx512: -448 bytes
> > > > >         avx2:   -32 bytes
> > > > >         sse4:   -4k+ (shares rodata with avx2)
> > > > >     3. Remove register save/restores and stack adjustment from the
> > > > >        fast path.
> > > > >     4. Slightly better instruction selection where possible.
> > > > >
> > > > > This results in roughly a 15% performance improvement for all
> > > > > functions.
> > > > >
> > > > > Results from geomean of 40 benchtest runs:
> > > > >        Function, New Time, Old Time, New / Old
> > > > >  _ZGVbN4v_tanhf,     3.28,    3.852,     0.852
> > > > >  _ZGVcN8v_tanhf,    3.556,    4.192,     0.848
> > > > >  _ZGVdN8v_tanhf,     2.13,    2.486,     0.857
> > > > > _ZGVeN16v_tanhf,    0.658,    0.762,     0.864
> > > > > ---
> > > > >  .../multiarch/svml_s_tanhf16_core_avx512.S    | 585 +++++------
> > > > >  .../fpu/multiarch/svml_s_tanhf4_core_sse4.S   | 871 +++--------------
> > > > >  .../fpu/multiarch/svml_s_tanhf8_core_avx2.S   | 908 +++---------------
> > > > >  3 files changed, 581 insertions(+), 1783 deletions(-)
> > > > >
> > > > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > > > index 8954a5f658..6a2f0c1392 100644
> > > > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > > > @@ -70,312 +70,323 @@
> > > > >   *
> > > > >   */
> > > > >
> > > > > -/* Offsets for data table __svml_stanh_data_internal
> > > > > - */
> > > > > -#define _sC                            0
> > > > > -#define _sP0                           128
> > > > > -#define _sP2                           256
> > > > > -#define _sP3                           384
> > > > > -#define _sP4                           512
> > > > > -#define _sP5                           640
> > > > > -#define _sP6                           768
> > > > > -#define _sP7                           896
> > > > > -#define _iExpMantMask_UISA             1024
> > > > > -#define _iMinIdxOfsMask_UISA           1088
> > > > > -#define _iMaxIdxMask_UISA              1152
> > > > > -#define _sSignMask                     1216
> > > > > -#define _sAbsMask                      1280
> > > > > -#define _iExpMantMask                  1344
> > > > > -#define _iExpMask                      1408
> > > > > -#define _iMinIdxOfsMask                1472
> > > > > -#define _iMaxIdxMask                   1536
> > > > > -
> > > > >  #include <sysdep.h>
> > > > >
> > > > > +#define TANHF_DATA(offset)     ((offset) + __svml_stanh_data_internal)
> > > > > +
> > > > > +/* Offsets for data table __svml_stanh_data_internal.  */
> > > > > +#define _iExpMantMask_UISA     0
> > > > > +#define _iMinIdxOfsMask_UISA   4
> > > > > +#define _iMaxIdxMask_UISA      8
> > > > > +#define _iExpMask      12
> > > > > +#define _sSignMask     64
> > > > > +#define _sC_lo 128
> > > > > +#define _sC_hi 192
> > > > > +#define _sP7_lo        256
> > > > > +#define _sP7_hi        320
> > > > > +#define _sP6_lo        384
> > > > > +#define _sP6_hi        448
> > > > > +#define _sP5_lo        512
> > > > > +#define _sP5_hi        576
> > > > > +#define _sP4_lo        640
> > > > > +#define _sP4_hi        704
> > > > > +#define _sP3_lo        768
> > > > > +#define _sP3_hi        832
> > > > > +#define _sP2_lo        896
> > > > > +#define _sP2_hi        960
> > > > > +#define _sP0_lo        1024
> > > > > +#define _sP0_hi        1088
> > > > > +
> > > > >          .text
> > > > >         .section .text.exex512,"ax",@progbits
> > > > >  ENTRY(_ZGVeN16v_tanhf_skx)
> > > > > -        pushq     %rbp
> > > > > -        cfi_def_cfa_offset(16)
> > > > > -        movq      %rsp, %rbp
> > > > > -        cfi_def_cfa(6, 16)
> > > > > -        cfi_offset(6, -16)
> > > > > -        andq      $-64, %rsp
> > > > > -        subq      $192, %rsp
> > > > > -        vmovaps   %zmm0, %zmm1
> > > > > -        vmovups   __svml_stanh_data_internal(%rip), %zmm9
> > > > > -        vmovups   _sP6+__svml_stanh_data_internal(%rip), %zmm11
> > > > > -        vmovups   _sP5+__svml_stanh_data_internal(%rip), %zmm12
> > > > > -        vmovups   _sP4+__svml_stanh_data_internal(%rip), %zmm13
> > > > > -        vmovups   _sP3+__svml_stanh_data_internal(%rip), %zmm14
> > > > > -        vmovups   _sP2+__svml_stanh_data_internal(%rip), %zmm15
> > > > > -        vpternlogd $255, %zmm2, %zmm2, %zmm2
> > > > > -        vandps    _sAbsMask+__svml_stanh_data_internal(%rip), %zmm1, %zmm8
> > > > > -        vandps    _sSignMask+__svml_stanh_data_internal(%rip), %zmm1, %zmm0
> > > > > -
> > > > > -/* Here huge arguments, INF and NaNs are filtered out to callout. */
> > > > > -        vpandd    _iExpMantMask_UISA+__svml_stanh_data_internal(%rip), %zmm1, %zmm3
> > > > > -        vpsubd    _iMinIdxOfsMask_UISA+__svml_stanh_data_internal(%rip), %zmm3, %zmm4
> > > > > -        vpcmpd    $2, _iExpMask+__svml_stanh_data_internal(%rip), %zmm3, %k1
> > > > > +       /* Here huge arguments, INF and NaNs are filtered out to callout.  */
> > > > > +       vpandd  TANHF_DATA(_iExpMantMask_UISA)(%rip) {1to16}, %zmm0, %zmm1
> > > > > +       vpsubd  TANHF_DATA(_iMinIdxOfsMask_UISA)(%rip) {1to16}, %zmm1, %zmm2
> > > > >
> > > > > -/*
> > > > > - *  small table specific variables *
> > > > > - *  Constant loading
> > > > > - */
> > > > > -        vpxord    %zmm5, %zmm5, %zmm5
> > > > > -
> > > > > -/* if VMIN, VMAX is defined for I type */
> > > > > -        vpmaxsd   %zmm5, %zmm4, %zmm6
> > > > > -        vpminsd   _iMaxIdxMask_UISA+__svml_stanh_data_internal(%rip), %zmm6, %zmm7
> > > > > -        vpsrld    $21, %zmm7, %zmm10
> > > > > -        vmovups   _sP7+__svml_stanh_data_internal(%rip), %zmm4
> > > > > -        vpermt2ps _sC+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm9
> > > > > -        vpermt2ps _sP6+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm11
> > > > > -        vpermt2ps _sP7+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm4
> > > > > -        vpermt2ps _sP5+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm12
> > > > > -        vpermt2ps _sP4+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm13
> > > > > -        vpermt2ps _sP3+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm14
> > > > > -        vpermt2ps _sP2+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm15
> > > > > -        vpandnd   %zmm3, %zmm3, %zmm2{%k1}
> > > > > -        vptestmd  %zmm2, %zmm2, %k0
> > > > > -        vmovups   _sP0+__svml_stanh_data_internal(%rip), %zmm3
> > > > > -        vsubps    {rn-sae}, %zmm9, %zmm8, %zmm2
> > > > > -        kmovw     %k0, %edx
> > > > > -        vfmadd213ps {rn-sae}, %zmm11, %zmm2, %zmm4
> > > > > -        vpermt2ps _sP0+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm3
> > > > > -        vfmadd213ps {rn-sae}, %zmm12, %zmm2, %zmm4
> > > > > -        vfmadd213ps {rn-sae}, %zmm13, %zmm2, %zmm4
> > > > > -        vfmadd213ps {rn-sae}, %zmm14, %zmm2, %zmm4
> > > > > -        vfmadd213ps {rn-sae}, %zmm15, %zmm2, %zmm4
> > > > > -        vfmadd213ps {rn-sae}, %zmm3, %zmm2, %zmm4
> > > > > -        vorps     %zmm0, %zmm4, %zmm0
> > > > > -        testl     %edx, %edx
> > > > > -
> > > > > -/* Go to special inputs processing branch */
> > > > > -        jne       L(SPECIAL_VALUES_BRANCH)
> > > > > -                                # LOE rbx r12 r13 r14 r15 edx zmm0 zmm1
> > > > > -
> > > > > -/* Restore registers
> > > > > - * and exit the function
> > > > > - */
> > > > > +       /* Clamp arguments to the range [0, 0x03e00000] and store in zmm3.  */
> > > > > +       vpxord  %zmm3, %zmm3, %zmm3
> > > > > +       vpmaxsd %zmm3, %zmm2, %zmm3
> > > > > +       vpminsd TANHF_DATA(_iMaxIdxMask_UISA)(%rip) {1to16}, %zmm3, %zmm3
> > > > >
> > > > > -L(EXIT):
> > > > > -        movq      %rbp, %rsp
> > > > > -        popq      %rbp
> > > > > -        cfi_def_cfa(7, 8)
> > > > > -        cfi_restore(6)
> > > > > -        ret
> > > > > -        cfi_def_cfa(6, 16)
> > > > > -        cfi_offset(6, -16)
> > > > > -
> > > > > -/* Branch to process
> > > > > - * special inputs
> > > > > - */
> > > > > +       /* Setup permute indices in zmm3.  */
> > > > > +       vpsrld  $21, %zmm3, %zmm3
> > > > > +
> > > > > +       /* Store if there are any special cases in k1.  */
> > > > > +       vpcmpd  $6, TANHF_DATA(_iExpMask)(%rip) {1to16}, %zmm1, %k1
> > > > > +
> > > > > +
> > > > > +       /* Store absolute values of inputs in zmm1.  */
> > > > > +       vmovaps TANHF_DATA(_sSignMask)(%rip), %zmm4
> > > > > +       vandnps %zmm0, %zmm4, %zmm1
> > > > > +
> > > > > +       vmovaps TANHF_DATA(_sC_lo)(%rip), %zmm5
> > > > > +       vpermt2ps TANHF_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> > > > > +       vsubps  {rn-sae}, %zmm5, %zmm1, %zmm1
> > > > > +
> > > > > +       vmovaps TANHF_DATA(_sP7_lo)(%rip), %zmm2
> > > > > +       vpermt2ps TANHF_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
> > > > >
> > > > > +       vmovaps TANHF_DATA(_sP6_lo)(%rip), %zmm5
> > > > > +       vpermt2ps TANHF_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> > > > > +
> > > > > +       vmovaps TANHF_DATA(_sP5_lo)(%rip), %zmm6
> > > > > +       vpermt2ps TANHF_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> > > > > +
> > > > > +       vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm2
> > > > > +       vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm2
> > > > > +
> > > > > +       vmovaps TANHF_DATA(_sP4_lo)(%rip), %zmm7
> > > > > +       vpermt2ps TANHF_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> > > > > +
> > > > > +       vmovaps TANHF_DATA(_sP3_lo)(%rip), %zmm8
> > > > > +       vpermt2ps TANHF_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> > > > > +
> > > > > +       vfmadd213ps {rn-sae}, %zmm7, %zmm1, %zmm2
> > > > > +       vfmadd213ps {rn-sae}, %zmm8, %zmm1, %zmm2
> > > > > +
> > > > > +       vmovaps TANHF_DATA(_sP2_lo)(%rip), %zmm9
> > > > > +       vpermt2ps TANHF_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> > > > > +
> > > > > +       vmovaps TANHF_DATA(_sP0_lo)(%rip), %zmm10
> > > > > +       vpermt2ps TANHF_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> > > > > +
> > > > > +       vfmadd213ps {rn-sae}, %zmm9, %zmm1, %zmm2
> > > > > +       vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm2
> > > > > +
> > > > > +       kmovw   %k1, %edx
> > > > > +       testl   %edx, %edx
> > > > > +
> > > > > +       /* Go to special inputs processing branch.  */
> > > > > +       jne     L(SPECIAL_VALUES_BRANCH)
> > > > > +       /* Wait until after the branch to write over zmm0.  */
> > > > > +       vpternlogd $0xec, %zmm4, %zmm2, %zmm0
> > > > > +
> > > > > +       /* No stack restoration on the fastpath.  */
> > > > > +       ret
> > > > > +
> > > > > +       /* Branch to process special inputs.  */
> > > > >  L(SPECIAL_VALUES_BRANCH):
> > > > > -        vmovups   %zmm1, 64(%rsp)
> > > > > -        vmovups   %zmm0, 128(%rsp)
> > > > > -                                # LOE rbx r12 r13 r14 r15 edx zmm0
> > > > > -
> > > > > -        xorl      %eax, %eax
> > > > > -                                # LOE rbx r12 r13 r14 r15 eax edx
> > > > > -
> > > > > -        vzeroupper
> > > > > -        movq      %r12, 16(%rsp)
> > > > > -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> > > > > -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> > > > > -        movl      %eax, %r12d
> > > > > -        movq      %r13, 8(%rsp)
> > > > > -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> > > > > -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> > > > > -        movl      %edx, %r13d
> > > > > -        movq      %r14, (%rsp)
> > > > > -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> > > > > -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> > > > > -                                # LOE rbx r15 r12d r13d
> > > > > -
> > > > > -/* Range mask
> > > > > - * bits check
> > > > > - */
> > > > > +       pushq   %rbp
> > > > > +       /* Need callee-saved registers to preserve state across tanhf calls.
> > > > > +        */
> > > > > +       pushq   %r13
> > > > > +       pushq   %r12
> > > > > +       movq    %rsp, %rbp
> > > > >
> > > > > -L(RANGEMASK_CHECK):
> > > > > -        btl       %r12d, %r13d
> > > > > +       /* Align stack and make room for 2x zmm vectors.  */
> > > > > +       andq    $-64, %rsp
> > > > > +       addq    $-128, %rsp
> > > > >
> > > > > -/* Call scalar math function */
> > > > > -        jc        L(SCALAR_MATH_CALL)
> > > > > -                                # LOE rbx r15 r12d r13d
> > > > > +       /* Save all already computed results.  */
> > > > > +       vpternlogd $0xec, %zmm4, %zmm2, %zmm2
> > > > > +       vmovaps %zmm2, (%rsp)
> > > > > +       /* Save original input (zmm0 unchanged up to this point).  */
> > > > > +       vmovaps %zmm0, 64(%rsp)
> > > > >
> > > > > -/* Special inputs
> > > > > - * processing loop
> > > > > - */
> > > > > +       vzeroupper
> > > > >
> > > > > +       /* edx has 1s where there was a special value that needs to be handled
> > > > > +          by a tanhf call.  */
> > > > > +       movl    %edx, %r13d
> > > > >  L(SPECIAL_VALUES_LOOP):
> > > > > -        incl      %r12d
> > > > > -        cmpl      $16, %r12d
> > > > > -
> > > > > -/* Check bits in range mask */
> > > > > -        jl        L(RANGEMASK_CHECK)
> > > > > -                                # LOE rbx r15 r12d r13d
> > > > > -
> > > > > -        movq      16(%rsp), %r12
> > > > > -        cfi_restore(12)
> > > > > -        movq      8(%rsp), %r13
> > > > > -        cfi_restore(13)
> > > > > -        movq      (%rsp), %r14
> > > > > -        cfi_restore(14)
> > > > > -        vmovups   128(%rsp), %zmm0
> > > > > -
> > > > > -/* Go to exit */
> > > > > -        jmp       L(EXIT)
> > > > > -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
> > > > > -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
> > > > > -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
> > > > > -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
> > > > > -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
> > > > > -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
> > > > > -                                # LOE rbx r12 r13 r14 r15 zmm0
> > > > > -
> > > > > -/* Scalar math fucntion call
> > > > > - * to process special input
> > > > > - */
> > > > > +       /* Use r12 as index for special value that is saved across calls to
> > > > > +          tanhf. We technically don't need a callee save register here as offset
> > > > > +          to rsp is always [0, 60] so we can restore rsp by realigning to 64.
> > > > > +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> > > > > +          in the loop.  */
> > > > > +       xorl    %r12d, %r12d
> > > > > +       tzcntl  %r13d, %r12d
> > > > >
> > > > > -L(SCALAR_MATH_CALL):
> > > > > -        movl      %r12d, %r14d
> > > > > -        movss     64(%rsp,%r14,4), %xmm0
> > > > > -        call      tanhf@PLT
> > > > > -                                # LOE rbx r14 r15 r12d r13d xmm0
> > > > > +       /* Scalar math function call to process special input.  */
> > > > > +       movss   64(%rsp, %r12, 4), %xmm0
> > > > > +       call    tanhf@PLT
> > > > >
> > > > > -        movss     %xmm0, 128(%rsp,%r14,4)
> > > > > +       /* No good way to avoid the store-forwarding fault this will cause on
> > > > > +          return. `lfence` avoids the SF fault but at greater cost as it
> > > > > +          serializes stack/callee save restoration.  */
> > > > > +       movss   %xmm0, (%rsp, %r12, 4)
> > > > >
> > > > > -/* Process special inputs in loop */
> > > > > -        jmp       L(SPECIAL_VALUES_LOOP)
> > > > > -                                # LOE rbx r15 r12d r13d
> > > > > -END(_ZGVeN16v_tanhf_skx)
> > > > > +       blsr    %r13d, %r13d
> > > > > +       jnz     L(SPECIAL_VALUES_LOOP)
> > > > >
> > > > > -        .section .rodata, "a"
> > > > > -        .align 64
> > > > > +       /* All results have been written to (%rsp).  */
> > > > > +       vmovaps (%rsp), %zmm0
> > > > > +       /* Restore rsp.  */
> > > > > +       movq    %rbp, %rsp
> > > > > +       /* Restore callee save registers.  */
> > > > > +       popq    %r12
> > > > > +       popq    %r13
> > > > > +       popq    %rbp
> > > > > +       ret
> > > > > +END(_ZGVeN16v_tanhf_skx)
> > > > >
> > > > > +       .section .rodata, "a"
> > > > > +       .align  16
> > > > >  #ifdef __svml_stanh_data_internal_typedef
> > > > > -typedef unsigned int VUINT32;
> > > > > -typedef struct
> > > > > -{
> > > > > -        __declspec(align(64)) VUINT32 _sC[32][1];
> > > > > -        __declspec(align(64)) VUINT32 _sP0[32][1];
> > > > > -        __declspec(align(64)) VUINT32 _sP2[32][1];
> > > > > -        __declspec(align(64)) VUINT32 _sP3[32][1];
> > > > > -        __declspec(align(64)) VUINT32 _sP4[32][1];
> > > > > -        __declspec(align(64)) VUINT32 _sP5[32][1];
> > > > > -        __declspec(align(64)) VUINT32 _sP6[32][1];
> > > > > -        __declspec(align(64)) VUINT32 _sP7[32][1];
> > > > > -        __declspec(align(64)) VUINT32 _iExpMantMask_UISA[16][1];
> > > > > -        __declspec(align(64)) VUINT32 _iMinIdxOfsMask_UISA[16][1];
> > > > > -        __declspec(align(64)) VUINT32 _iMaxIdxMask_UISA[16][1];
> > > > > -        __declspec(align(64)) VUINT32 _sSignMask[16][1];
> > > > > -        __declspec(align(64)) VUINT32 _sAbsMask[16][1];
> > > > > -        __declspec(align(64)) VUINT32 _iExpMantMask[16][1];
> > > > > -        __declspec(align(64)) VUINT32 _iExpMask[16][1];
> > > > > -        __declspec(align(64)) VUINT32 _iMinIdxOfsMask[16][1];
> > > > > -        __declspec(align(64)) VUINT32 _iMaxIdxMask[16][1];
> > > > > -} __svml_stanh_data_internal;
> > > > > +       typedef unsigned int VUINT32;
> > > > > +       typedef struct
> > > > > +       {
> > > > > +       __declspec (align(4))VUINT32 _iExpMantMask_UISA[1][1];
> > > > > +       __declspec (align(4))VUINT32 _iMinIdxOfsMask_UISA[1][1];
> > > > > +       __declspec (align(4))VUINT32 _iMaxIdxMask_UISA[1][1];
> > > > > +       __declspec (align(4))VUINT32 _iExpMask[1][1];
> > > > > +       __declspec (align(64))VUINT32 _sSignMask[16][1];
> > > > > +       __declspec (align(64))VUINT32 _sC_lo[16][1];
> > > > > +       __declspec (align(64))VUINT32 _sC_hi[16][1];
> > > > > +       __declspec (align(64))VUINT32 _sP7_lo[16][1];
> > > > > +       __declspec (align(64))VUINT32 _sP7_hi[16][1];
> > > > > +       __declspec (align(64))VUINT32 _sP6_lo[16][1];
> > > > > +       __declspec (align(64))VUINT32 _sP6_hi[16][1];
> > > > > +       __declspec (align(64))VUINT32 _sP5_lo[16][1];
> > > > > +       __declspec (align(64))VUINT32 _sP5_hi[16][1];
> > > > > +       __declspec (align(64))VUINT32 _sP4_lo[16][1];
> > > > > +       __declspec (align(64))VUINT32 _sP4_hi[16][1];
> > > > > +       __declspec (align(64))VUINT32 _sP3_lo[16][1];
> > > > > +       __declspec (align(64))VUINT32 _sP3_hi[16][1];
> > > > > +       __declspec (align(64))VUINT32 _sP2_lo[16][1];
> > > > > +       __declspec (align(64))VUINT32 _sP2_hi[16][1];
> > > > > +       __declspec (align(64))VUINT32 _sP0_lo[16][1];
> > > > > +       __declspec (align(64))VUINT32 _sP0_hi[16][1];
> > > > > +       }__svml_stanh_data_internal;
> > > > >  #endif
> > > > > +
> > > > >  __svml_stanh_data_internal:
> > > > > -        /*== _sC ==*/
> > > > > -        .long 0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
> > > > > -        .long 0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
> > > > > -        .long 0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
> > > > > -        .long 0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
> > > > > -        .long 0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
> > > > > -        .long 0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
> > > > > -        .long 0x40500000, 0x40700000, 0x40900000, 0x40b00000
> > > > > -        .long 0x40d00000, 0x40f00000, 0x41100000, 0x00000000
> > > > > -        /*== p0 ==*/
> > > > > -        .align 64
> > > > > -        .long 0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
> > > > > -        .long 0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
> > > > > -        .long 0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
> > > > > -        .long 0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
> > > > > -        .long 0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
> > > > > -        .long 0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
> > > > > -        .long 0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
> > > > > -        .long 0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
> > > > > -        /*== p2 ==*/
> > > > > -        .align 64
> > > > > -        .long 0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
> > > > > -        .long 0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
> > > > > -        .long 0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
> > > > > -        .long 0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
> > > > > -        .long 0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
> > > > > -        .long 0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
> > > > > -        .long 0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
> > > > > -        .long 0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
> > > > > -        /*== p3 ==*/
> > > > > -        .align 64
> > > > > -        .long 0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
> > > > > -        .long 0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
> > > > > -        .long 0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
> > > > > -        .long 0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
> > > > > -        .long 0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
> > > > > -        .long 0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
> > > > > -        .long 0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
> > > > > -        .long 0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
> > > > > -        /*== p4 ==*/
> > > > > -        .align 64
> > > > > -        .long 0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
> > > > > -        .long 0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
> > > > > -        .long 0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
> > > > > -        .long 0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
> > > > > -        .long 0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
> > > > > -        .long 0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
> > > > > -        .long 0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
> > > > > -        .long 0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
> > > > > -        /*== p5 ==*/
> > > > > -        .align 64
> > > > > -        .long 0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
> > > > > -        .long 0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
> > > > > -        .long 0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
> > > > > -        .long 0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
> > > > > -        .long 0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
> > > > > -        .long 0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
> > > > > -        .long 0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
> > > > > -        .long 0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
> > > > > -        /*== p6 ==*/
> > > > > -        .align 64
> > > > > -        .long 0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
> > > > > -        .long 0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
> > > > > -        .long 0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
> > > > > -        .long 0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
> > > > > -        .long 0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
> > > > > -        .long 0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
> > > > > -        .long 0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
> > > > > -        .long 0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
> > > > > -        /*== p7 ==*/
> > > > > -        .align 64
> > > > > -        .long 0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
> > > > > -        .long 0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
> > > > > -        .long 0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
> > > > > -        .long 0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
> > > > > -        .long 0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
> > > > > -        .long 0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
> > > > > -        .long 0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
> > > > > -        .long 0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
> > > > > -        .align 64
> > > > > -        .long 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000           /* _iExpMantMask_UISA     */
> > > > > -        .align 64
> > > > > -        .long 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000           /* _iMinIdxOfsMask_UISA   */
> > > > > -        .align 64
> > > > > -        .long 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000           /* _iMaxIdxMask_UISA      */
> > > > > -        .align 64
> > > > > -        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000           /* _sSignMask        */
> > > > > -        .align 64
> > > > > -        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff           /* _sAbsMask         */
> > > > > -        .align 64
> > > > > -        .long 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000           /* _iExpMantMask     */
> > > > > -        .align 64
> > > > > -        .long 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000           /* _iExpMask         */
> > > > > -        .align 64
> > > > > -        .long 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000           /* _iMinIdxOfsMask   */
> > > > > -        .align 64
> > > > > -        .long 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000           /* _iMaxIdxMask      */
> > > > > -        .align 64
> > > > > -        .type  __svml_stanh_data_internal,@object
> > > > > -        .size  __svml_stanh_data_internal,.-__svml_stanh_data_internal
> > > > > +       .align  4
> > > > > +       /* _iExpMantMask_UISA.  */
> > > > > +       .long   0x7fe00000
> > > > > +
> > > > > +       .align  4
> > > > > +       /* _iMinIdxOfsMask_UISA.  */
> > > > > +       .long   0x3d400000
> > > > > +
> > > > > +       .align  4
> > > > > +       /* _iMaxIdxMask_UISA.  */
> > > > > +       .long   0x03e00000
> > > > > +
> > > > > +       .align  4
> > > > > +       /* _iExpMask.  */
> > > > > +       .long   0x7f000000
> > > > > +
> > > > > +       .align  64
> > > > > +       /* _sSignMask.  */
> > > > > +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > > +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > > +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > > +       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > > +
> > > > > +       .align  64
> > > > > +       /* _sC_lo.  */
> > > > > +       .long   0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
> > > > > +       .long   0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
> > > > > +       .long   0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
> > > > > +       .long   0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
> > > > > +
> > > > > +       .align  64
> > > > > +       /* _sC_hi.  */
> > > > > +       .long   0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
> > > > > +       .long   0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
> > > > > +       .long   0x40500000, 0x40700000, 0x40900000, 0x40b00000
> > > > > +       .long   0x40d00000, 0x40f00000, 0x41100000, 0x00000000
> > > > > +
> > > > > +       .align  64
> > > > > +       /* _sP7_lo.  */
> > > > > +       .long   0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
> > > > > +       .long   0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
> > > > > +       .long   0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
> > > > > +       .long   0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
> > > > > +
> > > > > +       .align  64
> > > > > +       /* _sP7_hi.  */
> > > > > +       .long   0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
> > > > > +       .long   0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
> > > > > +       .long   0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
> > > > > +       .long   0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
> > > > > +
> > > > > +       .align  64
> > > > > +       /* _sP6_lo.  */
> > > > > +       .long   0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
> > > > > +       .long   0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
> > > > > +       .long   0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
> > > > > +       .long   0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
> > > > > +
> > > > > +       .align  64
> > > > > +       /* _sP6_hi.  */
> > > > > +       .long   0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
> > > > > +       .long   0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
> > > > > +       .long   0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
> > > > > +       .long   0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
> > > > > +
> > > > > +       .align  64
> > > > > +       /* _sP5_lo.  */
> > > > > +       .long   0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
> > > > > +       .long   0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
> > > > > +       .long   0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
> > > > > +       .long   0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
> > > > > +
> > > > > +       .align  64
> > > > > +       /* _sP5_hi.  */
> > > > > +       .long   0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
> > > > > +       .long   0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
> > > > > +       .long   0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
> > > > > +       .long   0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
> > > > > +
> > > > > +       .align  64
> > > > > +       /* _sP4_lo.  */
> > > > > +       .long   0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
> > > > > +       .long   0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
> > > > > +       .long   0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
> > > > > +       .long   0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
> > > > > +
> > > > > +       .align  64
> > > > > +       /* _sP4_hi.  */
> > > > > +       .long   0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
> > > > > +       .long   0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
> > > > > +       .long   0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
> > > > > +       .long   0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
> > > > > +
> > > > > +       .align  64
> > > > > +       /* _sP3_lo.  */
> > > > > +       .long   0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
> > > > > +       .long   0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
> > > > > +       .long   0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
> > > > > +       .long   0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
> > > > > +
> > > > > +       .align  64
> > > > > +       /* _sP3_hi.  */
> > > > > +       .long   0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
> > > > > +       .long   0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
> > > > > +       .long   0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
> > > > > +       .long   0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
> > > > > +
> > > > > +       .align  64
> > > > > +       /* _sP2_lo.  */
> > > > > +       .long   0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
> > > > > +       .long   0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
> > > > > +       .long   0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
> > > > > +       .long   0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
> > > > > +
> > > > > +       .align  64
> > > > > +       /* _sP2_hi.  */
> > > > > +       .long   0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
> > > > > +       .long   0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
> > > > > +       .long   0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
> > > > > +       .long   0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
> > > > > +
> > > > > +       .align  64
> > > > > +       /* _sP0_lo.  */
> > > > > +       .long   0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
> > > > > +       .long   0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
> > > > > +       .long   0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
> > > > > +       .long   0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
> > > > > +
> > > > > +       .align  64
> > > > > +       /* _sP0_hi.  */
> > > > > +       .long   0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
> > > > > +       .long   0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
> > > > > +       .long   0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
> > > > > +       .long   0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
> > > > > +
> > > > > +       .align  64
> > > > > +       .type   __svml_stanh_data_internal, @object
> > > > > +       .size   __svml_stanh_data_internal, .-__svml_stanh_data_internal
> > > > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
> > > > > index 50f753ffb3..716b06d640 100644
> > > > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
> > > > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
> > > > > @@ -70,763 +70,154 @@
> > > > >   *
> > > > >   */
> > > > >
> > > > > -/* Offsets for data table __svml_stanh_data_internal
> > > > > - */
> > > > > -#define _dbP                           0
> > > > > -#define _sSignMask                     4288
> > > > > -#define _sAbsMask                      4304
> > > > > -#define _iExpMantMask                  4320
> > > > > -#define _iExpMask                      4336
> > > > > -#define _iMinIdxOfsMask                4352
> > > > > -#define _iMaxIdxMask                   4368
> > > > >
> > > > >  #include <sysdep.h>
> > > > >
> > > > > +#define ONLY_DECL_OFFSET
> > > > > +#include "svml_s_tanhf_rodata.S"
> > > > > +
> > > > >          .text
> > > > >         .section .text.sse4,"ax",@progbits
> > > > >  ENTRY(_ZGVbN4v_tanhf_sse4)
> > > > > -        subq      $72, %rsp
> > > > > -        cfi_def_cfa_offset(80)
> > > > > -        movaps    %xmm0, %xmm5
> > > > > +       /* Save copy of input in xmm12.  */
> > > > > +       movaps  %xmm0, %xmm12
> > > > >
> > > > > -/* Here huge arguments, INF and NaNs are filtered out to callout. */
> > > > > -        movdqu    _iExpMantMask+__svml_stanh_data_internal(%rip), %xmm9
> > > > > -        lea       _dbP+16+__svml_stanh_data_internal(%rip), %r8
> > > > > -        pand      %xmm5, %xmm9
> > > > > +       /* Here huge arguments, INF and NaNs are filtered out to callout.  */
> > > > > +       movdqu  TANHF_DATA(_iExpMantMask)(%rip), %xmm3
> > > > > +       pand    %xmm0, %xmm3
> > > > >
> > > > > -/* if VMIN, VMAX is defined for I type */
> > > > > -        pxor      %xmm7, %xmm7
> > > > > -        movdqa    %xmm9, %xmm6
> > > > > -        psubd     _iMinIdxOfsMask+__svml_stanh_data_internal(%rip), %xmm9
> > > > >
> > > > > -/*
> > > > > - *  small table specific variables *
> > > > > - *  Constant loading
> > > > > - */
> > > > > -        movdqu    _iMaxIdxMask+__svml_stanh_data_internal(%rip), %xmm10
> > > > > -        movdqa    %xmm9, %xmm11
> > > > > -        movdqa    %xmm9, %xmm8
> > > > > -        pcmpgtd   %xmm10, %xmm11
> > > > > -        pcmpgtd   %xmm7, %xmm8
> > > > > -        movdqa    %xmm11, %xmm14
> > > > > -        pand      %xmm8, %xmm9
> > > > > -        andps     %xmm11, %xmm10
> > > > > -        andnps    %xmm9, %xmm14
> > > > > -        orps      %xmm10, %xmm14
> > > > > -        psrld     $14, %xmm14
> > > > > -        movd      %xmm14, %edx
> > > > > -        pshufd    $1, %xmm14, %xmm12
> > > > > -        pshufd    $2, %xmm14, %xmm13
> > > > > -        movd      %xmm12, %ecx
> > > > > -        pshufd    $3, %xmm14, %xmm15
> > > > > -        movups    _sAbsMask+__svml_stanh_data_internal(%rip), %xmm3
> > > > > -        movslq    %edx, %rdx
> > > > > -        andps     %xmm5, %xmm3
> > > > > -        movslq    %ecx, %rcx
> > > > > -        pcmpgtd   _iExpMask+__svml_stanh_data_internal(%rip), %xmm6
> > > > > -        movd      %xmm13, %esi
> > > > > -        movups    -16(%rdx,%r8), %xmm2
> > > > > -        movaps    %xmm2, %xmm0
> > > > > -        movd      %xmm15, %edi
> > > > > -        movmskps  %xmm6, %eax
> > > > > -        movups    -16(%rcx,%r8), %xmm6
> > > > > -        unpcklpd  %xmm6, %xmm0
> > > > > -        unpckhpd  %xmm6, %xmm2
> > > > > -        cvtps2pd  %xmm3, %xmm6
> > > > > -        movhlps   %xmm3, %xmm3
> > > > > -        cvtps2pd  %xmm3, %xmm3
> > > > > -        movslq    %esi, %rsi
> > > > > -        movslq    %edi, %rdi
> > > > > -        movups    (%rcx,%r8), %xmm8
> > > > > -        movups    (%rdx,%r8), %xmm12
> > > > > -        movups    (%rsi,%r8), %xmm13
> > > > > -        movaps    %xmm12, %xmm10
> > > > > -        movups    (%rdi,%r8), %xmm9
> > > > > -        movaps    %xmm13, %xmm11
> > > > > -        unpckhpd  %xmm8, %xmm12
> > > > > -        unpckhpd  %xmm9, %xmm13
> > > > > -        mulpd     %xmm6, %xmm12
> > > > > -        mulpd     %xmm3, %xmm13
> > > > > -        unpcklpd  %xmm8, %xmm10
> > > > > -        unpcklpd  %xmm9, %xmm11
> > > > > -        addpd     %xmm10, %xmm12
> > > > > -        addpd     %xmm11, %xmm13
> > > > > -        mulpd     %xmm6, %xmm12
> > > > > -        mulpd     %xmm3, %xmm13
> > > > > -        addpd     %xmm2, %xmm12
> > > > > -        movups    -16(%rsi,%r8), %xmm1
> > > > > -        movups    -16(%rdi,%r8), %xmm7
> > > > > -        movaps    %xmm1, %xmm14
> > > > > -        unpckhpd  %xmm7, %xmm1
> > > > > -        addpd     %xmm1, %xmm13
> > > > > -        mulpd     %xmm12, %xmm6
> > > > > -        mulpd     %xmm13, %xmm3
> > > > > -        addpd     %xmm0, %xmm6
> > > > > -        unpcklpd  %xmm7, %xmm14
> > > > > -        addpd     %xmm14, %xmm3
> > > > > -        cvtpd2ps  %xmm6, %xmm0
> > > > > -        cvtpd2ps  %xmm3, %xmm1
> > > > > -        movups    _sSignMask+__svml_stanh_data_internal(%rip), %xmm4
> > > > > -        movlhps   %xmm1, %xmm0
> > > > > -        andps     %xmm5, %xmm4
> > > > > -        orps      %xmm4, %xmm0
> > > > > -        testl     %eax, %eax
> > > > > -
> > > > > -/* Go to special inputs processing branch */
> > > > > -        jne       L(SPECIAL_VALUES_BRANCH)
> > > > > -                                # LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm5
> > > > > -
> > > > > -/* Restore registers
> > > > > - * and exit the function
> > > > > - */
> > > > > +       /* Selection of arguments between [0, 0x04280000] into xmm3.  */
> > > > > +       pxor    %xmm7, %xmm7
> > > > > +       /* Save xmm3 for special values check at end.  */
> > > > > +       movdqa  %xmm3, %xmm8
> > > > > +       psubd   TANHF_DATA(_iMinIdxOfsMask)(%rip), %xmm3
> > > > > +       pmaxsd  %xmm7, %xmm3
> > > > > +       pminsd  TANHF_DATA(_iMaxIdxMask)(%rip), %xmm3
> > > > > +       psrld   $14, %xmm3
> > > > >
> > > > > -L(EXIT):
> > > > > -        addq      $72, %rsp
> > > > > -        cfi_def_cfa_offset(8)
> > > > > -        ret
> > > > > -        cfi_def_cfa_offset(80)
> > > > > +       movq    %xmm3, %rcx
> > > > > +       movl    %ecx, %edx
> > > > > +       shrq    $32, %rcx
> > > > >
> > > > > -/* Branch to process
> > > > > - * special inputs
> > > > > - */
> > > > > +       /* xmm8 contains mask of special values.  */
> > > > > +       pcmpgtd TANHF_DATA(_iExpMask)(%rip), %xmm8
> > > > >
> > > > > -L(SPECIAL_VALUES_BRANCH):
> > > > > -        movups    %xmm5, 32(%rsp)
> > > > > -        movups    %xmm0, 48(%rsp)
> > > > > -                                # LOE rbx rbp r12 r13 r14 r15 eax
> > > > > -
> > > > > -        xorl      %edx, %edx
> > > > > -        movq      %r12, 16(%rsp)
> > > > > -        cfi_offset(12, -64)
> > > > > -        movl      %edx, %r12d
> > > > > -        movq      %r13, 8(%rsp)
> > > > > -        cfi_offset(13, -72)
> > > > > -        movl      %eax, %r13d
> > > > > -        movq      %r14, (%rsp)
> > > > > -        cfi_offset(14, -80)
> > > > > -                                # LOE rbx rbp r15 r12d r13d
> > > > > -
> > > > > -/* Range mask
> > > > > - * bits check
> > > > > - */
> > > > > +       pshufd  $0x0e, %xmm3, %xmm3
> > > > > +       movq    %xmm3, %rdi
> > > > > +       movl    %edi, %esi
> > > > > +       shrq    $32, %rdi
> > > > >
> > > > > -L(RANGEMASK_CHECK):
> > > > > -        btl       %r12d, %r13d
> > > > > +       movaps  TANHF_DATA(_sAbsMask)(%rip), %xmm1
> > > > > +       andps   %xmm1, %xmm0
> > > > >
> > > > > -/* Call scalar math function */
> > > > > -        jc        L(SCALAR_MATH_CALL)
> > > > > -                                # LOE rbx rbp r15 r12d r13d
> > > > > +       leaq    TANHF_DATA(_lookupTable)(%rip), %rax
> > > > > +       movups  (%rdx, %rax), %xmm2
> > > > > +       movups  (%rcx, %rax), %xmm6
> > > > >
> > > > > -/* Special inputs
> > > > > - * processing loop
> > > > > - */
> > > > > +       movaps  %xmm2, %xmm4
> > > > > +       movlhps %xmm6, %xmm4
> > > > > +       unpckhpd %xmm6, %xmm2
> > > > >
> > > > > -L(SPECIAL_VALUES_LOOP):
> > > > > -        incl      %r12d
> > > > > -        cmpl      $4, %r12d
> > > > > -
> > > > > -/* Check bits in range mask */
> > > > > -        jl        L(RANGEMASK_CHECK)
> > > > > -                                # LOE rbx rbp r15 r12d r13d
> > > > > -
> > > > > -        movq      16(%rsp), %r12
> > > > > -        cfi_restore(12)
> > > > > -        movq      8(%rsp), %r13
> > > > > -        cfi_restore(13)
> > > > > -        movq      (%rsp), %r14
> > > > > -        cfi_restore(14)
> > > > > -        movups    48(%rsp), %xmm0
> > > > > -
> > > > > -/* Go to exit */
> > > > > -        jmp       L(EXIT)
> > > > > -        cfi_offset(12, -64)
> > > > > -        cfi_offset(13, -72)
> > > > > -        cfi_offset(14, -80)
> > > > > -                                # LOE rbx rbp r12 r13 r14 r15 xmm0
> > > > > -
> > > > > -/* Scalar math fucntion call
> > > > > - * to process special input
> > > > > - */
> > > > > +       cvtps2pd %xmm0, %xmm6
> > > > > +       movhlps %xmm0, %xmm0
> > > > > +       cvtps2pd %xmm0, %xmm0
> > > > >
> > > > > -L(SCALAR_MATH_CALL):
> > > > > -        movl      %r12d, %r14d
> > > > > -        movss     32(%rsp,%r14,4), %xmm0
> > > > > -        call      tanhf@PLT
> > > > > -                                # LOE rbx rbp r14 r15 r12d r13d xmm0
> > > > > +       movups  16(%rdx, %rax), %xmm5
> > > > > +       movups  16(%rsi, %rax), %xmm13
> > > > >
> > > > > -        movss     %xmm0, 48(%rsp,%r14,4)
> > > > > +       movaps  %xmm5, %xmm10
> > > > > +       movaps  %xmm13, %xmm11
> > > > >
> > > > > -/* Process special inputs in loop */
> > > > > -        jmp       L(SPECIAL_VALUES_LOOP)
> > > > > -                                # LOE rbx rbp r15 r12d r13d
> > > > > -END(_ZGVbN4v_tanhf_sse4)
> > > > > +       movups  16(%rcx, %rax), %xmm7
> > > > > +       movups  16(%rdi, %rax), %xmm3
> > > > > +
> > > > > +       unpckhpd %xmm7, %xmm5
> > > > > +       unpckhpd %xmm3, %xmm13
> > > > > +
> > > > > +       mulpd   %xmm6, %xmm5
> > > > > +       mulpd   %xmm0, %xmm13
> > > > > +
> > > > > +       movlhps %xmm7, %xmm10
> > > > > +       movlhps %xmm3, %xmm11
> > > > > +
> > > > > +       addpd   %xmm10, %xmm5
> > > > > +       addpd   %xmm11, %xmm13
> > > > > +
> > > > > +       mulpd   %xmm6, %xmm5
> > > > > +       mulpd   %xmm0, %xmm13
> > > > > +
> > > > > +       addpd   %xmm2, %xmm5
> > > > >
> > > > > -        .section .rodata, "a"
> > > > > -        .align 16
> > > > > -
> > > > > -#ifdef __svml_stanh_data_internal_typedef
> > > > > -typedef unsigned int VUINT32;
> > > > > -typedef struct
> > > > > -{
> > > > > -        __declspec(align(16)) VUINT32 _dbP[(134*4)][2];
> > > > > -        __declspec(align(16)) VUINT32 _sSignMask[4][1];
> > > > > -        __declspec(align(16)) VUINT32 _sAbsMask[4][1];
> > > > > -        __declspec(align(16)) VUINT32 _iExpMantMask[4][1];
> > > > > -        __declspec(align(16)) VUINT32 _iExpMask[4][1];
> > > > > -        __declspec(align(16)) VUINT32 _iMinIdxOfsMask[4][1];
> > > > > -        __declspec(align(16)) VUINT32 _iMaxIdxMask[4][1];
> > > > > -} __svml_stanh_data_internal;
> > > > > -#endif
> > > > > -__svml_stanh_data_internal:
> > > > > -        /* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500]. */
> > > > > -        .quad 0x0000000000000000  /* A00 = +0.000000000000000000000e-01 */
> > > > > -        .quad 0x3FF00000022C70EB  /* A01 = +1.000000008097283510367e+00 */
> > > > > -        .quad 0xBED00E878CFFA194  /* A02 = -3.828228912518614443549e-06 */
> > > > > -        .quad 0xBFD551766D0607A9  /* A03 = -3.330970825846813476723e-01 */
> > > > > -        .quad 0xBE53D60CE3E4C297  /* A00 = -1.847383956330407336230e-08 */
> > > > > -        .quad 0x3FF000024177CF5C  /* A01 = +1.000002151235967140508e+00 */
> > > > > -        .quad 0xBF1758BC94A51A25  /* A02 = -8.906031613262943753568e-05 */
> > > > > -        .quad 0xBFD53EAE67E0D4F0  /* A03 = -3.319507612644221339337e-01 */
> > > > > -        .quad 0xBE5A9E47EF32D6FE  /* A00 = -2.479020984039698285657e-08 */
> > > > > -        .quad 0x3FF00002DA983057  /* A01 = +1.000002721676556793895e+00 */
> > > > > -        .quad 0xBF1BD953509E94AA  /* A02 = -1.062352277175377670507e-04 */
> > > > > -        .quad 0xBFD53BDB562EEDD5  /* A03 = -3.317783681520414806876e-01 */
> > > > > -        .quad 0xBE6191BBE496D294  /* A00 = -3.272532162914017685901e-08 */
> > > > > -        .quad 0x3FF0000390492017  /* A01 = +1.000003398528866105366e+00 */
> > > > > -        .quad 0xBF20727E814A57CE  /* A02 = -1.254825043772153972919e-04 */
> > > > > -        .quad 0xBFD538DE060A6F22  /* A03 = -3.315959033004550748913e-01 */
> > > > > -        .quad 0xBE66DAFA2A893A25  /* A00 = -4.257146219278012568149e-08 */
> > > > > -        .quad 0x3FF0000465E08CD1  /* A01 = +1.000004194219219266770e+00 */
> > > > > -        .quad 0xBF2341C765EF91B6  /* A02 = -1.469188600530365522261e-04 */
> > > > > -        .quad 0xBFD535B6841FAF9E  /* A03 = -3.314033785124993469751e-01 */
> > > > > -        .quad 0xBE6D5794E361E964  /* A00 = -5.465394929765249413434e-08 */
> > > > > -        .quad 0x3FF000055EE2A0CB  /* A01 = +1.000005121846742950353e+00 */
> > > > > -        .quad 0xBF265E6C77E66C8B  /* A02 = -1.706607253709506650304e-04 */
> > > > > -        .quad 0xBFD53264DDCCEDA6  /* A03 = -3.312008062382240103361e-01 */
> > > > > -        .quad 0xBE729C844D374A6E  /* A00 = -6.933284462462096107184e-08 */
> > > > > -        .quad 0x3FF000067F019093  /* A01 = +1.000006195180536350264e+00 */
> > > > > -        .quad 0xBF29CC5348D6DCE5  /* A02 = -1.968242326435338705130e-04 */
> > > > > -        .quad 0xBFD52EE92121ED35  /* A03 = -3.309881995734998416658e-01 */
> > > > > -        .quad 0xBE775AEA17EAA872  /* A00 = -8.700465590574974405858e-08 */
> > > > > -        .quad 0x3FF00007CA1D66B8  /* A01 = +1.000007428656699559610e+00 */
> > > > > -        .quad 0xBF2D8F5EB98A2637  /* A02 = -2.255252009216044881395e-04 */
> > > > > -        .quad 0xBFD52B435CDF9128  /* A03 = -3.307655722585587376727e-01 */
> > > > > -        .quad 0xBE7D04DA28C343F0  /* A00 = -1.081040272327705484794e-07 */
> > > > > -        .quad 0x3FF000094443CCF5  /* A01 = +1.000008837375216730337e+00 */
> > > > > -        .quad 0xBF30D5B76C947AE5  /* A02 = -2.568791210978817814332e-04 */
> > > > > -        .quad 0xBFD52773A0776FAD  /* A03 = -3.305329386764651045105e-01 */
> > > > > -        .quad 0xBE81DD77A12C51C7  /* A00 = -1.331054169875768625701e-07 */
> > > > > -        .quad 0x3FF0000AF1AFD2DA  /* A01 = +1.000010437096696680470e+00 */
> > > > > -        .quad 0xBF331230624C1680  /* A02 = -2.910011410651516805537e-04 */
> > > > > -        .quad 0xBFD52379FC0B61DF  /* A03 = -3.302903138515186909352e-01 */
> > > > > -        .quad 0xBE85D04EEEB3C435  /* A00 = -1.625247628488202841012e-07 */
> > > > > -        .quad 0x3FF0000CD6C9B1F2  /* A01 = +1.000012244238970726684e+00 */
> > > > > -        .quad 0xBF357F0742FADDD4  /* A02 = -3.280060509313874068243e-04 */
> > > > > -        .quad 0xBFD51F56806D0E81  /* A03 = -3.300377134475880880338e-01 */
> > > > > -        .quad 0xBE8A6E289B59681B  /* A00 = -1.969211333326924655065e-07 */
> > > > > -        .quad 0x3FF0000EF8268F72  /* A01 = +1.000014275873550406715e+00 */
> > > > > -        .quad 0xBF381E277A1B747A  /* A02 = -3.680082682942575423093e-04 */
> > > > > -        .quad 0xBFD51B093F1D6FD4  /* A03 = -3.297751537663746734808e-01 */
> > > > > -        .quad 0xBE8FCBC40EE9ABD5  /* A00 = -2.368983653301529373887e-07 */
> > > > > -        .quad 0x3FF000115A883B6C  /* A01 = +1.000016549721943981410e+00 */
> > > > > -        .quad 0xBF3AF17AC974B3D9  /* A02 = -4.111218235774406434303e-04 */
> > > > > -        .quad 0xBFD516924A4C549C  /* A03 = -3.295026517456081105450e-01 */
> > > > > -        .quad 0xBE92FFBC60A3F956  /* A00 = -2.831066871072026054144e-07 */
> > > > > -        .quad 0x3FF0001402DCED8A  /* A01 = +1.000019084151832604590e+00 */
> > > > > -        .quad 0xBF3DFAE9390C4801  /* A02 = -4.574603454311488280083e-04 */
> > > > > -        .quad 0xBFD511F1B4D7DC3A  /* A03 = -3.292202249571719585575e-01 */
> > > > > -        .quad 0xBE9690A22F96D5AD  /* A00 = -3.362443262393081632612e-07 */
> > > > > -        .quad 0x3FF00016F63EFF5D  /* A01 = +1.000021898173108825247e+00 */
> > > > > -        .quad 0xBF409E2C839605BB  /* A02 = -5.071370461992499986334e-04 */
> > > > > -        .quad 0xBFD50D27924BEE00  /* A03 = -3.289278916051614487515e-01 */
> > > > > -        .quad 0xBE9AA56C65E72A73  /* A00 = -3.970591019557469835586e-07 */
> > > > > -        .quad 0x3FF0001A39F4A43E  /* A01 = +1.000025011433776978009e+00 */
> > > > > -        .quad 0xBF425BD74C3D6667  /* A02 = -5.602647074553602319844e-04 */
> > > > > -        .quad 0xBFD50833F6E1ABA2  /* A03 = -3.286256705238718156536e-01 */
> > > > > -        .quad 0xBE9F4BD4FF1A83B0  /* A00 = -4.663500013744687071912e-07 */
> > > > > -        .quad 0x3FF0001DD36F9EC2  /* A01 = +1.000028444215715683896e+00 */
> > > > > -        .quad 0xBF44376634149405  /* A02 = -6.169556656102642569831e-04 */
> > > > > -        .quad 0xBFD50316F77EDEE5  /* A03 = -3.283135811757190158922e-01 */
> > > > > -        .quad 0xBEA3B625387BB079  /* A00 = -5.874486399249461304297e-07 */
> > > > > -        .quad 0x3FF00023E14CFBA9  /* A01 = +1.000034217911642153709e+00 */
> > > > > -        .quad 0xBF47392F923218D2  /* A02 = -7.087213783883111826306e-04 */
> > > > > -        .quad 0xBFD4FB1FACDEB938  /* A03 = -3.278273761924483942209e-01 */
> > > > > -        .quad 0xBEAA6E24F543500A  /* A00 = -7.876828740601738750574e-07 */
> > > > > -        .quad 0x3FF0002D5C6E8412  /* A01 = +1.000043259679163742959e+00 */
> > > > > -        .quad 0xBF4BAF02BD7FDD70  /* A02 = -8.448375110664940040861e-04 */
> > > > > -        .quad 0xBFD4EFEE6527A7DE  /* A03 = -3.271442401734229177279e-01 */
> > > > > -        .quad 0xBEB16E3EBE2157D0  /* A00 = -1.038947396133402500647e-06 */
> > > > > -        .quad 0x3FF00038990FEE2F  /* A01 = +1.000053975962952312884e+00 */
> > > > > -        .quad 0xBF50569481C574CB  /* A02 = -9.972048056490652716971e-04 */
> > > > > -        .quad 0xBFD4E419278DA2B4  /* A03 = -3.264220129263251113372e-01 */
> > > > > -        .quad 0xBEB6A7B6723165D4  /* A00 = -1.350350836279403750524e-06 */
> > > > > -        .quad 0x3FF00045CAB4158E  /* A01 = +1.000066558657042303793e+00 */
> > > > > -        .quad 0xBF531D7C9C849108  /* A02 = -1.166698160951775212202e-03 */
> > > > > -        .quad 0xBFD4D7A0BB33B152  /* A03 = -3.256608799117844954552e-01 */
> > > > > -        .quad 0xBEBD0EE2A8654AFD  /* A00 = -1.732000471561702711532e-06 */
> > > > > -        .quad 0x3FF00055276F18D6  /* A01 = +1.000081209219890521211e+00 */
> > > > > -        .quad 0xBF562FDBA3FB6C6C  /* A02 = -1.354183666925102939860e-03 */
> > > > > -        .quad 0xBFD4CA85F1B93DB2  /* A03 = -3.248610363561638125773e-01 */
> > > > > -        .quad 0xBEC269D4036A207E  /* A00 = -2.195047297096822741730e-06 */
> > > > > -        .quad 0x3FF00066E7DA6E4E  /* A01 = +1.000098138500919997540e+00 */
> > > > > -        .quad 0xBF5991499FC36B3A  /* A02 = -1.560518167983372759405e-03 */
> > > > > -        .quad 0xBFD4BCC9A72283D6  /* A03 = -3.240226871658341556426e-01 */
> > > > > -        .quad 0xBEC7154B6C09CFE1  /* A00 = -2.751729738565190291276e-06 */
> > > > > -        .quad 0x3FF0007B47086B80  /* A01 = +1.000117566559055148900e+00 */
> > > > > -        .quad 0xBF5D455433B4F8F4  /* A02 = -1.786548832412968197680e-03 */
> > > > > -        .quad 0xBFD4AE6CC1BFE145  /* A03 = -3.231460468373550942722e-01 */
> > > > > -        .quad 0xBECCA68CC64A0F8A  /* A00 = -3.415415948561670285790e-06 */
> > > > > -        .quad 0x3FF00092827742F7  /* A01 = +1.000139722473418535387e+00 */
> > > > > -        .quad 0xBF60A7BF15A527AF  /* A02 = -2.033112728132522705610e-03 */
> > > > > -        .quad 0xBFD49F703214084C  /* A03 = -3.222313393636155876010e-01 */
> > > > > -        .quad 0xBED19E68676B241B  /* A00 = -4.200644630977303616698e-06 */
> > > > > -        .quad 0x3FF000ACDA037B26  /* A01 = +1.000164844146362863597e+00 */
> > > > > -        .quad 0xBF62D99F836A02F8  /* A02 = -2.301036405072284102280e-03 */
> > > > > -        .quad 0xBFD48FD4F2B91B28  /* A03 = -3.212787981359945810311e-01 */
> > > > > -        .quad 0xBED57CF4B0C7AA54  /* A00 = -5.123164339408145209103e-06 */
> > > > > -        .quad 0x3FF000CA8FD9E1A1  /* A01 = +1.000193178099017865534e+00 */
> > > > > -        .quad 0xBF653A014548E686  /* A02 = -2.591135484433962181405e-03 */
> > > > > -        .quad 0xBFD47F9C0844B38F  /* A03 = -3.202886658426046806447e-01 */
> > > > > -        .quad 0xBEDA012B1B1A41E2  /* A00 = -6.199971197454598722328e-06 */
> > > > > -        .quad 0x3FF000EBE868FDF4  /* A01 = +1.000224979259539459520e+00 */
> > > > > -        .quad 0xBF67CA9427E0A544  /* A02 = -2.904214255086275467410e-03 */
> > > > > -        .quad 0xBFD46EC6812ADB37  /* A03 = -3.192611943626845749655e-01 */
> > > > > -        .quad 0xBEDF3EAC5BF12194  /* A00 = -7.449344990702664567927e-06 */
> > > > > -        .quad 0x3FF001112A520784  /* A01 = +1.000260510744255704196e+00 */
> > > > > -        .quad 0xBF6A8D01ABDA4DC4  /* A02 = -3.241065277345108255891e-03 */
> > > > > -        .quad 0xBFD45D55759FFA4A  /* A03 = -3.181966446572103146551e-01 */
> > > > > -        .quad 0xBEE2A541BC274267  /* A00 = -8.890883582164319970972e-06 */
> > > > > -        .quad 0x3FF0013A9E5961F2  /* A01 = +1.000300043631906721231e+00 */
> > > > > -        .quad 0xBF6D82ECD080C540  /* A02 = -3.602468994380686462264e-03 */
> > > > > -        .quad 0xBFD44B4A0779C0AD  /* A03 = -3.170952866557950611259e-01 */
> > > > > -        .quad 0xBEE61D97609A27F4  /* A00 = -1.054553560499505625520e-05 */
> > > > > -        .quad 0x3FF001688F56A3AF  /* A01 = +1.000343856731187974773e+00 */
> > > > > -        .quad 0xBF7056F8EFB683EC  /* A02 = -3.989193351487490407647e-03 */
> > > > > -        .quad 0xBFD438A5620F0F74  /* A03 = -3.159573991399533543500e-01 */
> > > > > -        .quad 0xBEEA145429EDD370  /* A00 = -1.243563138839952927732e-05 */
> > > > > -        .quad 0x3FF0019B4A242A67  /* A01 = +1.000392236341804297339e+00 */
> > > > > -        .quad 0xBF7207D31CA78D9B  /* A02 = -4.401993423445739288258e-03 */
> > > > > -        .quad 0xBFD42568BA16E7CD  /* A03 = -3.147832696228050619602e-01 */
> > > > > -        .quad 0xBEEE96370D52680F  /* A00 = -1.458491207477835326165e-05 */
> > > > > -        .quad 0x3FF001D31D8E4115  /* A01 = +1.000445476009251821736e+00 */
> > > > > -        .quad 0xBF73D4CC11EDC094  /* A02 = -4.841611050196221316400e-03 */
> > > > > -        .quad 0xBFD411954D8664E7  /* A03 = -3.135731942252974469021e-01 */
> > > > > -        .quad 0xBEF338C046215EF8  /* A00 = -1.833122622260562810219e-05 */
> > > > > -        .quad 0x3FF00230C32C2EC1  /* A01 = +1.000534784691737621998e+00 */
> > > > > -        .quad 0xBF76BD019BCC5DAF  /* A02 = -5.551344188254799492943e-03 */
> > > > > -        .quad 0xBFD3F2C7156DC21E  /* A03 = -3.116929730668135389848e-01 */
> > > > > -        .quad 0xBEF9B15EAE411EAE  /* A00 = -2.450261207822986676092e-05 */
> > > > > -        .quad 0x3FF002C2DF057A4D  /* A01 = +1.000674124886830940184e+00 */
> > > > > -        .quad 0xBF7B08CCD9AC1E30  /* A02 = -6.600189396301511801646e-03 */
> > > > > -        .quad 0xBFD3C7A7A114FED8  /* A03 = -3.090609620157755976777e-01 */
> > > > > -        .quad 0xBF00E36483C373B3  /* A00 = -3.221178528332122595812e-05 */
> > > > > -        .quad 0x3FF0036F419480D7  /* A01 = +1.000838524028997644777e+00 */
> > > > > -        .quad 0xBF7FD255D1777007  /* A02 = -7.768950679260206403087e-03 */
> > > > > -        .quad 0xBFD39A453911D6CE  /* A03 = -3.062909180947429588215e-01 */
> > > > > -        .quad 0xBF05DFA04DD12059  /* A00 = -4.172046622180685472624e-05 */
> > > > > -        .quad 0x3FF00438B2A03D8D  /* A01 = +1.001030633695197069599e+00 */
> > > > > -        .quad 0xBF828F8DBB4A9D10  /* A02 = -9.062869337255224921890e-03 */
> > > > > -        .quad 0xBFD36AAB704697D9  /* A03 = -3.033856007044711255993e-01 */
> > > > > -        .quad 0xBF0BF3E0C647DEFB  /* A00 = -5.331544597092331081714e-05 */
> > > > > -        .quad 0x3FF005221063D36D  /* A01 = +1.001253189109060359741e+00 */
> > > > > -        .quad 0xBF857A2CB3C96102  /* A02 = -1.048693584122917590862e-02 */
> > > > > -        .quad 0xBFD338E65BBB4FEC  /* A03 = -3.003478904549854444639e-01 */
> > > > > -        .quad 0xBF11A506ED7C9D31  /* A00 = -6.730894835681591541979e-05 */
> > > > > -        .quad 0x3FF0062E4D0EA92A  /* A01 = +1.001508999829250345925e+00 */
> > > > > -        .quad 0xBF88AB82C2761AF3  /* A02 = -1.204588085125866091241e-02 */
> > > > > -        .quad 0xBFD305028D6BD206  /* A03 = -2.971807843271395688234e-01 */
> > > > > -        .quad 0xBF1607C0922D9BF1  /* A00 = -8.403885708006799337092e-05 */
> > > > > -        .quad 0x3FF007606C341961  /* A01 = +1.001800940198869449560e+00 */
> > > > > -        .quad 0xBF8C25E6DA487BCF  /* A02 = -1.374416688582682892494e-02 */
> > > > > -        .quad 0xBFD2CF0D0EE8F7B5  /* A03 = -2.938873906713255768075e-01 */
> > > > > -        .quad 0xBF1B3A8480A0A16D  /* A00 = -1.038688061788578038307e-04 */
> > > > > -        .quad 0x3FF008BB802D02D6  /* A01 = +1.002131939589323561535e+00 */
> > > > > -        .quad 0xBF8FEB8AE99FD100  /* A02 = -1.558598065819483124983e-02 */
> > > > > -        .quad 0xBFD297135BD0911B  /* A03 = -2.904709240558688843059e-01 */
> > > > > -        .quad 0xBF20ABB9BDB75C65  /* A00 = -1.271881327357976163798e-04 */
> > > > > -        .quad 0x3FF00A42A76D8CD1  /* A01 = +1.002504972472525901495e+00 */
> > > > > -        .quad 0xBF91FF3D752BB9E6  /* A02 = -1.757522609380570560722e-02 */
> > > > > -        .quad 0xBFD25D235C1F88B4  /* A03 = -2.869346999779154305799e-01 */
> > > > > -        .quad 0xBF243D3254425461  /* A00 = -1.544116913733432829448e-04 */
> > > > > -        .quad 0x3FF00BF909D1795E  /* A01 = +1.002923048355647051011e+00 */
> > > > > -        .quad 0xBF94304E04D44942  /* A02 = -1.971551804042204897316e-02 */
> > > > > -        .quad 0xBFD2214B5E61CFA6  /* A03 = -2.832821294498394371075e-01 */
> > > > > -        .quad 0xBF286070011B61CE  /* A00 = -1.859795307186510085994e-04 */
> > > > > -        .quad 0x3FF00DE1D5E1627E  /* A01 = +1.003389201612804537689e+00 */
> > > > > -        .quad 0xBF9689D5F4163F59  /* A02 = -2.201017668045266231780e-02 */
> > > > > -        .quad 0xBFD1E39A11C3B42C  /* A03 = -2.795167134743816728104e-01 */
> > > > > -        .quad 0xBF2D250B366A79E8  /* A00 = -2.223564326486314902259e-04 */
> > > > > -        .quad 0x3FF010003E134001  /* A01 = +1.003906481248123094829e+00 */
> > > > > -        .quad 0xBF990C9FF91F6F81  /* A02 = -2.446222265267250853271e-02 */
> > > > > -        .quad 0xBFD1A41E80084CDC  /* A03 = -2.756420374218586655246e-01 */
> > > > > -        .quad 0xBF314DB5DDC2A30E  /* A00 = -2.640313157465248123865e-04 */
> > > > > -        .quad 0x3FF012577608921B  /* A01 = +1.004477940624503018441e+00 */
> > > > > -        .quad 0xBF9BB9626875B0C9  /* A02 = -2.707437288829409385849e-02 */
> > > > > -        .quad 0xBFD162E80768A9D0  /* A03 = -2.716617653228725615122e-01 */
> > > > > -        .quad 0xBF346A6133808864  /* A00 = -3.115165050094957730625e-04 */
> > > > > -        .quad 0x3FF014EAAFCC88A3  /* A01 = +1.005106627192198898157e+00 */
> > > > > -        .quad 0xBF9E90BEF9BF7419  /* A02 = -2.984903716411588595059e-02 */
> > > > > -        .quad 0xBFD12006545F7FAD  /* A03 = -2.675796340899932457269e-01 */
> > > > > -        .quad 0xBF37F180DC3848EA  /* A00 = -3.653468704395550778821e-04 */
> > > > > -        .quad 0x3FF017BD19147861  /* A01 = +1.005795572250939295955e+00 */
> > > > > -        .quad 0xBFA0C9A14C702E07  /* A02 = -3.278831537326359207851e-02 */
> > > > > -        .quad 0xBFD0DB895B650092  /* A03 = -2.633994476818851682154e-01 */
> > > > > -        .quad 0xBF3BEC6AAC6D7635  /* A00 = -4.260788377246944457107e-04 */
> > > > > -        .quad 0x3FF01AD1D884E719  /* A01 = +1.006547780778822565040e+00 */
> > > > > -        .quad 0xBFA260B2A1B1434A  /* A02 = -3.589399551186163439542e-02 */
> > > > > -        .quad 0xBFD09581529E93D6  /* A03 = -2.591250712233067465817e-01 */
> > > > > -        .quad 0xBF4164E26167882B  /* A00 = -5.308251737086202562063e-04 */
> > > > > -        .quad 0x3FF01FEF14B62B81  /* A01 = +1.007796364693348545316e+00 */
> > > > > -        .quad 0xBFA4EB014538AA42  /* A02 = -4.085544557559163403315e-02 */
> > > > > -        .quad 0xBFD029D36FEAF41F  /* A03 = -2.525528519580024222613e-01 */
> > > > > -        .quad 0xBF46F6FFF4E53DC8  /* A00 = -7.008313930700277652464e-04 */
> > > > > -        .quad 0x3FF027CBB51CBBA0  /* A01 = +1.009715754956893363214e+00 */
> > > > > -        .quad 0xBFA89DEC9FEC112E  /* A02 = -4.807986690687680864098e-02 */
> > > > > -        .quad 0xBFCF2A99464D0DB4  /* A03 = -2.434875100390009317053e-01 */
> > > > > -        .quad 0xBF4DCC9C4F66A4D9  /* A00 = -9.094012482836712945103e-04 */
> > > > > -        .quad 0x3FF030E7CFCCD583  /* A01 = +1.011939822882909068014e+00 */
> > > > > -        .quad 0xBFACAA3B95814081  /* A02 = -5.598627281199331645611e-02 */
> > > > > -        .quad 0xBFCDF78F156BE7CF  /* A03 = -2.341173987004467604844e-01 */
> > > > > -        .quad 0xBF5308ED74E5C7A6  /* A00 = -1.161796466103906435435e-03 */
> > > > > -        .quad 0x3FF03B5986412ECB  /* A01 = +1.014489674026594512313e+00 */
> > > > > -        .quad 0xBFB087EBA88DCC3F  /* A02 = -6.457398285947223148806e-02 */
> > > > > -        .quad 0xBFCCBB9BD134862F  /* A03 = -2.244753619680052991736e-01 */
> > > > > -        .quad 0xBF57FA23C00DF4B5  /* A00 = -1.463446533505758208674e-03 */
> > > > > -        .quad 0x3FF0473558A1BCC0  /* A01 = +1.017384859292903342975e+00 */
> > > > > -        .quad 0xBFB2E702BC6360EF  /* A02 = -7.383744334527241048871e-02 */
> > > > > -        .quad 0xBFCB77D546379288  /* A03 = -2.145945160729250122955e-01 */
> > > > > -        .quad 0xBF5DD12971557F71  /* A00 = -1.819887610814388068450e-03 */
> > > > > -        .quad 0x3FF0548DDF5000A8  /* A01 = +1.020643112482540360020e+00 */
> > > > > -        .quad 0xBFB571B63DA186E1  /* A02 = -8.376635555898871710045e-02 */
> > > > > -        .quad 0xBFCA2D5202605148  /* A03 = -2.045080672838912594358e-01 */
> > > > > -        .quad 0xBF6252B1AD5D4F17  /* A00 = -2.236697221556737096709e-03 */
> > > > > -        .quad 0x3FF063738A910BF7  /* A01 = +1.024280110622155737232e+00 */
> > > > > -        .quad 0xBFB8270C8E6B601B  /* A02 = -9.434584118878357184013e-02 */
> > > > > -        .quad 0xBFC8DD27D950A07E  /* A03 = -1.942491351230763441116e-01 */
> > > > > -        .quad 0xBF66470C91730CFC  /* A00 = -2.719425723258004842786e-03 */
> > > > > -        .quad 0x3FF073F468FCF331  /* A01 = +1.028309259519300633556e+00 */
> > > > > -        .quad 0xBFBB05C2952191E4  /* A02 = -1.055566419686964629854e-01 */
> > > > > -        .quad 0xBFC7886A770DE2BD  /* A03 = -1.838505822486435070662e-01 */
> > > > > -        .quad 0xBF6AD114AC8E98EC  /* A00 = -3.273525599485007861467e-03 */
> > > > > -        .quad 0x3FF0861BF53E5226  /* A01 = +1.032741506559554434119e+00 */
> > > > > -        .quad 0xBFBE0C4F9B461507  /* A02 = -1.173753503881763554650e-01 */
> > > > > -        .quad 0xBFC6302A037CDE3A  /* A03 = -1.733448521642786954722e-01 */
> > > > > -        .quad 0xBF6FFBDE2A6C2AF8  /* A00 = -3.904279630096648551207e-03 */
> > > > > -        .quad 0x3FF099F2EB8E7DA3  /* A01 = +1.037585182326304034106e+00 */
> > > > > -        .quad 0xBFC09C74D192DDF0  /* A02 = -1.297746680554463516444e-01 */
> > > > > -        .quad 0xBFC4D571D8E3079F  /* A03 = -1.627638157861470424859e-01 */
> > > > > -        .quad 0xBF72E8FDC0B952AA  /* A00 = -4.616728994353872309042e-03 */
> > > > > -        .quad 0x3FF0AF7F273C9533  /* A01 = +1.042845872181101141152e+00 */
> > > > > -        .quad 0xBFC244C512736F10  /* A02 = -1.427236881344176033792e-01 */
> > > > > -        .quad 0xBFC379474F58B902  /* A03 = -1.521386277613104298645e-01 */
> > > > > -        .quad 0xBF762EABAF17395B  /* A00 = -5.415602341101023557701e-03 */
> > > > > -        .quad 0x3FF0C6C3886F63FB  /* A01 = +1.048526318502125631582e+00 */
> > > > > -        .quad 0xBFC3FDF9918EA12A  /* A02 = -1.561881981590514389957e-01 */
> > > > > -        .quad 0xBFC21CA89ECAB895  /* A03 = -1.414995932913753196036e-01 */
> > > > > -        .quad 0xBF79D387CE5B2BAE  /* A00 = -6.305246822828998107258e-03 */
> > > > > -        .quad 0x3FF0DFBFE2346376  /* A01 = +1.054626353847394337748e+00 */
> > > > > -        .quad 0xBFC5C6DA43602620  /* A02 = -1.701309994680721970894e-01 */
> > > > > -        .quad 0xBFC0C08BD8DB6631  /* A03 = -1.308760460731704100557e-01 */
> > > > > -        .quad 0xBF7DDBA8E8DA9060  /* A00 = -7.289562037531366334164e-03 */
> > > > > -        .quad 0x3FF0FA70F0D1B464  /* A01 = +1.061142864894713433443e+00 */
> > > > > -        .quad 0xBFC79E18D92BAA7C  /* A02 = -1.845122394946264732241e-01 */
> > > > > -        .quad 0xBFBECBBBF74C2669  /* A03 = -1.202962378266875381749e-01 */
> > > > > -        .quad 0xBF81254E76EA25DA  /* A00 = -8.371937755572145950511e-03 */
> > > > > -        .quad 0x3FF116D05835EBD0  /* A01 = +1.068069786618014660462e+00 */
> > > > > -        .quad 0xBFC982539E2ED224  /* A02 = -1.992897531869327609755e-01 */
> > > > > -        .quad 0xBFBC1B043C350159  /* A03 = -1.097872397413132278254e-01 */
> > > > > -        .quad 0xBF8391ACBA863403  /* A00 = -9.555196230190082448686e-03 */
> > > > > -        .quad 0x3FF134D4AA477FE2  /* A01 = +1.075398125794884141015e+00 */
> > > > > -        .quad 0xBFCB7218609FEAFB  /* A02 = -2.144194099235717521079e-01 */
> > > > > -        .quad 0xBFB970A16CB88329  /* A03 = -9.937485603633135211599e-02 */
> > > > > -        .quad 0xBF87935088E48E8B  /* A00 = -1.151144902957603431692e-02 */
> > > > > -        .quad 0x3FF1649892AD7DD3  /* A01 = +1.087059567413110938716e+00 */
> > > > > -        .quad 0xBFCE6971DDE75409  /* A02 = -2.375929196847723912089e-01 */
> > > > > -        .quad 0xBFB58291E88CB251  /* A03 = -8.402358939628952472223e-02 */
> > > > > -        .quad 0xBF8DB3A62C325325  /* A00 = -1.450280973794233242702e-02 */
> > > > > -        .quad 0x3FF1A9C900C6DEEA  /* A01 = +1.103951457056548068891e+00 */
> > > > > -        .quad 0xBFD13DBC65B0E08E  /* A02 = -2.693930619311765140012e-01 */
> > > > > -        .quad 0xBFB06696F62696D1  /* A03 = -6.406539449252625362252e-02 */
> > > > > -        .quad 0xBF92583699F2E27A  /* A00 = -1.791463198307716858659e-02 */
> > > > > -        .quad 0x3FF1F451B85AA9F0  /* A01 = +1.122148246892376022288e+00 */
> > > > > -        .quad 0xBFD34FD5F8288180  /* A02 = -3.017477916164565954205e-01 */
> > > > > -        .quad 0xBFA6FB692825B683  /* A03 = -4.488686194495718900788e-02 */
> > > > > -        .quad 0xBF9641C26E673D6F  /* A00 = -2.173522757385398448959e-02 */
> > > > > -        .quad 0x3FF24364DA5E2B07  /* A01 = +1.141453602790251542487e+00 */
> > > > > -        .quad 0xBFD564A5A5EF5890  /* A02 = -3.342680092295120530821e-01 */
> > > > > -        .quad 0xBF9B43712011A982  /* A03 = -2.662445791467283467968e-02 */
> > > > > -        .quad 0xBF9A901038EC2F39  /* A00 = -2.594018313816024226548e-02 */
> > > > > -        .quad 0x3FF2961356DFFEBA  /* A01 = +1.161639537196534011088e+00 */
> > > > > -        .quad 0xBFD775EBB17198C7  /* A02 = -3.665723069046972759644e-01 */
> > > > > -        .quad 0xBF833B1A926CD462  /* A03 = -9.390075295963199591975e-03 */
> > > > > -        .quad 0xBF9F396A6A461B91  /* A00 = -3.049246095317987084727e-02 */
> > > > > -        .quad 0x3FF2EB53BAEF534B  /* A01 = +1.182452898229899629357e+00 */
> > > > > -        .quad 0xBFD97DABF8AD8BBD  /* A02 = -3.982953957076310058660e-01 */
> > > > > -        .quad 0x3F7B8F6A3E0F8837  /* A03 = +6.728568086119371925713e-03 */
> > > > > -        .quad 0xBFA21878590F8BAA  /* A00 = -3.534294211546946951064e-02 */
> > > > > -        .quad 0x3FF34209790236E1  /* A01 = +1.203622315111197105253e+00 */
> > > > > -        .quad 0xBFDB764C0E71BECB  /* A02 = -4.290952817018306997277e-01 */
> > > > > -        .quad 0x3F962FE0C03F84C0  /* A03 = +2.166701482190513949888e-02 */
> > > > > -        .quad 0xBFA4B36B9AD27ECC  /* A00 = -4.043136849327097492868e-02 */
> > > > > -        .quad 0x3FF3990C5B12FC16  /* A01 = +1.224865298994477935679e+00 */
> > > > > -        .quad 0xBFDD5AABB0D01390  /* A02 = -4.586590983092770912322e-01 */
> > > > > -        .quad 0x3FA21DAF5CA162DB  /* A03 = +3.538272863142363083844e-02 */
> > > > > -        .quad 0xBFA7645E4D7BF28B  /* A00 = -4.568762489177399105378e-02 */
> > > > > -        .quad 0x3FF3EF2FD51C0D9F  /* A01 = +1.245895225962932562069e+00 */
> > > > > -        .quad 0xBFDF26377E1B686E  /* A02 = -4.867075664057044503963e-01 */
> > > > > -        .quad 0x3FA8803E756EE812  /* A03 = +4.785342391501513914509e-02 */
> > > > > -        .quad 0xBFAA210925C64413  /* A00 = -5.103329263796054643398e-02 */
> > > > > -        .quad 0x3FF44349F897D8E7  /* A01 = +1.266427966181760345066e+00 */
> > > > > -        .quad 0xBFE06A7B02C6D8E2  /* A02 = -5.129981092675530707226e-01 */
> > > > > -        .quad 0x3FAE3F194734F5D0  /* A03 = +5.907515520309980505687e-02 */
> > > > > -        .quad 0xBFACDE48F8A19BBB  /* A00 = -5.638340029764018351832e-02 */
> > > > > -        .quad 0x3FF49439D5466582  /* A01 = +1.286187966447272845727e+00 */
> > > > > -        .quad 0xBFE131C7C1063DDC  /* A02 = -5.373266954429101183166e-01 */
> > > > > -        .quad 0x3FB1ADEEC36AD805  /* A03 = +6.906025191241844940482e-02 */
> > > > > -        .quad 0xBFAF905D8F585680  /* A00 = -6.164829611604449866036e-02 */
> > > > > -        .quad 0x3FF4E0ED1FD27F99  /* A01 = +1.304913639360142818546e+00 */
> > > > > -        .quad 0xBFE1E7A859DC1D3D  /* A02 = -5.595285182070380836095e-01 */
> > > > > -        .quad 0x3FB3ED018E4642A1  /* A03 = +7.783517573831001679086e-02 */
> > > > > -        .quad 0xBFB11595104160BA  /* A00 = -6.673556944713512906198e-02 */
> > > > > -        .quad 0x3FF528650340490B  /* A01 = +1.322361958217302513319e+00 */
> > > > > -        .quad 0xBFE28B14B40BC974  /* A02 = -5.794776455425521000109e-01 */
> > > > > -        .quad 0x3FB5DF49F5BAF6D7  /* A03 = +8.543836831355676453281e-02 */
> > > > > -        .quad 0xBFB2513A97344BA4  /* A00 = -7.155195418844911836587e-02 */
> > > > > -        .quad 0x3FF569BA0DB5EE14  /* A01 = +1.338312200124055273420e+00 */
> > > > > -        .quad 0xBFE31B53A8B67B20  /* A02 = -5.970857901737396389308e-01 */
> > > > > -        .quad 0x3FB787F297BB0544  /* A03 = +9.191814617499455275507e-02 */
> > > > > -        .quad 0xBFB37512E848FAFA  /* A00 = -7.600515528700305112331e-02 */
> > > > > -        .quad 0x3FF5A41F33B403C8  /* A01 = +1.352568819013173495591e+00 */
> > > > > -        .quad 0xBFE397F6EA9A58A5  /* A02 = -6.123003561103997904880e-01 */
> > > > > -        .quad 0x3FB8EAA9FF25CA06  /* A03 = +9.733068923177520814782e-02 */
> > > > > -        .quad 0xBFB47B3E603AFC5D  /* A00 = -8.000554894805263217439e-02 */
> > > > > -        .quad 0x3FF5D6E3EDE40487  /* A01 = +1.364963464031718975988e+00 */
> > > > > -        .quad 0xBFE400D5BCA6D631  /* A02 = -6.251019177058819709103e-01 */
> > > > > -        .quad 0x3FBA0B830ED567FE  /* A03 = +1.017381583418739132707e-01 */
> > > > > -        .quad 0xBFB5BBFE8AC90496  /* A00 = -8.489981544791400103200e-02 */
> > > > > -        .quad 0x3FF612BA70107E95  /* A01 = +1.379572332145390989311e+00 */
> > > > > -        .quad 0xBFE477EAF1FA7693  /* A02 = -6.396383978023599814478e-01 */
> > > > > -        .quad 0x3FBB4784B7C08A95  /* A03 = +1.065600346196709652391e-01 */
> > > > > -        .quad 0xBFB6D5D940743939  /* A00 = -8.920057128509463473254e-02 */
> > > > > -        .quad 0x3FF644A8748F70CE  /* A01 = +1.391762214006166953340e+00 */
> > > > > -        .quad 0xBFE4D646AB07EA37  /* A02 = -6.511567440459832267763e-01 */
> > > > > -        .quad 0x3FBC354F4E1D5292  /* A03 = +1.101884427747086558913e-01 */
> > > > > -        .quad 0xBFB7223D19E4F3D1  /* A00 = -9.036619074045339206069e-02 */
> > > > > -        .quad 0x3FF6518FEB42B7FA  /* A01 = +1.394912642466350494175e+00 */
> > > > > -        .quad 0xBFE4ED86CB87498C  /* A02 = -6.539949393430091184598e-01 */
> > > > > -        .quad 0x3FBC6D29F28CCA9B  /* A03 = +1.110407082713131127205e-01 */
> > > > > -        .quad 0xBFB6878652FF6312  /* A00 = -8.800544287022329936754e-02 */
> > > > > -        .quad 0x3FF63948C302D040  /* A01 = +1.388985406648330922508e+00 */
> > > > > -        .quad 0xBFE4C4E2E7904E17  /* A02 = -6.490339777687407218920e-01 */
> > > > > -        .quad 0x3FBC127356CA1ABE  /* A03 = +1.096565329445224612481e-01 */
> > > > > -        .quad 0xBFB4F5D18B0C91D6  /* A00 = -8.187589306596207427980e-02 */
> > > > > -        .quad 0x3FF5FD27EB7DD0B8  /* A01 = +1.374305648697413673176e+00 */
> > > > > -        .quad 0xBFE464E01A2B2FC6  /* A02 = -6.373138915164353601739e-01 */
> > > > > -        .quad 0x3FBB460547674A30  /* A03 = +1.065371798825160976065e-01 */
> > > > > -        .quad 0xBFB26642FA16A685  /* A00 = -7.187288861919156890412e-02 */
> > > > > -        .quad 0x3FF59F9BEDE1C95A  /* A01 = +1.351467065073470141812e+00 */
> > > > > -        .quad 0xBFE3D67920C8FBEA  /* A02 = -6.199308052381387046381e-01 */
> > > > > -        .quad 0x3FBA24F6A8D3CBC1  /* A03 = +1.021265184570401413078e-01 */
> > > > > -        .quad 0xBFADB5294794F097  /* A00 = -5.802277563859197656582e-02 */
> > > > > -        .quad 0x3FF523EA7B9CF453  /* A01 = +1.321268542159732772845e+00 */
> > > > > -        .quad 0xBFE322A8B55E35DB  /* A02 = -5.979808370918208160205e-01 */
> > > > > -        .quad 0x3FB8C8673B1B3E37  /* A03 = +9.680791085269722928697e-02 */
> > > > > -        .quad 0xBFA4B7D661965C6A  /* A00 = -4.046506825687219699450e-02 */
> > > > > -        .quad 0x3FF48DE3E2CE3122  /* A01 = +1.284641157110919085227e+00 */
> > > > > -        .quad 0xBFE251FED1A7F445  /* A02 = -5.725092024655472622285e-01 */
> > > > > -        .quad 0x3FB745699FCABDB9  /* A03 = +9.090290213747821701507e-02 */
> > > > > -        .quad 0xBF93E60456E4EE1D  /* A00 = -1.943213253365004902773e-02 */
> > > > > -        .quad 0x3FF3E1A14E628A59  /* A01 = +1.242585474196536532432e+00 */
> > > > > -        .quad 0xBFE16C5AB660E876  /* A02 = -5.444768488007543094653e-01 */
> > > > > -        .quad 0x3FB5AD33AA8C188F  /* A03 = +8.467410005332197397987e-02 */
> > > > > -        .quad 0x3F738C17C47C7961  /* A00 = +4.772274820224659853951e-03 */
> > > > > -        .quad 0x3FF3234DDE3BD146  /* A01 = +1.196119182682268355933e+00 */
> > > > > -        .quad 0xBFE078C0D77A9D3B  /* A02 = -5.147403915952176722826e-01 */
> > > > > -        .quad 0x3FB40D74B3E276B8  /* A03 = +7.833032027925923568290e-02 */
> > > > > -        .quad 0x3FA0474BECC689C7  /* A00 = +3.179394975019849550746e-02 */
> > > > > -        .quad 0x3FF256FB4FA7D18A  /* A01 = +1.146235762743432307076e+00 */
> > > > > -        .quad 0xBFDEFA8E3FB285E2  /* A02 = -4.840427038235174395098e-01 */
> > > > > -        .quad 0x3FB270C007493D59  /* A03 = +7.203293016322244446403e-02 */
> > > > > -        .quad 0x3FAF5BD51E479BDC  /* A00 = +6.124750132203590768931e-02 */
> > > > > -        .quad 0x3FF18081D0B53BC5  /* A01 = +1.093873801484492647162e+00 */
> > > > > -        .quad 0xBFDCFE2439BD0C03  /* A02 = -4.530115665294831006626e-01 */
> > > > > -        .quad 0x3FB0DEFE5A45AFDD  /* A03 = +6.590261176978580437424e-02 */
> > > > > -        .quad 0x3FB7BD5D2806EA26  /* A00 = +9.273321368429118805032e-02 */
> > > > > -        .quad 0x3FF0A369E35B4440  /* A01 = +1.039895904647224256223e+00 */
> > > > > -        .quad 0xBFDB04BC5C9951E7  /* A02 = -4.221640495573226181669e-01 */
> > > > > -        .quad 0x3FAEBBBAA9D6DEEF  /* A03 = +6.002600978120919278380e-02 */
> > > > > -        .quad 0x3FC01BE411098DBC  /* A00 = +1.258511622610124502941e-01 */
> > > > > -        .quad 0x3FEF85BDABC031C1  /* A01 = +9.850757936961188621083e-01 */
> > > > > -        .quad 0xBFD91521375097C2  /* A02 = -3.919146576102968682065e-01 */
> > > > > -        .quad 0x3FABE26F0086D982  /* A03 = +5.446192628317005068883e-02 */
> > > > > -        .quad 0x3FC481D7FF5776B9  /* A00 = +1.602125164781023347604e-01 */
> > > > > -        .quad 0x3FEDC3506C1E7218  /* A01 = +9.300920592973538347792e-01 */
> > > > > -        .quad 0xBFD7349A88DA7D4F  /* A02 = -3.625856720409119104964e-01 */
> > > > > -        .quad 0x3FA936E2DFF8E2AE  /* A03 = +4.924687370334389358018e-02 */
> > > > > -        .quad 0x3FC90471F96FA27A  /* A00 = +1.954481571149420671141e-01 */
> > > > > -        .quad 0x3FEC0451601987A2  /* A01 = +8.755270840595026360376e-01 */
> > > > > -        .quad 0xBFD5671CD4B898DC  /* A02 = -3.344184949259110251063e-01 */
> > > > > -        .quad 0x3FA6BB9594603B67  /* A03 = +4.439990459660841243261e-02 */
> > > > > -        .quad 0x3FCFD8ADB9ED944C  /* A00 = +2.488000066615846384011e-01 */
> > > > > -        .quad 0x3FE978C073F6809A  /* A01 = +7.959902062321078108909e-01 */
> > > > > -        .quad 0xBFD2DF7E00BCD5A9  /* A02 = -2.948908812716931060471e-01 */
> > > > > -        .quad 0x3FA3614033D490B2  /* A03 = +3.785133965200894456959e-02 */
> > > > > -        .quad 0x3FD4846A12AFE5A0  /* A00 = +3.205819303981005674586e-01 */
> > > > > -        .quad 0x3FE63A1147D40472  /* A01 = +6.945883181471244061100e-01 */
> > > > > -        .quad 0xBFCFA2268AD34450  /* A02 = -2.471359422548027318101e-01 */
> > > > > -        .quad 0x3F9F150201D9FFE0  /* A03 = +3.035357605267552383310e-02 */
> > > > > -        .quad 0x3FD9018641F82BEB  /* A00 = +3.907180446846598154131e-01 */
> > > > > -        .quad 0x3FE33B7C220FFBDC  /* A01 = +6.010113396913498995389e-01 */
> > > > > -        .quad 0xBFCA4E4187E29C86  /* A02 = -2.055131829740483584423e-01 */
> > > > > -        .quad 0x3F98C30CED19F8F4  /* A03 = +2.418155858185229434287e-02 */
> > > > > -        .quad 0x3FDD4B8255BEB078  /* A00 = +4.577337109901757905561e-01 */
> > > > > -        .quad 0x3FE0858B19D3A49B  /* A01 = +5.163016800335243905451e-01 */
> > > > > -        .quad 0xBFC5BC929EACE564  /* A02 = -1.698172831327539045176e-01 */
> > > > > -        .quad 0x3F93A083CE57DE2B  /* A03 = +1.916700312537337677621e-02 */
> > > > > -        .quad 0x3FE0A8E5E039295C  /* A00 = +5.206174258576470315063e-01 */
> > > > > -        .quad 0x3FDC35E1234583FE  /* A01 = +4.407885403107342225937e-01 */
> > > > > -        .quad 0xBFC1DE034E31AEB9  /* A02 = -1.395877963835710222629e-01 */
> > > > > -        .quad 0x3F8EFDEBB3471BDC  /* A03 = +1.513275280821162888101e-02 */
> > > > > -        .quad 0x3FE2851B603CB2A5  /* A00 = +5.787484054213406503564e-01 */
> > > > > -        .quad 0x3FD7F4A44ABBB286  /* A01 = +3.743067483726821853551e-01 */
> > > > > -        .quad 0xBFBD3EEB67087DE7  /* A02 = -1.142413260026767657385e-01 */
> > > > > -        .quad 0x3F8864F38329E8BD  /* A03 = +1.191129917173260922836e-02 */
> > > > > -        .quad 0x3FE437DBE3C34AC1  /* A00 = +6.318187187665317283702e-01 */
> > > > > -        .quad 0x3FD43F6F789441B5  /* A01 = +3.163717916040938438194e-01 */
> > > > > -        .quad 0xBFB7D92E7901B9A4  /* A02 = -9.315767721429907277653e-02 */
> > > > > -        .quad 0x3F8327ED342308E1  /* A03 = +9.353497651663324544136e-03 */
> > > > > -        .quad 0x3FE5C0977766D55C  /* A00 = +6.797597248138731451661e-01 */
> > > > > -        .quad 0x3FD10B42A764D8F9  /* A01 = +2.663122782427219115142e-01 */
> > > > > -        .quad 0xBFB3633351D3D70F  /* A02 = -7.573242900602060456716e-02 */
> > > > > -        .quad 0x3F7E079E30FF899C  /* A03 = +7.331483779099558922843e-03 */
> > > > > -        .quad 0x3FE7202CE08A88C4  /* A00 = +7.226776490754436288455e-01 */
> > > > > -        .quad 0x3FCC973EB5662B01  /* A01 = +2.233656297433626314319e-01 */
> > > > > -        .quad 0xBFAF70A455F9920B  /* A02 = -6.140626477716545211782e-02 */
> > > > > -        .quad 0x3F77812411CE99B6  /* A03 = +5.738392731393584730859e-03 */
> > > > > -        .quad 0x3FE85879424095B1  /* A00 = +7.608000082006382003286e-01 */
> > > > > -        .quad 0x3FC7E73BD1674D84  /* A01 = +1.867441914060742336190e-01 */
> > > > > -        .quad 0xBFA96F84E4BF333B  /* A02 = -4.967894832916504993525e-02 */
> > > > > -        .quad 0x3F72606DDCA6E117  /* A03 = +4.486493251924870105662e-03 */
> > > > > -        .quad 0x3FE96BFE4957F4DD  /* A00 = +7.944327766887472330737e-01 */
> > > > > -        .quad 0x3FC3ED4780D25478  /* A01 = +1.556786898624158421711e-01 */
> > > > > -        .quad 0xBFA489C5F9A56B58  /* A02 = -4.011362717093075458408e-02 */
> > > > > -        .quad 0x3F6CB5DC17E9AD2A  /* A03 = +3.504686231556104931972e-03 */
> > > > > -        .quad 0x3FEA5D9CB2F41234  /* A00 = +8.239272589858672724006e-01 */
> > > > > -        .quad 0x3FC091A758374DCF  /* A01 = +1.294449978582705440555e-01 */
> > > > > -        .quad 0xBFA08E436D4B5CE0  /* A02 = -3.233538350257858517978e-02 */
> > > > > -        .quad 0x3F666997AD53E6B7  /* A03 = +2.735897297154145629133e-03 */
> > > > > -        .quad 0x3FEB3060342CB850  /* A00 = +8.496552485501158713532e-01 */
> > > > > -        .quad 0x3FBB7D30BBC7DC1B  /* A01 = +1.073790033768634993860e-01 */
> > > > > -        .quad 0xBF9AA6BA3443D9E3  /* A02 = -2.602663940430173170060e-02 */
> > > > > -        .quad 0x3F617CA764B7850B  /* A03 = +2.134634914668814050648e-03 */
> > > > > -        .quad 0x3FEBE759A6A0C7B8  /* A00 = +8.719909910635044170135e-01 */
> > > > > -        .quad 0x3FB6C10DE6A703FF  /* A01 = +8.888327485239243264115e-02 */
> > > > > -        .quad 0xBF956C566D8BE1F6  /* A02 = -2.092108768099084498138e-02 */
> > > > > -        .quad 0x3F5B46D1A4A59CF8  /* A03 = +1.664833764687232917079e-03 */
> > > > > -        .quad 0x3FEC858494887A04  /* A00 = +8.912985707318630268503e-01 */
> > > > > -        .quad 0x3FB2CC31F543394D  /* A01 = +7.342827070099140762682e-02 */
> > > > > -        .quad 0xBF9133477FF69137  /* A02 = -1.679717749142747504343e-02 */
> > > > > -        .quad 0x3F5544482FBB4DA5  /* A03 = +1.298017973501022466823e-03 */
> > > > > -        .quad 0x3FED0DB59D0E32E9  /* A00 = +9.079235141267335551518e-01 */
> > > > > -        .quad 0x3FAF006BAFFC6EF4  /* A01 = +6.055008433597022787787e-02 */
> > > > > -        .quad 0xBF8B97146FA2B97A  /* A02 = -1.347175565419144252499e-02 */
> > > > > -        .quad 0x3F5093B01F4CDC69  /* A03 = +1.011774057770665211434e-03 */
> > > > > -        .quad 0x3FEDB487C3EC457C  /* A00 = +9.282873942012623835751e-01 */
> > > > > -        .quad 0x3FA7390C09D0BD1D  /* A01 = +4.535710925881118044112e-02 */
> > > > > -        .quad 0xBF83D9F7C3181106  /* A02 = -9.693084374710735778846e-03 */
> > > > > -        .quad 0x3F46E34A0A3C0E64  /* A03 = +6.984817050299072134500e-04 */
> > > > > -        .quad 0x3FEE5FFCB4E6EB00  /* A00 = +9.492171796076434020506e-01 */
> > > > > -        .quad 0x3F9F4913ED00AADF  /* A01 = +3.055220731782070861526e-02 */
> > > > > -        .quad 0xBF79670BD0E59B5C  /* A02 = -6.201788097633133961528e-03 */
> > > > > -        .quad 0x3F3BC998EBCAF96D  /* A03 = +4.240034429975534616304e-04 */
> > > > > -        .quad 0x3FEEDBA41E9542FE  /* A00 = +9.643116566968215064293e-01 */
> > > > > -        .quad 0x3F94F5DD18D9C24D  /* A01 = +2.046914543319848858727e-02 */
> > > > > -        .quad 0xBF7034896AA122B9  /* A02 = -3.956352980886528904192e-03 */
> > > > > -        .quad 0x3F30DCCB47810B39  /* A03 = +2.573009765038273091199e-04 */
> > > > > -        .quad 0x3FEF33F2882520ED  /* A00 = +9.750912341196716903724e-01 */
> > > > > -        .quad 0x3F8BF37F2CF553FF  /* A01 = +1.364802699996836392315e-02 */
> > > > > -        .quad 0xBF649F6F05A69619  /* A02 = -2.517430152880317534986e-03 */
> > > > > -        .quad 0x3F247623C950AAC9  /* A03 = +1.561087307505231250044e-04 */
> > > > > -        .quad 0x3FEF727757751741  /* A00 = +9.827229221489021115943e-01 */
> > > > > -        .quad 0x3F828E67912C4400  /* A01 = +9.060677640748693306705e-03 */
> > > > > -        .quad 0xBF5A2F51A806CC2C  /* A02 = -1.598195784123355826789e-03 */
> > > > > -        .quad 0x3F18D35D7687E613  /* A03 = +9.470231965016282719549e-05 */
> > > > > -        .quad 0x3FEF9E6325C5942A  /* A00 = +9.880843866091073568469e-01 */
> > > > > -        .quad 0x3F788AB117618F76  /* A01 = +5.991641772286606867914e-03 */
> > > > > -        .quad 0xBF5096EAB0B1EA89  /* A02 = -1.012543859160305046233e-03 */
> > > > > -        .quad 0x3F0E1E50EC4435AB  /* A03 = +5.744633156910412119652e-05 */
> > > > > -        .quad 0x3FEFBD0784049369  /* A00 = +9.918248728250605994461e-01 */
> > > > > -        .quad 0x3F702BBD8294035F  /* A01 = +3.947963975634432264028e-03 */
> > > > > -        .quad 0xBF44FB55E0F00593  /* A02 = -6.403130845457509273330e-04 */
> > > > > -        .quad 0x3F0244DCD723230A  /* A03 = +3.484534217219031730379e-05 */
> > > > > -        .quad 0x3FEFD245E2366A43  /* A00 = +9.944180887426415926811e-01 */
> > > > > -        .quad 0x3F653D82EC088433  /* A01 = +2.592807490387838333795e-03 */
> > > > > -        .quad 0xBF3A7DF75E013CB8  /* A02 = -4.042366908878036561859e-04 */
> > > > > -        .quad 0x3EF6298E69F991CD  /* A03 = +2.113564425911141559972e-05 */
> > > > > -        .quad 0x3FEFE0EAA508BC69  /* A00 = +9.962056372950317539861e-01 */
> > > > > -        .quad 0x3F5BD0771AF3FDDA  /* A01 = +1.697651208644282514598e-03 */
> > > > > -        .quad 0xBF30B2E1254DE571  /* A02 = -2.548026725928887099328e-04 */
> > > > > -        .quad 0x3EEAE28B70EC0256  /* A03 = +1.281973848454955042307e-05 */
> > > > > -        .quad 0x3FEFEAF5303D7F96  /* A00 = +9.974313680831865536192e-01 */
> > > > > -        .quad 0x3F5229111365657E  /* A01 = +1.108423877289460134782e-03 */
> > > > > -        .quad 0xBF250572D04DFE66  /* A02 = -1.603796628408704519168e-04 */
> > > > > -        .quad 0x3EE04E89BB57C981  /* A03 = +7.775682983689149966743e-06 */
> > > > > -        .quad 0x3FEFF1CF52F1CF44  /* A00 = +9.982678051005469122003e-01 */
> > > > > -        .quad 0x3F47A71316147CEB  /* A01 = +7.218211359577819110842e-04 */
> > > > > -        .quad 0xBF1A6D7604055719  /* A02 = -1.008132248946049582547e-04 */
> > > > > -        .quad 0x3ED3C8047586A85C  /* A03 = +4.716233739913014633626e-06 */
> > > > > -        .quad 0x3FEFF6770369EF69  /* A00 = +9.988360468555416149528e-01 */
> > > > > -        .quad 0x3F3EBB261180FBF0  /* A01 = +4.689186039321105101130e-04 */
> > > > > -        .quad 0xBF1097754FE19D7F  /* A02 = -6.329206004950480057066e-05 */
> > > > > -        .quad 0x3EC7FEFF83BCA0A7  /* A03 = +2.860556404988488738366e-06 */
> > > > > -        .quad 0x3FEFF99D42371AC4  /* A00 = +9.992204945818561334647e-01 */
> > > > > -        .quad 0x3F33EB2AEC271F59  /* A01 = +3.039340773764907474054e-04 */
> > > > > -        .quad 0xBF04CF18E0FC0D79  /* A02 = -3.968996690952969588805e-05 */
> > > > > -        .quad 0x3EBD1BDBD6019BE9  /* A03 = +1.735021065507727833886e-06 */
> > > > > -        .quad 0x3FEFFBBCA32B0D91  /* A00 = +9.994795977476532700123e-01 */
> > > > > -        .quad 0x3F29C41E1615110A  /* A01 = +1.965796209707565346710e-04 */
> > > > > -        .quad 0xBEFA11F93D9DCB5A  /* A02 = -2.486248909101414873235e-05 */
> > > > > -        .quad 0x3EB1A7CA4546F7A7  /* A03 = +1.052345642723709228769e-06 */
> > > > > -        .quad 0x3FEFFD298B8E8DE2  /* A00 = +9.996535993308806045121e-01 */
> > > > > -        .quad 0x3F20A1C42D523C5B  /* A01 = +1.268913244172078754520e-04 */
> > > > > -        .quad 0xBEF0507A364AFAE4  /* A02 = -1.555859070622834605755e-05 */
> > > > > -        .quad 0x3EA56ACA17E7CDF4  /* A03 = +6.382806956848098872313e-07 */
> > > > > -        .quad 0x3FEFFE1DC82BA5A3  /* A00 = +9.997700604991915929176e-01 */
> > > > > -        .quad 0x3F156E73B90F1769  /* A01 = +8.175450626798714452801e-05 */
> > > > > -        .quad 0xBEE4663579D0A09F  /* A02 = -9.727122057226747625365e-06 */
> > > > > -        .quad 0x3E99FAF6FEC5D4C1  /* A03 = +3.871371052824002996020e-07 */
> > > > > -        .quad 0x3FEFFEF8D0BB5E81  /* A00 = +9.998745037837154514548e-01 */
> > > > > -        .quad 0x3F06686DA18D39C3  /* A01 = +4.273972098777251447726e-05 */
> > > > > -        .quad 0xBED46BC298073E90  /* A02 = -4.868731025855742842491e-06 */
> > > > > -        .quad 0x3E88E42286B9D0FD  /* A03 = +1.854535328530838170114e-07 */
> > > > > -        .quad 0x3FEFFF8DBC68DDC7  /* A00 = +9.999455146670975791423e-01 */
> > > > > -        .quad 0x3EF26B2953A80AF0  /* A01 = +1.756534514108903368909e-05 */
> > > > > -        .quad 0xBEBFC4472D580F83  /* A02 = -1.893443529411295465239e-06 */
> > > > > -        .quad 0x3E72505B4553D19F  /* A03 = +6.822456673547912277047e-08 */
> > > > > -        .quad 0x3FEFFFCED1276609  /* A00 = +9.999765477215883935358e-01 */
> > > > > -        .quad 0x3EDE1A94C7CC58F5  /* A01 = +7.177313020153979672606e-06 */
> > > > > -        .quad 0xBEA8A2C988744E57  /* A02 = -7.342066660497443762363e-07 */
> > > > > -        .quad 0x3E5AF30036BBBAF4  /* A03 = +2.509841882843541084885e-08 */
> > > > > -        .quad 0x3FEFFFEAFE70FCFC  /* A00 = +9.999899835164849370983e-01 */
> > > > > -        .quad 0x3EC879175E3549F5  /* A01 = +2.917410471128503564412e-06 */
> > > > > -        .quad 0xBE930E36677D1813  /* A02 = -2.839493400307523115929e-07 */
> > > > > -        .quad 0x3E43D4005B42D48F  /* A03 = +9.233192745401904898013e-09 */
> > > > > -        .quad 0x3ff0000000000000
> > > > > -        .quad 0x0000000000000000
> > > > > -        .quad 0x0000000000000000
> > > > > -        .quad 0x0000000000000000
> > > > > -        .align 16
> > > > > -        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000           /* _sSignMask        */
> > > > > -        .align 16
> > > > > -        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff           /* _sAbsMask         */
> > > > > -        .align 16
> > > > > -        .long 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000           /* _iExpMantMask     */
> > > > > -        .align 16
> > > > > -        .long 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000           /* _iExpMask         */
> > > > > -        .align 16
> > > > > -        .long 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000           /* _iMinIdxOfsMask   */
> > > > > -        .align 16
> > > > > -        .long 0x04280000, 0x04280000, 0x04280000, 0x04280000           /* _iMaxIdxMask      */
> > > > > -        .align 16
> > > > > -        .type  __svml_stanh_data_internal,@object
> > > > > -        .size  __svml_stanh_data_internal,.-__svml_stanh_data_internal
> > > > > +       movups  (%rsi, %rax), %xmm2
> > > > > +       movups  (%rdi, %rax), %xmm7
> > > > > +
> > > > > +       movaps  %xmm2, %xmm3
> > > > > +
> > > > > +       unpckhpd %xmm7, %xmm2
> > > > > +       movlhps %xmm7, %xmm3
> > > > > +
> > > > > +       addpd   %xmm13, %xmm2
> > > > > +
> > > > > +       mulpd   %xmm5, %xmm6
> > > > > +       addpd   %xmm4, %xmm6
> > > > > +
> > > > > +       mulpd   %xmm2, %xmm0
> > > > > +       addpd   %xmm3, %xmm0
> > > > > +
> > > > > +       cvtpd2ps %xmm0, %xmm2
> > > > > +       cvtpd2ps %xmm6, %xmm0
> > > > > +
> > > > > +       movlhps %xmm2, %xmm0
> > > > > +       andnps  %xmm12, %xmm1
> > > > > +       orps    %xmm1, %xmm0
> > > > > +
> > > > > +       movmskps %xmm8, %edx
> > > > > +       testl   %edx, %edx
> > > > > +
> > > > > +       /* Go to special inputs processing branch.  */
> > > > > +       jne     L(SPECIAL_VALUES_BRANCH)
> > > > > +
> > > > > +       /* No stack restoration on the fastpath.  */
> > > > > +       ret
> > > > > +
> > > > > +L(SPECIAL_VALUES_BRANCH):
> > > > > +       subq    $48, %rsp
> > > > > +
> > > > > +       movups  %xmm0, (%rsp)
> > > > > +       movups  %xmm12, 16(%rsp)
> > > > > +
> > > > > +       movq    %r12, 32(%rsp)
> > > > > +       movq    %r13, 40(%rsp)
> > > > > +
> > > > > +       /* edx has 1s where there was a special value that needs to be handled
> > > > > +          by a tanhf call.  */
> > > > > +       movl    %edx, %r13d
> > > > > +L(SPECIAL_VALUES_LOOP):
> > > > > +       /* use r12 as index for special value that is saved across calls to
> > > > > +          tanhf. We technically don't need a callee save register here as offset
> > > > > +          to rsp is always [0, 12] so we can restore rsp by realigning to 64.
> > > > > +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> > > > > +          in the loop.  */
> > > > > +       xorl    %r12d, %r12d
> > > > > +       bsfl    %r13d, %r12d
> > > > > +
> > > > > +       /* Scalar math function call to process special input.  */
> > > > > +       movss   16(%rsp, %r12, 4), %xmm0
> > > > > +       call    tanhf@PLT
> > > > > +       /* No good way to avoid the store-forwarding fault this will cause on
> > > > > +          return. `lfence` avoids the SF fault but at greater cost as it
> > > > > +          serializes stack/callee save restoration.  */
> > > > > +       movss   %xmm0, (%rsp, %r12, 4)
> > > > > +
> > > > > +       leal    -1(%r13), %eax
> > > > > +       andl    %eax, %r13d
> > > > > +       jnz     L(SPECIAL_VALUES_LOOP)
> > > > > +
> > > > > +       /* All results have been written to (%rsp).  */
> > > > > +       movups  (%rsp), %xmm0
> > > > > +       movq    32(%rsp), %r12
> > > > > +       movq    40(%rsp), %r13
> > > > > +       addq    $48, %rsp
> > > > > +       ret
> > > > > +END(_ZGVbN4v_tanhf_sse4)
> > > > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
> > > > > index 3745db5aa4..90c3ea4cc6 100644
> > > > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
> > > > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
> > > > > @@ -70,775 +70,171 @@
> > > > >   *
> > > > >   */
> > > > >
> > > > > -/* Offsets for data table __svml_stanh_data_internal
> > > > > - */
> > > > > -#define _dbP                           0
> > > > > -#define _sSignMask                     4288
> > > > > -#define _sAbsMask                      4320
> > > > > -#define _iExpMantMask                  4352
> > > > > -#define _iExpMask                      4384
> > > > > -#define _iMinIdxOfsMask                4416
> > > > > -#define _iMaxIdxMask                   4448
> > > > > -
> > > > >  #include <sysdep.h>
> > > > > +#include "svml_s_tanhf_rodata.S"
> > > > >
> > > > >          .text
> > > > >         .section .text.avx2,"ax",@progbits
> > > > >  ENTRY(_ZGVdN8v_tanhf_avx2)
> > > > > -        pushq     %rbp
> > > > > -        cfi_def_cfa_offset(16)
> > > > > -        movq      %rsp, %rbp
> > > > > -        cfi_def_cfa(6, 16)
> > > > > -        cfi_offset(6, -16)
> > > > > -        andq      $-32, %rsp
> > > > > -        pushq     %r12
> > > > > -        subq      $120, %rsp
> > > > > -        lea       _dbP+16+__svml_stanh_data_internal(%rip), %r10
> > > > > -        vmovaps   %ymm0, %ymm12
> > > > > -
> > > > > -/* Here huge arguments, INF and NaNs are filtered out to callout. */
> > > > > -        vpand     _iExpMantMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm14
> > > > > +       /* Here huge arguments, INF and NaNs are filtered out to callout.  */
> > > > > +       vpand   TANHF_DATA(_iExpMantMask)(%rip), %ymm0, %ymm4
> > > > > +       vpsubd  TANHF_DATA(_iMinIdxOfsMask)(%rip), %ymm4, %ymm2
> > > > >
> > > > > -/*
> > > > > - *  small table specific variables *
> > > > > - *  Constant loading
> > > > > - */
> > > > > -        vmovups   _iMaxIdxMask+__svml_stanh_data_internal(%rip), %ymm8
> > > > > -        vpsubd    _iMinIdxOfsMask+__svml_stanh_data_internal(%rip), %ymm14, %ymm9
> > > > > -
> > > > > -/* if VMIN, VMAX is defined for I type */
> > > > > -        vxorps    %ymm15, %ymm15, %ymm15
> > > > > -        vpcmpgtd  %ymm15, %ymm9, %ymm0
> > > > > -        vpand     %ymm0, %ymm9, %ymm7
> > > > > -        vpcmpgtd  %ymm8, %ymm9, %ymm6
> > > > > -        vblendvps %ymm6, %ymm8, %ymm7, %ymm3
> > > > > -        vpsrld    $14, %ymm3, %ymm1
> > > > > -        vpcmpgtd  _iExpMask+__svml_stanh_data_internal(%rip), %ymm14, %ymm13
> > > > > -        vmovmskps %ymm13, %r11d
> > > > > -        vandps    _sAbsMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm10
> > > > > -        vandps    _sSignMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm11
> > > > > -        vextractf128 $1, %ymm1, %xmm2
> > > > > -        vmovd     %xmm1, %r9d
> > > > > -        vmovd     %xmm2, %ecx
> > > > > -        vpextrd   $1, %xmm2, %edx
> > > > > -        vpextrd   $1, %xmm1, %r8d
> > > > > -        movslq    %r9d, %r9
> > > > > -        movslq    %edx, %rdx
> > > > > -        movslq    %r8d, %r8
> > > > > -        vpextrd   $2, %xmm1, %edi
> > > > > -        movslq    %ecx, %rcx
> > > > > -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus)  */
> > > > > -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
> > > > > -        vpextrd   $3, %xmm2, %r12d
> > > > > -        vpextrd   $3, %xmm1, %esi
> > > > > -        vpextrd   $2, %xmm2, %eax
> > > > > -        movslq    %edi, %rdi
> > > > > -        movslq    %r12d, %r12
> > > > > -        movslq    %esi, %rsi
> > > > > -        movslq    %eax, %rax
> > > > > -        vmovupd   -16(%r9,%r10), %xmm5
> > > > > -        vmovupd   -16(%rdx,%r10), %xmm14
> > > > > -        vmovupd   -16(%rcx,%r10), %xmm13
> > > > > -        vmovupd   (%r9,%r10), %xmm1
> > > > > -        vmovupd   (%r8,%r10), %xmm2
> > > > > -        vmovupd   -16(%r8,%r10), %xmm4
> > > > > -        vinsertf128 $1, -16(%rdi,%r10), %ymm5, %ymm15
> > > > > -        vinsertf128 $1, -16(%r12,%r10), %ymm14, %ymm3
> > > > > -        vinsertf128 $1, -16(%rax,%r10), %ymm13, %ymm6
> > > > > -        vinsertf128 $1, (%rdi,%r10), %ymm1, %ymm5
> > > > > -        vinsertf128 $1, (%rsi,%r10), %ymm2, %ymm14
> > > > > -        vunpcklpd %ymm3, %ymm6, %ymm8
> > > > > -        vunpckhpd %ymm3, %ymm6, %ymm6
> > > > > -        vunpcklpd %ymm14, %ymm5, %ymm3
> > > > > -        vunpckhpd %ymm14, %ymm5, %ymm2
> > > > > -        vmovupd   (%rcx,%r10), %xmm13
> > > > > -        vcvtps2pd %xmm10, %ymm5
> > > > > -        vextractf128 $1, %ymm10, %xmm10
> > > > > -        vfmadd213pd %ymm3, %ymm5, %ymm2
> > > > > -        vinsertf128 $1, -16(%rsi,%r10), %ymm4, %ymm0
> > > > > -        vmovupd   (%rdx,%r10), %xmm4
> > > > > -        vunpcklpd %ymm0, %ymm15, %ymm9
> > > > > -        vunpckhpd %ymm0, %ymm15, %ymm7
> > > > > -        vfmadd213pd %ymm7, %ymm5, %ymm2
> > > > > -        vfmadd213pd %ymm9, %ymm5, %ymm2
> > > > > -        vinsertf128 $1, (%r12,%r10), %ymm4, %ymm0
> > > > > -        vcvtps2pd %xmm10, %ymm4
> > > > > -        vinsertf128 $1, (%rax,%r10), %ymm13, %ymm15
> > > > > -        vunpcklpd %ymm0, %ymm15, %ymm1
> > > > > -        vunpckhpd %ymm0, %ymm15, %ymm0
> > > > > -        vfmadd213pd %ymm1, %ymm4, %ymm0
> > > > > -        vcvtpd2ps %ymm2, %xmm1
> > > > > -        vfmadd213pd %ymm6, %ymm4, %ymm0
> > > > > -        vfmadd213pd %ymm8, %ymm4, %ymm0
> > > > > -        vcvtpd2ps %ymm0, %xmm0
> > > > > -        vinsertf128 $1, %xmm0, %ymm1, %ymm2
> > > > > -        vorps     %ymm11, %ymm2, %ymm0
> > > > > -        testl     %r11d, %r11d
> > > > > -
> > > > > -/* Go to special inputs processing branch */
> > > > > -        jne       L(SPECIAL_VALUES_BRANCH)
> > > > > -                                # LOE rbx r13 r14 r15 r11d ymm0 ymm12
> > > > > -
> > > > > -/* Restore registers
> > > > > - * and exit the function
> > > > > - */
> > > > > +       /* Selection of arguments between [0, 0x04280000] into ymm2.  */
> > > > > +       vpxor   %ymm3, %ymm3, %ymm3
> > > > > +       vpmaxsd %ymm3, %ymm2, %ymm2
> > > > > +       vpminsd TANHF_DATA(_iMaxIdxMask)(%rip), %ymm2, %ymm2
> > > > >
> > > > > -L(EXIT):
> > > > > -        addq      $120, %rsp
> > > > > -        cfi_restore(12)
> > > > > -        popq      %r12
> > > > > -        movq      %rbp, %rsp
> > > > > -        popq      %rbp
> > > > > -        cfi_def_cfa(7, 8)
> > > > > -        cfi_restore(6)
> > > > > -        ret
> > > > > -        cfi_def_cfa(6, 16)
> > > > > -        cfi_offset(6, -16)
> > > > > -        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus)  */
> > > > > -        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
> > > > > -
> > > > > -/* Branch to process
> > > > > - * special inputs
> > > > > - */
> > > > > +       vpsrld  $14, %ymm2, %ymm1
> > > > >
> > > > > -L(SPECIAL_VALUES_BRANCH):
> > > > > -        vmovups   %ymm12, 32(%rsp)
> > > > > -        vmovups   %ymm0, 64(%rsp)
> > > > > -                                # LOE rbx r13 r14 r15 r11d ymm0
> > > > > -
> > > > > -        xorl      %r12d, %r12d
> > > > > -                                # LOE rbx r13 r14 r15 r11d r12d
> > > > > -
> > > > > -        vzeroupper
> > > > > -        movq      %r13, 8(%rsp)
> > > > > -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
> > > > > -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
> > > > > -        movl      %r11d, %r13d
> > > > > -        movq      %r14, (%rsp)
> > > > > -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
> > > > > -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
> > > > > -                                # LOE rbx r15 r12d r13d
> > > > > -
> > > > > -/* Range mask
> > > > > - * bits check
> > > > > - */
> > > > > +       /* Store special cases in ymm15.  */
> > > > > +       vpcmpgtd TANHF_DATA(_iExpMask)(%rip), %ymm4, %ymm15
> > > > >
> > > > > -L(RANGEMASK_CHECK):
> > > > > -        btl       %r12d, %r13d
> > > > >
> > > > > -/* Call scalar math function */
> > > > > -        jc        L(SCALAR_MATH_CALL)
> > > > > -                                # LOE rbx r15 r12d r13d
> > > > > +       /* Store base of lookup table in rax.  */
> > > > > +       leaq    TANHF_DATA(_lookupTable)(%rip), %rax
> > > > >
> > > > > -/* Special inputs
> > > > > - * processing loop
> > > > > - */
> > > > > +       /* We are splitting ymm1 into 8 GPRs. This may be faster to do with
> > > > > +          store/load as we can take advantage of store-forwarding.  */
> > > > > +       vmovq   %xmm1, %r8
> > > > > +       /* We have eliminated all negative values for ymm1 so no need to sign
> > > > > +          extend.  */
> > > > > +       movl    %r8d, %r9d
> > > > > +       shrq    $32, %r8
> > > > >
> > > > > -L(SPECIAL_VALUES_LOOP):
> > > > > -        incl      %r12d
> > > > > -        cmpl      $8, %r12d
> > > > > -
> > > > > -/* Check bits in range mask */
> > > > > -        jl        L(RANGEMASK_CHECK)
> > > > > -                                # LOE rbx r15 r12d r13d
> > > > > -
> > > > > -        movq      8(%rsp), %r13
> > > > > -        cfi_restore(13)
> > > > > -        movq      (%rsp), %r14
> > > > > -        cfi_restore(14)
> > > > > -        vmovups   64(%rsp), %ymm0
> > > > > -
> > > > > -/* Go to exit */
> > > > > -        jmp       L(EXIT)
> > > > > -        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
> > > > > -        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
> > > > > -        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
> > > > > -        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
> > > > > -                                # LOE rbx r13 r14 r15 ymm0
> > > > > -
> > > > > -/* Scalar math fucntion call
> > > > > - * to process special input
> > > > > - */
> > > > > +       /* Instead of using cross-lane permutes on ymm vectors, use vinsertf128
> > > > > +          with memory operand. This helps alleviate bottleneck on p5.  */
> > > > > +       vmovdqu 16(%r9, %rax), %xmm5
> > > > >
> > > > > -L(SCALAR_MATH_CALL):
> > > > > -        movl      %r12d, %r14d
> > > > > -        movss     32(%rsp,%r14,4), %xmm0
> > > > > -        call      tanhf@PLT
> > > > > -                                # LOE rbx r14 r15 r12d r13d xmm0
> > > > > +       vpextrq $1, %xmm1, %rsi
> > > > > +       movl    %esi, %edi
> > > > > +       shrq    $32, %rsi
> > > > >
> > > > > -        movss     %xmm0, 64(%rsp,%r14,4)
> > > > > +       vinsertf128 $1, 16(%rdi, %rax), %ymm5, %ymm5
> > > > >
> > > > > -/* Process special inputs in loop */
> > > > > -        jmp       L(SPECIAL_VALUES_LOOP)
> > > > > -                                # LOE rbx r15 r12d r13d
> > > > > -END(_ZGVdN8v_tanhf_avx2)
> > > > > +       vextracti128 $1, %ymm1, %xmm2
> > > > > +       vmovq   %xmm2, %rdx
> > > > > +       movl    %edx, %ecx
> > > > > +       shrq    $32, %rdx
> > > > > +
> > > > > +       vmovdqu (%rcx, %rax), %xmm6
> > > > > +
> > > > > +       vpextrq $1, %xmm2, %r10
> > > > > +       movl    %r10d, %r11d
> > > > > +       shrq    $32, %r10
> > > > > +
> > > > > +       vinsertf128 $1, (%r11, %rax), %ymm6, %ymm6
> > > > > +
> > > > > +       vmovupd 16(%r8, %rax), %xmm1
> > > > > +       vinsertf128 $1, 16(%rsi, %rax), %ymm1, %ymm1
> > > > > +       vmovupd (%rdx, %rax), %xmm3
> > > > > +       vinsertf128 $1, (%r10, %rax), %ymm3, %ymm3
> > > > > +
> > > > > +       vunpcklpd %ymm3, %ymm6, %ymm7
> > > > > +       vunpckhpd %ymm3, %ymm6, %ymm6
> > > > > +
> > > > > +       vunpcklpd %ymm1, %ymm5, %ymm3
> > > > > +       vunpckhpd %ymm1, %ymm5, %ymm1
> > > > > +
> > > > > +       vmovaps TANHF_DATA(_sAbsMask)(%rip), %ymm11
> > > > > +       vandps  %ymm11, %ymm0, %ymm4
> > > > >
> > > > > -        .section .rodata, "a"
> > > > > -        .align 32
> > > > > -
> > > > > -#ifdef __svml_stanh_data_internal_typedef
> > > > > -typedef unsigned int VUINT32;
> > > > > -typedef struct
> > > > > -{
> > > > > -        __declspec(align(32)) VUINT32 _dbP[(134*4)][2];
> > > > > -        __declspec(align(32)) VUINT32 _sSignMask[8][1];
> > > > > -        __declspec(align(32)) VUINT32 _sAbsMask[8][1];
> > > > > -        __declspec(align(32)) VUINT32 _iExpMantMask[8][1];
> > > > > -        __declspec(align(32)) VUINT32 _iExpMask[8][1];
> > > > > -        __declspec(align(32)) VUINT32 _iMinIdxOfsMask[8][1];
> > > > > -        __declspec(align(32)) VUINT32 _iMaxIdxMask[8][1];
> > > > > -} __svml_stanh_data_internal;
> > > > > -#endif
> > > > > -__svml_stanh_data_internal:
> > > > > -        /* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500]. */
> > > > > -        .quad 0x0000000000000000  /* A00 = +0.000000000000000000000e-01 */
> > > > > -        .quad 0x3FF00000022C70EB  /* A01 = +1.000000008097283510367e+00 */
> > > > > -        .quad 0xBED00E878CFFA194  /* A02 = -3.828228912518614443549e-06 */
> > > > > -        .quad 0xBFD551766D0607A9  /* A03 = -3.330970825846813476723e-01 */
> > > > > -        .quad 0xBE53D60CE3E4C297  /* A00 = -1.847383956330407336230e-08 */
> > > > > -        .quad 0x3FF000024177CF5C  /* A01 = +1.000002151235967140508e+00 */
> > > > > -        .quad 0xBF1758BC94A51A25  /* A02 = -8.906031613262943753568e-05 */
> > > > > -        .quad 0xBFD53EAE67E0D4F0  /* A03 = -3.319507612644221339337e-01 */
> > > > > -        .quad 0xBE5A9E47EF32D6FE  /* A00 = -2.479020984039698285657e-08 */
> > > > > -        .quad 0x3FF00002DA983057  /* A01 = +1.000002721676556793895e+00 */
> > > > > -        .quad 0xBF1BD953509E94AA  /* A02 = -1.062352277175377670507e-04 */
> > > > > -        .quad 0xBFD53BDB562EEDD5  /* A03 = -3.317783681520414806876e-01 */
> > > > > -        .quad 0xBE6191BBE496D294  /* A00 = -3.272532162914017685901e-08 */
> > > > > -        .quad 0x3FF0000390492017  /* A01 = +1.000003398528866105366e+00 */
> > > > > -        .quad 0xBF20727E814A57CE  /* A02 = -1.254825043772153972919e-04 */
> > > > > -        .quad 0xBFD538DE060A6F22  /* A03 = -3.315959033004550748913e-01 */
> > > > > -        .quad 0xBE66DAFA2A893A25  /* A00 = -4.257146219278012568149e-08 */
> > > > > -        .quad 0x3FF0000465E08CD1  /* A01 = +1.000004194219219266770e+00 */
> > > > > -        .quad 0xBF2341C765EF91B6  /* A02 = -1.469188600530365522261e-04 */
> > > > > -        .quad 0xBFD535B6841FAF9E  /* A03 = -3.314033785124993469751e-01 */
> > > > > -        .quad 0xBE6D5794E361E964  /* A00 = -5.465394929765249413434e-08 */
> > > > > -        .quad 0x3FF000055EE2A0CB  /* A01 = +1.000005121846742950353e+00 */
> > > > > -        .quad 0xBF265E6C77E66C8B  /* A02 = -1.706607253709506650304e-04 */
> > > > > -        .quad 0xBFD53264DDCCEDA6  /* A03 = -3.312008062382240103361e-01 */
> > > > > -        .quad 0xBE729C844D374A6E  /* A00 = -6.933284462462096107184e-08 */
> > > > > -        .quad 0x3FF000067F019093  /* A01 = +1.000006195180536350264e+00 */
> > > > > -        .quad 0xBF29CC5348D6DCE5  /* A02 = -1.968242326435338705130e-04 */
> > > > > -        .quad 0xBFD52EE92121ED35  /* A03 = -3.309881995734998416658e-01 */
> > > > > -        .quad 0xBE775AEA17EAA872  /* A00 = -8.700465590574974405858e-08 */
> > > > > -        .quad 0x3FF00007CA1D66B8  /* A01 = +1.000007428656699559610e+00 */
> > > > > -        .quad 0xBF2D8F5EB98A2637  /* A02 = -2.255252009216044881395e-04 */
> > > > > -        .quad 0xBFD52B435CDF9128  /* A03 = -3.307655722585587376727e-01 */
> > > > > -        .quad 0xBE7D04DA28C343F0  /* A00 = -1.081040272327705484794e-07 */
> > > > > -        .quad 0x3FF000094443CCF5  /* A01 = +1.000008837375216730337e+00 */
> > > > > -        .quad 0xBF30D5B76C947AE5  /* A02 = -2.568791210978817814332e-04 */
> > > > > -        .quad 0xBFD52773A0776FAD  /* A03 = -3.305329386764651045105e-01 */
> > > > > -        .quad 0xBE81DD77A12C51C7  /* A00 = -1.331054169875768625701e-07 */
> > > > > -        .quad 0x3FF0000AF1AFD2DA  /* A01 = +1.000010437096696680470e+00 */
> > > > > -        .quad 0xBF331230624C1680  /* A02 = -2.910011410651516805537e-04 */
> > > > > -        .quad 0xBFD52379FC0B61DF  /* A03 = -3.302903138515186909352e-01 */
> > > > > -        .quad 0xBE85D04EEEB3C435  /* A00 = -1.625247628488202841012e-07 */
> > > > > -        .quad 0x3FF0000CD6C9B1F2  /* A01 = +1.000012244238970726684e+00 */
> > > > > -        .quad 0xBF357F0742FADDD4  /* A02 = -3.280060509313874068243e-04 */
> > > > > -        .quad 0xBFD51F56806D0E81  /* A03 = -3.300377134475880880338e-01 */
> > > > > -        .quad 0xBE8A6E289B59681B  /* A00 = -1.969211333326924655065e-07 */
> > > > > -        .quad 0x3FF0000EF8268F72  /* A01 = +1.000014275873550406715e+00 */
> > > > > -        .quad 0xBF381E277A1B747A  /* A02 = -3.680082682942575423093e-04 */
> > > > > -        .quad 0xBFD51B093F1D6FD4  /* A03 = -3.297751537663746734808e-01 */
> > > > > -        .quad 0xBE8FCBC40EE9ABD5  /* A00 = -2.368983653301529373887e-07 */
> > > > > -        .quad 0x3FF000115A883B6C  /* A01 = +1.000016549721943981410e+00 */
> > > > > -        .quad 0xBF3AF17AC974B3D9  /* A02 = -4.111218235774406434303e-04 */
> > > > > -        .quad 0xBFD516924A4C549C  /* A03 = -3.295026517456081105450e-01 */
> > > > > -        .quad 0xBE92FFBC60A3F956  /* A00 = -2.831066871072026054144e-07 */
> > > > > -        .quad 0x3FF0001402DCED8A  /* A01 = +1.000019084151832604590e+00 */
> > > > > -        .quad 0xBF3DFAE9390C4801  /* A02 = -4.574603454311488280083e-04 */
> > > > > -        .quad 0xBFD511F1B4D7DC3A  /* A03 = -3.292202249571719585575e-01 */
> > > > > -        .quad 0xBE9690A22F96D5AD  /* A00 = -3.362443262393081632612e-07 */
> > > > > -        .quad 0x3FF00016F63EFF5D  /* A01 = +1.000021898173108825247e+00 */
> > > > > -        .quad 0xBF409E2C839605BB  /* A02 = -5.071370461992499986334e-04 */
> > > > > -        .quad 0xBFD50D27924BEE00  /* A03 = -3.289278916051614487515e-01 */
> > > > > -        .quad 0xBE9AA56C65E72A73  /* A00 = -3.970591019557469835586e-07 */
> > > > > -        .quad 0x3FF0001A39F4A43E  /* A01 = +1.000025011433776978009e+00 */
> > > > > -        .quad 0xBF425BD74C3D6667  /* A02 = -5.602647074553602319844e-04 */
> > > > > -        .quad 0xBFD50833F6E1ABA2  /* A03 = -3.286256705238718156536e-01 */
> > > > > -        .quad 0xBE9F4BD4FF1A83B0  /* A00 = -4.663500013744687071912e-07 */
> > > > > -        .quad 0x3FF0001DD36F9EC2  /* A01 = +1.000028444215715683896e+00 */
> > > > > -        .quad 0xBF44376634149405  /* A02 = -6.169556656102642569831e-04 */
> > > > > -        .quad 0xBFD50316F77EDEE5  /* A03 = -3.283135811757190158922e-01 */
> > > > > -        .quad 0xBEA3B625387BB079  /* A00 = -5.874486399249461304297e-07 */
> > > > > -        .quad 0x3FF00023E14CFBA9  /* A01 = +1.000034217911642153709e+00 */
> > > > > -        .quad 0xBF47392F923218D2  /* A02 = -7.087213783883111826306e-04 */
> > > > > -        .quad 0xBFD4FB1FACDEB938  /* A03 = -3.278273761924483942209e-01 */
> > > > > -        .quad 0xBEAA6E24F543500A  /* A00 = -7.876828740601738750574e-07 */
> > > > > -        .quad 0x3FF0002D5C6E8412  /* A01 = +1.000043259679163742959e+00 */
> > > > > -        .quad 0xBF4BAF02BD7FDD70  /* A02 = -8.448375110664940040861e-04 */
> > > > > -        .quad 0xBFD4EFEE6527A7DE  /* A03 = -3.271442401734229177279e-01 */
> > > > > -        .quad 0xBEB16E3EBE2157D0  /* A00 = -1.038947396133402500647e-06 */
> > > > > -        .quad 0x3FF00038990FEE2F  /* A01 = +1.000053975962952312884e+00 */
> > > > > -        .quad 0xBF50569481C574CB  /* A02 = -9.972048056490652716971e-04 */
> > > > > -        .quad 0xBFD4E419278DA2B4  /* A03 = -3.264220129263251113372e-01 */
> > > > > -        .quad 0xBEB6A7B6723165D4  /* A00 = -1.350350836279403750524e-06 */
> > > > > -        .quad 0x3FF00045CAB4158E  /* A01 = +1.000066558657042303793e+00 */
> > > > > -        .quad 0xBF531D7C9C849108  /* A02 = -1.166698160951775212202e-03 */
> > > > > -        .quad 0xBFD4D7A0BB33B152  /* A03 = -3.256608799117844954552e-01 */
> > > > > -        .quad 0xBEBD0EE2A8654AFD  /* A00 = -1.732000471561702711532e-06 */
> > > > > -        .quad 0x3FF00055276F18D6  /* A01 = +1.000081209219890521211e+00 */
> > > > > -        .quad 0xBF562FDBA3FB6C6C  /* A02 = -1.354183666925102939860e-03 */
> > > > > -        .quad 0xBFD4CA85F1B93DB2  /* A03 = -3.248610363561638125773e-01 */
> > > > > -        .quad 0xBEC269D4036A207E  /* A00 = -2.195047297096822741730e-06 */
> > > > > -        .quad 0x3FF00066E7DA6E4E  /* A01 = +1.000098138500919997540e+00 */
> > > > > -        .quad 0xBF5991499FC36B3A  /* A02 = -1.560518167983372759405e-03 */
> > > > > -        .quad 0xBFD4BCC9A72283D6  /* A03 = -3.240226871658341556426e-01 */
> > > > > -        .quad 0xBEC7154B6C09CFE1  /* A00 = -2.751729738565190291276e-06 */
> > > > > -        .quad 0x3FF0007B47086B80  /* A01 = +1.000117566559055148900e+00 */
> > > > > -        .quad 0xBF5D455433B4F8F4  /* A02 = -1.786548832412968197680e-03 */
> > > > > -        .quad 0xBFD4AE6CC1BFE145  /* A03 = -3.231460468373550942722e-01 */
> > > > > -        .quad 0xBECCA68CC64A0F8A  /* A00 = -3.415415948561670285790e-06 */
> > > > > -        .quad 0x3FF00092827742F7  /* A01 = +1.000139722473418535387e+00 */
> > > > > -        .quad 0xBF60A7BF15A527AF  /* A02 = -2.033112728132522705610e-03 */
> > > > > -        .quad 0xBFD49F703214084C  /* A03 = -3.222313393636155876010e-01 */
> > > > > -        .quad 0xBED19E68676B241B  /* A00 = -4.200644630977303616698e-06 */
> > > > > -        .quad 0x3FF000ACDA037B26  /* A01 = +1.000164844146362863597e+00 */
> > > > > -        .quad 0xBF62D99F836A02F8  /* A02 = -2.301036405072284102280e-03 */
> > > > > -        .quad 0xBFD48FD4F2B91B28  /* A03 = -3.212787981359945810311e-01 */
> > > > > -        .quad 0xBED57CF4B0C7AA54  /* A00 = -5.123164339408145209103e-06 */
> > > > > -        .quad 0x3FF000CA8FD9E1A1  /* A01 = +1.000193178099017865534e+00 */
> > > > > -        .quad 0xBF653A014548E686  /* A02 = -2.591135484433962181405e-03 */
> > > > > -        .quad 0xBFD47F9C0844B38F  /* A03 = -3.202886658426046806447e-01 */
> > > > > -        .quad 0xBEDA012B1B1A41E2  /* A00 = -6.199971197454598722328e-06 */
> > > > > -        .quad 0x3FF000EBE868FDF4  /* A01 = +1.000224979259539459520e+00 */
> > > > > -        .quad 0xBF67CA9427E0A544  /* A02 = -2.904214255086275467410e-03 */
> > > > > -        .quad 0xBFD46EC6812ADB37  /* A03 = -3.192611943626845749655e-01 */
> > > > > -        .quad 0xBEDF3EAC5BF12194  /* A00 = -7.449344990702664567927e-06 */
> > > > > -        .quad 0x3FF001112A520784  /* A01 = +1.000260510744255704196e+00 */
> > > > > -        .quad 0xBF6A8D01ABDA4DC4  /* A02 = -3.241065277345108255891e-03 */
> > > > > -        .quad 0xBFD45D55759FFA4A  /* A03 = -3.181966446572103146551e-01 */
> > > > > -        .quad 0xBEE2A541BC274267  /* A00 = -8.890883582164319970972e-06 */
> > > > > -        .quad 0x3FF0013A9E5961F2  /* A01 = +1.000300043631906721231e+00 */
> > > > > -        .quad 0xBF6D82ECD080C540  /* A02 = -3.602468994380686462264e-03 */
> > > > > -        .quad 0xBFD44B4A0779C0AD  /* A03 = -3.170952866557950611259e-01 */
> > > > > -        .quad 0xBEE61D97609A27F4  /* A00 = -1.054553560499505625520e-05 */
> > > > > -        .quad 0x3FF001688F56A3AF  /* A01 = +1.000343856731187974773e+00 */
> > > > > -        .quad 0xBF7056F8EFB683EC  /* A02 = -3.989193351487490407647e-03 */
> > > > > -        .quad 0xBFD438A5620F0F74  /* A03 = -3.159573991399533543500e-01 */
> > > > > -        .quad 0xBEEA145429EDD370  /* A00 = -1.243563138839952927732e-05 */
> > > > > -        .quad 0x3FF0019B4A242A67  /* A01 = +1.000392236341804297339e+00 */
> > > > > -        .quad 0xBF7207D31CA78D9B  /* A02 = -4.401993423445739288258e-03 */
> > > > > -        .quad 0xBFD42568BA16E7CD  /* A03 = -3.147832696228050619602e-01 */
> > > > > -        .quad 0xBEEE96370D52680F  /* A00 = -1.458491207477835326165e-05 */
> > > > > -        .quad 0x3FF001D31D8E4115  /* A01 = +1.000445476009251821736e+00 */
> > > > > -        .quad 0xBF73D4CC11EDC094  /* A02 = -4.841611050196221316400e-03 */
> > > > > -        .quad 0xBFD411954D8664E7  /* A03 = -3.135731942252974469021e-01 */
> > > > > -        .quad 0xBEF338C046215EF8  /* A00 = -1.833122622260562810219e-05 */
> > > > > -        .quad 0x3FF00230C32C2EC1  /* A01 = +1.000534784691737621998e+00 */
> > > > > -        .quad 0xBF76BD019BCC5DAF  /* A02 = -5.551344188254799492943e-03 */
> > > > > -        .quad 0xBFD3F2C7156DC21E  /* A03 = -3.116929730668135389848e-01 */
> > > > > -        .quad 0xBEF9B15EAE411EAE  /* A00 = -2.450261207822986676092e-05 */
> > > > > -        .quad 0x3FF002C2DF057A4D  /* A01 = +1.000674124886830940184e+00 */
> > > > > -        .quad 0xBF7B08CCD9AC1E30  /* A02 = -6.600189396301511801646e-03 */
> > > > > -        .quad 0xBFD3C7A7A114FED8  /* A03 = -3.090609620157755976777e-01 */
> > > > > -        .quad 0xBF00E36483C373B3  /* A00 = -3.221178528332122595812e-05 */
> > > > > -        .quad 0x3FF0036F419480D7  /* A01 = +1.000838524028997644777e+00 */
> > > > > -        .quad 0xBF7FD255D1777007  /* A02 = -7.768950679260206403087e-03 */
> > > > > -        .quad 0xBFD39A453911D6CE  /* A03 = -3.062909180947429588215e-01 */
> > > > > -        .quad 0xBF05DFA04DD12059  /* A00 = -4.172046622180685472624e-05 */
> > > > > -        .quad 0x3FF00438B2A03D8D  /* A01 = +1.001030633695197069599e+00 */
> > > > > -        .quad 0xBF828F8DBB4A9D10  /* A02 = -9.062869337255224921890e-03 */
> > > > > -        .quad 0xBFD36AAB704697D9  /* A03 = -3.033856007044711255993e-01 */
> > > > > -        .quad 0xBF0BF3E0C647DEFB  /* A00 = -5.331544597092331081714e-05 */
> > > > > -        .quad 0x3FF005221063D36D  /* A01 = +1.001253189109060359741e+00 */
> > > > > -        .quad 0xBF857A2CB3C96102  /* A02 = -1.048693584122917590862e-02 */
> > > > > -        .quad 0xBFD338E65BBB4FEC  /* A03 = -3.003478904549854444639e-01 */
> > > > > -        .quad 0xBF11A506ED7C9D31  /* A00 = -6.730894835681591541979e-05 */
> > > > > -        .quad 0x3FF0062E4D0EA92A  /* A01 = +1.001508999829250345925e+00 */
> > > > > -        .quad 0xBF88AB82C2761AF3  /* A02 = -1.204588085125866091241e-02 */
> > > > > -        .quad 0xBFD305028D6BD206  /* A03 = -2.971807843271395688234e-01 */
> > > > > -        .quad 0xBF1607C0922D9BF1  /* A00 = -8.403885708006799337092e-05 */
> > > > > -        .quad 0x3FF007606C341961  /* A01 = +1.001800940198869449560e+00 */
> > > > > -        .quad 0xBF8C25E6DA487BCF  /* A02 = -1.374416688582682892494e-02 */
> > > > > -        .quad 0xBFD2CF0D0EE8F7B5  /* A03 = -2.938873906713255768075e-01 */
> > > > > -        .quad 0xBF1B3A8480A0A16D  /* A00 = -1.038688061788578038307e-04 */
> > > > > -        .quad 0x3FF008BB802D02D6  /* A01 = +1.002131939589323561535e+00 */
> > > > > -        .quad 0xBF8FEB8AE99FD100  /* A02 = -1.558598065819483124983e-02 */
> > > > > -        .quad 0xBFD297135BD0911B  /* A03 = -2.904709240558688843059e-01 */
> > > > > -        .quad 0xBF20ABB9BDB75C65  /* A00 = -1.271881327357976163798e-04 */
> > > > > -        .quad 0x3FF00A42A76D8CD1  /* A01 = +1.002504972472525901495e+00 */
> > > > > -        .quad 0xBF91FF3D752BB9E6  /* A02 = -1.757522609380570560722e-02 */
> > > > > -        .quad 0xBFD25D235C1F88B4  /* A03 = -2.869346999779154305799e-01 */
> > > > > -        .quad 0xBF243D3254425461  /* A00 = -1.544116913733432829448e-04 */
> > > > > -        .quad 0x3FF00BF909D1795E  /* A01 = +1.002923048355647051011e+00 */
> > > > > -        .quad 0xBF94304E04D44942  /* A02 = -1.971551804042204897316e-02 */
> > > > > -        .quad 0xBFD2214B5E61CFA6  /* A03 = -2.832821294498394371075e-01 */
> > > > > -        .quad 0xBF286070011B61CE  /* A00 = -1.859795307186510085994e-04 */
> > > > > -        .quad 0x3FF00DE1D5E1627E  /* A01 = +1.003389201612804537689e+00 */
> > > > > -        .quad 0xBF9689D5F4163F59  /* A02 = -2.201017668045266231780e-02 */
> > > > > -        .quad 0xBFD1E39A11C3B42C  /* A03 = -2.795167134743816728104e-01 */
> > > > > -        .quad 0xBF2D250B366A79E8  /* A00 = -2.223564326486314902259e-04 */
> > > > > -        .quad 0x3FF010003E134001  /* A01 = +1.003906481248123094829e+00 */
> > > > > -        .quad 0xBF990C9FF91F6F81  /* A02 = -2.446222265267250853271e-02 */
> > > > > -        .quad 0xBFD1A41E80084CDC  /* A03 = -2.756420374218586655246e-01 */
> > > > > -        .quad 0xBF314DB5DDC2A30E  /* A00 = -2.640313157465248123865e-04 */
> > > > > -        .quad 0x3FF012577608921B  /* A01 = +1.004477940624503018441e+00 */
> > > > > -        .quad 0xBF9BB9626875B0C9  /* A02 = -2.707437288829409385849e-02 */
> > > > > -        .quad 0xBFD162E80768A9D0  /* A03 = -2.716617653228725615122e-01 */
> > > > > -        .quad 0xBF346A6133808864  /* A00 = -3.115165050094957730625e-04 */
> > > > > -        .quad 0x3FF014EAAFCC88A3  /* A01 = +1.005106627192198898157e+00 */
> > > > > -        .quad 0xBF9E90BEF9BF7419  /* A02 = -2.984903716411588595059e-02 */
> > > > > -        .quad 0xBFD12006545F7FAD  /* A03 = -2.675796340899932457269e-01 */
> > > > > -        .quad 0xBF37F180DC3848EA  /* A00 = -3.653468704395550778821e-04 */
> > > > > -        .quad 0x3FF017BD19147861  /* A01 = +1.005795572250939295955e+00 */
> > > > > -        .quad 0xBFA0C9A14C702E07  /* A02 = -3.278831537326359207851e-02 */
> > > > > -        .quad 0xBFD0DB895B650092  /* A03 = -2.633994476818851682154e-01 */
> > > > > -        .quad 0xBF3BEC6AAC6D7635  /* A00 = -4.260788377246944457107e-04 */
> > > > > -        .quad 0x3FF01AD1D884E719  /* A01 = +1.006547780778822565040e+00 */
> > > > > -        .quad 0xBFA260B2A1B1434A  /* A02 = -3.589399551186163439542e-02 */
> > > > > -        .quad 0xBFD09581529E93D6  /* A03 = -2.591250712233067465817e-01 */
> > > > > -        .quad 0xBF4164E26167882B  /* A00 = -5.308251737086202562063e-04 */
> > > > > -        .quad 0x3FF01FEF14B62B81  /* A01 = +1.007796364693348545316e+00 */
> > > > > -        .quad 0xBFA4EB014538AA42  /* A02 = -4.085544557559163403315e-02 */
> > > > > -        .quad 0xBFD029D36FEAF41F  /* A03 = -2.525528519580024222613e-01 */
> > > > > -        .quad 0xBF46F6FFF4E53DC8  /* A00 = -7.008313930700277652464e-04 */
> > > > > -        .quad 0x3FF027CBB51CBBA0  /* A01 = +1.009715754956893363214e+00 */
> > > > > -        .quad 0xBFA89DEC9FEC112E  /* A02 = -4.807986690687680864098e-02 */
> > > > > -        .quad 0xBFCF2A99464D0DB4  /* A03 = -2.434875100390009317053e-01 */
> > > > > -        .quad 0xBF4DCC9C4F66A4D9  /* A00 = -9.094012482836712945103e-04 */
> > > > > -        .quad 0x3FF030E7CFCCD583  /* A01 = +1.011939822882909068014e+00 */
> > > > > -        .quad 0xBFACAA3B95814081  /* A02 = -5.598627281199331645611e-02 */
> > > > > -        .quad 0xBFCDF78F156BE7CF  /* A03 = -2.341173987004467604844e-01 */
> > > > > -        .quad 0xBF5308ED74E5C7A6  /* A00 = -1.161796466103906435435e-03 */
> > > > > -        .quad 0x3FF03B5986412ECB  /* A01 = +1.014489674026594512313e+00 */
> > > > > -        .quad 0xBFB087EBA88DCC3F  /* A02 = -6.457398285947223148806e-02 */
> > > > > -        .quad 0xBFCCBB9BD134862F  /* A03 = -2.244753619680052991736e-01 */
> > > > > -        .quad 0xBF57FA23C00DF4B5  /* A00 = -1.463446533505758208674e-03 */
> > > > > -        .quad 0x3FF0473558A1BCC0  /* A01 = +1.017384859292903342975e+00 */
> > > > > -        .quad 0xBFB2E702BC6360EF  /* A02 = -7.383744334527241048871e-02 */
> > > > > -        .quad 0xBFCB77D546379288  /* A03 = -2.145945160729250122955e-01 */
> > > > > -        .quad 0xBF5DD12971557F71  /* A00 = -1.819887610814388068450e-03 */
> > > > > -        .quad 0x3FF0548DDF5000A8  /* A01 = +1.020643112482540360020e+00 */
> > > > > -        .quad 0xBFB571B63DA186E1  /* A02 = -8.376635555898871710045e-02 */
> > > > > -        .quad 0xBFCA2D5202605148  /* A03 = -2.045080672838912594358e-01 */
> > > > > -        .quad 0xBF6252B1AD5D4F17  /* A00 = -2.236697221556737096709e-03 */
> > > > > -        .quad 0x3FF063738A910BF7  /* A01 = +1.024280110622155737232e+00 */
> > > > > -        .quad 0xBFB8270C8E6B601B  /* A02 = -9.434584118878357184013e-02 */
> > > > > -        .quad 0xBFC8DD27D950A07E  /* A03 = -1.942491351230763441116e-01 */
> > > > > -        .quad 0xBF66470C91730CFC  /* A00 = -2.719425723258004842786e-03 */
> > > > > -        .quad 0x3FF073F468FCF331  /* A01 = +1.028309259519300633556e+00 */
> > > > > -        .quad 0xBFBB05C2952191E4  /* A02 = -1.055566419686964629854e-01 */
> > > > > -        .quad 0xBFC7886A770DE2BD  /* A03 = -1.838505822486435070662e-01 */
> > > > > -        .quad 0xBF6AD114AC8E98EC  /* A00 = -3.273525599485007861467e-03 */
> > > > > -        .quad 0x3FF0861BF53E5226  /* A01 = +1.032741506559554434119e+00 */
> > > > > -        .quad 0xBFBE0C4F9B461507  /* A02 = -1.173753503881763554650e-01 */
> > > > > -        .quad 0xBFC6302A037CDE3A  /* A03 = -1.733448521642786954722e-01 */
> > > > > -        .quad 0xBF6FFBDE2A6C2AF8  /* A00 = -3.904279630096648551207e-03 */
> > > > > -        .quad 0x3FF099F2EB8E7DA3  /* A01 = +1.037585182326304034106e+00 */
> > > > > -        .quad 0xBFC09C74D192DDF0  /* A02 = -1.297746680554463516444e-01 */
> > > > > -        .quad 0xBFC4D571D8E3079F  /* A03 = -1.627638157861470424859e-01 */
> > > > > -        .quad 0xBF72E8FDC0B952AA  /* A00 = -4.616728994353872309042e-03 */
> > > > > -        .quad 0x3FF0AF7F273C9533  /* A01 = +1.042845872181101141152e+00 */
> > > > > -        .quad 0xBFC244C512736F10  /* A02 = -1.427236881344176033792e-01 */
> > > > > -        .quad 0xBFC379474F58B902  /* A03 = -1.521386277613104298645e-01 */
> > > > > -        .quad 0xBF762EABAF17395B  /* A00 = -5.415602341101023557701e-03 */
> > > > > -        .quad 0x3FF0C6C3886F63FB  /* A01 = +1.048526318502125631582e+00 */
> > > > > -        .quad 0xBFC3FDF9918EA12A  /* A02 = -1.561881981590514389957e-01 */
> > > > > -        .quad 0xBFC21CA89ECAB895  /* A03 = -1.414995932913753196036e-01 */
> > > > > -        .quad 0xBF79D387CE5B2BAE  /* A00 = -6.305246822828998107258e-03 */
> > > > > -        .quad 0x3FF0DFBFE2346376  /* A01 = +1.054626353847394337748e+00 */
> > > > > -        .quad 0xBFC5C6DA43602620  /* A02 = -1.701309994680721970894e-01 */
> > > > > -        .quad 0xBFC0C08BD8DB6631  /* A03 = -1.308760460731704100557e-01 */
> > > > > -        .quad 0xBF7DDBA8E8DA9060  /* A00 = -7.289562037531366334164e-03 */
> > > > > -        .quad 0x3FF0FA70F0D1B464  /* A01 = +1.061142864894713433443e+00 */
> > > > > -        .quad 0xBFC79E18D92BAA7C  /* A02 = -1.845122394946264732241e-01 */
> > > > > -        .quad 0xBFBECBBBF74C2669  /* A03 = -1.202962378266875381749e-01 */
> > > > > -        .quad 0xBF81254E76EA25DA  /* A00 = -8.371937755572145950511e-03 */
> > > > > -        .quad 0x3FF116D05835EBD0  /* A01 = +1.068069786618014660462e+00 */
> > > > > -        .quad 0xBFC982539E2ED224  /* A02 = -1.992897531869327609755e-01 */
> > > > > -        .quad 0xBFBC1B043C350159  /* A03 = -1.097872397413132278254e-01 */
> > > > > -        .quad 0xBF8391ACBA863403  /* A00 = -9.555196230190082448686e-03 */
> > > > > -        .quad 0x3FF134D4AA477FE2  /* A01 = +1.075398125794884141015e+00 */
> > > > > -        .quad 0xBFCB7218609FEAFB  /* A02 = -2.144194099235717521079e-01 */
> > > > > -        .quad 0xBFB970A16CB88329  /* A03 = -9.937485603633135211599e-02 */
> > > > > -        .quad 0xBF87935088E48E8B  /* A00 = -1.151144902957603431692e-02 */
> > > > > -        .quad 0x3FF1649892AD7DD3  /* A01 = +1.087059567413110938716e+00 */
> > > > > -        .quad 0xBFCE6971DDE75409  /* A02 = -2.375929196847723912089e-01 */
> > > > > -        .quad 0xBFB58291E88CB251  /* A03 = -8.402358939628952472223e-02 */
> > > > > -        .quad 0xBF8DB3A62C325325  /* A00 = -1.450280973794233242702e-02 */
> > > > > -        .quad 0x3FF1A9C900C6DEEA  /* A01 = +1.103951457056548068891e+00 */
> > > > > -        .quad 0xBFD13DBC65B0E08E  /* A02 = -2.693930619311765140012e-01 */
> > > > > -        .quad 0xBFB06696F62696D1  /* A03 = -6.406539449252625362252e-02 */
> > > > > -        .quad 0xBF92583699F2E27A  /* A00 = -1.791463198307716858659e-02 */
> > > > > -        .quad 0x3FF1F451B85AA9F0  /* A01 = +1.122148246892376022288e+00 */
> > > > > -        .quad 0xBFD34FD5F8288180  /* A02 = -3.017477916164565954205e-01 */
> > > > > -        .quad 0xBFA6FB692825B683  /* A03 = -4.488686194495718900788e-02 */
> > > > > -        .quad 0xBF9641C26E673D6F  /* A00 = -2.173522757385398448959e-02 */
> > > > > -        .quad 0x3FF24364DA5E2B07  /* A01 = +1.141453602790251542487e+00 */
> > > > > -        .quad 0xBFD564A5A5EF5890  /* A02 = -3.342680092295120530821e-01 */
> > > > > -        .quad 0xBF9B43712011A982  /* A03 = -2.662445791467283467968e-02 */
> > > > > -        .quad 0xBF9A901038EC2F39  /* A00 = -2.594018313816024226548e-02 */
> > > > > -        .quad 0x3FF2961356DFFEBA  /* A01 = +1.161639537196534011088e+00 */
> > > > > -        .quad 0xBFD775EBB17198C7  /* A02 = -3.665723069046972759644e-01 */
> > > > > -        .quad 0xBF833B1A926CD462  /* A03 = -9.390075295963199591975e-03 */
> > > > > -        .quad 0xBF9F396A6A461B91  /* A00 = -3.049246095317987084727e-02 */
> > > > > -        .quad 0x3FF2EB53BAEF534B  /* A01 = +1.182452898229899629357e+00 */
> > > > > -        .quad 0xBFD97DABF8AD8BBD  /* A02 = -3.982953957076310058660e-01 */
> > > > > -        .quad 0x3F7B8F6A3E0F8837  /* A03 = +6.728568086119371925713e-03 */
> > > > > -        .quad 0xBFA21878590F8BAA  /* A00 = -3.534294211546946951064e-02 */
> > > > > -        .quad 0x3FF34209790236E1  /* A01 = +1.203622315111197105253e+00 */
> > > > > -        .quad 0xBFDB764C0E71BECB  /* A02 = -4.290952817018306997277e-01 */
> > > > > -        .quad 0x3F962FE0C03F84C0  /* A03 = +2.166701482190513949888e-02 */
> > > > > -        .quad 0xBFA4B36B9AD27ECC  /* A00 = -4.043136849327097492868e-02 */
> > > > > -        .quad 0x3FF3990C5B12FC16  /* A01 = +1.224865298994477935679e+00 */
> > > > > -        .quad 0xBFDD5AABB0D01390  /* A02 = -4.586590983092770912322e-01 */
> > > > > -        .quad 0x3FA21DAF5CA162DB  /* A03 = +3.538272863142363083844e-02 */
> > > > > -        .quad 0xBFA7645E4D7BF28B  /* A00 = -4.568762489177399105378e-02 */
> > > > > -        .quad 0x3FF3EF2FD51C0D9F  /* A01 = +1.245895225962932562069e+00 */
> > > > > -        .quad 0xBFDF26377E1B686E  /* A02 = -4.867075664057044503963e-01 */
> > > > > -        .quad 0x3FA8803E756EE812  /* A03 = +4.785342391501513914509e-02 */
> > > > > -        .quad 0xBFAA210925C64413  /* A00 = -5.103329263796054643398e-02 */
> > > > > -        .quad 0x3FF44349F897D8E7  /* A01 = +1.266427966181760345066e+00 */
> > > > > -        .quad 0xBFE06A7B02C6D8E2  /* A02 = -5.129981092675530707226e-01 */
> > > > > -        .quad 0x3FAE3F194734F5D0  /* A03 = +5.907515520309980505687e-02 */
> > > > > -        .quad 0xBFACDE48F8A19BBB  /* A00 = -5.638340029764018351832e-02 */
> > > > > -        .quad 0x3FF49439D5466582  /* A01 = +1.286187966447272845727e+00 */
> > > > > -        .quad 0xBFE131C7C1063DDC  /* A02 = -5.373266954429101183166e-01 */
> > > > > -        .quad 0x3FB1ADEEC36AD805  /* A03 = +6.906025191241844940482e-02 */
> > > > > -        .quad 0xBFAF905D8F585680  /* A00 = -6.164829611604449866036e-02 */
> > > > > -        .quad 0x3FF4E0ED1FD27F99  /* A01 = +1.304913639360142818546e+00 */
> > > > > -        .quad 0xBFE1E7A859DC1D3D  /* A02 = -5.595285182070380836095e-01 */
> > > > > -        .quad 0x3FB3ED018E4642A1  /* A03 = +7.783517573831001679086e-02 */
> > > > > -        .quad 0xBFB11595104160BA  /* A00 = -6.673556944713512906198e-02 */
> > > > > -        .quad 0x3FF528650340490B  /* A01 = +1.322361958217302513319e+00 */
> > > > > -        .quad 0xBFE28B14B40BC974  /* A02 = -5.794776455425521000109e-01 */
> > > > > -        .quad 0x3FB5DF49F5BAF6D7  /* A03 = +8.543836831355676453281e-02 */
> > > > > -        .quad 0xBFB2513A97344BA4  /* A00 = -7.155195418844911836587e-02 */
> > > > > -        .quad 0x3FF569BA0DB5EE14  /* A01 = +1.338312200124055273420e+00 */
> > > > > -        .quad 0xBFE31B53A8B67B20  /* A02 = -5.970857901737396389308e-01 */
> > > > > -        .quad 0x3FB787F297BB0544  /* A03 = +9.191814617499455275507e-02 */
> > > > > -        .quad 0xBFB37512E848FAFA  /* A00 = -7.600515528700305112331e-02 */
> > > > > -        .quad 0x3FF5A41F33B403C8  /* A01 = +1.352568819013173495591e+00 */
> > > > > -        .quad 0xBFE397F6EA9A58A5  /* A02 = -6.123003561103997904880e-01 */
> > > > > -        .quad 0x3FB8EAA9FF25CA06  /* A03 = +9.733068923177520814782e-02 */
> > > > > -        .quad 0xBFB47B3E603AFC5D  /* A00 = -8.000554894805263217439e-02 */
> > > > > -        .quad 0x3FF5D6E3EDE40487  /* A01 = +1.364963464031718975988e+00 */
> > > > > -        .quad 0xBFE400D5BCA6D631  /* A02 = -6.251019177058819709103e-01 */
> > > > > -        .quad 0x3FBA0B830ED567FE  /* A03 = +1.017381583418739132707e-01 */
> > > > > -        .quad 0xBFB5BBFE8AC90496  /* A00 = -8.489981544791400103200e-02 */
> > > > > -        .quad 0x3FF612BA70107E95  /* A01 = +1.379572332145390989311e+00 */
> > > > > -        .quad 0xBFE477EAF1FA7693  /* A02 = -6.396383978023599814478e-01 */
> > > > > -        .quad 0x3FBB4784B7C08A95  /* A03 = +1.065600346196709652391e-01 */
> > > > > -        .quad 0xBFB6D5D940743939  /* A00 = -8.920057128509463473254e-02 */
> > > > > -        .quad 0x3FF644A8748F70CE  /* A01 = +1.391762214006166953340e+00 */
> > > > > -        .quad 0xBFE4D646AB07EA37  /* A02 = -6.511567440459832267763e-01 */
> > > > > -        .quad 0x3FBC354F4E1D5292  /* A03 = +1.101884427747086558913e-01 */
> > > > > -        .quad 0xBFB7223D19E4F3D1  /* A00 = -9.036619074045339206069e-02 */
> > > > > -        .quad 0x3FF6518FEB42B7FA  /* A01 = +1.394912642466350494175e+00 */
> > > > > -        .quad 0xBFE4ED86CB87498C  /* A02 = -6.539949393430091184598e-01 */
> > > > > -        .quad 0x3FBC6D29F28CCA9B  /* A03 = +1.110407082713131127205e-01 */
> > > > > -        .quad 0xBFB6878652FF6312  /* A00 = -8.800544287022329936754e-02 */
> > > > > -        .quad 0x3FF63948C302D040  /* A01 = +1.388985406648330922508e+00 */
> > > > > -        .quad 0xBFE4C4E2E7904E17  /* A02 = -6.490339777687407218920e-01 */
> > > > > -        .quad 0x3FBC127356CA1ABE  /* A03 = +1.096565329445224612481e-01 */
> > > > > -        .quad 0xBFB4F5D18B0C91D6  /* A00 = -8.187589306596207427980e-02 */
> > > > > -        .quad 0x3FF5FD27EB7DD0B8  /* A01 = +1.374305648697413673176e+00 */
> > > > > -        .quad 0xBFE464E01A2B2FC6  /* A02 = -6.373138915164353601739e-01 */
> > > > > -        .quad 0x3FBB460547674A30  /* A03 = +1.065371798825160976065e-01 */
> > > > > -        .quad 0xBFB26642FA16A685  /* A00 = -7.187288861919156890412e-02 */
> > > > > -        .quad 0x3FF59F9BEDE1C95A  /* A01 = +1.351467065073470141812e+00 */
> > > > > -        .quad 0xBFE3D67920C8FBEA  /* A02 = -6.199308052381387046381e-01 */
> > > > > -        .quad 0x3FBA24F6A8D3CBC1  /* A03 = +1.021265184570401413078e-01 */
> > > > > -        .quad 0xBFADB5294794F097  /* A00 = -5.802277563859197656582e-02 */
> > > > > -        .quad 0x3FF523EA7B9CF453  /* A01 = +1.321268542159732772845e+00 */
> > > > > -        .quad 0xBFE322A8B55E35DB  /* A02 = -5.979808370918208160205e-01 */
> > > > > -        .quad 0x3FB8C8673B1B3E37  /* A03 = +9.680791085269722928697e-02 */
> > > > > -        .quad 0xBFA4B7D661965C6A  /* A00 = -4.046506825687219699450e-02 */
> > > > > -        .quad 0x3FF48DE3E2CE3122  /* A01 = +1.284641157110919085227e+00 */
> > > > > -        .quad 0xBFE251FED1A7F445  /* A02 = -5.725092024655472622285e-01 */
> > > > > -        .quad 0x3FB745699FCABDB9  /* A03 = +9.090290213747821701507e-02 */
> > > > > -        .quad 0xBF93E60456E4EE1D  /* A00 = -1.943213253365004902773e-02 */
> > > > > -        .quad 0x3FF3E1A14E628A59  /* A01 = +1.242585474196536532432e+00 */
> > > > > -        .quad 0xBFE16C5AB660E876  /* A02 = -5.444768488007543094653e-01 */
> > > > > -        .quad 0x3FB5AD33AA8C188F  /* A03 = +8.467410005332197397987e-02 */
> > > > > -        .quad 0x3F738C17C47C7961  /* A00 = +4.772274820224659853951e-03 */
> > > > > -        .quad 0x3FF3234DDE3BD146  /* A01 = +1.196119182682268355933e+00 */
> > > > > -        .quad 0xBFE078C0D77A9D3B  /* A02 = -5.147403915952176722826e-01 */
> > > > > -        .quad 0x3FB40D74B3E276B8  /* A03 = +7.833032027925923568290e-02 */
> > > > > -        .quad 0x3FA0474BECC689C7  /* A00 = +3.179394975019849550746e-02 */
> > > > > -        .quad 0x3FF256FB4FA7D18A  /* A01 = +1.146235762743432307076e+00 */
> > > > > -        .quad 0xBFDEFA8E3FB285E2  /* A02 = -4.840427038235174395098e-01 */
> > > > > -        .quad 0x3FB270C007493D59  /* A03 = +7.203293016322244446403e-02 */
> > > > > -        .quad 0x3FAF5BD51E479BDC  /* A00 = +6.124750132203590768931e-02 */
> > > > > -        .quad 0x3FF18081D0B53BC5  /* A01 = +1.093873801484492647162e+00 */
> > > > > -        .quad 0xBFDCFE2439BD0C03  /* A02 = -4.530115665294831006626e-01 */
> > > > > -        .quad 0x3FB0DEFE5A45AFDD  /* A03 = +6.590261176978580437424e-02 */
> > > > > -        .quad 0x3FB7BD5D2806EA26  /* A00 = +9.273321368429118805032e-02 */
> > > > > -        .quad 0x3FF0A369E35B4440  /* A01 = +1.039895904647224256223e+00 */
> > > > > -        .quad 0xBFDB04BC5C9951E7  /* A02 = -4.221640495573226181669e-01 */
> > > > > -        .quad 0x3FAEBBBAA9D6DEEF  /* A03 = +6.002600978120919278380e-02 */
> > > > > -        .quad 0x3FC01BE411098DBC  /* A00 = +1.258511622610124502941e-01 */
> > > > > -        .quad 0x3FEF85BDABC031C1  /* A01 = +9.850757936961188621083e-01 */
> > > > > -        .quad 0xBFD91521375097C2  /* A02 = -3.919146576102968682065e-01 */
> > > > > -        .quad 0x3FABE26F0086D982  /* A03 = +5.446192628317005068883e-02 */
> > > > > -        .quad 0x3FC481D7FF5776B9  /* A00 = +1.602125164781023347604e-01 */
> > > > > -        .quad 0x3FEDC3506C1E7218  /* A01 = +9.300920592973538347792e-01 */
> > > > > -        .quad 0xBFD7349A88DA7D4F  /* A02 = -3.625856720409119104964e-01 */
> > > > > -        .quad 0x3FA936E2DFF8E2AE  /* A03 = +4.924687370334389358018e-02 */
> > > > > -        .quad 0x3FC90471F96FA27A  /* A00 = +1.954481571149420671141e-01 */
> > > > > -        .quad 0x3FEC0451601987A2  /* A01 = +8.755270840595026360376e-01 */
> > > > > -        .quad 0xBFD5671CD4B898DC  /* A02 = -3.344184949259110251063e-01 */
> > > > > -        .quad 0x3FA6BB9594603B67  /* A03 = +4.439990459660841243261e-02 */
> > > > > -        .quad 0x3FCFD8ADB9ED944C  /* A00 = +2.488000066615846384011e-01 */
> > > > > -        .quad 0x3FE978C073F6809A  /* A01 = +7.959902062321078108909e-01 */
> > > > > -        .quad 0xBFD2DF7E00BCD5A9  /* A02 = -2.948908812716931060471e-01 */
> > > > > -        .quad 0x3FA3614033D490B2  /* A03 = +3.785133965200894456959e-02 */
> > > > > -        .quad 0x3FD4846A12AFE5A0  /* A00 = +3.205819303981005674586e-01 */
> > > > > -        .quad 0x3FE63A1147D40472  /* A01 = +6.945883181471244061100e-01 */
> > > > > -        .quad 0xBFCFA2268AD34450  /* A02 = -2.471359422548027318101e-01 */
> > > > > -        .quad 0x3F9F150201D9FFE0  /* A03 = +3.035357605267552383310e-02 */
> > > > > -        .quad 0x3FD9018641F82BEB  /* A00 = +3.907180446846598154131e-01 */
> > > > > -        .quad 0x3FE33B7C220FFBDC  /* A01 = +6.010113396913498995389e-01 */
> > > > > -        .quad 0xBFCA4E4187E29C86  /* A02 = -2.055131829740483584423e-01 */
> > > > > -        .quad 0x3F98C30CED19F8F4  /* A03 = +2.418155858185229434287e-02 */
> > > > > -        .quad 0x3FDD4B8255BEB078  /* A00 = +4.577337109901757905561e-01 */
> > > > > -        .quad 0x3FE0858B19D3A49B  /* A01 = +5.163016800335243905451e-01 */
> > > > > -        .quad 0xBFC5BC929EACE564  /* A02 = -1.698172831327539045176e-01 */
> > > > > -        .quad 0x3F93A083CE57DE2B  /* A03 = +1.916700312537337677621e-02 */
> > > > > -        .quad 0x3FE0A8E5E039295C  /* A00 = +5.206174258576470315063e-01 */
> > > > > -        .quad 0x3FDC35E1234583FE  /* A01 = +4.407885403107342225937e-01 */
> > > > > -        .quad 0xBFC1DE034E31AEB9  /* A02 = -1.395877963835710222629e-01 */
> > > > > -        .quad 0x3F8EFDEBB3471BDC  /* A03 = +1.513275280821162888101e-02 */
> > > > > -        .quad 0x3FE2851B603CB2A5  /* A00 = +5.787484054213406503564e-01 */
> > > > > -        .quad 0x3FD7F4A44ABBB286  /* A01 = +3.743067483726821853551e-01 */
> > > > > -        .quad 0xBFBD3EEB67087DE7  /* A02 = -1.142413260026767657385e-01 */
> > > > > -        .quad 0x3F8864F38329E8BD  /* A03 = +1.191129917173260922836e-02 */
> > > > > -        .quad 0x3FE437DBE3C34AC1  /* A00 = +6.318187187665317283702e-01 */
> > > > > -        .quad 0x3FD43F6F789441B5  /* A01 = +3.163717916040938438194e-01 */
> > > > > -        .quad 0xBFB7D92E7901B9A4  /* A02 = -9.315767721429907277653e-02 */
> > > > > -        .quad 0x3F8327ED342308E1  /* A03 = +9.353497651663324544136e-03 */
> > > > > -        .quad 0x3FE5C0977766D55C  /* A00 = +6.797597248138731451661e-01 */
> > > > > -        .quad 0x3FD10B42A764D8F9  /* A01 = +2.663122782427219115142e-01 */
> > > > > -        .quad 0xBFB3633351D3D70F  /* A02 = -7.573242900602060456716e-02 */
> > > > > -        .quad 0x3F7E079E30FF899C  /* A03 = +7.331483779099558922843e-03 */
> > > > > -        .quad 0x3FE7202CE08A88C4  /* A00 = +7.226776490754436288455e-01 */
> > > > > -        .quad 0x3FCC973EB5662B01  /* A01 = +2.233656297433626314319e-01 */
> > > > > -        .quad 0xBFAF70A455F9920B  /* A02 = -6.140626477716545211782e-02 */
> > > > > -        .quad 0x3F77812411CE99B6  /* A03 = +5.738392731393584730859e-03 */
> > > > > -        .quad 0x3FE85879424095B1  /* A00 = +7.608000082006382003286e-01 */
> > > > > -        .quad 0x3FC7E73BD1674D84  /* A01 = +1.867441914060742336190e-01 */
> > > > > -        .quad 0xBFA96F84E4BF333B  /* A02 = -4.967894832916504993525e-02 */
> > > > > -        .quad 0x3F72606DDCA6E117  /* A03 = +4.486493251924870105662e-03 */
> > > > > -        .quad 0x3FE96BFE4957F4DD  /* A00 = +7.944327766887472330737e-01 */
> > > > > -        .quad 0x3FC3ED4780D25478  /* A01 = +1.556786898624158421711e-01 */
> > > > > -        .quad 0xBFA489C5F9A56B58  /* A02 = -4.011362717093075458408e-02 */
> > > > > -        .quad 0x3F6CB5DC17E9AD2A  /* A03 = +3.504686231556104931972e-03 */
> > > > > -        .quad 0x3FEA5D9CB2F41234  /* A00 = +8.239272589858672724006e-01 */
> > > > > -        .quad 0x3FC091A758374DCF  /* A01 = +1.294449978582705440555e-01 */
> > > > > -        .quad 0xBFA08E436D4B5CE0  /* A02 = -3.233538350257858517978e-02 */
> > > > > -        .quad 0x3F666997AD53E6B7  /* A03 = +2.735897297154145629133e-03 */
> > > > > -        .quad 0x3FEB3060342CB850  /* A00 = +8.496552485501158713532e-01 */
> > > > > -        .quad 0x3FBB7D30BBC7DC1B  /* A01 = +1.073790033768634993860e-01 */
> > > > > -        .quad 0xBF9AA6BA3443D9E3  /* A02 = -2.602663940430173170060e-02 */
> > > > > -        .quad 0x3F617CA764B7850B  /* A03 = +2.134634914668814050648e-03 */
> > > > > -        .quad 0x3FEBE759A6A0C7B8  /* A00 = +8.719909910635044170135e-01 */
> > > > > -        .quad 0x3FB6C10DE6A703FF  /* A01 = +8.888327485239243264115e-02 */
> > > > > -        .quad 0xBF956C566D8BE1F6  /* A02 = -2.092108768099084498138e-02 */
> > > > > -        .quad 0x3F5B46D1A4A59CF8  /* A03 = +1.664833764687232917079e-03 */
> > > > > -        .quad 0x3FEC858494887A04  /* A00 = +8.912985707318630268503e-01 */
> > > > > -        .quad 0x3FB2CC31F543394D  /* A01 = +7.342827070099140762682e-02 */
> > > > > -        .quad 0xBF9133477FF69137  /* A02 = -1.679717749142747504343e-02 */
> > > > > -        .quad 0x3F5544482FBB4DA5  /* A03 = +1.298017973501022466823e-03 */
> > > > > -        .quad 0x3FED0DB59D0E32E9  /* A00 = +9.079235141267335551518e-01 */
> > > > > -        .quad 0x3FAF006BAFFC6EF4  /* A01 = +6.055008433597022787787e-02 */
> > > > > -        .quad 0xBF8B97146FA2B97A  /* A02 = -1.347175565419144252499e-02 */
> > > > > -        .quad 0x3F5093B01F4CDC69  /* A03 = +1.011774057770665211434e-03 */
> > > > > -        .quad 0x3FEDB487C3EC457C  /* A00 = +9.282873942012623835751e-01 */
> > > > > -        .quad 0x3FA7390C09D0BD1D  /* A01 = +4.535710925881118044112e-02 */
> > > > > -        .quad 0xBF83D9F7C3181106  /* A02 = -9.693084374710735778846e-03 */
> > > > > -        .quad 0x3F46E34A0A3C0E64  /* A03 = +6.984817050299072134500e-04 */
> > > > > -        .quad 0x3FEE5FFCB4E6EB00  /* A00 = +9.492171796076434020506e-01 */
> > > > > -        .quad 0x3F9F4913ED00AADF  /* A01 = +3.055220731782070861526e-02 */
> > > > > -        .quad 0xBF79670BD0E59B5C  /* A02 = -6.201788097633133961528e-03 */
> > > > > -        .quad 0x3F3BC998EBCAF96D  /* A03 = +4.240034429975534616304e-04 */
> > > > > -        .quad 0x3FEEDBA41E9542FE  /* A00 = +9.643116566968215064293e-01 */
> > > > > -        .quad 0x3F94F5DD18D9C24D  /* A01 = +2.046914543319848858727e-02 */
> > > > > -        .quad 0xBF7034896AA122B9  /* A02 = -3.956352980886528904192e-03 */
> > > > > -        .quad 0x3F30DCCB47810B39  /* A03 = +2.573009765038273091199e-04 */
> > > > > -        .quad 0x3FEF33F2882520ED  /* A00 = +9.750912341196716903724e-01 */
> > > > > -        .quad 0x3F8BF37F2CF553FF  /* A01 = +1.364802699996836392315e-02 */
> > > > > -        .quad 0xBF649F6F05A69619  /* A02 = -2.517430152880317534986e-03 */
> > > > > -        .quad 0x3F247623C950AAC9  /* A03 = +1.561087307505231250044e-04 */
> > > > > -        .quad 0x3FEF727757751741  /* A00 = +9.827229221489021115943e-01 */
> > > > > -        .quad 0x3F828E67912C4400  /* A01 = +9.060677640748693306705e-03 */
> > > > > -        .quad 0xBF5A2F51A806CC2C  /* A02 = -1.598195784123355826789e-03 */
> > > > > -        .quad 0x3F18D35D7687E613  /* A03 = +9.470231965016282719549e-05 */
> > > > > -        .quad 0x3FEF9E6325C5942A  /* A00 = +9.880843866091073568469e-01 */
> > > > > -        .quad 0x3F788AB117618F76  /* A01 = +5.991641772286606867914e-03 */
> > > > > -        .quad 0xBF5096EAB0B1EA89  /* A02 = -1.012543859160305046233e-03 */
> > > > > -        .quad 0x3F0E1E50EC4435AB  /* A03 = +5.744633156910412119652e-05 */
> > > > > -        .quad 0x3FEFBD0784049369  /* A00 = +9.918248728250605994461e-01 */
> > > > > -        .quad 0x3F702BBD8294035F  /* A01 = +3.947963975634432264028e-03 */
> > > > > -        .quad 0xBF44FB55E0F00593  /* A02 = -6.403130845457509273330e-04 */
> > > > > -        .quad 0x3F0244DCD723230A  /* A03 = +3.484534217219031730379e-05 */
> > > > > -        .quad 0x3FEFD245E2366A43  /* A00 = +9.944180887426415926811e-01 */
> > > > > -        .quad 0x3F653D82EC088433  /* A01 = +2.592807490387838333795e-03 */
> > > > > -        .quad 0xBF3A7DF75E013CB8  /* A02 = -4.042366908878036561859e-04 */
> > > > > -        .quad 0x3EF6298E69F991CD  /* A03 = +2.113564425911141559972e-05 */
> > > > > -        .quad 0x3FEFE0EAA508BC69  /* A00 = +9.962056372950317539861e-01 */
> > > > > -        .quad 0x3F5BD0771AF3FDDA  /* A01 = +1.697651208644282514598e-03 */
> > > > > -        .quad 0xBF30B2E1254DE571  /* A02 = -2.548026725928887099328e-04 */
> > > > > -        .quad 0x3EEAE28B70EC0256  /* A03 = +1.281973848454955042307e-05 */
> > > > > -        .quad 0x3FEFEAF5303D7F96  /* A00 = +9.974313680831865536192e-01 */
> > > > > -        .quad 0x3F5229111365657E  /* A01 = +1.108423877289460134782e-03 */
> > > > > -        .quad 0xBF250572D04DFE66  /* A02 = -1.603796628408704519168e-04 */
> > > > > -        .quad 0x3EE04E89BB57C981  /* A03 = +7.775682983689149966743e-06 */
> > > > > -        .quad 0x3FEFF1CF52F1CF44  /* A00 = +9.982678051005469122003e-01 */
> > > > > -        .quad 0x3F47A71316147CEB  /* A01 = +7.218211359577819110842e-04 */
> > > > > -        .quad 0xBF1A6D7604055719  /* A02 = -1.008132248946049582547e-04 */
> > > > > -        .quad 0x3ED3C8047586A85C  /* A03 = +4.716233739913014633626e-06 */
> > > > > -        .quad 0x3FEFF6770369EF69  /* A00 = +9.988360468555416149528e-01 */
> > > > > -        .quad 0x3F3EBB261180FBF0  /* A01 = +4.689186039321105101130e-04 */
> > > > > -        .quad 0xBF1097754FE19D7F  /* A02 = -6.329206004950480057066e-05 */
> > > > > -        .quad 0x3EC7FEFF83BCA0A7  /* A03 = +2.860556404988488738366e-06 */
> > > > > -        .quad 0x3FEFF99D42371AC4  /* A00 = +9.992204945818561334647e-01 */
> > > > > -        .quad 0x3F33EB2AEC271F59  /* A01 = +3.039340773764907474054e-04 */
> > > > > -        .quad 0xBF04CF18E0FC0D79  /* A02 = -3.968996690952969588805e-05 */
> > > > > -        .quad 0x3EBD1BDBD6019BE9  /* A03 = +1.735021065507727833886e-06 */
> > > > > -        .quad 0x3FEFFBBCA32B0D91  /* A00 = +9.994795977476532700123e-01 */
> > > > > -        .quad 0x3F29C41E1615110A  /* A01 = +1.965796209707565346710e-04 */
> > > > > -        .quad 0xBEFA11F93D9DCB5A  /* A02 = -2.486248909101414873235e-05 */
> > > > > -        .quad 0x3EB1A7CA4546F7A7  /* A03 = +1.052345642723709228769e-06 */
> > > > > -        .quad 0x3FEFFD298B8E8DE2  /* A00 = +9.996535993308806045121e-01 */
> > > > > -        .quad 0x3F20A1C42D523C5B  /* A01 = +1.268913244172078754520e-04 */
> > > > > -        .quad 0xBEF0507A364AFAE4  /* A02 = -1.555859070622834605755e-05 */
> > > > > -        .quad 0x3EA56ACA17E7CDF4  /* A03 = +6.382806956848098872313e-07 */
> > > > > -        .quad 0x3FEFFE1DC82BA5A3  /* A00 = +9.997700604991915929176e-01 */
> > > > > -        .quad 0x3F156E73B90F1769  /* A01 = +8.175450626798714452801e-05 */
> > > > > -        .quad 0xBEE4663579D0A09F  /* A02 = -9.727122057226747625365e-06 */
> > > > > -        .quad 0x3E99FAF6FEC5D4C1  /* A03 = +3.871371052824002996020e-07 */
> > > > > -        .quad 0x3FEFFEF8D0BB5E81  /* A00 = +9.998745037837154514548e-01 */
> > > > > -        .quad 0x3F06686DA18D39C3  /* A01 = +4.273972098777251447726e-05 */
> > > > > -        .quad 0xBED46BC298073E90  /* A02 = -4.868731025855742842491e-06 */
> > > > > -        .quad 0x3E88E42286B9D0FD  /* A03 = +1.854535328530838170114e-07 */
> > > > > -        .quad 0x3FEFFF8DBC68DDC7  /* A00 = +9.999455146670975791423e-01 */
> > > > > -        .quad 0x3EF26B2953A80AF0  /* A01 = +1.756534514108903368909e-05 */
> > > > > -        .quad 0xBEBFC4472D580F83  /* A02 = -1.893443529411295465239e-06 */
> > > > > -        .quad 0x3E72505B4553D19F  /* A03 = +6.822456673547912277047e-08 */
> > > > > -        .quad 0x3FEFFFCED1276609  /* A00 = +9.999765477215883935358e-01 */
> > > > > -        .quad 0x3EDE1A94C7CC58F5  /* A01 = +7.177313020153979672606e-06 */
> > > > > -        .quad 0xBEA8A2C988744E57  /* A02 = -7.342066660497443762363e-07 */
> > > > > -        .quad 0x3E5AF30036BBBAF4  /* A03 = +2.509841882843541084885e-08 */
> > > > > -        .quad 0x3FEFFFEAFE70FCFC  /* A00 = +9.999899835164849370983e-01 */
> > > > > -        .quad 0x3EC879175E3549F5  /* A01 = +2.917410471128503564412e-06 */
> > > > > -        .quad 0xBE930E36677D1813  /* A02 = -2.839493400307523115929e-07 */
> > > > > -        .quad 0x3E43D4005B42D48F  /* A03 = +9.233192745401904898013e-09 */
> > > > > -        .quad 0x3ff0000000000000
> > > > > -        .quad 0x0000000000000000
> > > > > -        .quad 0x0000000000000000
> > > > > -        .quad 0x0000000000000000
> > > > > -        .align 32
> > > > > -        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000           /* _sSignMask        */
> > > > > -        .align 32
> > > > > -        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff           /* _sAbsMask         */
> > > > > -        .align 32
> > > > > -        .long 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000           /* _iExpMantMask     */
> > > > > -        .align 32
> > > > > -        .long 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000           /* _iExpMask         */
> > > > > -        .align 32
> > > > > -        .long 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000           /* _iMinIdxOfsMask   */
> > > > > -        .align 32
> > > > > -        .long 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000           /* _iMaxIdxMask      */
> > > > > -        .align 32
> > > > > -        .type  __svml_stanh_data_internal,@object
> > > > > -        .size  __svml_stanh_data_internal,.-__svml_stanh_data_internal
> > > > > +       vcvtps2pd %xmm4, %ymm5
> > > > > +
> > > > > +       vextractf128 $1, %ymm4, %xmm4
> > > > > +       vcvtps2pd %xmm4, %ymm4
> > > > > +
> > > > > +       vmovdqu 16(%rcx, %rax), %xmm2
> > > > > +       vinsertf128 $1, 16(%r11, %rax), %ymm2, %ymm2
> > > > > +
> > > > > +       vfmadd213pd %ymm3, %ymm5, %ymm1
> > > > > +
> > > > > +       vmovupd 16(%rdx, %rax), %xmm3
> > > > > +       vinsertf128 $1, 16(%r10, %rax), %ymm3, %ymm3
> > > > > +
> > > > > +       vunpcklpd %ymm3, %ymm2, %ymm10
> > > > > +       vunpckhpd %ymm3, %ymm2, %ymm2
> > > > > +
> > > > > +       vfmadd213pd %ymm10, %ymm4, %ymm2
> > > > > +       vfmadd213pd %ymm6, %ymm4, %ymm2
> > > > > +       vfmadd213pd %ymm7, %ymm4, %ymm2
> > > > > +       vcvtpd2ps %ymm2, %xmm2
> > > > > +
> > > > > +       vmovdqu (%r9, %rax), %xmm7
> > > > > +       vinsertf128 $1, (%rdi, %rax), %ymm7, %ymm7
> > > > > +
> > > > > +       vmovupd (%r8, %rax), %xmm3
> > > > > +       vinsertf128 $1, (%rsi, %rax), %ymm3, %ymm3
> > > > > +
> > > > > +       vunpckhpd %ymm3, %ymm7, %ymm4
> > > > > +       vunpcklpd %ymm3, %ymm7, %ymm7
> > > > > +
> > > > > +       vfmadd213pd %ymm4, %ymm5, %ymm1
> > > > > +       vfmadd213pd %ymm7, %ymm5, %ymm1
> > > > > +
> > > > > +
> > > > > +       vcvtpd2ps %ymm1, %xmm1
> > > > > +       vinsertf128 $1, %xmm2, %ymm1, %ymm1
> > > > > +
> > > > > +       vmovmskps %ymm15, %edx
> > > > > +       vandnps %ymm0, %ymm11, %ymm2
> > > > > +       testl   %edx, %edx
> > > > > +       /* Go to special inputs processing branch.  */
> > > > > +       jne     L(SPECIAL_VALUES_BRANCH)
> > > > > +       /* Wait until after the branch to overwrite ymm0.  */
> > > > > +       vorps   %ymm2, %ymm1, %ymm0
> > > > > +       /* No stack restoration on the fastpath.  */
> > > > > +       ret
> > > > > +
> > > > > +
> > > > > +L(SPECIAL_VALUES_BRANCH):
> > > > > +       pushq   %rbp
> > > > > +       /* Need callee-saved registers to preserve state across tanhf calls.
> > > > > +        */
> > > > > +       pushq   %r12
> > > > > +       pushq   %r13
> > > > > +       movq    %rsp, %rbp
> > > > > +
> > > > > +       /* Align stack and make room for 2x ymm vectors.  */
> > > > > +       andq    $-32, %rsp
> > > > > +       addq    $-64, %rsp
> > > > > +
> > > > > +       /* Save all already computed results.  */
> > > > > +       vorps   %ymm2, %ymm1, %ymm1
> > > > > +       vmovups %ymm1, (%rsp)
> > > > > +       /* Save original input (ymm0 unchanged up to this point).  */
> > > > > +       vmovups %ymm0, 32(%rsp)
> > > > > +
> > > > > +       vzeroupper
> > > > > +
> > > > > +       /* edx has 1s where there was a special value that needs to be handled
> > > > > +          by a tanhf call.  */
> > > > > +       movl    %edx, %r13d
> > > > > +L(SPECIAL_VALUES_LOOP):
> > > > > +       /* use r12 as index for special value that is saved across calls to
> > > > > +          tanhf. We technically don't need a callee save register here as offset
> > > > > +          to rsp is always [0, 28] so we can restore rsp by realigning to 64.
> > > > > +          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> > > > > +          in the loop.  */
> > > > > +       xorl    %r12d, %r12d
> > > > > +       tzcntl  %r13d, %r12d
> > > > > +
> > > > > +       /* Scalar math function call to process special input.  */
> > > > > +       movss   32(%rsp, %r12, 4), %xmm0
> > > > > +       call    tanhf@PLT
> > > > > +       /* No good way to avoid the store-forwarding fault this will cause on
> > > > > +          return. `lfence` avoids the SF fault but at greater cost as it
> > > > > +          serializes stack/callee save restoration.  */
> > > > > +       movss   %xmm0, (%rsp, %r12, 4)
> > > > > +
> > > > > +       blsr    %r13d, %r13d
> > > > > +       jnz     L(SPECIAL_VALUES_LOOP)
> > > > > +
> > > > > +       /* All results have been written to (%rsp).  */
> > > > > +       vmovups (%rsp), %ymm0
> > > > > +       movq    %rbp, %rsp
> > > > > +       popq    %r13
> > > > > +       popq    %r12
> > > > > +       popq    %rbp
> > > > > +       ret
> > > > > +END(_ZGVdN8v_tanhf_avx2)
> > > > > --
> > > > > 2.25.1
> > > > >
diff mbox series

Patch

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
index 8954a5f658..6a2f0c1392 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
@@ -70,312 +70,323 @@ 
  *
  */
 
-/* Offsets for data table __svml_stanh_data_internal
- */
-#define _sC                           	0
-#define _sP0                          	128
-#define _sP2                          	256
-#define _sP3                          	384
-#define _sP4                          	512
-#define _sP5                          	640
-#define _sP6                          	768
-#define _sP7                          	896
-#define _iExpMantMask_UISA            	1024
-#define _iMinIdxOfsMask_UISA          	1088
-#define _iMaxIdxMask_UISA             	1152
-#define _sSignMask                    	1216
-#define _sAbsMask                     	1280
-#define _iExpMantMask                 	1344
-#define _iExpMask                     	1408
-#define _iMinIdxOfsMask               	1472
-#define _iMaxIdxMask                  	1536
-
 #include <sysdep.h>
 
+#define TANHF_DATA(offset)	((offset) + __svml_stanh_data_internal)
+
+/* Offsets for data table __svml_stanh_data_internal.  */
+#define _iExpMantMask_UISA	0
+#define _iMinIdxOfsMask_UISA	4
+#define _iMaxIdxMask_UISA	8
+#define _iExpMask	12
+#define _sSignMask	64
+#define _sC_lo	128
+#define _sC_hi	192
+#define _sP7_lo	256
+#define _sP7_hi	320
+#define _sP6_lo	384
+#define _sP6_hi	448
+#define _sP5_lo	512
+#define _sP5_hi	576
+#define _sP4_lo	640
+#define _sP4_hi	704
+#define _sP3_lo	768
+#define _sP3_hi	832
+#define _sP2_lo	896
+#define _sP2_hi	960
+#define _sP0_lo	1024
+#define _sP0_hi	1088
+
         .text
 	.section .text.exex512,"ax",@progbits
 ENTRY(_ZGVeN16v_tanhf_skx)
-        pushq     %rbp
-        cfi_def_cfa_offset(16)
-        movq      %rsp, %rbp
-        cfi_def_cfa(6, 16)
-        cfi_offset(6, -16)
-        andq      $-64, %rsp
-        subq      $192, %rsp
-        vmovaps   %zmm0, %zmm1
-        vmovups   __svml_stanh_data_internal(%rip), %zmm9
-        vmovups   _sP6+__svml_stanh_data_internal(%rip), %zmm11
-        vmovups   _sP5+__svml_stanh_data_internal(%rip), %zmm12
-        vmovups   _sP4+__svml_stanh_data_internal(%rip), %zmm13
-        vmovups   _sP3+__svml_stanh_data_internal(%rip), %zmm14
-        vmovups   _sP2+__svml_stanh_data_internal(%rip), %zmm15
-        vpternlogd $255, %zmm2, %zmm2, %zmm2
-        vandps    _sAbsMask+__svml_stanh_data_internal(%rip), %zmm1, %zmm8
-        vandps    _sSignMask+__svml_stanh_data_internal(%rip), %zmm1, %zmm0
-
-/* Here huge arguments, INF and NaNs are filtered out to callout. */
-        vpandd    _iExpMantMask_UISA+__svml_stanh_data_internal(%rip), %zmm1, %zmm3
-        vpsubd    _iMinIdxOfsMask_UISA+__svml_stanh_data_internal(%rip), %zmm3, %zmm4
-        vpcmpd    $2, _iExpMask+__svml_stanh_data_internal(%rip), %zmm3, %k1
+	/* Here huge arguments, INF and NaNs are filtered out to callout.  */
+	vpandd	TANHF_DATA(_iExpMantMask_UISA)(%rip) {1to16}, %zmm0, %zmm1
+	vpsubd	TANHF_DATA(_iMinIdxOfsMask_UISA)(%rip) {1to16}, %zmm1, %zmm2
 
-/*
- *  small table specific variables *
- *  Constant loading
- */
-        vpxord    %zmm5, %zmm5, %zmm5
-
-/* if VMIN, VMAX is defined for I type */
-        vpmaxsd   %zmm5, %zmm4, %zmm6
-        vpminsd   _iMaxIdxMask_UISA+__svml_stanh_data_internal(%rip), %zmm6, %zmm7
-        vpsrld    $21, %zmm7, %zmm10
-        vmovups   _sP7+__svml_stanh_data_internal(%rip), %zmm4
-        vpermt2ps _sC+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm9
-        vpermt2ps _sP6+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm11
-        vpermt2ps _sP7+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm4
-        vpermt2ps _sP5+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm12
-        vpermt2ps _sP4+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm13
-        vpermt2ps _sP3+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm14
-        vpermt2ps _sP2+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm15
-        vpandnd   %zmm3, %zmm3, %zmm2{%k1}
-        vptestmd  %zmm2, %zmm2, %k0
-        vmovups   _sP0+__svml_stanh_data_internal(%rip), %zmm3
-        vsubps    {rn-sae}, %zmm9, %zmm8, %zmm2
-        kmovw     %k0, %edx
-        vfmadd213ps {rn-sae}, %zmm11, %zmm2, %zmm4
-        vpermt2ps _sP0+64+__svml_stanh_data_internal(%rip), %zmm10, %zmm3
-        vfmadd213ps {rn-sae}, %zmm12, %zmm2, %zmm4
-        vfmadd213ps {rn-sae}, %zmm13, %zmm2, %zmm4
-        vfmadd213ps {rn-sae}, %zmm14, %zmm2, %zmm4
-        vfmadd213ps {rn-sae}, %zmm15, %zmm2, %zmm4
-        vfmadd213ps {rn-sae}, %zmm3, %zmm2, %zmm4
-        vorps     %zmm0, %zmm4, %zmm0
-        testl     %edx, %edx
-
-/* Go to special inputs processing branch */
-        jne       L(SPECIAL_VALUES_BRANCH)
-                                # LOE rbx r12 r13 r14 r15 edx zmm0 zmm1
-
-/* Restore registers
- * and exit the function
- */
+	/* Selection arguments between [0, 0x03e00000] into zmm3.  */
+	vpxord	%zmm3, %zmm3, %zmm3
+	vpmaxsd	%zmm3, %zmm2, %zmm3
+	vpminsd	TANHF_DATA(_iMaxIdxMask_UISA)(%rip) {1to16}, %zmm3, %zmm3
 
-L(EXIT):
-        movq      %rbp, %rsp
-        popq      %rbp
-        cfi_def_cfa(7, 8)
-        cfi_restore(6)
-        ret
-        cfi_def_cfa(6, 16)
-        cfi_offset(6, -16)
-
-/* Branch to process
- * special inputs
- */
+	/* Setup permute indices in zmm3.  */
+	vpsrld	$21, %zmm3, %zmm3
+
+	/* Store if there are any special cases in k1.  */
+	vpcmpd	$6, TANHF_DATA(_iExpMask)(%rip) {1to16}, %zmm1, %k1
+
+
+	/* Store absolute values of inputs in zmm1.  */
+	vmovaps	TANHF_DATA(_sSignMask)(%rip), %zmm4
+	vandnps	%zmm0, %zmm4, %zmm1
+
+	vmovaps	TANHF_DATA(_sC_lo)(%rip), %zmm5
+	vpermt2ps TANHF_DATA(_sC_hi)(%rip), %zmm3, %zmm5
+	vsubps	{rn-sae}, %zmm5, %zmm1, %zmm1
+
+	vmovaps	TANHF_DATA(_sP7_lo)(%rip), %zmm2
+	vpermt2ps TANHF_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
 
+	vmovaps	TANHF_DATA(_sP6_lo)(%rip), %zmm5
+	vpermt2ps TANHF_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
+
+	vmovaps	TANHF_DATA(_sP5_lo)(%rip), %zmm6
+	vpermt2ps TANHF_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
+
+	vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm2
+	vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm2
+
+	vmovaps	TANHF_DATA(_sP4_lo)(%rip), %zmm7
+	vpermt2ps TANHF_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
+
+	vmovaps	TANHF_DATA(_sP3_lo)(%rip), %zmm8
+	vpermt2ps TANHF_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
+
+	vfmadd213ps {rn-sae}, %zmm7, %zmm1, %zmm2
+	vfmadd213ps {rn-sae}, %zmm8, %zmm1, %zmm2
+
+	vmovaps	TANHF_DATA(_sP2_lo)(%rip), %zmm9
+	vpermt2ps TANHF_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
+
+	vmovaps	TANHF_DATA(_sP0_lo)(%rip), %zmm10
+	vpermt2ps TANHF_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
+
+	vfmadd213ps {rn-sae}, %zmm9, %zmm1, %zmm2
+	vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm2
+
+	kmovw	%k1, %edx
+	testl	%edx, %edx
+
+	/* Go to special inputs processing branch.  */
+	jne	L(SPECIAL_VALUES_BRANCH)
+	/* Wait until after the branch to overwrite zmm0 (slow path needs the input).  */
+	vpternlogd $0xec, %zmm4, %zmm2, %zmm0
+
+	/* No stack restoration on the fastpath.  */
+	ret
+
+	/* Branch to process special inputs.  */
 L(SPECIAL_VALUES_BRANCH):
-        vmovups   %zmm1, 64(%rsp)
-        vmovups   %zmm0, 128(%rsp)
-                                # LOE rbx r12 r13 r14 r15 edx zmm0
-
-        xorl      %eax, %eax
-                                # LOE rbx r12 r13 r14 r15 eax edx
-
-        vzeroupper
-        movq      %r12, 16(%rsp)
-        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
-        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
-        movl      %eax, %r12d
-        movq      %r13, 8(%rsp)
-        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
-        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-        movl      %edx, %r13d
-        movq      %r14, (%rsp)
-        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
-        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-                                # LOE rbx r15 r12d r13d
-
-/* Range mask
- * bits check
- */
+	pushq	%rbp
+	/* Need to callee save registers to preserve state across tanhf calls.
+	 */
+	pushq	%r13
+	pushq	%r12
+	movq	%rsp, %rbp
 
-L(RANGEMASK_CHECK):
-        btl       %r12d, %r13d
+	/* Align stack and make room for 2x zmm vectors.  */
+	andq	$-64, %rsp
+	addq	$-128, %rsp
 
-/* Call scalar math function */
-        jc        L(SCALAR_MATH_CALL)
-                                # LOE rbx r15 r12d r13d
+	/* Save original input first; zmm0 is reused below for the merged results.  */
+	vmovaps	%zmm0, 64(%rsp)
+	/* zmm0 = zmm2 | (zmm0 & sign mask); dest must be the input, not zmm2.  */
+	vpternlogd $0xec, %zmm4, %zmm2, %zmm0
+	vmovaps	%zmm0, (%rsp)
 
-/* Special inputs
- * processing loop
- */
+	vzeroupper
 
+	/* edx has 1s where there was a special value that needs to be handled
+	   by a tanhf call.  */
+	movl	%edx, %r13d
 L(SPECIAL_VALUES_LOOP):
-        incl      %r12d
-        cmpl      $16, %r12d
-
-/* Check bits in range mask */
-        jl        L(RANGEMASK_CHECK)
-                                # LOE rbx r15 r12d r13d
-
-        movq      16(%rsp), %r12
-        cfi_restore(12)
-        movq      8(%rsp), %r13
-        cfi_restore(13)
-        movq      (%rsp), %r14
-        cfi_restore(14)
-        vmovups   128(%rsp), %zmm0
-
-/* Go to exit */
-        jmp       L(EXIT)
-        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
-        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
-        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
-        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
-        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-                                # LOE rbx r12 r13 r14 r15 zmm0
-
-/* Scalar math fucntion call
- * to process special input
- */
+	/* Use r12 as index for special value that is saved across calls to
+	   tanhf. We technically don't need a callee save register here as offset
+	   to rsp is always [0, 60] so we can restore rsp by realigning to 64.
+	   Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
+	   in the loop.  */
+	xorl	%r12d, %r12d
+	tzcntl	%r13d, %r12d
 
-L(SCALAR_MATH_CALL):
-        movl      %r12d, %r14d
-        movss     64(%rsp,%r14,4), %xmm0
-        call      tanhf@PLT
-                                # LOE rbx r14 r15 r12d r13d xmm0
+	/* Scalar math function call to process special input.  */
+	movss	64(%rsp, %r12, 4), %xmm0
+	call	tanhf@PLT
 
-        movss     %xmm0, 128(%rsp,%r14,4)
+	/* No good way to avoid the store-forwarding fault this will cause on
+	   return. `lfence` avoids the SF fault but at greater cost as it
+	   serializes stack/callee save restoration.  */
+	movss	%xmm0, (%rsp, %r12, 4)
 
-/* Process special inputs in loop */
-        jmp       L(SPECIAL_VALUES_LOOP)
-                                # LOE rbx r15 r12d r13d
-END(_ZGVeN16v_tanhf_skx)
+	blsr	%r13d, %r13d
+	jnz	L(SPECIAL_VALUES_LOOP)
 
-        .section .rodata, "a"
-        .align 64
+	/* All results have been written to (%rsp).  */
+	vmovaps	(%rsp), %zmm0
+	/* Restore rsp.  */
+	movq	%rbp, %rsp
+	/* Restore callee save registers.  */
+	popq	%r12
+	popq	%r13
+	popq	%rbp
+	ret
+END(_ZGVeN16v_tanhf_skx)
 
+	.section .rodata, "a"
+	.align	64	/* TANHF_DATA offsets (e.g. _sSignMask = 64) need a 64-byte aligned base.  */
 #ifdef __svml_stanh_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct
-{
-        __declspec(align(64)) VUINT32 _sC[32][1];
-        __declspec(align(64)) VUINT32 _sP0[32][1];
-        __declspec(align(64)) VUINT32 _sP2[32][1];
-        __declspec(align(64)) VUINT32 _sP3[32][1];
-        __declspec(align(64)) VUINT32 _sP4[32][1];
-        __declspec(align(64)) VUINT32 _sP5[32][1];
-        __declspec(align(64)) VUINT32 _sP6[32][1];
-        __declspec(align(64)) VUINT32 _sP7[32][1];
-        __declspec(align(64)) VUINT32 _iExpMantMask_UISA[16][1];
-        __declspec(align(64)) VUINT32 _iMinIdxOfsMask_UISA[16][1];
-        __declspec(align(64)) VUINT32 _iMaxIdxMask_UISA[16][1];
-        __declspec(align(64)) VUINT32 _sSignMask[16][1];
-        __declspec(align(64)) VUINT32 _sAbsMask[16][1];
-        __declspec(align(64)) VUINT32 _iExpMantMask[16][1];
-        __declspec(align(64)) VUINT32 _iExpMask[16][1];
-        __declspec(align(64)) VUINT32 _iMinIdxOfsMask[16][1];
-        __declspec(align(64)) VUINT32 _iMaxIdxMask[16][1];
-} __svml_stanh_data_internal;
+	typedef	unsigned int VUINT32;
+	typedef	struct
+	{
+	__declspec (align(4))VUINT32 _iExpMantMask_UISA[1][1];
+	__declspec (align(4))VUINT32 _iMinIdxOfsMask_UISA[1][1];
+	__declspec (align(4))VUINT32 _iMaxIdxMask_UISA[1][1];
+	__declspec (align(4))VUINT32 _iExpMask[1][1];
+	__declspec (align(64))VUINT32 _sSignMask[16][1];
+	__declspec (align(64))VUINT32 _sC_lo[16][1];
+	__declspec (align(64))VUINT32 _sC_hi[16][1];
+	__declspec (align(64))VUINT32 _sP7_lo[16][1];
+	__declspec (align(64))VUINT32 _sP7_hi[16][1];
+	__declspec (align(64))VUINT32 _sP6_lo[16][1];
+	__declspec (align(64))VUINT32 _sP6_hi[16][1];
+	__declspec (align(64))VUINT32 _sP5_lo[16][1];
+	__declspec (align(64))VUINT32 _sP5_hi[16][1];
+	__declspec (align(64))VUINT32 _sP4_lo[16][1];
+	__declspec (align(64))VUINT32 _sP4_hi[16][1];
+	__declspec (align(64))VUINT32 _sP3_lo[16][1];
+	__declspec (align(64))VUINT32 _sP3_hi[16][1];
+	__declspec (align(64))VUINT32 _sP2_lo[16][1];
+	__declspec (align(64))VUINT32 _sP2_hi[16][1];
+	__declspec (align(64))VUINT32 _sP0_lo[16][1];
+	__declspec (align(64))VUINT32 _sP0_hi[16][1];
+	}__svml_stanh_data_internal;
 #endif
+
 __svml_stanh_data_internal:
-        /*== _sC ==*/
-        .long 0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
-        .long 0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
-        .long 0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
-        .long 0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
-        .long 0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
-        .long 0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
-        .long 0x40500000, 0x40700000, 0x40900000, 0x40b00000
-        .long 0x40d00000, 0x40f00000, 0x41100000, 0x00000000
-        /*== p0 ==*/
-        .align 64
-        .long 0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
-        .long 0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
-        .long 0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
-        .long 0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
-        .long 0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
-        .long 0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
-        .long 0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
-        .long 0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
-        /*== p2 ==*/
-        .align 64
-        .long 0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
-        .long 0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
-        .long 0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
-        .long 0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
-        .long 0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
-        .long 0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
-        .long 0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
-        .long 0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
-        /*== p3 ==*/
-        .align 64
-        .long 0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
-        .long 0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
-        .long 0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
-        .long 0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
-        .long 0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
-        .long 0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
-        .long 0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
-        .long 0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
-        /*== p4 ==*/
-        .align 64
-        .long 0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
-        .long 0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
-        .long 0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
-        .long 0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
-        .long 0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
-        .long 0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
-        .long 0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
-        .long 0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
-        /*== p5 ==*/
-        .align 64
-        .long 0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
-        .long 0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
-        .long 0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
-        .long 0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
-        .long 0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
-        .long 0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
-        .long 0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
-        .long 0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
-        /*== p6 ==*/
-        .align 64
-        .long 0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
-        .long 0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
-        .long 0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
-        .long 0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
-        .long 0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
-        .long 0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
-        .long 0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
-        .long 0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
-        /*== p7 ==*/
-        .align 64
-        .long 0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
-        .long 0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
-        .long 0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
-        .long 0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
-        .long 0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
-        .long 0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
-        .long 0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
-        .long 0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
-        .align 64
-        .long 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000, 0x7fe00000           /* _iExpMantMask_UISA     */
-        .align 64
-        .long 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000, 0x3d400000           /* _iMinIdxOfsMask_UISA   */
-        .align 64
-        .long 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000, 0x03e00000           /* _iMaxIdxMask_UISA      */
-        .align 64
-        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000           /* _sSignMask        */
-        .align 64
-        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff           /* _sAbsMask         */
-        .align 64
-        .long 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000           /* _iExpMantMask     */
-        .align 64
-        .long 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000           /* _iExpMask         */
-        .align 64
-        .long 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000           /* _iMinIdxOfsMask   */
-        .align 64
-        .long 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000           /* _iMaxIdxMask      */
-        .align 64
-        .type	__svml_stanh_data_internal,@object
-        .size	__svml_stanh_data_internal,.-__svml_stanh_data_internal
+	.align	4
+	/* _iExpMantMask_UISA.  */
+	.long	0x7fe00000
+
+	.align	4
+	/* _iMinIdxOfsMask_UISA.  */
+	.long	0x3d400000
+
+	.align	4
+	/* _iMaxIdxMask_UISA.  */
+	.long	0x03e00000
+
+	.align	4
+	/* _iExpMask.  */
+	.long	0x7f000000
+
+	.align	64
+	/* _sSignMask.  */
+	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000
+	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000
+	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000
+	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000
+
+	.align	64
+	/* _sC_lo.  */
+	.long	0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
+	.long	0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
+	.long	0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
+	.long	0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
+
+	.align	64
+	/* _sC_hi.  */
+	.long	0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
+	.long	0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
+	.long	0x40500000, 0x40700000, 0x40900000, 0x40b00000
+	.long	0x40d00000, 0x40f00000, 0x41100000, 0x00000000
+
+	.align	64
+	/* _sP7_lo.  */
+	.long	0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
+	.long	0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
+	.long	0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
+	.long	0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
+
+	.align	64
+	/* _sP7_hi.  */
+	.long	0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
+	.long	0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
+	.long	0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
+	.long	0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
+
+	.align	64
+	/* _sP6_lo.  */
+	.long	0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
+	.long	0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
+	.long	0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
+	.long	0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
+
+	.align	64
+	/* _sP6_hi.  */
+	.long	0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
+	.long	0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
+	.long	0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
+	.long	0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
+
+	.align	64
+	/* _sP5_lo.  */
+	.long	0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
+	.long	0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
+	.long	0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
+	.long	0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
+
+	.align	64
+	/* _sP5_hi.  */
+	.long	0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
+	.long	0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
+	.long	0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
+	.long	0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
+
+	.align	64
+	/* _sP4_lo.  */
+	.long	0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
+	.long	0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
+	.long	0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
+	.long	0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
+
+	.align	64
+	/* _sP4_hi.  */
+	.long	0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
+	.long	0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
+	.long	0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
+	.long	0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
+
+	.align	64
+	/* _sP3_lo.  */
+	.long	0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
+	.long	0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
+	.long	0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
+	.long	0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
+
+	.align	64
+	/* _sP3_hi.  */
+	.long	0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
+	.long	0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
+	.long	0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
+	.long	0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
+
+	.align	64
+	/* _sP2_lo.  */
+	.long	0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
+	.long	0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
+	.long	0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
+	.long	0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
+
+	.align	64
+	/* _sP2_hi.  */
+	.long	0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
+	.long	0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
+	.long	0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
+	.long	0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
+
+	.align	64
+	/* _sP0_lo.  */
+	.long	0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
+	.long	0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
+	.long	0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
+	.long	0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
+
+	.align	64
+	/* _sP0_hi.  */
+	.long	0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
+	.long	0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
+	.long	0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
+	.long	0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
+
+	.align	64
+	.type	__svml_stanh_data_internal, @object
+	.size	__svml_stanh_data_internal, .-__svml_stanh_data_internal
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
index 50f753ffb3..716b06d640 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf4_core_sse4.S
@@ -70,763 +70,154 @@ 
  *
  */
 
-/* Offsets for data table __svml_stanh_data_internal
- */
-#define _dbP                          	0
-#define _sSignMask                    	4288
-#define _sAbsMask                     	4304
-#define _iExpMantMask                 	4320
-#define _iExpMask                     	4336
-#define _iMinIdxOfsMask               	4352
-#define _iMaxIdxMask                  	4368
 
 #include <sysdep.h>
 
+#define ONLY_DECL_OFFSET
+#include "svml_s_tanhf_rodata.S"
+
         .text
 	.section .text.sse4,"ax",@progbits
 ENTRY(_ZGVbN4v_tanhf_sse4)
-        subq      $72, %rsp
-        cfi_def_cfa_offset(80)
-        movaps    %xmm0, %xmm5
+	/* Save copy of input in xmm12.  */
+	movaps	%xmm0, %xmm12
 
-/* Here huge arguments, INF and NaNs are filtered out to callout. */
-        movdqu    _iExpMantMask+__svml_stanh_data_internal(%rip), %xmm9
-        lea       _dbP+16+__svml_stanh_data_internal(%rip), %r8
-        pand      %xmm5, %xmm9
+	/* Here huge arguments, INF and NaNs are filtered out to callout.  */
+	movdqu	TANHF_DATA(_iExpMantMask)(%rip), %xmm3
+	pand	%xmm0, %xmm3
 
-/* if VMIN, VMAX is defined for I type */
-        pxor      %xmm7, %xmm7
-        movdqa    %xmm9, %xmm6
-        psubd     _iMinIdxOfsMask+__svml_stanh_data_internal(%rip), %xmm9
 
-/*
- *  small table specific variables *
- *  Constant loading
- */
-        movdqu    _iMaxIdxMask+__svml_stanh_data_internal(%rip), %xmm10
-        movdqa    %xmm9, %xmm11
-        movdqa    %xmm9, %xmm8
-        pcmpgtd   %xmm10, %xmm11
-        pcmpgtd   %xmm7, %xmm8
-        movdqa    %xmm11, %xmm14
-        pand      %xmm8, %xmm9
-        andps     %xmm11, %xmm10
-        andnps    %xmm9, %xmm14
-        orps      %xmm10, %xmm14
-        psrld     $14, %xmm14
-        movd      %xmm14, %edx
-        pshufd    $1, %xmm14, %xmm12
-        pshufd    $2, %xmm14, %xmm13
-        movd      %xmm12, %ecx
-        pshufd    $3, %xmm14, %xmm15
-        movups    _sAbsMask+__svml_stanh_data_internal(%rip), %xmm3
-        movslq    %edx, %rdx
-        andps     %xmm5, %xmm3
-        movslq    %ecx, %rcx
-        pcmpgtd   _iExpMask+__svml_stanh_data_internal(%rip), %xmm6
-        movd      %xmm13, %esi
-        movups    -16(%rdx,%r8), %xmm2
-        movaps    %xmm2, %xmm0
-        movd      %xmm15, %edi
-        movmskps  %xmm6, %eax
-        movups    -16(%rcx,%r8), %xmm6
-        unpcklpd  %xmm6, %xmm0
-        unpckhpd  %xmm6, %xmm2
-        cvtps2pd  %xmm3, %xmm6
-        movhlps   %xmm3, %xmm3
-        cvtps2pd  %xmm3, %xmm3
-        movslq    %esi, %rsi
-        movslq    %edi, %rdi
-        movups    (%rcx,%r8), %xmm8
-        movups    (%rdx,%r8), %xmm12
-        movups    (%rsi,%r8), %xmm13
-        movaps    %xmm12, %xmm10
-        movups    (%rdi,%r8), %xmm9
-        movaps    %xmm13, %xmm11
-        unpckhpd  %xmm8, %xmm12
-        unpckhpd  %xmm9, %xmm13
-        mulpd     %xmm6, %xmm12
-        mulpd     %xmm3, %xmm13
-        unpcklpd  %xmm8, %xmm10
-        unpcklpd  %xmm9, %xmm11
-        addpd     %xmm10, %xmm12
-        addpd     %xmm11, %xmm13
-        mulpd     %xmm6, %xmm12
-        mulpd     %xmm3, %xmm13
-        addpd     %xmm2, %xmm12
-        movups    -16(%rsi,%r8), %xmm1
-        movups    -16(%rdi,%r8), %xmm7
-        movaps    %xmm1, %xmm14
-        unpckhpd  %xmm7, %xmm1
-        addpd     %xmm1, %xmm13
-        mulpd     %xmm12, %xmm6
-        mulpd     %xmm13, %xmm3
-        addpd     %xmm0, %xmm6
-        unpcklpd  %xmm7, %xmm14
-        addpd     %xmm14, %xmm3
-        cvtpd2ps  %xmm6, %xmm0
-        cvtpd2ps  %xmm3, %xmm1
-        movups    _sSignMask+__svml_stanh_data_internal(%rip), %xmm4
-        movlhps   %xmm1, %xmm0
-        andps     %xmm5, %xmm4
-        orps      %xmm4, %xmm0
-        testl     %eax, %eax
-
-/* Go to special inputs processing branch */
-        jne       L(SPECIAL_VALUES_BRANCH)
-                                # LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm5
-
-/* Restore registers
- * and exit the function
- */
+	/* Selection of arguments between [0, 0x04280000] into xmm3.  */
+	pxor	%xmm7, %xmm7
+	/* Save xmm3 for special values check at end.  */
+	movdqa	%xmm3, %xmm8
+	psubd	TANHF_DATA(_iMinIdxOfsMask)(%rip), %xmm3
+	pmaxsd	%xmm7, %xmm3
+	pminsd	TANHF_DATA(_iMaxIdxMask)(%rip), %xmm3
+	psrld	$14, %xmm3
 
-L(EXIT):
-        addq      $72, %rsp
-        cfi_def_cfa_offset(8)
-        ret
-        cfi_def_cfa_offset(80)
+	movq	%xmm3, %rcx
+	movl	%ecx, %edx
+	shrq	$32, %rcx
 
-/* Branch to process
- * special inputs
- */
+	/* xmm8 contains mask of special values.  */
+	pcmpgtd	TANHF_DATA(_iExpMask)(%rip), %xmm8
 
-L(SPECIAL_VALUES_BRANCH):
-        movups    %xmm5, 32(%rsp)
-        movups    %xmm0, 48(%rsp)
-                                # LOE rbx rbp r12 r13 r14 r15 eax
-
-        xorl      %edx, %edx
-        movq      %r12, 16(%rsp)
-        cfi_offset(12, -64)
-        movl      %edx, %r12d
-        movq      %r13, 8(%rsp)
-        cfi_offset(13, -72)
-        movl      %eax, %r13d
-        movq      %r14, (%rsp)
-        cfi_offset(14, -80)
-                                # LOE rbx rbp r15 r12d r13d
-
-/* Range mask
- * bits check
- */
+	pshufd	$0x0e, %xmm3, %xmm3
+	movq	%xmm3, %rdi
+	movl	%edi, %esi
+	shrq	$32, %rdi
 
-L(RANGEMASK_CHECK):
-        btl       %r12d, %r13d
+	movaps	TANHF_DATA(_sAbsMask)(%rip), %xmm1
+	andps	%xmm1, %xmm0
 
-/* Call scalar math function */
-        jc        L(SCALAR_MATH_CALL)
-                                # LOE rbx rbp r15 r12d r13d
+	leaq	TANHF_DATA(_lookupTable)(%rip), %rax
+	movups	(%rdx, %rax), %xmm2
+	movups	(%rcx, %rax), %xmm6
 
-/* Special inputs
- * processing loop
- */
+	movaps	%xmm2, %xmm4
+	movlhps	%xmm6, %xmm4
+	unpckhpd %xmm6, %xmm2
 
-L(SPECIAL_VALUES_LOOP):
-        incl      %r12d
-        cmpl      $4, %r12d
-
-/* Check bits in range mask */
-        jl        L(RANGEMASK_CHECK)
-                                # LOE rbx rbp r15 r12d r13d
-
-        movq      16(%rsp), %r12
-        cfi_restore(12)
-        movq      8(%rsp), %r13
-        cfi_restore(13)
-        movq      (%rsp), %r14
-        cfi_restore(14)
-        movups    48(%rsp), %xmm0
-
-/* Go to exit */
-        jmp       L(EXIT)
-        cfi_offset(12, -64)
-        cfi_offset(13, -72)
-        cfi_offset(14, -80)
-                                # LOE rbx rbp r12 r13 r14 r15 xmm0
-
-/* Scalar math fucntion call
- * to process special input
- */
+	cvtps2pd %xmm0, %xmm6
+	movhlps	%xmm0, %xmm0
+	cvtps2pd %xmm0, %xmm0
 
-L(SCALAR_MATH_CALL):
-        movl      %r12d, %r14d
-        movss     32(%rsp,%r14,4), %xmm0
-        call      tanhf@PLT
-                                # LOE rbx rbp r14 r15 r12d r13d xmm0
+	movups	16(%rdx, %rax), %xmm5
+	movups	16(%rsi, %rax), %xmm13
 
-        movss     %xmm0, 48(%rsp,%r14,4)
+	movaps	%xmm5, %xmm10
+	movaps	%xmm13, %xmm11
 
-/* Process special inputs in loop */
-        jmp       L(SPECIAL_VALUES_LOOP)
-                                # LOE rbx rbp r15 r12d r13d
-END(_ZGVbN4v_tanhf_sse4)
+	movups	16(%rcx, %rax), %xmm7
+	movups	16(%rdi, %rax), %xmm3
+
+	unpckhpd %xmm7, %xmm5
+	unpckhpd %xmm3, %xmm13
+
+	mulpd	%xmm6, %xmm5
+	mulpd	%xmm0, %xmm13
+
+	movlhps	%xmm7, %xmm10
+	movlhps	%xmm3, %xmm11
+
+	addpd	%xmm10, %xmm5
+	addpd	%xmm11, %xmm13
+
+	mulpd	%xmm6, %xmm5
+	mulpd	%xmm0, %xmm13
+
+	addpd	%xmm2, %xmm5
 
-        .section .rodata, "a"
-        .align 16
-
-#ifdef __svml_stanh_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct
-{
-        __declspec(align(16)) VUINT32 _dbP[(134*4)][2];
-        __declspec(align(16)) VUINT32 _sSignMask[4][1];
-        __declspec(align(16)) VUINT32 _sAbsMask[4][1];
-        __declspec(align(16)) VUINT32 _iExpMantMask[4][1];
-        __declspec(align(16)) VUINT32 _iExpMask[4][1];
-        __declspec(align(16)) VUINT32 _iMinIdxOfsMask[4][1];
-        __declspec(align(16)) VUINT32 _iMaxIdxMask[4][1];
-} __svml_stanh_data_internal;
-#endif
-__svml_stanh_data_internal:
-        /* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500]. */
-        .quad 0x0000000000000000  /* A00 = +0.000000000000000000000e-01 */
-        .quad 0x3FF00000022C70EB  /* A01 = +1.000000008097283510367e+00 */
-        .quad 0xBED00E878CFFA194  /* A02 = -3.828228912518614443549e-06 */
-        .quad 0xBFD551766D0607A9  /* A03 = -3.330970825846813476723e-01 */
-        .quad 0xBE53D60CE3E4C297  /* A00 = -1.847383956330407336230e-08 */
-        .quad 0x3FF000024177CF5C  /* A01 = +1.000002151235967140508e+00 */
-        .quad 0xBF1758BC94A51A25  /* A02 = -8.906031613262943753568e-05 */
-        .quad 0xBFD53EAE67E0D4F0  /* A03 = -3.319507612644221339337e-01 */
-        .quad 0xBE5A9E47EF32D6FE  /* A00 = -2.479020984039698285657e-08 */
-        .quad 0x3FF00002DA983057  /* A01 = +1.000002721676556793895e+00 */
-        .quad 0xBF1BD953509E94AA  /* A02 = -1.062352277175377670507e-04 */
-        .quad 0xBFD53BDB562EEDD5  /* A03 = -3.317783681520414806876e-01 */
-        .quad 0xBE6191BBE496D294  /* A00 = -3.272532162914017685901e-08 */
-        .quad 0x3FF0000390492017  /* A01 = +1.000003398528866105366e+00 */
-        .quad 0xBF20727E814A57CE  /* A02 = -1.254825043772153972919e-04 */
-        .quad 0xBFD538DE060A6F22  /* A03 = -3.315959033004550748913e-01 */
-        .quad 0xBE66DAFA2A893A25  /* A00 = -4.257146219278012568149e-08 */
-        .quad 0x3FF0000465E08CD1  /* A01 = +1.000004194219219266770e+00 */
-        .quad 0xBF2341C765EF91B6  /* A02 = -1.469188600530365522261e-04 */
-        .quad 0xBFD535B6841FAF9E  /* A03 = -3.314033785124993469751e-01 */
-        .quad 0xBE6D5794E361E964  /* A00 = -5.465394929765249413434e-08 */
-        .quad 0x3FF000055EE2A0CB  /* A01 = +1.000005121846742950353e+00 */
-        .quad 0xBF265E6C77E66C8B  /* A02 = -1.706607253709506650304e-04 */
-        .quad 0xBFD53264DDCCEDA6  /* A03 = -3.312008062382240103361e-01 */
-        .quad 0xBE729C844D374A6E  /* A00 = -6.933284462462096107184e-08 */
-        .quad 0x3FF000067F019093  /* A01 = +1.000006195180536350264e+00 */
-        .quad 0xBF29CC5348D6DCE5  /* A02 = -1.968242326435338705130e-04 */
-        .quad 0xBFD52EE92121ED35  /* A03 = -3.309881995734998416658e-01 */
-        .quad 0xBE775AEA17EAA872  /* A00 = -8.700465590574974405858e-08 */
-        .quad 0x3FF00007CA1D66B8  /* A01 = +1.000007428656699559610e+00 */
-        .quad 0xBF2D8F5EB98A2637  /* A02 = -2.255252009216044881395e-04 */
-        .quad 0xBFD52B435CDF9128  /* A03 = -3.307655722585587376727e-01 */
-        .quad 0xBE7D04DA28C343F0  /* A00 = -1.081040272327705484794e-07 */
-        .quad 0x3FF000094443CCF5  /* A01 = +1.000008837375216730337e+00 */
-        .quad 0xBF30D5B76C947AE5  /* A02 = -2.568791210978817814332e-04 */
-        .quad 0xBFD52773A0776FAD  /* A03 = -3.305329386764651045105e-01 */
-        .quad 0xBE81DD77A12C51C7  /* A00 = -1.331054169875768625701e-07 */
-        .quad 0x3FF0000AF1AFD2DA  /* A01 = +1.000010437096696680470e+00 */
-        .quad 0xBF331230624C1680  /* A02 = -2.910011410651516805537e-04 */
-        .quad 0xBFD52379FC0B61DF  /* A03 = -3.302903138515186909352e-01 */
-        .quad 0xBE85D04EEEB3C435  /* A00 = -1.625247628488202841012e-07 */
-        .quad 0x3FF0000CD6C9B1F2  /* A01 = +1.000012244238970726684e+00 */
-        .quad 0xBF357F0742FADDD4  /* A02 = -3.280060509313874068243e-04 */
-        .quad 0xBFD51F56806D0E81  /* A03 = -3.300377134475880880338e-01 */
-        .quad 0xBE8A6E289B59681B  /* A00 = -1.969211333326924655065e-07 */
-        .quad 0x3FF0000EF8268F72  /* A01 = +1.000014275873550406715e+00 */
-        .quad 0xBF381E277A1B747A  /* A02 = -3.680082682942575423093e-04 */
-        .quad 0xBFD51B093F1D6FD4  /* A03 = -3.297751537663746734808e-01 */
-        .quad 0xBE8FCBC40EE9ABD5  /* A00 = -2.368983653301529373887e-07 */
-        .quad 0x3FF000115A883B6C  /* A01 = +1.000016549721943981410e+00 */
-        .quad 0xBF3AF17AC974B3D9  /* A02 = -4.111218235774406434303e-04 */
-        .quad 0xBFD516924A4C549C  /* A03 = -3.295026517456081105450e-01 */
-        .quad 0xBE92FFBC60A3F956  /* A00 = -2.831066871072026054144e-07 */
-        .quad 0x3FF0001402DCED8A  /* A01 = +1.000019084151832604590e+00 */
-        .quad 0xBF3DFAE9390C4801  /* A02 = -4.574603454311488280083e-04 */
-        .quad 0xBFD511F1B4D7DC3A  /* A03 = -3.292202249571719585575e-01 */
-        .quad 0xBE9690A22F96D5AD  /* A00 = -3.362443262393081632612e-07 */
-        .quad 0x3FF00016F63EFF5D  /* A01 = +1.000021898173108825247e+00 */
-        .quad 0xBF409E2C839605BB  /* A02 = -5.071370461992499986334e-04 */
-        .quad 0xBFD50D27924BEE00  /* A03 = -3.289278916051614487515e-01 */
-        .quad 0xBE9AA56C65E72A73  /* A00 = -3.970591019557469835586e-07 */
-        .quad 0x3FF0001A39F4A43E  /* A01 = +1.000025011433776978009e+00 */
-        .quad 0xBF425BD74C3D6667  /* A02 = -5.602647074553602319844e-04 */
-        .quad 0xBFD50833F6E1ABA2  /* A03 = -3.286256705238718156536e-01 */
-        .quad 0xBE9F4BD4FF1A83B0  /* A00 = -4.663500013744687071912e-07 */
-        .quad 0x3FF0001DD36F9EC2  /* A01 = +1.000028444215715683896e+00 */
-        .quad 0xBF44376634149405  /* A02 = -6.169556656102642569831e-04 */
-        .quad 0xBFD50316F77EDEE5  /* A03 = -3.283135811757190158922e-01 */
-        .quad 0xBEA3B625387BB079  /* A00 = -5.874486399249461304297e-07 */
-        .quad 0x3FF00023E14CFBA9  /* A01 = +1.000034217911642153709e+00 */
-        .quad 0xBF47392F923218D2  /* A02 = -7.087213783883111826306e-04 */
-        .quad 0xBFD4FB1FACDEB938  /* A03 = -3.278273761924483942209e-01 */
-        .quad 0xBEAA6E24F543500A  /* A00 = -7.876828740601738750574e-07 */
-        .quad 0x3FF0002D5C6E8412  /* A01 = +1.000043259679163742959e+00 */
-        .quad 0xBF4BAF02BD7FDD70  /* A02 = -8.448375110664940040861e-04 */
-        .quad 0xBFD4EFEE6527A7DE  /* A03 = -3.271442401734229177279e-01 */
-        .quad 0xBEB16E3EBE2157D0  /* A00 = -1.038947396133402500647e-06 */
-        .quad 0x3FF00038990FEE2F  /* A01 = +1.000053975962952312884e+00 */
-        .quad 0xBF50569481C574CB  /* A02 = -9.972048056490652716971e-04 */
-        .quad 0xBFD4E419278DA2B4  /* A03 = -3.264220129263251113372e-01 */
-        .quad 0xBEB6A7B6723165D4  /* A00 = -1.350350836279403750524e-06 */
-        .quad 0x3FF00045CAB4158E  /* A01 = +1.000066558657042303793e+00 */
-        .quad 0xBF531D7C9C849108  /* A02 = -1.166698160951775212202e-03 */
-        .quad 0xBFD4D7A0BB33B152  /* A03 = -3.256608799117844954552e-01 */
-        .quad 0xBEBD0EE2A8654AFD  /* A00 = -1.732000471561702711532e-06 */
-        .quad 0x3FF00055276F18D6  /* A01 = +1.000081209219890521211e+00 */
-        .quad 0xBF562FDBA3FB6C6C  /* A02 = -1.354183666925102939860e-03 */
-        .quad 0xBFD4CA85F1B93DB2  /* A03 = -3.248610363561638125773e-01 */
-        .quad 0xBEC269D4036A207E  /* A00 = -2.195047297096822741730e-06 */
-        .quad 0x3FF00066E7DA6E4E  /* A01 = +1.000098138500919997540e+00 */
-        .quad 0xBF5991499FC36B3A  /* A02 = -1.560518167983372759405e-03 */
-        .quad 0xBFD4BCC9A72283D6  /* A03 = -3.240226871658341556426e-01 */
-        .quad 0xBEC7154B6C09CFE1  /* A00 = -2.751729738565190291276e-06 */
-        .quad 0x3FF0007B47086B80  /* A01 = +1.000117566559055148900e+00 */
-        .quad 0xBF5D455433B4F8F4  /* A02 = -1.786548832412968197680e-03 */
-        .quad 0xBFD4AE6CC1BFE145  /* A03 = -3.231460468373550942722e-01 */
-        .quad 0xBECCA68CC64A0F8A  /* A00 = -3.415415948561670285790e-06 */
-        .quad 0x3FF00092827742F7  /* A01 = +1.000139722473418535387e+00 */
-        .quad 0xBF60A7BF15A527AF  /* A02 = -2.033112728132522705610e-03 */
-        .quad 0xBFD49F703214084C  /* A03 = -3.222313393636155876010e-01 */
-        .quad 0xBED19E68676B241B  /* A00 = -4.200644630977303616698e-06 */
-        .quad 0x3FF000ACDA037B26  /* A01 = +1.000164844146362863597e+00 */
-        .quad 0xBF62D99F836A02F8  /* A02 = -2.301036405072284102280e-03 */
-        .quad 0xBFD48FD4F2B91B28  /* A03 = -3.212787981359945810311e-01 */
-        .quad 0xBED57CF4B0C7AA54  /* A00 = -5.123164339408145209103e-06 */
-        .quad 0x3FF000CA8FD9E1A1  /* A01 = +1.000193178099017865534e+00 */
-        .quad 0xBF653A014548E686  /* A02 = -2.591135484433962181405e-03 */
-        .quad 0xBFD47F9C0844B38F  /* A03 = -3.202886658426046806447e-01 */
-        .quad 0xBEDA012B1B1A41E2  /* A00 = -6.199971197454598722328e-06 */
-        .quad 0x3FF000EBE868FDF4  /* A01 = +1.000224979259539459520e+00 */
-        .quad 0xBF67CA9427E0A544  /* A02 = -2.904214255086275467410e-03 */
-        .quad 0xBFD46EC6812ADB37  /* A03 = -3.192611943626845749655e-01 */
-        .quad 0xBEDF3EAC5BF12194  /* A00 = -7.449344990702664567927e-06 */
-        .quad 0x3FF001112A520784  /* A01 = +1.000260510744255704196e+00 */
-        .quad 0xBF6A8D01ABDA4DC4  /* A02 = -3.241065277345108255891e-03 */
-        .quad 0xBFD45D55759FFA4A  /* A03 = -3.181966446572103146551e-01 */
-        .quad 0xBEE2A541BC274267  /* A00 = -8.890883582164319970972e-06 */
-        .quad 0x3FF0013A9E5961F2  /* A01 = +1.000300043631906721231e+00 */
-        .quad 0xBF6D82ECD080C540  /* A02 = -3.602468994380686462264e-03 */
-        .quad 0xBFD44B4A0779C0AD  /* A03 = -3.170952866557950611259e-01 */
-        .quad 0xBEE61D97609A27F4  /* A00 = -1.054553560499505625520e-05 */
-        .quad 0x3FF001688F56A3AF  /* A01 = +1.000343856731187974773e+00 */
-        .quad 0xBF7056F8EFB683EC  /* A02 = -3.989193351487490407647e-03 */
-        .quad 0xBFD438A5620F0F74  /* A03 = -3.159573991399533543500e-01 */
-        .quad 0xBEEA145429EDD370  /* A00 = -1.243563138839952927732e-05 */
-        .quad 0x3FF0019B4A242A67  /* A01 = +1.000392236341804297339e+00 */
-        .quad 0xBF7207D31CA78D9B  /* A02 = -4.401993423445739288258e-03 */
-        .quad 0xBFD42568BA16E7CD  /* A03 = -3.147832696228050619602e-01 */
-        .quad 0xBEEE96370D52680F  /* A00 = -1.458491207477835326165e-05 */
-        .quad 0x3FF001D31D8E4115  /* A01 = +1.000445476009251821736e+00 */
-        .quad 0xBF73D4CC11EDC094  /* A02 = -4.841611050196221316400e-03 */
-        .quad 0xBFD411954D8664E7  /* A03 = -3.135731942252974469021e-01 */
-        .quad 0xBEF338C046215EF8  /* A00 = -1.833122622260562810219e-05 */
-        .quad 0x3FF00230C32C2EC1  /* A01 = +1.000534784691737621998e+00 */
-        .quad 0xBF76BD019BCC5DAF  /* A02 = -5.551344188254799492943e-03 */
-        .quad 0xBFD3F2C7156DC21E  /* A03 = -3.116929730668135389848e-01 */
-        .quad 0xBEF9B15EAE411EAE  /* A00 = -2.450261207822986676092e-05 */
-        .quad 0x3FF002C2DF057A4D  /* A01 = +1.000674124886830940184e+00 */
-        .quad 0xBF7B08CCD9AC1E30  /* A02 = -6.600189396301511801646e-03 */
-        .quad 0xBFD3C7A7A114FED8  /* A03 = -3.090609620157755976777e-01 */
-        .quad 0xBF00E36483C373B3  /* A00 = -3.221178528332122595812e-05 */
-        .quad 0x3FF0036F419480D7  /* A01 = +1.000838524028997644777e+00 */
-        .quad 0xBF7FD255D1777007  /* A02 = -7.768950679260206403087e-03 */
-        .quad 0xBFD39A453911D6CE  /* A03 = -3.062909180947429588215e-01 */
-        .quad 0xBF05DFA04DD12059  /* A00 = -4.172046622180685472624e-05 */
-        .quad 0x3FF00438B2A03D8D  /* A01 = +1.001030633695197069599e+00 */
-        .quad 0xBF828F8DBB4A9D10  /* A02 = -9.062869337255224921890e-03 */
-        .quad 0xBFD36AAB704697D9  /* A03 = -3.033856007044711255993e-01 */
-        .quad 0xBF0BF3E0C647DEFB  /* A00 = -5.331544597092331081714e-05 */
-        .quad 0x3FF005221063D36D  /* A01 = +1.001253189109060359741e+00 */
-        .quad 0xBF857A2CB3C96102  /* A02 = -1.048693584122917590862e-02 */
-        .quad 0xBFD338E65BBB4FEC  /* A03 = -3.003478904549854444639e-01 */
-        .quad 0xBF11A506ED7C9D31  /* A00 = -6.730894835681591541979e-05 */
-        .quad 0x3FF0062E4D0EA92A  /* A01 = +1.001508999829250345925e+00 */
-        .quad 0xBF88AB82C2761AF3  /* A02 = -1.204588085125866091241e-02 */
-        .quad 0xBFD305028D6BD206  /* A03 = -2.971807843271395688234e-01 */
-        .quad 0xBF1607C0922D9BF1  /* A00 = -8.403885708006799337092e-05 */
-        .quad 0x3FF007606C341961  /* A01 = +1.001800940198869449560e+00 */
-        .quad 0xBF8C25E6DA487BCF  /* A02 = -1.374416688582682892494e-02 */
-        .quad 0xBFD2CF0D0EE8F7B5  /* A03 = -2.938873906713255768075e-01 */
-        .quad 0xBF1B3A8480A0A16D  /* A00 = -1.038688061788578038307e-04 */
-        .quad 0x3FF008BB802D02D6  /* A01 = +1.002131939589323561535e+00 */
-        .quad 0xBF8FEB8AE99FD100  /* A02 = -1.558598065819483124983e-02 */
-        .quad 0xBFD297135BD0911B  /* A03 = -2.904709240558688843059e-01 */
-        .quad 0xBF20ABB9BDB75C65  /* A00 = -1.271881327357976163798e-04 */
-        .quad 0x3FF00A42A76D8CD1  /* A01 = +1.002504972472525901495e+00 */
-        .quad 0xBF91FF3D752BB9E6  /* A02 = -1.757522609380570560722e-02 */
-        .quad 0xBFD25D235C1F88B4  /* A03 = -2.869346999779154305799e-01 */
-        .quad 0xBF243D3254425461  /* A00 = -1.544116913733432829448e-04 */
-        .quad 0x3FF00BF909D1795E  /* A01 = +1.002923048355647051011e+00 */
-        .quad 0xBF94304E04D44942  /* A02 = -1.971551804042204897316e-02 */
-        .quad 0xBFD2214B5E61CFA6  /* A03 = -2.832821294498394371075e-01 */
-        .quad 0xBF286070011B61CE  /* A00 = -1.859795307186510085994e-04 */
-        .quad 0x3FF00DE1D5E1627E  /* A01 = +1.003389201612804537689e+00 */
-        .quad 0xBF9689D5F4163F59  /* A02 = -2.201017668045266231780e-02 */
-        .quad 0xBFD1E39A11C3B42C  /* A03 = -2.795167134743816728104e-01 */
-        .quad 0xBF2D250B366A79E8  /* A00 = -2.223564326486314902259e-04 */
-        .quad 0x3FF010003E134001  /* A01 = +1.003906481248123094829e+00 */
-        .quad 0xBF990C9FF91F6F81  /* A02 = -2.446222265267250853271e-02 */
-        .quad 0xBFD1A41E80084CDC  /* A03 = -2.756420374218586655246e-01 */
-        .quad 0xBF314DB5DDC2A30E  /* A00 = -2.640313157465248123865e-04 */
-        .quad 0x3FF012577608921B  /* A01 = +1.004477940624503018441e+00 */
-        .quad 0xBF9BB9626875B0C9  /* A02 = -2.707437288829409385849e-02 */
-        .quad 0xBFD162E80768A9D0  /* A03 = -2.716617653228725615122e-01 */
-        .quad 0xBF346A6133808864  /* A00 = -3.115165050094957730625e-04 */
-        .quad 0x3FF014EAAFCC88A3  /* A01 = +1.005106627192198898157e+00 */
-        .quad 0xBF9E90BEF9BF7419  /* A02 = -2.984903716411588595059e-02 */
-        .quad 0xBFD12006545F7FAD  /* A03 = -2.675796340899932457269e-01 */
-        .quad 0xBF37F180DC3848EA  /* A00 = -3.653468704395550778821e-04 */
-        .quad 0x3FF017BD19147861  /* A01 = +1.005795572250939295955e+00 */
-        .quad 0xBFA0C9A14C702E07  /* A02 = -3.278831537326359207851e-02 */
-        .quad 0xBFD0DB895B650092  /* A03 = -2.633994476818851682154e-01 */
-        .quad 0xBF3BEC6AAC6D7635  /* A00 = -4.260788377246944457107e-04 */
-        .quad 0x3FF01AD1D884E719  /* A01 = +1.006547780778822565040e+00 */
-        .quad 0xBFA260B2A1B1434A  /* A02 = -3.589399551186163439542e-02 */
-        .quad 0xBFD09581529E93D6  /* A03 = -2.591250712233067465817e-01 */
-        .quad 0xBF4164E26167882B  /* A00 = -5.308251737086202562063e-04 */
-        .quad 0x3FF01FEF14B62B81  /* A01 = +1.007796364693348545316e+00 */
-        .quad 0xBFA4EB014538AA42  /* A02 = -4.085544557559163403315e-02 */
-        .quad 0xBFD029D36FEAF41F  /* A03 = -2.525528519580024222613e-01 */
-        .quad 0xBF46F6FFF4E53DC8  /* A00 = -7.008313930700277652464e-04 */
-        .quad 0x3FF027CBB51CBBA0  /* A01 = +1.009715754956893363214e+00 */
-        .quad 0xBFA89DEC9FEC112E  /* A02 = -4.807986690687680864098e-02 */
-        .quad 0xBFCF2A99464D0DB4  /* A03 = -2.434875100390009317053e-01 */
-        .quad 0xBF4DCC9C4F66A4D9  /* A00 = -9.094012482836712945103e-04 */
-        .quad 0x3FF030E7CFCCD583  /* A01 = +1.011939822882909068014e+00 */
-        .quad 0xBFACAA3B95814081  /* A02 = -5.598627281199331645611e-02 */
-        .quad 0xBFCDF78F156BE7CF  /* A03 = -2.341173987004467604844e-01 */
-        .quad 0xBF5308ED74E5C7A6  /* A00 = -1.161796466103906435435e-03 */
-        .quad 0x3FF03B5986412ECB  /* A01 = +1.014489674026594512313e+00 */
-        .quad 0xBFB087EBA88DCC3F  /* A02 = -6.457398285947223148806e-02 */
-        .quad 0xBFCCBB9BD134862F  /* A03 = -2.244753619680052991736e-01 */
-        .quad 0xBF57FA23C00DF4B5  /* A00 = -1.463446533505758208674e-03 */
-        .quad 0x3FF0473558A1BCC0  /* A01 = +1.017384859292903342975e+00 */
-        .quad 0xBFB2E702BC6360EF  /* A02 = -7.383744334527241048871e-02 */
-        .quad 0xBFCB77D546379288  /* A03 = -2.145945160729250122955e-01 */
-        .quad 0xBF5DD12971557F71  /* A00 = -1.819887610814388068450e-03 */
-        .quad 0x3FF0548DDF5000A8  /* A01 = +1.020643112482540360020e+00 */
-        .quad 0xBFB571B63DA186E1  /* A02 = -8.376635555898871710045e-02 */
-        .quad 0xBFCA2D5202605148  /* A03 = -2.045080672838912594358e-01 */
-        .quad 0xBF6252B1AD5D4F17  /* A00 = -2.236697221556737096709e-03 */
-        .quad 0x3FF063738A910BF7  /* A01 = +1.024280110622155737232e+00 */
-        .quad 0xBFB8270C8E6B601B  /* A02 = -9.434584118878357184013e-02 */
-        .quad 0xBFC8DD27D950A07E  /* A03 = -1.942491351230763441116e-01 */
-        .quad 0xBF66470C91730CFC  /* A00 = -2.719425723258004842786e-03 */
-        .quad 0x3FF073F468FCF331  /* A01 = +1.028309259519300633556e+00 */
-        .quad 0xBFBB05C2952191E4  /* A02 = -1.055566419686964629854e-01 */
-        .quad 0xBFC7886A770DE2BD  /* A03 = -1.838505822486435070662e-01 */
-        .quad 0xBF6AD114AC8E98EC  /* A00 = -3.273525599485007861467e-03 */
-        .quad 0x3FF0861BF53E5226  /* A01 = +1.032741506559554434119e+00 */
-        .quad 0xBFBE0C4F9B461507  /* A02 = -1.173753503881763554650e-01 */
-        .quad 0xBFC6302A037CDE3A  /* A03 = -1.733448521642786954722e-01 */
-        .quad 0xBF6FFBDE2A6C2AF8  /* A00 = -3.904279630096648551207e-03 */
-        .quad 0x3FF099F2EB8E7DA3  /* A01 = +1.037585182326304034106e+00 */
-        .quad 0xBFC09C74D192DDF0  /* A02 = -1.297746680554463516444e-01 */
-        .quad 0xBFC4D571D8E3079F  /* A03 = -1.627638157861470424859e-01 */
-        .quad 0xBF72E8FDC0B952AA  /* A00 = -4.616728994353872309042e-03 */
-        .quad 0x3FF0AF7F273C9533  /* A01 = +1.042845872181101141152e+00 */
-        .quad 0xBFC244C512736F10  /* A02 = -1.427236881344176033792e-01 */
-        .quad 0xBFC379474F58B902  /* A03 = -1.521386277613104298645e-01 */
-        .quad 0xBF762EABAF17395B  /* A00 = -5.415602341101023557701e-03 */
-        .quad 0x3FF0C6C3886F63FB  /* A01 = +1.048526318502125631582e+00 */
-        .quad 0xBFC3FDF9918EA12A  /* A02 = -1.561881981590514389957e-01 */
-        .quad 0xBFC21CA89ECAB895  /* A03 = -1.414995932913753196036e-01 */
-        .quad 0xBF79D387CE5B2BAE  /* A00 = -6.305246822828998107258e-03 */
-        .quad 0x3FF0DFBFE2346376  /* A01 = +1.054626353847394337748e+00 */
-        .quad 0xBFC5C6DA43602620  /* A02 = -1.701309994680721970894e-01 */
-        .quad 0xBFC0C08BD8DB6631  /* A03 = -1.308760460731704100557e-01 */
-        .quad 0xBF7DDBA8E8DA9060  /* A00 = -7.289562037531366334164e-03 */
-        .quad 0x3FF0FA70F0D1B464  /* A01 = +1.061142864894713433443e+00 */
-        .quad 0xBFC79E18D92BAA7C  /* A02 = -1.845122394946264732241e-01 */
-        .quad 0xBFBECBBBF74C2669  /* A03 = -1.202962378266875381749e-01 */
-        .quad 0xBF81254E76EA25DA  /* A00 = -8.371937755572145950511e-03 */
-        .quad 0x3FF116D05835EBD0  /* A01 = +1.068069786618014660462e+00 */
-        .quad 0xBFC982539E2ED224  /* A02 = -1.992897531869327609755e-01 */
-        .quad 0xBFBC1B043C350159  /* A03 = -1.097872397413132278254e-01 */
-        .quad 0xBF8391ACBA863403  /* A00 = -9.555196230190082448686e-03 */
-        .quad 0x3FF134D4AA477FE2  /* A01 = +1.075398125794884141015e+00 */
-        .quad 0xBFCB7218609FEAFB  /* A02 = -2.144194099235717521079e-01 */
-        .quad 0xBFB970A16CB88329  /* A03 = -9.937485603633135211599e-02 */
-        .quad 0xBF87935088E48E8B  /* A00 = -1.151144902957603431692e-02 */
-        .quad 0x3FF1649892AD7DD3  /* A01 = +1.087059567413110938716e+00 */
-        .quad 0xBFCE6971DDE75409  /* A02 = -2.375929196847723912089e-01 */
-        .quad 0xBFB58291E88CB251  /* A03 = -8.402358939628952472223e-02 */
-        .quad 0xBF8DB3A62C325325  /* A00 = -1.450280973794233242702e-02 */
-        .quad 0x3FF1A9C900C6DEEA  /* A01 = +1.103951457056548068891e+00 */
-        .quad 0xBFD13DBC65B0E08E  /* A02 = -2.693930619311765140012e-01 */
-        .quad 0xBFB06696F62696D1  /* A03 = -6.406539449252625362252e-02 */
-        .quad 0xBF92583699F2E27A  /* A00 = -1.791463198307716858659e-02 */
-        .quad 0x3FF1F451B85AA9F0  /* A01 = +1.122148246892376022288e+00 */
-        .quad 0xBFD34FD5F8288180  /* A02 = -3.017477916164565954205e-01 */
-        .quad 0xBFA6FB692825B683  /* A03 = -4.488686194495718900788e-02 */
-        .quad 0xBF9641C26E673D6F  /* A00 = -2.173522757385398448959e-02 */
-        .quad 0x3FF24364DA5E2B07  /* A01 = +1.141453602790251542487e+00 */
-        .quad 0xBFD564A5A5EF5890  /* A02 = -3.342680092295120530821e-01 */
-        .quad 0xBF9B43712011A982  /* A03 = -2.662445791467283467968e-02 */
-        .quad 0xBF9A901038EC2F39  /* A00 = -2.594018313816024226548e-02 */
-        .quad 0x3FF2961356DFFEBA  /* A01 = +1.161639537196534011088e+00 */
-        .quad 0xBFD775EBB17198C7  /* A02 = -3.665723069046972759644e-01 */
-        .quad 0xBF833B1A926CD462  /* A03 = -9.390075295963199591975e-03 */
-        .quad 0xBF9F396A6A461B91  /* A00 = -3.049246095317987084727e-02 */
-        .quad 0x3FF2EB53BAEF534B  /* A01 = +1.182452898229899629357e+00 */
-        .quad 0xBFD97DABF8AD8BBD  /* A02 = -3.982953957076310058660e-01 */
-        .quad 0x3F7B8F6A3E0F8837  /* A03 = +6.728568086119371925713e-03 */
-        .quad 0xBFA21878590F8BAA  /* A00 = -3.534294211546946951064e-02 */
-        .quad 0x3FF34209790236E1  /* A01 = +1.203622315111197105253e+00 */
-        .quad 0xBFDB764C0E71BECB  /* A02 = -4.290952817018306997277e-01 */
-        .quad 0x3F962FE0C03F84C0  /* A03 = +2.166701482190513949888e-02 */
-        .quad 0xBFA4B36B9AD27ECC  /* A00 = -4.043136849327097492868e-02 */
-        .quad 0x3FF3990C5B12FC16  /* A01 = +1.224865298994477935679e+00 */
-        .quad 0xBFDD5AABB0D01390  /* A02 = -4.586590983092770912322e-01 */
-        .quad 0x3FA21DAF5CA162DB  /* A03 = +3.538272863142363083844e-02 */
-        .quad 0xBFA7645E4D7BF28B  /* A00 = -4.568762489177399105378e-02 */
-        .quad 0x3FF3EF2FD51C0D9F  /* A01 = +1.245895225962932562069e+00 */
-        .quad 0xBFDF26377E1B686E  /* A02 = -4.867075664057044503963e-01 */
-        .quad 0x3FA8803E756EE812  /* A03 = +4.785342391501513914509e-02 */
-        .quad 0xBFAA210925C64413  /* A00 = -5.103329263796054643398e-02 */
-        .quad 0x3FF44349F897D8E7  /* A01 = +1.266427966181760345066e+00 */
-        .quad 0xBFE06A7B02C6D8E2  /* A02 = -5.129981092675530707226e-01 */
-        .quad 0x3FAE3F194734F5D0  /* A03 = +5.907515520309980505687e-02 */
-        .quad 0xBFACDE48F8A19BBB  /* A00 = -5.638340029764018351832e-02 */
-        .quad 0x3FF49439D5466582  /* A01 = +1.286187966447272845727e+00 */
-        .quad 0xBFE131C7C1063DDC  /* A02 = -5.373266954429101183166e-01 */
-        .quad 0x3FB1ADEEC36AD805  /* A03 = +6.906025191241844940482e-02 */
-        .quad 0xBFAF905D8F585680  /* A00 = -6.164829611604449866036e-02 */
-        .quad 0x3FF4E0ED1FD27F99  /* A01 = +1.304913639360142818546e+00 */
-        .quad 0xBFE1E7A859DC1D3D  /* A02 = -5.595285182070380836095e-01 */
-        .quad 0x3FB3ED018E4642A1  /* A03 = +7.783517573831001679086e-02 */
-        .quad 0xBFB11595104160BA  /* A00 = -6.673556944713512906198e-02 */
-        .quad 0x3FF528650340490B  /* A01 = +1.322361958217302513319e+00 */
-        .quad 0xBFE28B14B40BC974  /* A02 = -5.794776455425521000109e-01 */
-        .quad 0x3FB5DF49F5BAF6D7  /* A03 = +8.543836831355676453281e-02 */
-        .quad 0xBFB2513A97344BA4  /* A00 = -7.155195418844911836587e-02 */
-        .quad 0x3FF569BA0DB5EE14  /* A01 = +1.338312200124055273420e+00 */
-        .quad 0xBFE31B53A8B67B20  /* A02 = -5.970857901737396389308e-01 */
-        .quad 0x3FB787F297BB0544  /* A03 = +9.191814617499455275507e-02 */
-        .quad 0xBFB37512E848FAFA  /* A00 = -7.600515528700305112331e-02 */
-        .quad 0x3FF5A41F33B403C8  /* A01 = +1.352568819013173495591e+00 */
-        .quad 0xBFE397F6EA9A58A5  /* A02 = -6.123003561103997904880e-01 */
-        .quad 0x3FB8EAA9FF25CA06  /* A03 = +9.733068923177520814782e-02 */
-        .quad 0xBFB47B3E603AFC5D  /* A00 = -8.000554894805263217439e-02 */
-        .quad 0x3FF5D6E3EDE40487  /* A01 = +1.364963464031718975988e+00 */
-        .quad 0xBFE400D5BCA6D631  /* A02 = -6.251019177058819709103e-01 */
-        .quad 0x3FBA0B830ED567FE  /* A03 = +1.017381583418739132707e-01 */
-        .quad 0xBFB5BBFE8AC90496  /* A00 = -8.489981544791400103200e-02 */
-        .quad 0x3FF612BA70107E95  /* A01 = +1.379572332145390989311e+00 */
-        .quad 0xBFE477EAF1FA7693  /* A02 = -6.396383978023599814478e-01 */
-        .quad 0x3FBB4784B7C08A95  /* A03 = +1.065600346196709652391e-01 */
-        .quad 0xBFB6D5D940743939  /* A00 = -8.920057128509463473254e-02 */
-        .quad 0x3FF644A8748F70CE  /* A01 = +1.391762214006166953340e+00 */
-        .quad 0xBFE4D646AB07EA37  /* A02 = -6.511567440459832267763e-01 */
-        .quad 0x3FBC354F4E1D5292  /* A03 = +1.101884427747086558913e-01 */
-        .quad 0xBFB7223D19E4F3D1  /* A00 = -9.036619074045339206069e-02 */
-        .quad 0x3FF6518FEB42B7FA  /* A01 = +1.394912642466350494175e+00 */
-        .quad 0xBFE4ED86CB87498C  /* A02 = -6.539949393430091184598e-01 */
-        .quad 0x3FBC6D29F28CCA9B  /* A03 = +1.110407082713131127205e-01 */
-        .quad 0xBFB6878652FF6312  /* A00 = -8.800544287022329936754e-02 */
-        .quad 0x3FF63948C302D040  /* A01 = +1.388985406648330922508e+00 */
-        .quad 0xBFE4C4E2E7904E17  /* A02 = -6.490339777687407218920e-01 */
-        .quad 0x3FBC127356CA1ABE  /* A03 = +1.096565329445224612481e-01 */
-        .quad 0xBFB4F5D18B0C91D6  /* A00 = -8.187589306596207427980e-02 */
-        .quad 0x3FF5FD27EB7DD0B8  /* A01 = +1.374305648697413673176e+00 */
-        .quad 0xBFE464E01A2B2FC6  /* A02 = -6.373138915164353601739e-01 */
-        .quad 0x3FBB460547674A30  /* A03 = +1.065371798825160976065e-01 */
-        .quad 0xBFB26642FA16A685  /* A00 = -7.187288861919156890412e-02 */
-        .quad 0x3FF59F9BEDE1C95A  /* A01 = +1.351467065073470141812e+00 */
-        .quad 0xBFE3D67920C8FBEA  /* A02 = -6.199308052381387046381e-01 */
-        .quad 0x3FBA24F6A8D3CBC1  /* A03 = +1.021265184570401413078e-01 */
-        .quad 0xBFADB5294794F097  /* A00 = -5.802277563859197656582e-02 */
-        .quad 0x3FF523EA7B9CF453  /* A01 = +1.321268542159732772845e+00 */
-        .quad 0xBFE322A8B55E35DB  /* A02 = -5.979808370918208160205e-01 */
-        .quad 0x3FB8C8673B1B3E37  /* A03 = +9.680791085269722928697e-02 */
-        .quad 0xBFA4B7D661965C6A  /* A00 = -4.046506825687219699450e-02 */
-        .quad 0x3FF48DE3E2CE3122  /* A01 = +1.284641157110919085227e+00 */
-        .quad 0xBFE251FED1A7F445  /* A02 = -5.725092024655472622285e-01 */
-        .quad 0x3FB745699FCABDB9  /* A03 = +9.090290213747821701507e-02 */
-        .quad 0xBF93E60456E4EE1D  /* A00 = -1.943213253365004902773e-02 */
-        .quad 0x3FF3E1A14E628A59  /* A01 = +1.242585474196536532432e+00 */
-        .quad 0xBFE16C5AB660E876  /* A02 = -5.444768488007543094653e-01 */
-        .quad 0x3FB5AD33AA8C188F  /* A03 = +8.467410005332197397987e-02 */
-        .quad 0x3F738C17C47C7961  /* A00 = +4.772274820224659853951e-03 */
-        .quad 0x3FF3234DDE3BD146  /* A01 = +1.196119182682268355933e+00 */
-        .quad 0xBFE078C0D77A9D3B  /* A02 = -5.147403915952176722826e-01 */
-        .quad 0x3FB40D74B3E276B8  /* A03 = +7.833032027925923568290e-02 */
-        .quad 0x3FA0474BECC689C7  /* A00 = +3.179394975019849550746e-02 */
-        .quad 0x3FF256FB4FA7D18A  /* A01 = +1.146235762743432307076e+00 */
-        .quad 0xBFDEFA8E3FB285E2  /* A02 = -4.840427038235174395098e-01 */
-        .quad 0x3FB270C007493D59  /* A03 = +7.203293016322244446403e-02 */
-        .quad 0x3FAF5BD51E479BDC  /* A00 = +6.124750132203590768931e-02 */
-        .quad 0x3FF18081D0B53BC5  /* A01 = +1.093873801484492647162e+00 */
-        .quad 0xBFDCFE2439BD0C03  /* A02 = -4.530115665294831006626e-01 */
-        .quad 0x3FB0DEFE5A45AFDD  /* A03 = +6.590261176978580437424e-02 */
-        .quad 0x3FB7BD5D2806EA26  /* A00 = +9.273321368429118805032e-02 */
-        .quad 0x3FF0A369E35B4440  /* A01 = +1.039895904647224256223e+00 */
-        .quad 0xBFDB04BC5C9951E7  /* A02 = -4.221640495573226181669e-01 */
-        .quad 0x3FAEBBBAA9D6DEEF  /* A03 = +6.002600978120919278380e-02 */
-        .quad 0x3FC01BE411098DBC  /* A00 = +1.258511622610124502941e-01 */
-        .quad 0x3FEF85BDABC031C1  /* A01 = +9.850757936961188621083e-01 */
-        .quad 0xBFD91521375097C2  /* A02 = -3.919146576102968682065e-01 */
-        .quad 0x3FABE26F0086D982  /* A03 = +5.446192628317005068883e-02 */
-        .quad 0x3FC481D7FF5776B9  /* A00 = +1.602125164781023347604e-01 */
-        .quad 0x3FEDC3506C1E7218  /* A01 = +9.300920592973538347792e-01 */
-        .quad 0xBFD7349A88DA7D4F  /* A02 = -3.625856720409119104964e-01 */
-        .quad 0x3FA936E2DFF8E2AE  /* A03 = +4.924687370334389358018e-02 */
-        .quad 0x3FC90471F96FA27A  /* A00 = +1.954481571149420671141e-01 */
-        .quad 0x3FEC0451601987A2  /* A01 = +8.755270840595026360376e-01 */
-        .quad 0xBFD5671CD4B898DC  /* A02 = -3.344184949259110251063e-01 */
-        .quad 0x3FA6BB9594603B67  /* A03 = +4.439990459660841243261e-02 */
-        .quad 0x3FCFD8ADB9ED944C  /* A00 = +2.488000066615846384011e-01 */
-        .quad 0x3FE978C073F6809A  /* A01 = +7.959902062321078108909e-01 */
-        .quad 0xBFD2DF7E00BCD5A9  /* A02 = -2.948908812716931060471e-01 */
-        .quad 0x3FA3614033D490B2  /* A03 = +3.785133965200894456959e-02 */
-        .quad 0x3FD4846A12AFE5A0  /* A00 = +3.205819303981005674586e-01 */
-        .quad 0x3FE63A1147D40472  /* A01 = +6.945883181471244061100e-01 */
-        .quad 0xBFCFA2268AD34450  /* A02 = -2.471359422548027318101e-01 */
-        .quad 0x3F9F150201D9FFE0  /* A03 = +3.035357605267552383310e-02 */
-        .quad 0x3FD9018641F82BEB  /* A00 = +3.907180446846598154131e-01 */
-        .quad 0x3FE33B7C220FFBDC  /* A01 = +6.010113396913498995389e-01 */
-        .quad 0xBFCA4E4187E29C86  /* A02 = -2.055131829740483584423e-01 */
-        .quad 0x3F98C30CED19F8F4  /* A03 = +2.418155858185229434287e-02 */
-        .quad 0x3FDD4B8255BEB078  /* A00 = +4.577337109901757905561e-01 */
-        .quad 0x3FE0858B19D3A49B  /* A01 = +5.163016800335243905451e-01 */
-        .quad 0xBFC5BC929EACE564  /* A02 = -1.698172831327539045176e-01 */
-        .quad 0x3F93A083CE57DE2B  /* A03 = +1.916700312537337677621e-02 */
-        .quad 0x3FE0A8E5E039295C  /* A00 = +5.206174258576470315063e-01 */
-        .quad 0x3FDC35E1234583FE  /* A01 = +4.407885403107342225937e-01 */
-        .quad 0xBFC1DE034E31AEB9  /* A02 = -1.395877963835710222629e-01 */
-        .quad 0x3F8EFDEBB3471BDC  /* A03 = +1.513275280821162888101e-02 */
-        .quad 0x3FE2851B603CB2A5  /* A00 = +5.787484054213406503564e-01 */
-        .quad 0x3FD7F4A44ABBB286  /* A01 = +3.743067483726821853551e-01 */
-        .quad 0xBFBD3EEB67087DE7  /* A02 = -1.142413260026767657385e-01 */
-        .quad 0x3F8864F38329E8BD  /* A03 = +1.191129917173260922836e-02 */
-        .quad 0x3FE437DBE3C34AC1  /* A00 = +6.318187187665317283702e-01 */
-        .quad 0x3FD43F6F789441B5  /* A01 = +3.163717916040938438194e-01 */
-        .quad 0xBFB7D92E7901B9A4  /* A02 = -9.315767721429907277653e-02 */
-        .quad 0x3F8327ED342308E1  /* A03 = +9.353497651663324544136e-03 */
-        .quad 0x3FE5C0977766D55C  /* A00 = +6.797597248138731451661e-01 */
-        .quad 0x3FD10B42A764D8F9  /* A01 = +2.663122782427219115142e-01 */
-        .quad 0xBFB3633351D3D70F  /* A02 = -7.573242900602060456716e-02 */
-        .quad 0x3F7E079E30FF899C  /* A03 = +7.331483779099558922843e-03 */
-        .quad 0x3FE7202CE08A88C4  /* A00 = +7.226776490754436288455e-01 */
-        .quad 0x3FCC973EB5662B01  /* A01 = +2.233656297433626314319e-01 */
-        .quad 0xBFAF70A455F9920B  /* A02 = -6.140626477716545211782e-02 */
-        .quad 0x3F77812411CE99B6  /* A03 = +5.738392731393584730859e-03 */
-        .quad 0x3FE85879424095B1  /* A00 = +7.608000082006382003286e-01 */
-        .quad 0x3FC7E73BD1674D84  /* A01 = +1.867441914060742336190e-01 */
-        .quad 0xBFA96F84E4BF333B  /* A02 = -4.967894832916504993525e-02 */
-        .quad 0x3F72606DDCA6E117  /* A03 = +4.486493251924870105662e-03 */
-        .quad 0x3FE96BFE4957F4DD  /* A00 = +7.944327766887472330737e-01 */
-        .quad 0x3FC3ED4780D25478  /* A01 = +1.556786898624158421711e-01 */
-        .quad 0xBFA489C5F9A56B58  /* A02 = -4.011362717093075458408e-02 */
-        .quad 0x3F6CB5DC17E9AD2A  /* A03 = +3.504686231556104931972e-03 */
-        .quad 0x3FEA5D9CB2F41234  /* A00 = +8.239272589858672724006e-01 */
-        .quad 0x3FC091A758374DCF  /* A01 = +1.294449978582705440555e-01 */
-        .quad 0xBFA08E436D4B5CE0  /* A02 = -3.233538350257858517978e-02 */
-        .quad 0x3F666997AD53E6B7  /* A03 = +2.735897297154145629133e-03 */
-        .quad 0x3FEB3060342CB850  /* A00 = +8.496552485501158713532e-01 */
-        .quad 0x3FBB7D30BBC7DC1B  /* A01 = +1.073790033768634993860e-01 */
-        .quad 0xBF9AA6BA3443D9E3  /* A02 = -2.602663940430173170060e-02 */
-        .quad 0x3F617CA764B7850B  /* A03 = +2.134634914668814050648e-03 */
-        .quad 0x3FEBE759A6A0C7B8  /* A00 = +8.719909910635044170135e-01 */
-        .quad 0x3FB6C10DE6A703FF  /* A01 = +8.888327485239243264115e-02 */
-        .quad 0xBF956C566D8BE1F6  /* A02 = -2.092108768099084498138e-02 */
-        .quad 0x3F5B46D1A4A59CF8  /* A03 = +1.664833764687232917079e-03 */
-        .quad 0x3FEC858494887A04  /* A00 = +8.912985707318630268503e-01 */
-        .quad 0x3FB2CC31F543394D  /* A01 = +7.342827070099140762682e-02 */
-        .quad 0xBF9133477FF69137  /* A02 = -1.679717749142747504343e-02 */
-        .quad 0x3F5544482FBB4DA5  /* A03 = +1.298017973501022466823e-03 */
-        .quad 0x3FED0DB59D0E32E9  /* A00 = +9.079235141267335551518e-01 */
-        .quad 0x3FAF006BAFFC6EF4  /* A01 = +6.055008433597022787787e-02 */
-        .quad 0xBF8B97146FA2B97A  /* A02 = -1.347175565419144252499e-02 */
-        .quad 0x3F5093B01F4CDC69  /* A03 = +1.011774057770665211434e-03 */
-        .quad 0x3FEDB487C3EC457C  /* A00 = +9.282873942012623835751e-01 */
-        .quad 0x3FA7390C09D0BD1D  /* A01 = +4.535710925881118044112e-02 */
-        .quad 0xBF83D9F7C3181106  /* A02 = -9.693084374710735778846e-03 */
-        .quad 0x3F46E34A0A3C0E64  /* A03 = +6.984817050299072134500e-04 */
-        .quad 0x3FEE5FFCB4E6EB00  /* A00 = +9.492171796076434020506e-01 */
-        .quad 0x3F9F4913ED00AADF  /* A01 = +3.055220731782070861526e-02 */
-        .quad 0xBF79670BD0E59B5C  /* A02 = -6.201788097633133961528e-03 */
-        .quad 0x3F3BC998EBCAF96D  /* A03 = +4.240034429975534616304e-04 */
-        .quad 0x3FEEDBA41E9542FE  /* A00 = +9.643116566968215064293e-01 */
-        .quad 0x3F94F5DD18D9C24D  /* A01 = +2.046914543319848858727e-02 */
-        .quad 0xBF7034896AA122B9  /* A02 = -3.956352980886528904192e-03 */
-        .quad 0x3F30DCCB47810B39  /* A03 = +2.573009765038273091199e-04 */
-        .quad 0x3FEF33F2882520ED  /* A00 = +9.750912341196716903724e-01 */
-        .quad 0x3F8BF37F2CF553FF  /* A01 = +1.364802699996836392315e-02 */
-        .quad 0xBF649F6F05A69619  /* A02 = -2.517430152880317534986e-03 */
-        .quad 0x3F247623C950AAC9  /* A03 = +1.561087307505231250044e-04 */
-        .quad 0x3FEF727757751741  /* A00 = +9.827229221489021115943e-01 */
-        .quad 0x3F828E67912C4400  /* A01 = +9.060677640748693306705e-03 */
-        .quad 0xBF5A2F51A806CC2C  /* A02 = -1.598195784123355826789e-03 */
-        .quad 0x3F18D35D7687E613  /* A03 = +9.470231965016282719549e-05 */
-        .quad 0x3FEF9E6325C5942A  /* A00 = +9.880843866091073568469e-01 */
-        .quad 0x3F788AB117618F76  /* A01 = +5.991641772286606867914e-03 */
-        .quad 0xBF5096EAB0B1EA89  /* A02 = -1.012543859160305046233e-03 */
-        .quad 0x3F0E1E50EC4435AB  /* A03 = +5.744633156910412119652e-05 */
-        .quad 0x3FEFBD0784049369  /* A00 = +9.918248728250605994461e-01 */
-        .quad 0x3F702BBD8294035F  /* A01 = +3.947963975634432264028e-03 */
-        .quad 0xBF44FB55E0F00593  /* A02 = -6.403130845457509273330e-04 */
-        .quad 0x3F0244DCD723230A  /* A03 = +3.484534217219031730379e-05 */
-        .quad 0x3FEFD245E2366A43  /* A00 = +9.944180887426415926811e-01 */
-        .quad 0x3F653D82EC088433  /* A01 = +2.592807490387838333795e-03 */
-        .quad 0xBF3A7DF75E013CB8  /* A02 = -4.042366908878036561859e-04 */
-        .quad 0x3EF6298E69F991CD  /* A03 = +2.113564425911141559972e-05 */
-        .quad 0x3FEFE0EAA508BC69  /* A00 = +9.962056372950317539861e-01 */
-        .quad 0x3F5BD0771AF3FDDA  /* A01 = +1.697651208644282514598e-03 */
-        .quad 0xBF30B2E1254DE571  /* A02 = -2.548026725928887099328e-04 */
-        .quad 0x3EEAE28B70EC0256  /* A03 = +1.281973848454955042307e-05 */
-        .quad 0x3FEFEAF5303D7F96  /* A00 = +9.974313680831865536192e-01 */
-        .quad 0x3F5229111365657E  /* A01 = +1.108423877289460134782e-03 */
-        .quad 0xBF250572D04DFE66  /* A02 = -1.603796628408704519168e-04 */
-        .quad 0x3EE04E89BB57C981  /* A03 = +7.775682983689149966743e-06 */
-        .quad 0x3FEFF1CF52F1CF44  /* A00 = +9.982678051005469122003e-01 */
-        .quad 0x3F47A71316147CEB  /* A01 = +7.218211359577819110842e-04 */
-        .quad 0xBF1A6D7604055719  /* A02 = -1.008132248946049582547e-04 */
-        .quad 0x3ED3C8047586A85C  /* A03 = +4.716233739913014633626e-06 */
-        .quad 0x3FEFF6770369EF69  /* A00 = +9.988360468555416149528e-01 */
-        .quad 0x3F3EBB261180FBF0  /* A01 = +4.689186039321105101130e-04 */
-        .quad 0xBF1097754FE19D7F  /* A02 = -6.329206004950480057066e-05 */
-        .quad 0x3EC7FEFF83BCA0A7  /* A03 = +2.860556404988488738366e-06 */
-        .quad 0x3FEFF99D42371AC4  /* A00 = +9.992204945818561334647e-01 */
-        .quad 0x3F33EB2AEC271F59  /* A01 = +3.039340773764907474054e-04 */
-        .quad 0xBF04CF18E0FC0D79  /* A02 = -3.968996690952969588805e-05 */
-        .quad 0x3EBD1BDBD6019BE9  /* A03 = +1.735021065507727833886e-06 */
-        .quad 0x3FEFFBBCA32B0D91  /* A00 = +9.994795977476532700123e-01 */
-        .quad 0x3F29C41E1615110A  /* A01 = +1.965796209707565346710e-04 */
-        .quad 0xBEFA11F93D9DCB5A  /* A02 = -2.486248909101414873235e-05 */
-        .quad 0x3EB1A7CA4546F7A7  /* A03 = +1.052345642723709228769e-06 */
-        .quad 0x3FEFFD298B8E8DE2  /* A00 = +9.996535993308806045121e-01 */
-        .quad 0x3F20A1C42D523C5B  /* A01 = +1.268913244172078754520e-04 */
-        .quad 0xBEF0507A364AFAE4  /* A02 = -1.555859070622834605755e-05 */
-        .quad 0x3EA56ACA17E7CDF4  /* A03 = +6.382806956848098872313e-07 */
-        .quad 0x3FEFFE1DC82BA5A3  /* A00 = +9.997700604991915929176e-01 */
-        .quad 0x3F156E73B90F1769  /* A01 = +8.175450626798714452801e-05 */
-        .quad 0xBEE4663579D0A09F  /* A02 = -9.727122057226747625365e-06 */
-        .quad 0x3E99FAF6FEC5D4C1  /* A03 = +3.871371052824002996020e-07 */
-        .quad 0x3FEFFEF8D0BB5E81  /* A00 = +9.998745037837154514548e-01 */
-        .quad 0x3F06686DA18D39C3  /* A01 = +4.273972098777251447726e-05 */
-        .quad 0xBED46BC298073E90  /* A02 = -4.868731025855742842491e-06 */
-        .quad 0x3E88E42286B9D0FD  /* A03 = +1.854535328530838170114e-07 */
-        .quad 0x3FEFFF8DBC68DDC7  /* A00 = +9.999455146670975791423e-01 */
-        .quad 0x3EF26B2953A80AF0  /* A01 = +1.756534514108903368909e-05 */
-        .quad 0xBEBFC4472D580F83  /* A02 = -1.893443529411295465239e-06 */
-        .quad 0x3E72505B4553D19F  /* A03 = +6.822456673547912277047e-08 */
-        .quad 0x3FEFFFCED1276609  /* A00 = +9.999765477215883935358e-01 */
-        .quad 0x3EDE1A94C7CC58F5  /* A01 = +7.177313020153979672606e-06 */
-        .quad 0xBEA8A2C988744E57  /* A02 = -7.342066660497443762363e-07 */
-        .quad 0x3E5AF30036BBBAF4  /* A03 = +2.509841882843541084885e-08 */
-        .quad 0x3FEFFFEAFE70FCFC  /* A00 = +9.999899835164849370983e-01 */
-        .quad 0x3EC879175E3549F5  /* A01 = +2.917410471128503564412e-06 */
-        .quad 0xBE930E36677D1813  /* A02 = -2.839493400307523115929e-07 */
-        .quad 0x3E43D4005B42D48F  /* A03 = +9.233192745401904898013e-09 */
-        .quad 0x3ff0000000000000
-        .quad 0x0000000000000000
-        .quad 0x0000000000000000
-        .quad 0x0000000000000000
-        .align 16
-        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000           /* _sSignMask        */
-        .align 16
-        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff           /* _sAbsMask         */
-        .align 16
-        .long 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000           /* _iExpMantMask     */
-        .align 16
-        .long 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000           /* _iExpMask         */
-        .align 16
-        .long 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000           /* _iMinIdxOfsMask   */
-        .align 16
-        .long 0x04280000, 0x04280000, 0x04280000, 0x04280000           /* _iMaxIdxMask      */
-        .align 16
-        .type	__svml_stanh_data_internal,@object
-        .size	__svml_stanh_data_internal,.-__svml_stanh_data_internal
+	movups	(%rsi, %rax), %xmm2
+	movups	(%rdi, %rax), %xmm7
+
+	movaps	%xmm2, %xmm3
+
+	unpckhpd %xmm7, %xmm2
+	movlhps	%xmm7, %xmm3
+
+	addpd	%xmm13, %xmm2
+
+	mulpd	%xmm5, %xmm6
+	addpd	%xmm4, %xmm6
+
+	mulpd	%xmm2, %xmm0
+	addpd	%xmm3, %xmm0
+
+	cvtpd2ps %xmm0, %xmm2
+	cvtpd2ps %xmm6, %xmm0
+
+	movlhps	%xmm2, %xmm0
+	andnps	%xmm12, %xmm1
+	orps	%xmm1, %xmm0
+
+	movmskps %xmm8, %edx
+	testl	%edx, %edx
+
+	/* Go to special inputs processing branch.  */
+	jne	L(SPECIAL_VALUES_BRANCH)
+
+	/* No stack restoration on the fastpath.  */
+	ret
+
+L(SPECIAL_VALUES_BRANCH):
+	/* Cold path: every lane whose bit is set in edx is recomputed with a
+	   scalar tanhf call.  The fast path above returns with a bare `ret`,
+	   so rsp still holds its entry value here, i.e. it is 8 mod 16.
+	   Reserve 56 bytes (48 used + 8 padding) so that rsp is 16-byte
+	   aligned at the call, as the x86-64 psABI requires.
+
+	   Frame layout (CFA = rsp + 64):
+	      0(%rsp)  vector result from xmm0, patched one lane at a time
+	     16(%rsp)  copy of xmm12, source of the scalar arguments
+	     32(%rsp)  saved r12
+	     40(%rsp)  saved r13
+	     48(%rsp)  padding.  */
+	subq	$56, %rsp
+	cfi_def_cfa_offset (64)
+
+	movups	%xmm0, (%rsp)
+	movups	%xmm12, 16(%rsp)
+
+	movq	%r12, 32(%rsp)
+	cfi_offset (r12, -32)
+	movq	%r13, 40(%rsp)
+	cfi_offset (r13, -24)
+
+	/* edx has 1s where there was a special value that needs to be handled
+	   by a tanhf call.  Keep the mask in r13d so it survives the calls.  */
+	movl	%edx, %r13d
+L(SPECIAL_VALUES_LOOP):
+	/* r12 = index of the lowest lane still pending.  It lives in a
+	   callee-saved register because it is needed again after tanhf
+	   returns; avoiding that register would cost extra instructions in
+	   the loop, so one extra save/restore was judged the better tradeoff.
+	   r13d is never zero here (the branch is only taken with a non-zero
+	   mask and the loop only repeats while bits remain), so bsf always
+	   has a set bit to find; the xor presumably just avoids a dependency
+	   on the previous value of r12.  */
+	xorl	%r12d, %r12d
+	bsfl	%r13d, %r12d
+
+	/* Scalar math function call to process special input.  */
+	movss	16(%rsp, %r12, 4), %xmm0
+	call	tanhf@PLT
+	/* No good way to avoid the store-forwarding stall that the final
+	   16-byte reload of (%rsp) will hit after this 4-byte store.
+	   `lfence` avoids it but at greater cost as it serializes the
+	   stack/callee-save restoration.  */
+	movss	%xmm0, (%rsp, %r12, 4)
+
+	/* Clear the lowest set bit of the mask; loop while lanes remain.  */
+	leal	-1(%r13), %eax
+	andl	%eax, %r13d
+	jnz	L(SPECIAL_VALUES_LOOP)
+
+	/* All results have been written to (%rsp).  */
+	movups	(%rsp), %xmm0
+	movq	32(%rsp), %r12
+	cfi_restore (r12)
+	movq	40(%rsp), %r13
+	cfi_restore (r13)
+	addq	$56, %rsp
+	cfi_def_cfa_offset (8)
+	ret
+END(_ZGVbN4v_tanhf_sse4)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
index 3745db5aa4..90c3ea4cc6 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
@@ -70,775 +70,171 @@ 
  *
  */
 
-/* Offsets for data table __svml_stanh_data_internal
- */
-#define _dbP                          	0
-#define _sSignMask                    	4288
-#define _sAbsMask                     	4320
-#define _iExpMantMask                 	4352
-#define _iExpMask                     	4384
-#define _iMinIdxOfsMask               	4416
-#define _iMaxIdxMask                  	4448
-
 #include <sysdep.h>
+#include "svml_s_tanhf_rodata.S"
 
         .text
 	.section .text.avx2,"ax",@progbits
 ENTRY(_ZGVdN8v_tanhf_avx2)
-        pushq     %rbp
-        cfi_def_cfa_offset(16)
-        movq      %rsp, %rbp
-        cfi_def_cfa(6, 16)
-        cfi_offset(6, -16)
-        andq      $-32, %rsp
-        pushq     %r12
-        subq      $120, %rsp
-        lea       _dbP+16+__svml_stanh_data_internal(%rip), %r10
-        vmovaps   %ymm0, %ymm12
-
-/* Here huge arguments, INF and NaNs are filtered out to callout. */
-        vpand     _iExpMantMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm14
+	/* Here huge arguments, INF and NaNs are filtered out to callout.  */
+	vpand	TANHF_DATA(_iExpMantMask)(%rip), %ymm0, %ymm4
+	vpsubd	TANHF_DATA(_iMinIdxOfsMask)(%rip), %ymm4, %ymm2
 
-/*
- *  small table specific variables *
- *  Constant loading
- */
-        vmovups   _iMaxIdxMask+__svml_stanh_data_internal(%rip), %ymm8
-        vpsubd    _iMinIdxOfsMask+__svml_stanh_data_internal(%rip), %ymm14, %ymm9
-
-/* if VMIN, VMAX is defined for I type */
-        vxorps    %ymm15, %ymm15, %ymm15
-        vpcmpgtd  %ymm15, %ymm9, %ymm0
-        vpand     %ymm0, %ymm9, %ymm7
-        vpcmpgtd  %ymm8, %ymm9, %ymm6
-        vblendvps %ymm6, %ymm8, %ymm7, %ymm3
-        vpsrld    $14, %ymm3, %ymm1
-        vpcmpgtd  _iExpMask+__svml_stanh_data_internal(%rip), %ymm14, %ymm13
-        vmovmskps %ymm13, %r11d
-        vandps    _sAbsMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm10
-        vandps    _sSignMask+__svml_stanh_data_internal(%rip), %ymm12, %ymm11
-        vextractf128 $1, %ymm1, %xmm2
-        vmovd     %xmm1, %r9d
-        vmovd     %xmm2, %ecx
-        vpextrd   $1, %xmm2, %edx
-        vpextrd   $1, %xmm1, %r8d
-        movslq    %r9d, %r9
-        movslq    %edx, %rdx
-        movslq    %r8d, %r8
-        vpextrd   $2, %xmm1, %edi
-        movslq    %ecx, %rcx
-        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus)  */
-        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
-        vpextrd   $3, %xmm2, %r12d
-        vpextrd   $3, %xmm1, %esi
-        vpextrd   $2, %xmm2, %eax
-        movslq    %edi, %rdi
-        movslq    %r12d, %r12
-        movslq    %esi, %rsi
-        movslq    %eax, %rax
-        vmovupd   -16(%r9,%r10), %xmm5
-        vmovupd   -16(%rdx,%r10), %xmm14
-        vmovupd   -16(%rcx,%r10), %xmm13
-        vmovupd   (%r9,%r10), %xmm1
-        vmovupd   (%r8,%r10), %xmm2
-        vmovupd   -16(%r8,%r10), %xmm4
-        vinsertf128 $1, -16(%rdi,%r10), %ymm5, %ymm15
-        vinsertf128 $1, -16(%r12,%r10), %ymm14, %ymm3
-        vinsertf128 $1, -16(%rax,%r10), %ymm13, %ymm6
-        vinsertf128 $1, (%rdi,%r10), %ymm1, %ymm5
-        vinsertf128 $1, (%rsi,%r10), %ymm2, %ymm14
-        vunpcklpd %ymm3, %ymm6, %ymm8
-        vunpckhpd %ymm3, %ymm6, %ymm6
-        vunpcklpd %ymm14, %ymm5, %ymm3
-        vunpckhpd %ymm14, %ymm5, %ymm2
-        vmovupd   (%rcx,%r10), %xmm13
-        vcvtps2pd %xmm10, %ymm5
-        vextractf128 $1, %ymm10, %xmm10
-        vfmadd213pd %ymm3, %ymm5, %ymm2
-        vinsertf128 $1, -16(%rsi,%r10), %ymm4, %ymm0
-        vmovupd   (%rdx,%r10), %xmm4
-        vunpcklpd %ymm0, %ymm15, %ymm9
-        vunpckhpd %ymm0, %ymm15, %ymm7
-        vfmadd213pd %ymm7, %ymm5, %ymm2
-        vfmadd213pd %ymm9, %ymm5, %ymm2
-        vinsertf128 $1, (%r12,%r10), %ymm4, %ymm0
-        vcvtps2pd %xmm10, %ymm4
-        vinsertf128 $1, (%rax,%r10), %ymm13, %ymm15
-        vunpcklpd %ymm0, %ymm15, %ymm1
-        vunpckhpd %ymm0, %ymm15, %ymm0
-        vfmadd213pd %ymm1, %ymm4, %ymm0
-        vcvtpd2ps %ymm2, %xmm1
-        vfmadd213pd %ymm6, %ymm4, %ymm0
-        vfmadd213pd %ymm8, %ymm4, %ymm0
-        vcvtpd2ps %ymm0, %xmm0
-        vinsertf128 $1, %xmm0, %ymm1, %ymm2
-        vorps     %ymm11, %ymm2, %ymm0
-        testl     %r11d, %r11d
-
-/* Go to special inputs processing branch */
-        jne       L(SPECIAL_VALUES_BRANCH)
-                                # LOE rbx r13 r14 r15 r11d ymm0 ymm12
-
-/* Restore registers
- * and exit the function
- */
+	/* Selection of arguments between [0, 0x04280000] into ymm2.  */
+	vpxor	%ymm3, %ymm3, %ymm3
+	vpmaxsd	%ymm3, %ymm2, %ymm2
+	vpminsd	TANHF_DATA(_iMaxIdxMask)(%rip), %ymm2, %ymm2
 
-L(EXIT):
-        addq      $120, %rsp
-        cfi_restore(12)
-        popq      %r12
-        movq      %rbp, %rsp
-        popq      %rbp
-        cfi_def_cfa(7, 8)
-        cfi_restore(6)
-        ret
-        cfi_def_cfa(6, 16)
-        cfi_offset(6, -16)
-        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -8; DW_OP_plus)  */
-        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
-
-/* Branch to process
- * special inputs
- */
+	vpsrld	$14, %ymm2, %ymm1
 
-L(SPECIAL_VALUES_BRANCH):
-        vmovups   %ymm12, 32(%rsp)
-        vmovups   %ymm0, 64(%rsp)
-                                # LOE rbx r13 r14 r15 r11d ymm0
-
-        xorl      %r12d, %r12d
-                                # LOE rbx r13 r14 r15 r11d r12d
-
-        vzeroupper
-        movq      %r13, 8(%rsp)
-        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
-        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
-        movl      %r11d, %r13d
-        movq      %r14, (%rsp)
-        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
-        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
-                                # LOE rbx r15 r12d r13d
-
-/* Range mask
- * bits check
- */
+	/* Store special cases in ymm15.  */
+	vpcmpgtd TANHF_DATA(_iExpMask)(%rip), %ymm4, %ymm15
 
-L(RANGEMASK_CHECK):
-        btl       %r12d, %r13d
 
-/* Call scalar math function */
-        jc        L(SCALAR_MATH_CALL)
-                                # LOE rbx r15 r12d r13d
+	/* Store base of lookup table in rax.  */
+	leaq	TANHF_DATA(_lookupTable)(%rip), %rax
 
-/* Special inputs
- * processing loop
- */
+	/* We are splitting ymm1 into 8 GPRs. This may be faster to do with
+	   store/load as we can take advantage of store-forwarding.  */
+	vmovq	%xmm1, %r8
+	/* We have eliminated all negative values for ymm1 so no need to sign
+	   extend.  */
+	movl	%r8d, %r9d
+	shrq	$32, %r8
 
-L(SPECIAL_VALUES_LOOP):
-        incl      %r12d
-        cmpl      $8, %r12d
-
-/* Check bits in range mask */
-        jl        L(RANGEMASK_CHECK)
-                                # LOE rbx r15 r12d r13d
-
-        movq      8(%rsp), %r13
-        cfi_restore(13)
-        movq      (%rsp), %r14
-        cfi_restore(14)
-        vmovups   64(%rsp), %ymm0
-
-/* Go to exit */
-        jmp       L(EXIT)
-        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus)  */
-        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
-        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus)  */
-        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
-                                # LOE rbx r13 r14 r15 ymm0
-
-/* Scalar math fucntion call
- * to process special input
- */
+	/* Instead of using cross-lane permutes on ymm vectors, use vinsertf128
+	   with memory operand. This helps alleviate bottleneck on p5.  */
+	vmovdqu	16(%r9, %rax), %xmm5
 
-L(SCALAR_MATH_CALL):
-        movl      %r12d, %r14d
-        movss     32(%rsp,%r14,4), %xmm0
-        call      tanhf@PLT
-                                # LOE rbx r14 r15 r12d r13d xmm0
+	vpextrq	$1, %xmm1, %rsi
+	movl	%esi, %edi
+	shrq	$32, %rsi
 
-        movss     %xmm0, 64(%rsp,%r14,4)
+	vinsertf128 $1, 16(%rdi, %rax), %ymm5, %ymm5
 
-/* Process special inputs in loop */
-        jmp       L(SPECIAL_VALUES_LOOP)
-                                # LOE rbx r15 r12d r13d
-END(_ZGVdN8v_tanhf_avx2)
+	vextracti128 $1, %ymm1, %xmm2
+	vmovq	%xmm2, %rdx
+	movl	%edx, %ecx
+	shrq	$32, %rdx
+
+	vmovdqu	(%rcx, %rax), %xmm6
+
+	vpextrq	$1, %xmm2, %r10
+	movl	%r10d, %r11d
+	shrq	$32, %r10
+
+	vinsertf128 $1, (%r11, %rax), %ymm6, %ymm6
+
+	vmovupd	16(%r8, %rax), %xmm1
+	vinsertf128 $1, 16(%rsi, %rax), %ymm1, %ymm1
+	vmovupd	(%rdx, %rax), %xmm3
+	vinsertf128 $1, (%r10, %rax), %ymm3, %ymm3
+
+	vunpcklpd %ymm3, %ymm6, %ymm7
+	vunpckhpd %ymm3, %ymm6, %ymm6
+
+	vunpcklpd %ymm1, %ymm5, %ymm3
+	vunpckhpd %ymm1, %ymm5, %ymm1
+
+	vmovaps	TANHF_DATA(_sAbsMask)(%rip), %ymm11
+	vandps	%ymm11, %ymm0, %ymm4
 
-        .section .rodata, "a"
-        .align 32
-
-#ifdef __svml_stanh_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct
-{
-        __declspec(align(32)) VUINT32 _dbP[(134*4)][2];
-        __declspec(align(32)) VUINT32 _sSignMask[8][1];
-        __declspec(align(32)) VUINT32 _sAbsMask[8][1];
-        __declspec(align(32)) VUINT32 _iExpMantMask[8][1];
-        __declspec(align(32)) VUINT32 _iExpMask[8][1];
-        __declspec(align(32)) VUINT32 _iMinIdxOfsMask[8][1];
-        __declspec(align(32)) VUINT32 _iMaxIdxMask[8][1];
-} __svml_stanh_data_internal;
-#endif
-__svml_stanh_data_internal:
-        /* Pol_000:  err=7.93e-09, x in [0.0000000; 0.0312500]. */
-        .quad 0x0000000000000000  /* A00 = +0.000000000000000000000e-01 */
-        .quad 0x3FF00000022C70EB  /* A01 = +1.000000008097283510367e+00 */
-        .quad 0xBED00E878CFFA194  /* A02 = -3.828228912518614443549e-06 */
-        .quad 0xBFD551766D0607A9  /* A03 = -3.330970825846813476723e-01 */
-        .quad 0xBE53D60CE3E4C297  /* A00 = -1.847383956330407336230e-08 */
-        .quad 0x3FF000024177CF5C  /* A01 = +1.000002151235967140508e+00 */
-        .quad 0xBF1758BC94A51A25  /* A02 = -8.906031613262943753568e-05 */
-        .quad 0xBFD53EAE67E0D4F0  /* A03 = -3.319507612644221339337e-01 */
-        .quad 0xBE5A9E47EF32D6FE  /* A00 = -2.479020984039698285657e-08 */
-        .quad 0x3FF00002DA983057  /* A01 = +1.000002721676556793895e+00 */
-        .quad 0xBF1BD953509E94AA  /* A02 = -1.062352277175377670507e-04 */
-        .quad 0xBFD53BDB562EEDD5  /* A03 = -3.317783681520414806876e-01 */
-        .quad 0xBE6191BBE496D294  /* A00 = -3.272532162914017685901e-08 */
-        .quad 0x3FF0000390492017  /* A01 = +1.000003398528866105366e+00 */
-        .quad 0xBF20727E814A57CE  /* A02 = -1.254825043772153972919e-04 */
-        .quad 0xBFD538DE060A6F22  /* A03 = -3.315959033004550748913e-01 */
-        .quad 0xBE66DAFA2A893A25  /* A00 = -4.257146219278012568149e-08 */
-        .quad 0x3FF0000465E08CD1  /* A01 = +1.000004194219219266770e+00 */
-        .quad 0xBF2341C765EF91B6  /* A02 = -1.469188600530365522261e-04 */
-        .quad 0xBFD535B6841FAF9E  /* A03 = -3.314033785124993469751e-01 */
-        .quad 0xBE6D5794E361E964  /* A00 = -5.465394929765249413434e-08 */
-        .quad 0x3FF000055EE2A0CB  /* A01 = +1.000005121846742950353e+00 */
-        .quad 0xBF265E6C77E66C8B  /* A02 = -1.706607253709506650304e-04 */
-        .quad 0xBFD53264DDCCEDA6  /* A03 = -3.312008062382240103361e-01 */
-        .quad 0xBE729C844D374A6E  /* A00 = -6.933284462462096107184e-08 */
-        .quad 0x3FF000067F019093  /* A01 = +1.000006195180536350264e+00 */
-        .quad 0xBF29CC5348D6DCE5  /* A02 = -1.968242326435338705130e-04 */
-        .quad 0xBFD52EE92121ED35  /* A03 = -3.309881995734998416658e-01 */
-        .quad 0xBE775AEA17EAA872  /* A00 = -8.700465590574974405858e-08 */
-        .quad 0x3FF00007CA1D66B8  /* A01 = +1.000007428656699559610e+00 */
-        .quad 0xBF2D8F5EB98A2637  /* A02 = -2.255252009216044881395e-04 */
-        .quad 0xBFD52B435CDF9128  /* A03 = -3.307655722585587376727e-01 */
-        .quad 0xBE7D04DA28C343F0  /* A00 = -1.081040272327705484794e-07 */
-        .quad 0x3FF000094443CCF5  /* A01 = +1.000008837375216730337e+00 */
-        .quad 0xBF30D5B76C947AE5  /* A02 = -2.568791210978817814332e-04 */
-        .quad 0xBFD52773A0776FAD  /* A03 = -3.305329386764651045105e-01 */
-        .quad 0xBE81DD77A12C51C7  /* A00 = -1.331054169875768625701e-07 */
-        .quad 0x3FF0000AF1AFD2DA  /* A01 = +1.000010437096696680470e+00 */
-        .quad 0xBF331230624C1680  /* A02 = -2.910011410651516805537e-04 */
-        .quad 0xBFD52379FC0B61DF  /* A03 = -3.302903138515186909352e-01 */
-        .quad 0xBE85D04EEEB3C435  /* A00 = -1.625247628488202841012e-07 */
-        .quad 0x3FF0000CD6C9B1F2  /* A01 = +1.000012244238970726684e+00 */
-        .quad 0xBF357F0742FADDD4  /* A02 = -3.280060509313874068243e-04 */
-        .quad 0xBFD51F56806D0E81  /* A03 = -3.300377134475880880338e-01 */
-        .quad 0xBE8A6E289B59681B  /* A00 = -1.969211333326924655065e-07 */
-        .quad 0x3FF0000EF8268F72  /* A01 = +1.000014275873550406715e+00 */
-        .quad 0xBF381E277A1B747A  /* A02 = -3.680082682942575423093e-04 */
-        .quad 0xBFD51B093F1D6FD4  /* A03 = -3.297751537663746734808e-01 */
-        .quad 0xBE8FCBC40EE9ABD5  /* A00 = -2.368983653301529373887e-07 */
-        .quad 0x3FF000115A883B6C  /* A01 = +1.000016549721943981410e+00 */
-        .quad 0xBF3AF17AC974B3D9  /* A02 = -4.111218235774406434303e-04 */
-        .quad 0xBFD516924A4C549C  /* A03 = -3.295026517456081105450e-01 */
-        .quad 0xBE92FFBC60A3F956  /* A00 = -2.831066871072026054144e-07 */
-        .quad 0x3FF0001402DCED8A  /* A01 = +1.000019084151832604590e+00 */
-        .quad 0xBF3DFAE9390C4801  /* A02 = -4.574603454311488280083e-04 */
-        .quad 0xBFD511F1B4D7DC3A  /* A03 = -3.292202249571719585575e-01 */
-        .quad 0xBE9690A22F96D5AD  /* A00 = -3.362443262393081632612e-07 */
-        .quad 0x3FF00016F63EFF5D  /* A01 = +1.000021898173108825247e+00 */
-        .quad 0xBF409E2C839605BB  /* A02 = -5.071370461992499986334e-04 */
-        .quad 0xBFD50D27924BEE00  /* A03 = -3.289278916051614487515e-01 */
-        .quad 0xBE9AA56C65E72A73  /* A00 = -3.970591019557469835586e-07 */
-        .quad 0x3FF0001A39F4A43E  /* A01 = +1.000025011433776978009e+00 */
-        .quad 0xBF425BD74C3D6667  /* A02 = -5.602647074553602319844e-04 */
-        .quad 0xBFD50833F6E1ABA2  /* A03 = -3.286256705238718156536e-01 */
-        .quad 0xBE9F4BD4FF1A83B0  /* A00 = -4.663500013744687071912e-07 */
-        .quad 0x3FF0001DD36F9EC2  /* A01 = +1.000028444215715683896e+00 */
-        .quad 0xBF44376634149405  /* A02 = -6.169556656102642569831e-04 */
-        .quad 0xBFD50316F77EDEE5  /* A03 = -3.283135811757190158922e-01 */
-        .quad 0xBEA3B625387BB079  /* A00 = -5.874486399249461304297e-07 */
-        .quad 0x3FF00023E14CFBA9  /* A01 = +1.000034217911642153709e+00 */
-        .quad 0xBF47392F923218D2  /* A02 = -7.087213783883111826306e-04 */
-        .quad 0xBFD4FB1FACDEB938  /* A03 = -3.278273761924483942209e-01 */
-        .quad 0xBEAA6E24F543500A  /* A00 = -7.876828740601738750574e-07 */
-        .quad 0x3FF0002D5C6E8412  /* A01 = +1.000043259679163742959e+00 */
-        .quad 0xBF4BAF02BD7FDD70  /* A02 = -8.448375110664940040861e-04 */
-        .quad 0xBFD4EFEE6527A7DE  /* A03 = -3.271442401734229177279e-01 */
-        .quad 0xBEB16E3EBE2157D0  /* A00 = -1.038947396133402500647e-06 */
-        .quad 0x3FF00038990FEE2F  /* A01 = +1.000053975962952312884e+00 */
-        .quad 0xBF50569481C574CB  /* A02 = -9.972048056490652716971e-04 */
-        .quad 0xBFD4E419278DA2B4  /* A03 = -3.264220129263251113372e-01 */
-        .quad 0xBEB6A7B6723165D4  /* A00 = -1.350350836279403750524e-06 */
-        .quad 0x3FF00045CAB4158E  /* A01 = +1.000066558657042303793e+00 */
-        .quad 0xBF531D7C9C849108  /* A02 = -1.166698160951775212202e-03 */
-        .quad 0xBFD4D7A0BB33B152  /* A03 = -3.256608799117844954552e-01 */
-        .quad 0xBEBD0EE2A8654AFD  /* A00 = -1.732000471561702711532e-06 */
-        .quad 0x3FF00055276F18D6  /* A01 = +1.000081209219890521211e+00 */
-        .quad 0xBF562FDBA3FB6C6C  /* A02 = -1.354183666925102939860e-03 */
-        .quad 0xBFD4CA85F1B93DB2  /* A03 = -3.248610363561638125773e-01 */
-        .quad 0xBEC269D4036A207E  /* A00 = -2.195047297096822741730e-06 */
-        .quad 0x3FF00066E7DA6E4E  /* A01 = +1.000098138500919997540e+00 */
-        .quad 0xBF5991499FC36B3A  /* A02 = -1.560518167983372759405e-03 */
-        .quad 0xBFD4BCC9A72283D6  /* A03 = -3.240226871658341556426e-01 */
-        .quad 0xBEC7154B6C09CFE1  /* A00 = -2.751729738565190291276e-06 */
-        .quad 0x3FF0007B47086B80  /* A01 = +1.000117566559055148900e+00 */
-        .quad 0xBF5D455433B4F8F4  /* A02 = -1.786548832412968197680e-03 */
-        .quad 0xBFD4AE6CC1BFE145  /* A03 = -3.231460468373550942722e-01 */
-        .quad 0xBECCA68CC64A0F8A  /* A00 = -3.415415948561670285790e-06 */
-        .quad 0x3FF00092827742F7  /* A01 = +1.000139722473418535387e+00 */
-        .quad 0xBF60A7BF15A527AF  /* A02 = -2.033112728132522705610e-03 */
-        .quad 0xBFD49F703214084C  /* A03 = -3.222313393636155876010e-01 */
-        .quad 0xBED19E68676B241B  /* A00 = -4.200644630977303616698e-06 */
-        .quad 0x3FF000ACDA037B26  /* A01 = +1.000164844146362863597e+00 */
-        .quad 0xBF62D99F836A02F8  /* A02 = -2.301036405072284102280e-03 */
-        .quad 0xBFD48FD4F2B91B28  /* A03 = -3.212787981359945810311e-01 */
-        .quad 0xBED57CF4B0C7AA54  /* A00 = -5.123164339408145209103e-06 */
-        .quad 0x3FF000CA8FD9E1A1  /* A01 = +1.000193178099017865534e+00 */
-        .quad 0xBF653A014548E686  /* A02 = -2.591135484433962181405e-03 */
-        .quad 0xBFD47F9C0844B38F  /* A03 = -3.202886658426046806447e-01 */
-        .quad 0xBEDA012B1B1A41E2  /* A00 = -6.199971197454598722328e-06 */
-        .quad 0x3FF000EBE868FDF4  /* A01 = +1.000224979259539459520e+00 */
-        .quad 0xBF67CA9427E0A544  /* A02 = -2.904214255086275467410e-03 */
-        .quad 0xBFD46EC6812ADB37  /* A03 = -3.192611943626845749655e-01 */
-        .quad 0xBEDF3EAC5BF12194  /* A00 = -7.449344990702664567927e-06 */
-        .quad 0x3FF001112A520784  /* A01 = +1.000260510744255704196e+00 */
-        .quad 0xBF6A8D01ABDA4DC4  /* A02 = -3.241065277345108255891e-03 */
-        .quad 0xBFD45D55759FFA4A  /* A03 = -3.181966446572103146551e-01 */
-        .quad 0xBEE2A541BC274267  /* A00 = -8.890883582164319970972e-06 */
-        .quad 0x3FF0013A9E5961F2  /* A01 = +1.000300043631906721231e+00 */
-        .quad 0xBF6D82ECD080C540  /* A02 = -3.602468994380686462264e-03 */
-        .quad 0xBFD44B4A0779C0AD  /* A03 = -3.170952866557950611259e-01 */
-        .quad 0xBEE61D97609A27F4  /* A00 = -1.054553560499505625520e-05 */
-        .quad 0x3FF001688F56A3AF  /* A01 = +1.000343856731187974773e+00 */
-        .quad 0xBF7056F8EFB683EC  /* A02 = -3.989193351487490407647e-03 */
-        .quad 0xBFD438A5620F0F74  /* A03 = -3.159573991399533543500e-01 */
-        .quad 0xBEEA145429EDD370  /* A00 = -1.243563138839952927732e-05 */
-        .quad 0x3FF0019B4A242A67  /* A01 = +1.000392236341804297339e+00 */
-        .quad 0xBF7207D31CA78D9B  /* A02 = -4.401993423445739288258e-03 */
-        .quad 0xBFD42568BA16E7CD  /* A03 = -3.147832696228050619602e-01 */
-        .quad 0xBEEE96370D52680F  /* A00 = -1.458491207477835326165e-05 */
-        .quad 0x3FF001D31D8E4115  /* A01 = +1.000445476009251821736e+00 */
-        .quad 0xBF73D4CC11EDC094  /* A02 = -4.841611050196221316400e-03 */
-        .quad 0xBFD411954D8664E7  /* A03 = -3.135731942252974469021e-01 */
-        .quad 0xBEF338C046215EF8  /* A00 = -1.833122622260562810219e-05 */
-        .quad 0x3FF00230C32C2EC1  /* A01 = +1.000534784691737621998e+00 */
-        .quad 0xBF76BD019BCC5DAF  /* A02 = -5.551344188254799492943e-03 */
-        .quad 0xBFD3F2C7156DC21E  /* A03 = -3.116929730668135389848e-01 */
-        .quad 0xBEF9B15EAE411EAE  /* A00 = -2.450261207822986676092e-05 */
-        .quad 0x3FF002C2DF057A4D  /* A01 = +1.000674124886830940184e+00 */
-        .quad 0xBF7B08CCD9AC1E30  /* A02 = -6.600189396301511801646e-03 */
-        .quad 0xBFD3C7A7A114FED8  /* A03 = -3.090609620157755976777e-01 */
-        .quad 0xBF00E36483C373B3  /* A00 = -3.221178528332122595812e-05 */
-        .quad 0x3FF0036F419480D7  /* A01 = +1.000838524028997644777e+00 */
-        .quad 0xBF7FD255D1777007  /* A02 = -7.768950679260206403087e-03 */
-        .quad 0xBFD39A453911D6CE  /* A03 = -3.062909180947429588215e-01 */
-        .quad 0xBF05DFA04DD12059  /* A00 = -4.172046622180685472624e-05 */
-        .quad 0x3FF00438B2A03D8D  /* A01 = +1.001030633695197069599e+00 */
-        .quad 0xBF828F8DBB4A9D10  /* A02 = -9.062869337255224921890e-03 */
-        .quad 0xBFD36AAB704697D9  /* A03 = -3.033856007044711255993e-01 */
-        .quad 0xBF0BF3E0C647DEFB  /* A00 = -5.331544597092331081714e-05 */
-        .quad 0x3FF005221063D36D  /* A01 = +1.001253189109060359741e+00 */
-        .quad 0xBF857A2CB3C96102  /* A02 = -1.048693584122917590862e-02 */
-        .quad 0xBFD338E65BBB4FEC  /* A03 = -3.003478904549854444639e-01 */
-        .quad 0xBF11A506ED7C9D31  /* A00 = -6.730894835681591541979e-05 */
-        .quad 0x3FF0062E4D0EA92A  /* A01 = +1.001508999829250345925e+00 */
-        .quad 0xBF88AB82C2761AF3  /* A02 = -1.204588085125866091241e-02 */
-        .quad 0xBFD305028D6BD206  /* A03 = -2.971807843271395688234e-01 */
-        .quad 0xBF1607C0922D9BF1  /* A00 = -8.403885708006799337092e-05 */
-        .quad 0x3FF007606C341961  /* A01 = +1.001800940198869449560e+00 */
-        .quad 0xBF8C25E6DA487BCF  /* A02 = -1.374416688582682892494e-02 */
-        .quad 0xBFD2CF0D0EE8F7B5  /* A03 = -2.938873906713255768075e-01 */
-        .quad 0xBF1B3A8480A0A16D  /* A00 = -1.038688061788578038307e-04 */
-        .quad 0x3FF008BB802D02D6  /* A01 = +1.002131939589323561535e+00 */
-        .quad 0xBF8FEB8AE99FD100  /* A02 = -1.558598065819483124983e-02 */
-        .quad 0xBFD297135BD0911B  /* A03 = -2.904709240558688843059e-01 */
-        .quad 0xBF20ABB9BDB75C65  /* A00 = -1.271881327357976163798e-04 */
-        .quad 0x3FF00A42A76D8CD1  /* A01 = +1.002504972472525901495e+00 */
-        .quad 0xBF91FF3D752BB9E6  /* A02 = -1.757522609380570560722e-02 */
-        .quad 0xBFD25D235C1F88B4  /* A03 = -2.869346999779154305799e-01 */
-        .quad 0xBF243D3254425461  /* A00 = -1.544116913733432829448e-04 */
-        .quad 0x3FF00BF909D1795E  /* A01 = +1.002923048355647051011e+00 */
-        .quad 0xBF94304E04D44942  /* A02 = -1.971551804042204897316e-02 */
-        .quad 0xBFD2214B5E61CFA6  /* A03 = -2.832821294498394371075e-01 */
-        .quad 0xBF286070011B61CE  /* A00 = -1.859795307186510085994e-04 */
-        .quad 0x3FF00DE1D5E1627E  /* A01 = +1.003389201612804537689e+00 */
-        .quad 0xBF9689D5F4163F59  /* A02 = -2.201017668045266231780e-02 */
-        .quad 0xBFD1E39A11C3B42C  /* A03 = -2.795167134743816728104e-01 */
-        .quad 0xBF2D250B366A79E8  /* A00 = -2.223564326486314902259e-04 */
-        .quad 0x3FF010003E134001  /* A01 = +1.003906481248123094829e+00 */
-        .quad 0xBF990C9FF91F6F81  /* A02 = -2.446222265267250853271e-02 */
-        .quad 0xBFD1A41E80084CDC  /* A03 = -2.756420374218586655246e-01 */
-        .quad 0xBF314DB5DDC2A30E  /* A00 = -2.640313157465248123865e-04 */
-        .quad 0x3FF012577608921B  /* A01 = +1.004477940624503018441e+00 */
-        .quad 0xBF9BB9626875B0C9  /* A02 = -2.707437288829409385849e-02 */
-        .quad 0xBFD162E80768A9D0  /* A03 = -2.716617653228725615122e-01 */
-        .quad 0xBF346A6133808864  /* A00 = -3.115165050094957730625e-04 */
-        .quad 0x3FF014EAAFCC88A3  /* A01 = +1.005106627192198898157e+00 */
-        .quad 0xBF9E90BEF9BF7419  /* A02 = -2.984903716411588595059e-02 */
-        .quad 0xBFD12006545F7FAD  /* A03 = -2.675796340899932457269e-01 */
-        .quad 0xBF37F180DC3848EA  /* A00 = -3.653468704395550778821e-04 */
-        .quad 0x3FF017BD19147861  /* A01 = +1.005795572250939295955e+00 */
-        .quad 0xBFA0C9A14C702E07  /* A02 = -3.278831537326359207851e-02 */
-        .quad 0xBFD0DB895B650092  /* A03 = -2.633994476818851682154e-01 */
-        .quad 0xBF3BEC6AAC6D7635  /* A00 = -4.260788377246944457107e-04 */
-        .quad 0x3FF01AD1D884E719  /* A01 = +1.006547780778822565040e+00 */
-        .quad 0xBFA260B2A1B1434A  /* A02 = -3.589399551186163439542e-02 */
-        .quad 0xBFD09581529E93D6  /* A03 = -2.591250712233067465817e-01 */
-        .quad 0xBF4164E26167882B  /* A00 = -5.308251737086202562063e-04 */
-        .quad 0x3FF01FEF14B62B81  /* A01 = +1.007796364693348545316e+00 */
-        .quad 0xBFA4EB014538AA42  /* A02 = -4.085544557559163403315e-02 */
-        .quad 0xBFD029D36FEAF41F  /* A03 = -2.525528519580024222613e-01 */
-        .quad 0xBF46F6FFF4E53DC8  /* A00 = -7.008313930700277652464e-04 */
-        .quad 0x3FF027CBB51CBBA0  /* A01 = +1.009715754956893363214e+00 */
-        .quad 0xBFA89DEC9FEC112E  /* A02 = -4.807986690687680864098e-02 */
-        .quad 0xBFCF2A99464D0DB4  /* A03 = -2.434875100390009317053e-01 */
-        .quad 0xBF4DCC9C4F66A4D9  /* A00 = -9.094012482836712945103e-04 */
-        .quad 0x3FF030E7CFCCD583  /* A01 = +1.011939822882909068014e+00 */
-        .quad 0xBFACAA3B95814081  /* A02 = -5.598627281199331645611e-02 */
-        .quad 0xBFCDF78F156BE7CF  /* A03 = -2.341173987004467604844e-01 */
-        .quad 0xBF5308ED74E5C7A6  /* A00 = -1.161796466103906435435e-03 */
-        .quad 0x3FF03B5986412ECB  /* A01 = +1.014489674026594512313e+00 */
-        .quad 0xBFB087EBA88DCC3F  /* A02 = -6.457398285947223148806e-02 */
-        .quad 0xBFCCBB9BD134862F  /* A03 = -2.244753619680052991736e-01 */
-        .quad 0xBF57FA23C00DF4B5  /* A00 = -1.463446533505758208674e-03 */
-        .quad 0x3FF0473558A1BCC0  /* A01 = +1.017384859292903342975e+00 */
-        .quad 0xBFB2E702BC6360EF  /* A02 = -7.383744334527241048871e-02 */
-        .quad 0xBFCB77D546379288  /* A03 = -2.145945160729250122955e-01 */
-        .quad 0xBF5DD12971557F71  /* A00 = -1.819887610814388068450e-03 */
-        .quad 0x3FF0548DDF5000A8  /* A01 = +1.020643112482540360020e+00 */
-        .quad 0xBFB571B63DA186E1  /* A02 = -8.376635555898871710045e-02 */
-        .quad 0xBFCA2D5202605148  /* A03 = -2.045080672838912594358e-01 */
-        .quad 0xBF6252B1AD5D4F17  /* A00 = -2.236697221556737096709e-03 */
-        .quad 0x3FF063738A910BF7  /* A01 = +1.024280110622155737232e+00 */
-        .quad 0xBFB8270C8E6B601B  /* A02 = -9.434584118878357184013e-02 */
-        .quad 0xBFC8DD27D950A07E  /* A03 = -1.942491351230763441116e-01 */
-        .quad 0xBF66470C91730CFC  /* A00 = -2.719425723258004842786e-03 */
-        .quad 0x3FF073F468FCF331  /* A01 = +1.028309259519300633556e+00 */
-        .quad 0xBFBB05C2952191E4  /* A02 = -1.055566419686964629854e-01 */
-        .quad 0xBFC7886A770DE2BD  /* A03 = -1.838505822486435070662e-01 */
-        .quad 0xBF6AD114AC8E98EC  /* A00 = -3.273525599485007861467e-03 */
-        .quad 0x3FF0861BF53E5226  /* A01 = +1.032741506559554434119e+00 */
-        .quad 0xBFBE0C4F9B461507  /* A02 = -1.173753503881763554650e-01 */
-        .quad 0xBFC6302A037CDE3A  /* A03 = -1.733448521642786954722e-01 */
-        .quad 0xBF6FFBDE2A6C2AF8  /* A00 = -3.904279630096648551207e-03 */
-        .quad 0x3FF099F2EB8E7DA3  /* A01 = +1.037585182326304034106e+00 */
-        .quad 0xBFC09C74D192DDF0  /* A02 = -1.297746680554463516444e-01 */
-        .quad 0xBFC4D571D8E3079F  /* A03 = -1.627638157861470424859e-01 */
-        .quad 0xBF72E8FDC0B952AA  /* A00 = -4.616728994353872309042e-03 */
-        .quad 0x3FF0AF7F273C9533  /* A01 = +1.042845872181101141152e+00 */
-        .quad 0xBFC244C512736F10  /* A02 = -1.427236881344176033792e-01 */
-        .quad 0xBFC379474F58B902  /* A03 = -1.521386277613104298645e-01 */
-        .quad 0xBF762EABAF17395B  /* A00 = -5.415602341101023557701e-03 */
-        .quad 0x3FF0C6C3886F63FB  /* A01 = +1.048526318502125631582e+00 */
-        .quad 0xBFC3FDF9918EA12A  /* A02 = -1.561881981590514389957e-01 */
-        .quad 0xBFC21CA89ECAB895  /* A03 = -1.414995932913753196036e-01 */
-        .quad 0xBF79D387CE5B2BAE  /* A00 = -6.305246822828998107258e-03 */
-        .quad 0x3FF0DFBFE2346376  /* A01 = +1.054626353847394337748e+00 */
-        .quad 0xBFC5C6DA43602620  /* A02 = -1.701309994680721970894e-01 */
-        .quad 0xBFC0C08BD8DB6631  /* A03 = -1.308760460731704100557e-01 */
-        .quad 0xBF7DDBA8E8DA9060  /* A00 = -7.289562037531366334164e-03 */
-        .quad 0x3FF0FA70F0D1B464  /* A01 = +1.061142864894713433443e+00 */
-        .quad 0xBFC79E18D92BAA7C  /* A02 = -1.845122394946264732241e-01 */
-        .quad 0xBFBECBBBF74C2669  /* A03 = -1.202962378266875381749e-01 */
-        .quad 0xBF81254E76EA25DA  /* A00 = -8.371937755572145950511e-03 */
-        .quad 0x3FF116D05835EBD0  /* A01 = +1.068069786618014660462e+00 */
-        .quad 0xBFC982539E2ED224  /* A02 = -1.992897531869327609755e-01 */
-        .quad 0xBFBC1B043C350159  /* A03 = -1.097872397413132278254e-01 */
-        .quad 0xBF8391ACBA863403  /* A00 = -9.555196230190082448686e-03 */
-        .quad 0x3FF134D4AA477FE2  /* A01 = +1.075398125794884141015e+00 */
-        .quad 0xBFCB7218609FEAFB  /* A02 = -2.144194099235717521079e-01 */
-        .quad 0xBFB970A16CB88329  /* A03 = -9.937485603633135211599e-02 */
-        .quad 0xBF87935088E48E8B  /* A00 = -1.151144902957603431692e-02 */
-        .quad 0x3FF1649892AD7DD3  /* A01 = +1.087059567413110938716e+00 */
-        .quad 0xBFCE6971DDE75409  /* A02 = -2.375929196847723912089e-01 */
-        .quad 0xBFB58291E88CB251  /* A03 = -8.402358939628952472223e-02 */
-        .quad 0xBF8DB3A62C325325  /* A00 = -1.450280973794233242702e-02 */
-        .quad 0x3FF1A9C900C6DEEA  /* A01 = +1.103951457056548068891e+00 */
-        .quad 0xBFD13DBC65B0E08E  /* A02 = -2.693930619311765140012e-01 */
-        .quad 0xBFB06696F62696D1  /* A03 = -6.406539449252625362252e-02 */
-        .quad 0xBF92583699F2E27A  /* A00 = -1.791463198307716858659e-02 */
-        .quad 0x3FF1F451B85AA9F0  /* A01 = +1.122148246892376022288e+00 */
-        .quad 0xBFD34FD5F8288180  /* A02 = -3.017477916164565954205e-01 */
-        .quad 0xBFA6FB692825B683  /* A03 = -4.488686194495718900788e-02 */
-        .quad 0xBF9641C26E673D6F  /* A00 = -2.173522757385398448959e-02 */
-        .quad 0x3FF24364DA5E2B07  /* A01 = +1.141453602790251542487e+00 */
-        .quad 0xBFD564A5A5EF5890  /* A02 = -3.342680092295120530821e-01 */
-        .quad 0xBF9B43712011A982  /* A03 = -2.662445791467283467968e-02 */
-        .quad 0xBF9A901038EC2F39  /* A00 = -2.594018313816024226548e-02 */
-        .quad 0x3FF2961356DFFEBA  /* A01 = +1.161639537196534011088e+00 */
-        .quad 0xBFD775EBB17198C7  /* A02 = -3.665723069046972759644e-01 */
-        .quad 0xBF833B1A926CD462  /* A03 = -9.390075295963199591975e-03 */
-        .quad 0xBF9F396A6A461B91  /* A00 = -3.049246095317987084727e-02 */
-        .quad 0x3FF2EB53BAEF534B  /* A01 = +1.182452898229899629357e+00 */
-        .quad 0xBFD97DABF8AD8BBD  /* A02 = -3.982953957076310058660e-01 */
-        .quad 0x3F7B8F6A3E0F8837  /* A03 = +6.728568086119371925713e-03 */
-        .quad 0xBFA21878590F8BAA  /* A00 = -3.534294211546946951064e-02 */
-        .quad 0x3FF34209790236E1  /* A01 = +1.203622315111197105253e+00 */
-        .quad 0xBFDB764C0E71BECB  /* A02 = -4.290952817018306997277e-01 */
-        .quad 0x3F962FE0C03F84C0  /* A03 = +2.166701482190513949888e-02 */
-        .quad 0xBFA4B36B9AD27ECC  /* A00 = -4.043136849327097492868e-02 */
-        .quad 0x3FF3990C5B12FC16  /* A01 = +1.224865298994477935679e+00 */
-        .quad 0xBFDD5AABB0D01390  /* A02 = -4.586590983092770912322e-01 */
-        .quad 0x3FA21DAF5CA162DB  /* A03 = +3.538272863142363083844e-02 */
-        .quad 0xBFA7645E4D7BF28B  /* A00 = -4.568762489177399105378e-02 */
-        .quad 0x3FF3EF2FD51C0D9F  /* A01 = +1.245895225962932562069e+00 */
-        .quad 0xBFDF26377E1B686E  /* A02 = -4.867075664057044503963e-01 */
-        .quad 0x3FA8803E756EE812  /* A03 = +4.785342391501513914509e-02 */
-        .quad 0xBFAA210925C64413  /* A00 = -5.103329263796054643398e-02 */
-        .quad 0x3FF44349F897D8E7  /* A01 = +1.266427966181760345066e+00 */
-        .quad 0xBFE06A7B02C6D8E2  /* A02 = -5.129981092675530707226e-01 */
-        .quad 0x3FAE3F194734F5D0  /* A03 = +5.907515520309980505687e-02 */
-        .quad 0xBFACDE48F8A19BBB  /* A00 = -5.638340029764018351832e-02 */
-        .quad 0x3FF49439D5466582  /* A01 = +1.286187966447272845727e+00 */
-        .quad 0xBFE131C7C1063DDC  /* A02 = -5.373266954429101183166e-01 */
-        .quad 0x3FB1ADEEC36AD805  /* A03 = +6.906025191241844940482e-02 */
-        .quad 0xBFAF905D8F585680  /* A00 = -6.164829611604449866036e-02 */
-        .quad 0x3FF4E0ED1FD27F99  /* A01 = +1.304913639360142818546e+00 */
-        .quad 0xBFE1E7A859DC1D3D  /* A02 = -5.595285182070380836095e-01 */
-        .quad 0x3FB3ED018E4642A1  /* A03 = +7.783517573831001679086e-02 */
-        .quad 0xBFB11595104160BA  /* A00 = -6.673556944713512906198e-02 */
-        .quad 0x3FF528650340490B  /* A01 = +1.322361958217302513319e+00 */
-        .quad 0xBFE28B14B40BC974  /* A02 = -5.794776455425521000109e-01 */
-        .quad 0x3FB5DF49F5BAF6D7  /* A03 = +8.543836831355676453281e-02 */
-        .quad 0xBFB2513A97344BA4  /* A00 = -7.155195418844911836587e-02 */
-        .quad 0x3FF569BA0DB5EE14  /* A01 = +1.338312200124055273420e+00 */
-        .quad 0xBFE31B53A8B67B20  /* A02 = -5.970857901737396389308e-01 */
-        .quad 0x3FB787F297BB0544  /* A03 = +9.191814617499455275507e-02 */
-        .quad 0xBFB37512E848FAFA  /* A00 = -7.600515528700305112331e-02 */
-        .quad 0x3FF5A41F33B403C8  /* A01 = +1.352568819013173495591e+00 */
-        .quad 0xBFE397F6EA9A58A5  /* A02 = -6.123003561103997904880e-01 */
-        .quad 0x3FB8EAA9FF25CA06  /* A03 = +9.733068923177520814782e-02 */
-        .quad 0xBFB47B3E603AFC5D  /* A00 = -8.000554894805263217439e-02 */
-        .quad 0x3FF5D6E3EDE40487  /* A01 = +1.364963464031718975988e+00 */
-        .quad 0xBFE400D5BCA6D631  /* A02 = -6.251019177058819709103e-01 */
-        .quad 0x3FBA0B830ED567FE  /* A03 = +1.017381583418739132707e-01 */
-        .quad 0xBFB5BBFE8AC90496  /* A00 = -8.489981544791400103200e-02 */
-        .quad 0x3FF612BA70107E95  /* A01 = +1.379572332145390989311e+00 */
-        .quad 0xBFE477EAF1FA7693  /* A02 = -6.396383978023599814478e-01 */
-        .quad 0x3FBB4784B7C08A95  /* A03 = +1.065600346196709652391e-01 */
-        .quad 0xBFB6D5D940743939  /* A00 = -8.920057128509463473254e-02 */
-        .quad 0x3FF644A8748F70CE  /* A01 = +1.391762214006166953340e+00 */
-        .quad 0xBFE4D646AB07EA37  /* A02 = -6.511567440459832267763e-01 */
-        .quad 0x3FBC354F4E1D5292  /* A03 = +1.101884427747086558913e-01 */
-        .quad 0xBFB7223D19E4F3D1  /* A00 = -9.036619074045339206069e-02 */
-        .quad 0x3FF6518FEB42B7FA  /* A01 = +1.394912642466350494175e+00 */
-        .quad 0xBFE4ED86CB87498C  /* A02 = -6.539949393430091184598e-01 */
-        .quad 0x3FBC6D29F28CCA9B  /* A03 = +1.110407082713131127205e-01 */
-        .quad 0xBFB6878652FF6312  /* A00 = -8.800544287022329936754e-02 */
-        .quad 0x3FF63948C302D040  /* A01 = +1.388985406648330922508e+00 */
-        .quad 0xBFE4C4E2E7904E17  /* A02 = -6.490339777687407218920e-01 */
-        .quad 0x3FBC127356CA1ABE  /* A03 = +1.096565329445224612481e-01 */
-        .quad 0xBFB4F5D18B0C91D6  /* A00 = -8.187589306596207427980e-02 */
-        .quad 0x3FF5FD27EB7DD0B8  /* A01 = +1.374305648697413673176e+00 */
-        .quad 0xBFE464E01A2B2FC6  /* A02 = -6.373138915164353601739e-01 */
-        .quad 0x3FBB460547674A30  /* A03 = +1.065371798825160976065e-01 */
-        .quad 0xBFB26642FA16A685  /* A00 = -7.187288861919156890412e-02 */
-        .quad 0x3FF59F9BEDE1C95A  /* A01 = +1.351467065073470141812e+00 */
-        .quad 0xBFE3D67920C8FBEA  /* A02 = -6.199308052381387046381e-01 */
-        .quad 0x3FBA24F6A8D3CBC1  /* A03 = +1.021265184570401413078e-01 */
-        .quad 0xBFADB5294794F097  /* A00 = -5.802277563859197656582e-02 */
-        .quad 0x3FF523EA7B9CF453  /* A01 = +1.321268542159732772845e+00 */
-        .quad 0xBFE322A8B55E35DB  /* A02 = -5.979808370918208160205e-01 */
-        .quad 0x3FB8C8673B1B3E37  /* A03 = +9.680791085269722928697e-02 */
-        .quad 0xBFA4B7D661965C6A  /* A00 = -4.046506825687219699450e-02 */
-        .quad 0x3FF48DE3E2CE3122  /* A01 = +1.284641157110919085227e+00 */
-        .quad 0xBFE251FED1A7F445  /* A02 = -5.725092024655472622285e-01 */
-        .quad 0x3FB745699FCABDB9  /* A03 = +9.090290213747821701507e-02 */
-        .quad 0xBF93E60456E4EE1D  /* A00 = -1.943213253365004902773e-02 */
-        .quad 0x3FF3E1A14E628A59  /* A01 = +1.242585474196536532432e+00 */
-        .quad 0xBFE16C5AB660E876  /* A02 = -5.444768488007543094653e-01 */
-        .quad 0x3FB5AD33AA8C188F  /* A03 = +8.467410005332197397987e-02 */
-        .quad 0x3F738C17C47C7961  /* A00 = +4.772274820224659853951e-03 */
-        .quad 0x3FF3234DDE3BD146  /* A01 = +1.196119182682268355933e+00 */
-        .quad 0xBFE078C0D77A9D3B  /* A02 = -5.147403915952176722826e-01 */
-        .quad 0x3FB40D74B3E276B8  /* A03 = +7.833032027925923568290e-02 */
-        .quad 0x3FA0474BECC689C7  /* A00 = +3.179394975019849550746e-02 */
-        .quad 0x3FF256FB4FA7D18A  /* A01 = +1.146235762743432307076e+00 */
-        .quad 0xBFDEFA8E3FB285E2  /* A02 = -4.840427038235174395098e-01 */
-        .quad 0x3FB270C007493D59  /* A03 = +7.203293016322244446403e-02 */
-        .quad 0x3FAF5BD51E479BDC  /* A00 = +6.124750132203590768931e-02 */
-        .quad 0x3FF18081D0B53BC5  /* A01 = +1.093873801484492647162e+00 */
-        .quad 0xBFDCFE2439BD0C03  /* A02 = -4.530115665294831006626e-01 */
-        .quad 0x3FB0DEFE5A45AFDD  /* A03 = +6.590261176978580437424e-02 */
-        .quad 0x3FB7BD5D2806EA26  /* A00 = +9.273321368429118805032e-02 */
-        .quad 0x3FF0A369E35B4440  /* A01 = +1.039895904647224256223e+00 */
-        .quad 0xBFDB04BC5C9951E7  /* A02 = -4.221640495573226181669e-01 */
-        .quad 0x3FAEBBBAA9D6DEEF  /* A03 = +6.002600978120919278380e-02 */
-        .quad 0x3FC01BE411098DBC  /* A00 = +1.258511622610124502941e-01 */
-        .quad 0x3FEF85BDABC031C1  /* A01 = +9.850757936961188621083e-01 */
-        .quad 0xBFD91521375097C2  /* A02 = -3.919146576102968682065e-01 */
-        .quad 0x3FABE26F0086D982  /* A03 = +5.446192628317005068883e-02 */
-        .quad 0x3FC481D7FF5776B9  /* A00 = +1.602125164781023347604e-01 */
-        .quad 0x3FEDC3506C1E7218  /* A01 = +9.300920592973538347792e-01 */
-        .quad 0xBFD7349A88DA7D4F  /* A02 = -3.625856720409119104964e-01 */
-        .quad 0x3FA936E2DFF8E2AE  /* A03 = +4.924687370334389358018e-02 */
-        .quad 0x3FC90471F96FA27A  /* A00 = +1.954481571149420671141e-01 */
-        .quad 0x3FEC0451601987A2  /* A01 = +8.755270840595026360376e-01 */
-        .quad 0xBFD5671CD4B898DC  /* A02 = -3.344184949259110251063e-01 */
-        .quad 0x3FA6BB9594603B67  /* A03 = +4.439990459660841243261e-02 */
-        .quad 0x3FCFD8ADB9ED944C  /* A00 = +2.488000066615846384011e-01 */
-        .quad 0x3FE978C073F6809A  /* A01 = +7.959902062321078108909e-01 */
-        .quad 0xBFD2DF7E00BCD5A9  /* A02 = -2.948908812716931060471e-01 */
-        .quad 0x3FA3614033D490B2  /* A03 = +3.785133965200894456959e-02 */
-        .quad 0x3FD4846A12AFE5A0  /* A00 = +3.205819303981005674586e-01 */
-        .quad 0x3FE63A1147D40472  /* A01 = +6.945883181471244061100e-01 */
-        .quad 0xBFCFA2268AD34450  /* A02 = -2.471359422548027318101e-01 */
-        .quad 0x3F9F150201D9FFE0  /* A03 = +3.035357605267552383310e-02 */
-        .quad 0x3FD9018641F82BEB  /* A00 = +3.907180446846598154131e-01 */
-        .quad 0x3FE33B7C220FFBDC  /* A01 = +6.010113396913498995389e-01 */
-        .quad 0xBFCA4E4187E29C86  /* A02 = -2.055131829740483584423e-01 */
-        .quad 0x3F98C30CED19F8F4  /* A03 = +2.418155858185229434287e-02 */
-        .quad 0x3FDD4B8255BEB078  /* A00 = +4.577337109901757905561e-01 */
-        .quad 0x3FE0858B19D3A49B  /* A01 = +5.163016800335243905451e-01 */
-        .quad 0xBFC5BC929EACE564  /* A02 = -1.698172831327539045176e-01 */
-        .quad 0x3F93A083CE57DE2B  /* A03 = +1.916700312537337677621e-02 */
-        .quad 0x3FE0A8E5E039295C  /* A00 = +5.206174258576470315063e-01 */
-        .quad 0x3FDC35E1234583FE  /* A01 = +4.407885403107342225937e-01 */
-        .quad 0xBFC1DE034E31AEB9  /* A02 = -1.395877963835710222629e-01 */
-        .quad 0x3F8EFDEBB3471BDC  /* A03 = +1.513275280821162888101e-02 */
-        .quad 0x3FE2851B603CB2A5  /* A00 = +5.787484054213406503564e-01 */
-        .quad 0x3FD7F4A44ABBB286  /* A01 = +3.743067483726821853551e-01 */
-        .quad 0xBFBD3EEB67087DE7  /* A02 = -1.142413260026767657385e-01 */
-        .quad 0x3F8864F38329E8BD  /* A03 = +1.191129917173260922836e-02 */
-        .quad 0x3FE437DBE3C34AC1  /* A00 = +6.318187187665317283702e-01 */
-        .quad 0x3FD43F6F789441B5  /* A01 = +3.163717916040938438194e-01 */
-        .quad 0xBFB7D92E7901B9A4  /* A02 = -9.315767721429907277653e-02 */
-        .quad 0x3F8327ED342308E1  /* A03 = +9.353497651663324544136e-03 */
-        .quad 0x3FE5C0977766D55C  /* A00 = +6.797597248138731451661e-01 */
-        .quad 0x3FD10B42A764D8F9  /* A01 = +2.663122782427219115142e-01 */
-        .quad 0xBFB3633351D3D70F  /* A02 = -7.573242900602060456716e-02 */
-        .quad 0x3F7E079E30FF899C  /* A03 = +7.331483779099558922843e-03 */
-        .quad 0x3FE7202CE08A88C4  /* A00 = +7.226776490754436288455e-01 */
-        .quad 0x3FCC973EB5662B01  /* A01 = +2.233656297433626314319e-01 */
-        .quad 0xBFAF70A455F9920B  /* A02 = -6.140626477716545211782e-02 */
-        .quad 0x3F77812411CE99B6  /* A03 = +5.738392731393584730859e-03 */
-        .quad 0x3FE85879424095B1  /* A00 = +7.608000082006382003286e-01 */
-        .quad 0x3FC7E73BD1674D84  /* A01 = +1.867441914060742336190e-01 */
-        .quad 0xBFA96F84E4BF333B  /* A02 = -4.967894832916504993525e-02 */
-        .quad 0x3F72606DDCA6E117  /* A03 = +4.486493251924870105662e-03 */
-        .quad 0x3FE96BFE4957F4DD  /* A00 = +7.944327766887472330737e-01 */
-        .quad 0x3FC3ED4780D25478  /* A01 = +1.556786898624158421711e-01 */
-        .quad 0xBFA489C5F9A56B58  /* A02 = -4.011362717093075458408e-02 */
-        .quad 0x3F6CB5DC17E9AD2A  /* A03 = +3.504686231556104931972e-03 */
-        .quad 0x3FEA5D9CB2F41234  /* A00 = +8.239272589858672724006e-01 */
-        .quad 0x3FC091A758374DCF  /* A01 = +1.294449978582705440555e-01 */
-        .quad 0xBFA08E436D4B5CE0  /* A02 = -3.233538350257858517978e-02 */
-        .quad 0x3F666997AD53E6B7  /* A03 = +2.735897297154145629133e-03 */
-        .quad 0x3FEB3060342CB850  /* A00 = +8.496552485501158713532e-01 */
-        .quad 0x3FBB7D30BBC7DC1B  /* A01 = +1.073790033768634993860e-01 */
-        .quad 0xBF9AA6BA3443D9E3  /* A02 = -2.602663940430173170060e-02 */
-        .quad 0x3F617CA764B7850B  /* A03 = +2.134634914668814050648e-03 */
-        .quad 0x3FEBE759A6A0C7B8  /* A00 = +8.719909910635044170135e-01 */
-        .quad 0x3FB6C10DE6A703FF  /* A01 = +8.888327485239243264115e-02 */
-        .quad 0xBF956C566D8BE1F6  /* A02 = -2.092108768099084498138e-02 */
-        .quad 0x3F5B46D1A4A59CF8  /* A03 = +1.664833764687232917079e-03 */
-        .quad 0x3FEC858494887A04  /* A00 = +8.912985707318630268503e-01 */
-        .quad 0x3FB2CC31F543394D  /* A01 = +7.342827070099140762682e-02 */
-        .quad 0xBF9133477FF69137  /* A02 = -1.679717749142747504343e-02 */
-        .quad 0x3F5544482FBB4DA5  /* A03 = +1.298017973501022466823e-03 */
-        .quad 0x3FED0DB59D0E32E9  /* A00 = +9.079235141267335551518e-01 */
-        .quad 0x3FAF006BAFFC6EF4  /* A01 = +6.055008433597022787787e-02 */
-        .quad 0xBF8B97146FA2B97A  /* A02 = -1.347175565419144252499e-02 */
-        .quad 0x3F5093B01F4CDC69  /* A03 = +1.011774057770665211434e-03 */
-        .quad 0x3FEDB487C3EC457C  /* A00 = +9.282873942012623835751e-01 */
-        .quad 0x3FA7390C09D0BD1D  /* A01 = +4.535710925881118044112e-02 */
-        .quad 0xBF83D9F7C3181106  /* A02 = -9.693084374710735778846e-03 */
-        .quad 0x3F46E34A0A3C0E64  /* A03 = +6.984817050299072134500e-04 */
-        .quad 0x3FEE5FFCB4E6EB00  /* A00 = +9.492171796076434020506e-01 */
-        .quad 0x3F9F4913ED00AADF  /* A01 = +3.055220731782070861526e-02 */
-        .quad 0xBF79670BD0E59B5C  /* A02 = -6.201788097633133961528e-03 */
-        .quad 0x3F3BC998EBCAF96D  /* A03 = +4.240034429975534616304e-04 */
-        .quad 0x3FEEDBA41E9542FE  /* A00 = +9.643116566968215064293e-01 */
-        .quad 0x3F94F5DD18D9C24D  /* A01 = +2.046914543319848858727e-02 */
-        .quad 0xBF7034896AA122B9  /* A02 = -3.956352980886528904192e-03 */
-        .quad 0x3F30DCCB47810B39  /* A03 = +2.573009765038273091199e-04 */
-        .quad 0x3FEF33F2882520ED  /* A00 = +9.750912341196716903724e-01 */
-        .quad 0x3F8BF37F2CF553FF  /* A01 = +1.364802699996836392315e-02 */
-        .quad 0xBF649F6F05A69619  /* A02 = -2.517430152880317534986e-03 */
-        .quad 0x3F247623C950AAC9  /* A03 = +1.561087307505231250044e-04 */
-        .quad 0x3FEF727757751741  /* A00 = +9.827229221489021115943e-01 */
-        .quad 0x3F828E67912C4400  /* A01 = +9.060677640748693306705e-03 */
-        .quad 0xBF5A2F51A806CC2C  /* A02 = -1.598195784123355826789e-03 */
-        .quad 0x3F18D35D7687E613  /* A03 = +9.470231965016282719549e-05 */
-        .quad 0x3FEF9E6325C5942A  /* A00 = +9.880843866091073568469e-01 */
-        .quad 0x3F788AB117618F76  /* A01 = +5.991641772286606867914e-03 */
-        .quad 0xBF5096EAB0B1EA89  /* A02 = -1.012543859160305046233e-03 */
-        .quad 0x3F0E1E50EC4435AB  /* A03 = +5.744633156910412119652e-05 */
-        .quad 0x3FEFBD0784049369  /* A00 = +9.918248728250605994461e-01 */
-        .quad 0x3F702BBD8294035F  /* A01 = +3.947963975634432264028e-03 */
-        .quad 0xBF44FB55E0F00593  /* A02 = -6.403130845457509273330e-04 */
-        .quad 0x3F0244DCD723230A  /* A03 = +3.484534217219031730379e-05 */
-        .quad 0x3FEFD245E2366A43  /* A00 = +9.944180887426415926811e-01 */
-        .quad 0x3F653D82EC088433  /* A01 = +2.592807490387838333795e-03 */
-        .quad 0xBF3A7DF75E013CB8  /* A02 = -4.042366908878036561859e-04 */
-        .quad 0x3EF6298E69F991CD  /* A03 = +2.113564425911141559972e-05 */
-        .quad 0x3FEFE0EAA508BC69  /* A00 = +9.962056372950317539861e-01 */
-        .quad 0x3F5BD0771AF3FDDA  /* A01 = +1.697651208644282514598e-03 */
-        .quad 0xBF30B2E1254DE571  /* A02 = -2.548026725928887099328e-04 */
-        .quad 0x3EEAE28B70EC0256  /* A03 = +1.281973848454955042307e-05 */
-        .quad 0x3FEFEAF5303D7F96  /* A00 = +9.974313680831865536192e-01 */
-        .quad 0x3F5229111365657E  /* A01 = +1.108423877289460134782e-03 */
-        .quad 0xBF250572D04DFE66  /* A02 = -1.603796628408704519168e-04 */
-        .quad 0x3EE04E89BB57C981  /* A03 = +7.775682983689149966743e-06 */
-        .quad 0x3FEFF1CF52F1CF44  /* A00 = +9.982678051005469122003e-01 */
-        .quad 0x3F47A71316147CEB  /* A01 = +7.218211359577819110842e-04 */
-        .quad 0xBF1A6D7604055719  /* A02 = -1.008132248946049582547e-04 */
-        .quad 0x3ED3C8047586A85C  /* A03 = +4.716233739913014633626e-06 */
-        .quad 0x3FEFF6770369EF69  /* A00 = +9.988360468555416149528e-01 */
-        .quad 0x3F3EBB261180FBF0  /* A01 = +4.689186039321105101130e-04 */
-        .quad 0xBF1097754FE19D7F  /* A02 = -6.329206004950480057066e-05 */
-        .quad 0x3EC7FEFF83BCA0A7  /* A03 = +2.860556404988488738366e-06 */
-        .quad 0x3FEFF99D42371AC4  /* A00 = +9.992204945818561334647e-01 */
-        .quad 0x3F33EB2AEC271F59  /* A01 = +3.039340773764907474054e-04 */
-        .quad 0xBF04CF18E0FC0D79  /* A02 = -3.968996690952969588805e-05 */
-        .quad 0x3EBD1BDBD6019BE9  /* A03 = +1.735021065507727833886e-06 */
-        .quad 0x3FEFFBBCA32B0D91  /* A00 = +9.994795977476532700123e-01 */
-        .quad 0x3F29C41E1615110A  /* A01 = +1.965796209707565346710e-04 */
-        .quad 0xBEFA11F93D9DCB5A  /* A02 = -2.486248909101414873235e-05 */
-        .quad 0x3EB1A7CA4546F7A7  /* A03 = +1.052345642723709228769e-06 */
-        .quad 0x3FEFFD298B8E8DE2  /* A00 = +9.996535993308806045121e-01 */
-        .quad 0x3F20A1C42D523C5B  /* A01 = +1.268913244172078754520e-04 */
-        .quad 0xBEF0507A364AFAE4  /* A02 = -1.555859070622834605755e-05 */
-        .quad 0x3EA56ACA17E7CDF4  /* A03 = +6.382806956848098872313e-07 */
-        .quad 0x3FEFFE1DC82BA5A3  /* A00 = +9.997700604991915929176e-01 */
-        .quad 0x3F156E73B90F1769  /* A01 = +8.175450626798714452801e-05 */
-        .quad 0xBEE4663579D0A09F  /* A02 = -9.727122057226747625365e-06 */
-        .quad 0x3E99FAF6FEC5D4C1  /* A03 = +3.871371052824002996020e-07 */
-        .quad 0x3FEFFEF8D0BB5E81  /* A00 = +9.998745037837154514548e-01 */
-        .quad 0x3F06686DA18D39C3  /* A01 = +4.273972098777251447726e-05 */
-        .quad 0xBED46BC298073E90  /* A02 = -4.868731025855742842491e-06 */
-        .quad 0x3E88E42286B9D0FD  /* A03 = +1.854535328530838170114e-07 */
-        .quad 0x3FEFFF8DBC68DDC7  /* A00 = +9.999455146670975791423e-01 */
-        .quad 0x3EF26B2953A80AF0  /* A01 = +1.756534514108903368909e-05 */
-        .quad 0xBEBFC4472D580F83  /* A02 = -1.893443529411295465239e-06 */
-        .quad 0x3E72505B4553D19F  /* A03 = +6.822456673547912277047e-08 */
-        .quad 0x3FEFFFCED1276609  /* A00 = +9.999765477215883935358e-01 */
-        .quad 0x3EDE1A94C7CC58F5  /* A01 = +7.177313020153979672606e-06 */
-        .quad 0xBEA8A2C988744E57  /* A02 = -7.342066660497443762363e-07 */
-        .quad 0x3E5AF30036BBBAF4  /* A03 = +2.509841882843541084885e-08 */
-        .quad 0x3FEFFFEAFE70FCFC  /* A00 = +9.999899835164849370983e-01 */
-        .quad 0x3EC879175E3549F5  /* A01 = +2.917410471128503564412e-06 */
-        .quad 0xBE930E36677D1813  /* A02 = -2.839493400307523115929e-07 */
-        .quad 0x3E43D4005B42D48F  /* A03 = +9.233192745401904898013e-09 */
-        .quad 0x3ff0000000000000
-        .quad 0x0000000000000000
-        .quad 0x0000000000000000
-        .quad 0x0000000000000000
-        .align 32
-        .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000           /* _sSignMask        */
-        .align 32
-        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff           /* _sAbsMask         */
-        .align 32
-        .long 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000, 0x7ff80000           /* _iExpMantMask     */
-        .align 32
-        .long 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000, 0x7f000000           /* _iExpMask         */
-        .align 32
-        .long 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000, 0x3cf80000           /* _iMinIdxOfsMask   */
-        .align 32
-        .long 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000, 0x04280000           /* _iMaxIdxMask      */
-        .align 32
-        .type	__svml_stanh_data_internal,@object
-        .size	__svml_stanh_data_internal,.-__svml_stanh_data_internal
+	vcvtps2pd %xmm4, %ymm5
+
+	vextractf128 $1, %ymm4, %xmm4
+	vcvtps2pd %xmm4, %ymm4
+
+	vmovdqu	16(%rcx, %rax), %xmm2
+	vinsertf128 $1, 16(%r11, %rax), %ymm2, %ymm2
+
+	vfmadd213pd %ymm3, %ymm5, %ymm1
+
+	vmovupd	16(%rdx, %rax), %xmm3
+	vinsertf128 $1, 16(%r10, %rax), %ymm3, %ymm3
+
+	vunpcklpd %ymm3, %ymm2, %ymm10
+	vunpckhpd %ymm3, %ymm2, %ymm2
+
+	vfmadd213pd %ymm10, %ymm4, %ymm2
+	vfmadd213pd %ymm6, %ymm4, %ymm2
+	vfmadd213pd %ymm7, %ymm4, %ymm2
+	vcvtpd2ps %ymm2, %xmm2
+
+	vmovdqu	(%r9, %rax), %xmm7
+	vinsertf128 $1, (%rdi, %rax), %ymm7, %ymm7
+
+	vmovupd	(%r8, %rax), %xmm3
+	vinsertf128 $1, (%rsi, %rax), %ymm3, %ymm3
+
+	vunpckhpd %ymm3, %ymm7, %ymm4
+	vunpcklpd %ymm3, %ymm7, %ymm7
+
+	vfmadd213pd %ymm4, %ymm5, %ymm1
+	vfmadd213pd %ymm7, %ymm5, %ymm1
+
+
+	vcvtpd2ps %ymm1, %xmm1
+	vinsertf128 $1, %xmm2, %ymm1, %ymm1
+
+	vmovmskps %ymm15, %edx
+	vandnps	%ymm0, %ymm11, %ymm2
+	testl	%edx, %edx
+	/* Go to special inputs processing branch.  */
+	jne	L(SPECIAL_VALUES_BRANCH)
+	/* Wait until after the branch to write over ymm0.  */
+	vorps	%ymm2, %ymm1, %ymm0
+	/* No stack restoration on the fastpath.  */
+	ret
+
+
+L(SPECIAL_VALUES_BRANCH):
+	pushq	%rbp
+	/* Need callee-saved registers to preserve state across tanhf calls.
+	 */
+	pushq	%r12
+	pushq	%r13
+	movq	%rsp, %rbp
+
+	/* Align stack and make room for 2x ymm vectors.  */
+	andq	$-32, %rsp
+	addq	$-64, %rsp
+
+	/* Save all already computed inputs.  */
+	vorps	%ymm2, %ymm1, %ymm1
+	vmovups	%ymm1, (%rsp)
+	/* Save original input (ymm0 unchanged up to this point).  */
+	vmovups	%ymm0, 32(%rsp)
+
+	vzeroupper
+
+	/* edx has 1s where there was a special value that needs to be handled
+	   by a tanhf call.  */
+	movl	%edx, %r13d
+L(SPECIAL_VALUES_LOOP):
+	/* Use r12 as index for special value that is saved across calls to
+	   tanhf. We technically don't need a callee save register here as offset
+	   to rsp is always [0, 28] so we can restore rsp by realigning to 32.
+	   Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
+	   in the loop.  */
+	xorl	%r12d, %r12d
+	tzcntl	%r13d, %r12d
+
+	/* Scalar math function call to process special input.  */
+	movss	32(%rsp, %r12, 4), %xmm0
+	call	tanhf@PLT
+	/* No good way to avoid the store-forwarding fault this will cause on
+	   return. `lfence` avoids the SF fault but at greater cost as it
+	   serializes stack/callee save restoration.  */
+	movss	%xmm0, (%rsp, %r12, 4)
+
+	blsr	%r13d, %r13d
+	jnz	L(SPECIAL_VALUES_LOOP)
+
+	/* All results have been written to (%rsp).  */
+	vmovups	(%rsp), %ymm0
+	movq	%rbp, %rsp
+	popq	%r13
+	popq	%r12
+	popq	%rbp
+	ret
+END(_ZGVdN8v_tanhf_avx2)