diff mbox series

[v1,07/27] x86/fpu: Update rodata usage in svml_s_tanhf16_core_avx512.S

Message ID 20221207085236.1424424-7-goldstein.w.n@gmail.com
State New
Headers show
Series [v1,01/27] x86/fpu: Create helper file for common data macros | expand

Commit Message

Noah Goldstein Dec. 7, 2022, 8:52 a.m. UTC
No changes to the logic, just change how rodata is handled.

1. Define the rodatas using the new macros so they check that the
   offset is correct.

2. Use common data where applicable.
---
 .../multiarch/svml_s_tanhf16_core_avx512.S    | 450 ++++++++----------
 1 file changed, 197 insertions(+), 253 deletions(-)

Comments

H.J. Lu Dec. 16, 2022, 5:05 p.m. UTC | #1
On Wed, Dec 7, 2022 at 12:52 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> No changes to the logic, just change how rodata is handled.
>
> 1. Define the rodatas using the new macros so they check that the
>    offset is correct.
>
> 2. Use common data where applicable.
> ---
>  .../multiarch/svml_s_tanhf16_core_avx512.S    | 450 ++++++++----------
>  1 file changed, 197 insertions(+), 253 deletions(-)
>
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> index d74fc7731d..765e9ed7f7 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> @@ -70,94 +70,99 @@
>   *
>   */
>
> -/* Offsets for data table __svml_stanh_data_internal_avx512. Ordered
> -   by use in the function. On cold-starts this might help the
> -   prefetcher. Possibly a better idea is to interleave start/end so
> -   that the prefetcher is less likely to detect a stream and pull
> -   irrelivant lines into cache.  */
>
> -/* Offsets for data table __svml_stanh_data_internal. 4 bytes each.
> - */
> +
> +#define LOCAL_DATA_NAME        __svml_stanh_data_internal
> +#define LOCAL_DATA_NAME_UNALIGNED      __svml_stanh_data_internal_unaligned
> +#include "svml_s_common_evex512_rodata_offsets.h"
> +
> +/* Offsets for data table __svml_stanh_data_internal_unaligned.
> +   4 bytes each.  */
>  #define _iExpMantMask_UISA             0
>  #define _iMinIdxOfsMask_UISA           4
>  #define _iMaxIdxMask_UISA              8
>  #define _iExpMask                      12
>
> -/* Offsets for data table __svml_stanh_data_internal_al64. 64 bytes
> -   each.  */
> -#define _sC_lo                         0
> -#define _sC_hi                         64
> -#define _sP7_lo                                128
> -#define _sP7_hi                                192
> -#define _sSignMask                     256
> -#define _sP6_lo                                320
> -#define _sP6_hi                                384
> -#define _sP5_lo                                448
> -#define _sP5_hi                                512
> -#define _sP4_lo                                576
> -#define _sP4_hi                                640
> -#define _sP3_lo                                704
> -#define _sP3_hi                                768
> -#define _sP2_lo                                832
> -#define _sP2_hi                                896
> -#define _sP0_lo                                960
> -#define _sP0_hi                                1024
> +/* Offsets for data table __svml_stanh_data_internal. Ordered
> +   by use in the function. On cold-starts this might help the
> +   prefetcher. Possibly a better idea is to interleave start/end so
> +   that the prefetcher is less likely to detect a stream and pull
> +   irrelevant lines into cache.  */
> +
> +/* Offsets for data table __svml_stanh_data_internal.
> +   64 bytes each.  */
> +#define _sC_lo 0
> +#define _sC_hi 64
> +#define _sP7_lo        128
> +#define _sP7_hi        192
> +#define _sP6_lo        256
> +#define _sP6_hi        320
> +#define _sP5_lo        384
> +#define _sP5_hi        448
> +#define _sP4_lo        512
> +#define _sP4_hi        576
> +#define _sP3_lo        640
> +#define _sP3_hi        704
> +#define _sP2_lo        768
> +#define _sP2_hi        832
> +#define _sP0_lo        896
> +#define _sP0_hi        960
> +
>
>  #include <sysdep.h>
> -#define TANHF_DATA(x)                  ((x)+__svml_stanh_data_internal_al64)
> -#define TANHF_DATA_UNALIGNED(x)                ((x)+__svml_stanh_data_internal)
>
>         .section .text.evex512, "ax", @progbits
>  ENTRY(_ZGVeN16v_tanhf_skx)
> -       /* Here huge arguments, INF and NaNs are filtered out to callout. */
> -       vpandd  TANHF_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
> -       vpsubd  TANHF_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
> +       /* Here huge arguments, INF and NaNs are filtered out to
> +          callout.  */
> +       vpandd  LOCAL_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
> +       vpsubd  LOCAL_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
>
>         /* Selection arguments between [0, 0x03e00000] into zmm3.  */
>         vpxord  %zmm3, %zmm3, %zmm3
>         vpmaxsd %zmm3, %zmm2, %zmm3
> -       vpminsd TANHF_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
> +       vpminsd LOCAL_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
>
>         /* Setup permute indices in zmm3.  */
>         vpsrld  $21, %zmm3, %zmm3
>
>         /* Store if there are any special cases in k1.  */
> -       vpcmpd  $6, TANHF_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
> +       vpcmpd  $6, LOCAL_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
>
> -       vmovaps TANHF_DATA(_sC_lo)(%rip), %zmm5
> -       vpermt2ps TANHF_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> +       vmovaps LOCAL_DATA(_sC_lo)(%rip), %zmm5
> +       vpermt2ps LOCAL_DATA(_sC_hi)(%rip), %zmm3, %zmm5
>
> -       vmovaps TANHF_DATA(_sP7_lo)(%rip), %zmm2
> -       vpermt2ps TANHF_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
> +       vmovaps LOCAL_DATA(_sP7_lo)(%rip), %zmm2
> +       vpermt2ps LOCAL_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
>
>         /* Store absolute values of inputs in zmm1.  */
> -       vmovaps TANHF_DATA(_sSignMask)(%rip), %zmm4
> +       vmovaps COMMON_DATA(_SignMask)(%rip), %zmm4
>         vandnps %zmm0, %zmm4, %zmm1
>         vsubps  {rn-sae}, %zmm5, %zmm1, %zmm1
>
> -       vmovaps TANHF_DATA(_sP6_lo)(%rip), %zmm5
> -       vpermt2ps TANHF_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> +       vmovaps LOCAL_DATA(_sP6_lo)(%rip), %zmm5
> +       vpermt2ps LOCAL_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
>
> -       vmovaps TANHF_DATA(_sP5_lo)(%rip), %zmm6
> -       vpermt2ps TANHF_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> +       vmovaps LOCAL_DATA(_sP5_lo)(%rip), %zmm6
> +       vpermt2ps LOCAL_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
>
>         vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm2
>         vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm2
>
> -       vmovaps TANHF_DATA(_sP4_lo)(%rip), %zmm7
> -       vpermt2ps TANHF_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> +       vmovaps LOCAL_DATA(_sP4_lo)(%rip), %zmm7
> +       vpermt2ps LOCAL_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
>
> -       vmovaps TANHF_DATA(_sP3_lo)(%rip), %zmm8
> -       vpermt2ps TANHF_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> +       vmovaps LOCAL_DATA(_sP3_lo)(%rip), %zmm8
> +       vpermt2ps LOCAL_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
>
>         vfmadd213ps {rn-sae}, %zmm7, %zmm1, %zmm2
>         vfmadd213ps {rn-sae}, %zmm8, %zmm1, %zmm2
>
> -       vmovaps TANHF_DATA(_sP2_lo)(%rip), %zmm9
> -       vpermt2ps TANHF_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> +       vmovaps LOCAL_DATA(_sP2_lo)(%rip), %zmm9
> +       vpermt2ps LOCAL_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
>
> -       vmovaps TANHF_DATA(_sP0_lo)(%rip), %zmm10
> -       vpermt2ps TANHF_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> +       vmovaps LOCAL_DATA(_sP0_lo)(%rip), %zmm10
> +       vpermt2ps LOCAL_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
>
>         vfmadd213ps {rn-sae}, %zmm9, %zmm1, %zmm2
>         vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm2
> @@ -167,7 +172,7 @@ ENTRY(_ZGVeN16v_tanhf_skx)
>
>         /* Go to special inputs processing branch.  */
>         jne     L(SPECIAL_VALUES_BRANCH)
> -       # LOE rbx r12 r13 r14 r15 zmm0 zmm2 zmm4
> +
>         /* Wait until after branch of write over zmm0.  */
>         vpternlogd $0xec, %zmm4, %zmm2, %zmm0
>
> @@ -176,24 +181,24 @@ ENTRY(_ZGVeN16v_tanhf_skx)
>
>         /* Cold case. edx has 1s where there was a special value that
>            needs to be handled by a tanhf call. Optimize for code size
> -          more so than speed here. */
> +          more so than speed here.  */
>  L(SPECIAL_VALUES_BRANCH):
> -       # LOE rbx rdx r12 r13 r14 r15 zmm0 zmm2 zmm4
> -    /* Use r13 to save/restore the stack. This allows us to use rbp as
> -       callee save register saving code size. */
> +
> +       /* Use r13 to save/restore the stack. This allows us to use rbp
> +          as callee save register saving code size.  */
>         pushq   %r13
> -       cfi_adjust_cfa_offset(8)
> -       cfi_offset(r13, -16)
> -       /* Need to callee save registers to preserve state across tanhf calls.
> -        */
> +       cfi_adjust_cfa_offset (8)
> +       cfi_offset (r13, -16)
> +       /* Need to callee save registers to preserve state across tanhf
> +          calls.  */
>         pushq   %rbx
> -       cfi_adjust_cfa_offset(8)
> -       cfi_offset(rbx, -24)
> +       cfi_adjust_cfa_offset (8)
> +       cfi_offset (rbx, -24)
>         pushq   %rbp
> -       cfi_adjust_cfa_offset(8)
> -       cfi_offset(rbp, -32)
> +       cfi_adjust_cfa_offset (8)
> +       cfi_offset (rbp, -32)
>         movq    %rsp, %r13
> -       cfi_def_cfa_register(r13)
> +       cfi_def_cfa_register (r13)
>
>         /* Align stack and make room for 2x zmm vectors.  */
>         andq    $-64, %rsp
> @@ -207,16 +212,17 @@ L(SPECIAL_VALUES_BRANCH):
>
>         vzeroupper
>
> -       /* edx has 1s where there was a special value that needs to be handled
> -          by a tanhf call.  */
> +       /* edx has 1s where there was a special value that needs to be
> +          handled by a tanhf call.  */
>         movl    %edx, %ebx
>  L(SPECIAL_VALUES_LOOP):
> -       # LOE rbx rbp r12 r13 r14 r15
> -       /* use rbp as index for special value that is saved across calls to
> -          tanhf. We technically don't need a callee save register here as offset
> -          to rsp is always [0, 56] so we can restore rsp by realigning to 64.
> -          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> -          in the loop. Realigning also costs more code size.  */
> +
> +       /* use rbp as index for special value that is saved across calls
> +          to tanhf. We technically don't need a callee save register
> +          here as offset to rsp is always [0, 56] so we can restore
> +          rsp by realigning to 64. Essentially the tradeoff is 1 extra
> +          save/restore vs 2 extra instructions in the loop. Realigning
> +          also costs more code size.  */
>         xorl    %ebp, %ebp
>         tzcntl  %ebx, %ebp
>
> @@ -224,203 +230,141 @@ L(SPECIAL_VALUES_LOOP):
>         vmovss  64(%rsp, %rbp, 4), %xmm0
>         call    tanhf@PLT
>
> -       /* No good way to avoid the store-forwarding fault this will cause on
> -          return. `lfence` avoids the SF fault but at greater cost as it
> -          serialized stack/callee save restoration.  */
> +       /* No good way to avoid the store-forwarding fault this will
> +          cause on return. `lfence` avoids the SF fault but at greater
> +          cost as it serializes stack/callee save restoration.  */
>         vmovss  %xmm0, (%rsp, %rbp, 4)
>
> -       blsrl   %ebx, %ebx
> +       blsrl   %ebx, %ebx
>         jnz     L(SPECIAL_VALUES_LOOP)
> -       # LOE r12 r13 r14 r15
> +
>
>         /* All results have been written to (%rsp).  */
>         vmovaps (%rsp), %zmm0
>         /* Restore rsp.  */
>         movq    %r13, %rsp
> -       cfi_def_cfa_register(rsp)
> +       cfi_def_cfa_register (rsp)
>         /* Restore callee save registers.  */
>         popq    %rbp
> -       cfi_adjust_cfa_offset(-8)
> -       cfi_restore(rbp)
> +       cfi_adjust_cfa_offset (-8)
> +       cfi_restore (rbp)
>         popq    %rbx
> -       cfi_adjust_cfa_offset(-8)
> -       cfi_restore(rbp)
> +       cfi_adjust_cfa_offset (-8)
> +       cfi_restore (rbp)
>         popq    %r13
> -       cfi_adjust_cfa_offset(-8)
> -       cfi_restore(r13)
> +       cfi_adjust_cfa_offset (-8)
> +       cfi_restore (r13)
>         ret
>  END(_ZGVeN16v_tanhf_skx)
>
> -       .section .rodata, "a"
> +       .section .rodata.evex512, "a"
>         .align  16
> -#ifdef __svml_stanh_data_internal_typedef
> -typedef unsigned int VUINT32;
> -typedef struct
> -       {
> -       __declspec(align(4)) VUINT32 _iExpMantMask_UISA[1][1];
> -       __declspec(align(4)) VUINT32 _iMinIdxOfsMask_UISA[1][1];
> -       __declspec(align(4)) VUINT32 _iMaxIdxMask_UISA[1][1];
> -       __declspec(align(4)) VUINT32 _iExpMask[1][1];
> -       __declspec(align(64)) VUINT32 _sC_lo[16][1];
> -       __declspec(align(64)) VUINT32 _sC_hi[16][1];
> -       __declspec(align(64)) VUINT32 _sP7_lo[16][1];
> -       __declspec(align(64)) VUINT32 _sP7_hi[16][1];
> -       __declspec(align(64)) VUINT32 _sSignMask[16][1];
> -       __declspec(align(64)) VUINT32 _sP6_lo[16][1];
> -       __declspec(align(64)) VUINT32 _sP6_hi[16][1];
> -       __declspec(align(64)) VUINT32 _sP5_lo[16][1];
> -       __declspec(align(64)) VUINT32 _sP5_hi[16][1];
> -       __declspec(align(64)) VUINT32 _sP4_lo[16][1];
> -       __declspec(align(64)) VUINT32 _sP4_hi[16][1];
> -       __declspec(align(64)) VUINT32 _sP3_lo[16][1];
> -       __declspec(align(64)) VUINT32 _sP3_hi[16][1];
> -       __declspec(align(64)) VUINT32 _sP2_lo[16][1];
> -       __declspec(align(64)) VUINT32 _sP2_hi[16][1];
> -       __declspec(align(64)) VUINT32 _sP0_lo[16][1];
> -       __declspec(align(64)) VUINT32 _sP0_hi[16][1];
> -} __svml_stanh_data_internal;
> -#endif
> -
> -__svml_stanh_data_internal:
> -       .align  4
> -       /* _iExpMantMask_UISA */
> -       .long   0x7fe00000
> -
> -       .align  4
> -       /* _iMinIdxOfsMask_UISA */
> -       .long   0x3d400000
> -
> -       .align  4
> -       /* _iMaxIdxMask_UISA */
> -       .long   0x03e00000
> -
> -       .align  4
> -       /* _iExpMask */
> -       .long   0x7f000000
> -
> -       .align  64
> -__svml_stanh_data_internal_al64:
> -       .align  64
> -       /* _sC_lo */
> -       .long   0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
> -       .long   0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
> -       .long   0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
> -       .long   0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
> -
> -       .align  64
> -       /* _sC_hi */
> -       .long   0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
> -       .long   0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
> -       .long   0x40500000, 0x40700000, 0x40900000, 0x40b00000
> -       .long   0x40d00000, 0x40f00000, 0x41100000, 0x00000000
> -
> -       .align  64
> -       /* _sP7_lo */
> -       .long   0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
> -       .long   0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
> -       .long   0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
> -       .long   0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
> -
> -       .align  64
> -       /* _sP7_hi */
> -       .long   0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
> -       .long   0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
> -       .long   0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
> -       .long   0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
>
> -       .align  64
> -       /* _sSignMask */
> -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> -
> -       .align  64
> -       /* _sP6_lo */
> -       .long   0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
> -       .long   0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
> -       .long   0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
> -       .long   0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
> -
> -       .align  64
> -       /* _sP6_hi */
> -       .long   0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
> -       .long   0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
> -       .long   0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
> -       .long   0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
> -
> -       .align  64
> -       /* _sP5_lo */
> -       .long   0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
> -       .long   0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
> -       .long   0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
> -       .long   0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
> -
> -       .align  64
> -       /* _sP5_hi */
> -       .long   0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
> -       .long   0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
> -       .long   0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
> -       .long   0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
> -
> -       .align  64
> -       /* _sP4_lo */
> -       .long   0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
> -       .long   0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
> -       .long   0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
> -       .long   0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
> -
> -       .align  64
> -       /* _sP4_hi */
> -       .long   0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
> -       .long   0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
> -       .long   0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
> -       .long   0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
> -
> -       .align  64
> -       /* _sP3_lo */
> -       .long   0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
> -       .long   0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
> -       .long   0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
> -       .long   0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
> -
> -       .align  64
> -       /* _sP3_hi */
> -       .long   0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
> -       .long   0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
> -       .long   0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
> -       .long   0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
> -
> -       .align  64
> -       /* _sP2_lo */
> -       .long   0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
> -       .long   0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
> -       .long   0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
> -       .long   0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
> -
> -       .align  64
> -       /* _sP2_hi */
> -       .long   0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
> -       .long   0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
> -       .long   0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
> -       .long   0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
> -
> -       .align  64
> -       /* _sP0_lo */
> -       .long   0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
> -       .long   0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
> -       .long   0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
> -       .long   0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
> -
> -       .align  64
> -       /* _sP0_hi */
> -       .long   0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
> -       .long   0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
> -       .long   0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
> -       .long   0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
> +LOCAL_DATA_NAME_UNALIGNED:
> +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMantMask_UISA, 0x7fe00000)
> +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iMinIdxOfsMask_UISA, 0x3d400000)
> +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iMaxIdxMask_UISA, 0x03e00000)
> +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMask, 0x7f000000)
> +       .type   LOCAL_DATA_NAME_UNALIGNED, @object
> +       .size   LOCAL_DATA_NAME_UNALIGNED, .-LOCAL_DATA_NAME_UNALIGNED
>
>         .align  64
> -       .type   __svml_stanh_data_internal_al64, @object
> -       .size   __svml_stanh_data_internal_al64, .-__svml_stanh_data_internal_al64
> -       .type   __svml_stanh_data_internal, @object
> -       .size   __svml_stanh_data_internal, .-__svml_stanh_data_internal
> +LOCAL_DATA_NAME:
> +       float_block (LOCAL_DATA_NAME, _sC_lo,
> +               0x00000000, 0x3d700000, 0x3d900000, 0x3db00000,
> +               0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000,
> +               0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000,
> +               0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000)
> +
> +       float_block (LOCAL_DATA_NAME, _sC_hi,
> +               0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000,
> +               0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000,
> +               0x40500000, 0x40700000, 0x40900000, 0x40b00000,
> +               0x40d00000, 0x40f00000, 0x41100000, 0x00000000)
> +
> +       float_block (LOCAL_DATA_NAME, _sP7_lo,
> +               0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e,
> +               0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57,
> +               0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f,
> +               0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0)
> +
> +       float_block (LOCAL_DATA_NAME, _sP7_hi,
> +               0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b,
> +               0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22,
> +               0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950,
> +               0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000)
> +
> +       float_block (LOCAL_DATA_NAME, _sP6_lo,
> +               0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756,
> +               0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0,
> +               0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17,
> +               0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad)
> +
> +       float_block (LOCAL_DATA_NAME, _sP6_hi,
> +               0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63,
> +               0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66,
> +               0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3,
> +               0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000)
> +
> +       float_block (LOCAL_DATA_NAME, _sP5_lo,
> +               0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d,
> +               0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670,
> +               0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405,
> +               0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4)
> +
> +       float_block (LOCAL_DATA_NAME, _sP5_hi,
> +               0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9,
> +               0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd,
> +               0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232,
> +               0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000)
> +
> +       float_block (LOCAL_DATA_NAME, _sP4_lo,
> +               0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120,
> +               0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a,
> +               0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88,
> +               0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e)
> +
> +       float_block (LOCAL_DATA_NAME, _sP4_hi,
> +               0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96,
> +               0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67,
> +               0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9,
> +               0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000)
> +
> +       float_block (LOCAL_DATA_NAME, _sP3_lo,
> +               0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d,
> +               0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3,
> +               0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca,
> +               0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92)
> +
> +       float_block (LOCAL_DATA_NAME, _sP3_hi,
> +               0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704,
> +               0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06,
> +               0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2,
> +               0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000)
> +
> +       float_block (LOCAL_DATA_NAME, _sP2_lo,
> +               0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f,
> +               0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580,
> +               0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92,
> +               0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360)
> +
> +       float_block (LOCAL_DATA_NAME, _sP2_hi,
> +               0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2,
> +               0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4,
> +               0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b,
> +               0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000)
> +
> +       float_block (LOCAL_DATA_NAME, _sP0_lo,
> +               0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169,
> +               0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984,
> +               0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163,
> +               0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0)
> +
> +       float_block (LOCAL_DATA_NAME, _sP0_hi,
> +               0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53,
> +               0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85,
> +               0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0,
> +               0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000)
> +
> +       .type   LOCAL_DATA_NAME, @object
> +       .size   LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
> --
> 2.34.1
>

The data movement makes the assembly code much harder to follow.
Sunil, what do you think of this patch series?
Noah Goldstein Dec. 16, 2022, 6:17 p.m. UTC | #2
On Fri, Dec 16, 2022 at 9:06 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, Dec 7, 2022 at 12:52 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > No changes to the logic, just change how rodata is handled.
> >
> > 1. Define the rodatas using the new macros so they check that the
> >    offset is correct.
> >
> > 2. Use common data where applicable.
> > ---
> >  .../multiarch/svml_s_tanhf16_core_avx512.S    | 450 ++++++++----------
> >  1 file changed, 197 insertions(+), 253 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > index d74fc7731d..765e9ed7f7 100644
> > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > @@ -70,94 +70,99 @@
> >   *
> >   */
> >
> > -/* Offsets for data table __svml_stanh_data_internal_avx512. Ordered
> > -   by use in the function. On cold-starts this might help the
> > -   prefetcher. Possibly a better idea is to interleave start/end so
> > -   that the prefetcher is less likely to detect a stream and pull
> > -   irrelivant lines into cache.  */
> >
> > -/* Offsets for data table __svml_stanh_data_internal. 4 bytes each.
> > - */
> > +
> > +#define LOCAL_DATA_NAME        __svml_stanh_data_internal
> > +#define LOCAL_DATA_NAME_UNALIGNED      __svml_stanh_data_internal_unaligned
> > +#include "svml_s_common_evex512_rodata_offsets.h"
> > +
> > +/* Offsets for data table __svml_stanh_data_internal_unaligned.
> > +   4 bytes each.  */
> >  #define _iExpMantMask_UISA             0
> >  #define _iMinIdxOfsMask_UISA           4
> >  #define _iMaxIdxMask_UISA              8
> >  #define _iExpMask                      12
> >
> > -/* Offsets for data table __svml_stanh_data_internal_al64. 64 bytes
> > -   each.  */
> > -#define _sC_lo                         0
> > -#define _sC_hi                         64
> > -#define _sP7_lo                                128
> > -#define _sP7_hi                                192
> > -#define _sSignMask                     256
> > -#define _sP6_lo                                320
> > -#define _sP6_hi                                384
> > -#define _sP5_lo                                448
> > -#define _sP5_hi                                512
> > -#define _sP4_lo                                576
> > -#define _sP4_hi                                640
> > -#define _sP3_lo                                704
> > -#define _sP3_hi                                768
> > -#define _sP2_lo                                832
> > -#define _sP2_hi                                896
> > -#define _sP0_lo                                960
> > -#define _sP0_hi                                1024
> > +/* Offsets for data table __svml_stanh_data_internal. Ordered
> > +   by use in the function. On cold-starts this might help the
> > +   prefetcher. Possibly a better idea is to interleave start/end so
> > +   that the prefetcher is less likely to detect a stream and pull
> > +   irrelevant lines into cache.  */
> > +
> > +/* Offsets for data table __svml_stanh_data_internal.
> > +   64 bytes each.  */
> > +#define _sC_lo 0
> > +#define _sC_hi 64
> > +#define _sP7_lo        128
> > +#define _sP7_hi        192
> > +#define _sP6_lo        256
> > +#define _sP6_hi        320
> > +#define _sP5_lo        384
> > +#define _sP5_hi        448
> > +#define _sP4_lo        512
> > +#define _sP4_hi        576
> > +#define _sP3_lo        640
> > +#define _sP3_hi        704
> > +#define _sP2_lo        768
> > +#define _sP2_hi        832
> > +#define _sP0_lo        896
> > +#define _sP0_hi        960
> > +
> >
> >  #include <sysdep.h>
> > -#define TANHF_DATA(x)                  ((x)+__svml_stanh_data_internal_al64)
> > -#define TANHF_DATA_UNALIGNED(x)                ((x)+__svml_stanh_data_internal)
> >
> >         .section .text.evex512, "ax", @progbits
> >  ENTRY(_ZGVeN16v_tanhf_skx)
> > -       /* Here huge arguments, INF and NaNs are filtered out to callout. */
> > -       vpandd  TANHF_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
> > -       vpsubd  TANHF_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
> > +       /* Here huge arguments, INF and NaNs are filtered out to
> > +          callout.  */
> > +       vpandd  LOCAL_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
> > +       vpsubd  LOCAL_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
> >
> >         /* Selection arguments between [0, 0x03e00000] into zmm3.  */
> >         vpxord  %zmm3, %zmm3, %zmm3
> >         vpmaxsd %zmm3, %zmm2, %zmm3
> > -       vpminsd TANHF_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
> > +       vpminsd LOCAL_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
> >
> >         /* Setup permute indices in zmm3.  */
> >         vpsrld  $21, %zmm3, %zmm3
> >
> >         /* Store if there are any special cases in k1.  */
> > -       vpcmpd  $6, TANHF_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
> > +       vpcmpd  $6, LOCAL_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
> >
> > -       vmovaps TANHF_DATA(_sC_lo)(%rip), %zmm5
> > -       vpermt2ps TANHF_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> > +       vmovaps LOCAL_DATA(_sC_lo)(%rip), %zmm5
> > +       vpermt2ps LOCAL_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> >
> > -       vmovaps TANHF_DATA(_sP7_lo)(%rip), %zmm2
> > -       vpermt2ps TANHF_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
> > +       vmovaps LOCAL_DATA(_sP7_lo)(%rip), %zmm2
> > +       vpermt2ps LOCAL_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
> >
> >         /* Store absolute values of inputs in zmm1.  */
> > -       vmovaps TANHF_DATA(_sSignMask)(%rip), %zmm4
> > +       vmovaps COMMON_DATA(_SignMask)(%rip), %zmm4
> >         vandnps %zmm0, %zmm4, %zmm1
> >         vsubps  {rn-sae}, %zmm5, %zmm1, %zmm1
> >
> > -       vmovaps TANHF_DATA(_sP6_lo)(%rip), %zmm5
> > -       vpermt2ps TANHF_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> > +       vmovaps LOCAL_DATA(_sP6_lo)(%rip), %zmm5
> > +       vpermt2ps LOCAL_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> >
> > -       vmovaps TANHF_DATA(_sP5_lo)(%rip), %zmm6
> > -       vpermt2ps TANHF_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> > +       vmovaps LOCAL_DATA(_sP5_lo)(%rip), %zmm6
> > +       vpermt2ps LOCAL_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> >
> >         vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm2
> >         vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm2
> >
> > -       vmovaps TANHF_DATA(_sP4_lo)(%rip), %zmm7
> > -       vpermt2ps TANHF_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> > +       vmovaps LOCAL_DATA(_sP4_lo)(%rip), %zmm7
> > +       vpermt2ps LOCAL_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> >
> > -       vmovaps TANHF_DATA(_sP3_lo)(%rip), %zmm8
> > -       vpermt2ps TANHF_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> > +       vmovaps LOCAL_DATA(_sP3_lo)(%rip), %zmm8
> > +       vpermt2ps LOCAL_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> >
> >         vfmadd213ps {rn-sae}, %zmm7, %zmm1, %zmm2
> >         vfmadd213ps {rn-sae}, %zmm8, %zmm1, %zmm2
> >
> > -       vmovaps TANHF_DATA(_sP2_lo)(%rip), %zmm9
> > -       vpermt2ps TANHF_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> > +       vmovaps LOCAL_DATA(_sP2_lo)(%rip), %zmm9
> > +       vpermt2ps LOCAL_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> >
> > -       vmovaps TANHF_DATA(_sP0_lo)(%rip), %zmm10
> > -       vpermt2ps TANHF_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> > +       vmovaps LOCAL_DATA(_sP0_lo)(%rip), %zmm10
> > +       vpermt2ps LOCAL_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> >
> >         vfmadd213ps {rn-sae}, %zmm9, %zmm1, %zmm2
> >         vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm2
> > @@ -167,7 +172,7 @@ ENTRY(_ZGVeN16v_tanhf_skx)
> >
> >         /* Go to special inputs processing branch.  */
> >         jne     L(SPECIAL_VALUES_BRANCH)
> > -       # LOE rbx r12 r13 r14 r15 zmm0 zmm2 zmm4
> > +
> >         /* Wait until after branch of write over zmm0.  */
> >         vpternlogd $0xec, %zmm4, %zmm2, %zmm0
> >
> > @@ -176,24 +181,24 @@ ENTRY(_ZGVeN16v_tanhf_skx)
> >
> >         /* Cold case. edx has 1s where there was a special value that
> >            needs to be handled by a tanhf call. Optimize for code size
> > -          more so than speed here. */
> > +          more so than speed here.  */
> >  L(SPECIAL_VALUES_BRANCH):
> > -       # LOE rbx rdx r12 r13 r14 r15 zmm0 zmm2 zmm4
> > -    /* Use r13 to save/restore the stack. This allows us to use rbp as
> > -       callee save register saving code size. */
> > +
> > +       /* Use r13 to save/restore the stack. This allows us to use rbp
> > +          as callee save register saving code size.  */
> >         pushq   %r13
> > -       cfi_adjust_cfa_offset(8)
> > -       cfi_offset(r13, -16)
> > -       /* Need to callee save registers to preserve state across tanhf calls.
> > -        */
> > +       cfi_adjust_cfa_offset (8)
> > +       cfi_offset (r13, -16)
> > +       /* Need to callee save registers to preserve state across tanhf
> > +          calls.  */
> >         pushq   %rbx
> > -       cfi_adjust_cfa_offset(8)
> > -       cfi_offset(rbx, -24)
> > +       cfi_adjust_cfa_offset (8)
> > +       cfi_offset (rbx, -24)
> >         pushq   %rbp
> > -       cfi_adjust_cfa_offset(8)
> > -       cfi_offset(rbp, -32)
> > +       cfi_adjust_cfa_offset (8)
> > +       cfi_offset (rbp, -32)
> >         movq    %rsp, %r13
> > -       cfi_def_cfa_register(r13)
> > +       cfi_def_cfa_register (r13)
> >
> >         /* Align stack and make room for 2x zmm vectors.  */
> >         andq    $-64, %rsp
> > @@ -207,16 +212,17 @@ L(SPECIAL_VALUES_BRANCH):
> >
> >         vzeroupper
> >
> > -       /* edx has 1s where there was a special value that needs to be handled
> > -          by a tanhf call.  */
> > +       /* edx has 1s where there was a special value that needs to be
> > +          handled by a tanhf call.  */
> >         movl    %edx, %ebx
> >  L(SPECIAL_VALUES_LOOP):
> > -       # LOE rbx rbp r12 r13 r14 r15
> > -       /* use rbp as index for special value that is saved across calls to
> > -          tanhf. We technically don't need a callee save register here as offset
> > -          to rsp is always [0, 56] so we can restore rsp by realigning to 64.
> > -          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> > -          in the loop. Realigning also costs more code size.  */
> > +
> > +       /* use rbp as index for special value that is saved across calls
> > +          to tanhf. We technically don't need a callee save register
> > +          here as offset to rsp is always [0, 56] so we can restore
> > +          rsp by realigning to 64. Essentially the tradeoff is 1 extra
> > +          save/restore vs 2 extra instructions in the loop. Realigning
> > +          also costs more code size.  */
> >         xorl    %ebp, %ebp
> >         tzcntl  %ebx, %ebp
> >
> > @@ -224,203 +230,141 @@ L(SPECIAL_VALUES_LOOP):
> >         vmovss  64(%rsp, %rbp, 4), %xmm0
> >         call    tanhf@PLT
> >
> > -       /* No good way to avoid the store-forwarding fault this will cause on
> > -          return. `lfence` avoids the SF fault but at greater cost as it
> > -          serialized stack/callee save restoration.  */
> > +       /* No good way to avoid the store-forwarding fault this will
> > +          cause on return. `lfence` avoids the SF fault but at greater
> > +          cost as it serialized stack/callee save restoration.  */
> >         vmovss  %xmm0, (%rsp, %rbp, 4)
> >
> > -       blsrl   %ebx, %ebx
> > +       blsrl   %ebx, %ebx
> >         jnz     L(SPECIAL_VALUES_LOOP)
> > -       # LOE r12 r13 r14 r15
> > +
> >
> >         /* All results have been written to (%rsp).  */
> >         vmovaps (%rsp), %zmm0
> >         /* Restore rsp.  */
> >         movq    %r13, %rsp
> > -       cfi_def_cfa_register(rsp)
> > +       cfi_def_cfa_register (rsp)
> >         /* Restore callee save registers.  */
> >         popq    %rbp
> > -       cfi_adjust_cfa_offset(-8)
> > -       cfi_restore(rbp)
> > +       cfi_adjust_cfa_offset (-8)
> > +       cfi_restore (rbp)
> >         popq    %rbx
> > -       cfi_adjust_cfa_offset(-8)
> > -       cfi_restore(rbp)
> > +       cfi_adjust_cfa_offset (-8)
> > +       cfi_restore (rbp)
> >         popq    %r13
> > -       cfi_adjust_cfa_offset(-8)
> > -       cfi_restore(r13)
> > +       cfi_adjust_cfa_offset (-8)
> > +       cfi_restore (r13)
> >         ret
> >  END(_ZGVeN16v_tanhf_skx)
> >
> > -       .section .rodata, "a"
> > +       .section .rodata.evex512, "a"
> >         .align  16
> > -#ifdef __svml_stanh_data_internal_typedef
> > -typedef unsigned int VUINT32;
> > -typedef struct
> > -       {
> > -       __declspec(align(4)) VUINT32 _iExpMantMask_UISA[1][1];
> > -       __declspec(align(4)) VUINT32 _iMinIdxOfsMask_UISA[1][1];
> > -       __declspec(align(4)) VUINT32 _iMaxIdxMask_UISA[1][1];
> > -       __declspec(align(4)) VUINT32 _iExpMask[1][1];
> > -       __declspec(align(64)) VUINT32 _sC_lo[16][1];
> > -       __declspec(align(64)) VUINT32 _sC_hi[16][1];
> > -       __declspec(align(64)) VUINT32 _sP7_lo[16][1];
> > -       __declspec(align(64)) VUINT32 _sP7_hi[16][1];
> > -       __declspec(align(64)) VUINT32 _sSignMask[16][1];
> > -       __declspec(align(64)) VUINT32 _sP6_lo[16][1];
> > -       __declspec(align(64)) VUINT32 _sP6_hi[16][1];
> > -       __declspec(align(64)) VUINT32 _sP5_lo[16][1];
> > -       __declspec(align(64)) VUINT32 _sP5_hi[16][1];
> > -       __declspec(align(64)) VUINT32 _sP4_lo[16][1];
> > -       __declspec(align(64)) VUINT32 _sP4_hi[16][1];
> > -       __declspec(align(64)) VUINT32 _sP3_lo[16][1];
> > -       __declspec(align(64)) VUINT32 _sP3_hi[16][1];
> > -       __declspec(align(64)) VUINT32 _sP2_lo[16][1];
> > -       __declspec(align(64)) VUINT32 _sP2_hi[16][1];
> > -       __declspec(align(64)) VUINT32 _sP0_lo[16][1];
> > -       __declspec(align(64)) VUINT32 _sP0_hi[16][1];
> > -} __svml_stanh_data_internal;
> > -#endif
> > -
> > -__svml_stanh_data_internal:
> > -       .align  4
> > -       /* _iExpMantMask_UISA */
> > -       .long   0x7fe00000
> > -
> > -       .align  4
> > -       /* _iMinIdxOfsMask_UISA */
> > -       .long   0x3d400000
> > -
> > -       .align  4
> > -       /* _iMaxIdxMask_UISA */
> > -       .long   0x03e00000
> > -
> > -       .align  4
> > -       /* _iExpMask */
> > -       .long   0x7f000000
> > -
> > -       .align  64
> > -__svml_stanh_data_internal_al64:
> > -       .align  64
> > -       /* _sC_lo */
> > -       .long   0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
> > -       .long   0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
> > -       .long   0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
> > -       .long   0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
> > -
> > -       .align  64
> > -       /* _sC_hi */
> > -       .long   0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
> > -       .long   0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
> > -       .long   0x40500000, 0x40700000, 0x40900000, 0x40b00000
> > -       .long   0x40d00000, 0x40f00000, 0x41100000, 0x00000000
> > -
> > -       .align  64
> > -       /* _sP7_lo */
> > -       .long   0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
> > -       .long   0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
> > -       .long   0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
> > -       .long   0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
> > -
> > -       .align  64
> > -       /* _sP7_hi */
> > -       .long   0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
> > -       .long   0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
> > -       .long   0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
> > -       .long   0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
> >
> > -       .align  64
> > -       /* _sSignMask */
> > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > -
> > -       .align  64
> > -       /* _sP6_lo */
> > -       .long   0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
> > -       .long   0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
> > -       .long   0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
> > -       .long   0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
> > -
> > -       .align  64
> > -       /* _sP6_hi */
> > -       .long   0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
> > -       .long   0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
> > -       .long   0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
> > -       .long   0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
> > -
> > -       .align  64
> > -       /* _sP5_lo */
> > -       .long   0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
> > -       .long   0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
> > -       .long   0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
> > -       .long   0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
> > -
> > -       .align  64
> > -       /* _sP5_hi */
> > -       .long   0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
> > -       .long   0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
> > -       .long   0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
> > -       .long   0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
> > -
> > -       .align  64
> > -       /* _sP4_lo */
> > -       .long   0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
> > -       .long   0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
> > -       .long   0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
> > -       .long   0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
> > -
> > -       .align  64
> > -       /* _sP4_hi */
> > -       .long   0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
> > -       .long   0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
> > -       .long   0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
> > -       .long   0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
> > -
> > -       .align  64
> > -       /* _sP3_lo */
> > -       .long   0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
> > -       .long   0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
> > -       .long   0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
> > -       .long   0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
> > -
> > -       .align  64
> > -       /* _sP3_hi */
> > -       .long   0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
> > -       .long   0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
> > -       .long   0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
> > -       .long   0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
> > -
> > -       .align  64
> > -       /* _sP2_lo */
> > -       .long   0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
> > -       .long   0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
> > -       .long   0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
> > -       .long   0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
> > -
> > -       .align  64
> > -       /* _sP2_hi */
> > -       .long   0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
> > -       .long   0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
> > -       .long   0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
> > -       .long   0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
> > -
> > -       .align  64
> > -       /* _sP0_lo */
> > -       .long   0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
> > -       .long   0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
> > -       .long   0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
> > -       .long   0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
> > -
> > -       .align  64
> > -       /* _sP0_hi */
> > -       .long   0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
> > -       .long   0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
> > -       .long   0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
> > -       .long   0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
> > +LOCAL_DATA_NAME_UNALIGNED:
> > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMantMask_UISA, 0x7fe00000)
> > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iMinIdxOfsMask_UISA, 0x3d400000)
> > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iMaxIdxMask_UISA, 0x03e00000)
> > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMask, 0x7f000000)
> > +       .type   LOCAL_DATA_NAME_UNALIGNED, @object
> > +       .size   LOCAL_DATA_NAME_UNALIGNED, .-LOCAL_DATA_NAME_UNALIGNED
> >
> >         .align  64
> > -       .type   __svml_stanh_data_internal_al64, @object
> > -       .size   __svml_stanh_data_internal_al64, .-__svml_stanh_data_internal_al64
> > -       .type   __svml_stanh_data_internal, @object
> > -       .size   __svml_stanh_data_internal, .-__svml_stanh_data_internal
> > +LOCAL_DATA_NAME:
> > +       float_block (LOCAL_DATA_NAME, _sC_lo,
> > +               0x00000000, 0x3d700000, 0x3d900000, 0x3db00000,
> > +               0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000,
> > +               0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000,
> > +               0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000)
> > +
> > +       float_block (LOCAL_DATA_NAME, _sC_hi,
> > +               0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000,
> > +               0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000,
> > +               0x40500000, 0x40700000, 0x40900000, 0x40b00000,
> > +               0x40d00000, 0x40f00000, 0x41100000, 0x00000000)
> > +
> > +       float_block (LOCAL_DATA_NAME, _sP7_lo,
> > +               0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e,
> > +               0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57,
> > +               0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f,
> > +               0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0)
> > +
> > +       float_block (LOCAL_DATA_NAME, _sP7_hi,
> > +               0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b,
> > +               0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22,
> > +               0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950,
> > +               0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000)
> > +
> > +       float_block (LOCAL_DATA_NAME, _sP6_lo,
> > +               0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756,
> > +               0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0,
> > +               0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17,
> > +               0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad)
> > +
> > +       float_block (LOCAL_DATA_NAME, _sP6_hi,
> > +               0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63,
> > +               0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66,
> > +               0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3,
> > +               0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000)
> > +
> > +       float_block (LOCAL_DATA_NAME, _sP5_lo,
> > +               0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d,
> > +               0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670,
> > +               0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405,
> > +               0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4)
> > +
> > +       float_block (LOCAL_DATA_NAME, _sP5_hi,
> > +               0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9,
> > +               0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd,
> > +               0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232,
> > +               0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000)
> > +
> > +       float_block (LOCAL_DATA_NAME, _sP4_lo,
> > +               0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120,
> > +               0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a,
> > +               0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88,
> > +               0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e)
> > +
> > +       float_block (LOCAL_DATA_NAME, _sP4_hi,
> > +               0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96,
> > +               0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67,
> > +               0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9,
> > +               0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000)
> > +
> > +       float_block (LOCAL_DATA_NAME, _sP3_lo,
> > +               0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d,
> > +               0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3,
> > +               0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca,
> > +               0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92)
> > +
> > +       float_block (LOCAL_DATA_NAME, _sP3_hi,
> > +               0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704,
> > +               0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06,
> > +               0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2,
> > +               0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000)
> > +
> > +       float_block (LOCAL_DATA_NAME, _sP2_lo,
> > +               0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f,
> > +               0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580,
> > +               0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92,
> > +               0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360)
> > +
> > +       float_block (LOCAL_DATA_NAME, _sP2_hi,
> > +               0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2,
> > +               0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4,
> > +               0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b,
> > +               0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000)
> > +
> > +       float_block (LOCAL_DATA_NAME, _sP0_lo,
> > +               0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169,
> > +               0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984,
> > +               0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163,
> > +               0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0)
> > +
> > +       float_block (LOCAL_DATA_NAME, _sP0_hi,
> > +               0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53,
> > +               0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85,
> > +               0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0,
> > +               0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000)
> > +
> > +       .type   LOCAL_DATA_NAME, @object
> > +       .size   LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
> > --
> > 2.34.1
> >
>
> The data movement makes the assembly code much harder to follow.
> Sunil, what do you think of this patch series?

What do you mean? The change in how we define rodata, the movement
to multiple files, or something else?
>
>
> --
> H.J.
H.J. Lu Dec. 16, 2022, 9:37 p.m. UTC | #3
On Fri, Dec 16, 2022 at 10:18 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Fri, Dec 16, 2022 at 9:06 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Wed, Dec 7, 2022 at 12:52 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > No changes to the logic, just change how rodata is handled.
> > >
> > > 1. Define the rodatas using the new macros so they check that the
> > >    offset is correct.
> > >
> > > 2. Use common data where applicable.
> > > ---
> > >  .../multiarch/svml_s_tanhf16_core_avx512.S    | 450 ++++++++----------
> > >  1 file changed, 197 insertions(+), 253 deletions(-)
> > >
> > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > index d74fc7731d..765e9ed7f7 100644
> > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > @@ -70,94 +70,99 @@
> > >   *
> > >   */
> > >
> > > -/* Offsets for data table __svml_stanh_data_internal_avx512. Ordered
> > > -   by use in the function. On cold-starts this might help the
> > > -   prefetcher. Possibly a better idea is to interleave start/end so
> > > -   that the prefetcher is less likely to detect a stream and pull
> > > -   irrelivant lines into cache.  */
> > >
> > > -/* Offsets for data table __svml_stanh_data_internal. 4 bytes each.
> > > - */
> > > +
> > > +#define LOCAL_DATA_NAME        __svml_stanh_data_internal
> > > +#define LOCAL_DATA_NAME_UNALIGNED      __svml_stanh_data_internal_unaligned
> > > +#include "svml_s_common_evex512_rodata_offsets.h"
> > > +
> > > +/* Offsets for data table __svml_stanh_data_internal_unaligned.
> > > +   4 bytes each.  */
> > >  #define _iExpMantMask_UISA             0
> > >  #define _iMinIdxOfsMask_UISA           4
> > >  #define _iMaxIdxMask_UISA              8
> > >  #define _iExpMask                      12
> > >
> > > -/* Offsets for data table __svml_stanh_data_internal_al64. 64 bytes
> > > -   each.  */
> > > -#define _sC_lo                         0
> > > -#define _sC_hi                         64
> > > -#define _sP7_lo                                128
> > > -#define _sP7_hi                                192
> > > -#define _sSignMask                     256
> > > -#define _sP6_lo                                320
> > > -#define _sP6_hi                                384
> > > -#define _sP5_lo                                448
> > > -#define _sP5_hi                                512
> > > -#define _sP4_lo                                576
> > > -#define _sP4_hi                                640
> > > -#define _sP3_lo                                704
> > > -#define _sP3_hi                                768
> > > -#define _sP2_lo                                832
> > > -#define _sP2_hi                                896
> > > -#define _sP0_lo                                960
> > > -#define _sP0_hi                                1024
> > > +/* Offsets for data table __svml_stanh_data_internal. Ordered
> > > +   by use in the function. On cold-starts this might help the
> > > +   prefetcher. Possibly a better idea is to interleave start/end so
> > > +   that the prefetcher is less likely to detect a stream and pull
> > > +   irrelivant lines into cache.  */
> > > +
> > > +/* Offsets for data table __svml_stanh_data_internal.
> > > +   64 bytes each.  */
> > > +#define _sC_lo 0
> > > +#define _sC_hi 64
> > > +#define _sP7_lo        128
> > > +#define _sP7_hi        192
> > > +#define _sP6_lo        256
> > > +#define _sP6_hi        320
> > > +#define _sP5_lo        384
> > > +#define _sP5_hi        448
> > > +#define _sP4_lo        512
> > > +#define _sP4_hi        576
> > > +#define _sP3_lo        640
> > > +#define _sP3_hi        704
> > > +#define _sP2_lo        768
> > > +#define _sP2_hi        832
> > > +#define _sP0_lo        896
> > > +#define _sP0_hi        960
> > > +
> > >
> > >  #include <sysdep.h>
> > > -#define TANHF_DATA(x)                  ((x)+__svml_stanh_data_internal_al64)
> > > -#define TANHF_DATA_UNALIGNED(x)                ((x)+__svml_stanh_data_internal)
> > >
> > >         .section .text.evex512, "ax", @progbits
> > >  ENTRY(_ZGVeN16v_tanhf_skx)
> > > -       /* Here huge arguments, INF and NaNs are filtered out to callout. */
> > > -       vpandd  TANHF_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
> > > -       vpsubd  TANHF_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
> > > +       /* Here huge arguments, INF and NaNs are filtered out to
> > > +          callout.  */
> > > +       vpandd  LOCAL_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
> > > +       vpsubd  LOCAL_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
> > >
> > >         /* Selection arguments between [0, 0x03e00000] into zmm3.  */
> > >         vpxord  %zmm3, %zmm3, %zmm3
> > >         vpmaxsd %zmm3, %zmm2, %zmm3
> > > -       vpminsd TANHF_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
> > > +       vpminsd LOCAL_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
> > >
> > >         /* Setup permute indices in zmm3.  */
> > >         vpsrld  $21, %zmm3, %zmm3
> > >
> > >         /* Store if there are any special cases in k1.  */
> > > -       vpcmpd  $6, TANHF_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
> > > +       vpcmpd  $6, LOCAL_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
> > >
> > > -       vmovaps TANHF_DATA(_sC_lo)(%rip), %zmm5
> > > -       vpermt2ps TANHF_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> > > +       vmovaps LOCAL_DATA(_sC_lo)(%rip), %zmm5
> > > +       vpermt2ps LOCAL_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> > >
> > > -       vmovaps TANHF_DATA(_sP7_lo)(%rip), %zmm2
> > > -       vpermt2ps TANHF_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
> > > +       vmovaps LOCAL_DATA(_sP7_lo)(%rip), %zmm2
> > > +       vpermt2ps LOCAL_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
> > >
> > >         /* Store absolute values of inputs in zmm1.  */
> > > -       vmovaps TANHF_DATA(_sSignMask)(%rip), %zmm4
> > > +       vmovaps COMMON_DATA(_SignMask)(%rip), %zmm4
> > >         vandnps %zmm0, %zmm4, %zmm1
> > >         vsubps  {rn-sae}, %zmm5, %zmm1, %zmm1
> > >
> > > -       vmovaps TANHF_DATA(_sP6_lo)(%rip), %zmm5
> > > -       vpermt2ps TANHF_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> > > +       vmovaps LOCAL_DATA(_sP6_lo)(%rip), %zmm5
> > > +       vpermt2ps LOCAL_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> > >
> > > -       vmovaps TANHF_DATA(_sP5_lo)(%rip), %zmm6
> > > -       vpermt2ps TANHF_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> > > +       vmovaps LOCAL_DATA(_sP5_lo)(%rip), %zmm6
> > > +       vpermt2ps LOCAL_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> > >
> > >         vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm2
> > >         vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm2
> > >
> > > -       vmovaps TANHF_DATA(_sP4_lo)(%rip), %zmm7
> > > -       vpermt2ps TANHF_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> > > +       vmovaps LOCAL_DATA(_sP4_lo)(%rip), %zmm7
> > > +       vpermt2ps LOCAL_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> > >
> > > -       vmovaps TANHF_DATA(_sP3_lo)(%rip), %zmm8
> > > -       vpermt2ps TANHF_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> > > +       vmovaps LOCAL_DATA(_sP3_lo)(%rip), %zmm8
> > > +       vpermt2ps LOCAL_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> > >
> > >         vfmadd213ps {rn-sae}, %zmm7, %zmm1, %zmm2
> > >         vfmadd213ps {rn-sae}, %zmm8, %zmm1, %zmm2
> > >
> > > -       vmovaps TANHF_DATA(_sP2_lo)(%rip), %zmm9
> > > -       vpermt2ps TANHF_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> > > +       vmovaps LOCAL_DATA(_sP2_lo)(%rip), %zmm9
> > > +       vpermt2ps LOCAL_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> > >
> > > -       vmovaps TANHF_DATA(_sP0_lo)(%rip), %zmm10
> > > -       vpermt2ps TANHF_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> > > +       vmovaps LOCAL_DATA(_sP0_lo)(%rip), %zmm10
> > > +       vpermt2ps LOCAL_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> > >
> > >         vfmadd213ps {rn-sae}, %zmm9, %zmm1, %zmm2
> > >         vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm2
> > > @@ -167,7 +172,7 @@ ENTRY(_ZGVeN16v_tanhf_skx)
> > >
> > >         /* Go to special inputs processing branch.  */
> > >         jne     L(SPECIAL_VALUES_BRANCH)
> > > -       # LOE rbx r12 r13 r14 r15 zmm0 zmm2 zmm4
> > > +
> > >         /* Wait until after branch of write over zmm0.  */
> > >         vpternlogd $0xec, %zmm4, %zmm2, %zmm0
> > >
> > > @@ -176,24 +181,24 @@ ENTRY(_ZGVeN16v_tanhf_skx)
> > >
> > >         /* Cold case. edx has 1s where there was a special value that
> > >            needs to be handled by a tanhf call. Optimize for code size
> > > -          more so than speed here. */
> > > +          more so than speed here.  */
> > >  L(SPECIAL_VALUES_BRANCH):
> > > -       # LOE rbx rdx r12 r13 r14 r15 zmm0 zmm2 zmm4
> > > -    /* Use r13 to save/restore the stack. This allows us to use rbp as
> > > -       callee save register saving code size. */
> > > +
> > > +       /* Use r13 to save/restore the stack. This allows us to use rbp
> > > +          as callee save register saving code size.  */
> > >         pushq   %r13
> > > -       cfi_adjust_cfa_offset(8)
> > > -       cfi_offset(r13, -16)
> > > -       /* Need to callee save registers to preserve state across tanhf calls.
> > > -        */
> > > +       cfi_adjust_cfa_offset (8)
> > > +       cfi_offset (r13, -16)
> > > +       /* Need to callee save registers to preserve state across tanhf
> > > +          calls.  */
> > >         pushq   %rbx
> > > -       cfi_adjust_cfa_offset(8)
> > > -       cfi_offset(rbx, -24)
> > > +       cfi_adjust_cfa_offset (8)
> > > +       cfi_offset (rbx, -24)
> > >         pushq   %rbp
> > > -       cfi_adjust_cfa_offset(8)
> > > -       cfi_offset(rbp, -32)
> > > +       cfi_adjust_cfa_offset (8)
> > > +       cfi_offset (rbp, -32)
> > >         movq    %rsp, %r13
> > > -       cfi_def_cfa_register(r13)
> > > +       cfi_def_cfa_register (r13)
> > >
> > >         /* Align stack and make room for 2x zmm vectors.  */
> > >         andq    $-64, %rsp
> > > @@ -207,16 +212,17 @@ L(SPECIAL_VALUES_BRANCH):
> > >
> > >         vzeroupper
> > >
> > > -       /* edx has 1s where there was a special value that needs to be handled
> > > -          by a tanhf call.  */
> > > +       /* edx has 1s where there was a special value that needs to be
> > > +          handled by a tanhf call.  */
> > >         movl    %edx, %ebx
> > >  L(SPECIAL_VALUES_LOOP):
> > > -       # LOE rbx rbp r12 r13 r14 r15
> > > -       /* use rbp as index for special value that is saved across calls to
> > > -          tanhf. We technically don't need a callee save register here as offset
> > > -          to rsp is always [0, 56] so we can restore rsp by realigning to 64.
> > > -          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> > > -          in the loop. Realigning also costs more code size.  */
> > > +
> > > +       /* use rbp as index for special value that is saved across calls
> > > +          to tanhf. We technically don't need a callee save register
> > > +          here as offset to rsp is always [0, 56] so we can restore
> > > +          rsp by realigning to 64. Essentially the tradeoff is 1 extra
> > > +          save/restore vs 2 extra instructions in the loop. Realigning
> > > +          also costs more code size.  */
> > >         xorl    %ebp, %ebp
> > >         tzcntl  %ebx, %ebp
> > >
> > > @@ -224,203 +230,141 @@ L(SPECIAL_VALUES_LOOP):
> > >         vmovss  64(%rsp, %rbp, 4), %xmm0
> > >         call    tanhf@PLT
> > >
> > > -       /* No good way to avoid the store-forwarding fault this will cause on
> > > -          return. `lfence` avoids the SF fault but at greater cost as it
> > > -          serialized stack/callee save restoration.  */
> > > +       /* No good way to avoid the store-forwarding fault this will
> > > +          cause on return. `lfence` avoids the SF fault but at greater
> > > +          cost as it serialized stack/callee save restoration.  */
> > >         vmovss  %xmm0, (%rsp, %rbp, 4)
> > >
> > > -       blsrl   %ebx, %ebx
> > > +       blsrl   %ebx, %ebx
> > >         jnz     L(SPECIAL_VALUES_LOOP)
> > > -       # LOE r12 r13 r14 r15
> > > +
> > >
> > >         /* All results have been written to (%rsp).  */
> > >         vmovaps (%rsp), %zmm0
> > >         /* Restore rsp.  */
> > >         movq    %r13, %rsp
> > > -       cfi_def_cfa_register(rsp)
> > > +       cfi_def_cfa_register (rsp)
> > >         /* Restore callee save registers.  */
> > >         popq    %rbp
> > > -       cfi_adjust_cfa_offset(-8)
> > > -       cfi_restore(rbp)
> > > +       cfi_adjust_cfa_offset (-8)
> > > +       cfi_restore (rbp)
> > >         popq    %rbx
> > > -       cfi_adjust_cfa_offset(-8)
> > > -       cfi_restore(rbp)
> > > +       cfi_adjust_cfa_offset (-8)
> > > +       cfi_restore (rbp)
> > >         popq    %r13
> > > -       cfi_adjust_cfa_offset(-8)
> > > -       cfi_restore(r13)
> > > +       cfi_adjust_cfa_offset (-8)
> > > +       cfi_restore (r13)
> > >         ret
> > >  END(_ZGVeN16v_tanhf_skx)
> > >
> > > -       .section .rodata, "a"
> > > +       .section .rodata.evex512, "a"
> > >         .align  16
> > > -#ifdef __svml_stanh_data_internal_typedef
> > > -typedef unsigned int VUINT32;
> > > -typedef struct
> > > -       {
> > > -       __declspec(align(4)) VUINT32 _iExpMantMask_UISA[1][1];
> > > -       __declspec(align(4)) VUINT32 _iMinIdxOfsMask_UISA[1][1];
> > > -       __declspec(align(4)) VUINT32 _iMaxIdxMask_UISA[1][1];
> > > -       __declspec(align(4)) VUINT32 _iExpMask[1][1];
> > > -       __declspec(align(64)) VUINT32 _sC_lo[16][1];
> > > -       __declspec(align(64)) VUINT32 _sC_hi[16][1];
> > > -       __declspec(align(64)) VUINT32 _sP7_lo[16][1];
> > > -       __declspec(align(64)) VUINT32 _sP7_hi[16][1];
> > > -       __declspec(align(64)) VUINT32 _sSignMask[16][1];
> > > -       __declspec(align(64)) VUINT32 _sP6_lo[16][1];
> > > -       __declspec(align(64)) VUINT32 _sP6_hi[16][1];
> > > -       __declspec(align(64)) VUINT32 _sP5_lo[16][1];
> > > -       __declspec(align(64)) VUINT32 _sP5_hi[16][1];
> > > -       __declspec(align(64)) VUINT32 _sP4_lo[16][1];
> > > -       __declspec(align(64)) VUINT32 _sP4_hi[16][1];
> > > -       __declspec(align(64)) VUINT32 _sP3_lo[16][1];
> > > -       __declspec(align(64)) VUINT32 _sP3_hi[16][1];
> > > -       __declspec(align(64)) VUINT32 _sP2_lo[16][1];
> > > -       __declspec(align(64)) VUINT32 _sP2_hi[16][1];
> > > -       __declspec(align(64)) VUINT32 _sP0_lo[16][1];
> > > -       __declspec(align(64)) VUINT32 _sP0_hi[16][1];
> > > -} __svml_stanh_data_internal;
> > > -#endif
> > > -
> > > -__svml_stanh_data_internal:
> > > -       .align  4
> > > -       /* _iExpMantMask_UISA */
> > > -       .long   0x7fe00000
> > > -
> > > -       .align  4
> > > -       /* _iMinIdxOfsMask_UISA */
> > > -       .long   0x3d400000
> > > -
> > > -       .align  4
> > > -       /* _iMaxIdxMask_UISA */
> > > -       .long   0x03e00000
> > > -
> > > -       .align  4
> > > -       /* _iExpMask */
> > > -       .long   0x7f000000
> > > -
> > > -       .align  64
> > > -__svml_stanh_data_internal_al64:
> > > -       .align  64
> > > -       /* _sC_lo */
> > > -       .long   0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
> > > -       .long   0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
> > > -       .long   0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
> > > -       .long   0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
> > > -
> > > -       .align  64
> > > -       /* _sC_hi */
> > > -       .long   0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
> > > -       .long   0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
> > > -       .long   0x40500000, 0x40700000, 0x40900000, 0x40b00000
> > > -       .long   0x40d00000, 0x40f00000, 0x41100000, 0x00000000
> > > -
> > > -       .align  64
> > > -       /* _sP7_lo */
> > > -       .long   0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
> > > -       .long   0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
> > > -       .long   0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
> > > -       .long   0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
> > > -
> > > -       .align  64
> > > -       /* _sP7_hi */
> > > -       .long   0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
> > > -       .long   0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
> > > -       .long   0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
> > > -       .long   0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
> > >
> > > -       .align  64
> > > -       /* _sSignMask */
> > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > -
> > > -       .align  64
> > > -       /* _sP6_lo */
> > > -       .long   0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
> > > -       .long   0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
> > > -       .long   0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
> > > -       .long   0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
> > > -
> > > -       .align  64
> > > -       /* _sP6_hi */
> > > -       .long   0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
> > > -       .long   0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
> > > -       .long   0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
> > > -       .long   0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
> > > -
> > > -       .align  64
> > > -       /* _sP5_lo */
> > > -       .long   0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
> > > -       .long   0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
> > > -       .long   0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
> > > -       .long   0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
> > > -
> > > -       .align  64
> > > -       /* _sP5_hi */
> > > -       .long   0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
> > > -       .long   0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
> > > -       .long   0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
> > > -       .long   0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
> > > -
> > > -       .align  64
> > > -       /* _sP4_lo */
> > > -       .long   0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
> > > -       .long   0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
> > > -       .long   0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
> > > -       .long   0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
> > > -
> > > -       .align  64
> > > -       /* _sP4_hi */
> > > -       .long   0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
> > > -       .long   0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
> > > -       .long   0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
> > > -       .long   0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
> > > -
> > > -       .align  64
> > > -       /* _sP3_lo */
> > > -       .long   0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
> > > -       .long   0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
> > > -       .long   0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
> > > -       .long   0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
> > > -
> > > -       .align  64
> > > -       /* _sP3_hi */
> > > -       .long   0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
> > > -       .long   0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
> > > -       .long   0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
> > > -       .long   0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
> > > -
> > > -       .align  64
> > > -       /* _sP2_lo */
> > > -       .long   0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
> > > -       .long   0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
> > > -       .long   0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
> > > -       .long   0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
> > > -
> > > -       .align  64
> > > -       /* _sP2_hi */
> > > -       .long   0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
> > > -       .long   0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
> > > -       .long   0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
> > > -       .long   0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
> > > -
> > > -       .align  64
> > > -       /* _sP0_lo */
> > > -       .long   0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
> > > -       .long   0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
> > > -       .long   0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
> > > -       .long   0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
> > > -
> > > -       .align  64
> > > -       /* _sP0_hi */
> > > -       .long   0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
> > > -       .long   0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
> > > -       .long   0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
> > > -       .long   0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
> > > +LOCAL_DATA_NAME_UNALIGNED:
> > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMantMask_UISA, 0x7fe00000)
> > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iMinIdxOfsMask_UISA, 0x3d400000)
> > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iMaxIdxMask_UISA, 0x03e00000)
> > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMask, 0x7f000000)
> > > +       .type   LOCAL_DATA_NAME_UNALIGNED, @object
> > > +       .size   LOCAL_DATA_NAME_UNALIGNED, .-LOCAL_DATA_NAME_UNALIGNED
> > >
> > >         .align  64
> > > -       .type   __svml_stanh_data_internal_al64, @object
> > > -       .size   __svml_stanh_data_internal_al64, .-__svml_stanh_data_internal_al64
> > > -       .type   __svml_stanh_data_internal, @object
> > > -       .size   __svml_stanh_data_internal, .-__svml_stanh_data_internal
> > > +LOCAL_DATA_NAME:
> > > +       float_block (LOCAL_DATA_NAME, _sC_lo,
> > > +               0x00000000, 0x3d700000, 0x3d900000, 0x3db00000,
> > > +               0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000,
> > > +               0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000,
> > > +               0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000)
> > > +
> > > +       float_block (LOCAL_DATA_NAME, _sC_hi,
> > > +               0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000,
> > > +               0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000,
> > > +               0x40500000, 0x40700000, 0x40900000, 0x40b00000,
> > > +               0x40d00000, 0x40f00000, 0x41100000, 0x00000000)
> > > +
> > > +       float_block (LOCAL_DATA_NAME, _sP7_lo,
> > > +               0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e,
> > > +               0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57,
> > > +               0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f,
> > > +               0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0)
> > > +
> > > +       float_block (LOCAL_DATA_NAME, _sP7_hi,
> > > +               0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b,
> > > +               0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22,
> > > +               0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950,
> > > +               0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000)
> > > +
> > > +       float_block (LOCAL_DATA_NAME, _sP6_lo,
> > > +               0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756,
> > > +               0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0,
> > > +               0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17,
> > > +               0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad)
> > > +
> > > +       float_block (LOCAL_DATA_NAME, _sP6_hi,
> > > +               0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63,
> > > +               0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66,
> > > +               0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3,
> > > +               0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000)
> > > +
> > > +       float_block (LOCAL_DATA_NAME, _sP5_lo,
> > > +               0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d,
> > > +               0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670,
> > > +               0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405,
> > > +               0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4)
> > > +
> > > +       float_block (LOCAL_DATA_NAME, _sP5_hi,
> > > +               0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9,
> > > +               0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd,
> > > +               0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232,
> > > +               0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000)
> > > +
> > > +       float_block (LOCAL_DATA_NAME, _sP4_lo,
> > > +               0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120,
> > > +               0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a,
> > > +               0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88,
> > > +               0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e)
> > > +
> > > +       float_block (LOCAL_DATA_NAME, _sP4_hi,
> > > +               0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96,
> > > +               0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67,
> > > +               0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9,
> > > +               0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000)
> > > +
> > > +       float_block (LOCAL_DATA_NAME, _sP3_lo,
> > > +               0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d,
> > > +               0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3,
> > > +               0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca,
> > > +               0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92)
> > > +
> > > +       float_block (LOCAL_DATA_NAME, _sP3_hi,
> > > +               0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704,
> > > +               0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06,
> > > +               0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2,
> > > +               0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000)
> > > +
> > > +       float_block (LOCAL_DATA_NAME, _sP2_lo,
> > > +               0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f,
> > > +               0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580,
> > > +               0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92,
> > > +               0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360)
> > > +
> > > +       float_block (LOCAL_DATA_NAME, _sP2_hi,
> > > +               0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2,
> > > +               0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4,
> > > +               0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b,
> > > +               0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000)
> > > +
> > > +       float_block (LOCAL_DATA_NAME, _sP0_lo,
> > > +               0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169,
> > > +               0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984,
> > > +               0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163,
> > > +               0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0)
> > > +
> > > +       float_block (LOCAL_DATA_NAME, _sP0_hi,
> > > +               0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53,
> > > +               0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85,
> > > +               0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0,
> > > +               0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000)
> > > +
> > > +       .type   LOCAL_DATA_NAME, @object
> > > +       .size   LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
> > > --
> > > 2.34.1
> > >
> >
> > > The data movement makes the assembly code much harder to follow.
> > Sunil, what do you think of this patch series?
>
> What do you mean? The change in how we define rodata, the movement
> to multiple files, or something else?

The glibc way to support data files for assembly code is to define the
data in C and use *.sym files to generate offsets for the assembly files, like

sysdeps/x86/cpu-features-offsets.sym:XSAVE_STATE_SIZE_OFFSET offsetof (struct cpu_features, xsave_state_size)
sysdeps/x86_64/dl-trampoline.h:  sub _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
sysdeps/x86_64/dl-trampoline.h:  sub _dl_x86_cpu_features+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
Noah Goldstein Dec. 16, 2022, 9:51 p.m. UTC | #4
On Fri, Dec 16, 2022 at 1:38 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Dec 16, 2022 at 10:18 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Fri, Dec 16, 2022 at 9:06 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Wed, Dec 7, 2022 at 12:52 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > No changes to the logic, just change how rodata is handled.
> > > >
> > > > 1. Define the rodatas using the new macros so they check that the
> > > >    offset is correct.
> > > >
> > > > 2. Use common data where applicable.
> > > > ---
> > > >  .../multiarch/svml_s_tanhf16_core_avx512.S    | 450 ++++++++----------
> > > >  1 file changed, 197 insertions(+), 253 deletions(-)
> > > >
> > > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > > index d74fc7731d..765e9ed7f7 100644
> > > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > > @@ -70,94 +70,99 @@
> > > >   *
> > > >   */
> > > >
> > > > -/* Offsets for data table __svml_stanh_data_internal_avx512. Ordered
> > > > -   by use in the function. On cold-starts this might help the
> > > > -   prefetcher. Possibly a better idea is to interleave start/end so
> > > > -   that the prefetcher is less likely to detect a stream and pull
> > > > -   irrelivant lines into cache.  */
> > > >
> > > > -/* Offsets for data table __svml_stanh_data_internal. 4 bytes each.
> > > > - */
> > > > +
> > > > +#define LOCAL_DATA_NAME        __svml_stanh_data_internal
> > > > +#define LOCAL_DATA_NAME_UNALIGNED      __svml_stanh_data_internal_unaligned
> > > > +#include "svml_s_common_evex512_rodata_offsets.h"
> > > > +
> > > > +/* Offsets for data table __svml_stanh_data_internal_unaligned.
> > > > +   4 bytes each.  */
> > > >  #define _iExpMantMask_UISA             0
> > > >  #define _iMinIdxOfsMask_UISA           4
> > > >  #define _iMaxIdxMask_UISA              8
> > > >  #define _iExpMask                      12
> > > >
> > > > -/* Offsets for data table __svml_stanh_data_internal_al64. 64 bytes
> > > > -   each.  */
> > > > -#define _sC_lo                         0
> > > > -#define _sC_hi                         64
> > > > -#define _sP7_lo                                128
> > > > -#define _sP7_hi                                192
> > > > -#define _sSignMask                     256
> > > > -#define _sP6_lo                                320
> > > > -#define _sP6_hi                                384
> > > > -#define _sP5_lo                                448
> > > > -#define _sP5_hi                                512
> > > > -#define _sP4_lo                                576
> > > > -#define _sP4_hi                                640
> > > > -#define _sP3_lo                                704
> > > > -#define _sP3_hi                                768
> > > > -#define _sP2_lo                                832
> > > > -#define _sP2_hi                                896
> > > > -#define _sP0_lo                                960
> > > > -#define _sP0_hi                                1024
> > > > +/* Offsets for data table __svml_stanh_data_internal. Ordered
> > > > +   by use in the function. On cold-starts this might help the
> > > > +   prefetcher. Possibly a better idea is to interleave start/end so
> > > > +   that the prefetcher is less likely to detect a stream and pull
> > > > +   irrelivant lines into cache.  */
> > > > +
> > > > +/* Offsets for data table __svml_stanh_data_internal.
> > > > +   64 bytes each.  */
> > > > +#define _sC_lo 0
> > > > +#define _sC_hi 64
> > > > +#define _sP7_lo        128
> > > > +#define _sP7_hi        192
> > > > +#define _sP6_lo        256
> > > > +#define _sP6_hi        320
> > > > +#define _sP5_lo        384
> > > > +#define _sP5_hi        448
> > > > +#define _sP4_lo        512
> > > > +#define _sP4_hi        576
> > > > +#define _sP3_lo        640
> > > > +#define _sP3_hi        704
> > > > +#define _sP2_lo        768
> > > > +#define _sP2_hi        832
> > > > +#define _sP0_lo        896
> > > > +#define _sP0_hi        960
> > > > +
> > > >
> > > >  #include <sysdep.h>
> > > > -#define TANHF_DATA(x)                  ((x)+__svml_stanh_data_internal_al64)
> > > > -#define TANHF_DATA_UNALIGNED(x)                ((x)+__svml_stanh_data_internal)
> > > >
> > > >         .section .text.evex512, "ax", @progbits
> > > >  ENTRY(_ZGVeN16v_tanhf_skx)
> > > > -       /* Here huge arguments, INF and NaNs are filtered out to callout. */
> > > > -       vpandd  TANHF_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
> > > > -       vpsubd  TANHF_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
> > > > +       /* Here huge arguments, INF and NaNs are filtered out to
> > > > +          callout.  */
> > > > +       vpandd  LOCAL_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
> > > > +       vpsubd  LOCAL_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
> > > >
> > > >         /* Selection arguments between [0, 0x03e00000] into zmm3.  */
> > > >         vpxord  %zmm3, %zmm3, %zmm3
> > > >         vpmaxsd %zmm3, %zmm2, %zmm3
> > > > -       vpminsd TANHF_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
> > > > +       vpminsd LOCAL_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
> > > >
> > > >         /* Setup permute indices in zmm3.  */
> > > >         vpsrld  $21, %zmm3, %zmm3
> > > >
> > > >         /* Store if there are any special cases in k1.  */
> > > > -       vpcmpd  $6, TANHF_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
> > > > +       vpcmpd  $6, LOCAL_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
> > > >
> > > > -       vmovaps TANHF_DATA(_sC_lo)(%rip), %zmm5
> > > > -       vpermt2ps TANHF_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> > > > +       vmovaps LOCAL_DATA(_sC_lo)(%rip), %zmm5
> > > > +       vpermt2ps LOCAL_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> > > >
> > > > -       vmovaps TANHF_DATA(_sP7_lo)(%rip), %zmm2
> > > > -       vpermt2ps TANHF_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
> > > > +       vmovaps LOCAL_DATA(_sP7_lo)(%rip), %zmm2
> > > > +       vpermt2ps LOCAL_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
> > > >
> > > >         /* Store absolute values of inputs in zmm1.  */
> > > > -       vmovaps TANHF_DATA(_sSignMask)(%rip), %zmm4
> > > > +       vmovaps COMMON_DATA(_SignMask)(%rip), %zmm4
> > > >         vandnps %zmm0, %zmm4, %zmm1
> > > >         vsubps  {rn-sae}, %zmm5, %zmm1, %zmm1
> > > >
> > > > -       vmovaps TANHF_DATA(_sP6_lo)(%rip), %zmm5
> > > > -       vpermt2ps TANHF_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> > > > +       vmovaps LOCAL_DATA(_sP6_lo)(%rip), %zmm5
> > > > +       vpermt2ps LOCAL_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> > > >
> > > > -       vmovaps TANHF_DATA(_sP5_lo)(%rip), %zmm6
> > > > -       vpermt2ps TANHF_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> > > > +       vmovaps LOCAL_DATA(_sP5_lo)(%rip), %zmm6
> > > > +       vpermt2ps LOCAL_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> > > >
> > > >         vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm2
> > > >         vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm2
> > > >
> > > > -       vmovaps TANHF_DATA(_sP4_lo)(%rip), %zmm7
> > > > -       vpermt2ps TANHF_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> > > > +       vmovaps LOCAL_DATA(_sP4_lo)(%rip), %zmm7
> > > > +       vpermt2ps LOCAL_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> > > >
> > > > -       vmovaps TANHF_DATA(_sP3_lo)(%rip), %zmm8
> > > > -       vpermt2ps TANHF_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> > > > +       vmovaps LOCAL_DATA(_sP3_lo)(%rip), %zmm8
> > > > +       vpermt2ps LOCAL_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> > > >
> > > >         vfmadd213ps {rn-sae}, %zmm7, %zmm1, %zmm2
> > > >         vfmadd213ps {rn-sae}, %zmm8, %zmm1, %zmm2
> > > >
> > > > -       vmovaps TANHF_DATA(_sP2_lo)(%rip), %zmm9
> > > > -       vpermt2ps TANHF_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> > > > +       vmovaps LOCAL_DATA(_sP2_lo)(%rip), %zmm9
> > > > +       vpermt2ps LOCAL_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> > > >
> > > > -       vmovaps TANHF_DATA(_sP0_lo)(%rip), %zmm10
> > > > -       vpermt2ps TANHF_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> > > > +       vmovaps LOCAL_DATA(_sP0_lo)(%rip), %zmm10
> > > > +       vpermt2ps LOCAL_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> > > >
> > > >         vfmadd213ps {rn-sae}, %zmm9, %zmm1, %zmm2
> > > >         vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm2
> > > > @@ -167,7 +172,7 @@ ENTRY(_ZGVeN16v_tanhf_skx)
> > > >
> > > >         /* Go to special inputs processing branch.  */
> > > >         jne     L(SPECIAL_VALUES_BRANCH)
> > > > -       # LOE rbx r12 r13 r14 r15 zmm0 zmm2 zmm4
> > > > +
> > > >         /* Wait until after branch of write over zmm0.  */
> > > >         vpternlogd $0xec, %zmm4, %zmm2, %zmm0
> > > >
> > > > @@ -176,24 +181,24 @@ ENTRY(_ZGVeN16v_tanhf_skx)
> > > >
> > > >         /* Cold case. edx has 1s where there was a special value that
> > > >            needs to be handled by a tanhf call. Optimize for code size
> > > > -          more so than speed here. */
> > > > +          more so than speed here.  */
> > > >  L(SPECIAL_VALUES_BRANCH):
> > > > -       # LOE rbx rdx r12 r13 r14 r15 zmm0 zmm2 zmm4
> > > > -    /* Use r13 to save/restore the stack. This allows us to use rbp as
> > > > -       callee save register saving code size. */
> > > > +
> > > > +       /* Use r13 to save/restore the stack. This allows us to use rbp
> > > > +          as callee save register saving code size.  */
> > > >         pushq   %r13
> > > > -       cfi_adjust_cfa_offset(8)
> > > > -       cfi_offset(r13, -16)
> > > > -       /* Need to callee save registers to preserve state across tanhf calls.
> > > > -        */
> > > > +       cfi_adjust_cfa_offset (8)
> > > > +       cfi_offset (r13, -16)
> > > > +       /* Need to callee save registers to preserve state across tanhf
> > > > +          calls.  */
> > > >         pushq   %rbx
> > > > -       cfi_adjust_cfa_offset(8)
> > > > -       cfi_offset(rbx, -24)
> > > > +       cfi_adjust_cfa_offset (8)
> > > > +       cfi_offset (rbx, -24)
> > > >         pushq   %rbp
> > > > -       cfi_adjust_cfa_offset(8)
> > > > -       cfi_offset(rbp, -32)
> > > > +       cfi_adjust_cfa_offset (8)
> > > > +       cfi_offset (rbp, -32)
> > > >         movq    %rsp, %r13
> > > > -       cfi_def_cfa_register(r13)
> > > > +       cfi_def_cfa_register (r13)
> > > >
> > > >         /* Align stack and make room for 2x zmm vectors.  */
> > > >         andq    $-64, %rsp
> > > > @@ -207,16 +212,17 @@ L(SPECIAL_VALUES_BRANCH):
> > > >
> > > >         vzeroupper
> > > >
> > > > -       /* edx has 1s where there was a special value that needs to be handled
> > > > -          by a tanhf call.  */
> > > > +       /* edx has 1s where there was a special value that needs to be
> > > > +          handled by a tanhf call.  */
> > > >         movl    %edx, %ebx
> > > >  L(SPECIAL_VALUES_LOOP):
> > > > -       # LOE rbx rbp r12 r13 r14 r15
> > > > -       /* use rbp as index for special value that is saved across calls to
> > > > -          tanhf. We technically don't need a callee save register here as offset
> > > > -          to rsp is always [0, 56] so we can restore rsp by realigning to 64.
> > > > -          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> > > > -          in the loop. Realigning also costs more code size.  */
> > > > +
> > > > +       /* use rbp as index for special value that is saved across calls
> > > > +          to tanhf. We technically don't need a callee save register
> > > > +          here as offset to rsp is always [0, 56] so we can restore
> > > > +          rsp by realigning to 64. Essentially the tradeoff is 1 extra
> > > > +          save/restore vs 2 extra instructions in the loop. Realigning
> > > > +          also costs more code size.  */
> > > >         xorl    %ebp, %ebp
> > > >         tzcntl  %ebx, %ebp
> > > >
> > > > @@ -224,203 +230,141 @@ L(SPECIAL_VALUES_LOOP):
> > > >         vmovss  64(%rsp, %rbp, 4), %xmm0
> > > >         call    tanhf@PLT
> > > >
> > > > -       /* No good way to avoid the store-forwarding fault this will cause on
> > > > -          return. `lfence` avoids the SF fault but at greater cost as it
> > > > -          serialized stack/callee save restoration.  */
> > > > +       /* No good way to avoid the store-forwarding fault this will
> > > > +          cause on return. `lfence` avoids the SF fault but at greater
> > > > +          cost as it serialized stack/callee save restoration.  */
> > > >         vmovss  %xmm0, (%rsp, %rbp, 4)
> > > >
> > > > -       blsrl   %ebx, %ebx
> > > > +       blsrl   %ebx, %ebx
> > > >         jnz     L(SPECIAL_VALUES_LOOP)
> > > > -       # LOE r12 r13 r14 r15
> > > > +
> > > >
> > > >         /* All results have been written to (%rsp).  */
> > > >         vmovaps (%rsp), %zmm0
> > > >         /* Restore rsp.  */
> > > >         movq    %r13, %rsp
> > > > -       cfi_def_cfa_register(rsp)
> > > > +       cfi_def_cfa_register (rsp)
> > > >         /* Restore callee save registers.  */
> > > >         popq    %rbp
> > > > -       cfi_adjust_cfa_offset(-8)
> > > > -       cfi_restore(rbp)
> > > > +       cfi_adjust_cfa_offset (-8)
> > > > +       cfi_restore (rbp)
> > > >         popq    %rbx
> > > > -       cfi_adjust_cfa_offset(-8)
> > > > -       cfi_restore(rbp)
> > > > +       cfi_adjust_cfa_offset (-8)
> > > > +       cfi_restore (rbp)
> > > >         popq    %r13
> > > > -       cfi_adjust_cfa_offset(-8)
> > > > -       cfi_restore(r13)
> > > > +       cfi_adjust_cfa_offset (-8)
> > > > +       cfi_restore (r13)
> > > >         ret
> > > >  END(_ZGVeN16v_tanhf_skx)
> > > >
> > > > -       .section .rodata, "a"
> > > > +       .section .rodata.evex512, "a"
> > > >         .align  16
> > > > -#ifdef __svml_stanh_data_internal_typedef
> > > > -typedef unsigned int VUINT32;
> > > > -typedef struct
> > > > -       {
> > > > -       __declspec(align(4)) VUINT32 _iExpMantMask_UISA[1][1];
> > > > -       __declspec(align(4)) VUINT32 _iMinIdxOfsMask_UISA[1][1];
> > > > -       __declspec(align(4)) VUINT32 _iMaxIdxMask_UISA[1][1];
> > > > -       __declspec(align(4)) VUINT32 _iExpMask[1][1];
> > > > -       __declspec(align(64)) VUINT32 _sC_lo[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sC_hi[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sP7_lo[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sP7_hi[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sSignMask[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sP6_lo[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sP6_hi[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sP5_lo[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sP5_hi[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sP4_lo[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sP4_hi[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sP3_lo[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sP3_hi[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sP2_lo[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sP2_hi[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sP0_lo[16][1];
> > > > -       __declspec(align(64)) VUINT32 _sP0_hi[16][1];
> > > > -} __svml_stanh_data_internal;
> > > > -#endif
> > > > -
> > > > -__svml_stanh_data_internal:
> > > > -       .align  4
> > > > -       /* _iExpMantMask_UISA */
> > > > -       .long   0x7fe00000
> > > > -
> > > > -       .align  4
> > > > -       /* _iMinIdxOfsMask_UISA */
> > > > -       .long   0x3d400000
> > > > -
> > > > -       .align  4
> > > > -       /* _iMaxIdxMask_UISA */
> > > > -       .long   0x03e00000
> > > > -
> > > > -       .align  4
> > > > -       /* _iExpMask */
> > > > -       .long   0x7f000000
> > > > -
> > > > -       .align  64
> > > > -__svml_stanh_data_internal_al64:
> > > > -       .align  64
> > > > -       /* _sC_lo */
> > > > -       .long   0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
> > > > -       .long   0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
> > > > -       .long   0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
> > > > -       .long   0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
> > > > -
> > > > -       .align  64
> > > > -       /* _sC_hi */
> > > > -       .long   0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
> > > > -       .long   0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
> > > > -       .long   0x40500000, 0x40700000, 0x40900000, 0x40b00000
> > > > -       .long   0x40d00000, 0x40f00000, 0x41100000, 0x00000000
> > > > -
> > > > -       .align  64
> > > > -       /* _sP7_lo */
> > > > -       .long   0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
> > > > -       .long   0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
> > > > -       .long   0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
> > > > -       .long   0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
> > > > -
> > > > -       .align  64
> > > > -       /* _sP7_hi */
> > > > -       .long   0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
> > > > -       .long   0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
> > > > -       .long   0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
> > > > -       .long   0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
> > > >
> > > > -       .align  64
> > > > -       /* _sSignMask */
> > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > -
> > > > -       .align  64
> > > > -       /* _sP6_lo */
> > > > -       .long   0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
> > > > -       .long   0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
> > > > -       .long   0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
> > > > -       .long   0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
> > > > -
> > > > -       .align  64
> > > > -       /* _sP6_hi */
> > > > -       .long   0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
> > > > -       .long   0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
> > > > -       .long   0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
> > > > -       .long   0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
> > > > -
> > > > -       .align  64
> > > > -       /* _sP5_lo */
> > > > -       .long   0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
> > > > -       .long   0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
> > > > -       .long   0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
> > > > -       .long   0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
> > > > -
> > > > -       .align  64
> > > > -       /* _sP5_hi */
> > > > -       .long   0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
> > > > -       .long   0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
> > > > -       .long   0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
> > > > -       .long   0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
> > > > -
> > > > -       .align  64
> > > > -       /* _sP4_lo */
> > > > -       .long   0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
> > > > -       .long   0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
> > > > -       .long   0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
> > > > -       .long   0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
> > > > -
> > > > -       .align  64
> > > > -       /* _sP4_hi */
> > > > -       .long   0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
> > > > -       .long   0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
> > > > -       .long   0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
> > > > -       .long   0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
> > > > -
> > > > -       .align  64
> > > > -       /* _sP3_lo */
> > > > -       .long   0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
> > > > -       .long   0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
> > > > -       .long   0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
> > > > -       .long   0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
> > > > -
> > > > -       .align  64
> > > > -       /* _sP3_hi */
> > > > -       .long   0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
> > > > -       .long   0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
> > > > -       .long   0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
> > > > -       .long   0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
> > > > -
> > > > -       .align  64
> > > > -       /* _sP2_lo */
> > > > -       .long   0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
> > > > -       .long   0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
> > > > -       .long   0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
> > > > -       .long   0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
> > > > -
> > > > -       .align  64
> > > > -       /* _sP2_hi */
> > > > -       .long   0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
> > > > -       .long   0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
> > > > -       .long   0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
> > > > -       .long   0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
> > > > -
> > > > -       .align  64
> > > > -       /* _sP0_lo */
> > > > -       .long   0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
> > > > -       .long   0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
> > > > -       .long   0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
> > > > -       .long   0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
> > > > -
> > > > -       .align  64
> > > > -       /* _sP0_hi */
> > > > -       .long   0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
> > > > -       .long   0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
> > > > -       .long   0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
> > > > -       .long   0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
> > > > +LOCAL_DATA_NAME_UNALIGNED:
> > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMantMask_UISA, 0x7fe00000)
> > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iMinIdxOfsMask_UISA, 0x3d400000)
> > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iMaxIdxMask_UISA, 0x03e00000)
> > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMask, 0x7f000000)
> > > > +       .type   LOCAL_DATA_NAME_UNALIGNED, @object
> > > > +       .size   LOCAL_DATA_NAME_UNALIGNED, .-LOCAL_DATA_NAME_UNALIGNED
> > > >
> > > >         .align  64
> > > > -       .type   __svml_stanh_data_internal_al64, @object
> > > > -       .size   __svml_stanh_data_internal_al64, .-__svml_stanh_data_internal_al64
> > > > -       .type   __svml_stanh_data_internal, @object
> > > > -       .size   __svml_stanh_data_internal, .-__svml_stanh_data_internal
> > > > +LOCAL_DATA_NAME:
> > > > +       float_block (LOCAL_DATA_NAME, _sC_lo,
> > > > +               0x00000000, 0x3d700000, 0x3d900000, 0x3db00000,
> > > > +               0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000,
> > > > +               0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000,
> > > > +               0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000)
> > > > +
> > > > +       float_block (LOCAL_DATA_NAME, _sC_hi,
> > > > +               0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000,
> > > > +               0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000,
> > > > +               0x40500000, 0x40700000, 0x40900000, 0x40b00000,
> > > > +               0x40d00000, 0x40f00000, 0x41100000, 0x00000000)
> > > > +
> > > > +       float_block (LOCAL_DATA_NAME, _sP7_lo,
> > > > +               0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e,
> > > > +               0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57,
> > > > +               0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f,
> > > > +               0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0)
> > > > +
> > > > +       float_block (LOCAL_DATA_NAME, _sP7_hi,
> > > > +               0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b,
> > > > +               0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22,
> > > > +               0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950,
> > > > +               0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000)
> > > > +
> > > > +       float_block (LOCAL_DATA_NAME, _sP6_lo,
> > > > +               0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756,
> > > > +               0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0,
> > > > +               0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17,
> > > > +               0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad)
> > > > +
> > > > +       float_block (LOCAL_DATA_NAME, _sP6_hi,
> > > > +               0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63,
> > > > +               0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66,
> > > > +               0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3,
> > > > +               0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000)
> > > > +
> > > > +       float_block (LOCAL_DATA_NAME, _sP5_lo,
> > > > +               0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d,
> > > > +               0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670,
> > > > +               0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405,
> > > > +               0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4)
> > > > +
> > > > +       float_block (LOCAL_DATA_NAME, _sP5_hi,
> > > > +               0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9,
> > > > +               0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd,
> > > > +               0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232,
> > > > +               0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000)
> > > > +
> > > > +       float_block (LOCAL_DATA_NAME, _sP4_lo,
> > > > +               0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120,
> > > > +               0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a,
> > > > +               0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88,
> > > > +               0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e)
> > > > +
> > > > +       float_block (LOCAL_DATA_NAME, _sP4_hi,
> > > > +               0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96,
> > > > +               0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67,
> > > > +               0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9,
> > > > +               0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000)
> > > > +
> > > > +       float_block (LOCAL_DATA_NAME, _sP3_lo,
> > > > +               0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d,
> > > > +               0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3,
> > > > +               0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca,
> > > > +               0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92)
> > > > +
> > > > +       float_block (LOCAL_DATA_NAME, _sP3_hi,
> > > > +               0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704,
> > > > +               0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06,
> > > > +               0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2,
> > > > +               0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000)
> > > > +
> > > > +       float_block (LOCAL_DATA_NAME, _sP2_lo,
> > > > +               0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f,
> > > > +               0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580,
> > > > +               0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92,
> > > > +               0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360)
> > > > +
> > > > +       float_block (LOCAL_DATA_NAME, _sP2_hi,
> > > > +               0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2,
> > > > +               0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4,
> > > > +               0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b,
> > > > +               0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000)
> > > > +
> > > > +       float_block (LOCAL_DATA_NAME, _sP0_lo,
> > > > +               0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169,
> > > > +               0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984,
> > > > +               0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163,
> > > > +               0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0)
> > > > +
> > > > +       float_block (LOCAL_DATA_NAME, _sP0_hi,
> > > > +               0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53,
> > > > +               0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85,
> > > > +               0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0,
> > > > +               0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000)
> > > > +
> > > > +       .type   LOCAL_DATA_NAME, @object
> > > > +       .size   LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
> > > > --
> > > > 2.34.1
> > > >
> > >
> > > The data movement makes the assembly code much harder to follow.
> > > Sunil, what do you think of this patch series?
> >
> > What do you mean? The change in how we define rodata or the movement
> > to multiple files or something else?
>
> The glibc way to support data files for assembly code is to define
> data in C and use *.sym to generate offsets for assembly files, like

I see. Although to be fair the entire SVML codebase bucks that trend.

Seems like a more dramatic change to move all the offsets to C.
>
> sysdeps/x86/cpu-features-offsets.sym:XSAVE_STATE_SIZE_OFFSET
> offsetof (struct cpu_features, xsave_state_size)
> sysdeps/x86_64/dl-trampoline.h:  sub
> _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip),
> %RSP_LP
> sysdeps/x86_64/dl-trampoline.h:  sub
> _dl_x86_cpu_features+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
>
> --
> H.J.
H.J. Lu Dec. 16, 2022, 10:01 p.m. UTC | #5
On Fri, Dec 16, 2022 at 1:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Fri, Dec 16, 2022 at 1:38 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Fri, Dec 16, 2022 at 10:18 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > On Fri, Dec 16, 2022 at 9:06 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Wed, Dec 7, 2022 at 12:52 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > >
> > > > > No changes to the logic, just change how rodata is handled.
> > > > >
> > > > > 1. Define the rodatas using the new macros so they check that the
> > > > >    offset is correct.
> > > > >
> > > > > 2. Use common data where applicable.
> > > > > ---
> > > > >  .../multiarch/svml_s_tanhf16_core_avx512.S    | 450 ++++++++----------
> > > > >  1 file changed, 197 insertions(+), 253 deletions(-)
> > > > >
> > > > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > > > index d74fc7731d..765e9ed7f7 100644
> > > > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > > > @@ -70,94 +70,99 @@
> > > > >   *
> > > > >   */
> > > > >
> > > > > -/* Offsets for data table __svml_stanh_data_internal_avx512. Ordered
> > > > > -   by use in the function. On cold-starts this might help the
> > > > > -   prefetcher. Possibly a better idea is to interleave start/end so
> > > > > -   that the prefetcher is less likely to detect a stream and pull
> > > > > -   irrelivant lines into cache.  */
> > > > >
> > > > > -/* Offsets for data table __svml_stanh_data_internal. 4 bytes each.
> > > > > - */
> > > > > +
> > > > > +#define LOCAL_DATA_NAME        __svml_stanh_data_internal
> > > > > +#define LOCAL_DATA_NAME_UNALIGNED      __svml_stanh_data_internal_unaligned
> > > > > +#include "svml_s_common_evex512_rodata_offsets.h"
> > > > > +
> > > > > +/* Offsets for data table __svml_stanh_data_internal_unaligned.
> > > > > +   4 bytes each.  */
> > > > >  #define _iExpMantMask_UISA             0
> > > > >  #define _iMinIdxOfsMask_UISA           4
> > > > >  #define _iMaxIdxMask_UISA              8
> > > > >  #define _iExpMask                      12
> > > > >
> > > > > -/* Offsets for data table __svml_stanh_data_internal_al64. 64 bytes
> > > > > -   each.  */
> > > > > -#define _sC_lo                         0
> > > > > -#define _sC_hi                         64
> > > > > -#define _sP7_lo                                128
> > > > > -#define _sP7_hi                                192
> > > > > -#define _sSignMask                     256
> > > > > -#define _sP6_lo                                320
> > > > > -#define _sP6_hi                                384
> > > > > -#define _sP5_lo                                448
> > > > > -#define _sP5_hi                                512
> > > > > -#define _sP4_lo                                576
> > > > > -#define _sP4_hi                                640
> > > > > -#define _sP3_lo                                704
> > > > > -#define _sP3_hi                                768
> > > > > -#define _sP2_lo                                832
> > > > > -#define _sP2_hi                                896
> > > > > -#define _sP0_lo                                960
> > > > > -#define _sP0_hi                                1024
> > > > > +/* Offsets for data table __svml_stanh_data_internal. Ordered
> > > > > +   by use in the function. On cold-starts this might help the
> > > > > +   prefetcher. Possibly a better idea is to interleave start/end so
> > > > > +   that the prefetcher is less likely to detect a stream and pull
> > > > > > +   irrelevant lines into cache.  */
> > > > > +
> > > > > +/* Offsets for data table __svml_stanh_data_internal.
> > > > > +   64 bytes each.  */
> > > > > +#define _sC_lo 0
> > > > > +#define _sC_hi 64
> > > > > +#define _sP7_lo        128
> > > > > +#define _sP7_hi        192
> > > > > +#define _sP6_lo        256
> > > > > +#define _sP6_hi        320
> > > > > +#define _sP5_lo        384
> > > > > +#define _sP5_hi        448
> > > > > +#define _sP4_lo        512
> > > > > +#define _sP4_hi        576
> > > > > +#define _sP3_lo        640
> > > > > +#define _sP3_hi        704
> > > > > +#define _sP2_lo        768
> > > > > +#define _sP2_hi        832
> > > > > +#define _sP0_lo        896
> > > > > +#define _sP0_hi        960
> > > > > +
> > > > >
> > > > >  #include <sysdep.h>
> > > > > -#define TANHF_DATA(x)                  ((x)+__svml_stanh_data_internal_al64)
> > > > > -#define TANHF_DATA_UNALIGNED(x)                ((x)+__svml_stanh_data_internal)
> > > > >
> > > > >         .section .text.evex512, "ax", @progbits
> > > > >  ENTRY(_ZGVeN16v_tanhf_skx)
> > > > > -       /* Here huge arguments, INF and NaNs are filtered out to callout. */
> > > > > -       vpandd  TANHF_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
> > > > > -       vpsubd  TANHF_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
> > > > > +       /* Here huge arguments, INF and NaNs are filtered out to
> > > > > +          callout.  */
> > > > > +       vpandd  LOCAL_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
> > > > > +       vpsubd  LOCAL_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
> > > > >
> > > > >         /* Selection arguments between [0, 0x03e00000] into zmm3.  */
> > > > >         vpxord  %zmm3, %zmm3, %zmm3
> > > > >         vpmaxsd %zmm3, %zmm2, %zmm3
> > > > > -       vpminsd TANHF_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
> > > > > +       vpminsd LOCAL_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
> > > > >
> > > > >         /* Setup permute indices in zmm3.  */
> > > > >         vpsrld  $21, %zmm3, %zmm3
> > > > >
> > > > >         /* Store if there are any special cases in k1.  */
> > > > > -       vpcmpd  $6, TANHF_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
> > > > > +       vpcmpd  $6, LOCAL_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
> > > > >
> > > > > -       vmovaps TANHF_DATA(_sC_lo)(%rip), %zmm5
> > > > > -       vpermt2ps TANHF_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> > > > > +       vmovaps LOCAL_DATA(_sC_lo)(%rip), %zmm5
> > > > > +       vpermt2ps LOCAL_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> > > > >
> > > > > -       vmovaps TANHF_DATA(_sP7_lo)(%rip), %zmm2
> > > > > -       vpermt2ps TANHF_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
> > > > > +       vmovaps LOCAL_DATA(_sP7_lo)(%rip), %zmm2
> > > > > +       vpermt2ps LOCAL_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
> > > > >
> > > > >         /* Store absolute values of inputs in zmm1.  */
> > > > > -       vmovaps TANHF_DATA(_sSignMask)(%rip), %zmm4
> > > > > +       vmovaps COMMON_DATA(_SignMask)(%rip), %zmm4
> > > > >         vandnps %zmm0, %zmm4, %zmm1
> > > > >         vsubps  {rn-sae}, %zmm5, %zmm1, %zmm1
> > > > >
> > > > > -       vmovaps TANHF_DATA(_sP6_lo)(%rip), %zmm5
> > > > > -       vpermt2ps TANHF_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> > > > > +       vmovaps LOCAL_DATA(_sP6_lo)(%rip), %zmm5
> > > > > +       vpermt2ps LOCAL_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> > > > >
> > > > > -       vmovaps TANHF_DATA(_sP5_lo)(%rip), %zmm6
> > > > > -       vpermt2ps TANHF_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> > > > > +       vmovaps LOCAL_DATA(_sP5_lo)(%rip), %zmm6
> > > > > +       vpermt2ps LOCAL_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> > > > >
> > > > >         vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm2
> > > > >         vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm2
> > > > >
> > > > > -       vmovaps TANHF_DATA(_sP4_lo)(%rip), %zmm7
> > > > > -       vpermt2ps TANHF_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> > > > > +       vmovaps LOCAL_DATA(_sP4_lo)(%rip), %zmm7
> > > > > +       vpermt2ps LOCAL_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> > > > >
> > > > > -       vmovaps TANHF_DATA(_sP3_lo)(%rip), %zmm8
> > > > > -       vpermt2ps TANHF_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> > > > > +       vmovaps LOCAL_DATA(_sP3_lo)(%rip), %zmm8
> > > > > +       vpermt2ps LOCAL_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> > > > >
> > > > >         vfmadd213ps {rn-sae}, %zmm7, %zmm1, %zmm2
> > > > >         vfmadd213ps {rn-sae}, %zmm8, %zmm1, %zmm2
> > > > >
> > > > > -       vmovaps TANHF_DATA(_sP2_lo)(%rip), %zmm9
> > > > > -       vpermt2ps TANHF_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> > > > > +       vmovaps LOCAL_DATA(_sP2_lo)(%rip), %zmm9
> > > > > +       vpermt2ps LOCAL_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> > > > >
> > > > > -       vmovaps TANHF_DATA(_sP0_lo)(%rip), %zmm10
> > > > > -       vpermt2ps TANHF_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> > > > > +       vmovaps LOCAL_DATA(_sP0_lo)(%rip), %zmm10
> > > > > +       vpermt2ps LOCAL_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> > > > >
> > > > >         vfmadd213ps {rn-sae}, %zmm9, %zmm1, %zmm2
> > > > >         vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm2
> > > > > @@ -167,7 +172,7 @@ ENTRY(_ZGVeN16v_tanhf_skx)
> > > > >
> > > > >         /* Go to special inputs processing branch.  */
> > > > >         jne     L(SPECIAL_VALUES_BRANCH)
> > > > > -       # LOE rbx r12 r13 r14 r15 zmm0 zmm2 zmm4
> > > > > +
> > > > >         /* Wait until after branch of write over zmm0.  */
> > > > >         vpternlogd $0xec, %zmm4, %zmm2, %zmm0
> > > > >
> > > > > @@ -176,24 +181,24 @@ ENTRY(_ZGVeN16v_tanhf_skx)
> > > > >
> > > > >         /* Cold case. edx has 1s where there was a special value that
> > > > >            needs to be handled by a tanhf call. Optimize for code size
> > > > > -          more so than speed here. */
> > > > > +          more so than speed here.  */
> > > > >  L(SPECIAL_VALUES_BRANCH):
> > > > > -       # LOE rbx rdx r12 r13 r14 r15 zmm0 zmm2 zmm4
> > > > > -    /* Use r13 to save/restore the stack. This allows us to use rbp as
> > > > > -       callee save register saving code size. */
> > > > > +
> > > > > +       /* Use r13 to save/restore the stack. This allows us to use rbp
> > > > > +          as callee save register saving code size.  */
> > > > >         pushq   %r13
> > > > > -       cfi_adjust_cfa_offset(8)
> > > > > -       cfi_offset(r13, -16)
> > > > > -       /* Need to callee save registers to preserve state across tanhf calls.
> > > > > -        */
> > > > > +       cfi_adjust_cfa_offset (8)
> > > > > +       cfi_offset (r13, -16)
> > > > > +       /* Need to callee save registers to preserve state across tanhf
> > > > > +          calls.  */
> > > > >         pushq   %rbx
> > > > > -       cfi_adjust_cfa_offset(8)
> > > > > -       cfi_offset(rbx, -24)
> > > > > +       cfi_adjust_cfa_offset (8)
> > > > > +       cfi_offset (rbx, -24)
> > > > >         pushq   %rbp
> > > > > -       cfi_adjust_cfa_offset(8)
> > > > > -       cfi_offset(rbp, -32)
> > > > > +       cfi_adjust_cfa_offset (8)
> > > > > +       cfi_offset (rbp, -32)
> > > > >         movq    %rsp, %r13
> > > > > -       cfi_def_cfa_register(r13)
> > > > > +       cfi_def_cfa_register (r13)
> > > > >
> > > > >         /* Align stack and make room for 2x zmm vectors.  */
> > > > >         andq    $-64, %rsp
> > > > > @@ -207,16 +212,17 @@ L(SPECIAL_VALUES_BRANCH):
> > > > >
> > > > >         vzeroupper
> > > > >
> > > > > -       /* edx has 1s where there was a special value that needs to be handled
> > > > > -          by a tanhf call.  */
> > > > > +       /* edx has 1s where there was a special value that needs to be
> > > > > +          handled by a tanhf call.  */
> > > > >         movl    %edx, %ebx
> > > > >  L(SPECIAL_VALUES_LOOP):
> > > > > -       # LOE rbx rbp r12 r13 r14 r15
> > > > > -       /* use rbp as index for special value that is saved across calls to
> > > > > -          tanhf. We technically don't need a callee save register here as offset
> > > > > -          to rsp is always [0, 56] so we can restore rsp by realigning to 64.
> > > > > -          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> > > > > -          in the loop. Realigning also costs more code size.  */
> > > > > +
> > > > > +       /* use rbp as index for special value that is saved across calls
> > > > > +          to tanhf. We technically don't need a callee save register
> > > > > +          here as offset to rsp is always [0, 56] so we can restore
> > > > > +          rsp by realigning to 64. Essentially the tradeoff is 1 extra
> > > > > +          save/restore vs 2 extra instructions in the loop. Realigning
> > > > > +          also costs more code size.  */
> > > > >         xorl    %ebp, %ebp
> > > > >         tzcntl  %ebx, %ebp
> > > > >
> > > > > @@ -224,203 +230,141 @@ L(SPECIAL_VALUES_LOOP):
> > > > >         vmovss  64(%rsp, %rbp, 4), %xmm0
> > > > >         call    tanhf@PLT
> > > > >
> > > > > -       /* No good way to avoid the store-forwarding fault this will cause on
> > > > > -          return. `lfence` avoids the SF fault but at greater cost as it
> > > > > -          serialized stack/callee save restoration.  */
> > > > > +       /* No good way to avoid the store-forwarding fault this will
> > > > > +          cause on return. `lfence` avoids the SF fault but at greater
> > > > > > +          cost as it serializes stack/callee save restoration.  */
> > > > >         vmovss  %xmm0, (%rsp, %rbp, 4)
> > > > >
> > > > > -       blsrl   %ebx, %ebx
> > > > > +       blsrl   %ebx, %ebx
> > > > >         jnz     L(SPECIAL_VALUES_LOOP)
> > > > > -       # LOE r12 r13 r14 r15
> > > > > +
> > > > >
> > > > >         /* All results have been written to (%rsp).  */
> > > > >         vmovaps (%rsp), %zmm0
> > > > >         /* Restore rsp.  */
> > > > >         movq    %r13, %rsp
> > > > > -       cfi_def_cfa_register(rsp)
> > > > > +       cfi_def_cfa_register (rsp)
> > > > >         /* Restore callee save registers.  */
> > > > >         popq    %rbp
> > > > > -       cfi_adjust_cfa_offset(-8)
> > > > > -       cfi_restore(rbp)
> > > > > +       cfi_adjust_cfa_offset (-8)
> > > > > +       cfi_restore (rbp)
> > > > >         popq    %rbx
> > > > > -       cfi_adjust_cfa_offset(-8)
> > > > > -       cfi_restore(rbp)
> > > > > +       cfi_adjust_cfa_offset (-8)
> > > > > +       cfi_restore (rbp)
> > > > >         popq    %r13
> > > > > -       cfi_adjust_cfa_offset(-8)
> > > > > -       cfi_restore(r13)
> > > > > +       cfi_adjust_cfa_offset (-8)
> > > > > +       cfi_restore (r13)
> > > > >         ret
> > > > >  END(_ZGVeN16v_tanhf_skx)
> > > > >
> > > > > -       .section .rodata, "a"
> > > > > +       .section .rodata.evex512, "a"
> > > > >         .align  16
> > > > > -#ifdef __svml_stanh_data_internal_typedef
> > > > > -typedef unsigned int VUINT32;
> > > > > -typedef struct
> > > > > -       {
> > > > > -       __declspec(align(4)) VUINT32 _iExpMantMask_UISA[1][1];
> > > > > -       __declspec(align(4)) VUINT32 _iMinIdxOfsMask_UISA[1][1];
> > > > > -       __declspec(align(4)) VUINT32 _iMaxIdxMask_UISA[1][1];
> > > > > -       __declspec(align(4)) VUINT32 _iExpMask[1][1];
> > > > > -       __declspec(align(64)) VUINT32 _sC_lo[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sC_hi[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sP7_lo[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sP7_hi[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sSignMask[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sP6_lo[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sP6_hi[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sP5_lo[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sP5_hi[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sP4_lo[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sP4_hi[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sP3_lo[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sP3_hi[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sP2_lo[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sP2_hi[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sP0_lo[16][1];
> > > > > -       __declspec(align(64)) VUINT32 _sP0_hi[16][1];
> > > > > -} __svml_stanh_data_internal;
> > > > > -#endif
> > > > > -
> > > > > -__svml_stanh_data_internal:
> > > > > -       .align  4
> > > > > -       /* _iExpMantMask_UISA */
> > > > > -       .long   0x7fe00000
> > > > > -
> > > > > -       .align  4
> > > > > -       /* _iMinIdxOfsMask_UISA */
> > > > > -       .long   0x3d400000
> > > > > -
> > > > > -       .align  4
> > > > > -       /* _iMaxIdxMask_UISA */
> > > > > -       .long   0x03e00000
> > > > > -
> > > > > -       .align  4
> > > > > -       /* _iExpMask */
> > > > > -       .long   0x7f000000
> > > > > -
> > > > > -       .align  64
> > > > > -__svml_stanh_data_internal_al64:
> > > > > -       .align  64
> > > > > -       /* _sC_lo */
> > > > > -       .long   0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
> > > > > -       .long   0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
> > > > > -       .long   0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
> > > > > -       .long   0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
> > > > > -
> > > > > -       .align  64
> > > > > -       /* _sC_hi */
> > > > > -       .long   0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
> > > > > -       .long   0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
> > > > > -       .long   0x40500000, 0x40700000, 0x40900000, 0x40b00000
> > > > > -       .long   0x40d00000, 0x40f00000, 0x41100000, 0x00000000
> > > > > -
> > > > > -       .align  64
> > > > > -       /* _sP7_lo */
> > > > > -       .long   0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
> > > > > -       .long   0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
> > > > > -       .long   0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
> > > > > -       .long   0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
> > > > > -
> > > > > -       .align  64
> > > > > -       /* _sP7_hi */
> > > > > -       .long   0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
> > > > > -       .long   0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
> > > > > -       .long   0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
> > > > > -       .long   0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
> > > > >
> > > > > -       .align  64
> > > > > -       /* _sSignMask */
> > > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > > -
> > > > > -       .align  64
> > > > > -       /* _sP6_lo */
> > > > > -       .long   0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
> > > > > -       .long   0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
> > > > > -       .long   0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
> > > > > -       .long   0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
> > > > > -
> > > > > -       .align  64
> > > > > -       /* _sP6_hi */
> > > > > -       .long   0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
> > > > > -       .long   0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
> > > > > -       .long   0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
> > > > > -       .long   0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
> > > > > -
> > > > > -       .align  64
> > > > > -       /* _sP5_lo */
> > > > > -       .long   0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
> > > > > -       .long   0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
> > > > > -       .long   0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
> > > > > -       .long   0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
> > > > > -
> > > > > -       .align  64
> > > > > -       /* _sP5_hi */
> > > > > -       .long   0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
> > > > > -       .long   0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
> > > > > -       .long   0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
> > > > > -       .long   0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
> > > > > -
> > > > > -       .align  64
> > > > > -       /* _sP4_lo */
> > > > > -       .long   0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
> > > > > -       .long   0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
> > > > > -       .long   0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
> > > > > -       .long   0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
> > > > > -
> > > > > -       .align  64
> > > > > -       /* _sP4_hi */
> > > > > -       .long   0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
> > > > > -       .long   0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
> > > > > -       .long   0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
> > > > > -       .long   0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
> > > > > -
> > > > > -       .align  64
> > > > > -       /* _sP3_lo */
> > > > > -       .long   0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
> > > > > -       .long   0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
> > > > > -       .long   0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
> > > > > -       .long   0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
> > > > > -
> > > > > -       .align  64
> > > > > -       /* _sP3_hi */
> > > > > -       .long   0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
> > > > > -       .long   0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
> > > > > -       .long   0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
> > > > > -       .long   0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
> > > > > -
> > > > > -       .align  64
> > > > > -       /* _sP2_lo */
> > > > > -       .long   0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
> > > > > -       .long   0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
> > > > > -       .long   0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
> > > > > -       .long   0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
> > > > > -
> > > > > -       .align  64
> > > > > -       /* _sP2_hi */
> > > > > -       .long   0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
> > > > > -       .long   0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
> > > > > -       .long   0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
> > > > > -       .long   0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
> > > > > -
> > > > > -       .align  64
> > > > > -       /* _sP0_lo */
> > > > > -       .long   0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
> > > > > -       .long   0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
> > > > > -       .long   0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
> > > > > -       .long   0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
> > > > > -
> > > > > -       .align  64
> > > > > -       /* _sP0_hi */
> > > > > -       .long   0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
> > > > > -       .long   0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
> > > > > -       .long   0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
> > > > > -       .long   0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
> > > > > +LOCAL_DATA_NAME_UNALIGNED:
> > > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMantMask_UISA, 0x7fe00000)
> > > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iMinIdxOfsMask_UISA, 0x3d400000)
> > > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iMaxIdxMask_UISA, 0x03e00000)
> > > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMask, 0x7f000000)
> > > > > +       .type   LOCAL_DATA_NAME_UNALIGNED, @object
> > > > > +       .size   LOCAL_DATA_NAME_UNALIGNED, .-LOCAL_DATA_NAME_UNALIGNED
> > > > >
> > > > >         .align  64
> > > > > -       .type   __svml_stanh_data_internal_al64, @object
> > > > > -       .size   __svml_stanh_data_internal_al64, .-__svml_stanh_data_internal_al64
> > > > > -       .type   __svml_stanh_data_internal, @object
> > > > > -       .size   __svml_stanh_data_internal, .-__svml_stanh_data_internal
> > > > > +LOCAL_DATA_NAME:
> > > > > +       float_block (LOCAL_DATA_NAME, _sC_lo,
> > > > > +               0x00000000, 0x3d700000, 0x3d900000, 0x3db00000,
> > > > > +               0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000,
> > > > > +               0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000,
> > > > > +               0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000)
> > > > > +
> > > > > +       float_block (LOCAL_DATA_NAME, _sC_hi,
> > > > > +               0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000,
> > > > > +               0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000,
> > > > > +               0x40500000, 0x40700000, 0x40900000, 0x40b00000,
> > > > > +               0x40d00000, 0x40f00000, 0x41100000, 0x00000000)
> > > > > +
> > > > > +       float_block (LOCAL_DATA_NAME, _sP7_lo,
> > > > > +               0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e,
> > > > > +               0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57,
> > > > > +               0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f,
> > > > > +               0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0)
> > > > > +
> > > > > +       float_block (LOCAL_DATA_NAME, _sP7_hi,
> > > > > +               0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b,
> > > > > +               0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22,
> > > > > +               0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950,
> > > > > +               0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000)
> > > > > +
> > > > > +       float_block (LOCAL_DATA_NAME, _sP6_lo,
> > > > > +               0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756,
> > > > > +               0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0,
> > > > > +               0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17,
> > > > > +               0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad)
> > > > > +
> > > > > +       float_block (LOCAL_DATA_NAME, _sP6_hi,
> > > > > +               0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63,
> > > > > +               0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66,
> > > > > +               0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3,
> > > > > +               0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000)
> > > > > +
> > > > > +       float_block (LOCAL_DATA_NAME, _sP5_lo,
> > > > > +               0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d,
> > > > > +               0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670,
> > > > > +               0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405,
> > > > > +               0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4)
> > > > > +
> > > > > +       float_block (LOCAL_DATA_NAME, _sP5_hi,
> > > > > +               0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9,
> > > > > +               0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd,
> > > > > +               0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232,
> > > > > +               0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000)
> > > > > +
> > > > > +       float_block (LOCAL_DATA_NAME, _sP4_lo,
> > > > > +               0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120,
> > > > > +               0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a,
> > > > > +               0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88,
> > > > > +               0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e)
> > > > > +
> > > > > +       float_block (LOCAL_DATA_NAME, _sP4_hi,
> > > > > +               0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96,
> > > > > +               0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67,
> > > > > +               0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9,
> > > > > +               0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000)
> > > > > +
> > > > > +       float_block (LOCAL_DATA_NAME, _sP3_lo,
> > > > > +               0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d,
> > > > > +               0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3,
> > > > > +               0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca,
> > > > > +               0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92)
> > > > > +
> > > > > +       float_block (LOCAL_DATA_NAME, _sP3_hi,
> > > > > +               0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704,
> > > > > +               0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06,
> > > > > +               0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2,
> > > > > +               0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000)
> > > > > +
> > > > > +       float_block (LOCAL_DATA_NAME, _sP2_lo,
> > > > > +               0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f,
> > > > > +               0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580,
> > > > > +               0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92,
> > > > > +               0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360)
> > > > > +
> > > > > +       float_block (LOCAL_DATA_NAME, _sP2_hi,
> > > > > +               0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2,
> > > > > +               0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4,
> > > > > +               0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b,
> > > > > +               0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000)
> > > > > +
> > > > > +       float_block (LOCAL_DATA_NAME, _sP0_lo,
> > > > > +               0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169,
> > > > > +               0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984,
> > > > > +               0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163,
> > > > > +               0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0)
> > > > > +
> > > > > +       float_block (LOCAL_DATA_NAME, _sP0_hi,
> > > > > +               0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53,
> > > > > +               0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85,
> > > > > +               0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0,
> > > > > +               0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000)
> > > > > +
> > > > > +       .type   LOCAL_DATA_NAME, @object
> > > > > +       .size   LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
> > > > > --
> > > > > 2.34.1
> > > > >
> > > >
> > > > The data movement makes the assembly code much harder to follow.
> > > > Sunil, what do you think of this patch series?
> > >
> > > What do you mean? The change in how we define rodata, or the movement
> > > to multiple files, or something else?
> >
> > The glibc way to support data files for assembly code is to define
> > data in C and use *.sym to generate offsets for assembly files, like
>
> I see. Although to be fair the entire SVML codebase bucks that trend.

That is because the libmvec code was generated by ICC and processed
by scripts.

> Seems like a more dramatic change to move all the offsets to C.

Since you are adding data by hand, you should do it in C.

> >
> > sysdeps/x86/cpu-features-offsets.sym:XSAVE_STATE_SIZE_OFFSET
> > offsetof (struct cpu_features, xsave_state_size)
> > sysdeps/x86_64/dl-trampoline.h:  sub
> > _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip),
> > %RSP_LP
> > sysdeps/x86_64/dl-trampoline.h:  sub
> > _dl_x86_cpu_features+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
> >
> > --
> > H.J.
Sunil Pandey Dec. 16, 2022, 10:54 p.m. UTC | #6
On Fri, Dec 16, 2022 at 2:01 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Dec 16, 2022 at 1:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Fri, Dec 16, 2022 at 1:38 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Fri, Dec 16, 2022 at 10:18 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > On Fri, Dec 16, 2022 at 9:06 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > > On Wed, Dec 7, 2022 at 12:52 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > > >
> > > > > > No changes to the logic, just change how rodata is handled.
> > > > > >
> > > > > > 1. Define the rodatas using the new macros so they check that the
> > > > > >    offset is correct.
> > > > > >
> > > > > > 2. Use common data where applicable.
> > > > > > ---
> > > > > >  .../multiarch/svml_s_tanhf16_core_avx512.S    | 450 ++++++++----------
> > > > > >  1 file changed, 197 insertions(+), 253 deletions(-)
> > > > > >
> > > > > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > > > > index d74fc7731d..765e9ed7f7 100644
> > > > > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > > > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > > > > @@ -70,94 +70,99 @@
> > > > > >   *
> > > > > >   */
> > > > > >
> > > > > > -/* Offsets for data table __svml_stanh_data_internal_avx512. Ordered
> > > > > > -   by use in the function. On cold-starts this might help the
> > > > > > -   prefetcher. Possibly a better idea is to interleave start/end so
> > > > > > -   that the prefetcher is less likely to detect a stream and pull
> > > > > > -   irrelivant lines into cache.  */
> > > > > >
> > > > > > -/* Offsets for data table __svml_stanh_data_internal. 4 bytes each.
> > > > > > - */
> > > > > > +
> > > > > > +#define LOCAL_DATA_NAME        __svml_stanh_data_internal
> > > > > > +#define LOCAL_DATA_NAME_UNALIGNED      __svml_stanh_data_internal_unaligned
> > > > > > +#include "svml_s_common_evex512_rodata_offsets.h"
> > > > > > +
> > > > > > +/* Offsets for data table __svml_stanh_data_internal_unaligned.
> > > > > > +   4 bytes each.  */
> > > > > >  #define _iExpMantMask_UISA             0
> > > > > >  #define _iMinIdxOfsMask_UISA           4
> > > > > >  #define _iMaxIdxMask_UISA              8
> > > > > >  #define _iExpMask                      12
> > > > > >
> > > > > > -/* Offsets for data table __svml_stanh_data_internal_al64. 64 bytes
> > > > > > -   each.  */
> > > > > > -#define _sC_lo                         0
> > > > > > -#define _sC_hi                         64
> > > > > > -#define _sP7_lo                                128
> > > > > > -#define _sP7_hi                                192
> > > > > > -#define _sSignMask                     256
> > > > > > -#define _sP6_lo                                320
> > > > > > -#define _sP6_hi                                384
> > > > > > -#define _sP5_lo                                448
> > > > > > -#define _sP5_hi                                512
> > > > > > -#define _sP4_lo                                576
> > > > > > -#define _sP4_hi                                640
> > > > > > -#define _sP3_lo                                704
> > > > > > -#define _sP3_hi                                768
> > > > > > -#define _sP2_lo                                832
> > > > > > -#define _sP2_hi                                896
> > > > > > -#define _sP0_lo                                960
> > > > > > -#define _sP0_hi                                1024
> > > > > > +/* Offsets for data table __svml_stanh_data_internal. Ordered
> > > > > > +   by use in the function. On cold-starts this might help the
> > > > > > +   prefetcher. Possibly a better idea is to interleave start/end so
> > > > > > +   that the prefetcher is less likely to detect a stream and pull
> > > > > > > +   irrelevant lines into cache.  */
> > > > > > +
> > > > > > +/* Offsets for data table __svml_stanh_data_internal.
> > > > > > +   64 bytes each.  */
> > > > > > +#define _sC_lo 0
> > > > > > +#define _sC_hi 64
> > > > > > +#define _sP7_lo        128
> > > > > > +#define _sP7_hi        192
> > > > > > +#define _sP6_lo        256
> > > > > > +#define _sP6_hi        320
> > > > > > +#define _sP5_lo        384
> > > > > > +#define _sP5_hi        448
> > > > > > +#define _sP4_lo        512
> > > > > > +#define _sP4_hi        576
> > > > > > +#define _sP3_lo        640
> > > > > > +#define _sP3_hi        704
> > > > > > +#define _sP2_lo        768
> > > > > > +#define _sP2_hi        832
> > > > > > +#define _sP0_lo        896
> > > > > > +#define _sP0_hi        960
> > > > > > +
> > > > > >
> > > > > >  #include <sysdep.h>
> > > > > > -#define TANHF_DATA(x)                  ((x)+__svml_stanh_data_internal_al64)
> > > > > > -#define TANHF_DATA_UNALIGNED(x)                ((x)+__svml_stanh_data_internal)
> > > > > >
> > > > > >         .section .text.evex512, "ax", @progbits
> > > > > >  ENTRY(_ZGVeN16v_tanhf_skx)
> > > > > > -       /* Here huge arguments, INF and NaNs are filtered out to callout. */
> > > > > > -       vpandd  TANHF_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
> > > > > > -       vpsubd  TANHF_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
> > > > > > +       /* Here huge arguments, INF and NaNs are filtered out to
> > > > > > +          callout.  */
> > > > > > +       vpandd  LOCAL_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
> > > > > > +       vpsubd  LOCAL_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
> > > > > >
> > > > > >         /* Selection arguments between [0, 0x03e00000] into zmm3.  */
> > > > > >         vpxord  %zmm3, %zmm3, %zmm3
> > > > > >         vpmaxsd %zmm3, %zmm2, %zmm3
> > > > > > -       vpminsd TANHF_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
> > > > > > +       vpminsd LOCAL_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
> > > > > >
> > > > > >         /* Setup permute indices in zmm3.  */
> > > > > >         vpsrld  $21, %zmm3, %zmm3
> > > > > >
> > > > > >         /* Store if there are any special cases in k1.  */
> > > > > > -       vpcmpd  $6, TANHF_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
> > > > > > +       vpcmpd  $6, LOCAL_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sC_lo)(%rip), %zmm5
> > > > > > -       vpermt2ps TANHF_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> > > > > > +       vmovaps LOCAL_DATA(_sC_lo)(%rip), %zmm5
> > > > > > +       vpermt2ps LOCAL_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sP7_lo)(%rip), %zmm2
> > > > > > -       vpermt2ps TANHF_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
> > > > > > +       vmovaps LOCAL_DATA(_sP7_lo)(%rip), %zmm2
> > > > > > +       vpermt2ps LOCAL_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
> > > > > >
> > > > > >         /* Store absolute values of inputs in zmm1.  */
> > > > > > -       vmovaps TANHF_DATA(_sSignMask)(%rip), %zmm4
> > > > > > +       vmovaps COMMON_DATA(_SignMask)(%rip), %zmm4
> > > > > >         vandnps %zmm0, %zmm4, %zmm1
> > > > > >         vsubps  {rn-sae}, %zmm5, %zmm1, %zmm1
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sP6_lo)(%rip), %zmm5
> > > > > > -       vpermt2ps TANHF_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> > > > > > +       vmovaps LOCAL_DATA(_sP6_lo)(%rip), %zmm5
> > > > > > +       vpermt2ps LOCAL_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sP5_lo)(%rip), %zmm6
> > > > > > -       vpermt2ps TANHF_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> > > > > > +       vmovaps LOCAL_DATA(_sP5_lo)(%rip), %zmm6
> > > > > > +       vpermt2ps LOCAL_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> > > > > >
> > > > > >         vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm2
> > > > > >         vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm2
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sP4_lo)(%rip), %zmm7
> > > > > > -       vpermt2ps TANHF_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> > > > > > +       vmovaps LOCAL_DATA(_sP4_lo)(%rip), %zmm7
> > > > > > +       vpermt2ps LOCAL_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sP3_lo)(%rip), %zmm8
> > > > > > -       vpermt2ps TANHF_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> > > > > > +       vmovaps LOCAL_DATA(_sP3_lo)(%rip), %zmm8
> > > > > > +       vpermt2ps LOCAL_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> > > > > >
> > > > > >         vfmadd213ps {rn-sae}, %zmm7, %zmm1, %zmm2
> > > > > >         vfmadd213ps {rn-sae}, %zmm8, %zmm1, %zmm2
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sP2_lo)(%rip), %zmm9
> > > > > > -       vpermt2ps TANHF_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> > > > > > +       vmovaps LOCAL_DATA(_sP2_lo)(%rip), %zmm9
> > > > > > +       vpermt2ps LOCAL_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sP0_lo)(%rip), %zmm10
> > > > > > -       vpermt2ps TANHF_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> > > > > > +       vmovaps LOCAL_DATA(_sP0_lo)(%rip), %zmm10
> > > > > > +       vpermt2ps LOCAL_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> > > > > >
> > > > > >         vfmadd213ps {rn-sae}, %zmm9, %zmm1, %zmm2
> > > > > >         vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm2
> > > > > > @@ -167,7 +172,7 @@ ENTRY(_ZGVeN16v_tanhf_skx)
> > > > > >
> > > > > >         /* Go to special inputs processing branch.  */
> > > > > >         jne     L(SPECIAL_VALUES_BRANCH)
> > > > > > -       # LOE rbx r12 r13 r14 r15 zmm0 zmm2 zmm4
> > > > > > +
> > > > > >         /* Wait until after branch of write over zmm0.  */
> > > > > >         vpternlogd $0xec, %zmm4, %zmm2, %zmm0
> > > > > >
> > > > > > @@ -176,24 +181,24 @@ ENTRY(_ZGVeN16v_tanhf_skx)
> > > > > >
> > > > > >         /* Cold case. edx has 1s where there was a special value that
> > > > > >            needs to be handled by a tanhf call. Optimize for code size
> > > > > > -          more so than speed here. */
> > > > > > +          more so than speed here.  */
> > > > > >  L(SPECIAL_VALUES_BRANCH):
> > > > > > -       # LOE rbx rdx r12 r13 r14 r15 zmm0 zmm2 zmm4
> > > > > > -    /* Use r13 to save/restore the stack. This allows us to use rbp as
> > > > > > -       callee save register saving code size. */
> > > > > > +
> > > > > > +       /* Use r13 to save/restore the stack. This allows us to use rbp
> > > > > > +          as callee save register saving code size.  */
> > > > > >         pushq   %r13
> > > > > > -       cfi_adjust_cfa_offset(8)
> > > > > > -       cfi_offset(r13, -16)
> > > > > > -       /* Need to callee save registers to preserve state across tanhf calls.
> > > > > > -        */
> > > > > > +       cfi_adjust_cfa_offset (8)
> > > > > > +       cfi_offset (r13, -16)
> > > > > > +       /* Need to callee save registers to preserve state across tanhf
> > > > > > +          calls.  */
> > > > > >         pushq   %rbx
> > > > > > -       cfi_adjust_cfa_offset(8)
> > > > > > -       cfi_offset(rbx, -24)
> > > > > > +       cfi_adjust_cfa_offset (8)
> > > > > > +       cfi_offset (rbx, -24)
> > > > > >         pushq   %rbp
> > > > > > -       cfi_adjust_cfa_offset(8)
> > > > > > -       cfi_offset(rbp, -32)
> > > > > > +       cfi_adjust_cfa_offset (8)
> > > > > > +       cfi_offset (rbp, -32)
> > > > > >         movq    %rsp, %r13
> > > > > > -       cfi_def_cfa_register(r13)
> > > > > > +       cfi_def_cfa_register (r13)
> > > > > >
> > > > > >         /* Align stack and make room for 2x zmm vectors.  */
> > > > > >         andq    $-64, %rsp
> > > > > > @@ -207,16 +212,17 @@ L(SPECIAL_VALUES_BRANCH):
> > > > > >
> > > > > >         vzeroupper
> > > > > >
> > > > > > -       /* edx has 1s where there was a special value that needs to be handled
> > > > > > -          by a tanhf call.  */
> > > > > > +       /* edx has 1s where there was a special value that needs to be
> > > > > > +          handled by a tanhf call.  */
> > > > > >         movl    %edx, %ebx
> > > > > >  L(SPECIAL_VALUES_LOOP):
> > > > > > -       # LOE rbx rbp r12 r13 r14 r15
> > > > > > -       /* use rbp as index for special value that is saved across calls to
> > > > > > -          tanhf. We technically don't need a callee save register here as offset
> > > > > > -          to rsp is always [0, 56] so we can restore rsp by realigning to 64.
> > > > > > -          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> > > > > > -          in the loop. Realigning also costs more code size.  */
> > > > > > +
> > > > > > +       /* use rbp as index for special value that is saved across calls
> > > > > > +          to tanhf. We technically don't need a callee save register
> > > > > > +          here as offset to rsp is always [0, 56] so we can restore
> > > > > > +          rsp by realigning to 64. Essentially the tradeoff is 1 extra
> > > > > > +          save/restore vs 2 extra instructions in the loop. Realigning
> > > > > > +          also costs more code size.  */
> > > > > >         xorl    %ebp, %ebp
> > > > > >         tzcntl  %ebx, %ebp
> > > > > >
> > > > > > @@ -224,203 +230,141 @@ L(SPECIAL_VALUES_LOOP):
> > > > > >         vmovss  64(%rsp, %rbp, 4), %xmm0
> > > > > >         call    tanhf@PLT
> > > > > >
> > > > > > -       /* No good way to avoid the store-forwarding fault this will cause on
> > > > > > -          return. `lfence` avoids the SF fault but at greater cost as it
> > > > > > -          serialized stack/callee save restoration.  */
> > > > > > +       /* No good way to avoid the store-forwarding fault this will
> > > > > > +          cause on return. `lfence` avoids the SF fault but at greater
> > > > > > > +          cost as it serializes stack/callee save restoration.  */
> > > > > >         vmovss  %xmm0, (%rsp, %rbp, 4)
> > > > > >
> > > > > > -       blsrl   %ebx, %ebx
> > > > > > +       blsrl   %ebx, %ebx
> > > > > >         jnz     L(SPECIAL_VALUES_LOOP)
> > > > > > -       # LOE r12 r13 r14 r15
> > > > > > +
> > > > > >
> > > > > >         /* All results have been written to (%rsp).  */
> > > > > >         vmovaps (%rsp), %zmm0
> > > > > >         /* Restore rsp.  */
> > > > > >         movq    %r13, %rsp
> > > > > > -       cfi_def_cfa_register(rsp)
> > > > > > +       cfi_def_cfa_register (rsp)
> > > > > >         /* Restore callee save registers.  */
> > > > > >         popq    %rbp
> > > > > > -       cfi_adjust_cfa_offset(-8)
> > > > > > -       cfi_restore(rbp)
> > > > > > +       cfi_adjust_cfa_offset (-8)
> > > > > > +       cfi_restore (rbp)
> > > > > >         popq    %rbx
> > > > > > -       cfi_adjust_cfa_offset(-8)
> > > > > > -       cfi_restore(rbp)
> > > > > > +       cfi_adjust_cfa_offset (-8)
> > > > > > +       cfi_restore (rbp)
> > > > > >         popq    %r13
> > > > > > -       cfi_adjust_cfa_offset(-8)
> > > > > > -       cfi_restore(r13)
> > > > > > +       cfi_adjust_cfa_offset (-8)
> > > > > > +       cfi_restore (r13)
> > > > > >         ret
> > > > > >  END(_ZGVeN16v_tanhf_skx)
> > > > > >
> > > > > > -       .section .rodata, "a"
> > > > > > +       .section .rodata.evex512, "a"
> > > > > >         .align  16
> > > > > > -#ifdef __svml_stanh_data_internal_typedef
> > > > > > -typedef unsigned int VUINT32;
> > > > > > -typedef struct
> > > > > > -       {
> > > > > > -       __declspec(align(4)) VUINT32 _iExpMantMask_UISA[1][1];
> > > > > > -       __declspec(align(4)) VUINT32 _iMinIdxOfsMask_UISA[1][1];
> > > > > > -       __declspec(align(4)) VUINT32 _iMaxIdxMask_UISA[1][1];
> > > > > > -       __declspec(align(4)) VUINT32 _iExpMask[1][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sC_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sC_hi[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP7_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP7_hi[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sSignMask[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP6_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP6_hi[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP5_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP5_hi[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP4_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP4_hi[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP3_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP3_hi[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP2_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP2_hi[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP0_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP0_hi[16][1];
> > > > > > -} __svml_stanh_data_internal;
> > > > > > -#endif
> > > > > > -
> > > > > > -__svml_stanh_data_internal:
> > > > > > -       .align  4
> > > > > > -       /* _iExpMantMask_UISA */
> > > > > > -       .long   0x7fe00000
> > > > > > -
> > > > > > -       .align  4
> > > > > > -       /* _iMinIdxOfsMask_UISA */
> > > > > > -       .long   0x3d400000
> > > > > > -
> > > > > > -       .align  4
> > > > > > -       /* _iMaxIdxMask_UISA */
> > > > > > -       .long   0x03e00000
> > > > > > -
> > > > > > -       .align  4
> > > > > > -       /* _iExpMask */
> > > > > > -       .long   0x7f000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -__svml_stanh_data_internal_al64:
> > > > > > -       .align  64
> > > > > > -       /* _sC_lo */
> > > > > > -       .long   0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
> > > > > > -       .long   0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
> > > > > > -       .long   0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
> > > > > > -       .long   0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sC_hi */
> > > > > > -       .long   0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
> > > > > > -       .long   0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
> > > > > > -       .long   0x40500000, 0x40700000, 0x40900000, 0x40b00000
> > > > > > -       .long   0x40d00000, 0x40f00000, 0x41100000, 0x00000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP7_lo */
> > > > > > -       .long   0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
> > > > > > -       .long   0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
> > > > > > -       .long   0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
> > > > > > -       .long   0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP7_hi */
> > > > > > -       .long   0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
> > > > > > -       .long   0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
> > > > > > -       .long   0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
> > > > > > -       .long   0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
> > > > > >
> > > > > > -       .align  64
> > > > > > -       /* _sSignMask */
> > > > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP6_lo */
> > > > > > -       .long   0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
> > > > > > -       .long   0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
> > > > > > -       .long   0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
> > > > > > -       .long   0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP6_hi */
> > > > > > -       .long   0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
> > > > > > -       .long   0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
> > > > > > -       .long   0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
> > > > > > -       .long   0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP5_lo */
> > > > > > -       .long   0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
> > > > > > -       .long   0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
> > > > > > -       .long   0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
> > > > > > -       .long   0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP5_hi */
> > > > > > -       .long   0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
> > > > > > -       .long   0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
> > > > > > -       .long   0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
> > > > > > -       .long   0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP4_lo */
> > > > > > -       .long   0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
> > > > > > -       .long   0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
> > > > > > -       .long   0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
> > > > > > -       .long   0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP4_hi */
> > > > > > -       .long   0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
> > > > > > -       .long   0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
> > > > > > -       .long   0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
> > > > > > -       .long   0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP3_lo */
> > > > > > -       .long   0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
> > > > > > -       .long   0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
> > > > > > -       .long   0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
> > > > > > -       .long   0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP3_hi */
> > > > > > -       .long   0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
> > > > > > -       .long   0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
> > > > > > -       .long   0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
> > > > > > -       .long   0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP2_lo */
> > > > > > -       .long   0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
> > > > > > -       .long   0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
> > > > > > -       .long   0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
> > > > > > -       .long   0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP2_hi */
> > > > > > -       .long   0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
> > > > > > -       .long   0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
> > > > > > -       .long   0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
> > > > > > -       .long   0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP0_lo */
> > > > > > -       .long   0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
> > > > > > -       .long   0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
> > > > > > -       .long   0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
> > > > > > -       .long   0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP0_hi */
> > > > > > -       .long   0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
> > > > > > -       .long   0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
> > > > > > -       .long   0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
> > > > > > -       .long   0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
> > > > > > +LOCAL_DATA_NAME_UNALIGNED:
> > > > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMantMask_UISA, 0x7fe00000)
> > > > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iMinIdxOfsMask_UISA, 0x3d400000)
> > > > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iMaxIdxMask_UISA, 0x03e00000)
> > > > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMask, 0x7f000000)
> > > > > > +       .type   LOCAL_DATA_NAME_UNALIGNED, @object
> > > > > > +       .size   LOCAL_DATA_NAME_UNALIGNED, .-LOCAL_DATA_NAME_UNALIGNED
> > > > > >
> > > > > >         .align  64
> > > > > > -       .type   __svml_stanh_data_internal_al64, @object
> > > > > > -       .size   __svml_stanh_data_internal_al64, .-__svml_stanh_data_internal_al64
> > > > > > -       .type   __svml_stanh_data_internal, @object
> > > > > > -       .size   __svml_stanh_data_internal, .-__svml_stanh_data_internal
> > > > > > +LOCAL_DATA_NAME:
> > > > > > +       float_block (LOCAL_DATA_NAME, _sC_lo,
> > > > > > +               0x00000000, 0x3d700000, 0x3d900000, 0x3db00000,
> > > > > > +               0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000,
> > > > > > +               0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000,
> > > > > > +               0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sC_hi,
> > > > > > +               0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000,
> > > > > > +               0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000,
> > > > > > +               0x40500000, 0x40700000, 0x40900000, 0x40b00000,
> > > > > > +               0x40d00000, 0x40f00000, 0x41100000, 0x00000000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP7_lo,
> > > > > > +               0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e,
> > > > > > +               0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57,
> > > > > > +               0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f,
> > > > > > +               0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP7_hi,
> > > > > > +               0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b,
> > > > > > +               0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22,
> > > > > > +               0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950,
> > > > > > +               0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP6_lo,
> > > > > > +               0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756,
> > > > > > +               0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0,
> > > > > > +               0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17,
> > > > > > +               0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP6_hi,
> > > > > > +               0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63,
> > > > > > +               0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66,
> > > > > > +               0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3,
> > > > > > +               0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP5_lo,
> > > > > > +               0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d,
> > > > > > +               0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670,
> > > > > > +               0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405,
> > > > > > +               0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP5_hi,
> > > > > > +               0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9,
> > > > > > +               0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd,
> > > > > > +               0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232,
> > > > > > +               0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP4_lo,
> > > > > > +               0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120,
> > > > > > +               0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a,
> > > > > > +               0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88,
> > > > > > +               0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP4_hi,
> > > > > > +               0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96,
> > > > > > +               0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67,
> > > > > > +               0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9,
> > > > > > +               0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP3_lo,
> > > > > > +               0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d,
> > > > > > +               0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3,
> > > > > > +               0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca,
> > > > > > +               0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP3_hi,
> > > > > > +               0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704,
> > > > > > +               0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06,
> > > > > > +               0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2,
> > > > > > +               0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP2_lo,
> > > > > > +               0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f,
> > > > > > +               0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580,
> > > > > > +               0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92,
> > > > > > +               0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP2_hi,
> > > > > > +               0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2,
> > > > > > +               0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4,
> > > > > > +               0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b,
> > > > > > +               0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP0_lo,
> > > > > > +               0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169,
> > > > > > +               0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984,
> > > > > > +               0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163,
> > > > > > +               0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP0_hi,
> > > > > > +               0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53,
> > > > > > +               0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85,
> > > > > > +               0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0,
> > > > > > +               0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000)
> > > > > > +
> > > > > > +       .type   LOCAL_DATA_NAME, @object
> > > > > > +       .size   LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
> > > > > > --
> > > > > > 2.34.1
> > > > > >
> > > > >
> > > > > The data movement makes the assembly code much harder to follow.
> > > > > Sunil, what do you think of this patch series?
> > > >
> > > > What do you mean? The change in how we define rodata, the movement
> > > > to multiple files, or something else?
> > >
> > > The glibc way to support data files for assembly code is to define
> > > data in C and use *.sym to generate offsets for assembly files, like
> >
> > I see. Although to be fair the entire SVML codebase bucks that trend.
>
> It is because libmvec codes were generated by ICC and processed
> by scripts.
>
> > Seems like a more dramatic trend to move all the offsets to C.
>
> Since you are adding data by hand, you should do it in C.
>
> > >
> > > sysdeps/x86/cpu-features-offsets.sym:XSAVE_STATE_SIZE_OFFSET
> > > offsetof (struct cpu_features, xsave_state_size)
> > > sysdeps/x86_64/dl-trampoline.h:  sub
> > > _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip),
> > > %RSP_LP
> > > sysdeps/x86_64/dl-trampoline.h:  sub
> > > _dl_x86_cpu_features+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
> > >
> > > --
> > > H.J.
>
>

Does this restructuring provide any performance benefit as measured by
the libmvec microbenchmarks?




>
> --
> H.J.
Noah Goldstein June 27, 2023, 6:23 p.m. UTC | #7
On Fri, Dec 16, 2022 at 4:01 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Dec 16, 2022 at 1:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Fri, Dec 16, 2022 at 1:38 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Fri, Dec 16, 2022 at 10:18 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > On Fri, Dec 16, 2022 at 9:06 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > > On Wed, Dec 7, 2022 at 12:52 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > > >
> > > > > > No changes to the logic, just change how rodata is handled.
> > > > > >
> > > > > > 1. Define the rodatas using the new macros so they check that the
> > > > > >    offset is correct.
> > > > > >
> > > > > > 2. Use common data where applicable.
> > > > > > ---
> > > > > >  .../multiarch/svml_s_tanhf16_core_avx512.S    | 450 ++++++++----------
> > > > > >  1 file changed, 197 insertions(+), 253 deletions(-)
> > > > > >
> > > > > > diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > > > > index d74fc7731d..765e9ed7f7 100644
> > > > > > --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > > > > +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> > > > > > @@ -70,94 +70,99 @@
> > > > > >   *
> > > > > >   */
> > > > > >
> > > > > > -/* Offsets for data table __svml_stanh_data_internal_avx512. Ordered
> > > > > > -   by use in the function. On cold-starts this might help the
> > > > > > -   prefetcher. Possibly a better idea is to interleave start/end so
> > > > > > -   that the prefetcher is less likely to detect a stream and pull
> > > > > > -   irrelivant lines into cache.  */
> > > > > >
> > > > > > -/* Offsets for data table __svml_stanh_data_internal. 4 bytes each.
> > > > > > - */
> > > > > > +
> > > > > > +#define LOCAL_DATA_NAME        __svml_stanh_data_internal
> > > > > > +#define LOCAL_DATA_NAME_UNALIGNED      __svml_stanh_data_internal_unaligned
> > > > > > +#include "svml_s_common_evex512_rodata_offsets.h"
> > > > > > +
> > > > > > +/* Offsets for data table __svml_stanh_data_internal_unaligned.
> > > > > > +   4 bytes each.  */
> > > > > >  #define _iExpMantMask_UISA             0
> > > > > >  #define _iMinIdxOfsMask_UISA           4
> > > > > >  #define _iMaxIdxMask_UISA              8
> > > > > >  #define _iExpMask                      12
> > > > > >
> > > > > > -/* Offsets for data table __svml_stanh_data_internal_al64. 64 bytes
> > > > > > -   each.  */
> > > > > > -#define _sC_lo                         0
> > > > > > -#define _sC_hi                         64
> > > > > > -#define _sP7_lo                                128
> > > > > > -#define _sP7_hi                                192
> > > > > > -#define _sSignMask                     256
> > > > > > -#define _sP6_lo                                320
> > > > > > -#define _sP6_hi                                384
> > > > > > -#define _sP5_lo                                448
> > > > > > -#define _sP5_hi                                512
> > > > > > -#define _sP4_lo                                576
> > > > > > -#define _sP4_hi                                640
> > > > > > -#define _sP3_lo                                704
> > > > > > -#define _sP3_hi                                768
> > > > > > -#define _sP2_lo                                832
> > > > > > -#define _sP2_hi                                896
> > > > > > -#define _sP0_lo                                960
> > > > > > -#define _sP0_hi                                1024
> > > > > > +/* Offsets for data table __svml_stanh_data_internal. Ordered
> > > > > > +   by use in the function. On cold-starts this might help the
> > > > > > +   prefetcher. Possibly a better idea is to interleave start/end so
> > > > > > +   that the prefetcher is less likely to detect a stream and pull
> > > > > > +   irrelivant lines into cache.  */
> > > > > > +
> > > > > > +/* Offsets for data table __svml_stanh_data_internal.
> > > > > > +   64 bytes each.  */
> > > > > > +#define _sC_lo 0
> > > > > > +#define _sC_hi 64
> > > > > > +#define _sP7_lo        128
> > > > > > +#define _sP7_hi        192
> > > > > > +#define _sP6_lo        256
> > > > > > +#define _sP6_hi        320
> > > > > > +#define _sP5_lo        384
> > > > > > +#define _sP5_hi        448
> > > > > > +#define _sP4_lo        512
> > > > > > +#define _sP4_hi        576
> > > > > > +#define _sP3_lo        640
> > > > > > +#define _sP3_hi        704
> > > > > > +#define _sP2_lo        768
> > > > > > +#define _sP2_hi        832
> > > > > > +#define _sP0_lo        896
> > > > > > +#define _sP0_hi        960
> > > > > > +
> > > > > >
> > > > > >  #include <sysdep.h>
> > > > > > -#define TANHF_DATA(x)                  ((x)+__svml_stanh_data_internal_al64)
> > > > > > -#define TANHF_DATA_UNALIGNED(x)                ((x)+__svml_stanh_data_internal)
> > > > > >
> > > > > >         .section .text.evex512, "ax", @progbits
> > > > > >  ENTRY(_ZGVeN16v_tanhf_skx)
> > > > > > -       /* Here huge arguments, INF and NaNs are filtered out to callout. */
> > > > > > -       vpandd  TANHF_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
> > > > > > -       vpsubd  TANHF_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
> > > > > > +       /* Here huge arguments, INF and NaNs are filtered out to
> > > > > > +          callout.  */
> > > > > > +       vpandd  LOCAL_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
> > > > > > +       vpsubd  LOCAL_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
> > > > > >
> > > > > >         /* Selection arguments between [0, 0x03e00000] into zmm3.  */
> > > > > >         vpxord  %zmm3, %zmm3, %zmm3
> > > > > >         vpmaxsd %zmm3, %zmm2, %zmm3
> > > > > > -       vpminsd TANHF_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
> > > > > > +       vpminsd LOCAL_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
> > > > > >
> > > > > >         /* Setup permute indices in zmm3.  */
> > > > > >         vpsrld  $21, %zmm3, %zmm3
> > > > > >
> > > > > >         /* Store if there are any special cases in k1.  */
> > > > > > -       vpcmpd  $6, TANHF_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
> > > > > > +       vpcmpd  $6, LOCAL_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sC_lo)(%rip), %zmm5
> > > > > > -       vpermt2ps TANHF_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> > > > > > +       vmovaps LOCAL_DATA(_sC_lo)(%rip), %zmm5
> > > > > > +       vpermt2ps LOCAL_DATA(_sC_hi)(%rip), %zmm3, %zmm5
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sP7_lo)(%rip), %zmm2
> > > > > > -       vpermt2ps TANHF_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
> > > > > > +       vmovaps LOCAL_DATA(_sP7_lo)(%rip), %zmm2
> > > > > > +       vpermt2ps LOCAL_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
> > > > > >
> > > > > >         /* Store absolute values of inputs in zmm1.  */
> > > > > > -       vmovaps TANHF_DATA(_sSignMask)(%rip), %zmm4
> > > > > > +       vmovaps COMMON_DATA(_SignMask)(%rip), %zmm4
> > > > > >         vandnps %zmm0, %zmm4, %zmm1
> > > > > >         vsubps  {rn-sae}, %zmm5, %zmm1, %zmm1
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sP6_lo)(%rip), %zmm5
> > > > > > -       vpermt2ps TANHF_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> > > > > > +       vmovaps LOCAL_DATA(_sP6_lo)(%rip), %zmm5
> > > > > > +       vpermt2ps LOCAL_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sP5_lo)(%rip), %zmm6
> > > > > > -       vpermt2ps TANHF_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> > > > > > +       vmovaps LOCAL_DATA(_sP5_lo)(%rip), %zmm6
> > > > > > +       vpermt2ps LOCAL_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
> > > > > >
> > > > > >         vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm2
> > > > > >         vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm2
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sP4_lo)(%rip), %zmm7
> > > > > > -       vpermt2ps TANHF_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> > > > > > +       vmovaps LOCAL_DATA(_sP4_lo)(%rip), %zmm7
> > > > > > +       vpermt2ps LOCAL_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sP3_lo)(%rip), %zmm8
> > > > > > -       vpermt2ps TANHF_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> > > > > > +       vmovaps LOCAL_DATA(_sP3_lo)(%rip), %zmm8
> > > > > > +       vpermt2ps LOCAL_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
> > > > > >
> > > > > >         vfmadd213ps {rn-sae}, %zmm7, %zmm1, %zmm2
> > > > > >         vfmadd213ps {rn-sae}, %zmm8, %zmm1, %zmm2
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sP2_lo)(%rip), %zmm9
> > > > > > -       vpermt2ps TANHF_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> > > > > > +       vmovaps LOCAL_DATA(_sP2_lo)(%rip), %zmm9
> > > > > > +       vpermt2ps LOCAL_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
> > > > > >
> > > > > > -       vmovaps TANHF_DATA(_sP0_lo)(%rip), %zmm10
> > > > > > -       vpermt2ps TANHF_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> > > > > > +       vmovaps LOCAL_DATA(_sP0_lo)(%rip), %zmm10
> > > > > > +       vpermt2ps LOCAL_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
> > > > > >
> > > > > >         vfmadd213ps {rn-sae}, %zmm9, %zmm1, %zmm2
> > > > > >         vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm2
> > > > > > @@ -167,7 +172,7 @@ ENTRY(_ZGVeN16v_tanhf_skx)
> > > > > >
> > > > > >         /* Go to special inputs processing branch.  */
> > > > > >         jne     L(SPECIAL_VALUES_BRANCH)
> > > > > > -       # LOE rbx r12 r13 r14 r15 zmm0 zmm2 zmm4
> > > > > > +
> > > > > >         /* Wait until after branch of write over zmm0.  */
> > > > > >         vpternlogd $0xec, %zmm4, %zmm2, %zmm0
> > > > > >
> > > > > > @@ -176,24 +181,24 @@ ENTRY(_ZGVeN16v_tanhf_skx)
> > > > > >
> > > > > >         /* Cold case. edx has 1s where there was a special value that
> > > > > >            needs to be handled by a tanhf call. Optimize for code size
> > > > > > -          more so than speed here. */
> > > > > > +          more so than speed here.  */
> > > > > >  L(SPECIAL_VALUES_BRANCH):
> > > > > > -       # LOE rbx rdx r12 r13 r14 r15 zmm0 zmm2 zmm4
> > > > > > -    /* Use r13 to save/restore the stack. This allows us to use rbp as
> > > > > > -       callee save register saving code size. */
> > > > > > +
> > > > > > +       /* Use r13 to save/restore the stack. This allows us to use rbp
> > > > > > +          as callee save register saving code size.  */
> > > > > >         pushq   %r13
> > > > > > -       cfi_adjust_cfa_offset(8)
> > > > > > -       cfi_offset(r13, -16)
> > > > > > -       /* Need to callee save registers to preserve state across tanhf calls.
> > > > > > -        */
> > > > > > +       cfi_adjust_cfa_offset (8)
> > > > > > +       cfi_offset (r13, -16)
> > > > > > +       /* Need to callee save registers to preserve state across tanhf
> > > > > > +          calls.  */
> > > > > >         pushq   %rbx
> > > > > > -       cfi_adjust_cfa_offset(8)
> > > > > > -       cfi_offset(rbx, -24)
> > > > > > +       cfi_adjust_cfa_offset (8)
> > > > > > +       cfi_offset (rbx, -24)
> > > > > >         pushq   %rbp
> > > > > > -       cfi_adjust_cfa_offset(8)
> > > > > > -       cfi_offset(rbp, -32)
> > > > > > +       cfi_adjust_cfa_offset (8)
> > > > > > +       cfi_offset (rbp, -32)
> > > > > >         movq    %rsp, %r13
> > > > > > -       cfi_def_cfa_register(r13)
> > > > > > +       cfi_def_cfa_register (r13)
> > > > > >
> > > > > >         /* Align stack and make room for 2x zmm vectors.  */
> > > > > >         andq    $-64, %rsp
> > > > > > @@ -207,16 +212,17 @@ L(SPECIAL_VALUES_BRANCH):
> > > > > >
> > > > > >         vzeroupper
> > > > > >
> > > > > > -       /* edx has 1s where there was a special value that needs to be handled
> > > > > > -          by a tanhf call.  */
> > > > > > +       /* edx has 1s where there was a special value that needs to be
> > > > > > +          handled by a tanhf call.  */
> > > > > >         movl    %edx, %ebx
> > > > > >  L(SPECIAL_VALUES_LOOP):
> > > > > > -       # LOE rbx rbp r12 r13 r14 r15
> > > > > > -       /* use rbp as index for special value that is saved across calls to
> > > > > > -          tanhf. We technically don't need a callee save register here as offset
> > > > > > -          to rsp is always [0, 56] so we can restore rsp by realigning to 64.
> > > > > > -          Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
> > > > > > -          in the loop. Realigning also costs more code size.  */
> > > > > > +
> > > > > > +       /* use rbp as index for special value that is saved across calls
> > > > > > +          to tanhf. We technically don't need a callee save register
> > > > > > +          here as offset to rsp is always [0, 56] so we can restore
> > > > > > +          rsp by realigning to 64. Essentially the tradeoff is 1 extra
> > > > > > +          save/restore vs 2 extra instructions in the loop. Realigning
> > > > > > +          also costs more code size.  */
> > > > > >         xorl    %ebp, %ebp
> > > > > >         tzcntl  %ebx, %ebp
> > > > > >
> > > > > > @@ -224,203 +230,141 @@ L(SPECIAL_VALUES_LOOP):
> > > > > >         vmovss  64(%rsp, %rbp, 4), %xmm0
> > > > > >         call    tanhf@PLT
> > > > > >
> > > > > > -       /* No good way to avoid the store-forwarding fault this will cause on
> > > > > > -          return. `lfence` avoids the SF fault but at greater cost as it
> > > > > > -          serialized stack/callee save restoration.  */
> > > > > > +       /* No good way to avoid the store-forwarding fault this will
> > > > > > +          cause on return. `lfence` avoids the SF fault but at greater
> > > > > > +          cost as it serialized stack/callee save restoration.  */
> > > > > >         vmovss  %xmm0, (%rsp, %rbp, 4)
> > > > > >
> > > > > > -       blsrl   %ebx, %ebx
> > > > > > +       blsrl   %ebx, %ebx
> > > > > >         jnz     L(SPECIAL_VALUES_LOOP)
> > > > > > -       # LOE r12 r13 r14 r15
> > > > > > +
> > > > > >
> > > > > >         /* All results have been written to (%rsp).  */
> > > > > >         vmovaps (%rsp), %zmm0
> > > > > >         /* Restore rsp.  */
> > > > > >         movq    %r13, %rsp
> > > > > > -       cfi_def_cfa_register(rsp)
> > > > > > +       cfi_def_cfa_register (rsp)
> > > > > >         /* Restore callee save registers.  */
> > > > > >         popq    %rbp
> > > > > > -       cfi_adjust_cfa_offset(-8)
> > > > > > -       cfi_restore(rbp)
> > > > > > +       cfi_adjust_cfa_offset (-8)
> > > > > > +       cfi_restore (rbp)
> > > > > >         popq    %rbx
> > > > > > -       cfi_adjust_cfa_offset(-8)
> > > > > > -       cfi_restore(rbp)
> > > > > > +       cfi_adjust_cfa_offset (-8)
> > > > > > +       cfi_restore (rbp)
> > > > > >         popq    %r13
> > > > > > -       cfi_adjust_cfa_offset(-8)
> > > > > > -       cfi_restore(r13)
> > > > > > +       cfi_adjust_cfa_offset (-8)
> > > > > > +       cfi_restore (r13)
> > > > > >         ret
> > > > > >  END(_ZGVeN16v_tanhf_skx)
> > > > > >
> > > > > > -       .section .rodata, "a"
> > > > > > +       .section .rodata.evex512, "a"
> > > > > >         .align  16
> > > > > > -#ifdef __svml_stanh_data_internal_typedef
> > > > > > -typedef unsigned int VUINT32;
> > > > > > -typedef struct
> > > > > > -       {
> > > > > > -       __declspec(align(4)) VUINT32 _iExpMantMask_UISA[1][1];
> > > > > > -       __declspec(align(4)) VUINT32 _iMinIdxOfsMask_UISA[1][1];
> > > > > > -       __declspec(align(4)) VUINT32 _iMaxIdxMask_UISA[1][1];
> > > > > > -       __declspec(align(4)) VUINT32 _iExpMask[1][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sC_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sC_hi[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP7_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP7_hi[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sSignMask[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP6_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP6_hi[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP5_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP5_hi[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP4_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP4_hi[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP3_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP3_hi[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP2_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP2_hi[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP0_lo[16][1];
> > > > > > -       __declspec(align(64)) VUINT32 _sP0_hi[16][1];
> > > > > > -} __svml_stanh_data_internal;
> > > > > > -#endif
> > > > > > -
> > > > > > -__svml_stanh_data_internal:
> > > > > > -       .align  4
> > > > > > -       /* _iExpMantMask_UISA */
> > > > > > -       .long   0x7fe00000
> > > > > > -
> > > > > > -       .align  4
> > > > > > -       /* _iMinIdxOfsMask_UISA */
> > > > > > -       .long   0x3d400000
> > > > > > -
> > > > > > -       .align  4
> > > > > > -       /* _iMaxIdxMask_UISA */
> > > > > > -       .long   0x03e00000
> > > > > > -
> > > > > > -       .align  4
> > > > > > -       /* _iExpMask */
> > > > > > -       .long   0x7f000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -__svml_stanh_data_internal_al64:
> > > > > > -       .align  64
> > > > > > -       /* _sC_lo */
> > > > > > -       .long   0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
> > > > > > -       .long   0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
> > > > > > -       .long   0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
> > > > > > -       .long   0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sC_hi */
> > > > > > -       .long   0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
> > > > > > -       .long   0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
> > > > > > -       .long   0x40500000, 0x40700000, 0x40900000, 0x40b00000
> > > > > > -       .long   0x40d00000, 0x40f00000, 0x41100000, 0x00000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP7_lo */
> > > > > > -       .long   0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
> > > > > > -       .long   0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
> > > > > > -       .long   0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
> > > > > > -       .long   0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP7_hi */
> > > > > > -       .long   0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
> > > > > > -       .long   0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
> > > > > > -       .long   0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
> > > > > > -       .long   0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
> > > > > >
> > > > > > -       .align  64
> > > > > > -       /* _sSignMask */
> > > > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > > > -       .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP6_lo */
> > > > > > -       .long   0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
> > > > > > -       .long   0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
> > > > > > -       .long   0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
> > > > > > -       .long   0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP6_hi */
> > > > > > -       .long   0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
> > > > > > -       .long   0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
> > > > > > -       .long   0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
> > > > > > -       .long   0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP5_lo */
> > > > > > -       .long   0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
> > > > > > -       .long   0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
> > > > > > -       .long   0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
> > > > > > -       .long   0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP5_hi */
> > > > > > -       .long   0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
> > > > > > -       .long   0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
> > > > > > -       .long   0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
> > > > > > -       .long   0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP4_lo */
> > > > > > -       .long   0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
> > > > > > -       .long   0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
> > > > > > -       .long   0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
> > > > > > -       .long   0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP4_hi */
> > > > > > -       .long   0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
> > > > > > -       .long   0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
> > > > > > -       .long   0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
> > > > > > -       .long   0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP3_lo */
> > > > > > -       .long   0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
> > > > > > -       .long   0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
> > > > > > -       .long   0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
> > > > > > -       .long   0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP3_hi */
> > > > > > -       .long   0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
> > > > > > -       .long   0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
> > > > > > -       .long   0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
> > > > > > -       .long   0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP2_lo */
> > > > > > -       .long   0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
> > > > > > -       .long   0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
> > > > > > -       .long   0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
> > > > > > -       .long   0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP2_hi */
> > > > > > -       .long   0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
> > > > > > -       .long   0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
> > > > > > -       .long   0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
> > > > > > -       .long   0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP0_lo */
> > > > > > -       .long   0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
> > > > > > -       .long   0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
> > > > > > -       .long   0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
> > > > > > -       .long   0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
> > > > > > -
> > > > > > -       .align  64
> > > > > > -       /* _sP0_hi */
> > > > > > -       .long   0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
> > > > > > -       .long   0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
> > > > > > -       .long   0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
> > > > > > -       .long   0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
> > > > > > +LOCAL_DATA_NAME_UNALIGNED:
> > > > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMantMask_UISA, 0x7fe00000)
> > > > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iMinIdxOfsMask_UISA, 0x3d400000)
> > > > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iMaxIdxMask_UISA, 0x03e00000)
> > > > > > +       float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMask, 0x7f000000)
> > > > > > +       .type   LOCAL_DATA_NAME_UNALIGNED, @object
> > > > > > +       .size   LOCAL_DATA_NAME_UNALIGNED, .-LOCAL_DATA_NAME_UNALIGNED
> > > > > >
> > > > > >         .align  64
> > > > > > -       .type   __svml_stanh_data_internal_al64, @object
> > > > > > -       .size   __svml_stanh_data_internal_al64, .-__svml_stanh_data_internal_al64
> > > > > > -       .type   __svml_stanh_data_internal, @object
> > > > > > -       .size   __svml_stanh_data_internal, .-__svml_stanh_data_internal
> > > > > > +LOCAL_DATA_NAME:
> > > > > > +       float_block (LOCAL_DATA_NAME, _sC_lo,
> > > > > > +               0x00000000, 0x3d700000, 0x3d900000, 0x3db00000,
> > > > > > +               0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000,
> > > > > > +               0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000,
> > > > > > +               0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sC_hi,
> > > > > > +               0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000,
> > > > > > +               0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000,
> > > > > > +               0x40500000, 0x40700000, 0x40900000, 0x40b00000,
> > > > > > +               0x40d00000, 0x40f00000, 0x41100000, 0x00000000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP7_lo,
> > > > > > +               0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e,
> > > > > > +               0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57,
> > > > > > +               0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f,
> > > > > > +               0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP7_hi,
> > > > > > +               0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b,
> > > > > > +               0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22,
> > > > > > +               0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950,
> > > > > > +               0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP6_lo,
> > > > > > +               0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756,
> > > > > > +               0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0,
> > > > > > +               0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17,
> > > > > > +               0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP6_hi,
> > > > > > +               0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63,
> > > > > > +               0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66,
> > > > > > +               0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3,
> > > > > > +               0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP5_lo,
> > > > > > +               0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d,
> > > > > > +               0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670,
> > > > > > +               0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405,
> > > > > > +               0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP5_hi,
> > > > > > +               0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9,
> > > > > > +               0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd,
> > > > > > +               0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232,
> > > > > > +               0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP4_lo,
> > > > > > +               0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120,
> > > > > > +               0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a,
> > > > > > +               0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88,
> > > > > > +               0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP4_hi,
> > > > > > +               0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96,
> > > > > > +               0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67,
> > > > > > +               0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9,
> > > > > > +               0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP3_lo,
> > > > > > +               0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d,
> > > > > > +               0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3,
> > > > > > +               0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca,
> > > > > > +               0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP3_hi,
> > > > > > +               0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704,
> > > > > > +               0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06,
> > > > > > +               0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2,
> > > > > > +               0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP2_lo,
> > > > > > +               0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f,
> > > > > > +               0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580,
> > > > > > +               0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92,
> > > > > > +               0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP2_hi,
> > > > > > +               0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2,
> > > > > > +               0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4,
> > > > > > +               0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b,
> > > > > > +               0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP0_lo,
> > > > > > +               0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169,
> > > > > > +               0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984,
> > > > > > +               0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163,
> > > > > > +               0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0)
> > > > > > +
> > > > > > +       float_block (LOCAL_DATA_NAME, _sP0_hi,
> > > > > > +               0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53,
> > > > > > +               0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85,
> > > > > > +               0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0,
> > > > > > +               0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000)
> > > > > > +
> > > > > > +       .type   LOCAL_DATA_NAME, @object
> > > > > > +       .size   LOCAL_DATA_NAME, .-LOCAL_DATA_NAME
> > > > > > --
> > > > > > 2.34.1
> > > > > >
> > > > >
> > > > > The data movement makes the assembly code much harder to follow.
> > > > > Sunil, what do you think of this patch series?
> > > >
> > > > What do you mean? The change in how we define rodata, the movement
> > > > to multiple files, or something else?
> > >
> > > The glibc way to support data files for assembly code is to define
> > > data in C and use *.sym to generate offsets for assembly files, like
> >
> > I see. Although to be fair the entire SVML codebase bucks that trend.
>
> > That is because the libmvec code was generated by ICC and processed
> > by scripts.
>
> > Seems like a more dramatic trend to move all the offsets to C.
>
> Since you are adding data by hand, you should do it in C.

Since the plan is to integrate this piecemeal (function by function), I think
it's easier to integrate into a system that matches the rest of the
as-yet-unchanged files.

Once all of the SVML functions have been updated it will be simple
enough to script
the change from ASM -> C.
Thoughts?
>
> > >
> > > sysdeps/x86/cpu-features-offsets.sym:XSAVE_STATE_SIZE_OFFSET
> > > offsetof (struct cpu_features, xsave_state_size)
> > > sysdeps/x86_64/dl-trampoline.h:  sub
> > > _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip),
> > > %RSP_LP
> > > sysdeps/x86_64/dl-trampoline.h:  sub
> > > _dl_x86_cpu_features+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
> > >
> > > --
> > > H.J.
>
>
>
> --
> H.J.
diff mbox series

Patch

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
index d74fc7731d..765e9ed7f7 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
@@ -70,94 +70,99 @@ 
  *
  */
 
-/* Offsets for data table __svml_stanh_data_internal_avx512. Ordered
-   by use in the function. On cold-starts this might help the
-   prefetcher. Possibly a better idea is to interleave start/end so
-   that the prefetcher is less likely to detect a stream and pull
-   irrelivant lines into cache.  */
 
-/* Offsets for data table __svml_stanh_data_internal. 4 bytes each.
- */
+
+#define LOCAL_DATA_NAME	__svml_stanh_data_internal
+#define LOCAL_DATA_NAME_UNALIGNED	__svml_stanh_data_internal_unaligned
+#include "svml_s_common_evex512_rodata_offsets.h"
+
+/* Offsets for data table __svml_stanh_data_internal_unaligned.
+   4 bytes each.  */
 #define _iExpMantMask_UISA		0
 #define _iMinIdxOfsMask_UISA		4
 #define _iMaxIdxMask_UISA		8
 #define _iExpMask			12
 
-/* Offsets for data table __svml_stanh_data_internal_al64. 64 bytes
-   each.  */
-#define _sC_lo				0
-#define _sC_hi				64
-#define _sP7_lo				128
-#define _sP7_hi				192
-#define _sSignMask			256
-#define _sP6_lo				320
-#define _sP6_hi				384
-#define _sP5_lo				448
-#define _sP5_hi				512
-#define _sP4_lo				576
-#define _sP4_hi				640
-#define _sP3_lo				704
-#define _sP3_hi				768
-#define _sP2_lo				832
-#define _sP2_hi				896
-#define _sP0_lo				960
-#define _sP0_hi				1024
+/* Offsets for data table __svml_stanh_data_internal. Ordered
+   by use in the function. On cold-starts this might help the
+   prefetcher. Possibly a better idea is to interleave start/end so
+   that the prefetcher is less likely to detect a stream and pull
+   irrelevant lines into cache.  */
+
+/* Offsets for data table __svml_stanh_data_internal.
+   64 bytes each.  */
+#define _sC_lo	0
+#define _sC_hi	64
+#define _sP7_lo	128
+#define _sP7_hi	192
+#define _sP6_lo	256
+#define _sP6_hi	320
+#define _sP5_lo	384
+#define _sP5_hi	448
+#define _sP4_lo	512
+#define _sP4_hi	576
+#define _sP3_lo	640
+#define _sP3_hi	704
+#define _sP2_lo	768
+#define _sP2_hi	832
+#define _sP0_lo	896
+#define _sP0_hi	960
+
 
 #include <sysdep.h>
-#define TANHF_DATA(x)			((x)+__svml_stanh_data_internal_al64)
-#define TANHF_DATA_UNALIGNED(x)		((x)+__svml_stanh_data_internal)
 
 	.section .text.evex512, "ax", @progbits
 ENTRY(_ZGVeN16v_tanhf_skx)
-	/* Here huge arguments, INF and NaNs are filtered out to callout. */
-	vpandd	TANHF_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
-	vpsubd	TANHF_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
+	/* Here huge arguments, INF and NaNs are filtered out to
+	   callout.  */
+	vpandd	LOCAL_DATA_UNALIGNED(_iExpMantMask_UISA)(%rip){1to16}, %zmm0, %zmm1
+	vpsubd	LOCAL_DATA_UNALIGNED(_iMinIdxOfsMask_UISA)(%rip){1to16}, %zmm1, %zmm2
 
 	/* Selection arguments between [0, 0x03e00000] into zmm3.  */
 	vpxord	%zmm3, %zmm3, %zmm3
 	vpmaxsd	%zmm3, %zmm2, %zmm3
-	vpminsd	TANHF_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
+	vpminsd	LOCAL_DATA_UNALIGNED(_iMaxIdxMask_UISA)(%rip){1to16}, %zmm3, %zmm3
 
 	/* Setup permute indices in zmm3.  */
 	vpsrld	$21, %zmm3, %zmm3
 
 	/* Store if there are any special cases in k1.  */
-	vpcmpd	$6, TANHF_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
+	vpcmpd	$6, LOCAL_DATA_UNALIGNED(_iExpMask)(%rip){1to16}, %zmm1, %k1
 
-	vmovaps	TANHF_DATA(_sC_lo)(%rip), %zmm5
-	vpermt2ps TANHF_DATA(_sC_hi)(%rip), %zmm3, %zmm5
+	vmovaps	LOCAL_DATA(_sC_lo)(%rip), %zmm5
+	vpermt2ps LOCAL_DATA(_sC_hi)(%rip), %zmm3, %zmm5
 
-	vmovaps	TANHF_DATA(_sP7_lo)(%rip), %zmm2
-	vpermt2ps TANHF_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
+	vmovaps	LOCAL_DATA(_sP7_lo)(%rip), %zmm2
+	vpermt2ps LOCAL_DATA(_sP7_hi)(%rip), %zmm3, %zmm2
 
 	/* Store absolute values of inputs in zmm1.  */
-	vmovaps	TANHF_DATA(_sSignMask)(%rip), %zmm4
+	vmovaps	COMMON_DATA(_SignMask)(%rip), %zmm4
 	vandnps	%zmm0, %zmm4, %zmm1
 	vsubps	{rn-sae}, %zmm5, %zmm1, %zmm1
 
-	vmovaps	TANHF_DATA(_sP6_lo)(%rip), %zmm5
-	vpermt2ps TANHF_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
+	vmovaps	LOCAL_DATA(_sP6_lo)(%rip), %zmm5
+	vpermt2ps LOCAL_DATA(_sP6_hi)(%rip), %zmm3, %zmm5
 
-	vmovaps	TANHF_DATA(_sP5_lo)(%rip), %zmm6
-	vpermt2ps TANHF_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
+	vmovaps	LOCAL_DATA(_sP5_lo)(%rip), %zmm6
+	vpermt2ps LOCAL_DATA(_sP5_hi)(%rip), %zmm3, %zmm6
 
 	vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm2
 	vfmadd213ps {rn-sae}, %zmm6, %zmm1, %zmm2
 
-	vmovaps	TANHF_DATA(_sP4_lo)(%rip), %zmm7
-	vpermt2ps TANHF_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
+	vmovaps	LOCAL_DATA(_sP4_lo)(%rip), %zmm7
+	vpermt2ps LOCAL_DATA(_sP4_hi)(%rip), %zmm3, %zmm7
 
-	vmovaps	TANHF_DATA(_sP3_lo)(%rip), %zmm8
-	vpermt2ps TANHF_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
+	vmovaps	LOCAL_DATA(_sP3_lo)(%rip), %zmm8
+	vpermt2ps LOCAL_DATA(_sP3_hi)(%rip), %zmm3, %zmm8
 
 	vfmadd213ps {rn-sae}, %zmm7, %zmm1, %zmm2
 	vfmadd213ps {rn-sae}, %zmm8, %zmm1, %zmm2
 
-	vmovaps	TANHF_DATA(_sP2_lo)(%rip), %zmm9
-	vpermt2ps TANHF_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
+	vmovaps	LOCAL_DATA(_sP2_lo)(%rip), %zmm9
+	vpermt2ps LOCAL_DATA(_sP2_hi)(%rip), %zmm3, %zmm9
 
-	vmovaps	TANHF_DATA(_sP0_lo)(%rip), %zmm10
-	vpermt2ps TANHF_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
+	vmovaps	LOCAL_DATA(_sP0_lo)(%rip), %zmm10
+	vpermt2ps LOCAL_DATA(_sP0_hi)(%rip), %zmm3, %zmm10
 
 	vfmadd213ps {rn-sae}, %zmm9, %zmm1, %zmm2
 	vfmadd213ps {rn-sae}, %zmm10, %zmm1, %zmm2
@@ -167,7 +172,7 @@  ENTRY(_ZGVeN16v_tanhf_skx)
 
 	/* Go to special inputs processing branch.  */
 	jne	L(SPECIAL_VALUES_BRANCH)
-	# LOE rbx r12 r13 r14 r15 zmm0 zmm2 zmm4
+
 	/* Wait until after branch of write over zmm0.  */
 	vpternlogd $0xec, %zmm4, %zmm2, %zmm0
 
@@ -176,24 +181,24 @@  ENTRY(_ZGVeN16v_tanhf_skx)
 
 	/* Cold case. edx has 1s where there was a special value that
 	   needs to be handled by a tanhf call. Optimize for code size
-	   more so than speed here. */
+	   more so than speed here.  */
 L(SPECIAL_VALUES_BRANCH):
-	# LOE rbx rdx r12 r13 r14 r15 zmm0 zmm2 zmm4
-    /* Use r13 to save/restore the stack. This allows us to use rbp as
-       callee save register saving code size. */
+
+	/* Use r13 to save/restore the stack. This allows us to use rbp
+	   as callee save register saving code size.  */
 	pushq	%r13
-	cfi_adjust_cfa_offset(8)
-	cfi_offset(r13, -16)
-	/* Need to callee save registers to preserve state across tanhf calls.
-	 */
+	cfi_adjust_cfa_offset (8)
+	cfi_offset (r13, -16)
+	/* Need to callee save registers to preserve state across tanhf
+	   calls.  */
 	pushq	%rbx
-	cfi_adjust_cfa_offset(8)
-	cfi_offset(rbx, -24)
+	cfi_adjust_cfa_offset (8)
+	cfi_offset (rbx, -24)
 	pushq	%rbp
-	cfi_adjust_cfa_offset(8)
-	cfi_offset(rbp, -32)
+	cfi_adjust_cfa_offset (8)
+	cfi_offset (rbp, -32)
 	movq	%rsp, %r13
-	cfi_def_cfa_register(r13)
+	cfi_def_cfa_register (r13)
 
 	/* Align stack and make room for 2x zmm vectors.  */
 	andq	$-64, %rsp
@@ -207,16 +212,17 @@  L(SPECIAL_VALUES_BRANCH):
 
 	vzeroupper
 
-	/* edx has 1s where there was a special value that needs to be handled
-	   by a tanhf call.  */
+	/* edx has 1s where there was a special value that needs to be
+	   handled by a tanhf call.  */
 	movl	%edx, %ebx
 L(SPECIAL_VALUES_LOOP):
-	# LOE rbx rbp r12 r13 r14 r15
-	/* use rbp as index for special value that is saved across calls to
-	   tanhf. We technically don't need a callee save register here as offset
-	   to rsp is always [0, 56] so we can restore rsp by realigning to 64.
-	   Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
-	   in the loop. Realigning also costs more code size.  */
+
+	/* Use rbp as index for special value that is saved across calls
+	   to tanhf. We technically don't need a callee save register
+	   here as offset to rsp is always [0, 56] so we can restore
+	   rsp by realigning to 64. Essentially the tradeoff is 1 extra
+	   save/restore vs 2 extra instructions in the loop. Realigning
+	   also costs more code size.  */
 	xorl	%ebp, %ebp
 	tzcntl	%ebx, %ebp
 
@@ -224,203 +230,141 @@  L(SPECIAL_VALUES_LOOP):
 	vmovss	64(%rsp, %rbp, 4), %xmm0
 	call	tanhf@PLT
 
-	/* No good way to avoid the store-forwarding fault this will cause on
-	   return. `lfence` avoids the SF fault but at greater cost as it
-	   serialized stack/callee save restoration.  */
+	/* No good way to avoid the store-forwarding fault this will
+	   cause on return. `lfence` avoids the SF fault but at greater
+	   cost as it serializes stack/callee save restoration.  */
 	vmovss	%xmm0, (%rsp, %rbp, 4)
 
-	blsrl   %ebx, %ebx
+	blsrl	%ebx, %ebx
 	jnz	L(SPECIAL_VALUES_LOOP)
-	# LOE r12 r13 r14 r15
+
 
 	/* All results have been written to (%rsp).  */
 	vmovaps	(%rsp), %zmm0
 	/* Restore rsp.  */
 	movq	%r13, %rsp
-	cfi_def_cfa_register(rsp)
+	cfi_def_cfa_register (rsp)
 	/* Restore callee save registers.  */
 	popq	%rbp
-	cfi_adjust_cfa_offset(-8)
-	cfi_restore(rbp)
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (rbp)
 	popq	%rbx
-	cfi_adjust_cfa_offset(-8)
-	cfi_restore(rbp)
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (rbx)	/* rbx (not rbp) was just popped.  */
 	popq	%r13
-	cfi_adjust_cfa_offset(-8)
-	cfi_restore(r13)
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (r13)
 	ret
 END(_ZGVeN16v_tanhf_skx)
 
-	.section .rodata, "a"
+	.section .rodata.evex512, "a"
 	.align	16
-#ifdef __svml_stanh_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct
-	{
-	__declspec(align(4)) VUINT32 _iExpMantMask_UISA[1][1];
-	__declspec(align(4)) VUINT32 _iMinIdxOfsMask_UISA[1][1];
-	__declspec(align(4)) VUINT32 _iMaxIdxMask_UISA[1][1];
-	__declspec(align(4)) VUINT32 _iExpMask[1][1];
-	__declspec(align(64)) VUINT32 _sC_lo[16][1];
-	__declspec(align(64)) VUINT32 _sC_hi[16][1];
-	__declspec(align(64)) VUINT32 _sP7_lo[16][1];
-	__declspec(align(64)) VUINT32 _sP7_hi[16][1];
-	__declspec(align(64)) VUINT32 _sSignMask[16][1];
-	__declspec(align(64)) VUINT32 _sP6_lo[16][1];
-	__declspec(align(64)) VUINT32 _sP6_hi[16][1];
-	__declspec(align(64)) VUINT32 _sP5_lo[16][1];
-	__declspec(align(64)) VUINT32 _sP5_hi[16][1];
-	__declspec(align(64)) VUINT32 _sP4_lo[16][1];
-	__declspec(align(64)) VUINT32 _sP4_hi[16][1];
-	__declspec(align(64)) VUINT32 _sP3_lo[16][1];
-	__declspec(align(64)) VUINT32 _sP3_hi[16][1];
-	__declspec(align(64)) VUINT32 _sP2_lo[16][1];
-	__declspec(align(64)) VUINT32 _sP2_hi[16][1];
-	__declspec(align(64)) VUINT32 _sP0_lo[16][1];
-	__declspec(align(64)) VUINT32 _sP0_hi[16][1];
-} __svml_stanh_data_internal;
-#endif
-
-__svml_stanh_data_internal:
-	.align	4
-	/* _iExpMantMask_UISA */
-	.long	0x7fe00000
-
-	.align	4
-	/* _iMinIdxOfsMask_UISA */
-	.long	0x3d400000
-
-	.align	4
-	/* _iMaxIdxMask_UISA */
-	.long	0x03e00000
-
-	.align	4
-	/* _iExpMask */
-	.long	0x7f000000
-
-	.align	64
-__svml_stanh_data_internal_al64:
-	.align	64
-	/* _sC_lo */
-	.long	0x00000000, 0x3d700000, 0x3d900000, 0x3db00000
-	.long	0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000
-	.long	0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000
-	.long	0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000
-
-	.align	64
-	/* _sC_hi */
-	.long	0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000
-	.long	0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000
-	.long	0x40500000, 0x40700000, 0x40900000, 0x40b00000
-	.long	0x40d00000, 0x40f00000, 0x41100000, 0x00000000
-
-	.align	64
-	/* _sP7_lo */
-	.long	0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e
-	.long	0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57
-	.long	0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f
-	.long	0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0
-
-	.align	64
-	/* _sP7_hi */
-	.long	0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b
-	.long	0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22
-	.long	0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950
-	.long	0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000
 
-	.align	64
-	/* _sSignMask */
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000
-	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000
-
-	.align	64
-	/* _sP6_lo */
-	.long	0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756
-	.long	0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0
-	.long	0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17
-	.long	0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad
-
-	.align	64
-	/* _sP6_hi */
-	.long	0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63
-	.long	0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66
-	.long	0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3
-	.long	0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000
-
-	.align	64
-	/* _sP5_lo */
-	.long	0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d
-	.long	0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670
-	.long	0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405
-	.long	0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4
-
-	.align	64
-	/* _sP5_hi */
-	.long	0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9
-	.long	0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd
-	.long	0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232
-	.long	0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000
-
-	.align	64
-	/* _sP4_lo */
-	.long	0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120
-	.long	0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a
-	.long	0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88
-	.long	0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e
-
-	.align	64
-	/* _sP4_hi */
-	.long	0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96
-	.long	0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67
-	.long	0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9
-	.long	0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000
-
-	.align	64
-	/* _sP3_lo */
-	.long	0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d
-	.long	0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3
-	.long	0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca
-	.long	0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92
-
-	.align	64
-	/* _sP3_hi */
-	.long	0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704
-	.long	0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06
-	.long	0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2
-	.long	0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000
-
-	.align	64
-	/* _sP2_lo */
-	.long	0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f
-	.long	0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580
-	.long	0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92
-	.long	0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360
-
-	.align	64
-	/* _sP2_hi */
-	.long	0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2
-	.long	0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4
-	.long	0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b
-	.long	0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000
-
-	.align	64
-	/* _sP0_lo */
-	.long	0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169
-	.long	0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984
-	.long	0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163
-	.long	0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0
-
-	.align	64
-	/* _sP0_hi */
-	.long	0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53
-	.long	0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85
-	.long	0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0
-	.long	0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000
+LOCAL_DATA_NAME_UNALIGNED:
+	float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMantMask_UISA, 0x7fe00000)
+	float_block (LOCAL_DATA_NAME_UNALIGNED, _iMinIdxOfsMask_UISA, 0x3d400000)
+	float_block (LOCAL_DATA_NAME_UNALIGNED, _iMaxIdxMask_UISA, 0x03e00000)
+	float_block (LOCAL_DATA_NAME_UNALIGNED, _iExpMask, 0x7f000000)
+	.type	LOCAL_DATA_NAME_UNALIGNED, @object
+	.size	LOCAL_DATA_NAME_UNALIGNED, .-LOCAL_DATA_NAME_UNALIGNED
 
 	.align	64
-	.type	__svml_stanh_data_internal_al64, @object
-	.size	__svml_stanh_data_internal_al64, .-__svml_stanh_data_internal_al64
-	.type	__svml_stanh_data_internal, @object
-	.size	__svml_stanh_data_internal, .-__svml_stanh_data_internal
+LOCAL_DATA_NAME:
+	float_block (LOCAL_DATA_NAME, _sC_lo,
+		0x00000000, 0x3d700000, 0x3d900000, 0x3db00000,
+		0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000,
+		0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000,
+		0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000)
+
+	float_block (LOCAL_DATA_NAME, _sC_hi,
+		0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000,
+		0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000,
+		0x40500000, 0x40700000, 0x40900000, 0x40b00000,
+		0x40d00000, 0x40f00000, 0x41100000, 0x00000000)
+
+	float_block (LOCAL_DATA_NAME, _sP7_lo,
+		0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e,
+		0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57,
+		0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f,
+		0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0)
+
+	float_block (LOCAL_DATA_NAME, _sP7_hi,
+		0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b,
+		0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22,
+		0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950,
+		0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x00000000)
+
+	float_block (LOCAL_DATA_NAME, _sP6_lo,
+		0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756,
+		0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0,
+		0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17,
+		0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad)
+
+	float_block (LOCAL_DATA_NAME, _sP6_hi,
+		0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63,
+		0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66,
+		0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3,
+		0x35a8a5e6, 0x34369b17, 0x322487b0, 0x00000000)
+
+	float_block (LOCAL_DATA_NAME, _sP5_lo,
+		0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d,
+		0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670,
+		0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405,
+		0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4)
+
+	float_block (LOCAL_DATA_NAME, _sP5_hi,
+		0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9,
+		0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd,
+		0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232,
+		0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x00000000)
+
+	float_block (LOCAL_DATA_NAME, _sP4_lo,
+		0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120,
+		0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a,
+		0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88,
+		0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e)
+
+	float_block (LOCAL_DATA_NAME, _sP4_hi,
+		0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96,
+		0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67,
+		0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9,
+		0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x00000000)
+
+	float_block (LOCAL_DATA_NAME, _sP3_lo,
+		0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d,
+		0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3,
+		0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca,
+		0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92)
+
+	float_block (LOCAL_DATA_NAME, _sP3_hi,
+		0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704,
+		0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06,
+		0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2,
+		0xb717b096, 0xb5a43bae, 0xb383012c, 0x00000000)
+
+	float_block (LOCAL_DATA_NAME, _sP2_lo,
+		0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f,
+		0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580,
+		0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92,
+		0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360)
+
+	float_block (LOCAL_DATA_NAME, _sP2_hi,
+		0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2,
+		0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4,
+		0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b,
+		0x3717b0da, 0x35a43bce, 0x338306c6, 0x00000000)
+
+	float_block (LOCAL_DATA_NAME, _sP0_lo,
+		0x00000000, 0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169,
+		0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984,
+		0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163,
+		0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0)
+
+	float_block (LOCAL_DATA_NAME, _sP0_hi,
+		0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53,
+		0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85,
+		0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0,
+		0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000)
+
+	.type	LOCAL_DATA_NAME, @object
+	.size	LOCAL_DATA_NAME, .-LOCAL_DATA_NAME