@@ -1,5 +1,5 @@
/* Function atanhf vectorized with AVX2.
- Copyright (C) 2021-2022 Free Software Foundation, Inc.
+   Copyright (C) 2021-2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -30,181 +30,178 @@
*
*/
-/* Offsets for data table __svml_satanh_data_internal_avx512. Ordered
+
+#define LOCAL_DATA_NAME __svml_satanh_data_internal
+#include "svml_s_common_avx2_rodata_offsets.h"
+
+/* Offsets for data table __svml_satanh_data_internal. Ordered
   by use in the function. On cold-starts this might help the
prefetcher. Possibly a better idea is to interleave start/end so
that the prefetcher is less likely to detect a stream and pull
   irrelevant lines into cache. */
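+/* The offsets below advance by 32 bytes (one ymm constant each) and
+   must stay in sync with the DATA_VEC entries emitted for
+   LOCAL_DATA_NAME at the end of this file. */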
-#define SgnMask 0
-#define sOne 32
-#define sTopMask12 64
-#define TinyRange 96
-#define iBrkValue 128
-#define iOffExpoMask 160
-#define sPoly 192
-#define sLn2 448
-#define sHalf 480
+#define _TinyRange 0
+#define _Poly_1 32
+#define _Poly_2 64
+#define _Poly_3 96
+#define _Poly_4 128
+#define _Poly_5 160
+#define _Poly_6 192
+#define _Poly_7 224
+#define _Half 256
#include <sysdep.h>
-#define ATANHF_DATA(x) ((x)+__svml_satanh_data_internal)
.section .text.avx2, "ax", @progbits
ENTRY(_ZGVdN8v_atanhf_avx2)
- /* Strip off the sign, so treat X as positive until right at the end */
- vmovaps ATANHF_DATA(SgnMask)(%rip), %ymm2
+ /* Strip off the sign, so treat X as positive until right at the end. */
+ vmovaps COMMON_DATA(_AbsMask)(%rip), %ymm2
vandps %ymm2, %ymm0, %ymm3
- /* Load constants including One = 1 */
- vmovups ATANHF_DATA(sOne)(%rip), %ymm5
+ /* Load constants including One = 1. */
+ vmovups COMMON_DATA(_OneF)(%rip), %ymm5
vsubps %ymm3, %ymm5, %ymm1
- vmovups ATANHF_DATA(sTopMask12)(%rip), %ymm4
+ vmovups COMMON_DATA(_Neg4096)(%rip), %ymm4
vrcpps %ymm1, %ymm7
vsubps %ymm1, %ymm5, %ymm9
vandps %ymm4, %ymm7, %ymm6
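+	/* Truncate the reciprocal estimate to 12 significant bits
+	   (presumably the same 0xFFFFF000 mask as the old sTopMask12,
+	   i.e. -4096 viewed as an integer, hence _Neg4096): a 12-bit R
+	   times a 12-bit VHi is exact in single precision. */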
vsubps %ymm3, %ymm9, %ymm7
- /* No need to split sU when FMA is available */
+ /* No need to split sU when FMA is available. */
vfnmadd213ps %ymm5, %ymm6, %ymm1
vmovaps %ymm0, %ymm8
vfmadd213ps %ymm0, %ymm0, %ymm0
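+	/* ymm8 preserves the original x. ymm0 = x * x + x is the value
+	   blended in at the end for lanes with |x| below TinyRange,
+	   where atanh(x) rounds to x (x * x underflows to zero). */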
vfnmadd231ps %ymm6, %ymm7, %ymm1
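+	/* ymm1 now holds E = 1 - R * (UHi + ULo), the relative error of
+	   the truncated reciprocal R; it is expanded into D = E + E^2
+	   below to correct the quotient. */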
- /*
- * Check whether |X| < 1, in which case we use the main function.
- * Otherwise set the rangemask so that the callout will get used.
- * Note that this will also use the callout for NaNs since not(NaN < 1).
- */
+ /* Check whether |X| < 1, in which case we use the main
+ function. Otherwise set the rangemask so that the callout
+ will get used. Note that this will also use the callout for
+ NaNs since not(NaN < 1). */
vcmpnlt_uqps %ymm5, %ymm3, %ymm14
- vcmplt_oqps ATANHF_DATA(TinyRange)(%rip), %ymm3, %ymm15
+ vcmplt_oqps LOCAL_DATA(_TinyRange)(%rip), %ymm3, %ymm15
- /*
- * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
- * the upper part UHi being <= 12 bits long. Then we have
- * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
- */
+	/* Compute V = 2 * X trivially, and UHi + ULo = 1 - X in two
+	   pieces, the upper part UHi being <= 12 bits long. Then we have
+	   atanh(X) = 1/2 * log((1 + X) / (1 - X))
+	            = 1/2 * log1p(V / (UHi + ULo)). */
vaddps %ymm3, %ymm3, %ymm3
- /*
- * Split V as well into upper 12 bits and lower part, so that we can get
- * a preliminary quotient estimate without rounding error.
- */
+ /* Split V as well into upper 12 bits and lower part, so that we
+ can get a preliminary quotient estimate without rounding
+ error. */
vandps %ymm4, %ymm3, %ymm4
vsubps %ymm4, %ymm3, %ymm7
- /* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
+ /* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo. */
vmulps %ymm4, %ymm6, %ymm4
- /* Compute D = E + E^2 */
+ /* Compute D = E + E^2. */
vfmadd213ps %ymm1, %ymm1, %ymm1
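+	/* Since R * U = 1 - E, 1/U = R / (1 - E) ~= R * (1 + E + E^2)
+	   = R * (1 + D); the dropped E^3 term is far below single
+	   precision rounding. */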
/* Record the sign for eventual reincorporation. */
vandnps %ymm8, %ymm2, %ymm3
- /* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
+ /* Or the sign bit in with the tiny result to handle atanh(-0)
+ correctly. */
vorps %ymm3, %ymm0, %ymm13
vmulps %ymm7, %ymm6, %ymm2
- /*
- * Compute R * (VHi + VLo) * (1 + E + E^2)
- * = R * (VHi + VLo) * (1 + D)
- * = QHi + (QHi * D + QLo + QLo * D)
- */
-
- /*
- * If less precision is acceptable the `vmulps %ymm1, %ymm4, %ymm9;
- * vaddps %ymm1, %ymm9, %ymm1` can be replaced with
- * `vfmadd231ps %ymm1, %ymm4, %ymm4`.
- */
+ /* Compute R * (VHi + VLo) * (1 + E + E^2)
+ = R * (VHi + VLo) * (1 + D)
+ = QHi + (QHi * D + QLo + QLo * D). */
+
+	/* If less precision is acceptable, the sequence:
+	   `vmulps %ymm1, %ymm4, %ymm6; vaddps %ymm1, %ymm6, %ymm1`
+	   can be replaced with:
+	   `vfmadd231ps %ymm1, %ymm4, %ymm4`. */
vmulps %ymm1, %ymm4, %ymm6
vfmadd213ps %ymm2, %ymm2, %ymm1
vaddps %ymm1, %ymm6, %ymm1
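+	/* ymm4 = QHi and ymm1 = QHi * D + QLo + QLo * D, the high and
+	   low pieces of the corrected quotient. */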
- /*
- * Now finally accumulate the high and low parts of the
- * argument to log1p, H + L, with a final compensated summation.
- */
+ /* Now finally accumulate the high and low parts of the
+ argument to log1p, H + L, with a final compensated summation. */
vaddps %ymm1, %ymm4, %ymm2
- /* reduction: compute r, n */
- vmovups ATANHF_DATA(iBrkValue)(%rip), %ymm9
+ /* reduction: compute r, n. */
+ vmovups COMMON_DATA(_IBrkValue)(%rip), %ymm9
- /*
- * Now we feed into the log1p code, using H in place of _VARG1 and
- * later incorporating L into the reduced argument.
- * compute 1+x as high, low parts
- */
+ /* Now we feed into the log1p code, using H in place of _VARG1 and
+ later incorporating L into the reduced argument.
+ compute 1+x as high, low parts. */
vmaxps %ymm2, %ymm5, %ymm0
vminps %ymm2, %ymm5, %ymm6
- /* This is needed for rounding (see `vaddps %ymm1, %ymm4, %ymm2`). */
+ /* This is needed for rounding (see `vaddps %ymm1, %ymm4,
+ %ymm2`). */
vsubps %ymm2, %ymm4, %ymm2
vaddps %ymm6, %ymm0, %ymm4
vpsubd %ymm9, %ymm4, %ymm7
vsubps %ymm4, %ymm0, %ymm4
vaddps %ymm2, %ymm1, %ymm2
- vmovaps ATANHF_DATA(iOffExpoMask)(%rip), %ymm1
+ vmovaps COMMON_DATA(_NotiOffExpoMask)(%rip), %ymm1
- vandps %ymm1, %ymm7, %ymm0
+ vandnps %ymm7, %ymm1, %ymm0
vaddps %ymm4, %ymm6, %ymm4
- vandnps %ymm7, %ymm1, %ymm6
- vmovups ATANHF_DATA(sPoly+0)(%rip), %ymm1
+ vandps %ymm7, %ymm1, %ymm6
+
+ vmovups LOCAL_DATA(_Poly_1)(%rip), %ymm1
vpaddd %ymm9, %ymm0, %ymm0
vaddps %ymm4, %ymm2, %ymm4
vpsubd %ymm6, %ymm5, %ymm6
- /* polynomial evaluation */
+ /* polynomial evaluation. */
vsubps %ymm5, %ymm0, %ymm2
vfmadd231ps %ymm4, %ymm6, %ymm2
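+	/* Here 1 + H has been split as 2^n * m with m in [2/3, 4/3)
+	   (break point IBrkValue = 2/3): ymm0 = m, ymm7 carries n in
+	   its exponent field, ymm6 = 2^-n, and ymm2 = r = (m - 1)
+	   + 2^-n * L is the reduced argument for log(1 + r). */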
- vfmadd213ps ATANHF_DATA(sPoly+32)(%rip), %ymm2, %ymm1
- vfmadd213ps ATANHF_DATA(sPoly+64)(%rip), %ymm2, %ymm1
- vfmadd213ps ATANHF_DATA(sPoly+96)(%rip), %ymm2, %ymm1
- vfmadd213ps ATANHF_DATA(sPoly+128)(%rip), %ymm2, %ymm1
- vfmadd213ps ATANHF_DATA(sPoly+160)(%rip), %ymm2, %ymm1
- vfmadd213ps ATANHF_DATA(sPoly+192)(%rip), %ymm2, %ymm1
- vfmadd213ps ATANHF_DATA(sPoly+224)(%rip), %ymm2, %ymm1
+
+ vfmadd213ps LOCAL_DATA(_Poly_2)(%rip), %ymm2, %ymm1
+ vfmadd213ps LOCAL_DATA(_Poly_3)(%rip), %ymm2, %ymm1
+ vfmadd213ps LOCAL_DATA(_Poly_4)(%rip), %ymm2, %ymm1
+ vfmadd213ps LOCAL_DATA(_Poly_5)(%rip), %ymm2, %ymm1
+ vfmadd213ps LOCAL_DATA(_Poly_6)(%rip), %ymm2, %ymm1
+ vfmadd213ps LOCAL_DATA(_Poly_7)(%rip), %ymm2, %ymm1
+ vfmadd213ps COMMON_DATA(_Neg5F)(%rip), %ymm2, %ymm1
vmulps %ymm1, %ymm2, %ymm1
vfmadd213ps %ymm2, %ymm2, %ymm1
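+	/* ymm1 = r + r^2 * P(r) ~= log(1 + r): the low-order
+	   coefficients P0 = -1/2 (_Neg5F) and P1 = 1/3 (_Poly_7) match
+	   the log(1 + r) series. */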
- /* final reconstruction */
+ /* final reconstruction. */
vpsrad $23, %ymm7, %ymm6
vcvtdq2ps %ymm6, %ymm2
- vfmadd132ps ATANHF_DATA(sLn2)(%rip), %ymm1, %ymm2
+ vfmadd132ps COMMON_DATA(_Ln2)(%rip), %ymm1, %ymm2
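+	/* ymm2 ~= n * ln(2) + log(m) = log(1 + H + L)
+	   = log((1 + |x|) / (1 - |x|)). */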
- /* Finally, halve the result and reincorporate the sign */
- vxorps ATANHF_DATA(sHalf)(%rip), %ymm3, %ymm3
+ /* Finally, halve the result and reincorporate the sign. */
+ vxorps LOCAL_DATA(_Half)(%rip), %ymm3, %ymm3
vmulps %ymm2, %ymm3, %ymm2
vmovmskps %ymm14, %edx
testl %edx, %edx
vblendvps %ymm15, %ymm13, %ymm2, %ymm0
- /* Go to special inputs processing branch */
+ /* Go to special inputs processing branch. */
jne L(SPECIAL_VALUES_BRANCH)
- # LOE rbx rdx r12 r13 r14 r15 ymm0
+
/* No registers to restore on fast path. */
ret
/* Cold case. edx has 1s where there was a special value that
 	   needs to be handled by an atanhf call. Optimize for code size
- more so than speed here. */
+ more so than speed here. */
L(SPECIAL_VALUES_BRANCH):
- # LOE rbx rdx r12 r13 r14 r15 ymm0 ymm8
- /* Use r13 to save/restore the stack. This allows us to use rbp as
- callee save register saving code size. */
+
+	/* Use r13 to save/restore the stack. This allows us to use rbp
+	   as a callee save register, saving code size. */
pushq %r13
- cfi_adjust_cfa_offset(8)
- cfi_offset(r13, -16)
- /* Need to callee save registers to preserve state across tanhf calls.
- */
+ cfi_adjust_cfa_offset (8)
+ cfi_offset (r13, -16)
+	/* Need callee save registers to preserve state across the
+	   atanhf calls. */
pushq %rbx
- cfi_adjust_cfa_offset(8)
- cfi_offset(rbx, -24)
+ cfi_adjust_cfa_offset (8)
+ cfi_offset (rbx, -24)
pushq %rbp
- cfi_adjust_cfa_offset(8)
- cfi_offset(rbp, -32)
+ cfi_adjust_cfa_offset (8)
+ cfi_offset (rbp, -32)
movq %rsp, %r13
- cfi_def_cfa_register(r13)
+ cfi_def_cfa_register (r13)
/* Align stack and make room for 2x ymm vectors. */
andq $-32, %rsp
@@ -217,16 +214,17 @@ L(SPECIAL_VALUES_BRANCH):
vzeroupper
- /* edx has 1s where there was a special value that needs to be handled
- by a atanhf call. */
+	/* edx has 1s where there was a special value that needs to be
+	   handled by an atanhf call. */
movl %edx, %ebx
L(SPECIAL_VALUES_LOOP):
- # LOE rbx rbp r12 r13 r14 r15
- /* use rbp as index for special value that is saved across calls to
- atanhf. We technically don't need a callee save register here as offset
- to rsp is always [0, 28] so we can restore rsp by realigning to 64.
- Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
- in the loop. Realigning also costs more code size. */
+
+	/* Use rbp as the index of the special-value lane; it is
+	   preserved across the calls to atanhf. We technically don't
+	   need a callee save register here, as the offset from rsp is
+	   always in [0, 28], so we could restore rsp by realigning to
+	   64. Essentially the tradeoff is one extra save/restore vs.
+	   two extra instructions in the loop; realigning also costs
+	   more code size. */
xorl %ebp, %ebp
tzcntl %ebx, %ebp
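+	/* ebx is the mask of lanes that need the scalar call: tzcnt
+	   picks the index of the next such lane and blsr below clears
+	   its bit, so each special lane is handled exactly once. */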
@@ -234,100 +232,52 @@ L(SPECIAL_VALUES_LOOP):
vmovss 32(%rsp, %rbp, 4), %xmm0
call atanhf@PLT
- /* No good way to avoid the store-forwarding fault this will cause on
- return. `lfence` avoids the SF fault but at greater cost as it
- serialized stack/callee save restoration. */
+	/* No good way to avoid the store-forwarding fault this will
+	   cause on return. `lfence` avoids the SF fault but at greater
+	   cost, as it serializes stack/callee save restoration. */
vmovss %xmm0, (%rsp, %rbp, 4)
- blsrl %ebx, %ebx
+ blsrl %ebx, %ebx
jnz L(SPECIAL_VALUES_LOOP)
- # LOE r12 r13 r14 r15
-
/* All results have been written to (%rsp). */
vmovups (%rsp), %ymm0
/* Restore rsp. */
movq %r13, %rsp
- cfi_def_cfa_register(rsp)
+ cfi_def_cfa_register (rsp)
/* Restore callee save registers. */
popq %rbp
- cfi_adjust_cfa_offset(-8)
- cfi_restore(rbp)
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (rbp)
popq %rbx
- cfi_adjust_cfa_offset(-8)
- cfi_restore(rbp)
+ cfi_adjust_cfa_offset (-8)
+	cfi_restore (rbx)
popq %r13
- cfi_adjust_cfa_offset(-8)
- cfi_restore(r13)
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (r13)
ret
END(_ZGVdN8v_atanhf_avx2)
- .section .rodata, "a"
- .align 32
-#ifdef __svml_satanh_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct{
- __declspec(align(32)) VUINT32 SgnMask[8][1];
- __declspec(align(32)) VUINT32 sOne[8][1];
- __declspec(align(32)) VUINT32 sTopMask12[8][1];
- __declspec(align(32)) VUINT32 TinyRange[8][1];
- __declspec(align(32)) VUINT32 iBrkValue[8][1];
- __declspec(align(32)) VUINT32 iOffExpoMask[8][1];
- __declspec(align(32)) VUINT32 sPoly[8][8][1];
- __declspec(align(32)) VUINT32 sLn2[8][1];
- __declspec(align(32)) VUINT32 sHalf[8][1];
-} __svml_satanh_data_internal;
-#endif
-__svml_satanh_data_internal:
- /* SgnMask */
- .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
- .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
- /* sOne = SP 1.0 */
+ .section .rodata.avx2, "a"
.align 32
- .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
- .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
- /* sTopMask12 */
- .align 32
- .long 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
- .long 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
- /* TinyRange */
- .align 32
- .long 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
- .long 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
- /* iBrkValue = SP 2/3 */
- .align 32
- .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
- .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
- /* iOffExpoMask = SP significand mask */
- .align 32
- .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
- .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
- /* sPoly[] = SP polynomial */
- .align 32
- .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed
- .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
- .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3
- .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
- .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12
- .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
- .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37
- .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
- .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190
- .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
- .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e
- .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
- .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94
- .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
- .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
- .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
- /* sLn2 = SP ln(2) */
- .align 32
- .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
- .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
- /* sHalf */
- .align 32
- .long 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
- .long 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
- .align 32
- .type __svml_satanh_data_internal, @object
- .size __svml_satanh_data_internal, .-__svml_satanh_data_internal
+
+LOCAL_DATA_NAME:
+ DATA_VEC (LOCAL_DATA_NAME, _TinyRange, 0x0C000000)
+ /* _Poly[] = SP polynomial. */
+ /* 1.3820238411426544189453125e-01 P7. */
+ DATA_VEC (LOCAL_DATA_NAME, _Poly_1, 0x3e0d84ed)
+ /* -1.5122179687023162841796875e-01 P6. */
+ DATA_VEC (LOCAL_DATA_NAME, _Poly_2, 0xbe1ad9e3)
+ /* 1.4042308926582336425781250e-01 P5. */
+ DATA_VEC (LOCAL_DATA_NAME, _Poly_3, 0x3e0fcb12)
+ /* -1.6472326219081878662109375e-01 P4. */
+ DATA_VEC (LOCAL_DATA_NAME, _Poly_4, 0xbe28ad37)
+ /* 2.0007920265197753906250000e-01 P3. */
+ DATA_VEC (LOCAL_DATA_NAME, _Poly_5, 0x3e4ce190)
+ /* -2.5004237890243530273437500e-01 P2. */
+ DATA_VEC (LOCAL_DATA_NAME, _Poly_6, 0xbe80058e)
+ /* 3.3333265781402587890625000e-01 P1. */
+ DATA_VEC (LOCAL_DATA_NAME, _Poly_7, 0x3eaaaa94)
+ DATA_VEC (LOCAL_DATA_NAME, _Half, 0x3F000000)
+ .type LOCAL_DATA_NAME, @object
+ .size LOCAL_DATA_NAME, .-LOCAL_DATA_NAME