@@ -30,46 +30,45 @@
*
*/
-/* Offsets for data table __svml_satanh_data_internal_avx512. Ordered
- by use in the function. On cold-starts this might help the
- prefetcher. Possibly a better idea is to interleave start/end so
- that the prefetcher is less likely to detect a stream and pull
- irrelivant lines into cache. */
-#define sOne 0
-#define SgnMask 16
-#define sTopMask12 32
-#define iBrkValue 48
-#define iOffExpoMask 64
-#define sPoly 80
-#define sLn2 208
-#define TinyRange 224
+#define LOCAL_DATA_NAME __svml_satanh_data_internal
+#include "svml_s_common_sse4_rodata_offsets.h"
+
+/* Offsets for data table __svml_satanh_data_internal. */
+#define _Poly_1 0
+#define _Poly_2 16
+#define _Poly_3 32
+#define _Poly_4 48
+#define _Poly_5 64
+#define _Poly_6 80
+#define _Poly_7 96
+#define _TinyRange 112
#include <sysdep.h>
-#define ATANHF_DATA(x) ((x)+__svml_satanh_data_internal)
.section .text.sse4, "ax", @progbits
ENTRY(_ZGVbN4v_atanhf_sse4)
movaps %xmm0, %xmm5
- /* Load constants including One = 1 */
- movups ATANHF_DATA(sOne)(%rip), %xmm4
+ /* Load constants including One = 1. */
+ movups COMMON_DATA(_OneF)(%rip), %xmm4
movaps %xmm5, %xmm3
- /* Strip off the sign, so treat X as positive until right at the end */
- movups ATANHF_DATA(SgnMask)(%rip), %xmm1
+ /* Strip off the sign, so treat X as positive until right at the
+ end. */
+ movups COMMON_DATA(_AbsMask)(%rip), %xmm1
movaps %xmm4, %xmm2
andps %xmm1, %xmm0
movaps %xmm4, %xmm10
- movups ATANHF_DATA(sTopMask12)(%rip), %xmm11
+ movups COMMON_DATA(_Neg4096)(%rip), %xmm11
movaps %xmm4, %xmm14
movaps %xmm11, %xmm9
- /*
- * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
- * the upper part UHi being <= 12 bits long. Then we have
- * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
- */
+	/* Compute V = 2 * X trivially, and UHi + ULo = 1 - X in two
+ pieces, the upper part UHi being <= 12 bits long. Then we
+ have:
+ atanh(X) = 1/2 * log((1 + X) / (1 - X))
+ = 1/2 * log1p(V / (UHi + ULo)). */
movaps %xmm0, %xmm6
mulps %xmm5, %xmm3
subps %xmm0, %xmm2
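+	/* For reference, a scalar C sketch of this step (illustrative
+	   only; names are local to the sketch, not part of the build):
+	       float v = 2.0f * x;
+	       float u = 1.0f - x;
+	       return 0.5f * log1pf (v / u);
+	   U (and V) are split into high/low pieces below purely so the
+	   rounding error of the division can be recovered.  */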
@@ -80,65 +79,61 @@ ENTRY(_ZGVbN4v_atanhf_sse4)
andps %xmm2, %xmm9
- /*
- * Check whether |X| < 1, in which case we use the main function.
- * Otherwise set the rangemask so that the callout will get used.
- * Note that this will also use the callout for NaNs since not(NaN < 1).
- */
+ /* Check whether |X| < 1, in which case we use the main
+ function. Otherwise set the rangemask so that the callout
+ will get used. Note that this will also use the callout for
+ NaNs since not(NaN < 1). */
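+	/* In C terms the callout predicate is (illustrative sketch):
+	       int need_callout = !(fabsf (x) < 1.0f);
+	   which is also true for NaN inputs, since any ordered
+	   comparison involving NaN is false.  */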
rcpps %xmm9, %xmm7
subps %xmm9, %xmm2
andps %xmm11, %xmm7
- /*
- * Split V as well into upper 12 bits and lower part, so that we can get
- * a preliminary quotient estimate without rounding error.
- */
+ /* Split V as well into upper 12 bits and lower part, so that we
+ can get a preliminary quotient estimate without rounding
+ error. */
andps %xmm6, %xmm11
mulps %xmm7, %xmm9
addps %xmm2, %xmm10
subps %xmm11, %xmm6
- /* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
+ /* Hence get initial quotient estimate:
+ QHi + QLo = R * VHi + R * VLo. */
mulps %xmm7, %xmm11
mulps %xmm7, %xmm10
subps %xmm9, %xmm14
mulps %xmm6, %xmm7
subps %xmm10, %xmm14
- /* Compute D = E + E^2 */
+ /* Compute D = E + E^2. */
movaps %xmm14, %xmm13
movaps %xmm4, %xmm8
mulps %xmm14, %xmm13
- /* reduction: compute r,n */
- movdqu ATANHF_DATA(iBrkValue)(%rip), %xmm9
+	/* Reduction: compute r, n. */
+ movdqu COMMON_DATA(_IBrkValue)(%rip), %xmm9
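+	/* A scalar C sketch of this reduction (illustrative; bits ()
+	   and frombits () are hypothetical float<->int32 casts): with
+	   m = 1 + arg and B = bits (2.0f / 3.0f),
+	       int32_t k = (int32_t) (bits (m) - B) >> 23;
+	       float r   = frombits (bits (m) - (k << 23)) - 1.0f;
+	   so that log1p (arg) = k * ln2 + log1p (r), with 1 + r in
+	   [2/3, 4/3).  */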
addps %xmm13, %xmm14
- /*
- * Compute R * (VHi + VLo) * (1 + E + E^2)
- * = R * (VHi + VLo) * (1 + D)
- * = QHi + (QHi * D + QLo + QLo * D)
- */
+ /* Compute R * (VHi + VLo) * (1 + E + E^2)
+ = R * (VHi + VLo) * (1 + D)
+ = QHi + (QHi * D + QLo + QLo * D). */
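+	/* Scalar C sketch of the quotient refinement (illustrative;
+	   top12 () is a hypothetical helper keeping the 12 high bits):
+	       float r   = top12 (1.0f / uhi);
+	       float qhi = r * vhi, qlo = r * vlo;
+	       float e   = 1.0f - r * uhi - r * ulo;
+	       float d   = e + e * e;
+	       float q   = qhi + (qhi * d + qlo + qlo * d);
+	   With r a 12-bit approximation, e is tiny and 1 + d
+	   approximates 1 / (1 - e) to well beyond single precision. */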
movaps %xmm14, %xmm2
mulps %xmm7, %xmm14
mulps %xmm11, %xmm2
addps %xmm14, %xmm7
- movdqu ATANHF_DATA(iOffExpoMask)(%rip), %xmm12
+ movdqu COMMON_DATA(_NotiOffExpoMask)(%rip), %xmm12
movaps %xmm4, %xmm14
- /* Record the sign for eventual reincorporation. */
+ /* Record the sign for eventual reincorporation. */
addps %xmm7, %xmm2
- /*
- * Now finally accumulate the high and low parts of the
- * argument to log1p, H + L, with a final compensated summation.
- */
+ /* Now finally accumulate the high and low parts of the
+ argument to log1p, H + L, with a final compensated summation. */
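+	/* The summation step in scalar C (illustrative; Fast2Sum,
+	   assuming the max/min pair below orders |a| >= |b|):
+	       float s = a + b;
+	       float t = b - (s - a);
+	   so s + t represents a + b exactly.  */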
movaps %xmm2, %xmm6
andnps %xmm5, %xmm1
movaps %xmm4, %xmm7
- /* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
+ /* Or the sign bit in with the tiny result to handle atanh(-0)
+ correctly. */
addps %xmm11, %xmm6
maxps %xmm6, %xmm7
minps %xmm6, %xmm8
@@ -149,43 +144,43 @@ ENTRY(_ZGVbN4v_atanhf_sse4)
subps %xmm10, %xmm7
psubd %xmm9, %xmm10
addps %xmm8, %xmm7
- pand %xmm10, %xmm12
+ pandn %xmm10, %xmm12
psrad $23, %xmm10
cvtdq2ps %xmm10, %xmm13
addps %xmm7, %xmm2
- /* final reconstruction */
+	/* Final reconstruction. */
pslld $23, %xmm10
paddd %xmm9, %xmm12
psubd %xmm10, %xmm14
- /* polynomial evaluation */
+	/* Polynomial evaluation. */
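+	/* The sequence below is plain Horner; in scalar C terms
+	   (illustrative; P7..P1 correspond to the _Poly_1.._Poly_7
+	   table entries):
+	       float p = P7;
+	       p = p * r + P6;  p = p * r + P5;  p = p * r + P4;
+	       p = p * r + P3;  p = p * r + P2;  p = p * r + P1;
+	       p = p * r - 0.5f;
+	       float lg = k * ln2 + (r + p * r * r);
+	   giving log1p of the reduced argument; halving and the sign
+	   are folded back in afterwards.  */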
subps %xmm4, %xmm12
mulps %xmm14, %xmm2
- movups ATANHF_DATA(sPoly+0)(%rip), %xmm7
+ movups LOCAL_DATA(_Poly_1)(%rip), %xmm7
addps %xmm12, %xmm2
mulps %xmm2, %xmm7
- /* Finally, halve the result and reincorporate the sign */
- addps ATANHF_DATA(sPoly+16)(%rip), %xmm7
+ /* Finally, halve the result and reincorporate the sign. */
+ addps LOCAL_DATA(_Poly_2)(%rip), %xmm7
mulps %xmm2, %xmm7
- addps ATANHF_DATA(sPoly+32)(%rip), %xmm7
+ addps LOCAL_DATA(_Poly_3)(%rip), %xmm7
mulps %xmm2, %xmm7
- addps ATANHF_DATA(sPoly+48)(%rip), %xmm7
+ addps LOCAL_DATA(_Poly_4)(%rip), %xmm7
mulps %xmm2, %xmm7
- addps ATANHF_DATA(sPoly+64)(%rip), %xmm7
+ addps LOCAL_DATA(_Poly_5)(%rip), %xmm7
mulps %xmm2, %xmm7
- addps ATANHF_DATA(sPoly+80)(%rip), %xmm7
+ addps LOCAL_DATA(_Poly_6)(%rip), %xmm7
mulps %xmm2, %xmm7
- addps ATANHF_DATA(sPoly+96)(%rip), %xmm7
+ addps LOCAL_DATA(_Poly_7)(%rip), %xmm7
mulps %xmm2, %xmm7
- movaps ATANHF_DATA(sPoly+112)(%rip), %xmm6
+ movaps COMMON_DATA(_Neg5F)(%rip), %xmm6
addps %xmm6, %xmm7
mulps %xmm2, %xmm7
mulps %xmm2, %xmm7
- mulps ATANHF_DATA(sLn2)(%rip), %xmm13
- /* We can build `sHalf` with `sPoly & sOne`. */
+ mulps COMMON_DATA(_Ln2)(%rip), %xmm13
+	/* We can build `sHalf` with `_Neg5F & _OneF`:
+	   (0xbf000000 & 0x3f800000) == 0x3f000000 == 0.5f. */
andps %xmm4, %xmm6
orps %xmm1, %xmm3
xorps %xmm6, %xmm1
@@ -197,7 +192,7 @@ ENTRY(_ZGVbN4v_atanhf_sse4)
/* Finish check of NaNs. */
cmpleps %xmm0, %xmm4
movmskps %xmm4, %edx
- cmpltps ATANHF_DATA(TinyRange)(%rip), %xmm0
+ cmpltps LOCAL_DATA(_TinyRange)(%rip), %xmm0
andps %xmm0, %xmm3
andnps %xmm1, %xmm0
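+	/* The and/andn pair is a branchless select; in C terms
+	   (illustrative):
+	       res = tiny ? x : (sign | main_result);
+	   The tiny path returns x itself, since atanh (x) ~= x for
+	   tiny x, and the sign-or keeps atanhf (-0.0f) == -0.0f.  */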
@@ -206,115 +201,84 @@ ENTRY(_ZGVbN4v_atanhf_sse4)
testl %edx, %edx
/* Go to special inputs processing branch. */
jne L(SPECIAL_VALUES_BRANCH)
- # LOE rbx rbp r12 r13 r14 r15 xmm0
+
/* No registers to restore on fast path. */
ret
/* Cold case. edx has 1s where there was a special value that
	   needs to be handled by an atanhf call. Optimize for code size
- more so than speed here. */
+ more so than speed here. */
L(SPECIAL_VALUES_BRANCH):
- # LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm5
+
/* Stack coming in 16-byte aligned. Set 8-byte misaligned so on
- call entry will be 16-byte aligned. */
+	   call entry it will be 16-byte aligned. */
subq $56, %rsp
- cfi_def_cfa_offset(64)
+ cfi_def_cfa_offset (64)
movups %xmm0, 24(%rsp)
movups %xmm5, 40(%rsp)
/* Use rbx/rbp for callee save registers as they get short
- encoding for many instructions (as compared with r12/r13). */
+ encoding for many instructions (as compared with r12/r13). */
movq %rbx, (%rsp)
- cfi_offset(rbx, -64)
+ cfi_offset (rbx, -64)
movq %rbp, 8(%rsp)
- cfi_offset(rbp, -56)
- /* edx has 1s where there was a special value that needs to be handled
- by a tanhf call. */
+ cfi_offset (rbp, -56)
+	/* edx has 1s where there was a special value that needs to
+	   be handled by an atanhf call. */
movl %edx, %ebx
L(SPECIAL_VALUES_LOOP):
- # LOE rbx rbp r12 r13 r14 r15
- /* use rbp as index for special value that is saved across calls to
- tanhf. We technically don't need a callee save register here as offset
- to rsp is always [0, 12] so we can restore rsp by realigning to 64.
- Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
- in the loop. */
+
+	/* Use rbp as the index for the special value that is saved
+	   across calls to atanhf. We technically don't need a callee
+	   save register here as the offset to rsp is always [0, 12],
+	   so we can restore rsp by realigning to 64. Essentially the
+	   tradeoff is 1 extra save/restore vs 2 extra instructions
+	   in the loop. */
xorl %ebp, %ebp
bsfl %ebx, %ebp
	/* Scalar math function call to process special input. */
movss 40(%rsp, %rbp, 4), %xmm0
call atanhf@PLT
- /* No good way to avoid the store-forwarding fault this will cause on
- return. `lfence` avoids the SF fault but at greater cost as it
- serialized stack/callee save restoration. */
+	/* No good way to avoid the store-forwarding fault this will
+	   cause on return. `lfence` avoids the SF fault but at
+	   greater cost as it serializes the stack/callee save
+	   restoration. */
movss %xmm0, 24(%rsp, %rbp, 4)
leal -1(%rbx), %eax
andl %eax, %ebx
jnz L(SPECIAL_VALUES_LOOP)
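+	/* Loop shape in scalar C (illustrative): process each set
+	   bit, clearing the lowest one per iteration:
+	       while (mask) {
+	           int i = __builtin_ctz (mask);
+	           dst[i] = atanhf (src[i]);
+	           mask &= mask - 1;
+	       }
+	*/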
- # LOE r12 r13 r14 r15
+
/* All results have been written to 24(%rsp). */
movups 24(%rsp), %xmm0
movq (%rsp), %rbx
- cfi_restore(rbx)
+ cfi_restore (rbx)
movq 8(%rsp), %rbp
- cfi_restore(rbp)
+ cfi_restore (rbp)
addq $56, %rsp
- cfi_def_cfa_offset(8)
+ cfi_def_cfa_offset (8)
ret
END(_ZGVbN4v_atanhf_sse4)
- .section .rodata, "a"
+ .section .rodata.sse4, "a"
.align 16
-#ifdef __svml_satanh_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct{
- __declspec(align(16)) VUINT32 sOne[4][1];
- __declspec(align(16)) VUINT32 SgnMask[4][1];
- __declspec(align(16)) VUINT32 sTopMask12[4][1];
- __declspec(align(16)) VUINT32 iBrkValue[4][1];
- __declspec(align(16)) VUINT32 iOffExpoMask[4][1];
- __declspec(align(16)) VUINT32 sPoly[8][4][1];
- __declspec(align(16)) VUINT32 sLn2[4][1];
- __declspec(align(16)) VUINT32 TinyRange[4][1];
-} __svml_satanh_data_internal;
-#endif
-
-__svml_satanh_data_internal:
- /* sOne = SP 1.0 */
- .align 16
- .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
- /* SgnMask */
- .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
- /* sTopMask12 */
- .align 16
- .long 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
- /* iBrkValue = SP 2/3 */
- .align 16
- .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
- /* iOffExpoMask = SP significand mask ==*/
- .align 16
- .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
-
- /* sPoly[] = SP polynomial */
- .align 16
- .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
- .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
- .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
- .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
- .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
- .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
- .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
- .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
-
- /* sLn2 = SP ln(2) */
- .align 16
- .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
- /* TinyRange */
- .align 16
- .long 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
- .align 16
- .type __svml_satanh_data_internal, @object
- .size __svml_satanh_data_internal, .-__svml_satanh_data_internal
+LOCAL_DATA_NAME:
+ /* _Poly[] = SP polynomial. */
+ /* 1.3820238411426544189453125e-01 P7. */
+ DATA_VEC (LOCAL_DATA_NAME, _Poly_1, 0x3e0d84ed)
+ /* -1.5122179687023162841796875e-01 P6. */
+ DATA_VEC (LOCAL_DATA_NAME, _Poly_2, 0xbe1ad9e3)
+ /* 1.4042308926582336425781250e-01 P5. */
+ DATA_VEC (LOCAL_DATA_NAME, _Poly_3, 0x3e0fcb12)
+ /* -1.6472326219081878662109375e-01 P4. */
+ DATA_VEC (LOCAL_DATA_NAME, _Poly_4, 0xbe28ad37)
+ /* 2.0007920265197753906250000e-01 P3. */
+ DATA_VEC (LOCAL_DATA_NAME, _Poly_5, 0x3e4ce190)
+ /* -2.5004237890243530273437500e-01 P2. */
+ DATA_VEC (LOCAL_DATA_NAME, _Poly_6, 0xbe80058e)
+ /* 3.3333265781402587890625000e-01 P1. */
+ DATA_VEC (LOCAL_DATA_NAME, _Poly_7, 0x3eaaaa94)
+ DATA_VEC (LOCAL_DATA_NAME, _TinyRange, 0x0C000000)
+ .type LOCAL_DATA_NAME, @object
+ .size LOCAL_DATA_NAME, .-LOCAL_DATA_NAME