@@ -30,46 +30,45 @@
*
*/
-/* Offsets for data table __svml_satanh_data_internal_avx512. Ordered
- by use in the function. On cold-starts this might help the
- prefetcher. Possibly a better idea is to interleave start/end so
- that the prefetcher is less likely to detect a stream and pull
- irrelivant lines into cache. */
-#define sOne 0
-#define SgnMask 16
-#define sTopMask12 32
-#define iBrkValue 48
-#define iOffExpoMask 64
-#define sPoly 80
-#define sLn2 208
-#define TinyRange 224
+#define LOCAL_DATA_NAME __svml_satanh_data_internal
+#include "svml_s_common_sse4_rodata_offsets.h"
+
+/* Offsets for data table __svml_satanh_data_internal. */
+#define _Poly_1 0
+#define _Poly_2 16
+#define _Poly_3 32
+#define _Poly_4 48
+#define _Poly_5 64
+#define _Poly_6 80
+#define _Poly_7 96
+#define _TinyRange 112
#include <sysdep.h>
-#define ATANHF_DATA(x) ((x)+__svml_satanh_data_internal)
.section .text.sse4, "ax", @progbits
ENTRY(_ZGVbN4v_atanhf_sse4)
movaps %xmm0, %xmm5
- /* Load constants including One = 1 */
- movups ATANHF_DATA(sOne)(%rip), %xmm4
+ /* Load constants including One = 1. */
+ movups COMMON_DATA(_OneF)(%rip), %xmm4
movaps %xmm5, %xmm3
- /* Strip off the sign, so treat X as positive until right at the end */
- movups ATANHF_DATA(SgnMask)(%rip), %xmm1
+ /* Strip off the sign, so treat X as positive until right at the
+ end. */
+ movups COMMON_DATA(_AbsMask)(%rip), %xmm1
movaps %xmm4, %xmm2
andps %xmm1, %xmm0
movaps %xmm4, %xmm10
- movups ATANHF_DATA(sTopMask12)(%rip), %xmm11
+ movups COMMON_DATA(_Neg4096)(%rip), %xmm11
movaps %xmm4, %xmm14
movaps %xmm11, %xmm9
- /*
- * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
- * the upper part UHi being <= 12 bits long. Then we have
- * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
- */
+	/* Compute V = 2 * X trivially, and UHi + ULo = 1 - X in two
+ pieces, the upper part UHi being <= 12 bits long. Then we
+ have:
+ atanh(X) = 1/2 * log((1 + X) / (1 - X))
+ = 1/2 * log1p(V / (UHi + ULo)). */
movaps %xmm0, %xmm6
mulps %xmm5, %xmm3
subps %xmm0, %xmm2
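+	/* For reference, a scalar C sketch of this step (illustrative
+	   only; names are local to the sketch, not part of the build):
+	       float v = 2.0f * x;
+	       float u = 1.0f - x;
+	       return 0.5f * log1pf (v / u);
+	   U (and V) are split into high/low pieces below purely so the
+	   rounding error of the division can be recovered.  */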
@@ -80,65 +79,61 @@ ENTRY(_ZGVbN4v_atanhf_sse4)
andps %xmm2, %xmm9
- /*
- * Check whether |X| < 1, in which case we use the main function.
- * Otherwise set the rangemask so that the callout will get used.
- * Note that this will also use the callout for NaNs since not(NaN < 1).
- */
+ /* Check whether |X| < 1, in which case we use the main
+ function. Otherwise set the rangemask so that the callout
+ will get used. Note that this will also use the callout for
+ NaNs since not(NaN < 1). */
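+	/* In C terms the callout predicate is (illustrative sketch):
+	       int need_callout = !(fabsf (x) < 1.0f);
+	   which is also true for NaN inputs, since any ordered
+	   comparison involving NaN is false.  */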
rcpps %xmm9, %xmm7
subps %xmm9, %xmm2
andps %xmm11, %xmm7
- /*
- * Split V as well into upper 12 bits and lower part, so that we can get
- * a preliminary quotient estimate without rounding error.
- */
+ /* Split V as well into upper 12 bits and lower part, so that we
+ can get a preliminary quotient estimate without rounding
+ error. */
andps %xmm6, %xmm11
mulps %xmm7, %xmm9
addps %xmm2, %xmm10
subps %xmm11, %xmm6
- /* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
+ /* Hence get initial quotient estimate:
+ QHi + QLo = R * VHi + R * VLo. */
mulps %xmm7, %xmm11
mulps %xmm7, %xmm10
subps %xmm9, %xmm14
mulps %xmm6, %xmm7
subps %xmm10, %xmm14
- /* Compute D = E + E^2 */
+ /* Compute D = E + E^2. */
movaps %xmm14, %xmm13
movaps %xmm4, %xmm8
mulps %xmm14, %xmm13
- /* reduction: compute r,n */
- movdqu ATANHF_DATA(iBrkValue)(%rip), %xmm9
+	/* Reduction: compute r, n. */
+ movdqu COMMON_DATA(_IBrkValue)(%rip), %xmm9
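+	/* A scalar C sketch of this reduction (illustrative; bits ()
+	   and frombits () are hypothetical float<->int32 casts): with
+	   m = 1 + arg and B = bits (2.0f / 3.0f),
+	       int32_t k = (int32_t) (bits (m) - B) >> 23;
+	       float r   = frombits (bits (m) - (k << 23)) - 1.0f;
+	   so that log1p (arg) = k * ln2 + log1p (r), with 1 + r in
+	   [2/3, 4/3).  */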
addps %xmm13, %xmm14
- /*
- * Compute R * (VHi + VLo) * (1 + E + E^2)
- * = R * (VHi + VLo) * (1 + D)
- * = QHi + (QHi * D + QLo + QLo * D)
- */
+ /* Compute R * (VHi + VLo) * (1 + E + E^2)
+ = R * (VHi + VLo) * (1 + D)
+ = QHi + (QHi * D + QLo + QLo * D). */
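+	/* Scalar C sketch of the quotient refinement (illustrative;
+	   top12 () is a hypothetical helper keeping the 12 high bits):
+	       float r   = top12 (1.0f / uhi);
+	       float qhi = r * vhi, qlo = r * vlo;
+	       float e   = 1.0f - r * uhi - r * ulo;
+	       float d   = e + e * e;
+	       float q   = qhi + (qhi * d + qlo + qlo * d);
+	   With r a 12-bit approximation, e is tiny and 1 + d
+	   approximates 1 / (1 - e) to well beyond single precision. */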
movaps %xmm14, %xmm2
mulps %xmm7, %xmm14
mulps %xmm11, %xmm2
addps %xmm14, %xmm7
- movdqu ATANHF_DATA(iOffExpoMask)(%rip), %xmm12
+ movdqu COMMON_DATA(_NotiOffExpoMask)(%rip), %xmm12
movaps %xmm4, %xmm14
- /* Record the sign for eventual reincorporation. */
+ /* Record the sign for eventual reincorporation. */
addps %xmm7, %xmm2
- /*
- * Now finally accumulate the high and low parts of the
- * argument to log1p, H + L, with a final compensated summation.
- */
+ /* Now finally accumulate the high and low parts of the
+ argument to log1p, H + L, with a final compensated summation. */
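+	/* The summation step in scalar C (illustrative; Fast2Sum,
+	   assuming the max/min pair below orders |a| >= |b|):
+	       float s = a + b;
+	       float t = b - (s - a);
+	   so s + t represents a + b exactly.  */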
movaps %xmm2, %xmm6
andnps %xmm5, %xmm1
movaps %xmm4, %xmm7
- /* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
+ /* Or the sign bit in with the tiny result to handle atanh(-0)
+ correctly. */
addps %xmm11, %xmm6
maxps %xmm6, %xmm7
minps %xmm6, %xmm8
@@ -149,43 +144,43 @@ ENTRY(_ZGVbN4v_atanhf_sse4)
subps %xmm10, %xmm7
psubd %xmm9, %xmm10
addps %xmm8, %xmm7
- pand %xmm10, %xmm12
+ pandn %xmm10, %xmm12
psrad $23, %xmm10
cvtdq2ps %xmm10, %xmm13
addps %xmm7, %xmm2
- /* final reconstruction */
+	/* Final reconstruction. */
pslld $23, %xmm10
paddd %xmm9, %xmm12
psubd %xmm10, %xmm14
- /* polynomial evaluation */
+	/* Polynomial evaluation. */
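+	/* The sequence below is plain Horner; in scalar C terms
+	   (illustrative; P7..P1 correspond to the _Poly_1.._Poly_7
+	   table entries):
+	       float p = P7;
+	       p = p * r + P6;  p = p * r + P5;  p = p * r + P4;
+	       p = p * r + P3;  p = p * r + P2;  p = p * r + P1;
+	       p = p * r - 0.5f;
+	       float lg = k * ln2 + (r + p * r * r);
+	   giving log1p of the reduced argument; halving and the sign
+	   are folded back in afterwards.  */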
subps %xmm4, %xmm12
mulps %xmm14, %xmm2
- movups ATANHF_DATA(sPoly+0)(%rip), %xmm7
+ movups LOCAL_DATA(_Poly_1)(%rip), %xmm7
addps %xmm12, %xmm2
mulps %xmm2, %xmm7
- /* Finally, halve the result and reincorporate the sign */
- addps ATANHF_DATA(sPoly+16)(%rip), %xmm7
+ /* Finally, halve the result and reincorporate the sign. */
+ addps LOCAL_DATA(_Poly_2)(%rip), %xmm7
mulps %xmm2, %xmm7
- addps ATANHF_DATA(sPoly+32)(%rip), %xmm7
+ addps LOCAL_DATA(_Poly_3)(%rip), %xmm7
mulps %xmm2, %xmm7
- addps ATANHF_DATA(sPoly+48)(%rip), %xmm7
+ addps LOCAL_DATA(_Poly_4)(%rip), %xmm7
mulps %xmm2, %xmm7
- addps ATANHF_DATA(sPoly+64)(%rip), %xmm7
+ addps LOCAL_DATA(_Poly_5)(%rip), %xmm7
mulps %xmm2, %xmm7
- addps ATANHF_DATA(sPoly+80)(%rip), %xmm7
+ addps LOCAL_DATA(_Poly_6)(%rip), %xmm7
mulps %xmm2, %xmm7
- addps ATANHF_DATA(sPoly+96)(%rip), %xmm7
+ addps LOCAL_DATA(_Poly_7)(%rip), %xmm7
mulps %xmm2, %xmm7
- movaps ATANHF_DATA(sPoly+112)(%rip), %xmm6
+ movaps COMMON_DATA(_Neg5F)(%rip), %xmm6
addps %xmm6, %xmm7
mulps %xmm2, %xmm7
mulps %xmm2, %xmm7
- mulps ATANHF_DATA(sLn2)(%rip), %xmm13
- /* We can build `sHalf` with `sPoly & sOne`. */
+ mulps COMMON_DATA(_Ln2)(%rip), %xmm13
+	/* We can build `sHalf` with `_Neg5F & _OneF`:
+	   (0xbf000000 & 0x3f800000) == 0x3f000000 == 0.5f. */
andps %xmm4, %xmm6
orps %xmm1, %xmm3
xorps %xmm6, %xmm1
@@ -197,7 +192,7 @@ ENTRY(_ZGVbN4v_atanhf_sse4)
/* Finish check of NaNs. */
cmpleps %xmm0, %xmm4
movmskps %xmm4, %edx
- cmpltps ATANHF_DATA(TinyRange)(%rip), %xmm0
+ cmpltps LOCAL_DATA(_TinyRange)(%rip), %xmm0
andps %xmm0, %xmm3
andnps %xmm1, %xmm0
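+	/* The and/andn pair is a branchless select; in C terms
+	   (illustrative):
+	       res = tiny ? x : (sign | main_result);
+	   The tiny path returns x itself, since atanh (x) ~= x for
+	   tiny x, and the sign-or keeps atanhf (-0.0f) == -0.0f.  */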
@@ -206,115 +201,84 @@ ENTRY(_ZGVbN4v_atanhf_sse4)
testl %edx, %edx
/* Go to special inputs processing branch. */
jne L(SPECIAL_VALUES_BRANCH)
- # LOE rbx rbp r12 r13 r14 r15 xmm0
+
/* No registers to restore on fast path. */
ret
/* Cold case. edx has 1s where there was a special value that
	   needs to be handled by an atanhf call. Optimize for code size
- more so than speed here. */
+ more so than speed here. */
L(SPECIAL_VALUES_BRANCH):
- # LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm5
+
/* Stack coming in 16-byte aligned. Set 8-byte misaligned so on
- call entry will be 16-byte aligned. */
+	   call entry it will be 16-byte aligned. */
subq $56, %rsp
- cfi_def_cfa_offset(64)
+ cfi_def_cfa_offset (64)
movups %xmm0, 24(%rsp)
movups %xmm5, 40(%rsp)
/* Use rbx/rbp for callee save registers as they get short
- encoding for many instructions (as compared with r12/r13). */
+ encoding for many instructions (as compared with r12/r13). */
movq %rbx, (%rsp)
- cfi_offset(rbx, -64)
+ cfi_offset (rbx, -64)
movq %rbp, 8(%rsp)
- cfi_offset(rbp, -56)
- /* edx has 1s where there was a special value that needs to be handled
- by a tanhf call. */
+ cfi_offset (rbp, -56)
+	/* edx has 1s where there was a special value that needs to
+	   be handled by an atanhf call. */
movl %edx, %ebx
L(SPECIAL_VALUES_LOOP):
- # LOE rbx rbp r12 r13 r14 r15
- /* use rbp as index for special value that is saved across calls to
- tanhf. We technically don't need a callee save register here as offset
- to rsp is always [0, 12] so we can restore rsp by realigning to 64.
- Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
- in the loop. */
+
+	/* Use rbp as the index for the special value that is saved
+	   across calls to atanhf. We technically don't need a callee
+	   save register here as the offset to rsp is always [0, 12],
+	   so we can restore rsp by realigning to 64. Essentially the
+	   tradeoff is 1 extra save/restore vs 2 extra instructions
+	   in the loop. */
xorl %ebp, %ebp
bsfl %ebx, %ebp
	/* Scalar math function call to process special input. */
movss 40(%rsp, %rbp, 4), %xmm0
call atanhf@PLT
- /* No good way to avoid the store-forwarding fault this will cause on
- return. `lfence` avoids the SF fault but at greater cost as it
- serialized stack/callee save restoration. */
+	/* No good way to avoid the store-forwarding fault this will
+	   cause on return. `lfence` avoids the SF fault but at
+	   greater cost as it serializes the stack/callee save
+	   restoration. */
movss %xmm0, 24(%rsp, %rbp, 4)
leal -1(%rbx), %eax
andl %eax, %ebx
jnz L(SPECIAL_VALUES_LOOP)
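+	/* Loop shape in scalar C (illustrative): process each set
+	   bit, clearing the lowest one per iteration:
+	       while (mask) {
+	           int i = __builtin_ctz (mask);
+	           dst[i] = atanhf (src[i]);
+	           mask &= mask - 1;
+	       }
+	*/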
- # LOE r12 r13 r14 r15
+
/* All results have been written to 24(%rsp). */
movups 24(%rsp), %xmm0
movq (%rsp), %rbx
- cfi_restore(rbx)
+ cfi_restore (rbx)
movq 8(%rsp), %rbp
- cfi_restore(rbp)
+ cfi_restore (rbp)
addq $56, %rsp
- cfi_def_cfa_offset(8)
+ cfi_def_cfa_offset (8)
ret
END(_ZGVbN4v_atanhf_sse4)
- .section .rodata, "a"
+ .section .rodata.sse4, "a"
.align 16
-#ifdef __svml_satanh_data_internal_typedef
-typedef unsigned int VUINT32;
-typedef struct{
- __declspec(align(16)) VUINT32 sOne[4][1];
- __declspec(align(16)) VUINT32 SgnMask[4][1];
- __declspec(align(16)) VUINT32 sTopMask12[4][1];
- __declspec(align(16)) VUINT32 iBrkValue[4][1];
- __declspec(align(16)) VUINT32 iOffExpoMask[4][1];
- __declspec(align(16)) VUINT32 sPoly[8][4][1];
- __declspec(align(16)) VUINT32 sLn2[4][1];
- __declspec(align(16)) VUINT32 TinyRange[4][1];
-} __svml_satanh_data_internal;
-#endif
-
-__svml_satanh_data_internal:
- /* sOne = SP 1.0 */
- .align 16
- .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
- /* SgnMask */
- .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
- /* sTopMask12 */
- .align 16
- .long 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
- /* iBrkValue = SP 2/3 */
- .align 16
- .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
- /* iOffExpoMask = SP significand mask ==*/
- .align 16
- .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
-
- /* sPoly[] = SP polynomial */
- .align 16
- .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
- .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
- .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
- .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
- .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
- .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
- .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
- .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
-
- /* sLn2 = SP ln(2) */
- .align 16
- .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
- /* TinyRange */
- .align 16
- .long 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
- .align 16
- .type __svml_satanh_data_internal, @object
- .size __svml_satanh_data_internal, .-__svml_satanh_data_internal
+LOCAL_DATA_NAME:
+ /* _Poly[] = SP polynomial. */
+ /* 1.3820238411426544189453125e-01 P7. */
+ DATA_VEC (LOCAL_DATA_NAME, _Poly_1, 0x3e0d84ed)
+ /* -1.5122179687023162841796875e-01 P6. */
+ DATA_VEC (LOCAL_DATA_NAME, _Poly_2, 0xbe1ad9e3)
+ /* 1.4042308926582336425781250e-01 P5. */
+ DATA_VEC (LOCAL_DATA_NAME, _Poly_3, 0x3e0fcb12)
+ /* -1.6472326219081878662109375e-01 P4. */
+ DATA_VEC (LOCAL_DATA_NAME, _Poly_4, 0xbe28ad37)
+ /* 2.0007920265197753906250000e-01 P3. */
+ DATA_VEC (LOCAL_DATA_NAME, _Poly_5, 0x3e4ce190)
+ /* -2.5004237890243530273437500e-01 P2. */
+ DATA_VEC (LOCAL_DATA_NAME, _Poly_6, 0xbe80058e)
+ /* 3.3333265781402587890625000e-01 P1. */
+ DATA_VEC (LOCAL_DATA_NAME, _Poly_7, 0x3eaaaa94)
+ DATA_VEC (LOCAL_DATA_NAME, _TinyRange, 0x0C000000)
+ .type LOCAL_DATA_NAME, @object
+ .size LOCAL_DATA_NAME, .-LOCAL_DATA_NAME