@@ -28,146 +28,81 @@
*
*/
-/* Offsets for data table __svml_satan_data_internal_avx512
- */
-#define AbsMask 0
-#define Shifter 64
-#define MaxThreshold 128
-#define MOne 192
-#define One 256
-#define LargeX 320
-#define Zero 384
-#define Tbl_H 448
-#define Pi2 576
-#define coeff_1 640
-#define coeff_2 704
-#define coeff_3 768
+#define LOCAL_DATA_NAME __svml_satan_data_internal
+#include "svml_s_common_evex512_rodata_offsets.h"
+/* Byte offsets into data table __svml_satan_data_internal: one 64-byte vector per entry, in table order (_sPC8 first, _sPIO2 last). */
+#define _sPC8 0
+#define _sPC7 64
+#define _sPC6 128
+#define _sPC5 192
+#define _sPC4 256
+#define _sPC3 320
+#define _sPC2 384
+#define _sPC1 448
+#define _sPIO2 512
#include <sysdep.h>
.section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN16v_atanf_skx)
- vandps __svml_satan_data_internal_avx512(%rip), %zmm0, %zmm7
- vmovups MaxThreshold+__svml_satan_data_internal_avx512(%rip), %zmm3
- vmovups One+__svml_satan_data_internal_avx512(%rip), %zmm8
-
- /* round to 2 bits after binary point */
- vreduceps $40, {sae}, %zmm7, %zmm5
-
- /* saturate X range */
- vmovups LargeX+__svml_satan_data_internal_avx512(%rip), %zmm6
- vmovups Shifter+__svml_satan_data_internal_avx512(%rip), %zmm2
- vcmpps $29, {sae}, %zmm3, %zmm7, %k1
-
- /* table lookup sequence */
- vmovups Tbl_H+__svml_satan_data_internal_avx512(%rip), %zmm3
- vsubps {rn-sae}, %zmm5, %zmm7, %zmm4
- vaddps {rn-sae}, %zmm2, %zmm7, %zmm1
- vxorps %zmm0, %zmm7, %zmm0
- vfmadd231ps {rn-sae}, %zmm7, %zmm4, %zmm8
- vmovups coeff_2+__svml_satan_data_internal_avx512(%rip), %zmm4
-
- /* if|X|>=MaxThreshold, set DiffX=-1 */
- vblendmps MOne+__svml_satan_data_internal_avx512(%rip), %zmm5, %zmm9{%k1}
- vmovups coeff_3+__svml_satan_data_internal_avx512(%rip), %zmm5
-
- /* if|X|>=MaxThreshold, set Y=X */
- vminps {sae}, %zmm7, %zmm6, %zmm8{%k1}
-
- /* R+Rl = DiffX/Y */
- vgetmantps $0, {sae}, %zmm9, %zmm12
- vgetexpps {sae}, %zmm9, %zmm10
- vpermt2ps Tbl_H+64+__svml_satan_data_internal_avx512(%rip), %zmm1, %zmm3
- vgetmantps $0, {sae}, %zmm8, %zmm15
- vgetexpps {sae}, %zmm8, %zmm11
- vmovups coeff_1+__svml_satan_data_internal_avx512(%rip), %zmm1
-
- /* set table value to Pi/2 for large X */
- vblendmps Pi2+__svml_satan_data_internal_avx512(%rip), %zmm3, %zmm9{%k1}
- vrcp14ps %zmm15, %zmm13
- vsubps {rn-sae}, %zmm11, %zmm10, %zmm2
- vmulps {rn-sae}, %zmm13, %zmm12, %zmm14
- vfnmadd213ps {rn-sae}, %zmm12, %zmm14, %zmm15
- vfmadd213ps {rn-sae}, %zmm14, %zmm13, %zmm15
- vscalefps {rn-sae}, %zmm2, %zmm15, %zmm7
-
- /* polynomial evaluation */
- vmulps {rn-sae}, %zmm7, %zmm7, %zmm8
- vmulps {rn-sae}, %zmm7, %zmm8, %zmm6
- vfmadd231ps {rn-sae}, %zmm8, %zmm1, %zmm4
- vfmadd213ps {rn-sae}, %zmm5, %zmm4, %zmm8
- vfmadd213ps {rn-sae}, %zmm7, %zmm6, %zmm8
- vaddps {rn-sae}, %zmm9, %zmm8, %zmm10
- vxorps %zmm0, %zmm10, %zmm0
+ /* 1) If x > 1, then r = -1/x, PIO2 = Pi/2
+ 2) If -1 <= x <= 1, then r = x, PIO2 = 0
+ 3) If x < -1, then r = -1/x, PIO2 = -Pi/2.  Result: r * P(r^2) + PIO2.  */
+ vmovups COMMON_DATA(_OneF)(%rip), %zmm2  /* zmm2 = 1.0f (per the _OneF name; defined outside this file).  */
+ vmovups COMMON_DATA(_SignMask)(%rip), %zmm7  /* zmm7 = sign-bit mask (per the _SignMask name).  */
+
+
+ /* Use vpminud/vpmaxud for the reduction: non-negative floats order like their bit patterns. */
+ vandnps %zmm0, %zmm7, %zmm3  /* zmm3 = x & ~SignMask = |x|.  */
+ vpcmpgtd %zmm2, %zmm3, %k1  /* k1 = |x| > 1.0, i.e. lanes that use 1/x.  */
+
+ vpmaxud %zmm3, %zmm2, %zmm4  /* zmm4 = max (|x|, 1.0).  */
+ vpminud %zmm3, %zmm2, %zmm5  /* zmm5 = min (|x|, 1.0).  */
+
+ vdivps %zmm4, %zmm5, %zmm4  /* zmm4 = min / max = |r|.  */
+
+ vandps %zmm7, %zmm0, %zmm3  /* zmm3 = sign bit of x.  */
+ vmovdqa32 %zmm7, %zmm7{%k1}{z}  /* zmm7 = sign mask in k1 lanes, 0 elsewhere.  */
+
+ vmulps %zmm4, %zmm4, %zmm1  /* zmm1 = r^2.  */
+ vpternlogq $0x96, %zmm3, %zmm4, %zmm7  /* 0x96 = 3-way XOR: zmm7 = |r| ^ sign (x) ^ flip = signed r.  */
+
+ /* Polynomial: two independent Horner chains in r^4, merged with r^2 at the end. */
+
+ vmovups LOCAL_DATA(_sPC8)(%rip), %zmm0  /* Chain in zmm0: PC8, PC6, PC4, PC2.  */
+ vmovups LOCAL_DATA(_sPC7)(%rip), %zmm4  /* Chain in zmm4: PC7, PC5, PC3, PC1.  */
+
+ vmulps %zmm1, %zmm1, %zmm5  /* zmm5 = r^4.  */
+
+ vfmadd213ps LOCAL_DATA(_sPC6)(%rip), %zmm5, %zmm0
+ vfmadd213ps LOCAL_DATA(_sPC5)(%rip), %zmm5, %zmm4
+ vfmadd213ps LOCAL_DATA(_sPC4)(%rip), %zmm5, %zmm0
+ vfmadd213ps LOCAL_DATA(_sPC3)(%rip), %zmm5, %zmm4
+ vfmadd213ps LOCAL_DATA(_sPC2)(%rip), %zmm5, %zmm0
+ vfmadd213ps LOCAL_DATA(_sPC1)(%rip), %zmm5, %zmm4
+ vfmadd213ps %zmm4, %zmm1, %zmm0  /* zmm0 = zmm0 * r^2 + zmm4 (merge the two chains).  */
+ vfmadd213ps %zmm2, %zmm1, %zmm0  /* zmm0 = zmm0 * r^2 + 1.0 = P(r^2).  */
+ vorps LOCAL_DATA(_sPIO2)(%rip), %zmm3, %zmm3{%k1}  /* k1 lanes: sign (x) | Pi/2; other lanes keep the bare sign bit (+-0).  */
+
+ /* Reconstruction: zmm0 = r * P(r^2) + PIO2. */
+ vfmadd213ps %zmm3, %zmm7, %zmm0
ret
END(_ZGVeN16v_atanf_skx)
- .section .rodata, "a"
+ .section .rodata.evex512, "a"
.align 64
-#ifdef __svml_satan_data_internal_avx512_typedef
-typedef unsigned int VUINT32;
-typedef struct {
- __declspec(align(64)) VUINT32 AbsMask[16][1];
- __declspec(align(64)) VUINT32 Shifter[16][1];
- __declspec(align(64)) VUINT32 MaxThreshold[16][1];
- __declspec(align(64)) VUINT32 MOne[16][1];
- __declspec(align(64)) VUINT32 One[16][1];
- __declspec(align(64)) VUINT32 LargeX[16][1];
- __declspec(align(64)) VUINT32 Zero[16][1];
- __declspec(align(64)) VUINT32 Tbl_H[32][1];
- __declspec(align(64)) VUINT32 Pi2[16][1];
- __declspec(align(64)) VUINT32 coeff[3][16][1];
-} __svml_satan_data_internal_avx512;
-#endif
-__svml_satan_data_internal_avx512:
- /* AbsMask */
- .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
- /* Shifter */
- .align 64
- .long 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000, 0x4a000000
- /* MaxThreshold */
- .align 64
- .long 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000, 0x40F80000
- /* MOne */
- .align 64
- .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
- /* One */
- .align 64
- .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
- /* LargeX */
- .align 64
- .long 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000, 0x4f800000
- /* Zero */
- .align 64
- .long 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
- /* Tbl_H */
- .align 64
- .long 0x00000000, 0x3e7adbb0
- .long 0x3eed6338, 0x3f24bc7d
- .long 0x3f490fdb, 0x3f6563e3
- .long 0x3f7b985f, 0x3f869c79
- .long 0x3f8db70d, 0x3f93877b
- .long 0x3f985b6c, 0x3f9c6b53
- .long 0x3f9fe0bb, 0x3fa2daa4
- .long 0x3fa57088, 0x3fa7b46f
- .long 0x3fa9b465, 0x3fab7b7a
- .long 0x3fad1283, 0x3fae809e
- .long 0x3fafcb99, 0x3fb0f836
- .long 0x3fb20a6a, 0x3fb30581
- .long 0x3fb3ec43, 0x3fb4c10a
- .long 0x3fb585d7, 0x3fb63c64
- .long 0x3fb6e62c, 0x3fb78478
- .long 0x3fb81868, 0x3fb8a2f5
- /* Pi2 */
- .align 64
- .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
- /* coeff3 */
- .align 64
- .long 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de, 0xbe0fa8de
- .long 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2, 0x3e4cc8e2
- .long 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa, 0xbeaaaaaa
- .align 64
- .type __svml_satan_data_internal_avx512, @object
- .size __svml_satan_data_internal_avx512, .-__svml_satan_data_internal_avx512
+LOCAL_DATA_NAME:  /* Constant table for atanf; entry order must match the _sPC8 ... _sPIO2 offset #defines.  */
+ DATA_VEC (LOCAL_DATA_NAME, _sPC8, 0x3B322CC0)  /* NOTE(review): DATA_VEC is defined elsewhere; presumably it replicates the value across one vector at the named offset -- confirm.  */
+ DATA_VEC (LOCAL_DATA_NAME, _sPC7, 0xBC7F2631)
+ DATA_VEC (LOCAL_DATA_NAME, _sPC6, 0x3D2BC384)
+ DATA_VEC (LOCAL_DATA_NAME, _sPC5, 0xBD987629)
+ DATA_VEC (LOCAL_DATA_NAME, _sPC4, 0x3DD96474)
+ DATA_VEC (LOCAL_DATA_NAME, _sPC3, 0xBE1161F8)
+ DATA_VEC (LOCAL_DATA_NAME, _sPC2, 0x3E4CB79F)  /* Close to 1/5 as a float.  */
+ DATA_VEC (LOCAL_DATA_NAME, _sPC1, 0xBEAAAA49)  /* Close to -1/3 as a float; multiplies r^2 in P.  */
+ DATA_VEC (LOCAL_DATA_NAME, _sPIO2, 0x3FC90FDB)  /* Pi/2 rounded to float.  */
+
+ .type LOCAL_DATA_NAME, @object
+ .size LOCAL_DATA_NAME, .-LOCAL_DATA_NAME