Message ID | 20220620200210.2772523-1-goldstein.w.n@gmail.com |
---|---|
State | New |
Series | [v1] x86: Replace all sse instructions with vex equivilent in avx+ files |
On Mon, Jun 20, 2022 at 1:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Most of these don't really matter as there was no dirty upper state
> but we should generally avoid stray sse when its not needed.
>
> The one case that really matters is in svml_d_tanh4_core_avx2.S:
>
>     blendvps %xmm0, %xmm8, %xmm7
>
> When there was a dirty upper state.
>
> Tested on x86_64-linux
> ---
>  sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_d_acosh4_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_d_acosh8_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_d_asin4_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_d_asin8_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_d_asinh8_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_d_atan24_core_avx2.S | 6 +++---
>  sysdeps/x86_64/fpu/multiarch/svml_d_atan28_core_avx512.S | 6 +++---
>  sysdeps/x86_64/fpu/multiarch/svml_d_atanh4_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_d_atanh8_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_d_cbrt4_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_d_cosh4_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_d_cosh8_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_d_erfc4_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_d_erfc8_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_d_exp104_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_d_exp108_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_d_exp24_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_d_exp28_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_d_expm14_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_d_expm18_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core_avx2.S | 6 +++---
>  sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core_avx512.S | 6 +++---
>  sysdeps/x86_64/fpu/multiarch/svml_d_log104_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_d_log108_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_d_log1p4_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_d_log1p8_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_d_log24_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_d_log28_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_d_sinh4_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_d_sinh8_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_d_tan4_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_d_tan8_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_d_tanh4_core_avx2.S | 6 +++---
>  sysdeps/x86_64/fpu/multiarch/svml_d_tanh8_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_acoshf16_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_acoshf8_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_asinf16_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_asinf8_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_asinhf16_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_asinhf8_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_atan2f16_core_avx512.S | 6 +++---
>  sysdeps/x86_64/fpu/multiarch/svml_s_atan2f8_core_avx2.S | 6 +++---
>  sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf8_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_coshf16_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_coshf8_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_erfcf16_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_erfcf8_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_exp10f16_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_exp10f8_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_exp2f16_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_exp2f8_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_expm1f16_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_expm1f8_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core_avx512.S | 6 +++---
>  sysdeps/x86_64/fpu/multiarch/svml_s_hypotf8_core_avx2.S | 6 +++---
>  sysdeps/x86_64/fpu/multiarch/svml_s_log10f16_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_log10f8_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_log1pf16_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_log1pf8_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_log2f16_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_log2f8_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_sinhf16_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_sinhf8_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_tanf8_core_avx2.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S | 4 ++--
>  sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S | 4 ++--
>  sysdeps/x86_64/multiarch/strrchr-avx2.S | 2 +-
>  75 files changed, 158 insertions(+), 158 deletions(-)

LGTM. Thanks.
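For context on the "dirty upper state" mentioned in the commit message: executing a legacy-SSE instruction while the upper 128 bits of the YMM registers are dirty (e.g. after a 256-bit AVX operation with no intervening vzeroupper) can incur an AVX/SSE transition penalty on some Intel cores and a false dependency on others, and the legacy encoding also leaves bits 128-255 of its destination unmodified, whereas the VEX-128 form zeroes them. Below is a minimal sketch of the hazard and of the VEX replacement the patch applies in svml_d_tanh4_core_avx2.S; the function label and the surrounding instructions are illustrative, not taken from the patch:

	.text
	.globl	sse2avx_example
	.type	sse2avx_example, @function
sse2avx_example:
	/* 256-bit AVX op: the upper halves of %ymm7/%ymm8 are now "dirty".  */
	vaddpd	%ymm8, %ymm7, %ymm7
	/* Legacy-SSE blend: risks a transition penalty or false dependency,
	   and bits 128-255 of %ymm7 keep their stale contents.  */
	blendvps %xmm0, %xmm8, %xmm7
	/* VEX-encoded equivalent as used by the patch: no transition, and
	   the upper bits of %ymm7 are zeroed.  */
	vblendvps %xmm0, %xmm8, %xmm7, %xmm7
	vzeroupper
	ret
	.size	sse2avx_example, .-sse2avx_example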
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S index e19bddd2e2..73025e8b0f 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S @@ -210,11 +210,11 @@ L(SPECIAL_VALUES_LOOP): L(SCALAR_MATH_CALL): movl %r12d, %r14d - movsd 32(%rsp, %r14, 8), %xmm0 + vmovsd 32(%rsp, %r14, 8), %xmm0 call acos@PLT # LOE rbx r14 r15 r12d r13d xmm0 - movsd %xmm0, 64(%rsp, %r14, 8) + vmovsd %xmm0, 64(%rsp, %r14, 8) /* Process special inputs in loop */ jmp L(SPECIAL_VALUES_LOOP) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S index f4c72c3618..b8cc6dd776 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S @@ -232,11 +232,11 @@ L(SPECIAL_VALUES_LOOP): L(SCALAR_MATH_CALL): movl %r12d, %r14d - movsd 64(%rsp, %r14, 8), %xmm0 + vmovsd 64(%rsp, %r14, 8), %xmm0 call acos@PLT # LOE rbx r14 r15 r12d r13d xmm0 - movsd %xmm0, 128(%rsp, %r14, 8) + vmovsd %xmm0, 128(%rsp, %r14, 8) /* Process special inputs in loop */ jmp L(SPECIAL_VALUES_LOOP) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acosh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acosh4_core_avx2.S index 5d0b23b72c..126110cf17 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_acosh4_core_avx2.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acosh4_core_avx2.S @@ -372,11 +372,11 @@ L(SPECIAL_VALUES_LOOP): L(SCALAR_MATH_CALL): movl %r12d, %r14d - movsd 32(%rsp, %r14, 8), %xmm0 + vmovsd 32(%rsp, %r14, 8), %xmm0 call acosh@PLT # LOE rbx r14 r15 r12d r13d xmm0 - movsd %xmm0, 64(%rsp, %r14, 8) + vmovsd %xmm0, 64(%rsp, %r14, 8) /* Process special inputs in loop */ jmp L(SPECIAL_VALUES_LOOP) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acosh8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acosh8_core_avx512.S index b9a1131664..db0ef3b9dd 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_acosh8_core_avx512.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acosh8_core_avx512.S @@ -317,11 +317,11 @@ L(SPECIAL_VALUES_LOOP): L(SCALAR_MATH_CALL): movl %r12d, %r14d - movsd 64(%rsp, %r14, 8), %xmm0 + vmovsd 64(%rsp, %r14, 8), %xmm0 call acosh@PLT # LOE rbx r14 r15 r12d r13d xmm0 - movsd %xmm0, 128(%rsp, %r14, 8) + vmovsd %xmm0, 128(%rsp, %r14, 8) /* Process special inputs in loop */ jmp L(SPECIAL_VALUES_LOOP) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_asin4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_asin4_core_avx2.S index ba96089504..612a45da30 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_asin4_core_avx2.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_asin4_core_avx2.S @@ -202,11 +202,11 @@ L(SPECIAL_VALUES_LOOP): L(SCALAR_MATH_CALL): movl %r12d, %r14d - movsd 32(%rsp, %r14, 8), %xmm0 + vmovsd 32(%rsp, %r14, 8), %xmm0 call asin@PLT # LOE rbx r14 r15 r12d r13d xmm0 - movsd %xmm0, 64(%rsp, %r14, 8) + vmovsd %xmm0, 64(%rsp, %r14, 8) /* Process special inputs in loop */ jmp L(SPECIAL_VALUES_LOOP) diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_asin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_asin8_core_avx512.S index 0f5b773b04..e7b41ab232 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_asin8_core_avx512.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_asin8_core_avx512.S @@ -224,11 +224,11 @@ L(SPECIAL_VALUES_LOOP): L(SCALAR_MATH_CALL): movl %r12d, %r14d - movsd 64(%rsp, %r14, 8), %xmm0 + vmovsd 64(%rsp, %r14, 8), %xmm0 call asin@PLT # LOE rbx r14 r15 r12d 
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S
index 131b716c95..1fcbb245b7 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S
@@ -429,11 +429,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	asinh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh8_core_avx512.S
index 5bdc6859f0..8445fc8ba4 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh8_core_avx512.S
@@ -343,11 +343,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	asinh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atan24_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atan24_core_avx2.S
index 1b601576cc..a45cae79a1 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_atan24_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atan24_core_avx2.S
@@ -277,12 +277,12 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
-	movsd	64(%rsp, %r14, 8), %xmm1
+	vmovsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm1
 	call	atan2@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 96(%rsp, %r14, 8)
+	vmovsd	%xmm0, 96(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atan28_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atan28_core_avx512.S
index ef9581075d..c3b0f7940c 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_atan28_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atan28_core_avx512.S
@@ -295,12 +295,12 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
-	movsd	128(%rsp, %r14, 8), %xmm1
+	vmovsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	128(%rsp, %r14, 8), %xmm1
 	call	atan2@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 192(%rsp, %r14, 8)
+	vmovsd	%xmm0, 192(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atanh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atanh4_core_avx2.S
index b5cbfd224c..c9c41ef9f4 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_atanh4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atanh4_core_avx2.S
@@ -339,11 +339,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	atanh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atanh8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atanh8_core_avx512.S
index 3193c026dd..de4edb3cc0 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_atanh8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atanh8_core_avx512.S
@@ -274,11 +274,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	atanh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cbrt4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cbrt4_core_avx2.S
index 96ecbe05c1..71a25f3db8 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cbrt4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cbrt4_core_avx2.S
@@ -262,11 +262,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	cbrt@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cosh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cosh4_core_avx2.S
index 25df252108..a3d9104f5e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cosh4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cosh4_core_avx2.S
@@ -282,11 +282,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	cosh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cosh8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cosh8_core_avx512.S
index 066bbc7de6..4ff0e038a3 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cosh8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cosh8_core_avx512.S
@@ -231,11 +231,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	cosh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_erfc4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_erfc4_core_avx2.S
index c832b65e3e..6efd2e95ba 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_erfc4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_erfc4_core_avx2.S
@@ -258,11 +258,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	erfc@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_erfc8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_erfc8_core_avx512.S
index 77228814d3..42bdfe6f18 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_erfc8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_erfc8_core_avx512.S
@@ -261,11 +261,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	erfc@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp104_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp104_core_avx2.S
index 7271bcc1d9..f519bcce45 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp104_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp104_core_avx2.S
@@ -231,11 +231,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	exp10@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp108_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp108_core_avx512.S
index 40b01c3cd0..3f0c670199 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp108_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp108_core_avx512.S
@@ -191,11 +191,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	exp10@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp24_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp24_core_avx2.S
index ced774e89c..afa00a38bb 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp24_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp24_core_avx2.S
@@ -223,11 +223,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	exp2@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp28_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp28_core_avx512.S
index 7a85fd8b18..eee785dbf5 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp28_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp28_core_avx512.S
@@ -227,11 +227,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	exp2@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_expm14_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_expm14_core_avx2.S
index 590341c243..4a3202750f 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_expm14_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_expm14_core_avx2.S
@@ -205,11 +205,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	expm1@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_expm18_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_expm18_core_avx512.S
index efae1f8b66..0fa17f3a73 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_expm18_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_expm18_core_avx512.S
@@ -211,11 +211,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	expm1@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core_avx2.S
index ae5738c1b7..5c693d132e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core_avx2.S
@@ -231,12 +231,12 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
-	movsd	64(%rsp, %r14, 8), %xmm1
+	vmovsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm1
 	call	hypot@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 96(%rsp, %r14, 8)
+	vmovsd	%xmm0, 96(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core_avx512.S
index 0c404fd5ee..a392252c8b 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core_avx512.S
@@ -194,12 +194,12 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
-	movsd	128(%rsp, %r14, 8), %xmm1
+	vmovsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	128(%rsp, %r14, 8), %xmm1
 	call	hypot@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 192(%rsp, %r14, 8)
+	vmovsd	%xmm0, 192(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log104_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log104_core_avx2.S
index 2461c6ad56..9bf45a6dc2 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log104_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log104_core_avx2.S
@@ -225,11 +225,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	log10@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log108_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log108_core_avx512.S
index 5d129ef4e5..101618cce9 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log108_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log108_core_avx512.S
@@ -207,11 +207,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	log10@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log1p4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log1p4_core_avx2.S
index 13235793e8..39ec0024cf 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log1p4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log1p4_core_avx2.S
@@ -263,11 +263,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	log1p@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log1p8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log1p8_core_avx512.S
index dd55b5dd18..3033fcb5b3 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log1p8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log1p8_core_avx512.S
@@ -225,11 +225,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	log1p@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log24_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log24_core_avx2.S
index 25d2edaae5..84bdb2090d 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log24_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log24_core_avx2.S
@@ -223,11 +223,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	log2@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log28_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log28_core_avx512.S
index bcb6736dec..b3e9bb3ca4 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log28_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log28_core_avx512.S
@@ -205,11 +205,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	log2@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sinh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sinh4_core_avx2.S
index ae16600579..ad2a06ad37 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sinh4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sinh4_core_avx2.S
@@ -280,11 +280,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	sinh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sinh8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sinh8_core_avx512.S
index 075665d57d..7ca915e30f 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sinh8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sinh8_core_avx512.S
@@ -271,11 +271,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	sinh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_tan4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_tan4_core_avx2.S
index 01c86736e7..f26daf316b 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_tan4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_tan4_core_avx2.S
@@ -267,11 +267,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	tan@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_tan8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_tan8_core_avx512.S
index 376479035e..0c90328b0a 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_tan8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_tan8_core_avx512.S
@@ -239,11 +239,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	tan@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_tanh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_tanh4_core_avx2.S
index 7ddf145b25..ea41d326eb 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_tanh4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_tanh4_core_avx2.S
@@ -110,7 +110,7 @@ ENTRY(_ZGVdN4v_tanh_avx2)
 	vpcmpgtd %xmm11, %xmm9, %xmm10
 	vpcmpgtd %xmm8, %xmm9, %xmm0
 	vpand	%xmm10, %xmm9, %xmm7
-	blendvps %xmm0, %xmm8, %xmm7
+	vblendvps %xmm0, %xmm8, %xmm7, %xmm7
 
 	/*
 	 * VSHRIMM( I, iIndex, = iIndex, (17 - 4) );
@@ -272,11 +272,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	tanh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_tanh8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_tanh8_core_avx512.S
index 82c0119500..c995401a24 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_tanh8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_tanh8_core_avx512.S
@@ -286,11 +286,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	tanh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
index 26fef1f268..fd84977e95 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
@@ -205,11 +205,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	acosf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
index bf28a5dd00..078fe5a898 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
@@ -198,11 +198,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	acosf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf16_core_avx512.S
index 3f44e75248..65026e647d 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf16_core_avx512.S
@@ -290,11 +290,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	acoshf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf8_core_avx2.S
index 3a70fc1448..489dac033c 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf8_core_avx2.S
@@ -286,11 +286,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	acoshf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_asinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_asinf16_core_avx512.S
index 4e9984d870..2accef703e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_asinf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_asinf16_core_avx512.S
@@ -198,11 +198,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	asinf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_asinf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_asinf8_core_avx2.S
index 59bea9dc42..257c8da2f7 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_asinf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_asinf8_core_avx2.S
@@ -187,11 +187,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	asinf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf16_core_avx512.S
index 6b569ecf41..a0c27922e4 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf16_core_avx512.S
@@ -313,11 +313,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	asinhf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf8_core_avx2.S
index 794030a481..d6f6c3d5aa 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf8_core_avx2.S
@@ -361,11 +361,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	asinhf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f16_core_avx512.S
index 56aa5bb917..15ffa4b6c9 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f16_core_avx512.S
@@ -257,12 +257,12 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
-	movss	128(%rsp, %r14, 4), %xmm1
+	vmovss	64(%rsp, %r14, 4), %xmm0
+	vmovss	128(%rsp, %r14, 4), %xmm1
 	call	atan2f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 192(%rsp, %r14, 4)
+	vmovss	%xmm0, 192(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f8_core_avx2.S
index 29ebbb6db2..08b18c3e3f 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f8_core_avx2.S
@@ -238,12 +238,12 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
-	movss	64(%rsp, %r14, 4), %xmm1
+	vmovss	32(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm1
 	call	atan2f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 96(%rsp, %r14, 4)
+	vmovss	%xmm0, 96(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
index f42462c581..94186a14cb 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
@@ -222,13 +222,13 @@ L(SPECIAL_VALUES_LOOP):
 	tzcntl	%ebx, %ebp
 
 	/* Scalar math function call to process special input. */
-	movss	64(%rsp, %rbp, 4), %xmm0
+	vmovss	64(%rsp, %rbp, 4), %xmm0
 	call	atanhf@PLT
 
 	/* No good way to avoid the store-forwarding fault this will cause on
 	   return. `lfence` avoids the SF fault but at greater cost as it
 	   serializes stack/callee save restoration. */
-	movss	%xmm0, (%rsp, %rbp, 4)
+	vmovss	%xmm0, (%rsp, %rbp, 4)
 
 	blsrl	%ebx, %ebx
 	jnz	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
index 43eb423831..49ffd7a9b2 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
@@ -231,13 +231,13 @@ L(SPECIAL_VALUES_LOOP):
 	tzcntl	%ebx, %ebp
 
 	/* Scalar math function call to process special input. */
-	movss	32(%rsp, %rbp, 4), %xmm0
+	vmovss	32(%rsp, %rbp, 4), %xmm0
 	call	atanhf@PLT
 
 	/* No good way to avoid the store-forwarding fault this will cause on
 	   return. `lfence` avoids the SF fault but at greater cost as it
 	   serializes stack/callee save restoration. */
-	movss	%xmm0, (%rsp, %rbp, 4)
+	vmovss	%xmm0, (%rsp, %rbp, 4)
 
 	blsrl	%ebx, %ebx
 	jnz	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf8_core_avx2.S
index d24d36163d..14b58c171a 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf8_core_avx2.S
@@ -304,11 +304,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	cbrtf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_coshf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_coshf16_core_avx512.S
index 6b740bf866..d1a5ddf5b4 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_coshf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_coshf16_core_avx512.S
@@ -228,11 +228,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	coshf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_coshf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_coshf8_core_avx2.S
index 6f29218af1..a00650ccd6 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_coshf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_coshf8_core_avx2.S
@@ -242,11 +242,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	coshf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf16_core_avx512.S
index 9daaa0c06d..5fb5b2f0f7 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf16_core_avx512.S
@@ -218,11 +218,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	erfcf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf8_core_avx2.S
index 4cafc1bcd5..60b9fab000 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf8_core_avx2.S
@@ -243,11 +243,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	erfcf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f16_core_avx512.S
index eb9f3f8d8b..10f0b2cb37 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f16_core_avx512.S
@@ -186,11 +186,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	exp10f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f8_core_avx2.S
index 11244d5a5f..275ab42529 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f8_core_avx2.S
@@ -238,11 +238,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	exp10f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f16_core_avx512.S
index 5b406c6e32..8a5f1e3985 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f16_core_avx512.S
@@ -209,11 +209,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	exp2f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f8_core_avx2.S
index f7a80a4d64..cc87e66425 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f8_core_avx2.S
@@ -188,11 +188,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	exp2f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f16_core_avx512.S
index 71d23e632c..7fe830daa4 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f16_core_avx512.S
@@ -194,11 +194,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	expm1f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f8_core_avx2.S
index 73f862528a..d5d7fa2791 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f8_core_avx2.S
@@ -212,11 +212,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	expm1f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core_avx512.S
index 548936fe61..c92e3ab065 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core_avx512.S
@@ -202,12 +202,12 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
-	movss	128(%rsp, %r14, 4), %xmm1
+	vmovss	64(%rsp, %r14, 4), %xmm0
+	vmovss	128(%rsp, %r14, 4), %xmm1
 	call	hypotf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 192(%rsp, %r14, 4)
+	vmovss	%xmm0, 192(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf8_core_avx2.S
index fc97828008..7a26c5accc 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf8_core_avx2.S
@@ -226,12 +226,12 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
-	movss	64(%rsp, %r14, 4), %xmm1
+	vmovss	32(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm1
 	call	hypotf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 96(%rsp, %r14, 4)
+	vmovss	%xmm0, 96(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log10f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log10f16_core_avx512.S
index b192dfe464..0eb9b23c4e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log10f16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log10f16_core_avx512.S
@@ -161,11 +161,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	log10f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log10f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log10f8_core_avx2.S
index ea51c28f81..4bdc62e90e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log10f8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log10f8_core_avx2.S
@@ -174,11 +174,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	log10f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf16_core_avx512.S
index 8fa5068595..2c864f0c0e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf16_core_avx512.S
@@ -207,11 +207,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	log1pf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf8_core_avx2.S
index 54d6a9a685..7326a2b5ad 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf8_core_avx2.S
@@ -190,11 +190,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	log1pf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log2f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log2f16_core_avx512.S
index 3b0a28fee0..02b255dde8 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log2f16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log2f16_core_avx512.S
@@ -158,11 +158,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	log2f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log2f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log2f8_core_avx2.S
index eaa5112178..2245d40f84 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log2f8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log2f8_core_avx2.S
@@ -169,11 +169,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	log2f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf16_core_avx512.S
index fad4847f28..89be733eb2 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf16_core_avx512.S
@@ -252,11 +252,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	sinhf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf8_core_avx2.S
index 8c4b46cee2..e358e2efee 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf8_core_avx2.S
@@ -243,11 +243,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	sinhf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S
index f2a18f0b2c..4e18cdc0ce 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S
@@ -235,11 +235,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	tanf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanf8_core_avx2.S
index cd33fac643..d34e61ac41 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanf8_core_avx2.S
@@ -261,11 +261,11 @@ L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%ebx, %r13d
-	movss	32(%rsp, %r13, 4), %xmm0
+	vmovss	32(%rsp, %r13, 4), %xmm0
 	call	tanf@PLT
 	# LOE r13 r14 r15 ebx r12d xmm0
 
-	movss	%xmm0, 64(%rsp, %r13, 4)
+	vmovss	%xmm0, 64(%rsp, %r13, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
index 7edc74a116..84f73fdaf9 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
@@ -221,13 +221,13 @@ L(SPECIAL_VALUES_LOOP):
 	tzcntl	%ebx, %ebp
 
 	/* Scalar math function call to process special input. */
-	movss	64(%rsp, %rbp, 4), %xmm0
+	vmovss	64(%rsp, %rbp, 4), %xmm0
 	call	tanhf@PLT
 
 	/* No good way to avoid the store-forwarding fault this will cause on
 	   return. `lfence` avoids the SF fault but at greater cost as it
 	   serializes stack/callee save restoration. */
-	movss	%xmm0, (%rsp, %rbp, 4)
+	vmovss	%xmm0, (%rsp, %rbp, 4)
 
 	blsrl	%ebx, %ebx
 	jnz	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
index 55df346a00..ea3e9f4210 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
@@ -240,13 +240,13 @@ L(SPECIAL_VALUES_LOOP):
 	tzcntl	%ebx, %ebp
 
 	/* Scalar math function call to process special input. */
-	movss	32(%rsp, %rbp, 4), %xmm0
+	vmovss	32(%rsp, %rbp, 4), %xmm0
 	call	tanhf@PLT
 
 	/* No good way to avoid the store-forwarding fault this will cause on
 	   return. `lfence` avoids the SF fault but at greater cost as it
 	   serializes stack/callee save restoration. */
-	movss	%xmm0, (%rsp, %rbp, 4)
+	vmovss	%xmm0, (%rsp, %rbp, 4)
 
 	blsrl	%ebx, %ebx
 	jnz	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
index bd26ba80d5..eb128a2ae3 100644
--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -49,7 +49,7 @@
 
 	.section SECTION(.text), "ax", @progbits
 ENTRY(STRRCHR)
-	movd	%esi, %xmm7
+	vmovd	%esi, %xmm7
 	movl	%edi, %eax
 	/* Broadcast CHAR to YMM4. */
 	VPBROADCAST %xmm7, %ymm7
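A note on the scalar load forms rewritten throughout the patch above: the VEX encodings do not just keep AVX code free of legacy-SSE instructions, they also fully define the destination register. A brief sketch of the semantic difference (illustrative only; the offsets mirror the patch's special-value loops but the snippet is not taken from any one file):

	/* Legacy-SSE scalar load: writes bits 0..31, zeroes bits
	   32..127, but leaves bits 128 and up of the corresponding
	   YMM/ZMM register untouched.  */
	movss	64(%rsp, %r14, 4), %xmm0

	/* VEX scalar load: writes bits 0..31 and zeroes everything
	   above, so the register is fully defined and cannot carry a
	   dirty upper state into subsequent AVX code.  */
	vmovss	64(%rsp, %r14, 4), %xmm0

	/* The store forms (movss/vmovss to memory) write the same 32
	   bits either way; for stores the VEX encoding simply keeps
	   the code path uniformly VEX.  */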