
[v1] x86: Replace all sse instructions with vex equivalent in avx+ files

Message ID 20220620200210.2772523-1-goldstein.w.n@gmail.com
State New
Series [v1] x86: Replace all sse instructions with vex equivalent in avx+ files

Commit Message

Noah Goldstein June 20, 2022, 8:02 p.m. UTC
Most of these don't really matter, as there was no dirty upper state,
but we should generally avoid stray SSE when it's not needed.

The one case that really matters is in svml_d_tanh4_core_avx2.S:

blendvps %xmm0, %xmm8, %xmm7

where there was a dirty upper state.
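
For reference, the hazard the VEX encodings avoid looks roughly like
this (illustrative sketch only, not code from this patch):

	vmovupd	(%rdi), %ymm7	/* full-width write: upper state now dirty */
	blendvps %xmm0, %xmm8, %xmm7	/* legacy SSE op with dirty uppers: can
					   trigger an SSE<->AVX transition penalty */
	vblendvps %xmm0, %xmm8, %xmm7, %xmm7	/* VEX form: no transition */

Since the surrounding code in these avx2/avx512 files is already
VEX/EVEX encoded, the VEX scalar forms cost nothing.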

Tested on x86_64-linux
---
 sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S      | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S    | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_d_acosh4_core_avx2.S     | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_d_acosh8_core_avx512.S   | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_d_asin4_core_avx2.S      | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_d_asin8_core_avx512.S    | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S     | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_d_asinh8_core_avx512.S   | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_d_atan24_core_avx2.S     | 6 +++---
 sysdeps/x86_64/fpu/multiarch/svml_d_atan28_core_avx512.S   | 6 +++---
 sysdeps/x86_64/fpu/multiarch/svml_d_atanh4_core_avx2.S     | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_d_atanh8_core_avx512.S   | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_d_cbrt4_core_avx2.S      | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_d_cosh4_core_avx2.S      | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_d_cosh8_core_avx512.S    | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_d_erfc4_core_avx2.S      | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_d_erfc8_core_avx512.S    | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_d_exp104_core_avx2.S     | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_d_exp108_core_avx512.S   | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_d_exp24_core_avx2.S      | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_d_exp28_core_avx512.S    | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_d_expm14_core_avx2.S     | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_d_expm18_core_avx512.S   | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core_avx2.S     | 6 +++---
 sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core_avx512.S   | 6 +++---
 sysdeps/x86_64/fpu/multiarch/svml_d_log104_core_avx2.S     | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_d_log108_core_avx512.S   | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_d_log1p4_core_avx2.S     | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_d_log1p8_core_avx512.S   | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_d_log24_core_avx2.S      | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_d_log28_core_avx512.S    | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_d_sinh4_core_avx2.S      | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_d_sinh8_core_avx512.S    | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_d_tan4_core_avx2.S       | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_d_tan8_core_avx512.S     | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_d_tanh4_core_avx2.S      | 6 +++---
 sysdeps/x86_64/fpu/multiarch/svml_d_tanh8_core_avx512.S    | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S  | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S     | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_acoshf16_core_avx512.S | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_acoshf8_core_avx2.S    | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_asinf16_core_avx512.S  | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_asinf8_core_avx2.S     | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_asinhf16_core_avx512.S | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_asinhf8_core_avx2.S    | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_atan2f16_core_avx512.S | 6 +++---
 sysdeps/x86_64/fpu/multiarch/svml_s_atan2f8_core_avx2.S    | 6 +++---
 sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S    | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf8_core_avx2.S     | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_coshf16_core_avx512.S  | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_coshf8_core_avx2.S     | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_erfcf16_core_avx512.S  | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_erfcf8_core_avx2.S     | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_exp10f16_core_avx512.S | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_exp10f8_core_avx2.S    | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_exp2f16_core_avx512.S  | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_exp2f8_core_avx2.S     | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_expm1f16_core_avx512.S | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_expm1f8_core_avx2.S    | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core_avx512.S | 6 +++---
 sysdeps/x86_64/fpu/multiarch/svml_s_hypotf8_core_avx2.S    | 6 +++---
 sysdeps/x86_64/fpu/multiarch/svml_s_log10f16_core_avx512.S | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_log10f8_core_avx2.S    | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_log1pf16_core_avx512.S | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_log1pf8_core_avx2.S    | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_log2f16_core_avx512.S  | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_log2f8_core_avx2.S     | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_sinhf16_core_avx512.S  | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_sinhf8_core_avx2.S     | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S   | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_tanf8_core_avx2.S      | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S  | 4 ++--
 sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S     | 4 ++--
 sysdeps/x86_64/multiarch/strrchr-avx2.S                    | 2 +-
 75 files changed, 158 insertions(+), 158 deletions(-)

Comments

H.J. Lu June 23, 2022, 2:17 a.m. UTC | #1
On Mon, Jun 20, 2022 at 1:02 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S
> index e19bddd2e2..73025e8b0f 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S
> @@ -210,11 +210,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   32(%rsp, %r14, 8), %xmm0
> +       vmovsd  32(%rsp, %r14, 8), %xmm0
>         call    acos@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 64(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 64(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S
> index f4c72c3618..b8cc6dd776 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S
> @@ -232,11 +232,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   64(%rsp, %r14, 8), %xmm0
> +       vmovsd  64(%rsp, %r14, 8), %xmm0
>         call    acos@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 128(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 128(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acosh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acosh4_core_avx2.S
> index 5d0b23b72c..126110cf17 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_acosh4_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acosh4_core_avx2.S
> @@ -372,11 +372,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   32(%rsp, %r14, 8), %xmm0
> +       vmovsd  32(%rsp, %r14, 8), %xmm0
>         call    acosh@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 64(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 64(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acosh8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acosh8_core_avx512.S
> index b9a1131664..db0ef3b9dd 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_acosh8_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acosh8_core_avx512.S
> @@ -317,11 +317,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   64(%rsp, %r14, 8), %xmm0
> +       vmovsd  64(%rsp, %r14, 8), %xmm0
>         call    acosh@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 128(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 128(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_asin4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_asin4_core_avx2.S
> index ba96089504..612a45da30 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_asin4_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_asin4_core_avx2.S
> @@ -202,11 +202,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   32(%rsp, %r14, 8), %xmm0
> +       vmovsd  32(%rsp, %r14, 8), %xmm0
>         call    asin@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 64(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 64(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_asin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_asin8_core_avx512.S
> index 0f5b773b04..e7b41ab232 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_asin8_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_asin8_core_avx512.S
> @@ -224,11 +224,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   64(%rsp, %r14, 8), %xmm0
> +       vmovsd  64(%rsp, %r14, 8), %xmm0
>         call    asin@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 128(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 128(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S
> index 131b716c95..1fcbb245b7 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S
> @@ -429,11 +429,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   32(%rsp, %r14, 8), %xmm0
> +       vmovsd  32(%rsp, %r14, 8), %xmm0
>         call    asinh@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 64(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 64(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh8_core_avx512.S
> index 5bdc6859f0..8445fc8ba4 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh8_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh8_core_avx512.S
> @@ -343,11 +343,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   64(%rsp, %r14, 8), %xmm0
> +       vmovsd  64(%rsp, %r14, 8), %xmm0
>         call    asinh@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 128(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 128(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atan24_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atan24_core_avx2.S
> index 1b601576cc..a45cae79a1 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_atan24_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atan24_core_avx2.S
> @@ -277,12 +277,12 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   32(%rsp, %r14, 8), %xmm0
> -       movsd   64(%rsp, %r14, 8), %xmm1
> +       vmovsd  32(%rsp, %r14, 8), %xmm0
> +       vmovsd  64(%rsp, %r14, 8), %xmm1
>         call    atan2@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 96(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 96(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atan28_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atan28_core_avx512.S
> index ef9581075d..c3b0f7940c 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_atan28_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atan28_core_avx512.S
> @@ -295,12 +295,12 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   64(%rsp, %r14, 8), %xmm0
> -       movsd   128(%rsp, %r14, 8), %xmm1
> +       vmovsd  64(%rsp, %r14, 8), %xmm0
> +       vmovsd  128(%rsp, %r14, 8), %xmm1
>         call    atan2@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 192(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 192(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atanh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atanh4_core_avx2.S
> index b5cbfd224c..c9c41ef9f4 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_atanh4_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atanh4_core_avx2.S
> @@ -339,11 +339,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   32(%rsp, %r14, 8), %xmm0
> +       vmovsd  32(%rsp, %r14, 8), %xmm0
>         call    atanh@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 64(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 64(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atanh8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atanh8_core_avx512.S
> index 3193c026dd..de4edb3cc0 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_atanh8_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atanh8_core_avx512.S
> @@ -274,11 +274,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   64(%rsp, %r14, 8), %xmm0
> +       vmovsd  64(%rsp, %r14, 8), %xmm0
>         call    atanh@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 128(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 128(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cbrt4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cbrt4_core_avx2.S
> index 96ecbe05c1..71a25f3db8 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cbrt4_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cbrt4_core_avx2.S
> @@ -262,11 +262,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   32(%rsp, %r14, 8), %xmm0
> +       vmovsd  32(%rsp, %r14, 8), %xmm0
>         call    cbrt@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 64(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 64(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cosh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cosh4_core_avx2.S
> index 25df252108..a3d9104f5e 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cosh4_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cosh4_core_avx2.S
> @@ -282,11 +282,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   32(%rsp, %r14, 8), %xmm0
> +       vmovsd  32(%rsp, %r14, 8), %xmm0
>         call    cosh@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 64(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 64(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cosh8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cosh8_core_avx512.S
> index 066bbc7de6..4ff0e038a3 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cosh8_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cosh8_core_avx512.S
> @@ -231,11 +231,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   64(%rsp, %r14, 8), %xmm0
> +       vmovsd  64(%rsp, %r14, 8), %xmm0
>         call    cosh@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 128(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 128(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_erfc4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_erfc4_core_avx2.S
> index c832b65e3e..6efd2e95ba 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_erfc4_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_erfc4_core_avx2.S
> @@ -258,11 +258,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   32(%rsp, %r14, 8), %xmm0
> +       vmovsd  32(%rsp, %r14, 8), %xmm0
>         call    erfc@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 64(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 64(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_erfc8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_erfc8_core_avx512.S
> index 77228814d3..42bdfe6f18 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_erfc8_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_erfc8_core_avx512.S
> @@ -261,11 +261,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   64(%rsp, %r14, 8), %xmm0
> +       vmovsd  64(%rsp, %r14, 8), %xmm0
>         call    erfc@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 128(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 128(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp104_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp104_core_avx2.S
> index 7271bcc1d9..f519bcce45 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp104_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp104_core_avx2.S
> @@ -231,11 +231,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   32(%rsp, %r14, 8), %xmm0
> +       vmovsd  32(%rsp, %r14, 8), %xmm0
>         call    exp10@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 64(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 64(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp108_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp108_core_avx512.S
> index 40b01c3cd0..3f0c670199 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp108_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp108_core_avx512.S
> @@ -191,11 +191,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   64(%rsp, %r14, 8), %xmm0
> +       vmovsd  64(%rsp, %r14, 8), %xmm0
>         call    exp10@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 128(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 128(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp24_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp24_core_avx2.S
> index ced774e89c..afa00a38bb 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp24_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp24_core_avx2.S
> @@ -223,11 +223,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   32(%rsp, %r14, 8), %xmm0
> +       vmovsd  32(%rsp, %r14, 8), %xmm0
>         call    exp2@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 64(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 64(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp28_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp28_core_avx512.S
> index 7a85fd8b18..eee785dbf5 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp28_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp28_core_avx512.S
> @@ -227,11 +227,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   64(%rsp, %r14, 8), %xmm0
> +       vmovsd  64(%rsp, %r14, 8), %xmm0
>         call    exp2@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 128(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 128(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_expm14_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_expm14_core_avx2.S
> index 590341c243..4a3202750f 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_expm14_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_expm14_core_avx2.S
> @@ -205,11 +205,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   32(%rsp, %r14, 8), %xmm0
> +       vmovsd  32(%rsp, %r14, 8), %xmm0
>         call    expm1@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 64(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 64(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_expm18_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_expm18_core_avx512.S
> index efae1f8b66..0fa17f3a73 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_expm18_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_expm18_core_avx512.S
> @@ -211,11 +211,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   64(%rsp, %r14, 8), %xmm0
> +       vmovsd  64(%rsp, %r14, 8), %xmm0
>         call    expm1@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 128(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 128(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core_avx2.S
> index ae5738c1b7..5c693d132e 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core_avx2.S
> @@ -231,12 +231,12 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   32(%rsp, %r14, 8), %xmm0
> -       movsd   64(%rsp, %r14, 8), %xmm1
> +       vmovsd  32(%rsp, %r14, 8), %xmm0
> +       vmovsd  64(%rsp, %r14, 8), %xmm1
>         call    hypot@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 96(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 96(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core_avx512.S
> index 0c404fd5ee..a392252c8b 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core_avx512.S
> @@ -194,12 +194,12 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   64(%rsp, %r14, 8), %xmm0
> -       movsd   128(%rsp, %r14, 8), %xmm1
> +       vmovsd  64(%rsp, %r14, 8), %xmm0
> +       vmovsd  128(%rsp, %r14, 8), %xmm1
>         call    hypot@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 192(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 192(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log104_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log104_core_avx2.S
> index 2461c6ad56..9bf45a6dc2 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log104_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log104_core_avx2.S
> @@ -225,11 +225,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   32(%rsp, %r14, 8), %xmm0
> +       vmovsd  32(%rsp, %r14, 8), %xmm0
>         call    log10@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 64(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 64(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log108_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log108_core_avx512.S
> index 5d129ef4e5..101618cce9 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log108_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log108_core_avx512.S
> @@ -207,11 +207,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   64(%rsp, %r14, 8), %xmm0
> +       vmovsd  64(%rsp, %r14, 8), %xmm0
>         call    log10@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 128(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 128(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log1p4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log1p4_core_avx2.S
> index 13235793e8..39ec0024cf 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log1p4_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log1p4_core_avx2.S
> @@ -263,11 +263,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   32(%rsp, %r14, 8), %xmm0
> +       vmovsd  32(%rsp, %r14, 8), %xmm0
>         call    log1p@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 64(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 64(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log1p8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log1p8_core_avx512.S
> index dd55b5dd18..3033fcb5b3 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log1p8_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log1p8_core_avx512.S
> @@ -225,11 +225,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   64(%rsp, %r14, 8), %xmm0
> +       vmovsd  64(%rsp, %r14, 8), %xmm0
>         call    log1p@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 128(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 128(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log24_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log24_core_avx2.S
> index 25d2edaae5..84bdb2090d 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log24_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log24_core_avx2.S
> @@ -223,11 +223,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   32(%rsp, %r14, 8), %xmm0
> +       vmovsd  32(%rsp, %r14, 8), %xmm0
>         call    log2@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 64(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 64(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log28_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log28_core_avx512.S
> index bcb6736dec..b3e9bb3ca4 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log28_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log28_core_avx512.S
> @@ -205,11 +205,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   64(%rsp, %r14, 8), %xmm0
> +       vmovsd  64(%rsp, %r14, 8), %xmm0
>         call    log2@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 128(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 128(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sinh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sinh4_core_avx2.S
> index ae16600579..ad2a06ad37 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sinh4_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sinh4_core_avx2.S
> @@ -280,11 +280,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   32(%rsp, %r14, 8), %xmm0
> +       vmovsd  32(%rsp, %r14, 8), %xmm0
>         call    sinh@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 64(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 64(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sinh8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sinh8_core_avx512.S
> index 075665d57d..7ca915e30f 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sinh8_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sinh8_core_avx512.S
> @@ -271,11 +271,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   64(%rsp, %r14, 8), %xmm0
> +       vmovsd  64(%rsp, %r14, 8), %xmm0
>         call    sinh@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 128(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 128(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_tan4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_tan4_core_avx2.S
> index 01c86736e7..f26daf316b 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_tan4_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_tan4_core_avx2.S
> @@ -267,11 +267,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   32(%rsp, %r14, 8), %xmm0
> +       vmovsd  32(%rsp, %r14, 8), %xmm0
>         call    tan@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 64(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 64(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_tan8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_tan8_core_avx512.S
> index 376479035e..0c90328b0a 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_tan8_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_tan8_core_avx512.S
> @@ -239,11 +239,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   64(%rsp, %r14, 8), %xmm0
> +       vmovsd  64(%rsp, %r14, 8), %xmm0
>         call    tan@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 128(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 128(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_tanh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_tanh4_core_avx2.S
> index 7ddf145b25..ea41d326eb 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_tanh4_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_tanh4_core_avx2.S
> @@ -110,7 +110,7 @@ ENTRY(_ZGVdN4v_tanh_avx2)
>         vpcmpgtd %xmm11, %xmm9, %xmm10
>         vpcmpgtd %xmm8, %xmm9, %xmm0
>         vpand   %xmm10, %xmm9, %xmm7
> -       blendvps %xmm0, %xmm8, %xmm7
> +       vblendvps %xmm0, %xmm8, %xmm7, %xmm7
>
>         /*
>          * VSHRIMM( I, iIndex, = iIndex, (17 - 4) );
> @@ -272,11 +272,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   32(%rsp, %r14, 8), %xmm0
> +       vmovsd  32(%rsp, %r14, 8), %xmm0
>         call    tanh@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 64(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 64(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_tanh8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_tanh8_core_avx512.S
> index 82c0119500..c995401a24 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_d_tanh8_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_tanh8_core_avx512.S
> @@ -286,11 +286,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movsd   64(%rsp, %r14, 8), %xmm0
> +       vmovsd  64(%rsp, %r14, 8), %xmm0
>         call    tanh@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movsd   %xmm0, 128(%rsp, %r14, 8)
> +       vmovsd  %xmm0, 128(%rsp, %r14, 8)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
> index 26fef1f268..fd84977e95 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
> @@ -205,11 +205,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   64(%rsp, %r14, 4), %xmm0
> +       vmovss  64(%rsp, %r14, 4), %xmm0
>         call    acosf@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 128(%rsp, %r14, 4)
> +       vmovss  %xmm0, 128(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
> index bf28a5dd00..078fe5a898 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
> @@ -198,11 +198,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   32(%rsp, %r14, 4), %xmm0
> +       vmovss  32(%rsp, %r14, 4), %xmm0
>         call    acosf@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 64(%rsp, %r14, 4)
> +       vmovss  %xmm0, 64(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf16_core_avx512.S
> index 3f44e75248..65026e647d 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf16_core_avx512.S
> @@ -290,11 +290,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   64(%rsp, %r14, 4), %xmm0
> +       vmovss  64(%rsp, %r14, 4), %xmm0
>         call    acoshf@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 128(%rsp, %r14, 4)
> +       vmovss  %xmm0, 128(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf8_core_avx2.S
> index 3a70fc1448..489dac033c 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf8_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf8_core_avx2.S
> @@ -286,11 +286,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   32(%rsp, %r14, 4), %xmm0
> +       vmovss  32(%rsp, %r14, 4), %xmm0
>         call    acoshf@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 64(%rsp, %r14, 4)
> +       vmovss  %xmm0, 64(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_asinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_asinf16_core_avx512.S
> index 4e9984d870..2accef703e 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_asinf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_asinf16_core_avx512.S
> @@ -198,11 +198,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   64(%rsp, %r14, 4), %xmm0
> +       vmovss  64(%rsp, %r14, 4), %xmm0
>         call    asinf@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 128(%rsp, %r14, 4)
> +       vmovss  %xmm0, 128(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_asinf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_asinf8_core_avx2.S
> index 59bea9dc42..257c8da2f7 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_asinf8_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_asinf8_core_avx2.S
> @@ -187,11 +187,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   32(%rsp, %r14, 4), %xmm0
> +       vmovss  32(%rsp, %r14, 4), %xmm0
>         call    asinf@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 64(%rsp, %r14, 4)
> +       vmovss  %xmm0, 64(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf16_core_avx512.S
> index 6b569ecf41..a0c27922e4 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf16_core_avx512.S
> @@ -313,11 +313,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   64(%rsp, %r14, 4), %xmm0
> +       vmovss  64(%rsp, %r14, 4), %xmm0
>         call    asinhf@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 128(%rsp, %r14, 4)
> +       vmovss  %xmm0, 128(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf8_core_avx2.S
> index 794030a481..d6f6c3d5aa 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf8_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf8_core_avx2.S
> @@ -361,11 +361,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   32(%rsp, %r14, 4), %xmm0
> +       vmovss  32(%rsp, %r14, 4), %xmm0
>         call    asinhf@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 64(%rsp, %r14, 4)
> +       vmovss  %xmm0, 64(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f16_core_avx512.S
> index 56aa5bb917..15ffa4b6c9 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f16_core_avx512.S
> @@ -257,12 +257,12 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   64(%rsp, %r14, 4), %xmm0
> -       movss   128(%rsp, %r14, 4), %xmm1
> +       vmovss  64(%rsp, %r14, 4), %xmm0
> +       vmovss  128(%rsp, %r14, 4), %xmm1
>         call    atan2f@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 192(%rsp, %r14, 4)
> +       vmovss  %xmm0, 192(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f8_core_avx2.S
> index 29ebbb6db2..08b18c3e3f 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f8_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f8_core_avx2.S
> @@ -238,12 +238,12 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   32(%rsp, %r14, 4), %xmm0
> -       movss   64(%rsp, %r14, 4), %xmm1
> +       vmovss  32(%rsp, %r14, 4), %xmm0
> +       vmovss  64(%rsp, %r14, 4), %xmm1
>         call    atan2f@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 96(%rsp, %r14, 4)
> +       vmovss  %xmm0, 96(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> index f42462c581..94186a14cb 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
> @@ -222,13 +222,13 @@ L(SPECIAL_VALUES_LOOP):
>         tzcntl  %ebx, %ebp
>
>         /* Scalar math fucntion call to process special input.  */
> -       movss   64(%rsp, %rbp, 4), %xmm0
> +       vmovss  64(%rsp, %rbp, 4), %xmm0
>         call    atanhf@PLT
>
>         /* No good way to avoid the store-forwarding fault this will cause on
>            return. `lfence` avoids the SF fault but at greater cost as it
>            serialized stack/callee save restoration.  */
> -       movss   %xmm0, (%rsp, %rbp, 4)
> +       vmovss  %xmm0, (%rsp, %rbp, 4)
>
>         blsrl   %ebx, %ebx
>         jnz     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
> index 43eb423831..49ffd7a9b2 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
> @@ -231,13 +231,13 @@ L(SPECIAL_VALUES_LOOP):
>         tzcntl  %ebx, %ebp
>
>         /* Scalar math fucntion call to process special input.  */
> -       movss   32(%rsp, %rbp, 4), %xmm0
> +       vmovss  32(%rsp, %rbp, 4), %xmm0
>         call    atanhf@PLT
>
>         /* No good way to avoid the store-forwarding fault this will cause on
>            return. `lfence` avoids the SF fault but at greater cost as it
>            serialized stack/callee save restoration.  */
> -       movss   %xmm0, (%rsp, %rbp, 4)
> +       vmovss  %xmm0, (%rsp, %rbp, 4)
>
>         blsrl   %ebx, %ebx
>         jnz     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf8_core_avx2.S
> index d24d36163d..14b58c171a 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf8_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf8_core_avx2.S
> @@ -304,11 +304,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   32(%rsp, %r14, 4), %xmm0
> +       vmovss  32(%rsp, %r14, 4), %xmm0
>         call    cbrtf@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 64(%rsp, %r14, 4)
> +       vmovss  %xmm0, 64(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_coshf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_coshf16_core_avx512.S
> index 6b740bf866..d1a5ddf5b4 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_coshf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_coshf16_core_avx512.S
> @@ -228,11 +228,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   64(%rsp, %r14, 4), %xmm0
> +       vmovss  64(%rsp, %r14, 4), %xmm0
>         call    coshf@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 128(%rsp, %r14, 4)
> +       vmovss  %xmm0, 128(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_coshf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_coshf8_core_avx2.S
> index 6f29218af1..a00650ccd6 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_coshf8_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_coshf8_core_avx2.S
> @@ -242,11 +242,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   32(%rsp, %r14, 4), %xmm0
> +       vmovss  32(%rsp, %r14, 4), %xmm0
>         call    coshf@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 64(%rsp, %r14, 4)
> +       vmovss  %xmm0, 64(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf16_core_avx512.S
> index 9daaa0c06d..5fb5b2f0f7 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf16_core_avx512.S
> @@ -218,11 +218,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   64(%rsp, %r14, 4), %xmm0
> +       vmovss  64(%rsp, %r14, 4), %xmm0
>         call    erfcf@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 128(%rsp, %r14, 4)
> +       vmovss  %xmm0, 128(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf8_core_avx2.S
> index 4cafc1bcd5..60b9fab000 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf8_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf8_core_avx2.S
> @@ -243,11 +243,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   32(%rsp, %r14, 4), %xmm0
> +       vmovss  32(%rsp, %r14, 4), %xmm0
>         call    erfcf@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 64(%rsp, %r14, 4)
> +       vmovss  %xmm0, 64(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f16_core_avx512.S
> index eb9f3f8d8b..10f0b2cb37 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f16_core_avx512.S
> @@ -186,11 +186,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   64(%rsp, %r14, 4), %xmm0
> +       vmovss  64(%rsp, %r14, 4), %xmm0
>         call    exp10f@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 128(%rsp, %r14, 4)
> +       vmovss  %xmm0, 128(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f8_core_avx2.S
> index 11244d5a5f..275ab42529 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f8_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f8_core_avx2.S
> @@ -238,11 +238,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   32(%rsp, %r14, 4), %xmm0
> +       vmovss  32(%rsp, %r14, 4), %xmm0
>         call    exp10f@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 64(%rsp, %r14, 4)
> +       vmovss  %xmm0, 64(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f16_core_avx512.S
> index 5b406c6e32..8a5f1e3985 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f16_core_avx512.S
> @@ -209,11 +209,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   64(%rsp, %r14, 4), %xmm0
> +       vmovss  64(%rsp, %r14, 4), %xmm0
>         call    exp2f@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 128(%rsp, %r14, 4)
> +       vmovss  %xmm0, 128(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f8_core_avx2.S
> index f7a80a4d64..cc87e66425 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f8_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f8_core_avx2.S
> @@ -188,11 +188,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   32(%rsp, %r14, 4), %xmm0
> +       vmovss  32(%rsp, %r14, 4), %xmm0
>         call    exp2f@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 64(%rsp, %r14, 4)
> +       vmovss  %xmm0, 64(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f16_core_avx512.S
> index 71d23e632c..7fe830daa4 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f16_core_avx512.S
> @@ -194,11 +194,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   64(%rsp, %r14, 4), %xmm0
> +       vmovss  64(%rsp, %r14, 4), %xmm0
>         call    expm1f@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 128(%rsp, %r14, 4)
> +       vmovss  %xmm0, 128(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f8_core_avx2.S
> index 73f862528a..d5d7fa2791 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f8_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f8_core_avx2.S
> @@ -212,11 +212,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   32(%rsp, %r14, 4), %xmm0
> +       vmovss  32(%rsp, %r14, 4), %xmm0
>         call    expm1f@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 64(%rsp, %r14, 4)
> +       vmovss  %xmm0, 64(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core_avx512.S
> index 548936fe61..c92e3ab065 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core_avx512.S
> @@ -202,12 +202,12 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   64(%rsp, %r14, 4), %xmm0
> -       movss   128(%rsp, %r14, 4), %xmm1
> +       vmovss  64(%rsp, %r14, 4), %xmm0
> +       vmovss  128(%rsp, %r14, 4), %xmm1
>         call    hypotf@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 192(%rsp, %r14, 4)
> +       vmovss  %xmm0, 192(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf8_core_avx2.S
> index fc97828008..7a26c5accc 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf8_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf8_core_avx2.S
> @@ -226,12 +226,12 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   32(%rsp, %r14, 4), %xmm0
> -       movss   64(%rsp, %r14, 4), %xmm1
> +       vmovss  32(%rsp, %r14, 4), %xmm0
> +       vmovss  64(%rsp, %r14, 4), %xmm1
>         call    hypotf@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 96(%rsp, %r14, 4)
> +       vmovss  %xmm0, 96(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log10f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log10f16_core_avx512.S
> index b192dfe464..0eb9b23c4e 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_log10f16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log10f16_core_avx512.S
> @@ -161,11 +161,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   64(%rsp, %r14, 4), %xmm0
> +       vmovss  64(%rsp, %r14, 4), %xmm0
>         call    log10f@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 128(%rsp, %r14, 4)
> +       vmovss  %xmm0, 128(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log10f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log10f8_core_avx2.S
> index ea51c28f81..4bdc62e90e 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_log10f8_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log10f8_core_avx2.S
> @@ -174,11 +174,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   32(%rsp, %r14, 4), %xmm0
> +       vmovss  32(%rsp, %r14, 4), %xmm0
>         call    log10f@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 64(%rsp, %r14, 4)
> +       vmovss  %xmm0, 64(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf16_core_avx512.S
> index 8fa5068595..2c864f0c0e 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf16_core_avx512.S
> @@ -207,11 +207,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   64(%rsp, %r14, 4), %xmm0
> +       vmovss  64(%rsp, %r14, 4), %xmm0
>         call    log1pf@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 128(%rsp, %r14, 4)
> +       vmovss  %xmm0, 128(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf8_core_avx2.S
> index 54d6a9a685..7326a2b5ad 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf8_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf8_core_avx2.S
> @@ -190,11 +190,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   32(%rsp, %r14, 4), %xmm0
> +       vmovss  32(%rsp, %r14, 4), %xmm0
>         call    log1pf@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 64(%rsp, %r14, 4)
> +       vmovss  %xmm0, 64(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log2f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log2f16_core_avx512.S
> index 3b0a28fee0..02b255dde8 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_log2f16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log2f16_core_avx512.S
> @@ -158,11 +158,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   64(%rsp, %r14, 4), %xmm0
> +       vmovss  64(%rsp, %r14, 4), %xmm0
>         call    log2f@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 128(%rsp, %r14, 4)
> +       vmovss  %xmm0, 128(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log2f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log2f8_core_avx2.S
> index eaa5112178..2245d40f84 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_log2f8_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log2f8_core_avx2.S
> @@ -169,11 +169,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   32(%rsp, %r14, 4), %xmm0
> +       vmovss  32(%rsp, %r14, 4), %xmm0
>         call    log2f@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 64(%rsp, %r14, 4)
> +       vmovss  %xmm0, 64(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf16_core_avx512.S
> index fad4847f28..89be733eb2 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf16_core_avx512.S
> @@ -252,11 +252,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   64(%rsp, %r14, 4), %xmm0
> +       vmovss  64(%rsp, %r14, 4), %xmm0
>         call    sinhf@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 128(%rsp, %r14, 4)
> +       vmovss  %xmm0, 128(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf8_core_avx2.S
> index 8c4b46cee2..e358e2efee 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf8_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf8_core_avx2.S
> @@ -243,11 +243,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   32(%rsp, %r14, 4), %xmm0
> +       vmovss  32(%rsp, %r14, 4), %xmm0
>         call    sinhf@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 64(%rsp, %r14, 4)
> +       vmovss  %xmm0, 64(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S
> index f2a18f0b2c..4e18cdc0ce 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S
> @@ -235,11 +235,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %r12d, %r14d
> -       movss   64(%rsp, %r14, 4), %xmm0
> +       vmovss  64(%rsp, %r14, 4), %xmm0
>         call    tanf@PLT
>         # LOE rbx r14 r15 r12d r13d xmm0
>
> -       movss   %xmm0, 128(%rsp, %r14, 4)
> +       vmovss  %xmm0, 128(%rsp, %r14, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanf8_core_avx2.S
> index cd33fac643..d34e61ac41 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanf8_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanf8_core_avx2.S
> @@ -261,11 +261,11 @@ L(SPECIAL_VALUES_LOOP):
>
>  L(SCALAR_MATH_CALL):
>         movl    %ebx, %r13d
> -       movss   32(%rsp, %r13, 4), %xmm0
> +       vmovss  32(%rsp, %r13, 4), %xmm0
>         call    tanf@PLT
>         # LOE r13 r14 r15 ebx r12d xmm0
>
> -       movss   %xmm0, 64(%rsp, %r13, 4)
> +       vmovss  %xmm0, 64(%rsp, %r13, 4)
>
>         /* Process special inputs in loop */
>         jmp     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> index 7edc74a116..84f73fdaf9 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
> @@ -221,13 +221,13 @@ L(SPECIAL_VALUES_LOOP):
>         tzcntl  %ebx, %ebp
>
>         /* Scalar math function call to process special input.  */
> -       movss   64(%rsp, %rbp, 4), %xmm0
> +       vmovss  64(%rsp, %rbp, 4), %xmm0
>         call    tanhf@PLT
>
>         /* No good way to avoid the store-forwarding fault this will cause on
>            return. `lfence` avoids the SF fault but at greater cost as it
>            serializes stack/callee save restoration.  */
> -       movss   %xmm0, (%rsp, %rbp, 4)
> +       vmovss  %xmm0, (%rsp, %rbp, 4)
>
>         blsrl   %ebx, %ebx
>         jnz     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
> index 55df346a00..ea3e9f4210 100644
> --- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
> +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
> @@ -240,13 +240,13 @@ L(SPECIAL_VALUES_LOOP):
>         tzcntl  %ebx, %ebp
>
>         /* Scalar math function call to process special input.  */
> -       movss   32(%rsp, %rbp, 4), %xmm0
> +       vmovss  32(%rsp, %rbp, 4), %xmm0
>         call    tanhf@PLT
>
>         /* No good way to avoid the store-forwarding fault this will cause on
>            return. `lfence` avoids the SF fault but at greater cost as it
>            serializes stack/callee save restoration.  */
> -       movss   %xmm0, (%rsp, %rbp, 4)
> +       vmovss  %xmm0, (%rsp, %rbp, 4)
>
>         blsrl   %ebx, %ebx
>         jnz     L(SPECIAL_VALUES_LOOP)
> diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
> index bd26ba80d5..eb128a2ae3 100644
> --- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
> @@ -49,7 +49,7 @@
>
>         .section SECTION(.text), "ax", @progbits
>  ENTRY(STRRCHR)
> -       movd    %esi, %xmm7
> +       vmovd   %esi, %xmm7
>         movl    %edi, %eax
>         /* Broadcast CHAR to YMM4.  */
>         VPBROADCAST %xmm7, %ymm7
> --
> 2.34.1
>

LGTM.

Thanks.
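
For anyone skimming the diff, a minimal sketch of what the encoding
change buys (the register and offset below are illustrative, not taken
from any one file in the patch):

	/* Legacy SSE encoding: writes xmm0[31:0] and zeroes
	   xmm0[127:32], but leaves ymm0[255:128] untouched.  Executed
	   with a dirty upper state, this can incur an SSE-AVX
	   transition penalty or a false dependency, depending on the
	   microarchitecture.  */
	movss	32(%rsp, %r14, 4), %xmm0

	/* VEX encoding: same load, but also zeroes ymm0[255:128], so
	   it mixes freely with AVX/AVX2 code without vzeroupper.  */
	vmovss	32(%rsp, %r14, 4), %xmm0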

Patch

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S
index e19bddd2e2..73025e8b0f 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos4_core_avx2.S
@@ -210,11 +210,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	acos@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S
index f4c72c3618..b8cc6dd776 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acos8_core_avx512.S
@@ -232,11 +232,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	acos@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acosh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acosh4_core_avx2.S
index 5d0b23b72c..126110cf17 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_acosh4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acosh4_core_avx2.S
@@ -372,11 +372,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	acosh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_acosh8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_acosh8_core_avx512.S
index b9a1131664..db0ef3b9dd 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_acosh8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_acosh8_core_avx512.S
@@ -317,11 +317,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	acosh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_asin4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_asin4_core_avx2.S
index ba96089504..612a45da30 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_asin4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_asin4_core_avx2.S
@@ -202,11 +202,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	asin@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_asin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_asin8_core_avx512.S
index 0f5b773b04..e7b41ab232 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_asin8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_asin8_core_avx512.S
@@ -224,11 +224,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	asin@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S
index 131b716c95..1fcbb245b7 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S
@@ -429,11 +429,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	asinh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh8_core_avx512.S
index 5bdc6859f0..8445fc8ba4 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh8_core_avx512.S
@@ -343,11 +343,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	asinh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atan24_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atan24_core_avx2.S
index 1b601576cc..a45cae79a1 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_atan24_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atan24_core_avx2.S
@@ -277,12 +277,12 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
-	movsd	64(%rsp, %r14, 8), %xmm1
+	vmovsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm1
 	call	atan2@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 96(%rsp, %r14, 8)
+	vmovsd	%xmm0, 96(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atan28_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atan28_core_avx512.S
index ef9581075d..c3b0f7940c 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_atan28_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atan28_core_avx512.S
@@ -295,12 +295,12 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
-	movsd	128(%rsp, %r14, 8), %xmm1
+	vmovsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	128(%rsp, %r14, 8), %xmm1
 	call	atan2@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 192(%rsp, %r14, 8)
+	vmovsd	%xmm0, 192(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atanh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atanh4_core_avx2.S
index b5cbfd224c..c9c41ef9f4 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_atanh4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atanh4_core_avx2.S
@@ -339,11 +339,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	atanh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_atanh8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_atanh8_core_avx512.S
index 3193c026dd..de4edb3cc0 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_atanh8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_atanh8_core_avx512.S
@@ -274,11 +274,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	atanh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cbrt4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cbrt4_core_avx2.S
index 96ecbe05c1..71a25f3db8 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cbrt4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cbrt4_core_avx2.S
@@ -262,11 +262,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	cbrt@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cosh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cosh4_core_avx2.S
index 25df252108..a3d9104f5e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cosh4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cosh4_core_avx2.S
@@ -282,11 +282,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	cosh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cosh8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cosh8_core_avx512.S
index 066bbc7de6..4ff0e038a3 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cosh8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cosh8_core_avx512.S
@@ -231,11 +231,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	cosh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_erfc4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_erfc4_core_avx2.S
index c832b65e3e..6efd2e95ba 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_erfc4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_erfc4_core_avx2.S
@@ -258,11 +258,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	erfc@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_erfc8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_erfc8_core_avx512.S
index 77228814d3..42bdfe6f18 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_erfc8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_erfc8_core_avx512.S
@@ -261,11 +261,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	erfc@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp104_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp104_core_avx2.S
index 7271bcc1d9..f519bcce45 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp104_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp104_core_avx2.S
@@ -231,11 +231,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	exp10@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp108_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp108_core_avx512.S
index 40b01c3cd0..3f0c670199 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp108_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp108_core_avx512.S
@@ -191,11 +191,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	exp10@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp24_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp24_core_avx2.S
index ced774e89c..afa00a38bb 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp24_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp24_core_avx2.S
@@ -223,11 +223,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	exp2@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp28_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp28_core_avx512.S
index 7a85fd8b18..eee785dbf5 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp28_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp28_core_avx512.S
@@ -227,11 +227,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	exp2@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_expm14_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_expm14_core_avx2.S
index 590341c243..4a3202750f 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_expm14_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_expm14_core_avx2.S
@@ -205,11 +205,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	expm1@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_expm18_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_expm18_core_avx512.S
index efae1f8b66..0fa17f3a73 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_expm18_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_expm18_core_avx512.S
@@ -211,11 +211,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	expm1@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core_avx2.S
index ae5738c1b7..5c693d132e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core_avx2.S
@@ -231,12 +231,12 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
-	movsd	64(%rsp, %r14, 8), %xmm1
+	vmovsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm1
 	call	hypot@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 96(%rsp, %r14, 8)
+	vmovsd	%xmm0, 96(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core_avx512.S
index 0c404fd5ee..a392252c8b 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core_avx512.S
@@ -194,12 +194,12 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
-	movsd	128(%rsp, %r14, 8), %xmm1
+	vmovsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	128(%rsp, %r14, 8), %xmm1
 	call	hypot@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 192(%rsp, %r14, 8)
+	vmovsd	%xmm0, 192(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log104_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log104_core_avx2.S
index 2461c6ad56..9bf45a6dc2 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log104_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log104_core_avx2.S
@@ -225,11 +225,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	log10@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log108_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log108_core_avx512.S
index 5d129ef4e5..101618cce9 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log108_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log108_core_avx512.S
@@ -207,11 +207,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	log10@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log1p4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log1p4_core_avx2.S
index 13235793e8..39ec0024cf 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log1p4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log1p4_core_avx2.S
@@ -263,11 +263,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	log1p@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log1p8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log1p8_core_avx512.S
index dd55b5dd18..3033fcb5b3 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log1p8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log1p8_core_avx512.S
@@ -225,11 +225,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	log1p@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log24_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log24_core_avx2.S
index 25d2edaae5..84bdb2090d 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log24_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log24_core_avx2.S
@@ -223,11 +223,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	log2@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log28_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log28_core_avx512.S
index bcb6736dec..b3e9bb3ca4 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log28_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log28_core_avx512.S
@@ -205,11 +205,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	log2@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sinh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sinh4_core_avx2.S
index ae16600579..ad2a06ad37 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sinh4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sinh4_core_avx2.S
@@ -280,11 +280,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	sinh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sinh8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sinh8_core_avx512.S
index 075665d57d..7ca915e30f 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sinh8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sinh8_core_avx512.S
@@ -271,11 +271,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	sinh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_tan4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_tan4_core_avx2.S
index 01c86736e7..f26daf316b 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_tan4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_tan4_core_avx2.S
@@ -267,11 +267,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	tan@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_tan8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_tan8_core_avx512.S
index 376479035e..0c90328b0a 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_tan8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_tan8_core_avx512.S
@@ -239,11 +239,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	tan@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_tanh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_tanh4_core_avx2.S
index 7ddf145b25..ea41d326eb 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_tanh4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_tanh4_core_avx2.S
@@ -110,7 +110,7 @@  ENTRY(_ZGVdN4v_tanh_avx2)
 	vpcmpgtd %xmm11, %xmm9, %xmm10
 	vpcmpgtd %xmm8, %xmm9, %xmm0
 	vpand	%xmm10, %xmm9, %xmm7
-	blendvps %xmm0, %xmm8, %xmm7
+	vblendvps %xmm0, %xmm8, %xmm7, %xmm7
 
 	/*
 	 * VSHRIMM( I, iIndex, = iIndex, (17 - 4) );
@@ -272,11 +272,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	32(%rsp, %r14, 8), %xmm0
+	vmovsd	32(%rsp, %r14, 8), %xmm0
 	call	tanh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 64(%rsp, %r14, 8)
+	vmovsd	%xmm0, 64(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_tanh8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_tanh8_core_avx512.S
index 82c0119500..c995401a24 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_tanh8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_tanh8_core_avx512.S
@@ -286,11 +286,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movsd	64(%rsp, %r14, 8), %xmm0
+	vmovsd	64(%rsp, %r14, 8), %xmm0
 	call	tanh@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movsd	%xmm0, 128(%rsp, %r14, 8)
+	vmovsd	%xmm0, 128(%rsp, %r14, 8)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
index 26fef1f268..fd84977e95 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf16_core_avx512.S
@@ -205,11 +205,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	acosf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
index bf28a5dd00..078fe5a898 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acosf8_core_avx2.S
@@ -198,11 +198,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	acosf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf16_core_avx512.S
index 3f44e75248..65026e647d 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf16_core_avx512.S
@@ -290,11 +290,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	acoshf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf8_core_avx2.S
index 3a70fc1448..489dac033c 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_acoshf8_core_avx2.S
@@ -286,11 +286,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	acoshf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_asinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_asinf16_core_avx512.S
index 4e9984d870..2accef703e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_asinf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_asinf16_core_avx512.S
@@ -198,11 +198,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	asinf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_asinf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_asinf8_core_avx2.S
index 59bea9dc42..257c8da2f7 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_asinf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_asinf8_core_avx2.S
@@ -187,11 +187,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	asinf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf16_core_avx512.S
index 6b569ecf41..a0c27922e4 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf16_core_avx512.S
@@ -313,11 +313,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	asinhf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf8_core_avx2.S
index 794030a481..d6f6c3d5aa 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_asinhf8_core_avx2.S
@@ -361,11 +361,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	asinhf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f16_core_avx512.S
index 56aa5bb917..15ffa4b6c9 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f16_core_avx512.S
@@ -257,12 +257,12 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
-	movss	128(%rsp, %r14, 4), %xmm1
+	vmovss	64(%rsp, %r14, 4), %xmm0
+	vmovss	128(%rsp, %r14, 4), %xmm1
 	call	atan2f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 192(%rsp, %r14, 4)
+	vmovss	%xmm0, 192(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f8_core_avx2.S
index 29ebbb6db2..08b18c3e3f 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atan2f8_core_avx2.S
@@ -238,12 +238,12 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
-	movss	64(%rsp, %r14, 4), %xmm1
+	vmovss	32(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm1
 	call	atan2f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 96(%rsp, %r14, 4)
+	vmovss	%xmm0, 96(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
index f42462c581..94186a14cb 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf16_core_avx512.S
@@ -222,13 +222,13 @@  L(SPECIAL_VALUES_LOOP):
 	tzcntl	%ebx, %ebp
 
	/* Scalar math function call to process special input.  */
-	movss	64(%rsp, %rbp, 4), %xmm0
+	vmovss	64(%rsp, %rbp, 4), %xmm0
 	call	atanhf@PLT
 
 	/* No good way to avoid the store-forwarding fault this will cause on
 	   return. `lfence` avoids the SF fault but at greater cost as it
	   serializes stack/callee save restoration.  */
-	movss	%xmm0, (%rsp, %rbp, 4)
+	vmovss	%xmm0, (%rsp, %rbp, 4)
 
 	blsrl   %ebx, %ebx
 	jnz	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
index 43eb423831..49ffd7a9b2 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_atanhf8_core_avx2.S
@@ -231,13 +231,13 @@  L(SPECIAL_VALUES_LOOP):
 	tzcntl	%ebx, %ebp
 
	/* Scalar math function call to process special input.  */
-	movss	32(%rsp, %rbp, 4), %xmm0
+	vmovss	32(%rsp, %rbp, 4), %xmm0
 	call	atanhf@PLT
 
 	/* No good way to avoid the store-forwarding fault this will cause on
 	   return. `lfence` avoids the SF fault but at greater cost as it
	   serializes stack/callee save restoration.  */
-	movss	%xmm0, (%rsp, %rbp, 4)
+	vmovss	%xmm0, (%rsp, %rbp, 4)
 
 	blsrl   %ebx, %ebx
 	jnz	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf8_core_avx2.S
index d24d36163d..14b58c171a 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cbrtf8_core_avx2.S
@@ -304,11 +304,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	cbrtf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_coshf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_coshf16_core_avx512.S
index 6b740bf866..d1a5ddf5b4 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_coshf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_coshf16_core_avx512.S
@@ -228,11 +228,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	coshf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_coshf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_coshf8_core_avx2.S
index 6f29218af1..a00650ccd6 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_coshf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_coshf8_core_avx2.S
@@ -242,11 +242,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	coshf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf16_core_avx512.S
index 9daaa0c06d..5fb5b2f0f7 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf16_core_avx512.S
@@ -218,11 +218,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	erfcf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf8_core_avx2.S
index 4cafc1bcd5..60b9fab000 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_erfcf8_core_avx2.S
@@ -243,11 +243,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	erfcf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f16_core_avx512.S
index eb9f3f8d8b..10f0b2cb37 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f16_core_avx512.S
@@ -186,11 +186,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	exp10f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f8_core_avx2.S
index 11244d5a5f..275ab42529 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_exp10f8_core_avx2.S
@@ -238,11 +238,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	exp10f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f16_core_avx512.S
index 5b406c6e32..8a5f1e3985 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f16_core_avx512.S
@@ -209,11 +209,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	exp2f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f8_core_avx2.S
index f7a80a4d64..cc87e66425 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_exp2f8_core_avx2.S
@@ -188,11 +188,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	exp2f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f16_core_avx512.S
index 71d23e632c..7fe830daa4 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f16_core_avx512.S
@@ -194,11 +194,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	expm1f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f8_core_avx2.S
index 73f862528a..d5d7fa2791 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expm1f8_core_avx2.S
@@ -212,11 +212,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	expm1f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core_avx512.S
index 548936fe61..c92e3ab065 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf16_core_avx512.S
@@ -202,12 +202,12 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
-	movss	128(%rsp, %r14, 4), %xmm1
+	vmovss	64(%rsp, %r14, 4), %xmm0
+	vmovss	128(%rsp, %r14, 4), %xmm1
 	call	hypotf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 192(%rsp, %r14, 4)
+	vmovss	%xmm0, 192(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf8_core_avx2.S
index fc97828008..7a26c5accc 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_hypotf8_core_avx2.S
@@ -226,12 +226,12 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
-	movss	64(%rsp, %r14, 4), %xmm1
+	vmovss	32(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm1
 	call	hypotf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 96(%rsp, %r14, 4)
+	vmovss	%xmm0, 96(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log10f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log10f16_core_avx512.S
index b192dfe464..0eb9b23c4e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log10f16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log10f16_core_avx512.S
@@ -161,11 +161,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	log10f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log10f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log10f8_core_avx2.S
index ea51c28f81..4bdc62e90e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log10f8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log10f8_core_avx2.S
@@ -174,11 +174,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	log10f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf16_core_avx512.S
index 8fa5068595..2c864f0c0e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf16_core_avx512.S
@@ -207,11 +207,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	log1pf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf8_core_avx2.S
index 54d6a9a685..7326a2b5ad 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log1pf8_core_avx2.S
@@ -190,11 +190,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	log1pf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log2f16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log2f16_core_avx512.S
index 3b0a28fee0..02b255dde8 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log2f16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log2f16_core_avx512.S
@@ -158,11 +158,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	log2f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_log2f8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_log2f8_core_avx2.S
index eaa5112178..2245d40f84 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_log2f8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_log2f8_core_avx2.S
@@ -169,11 +169,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	log2f@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf16_core_avx512.S
index fad4847f28..89be733eb2 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf16_core_avx512.S
@@ -252,11 +252,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	sinhf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf8_core_avx2.S
index 8c4b46cee2..e358e2efee 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinhf8_core_avx2.S
@@ -243,11 +243,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	32(%rsp, %r14, 4), %xmm0
+	vmovss	32(%rsp, %r14, 4), %xmm0
 	call	sinhf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 64(%rsp, %r14, 4)
+	vmovss	%xmm0, 64(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S
index f2a18f0b2c..4e18cdc0ce 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S
@@ -235,11 +235,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%r12d, %r14d
-	movss	64(%rsp, %r14, 4), %xmm0
+	vmovss	64(%rsp, %r14, 4), %xmm0
 	call	tanf@PLT
 	# LOE rbx r14 r15 r12d r13d xmm0
 
-	movss	%xmm0, 128(%rsp, %r14, 4)
+	vmovss	%xmm0, 128(%rsp, %r14, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanf8_core_avx2.S
index cd33fac643..d34e61ac41 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanf8_core_avx2.S
@@ -261,11 +261,11 @@  L(SPECIAL_VALUES_LOOP):
 
 L(SCALAR_MATH_CALL):
 	movl	%ebx, %r13d
-	movss	32(%rsp, %r13, 4), %xmm0
+	vmovss	32(%rsp, %r13, 4), %xmm0
 	call	tanf@PLT
 	# LOE r13 r14 r15 ebx r12d xmm0
 
-	movss	%xmm0, 64(%rsp, %r13, 4)
+	vmovss	%xmm0, 64(%rsp, %r13, 4)
 
 	/* Process special inputs in loop */
 	jmp	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
index 7edc74a116..84f73fdaf9 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf16_core_avx512.S
@@ -221,13 +221,13 @@  L(SPECIAL_VALUES_LOOP):
 	tzcntl	%ebx, %ebp
 
 	/* Scalar math function call to process special input.  */
-	movss	64(%rsp, %rbp, 4), %xmm0
+	vmovss	64(%rsp, %rbp, 4), %xmm0
 	call	tanhf@PLT
 
 	/* No good way to avoid the store-forwarding fault this will cause on
 	   return. `lfence` avoids the SF fault but at greater cost as it
 	   serializes stack/callee save restoration.  */
-	movss	%xmm0, (%rsp, %rbp, 4)
+	vmovss	%xmm0, (%rsp, %rbp, 4)
 
 	blsrl   %ebx, %ebx
 	jnz	L(SPECIAL_VALUES_LOOP)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
index 55df346a00..ea3e9f4210 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanhf8_core_avx2.S
@@ -240,13 +240,13 @@  L(SPECIAL_VALUES_LOOP):
 	tzcntl	%ebx, %ebp
 
 	/* Scalar math function call to process special input.  */
-	movss	32(%rsp, %rbp, 4), %xmm0
+	vmovss	32(%rsp, %rbp, 4), %xmm0
 	call	tanhf@PLT
 
 	/* No good way to avoid the store-forwarding fault this will cause on
 	   return. `lfence` avoids the SF fault but at greater cost as it
 	   serializes stack/callee save restoration.  */
-	movss	%xmm0, (%rsp, %rbp, 4)
+	vmovss	%xmm0, (%rsp, %rbp, 4)
 
 	blsrl   %ebx, %ebx
 	jnz	L(SPECIAL_VALUES_LOOP)
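
(To illustrate the store-forwarding comment in the two tanhf hunks
above: the scalar result is written back with a 4-byte store, and the
stack area it lands in is shortly reloaded with a full-width load; a
wide load that only partially overlaps a recent narrow store cannot be
forwarded from the store buffer, so the load stalls until the store
retires. A minimal sketch of the pattern, with made-up offsets:

	vmovss	%xmm0, (%rsp)		# 4-byte scalar store
	vmovups	(%rsp), %ymm0		# 32-byte reload overlapping the
					# store: store-to-load forwarding
					# fails; the load waits for the
					# store to commit)
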
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
index bd26ba80d5..eb128a2ae3 100644
--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -49,7 +49,7 @@ 
 
 	.section SECTION(.text), "ax", @progbits
 ENTRY(STRRCHR)
-	movd	%esi, %xmm7
+	vmovd	%esi, %xmm7
 	movl	%edi, %eax
 	/* Broadcast CHAR to YMM7.  */
 	VPBROADCAST %xmm7, %ymm7
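
(Likewise for the movd -> vmovd change in the strrchr-avx2.S hunk; a
sketch of the architectural difference, for illustration only:

	movd	%esi, %xmm7	# legacy SSE: xmm7[127:32] zeroed, but
				# ymm7[255:128] is left untouched, so a
				# dirty upper state stays dirty
	vmovd	%esi, %xmm7	# VEX: same low-lane result, and all bits
				# above 127 are zeroed; no SSE<->AVX
				# transition is possible)
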