diff mbox series

[v5,4/4] x86: Add avx2 optimized functions for the wchar_t strcpy family

Message ID 20221109013841.3707572-4-goldstein.w.n@gmail.com
State New
Headers show
Series [v5,1/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions | expand

Commit Message

Noah Goldstein Nov. 9, 2022, 1:38 a.m. UTC
Implemented:
    wcscat-avx2  (+ 744 bytes)
    wcscpy-avx2  (+ 539 bytes)
    wcpcpy-avx2  (+ 577 bytes)
    wcsncpy-avx2 (+1108 bytes)
    wcpncpy-avx2 (+1214 bytes)
    wcsncat-avx2 (+1085 bytes)

Performance Changes:
    Times are from N = 10 runs of the benchmark suite and are reported
    as geometric mean of all ratios of New Implementation / Best Old
    Implementation. Best Old Implementation was determined with the
    highest ISA implementation.

    wcscat-avx2     -> 0.975
    wcscpy-avx2     -> 0.591
    wcpcpy-avx2     -> 0.698
    wcsncpy-avx2    -> 0.730
    wcpncpy-avx2    -> 0.711
    wcsncat-avx2    -> 0.954

Code Size Changes:
    This change increases the size of libc.so by ~5.5kb. For
    reference, the patch optimizing the normal strcpy family functions
    decreases libc.so by ~5.2kb.

Full check passes on x86-64 and build succeeds for all ISA levels w/
and w/o multiarch.
---
 sysdeps/x86_64/multiarch/Makefile          |  6 +++++
 sysdeps/x86_64/multiarch/ifunc-impl-list.c | 28 ++++++++++++++++++++--
 sysdeps/x86_64/multiarch/ifunc-wcs.h       |  7 ++++++
 sysdeps/x86_64/multiarch/wcpcpy-avx2.S     |  8 +++++++
 sysdeps/x86_64/multiarch/wcpcpy-generic.c  |  2 +-
 sysdeps/x86_64/multiarch/wcpncpy-avx2.S    |  8 +++++++
 sysdeps/x86_64/multiarch/wcpncpy-generic.c |  2 +-
 sysdeps/x86_64/multiarch/wcscat-avx2.S     | 10 ++++++++
 sysdeps/x86_64/multiarch/wcscat-generic.c  |  2 +-
 sysdeps/x86_64/multiarch/wcscpy-avx2.S     |  7 ++++++
 sysdeps/x86_64/multiarch/wcscpy-generic.c  |  2 +-
 sysdeps/x86_64/multiarch/wcscpy.c          |  5 ++++
 sysdeps/x86_64/multiarch/wcsncat-avx2.S    |  9 +++++++
 sysdeps/x86_64/multiarch/wcsncat-generic.c |  2 +-
 sysdeps/x86_64/multiarch/wcsncpy-avx2.S    |  7 ++++++
 sysdeps/x86_64/multiarch/wcsncpy-generic.c |  2 +-
 sysdeps/x86_64/wcpcpy-generic.c            |  2 +-
 sysdeps/x86_64/wcpcpy.S                    |  3 ++-
 sysdeps/x86_64/wcpncpy-generic.c           |  2 +-
 sysdeps/x86_64/wcpncpy.S                   |  3 ++-
 sysdeps/x86_64/wcscat-generic.c            |  2 +-
 sysdeps/x86_64/wcscat.S                    |  3 ++-
 sysdeps/x86_64/wcscpy.S                    |  1 +
 sysdeps/x86_64/wcsncat-generic.c           |  2 +-
 sysdeps/x86_64/wcsncat.S                   |  3 ++-
 sysdeps/x86_64/wcsncpy-generic.c           |  2 +-
 sysdeps/x86_64/wcsncpy.S                   |  3 ++-
 27 files changed, 115 insertions(+), 18 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcscat-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcscpy-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncat-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-avx2.S

Comments

H.J. Lu Nov. 9, 2022, 3:01 a.m. UTC | #1
On Tue, Nov 08, 2022 at 05:38:41PM -0800, Noah Goldstein wrote:
> Implemented:
>     wcscat-avx2  (+ 744 bytes
>     wcscpy-avx2  (+ 539 bytes)
>     wcpcpy-avx2  (+ 577 bytes)
>     wcsncpy-avx2 (+1108 bytes)
>     wcpncpy-avx2 (+1214 bytes)
>     wcsncat-avx2 (+1085 bytes)
> 
> Performance Changes:
>     Times are from N = 10 runs of the benchmark suite and are reported
>     as geometric mean of all ratios of New Implementation / Best Old
>     Implementation. Best Old Implementation was determined with the
>     highest ISA implementation.
> 
>     wcscat-avx2     -> 0.975
>     wcscpy-avx2     -> 0.591
>     wcpcpy-avx2     -> 0.698
>     wcsncpy-avx2    -> 0.730
>     wcpncpy-avx2    -> 0.711
>     wcsncat-avx2    -> 0.954
> 
> Code Size Changes:
>     This change  increase the size of libc.so by ~5.5kb bytes. For
>     reference the patch optimizing the normal strcpy family functions
>     decreases libc.so by ~5.2kb.
> 
> Full check passes on x86-64 and build succeeds for all ISA levels w/
> and w/o multiarch.
> ---
>  sysdeps/x86_64/multiarch/Makefile          |  6 +++++
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c | 28 ++++++++++++++++++++--
>  sysdeps/x86_64/multiarch/ifunc-wcs.h       |  7 ++++++
>  sysdeps/x86_64/multiarch/wcpcpy-avx2.S     |  8 +++++++
>  sysdeps/x86_64/multiarch/wcpcpy-generic.c  |  2 +-
>  sysdeps/x86_64/multiarch/wcpncpy-avx2.S    |  8 +++++++
>  sysdeps/x86_64/multiarch/wcpncpy-generic.c |  2 +-
>  sysdeps/x86_64/multiarch/wcscat-avx2.S     | 10 ++++++++
>  sysdeps/x86_64/multiarch/wcscat-generic.c  |  2 +-
>  sysdeps/x86_64/multiarch/wcscpy-avx2.S     |  7 ++++++
>  sysdeps/x86_64/multiarch/wcscpy-generic.c  |  2 +-
>  sysdeps/x86_64/multiarch/wcscpy.c          |  5 ++++
>  sysdeps/x86_64/multiarch/wcsncat-avx2.S    |  9 +++++++
>  sysdeps/x86_64/multiarch/wcsncat-generic.c |  2 +-
>  sysdeps/x86_64/multiarch/wcsncpy-avx2.S    |  7 ++++++
>  sysdeps/x86_64/multiarch/wcsncpy-generic.c |  2 +-
>  sysdeps/x86_64/wcpcpy-generic.c            |  2 +-
>  sysdeps/x86_64/wcpcpy.S                    |  3 ++-
>  sysdeps/x86_64/wcpncpy-generic.c           |  2 +-
>  sysdeps/x86_64/wcpncpy.S                   |  3 ++-
>  sysdeps/x86_64/wcscat-generic.c            |  2 +-
>  sysdeps/x86_64/wcscat.S                    |  3 ++-
>  sysdeps/x86_64/wcscpy.S                    |  1 +
>  sysdeps/x86_64/wcsncat-generic.c           |  2 +-
>  sysdeps/x86_64/wcsncat.S                   |  3 ++-
>  sysdeps/x86_64/wcsncpy-generic.c           |  2 +-
>  sysdeps/x86_64/wcsncpy.S                   |  3 ++-
>  27 files changed, 115 insertions(+), 18 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/wcpcpy-avx2.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcpncpy-avx2.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcscat-avx2.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcscpy-avx2.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcsncat-avx2.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcsncpy-avx2.S
> 
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index d6e01940c3..e1e894c963 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -131,10 +131,13 @@ endif
>  
>  ifeq ($(subdir),wcsmbs)
>  sysdep_routines += \
> +  wcpcpy-avx2 \
>    wcpcpy-evex \
>    wcpcpy-generic \
> +  wcpncpy-avx2 \
>    wcpncpy-evex \
>    wcpncpy-generic \
> +  wcscat-avx2 \
>    wcscat-evex \
>    wcscat-generic \
>    wcschr-avx2 \
> @@ -146,6 +149,7 @@ sysdep_routines += \
>    wcscmp-avx2-rtm \
>    wcscmp-evex \
>    wcscmp-sse2 \
> +  wcscpy-avx2 \
>    wcscpy-evex \
>    wcscpy-generic \
>    wcscpy-ssse3 \
> @@ -155,11 +159,13 @@ sysdep_routines += \
>    wcslen-evex512 \
>    wcslen-sse2 \
>    wcslen-sse4_1 \
> +  wcsncat-avx2 \
>    wcsncat-evex \
>    wcsncat-generic \
>    wcsncmp-avx2 \
>    wcsncmp-avx2-rtm \
>    wcsncmp-evex \
> +  wcsncpy-avx2 \
>    wcsncpy-evex \
>    wcsncpy-generic \
>    wcsnlen-avx2 \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index c908d6c158..0c15dfebfd 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -907,6 +907,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  				      && CPU_FEATURE_USABLE (BMI2)),
>  				     __wcscpy_evex)
>  	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcscpy,
> +				     (CPU_FEATURE_USABLE (AVX2)
> +				      && CPU_FEATURE_USABLE (BMI2)),
> +				     __wcscpy_avx2)
> +	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcscpy,
>  				     CPU_FEATURE_USABLE (SSSE3),
>  				     __wcscpy_ssse3)
>  	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcscpy,
> @@ -920,7 +924,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  				      && CPU_FEATURE_USABLE (AVX512BW)
>  				      && CPU_FEATURE_USABLE (BMI2)),
>  				     __wcsncpy_evex)
> -	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpncpy,
> +	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncpy,
> +				     (CPU_FEATURE_USABLE (AVX2)
> +				      && CPU_FEATURE_USABLE (BMI2)),
> +				     __wcsncpy_avx2)
> +	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcpncpy,
>  				     1,
>  				     __wcsncpy_generic))
>  
> @@ -932,6 +940,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  				      && CPU_FEATURE_USABLE (BMI2)),
>  				     __wcpcpy_evex)
>  	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpcpy,
> +				     (CPU_FEATURE_USABLE (AVX2)
> +				      && CPU_FEATURE_USABLE (BMI2)),
> +				     __wcpcpy_avx2)
> +	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcpcpy,
>  				     1,
>  				     __wcpcpy_generic))
>  
> @@ -942,7 +954,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  				      && CPU_FEATURE_USABLE (AVX512BW)
>  				      && CPU_FEATURE_USABLE (BMI2)),
>  				     __wcpncpy_evex)
> -	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncpy,
> +	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpncpy,
> +				     (CPU_FEATURE_USABLE (AVX2)
> +				      && CPU_FEATURE_USABLE (BMI2)),
> +				     __wcpncpy_avx2)
> +	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcsncpy,
>  				     1,
>  				     __wcpncpy_generic))
>  
> @@ -954,6 +970,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  				      && CPU_FEATURE_USABLE (BMI2)),
>  				     __wcscat_evex)
>  	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcscat,
> +				     (CPU_FEATURE_USABLE (AVX2)
> +				      && CPU_FEATURE_USABLE (BMI2)),
> +				     __wcscat_avx2)
> +	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcscat,
>  				     1,
>  				     __wcscat_generic))
>  
> @@ -965,6 +985,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  				      && CPU_FEATURE_USABLE (BMI2)),
>  				     __wcsncat_evex)
>  	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncat,
> +				     (CPU_FEATURE_USABLE (AVX2)
> +				      && CPU_FEATURE_USABLE (BMI2)),
> +				     __wcsncat_avx2)
> +	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcsncat,
>  				     1,
>  				     __wcsncat_generic))
>  
> diff --git a/sysdeps/x86_64/multiarch/ifunc-wcs.h b/sysdeps/x86_64/multiarch/ifunc-wcs.h
> index 1d2a63458b..51194e620e 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-wcs.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-wcs.h
> @@ -27,6 +27,8 @@
>  
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
>  
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> +
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
>  
>  static inline void *
> @@ -42,6 +44,11 @@ IFUNC_SELECTOR (void)
>        if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
>  	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
>  	return OPTIMIZE (evex);
> +
> +      if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
> +				       Prefer_No_VZEROUPPER, !))
> +	return OPTIMIZE (avx2);
> +
>      }
>  
>    return OPTIMIZE (GENERIC);
> diff --git a/sysdeps/x86_64/multiarch/wcpcpy-avx2.S b/sysdeps/x86_64/multiarch/wcpcpy-avx2.S
> new file mode 100644
> index 0000000000..0fffd912d3
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcpcpy-avx2.S
> @@ -0,0 +1,8 @@
> +#ifndef WCPCPY
> +# define WCPCPY	__wcpcpy_avx2
> +#endif
> +
> +#define USE_AS_STPCPY
> +#define USE_AS_WCSCPY
> +#define STRCPY	WCPCPY
> +#include "strcpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/wcpcpy-generic.c b/sysdeps/x86_64/multiarch/wcpcpy-generic.c
> index 6039196a3e..0ba29b081f 100644
> --- a/sysdeps/x86_64/multiarch/wcpcpy-generic.c
> +++ b/sysdeps/x86_64/multiarch/wcpcpy-generic.c
> @@ -19,7 +19,7 @@
>  /* We always need to build this implementation as strspn-sse4 needs to
>     be able to fallback to it.  */
>  #include <isa-level.h>
> -#if ISA_SHOULD_BUILD (3)
> +#if ISA_SHOULD_BUILD (2)
>  
>  # define WCPCPY __wcpcpy_generic
>  # include <wcsmbs/wcpcpy.c>
> diff --git a/sysdeps/x86_64/multiarch/wcpncpy-avx2.S b/sysdeps/x86_64/multiarch/wcpncpy-avx2.S
> new file mode 100644
> index 0000000000..b7e594f7b7
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcpncpy-avx2.S
> @@ -0,0 +1,8 @@
> +#ifndef WCPNCPY
> +# define WCPNCPY	__wcpncpy_avx2
> +#endif
> +
> +#define USE_AS_WCSCPY
> +#define USE_AS_STPCPY
> +#define STRNCPY	WCPNCPY
> +#include "strncpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/wcpncpy-generic.c b/sysdeps/x86_64/multiarch/wcpncpy-generic.c
> index de8d34320e..4aab4ecdd2 100644
> --- a/sysdeps/x86_64/multiarch/wcpncpy-generic.c
> +++ b/sysdeps/x86_64/multiarch/wcpncpy-generic.c
> @@ -19,7 +19,7 @@
>  /* We always need to build this implementation as strspn-sse4 needs to
>     be able to fallback to it.  */
>  #include <isa-level.h>
> -#if ISA_SHOULD_BUILD (3)
> +#if ISA_SHOULD_BUILD (2)
>  
>  # define WCPNCPY __wcpncpy_generic
>  # include <wcsmbs/wcpncpy.c>
> diff --git a/sysdeps/x86_64/multiarch/wcscat-avx2.S b/sysdeps/x86_64/multiarch/wcscat-avx2.S
> new file mode 100644
> index 0000000000..a20f23c09d
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcscat-avx2.S
> @@ -0,0 +1,10 @@
> +#ifndef WCSCAT
> +# define WCSCAT	__wcscat_avx2
> +#endif
> +
> +#define USE_AS_WCSCPY
> +#define USE_AS_STRCAT
> +
> +#define STRCPY	WCSCAT
> +
> +#include "strcpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/wcscat-generic.c b/sysdeps/x86_64/multiarch/wcscat-generic.c
> index d86b4d5c00..6476f85bbb 100644
> --- a/sysdeps/x86_64/multiarch/wcscat-generic.c
> +++ b/sysdeps/x86_64/multiarch/wcscat-generic.c
> @@ -19,7 +19,7 @@
>  /* We always need to build this implementation as strspn-sse4 needs to
>     be able to fallback to it.  */
>  #include <isa-level.h>
> -#if ISA_SHOULD_BUILD (3)
> +#if ISA_SHOULD_BUILD (2)
>  
>  # define WCSCAT __wcscat_generic
>  # include <wcsmbs/wcscat.c>
> diff --git a/sysdeps/x86_64/multiarch/wcscpy-avx2.S b/sysdeps/x86_64/multiarch/wcscpy-avx2.S
> new file mode 100644
> index 0000000000..6bc509da07
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcscpy-avx2.S
> @@ -0,0 +1,7 @@
> +#ifndef WCSCPY
> +# define WCSCPY	__wcscpy_avx2
> +#endif
> +
> +#define USE_AS_WCSCPY
> +#define STRCPY	WCSCPY
> +#include "strcpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/wcscpy-generic.c b/sysdeps/x86_64/multiarch/wcscpy-generic.c
> index 4a1fffae4b..600d606c45 100644
> --- a/sysdeps/x86_64/multiarch/wcscpy-generic.c
> +++ b/sysdeps/x86_64/multiarch/wcscpy-generic.c
> @@ -18,7 +18,7 @@
>  
>  
>  #include <isa-level.h>
> -#if ISA_SHOULD_BUILD (3)
> +#if ISA_SHOULD_BUILD (2)
>  
>  # define WCSCPY  __wcscpy_generic
>  # include <wcsmbs/wcscpy.c>
> diff --git a/sysdeps/x86_64/multiarch/wcscpy.c b/sysdeps/x86_64/multiarch/wcscpy.c
> index 9ad77da8ac..e204059873 100644
> --- a/sysdeps/x86_64/multiarch/wcscpy.c
> +++ b/sysdeps/x86_64/multiarch/wcscpy.c
> @@ -28,6 +28,8 @@
>  
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
>  
> +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> +
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
>  
>  extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden;
> @@ -44,6 +46,9 @@ IFUNC_SELECTOR (void)
>        if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
>  	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
>  	return OPTIMIZE (evex);
> +
> +      if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER, !))
> +	return OPTIMIZE (avx2);
>      }
>  
>    if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
> diff --git a/sysdeps/x86_64/multiarch/wcsncat-avx2.S b/sysdeps/x86_64/multiarch/wcsncat-avx2.S
> new file mode 100644
> index 0000000000..a72105b7e9
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcsncat-avx2.S
> @@ -0,0 +1,9 @@
> +#ifndef WCSNCAT
> +# define WCSNCAT	__wcsncat_avx2
> +#endif
> +
> +#define USE_AS_WCSCPY
> +#define USE_AS_STRCAT
> +
> +#define STRNCAT	WCSNCAT
> +#include "strncat-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/wcsncat-generic.c b/sysdeps/x86_64/multiarch/wcsncat-generic.c
> index 4b55cb40bc..9ced02b35e 100644
> --- a/sysdeps/x86_64/multiarch/wcsncat-generic.c
> +++ b/sysdeps/x86_64/multiarch/wcsncat-generic.c
> @@ -19,7 +19,7 @@
>  /* We always need to build this implementation as strspn-sse4 needs to
>     be able to fallback to it.  */
>  #include <isa-level.h>
> -#if ISA_SHOULD_BUILD (3)
> +#if ISA_SHOULD_BUILD (2)
>  
>  # define WCSNCAT __wcsncat_generic
>  # include <wcsmbs/wcsncat.c>
> diff --git a/sysdeps/x86_64/multiarch/wcsncpy-avx2.S b/sysdeps/x86_64/multiarch/wcsncpy-avx2.S
> new file mode 100644
> index 0000000000..3a1a8a372c
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcsncpy-avx2.S
> @@ -0,0 +1,7 @@
> +#ifndef WCSNCPY
> +# define WCSNCPY	__wcsncpy_avx2
> +#endif
> +
> +#define USE_AS_WCSCPY
> +#define STRNCPY	WCSNCPY
> +#include "strncpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/wcsncpy-generic.c b/sysdeps/x86_64/multiarch/wcsncpy-generic.c
> index d0e8a86605..693521713b 100644
> --- a/sysdeps/x86_64/multiarch/wcsncpy-generic.c
> +++ b/sysdeps/x86_64/multiarch/wcsncpy-generic.c
> @@ -19,7 +19,7 @@
>  /* We always need to build this implementation as strspn-sse4 needs to
>     be able to fallback to it.  */
>  #include <isa-level.h>
> -#if ISA_SHOULD_BUILD (3)
> +#if ISA_SHOULD_BUILD (2)
>  
>  # define WCSNCPY __wcsncpy_generic
>  # include <wcsmbs/wcsncpy.c>
> diff --git a/sysdeps/x86_64/wcpcpy-generic.c b/sysdeps/x86_64/wcpcpy-generic.c
> index 3ddc98872f..4ab6182cd9 100644
> --- a/sysdeps/x86_64/wcpcpy-generic.c
> +++ b/sysdeps/x86_64/wcpcpy-generic.c
> @@ -24,7 +24,7 @@
>  
>  #include <isa-level.h>
>  
> -#if MINIMUM_X86_ISA_LEVEL <= 3
> +#if MINIMUM_X86_ISA_LEVEL <= 2
>  
>  # include <wcsmbs/wcpcpy.c>
>  
> diff --git a/sysdeps/x86_64/wcpcpy.S b/sysdeps/x86_64/wcpcpy.S
> index 4e4fca71eb..e64af6977f 100644
> --- a/sysdeps/x86_64/wcpcpy.S
> +++ b/sysdeps/x86_64/wcpcpy.S
> @@ -24,11 +24,12 @@
>  
>  #include <isa-level.h>
>  
> -#if MINIMUM_X86_ISA_LEVEL >= 4
> +#if MINIMUM_X86_ISA_LEVEL >= 3
>  
>  # define WCPCPY	__wcpcpy
>  
>  # define DEFAULT_IMPL_V4	"multiarch/wcpcpy-evex.S"
> +# define DEFAULT_IMPL_V3	"multiarch/wcpcpy-avx2.S"
>  /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
>     should never be used from here.  */
>  # define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
> diff --git a/sysdeps/x86_64/wcpncpy-generic.c b/sysdeps/x86_64/wcpncpy-generic.c
> index 0c76e5614c..18c0377d35 100644
> --- a/sysdeps/x86_64/wcpncpy-generic.c
> +++ b/sysdeps/x86_64/wcpncpy-generic.c
> @@ -24,7 +24,7 @@
>  
>  #include <isa-level.h>
>  
> -#if MINIMUM_X86_ISA_LEVEL <= 3
> +#if MINIMUM_X86_ISA_LEVEL <= 2
>  
>  # include <wcsmbs/wcpncpy.c>
>  
> diff --git a/sysdeps/x86_64/wcpncpy.S b/sysdeps/x86_64/wcpncpy.S
> index b4e531473e..0e0f432fbb 100644
> --- a/sysdeps/x86_64/wcpncpy.S
> +++ b/sysdeps/x86_64/wcpncpy.S
> @@ -24,11 +24,12 @@
>  
>  #include <isa-level.h>
>  
> -#if MINIMUM_X86_ISA_LEVEL >= 4
> +#if MINIMUM_X86_ISA_LEVEL >= 3
>  
>  # define WCPNCPY	__wcpncpy
>  
>  # define DEFAULT_IMPL_V4	"multiarch/wcpncpy-evex.S"
> +# define DEFAULT_IMPL_V3	"multiarch/wcpncpy-avx2.S"
>  /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
>     should never be used from here.  */
>  # define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
> diff --git a/sysdeps/x86_64/wcscat-generic.c b/sysdeps/x86_64/wcscat-generic.c
> index 512d0e4d43..639ceac523 100644
> --- a/sysdeps/x86_64/wcscat-generic.c
> +++ b/sysdeps/x86_64/wcscat-generic.c
> @@ -24,7 +24,7 @@
>  
>  #include <isa-level.h>
>  
> -#if MINIMUM_X86_ISA_LEVEL <= 3
> +#if MINIMUM_X86_ISA_LEVEL <= 2
>  
>  # include <wcsmbs/wcscat.c>
>  
> diff --git a/sysdeps/x86_64/wcscat.S b/sysdeps/x86_64/wcscat.S
> index ee8360b6e8..06130f58f9 100644
> --- a/sysdeps/x86_64/wcscat.S
> +++ b/sysdeps/x86_64/wcscat.S
> @@ -24,11 +24,12 @@
>  
>  #include <isa-level.h>
>  
> -#if MINIMUM_X86_ISA_LEVEL >= 4
> +#if MINIMUM_X86_ISA_LEVEL >= 3
>  
>  # define WCSCAT	__wcscat
>  
>  # define DEFAULT_IMPL_V4	"multiarch/wcscat-evex.S"
> +# define DEFAULT_IMPL_V3	"multiarch/wcscat-avx2.S"
>  /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
>     should never be used from here.  */
>  # define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
> diff --git a/sysdeps/x86_64/wcscpy.S b/sysdeps/x86_64/wcscpy.S
> index e403579961..4a859585a6 100644
> --- a/sysdeps/x86_64/wcscpy.S
> +++ b/sysdeps/x86_64/wcscpy.S
> @@ -29,6 +29,7 @@
>  # define WCSCPY	__wcscpy
>  
>  # define DEFAULT_IMPL_V4	"multiarch/wcscpy-evex.S"
> +# define DEFAULT_IMPL_V3	"multiarch/wcscpy-avx2.S"
>  # define DEFAULT_IMPL_V2	"multiarch/wcscpy-ssse3.S"
>  /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
>     should never be used from here.  */
> diff --git a/sysdeps/x86_64/wcsncat-generic.c b/sysdeps/x86_64/wcsncat-generic.c
> index 86e20d9028..57bdd9b7cf 100644
> --- a/sysdeps/x86_64/wcsncat-generic.c
> +++ b/sysdeps/x86_64/wcsncat-generic.c
> @@ -24,7 +24,7 @@
>  
>  #include <isa-level.h>
>  
> -#if MINIMUM_X86_ISA_LEVEL <= 3
> +#if MINIMUM_X86_ISA_LEVEL <= 2
>  
>  # include <wcsmbs/wcsncat.c>
>  
> diff --git a/sysdeps/x86_64/wcsncat.S b/sysdeps/x86_64/wcsncat.S
> index 090055a1b8..e1d8609651 100644
> --- a/sysdeps/x86_64/wcsncat.S
> +++ b/sysdeps/x86_64/wcsncat.S
> @@ -24,11 +24,12 @@
>  
>  #include <isa-level.h>
>  
> -#if MINIMUM_X86_ISA_LEVEL >= 4
> +#if MINIMUM_X86_ISA_LEVEL >= 3
>  
>  # define WCSNCAT	wcsncat
>  
>  # define DEFAULT_IMPL_V4	"multiarch/wcsncat-evex.S"
> +# define DEFAULT_IMPL_V3	"multiarch/wcsncat-avx2.S"
>  /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
>     should never be used from here.  */
>  # define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
> diff --git a/sysdeps/x86_64/wcsncpy-generic.c b/sysdeps/x86_64/wcsncpy-generic.c
> index 0f0ee65b65..4dcbd8ac7f 100644
> --- a/sysdeps/x86_64/wcsncpy-generic.c
> +++ b/sysdeps/x86_64/wcsncpy-generic.c
> @@ -24,7 +24,7 @@
>  
>  #include <isa-level.h>
>  
> -#if MINIMUM_X86_ISA_LEVEL <= 3
> +#if MINIMUM_X86_ISA_LEVEL <= 2
>  
>  # include <wcsmbs/wcsncpy.c>
>  
> diff --git a/sysdeps/x86_64/wcsncpy.S b/sysdeps/x86_64/wcsncpy.S
> index 32eaf1163b..f305b5eb9b 100644
> --- a/sysdeps/x86_64/wcsncpy.S
> +++ b/sysdeps/x86_64/wcsncpy.S
> @@ -24,11 +24,12 @@
>  
>  #include <isa-level.h>
>  
> -#if MINIMUM_X86_ISA_LEVEL >= 4
> +#if MINIMUM_X86_ISA_LEVEL >= 3
>  
>  # define WCSNCPY	__wcsncpy
>  
>  # define DEFAULT_IMPL_V4	"multiarch/wcsncpy-evex.S"
> +# define DEFAULT_IMPL_V3	"multiarch/wcsncpy-avx2.S"
>  /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
>     should never be used from here.  */
>  # define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
> -- 
> 2.34.1
> 

LGTM.

Thanks.

H.J.
diff mbox series

Patch

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index d6e01940c3..e1e894c963 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -131,10 +131,13 @@  endif
 
 ifeq ($(subdir),wcsmbs)
 sysdep_routines += \
+  wcpcpy-avx2 \
   wcpcpy-evex \
   wcpcpy-generic \
+  wcpncpy-avx2 \
   wcpncpy-evex \
   wcpncpy-generic \
+  wcscat-avx2 \
   wcscat-evex \
   wcscat-generic \
   wcschr-avx2 \
@@ -146,6 +149,7 @@  sysdep_routines += \
   wcscmp-avx2-rtm \
   wcscmp-evex \
   wcscmp-sse2 \
+  wcscpy-avx2 \
   wcscpy-evex \
   wcscpy-generic \
   wcscpy-ssse3 \
@@ -155,11 +159,13 @@  sysdep_routines += \
   wcslen-evex512 \
   wcslen-sse2 \
   wcslen-sse4_1 \
+  wcsncat-avx2 \
   wcsncat-evex \
   wcsncat-generic \
   wcsncmp-avx2 \
   wcsncmp-avx2-rtm \
   wcsncmp-evex \
+  wcsncpy-avx2 \
   wcsncpy-evex \
   wcsncpy-generic \
   wcsnlen-avx2 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index c908d6c158..0c15dfebfd 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -907,6 +907,10 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wcscpy_evex)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcscpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcscpy_avx2)
+	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcscpy,
 				     CPU_FEATURE_USABLE (SSSE3),
 				     __wcscpy_ssse3)
 	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcscpy,
@@ -920,7 +924,11 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (AVX512BW)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wcsncpy_evex)
-	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpncpy,
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcsncpy_avx2)
+	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcpncpy,
 				     1,
 				     __wcsncpy_generic))
 
@@ -932,6 +940,10 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wcpcpy_evex)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpcpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcpcpy_avx2)
+	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcpcpy,
 				     1,
 				     __wcpcpy_generic))
 
@@ -942,7 +954,11 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (AVX512BW)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wcpncpy_evex)
-	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncpy,
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcpncpy,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcpncpy_avx2)
+	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcsncpy,
 				     1,
 				     __wcpncpy_generic))
 
@@ -954,6 +970,10 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wcscat_evex)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcscat,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcscat_avx2)
+	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcscat,
 				     1,
 				     __wcscat_generic))
 
@@ -965,6 +985,10 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wcsncat_evex)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsncat,
+				     (CPU_FEATURE_USABLE (AVX2)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcsncat_avx2)
+	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcsncat,
 				     1,
 				     __wcsncat_generic))
 
diff --git a/sysdeps/x86_64/multiarch/ifunc-wcs.h b/sysdeps/x86_64/multiarch/ifunc-wcs.h
index 1d2a63458b..51194e620e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-wcs.h
+++ b/sysdeps/x86_64/multiarch/ifunc-wcs.h
@@ -27,6 +27,8 @@ 
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+
 extern __typeof (REDIRECT_NAME) OPTIMIZE (GENERIC) attribute_hidden;
 
 static inline void *
@@ -42,6 +44,11 @@  IFUNC_SELECTOR (void)
       if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
 	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
 	return OPTIMIZE (evex);
+
+      if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
+				       Prefer_No_VZEROUPPER, !))
+	return OPTIMIZE (avx2);
+
     }
 
   return OPTIMIZE (GENERIC);
diff --git a/sysdeps/x86_64/multiarch/wcpcpy-avx2.S b/sysdeps/x86_64/multiarch/wcpcpy-avx2.S
new file mode 100644
index 0000000000..0fffd912d3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpcpy-avx2.S
@@ -0,0 +1,8 @@ 
+#ifndef WCPCPY
+# define WCPCPY	__wcpcpy_avx2
+#endif
+
+#define USE_AS_STPCPY
+#define USE_AS_WCSCPY
+#define STRCPY	WCPCPY
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcpcpy-generic.c b/sysdeps/x86_64/multiarch/wcpcpy-generic.c
index 6039196a3e..0ba29b081f 100644
--- a/sysdeps/x86_64/multiarch/wcpcpy-generic.c
+++ b/sysdeps/x86_64/multiarch/wcpcpy-generic.c
@@ -19,7 +19,7 @@ 
 /* We always need to build this implementation as strspn-sse4 needs to
    be able to fallback to it.  */
 #include <isa-level.h>
-#if ISA_SHOULD_BUILD (3)
+#if ISA_SHOULD_BUILD (2)
 
 # define WCPCPY __wcpcpy_generic
 # include <wcsmbs/wcpcpy.c>
diff --git a/sysdeps/x86_64/multiarch/wcpncpy-avx2.S b/sysdeps/x86_64/multiarch/wcpncpy-avx2.S
new file mode 100644
index 0000000000..b7e594f7b7
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcpncpy-avx2.S
@@ -0,0 +1,8 @@ 
+#ifndef WCPNCPY
+# define WCPNCPY	__wcpncpy_avx2
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STPCPY
+#define STRNCPY	WCPNCPY
+#include "strncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcpncpy-generic.c b/sysdeps/x86_64/multiarch/wcpncpy-generic.c
index de8d34320e..4aab4ecdd2 100644
--- a/sysdeps/x86_64/multiarch/wcpncpy-generic.c
+++ b/sysdeps/x86_64/multiarch/wcpncpy-generic.c
@@ -19,7 +19,7 @@ 
 /* We always need to build this implementation as strspn-sse4 needs to
    be able to fallback to it.  */
 #include <isa-level.h>
-#if ISA_SHOULD_BUILD (3)
+#if ISA_SHOULD_BUILD (2)
 
 # define WCPNCPY __wcpncpy_generic
 # include <wcsmbs/wcpncpy.c>
diff --git a/sysdeps/x86_64/multiarch/wcscat-avx2.S b/sysdeps/x86_64/multiarch/wcscat-avx2.S
new file mode 100644
index 0000000000..a20f23c09d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscat-avx2.S
@@ -0,0 +1,10 @@ 
+#ifndef WCSCAT
+# define WCSCAT	__wcscat_avx2
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STRCAT
+
+#define STRCPY	WCSCAT
+
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcscat-generic.c b/sysdeps/x86_64/multiarch/wcscat-generic.c
index d86b4d5c00..6476f85bbb 100644
--- a/sysdeps/x86_64/multiarch/wcscat-generic.c
+++ b/sysdeps/x86_64/multiarch/wcscat-generic.c
@@ -19,7 +19,7 @@ 
 /* We always need to build this implementation as strspn-sse4 needs to
    be able to fallback to it.  */
 #include <isa-level.h>
-#if ISA_SHOULD_BUILD (3)
+#if ISA_SHOULD_BUILD (2)
 
 # define WCSCAT __wcscat_generic
 # include <wcsmbs/wcscat.c>
diff --git a/sysdeps/x86_64/multiarch/wcscpy-avx2.S b/sysdeps/x86_64/multiarch/wcscpy-avx2.S
new file mode 100644
index 0000000000..6bc509da07
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscpy-avx2.S
@@ -0,0 +1,7 @@ 
+#ifndef WCSCPY
+# define WCSCPY	__wcscpy_avx2
+#endif
+
+#define USE_AS_WCSCPY
+#define STRCPY	WCSCPY
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcscpy-generic.c b/sysdeps/x86_64/multiarch/wcscpy-generic.c
index 4a1fffae4b..600d606c45 100644
--- a/sysdeps/x86_64/multiarch/wcscpy-generic.c
+++ b/sysdeps/x86_64/multiarch/wcscpy-generic.c
@@ -18,7 +18,7 @@ 
 
 
 #include <isa-level.h>
-#if ISA_SHOULD_BUILD (3)
+#if ISA_SHOULD_BUILD (2)
 
 # define WCSCPY  __wcscpy_generic
 # include <wcsmbs/wcscpy.c>
diff --git a/sysdeps/x86_64/multiarch/wcscpy.c b/sysdeps/x86_64/multiarch/wcscpy.c
index 9ad77da8ac..e204059873 100644
--- a/sysdeps/x86_64/multiarch/wcscpy.c
+++ b/sysdeps/x86_64/multiarch/wcscpy.c
@@ -28,6 +28,8 @@ 
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden;
@@ -44,6 +46,9 @@  IFUNC_SELECTOR (void)
       if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
 	  && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
 	return OPTIMIZE (evex);
+
+      if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER, !))
+	return OPTIMIZE (avx2);
     }
 
   if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
diff --git a/sysdeps/x86_64/multiarch/wcsncat-avx2.S b/sysdeps/x86_64/multiarch/wcsncat-avx2.S
new file mode 100644
index 0000000000..a72105b7e9
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncat-avx2.S
@@ -0,0 +1,9 @@ 
+#ifndef WCSNCAT
+# define WCSNCAT	__wcsncat_avx2
+#endif
+
+#define USE_AS_WCSCPY
+#define USE_AS_STRCAT
+
+#define STRNCAT	WCSNCAT
+#include "strncat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncat-generic.c b/sysdeps/x86_64/multiarch/wcsncat-generic.c
index 4b55cb40bc..9ced02b35e 100644
--- a/sysdeps/x86_64/multiarch/wcsncat-generic.c
+++ b/sysdeps/x86_64/multiarch/wcsncat-generic.c
@@ -19,7 +19,7 @@ 
 /* We always need to build this implementation as strspn-sse4 needs to
    be able to fallback to it.  */
 #include <isa-level.h>
-#if ISA_SHOULD_BUILD (3)
+#if ISA_SHOULD_BUILD (2)
 
 # define WCSNCAT __wcsncat_generic
 # include <wcsmbs/wcsncat.c>
diff --git a/sysdeps/x86_64/multiarch/wcsncpy-avx2.S b/sysdeps/x86_64/multiarch/wcsncpy-avx2.S
new file mode 100644
index 0000000000..3a1a8a372c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncpy-avx2.S
@@ -0,0 +1,7 @@ 
+#ifndef WCSNCPY
+# define WCSNCPY	__wcsncpy_avx2
+#endif
+
+#define USE_AS_WCSCPY
+#define STRNCPY	WCSNCPY
+#include "strncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncpy-generic.c b/sysdeps/x86_64/multiarch/wcsncpy-generic.c
index d0e8a86605..693521713b 100644
--- a/sysdeps/x86_64/multiarch/wcsncpy-generic.c
+++ b/sysdeps/x86_64/multiarch/wcsncpy-generic.c
@@ -19,7 +19,7 @@ 
 /* We always need to build this implementation as strspn-sse4 needs to
    be able to fallback to it.  */
 #include <isa-level.h>
-#if ISA_SHOULD_BUILD (3)
+#if ISA_SHOULD_BUILD (2)
 
 # define WCSNCPY __wcsncpy_generic
 # include <wcsmbs/wcsncpy.c>
diff --git a/sysdeps/x86_64/wcpcpy-generic.c b/sysdeps/x86_64/wcpcpy-generic.c
index 3ddc98872f..4ab6182cd9 100644
--- a/sysdeps/x86_64/wcpcpy-generic.c
+++ b/sysdeps/x86_64/wcpcpy-generic.c
@@ -24,7 +24,7 @@ 
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL <= 3
+#if MINIMUM_X86_ISA_LEVEL <= 2
 
 # include <wcsmbs/wcpcpy.c>
 
diff --git a/sysdeps/x86_64/wcpcpy.S b/sysdeps/x86_64/wcpcpy.S
index 4e4fca71eb..e64af6977f 100644
--- a/sysdeps/x86_64/wcpcpy.S
+++ b/sysdeps/x86_64/wcpcpy.S
@@ -24,11 +24,12 @@ 
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL >= 4
+#if MINIMUM_X86_ISA_LEVEL >= 3
 
 # define WCPCPY	__wcpcpy
 
 # define DEFAULT_IMPL_V4	"multiarch/wcpcpy-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcpcpy-avx2.S"
 /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
    should never be used from here.  */
 # define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
diff --git a/sysdeps/x86_64/wcpncpy-generic.c b/sysdeps/x86_64/wcpncpy-generic.c
index 0c76e5614c..18c0377d35 100644
--- a/sysdeps/x86_64/wcpncpy-generic.c
+++ b/sysdeps/x86_64/wcpncpy-generic.c
@@ -24,7 +24,7 @@ 
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL <= 3
+#if MINIMUM_X86_ISA_LEVEL <= 2
 
 # include <wcsmbs/wcpncpy.c>
 
diff --git a/sysdeps/x86_64/wcpncpy.S b/sysdeps/x86_64/wcpncpy.S
index b4e531473e..0e0f432fbb 100644
--- a/sysdeps/x86_64/wcpncpy.S
+++ b/sysdeps/x86_64/wcpncpy.S
@@ -24,11 +24,12 @@ 
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL >= 4
+#if MINIMUM_X86_ISA_LEVEL >= 3
 
 # define WCPNCPY	__wcpncpy
 
 # define DEFAULT_IMPL_V4	"multiarch/wcpncpy-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcpncpy-avx2.S"
 /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
    should never be used from here.  */
 # define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
diff --git a/sysdeps/x86_64/wcscat-generic.c b/sysdeps/x86_64/wcscat-generic.c
index 512d0e4d43..639ceac523 100644
--- a/sysdeps/x86_64/wcscat-generic.c
+++ b/sysdeps/x86_64/wcscat-generic.c
@@ -24,7 +24,7 @@ 
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL <= 3
+#if MINIMUM_X86_ISA_LEVEL <= 2
 
 # include <wcsmbs/wcscat.c>
 
diff --git a/sysdeps/x86_64/wcscat.S b/sysdeps/x86_64/wcscat.S
index ee8360b6e8..06130f58f9 100644
--- a/sysdeps/x86_64/wcscat.S
+++ b/sysdeps/x86_64/wcscat.S
@@ -24,11 +24,12 @@ 
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL >= 4
+#if MINIMUM_X86_ISA_LEVEL >= 3
 
 # define WCSCAT	__wcscat
 
 # define DEFAULT_IMPL_V4	"multiarch/wcscat-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcscat-avx2.S"
 /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
    should never be used from here.  */
 # define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
diff --git a/sysdeps/x86_64/wcscpy.S b/sysdeps/x86_64/wcscpy.S
index e403579961..4a859585a6 100644
--- a/sysdeps/x86_64/wcscpy.S
+++ b/sysdeps/x86_64/wcscpy.S
@@ -29,6 +29,7 @@ 
 # define WCSCPY	__wcscpy
 
 # define DEFAULT_IMPL_V4	"multiarch/wcscpy-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcscpy-avx2.S"
 # define DEFAULT_IMPL_V2	"multiarch/wcscpy-ssse3.S"
 /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
    should never be used from here.  */
diff --git a/sysdeps/x86_64/wcsncat-generic.c b/sysdeps/x86_64/wcsncat-generic.c
index 86e20d9028..57bdd9b7cf 100644
--- a/sysdeps/x86_64/wcsncat-generic.c
+++ b/sysdeps/x86_64/wcsncat-generic.c
@@ -24,7 +24,7 @@ 
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL <= 3
+#if MINIMUM_X86_ISA_LEVEL <= 2
 
 # include <wcsmbs/wcsncat.c>
 
diff --git a/sysdeps/x86_64/wcsncat.S b/sysdeps/x86_64/wcsncat.S
index 090055a1b8..e1d8609651 100644
--- a/sysdeps/x86_64/wcsncat.S
+++ b/sysdeps/x86_64/wcsncat.S
@@ -24,11 +24,12 @@ 
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL >= 4
+#if MINIMUM_X86_ISA_LEVEL >= 3
 
 # define WCSNCAT	wcsncat
 
 # define DEFAULT_IMPL_V4	"multiarch/wcsncat-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcsncat-avx2.S"
 /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
    should never be used from here.  */
 # define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"
diff --git a/sysdeps/x86_64/wcsncpy-generic.c b/sysdeps/x86_64/wcsncpy-generic.c
index 0f0ee65b65..4dcbd8ac7f 100644
--- a/sysdeps/x86_64/wcsncpy-generic.c
+++ b/sysdeps/x86_64/wcsncpy-generic.c
@@ -24,7 +24,7 @@ 
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL <= 3
+#if MINIMUM_X86_ISA_LEVEL <= 2
 
 # include <wcsmbs/wcsncpy.c>
 
diff --git a/sysdeps/x86_64/wcsncpy.S b/sysdeps/x86_64/wcsncpy.S
index 32eaf1163b..f305b5eb9b 100644
--- a/sysdeps/x86_64/wcsncpy.S
+++ b/sysdeps/x86_64/wcsncpy.S
@@ -24,11 +24,12 @@ 
 
 #include <isa-level.h>
 
-#if MINIMUM_X86_ISA_LEVEL >= 4
+#if MINIMUM_X86_ISA_LEVEL >= 3
 
 # define WCSNCPY	__wcsncpy
 
 # define DEFAULT_IMPL_V4	"multiarch/wcsncpy-evex.S"
+# define DEFAULT_IMPL_V3	"multiarch/wcsncpy-avx2.S"
 /* isa-default-impl.h expects DEFAULT_IMPL_V1 to be defined but it
    should never be used from here.  */
 # define DEFAULT_IMPL_V1	"ERROR -- Invalid ISA IMPL"