@@ -18,15 +18,9 @@
#include <sysdep.h>
-#define VEC_SIZE 16
-#define VEC(i) xmm##i
#define PREFETCHNT prefetchnta
-#define VMOVNT movntdq
-/* Use movups and movaps for smaller code sizes. */
-#define VMOVU movups
-#define VMOVA movaps
-#define MOV_SIZE 3
-#define SECTION(p) p
+#include "multiarch/sse2-vecs.h"
+
#ifdef USE_MULTIARCH
# if !IS_IN (libc)
@@ -18,26 +18,19 @@
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
-#define USE_WITH_SSE2 1
-#define VEC_SIZE 16
-#define MOV_SIZE 3
-#define RET_SIZE 1
-
-#define VEC(i) xmm##i
-#define VMOVU movups
-#define VMOVA movaps
+#include "multiarch/sse2-vecs.h"
# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
- movd d, %xmm0; \
+ movd d, %VEC(0); \
movq r, %rax; \
- punpcklbw %xmm0, %xmm0; \
- punpcklwd %xmm0, %xmm0; \
- pshufd $0, %xmm0, %xmm0
+ punpcklbw %VEC(0), %VEC(0); \
+ punpcklwd %VEC(0), %VEC(0); \
+ pshufd $0, %VEC(0), %VEC(0)
# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
- movd d, %xmm0; \
- pshufd $0, %xmm0, %xmm0; \
+ movd d, %VEC(0); \
+ pshufd $0, %VEC(0), %VEC(0); \
movq r, %rax
# define MEMSET_VDUP_TO_VEC0_HIGH()
@@ -46,7 +39,6 @@
# define WMEMSET_VDUP_TO_VEC0_HIGH()
# define WMEMSET_VDUP_TO_VEC0_LOW()
-#define SECTION(p) p
#ifndef MEMSET_SYMBOL
# define MEMSET_CHK_SYMBOL(p,s) p
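
With sse2-vecs.h in effect, VEC(0) is meant to resolve to the plain xmm0 register (VEC maps to VEC_any_xmm in that config), so the rewritten broadcast macro above should expand to exactly the instructions the removed lines spelled out. A rough expansion, assuming that mapping:

	/* MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) under sse2-vecs.h
	   (sketch; assumes VEC(0) resolves to xmm0).  */
	movd	%esi, %xmm0
	movq	%rdi, %rax
	punpcklbw %xmm0, %xmm0
	punpcklwd %xmm0, %xmm0
	pshufd	$0, %xmm0, %xmm0
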
new file mode 100644
@@ -0,0 +1,33 @@
+/* Common config for AVX-RTM VECs
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _AVX_RTM_VECS_H
+#define _AVX_RTM_VECS_H 1
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#define USE_WITH_RTM 1
+#include "avx-vecs.h"
+
+#endif
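
The RTM config only overrides what differs inside a transaction (the xtest-guarded return sequence and the .avx.rtm section name) and then pulls in the base AVX config, so an RTM flavour of a string function shrinks to an include plus a symbol name. The memmove RTM wrapper later in this patch becomes essentially:

	#if IS_IN (libc)
	# include "avx-rtm-vecs.h"
	# define MEMMOVE_SYMBOL(p,s)	p##_avx_##s##_rtm
	# include "memmove-vec-unaligned-erms.S"
	#endif
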
new file mode 100644
@@ -0,0 +1,53 @@
+/* Common config for AVX VECs
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _AVX_VECS_H
+#define _AVX_VECS_H 1
+
+#ifdef HAS_VEC
+# error "Multiple VEC configs included!"
+#endif
+
+#define HAS_VEC 1
+#include "vec-macros.h"
+
+#ifndef USE_WITH_AVX2
+# define USE_WITH_AVX 1
+#endif
+/* Included by RTM version. */
+#ifndef SECTION
+# define SECTION(p) p##.avx
+#endif
+
+#define VEC_SIZE 32
+/* 4-byte mov instructions with AVX2. */
+#define MOV_SIZE 4
+/* 1 (ret) + 3 (vzeroupper). */
+#define RET_SIZE 4
+#define VZEROUPPER vzeroupper
+
+#define VMOVU vmovdqu
+#define VMOVA vmovdqa
+#define VMOVNT vmovntdq
+
+/* Often need to access xmm portion. */
+#define VEC_xmm VEC_any_xmm
+#define VEC VEC_any_ymm
+
+#endif
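
Each of these configs includes "vec-macros.h", which is not part of this hunk. Presumably it supplies the index-to-register helpers the configs choose between: VEC_any_* for the low registers 0-15 (encodable by SSE, VEX and EVEX alike) and VEC_hi_* for the EVEX-only bank 16-31. A minimal sketch of what it would need to provide (an assumption, not shown in this patch):

	/* vec-macros.h (sketch): map an index onto a concrete register.  */
	#define PRIMITIVE_VEC(vec, num)	vec##num

	/* Low bank: registers 0..15, usable with any encoding.  */
	#define VEC_any_xmm(i)	PRIMITIVE_VEC(xmm, i)
	#define VEC_any_ymm(i)	PRIMITIVE_VEC(ymm, i)
	#define VEC_any_zmm(i)	PRIMITIVE_VEC(zmm, i)

	/* High bank: registers 16..31, EVEX-only.  Relies on per-index
	   defines such as:  #define VEC_hi_ymm1 ymm17  */
	#define VEC_hi_xmm(i)	PRIMITIVE_VEC(VEC_hi_xmm, i)
	#define VEC_hi_ymm(i)	PRIMITIVE_VEC(VEC_hi_ymm, i)
	#define VEC_hi_zmm(i)	PRIMITIVE_VEC(VEC_hi_zmm, i)
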
new file mode 100644
@@ -0,0 +1,33 @@
+/* Common config for AVX2-RTM VECs
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _AVX2_RTM_VECS_H
+#define _AVX2_RTM_VECS_H 1
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#define USE_WITH_RTM 1
+#include "avx2-vecs.h"
+
+#endif
new file mode 100644
@@ -0,0 +1,30 @@
+/* Common config for AVX2 VECs
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _AVX2_VECS_H
+#define _AVX2_VECS_H 1
+
+#define USE_WITH_AVX2 1
+/* Included by RTM version. */
+#ifndef SECTION
+# define SECTION(p) p##.avx
+#endif
+#include "avx-vecs.h"
+
+#endif
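
The AVX2 config is deliberately thin: defining USE_WITH_AVX2 before including avx-vecs.h suppresses USE_WITH_AVX, and everything else (VEC_SIZE, the mov/ret sizes, the vmovdqu/vmovdqa/vmovntdq spellings, VEC mapping onto the low ymm bank) is inherited. Including avx2-rtm-vecs.h therefore ends up with roughly the following, assuming the register mapping sketched above:

	SECTION(p)  -> p##.avx.rtm		(avx2-rtm-vecs.h)
	USE_WITH_RTM=1, USE_WITH_AVX2=1
	VEC_SIZE=32, MOV_SIZE=4, RET_SIZE=4
	VZEROUPPER  -> vzeroupper		(avx-vecs.h)
	VEC(i)      -> ymm##i			(via VEC_any_ymm)
	VEC_xmm(i)  -> xmm##i			(via VEC_any_xmm)
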
new file mode 100644
@@ -0,0 +1,50 @@
+/* Common config for EVEX256 VECs
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _EVEX256_VECS_H
+#define _EVEX256_VECS_H 1
+
+#ifdef HAS_VEC
+# error "Multiple VEC configs included!"
+#endif
+
+#define HAS_VEC 1
+#include "vec-macros.h"
+
+#define USE_WITH_EVEX256 1
+#ifndef SECTION
+# define SECTION(p) p##.evex
+#endif
+
+#define VEC_SIZE 32
+/* 6-byte mov instructions with EVEX. */
+#define MOV_SIZE 6
+/* No vzeroupper needed. */
+#define RET_SIZE 1
+#define VZEROUPPER
+
+#define VMOVU vmovdqu64
+#define VMOVA vmovdqa64
+#define VMOVNT vmovntdq
+
+/* Often need to access xmm portion. */
+#define VEC_xmm VEC_hi_xmm
+#define VEC VEC_hi_ymm
+
+#endif
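
Unlike the AVX configs, the EVEX config routes VEC through VEC_hi_*, i.e. through the upper register bank that only EVEX encodings can name. Those registers do not take part in the AVX-SSE transition penalty, which is why VZEROUPPER stays empty and RET_SIZE drops back to 1. Under the mapping assumed above this gives, for example:

	VEC(0)      -> ymm16	(EVEX-only encoding)
	VEC(7)      -> ymm23
	VEC_xmm(0)  -> xmm16
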
new file mode 100644
@@ -0,0 +1,49 @@
+/* Common config for EVEX512 VECs
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _EVEX512_VECS_H
+#define _EVEX512_VECS_H 1
+
+#ifdef HAS_VEC
+# error "Multiple VEC configs included!"
+#endif
+
+#define HAS_VEC 1
+#include "vec-macros.h"
+
+#define USE_WITH_EVEX512 1
+#define SECTION(p) p##.evex512
+
+#define VEC_SIZE 64
+/* 6-byte mov instructions with EVEX. */
+#define MOV_SIZE 6
+/* No vzeroupper needed. */
+#define RET_SIZE 1
+#define VZEROUPPER
+
+#define VMOVU vmovdqu64
+#define VMOVA vmovdqa64
+#define VMOVNT vmovntdq
+
+/* Often need to access xmm/ymm portion. */
+#define VEC_xmm VEC_hi_xmm
+#define VEC_ymm VEC_hi_ymm
+#define VEC VEC_hi_zmm
+
+#endif
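
The EVEX512 config is the same idea with 64-byte vectors; it additionally exposes VEC_ymm so the short-length paths (e.g. the 32-63 byte cases in the memmove/memset bodies below) can keep issuing 32-byte accesses while staying in the upper bank. Under the assumed VEC_hi mapping:

	VEC(1)      -> zmm17
	VEC_ymm(1)  -> ymm17
	VEC_xmm(1)  -> xmm17
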
@@ -20,6 +20,11 @@
# include <sysdep.h>
+# include "evex256-vecs.h"
+# if VEC_SIZE != 32
+# error "VEC_SIZE != 32 unimplemented"
+# endif
+
# ifndef MEMCHR
# define MEMCHR __memchr_evex
# endif
@@ -28,12 +33,14 @@
# define VPBROADCAST vpbroadcastd
# define VPMINU vpminud
# define VPCMP vpcmpd
+# define VPTESTNM vptestnmd
# define VPCMPEQ vpcmpeqd
# define CHAR_SIZE 4
# else
# define VPBROADCAST vpbroadcastb
# define VPMINU vpminub
# define VPCMP vpcmpb
+# define VPTESTNM vptestnmb
# define VPCMPEQ vpcmpeqb
# define CHAR_SIZE 1
# endif
@@ -46,11 +53,11 @@
compared RET_SCALE of CHAR_SIZE is only relevant for the RTM
version. */
# ifdef USE_IN_RTM
-# define VZEROUPPER
+# define MEMCHR_VZEROUPPER
# define BASE_OFFSET (VEC_SIZE * 4)
# define RET_SCALE CHAR_SIZE
# else
-# define VZEROUPPER vzeroupper
+# define MEMCHR_VZEROUPPER vzeroupper
# define BASE_OFFSET 0
# define RET_SCALE 1
# endif
@@ -68,22 +75,12 @@
# define ALGN_PTR_REG rcx
# endif
-# define XMMZERO xmm23
-# define YMMZERO ymm23
-# define XMMMATCH xmm16
-# define YMMMATCH ymm16
-# define YMM1 ymm17
-# define YMM2 ymm18
-# define YMM3 ymm19
-# define YMM4 ymm20
-# define YMM5 ymm21
-# define YMM6 ymm22
-
-# ifndef SECTION
-# define SECTION(p) p##.evex
-# endif
+# define XMMZERO VEC_xmm(7)
+# define VECZERO VEC(7)
+
+# define XMMMATCH VEC_xmm(0)
+# define VECMATCH VEC(0)
-# define VEC_SIZE 32
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
# define PAGE_SIZE 4096
@@ -99,8 +96,8 @@ ENTRY (MEMCHR)
movl %edx, %edx
# endif
# endif
- /* Broadcast CHAR to YMMMATCH. */
- VPBROADCAST %esi, %YMMMATCH
+ /* Broadcast CHAR to VECMATCH. */
+ VPBROADCAST %esi, %VECMATCH
/* Check if we may cross page boundary with one vector load. */
movl %edi, %eax
andl $(PAGE_SIZE - 1), %eax
@@ -108,7 +105,7 @@ ENTRY (MEMCHR)
ja L(cross_page_boundary)
/* Check the first VEC_SIZE bytes. */
- VPCMP $0, (%rdi), %YMMMATCH, %k0
+ VPCMP $0, (%rdi), %VECMATCH, %k0
kmovd %k0, %eax
# ifndef USE_AS_RAWMEMCHR
/* If length < CHAR_PER_VEC handle special. */
@@ -155,7 +152,7 @@ L(cross_page_boundary):
/* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
for rawmemchr. */
andq $-VEC_SIZE, %ALGN_PTR_REG
- VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0
+ VPCMP $0, (%ALGN_PTR_REG), %VECMATCH, %k0
kmovd %k0, %r8d
# ifdef USE_AS_WMEMCHR
/* NB: Divide shift count by 4 since each bit in K0 represent 4
@@ -233,7 +230,7 @@ L(cross_page_continue):
L(cross_page_continue):
# endif
/* Load first VEC regardless. */
- VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
+ VPCMP $0, (VEC_SIZE)(%rdi), %VECMATCH, %k0
kmovd %k0, %eax
# ifndef USE_AS_RAWMEMCHR
/* Adjust length. If near end handle specially. */
@@ -243,17 +240,17 @@ L(cross_page_continue):
testl %eax, %eax
jnz L(first_vec_x1)
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %VECMATCH, %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x2)
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %VECMATCH, %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x3)
- VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %VECMATCH, %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x4)
@@ -289,7 +286,7 @@ L(cross_page_continue):
# else
/* copy ymmmatch to ymm0 so we can use vpcmpeq which is not
encodable with EVEX registers (ymm16-ymm31). */
- vmovdqa64 %YMMMATCH, %ymm0
+ vmovdqa64 %VECMATCH, %ymm0
# endif
/* Compare 4 * VEC at a time forward. */
@@ -305,23 +302,23 @@ L(loop_4x_vec):
# ifdef USE_IN_RTM
/* It would be possible to save some instructions using 4x VPCMP
but bottleneck on port 5 makes it not woth it. */
- VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
+ VPCMP $4, (VEC_SIZE * 4)(%rdi), %VECMATCH, %k1
/* xor will set bytes match esi to zero. */
- vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
- vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
- VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
+ vpxorq (VEC_SIZE * 5)(%rdi), %VECMATCH, %VEC(2)
+ vpxorq (VEC_SIZE * 6)(%rdi), %VECMATCH, %VEC(3)
+ VPCMP $0, (VEC_SIZE * 7)(%rdi), %VECMATCH, %k3
/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */
- VPMINU %YMM2, %YMM3, %YMM3{%k1}{z}
- VPCMP $0, %YMM3, %YMMZERO, %k2
+ VPMINU %VEC(2), %VEC(3), %VEC(3){%k1}{z}
+ VPCMP $0, %VEC(3), %VECZERO, %k2
# else
/* Since vptern can only take 3x vectors fastest to do 1 vec
seperately with EVEX vpcmp. */
# ifdef USE_AS_WMEMCHR
/* vptern can only accept masks for epi32/epi64 so can only save
instruction using not equals mask on vptern with wmemchr. */
- VPCMP $4, (%rdi), %YMMMATCH, %k1
+ VPCMP $4, (%rdi), %VECMATCH, %k1
# else
- VPCMP $0, (%rdi), %YMMMATCH, %k1
+ VPCMP $0, (%rdi), %VECMATCH, %k1
# endif
/* Compare 3x with vpcmpeq and or them all together with vptern.
*/
@@ -371,10 +368,10 @@ L(loop_4x_vec):
/* Fall through into less than 4 remaining vectors of length case.
*/
- VPCMP $0, BASE_OFFSET(%rdi), %YMMMATCH, %k0
+ VPCMP $0, BASE_OFFSET(%rdi), %VECMATCH, %k0
addq $(BASE_OFFSET - VEC_SIZE), %rdi
kmovd %k0, %eax
- VZEROUPPER
+ MEMCHR_VZEROUPPER
L(last_4x_vec_or_less):
/* Check if first VEC contained match. */
@@ -391,7 +388,7 @@ L(last_2x_vec):
jle L(zero_end)
/* Check VEC2 and compare any match with remaining length. */
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %VECMATCH, %k0
kmovd %k0, %eax
tzcntl %eax, %eax
cmpl %eax, %edx
@@ -437,7 +434,7 @@ L(loop_4x_vec_end):
jnz L(last_vec_x1_return)
# ifdef USE_IN_RTM
- VPCMP $0, %YMM2, %YMMZERO, %k0
+ VPCMP $0, %VEC(2), %VECZERO, %k0
kmovd %k0, %eax
# else
vpmovmskb %ymm2, %eax
@@ -460,7 +457,7 @@ L(loop_4x_vec_end):
orq %rcx, %rax
tzcntq %rax, %rax
leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax
- VZEROUPPER
+ MEMCHR_VZEROUPPER
# endif
ret
@@ -473,7 +470,7 @@ L(last_vec_x1_return):
# else
addq %rdi, %rax
# endif
- VZEROUPPER
+ MEMCHR_VZEROUPPER
ret
.p2align 4
@@ -483,7 +480,7 @@ L(last_vec_x2_return):
if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHAR and
USE_IN_RTM are both defined. Otherwise RET_SCALE = 1. */
leaq (VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax
- VZEROUPPER
+ MEMCHR_VZEROUPPER
ret
# ifdef USE_IN_RTM
@@ -497,7 +494,7 @@ L(last_vec_x3_return):
# ifndef USE_AS_RAWMEMCHR
L(last_4x_vec_or_less_cmpeq):
- VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
+ VPCMP $0, (VEC_SIZE * 5)(%rdi), %VECMATCH, %k0
kmovd %k0, %eax
subq $-(VEC_SIZE * 4), %rdi
/* Check first VEC regardless. */
@@ -510,13 +507,13 @@ L(last_4x_vec_or_less_cmpeq):
.p2align 4
L(last_4x_vec):
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %VECMATCH, %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(last_vec_x2)
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %VECMATCH, %k0
kmovd %k0, %eax
/* Create mask for possible matches within remaining length. */
# ifdef USE_AS_WMEMCHR
@@ -536,7 +533,7 @@ L(last_4x_vec):
jbe L(zero_end2)
- VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %VECMATCH, %k0
kmovd %k0, %eax
/* Shift remaining length mask for last VEC. */
# ifdef USE_AS_WMEMCHR
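
Note that the renamed memchr aliases are chosen to land on the same physical registers the file used before, assuming the VEC_hi mapping sketched earlier, so the conversion is intended to be a pure renaming:

	XMMMATCH = VEC_xmm(0) -> xmm16	(was xmm16)
	VECMATCH = VEC(0)     -> ymm16	(was YMMMATCH, ymm16)
	XMMZERO  = VEC_xmm(7) -> xmm23	(was xmm23)
	VECZERO  = VEC(7)     -> ymm23	(was YMMZERO, ymm23)
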
@@ -18,6 +18,11 @@
#if IS_IN (libc)
+# include "evex256-vecs.h"
+# if VEC_SIZE != 32
+# error "VEC_SIZE != 32 unimplemented"
+# endif
+
/* memcmp/wmemcmp is implemented as:
1. Use ymm vector compares when possible. The only case where
vector compares is not possible for when size < CHAR_PER_VEC
@@ -59,7 +64,6 @@ Latency:
# define MEMCMP __memcmp_evex_movbe
# endif
-# define VMOVU vmovdqu64
# ifdef USE_AS_WMEMCMP
# define VMOVU_MASK vmovdqu32
@@ -74,23 +78,9 @@ Latency:
# endif
-# define VEC_SIZE 32
# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
-# define XMM0 xmm16
-# define XMM1 xmm17
-# define XMM2 xmm18
-# define YMM0 ymm16
-# define XMM1 xmm17
-# define XMM2 xmm18
-# define YMM1 ymm17
-# define YMM2 ymm18
-# define YMM3 ymm19
-# define YMM4 ymm20
-# define YMM5 ymm21
-# define YMM6 ymm22
-
/* Warning!
wmemcmp has to use SIGNED comparison for elements.
memcmp has to use UNSIGNED comparison for elemnts.
@@ -115,8 +105,8 @@ ENTRY_P2ALIGN (MEMCMP, 6)
kmovd %ecx, %k2
/* Safe to load full ymm with mask. */
- VMOVU_MASK (%rsi), %YMM2{%k2}
- VPCMP $4,(%rdi), %YMM2, %k1{%k2}
+ VMOVU_MASK (%rsi), %VEC(2){%k2}
+ VPCMP $4,(%rdi), %VEC(2), %k1{%k2}
kmovd %k1, %eax
testl %eax, %eax
jnz L(return_vec_0)
@@ -144,9 +134,9 @@ L(return_vec_0):
.p2align 4
L(more_1x_vec):
/* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
- VMOVU (%rsi), %YMM1
+ VMOVU (%rsi), %VEC(1)
/* Use compare not equals to directly check for mismatch. */
- VPCMP $4,(%rdi), %YMM1, %k1
+ VPCMP $4,(%rdi), %VEC(1), %k1
kmovd %k1, %eax
/* NB: eax must be destination register if going to
L(return_vec_[0,2]). For L(return_vec_3) destination register
@@ -158,8 +148,8 @@ L(more_1x_vec):
jbe L(last_1x_vec)
/* Check second VEC no matter what. */
- VMOVU VEC_SIZE(%rsi), %YMM2
- VPCMP $4, VEC_SIZE(%rdi), %YMM2, %k1
+ VMOVU VEC_SIZE(%rsi), %VEC(2)
+ VPCMP $4, VEC_SIZE(%rdi), %VEC(2), %k1
kmovd %k1, %eax
testl %eax, %eax
jnz L(return_vec_1)
@@ -169,14 +159,14 @@ L(more_1x_vec):
jbe L(last_2x_vec)
/* Check third and fourth VEC no matter what. */
- VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
- VPCMP $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(3)
+ VPCMP $4,(VEC_SIZE * 2)(%rdi), %VEC(3), %k1
kmovd %k1, %eax
testl %eax, %eax
jnz L(return_vec_2)
- VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
- VPCMP $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(4)
+ VPCMP $4,(VEC_SIZE * 3)(%rdi), %VEC(4), %k1
kmovd %k1, %ecx
testl %ecx, %ecx
jnz L(return_vec_3)
@@ -189,8 +179,8 @@ L(more_1x_vec):
branches. */
/* Load first two VEC from s2 before adjusting addresses. */
- VMOVU -(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1
- VMOVU -(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2
+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %VEC(1)
+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VEC(2)
leaq -(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
leaq -(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
@@ -199,23 +189,23 @@ L(more_1x_vec):
/* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
will have some 1s. */
- vpxorq (%rdi), %YMM1, %YMM1
- vpxorq (VEC_SIZE)(%rdi), %YMM2, %YMM2
+ vpxorq (%rdi), %VEC(1), %VEC(1)
+ vpxorq (VEC_SIZE)(%rdi), %VEC(2), %VEC(2)
- VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
- vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(3)
+ vpxorq (VEC_SIZE * 2)(%rdi), %VEC(3), %VEC(3)
- VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
- /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
- oring with YMM1. Result is stored in YMM4. */
- vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(4)
+ /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with VEC(4) while
+ oring with VEC(1). Result is stored in VEC(4). */
+ vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %VEC(1), %VEC(4)
- /* Or together YMM2, YMM3, and YMM4 into YMM4. */
- vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+ /* Or together VEC(2), VEC(3), and VEC(4) into VEC(4). */
+ vpternlogd $0xfe, %VEC(2), %VEC(3), %VEC(4)
- /* Test YMM4 against itself. Store any CHAR mismatches in k1.
+ /* Test VEC(4) against itself. Store any CHAR mismatches in k1.
*/
- VPTEST %YMM4, %YMM4, %k1
+ VPTEST %VEC(4), %VEC(4), %k1
/* k1 must go to ecx for L(return_vec_0_1_2_3). */
kmovd %k1, %ecx
testl %ecx, %ecx
@@ -230,17 +220,17 @@ L(8x_end_return_vec_0_1_2_3):
L(8x_return_vec_0_1_2_3):
addq %rdi, %rsi
L(return_vec_0_1_2_3):
- VPTEST %YMM1, %YMM1, %k0
+ VPTEST %VEC(1), %VEC(1), %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(return_vec_0)
- VPTEST %YMM2, %YMM2, %k0
+ VPTEST %VEC(2), %VEC(2), %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(return_vec_1)
- VPTEST %YMM3, %YMM3, %k0
+ VPTEST %VEC(3), %VEC(3), %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(return_vec_2)
@@ -313,16 +303,16 @@ L(more_8x_vec):
.p2align 4
L(loop_4x_vec):
- VMOVU (%rsi, %rdi), %YMM1
- vpxorq (%rdi), %YMM1, %YMM1
- VMOVU VEC_SIZE(%rsi, %rdi), %YMM2
- vpxorq VEC_SIZE(%rdi), %YMM2, %YMM2
- VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3
- vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
- VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4
- vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
- vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
- VPTEST %YMM4, %YMM4, %k1
+ VMOVU (%rsi, %rdi), %VEC(1)
+ vpxorq (%rdi), %VEC(1), %VEC(1)
+ VMOVU VEC_SIZE(%rsi, %rdi), %VEC(2)
+ vpxorq VEC_SIZE(%rdi), %VEC(2), %VEC(2)
+ VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %VEC(3)
+ vpxorq (VEC_SIZE * 2)(%rdi), %VEC(3), %VEC(3)
+ VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %VEC(4)
+ vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %VEC(1), %VEC(4)
+ vpternlogd $0xfe, %VEC(2), %VEC(3), %VEC(4)
+ VPTEST %VEC(4), %VEC(4), %k1
kmovd %k1, %ecx
testl %ecx, %ecx
jnz L(8x_return_vec_0_1_2_3)
@@ -335,21 +325,21 @@ L(loop_4x_vec):
cmpl $(VEC_SIZE * 3), %edi
jae L(8x_last_1x_vec)
/* Load regardless of branch. */
- VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %YMM3
+ VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
cmpl $(VEC_SIZE * 2), %edi
jae L(8x_last_2x_vec)
- vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
+ vpxorq (VEC_SIZE * 2)(%rdx), %VEC(3), %VEC(3)
- VMOVU (%rsi, %rdx), %YMM1
- vpxorq (%rdx), %YMM1, %YMM1
+ VMOVU (%rsi, %rdx), %VEC(1)
+ vpxorq (%rdx), %VEC(1), %VEC(1)
- VMOVU VEC_SIZE(%rsi, %rdx), %YMM2
- vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2
- VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4
- vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
- vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
- VPTEST %YMM4, %YMM4, %k1
+ VMOVU VEC_SIZE(%rsi, %rdx), %VEC(2)
+ vpxorq VEC_SIZE(%rdx), %VEC(2), %VEC(2)
+ VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %VEC(4)
+ vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %VEC(1), %VEC(4)
+ vpternlogd $0xfe, %VEC(2), %VEC(3), %VEC(4)
+ VPTEST %VEC(4), %VEC(4), %k1
kmovd %k1, %ecx
testl %ecx, %ecx
jnz L(8x_end_return_vec_0_1_2_3)
@@ -359,14 +349,14 @@ L(loop_4x_vec):
/* Only entry is from L(more_8x_vec). */
.p2align 4,, 10
L(8x_last_2x_vec):
- VPCMP $4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1
+ VPCMP $4,(VEC_SIZE * 2)(%rdx), %VEC(3), %k1
kmovd %k1, %eax
testl %eax, %eax
jnz L(8x_return_vec_2)
/* Naturally aligned to 16 bytes. */
L(8x_last_1x_vec):
- VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM1
- VPCMP $4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1
+ VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
+ VPCMP $4,(VEC_SIZE * 3)(%rdx), %VEC(1), %k1
kmovd %k1, %eax
testl %eax, %eax
jnz L(8x_return_vec_3)
@@ -399,8 +389,8 @@ L(8x_return_vec_3):
.p2align 4,, 10
L(last_2x_vec):
/* Check second to last VEC. */
- VMOVU -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
- VPCMP $4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VEC(1)
+ VPCMP $4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %VEC(1), %k1
kmovd %k1, %eax
testl %eax, %eax
jnz L(return_vec_1_end)
@@ -408,8 +398,8 @@ L(last_2x_vec):
/* Check last VEC. */
.p2align 4
L(last_1x_vec):
- VMOVU -(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1
- VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
+ VMOVU -(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %VEC(1)
+ VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %VEC(1), %k1
kmovd %k1, %eax
testl %eax, %eax
jnz L(return_vec_0_end)
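
The vpternlogd immediates used in the memcmp loops here (and in __memcmpeq below) are 3-input truth tables over (destination, second source, memory operand): for vpternlogd $imm, mem, %src2, %dst the result bit for inputs (A, B, C) = (dst, src2, mem) is bit (A<<2 | B<<1 | C) of the immediate. Unpacking the three constants used in these files (this decoding is not spelled out in the patch):

	0xde:	dst = (dst ^ mem) | src2	/* xor with memory, fold in src2 */
	0xfe:	dst =  dst | src2 | mem		/* plain three-way or */
	0xf6:	dst =  dst | (src2 ^ mem)	/* or dst with (src2 xor memory) */

which is what the "ternary logic to xor ... while oring with ..." comments describe.
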
@@ -18,6 +18,11 @@
#if IS_IN (libc)
+# include "evex256-vecs.h"
+# if VEC_SIZE != 32
+# error "VEC_SIZE != 32 unimplemented"
+# endif
+
/* __memcmpeq is implemented as:
1. Use ymm vector compares when possible. The only case where
vector compares is not possible for when size < VEC_SIZE
@@ -40,21 +45,11 @@
# endif
# define VMOVU_MASK vmovdqu8
-# define VMOVU vmovdqu64
# define VPCMP vpcmpub
# define VPTEST vptestmb
-# define VEC_SIZE 32
# define PAGE_SIZE 4096
-# define YMM0 ymm16
-# define YMM1 ymm17
-# define YMM2 ymm18
-# define YMM3 ymm19
-# define YMM4 ymm20
-# define YMM5 ymm21
-# define YMM6 ymm22
-
.section .text.evex, "ax", @progbits
ENTRY_P2ALIGN (MEMCMPEQ, 6)
@@ -75,15 +70,15 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6)
/* Use masked loads as VEC_SIZE could page cross where length
(edx) would not. */
- VMOVU_MASK (%rsi), %YMM2{%k2}
- VPCMP $4,(%rdi), %YMM2, %k1{%k2}
+ VMOVU_MASK (%rsi), %VEC(2){%k2}
+ VPCMP $4,(%rdi), %VEC(2), %k1{%k2}
kmovd %k1, %eax
ret
L(last_1x_vec):
- VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM1
- VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1
+ VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %VEC(1)
+ VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx), %VEC(1), %k1
kmovd %k1, %eax
L(return_neq0):
ret
@@ -93,9 +88,9 @@ L(return_neq0):
.p2align 4
L(more_1x_vec):
/* From VEC + 1 to 2 * VEC. */
- VMOVU (%rsi), %YMM1
+ VMOVU (%rsi), %VEC(1)
/* Use compare not equals to directly check for mismatch. */
- VPCMP $4,(%rdi), %YMM1, %k1
+ VPCMP $4,(%rdi), %VEC(1), %k1
kmovd %k1, %eax
testl %eax, %eax
jnz L(return_neq0)
@@ -104,8 +99,8 @@ L(more_1x_vec):
jbe L(last_1x_vec)
/* Check second VEC no matter what. */
- VMOVU VEC_SIZE(%rsi), %YMM2
- VPCMP $4, VEC_SIZE(%rdi), %YMM2, %k1
+ VMOVU VEC_SIZE(%rsi), %VEC(2)
+ VPCMP $4, VEC_SIZE(%rdi), %VEC(2), %k1
kmovd %k1, %eax
testl %eax, %eax
jnz L(return_neq0)
@@ -115,14 +110,14 @@ L(more_1x_vec):
jbe L(last_2x_vec)
/* Check third and fourth VEC no matter what. */
- VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
- VPCMP $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(3)
+ VPCMP $4,(VEC_SIZE * 2)(%rdi), %VEC(3), %k1
kmovd %k1, %eax
testl %eax, %eax
jnz L(return_neq0)
- VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
- VPCMP $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(4)
+ VPCMP $4,(VEC_SIZE * 3)(%rdi), %VEC(4), %k1
kmovd %k1, %eax
testl %eax, %eax
jnz L(return_neq0)
@@ -134,8 +129,8 @@ L(more_1x_vec):
/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
branches. */
- VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %YMM1
- VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %YMM2
+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(1)
+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(2)
addq %rdx, %rdi
/* Wait to load from s1 until addressed adjust due to
@@ -143,22 +138,22 @@ L(more_1x_vec):
/* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
will have some 1s. */
- vpxorq -(VEC_SIZE * 4)(%rdi), %YMM1, %YMM1
- /* Ternary logic to xor -(VEC_SIZE * 3)(%rdi) with YMM2 while
- oring with YMM1. Result is stored in YMM1. */
- vpternlogd $0xde, -(VEC_SIZE * 3)(%rdi), %YMM1, %YMM2
-
- VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
- vpxorq -(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
- /* Or together YMM1, YMM2, and YMM3 into YMM3. */
- VMOVU -(VEC_SIZE)(%rsi, %rdx), %YMM4
- vpxorq -(VEC_SIZE)(%rdi), %YMM4, %YMM4
-
- /* Or together YMM2, YMM3, and YMM4 into YMM4. */
- vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
-
- /* Compare YMM4 with 0. If any 1s s1 and s2 don't match. */
- VPTEST %YMM4, %YMM4, %k1
+ vpxorq -(VEC_SIZE * 4)(%rdi), %VEC(1), %VEC(1)
+ /* Ternary logic to xor -(VEC_SIZE * 3)(%rdi) with VEC(2) while
+	   oring with VEC(1). Result is stored in VEC(2). */
+ vpternlogd $0xde, -(VEC_SIZE * 3)(%rdi), %VEC(1), %VEC(2)
+
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
+ vpxorq -(VEC_SIZE * 2)(%rdi), %VEC(3), %VEC(3)
+ /* Or together VEC(1), VEC(2), and VEC(3) into VEC(3). */
+ VMOVU -(VEC_SIZE)(%rsi, %rdx), %VEC(4)
+ vpxorq -(VEC_SIZE)(%rdi), %VEC(4), %VEC(4)
+
+ /* Or together VEC(2), VEC(3), and VEC(4) into VEC(4). */
+ vpternlogd $0xfe, %VEC(2), %VEC(3), %VEC(4)
+
+ /* Compare VEC(4) with 0. If any 1s s1 and s2 don't match. */
+ VPTEST %VEC(4), %VEC(4), %k1
kmovd %k1, %eax
ret
@@ -175,20 +170,20 @@ L(more_8x_vec):
subq $-(VEC_SIZE * 4), %rdi
.p2align 4
L(loop_4x_vec):
- VMOVU (%rsi, %rdi), %YMM1
- vpxorq (%rdi), %YMM1, %YMM1
+ VMOVU (%rsi, %rdi), %VEC(1)
+ vpxorq (%rdi), %VEC(1), %VEC(1)
- VMOVU VEC_SIZE(%rsi, %rdi), %YMM2
- vpternlogd $0xde,(VEC_SIZE)(%rdi), %YMM1, %YMM2
+ VMOVU VEC_SIZE(%rsi, %rdi), %VEC(2)
+ vpternlogd $0xde,(VEC_SIZE)(%rdi), %VEC(1), %VEC(2)
- VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3
- vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+ VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %VEC(3)
+ vpxorq (VEC_SIZE * 2)(%rdi), %VEC(3), %VEC(3)
- VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4
- vpxorq (VEC_SIZE * 3)(%rdi), %YMM4, %YMM4
+ VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %VEC(4)
+ vpxorq (VEC_SIZE * 3)(%rdi), %VEC(4), %VEC(4)
- vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
- VPTEST %YMM4, %YMM4, %k1
+ vpternlogd $0xfe, %VEC(2), %VEC(3), %VEC(4)
+ VPTEST %VEC(4), %VEC(4), %k1
kmovd %k1, %eax
testl %eax, %eax
jnz L(return_neq2)
@@ -197,40 +192,40 @@ L(loop_4x_vec):
jb L(loop_4x_vec)
subq %rdx, %rdi
- VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4
- vpxorq (VEC_SIZE * 3)(%rdx), %YMM4, %YMM4
+ VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %VEC(4)
+ vpxorq (VEC_SIZE * 3)(%rdx), %VEC(4), %VEC(4)
/* rdi has 4 * VEC_SIZE - remaining length. */
cmpl $(VEC_SIZE * 3), %edi
jae L(8x_last_1x_vec)
/* Load regardless of branch. */
- VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %YMM3
- /* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with YMM3 while
- oring with YMM4. Result is stored in YMM4. */
- vpternlogd $0xf6,(VEC_SIZE * 2)(%rdx), %YMM3, %YMM4
+ VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
+ /* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with VEC(3) while
+ oring with VEC(4). Result is stored in VEC(4). */
+ vpternlogd $0xf6,(VEC_SIZE * 2)(%rdx), %VEC(3), %VEC(4)
cmpl $(VEC_SIZE * 2), %edi
jae L(8x_last_2x_vec)
- VMOVU VEC_SIZE(%rsi, %rdx), %YMM2
- vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2
+ VMOVU VEC_SIZE(%rsi, %rdx), %VEC(2)
+ vpxorq VEC_SIZE(%rdx), %VEC(2), %VEC(2)
- VMOVU (%rsi, %rdx), %YMM1
- vpxorq (%rdx), %YMM1, %YMM1
+ VMOVU (%rsi, %rdx), %VEC(1)
+ vpxorq (%rdx), %VEC(1), %VEC(1)
- vpternlogd $0xfe, %YMM1, %YMM2, %YMM4
+ vpternlogd $0xfe, %VEC(1), %VEC(2), %VEC(4)
L(8x_last_1x_vec):
L(8x_last_2x_vec):
- VPTEST %YMM4, %YMM4, %k1
+ VPTEST %VEC(4), %VEC(4), %k1
kmovd %k1, %eax
L(return_neq2):
ret
.p2align 4,, 8
L(last_2x_vec):
- VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %YMM1
- vpxorq -(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1
- VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM2
- vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2
- VPTEST %YMM2, %YMM2, %k1
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(1)
+ vpxorq -(VEC_SIZE * 2)(%rdi, %rdx), %VEC(1), %VEC(1)
+ VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %VEC(2)
+ vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %VEC(1), %VEC(2)
+ VPTEST %VEC(2), %VEC(2), %k1
kmovd %k1, %eax
ret
@@ -1,16 +1,7 @@
#if IS_IN (libc)
-# define VEC_SIZE 32
-# define VEC(i) ymm##i
-# define VMOVNT vmovntdq
-# define VMOVU vmovdqu
-# define VMOVA vmovdqa
-# define MOV_SIZE 4
-# define ZERO_UPPER_VEC_REGISTERS_RETURN \
- ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
-# define VZEROUPPER_RETURN jmp L(return)
+# include "avx-rtm-vecs.h"
-# define SECTION(p) p##.avx.rtm
# define MEMMOVE_SYMBOL(p,s) p##_avx_##s##_rtm
# include "memmove-vec-unaligned-erms.S"
@@ -1,11 +1,7 @@
#if IS_IN (libc)
-# define VEC_SIZE 32
-# define VEC(i) ymm##i
-# define VMOVNT vmovntdq
-# define VMOVU vmovdqu
-# define VMOVA vmovdqa
-# define MOV_SIZE 4
-# define SECTION(p) p##.avx
+
+# include "avx-vecs.h"
+
# define MEMMOVE_SYMBOL(p,s) p##_avx_##s
# include "memmove-vec-unaligned-erms.S"
@@ -1,32 +1,7 @@
#if IS_IN (libc)
-# define VEC_SIZE 64
-# define XMM0 xmm16
-# define XMM1 xmm17
-# define YMM0 ymm16
-# define YMM1 ymm17
-# define VEC0 zmm16
-# define VEC1 zmm17
-# define VEC2 zmm18
-# define VEC3 zmm19
-# define VEC4 zmm20
-# define VEC5 zmm21
-# define VEC6 zmm22
-# define VEC7 zmm23
-# define VEC8 zmm24
-# define VEC9 zmm25
-# define VEC10 zmm26
-# define VEC11 zmm27
-# define VEC12 zmm28
-# define VEC13 zmm29
-# define VEC14 zmm30
-# define VEC15 zmm31
-# define VEC(i) VEC##i
-# define VMOVNT vmovntdq
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
-# define VZEROUPPER
-# define MOV_SIZE 6
-# define SECTION(p) p##.evex512
+
+# include "evex512-vecs.h"
+
# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s
# include "memmove-vec-unaligned-erms.S"
@@ -1,32 +1,7 @@
#if IS_IN (libc)
-# define VEC_SIZE 32
-# define XMM0 xmm16
-# define XMM1 xmm17
-# define YMM0 ymm16
-# define YMM1 ymm17
-# define VEC0 ymm16
-# define VEC1 ymm17
-# define VEC2 ymm18
-# define VEC3 ymm19
-# define VEC4 ymm20
-# define VEC5 ymm21
-# define VEC6 ymm22
-# define VEC7 ymm23
-# define VEC8 ymm24
-# define VEC9 ymm25
-# define VEC10 ymm26
-# define VEC11 ymm27
-# define VEC12 ymm28
-# define VEC13 ymm29
-# define VEC14 ymm30
-# define VEC15 ymm31
-# define VEC(i) VEC##i
-# define VMOVNT vmovntdq
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
-# define VZEROUPPER
-# define MOV_SIZE 6
-# define SECTION(p) p##.evex
+
+# include "evex256-vecs.h"
+
# define MEMMOVE_SYMBOL(p,s) p##_evex_##s
# include "memmove-vec-unaligned-erms.S"
@@ -60,21 +60,6 @@
# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif
-#ifndef XMM0
-# define XMM0 xmm0
-#endif
-
-#ifndef YMM0
-# define YMM0 ymm0
-#endif
-
-#ifndef VZEROUPPER
-# if VEC_SIZE > 16
-# define VZEROUPPER vzeroupper
-# else
-# define VZEROUPPER
-# endif
-#endif
/* Whether to align before movsb. Ultimately we want 64 byte
align and not worth it to load 4x VEC for VEC_SIZE == 16. */
@@ -322,7 +307,7 @@ L(start_erms):
VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(1)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), -VEC_SIZE(%rdi, %rdx)
-L(return):
+L(return_vzeroupper):
# if VEC_SIZE > 16
ZERO_UPPER_VEC_REGISTERS_RETURN
# else
@@ -403,10 +388,10 @@ L(between_16_31):
.p2align 4,, 10
L(between_32_63):
/* From 32 to 63. No branch when size == 32. */
- VMOVU (%rsi), %YMM0
- VMOVU -32(%rsi, %rdx), %YMM1
- VMOVU %YMM0, (%rdi)
- VMOVU %YMM1, -32(%rdi, %rdx)
+ VMOVU (%rsi), %VEC_ymm(0)
+ VMOVU -32(%rsi, %rdx), %VEC_ymm(1)
+ VMOVU %VEC_ymm(0), (%rdi)
+ VMOVU %VEC_ymm(1), -32(%rdi, %rdx)
VZEROUPPER_RETURN
#endif
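
The between_32_63 path above is the one place the 64-byte build needs a 32-byte register, which is why the generic code now says VEC_ymm(0)/VEC_ymm(1) instead of the hard-coded YMM0/YMM1; the block sits under a vector-size guard (note the #endif above), and evex512-vecs.h is the config that provides VEC_ymm. With the assumed mapping the loads still stay in the upper bank, so no vzeroupper is needed, e.g.:

	VMOVU	(%rsi), %VEC_ymm(0)	/* evex512: vmovdqu64 (%rsi), %ymm16 */
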
@@ -1,9 +1,4 @@
-#define ZERO_UPPER_VEC_REGISTERS_RETURN \
- ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
-
-#define VZEROUPPER_RETURN jmp L(return)
-
-#define SECTION(p) p##.avx.rtm
+#include "avx2-rtm-vecs.h"
#define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm
#define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm
@@ -1,27 +1,19 @@
#if IS_IN (libc)
-# define USE_WITH_AVX2 1
-# define VEC_SIZE 32
-# define MOV_SIZE 4
-# define RET_SIZE 4
-
-# define VEC(i) ymm##i
-
-# define VMOVU vmovdqu
-# define VMOVA vmovdqa
+# include "avx2-vecs.h"
# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
- vmovd d, %xmm0; \
+ vmovd d, %VEC_xmm(0); \
movq r, %rax;
# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
-# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
-# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
+# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %VEC_xmm(0), %VEC(0)
+# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %VEC_xmm(0), %VEC_xmm(0)
-# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
-# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
+# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %VEC_xmm(0), %VEC(0)
+# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %VEC_xmm(0), %VEC_xmm(0)
# ifndef SECTION
# define SECTION(p) p##.avx
@@ -1,26 +1,13 @@
#if IS_IN (libc)
-# define USE_WITH_AVX512 1
-# define VEC_SIZE 64
-# define MOV_SIZE 6
-# define RET_SIZE 1
-
-# define XMM0 xmm16
-# define YMM0 ymm16
-# define VEC0 zmm16
-# define VEC(i) VEC##i
-
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
-
-# define VZEROUPPER
+# include "evex512-vecs.h"
# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
- vpbroadcastb d, %VEC0; \
+ vpbroadcastb d, %VEC(0); \
movq r, %rax
# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
- vpbroadcastd d, %VEC0; \
+ vpbroadcastd d, %VEC(0); \
movq r, %rax
# define MEMSET_VDUP_TO_VEC0_HIGH()
@@ -29,7 +16,6 @@
# define WMEMSET_VDUP_TO_VEC0_HIGH()
# define WMEMSET_VDUP_TO_VEC0_LOW()
-# define SECTION(p) p##.evex512
# define MEMSET_SYMBOL(p,s) p##_avx512_##s
# define WMEMSET_SYMBOL(p,s) p##_avx512_##s
# define USE_LESS_VEC_MASK_STORE 1
@@ -1,26 +1,13 @@
#if IS_IN (libc)
-# define USE_WITH_EVEX 1
-# define VEC_SIZE 32
-# define MOV_SIZE 6
-# define RET_SIZE 1
-
-# define XMM0 xmm16
-# define YMM0 ymm16
-# define VEC0 ymm16
-# define VEC(i) VEC##i
-
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
-
-# define VZEROUPPER
+# include "evex256-vecs.h"
# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
- vpbroadcastb d, %VEC0; \
+ vpbroadcastb d, %VEC(0); \
movq r, %rax
# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
- vpbroadcastd d, %VEC0; \
+ vpbroadcastd d, %VEC(0); \
movq r, %rax
# define MEMSET_VDUP_TO_VEC0_HIGH()
@@ -29,7 +16,6 @@
# define WMEMSET_VDUP_TO_VEC0_HIGH()
# define WMEMSET_VDUP_TO_VEC0_LOW()
-# define SECTION(p) p##.evex
# define MEMSET_SYMBOL(p,s) p##_evex_##s
# define WMEMSET_SYMBOL(p,s) p##_evex_##s
# define USE_LESS_VEC_MASK_STORE 1
@@ -34,27 +34,6 @@
# define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s)
#endif
-#ifndef XMM0
-# define XMM0 xmm0
-#endif
-
-#ifndef YMM0
-# define YMM0 ymm0
-#endif
-
-#ifndef VZEROUPPER
-# if VEC_SIZE > 16
-# define VZEROUPPER vzeroupper
-# define VZEROUPPER_SHORT_RETURN vzeroupper; ret
-# else
-# define VZEROUPPER
-# endif
-#endif
-
-#ifndef VZEROUPPER_SHORT_RETURN
-# define VZEROUPPER_SHORT_RETURN rep; ret
-#endif
-
#ifndef MOVQ
# if VEC_SIZE > 16
# define MOVQ vmovq
@@ -71,7 +50,7 @@
# define LOOP_4X_OFFSET (0)
#endif
-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+#if defined USE_WITH_EVEX256 || defined USE_WITH_EVEX512
# define END_REG rcx
# define LOOP_REG rdi
# define LESS_VEC_REG rax
@@ -222,7 +201,7 @@ L(last_2x_vec):
#endif
VZEROUPPER_RETURN
- /* If have AVX512 mask instructions put L(less_vec) close to
+	/* If we have EVEX512 mask instructions, put L(less_vec) close to
entry as it doesn't take much space and is likely a hot target.
*/
#ifdef USE_LESS_VEC_MASK_STORE
@@ -285,13 +264,13 @@ L(more_2x_vec):
/* Two different methods of setting up pointers / compare. The two
- methods are based on the fact that EVEX/AVX512 mov instructions take
- more bytes then AVX2/SSE2 mov instructions. As well that EVEX/AVX512
+ methods are based on the fact that EVEX/EVEX512 mov instructions take
+	   more bytes than AVX2/SSE2 mov instructions. In addition, EVEX/EVEX512
machines also have fast LEA_BID. Both setup and END_REG to avoid complex
- address mode. For EVEX/AVX512 this saves code size and keeps a few
+ address mode. For EVEX/EVEX512 this saves code size and keeps a few
targets in one fetch block. For AVX2/SSE2 this helps prevent AGU
bottlenecks. */
-#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
+#if !(defined USE_WITH_EVEX256 || defined USE_WITH_EVEX512)
/* If AVX2/SSE2 compute END_REG (rdi) with ALU. */
addq %rdx, %END_REG
#endif
@@ -300,11 +279,11 @@ L(more_2x_vec):
jbe L(last_2x_vec)
-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
- /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
+#if defined USE_WITH_EVEX256 || defined USE_WITH_EVEX512
+ /* If EVEX/EVEX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
LEA_BID. */
- /* END_REG is rcx for EVEX/AVX512. */
+ /* END_REG is rcx for EVEX/EVEX512. */
leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
#endif
@@ -313,9 +292,9 @@ L(more_2x_vec):
VMOVU %VEC(0), (VEC_SIZE * 3)(%rax)
-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+#if defined USE_WITH_EVEX256 || defined USE_WITH_EVEX512
/* If LOOP_4X_OFFSET don't readjust LOOP_REG (rdi), just add
- extra offset to addresses in loop. Used for AVX512 to save space
+ extra offset to addresses in loop. Used for EVEX512 to save space
as no way to get (VEC_SIZE * 4) in imm8. */
# if LOOP_4X_OFFSET == 0
subq $-(VEC_SIZE * 4), %LOOP_REG
@@ -327,7 +306,7 @@ L(more_2x_vec):
cmpq $(VEC_SIZE * 8), %rdx
#endif
jbe L(last_4x_vec)
-#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
+#if !(defined USE_WITH_EVEX256 || defined USE_WITH_EVEX512)
/* Set LOOP_REG (rdx). */
leaq (VEC_SIZE * 4)(%rax), %LOOP_REG
#endif
@@ -348,7 +327,7 @@ L(last_4x_vec):
VMOVU %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
-L(return):
+L(return_vzeroupper):
#if VEC_SIZE > 16
ZERO_UPPER_VEC_REGISTERS_RETURN
#else
@@ -386,7 +365,7 @@ L(cross_page):
jge L(between_16_31)
#endif
#ifndef USE_XMM_LESS_VEC
- MOVQ %XMM0, %SET_REG64
+ MOVQ %VEC_xmm(0), %SET_REG64
#endif
cmpl $8, %edx
jge L(between_8_15)
@@ -405,8 +384,8 @@ L(between_0_0):
.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
/* From 32 to 63. No branch when size == 32. */
L(between_32_63):
- VMOVU %YMM0, (%LESS_VEC_REG)
- VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx)
+ VMOVU %VEC_ymm(0), (%LESS_VEC_REG)
+ VMOVU %VEC_ymm(0), -32(%LESS_VEC_REG, %rdx)
VZEROUPPER_RETURN
#endif
@@ -414,33 +393,33 @@ L(between_32_63):
.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
L(between_16_31):
/* From 16 to 31. No branch when size == 16. */
- VMOVU %XMM0, (%LESS_VEC_REG)
- VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx)
+ VMOVU %VEC_xmm(0), (%LESS_VEC_REG)
+ VMOVU %VEC_xmm(0), -16(%LESS_VEC_REG, %rdx)
ret
#endif
- /* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+ /* Move size is 3 for SSE2, EVEX, and EVEX512. Move size is 4 for AVX2.
*/
.p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
L(between_8_15):
/* From 8 to 15. No branch when size == 8. */
#ifdef USE_XMM_LESS_VEC
- MOVQ %XMM0, (%rdi)
- MOVQ %XMM0, -8(%rdi, %rdx)
+ MOVQ %VEC_xmm(0), (%rdi)
+ MOVQ %VEC_xmm(0), -8(%rdi, %rdx)
#else
movq %SET_REG64, (%LESS_VEC_REG)
movq %SET_REG64, -8(%LESS_VEC_REG, %rdx)
#endif
ret
- /* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+ /* Move size is 2 for SSE2, EVEX, and EVEX512. Move size is 4 for AVX2.
*/
.p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
L(between_4_7):
/* From 4 to 7. No branch when size == 4. */
#ifdef USE_XMM_LESS_VEC
- MOVD %XMM0, (%rdi)
- MOVD %XMM0, -4(%rdi, %rdx)
+ MOVD %VEC_xmm(0), (%rdi)
+ MOVD %VEC_xmm(0), -4(%rdi, %rdx)
#else
movl %SET_REG32, (%LESS_VEC_REG)
movl %SET_REG32, -4(%LESS_VEC_REG, %rdx)
new file mode 100644
@@ -0,0 +1,48 @@
+/* Common config for SSE2 VECs
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _SSE2_VECS_H
+#define _SSE2_VECS_H 1
+
+#ifdef HAS_VEC
+# error "Multiple VEC configs included!"
+#endif
+
+#define HAS_VEC 1
+#include "vec-macros.h"
+
+#define USE_WITH_SSE2 1
+#define SECTION(p) p
+
+#define VEC_SIZE 16
+/* 3-byte mov instructions with SSE2. */
+#define MOV_SIZE 3
+/* No vzeroupper needed. */
+#define RET_SIZE 1
+
+#define VMOVU movups
+#define VMOVA movaps
+#define VMOVNT movntdq
+#define VZEROUPPER
+
+#define VEC_xmm VEC_any_xmm
+#define VEC VEC_any_xmm
+
+
+#endif
@@ -20,24 +20,21 @@
# include <sysdep.h>
+# include "evex256-vecs.h"
+# if VEC_SIZE != 32
+# error "VEC_SIZE != 32 unimplemented"
+# endif
+
# ifndef STRCAT
# define STRCAT __strcat_evex
# endif
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
-
/* zero register */
-# define XMMZERO xmm16
-# define YMMZERO ymm16
-# define YMM0 ymm17
-# define YMM1 ymm18
+# define XMMZERO VEC_xmm(0)
+# define VECZERO VEC(0)
# define USE_AS_STRCAT
-/* Number of bytes in a vector register */
-# define VEC_SIZE 32
-
.section .text.evex,"ax",@progbits
ENTRY (STRCAT)
mov %rdi, %r9
@@ -51,7 +48,7 @@ ENTRY (STRCAT)
vpxorq %XMMZERO, %XMMZERO, %XMMZERO
cmp $(VEC_SIZE * 3), %ecx
ja L(fourth_vector_boundary)
- vpcmpb $0, (%rdi), %YMMZERO, %k0
+ vpcmpb $0, (%rdi), %VECZERO, %k0
kmovd %k0, %edx
test %edx, %edx
jnz L(exit_null_on_first_vector)
@@ -61,7 +58,7 @@ ENTRY (STRCAT)
L(fourth_vector_boundary):
mov %rdi, %rax
and $-VEC_SIZE, %rax
- vpcmpb $0, (%rax), %YMMZERO, %k0
+ vpcmpb $0, (%rax), %VECZERO, %k0
mov $-1, %r10d
sub %rax, %rcx
shl %cl, %r10d
@@ -70,85 +67,85 @@ L(fourth_vector_boundary):
jnz L(exit)
L(align_vec_size_start):
- vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0
+ vpcmpb $0, VEC_SIZE(%rax), %VECZERO, %k0
kmovd %k0, %edx
test %edx, %edx
jnz L(exit_null_on_second_vector)
- vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
+ vpcmpb $0, (VEC_SIZE * 2)(%rax), %VECZERO, %k1
kmovd %k1, %edx
test %edx, %edx
jnz L(exit_null_on_third_vector)
- vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
+ vpcmpb $0, (VEC_SIZE * 3)(%rax), %VECZERO, %k2
kmovd %k2, %edx
test %edx, %edx
jnz L(exit_null_on_fourth_vector)
- vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
+ vpcmpb $0, (VEC_SIZE * 4)(%rax), %VECZERO, %k3
kmovd %k3, %edx
test %edx, %edx
jnz L(exit_null_on_fifth_vector)
- vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
+ vpcmpb $0, (VEC_SIZE * 5)(%rax), %VECZERO, %k4
add $(VEC_SIZE * 4), %rax
kmovd %k4, %edx
test %edx, %edx
jnz L(exit_null_on_second_vector)
- vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
+ vpcmpb $0, (VEC_SIZE * 2)(%rax), %VECZERO, %k1
kmovd %k1, %edx
test %edx, %edx
jnz L(exit_null_on_third_vector)
- vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
+ vpcmpb $0, (VEC_SIZE * 3)(%rax), %VECZERO, %k2
kmovd %k2, %edx
test %edx, %edx
jnz L(exit_null_on_fourth_vector)
- vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
+ vpcmpb $0, (VEC_SIZE * 4)(%rax), %VECZERO, %k3
kmovd %k3, %edx
test %edx, %edx
jnz L(exit_null_on_fifth_vector)
- vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
+ vpcmpb $0, (VEC_SIZE * 5)(%rax), %VECZERO, %k4
kmovd %k4, %edx
add $(VEC_SIZE * 4), %rax
test %edx, %edx
jnz L(exit_null_on_second_vector)
- vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
+ vpcmpb $0, (VEC_SIZE * 2)(%rax), %VECZERO, %k1
kmovd %k1, %edx
test %edx, %edx
jnz L(exit_null_on_third_vector)
- vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
+ vpcmpb $0, (VEC_SIZE * 3)(%rax), %VECZERO, %k2
kmovd %k2, %edx
test %edx, %edx
jnz L(exit_null_on_fourth_vector)
- vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
+ vpcmpb $0, (VEC_SIZE * 4)(%rax), %VECZERO, %k3
kmovd %k3, %edx
test %edx, %edx
jnz L(exit_null_on_fifth_vector)
- vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
+ vpcmpb $0, (VEC_SIZE * 5)(%rax), %VECZERO, %k4
add $(VEC_SIZE * 4), %rax
kmovd %k4, %edx
test %edx, %edx
jnz L(exit_null_on_second_vector)
- vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
+ vpcmpb $0, (VEC_SIZE * 2)(%rax), %VECZERO, %k1
kmovd %k1, %edx
test %edx, %edx
jnz L(exit_null_on_third_vector)
- vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
+ vpcmpb $0, (VEC_SIZE * 3)(%rax), %VECZERO, %k2
kmovd %k2, %edx
test %edx, %edx
jnz L(exit_null_on_fourth_vector)
- vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
+ vpcmpb $0, (VEC_SIZE * 4)(%rax), %VECZERO, %k3
kmovd %k3, %edx
test %edx, %edx
jnz L(exit_null_on_fifth_vector)
@@ -156,7 +153,7 @@ L(align_vec_size_start):
test $((VEC_SIZE * 4) - 1), %rax
jz L(align_four_vec_loop)
- vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
+ vpcmpb $0, (VEC_SIZE * 5)(%rax), %VECZERO, %k4
add $(VEC_SIZE * 5), %rax
kmovd %k4, %edx
test %edx, %edx
@@ -165,7 +162,7 @@ L(align_vec_size_start):
test $((VEC_SIZE * 4) - 1), %rax
jz L(align_four_vec_loop)
- vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0
+ vpcmpb $0, VEC_SIZE(%rax), %VECZERO, %k0
add $VEC_SIZE, %rax
kmovd %k0, %edx
test %edx, %edx
@@ -174,7 +171,7 @@ L(align_vec_size_start):
test $((VEC_SIZE * 4) - 1), %rax
jz L(align_four_vec_loop)
- vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0
+ vpcmpb $0, VEC_SIZE(%rax), %VECZERO, %k0
add $VEC_SIZE, %rax
kmovd %k0, %edx
test %edx, %edx
@@ -183,7 +180,7 @@ L(align_vec_size_start):
test $((VEC_SIZE * 4) - 1), %rax
jz L(align_four_vec_loop)
- vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k1
+ vpcmpb $0, VEC_SIZE(%rax), %VECZERO, %k1
add $VEC_SIZE, %rax
kmovd %k1, %edx
test %edx, %edx
@@ -193,34 +190,34 @@ L(align_vec_size_start):
.p2align 4
L(align_four_vec_loop):
- VMOVA (%rax), %YMM0
- VMOVA (VEC_SIZE * 2)(%rax), %YMM1
- vpminub VEC_SIZE(%rax), %YMM0, %YMM0
- vpminub (VEC_SIZE * 3)(%rax), %YMM1, %YMM1
- vpminub %YMM0, %YMM1, %YMM0
+ VMOVA (%rax), %VEC(1)
+ VMOVA (VEC_SIZE * 2)(%rax), %VEC(2)
+ vpminub VEC_SIZE(%rax), %VEC(1), %VEC(1)
+ vpminub (VEC_SIZE * 3)(%rax), %VEC(2), %VEC(2)
+ vpminub %VEC(1), %VEC(2), %VEC(1)
/* If K0 != 0, there is a null byte. */
- vpcmpb $0, %YMM0, %YMMZERO, %k0
+ vpcmpb $0, %VEC(1), %VECZERO, %k0
add $(VEC_SIZE * 4), %rax
ktestd %k0, %k0
jz L(align_four_vec_loop)
- vpcmpb $0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
+ vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECZERO, %k0
sub $(VEC_SIZE * 5), %rax
kmovd %k0, %edx
test %edx, %edx
jnz L(exit_null_on_second_vector)
- vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
+ vpcmpb $0, (VEC_SIZE * 2)(%rax), %VECZERO, %k1
kmovd %k1, %edx
test %edx, %edx
jnz L(exit_null_on_third_vector)
- vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
+ vpcmpb $0, (VEC_SIZE * 3)(%rax), %VECZERO, %k2
kmovd %k2, %edx
test %edx, %edx
jnz L(exit_null_on_fourth_vector)
- vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
+ vpcmpb $0, (VEC_SIZE * 4)(%rax), %VECZERO, %k3
kmovd %k3, %edx
sub %rdi, %rax
bsf %rdx, %rdx
@@ -20,13 +20,15 @@
# include <sysdep.h>
+# include "evex256-vecs.h"
+# if VEC_SIZE != 32
+# error "VEC_SIZE != 32 unimplemented"
+# endif
+
# ifndef STRCHR
# define STRCHR __strchr_evex
# endif
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
-
# ifdef USE_AS_WCSCHR
# define VPBROADCAST vpbroadcastd
# define VPCMP vpcmpd
@@ -45,27 +47,13 @@
# define CHAR_SIZE 1
# endif
-# define XMMZERO xmm16
-
-# define YMMZERO ymm16
-# define YMM0 ymm17
-# define YMM1 ymm18
-# define YMM2 ymm19
-# define YMM3 ymm20
-# define YMM4 ymm21
-# define YMM5 ymm22
-# define YMM6 ymm23
-# define YMM7 ymm24
-# define YMM8 ymm25
-
-# define VEC_SIZE 32
# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
.section .text.evex,"ax",@progbits
ENTRY_P2ALIGN (STRCHR, 5)
- /* Broadcast CHAR to YMM0. */
- VPBROADCAST %esi, %YMM0
+ /* Broadcast CHAR to VEC(1). */
+ VPBROADCAST %esi, %VEC(1)
movl %edi, %eax
andl $(PAGE_SIZE - 1), %eax
/* Check if we cross page boundary with one vector load.
@@ -75,13 +63,13 @@ ENTRY_P2ALIGN (STRCHR, 5)
/* Check the first VEC_SIZE bytes. Search for both CHAR and the
null bytes. */
- VMOVU (%rdi), %YMM1
+ VMOVU (%rdi), %VEC(2)
/* Leaves only CHARS matching esi as 0. */
- vpxorq %YMM1, %YMM0, %YMM2
- VPMINU %YMM2, %YMM1, %YMM2
- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
- VPTESTN %YMM2, %YMM2, %k0
+ vpxorq %VEC(2), %VEC(1), %VEC(3)
+ VPMINU %VEC(3), %VEC(2), %VEC(3)
+ /* Each bit in K0 represents a CHAR or a null byte in VEC(2). */
+ VPTESTN %VEC(3), %VEC(3), %k0
kmovd %k0, %eax
testl %eax, %eax
jz L(aligned_more)
@@ -200,41 +188,41 @@ L(cross_page_continue):
/* This method has higher latency but has better port
distribution. */
- VMOVA (VEC_SIZE)(%rdi), %YMM1
+ VMOVA (VEC_SIZE)(%rdi), %VEC(2)
/* Leaves only CHARS matching esi as 0. */
- vpxorq %YMM1, %YMM0, %YMM2
- VPMINU %YMM2, %YMM1, %YMM2
- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
- VPTESTN %YMM2, %YMM2, %k0
+ vpxorq %VEC(2), %VEC(1), %VEC(3)
+ VPMINU %VEC(3), %VEC(2), %VEC(3)
+ /* Each bit in K0 represents a CHAR or a null byte in VEC(2). */
+ VPTESTN %VEC(3), %VEC(3), %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x1)
/* This method has higher latency but has better port
distribution. */
- VMOVA (VEC_SIZE * 2)(%rdi), %YMM1
- /* Each bit in K0 represents a CHAR in YMM1. */
- VPCMP $0, %YMM1, %YMM0, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPTESTN %YMM1, %YMM1, %k1
+ VMOVA (VEC_SIZE * 2)(%rdi), %VEC(2)
+ /* Each bit in K0 represents a CHAR in VEC(2). */
+ VPCMP $0, %VEC(2), %VEC(1), %k0
+ /* Each bit in K1 represents a CHAR in VEC(2). */
+ VPTESTN %VEC(2), %VEC(2), %k1
kortestd %k0, %k1
jnz L(first_vec_x2)
- VMOVA (VEC_SIZE * 3)(%rdi), %YMM1
+ VMOVA (VEC_SIZE * 3)(%rdi), %VEC(2)
/* Leaves only CHARS matching esi as 0. */
- vpxorq %YMM1, %YMM0, %YMM2
- VPMINU %YMM2, %YMM1, %YMM2
- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
- VPTESTN %YMM2, %YMM2, %k0
+ vpxorq %VEC(2), %VEC(1), %VEC(3)
+ VPMINU %VEC(3), %VEC(2), %VEC(3)
+ /* Each bit in K0 represents a CHAR or a null byte in VEC(2). */
+ VPTESTN %VEC(3), %VEC(3), %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x3)
- VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
- /* Each bit in K0 represents a CHAR in YMM1. */
- VPCMP $0, %YMM1, %YMM0, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPTESTN %YMM1, %YMM1, %k1
+ VMOVA (VEC_SIZE * 4)(%rdi), %VEC(2)
+ /* Each bit in K0 represents a CHAR in VEC(2). */
+ VPCMP $0, %VEC(2), %VEC(1), %k0
+ /* Each bit in K1 represents a CHAR in VEC(2). */
+ VPTESTN %VEC(2), %VEC(2), %k1
kortestd %k0, %k1
jnz L(first_vec_x4)
@@ -246,54 +234,54 @@ L(cross_page_continue):
L(loop_4x_vec):
/* Check 4x VEC at a time. No penalty to imm32 offset with evex
encoding. */
- VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
- VMOVA (VEC_SIZE * 5)(%rdi), %YMM2
- VMOVA (VEC_SIZE * 6)(%rdi), %YMM3
- VMOVA (VEC_SIZE * 7)(%rdi), %YMM4
+ VMOVA (VEC_SIZE * 4)(%rdi), %VEC(2)
+ VMOVA (VEC_SIZE * 5)(%rdi), %VEC(3)
+ VMOVA (VEC_SIZE * 6)(%rdi), %VEC(4)
+ VMOVA (VEC_SIZE * 7)(%rdi), %VEC(5)
- /* For YMM1 and YMM3 use xor to set the CHARs matching esi to
+ /* For VEC(2) and VEC(4) use xor to set the CHARs matching esi to
zero. */
- vpxorq %YMM1, %YMM0, %YMM5
- /* For YMM2 and YMM4 cmp not equals to CHAR and store result in
+ vpxorq %VEC(2), %VEC(1), %VEC(6)
+ /* For VEC(3) and VEC(5) cmp not equals to CHAR and store result in
k register. Its possible to save either 1 or 2 instructions
- using cmp no equals method for either YMM1 or YMM1 and YMM3
+	   using the cmp-not-equals method for either VEC(2) or VEC(2) and VEC(4)
respectively but bottleneck on p5 makes it not worth it. */
- VPCMP $4, %YMM0, %YMM2, %k2
- vpxorq %YMM3, %YMM0, %YMM7
- VPCMP $4, %YMM0, %YMM4, %k4
+ VPCMP $4, %VEC(1), %VEC(3), %k2
+ vpxorq %VEC(4), %VEC(1), %VEC(8)
+ VPCMP $4, %VEC(1), %VEC(5), %k4
/* Use min to select all zeros from either xor or end of string).
*/
- VPMINU %YMM1, %YMM5, %YMM1
- VPMINU %YMM3, %YMM7, %YMM3
+ VPMINU %VEC(2), %VEC(6), %VEC(2)
+ VPMINU %VEC(4), %VEC(8), %VEC(4)
/* Use min + zeromask to select for zeros. Since k2 and k4 will
have 0 as positions that matched with CHAR which will set
- zero in the corresponding destination bytes in YMM2 / YMM4.
+ zero in the corresponding destination bytes in VEC(3) / VEC(5).
*/
- VPMINU %YMM1, %YMM2, %YMM2{%k2}{z}
- VPMINU %YMM3, %YMM4, %YMM4
- VPMINU %YMM2, %YMM4, %YMM4{%k4}{z}
+ VPMINU %VEC(2), %VEC(3), %VEC(3){%k2}{z}
+ VPMINU %VEC(4), %VEC(5), %VEC(5)
+ VPMINU %VEC(3), %VEC(5), %VEC(5){%k4}{z}
- VPTESTN %YMM4, %YMM4, %k1
+ VPTESTN %VEC(5), %VEC(5), %k1
kmovd %k1, %ecx
subq $-(VEC_SIZE * 4), %rdi
testl %ecx, %ecx
jz L(loop_4x_vec)
- VPTESTN %YMM1, %YMM1, %k0
+ VPTESTN %VEC(2), %VEC(2), %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(last_vec_x1)
- VPTESTN %YMM2, %YMM2, %k0
+ VPTESTN %VEC(3), %VEC(3), %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(last_vec_x2)
- VPTESTN %YMM3, %YMM3, %k0
+ VPTESTN %VEC(4), %VEC(4), %k0
kmovd %k0, %eax
- /* Combine YMM3 matches (eax) with YMM4 matches (ecx). */
+ /* Combine VEC(4) matches (eax) with VEC(5) matches (ecx). */
# ifdef USE_AS_WCSCHR
sall $8, %ecx
orl %ecx, %eax
@@ -351,12 +339,12 @@ L(cross_page_boundary):
movq %rdi, %rdx
/* Align rdi. */
andq $-VEC_SIZE, %rdi
- VMOVA (%rdi), %YMM1
+ VMOVA (%rdi), %VEC(2)
/* Leaves only CHARS matching esi as 0. */
- vpxorq %YMM1, %YMM0, %YMM2
- VPMINU %YMM2, %YMM1, %YMM2
- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
- VPTESTN %YMM2, %YMM2, %k0
+ vpxorq %VEC(2), %VEC(1), %VEC(3)
+ VPMINU %VEC(3), %VEC(2), %VEC(3)
+ /* Each bit in K0 represents a CHAR or a null byte in VEC(2). */
+ VPTESTN %VEC(3), %VEC(3), %k0
kmovd %k0, %eax
/* Remove the leading bits. */
# ifdef USE_AS_WCSCHR
@@ -18,6 +18,11 @@
#if IS_IN (libc)
+# include "evex256-vecs.h"
+# if VEC_SIZE != 32
+# error "VEC_SIZE != 32 unimplemented"
+# endif
+
# include <sysdep.h>
# if defined USE_AS_STRCASECMP_L
# include "locale-defines.h"
@@ -29,13 +34,9 @@
# define PAGE_SIZE 4096
- /* VEC_SIZE = Number of bytes in a ymm register. */
-# define VEC_SIZE 32
+ /* VEC_SIZE = Number of bytes in a VEC register. */
# define CHAR_PER_VEC (VEC_SIZE / SIZE_OF_CHAR)
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
-
# ifdef USE_AS_WCSCMP
# ifndef OVERFLOW_STRCMP
# define OVERFLOW_STRCMP __wcscmp_evex
@@ -86,31 +87,7 @@
# define VEC_OFFSET (-VEC_SIZE)
# endif
-# define XMM0 xmm17
-# define XMM1 xmm18
-
-# define XMM10 xmm27
-# define XMM11 xmm28
-# define XMM12 xmm29
-# define XMM13 xmm30
-# define XMM14 xmm31
-
-
-# define YMM0 ymm17
-# define YMM1 ymm18
-# define YMM2 ymm19
-# define YMM3 ymm20
-# define YMM4 ymm21
-# define YMM5 ymm22
-# define YMM6 ymm23
-# define YMM7 ymm24
-# define YMM8 ymm25
-# define YMM9 ymm26
-# define YMM10 ymm27
-# define YMM11 ymm28
-# define YMM12 ymm29
-# define YMM13 ymm30
-# define YMM14 ymm31
+
# ifdef USE_AS_STRCASECMP_L
# define BYTE_LOOP_REG OFFSET_REG
@@ -132,26 +109,33 @@
# endif
# endif
-# define LCASE_MIN_YMM %YMM12
-# define LCASE_MAX_YMM %YMM13
-# define CASE_ADD_YMM %YMM14
+# define LCASE_MIN_YMM %VEC(13)
+# define LCASE_MAX_YMM %VEC(14)
+# define CASE_ADD_YMM %VEC(15)
-# define LCASE_MIN_XMM %XMM12
-# define LCASE_MAX_XMM %XMM13
-# define CASE_ADD_XMM %XMM14
+# define LCASE_MIN_XMM %VEC_xmm(13)
+# define LCASE_MAX_XMM %VEC_xmm(14)
+# define CASE_ADD_XMM %VEC_xmm(15)
/* NB: wcsncmp uses r11 but strcasecmp is never used in
conjunction with wcscmp. */
# define TOLOWER_BASE %r11
# ifdef USE_AS_STRCASECMP_L
-# define _REG(x, y) x ## y
-# define REG(x, y) _REG(x, y)
+#define XMM11 VEC_xmm(11)
+#define XMM12 VEC_xmm(12)
+
+#define YMM11 VEC(11)
+#define YMM12 VEC(12)
+
+#define _REG(x, y) x ## y
+#define REG(x, y) _REG(x, y)
+
# define TOLOWER(reg1, reg2, ext) \
- vpsubb REG(LCASE_MIN_, ext), reg1, REG(%ext, 10); \
- vpsubb REG(LCASE_MIN_, ext), reg2, REG(%ext, 11); \
- vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5; \
- vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6; \
+ vpsubb REG(LCASE_MIN_, ext), reg1, REG(%ext, 11); \
+ vpsubb REG(LCASE_MIN_, ext), reg2, REG(%ext, 12); \
+ vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k5; \
+ vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 12), %k6; \
vpaddb reg1, REG(CASE_ADD_, ext), reg1{%k5}; \
vpaddb reg2, REG(CASE_ADD_, ext), reg2{%k6}
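
The TOLOWER macro above is a branchless range check: subtract the lower bound of the uppercase range, compare the result unsigned against the range width to get a mask, and add the case offset only under that mask. A scalar sketch follows; the broadcast constants behind LCASE_MIN/LCASE_MAX/CASE_ADD are defined outside this hunk, so the ASCII values used here are assumptions:

#include <stdio.h>

/* Scalar model of the masked vpsubb/vpcmpub/vpaddb sequence: shift the
   byte down by 'A', one unsigned compare decides whether it was an
   uppercase letter, and 0x20 is added only in that case.  */
static unsigned char
tolower_branchless (unsigned char c)
{
  unsigned char shifted = (unsigned char) (c - 'A');	/* vpsubb          */
  int in_range = shifted <= ('Z' - 'A');		/* vpcmpub (mask)  */
  return in_range ? (unsigned char) (c + 0x20) : c;	/* vpaddb {%k}     */
}

int
main (void)
{
  printf ("%c %c %c\n", tolower_branchless ('A'),
	  tolower_branchless ('z'), tolower_branchless ('@'));
  return 0;
}
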
@@ -297,11 +281,11 @@ L(case_add):
L(no_page_cross):
/* Safe to compare 4x vectors. */
- VMOVU (%rdi), %YMM0
- VPTESTM %YMM0, %YMM0, %k2
+ VMOVU (%rdi), %VEC(1)
+ VPTESTM %VEC(1), %VEC(1), %k2
/* Each bit cleared in K1 represents a mismatch or a null CHAR
- in YMM0 and 32 bytes at (%rsi). */
- CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
+ in VEC(1) and 32 bytes at (%rsi). */
+ CMP_R1_S2_YMM (%VEC(1), (%rsi), %VEC(2), %k1){%k2}
kmovd %k1, %ecx
# ifdef USE_AS_STRNCMP
cmpq $CHAR_PER_VEC, %rdx
@@ -474,9 +458,9 @@ L(ret4):
.p2align 5
L(more_3x_vec):
/* Safe to compare 4x vectors. */
- VMOVU (VEC_SIZE)(%rdi), %YMM0
- VPTESTM %YMM0, %YMM0, %k2
- CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
+ VMOVU (VEC_SIZE)(%rdi), %VEC(1)
+ VPTESTM %VEC(1), %VEC(1), %k2
+ CMP_R1_S2_YMM (%VEC(1), VEC_SIZE(%rsi), %VEC(2), %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(return_vec_1)
@@ -486,16 +470,16 @@ L(more_3x_vec):
jbe L(ret_zero)
# endif
- VMOVU (VEC_SIZE * 2)(%rdi), %YMM0
- VPTESTM %YMM0, %YMM0, %k2
- CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
+ VMOVU (VEC_SIZE * 2)(%rdi), %VEC(1)
+ VPTESTM %VEC(1), %VEC(1), %k2
+ CMP_R1_S2_YMM (%VEC(1), (VEC_SIZE * 2)(%rsi), %VEC(2), %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(return_vec_2)
- VMOVU (VEC_SIZE * 3)(%rdi), %YMM0
- VPTESTM %YMM0, %YMM0, %k2
- CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
+ VMOVU (VEC_SIZE * 3)(%rdi), %VEC(1)
+ VPTESTM %VEC(1), %VEC(1), %k2
+ CMP_R1_S2_YMM (%VEC(1), (VEC_SIZE * 3)(%rsi), %VEC(2), %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(return_vec_3)
@@ -574,46 +558,46 @@ L(loop):
/* Loop entry after handling page cross during loop. */
L(loop_skip_page_cross_check):
- VMOVA (VEC_SIZE * 0)(%rdi), %YMM0
- VMOVA (VEC_SIZE * 1)(%rdi), %YMM2
- VMOVA (VEC_SIZE * 2)(%rdi), %YMM4
- VMOVA (VEC_SIZE * 3)(%rdi), %YMM6
+ VMOVA (VEC_SIZE * 0)(%rdi), %VEC(1)
+ VMOVA (VEC_SIZE * 1)(%rdi), %VEC(3)
+ VMOVA (VEC_SIZE * 2)(%rdi), %VEC(5)
+ VMOVA (VEC_SIZE * 3)(%rdi), %VEC(7)
- VPMINU %YMM0, %YMM2, %YMM8
- VPMINU %YMM4, %YMM6, %YMM9
+ VPMINU %VEC(1), %VEC(3), %VEC(9)
+ VPMINU %VEC(5), %VEC(7), %VEC(10)
- /* A zero CHAR in YMM9 means that there is a null CHAR. */
- VPMINU %YMM8, %YMM9, %YMM9
+ /* A zero CHAR in VEC(10) means that there is a null CHAR. */
+ VPMINU %VEC(9), %VEC(10), %VEC(10)
- /* Each bit set in K1 represents a non-null CHAR in YMM9. */
- VPTESTM %YMM9, %YMM9, %k1
+ /* Each bit set in K1 represents a non-null CHAR in VEC(10). */
+ VPTESTM %VEC(10), %VEC(10), %k1
# ifndef USE_AS_STRCASECMP_L
- vpxorq (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
- vpxorq (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
- vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
- /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
- oring with YMM1. Result is stored in YMM6. */
- vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
+ vpxorq (VEC_SIZE * 0)(%rsi), %VEC(1), %VEC(2)
+ vpxorq (VEC_SIZE * 1)(%rsi), %VEC(3), %VEC(4)
+ vpxorq (VEC_SIZE * 2)(%rsi), %VEC(5), %VEC(6)
+ /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with VEC(7) while
+ oring with VEC(2). Result is stored in VEC(7). */
+ vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %VEC(2), %VEC(7)
# else
- VMOVU (VEC_SIZE * 0)(%rsi), %YMM1
- TOLOWER_YMM (%YMM0, %YMM1)
- VMOVU (VEC_SIZE * 1)(%rsi), %YMM3
- TOLOWER_YMM (%YMM2, %YMM3)
- VMOVU (VEC_SIZE * 2)(%rsi), %YMM5
- TOLOWER_YMM (%YMM4, %YMM5)
- VMOVU (VEC_SIZE * 3)(%rsi), %YMM7
- TOLOWER_YMM (%YMM6, %YMM7)
- vpxorq %YMM0, %YMM1, %YMM1
- vpxorq %YMM2, %YMM3, %YMM3
- vpxorq %YMM4, %YMM5, %YMM5
- vpternlogd $0xde, %YMM7, %YMM1, %YMM6
-# endif
- /* Or together YMM3, YMM5, and YMM6. */
- vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
-
-
- /* A non-zero CHAR in YMM6 represents a mismatch. */
- VPTESTNM %YMM6, %YMM6, %k0{%k1}
+ VMOVU (VEC_SIZE * 0)(%rsi), %VEC(2)
+ TOLOWER_YMM (%VEC(1), %VEC(2))
+ VMOVU (VEC_SIZE * 1)(%rsi), %VEC(4)
+ TOLOWER_YMM (%VEC(3), %VEC(4))
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6)
+ TOLOWER_YMM (%VEC(5), %VEC(6))
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(8)
+ TOLOWER_YMM (%VEC(7), %VEC(8))
+ vpxorq %VEC(1), %VEC(2), %VEC(2)
+ vpxorq %VEC(3), %VEC(4), %VEC(4)
+ vpxorq %VEC(5), %VEC(6), %VEC(6)
+ vpternlogd $0xde, %VEC(8), %VEC(2), %VEC(7)
+# endif
+ /* Or together VEC(4), VEC(6), and VEC(7). */
+ vpternlogd $0xfe, %VEC(4), %VEC(6), %VEC(7)
+
+
+ /* A non-zero CHAR in VEC(7) represents a mismatch. */
+ VPTESTNM %VEC(7), %VEC(7), %k0{%k1}
kmovd %k0, %LOOP_REG
TESTEQ %LOOP_REG
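
The vpternlogd immediates used in the loop above encode three-input boolean functions: for each bit position the result is imm8 indexed by (dest << 2) | (src1 << 1) | src2, so $0xde behaves as (dest ^ src2) | src1 (xor the memory operand into VEC(7) while oring in VEC(2)) and $0xfe is a plain three-way or. A stand-alone check of those two immediates:

#include <stdio.h>

/* The result bit of vpternlog is imm8[(dst << 2) | (src1 << 1) | src2].  */
static int
ternlog (unsigned imm, int a, int b, int c)
{
  return (imm >> ((a << 2) | (b << 1) | c)) & 1;
}

int
main (void)
{
  for (int a = 0; a < 2; a++)
    for (int b = 0; b < 2; b++)
      for (int c = 0; c < 2; c++)
	{
	  if (ternlog (0xde, a, b, c) != ((a ^ c) | b))
	    puts ("0xde mismatch");
	  if (ternlog (0xfe, a, b, c) != (a | b | c))
	    puts ("0xfe mismatch");
	}
  puts ("ternlog immediates verified");
  return 0;
}
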
@@ -621,14 +605,14 @@ L(loop_skip_page_cross_check):
/* Find which VEC has the mismatch of end of string. */
- VPTESTM %YMM0, %YMM0, %k1
- VPTESTNM %YMM1, %YMM1, %k0{%k1}
+ VPTESTM %VEC(1), %VEC(1), %k1
+ VPTESTNM %VEC(2), %VEC(2), %k0{%k1}
kmovd %k0, %ecx
TESTEQ %ecx
jnz L(return_vec_0_end)
- VPTESTM %YMM2, %YMM2, %k1
- VPTESTNM %YMM3, %YMM3, %k0{%k1}
+ VPTESTM %VEC(3), %VEC(3), %k1
+ VPTESTNM %VEC(4), %VEC(4), %k0{%k1}
kmovd %k0, %ecx
TESTEQ %ecx
jnz L(return_vec_1_end)
@@ -641,8 +625,8 @@ L(return_vec_2_3_end):
jbe L(ret_zero_end)
# endif
- VPTESTM %YMM4, %YMM4, %k1
- VPTESTNM %YMM5, %YMM5, %k0{%k1}
+ VPTESTM %VEC(5), %VEC(5), %k1
+ VPTESTNM %VEC(6), %VEC(6), %k0{%k1}
kmovd %k0, %ecx
TESTEQ %ecx
# if CHAR_PER_VEC <= 16
@@ -787,9 +771,9 @@ L(page_cross_during_loop):
cmpl $-(VEC_SIZE * 3), %eax
jle L(less_1x_vec_till_page_cross)
- VMOVA (%rdi), %YMM0
- VPTESTM %YMM0, %YMM0, %k2
- CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
+ VMOVA (%rdi), %VEC(1)
+ VPTESTM %VEC(1), %VEC(1), %k2
+ CMP_R1_S2_YMM (%VEC(1), (%rsi), %VEC(2), %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(return_vec_0_end)
@@ -808,9 +792,9 @@ L(less_1x_vec_till_page_cross):
to read back -VEC_SIZE. If rdi is truly at the start of a page
here, it means the previous page (rdi - VEC_SIZE) has already
been loaded earlier so must be valid. */
- VMOVU -VEC_SIZE(%rdi, %rax), %YMM0
- VPTESTM %YMM0, %YMM0, %k2
- CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
+ VMOVU -VEC_SIZE(%rdi, %rax), %VEC(1)
+ VPTESTM %VEC(1), %VEC(1), %k2
+ CMP_R1_S2_YMM (%VEC(1), -VEC_SIZE(%rsi, %rax), %VEC(2), %k1){%k2}
/* Mask of potentially valid bits. The lower bits can be out of
range comparisons (but safe regarding page crosses). */
@@ -901,9 +885,9 @@ L(more_2x_vec_till_page_cross):
/* If more 2x vec till cross we will complete a full loop
iteration here. */
- VMOVA VEC_SIZE(%rdi), %YMM0
- VPTESTM %YMM0, %YMM0, %k2
- CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
+ VMOVA VEC_SIZE(%rdi), %VEC(1)
+ VPTESTM %VEC(1), %VEC(1), %k2
+ CMP_R1_S2_YMM (%VEC(1), VEC_SIZE(%rsi), %VEC(2), %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(return_vec_1_end)
@@ -916,16 +900,16 @@ L(more_2x_vec_till_page_cross):
subl $-(VEC_SIZE * 4), %eax
/* Safe to include comparisons from lower bytes. */
- VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %YMM0
- VPTESTM %YMM0, %YMM0, %k2
- CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
+ VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %VEC(1)
+ VPTESTM %VEC(1), %VEC(1), %k2
+ CMP_R1_S2_YMM (%VEC(1), -(VEC_SIZE * 2)(%rsi, %rax), %VEC(2), %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(return_vec_page_cross_0)
- VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %YMM0
- VPTESTM %YMM0, %YMM0, %k2
- CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
+ VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %VEC(1)
+ VPTESTM %VEC(1), %VEC(1), %k2
+ CMP_R1_S2_YMM (%VEC(1), -(VEC_SIZE * 1)(%rsi, %rax), %VEC(2), %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(return_vec_page_cross_1)
@@ -946,23 +930,23 @@ L(more_2x_vec_till_page_cross):
# endif
/* Finish the loop. */
- VMOVA (VEC_SIZE * 2)(%rdi), %YMM4
- VMOVA (VEC_SIZE * 3)(%rdi), %YMM6
- VPMINU %YMM4, %YMM6, %YMM9
- VPTESTM %YMM9, %YMM9, %k1
+ VMOVA (VEC_SIZE * 2)(%rdi), %VEC(5)
+ VMOVA (VEC_SIZE * 3)(%rdi), %VEC(7)
+ VPMINU %VEC(5), %VEC(7), %VEC(10)
+ VPTESTM %VEC(10), %VEC(10), %k1
# ifndef USE_AS_STRCASECMP_L
- vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
- /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6). */
- vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
+ vpxorq (VEC_SIZE * 2)(%rsi), %VEC(5), %VEC(6)
+ /* VEC(7) = VEC(6) | ((VEC_SIZE * 3)(%rsi) ^ VEC(7)). */
+ vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %VEC(6), %VEC(7)
# else
- VMOVU (VEC_SIZE * 2)(%rsi), %YMM5
- TOLOWER_YMM (%YMM4, %YMM5)
- VMOVU (VEC_SIZE * 3)(%rsi), %YMM7
- TOLOWER_YMM (%YMM6, %YMM7)
- vpxorq %YMM4, %YMM5, %YMM5
- vpternlogd $0xde, %YMM7, %YMM5, %YMM6
-# endif
- VPTESTNM %YMM6, %YMM6, %k0{%k1}
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6)
+ TOLOWER_YMM (%VEC(5), %VEC(6))
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(8)
+ TOLOWER_YMM (%VEC(7), %VEC(8))
+ vpxorq %VEC(5), %VEC(6), %VEC(6)
+ vpternlogd $0xde, %VEC(8), %VEC(6), %VEC(7)
+# endif
+ VPTESTNM %VEC(7), %VEC(7), %k0{%k1}
kmovd %k0, %LOOP_REG
TESTEQ %LOOP_REG
jnz L(return_vec_2_3_end)
@@ -1074,9 +1058,9 @@ L(page_cross):
loadable memory until within 1x VEC of page cross. */
.p2align 4,, 8
L(page_cross_loop):
- VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
- VPTESTM %YMM0, %YMM0, %k2
- CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
+ VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VEC(1)
+ VPTESTM %VEC(1), %VEC(1), %k2
+ CMP_R1_S2_YMM (%VEC(1), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VEC(2), %k1){%k2}
kmovd %k1, %ecx
TESTEQ %ecx
jnz L(check_ret_vec_page_cross)
@@ -1098,9 +1082,9 @@ L(page_cross_loop):
to not cross page so is safe to load. Since we have already
loaded at least 1 VEC from rsi it is also guranteed to be safe.
*/
- VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
- VPTESTM %YMM0, %YMM0, %k2
- CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
+ VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VEC(1)
+ VPTESTM %VEC(1), %VEC(1), %k2
+ CMP_R1_S2_YMM (%VEC(1), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VEC(2), %k1){%k2}
kmovd %k1, %ecx
# ifdef USE_AS_STRNCMP
@@ -21,36 +21,22 @@
# ifndef USE_AS_STRCAT
# include <sysdep.h>
+# include "evex256-vecs.h"
+# if VEC_SIZE != 32
+# error "VEC_SIZE != 32 unimplemented"
+# endif
+
# ifndef STRCPY
# define STRCPY __strcpy_evex
# endif
# endif
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
-
-/* Number of bytes in a vector register */
-# ifndef VEC_SIZE
-# define VEC_SIZE 32
-# endif
-
-# define XMM2 xmm18
-# define XMM3 xmm19
-
-# define YMM2 ymm18
-# define YMM3 ymm19
-# define YMM4 ymm20
-# define YMM5 ymm21
-# define YMM6 ymm22
-# define YMM7 ymm23
-
# ifndef USE_AS_STRCAT
/* zero register */
-# define XMMZERO xmm16
-# define YMMZERO ymm16
-# define YMM1 ymm17
+# define XMMZERO VEC_xmm(0)
+# define VECZERO VEC(0)
.section .text.evex,"ax",@progbits
ENTRY (STRCPY)
@@ -74,7 +60,7 @@ ENTRY (STRCPY)
and $-VEC_SIZE, %rsi
and $(VEC_SIZE - 1), %ecx
- vpcmpb $0, (%rsi), %YMMZERO, %k0
+ vpcmpb $0, (%rsi), %VECZERO, %k0
kmovd %k0, %edx
shr %cl, %rdx
@@ -93,7 +79,7 @@ ENTRY (STRCPY)
test %edx, %edx
jnz L(CopyVecSizeTail)
- vpcmpb $0, VEC_SIZE(%rsi), %YMMZERO, %k1
+ vpcmpb $0, VEC_SIZE(%rsi), %VECZERO, %k1
kmovd %k1, %edx
# ifdef USE_AS_STRNCPY
@@ -104,8 +90,8 @@ ENTRY (STRCPY)
test %edx, %edx
jnz L(CopyTwoVecSize)
- VMOVU (%rsi, %rcx), %YMM2 /* copy VEC_SIZE bytes */
- VMOVU %YMM2, (%rdi)
+ VMOVU (%rsi, %rcx), %VEC(2) /* copy VEC_SIZE bytes */
+ VMOVU %VEC(2), (%rdi)
/* If source address alignment != destination address alignment */
.p2align 4
@@ -117,10 +103,10 @@ L(UnalignVecSizeBoth):
or %rcx, %r8
# endif
mov $VEC_SIZE, %rcx
- VMOVA (%rsi, %rcx), %YMM2
- VMOVU %YMM2, (%rdi, %rcx)
- VMOVA VEC_SIZE(%rsi, %rcx), %YMM2
- vpcmpb $0, %YMM2, %YMMZERO, %k0
+ VMOVA (%rsi, %rcx), %VEC(2)
+ VMOVU %VEC(2), (%rdi, %rcx)
+ VMOVA VEC_SIZE(%rsi, %rcx), %VEC(2)
+ vpcmpb $0, %VEC(2), %VECZERO, %k0
kmovd %k0, %edx
add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
@@ -134,9 +120,9 @@ L(UnalignVecSizeBoth):
jnz L(CopyVecSize)
# endif
- VMOVU %YMM2, (%rdi, %rcx)
- VMOVA VEC_SIZE(%rsi, %rcx), %YMM3
- vpcmpb $0, %YMM3, %YMMZERO, %k0
+ VMOVU %VEC(2), (%rdi, %rcx)
+ VMOVA VEC_SIZE(%rsi, %rcx), %VEC(3)
+ vpcmpb $0, %VEC(3), %VECZERO, %k0
kmovd %k0, %edx
add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
@@ -150,9 +136,9 @@ L(UnalignVecSizeBoth):
jnz L(CopyVecSize)
# endif
- VMOVU %YMM3, (%rdi, %rcx)
- VMOVA VEC_SIZE(%rsi, %rcx), %YMM4
- vpcmpb $0, %YMM4, %YMMZERO, %k0
+ VMOVU %VEC(3), (%rdi, %rcx)
+ VMOVA VEC_SIZE(%rsi, %rcx), %VEC(4)
+ vpcmpb $0, %VEC(4), %VECZERO, %k0
kmovd %k0, %edx
add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
@@ -166,9 +152,9 @@ L(UnalignVecSizeBoth):
jnz L(CopyVecSize)
# endif
- VMOVU %YMM4, (%rdi, %rcx)
- VMOVA VEC_SIZE(%rsi, %rcx), %YMM2
- vpcmpb $0, %YMM2, %YMMZERO, %k0
+ VMOVU %VEC(4), (%rdi, %rcx)
+ VMOVA VEC_SIZE(%rsi, %rcx), %VEC(2)
+ vpcmpb $0, %VEC(2), %VECZERO, %k0
kmovd %k0, %edx
add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
@@ -182,9 +168,9 @@ L(UnalignVecSizeBoth):
jnz L(CopyVecSize)
# endif
- VMOVU %YMM2, (%rdi, %rcx)
- VMOVA VEC_SIZE(%rsi, %rcx), %YMM2
- vpcmpb $0, %YMM2, %YMMZERO, %k0
+ VMOVU %VEC(2), (%rdi, %rcx)
+ VMOVA VEC_SIZE(%rsi, %rcx), %VEC(2)
+ vpcmpb $0, %VEC(2), %VECZERO, %k0
kmovd %k0, %edx
add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
@@ -198,9 +184,9 @@ L(UnalignVecSizeBoth):
jnz L(CopyVecSize)
# endif
- VMOVA VEC_SIZE(%rsi, %rcx), %YMM3
- VMOVU %YMM2, (%rdi, %rcx)
- vpcmpb $0, %YMM3, %YMMZERO, %k0
+ VMOVA VEC_SIZE(%rsi, %rcx), %VEC(3)
+ VMOVU %VEC(2), (%rdi, %rcx)
+ vpcmpb $0, %VEC(3), %VECZERO, %k0
kmovd %k0, %edx
add $VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
@@ -214,7 +200,7 @@ L(UnalignVecSizeBoth):
jnz L(CopyVecSize)
# endif
- VMOVU %YMM3, (%rdi, %rcx)
+ VMOVU %VEC(3), (%rdi, %rcx)
mov %rsi, %rdx
lea VEC_SIZE(%rsi, %rcx), %rsi
and $-(VEC_SIZE * 4), %rsi
@@ -224,15 +210,15 @@ L(UnalignVecSizeBoth):
lea (VEC_SIZE * 8)(%r8, %rdx), %r8
# endif
L(UnalignedFourVecSizeLoop):
- VMOVA (%rsi), %YMM4
- VMOVA VEC_SIZE(%rsi), %YMM5
- VMOVA (VEC_SIZE * 2)(%rsi), %YMM6
- VMOVA (VEC_SIZE * 3)(%rsi), %YMM7
- vpminub %YMM5, %YMM4, %YMM2
- vpminub %YMM7, %YMM6, %YMM3
- vpminub %YMM2, %YMM3, %YMM2
+ VMOVA (%rsi), %VEC(4)
+ VMOVA VEC_SIZE(%rsi), %VEC(5)
+ VMOVA (VEC_SIZE * 2)(%rsi), %VEC(6)
+ VMOVA (VEC_SIZE * 3)(%rsi), %VEC(7)
+ vpminub %VEC(5), %VEC(4), %VEC(2)
+ vpminub %VEC(7), %VEC(6), %VEC(3)
+ vpminub %VEC(2), %VEC(3), %VEC(2)
/* If K7 != 0, there is a null byte. */
- vpcmpb $0, %YMM2, %YMMZERO, %k7
+ vpcmpb $0, %VEC(2), %VECZERO, %k7
kmovd %k7, %edx
# ifdef USE_AS_STRNCPY
sub $(VEC_SIZE * 4), %r8
@@ -244,19 +230,19 @@ L(UnalignedFourVecSizeLoop):
L(UnalignedFourVecSizeLoop_start):
add $(VEC_SIZE * 4), %rdi
add $(VEC_SIZE * 4), %rsi
- VMOVU %YMM4, -(VEC_SIZE * 4)(%rdi)
- VMOVA (%rsi), %YMM4
- VMOVU %YMM5, -(VEC_SIZE * 3)(%rdi)
- VMOVA VEC_SIZE(%rsi), %YMM5
- vpminub %YMM5, %YMM4, %YMM2
- VMOVU %YMM6, -(VEC_SIZE * 2)(%rdi)
- VMOVA (VEC_SIZE * 2)(%rsi), %YMM6
- VMOVU %YMM7, -VEC_SIZE(%rdi)
- VMOVA (VEC_SIZE * 3)(%rsi), %YMM7
- vpminub %YMM7, %YMM6, %YMM3
- vpminub %YMM2, %YMM3, %YMM2
+ VMOVU %VEC(4), -(VEC_SIZE * 4)(%rdi)
+ VMOVA (%rsi), %VEC(4)
+ VMOVU %VEC(5), -(VEC_SIZE * 3)(%rdi)
+ VMOVA VEC_SIZE(%rsi), %VEC(5)
+ vpminub %VEC(5), %VEC(4), %VEC(2)
+ VMOVU %VEC(6), -(VEC_SIZE * 2)(%rdi)
+ VMOVA (VEC_SIZE * 2)(%rsi), %VEC(6)
+ VMOVU %VEC(7), -VEC_SIZE(%rdi)
+ VMOVA (VEC_SIZE * 3)(%rsi), %VEC(7)
+ vpminub %VEC(7), %VEC(6), %VEC(3)
+ vpminub %VEC(2), %VEC(3), %VEC(2)
/* If K7 != 0, there is a null byte. */
- vpcmpb $0, %YMM2, %YMMZERO, %k7
+ vpcmpb $0, %VEC(2), %VECZERO, %k7
kmovd %k7, %edx
# ifdef USE_AS_STRNCPY
sub $(VEC_SIZE * 4), %r8
@@ -266,32 +252,32 @@ L(UnalignedFourVecSizeLoop_start):
jz L(UnalignedFourVecSizeLoop_start)
L(UnalignedFourVecSizeLeave):
- vpcmpb $0, %YMM4, %YMMZERO, %k1
+ vpcmpb $0, %VEC(4), %VECZERO, %k1
kmovd %k1, %edx
test %edx, %edx
jnz L(CopyVecSizeUnaligned_0)
- vpcmpb $0, %YMM5, %YMMZERO, %k2
+ vpcmpb $0, %VEC(5), %VECZERO, %k2
kmovd %k2, %ecx
test %ecx, %ecx
jnz L(CopyVecSizeUnaligned_16)
- vpcmpb $0, %YMM6, %YMMZERO, %k3
+ vpcmpb $0, %VEC(6), %VECZERO, %k3
kmovd %k3, %edx
test %edx, %edx
jnz L(CopyVecSizeUnaligned_32)
- vpcmpb $0, %YMM7, %YMMZERO, %k4
+ vpcmpb $0, %VEC(7), %VECZERO, %k4
kmovd %k4, %ecx
bsf %ecx, %edx
- VMOVU %YMM4, (%rdi)
- VMOVU %YMM5, VEC_SIZE(%rdi)
- VMOVU %YMM6, (VEC_SIZE * 2)(%rdi)
+ VMOVU %VEC(4), (%rdi)
+ VMOVU %VEC(5), VEC_SIZE(%rdi)
+ VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
lea (VEC_SIZE * 3)(%rdi, %rdx), %rax
# endif
- VMOVU %YMM7, (VEC_SIZE * 3)(%rdi)
+ VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
add $(VEC_SIZE - 1), %r8
sub %rdx, %r8
lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
@@ -305,9 +291,9 @@ L(UnalignedFourVecSizeLeave):
/* If source address alignment == destination address alignment */
L(SourceStringAlignmentLessTwoVecSize):
- VMOVU (%rsi), %YMM3
- VMOVU VEC_SIZE(%rsi), %YMM2
- vpcmpb $0, %YMM3, %YMMZERO, %k0
+ VMOVU (%rsi), %VEC(3)
+ VMOVU VEC_SIZE(%rsi), %VEC(2)
+ vpcmpb $0, %VEC(3), %VECZERO, %k0
kmovd %k0, %edx
# ifdef USE_AS_STRNCPY
@@ -321,8 +307,8 @@ L(SourceStringAlignmentLessTwoVecSize):
test %edx, %edx
jnz L(CopyVecSizeTail1)
- VMOVU %YMM3, (%rdi)
- vpcmpb $0, %YMM2, %YMMZERO, %k0
+ VMOVU %VEC(3), (%rdi)
+ vpcmpb $0, %VEC(2), %VECZERO, %k0
kmovd %k0, %edx
# ifdef USE_AS_STRNCPY
@@ -402,7 +388,7 @@ L(CopyVecSizeUnaligned_0):
# ifdef USE_AS_STPCPY
lea (%rdi, %rdx), %rax
# endif
- VMOVU %YMM4, (%rdi)
+ VMOVU %VEC(4), (%rdi)
add $((VEC_SIZE * 4) - 1), %r8
sub %rdx, %r8
lea 1(%rdi, %rdx), %rdi
@@ -414,12 +400,12 @@ L(CopyVecSizeUnaligned_0):
.p2align 4
L(CopyVecSizeUnaligned_16):
bsf %ecx, %edx
- VMOVU %YMM4, (%rdi)
+ VMOVU %VEC(4), (%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
lea VEC_SIZE(%rdi, %rdx), %rax
# endif
- VMOVU %YMM5, VEC_SIZE(%rdi)
+ VMOVU %VEC(5), VEC_SIZE(%rdi)
add $((VEC_SIZE * 3) - 1), %r8
sub %rdx, %r8
lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi
@@ -433,13 +419,13 @@ L(CopyVecSizeUnaligned_16):
.p2align 4
L(CopyVecSizeUnaligned_32):
bsf %edx, %edx
- VMOVU %YMM4, (%rdi)
- VMOVU %YMM5, VEC_SIZE(%rdi)
+ VMOVU %VEC(4), (%rdi)
+ VMOVU %VEC(5), VEC_SIZE(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
lea (VEC_SIZE * 2)(%rdi, %rdx), %rax
# endif
- VMOVU %YMM6, (VEC_SIZE * 2)(%rdi)
+ VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
add $((VEC_SIZE * 2) - 1), %r8
sub %rdx, %r8
lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
@@ -454,22 +440,22 @@ L(CopyVecSizeUnaligned_32):
# ifndef USE_AS_STRCAT
.p2align 4
L(CopyVecSizeUnalignedVec6):
- VMOVU %YMM6, (%rdi, %rcx)
+ VMOVU %VEC(6), (%rdi, %rcx)
jmp L(CopyVecSizeVecExit)
.p2align 4
L(CopyVecSizeUnalignedVec5):
- VMOVU %YMM5, (%rdi, %rcx)
+ VMOVU %VEC(5), (%rdi, %rcx)
jmp L(CopyVecSizeVecExit)
.p2align 4
L(CopyVecSizeUnalignedVec4):
- VMOVU %YMM4, (%rdi, %rcx)
+ VMOVU %VEC(4), (%rdi, %rcx)
jmp L(CopyVecSizeVecExit)
.p2align 4
L(CopyVecSizeUnalignedVec3):
- VMOVU %YMM3, (%rdi, %rcx)
+ VMOVU %VEC(3), (%rdi, %rcx)
jmp L(CopyVecSizeVecExit)
# endif
@@ -626,10 +612,10 @@ L(Exit8_15):
.p2align 4
L(Exit16_31):
- VMOVU (%rsi), %XMM2
- VMOVU -15(%rsi, %rdx), %XMM3
- VMOVU %XMM2, (%rdi)
- VMOVU %XMM3, -15(%rdi, %rdx)
+ VMOVU (%rsi), %VEC_xmm(2)
+ VMOVU -15(%rsi, %rdx), %VEC_xmm(3)
+ VMOVU %VEC_xmm(2), (%rdi)
+ VMOVU %VEC_xmm(3), -15(%rdi, %rdx)
# ifdef USE_AS_STPCPY
lea (%rdi, %rdx), %rax
# endif
@@ -643,10 +629,10 @@ L(Exit16_31):
.p2align 4
L(Exit32_63):
- VMOVU (%rsi), %YMM2
- VMOVU -31(%rsi, %rdx), %YMM3
- VMOVU %YMM2, (%rdi)
- VMOVU %YMM3, -31(%rdi, %rdx)
+ VMOVU (%rsi), %VEC(2)
+ VMOVU -31(%rsi, %rdx), %VEC(3)
+ VMOVU %VEC(2), (%rdi)
+ VMOVU %VEC(3), -31(%rdi, %rdx)
# ifdef USE_AS_STPCPY
lea (%rdi, %rdx), %rax
# endif
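
Exit16_31 and Exit32_63 above use the usual trick for a copy whose length lies between one and two register widths: load a full vector from the start and a full vector counted back from the end, then store both, letting them overlap in the middle. The registers holding the length differ between the exits, so the sketch below is written generically in terms of n bytes, with VEC_SIZE fixed at 32 only for the example:

#include <string.h>
#include <stddef.h>

#define VEC_SIZE 32

/* Scalar sketch of the overlapping head/tail copy for
   VEC_SIZE < n <= 2 * VEC_SIZE.  Both loads are done before the
   stores, matching the load-load-store-store order above.  */
static void
copy_one_or_two_vecs (char *dst, const char *src, size_t n)
{
  char head[VEC_SIZE], tail[VEC_SIZE];
  memcpy (head, src, VEC_SIZE);			/* first full vector  */
  memcpy (tail, src + n - VEC_SIZE, VEC_SIZE);	/* last full vector   */
  memcpy (dst, head, VEC_SIZE);
  memcpy (dst + n - VEC_SIZE, tail, VEC_SIZE);
}
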
@@ -728,10 +714,10 @@ L(StrncpyExit9_16):
.p2align 4
L(StrncpyExit17_32):
- VMOVU (%rsi), %XMM2
- VMOVU -16(%rsi, %r8), %XMM3
- VMOVU %XMM2, (%rdi)
- VMOVU %XMM3, -16(%rdi, %r8)
+ VMOVU (%rsi), %VEC_xmm(2)
+ VMOVU -16(%rsi, %r8), %VEC_xmm(3)
+ VMOVU %VEC_xmm(2), (%rdi)
+ VMOVU %VEC_xmm(3), -16(%rdi, %r8)
# ifdef USE_AS_STPCPY
lea (%rdi, %r8), %rax
# endif
@@ -743,10 +729,10 @@ L(StrncpyExit17_32):
.p2align 4
L(StrncpyExit33_64):
/* 0/32, 31/16 */
- VMOVU (%rsi), %YMM2
- VMOVU -VEC_SIZE(%rsi, %r8), %YMM3
- VMOVU %YMM2, (%rdi)
- VMOVU %YMM3, -VEC_SIZE(%rdi, %r8)
+ VMOVU (%rsi), %VEC(2)
+ VMOVU -VEC_SIZE(%rsi, %r8), %VEC(3)
+ VMOVU %VEC(2), (%rdi)
+ VMOVU %VEC(3), -VEC_SIZE(%rdi, %r8)
# ifdef USE_AS_STPCPY
lea (%rdi, %r8), %rax
# endif
@@ -758,11 +744,11 @@ L(StrncpyExit33_64):
.p2align 4
L(StrncpyExit65):
/* 0/32, 32/32, 64/1 */
- VMOVU (%rsi), %YMM2
- VMOVU 32(%rsi), %YMM3
+ VMOVU (%rsi), %VEC(2)
+ VMOVU 32(%rsi), %VEC(3)
mov 64(%rsi), %cl
- VMOVU %YMM2, (%rdi)
- VMOVU %YMM3, 32(%rdi)
+ VMOVU %VEC(2), (%rdi)
+ VMOVU %VEC(3), 32(%rdi)
mov %cl, 64(%rdi)
# ifdef USE_AS_STPCPY
lea 65(%rdi), %rax
@@ -810,7 +796,7 @@ L(Fill17_32):
.p2align 4
L(CopyVecSizeUnalignedVec2):
- VMOVU %YMM2, (%rdi, %rcx)
+ VMOVU %VEC(2), (%rdi, %rcx)
.p2align 4
L(CopyVecSizeVecExit):
@@ -829,7 +815,7 @@ L(StrncpyFillTailWithZero):
sub $VEC_SIZE, %r8
jbe L(StrncpyFillExit)
- VMOVU %YMMZERO, (%rdi)
+ VMOVU %VECZERO, (%rdi)
add $VEC_SIZE, %rdi
mov %rdi, %rsi
@@ -840,10 +826,10 @@ L(StrncpyFillTailWithZero):
jb L(StrncpyFillLessFourVecSize)
L(StrncpyFillLoopVmovdqa):
- VMOVA %YMMZERO, (%rdi)
- VMOVA %YMMZERO, VEC_SIZE(%rdi)
- VMOVA %YMMZERO, (VEC_SIZE * 2)(%rdi)
- VMOVA %YMMZERO, (VEC_SIZE * 3)(%rdi)
+ VMOVA %VECZERO, (%rdi)
+ VMOVA %VECZERO, VEC_SIZE(%rdi)
+ VMOVA %VECZERO, (VEC_SIZE * 2)(%rdi)
+ VMOVA %VECZERO, (VEC_SIZE * 3)(%rdi)
add $(VEC_SIZE * 4), %rdi
sub $(VEC_SIZE * 4), %r8
jae L(StrncpyFillLoopVmovdqa)
@@ -851,12 +837,12 @@ L(StrncpyFillLoopVmovdqa):
L(StrncpyFillLessFourVecSize):
add $(VEC_SIZE * 2), %r8
jl L(StrncpyFillLessTwoVecSize)
- VMOVA %YMMZERO, (%rdi)
- VMOVA %YMMZERO, VEC_SIZE(%rdi)
+ VMOVA %VECZERO, (%rdi)
+ VMOVA %VECZERO, VEC_SIZE(%rdi)
add $(VEC_SIZE * 2), %rdi
sub $VEC_SIZE, %r8
jl L(StrncpyFillExit)
- VMOVA %YMMZERO, (%rdi)
+ VMOVA %VECZERO, (%rdi)
add $VEC_SIZE, %rdi
jmp L(Fill)
@@ -864,7 +850,7 @@ L(StrncpyFillLessFourVecSize):
L(StrncpyFillLessTwoVecSize):
add $VEC_SIZE, %r8
jl L(StrncpyFillExit)
- VMOVA %YMMZERO, (%rdi)
+ VMOVA %VECZERO, (%rdi)
add $VEC_SIZE, %rdi
jmp L(Fill)
@@ -897,16 +883,16 @@ L(UnalignedFourVecSizeLeaveCase3):
and $-VEC_SIZE, %rcx
add $(VEC_SIZE * 3), %r8
jl L(CopyVecSizeCase3)
- VMOVU %YMM4, (%rdi)
+ VMOVU %VEC(4), (%rdi)
sub $VEC_SIZE, %r8
jb L(CopyVecSizeCase3)
- VMOVU %YMM5, VEC_SIZE(%rdi)
+ VMOVU %VEC(5), VEC_SIZE(%rdi)
sub $VEC_SIZE, %r8
jb L(CopyVecSizeCase3)
- VMOVU %YMM6, (VEC_SIZE * 2)(%rdi)
+ VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
sub $VEC_SIZE, %r8
jb L(CopyVecSizeCase3)
- VMOVU %YMM7, (VEC_SIZE * 3)(%rdi)
+ VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
# ifdef USE_AS_STPCPY
lea (VEC_SIZE * 4)(%rdi), %rax
# endif
@@ -918,7 +904,7 @@ L(UnalignedFourVecSizeLeaveCase3):
.p2align 4
L(UnalignedFourVecSizeLeaveCase2):
xor %ecx, %ecx
- vpcmpb $0, %YMM4, %YMMZERO, %k1
+ vpcmpb $0, %VEC(4), %VECZERO, %k1
kmovd %k1, %edx
add $(VEC_SIZE * 3), %r8
jle L(CopyVecSizeCase2OrCase3)
@@ -928,9 +914,9 @@ L(UnalignedFourVecSizeLeaveCase2):
# else
jnz L(CopyVecSize)
# endif
- vpcmpb $0, %YMM5, %YMMZERO, %k2
+ vpcmpb $0, %VEC(5), %VECZERO, %k2
kmovd %k2, %edx
- VMOVU %YMM4, (%rdi)
+ VMOVU %VEC(4), (%rdi)
add $VEC_SIZE, %rcx
sub $VEC_SIZE, %r8
jbe L(CopyVecSizeCase2OrCase3)
@@ -941,9 +927,9 @@ L(UnalignedFourVecSizeLeaveCase2):
jnz L(CopyVecSize)
# endif
- vpcmpb $0, %YMM6, %YMMZERO, %k3
+ vpcmpb $0, %VEC(6), %VECZERO, %k3
kmovd %k3, %edx
- VMOVU %YMM5, VEC_SIZE(%rdi)
+ VMOVU %VEC(5), VEC_SIZE(%rdi)
add $VEC_SIZE, %rcx
sub $VEC_SIZE, %r8
jbe L(CopyVecSizeCase2OrCase3)
@@ -954,9 +940,9 @@ L(UnalignedFourVecSizeLeaveCase2):
jnz L(CopyVecSize)
# endif
- vpcmpb $0, %YMM7, %YMMZERO, %k4
+ vpcmpb $0, %VEC(7), %VECZERO, %k4
kmovd %k4, %edx
- VMOVU %YMM6, (VEC_SIZE * 2)(%rdi)
+ VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
lea VEC_SIZE(%rdi, %rcx), %rdi
lea VEC_SIZE(%rsi, %rcx), %rsi
bsf %edx, %edx
@@ -32,7 +32,6 @@
# define CHAR_SIZE 1
# endif
-# define XMM0 xmm16
# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
@@ -44,12 +43,6 @@
# define RDX rdx
# define SHR shrq
# define TEXTSUFFIX evex512
-# define VMM0 zmm16
-# define VMM1 zmm17
-# define VMM2 zmm18
-# define VMM3 zmm19
-# define VMM4 zmm20
-# define VMOVA vmovdqa64
# elif VEC_SIZE == 32
/* Currently Unused. */
# define KMOV kmovd
@@ -59,12 +52,6 @@
# define RDX edx
# define SHR shrl
# define TEXTSUFFIX evex256
-# define VMM0 ymm16
-# define VMM1 ymm17
-# define VMM2 ymm18
-# define VMM3 ymm19
-# define VMM4 ymm20
-# define VMOVA vmovdqa32
# endif
.section .text.TEXTSUFFIX, "ax", @progbits
@@ -82,13 +69,13 @@ ENTRY_P2ALIGN (STRLEN, 6)
# endif
movl %edi, %eax
- vpxorq %XMM0, %XMM0, %XMM0
+ vpxorq %VEC_xmm(0), %VEC_xmm(0), %VEC_xmm(0)
andl $(PAGE_SIZE - 1), %eax
cmpl $(PAGE_SIZE - VEC_SIZE), %eax
ja L(page_cross)
/* Compare [w]char for null, mask bit will be set for match. */
- VPCMP $0, (%rdi), %VMM0, %k0
+ VPCMP $0, (%rdi), %VEC(0), %k0
KMOV %k0, %RAX
test %RAX, %RAX
jz L(align_more)
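
The entry sequence above decides whether the first, possibly unaligned, vector load can be issued directly: only the page offset of the pointer matters, and a VEC_SIZE load stays within one page exactly when that offset is no greater than PAGE_SIZE - VEC_SIZE. A minimal model of the test (constants chosen for the evex256 build; the evex512 build uses VEC_SIZE 64):

#include <stdint.h>
#include <stdbool.h>

#define PAGE_SIZE 4096
#define VEC_SIZE 32

/* Mirrors: andl $(PAGE_SIZE - 1), %eax;
	    cmpl $(PAGE_SIZE - VEC_SIZE), %eax; ja L(page_cross).  */
static bool
load_crosses_page (const void *p)
{
  return ((uintptr_t) p & (PAGE_SIZE - 1)) > PAGE_SIZE - VEC_SIZE;
}
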
@@ -127,7 +114,7 @@ L(align_more):
# endif
/* Loop unroll 4 times for 4 vector loop. */
- VPCMP $0, (%rax), %VMM0, %k0
+ VPCMP $0, (%rax), %VEC(0), %k0
KMOV %k0, %RCX
test %RCX, %RCX
jnz L(ret_vec_x1)
@@ -137,7 +124,7 @@ L(align_more):
jbe L(ret_max)
# endif
- VPCMP $0, VEC_SIZE(%rax), %VMM0, %k0
+ VPCMP $0, VEC_SIZE(%rax), %VEC(0), %k0
KMOV %k0, %RCX
test %RCX, %RCX
jnz L(ret_vec_x2)
@@ -147,7 +134,7 @@ L(align_more):
jbe L(ret_max)
# endif
- VPCMP $0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
+ VPCMP $0, (VEC_SIZE * 2)(%rax), %VEC(0), %k0
KMOV %k0, %RCX
test %RCX, %RCX
jnz L(ret_vec_x3)
@@ -157,7 +144,7 @@ L(align_more):
jbe L(ret_max)
# endif
- VPCMP $0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
+ VPCMP $0, (VEC_SIZE * 3)(%rax), %VEC(0), %k0
KMOV %k0, %RCX
test %RCX, %RCX
jnz L(ret_vec_x4)
@@ -195,19 +182,19 @@ L(loop_entry):
# endif
/* VPMINU and VPCMP combination provide better performance as
compared to alternative combinations. */
- VMOVA (VEC_SIZE * 4)(%rax), %VMM1
- VPMINU (VEC_SIZE * 5)(%rax), %VMM1, %VMM2
- VMOVA (VEC_SIZE * 6)(%rax), %VMM3
- VPMINU (VEC_SIZE * 7)(%rax), %VMM3, %VMM4
+ VMOVA (VEC_SIZE * 4)(%rax), %VEC(1)
+ VPMINU (VEC_SIZE * 5)(%rax), %VEC(1), %VEC(2)
+ VMOVA (VEC_SIZE * 6)(%rax), %VEC(3)
+ VPMINU (VEC_SIZE * 7)(%rax), %VEC(3), %VEC(4)
- VPTESTN %VMM2, %VMM2, %k0
- VPTESTN %VMM4, %VMM4, %k1
+ VPTESTN %VEC(2), %VEC(2), %k0
+ VPTESTN %VEC(4), %VEC(4), %k1
subq $-(VEC_SIZE * 4), %rax
KORTEST %k0, %k1
jz L(loop)
- VPTESTN %VMM1, %VMM1, %k2
+ VPTESTN %VEC(1), %VEC(1), %k2
KMOV %k2, %RCX
test %RCX, %RCX
jnz L(ret_vec_x1)
@@ -218,7 +205,7 @@ L(loop_entry):
test %RCX, %RCX
jnz L(ret_vec_x2)
- VPTESTN %VMM3, %VMM3, %k3
+ VPTESTN %VEC(3), %VEC(3), %k3
KMOV %k3, %RCX
test %RCX, %RCX
jnz L(ret_vec_x3)
@@ -285,7 +272,7 @@ L(page_cross):
/* ecx contains number of w[char] to be skipped as a result
of address alignment. */
xorq %rdi, %rax
- VPCMP $0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
+ VPCMP $0, (PAGE_SIZE - VEC_SIZE)(%rax), %VEC(0), %k0
KMOV %k0, %RAX
/* Ignore number of character for alignment adjustment. */
SHR %cl, %RAX
@@ -20,6 +20,11 @@
# include <sysdep.h>
+# include "evex256-vecs.h"
+# if VEC_SIZE != 32
+# error "VEC_SIZE != 32 unimplemented"
+# endif
+
# ifndef STRLEN
# define STRLEN __strlen_evex
# endif
@@ -38,14 +43,8 @@
# define CHAR_SIZE 1
# endif
-# define XMMZERO xmm16
-# define YMMZERO ymm16
-# define YMM1 ymm17
-# define YMM2 ymm18
-# define YMM3 ymm19
-# define YMM4 ymm20
-# define YMM5 ymm21
-# define YMM6 ymm22
+# define XMMZERO VEC_xmm(0)
+# define VECZERO VEC(0)
# define VEC_SIZE 32
# define PAGE_SIZE 4096
@@ -74,7 +73,7 @@ ENTRY (STRLEN)
/* Check the first VEC_SIZE bytes. Each bit in K0 represents a
null byte. */
- VPCMP $0, (%rdi), %YMMZERO, %k0
+ VPCMP $0, (%rdi), %VECZERO, %k0
kmovd %k0, %eax
# ifdef USE_AS_STRNLEN
/* If length < CHAR_PER_VEC handle special. */
@@ -194,7 +193,7 @@ L(cross_page_continue):
# endif
# endif
/* Load first VEC regardless. */
- VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0
+ VPCMP $0, VEC_SIZE(%rdi), %VECZERO, %k0
# ifdef USE_AS_STRNLEN
/* Adjust length. If near end handle specially. */
subq %rcx, %rsi
@@ -204,17 +203,17 @@ L(cross_page_continue):
testl %eax, %eax
jnz L(first_vec_x1)
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %VECZERO, %k0
kmovd %k0, %eax
test %eax, %eax
jnz L(first_vec_x2)
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %VECZERO, %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x3)
- VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %VECZERO, %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x4)
@@ -240,7 +239,7 @@ L(cross_page_continue):
.p2align 4
L(loop_4x_vec):
/* Load first VEC regardless. */
- VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
+ VMOVA (VEC_SIZE * 4)(%rdi), %VEC(1)
# ifdef USE_AS_STRNLEN
/* Break if at end of length. */
subq $(CHAR_PER_VEC * 4), %rsi
@@ -250,12 +249,12 @@ L(loop_4x_vec):
the matches in ymm2/ymm4 can only be returned if there where no
matches in ymm1/ymm3 respectively there is no issue with overlap.
*/
- VPMINU (VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
- VMOVA (VEC_SIZE * 6)(%rdi), %YMM3
- VPMINU (VEC_SIZE * 7)(%rdi), %YMM3, %YMM4
+ VPMINU (VEC_SIZE * 5)(%rdi), %VEC(1), %VEC(2)
+ VMOVA (VEC_SIZE * 6)(%rdi), %VEC(3)
+ VPMINU (VEC_SIZE * 7)(%rdi), %VEC(3), %VEC(4)
- VPCMP $0, %YMM2, %YMMZERO, %k0
- VPCMP $0, %YMM4, %YMMZERO, %k1
+ VPCMP $0, %VEC(2), %VECZERO, %k0
+ VPCMP $0, %VEC(4), %VECZERO, %k1
subq $-(VEC_SIZE * 4), %rdi
kortestd %k0, %k1
jz L(loop_4x_vec)
@@ -269,7 +268,7 @@ L(loop_4x_vec):
testl %eax, %eax
jz L(second_vec_return)
- VPCMP $0, %YMM1, %YMMZERO, %k2
+ VPCMP $0, %VEC(1), %VECZERO, %k2
kmovd %k2, %edx
/* Combine VEC1 matches (edx) with VEC2 matches (eax). */
# ifdef USE_AS_WCSLEN
@@ -288,10 +287,10 @@ L(loop_4x_vec):
# ifdef USE_AS_STRNLEN
L(last_4x_vec_or_less_load):
- /* Depending on entry adjust rdi / prepare first VEC in YMM1. */
- VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
+ /* Depending on entry adjust rdi / prepare first VEC in VEC(1). */
+ VMOVA (VEC_SIZE * 4)(%rdi), %VEC(1)
L(last_4x_vec_or_less_cmpeq):
- VPCMP $0, %YMM1, %YMMZERO, %k0
+ VPCMP $0, %VEC(1), %VECZERO, %k0
addq $(VEC_SIZE * 3), %rdi
L(last_4x_vec_or_less):
kmovd %k0, %eax
@@ -311,7 +310,7 @@ L(last_4x_vec_or_less):
subl $CHAR_PER_VEC, %esi
jb L(max)
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %VECZERO, %k0
kmovd %k0, %eax
tzcntl %eax, %eax
/* Check the end of data. */
@@ -334,8 +333,8 @@ L(max):
in the 4x VEC loop can use 2 byte encoding. */
.p2align 4
L(second_vec_return):
- VPCMP $0, %YMM3, %YMMZERO, %k0
- /* Combine YMM3 matches (k0) with YMM4 matches (k1). */
+ VPCMP $0, %VEC(3), %VECZERO, %k0
+ /* Combine VEC(3) matches (k0) with VEC(4) matches (k1). */
# ifdef USE_AS_WCSLEN
kunpckbw %k0, %k1, %k0
kmovd %k0, %eax
@@ -369,14 +368,14 @@ L(last_4x_vec):
testl %eax, %eax
jnz L(last_vec_x1)
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %VECZERO, %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(last_vec_x2)
/* Normalize length. */
andl $(CHAR_PER_VEC * 4 - 1), %esi
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %VECZERO, %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(last_vec_x3)
@@ -385,7 +384,7 @@ L(last_4x_vec):
subl $(CHAR_PER_VEC * 3), %esi
jb L(max)
- VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %VECZERO, %k0
kmovd %k0, %eax
tzcntl %eax, %eax
/* Check the end of data. */
@@ -447,7 +446,7 @@ L(cross_page_boundary):
movq %rdi, %rdx
/* Align data to VEC_SIZE. */
andq $-VEC_SIZE, %rdi
- VPCMP $0, (%rdi), %YMMZERO, %k0
+ VPCMP $0, (%rdi), %VECZERO, %k0
kmovd %k0, %eax
/* Remove the leading bytes. */
# ifdef USE_AS_WCSLEN
@@ -2,6 +2,5 @@
# define STRLEN __strlen_evex512
#endif
-#define VEC_SIZE 64
-
+#include "evex512-vecs.h"
#include "strlen-evex-base.S"
@@ -20,6 +20,11 @@
# include <sysdep.h>
+# include "evex256-vecs.h"
+# if VEC_SIZE != 32
+# error "VEC_SIZE != 32 unimplemented"
+# endif
+
# ifndef STRRCHR
# define STRRCHR __strrchr_evex
# endif
@@ -54,44 +59,31 @@
# define VPCMP vpcmpb
# endif
-# define XMMZERO xmm16
-# define YMMZERO ymm16
-# define YMMMATCH ymm17
-# define YMMSAVE ymm18
-
-# define YMM1 ymm19
-# define YMM2 ymm20
-# define YMM3 ymm21
-# define YMM4 ymm22
-# define YMM5 ymm23
-# define YMM6 ymm24
-# define YMM7 ymm25
-# define YMM8 ymm26
-
+# define VECMATCH VEC(1)
# define VEC_SIZE 32
# define PAGE_SIZE 4096
.section .text.evex, "ax", @progbits
ENTRY(STRRCHR)
movl %edi, %eax
- /* Broadcast CHAR to YMMMATCH. */
- VPBROADCAST %esi, %YMMMATCH
+ /* Broadcast CHAR to VECMATCH. */
+ VPBROADCAST %esi, %VECMATCH
andl $(PAGE_SIZE - 1), %eax
cmpl $(PAGE_SIZE - VEC_SIZE), %eax
jg L(cross_page_boundary)
L(page_cross_continue):
- VMOVU (%rdi), %YMM1
- /* k0 has a 1 for each zero CHAR in YMM1. */
- VPTESTN %YMM1, %YMM1, %k0
+ VMOVU (%rdi), %VEC(3)
+ /* k0 has a 1 for each zero CHAR in VEC(3). */
+ VPTESTN %VEC(3), %VEC(3), %k0
kmovd %k0, %ecx
testl %ecx, %ecx
jz L(aligned_more)
/* fallthrough: zero CHAR in first VEC. */
- /* K1 has a 1 for each search CHAR match in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
+ /* K1 has a 1 for each search CHAR match in VEC(3). */
+ VPCMP $0, %VECMATCH, %VEC(3), %k1
kmovd %k1, %eax
/* Build mask up until first zero CHAR (used to mask of
potential search CHAR matches past the end of the string).
@@ -114,18 +106,18 @@ L(ret0):
search path for earlier matches. */
.p2align 4,, 6
L(first_vec_x1):
- VPCMP $0, %YMMMATCH, %YMM2, %k1
+ VPCMP $0, %VECMATCH, %VEC(4), %k1
kmovd %k1, %eax
blsmskl %ecx, %ecx
/* eax non-zero if search CHAR in range. */
andl %ecx, %eax
jnz L(first_vec_x1_return)
- /* fallthrough: no match in YMM2 then need to check for earlier
- matches (in YMM1). */
+ /* fallthrough: no match in VEC(4), so we need to check for earlier
+ matches (in VEC(3)). */
.p2align 4,, 4
L(first_vec_x0_test):
- VPCMP $0, %YMMMATCH, %YMM1, %k1
+ VPCMP $0, %VECMATCH, %VEC(3), %k1
kmovd %k1, %eax
testl %eax, %eax
jz L(ret1)
@@ -140,14 +132,14 @@ L(ret1):
.p2align 4,, 10
L(first_vec_x1_or_x2):
- VPCMP $0, %YMM3, %YMMMATCH, %k3
- VPCMP $0, %YMM2, %YMMMATCH, %k2
+ VPCMP $0, %VEC(5), %VECMATCH, %k3
+ VPCMP $0, %VEC(4), %VECMATCH, %k2
/* K2 and K3 have 1 for any search CHAR match. Test if any
- matches between either of them. Otherwise check YMM1. */
+ matches between either of them. Otherwise check VEC(3). */
kortestd %k2, %k3
jz L(first_vec_x0_test)
- /* Guranteed that YMM2 and YMM3 are within range so merge the
+ /* Guaranteed that VEC(4) and VEC(5) are within range so merge the
two bitmasks then get last result. */
kunpck %k2, %k3, %k3
kmovq %k3, %rax
@@ -157,10 +149,10 @@ L(first_vec_x1_or_x2):
.p2align 4,, 6
L(first_vec_x3):
- VPCMP $0, %YMMMATCH, %YMM4, %k1
+ VPCMP $0, %VECMATCH, %VEC(6), %k1
kmovd %k1, %eax
blsmskl %ecx, %ecx
- /* If no search CHAR match in range check YMM1/YMM2/YMM3. */
+ /* If no search CHAR match in range check VEC(3)/VEC(4)/VEC(5). */
andl %ecx, %eax
jz L(first_vec_x1_or_x2)
bsrl %eax, %eax
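
first_vec_x3 above shows the selection pattern used throughout this file: blsmsk turns the lowest null bit into a mask covering everything up to and including it, the and drops CHAR matches that lie past the terminator, and bsr then returns the highest surviving bit, i.e. the last occurrence within the vector. A scalar model of that selection (the function name is invented):

#include <stdint.h>

/* Keep only CHAR matches at or before the first null, then pick the
   highest one.  Returns -1 when no match survives.  */
static int
last_match_before_null (uint32_t char_matches, uint32_t null_matches)
{
  /* blsmskl: set every bit up to and including the lowest set bit;
     with no null the unsigned wraparound keeps all bits, as blsmsk
     of zero does.  */
  uint32_t in_range = null_matches ^ (null_matches - 1);
  uint32_t valid = char_matches & in_range;	/* andl %ecx, %eax  */
  if (valid == 0)
    return -1;
  return 31 - __builtin_clz (valid);		/* bsrl %eax, %eax  */
}
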
@@ -169,9 +161,9 @@ L(first_vec_x3):
.p2align 4,, 6
L(first_vec_x0_x1_test):
- VPCMP $0, %YMMMATCH, %YMM2, %k1
+ VPCMP $0, %VECMATCH, %VEC(4), %k1
kmovd %k1, %eax
- /* Check YMM2 for last match first. If no match try YMM1. */
+ /* Check VEC(4) for last match first. If no match try VEC(3). */
testl %eax, %eax
jz L(first_vec_x0_test)
.p2align 4,, 4
@@ -182,10 +174,10 @@ L(first_vec_x1_return):
.p2align 4,, 10
L(first_vec_x2):
- VPCMP $0, %YMMMATCH, %YMM3, %k1
+ VPCMP $0, %VECMATCH, %VEC(5), %k1
kmovd %k1, %eax
blsmskl %ecx, %ecx
- /* Check YMM3 for last match first. If no match try YMM2/YMM1.
+ /* Check VEC(5) for last match first. If no match try VEC(4)/VEC(3).
*/
andl %ecx, %eax
jz L(first_vec_x0_x1_test)
@@ -196,23 +188,23 @@ L(first_vec_x2):
.p2align 4
L(aligned_more):
- /* Need to keep original pointer incase YMM1 has last match. */
+ /* Need to keep original pointer in case VEC(3) has last match. */
movq %rdi, %rsi
andq $-VEC_SIZE, %rdi
- VMOVU VEC_SIZE(%rdi), %YMM2
- VPTESTN %YMM2, %YMM2, %k0
+ VMOVU VEC_SIZE(%rdi), %VEC(4)
+ VPTESTN %VEC(4), %VEC(4), %k0
kmovd %k0, %ecx
testl %ecx, %ecx
jnz L(first_vec_x1)
- VMOVU (VEC_SIZE * 2)(%rdi), %YMM3
- VPTESTN %YMM3, %YMM3, %k0
+ VMOVU (VEC_SIZE * 2)(%rdi), %VEC(5)
+ VPTESTN %VEC(5), %VEC(5), %k0
kmovd %k0, %ecx
testl %ecx, %ecx
jnz L(first_vec_x2)
- VMOVU (VEC_SIZE * 3)(%rdi), %YMM4
- VPTESTN %YMM4, %YMM4, %k0
+ VMOVU (VEC_SIZE * 3)(%rdi), %VEC(6)
+ VPTESTN %VEC(6), %VEC(6), %k0
kmovd %k0, %ecx
movq %rdi, %r8
testl %ecx, %ecx
@@ -221,24 +213,24 @@ L(aligned_more):
andq $-(VEC_SIZE * 2), %rdi
.p2align 4
L(first_aligned_loop):
- /* Preserve YMM1, YMM2, YMM3, and YMM4 until we can gurantee
+ /* Preserve VEC(3), VEC(4), VEC(5), and VEC(6) until we can guarantee
they don't store a match. */
- VMOVA (VEC_SIZE * 4)(%rdi), %YMM5
- VMOVA (VEC_SIZE * 5)(%rdi), %YMM6
+ VMOVA (VEC_SIZE * 4)(%rdi), %VEC(7)
+ VMOVA (VEC_SIZE * 5)(%rdi), %VEC(8)
- VPCMP $0, %YMM5, %YMMMATCH, %k2
- vpxord %YMM6, %YMMMATCH, %YMM7
+ VPCMP $0, %VEC(7), %VECMATCH, %k2
+ vpxord %VEC(8), %VECMATCH, %VEC(9)
- VPMIN %YMM5, %YMM6, %YMM8
- VPMIN %YMM8, %YMM7, %YMM7
+ VPMIN %VEC(7), %VEC(8), %VEC(10)
+ VPMIN %VEC(10), %VEC(9), %VEC(9)
- VPTESTN %YMM7, %YMM7, %k1
+ VPTESTN %VEC(9), %VEC(9), %k1
subq $(VEC_SIZE * -2), %rdi
kortestd %k1, %k2
jz L(first_aligned_loop)
- VPCMP $0, %YMM6, %YMMMATCH, %k3
- VPTESTN %YMM8, %YMM8, %k1
+ VPCMP $0, %VEC(8), %VECMATCH, %k3
+ VPTESTN %VEC(10), %VEC(10), %k1
ktestd %k1, %k1
jz L(second_aligned_loop_prep)
@@ -247,7 +239,7 @@ L(first_aligned_loop):
.p2align 4,, 6
L(first_vec_x1_or_x2_or_x3):
- VPCMP $0, %YMM4, %YMMMATCH, %k4
+ VPCMP $0, %VEC(6), %VECMATCH, %k4
kmovd %k4, %eax
testl %eax, %eax
jz L(first_vec_x1_or_x2)
@@ -257,7 +249,7 @@ L(first_vec_x1_or_x2_or_x3):
.p2align 4,, 8
L(return_first_aligned_loop):
- VPTESTN %YMM5, %YMM5, %k0
+ VPTESTN %VEC(7), %VEC(7), %k0
kunpck %k0, %k1, %k0
kmov_2x %k0, %maskz_2x
@@ -282,22 +274,22 @@ L(second_aligned_loop_set_furthest_match):
.p2align 4
L(second_aligned_loop):
- VMOVU (VEC_SIZE * 4)(%rdi), %YMM1
- VMOVU (VEC_SIZE * 5)(%rdi), %YMM2
+ VMOVU (VEC_SIZE * 4)(%rdi), %VEC(3)
+ VMOVU (VEC_SIZE * 5)(%rdi), %VEC(4)
- VPCMP $0, %YMM1, %YMMMATCH, %k2
- vpxord %YMM2, %YMMMATCH, %YMM3
+ VPCMP $0, %VEC(3), %VECMATCH, %k2
+ vpxord %VEC(4), %VECMATCH, %VEC(5)
- VPMIN %YMM1, %YMM2, %YMM4
- VPMIN %YMM3, %YMM4, %YMM3
+ VPMIN %VEC(3), %VEC(4), %VEC(6)
+ VPMIN %VEC(5), %VEC(6), %VEC(5)
- VPTESTN %YMM3, %YMM3, %k1
+ VPTESTN %VEC(5), %VEC(5), %k1
subq $(VEC_SIZE * -2), %rdi
kortestd %k1, %k2
jz L(second_aligned_loop)
- VPCMP $0, %YMM2, %YMMMATCH, %k3
- VPTESTN %YMM4, %YMM4, %k1
+ VPCMP $0, %VEC(4), %VECMATCH, %k3
+ VPTESTN %VEC(6), %VEC(6), %k1
ktestd %k1, %k1
jz L(second_aligned_loop_set_furthest_match)
@@ -312,7 +304,7 @@ L(return_old_match):
ret
L(return_new_match):
- VPTESTN %YMM1, %YMM1, %k0
+ VPTESTN %VEC(3), %VEC(3), %k0
kunpck %k0, %k1, %k0
kmov_2x %k0, %maskz_2x
@@ -334,8 +326,8 @@ L(cross_page_boundary):
as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
a bit of code size. */
xorq %rdi, %rax
- VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
- VPTESTN %YMM1, %YMM1, %k0
+ VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VEC(3)
+ VPTESTN %VEC(3), %VEC(3), %k0
kmovd %k0, %ecx
/* Shift out zero CHAR matches that are before the begining of
@@ -351,7 +343,7 @@ L(cross_page_boundary):
jz L(page_cross_continue)
/* Found zero CHAR so need to test for search CHAR. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
+ VPCMP $0, %VECMATCH, %VEC(3), %k1
kmovd %k1, %eax
/* Shift out search CHAR matches that are before the begining of
src (rdi). */
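
The cross_page_boundary path above reuses the page offset already sitting in eax: xoring it back into rdi clears the low bits and yields the page base without the mov/and pair the comment mentions, and the aligned load at PAGE_SIZE - VEC_SIZE from that base is the last vector of the page, which necessarily contains every byte from rdi to the page end on this path. A small model of the address computation (the assert only documents the precondition of this path):

#include <stdint.h>
#include <assert.h>

#define PAGE_SIZE 4096
#define VEC_SIZE 32

/* off plays the role of eax after the entry's andl; the xor is the
   xorq %rdi, %rax above, giving the page base.  */
static const unsigned char *
last_vec_of_page (const unsigned char *p)
{
  uintptr_t off = (uintptr_t) p & (PAGE_SIZE - 1);
  uintptr_t base = (uintptr_t) p ^ off;
  assert (off > PAGE_SIZE - VEC_SIZE);	/* only reached on page cross  */
  return (const unsigned char *) (base + PAGE_SIZE - VEC_SIZE);
}
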
new file mode 100644
@@ -0,0 +1,90 @@
+/* Macro helpers for VEC_{type}({vec_num})
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _VEC_MACROS_H
+# define _VEC_MACROS_H 1
+
+# ifndef HAS_VEC
+# error "Never include this file directly. Always include a vector config."
+# endif
+
+/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with the same
+ VEC(N) values. */
+#define VEC_hi_xmm0 xmm16
+#define VEC_hi_xmm1 xmm17
+#define VEC_hi_xmm2 xmm18
+#define VEC_hi_xmm3 xmm19
+#define VEC_hi_xmm4 xmm20
+#define VEC_hi_xmm5 xmm21
+#define VEC_hi_xmm6 xmm22
+#define VEC_hi_xmm7 xmm23
+#define VEC_hi_xmm8 xmm24
+#define VEC_hi_xmm9 xmm25
+#define VEC_hi_xmm10 xmm26
+#define VEC_hi_xmm11 xmm27
+#define VEC_hi_xmm12 xmm28
+#define VEC_hi_xmm13 xmm29
+#define VEC_hi_xmm14 xmm30
+#define VEC_hi_xmm15 xmm31
+
+#define VEC_hi_ymm0 ymm16
+#define VEC_hi_ymm1 ymm17
+#define VEC_hi_ymm2 ymm18
+#define VEC_hi_ymm3 ymm19
+#define VEC_hi_ymm4 ymm20
+#define VEC_hi_ymm5 ymm21
+#define VEC_hi_ymm6 ymm22
+#define VEC_hi_ymm7 ymm23
+#define VEC_hi_ymm8 ymm24
+#define VEC_hi_ymm9 ymm25
+#define VEC_hi_ymm10 ymm26
+#define VEC_hi_ymm11 ymm27
+#define VEC_hi_ymm12 ymm28
+#define VEC_hi_ymm13 ymm29
+#define VEC_hi_ymm14 ymm30
+#define VEC_hi_ymm15 ymm31
+
+#define VEC_hi_zmm0 zmm16
+#define VEC_hi_zmm1 zmm17
+#define VEC_hi_zmm2 zmm18
+#define VEC_hi_zmm3 zmm19
+#define VEC_hi_zmm4 zmm20
+#define VEC_hi_zmm5 zmm21
+#define VEC_hi_zmm6 zmm22
+#define VEC_hi_zmm7 zmm23
+#define VEC_hi_zmm8 zmm24
+#define VEC_hi_zmm9 zmm25
+#define VEC_hi_zmm10 zmm26
+#define VEC_hi_zmm11 zmm27
+#define VEC_hi_zmm12 zmm28
+#define VEC_hi_zmm13 zmm29
+#define VEC_hi_zmm14 zmm30
+#define VEC_hi_zmm15 zmm31
+
+# define PRIMITIVE_VEC(vec, num) vec##num
+
+# define VEC_any_xmm(i) PRIMITIVE_VEC(xmm, i)
+# define VEC_any_ymm(i) PRIMITIVE_VEC(ymm, i)
+# define VEC_any_zmm(i) PRIMITIVE_VEC(zmm, i)
+
+# define VEC_hi_xmm(i) PRIMITIVE_VEC(VEC_hi_xmm, i)
+# define VEC_hi_ymm(i) PRIMITIVE_VEC(VEC_hi_ymm, i)
+# define VEC_hi_zmm(i) PRIMITIVE_VEC(VEC_hi_zmm, i)
+
+#endif
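
The double indirection at the bottom of vec-macros.h is what lets VEC(N) spell the same register set for every encoding: the function-like VEC_hi_ymm(i) first pastes the token VEC_hi_ymmN, and that token is then rescanned and expands to the high, EVEX-only register. A stand-alone check of the expansion, reproducing just two of the definitions above:

#include <stdio.h>

#define VEC_hi_ymm3 ymm19
#define VEC_hi_ymm4 ymm20

#define PRIMITIVE_VEC(vec, num) vec##num
#define VEC_hi_ymm(i) PRIMITIVE_VEC(VEC_hi_ymm, i)

/* Two-level stringization so the argument is fully expanded first.  */
#define STR_(x) #x
#define STR(x) STR_(x)

int
main (void)
{
  puts (STR (VEC_hi_ymm (3)));	/* prints "ymm19" */
  puts (STR (VEC_hi_ymm (4)));	/* prints "ymm20" */
  return 0;
}
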