Message ID | 20240201005721.782679-1-tirtajames45@gmail.com |
---|---|
State | New |
Headers | show |
Series | [v4] sysdeps/x86_64/multiarch/memmem-avx2.c: add memmem-avx2.c | expand |
On Thu, Feb 1, 2024 at 1:00 AM James Tirta Halim <tirtajames45@gmail.com> wrote: > > Find the rarest byte in NE. Find the parts of HS that matches the rare byte > and the byte after it. If found, shift back to the start of NE in HS and > vector compare the first VEC_SIZE with NE. If matches, compare the rest > with MEMCMPEQ. > > Timings (Core i3-1115G4): > basic_memmem twoway_memmem __memmem_avx512 __memmem_avx2 > __memmem_generic > Total: > 6.80124e+06 1.06087e+06 219483 345385 768041 > Average: > 25958.9 4049.11 837.721 1318.26 2931.45 > > Passes make check. > > Changes in v1: > 1. Add memmem-avx2.c > > Changes in v2: > 1. Add avx512 support with a generic header file > 2. Use __memcmpeq instead of memcmp > 3. Remove scalar loop > 4. Fix unsafe unaligned load > > Changes in v3: > 1. Avoid checking for alignment to the start of the page since that will be rare > 2. Use __memcmpeq instead of __memcmpeq_avx2 (it generates undefined > reference errors) > 3. Add memmem.c (needs review) > 4. Add __memcmpeq_avx2 and __memcmpeq_avx512 to ifunc-impl-list.c (needs > review) > 5. Add libc_hidden_builtin_def and MEMMEM to memmem.c (needs review) > > Changes in v4: > 1. Correct the cpu feature checks in ifunc-impl-list.c and memmem.c to > use AVX512BW and BMI1 for AVX512 and AVX2 and BMI1 for AVX2 > 2. Correct the Makefile to use the appropriate flags > 3. Rename memmem-vectorized-avx.h to memmem-avx-base.h > 4. Remove unused vector macros (POPCNT and LZCNT) > > --- > string/memmem.c | 7 +- > sysdeps/x86_64/multiarch/Makefile | 5 + > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 ++ > sysdeps/x86_64/multiarch/memmem-avx-base.h | 217 +++++++++++++++++++++ > sysdeps/x86_64/multiarch/memmem-avx2.c | 3 + > sysdeps/x86_64/multiarch/memmem-avx512.c | 16 ++ > sysdeps/x86_64/multiarch/memmem.c | 67 +++++++ > 7 files changed, 326 insertions(+), 1 deletion(-) > create mode 100644 sysdeps/x86_64/multiarch/memmem-avx-base.h > create mode 100644 sysdeps/x86_64/multiarch/memmem-avx2.c > create mode 100644 sysdeps/x86_64/multiarch/memmem-avx512.c > create mode 100644 sysdeps/x86_64/multiarch/memmem.c > > diff --git a/string/memmem.c b/string/memmem.c > index 6badc1c3bd..62654b4bd0 100644 > --- a/string/memmem.c > +++ b/string/memmem.c > @@ -32,6 +32,10 @@ > > #undef memmem > > +#ifndef MEMMEM > +# define MEMMEM __memmem > +#endif > + > /* Hash character pairs so a small shift table can be used. All bits of > p[0] are included, but not all bits from p[-1]. So if two equal hashes > match on p[-1], p[0] matches too. Hash collisions are harmless and result > @@ -50,7 +54,7 @@ > The limit also implies worst-case performance is linear. > Needles larger than 256 characters use the linear-time Two-Way algorithm. */ > void * > -__memmem(const void *haystack, size_t hs_len, > +MEMMEM(const void *haystack, size_t hs_len, > const void *needle, size_t ne_len) > { > const unsigned char *hs = (const unsigned char *)haystack; > @@ -122,3 +126,4 @@ const void *needle, size_t ne_len) > libc_hidden_def(__memmem) > weak_alias(__memmem, memmem) > libc_hidden_weak(memmem) > +libc_hidden_builtin_def(MEMMEM) > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index e1e894c963..95c95eee4b 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -15,6 +15,8 @@ sysdep_routines += \ > memcmpeq-avx2-rtm \ > memcmpeq-evex \ > memcmpeq-sse2 \ > + memmem-avx2 \ > + memmem-avx512 \ > memmove-avx-unaligned-erms \ > memmove-avx-unaligned-erms-rtm \ > memmove-avx512-no-vzeroupper \ > @@ -122,6 +124,9 @@ sysdep_routines += \ > varshift \ > # sysdep_routines > > +CFLAGS-memmem-avx2.c += -mavx2 -mbmi -O3 > +CFLAGS-memmem-avx512.c += -mavx512f -mavx512bw -mbmi -O3 > + > CFLAGS-strcspn-sse4.c += -msse4 > CFLAGS-strpbrk-sse4.c += -msse4 > CFLAGS-strspn-sse4.c += -msse4 > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index 5427ff1907..300d4064ae 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -799,6 +799,18 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned) > IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_generic)) > > + /* Support sysdeps/x86_64/multiarch/memmem.c. */ > + IFUNC_IMPL (i, name, memmem, > + IFUNC_IMPL_ADD (array, i, memmem, > + (CPU_FEATURE_USABLE (AVX512BW) > + && CPU_FEATURE_USABLE (BMI1)), > + __memmem_avx512) > + IFUNC_IMPL_ADD (array, i, memmem, > + (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI1)), > + __memmem_avx2) > + IFUNC_IMPL_ADD (array, i, memmem, 1, __memmem_generic)) > + > /* Support sysdeps/x86_64/multiarch/wcschr.c. */ > IFUNC_IMPL (i, name, wcschr, > X86_IFUNC_IMPL_ADD_V4 (array, i, wcschr, > diff --git a/sysdeps/x86_64/multiarch/memmem-avx-base.h b/sysdeps/x86_64/multiarch/memmem-avx-base.h > new file mode 100644 > index 0000000000..46883bb121 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/memmem-avx-base.h > @@ -0,0 +1,217 @@ > +#include <immintrin.h> > +#include <inttypes.h> > +#include <string.h> > +#include <libc-pointer-arith.h> > + > +#ifndef FUNC_NAME > +# define __memmem_avx2 > +#endif > +#ifndef VEC > +# define VEC __m256i > +#endif > +#ifndef VEC_SIZE > +# define VEC_SIZE sizeof (VEC) > +#endif > +#ifndef MASK > +# define MASK uint32_t > +#endif > +#ifndef MASK_SIZE > +# define MASK_SIZE sizeof (MASK) > +#endif > +#ifndef LOAD > +# define LOAD(x) _mm256_load_si256 (x) > +#endif > +#ifndef LOADU > +# define LOADU(x) _mm256_loadu_si256 (x) > +#endif > +#ifndef STORE > +# define STORE(dst, src) _mm256_store_si256 (dst, src) > +#endif > +#ifndef STOREU > +# define STOREU(dst, src) _mm256_storeu_si256 (dst, src) > +#endif > +#ifndef CMPEQ8_MASK > +# define CMPEQ8_MASK(x, y) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (x, y)) > +#endif > +#ifndef SETZERO > +# define SETZERO(x) _mm256_setzero_si256 (x) > +#endif > +#ifndef SETONE8 > +# define SETONE8(x) _mm256_set1_epi8 (x) > +#endif > +#ifndef TZCNT > +# define TZCNT(x) _tzcnt_u32 (x) > +#endif > +#ifndef BLSR > +# define BLSR(x) _blsr_u32 (x) > +#endif > +#ifndef ONES > +# define ONES ((MASK) -1) > +#endif > + Things like `ONE`, `VEC_SIZE`, `MASK_SIZE`, etc... can just be unconditionally defined in memmem-avx-base Also, instead of having a default in memmem-avx-base, think the rest should be just be defined in the memem-avx2/memem-avx512. Otherwise theres not really preventing the `TZCNT`/`BLSR` from becoming desynced with `MASK` (likewise for the VEC defines). > +#ifndef MEMCMPEQ > +# define MEMCMPEQ __memcmpeq > +#endif > +#ifndef MEMCPY > +# define MEMCPY memcpy > +#endif > +#ifndef MEMCHR > +# define MEMCHR memchr > +#endif > +#ifndef PAGE_SIZE > +# define PAGE_SIZE 4096 > +#endif > +#define MIN(x, y) (((x) < (y)) ? (x) : (y)) > + > +static inline void * > +find_rarest_byte (const void *ne, size_t n) > +{ > + /* Lower is rarer. The table is based on the > + *.c and *.h files in glibc. */ > + static const unsigned char rarebyte_table[256] > + = { 0, 1, 13, 56, 59, 60, 61, 62, 63, 232, 248, 2, 158, 4, > + 5, 6, 7, 8, 9, 10, 14, 20, 26, 29, 37, 46, 52, 53, > + 54, 55, 57, 58, 255, 172, 242, 193, 162, 174, 178, 182, 218, 219, > + 212, 180, 249, 197, 221, 210, 253, 231, 230, 224, 225, 226, 227, 223, > + 222, 220, 176, 213, 184, 229, 188, 164, 159, 209, 181, 203, 189, 216, > + 196, 192, 185, 205, 161, 168, 215, 187, 211, 194, 195, 165, 206, 204, > + 214, 198, 173, 179, 175, 183, 167, 202, 239, 201, 160, 241, 163, 246, > + 233, 238, 240, 254, 237, 208, 234, 250, 169, 186, 236, 217, 245, 243, > + 228, 170, 247, 244, 251, 235, 199, 200, 252, 207, 177, 191, 171, 190, > + 166, 3, 140, 134, 124, 126, 86, 128, 95, 117, 114, 93, 81, 87, > + 132, 96, 112, 97, 103, 82, 139, 89, 98, 88, 119, 74, 156, 115, > + 104, 75, 120, 106, 76, 155, 90, 122, 107, 125, 152, 145, 136, 137, > + 101, 116, 102, 108, 99, 141, 77, 78, 118, 79, 109, 100, 150, 73, > + 94, 72, 121, 151, 113, 135, 110, 105, 83, 91, 11, 12, 64, 149, > + 146, 111, 65, 69, 66, 15, 16, 17, 18, 19, 130, 92, 144, 123, > + 21, 22, 23, 24, 131, 133, 127, 142, 25, 70, 129, 27, 28, 67, > + 153, 84, 143, 138, 147, 157, 148, 68, 71, 30, 31, 32, 33, 34, > + 35, 36, 154, 38, 39, 40, 41, 42, 80, 43, 44, 45, 47, 48, > + 85, 49, 50, 51 }; > + const unsigned char *rare = (const unsigned char *) ne; > + const unsigned char *p = (const unsigned char *) ne; > + int c_rare = rarebyte_table[*rare]; > + int c; > + for (; n--; ++p) > + { > + c = rarebyte_table[*p]; > + if (c < c_rare) > + { > + rare = p; > + c_rare = c; > + } > + } > + return (void *) rare; > +} > + > +void * > +FUNC_NAME (const void *hs, size_t hs_len, const void *ne, size_t ne_len) > +{ > + if (ne_len == 1) > + return (void *) MEMCHR (hs, *(unsigned char *) ne, hs_len); > + if (__glibc_unlikely (ne_len == 0)) > + return (void *) hs; > + if (__glibc_unlikely (hs_len < ne_len)) > + return NULL; > + VEC hv0, hv1, hv, nv; > + MASK i, hm0, hm1, m, cmpm; > + const unsigned int matchsh = ne_len < VEC_SIZE ? VEC_SIZE - ne_len : 0; > + const MASK matchm = ONES << matchsh; > + const unsigned char *h = (const unsigned char *) hs; > + const unsigned char *const end = h + hs_len - ne_len; > + const unsigned char *hp; > + size_t shift = PTR_DIFF (find_rarest_byte (ne, ne_len), ne); think ne_len here should be probably limitted to something like MIN(ne_len, VEC_SIZE). > + if (shift == ne_len - 1) > + --shift; > + const VEC nv0 = SETONE8 (*((char *) ne + shift)); > + const VEC nv1 = SETONE8 (*((char *) ne + shift + 1)); > + h += shift; > + if (PTR_DIFF (PTR_ALIGN_UP (ne, PAGE_SIZE), ne) >= VEC_SIZE > + || PTR_IS_ALIGNED (ne, PAGE_SIZE) || ne_len >= VEC_SIZE) > + nv = LOADU ((VEC *) ne); think simpler logic is `(ne & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE)` > + else > + MEMCPY (&nv, ne, MIN (VEC_SIZE, ne_len)); > + const unsigned int off = PTR_DIFF (h, PTR_ALIGN_DOWN (h, VEC_SIZE)); > + unsigned int off2 = (PTR_DIFF (end, (h - shift)) < VEC_SIZE) > + ? VEC_SIZE - (unsigned int) (end - (h - shift)) - 1 > + : 0; > + h -= off; > + hv0 = LOAD ((const VEC *) h); > + hm0 = (MASK) CMPEQ8_MASK (hv0, nv0); > + hm1 = (MASK) CMPEQ8_MASK (hv0, nv1) >> 1; > + /* Clear matched bits that are out of bounds. */ > + m = (((hm0 & hm1) >> off) << off2) >> off2; > + while (m) > + { > + i = TZCNT (m); > + m = BLSR (m); > + hp = h + off + i - shift; > + if (PTR_DIFF (PTR_ALIGN_UP (hp, PAGE_SIZE), hp) >= VEC_SIZE) > + { > + hv = LOADU ((VEC *) hp); > + cmpm = (MASK) CMPEQ8_MASK (hv, nv) << matchsh; > + if (cmpm == matchm) > + if (ne_len <= VEC_SIZE > + || !MEMCMPEQ (hp + VEC_SIZE, (const char *) ne + VEC_SIZE, > + ne_len - VEC_SIZE)) > + return (void *) hp; > + } > + else > + { > + if (!MEMCMPEQ (hp, ne, ne_len)) > + return (void *) hp; > + } > + } > + h += VEC_SIZE - 1; > + for (; h - shift + VEC_SIZE <= end; h += VEC_SIZE) > + { > + hv0 = LOADU ((const VEC *) h); > + hv1 = LOAD ((const VEC *) (h + 1)); > + hm1 = (MASK) CMPEQ8_MASK (hv1, nv1); > + hm0 = (MASK) CMPEQ8_MASK (hv0, nv0); > + m = hm0 & hm1; > + while (m) > + { > + match: > + i = TZCNT (m); > + m = BLSR (m); > + hp = h + i - shift; > + if (PTR_DIFF (PTR_ALIGN_UP (hp, PAGE_SIZE), hp) >= VEC_SIZE) > + { > + hv = LOADU ((VEC *) hp); > + cmpm = (MASK) CMPEQ8_MASK (hv, nv) << matchsh; > + if (cmpm == matchm) > + if (ne_len <= VEC_SIZE > + || !MEMCMPEQ (hp + VEC_SIZE, (const char *) ne + VEC_SIZE, > + ne_len - VEC_SIZE)) > + return (void *) hp; > + } > + else > + { > + if (!MEMCMPEQ (hp, ne, ne_len)) > + return (void *) hp; > + } > + } > + } > + if (h - shift <= end) > + { > + off2 = VEC_SIZE - (unsigned int) (end - (h - shift)) - 1; > + hv1 = LOAD ((const VEC *) (h + 1)); > + if (PTR_DIFF (PTR_ALIGN_UP (h, PAGE_SIZE), h) >= VEC_SIZE) > + { > + hv0 = LOADU ((const VEC *) h); > + hm1 = (MASK) CMPEQ8_MASK (hv1, nv1); > + hm0 = (MASK) CMPEQ8_MASK (hv0, nv0); > + } > + else > + { > + hm1 = (MASK) CMPEQ8_MASK (hv1, nv1); > + hm0 = 1 | (MASK) CMPEQ8_MASK (hv1, nv0) << 1; > + } > + /* Clear matched bits that are out of bounds. */ > + m = ((hm0 & hm1) << off2) >> off2; > + if (m) > + goto match; > + } > + return NULL; > +} The implementation is ingeneral a bit hard to follow. Can you 1) comment the implementation. Particularly a bit lost following the setup code around `off`/`h`/`off2`/`shift`. > diff --git a/sysdeps/x86_64/multiarch/memmem-avx2.c b/sysdeps/x86_64/multiarch/memmem-avx2.c > new file mode 100644 > index 0000000000..91f5d5d331 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/memmem-avx2.c > @@ -0,0 +1,3 @@ > +#define FUNC_NAME __memmem_avx2 > + > +#include "memmem-avx-base.h" > diff --git a/sysdeps/x86_64/multiarch/memmem-avx512.c b/sysdeps/x86_64/multiarch/memmem-avx512.c > new file mode 100644 > index 0000000000..163efa2133 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/memmem-avx512.c > @@ -0,0 +1,16 @@ > +#define VEC __m512i > +#define MASK uint64_t > +#define LOAD(x) _mm512_load_si512 (x) > +#define LOADU(x) _mm512_loadu_si512 (x) > +#define STORE(dst, src) _mm512_store_si512 (dst, src) > +#define STOREU(dst, src) _mm512_storeu_si512 (dst, src) > +#define CMPEQ8_MASK(x, y) _mm512_cmpeq_epi8_mask (x, y) > +#define SETZERO(x) _mm512_setzero_si512 (x) > +#define SETONE8(x) _mm512_set1_epi8 (x) > +#define TZCNT(x) _tzcnt_u64 (x) > +#define BLSR(x) _blsr_u64 (x) > +#define ONES ((MASK) -1) > + > +#define FUNC_NAME __memmem_avx512 > + > +#include "memmem-avx-base.h" > diff --git a/sysdeps/x86_64/multiarch/memmem.c b/sysdeps/x86_64/multiarch/memmem.c > new file mode 100644 > index 0000000000..8fe7b77d33 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/memmem.c > @@ -0,0 +1,67 @@ > +/* Multiple versions of memmem. > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2012-2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +/* Redefine memmem so that the compiler won't complain about the type > + mismatch with the IFUNC selector in strong_alias, below. */ > +#undef memmem > +#define memmem __redirect_memmem > +#include <string.h> > +#undef memmem > + > +#define MEMMEM __memmem_generic > +#ifdef SHARED > +# undef libc_hidden_builtin_def > +# define libc_hidden_builtin_def(name) \ > + __hidden_ver1 (__memmem_generic, __GI_memmem, __memmem_generic); > +#endif > + > +#include "string/memmem.c" > + > +extern __typeof (__redirect_memmem) __memmem_avx2 attribute_hidden; > +extern __typeof (__redirect_memmem) __memmem_generic attribute_hidden; > +extern __typeof (__redirect_memmem) __memmem_avx512 attribute_hidden; > + > +#define SYMBOL_NAME memmem > + > +#include "init-arch.h" > + > +/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle > + ifunc symbol properly. */ > +extern __typeof (__redirect_memmem) __libc_memmem; > + > +static inline void * > +IFUNC_SELECTOR (void) > +{ > + const struct cpu_features *cpu_features = __get_cpu_features (); > + > + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512) > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) > + && CPU_FEATURE_USABLE_P (cpu_features, BMI1)) > + return __memmem_avx512; > + > + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) > + && CPU_FEATURE_USABLE_P (cpu_features, BMI1)) > + return __memmem_avx2; > + > + return __memmem_generic; > +} > + > +libc_ifunc_redirected (__redirect_memmem, __libc_memmem, IFUNC_SELECTOR ()); > +#undef memmem > +strong_alias (__libc_memmem, __memmem) > -- > 2.43.0 >
On Sat, Feb 3, 2024 at 5:48 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > On Thu, Feb 1, 2024 at 1:00 AM James Tirta Halim <tirtajames45@gmail.com> > wrote: > > > > Find the rarest byte in NE. Find the parts of HS that matches the rare > byte > > and the byte after it. If found, shift back to the start of NE in HS and > > vector compare the first VEC_SIZE with NE. If matches, compare the rest > > with MEMCMPEQ. > > > > Timings (Core i3-1115G4): > > basic_memmem twoway_memmem __memmem_avx512 __memmem_avx2 > > __memmem_generic > > Total: > > 6.80124e+06 1.06087e+06 219483 345385 768041 > > Average: > > 25958.9 4049.11 837.721 1318.26 2931.45 > > > > Passes make check. > > > > Changes in v1: > > 1. Add memmem-avx2.c > > > > Changes in v2: > > 1. Add avx512 support with a generic header file > > 2. Use __memcmpeq instead of memcmp > > 3. Remove scalar loop > > 4. Fix unsafe unaligned load > > > > Changes in v3: > > 1. Avoid checking for alignment to the start of the page since that will > be rare > > 2. Use __memcmpeq instead of __memcmpeq_avx2 (it generates undefined > > reference errors) > > 3. Add memmem.c (needs review) > > 4. Add __memcmpeq_avx2 and __memcmpeq_avx512 to ifunc-impl-list.c (needs > > review) > > 5. Add libc_hidden_builtin_def and MEMMEM to memmem.c (needs review) > > > > Changes in v4: > > 1. Correct the cpu feature checks in ifunc-impl-list.c and memmem.c to > > use AVX512BW and BMI1 for AVX512 and AVX2 and BMI1 for AVX2 > > 2. Correct the Makefile to use the appropriate flags > > 3. Rename memmem-vectorized-avx.h to memmem-avx-base.h > > 4. Remove unused vector macros (POPCNT and LZCNT) > > > > --- > > string/memmem.c | 7 +- > > sysdeps/x86_64/multiarch/Makefile | 5 + > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 ++ > > sysdeps/x86_64/multiarch/memmem-avx-base.h | 217 +++++++++++++++++++++ > > sysdeps/x86_64/multiarch/memmem-avx2.c | 3 + > > sysdeps/x86_64/multiarch/memmem-avx512.c | 16 ++ > > sysdeps/x86_64/multiarch/memmem.c | 67 +++++++ > > 7 files changed, 326 insertions(+), 1 deletion(-) > > create mode 100644 sysdeps/x86_64/multiarch/memmem-avx-base.h > > create mode 100644 sysdeps/x86_64/multiarch/memmem-avx2.c > > create mode 100644 sysdeps/x86_64/multiarch/memmem-avx512.c > > create mode 100644 sysdeps/x86_64/multiarch/memmem.c > > > > diff --git a/string/memmem.c b/string/memmem.c > > index 6badc1c3bd..62654b4bd0 100644 > > --- a/string/memmem.c > > +++ b/string/memmem.c > > @@ -32,6 +32,10 @@ > > > > #undef memmem > > > > +#ifndef MEMMEM > > +# define MEMMEM __memmem > > +#endif > > + > > /* Hash character pairs so a small shift table can be used. All bits of > > p[0] are included, but not all bits from p[-1]. So if two equal > hashes > > match on p[-1], p[0] matches too. Hash collisions are harmless and > result > > @@ -50,7 +54,7 @@ > > The limit also implies worst-case performance is linear. > > Needles larger than 256 characters use the linear-time Two-Way > algorithm. */ > > void * > > -__memmem(const void *haystack, size_t hs_len, > > +MEMMEM(const void *haystack, size_t hs_len, > > const void *needle, size_t ne_len) > > { > > const unsigned char *hs = (const unsigned char *)haystack; > > @@ -122,3 +126,4 @@ const void *needle, size_t ne_len) > > libc_hidden_def(__memmem) > > weak_alias(__memmem, memmem) > > libc_hidden_weak(memmem) > > +libc_hidden_builtin_def(MEMMEM) > > diff --git a/sysdeps/x86_64/multiarch/Makefile > b/sysdeps/x86_64/multiarch/Makefile > > index e1e894c963..95c95eee4b 100644 > > --- a/sysdeps/x86_64/multiarch/Makefile > > +++ b/sysdeps/x86_64/multiarch/Makefile > > @@ -15,6 +15,8 @@ sysdep_routines += \ > > memcmpeq-avx2-rtm \ > > memcmpeq-evex \ > > memcmpeq-sse2 \ > > + memmem-avx2 \ > > + memmem-avx512 \ > > memmove-avx-unaligned-erms \ > > memmove-avx-unaligned-erms-rtm \ > > memmove-avx512-no-vzeroupper \ > > @@ -122,6 +124,9 @@ sysdep_routines += \ > > varshift \ > > # sysdep_routines > > > > +CFLAGS-memmem-avx2.c += -mavx2 -mbmi -O3 > > +CFLAGS-memmem-avx512.c += -mavx512f -mavx512bw -mbmi -O3 > > + > > CFLAGS-strcspn-sse4.c += -msse4 > > CFLAGS-strpbrk-sse4.c += -msse4 > > CFLAGS-strspn-sse4.c += -msse4 > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > index 5427ff1907..300d4064ae 100644 > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > @@ -799,6 +799,18 @@ __libc_ifunc_impl_list (const char *name, struct > libc_ifunc_impl *array, > > IFUNC_IMPL_ADD (array, i, strstr, 1, > __strstr_sse2_unaligned) > > IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_generic)) > > > > + /* Support sysdeps/x86_64/multiarch/memmem.c. */ > > + IFUNC_IMPL (i, name, memmem, > > + IFUNC_IMPL_ADD (array, i, memmem, > > + (CPU_FEATURE_USABLE (AVX512BW) > > + && CPU_FEATURE_USABLE (BMI1)), > > + __memmem_avx512) > > + IFUNC_IMPL_ADD (array, i, memmem, > > + (CPU_FEATURE_USABLE (AVX2) > > + && CPU_FEATURE_USABLE (BMI1)), > > + __memmem_avx2) > > + IFUNC_IMPL_ADD (array, i, memmem, 1, __memmem_generic)) > > + > > /* Support sysdeps/x86_64/multiarch/wcschr.c. */ > > IFUNC_IMPL (i, name, wcschr, > > X86_IFUNC_IMPL_ADD_V4 (array, i, wcschr, > > diff --git a/sysdeps/x86_64/multiarch/memmem-avx-base.h > b/sysdeps/x86_64/multiarch/memmem-avx-base.h > > new file mode 100644 > > index 0000000000..46883bb121 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/memmem-avx-base.h > > @@ -0,0 +1,217 @@ > > +#include <immintrin.h> > > +#include <inttypes.h> > > +#include <string.h> > > +#include <libc-pointer-arith.h> > > + > > +#ifndef FUNC_NAME > > +# define __memmem_avx2 > > +#endif > > +#ifndef VEC > > +# define VEC __m256i > > +#endif > > +#ifndef VEC_SIZE > > +# define VEC_SIZE sizeof (VEC) > > +#endif > > +#ifndef MASK > > +# define MASK uint32_t > > +#endif > > +#ifndef MASK_SIZE > > +# define MASK_SIZE sizeof (MASK) > > +#endif > > +#ifndef LOAD > > +# define LOAD(x) _mm256_load_si256 (x) > > +#endif > > +#ifndef LOADU > > +# define LOADU(x) _mm256_loadu_si256 (x) > > +#endif > > +#ifndef STORE > > +# define STORE(dst, src) _mm256_store_si256 (dst, src) > > +#endif > > +#ifndef STOREU > > +# define STOREU(dst, src) _mm256_storeu_si256 (dst, src) > > +#endif > > +#ifndef CMPEQ8_MASK > > +# define CMPEQ8_MASK(x, y) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (x, > y)) > > +#endif > > +#ifndef SETZERO > > +# define SETZERO(x) _mm256_setzero_si256 (x) > > +#endif > > +#ifndef SETONE8 > > +# define SETONE8(x) _mm256_set1_epi8 (x) > > +#endif > > +#ifndef TZCNT > > +# define TZCNT(x) _tzcnt_u32 (x) > > +#endif > > +#ifndef BLSR > > +# define BLSR(x) _blsr_u32 (x) > > +#endif > > +#ifndef ONES > > +# define ONES ((MASK) -1) > > +#endif > > + > Things like `ONE`, `VEC_SIZE`, `MASK_SIZE`, etc... > can just be unconditionally defined in memmem-avx-base > > Also, instead of having a default in memmem-avx-base, > think the rest should be just be defined in the memem-avx2/memem-avx512. > Otherwise theres not really preventing the `TZCNT`/`BLSR` from becoming > desynced with `MASK` (likewise for the VEC defines). > AVX2 macros are still defined in memmem-avx-base.h because otherwise, IDEs will show undeclared identifier errors when editing memmem-avx-base.h. > > > > +#ifndef MEMCMPEQ > > +# define MEMCMPEQ __memcmpeq > > +#endif > > +#ifndef MEMCPY > > +# define MEMCPY memcpy > > +#endif > > +#ifndef MEMCHR > > +# define MEMCHR memchr > > +#endif > > +#ifndef PAGE_SIZE > > +# define PAGE_SIZE 4096 > > +#endif > > +#define MIN(x, y) (((x) < (y)) ? (x) : (y)) > > + > > +static inline void * > > +find_rarest_byte (const void *ne, size_t n) > > +{ > > + /* Lower is rarer. The table is based on the > > + *.c and *.h files in glibc. */ > > + static const unsigned char rarebyte_table[256] > > + = { 0, 1, 13, 56, 59, 60, 61, 62, 63, 232, 248, > 2, 158, 4, > > + 5, 6, 7, 8, 9, 10, 14, 20, 26, 29, 37, 46, > 52, 53, > > + 54, 55, 57, 58, 255, 172, 242, 193, 162, 174, 178, 182, > 218, 219, > > + 212, 180, 249, 197, 221, 210, 253, 231, 230, 224, 225, 226, > 227, 223, > > + 222, 220, 176, 213, 184, 229, 188, 164, 159, 209, 181, 203, > 189, 216, > > + 196, 192, 185, 205, 161, 168, 215, 187, 211, 194, 195, 165, > 206, 204, > > + 214, 198, 173, 179, 175, 183, 167, 202, 239, 201, 160, 241, > 163, 246, > > + 233, 238, 240, 254, 237, 208, 234, 250, 169, 186, 236, 217, > 245, 243, > > + 228, 170, 247, 244, 251, 235, 199, 200, 252, 207, 177, 191, > 171, 190, > > + 166, 3, 140, 134, 124, 126, 86, 128, 95, 117, 114, 93, > 81, 87, > > + 132, 96, 112, 97, 103, 82, 139, 89, 98, 88, 119, 74, > 156, 115, > > + 104, 75, 120, 106, 76, 155, 90, 122, 107, 125, 152, 145, > 136, 137, > > + 101, 116, 102, 108, 99, 141, 77, 78, 118, 79, 109, 100, > 150, 73, > > + 94, 72, 121, 151, 113, 135, 110, 105, 83, 91, 11, 12, > 64, 149, > > + 146, 111, 65, 69, 66, 15, 16, 17, 18, 19, 130, 92, > 144, 123, > > + 21, 22, 23, 24, 131, 133, 127, 142, 25, 70, 129, 27, > 28, 67, > > + 153, 84, 143, 138, 147, 157, 148, 68, 71, 30, 31, 32, > 33, 34, > > + 35, 36, 154, 38, 39, 40, 41, 42, 80, 43, 44, 45, > 47, 48, > > + 85, 49, 50, 51 }; > > + const unsigned char *rare = (const unsigned char *) ne; > > + const unsigned char *p = (const unsigned char *) ne; > > + int c_rare = rarebyte_table[*rare]; > > + int c; > > + for (; n--; ++p) > > + { > > + c = rarebyte_table[*p]; > > + if (c < c_rare) > > + { > > + rare = p; > > + c_rare = c; > > + } > > + } > > + return (void *) rare; > > +} > > + > > +void * > > +FUNC_NAME (const void *hs, size_t hs_len, const void *ne, size_t ne_len) > > +{ > > + if (ne_len == 1) > > + return (void *) MEMCHR (hs, *(unsigned char *) ne, hs_len); > > + if (__glibc_unlikely (ne_len == 0)) > > + return (void *) hs; > > + if (__glibc_unlikely (hs_len < ne_len)) > > + return NULL; > > + VEC hv0, hv1, hv, nv; > > + MASK i, hm0, hm1, m, cmpm; > > + const unsigned int matchsh = ne_len < VEC_SIZE ? VEC_SIZE - ne_len : > 0; > > + const MASK matchm = ONES << matchsh; > > + const unsigned char *h = (const unsigned char *) hs; > > + const unsigned char *const end = h + hs_len - ne_len; > > + const unsigned char *hp; > > + size_t shift = PTR_DIFF (find_rarest_byte (ne, ne_len), ne); > think ne_len here should be probably limitted to something like > MIN(ne_len, VEC_SIZE). > Done in v5. > > + if (shift == ne_len - 1) > > + --shift; > > + const VEC nv0 = SETONE8 (*((char *) ne + shift)); > > + const VEC nv1 = SETONE8 (*((char *) ne + shift + 1)); > > + h += shift; > > + if (PTR_DIFF (PTR_ALIGN_UP (ne, PAGE_SIZE), ne) >= VEC_SIZE > > + || PTR_IS_ALIGNED (ne, PAGE_SIZE) || ne_len >= VEC_SIZE) > > + nv = LOADU ((VEC *) ne); > think simpler logic is `(ne & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE)` > Done in v5. > > + else > > + MEMCPY (&nv, ne, MIN (VEC_SIZE, ne_len)); > > + const unsigned int off = PTR_DIFF (h, PTR_ALIGN_DOWN (h, VEC_SIZE)); > > + unsigned int off2 = (PTR_DIFF (end, (h - shift)) < VEC_SIZE) > > + ? VEC_SIZE - (unsigned int) (end - (h - > shift)) - 1 > > + : 0; > > + h -= off; > > + hv0 = LOAD ((const VEC *) h); > > + hm0 = (MASK) CMPEQ8_MASK (hv0, nv0); > > + hm1 = (MASK) CMPEQ8_MASK (hv0, nv1) >> 1; > > + /* Clear matched bits that are out of bounds. */ > > + m = (((hm0 & hm1) >> off) << off2) >> off2; > > + while (m) > > + { > > + i = TZCNT (m); > > + m = BLSR (m); > > + hp = h + off + i - shift; > > + if (PTR_DIFF (PTR_ALIGN_UP (hp, PAGE_SIZE), hp) >= VEC_SIZE) > > + { > > + hv = LOADU ((VEC *) hp); > > + cmpm = (MASK) CMPEQ8_MASK (hv, nv) << matchsh; > > + if (cmpm == matchm) > > + if (ne_len <= VEC_SIZE > > + || !MEMCMPEQ (hp + VEC_SIZE, (const char *) ne + > VEC_SIZE, > > + ne_len - VEC_SIZE)) > > + return (void *) hp; > > + } > > + else > > + { > > + if (!MEMCMPEQ (hp, ne, ne_len)) > > + return (void *) hp; > > + } > > + } > > + h += VEC_SIZE - 1; > > + for (; h - shift + VEC_SIZE <= end; h += VEC_SIZE) > > + { > > + hv0 = LOADU ((const VEC *) h); > > + hv1 = LOAD ((const VEC *) (h + 1)); > > + hm1 = (MASK) CMPEQ8_MASK (hv1, nv1); > > + hm0 = (MASK) CMPEQ8_MASK (hv0, nv0); > > + m = hm0 & hm1; > > + while (m) > > + { > > + match: > > + i = TZCNT (m); > > + m = BLSR (m); > > + hp = h + i - shift; > > + if (PTR_DIFF (PTR_ALIGN_UP (hp, PAGE_SIZE), hp) >= VEC_SIZE) > > + { > > + hv = LOADU ((VEC *) hp); > > + cmpm = (MASK) CMPEQ8_MASK (hv, nv) << matchsh; > > + if (cmpm == matchm) > > + if (ne_len <= VEC_SIZE > > + || !MEMCMPEQ (hp + VEC_SIZE, (const char *) ne + > VEC_SIZE, > > + ne_len - VEC_SIZE)) > > + return (void *) hp; > > + } > > + else > > + { > > + if (!MEMCMPEQ (hp, ne, ne_len)) > > + return (void *) hp; > > + } > > + } > > + } > > + if (h - shift <= end) > > + { > > + off2 = VEC_SIZE - (unsigned int) (end - (h - shift)) - 1; > > + hv1 = LOAD ((const VEC *) (h + 1)); > > + if (PTR_DIFF (PTR_ALIGN_UP (h, PAGE_SIZE), h) >= VEC_SIZE) > > + { > > + hv0 = LOADU ((const VEC *) h); > > + hm1 = (MASK) CMPEQ8_MASK (hv1, nv1); > > + hm0 = (MASK) CMPEQ8_MASK (hv0, nv0); > > + } > > + else > > + { > > + hm1 = (MASK) CMPEQ8_MASK (hv1, nv1); > > + hm0 = 1 | (MASK) CMPEQ8_MASK (hv1, nv0) << 1; > > + } > > + /* Clear matched bits that are out of bounds. */ > > + m = ((hm0 & hm1) << off2) >> off2; > > + if (m) > > + goto match; > > + } > > + return NULL; > > +} > > The implementation is ingeneral a bit hard to follow. > > Can you > 1) comment the implementation. Particularly a bit lost > following the setup code around `off`/`h`/`off2`/`shift`. > Comments added in v5. > > diff --git a/sysdeps/x86_64/multiarch/memmem-avx2.c > b/sysdeps/x86_64/multiarch/memmem-avx2.c > > new file mode 100644 > > index 0000000000..91f5d5d331 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/memmem-avx2.c > > @@ -0,0 +1,3 @@ > > +#define FUNC_NAME __memmem_avx2 > > + > > +#include "memmem-avx-base.h" > > diff --git a/sysdeps/x86_64/multiarch/memmem-avx512.c > b/sysdeps/x86_64/multiarch/memmem-avx512.c > > new file mode 100644 > > index 0000000000..163efa2133 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/memmem-avx512.c > > @@ -0,0 +1,16 @@ > > +#define VEC __m512i > > +#define MASK uint64_t > > +#define LOAD(x) _mm512_load_si512 (x) > > +#define LOADU(x) _mm512_loadu_si512 (x) > > +#define STORE(dst, src) _mm512_store_si512 (dst, src) > > +#define STOREU(dst, src) _mm512_storeu_si512 (dst, src) > > +#define CMPEQ8_MASK(x, y) _mm512_cmpeq_epi8_mask (x, y) > > +#define SETZERO(x) _mm512_setzero_si512 (x) > > +#define SETONE8(x) _mm512_set1_epi8 (x) > > +#define TZCNT(x) _tzcnt_u64 (x) > > +#define BLSR(x) _blsr_u64 (x) > > +#define ONES ((MASK) -1) > > + > > +#define FUNC_NAME __memmem_avx512 > > + > > +#include "memmem-avx-base.h" > > diff --git a/sysdeps/x86_64/multiarch/memmem.c > b/sysdeps/x86_64/multiarch/memmem.c > > new file mode 100644 > > index 0000000000..8fe7b77d33 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/memmem.c > > @@ -0,0 +1,67 @@ > > +/* Multiple versions of memmem. > > + All versions must be listed in ifunc-impl-list.c. > > + Copyright (C) 2012-2023 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +/* Redefine memmem so that the compiler won't complain about the type > > + mismatch with the IFUNC selector in strong_alias, below. */ > > +#undef memmem > > +#define memmem __redirect_memmem > > +#include <string.h> > > +#undef memmem > > + > > +#define MEMMEM __memmem_generic > > +#ifdef SHARED > > +# undef libc_hidden_builtin_def > > +# define libc_hidden_builtin_def(name) \ > > + __hidden_ver1 (__memmem_generic, __GI_memmem, __memmem_generic); > > +#endif > > + > > +#include "string/memmem.c" > > + > > +extern __typeof (__redirect_memmem) __memmem_avx2 attribute_hidden; > > +extern __typeof (__redirect_memmem) __memmem_generic attribute_hidden; > > +extern __typeof (__redirect_memmem) __memmem_avx512 attribute_hidden; > > + > > +#define SYMBOL_NAME memmem > > + > > +#include "init-arch.h" > > + > > +/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle > > + ifunc symbol properly. */ > > +extern __typeof (__redirect_memmem) __libc_memmem; > > + > > +static inline void * > > +IFUNC_SELECTOR (void) > > +{ > > + const struct cpu_features *cpu_features = __get_cpu_features (); > > + > > + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512) > > + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) > > + && CPU_FEATURE_USABLE_P (cpu_features, BMI1)) > > + return __memmem_avx512; > > + > > + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) > > + && CPU_FEATURE_USABLE_P (cpu_features, BMI1)) > > + return __memmem_avx2; > > + > > + return __memmem_generic; > > +} > > + > > +libc_ifunc_redirected (__redirect_memmem, __libc_memmem, IFUNC_SELECTOR > ()); > > +#undef memmem > > +strong_alias (__libc_memmem, __memmem) > > -- > > 2.43.0 > > >
diff --git a/string/memmem.c b/string/memmem.c index 6badc1c3bd..62654b4bd0 100644 --- a/string/memmem.c +++ b/string/memmem.c @@ -32,6 +32,10 @@ #undef memmem +#ifndef MEMMEM +# define MEMMEM __memmem +#endif + /* Hash character pairs so a small shift table can be used. All bits of p[0] are included, but not all bits from p[-1]. So if two equal hashes match on p[-1], p[0] matches too. Hash collisions are harmless and result @@ -50,7 +54,7 @@ The limit also implies worst-case performance is linear. Needles larger than 256 characters use the linear-time Two-Way algorithm. */ void * -__memmem(const void *haystack, size_t hs_len, +MEMMEM(const void *haystack, size_t hs_len, const void *needle, size_t ne_len) { const unsigned char *hs = (const unsigned char *)haystack; @@ -122,3 +126,4 @@ const void *needle, size_t ne_len) libc_hidden_def(__memmem) weak_alias(__memmem, memmem) libc_hidden_weak(memmem) +libc_hidden_builtin_def(MEMMEM) diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index e1e894c963..95c95eee4b 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -15,6 +15,8 @@ sysdep_routines += \ memcmpeq-avx2-rtm \ memcmpeq-evex \ memcmpeq-sse2 \ + memmem-avx2 \ + memmem-avx512 \ memmove-avx-unaligned-erms \ memmove-avx-unaligned-erms-rtm \ memmove-avx512-no-vzeroupper \ @@ -122,6 +124,9 @@ sysdep_routines += \ varshift \ # sysdep_routines +CFLAGS-memmem-avx2.c += -mavx2 -mbmi -O3 +CFLAGS-memmem-avx512.c += -mavx512f -mavx512bw -mbmi -O3 + CFLAGS-strcspn-sse4.c += -msse4 CFLAGS-strpbrk-sse4.c += -msse4 CFLAGS-strspn-sse4.c += -msse4 diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 5427ff1907..300d4064ae 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -799,6 +799,18 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_generic)) + /* Support sysdeps/x86_64/multiarch/memmem.c. */ + IFUNC_IMPL (i, name, memmem, + IFUNC_IMPL_ADD (array, i, memmem, + (CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI1)), + __memmem_avx512) + IFUNC_IMPL_ADD (array, i, memmem, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI1)), + __memmem_avx2) + IFUNC_IMPL_ADD (array, i, memmem, 1, __memmem_generic)) + /* Support sysdeps/x86_64/multiarch/wcschr.c. */ IFUNC_IMPL (i, name, wcschr, X86_IFUNC_IMPL_ADD_V4 (array, i, wcschr, diff --git a/sysdeps/x86_64/multiarch/memmem-avx-base.h b/sysdeps/x86_64/multiarch/memmem-avx-base.h new file mode 100644 index 0000000000..46883bb121 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memmem-avx-base.h @@ -0,0 +1,217 @@ +#include <immintrin.h> +#include <inttypes.h> +#include <string.h> +#include <libc-pointer-arith.h> + +#ifndef FUNC_NAME +# define __memmem_avx2 +#endif +#ifndef VEC +# define VEC __m256i +#endif +#ifndef VEC_SIZE +# define VEC_SIZE sizeof (VEC) +#endif +#ifndef MASK +# define MASK uint32_t +#endif +#ifndef MASK_SIZE +# define MASK_SIZE sizeof (MASK) +#endif +#ifndef LOAD +# define LOAD(x) _mm256_load_si256 (x) +#endif +#ifndef LOADU +# define LOADU(x) _mm256_loadu_si256 (x) +#endif +#ifndef STORE +# define STORE(dst, src) _mm256_store_si256 (dst, src) +#endif +#ifndef STOREU +# define STOREU(dst, src) _mm256_storeu_si256 (dst, src) +#endif +#ifndef CMPEQ8_MASK +# define CMPEQ8_MASK(x, y) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (x, y)) +#endif +#ifndef SETZERO +# define SETZERO(x) _mm256_setzero_si256 (x) +#endif +#ifndef SETONE8 +# define SETONE8(x) _mm256_set1_epi8 (x) +#endif +#ifndef TZCNT +# define TZCNT(x) _tzcnt_u32 (x) +#endif +#ifndef BLSR +# define BLSR(x) _blsr_u32 (x) +#endif +#ifndef ONES +# define ONES ((MASK) -1) +#endif + +#ifndef MEMCMPEQ +# define MEMCMPEQ __memcmpeq +#endif +#ifndef MEMCPY +# define MEMCPY memcpy +#endif +#ifndef MEMCHR +# define MEMCHR memchr +#endif +#ifndef PAGE_SIZE +# define PAGE_SIZE 4096 +#endif +#define MIN(x, y) (((x) < (y)) ? (x) : (y)) + +static inline void * +find_rarest_byte (const void *ne, size_t n) +{ + /* Lower is rarer. The table is based on the + *.c and *.h files in glibc. */ + static const unsigned char rarebyte_table[256] + = { 0, 1, 13, 56, 59, 60, 61, 62, 63, 232, 248, 2, 158, 4, + 5, 6, 7, 8, 9, 10, 14, 20, 26, 29, 37, 46, 52, 53, + 54, 55, 57, 58, 255, 172, 242, 193, 162, 174, 178, 182, 218, 219, + 212, 180, 249, 197, 221, 210, 253, 231, 230, 224, 225, 226, 227, 223, + 222, 220, 176, 213, 184, 229, 188, 164, 159, 209, 181, 203, 189, 216, + 196, 192, 185, 205, 161, 168, 215, 187, 211, 194, 195, 165, 206, 204, + 214, 198, 173, 179, 175, 183, 167, 202, 239, 201, 160, 241, 163, 246, + 233, 238, 240, 254, 237, 208, 234, 250, 169, 186, 236, 217, 245, 243, + 228, 170, 247, 244, 251, 235, 199, 200, 252, 207, 177, 191, 171, 190, + 166, 3, 140, 134, 124, 126, 86, 128, 95, 117, 114, 93, 81, 87, + 132, 96, 112, 97, 103, 82, 139, 89, 98, 88, 119, 74, 156, 115, + 104, 75, 120, 106, 76, 155, 90, 122, 107, 125, 152, 145, 136, 137, + 101, 116, 102, 108, 99, 141, 77, 78, 118, 79, 109, 100, 150, 73, + 94, 72, 121, 151, 113, 135, 110, 105, 83, 91, 11, 12, 64, 149, + 146, 111, 65, 69, 66, 15, 16, 17, 18, 19, 130, 92, 144, 123, + 21, 22, 23, 24, 131, 133, 127, 142, 25, 70, 129, 27, 28, 67, + 153, 84, 143, 138, 147, 157, 148, 68, 71, 30, 31, 32, 33, 34, + 35, 36, 154, 38, 39, 40, 41, 42, 80, 43, 44, 45, 47, 48, + 85, 49, 50, 51 }; + const unsigned char *rare = (const unsigned char *) ne; + const unsigned char *p = (const unsigned char *) ne; + int c_rare = rarebyte_table[*rare]; + int c; + for (; n--; ++p) + { + c = rarebyte_table[*p]; + if (c < c_rare) + { + rare = p; + c_rare = c; + } + } + return (void *) rare; +} + +void * +FUNC_NAME (const void *hs, size_t hs_len, const void *ne, size_t ne_len) +{ + if (ne_len == 1) + return (void *) MEMCHR (hs, *(unsigned char *) ne, hs_len); + if (__glibc_unlikely (ne_len == 0)) + return (void *) hs; + if (__glibc_unlikely (hs_len < ne_len)) + return NULL; + VEC hv0, hv1, hv, nv; + MASK i, hm0, hm1, m, cmpm; + const unsigned int matchsh = ne_len < VEC_SIZE ? VEC_SIZE - ne_len : 0; + const MASK matchm = ONES << matchsh; + const unsigned char *h = (const unsigned char *) hs; + const unsigned char *const end = h + hs_len - ne_len; + const unsigned char *hp; + size_t shift = PTR_DIFF (find_rarest_byte (ne, ne_len), ne); + if (shift == ne_len - 1) + --shift; + const VEC nv0 = SETONE8 (*((char *) ne + shift)); + const VEC nv1 = SETONE8 (*((char *) ne + shift + 1)); + h += shift; + if (PTR_DIFF (PTR_ALIGN_UP (ne, PAGE_SIZE), ne) >= VEC_SIZE + || PTR_IS_ALIGNED (ne, PAGE_SIZE) || ne_len >= VEC_SIZE) + nv = LOADU ((VEC *) ne); + else + MEMCPY (&nv, ne, MIN (VEC_SIZE, ne_len)); + const unsigned int off = PTR_DIFF (h, PTR_ALIGN_DOWN (h, VEC_SIZE)); + unsigned int off2 = (PTR_DIFF (end, (h - shift)) < VEC_SIZE) + ? VEC_SIZE - (unsigned int) (end - (h - shift)) - 1 + : 0; + h -= off; + hv0 = LOAD ((const VEC *) h); + hm0 = (MASK) CMPEQ8_MASK (hv0, nv0); + hm1 = (MASK) CMPEQ8_MASK (hv0, nv1) >> 1; + /* Clear matched bits that are out of bounds. */ + m = (((hm0 & hm1) >> off) << off2) >> off2; + while (m) + { + i = TZCNT (m); + m = BLSR (m); + hp = h + off + i - shift; + if (PTR_DIFF (PTR_ALIGN_UP (hp, PAGE_SIZE), hp) >= VEC_SIZE) + { + hv = LOADU ((VEC *) hp); + cmpm = (MASK) CMPEQ8_MASK (hv, nv) << matchsh; + if (cmpm == matchm) + if (ne_len <= VEC_SIZE + || !MEMCMPEQ (hp + VEC_SIZE, (const char *) ne + VEC_SIZE, + ne_len - VEC_SIZE)) + return (void *) hp; + } + else + { + if (!MEMCMPEQ (hp, ne, ne_len)) + return (void *) hp; + } + } + h += VEC_SIZE - 1; + for (; h - shift + VEC_SIZE <= end; h += VEC_SIZE) + { + hv0 = LOADU ((const VEC *) h); + hv1 = LOAD ((const VEC *) (h + 1)); + hm1 = (MASK) CMPEQ8_MASK (hv1, nv1); + hm0 = (MASK) CMPEQ8_MASK (hv0, nv0); + m = hm0 & hm1; + while (m) + { + match: + i = TZCNT (m); + m = BLSR (m); + hp = h + i - shift; + if (PTR_DIFF (PTR_ALIGN_UP (hp, PAGE_SIZE), hp) >= VEC_SIZE) + { + hv = LOADU ((VEC *) hp); + cmpm = (MASK) CMPEQ8_MASK (hv, nv) << matchsh; + if (cmpm == matchm) + if (ne_len <= VEC_SIZE + || !MEMCMPEQ (hp + VEC_SIZE, (const char *) ne + VEC_SIZE, + ne_len - VEC_SIZE)) + return (void *) hp; + } + else + { + if (!MEMCMPEQ (hp, ne, ne_len)) + return (void *) hp; + } + } + } + if (h - shift <= end) + { + off2 = VEC_SIZE - (unsigned int) (end - (h - shift)) - 1; + hv1 = LOAD ((const VEC *) (h + 1)); + if (PTR_DIFF (PTR_ALIGN_UP (h, PAGE_SIZE), h) >= VEC_SIZE) + { + hv0 = LOADU ((const VEC *) h); + hm1 = (MASK) CMPEQ8_MASK (hv1, nv1); + hm0 = (MASK) CMPEQ8_MASK (hv0, nv0); + } + else + { + hm1 = (MASK) CMPEQ8_MASK (hv1, nv1); + hm0 = 1 | (MASK) CMPEQ8_MASK (hv1, nv0) << 1; + } + /* Clear matched bits that are out of bounds. */ + m = ((hm0 & hm1) << off2) >> off2; + if (m) + goto match; + } + return NULL; +} diff --git a/sysdeps/x86_64/multiarch/memmem-avx2.c b/sysdeps/x86_64/multiarch/memmem-avx2.c new file mode 100644 index 0000000000..91f5d5d331 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memmem-avx2.c @@ -0,0 +1,3 @@ +#define FUNC_NAME __memmem_avx2 + +#include "memmem-avx-base.h" diff --git a/sysdeps/x86_64/multiarch/memmem-avx512.c b/sysdeps/x86_64/multiarch/memmem-avx512.c new file mode 100644 index 0000000000..163efa2133 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memmem-avx512.c @@ -0,0 +1,16 @@ +#define VEC __m512i +#define MASK uint64_t +#define LOAD(x) _mm512_load_si512 (x) +#define LOADU(x) _mm512_loadu_si512 (x) +#define STORE(dst, src) _mm512_store_si512 (dst, src) +#define STOREU(dst, src) _mm512_storeu_si512 (dst, src) +#define CMPEQ8_MASK(x, y) _mm512_cmpeq_epi8_mask (x, y) +#define SETZERO(x) _mm512_setzero_si512 (x) +#define SETONE8(x) _mm512_set1_epi8 (x) +#define TZCNT(x) _tzcnt_u64 (x) +#define BLSR(x) _blsr_u64 (x) +#define ONES ((MASK) -1) + +#define FUNC_NAME __memmem_avx512 + +#include "memmem-avx-base.h" diff --git a/sysdeps/x86_64/multiarch/memmem.c b/sysdeps/x86_64/multiarch/memmem.c new file mode 100644 index 0000000000..8fe7b77d33 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memmem.c @@ -0,0 +1,67 @@ +/* Multiple versions of memmem. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2012-2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +/* Redefine memmem so that the compiler won't complain about the type + mismatch with the IFUNC selector in strong_alias, below. */ +#undef memmem +#define memmem __redirect_memmem +#include <string.h> +#undef memmem + +#define MEMMEM __memmem_generic +#ifdef SHARED +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) \ + __hidden_ver1 (__memmem_generic, __GI_memmem, __memmem_generic); +#endif + +#include "string/memmem.c" + +extern __typeof (__redirect_memmem) __memmem_avx2 attribute_hidden; +extern __typeof (__redirect_memmem) __memmem_generic attribute_hidden; +extern __typeof (__redirect_memmem) __memmem_avx512 attribute_hidden; + +#define SYMBOL_NAME memmem + +#include "init-arch.h" + +/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle + ifunc symbol properly. */ +extern __typeof (__redirect_memmem) __libc_memmem; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features *cpu_features = __get_cpu_features (); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) + && CPU_FEATURE_USABLE_P (cpu_features, BMI1)) + return __memmem_avx512; + + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURE_USABLE_P (cpu_features, BMI1)) + return __memmem_avx2; + + return __memmem_generic; +} + +libc_ifunc_redirected (__redirect_memmem, __libc_memmem, IFUNC_SELECTOR ()); +#undef memmem +strong_alias (__libc_memmem, __memmem)