Message ID | 20231215170315.1806024-1-tirtajames45@gmail.com |
---|---|
State | New |
Headers | show |
Series | sysdeps/memmem-avx2.c: add memmem-avx2.c | expand |
On 12/15/23 12:03, James Tirta Halim wrote: > Find the rarest byte in NE. Do a naive loop until HS is aligned. Once aligned, find > the parts of HS that matches the rare byte and the byte after it, shift > back to the position of HS that should match NE and do a memcmp. Patch fails pre-commit CI -- Doesn't apply. https://patchwork.sourceware.org/project/glibc/patch/20231215170315.1806024-1-tirtajames45@gmail.com/ This looks like it depends on the up-thread patch. Please send patches as a series e.g. git format-patch HEAD~1; then use git send email. Please review the contribution checklist: https://sourceware.org/glibc/wiki/Contribution%20checklist Please review Copyright and license: https://sourceware.org/glibc/wiki/Contribution%20checklist#Copyright_and_license This patch needs either DCO or assignment. > Average timings (Core i5 8400): > __memmem_avx2 basic_memmem twoway_memmem memmem > 1342.942864 19100.87074 3335.335377 2745.971856 > > Passes make check. > > --- > sysdeps/x86_64/multiarch/memmem-avx2.c | 83 ++++++++++++++++---------- > 1 file changed, 50 insertions(+), 33 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/memmem-avx2.c b/sysdeps/x86_64/multiarch/memmem-avx2.c > index b0cced73aa..524d0fe45f 100644 > --- a/sysdeps/x86_64/multiarch/memmem-avx2.c > +++ b/sysdeps/x86_64/multiarch/memmem-avx2.c > @@ -3,53 +3,70 @@ > #include <inttypes.h> > #include <libc-pointer-arith.h> > > +static inline void * > +__find_rarest_byte (const void *ne, > + size_t n) > +{ > + static const unsigned char rarebyte_table[256] = { 0, 1, 13, 56, 59, 60, 61, 62, 63, 232, 248, 2, 158, 4, 5, 6, 7, 8, 9, 10, 14, 20, 26, 29, 37, 46, 52, 53, 54, 55, 57, 58, 255, 172, 242, 193, 162, 174, 178, 182, 218, 219, 212, 180, 249, 197, 221, 210, 253, 231, 230, 224, 225, 226, 227, 223, 222, 220, 176, 213, 184, 229, 188, 164, 159, 209, 181, 203, 189, 216, 196, 192, 185, 205, 161, 168, 215, 187, 211, 194, 195, 165, 206, 204, 214, 198, 173, 179, 175, 183, 167, 202, 239, 201, 160, 241, 163, 246, 233, 238, 240, 254, 237, 208, 234, 250, 169, 186, 236, 217, 245, 243, 228, 170, 247, 244, 251, 235, 199, 200, 252, 207, 177, 191, 171, 190, 166, 3, 140, 134, 124, 126, 86, 128, 95, 117, 114, 93, 81, 87, 132, 96, 112, 97, 103, 82, 139, 89, 98, 88, 119, 74, 156, 115, 104, 75, 120, 106, 76, 155, 90, 122, 107, 125, 152, 145, 136, 137, 101, 116, 102, 108, 99, 141, 77, 78, 118, 79, 109, 100, 150, 73, 94, 72, 121, 151, 113, 135, 110, 105, 83, 91, 11, 12, 64, 149, 146, 111, 65, 69, 66, 15, 16, 17, 18, 19, 130, 92, 144, 123, 21, 22, 23, 24, 131, 133, 127, 142, 25, 70, 129, 27, 28, 67, 153, 84, 143, 138, 147, 157, 148, 68, 71, 30, 31, 32, 33, 34, 35, 36, 154, 38, 39, 40, 41, 42, 80, 43, 44, 45, 47, 48, 85, 49, 50, 51 }; > + const unsigned char *rare = (const unsigned char *) ne; > + const unsigned char *p = (const unsigned char *) ne; > + int c_rare = rarebyte_table[*rare]; > + int c; > + for (; n--; ++p) > + { > + c = rarebyte_table[*p]; > + if (c < c_rare) { > + rare = p; > + c_rare = c; > + } > + } > + return (void *) rare; > +} > + > void * > -__memmem_avx2 (const void *hs, size_t hs_len, const void *ne, size_t ne_len) > +__memmem_avx2 (const void *hs, > + size_t hs_len, > + const void *ne, > + size_t ne_len) > { > if (ne_len == 1) > return (void *) memchr (hs, *(unsigned char *) ne, hs_len); > if (__glibc_unlikely (ne_len == 0)) > return (void *) hs; > - if (__glibc_unlikely (hs_len == ne_len)) > - return !memcmp (hs, ne, ne_len) ? (void *) hs : NULL; > if (__glibc_unlikely (hs_len < ne_len)) > return NULL; > - const __m256i nv = _mm256_set1_epi8 (*(char *) ne); > const unsigned char *h = (const unsigned char *) hs; > - const unsigned char *n = (const unsigned char *) ne; > const unsigned char *const end = h + hs_len - ne_len; > - const int c1 = *(n + 1); > - n += 2, ne_len -= 2; > - __m256i hv; > - uint32_t i, m; > - if (!PTR_IS_ALIGNED (h)) { > - hv = _mm256_loadu_si256 ((const __m256i *) h); > - m = (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (hv, nv)); > - for (; m; m = _blsr_u32 (m)) { > - i = _tzcnt_u32 (m); > - if (__glibc_unlikely (h + i > end)) > - return NULL; > - if (*(h + i + 1) == c1 && !memcmp (h + i + 2, n, ne_len)) > - return (char *) h + i; > - } > - h += sizeof (__m256i); > - if (__glibc_unlikely (h > end)) > + size_t shift = PTR_DIFF (__find_rarest_byte (ne, ne_len), ne); > + if (shift == ne_len - 1) > + --shift; > + h += shift; > + for (; !PTR_IS_ALIGNED (h, sizeof (__m256i)); ++h) > + { > + if (__glibc_unlikely (h - shift > end)) > return NULL; > - h = (const unsigned char *) PTR_ALIGN_UP (h, sizeof (__m256i)); > - } > - for (;;) { > + if (*h == *((unsigned char *) ne + shift) && !memcmp (h - shift, ne, ne_len)) > + return (void *) (h - shift); > + } > + const __m256i nv = _mm256_set1_epi8 (*((char *) ne + shift)); > + const __m256i nv1 = _mm256_set1_epi8 (*((char *) ne + shift + 1)); > + __m256i hv, hv1; > + uint32_t i, hm0, hm1, m; > + for (; h - shift <= end; h += sizeof (__m256i)) { > hv = _mm256_load_si256 ((const __m256i *) h); > - m = (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (hv, nv)); > - for (; m; m = _blsr_u32 (m)) { > + hv1 = _mm256_loadu_si256 ((const __m256i *) (h + 1)); > + hm0 = (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (hv, nv)); > + hm1 = (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (hv1, nv1)); > + m = hm0 & hm1; > + while (m) > + { > i = _tzcnt_u32 (m); > - if (__glibc_unlikely (h + i > end)) > + m = _blsr_u32 (m); > + if (__glibc_unlikely (h + i - shift > end)) > return NULL; > - if (*(h + i + 1) == c1 && !memcmp (h + i + 2, n, ne_len)) > - return (char *) h + i; > - } > - h += sizeof (__m256i); > - if (__glibc_unlikely (h > end)) > - return NULL; > + if (!memcmp (h + i - shift, ne, ne_len)) > + return (char *) h + i - shift; > + } > } > return NULL; > }
diff --git a/sysdeps/x86_64/multiarch/memmem-avx2.c b/sysdeps/x86_64/multiarch/memmem-avx2.c index b0cced73aa..524d0fe45f 100644 --- a/sysdeps/x86_64/multiarch/memmem-avx2.c +++ b/sysdeps/x86_64/multiarch/memmem-avx2.c @@ -3,53 +3,70 @@ #include <inttypes.h> #include <libc-pointer-arith.h> +static inline void * +__find_rarest_byte (const void *ne, + size_t n) +{ + static const unsigned char rarebyte_table[256] = { 0, 1, 13, 56, 59, 60, 61, 62, 63, 232, 248, 2, 158, 4, 5, 6, 7, 8, 9, 10, 14, 20, 26, 29, 37, 46, 52, 53, 54, 55, 57, 58, 255, 172, 242, 193, 162, 174, 178, 182, 218, 219, 212, 180, 249, 197, 221, 210, 253, 231, 230, 224, 225, 226, 227, 223, 222, 220, 176, 213, 184, 229, 188, 164, 159, 209, 181, 203, 189, 216, 196, 192, 185, 205, 161, 168, 215, 187, 211, 194, 195, 165, 206, 204, 214, 198, 173, 179, 175, 183, 167, 202, 239, 201, 160, 241, 163, 246, 233, 238, 240, 254, 237, 208, 234, 250, 169, 186, 236, 217, 245, 243, 228, 170, 247, 244, 251, 235, 199, 200, 252, 207, 177, 191, 171, 190, 166, 3, 140, 134, 124, 126, 86, 128, 95, 117, 114, 93, 81, 87, 132, 96, 112, 97, 103, 82, 139, 89, 98, 88, 119, 74, 156, 115, 104, 75, 120, 106, 76, 155, 90, 122, 107, 125, 152, 145, 136, 137, 101, 116, 102, 108, 99, 141, 77, 78, 118, 79, 109, 100, 150, 73, 94, 72, 121, 151, 113, 135, 110, 105, 83, 91, 11, 12, 64, 149, 146, 111, 65, 69, 66, 15, 16, 17, 18, 19, 130, 92, 144, 123, 21, 22, 23, 24, 131, 133, 127, 142, 25, 70, 129, 27, 28, 67, 153, 84, 143, 138, 147, 157, 148, 68, 71, 30, 31, 32, 33, 34, 35, 36, 154, 38, 39, 40, 41, 42, 80, 43, 44, 45, 47, 48, 85, 49, 50, 51 }; + const unsigned char *rare = (const unsigned char *) ne; + const unsigned char *p = (const unsigned char *) ne; + int c_rare = rarebyte_table[*rare]; + int c; + for (; n--; ++p) + { + c = rarebyte_table[*p]; + if (c < c_rare) { + rare = p; + c_rare = c; + } + } + return (void *) rare; +} + void * -__memmem_avx2 (const void *hs, size_t hs_len, const void *ne, size_t ne_len) +__memmem_avx2 (const void *hs, + size_t hs_len, + const void *ne, + size_t ne_len) { if (ne_len == 1) return (void *) memchr (hs, *(unsigned char *) ne, hs_len); if (__glibc_unlikely (ne_len == 0)) return (void *) hs; - if (__glibc_unlikely (hs_len == ne_len)) - return !memcmp (hs, ne, ne_len) ? (void *) hs : NULL; if (__glibc_unlikely (hs_len < ne_len)) return NULL; - const __m256i nv = _mm256_set1_epi8 (*(char *) ne); const unsigned char *h = (const unsigned char *) hs; - const unsigned char *n = (const unsigned char *) ne; const unsigned char *const end = h + hs_len - ne_len; - const int c1 = *(n + 1); - n += 2, ne_len -= 2; - __m256i hv; - uint32_t i, m; - if (!PTR_IS_ALIGNED (h)) { - hv = _mm256_loadu_si256 ((const __m256i *) h); - m = (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (hv, nv)); - for (; m; m = _blsr_u32 (m)) { - i = _tzcnt_u32 (m); - if (__glibc_unlikely (h + i > end)) - return NULL; - if (*(h + i + 1) == c1 && !memcmp (h + i + 2, n, ne_len)) - return (char *) h + i; - } - h += sizeof (__m256i); - if (__glibc_unlikely (h > end)) + size_t shift = PTR_DIFF (__find_rarest_byte (ne, ne_len), ne); + if (shift == ne_len - 1) + --shift; + h += shift; + for (; !PTR_IS_ALIGNED (h, sizeof (__m256i)); ++h) + { + if (__glibc_unlikely (h - shift > end)) return NULL; - h = (const unsigned char *) PTR_ALIGN_UP (h, sizeof (__m256i)); - } - for (;;) { + if (*h == *((unsigned char *) ne + shift) && !memcmp (h - shift, ne, ne_len)) + return (void *) (h - shift); + } + const __m256i nv = _mm256_set1_epi8 (*((char *) ne + shift)); + const __m256i nv1 = _mm256_set1_epi8 (*((char *) ne + shift + 1)); + __m256i hv, hv1; + uint32_t i, hm0, hm1, m; + for (; h - shift <= end; h += sizeof (__m256i)) { hv = _mm256_load_si256 ((const __m256i *) h); - m = (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (hv, nv)); - for (; m; m = _blsr_u32 (m)) { + hv1 = _mm256_loadu_si256 ((const __m256i *) (h + 1)); + hm0 = (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (hv, nv)); + hm1 = (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (hv1, nv1)); + m = hm0 & hm1; + while (m) + { i = _tzcnt_u32 (m); - if (__glibc_unlikely (h + i > end)) + m = _blsr_u32 (m); + if (__glibc_unlikely (h + i - shift > end)) return NULL; - if (*(h + i + 1) == c1 && !memcmp (h + i + 2, n, ne_len)) - return (char *) h + i; - } - h += sizeof (__m256i); - if (__glibc_unlikely (h > end)) - return NULL; + if (!memcmp (h + i - shift, ne, ne_len)) + return (char *) h + i - shift; + } } return NULL; }