Message ID | 20220323215734.3927131-7-goldstein.w.n@gmail.com |
---|---|
State | New |
Headers | show |
Series | [v1,01/23] benchtests: Use json-lib in bench-strchr.c | expand |
On Wed, Mar 23, 2022 at 2:59 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of > _mm_cmpistri. Also change offset to unsigned to avoid unnecessary > sign extensions. > > geometric_mean(N=20) of all benchmarks that don't fall back on > sse2/strlen; New / Original: .928 > > All string/memory tests pass. > --- > Geometric Mean N=20 runs; All functions page aligned > len, align1, align2, pos, New Time / Old Time > 0, 0, 0, 512, 1.207 > 1, 0, 0, 512, 1.039 > 1, 1, 0, 512, 0.997 > 1, 0, 1, 512, 0.981 > 1, 1, 1, 512, 0.977 > 2, 0, 0, 512, 1.02 > 2, 2, 0, 512, 0.979 > 2, 0, 2, 512, 0.902 > 2, 2, 2, 512, 0.958 > 3, 0, 0, 512, 0.978 > 3, 3, 0, 512, 0.988 > 3, 0, 3, 512, 0.979 > 3, 3, 3, 512, 0.955 > 4, 0, 0, 512, 0.969 > 4, 4, 0, 512, 0.991 > 4, 0, 4, 512, 0.94 > 4, 4, 4, 512, 0.958 > 5, 0, 0, 512, 0.963 > 5, 5, 0, 512, 1.004 > 5, 0, 5, 512, 0.948 > 5, 5, 5, 512, 0.971 > 6, 0, 0, 512, 0.933 > 6, 6, 0, 512, 1.007 > 6, 0, 6, 512, 0.921 > 6, 6, 6, 512, 0.969 > 7, 0, 0, 512, 0.928 > 7, 7, 0, 512, 0.976 > 7, 0, 7, 512, 0.932 > 7, 7, 7, 512, 0.995 > 8, 0, 0, 512, 0.931 > 8, 0, 8, 512, 0.766 > 9, 0, 0, 512, 0.965 > 9, 1, 0, 512, 0.999 > 9, 0, 9, 512, 0.765 > 9, 1, 9, 512, 0.97 > 10, 0, 0, 512, 0.976 > 10, 2, 0, 512, 0.991 > 10, 0, 10, 512, 0.768 > 10, 2, 10, 512, 0.926 > 11, 0, 0, 512, 0.958 > 11, 3, 0, 512, 1.006 > 11, 0, 11, 512, 0.768 > 11, 3, 11, 512, 0.908 > 12, 0, 0, 512, 0.945 > 12, 4, 0, 512, 0.896 > 12, 0, 12, 512, 0.764 > 12, 4, 12, 512, 0.785 > 13, 0, 0, 512, 0.957 > 13, 5, 0, 512, 1.019 > 13, 0, 13, 512, 0.76 > 13, 5, 13, 512, 0.785 > 14, 0, 0, 512, 0.918 > 14, 6, 0, 512, 1.004 > 14, 0, 14, 512, 0.78 > 14, 6, 14, 512, 0.711 > 15, 0, 0, 512, 0.855 > 15, 7, 0, 512, 0.985 > 15, 0, 15, 512, 0.779 > 15, 7, 15, 512, 0.772 > 16, 0, 0, 512, 0.987 > 16, 0, 16, 512, 0.99 > 17, 0, 0, 512, 0.996 > 17, 1, 0, 512, 0.979 > 17, 0, 17, 512, 1.001 > 17, 1, 17, 512, 1.03 > 18, 0, 0, 512, 0.976 > 18, 2, 0, 512, 0.989 > 18, 
0, 18, 512, 0.976 > 18, 2, 18, 512, 0.992 > 19, 0, 0, 512, 0.991 > 19, 3, 0, 512, 0.988 > 19, 0, 19, 512, 1.009 > 19, 3, 19, 512, 1.018 > 20, 0, 0, 512, 0.999 > 20, 4, 0, 512, 1.005 > 20, 0, 20, 512, 0.993 > 20, 4, 20, 512, 0.983 > 21, 0, 0, 512, 0.982 > 21, 5, 0, 512, 0.988 > 21, 0, 21, 512, 0.978 > 21, 5, 21, 512, 0.984 > 22, 0, 0, 512, 0.988 > 22, 6, 0, 512, 0.979 > 22, 0, 22, 512, 0.984 > 22, 6, 22, 512, 0.983 > 23, 0, 0, 512, 0.996 > 23, 7, 0, 512, 0.998 > 23, 0, 23, 512, 0.979 > 23, 7, 23, 512, 0.987 > 24, 0, 0, 512, 0.99 > 24, 0, 24, 512, 0.979 > 25, 0, 0, 512, 0.985 > 25, 1, 0, 512, 0.988 > 25, 0, 25, 512, 0.99 > 25, 1, 25, 512, 0.986 > 26, 0, 0, 512, 1.005 > 26, 2, 0, 512, 0.995 > 26, 0, 26, 512, 0.992 > 26, 2, 26, 512, 0.983 > 27, 0, 0, 512, 0.986 > 27, 3, 0, 512, 0.978 > 27, 0, 27, 512, 0.986 > 27, 3, 27, 512, 0.973 > 28, 0, 0, 512, 0.995 > 28, 4, 0, 512, 0.993 > 28, 0, 28, 512, 0.983 > 28, 4, 28, 512, 1.005 > 29, 0, 0, 512, 0.983 > 29, 5, 0, 512, 0.982 > 29, 0, 29, 512, 0.984 > 29, 5, 29, 512, 1.005 > 30, 0, 0, 512, 0.978 > 30, 6, 0, 512, 0.985 > 30, 0, 30, 512, 0.994 > 30, 6, 30, 512, 0.993 > 31, 0, 0, 512, 0.984 > 31, 7, 0, 512, 0.983 > 31, 0, 31, 512, 1.0 > 31, 7, 31, 512, 1.031 > 4, 0, 0, 32, 0.916 > 4, 1, 0, 32, 0.952 > 4, 0, 1, 32, 0.927 > 4, 1, 1, 32, 0.969 > 4, 0, 0, 64, 0.961 > 4, 2, 0, 64, 0.955 > 4, 0, 2, 64, 0.975 > 4, 2, 2, 64, 0.972 > 4, 0, 0, 128, 0.971 > 4, 3, 0, 128, 0.982 > 4, 0, 3, 128, 0.945 > 4, 3, 3, 128, 0.971 > 4, 0, 0, 256, 1.004 > 4, 4, 0, 256, 0.966 > 4, 0, 4, 256, 0.961 > 4, 4, 4, 256, 0.971 > 4, 5, 0, 512, 0.929 > 4, 0, 5, 512, 0.969 > 4, 5, 5, 512, 0.985 > 4, 0, 0, 1024, 1.003 > 4, 6, 0, 1024, 1.009 > 4, 0, 6, 1024, 1.005 > 4, 6, 6, 1024, 0.999 > 4, 0, 0, 2048, 0.917 > 4, 7, 0, 2048, 1.015 > 4, 0, 7, 2048, 1.011 > 4, 7, 7, 2048, 0.907 > 10, 1, 0, 64, 0.964 > 10, 1, 1, 64, 0.966 > 10, 2, 0, 64, 0.953 > 10, 2, 2, 64, 0.972 > 10, 3, 0, 64, 0.962 > 10, 3, 3, 64, 0.969 > 10, 4, 0, 64, 0.957 > 10, 4, 4, 64, 0.969 > 10, 5, 0, 64, 
0.961 > 10, 5, 5, 64, 0.965 > 10, 6, 0, 64, 0.949 > 10, 6, 6, 64, 0.9 > 10, 7, 0, 64, 0.957 > 10, 7, 7, 64, 0.897 > 6, 0, 0, 0, 0.991 > 6, 0, 0, 1, 1.011 > 6, 0, 1, 1, 0.939 > 6, 0, 0, 2, 1.016 > 6, 0, 2, 2, 0.94 > 6, 0, 0, 3, 1.019 > 6, 0, 3, 3, 0.941 > 6, 0, 0, 4, 1.056 > 6, 0, 4, 4, 0.884 > 6, 0, 0, 5, 0.977 > 6, 0, 5, 5, 0.934 > 6, 0, 0, 6, 0.954 > 6, 0, 6, 6, 0.93 > 6, 0, 0, 7, 0.963 > 6, 0, 7, 7, 0.916 > 6, 0, 0, 8, 0.963 > 6, 0, 8, 8, 0.945 > 6, 0, 0, 9, 1.028 > 6, 0, 9, 9, 0.942 > 6, 0, 0, 10, 0.955 > 6, 0, 10, 10, 0.831 > 6, 0, 0, 11, 0.948 > 6, 0, 11, 11, 0.82 > 6, 0, 0, 12, 1.033 > 6, 0, 12, 12, 0.873 > 6, 0, 0, 13, 0.983 > 6, 0, 13, 13, 0.852 > 6, 0, 0, 14, 0.984 > 6, 0, 14, 14, 0.853 > 6, 0, 0, 15, 0.984 > 6, 0, 15, 15, 0.882 > 6, 0, 0, 16, 0.971 > 6, 0, 16, 16, 0.958 > 6, 0, 0, 17, 0.938 > 6, 0, 17, 17, 0.947 > 6, 0, 0, 18, 0.96 > 6, 0, 18, 18, 0.938 > 6, 0, 0, 19, 0.903 > 6, 0, 19, 19, 0.943 > 6, 0, 0, 20, 0.947 > 6, 0, 20, 20, 0.951 > 6, 0, 0, 21, 0.948 > 6, 0, 21, 21, 0.96 > 6, 0, 0, 22, 0.926 > 6, 0, 22, 22, 0.951 > 6, 0, 0, 23, 0.923 > 6, 0, 23, 23, 0.959 > 6, 0, 0, 24, 0.918 > 6, 0, 24, 24, 0.952 > 6, 0, 0, 25, 0.97 > 6, 0, 25, 25, 0.952 > 6, 0, 0, 26, 0.871 > 6, 0, 26, 26, 0.869 > 6, 0, 0, 27, 0.935 > 6, 0, 27, 27, 0.836 > 6, 0, 0, 28, 0.936 > 6, 0, 28, 28, 0.857 > 6, 0, 0, 29, 0.876 > 6, 0, 29, 29, 0.859 > 6, 0, 0, 30, 0.934 > 6, 0, 30, 30, 0.857 > 6, 0, 0, 31, 0.962 > 6, 0, 31, 31, 0.86 > 6, 0, 0, 32, 0.912 > 6, 0, 32, 32, 0.94 > 6, 0, 0, 33, 0.903 > 6, 0, 33, 33, 0.968 > 6, 0, 0, 34, 0.913 > 6, 0, 34, 34, 0.896 > 6, 0, 0, 35, 0.904 > 6, 0, 35, 35, 0.913 > 6, 0, 0, 36, 0.905 > 6, 0, 36, 36, 0.907 > 6, 0, 0, 37, 0.899 > 6, 0, 37, 37, 0.9 > 6, 0, 0, 38, 0.912 > 6, 0, 38, 38, 0.919 > 6, 0, 0, 39, 0.925 > 6, 0, 39, 39, 0.927 > 6, 0, 0, 40, 0.923 > 6, 0, 40, 40, 0.972 > 6, 0, 0, 41, 0.92 > 6, 0, 41, 41, 0.966 > 6, 0, 0, 42, 0.915 > 6, 0, 42, 42, 0.834 > 6, 0, 0, 43, 0.92 > 6, 0, 43, 43, 0.856 > 6, 0, 0, 44, 0.908 > 6, 0, 44, 44, 0.858 > 6, 0, 0, 
45, 0.932 > 6, 0, 45, 45, 0.847 > 6, 0, 0, 46, 0.927 > 6, 0, 46, 46, 0.859 > 6, 0, 0, 47, 0.902 > 6, 0, 47, 47, 0.855 > 6, 0, 0, 48, 0.949 > 6, 0, 48, 48, 0.934 > 6, 0, 0, 49, 0.907 > 6, 0, 49, 49, 0.943 > 6, 0, 0, 50, 0.934 > 6, 0, 50, 50, 0.943 > 6, 0, 0, 51, 0.933 > 6, 0, 51, 51, 0.939 > 6, 0, 0, 52, 0.944 > 6, 0, 52, 52, 0.944 > 6, 0, 0, 53, 0.939 > 6, 0, 53, 53, 0.938 > 6, 0, 0, 54, 0.9 > 6, 0, 54, 54, 0.923 > 6, 0, 0, 55, 0.9 > 6, 0, 55, 55, 0.927 > 6, 0, 0, 56, 0.9 > 6, 0, 56, 56, 0.917 > 6, 0, 0, 57, 0.9 > 6, 0, 57, 57, 0.916 > 6, 0, 0, 58, 0.914 > 6, 0, 58, 58, 0.784 > 6, 0, 0, 59, 0.863 > 6, 0, 59, 59, 0.846 > 6, 0, 0, 60, 0.88 > 6, 0, 60, 60, 0.827 > 6, 0, 0, 61, 0.896 > 6, 0, 61, 61, 0.847 > 6, 0, 0, 62, 0.894 > 6, 0, 62, 62, 0.865 > 6, 0, 0, 63, 0.934 > 6, 0, 63, 63, 0.866 > > sysdeps/x86_64/multiarch/strcspn-c.c | 83 +++++++++++++--------------- > 1 file changed, 37 insertions(+), 46 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c > index 013aebf797..c312fab8b1 100644 > --- a/sysdeps/x86_64/multiarch/strcspn-c.c > +++ b/sysdeps/x86_64/multiarch/strcspn-c.c > @@ -84,83 +84,74 @@ STRCSPN_SSE42 (const char *s, const char *a) > RETURN (NULL, strlen (s)); > > const char *aligned; > - __m128i mask; > - int offset = (int) ((size_t) a & 15); > + __m128i mask, maskz, zero; > + unsigned int maskz_bits; > + unsigned int offset = (unsigned int) ((size_t) a & 15); > + zero = _mm_set1_epi8 (0); > if (offset != 0) > { > /* Load masks. */ > aligned = (const char *) ((size_t) a & -16L); > __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); > - > - mask = __m128i_shift_right (mask0, offset); > + maskz = _mm_cmpeq_epi8 (mask0, zero); > > /* Find where the NULL terminator is. */ > - int length = _mm_cmpistri (mask, mask, 0x3a); > - if (length == 16 - offset) > - { > - /* There is no NULL terminator. 
*/ > - __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); > - int index = _mm_cmpistri (mask1, mask1, 0x3a); > - length += index; > - > - /* Don't use SSE4.2 if the length of A > 16. */ > - if (length > 16) > - return STRCSPN_SSE2 (s, a); > - > - if (index != 0) > - { > - /* Combine mask0 and mask1. We could play games with > - palignr, but frankly this data should be in L1 now > - so do the merge via an unaligned load. */ > - mask = _mm_loadu_si128 ((__m128i *) a); > - } > - } > + maskz_bits = _mm_movemask_epi8 (maskz) >> offset; > + if (maskz_bits != 0) > + { > + mask = __m128i_shift_right (mask0, offset); > + offset = (unsigned int) ((size_t) s & 15); > + if (offset) > + goto start_unaligned; > + > + aligned = s; > + goto start_loop; > + } > } > - else > - { > - /* A is aligned. */ > - mask = _mm_load_si128 ((__m128i *) a); > > - /* Find where the NULL terminator is. */ > - int length = _mm_cmpistri (mask, mask, 0x3a); > - if (length == 16) > - { > - /* There is no NULL terminator. Don't use SSE4.2 if the length > - of A > 16. */ > - if (a[16] != 0) > - return STRCSPN_SSE2 (s, a); > - } > + /* A is aligned. */ > + mask = _mm_loadu_si128 ((__m128i *) a); > + /* Find where the NULL terminator is. */ > + maskz = _mm_cmpeq_epi8 (mask, zero); > + maskz_bits = _mm_movemask_epi8 (maskz); > + if (maskz_bits == 0) > + { > + /* There is no NULL terminator. Don't use SSE4.2 if the length > + of A > 16. */ > + if (a[16] != 0) > + return STRCSPN_SSE2 (s, a); > } > > - offset = (int) ((size_t) s & 15); > + aligned = s; > + offset = (unsigned int) ((size_t) s & 15); > if (offset != 0) > { > + start_unaligned: > /* Check partial string. */ > aligned = (const char *) ((size_t) s & -16L); > __m128i value = _mm_load_si128 ((__m128i *) aligned); > > value = __m128i_shift_right (value, offset); > > - int length = _mm_cmpistri (mask, value, 0x2); > + unsigned int length = _mm_cmpistri (mask, value, 0x2); > /* No need to check ZFlag since ZFlag is always 1. 
*/ > - int cflag = _mm_cmpistrc (mask, value, 0x2); > + unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); > if (cflag) > RETURN ((char *) (s + length), length); > /* Find where the NULL terminator is. */ > - int index = _mm_cmpistri (value, value, 0x3a); > + unsigned int index = _mm_cmpistri (value, value, 0x3a); > if (index < 16 - offset) > RETURN (NULL, index); > aligned += 16; > } > - else > - aligned = s; > > +start_loop: > while (1) > { > __m128i value = _mm_load_si128 ((__m128i *) aligned); > - int index = _mm_cmpistri (mask, value, 0x2); > - int cflag = _mm_cmpistrc (mask, value, 0x2); > - int zflag = _mm_cmpistrz (mask, value, 0x2); > + unsigned int index = _mm_cmpistri (mask, value, 0x2); > + unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); > + unsigned int zflag = _mm_cmpistrz (mask, value, 0x2); > if (cflag) > RETURN ((char *) (aligned + index), (size_t) (aligned + index - s)); > if (zflag) > -- > 2.25.1 > LGTM. Reviewed-by: H.J. Lu <hjl.tools@gmail.com> Thanks.
On Thu, Mar 24, 2022 at 11:57 AM H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> wrote: > > On Wed, Mar 23, 2022 at 2:59 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of > > _mm_cmpistri. Also change offset to unsigned to avoid unnecessary > > sign extensions. > > > > geometric_mean(N=20) of all benchmarks that dont fallback on > > sse2/strlen; New / Original: .928 > > > > All string/memory tests pass. > > --- > > Geomtric Mean N=20 runs; All functions page aligned > > len, align1, align2, pos, New Time / Old Time > > 0, 0, 0, 512, 1.207 > > 1, 0, 0, 512, 1.039 > > 1, 1, 0, 512, 0.997 > > 1, 0, 1, 512, 0.981 > > 1, 1, 1, 512, 0.977 > > 2, 0, 0, 512, 1.02 > > 2, 2, 0, 512, 0.979 > > 2, 0, 2, 512, 0.902 > > 2, 2, 2, 512, 0.958 > > 3, 0, 0, 512, 0.978 > > 3, 3, 0, 512, 0.988 > > 3, 0, 3, 512, 0.979 > > 3, 3, 3, 512, 0.955 > > 4, 0, 0, 512, 0.969 > > 4, 4, 0, 512, 0.991 > > 4, 0, 4, 512, 0.94 > > 4, 4, 4, 512, 0.958 > > 5, 0, 0, 512, 0.963 > > 5, 5, 0, 512, 1.004 > > 5, 0, 5, 512, 0.948 > > 5, 5, 5, 512, 0.971 > > 6, 0, 0, 512, 0.933 > > 6, 6, 0, 512, 1.007 > > 6, 0, 6, 512, 0.921 > > 6, 6, 6, 512, 0.969 > > 7, 0, 0, 512, 0.928 > > 7, 7, 0, 512, 0.976 > > 7, 0, 7, 512, 0.932 > > 7, 7, 7, 512, 0.995 > > 8, 0, 0, 512, 0.931 > > 8, 0, 8, 512, 0.766 > > 9, 0, 0, 512, 0.965 > > 9, 1, 0, 512, 0.999 > > 9, 0, 9, 512, 0.765 > > 9, 1, 9, 512, 0.97 > > 10, 0, 0, 512, 0.976 > > 10, 2, 0, 512, 0.991 > > 10, 0, 10, 512, 0.768 > > 10, 2, 10, 512, 0.926 > > 11, 0, 0, 512, 0.958 > > 11, 3, 0, 512, 1.006 > > 11, 0, 11, 512, 0.768 > > 11, 3, 11, 512, 0.908 > > 12, 0, 0, 512, 0.945 > > 12, 4, 0, 512, 0.896 > > 12, 0, 12, 512, 0.764 > > 12, 4, 12, 512, 0.785 > > 13, 0, 0, 512, 0.957 > > 13, 5, 0, 512, 1.019 > > 13, 0, 13, 512, 0.76 > > 13, 5, 13, 512, 0.785 > > 14, 0, 0, 512, 0.918 > > 14, 6, 0, 512, 1.004 > > 14, 0, 14, 512, 0.78 > > 14, 6, 14, 512, 0.711 > > 15, 0, 0, 512, 0.855 > > 15, 7, 0, 512, 0.985 > > 
15, 0, 15, 512, 0.779 > > 15, 7, 15, 512, 0.772 > > 16, 0, 0, 512, 0.987 > > 16, 0, 16, 512, 0.99 > > 17, 0, 0, 512, 0.996 > > 17, 1, 0, 512, 0.979 > > 17, 0, 17, 512, 1.001 > > 17, 1, 17, 512, 1.03 > > 18, 0, 0, 512, 0.976 > > 18, 2, 0, 512, 0.989 > > 18, 0, 18, 512, 0.976 > > 18, 2, 18, 512, 0.992 > > 19, 0, 0, 512, 0.991 > > 19, 3, 0, 512, 0.988 > > 19, 0, 19, 512, 1.009 > > 19, 3, 19, 512, 1.018 > > 20, 0, 0, 512, 0.999 > > 20, 4, 0, 512, 1.005 > > 20, 0, 20, 512, 0.993 > > 20, 4, 20, 512, 0.983 > > 21, 0, 0, 512, 0.982 > > 21, 5, 0, 512, 0.988 > > 21, 0, 21, 512, 0.978 > > 21, 5, 21, 512, 0.984 > > 22, 0, 0, 512, 0.988 > > 22, 6, 0, 512, 0.979 > > 22, 0, 22, 512, 0.984 > > 22, 6, 22, 512, 0.983 > > 23, 0, 0, 512, 0.996 > > 23, 7, 0, 512, 0.998 > > 23, 0, 23, 512, 0.979 > > 23, 7, 23, 512, 0.987 > > 24, 0, 0, 512, 0.99 > > 24, 0, 24, 512, 0.979 > > 25, 0, 0, 512, 0.985 > > 25, 1, 0, 512, 0.988 > > 25, 0, 25, 512, 0.99 > > 25, 1, 25, 512, 0.986 > > 26, 0, 0, 512, 1.005 > > 26, 2, 0, 512, 0.995 > > 26, 0, 26, 512, 0.992 > > 26, 2, 26, 512, 0.983 > > 27, 0, 0, 512, 0.986 > > 27, 3, 0, 512, 0.978 > > 27, 0, 27, 512, 0.986 > > 27, 3, 27, 512, 0.973 > > 28, 0, 0, 512, 0.995 > > 28, 4, 0, 512, 0.993 > > 28, 0, 28, 512, 0.983 > > 28, 4, 28, 512, 1.005 > > 29, 0, 0, 512, 0.983 > > 29, 5, 0, 512, 0.982 > > 29, 0, 29, 512, 0.984 > > 29, 5, 29, 512, 1.005 > > 30, 0, 0, 512, 0.978 > > 30, 6, 0, 512, 0.985 > > 30, 0, 30, 512, 0.994 > > 30, 6, 30, 512, 0.993 > > 31, 0, 0, 512, 0.984 > > 31, 7, 0, 512, 0.983 > > 31, 0, 31, 512, 1.0 > > 31, 7, 31, 512, 1.031 > > 4, 0, 0, 32, 0.916 > > 4, 1, 0, 32, 0.952 > > 4, 0, 1, 32, 0.927 > > 4, 1, 1, 32, 0.969 > > 4, 0, 0, 64, 0.961 > > 4, 2, 0, 64, 0.955 > > 4, 0, 2, 64, 0.975 > > 4, 2, 2, 64, 0.972 > > 4, 0, 0, 128, 0.971 > > 4, 3, 0, 128, 0.982 > > 4, 0, 3, 128, 0.945 > > 4, 3, 3, 128, 0.971 > > 4, 0, 0, 256, 1.004 > > 4, 4, 0, 256, 0.966 > > 4, 0, 4, 256, 0.961 > > 4, 4, 4, 256, 0.971 > > 4, 5, 0, 512, 0.929 > > 4, 0, 5, 512, 0.969 > > 
4, 5, 5, 512, 0.985 > > 4, 0, 0, 1024, 1.003 > > 4, 6, 0, 1024, 1.009 > > 4, 0, 6, 1024, 1.005 > > 4, 6, 6, 1024, 0.999 > > 4, 0, 0, 2048, 0.917 > > 4, 7, 0, 2048, 1.015 > > 4, 0, 7, 2048, 1.011 > > 4, 7, 7, 2048, 0.907 > > 10, 1, 0, 64, 0.964 > > 10, 1, 1, 64, 0.966 > > 10, 2, 0, 64, 0.953 > > 10, 2, 2, 64, 0.972 > > 10, 3, 0, 64, 0.962 > > 10, 3, 3, 64, 0.969 > > 10, 4, 0, 64, 0.957 > > 10, 4, 4, 64, 0.969 > > 10, 5, 0, 64, 0.961 > > 10, 5, 5, 64, 0.965 > > 10, 6, 0, 64, 0.949 > > 10, 6, 6, 64, 0.9 > > 10, 7, 0, 64, 0.957 > > 10, 7, 7, 64, 0.897 > > 6, 0, 0, 0, 0.991 > > 6, 0, 0, 1, 1.011 > > 6, 0, 1, 1, 0.939 > > 6, 0, 0, 2, 1.016 > > 6, 0, 2, 2, 0.94 > > 6, 0, 0, 3, 1.019 > > 6, 0, 3, 3, 0.941 > > 6, 0, 0, 4, 1.056 > > 6, 0, 4, 4, 0.884 > > 6, 0, 0, 5, 0.977 > > 6, 0, 5, 5, 0.934 > > 6, 0, 0, 6, 0.954 > > 6, 0, 6, 6, 0.93 > > 6, 0, 0, 7, 0.963 > > 6, 0, 7, 7, 0.916 > > 6, 0, 0, 8, 0.963 > > 6, 0, 8, 8, 0.945 > > 6, 0, 0, 9, 1.028 > > 6, 0, 9, 9, 0.942 > > 6, 0, 0, 10, 0.955 > > 6, 0, 10, 10, 0.831 > > 6, 0, 0, 11, 0.948 > > 6, 0, 11, 11, 0.82 > > 6, 0, 0, 12, 1.033 > > 6, 0, 12, 12, 0.873 > > 6, 0, 0, 13, 0.983 > > 6, 0, 13, 13, 0.852 > > 6, 0, 0, 14, 0.984 > > 6, 0, 14, 14, 0.853 > > 6, 0, 0, 15, 0.984 > > 6, 0, 15, 15, 0.882 > > 6, 0, 0, 16, 0.971 > > 6, 0, 16, 16, 0.958 > > 6, 0, 0, 17, 0.938 > > 6, 0, 17, 17, 0.947 > > 6, 0, 0, 18, 0.96 > > 6, 0, 18, 18, 0.938 > > 6, 0, 0, 19, 0.903 > > 6, 0, 19, 19, 0.943 > > 6, 0, 0, 20, 0.947 > > 6, 0, 20, 20, 0.951 > > 6, 0, 0, 21, 0.948 > > 6, 0, 21, 21, 0.96 > > 6, 0, 0, 22, 0.926 > > 6, 0, 22, 22, 0.951 > > 6, 0, 0, 23, 0.923 > > 6, 0, 23, 23, 0.959 > > 6, 0, 0, 24, 0.918 > > 6, 0, 24, 24, 0.952 > > 6, 0, 0, 25, 0.97 > > 6, 0, 25, 25, 0.952 > > 6, 0, 0, 26, 0.871 > > 6, 0, 26, 26, 0.869 > > 6, 0, 0, 27, 0.935 > > 6, 0, 27, 27, 0.836 > > 6, 0, 0, 28, 0.936 > > 6, 0, 28, 28, 0.857 > > 6, 0, 0, 29, 0.876 > > 6, 0, 29, 29, 0.859 > > 6, 0, 0, 30, 0.934 > > 6, 0, 30, 30, 0.857 > > 6, 0, 0, 31, 0.962 > > 6, 0, 31, 31, 0.86 
> > 6, 0, 0, 32, 0.912 > > 6, 0, 32, 32, 0.94 > > 6, 0, 0, 33, 0.903 > > 6, 0, 33, 33, 0.968 > > 6, 0, 0, 34, 0.913 > > 6, 0, 34, 34, 0.896 > > 6, 0, 0, 35, 0.904 > > 6, 0, 35, 35, 0.913 > > 6, 0, 0, 36, 0.905 > > 6, 0, 36, 36, 0.907 > > 6, 0, 0, 37, 0.899 > > 6, 0, 37, 37, 0.9 > > 6, 0, 0, 38, 0.912 > > 6, 0, 38, 38, 0.919 > > 6, 0, 0, 39, 0.925 > > 6, 0, 39, 39, 0.927 > > 6, 0, 0, 40, 0.923 > > 6, 0, 40, 40, 0.972 > > 6, 0, 0, 41, 0.92 > > 6, 0, 41, 41, 0.966 > > 6, 0, 0, 42, 0.915 > > 6, 0, 42, 42, 0.834 > > 6, 0, 0, 43, 0.92 > > 6, 0, 43, 43, 0.856 > > 6, 0, 0, 44, 0.908 > > 6, 0, 44, 44, 0.858 > > 6, 0, 0, 45, 0.932 > > 6, 0, 45, 45, 0.847 > > 6, 0, 0, 46, 0.927 > > 6, 0, 46, 46, 0.859 > > 6, 0, 0, 47, 0.902 > > 6, 0, 47, 47, 0.855 > > 6, 0, 0, 48, 0.949 > > 6, 0, 48, 48, 0.934 > > 6, 0, 0, 49, 0.907 > > 6, 0, 49, 49, 0.943 > > 6, 0, 0, 50, 0.934 > > 6, 0, 50, 50, 0.943 > > 6, 0, 0, 51, 0.933 > > 6, 0, 51, 51, 0.939 > > 6, 0, 0, 52, 0.944 > > 6, 0, 52, 52, 0.944 > > 6, 0, 0, 53, 0.939 > > 6, 0, 53, 53, 0.938 > > 6, 0, 0, 54, 0.9 > > 6, 0, 54, 54, 0.923 > > 6, 0, 0, 55, 0.9 > > 6, 0, 55, 55, 0.927 > > 6, 0, 0, 56, 0.9 > > 6, 0, 56, 56, 0.917 > > 6, 0, 0, 57, 0.9 > > 6, 0, 57, 57, 0.916 > > 6, 0, 0, 58, 0.914 > > 6, 0, 58, 58, 0.784 > > 6, 0, 0, 59, 0.863 > > 6, 0, 59, 59, 0.846 > > 6, 0, 0, 60, 0.88 > > 6, 0, 60, 60, 0.827 > > 6, 0, 0, 61, 0.896 > > 6, 0, 61, 61, 0.847 > > 6, 0, 0, 62, 0.894 > > 6, 0, 62, 62, 0.865 > > 6, 0, 0, 63, 0.934 > > 6, 0, 63, 63, 0.866 > > > > sysdeps/x86_64/multiarch/strcspn-c.c | 83 +++++++++++++--------------- > > 1 file changed, 37 insertions(+), 46 deletions(-) > > > > diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c > > index 013aebf797..c312fab8b1 100644 > > --- a/sysdeps/x86_64/multiarch/strcspn-c.c > > +++ b/sysdeps/x86_64/multiarch/strcspn-c.c > > @@ -84,83 +84,74 @@ STRCSPN_SSE42 (const char *s, const char *a) > > RETURN (NULL, strlen (s)); > > > > const char *aligned; > > - __m128i 
mask; > > - int offset = (int) ((size_t) a & 15); > > + __m128i mask, maskz, zero; > > + unsigned int maskz_bits; > > + unsigned int offset = (unsigned int) ((size_t) a & 15); > > + zero = _mm_set1_epi8 (0); > > if (offset != 0) > > { > > /* Load masks. */ > > aligned = (const char *) ((size_t) a & -16L); > > __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); > > - > > - mask = __m128i_shift_right (mask0, offset); > > + maskz = _mm_cmpeq_epi8 (mask0, zero); > > > > /* Find where the NULL terminator is. */ > > - int length = _mm_cmpistri (mask, mask, 0x3a); > > - if (length == 16 - offset) > > - { > > - /* There is no NULL terminator. */ > > - __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); > > - int index = _mm_cmpistri (mask1, mask1, 0x3a); > > - length += index; > > - > > - /* Don't use SSE4.2 if the length of A > 16. */ > > - if (length > 16) > > - return STRCSPN_SSE2 (s, a); > > - > > - if (index != 0) > > - { > > - /* Combine mask0 and mask1. We could play games with > > - palignr, but frankly this data should be in L1 now > > - so do the merge via an unaligned load. */ > > - mask = _mm_loadu_si128 ((__m128i *) a); > > - } > > - } > > + maskz_bits = _mm_movemask_epi8 (maskz) >> offset; > > + if (maskz_bits != 0) > > + { > > + mask = __m128i_shift_right (mask0, offset); > > + offset = (unsigned int) ((size_t) s & 15); > > + if (offset) > > + goto start_unaligned; > > + > > + aligned = s; > > + goto start_loop; > > + } > > } > > - else > > - { > > - /* A is aligned. */ > > - mask = _mm_load_si128 ((__m128i *) a); > > > > - /* Find where the NULL terminator is. */ > > - int length = _mm_cmpistri (mask, mask, 0x3a); > > - if (length == 16) > > - { > > - /* There is no NULL terminator. Don't use SSE4.2 if the length > > - of A > 16. */ > > - if (a[16] != 0) > > - return STRCSPN_SSE2 (s, a); > > - } > > + /* A is aligned. */ > > + mask = _mm_loadu_si128 ((__m128i *) a); > > + /* Find where the NULL terminator is. 
*/ > > + maskz = _mm_cmpeq_epi8 (mask, zero); > > + maskz_bits = _mm_movemask_epi8 (maskz); > > + if (maskz_bits == 0) > > + { > > + /* There is no NULL terminator. Don't use SSE4.2 if the length > > + of A > 16. */ > > + if (a[16] != 0) > > + return STRCSPN_SSE2 (s, a); > > } > > > > - offset = (int) ((size_t) s & 15); > > + aligned = s; > > + offset = (unsigned int) ((size_t) s & 15); > > if (offset != 0) > > { > > + start_unaligned: > > /* Check partial string. */ > > aligned = (const char *) ((size_t) s & -16L); > > __m128i value = _mm_load_si128 ((__m128i *) aligned); > > > > value = __m128i_shift_right (value, offset); > > > > - int length = _mm_cmpistri (mask, value, 0x2); > > + unsigned int length = _mm_cmpistri (mask, value, 0x2); > > /* No need to check ZFlag since ZFlag is always 1. */ > > - int cflag = _mm_cmpistrc (mask, value, 0x2); > > + unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); > > if (cflag) > > RETURN ((char *) (s + length), length); > > /* Find where the NULL terminator is. */ > > - int index = _mm_cmpistri (value, value, 0x3a); > > + unsigned int index = _mm_cmpistri (value, value, 0x3a); > > if (index < 16 - offset) > > RETURN (NULL, index); > > aligned += 16; > > } > > - else > > - aligned = s; > > > > +start_loop: > > while (1) > > { > > __m128i value = _mm_load_si128 ((__m128i *) aligned); > > - int index = _mm_cmpistri (mask, value, 0x2); > > - int cflag = _mm_cmpistrc (mask, value, 0x2); > > - int zflag = _mm_cmpistrz (mask, value, 0x2); > > + unsigned int index = _mm_cmpistri (mask, value, 0x2); > > + unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); > > + unsigned int zflag = _mm_cmpistrz (mask, value, 0x2); > > if (cflag) > > RETURN ((char *) (aligned + index), (size_t) (aligned + index - s)); > > if (zflag) > > -- > > 2.25.1 > > > > LGTM. > > Reviewed-by: H.J. Lu <hjl.tools@gmail.com> > > Thanks. > > -- > H.J. I would like to backport this patch to release branches. Any comments or objections? --Sunil
diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c index 013aebf797..c312fab8b1 100644 --- a/sysdeps/x86_64/multiarch/strcspn-c.c +++ b/sysdeps/x86_64/multiarch/strcspn-c.c @@ -84,83 +84,74 @@ STRCSPN_SSE42 (const char *s, const char *a) RETURN (NULL, strlen (s)); const char *aligned; - __m128i mask; - int offset = (int) ((size_t) a & 15); + __m128i mask, maskz, zero; + unsigned int maskz_bits; + unsigned int offset = (unsigned int) ((size_t) a & 15); + zero = _mm_set1_epi8 (0); if (offset != 0) { /* Load masks. */ aligned = (const char *) ((size_t) a & -16L); __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); - - mask = __m128i_shift_right (mask0, offset); + maskz = _mm_cmpeq_epi8 (mask0, zero); /* Find where the NULL terminator is. */ - int length = _mm_cmpistri (mask, mask, 0x3a); - if (length == 16 - offset) - { - /* There is no NULL terminator. */ - __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); - int index = _mm_cmpistri (mask1, mask1, 0x3a); - length += index; - - /* Don't use SSE4.2 if the length of A > 16. */ - if (length > 16) - return STRCSPN_SSE2 (s, a); - - if (index != 0) - { - /* Combine mask0 and mask1. We could play games with - palignr, but frankly this data should be in L1 now - so do the merge via an unaligned load. */ - mask = _mm_loadu_si128 ((__m128i *) a); - } - } + maskz_bits = _mm_movemask_epi8 (maskz) >> offset; + if (maskz_bits != 0) + { + mask = __m128i_shift_right (mask0, offset); + offset = (unsigned int) ((size_t) s & 15); + if (offset) + goto start_unaligned; + + aligned = s; + goto start_loop; + } } - else - { - /* A is aligned. */ - mask = _mm_load_si128 ((__m128i *) a); - /* Find where the NULL terminator is. */ - int length = _mm_cmpistri (mask, mask, 0x3a); - if (length == 16) - { - /* There is no NULL terminator. Don't use SSE4.2 if the length - of A > 16. */ - if (a[16] != 0) - return STRCSPN_SSE2 (s, a); - } + /* A is aligned. 
*/ + mask = _mm_loadu_si128 ((__m128i *) a); + /* Find where the NULL terminator is. */ + maskz = _mm_cmpeq_epi8 (mask, zero); + maskz_bits = _mm_movemask_epi8 (maskz); + if (maskz_bits == 0) + { + /* There is no NULL terminator. Don't use SSE4.2 if the length + of A > 16. */ + if (a[16] != 0) + return STRCSPN_SSE2 (s, a); } - offset = (int) ((size_t) s & 15); + aligned = s; + offset = (unsigned int) ((size_t) s & 15); if (offset != 0) { + start_unaligned: /* Check partial string. */ aligned = (const char *) ((size_t) s & -16L); __m128i value = _mm_load_si128 ((__m128i *) aligned); value = __m128i_shift_right (value, offset); - int length = _mm_cmpistri (mask, value, 0x2); + unsigned int length = _mm_cmpistri (mask, value, 0x2); /* No need to check ZFlag since ZFlag is always 1. */ - int cflag = _mm_cmpistrc (mask, value, 0x2); + unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); if (cflag) RETURN ((char *) (s + length), length); /* Find where the NULL terminator is. */ - int index = _mm_cmpistri (value, value, 0x3a); + unsigned int index = _mm_cmpistri (value, value, 0x3a); if (index < 16 - offset) RETURN (NULL, index); aligned += 16; } - else - aligned = s; +start_loop: while (1) { __m128i value = _mm_load_si128 ((__m128i *) aligned); - int index = _mm_cmpistri (mask, value, 0x2); - int cflag = _mm_cmpistrc (mask, value, 0x2); - int zflag = _mm_cmpistrz (mask, value, 0x2); + unsigned int index = _mm_cmpistri (mask, value, 0x2); + unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); + unsigned int zflag = _mm_cmpistrz (mask, value, 0x2); if (cflag) RETURN ((char *) (aligned + index), (size_t) (aligned + index - s)); if (zflag)