Message ID | 20150609142748.GA3982@domone |
---|---|
State | New |
Headers | show |
On Tue, Jun 09, 2015 at 04:27:48PM +0200, Ondřej Bílka wrote: > On Wed, May 13, 2015 at 09:48:27AM -0700, Paul Eggert wrote: > > Ondřej Bílka wrote: > > >How to synchronize this with gnulib? Only implementation specific detail > > >is utf8 detection. > > > > It could be something like this: > > > > #if _LIBC > > struct __locale_data *current = _NL_CURRENT_LOCALE->__locales[LC_COLLATE]; > > uint_fast32_t encoding = > > current->values[_NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE)].word; > > bool is_utf8 = encoding == !__cet_other; > > #else > > bool is_utf8 = STRCASEEQ (locale_charset (), > > "UTF-8", 'U','T','F','-','8',0,0,0,0) > > #endif > > > > We should package this sort of thing up and make it easier to use, > > but that could be another day. > > Yes I will use that pattern, it needs to change details like that it > also works for single-byte encodings. > > I also removed expect on MB_CUR_MAX as unicode is widespread. > > Also I now return directly match when entire pattern is normal and > FNM_PERIOD or FNM_FILE_NAME wasn't set which could also help performance > a bit. > > Then I could allow nonascii characters to start pattern unless its utf8 > and you have FNM_CASEFOLD, would it be better to add two tables or check > for testing these? > > * posix/fnmatch.c (fnmatch): Improve performance. > > diff --git a/posix/fnmatch.c b/posix/fnmatch.c > index fd85efa..4c32992 100644 > --- a/posix/fnmatch.c > +++ b/posix/fnmatch.c > @@ -131,6 +131,13 @@ extern int fnmatch (const char *pattern, const char *string, int flags); > # define ISWCTYPE(WC, WT) iswctype (WC, WT) > # endif > > +# ifdef _LIBC > +# define STRCASESTR __strcasestr > +# else > +# define STRCASESTR strcasestr > +# endif > + > + > # if (HAVE_MBSTATE_T && HAVE_MBSRTOWCS) || _LIBC > /* In this case we are implementing the multibyte character handling. */ > # define HANDLE_MULTIBYTE 1 > @@ -332,8 +339,62 @@ fnmatch (pattern, string, flags) > const char *string; > int flags; > { > + > + /* ASCII with \+/.*?[{(@! excluded. */ > + static unsigned char normal[256] = { > + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, > + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, > + 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, > + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, > + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, > + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, > + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, > + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 > + }; > +# if _LIBC > + struct __locale_data *current = _NL_CURRENT_LOCALE->__locales[LC_COLLATE]; > + uint_fast32_t encoding = > + current->values[_NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE)].word; > + bool fast_encoding = (encoding != __cet_other); > +# else > +# if HANDLE_MULTIBYTE > + bool is_utf8 = STRCASEEQ (locale_charset (), > + "UTF-8", 'U','T','F','-','8',0,0,0,0); > + bool fast_encoding = (MB_CUR_MAX == 1) || is_utf; > +# else > + bool fast_encoding = true; > +# endif > +# endif > + > + if (fast_encoding) > + { > + char start[8]; > + char *string2; > + size_t i; > + for (i = 0; i < 7 && normal[(unsigned char) pattern[i]]; i++) > + start[i] = pattern[i]; > + start[i] = 0; > + if (flags & FNM_CASEFOLD) > + string2 = STRCASESTR (string, start); > + else > + string2 = strstr (string, start); > + if (!string2) > + return FNM_NOMATCH; > + > + if (pattern[i] == '\0' && (flags & (FNM_FILE_NAME | FNM_PERIOD)) == 0) > + return 0; > + } > + > # if HANDLE_MULTIBYTE > - if (__builtin_expect (MB_CUR_MAX, 1) != 1) > + if (MB_CUR_MAX != 1) > { > mbstate_t ps; > size_t n;
ping On Tue, Jun 16, 2015 at 07:30:17PM +0200, Ondřej Bílka wrote: > On Tue, Jun 09, 2015 at 04:27:48PM +0200, Ondřej Bílka wrote: > > On Wed, May 13, 2015 at 09:48:27AM -0700, Paul Eggert wrote: > > > Ondřej Bílka wrote: > > > >How to synchronize this with gnulib? Only implementation specific detail > > > >is utf8 detection. > > > > > > It could be something like this: > > > > > > #if _LIBC > > > struct __locale_data *current = _NL_CURRENT_LOCALE->__locales[LC_COLLATE]; > > > uint_fast32_t encoding = > > > current->values[_NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE)].word; > > > bool is_utf8 = encoding == !__cet_other; > > > #else > > > bool is_utf8 = STRCASEEQ (locale_charset (), > > > "UTF-8", 'U','T','F','-','8',0,0,0,0) > > > #endif > > > > > > We should package this sort of thing up and make it easier to use, > > > but that could be another day. > > > > Yes I will use that pattern, it needs to change details like that it > > also works for single-byte encodings. > > > > I also removed expect on MB_CUR_MAX as unicode is widespread. > > > > Also I now return directly match when entire pattern is normal and > > FNM_PERIOD or FNM_FILE_NAME wasn't set which could also help performance > > a bit. > > > > Then I could allow nonascii characters to start pattern unless its utf8 > > and you have FNM_CASEFOLD, would it be better to add two tables or check > > for testing these? > > > > * posix/fnmatch.c (fnmatch): Improve performance. > > > > diff --git a/posix/fnmatch.c b/posix/fnmatch.c > > index fd85efa..4c32992 100644 > > --- a/posix/fnmatch.c > > +++ b/posix/fnmatch.c > > @@ -131,6 +131,13 @@ extern int fnmatch (const char *pattern, const char *string, int flags); > > # define ISWCTYPE(WC, WT) iswctype (WC, WT) > > # endif > > > > +# ifdef _LIBC > > +# define STRCASESTR __strcasestr > > +# else > > +# define STRCASESTR strcasestr > > +# endif > > + > > + > > # if (HAVE_MBSTATE_T && HAVE_MBSRTOWCS) || _LIBC > > /* In this case we are implementing the multibyte character handling. */ > > # define HANDLE_MULTIBYTE 1 > > @@ -332,8 +339,62 @@ fnmatch (pattern, string, flags) > > const char *string; > > int flags; > > { > > + > > + /* ASCII with \+/.*?[{(@! excluded. */ > > + static unsigned char normal[256] = { > > + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, > > + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, > > + 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, > > + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, > > + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, > > + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, > > + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, > > + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, > > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 > > + }; > > +# if _LIBC > > + struct __locale_data *current = _NL_CURRENT_LOCALE->__locales[LC_COLLATE]; > > + uint_fast32_t encoding = > > + current->values[_NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE)].word; > > + bool fast_encoding = (encoding != __cet_other); > > +# else > > +# if HANDLE_MULTIBYTE > > + bool is_utf8 = STRCASEEQ (locale_charset (), > > + "UTF-8", 'U','T','F','-','8',0,0,0,0); > > + bool fast_encoding = (MB_CUR_MAX == 1) || is_utf; > > +# else > > + bool fast_encoding = true; > > +# endif > > +# endif > > + > > + if (fast_encoding) > > + { > > + char start[8]; > > + char *string2; > > + size_t i; > > + for (i = 0; i < 7 && normal[(unsigned char) pattern[i]]; i++) > > + start[i] = pattern[i]; > > + start[i] = 0; > > + if (flags & FNM_CASEFOLD) > > + string2 = STRCASESTR (string, start); > > + else > > + string2 = strstr (string, start); > > + if (!string2) > > + return FNM_NOMATCH; > > + > > + if (pattern[i] == '\0' && (flags & (FNM_FILE_NAME | FNM_PERIOD)) == 0) > > + return 0; > > + } > > + > > # if HANDLE_MULTIBYTE > > - if (__builtin_expect (MB_CUR_MAX, 1) != 1) > > + if (MB_CUR_MAX != 1) > > { > > mbstate_t ps; > > size_t n; >
ping On Wed, Aug 12, 2015 at 07:41:20AM +0200, Ondřej Bílka wrote: > ping > On Tue, Jun 16, 2015 at 07:30:17PM +0200, Ondřej Bílka wrote: > > On Tue, Jun 09, 2015 at 04:27:48PM +0200, Ondřej Bílka wrote: > > > On Wed, May 13, 2015 at 09:48:27AM -0700, Paul Eggert wrote: > > > > Ondřej Bílka wrote: > > > > >How to synchronize this with gnulib? Only implementation specific detail > > > > >is utf8 detection. > > > > > > > > It could be something like this: > > > > > > > > #if _LIBC > > > > struct __locale_data *current = _NL_CURRENT_LOCALE->__locales[LC_COLLATE]; > > > > uint_fast32_t encoding = > > > > current->values[_NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE)].word; > > > > bool is_utf8 = encoding == !__cet_other; > > > > #else > > > > bool is_utf8 = STRCASEEQ (locale_charset (), > > > > "UTF-8", 'U','T','F','-','8',0,0,0,0) > > > > #endif > > > > > > > > We should package this sort of thing up and make it easier to use, > > > > but that could be another day. > > > > > > Yes I will use that pattern, it needs to change details like that it > > > also works for single-byte encodings. > > > > > > I also removed expect on MB_CUR_MAX as unicode is widespread. > > > > > > Also I now return directly match when entire pattern is normal and > > > FNM_PERIOD or FNM_FILE_NAME wasn't set which could also help performance > > > a bit. > > > > > > Then I could allow nonascii characters to start pattern unless its utf8 > > > and you have FNM_CASEFOLD, would it be better to add two tables or check > > > for testing these? > > > > > > * posix/fnmatch.c (fnmatch): Improve performance. > > > > > > diff --git a/posix/fnmatch.c b/posix/fnmatch.c > > > index fd85efa..4c32992 100644 > > > --- a/posix/fnmatch.c > > > +++ b/posix/fnmatch.c > > > @@ -131,6 +131,13 @@ extern int fnmatch (const char *pattern, const char *string, int flags); > > > # define ISWCTYPE(WC, WT) iswctype (WC, WT) > > > # endif > > > > > > +# ifdef _LIBC > > > +# define STRCASESTR __strcasestr > > > +# else > > > +# define STRCASESTR strcasestr > > > +# endif > > > + > > > + > > > # if (HAVE_MBSTATE_T && HAVE_MBSRTOWCS) || _LIBC > > > /* In this case we are implementing the multibyte character handling. */ > > > # define HANDLE_MULTIBYTE 1 > > > @@ -332,8 +339,62 @@ fnmatch (pattern, string, flags) > > > const char *string; > > > int flags; > > > { > > > + > > > + /* ASCII with \+/.*?[{(@! excluded. */ > > > + static unsigned char normal[256] = { > > > + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, > > > + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, > > > + 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, > > > + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, > > > + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, > > > + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, > > > + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, > > > + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, > > > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > > > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > > > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > > > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > > > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > > > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > > > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, > > > + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 > > > + }; > > > +# if _LIBC > > > + struct __locale_data *current = _NL_CURRENT_LOCALE->__locales[LC_COLLATE]; > > > + uint_fast32_t encoding = > > > + current->values[_NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE)].word; > > > + bool fast_encoding = (encoding != __cet_other); > > > +# else > > > +# if HANDLE_MULTIBYTE > > > + bool is_utf8 = STRCASEEQ (locale_charset (), > > > + "UTF-8", 'U','T','F','-','8',0,0,0,0); > > > + bool fast_encoding = (MB_CUR_MAX == 1) || is_utf; > > > +# else > > > + bool fast_encoding = true; > > > +# endif > > > +# endif > > > + > > > + if (fast_encoding) > > > + { > > > + char start[8]; > > > + char *string2; > > > + size_t i; > > > + for (i = 0; i < 7 && normal[(unsigned char) pattern[i]]; i++) > > > + start[i] = pattern[i]; > > > + start[i] = 0; > > > + if (flags & FNM_CASEFOLD) > > > + string2 = STRCASESTR (string, start); > > > + else > > > + string2 = strstr (string, start); > > > + if (!string2) > > > + return FNM_NOMATCH; > > > + > > > + if (pattern[i] == '\0' && (flags & (FNM_FILE_NAME | FNM_PERIOD)) == 0) > > > + return 0; > > > + } > > > + > > > # if HANDLE_MULTIBYTE > > > - if (__builtin_expect (MB_CUR_MAX, 1) != 1) > > > + if (MB_CUR_MAX != 1) > > > { > > > mbstate_t ps; > > > size_t n; > >
diff --git a/posix/fnmatch.c b/posix/fnmatch.c index fd85efa..4c32992 100644 --- a/posix/fnmatch.c +++ b/posix/fnmatch.c @@ -131,6 +131,13 @@ extern int fnmatch (const char *pattern, const char *string, int flags); # define ISWCTYPE(WC, WT) iswctype (WC, WT) # endif +# ifdef _LIBC +# define STRCASESTR __strcasestr +# else +# define STRCASESTR strcasestr +# endif + + # if (HAVE_MBSTATE_T && HAVE_MBSRTOWCS) || _LIBC /* In this case we are implementing the multibyte character handling. */ # define HANDLE_MULTIBYTE 1 @@ -332,8 +339,62 @@ fnmatch (pattern, string, flags) const char *string; int flags; { + + /* ASCII with \+/.*?[{(@! excluded. */ + static unsigned char normal[256] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; +# if _LIBC + struct __locale_data *current = _NL_CURRENT_LOCALE->__locales[LC_COLLATE]; + uint_fast32_t encoding = + current->values[_NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE)].word; + bool fast_encoding = (encoding != __cet_other); +# else +# if HANDLE_MULTIBYTE + bool is_utf8 = STRCASEEQ (locale_charset (), + "UTF-8", 'U','T','F','-','8',0,0,0,0); + bool fast_encoding = (MB_CUR_MAX == 1) || is_utf; +# else + bool fast_encoding = true; +# endif +# endif + + if (fast_encoding) + { + char start[8]; + char *string2; + size_t i; + for (i = 0; i < 7 && normal[(unsigned char) pattern[i]]; i++) + start[i] = pattern[i]; + start[i] = 0; + if (flags & FNM_CASEFOLD) + string2 = STRCASESTR (string, start); + else + string2 = strstr (string, start); + if (!string2) + return FNM_NOMATCH; + + if (pattern[i] == '\0' && (flags & (FNM_FILE_NAME | FNM_PERIOD)) == 0) + return 0; + } + # if HANDLE_MULTIBYTE - if (__builtin_expect (MB_CUR_MAX, 1) != 1) + if (MB_CUR_MAX != 1) { mbstate_t ps; size_t n;