Message ID | 695cd581035b59c759477b806640ee0b70df05f7.1686530834.git.nabijaczleweli@nabijaczleweli.xyz |
---|---|
State | New |
Headers | show |
Series | [v7,1/3] posix: regcomp(): clear RE_DOT_NOT_NULL | expand |
On 6/11/23 20:47, наб wrote: > re_search_internal () starts with > /* If initial states with non-begbuf contexts have no elements, > the regex must be anchored. If preg->newline_anchor is set, > we'll never use init_state_nl, so do not check it. */ > if (dfa->init_state->nodes.nelem == 0 > && dfa->init_state_word->nodes.nelem == 0 > && (dfa->init_state_nl->nodes.nelem == 0 > || !preg->newline_anchor)) > { > if (start != 0 && last_start != 0) > return REG_NOMATCH; > start = last_start = 0; > } > and heretofor start and last_start (for example when "abc", {1, 2}, > so matching just the "b") were != 0, and the return was taken for a "^b" > regex, which is erroneous. > > Fix this by giving re_search_internal (string+rm_so, start=0), > then fixing up the returned matches in an after-pass. > > This brings us to compatibility with the BSD spec and implementations. > > Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz> These changes are being made to sources shared between gnulib and glibc. As the files are listed in SHARED-SOURCES we cannot easily accept changes to them via DCO since they should be shared with gnulib which still requires copyright assignment. Would you be willing to disclaim these changes or assign copyright? > --- > posix/regexec.c | 41 ++++++++++++++++++++++++++++------------- > 1 file changed, 28 insertions(+), 13 deletions(-) > > diff --git a/posix/regexec.c b/posix/regexec.c > index bd0cd412d0..2ef868e1f6 100644 > --- a/posix/regexec.c > +++ b/posix/regexec.c > @@ -187,38 +187,53 @@ static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len); > string; if REG_NOTEOL is set, then $ does not match at the end. > > Return 0 if a match is found, REG_NOMATCH if not, REG_BADPAT if > - EFLAGS is invalid. */ > + EFLAGS is invalid. > + > + If REG_STARTEND, the bounds are > + [STRING + PMATCH->rm_so, STRING + PMATCH->rm_eo) > + instead of the usual > + [STRING, STRING + strlen(STRING)), > + but returned matches are still referenced to STRING, > + and matching is unaffected (i.e. "abc", {1, 2} matches regex "^b$"). > + re_search_internal () has a built-in assumption of > + (start != 0) <=> (^ doesn't match), so give it a truncated view > + and fix up the matches afterward. */ > > int > regexec (const regex_t *__restrict preg, const char *__restrict string, > size_t nmatch, regmatch_t pmatch[_REGEX_NELTS (nmatch)], int eflags) > { > reg_errcode_t err; > - Idx start, length; > + Idx startoff = 0, length; > re_dfa_t *dfa = preg->buffer; > + size_t i = 0; > > if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND)) > return REG_BADPAT; > > if (eflags & REG_STARTEND) > { > - start = pmatch[0].rm_so; > - length = pmatch[0].rm_eo; > + startoff = pmatch[0].rm_so; > + string += startoff; > + length = pmatch[0].rm_eo - startoff; > } > else > - { > - start = 0; > - length = strlen (string); > - } > + length = strlen (string); > > lock_lock (dfa->lock); > if (preg->no_sub) > - err = re_search_internal (preg, string, length, start, length, > - length, 0, NULL, eflags); > - else > - err = re_search_internal (preg, string, length, start, length, > - length, nmatch, pmatch, eflags); > + nmatch = 0; > + err = re_search_internal (preg, string, length, 0, length, > + length, nmatch, pmatch, eflags); > lock_unlock (dfa->lock); > + > + if (err == REG_NOERROR && startoff) > + for (i = 0; i < nmatch; ++i) > + if (pmatch[i].rm_so != -1) > + { > + pmatch[i].rm_so += startoff; > + pmatch[i].rm_eo += startoff; > + } > return err != REG_NOERROR; > } >
On Mon, Jun 12, 2023 at 09:11:54AM -0400, Carlos O'Donell wrote: > On 6/11/23 20:47, наб wrote: > > re_search_internal () starts with > > /* If initial states with non-begbuf contexts have no elements, > > the regex must be anchored. If preg->newline_anchor is set, > > we'll never use init_state_nl, so do not check it. */ > > if (dfa->init_state->nodes.nelem == 0 > > && dfa->init_state_word->nodes.nelem == 0 > > && (dfa->init_state_nl->nodes.nelem == 0 > > || !preg->newline_anchor)) > > { > > if (start != 0 && last_start != 0) > > return REG_NOMATCH; > > start = last_start = 0; > > } > > and heretofor start and last_start (for example when "abc", {1, 2}, > > so matching just the "b") were != 0, and the return was taken for a "^b" > > regex, which is erroneous. > > > > Fix this by giving re_search_internal (string+rm_so, start=0), > > then fixing up the returned matches in an after-pass. > > > > This brings us to compatibility with the BSD spec and implementations. > > > > Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz> > > These changes are being made to sources shared between gnulib and glibc. > > As the files are listed in SHARED-SOURCES we cannot easily accept changes to them > via DCO since they should be shared with gnulib which still requires copyright > assignment. > > Would you be willing to disclaim these changes or assign copyright? Quite happy to disclaim all patches here, yeah; the process itself is unclear to me, however. Best,
diff --git a/posix/regexec.c b/posix/regexec.c index bd0cd412d0..2ef868e1f6 100644 --- a/posix/regexec.c +++ b/posix/regexec.c @@ -187,38 +187,53 @@ static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len); string; if REG_NOTEOL is set, then $ does not match at the end. Return 0 if a match is found, REG_NOMATCH if not, REG_BADPAT if - EFLAGS is invalid. */ + EFLAGS is invalid. + + If REG_STARTEND, the bounds are + [STRING + PMATCH->rm_so, STRING + PMATCH->rm_eo) + instead of the usual + [STRING, STRING + strlen(STRING)), + but returned matches are still referenced to STRING, + and matching is unaffected (i.e. "abc", {1, 2} matches regex "^b$"). + re_search_internal () has a built-in assumption of + (start != 0) <=> (^ doesn't match), so give it a truncated view + and fix up the matches afterward. */ int regexec (const regex_t *__restrict preg, const char *__restrict string, size_t nmatch, regmatch_t pmatch[_REGEX_NELTS (nmatch)], int eflags) { reg_errcode_t err; - Idx start, length; + Idx startoff = 0, length; re_dfa_t *dfa = preg->buffer; + size_t i = 0; if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND)) return REG_BADPAT; if (eflags & REG_STARTEND) { - start = pmatch[0].rm_so; - length = pmatch[0].rm_eo; + startoff = pmatch[0].rm_so; + string += startoff; + length = pmatch[0].rm_eo - startoff; } else - { - start = 0; - length = strlen (string); - } + length = strlen (string); lock_lock (dfa->lock); if (preg->no_sub) - err = re_search_internal (preg, string, length, start, length, - length, 0, NULL, eflags); - else - err = re_search_internal (preg, string, length, start, length, - length, nmatch, pmatch, eflags); + nmatch = 0; + err = re_search_internal (preg, string, length, 0, length, + length, nmatch, pmatch, eflags); lock_unlock (dfa->lock); + + if (err == REG_NOERROR && startoff) + for (i = 0; i < nmatch; ++i) + if (pmatch[i].rm_so != -1) + { + pmatch[i].rm_so += startoff; + pmatch[i].rm_eo += startoff; + } return err != REG_NOERROR; }
re_search_internal () starts with /* If initial states with non-begbuf contexts have no elements, the regex must be anchored. If preg->newline_anchor is set, we'll never use init_state_nl, so do not check it. */ if (dfa->init_state->nodes.nelem == 0 && dfa->init_state_word->nodes.nelem == 0 && (dfa->init_state_nl->nodes.nelem == 0 || !preg->newline_anchor)) { if (start != 0 && last_start != 0) return REG_NOMATCH; start = last_start = 0; } and heretofor start and last_start (for example when "abc", {1, 2}, so matching just the "b") were != 0, and the return was taken for a "^b" regex, which is erroneous. Fix this by giving re_search_internal (string+rm_so, start=0), then fixing up the returned matches in an after-pass. This brings us to compatibility with the BSD spec and implementations. Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz> --- posix/regexec.c | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-)