Message ID | f758cdf7bb2cf9635d7c39c4144ce4f40b2a95f3.1683500149.git.nabijaczleweli@nabijaczleweli.xyz |
---|---|
State | New |
Headers | show |
Series | [v4,1/3] posix: add (failing) test for REG_STARTEND | expand |
Hey Paul, Could you take a look if this change make sense? We usually tend just sync with gnulib, and you and other gnulib developers seems way more active working on this code than glibc community. On 07/05/23 19:56, наб via Libc-alpha wrote: > re_search_internal () starts with > /* If initial states with non-begbuf contexts have no elements, > the regex must be anchored. If preg->newline_anchor is set, > we'll never use init_state_nl, so do not check it. */ > if (dfa->init_state->nodes.nelem == 0 > && dfa->init_state_word->nodes.nelem == 0 > && (dfa->init_state_nl->nodes.nelem == 0 > || !preg->newline_anchor)) > { > if (start != 0 && last_start != 0) > return REG_NOMATCH; > start = last_start = 0; > } > and heretofor start and last_start (for example when "abc", {1, 2}, > so matching just the "b") were != 0, and the return was taken for a "^b" > regex, which is erroneous. > > Fix this by giving re_search_internal (string+rm_so, start=0), > then fixing up the returned matches in an after-pass. > > This brings us to compatibility with the BSD spec and implementations. > > Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz> > --- > posix/regexec.c | 41 ++++++++++++++++++++++++++++------------- > 1 file changed, 28 insertions(+), 13 deletions(-) > > diff --git a/posix/regexec.c b/posix/regexec.c > index bd0cd412d0..2ef868e1f6 100644 > --- a/posix/regexec.c > +++ b/posix/regexec.c > @@ -187,38 +187,53 @@ static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len); > string; if REG_NOTEOL is set, then $ does not match at the end. > > Return 0 if a match is found, REG_NOMATCH if not, REG_BADPAT if > - EFLAGS is invalid. */ > + EFLAGS is invalid. > + > + If REG_STARTEND, the bounds are > + [STRING + PMATCH->rm_so, STRING + PMATCH->rm_eo) > + instead of the usual > + [STRING, STRING + strlen(STRING)), > + but returned matches are still referenced to STRING, > + and matching is unaffected (i.e. "abc", {1, 2} matches regex "^b$"). > + re_search_internal () has a built-in assumption of > + (start != 0) <=> (^ doesn't match), so give it a truncated view > + and fix up the matches afterward. */ > > int > regexec (const regex_t *__restrict preg, const char *__restrict string, > size_t nmatch, regmatch_t pmatch[_REGEX_NELTS (nmatch)], int eflags) > { > reg_errcode_t err; > - Idx start, length; > + Idx startoff = 0, length; > re_dfa_t *dfa = preg->buffer; > + size_t i = 0; > > if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND)) > return REG_BADPAT; > > if (eflags & REG_STARTEND) > { > - start = pmatch[0].rm_so; > - length = pmatch[0].rm_eo; > + startoff = pmatch[0].rm_so; > + string += startoff; > + length = pmatch[0].rm_eo - startoff; > } > else > - { > - start = 0; > - length = strlen (string); > - } > + length = strlen (string); > > lock_lock (dfa->lock); > if (preg->no_sub) > - err = re_search_internal (preg, string, length, start, length, > - length, 0, NULL, eflags); > - else > - err = re_search_internal (preg, string, length, start, length, > - length, nmatch, pmatch, eflags); > + nmatch = 0; > + err = re_search_internal (preg, string, length, 0, length, > + length, nmatch, pmatch, eflags); > lock_unlock (dfa->lock); > + > + if (err == REG_NOERROR && startoff) > + for (i = 0; i < nmatch; ++i) > + if (pmatch[i].rm_so != -1) > + { > + pmatch[i].rm_so += startoff; > + pmatch[i].rm_eo += startoff; > + } > return err != REG_NOERROR; > } >
diff --git a/posix/regexec.c b/posix/regexec.c index bd0cd412d0..2ef868e1f6 100644 --- a/posix/regexec.c +++ b/posix/regexec.c @@ -187,38 +187,53 @@ static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len); string; if REG_NOTEOL is set, then $ does not match at the end. Return 0 if a match is found, REG_NOMATCH if not, REG_BADPAT if - EFLAGS is invalid. */ + EFLAGS is invalid. + + If REG_STARTEND, the bounds are + [STRING + PMATCH->rm_so, STRING + PMATCH->rm_eo) + instead of the usual + [STRING, STRING + strlen(STRING)), + but returned matches are still referenced to STRING, + and matching is unaffected (i.e. "abc", {1, 2} matches regex "^b$"). + re_search_internal () has a built-in assumption of + (start != 0) <=> (^ doesn't match), so give it a truncated view + and fix up the matches afterward. */ int regexec (const regex_t *__restrict preg, const char *__restrict string, size_t nmatch, regmatch_t pmatch[_REGEX_NELTS (nmatch)], int eflags) { reg_errcode_t err; - Idx start, length; + Idx startoff = 0, length; re_dfa_t *dfa = preg->buffer; + size_t i = 0; if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND)) return REG_BADPAT; if (eflags & REG_STARTEND) { - start = pmatch[0].rm_so; - length = pmatch[0].rm_eo; + startoff = pmatch[0].rm_so; + string += startoff; + length = pmatch[0].rm_eo - startoff; } else - { - start = 0; - length = strlen (string); - } + length = strlen (string); lock_lock (dfa->lock); if (preg->no_sub) - err = re_search_internal (preg, string, length, start, length, - length, 0, NULL, eflags); - else - err = re_search_internal (preg, string, length, start, length, - length, nmatch, pmatch, eflags); + nmatch = 0; + err = re_search_internal (preg, string, length, 0, length, + length, nmatch, pmatch, eflags); lock_unlock (dfa->lock); + + if (err == REG_NOERROR && startoff) + for (i = 0; i < nmatch; ++i) + if (pmatch[i].rm_so != -1) + { + pmatch[i].rm_so += startoff; + pmatch[i].rm_eo += startoff; + } return err != REG_NOERROR; }
re_search_internal () starts with /* If initial states with non-begbuf contexts have no elements, the regex must be anchored. If preg->newline_anchor is set, we'll never use init_state_nl, so do not check it. */ if (dfa->init_state->nodes.nelem == 0 && dfa->init_state_word->nodes.nelem == 0 && (dfa->init_state_nl->nodes.nelem == 0 || !preg->newline_anchor)) { if (start != 0 && last_start != 0) return REG_NOMATCH; start = last_start = 0; } and heretofor start and last_start (for example when "abc", {1, 2}, so matching just the "b") were != 0, and the return was taken for a "^b" regex, which is erroneous. Fix this by giving re_search_internal (string+rm_so, start=0), then fixing up the returned matches in an after-pass. This brings us to compatibility with the BSD spec and implementations. Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz> --- posix/regexec.c | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-)