diff mbox series

[v9,2/3] posix: regexec(): fix REG_STARTEND, pmatch->rm_so != 0 w/^ anchor

Message ID 2b9576418315d3eb1863148d2dbf1fadf1fccc9f.1688406717.git.nabijaczleweli@nabijaczleweli.xyz
State New
Headers show
Series [v9,1/3] posix: regcomp(): clear RE_DOT_NOT_NULL | expand

Commit Message

наб July 3, 2023, 5:52 p.m. UTC
re_search_internal () starts with
  /* If initial states with non-begbuf contexts have no elements,
     the regex must be anchored.  If preg->newline_anchor is set,
     we'll never use init_state_nl, so do not check it.  */
  if (dfa->init_state->nodes.nelem == 0
      && dfa->init_state_word->nodes.nelem == 0
      && (dfa->init_state_nl->nodes.nelem == 0
	  || !preg->newline_anchor))
    {
      if (start != 0 && last_start != 0)
        return REG_NOMATCH;
      start = last_start = 0;
    }
and heretofor start and last_start (for example when "abc", {1, 2},
so matching just the "b") were != 0, and the return was taken for a "^b"
regex, which is erroneous.

Fix this by giving re_search_internal (string+rm_so, start=0),
then fixing up the returned matches in an after-pass.

This brings us to compatibility with the BSD spec and implementations.

Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
---
I hereby disclaim all copyright interest in this changeset.

 posix/regexec.c | 41 ++++++++++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 13 deletions(-)
diff mbox series

Patch

diff --git a/posix/regexec.c b/posix/regexec.c
index bd0cd412d0..2ef868e1f6 100644
--- a/posix/regexec.c
+++ b/posix/regexec.c
@@ -187,38 +187,53 @@  static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
    string; if REG_NOTEOL is set, then $ does not match at the end.
 
    Return 0 if a match is found, REG_NOMATCH if not, REG_BADPAT if
-   EFLAGS is invalid.  */
+   EFLAGS is invalid.
+
+   If REG_STARTEND, the bounds are
+     [STRING + PMATCH->rm_so, STRING + PMATCH->rm_eo)
+   instead of the usual
+     [STRING, STRING + strlen(STRING)),
+   but returned matches are still referenced to STRING,
+   and matching is unaffected (i.e. "abc", {1, 2} matches regex "^b$").
+   re_search_internal () has a built-in assumption of
+   (start != 0) <=> (^ doesn't match), so give it a truncated view
+   and fix up the matches afterward.  */
 
 int
 regexec (const regex_t *__restrict preg, const char *__restrict string,
 	 size_t nmatch, regmatch_t pmatch[_REGEX_NELTS (nmatch)], int eflags)
 {
   reg_errcode_t err;
-  Idx start, length;
+  Idx startoff = 0, length;
   re_dfa_t *dfa = preg->buffer;
+  size_t i = 0;
 
   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
     return REG_BADPAT;
 
   if (eflags & REG_STARTEND)
     {
-      start = pmatch[0].rm_so;
-      length = pmatch[0].rm_eo;
+      startoff = pmatch[0].rm_so;
+      string += startoff;
+      length = pmatch[0].rm_eo - startoff;
     }
   else
-    {
-      start = 0;
-      length = strlen (string);
-    }
+    length = strlen (string);
 
   lock_lock (dfa->lock);
   if (preg->no_sub)
-    err = re_search_internal (preg, string, length, start, length,
-			      length, 0, NULL, eflags);
-  else
-    err = re_search_internal (preg, string, length, start, length,
-			      length, nmatch, pmatch, eflags);
+    nmatch = 0;
+  err = re_search_internal (preg, string, length, 0, length,
+			    length, nmatch, pmatch, eflags);
   lock_unlock (dfa->lock);
+
+  if (err == REG_NOERROR && startoff)
+    for (i = 0; i < nmatch; ++i)
+      if (pmatch[i].rm_so != -1)
+	{
+	  pmatch[i].rm_so += startoff;
+	  pmatch[i].rm_eo += startoff;
+	}
   return err != REG_NOERROR;
 }