diff mbox series

[v4,3/3] posix: regexec(): fix REG_STARTEND, pmatch->rm_so != 0 w/^ anchor

Message ID f758cdf7bb2cf9635d7c39c4144ce4f40b2a95f3.1683500149.git.nabijaczleweli@nabijaczleweli.xyz
State New
Headers show
Series [v4,1/3] posix: add (failing) test for REG_STARTEND | expand

Commit Message

Ahelenia Ziemiańska May 7, 2023, 10:56 p.m. UTC
re_search_internal () starts with
  /* If initial states with non-begbuf contexts have no elements,
     the regex must be anchored.  If preg->newline_anchor is set,
     we'll never use init_state_nl, so do not check it.  */
  if (dfa->init_state->nodes.nelem == 0
      && dfa->init_state_word->nodes.nelem == 0
      && (dfa->init_state_nl->nodes.nelem == 0
	  || !preg->newline_anchor))
    {
      if (start != 0 && last_start != 0)
        return REG_NOMATCH;
      start = last_start = 0;
    }
and heretofor start and last_start (for example when "abc", {1, 2},
so matching just the "b") were != 0, and the return was taken for a "^b"
regex, which is erroneous.

Fix this by giving re_search_internal (string+rm_so, start=0),
then fixing up the returned matches in an after-pass.

This brings us to compatibility with the BSD spec and implementations.

Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
---
 posix/regexec.c | 41 ++++++++++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 13 deletions(-)

Comments

Adhemerval Zanella May 29, 2023, 6:11 p.m. UTC | #1
Hey Paul,

Could you take a look if this change make sense? We usually tend just sync 
with gnulib, and you and other gnulib developers seems way more active 
working on this code than glibc community.

On 07/05/23 19:56, наб via Libc-alpha wrote:
> re_search_internal () starts with
>   /* If initial states with non-begbuf contexts have no elements,
>      the regex must be anchored.  If preg->newline_anchor is set,
>      we'll never use init_state_nl, so do not check it.  */
>   if (dfa->init_state->nodes.nelem == 0
>       && dfa->init_state_word->nodes.nelem == 0
>       && (dfa->init_state_nl->nodes.nelem == 0
> 	  || !preg->newline_anchor))
>     {
>       if (start != 0 && last_start != 0)
>         return REG_NOMATCH;
>       start = last_start = 0;
>     }
> and heretofor start and last_start (for example when "abc", {1, 2},
> so matching just the "b") were != 0, and the return was taken for a "^b"
> regex, which is erroneous.
> 
> Fix this by giving re_search_internal (string+rm_so, start=0),
> then fixing up the returned matches in an after-pass.
> 
> This brings us to compatibility with the BSD spec and implementations.
> 
> Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
> ---
>  posix/regexec.c | 41 ++++++++++++++++++++++++++++-------------
>  1 file changed, 28 insertions(+), 13 deletions(-)
> 
> diff --git a/posix/regexec.c b/posix/regexec.c
> index bd0cd412d0..2ef868e1f6 100644
> --- a/posix/regexec.c
> +++ b/posix/regexec.c
> @@ -187,38 +187,53 @@ static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
>     string; if REG_NOTEOL is set, then $ does not match at the end.
>  
>     Return 0 if a match is found, REG_NOMATCH if not, REG_BADPAT if
> -   EFLAGS is invalid.  */
> +   EFLAGS is invalid.
> +
> +   If REG_STARTEND, the bounds are
> +     [STRING + PMATCH->rm_so, STRING + PMATCH->rm_eo)
> +   instead of the usual
> +     [STRING, STRING + strlen(STRING)),
> +   but returned matches are still referenced to STRING,
> +   and matching is unaffected (i.e. "abc", {1, 2} matches regex "^b$").
> +   re_search_internal () has a built-in assumption of
> +   (start != 0) <=> (^ doesn't match), so give it a truncated view
> +   and fix up the matches afterward.  */
>  
>  int
>  regexec (const regex_t *__restrict preg, const char *__restrict string,
>  	 size_t nmatch, regmatch_t pmatch[_REGEX_NELTS (nmatch)], int eflags)
>  {
>    reg_errcode_t err;
> -  Idx start, length;
> +  Idx startoff = 0, length;
>    re_dfa_t *dfa = preg->buffer;
> +  size_t i = 0;
>  
>    if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
>      return REG_BADPAT;
>  
>    if (eflags & REG_STARTEND)
>      {
> -      start = pmatch[0].rm_so;
> -      length = pmatch[0].rm_eo;
> +      startoff = pmatch[0].rm_so;
> +      string += startoff;
> +      length = pmatch[0].rm_eo - startoff;
>      }
>    else
> -    {
> -      start = 0;
> -      length = strlen (string);
> -    }
> +    length = strlen (string);
>  
>    lock_lock (dfa->lock);
>    if (preg->no_sub)
> -    err = re_search_internal (preg, string, length, start, length,
> -			      length, 0, NULL, eflags);
> -  else
> -    err = re_search_internal (preg, string, length, start, length,
> -			      length, nmatch, pmatch, eflags);
> +    nmatch = 0;
> +  err = re_search_internal (preg, string, length, 0, length,
> +			    length, nmatch, pmatch, eflags);
>    lock_unlock (dfa->lock);
> +
> +  if (err == REG_NOERROR && startoff)
> +    for (i = 0; i < nmatch; ++i)
> +      if (pmatch[i].rm_so != -1)
> +	{
> +	  pmatch[i].rm_so += startoff;
> +	  pmatch[i].rm_eo += startoff;
> +	}
>    return err != REG_NOERROR;
>  }
>
diff mbox series

Patch

diff --git a/posix/regexec.c b/posix/regexec.c
index bd0cd412d0..2ef868e1f6 100644
--- a/posix/regexec.c
+++ b/posix/regexec.c
@@ -187,38 +187,53 @@  static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
    string; if REG_NOTEOL is set, then $ does not match at the end.
 
    Return 0 if a match is found, REG_NOMATCH if not, REG_BADPAT if
-   EFLAGS is invalid.  */
+   EFLAGS is invalid.
+
+   If REG_STARTEND, the bounds are
+     [STRING + PMATCH->rm_so, STRING + PMATCH->rm_eo)
+   instead of the usual
+     [STRING, STRING + strlen(STRING)),
+   but returned matches are still referenced to STRING,
+   and matching is unaffected (i.e. "abc", {1, 2} matches regex "^b$").
+   re_search_internal () has a built-in assumption of
+   (start != 0) <=> (^ doesn't match), so give it a truncated view
+   and fix up the matches afterward.  */
 
 int
 regexec (const regex_t *__restrict preg, const char *__restrict string,
 	 size_t nmatch, regmatch_t pmatch[_REGEX_NELTS (nmatch)], int eflags)
 {
   reg_errcode_t err;
-  Idx start, length;
+  Idx startoff = 0, length;
   re_dfa_t *dfa = preg->buffer;
+  size_t i = 0;
 
   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
     return REG_BADPAT;
 
   if (eflags & REG_STARTEND)
     {
-      start = pmatch[0].rm_so;
-      length = pmatch[0].rm_eo;
+      startoff = pmatch[0].rm_so;
+      string += startoff;
+      length = pmatch[0].rm_eo - startoff;
     }
   else
-    {
-      start = 0;
-      length = strlen (string);
-    }
+    length = strlen (string);
 
   lock_lock (dfa->lock);
   if (preg->no_sub)
-    err = re_search_internal (preg, string, length, start, length,
-			      length, 0, NULL, eflags);
-  else
-    err = re_search_internal (preg, string, length, start, length,
-			      length, nmatch, pmatch, eflags);
+    nmatch = 0;
+  err = re_search_internal (preg, string, length, 0, length,
+			    length, nmatch, pmatch, eflags);
   lock_unlock (dfa->lock);
+
+  if (err == REG_NOERROR && startoff)
+    for (i = 0; i < nmatch; ++i)
+      if (pmatch[i].rm_so != -1)
+	{
+	  pmatch[i].rm_so += startoff;
+	  pmatch[i].rm_eo += startoff;
+	}
   return err != REG_NOERROR;
 }