diff mbox series

libcpp: Implement the strict reading of the #embed expansion rules

Message ID ZtccNY1sFIx/8rAp@tucnak
State New
Headers show
Series libcpp: Implement the strict reading of the #embed expansion rules | expand

Commit Message

Jakub Jelinek Sept. 3, 2024, 2:24 p.m. UTC
Hi!

The following patch attempts to implement the current wording of
the C23 #embed expansion rules on top of the
https://gcc.gnu.org/pipermail/gcc-patches/2024-August/661901.html
patch (haven't yet adjusted the rest of the series, but I expect
only minor tweaks).
After parsing #embed, it first checks whether the following tokens,
lexed with prevent_expansion = 1 (i.e. without macro expansion),
match the
<h-char-sequence> embed-parameter-sequence[opt] new-line
or
"q-char-sequence" embed-parameter-sequence[opt] new-line
grammar.  If not (and that can be for tons of reasons, the first
token being a CPP_NAME (rather than CPP_HEADER_NAME or CPP_STRING),
or e.g. unbalanced token sequence in some parameter clause, or
(not currently tested in the patch, would need to wait for at least
two gnu namespace parameters other than gnu::base64) e.g.
identifier::identifier2::identifier3 () syntax (where
#define identifier gnu
#define identifier2 offset (16) gnu
#define identifier3 whatever
) etc.), the directive is handled as in the previous patch, i.e.
everything after #embed is macro expanded.
If the tokens do match the grammar, the parameter names aren't macro
expanded.  Only the argument of limit (and later on of gnu::offset) is
macro expanded right away; an error is diagnosed if the closing ) of
that argument comes from a macro expansion, so that one can't bypass
the no-expansion rule that way (see embed-29.c).
The prefix/suffix/if_empty arguments are macro expanded as well, but
only when they are actually emitted into the #embed replacement: for a
non-empty resource the if_empty tokens aren't macro expanded, and for
an empty resource the prefix/suffix tokens aren't macro expanded.
Because this expansion happens only after the balanced token sequence
has been checked, the result of the expansion can contain unbalanced
parens.

I've done this for #embed only and not for __has_embed because, as I
wrote in my earlier mail, whether __has_embed is always macro expanded
or not is unclear given the conflicting wording (or should it be
expanded twice in some cases?).  Given those open questions on
__has_embed, I haven't added further testsuite coverage for macro
expansion of __has_embed either.

2024-09-03  Jakub Jelinek  <jakub@redhat.com>

libcpp/
	* internal.h (struct cpp_embed_params): Add no_expand member.
	* directives.cc (skip_balanced_token_seq): Don't set NO_EXPAND
	flags on the tokens here.
	(check_balanced_token_seq): New function.
	(do_embed): Check whether non-expanded tokens match
	<h-char-sequence> embed-parameter-sequence[opt] new-line
	or "q-char-sequence" embed-parameter-sequence[opt] new-line
	grammar, if yes, set params.no_expand and don't macro expand
	most of the tokens.
	* expr.cc (_cpp_parse_expr): Enable macro expansion if disabled
	in #embed argument and diagnose if closing paren comes from
	a macro.
	* files.cc (maybe_expand_embed_params_tokens): New function.
	(_cpp_stack_embed): Call maybe_expand_embed_params_tokens if
	needed, set NO_EXPAND flags on the tokens coming from
	prefix/suffix/if_empty.
gcc/testsuite/
	* c-c++-common/cpp/embed-28.c: New test.
	* c-c++-common/cpp/embed-29.c: New test.


	Jakub
diff mbox series

Patch

--- libcpp/internal.h.jj	2024-09-02 17:09:22.739723226 +0200
+++ libcpp/internal.h	2024-09-02 17:24:23.290579871 +0200
@@ -636,7 +636,7 @@  struct cpp_embed_params_tokens
 struct cpp_embed_params
 {
   location_t loc;
-  bool has_embed;
+  bool has_embed, no_expand;
   cpp_num_part limit;
   cpp_embed_params_tokens prefix, suffix, if_empty;
 };
--- libcpp/directives.cc.jj	2024-09-02 17:09:22.757723007 +0200
+++ libcpp/directives.cc	2024-09-03 15:46:09.891236633 +0200
@@ -977,7 +977,6 @@  skip_balanced_token_seq (cpp_reader *pfi
 	      save->cur_token = save->cur_run->base;
 	    }
 	  *save->cur_token = *token;
-	  save->cur_token->flags |= NO_EXPAND;
 	  save->cur_token++;
 	  save->count++;
 	}
@@ -1187,6 +1186,49 @@  _cpp_parse_embed_params (cpp_reader *pfi
   while (1);
 }
 
+/* Skip over balanced token sequence, stopping at END token.  Return
+   true if it is valid, false if invalid.  Update *CNT by the number of
+   consumed tokens.  */
+
+static bool
+check_balanced_token_seq (cpp_reader *pfile, cpp_ttype end, unsigned *cnt)
+{
+  do
+    {
+      const cpp_token *token = cpp_peek_token (pfile, 0);
+      if (token->type == CPP_EOF)
+	return false;
+      token = cpp_get_token (pfile);
+      ++*cnt;
+      if (token->type == end)
+	return true;
+      switch (token->type)
+	{
+	case CPP_OPEN_PAREN:
+	  if (!check_balanced_token_seq (pfile, CPP_CLOSE_PAREN, cnt))
+	    return false;
+	  break;
+	case CPP_OPEN_SQUARE:
+	  if (!check_balanced_token_seq (pfile, CPP_CLOSE_SQUARE, cnt))
+	    return false;
+	  break;
+	case CPP_OPEN_BRACE:
+	  if (!check_balanced_token_seq (pfile, CPP_CLOSE_BRACE, cnt))
+	    return false;
+	  break;
+	case CPP_CLOSE_PAREN:
+	case CPP_CLOSE_SQUARE:
+	case CPP_CLOSE_BRACE:
+	  return false;
+	default:
+	  break;
+	}
+    }
+  while (1);
+}
+
+
+
 /* Handle #embed directive.  */
 
 static void
@@ -1196,9 +1238,13 @@  do_embed (cpp_reader *pfile)
   struct cpp_embed_params params = {};
   bool ok;
   const char *fname = NULL;
+  unsigned int cnt, state;
+  void (*line_change) (cpp_reader *, const cpp_token *, int);
+  unsigned char prevent_expansion;
 
   /* Tell the lexer this is an embed directive.  */
   pfile->state.in_directive = 3;
+  prevent_expansion = pfile->state.prevent_expansion;
 
   if (CPP_OPTION (pfile, traditional))
     {
@@ -1218,6 +1264,113 @@  do_embed (cpp_reader *pfile)
 		   "#%s before C23 is a GCC extension", "embed");
     }
 
+  /* Determine if the #embed directive should be macro expanded or not.  */
+  pfile->state.prevent_expansion = 1;
+  pfile->keep_tokens++;
+  params.no_expand = true;
+  state = 0;
+  cnt = 0;
+  /* For peeked tokens temporarily disable line_change reporting,
+     until the tokens are parsed for real.  */
+  line_change = pfile->cb.line_change;
+  pfile->cb.line_change = NULL;
+  while (true)
+    {
+      const cpp_token *tok = cpp_peek_token (pfile, 0);
+      if (tok->type == CPP_EOF)
+	{
+	  switch (state)
+	    {
+	    case 0:
+	    case 3:
+	    case 4:
+	      params.no_expand = false;
+	      break;
+	    }
+	  break;
+	}
+      tok = cpp_get_token (pfile);
+      ++cnt;
+      if (tok->type == CPP_PADDING)
+	continue;
+      switch (state)
+	{
+	case 0:
+	  if ((tok->type == CPP_STRING && tok->val.str.text[0] != 'R')
+	      || tok->type == CPP_HEADER_NAME)
+	    {
+	      pfile->state.angled_headers = false;
+	      state = 1;
+	      continue;
+	    }
+	  break;
+	case 1:
+	  if (tok->type == CPP_NAME)
+	    {
+	      state = 2;
+	      continue;
+	    }
+	  break;
+	case 2:
+	  if (tok->type == CPP_NAME)
+	    continue;
+	  else if (tok->type == CPP_SCOPE)
+	    {
+	      state = 4;
+	      continue;
+	    }
+	  else if (tok->type == CPP_COLON && (tok->flags & COLON_SCOPE) != 0)
+	    {
+	      state = 3;
+	      continue;
+	    }
+	  else if (tok->type == CPP_OPEN_PAREN
+		   && check_balanced_token_seq (pfile, CPP_CLOSE_PAREN, &cnt))
+	    {
+	      state = 1;
+	      continue;
+	    }
+	  break;
+	case 3:
+	  if (tok->type == CPP_COLON)
+	    {
+	      state = 4;
+	      continue;
+	    }
+	  break;
+	case 4:
+	  if (tok->type == CPP_NAME)
+	    {
+	      state = 5;
+	      continue;
+	    }
+	  break;
+	case 5:
+	  if (tok->type == CPP_NAME)
+	    {
+	      state = 2;
+	      continue;
+	    }
+	  else if (tok->type == CPP_OPEN_PAREN
+		   && check_balanced_token_seq (pfile, CPP_CLOSE_PAREN, &cnt))
+	    {
+	      state = 1;
+	      continue;
+	    }
+	  break;
+	default:
+	  break;
+	}
+      params.no_expand = false;
+      break;
+    }
+
+  _cpp_backup_tokens_direct (pfile, cnt);
+  pfile->keep_tokens--;
+  pfile->cb.line_change = line_change;
+  pfile->state.angled_headers = true;
+  pfile->state.prevent_expansion = params.no_expand;
+
   fname = parse_include (pfile, &angle_brackets, NULL, &params.loc);
   if (!fname)
     {
@@ -1266,6 +1419,7 @@  do_embed (cpp_reader *pfile)
     }
 
  done:
+  pfile->state.prevent_expansion = prevent_expansion;
   XDELETEVEC (fname);
 }
 
--- libcpp/expr.cc.jj	2024-09-02 17:09:22.775722787 +0200
+++ libcpp/expr.cc	2024-09-03 14:14:40.039487661 +0200
@@ -1379,6 +1379,7 @@  _cpp_parse_expr (cpp_reader *pfile, cons
   unsigned int lex_count;
   bool saw_leading_not, want_value = true;
   location_t virtual_location = 0;
+  unsigned char no_expand = 0;
 
   pfile->state.skip_eval = 0;
 
@@ -1396,6 +1397,8 @@  _cpp_parse_expr (cpp_reader *pfile, cons
       top->op = CPP_OPEN_PAREN;
       top->token = open_paren;
       top->loc = open_paren->src_loc;
+      no_expand = pfile->state.prevent_expansion;
+      pfile->state.prevent_expansion = 0;
     }
 
   for (;;)
@@ -1493,7 +1496,12 @@  _cpp_parse_expr (cpp_reader *pfile, cons
 	{
 	case CPP_CLOSE_PAREN:
 	  if (pfile->state.in_directive == 3 && top == pfile->op_stack)
-	    goto embed_done;
+	    {
+	      if (pfile->context->prev && no_expand)
+		cpp_error_with_line (pfile, CPP_DL_ERROR, op.loc, 0,
+				     "closing ')' comes from macro expansion");
+	      goto embed_done;
+	    }
 	  continue;
 	case CPP_OR_OR:
 	  if (!num_zerop (top->value))
@@ -1538,12 +1546,16 @@  _cpp_parse_expr (cpp_reader *pfile, cons
       cpp_error_with_line (pfile, CPP_DL_ICE, top->loc, 0,
 			   "unbalanced stack in %s", dir);
     syntax_error:
+      if (no_expand)
+	pfile->state.prevent_expansion = no_expand;
       return false;  /* Return false on syntax error.  */
     }
 
   if (pfile->state.in_directive == 3)
     {
     embed_done:
+      if (no_expand)
+	pfile->state.prevent_expansion = 1;
       if (num_zerop (top->value))
 	return 0;
       if (!top->value.unsignedp
--- libcpp/files.cc.jj	2024-09-02 17:09:22.782722701 +0200
+++ libcpp/files.cc	2024-09-03 15:25:27.126296537 +0200
@@ -1217,6 +1217,76 @@  cpp_probe_header_unit (cpp_reader *pfile
   return nullptr;
 }
 
+/* Macro expand the TOKENS.  */
+
+void
+maybe_expand_embed_params_tokens (cpp_reader *pfile,
+				  cpp_embed_params_tokens *tokens)
+{
+  if (tokens->count == 0)
+    return;
+
+  _cpp_buff *tok_buff
+    = _cpp_get_buff (pfile, (tokens->count + 1) * sizeof (cpp_token));
+  cpp_token *toks = (cpp_token *) tok_buff->base;
+  cpp_token *tok = toks;
+  tokenrun *cur_run = &tokens->base_run;
+  while (cur_run)
+    {
+      size_t cnt = (cur_run->next ? cur_run->limit
+		    : tokens->cur_token) - cur_run->base;
+      cpp_token *t = cur_run->base;
+      memcpy (tok, t, cnt * sizeof (cpp_token));
+      tok += cnt;
+      cur_run = cur_run->next;
+    }
+  tok->type = CPP_EOF;
+  tok->src_loc = pfile->line_table->highest_line;
+  tok->flags = BOL;
+  ++tok;
+  tokenrun *n;
+  for (tokenrun *t = &tokens->base_run; t; t = n)
+    {
+      n = t->next;
+      XDELETEVEC (t->base);
+      if (t != &tokens->base_run)
+	XDELETE (t);
+    }
+  _cpp_push_token_context (pfile, NULL, toks, tok - toks);
+  pfile->context->buff = tok_buff;
+  tokens->count = 0;
+  _cpp_init_tokenrun (&tokens->base_run, 4);
+  tokens->cur_run = &tokens->base_run;
+  tokens->cur_token = tokens->base_run.base;
+  pfile->state.prevent_expansion = 0;
+  pfile->state.in_directive = 0;
+  do
+    {
+      const cpp_token *token = cpp_peek_token (pfile, 0);
+      if (token->type == CPP_EOF)
+	break;
+      token = cpp_get_token (pfile);
+      if (token->type == CPP_PADDING && tokens->count == 0)
+	continue;
+      if (tokens->cur_token == tokens->cur_run->limit)
+	{
+	  tokens->cur_run->next = XNEW (tokenrun);
+	  tokens->cur_run->next->prev = tokens->cur_run;
+	  _cpp_init_tokenrun (tokens->cur_run->next, 4);
+	  tokens->cur_run = tokens->cur_run->next;
+	  tokens->cur_token = tokens->cur_run->base;
+	}
+      *tokens->cur_token = *token;
+      tokens->cur_token++;
+      tokens->count++;
+    }
+  while (1);
+  while (pfile->context->prev)
+    _cpp_pop_context (pfile);
+  pfile->state.prevent_expansion = 1;
+  pfile->state.in_directive = 3;
+}
+
 /* Try to load FNAME with #embed/__has_embed parameters PARAMS.
    If !PARAMS->has_embed, return new token in pfile->directive_result
    (first token) and rest in a pushed non-macro context.
@@ -1392,6 +1462,17 @@  _cpp_stack_embed (cpp_reader *pfile, con
   if (params->limit < limit)
     limit = params->limit;
 
+  if (params->no_expand)
+    {
+      if (limit)
+	{
+	  maybe_expand_embed_params_tokens (pfile, &params->prefix);
+	  maybe_expand_embed_params_tokens (pfile, &params->suffix);
+	}
+      else
+	maybe_expand_embed_params_tokens (pfile, &params->if_empty);
+    }
+
   /* For sizes larger than say 64 bytes, this is just a temporary
      solution, we should emit a single new token which the FEs will
      handle as an optimization.  */
@@ -1470,6 +1551,9 @@  _cpp_stack_embed (cpp_reader *pfile, con
 	  tok += cnt;
 	  cur_run = cur_run->next;
 	}
+      pfile->directive_result.flags |= NO_EXPAND;
+      for (cpp_token *t = toks; t < tok; ++t)
+	t->flags |= NO_EXPAND;
     }
   for (size_t i = 0; i < limit; ++i)
     {
@@ -1507,6 +1591,8 @@  _cpp_stack_embed (cpp_reader *pfile, con
 	  cur_run = cur_run->next;
 	}
       orig_tok->flags |= PREV_WHITE;
+      for (cpp_token *t = orig_tok; t < tok; ++t)
+	t->flags |= NO_EXPAND;
     }
   pfile->directive_result.flags |= PREV_WHITE;
   if (count)
--- gcc/testsuite/c-c++-common/cpp/embed-28.c.jj	2024-09-03 14:50:30.710448414 +0200
+++ gcc/testsuite/c-c++-common/cpp/embed-28.c	2024-09-03 15:49:05.877117035 +0200
@@ -0,0 +1,66 @@ 
+/* { dg-do run } */
+/* { dg-options "--embed-dir=${srcdir}/c-c++-common/cpp/embed-dir" } */
+/* { dg-additional-options "-std=c23" { target c } } */
+
+const unsigned char a[] = {
+#embed "magna-carta.txt" prefix (1, ) suffix (, 2) limit (128)
+};
+#define embed !
+#define limit !
+#define prefix !
+#define suffix !
+#define if_empty !
+#define prefix_arg unsigned char b[] = { 1,
+#define suffix_arg , 2 };
+#define limit_arg 128
+#define concat(x,y) x##y
+#embed "magna-carta.txt" prefix (const prefix_arg) suffix (suffix_arg) limit (limit_arg) if_empty (concat (.,.) concat (<,>) concat (({[]}),(([[{{}}]]))))
+#define magna_carta "magna-carta.txt"
+#undef limit
+#undef prefix
+#undef suffix
+#undef if_empty
+#define limit __prefix__
+#define prefix __suffix__
+#define suffix __limit__
+#define empty
+const unsigned char c[] = {
+#embed empty "magna-carta.txt" limit (1, ) suffix (limit_arg) prefix (, 2)
+};
+const unsigned char d[] = {
+#embed magna_carta limit (1, ) prefix (, 2) suffix (128)
+};
+#define ignore(x)
+const unsigned char e[] = {
+#embed "magna-carta.txt" limit (1, ) prefix (, 2) suffix (ignore ({[) 128)
+};
+const unsigned char f[] = {
+#embed "magna-carta.txt" __limit__ (0) __if_empty__ (1, 2) __prefix__ (concat (<,>)) __suffix__ (concat (<,>))
+};
+#undef limit
+#define limit __limit__ (128) __prefix__(1,
+const unsigned char g[] = {
+#embed "magna-carta.txt" limit ) __suffix__(, 2)
+};
+
+int
+main ()
+{
+  if (sizeof (a) != 130
+      || a[0] != 1
+      || a[129] != 2
+      || sizeof (b) != 130
+      || __builtin_memcmp (a, b, 130) != 0
+      || sizeof (c) != 130
+      || __builtin_memcmp (a, c, 130) != 0
+      || sizeof (d) != 130
+      || __builtin_memcmp (a, d, 130) != 0
+      || sizeof (e) != 130
+      || __builtin_memcmp (a, e, 130) != 0
+      || sizeof (f) != 2
+      || f[0] != 1
+      || f[1] != 2
+      || sizeof (g) != 130
+      || __builtin_memcmp (a, g, 130) != 0)
+    __builtin_abort ();
+}
--- gcc/testsuite/c-c++-common/cpp/embed-29.c.jj	2024-09-03 15:51:55.921255480 +0200
+++ gcc/testsuite/c-c++-common/cpp/embed-29.c	2024-09-03 16:00:06.171004183 +0200
@@ -0,0 +1,11 @@ 
+/* { dg-do preprocess } */
+/* { dg-options "--embed-dir=${srcdir}/c-c++-common/cpp/embed-dir" } */
+/* { dg-additional-options "-std=c23" { target c } } */
+
+#define embed !
+#define limit !
+#define prefix !
+#define suffix !
+#define if_empty !
+#define limit_arg 1 ) __prefix__ (1,		/* { dg-error "closing '\\\)' comes from macro expansion" } */
+#embed "magna-carta.txt" limit (limit_arg)	/* { dg-message "in expansion of macro 'limit_arg'" } */