diff mbox series

[04/32] cpp lexer

Message ID d10060ca-ec44-4b34-e10a-ef37ee90efe9@acm.org
State New
Headers show
Series C++ 20 Modules | expand

Commit Message

Nathan Sidwell Nov. 3, 2020, 9:13 p.m. UTC
c++ modules creates 2 new kinds of preprocessor lines
[export] module ...
[export] import ...

To all intents and purposes these are cpp directives spelt without a 
leading '#'.  module and import are context-sensitive keywords.  Thus 
preprocessor tokenizing needs a bit of token peeking.  This is that peeking.

We have a new node flag 'NODE_MODULE', which marks whether an identifier 
is significant to this peeking.  When we see such an identifier at the 
beginning of a logical line, we need to peek further and figure out 
whether these are those keywords.

When successfully peeked, we replace the identifiers with internal-only 
tokens that the c++ parser recognizes.

Comments

Joseph Myers Nov. 3, 2020, 11:08 p.m. UTC | #1
On Tue, 3 Nov 2020, Nathan Sidwell wrote:

> @@ -888,9 +915,9 @@ struct GTY(()) cpp_hashnode {
>    unsigned int directive_index : 7;	/* If is_directive,
>  					   then index into directive table.
>  					   Otherwise, a NODE_OPERATOR.  */
> -  unsigned char rid_code;		/* Rid code - for front ends.  */
> +  unsigned int rid_code : 8;		/* Rid code - for front ends.  */
> +  unsigned int flags : 9;		/* CPP flags.  */
>    ENUM_BITFIELD(node_type) type : 2;	/* CPP node type.  */
> -  unsigned int flags : 8;		/* CPP flags.  */
>  
>    /* 6 bits spare (plus another 32 on 64-bit hosts).  */

I'd expect this "6 bits spare" comment to be updated when expanding the 
flags field.
Nathan Sidwell Nov. 3, 2020, 11:21 p.m. UTC | #2
On 11/3/20 6:08 PM, Joseph Myers wrote:
> On Tue, 3 Nov 2020, Nathan Sidwell wrote:
> 
>> @@ -888,9 +915,9 @@ struct GTY(()) cpp_hashnode {
>>     unsigned int directive_index : 7;	/* If is_directive,
>>   					   then index into directive table.
>>   					   Otherwise, a NODE_OPERATOR.  */
>> -  unsigned char rid_code;		/* Rid code - for front ends.  */
>> +  unsigned int rid_code : 8;		/* Rid code - for front ends.  */
>> +  unsigned int flags : 9;		/* CPP flags.  */
>>     ENUM_BITFIELD(node_type) type : 2;	/* CPP node type.  */
>> -  unsigned int flags : 8;		/* CPP flags.  */
>>   
>>     /* 6 bits spare (plus another 32 on 64-bit hosts).  */
> 
> I'd expect this "6 bits spare" comment to be updated when expanding the
> flags field.

ah, that's an error on my part separating two close pieces of this diff. 
  It does in fact say that

nathan
Jeff Law Nov. 6, 2020, 8:23 p.m. UTC | #3
On 11/3/20 2:13 PM, Nathan Sidwell wrote:
> c++ modules creates 2 new kinds of preprocessor lines
> [export] module ...
> [export] import ...
>
> To all intents and purposes these are cpp directives spelt without a
> leading '#'.  module and import are context-sensitive keywords.  Thus
> preprocessor tokenizing needs a bit of token peeking.  This is that
> peeking.
>
> We have a new node flag 'NODE_MODULE', which marks whether an
> identifier is significant to this peeking.  When we see such an
> identifier at the beginning of a logical line, we need to peek further
> and figure out whether these are those keywords.
>
> When successfully peeked, we replace the identifiers with
> internal-only tokens that the c++ parser recognizes.
>
>
> 04-cpp-lexer.diff
>
> diff --git c/libcpp/include/cpplib.h w/libcpp/include/cpplib.h
> index 8e398863cf6..81be6457951 100644
> --- c/libcpp/include/cpplib.h
> +++ w/libcpp/include/cpplib.h
>
> @@ -888,9 +915,9 @@ struct GTY(()) cpp_hashnode {
>    unsigned int directive_index : 7;	/* If is_directive,
>  					   then index into directive table.
>  					   Otherwise, a NODE_OPERATOR.  */
> -  unsigned char rid_code;		/* Rid code - for front ends.  */
> +  unsigned int rid_code : 8;		/* Rid code - for front ends.  */
> +  unsigned int flags : 9;		/* CPP flags.  */
>    ENUM_BITFIELD(node_type) type : 2;	/* CPP node type.  */
> -  unsigned int flags : 8;		/* CPP flags.  */
>  
>    /* 6 bits spare (plus another 32 on 64-bit hosts).  */

Someone already mentioned it, but the # of spare bits needs updating.


>  
> diff --git c/libcpp/lex.c w/libcpp/lex.c
> index fb222924c8c..b3498f195bf 100644
> --- c/libcpp/lex.c
> +++ w/libcpp/lex.c
> @@ -2606,6 +2622,131 @@ _cpp_temp_token (cpp_reader *pfile)
>    return result;
>  }
>  
> +/* RESULT is a CPP_NAME with NODE_MODULE set.  See if we should enter
> +   deferred_pragma mode to tokenize the rest of the line.  */
> +
> +static void
> +cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
> +{
> +  unsigned backup = 0; /* Tokens we peeked.  */
> +  cpp_hashnode *node = result->val.node.node;
> +  cpp_token *peek = result;
> +  cpp_token *keyword = peek;
> +  cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
> +  int header_count = 0;
> +
> +  /* Enter directives mode for the peeking.  */
> +  pfile->state.in_deferred_pragma = true;
> +  pfile->state.pragma_allow_expansion = true;
> +  pfile->state.save_comments = 0;
> +  pfile->directive_line = result->src_loc;
It looks like you slam in known values when you drop out of directive
mode rather than restoring the original values.  That may be OK, I'm not
all that familiar with this code to know if that's correct or not.
> +
> +  if (__builtin_expect (node == n_modules[spec_nodes::M__IMPORT][0], false))
> +    /* __import  */
> +    header_count = backup + 2 + 16;
> +  else if (__builtin_expect (node == n_modules[spec_nodes::M_IMPORT][0], false))
> +    /* import  */
> +    header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
> +  else if (__builtin_expect (node == n_modules[spec_nodes::M_MODULE][0], false))
> +    ; /* module  */
> +  else
> +    goto not_module;

Are the __builtin_expects really useful here?  If not I'd prefer to
avoid them and just let the predictors do their job.  Similarly 
elsewhere in this patch.


> +    {
> +    not_module:
> +      /* Drop out of directive mode.  */
> +      pfile->state.save_comments
> +	= !CPP_OPTION (pfile, discard_comments);
> +      pfile->state.in_deferred_pragma = false;
> +      pfile->state.angled_headers = false;

This doesn't seem to match the code to go into directives mode all that
well.  You don't reset pragma_allow_expansion or directive_line.


Jeff
Nathan Sidwell Nov. 6, 2020, 9:06 p.m. UTC | #4
On 11/6/20 3:23 PM, Jeff Law wrote:

>>
>> 04-cpp-lexer.diff
>>
>> diff --git c/libcpp/include/cpplib.h w/libcpp/include/cpplib.h
>> index 8e398863cf6..81be6457951 100644
>> --- c/libcpp/include/cpplib.h
>> +++ w/libcpp/include/cpplib.h
>>
>> @@ -888,9 +915,9 @@ struct GTY(()) cpp_hashnode {
>>     unsigned int directive_index : 7;	/* If is_directive,
>>   					   then index into directive table.
>>   					   Otherwise, a NODE_OPERATOR.  */
>> -  unsigned char rid_code;		/* Rid code - for front ends.  */
>> +  unsigned int rid_code : 8;		/* Rid code - for front ends.  */
>> +  unsigned int flags : 9;		/* CPP flags.  */
>>     ENUM_BITFIELD(node_type) type : 2;	/* CPP node type.  */
>> -  unsigned int flags : 8;		/* CPP flags.  */
>>   
>>     /* 6 bits spare (plus another 32 on 64-bit hosts).  */
> 
> Someone already mentioned it, but the # of spare bits needs updating.

yeah, and it was me manually editing a patch wot flubbed it.

>> +  /* Enter directives mode for the peeking.  */
>> +  pfile->state.in_deferred_pragma = true;
>> +  pfile->state.pragma_allow_expansion = true;
>> +  pfile->state.save_comments = 0;
>> +  pfile->directive_line = result->src_loc;
> It looks like you slam in known values when you drop out of directive 
> mode rather than restoring the original values.  That may be OK, I'm not 
> all that familiar with this code to know if that's correct or not.

Looks like comments would be useful, and maybe asserts.  You only get to 
call the peeking when you were in a known (non-directives) state.  I 
wanted the peeking to be as cheap as possible, so saving and restoring 
did not seem needed.


>> +
>> +  if (__builtin_expect (node == n_modules[spec_nodes::M__IMPORT][0], false))
>> +    /* __import  */
>> +    header_count = backup + 2 + 16;
>> +  else if (__builtin_expect (node == n_modules[spec_nodes::M_IMPORT][0], false))
>> +    /* import  */
>> +    header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
>> +  else if (__builtin_expect (node == n_modules[spec_nodes::M_MODULE][0], false))
>> +    ; /* module  */
>> +  else
>> +    goto not_module;
> 
> Are the __builtin_expects really useful here?  If not I'd prefer to 
> avoid them and just let the predictors do their job. Similarly  
> elsewhere in this patch.

I'll investigate.

>> +    not_module:
>> +      /* Drop out of directive mode.  */
>> +      pfile->state.save_comments
>> +	= !CPP_OPTION (pfile, discard_comments);
>> +      pfile->state.in_deferred_pragma = false;
>> +      pfile->state.angled_headers = false;
> 
> This doesn't seem to match the code to go into directives mode all that 
> well.  You don't reset pragma_allow_expansion or directive_line.

Hm, yeah.  I think neither of those fields have any bearing in 
non-directives mode, and always get set when entering it?  Again, 
comment at the very least.

nathan
Boris Kolpackov Nov. 9, 2020, 6:15 a.m. UTC | #5
Nathan Sidwell <nathan@acm.org> writes:

> > This doesn't seem to match the code to go into directives mode all that 
> > well.  You don't reset pragma_allow_expansion or directive_line.
> 
> Hm, yeah.  I think neither of those fields have any bearing in 
> non-directives mode, and always get set when entering it?  Again, 
> comment at the very least.

FWIW, I remember wrestling with that logic in my branch, here are the
changes to directives-only.c (in particular, see the "Handle import 
as pseudo-directive (P1703R0)" commit):

https://github.com/boris-kolpackov/gcc-cxx-modules-ex/commits/c%2B%2B-modules-ex/libcpp/directives-only.c
Nathan Sidwell Nov. 9, 2020, 1:54 p.m. UTC | #6
Jeff,
here is an updated patch with changelog.  I've added checking_asserts 
and comments for the state changes you were concerned about.

While the __builtin_expects do make a change to generated code, you are 
probably right that they are not significant and I have removed them -- 
cpplib tends to sprinkle them liberally and I guess I got infected.

I trust this addresses your concerns.

nathan
diff mbox series

Patch

diff --git c/libcpp/include/cpplib.h w/libcpp/include/cpplib.h
index 8e398863cf6..81be6457951 100644
--- c/libcpp/include/cpplib.h
+++ w/libcpp/include/cpplib.h
@@ -487,6 +494,9 @@  struct cpp_options
   /* Nonzero for the '::' token.  */
   unsigned char scope;
 
+  /* Nonzero means tokenize C++20 module directives.  */
+  unsigned char module_directives;
+
   /* Holds the name of the target (execution) character set.  */
   const char *narrow_charset;
 
@@ -831,6 +857,7 @@  struct GTY(()) cpp_macro {
 #define NODE_USED	(1 << 5)	/* Dumped with -dU.  */
 #define NODE_CONDITIONAL (1 << 6)	/* Conditional macro */
 #define NODE_WARN_OPERATOR (1 << 7)	/* Warn about C++ named operator.  */
+#define NODE_MODULE (1 << 8)		/* C++-20 module-related name.  */
 
 /* Different flavors of hash node.  */
 enum node_type
@@ -888,9 +915,9 @@  struct GTY(()) cpp_hashnode {
   unsigned int directive_index : 7;	/* If is_directive,
 					   then index into directive table.
 					   Otherwise, a NODE_OPERATOR.  */
-  unsigned char rid_code;		/* Rid code - for front ends.  */
+  unsigned int rid_code : 8;		/* Rid code - for front ends.  */
+  unsigned int flags : 9;		/* CPP flags.  */
   ENUM_BITFIELD(node_type) type : 2;	/* CPP node type.  */
-  unsigned int flags : 8;		/* CPP flags.  */
 
   /* 6 bits spare (plus another 32 on 64-bit hosts).  */
 
diff --git c/libcpp/lex.c w/libcpp/lex.c
index fb222924c8c..b3498f195bf 100644
--- c/libcpp/lex.c
+++ w/libcpp/lex.c
@@ -2606,6 +2622,131 @@  _cpp_temp_token (cpp_reader *pfile)
   return result;
 }
 
+/* RESULT is a CPP_NAME with NODE_MODULE set.  See if we should enter
+   deferred_pragma mode to tokenize the rest of the line.  */
+
+static void
+cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
+{
+  unsigned backup = 0; /* Tokens we peeked.  */
+  cpp_hashnode *node = result->val.node.node;
+  cpp_token *peek = result;
+  cpp_token *keyword = peek;
+  cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
+  int header_count = 0;
+
+  /* Enter directives mode for the peeking.  */
+  pfile->state.in_deferred_pragma = true;
+  pfile->state.pragma_allow_expansion = true;
+  pfile->state.save_comments = 0;
+  pfile->directive_line = result->src_loc;
+
+  if (node == n_modules[spec_nodes::M_EXPORT][0])
+    {
+      peek = _cpp_lex_direct (pfile);
+      keyword = peek;
+      backup++;
+      if (keyword->type != CPP_NAME)
+	goto not_module;
+      node = keyword->val.node.node;
+      if (!(node->flags & NODE_MODULE))
+	goto not_module;
+    }
+
+  if (__builtin_expect (node == n_modules[spec_nodes::M__IMPORT][0], false))
+    /* __import  */
+    header_count = backup + 2 + 16;
+  else if (__builtin_expect (node == n_modules[spec_nodes::M_IMPORT][0], false))
+    /* import  */
+    header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
+  else if (__builtin_expect (node == n_modules[spec_nodes::M_MODULE][0], false))
+    ; /* module  */
+  else
+    goto not_module;
+
+  /* We've seen [export] {module|import|__import}.  Check the next token.  */
+  if (header_count)
+    /* After '{,__}import' a header name may appear.  */
+    pfile->state.angled_headers = true;
+  peek = _cpp_lex_direct (pfile);
+  backup++;
+
+  /* ... import followed by identifier, ':', '<' or
+     header-name preprocessing tokens, or module
+     followed by cpp-identifier, ':' or ';' preprocessing
+     tokens.  C++ keywords are not yet relevant.  */
+  if (peek->type == CPP_NAME
+      || peek->type == CPP_COLON
+      ||  (header_count
+	   ? (peek->type == CPP_LESS
+	      || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
+	      || peek->type == CPP_HEADER_NAME)
+	   : peek->type == CPP_SEMICOLON))
+    {
+      pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
+      if (!pfile->state.pragma_allow_expansion)
+	pfile->state.prevent_expansion++;
+
+      if (!header_count && linemap_included_from
+	  (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
+	cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
+			     "module control-line cannot be in included file");
+
+      /* The first one or two tokens cannot be macro names.  */
+      for (int ix = backup; ix--;)
+	{
+	  cpp_token *tok = ix ? keyword : result;
+	  cpp_hashnode *node = tok->val.node.node;
+
+	  /* Don't attempt to expand the token.  */
+	  tok->flags |= NO_EXPAND;
+	  if (_cpp_defined_macro_p (node)
+	      && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
+	      && !cpp_fun_like_macro_p (node))
+	    cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0, 
+				 "module control-line \"%s\" cannot be"
+				 " an object-like macro",
+				 NODE_NAME (node));
+	}
+
+      /* Map to underbar variants.  */
+      keyword->val.node.node = n_modules[header_count
+					 ? spec_nodes::M_IMPORT
+					 : spec_nodes::M_MODULE][1];
+      if (backup != 1)
+	result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
+
+      /* Maybe tell the tokenizer we expect a header-name down the
+	 road.  */
+      pfile->state.directive_file_token = header_count;
+    }
+  else
+    {
+    not_module:
+      /* Drop out of directive mode.  */
+      pfile->state.save_comments
+	= !CPP_OPTION (pfile, discard_comments);
+      pfile->state.in_deferred_pragma = false;
+      pfile->state.angled_headers = false;
+    }
+
+  /* In either case we want to backup the peeked tokens.  */
+  if (backup)
+    {
+      /* If we saw EOL, we should drop it, because this isn't a module
+	 control-line after all.  */
+      bool eol = peek->type == CPP_PRAGMA_EOL;
+      if (!eol || backup > 1)
+	{
+	  /* Put the peeked tokens back.  */
+	  _cpp_backup_tokens_direct (pfile, backup);
+	  /* But if the last one was an EOL, forget it.  */
+	  if (eol)
+	    pfile->lookaheads--;
+	}
+    }
+}
+
 /* Lex a token into RESULT (external interface).  Takes care of issues
    like directive handling, token lookahead, multiple include
    optimization and skipping.  */
@@ -2654,6 +2795,22 @@  _cpp_lex_token (cpp_reader *pfile)
 	    }
 	  else if (pfile->state.in_deferred_pragma)
 	    result = &pfile->directive_result;
+	  else if (result->type == CPP_NAME
+		   && __builtin_expect
+		   (result->val.node.node->flags & NODE_MODULE, 0)
+		   && !pfile->state.skipping
+		   /* Unlike regular directives, we do not deal with
+		      tokenizing module directives as macro arguments.
+		      That's not permitted.  */
+		   && !pfile->state.parsing_args)
+	    {
+	      /* P1857.  Before macro expansion, At start of logical
+		 line ... */
+	      /* We don't have to consider lookaheads at this point.  */
+	      gcc_checking_assert (!pfile->lookaheads);
+
+	      cpp_maybe_module_directive (pfile, result);
+	    }
 
 	  if (pfile->cb.line_change && !pfile->state.skipping)
 	    pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
@@ -3446,7 +3609,11 @@  cpp_output_token (const cpp_token *token, FILE *fp)
       break;
 
     case SPELL_LITERAL:
+      if (token->type == CPP_HEADER_NAME)
+	fputc ('"', fp);
       fwrite (token->val.str.text, 1, token->val.str.len, fp);
+      if (token->type == CPP_HEADER_NAME)
+	fputc ('"', fp);
       break;
 
     case SPELL_NONE:
@@ -3932,6 +4099,188 @@  do_peek_prev (const unsigned char *peek, const unsigned char *bound)
     return peek;
 }
 
+/* If PEEK[-1] is identifier MATCH, scan past it and trailing white
+   space.  Otherwise return NULL.  */
+
+static const unsigned char *
+do_peek_ident (const char *match, const unsigned char *peek,
+	       const unsigned char *limit)
+{
+  for (; *++match; peek++)
+    if (*peek != *match)
+      {
+	peek = do_peek_next (peek, limit);
+	if (*peek != *match)
+	  return NULL;
+      }
+
+  /* Must now not be looking at an identifier char.  */
+  peek = do_peek_next (peek, limit);
+  if (ISIDNUM (*peek))
+    return NULL;
+
+  /* Skip control-line whitespace.  */
+ ws:
+  while (*peek == ' ' || *peek == '\t')
+    peek++;
+  if (__builtin_expect (*peek == '\\', false))
+    {
+      peek = do_peek_backslash (peek, limit);
+      if (*peek != '\\')
+	goto ws;
+    }
+
+  return peek;
+}
+
+/* Are we looking at a module control line starting at PEEK - 1?  */
+
+static bool
+do_peek_module (cpp_reader *pfile, unsigned char c,
+		const unsigned char *peek, const unsigned char *limit)
+{
+  bool import = false;
+
+  if (__builtin_expect (c == 'e', false))
+    {
+      if (!((peek[0] == 'x' || peek[0] == '\\')
+	    && (peek = do_peek_ident ("export", peek, limit))))
+	return false;
+
+      /* export, peek for import or module.  No need to peek __import
+	 here.  */
+      if (peek[0] == 'i')
+	{
+	  if (!((peek[1] == 'm' || peek[1] == '\\')
+		&& (peek = do_peek_ident ("import", peek + 1, limit))))
+	    return false;
+	  import = true;
+	}
+      else if (peek[0] == 'm')
+	{
+	  if (!((peek[1] == 'o' || peek[1] == '\\')
+		&& (peek = do_peek_ident ("module", peek + 1, limit))))
+	    return false;
+	}
+      else
+	return false;
+    }
+  else if (__builtin_expect (c == 'i', false))
+    {
+      if (!((peek[0] == 'm' || peek[0] == '\\')
+	    && (peek = do_peek_ident ("import", peek, limit))))
+	return false;
+      import = true;
+    }
+  else if (__builtin_expect (c == '_', false))
+    {
+      /* Needed for translated includes.   */
+      if (!((peek[0] == '_' || peek[0] == '\\')
+	    && (peek = do_peek_ident ("__import", peek, limit))))
+	return false;
+      import = true;
+    }
+  else if (__builtin_expect (c == 'm', false))
+    {
+      if (!((peek[0] == 'o' || peek[0] == '\\')
+	    && (peek = do_peek_ident ("module", peek, limit))))
+	return false;
+    }
+  else
+    return false;
+
+  /* Peek the next character to see if it's good enough.  We'll be at
+     the first non-whitespace char, including skipping an escaped
+     newline.  */
+  /* ... import followed by identifier, ':', '<' or header-name
+     preprocessing tokens, or module followed by identifier, ':' or
+     ';' preprocessing tokens.  */
+  unsigned char p = *peek++;
+      
+  /* A character literal is ... single quotes, ... optionally preceded
+     by u8, u, U, or L */
+  /* A string-literal is a ... double quotes, optionally prefixed by
+     R, u8, u8R, u, uR, U, UR, L, or LR */
+  if (p == 'u')
+    {
+      peek = do_peek_next (peek, limit);
+      if (*peek == '8')
+	{
+	  peek++;
+	  goto peek_u8;
+	}
+      goto peek_u;
+    }
+  else if (p == 'U' || p == 'L')
+    {
+    peek_u8:
+      peek = do_peek_next (peek, limit);
+    peek_u:
+      if (*peek == '\"' || *peek == '\'')
+	return false;
+
+      if (*peek == 'R')
+	goto peek_R;
+      /* Identifier. Ok.  */
+    }
+  else if (p == 'R')
+    {
+    peek_R:
+      if (CPP_OPTION (pfile, rliterals))
+	{
+	  peek = do_peek_next (peek, limit);
+	  if (*peek == '\"')
+	    return false;
+	}
+      /* Identifier. Ok.  */
+    }
+  else if ('Z' - 'A' == 25
+	   ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
+	   : ISIDST (p))
+    {
+      /* Identifier.  Ok. */
+    }
+  else if (p == '<')
+    {
+      /* Maybe angle header, ok for import.  Reject
+	 '<=', '<<' digraph:'<:'.  */
+      if (!import)
+	return false;
+      peek = do_peek_next (peek, limit);
+      if (*peek == '=' || *peek == '<'
+	  || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
+	return false;
+    }
+  else if (p == ';')
+    {
+      /* SEMICOLON, ok for module.  */
+      if (import)
+	return false;
+    }
+  else if (p == '"')
+    {
+      /* STRING, ok for import.  */
+      if (!import)
+	return false;
+    }
+  else if (p == ':')
+    {
+      /* Maybe COLON, ok.  Reject '::', digraph:':>'.  */
+      peek = do_peek_next (peek, limit);
+      if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
+	return false;
+    }
+  else
+    /* FIXME: Detect a unicode character, excluding those not
+       permitted as the initial character. [lex.name]/1.  I presume
+       we need to check the \[uU] spellings, and directly using
+       Unicode in say UTF8 form?  Or perhaps we do the phase-1
+       conversion of UTF8 to universal-character-names?  */
+    return false;
+
+  return true;
+}
+
 /* Directives-only scanning.  Somewhat more relaxed than correct
    parsing -- some ill-formed programs will not be rejected.  */
 
@@ -3940,6 +4289,8 @@  cpp_directive_only_process (cpp_reader *pfile,
 			    void *data,
 			    void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
 {
+  bool module_p = CPP_OPTION (pfile, module_directives);
+
   do
     {
     restart:
@@ -4332,6 +4683,51 @@  cpp_directive_only_process (cpp_reader *pfile,
 	      }
 	      goto dflt;
 
+	    case '_':
+	    case 'e':
+	    case 'i':
+	    case 'm':
+	      if (bol && module_p && !pfile->state.skipping
+		  && do_peek_module (pfile, c, pos, limit))
+		{
+		  /* We've seen the start of a module control line.
+		     Start up the tokenizer.  */
+		  pos--; /* Backup over the first character.  */
+
+		  /* Backup over whitespace to start of line.  */
+		  while (pos > line_start
+			 && (pos[-1] == ' ' || pos[-1] == '\t'))
+		    pos--;
+
+		  if (pos > base)
+		    cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
+
+		  /* Prep things for directive handling. */
+		  buffer->next_line = pos;
+		  buffer->need_line = true;
+
+		  /* Now get tokens until the PRAGMA_EOL.  */
+		  do
+		    {
+		      location_t spelling;
+		      const cpp_token *tok
+			= cpp_get_token_with_location (pfile, &spelling);
+
+		      gcc_assert (pfile->state.in_deferred_pragma
+				  || tok->type == CPP_PRAGMA_EOL);
+		      cb (pfile, CPP_DO_token, data, tok, spelling);
+		    }
+		  while (pfile->state.in_deferred_pragma);
+
+		  if (pfile->buffer->next_line < pfile->buffer->rlimit)
+		    cb (pfile, CPP_DO_location, data,
+			pfile->line_table->highest_line);
+
+		  pfile->mi_valid = false;
+		  goto restart;
+		}
+	      goto dflt;
+
 	    default:
 	    dflt:
 	      bol = false;
diff --git c/libcpp/macro.c w/libcpp/macro.c
index e304f67c2e0..f5f280dfdc7 100644
--- c/libcpp/macro.c
+++ w/libcpp/macro.c
@@ -2930,6 +2932,85 @@  cpp_get_token_1 (cpp_reader *pfile, location_t *location)
     }
 
   pfile->about_to_expand_macro_p = saved_about_to_expand_macro;
+
+  if (pfile->state.directive_file_token
+      && !pfile->state.parsing_args
+      && !(result->type == CPP_PADDING || result->type == CPP_COMMENT)
+      && !(15 & --pfile->state.directive_file_token))
+    {
+      /* Do header-name frobbery.  Concatenate < ... > as appropriate.
+	 Do header search if needed, and finally drop the outer <> or
+	 "".  */
+      pfile->state.angled_headers = false;
+
+      /* Do angle-header reconstitution.  Then do include searching.
+	 We'll always end up with a ""-quoted header-name in that
+	 case.  If searching finds nothing, we emit a diagnostic and
+	 an empty string.  */
+      size_t len = 0;
+      char *fname = NULL;
+
+      cpp_token *tmp = _cpp_temp_token (pfile);
+      *tmp = *result;
+
+      tmp->type = CPP_HEADER_NAME;
+      bool need_search = !pfile->state.directive_file_token;
+      pfile->state.directive_file_token = 0;
+
+      bool angle = result->type != CPP_STRING;
+      if (result->type == CPP_HEADER_NAME
+	  || (result->type == CPP_STRING && result->val.str.text[0] != 'R'))
+	{
+	  len = result->val.str.len - 2;
+	  fname = XNEWVEC (char, len + 1);
+	  memcpy (fname, result->val.str.text + 1, len);
+	  fname[len] = 0;
+	}
+      else if (result->type == CPP_LESS)
+	fname = _cpp_bracket_include (pfile);
+
+      if (fname)
+	{
+	  /* We have a header-name.  Look it up.  This will emit an
+	     unfound diagnostic.  Canonicalize the found name.  */
+	  const char *found = fname;
+
+	  if (need_search)
+	    {
+	      found = cpp_find_header_unit (pfile, fname, angle, tmp->src_loc);
+	      if (!found)
+		found = "";
+	      len = strlen (found);
+	    }
+	  /* Force a leading './' if it's not absolute.  */
+	  bool dotme = (found[0] == '.' ? !IS_DIR_SEPARATOR (found[1])
+			: found[0] && !IS_ABSOLUTE_PATH (found));
+
+	  if (BUFF_ROOM (pfile->u_buff) < len + 1 + dotme * 2)
+	    _cpp_extend_buff (pfile, &pfile->u_buff, len + 1 + dotme * 2);
+	  unsigned char *buf = BUFF_FRONT (pfile->u_buff);
+	  size_t pos = 0;
+	      
+	  if (dotme)
+	    {
+	      buf[pos++] = '.';
+	      /* Apparently '/' is unconditional.  */
+	      buf[pos++] = '/';
+	    }
+	  memcpy (&buf[pos], found, len);
+	  pos += len;
+	  buf[pos] = 0;
+
+	  tmp->val.str.len = pos;
+	  tmp->val.str.text = buf;
+
+	  tmp->type = CPP_HEADER_NAME;
+	  XDELETEVEC (fname);
+	  
+	  result = tmp;
+	}
+    }
+
   return result;
 }
 
diff --git c/gcc/c-family/c-lex.c w/gcc/c-family/c-lex.c
index e81e16ddc26..44575473719 100644
--- c/gcc/c-family/c-lex.c
+++ w/gcc/c-family/c-lex.c
@@ -654,8 +656,11 @@  c_lex_with_flags (tree *value, location_t *loc, unsigned char *cpp_flags,
       *value = build_int_cst (integer_type_node, tok->val.pragma);
       break;
 
-      /* These tokens should not be visible outside cpplib.  */
     case CPP_HEADER_NAME:
+      *value = build_string (tok->val.str.len, (const char *)tok->val.str.text);
+      break;
+
+      /* These tokens should not be visible outside cpplib.  */
     case CPP_MACRO_ARG:
       gcc_unreachable ();
 
diff --git c/libcpp/init.c w/libcpp/init.c
index 6c52f50de39..96ade569457 100644
--- c/libcpp/init.c
+++ w/libcpp/init.c
@@ -840,4 +855,27 @@  post_options (cpp_reader *pfile)
       CPP_OPTION (pfile, trigraphs) = 0;
       CPP_OPTION (pfile, warn_trigraphs) = 0;
     }
+
+  if (CPP_OPTION (pfile, module_directives))
+    {
+      /* These unspellable tokens have a trailing space.  */
+      const char *const inits[spec_nodes::M_HWM]
+	= {"export ", "module ", "import ", "__import"};
+
+      for (int ix = 0; ix != spec_nodes::M_HWM; ix++)
+	{
+	  cpp_hashnode *node = cpp_lookup (pfile, UC (inits[ix]),
+					   strlen (inits[ix]));
+
+	  /* Token we pass to the compiler.  */
+	  pfile->spec_nodes.n_modules[ix][1] = node;
+
+	  if (ix != spec_nodes::M__IMPORT)
+	    /* Token we recognize when lexing, drop the trailing ' '.  */
+	    node = cpp_lookup (pfile, NODE_NAME (node), NODE_LEN (node) - 1);
+
+	  node->flags |= NODE_MODULE;
+	  pfile->spec_nodes.n_modules[ix][0] = node;
+	}
+    }
 }
diff --git c/libcpp/internal.h w/libcpp/internal.h
index 4759961a33a..17b65601b66 100644
--- c/libcpp/internal.h
+++ w/libcpp/internal.h
@@ -280,6 +280,9 @@  struct lexer_state
   /* Nonzero when tokenizing a deferred pragma.  */
   unsigned char in_deferred_pragma;
 
+  /* Count to token that is a header-name.  */
+  unsigned char directive_file_token;
+
   /* Nonzero if the deferred pragma being handled allows macro expansion.  */
   unsigned char pragma_allow_expansion;
 };
@@ -292,6 +295,12 @@  struct spec_nodes
   cpp_hashnode *n_false;		/* C++ keyword false */
   cpp_hashnode *n__VA_ARGS__;		/* C99 vararg macros */
   cpp_hashnode *n__VA_OPT__;		/* C++ vararg macros */
+
+  enum {M_EXPORT, M_MODULE, M_IMPORT, M__IMPORT, M_HWM};
+  
+  /* C++20 modules, only set when module_directives is in effect.
+     incoming variants [0], outgoing ones [1] */
+  cpp_hashnode *n_modules[M_HWM][2];
 };
 
 typedef struct _cpp_line_note _cpp_line_note;