preprocessor: C++ module-directives

C++20 modules introduce a new kind of preprocessor directive -- a module directive. These are directives but without the leading '#'. We have to detect them by sniffing the start of a logical line. When detected, we replace the initial identifiers with unspellable tokens and pass them through to the language parser the same way deferred pragmas are. There's a PRAGMA_EOL at the logical end of line too. One additional complication is that we have to do header-name lexing after the initial tokens, and that requires changes in the macro-aware piece of the preprocessor. The above sniffer sets a counter in the lexer state, and that triggers at the appropriate point. We then do the same header-name lexing that occurs on a #include directive or has_include pseudo-macro. Except that the header name ends up in the token stream. A couple of token emitters need to deal with the new token possibility. gcc/c-family/ * c-lex.c (c_lex_with_flags): CPP_HEADER_NAMEs can now be seen. libcpp/ * include/cpplib.h (struct cpp_options): Add module_directives option. (NODE_MODULE): New node flag. (struct cpp_hashnode): Make rid-code a bitfield, increase bits in flags and swap with type field. * init.c (post_options): Create module-directive identifier nodes. * internal.h (struct lexer_state): Add directive_file_token & n_modules fields. Add module node enumerator. * lex.c (cpp_maybe_module_directive): New. (_cpp_lex_token): Call it. (cpp_output_token): Add '"' around CPP_HEADER_NAME token. (do_peek_ident, do_peek_module): New. (cpp_directive_only_process): Detect module-directive lines. * macro.c (cpp_get_token_1): Deal with directive_file_token triggering.
2020-11-18 10:24:12 -08:00 · 2020-11-18 10:24:12 -08:00 · c9c3d5f28a
parent 7ceb899e93
commit c9c3d5f28a
6 changed files with 514 additions and 4 deletions
--- a/gcc/c-family/c-lex.c
+++ b/gcc/c-family/c-lex.c
@ -667,8 +667,11 @@ c_lex_with_flags (tree *value, location_t *loc, unsigned char *cpp_flags,
      *value = build_int_cst (integer_type_node, tok->val.pragma);
      break;

-      /* These tokens should not be visible outside cpplib.  */
    case CPP_HEADER_NAME:
+      *value = build_string (tok->val.str.len, (const char *)tok->val.str.text);
+      break;
+
+      /* This token should not be visible outside cpplib.  */
    case CPP_MACRO_ARG:
      gcc_unreachable ();

--- a/libcpp/include/cpplib.h
+++ b/libcpp/include/cpplib.h
@ -487,6 +487,9 @@ struct cpp_options
  /* Nonzero for the '::' token.  */
  unsigned char scope;

+  /* Nonzero means tokenize C++20 module directives.  */
+  unsigned char module_directives;
+
  /* Holds the name of the target (execution) character set.  */
  const char *narrow_charset;

@ -842,6 +845,7 @@ struct GTY(()) cpp_macro {
 #define NODE_USED	(1 << 5)	/* Dumped with -dU.  */
 #define NODE_CONDITIONAL (1 << 6)	/* Conditional macro */
 #define NODE_WARN_OPERATOR (1 << 7)	/* Warn about C++ named operator.  */
+#define NODE_MODULE (1 << 8)		/* C++-20 module-related name.  */

 /* Different flavors of hash node.  */
 enum node_type
@ -900,11 +904,11 @@ struct GTY(()) cpp_hashnode {
  unsigned int directive_index : 7;	/* If is_directive,
 					   then index into directive table.
 					   Otherwise, a NODE_OPERATOR.  */
-  unsigned char rid_code;		/* Rid code - for front ends.  */
+  unsigned int rid_code : 8;		/* Rid code - for front ends.  */
+  unsigned int flags : 9;		/* CPP flags.  */
  ENUM_BITFIELD(node_type) type : 2;	/* CPP node type.  */
-  unsigned int flags : 8;		/* CPP flags.  */

-  /* 6 bits spare (plus another 32 on 64-bit hosts).  */
+  /* 5 bits spare (plus another 32 on 64-bit hosts).  */

  union _cpp_hashnode_value GTY ((desc ("%1.type"))) value;
 };
--- a/libcpp/init.c
+++ b/libcpp/init.c
@ -843,4 +843,27 @@ post_options (cpp_reader *pfile)
      CPP_OPTION (pfile, trigraphs) = 0;
      CPP_OPTION (pfile, warn_trigraphs) = 0;
    }
+
+  if (CPP_OPTION (pfile, module_directives))
+    {
+      /* These unspellable tokens have a trailing space.  */
+      const char *const inits[spec_nodes::M_HWM]
+	= {"export ", "module ", "import ", "__import"};
+
+      for (int ix = 0; ix != spec_nodes::M_HWM; ix++)
+	{
+	  cpp_hashnode *node = cpp_lookup (pfile, UC (inits[ix]),
+					   strlen (inits[ix]));
+
+	  /* Token we pass to the compiler.  */
+	  pfile->spec_nodes.n_modules[ix][1] = node;
+
+	  if (ix != spec_nodes::M__IMPORT)
+	    /* Token we recognize when lexing, drop the trailing ' '.  */
+	    node = cpp_lookup (pfile, NODE_NAME (node), NODE_LEN (node) - 1);
+
+	  node->flags |= NODE_MODULE;
+	  pfile->spec_nodes.n_modules[ix][0] = node;
+	}
+    }
 }
--- a/libcpp/internal.h
+++ b/libcpp/internal.h
@ -280,6 +280,9 @@ struct lexer_state
  /* Nonzero when tokenizing a deferred pragma.  */
  unsigned char in_deferred_pragma;

+  /* Count to token that is a header-name.  */
+  unsigned char directive_file_token;
+
  /* Nonzero if the deferred pragma being handled allows macro expansion.  */
  unsigned char pragma_allow_expansion;
 };
@ -292,6 +295,12 @@ struct spec_nodes
  cpp_hashnode *n_false;		/* C++ keyword false */
  cpp_hashnode *n__VA_ARGS__;		/* C99 vararg macros */
  cpp_hashnode *n__VA_OPT__;		/* C++ vararg macros */
+
+  enum {M_EXPORT, M_MODULE, M_IMPORT, M__IMPORT, M_HWM};
+  
+  /* C++20 modules, only set when module_directives is in effect.
+     incoming variants [0], outgoing ones [1] */
+  cpp_hashnode *n_modules[M_HWM][2];
 };

 typedef struct _cpp_line_note _cpp_line_note;
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@ -2615,6 +2615,150 @@ _cpp_temp_token (cpp_reader *pfile)
  return result;
 }

+/* We're at the beginning of a logical line (so not in
+  directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set.  See
+  if we should enter deferred_pragma mode to tokenize the rest of the
+  line as a module control-line.
+
+  Up to two further tokens are peeked with _cpp_lex_direct and then
+  (except for a lone end-of-line token) pushed back with
+  _cpp_backup_tokens_direct.  If the line is accepted, the leading
+  keyword tokens are flagged NO_EXPAND and rewritten to their
+  outgoing n_modules[...][1] nodes, state.directive_file_token is
+  set so that a later token can be treated as a header-name, and
+  in_deferred_pragma is left set.  Otherwise the lexer state that
+  was changed here for peeking is restored and RESULT itself is not
+  modified.  */
+
+static void
+cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
+{
+  unsigned backup = 0; /* Tokens we peeked.  */
+  cpp_hashnode *node = result->val.node.node;
+  cpp_token *peek = result;
+  cpp_token *keyword = peek;
+  cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
+  int header_count = 0;
+
+  /* Make sure the incoming state is as we expect it.  This way we
+     can restore it using constants.  */
+  gcc_checking_assert (!pfile->state.in_deferred_pragma
+		       && !pfile->state.skipping
+		       && !pfile->state.parsing_args
+		       && !pfile->state.angled_headers
+		       && (pfile->state.save_comments
+			   == !CPP_OPTION (pfile, discard_comments)));
+
+  /* Enter directives mode sufficiently for peeking.  We don't have
+     to actually set in_directive.  */
+  pfile->state.in_deferred_pragma = true;
+
+  /* These two fields are needed to process tokenization in deferred
+     pragma mode.  They are not used outside deferred pragma mode or
+     directives mode.  */
+  pfile->state.pragma_allow_expansion = true;
+  pfile->directive_line = result->src_loc;
+
+  /* Saving comments is incompatible with directives mode.   */
+  pfile->state.save_comments = 0;
+
+  if (node == n_modules[spec_nodes::M_EXPORT][0])
+    {
+      peek = _cpp_lex_direct (pfile);
+      keyword = peek;
+      backup++;
+      if (keyword->type != CPP_NAME)
+	goto not_module;
+      node = keyword->val.node.node;
+      if (!(node->flags & NODE_MODULE))
+	goto not_module;
+    }
+
+  if (node == n_modules[spec_nodes::M__IMPORT][0])
+    /* __import.  The count says how many tokens until the
+       header-name; the extra 16 tells cpp_get_token_1 not to do a
+       header search for it.  */
+    header_count = backup + 2 + 16;
+  else if (node == n_modules[spec_nodes::M_IMPORT][0])
+    /* import.  Likewise, but the header search is only skipped when
+       the input is already preprocessed.  */
+    header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
+  else if (node == n_modules[spec_nodes::M_MODULE][0])
+    ; /* module  */
+  else
+    goto not_module;
+
+  /* We've seen [export] {module|import|__import}.  Check the next token.  */
+  if (header_count)
+    /* After '{,__}import' a header name may appear.  */
+    pfile->state.angled_headers = true;
+  peek = _cpp_lex_direct (pfile);
+  backup++;
+
+  /* ... import followed by identifier, ':', '<' or
+     header-name preprocessing tokens, or module
+     followed by cpp-identifier, ':' or ';' preprocessing
+     tokens.  C++ keywords are not yet relevant.  */
+  if (peek->type == CPP_NAME
+      || peek->type == CPP_COLON
+      ||  (header_count
+	   ? (peek->type == CPP_LESS
+	      || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
+	      || peek->type == CPP_HEADER_NAME)
+	   : peek->type == CPP_SEMICOLON))
+    {
+      pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
+      if (!pfile->state.pragma_allow_expansion)
+	pfile->state.prevent_expansion++;
+
+      if (!header_count && linemap_included_from
+	  (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
+	cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
+			     "module control-line cannot be in included file");
+
+      /* The first one or two tokens cannot be macro names.  When
+	 there was no 'export', KEYWORD and RESULT are the same token
+	 and the loop runs once.  */
+      for (int ix = backup; ix--;)
+	{
+	  cpp_token *tok = ix ? keyword : result;
+	  cpp_hashnode *node = tok->val.node.node;
+
+	  /* Don't attempt to expand the token.  */
+	  tok->flags |= NO_EXPAND;
+	  if (_cpp_defined_macro_p (node)
+	      && !cpp_fun_like_macro_p (node))
+	    cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0, 
+				 "module control-line \"%s\" cannot be"
+				 " an object-like macro",
+				 NODE_NAME (node));
+	}
+
+      /* Map to the outgoing variants, n_modules[...][1].  Both
+	 'import' and '__import' map to the M_IMPORT one.  */
+      keyword->val.node.node = n_modules[header_count
+					 ? spec_nodes::M_IMPORT
+					 : spec_nodes::M_MODULE][1];
+      if (backup != 1)
+	result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
+
+      /* Maybe tell the tokenizer we expect a header-name down the
+	 road.  */
+      pfile->state.directive_file_token = header_count;
+    }
+  else
+    {
+    not_module:
+      /* Drop out of directive mode.  */
+      /* We asserted save_comments had this value upon entry.  */
+      pfile->state.save_comments
+	= !CPP_OPTION (pfile, discard_comments);
+      pfile->state.in_deferred_pragma = false;
+      /* Do not let this remain on.  */
+      pfile->state.angled_headers = false;
+    }
+
+  /* In either case we want to backup the peeked tokens.  */
+  if (backup)
+    {
+      /* If we saw EOL, we should drop it, because this isn't a module
+	 control-line after all.  */
+      bool eol = peek->type == CPP_PRAGMA_EOL;
+      if (!eol || backup > 1)
+	{
+	  /* Put the peeked tokens back.  */
+	  _cpp_backup_tokens_direct (pfile, backup);
+	  /* But if the last one was an EOL, forget it.  */
+	  if (eol)
+	    pfile->lookaheads--;
+	}
+    }
+}
+
 /* Lex a token into RESULT (external interface).  Takes care of issues
   like directive handling, token lookahead, multiple include
   optimization and skipping.  */
@ -2663,6 +2807,21 @@ _cpp_lex_token (cpp_reader *pfile)
 	    }
 	  else if (pfile->state.in_deferred_pragma)
 	    result = &pfile->directive_result;
+	  else if (result->type == CPP_NAME
+		   && (result->val.node.node->flags & NODE_MODULE)
+		   && !pfile->state.skipping
+		   /* Unlike regular directives, we do not deal with
+		      tokenizing module directives as macro arguments.
+		      That's not permitted.  */
+		   && !pfile->state.parsing_args)
+	    {
+	      /* P1857.  Before macro expansion, At start of logical
+		 line ... */
+	      /* We don't have to consider lookaheads at this point.  */
+	      gcc_checking_assert (!pfile->lookaheads);
+
+	      cpp_maybe_module_directive (pfile, result);
+	    }

 	  if (pfile->cb.line_change && !pfile->state.skipping)
 	    pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
@ -3461,7 +3620,11 @@ cpp_output_token (const cpp_token *token, FILE *fp)
      break;

    case SPELL_LITERAL:
+      if (token->type == CPP_HEADER_NAME)
+	fputc ('"', fp);
      fwrite (token->val.str.text, 1, token->val.str.len, fp);
+      if (token->type == CPP_HEADER_NAME)
+	fputc ('"', fp);
      break;

    case SPELL_NONE:
@ -3947,6 +4110,188 @@ do_peek_prev (const unsigned char *peek, const unsigned char *bound)
    return peek;
 }

+/* If the identifier MATCH starts at PEEK[-1], scan past it and
+   trailing white space.  Otherwise return NULL.
+   MATCH[0] is not compared here -- the loop starts with MATCH[1]
+   against PEEK[0] -- so the caller must already have matched it.
+   On a mismatch do_peek_next is given one chance to advance PEEK
+   (presumably over an escaped newline -- that helper is not shown
+   here, confirm) before giving up.  LIMIT is only passed through to
+   the helpers.  On success the result points just past the
+   identifier and any following spaces, tabs and whatever
+   do_peek_backslash skips.  */
+
+static const unsigned char *
+do_peek_ident (const char *match, const unsigned char *peek,
+	       const unsigned char *limit)
+{
+  for (; *++match; peek++)
+    if (*peek != *match)
+      {
+	peek = do_peek_next (peek, limit);
+	if (*peek != *match)
+	  return NULL;
+      }
+
+  /* Must now not be looking at an identifier char.  */
+  peek = do_peek_next (peek, limit);
+  if (ISIDNUM (*peek))
+    return NULL;
+
+  /* Skip control-line whitespace.  If do_peek_backslash moved us off
+     the backslash, go round again for more whitespace.  */
+ ws:
+  while (*peek == ' ' || *peek == '\t')
+    peek++;
+  if (__builtin_expect (*peek == '\\', false))
+    {
+      peek = do_peek_backslash (peek, limit);
+      if (*peek != '\\')
+	goto ws;
+    }
+
+  return peek;
+}
+
+/* Are we looking at a module control line starting at PEEK - 1?
+   C is the first character of the line's first identifier, which
+   the caller has already consumed (so it sits at PEEK[-1]); it
+   selects which of 'export', 'import', '__import' or 'module' to
+   try.  LIMIT is handed on to the do_peek_* helpers.  Returns true
+   only if the keyword (or 'export' followed by 'import' or
+   'module') is itself followed by a character that could begin a
+   token permitted at that point.  This is a character-level sniff,
+   not a tokenization.  */
+
+static bool
+do_peek_module (cpp_reader *pfile, unsigned char c,
+		const unsigned char *peek, const unsigned char *limit)
+{
+  bool import = false;
+
+  if (__builtin_expect (c == 'e', false))
+    {
+      if (!((peek[0] == 'x' || peek[0] == '\\')
+	    && (peek = do_peek_ident ("export", peek, limit))))
+	return false;
+
+      /* export, peek for import or module.  No need to peek __import
+	 here.  */
+      if (peek[0] == 'i')
+	{
+	  if (!((peek[1] == 'm' || peek[1] == '\\')
+		&& (peek = do_peek_ident ("import", peek + 1, limit))))
+	    return false;
+	  import = true;
+	}
+      else if (peek[0] == 'm')
+	{
+	  if (!((peek[1] == 'o' || peek[1] == '\\')
+		&& (peek = do_peek_ident ("module", peek + 1, limit))))
+	    return false;
+	}
+      else
+	return false;
+    }
+  else if (__builtin_expect (c == 'i', false))
+    {
+      if (!((peek[0] == 'm' || peek[0] == '\\')
+	    && (peek = do_peek_ident ("import", peek, limit))))
+	return false;
+      import = true;
+    }
+  else if (__builtin_expect (c == '_', false))
+    {
+      /* Needed for translated includes.   */
+      if (!((peek[0] == '_' || peek[0] == '\\')
+	    && (peek = do_peek_ident ("__import", peek, limit))))
+	return false;
+      import = true;
+    }
+  else if (__builtin_expect (c == 'm', false))
+    {
+      if (!((peek[0] == 'o' || peek[0] == '\\')
+	    && (peek = do_peek_ident ("module", peek, limit))))
+	return false;
+    }
+  else
+    return false;
+
+  /* Peek the next character to see if it's good enough.  We'll be at
+     the first non-whitespace char, including skipping an escaped
+     newline.  */
+  /* ... import followed by identifier, ':', '<' or header-name
+     preprocessing tokens, or module followed by identifier, ':' or
+     ';' preprocessing tokens.  */
+  unsigned char p = *peek++;
+      
+  /* A character literal is ... single quotes, ... optionally preceded
+     by u8, u, U, or L */
+  /* A string-literal is a ... double quotes, optionally prefixed by
+     R, u8, u8R, u, uR, U, UR, L, or LR */
+  /* So a leading u, U, L or R is only acceptable when it turns out to
+     begin an identifier rather than a literal prefix.  */
+  if (p == 'u')
+    {
+      peek = do_peek_next (peek, limit);
+      if (*peek == '8')
+	{
+	  peek++;
+	  goto peek_u8;
+	}
+      goto peek_u;
+    }
+  else if (p == 'U' || p == 'L')
+    {
+    peek_u8:
+      peek = do_peek_next (peek, limit);
+    peek_u:
+      if (*peek == '\"' || *peek == '\'')
+	return false;
+
+      if (*peek == 'R')
+	goto peek_R;
+      /* Identifier. Ok.  */
+    }
+  else if (p == 'R')
+    {
+    peek_R:
+      if (CPP_OPTION (pfile, rliterals))
+	{
+	  peek = do_peek_next (peek, limit);
+	  if (*peek == '\"')
+	    return false;
+	}
+      /* Identifier. Ok.  */
+    }
+  else if ('Z' - 'A' == 25
+	   ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
+	   : ISIDST (p))
+    {
+      /* Identifier.  Ok. */
+    }
+  else if (p == '<')
+    {
+      /* Maybe angle header, ok for import.  Reject
+	 '<=', '<<' digraph:'<:'.  */
+      if (!import)
+	return false;
+      peek = do_peek_next (peek, limit);
+      if (*peek == '=' || *peek == '<'
+	  || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
+	return false;
+    }
+  else if (p == ';')
+    {
+      /* SEMICOLON, ok for module.  */
+      if (import)
+	return false;
+    }
+  else if (p == '"')
+    {
+      /* STRING, ok for import.  */
+      if (!import)
+	return false;
+    }
+  else if (p == ':')
+    {
+      /* Maybe COLON, ok.  Reject '::', digraph:':>'.  */
+      peek = do_peek_next (peek, limit);
+      if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
+	return false;
+    }
+  else
+    /* FIXME: Detect a unicode character, excluding those not
+       permitted as the initial character. [lex.name]/1.  I presume
+       we need to check the \[uU] spellings, and directly using
+       Unicode in say UTF8 form?  Or perhaps we do the phase-1
+       conversion of UTF8 to universal-character-names?  */
+    return false;
+
+  return true;
+}
+
 /* Directives-only scanning.  Somewhat more relaxed than correct
   parsing -- some ill-formed programs will not be rejected.  */

@ -3955,6 +4300,8 @@ cpp_directive_only_process (cpp_reader *pfile,
 			    void *data,
 			    void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
 {
+  bool module_p = CPP_OPTION (pfile, module_directives);
+
  do
    {
    restart:
@ -4347,6 +4694,51 @@ cpp_directive_only_process (cpp_reader *pfile,
 	      }
 	      goto dflt;

+	    case '_':
+	    case 'e':
+	    case 'i':
+	    case 'm':
+	      if (bol && module_p && !pfile->state.skipping
+		  && do_peek_module (pfile, c, pos, limit))
+		{
+		  /* We've seen the start of a module control line.
+		     Start up the tokenizer.  */
+		  pos--; /* Backup over the first character.  */
+
+		  /* Backup over whitespace to start of line.  */
+		  while (pos > line_start
+			 && (pos[-1] == ' ' || pos[-1] == '\t'))
+		    pos--;
+
+		  if (pos > base)
+		    cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
+
+		  /* Prep things for directive handling. */
+		  buffer->next_line = pos;
+		  buffer->need_line = true;
+
+		  /* Now get tokens until the PRAGMA_EOL.  */
+		  do
+		    {
+		      location_t spelling;
+		      const cpp_token *tok
+			= cpp_get_token_with_location (pfile, &spelling);
+
+		      gcc_assert (pfile->state.in_deferred_pragma
+				  || tok->type == CPP_PRAGMA_EOL);
+		      cb (pfile, CPP_DO_token, data, tok, spelling);
+		    }
+		  while (pfile->state.in_deferred_pragma);
+
+		  if (pfile->buffer->next_line < pfile->buffer->rlimit)
+		    cb (pfile, CPP_DO_location, data,
+			pfile->line_table->highest_line);
+
+		  pfile->mi_valid = false;
+		  goto restart;
+		}
+	      goto dflt;
+
 	    default:
 	    dflt:
 	      bol = false;
--- a/libcpp/macro.c
+++ b/libcpp/macro.c
@ -2963,6 +2963,85 @@ cpp_get_token_1 (cpp_reader *pfile, location_t *location)
    }

  pfile->about_to_expand_macro_p = saved_about_to_expand_macro;
+
+  if (pfile->state.directive_file_token
+      && !pfile->state.parsing_args
+      && !(result->type == CPP_PADDING || result->type == CPP_COMMENT)
+      && !(15 & --pfile->state.directive_file_token))
+    {
+      /* Do header-name frobbery.  Concatenate < ... > as appropriate.
+	 Do header search if needed, and finally drop the outer <> or
+	 "".  */
+      pfile->state.angled_headers = false;
+
+      /* Do angle-header reconstitution.  Then do include searching.
+	 We'll always end up with a ""-quoted header-name in that
+	 case.  If searching finds nothing, we emit a diagnostic and
+	 an empty string.  */
+      size_t len = 0;
+      char *fname = NULL;
+
+      cpp_token *tmp = _cpp_temp_token (pfile);
+      *tmp = *result;
+
+      tmp->type = CPP_HEADER_NAME;
+      bool need_search = !pfile->state.directive_file_token;
+      pfile->state.directive_file_token = 0;
+
+      bool angle = result->type != CPP_STRING;
+      if (result->type == CPP_HEADER_NAME
+	  || (result->type == CPP_STRING && result->val.str.text[0] != 'R'))
+	{
+	  len = result->val.str.len - 2;
+	  fname = XNEWVEC (char, len + 1);
+	  memcpy (fname, result->val.str.text + 1, len);
+	  fname[len] = 0;
+	}
+      else if (result->type == CPP_LESS)
+	fname = _cpp_bracket_include (pfile);
+
+      if (fname)
+	{
+	  /* We have a header-name.  Look it up.  This will emit an
+	     unfound diagnostic.  Canonicalize the found name.  */
+	  const char *found = fname;
+
+	  if (need_search)
+	    {
+	      found = cpp_find_header_unit (pfile, fname, angle, tmp->src_loc);
+	      if (!found)
+		found = "";
+	      len = strlen (found);
+	    }
+	  /* Force a leading './' if it's not absolute.  */
+	  bool dotme = (found[0] == '.' ? !IS_DIR_SEPARATOR (found[1])
+			: found[0] && !IS_ABSOLUTE_PATH (found));
+
+	  if (BUFF_ROOM (pfile->u_buff) < len + 1 + dotme * 2)
+	    _cpp_extend_buff (pfile, &pfile->u_buff, len + 1 + dotme * 2);
+	  unsigned char *buf = BUFF_FRONT (pfile->u_buff);
+	  size_t pos = 0;
+	      
+	  if (dotme)
+	    {
+	      buf[pos++] = '.';
+	      /* Apparently '/' is unconditional.  */
+	      buf[pos++] = '/';
+	    }
+	  memcpy (&buf[pos], found, len);
+	  pos += len;
+	  buf[pos] = 0;
+
+	  tmp->val.str.len = pos;
+	  tmp->val.str.text = buf;
+
+	  tmp->type = CPP_HEADER_NAME;
+	  XDELETEVEC (fname);
+	  
+	  result = tmp;
+	}
+    }
+
  return result;
 }