preprocessor: C++ module-directives

C++20 modules introduce a new kind of preprocessor directive -- a
module directive.  These are directives, but without the leading '#'.
We have to detect them by sniffing the start of a logical line.  When
detected we replace the initial identifiers with unspellable tokens
and pass them through to the language parser the same way deferred
pragmas are.  There's a PRAGMA_EOL at the logical end of line too.

One additional complication is that we have to do header-name lexing
after the initial tokens, and that requires changes in the macro-aware
piece of the preprocessor.  The above sniffer sets a counter in the
lexer state, and that triggers at the appropriate point.  We then do
the same header-name lexing that occurs on a #include directive or
__has_include pseudo-macro, except that the header name ends up in the
token stream.

A couple of token emitters need to deal with the new token possibility.

	gcc/c-family/
	* c-lex.c (c_lex_with_flags): CPP_HEADER_NAMEs can now be seen.
	libcpp/
	* include/cpplib.h (struct cpp_options): Add module_directives
	option.
	(NODE_MODULE): New node flag.
	(struct cpp_hashnode): Make rid-code a bitfield, increase bits in
	flags and swap with type field.
	* init.c (post_options): Create module-directive identifier nodes.
	* internal.h (struct lexer_state): Add directive_file_token &
	n_modules fields.  Add module node enumerator.
	* lex.c (cpp_maybe_module_directive): New.
	(_cpp_lex_token): Call it.
	(cpp_output_token): Add '"' around CPP_HEADER_NAME token.
	(do_peek_ident, do_peek_module): New.
	(cpp_directive_only_process): Detect module-directive lines.
	* macro.c (cpp_get_token_1): Deal with directive_file_token
	triggering.
This commit is contained in:
Nathan Sidwell 2020-11-18 10:24:12 -08:00
parent 7ceb899e93
commit c9c3d5f28a
6 changed files with 514 additions and 4 deletions

View File

@ -667,8 +667,11 @@ c_lex_with_flags (tree *value, location_t *loc, unsigned char *cpp_flags,
*value = build_int_cst (integer_type_node, tok->val.pragma);
break;
/* These tokens should not be visible outside cpplib. */
case CPP_HEADER_NAME:
*value = build_string (tok->val.str.len, (const char *)tok->val.str.text);
break;
/* This token should not be visible outside cpplib. */
case CPP_MACRO_ARG:
gcc_unreachable ();

View File

@ -487,6 +487,9 @@ struct cpp_options
/* Nonzero for the '::' token. */
unsigned char scope;
/* Nonzero means tokenize C++20 module directives. */
unsigned char module_directives;
/* Holds the name of the target (execution) character set. */
const char *narrow_charset;
@ -842,6 +845,7 @@ struct GTY(()) cpp_macro {
#define NODE_USED (1 << 5) /* Dumped with -dU. */
#define NODE_CONDITIONAL (1 << 6) /* Conditional macro */
#define NODE_WARN_OPERATOR (1 << 7) /* Warn about C++ named operator. */
#define NODE_MODULE (1 << 8) /* C++-20 module-related name. */
/* Different flavors of hash node. */
enum node_type
@ -900,11 +904,11 @@ struct GTY(()) cpp_hashnode {
unsigned int directive_index : 7; /* If is_directive,
then index into directive table.
Otherwise, a NODE_OPERATOR. */
unsigned char rid_code; /* Rid code - for front ends. */
unsigned int rid_code : 8; /* Rid code - for front ends. */
unsigned int flags : 9; /* CPP flags. */
ENUM_BITFIELD(node_type) type : 2; /* CPP node type. */
unsigned int flags : 8; /* CPP flags. */
/* 6 bits spare (plus another 32 on 64-bit hosts). */
/* 5 bits spare (plus another 32 on 64-bit hosts). */
union _cpp_hashnode_value GTY ((desc ("%1.type"))) value;
};

View File

@ -843,4 +843,27 @@ post_options (cpp_reader *pfile)
CPP_OPTION (pfile, trigraphs) = 0;
CPP_OPTION (pfile, warn_trigraphs) = 0;
}
if (CPP_OPTION (pfile, module_directives))
{
/* The unspellable outgoing tokens have a trailing space; "__import" is used as-is. */
const char *const inits[spec_nodes::M_HWM]
= {"export ", "module ", "import ", "__import"};
for (int ix = 0; ix != spec_nodes::M_HWM; ix++)
{
cpp_hashnode *node = cpp_lookup (pfile, UC (inits[ix]),
strlen (inits[ix]));
/* Token we pass to the compiler. */
pfile->spec_nodes.n_modules[ix][1] = node;
if (ix != spec_nodes::M__IMPORT)
/* Token we recognize when lexing, drop the trailing ' '. */
node = cpp_lookup (pfile, NODE_NAME (node), NODE_LEN (node) - 1);
node->flags |= NODE_MODULE;
pfile->spec_nodes.n_modules[ix][0] = node;
}
}
}

View File

@ -280,6 +280,9 @@ struct lexer_state
/* Nonzero when tokenizing a deferred pragma. */
unsigned char in_deferred_pragma;
/* Count to token that is a header-name. */
unsigned char directive_file_token;
/* Nonzero if the deferred pragma being handled allows macro expansion. */
unsigned char pragma_allow_expansion;
};
@ -292,6 +295,12 @@ struct spec_nodes
cpp_hashnode *n_false; /* C++ keyword false */
cpp_hashnode *n__VA_ARGS__; /* C99 vararg macros */
cpp_hashnode *n__VA_OPT__; /* C++ vararg macros */
enum {M_EXPORT, M_MODULE, M_IMPORT, M__IMPORT, M_HWM};
/* C++20 modules, only set when module_directives is in effect.
incoming variants [0], outgoing ones [1] */
cpp_hashnode *n_modules[M_HWM][2];
};
typedef struct _cpp_line_note _cpp_line_note;

View File

@ -2615,6 +2615,150 @@ _cpp_temp_token (cpp_reader *pfile)
return result;
}
/* We're at the beginning of a logical line (so not in
directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set. See
if we should enter deferred_pragma mode to tokenize the rest of the
line as a module control-line.

PFILE is the reader whose lexer state is inspected and updated.
RESULT is the first token of the line; it is modified in place when
the line is accepted as a module control-line.

The function peeks one or two further tokens with _cpp_lex_direct.
If the line is accepted, the keyword tokens are marked NO_EXPAND and
remapped to the outgoing n_modules[...][1] nodes, in_deferred_pragma
is left set, and state.directive_file_token is set to HEADER_COUNT.
If the line is rejected, save_comments, in_deferred_pragma and
angled_headers are put back to the values asserted on entry.  In
both cases the peeked tokens are backed up so the caller sees them
again. */
static void
cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
{
unsigned backup = 0; /* Tokens we peeked. */
cpp_hashnode *node = result->val.node.node;
cpp_token *peek = result;
/* The token naming the directive: RESULT itself, or the token after
a leading 'export'. */
cpp_token *keyword = peek;
cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
/* Zero for 'module'.  For the import forms, the low bits count the
tokens (starting at RESULT) up to and including the one that may be
a header-name; 16 is added for '__import' and for 'import' in
already-preprocessed input.
NOTE(review): the consumer of directive_file_token (cpp_get_token_1)
appears to treat the extra 16 as "no header search needed" --
confirm. */
int header_count = 0;
/* Make sure the incoming state is as we expect it. This way we
can restore it using constants. */
gcc_checking_assert (!pfile->state.in_deferred_pragma
&& !pfile->state.skipping
&& !pfile->state.parsing_args
&& !pfile->state.angled_headers
&& (pfile->state.save_comments
== !CPP_OPTION (pfile, discard_comments)));
/* Enter directives mode sufficiently for peeking. We don't have
to actually set in_directive. */
pfile->state.in_deferred_pragma = true;
/* These two fields are needed to process tokenization in deferred
pragma mode. They are not used outside deferred pragma mode or
directives mode. */
pfile->state.pragma_allow_expansion = true;
pfile->directive_line = result->src_loc;
/* Saving comments is incompatible with directives mode. */
pfile->state.save_comments = 0;
if (node == n_modules[spec_nodes::M_EXPORT][0])
{
/* 'export' must be followed by another module-related name; peek
it and treat it as the keyword. */
peek = _cpp_lex_direct (pfile);
keyword = peek;
backup++;
if (keyword->type != CPP_NAME)
goto not_module;
node = keyword->val.node.node;
if (!(node->flags & NODE_MODULE))
goto not_module;
}
if (node == n_modules[spec_nodes::M__IMPORT][0])
/* __import */
header_count = backup + 2 + 16;
else if (node == n_modules[spec_nodes::M_IMPORT][0])
/* import */
header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
else if (node == n_modules[spec_nodes::M_MODULE][0])
; /* module */
else
/* Also reached for 'export export'. */
goto not_module;
/* We've seen [export] {module|import|__import}. Check the next token. */
if (header_count)
/* After '{,__}import' a header name may appear. */
pfile->state.angled_headers = true;
peek = _cpp_lex_direct (pfile);
backup++;
/* ... import followed by identifier, ':', '<' or
header-name preprocessing tokens, or module
followed by cpp-identifier, ':' or ';' preprocessing
tokens. C++ keywords are not yet relevant. */
if (peek->type == CPP_NAME
|| peek->type == CPP_COLON
|| (header_count
? (peek->type == CPP_LESS
|| (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
|| peek->type == CPP_HEADER_NAME)
: peek->type == CPP_SEMICOLON))
{
/* It is a module control-line.  Macro expansion is only allowed
when the input has not already been preprocessed. */
pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
if (!pfile->state.pragma_allow_expansion)
pfile->state.prevent_expansion++;
/* Only the 'module' form (header_count == 0) is diagnosed inside
an included file. */
if (!header_count && linemap_included_from
(LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
"module control-line cannot be in included file");
/* The first one or two tokens cannot be macro names. */
/* BACKUP is 1 or 2 here: index 0 is RESULT and, when 'export' was
seen, index 1 is KEYWORD. */
for (int ix = backup; ix--;)
{
cpp_token *tok = ix ? keyword : result;
cpp_hashnode *node = tok->val.node.node;
/* Don't attempt to expand the token. */
tok->flags |= NO_EXPAND;
if (_cpp_defined_macro_p (node)
&& !cpp_fun_like_macro_p (node))
cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
"module control-line \"%s\" cannot be"
" an object-like macro",
NODE_NAME (node));
}
/* Map to the outgoing variants (index [1]) that are passed on to
the compiler.  Both 'import' and '__import' map to M_IMPORT. */
keyword->val.node.node = n_modules[header_count
? spec_nodes::M_IMPORT
: spec_nodes::M_MODULE][1];
/* BACKUP is 2 exactly when a leading 'export' was consumed. */
if (backup != 1)
result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
/* Maybe tell the tokenizer we expect a header-name down the
road. */
pfile->state.directive_file_token = header_count;
}
else
{
not_module:
/* Drop out of directive mode. */
/* We asserted save_comments had this value upon entry. */
pfile->state.save_comments
= !CPP_OPTION (pfile, discard_comments);
pfile->state.in_deferred_pragma = false;
/* Do not let this remain on. */
pfile->state.angled_headers = false;
}
/* In either case we want to backup the peeked tokens. */
if (backup)
{
/* If we saw EOL, we should drop it, because this isn't a module
control-line after all. */
bool eol = peek->type == CPP_PRAGMA_EOL;
if (!eol || backup > 1)
{
/* Put the peeked tokens back. */
_cpp_backup_tokens_direct (pfile, backup);
/* But if the last one was an EOL, forget it. */
if (eol)
pfile->lookaheads--;
}
}
}
/* Lex a token into RESULT (external interface). Takes care of issues
like directive handling, token lookahead, multiple include
optimization and skipping. */
@ -2663,6 +2807,21 @@ _cpp_lex_token (cpp_reader *pfile)
}
else if (pfile->state.in_deferred_pragma)
result = &pfile->directive_result;
else if (result->type == CPP_NAME
&& (result->val.node.node->flags & NODE_MODULE)
&& !pfile->state.skipping
/* Unlike regular directives, we do not deal with
tokenizing module directives as macro arguments.
That's not permitted. */
&& !pfile->state.parsing_args)
{
/* P1857. Before macro expansion, At start of logical
line ... */
/* We don't have to consider lookaheads at this point. */
gcc_checking_assert (!pfile->lookaheads);
cpp_maybe_module_directive (pfile, result);
}
if (pfile->cb.line_change && !pfile->state.skipping)
pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
@ -3461,7 +3620,11 @@ cpp_output_token (const cpp_token *token, FILE *fp)
break;
case SPELL_LITERAL:
if (token->type == CPP_HEADER_NAME)
fputc ('"', fp);
fwrite (token->val.str.text, 1, token->val.str.len, fp);
if (token->type == CPP_HEADER_NAME)
fputc ('"', fp);
break;
case SPELL_NONE:
@ -3947,6 +4110,188 @@ do_peek_prev (const unsigned char *peek, const unsigned char *bound)
return peek;
}
/* The first character of identifier MATCH has already been seen at
   PEEK[-1].  Verify that the remaining characters of MATCH follow,
   that the identifier ends there, and then step over any blanks and
   tabs after it.  LIMIT is handed unchanged to the peeking helpers.
   Returns the position after that whitespace, or NULL if the text is
   not exactly MATCH.  */
static const unsigned char *
do_peek_ident (const char *match, const unsigned char *peek,
	       const unsigned char *limit)
{
  /* Pre-increment: MATCH[0] was consumed by the caller.  */
  while (*++match)
    {
      if (*peek != *match)
	{
	  /* Give do_peek_next one chance to move us onto the expected
	     character (presumably by skipping a line continuation --
	     confirm against its definition).  */
	  peek = do_peek_next (peek, limit);
	  if (*peek != *match)
	    return NULL;
	}
      peek++;
    }

  /* The identifier must stop here; a further identifier character
     means we matched only a prefix of a longer name.  */
  peek = do_peek_next (peek, limit);
  if (ISIDNUM (*peek))
    return NULL;

  /* Skip control-line whitespace, resuming after each backslash
     sequence that do_peek_backslash manages to step over.  */
  for (;;)
    {
      while (*peek == ' ' || *peek == '\t')
	peek++;

      if (!__builtin_expect (*peek == '\\', false))
	break;

      peek = do_peek_backslash (peek, limit);
      if (*peek == '\\')
	/* Still at a backslash: nothing was skipped, so stop here.  */
	break;
    }

  return peek;
}
/* Are we looking at a module control line starting at PEEK - 1?

C is the character at PEEK[-1], which the caller has already
consumed; PEEK points at the character after it and LIMIT is passed
through to the peeking helpers.  PFILE supplies the rliterals and
digraphs options.

Returns true when the line starts with [export] {import|module} or
with __import, and the first character after the keyword(s) could
begin a token allowed in that position.  This is a character-level
sniff only; no tokens are created. */
static bool
do_peek_module (cpp_reader *pfile, unsigned char c,
const unsigned char *peek, const unsigned char *limit)
{
/* Set for the import forms, which accept different following tokens
than 'module' does. */
bool import = false;
/* In each test below the second character is checked cheaply before
calling do_peek_ident.  A backslash is let through as well,
presumably because it may start a line continuation inside the
identifier -- do_peek_ident makes the real decision. */
if (__builtin_expect (c == 'e', false))
{
if (!((peek[0] == 'x' || peek[0] == '\\')
&& (peek = do_peek_ident ("export", peek, limit))))
return false;
/* export, peek for import or module. No need to peek __import
here. */
if (peek[0] == 'i')
{
if (!((peek[1] == 'm' || peek[1] == '\\')
&& (peek = do_peek_ident ("import", peek + 1, limit))))
return false;
import = true;
}
else if (peek[0] == 'm')
{
if (!((peek[1] == 'o' || peek[1] == '\\')
&& (peek = do_peek_ident ("module", peek + 1, limit))))
return false;
}
else
return false;
}
else if (__builtin_expect (c == 'i', false))
{
if (!((peek[0] == 'm' || peek[0] == '\\')
&& (peek = do_peek_ident ("import", peek, limit))))
return false;
import = true;
}
else if (__builtin_expect (c == '_', false))
{
/* Needed for translated includes. */
if (!((peek[0] == '_' || peek[0] == '\\')
&& (peek = do_peek_ident ("__import", peek, limit))))
return false;
import = true;
}
else if (__builtin_expect (c == 'm', false))
{
if (!((peek[0] == 'o' || peek[0] == '\\')
&& (peek = do_peek_ident ("module", peek, limit))))
return false;
}
else
return false;
/* Peek the next character to see if it's good enough. We'll be at
the first non-whitespace char, including skipping an escaped
newline. */
/* ... import followed by identifier, ':', '<' or header-name
preprocessing tokens, or module followed by identifier, ':' or
';' preprocessing tokens. */
unsigned char p = *peek++;
/* A character literal is ... single quotes, ... optionally preceded
by u8, u, U, or L */
/* A string-literal is a ... double quotes, optionally prefixed by
R, u8, u8R, u, uR, U, UR, L, or LR */
/* The branches below reject those literal prefixes; anything else
that starts with these letters is an ordinary identifier.  Note the
labels are shared: the 'u' branch jumps into the 'U'/'L' branch,
which in turn may jump into the 'R' branch. */
if (p == 'u')
{
peek = do_peek_next (peek, limit);
if (*peek == '8')
{
/* Saw "u8"; continue as for 'U'/'L', looking at what follows. */
peek++;
goto peek_u8;
}
/* Saw plain 'u'; PEEK is already at the following character. */
goto peek_u;
}
else if (p == 'U' || p == 'L')
{
peek_u8:
peek = do_peek_next (peek, limit);
peek_u:
/* An encoding prefix directly followed by a quote is a string or
character literal, not an identifier. */
if (*peek == '\"' || *peek == '\'')
return false;
if (*peek == 'R')
goto peek_R;
/* Identifier. Ok. */
}
else if (p == 'R')
{
peek_R:
/* R followed by '"' is only a raw string when raw literals are
enabled. */
if (CPP_OPTION (pfile, rliterals))
{
peek = do_peek_next (peek, limit);
if (*peek == '\"')
return false;
}
/* Identifier. Ok. */
}
/* When the letters are contiguous in the character set, use direct
range checks; otherwise fall back to ISIDST. */
else if ('Z' - 'A' == 25
? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
: ISIDST (p))
{
/* Identifier. Ok. */
}
else if (p == '<')
{
/* Maybe angle header, ok for import. Reject
'<=', '<<' digraph:'<:'. */
if (!import)
return false;
peek = do_peek_next (peek, limit);
if (*peek == '=' || *peek == '<'
|| (*peek == ':' && CPP_OPTION (pfile, digraphs)))
return false;
}
else if (p == ';')
{
/* SEMICOLON, ok for module. */
if (import)
return false;
}
else if (p == '"')
{
/* STRING, ok for import. */
if (!import)
return false;
}
else if (p == ':')
{
/* Maybe COLON, ok. Reject '::', digraph:':>'. */
peek = do_peek_next (peek, limit);
if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
return false;
}
else
/* FIXME: Detect a unicode character, excluding those not
permitted as the initial character. [lex.name]/1. I presume
we need to check the \[uU] spellings, and directly using
Unicode in say UTF8 form? Or perhaps we do the phase-1
conversion of UTF8 to universal-character-names? */
return false;
return true;
}
/* Directives-only scanning. Somewhat more relaxed than correct
parsing -- some ill-formed programs will not be rejected. */
@ -3955,6 +4300,8 @@ cpp_directive_only_process (cpp_reader *pfile,
void *data,
void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
{
bool module_p = CPP_OPTION (pfile, module_directives);
do
{
restart:
@ -4347,6 +4694,51 @@ cpp_directive_only_process (cpp_reader *pfile,
}
goto dflt;
case '_':
case 'e':
case 'i':
case 'm':
if (bol && module_p && !pfile->state.skipping
&& do_peek_module (pfile, c, pos, limit))
{
/* We've seen the start of a module control line.
Start up the tokenizer. */
pos--; /* Backup over the first character. */
/* Backup over whitespace to start of line. */
while (pos > line_start
&& (pos[-1] == ' ' || pos[-1] == '\t'))
pos--;
if (pos > base)
cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
/* Prep things for directive handling. */
buffer->next_line = pos;
buffer->need_line = true;
/* Now get tokens until the PRAGMA_EOL. */
do
{
location_t spelling;
const cpp_token *tok
= cpp_get_token_with_location (pfile, &spelling);
gcc_assert (pfile->state.in_deferred_pragma
|| tok->type == CPP_PRAGMA_EOL);
cb (pfile, CPP_DO_token, data, tok, spelling);
}
while (pfile->state.in_deferred_pragma);
if (pfile->buffer->next_line < pfile->buffer->rlimit)
cb (pfile, CPP_DO_location, data,
pfile->line_table->highest_line);
pfile->mi_valid = false;
goto restart;
}
goto dflt;
default:
dflt:
bol = false;

View File

@ -2963,6 +2963,85 @@ cpp_get_token_1 (cpp_reader *pfile, location_t *location)
}
pfile->about_to_expand_macro_p = saved_about_to_expand_macro;
if (pfile->state.directive_file_token
&& !pfile->state.parsing_args
&& !(result->type == CPP_PADDING || result->type == CPP_COMMENT)
&& !(15 & --pfile->state.directive_file_token))
{
/* Do header-name frobbery. Concatenate < ... > as appropriate.
Do header search if needed, and finally drop the outer <> or
"". */
pfile->state.angled_headers = false;
/* Do angle-header reconstitution. Then do include searching.
We'll always end up with a ""-quoted header-name in that
case. If searching finds nothing, we emit a diagnostic and
an empty string. */
size_t len = 0;
char *fname = NULL;
cpp_token *tmp = _cpp_temp_token (pfile);
*tmp = *result;
tmp->type = CPP_HEADER_NAME;
bool need_search = !pfile->state.directive_file_token;
pfile->state.directive_file_token = 0;
bool angle = result->type != CPP_STRING;
if (result->type == CPP_HEADER_NAME
|| (result->type == CPP_STRING && result->val.str.text[0] != 'R'))
{
len = result->val.str.len - 2;
fname = XNEWVEC (char, len + 1);
memcpy (fname, result->val.str.text + 1, len);
fname[len] = 0;
}
else if (result->type == CPP_LESS)
fname = _cpp_bracket_include (pfile);
if (fname)
{
/* We have a header-name. Look it up. This will emit an
unfound diagnostic. Canonicalize the found name. */
const char *found = fname;
if (need_search)
{
found = cpp_find_header_unit (pfile, fname, angle, tmp->src_loc);
if (!found)
found = "";
len = strlen (found);
}
/* Force a leading './' if it's not absolute. */
bool dotme = (found[0] == '.' ? !IS_DIR_SEPARATOR (found[1])
: found[0] && !IS_ABSOLUTE_PATH (found));
if (BUFF_ROOM (pfile->u_buff) < len + 1 + dotme * 2)
_cpp_extend_buff (pfile, &pfile->u_buff, len + 1 + dotme * 2);
unsigned char *buf = BUFF_FRONT (pfile->u_buff);
size_t pos = 0;
if (dotme)
{
buf[pos++] = '.';
/* Apparently '/' is unconditional. */
buf[pos++] = '/';
}
memcpy (&buf[pos], found, len);
pos += len;
buf[pos] = 0;
tmp->val.str.len = pos;
tmp->val.str.text = buf;
tmp->type = CPP_HEADER_NAME;
XDELETEVEC (fname);
result = tmp;
}
}
return result;
}