libcpp: Implement C++23 P1949R7 - C++ Identifier Syntax using Unicode Standard Annex 31

The following patch implements the paper
P1949R7 - C++ Identifier Syntax using Unicode Standard Annex 31.
We already allow UTF-8 characters in the source, so that part is
already implemented.  IMHO all that remains is to pedwarn instead of
just warn for the (default) -Wnormalize=nfc (or for -Wnormalize={id,nfkc})
if the character is not in NFC, and to use the Unicode XID_Start and
XID_Continue derived core properties to find out which characters are
allowed (the standard actually adds U+005F to XID_Start, but we are
handling the ASCII compatible characters differently already and they
aren't allowed in UCNs in identifiers).  Instead of hardcoding the large
tables in ucnid.tab, this patch makes makeucnid.c read them from the
Unicode tables (13.0.0 version at this point).

For non-pedantic mode, we accept as the 2nd and later characters of an
identifier the union of the characters valid in all supported modes, but
for the 1st character we were actually pedantically rejecting any
character that may not appear as the first character in the currently
chosen standard.  This patch changes that, so that what is allowed at the
start of an identifier is likewise the union of the characters valid at
the start of an identifier in any of the pedantic modes.

2021-09-01  Jakub Jelinek  <jakub@redhat.com>

	PR c++/100977
libcpp/
	* include/cpplib.h (struct cpp_options): Add cxx23_identifiers.
	* charset.c (CXX23, NXX23): New enumerators.
	(CID, NFC, NKC, CTX): Renumber.
	(ucn_valid_in_identifier): Implement P1949R7 - use CXX23 and
	NXX23 flags for cxx23_identifiers.  For start character in
	non-pedantic mode, allow characters that are allowed as start
	characters in any of the supported language modes, rather than
	disallowing characters allowed only as non-start characters in
	current mode but for characters from other language modes allowing
	them even if they are never allowed at start.
	* init.c (struct lang_flags): Add cxx23_identifiers.
	(lang_defaults): Add cxx23_identifiers column.
	(cpp_set_lang): Initialize CPP_OPTION (pfile, cxx23_identifiers).
	* lex.c (warn_about_normalization): If cxx23_identifiers, use
	cpp_pedwarning_with_line instead of cpp_warning_with_line for
	"is not in NFC" diagnostics.
	* makeucnid.c: Adjust usage comment.
	(CXX23, NXX23): New enumerators.
	(all_languages): Add CXX23.
	(not_NFC, not_NFKC, maybe_not_NFC): Renumber.
	(read_derivedcore): New function.
	(write_table): Print also CXX23 and NXX23 columns.
	(main): Require 5 arguments instead of 4, call read_derivedcore.
	* ucnid.h: Regenerated using Unicode 13.0.0 files.
gcc/testsuite/
	* g++.dg/cpp23/normalize1.C: New test.
	* g++.dg/cpp23/normalize2.C: New test.
	* g++.dg/cpp23/normalize3.C: New test.
	* g++.dg/cpp23/normalize4.C: New test.
	* g++.dg/cpp23/normalize5.C: New test.
	* g++.dg/cpp23/normalize6.C: New test.
	* g++.dg/cpp23/normalize7.C: New test.
	* g++.dg/cpp23/ucnid-1-utf8.C: New test.
	* g++.dg/cpp23/ucnid-2-utf8.C: New test.
	* gcc.dg/cpp/ucnid-4.c: Don't expect
	"not valid at the start of an identifier" errors.
	* gcc.dg/cpp/ucnid-4-utf8.c: Likewise.
	* gcc.dg/cpp/ucnid-5-utf8.c: New test.
This commit is contained in:
Jakub Jelinek 2021-09-01 22:33:06 +02:00
parent 852fdc23a2
commit c4d6dcacfc
18 changed files with 3221 additions and 1800 deletions

View File

@ -0,0 +1,66 @@
// { dg-do preprocess { target { c++11 && { ! c++23 } } } }
// { dg-options "" }
\u00AA
\u00B7
\u0F43 // { dg-warning "not in NFC" }
a\u05B8\u05B9\u05B9\u05BBb
a\u05BB\u05B9\u05B8\u05B9b // { dg-warning "not in NFC" }
\u09CB
\u09C7\u09BE // { dg-warning "not in NFC" }
\u0B4B
\u0B47\u0B3E // { dg-warning "not in NFC" }
\u0BCA
\u0BC6\u0BBE // { dg-warning "not in NFC" }
\u0BCB
\u0BC7\u0BBE // { dg-warning "not in NFC" }
\u0CCA
\u0CC6\u0CC2 // { dg-warning "not in NFC" }
\u0D4A
\u0D46\u0D3E // { dg-warning "not in NFC" }
\u0D4B
\u0D47\u0D3E // { dg-warning "not in NFC" }
K
\u212A // { dg-warning "not in NFC" }
\u03AC
\u1F71 // { dg-warning "not in NFC" }
\uAC00
\u1100\u1161 // { dg-warning "not in NFC" }
\uAC01
\u1100\u1161\u11A8 // { dg-warning "not in NFC" }
\uAC00\u11A8 // { dg-warning "not in NFC" }
ª
·
// { dg-warning "not in NFC" }
aָֹֹֻb
aָֹֹֻb // { dg-warning "not in NFC" }
// { dg-warning "not in NFC" }
// { dg-warning "not in NFC" }
// { dg-warning "not in NFC" }
// { dg-warning "not in NFC" }
// { dg-warning "not in NFC" }
// { dg-warning "not in NFC" }
// { dg-warning "not in NFC" }
K
// { dg-warning "not in NFC" }
ά
// { dg-warning "not in NFC" }
// { dg-warning "not in NFC" }
// { dg-warning "not in NFC" }
// { dg-warning "not in NFC" }

View File

@ -0,0 +1,66 @@
// { dg-do preprocess { target { c++23 } } }
// { dg-options "" }
\u00AA
\u00B7
\u0F43 // { dg-warning "not in NFC" }
a\u05B8\u05B9\u05B9\u05BBb
a\u05BB\u05B9\u05B8\u05B9b // { dg-warning "not in NFC" }
\u09CB
\u09C7\u09BE // { dg-warning "not in NFC" }
\u0B4B
\u0B47\u0B3E // { dg-warning "not in NFC" }
\u0BCA
\u0BC6\u0BBE // { dg-warning "not in NFC" }
\u0BCB
\u0BC7\u0BBE // { dg-warning "not in NFC" }
\u0CCA
\u0CC6\u0CC2 // { dg-warning "not in NFC" }
\u0D4A
\u0D46\u0D3E // { dg-warning "not in NFC" }
\u0D4B
\u0D47\u0D3E // { dg-warning "not in NFC" }
K
\u212A // { dg-warning "not in NFC" }
\u03AC
\u1F71 // { dg-warning "not in NFC" }
\uAC00
\u1100\u1161 // { dg-warning "not in NFC" }
\uAC01
\u1100\u1161\u11A8 // { dg-warning "not in NFC" }
\uAC00\u11A8 // { dg-warning "not in NFC" }
ª
·
// { dg-warning "not in NFC" }
aָֹֹֻb
aָֹֹֻb // { dg-warning "not in NFC" }
// { dg-warning "not in NFC" }
// { dg-warning "not in NFC" }
// { dg-warning "not in NFC" }
// { dg-warning "not in NFC" }
// { dg-warning "not in NFC" }
// { dg-warning "not in NFC" }
// { dg-warning "not in NFC" }
K
// { dg-warning "not in NFC" }
ά
// { dg-warning "not in NFC" }
// { dg-warning "not in NFC" }
// { dg-warning "not in NFC" }
// { dg-warning "not in NFC" }

View File

@ -0,0 +1,80 @@
// { dg-do preprocess { target { c++23 } } }
// { dg-options "-pedantic-errors" }
\u00AA
\u00B7 // { dg-error "is not valid at the start of an identifier" }
\u0F43 // { dg-error "not in NFC" }
a\u05B8\u05B9\u05B9\u05BBb
a\u05BB\u05B9\u05B8\u05B9b // { dg-error "not in NFC" }
\u09CB // { dg-error "is not valid at the start of an identifier" }
\u09C7\u09BE // { dg-error "not in NFC" }
// { dg-error "is not valid at the start of an identifier" "" { target *-*-* } .-1 }
\u0B4B // { dg-error "is not valid at the start of an identifier" }
\u0B47\u0B3E // { dg-error "not in NFC" }
// { dg-error "is not valid at the start of an identifier" "" { target *-*-* } .-1 }
\u0BCA // { dg-error "is not valid at the start of an identifier" }
\u0BC6\u0BBE // { dg-error "not in NFC" }
// { dg-error "is not valid at the start of an identifier" "" { target *-*-* } .-1 }
\u0BCB // { dg-error "is not valid at the start of an identifier" }
\u0BC7\u0BBE // { dg-error "not in NFC" }
// { dg-error "is not valid at the start of an identifier" "" { target *-*-* } .-1 }
\u0CCA // { dg-error "is not valid at the start of an identifier" }
\u0CC6\u0CC2 // { dg-error "not in NFC" }
// { dg-error "is not valid at the start of an identifier" "" { target *-*-* } .-1 }
\u0D4A // { dg-error "is not valid at the start of an identifier" }
\u0D46\u0D3E // { dg-error "not in NFC" }
// { dg-error "is not valid at the start of an identifier" "" { target *-*-* } .-1 }
\u0D4B // { dg-error "is not valid at the start of an identifier" }
\u0D47\u0D3E // { dg-error "not in NFC" }
// { dg-error "is not valid at the start of an identifier" "" { target *-*-* } .-1 }
K
\u212A // { dg-error "not in NFC" }
\u03AC
\u1F71 // { dg-error "not in NFC" }
\uAC00
\u1100\u1161 // { dg-error "not in NFC" }
\uAC01
\u1100\u1161\u11A8 // { dg-error "not in NFC" }
\uAC00\u11A8 // { dg-error "not in NFC" }
ª
· // { dg-error "is not valid at the start of an identifier" }
// { dg-error "not in NFC" }
aָֹֹֻb
aָֹֹֻb // { dg-error "not in NFC" }
// { dg-error "is not valid at the start of an identifier" }
// { dg-error "not in NFC" }
// { dg-error "is not valid at the start of an identifier" "" { target *-*-* } .-1 }
// { dg-error "is not valid at the start of an identifier" }
// { dg-error "not in NFC" }
// { dg-error "is not valid at the start of an identifier" "" { target *-*-* } .-1 }
// { dg-error "is not valid at the start of an identifier" }
// { dg-error "not in NFC" }
// { dg-error "is not valid at the start of an identifier" "" { target *-*-* } .-1 }
// { dg-error "is not valid at the start of an identifier" }
// { dg-error "not in NFC" }
// { dg-error "is not valid at the start of an identifier" "" { target *-*-* } .-1 }
// { dg-error "is not valid at the start of an identifier" }
// { dg-error "not in NFC" }
// { dg-error "is not valid at the start of an identifier" "" { target *-*-* } .-1 }
// { dg-error "is not valid at the start of an identifier" }
// { dg-error "not in NFC" }
// { dg-error "is not valid at the start of an identifier" "" { target *-*-* } .-1 }
// { dg-error "is not valid at the start of an identifier" }
// { dg-error "not in NFC" }
// { dg-error "is not valid at the start of an identifier" "" { target *-*-* } .-1 }
K
// { dg-error "not in NFC" }
ά
// { dg-error "not in NFC" }
// { dg-error "not in NFC" }
// { dg-error "not in NFC" }
// { dg-error "not in NFC" }

View File

@ -0,0 +1,66 @@
// { dg-do preprocess { target { c++23 } } }
// { dg-options "" }
\u00AA
x\u00B7
\u0F43 // { dg-warning "not in NFC" }
a\u05B8\u05B9\u05B9\u05BBb
a\u05BB\u05B9\u05B8\u05B9b // { dg-warning "not in NFC" }
x\u09CB
x\u09C7\u09BE // { dg-warning "not in NFC" }
x\u0B4B
x\u0B47\u0B3E // { dg-warning "not in NFC" }
x\u0BCA
x\u0BC6\u0BBE // { dg-warning "not in NFC" }
x\u0BCB
x\u0BC7\u0BBE // { dg-warning "not in NFC" }
x\u0CCA
x\u0CC6\u0CC2 // { dg-warning "not in NFC" }
x\u0D4A
x\u0D46\u0D3E // { dg-warning "not in NFC" }
x\u0D4B
x\u0D47\u0D3E // { dg-warning "not in NFC" }
K
\u212A // { dg-warning "not in NFC" }
\u03AC
\u1F71 // { dg-warning "not in NFC" }
\uAC00
\u1100\u1161 // { dg-warning "not in NFC" }
\uAC01
\u1100\u1161\u11A8 // { dg-warning "not in NFC" }
\uAC00\u11A8 // { dg-warning "not in NFC" }
ª
x·
// { dg-warning "not in NFC" }
aָֹֹֻb
aָֹֹֻb // { dg-warning "not in NFC" }
x
x // { dg-warning "not in NFC" }
x
x // { dg-warning "not in NFC" }
x
x // { dg-warning "not in NFC" }
x
x // { dg-warning "not in NFC" }
x
xೆ // { dg-warning "not in NFC" }
x
x // { dg-warning "not in NFC" }
x
x // { dg-warning "not in NFC" }
K
// { dg-warning "not in NFC" }
ά
// { dg-warning "not in NFC" }
// { dg-warning "not in NFC" }
// { dg-warning "not in NFC" }
// { dg-warning "not in NFC" }

View File

@ -0,0 +1,66 @@
// { dg-do preprocess { target { c++23 } } }
// { dg-options "-pedantic-errors" }
\u00AA
x\u00B7
\u0F43 // { dg-error "not in NFC" }
a\u05B8\u05B9\u05B9\u05BBb
a\u05BB\u05B9\u05B8\u05B9b // { dg-error "not in NFC" }
x\u09CB
x\u09C7\u09BE // { dg-error "not in NFC" }
x\u0B4B
x\u0B47\u0B3E // { dg-error "not in NFC" }
x\u0BCA
x\u0BC6\u0BBE // { dg-error "not in NFC" }
x\u0BCB
x\u0BC7\u0BBE // { dg-error "not in NFC" }
x\u0CCA
x\u0CC6\u0CC2 // { dg-error "not in NFC" }
x\u0D4A
x\u0D46\u0D3E // { dg-error "not in NFC" }
x\u0D4B
x\u0D47\u0D3E // { dg-error "not in NFC" }
K
\u212A // { dg-error "not in NFC" }
\u03AC
\u1F71 // { dg-error "not in NFC" }
\uAC00
\u1100\u1161 // { dg-error "not in NFC" }
\uAC01
\u1100\u1161\u11A8 // { dg-error "not in NFC" }
\uAC00\u11A8 // { dg-error "not in NFC" }
ª
x·
// { dg-error "not in NFC" }
aָֹֹֻb
aָֹֹֻb // { dg-error "not in NFC" }
x
x // { dg-error "not in NFC" }
x
x // { dg-error "not in NFC" }
x
x // { dg-error "not in NFC" }
x
x // { dg-error "not in NFC" }
x
xೆ // { dg-error "not in NFC" }
x
x // { dg-error "not in NFC" }
x
x // { dg-error "not in NFC" }
K
// { dg-error "not in NFC" }
ά
// { dg-error "not in NFC" }
// { dg-error "not in NFC" }
// { dg-error "not in NFC" }
// { dg-error "not in NFC" }

View File

@ -0,0 +1,10 @@
// P1949R7
// { dg-do compile { target c++11 } }
// { dg-options "" }
constexpr int À = 1; // U+00C0
constexpr int = 2; // U+0041 U+0300 { dg-warning "is not in NFC" }
constexpr int gv1 = \u00c0;
constexpr int gv2 = A\u0300; // { dg-warning "is not in NFC" }
static_assert(gv1 == 1, "");
static_assert(gv2 == 2, "");

View File

@ -0,0 +1,12 @@
// P1949R7
// { dg-do compile { target c++11 } }
// { dg-options "-pedantic-errors" }
constexpr int À = 1; // U+00C0
constexpr int = 2; // U+0041 U+0300 { dg-warning "is not in NFC" "" { target { ! c++23 } } }
// { dg-error "is not in NFC" "" { target c++23 } .-1 }
constexpr int gv1 = \u00c0;
constexpr int gv2 = A\u0300; // { dg-warning "is not in NFC" "" { target { ! c++23 } } }
// { dg-error "is not in NFC" "" { target c++23 } .-1 }
static_assert(gv1 == 1, "");
static_assert(gv2 == 2, "");

View File

@ -0,0 +1,18 @@
// P1949R7
// { dg-do compile }
// { dg-options "" }
bool 👷 = true;
bool 👷 = false; // { dg-error "is not valid in an identifier" }
int = 0; // { dg-error "is not valid in an identifier" }
int 🕐 = 0;
int = 0; // { dg-error "is not valid in an identifier" }
int 💀 = 0;
int = 0; // { dg-error "is not valid in an identifier" }
int 👊 = 0;
int = 0; // { dg-error "is not valid in an identifier" }
int 🚀 = 0;
int = 0; // { dg-error "is not valid in an identifier" }
int 😀 = 0;
struct E {};
class 💩 : public E {};

View File

@ -0,0 +1,18 @@
// P1949R7
// { dg-do compile }
// { dg-options "-pedantic-errors" }
bool 👷 = true; // { dg-error "is not valid in an identifier" "" { target { c++98_only || c++23 } } }
bool 👷 = false; // { dg-error "is not valid in an identifier" }
int = 0; // { dg-error "is not valid in an identifier" }
int 🕐 = 0; // { dg-error "is not valid in an identifier" "" { target { c++98_only || c++23 } } }
int = 0; // { dg-error "is not valid in an identifier" }
int 💀 = 0; // { dg-error "is not valid in an identifier" "" { target { c++98_only || c++23 } } }
int = 0; // { dg-error "is not valid in an identifier" }
int 👊 = 0; // { dg-error "is not valid in an identifier" "" { target { c++98_only || c++23 } } }
int = 0; // { dg-error "is not valid in an identifier" }
int 🚀 = 0; // { dg-error "is not valid in an identifier" "" { target { c++98_only || c++23 } } }
int = 0; // { dg-error "is not valid in an identifier" }
int 😀 = 0; // { dg-error "is not valid in an identifier" "" { target { c++98_only || c++23 } } }
struct E {};
class 💩 : public E {}; // { dg-error "is not valid in an identifier" "" { target { c++98_only || c++23 } } }

View File

@ -9,9 +9,9 @@
Ö
΄
٩ /* { dg-error "not valid at the start of an identifier" } */
٩
0º
/* { dg-error "not valid at the start of an identifier" } */
A๙

View File

@ -9,9 +9,9 @@
\u00D6
\u0384
\u0669 /* { dg-error "not valid at the start of an identifier" } */
\u0669
A\u0669
0\u00BA
0\u0669
\u0E59 /* { dg-error "not valid at the start of an identifier" } */
\u0E59
A\u0E59

View File

@ -0,0 +1,17 @@
/* { dg-do preprocess } */
/* { dg-options "-std=c99 -pedantic" } */
ª
« /* not a preprocessing error because we lex it into its own token */
/* not a preprocessing error because we lex it into its own token */
º
À
Ö
΄ /* not a preprocessing error because we lex it into its own token */
٩ /* { dg-error "not valid at the start of an identifier" } */
0º
/* { dg-error "not valid at the start of an identifier" } */
A๙

View File

@ -894,14 +894,18 @@ enum {
C11 = 8,
/* Valid in a C11/C++11 identifier, but not as the first character? */
N11 = 16,
/* Valid in a C++23 identifier? */
CXX23 = 32,
/* Valid in a C++23 identifier, but not as the first character? */
NXX23 = 64,
/* NFC representation is not valid in an identifier? */
CID = 32,
CID = 128,
/* Might be valid NFC form? */
NFC = 64,
NFC = 256,
/* Might be valid NFKC form? */
NKC = 128,
NKC = 512,
/* Certain preceding characters might make it not valid NFC/NKFC form? */
CTX = 256
CTX = 1024
};
struct ucnrange {
@ -948,10 +952,12 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
/* When -pedantic, we require the character to have been listed by
the standard for the current language. Otherwise, we accept the
union of the acceptable sets for all supported language versions. */
valid_flags = C99 | CXX | C11;
valid_flags = C99 | CXX | C11 | CXX23;
if (CPP_PEDANTIC (pfile))
{
if (CPP_OPTION (pfile, c11_identifiers))
if (CPP_OPTION (pfile, cxx23_identifiers))
valid_flags = CXX23;
else if (CPP_OPTION (pfile, c11_identifiers))
valid_flags = C11;
else if (CPP_OPTION (pfile, c99))
valid_flags = C99;
@ -960,12 +966,6 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
}
if (! (ucnranges[mn].flags & valid_flags))
return 0;
if (CPP_OPTION (pfile, c11_identifiers))
invalid_start_flags = N11;
else if (CPP_OPTION (pfile, c99))
invalid_start_flags = N99;
else
invalid_start_flags = 0;
/* Update NST. */
if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class)
@ -1008,6 +1008,28 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
nst->previous = c;
nst->prev_class = ucnranges[mn].combine;
if (!CPP_PEDANTIC (pfile))
{
/* If not -pedantic, accept as character that may
begin an identifier a union of characters allowed
at that position in each of the character sets. */
if ((ucnranges[mn].flags & (C99 | N99)) == C99
|| (ucnranges[mn].flags & CXX) != 0
|| (ucnranges[mn].flags & (C11 | N11)) == C11
|| (ucnranges[mn].flags & (CXX23 | NXX23)) == CXX23)
return 1;
return 2;
}
if (CPP_OPTION (pfile, cxx23_identifiers))
invalid_start_flags = NXX23;
else if (CPP_OPTION (pfile, c11_identifiers))
invalid_start_flags = N11;
else if (CPP_OPTION (pfile, c99))
invalid_start_flags = N99;
else
invalid_start_flags = 0;
/* In C99, UCN digits may not begin identifiers. In C11 and C++11,
UCN combining characters may not begin identifiers. */
if (ucnranges[mn].flags & invalid_start_flags)

View File

@ -482,6 +482,10 @@ struct cpp_options
in C11 and C++11. */
unsigned char c11_identifiers;
/* Nonzero means extended identifiers allow the characters specified
in C++23. */
unsigned char cxx23_identifiers;
/* Nonzero for C++ 2014 Standard binary constants. */
unsigned char binary_constants;

View File

@ -82,6 +82,7 @@ struct lang_flags
char extended_numbers;
char extended_identifiers;
char c11_identifiers;
char cxx23_identifiers;
char std;
char digraphs;
char uliterals;
@ -99,31 +100,31 @@ struct lang_flags
};
static const struct lang_flags lang_defaults[] =
{ /* c99 c++ xnum xid c11 std digr ulit rlit udlit bincst digsep trig u8chlit vaopt scope dfp szlit elifdef */
/* GNUC89 */ { 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0 },
/* GNUC99 */ { 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0 },
/* GNUC11 */ { 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0 },
/* GNUC17 */ { 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0 },
/* GNUC2X */ { 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1 },
/* STDC89 */ { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 },
/* STDC94 */ { 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 },
/* STDC99 */ { 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 },
/* STDC11 */ { 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 },
/* STDC17 */ { 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 },
/* STDC2X */ { 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1 },
/* GNUCXX */ { 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0 },
/* CXX98 */ { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0 },
/* GNUCXX11 */ { 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0 },
/* CXX11 */ { 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0 },
/* GNUCXX14 */ { 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0 },
/* CXX14 */ { 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0 },
/* GNUCXX17 */ { 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0 },
/* CXX17 */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0 },
/* GNUCXX20 */ { 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0 },
/* CXX20 */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0 },
/* GNUCXX23 */ { 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0 },
/* CXX23 */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0 },
/* ASM */ { 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
{ /* c99 c++ xnum xid c11 c++23 std digr ulit rlit udlit bincst digsep trig u8chlit vaopt scope dfp szlit elifdef */
/* GNUC89 */ { 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0 },
/* GNUC99 */ { 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0 },
/* GNUC11 */ { 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0 },
/* GNUC17 */ { 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0 },
/* GNUC2X */ { 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1 },
/* STDC89 */ { 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 },
/* STDC94 */ { 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 },
/* STDC99 */ { 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 },
/* STDC11 */ { 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 },
/* STDC17 */ { 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 },
/* STDC2X */ { 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1 },
/* GNUCXX */ { 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0 },
/* CXX98 */ { 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0 },
/* GNUCXX11 */ { 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0 },
/* CXX11 */ { 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0 },
/* GNUCXX14 */ { 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0 },
/* CXX14 */ { 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0 },
/* GNUCXX17 */ { 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0 },
/* CXX17 */ { 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0 },
/* GNUCXX20 */ { 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0 },
/* CXX20 */ { 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0 },
/* GNUCXX23 */ { 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0 },
/* CXX23 */ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0 },
/* ASM */ { 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
};
/* Sets internal flags correctly for a given language. */
@ -139,6 +140,7 @@ cpp_set_lang (cpp_reader *pfile, enum c_lang lang)
CPP_OPTION (pfile, extended_numbers) = l->extended_numbers;
CPP_OPTION (pfile, extended_identifiers) = l->extended_identifiers;
CPP_OPTION (pfile, c11_identifiers) = l->c11_identifiers;
CPP_OPTION (pfile, cxx23_identifiers) = l->cxx23_identifiers;
CPP_OPTION (pfile, std) = l->std;
CPP_OPTION (pfile, digraphs) = l->digraphs;
CPP_OPTION (pfile, uliterals) = l->uliterals;

View File

@ -1306,6 +1306,9 @@ warn_about_normalization (cpp_reader *pfile,
if (NORMALIZE_STATE_RESULT (s) == normalized_C)
cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
"`%.*s' is not in NFKC", (int) sz, buf);
else if (CPP_OPTION (pfile, cxx23_identifiers))
cpp_pedwarning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
"`%.*s' is not in NFC", (int) sz, buf);
else
cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
"`%.*s' is not in NFC", (int) sz, buf);

View File

@ -17,7 +17,7 @@ along with this program; see the file COPYING3. If not see
/* Run this program as
./makeucnid ucnid.tab UnicodeData.txt DerivedNormalizationProps.txt \
> ucnid.h
DerivedCoreProperties.txt > ucnid.h
*/
#include <stdio.h>
@ -32,10 +32,12 @@ enum {
N99 = 4,
C11 = 8,
N11 = 16,
all_languages = C99 | CXX | C11,
not_NFC = 32,
not_NFKC = 64,
maybe_not_NFC = 128
CXX23 = 32,
NXX23 = 64,
all_languages = C99 | CXX | C11 | CXX23 | NXX23,
not_NFC = 128,
not_NFKC = 256,
maybe_not_NFC = 512
};
#define NUM_CODE_POINTS 0x110000
@ -241,6 +243,74 @@ read_derived (const char *fname)
fclose (f);
}
/* Read DerivedCoreProperties.txt (path FNAME) and record the C++23
   identifier flags in the global flags[] table, based on the XID_Start
   and XID_Continue properties:

     XID_Start     -> CXX23 is set and NXX23 is cleared;
     XID_Continue  -> CXX23 | NXX23 is set, but only if CXX23 is not
                      already set for that code point.

   Because of the CXX23 check in the XID_Continue case and the NXX23
   clearing in the XID_Start case, the result does not depend on the
   order in which the two properties appear in the file.  Lines for any
   other property are ignored, as are ranges whose last code point is
   below 0x80.  Calls fail () if the file cannot be opened, read or
   parsed.  */

static void
read_derivedcore (const char *fname)
{
  FILE *f = fopen (fname, "r");

  if (!f)
    fail ("opening DerivedCoreProperties.txt");
  for (;;)
    {
      char line[256];
      unsigned long codepoint_start, codepoint_end;
      char *l;

      if (!fgets (line, sizeof (line), f))
        break;
      /* Skip comment lines and empty lines.  */
      if (line[0] == '#' || line[0] == '\n' || line[0] == '\r')
        continue;

      /* Each data line starts with either a single hexadecimal code
         point or a START..END range.  */
      codepoint_start = strtoul (line, &l, 16);
      if (l == line)
        fail ("parsing DerivedCoreProperties.txt, reading code point");
      if (codepoint_start > MAX_CODE_POINT)
        fail ("parsing DerivedCoreProperties.txt, code point too large");

      if (*l == '.' && l[1] == '.')
        {
          char *l2 = l + 2;
          codepoint_end = strtoul (l + 2, &l, 16);
          if (l == l2 || codepoint_end < codepoint_start)
            fail ("parsing DerivedCoreProperties.txt, reading code point");
          if (codepoint_end > MAX_CODE_POINT)
            fail ("parsing DerivedCoreProperties.txt, code point too large");
        }
      else
        codepoint_end = codepoint_start;

      /* The code point field is followed by optional spaces, a ';',
         optional spaces and then the property name.  */
      while (*l == ' ')
        l++;
      if (*l++ != ';')
        fail ("parsing DerivedCoreProperties.txt, reading code point");

      while (*l == ' ')
        l++;

      /* Ranges ending below 0x80 are not recorded here.  */
      if (codepoint_end < 0x80)
        continue;

      /* The trailing space in the compared strings keeps longer
         property names sharing the same prefix from matching.  */
      if (strncmp (l, "XID_Start ", 10) == 0)
        {
          for (; codepoint_start <= codepoint_end; codepoint_start++)
            flags[codepoint_start]
              = (flags[codepoint_start] | CXX23) & ~NXX23;
        }
      else if (strncmp (l, "XID_Continue ", 13) == 0)
        {
          /* Do not mark as "not at start" anything that an XID_Start
             line has already marked as valid.  */
          for (; codepoint_start <= codepoint_end; codepoint_start++)
            if ((flags[codepoint_start] & CXX23) == 0)
              flags[codepoint_start] |= CXX23 | NXX23;
        }
    }
  if (ferror (f))
    fail ("reading DerivedCoreProperties.txt");
  fclose (f);
}
/* Write out the table.
The table consists of two words per entry. The first word is the flags
for the unicode code points up to and including the second word. */
@ -261,12 +331,14 @@ write_table (void)
|| really_safe != (decomp[i][0] == 0)
|| combining_value[i] != last_combine)
{
printf ("{ %s|%s|%s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n",
printf ("{ %s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n",
last_flag & C99 ? "C99" : " 0",
last_flag & N99 ? "N99" : " 0",
last_flag & CXX ? "CXX" : " 0",
last_flag & C11 ? "C11" : " 0",
last_flag & N11 ? "N11" : " 0",
last_flag & CXX23 ? "CXX23" : " 0",
last_flag & NXX23 ? "NXX23" : " 0",
really_safe ? "CID" : " 0",
last_flag & not_NFC ? " 0" : "NFC",
last_flag & not_NFKC ? " 0" : "NKC",
@ -439,11 +511,12 @@ write_copyright (void)
int
main(int argc, char ** argv)
{
if (argc != 4)
if (argc != 5)
fail ("too few arguments to makeucn");
read_ucnid (argv[1]);
read_table (argv[2]);
read_derived (argv[3]);
read_derivedcore (argv[4]);
write_copyright ();
write_table ();

File diff suppressed because it is too large Load Diff