ucnid-2011-1.c: New test.

gcc/testsuite:
	* c-c++-common/cpp/ucnid-2011-1.c: New test.

libcpp:
	* ucnid.tab: Add C11 and C11NOSTART data.
	* makeucnid.c (digit): Rename enum value to N99.
	(C11, N11, all_languages): New enum values.
	(NUM_CODE_POINTS, MAX_CODE_POINT): New macros.
	(flags, decomp, combining_value): Use NUM_CODE_POINTS as array
	size.
	(decomp): Use unsigned int as element type.
	(all_decomp): New array.
	(read_ucnid): Handle C11 and C11NOSTART.  Use MAX_CODE_POINT.
	(read_table): Use MAX_CODE_POINT.  Store all decompositions in
	all_decomp.
	(read_derived): Use MAX_CODE_POINT.
	(write_table): Use NUM_CODE_POINTS.  Print N99, C11 and N11
	flags.  Print whole array variable declaration rather than just
	array contents.
	(char_id_valid, write_context_switch): New functions.
	(main): Call write_context_switch.
	* ucnid.h: Regenerate.
	* include/cpplib.h (struct cpp_options): Add c11_identifiers.
	* init.c (struct lang_flags): Add c11_identifiers.
	(cpp_set_lang): Set c11_identifiers option from selected language.
	* internal.h (struct normalize_state): Document "previous" as
	previous starter character.
	(NORMALIZE_STATE_UPDATE_IDNUM): Take character as argument.
	* charset.c (DIG): Rename enum value to N99.
	(C11, N11): New enum values.
	(struct ucnrange): Give name to struct.  Use short for flags and
	unsigned int for end of range.  Include ucnid.h for whole variable
	declaration.
	(ucn_valid_in_identifier): Allow for characters up to 0x10FFFF.
	Allow for C11 in determining valid characters and valid start
	characters.  Use check_nfc for non-Hangul context-dependent
	checks.  Only store starter characters in nst->previous.
	(_cpp_valid_ucn): Pass new argument to
	NORMALIZE_STATE_UPDATE_IDNUM.
	* lex.c (lex_identifier): Pass new argument to
	NORMALIZE_STATE_UPDATE_IDNUM.  Call NORMALIZE_STATE_UPDATE_IDNUM
	after initial non-UCN part of identifier.
	(lex_number): Pass new argument to NORMALIZE_STATE_UPDATE_IDNUM.

From-SVN: r204886
This commit is contained in:
Joseph Myers 2013-11-16 00:05:08 +00:00 committed by Joseph Myers
parent 3d053a5f72
commit d3f4ff8b51
11 changed files with 4783 additions and 840 deletions

View File

@ -1,3 +1,7 @@
2013-11-15 Joseph Myers <joseph@codesourcery.com>
* c-c++-common/cpp/ucnid-2011-1.c: New test.
2013-11-15 Paolo Carlini <paolo.carlini@oracle.com>
PR c++/58188

View File

@ -0,0 +1,15 @@
/* { dg-do preprocess } */
/* { dg-options "-std=c11 -pedantic -fextended-identifiers" { target c } } */
/* { dg-options "-std=c++11 -pedantic -fextended-identifiers" { target c++ } } */
\u00A8
B\u0300
\u0300 /* { dg-error "not valid at the start of an identifier" } */
A\u0300 /* { dg-warning "not in NFC" } */
\U00010000
\U0001FFFD
\U000E1234

View File

@ -1,3 +1,45 @@
2013-11-15 Joseph Myers <joseph@codesourcery.com>
* ucnid.tab: Add C11 and C11NOSTART data.
* makeucnid.c (digit): Rename enum value to N99.
(C11, N11, all_languages): New enum values.
(NUM_CODE_POINTS, MAX_CODE_POINT): New macros.
(flags, decomp, combining_value): Use NUM_CODE_POINTS as array
size.
(decomp): Use unsigned int as element type.
(all_decomp): New array.
(read_ucnid): Handle C11 and C11NOSTART. Use MAX_CODE_POINT.
(read_table): Use MAX_CODE_POINT. Store all decompositions in
all_decomp.
(read_derived): Use MAX_CODE_POINT.
(write_table): Use NUM_CODE_POINTS. Print N99, C11 and N11
flags. Print whole array variable declaration rather than just
array contents.
(char_id_valid, write_context_switch): New functions.
(main): Call write_context_switch.
* ucnid.h: Regenerate.
* include/cpplib.h (struct cpp_options): Add c11_identifiers.
* init.c (struct lang_flags): Add c11_identifiers.
(cpp_set_lang): Set c11_identifiers option from selected language.
* internal.h (struct normalize_state): Document "previous" as
previous starter character.
(NORMALIZE_STATE_UPDATE_IDNUM): Take character as argument.
* charset.c (DIG): Rename enum value to N99.
(C11, N11): New enum values.
(struct ucnrange): Give name to struct. Use short for flags and
unsigned int for end of range. Include ucnid.h for whole variable
declaration.
(ucn_valid_in_identifier): Allow for characters up to 0x10FFFF.
Allow for C11 in determining valid characters and valid start
characters. Use check_nfc for non-Hangul context-dependent
checks. Only store starter characters in nst->previous.
(_cpp_valid_ucn): Pass new argument to
NORMALIZE_STATE_UPDATE_IDNUM.
* lex.c (lex_identifier): Pass new argument to
NORMALIZE_STATE_UPDATE_IDNUM. Call NORMALIZE_STATE_UPDATE_IDNUM
after initial non-UCN part of identifier.
(lex_number): Pass new argument to NORMALIZE_STATE_UPDATE_IDNUM.
2013-11-15 Joseph Myers <joseph@codesourcery.com>
* ucnid.tab: Mark C99 digits as [C99DIG].

View File

@ -828,29 +828,32 @@ enum {
/* Valid in a C99 identifier? */
C99 = 1,
/* Valid in a C99 identifier, but not as the first character? */
DIG = 2,
N99 = 2,
/* Valid in a C++ identifier? */
CXX = 4,
/* Valid in a C11/C++11 identifier? */
C11 = 8,
/* Valid in a C11/C++11 identifier, but not as the first character? */
N11 = 16,
/* NFC representation is not valid in an identifier? */
CID = 8,
CID = 32,
/* Might be valid NFC form? */
NFC = 16,
NFC = 64,
/* Might be valid NFKC form? */
NKC = 32,
NKC = 128,
/* Certain preceding characters might make it not valid NFC/NFKC form? */
CTX = 64
CTX = 256
};
static const struct {
struct ucnrange {
/* Bitmap of flags above. */
unsigned char flags;
unsigned short flags;
/* Combining class of the character. */
unsigned char combine;
/* Last character in the range described by this entry. */
unsigned short end;
} ucnranges[] = {
#include "ucnid.h"
unsigned int end;
};
#include "ucnid.h"
/* Returns 1 if C is valid in an identifier, 2 if C is valid except at
the start of an identifier, and 0 if C is not valid in an
@ -864,8 +867,9 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
struct normalize_state *nst)
{
int mn, mx, md;
unsigned short valid_flags, invalid_start_flags;
if (c > 0xFFFF)
if (c > 0x10FFFF)
return 0;
mn = 0;
@ -881,15 +885,25 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
/* When -pedantic, we require the character to have been listed by
the standard for the current language. Otherwise, we accept the
union of the acceptable sets for C++98 and C99. */
if (! (ucnranges[mn].flags & (C99 | CXX)))
union of the acceptable sets for all supported language versions. */
valid_flags = C99 | CXX | C11;
if (CPP_PEDANTIC (pfile))
{
if (CPP_OPTION (pfile, c11_identifiers))
valid_flags = C11;
else if (CPP_OPTION (pfile, c99))
valid_flags = C99;
else if (CPP_OPTION (pfile, cplusplus))
valid_flags = CXX;
}
if (! (ucnranges[mn].flags & valid_flags))
return 0;
if (CPP_PEDANTIC (pfile)
&& ((CPP_OPTION (pfile, c99) && !(ucnranges[mn].flags & C99))
|| (CPP_OPTION (pfile, cplusplus)
&& !(ucnranges[mn].flags & CXX))))
return 0;
if (CPP_OPTION (pfile, c11_identifiers))
invalid_start_flags = N11;
else if (CPP_OPTION (pfile, c99))
invalid_start_flags = N99;
else
invalid_start_flags = 0;
/* Update NST. */
if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class)
@ -899,17 +913,6 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
bool safe;
cppchar_t p = nst->previous;
/* Easy cases from Bengali, Oriya, Tamil, Kannada, and Malayalam. */
if (c == 0x09BE)
safe = p != 0x09C7; /* Use 09CB instead of 09C7 09BE. */
else if (c == 0x0B3E)
safe = p != 0x0B47; /* Use 0B4B instead of 0B47 0B3E. */
else if (c == 0x0BBE)
safe = p != 0x0BC6 && p != 0x0BC7; /* Use 0BCA/0BCB instead. */
else if (c == 0x0CC2)
safe = p != 0x0CC6; /* Use 0CCA instead of 0CC6 0CC2. */
else if (c == 0x0D3E)
safe = p != 0x0D46 && p != 0x0D47; /* Use 0D4A/0D4B instead. */
/* For Hangul, characters in the range AC00-D7A3 are NFC/NFKC,
and are combined algorithmically from a sequence of the form
1100-1112 1161-1175 11A8-11C2
@ -917,20 +920,19 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
really a valid character).
Unfortunately, C99 allows (only) the NFC form, but C++ allows
only the combining characters. */
else if (c >= 0x1161 && c <= 0x1175)
if (c >= 0x1161 && c <= 0x1175)
safe = p < 0x1100 || p > 0x1112;
else if (c >= 0x11A8 && c <= 0x11C2)
safe = (p < 0xAC00 || p > 0xD7A3 || (p - 0xAC00) % 28 != 0);
else
safe = check_nfc (pfile, c, p);
if (!safe)
{
/* Uh-oh, someone updated ucnid.h without updating this code. */
cpp_error (pfile, CPP_DL_ICE, "Character %x might not be NFKC", c);
safe = true;
if ((c >= 0x1161 && c <= 0x1175) || (c >= 0x11A8 && c <= 0x11C2))
nst->level = MAX (nst->level, normalized_identifier_C);
else
nst->level = normalized_none;
}
if (!safe && c < 0x1161)
nst->level = normalized_none;
else if (!safe)
nst->level = MAX (nst->level, normalized_identifier_C);
}
else if (ucnranges[mn].flags & NKC)
;
@ -940,11 +942,13 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
nst->level = MAX (nst->level, normalized_identifier_C);
else
nst->level = normalized_none;
nst->previous = c;
if (ucnranges[mn].combine == 0)
nst->previous = c;
nst->prev_class = ucnranges[mn].combine;
/* In C99, UCN digits may not begin identifiers. */
if (CPP_OPTION (pfile, c99) && (ucnranges[mn].flags & DIG))
/* In C99, UCN digits may not begin identifiers. In C11 and C++11,
UCN combining characters may not begin identifiers. */
if (ucnranges[mn].flags & invalid_start_flags)
return 2;
return 1;
@ -1054,7 +1058,7 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
CPP_OPTION (pfile, warn_dollars) = 0;
cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
}
NORMALIZE_STATE_UPDATE_IDNUM (nst);
NORMALIZE_STATE_UPDATE_IDNUM (nst, result);
}
else if (identifier_pos)
{

View File

@ -437,6 +437,10 @@ struct cpp_options
literal number suffixes as user-defined literal number suffixes. */
unsigned char ext_numeric_literals;
/* Nonzero means extended identifiers allow the characters specified
in C11 and C++11. */
unsigned char c11_identifiers;
/* Nonzero for C++ 2014 Standard binary constants. */
unsigned char binary_constants;

View File

@ -77,6 +77,7 @@ struct lang_flags
char cplusplus;
char extended_numbers;
char extended_identifiers;
char c11_identifiers;
char std;
char cplusplus_comments;
char digraphs;
@ -88,21 +89,21 @@ struct lang_flags
};
static const struct lang_flags lang_defaults[] =
{ /* c99 c++ xnum xid std // digr ulit rlit udlit bin_cst dig_sep */
/* GNUC89 */ { 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0 },
/* GNUC99 */ { 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0 },
/* GNUC11 */ { 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0 },
/* STDC89 */ { 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 },
/* STDC94 */ { 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0 },
/* STDC99 */ { 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0 },
/* STDC11 */ { 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0 },
/* GNUCXX */ { 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0 },
/* CXX98 */ { 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0 },
/* GNUCXX11 */ { 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0 },
/* CXX11 */ { 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0 },
/* GNUCXX1Y */ { 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1 },
/* CXX1Y */ { 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1 },
/* ASM */ { 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0 }
{ /* c99 c++ xnum xid c11 std // digr ulit rlit udlit bin_cst dig_sep */
/* GNUC89 */ { 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0 },
/* GNUC99 */ { 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0 },
/* GNUC11 */ { 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0 },
/* STDC89 */ { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 },
/* STDC94 */ { 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0 },
/* STDC99 */ { 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0 },
/* STDC11 */ { 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0 },
/* GNUCXX */ { 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0 },
/* CXX98 */ { 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0 },
/* GNUCXX11 */ { 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0 },
/* CXX11 */ { 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0 },
/* GNUCXX1Y */ { 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1 },
/* CXX1Y */ { 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
/* ASM */ { 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 }
/* xid should be 1 for GNUC99, STDC99, GNUCXX, CXX98, GNUCXX11, CXX11,
GNUCXX1Y, and CXX1Y when no longer experimental (when all uses of
identifiers in the compiler have been audited for correct handling
@ -121,6 +122,7 @@ cpp_set_lang (cpp_reader *pfile, enum c_lang lang)
CPP_OPTION (pfile, cplusplus) = l->cplusplus;
CPP_OPTION (pfile, extended_numbers) = l->extended_numbers;
CPP_OPTION (pfile, extended_identifiers) = l->extended_identifiers;
CPP_OPTION (pfile, c11_identifiers) = l->c11_identifiers;
CPP_OPTION (pfile, std) = l->std;
CPP_OPTION (pfile, trigraphs) = l->std;
CPP_OPTION (pfile, cplusplus_comments) = l->cplusplus_comments;

View File

@ -713,9 +713,10 @@ extern size_t _cpp_replacement_text_len (const cpp_macro *);
struct normalize_state
{
/* The previous character. */
/* The previous starter character. */
cppchar_t previous;
/* The combining class of the previous character. */
/* The combining class of the previous character (whether or not a
starter). */
unsigned char prev_class;
/* The lowest normalization level so far. */
enum cpp_normalize_level level;
@ -723,10 +724,10 @@ struct normalize_state
#define INITIAL_NORMALIZE_STATE { 0, 0, normalized_KC }
#define NORMALIZE_STATE_RESULT(st) ((st)->level)
/* We saw a character that matches ISIDNUM(), update a
/* We saw a character C that matches ISIDNUM(), update a
normalize_state appropriately. */
#define NORMALIZE_STATE_UPDATE_IDNUM(st) \
((st)->previous = 0, (st)->prev_class = 0)
#define NORMALIZE_STATE_UPDATE_IDNUM(st, c) \
((st)->previous = (c), (st)->prev_class = 0)
extern cppchar_t _cpp_valid_ucn (cpp_reader *, const unsigned char **,
const unsigned char *, int,

View File

@ -1204,11 +1204,14 @@ lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
cur = pfile->buffer->cur;
if (! starts_ucn)
while (ISIDNUM (*cur))
{
hash = HT_HASHSTEP (hash, *cur);
cur++;
}
{
while (ISIDNUM (*cur))
{
hash = HT_HASHSTEP (hash, *cur);
cur++;
}
NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
}
pfile->buffer->cur = cur;
if (starts_ucn || forms_identifier_p (pfile, false, nst))
{
@ -1216,8 +1219,8 @@ lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
do {
while (ISIDNUM (*pfile->buffer->cur))
{
NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
pfile->buffer->cur++;
NORMALIZE_STATE_UPDATE_IDNUM (nst);
}
} while (forms_identifier_p (pfile, false, nst));
result = _cpp_interpret_identifier (pfile, base,
@ -1277,8 +1280,8 @@ lex_number (cpp_reader *pfile, cpp_string *number,
while (ISIDNUM (*cur) || *cur == '.' || DIGIT_SEP (*cur)
|| VALID_SIGN (*cur, cur[-1]))
{
NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
cur++;
NORMALIZE_STATE_UPDATE_IDNUM (nst);
}
pfile->buffer->cur = cur;

View File

@ -29,15 +29,22 @@ along with this program; see the file COPYING3. If not see
enum {
C99 = 1,
CXX = 2,
digit = 4,
not_NFC = 8,
not_NFKC = 16,
maybe_not_NFC = 32
N99 = 4,
C11 = 8,
N11 = 16,
all_languages = C99 | CXX | C11,
not_NFC = 32,
not_NFKC = 64,
maybe_not_NFC = 128
};
static unsigned flags[65536];
static unsigned short decomp[65536][2];
static unsigned char combining_value[65536];
#define NUM_CODE_POINTS 0x110000
#define MAX_CODE_POINT 0x10ffff
static unsigned flags[NUM_CODE_POINTS];
static unsigned int all_decomp[NUM_CODE_POINTS][2];
static unsigned int decomp[NUM_CODE_POINTS][2];
static unsigned char combining_value[NUM_CODE_POINTS];
/* Die! */
@ -48,7 +55,7 @@ fail (const char *s)
exit (1);
}
/* Read ucnid.tab and set the C99 and CXX flags in header[]. */
/* Read ucnid.tab and set the flags for language versions in flags[]. */
static void
read_ucnid (const char *fname)
@ -66,10 +73,14 @@ read_ucnid (const char *fname)
break;
if (strcmp (line, "[C99]\n") == 0)
fl = C99;
if (strcmp (line, "[C99DIG]\n") == 0)
fl = C99|digit;
else if (strcmp (line, "[C99DIG]\n") == 0)
fl = C99|N99;
else if (strcmp (line, "[CXX]\n") == 0)
fl = CXX;
else if (strcmp (line, "[C11]\n") == 0)
fl = C11;
else if (strcmp (line, "[C11NOSTART]\n") == 0)
fl = C11|N11;
else if (isxdigit (line[0]))
{
char *l = line;
@ -94,7 +105,7 @@ read_ucnid (const char *fname)
}
while (isspace (*l))
l++;
if (end > 0xFFFF)
if (end > MAX_CODE_POINT)
fail ("parsing ucnid.tab, end too large");
while (start <= end)
flags[start++] |= fl;
@ -108,8 +119,10 @@ read_ucnid (const char *fname)
/* Read UnicodeData.txt and fill in the 'decomp' table to be the
decompositions of characters for which both the character
decomposed and all the code points in the decomposition are either
C99 or CXX. */
decomposed and all the code points in the decomposition are valid
for some supported language version, and the 'all_decomp' table to
be the decompositions of all characters without those
constraints. */
static void
read_table (char *fname)
@ -123,7 +136,7 @@ read_table (char *fname)
char line[256];
unsigned long codepoint, this_decomp[4];
char *l;
int i;
int i, j;
int decomp_useful;
if (!fgets (line, sizeof (line), f))
@ -131,8 +144,8 @@ read_table (char *fname)
codepoint = strtoul (line, &l, 16);
if (l == line || *l != ';')
fail ("parsing UnicodeData.txt, reading code point");
if (codepoint > 0xffff || ! (flags[codepoint] & (C99 | CXX)))
continue;
if (codepoint > MAX_CODE_POINT)
fail ("parsing UnicodeData.txt, code point too large");
do {
l++;
@ -171,7 +184,9 @@ read_table (char *fname)
}
if (i > 2) /* Decomposition too long. */
fail ("parsing UnicodeData.txt, decomposition too long");
if (decomp_useful)
for (j = 0; j < i; j++)
all_decomp[codepoint][j] = this_decomp[j];
if ((flags[codepoint] & all_languages) && decomp_useful)
while (--i >= 0)
decomp[codepoint][i] = this_decomp[i];
}
@ -208,8 +223,8 @@ read_derived (const char *fname)
start = strtoul (line, &l, 16);
if (l == line)
fail ("parsing DerivedNormalizationProps.txt, reading start");
if (start > 0xffff)
continue;
if (start > MAX_CODE_POINT)
fail ("parsing DerivedNormalizationProps.txt, code point too large");
if (*l == '.' && l[1] == '.')
end = strtoul (l + 2, &l, 16);
else
@ -237,17 +252,21 @@ write_table (void)
unsigned last_flag = flags[0];
bool really_safe = decomp[0][0] == 0;
unsigned char last_combine = combining_value[0];
printf ("static const struct ucnrange ucnranges[] = {\n");
for (i = 1; i <= 65536; i++)
if (i == 65536
|| (flags[i] != last_flag && ((flags[i] | last_flag) & (C99 | CXX)))
for (i = 1; i <= NUM_CODE_POINTS; i++)
if (i == NUM_CODE_POINTS
|| (flags[i] != last_flag && ((flags[i] | last_flag) & all_languages))
|| really_safe != (decomp[i][0] == 0)
|| combining_value[i] != last_combine)
{
printf ("{ %s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n",
printf ("{ %s|%s|%s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n",
last_flag & C99 ? "C99" : " 0",
last_flag & digit ? "DIG" : " 0",
last_flag & N99 ? "N99" : " 0",
last_flag & CXX ? "CXX" : " 0",
last_flag & C11 ? "C11" : " 0",
last_flag & N11 ? "N11" : " 0",
really_safe ? "CID" : " 0",
last_flag & not_NFC ? " 0" : "NFC",
last_flag & not_NFKC ? " 0" : "NKC",
@ -258,6 +277,98 @@ write_table (void)
last_combine = combining_value[0];
really_safe = decomp[i][0] == 0;
}
printf ("};\n");
}
/* Report whether code point C may appear in an identifier for at least
   one supported language version, either written directly or spelled
   as a UCN.  A code point qualifies when flags[] marks it for any
   language in all_languages, or when it is '$' (0x24), an ASCII digit
   (0x30-0x39), or an ASCII letter (0x41-0x5a, 0x61-0x7a).  */
static bool
char_id_valid (unsigned int c)
{
  /* Listed in ucnid.tab for some language version.  */
  if (flags[c] & all_languages)
    return true;

  /* Dollar sign.  */
  if (c == 0x24)
    return true;

  /* ASCII digits.  */
  if (c >= 0x30 && c <= 0x39)
    return true;

  /* ASCII upper-case letters.  */
  if (c >= 0x41 && c <= 0x5a)
    return true;

  /* ASCII lower-case letters are the only remaining possibility.  */
  return c >= 0x61 && c <= 0x7a;
}
/* Write out the switch statement over characters for which it is
context-dependent whether they are in NFC.

The text is printed to stdout as the C source of a function
"static bool check_nfc (cpp_reader *pfile, cppchar_t c, cppchar_t p)".
The generated function switches on C; for every code point handled
here it contains a nested switch on P whose listed cases return false
and whose default returns true.  The outer default reports an ICE
through cpp_error and returns true. */
static void
write_context_switch (void)
{
unsigned i;
/* Emit the generated function's header and open the outer switch. */
printf ("static bool\n"
"check_nfc (cpp_reader *pfile, cppchar_t c, cppchar_t p)\n"
"{\n"
" switch (c)\n"
" {\n");
for (i = 0; i < NUM_CODE_POINTS; i++)
{
/* Set once at least one "case" label has been printed for I's
inner switch. */
bool found_case = false;
unsigned j;
/* Only code points that are valid for some language version and
are flagged maybe_not_NFC get an entry. */
if (!(flags[i] & all_languages) || !(flags[i] & maybe_not_NFC))
continue;
if ((i >= 0x1161 && i <= 0x1175) || (i >= 0x11A8 && i <= 0x11C2))
continue; /* Hangul handled algorithmically. */
/* Open the outer case for I and its inner switch on P. */
printf (" case %#06x:\n"
" switch (p)\n"
"\t{\n", i);
/* If an NFC starter character decomposes with this character I
as the second character and an NFC starter character S as the
first character, that latter character as a previous
character means this character is not NFC. Furthermore, any
NFC starter character K made by a series of compositions of S
with combining characters whose combining class is greater
than that of I also means this character is not NFC. */
for (j = 0; j < NUM_CODE_POINTS; j++)
{
unsigned s, k;
/* Only characters J whose decomposition has I as its second
element are of interest. */
if (all_decomp[j][1] != i)
continue;
s = all_decomp[j][0];
/* S must have combining value zero and must not be flagged
not_NFC. */
if (combining_value[s] != 0 || (flags[s] & not_NFC) != 0)
continue;
/* S itself is only listed when it can occur in an identifier. */
if (char_id_valid (s))
{
found_case = true;
printf ("\tcase %#06x:\n", s);
}
/* Look for every other identifier-valid K that leads back to S. */
for (k = 0; k < NUM_CODE_POINTS; k++)
{
unsigned t = k;
if (k == s || !char_id_valid (k))
continue;
/* Follow the first element of each decomposition for as long
as the second element has a larger combining value than I. */
while (all_decomp[t][1] != 0
&& combining_value[all_decomp[t][1]] > combining_value[i])
{
/* Stop if T has a nonzero combining value or is flagged
not_NFC. */
if (combining_value[t] != 0 || (flags[t] & not_NFC) != 0)
break;
t = all_decomp[t][0];
}
/* The walk ended at S, so K is listed as well. */
if (t == s)
{
found_case = true;
printf ("\tcase %#06x:\n", k);
}
}
}
/* The case labels printed above share a single "return false";
when none were printed, only a comment is emitted in its place. */
if (found_case)
printf ("\t return false;\n");
else
printf ("\t/* Non-NFC cases not applicable to C/C++. */\n");
/* Any other previous character: I is treated as NFC. */
printf ("\tdefault:\n"
"\t return true;\n"
"\t}\n\n");
}
/* Close the outer switch.  "%%x" is printed as a literal "%x", which
is the format directive in the generated cpp_error call. */
printf (" default:\n"
" cpp_error (pfile, CPP_DL_ICE, \"Character %%x might not be NFKC\", c);\n"
" return true;\n"
" }\n"
"}\n");
}
/* Print out the huge copyright notice. */
@ -336,5 +447,6 @@ main(int argc, char ** argv)
write_copyright ();
write_table ();
write_context_switch ();
return 0;
}

File diff suppressed because it is too large Load Diff

View File

@ -19,7 +19,8 @@
; D, which is itself a reproduction from ISO/IEC TR 10176:1998, and
; the similar table from ISO/IEC 14882:1998 (C++98) Annex E, which is
; a reproduction of ISO/IEC PDTR 10176. Unfortunately these tables
; are not identical.
; are not identical. It also reproduces the somewhat different tables
; in C11 and C++11, which are identical to each other.
[C99]
@ -209,3 +210,34 @@ fbd3-fd3f fd50-fd8f fd92-fdc7 fdf0-fdfb fe70-fe72 fe74 fe76-fefc
ff21-ff3a ff41-ff5a ff66-ffbe ffc2-ffc7 ffca-ffcf ffd2-ffd7
ffda-ffdc 4e00-9fa5
[C11]
; Group 1
00a8 00aa 00ad 00af 00b2-00b5 00b7-00ba 00bc-00be 00c0-00d6 00d8-00f6
00f8-00ff
; Group 2, minus characters under C11NOSTART
0100-02ff 0370-167f 1681-180d 180f-1dbf 1e00-1fff
; Group 3
200b-200d 202a-202e 203f-2040 2054 2060-206f
; Group 4, minus characters under C11NOSTART
2070-20cf 2100-218f 2460-24ff 2776-2793 2c00-2dff 2e80-2fff
; Group 5
3004-3007 3021-302f 3031-303f
; Group 6
3040-d7ff
; Group 7, minus characters under C11NOSTART
f900-fd3d fd40-fdcf fdf0-fe1f fe30-fe44 fe47-fffd
; Group 8
10000-1fffd 20000-2fffd 30000-3fffd 40000-4fffd 50000-5fffd
60000-6fffd 70000-7fffd 80000-8fffd 90000-9fffd a0000-afffd
b0000-bfffd c0000-cfffd d0000-dfffd e0000-efffd
[C11NOSTART]
; Group 1
0300-036f 1dc0-1dff 20d0-20ff fe20-fe2f