ucnid-2011-1.c: New test.
gcc/testsuite: * c-c++-common/cpp/ucnid-2011-1.c: New test. libcpp: * ucnid.tab: Add C11 and C11NOSTART data. * makeucnid.c (digit): Rename enum value to N99. (C11, N11, all_languages): New enum values. (NUM_CODE_POINTS, MAX_CODE_POINT): New macros. (flags, decomp, combining_value): Use NUM_CODE_POINTS as array size. (decomp): Use unsigned int as element type. (all_decomp): New array. (read_ucnid): Handle C11 and C11NOSTART. Use MAX_CODE_POINT. (read_table): Use MAX_CODE_POINT. Store all decompositions in all_decomp. (read_derived): Use MAX_CODE_POINT. (write_table): Use NUM_CODE_POINTS. Print N99, C11 and N11 flags. Print whole array variable declaration rather than just array contents. (char_id_valid, write_context_switch): New functions. (main): Call write_context_switch. * ucnid.h: Regenerate. * include/cpplib.h (struct cpp_options): Add c11_identifiers. * init.c (struct lang_flags): Add c11_identifiers. (cpp_set_lang): Set c11_identifiers option from selected language. * internal.h (struct normalize_state): Document "previous" as previous starter character. (NORMALIZE_STATE_UPDATE_IDNUM): Take character as argument. * charset.c (DIG): Rename enum value to N99. (C11, N11): New enum values. (struct ucnrange): Give name to struct. Use short for flags and unsigned int for end of range. Include ucnid.h for whole variable declaration. (ucn_valid_in_identifier): Allow for characters up to 0x10FFFF. Allow for C11 in determining valid characters and valid start characters. Use check_nfc for non-Hangul context-dependent checks. Only store starter characters in nst->previous. (_cpp_valid_ucn): Pass new argument to NORMALIZE_STATE_UPDATE_IDNUM. * lex.c (lex_identifier): Pass new argument to NORMALIZE_STATE_UPDATE_IDNUM. Call NORMALIZE_STATE_UPDATE_IDNUM after initial non-UCN part of identifier. (lex_number): Pass new argument to NORMALIZE_STATE_UPDATE_IDNUM. From-SVN: r204886
This commit is contained in:
parent
3d053a5f72
commit
d3f4ff8b51
@ -1,3 +1,7 @@
|
||||
2013-11-15 Joseph Myers <joseph@codesourcery.com>
|
||||
|
||||
* c-c++-common/cpp/ucnid-2011-1.c: New test.
|
||||
|
||||
2013-11-15 Paolo Carlini <paolo.carlini@oracle.com>
|
||||
|
||||
PR c++/58188
|
||||
|
15
gcc/testsuite/c-c++-common/cpp/ucnid-2011-1.c
Normal file
15
gcc/testsuite/c-c++-common/cpp/ucnid-2011-1.c
Normal file
@ -0,0 +1,15 @@
|
||||
/* { dg-do preprocess } */
|
||||
/* { dg-options "-std=c11 -pedantic -fextended-identifiers" { target c } } */
|
||||
/* { dg-options "-std=c++11 -pedantic -fextended-identifiers" { target c++ } } */
|
||||
|
||||
\u00A8
|
||||
|
||||
B\u0300
|
||||
|
||||
\u0300 /* { dg-error "not valid at the start of an identifier" } */
|
||||
|
||||
A\u0300 /* { dg-warning "not in NFC" } */
|
||||
|
||||
\U00010000
|
||||
\U0001FFFD
|
||||
\U000E1234
|
@ -1,3 +1,45 @@
|
||||
2013-11-15 Joseph Myers <joseph@codesourcery.com>
|
||||
|
||||
* ucnid.tab: Add C11 and C11NOSTART data.
|
||||
* makeucnid.c (digit): Rename enum value to N99.
|
||||
(C11, N11, all_languages): New enum values.
|
||||
(NUM_CODE_POINTS, MAX_CODE_POINT): New macros.
|
||||
(flags, decomp, combining_value): Use NUM_CODE_POINTS as array
|
||||
size.
|
||||
(decomp): Use unsigned int as element type.
|
||||
(all_decomp): New array.
|
||||
(read_ucnid): Handle C11 and C11NOSTART. Use MAX_CODE_POINT.
|
||||
(read_table): Use MAX_CODE_POINT. Store all decompositions in
|
||||
all_decomp.
|
||||
(read_derived): Use MAX_CODE_POINT.
|
||||
(write_table): Use NUM_CODE_POINTS. Print N99, C11 and N11
|
||||
flags. Print whole array variable declaration rather than just
|
||||
array contents.
|
||||
(char_id_valid, write_context_switch): New functions.
|
||||
(main): Call write_context_switch.
|
||||
* ucnid.h: Regenerate.
|
||||
* include/cpplib.h (struct cpp_options): Add c11_identifiers.
|
||||
* init.c (struct lang_flags): Add c11_identifiers.
|
||||
(cpp_set_lang): Set c11_identifiers option from selected language.
|
||||
* internal.h (struct normalize_state): Document "previous" as
|
||||
previous starter character.
|
||||
(NORMALIZE_STATE_UPDATE_IDNUM): Take character as argument.
|
||||
* charset.c (DIG): Rename enum value to N99.
|
||||
(C11, N11): New enum values.
|
||||
(struct ucnrange): Give name to struct. Use short for flags and
|
||||
unsigned int for end of range. Include ucnid.h for whole variable
|
||||
declaration.
|
||||
(ucn_valid_in_identifier): Allow for characters up to 0x10FFFF.
|
||||
Allow for C11 in determining valid characters and valid start
|
||||
characters. Use check_nfc for non-Hangul context-dependent
|
||||
checks. Only store starter characters in nst->previous.
|
||||
(_cpp_valid_ucn): Pass new argument to
|
||||
NORMALIZE_STATE_UPDATE_IDNUM.
|
||||
* lex.c (lex_identifier): Pass new argument to
|
||||
NORMALIZE_STATE_UPDATE_IDNUM. Call NORMALIZE_STATE_UPDATE_IDNUM
|
||||
after initial non-UCN part of identifier.
|
||||
(lex_number): Pass new argument to NORMALIZE_STATE_UPDATE_IDNUM.
|
||||
|
||||
2013-11-15 Joseph Myers <joseph@codesourcery.com>
|
||||
|
||||
* ucnid.tab: Mark C99 digits as [C99DIG].
|
||||
|
@ -828,29 +828,32 @@ enum {
|
||||
/* Valid in a C99 identifier? */
|
||||
C99 = 1,
|
||||
/* Valid in a C99 identifier, but not as the first character? */
|
||||
DIG = 2,
|
||||
N99 = 2,
|
||||
/* Valid in a C++ identifier? */
|
||||
CXX = 4,
|
||||
/* Valid in a C11/C++11 identifier? */
|
||||
C11 = 8,
|
||||
/* Valid in a C11/C++11 identifier, but not as the first character? */
|
||||
N11 = 16,
|
||||
/* NFC representation is not valid in an identifier? */
|
||||
CID = 8,
|
||||
CID = 32,
|
||||
/* Might be valid NFC form? */
|
||||
NFC = 16,
|
||||
NFC = 64,
|
||||
/* Might be valid NFKC form? */
|
||||
NKC = 32,
|
||||
NKC = 128,
|
||||
/* Certain preceding characters might make it not valid NFC/NFKC form? */
|
||||
CTX = 64
|
||||
CTX = 256
|
||||
};
|
||||
|
||||
static const struct {
|
||||
struct ucnrange {
|
||||
/* Bitmap of flags above. */
|
||||
unsigned char flags;
|
||||
unsigned short flags;
|
||||
/* Combining class of the character. */
|
||||
unsigned char combine;
|
||||
/* Last character in the range described by this entry. */
|
||||
unsigned short end;
|
||||
} ucnranges[] = {
|
||||
#include "ucnid.h"
|
||||
unsigned int end;
|
||||
};
|
||||
#include "ucnid.h"
|
||||
|
||||
/* Returns 1 if C is valid in an identifier, 2 if C is valid except at
|
||||
the start of an identifier, and 0 if C is not valid in an
|
||||
@ -864,8 +867,9 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
|
||||
struct normalize_state *nst)
|
||||
{
|
||||
int mn, mx, md;
|
||||
unsigned short valid_flags, invalid_start_flags;
|
||||
|
||||
if (c > 0xFFFF)
|
||||
if (c > 0x10FFFF)
|
||||
return 0;
|
||||
|
||||
mn = 0;
|
||||
@ -881,15 +885,25 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
|
||||
|
||||
/* When -pedantic, we require the character to have been listed by
|
||||
the standard for the current language. Otherwise, we accept the
|
||||
union of the acceptable sets for C++98 and C99. */
|
||||
if (! (ucnranges[mn].flags & (C99 | CXX)))
|
||||
union of the acceptable sets for all supported language versions. */
|
||||
valid_flags = C99 | CXX | C11;
|
||||
if (CPP_PEDANTIC (pfile))
|
||||
{
|
||||
if (CPP_OPTION (pfile, c11_identifiers))
|
||||
valid_flags = C11;
|
||||
else if (CPP_OPTION (pfile, c99))
|
||||
valid_flags = C99;
|
||||
else if (CPP_OPTION (pfile, cplusplus))
|
||||
valid_flags = CXX;
|
||||
}
|
||||
if (! (ucnranges[mn].flags & valid_flags))
|
||||
return 0;
|
||||
|
||||
if (CPP_PEDANTIC (pfile)
|
||||
&& ((CPP_OPTION (pfile, c99) && !(ucnranges[mn].flags & C99))
|
||||
|| (CPP_OPTION (pfile, cplusplus)
|
||||
&& !(ucnranges[mn].flags & CXX))))
|
||||
return 0;
|
||||
if (CPP_OPTION (pfile, c11_identifiers))
|
||||
invalid_start_flags = N11;
|
||||
else if (CPP_OPTION (pfile, c99))
|
||||
invalid_start_flags = N99;
|
||||
else
|
||||
invalid_start_flags = 0;
|
||||
|
||||
/* Update NST. */
|
||||
if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class)
|
||||
@ -899,17 +913,6 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
|
||||
bool safe;
|
||||
cppchar_t p = nst->previous;
|
||||
|
||||
/* Easy cases from Bengali, Oriya, Tamil, Kannada, and Malayalam. */
|
||||
if (c == 0x09BE)
|
||||
safe = p != 0x09C7; /* Use 09CB instead of 09C7 09BE. */
|
||||
else if (c == 0x0B3E)
|
||||
safe = p != 0x0B47; /* Use 0B4B instead of 0B47 0B3E. */
|
||||
else if (c == 0x0BBE)
|
||||
safe = p != 0x0BC6 && p != 0x0BC7; /* Use 0BCA/0BCB instead. */
|
||||
else if (c == 0x0CC2)
|
||||
safe = p != 0x0CC6; /* Use 0CCA instead of 0CC6 0CC2. */
|
||||
else if (c == 0x0D3E)
|
||||
safe = p != 0x0D46 && p != 0x0D47; /* Use 0D4A/0D4B instead. */
|
||||
/* For Hangul, characters in the range AC00-D7A3 are NFC/NFKC,
|
||||
and are combined algorithmically from a sequence of the form
|
||||
1100-1112 1161-1175 11A8-11C2
|
||||
@ -917,20 +920,19 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
|
||||
really a valid character).
|
||||
Unfortunately, C99 allows (only) the NFC form, but C++ allows
|
||||
only the combining characters. */
|
||||
else if (c >= 0x1161 && c <= 0x1175)
|
||||
if (c >= 0x1161 && c <= 0x1175)
|
||||
safe = p < 0x1100 || p > 0x1112;
|
||||
else if (c >= 0x11A8 && c <= 0x11C2)
|
||||
safe = (p < 0xAC00 || p > 0xD7A3 || (p - 0xAC00) % 28 != 0);
|
||||
else
|
||||
safe = check_nfc (pfile, c, p);
|
||||
if (!safe)
|
||||
{
|
||||
/* Uh-oh, someone updated ucnid.h without updating this code. */
|
||||
cpp_error (pfile, CPP_DL_ICE, "Character %x might not be NFKC", c);
|
||||
safe = true;
|
||||
if ((c >= 0x1161 && c <= 0x1175) || (c >= 0x11A8 && c <= 0x11C2))
|
||||
nst->level = MAX (nst->level, normalized_identifier_C);
|
||||
else
|
||||
nst->level = normalized_none;
|
||||
}
|
||||
if (!safe && c < 0x1161)
|
||||
nst->level = normalized_none;
|
||||
else if (!safe)
|
||||
nst->level = MAX (nst->level, normalized_identifier_C);
|
||||
}
|
||||
else if (ucnranges[mn].flags & NKC)
|
||||
;
|
||||
@ -940,11 +942,13 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
|
||||
nst->level = MAX (nst->level, normalized_identifier_C);
|
||||
else
|
||||
nst->level = normalized_none;
|
||||
nst->previous = c;
|
||||
if (ucnranges[mn].combine == 0)
|
||||
nst->previous = c;
|
||||
nst->prev_class = ucnranges[mn].combine;
|
||||
|
||||
/* In C99, UCN digits may not begin identifiers. */
|
||||
if (CPP_OPTION (pfile, c99) && (ucnranges[mn].flags & DIG))
|
||||
/* In C99, UCN digits may not begin identifiers. In C11 and C++11,
|
||||
UCN combining characters may not begin identifiers. */
|
||||
if (ucnranges[mn].flags & invalid_start_flags)
|
||||
return 2;
|
||||
|
||||
return 1;
|
||||
@ -1054,7 +1058,7 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
|
||||
CPP_OPTION (pfile, warn_dollars) = 0;
|
||||
cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
|
||||
}
|
||||
NORMALIZE_STATE_UPDATE_IDNUM (nst);
|
||||
NORMALIZE_STATE_UPDATE_IDNUM (nst, result);
|
||||
}
|
||||
else if (identifier_pos)
|
||||
{
|
||||
|
@ -437,6 +437,10 @@ struct cpp_options
|
||||
literal number suffixes as user-defined literal number suffixes. */
|
||||
unsigned char ext_numeric_literals;
|
||||
|
||||
/* Nonzero means extended identifiers allow the characters specified
|
||||
in C11 and C++11. */
|
||||
unsigned char c11_identifiers;
|
||||
|
||||
/* Nonzero for C++ 2014 Standard binary constants. */
|
||||
unsigned char binary_constants;
|
||||
|
||||
|
@ -77,6 +77,7 @@ struct lang_flags
|
||||
char cplusplus;
|
||||
char extended_numbers;
|
||||
char extended_identifiers;
|
||||
char c11_identifiers;
|
||||
char std;
|
||||
char cplusplus_comments;
|
||||
char digraphs;
|
||||
@ -88,21 +89,21 @@ struct lang_flags
|
||||
};
|
||||
|
||||
static const struct lang_flags lang_defaults[] =
|
||||
{ /* c99 c++ xnum xid std // digr ulit rlit udlit bin_cst dig_sep */
|
||||
/* GNUC89 */ { 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0 },
|
||||
/* GNUC99 */ { 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0 },
|
||||
/* GNUC11 */ { 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0 },
|
||||
/* STDC89 */ { 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 },
|
||||
/* STDC94 */ { 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0 },
|
||||
/* STDC99 */ { 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0 },
|
||||
/* STDC11 */ { 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0 },
|
||||
/* GNUCXX */ { 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0 },
|
||||
/* CXX98 */ { 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0 },
|
||||
/* GNUCXX11 */ { 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0 },
|
||||
/* CXX11 */ { 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0 },
|
||||
/* GNUCXX1Y */ { 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1 },
|
||||
/* CXX1Y */ { 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1 },
|
||||
/* ASM */ { 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0 }
|
||||
{ /* c99 c++ xnum xid c11 std // digr ulit rlit udlit bin_cst dig_sep */
|
||||
/* GNUC89 */ { 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0 },
|
||||
/* GNUC99 */ { 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0 },
|
||||
/* GNUC11 */ { 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0 },
|
||||
/* STDC89 */ { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 },
|
||||
/* STDC94 */ { 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0 },
|
||||
/* STDC99 */ { 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0 },
|
||||
/* STDC11 */ { 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0 },
|
||||
/* GNUCXX */ { 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0 },
|
||||
/* CXX98 */ { 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0 },
|
||||
/* GNUCXX11 */ { 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0 },
|
||||
/* CXX11 */ { 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0 },
|
||||
/* GNUCXX1Y */ { 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1 },
|
||||
/* CXX1Y */ { 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
|
||||
/* ASM */ { 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 }
|
||||
/* xid should be 1 for GNUC99, STDC99, GNUCXX, CXX98, GNUCXX11, CXX11,
|
||||
GNUCXX1Y, and CXX1Y when no longer experimental (when all uses of
|
||||
identifiers in the compiler have been audited for correct handling
|
||||
@ -121,6 +122,7 @@ cpp_set_lang (cpp_reader *pfile, enum c_lang lang)
|
||||
CPP_OPTION (pfile, cplusplus) = l->cplusplus;
|
||||
CPP_OPTION (pfile, extended_numbers) = l->extended_numbers;
|
||||
CPP_OPTION (pfile, extended_identifiers) = l->extended_identifiers;
|
||||
CPP_OPTION (pfile, c11_identifiers) = l->c11_identifiers;
|
||||
CPP_OPTION (pfile, std) = l->std;
|
||||
CPP_OPTION (pfile, trigraphs) = l->std;
|
||||
CPP_OPTION (pfile, cplusplus_comments) = l->cplusplus_comments;
|
||||
|
@ -713,9 +713,10 @@ extern size_t _cpp_replacement_text_len (const cpp_macro *);
|
||||
|
||||
struct normalize_state
|
||||
{
|
||||
/* The previous character. */
|
||||
/* The previous starter character. */
|
||||
cppchar_t previous;
|
||||
/* The combining class of the previous character. */
|
||||
/* The combining class of the previous character (whether or not a
|
||||
starter). */
|
||||
unsigned char prev_class;
|
||||
/* The lowest normalization level so far. */
|
||||
enum cpp_normalize_level level;
|
||||
@ -723,10 +724,10 @@ struct normalize_state
|
||||
#define INITIAL_NORMALIZE_STATE { 0, 0, normalized_KC }
|
||||
#define NORMALIZE_STATE_RESULT(st) ((st)->level)
|
||||
|
||||
/* We saw a character that matches ISIDNUM(), update a
|
||||
/* We saw a character C that matches ISIDNUM(), update a
|
||||
normalize_state appropriately. */
|
||||
#define NORMALIZE_STATE_UPDATE_IDNUM(st) \
|
||||
((st)->previous = 0, (st)->prev_class = 0)
|
||||
#define NORMALIZE_STATE_UPDATE_IDNUM(st, c) \
|
||||
((st)->previous = (c), (st)->prev_class = 0)
|
||||
|
||||
extern cppchar_t _cpp_valid_ucn (cpp_reader *, const unsigned char **,
|
||||
const unsigned char *, int,
|
||||
|
17
libcpp/lex.c
17
libcpp/lex.c
@ -1204,11 +1204,14 @@ lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
|
||||
|
||||
cur = pfile->buffer->cur;
|
||||
if (! starts_ucn)
|
||||
while (ISIDNUM (*cur))
|
||||
{
|
||||
hash = HT_HASHSTEP (hash, *cur);
|
||||
cur++;
|
||||
}
|
||||
{
|
||||
while (ISIDNUM (*cur))
|
||||
{
|
||||
hash = HT_HASHSTEP (hash, *cur);
|
||||
cur++;
|
||||
}
|
||||
NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
|
||||
}
|
||||
pfile->buffer->cur = cur;
|
||||
if (starts_ucn || forms_identifier_p (pfile, false, nst))
|
||||
{
|
||||
@ -1216,8 +1219,8 @@ lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
|
||||
do {
|
||||
while (ISIDNUM (*pfile->buffer->cur))
|
||||
{
|
||||
NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
|
||||
pfile->buffer->cur++;
|
||||
NORMALIZE_STATE_UPDATE_IDNUM (nst);
|
||||
}
|
||||
} while (forms_identifier_p (pfile, false, nst));
|
||||
result = _cpp_interpret_identifier (pfile, base,
|
||||
@ -1277,8 +1280,8 @@ lex_number (cpp_reader *pfile, cpp_string *number,
|
||||
while (ISIDNUM (*cur) || *cur == '.' || DIGIT_SEP (*cur)
|
||||
|| VALID_SIGN (*cur, cur[-1]))
|
||||
{
|
||||
NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
|
||||
cur++;
|
||||
NORMALIZE_STATE_UPDATE_IDNUM (nst);
|
||||
}
|
||||
|
||||
pfile->buffer->cur = cur;
|
||||
|
@ -29,15 +29,22 @@ along with this program; see the file COPYING3. If not see
|
||||
enum {
|
||||
C99 = 1,
|
||||
CXX = 2,
|
||||
digit = 4,
|
||||
not_NFC = 8,
|
||||
not_NFKC = 16,
|
||||
maybe_not_NFC = 32
|
||||
N99 = 4,
|
||||
C11 = 8,
|
||||
N11 = 16,
|
||||
all_languages = C99 | CXX | C11,
|
||||
not_NFC = 32,
|
||||
not_NFKC = 64,
|
||||
maybe_not_NFC = 128
|
||||
};
|
||||
|
||||
static unsigned flags[65536];
|
||||
static unsigned short decomp[65536][2];
|
||||
static unsigned char combining_value[65536];
|
||||
#define NUM_CODE_POINTS 0x110000
|
||||
#define MAX_CODE_POINT 0x10ffff
|
||||
|
||||
static unsigned flags[NUM_CODE_POINTS];
|
||||
static unsigned int all_decomp[NUM_CODE_POINTS][2];
|
||||
static unsigned int decomp[NUM_CODE_POINTS][2];
|
||||
static unsigned char combining_value[NUM_CODE_POINTS];
|
||||
|
||||
/* Die! */
|
||||
|
||||
@ -48,7 +55,7 @@ fail (const char *s)
|
||||
exit (1);
|
||||
}
|
||||
|
||||
/* Read ucnid.tab and set the C99 and CXX flags in header[]. */
|
||||
/* Read ucnid.tab and set the flags for language versions in flags[]. */
|
||||
|
||||
static void
|
||||
read_ucnid (const char *fname)
|
||||
@ -66,10 +73,14 @@ read_ucnid (const char *fname)
|
||||
break;
|
||||
if (strcmp (line, "[C99]\n") == 0)
|
||||
fl = C99;
|
||||
if (strcmp (line, "[C99DIG]\n") == 0)
|
||||
fl = C99|digit;
|
||||
else if (strcmp (line, "[C99DIG]\n") == 0)
|
||||
fl = C99|N99;
|
||||
else if (strcmp (line, "[CXX]\n") == 0)
|
||||
fl = CXX;
|
||||
else if (strcmp (line, "[C11]\n") == 0)
|
||||
fl = C11;
|
||||
else if (strcmp (line, "[C11NOSTART]\n") == 0)
|
||||
fl = C11|N11;
|
||||
else if (isxdigit (line[0]))
|
||||
{
|
||||
char *l = line;
|
||||
@ -94,7 +105,7 @@ read_ucnid (const char *fname)
|
||||
}
|
||||
while (isspace (*l))
|
||||
l++;
|
||||
if (end > 0xFFFF)
|
||||
if (end > MAX_CODE_POINT)
|
||||
fail ("parsing ucnid.tab, end too large");
|
||||
while (start <= end)
|
||||
flags[start++] |= fl;
|
||||
@ -108,8 +119,10 @@ read_ucnid (const char *fname)
|
||||
|
||||
/* Read UnicodeData.txt and fill in the 'decomp' table to be the
|
||||
decompositions of characters for which both the character
|
||||
decomposed and all the code points in the decomposition are either
|
||||
C99 or CXX. */
|
||||
decomposed and all the code points in the decomposition are valid
|
||||
for some supported language version, and the 'all_decomp' table to
|
||||
be the decompositions of all characters without those
|
||||
constraints. */
|
||||
|
||||
static void
|
||||
read_table (char *fname)
|
||||
@ -123,7 +136,7 @@ read_table (char *fname)
|
||||
char line[256];
|
||||
unsigned long codepoint, this_decomp[4];
|
||||
char *l;
|
||||
int i;
|
||||
int i, j;
|
||||
int decomp_useful;
|
||||
|
||||
if (!fgets (line, sizeof (line), f))
|
||||
@ -131,8 +144,8 @@ read_table (char *fname)
|
||||
codepoint = strtoul (line, &l, 16);
|
||||
if (l == line || *l != ';')
|
||||
fail ("parsing UnicodeData.txt, reading code point");
|
||||
if (codepoint > 0xffff || ! (flags[codepoint] & (C99 | CXX)))
|
||||
continue;
|
||||
if (codepoint > MAX_CODE_POINT)
|
||||
fail ("parsing UnicodeData.txt, code point too large");
|
||||
|
||||
do {
|
||||
l++;
|
||||
@ -171,7 +184,9 @@ read_table (char *fname)
|
||||
}
|
||||
if (i > 2) /* Decomposition too long. */
|
||||
fail ("parsing UnicodeData.txt, decomposition too long");
|
||||
if (decomp_useful)
|
||||
for (j = 0; j < i; j++)
|
||||
all_decomp[codepoint][j] = this_decomp[j];
|
||||
if ((flags[codepoint] & all_languages) && decomp_useful)
|
||||
while (--i >= 0)
|
||||
decomp[codepoint][i] = this_decomp[i];
|
||||
}
|
||||
@ -208,8 +223,8 @@ read_derived (const char *fname)
|
||||
start = strtoul (line, &l, 16);
|
||||
if (l == line)
|
||||
fail ("parsing DerivedNormalizationProps.txt, reading start");
|
||||
if (start > 0xffff)
|
||||
continue;
|
||||
if (start > MAX_CODE_POINT)
|
||||
fail ("parsing DerivedNormalizationProps.txt, code point too large");
|
||||
if (*l == '.' && l[1] == '.')
|
||||
end = strtoul (l + 2, &l, 16);
|
||||
else
|
||||
@ -237,17 +252,21 @@ write_table (void)
|
||||
unsigned last_flag = flags[0];
|
||||
bool really_safe = decomp[0][0] == 0;
|
||||
unsigned char last_combine = combining_value[0];
|
||||
|
||||
printf ("static const struct ucnrange ucnranges[] = {\n");
|
||||
|
||||
for (i = 1; i <= 65536; i++)
|
||||
if (i == 65536
|
||||
|| (flags[i] != last_flag && ((flags[i] | last_flag) & (C99 | CXX)))
|
||||
for (i = 1; i <= NUM_CODE_POINTS; i++)
|
||||
if (i == NUM_CODE_POINTS
|
||||
|| (flags[i] != last_flag && ((flags[i] | last_flag) & all_languages))
|
||||
|| really_safe != (decomp[i][0] == 0)
|
||||
|| combining_value[i] != last_combine)
|
||||
{
|
||||
printf ("{ %s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n",
|
||||
printf ("{ %s|%s|%s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n",
|
||||
last_flag & C99 ? "C99" : " 0",
|
||||
last_flag & digit ? "DIG" : " 0",
|
||||
last_flag & N99 ? "N99" : " 0",
|
||||
last_flag & CXX ? "CXX" : " 0",
|
||||
last_flag & C11 ? "C11" : " 0",
|
||||
last_flag & N11 ? "N11" : " 0",
|
||||
really_safe ? "CID" : " 0",
|
||||
last_flag & not_NFC ? " 0" : "NFC",
|
||||
last_flag & not_NFKC ? " 0" : "NKC",
|
||||
@ -258,6 +277,98 @@ write_table (void)
|
||||
last_combine = combining_value[0];
|
||||
really_safe = decomp[i][0] == 0;
|
||||
}
|
||||
|
||||
printf ("};\n");
|
||||
}
|
||||
|
||||
/* Return whether a given character is valid in an identifier for some
|
||||
supported language, either as itself or as a UCN. */
|
||||
|
||||
static bool
|
||||
char_id_valid (unsigned int c)
|
||||
{
|
||||
return ((flags[c] & all_languages)
|
||||
|| (c == 0x24)
|
||||
|| (c >= 0x30 && c <= 0x39)
|
||||
|| (c >= 0x41 && c <= 0x5a)
|
||||
|| (c >= 0x61 && c <= 0x7a));
|
||||
}
|
||||
|
||||
/* Write out the switch statement over characters for which it is
|
||||
context-dependent whether they are in NFC. */
|
||||
|
||||
static void
|
||||
write_context_switch (void)
|
||||
{
|
||||
unsigned i;
|
||||
printf ("static bool\n"
|
||||
"check_nfc (cpp_reader *pfile, cppchar_t c, cppchar_t p)\n"
|
||||
"{\n"
|
||||
" switch (c)\n"
|
||||
" {\n");
|
||||
for (i = 0; i < NUM_CODE_POINTS; i++)
|
||||
{
|
||||
bool found_case = false;
|
||||
unsigned j;
|
||||
if (!(flags[i] & all_languages) || !(flags[i] & maybe_not_NFC))
|
||||
continue;
|
||||
if ((i >= 0x1161 && i <= 0x1175) || (i >= 0x11A8 && i <= 0x11C2))
|
||||
continue; /* Hangul handled algorithmically. */
|
||||
printf (" case %#06x:\n"
|
||||
" switch (p)\n"
|
||||
"\t{\n", i);
|
||||
/* If an NFC starter character decomposes with this character I
|
||||
as the second character and an NFC starter character S as the
|
||||
first character, that latter character as a previous
|
||||
character means this character is not NFC. Furthermore, any
|
||||
NFC starter character K made by a series of compositions of S
|
||||
with combining characters whose combining class is greater
|
||||
than that of I also means this character is not NFC. */
|
||||
for (j = 0; j < NUM_CODE_POINTS; j++)
|
||||
{
|
||||
unsigned s, k;
|
||||
if (all_decomp[j][1] != i)
|
||||
continue;
|
||||
s = all_decomp[j][0];
|
||||
if (combining_value[s] != 0 || (flags[s] & not_NFC) != 0)
|
||||
continue;
|
||||
if (char_id_valid (s))
|
||||
{
|
||||
found_case = true;
|
||||
printf ("\tcase %#06x:\n", s);
|
||||
}
|
||||
for (k = 0; k < NUM_CODE_POINTS; k++)
|
||||
{
|
||||
unsigned t = k;
|
||||
if (k == s || !char_id_valid (k))
|
||||
continue;
|
||||
while (all_decomp[t][1] != 0
|
||||
&& combining_value[all_decomp[t][1]] > combining_value[i])
|
||||
{
|
||||
if (combining_value[t] != 0 || (flags[t] & not_NFC) != 0)
|
||||
break;
|
||||
t = all_decomp[t][0];
|
||||
}
|
||||
if (t == s)
|
||||
{
|
||||
found_case = true;
|
||||
printf ("\tcase %#06x:\n", k);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (found_case)
|
||||
printf ("\t return false;\n");
|
||||
else
|
||||
printf ("\t/* Non-NFC cases not applicable to C/C++. */\n");
|
||||
printf ("\tdefault:\n"
|
||||
"\t return true;\n"
|
||||
"\t}\n\n");
|
||||
}
|
||||
printf (" default:\n"
|
||||
" cpp_error (pfile, CPP_DL_ICE, \"Character %%x might not be NFKC\", c);\n"
|
||||
" return true;\n"
|
||||
" }\n"
|
||||
"}\n");
|
||||
}
|
||||
|
||||
/* Print out the huge copyright notice. */
|
||||
@ -336,5 +447,6 @@ main(int argc, char ** argv)
|
||||
|
||||
write_copyright ();
|
||||
write_table ();
|
||||
write_context_switch ();
|
||||
return 0;
|
||||
}
|
||||
|
5216
libcpp/ucnid.h
5216
libcpp/ucnid.h
File diff suppressed because it is too large
Load Diff
@ -19,7 +19,8 @@
|
||||
; D, which is itself a reproduction from ISO/IEC TR 10176:1998, and
|
||||
; the similar table from ISO/IEC 14882:1998 (C++98) Annex E, which is
|
||||
; a reproduction of ISO/IEC PDTR 10176. Unfortunately these tables
|
||||
; are not identical.
|
||||
; are not identical. It also reproduces the somewhat different tables
|
||||
; in C11 and C++11, which are identical to each other.
|
||||
|
||||
[C99]
|
||||
|
||||
@ -209,3 +210,34 @@ fbd3-fd3f fd50-fd8f fd92-fdc7 fdf0-fdfb fe70-fe72 fe74 fe76-fefc
|
||||
ff21-ff3a ff41-ff5a ff66-ffbe ffc2-ffc7 ffca-ffcf ffd2-ffd7
|
||||
ffda-ffdc 4e00-9fa5
|
||||
|
||||
[C11]
|
||||
; Group 1
|
||||
00a8 00aa 00ad 00af 00b2-00b5 00b7-00ba 00bc-00be 00c0-00d6 00d8-00f6
|
||||
00f8-00ff
|
||||
|
||||
; Group 2, minus characters under C11NOSTART
|
||||
0100-02ff 0370-167f 1681-180d 180f-1dbf 1e00-1fff
|
||||
|
||||
; Group 3
|
||||
200b-200d 202a-202e 203f-2040 2054 2060-206f
|
||||
|
||||
; Group 4, minus characters under C11NOSTART
|
||||
2070-20cf 2100-218f 2460-24ff 2776-2793 2c00-2dff 2e80-2fff
|
||||
|
||||
; Group 5
|
||||
3004-3007 3021-302f 3031-303f
|
||||
|
||||
; Group 6
|
||||
3040-d7ff
|
||||
|
||||
; Group 7, minus characters under C11NOSTART
|
||||
f900-fd3d fd40-fdcf fdf0-fe1f fe30-fe44 fe47-fffd
|
||||
|
||||
; Group 8
|
||||
10000-1fffd 20000-2fffd 30000-3fffd 40000-4fffd 50000-5fffd
|
||||
60000-6fffd 70000-7fffd 80000-8fffd 90000-9fffd a0000-afffd
|
||||
b0000-bfffd c0000-cfffd d0000-dfffd e0000-efffd
|
||||
|
||||
[C11NOSTART]
|
||||
; Group 1
|
||||
0300-036f 1dc0-1dff 20d0-20ff fe20-fe2f
|
||||
|
Loading…
Reference in New Issue
Block a user