ucnid-2011-1.c: New test.

gcc/testsuite: * c-c++-common/cpp/ucnid-2011-1.c: New test. libcpp: * ucnid.tab: Add C11 and C11NOSTART data. * makeucnid.c (digit): Rename enum value to N99. (C11, N11, all_languages): New enum values. (NUM_CODE_POINTS, MAX_CODE_POINT): New macros. (flags, decomp, combining_value): Use NUM_CODE_POINTS as array size. (decomp): Use unsigned int as element type. (all_decomp): New array. (read_ucnid): Handle C11 and C11NOSTART. Use MAX_CODE_POINT. (read_table): Use MAX_CODE_POINT. Store all decompositions in all_decomp. (read_derived): Use MAX_CODE_POINT. (write_table): Use NUM_CODE_POINTS. Print N99, C11 and N11 flags. Print whole array variable declaration rather than just array contents. (char_id_valid, write_context_switch): New functions. (main): Call write_context_switch. * ucnid.h: Regenerate. * include/cpplib.h (struct cpp_options): Add c11_identifiers. * init.c (struct lang_flags): Add c11_identifiers. (cpp_set_lang): Set c11_identifiers option from selected language. * internal.h (struct normalize_state): Document "previous" as previous starter character. (NORMALIZE_STATE_UPDATE_IDNUM): Take character as argument. * charset.c (DIG): Rename enum value to N99. (C11, N11): New enum values. (struct ucnrange): Give name to struct. Use short for flags and unsigned int for end of range. Include ucnid.h for whole variable declaration. (ucn_valid_in_identifier): Allow for characters up to 0x10FFFF. Allow for C11 in determining valid characters and valid start characters. Use check_nfc for non-Hangul context-dependent checks. Only store starter characters in nst->previous. (_cpp_valid_ucn): Pass new argument to NORMALIZE_STATE_UPDATE_IDNUM. * lex.c (lex_identifier): Pass new argument to NORMALIZE_STATE_UPDATE_IDNUM. Call NORMALIZE_STATE_UPDATE_IDNUM after initial non-UCN part of identifier. (lex_number): Pass new argument to NORMALIZE_STATE_UPDATE_IDNUM. From-SVN: r204886
2013-11-16 00:05:08 +00:00 · 2013-11-16 00:05:08 +00:00 · d3f4ff8b51
commit d3f4ff8b51
parent 3d053a5f72
11 changed files with 4783 additions and 840 deletions
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@ -1,3 +1,7 @@
+2013-11-15  Joseph Myers  <joseph@codesourcery.com>
+
+	* c-c++-common/cpp/ucnid-2011-1.c: New test.
+
 2013-11-15  Paolo Carlini  <paolo.carlini@oracle.com>

 	PR c++/58188
--- a/gcc/testsuite/c-c++-common/cpp/ucnid-2011-1.c
+++ b/gcc/testsuite/c-c++-common/cpp/ucnid-2011-1.c
@ -0,0 +1,15 @@
+/* { dg-do preprocess } */
+/* { dg-options "-std=c11 -pedantic -fextended-identifiers" { target c } } */
+/* { dg-options "-std=c++11 -pedantic -fextended-identifiers" { target c++ } } */
+
+\u00A8
+
+B\u0300
+
+\u0300 /* { dg-error "not valid at the start of an identifier" } */
+
+A\u0300 /* { dg-warning "not in NFC" } */
+
+\U00010000
+\U0001FFFD
+\U000E1234
--- a/libcpp/ChangeLog
+++ b/libcpp/ChangeLog
@ -1,3 +1,45 @@
+2013-11-15  Joseph Myers  <joseph@codesourcery.com>
+
+	* ucnid.tab: Add C11 and C11NOSTART data.
+	* makeucnid.c (digit): Rename enum value to N99.
+	(C11, N11, all_languages): New enum values.
+	(NUM_CODE_POINTS, MAX_CODE_POINT): New macros.
+	(flags, decomp, combining_value): Use NUM_CODE_POINTS as array
+	size.
+	(decomp): Use unsigned int as element type.
+	(all_decomp): New array.
+	(read_ucnid): Handle C11 and C11NOSTART.  Use MAX_CODE_POINT.
+	(read_table): Use MAX_CODE_POINT.  Store all decompositions in
+	all_decomp.
+	(read_derived): Use MAX_CODE_POINT.
+	(write_table): Use NUM_CODE_POINTS.  Print N99, C11 and N11
+	flags.  Print whole array variable declaration rather than just
+	array contents.
+	(char_id_valid, write_context_switch): New functions.
+	(main): Call write_context_switch.
+	* ucnid.h: Regenerate.
+	* include/cpplib.h (struct cpp_options): Add c11_identifiers.
+	* init.c (struct lang_flags): Add c11_identifiers.
+	(cpp_set_lang): Set c11_identifiers option from selected language.
+	* internal.h (struct normalize_state): Document "previous" as
+	previous starter character.
+	(NORMALIZE_STATE_UPDATE_IDNUM): Take character as argument.
+	* charset.c (DIG): Rename enum value to N99.
+	(C11, N11): New enum values.
+	(struct ucnrange): Give name to struct.  Use short for flags and
+	unsigned int for end of range.  Include ucnid.h for whole variable
+	declaration.
+	(ucn_valid_in_identifier): Allow for characters up to 0x10FFFF.
+	Allow for C11 in determining valid characters and valid start
+	characters.  Use check_nfc for non-Hangul context-dependent
+	checks.  Only store starter characters in nst->previous.
+	(_cpp_valid_ucn): Pass new argument to
+	NORMALIZE_STATE_UPDATE_IDNUM.
+	* lex.c (lex_identifier): Pass new argument to
+	NORMALIZE_STATE_UPDATE_IDNUM.  Call NORMALIZE_STATE_UPDATE_IDNUM
+	after initial non-UCN part of identifier.
+	(lex_number): Pass new argument to NORMALIZE_STATE_UPDATE_IDNUM.
+
 2013-11-15  Joseph Myers  <joseph@codesourcery.com>

 	* ucnid.tab: Mark C99 digits as [C99DIG].
--- a/libcpp/charset.c
+++ b/libcpp/charset.c
@ -828,29 +828,32 @@ enum {
  /* Valid in a C99 identifier?  */
  C99 = 1,
  /* Valid in a C99 identifier, but not as the first character?  */
-  DIG = 2,
+  N99 = 2,
  /* Valid in a C++ identifier?  */
  CXX = 4,
+  /* Valid in a C11/C++11 identifier?  */
+  C11 = 8,
+  /* Valid in a C11/C++11 identifier, but not as the first character?  */
+  N11 = 16,
  /* NFC representation is not valid in an identifier?  */
-  CID = 8,
+  CID = 32,
  /* Might be valid NFC form?  */
-  NFC = 16,
+  NFC = 64,
  /* Might be valid NFKC form?  */
-  NKC = 32,
+  NKC = 128,
  /* Certain preceding characters might make it not valid NFC/NFKC form?  */
-  CTX = 64
+  CTX = 256
 };

-static const struct {
+struct ucnrange {
  /* Bitmap of flags above.  */
-  unsigned char flags;
+  unsigned short flags;
  /* Combining class of the character.  */
  unsigned char combine;
  /* Last character in the range described by this entry.  */
-  unsigned short end;
-} ucnranges[] = {
-#include "ucnid.h"
+  unsigned int end;
 };
+#include "ucnid.h"

 /* Returns 1 if C is valid in an identifier, 2 if C is valid except at
   the start of an identifier, and 0 if C is not valid in an
@ -864,8 +867,9 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
 			 struct normalize_state *nst)
 {
  int mn, mx, md;
+  unsigned short valid_flags, invalid_start_flags;

-  if (c > 0xFFFF)
+  if (c > 0x10FFFF)
    return 0;

  mn = 0;
@ -881,15 +885,25 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,

  /* When -pedantic, we require the character to have been listed by
     the standard for the current language.  Otherwise, we accept the
-     union of the acceptable sets for C++98 and C99.  */
-  if (! (ucnranges[mn].flags & (C99 | CXX)))
+     union of the acceptable sets for all supported language versions.  */
+  valid_flags = C99 | CXX | C11;
+  if (CPP_PEDANTIC (pfile))
+    {
+      if (CPP_OPTION (pfile, c11_identifiers))
+	valid_flags = C11;
+      else if (CPP_OPTION (pfile, c99))
+	valid_flags = C99;
+      else if (CPP_OPTION (pfile, cplusplus))
+	valid_flags = CXX;
+    }
+  if (! (ucnranges[mn].flags & valid_flags))
      return 0;
-
-  if (CPP_PEDANTIC (pfile)
-      && ((CPP_OPTION (pfile, c99) && !(ucnranges[mn].flags & C99))
-	  || (CPP_OPTION (pfile, cplusplus)
-	      && !(ucnranges[mn].flags & CXX))))
-    return 0;
+  if (CPP_OPTION (pfile, c11_identifiers))
+    invalid_start_flags = N11;
+  else if (CPP_OPTION (pfile, c99))
+    invalid_start_flags = N99;
+  else
+    invalid_start_flags = 0;

  /* Update NST.  */
  if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class)
@ -899,17 +913,6 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
      bool safe;
      cppchar_t p = nst->previous;

-      /* Easy cases from Bengali, Oriya, Tamil, Jannada, and Malayalam.  */
-      if (c == 0x09BE)
-	safe = p != 0x09C7;  /* Use 09CB instead of 09C7 09BE.  */
-      else if (c == 0x0B3E)
-	safe = p != 0x0B47;  /* Use 0B4B instead of 0B47 0B3E.  */
-      else if (c == 0x0BBE)
-	safe = p != 0x0BC6 && p != 0x0BC7;  /* Use 0BCA/0BCB instead.  */
-      else if (c == 0x0CC2)
-	safe = p != 0x0CC6;  /* Use 0CCA instead of 0CC6 0CC2.  */
-      else if (c == 0x0D3E)
-	safe = p != 0x0D46 && p != 0x0D47;  /* Use 0D4A/0D4B instead.  */
      /* For Hangul, characters in the range AC00-D7A3 are NFC/NFKC,
 	 and are combined algorithmically from a sequence of the form
 	 1100-1112 1161-1175 11A8-11C2
@ -917,20 +920,19 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
 	 really a valid character).
 	 Unfortunately, C99 allows (only) the NFC form, but C++ allows
 	 only the combining characters.  */
-      else if (c >= 0x1161 && c <= 0x1175)
+      if (c >= 0x1161 && c <= 0x1175)
 	safe = p < 0x1100 || p > 0x1112;
      else if (c >= 0x11A8 && c <= 0x11C2)
 	safe = (p < 0xAC00 || p > 0xD7A3 || (p - 0xAC00) % 28 != 0);
      else
+	safe = check_nfc (pfile, c, p);
+      if (!safe)
 	{
-	  /* Uh-oh, someone updated ucnid.h without updating this code.  */
-	  cpp_error (pfile, CPP_DL_ICE, "Character %x might not be NFKC", c);
-	  safe = true;
+	  if ((c >= 0x1161 && c <= 0x1175) || (c >= 0x11A8 && c <= 0x11C2))
+	    nst->level = MAX (nst->level, normalized_identifier_C);
+	  else
+	    nst->level = normalized_none;
 	}
-      if (!safe && c < 0x1161)
-	nst->level = normalized_none;
-      else if (!safe)
-	nst->level = MAX (nst->level, normalized_identifier_C);
    }
  else if (ucnranges[mn].flags & NKC)
    ;
@ -940,11 +942,13 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
    nst->level = MAX (nst->level, normalized_identifier_C);
  else
    nst->level = normalized_none;
-  nst->previous = c;
+  if (ucnranges[mn].combine == 0)
+    nst->previous = c;
  nst->prev_class = ucnranges[mn].combine;

-  /* In C99, UCN digits may not begin identifiers.  */
-  if (CPP_OPTION (pfile, c99) && (ucnranges[mn].flags & DIG))
+  /* In C99, UCN digits may not begin identifiers.  In C11 and C++11,
+     UCN combining characters may not begin identifiers.  */
+  if (ucnranges[mn].flags & invalid_start_flags)
    return 2;

  return 1;
@ -1054,7 +1058,7 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
 	  CPP_OPTION (pfile, warn_dollars) = 0;
 	  cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
 	}
-      NORMALIZE_STATE_UPDATE_IDNUM (nst);
+      NORMALIZE_STATE_UPDATE_IDNUM (nst, result);
    }
  else if (identifier_pos)
    {
--- a/libcpp/include/cpplib.h
+++ b/libcpp/include/cpplib.h
@ -437,6 +437,10 @@ struct cpp_options
     literal number suffixes as user-defined literal number suffixes.  */
  unsigned char ext_numeric_literals;

+  /* Nonzero means extended identifiers allow the characters specified
+     in C11 and C++11.  */
+  unsigned char c11_identifiers;
+
  /* Nonzero for C++ 2014 Standard binary constants.  */
  unsigned char binary_constants;

--- a/libcpp/init.c
+++ b/libcpp/init.c
@ -77,6 +77,7 @@ struct lang_flags
  char cplusplus;
  char extended_numbers;
  char extended_identifiers;
+  char c11_identifiers;
  char std;
  char cplusplus_comments;
  char digraphs;
@ -88,21 +89,21 @@ struct lang_flags
 };

 static const struct lang_flags lang_defaults[] =
-{ /*              c99 c++ xnum xid std  //   digr ulit rlit udlit bin_cst dig_sep */
-  /* GNUC89   */  { 0,  0,  1,   0,  0,   1,   1,   0,   0,   0,    0,      0 },
-  /* GNUC99   */  { 1,  0,  1,   0,  0,   1,   1,   1,   1,   0,    0,      0 },
-  /* GNUC11   */  { 1,  0,  1,   0,  0,   1,   1,   1,   1,   0,    0,      0 },
-  /* STDC89   */  { 0,  0,  0,   0,  1,   0,   0,   0,   0,   0,    0,      0 },
-  /* STDC94   */  { 0,  0,  0,   0,  1,   0,   1,   0,   0,   0,    0,      0 },
-  /* STDC99   */  { 1,  0,  1,   0,  1,   1,   1,   0,   0,   0,    0,      0 },
-  /* STDC11   */  { 1,  0,  1,   0,  1,   1,   1,   1,   0,   0,    0,      0 },
-  /* GNUCXX   */  { 0,  1,  1,   0,  0,   1,   1,   0,   0,   0,    0,      0 },
-  /* CXX98    */  { 0,  1,  1,   0,  1,   1,   1,   0,   0,   0,    0,      0 },
-  /* GNUCXX11 */  { 1,  1,  1,   0,  0,   1,   1,   1,   1,   1,    0,      0 },
-  /* CXX11    */  { 1,  1,  1,   0,  1,   1,   1,   1,   1,   1,    0,      0 },
-  /* GNUCXX1Y */  { 1,  1,  1,   0,  0,   1,   1,   1,   1,   1,    1,      1 },
-  /* CXX1Y    */  { 1,  1,  1,   0,  1,   1,   1,   1,   1,   1,    1,      1 },
-  /* ASM      */  { 0,  0,  1,   0,  0,   1,   0,   0,   0,   0,    0,      0 }
+{ /*              c99 c++ xnum xid c11 std  //   digr ulit rlit udlit bin_cst dig_sep */
+  /* GNUC89   */  { 0,  0,  1,   0,  0,  0,   1,   1,   0,   0,   0,    0,      0 },
+  /* GNUC99   */  { 1,  0,  1,   0,  0,  0,   1,   1,   1,   1,   0,    0,      0 },
+  /* GNUC11   */  { 1,  0,  1,   0,  1,  0,   1,   1,   1,   1,   0,    0,      0 },
+  /* STDC89   */  { 0,  0,  0,   0,  0,  1,   0,   0,   0,   0,   0,    0,      0 },
+  /* STDC94   */  { 0,  0,  0,   0,  0,  1,   0,   1,   0,   0,   0,    0,      0 },
+  /* STDC99   */  { 1,  0,  1,   0,  0,  1,   1,   1,   0,   0,   0,    0,      0 },
+  /* STDC11   */  { 1,  0,  1,   0,  1,  1,   1,   1,   1,   0,   0,    0,      0 },
+  /* GNUCXX   */  { 0,  1,  1,   0,  0,  0,   1,   1,   0,   0,   0,    0,      0 },
+  /* CXX98    */  { 0,  1,  1,   0,  0,  1,   1,   1,   0,   0,   0,    0,      0 },
+  /* GNUCXX11 */  { 1,  1,  1,   0,  1,  0,   1,   1,   1,   1,   1,    0,      0 },
+  /* CXX11    */  { 1,  1,  1,   0,  1,  1,   1,   1,   1,   1,   1,    0,      0 },
+  /* GNUCXX1Y */  { 1,  1,  1,   0,  1,  0,   1,   1,   1,   1,   1,    1,      1 },
+  /* CXX1Y    */  { 1,  1,  1,   0,  1,  1,   1,   1,   1,   1,   1,    1,      1 },
+  /* ASM      */  { 0,  0,  1,   0,  0,  0,   1,   0,   0,   0,   0,    0,      0 }
  /* xid should be 1 for GNUC99, STDC99, GNUCXX, CXX98, GNUCXX11, CXX11,
     GNUCXX1Y, and CXX1Y when no longer experimental (when all uses of
     identifiers in the compiler have been audited for correct handling
@ -121,6 +122,7 @@ cpp_set_lang (cpp_reader *pfile, enum c_lang lang)
  CPP_OPTION (pfile, cplusplus)			 = l->cplusplus;
  CPP_OPTION (pfile, extended_numbers)		 = l->extended_numbers;
  CPP_OPTION (pfile, extended_identifiers)	 = l->extended_identifiers;
+  CPP_OPTION (pfile, c11_identifiers)		 = l->c11_identifiers;
  CPP_OPTION (pfile, std)			 = l->std;
  CPP_OPTION (pfile, trigraphs)			 = l->std;
  CPP_OPTION (pfile, cplusplus_comments)	 = l->cplusplus_comments;
--- a/libcpp/internal.h
+++ b/libcpp/internal.h
@ -713,9 +713,10 @@ extern size_t _cpp_replacement_text_len (const cpp_macro *);

 struct normalize_state 
 {
-  /* The previous character.  */
+  /* The previous starter character.  */
  cppchar_t previous;
-  /* The combining class of the previous character.  */
+  /* The combining class of the previous character (whether or not a
+     starter).  */
  unsigned char prev_class;
  /* The lowest normalization level so far.  */
  enum cpp_normalize_level level;
@ -723,10 +724,10 @@ struct normalize_state
 #define INITIAL_NORMALIZE_STATE { 0, 0, normalized_KC }
 #define NORMALIZE_STATE_RESULT(st) ((st)->level)

-/* We saw a character that matches ISIDNUM(), update a
+/* We saw a character C that matches ISIDNUM(), update a
   normalize_state appropriately.  */
-#define NORMALIZE_STATE_UPDATE_IDNUM(st) \
-  ((st)->previous = 0, (st)->prev_class = 0)
+#define NORMALIZE_STATE_UPDATE_IDNUM(st, c)	\
+  ((st)->previous = (c), (st)->prev_class = 0)

 extern cppchar_t _cpp_valid_ucn (cpp_reader *, const unsigned char **,
 				 const unsigned char *, int,
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@ -1204,11 +1204,14 @@ lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,

  cur = pfile->buffer->cur;
  if (! starts_ucn)
-    while (ISIDNUM (*cur))
-      {
-	hash = HT_HASHSTEP (hash, *cur);
-	cur++;
-      }
+    {
+      while (ISIDNUM (*cur))
+	{
+	  hash = HT_HASHSTEP (hash, *cur);
+	  cur++;
+	}
+      NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
+    }
  pfile->buffer->cur = cur;
  if (starts_ucn || forms_identifier_p (pfile, false, nst))
    {
@ -1216,8 +1219,8 @@ lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
      do {
 	while (ISIDNUM (*pfile->buffer->cur))
 	  {
+	    NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
 	    pfile->buffer->cur++;
-	    NORMALIZE_STATE_UPDATE_IDNUM (nst);
 	  }
      } while (forms_identifier_p (pfile, false, nst));
      result = _cpp_interpret_identifier (pfile, base,
@ -1277,8 +1280,8 @@ lex_number (cpp_reader *pfile, cpp_string *number,
      while (ISIDNUM (*cur) || *cur == '.' || DIGIT_SEP (*cur)
 	     || VALID_SIGN (*cur, cur[-1]))
 	{
+	  NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
 	  cur++;
-	  NORMALIZE_STATE_UPDATE_IDNUM (nst);
 	}

      pfile->buffer->cur = cur;
--- a/libcpp/makeucnid.c
+++ b/libcpp/makeucnid.c
@ -29,15 +29,22 @@ along with this program; see the file COPYING3.  If not see
 enum {
  C99 = 1,
  CXX = 2,
-  digit = 4,
-  not_NFC = 8,
-  not_NFKC = 16,
-  maybe_not_NFC = 32
+  N99 = 4,
+  C11 = 8,
+  N11 = 16,
+  all_languages = C99 | CXX | C11,
+  not_NFC = 32,
+  not_NFKC = 64,
+  maybe_not_NFC = 128
 };

-static unsigned flags[65536];
-static unsigned short decomp[65536][2];
-static unsigned char combining_value[65536];
+#define NUM_CODE_POINTS 0x110000
+#define MAX_CODE_POINT 0x10ffff
+
+static unsigned flags[NUM_CODE_POINTS];
+static unsigned int all_decomp[NUM_CODE_POINTS][2];
+static unsigned int decomp[NUM_CODE_POINTS][2];
+static unsigned char combining_value[NUM_CODE_POINTS];

 /* Die!  */

@ -48,7 +55,7 @@ fail (const char *s)
  exit (1);
 }

-/* Read ucnid.tab and set the C99 and CXX flags in header[].  */
+/* Read ucnid.tab and set the flags for language versions in flags[].  */

 static void
 read_ucnid (const char *fname)
@ -66,10 +73,14 @@ read_ucnid (const char *fname)
 	break;
      if (strcmp (line, "[C99]\n") == 0)
 	fl = C99;
-      if (strcmp (line, "[C99DIG]\n") == 0)
-	fl = C99|digit;
+      else if (strcmp (line, "[C99DIG]\n") == 0)
+	fl = C99|N99;
      else if (strcmp (line, "[CXX]\n") == 0)
 	fl = CXX;
+      else if (strcmp (line, "[C11]\n") == 0)
+	fl = C11;
+      else if (strcmp (line, "[C11NOSTART]\n") == 0)
+	fl = C11|N11;
      else if (isxdigit (line[0]))
 	{
 	  char *l = line;
@ -94,7 +105,7 @@ read_ucnid (const char *fname)
 		}
 	      while (isspace (*l))
 		l++;
-	      if (end > 0xFFFF)
+	      if (end > MAX_CODE_POINT)
 		fail ("parsing ucnid.tab, end too large");
 	      while (start <= end)
 		flags[start++] |= fl;
@ -108,8 +119,10 @@ read_ucnid (const char *fname)

 /* Read UnicodeData.txt and fill in the 'decomp' table to be the
   decompositions of characters for which both the character
-   decomposed and all the code points in the decomposition are either
-   C99 or CXX.  */
+   decomposed and all the code points in the decomposition are valid
+   for some supported language version, and the 'all_decomp' table to
+   be the decompositions of all characters without those
+   constraints.  */

 static void
 read_table (char *fname)
@ -123,7 +136,7 @@ read_table (char *fname)
      char line[256];
      unsigned long codepoint, this_decomp[4];
      char *l;
-      int i;
+      int i, j;
      int decomp_useful;

      if (!fgets (line, sizeof (line), f))
@ -131,8 +144,8 @@ read_table (char *fname)
      codepoint = strtoul (line, &l, 16);
      if (l == line || *l != ';')
 	fail ("parsing UnicodeData.txt, reading code point");
-      if (codepoint > 0xffff || ! (flags[codepoint] & (C99 | CXX)))
-	continue;
+      if (codepoint > MAX_CODE_POINT)
+	fail ("parsing UnicodeData.txt, code point too large");

      do {
 	l++;
@ -171,7 +184,9 @@ read_table (char *fname)
 	}
      if (i > 2)  /* Decomposition too long.  */
 	fail ("parsing UnicodeData.txt, decomposition too long");
-      if (decomp_useful)
+      for (j = 0; j < i; j++)
+	all_decomp[codepoint][j] = this_decomp[j];
+      if ((flags[codepoint] & all_languages) && decomp_useful)
 	while (--i >= 0)
 	  decomp[codepoint][i] = this_decomp[i];
    }
@ -208,8 +223,8 @@ read_derived (const char *fname)
      start = strtoul (line, &l, 16);
      if (l == line)
 	fail ("parsing DerivedNormalizationProps.txt, reading start");
-      if (start > 0xffff)
-	continue;
+      if (start > MAX_CODE_POINT)
+	fail ("parsing DerivedNormalizationProps.txt, code point too large");
      if (*l == '.' && l[1] == '.')
 	end = strtoul (l + 2, &l, 16);
      else
@ -237,17 +252,21 @@ write_table (void)
  unsigned last_flag = flags[0];
  bool really_safe = decomp[0][0] == 0;
  unsigned char last_combine = combining_value[0];
+
+  printf ("static const struct ucnrange ucnranges[] = {\n");
  
-  for (i = 1; i <= 65536; i++)
-    if (i == 65536
-	|| (flags[i] != last_flag && ((flags[i] | last_flag) & (C99 | CXX)))
+  for (i = 1; i <= NUM_CODE_POINTS; i++)
+    if (i == NUM_CODE_POINTS
+	|| (flags[i] != last_flag && ((flags[i] | last_flag) & all_languages))
 	|| really_safe != (decomp[i][0] == 0)
 	|| combining_value[i] != last_combine)
      {
-	printf ("{ %s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n",
+	printf ("{ %s|%s|%s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n",
 		last_flag & C99 ? "C99" : "  0",
-		last_flag & digit ? "DIG" : "  0",
+		last_flag & N99 ? "N99" : "  0",
 		last_flag & CXX ? "CXX" : "  0",
+		last_flag & C11 ? "C11" : "  0",
+		last_flag & N11 ? "N11" : "  0",
 		really_safe ? "CID" : "  0",
 		last_flag & not_NFC ? "  0" : "NFC",
 		last_flag & not_NFKC ? "  0" : "NKC",
@ -258,6 +277,98 @@ write_table (void)
 	last_combine = combining_value[0];
 	really_safe = decomp[i][0] == 0;
      }
+
+  printf ("};\n");
+}
+
+/* Return whether a given character is valid in an identifier for some
+   supported language, either as itself or as a UCN.  */
+
+static bool
+char_id_valid (unsigned int c)
+{
+  return ((flags[c] & all_languages)
+	  || (c == 0x24)
+	  || (c >= 0x30 && c <= 0x39)
+	  || (c >= 0x41 && c <= 0x5a)
+	  || (c >= 0x61 && c <= 0x7a));
+}
+
+/* Write out the switch statement over characters for which it is
+   context-dependent whether they are in NFC.  */
+
+static void
+write_context_switch (void)
+{
+  unsigned i;
+  printf ("static bool\n"
+	  "check_nfc (cpp_reader *pfile, cppchar_t c, cppchar_t p)\n"
+	  "{\n"
+	  "  switch (c)\n"
+	  "    {\n");
+  for (i = 0; i < NUM_CODE_POINTS; i++)
+    {
+      bool found_case = false;
+      unsigned j;
+      if (!(flags[i] & all_languages) || !(flags[i] & maybe_not_NFC))
+	continue;
+      if ((i >= 0x1161 && i <= 0x1175) || (i >= 0x11A8 && i <= 0x11C2))
+	continue; /* Hangul handled algorithmically.  */
+      printf ("    case %#06x:\n"
+	      "      switch (p)\n"
+	      "\t{\n", i);
+      /* If an NFC starter character decomposes into an NFC starter
+	 character S followed by this character I, then S as the
+	 previous starter character means I is not in NFC.
+	 Furthermore, any NFC starter character K made by a series of
+	 compositions of S with combining characters whose combining
+	 class is greater than that of I also means, as the previous
+	 starter character, that I is not in NFC.  */
+      for (j = 0; j < NUM_CODE_POINTS; j++)
+	{
+	  unsigned s, k;
+	  if (all_decomp[j][1] != i)
+	    continue;
+	  s = all_decomp[j][0];
+	  if (combining_value[s] != 0 || (flags[s] & not_NFC) != 0)
+	    continue;
+	  if (char_id_valid (s))
+	    {
+	      found_case = true;
+	      printf ("\tcase %#06x:\n", s);
+	    }
+	  for (k = 0; k < NUM_CODE_POINTS; k++)
+	    {
+	      unsigned t = k;
+	      if (k == s || !char_id_valid (k))
+		continue;
+	      while (all_decomp[t][1] != 0
+		     && combining_value[all_decomp[t][1]] > combining_value[i])
+		{
+		  if (combining_value[t] != 0 || (flags[t] & not_NFC) != 0)
+		    break;
+		  t = all_decomp[t][0];
+		}
+	      if (t == s)
+		{
+		  found_case = true;
+		  printf ("\tcase %#06x:\n", k);
+		}
+	    }
+	}
+      if (found_case)
+	printf ("\t  return false;\n");
+      else
+	printf ("\t/* Non-NFC cases not applicable to C/C++.  */\n");
+      printf ("\tdefault:\n"
+	      "\t  return true;\n"
+	      "\t}\n\n");
+    }
+  printf ("    default:\n"
+	  "      cpp_error (pfile, CPP_DL_ICE, \"Character %%x might not be NFKC\", c);\n"
+	  "      return true;\n"
+	  "  }\n"
+	  "}\n");
 }

 /* Print out the huge copyright notice.  */
@ -336,5 +447,6 @@ main(int argc, char ** argv)

  write_copyright ();
  write_table ();
+  write_context_switch ();
  return 0;
 }
--- a/libcpp/ucnid.h
+++ b/libcpp/ucnid.h
--- a/libcpp/ucnid.tab
+++ b/libcpp/ucnid.tab
@ -19,7 +19,8 @@
 ; D, which is itself a reproduction from ISO/IEC TR 10176:1998, and
 ; the similar table from ISO/IEC 14882:1998 (C++98) Annex E, which is
 ; a reproduction of ISO/IEC PDTR 10176.  Unfortunately these tables
-; are not identical.
+; are not identical.  It also reproduces the somewhat different tables
+; in C11 and C++11, which are identical to each other.

 [C99]

@ -209,3 +210,34 @@ fbd3-fd3f fd50-fd8f fd92-fdc7 fdf0-fdfb fe70-fe72 fe74 fe76-fefc
 ff21-ff3a ff41-ff5a ff66-ffbe ffc2-ffc7 ffca-ffcf ffd2-ffd7
 ffda-ffdc 4e00-9fa5

+[C11]
+; Group 1
+00a8 00aa 00ad 00af 00b2-00b5 00b7-00ba 00bc-00be 00c0-00d6 00d8-00f6
+00f8-00ff
+
+; Group 2, minus characters under C11NOSTART
+0100-02ff 0370-167f 1681-180d 180f-1dbf 1e00-1fff
+
+; Group 3
+200b-200d 202a-202e 203f-2040 2054 2060-206f
+
+; Group 4, minus characters under C11NOSTART
+2070-20cf 2100-218f 2460-24ff 2776-2793 2c00-2dff 2e80-2fff
+
+; Group 5
+3004-3007 3021-302f 3031-303f
+
+; Group 6
+3040-d7ff
+
+; Group 7, minus characters under C11NOSTART
+f900-fd3d fd40-fdcf fdf0-fe1f fe30-fe44 fe47-fffd
+
+; Group 8
+10000-1fffd 20000-2fffd 30000-3fffd 40000-4fffd 50000-5fffd
+60000-6fffd 70000-7fffd 80000-8fffd 90000-9fffd a0000-afffd
+b0000-bfffd c0000-cfffd d0000-dfffd e0000-efffd
+
+[C11NOSTART]
+; Group 1
+0300-036f 1dc0-1dff 20d0-20ff fe20-fe2f