Support extended characters in C/C++ identifiers (PR c/67224)
libcpp/ChangeLog 2019-09-19 Lewis Hyatt <lhyatt@gmail.com> PR c/67224 * charset.c (_cpp_valid_utf8): New function to help lex UTF-8 tokens. * internal.h (_cpp_valid_utf8): Declare. * lex.c (forms_identifier_p): Use it to recognize UTF-8 identifiers. (_cpp_lex_direct): Handle UTF-8 in identifiers and CPP_OTHER tokens. Do all work in "default" case to avoid slowing down typical code paths. Also handle $ and UCN in the default case for consistency. gcc/Changelog 2019-09-19 Lewis Hyatt <lhyatt@gmail.com> PR c/67224 * doc/cpp.texi: Document support for extended characters in identifiers. * doc/cppopts.texi: Likewise. gcc/testsuite/ChangeLog 2019-09-19 Lewis Hyatt <lhyatt@gmail.com> PR c/67224 * c-c++-common/cpp/ucnid-2011-1-utf8.c: New test. * g++.dg/cpp/ucnid-1-utf8.C: New test. * g++.dg/cpp/ucnid-2-utf8.C: New test. * g++.dg/cpp/ucnid-3-utf8.C: New test. * g++.dg/cpp/ucnid-4-utf8.C: New test. * g++.dg/other/ucnid-1-utf8.C: New test. * gcc.dg/cpp/ucnid-1-utf8.c: New test. * gcc.dg/cpp/ucnid-10-utf8.c: New test. * gcc.dg/cpp/ucnid-11-utf8.c: New test. * gcc.dg/cpp/ucnid-12-utf8.c: New test. * gcc.dg/cpp/ucnid-13-utf8.c: New test. * gcc.dg/cpp/ucnid-14-utf8.c: New test. * gcc.dg/cpp/ucnid-15-utf8.c: New test. * gcc.dg/cpp/ucnid-2-utf8.c: New test. * gcc.dg/cpp/ucnid-3-utf8.c: New test. * gcc.dg/cpp/ucnid-4-utf8.c: New test. * gcc.dg/cpp/ucnid-6-utf8.c: New test. * gcc.dg/cpp/ucnid-7-utf8.c: New test. * gcc.dg/cpp/ucnid-9-utf8.c: New test. * gcc.dg/ucnid-1-utf8.c: New test. * gcc.dg/ucnid-10-utf8.c: New test. * gcc.dg/ucnid-11-utf8.c: New test. * gcc.dg/ucnid-12-utf8.c: New test. * gcc.dg/ucnid-13-utf8.c: New test. * gcc.dg/ucnid-14-utf8.c: New test. * gcc.dg/ucnid-15-utf8.c: New test. * gcc.dg/ucnid-16-utf8.c: New test. * gcc.dg/ucnid-2-utf8.c: New test. * gcc.dg/ucnid-3-utf8.c: New test. * gcc.dg/ucnid-4-utf8.c: New test. * gcc.dg/ucnid-5-utf8.c: New test. * gcc.dg/ucnid-6-utf8.c: New test. * gcc.dg/ucnid-7-utf8.c: New test. * gcc.dg/ucnid-8-utf8.c: New test. 
* gcc.dg/ucnid-9-utf8.c: New test. From-SVN: r275979
This commit is contained in:
parent
e0710fcf7d
commit
7d112d6670
@ -1,3 +1,10 @@
|
|||||||
|
2019-09-19 Lewis Hyatt <lhyatt@gmail.com>
|
||||||
|
|
||||||
|
PR c/67224
|
||||||
|
* doc/cpp.texi: Document support for extended characters in
|
||||||
|
identifiers.
|
||||||
|
* doc/cppopts.texi: Likewise.
|
||||||
|
|
||||||
2019-09-19 Richard Biener <rguenther@suse.de>
|
2019-09-19 Richard Biener <rguenther@suse.de>
|
||||||
|
|
||||||
* tree-vect-loop.c (vect_is_slp_reduction): Remove.
|
* tree-vect-loop.c (vect_is_slp_reduction): Remove.
|
||||||
|
@ -274,11 +274,11 @@ the character in the source character set that they represent, then
|
|||||||
converted to the execution character set, just like unescaped
|
converted to the execution character set, just like unescaped
|
||||||
characters.
|
characters.
|
||||||
|
|
||||||
In identifiers, characters outside the ASCII range can only be
|
In identifiers, characters outside the ASCII range can be specified
|
||||||
specified with the @samp{\u} and @samp{\U} escapes, not used
|
with the @samp{\u} and @samp{\U} escapes or used directly in the input
|
||||||
directly. If strict ISO C90 conformance is specified with an option
|
encoding. If strict ISO C90 conformance is specified with an option
|
||||||
such as @option{-std=c90}, or @option{-fno-extended-identifiers} is
|
such as @option{-std=c90}, or @option{-fno-extended-identifiers} is
|
||||||
used, then those escapes are not permitted in identifiers.
|
used, then those constructs are not permitted in identifiers.
|
||||||
|
|
||||||
@node Initial processing
|
@node Initial processing
|
||||||
@section Initial processing
|
@section Initial processing
|
||||||
@ -503,8 +503,7 @@ In the 1999 C standard, identifiers may contain letters which are not
|
|||||||
part of the ``basic source character set'', at the implementation's
|
part of the ``basic source character set'', at the implementation's
|
||||||
discretion (such as accented Latin letters, Greek letters, or Chinese
|
discretion (such as accented Latin letters, Greek letters, or Chinese
|
||||||
ideograms). This may be done with an extended character set, or the
|
ideograms). This may be done with an extended character set, or the
|
||||||
@samp{\u} and @samp{\U} escape sequences. GCC only accepts such
|
@samp{\u} and @samp{\U} escape sequences.
|
||||||
characters in the @samp{\u} and @samp{\U} forms.
|
|
||||||
|
|
||||||
As an extension, GCC treats @samp{$} as a letter. This is for
|
As an extension, GCC treats @samp{$} as a letter. This is for
|
||||||
compatibility with some systems, such as VMS, where @samp{$} is commonly
|
compatibility with some systems, such as VMS, where @samp{$} is commonly
|
||||||
@ -584,15 +583,15 @@ Punctuator: @{ @} [ ] # ##
|
|||||||
@end smallexample
|
@end smallexample
|
||||||
|
|
||||||
@cindex other tokens
|
@cindex other tokens
|
||||||
Any other single character is considered ``other''. It is passed on to
|
Any other single byte is considered ``other'' and passed on to the
|
||||||
the preprocessor's output unmolested. The C compiler will almost
|
preprocessor's output unchanged. The C compiler will almost certainly
|
||||||
certainly reject source code containing ``other'' tokens. In ASCII, the
|
reject source code containing ``other'' tokens. In ASCII, the only
|
||||||
only other characters are @samp{@@}, @samp{$}, @samp{`}, and control
|
``other'' characters are @samp{@@}, @samp{$}, @samp{`}, and control
|
||||||
characters other than NUL (all bits zero). (Note that @samp{$} is
|
characters other than NUL (all bits zero). (Note that @samp{$} is
|
||||||
normally considered a letter.) All characters with the high bit set
|
normally considered a letter.) All bytes with the high bit set
|
||||||
(numeric range 0x7F--0xFF) are also ``other'' in the present
|
(numeric range 0x80--0xFF) that were not successfully interpreted as
|
||||||
implementation. This will change when proper support for international
|
part of an extended character in the input encoding are also ``other''
|
||||||
character sets is added to GCC@.
|
in the present implementation.
|
||||||
|
|
||||||
NUL is a special case because of the high probability that its
|
NUL is a special case because of the high probability that its
|
||||||
appearance is accidental, and because it may be invisible to the user
|
appearance is accidental, and because it may be invisible to the user
|
||||||
@ -4179,7 +4178,10 @@ be controlled using the @option{-fexec-charset} and
|
|||||||
The C and C++ standards allow identifiers to be composed of @samp{_}
|
The C and C++ standards allow identifiers to be composed of @samp{_}
|
||||||
and the alphanumeric characters. C++ also allows universal character
|
and the alphanumeric characters. C++ also allows universal character
|
||||||
names. C99 and later C standards permit both universal character
|
names. C99 and later C standards permit both universal character
|
||||||
names and implementation-defined characters.
|
names and implementation-defined characters. In both C and C++ modes,
|
||||||
|
GCC accepts in identifiers exactly those extended characters that
|
||||||
|
correspond to universal character names permitted by the chosen
|
||||||
|
standard.
|
||||||
|
|
||||||
GCC allows the @samp{$} character in identifiers as an extension for
|
GCC allows the @samp{$} character in identifiers as an extension for
|
||||||
most targets. This is true regardless of the @option{std=} switch,
|
most targets. This is true regardless of the @option{std=} switch,
|
||||||
|
@ -254,8 +254,9 @@ Accept @samp{$} in identifiers.
|
|||||||
|
|
||||||
@item -fextended-identifiers
|
@item -fextended-identifiers
|
||||||
@opindex fextended-identifiers
|
@opindex fextended-identifiers
|
||||||
Accept universal character names in identifiers. This option is
|
Accept universal character names and extended characters in
|
||||||
enabled by default for C99 (and later C standard versions) and C++.
|
identifiers. This option is enabled by default for C99 (and later C
|
||||||
|
standard versions) and C++.
|
||||||
|
|
||||||
@item -fno-canonical-system-headers
|
@item -fno-canonical-system-headers
|
||||||
@opindex fno-canonical-system-headers
|
@opindex fno-canonical-system-headers
|
||||||
|
@ -1,3 +1,42 @@
|
|||||||
|
2019-09-19 Lewis Hyatt <lhyatt@gmail.com>
|
||||||
|
|
||||||
|
PR c/67224
|
||||||
|
* c-c++-common/cpp/ucnid-2011-1-utf8.c: New test.
|
||||||
|
* g++.dg/cpp/ucnid-1-utf8.C: New test.
|
||||||
|
* g++.dg/cpp/ucnid-2-utf8.C: New test.
|
||||||
|
* g++.dg/cpp/ucnid-3-utf8.C: New test.
|
||||||
|
* g++.dg/cpp/ucnid-4-utf8.C: New test.
|
||||||
|
* g++.dg/other/ucnid-1-utf8.C: New test.
|
||||||
|
* gcc.dg/cpp/ucnid-1-utf8.c: New test.
|
||||||
|
* gcc.dg/cpp/ucnid-10-utf8.c: New test.
|
||||||
|
* gcc.dg/cpp/ucnid-11-utf8.c: New test.
|
||||||
|
* gcc.dg/cpp/ucnid-12-utf8.c: New test.
|
||||||
|
* gcc.dg/cpp/ucnid-13-utf8.c: New test.
|
||||||
|
* gcc.dg/cpp/ucnid-14-utf8.c: New test.
|
||||||
|
* gcc.dg/cpp/ucnid-15-utf8.c: New test.
|
||||||
|
* gcc.dg/cpp/ucnid-2-utf8.c: New test.
|
||||||
|
* gcc.dg/cpp/ucnid-3-utf8.c: New test.
|
||||||
|
* gcc.dg/cpp/ucnid-4-utf8.c: New test.
|
||||||
|
* gcc.dg/cpp/ucnid-6-utf8.c: New test.
|
||||||
|
* gcc.dg/cpp/ucnid-7-utf8.c: New test.
|
||||||
|
* gcc.dg/cpp/ucnid-9-utf8.c: New test.
|
||||||
|
* gcc.dg/ucnid-1-utf8.c: New test.
|
||||||
|
* gcc.dg/ucnid-10-utf8.c: New test.
|
||||||
|
* gcc.dg/ucnid-11-utf8.c: New test.
|
||||||
|
* gcc.dg/ucnid-12-utf8.c: New test.
|
||||||
|
* gcc.dg/ucnid-13-utf8.c: New test.
|
||||||
|
* gcc.dg/ucnid-14-utf8.c: New test.
|
||||||
|
* gcc.dg/ucnid-15-utf8.c: New test.
|
||||||
|
* gcc.dg/ucnid-16-utf8.c: New test.
|
||||||
|
* gcc.dg/ucnid-2-utf8.c: New test.
|
||||||
|
* gcc.dg/ucnid-3-utf8.c: New test.
|
||||||
|
* gcc.dg/ucnid-4-utf8.c: New test.
|
||||||
|
* gcc.dg/ucnid-5-utf8.c: New test.
|
||||||
|
* gcc.dg/ucnid-6-utf8.c: New test.
|
||||||
|
* gcc.dg/ucnid-7-utf8.c: New test.
|
||||||
|
* gcc.dg/ucnid-8-utf8.c: New test.
|
||||||
|
* gcc.dg/ucnid-9-utf8.c: New test.
|
||||||
|
|
||||||
2019-09-19 Iain Sandoe <iain@sandoe.co.uk>
|
2019-09-19 Iain Sandoe <iain@sandoe.co.uk>
|
||||||
|
|
||||||
* gcc.dg/pr89313.c: Test for __POWERPC__ in addition to
|
* gcc.dg/pr89313.c: Test for __POWERPC__ in addition to
|
||||||
|
15
gcc/testsuite/c-c++-common/cpp/ucnid-2011-1-utf8.c
Normal file
15
gcc/testsuite/c-c++-common/cpp/ucnid-2011-1-utf8.c
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
/* { dg-do preprocess } */
|
||||||
|
/* { dg-options "-std=c11 -pedantic" { target c } } */
|
||||||
|
/* { dg-options "-std=c++11 -pedantic" { target c++ } } */
|
||||||
|
|
||||||
|
¨
|
||||||
|
|
||||||
|
B̀
|
||||||
|
|
||||||
|
̀ /* { dg-error "not valid at the start of an identifier" } */
|
||||||
|
|
||||||
|
À /* { dg-warning "not in NFC" } */
|
||||||
|
|
||||||
|
𐀀
|
||||||
|
|
||||||
|
|
17
gcc/testsuite/g++.dg/cpp/ucnid-1-utf8.C
Normal file
17
gcc/testsuite/g++.dg/cpp/ucnid-1-utf8.C
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
/* { dg-do preprocess } */
|
||||||
|
/* { dg-options "-std=gnu++98 -pedantic" } */
|
||||||
|
|
||||||
|
ª /* { dg-error "not valid in an identifier" } */
|
||||||
|
« /* { dg-error "not valid in an identifier" } */
|
||||||
|
¶ /* { dg-error "not valid in an identifier" } */
|
||||||
|
º /* { dg-error "not valid in an identifier" } */
|
||||||
|
À
|
||||||
|
Ö
|
||||||
|
΄
|
||||||
|
|
||||||
|
٩ /* { dg-error "not valid in an identifier" } */
|
||||||
|
A٩ /* { dg-error "not valid in an identifier" } */
|
||||||
|
0º /* { dg-error "not valid in an identifier" } */
|
||||||
|
0٩ /* { dg-error "not valid in an identifier" } */
|
||||||
|
๙
|
||||||
|
A๙
|
24
gcc/testsuite/g++.dg/cpp/ucnid-2-utf8.C
Normal file
24
gcc/testsuite/g++.dg/cpp/ucnid-2-utf8.C
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
/* Test stringization of identifiers with extended characters works. */
|
||||||
|
|
||||||
|
/* Note: The results expected in these tests are what GCC currently
|
||||||
|
outputs, but they are not technically standard-conforming. If GCC is
|
||||||
|
changed in the future to produce the standard-conforming output, then
|
||||||
|
this test will fail and should be adjusted to check for UCNs in the
|
||||||
|
output rather than UTF-8. See PR 91755 for more details. */
|
||||||
|
|
||||||
|
/* { dg-do run } */
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#define h(s) #s
|
||||||
|
#define str(s) h(s)
|
||||||
|
|
||||||
|
int
|
||||||
|
main ()
|
||||||
|
{
|
||||||
|
if (strcmp (str (str (Á)), "\"Á\""))
|
||||||
|
abort ();
|
||||||
|
if (strcmp (str (str (Á)), "\"Á\""))
|
||||||
|
abort ();
|
||||||
|
}
|
23
gcc/testsuite/g++.dg/cpp/ucnid-3-utf8.C
Normal file
23
gcc/testsuite/g++.dg/cpp/ucnid-3-utf8.C
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
/* Test pasting of identifiers with extended characters works. */
|
||||||
|
|
||||||
|
/* Note: The results expected in these tests are what GCC currently
|
||||||
|
outputs, but they are not technically standard-conforming. If GCC is
|
||||||
|
changed in the future to produce the standard-conforming output, then
|
||||||
|
this test will fail and should be adjusted to check for UCNs in the
|
||||||
|
output rather than UTF-8. See PR 91755 for more details. */
|
||||||
|
|
||||||
|
/* { dg-do run } */
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#define c(s1, s2) s1 ## s2
|
||||||
|
#define h(s) #s
|
||||||
|
#define str(s) h(s)
|
||||||
|
|
||||||
|
int
|
||||||
|
main ()
|
||||||
|
{
|
||||||
|
if (strcmp (str (str (c (Á, Á))), "\"ÁÁ\""))
|
||||||
|
abort ();
|
||||||
|
}
|
17
gcc/testsuite/g++.dg/cpp/ucnid-4-utf8.C
Normal file
17
gcc/testsuite/g++.dg/cpp/ucnid-4-utf8.C
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
/* { dg-do preprocess } */
|
||||||
|
/* { dg-options "-std=gnu++98"} */
|
||||||
|
|
||||||
|
ª
|
||||||
|
« /* { dg-error "not valid in an identifier" } */
|
||||||
|
¶ /* { dg-error "not valid in an identifier" } */
|
||||||
|
º
|
||||||
|
À
|
||||||
|
Ö
|
||||||
|
΄
|
||||||
|
|
||||||
|
٩ /* OK in C++ */
|
||||||
|
A٩
|
||||||
|
0º
|
||||||
|
0٩
|
||||||
|
๙ /* OK in C++ */
|
||||||
|
A๙
|
28
gcc/testsuite/g++.dg/other/ucnid-1-utf8.C
Normal file
28
gcc/testsuite/g++.dg/other/ucnid-1-utf8.C
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
/* { dg-do run } */
|
||||||
|
/* { dg-options "" } */
|
||||||
|
/* { dg-xfail-if "" { powerpc-ibm-aix* } } */
|
||||||
|
/* { dg-skip-if "" { ! ucn } } */
|
||||||
|
#include <cstdlib>
|
||||||
|
|
||||||
|
int À(void) { return 1; }
|
||||||
|
int Á(void) { return 2; }
|
||||||
|
int Â(void) { return 3; }
|
||||||
|
int whÿ(void) { return 4; }
|
||||||
|
int aÄbсδe(void) { return 5; }
|
||||||
|
|
||||||
|
int main (void)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (À() != 1)
|
||||||
|
abort ();
|
||||||
|
if (Á() != 2)
|
||||||
|
abort ();
|
||||||
|
if (Â() != 3)
|
||||||
|
abort ();
|
||||||
|
if (whÿ() != 4)
|
||||||
|
abort ();
|
||||||
|
if (aÄbсδe() != 5)
|
||||||
|
abort ();
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
26
gcc/testsuite/gcc.dg/cpp/ucnid-1-utf8.c
Normal file
26
gcc/testsuite/gcc.dg/cpp/ucnid-1-utf8.c
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
/* { dg-do run } */
|
||||||
|
/* { dg-options "-std=c99 -g3" } */
|
||||||
|
void abort (void);
|
||||||
|
|
||||||
|
#define À 1
|
||||||
|
#define Á 2
|
||||||
|
#define  3
|
||||||
|
#define whÿ 4
|
||||||
|
#define aÄbсδe 5
|
||||||
|
|
||||||
|
int main (void)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (À != 1)
|
||||||
|
abort ();
|
||||||
|
if (Á != 2)
|
||||||
|
abort ();
|
||||||
|
if (Â != 3)
|
||||||
|
abort ();
|
||||||
|
if (whÿ != 4)
|
||||||
|
abort ();
|
||||||
|
if (aÄbсδe != 5)
|
||||||
|
abort ();
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
8
gcc/testsuite/gcc.dg/cpp/ucnid-10-utf8.c
Normal file
8
gcc/testsuite/gcc.dg/cpp/ucnid-10-utf8.c
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
/* Test UTF-8 is allowed in preprocessing numbers. */
|
||||||
|
/* { dg-do compile } */
|
||||||
|
/* { dg-options "-std=c99" } */
|
||||||
|
|
||||||
|
#define a(x) b(x)
|
||||||
|
#define b(x) 0
|
||||||
|
#define p )
|
||||||
|
int c = a(0À.p);
|
30
gcc/testsuite/gcc.dg/cpp/ucnid-11-utf8.c
Normal file
30
gcc/testsuite/gcc.dg/cpp/ucnid-11-utf8.c
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
/* Test spelling differences in UCNs are properly diagnosed for macro
|
||||||
|
redefinitions. */
|
||||||
|
/* { dg-do preprocess } */
|
||||||
|
/* { dg-options "-std=c99 -pedantic-errors" } */
|
||||||
|
|
||||||
|
/* Different spelling of UCN in expansion. */
|
||||||
|
#define m1 \u00c1 /* { dg-message "-:previous definition" } */
|
||||||
|
#define m1 Á /* { dg-error "-:redefined" } */
|
||||||
|
|
||||||
|
#define m1ok Á
|
||||||
|
#define m1ok Á
|
||||||
|
|
||||||
|
/* Different spelling of UCN in argument name. */
|
||||||
|
#define m2(\u00c1) /* { dg-message "-:previous definition" } */
|
||||||
|
#define m2(Á) /* { dg-error "-:redefined" } */
|
||||||
|
|
||||||
|
#define m2ok(Á)
|
||||||
|
#define m2ok(Á)
|
||||||
|
|
||||||
|
/* Same spelling in argument name but different spelling when used in
|
||||||
|
expansion. */
|
||||||
|
#define m3(\u00c1) \u00c1 /* { dg-message "-:previous definition" } */
|
||||||
|
#define m3(\u00c1) Á /* { dg-error "-:redefined" } */
|
||||||
|
|
||||||
|
#define m3ok(\u00c1) Á
|
||||||
|
#define m3ok(\u00c1) Á
|
||||||
|
|
||||||
|
/* Different spelling of the macro name itself is OK. */
|
||||||
|
#define m4ok\u00c1
|
||||||
|
#define m4okÁ
|
13
gcc/testsuite/gcc.dg/cpp/ucnid-12-utf8.c
Normal file
13
gcc/testsuite/gcc.dg/cpp/ucnid-12-utf8.c
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
/* Test spelling differences in UCNs in macro definitions still count
|
||||||
|
as the same identifier for macro expansion. */
|
||||||
|
/* { dg-do compile } */
|
||||||
|
/* { dg-options "-std=c99 -pedantic-errors" } */
|
||||||
|
|
||||||
|
#define m1\u00c1
|
||||||
|
#ifndef m1Á
|
||||||
|
#error not defined
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define m2(\u00c1) Á
|
||||||
|
|
||||||
|
int i = m2 (0);
|
5
gcc/testsuite/gcc.dg/cpp/ucnid-13-utf8.c
Normal file
5
gcc/testsuite/gcc.dg/cpp/ucnid-13-utf8.c
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
/* Verify macros named with UTF-8 are output in -dD output with UCNs. */
|
||||||
|
/* { dg-do preprocess } */
|
||||||
|
/* { dg-options "-std=c99 -dD" } */
|
||||||
|
/* { dg-final { scan-file ucnid-13-utf8.i "\\\\U000000c1" } } */
|
||||||
|
#define Á 1
|
6
gcc/testsuite/gcc.dg/cpp/ucnid-14-utf8.c
Normal file
6
gcc/testsuite/gcc.dg/cpp/ucnid-14-utf8.c
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
/* Verify macro definitions with UTF-8 are output in -dD output with
|
||||||
|
the original spelling. */
|
||||||
|
/* { dg-do preprocess } */
|
||||||
|
/* { dg-options "-std=c99 -dD" } */
|
||||||
|
/* { dg-final { scan-file ucnid-14-utf8.i "Á" } } */
|
||||||
|
#define a Á
|
6
gcc/testsuite/gcc.dg/cpp/ucnid-15-utf8.c
Normal file
6
gcc/testsuite/gcc.dg/cpp/ucnid-15-utf8.c
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
/* Verify macro definitions with UTF-8 in argument names are output in
|
||||||
|
-dD output with the original spelling. */
|
||||||
|
/* { dg-do preprocess } */
|
||||||
|
/* { dg-options "-std=c99 -dD" } */
|
||||||
|
/* { dg-final { scan-file ucnid-15-utf8.i "#define a\\(Á\\) x:Á:y:Á:z" } } */
|
||||||
|
#define a(Á) x:Á:y:Á:z
|
16
gcc/testsuite/gcc.dg/cpp/ucnid-2-utf8.c
Normal file
16
gcc/testsuite/gcc.dg/cpp/ucnid-2-utf8.c
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
/* { dg-do run } */
|
||||||
|
/* { dg-options "-std=c99" } */
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#define str(t) #t
|
||||||
|
|
||||||
|
int main (void)
|
||||||
|
{
|
||||||
|
const char s[] = str (ゲ);
|
||||||
|
|
||||||
|
if (strcmp (s, "ゲ") != 0)
|
||||||
|
abort ();
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
7
gcc/testsuite/gcc.dg/cpp/ucnid-3-utf8.c
Normal file
7
gcc/testsuite/gcc.dg/cpp/ucnid-3-utf8.c
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
/* { dg-do compile } */
|
||||||
|
/* { dg-options "-std=c99" } */
|
||||||
|
|
||||||
|
#define paste(x, y) x ## y
|
||||||
|
|
||||||
|
int paste(ª, Ա) = 3;
|
||||||
|
|
17
gcc/testsuite/gcc.dg/cpp/ucnid-4-utf8.c
Normal file
17
gcc/testsuite/gcc.dg/cpp/ucnid-4-utf8.c
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
/* { dg-do preprocess } */
|
||||||
|
/* { dg-options "-std=c99" } */
|
||||||
|
|
||||||
|
ª
|
||||||
|
« /* not a preprocessing error because we lex it into its own token */
|
||||||
|
¶ /* not a preprocessing error because we lex it into its own token */
|
||||||
|
º
|
||||||
|
À
|
||||||
|
Ö
|
||||||
|
΄
|
||||||
|
|
||||||
|
٩ /* { dg-error "not valid at the start of an identifier" } */
|
||||||
|
A٩
|
||||||
|
0º
|
||||||
|
0٩
|
||||||
|
๙ /* { dg-error "not valid at the start of an identifier" } */
|
||||||
|
A๙
|
5
gcc/testsuite/gcc.dg/cpp/ucnid-6-utf8.c
Normal file
5
gcc/testsuite/gcc.dg/cpp/ucnid-6-utf8.c
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
/* { dg-do compile } */
|
||||||
|
/* { dg-options "-std=c89" } */
|
||||||
|
#define a b(
|
||||||
|
#define b(x) q
|
||||||
|
int aª);
|
21
gcc/testsuite/gcc.dg/cpp/ucnid-7-utf8.c
Normal file
21
gcc/testsuite/gcc.dg/cpp/ucnid-7-utf8.c
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
/* { dg-do compile } */
|
||||||
|
/* { dg-options "-std=c99" } */
|
||||||
|
|
||||||
|
/* When GCC reads UTF-8-encoded input into its internal UTF-8
|
||||||
|
representation, it does not apply any transformation to the data, and
|
||||||
|
in particular it makes no attempt to verify that the encoding is valid
|
||||||
|
UTF-8. Historically, if any non-ASCII characters were found outside a
|
||||||
|
string or comment, they were treated as stray tokens and did not
|
||||||
|
necessarily produce an error, e.g. if, as in this test, they disappear
|
||||||
|
in the preprocessor. Now that UTF-8 is also supported in identifiers,
|
||||||
|
the basic structure of this process has not changed; GCC just treats
|
||||||
|
invalid UTF-8 as a stray token. This test verifies that the historical
|
||||||
|
behavior is unchanged. In the future, if GCC were changed, say, to
|
||||||
|
validate the UTF-8 on input, then this test would no longer be
|
||||||
|
appropriate. */
|
||||||
|
|
||||||
|
|
||||||
|
#define a b(
|
||||||
|
#define b(x) q
|
||||||
|
/* The line below contains invalid UTF-8. */
|
||||||
|
int aÏ);
|
8
gcc/testsuite/gcc.dg/cpp/ucnid-9-utf8.c
Normal file
8
gcc/testsuite/gcc.dg/cpp/ucnid-9-utf8.c
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
/* { dg-do preprocess } */
|
||||||
|
/* { dg-options "-std=c99 -pedantic" } */
|
||||||
|
|
||||||
|
Ⅰ
|
||||||
|
ↂ
|
||||||
|
〇
|
||||||
|
〡
|
||||||
|
〩
|
25
gcc/testsuite/gcc.dg/ucnid-1-utf8.c
Normal file
25
gcc/testsuite/gcc.dg/ucnid-1-utf8.c
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
/* { dg-do run } */
|
||||||
|
/* { dg-options "-std=c99 -g" } */
|
||||||
|
void abort (void);
|
||||||
|
|
||||||
|
int main (void)
|
||||||
|
{
|
||||||
|
int À = 1;
|
||||||
|
int Á = 2;
|
||||||
|
int  = 3;
|
||||||
|
int whÿ = 4;
|
||||||
|
int aÄbсδe = 5;
|
||||||
|
|
||||||
|
if (À != 1)
|
||||||
|
abort ();
|
||||||
|
if (Á != 2)
|
||||||
|
abort ();
|
||||||
|
if (Â != 3)
|
||||||
|
abort ();
|
||||||
|
if (whÿ != 4)
|
||||||
|
abort ();
|
||||||
|
if (aÄbсδe != 5)
|
||||||
|
abort ();
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
11
gcc/testsuite/gcc.dg/ucnid-10-utf8.c
Normal file
11
gcc/testsuite/gcc.dg/ucnid-10-utf8.c
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
/* Verify diagnostics for extended identifiers refer to UCNs (in the C
|
||||||
|
locale). Test #pragma pack diagnostics. */
|
||||||
|
/* { dg-do compile } */
|
||||||
|
/* { dg-options "-std=gnu99" } */
|
||||||
|
/* { dg-require-ascii-locale "" } */
|
||||||
|
/* { dg-skip-if "" { powerpc-ibm-aix* } } */
|
||||||
|
|
||||||
|
#pragma pack(push)
|
||||||
|
#pragma pack(pop, ó) /* { dg-warning "pop, \\\\U000000f3.*push, \\\\U000000f3" } */
|
||||||
|
#pragma pack(ç) /* { dg-warning "unknown action '\\\\U000000e7'" } */
|
||||||
|
|
7
gcc/testsuite/gcc.dg/ucnid-11-utf8.c
Normal file
7
gcc/testsuite/gcc.dg/ucnid-11-utf8.c
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
/* { dg-do run } */
|
||||||
|
/* { dg-xfail-if "" { powerpc-ibm-aix* } } */
|
||||||
|
/* { dg-skip-if "" { ! ucn } } */
|
||||||
|
/* { dg-skip-if "-fdata-sections not supported" { { hppa*-*-hpux* } && { ! lp64 } } } */
|
||||||
|
/* { dg-options "-std=c99 -fdata-sections -g" } */
|
||||||
|
|
||||||
|
#include "ucnid-3-utf8.c"
|
7
gcc/testsuite/gcc.dg/ucnid-12-utf8.c
Normal file
7
gcc/testsuite/gcc.dg/ucnid-12-utf8.c
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
/* { dg-do run } */
|
||||||
|
/* { dg-xfail-if "" { powerpc-ibm-aix* } } */
|
||||||
|
/* { dg-skip-if "" { ! ucn } } */
|
||||||
|
/* { dg-skip-if "-ffunction-sections not supported" { { hppa*-*-hpux* } && { ! lp64 } } } */
|
||||||
|
/* { dg-options "-std=c99 -ffunction-sections -g" } */
|
||||||
|
|
||||||
|
#include "ucnid-4-utf8.c"
|
15
gcc/testsuite/gcc.dg/ucnid-13-utf8.c
Normal file
15
gcc/testsuite/gcc.dg/ucnid-13-utf8.c
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
/* Verify diagnostics for extended identifiers refer to UCNs (in the C
|
||||||
|
locale). Miscellaneous diagnostics. */
|
||||||
|
/* { dg-do compile } */
|
||||||
|
/* { dg-options "-std=gnu99 -Wpacked" } */
|
||||||
|
/* { dg-require-ascii-locale "" } */
|
||||||
|
/* { dg-skip-if "" { powerpc-ibm-aix* } } */
|
||||||
|
|
||||||
|
int a __attribute__((À)); /* { dg-warning "'\\\\U000000c0' attribute directive ignored" } */
|
||||||
|
|
||||||
|
extern void Á (void) __attribute__((deprecated));
|
||||||
|
void g (void) { Á (); } /* { dg-warning "'\\\\U000000c1' is deprecated" } */
|
||||||
|
|
||||||
|
struct  { char c; } __attribute__((packed)); /* { dg-warning "'\\\\U000000c2'" } */
|
||||||
|
|
||||||
|
void h (void) { asm ("%[Ã]" : : ); } /* { dg-error "undefined named operand '\\\\U000000c3'" } */
|
23
gcc/testsuite/gcc.dg/ucnid-14-utf8.c
Normal file
23
gcc/testsuite/gcc.dg/ucnid-14-utf8.c
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
/* Test miscellaneous uses of UTF-8 in identifiers compile and run OK,
|
||||||
|
with debug info enabled. */
|
||||||
|
/* { dg-do run } */
|
||||||
|
/* { dg-options "-std=c99 -g" } */
|
||||||
|
|
||||||
|
extern void abort (void);
|
||||||
|
extern void exit (int);
|
||||||
|
|
||||||
|
int
|
||||||
|
main (void)
|
||||||
|
{
|
||||||
|
struct À { int Á; } x;
|
||||||
|
struct À *y = &x;
|
||||||
|
y->Á = 1;
|
||||||
|
if (x.Á != 1)
|
||||||
|
abort ();
|
||||||
|
goto ÿ;
|
||||||
|
ÿ: ;
|
||||||
|
enum e { Â = 4 };
|
||||||
|
if (Â != 4)
|
||||||
|
abort ();
|
||||||
|
exit (0);
|
||||||
|
}
|
38
gcc/testsuite/gcc.dg/ucnid-15-utf8.c
Normal file
38
gcc/testsuite/gcc.dg/ucnid-15-utf8.c
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
/* Test combinations of UTF-8 in various parts of identifiers. */
|
||||||
|
/* { dg-do run } */
|
||||||
|
/* { dg-xfail-if "" { "powerpc-ibm-aix*" } } */
|
||||||
|
/* { dg-skip-if "" { ! ucn } } */
|
||||||
|
/* { dg-options "-std=c99" } */
|
||||||
|
|
||||||
|
extern void abort (void);
|
||||||
|
|
||||||
|
int π = 3;
|
||||||
|
int π² = 9;
|
||||||
|
int πp1 = 4;
|
||||||
|
int twoπ = 6;
|
||||||
|
int four_plus_π_ = 7;
|
||||||
|
int 😀ÀÁÂÃÄÅßàáâãäaåbæçèéêcëìígîïð7ñ9__òóô4õöÆ3ÇÈÉÊËabcÌÍÎÏÐÑÒÓÔÕÖ😄😅🤣😂_ÿ = 2;
|
||||||
|
int π\u03C0 = 9;
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
if (π != 3)
|
||||||
|
abort ();
|
||||||
|
|
||||||
|
if (π² != 9)
|
||||||
|
abort ();
|
||||||
|
|
||||||
|
if (πp1 != 4)
|
||||||
|
abort ();
|
||||||
|
|
||||||
|
if (twoπ != 6)
|
||||||
|
abort ();
|
||||||
|
|
||||||
|
if (four_plus_π_ != 7)
|
||||||
|
abort () ;
|
||||||
|
|
||||||
|
if (😀ÀÁÂÃÄÅßàáâãäaåbæçèéêcëìígîïð7ñ9__òóô4õöÆ3ÇÈÉÊËabcÌÍÎÏÐÑÒÓÔÕÖ😄😅🤣😂_ÿ != 2)
|
||||||
|
abort ();
|
||||||
|
|
||||||
|
if(ππ != π²)
|
||||||
|
abort ();
|
||||||
|
}
|
6
gcc/testsuite/gcc.dg/ucnid-16-utf8.c
Normal file
6
gcc/testsuite/gcc.dg/ucnid-16-utf8.c
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
/* { dg-do compile } */
|
||||||
|
/* { dg-options "-std=c99 -g -finput-charset=latin1" } */
|
||||||
|
/* { dg-final { scan-file ucnid-16-utf8.s "²" } } */
|
||||||
|
|
||||||
|
/* This superscript is encoded in latin1; verify that we still get UTF-8 in the output. */
|
||||||
|
int x² = 9;
|
28
gcc/testsuite/gcc.dg/ucnid-2-utf8.c
Normal file
28
gcc/testsuite/gcc.dg/ucnid-2-utf8.c
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
/* { dg-do run } */
|
||||||
|
/* { dg-xfail-if "" { powerpc-ibm-aix* } } */
|
||||||
|
/* { dg-skip-if "" { ! ucn } } */
|
||||||
|
/* { dg-options "-std=c99 -g" } */
|
||||||
|
void abort (void);
|
||||||
|
|
||||||
|
static int À = 1;
|
||||||
|
static int Á = 2;
|
||||||
|
static int  = 3;
|
||||||
|
static int whÿ = 4;
|
||||||
|
static int aÄbсδe = 5;
|
||||||
|
|
||||||
|
int main (void)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (À != 1)
|
||||||
|
abort ();
|
||||||
|
if (Á != 2)
|
||||||
|
abort ();
|
||||||
|
if (Â != 3)
|
||||||
|
abort ();
|
||||||
|
if (whÿ != 4)
|
||||||
|
abort ();
|
||||||
|
if (aÄbсδe != 5)
|
||||||
|
abort ();
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
28
gcc/testsuite/gcc.dg/ucnid-3-utf8.c
Normal file
28
gcc/testsuite/gcc.dg/ucnid-3-utf8.c
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
/* { dg-do run } */
|
||||||
|
/* { dg-xfail-if "" { powerpc-ibm-aix* } } */
|
||||||
|
/* { dg-skip-if "" { ! ucn } } */
|
||||||
|
/* { dg-options "-std=c99 -g" } */
|
||||||
|
void abort (void);
|
||||||
|
|
||||||
|
int À = 1;
|
||||||
|
int Á = 2;
|
||||||
|
int  = 3;
|
||||||
|
int whÿ = 4;
|
||||||
|
int aÄbсδe = 5;
|
||||||
|
|
||||||
|
int main (void)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (À != 1)
|
||||||
|
abort ();
|
||||||
|
if (Á != 2)
|
||||||
|
abort ();
|
||||||
|
if (Â != 3)
|
||||||
|
abort ();
|
||||||
|
if (whÿ != 4)
|
||||||
|
abort ();
|
||||||
|
if (aÄbсδe != 5)
|
||||||
|
abort ();
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
28
gcc/testsuite/gcc.dg/ucnid-4-utf8.c
Normal file
28
gcc/testsuite/gcc.dg/ucnid-4-utf8.c
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
/* { dg-do run } */
|
||||||
|
/* { dg-xfail-if "" { powerpc-ibm-aix* } } */
|
||||||
|
/* { dg-skip-if "" { ! ucn } } */
|
||||||
|
/* { dg-options "-std=c99 -g" } */
|
||||||
|
void abort (void);
|
||||||
|
|
||||||
|
int À(void) { return 1; }
|
||||||
|
int Á(void) { return 2; }
|
||||||
|
int Â(void) { return 3; }
|
||||||
|
int whÿ(void) { return 4; }
|
||||||
|
int aÄbсδe(void) { return 5; }
|
||||||
|
|
||||||
|
int main (void)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (À() != 1)
|
||||||
|
abort ();
|
||||||
|
if (Á() != 2)
|
||||||
|
abort ();
|
||||||
|
if (Â() != 3)
|
||||||
|
abort ();
|
||||||
|
if (whÿ() != 4)
|
||||||
|
abort ();
|
||||||
|
if (aÄbсδe() != 5)
|
||||||
|
abort ();
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
19
gcc/testsuite/gcc.dg/ucnid-5-utf8.c
Normal file
19
gcc/testsuite/gcc.dg/ucnid-5-utf8.c
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
/* { dg-do run } */
|
||||||
|
/* { dg-skip-if "No dollar in identifiers" { avr-*-* powerpc-ibm-aix* } } */
|
||||||
|
/* { dg-options "-std=c99 -fdollars-in-identifiers -g" } */
|
||||||
|
void abort (void);
|
||||||
|
|
||||||
|
int a$b(void) { return 1; }
|
||||||
|
int a$b😀(void) { return 2; }
|
||||||
|
|
||||||
|
int main (void)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (a$b() != 1)
|
||||||
|
abort ();
|
||||||
|
|
||||||
|
if (a$b😀() != 2)
|
||||||
|
abort ();
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
28
gcc/testsuite/gcc.dg/ucnid-6-utf8.c
Normal file
28
gcc/testsuite/gcc.dg/ucnid-6-utf8.c
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
/* { dg-do run } */
|
||||||
|
/* { dg-xfail-if "" { "powerpc-ibm-aix*" } } */
|
||||||
|
/* { dg-skip-if "" { ! ucn } } */
|
||||||
|
/* { dg-options "-std=c99 -save-temps -g" } */
|
||||||
|
void abort (void);
|
||||||
|
|
||||||
|
int À(void) { return 1; }
|
||||||
|
int Á(void) { return 2; }
|
||||||
|
int Â(void) { return 3; }
|
||||||
|
int whÿ(void) { return 4; }
|
||||||
|
int aÄbсδe(void) { return 5; }
|
||||||
|
|
||||||
|
int main (void)
|
||||||
|
{
|
||||||
|
|
||||||
|
if (À() != 1)
|
||||||
|
abort ();
|
||||||
|
if (Á() != 2)
|
||||||
|
abort ();
|
||||||
|
if (Â() != 3)
|
||||||
|
abort ();
|
||||||
|
if (whÿ() != 4)
|
||||||
|
abort ();
|
||||||
|
if (aÄbсδe() != 5)
|
||||||
|
abort ();
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
9
gcc/testsuite/gcc.dg/ucnid-7-utf8.c
Normal file
9
gcc/testsuite/gcc.dg/ucnid-7-utf8.c
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
/* Verify diagnostics for extended identifiers refer to UCNs (in the C
|
||||||
|
locale). */
|
||||||
|
/* { dg-do compile } */
|
||||||
|
/* { dg-options "-std=c99" } */
|
||||||
|
/* { dg-require-ascii-locale "" } */
|
||||||
|
/* { dg-skip-if "" { "powerpc-ibm-aix*" } } */
|
||||||
|
|
||||||
|
void *p = &é; /* { dg-error "'\\\\U000000e9' undeclared" } */
|
||||||
|
void *q = &Ḁ; /* { dg-error "'\\\\U00001e00' undeclared" } */
|
16
gcc/testsuite/gcc.dg/ucnid-8-utf8.c
Normal file
16
gcc/testsuite/gcc.dg/ucnid-8-utf8.c
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
/* Verify diagnostics for extended identifiers refer to UCNs (in the C
|
||||||
|
locale). Further tests of C front-end diagnostics. */
|
||||||
|
/* { dg-do compile } */
|
||||||
|
/* { dg-options "-std=gnu99 -Wvla" } */
|
||||||
|
/* { dg-require-ascii-locale "" } */
|
||||||
|
/* { dg-skip-if "" { powerpc-ibm-aix* } } */
|
||||||
|
|
||||||
|
int a __attribute__((__mode__(é))); /* { dg-error "unknown machine mode '\\\\U000000e9'" } */
|
||||||
|
struct s1 { int é : 0; }; /* { dg-error "zero width for bit-field '\\\\U000000e9'" } */
|
||||||
|
|
||||||
|
void f (int b) { int é[b]; } /* { dg-warning "variable length array '\\\\U000000e9'" } */
|
||||||
|
|
||||||
|
void g (static int é); /* { dg-error "storage class specified for parameter '\\\\U000000e9'" } */
|
||||||
|
|
||||||
|
struct s2 { int á; } é = { { 0 } }; /* { dg-warning "braces around scalar initializer" } */
|
||||||
|
/* { dg-message "near initialization for '\\\\U000000e9\\.\\\\U000000e1'" "UCN diag" { target *-*-* } .-1 } */
|
25
gcc/testsuite/gcc.dg/ucnid-9-utf8.c
Normal file
25
gcc/testsuite/gcc.dg/ucnid-9-utf8.c
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
/* Test __func__ with extended identifiers and character set
|
||||||
|
conversions. */
|
||||||
|
/* { dg-do run } */
|
||||||
|
/* { dg-xfail-if "" { "powerpc-ibm-aix*" } } */
|
||||||
|
/* { dg-skip-if "" { ! ucn } } */
|
||||||
|
/* { dg-options "-std=c99 -fexec-charset=ISO-8859-1 -g" } */
|
||||||
|
/* { dg-require-iconv "ISO-8859-1" } */
|
||||||
|
|
||||||
|
extern int strcmp (const char *, const char *);
|
||||||
|
extern void abort (void);
|
||||||
|
extern void exit (int);
|
||||||
|
|
||||||
|
void
|
||||||
|
é (void)
|
||||||
|
{
|
||||||
|
if (strcmp (__func__, "é") != 0)
|
||||||
|
abort ();
|
||||||
|
}
|
||||||
|
|
||||||
|
int
|
||||||
|
main (void)
|
||||||
|
{
|
||||||
|
é ();
|
||||||
|
exit (0);
|
||||||
|
}
|
@ -1,3 +1,13 @@
|
|||||||
|
2019-09-19 Lewis Hyatt <lhyatt@gmail.com>
|
||||||
|
|
||||||
|
PR c/67224
|
||||||
|
* charset.c (_cpp_valid_utf8): New function to help lex UTF-8 tokens.
|
||||||
|
* internal.h (_cpp_valid_utf8): Declare.
|
||||||
|
* lex.c (forms_identifier_p): Use it to recognize UTF-8 identifiers.
|
||||||
|
(_cpp_lex_direct): Handle UTF-8 in identifiers and CPP_OTHER tokens.
|
||||||
|
Do all work in "default" case to avoid slowing down typical code paths.
|
||||||
|
Also handle $ and UCN in the default case for consistency.
|
||||||
|
|
||||||
2019-08-30 Nathan Sidwell <nathan@acm.org>
|
2019-08-30 Nathan Sidwell <nathan@acm.org>
|
||||||
|
|
||||||
New # semantics for popping to "" name.
|
New # semantics for popping to "" name.
|
||||||
|
@ -1198,6 +1198,84 @@ convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
|
|||||||
return from;
|
return from;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Performs a task similar to _cpp_valid_ucn, but parses UTF-8-encoded
|
||||||
|
extended characters rather than UCNs. If the return value is TRUE, then a
|
||||||
|
character was successfully decoded and stored in *CP; *PSTR has been
|
||||||
|
updated to point one past the valid UTF-8 sequence. Diagnostics may have
|
||||||
|
been emitted if the character parsed is not allowed in the current context.
|
||||||
|
If the return value is FALSE, then *PSTR has not been modified and *CP may
|
||||||
|
equal 0, to indicate that *PSTR does not form a valid UTF-8 sequence, or it
|
||||||
|
may, when processing an identifier in C mode, equal a codepoint that was
|
||||||
|
validly encoded but is not allowed to appear in an identifier. In either
|
||||||
|
case, no diagnostic is emitted, and the return value of FALSE should cause
|
||||||
|
a new token to be formed.
|
||||||
|
|
||||||
|
Unlike _cpp_valid_ucn, this will never be called when lexing a string; only
|
||||||
|
a potential identifier, or a CPP_OTHER token. NST is unused in the latter
|
||||||
|
case.
|
||||||
|
|
||||||
|
As in _cpp_valid_ucn, IDENTIFIER_POS is 0 when not in an identifier, 1 for
|
||||||
|
the start of an identifier, or 2 otherwise. */
|
||||||
|
|
||||||
|
extern bool
|
||||||
|
_cpp_valid_utf8 (cpp_reader *pfile,
|
||||||
|
const uchar **pstr,
|
||||||
|
const uchar *limit,
|
||||||
|
int identifier_pos,
|
||||||
|
struct normalize_state *nst,
|
||||||
|
cppchar_t *cp)
|
||||||
|
{
|
||||||
|
/* Remember the starting position: it is used to restore *PSTR on
   rejection and to quote the raw bytes in diagnostics.  */
const uchar *base = *pstr;
|
||||||
|
/* Number of bytes between the starting position and LIMIT.  */
size_t inbytesleft = limit - base;
|
||||||
|
/* A nonzero result from one_utf8_to_cppchar is handled as a decoding
   failure.  */
if (one_utf8_to_cppchar (pstr, &inbytesleft, cp))
|
||||||
|
{
|
||||||
|
/* No diagnostic here as this byte will rather become a
|
||||||
|
new token. */
|
||||||
|
*cp = 0;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Identifier validity is checked only when IDENTIFIER_POS is nonzero.  */
if (identifier_pos)
|
||||||
|
{
|
||||||
|
/* Only results 0 and 2 need handling; there is no default case, so any
   other result falls through to the successful return.  */
switch (ucn_valid_in_identifier (pfile, *cp, nst))
|
||||||
|
{
|
||||||
|
|
||||||
|
case 0:
|
||||||
|
/* In C++, this is an error for invalid character in an identifier
|
||||||
|
because logically, the UTF-8 was converted to a UCN during
|
||||||
|
translation phase 1 (even though we don't physically do it that
|
||||||
|
way). In C, this byte rather becomes grammatically a separate
|
||||||
|
token. */
|
||||||
|
|
||||||
|
if (CPP_OPTION (pfile, cplusplus))
|
||||||
|
cpp_error (pfile, CPP_DL_ERROR,
|
||||||
|
"extended character %.*s is not valid in an identifier",
|
||||||
|
(int) (*pstr - base), base);
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* C: put *PSTR back at the start so the caller sees it unmodified;
   *CP keeps the decoded code point.  */
*pstr = base;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Reached only in C++: the error has been reported and the character
   is still accepted (TRUE is returned below).  */
break;
|
||||||
|
|
||||||
|
case 2:
|
||||||
|
if (identifier_pos == 1)
|
||||||
|
{
|
||||||
|
/* This is treated the same way in C++ or C99 -- lexed as an
|
||||||
|
identifier which is then invalid because an identifier is
|
||||||
|
not allowed to start with this character. */
|
||||||
|
cpp_error (pfile, CPP_DL_ERROR,
|
||||||
|
"extended character %.*s is not valid at the start of an identifier",
|
||||||
|
(int) (*pstr - base), base);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Success: *CP holds the decoded character and *PSTR has been advanced
   past the UTF-8 sequence.  */
return true;
|
||||||
|
}
|
||||||
|
|
||||||
/* Subroutine of convert_hex and convert_oct. N is the representation
|
/* Subroutine of convert_hex and convert_oct. N is the representation
|
||||||
in the execution character set of a numeric escape; write it into the
|
in the execution character set of a numeric escape; write it into the
|
||||||
string buffer TBUF and update the end-of-string pointer therein. WIDE
|
string buffer TBUF and update the end-of-string pointer therein. WIDE
|
||||||
@ -1956,8 +2034,9 @@ cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Convert an identifier denoted by ID and LEN, which might contain
|
/* Convert an identifier denoted by ID and LEN, which might contain
|
||||||
UCN escapes, to the source character set, either UTF-8 or
|
UCN escapes or UTF-8 multibyte chars, to the source character set,
|
||||||
UTF-EBCDIC. Assumes that the identifier is actually a valid identifier. */
|
either UTF-8 or UTF-EBCDIC. Assumes that the identifier is actually
|
||||||
|
a valid identifier. */
|
||||||
cpp_hashnode *
|
cpp_hashnode *
|
||||||
_cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
|
_cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
|
||||||
{
|
{
|
||||||
|
@ -791,6 +791,14 @@ extern bool _cpp_valid_ucn (cpp_reader *, const unsigned char **,
|
|||||||
cppchar_t *,
|
cppchar_t *,
|
||||||
source_range *char_range,
|
source_range *char_range,
|
||||||
cpp_string_location_reader *loc_reader);
|
cpp_string_location_reader *loc_reader);
|
||||||
|
|
||||||
|
extern bool _cpp_valid_utf8 (cpp_reader *pfile,
|
||||||
|
const uchar **pstr,
|
||||||
|
const uchar *limit,
|
||||||
|
int identifier_pos,
|
||||||
|
struct normalize_state *nst,
|
||||||
|
cppchar_t *cp);
|
||||||
|
|
||||||
extern void _cpp_destroy_iconv (cpp_reader *);
|
extern void _cpp_destroy_iconv (cpp_reader *);
|
||||||
extern unsigned char *_cpp_convert_input (cpp_reader *, const char *,
|
extern unsigned char *_cpp_convert_input (cpp_reader *, const char *,
|
||||||
unsigned char *, size_t, size_t,
|
unsigned char *, size_t, size_t,
|
||||||
|
59
libcpp/lex.c
59
libcpp/lex.c
@ -1313,7 +1313,9 @@ warn_about_normalization (cpp_reader *pfile,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Returns TRUE if the sequence starting at buffer->cur is invalid in
|
static const cppchar_t utf8_signifier = 0xC0;
|
||||||
|
|
||||||
|
/* Returns TRUE if the sequence starting at buffer->cur is valid in
|
||||||
an identifier. FIRST is TRUE if this starts an identifier. */
|
an identifier. FIRST is TRUE if this starts an identifier. */
|
||||||
static bool
|
static bool
|
||||||
forms_identifier_p (cpp_reader *pfile, int first,
|
forms_identifier_p (cpp_reader *pfile, int first,
|
||||||
@ -1336,17 +1338,25 @@ forms_identifier_p (cpp_reader *pfile, int first,
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Is this a syntactically valid UCN? */
|
/* Is this a syntactically valid UCN or a valid UTF-8 char? */
|
||||||
if (CPP_OPTION (pfile, extended_identifiers)
|
if (CPP_OPTION (pfile, extended_identifiers))
|
||||||
&& *buffer->cur == '\\'
|
|
||||||
&& (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
|
|
||||||
{
|
{
|
||||||
cppchar_t s;
|
cppchar_t s;
|
||||||
buffer->cur += 2;
|
if (*buffer->cur >= utf8_signifier)
|
||||||
if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
|
{
|
||||||
state, &s, NULL, NULL))
|
if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
|
||||||
return true;
|
state, &s))
|
||||||
buffer->cur -= 2;
|
return true;
|
||||||
|
}
|
||||||
|
else if (*buffer->cur == '\\'
|
||||||
|
&& (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
|
||||||
|
{
|
||||||
|
buffer->cur += 2;
|
||||||
|
if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
|
||||||
|
state, &s, NULL, NULL))
|
||||||
|
return true;
|
||||||
|
buffer->cur -= 2;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
@ -1464,7 +1474,8 @@ lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
|
|||||||
pfile->buffer->cur = cur;
|
pfile->buffer->cur = cur;
|
||||||
if (starts_ucn || forms_identifier_p (pfile, false, nst))
|
if (starts_ucn || forms_identifier_p (pfile, false, nst))
|
||||||
{
|
{
|
||||||
/* Slower version for identifiers containing UCNs (or $). */
|
/* Slower version for identifiers containing UCNs
|
||||||
|
or extended chars (including $). */
|
||||||
do {
|
do {
|
||||||
while (ISIDNUM (*pfile->buffer->cur))
|
while (ISIDNUM (*pfile->buffer->cur))
|
||||||
{
|
{
|
||||||
@ -3123,12 +3134,12 @@ _cpp_lex_direct (cpp_reader *pfile)
|
|||||||
/* @ is a punctuator in Objective-C. */
|
/* @ is a punctuator in Objective-C. */
|
||||||
case '@': result->type = CPP_ATSIGN; break;
|
case '@': result->type = CPP_ATSIGN; break;
|
||||||
|
|
||||||
case '$':
|
default:
|
||||||
case '\\':
|
|
||||||
{
|
{
|
||||||
const uchar *base = --buffer->cur;
|
const uchar *base = --buffer->cur;
|
||||||
struct normalize_state nst = INITIAL_NORMALIZE_STATE;
|
|
||||||
|
|
||||||
|
/* Check for an extended identifier ($ or UCN or UTF-8). */
|
||||||
|
struct normalize_state nst = INITIAL_NORMALIZE_STATE;
|
||||||
if (forms_identifier_p (pfile, true, &nst))
|
if (forms_identifier_p (pfile, true, &nst))
|
||||||
{
|
{
|
||||||
result->type = CPP_NAME;
|
result->type = CPP_NAME;
|
||||||
@ -3137,13 +3148,21 @@ _cpp_lex_direct (cpp_reader *pfile)
|
|||||||
warn_about_normalization (pfile, result, &nst);
|
warn_about_normalization (pfile, result, &nst);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
buffer->cur++;
|
|
||||||
}
|
|
||||||
/* FALLTHRU */
|
|
||||||
|
|
||||||
default:
|
/* Otherwise this will form a CPP_OTHER token. Parse valid UTF-8 as a
|
||||||
create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
|
single token. */
|
||||||
break;
|
buffer->cur++;
|
||||||
|
if (c >= utf8_signifier)
|
||||||
|
{
|
||||||
|
const uchar *pstr = base;
|
||||||
|
cppchar_t s;
|
||||||
|
if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
|
||||||
|
buffer->cur = pstr;
|
||||||
|
}
|
||||||
|
create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Potentially convert the location of the token to a range. */
|
/* Potentially convert the location of the token to a range. */
|
||||||
|
Loading…
Reference in New Issue
Block a user