Make-lang.in: Add g-utf_32 unit for gnat and gnatbind

2005-03-08  Robert Dewar  <dewar@adacore.com>

	* Make-lang.in: Add g-utf_32 unit for gnat and gnatbind

	* impunit.adb: Add GNAT.UTF_32

	* scng.adb: Use gnat.utf_32 instead of widechar for utf_32 stuff

	* widechar.ads, widechar.adb: Remove redundant UTF-32 tables (scng
	now uses GNAT.UTF_32).

	* g-utf_32.ads, g-utf_32.adb: This is a new unit with full
	capabilities for categorizing characters using Unicode categories

From-SVN: r96501
This commit is contained in:
Robert Dewar 2005-03-15 17:10:45 +01:00 committed by Arnaud Charlet
parent 73f0204748
commit c8427bff14
7 changed files with 3798 additions and 1712 deletions

View File

@ -123,7 +123,8 @@ GNAT_ADA_OBJS = ada/ada.o ada/a-charac.o ada/a-chlat1.o ada/a-except.o \
ada/exp_prag.o ada/exp_smem.o ada/exp_strm.o ada/exp_tss.o ada/exp_util.o \
ada/exp_vfpt.o ada/expander.o ada/fname.o ada/fname-uf.o ada/fmap.o \
ada/freeze.o ada/frontend.o ada/gnat.o ada/g-hesora.o ada/g-htable.o \
ada/g-os_lib.o ada/g-speche.o ada/g-string.o ada/s-crc32.o ada/get_targ.o \
ada/g-os_lib.o ada/g-speche.o ada/g-string.o ada/g-utf_32.o \
ada/s-crc32.o ada/get_targ.o \
ada/gnatvsn.o ada/hlo.o ada/hostparm.o ada/impunit.o ada/interfac.o \
ada/itypes.o ada/inline.o ada/krunch.o ada/lib.o ada/layout.o ada/lib-load.o \
ada/lib-util.o ada/lib-xref.o ada/lib-writ.o ada/live.o ada/namet.o \
@ -192,6 +193,7 @@ GNATBIND_OBJS = \
ada/g-htable.o \
ada/g-os_lib.o \
ada/g-string.o \
ada/g-utf_32.o \
ada/gnat.o \
ada/gnatbind.o \
ada/gnatvsn.o \
@ -2132,6 +2134,8 @@ ada/g-speche.o : ada/gnat.ads ada/g-speche.ads ada/g-speche.adb \
ada/g-string.o : ada/gnat.ads ada/g-string.ads ada/g-string.adb \
ada/system.ads ada/unchdeal.ads
ada/g-utf_32.o : ada/gnat.ads ada/g-utf_32.ads ada/g-utf_32.adb
ada/get_targ.o : ada/get_targ.ads ada/get_targ.adb ada/system.ads \
ada/s-exctab.ads ada/s-stalib.ads ada/s-unstyp.ads ada/types.ads \
ada/unchconv.ads ada/unchdeal.ads

File diff suppressed because it is too large Load Diff

View File

@ -44,13 +44,59 @@
package GNAT.UTF_32 is
type UTF_32 is mod 2 ** 32;
-- The actual allowed range is 16#00_0000# .. 16#01_FFFF#
type UTF_32 is range 0 .. 16#7FFF_FFFF#;
-- So far, the only defined character codes are in 0 .. 16#01_FFFF#
function Is_UTF_32_Letter (U : UTF_32) return Boolean;
type Category is (
Cc, -- Other, Control
Cf, -- Other, Format
Cn, -- Other, Not Assigned
Co, -- Other, Private Use
Cs, -- Other, Surrogate
Ll, -- Letter, Lowercase
Lm, -- Letter, Modifier
Lo, -- Letter, Other
Lt, -- Letter, Titlecase
Lu, -- Letter, Uppercase
Mc, -- Mark, Spacing Combining
Me, -- Mark, Enclosing
Mn, -- Mark, Nonspacing
Nd, -- Number, Decimal Digit
Nl, -- Number, Letter
No, -- Number, Other
Pc, -- Punctuation, Connector
Pd, -- Punctuation, Dash
Pe, -- Punctuation, Close
Pf, -- Punctuation, Final quote
Pi, -- Punctuation, Initial quote
Po, -- Punctuation, Other
Ps, -- Punctuation, Open
Sc, -- Symbol, Currency
Sk, -- Symbol, Modifier
Sm, -- Symbol, Math
So, -- Symbol, Other
Zl, -- Separator, Line
Zp, -- Separator, Paragraph
Zs); -- Separator, Space
function Get_Category (U : UTF_32) return Category;
-- Given a UTF32 code, returns corresponding Category, or Cn if
-- the code does not have an assigned unicode category.
-- The following functions perform category tests corresponding to lexical
-- classes defined in the Ada standard. There are two interfaces for each
-- function. The first takes a Category (e.g. returned by Get_Category).
-- The second takes a UTF_32 code. The form taking the UTF_32 code is
-- typically more efficient than calling Get_Category, but if several
-- different tests are to be performed on the same code, it is more
-- efficient to use Get_Category to get the category, then test the
-- resulting category.
function Is_UTF_32_Letter (U : UTF_32) return Boolean;
function Is_UTF_32_Letter (C : Category) return Boolean;
pragma Inline (Is_UTF_32_Letter);
-- Returns true iff U is a letter that can be used to start an identifier.
-- This means that it is in one of the following categories:
-- Returns true iff U is a letter that can be used to start an identifier,
-- or if C is one of the corresponding categories, which are the following:
-- Letter, Uppercase (Lu)
-- Letter, Lowercase (Ll)
-- Letter, Titlecase (Lt)
@ -58,52 +104,59 @@ package GNAT.UTF_32 is
-- Letter, Other (Lo)
-- Number, Letter (Nl)
function Is_UTF_32_Digit (U : UTF_32) return Boolean;
function Is_UTF_32_Digit (U : UTF_32) return Boolean;
function Is_UTF_32_Digit (C : Category) return Boolean;
pragma Inline (Is_UTF_32_Digit);
-- Returns true iff U is a digit that can be used to extend an identifer,
-- which means it is in one of the following categories:
-- or if C is one of the corresponding categories, which are the following:
-- Number, Decimal_Digit (Nd)
function Is_UTF_32_Line_Terminator (U : UTF_32) return Boolean;
function Is_UTF_32_Line_Terminator (U : UTF_32) return Boolean;
pragma Inline (Is_UTF_32_Line_Terminator);
-- Returns true iff U is an allowed line terminator for source programs,
-- which means it is in one of the following categories:
-- Separator, Line (Zl)
-- Separator, Paragraph (Zp)
-- or that it is a conventional line terminator (CR, LF, VT, FF)
-- if U is in the category Zp (Separator, Paragaph), or Zs (Separator,
-- Line), or if U is a conventional line terminator (CR, LF, VT, FF).
-- There is no category version for this function, since the set of
-- characters does not correspond to a set of Unicode categories.
function Is_UTF_32_Mark (U : UTF_32) return Boolean;
function Is_UTF_32_Mark (U : UTF_32) return Boolean;
function Is_UTF_32_Mark (C : Category) return Boolean;
pragma Inline (Is_UTF_32_Mark);
-- Returns true iff U is a mark character which can be used to extend
-- an identifier. This means it is in one of the following categories:
-- Returns true iff U is a mark character which can be used to extend an
-- identifier, or if C is one of the corresponding categories, which are
-- the following:
-- Mark, Non-Spacing (Mn)
-- Mark, Spacing Combining (Mc)
function Is_UTF_32_Other (U : UTF_32) return Boolean;
function Is_UTF_32_Other (U : UTF_32) return Boolean;
function Is_UTF_32_Other (C : Category) return Boolean;
pragma Inline (Is_UTF_32_Other);
-- Returns true iff U is an other format character, which means that it
-- can be used to extend an identifier, but is ignored for the purposes of
-- matching of identiers. This means that it is in one of the following
-- categories:
-- matching of identiers, or if C is one of the corresponding categories,
-- which are the following:
-- Other, Format (Cf)
function Is_UTF_32_Punctuation (U : UTF_32) return Boolean;
function Is_UTF_32_Punctuation (U : UTF_32) return Boolean;
function Is_UTF_32_Punctuation (C : Category) return Boolean;
pragma Inline (Is_UTF_32_Punctuation);
-- Returns true iff U is a punctuation character that can be used to
-- separate pices of an identifier. This means that it is in one of the
-- following categories:
-- separate pices of an identifier, or if C is one of the corresponding
-- categories, which are the following:
-- Punctuation, Connector (Pc)
function Is_UTF_32_Space (U : UTF_32) return Boolean;
function Is_UTF_32_Space (U : UTF_32) return Boolean;
function Is_UTF_32_Space (C : Category) return Boolean;
pragma Inline (Is_UTF_32_Space);
-- Returns true iff U is considered a space to be ignored, which means
-- that it is in one of the following categories:
-- Returns true iff U is considered a space to be ignored, or if C is one
-- of the corresponding categories, which are the following:
-- Separator, Space (Zs)
function Is_UTF_32_Non_Graphic (U : UTF_32) return Boolean;
function Is_UTF_32_Non_Graphic (U : UTF_32) return Boolean;
function Is_UTF_32_Non_Graphic (C : Category) return Boolean;
pragma Inline (Is_UTF_32_Non_Graphic);
-- Returns true iff U is considered to be a non-graphic character,
-- which means that it is in one of the following categories:
-- Returns true iff U is considered to be a non-graphic character, or if C
-- is one of the corresponding categories, which are the following:
-- Other, Control (Cc)
-- Other, Private Use (Co)
-- Other, Surrogate (Cs)
@ -113,6 +166,16 @@ package GNAT.UTF_32 is
--
-- Note that the Ada category format effector is subsumed by the above
-- list of Unicode categories.
--
-- Note that Other, Unassiged (Cn) is quite deliberately not included
-- in the list of categories above. This means that should any of these
-- code positions be defined in future with graphic characters they will
-- be allowed without a need to change implementations or the standard.
-- The following function is used to fold to upper case, as required by
-- the Ada 2005 standard rules for identifier case folding. Two
-- identifiers are equivalent if they are identical after folding all
-- letters to upper case using this routine.
function UTF_32_To_Upper_Case (U : UTF_32) return UTF_32;
pragma Inline (UTF_32_To_Upper_Case);

View File

@ -250,6 +250,7 @@ package body Impunit is
"g-thread", -- GNAT.Threads
"g-traceb", -- GNAT.Traceback
"g-trasym", -- GNAT.Traceback.Symbolic
"g-utf_32", -- GNAT.UTF_32
"g-wistsp", -- GNAT.Wide_String_Split
-----------------------------------------------------

View File

@ -40,6 +40,8 @@ with Widechar; use Widechar;
with System.CRC32;
with System.WCh_Con; use System.WCh_Con;
with GNAT.UTF_32; use GNAT.UTF_32;
package body Scng is
use ASCII;
@ -1103,7 +1105,7 @@ package body Scng is
Accumulate_Checksum (Code);
if Ada_Version >= Ada_05
and then Is_UTF_32_Non_Graphic (Code)
and then Is_UTF_32_Non_Graphic (UTF_32 (Code))
then
Error_Msg
("(Ada 2005) non-graphic character not permitted " &
@ -1515,7 +1517,7 @@ package body Scng is
-- If UTF_32 terminator, terminate comment scan
elsif Is_UTF_32_Line_Terminator (Code) then
elsif Is_UTF_32_Line_Terminator (UTF_32 (Code)) then
Scan_Ptr := Wptr;
exit;
end if;
@ -1639,7 +1641,7 @@ package body Scng is
Code := Character'Pos (' ');
elsif Ada_Version >= Ada_05
and then Is_UTF_32_Non_Graphic (Code)
and then Is_UTF_32_Non_Graphic (UTF_32 (Code))
then
Error_Msg
("(Ada 2005) non-graphic character not permitted " &
@ -1899,7 +1901,7 @@ package body Scng is
-- Invalid control characters
when NUL | SOH | STX | ETX | EOT | ENQ | ACK | BEL | BS | SO |
when NUL | SOH | STX | ETX | EOT | ENQ | ACK | BEL | BS | ASCII.SO |
SI | DLE | DC1 | DC2 | DC3 | DC4 | NAK | SYN | ETB | CAN |
EM | FS | GS | RS | US | DEL
=>
@ -1942,6 +1944,7 @@ package body Scng is
declare
Code : Char_Code;
Cat : Category;
Err : Boolean;
begin
@ -1953,10 +1956,13 @@ package body Scng is
if Err then
Error_Illegal_Wide_Character;
goto Scan_Next_Character;
end if;
Cat := Get_Category (UTF_32 (Code));
-- If OK letter, reset scan ptr and go scan identifier
elsif Is_UTF_32_Letter (Code) then
if Is_UTF_32_Letter (Cat) then
Scan_Ptr := Wptr;
Name_Len := 0;
Underline_Found := False;
@ -1965,18 +1971,18 @@ package body Scng is
-- If OK wide space, ignore and keep scanning (we do not include
-- any ignored spaces in checksum)
elsif Is_UTF_32_Space (Code) then
elsif Is_UTF_32_Space (Cat) then
goto Scan_Next_Character;
-- If OK wide line terminator, terminate current line
elsif Is_UTF_32_Line_Terminator (Code) then
elsif Is_UTF_32_Line_Terminator (UTF_32 (Code)) then
Scan_Ptr := Wptr;
goto Scan_Line_Terminator;
-- Punctuation is an error (at start of identifier)
elsif Is_UTF_32_Punctuation (Code) then
elsif Is_UTF_32_Punctuation (Cat) then
Error_Msg
("identifier cannot start with punctuation", Wptr);
Scan_Ptr := Wptr;
@ -1986,7 +1992,7 @@ package body Scng is
-- Mark character is an error (at start of identifer)
elsif Is_UTF_32_Mark (Code) then
elsif Is_UTF_32_Mark (Cat) then
Error_Msg
("identifier cannot start with mark character", Wptr);
Scan_Ptr := Wptr;
@ -1996,7 +2002,7 @@ package body Scng is
-- Other format character is an error (at start of identifer)
elsif Is_UTF_32_Other (Code) then
elsif Is_UTF_32_Other (Cat) then
Error_Msg
("identifier cannot start with other format character", Wptr);
Scan_Ptr := Wptr;
@ -2008,7 +2014,7 @@ package body Scng is
-- identifier or bad literal. Not worth doing too much to try to
-- distinguish these cases, but we will do a little bit.
elsif Is_UTF_32_Digit (Code) then
elsif Is_UTF_32_Digit (Cat) then
Error_Msg
("identifier cannot start with digit character", Wptr);
Scan_Ptr := Wptr;
@ -2155,9 +2161,10 @@ package body Scng is
-- encoding into the name table entry for the identifier.
declare
Code : Char_Code;
Err : Boolean;
Chr : Character;
Code : Char_Code;
Err : Boolean;
Chr : Character;
Cat : Category;
begin
Wptr := Scan_Ptr;
@ -2198,19 +2205,22 @@ package body Scng is
("wide character not allowed in identifier", Wptr);
end if;
Cat := Get_Category (UTF_32 (Code));
-- If OK letter, store it folding to upper case. Note
-- that we include the folded letter in the checksum.
if Is_UTF_32_Letter (Code) then
Code := UTF_32_To_Upper_Case (Code);
if Is_UTF_32_Letter (Cat) then
Code :=
Char_Code (UTF_32_To_Upper_Case (UTF_32 (Code)));
Accumulate_Checksum (Code);
Store_Encoded_Character (Code);
Underline_Found := False;
-- If OK extended digit or mark, then store it
elsif Is_UTF_32_Digit (Code)
or else Is_UTF_32_Mark (Code)
elsif Is_UTF_32_Digit (Cat)
or else Is_UTF_32_Mark (Cat)
then
Accumulate_Checksum (Code);
Store_Encoded_Character (Code);
@ -2219,7 +2229,7 @@ package body Scng is
-- Wide punctuation is also stored, but counts as an
-- underline character for error checking purposes.
elsif Is_UTF_32_Punctuation (Code) then
elsif Is_UTF_32_Punctuation (Cat) then
Accumulate_Checksum (Code);
if Underline_Found then
@ -2241,12 +2251,12 @@ package body Scng is
-- stored. It seems reasonable to exclude it from the
-- checksum.
elsif Is_UTF_32_Other (Code) then
elsif Is_UTF_32_Other (Cat) then
null;
-- Wide character in category Separator,Space terminates
elsif Is_UTF_32_Space (Code) then
elsif Is_UTF_32_Space (Cat) then
goto Scan_Identifier_Complete;
-- Any other wide character is not acceptable

File diff suppressed because it is too large Load Diff

View File

@ -90,78 +90,4 @@ package Widechar is
P : Source_Ptr) return Boolean;
-- Determines if S (P) is the start of a wide character sequence
function Is_UTF_32_Letter (U : Char_Code) return Boolean;
pragma Inline (Is_UTF_32_Letter);
-- Returns true iff U is a letter that can be used to start an identifier.
-- This means that it is in one of the following categories:
-- Letter, Uppercase (Lu)
-- Letter, Lowercase (Ll)
-- Letter, Titlecase (Lt)
-- Letter, Modifier (Lm)
-- Letter, Other (Lo)
-- Number, Letter (Nl)
function Is_UTF_32_Digit (U : Char_Code) return Boolean;
pragma Inline (Is_UTF_32_Digit);
-- Returns true iff U is a digit that can be used to extend an identifer,
-- which means it is in one of the following categories:
-- Number, Decimal_Digit (Nd)
function Is_UTF_32_Line_Terminator (U : Char_Code) return Boolean;
pragma Inline (Is_UTF_32_Line_Terminator);
-- Returns true iff U is an allowed line terminator for source programs,
-- which means it is in one of the following categories:
-- Separator, Line (Zl)
-- Separator, Paragraph (Zp)
-- or that it is a conventional line terminator (CR, LF, VT, FF)
function Is_UTF_32_Mark (U : Char_Code) return Boolean;
pragma Inline (Is_UTF_32_Mark);
-- Returns true iff U is a mark character which can be used to extend
-- an identifier. This means it is in one of the following categories:
-- Mark, Non-Spacing (Mn)
-- Mark, Spacing Combining (Mc)
function Is_UTF_32_Other (U : Char_Code) return Boolean;
pragma Inline (Is_UTF_32_Other);
-- Returns true iff U is an other format character, which means that it
-- can be used to extend an identifier, but is ignored for the purposes of
-- matching of identiers. This means that it is in one of the following
-- categories:
-- Other, Format (Cf)
function Is_UTF_32_Punctuation (U : Char_Code) return Boolean;
pragma Inline (Is_UTF_32_Punctuation);
-- Returns true iff U is a punctuation character that can be used to
-- separate pices of an identifier. This means that it is in one of the
-- following categories:
-- Punctuation, Connector (Pc)
function Is_UTF_32_Space (U : Char_Code) return Boolean;
pragma Inline (Is_UTF_32_Space);
-- Returns true iff U is considered a space to be ignored, which means
-- that it is in one of the following categories:
-- Separator, Space (Zs)
function Is_UTF_32_Non_Graphic (U : Char_Code) return Boolean;
pragma Inline (Is_UTF_32_Non_Graphic);
-- Returns true iff U is considered to be a non-graphic character,
-- which means that it is in one of the following categories:
-- Other, Control (Cc)
-- Other, Private Use (Co)
-- Other, Surrogate (Cs)
-- Other, Format (Cf)
-- Separator, Line (Zl)
-- Separator, Paragraph (Zp)
--
-- Note that the Ada category format effector is subsumed by the above
-- list of Unicode categories.
function UTF_32_To_Upper_Case (U : Char_Code) return Char_Code;
pragma Inline (UTF_32_To_Upper_Case);
-- If U represents a lower case letter, returns the corresponding upper
-- case letter, otherwise U is returned unchanged. The folding is locale
-- independent as defined by documents referenced in the note in section
-- 1 of ISO/IEC 10646:2003
end Widechar;