unicode-decomp.pl: Move from chartables.pl...

2002-03-06  Eric Blake  <ebb9@email.byu.edu>

	* scripts/unicode-decomp.pl: Move from chartables.pl, and remove
	the code for generating include/java-chartables.h.
	* scripts/unicode-blocks.pl: Move from scripts/blocks.pl, and
	merge with Classpath.
	* scripts/unicode-muncher.pl: Copy from Classpath.
	* scripts/MakeCharTables.java: New file.
	* gnu/gcj/convert/Blocks-3.txt: New file.
	* gnu/gcj/convert/UnicodeData-3.0.0.txt: New file.
	* gnu/gcj/convert/UnicodeCharacterDatabase-3.0.0.html: New file.
	* gnu/java/lang/CharData.java: Copy from Classpath.
	* Makefile.am (ordinary_java_source_files): Add
	gnu/java/lang/CharData.java.
	* configure.in: Remove --enable-fast-character option.
	* java/lang/Character.java: Merge algorithms and Javadoc with
	Classpath.
	* java/lang/natCharacter.cc: Implement Unicode lookup table more
	efficiently.
	* include/java-chardecomp.h: Regenerate.
	* include/java-chartables.h: Regenerate.

From-SVN: r50370
This commit is contained in:
Eric Blake 2002-03-06 19:13:01 +00:00 committed by Eric Blake
parent 90681dec69
commit 74b1875a09
9 changed files with 3401 additions and 88067 deletions

View File

@ -1,3 +1,25 @@
2002-03-06 Eric Blake <ebb9@email.byu.edu>
* scripts/unicode-decomp.pl: Move from chartables.pl, and remove
the code for generating include/java-chartables.h.
* scripts/unicode-blocks.pl: Move from scripts/blocks.pl, and
merge with Classpath.
* scripts/unicode-muncher.pl: Copy from Classpath.
* scripts/MakeCharTables.java: New file.
* gnu/gcj/convert/Blocks-3.txt: New file.
* gnu/gcj/convert/UnicodeData-3.0.0.txt: New file.
* gnu/gcj/convert/UnicodeCharacterDatabase-3.0.0.html: New file.
* gnu/java/lang/CharData.java: Copy from Classpath.
* Makefile.am (ordinary_java_source_files): Add
gnu/java/lang/CharData.java.
* configure.in: Remove --enable-fast-character option.
* java/lang/Character.java: Merge algorithms and Javadoc with
Classpath.
* java/lang/natCharacter.cc: Implement Unicode lookup table more
efficiently.
* include/java-chardecomp.h: Regenerate.
* include/java-chartables.h: Regenerate.
2002-03-06 Bryce McKinlay <bryce@waitaki.otago.ac.nz>
* java/awt/MediaTracker.java: Implemented.

View File

@ -1288,6 +1288,7 @@ gnu/java/io/NullOutputStream.java \
gnu/java/io/ObjectIdentityWrapper.java \
gnu/java/lang/ArrayHelper.java \
gnu/java/lang/ClassHelper.java \
gnu/java/lang/CharData.java \
gnu/java/lang/reflect/TypeSignature.java \
gnu/java/locale/Calendar.java \
gnu/java/locale/Calendar_de.java \

View File

@ -1,965 +0,0 @@
# chartables.pl - A perl program to generate tables for use by the
# Character class.
# Copyright (C) 1998, 1999 Red Hat, Inc.
#
# This file is part of libjava.
#
# This software is copyrighted work licensed under the terms of the
# Libjava License. Please consult the file "LIBJAVA_LICENSE" for
# details.
# This program requires a `unidata.txt' file of the form distributed
# on the Unicode 2.0 CD ROM. Or, get it more conveniently here:
# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt
# Version `2.1.8' of this file was last used to update the Character class.
# Written using "Java Class Libraries", 2nd edition, ISBN 0-201-31002-3
# "The Java Language Specification", ISBN 0-201-63451-1
# plus online API docs for JDK 1.2 beta from http://www.javasoft.com.
# Usage: perl chartables.pl [-n] UnicodeData-VERSION.txt
# If this exits with nonzero status, then you must investigate the
# cause of the problem.
# Diagnostics and other information to stderr.
# This creates the new include/java-chartables.h and
# include/java-chardecomp.h files directly.
# With -n, the files are not created, but all processing
# still occurs.
# Zero-based indexes of the fields this script reads from one
# semicolon-separated record of the Unicode data file.
($CODE, $NAME, $CATEGORY) = (0, 1, 2);
$DECOMPOSITION = 5;
($DECIMAL, $DIGIT, $NUMERIC) = (6, 7, 8);
($UPPERCASE, $LOWERCASE, $TITLECASE) = (12, 13, 14);

# The Tamil digits are a special case: update_digit_block exempts this
# range from its digit-value check.
($TAMIL_DIGIT_ONE, $TAMIL_DIGIT_NINE) = (0x0be7, 0x0bef);

# Code points that end a legitimate gap in the tables.  process_gap
# gives every character of such a gap the fields of the record that
# ends it.
$CJK_IDEOGRAPH_END          = 0x9fa5;
$HANGUL_END                 = 0xd7a3;
$HIGH_SURROGATE_END         = 0xdb7f;
$PRIVATE_HIGH_SURROGATE_END = 0xdbff;
$LOW_SURROGATE_END          = 0xdfff;
$PRIVATE_END                = 0xf8ff;

# Titlecase characters (category `Lt'), keyed by code field, mapped to
# their upper and lower case fields.
%title_to_upper = ();
%title_to_lower = ();

# Numeric value of each non-decimal numeric character, keyed by code
# field.
%numerics = ();

# Character name, keyed by code field.
%name = ();

# Parallel start/end lists describing blocks of consecutive characters.
@digit_start = ();
@digit_end   = ();
@space_start = ();
@space_end   = ();

# A separate letter table is no longer generated.
# @letter_start = ();
# @letter_end = ();

# Blocks of consecutive characters sharing one category.
@all_start = ();
@all_end   = ();
@all_cats  = ();

# Case blocks: a run of characters, the first character it maps onto,
# and the characters that are cased by name only (no mapping).
@upper_start = ();
@upper_end   = ();
@upper_map   = ();
%upper_anom  = ();
@lower_start = ();
@lower_end   = ();
@lower_map   = ();
%lower_anom  = ();

# NOTE(review): set_attribute and print_fast_tables keep the attribute
# data in the hash %attributes; this statement only clears the
# like-named array.
@attributes = ();

# There are a few characters which actually need two attributes.
# These are special-cased: set_attribute stores the second value here
# for code points between $ROMAN_START and $ROMAN_END.
($ROMAN_START, $ROMAN_END) = (0x2160, 0x217f);
%second_attributes = ();

# Code of the previously processed record, and the script's exit status.
$prevcode = -1;
$status   = 0;

# Unicode general category abbreviation => name of the
# java::lang::Character constant printed for it.
%category_map =
    (
     'Mn' => 'NON_SPACING_MARK',
     'Mc' => 'COMBINING_SPACING_MARK',
     'Me' => 'ENCLOSING_MARK',

     'Nd' => 'DECIMAL_DIGIT_NUMBER',
     'Nl' => 'LETTER_NUMBER',
     'No' => 'OTHER_NUMBER',

     'Zs' => 'SPACE_SEPARATOR',
     'Zl' => 'LINE_SEPARATOR',
     'Zp' => 'PARAGRAPH_SEPARATOR',

     'Cc' => 'CONTROL',
     'Cf' => 'FORMAT',
     'Cs' => 'SURROGATE',
     'Co' => 'PRIVATE_USE',
     'Cn' => 'UNASSIGNED',

     'Lu' => 'UPPERCASE_LETTER',
     'Ll' => 'LOWERCASE_LETTER',
     'Lt' => 'TITLECASE_LETTER',
     'Lm' => 'MODIFIER_LETTER',
     'Lo' => 'OTHER_LETTER',

     'Pc' => 'CONNECTOR_PUNCTUATION',
     'Pd' => 'DASH_PUNCTUATION',
     'Ps' => 'START_PUNCTUATION',
     'Pe' => 'END_PUNCTUATION',
     'Pi' => 'START_PUNCTUATION',
     'Pf' => 'END_PUNCTUATION',
     'Po' => 'OTHER_PUNCTUATION',

     'Sm' => 'MATH_SYMBOL',
     'Sc' => 'CURRENCY_SYMBOL',
     'Sk' => 'MODIFIER_SYMBOL',
     'So' => 'OTHER_SYMBOL'
    );

# These map characters to their decompositions, keyed by numeric code.
%canonical_decomposition = ();
%full_decomposition = ();
# Handle `-n' and open output files.  With `-n' both outputs go to
# /dev/null, so all processing still occurs but no header is created.
local ($f1, $f2) = ('include/java-chartables.h',
                    'include/java-chardecomp.h');
if ($ARGV[0] eq '-n')
{
    shift @ARGV;
    $f1 = '/dev/null';
    $f2 = '/dev/null';
}
# Stop at once if an output file cannot be created.  Without this check
# every later print to the handle is silently dropped and the script
# can still exit with status 0.
open (CHARTABLE, "> $f1") || die "chartables.pl: cannot write $f1: $!\n";
open (DECOMP, "> $f2") || die "chartables.pl: cannot write $f2: $!\n";
# Process the Unicode file: one record per line, read from the files
# named on the command line (or standard input).
while (<>)
{
    # Remove the line terminator only.  A plain `chop' would eat the
    # last character of the final field when the last line has no
    # newline, and would leave a carriage return in that field for
    # CRLF input.
    s/\r?\n$//;

    # Specify a limit for split so that we pick up trailing fields.
    # We make the limit larger than we need, to catch the case where
    # there are extra fields.
    @fields = split (';', $_, 30);

    # Convert code to number.
    $ncode = hex ($fields[$CODE]);

    # A record has exactly 15 fields (last index 14).  This is only
    # reported; the record is still processed.
    if ($#fields != 14)
    {
        print STDERR ("Entry for \\u", $fields[$CODE],
                      " has wrong number of fields: ", $#fields, "\n");
    }

    $name{$fields[$CODE]} = $fields[$NAME];

    # If we've found a gap in the table, fill it in.
    if ($ncode != $prevcode + 1)
    {
        &process_gap (*fields, $prevcode, $ncode);
    }

    &process_char (*fields, $ncode);

    $prevcode = $ncode;
}

# Fill in everything after the last record, up to and including 0xffff.
if ($prevcode != 0xffff)
{
    # Setting of `fields' parameter doesn't matter here.
    &process_gap (*fields, $prevcode, 0x10000);
}
# Write the header of java-chartables.h.
print CHARTABLE "// java-chartables.h - Character tables for java.lang.Character -*- c++ -*-\n\n";
print CHARTABLE "#ifndef __JAVA_CHARTABLES_H__\n";
print CHARTABLE "#define __JAVA_CHARTABLES_H__\n\n";
print CHARTABLE "// These tables are automatically generated by the chartables.pl\n";
print CHARTABLE "// script. DO NOT EDIT the tables. Instead, fix the script\n";
print CHARTABLE "// and run it again.\n\n";
print CHARTABLE "// This file should only be included by natCharacter.cc\n\n";

# Running estimate of the size of the compact tables; the print_*
# subroutines add to it.
$bytes = 0;

# Titlecase mapping tables.  Compare the number of keys in the two
# hashes.  (`$#title_to_lower' would look at the array of that name,
# which is never filled in, so the test could never fail.)
local (@title_lower_keys) = keys %title_to_lower;
local (@title_upper_keys) = keys %title_to_upper;
if ($#title_lower_keys != $#title_upper_keys)
{
    # If this fails we need to reimplement toTitleCase.
    print STDERR "titlecase mappings have different sizes\n";
    $status = 1;
}
# Also ensure that the tables are entirely parallel.
foreach $key (sort keys %title_to_lower)
{
    if (! defined $title_to_upper{$key})
    {
        print STDERR "titlecase mappings have different entries\n";
        $status = 1;
    }
}

&print_single_map ("title_to_lower_table", %title_to_lower);
&print_single_map ("title_to_upper_table", %title_to_upper);

print CHARTABLE "#ifdef COMPACT_CHARACTER\n\n";

printf CHARTABLE "#define TAMIL_DIGIT_ONE 0x%04x\n\n", $TAMIL_DIGIT_ONE;

# All numeric values.
&print_numerics;

# Digits only.
&print_block ("digit_table", *digit_start, *digit_end);

# Space characters.
&print_block ("space_table", *space_start, *space_end);

# Letters. We used to generate a separate letter table. But this
# doesn't really seem worthwhile. Simply using `all_table' saves us
# about 800 bytes, and only adds 3 table probes to isLetter.
# &print_block ("letter_table", *letter_start, *letter_end);

# Case tables.
&print_case_table ("upper", *upper_start, *upper_end, *upper_map, *upper_anom);
&print_case_table ("lower", *lower_start, *lower_end, *lower_map, *lower_anom);

# Everything else.
&print_all_block (*all_start, *all_end, *all_cats);

print CHARTABLE "#else /* COMPACT_CHARACTER */\n\n";

printf CHARTABLE "#define ROMAN_START 0x%04x\n", $ROMAN_START;
printf CHARTABLE "#define ROMAN_END 0x%04x\n\n", $ROMAN_END;

&print_fast_tables (*all_start, *all_end, *all_cats,
                    *attributes, *second_attributes);

print CHARTABLE "#endif /* COMPACT_CHARACTER */\n\n";

print CHARTABLE "#endif /* __JAVA_CHARTABLES_H__ */\n";

printf STDERR "Approximately %d bytes of data generated (compact case)\n",
    $bytes;

# Now generate decomposition tables.
printf DECOMP "// java-chardecomp.h - Decomposition character tables -*- c++ -*-\n\n";
printf DECOMP "#ifndef __JAVA_CHARDECOMP_H__\n";
printf DECOMP "#define __JAVA_CHARDECOMP_H__\n\n";
print DECOMP "// These tables are automatically generated by the chartables.pl\n";
print DECOMP "// script. DO NOT EDIT the tables. Instead, fix the script\n";
print DECOMP "// and run it again.\n\n";
print DECOMP "// This file should only be included by natCollator.cc\n\n";

print DECOMP "struct decomp_entry\n{\n";
print DECOMP " jchar key;\n";
print DECOMP " const char *value;\n";
print DECOMP "};\n\n";

&write_decompositions;

printf DECOMP "#endif /* __JAVA_CHARDECOMP_H__ */\n";

# Buffered write errors (for instance a full disk) only show up when
# the handle is closed, so report them and make the exit status
# nonzero.
if (! close (CHARTABLE))
{
    print STDERR "error writing character tables: $!\n";
    $status = 1;
}
if (! close (DECOMP))
{
    print STDERR "error writing decomposition tables: $!\n";
    $status = 1;
}

exit $status;
# Fill in the code points strictly between $prevcode and $ncode, which
# have no record of their own.  Each one is handed to process_char with
# a synthesized record.
#   *fields   - record of the character that ends the gap
#   $prevcode - last code point that had a record
#   $ncode    - code point that ends the gap
sub process_gap
{
    local (*fields, $prevcode, $ncode) = @_;
    local (@gap_fields, $i);

    # Does the gap end at one of the known range endpoints?
    local (@known_end) = grep ($_ == $ncode,
                               $CJK_IDEOGRAPH_END,
                               $HANGUL_END,
                               $HIGH_SURROGATE_END,
                               $PRIVATE_HIGH_SURROGATE_END,
                               $LOW_SURROGATE_END,
                               $PRIVATE_END);

    if ($#known_end >= 0)
    {
        # Characters in such a gap are known to have the same
        # properties as the character that ends it.
        @gap_fields = @fields;
    }
    else
    {
        # Any other gap is unassigned: fifteen empty fields with the
        # category set to `Cn'.  (Reporting each such gap on stderr
        # prints too much to be enabled.)
        @gap_fields = ('') x 15;
        $gap_fields[$CATEGORY] = 'Cn';
    }

    $i = $prevcode + 1;
    while ($i < $ncode)
    {
        local ($hex) = sprintf ("%04x", $i);
        $gap_fields[$CODE] = $hex;
        $gap_fields[$NAME] = "CHARACTER " . $hex;
        &process_char (*gap_fields, $i);
        ++$i;
    }
}
# Process a single character.
#   *fields - the character's record (see the field index constants)
#   $ncode  - the character's code as a number
# Records the character's decomposition, titlecase or upper/lower case
# mapping, numeric value, digit block, space block and category in the
# global tables.  Problems are reported on stderr; some of them also
# set $status to 1.
sub process_char
{
    local (*fields, $ncode) = @_;

    if ($fields[$DECOMPOSITION] ne '')
    {
        &add_decomposition ($ncode, $fields[$DECOMPOSITION]);
    }

    # If this is a titlecase character, mark it.
    if ($fields[$CATEGORY] eq 'Lt')
    {
        $title_to_upper{$fields[$CODE]} = $fields[$UPPERCASE];
        $title_to_lower{$fields[$CODE]} = $fields[$LOWERCASE];
    }
    else
    {
        # For upper and lower case mappings, we try to build compact
        # tables that map range onto range. We specifically want to
        # avoid titlecase characters. Java specifies a range check to
        # make sure the character is not between 0x2000 and 0x2fff.
        # We avoid that here because we need to generate table entries
        # -- toLower and toUpper still work in that range.
        if ($fields[$UPPERCASE] eq ''
            && ($fields[$LOWERCASE] ne ''
                || $fields[$NAME] =~ /CAPITAL (LETTER|LIGATURE)/))
        {
            if ($fields[$LOWERCASE] ne '')
            {
                &update_case_block (*upper_start, *upper_end, *upper_map,
                                    $fields[$CODE], $fields[$LOWERCASE]);
                &set_attribute ($ncode, hex ($fields[$LOWERCASE]));
            }
            else
            {
                # Upper case by name only: there is no mapping.
                $upper_anom{$fields[$CODE]} = 1;
            }
        }
        elsif ($fields[$LOWERCASE] ne '')
        {
            print STDERR ("Java missed upper case char \\u",
                          $fields[$CODE], "\n");
        }
        elsif ($fields[$CATEGORY] eq 'Lu')
        {
            # This case is for letters which are marked as upper case
            # but for which there is no lower case equivalent. For
            # instance, LATIN LETTER YR.
        }

        if ($fields[$LOWERCASE] eq ''
            && ($fields[$UPPERCASE] ne ''
                || $fields[$NAME] =~ /SMALL (LETTER|LIGATURE)/))
        {
            if ($fields[$UPPERCASE] ne '')
            {
                &update_case_block (*lower_start, *lower_end, *lower_map,
                                    $fields[$CODE], $fields[$UPPERCASE]);
                &set_attribute ($ncode, hex ($fields[$UPPERCASE]));
            }
            else
            {
                # Lower case by name only: there is no mapping.
                $lower_anom{$fields[$CODE]} = 1;
            }
        }
        elsif ($fields[$UPPERCASE] ne '')
        {
            print STDERR ("Java missed lower case char \\u",
                          $fields[$CODE], "\n");
        }
        elsif ($fields[$CATEGORY] eq 'Ll')
        {
            # This case is for letters which are marked as lower case
            # but for which there is no upper case equivalent. For
            # instance, FEMININE ORDINAL INDICATOR.
        }
    }

    # If we have a non-decimal numeric value, add it to the list.
    if ($fields[$CATEGORY] eq 'Nd'
        && ($ncode < 0x2000 || $ncode > 0x2fff)
        && $fields[$NAME] =~ /DIGIT/)
    {
        # This is a digit character that is handled elsewhere.
    }
    elsif ($fields[$DIGIT] ne '' || $fields[$NUMERIC] ne '')
    {
        # Do a simple check.
        if ($fields[$DECIMAL] ne '')
        {
            # This catches bugs in an earlier implementation of
            # chartables.pl. Now it is here for historical interest
            # only.
            # print STDERR ("Character \\u", $fields[$CODE],
            #               " would have been missed as digit\n");
        }

        local ($val) = $fields[$DIGIT];
        $val = $fields[$NUMERIC] if $val eq '';

        local ($ok) = 1;

        # If we have a value which is not a positive integer, then we
        # set the value to -2 to make life easier for
        # Character.getNumericValue.
        if ($val !~ m/^[0-9]+$/)
        {
            if ($fields[$CATEGORY] ne 'Nl'
                && $fields[$CATEGORY] ne 'No')
            {
                # This shows a few errors in the Unicode table. These
                # characters have a missing Numeric field, and the `N'
                # for the mirrored field shows up there instead. I
                # reported these characters to errata@unicode.org on
                # Thu Sep 10 1998. They said it will be fixed in the
                # 2.1.6 release of the tables.
                #
                # The backslash is doubled: a bare \u in a
                # double-quoted string is Perl's "uppercase the next
                # character" escape and prints nothing itself.
                print STDERR ("Character \\u", $fields[$CODE],
                              " has value but is not numeric; val = '",
                              $val, "'\n");
                # We skip these.
                $ok = 0;
            }
            $val = "-2";
        }

        if ($ok)
        {
            $numerics{$fields[$CODE]} = $val;
            &set_attribute ($ncode, $val);
        }
    }

    # We build a table that lists ranges of ordinary decimal values.
    # At each step we make sure that the digits are in the correct
    # order, with no holes, as this is assumed by Character. If this
    # fails, reimplementation is required. This implementation
    # dovetails nicely with the Java Spec, which has strange rules for
    # what constitutes a decimal value. In particular the Unicode
    # name must contain the word `DIGIT'. The spec doesn't directly
    # say that digits must have type `Nd' (or that their value must be
    # an integer), but that can be inferred from the list of digits in
    # the book(s). Currently the only Unicode characters whose name
    # includes `DIGIT' which would not fit are the Tibetan "half"
    # digits.
    if ($fields[$CATEGORY] eq 'Nd')
    {
        if (($ncode < 0x2000 || $ncode > 0x2fff)
            && $fields[$NAME] =~ /DIGIT/)
        {
            &update_digit_block (*digit_start, *digit_end, $fields[$CODE],
                                 $fields[$DECIMAL]);
            &set_attribute ($ncode, $fields[$DECIMAL]);
        }
        else
        {
            # If this fails then Character.getType will fail. We
            # assume that things in `digit_table' are the only
            # category `Nd' characters.
            print STDERR ("Character \\u", $fields[$CODE],
                          " is class Nd but not in digit table\n");
            $status = 1;
        }
    }

    # Keep track of space characters.
    if ($fields[$CATEGORY] =~ /Z[slp]/)
    {
        &update_block (*space_start, *space_end, $fields[$CODE]);
    }

    # Keep track of letters.
    # if ($fields[$CATEGORY] =~ /L[ultmo]/)
    # {
    #     &update_letter_block (*letter_start, *letter_end, $fields[$CODE],
    #                           $fields[$CATEGORY]);
    # }

    # Keep track of all characters. You might think we wouldn't have
    # to do this for uppercase letters, or other characters we already
    # "classify". The problem is that this classification is
    # different. E.g., \u216f is uppercase by Java rules, but is a
    # LETTER_NUMBER here.
    &update_all_block (*all_start, *all_end, *all_cats,
                       $fields[$CODE], $fields[$CATEGORY]);
}
# Record the decomposition of character $ncode.
#   $ncode - code of the character, as a number
#   $value - decomposition field: space-separated hex codes, optionally
#            preceded by a `<tag>'
# A value whose first word is a `<tag>' is stored in
# %full_decomposition, any other value in %canonical_decomposition.
sub add_decomposition
{
    local ($ncode, $value) = @_;
    local (@words) = split (' ', $value);
    local ($is_full) = 0;
    local (@decomp) = ();

    # Only a tag in first position is recognized, and it is not part of
    # the expansion.
    if ($#words >= 0 && $words[0] =~ /^\<.*\>$/)
    {
        $is_full = 1;
        shift (@words);
    }

    foreach (@words)
    {
        push (@decomp, hex ($_));
    }

    # We pack the value into a string because this means we can stick
    # with Perl 4 features.
    local ($s) = pack ("I*", @decomp);
    if ($is_full)
    {
        $full_decomposition{$ncode} = $s;
    }
    else
    {
        $canonical_decomposition{$ncode} = $s;
    }
}
# Write one decomposition table to DECOMP as a C array named
# `${name}_decomposition'.
#   $name     - prefix of the array name
#   $is_canon - accepted but not used
#   %table    - numeric character code => packed expansion
# Entries are written in increasing code order, for codes 0 to 65535.
sub write_single_decomposition
{
    local ($name, $is_canon, %table) = @_;
    local ($key, $char, @entries);

    printf DECOMP "static const decomp_entry ${name}_decomposition[] =\n{\n";

    foreach $key (grep (defined $table{$_}, 0 .. 65535))
    {
        # We represent the expansion as a series of bytes: two hex
        # escapes per character, high byte first.  This is ugly, but
        # relatively space-efficient.  Most expansions are short, but
        # there are a few that are very long (e.g. \uFDFA), so a
        # fixed-space representation would waste a lot of space.
        local ($entry) = sprintf (" { 0x%04x, \"", $key);
        foreach $char (unpack ("I*", $table{$key}))
        {
            $entry .= sprintf ("\\x%02x\\x%02x", ($char / 256), ($char % 256));
        }
        push (@entries, $entry . "\" }");
    }

    # Entries are separated, not terminated, by a comma.
    print DECOMP join (",\n", @entries), "\n};\n\n";
}
# Write both decomposition tables to DECOMP: the canonical one first,
# then the full one.
sub write_decompositions
{
    local (@canonical_args) = ('canonical', 1, %canonical_decomposition);
    local (@full_args) = ('full', 0, %full_decomposition);

    &write_single_decomposition (@canonical_args);
    &write_single_decomposition (@full_args);
}
# A set of blocks of consecutive characters is kept as two parallel
# lists, one of first and one of last codes.  Add a character to it.
#   *start, *end - the two lists
#   $char        - the character's code, in hex
# Returns 1 if the last block was extended, 0 if a new block was
# started.
sub update_block
{
    local (*start, *end, $char) = @_;
    local ($nchar) = hex ($char);
    local ($last) = $#end;

    # The character directly follows the last block: grow that block.
    if ($last >= 0 && $end[$last] + 1 == $nchar)
    {
        $end[$last] = $nchar;
        return 1;
    }

    # Otherwise the character starts a block of its own.
    local ($next) = $last + 1;
    $start[$next] = $nchar;
    $end[$next] = $nchar;
    return 0;
}
# Return true if character $char (a number) directly follows the last
# entry of the list *end, i.e. if adding it would extend the existing
# block instead of starting a new one.
sub block_append_p
{
    local (*end, $char) = @_;
    return ! ($#end < 0 || $end[$#end] != $char - 1);
}
# This updates the digit block. This table is much like an ordinary
# block, but it has an extra constraint: a digit's value must equal its
# offset from the start of its block.
#   *start, *end - the digit block lists
#   $char        - the character's code, in hex
#   $value       - the character's decimal digit value
# A violation is reported on stderr and sets $status to 1.
sub update_digit_block
{
    local (*start, *end, $char, $value) = @_;

    # Hand on the typeglobs, as update_letter_block does.  Passing
    # `$start' and `$end' would pass two scalars that are never set
    # rather than the lists, leaving it to dynamic scoping whether the
    # right lists get updated at all.
    &update_block (*start, *end, $char);
    local ($nchar) = hex ($char);

    # We want to make sure that the new digit's value is correct for
    # its place in the block. However, we special-case Tamil digits,
    # since Tamil does not have a digit `0'.
    local ($count) = $#start;
    if (($nchar < $TAMIL_DIGIT_ONE || $nchar > $TAMIL_DIGIT_NINE)
        && $nchar - $start[$count] != $value)
    {
        # If this fails then Character.digit_value will be wrong.
        print STDERR "Character \\u", $char, " violates digit constraint\n";
        $status = 1;
    }
}
# Update letter table. We could be smart about avoiding upper or
# lower case letters, but it is much simpler to just track them all.
#   *start, *end - the letter block lists
#   $char        - the character's code, in hex
#   $category    - accepted but not consulted
# Returns whatever update_block returns.
sub update_letter_block
{
    local (*start, *end, $char, $category) = @_;
    return &update_block (*start, *end, $char);
}
# Update `all' table, which holds runs of consecutive characters that
# share a category.
#   *start, *end, *cats - parallel lists: first code, last code and
#                         category of each run
#   $char               - the character's code, in hex
#   $category           - the character's category
# FIXME: if a given type has very few characters, we should just inline
# the code. E.g., there is only one paragraph separator.
sub update_all_block
{
    local (*start, *end, *cats, $char, $category) = @_;
    local ($nchar) = hex ($char);
    local ($count) = $#end;

    if ($count < 0
        || $end[$count] != $nchar - 1
        || $cats[$count] ne $category)
    {
        # No run yet, a hole, or a change of category: start a new run.
        ++$count;
        $start[$count] = $nchar;
        $end[$count] = $nchar;
        $cats[$count] = $category;
    }
    else
    {
        # Same category, directly after the last run: extend it.
        ++$end[$count];
    }
}
# Update a case table. We handle case tables specially because we
# want to map (e.g.) a block of uppercase characters directly onto the
# corresponding block of lowercase characters. Therefore a new entry
# is generated whenever the block would no longer map directly.
#   *start, *end - first and last code of each block
#   *map         - code that the first character of each block maps to
#   $char        - the character's code, in hex
#   $mapchar     - the code it maps to, in hex
sub update_case_block
{
    local (*start, *end, *map, $char, $mapchar) = @_;
    local ($nchar, $nmap) = (hex ($char), hex ($mapchar));
    local ($count) = $#end;

    # The last block can absorb the character only if the character
    # directly follows it and its mapping lies at the same offset.
    local ($extends) = ($count >= 0
                        && $end[$count] == $nchar - 1
                        && $nchar - $start[$count] == $nmap - $map[$count]);

    if (! $extends)
    {
        ++$count;
        $start[$count] = $nchar;
        $end[$count] = $nchar;
        $map[$count] = $nmap;
    }
    else
    {
        ++$end[$count];
    }
}
# Set the attribute value for the character. Each character can have
# only one attribute, except those between $ROMAN_START and $ROMAN_END,
# whose second, different value goes to %second_attributes.
#   $ncode - the character's code, as a number
#   $attr  - the attribute value
# A conflicting or out-of-range value is reported on stderr and not
# stored.
sub set_attribute
{
    local ($ncode, $attr) = @_;
    local ($current) = $attributes{$ncode};

    if ($current eq '' || $current eq $attr)
    {
        # Attributes can be interpreted as unsigned in some situations,
        # so we check against 65535. This could cause errors -- we need
        # to check the interpretation here.
        if ($attr >= -32768 && $attr <= 65535)
        {
            $attributes{$ncode} = $attr;
        }
        else
        {
            printf STDERR "attribute out of range for character \\u%04x\n", $ncode;
        }
    }
    elsif ($ncode >= $ROMAN_START && $ncode <= $ROMAN_END)
    {
        $second_attributes{$ncode} = $attr;
    }
    else
    {
        printf STDERR "character \\u%04x already has attribute\n", $ncode;
    }
}
# Print a block table to CHARTABLE as a C array of { first, last }
# pairs named $title.
#   $title       - name of the array
#   *start, *end - parallel lists of first and last codes
# Adds four to $bytes for each pair.
sub print_block
{
    local ($title, *start, *end) = @_;
    local ($row);

    print CHARTABLE "static const jchar ", $title, "[][2] =\n", " {\n";

    foreach $row (0 .. $#start)
    {
        print CHARTABLE " { ";
        &print_char ($start[$row]);
        print CHARTABLE ", ";
        &print_char ($end[$row]);
        # Every pair but the last is followed by a comma.
        print CHARTABLE ($row == $#start ? " }\n" : " },\n");
        $bytes += 4; # Two bytes per char.
    }

    print CHARTABLE " };\n\n";
}
# Print the numerics table to CHARTABLE as two parallel C arrays:
# `numeric_table' holds the character codes, in the sorted order of
# their keys in %numerics, and `numeric_value' the corresponding
# values.  Adds two to $bytes for every element printed.  A value that
# does not fit in a short is reported on stderr and sets $status to 1.
sub print_numerics
{
    local (@codes) = sort keys %numerics;
    local ($total) = $#codes + 1;
    local ($n);

    print CHARTABLE "static const jchar numeric_table[] =\n", " { ";
    for ($n = 1; $n <= $total; ++$n)
    {
        &print_char (hex ($codes[$n - 1]));
        print CHARTABLE ", " if $n < $total;
        # Print 5 per line.
        print CHARTABLE "\n " if ($n % 5 == 0);
        $bytes += 2; # One character.
    }
    print CHARTABLE " };\n\n";

    print CHARTABLE "static const jshort numeric_value[] =\n", " { ";
    for ($n = 1; $n <= $total; ++$n)
    {
        local ($value) = $numerics{$codes[$n - 1]};
        print CHARTABLE $value;
        if ($value < -32768 || $value > 32767)
        {
            # This means our generated type info is incorrect. We
            # could just detect and work around this here, but I'm
            # lazy.
            print STDERR "numeric value won't fit in a short\n";
            $status = 1;
        }
        print CHARTABLE ", " if $n < $total;
        # Print 10 per line.
        print CHARTABLE "\n " if ($n % 10 == 0);
        $bytes += 2; # One short.
    }
    print CHARTABLE " };\n\n";
}
# Print a table that maps one single letter onto another, as a C array
# of { from, to } pairs named $title.  Each pair is followed by a
# comment giving the character's name from %name.
#   $title - name of the array
#   %map   - code => mapped code, both in hex
# Adds four to $bytes for each pair.
sub print_single_map
{
    local ($title, %map) = @_;
    local (@keys) = sort keys %map;
    # The counters are local, as in the other print_* subroutines, so
    # that a call does not overwrite the package variables $num and $i.
    local ($num) = $#keys + 1;
    local ($i) = 0;
    local ($key);

    print CHARTABLE "static const jchar ", $title, "[][2] =\n";
    print CHARTABLE " {\n";
    foreach $key (@keys)
    {
        print CHARTABLE " { ";
        &print_char (hex ($key));
        print CHARTABLE ", ";
        &print_char (hex ($map{$key}));
        print CHARTABLE " }";
        ++$i;
        # A space takes the place of the comma after the last pair, so
        # that the comments line up.
        if ($i < $num)
        {
            print CHARTABLE ",";
        }
        else
        {
            print CHARTABLE " ";
        }
        print CHARTABLE " // ", $name{$key}, "\n";
        $bytes += 4; # Two bytes per char.
    }
    print CHARTABLE " };\n\n";
}
# Print the `all' block: the ranges as `all_table' (via print_block)
# and their categories as the parallel array `category_table'.
#   *start, *end, *cats - parallel lists: first code, last code and
#                         category of each run
# Adds one to $bytes for each category printed.
sub print_all_block
{
    local (*start, *end, *cats) = @_;
    local ($idx);
    local ($covered) = 0;

    &print_block ("all_table", *start, *end);

    # We do this computation just to make sure it isn't cheaper to
    # simply list all the characters individually.
    foreach $idx (0 .. $#start)
    {
        $covered += $end[$idx] - $start[$idx] + 1;
    }
    printf STDERR ("all_table encodes %d characters in %d entries\n",
                   $covered, $#start + 1);

    print CHARTABLE "static const jbyte category_table[] =\n", " { ";
    foreach $idx (0 .. $#cats)
    {
        # Two successive entries may have the same category.  That
        # isn't an error: two ranges can be non-adjacent while the
        # intervening characters are left out of the table for other
        # reasons.  We could exploit this to make the table a little
        # smaller.
        print CHARTABLE 'java::lang::Character::', $category_map{$cats[$idx]};
        print CHARTABLE ", " if ($idx < $#cats);
        print CHARTABLE "\n ";
        ++$bytes;
    }
    print CHARTABLE " };\n\n";
}
# Print case table: the blocks as `${title}_case_table' (via
# print_block) and the code each block maps onto as
# `${title}_case_map_table'.
#   $title             - `upper' or `lower'
#   *start, *end, *map - the case block lists
#   *anomalous         - hash whose keys are the characters that are
#                        cased by name only
# For `upper', any anomalous character is an error: it is reported on
# stderr and $status is set to 1.  For any other title the anomalous
# characters are printed as `${title}_anomalous_table'.  Adds two to
# $bytes for each element printed here.
sub print_case_table
{
    local ($title, *start, *end, *map, *anomalous) = @_;
    local ($n);

    &print_block ($title . '_case_table', *start, *end);

    print CHARTABLE "static const jchar ", $title, "_case_map_table[] =\n";
    print CHARTABLE " { ";
    for ($n = 1; $n <= $#map + 1; ++$n)
    {
        &print_char ($map[$n - 1]);
        print CHARTABLE ", " if $n <= $#map;
        # Print 5 per line.
        print CHARTABLE "\n " if $n % 5 == 0;
        $bytes += 2;
    }
    print CHARTABLE " };\n";

    local (@odd) = sort keys %anomalous;

    if ($title ne 'upper')
    {
        print CHARTABLE "\n";
        print CHARTABLE "static const jchar ", $title, "_anomalous_table[] =\n";
        print CHARTABLE " { ";
        for ($n = 1; $n <= $#odd + 1; ++$n)
        {
            &print_char (hex ($odd[$n - 1]));
            print CHARTABLE ", " if $n <= $#odd;
            print CHARTABLE "\n " if $n % 5 == 0;
            $bytes += 2;
        }
        print CHARTABLE " };\n";
    }
    elsif ($#odd >= 0)
    {
        # If these are found we need to change Character.isUpperCase.
        print STDERR "Found anomalous upper case characters\n";
        $status = 1;
    }

    print CHARTABLE "\n";
}
# Print the type table and attributes table for the fast version:
#   type_table                - one category per character in the runs
#   attribute_table           - one attribute per code 0 to 0xffff
#   secondary_attribute_table - one value per code from $ROMAN_START to
#                               $ROMAN_END
#   *start, *end, *cats - parallel lists describing the category runs
#   *atts, *second_atts - hashes of attributes, keyed by numeric code
# A code without an attribute gets 0, which is also stored in the
# attribute hash.
sub print_fast_tables
{
    local (*start, *end, *cats, *atts, *second_atts) = @_;
    local ($run, $code);

    print CHARTABLE "static const jbyte type_table[] =\n{ ";
    for ($run = 0; $run <= $#cats; ++$run)
    {
        local ($type) = $category_map{$cats[$run]};
        $code = $start[$run];
        while ($code <= $end[$run])
        {
            print CHARTABLE 'java::lang::Character::', $type;
            # No comma after the very last character of the last run.
            print CHARTABLE "," unless ($run >= $#cats && $code >= $end[$run]);
            print CHARTABLE "\n ";
            ++$code;
        }
    }
    print CHARTABLE "\n };\n\n";

    print CHARTABLE "static const jshort attribute_table[] =\n{ ";
    for ($code = 0; $code <= 0xffff; ++$code)
    {
        $atts{$code} = 0 unless defined $atts{$code};
        print CHARTABLE $atts{$code};
        print CHARTABLE ", " if $code < 0xffff;
        print CHARTABLE "\n " if $code % 5 == 1;
    }
    print CHARTABLE "\n };\n\n";

    print CHARTABLE "static const jshort secondary_attribute_table[] =\n{ ";
    for ($code = $ROMAN_START; $code <= $ROMAN_END; ++$code)
    {
        print CHARTABLE $second_atts{$code};
        print CHARTABLE ", " if $code < $ROMAN_END;
        print CHARTABLE "\n " if $code % 5 == 1;
    }
    print CHARTABLE "\n };\n\n";
}
# Print a character constant to CHARTABLE: the code given as a number,
# written as a C hex literal of at least four digits.
sub print_char
{
    print CHARTABLE sprintf ("0x%04x", $_[0]);
}

View File

@ -42,13 +42,6 @@ AC_SUBST(COMPPATH)
dnl The -no-testsuite modules omit the test subdir.
AM_CONDITIONAL(TESTSUBDIR, test -d $srcdir/testsuite)
dnl See whether the user prefers size or speed for Character.
dnl The default is size.
AC_ARG_ENABLE(fast-character,
[ --enable-fast-character prefer speed over size for Character],
# Nothing
, AC_DEFINE(COMPACT_CHARACTER))
dnl Should the runtime set system properties by examining the
dnl environment variable GCJ_PROPERTIES?
AC_ARG_ENABLE(getenv-properties,

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,12 +1,39 @@
// natCharacter.cc - Native part of Character class.
/* java.lang.Character -- Wrapper class for char, and Unicode subsets
Copyright (C) 1998, 1999, 2001, 2002 Free Software Foundation, Inc.
/* Copyright (C) 1998, 1999 Free Software Foundation
This file is part of GNU Classpath.
This file is part of libgcj.
GNU Classpath is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
This software is copyrighted work licensed under the terms of the
Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
details. */
GNU Classpath is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with GNU Classpath; see the file COPYING. If not, write to the
Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA.
Linking this library statically or dynamically with other modules is
making a combined work based on this library. Thus, the terms and
conditions of the GNU General Public License cover the whole
combination.
As a special exception, the copyright holders of this library give you
permission to link this library with independent modules to produce an
executable, regardless of the license terms of these independent
modules, and to copy and distribute the resulting executable under
terms of your choice, provided that you also meet, for each linked
independent module, the terms and conditions of the license of that
module. An independent module is a module which is not derived from
or based on this library. If you modify this library, you may extend
this exception to your version of the library, but you are not
obligated to do so. If you do not wish to do so, delete this
exception statement from your version. */
#include <config.h>
@ -18,267 +45,69 @@ details. */
#define asize(x) ((sizeof (x)) / sizeof (x[0]))
static jchar
to_lower_title (jchar ch)
jchar
java::lang::Character::readChar(jchar ch)
{
for (unsigned int i = 0; i < asize (title_to_upper_table); ++i)
{
// We can assume that the entries in the two tables are
// parallel. This is checked in the script.
if (title_to_upper_table[i][1] == ch
|| title_to_upper_table[i][0] == ch)
return title_to_lower_table[i][1];
}
return ch;
// Perform 16-bit addition to find the correct entry in data.
return data[(jchar) (blocks[ch >> SHIFT] + ch)];
}
static jchar
to_upper_title (jchar ch)
jint
java::lang::Character::getType(jchar ch)
{
for (unsigned int i = 0; i < asize (title_to_lower_table); ++i)
{
// We can assume that the entries in the two tables are
// parallel. This is checked in the script.
if (title_to_lower_table[i][1] == ch
|| title_to_lower_table[i][0] == ch)
return title_to_upper_table[i][1];
}
return ch;
}
jboolean
java::lang::Character::isTitleCase (jchar ch)
{
for (unsigned int i = 0; i < asize (title_to_lower_table); ++i)
{
if (title_to_lower_table[i][0] == ch)
return true;
}
return false;
// Perform 16-bit addition to find the correct entry in data.
return (jint) (data[(jchar) (blocks[ch >> SHIFT] + ch)] & TYPE_MASK);
}
jchar
java::lang::Character::toTitleCase (jchar ch)
java::lang::Character::toLowerCase(jchar ch)
{
// Both titlecase mapping tables have the same length. This is
// checked in the chartables script.
for (unsigned int i = 0; i < asize (title_to_lower_table); ++i)
{
if (title_to_lower_table[i][0] == ch)
return ch;
if (title_to_lower_table[i][1] == ch)
return title_to_lower_table[i][0];
if (title_to_upper_table[i][1] == ch)
return title_to_upper_table[i][0];
}
return toUpperCase (ch);
}
#ifdef COMPACT_CHARACTER
static int
table_search (const jchar table[][2], int table_len, jchar ch)
{
int low, high, i, old;
low = 0;
high = table_len;
i = high / 2;
while (true)
{
if (ch < table[i][0])
high = i;
else if (ch > table[i][1])
low = i;
else
return i;
old = i;
i = (high + low) / 2;
if (i == old)
break;
}
return -1;
}
jint
java::lang::Character::digit_value (jchar ch)
{
int index = table_search (digit_table, asize (digit_table), ch);
if (index == -1)
return -1;
jchar base = digit_table[index][0];
// Tamil doesn't have a digit `0'. So we special-case it here.
if (base == TAMIL_DIGIT_ONE)
return ch - base + 1;
return ch - base;
}
jint
java::lang::Character::getNumericValue (jchar ch)
{
jint d = digit (ch, 36);
if (d != -1)
return d;
for (unsigned int i = 0; i < asize (numeric_table); ++i)
{
if (numeric_table[i] == ch)
return numeric_value[i];
}
return -1;
}
jint
java::lang::Character::getType (jchar ch)
{
int index = table_search (all_table, asize (all_table), ch);
if (index != -1)
return category_table[index];
return UNASSIGNED;
}
// Return true when CH lies in a range of lower_case_table or is listed
// in lower_anomalous_table.  Characters in 0x2000..0x2fff are always
// reported as not lower case.
jboolean
java::lang::Character::isLowerCase (jchar ch)
{
  if (ch >= 0x2000 && ch <= 0x2fff)
    return false;

  const int range_slot
    = table_search (lower_case_table, asize (lower_case_table), ch);
  if (range_slot != -1)
    return true;

  // Bisect lower_anomalous_table, which holds single characters rather
  // than ranges.  The search stops once the probe position no longer moves.
  int lo = 0;
  int hi = asize (lower_anomalous_table);
  int probe = hi / 2;
  for (;;)
    {
      if (ch < lower_anomalous_table[probe])
        hi = probe;
      else if (ch > lower_anomalous_table[probe])
        lo = probe;
      else
        return true;

      const int next = (hi + lo) / 2;
      if (next == probe)
        return false;
      probe = next;
    }
}
// Return true when CH falls inside one of the ranges of space_table.
jboolean
java::lang::Character::isSpaceChar (jchar ch)
{
  const int slot = table_search (space_table, asize (space_table), ch);
  return slot != -1;
}
// Return true when CH falls inside one of the ranges of upper_case_table.
// Characters in 0x2000..0x2fff are always reported as not upper case.
jboolean
java::lang::Character::isUpperCase (jchar ch)
{
if (ch >= 0x2000 && ch <= 0x2fff)
return false;
return table_search (upper_case_table, asize (upper_case_table), ch) != -1;
// NOTE(review): the statement below is unreachable (it follows an
// unconditional return) and yields a jchar from a jboolean function.  It
// looks like a line from another definition interleaved by the diff
// rendering -- confirm against the real file.
return (jchar) (ch + lower[readChar(ch) >> 7]);
}
// NOTE(review): two signature lines follow (toLowerCase and toUpperCase).
// This looks like the old and new lines of a diff shown together; confirm
// which definition this body belongs to before relying on it.
jchar
java::lang::Character::toLowerCase (jchar ch)
java::lang::Character::toUpperCase(jchar ch)
{
// Find the range of upper_case_table that contains ch, if any.
int index = table_search (upper_case_table, asize (upper_case_table), ch);
// No containing range: defer to to_lower_title.
if (index == -1)
return to_lower_title (ch);
// Offset of ch from the start of its range, added to that range's entry
// in upper_case_map_table.
return (jchar) (ch - upper_case_table[index][0]
+ upper_case_map_table[index]);
// NOTE(review): unreachable; every path above has already returned.
// Probably belongs to the other signature -- confirm.
return (jchar) (ch + upper[readChar(ch) >> 7]);
}
// NOTE(review): two signature lines follow (toUpperCase and toTitleCase).
// This looks like the old and new lines of a diff shown together; confirm
// which definition this body belongs to before relying on it.
jchar
java::lang::Character::toUpperCase (jchar ch)
java::lang::Character::toTitleCase(jchar ch)
{
// Find the range of lower_case_table that contains ch, if any.
int index = table_search (lower_case_table, asize (lower_case_table), ch);
// No containing range: defer to to_upper_title.
if (index == -1)
return to_upper_title (ch);
// Offset of ch from the start of its range, added to that range's entry
// in lower_case_map_table.
return (jchar) (ch - lower_case_table[index][0]
+ lower_case_map_table[index]);
}
#else /* COMPACT_CHARACTER */
// Return attribute_table[ch] when type_table marks ch as
// DECIMAL_DIGIT_NUMBER, and -1 otherwise.
jint
java::lang::Character::digit_value (jchar ch)
{
if (type_table[ch] == DECIMAL_DIGIT_NUMBER)
return attribute_table[ch];
return -1;
// NOTE(review): everything below is unreachable, since it follows an
// unconditional return.  It reads like the body of a different function
// (a title-case lookup over the `title' array) interleaved by the diff
// rendering -- confirm against the real file.
// As title is short, it doesn't hurt to exhaustively iterate over it.
for (int i = title_length - 2; i >= 0; i -= 2)
if (title[i] == ch)
return title[i + 1];
return toUpperCase(ch);
}
// NOTE(review): two signature lines follow (getNumericValue and digit),
// and the body contains two back-to-back implementations.  This looks like
// the old and new lines of a diff shown together; confirm against the real
// file before relying on it.
jint
java::lang::Character::getNumericValue (jchar ch)
java::lang::Character::digit(jchar ch, jint radix)
{
// First implementation: use digit (ch, 36) when it yields a value.
jint d = digit (ch, 36);
if (d != -1)
return d;
// Some characters require two attributes. We special-case them here.
if (ch >= ROMAN_START && ch <= ROMAN_END)
return secondary_attribute_table[ch - ROMAN_START];
// For LETTER_NUMBER and OTHER_NUMBER characters the value is read from
// attribute_table.
if (type_table[ch] == LETTER_NUMBER || type_table[ch] == OTHER_NUMBER)
return attribute_table[ch];
return -1;
// NOTE(review): unreachable from here on as written (follows an
// unconditional return); this part uses `radix', which only the second
// signature declares.
if (radix < MIN_RADIX || radix > MAX_RADIX)
return (jint) -1;
jchar attr = readChar(ch);
// Build a one-bit mask from the low TYPE_MASK bits of attr and test it
// against the three accepted categories.
if (((1 << (attr & TYPE_MASK))
& ((1 << UPPERCASE_LETTER)
| (1 << LOWERCASE_LETTER)
| (1 << DECIMAL_DIGIT_NUMBER))))
{
// Signedness doesn't matter; 0xffff vs. -1 are both rejected.
jint digit = (jint) numValue[attr >> 7];
return (digit >= 0 && digit < radix) ? digit : (jint) -1;
}
return (jint) -1;
}
// NOTE(review): two signature lines follow (getType and getNumericValue).
// This looks like the old and new lines of a diff shown together; confirm
// which definition each return statement belongs to.
jint
java::lang::Character::getType (jchar ch)
java::lang::Character::getNumericValue(jchar ch)
{
// Direct lookup of ch in type_table.
return type_table[ch];
// NOTE(review): unreachable as written (follows an unconditional return).
// numValue is stored as an array of jshort, since 10000 is the maximum.
return (jint) numValue[readChar(ch) >> 7];
}
// NOTE(review): two return types and two signature lines follow
// (isLowerCase and getDirectionality).  This looks like the old and new
// lines of a diff shown together; confirm against the real file.
jboolean
java::lang::Character::isLowerCase (jchar ch)
jbyte
java::lang::Character::getDirectionality(jchar ch)
{
// Characters in 0x2000..0x2fff are always reported as not lower case.
if (ch >= 0x2000 && ch <= 0x2fff)
return false;
return type_table[ch] == LOWERCASE_LETTER;
// NOTE(review): unreachable as written (follows an unconditional return);
// probably the body that goes with the getDirectionality signature.
return direction[readChar(ch) >> 7];
}
// Return true when type_table classifies CH as a space, line or
// paragraph separator.
jboolean
java::lang::Character::isSpaceChar (jchar ch)
{
  if (type_table[ch] == SPACE_SEPARATOR)
    return true;
  if (type_table[ch] == LINE_SEPARATOR)
    return true;
  return type_table[ch] == PARAGRAPH_SEPARATOR;
}
// Return true when type_table classifies CH as UPPERCASE_LETTER.
// Characters in 0x2000..0x2fff are always reported as not upper case,
// without consulting the table.
jboolean
java::lang::Character::isUpperCase (jchar ch)
{
  const bool in_excluded_range = ch >= 0x2000 && ch <= 0x2fff;
  return !in_excluded_range && type_table[ch] == UPPERCASE_LETTER;
}
// Map CH to lower case: an UPPERCASE_LETTER maps to its attribute_table
// entry; any other character is handed to to_lower_title.
jchar
java::lang::Character::toLowerCase (jchar ch)
{
  if (type_table[ch] != UPPERCASE_LETTER)
    return to_lower_title (ch);
  return attribute_table[ch];
}
// Map CH to upper case: a LOWERCASE_LETTER maps to its attribute_table
// entry; any other character is handed to to_upper_title.
jchar
java::lang::Character::toUpperCase (jchar ch)
{
  if (type_table[ch] != LOWERCASE_LETTER)
    return to_upper_title (ch);
  return attribute_table[ch];
}
#endif /* COMPACT_CHARACTER */

View File

@ -1,65 +0,0 @@
#! /usr/bin/perl

# Read a Unicode block list (lines of the form `START; END; Name') and
# print Java source on standard output: one `UnicodeBlock' constant per
# block, followed by a `blocks' array listing every constant.
#
# Usage: pass the data file as the only argument.  With no argument the
# script uses `Blocks.txt' in the current directory and fetches it with
# wget if it is not there.
#
# Lines starting with `#' and empty lines are ignored.  Lines that do not
# have three `; '-separated fields are reported on standard error and
# skipped.

use strict;
use warnings;

my $file;
if (! defined $ARGV[0] || $ARGV[0] eq '')
  {
    $file = 'Blocks.txt';
    if (! -f $file)
      {
        # Too painful to figure out how to get Perl to do it.
        # The list form of system avoids going through the shell.
        system ('wget', '-o', '.wget-log',
                'http://www.unicode.org/Public/UNIDATA/Blocks.txt') == 0
          or die "couldn't fetch $file with wget (status $?)\n";
      }
  }
else
  {
    $file = $ARGV[0];
  }

open (my $input, '<', $file) or die "couldn't open $file: $!";

# Names of the generated constants, in file order.
my @symbols = ();

while (my $line = <$input>)
  {
    next if $line =~ /^#/;

    # chomp only removes a trailing newline, so a final line without one
    # keeps its last character; also drop the CR of a CRLF line ending.
    chomp $line;
    $line =~ s/\r\z//;
    next if $line =~ /^$/;

    my ($start, $to, $text) = split (/; /, $line);
    if (! defined $text)
      {
        warn "$file:$.: skipping malformed line: $line\n";
        next;
      }

    # Derive the Java identifier: upper-case the name and turn hyphens
    # and spaces into underscores.
    (my $symbol = $text) =~ tr/a-z/A-Z/;
    $symbol =~ s/[- ]/_/g;

    # Special case for one of the SPECIALS.
    next if $start eq 'FEFF';

    # Special case some areas that our heuristic mishandles.
    if ($symbol eq 'HIGH_SURROGATES')
      {
        # Emit one block covering the surrogates, ending at DFFF.
        $symbol = 'SURROGATES_AREA';
        $text = 'Surrogates Area';
        $to = 'DFFF';
      }
    elsif ($symbol =~ /SURROGATES/)
      {
        # Already covered by SURROGATES_AREA above.
        next;
      }
    elsif ($symbol eq 'PRIVATE_USE')
      {
        $symbol .= '_AREA';
        $text = 'Private Use Area';
      }

    printf " public static final UnicodeBlock %s = new UnicodeBlock (\"%s\", '\\u%s', '\\u%s');\n",
           $symbol, $text, $start, $to;

    push (@symbols, $symbol);
  }

printf " private static final UnicodeBlock[] blocks = {\n";
for my $i (0 .. $#symbols)
  {
    printf " %s", $symbols[$i];
    # Separate entries with commas; the last entry gets none, whatever
    # its name is.
    print "," unless $i == $#symbols;
    print "\n";
  }
printf " };\n";

close ($input);