gcc/libjava/scripts/unicode-decomp.pl

#!/usr/bin/perl -w
# unicode-decomp.pl - script to generate database for java.text.Collator
# Copyright (C) 1998, 1999, 2002 Free Software Foundation, Inc.
#
# This file is part of libjava.
# 
# This software is copyrighted work licensed under the terms of the
# Libjava License.  Please consult the file "LIBJAVA_LICENSE" for
# details.

# Code for reading UnicodeData.txt and generating the decomposition
# tables in include/java-chardecomp.h.  For now, the relevant Unicode
# definition files are found in libjava/gnu/gcj/convert/.
#
# Usage: ./unicode-decomp.pl [-n] <UnicodeData.txt> <decomp.h>
#   where <UnicodeData.txt> is obtained from www.unicode.org (named
#   UnicodeData-3.0.0.txt for Unicode version 3.0.0), and <decomp.h>
#   is the final location of include/java-chardecomp.h.
#   As of JDK 1.4, use Unicode version 3.0.0 for best results.
#
# If this exits with nonzero status, then you must investigate the
# cause of the problem.
# Diagnostics and other information to stderr.
# With -n, the files are not created, but all processing still occurs.

# These map characters to their decompositions.  Keys are numeric code
# points; values are the decomposition packed with pack "n*".
my %canonical_decomposition = ();
my %full_decomposition = ();

# Handle `-n' and open the input file.  The usage text ends in a newline
# so that die does not append the script's file name and line number.
my $usage = "Usage: $0 [-n] <UnicodeData.txt> <java-chardecomp.h>\n";
if (@ARGV && $ARGV[0] eq '-n')
{
    shift @ARGV;
    # A bare `-n' would otherwise leave @ARGV as (undef, '/dev/null'),
    # which passes the argument-count check below and only fails later,
    # inside open, with an uninitialized-value warning.
    die $usage unless @ARGV;
    # Dry run: whatever output name was given, write to /dev/null.
    $ARGV[1] = '/dev/null';
}
die $usage unless @ARGV == 2;

# Three-argument open keeps the file name separate from the mode, so
# unusual characters or surrounding whitespace in the name cannot be
# interpreted as part of the open mode.
open (UNICODE, '<', $ARGV[0])
    || die "Can't open Unicode attribute file: $!\n";

# Process the Unicode file.  Each line is a `;'-separated record; the
# first field is the code point in hex and the sixth is the decomposition
# mapping (an optional <tag> followed by hex code points).  A mapping
# carrying a <tag> is stored in %full_decomposition, an untagged one in
# %canonical_decomposition.

# Autoflush the currently selected output handle.
$| = 1;
my $count = 0;
print STDERR "Parsing attributes file";
while (my $line = <UNICODE>)
{
    # Progress indicator: one dot per 1000 input lines.
    print STDERR "." unless $count++ % 1000;
    chomp $line;
    $line =~ s/\r//g;
    my ($ch, undef, undef, undef, undef, $decomp) = split (/;/, $line);

    # Blank or truncated lines have no sixth field; skip them instead of
    # tripping uninitialized-value warnings under -w.  Records with an
    # empty decomposition field are skipped as before.
    next unless defined $decomp && $decomp ne '';
    $ch = hex ($ch);

    my $is_full = 0;
    my @decomp = ();
    foreach my $item (split (' ', $decomp))
    {
        # A <tag> marks the mapping as a compatibility ("full") one.
        if ($item =~ /^\<.*\>$/)
        {
            $is_full = 1;
            next;
        }
        push (@decomp, hex ($item));
    }

    # pack "n" keeps only the low 16 bits of each value.  The stored data
    # is unchanged from before, but say so on stderr when that loses
    # information for a character that the table writer will emit.
    if ($ch <= 0xffff && (grep { $_ > 0xffff } @decomp))
    {
        printf STDERR ("\nWarning: decomposition of U+%04X contains code "
                       . "points above U+FFFF; they are truncated to 16 bits\n",
                       $ch);
    }

    my $s = pack "n*", @decomp;
    if ($is_full)
    {
        $full_decomposition{$ch} = $s;
    }
    else
    {
        $canonical_decomposition{$ch} = $s;
    }
}
# The input is fully consumed; release the handle.
close (UNICODE);

# Now generate decomposition tables.  The output goes to $ARGV[1]; the
# header preamble is written first, then both tables, then the closing
# include guard.
# Three-argument open keeps the file name separate from the mode.
open (DECOMP, '>', $ARGV[1]) or die "Can't open output file: $!\n";
print STDERR "\nGenerating tables\n";
print DECOMP <<EOF;
// java-chardecomp.h - Decomposition character tables -*- c++ -*-

#ifndef __JAVA_CHARDECOMP_H__
#define __JAVA_CHARDECOMP_H__


// These tables are automatically generated by the $0
// script.  DO NOT EDIT the tables.  Instead, fix the script
// and run it again.

// This file should only be included by natCollator.cc

struct decomp_entry
{
  jchar key;
  const char *value;
};

EOF

# The sub is defined further down with a prototype.  Calling it with `&'
# bypasses prototype checking, which avoids a "called too early to check
# prototype" warning under -w.  At file level @_ is empty, so no
# arguments are passed along.
&write_decompositions;

print DECOMP "#endif /* __JAVA_CHARDECOMP_H__ */\n";

# Buffered write errors (for instance a full disk) may only surface at
# close.  Fail with nonzero status rather than leaving a truncated
# header behind silently.
close (DECOMP) or die "Can't write output file: $!\n";
print STDERR "Done\n";
exit;


# Write a single decomposition table to the DECOMP filehandle.
#
# Arguments:
#   $name     - prefix of the generated C array, which is named
#               "${name}_decomposition".
#   $is_canon - accepted for the sake of existing callers; not used.
#   %table    - map from code point to its decomposition, packed with
#               pack "n*".
#
# Only keys in the range 0 .. 0xffff with a defined value are written,
# in ascending key order, separated by ",\n".
sub write_single_decomposition($$%)
{
    my ($name, undef, %table) = @_;

    # Each expansion becomes a C string literal holding two bytes per
    # 16-bit unit, high byte first, spelled as \xHH escapes.  Nothing is
    # appended after the last unit.  Most expansions are short, but a few
    # are very long (e.g. \uFDFA), so a fixed-width representation would
    # waste a lot of space.
    my @entries = ();
    foreach my $code (grep { defined $table{$_} } 0 .. 0xffff)
    {
        my $literal = '';
        $literal .= sprintf ('\\x%02x\\x%02x', $_ >> 8, $_ & 0xff)
            foreach unpack ("n*", $table{$code});
        push (@entries, sprintf ('  { 0x%04x, "%s" }', $code, $literal));
    }

    print DECOMP "static const decomp_entry ${name}_decomposition[] =\n{\n",
                 join (",\n", @entries),
                 "\n};\n\n";
}

# Emit both decomposition tables, canonical first and then full, by
# handing each file-level hash to write_single_decomposition.  Takes no
# arguments; returns whatever the last table write returned.
sub write_decompositions()
{
    my $status;
    foreach my $spec (['canonical', 1, \%canonical_decomposition],
                      ['full', 0, \%full_decomposition])
    {
        my ($label, $canon_flag, $map) = @$spec;
        $status = write_single_decomposition ($label, $canon_flag, %$map);
    }
    return $status;
}
unicode-decomp.pl: Move from chartables.pl... 2002-03-04 Eric Blake <ebb9@email.byu.edu> * scripts/unicode-decomp.pl: Move from chartables.pl, and remove the code for generating include/java-chartables.h. * scripts/unicode-blocks.pl: Move from scripts/blocks.pl, and merge with Classpath. * scripts/unicode-muncher.pl: Copy from Classpath. * scritps/MakeCharTables.java: New file. * gnu/gcj/convert/Blocks-3.txt: New file. * gnu/gcj/convert/UnicodeData-3.0.0.txt: New file. * gnu/gcj/convert/UnicodeCharacterDatabase-3.0.0.html: New file. * gnu/java/lang/CharData.java: Copy from Classpath. * Makefile.am (ordinary_java_source_files): Add gnu/java/lang/CharData.java. * configure.in: Remove --enable-fast-character option. * java/lang/Character.java: Merge algorithms and Javadoc with Classpath. * java/lang/natCharacter.cc: Implement Unicode lookup table more efficiently. * include/java-chardecomp.h: Regenerate. * include/java-chartables.h: Regenerate. From-SVN: r50368 2002-03-06 19:54:45 +01:00			`#!/usr/bin/perl -w`
			`# unicode-decomp.pl - script to generate database for java.text.Collator`
			`# Copyright (C) 1998, 1999, 2002 Free Software Foundation, Inc.`
			`#`
			`# This file is part of libjava.`
			`#`
			`# This software is copyrighted work licensed under the terms of the`
			`# Libjava License. Please consult the file "LIBJAVA_LICENSE" for`
			`# details.`

			`# Code for reading UnicodeData.txt and generating the code for`
			`# gnu.java.lang.CharData. For now, the relevant Unicode definition files`
			`# are found in libjava/gnu/gcj/convert/.`
			`#`
			`# Usage: ./unicode-decomp.pl [-n] <UnicodeData.txt> <decomp.h>`
			`# where <UnicodeData.txt> is obtained from www.unicode.org (named`
			`# UnicodeData-3.0.0.txt for Unicode version 3.0.0), and <CharData.java>`
			`# is the final location of include/java-chardecomp.h.`
			`# As of JDK 1.4, use Unicode version 3.0.0 for best results.`
			`#`
			`# If this exits with nonzero status, then you must investigate the`
			`# cause of the problem.`
			`# Diagnostics and other information to stderr.`
			`# With -n, the files are not created, but all processing still occurs.`

			`# These maps characters to their decompositions.`
			`my %canonical_decomposition = ();`
			`my %full_decomposition = ();`

			# Handle `-n' and open output files.
			`if ($ARGV[0] && $ARGV[0] eq '-n')`
			`{`
			`shift @ARGV;`
			`$ARGV[1] = '/dev/null';`
			`}`
			`die "Usage: $0 <UnicodeData.txt> <java-chardecomp.h>" unless @ARGV == 2;`
			`open (UNICODE, "< $ARGV[0]") \|\| die "Can't open Unicode attribute file: $!\n";`

			`# Process the Unicode file.`
			`$\| = 1;`
			`my $count = 0;`
			`print STDERR "Parsing attributes file";`
			`while (<UNICODE>)`
			`{`
			`print STDERR "." unless $count++ % 1000;`
			`chomp;`
			`s/\r//g;`
			`my ($ch, undef, undef, undef, undef, $decomp) = split ';';`
			`$ch = hex($ch);`

			`if ($decomp ne '')`
			`{`
			`my $is_full = 0;`
			`my @decomp = ();`
			`foreach (split (' ', $decomp))`
			`{`
			`if (/^\<.*\>$/)`
			`{`
			`$is_full = 1;`
			`next;`
			`}`
			`push (@decomp, hex ($_));`
			`}`
			`my $s = pack "n*", @decomp;`
			`if ($is_full)`
			`{`
			`$full_decomposition{$ch} = $s;`
			`}`
			`else`
			`{`
			`$canonical_decomposition{$ch} = $s;`
			`}`
			`}`
			`}`

			`# Now generate decomposition tables.`
			`open DECOMP, "> $ARGV[1]" or die "Can't open output file: $!\n";`
			`print STDERR "\nGenerating tables\n";`
			`print DECOMP <<EOF;`
			`// java-chardecomp.h - Decomposition character tables -- c++ --`

			`#ifndef __JAVA_CHARDECOMP_H__`
			`#define __JAVA_CHARDECOMP_H__`


			`// These tables are automatically generated by the $0`
			`// script. DO NOT EDIT the tables. Instead, fix the script`
			`// and run it again.`

			`// This file should only be included by natCollator.cc`

			`struct decomp_entry`
			`{`
			`jchar key;`
			`const char *value;`
			`};`

			`EOF`

			`&write_decompositions;`

			`print DECOMP "#endif /* __JAVA_CHARDECOMP_H__ */\n";`

			`close(DECOMP);`
			`print STDERR "Done\n";`
			`exit;`


			`# Write a single decomposition table.`
			`sub write_single_decomposition($$%)`
			`{`
			`my ($name, $is_canon, %table) = @_;`
			`my $first_line = 1;`
			`print DECOMP "static const decomp_entry ${name}_decomposition[] =\n{\n";`

			`for my $key (0 .. 0xffff)`
			`{`
			`next if ! defined $table{$key};`
			`print DECOMP ",\n" unless $first_line;`
			`$first_line = 0;`

			`printf DECOMP " { 0x%04x, \"", $key;`

			`# We represent the expansion as a series of bytes, terminated`
			`# with a double nul. This is ugly, but relatively`
			`# space-efficient. Most expansions are short, but there are a`
			`# few that are very long (e.g. \uFDFA). This means that if we`
			`# chose a fixed-space representation we would waste a lot of`
			`# space.`
			`my @expansion = unpack "n*", $table{$key};`
			`foreach my $char (@expansion)`
			`{`
			`printf DECOMP "\\x%02x\\x%02x", ($char / 256), ($char % 256);`
			`}`

			`print DECOMP "\" }";`
			`}`

			`print DECOMP "\n};\n\n";`
			`}`

			`sub write_decompositions()`
			`{`
			`&write_single_decomposition ('canonical', 1, %canonical_decomposition);`
			`&write_single_decomposition ('full', 0, %full_decomposition);`
			`}`