147 lines
3.9 KiB
Perl
147 lines
3.9 KiB
Perl
|
#!/usr/bin/perl -w
|
||
|
# unicode-decomp.pl - script to generate database for java.text.Collator
|
||
|
# Copyright (C) 1998, 1999, 2002 Free Software Foundation, Inc.
|
||
|
#
|
||
|
# This file is part of libjava.
|
||
|
#
|
||
|
# This software is copyrighted work licensed under the terms of the
|
||
|
# Libjava License. Please consult the file "LIBJAVA_LICENSE" for
|
||
|
# details.
|
||
|
|
||
|
# Code for reading UnicodeData.txt and generating the character
# decomposition tables (include/java-chardecomp.h) used by
# java.text.Collator. For now, the relevant Unicode definition files
# are found in libjava/gnu/gcj/convert/.
|
||
|
#
|
||
|
# Usage: ./unicode-decomp.pl [-n] <UnicodeData.txt> <java-chardecomp.h>
# where <UnicodeData.txt> is obtained from www.unicode.org (named
# UnicodeData-3.0.0.txt for Unicode version 3.0.0), and
# <java-chardecomp.h> is the final location of include/java-chardecomp.h.
|
||
|
# As of JDK 1.4, use Unicode version 3.0.0 for best results.
|
||
|
#
|
||
|
# If this exits with nonzero status, then you must investigate the
|
||
|
# cause of the problem.
|
||
|
# Diagnostics and other information to stderr.
|
||
|
# With -n, the files are not created, but all processing still occurs.
|
||
|
|
||
|
# Main script: parse the UnicodeData file named on the command line and
# write the C header containing the decomposition tables.
use strict;

# These map each character (as a numeric code) to its decomposition,
# stored as a string of packed big-endian 16-bit values.  Entries whose
# decomposition field carries a <tag> go into %full_decomposition; all
# others go into %canonical_decomposition.
my %canonical_decomposition = ();
my %full_decomposition = ();

# Handle `-n' and validate the arguments.  With -n the output file
# argument is optional, because the output is sent to /dev/null anyway.
my $dry_run = 0;
if (@ARGV && $ARGV[0] eq '-n')
{
    shift @ARGV;
    $dry_run = 1;
}
# The check happens before any default is filled in, so that a bare
# `-n' with no input file is reported as a usage error.
die "Usage: $0 [-n] <UnicodeData.txt> <java-chardecomp.h>\n"
    unless @ARGV == 2 || ($dry_run && @ARGV == 1);

my ($unicode_file, $output_file) = @ARGV;
$output_file = '/dev/null' if $dry_run;

# Three-argument open, so the file name is never parsed for a mode.
open(my $unicode_fh, '<', $unicode_file)
    or die "Can't open Unicode attribute file: $!\n";

# Process the Unicode file.
$| = 1;
my $count = 0;
print STDERR "Parsing attributes file";
while (my $line = <$unicode_fh>)
{
    # Progress indicator: one dot per 1000 input lines.
    print STDERR "." unless $count++ % 1000;
    chomp $line;
    $line =~ s/\r//g;

    # Field 0 is the character code in hex; field 5 is the decomposition.
    my ($ch, undef, undef, undef, undef, $decomp) = split /;/, $line;

    # Blank or short lines have no decomposition field; skip them
    # instead of tripping over undefined values.
    next unless defined $decomp && $decomp ne '';
    $ch = hex($ch);

    my $is_full = 0;
    my @decomp = ();
    foreach my $token (split(' ', $decomp))
    {
        # A <tag> token marks the entry as belonging to the full table;
        # the tag itself is not part of the expansion.
        if ($token =~ /^\<.*\>$/)
        {
            $is_full = 1;
            next;
        }
        push(@decomp, hex($token));
    }

    # "n*" packs each value as a big-endian 16-bit quantity.
    my $s = pack "n*", @decomp;
    if ($is_full)
    {
        $full_decomposition{$ch} = $s;
    }
    else
    {
        $canonical_decomposition{$ch} = $s;
    }
}
close($unicode_fh);

# Now generate decomposition tables.  DECOMP stays a bareword handle
# because the table-writing subroutines print to it by that name.
open(DECOMP, '>', $output_file) or die "Can't open output file: $!\n";
print STDERR "\nGenerating tables\n";
print DECOMP <<EOF;
// java-chardecomp.h - Decomposition character tables -*- c++ -*-

#ifndef __JAVA_CHARDECOMP_H__
#define __JAVA_CHARDECOMP_H__


// These tables are automatically generated by the $0
// script. DO NOT EDIT the tables. Instead, fix the script
// and run it again.

// This file should only be included by natCollator.cc

struct decomp_entry
{
jchar key;
const char *value;
};

EOF

# Explicit empty argument list: a bare `&name;' call would forward the
# current @_.  The `&' form is kept because the subroutine is declared
# with a prototype further down the file.
&write_decompositions();

print DECOMP "#endif /* __JAVA_CHARDECOMP_H__ */\n";

# Buffered write errors only surface at close, so check it.
close(DECOMP) or die "Can't close output file: $!\n";
print STDERR "Done\n";
exit;
|
||
|
|
||
|
|
||
|
# Emit one C table of decomp_entry initializers to the DECOMP filehandle.
#
# Arguments:
#   $name     - prefix of the generated array, which is named
#               "${name}_decomposition".
#   $is_canon - accepted from callers but not consulted here.
#   %table    - maps a numeric character code to its expansion, stored
#               as a string of packed big-endian 16-bit values.
#
# Only keys in the range 0 .. 0xffff are written, in ascending order.
sub write_single_decomposition($$%)
{
    my ($name, $is_canon, %table) = @_;

    # Each expansion is written as a C string literal holding two
    # "\xHH" escapes (high byte, then low byte) per 16-bit value.  This
    # variable-length form is ugly but relatively space-efficient: most
    # expansions are short, but a few are very long (e.g. \uFDFA), so a
    # fixed-space representation would waste a lot of space.
    # NOTE(review): the original comment describes the data as
    # "terminated with a double nul", but no explicit terminator is
    # written here -- confirm what the consumer of the table expects.
    my @entries;
    foreach my $code (grep { defined $table{$_} } 0 .. 0xffff)
    {
        my $bytes = join '',
            map { sprintf "\\x%02x\\x%02x", $_ >> 8, $_ & 0xff }
            unpack "n*", $table{$code};
        push @entries, sprintf " { 0x%04x, \"%s\" }", $code, $bytes;
    }

    # Entries are separated by ",\n"; the last one has no trailing comma.
    print DECOMP "static const decomp_entry ${name}_decomposition[] =\n{\n",
        join(",\n", @entries), "\n};\n\n";
}
|
||
|
|
||
|
# Write both decomposition tables to the output: first the table named
# 'canonical' (built from %canonical_decomposition, flag 1), then the
# table named 'full' (built from %full_decomposition, flag 0).
sub write_decompositions()
{
    my ($canonical_flag, $full_flag) = (1, 0);

    write_single_decomposition('canonical', $canonical_flag,
                               %canonical_decomposition);
    write_single_decomposition('full', $full_flag, %full_decomposition);
}
|