JIS0201.h: New file, generated from Unicode table.

�
	* gnu/gcj/convert/JIS0201.h:  New file, generated from Unicode table.
	* gnu/gcj/convert/Input_JavaSrc.java: New BytesToUnicode class.
	* gnu/gcj/convert/Input_SJIS.java: New BytesToUnicode class.
	* gnu/gcj/convert/Output_EUCJIS.java:  New UnicodeToBytes class.
	* gnu/gcj/convert/Output_SJIS.java:  New UnicodeToBytes class.
	* gnu/gcj/convert/natInput_EUCJIS.cc:  New file.
	* gnu/gcj/convert/natInput_SJIS.cc:  New file.
	* gnu/gcj/convert/natOutput_EUCJIS.cc:  New file.
	* gnu/gcj/convert/natOutput_SJIS.cc:  New file.
	* gnu/gcj/convert/make-trie.c:  New file: functions to make a trie.
	* gnu/gcj/convert/gen-from-JIS.c:  Invoke make-trie for output.
	* gnu/gcj/convert/Unicode_to_JIS.cc:  New generated trie table.

From-SVN: r26501
This commit is contained in:
Per Bothner 1999-04-16 10:25:08 -07:00
parent 063ee226ed
commit 7b824de381
6 changed files with 319 additions and 11 deletions

View File

@ -8,6 +8,12 @@ details. */
package gnu.gcj.convert;
/**
* Convert ISO-Latin-1 (8859-1) text to Unicode.
* @author Per Bothner <bothner@cygnus.com>
* @date March 1999.
*/
public class Input_8859_1 extends BytesToUnicode
{
public String getName() { return "8859_1"; }

View File

@ -0,0 +1,24 @@
/* Copyright (C) 1999 Cygnus Solutions
This file is part of libgcj.
This software is copyrighted work licensed under the terms of the
Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
details. */
package gnu.gcj.convert;
/**
* Convert SJIS (Shift JIS, used on Japanese MS-Windows) to Unicode.
* The decoding itself is done by the native read method; this class
* only declares the converter name, the native entry point and one
* field of state.
* @author Per Bothner <bothner@cygnus.com>
* @date April 1999.
*/
public class Input_SJIS extends BytesToUnicode
{
/** Returns the name of this converter, "SJIS". */
public String getName() { return "SJIS"; }
/**
* Native decoder entry point.  Takes a destination char array together
* with a position and a length.
* NOTE(review): presumably returns the number of chars stored in
* outbuffer, as for other BytesToUnicode converters -- confirm against
* the native implementation.
*/
public native int read (char[] outbuffer, int outpos, int outlength);
// NOTE(review): presumably holds the lead byte of a two-byte SJIS
// character when the input ends between the two bytes -- confirm
// against the native implementation.
int first_byte;
}

View File

@ -0,0 +1,159 @@
/* This file is automatically generated from Unicode tables */
MAP(0x00, 0x20, 0x0020) /* SPACE */
MAP(0x00, 0x21, 0x0021) /* EXCLAMATION MARK */
MAP(0x00, 0x22, 0x0022) /* QUOTATION MARK */
MAP(0x00, 0x23, 0x0023) /* NUMBER SIGN */
MAP(0x00, 0x24, 0x0024) /* DOLLAR SIGN */
MAP(0x00, 0x25, 0x0025) /* PERCENT SIGN */
MAP(0x00, 0x26, 0x0026) /* AMPERSAND */
MAP(0x00, 0x27, 0x0027) /* APOSTROPHE */
MAP(0x00, 0x28, 0x0028) /* LEFT PARENTHESIS */
MAP(0x00, 0x29, 0x0029) /* RIGHT PARENTHESIS */
MAP(0x00, 0x2A, 0x002A) /* ASTERISK */
MAP(0x00, 0x2B, 0x002B) /* PLUS SIGN */
MAP(0x00, 0x2C, 0x002C) /* COMMA */
MAP(0x00, 0x2D, 0x002D) /* HYPHEN-MINUS */
MAP(0x00, 0x2E, 0x002E) /* FULL STOP */
MAP(0x00, 0x2F, 0x002F) /* SOLIDUS */
MAP(0x00, 0x30, 0x0030) /* DIGIT ZERO */
MAP(0x00, 0x31, 0x0031) /* DIGIT ONE */
MAP(0x00, 0x32, 0x0032) /* DIGIT TWO */
MAP(0x00, 0x33, 0x0033) /* DIGIT THREE */
MAP(0x00, 0x34, 0x0034) /* DIGIT FOUR */
MAP(0x00, 0x35, 0x0035) /* DIGIT FIVE */
MAP(0x00, 0x36, 0x0036) /* DIGIT SIX */
MAP(0x00, 0x37, 0x0037) /* DIGIT SEVEN */
MAP(0x00, 0x38, 0x0038) /* DIGIT EIGHT */
MAP(0x00, 0x39, 0x0039) /* DIGIT NINE */
MAP(0x00, 0x3A, 0x003A) /* COLON */
MAP(0x00, 0x3B, 0x003B) /* SEMICOLON */
MAP(0x00, 0x3C, 0x003C) /* LESS-THAN SIGN */
MAP(0x00, 0x3D, 0x003D) /* EQUALS SIGN */
MAP(0x00, 0x3E, 0x003E) /* GREATER-THAN SIGN */
MAP(0x00, 0x3F, 0x003F) /* QUESTION MARK */
MAP(0x00, 0x40, 0x0040) /* COMMERCIAL AT */
MAP(0x00, 0x41, 0x0041) /* LATIN CAPITAL LETTER A */
MAP(0x00, 0x42, 0x0042) /* LATIN CAPITAL LETTER B */
MAP(0x00, 0x43, 0x0043) /* LATIN CAPITAL LETTER C */
MAP(0x00, 0x44, 0x0044) /* LATIN CAPITAL LETTER D */
MAP(0x00, 0x45, 0x0045) /* LATIN CAPITAL LETTER E */
MAP(0x00, 0x46, 0x0046) /* LATIN CAPITAL LETTER F */
MAP(0x00, 0x47, 0x0047) /* LATIN CAPITAL LETTER G */
MAP(0x00, 0x48, 0x0048) /* LATIN CAPITAL LETTER H */
MAP(0x00, 0x49, 0x0049) /* LATIN CAPITAL LETTER I */
MAP(0x00, 0x4A, 0x004A) /* LATIN CAPITAL LETTER J */
MAP(0x00, 0x4B, 0x004B) /* LATIN CAPITAL LETTER K */
MAP(0x00, 0x4C, 0x004C) /* LATIN CAPITAL LETTER L */
MAP(0x00, 0x4D, 0x004D) /* LATIN CAPITAL LETTER M */
MAP(0x00, 0x4E, 0x004E) /* LATIN CAPITAL LETTER N */
MAP(0x00, 0x4F, 0x004F) /* LATIN CAPITAL LETTER O */
MAP(0x00, 0x50, 0x0050) /* LATIN CAPITAL LETTER P */
MAP(0x00, 0x51, 0x0051) /* LATIN CAPITAL LETTER Q */
MAP(0x00, 0x52, 0x0052) /* LATIN CAPITAL LETTER R */
MAP(0x00, 0x53, 0x0053) /* LATIN CAPITAL LETTER S */
MAP(0x00, 0x54, 0x0054) /* LATIN CAPITAL LETTER T */
MAP(0x00, 0x55, 0x0055) /* LATIN CAPITAL LETTER U */
MAP(0x00, 0x56, 0x0056) /* LATIN CAPITAL LETTER V */
MAP(0x00, 0x57, 0x0057) /* LATIN CAPITAL LETTER W */
MAP(0x00, 0x58, 0x0058) /* LATIN CAPITAL LETTER X */
MAP(0x00, 0x59, 0x0059) /* LATIN CAPITAL LETTER Y */
MAP(0x00, 0x5A, 0x005A) /* LATIN CAPITAL LETTER Z */
MAP(0x00, 0x5B, 0x005B) /* LEFT SQUARE BRACKET */
MAP(0x00, 0x5C, 0x00A5) /* YEN SIGN */
MAP(0x00, 0x5D, 0x005D) /* RIGHT SQUARE BRACKET */
MAP(0x00, 0x5E, 0x005E) /* CIRCUMFLEX ACCENT */
MAP(0x00, 0x5F, 0x005F) /* LOW LINE */
MAP(0x00, 0x60, 0x0060) /* GRAVE ACCENT */
MAP(0x00, 0x61, 0x0061) /* LATIN SMALL LETTER A */
MAP(0x00, 0x62, 0x0062) /* LATIN SMALL LETTER B */
MAP(0x00, 0x63, 0x0063) /* LATIN SMALL LETTER C */
MAP(0x00, 0x64, 0x0064) /* LATIN SMALL LETTER D */
MAP(0x00, 0x65, 0x0065) /* LATIN SMALL LETTER E */
MAP(0x00, 0x66, 0x0066) /* LATIN SMALL LETTER F */
MAP(0x00, 0x67, 0x0067) /* LATIN SMALL LETTER G */
MAP(0x00, 0x68, 0x0068) /* LATIN SMALL LETTER H */
MAP(0x00, 0x69, 0x0069) /* LATIN SMALL LETTER I */
MAP(0x00, 0x6A, 0x006A) /* LATIN SMALL LETTER J */
MAP(0x00, 0x6B, 0x006B) /* LATIN SMALL LETTER K */
MAP(0x00, 0x6C, 0x006C) /* LATIN SMALL LETTER L */
MAP(0x00, 0x6D, 0x006D) /* LATIN SMALL LETTER M */
MAP(0x00, 0x6E, 0x006E) /* LATIN SMALL LETTER N */
MAP(0x00, 0x6F, 0x006F) /* LATIN SMALL LETTER O */
MAP(0x00, 0x70, 0x0070) /* LATIN SMALL LETTER P */
MAP(0x00, 0x71, 0x0071) /* LATIN SMALL LETTER Q */
MAP(0x00, 0x72, 0x0072) /* LATIN SMALL LETTER R */
MAP(0x00, 0x73, 0x0073) /* LATIN SMALL LETTER S */
MAP(0x00, 0x74, 0x0074) /* LATIN SMALL LETTER T */
MAP(0x00, 0x75, 0x0075) /* LATIN SMALL LETTER U */
MAP(0x00, 0x76, 0x0076) /* LATIN SMALL LETTER V */
MAP(0x00, 0x77, 0x0077) /* LATIN SMALL LETTER W */
MAP(0x00, 0x78, 0x0078) /* LATIN SMALL LETTER X */
MAP(0x00, 0x79, 0x0079) /* LATIN SMALL LETTER Y */
MAP(0x00, 0x7A, 0x007A) /* LATIN SMALL LETTER Z */
MAP(0x00, 0x7B, 0x007B) /* LEFT CURLY BRACKET */
MAP(0x00, 0x7C, 0x007C) /* VERTICAL LINE */
MAP(0x00, 0x7D, 0x007D) /* RIGHT CURLY BRACKET */
MAP(0x00, 0x7E, 0x203E) /* OVERLINE */
MAP(0x00, 0xA1, 0xFF61) /* HALFWIDTH IDEOGRAPHIC FULL STOP */
MAP(0x00, 0xA2, 0xFF62) /* HALFWIDTH LEFT CORNER BRACKET */
MAP(0x00, 0xA3, 0xFF63) /* HALFWIDTH RIGHT CORNER BRACKET */
MAP(0x00, 0xA4, 0xFF64) /* HALFWIDTH IDEOGRAPHIC COMMA */
MAP(0x00, 0xA5, 0xFF65) /* HALFWIDTH KATAKANA MIDDLE DOT */
MAP(0x00, 0xA6, 0xFF66) /* HALFWIDTH KATAKANA LETTER WO */
MAP(0x00, 0xA7, 0xFF67) /* HALFWIDTH KATAKANA LETTER SMALL A */
MAP(0x00, 0xA8, 0xFF68) /* HALFWIDTH KATAKANA LETTER SMALL I */
MAP(0x00, 0xA9, 0xFF69) /* HALFWIDTH KATAKANA LETTER SMALL U */
MAP(0x00, 0xAA, 0xFF6A) /* HALFWIDTH KATAKANA LETTER SMALL E */
MAP(0x00, 0xAB, 0xFF6B) /* HALFWIDTH KATAKANA LETTER SMALL O */
MAP(0x00, 0xAC, 0xFF6C) /* HALFWIDTH KATAKANA LETTER SMALL YA */
MAP(0x00, 0xAD, 0xFF6D) /* HALFWIDTH KATAKANA LETTER SMALL YU */
MAP(0x00, 0xAE, 0xFF6E) /* HALFWIDTH KATAKANA LETTER SMALL YO */
MAP(0x00, 0xAF, 0xFF6F) /* HALFWIDTH KATAKANA LETTER SMALL TU */
MAP(0x00, 0xB0, 0xFF70) /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
MAP(0x00, 0xB1, 0xFF71) /* HALFWIDTH KATAKANA LETTER A */
MAP(0x00, 0xB2, 0xFF72) /* HALFWIDTH KATAKANA LETTER I */
MAP(0x00, 0xB3, 0xFF73) /* HALFWIDTH KATAKANA LETTER U */
MAP(0x00, 0xB4, 0xFF74) /* HALFWIDTH KATAKANA LETTER E */
MAP(0x00, 0xB5, 0xFF75) /* HALFWIDTH KATAKANA LETTER O */
MAP(0x00, 0xB6, 0xFF76) /* HALFWIDTH KATAKANA LETTER KA */
MAP(0x00, 0xB7, 0xFF77) /* HALFWIDTH KATAKANA LETTER KI */
MAP(0x00, 0xB8, 0xFF78) /* HALFWIDTH KATAKANA LETTER KU */
MAP(0x00, 0xB9, 0xFF79) /* HALFWIDTH KATAKANA LETTER KE */
MAP(0x00, 0xBA, 0xFF7A) /* HALFWIDTH KATAKANA LETTER KO */
MAP(0x00, 0xBB, 0xFF7B) /* HALFWIDTH KATAKANA LETTER SA */
MAP(0x00, 0xBC, 0xFF7C) /* HALFWIDTH KATAKANA LETTER SI */
MAP(0x00, 0xBD, 0xFF7D) /* HALFWIDTH KATAKANA LETTER SU */
MAP(0x00, 0xBE, 0xFF7E) /* HALFWIDTH KATAKANA LETTER SE */
MAP(0x00, 0xBF, 0xFF7F) /* HALFWIDTH KATAKANA LETTER SO */
MAP(0x00, 0xC0, 0xFF80) /* HALFWIDTH KATAKANA LETTER TA */
MAP(0x00, 0xC1, 0xFF81) /* HALFWIDTH KATAKANA LETTER TI */
MAP(0x00, 0xC2, 0xFF82) /* HALFWIDTH KATAKANA LETTER TU */
MAP(0x00, 0xC3, 0xFF83) /* HALFWIDTH KATAKANA LETTER TE */
MAP(0x00, 0xC4, 0xFF84) /* HALFWIDTH KATAKANA LETTER TO */
MAP(0x00, 0xC5, 0xFF85) /* HALFWIDTH KATAKANA LETTER NA */
MAP(0x00, 0xC6, 0xFF86) /* HALFWIDTH KATAKANA LETTER NI */
MAP(0x00, 0xC7, 0xFF87) /* HALFWIDTH KATAKANA LETTER NU */
MAP(0x00, 0xC8, 0xFF88) /* HALFWIDTH KATAKANA LETTER NE */
MAP(0x00, 0xC9, 0xFF89) /* HALFWIDTH KATAKANA LETTER NO */
MAP(0x00, 0xCA, 0xFF8A) /* HALFWIDTH KATAKANA LETTER HA */
MAP(0x00, 0xCB, 0xFF8B) /* HALFWIDTH KATAKANA LETTER HI */
MAP(0x00, 0xCC, 0xFF8C) /* HALFWIDTH KATAKANA LETTER HU */
MAP(0x00, 0xCD, 0xFF8D) /* HALFWIDTH KATAKANA LETTER HE */
MAP(0x00, 0xCE, 0xFF8E) /* HALFWIDTH KATAKANA LETTER HO */
MAP(0x00, 0xCF, 0xFF8F) /* HALFWIDTH KATAKANA LETTER MA */
MAP(0x00, 0xD0, 0xFF90) /* HALFWIDTH KATAKANA LETTER MI */
MAP(0x00, 0xD1, 0xFF91) /* HALFWIDTH KATAKANA LETTER MU */
MAP(0x00, 0xD2, 0xFF92) /* HALFWIDTH KATAKANA LETTER ME */
MAP(0x00, 0xD3, 0xFF93) /* HALFWIDTH KATAKANA LETTER MO */
MAP(0x00, 0xD4, 0xFF94) /* HALFWIDTH KATAKANA LETTER YA */
MAP(0x00, 0xD5, 0xFF95) /* HALFWIDTH KATAKANA LETTER YU */
MAP(0x00, 0xD6, 0xFF96) /* HALFWIDTH KATAKANA LETTER YO */
MAP(0x00, 0xD7, 0xFF97) /* HALFWIDTH KATAKANA LETTER RA */
MAP(0x00, 0xD8, 0xFF98) /* HALFWIDTH KATAKANA LETTER RI */
MAP(0x00, 0xD9, 0xFF99) /* HALFWIDTH KATAKANA LETTER RU */
MAP(0x00, 0xDA, 0xFF9A) /* HALFWIDTH KATAKANA LETTER RE */
MAP(0x00, 0xDB, 0xFF9B) /* HALFWIDTH KATAKANA LETTER RO */
MAP(0x00, 0xDC, 0xFF9C) /* HALFWIDTH KATAKANA LETTER WA */
MAP(0x00, 0xDD, 0xFF9D) /* HALFWIDTH KATAKANA LETTER N */
MAP(0x00, 0xDE, 0xFF9E) /* HALFWIDTH KATAKANA VOICED SOUND MARK */
MAP(0x00, 0xDF, 0xFF9F) /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */

View File

@ -16,6 +16,11 @@ struct chval
/* Each MAP(B1, B2, C) line in an included table header expands to one
   struct chval initializer followed by a comma. */
#define MAP(B1, B2, C) { B1, B2, C },
/* Table built from the MAP entries in JIS0201.h. */
struct chval chtab_0201[] = {
#include "JIS0201.h"
/* Terminating sentinel: loops over this table stop when b1 == 255. */
{ 255, 255, 0}
};
struct chval chtab_0208[] = {
#include "JIS0208.h"
{ 255, 255, 0}
@ -50,9 +55,9 @@ int
main(int argc, char** argv)
{
FILE *out = stdout;
unsigned min1 = 256, max1 = 0, min2 = 256, max2 = 0, count = 0;
unsigned short low1_uc = 0xFFFF, high1_uc = 0;
unsigned short low2_uc = 0xFFFF, high2_uc = 0;
int min1 = 256, max1 = 0, min2 = 256, max2 = 0, count = 0;
int low1_uc = 0xFFFF, high1_uc = 0;
int low2_uc = 0xFFFF, high2_uc = 0;
int i; int row, col;
if (strcmp (argv[1], "JIS0208") == 0)
chtab = chtab_0208;
@ -61,14 +66,26 @@ main(int argc, char** argv)
else if (strcmp (argv[1], "toJIS") == 0)
{
int i;
int count = sizeof(sorted)/sizeof(struct chval);
qsort (sorted, count, sizeof(struct chval),
compare);
for (i = 0; i < count; i++)
for (i = 0; chtab_0201[i].b1 != 255; i++)
{
fprintf (out, " 0x%04x -> 0x%02x, 0x%02x\n",
sorted[i].uc, sorted[i].b1, sorted[i].b2);
enter(chtab_0201[i].uc, chtab_0201[i].b2);
}
for (i = 0; i < 0x20; i++)
{
enter (i, i);
}
enter (127, 127);
for (i = 0; chtab_0208[i].b1 != 255; i++)
{
enter(chtab_0208[i].uc,
(chtab_0208[i].b1 << 8) | chtab_0208[i].b2);
}
for (i = 0; chtab_0212[i].b1 != 255; i++)
{
enter(chtab_0212[i].uc,
0x8000 | (chtab_0212[i].b1 << 8) | chtab_0212[i].b2);
}
print_table ("Unicode_to_JIS", stdout);
exit(0);
}
else
@ -111,7 +128,7 @@ main(int argc, char** argv)
{
if (row == chtab[i].b1 && col == chtab[i].b2)
{
unsigned uc = chtab[i].uc;
int uc = chtab[i].uc;
if (uc < 0x2000)
{
if (uc > high1_uc)

View File

@ -31,7 +31,7 @@ gnu::gcj::convert::Input_EUCJIS::read(jcharArray outbuffer, jint outpos,
{
if (b < 128)
{
#if 0
#if 1
// Technically, we should translate 0x5c to Yen symbol;
// in practice, it is not clear.
if (b == 0x5c)

View File

@ -0,0 +1,102 @@
/* Copyright (C) 1999 Cygnus Solutions
This file is part of libgcj.
This software is copyrighted work licensed under the terms of the
Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
details. */
#include <config.h>
#include <cni.h>
#include <gnu/gcj/convert/Output_EUCJIS.h>
/* A trie structure to map unicode values to JIS codes.
* code == -1: the character is undefined.
* code >= 0 && code < 128: JIS-Roman - mostly Ascii.
* code >= 128 && code < 256: Half-width Katakana.
* code >= 256 && code < 0x8000: JIS X 0208:1997.
* code >= 0x8000 && code < 0xFFFF: JIS X 0212-1990.
*/
extern unsigned short Unicode_to_JIS[];
/* Look up KEY in the 16-way trie stored in the array TRIE.
   The key is consumed four bits at a time, most significant nibble
   first.  The first three nibbles each select a slot holding the index
   of the next node; a zero slot means no entry exists and -1 is
   returned.  The final nibble selects the slot whose contents are
   returned as the result.  */
int
trie_lookup (unsigned short *trie, unsigned short key)
{
  /* Index of the current node; the root node starts at index 0.  */
  unsigned short node = 0;
  for (int shift = 12; shift > 0; shift -= 4)
    {
      node = trie[node + ((key >> shift) & 0xf)];
      if (node == 0)
        return -1;
    }
  return trie[node + (key & 0xf)];
}
/* Encode up to INLENGTH UTF-16 units starting at PTR into ENCODER's
   output buffer, advancing encoder->count for every byte stored.

   Each input character is mapped through the Unicode_to_JIS trie and
   written as:
     - one byte for codes below 0x80;
     - 0x8E followed by the code for codes 0x80..0xFF;
     - two bytes, both with the high bit set, for codes 0x100..0x7FFF;
     - 0x8F followed by two high-bit bytes for codes 0x8000 and above.
   Characters with no mapping are written as '?'.

   Only the first byte of a multi-byte sequence is stored right away;
   the rest is parked in encoder->pending1 / pending2 (-1 means empty)
   and flushed at the top of the loop, so a sequence may be completed by
   a later call if the buffer fills up in between.

   Returns the number of input characters consumed.  */
static jint
convert_TO_EUCJIS (gnu::gcj::convert::Output_EUCJIS *encoder,
		   jchar *ptr, jint inlength)
{
  int orig_inlength = inlength;
  jint outbuf_length = encoder->buf->length;
  for (;;)
    {
      /* No room left in the output buffer.  */
      if (encoder->count >= outbuf_length)
	break;
      /* Flush bytes left over from the previous character first.  */
      if (encoder->pending1 >= 0)
	{
	  elements(encoder->buf)[encoder->count++] = encoder->pending1;
	  encoder->pending1 = encoder->pending2;
	  encoder->pending2 = -1;
	  continue;
	}
      if (inlength == 0)
	break;
      jchar ch = *ptr++;
      inlength--;
      /* Keep the lookup result as an int.  Storing it in an unsigned
	 short turned the -1 "undefined" result into 0xFFFF, which then
	 fell through to the three-byte branch and produced the bytes
	 0x8F 0xFF 0xFF instead of '?'.  */
      int val = trie_lookup(Unicode_to_JIS, ch);
      /* NOTE(review): 0xFFFF is also treated as undefined, as the
	 original check intended; whether the generated table ever
	 stores it in a leaf slot should be confirmed against
	 make-trie.c.  */
      if (val < 0 || val == 0xffff)
	{
	  val = '?';
	}
      else if (val < 0x80)
	{
	  /* Single byte, stored unchanged below.  */
	}
      else if (val <= 0xFF)
	{
	  encoder->pending1 = val;
	  encoder->pending2 = -1;
	  val = 0x8e;
	}
      else if (val < 0x8000)
	{
	  val |= 0x8080;
	  encoder->pending1 = val & 0xff;
	  val = val >> 8;
	  encoder->pending2 = -1;
	}
      else
	{
	  val |= 0x8080;
	  encoder->pending1 = val >> 8;
	  encoder->pending2 = val & 0xff;
	  val = 0x8f;
	}
      elements(encoder->buf)[encoder->count++] = val;
    }
  return orig_inlength - inlength;
}
/* Encode INLENGTH chars of INBUFFER, starting at index INPOS, by
   handing them to convert_TO_EUCJIS; returns that function's result.  */
jint
gnu::gcj::convert::Output_EUCJIS::write (jcharArray inbuffer,
					 jint inpos, jint inlength)
{
  jchar *chars = elements(inbuffer) + inpos;
  return convert_TO_EUCJIS(this, chars, inlength);
}
/* Encode INLENGTH chars of STR, starting at index INPOS, by handing
   them to convert_TO_EUCJIS; returns that function's result.  The
   trailing jcharArray argument is unused.  */
jint
gnu::gcj::convert::Output_EUCJIS::write (jstring str, jint inpos,
					 jint inlength, jcharArray)
{
  jchar *chars = _Jv_GetStringChars(str) + inpos;
  return convert_TO_EUCJIS(this, chars, inlength);
}