Make-lang.in (JAVA_LEX_C): Added chartables.h.

* Make-lang.in (JAVA_LEX_C): Added chartables.h. * lex.c (java_ignorable_control_p): Removed. (java_letter_or_digit_p): Removed. (java_start_char_p): New function. (java_read_char): Return `int', not `unicode_t'. Changed callers. (java_read_unicode): Likewise. (java_read_unicode_collapsing_terminators): Likewise. (java_get_unicode): Likewise. (java_new_lexer): Initialize hit_eof. (java_parse_end_comment): Take `int' argument. (java_parse_doc_section): Likewise. (java_parse_escape_sequence): Don't allow backslash-newline. Return `int'. * lex.h (JAVA_DIGIT_P): Removed. (_JAVA_LETTER_OR_DIGIT_P): Removed. (_JAVA_IDENTIFIER_IGNORABLE): Removed. (JAVA_START_CHAR_P): Renamed from JAVA_ID_CHAR_P. (JAVA_PART_CHAR_P): New macro. (UEOF): Now -1. (JAVA_CHAR_ERROR): Now -2. (java_lexer): New field `hit_eof'. * chartables.h: New file. * gen-table.pl: New file. From-SVN: r38237
2000-12-13 22:47:13 +00:00 · 2000-12-13 22:47:13 +00:00 · 3f27e3f86a
parent 568aac9cf7
commit 3f27e3f86a
6 changed files with 3616 additions and 364 deletions
--- a/gcc/java/ChangeLog
+++ b/gcc/java/ChangeLog
@ -1,3 +1,30 @@
+2000-11-07  Tom Tromey  <tromey@cygnus.com>
+
+	* Make-lang.in (JAVA_LEX_C): Added chartables.h.
+	* lex.c (java_ignorable_control_p): Removed.
+	(java_letter_or_digit_p): Removed.
+	(java_start_char_p): New function.
+	(java_read_char): Return `int', not `unicode_t'.  Changed
+	callers.
+	(java_read_unicode): Likewise.
+	(java_read_unicode_collapsing_terminators): Likewise.
+	(java_get_unicode): Likewise.
+	(java_new_lexer): Initialize hit_eof.
+	(java_parse_end_comment): Take `int' argument.
+	(java_parse_doc_section): Likewise.
+	(java_parse_escape_sequence): Don't allow backslash-newline.
+	Return `int'.
+	* lex.h (JAVA_DIGIT_P): Removed.
+	(_JAVA_LETTER_OR_DIGIT_P): Removed.
+	(_JAVA_IDENTIFIER_IGNORABLE): Removed.
+	(JAVA_START_CHAR_P): Renamed from JAVA_ID_CHAR_P.
+	(JAVA_PART_CHAR_P): New macro.
+	(UEOF): Now -1.
+	(JAVA_CHAR_ERROR): Now -2.
+	(java_lexer): New field `hit_eof'.
+	* chartables.h: New file.
+	* gen-table.pl: New file.
+
 2000-11-20  Tom Tromey  <tromey@cygnus.com>
            Alexandre Petit-Bianco  <apbianco@cygnus.com>

--- a/gcc/java/Make-lang.in
+++ b/gcc/java/Make-lang.in
@ -214,7 +214,7 @@ java.stage4: stage4-start
 #
 # .o:.h dependencies.
 JAVA_TREE_H = $(TREE_H) java/java-tree.h java/java-tree.def
-JAVA_LEX_C = java/lex.c java/keyword.h
+JAVA_LEX_C = java/lex.c java/keyword.h java/chartables.h

 java/parse.o: java/parse.c java/jcf-reader.c $(CONFIG_H) system.h \
  function.h $(JAVA_TREE_H) $(JAVA_LEX_C) java/parse.h java/lex.h $(GGC_H)
--- a/gcc/java/chartables.h
+++ b/gcc/java/chartables.h
--- a/gcc/java/gen-table.pl
+++ b/gcc/java/gen-table.pl
@ -0,0 +1,256 @@
+#! /usr/bin/perl
+
+#    Copyright (C) 2000 Free Software Foundation
+
+#    This program is free software; you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation; either version 2, or (at your option)
+#    any later version.
+
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+
+#    You should have received a copy of the GNU General Public License
+#    along with this program; if not, write to the Free Software
+#    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+#    02111-1307, USA.
+
+# gen-table.pl - Generate tables for gcj from Unicode data.
+# Usage: perl gen-table.pl DATA-FILE
+
+# Indices of the semicolon-separated fields of one line of the Unicode
+# data table, in file order.  This script itself reads only $CODE and
+# $CATEGORY; the remaining names document the layout.
+($CODE, $NAME, $CATEGORY, $COMBINING_CLASSES, $BIDI_CATEGORY,
+ $DECOMPOSITION, $DECIMAL_VALUE, $DIGIT_VALUE, $NUMERIC_VALUE,
+ $MIRRORED, $OLD_NAME, $COMMENT, $UPPER, $LOWER, $TITLE) = (0 .. 14);
+
+# Code points at which a special-cased gap in the Unicode data table
+# starts, each mapped to a descriptive label for the range.  The main
+# loop only tests whether a key is present; the labels are informational.
+%gaps = (
+    0x4e00 => "CJK",
+    0xac00 => "Hangul",
+    0xd800 => "Unassigned High Surrogate",
+    0xdb80 => "Private Use High Surrogate",
+    0xdc00 => "Low Surrogate",
+    0xe000 => "Private Use",
+);
+
+# Control characters that are also considered whitespace: U+0009
+# through U+000D and U+001C through U+001F.  The list is somewhat odd;
+# it is taken from the JCL definition of
+# Character.isIdentifierIgnorable.  process_one treats every other
+# `Cc' character as a valid identifier part.
+%whitespace_controls =
+    map { ($_, 1) } (0x0009 .. 0x000d, 0x001c .. 0x001f);
+
+# Main program: read the Unicode data table named on the command line,
+# classify every code point from 0x0000 to 0xFFFF (filling the gaps
+# between listed entries), then write the tables.  A missing argument
+# or an unreadable file is reported on STDERR and exits with status 1.
+if (@ARGV < 1)
+{
+    print STDERR "Usage: perl gen-table.pl DATA-FILE\n";
+    exit 1;
+}
+
+# Three-argument open: the file name is never parsed for a mode.
+if (! open (INPUT, '<', $ARGV[0]))
+{
+    print STDERR "gen-table.pl: cannot open $ARGV[0]: $!\n";
+    exit 1;
+}
+
+# Field values used for code points missing from the data table.  Only
+# the category (`Cn') matters to process_one.
+@unassigned = ('', '', 'Cn', '0', '', '', '', '', '', '', '',
+	       '', '', '', '');
+
+$last_code = -1;
+while (<INPUT>)
+{
+    # chomp, unlike chop, leaves a final line that lacks a newline
+    # intact.
+    chomp;
+    @fields = split (';', $_, 30);
+    if ($#fields != 14)
+    {
+	print STDERR "Entry for $fields[$CODE] has wrong number of fields\n";
+    }
+
+    $code = hex ($fields[$CODE]);
+
+    # The generated type_table has exactly 256 pages of 256 entries, so
+    # stop at the first code point that does not fit; the loop after
+    # this one pads the table up to 0xFFFF.
+    last if $code > 0xffff;
+
+    if ($code > $last_code + 1)
+    {
+	# Found a gap.
+	# NOTE(review): %gaps is described as holding the *start* of
+	# each special-cased range, but this lookup is keyed by the
+	# code point that ends the gap (the entry just read) -- confirm
+	# that this matches the layout of the data file.
+	if (defined $gaps{$code})
+	{
+	    # Fill the gap with the last character read.
+	    @gfields = @fields;
+	}
+	else
+	{
+	    # The gap represents undefined characters.
+	    @gfields = @unassigned;
+	}
+	for (++$last_code; $last_code < $code; ++$last_code)
+	{
+	    # Store into the array element (not an unrelated hash) so
+	    # each synthesized entry carries its own code.
+	    $gfields[$CODE] = sprintf ("%04x", $last_code);
+	    &process_one ($last_code, @gfields);
+	}
+    }
+    &process_one ($code, @fields);
+    $last_code = $code;
+}
+
+close (INPUT);
+
+# Everything after the last entry read is unassigned.
+@gfields = @unassigned;
+for (++$last_code; $last_code < 0x10000; ++$last_code)
+{
+    $gfields[$CODE] = sprintf ("%04x", $last_code);
+    &process_one ($last_code, @gfields);
+}
+--$last_code;			# Want last to be 0xFFFF.
+
+&print_tables ($last_code);
+
+exit 0;
+
+# Classify a single character and record the result in @map.
+#   $code   - the numeric code point.
+#   @fields - the character's fields from the data table; only the
+#             general category, $fields[$CATEGORY], is examined.
+# $map[$code] is set to the C initializer text for the character:
+# `0' when no flag applies, otherwise the applicable flags joined by
+# ` | ' and wrapped in parentheses.
+sub process_one
+{
+    my ($code, @fields) = @_;
+    my $type = $fields[$CATEGORY];
+
+    # A valid identifier start is a letter, connecting punctuation or
+    # a currency symbol.
+    my $is_start = ($type =~ /L./
+		    || $type eq 'Pc'
+		    || $type eq 'Sc');
+
+    # A valid identifier member is anything that may start one, plus
+    # the categories and ranges below.
+    my $is_part = ($is_start
+		   || $type =~ /N[dl]/	# Number: decimal or letter
+		   || $type =~ /M[nc]/	# Mark: non-spacing or combining
+		   || ($type eq 'Cc'	# Certain controls
+		       && ! defined $whitespace_controls{$code})
+		   || ($code >= 0x200c	# Join controls
+		       && $code <= 0x200f)
+		   || ($code >= 0x202a	# Bidi controls -- note that
+					# there is a typo in the JCL
+					# where these are concerned.
+		       && $code <= 0x202e)
+		   || ($code >= 0x206a	# Format controls
+		       && $code <= 0x206f)
+		   || $code == 0xfeff);	# ZWNBSP
+
+    # LETTER_PART is listed first so the text reads
+    # `LETTER_PART | LETTER_START' when both apply.
+    my @flags;
+    push (@flags, 'LETTER_PART') if $is_part;
+    push (@flags, 'LETTER_START') if $is_start;
+
+    $map[$code] = @flags ? '(' . join (' | ', @flags) . ')' : '0';
+}
+
+# Write chartables.h (in the current directory) from the contents of
+# @map, then report the estimated table size on standard output.
+#   $last - the highest code point to emit; rows are generated for
+#           every multiple of 256 up to and including it.
+# A failure to create or finish writing the file is reported on STDERR
+# and exits with status 1.
+sub print_tables
+{
+    my ($last) = @_;
+
+    # `local' rather than `my': print_row adds to this counter.
+    local ($bytes_out) = 0;
+
+    if (! open (OUT, '>', 'chartables.h'))
+    {
+	print STDERR "gen-table.pl: cannot create chartables.h: $!\n";
+	exit 1;
+    }
+
+    print OUT "/* This file is automatically generated.  DO NOT EDIT!\n";
+    print OUT "   Instead, edit gen-table.pl and re-run.  */\n\n";
+
+    print OUT "#ifndef CHARTABLES_H\n";
+    print OUT "#define CHARTABLES_H\n\n";
+
+    print OUT "#define LETTER_START 1\n";
+    print OUT "#define LETTER_PART  2\n\n";
+
+    # First pass: emit the per-page arrays and remember, for each page,
+    # the expression that goes into the top-level table.
+    my @row;
+    my $count;
+    for ($count = 0; $count <= $last; $count += 256)
+    {
+	$row[$count / 256] = &print_row ($count, '(char *) ', 'char', 1,
+					 'page');
+    }
+
+    # Second pass: emit the top-level table itself.
+    print OUT "static char *type_table[256] = {\n";
+    for ($count = 0; $count <= $last; $count += 256)
+    {
+	print OUT ",\n" if $count > 0;
+	print OUT "  ", $row[$count / 256];
+	# Each top-level entry is counted as a 4-byte pointer.
+	$bytes_out += 4;
+    }
+    print OUT "\n};\n\n";
+
+    print OUT "#endif /* CHARTABLES_H */\n";
+
+    # Buffered write errors only surface when the file is closed.
+    if (! close (OUT))
+    {
+	print STDERR "gen-table.pl: error writing chartables.h: $!\n";
+	exit 1;
+    }
+
+    printf "Generated %d bytes\n", $bytes_out;
+}
+
+# Emit one "row" (256 consecutive entries of @map) of a two-level
+# table.
+#   $start   - index in @map of the row's first entry.
+#   $def_pfx - text put in front of the shared value when every entry
+#              in the row is the same.
+#   $typname - C element type of the generated array.
+#   $typsize - size of one element, used only for the byte count.
+#   $name    - prefix of the generated array's name.
+# Returns the text to store in the top-level table: $def_pfx followed
+# by the shared value for a uniform row (nothing is written), or the
+# name of the array that was written to OUT.
+sub print_row
+{
+    my ($start, $def_pfx, $typname, $typsize, $name) = @_;
+
+    my @values = @map[$start .. $start + 255];
+    my $page = $start / 256;
+
+    # A row whose entries are all alike needs no array of its own.
+    unless (grep { $_ ne $values[0] } @values)
+    {
+	return $def_pfx . $values[0];
+    }
+
+    printf OUT "static %s %s%d[256] = {\n  ", $typname, $name, $page;
+
+    # $column tracks the width of the current output line so that it
+    # can be wrapped before it would pass column 78.
+    my $column = 2;
+    for my $idx (0 .. $#values)
+    {
+	print OUT ", " if $idx > 0;
+
+	my $text = $values[$idx];
+	my $width = length ($text) + 2;
+	if ($column + $width > 78)
+	{
+	    print OUT "\n  ";
+	    $column = 2;
+	}
+	print OUT $text;
+	$column += $width;
+    }
+    print OUT "\n};\n\n";
+
+    $bytes_out += 256 * $typsize;
+
+    return sprintf "%s%d", $name, $page;
+}
--- a/gcc/java/lex.c
+++ b/gcc/java/lex.c
@ -36,6 +36,7 @@ The Free Software Foundation is independent of Sun Microsystems, Inc.  */

 #include "keyword.h"
 #include "flags.h"
+#include "chartables.h"

 /* Function declaration  */
 static char *java_sprint_unicode PARAMS ((struct java_line *, int));
@ -46,17 +47,17 @@ static int java_is_eol PARAMS ((FILE *, int));
 static tree build_wfl_node PARAMS ((tree));
 #endif
 static void java_store_unicode PARAMS ((struct java_line *, unicode_t, int));
-static unicode_t java_parse_escape_sequence PARAMS ((void));
-static int java_letter_or_digit_p PARAMS ((unicode_t));
-static int java_ignorable_control_p PARAMS ((unicode_t));
-static int java_parse_doc_section PARAMS ((unicode_t));
-static void java_parse_end_comment PARAMS ((unicode_t));
-static unicode_t java_get_unicode PARAMS ((void));
-static unicode_t java_read_unicode PARAMS ((java_lexer *, int *));
-static unicode_t java_read_unicode_collapsing_terminators
-    PARAMS ((java_lexer *, int *));
+static int java_parse_escape_sequence PARAMS ((void));
+static int java_start_char_p PARAMS ((unicode_t));
+static int java_part_char_p PARAMS ((unicode_t));
+static int java_parse_doc_section PARAMS ((int));
+static void java_parse_end_comment PARAMS ((int));
+static int java_get_unicode PARAMS ((void));
+static int java_read_unicode PARAMS ((java_lexer *, int *));
+static int java_read_unicode_collapsing_terminators PARAMS ((java_lexer *,
+							     int *));
 static void java_store_unicode PARAMS ((struct java_line *, unicode_t, int));
-static unicode_t java_read_char PARAMS ((java_lexer *));
+static int java_read_char PARAMS ((java_lexer *));
 static void java_allocate_new_line PARAMS ((void));
 static void java_unget_unicode PARAMS ((void));
 static unicode_t java_sneak_unicode PARAMS ((void));
@ -217,6 +218,7 @@ java_new_lexer (finput, encoding)
  lex->finput = finput;
  lex->bs_count = 0;
  lex->unget_value = 0;
+  lex->hit_eof = 0;

 #ifdef HAVE_ICONV
  lex->handle = iconv_open ("UCS-2", encoding);
@ -298,7 +300,7 @@ java_destroy_lexer (lex)
  free (lex);
 }

-static unicode_t
+static int
 java_read_char (lex)
     java_lexer *lex;
 {
@ -496,12 +498,12 @@ java_store_unicode (l, c, unicode_escape_p)
  l->unicode_escape_p [l->size++] = unicode_escape_p;
 }

-static unicode_t
+static int
 java_read_unicode (lex, unicode_escape_p)
     java_lexer *lex;
     int *unicode_escape_p;
 {
-  unicode_t c;
+  int c;

  c = java_read_char (lex);
  *unicode_escape_p = 0;
@ -549,12 +551,12 @@ java_read_unicode (lex, unicode_escape_p)
  return (unicode_t) '\\';
 }

-static unicode_t
+static int
 java_read_unicode_collapsing_terminators (lex, unicode_escape_p)
     java_lexer *lex;
     int *unicode_escape_p;
 {
-  unicode_t c = java_read_unicode (lex, unicode_escape_p);
+  int c = java_read_unicode (lex, unicode_escape_p);

  if (c == '\r')
    {
@ -571,13 +573,18 @@ java_read_unicode_collapsing_terminators (lex, unicode_escape_p)
  return c;
 }

-static unicode_t
+static int
 java_get_unicode ()
 {
  /* It's time to read a line when... */
  if (!ctxp->c_line || ctxp->c_line->current == ctxp->c_line->size)
    {
-      unicode_t c;
+      int c;
+      int found_chars = 0;
+
+      if (ctxp->lexer->hit_eof)
+	return UEOF;
+
      java_allocate_new_line ();
      if (ctxp->c_line->line[0] != '\n')
 	{
@ -586,15 +593,24 @@ java_get_unicode ()
 	      int unicode_escape_p;
 	      c = java_read_unicode_collapsing_terminators (ctxp->lexer,
 							    &unicode_escape_p);
-	      java_store_unicode (ctxp->c_line, c, unicode_escape_p);
-	      if (ctxp->c_line->white_space_only 
-		  && !JAVA_WHITE_SPACE_P (c)
-		  && c != '\n'
-		  && c != UEOF)
-		ctxp->c_line->white_space_only = 0;
+	      if (c != UEOF)
+		{
+		  found_chars = 1;
+		  java_store_unicode (ctxp->c_line, c, unicode_escape_p);
+		  if (ctxp->c_line->white_space_only 
+		      && !JAVA_WHITE_SPACE_P (c)
+		      && c != '\n')
+		    ctxp->c_line->white_space_only = 0;
+		}
 	      if ((c == '\n') || (c == UEOF))
 		break;
 	    }
+
+	  if (c == UEOF && ! found_chars)
+	    {
+	      ctxp->lexer->hit_eof = 1;
+	      return UEOF;
+	    }
 	}
    }
  ctxp->c_line->char_col += JAVA_COLUMN_DELTA (0);
@ -606,9 +622,8 @@ java_get_unicode ()
 * C is the first character following the '/' and '*'. */
 static void
 java_parse_end_comment (c)
-     unicode_t c;
+     int c;
 {
-
  for ( ;; c = java_get_unicode ())
    {
      switch (c)
@ -637,7 +652,7 @@ java_parse_end_comment (c)

 static int
 java_parse_doc_section (c)
-     unicode_t c;
+     int c;
 {
  int valid_tag = 0, seen_star = 0;

@ -655,10 +670,10 @@ java_parse_doc_section (c)
 	}
      c = java_get_unicode();
    }
-  
+
  if (c == UEOF)
    java_lex_error ("Comment not terminated at end of input", 0);
-  
+
  if (seen_star && (c == '/'))
    return 1;			/* Goto step1 in caller */

@ -673,7 +688,7 @@ java_parse_doc_section (c)
 	  c = java_get_unicode ();
 	  tag [tag_index++] = c;
 	}
-      
+
      if (c == UEOF)
 	java_lex_error ("Comment not terminated at end of input", 0);
      tag [tag_index] = '\0';
@ -685,28 +700,51 @@ java_parse_doc_section (c)
  return 0;
 }

-/* This function to be used only by JAVA_ID_CHAR_P (), otherwise it
-   will return a wrong result.  */
+/* Return true if C is a valid start character for a Java identifier.
+   This is only called if C >= 128 -- smaller values are handled
+   inline.  However, this function handles all values anyway.  */
 static int
-java_letter_or_digit_p (c)
+java_start_char_p (c)
     unicode_t c;
 {
-  return _JAVA_LETTER_OR_DIGIT_P (c);
+  unsigned int hi = c / 256;
+  char *page = type_table[hi];
+  unsigned long val = (unsigned long) page;
+  int flags;
+
+  if ((val & ~ (LETTER_PART | LETTER_START)) != 0)
+    flags = page[c & 255];
+  else
+    flags = val;
+
+  return flags & LETTER_START;
 }

-/* This function to be used only by JAVA_ID_CHAR_P ().  */
+/* Return true if C is a valid part character for a Java identifier.
+   This is only called if C >= 128 -- smaller values are handled
+   inline.  However, this function handles all values anyway.  */
 static int
-java_ignorable_control_p (c)
+java_part_char_p (c)
     unicode_t c;
 {
-  return _JAVA_IDENTIFIER_IGNORABLE (c);
+  unsigned int hi = c / 256;
+  char *page = type_table[hi];
+  unsigned long val = (unsigned long) page;
+  int flags;
+
+  if ((val & ~ (LETTER_PART | LETTER_START)) != 0)
+    flags = page[c & 255];
+  else
+    flags = val;
+
+  return flags & LETTER_PART;
 }

-static unicode_t
+static int
 java_parse_escape_sequence ()
 {
  unicode_t char_lit;
-  unicode_t c;
+  int c;

  switch (c = java_get_unicode ())
    {
@ -754,8 +792,6 @@ java_parse_escape_sequence ()

 	return char_lit;
      }
-    case '\n':
-      return '\n';		/* ULT, caught latter as a specific error */
    default:
      java_lex_error ("Invalid character in escape sequence", 0);
      return JAVA_CHAR_ERROR;
@ -840,7 +876,8 @@ java_lex (java_lval)
 #endif
     YYSTYPE *java_lval;
 {
-  unicode_t c, first_unicode;
+  int c;
+  unicode_t first_unicode;
  int ascii_index, all_ascii;
  char *string;

@ -863,7 +900,7 @@ java_lex (java_lval)
      if ((c = java_get_unicode ()) == UEOF)
 	return 0;		/* Ok here */
      else
-	java_unget_unicode ();	/* Caught latter at the end the function */
+	java_unget_unicode ();	/* Caught later, at the end of the function */
    }
  /* Handle EOF here */
  if (c == UEOF)	/* Should probably do something here... */
@ -1189,7 +1226,7 @@ java_lex (java_lval)
  /* Character literals */
  if (c == '\'')
    {
-      unicode_t char_lit;
+      int char_lit;
      if ((c = java_get_unicode ()) == '\\')
 	char_lit = java_parse_escape_sequence ();
      else
@ -1206,7 +1243,7 @@ java_lex (java_lval)
      if (c != '\'')
 	java_lex_error ("Syntax error in character literal", 0);

-      if (c == JAVA_CHAR_ERROR)
+      if (char_lit == JAVA_CHAR_ERROR)
        char_lit = 0;		/* We silently convert it to zero */

      JAVA_LEX_CHAR_LIT (char_lit);
@ -1225,7 +1262,11 @@ java_lex (java_lval)
 	{
 	  if (c == '\\')
 	    c = java_parse_escape_sequence ();
-	  no_error &= (c != JAVA_CHAR_ERROR ? 1 : 0);
+	  if (c == JAVA_CHAR_ERROR)
+	    {
+	      no_error = 0;
+	      c = 0;		/* We silently convert it to zero.  */
+	    }
 	  java_unicode_2_utf8 (c);
 	}
      if (c == '\n' || c == UEOF) /* ULT */
@ -1469,7 +1510,7 @@ java_lex (java_lval)
  
  /* Keyword, boolean literal or null literal */
  for (first_unicode = c, all_ascii = 1, ascii_index = 0; 
-       JAVA_ID_CHAR_P (c); c = java_get_unicode ())
+       JAVA_PART_CHAR_P (c); c = java_get_unicode ())
    {
      java_unicode_2_utf8 (c);
      if (all_ascii && c >= 128)
@ -1554,8 +1595,8 @@ java_lex (java_lval)
 	}
    }
  
-  /* We may have and ID here */
-  if (JAVA_ID_CHAR_P(first_unicode) && !JAVA_DIGIT_P (first_unicode))
+  /* We may have an ID here */
+  if (JAVA_START_CHAR_P (first_unicode))
    {
      JAVA_LEX_ID (string);
      java_lval->node = BUILD_ID_WFL (GET_IDENTIFIER (string));
--- a/gcc/java/lex.h
+++ b/gcc/java/lex.h
@ -78,7 +78,7 @@ typedef unsigned short unicode_t;
 /* Line information containers  */
 struct java_line {
  unicode_t *line;		/* The line's unicode */
-  char      *unicode_escape_p;	/* The maching char was a unicode escape */
+  char      *unicode_escape_p;	/* The matching char was a unicode escape */
  unicode_t ahead[1];		/* Character ahead */
  char unicode_escape_ahead_p;	/* Character ahead is a unicode escape */
  int max;			/* buffer's max size */
@ -114,6 +114,9 @@ typedef struct java_lexer
  /* If nonzero, a value that was pushed back.  */
  unicode_t unget_value;

+  /* If nonzero, we've hit EOF.  Used only by java_get_unicode().  */
+  int hit_eof : 1;
+
 #ifdef HAVE_ICONV
  /* Nonzero if we've read any bytes.  We only recognize the
     byte-order-marker (BOM) as the first word.  */
@ -141,7 +144,7 @@ typedef struct java_lexer

  /* This is a buffer of characters already converted by iconv.  We
     use `char' here because we're assuming that iconv() converts to
-     big-endian UCS-2, and then we convert it ourselves.  */
+     UCS-2, and then we convert it ourselves.  */
  unsigned char out_buffer[1024];

  /* Index of first valid output character.  -1 if no valid
@ -251,325 +254,38 @@ extern void set_float_handler PARAMS ((jmp_buf));
 /* Macros to decode character ranges */
 #define RANGE(c, l, h)           (((c) >= l && (c) <= h))
 #define JAVA_WHITE_SPACE_P(c) (c == ' ' || c == '\t' || c == '\f')
-#define JAVA_ID_CHAR_P(c)     ((c < 128 && (RANGE (c, 'A', 'Z') ||	\
-					    RANGE (c, 'a', 'z') ||	\
-					    RANGE (c, '0', '9') ||	\
-					    c == '_'         ||		\
-					    c == '$'))       ||		\
-			       java_ignorable_control_p (c)  ||		\
-			       (c > 127 && java_letter_or_digit_p (c)))
-#define JAVA_ASCII_DIGIT(c)    RANGE(c,'0', '9')
-#define JAVA_ASCII_OCTDIGIT(c) RANGE(c,'0', '7')
-#define JAVA_ASCII_HEXDIGIT(c) (RANGE(c,'0', '9') || 	\
-				RANGE(c,'a', 'f') ||	\
-				RANGE(c,'A', 'F'))
-#define JAVA_ASCII_FPCHAR(c)   (RANGE(c,'d', 'f') || RANGE(c,'D', 'F') || \
+#define JAVA_START_CHAR_P(c) ((c < 128					      \
+			       && (RANGE (c, 'A', 'Z')			      \
+				   || RANGE (c, 'a', 'z')		      \
+				   || c == '_'				      \
+				   || c == '$'))			      \
+                              || (c >= 128 && java_start_char_p (c)))
+#define JAVA_PART_CHAR_P(c) ((c < 128					      \
+			       && (RANGE (c, 'A', 'Z')			      \
+				   || RANGE (c, 'a', 'z')		      \
+				   || RANGE (c, '0', '9')		      \
+				   || c == '_'				      \
+				   || c == '$'				      \
+				   || c == 0x0000			      \
+				   || RANGE (c, 0x01, 0x08)		      \
+				   || RANGE (c, 0x0e, 0x1b)		      \
+				   || c == 0x7f))			      \
+                              || (c >= 128 && java_part_char_p (c)))
+#define JAVA_ASCII_DIGIT(c)    RANGE (c, '0', '9')
+#define JAVA_ASCII_OCTDIGIT(c) RANGE (c, '0', '7')
+#define JAVA_ASCII_HEXDIGIT(c) (RANGE (c, '0', '9') || 	\
+				RANGE (c, 'a', 'f') ||	\
+				RANGE (c, 'A', 'F'))
+#define JAVA_ASCII_FPCHAR(c)   (RANGE (c, 'd', 'f') || RANGE (c, 'D', 'F') || \
 				c == '.' || JAVA_ASCII_DIGIT (c))
 #define JAVA_FP_SUFFIX(c)      (c == 'D' || c == 'd' || c == 'f' || c == 'F')
 #define JAVA_FP_EXP(c)         (c == 'E' || c == 'F')
 #define JAVA_FP_PM(c)          (c == '-' || c == '+')
-#define JAVA_ASCII_LETTER(c)   (RANGE(c,'a', 'z') || RANGE(c,'A', 'Z'))
-#define JAVA_DIGIT_P(c)							      \
-   (RANGE (c, 0x030, 0x039) || /* ISO-Latin-1 (and ASCII) digits ('0'-'9') */ \
-    RANGE (c, 0x660, 0x669) || /* Arabic-Indic digits */		      \
-    RANGE (c, 0x6F0, 0x6F9) || /* Eastern Arabic-Indic digits */	      \
-    RANGE (c, 0x966, 0x96F) || /* Devanagari digits */			      \
-    RANGE (c, 0x9E6, 0x9EF) || /* Bengali digits */			      \
-    RANGE (c, 0xA66, 0xA6F) || /* Gurmukhi digits */			      \
-    RANGE (c, 0xAE6, 0xAEF) || /* Gujarati digits */			      \
-    RANGE (c, 0xB66, 0xB6F) || /* Oriya digits */			      \
-    RANGE (c, 0xBE7, 0xBEF) || /* Tamil digits */			      \
-    RANGE (c, 0xC66, 0xC6F) || /* Telugu digits */			      \
-    RANGE (c, 0xCE6, 0xCEF) || /* Kannada digits */			      \
-    RANGE (c, 0xD66, 0xD6F) || /* Malayalam digits */			      \
-    RANGE (c, 0xE50, 0xE59) || /* Thai digits */			      \
-    RANGE (c, 0xED0, 0xED9))   /* Lao digits */
-
-/* This is not to be used as a stand alone macro. Use JAVA_ID_CHAR_P()
-   or the forcoming JAVA_LETTER_OR_DIGIT_P() instead.
-   It need to be split by region. FIXME.  */
-#define _JAVA_LETTER_OR_DIGIT_P(c)		\
-   (RANGE (c, 0x00C0, 0x00D6) ||		\
-    RANGE (c, 0x00D8, 0x00F6) ||		\
-    RANGE (c, 0x00F8, 0x01F5) ||		\
-    RANGE (c, 0x01FA, 0x0217) ||		\
-    RANGE (c, 0x0250, 0x02A8) ||		\
-    RANGE (c, 0x02B0, 0x02DE) ||		\
-    RANGE (c, 0x02E0, 0x02E9) ||		\
-    RANGE (c, 0x0300, 0x0345) ||		\
-    RANGE (c, 0x0360, 0x0361) ||		\
-    RANGE (c, 0x0374, 0x0375) ||		\
-    c == 0x037A            ||			\
-    c == 0x037E            ||			\
-    RANGE (c, 0x0384, 0x038A) ||		\
-    c == 0x038C            ||			\
-    c == 0x038E            ||			\
-    RANGE (c, 0x038F, 0x03A1) ||		\
-    RANGE (c, 0x03A3, 0x03CE) ||		\
-    RANGE (c, 0x03D0, 0x03D6) ||		\
-    RANGE (c, 0x03DA, 0x03E2) ||		\
-    c == 0x03DA            ||			\
-    c == 0x03DC            ||			\
-    c == 0x03DE            ||			\
-    c == 0x03E0            ||			\
-    RANGE (c, 0x03E2, 0x03F3) ||		\
-    RANGE (c, 0x0401, 0x040C) ||		\
-    RANGE (c, 0x040E, 0x044F) ||		\
-    RANGE (c, 0x0451, 0x045C) ||		\
-    RANGE (c, 0x045E, 0x0486) ||		\
-    RANGE (c, 0x0490, 0x04C4) ||		\
-    RANGE (c, 0x04C7, 0x04C8) ||		\
-    RANGE (c, 0x04CB, 0x04CC) ||		\
-    RANGE (c, 0x04D0, 0x04EB) ||		\
-    RANGE (c, 0x04EE, 0x04F5) ||		\
-    RANGE (c, 0x04F8, 0x04F9) ||		\
-    RANGE (c, 0x0531, 0x0556) ||		\
-    RANGE (c, 0x0559, 0x055F) ||		\
-    RANGE (c, 0x0561, 0x0587) ||		\
-    c == 0x0589            ||			\
-    RANGE (c, 0x05B0, 0x05B9) ||		\
-    RANGE (c, 0x05BB, 0x05C3) ||		\
-    RANGE (c, 0x05D0, 0x05EA) ||		\
-    RANGE (c, 0x05F0, 0x05F4) ||		\
-    c == 0x060C            ||			\
-    c == 0x061B            ||			\
-    c == 0x061F            ||			\
-    c == 0x0621            ||			\
-    RANGE (c, 0x0622, 0x063A) ||		\
-    RANGE (c, 0x0640, 0x0652) ||		\
-    RANGE (c, 0x0660, 0x066D) ||		\
-    RANGE (c, 0x0670, 0x06B7) ||		\
-    RANGE (c, 0x06BA, 0x06BE) ||		\
-    RANGE (c, 0x06C0, 0x06CE) ||		\
-    RANGE (c, 0x06D0, 0x06ED) ||		\
-    RANGE (c, 0x06F0, 0x06F9) ||		\
-    RANGE (c, 0x0901, 0x0903) ||		\
-    RANGE (c, 0x0905, 0x0939) ||		\
-    RANGE (c, 0x093C, 0x094D) ||		\
-    RANGE (c, 0x0950, 0x0954) ||		\
-    RANGE (c, 0x0958, 0x0970) ||		\
-    RANGE (c, 0x0981, 0x0983) ||		\
-    RANGE (c, 0x0985, 0x098C) ||		\
-    RANGE (c, 0x098F, 0x0990) ||		\
-    RANGE (c, 0x0993, 0x09A8) ||		\
-    RANGE (c, 0x09AA, 0x09B0) ||		\
-    c == 0x09B2            ||			\
-    RANGE (c, 0x09B6, 0x09B9) ||		\
-    c == 0x09BC            ||			\
-    c == 0x09BE            ||			\
-    RANGE (c, 0x09BF, 0x09C4) ||		\
-    RANGE (c, 0x09C7, 0x09C8) ||		\
-    RANGE (c, 0x09CB, 0x09CD) ||		\
-    c == 0x09D7            ||			\
-    RANGE (c, 0x09DC, 0x09DD) ||		\
-    RANGE (c, 0x09DF, 0x09E3) ||		\
-    RANGE (c, 0x09E6, 0x09FA) ||		\
-    c == 0x0A02            ||			\
-    RANGE (c, 0x0A05, 0x0A0A) ||		\
-    RANGE (c, 0x0A0F, 0x0A10) ||		\
-    RANGE (c, 0x0A13, 0x0A28) ||		\
-    RANGE (c, 0x0A2A, 0x0A30) ||		\
-    RANGE (c, 0x0A32, 0x0A33) ||		\
-    RANGE (c, 0x0A35, 0x0A36) ||		\
-    RANGE (c, 0x0A38, 0x0A39) ||		\
-    c == 0x0A3C            ||			\
-    c == 0x0A3E            ||			\
-    RANGE (c, 0x0A3F, 0x0A42) ||		\
-    RANGE (c, 0x0A47, 0x0A48) ||		\
-    RANGE (c, 0x0A4B, 0x0A4D) ||		\
-    RANGE (c, 0x0A59, 0x0A5C) ||		\
-    c == 0x0A5E            ||			\
-    RANGE (c, 0x0A66, 0x0A74) ||		\
-    RANGE (c, 0x0A81, 0x0A83) ||		\
-    RANGE (c, 0x0A85, 0x0A8B) ||		\
-    c == 0x0A8D            ||			\
-    c == 0x0A8F            ||			\
-    RANGE (c, 0x0A90, 0x0A91) ||		\
-    RANGE (c, 0x0A93, 0x0AA8) ||		\
-    RANGE (c, 0x0AAA, 0x0AB0) ||		\
-    RANGE (c, 0x0AB2, 0x0AB3) ||		\
-    RANGE (c, 0x0AB5, 0x0AB9) ||		\
-    RANGE (c, 0x0ABC, 0x0AC5) ||		\
-    RANGE (c, 0x0AC7, 0x0AC9) ||		\
-    RANGE (c, 0x0ACB, 0x0ACD) ||		\
-    c == 0x0AD0            ||			\
-    c == 0x0AE0            ||			\
-    RANGE (c, 0x0AE6, 0x0AEF) ||		\
-    RANGE (c, 0x0B01, 0x0B03) ||		\
-    RANGE (c, 0x0B05, 0x0B0C) ||		\
-    RANGE (c, 0x0B0F, 0x0B10) ||		\
-    RANGE (c, 0x0B13, 0x0B28) ||		\
-    RANGE (c, 0x0B2A, 0x0B30) ||		\
-    RANGE (c, 0x0B32, 0x0B33) ||		\
-    RANGE (c, 0x0B36, 0x0B39) ||		\
-    RANGE (c, 0x0B3C, 0x0B43) ||		\
-    RANGE (c, 0x0B47, 0x0B48) ||		\
-    RANGE (c, 0x0B4B, 0x0B4D) ||		\
-    RANGE (c, 0x0B56, 0x0B57) ||		\
-    RANGE (c, 0x0B5C, 0x0B5D) ||		\
-    RANGE (c, 0x0B5F, 0x0B61) ||		\
-    RANGE (c, 0x0B66, 0x0B70) ||		\
-    RANGE (c, 0x0B82, 0x0B83) ||		\
-    RANGE (c, 0x0B85, 0x0B8A) ||		\
-    RANGE (c, 0x0B8E, 0x0B90) ||		\
-    RANGE (c, 0x0B92, 0x0B95) ||		\
-    RANGE (c, 0x0B99, 0x0B9A) ||		\
-    c == 0x0B9C            ||			\
-    c == 0x0B9E            ||			\
-    c == 0x0B9F            ||			\
-    RANGE (c, 0x0BA3, 0x0BA4) ||		\
-    RANGE (c, 0x0BA8, 0x0BAA) ||		\
-    RANGE (c, 0x0BAE, 0x0BB5) ||		\
-    RANGE (c, 0x0BB7, 0x0BB9) ||		\
-    RANGE (c, 0x0BBE, 0x0BC2) ||		\
-    RANGE (c, 0x0BC6, 0x0BC8) ||		\
-    RANGE (c, 0x0BCA, 0x0BCD) ||		\
-    c == 0x0BD7            ||			\
-    RANGE (c, 0x0BE7, 0x0BF2) ||		\
-    RANGE (c, 0x0C01, 0x0C03) ||		\
-    RANGE (c, 0x0C05, 0x0C0C) ||		\
-    RANGE (c, 0x0C0E, 0x0C10) ||		\
-    RANGE (c, 0x0C12, 0x0C28) ||		\
-    RANGE (c, 0x0C2A, 0x0C33) ||		\
-    RANGE (c, 0x0C35, 0x0C39) ||		\
-    RANGE (c, 0x0C3E, 0x0C44) ||		\
-    RANGE (c, 0x0C46, 0x0C48) ||		\
-    RANGE (c, 0x0C4A, 0x0C4D) ||		\
-    RANGE (c, 0x0C55, 0x0C56) ||		\
-    RANGE (c, 0x0C60, 0x0C61) ||		\
-    RANGE (c, 0x0C66, 0x0C6F) ||		\
-    RANGE (c, 0x0C82, 0x0C83) ||		\
-    RANGE (c, 0x0C85, 0x0C8C) ||		\
-    RANGE (c, 0x0C8E, 0x0C90) ||		\
-    RANGE (c, 0x0C92, 0x0CA8) ||		\
-    RANGE (c, 0x0CAA, 0x0CB3) ||		\
-    RANGE (c, 0x0CB5, 0x0CB9) ||		\
-    RANGE (c, 0x0CBE, 0x0CC4) ||		\
-    RANGE (c, 0x0CC6, 0x0CC8) ||		\
-    RANGE (c, 0x0CCA, 0x0CCD) ||		\
-    RANGE (c, 0x0CD5, 0x0CD6) ||		\
-    c == 0x0CDE            ||			\
-    c == 0x0CE0            ||			\
-    c == 0x0CE1            ||			\
-    RANGE (c, 0x0CE6, 0x0CEF) ||		\
-    RANGE (c, 0x0D02, 0x0D03) ||		\
-    RANGE (c, 0x0D05, 0x0D0C) ||		\
-    RANGE (c, 0x0D0E, 0x0D10) ||		\
-    RANGE (c, 0x0D12, 0x0D28) ||		\
-    RANGE (c, 0x0D2A, 0x0D39) ||		\
-    RANGE (c, 0x0D3E, 0x0D43) ||		\
-    RANGE (c, 0x0D46, 0x0D48) ||		\
-    RANGE (c, 0x0D4A, 0x0D4D) ||		\
-    c == 0x0D57            ||			\
-    RANGE (c, 0x0D60, 0x0D61) ||		\
-    RANGE (c, 0x0D66, 0x0D6F) ||		\
-    RANGE (c, 0x0E01, 0x0E3A) ||		\
-    RANGE (c, 0x0E3F, 0x0E5B) ||		\
-    RANGE (c, 0x0E81, 0x0E82) ||		\
-    c == 0x0E84            ||			\
-    RANGE (c, 0x0E87, 0x0E88) ||		\
-    c == 0x0E8A            ||			\
-    c == 0x0E8D            ||			\
-    RANGE (c, 0x0E94, 0x0E97) ||		\
-    RANGE (c, 0x0E99, 0x0E9F) ||		\
-    RANGE (c, 0x0EA1, 0x0EA3) ||		\
-    c == 0x0EA5            ||			\
-    c == 0x0EA7            ||			\
-    RANGE (c, 0x0EAA, 0x0EAB) ||		\
-    RANGE (c, 0x0EAD, 0x0EB9) ||		\
-    RANGE (c, 0x0EBB, 0x0EBD) ||		\
-    RANGE (c, 0x0EC0, 0x0EC4) ||		\
-    c == 0x0EC6            ||			\
-    c == 0x0EC8            ||			\
-    RANGE (c, 0x0EC9, 0x0ECD) ||		\
-    RANGE (c, 0x0ED0, 0x0ED9) ||		\
-    RANGE (c, 0x0EDC, 0x0EDD) ||		\
-    RANGE (c, 0x10A0, 0x10C5) ||		\
-    RANGE (c, 0x10D0, 0x10F6) ||		\
-    c == 0x10FB            ||			\
-    RANGE (c, 0x1100, 0x1159) ||		\
-    RANGE (c, 0x115F, 0x11A2) ||		\
-    RANGE (c, 0x11A8, 0x11F9) ||		\
-    RANGE (c, 0x1E00, 0x1E9A) ||		\
-    RANGE (c, 0x1EA0, 0x1EF9) ||		\
-    RANGE (c, 0x1F00, 0x1F15) ||		\
-    RANGE (c, 0x1F18, 0x1F1D) ||		\
-    RANGE (c, 0x1F20, 0x1F45) ||		\
-    RANGE (c, 0x1F48, 0x1F4D) ||		\
-    RANGE (c, 0x1F50, 0x1F57) ||		\
-    c == 0x1F59            ||			\
-    c == 0x1F5B            ||			\
-    c == 0x1F5D            ||			\
-    RANGE (c, 0x1F5F, 0x1F7D) ||		\
-    RANGE (c, 0x1F80, 0x1FB4) ||		\
-    RANGE (c, 0x1FB6, 0x1FC4) ||		\
-    RANGE (c, 0x1FC6, 0x1FD3) ||		\
-    RANGE (c, 0x1FD6, 0x1FDB) ||		\
-    RANGE (c, 0x1FDD, 0x1FEF) ||		\
-    RANGE (c, 0x1FF2, 0x1FF4) ||		\
-    RANGE (c, 0x1FF6, 0x1FFE) ||		\
-    RANGE (c, 0x3041, 0x3094) ||		\
-    RANGE (c, 0x3099, 0x309E) ||		\
-    RANGE (c, 0x30A1, 0x30FE) ||		\
-    RANGE (c, 0x3105, 0x312C) ||		\
-    RANGE (c, 0x3131, 0x318E) ||		\
-    RANGE (c, 0x3190, 0x319F) ||		\
-    RANGE (c, 0x3200, 0x321C) ||		\
-    RANGE (c, 0x3220, 0x3243) ||		\
-    RANGE (c, 0x3260, 0x327B) ||		\
-    RANGE (c, 0x327F, 0x32B0) ||		\
-    RANGE (c, 0x32C0, 0x32CB) ||		\
-    RANGE (c, 0x32D0, 0x32FE) ||		\
-    RANGE (c, 0x3300, 0x3376) ||		\
-    RANGE (c, 0x337B, 0x33DD) ||		\
-    RANGE (c, 0x33E0, 0x33FE) ||		\
-    RANGE (c, 0x3400, 0x9FA5) ||		\
-    RANGE (c, 0xF900, 0xFA2D) ||		\
-    RANGE (c, 0xFB00, 0xFB06) ||		\
-    RANGE (c, 0xFB13, 0xFB17) ||		\
-    RANGE (c, 0xFB1E, 0xFB36) ||		\
-    RANGE (c, 0xFB38, 0xFB3C) ||		\
-    c == 0xFB3E            ||			\
-    c == 0xFB40            ||			\
-    c == 0xFB41            ||			\
-    c == 0xFB43            ||			\
-    c == 0xFB44            ||			\
-    c == 0xFB46            ||			\
-    RANGE (c, 0xFB47, 0xFBB1) ||		\
-    RANGE (c, 0xFBD3, 0xFD3F) ||		\
-    RANGE (c, 0xFD50, 0xFD8F) ||		\
-    RANGE (c, 0xFD92, 0xFDC7) ||		\
-    RANGE (c, 0xFDF0, 0xFDFB) ||		\
-    RANGE (c, 0xFE70, 0xFE72) ||		\
-    c == 0xFE74            ||			\
-    c == 0xFE76            ||			\
-    RANGE (c, 0xFE77, 0xFEFC) ||		\
-    RANGE (c, 0xFF10, 0xFF19) ||		\
-    RANGE (c, 0xFF21, 0xFF3A) ||		\
-    RANGE (c, 0xFF41, 0xFF5A) ||		\
-    RANGE (c, 0xFF66, 0xFFBE) ||		\
-    RANGE (c, 0xFFC2, 0xFFC7) ||		\
-    RANGE (c, 0xFFCA, 0xFFCF) ||		\
-    RANGE (c, 0xFFD2, 0xFFD7) ||		\
-    RANGE (c, 0xFFDA, 0xFFDC))
-
-/* Identifier-ignorable characters.  This should not be used
-   standalone.  Note that the JCL says 200a->200e.  That is a typo.
-   The correct values are 202a->202e.  Note also that we test against
-   0x0000 separately to avoid a warning.  */
-#define _JAVA_IDENTIFIER_IGNORABLE(c)					      \
-  (c == 0x0000								      \
-   || RANGE (c, 0x0001, 0x0008)						      \
-   || RANGE (c, 0x000e, 0x001b)						      \
-   || RANGE (c, 0x007f, 0x009f)						      \
-   || RANGE (c, 0x200c, 0x200f)						      \
-   || RANGE (c, 0x202a, 0x202e)						      \
-   || RANGE (c, 0x206a, 0x206f)						      \
-   || c == 0xfeff)
+#define JAVA_ASCII_LETTER(c)   (RANGE (c, 'a', 'z') || RANGE (c, 'A', 'Z'))

 /* Constants  */
-#define JAVA_CHAR_ERROR 0xFFC1	/* This is an illegal unicode!?! FIXME */
 #define JAVA_READ_BUFFER 256
-#define UEOF (unicode_t)0xffff
+#define JAVA_CHAR_ERROR -2
+#define UEOF -1

 #endif