From d19cbcb5e3dd83e2628d25d2cd23892a4cac83b0 Mon Sep 17 00:00:00 2001 From: Tom Tromey Date: Tue, 12 Sep 2000 22:23:59 +0000 Subject: [PATCH] re GNATS gcj/33 (gcj mangles composed characters) Fix for PR gcj/33: * jv-scan.c (help): Document --encoding. (options): Added `encoding' entry. (OPT_ENCODING): New define. (main): Handle --encoding. Include <langinfo.h> if nl_langinfo exists. * lang-options.h: Document --classpath, --CLASSPATH, --main, and --encoding. * jcf-parse.c Include <langinfo.h> if we have nl_langinfo. (parse_source_file): Correctly call java_init_lex. Added `finput' argument. Use nl_langinfo to determine default encoding. * java-tree.h (current_encoding): Declare. * parse.y (java_parser_context_restore_global): Don't restore `finput'. (java_parser_context_save_global): Don't set `finput' field. (java_pop_parser_context): Don't restore `finput'. Free old lexer if required. * lang.c (current_encoding): New global. (lang_decode_option): Recognize `-fencoding='. (finish_parse): Don't close finput. * parse.h (struct parser_ctxt): Removed `finput' and `unget_utf8_value' fields. Added `lexer' field. (java_init_lex): Fixed declaration. * lex.c (java_new_lexer): New function. (java_destroy_lexer): Likewise. (java_read_char): Added `lex' argument. Handle iconv case. (java_read_unicode): Added `lex' argument. Count backslashes in lexer structure. (java_init_lex): Added `finput' and `encoding' arguments. Set `lexer' field in ctxp. (BAD_UTF8_VALUE): Removed. (java_lex): Handle seeing UEOF in the middle of a string literal. * lex.h: Include <iconv.h> if HAVE_ICONV defined. (java_lexer): New structure. (UNGETC): Removed. (GETC): Removed. (DEFAULT_ENCODING): New define. (java_destroy_lexer): Declare. 
From-SVN: r36377 --- gcc/java/ChangeLog | 41 ++++++ gcc/java/java-tree.h | 3 + gcc/java/jcf-parse.c | 31 ++++- gcc/java/jv-scan.c | 27 +++- gcc/java/lang-options.h | 6 +- gcc/java/lang.c | 14 +- gcc/java/lex.c | 294 ++++++++++++++++++++++++++++------------ gcc/java/lex.h | 41 +++++- gcc/java/parse.h | 5 +- gcc/java/parse.y | 7 +- 10 files changed, 364 insertions(+), 105 deletions(-) diff --git a/gcc/java/ChangeLog b/gcc/java/ChangeLog index 642f4a79164..7b13f9a0485 100644 --- a/gcc/java/ChangeLog +++ b/gcc/java/ChangeLog @@ -1,3 +1,44 @@ +2000-09-12 Tom Tromey + + Fix for PR gcj/33: + * jv-scan.c (help): Document --encoding. + (options): Added `encoding' entry. + (OPT_ENCODING): New define. + (main): Handle --encoding. + Include <langinfo.h> if nl_langinfo exists. + * lang-options.h: Document --classpath, --CLASSPATH, --main, and + --encoding. + * jcf-parse.c Include <langinfo.h> if we have nl_langinfo. + (parse_source_file): Correctly call java_init_lex. Added `finput' + argument. Use nl_langinfo to determine default encoding. + * java-tree.h (current_encoding): Declare. + * parse.y (java_parser_context_restore_global): Don't restore + `finput'. + (java_parser_context_save_global): Don't set `finput' field. + (java_pop_parser_context): Don't restore `finput'. Free old lexer + if required. + * lang.c (current_encoding): New global. + (lang_decode_option): Recognize `-fencoding='. + (finish_parse): Don't close finput. + * parse.h (struct parser_ctxt): Removed `finput' and + `unget_utf8_value' fields. Added `lexer' field. + (java_init_lex): Fixed declaration. + * lex.c (java_new_lexer): New function. + (java_destroy_lexer): Likewise. + (java_read_char): Added `lex' argument. Handle iconv case. + (java_read_unicode): Added `lex' argument. Count backslashes in + lexer structure. + (java_init_lex): Added `finput' and `encoding' arguments. Set + `lexer' field in ctxp. + (BAD_UTF8_VALUE): Removed. + (java_lex): Handle seeing UEOF in the middle of a string literal. 
+ * lex.h: Include <iconv.h> if HAVE_ICONV defined. + (java_lexer): New structure. + (UNGETC): Removed. + (GETC): Removed. + (DEFAULT_ENCODING): New define. + (java_destroy_lexer): Declare. + 2000-09-12 Tom Tromey Fix for PR gcj/343: diff --git a/gcc/java/java-tree.h b/gcc/java/java-tree.h index 94fdcaed827..18cdf7a785d 100644 --- a/gcc/java/java-tree.h +++ b/gcc/java/java-tree.h @@ -169,6 +169,9 @@ extern int flag_use_boehm_gc; object to its synchronization structure. */ extern int flag_hash_synchronization; +/* Encoding used for source files. */ +extern char *current_encoding; + /* The Java .class file that provides main_class; the main input file. */ extern struct JCF *current_jcf; diff --git a/gcc/java/jcf-parse.c b/gcc/java/jcf-parse.c index 02becc07483..4b76f598ca0 100644 --- a/gcc/java/jcf-parse.c +++ b/gcc/java/jcf-parse.c @@ -35,6 +35,10 @@ The Free Software Foundation is independent of Sun Microsystems, Inc. */ #include "toplev.h" #include "parse.h" +#ifdef HAVE_NL_LANGINFO +#include <langinfo.h> +#endif + /* A CONSTANT_Utf8 element is converted to an IDENTIFIER_NODE at parse time. 
*/ #define JPOOL_UTF(JCF, INDEX) CPOOL_UTF(&(JCF)->cpool, INDEX) #define JPOOL_UTF_LENGTH(JCF, INDEX) IDENTIFIER_LENGTH (JPOOL_UTF (JCF, INDEX)) @@ -83,7 +87,7 @@ static struct JCF main_jcf[1]; static tree give_name_to_class PARAMS ((JCF *jcf, int index)); static void parse_zip_file_entries PARAMS ((void)); static void process_zip_dir PARAMS ((void)); -static void parse_source_file PARAMS ((tree)); +static void parse_source_file PARAMS ((tree, FILE *)); static void jcf_parse_source PARAMS ((void)); static int jcf_figure_file_type PARAMS ((JCF *)); static int find_in_current_zip PARAMS ((const char *, struct JCF **)); @@ -564,6 +568,7 @@ static void jcf_parse_source () { tree file; + FILE *finput; java_parser_context_save_global (); java_push_parser_context (); @@ -576,7 +581,7 @@ jcf_parse_source () if (!(finput = fopen (input_filename, "r"))) fatal ("input file `%s' just disappeared - jcf_parse_source", input_filename); - parse_source_file (file); + parse_source_file (file, finput); if (fclose (finput)) fatal ("can't close input file `%s' stream - jcf_parse_source", input_filename); @@ -754,8 +759,9 @@ parse_class_file () /* Parse a source file, as pointed by the current value of INPUT_FILENAME. */ static void -parse_source_file (file) +parse_source_file (file, finput) tree file; + FILE *finput; { int save_error_count = java_error_count; /* Mark the file as parsed */ @@ -765,7 +771,21 @@ parse_source_file (file) lang_init_source (1); /* Error msgs have no method prototypes */ - java_init_lex (); /* Initialize the parser */ + /* There's no point in trying to find the current encoding unless we + are going to do something intelligent with it -- hence the test + for iconv. 
*/ +#ifdef HAVE_ICONV +#ifdef HAVE_NL_LANGINFO + setlocale (LC_CTYPE, ""); + if (current_encoding == NULL) + current_encoding = nl_langinfo (CODESET); +#endif /* HAVE_NL_LANGINFO */ +#endif /* HAVE_ICONV */ + if (current_encoding == NULL || *current_encoding == '\0') + current_encoding = DEFAULT_ENCODING; + + /* Initialize the parser */ + java_init_lex (finput, current_encoding); java_parse_abort_on_error (); java_parse (); /* Parse and build partial tree nodes. */ @@ -796,6 +816,7 @@ yyparse () int several_files = 0; char *list = xstrdup (input_filename), *next; tree node, current_file_list = NULL_TREE; + FILE *finput; do { @@ -901,7 +922,7 @@ yyparse () case JCF_SOURCE: java_push_parser_context (); java_parser_context_save_global (); - parse_source_file (name); + parse_source_file (name, finput); java_parser_context_restore_global (); java_pop_parser_context (1); break; diff --git a/gcc/java/jv-scan.c b/gcc/java/jv-scan.c index adb7ba38345..ae9c91d108e 100644 --- a/gcc/java/jv-scan.c +++ b/gcc/java/jv-scan.c @@ -26,6 +26,10 @@ Boston, MA 02111-1307, USA. */ #include "version.h" +#ifdef HAVE_NL_LANGINFO +#include <langinfo.h> +#endif + #include <getopt.h> void fatal PARAMS ((const char *s, ...)) ATTRIBUTE_PRINTF_1 ATTRIBUTE_NORETURN; @@ -61,6 +65,7 @@ int flag_list_filename = 0; #define OPT_HELP LONG_OPT (0) #define OPT_VERSION LONG_OPT (1) +#define OPT_ENCODING LONG_OPT (2) static struct option options[] = { @@ -69,6 +74,7 @@ static struct option options[] = { "print-main", no_argument, &flag_find_main, 1 }, { "list-filename", no_argument, &flag_list_filename, 1 }, { "list-class", no_argument, &flag_dump_class, 1 }, + { "encoding", required_argument, NULL, OPT_ENCODING }, { NULL, no_argument, NULL, 0 } }; @@ -84,6 +90,7 @@ help () { printf ("Usage: jv-scan [OPTION]... 
FILE...\n\n"); printf ("Print useful information read from Java source files.\n\n"); + printf (" --encoding NAME Specify encoding of input file\n"); printf (" --print-main Print name of class containing `main'\n"); printf (" --list-class List all classes defined in file\n"); printf (" --list-filename Print input filename when listing class names\n"); @@ -114,6 +121,7 @@ DEFUN (main, (argc, argv), { int i = 1; const char *output_file = NULL; + const char *encoding = NULL; long ft; int opt; @@ -144,6 +152,10 @@ DEFUN (main, (argc, argv), version (); break; + case OPT_ENCODING: + encoding = optarg; + break; + default: usage (); break; @@ -172,7 +184,20 @@ DEFUN (main, (argc, argv), input_filename = argv [i]; if ( (finput = fopen (argv [i], "r")) ) { - java_init_lex (); + /* There's no point in trying to find the current encoding + unless we are going to do something intelligent with it + -- hence the test for iconv. */ +#ifdef HAVE_ICONV +#ifdef HAVE_NL_LANGINFO + setlocale (LC_CTYPE, ""); + if (encoding == NULL) + encoding = nl_langinfo (CODESET); +#endif /* HAVE_NL_LANGINFO */ +#endif /* HAVE_ICONV */ + if (encoding == NULL || *encoding == '\0') + encoding = DEFAULT_ENCODING; + + java_init_lex (finput, encoding); yyparse (); if (ftell (out) != ft) fputc ('\n', out); diff --git a/gcc/java/lang-options.h b/gcc/java/lang-options.h index 630e6d8a1bd..2b207dcdeb2 100644 --- a/gcc/java/lang-options.h +++ b/gcc/java/lang-options.h @@ -42,8 +42,10 @@ DEFINE_LANG_NAME ("Java") { "-M", "Print dependencies to stdout" }, { "-MM", "Print dependencies to stdout" }, #endif /* ! 
USE_CPPLIB */ - { "-fclasspath", "Set class path and suppress system path" }, - { "-fCLASSPATH", "Set class path" }, + { "--classpath", "Set class path and suppress system path" }, + { "--CLASSPATH", "Set class path" }, + { "--main", "Choose class whose main method should be used" }, + { "--encoding", "Choose input encoding (default is UTF-8)" }, { "-I", "Add directory to class path" }, { "-foutput-class-dir", "Directory where class files should be written" }, { "-fuse-divide-subroutine", "" }, diff --git a/gcc/java/lang.c b/gcc/java/lang.c index 5f95f2d9c57..2dc33f22b09 100644 --- a/gcc/java/lang.c +++ b/gcc/java/lang.c @@ -121,6 +121,9 @@ int flag_hash_synchronization; JNI, not CNI. */ int flag_jni = 0; +/* The encoding of the source file. */ +char *current_encoding = NULL; + /* When non zero, report the now deprecated empty statements. */ int flag_extraneous_semicolon; @@ -222,6 +225,13 @@ lang_decode_option (argc, argv) return 1; } #undef ARG +#define ARG "-fencoding=" + if (strncmp (p, ARG, sizeof (ARG) - 1) == 0) + { + current_encoding = p + sizeof (ARG) - 1; + return 1; + } +#undef ARG if (p[0] == '-' && p[1] == 'f') { @@ -309,7 +319,9 @@ lang_decode_option (argc, argv) return 0; } +/* Global open file. */ FILE *finput; + const char * init_parse (filename) const char *filename; @@ -362,6 +374,7 @@ init_parse (filename) } } } + init_lex (); return filename; @@ -370,7 +383,6 @@ init_parse (filename) void finish_parse () { - fclose (finput); jcf_dependency_write (); } diff --git a/gcc/java/lex.c b/gcc/java/lex.c index 535733fe2e2..4179b1dbca5 100644 --- a/gcc/java/lex.c +++ b/gcc/java/lex.c @@ -24,15 +24,15 @@ of Sun Microsystems, Inc. in the United States and other countries. The Free Software Foundation is independent of Sun Microsystems, Inc. 
*/ /* It defines java_lex (yylex) that reads a Java ASCII source file -possibly containing Unicode escape sequence or utf8 encoded characters -and returns a token for everything found but comments, white spaces -and line terminators. When necessary, it also fills the java_lval -(yylval) union. It's implemented to be called by a re-entrant parser -generated by Bison. + possibly containing Unicode escape sequence or utf8 encoded + characters and returns a token for everything found but comments, + white spaces and line terminators. When necessary, it also fills + the java_lval (yylval) union. It's implemented to be called by a + re-entrant parser generated by Bison. -The lexical analysis conforms to the Java grammar described in "The -Java(TM) Language Specification. J. Gosling, B. Joy, G. Steele. -Addison Wesley 1996" (http://java.sun.com/docs/books/jls/html/3.doc.html) */ + The lexical analysis conforms to the Java grammar described in "The + Java(TM) Language Specification. J. Gosling, B. Joy, G. Steele. 
+ Addison Wesley 1996" (http://java.sun.com/docs/books/jls/html/3.doc.html) */ #include "keyword.h" @@ -55,15 +55,18 @@ static int java_letter_or_digit_p PARAMS ((unicode_t)); static int java_parse_doc_section PARAMS ((unicode_t)); static void java_parse_end_comment PARAMS ((unicode_t)); static unicode_t java_get_unicode PARAMS ((void)); -static unicode_t java_read_unicode PARAMS ((int, int *)); +static unicode_t java_read_unicode PARAMS ((java_lexer *, int, int *)); static void java_store_unicode PARAMS ((struct java_line *, unicode_t, int)); -static unicode_t java_read_char PARAMS ((void)); +static unicode_t java_read_char PARAMS ((java_lexer *)); static void java_allocate_new_line PARAMS ((void)); static void java_unget_unicode PARAMS ((void)); static unicode_t java_sneak_unicode PARAMS ((void)); +java_lexer *java_new_lexer PARAMS ((FILE *, const char *)); void -java_init_lex () +java_init_lex (finput, encoding) + FILE *finput; + const char *encoding; { #ifndef JC1_LITE int java_lang_imported = 0; @@ -114,9 +117,9 @@ java_init_lex () ctxp->lineno = lineno = 0; ctxp->p_line = NULL; ctxp->c_line = NULL; - ctxp->unget_utf8_value = 0; ctxp->minus_seen = 0; ctxp->java_error_flag = 0; + ctxp->lexer = java_new_lexer (finput, encoding); } static char * @@ -194,59 +197,180 @@ java_allocate_new_line () ctxp->c_line->white_space_only = 1; } -#define BAD_UTF8_VALUE 0xFFFE +/* Create a new lexer object. */ +java_lexer * +java_new_lexer (finput, encoding) + FILE *finput; + const char *encoding; +{ + java_lexer *lex = (java_lexer *) xmalloc (sizeof (java_lexer)); + int enc_error = 0; + + lex->finput = finput; + lex->bs_count = 0; + lex->unget_value = 0; + +#ifdef HAVE_ICONV + lex->handle = iconv_open ("UCS-2", encoding); + if (lex->handle == (iconv_t) -1) + { + /* FIXME: we should give a nice error based on errno here. 
*/ + enc_error = 1; + } + lex->first = -1; + lex->last = -1; +#else /* HAVE_ICONV */ + if (strcmp (encoding, DEFAULT_ENCODING)) + enc_error = 1; +#endif /* HAVE_ICONV */ + + if (enc_error) + fatal ("unknown encoding: `%s'", encoding); + + return lex; +} + +void +java_destroy_lexer (lex) + java_lexer *lex; +{ +#ifdef HAVE_ICONV + iconv_close (lex->handle); +#endif + free (lex); +} static unicode_t -java_read_char () +java_read_char (lex) + java_lexer *lex; { - int c; - int c1, c2; - - if (ctxp->unget_utf8_value) + if (lex->unget_value) { - int to_return = ctxp->unget_utf8_value; - ctxp->unget_utf8_value = 0; - return (to_return); + unicode_t r = lex->unget_value; + lex->unget_value = 0; + return r; } - c = GETC (); +#ifdef HAVE_ICONV + { + char out[2]; + size_t ir, inbytesleft, in_save, out_count; + char *inp, *outp; - if (c < 128) - return (unicode_t)c; - if (c == EOF) - return UEOF; - else - { - if ((c & 0xe0) == 0xc0) - { - c1 = GETC (); - if ((c1 & 0xc0) == 0x80) - return (unicode_t)(((c &0x1f) << 6) + (c1 & 0x3f)); - c = c1; - } - else if ((c & 0xf0) == 0xe0) - { - c1 = GETC (); - if ((c1 & 0xc0) == 0x80) - { - c2 = GETC (); - if ((c2 & 0xc0) == 0x80) - return (unicode_t)(((c & 0xf) << 12) + - (( c1 & 0x3f) << 6) + (c2 & 0x3f)); - else - c = c2; - } - else + while (1) + { + /* See if we need to read more data. If FIRST == 0 then the + previous conversion attempt ended in the middle of a + character at the end of the buffer. Otherwise we only have + to read if the buffer is empty. */ + if (lex->first == 0 || lex->first >= lex->last) + { + int r; + + if (lex->first >= lex->last) + { + lex->first = 0; + lex->last = 0; + } + if (feof (lex->finput)) + return UEOF; + r = fread (&lex->buffer[lex->last], 1, + sizeof (lex->buffer) - lex->last, + lex->finput); + lex->last += r; + } + + inbytesleft = lex->last - lex->first; + + if (inbytesleft == 0) + { + /* We've tried to read and there is nothing left. 
*/ + return UEOF; + } + + in_save = inbytesleft; + out_count = 2; + inp = &lex->buffer[lex->first]; + outp = out; + ir = iconv (lex->handle, (const char **) &inp, &inbytesleft, + &outp, &out_count); + lex->first += in_save - inbytesleft; + + if (out_count == 0) + { + /* Success. We assume that UCS-2 is big-endian. This + appears to be an ok assumption. */ + unicode_t result; + result = (((unsigned char) out[0]) << 8) | (unsigned char) out[1]; + return result; + } + + if (ir == (size_t) -1) + { + if (errno == EINVAL) + { + /* This is ok. This means that the end of our buffer + is in the middle of a character sequence. We just + move the valid part of the buffer to the beginning + to force a read. */ + /* We use bcopy() because it should work for + overlapping strings. Use memmove() instead... */ + bcopy (&lex->buffer[lex->first], &lex->buffer[0], + lex->last - lex->first); + lex->last -= lex->first; + lex->first = 0; + } + else + { + /* A more serious error. */ + java_lex_error ("unrecognized character in input stream", 0); + return UEOF; + } + } + } + } +#else /* HAVE_ICONV */ + { + int c, c1, c2; + c = getc (lex->finput); + + if (c < 128) + return (unicode_t)c; + if (c == EOF) + return UEOF; + else + { + if ((c & 0xe0) == 0xc0) + { + c1 = getc (lex->finput); + if ((c1 & 0xc0) == 0x80) + return (unicode_t)(((c &0x1f) << 6) + (c1 & 0x3f)); c = c1; - } - /* We looked for a UTF8 multi-byte sequence (since we saw an initial - byte with the high bit set), but found invalid bytes instead. - If the most recent byte was Ascii (and not EOF), we should - unget it, in case it was a comment terminator or other delimitor. 
*/ - if ((c & 0x80) == 0) - UNGETC (c); - return BAD_UTF8_VALUE; - } + } + else if ((c & 0xf0) == 0xe0) + { + c1 = getc (lex->finput); + if ((c1 & 0xc0) == 0x80) + { + c2 = getc (lex->finput); + if ((c2 & 0xc0) == 0x80) + return (unicode_t)(((c & 0xf) << 12) + + (( c1 & 0x3f) << 6) + (c2 & 0x3f)); + else + c = c2; + } + else + c = c1; + } + + /* We simply don't support invalid characters. */ + java_lex_error ("malformed UTF-8 character", 0); + } + } +#endif /* HAVE_ICONV */ + + /* We only get here on error. */ + return UEOF; } static void @@ -267,56 +391,54 @@ java_store_unicode (l, c, unicode_escape_p) } static unicode_t -java_read_unicode (term_context, unicode_escape_p) - int term_context; - int *unicode_escape_p; +java_read_unicode (lex, term_context, unicode_escape_p) + java_lexer *lex; + int term_context; + int *unicode_escape_p; { unicode_t c; - long i, base; - c = java_read_char (); + c = java_read_char (lex); *unicode_escape_p = 0; if (c != '\\') - return ((term_context ? c : - java_lineterminator (c) ? '\n' : (unicode_t)c)); - - /* Count the number of preceeding '\' */ - for (base = ftell (finput), i = base-2; c == '\\';) - { - fseek (finput, i--, SEEK_SET); - c = java_read_char (); /* Will fail if reading utf8 stream. FIXME */ - } - fseek (finput, base, SEEK_SET); - if ((base-i-3)%2 == 0) /* If odd number of \ seen */ { - c = java_read_char (); + lex->bs_count = 0; + return (term_context ? c : (java_lineterminator (c) + ? '\n' + : (unicode_t) c)); + } + + ++lex->bs_count; + if ((lex->bs_count) % 2 == 1) + { + /* Odd number of \ seen. */ + c = java_read_char (lex); if (c == 'u') { - unsigned short unicode = 0; + unicode_t unicode = 0; int shift = 12; /* Next should be 4 hex digits, otherwise it's an error. The hex value is converted into the unicode, pushed into the Unicode stream. 
*/ for (shift = 12; shift >= 0; shift -= 4) { - if ((c = java_read_char ()) == UEOF) + if ((c = java_read_char (lex)) == UEOF) return UEOF; if (c >= '0' && c <= '9') unicode |= (unicode_t)((c-'0') << shift); else if ((c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) unicode |= (unicode_t)((10+(c | 0x20)-'a') << shift); else - java_lex_error - ("Non hex digit in Unicode escape sequence", 0); + java_lex_error ("Non hex digit in Unicode escape sequence", 0); } *unicode_escape_p = 1; - return (term_context ? unicode : - (java_lineterminator (c) ? '\n' : unicode)); + return (term_context + ? unicode : (java_lineterminator (c) ? '\n' : unicode)); } - ctxp->unget_utf8_value = c; + lex->unget_value = c; } - return (unicode_t)'\\'; + return (unicode_t) '\\'; } static unicode_t @@ -331,7 +453,7 @@ java_get_unicode () for (;;) { int unicode_escape_p; - c = java_read_unicode (0, &unicode_escape_p); + c = java_read_unicode (ctxp->lexer, 0, &unicode_escape_p); java_store_unicode (ctxp->c_line, c, unicode_escape_p); if (ctxp->c_line->white_space_only && !JAVA_WHITE_SPACE_P (c) && c!='\n') @@ -354,7 +476,7 @@ java_lineterminator (c) else if (c == '\r') /* CR */ { int unicode_escape_p; - c = java_read_unicode (1, &unicode_escape_p); + c = java_read_unicode (ctxp->lexer, 1, &unicode_escape_p); if (c == '\r') { /* In this case we will have another terminator. For some @@ -363,7 +485,7 @@ java_lineterminator (c) up in the actual text of the line, causing an error. So instead we choose a very low-level method. FIXME: this is incredibly ugly. 
*/ - UNGETC (c); + ctxp->lexer->unget_value = c; } else if (c != '\n') { @@ -939,7 +1061,7 @@ java_lex (java_lval) char *string; for (no_error = 1, c = java_get_unicode (); - c != '"' && c != '\n'; c = java_get_unicode ()) + c != UEOF && c != '"' && c != '\n'; c = java_get_unicode ()) { if (c == '\\') c = java_parse_escape_sequence (); diff --git a/gcc/java/lex.h b/gcc/java/lex.h index d4754aba67f..cf29aa16a70 100644 --- a/gcc/java/lex.h +++ b/gcc/java/lex.h @@ -35,6 +35,13 @@ extern int lineno; /* A Unicode character, as read from the input file */ typedef unsigned short unicode_t; +#ifdef HAVE_ICONV +#include <iconv.h> +#endif /* HAVE_ICONV */ + +/* Default encoding to use if no encoding is specified. */ +#define DEFAULT_ENCODING "UTF-8" + /* Debug macro to print-out what we match */ #ifdef JAVA_LEX_DEBUG #ifdef JAVA_LEX_DEBUG_CHAR @@ -96,13 +103,39 @@ typedef struct _java_lc { int col; } java_lc; +typedef struct java_lexer +{ + /* The file from which we're reading. */ + FILE *finput; + + /* Number of consecutive backslashes we've read. */ + int bs_count; + + /* If nonzero, a value that was pushed back. */ + unicode_t unget_value; + +#ifdef HAVE_ICONV + /* The handle for the iconv converter we're using. */ + iconv_t handle; + + /* Bytes we've read from the file but have not sent to iconv. */ + char buffer[1024]; + + /* Index of first valid character in buffer, -1 if no valid + characters. */ + int first; + + /* Index of last valid character in buffer, plus one. -1 if no + valid characters in buffer. */ + int last; +#endif /* HAVE_ICONV */ +} java_lexer; + +/* Destroy a lexer object. 
*/ +extern void java_destroy_lexer PARAMS ((java_lexer *)); #define JAVA_LINE_MAX 80 -/* Macro to read and unread bytes */ -#define UNGETC(c) ungetc(c, finput) -#define GETC() getc(finput) - /* Build a location compound integer */ #define BUILD_LOCATION() ((ctxp->elc.line << 12) | (ctxp->elc.col & 0xfff)) diff --git a/gcc/java/parse.h b/gcc/java/parse.h index 80712370d8d..b1b0e8e9831 100644 --- a/gcc/java/parse.h +++ b/gcc/java/parse.h @@ -728,13 +728,12 @@ typedef struct _jdeplist { struct parser_ctxt { const char *filename; /* Current filename */ - FILE *finput; /* Current file input stream */ struct parser_ctxt *next; + java_lexer *lexer; /* Current lexer state */ char marker_begining; /* Marker. Should be a sub-struct */ struct java_line *p_line, *c_line; /* Previous and current line */ java_lc elc; /* Error's line column info */ - unicode_t unget_utf8_value; /* An unget utf8 value */ int ccb_indent; /* Keep track of {} indent, lexer */ int first_ccb_indent1; /* First { at ident level 1 */ int last_ccb_indent1; /* Last } at ident level 1 */ @@ -928,7 +927,7 @@ extern void reset_report PARAMS ((void)); /* Always in use, no matter what you compile */ void java_push_parser_context PARAMS ((void)); void java_pop_parser_context PARAMS ((int)); -void java_init_lex PARAMS ((void)); +void java_init_lex PARAMS ((FILE *, const char *)); extern void java_parser_context_save_global PARAMS ((void)); extern void java_parser_context_restore_global PARAMS ((void)); int yyparse PARAMS ((void)); diff --git a/gcc/java/parse.y b/gcc/java/parse.y index 9c92e58242a..42f4206948a 100644 --- a/gcc/java/parse.y +++ b/gcc/java/parse.y @@ -2618,10 +2618,13 @@ java_pop_parser_context (generate) next->incomplete_class = ctxp->incomplete_class; next->gclass_list = ctxp->gclass_list; lineno = ctxp->lineno; - finput = ctxp->finput; current_class = ctxp->current_class; } + /* If the old and new lexers differ, then free the old one. 
*/ + if (ctxp->lexer && next && ctxp->lexer != next->lexer) + java_destroy_lexer (ctxp->lexer); + /* Set the single import class file flag to 0 for the current list of imported things */ for (current = ctxp->import_list; current; current = TREE_CHAIN (current)) @@ -2661,7 +2664,6 @@ java_parser_context_save_global () else if (ctxp->saved_data) create_new_parser_context (1); - ctxp->finput = finput; ctxp->lineno = lineno; ctxp->current_class = current_class; ctxp->filename = input_filename; @@ -2675,7 +2677,6 @@ java_parser_context_save_global () void java_parser_context_restore_global () { - finput = ctxp->finput; lineno = ctxp->lineno; current_class = ctxp->current_class; input_filename = ctxp->filename;