configure.in (enable_c_mbchar): New configure option.

Mon Jul 20 16:16:38 1998  Dave Brolley  <brolley@cygnus.com>
	* configure.in (enable_c_mbchar): New configure option.
	(extra_cpp_objs): Always available now.
	* cexp.y (mbchar.h): #include it.
	(yylex): Handle Multibyte characters in character literals.
	* cccp.c (mbchar.h): #include it.
	(main): Set character set based on LANG environment variable.
	(rescan): Handle multibyte characters in comments.
	(skip_if_group): See above.
	(validate_else): See above.
	(skip_to_end_of_comment): See above.
	(macarg1): See above.
	(discard_comments): See above.
	(rescan): Handle multibyte characters in string and character literals.
	(collect_expansion): See above.
	(skip_quoted_string): See above.
	(macroexpand): See above.
	(macarg1): See above.
	(discard_comments): See above.
	(change_newlines): See above.
	* c-lex.c (mbchar.h): #include it.
	(GET_ENVIRONMENT): New macro.
	(init_lex): Set character set based on LANG environment variable.
	(yylex): Handle multibyte characters in character literals.
	(yylex): Handle multibyte characters in string literals.
	* Makefile.in (mbchar.o): New target.
	(cccp$(exeext)): @extra_cpp_objs@ is always available.
	(cppmain$(exeext)): @extra_cpp_objs@ is always available.
	* mbchar.[ch]: New files for multibyte character handling.

From-SVN: r21303
This commit is contained in:
Dave Brolley 1998-07-20 13:35:38 +00:00 committed by Dave Brolley
parent 689fcba861
commit 56f48ce976
9 changed files with 987 additions and 156 deletions

View File

@ -1,3 +1,39 @@
Mon Jul 20 16:16:38 1998 Dave Brolley <brolley@cygnus.com>
* configure.in (enable_c_mbchar): New configure option.
(extra_cpp_objs): Always available now.
* cexp.y (mbchar.h): #include it.
(yylex): Handle Multibyte characters in character literals.
* cccp.c (mbchar.h): #include it.
(main): Set character set based on LANG environment variable.
(rescan): Handle multibyte characters in comments.
(skip_if_group): See above.
(validate_else): See above.
(skip_to_end_of_comment): See above.
(macarg1): See above.
(discard_comments): See above.
(rescan): Handle multibyte characters in string and character literals.
(collect_expansion): See above.
(skip_quoted_string): See above.
(macroexpand): See above.
(macarg1): See above.
(discard_comments): See above.
(change_newlines): See above.
* c-lex.c (mbchar.h): #include it.
(GET_ENVIRONMENT): New macro.
(init_lex): Set character set based on LANG environment variable.
(yylex): Handle multibyte characters in character literals.
(yylex): Handle multibyte characters in string literals.
* Makefile.in (mbchar.o): New target.
(cccp$(exeext)): @extra_cpp_objs@ is always available.
(cppmain$(exeext)): @extra_cpp_objs@ is always available.
* mbchar.[ch]: New files for multibyte character handling.
Mon Jul 20 01:11:11 1998 David S. Miller <davem@pierdol.cobaltmicro.com>
* jump.c (jump_optimize): When simplifying noop moves and

View File

@ -641,7 +641,8 @@ OBJS = toplev.o version.o tree.o print-tree.o stor-layout.o fold-const.o \
regclass.o local-alloc.o global.o reload.o reload1.o caller-save.o gcse.o \
insn-peep.o reorg.o $(SCHED_PREFIX)sched.o final.o recog.o reg-stack.o \
insn-opinit.o insn-recog.o insn-extract.o insn-output.o insn-emit.o \
profile.o insn-attrtab.o $(out_object_file) getpwd.o $(EXTRA_OBJS) convert.o
profile.o insn-attrtab.o $(out_object_file) getpwd.o $(EXTRA_OBJS) convert.o \
mbchar.o
# GEN files are listed separately, so they can be built before doing parallel
# makes for cc1 or cc1plus. Otherwise sequent parallel make attempts to load
@ -1275,13 +1276,14 @@ c-lang.o : c-lang.c $(CONFIG_H) system.h $(TREE_H) c-tree.h c-lex.h toplev.h \
output.h
c-lex.o : c-lex.c $(CONFIG_H) system.h $(TREE_H) $(RTL_H) c-lex.h c-tree.h \
$(srcdir)/c-parse.h input.h flags.h $(srcdir)/c-gperf.h c-pragma.h \
toplev.h output.h
toplev.h output.h mbchar.h
c-aux-info.o : c-aux-info.c $(CONFIG_H) system.h $(TREE_H) c-tree.h flags.h
c-convert.o : c-convert.c $(CONFIG_H) system.h $(TREE_H) flags.h toplev.h
c-pragma.o: c-pragma.c $(CONFIG_H) system.h $(RTL_H) $(TREE_H) except.h \
function.h defaults.h c-pragma.h toplev.h
c-iterate.o: c-iterate.c $(CONFIG_H) system.h $(TREE_H) $(RTL_H) c-tree.h \
flags.h toplev.h $(EXPR_H)
mbchar.o: $(CONFIG_H) system.h gansidecl.h mbchar.h
collect2$(exeext): collect2.o tlink.o hash.o cplus-dem.o underscore.o \
version.o choose-temp.o mkstemp.o $(LIBDEPS)
@ -1816,15 +1818,16 @@ $(HOST_PREFIX_1):
cpp$(exeext): $(CCCP)$(exeext)
-rm -f cpp$(exeext)
$(LN) $(CCCP)$(exeext) cpp$(exeext)
cccp$(exeext): cccp.o cexp.o version.o prefix.o $(LIBDEPS)
$(CC) $(ALL_CFLAGS) $(LDFLAGS) -o $@ cccp.o cexp.o prefix.o \
version.o $(LIBS)
cccp$(exeext): cccp.o cexp.o version.o prefix.o mbchar.o @extra_cpp_objs@ $(LIBDEPS)
$(CC) $(ALL_CFLAGS) $(LDFLAGS) -o $@ cccp.o cexp.o prefix.o mbchar.o \
version.o @extra_cpp_objs@ $(LIBS)
cexp.o: $(srcdir)/cexp.c $(CONFIG_H) system.h gansidecl.h
$(CC) $(ALL_CFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) -c $(srcdir)/cexp.c
$(srcdir)/cexp.c: $(srcdir)/cexp.y
cd $(srcdir); $(BISON) -o cexp.c cexp.y
cccp.o: cccp.c $(CONFIG_H) pcp.h version.c config.status system.h gansidecl.h
cccp.o: cccp.c $(CONFIG_H) pcp.h version.c config.status system.h gansidecl.h \
mbchar.h
$(CC) $(ALL_CFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
-DGCC_INCLUDE_DIR=\"$(libsubdir)/include\" \
-DGPLUSPLUS_INCLUDE_DIR=\"$(gxx_include_dir)\" \
@ -1835,8 +1838,9 @@ cccp.o: cccp.c $(CONFIG_H) pcp.h version.c config.status system.h gansidecl.h
-c `echo $(srcdir)/cccp.c | sed 's,^\./,,'`
cppmain$(exeext): cppmain.o cpplib.o cpphash.o cppalloc.o cpperror.o cppexp.o \
prefix.o version.o $(LIBDEPS)
prefix.o version.o mbchar.o @extra_cpp_objs@ $(LIBDEPS)
$(CC) $(ALL_CFLAGS) $(LDFLAGS) -o $@ cppmain.o cpplib.o cpphash.o \
mbchar.o @extra_cpp_objs@ \
cppalloc.o cpperror.o cppexp.o prefix.o version.o $(LIBS)
cppmain.o: cppmain.c $(CONFIG_H) cpplib.h system.h gansidecl.h

View File

@ -33,16 +33,14 @@ Boston, MA 02111-1307, USA. */
#include "c-pragma.h"
#include "toplev.h"
/* MULTIBYTE_CHARS support only works for native compilers.
??? Ideally what we want is to model widechar support after
the current floating point support. */
#ifdef CROSS_COMPILE
#undef MULTIBYTE_CHARS
#endif
#ifdef MULTIBYTE_CHARS
#include "mbchar.h"
#include <locale.h>
#ifndef GET_ENVIRONMENT
#define GET_ENVIRONMENT(ENV_VALUE,ENV_NAME) ((ENV_VALUE) = getenv (ENV_NAME))
#endif
#endif /* MULTIBYTE_CHARS */
#if USE_CPPLIB
#include "cpplib.h"
@ -232,6 +230,7 @@ init_lex ()
#ifdef MULTIBYTE_CHARS
/* Change to the native locale for multibyte conversions. */
setlocale (LC_CTYPE, "");
GET_ENVIRONMENT (literal_codeset, "LANG");
#endif
maxtoken = 40;
@ -1795,30 +1794,27 @@ yylex ()
{
register int result = 0;
register int num_chars = 0;
int chars_seen = 0;
unsigned width = TYPE_PRECISION (char_type_node);
int max_chars;
if (wide_flag)
{
width = WCHAR_TYPE_SIZE;
#ifdef MULTIBYTE_CHARS
max_chars = MB_CUR_MAX;
#else
max_chars = 1;
int longest_char = local_mb_cur_max ();
(void) local_mbtowc (NULL_PTR, NULL_PTR, 0);
#endif
}
else
max_chars = TYPE_PRECISION (integer_type_node) / width;
max_chars = TYPE_PRECISION (integer_type_node) / width;
if (wide_flag)
width = WCHAR_TYPE_SIZE;
while (1)
{
tryagain:
c = GETC();
if (c == '\'' || c == EOF)
break;
++chars_seen;
if (c == '\\')
{
int ignore = 0;
@ -1839,18 +1835,76 @@ yylex ()
pedwarn ("ANSI C forbids newline in character constant");
lineno++;
}
#ifdef MAP_CHARACTER
else
c = MAP_CHARACTER (c);
{
#ifdef MULTIBYTE_CHARS
wchar_t wc;
int i;
int char_len = -1;
for (i = 1; i <= longest_char; ++i)
{
if (i > maxtoken - 4)
extend_token_buffer (token_buffer);
token_buffer[i] = c;
char_len = local_mbtowc (& wc,
token_buffer + 1,
i);
if (char_len != -1)
break;
c = GETC ();
}
if (char_len > 1)
{
/* mbtowc sometimes needs an extra char before accepting */
if (char_len < i)
UNGETC (c);
if (! wide_flag)
{
/* Merge character into result; ignore excess chars. */
for (i = 1; i <= char_len; ++i)
{
if (i > max_chars)
break;
if (width < HOST_BITS_PER_INT)
result = (result << width)
| (token_buffer[i]
& ((1 << width) - 1));
else
result = token_buffer[i];
}
num_chars += char_len;
goto tryagain;
}
c = wc;
}
else
{
if (char_len == -1)
warning ("Ignoring invalid multibyte character");
if (wide_flag)
c = wc;
#ifdef MAP_CHARACTER
else
c = MAP_CHARACTER (c);
#endif
}
#else /* ! MULTIBYTE_CHARS */
#ifdef MAP_CHARACTER
c = MAP_CHARACTER (c);
#endif
#endif /* ! MULTIBYTE_CHARS */
}
num_chars++;
if (num_chars > maxtoken - 4)
extend_token_buffer (token_buffer);
token_buffer[num_chars] = c;
if (wide_flag)
{
if (chars_seen == 1) /* only keep the first one */
result = c;
goto tryagain;
}
/* Merge character into result; ignore excess chars. */
num_chars += (width / TYPE_PRECISION (char_type_node));
if (num_chars < max_chars + 1)
{
if (width < HOST_BITS_PER_INT)
@ -1860,19 +1914,16 @@ yylex ()
}
}
token_buffer[num_chars + 1] = '\'';
token_buffer[num_chars + 2] = 0;
if (c != '\'')
error ("malformatted character constant");
else if (num_chars == 0)
else if (chars_seen == 0)
error ("empty character constant");
else if (num_chars > max_chars)
{
num_chars = max_chars;
error ("character constant too long");
}
else if (num_chars != 1 && ! flag_traditional && warn_multichar)
else if (chars_seen != 1 && ! flag_traditional && warn_multichar)
warning ("multi-character character constant");
/* If char type is signed, sign-extend the constant. */
@ -1897,22 +1948,6 @@ yylex ()
}
else
{
#ifdef MULTIBYTE_CHARS
/* Set the initial shift state and convert the next sequence. */
result = 0;
/* In all locales L'\0' is zero and mbtowc will return zero,
so don't use it. */
if (num_chars > 1
|| (num_chars == 1 && token_buffer[1] != '\0'))
{
wchar_t wc;
(void) mbtowc (NULL_PTR, NULL_PTR, 0);
if (mbtowc (& wc, token_buffer + 1, num_chars) == num_chars)
result = wc;
else
warning ("Ignoring invalid multibyte character");
}
#endif
yylval.ttype = build_int_2 (result, 0);
TREE_TYPE (yylval.ttype) = wchar_type_node;
}
@ -1924,7 +1959,13 @@ yylex ()
case '"':
string_constant:
{
c = GETC();
unsigned width = wide_flag ? WCHAR_TYPE_SIZE
: TYPE_PRECISION (char_type_node);
#ifdef MULTIBYTE_CHARS
int longest_char = local_mb_cur_max ();
(void) local_mbtowc (NULL_PTR, NULL_PTR, 0);
#endif
c = GETC ();
p = token_buffer + 1;
while (c != '"' && c >= 0)
@ -1935,9 +1976,8 @@ yylex ()
c = readescape (&ignore);
if (ignore)
goto skipnewline;
if (!wide_flag
&& TYPE_PRECISION (char_type_node) < HOST_BITS_PER_INT
&& c >= (1 << TYPE_PRECISION (char_type_node)))
if (width < HOST_BITS_PER_INT
&& (unsigned) c >= (1 << width))
pedwarn ("escape sequence out of range for character");
}
else if (c == '\n')
@ -1946,15 +1986,94 @@ yylex ()
pedwarn ("ANSI C forbids newline in string constant");
lineno++;
}
else
{
#ifdef MULTIBYTE_CHARS
wchar_t wc;
int i;
int char_len = -1;
for (i = 0; i < longest_char; ++i)
{
if (p + i == token_buffer + maxtoken)
p = extend_token_buffer (p);
p[i] = c;
if (p == token_buffer + maxtoken)
p = extend_token_buffer (p);
*p++ = c;
char_len = local_mbtowc (& wc, p, i + 1);
if (char_len != -1)
break;
c = GETC ();
}
if (char_len == -1)
warning ("Ignoring invalid multibyte character");
else
{
/* mbtowc sometimes needs an extra char before accepting */
if (char_len <= i)
UNGETC (c);
if (wide_flag)
{
*(wchar_t *)p = wc;
p += sizeof (wc);
}
else
p += (i + 1);
c = GETC ();
continue;
}
#endif /* MULTIBYTE_CHARS */
}
/* Add this single character into the buffer either as a wchar_t
or as a single byte. */
if (wide_flag)
{
unsigned width = TYPE_PRECISION (char_type_node);
unsigned bytemask = (1 << width) - 1;
int byte;
if (p + WCHAR_BYTES >= token_buffer + maxtoken)
p = extend_token_buffer (p);
for (byte = 0; byte < WCHAR_BYTES; ++byte)
{
int value;
if (byte >= sizeof (c))
value = 0;
else
value = (c >> (byte * width)) & bytemask;
if (BYTES_BIG_ENDIAN)
p[WCHAR_BYTES - byte - 1] = value;
else
p[byte] = value;
}
p += WCHAR_BYTES;
}
else
{
if (p == token_buffer + maxtoken)
p = extend_token_buffer (p);
*p++ = c;
}
skipnewline:
c = GETC();
c = GETC ();
}
/* Terminate the string value, either with a single byte zero
or with a wide zero. */
if (wide_flag)
{
if (p + WCHAR_BYTES >= token_buffer + maxtoken)
p = extend_token_buffer (p);
bzero (p, WCHAR_BYTES);
p += WCHAR_BYTES;
}
else
{
if (p == token_buffer + maxtoken)
p = extend_token_buffer (p);
*p++ = 0;
}
*p = 0;
if (c < 0)
error ("Unterminated string constant");
@ -1964,52 +2083,27 @@ yylex ()
if (wide_flag)
{
/* If this is a L"..." wide-string, convert the multibyte string
to a wide character string. */
char *widep = (char *) alloca ((p - token_buffer) * WCHAR_BYTES);
int len;
#ifdef MULTIBYTE_CHARS
len = mbstowcs ((wchar_t *) widep, token_buffer + 1, p - token_buffer);
if (len < 0 || len >= (p - token_buffer))
{
warning ("Ignoring invalid multibyte string");
len = 0;
}
bzero (widep + (len * WCHAR_BYTES), WCHAR_BYTES);
#else
{
char *wp, *cp;
wp = widep + (BYTES_BIG_ENDIAN ? WCHAR_BYTES - 1 : 0);
bzero (widep, (p - token_buffer) * WCHAR_BYTES);
for (cp = token_buffer + 1; cp < p; cp++)
*wp = *cp, wp += WCHAR_BYTES;
len = p - token_buffer - 1;
}
#endif
yylval.ttype = build_string ((len + 1) * WCHAR_BYTES, widep);
yylval.ttype = build_string (p - (token_buffer + 1),
token_buffer + 1);
TREE_TYPE (yylval.ttype) = wchar_array_type_node;
value = STRING;
}
else if (objc_flag)
{
/* Return an Objective-C @"..." constant string object. */
yylval.ttype = build_objc_string (p - token_buffer,
yylval.ttype = build_objc_string (p - (token_buffer + 1),
token_buffer + 1);
TREE_TYPE (yylval.ttype) = char_array_type_node;
value = OBJC_STRING;
}
else
{
yylval.ttype = build_string (p - token_buffer, token_buffer + 1);
yylval.ttype = build_string (p - (token_buffer + 1),
token_buffer + 1);
TREE_TYPE (yylval.ttype) = char_array_type_node;
value = STRING;
}
*p++ = '"';
*p = 0;
break;
}

View File

@ -45,6 +45,11 @@ typedef unsigned char U_CHAR;
#include "gansidecl.h"
#include "pcp.h"
#ifdef MULTIBYTE_CHARS
#include "mbchar.h"
#include <locale.h>
#endif /* MULTIBYTE_CHARS */
#ifndef GET_ENVIRONMENT
#define GET_ENVIRONMENT(ENV_VALUE,ENV_NAME) ENV_VALUE = getenv (ENV_NAME)
#endif
@ -1308,6 +1313,12 @@ main (argc, argv)
bzero ((char *) pend_assertions, argc * sizeof (char *));
bzero ((char *) pend_includes, argc * sizeof (char *));
#ifdef MULTIBYTE_CHARS
/* Change to the native locale for multibyte conversions. */
setlocale (LC_CTYPE, "");
GET_ENVIRONMENT (literal_codeset, "LANG");
#endif
/* Process switches and find input file name. */
for (i = 1; i < argc; i++) {
@ -2774,9 +2785,27 @@ do { ip = &instack[indepth]; \
bp += 2;
else if (*bp == '/' && bp[1] == '*') {
bp += 2;
while (!(*bp == '*' && bp[1] == '/'))
bp++;
bp += 2;
while (1)
{
if (*bp == '*')
{
if (bp[1] == '/')
{
bp += 2;
break;
}
}
else
{
#ifdef MULTIBYTE_CHARS
int length;
length = local_mblen (bp, limit - bp);
if (length > 1)
bp += (length - 1);
#endif
}
bp++;
}
}
/* There is no point in trying to deal with C++ // comments here,
because if there is one, then this # must be part of the
@ -2937,6 +2966,24 @@ do { ip = &instack[indepth]; \
if (ibp[-1] == c)
goto while2end;
break;
#ifdef MULTIBYTE_CHARS
default:
{
int length;
--ibp;
length = local_mblen (ibp, limit - ibp);
if (length > 0)
{
--obp;
bcopy (ibp, obp, length);
obp += length;
ibp += length;
}
else
++ibp;
}
break;
#endif
}
}
while2end:
@ -2983,6 +3030,15 @@ do { ip = &instack[indepth]; \
*obp++ = '\n';
++op->lineno;
}
else
{
#ifdef MULTIBYTE_CHARS
int length;
length = local_mblen (ibp, limit - ibp);
if (length > 1)
ibp += (length - 1);
#endif
}
}
break;
}
@ -3071,6 +3127,16 @@ do { ip = &instack[indepth]; \
goto limit_reached;
}
break;
#ifdef MULTIBYTE_CHARS
default:
{
int length;
length = local_mblen (ibp, limit - ibp);
if (length > 1)
ibp += (length - 1);
}
break;
#endif
}
}
comment_end:
@ -3433,11 +3499,27 @@ randomchar:
break;
}
}
if (*ibp == '\n') {
else if (*ibp == '\n') {
/* Newline in a file. Count it. */
++ip->lineno;
++op->lineno;
}
else
{
#ifdef MULTIBYTE_CHARS
int length;
length = local_mblen (ibp, limit - ibp);
if (length > 1)
{
if (put_out_comments)
{
bcopy (ibp, obp, length - 1);
obp += length - 1;
}
ibp += (length - 1);
}
#endif
}
if (put_out_comments)
*obp++ = *ibp;
}
@ -3448,9 +3530,32 @@ randomchar:
} else if (! traditional) {
*obp++ = ' ';
}
for (ibp += 2; *ibp != '\n' || ibp[-1] == '\\'; ibp++)
if (put_out_comments)
*obp++ = *ibp;
for (ibp += 2; ; ibp++)
{
if (*ibp == '\n')
{
if (ibp[-1] != '\\')
break;
}
else
{
#ifdef MULTIBYTE_CHARS
int length;
length = local_mblen (ibp, limit - ibp);
if (length > 1)
{
if (put_out_comments)
{
bcopy (ibp, obp, length - 1);
obp += length - 1;
}
ibp += (length - 1);
}
#endif
}
if (put_out_comments)
*obp++ = *ibp;
}
} else
break;
}
@ -6186,6 +6291,25 @@ collect_expansion (buf, end, nargs, arglist)
}
}
#ifdef MULTIBYTE_CHARS
/* Handle multibyte characters inside string and character literals. */
if (expected_delimiter != '\0')
{
int length;
--p;
length = local_mblen (p, limit - p);
if (length > 1)
{
--exp_p;
bcopy (p, exp_p, length);
p += length;
exp_p += length;
continue;
}
++p;
}
#endif
/* Handle the start of a symbol. */
if (is_idchar[c] && nargs > 0) {
U_CHAR *id_beg = p - 1;
@ -7412,9 +7536,27 @@ skip_if_group (ip, any, op)
bp += 2;
else if (*bp == '/' && bp[1] == '*') {
bp += 2;
while (!(*bp == '*' && bp[1] == '/'))
bp++;
bp += 2;
while (1)
{
if (*bp == '*')
{
if (bp[1] == '/')
{
bp += 2;
break;
}
}
else
{
#ifdef MULTIBYTE_CHARS
int length;
length = local_mblen (bp, endb - bp);
if (length > 1)
bp += (length - 1);
#endif
}
bp++;
}
}
/* There is no point in trying to deal with C++ // comments here,
because if there is one, then this # must be part of the
@ -7458,6 +7600,15 @@ skip_if_group (ip, any, op)
if (bp[1] == '/')
break;
}
else
{
#ifdef MULTIBYTE_CHARS
int length;
length = local_mblen (bp, endb - bp);
if (length > 1)
bp += (length - 1);
#endif
}
}
bp += 2;
} else if (bp[1] == '/' && cplusplus_comments) {
@ -7469,6 +7620,15 @@ skip_if_group (ip, any, op)
warning ("multiline `//' comment");
ip->lineno++;
}
else
{
#ifdef MULTIBYTE_CHARS
int length;
length = local_mblen (bp, endb - bp);
if (length > 1)
bp += (length - 1);
#endif
}
}
} else
break;
@ -7764,6 +7924,15 @@ validate_else (p, limit)
break;
}
}
else
{
#ifdef MULTIBYTE_CHARS
int length;
length = local_mblen (p, limit - p);
if (length > 1)
p += (length - 1);
#endif
}
}
}
else if (cplusplus_comments && p[1] == '/')
@ -7817,6 +7986,22 @@ skip_to_end_of_comment (ip, line_counter, nowarn)
if (op)
++op->lineno;
}
else
{
#ifdef MULTIBYTE_CHARS
int length;
length = local_mblen (bp, limit - bp);
if (length > 1)
{
if (op)
{
bcopy (bp, op->bufp, length - 1);
op->bufp += (length - 1);
}
bp += (length - 1);
}
#endif
}
if (op)
*op->bufp++ = *bp;
}
@ -7854,6 +8039,23 @@ skip_to_end_of_comment (ip, line_counter, nowarn)
return bp;
}
break;
#ifdef MULTIBYTE_CHARS
default:
{
int length;
bp--;
length = local_mblen (bp, limit - bp);
if (length <= 0)
length = 1;
if (op)
{
op->bufp--;
bcopy (bp, op->bufp, length);
op->bufp += length;
}
bp += length;
}
#endif
}
}
@ -7944,6 +8146,16 @@ skip_quoted_string (bp, limit, start_line, count_newlines, backslash_newlines_p,
}
} else if (c == match)
break;
#ifdef MULTIBYTE_CHARS
{
int length;
--bp;
length = local_mblen (bp, limit - bp);
if (length <= 0)
length = 1;
bp += length;
}
#endif
}
return bp;
}
@ -8381,9 +8593,23 @@ macroexpand (hp, op)
else {
if (c == '\\')
escaped = 1;
if (in_string) {
else if (in_string) {
if (c == in_string)
in_string = 0;
else
{
#ifdef MULTIBYTE_CHARS
int length;
length = local_mblen (arg->raw + i, arglen - i);
if (length > 1)
{
bcopy (arg->raw + i, xbuf + totlen, length);
i += length - 1;
totlen += length;
continue;
}
#endif
}
} else if (c == '\"' || c == '\'')
in_string = c;
}
@ -8717,6 +8943,15 @@ macarg1 (start, limit, macro, depthptr, newlines, comments, rest_args)
break;
}
}
else
{
#ifdef MULTIBYTE_CHARS
int length;
length = local_mblen (bp, limit - bp);
if (length > 1)
bp += (length - 1);
#endif
}
}
} else if (bp[1] == '/' && cplusplus_comments) {
*comments = 1;
@ -8728,6 +8963,15 @@ macarg1 (start, limit, macro, depthptr, newlines, comments, rest_args)
if (warn_comments)
warning ("multiline `//' comment");
}
else
{
#ifdef MULTIBYTE_CHARS
int length;
length = local_mblen (bp, limit - bp);
if (length > 1)
bp += (length - 1);
#endif
}
}
}
break;
@ -8751,6 +8995,15 @@ macarg1 (start, limit, macro, depthptr, newlines, comments, rest_args)
if (quotec == '\'')
break;
}
else
{
#ifdef MULTIBYTE_CHARS
int length;
length = local_mblen (bp, limit - bp);
if (length > 1)
bp += (length - 1);
#endif
}
}
}
break;
@ -8828,8 +9081,23 @@ discard_comments (start, length, newlines)
/* Comments are equivalent to spaces. */
obp[-1] = ' ';
ibp++;
while (ibp < limit && (*ibp != '\n' || ibp[-1] == '\\'))
ibp++;
while (ibp < limit)
{
if (*ibp == '\n')
{
if (ibp[-1] != '\\')
break;
}
else
{
#ifdef MULTIBYTE_CHARS
int length = local_mblen (ibp, limit - ibp);
if (length > 1)
ibp += (length - 1);
#endif
}
ibp++;
}
break;
}
if (ibp[0] != '*' || ibp + 1 >= limit)
@ -8849,6 +9117,14 @@ discard_comments (start, length, newlines)
break;
}
}
else
{
#ifdef MULTIBYTE_CHARS
int length = local_mblen (ibp, limit - ibp);
if (length > 1)
ibp += (length - 1);
#endif
}
}
break;
@ -8863,9 +9139,12 @@ discard_comments (start, length, newlines)
*obp++ = c = *ibp++;
if (c == quotec)
break;
if (c == '\n' && quotec == '\'')
break;
if (c == '\\') {
if (c == '\n')
{
if (quotec == '\'')
break;
}
else if (c == '\\') {
if (ibp < limit && *ibp == '\n') {
ibp++;
obp--;
@ -8876,6 +9155,23 @@ discard_comments (start, length, newlines)
*obp++ = *ibp++;
}
}
else
{
#ifdef MULTIBYTE_CHARS
int length;
ibp--;
length = local_mblen (ibp, limit - ibp);
if (length > 1)
{
obp--;
bcopy (ibp, obp, length);
ibp += length;
obp += length;
}
else
ibp++;
#endif
}
}
}
break;
@ -8925,10 +9221,33 @@ change_newlines (start, length)
int quotec = c;
while (ibp < limit) {
*obp++ = c = *ibp++;
if (c == quotec && ibp[-2] != '\\')
break;
if (c == '\n' && quotec == '\'')
break;
if (c == quotec)
{
if (ibp[-2] != '\\')
break;
}
else if (c == '\n')
{
if (quotec == '\'')
break;
}
else
{
#ifdef MULTIBYTE_CHARS
int length;
ibp--;
length = local_mblen (ibp, limit - ibp);
if (length > 1)
{
obp--;
bcopy (ibp, obp, length);
ibp += length;
obp += length;
}
else
ibp++;
#endif
}
}
}
break;

View File

@ -39,12 +39,12 @@ Boston, MA 02111-1307, USA.
#include "system.h"
#include <setjmp.h>
/* #define YYDEBUG 1 */
#include "gansidecl.h"
#ifdef MULTIBYTE_CHARS
#include "mbchar.h"
#include <locale.h>
#endif
#include "gansidecl.h"
#endif /* MULTIBYTE_CHARS */
typedef unsigned char U_CHAR;
@ -641,23 +641,18 @@ yylex ()
{
register HOST_WIDE_INT result = 0;
register int num_chars = 0;
int chars_seen = 0;
unsigned width = MAX_CHAR_TYPE_SIZE;
int max_chars;
char *token_buffer;
if (wide_flag)
{
width = MAX_WCHAR_TYPE_SIZE;
#ifdef MULTIBYTE_CHARS
max_chars = MB_CUR_MAX;
#else
max_chars = 1;
int longest_char = local_mb_cur_max ();
char *token_buffer = (char *) alloca (longest_char);
(void) local_mbtowc (NULL_PTR, NULL_PTR, 0);
#endif
}
else
max_chars = MAX_LONG_TYPE_SIZE / width;
token_buffer = (char *) alloca (max_chars + 1);
max_chars = MAX_LONG_TYPE_SIZE / width;
if (wide_flag)
width = MAX_WCHAR_TYPE_SIZE;
while (1)
{
@ -666,44 +661,96 @@ yylex ()
if (c == '\'' || c == EOF)
break;
++chars_seen;
if (c == '\\')
{
c = parse_escape (&lexptr, mask);
}
else
{
#ifdef MULTIBYTE_CHARS
wchar_t wc;
int i;
int char_len = -1;
for (i = 1; i <= longest_char; ++i)
{
token_buffer[i - 1] = c;
char_len = local_mbtowc (& wc, token_buffer, i);
if (char_len != -1)
break;
c = *lexptr++;
}
if (char_len > 1)
{
/* mbtowc sometimes needs an extra char before accepting */
if (char_len < i)
lexptr--;
if (! wide_flag)
{
/* Merge character into result; ignore excess chars. */
for (i = 1; i <= char_len; ++i)
{
if (i > max_chars)
break;
if (width < HOST_BITS_PER_INT)
result = (result << width)
| (token_buffer[i - 1]
& ((1 << width) - 1));
else
result = token_buffer[i - 1];
}
num_chars += char_len;
continue;
}
}
else
{
if (char_len == -1)
warning ("Ignoring invalid multibyte character");
}
if (wide_flag)
c = wc;
#endif /* ! MULTIBYTE_CHARS */
}
num_chars++;
if (wide_flag)
{
if (chars_seen == 1) /* only keep the first one */
result = c;
continue;
}
/* Merge character into result; ignore excess chars. */
num_chars++;
if (num_chars <= max_chars)
{
if (width < HOST_BITS_PER_WIDE_INT)
result = (result << width) | c;
if (width < HOST_BITS_PER_INT)
result = (result << width) | (c & ((1 << width) - 1));
else
result = c;
token_buffer[num_chars - 1] = c;
}
}
token_buffer[num_chars] = 0;
if (c != '\'')
error ("malformatted character constant");
else if (num_chars == 0)
else if (chars_seen == 0)
error ("empty character constant");
else if (num_chars > max_chars)
{
num_chars = max_chars;
error ("character constant too long");
}
else if (num_chars != 1 && ! traditional)
else if (chars_seen != 1 && ! traditional)
warning ("multi-character character constant");
/* If char type is signed, sign-extend the constant. */
if (! wide_flag)
{
int num_bits = num_chars * width;
if (lookup ((U_CHAR *) "__CHAR_UNSIGNED__",
if (num_bits == 0)
/* We already got an error; avoid invalid shift. */
yylval.integer.value = 0;
else if (lookup ((U_CHAR *) "__CHAR_UNSIGNED__",
sizeof ("__CHAR_UNSIGNED__") - 1, -1)
|| ((result >> (num_bits - 1)) & 1) == 0)
yylval.integer.value
@ -716,22 +763,6 @@ yylex ()
}
else
{
#ifdef MULTIBYTE_CHARS
/* Set the initial shift state and convert the next sequence. */
result = 0;
/* In all locales L'\0' is zero and mbtowc will return zero,
so don't use it. */
if (num_chars > 1
|| (num_chars == 1 && token_buffer[0] != '\0'))
{
wchar_t wc;
(void) mbtowc (NULL_PTR, NULL_PTR, 0);
if (mbtowc (& wc, token_buffer, num_chars) == num_chars)
result = wc;
else
pedwarn ("Ignoring invalid multibyte character");
}
#endif
yylval.integer.value = result;
}
}

View File

@ -84,7 +84,7 @@ AC_DEFINE(ENABLE_CHECKING)
# Enable use of cpplib for C.
cpp_main=cccp
AC_ARG_ENABLE(c-cpplib,
[ --enable-c-cpplib Use cpplib for C.],
[ --enable-c-cpplib Use cpplib for C and C++.],
if [[[ x$enable_c_cpplib != xno ]]]; then
extra_c_objs="${extra_c_objs} cpplib.o cppexp.o cpphash.o cpperror.o"
extra_c_objs="${extra_c_objs} prefix.o"
@ -93,6 +93,13 @@ if [[[ x$enable_c_cpplib != xno ]]]; then
cpp_main=cppmain
fi)
# Enable Multibyte Characters for C/C++
AC_ARG_ENABLE(c-mbchar,
[ --enable-c-mbchar Enable multibyte characters for C and C++.],
if [[[ x$enable_c_mbchar != xno ]]]; then
extra_c_flags=-DMULTIBYTE_CHARS=1
fi)
# Enable Haifa scheduler.
AC_ARG_ENABLE(haifa,
[ --enable-haifa Use the experimental scheduler.
@ -193,6 +200,9 @@ AC_CHECK_FUNCS(strtoul bsearch strerror putenv popen bcopy bzero bcmp \
index rindex strchr strrchr kill getrlimit setrlimit atoll atoq \
sysconf isascii gettimeofday)
# Make sure wchar_t is available
#AC_CHECK_TYPE(wchar_t, unsigned int)
GCC_FUNC_VFPRINTF_DOPRNT
GCC_FUNC_PRINTF_PTR
@ -3585,6 +3595,7 @@ AC_SUBST(extra_programs)
AC_SUBST(extra_parts)
AC_SUBST(extra_c_objs)
AC_SUBST(extra_cxx_objs)
AC_SUBST(extra_cpp_objs)
AC_SUBST(extra_c_flags)
AC_SUBST(extra_objs)
AC_SUBST(host_extra_gcc_objs)

View File

@ -5964,8 +5964,9 @@ the language standard. You should not need to use these options yourself.
@cindex environment variables
This section describes several environment variables that affect how GNU
CC operates. They work by specifying directories or prefixes to use
when searching for various kinds of files.
CC operates. Some of them work by specifying directories or prefixes to use
when searching for various kinds of files. Some are used to specify other
aspects of the compilation environment.
@ifclear INTERNALS
Note that you can also specify places to search using options such as
@ -6065,6 +6066,28 @@ which case the Make rules are written to that file, guessing the target
name from the source file name. Or the value can have the form
@samp{@var{file} @var{target}}, in which case the rules are written to
file @var{file} using @var{target} as the target name.
@item LANG
@findex LANG
@cindex locale definition
This variable is used to pass locale information to the compiler. One way in
which this information is used is to determine the character set to be used
when character literals, string literals and comments are parsed in C and C++.
When the compiler is configured to allow multibyte characters,
the following values for @code{LANG} are recognized:
@table @code
@item C-JIS
Recognize JIS characters.
@item C-SJIS
Recognize SJIS characters.
@item C-EUCJP
Recognize EUCJP characters.
@end table
If @code{LANG} is not defined, or if it has some other value, then the
compiler will use mblen and mbtowc as defined by the default locale to
recognize and translate multibyte characters.
@end table
@node Running Protoize

288
gcc/mbchar.c Normal file
View File

@ -0,0 +1,288 @@
/* Multibyte Character Functions.
Copyright (C) 1998 Free Software Foundation, Inc.
This file is part of GNU CC.
GNU CC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
GNU CC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with GNU CC; see the file COPYING. If not, write to
the Free Software Foundation, 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA. */
/* These functions are used to manipulate multibyte characters. */
/* Note regarding cross compilation:
In general translation of multibyte characters to wide characters can
only work in a native compiler since the translation function (mbtowc)
needs to know about both the source and target character encoding. However,
this particular implementation for JIS, SJIS and EUCJP source characters
will work for any compiler with a newlib target. Other targets may also
work provided that their wchar_t implementation is 2 bytes and the encoding
leaves the source character values unchanged (except for removing the
state shifting markers). */
#ifdef MULTIBYTE_CHARS
#include "config.h"
#include "system.h"
#include "gansidecl.h"
#include "mbchar.h"
#include <locale.h>
/* Input character classes for the JIS decoder.  Every enumerator except
   JIS_C_NUM labels one column of JIS_state_table and JIS_action_table (in the
   same order as the column headers there).  JIS_C_NUM must stay last: it is
   the count of classes and is used as the column dimension of both tables.
   NOTE(review): the code that maps an input byte to one of these classes is
   not visible here -- confirm the mapping before reordering anything.  */
typedef enum
{
ESCAPE, DOLLAR, BRACKET, AT, B, J, NUL, JIS_CHAR, OTHER, JIS_C_NUM
} JIS_CHAR_TYPE;
/* Decoder states; each one labels a row of JIS_state_table and
   JIS_action_table.  As encoded in JIS_state_table:
     ASCII -> A_ESC -> A_ESC_DL -> JIS   on ESCAPE, DOLLAR, then AT or B;
     JIS -> JIS_1 -> JIS_2               on each accepted character;
     J_ESC, J_ESC_BR                     ESCAPE, BRACKET seen after JIS;
     J2_ESC, J2_ESC_BR                   ESCAPE, BRACKET seen after JIS_2;
     J_ESC_BR / J2_ESC_BR -> ASCII       on B or J.
   INV is the next state of every table entry whose action is ERROR.
   JIS_S_NUM must stay last: it is used as the row dimension of both tables.
   ASCII is the first enumerator (value 0) and is also the initial value of
   save_state in local_mbtowc.  */
typedef enum
{
ASCII, A_ESC, A_ESC_DL, JIS, JIS_1, JIS_2, J_ESC, J_ESC_BR,
J2_ESC, J2_ESC_BR, INV, JIS_S_NUM
} JIS_STATE;
/* Actions stored in JIS_action_table, one per (state, character class) pair.
   NOTE(review): the code that carries out these actions is outside this view;
   judging by the names, COPYA/COPYJ/COPYJ2 presumably emit a character,
   MAKE_A/MAKE_J presumably switch character set, NOOP consumes input without
   output, EMPTY handles NUL in the ASCII state and ERROR rejects the input --
   confirm against the decoding loop.  MAKE_A does not appear anywhere in
   JIS_action_table.  COPYA is the first enumerator (value 0).  */
typedef enum
{
COPYA, COPYJ, COPYJ2, MAKE_A, MAKE_J, NOOP, EMPTY, ERROR
} JIS_ACTION;
/*****************************************************************************
* state/action tables for processing JIS encoding
* Where possible, switches to JIS are grouped with the following JIS characters
* and switches to ASCII are grouped with preceding JIS characters.
* Thus, maximum returned length is:
* 2 (switch to JIS) + 2 (JIS characters) + 2 (switch back to ASCII) = 6.
*****************************************************************************/
/* Next-state table: JIS_state_table[current state][character class] gives the
   state to move to.  Rows follow the order of JIS_STATE and columns the order
   of JIS_CHAR_TYPE.
   NOTE(review): the array is dimensioned with JIS_S_NUM rows, but only the
   ten rows ASCII..J2_ESC_BR are listed; there is no row for INV.  Under the C
   rules for partially initialized arrays that last row is zero-filled, i.e.
   every entry is ASCII.  Presumably INV is never used as a lookup row --
   confirm against the decoding loop.  */
static JIS_STATE JIS_state_table[JIS_S_NUM][JIS_C_NUM] = {
/* ESCAPE DOLLAR BRACKET AT B J NUL JIS_CHAR OTHER*/
/*ASCII*/ { A_ESC, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII,ASCII,ASCII},
/*A_ESC*/ { ASCII, A_ESC_DL,ASCII, ASCII, ASCII, ASCII, ASCII,ASCII,ASCII},
/*A_ESC_DL*/{ ASCII, ASCII, ASCII, JIS, JIS, ASCII, ASCII,ASCII,ASCII},
/*JIS*/ { J_ESC, JIS_1, JIS_1, JIS_1, JIS_1, JIS_1, INV, JIS_1,INV },
/*JIS_1*/ { INV, JIS_2, JIS_2, JIS_2, JIS_2, JIS_2, INV, JIS_2,INV },
/*JIS_2*/ { J2_ESC,JIS, JIS, JIS, JIS, JIS, INV, JIS, JIS },
/*J_ESC*/ { INV, INV, J_ESC_BR, INV, INV, INV, INV, INV, INV },
/*J_ESC_BR*/{ INV, INV, INV, INV, ASCII, ASCII, INV, INV, INV },
/*J2_ESC*/ { INV, INV, J2_ESC_BR,INV, INV, INV, INV, INV, INV },
/*J2_ESC_BR*/{INV, INV, INV, INV, ASCII, ASCII, INV, INV, INV },
};
/* Action table for the C-JIS scanner.  JIS_action_table[S][C] is the
   JIS_ACTION carried out when a byte of class C (a JIS_CHAR_TYPE) is
   read in state S (a JIS_STATE).  It is looked up with the state held
   before the transition, i.e. before JIS_state_table is consulted for
   the same byte.  Rows and columns are laid out exactly as in
   JIS_state_table.  */
static JIS_ACTION JIS_action_table[JIS_S_NUM][JIS_C_NUM] = {
/* ESCAPE DOLLAR BRACKET AT B J NUL JIS_CHAR OTHER */
/*ASCII */ {NOOP, COPYA, COPYA, COPYA, COPYA, COPYA, EMPTY, COPYA, COPYA},
/*A_ESC */ {COPYA, NOOP, COPYA, COPYA, COPYA, COPYA, COPYA, COPYA, COPYA},
/*A_ESC_DL */{COPYA, COPYA, COPYA, MAKE_J, MAKE_J, COPYA, COPYA, COPYA, COPYA},
/*JIS */ {NOOP, NOOP, NOOP, NOOP, NOOP, NOOP, ERROR, NOOP, ERROR },
/*JIS_1 */ {ERROR, NOOP, NOOP, NOOP, NOOP, NOOP, ERROR, NOOP, ERROR },
/*JIS_2 */ {NOOP, COPYJ2,COPYJ2,COPYJ2, COPYJ2, COPYJ2,ERROR, COPYJ2,COPYJ2},
/*J_ESC */ {ERROR, ERROR, NOOP, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR },
/*J_ESC_BR */{ERROR, ERROR, ERROR, ERROR, NOOP, NOOP, ERROR, ERROR, ERROR },
/*J2_ESC */ {ERROR, ERROR, NOOP, ERROR, ERROR, ERROR, ERROR, ERROR, ERROR },
/*J2_ESC_BR*/{ERROR, ERROR, ERROR, ERROR, COPYJ, COPYJ, ERROR, ERROR, ERROR },
};
/* Name of the encoding used to decode multibyte characters.  The
   functions in this file give special treatment to "C-SJIS", "C-EUCJP"
   and "C-JIS"; NULL, a name of at most one character, or any other name
   selects the default handling.  */
char *literal_codeset = NULL;
/* Convert the multibyte character starting at S, looking at no more
   than N bytes, into a wide character stored through PWC (unless PWC
   is NULL).  The encoding is selected by literal_codeset.

   If S is NULL, report whether the encoding is state-dependent:
   1 for "C-JIS" (the saved shift state is also reset to ASCII),
   0 for "C-SJIS" and "C-EUCJP"; otherwise 0 when CROSS_COMPILE is
   defined, else whatever mbtowc reports.

   If S is not NULL:
     - N == 0 yields -1.
     - "C-SJIS" / "C-EUCJP": returns 2 for a valid two-byte character,
       1 for a single byte, 0 for a NUL byte, and -1 for a lead byte
       that is truncated or followed by an invalid trail byte.
     - "C-JIS": returns the number of bytes consumed, including any
       escape sequences grouped with the character, or -1 on an
       invalid sequence or when N bytes are not enough.  The shift
       state is remembered between successful calls.
     - Any other codeset: one byte is converted when CROSS_COMPILE is
       defined, otherwise the call is forwarded to mbtowc.  */
int
local_mbtowc (pwc, s, n)
     wchar_t *pwc;
     const char *s;
     size_t n;
{
  static JIS_STATE save_state = ASCII;
  JIS_STATE curr_state = save_state;
  unsigned char *t = (unsigned char *)s;

  if (s != NULL && n == 0)
    return -1;

  if (literal_codeset == NULL || strlen (literal_codeset) <= 1)
    {
      /* This must be the "C" locale or unknown locale -- fall thru */
    }
  else if (! strcmp (literal_codeset, "C-SJIS"))
    {
      int char1;

      if (s == NULL)
	return 0;  /* not state-dependent */

      char1 = *t;
      if (ISSJIS1 (char1))
	{
	  int char2;

	  /* Make sure a second byte is available before reading it;
	     t[1] lies outside the caller's buffer when n == 1.  */
	  if (n <= 1)
	    return -1;

	  char2 = t[1];
	  if (ISSJIS2 (char2))
	    {
	      if (pwc != NULL)
		*pwc = (((wchar_t)*t) << 8) + (wchar_t)(*(t+1));
	      return 2;
	    }
	  return -1;
	}

      if (pwc != NULL)
	*pwc = (wchar_t)*t;
      if (*t == '\0')
	return 0;
      return 1;
    }
  else if (! strcmp (literal_codeset, "C-EUCJP"))
    {
      int char1;

      if (s == NULL)
	return 0;  /* not state-dependent */

      char1 = *t;
      if (ISEUCJP (char1))
	{
	  int char2;

	  /* Make sure a second byte is available before reading it;
	     t[1] lies outside the caller's buffer when n == 1.  */
	  if (n <= 1)
	    return -1;

	  char2 = t[1];
	  if (ISEUCJP (char2))
	    {
	      if (pwc != NULL)
		*pwc = (((wchar_t)*t) << 8) + (wchar_t)(*(t+1));
	      return 2;
	    }
	  return -1;
	}

      if (pwc != NULL)
	*pwc = (wchar_t)*t;
      if (*t == '\0')
	return 0;
      return 1;
    }
  else if (! strcmp (literal_codeset, "C-JIS"))
    {
      JIS_ACTION action;
      JIS_CHAR_TYPE ch;
      unsigned char *ptr;
      int i, curr_ch;

      if (s == NULL)
	{
	  save_state = ASCII;
	  return 1;  /* state-dependent */
	}

      /* PTR marks the start of the character data; it is moved past
	 a switch-to-JIS escape sequence by MAKE_A / MAKE_J.  */
      ptr = t;

      for (i = 0; (size_t) i < n; ++i)
	{
	  /* Classify the current byte.  */
	  curr_ch = t[i];
	  switch (curr_ch)
	    {
	    case JIS_ESC_CHAR:
	      ch = ESCAPE;
	      break;
	    case '$':
	      ch = DOLLAR;
	      break;
	    case '@':
	      ch = AT;
	      break;
	    case '(':
	      ch = BRACKET;
	      break;
	    case 'B':
	      ch = B;
	      break;
	    case 'J':
	      ch = J;
	      break;
	    case '\0':
	      ch = NUL;
	      break;
	    default:
	      if (ISJIS (curr_ch))
		ch = JIS_CHAR;
	      else
		ch = OTHER;
	    }

	  /* The action is selected with the state held before the
	     transition, so it must be fetched first.  */
	  action = JIS_action_table[curr_state][ch];
	  curr_state = JIS_state_table[curr_state][ch];

	  switch (action)
	    {
	    case NOOP:
	      break;
	    case EMPTY:
	      if (pwc != NULL)
		*pwc = (wchar_t)0;
	      save_state = curr_state;
	      return i;
	    case COPYA:
	      if (pwc != NULL)
		*pwc = (wchar_t)*ptr;
	      save_state = curr_state;
	      return (i + 1);
	    case COPYJ:
	      if (pwc != NULL)
		*pwc = (((wchar_t)*ptr) << 8) + (wchar_t)(*(ptr+1));
	      save_state = curr_state;
	      return (i + 1);
	    case COPYJ2:
	      /* The current byte was only looked at, not consumed: the
		 count stops right after the two-byte character.  */
	      if (pwc != NULL)
		*pwc = (((wchar_t)*ptr) << 8) + (wchar_t)(*(ptr+1));
	      save_state = curr_state;
	      return (ptr - t) + 2;
	    case MAKE_A:
	    case MAKE_J:
	      ptr = t + i + 1;
	      break;
	    case ERROR:
	    default:
	      /* save_state is left untouched on failure.  */
	      return -1;
	    }
	}

      return -1;  /* n < bytes needed */
    }

#ifdef CROSS_COMPILE
  if (s == NULL)
    return 0;  /* not state-dependent */

  if (pwc != NULL)
    *pwc = *s;
  return 1;
#else
  /* This must be the "C" locale or unknown locale. */
  return mbtowc (pwc, s, n);
#endif
}
/* Return what local_mbtowc returns for the multibyte character at S
   (at most N bytes examined) when no wide character is stored.  */
int
local_mblen (s, n)
     const char *s;
     size_t n;
{
  int length;

  length = local_mbtowc (NULL, s, n);
  return length;
}
/* Return the maximum number of bytes in one multibyte character for
   the encoding named by literal_codeset.  */
int
local_mb_cur_max ()
{
  /* Names of at most one character (and NULL) are never looked up.  */
  if (literal_codeset != NULL && strlen (literal_codeset) > 1)
    {
      if (strcmp (literal_codeset, "C-SJIS") == 0
	  || strcmp (literal_codeset, "C-EUCJP") == 0)
	return 2;

      if (strcmp (literal_codeset, "C-JIS") == 0)
	return 8;  /* 3 + 2 + 3 */
    }

  /* Anything else gets the default.  */
#ifdef CROSS_COMPILE
  return 1;
#else
  return MB_CUR_MAX;
#endif
}
#endif /* MULTIBYTE_CHARS */

25
gcc/mbchar.h Normal file
View File

@ -0,0 +1,25 @@
/* mbchar.h - Various declarations for functions found in mbchar.c
Copyright (C) 1998 Free Software Foundation, Inc.
*/
#ifndef __GCC_MBCHAR_H__
#define __GCC_MBCHAR_H__
#ifdef MULTIBYTE_CHARS
/* Escape character used for JIS encoding.  */
#define JIS_ESC_CHAR 0x1b

/* Byte-range predicates used when decoding multibyte characters:
     ISSJIS1 - first byte of a two-byte SJIS character
     ISSJIS2 - second byte of a two-byte SJIS character
     ISEUCJP - either byte of a two-byte EUCJP character
     ISJIS   - byte accepted as part of a JIS character
   C is evaluated more than once, so it must not have side effects.
   Each range test is parenthesized explicitly so that the grouping of
   `&&' inside `||' does not depend on operator precedence.  */
#define ISSJIS1(c) (((c) >= 0x81 && (c) <= 0x9f) || ((c) >= 0xe0 && (c) <= 0xef))
#define ISSJIS2(c) (((c) >= 0x40 && (c) <= 0x7e) || ((c) >= 0x80 && (c) <= 0xfc))
#define ISEUCJP(c) ((c) >= 0xa1 && (c) <= 0xfe)
#define ISJIS(c) ((c) >= 0x21 && (c) <= 0x7e)
/* Convert one multibyte character to a wide character, decoding it
   according to literal_codeset.  Arguments: where to store the wide
   character (may be NULL), the bytes to convert, and the maximum
   number of bytes to examine.  */
int local_mbtowc PROTO ((wchar_t *, const char *, size_t));
/* Same as local_mbtowc, but no wide character is stored.  */
int local_mblen PROTO ((const char *, size_t));
/* Maximum number of bytes in one multibyte character for the encoding
   named by literal_codeset.  */
int local_mb_cur_max PROTO ((void));
/* The locale being used for multibyte characters in string/char literals. */
extern char *literal_codeset;
#endif /* MULTIBYTE_CHARS */
#endif /* __GCC_MBCHAR_H__ */