cppcharset.c (one_utf8_to_cppchar, [...]): New functions.

* cppcharset.c (one_utf8_to_cppchar, one_cppchar_to_utf8,
	one_utf8_to_utf32, one_utf32_to_utf8, one_utf8_to_utf16,
	one_utf16_to_utf8, conversion_loop, convert_utf8_utf16,
	convert_utf8_utf32, convert_utf16_utf8,	convert_utf32_utf8,
	convert_no_conversion, convert_using_iconv): New functions.
	(APPLY_CONVERSION): New macro.
	(struct conversion, conversion_tab): New data structure.
	(init_iconv_desc): Check conversion_tab for a custom conversion
	primitive before trying to use iconv.
	(convert_cset): Deleted.
	(cpp_init_iconv): Use UTF- terminology, not UCS-.
	(_cpp_destroy_iconv): Update to match.
	(_cpp_valid_ucn): We don't need iconv to implement UCNs.
	(convert_ucn): Use one_cppchar_to_utf8 and APPLY_CONVERSION.
	(convert_escape, cpp_interpret_string): Use APPLY_CONVERSION.
	(_cpp_interpret_string_notranslate): New function, moved here
	from cpplib.c.

	* cpphash.h (convert_f, struct cset_converter): New types.
	(struct cpp_reader): narrow_cset_desc and wide_cset_desc
	are now struct cset_converter, not bare iconv_t.
	Update prototypes.
	* cpplib.c (interpret_string_notranslate): Moved to cppcharset.c;
	all callers changed.

From-SVN: r69204
This commit is contained in:
Zack Weinberg 2003-07-10 23:16:31 +00:00
parent 38f4680f54
commit 6b88314c65
4 changed files with 633 additions and 156 deletions

View File

@ -1,11 +1,38 @@
2003-07-10 Zack Weinberg <zack@codesourcery.com>
* cppcharset.c (one_utf8_to_cppchar, one_cppchar_to_utf8,
one_utf8_to_utf32, one_utf32_to_utf8, one_utf8_to_utf16,
one_utf16_to_utf8, conversion_loop, convert_utf8_utf16,
convert_utf8_utf32, convert_utf16_utf8, convert_utf32_utf8,
convert_no_conversion, convert_using_iconv): New functions.
(APPLY_CONVERSION): New macro.
(struct conversion, conversion_tab): New data structure.
(init_iconv_desc): Check conversion_tab for a custom conversion
primitive before trying to use iconv.
(convert_cset): Deleted.
(cpp_init_iconv): Use UTF- terminology, not UCS-.
(_cpp_destroy_iconv): Update to match.
(_cpp_valid_ucn): We don't need iconv to implement UCNs.
(convert_ucn): Use one_cppchar_to_utf8 and APPLY_CONVERSION.
(convert_escape, cpp_interpret_string): Use APPLY_CONVERSION.
(_cpp_interpret_string_notranslate): New function, moved here
from cpplib.c.
* cpphash.h (convert_f, struct cset_converter): New types.
(struct cpp_reader): narrow_cset_desc and wide_cset_desc
are now struct cset_converter, not bare iconv_t.
Update prototypes.
* cpplib.c (interpret_string_notranslate): Moved to cppcharset.c;
all callers changed.
2003-07-10 Kelley Cook <kelleycook@wideopenwest.com>
* Makefile.in (options.h): Depend on Makefile. Add move-if-change
to opts.sh command line.
* opts.sh: Write to temporary files with a move-if-change at the end.
2003-06-30 Denis Chertykov <denisc@overta.ru>
Richard Kenner <kenner@vlsi1.ultra.nyu.edu>
Richard Kenner <kenner@vlsi1.ultra.nyu.edu>
* combine.c (gen_binary): Handle the CLOBBER rtx and
don't build a binary operation with it.
@ -319,7 +346,7 @@
2003-07-09 Nathanael Nerode <neroden@gcc.gnu.org>
PR bootstrap/11043
* config/arc/t-arc: Replace bogus references to "x-crtinit.o",
* config/arc/t-arc: Replace bogus references to "x-crtinit.o",
"x-crtfini.o" with "crtinit.o", "crtfini.o".
* fixinc/inclhack.def (limits_ifndefs): Add select test.

View File

@ -92,8 +92,7 @@ Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
#error "Unrecognized basic host character set"
#endif
/* This structure is used for a resizable string buffer, mostly by
convert_cset and cpp_interpret_string. */
/* This structure is used for a resizable string buffer throughout. */
struct strbuf
{
uchar *text;
@ -103,23 +102,545 @@ struct strbuf
/* This is enough to hold any string that fits on a single 80-column
line, even if iconv quadruples its size (e.g. conversion from
ASCII to UCS-4) rounded up to a power of two. */
ASCII to UTF-32) rounded up to a power of two. */
#define OUTBUF_BLOCK_SIZE 256
/* Subroutine of cpp_init_iconv: initialize and return an iconv
descriptor for conversion from FROM to TO. If iconv_open() fails,
issue an error and return (iconv_t) -1. Silently return
(iconv_t) -1 if FROM and TO are identical. */
static iconv_t
/* Conversions between UTF-8 and UTF-16/32 are implemented by custom
logic. This is because a depressing number of systems lack iconv,
or have have iconv libraries that do not do these conversions, so
we need a fallback implementation for them. To ensure the fallback
doesn't break due to neglect, it is used on all systems.
UTF-32 encoding is nice and simple: a four-byte binary number,
constrained to the range 00000000-7FFFFFFF to avoid questions of
signedness. We do have to cope with big- and little-endian
variants.
UTF-16 encoding uses two-byte binary numbers, again in big- and
little-endian variants, for all values in the 00000000-0000FFFF
range. Values in the 00010000-0010FFFF range are encoded as pairs
of two-byte numbers, called "surrogate pairs": given a number S in
this range, it is mapped to a pair (H, L) as follows:
H = (S - 0x10000) / 0x400 + 0xD800
L = (S - 0x10000) % 0x400 + 0xDC00
Two-byte values in the D800...DFFF range are ill-formed except as a
component of a surrogate pair. Even if the encoding within a
two-byte value is little-endian, the H member of the surrogate pair
comes first.
There is no way to encode values in the 00110000-7FFFFFFF range,
which is not currently a problem as there are no assigned code
points in that range; however, the author expects that it will
eventually become necessary to abandon UTF-16 due to this
limitation. Note also that, because of these pairs, UTF-16 does
not meet the requirements of the C standard for a wide character
encoding (see 3.7.3 and 6.4.4.4p11).
UTF-8 encoding looks like this:
value range encoded as
00000000-0000007F 0xxxxxxx
00000080-000007FF 110xxxxx 10xxxxxx
00000800-0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
00010000-001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
00200000-03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
04000000-7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
never occur. Note also that any value that can be encoded by a
given row of the table can also be encoded by all successive rows,
but this is not done; only the shortest possible encoding for any
given value is valid. For instance, the character 07C0 could be
encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
FC 80 80 80 9F 80. Only the first is valid.
An implementation note: the transformation from UTF-16 to UTF-8, or
vice versa, is easiest done by using UTF-32 as an intermediary. */
/* Internal primitives which go from an UTF-8 byte stream to native-endian
UTF-32 in a cppchar_t, or vice versa; this avoids an extra marshal/unmarshal
operation in several places below. */
static inline int
one_utf8_to_cppchar (const uchar **inbufp, size_t *inbytesleftp,
cppchar_t *cp)
{
static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 };
static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
cppchar_t c;
const uchar *inbuf = *inbufp;
size_t nbytes, i;
if (*inbytesleftp < 1)
return EINVAL;
c = *inbuf;
if (c < 0x80)
{
*cp = c;
*inbytesleftp -= 1;
*inbufp += 1;
return 0;
}
/* The number of leading 1-bits in the first byte indicates how many
bytes follow. */
for (nbytes = 2; nbytes < 7; nbytes++)
if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
goto found;
return EILSEQ;
found:
if (*inbytesleftp < nbytes)
return EINVAL;
c = (c & masks[nbytes-1]);
inbuf++;
for (i = 1; i < nbytes; i++)
{
cppchar_t n = *inbuf++;
if ((n & 0xC0) != 0x80)
return EILSEQ;
c = ((c << 6) + (n & 0x3F));
}
/* Make sure the shortest possible encoding was used. */
if (c <= 0x7F && nbytes > 1) return EILSEQ;
if (c <= 0x7FF && nbytes > 2) return EILSEQ;
if (c <= 0xFFFF && nbytes > 3) return EILSEQ;
if (c <= 0x1FFFFF && nbytes > 4) return EILSEQ;
if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ;
/* Make sure the character is valid. */
if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ;
*cp = c;
*inbufp = inbuf;
*inbytesleftp -= nbytes;
return 0;
}
static inline int
one_cppchar_to_utf8 (cppchar_t c, uchar **outbufp, size_t *outbytesleftp)
{
static const uchar masks[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
size_t nbytes;
uchar buf[6], *p = &buf[6];
uchar *outbuf = *outbufp;
nbytes = 1;
if (c < 0x80)
*--p = c;
else
{
do
{
*--p = ((c & 0x3F) | 0x80);
c >>= 6;
nbytes++;
}
while (c >= 0x3F || (c & limits[nbytes-1]));
*--p = (c | masks[nbytes-1]);
}
if (*outbytesleftp < nbytes)
return E2BIG;
while (p < &buf[6])
*outbuf++ = *p++;
*outbytesleftp -= nbytes;
*outbufp = outbuf;
return 0;
}
/* The following four functions transform one character between the two
encodings named in the function name. All have the signature
int (*)(iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
uchar **outbufp, size_t *outbytesleftp)
BIGEND must have the value 0 or 1, coerced to (iconv_t); it is
interpreted as a boolean indicating whether big-endian or
little-endian encoding is to be used for the member of the pair
that is not UTF-8.
INBUFP, INBYTESLEFTP, OUTBUFP, OUTBYTESLEFTP work exactly as they
do for iconv.
The return value is either 0 for success, or an errno value for
failure, which may be E2BIG (need more space), EILSEQ (ill-formed
input sequence), ir EINVAL (incomplete input sequence). */
static inline int
one_utf8_to_utf32 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
uchar **outbufp, size_t *outbytesleftp)
{
uchar *outbuf;
cppchar_t s;
int rval;
/* Check for space first, since we know exactly how much we need. */
if (*outbytesleftp < 4)
return E2BIG;
rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
if (rval)
return rval;
outbuf = *outbufp;
outbuf[bigend ? 3 : 0] = (s & 0x000000FF);
outbuf[bigend ? 2 : 1] = (s & 0x0000FF00) >> 8;
outbuf[bigend ? 1 : 2] = (s & 0x00FF0000) >> 16;
outbuf[bigend ? 0 : 3] = (s & 0xFF000000) >> 24;
*outbufp += 4;
*outbytesleftp -= 4;
return 0;
}
static inline int
one_utf32_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
uchar **outbufp, size_t *outbytesleftp)
{
cppchar_t s;
int rval;
const uchar *inbuf;
if (*inbytesleftp < 4)
return EINVAL;
inbuf = *inbufp;
s = inbuf[bigend ? 0 : 3] << 24;
s += inbuf[bigend ? 1 : 2] << 16;
s += inbuf[bigend ? 2 : 1] << 8;
s += inbuf[bigend ? 3 : 0];
if (s >= 0x7FFFFFFF || (s >= 0xD800 && s <= 0xDFFF))
return EILSEQ;
rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
if (rval)
return rval;
*inbufp += 4;
*inbytesleftp -= 4;
return 0;
}
static inline int
one_utf8_to_utf16 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
uchar **outbufp, size_t *outbytesleftp)
{
int rval;
cppchar_t s;
const uchar *save_inbuf = *inbufp;
size_t save_inbytesleft = *inbytesleftp;
uchar *outbuf = *outbufp;
rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
if (rval)
return rval;
if (s > 0x0010FFFF)
{
*inbufp = save_inbuf;
*inbytesleftp = save_inbytesleft;
return EILSEQ;
}
if (s < 0xFFFF)
{
if (*outbytesleftp < 2)
{
*inbufp = save_inbuf;
*inbytesleftp = save_inbytesleft;
return E2BIG;
}
outbuf[bigend ? 1 : 0] = (s & 0x00FF);
outbuf[bigend ? 0 : 1] = (s & 0xFF00) >> 8;
*outbufp += 2;
*outbytesleftp -= 2;
return 0;
}
else
{
cppchar_t hi, lo;
if (*outbytesleftp < 4)
{
*inbufp = save_inbuf;
*inbytesleftp = save_inbytesleft;
return E2BIG;
}
hi = (s - 0x10000) / 0x400 + 0xD800;
lo = (s - 0x10000) % 0x400 + 0xDC00;
/* Even if we are little-endian, put the high surrogate first.
??? Matches practice? */
outbuf[bigend ? 1 : 0] = (hi & 0x00FF);
outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8;
outbuf[bigend ? 3 : 2] = (lo & 0x00FF);
outbuf[bigend ? 2 : 3] = (lo & 0xFF00) >> 8;
*outbufp += 4;
*outbytesleftp -= 4;
return 0;
}
}
static inline int
one_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
uchar **outbufp, size_t *outbytesleftp)
{
cppchar_t s;
const uchar *inbuf = *inbufp;
int rval;
if (*inbytesleftp < 2)
return EINVAL;
s = inbuf[bigend ? 0 : 1] << 8;
s += inbuf[bigend ? 1 : 0];
/* Low surrogate without immediately preceding high surrogate is invalid. */
if (s >= 0xDC00 && s <= 0xDFFF)
return EILSEQ;
/* High surrogate must have a following low surrogate. */
else if (s >= 0xD800 && s <= 0xDBFF)
{
cppchar_t hi = s, lo;
if (*inbytesleftp < 4)
return EINVAL;
lo = inbuf[bigend ? 2 : 3] << 8;
lo += inbuf[bigend ? 3 : 2];
if (lo < 0xDC00 || lo > 0xDFFF)
return EILSEQ;
s = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000;
}
rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
if (rval)
return rval;
/* Success - update the input pointers (one_cppchar_to_utf8 has done
the output pointers for us). */
if (s <= 0xFFFF)
{
*inbufp += 2;
*inbytesleftp -= 2;
}
else
{
*inbufp += 4;
*inbytesleftp -= 4;
}
return 0;
}
/* Helper routine for the next few functions. The 'const' on
one_conversion means that we promise not to modify what function is
pointed to, which lets the inliner see through it. */
static inline bool
conversion_loop (int (*const one_conversion)(iconv_t, const uchar **, size_t *,
uchar **, size_t *),
iconv_t cd, const uchar *from, size_t flen, struct strbuf *to)
{
const uchar *inbuf;
uchar *outbuf;
size_t inbytesleft, outbytesleft;
int rval;
inbuf = from;
inbytesleft = flen;
outbuf = to->text + to->len;
outbytesleft = to->asize - to->len;
for (;;)
{
do
rval = one_conversion (cd, &inbuf, &inbytesleft,
&outbuf, &outbytesleft);
while (inbytesleft && !rval);
if (__builtin_expect (inbytesleft == 0, 1))
{
to->len = to->asize - outbytesleft;
return true;
}
if (rval != E2BIG)
{
errno = rval;
return false;
}
outbytesleft += OUTBUF_BLOCK_SIZE;
to->asize += OUTBUF_BLOCK_SIZE;
to->text = xrealloc (to->text, to->asize);
outbuf = to->text + to->asize - outbytesleft;
}
}
/* These functions convert entire strings between character sets.
They all have the signature
bool (*)(iconv_t cd, const uchar *from, size_t flen, struct strbuf *to);
The input string FROM is converted as specified by the function
name plus the iconv descriptor CD (which may be fake), and the
result appended to TO. On any error, false is returned, otherwise true. */
/* These four use the custom conversion code above. */
static bool
convert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
struct strbuf *to)
{
return conversion_loop (one_utf8_to_utf16, cd, from, flen, to);
}
static bool
convert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
struct strbuf *to)
{
return conversion_loop (one_utf8_to_utf32, cd, from, flen, to);
}
static bool
convert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
struct strbuf *to)
{
return conversion_loop (one_utf16_to_utf8, cd, from, flen, to);
}
static bool
convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
struct strbuf *to)
{
return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
}
/* Identity conversion, used when we have no alternative. */
static bool
convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
const uchar *from, size_t flen, struct strbuf *to)
{
if (to->len + flen > to->asize)
{
to->asize = to->len + flen;
to->text = xrealloc (to->text, to->asize);
}
memcpy (to->text + to->len, from, flen);
to->len += flen;
return true;
}
/* And this one uses the system iconv primitive. It's a little
different, since iconv's interface is a little different. */
static bool
convert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
struct strbuf *to)
{
ICONV_CONST char *inbuf;
char *outbuf;
size_t inbytesleft, outbytesleft;
/* Reset conversion descriptor and check that it is valid. */
if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
return false;
inbuf = (ICONV_CONST char *)from;
inbytesleft = flen;
outbuf = (char *)to->text + to->len;
outbytesleft = to->asize - to->len;
for (;;)
{
iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
if (__builtin_expect (inbytesleft == 0, 1))
{
to->len = to->asize - outbytesleft;
return true;
}
if (errno != E2BIG)
return false;
outbytesleft += OUTBUF_BLOCK_SIZE;
to->asize += OUTBUF_BLOCK_SIZE;
to->text = xrealloc (to->text, to->asize);
outbuf = (char *)to->text + to->asize - outbytesleft;
}
}
/* Arrange for the above custom conversion logic to be used automatically
when conversion between a suitable pair of character sets is requested. */
#define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
CONVERTER.func (CONVERTER.cd, FROM, FLEN, TO)
struct conversion
{
const char *pair;
convert_f func;
iconv_t fake_cd;
};
static const struct conversion conversion_tab[] = {
{ "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 },
{ "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 },
{ "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 },
{ "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 },
{ "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 },
{ "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
{ "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
{ "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
};
/* Subroutine of cpp_init_iconv: initialize and return a
cset_converter structure for conversion from FROM to TO. If
iconv_open() fails, issue an error and return an identity
converter. Silently return an identity converter if FROM and TO
are identical. */
static struct cset_converter
init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
{
iconv_t dsc;
struct cset_converter ret;
char *pair;
size_t i;
if (!strcasecmp (to, from))
{
ret.func = convert_no_conversion;
ret.cd = (iconv_t) -1;
return ret;
}
if (!strcmp (to, from))
return (iconv_t) -1;
pair = alloca(strlen(to) + strlen(from) + 2);
dsc = iconv_open (to, from);
if (dsc == (iconv_t) -1)
strcpy(pair, from);
strcat(pair, "/");
strcat(pair, to);
for (i = 0; i < ARRAY_SIZE (conversion_tab); i++)
if (!strcasecmp (pair, conversion_tab[i].pair))
{
ret.func = conversion_tab[i].func;
ret.cd = conversion_tab[i].fake_cd;
return ret;
}
/* No custom converter - try iconv. */
ret.func = convert_using_iconv;
ret.cd = iconv_open (to, from);
if (ret.cd == (iconv_t) -1)
{
if (errno == EINVAL)
cpp_error (pfile, DL_ERROR, /* XXX should be DL_SORRY */
@ -127,8 +648,10 @@ init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
from, to);
else
cpp_errno (pfile, DL_ERROR, "iconv_open");
ret.func = convert_no_conversion;
}
return dsc;
return ret;
}
/* If charset conversion is requested, initialize iconv(3) descriptors
@ -146,9 +669,9 @@ cpp_init_iconv (cpp_reader *pfile)
bool be = CPP_OPTION (pfile, bytes_big_endian);
if (CPP_OPTION (pfile, wchar_precision) >= 32)
default_wcset = be ? "UCS-4BE" : "UCS-4LE";
default_wcset = be ? "UTF-32BE" : "UTF-32LE";
else if (CPP_OPTION (pfile, wchar_precision) >= 16)
default_wcset = be ? "UCS-2BE" : "UCS-2LE";
default_wcset = be ? "UTF-16BE" : "UTF-16LE";
else
/* This effectively means that wide strings are not supported,
so don't do any conversion at all. */
@ -181,67 +704,13 @@ _cpp_destroy_iconv (cpp_reader *pfile)
{
if (HAVE_ICONV)
{
if (pfile->narrow_cset_desc != (iconv_t) -1)
iconv_close (pfile->narrow_cset_desc);
if (pfile->wide_cset_desc != (iconv_t) -1)
iconv_close (pfile->wide_cset_desc);
if (pfile->narrow_cset_desc.func == convert_using_iconv)
iconv_close (pfile->narrow_cset_desc.cd);
if (pfile->wide_cset_desc.func == convert_using_iconv)
iconv_close (pfile->wide_cset_desc.cd);
}
}
/* iconv(3) utility wrapper. Convert the string FROM, of length FLEN,
according to the iconv descriptor CD. The result is appended to
the string buffer TO. If DESC is (iconv_t)-1 or iconv is not
available, the string is simply copied into TO.
Returns true on success, false on error. */
static bool
convert_cset (iconv_t cd, const uchar *from, size_t flen, struct strbuf *to)
{
if (!HAVE_ICONV || cd == (iconv_t)-1)
{
if (to->len + flen > to->asize)
{
to->asize = to->len + flen;
to->text = xrealloc (to->text, to->asize);
}
memcpy (to->text + to->len, from, flen);
to->len += flen;
return true;
}
else
{
ICONV_CONST char *inbuf;
char *outbuf;
size_t inbytesleft, outbytesleft;
/* Reset conversion descriptor and check that it is valid. */
if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
return false;
inbuf = (ICONV_CONST char *)from;
inbytesleft = flen;
outbuf = (char *)to->text + to->len;
outbytesleft = to->asize - to->len;
for (;;)
{
iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
if (__builtin_expect (inbytesleft == 0, 1))
{
to->len = to->asize - outbytesleft;
return true;
}
if (errno != E2BIG)
return false;
outbytesleft += OUTBUF_BLOCK_SIZE;
to->asize += OUTBUF_BLOCK_SIZE;
to->text = xrealloc (to->text, to->asize);
outbuf = (char *)to->text + to->asize - outbytesleft;
}
}
}
/* Utility routine that computes a mask of the form 0000...111... with
WIDTH 1-bits. */
@ -390,15 +859,6 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
"universal character %.*s is not valid at the start of an identifier",
(int) (str - base), base);
}
/* We don't accept UCNs if iconv is not available or will not
convert to the target wide character set. */
else if (!HAVE_ICONV || pfile->wide_cset_desc == (iconv_t) -1)
{
/* XXX should be DL_SORRY */
cpp_error (pfile, DL_ERROR,
"universal character names are not supported in this configuration");
}
if (result == 0)
result = 1;
@ -408,58 +868,31 @@ _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
/* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
it to the execution character set and write the result into TBUF.
An advanced pointer is returned. Issues all relevant diagnostics.
An advanced pointer is returned. Issues all relevant diagnostics. */
UTF-8 encoding looks like this:
value range encoded as
00000000-0000007F 0xxxxxxx
00000080-000007FF 110xxxxx 10xxxxxx
00000800-0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
00010000-001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
00200000-03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
04000000-7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
never occur. Note also that any value that can be encoded by a
given row of the table can also be encoded by all successive rows,
but this is not done; only the shortest possible encoding for any
given value is valid. For instance, the character 07C0 could be
encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
FC 80 80 80 9F 80. Only the first is valid. */
static const uchar *
convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
struct strbuf *tbuf, bool wide)
{
int nbytes;
uchar buf[6], *p = &buf[6];
static const uchar masks[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
cppchar_t ucn;
uchar buf[6];
uchar *bufp = buf;
size_t bytesleft = 6;
int rval;
struct cset_converter cvt
= wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
from++; /* skip u/U */
from++; /* skip u/U */
ucn = _cpp_valid_ucn (pfile, &from, limit, 0);
if (!ucn)
return from;
nbytes = 1;
if (ucn < 0x80)
*--p = ucn;
else
rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
if (rval)
{
do
{
*--p = ((ucn & 0x3F) | 0x80);
ucn >>= 6;
nbytes++;
}
while (ucn >= 0x3F || (ucn & masks[nbytes-1]));
*--p = (ucn | masks[nbytes-1]);
errno = rval;
cpp_errno (pfile, DL_ERROR, "converting UCN to source character set");
}
if (!convert_cset (wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc,
p, nbytes, tbuf))
else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
cpp_errno (pfile, DL_ERROR, "converting UCN to execution character set");
return from;
@ -615,6 +1048,8 @@ convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
#endif
uchar c;
struct cset_converter cvt
= wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
c = *from;
switch (c)
@ -676,8 +1111,7 @@ convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
}
/* Now convert what we have to the execution character set. */
if (!convert_cset (wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc,
&c, 1, tbuf))
if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
cpp_errno (pfile, DL_ERROR,
"converting escape sequence to execution character set");
@ -697,7 +1131,8 @@ cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
struct strbuf tbuf;
const uchar *p, *base, *limit;
size_t i;
iconv_t cd = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
struct cset_converter cvt
= wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
tbuf.text = xmalloc (tbuf.asize);
@ -719,7 +1154,7 @@ cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
{
/* We have a run of normal characters; these can be fed
directly to convert_cset. */
if (!convert_cset (cd, base, p - base, &tbuf))
if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
goto fail;
}
if (p == limit)
@ -741,6 +1176,25 @@ cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
free (tbuf.text);
return false;
}
/* Subroutine of do_line and do_linemarker. Convert escape sequences
in a string, but do not perform character set conversion. */
bool
_cpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *in,
cpp_string *out)
{
struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc;
bool retval;
pfile->narrow_cset_desc.func = convert_no_conversion;
pfile->narrow_cset_desc.cd = (iconv_t) -1;
retval = cpp_interpret_string (pfile, in, 1, out, false);
pfile->narrow_cset_desc = save_narrow_cset_desc;
return retval;
}
/* Subroutine of cpp_interpret_charconst which performs the conversion
to a number, for narrow strings. STR is the string structure returned

View File

@ -35,6 +35,15 @@ typedef int iconv_t; /* dummy */
struct directive; /* Deliberately incomplete. */
struct pending_option;
struct op;
struct strbuf;
typedef bool (*convert_f) (iconv_t, const unsigned char *, size_t,
struct strbuf *);
struct cset_converter
{
convert_f func;
iconv_t cd;
};
#ifndef HAVE_UCHAR
typedef unsigned char uchar;
@ -369,14 +378,13 @@ struct cpp_reader
unsigned char *macro_buffer;
unsigned int macro_buffer_len;
/* Iconv descriptor for converting from the source character set
to the execution character set. (iconv_t)-1 for no conversion. */
iconv_t narrow_cset_desc;
/* Descriptor for converting from the source character set to the
execution character set. */
struct cset_converter narrow_cset_desc;
/* Iconv descriptor for converting from the execution character set
to the wide execution character set. (iconv_t)-1 for no conversion
other than zero-extending each character to the width of wchar_t. */
iconv_t wide_cset_desc;
/* Descriptor for converting from the source character set to the
wide execution character set. */
struct cset_converter wide_cset_desc;
/* Tree of other included files. See cppfiles.c. */
struct splay_tree_s *all_include_files;
@ -555,8 +563,11 @@ extern uchar *_cpp_copy_replacement_text (const cpp_macro *, uchar *);
extern size_t _cpp_replacement_text_len (const cpp_macro *);
/* In cppcharset.c. */
cppchar_t _cpp_valid_ucn (cpp_reader *, const uchar **, const uchar *, int);
void _cpp_destroy_iconv (cpp_reader *);
extern cppchar_t _cpp_valid_ucn (cpp_reader *, const uchar **,
const uchar *, int);
extern void _cpp_destroy_iconv (cpp_reader *);
extern bool _cpp_interpret_string_notranslate (cpp_reader *, const cpp_string *,
cpp_string *);
/* Utility routines and macros. */
#define DSC(str) (const uchar *)str, sizeof str - 1

View File

@ -733,21 +733,6 @@ strtoul_for_line (const uchar *str, unsigned int len, long unsigned int *nump)
return 0;
}
/* Subroutine of do_line and do_linemarker. Convert escape sequences
in a string, but do not perform character set conversion. */
static bool
interpret_string_notranslate (cpp_reader *pfile, const cpp_string *in,
cpp_string *out)
{
iconv_t save_narrow_cset_desc = pfile->narrow_cset_desc;
bool retval;
pfile->narrow_cset_desc = (iconv_t) -1;
retval = cpp_interpret_string (pfile, in, 1, out, false);
pfile->narrow_cset_desc = save_narrow_cset_desc;
return retval;
}
/* Interpret #line command.
Note that the filename string (if any) is a true string constant
(escapes are interpreted), unlike in #line. */
@ -780,7 +765,7 @@ do_line (cpp_reader *pfile)
if (token->type == CPP_STRING)
{
cpp_string s = { 0, 0 };
if (interpret_string_notranslate (pfile, &token->val.str, &s))
if (_cpp_interpret_string_notranslate (pfile, &token->val.str, &s))
new_file = (const char *)s.text;
check_eol (pfile);
}
@ -829,7 +814,7 @@ do_linemarker (cpp_reader *pfile)
if (token->type == CPP_STRING)
{
cpp_string s = { 0, 0 };
if (interpret_string_notranslate (pfile, &token->val.str, &s))
if (_cpp_interpret_string_notranslate (pfile, &token->val.str, &s))
new_file = (const char *)s.text;
new_sysp = 0;