re PR libfortran/52539 (I/O: Wrong result for UTF-8/UCS-4 list-directed and namelist read and nml write)

2014-04-26  Jerry DeLisle  <jvdelisle@gcc.gnu>

	PR libfortran/52539
	* io/list_read.c: Add uchar typedef. (push_char4): New function
	to save kind=4 character. (next_char_utf8): New function to read
	a single UTF-8 encoded character value. (read_chracter): Update
	to use the new functions for reading UTF-8 strings.
	(list_formatted_read_scalar): Update to handle list directed
	reads of UTF-8 strings. (nml_read_obj): Likewise update for
	UTF-8 strings in namelists.
	* io/write.c (nml_write_obj): Add kind=4 character support for
	namelist writes.

From-SVN: r209828
This commit is contained in:
Jerry DeLisle 2014-04-26 21:52:26 +00:00
parent 167c3e96ae
commit 42c1e008b0
3 changed files with 223 additions and 53 deletions

View File

@ -1,3 +1,16 @@
2014-04-26 Jerry DeLisle <jvdelisle@gcc.gnu>
PR libfortran/52539
* io/list_read.c: Add uchar typedef. (push_char4): New function
to save kind=4 character. (next_char_utf8): New function to read
a single UTF-8 encoded character value. (read_chracter): Update
to use the new functions for reading UTF-8 strings.
(list_formatted_read_scalar): Update to handle list directed
reads of UTF-8 strings. (nml_read_obj): Likewise update for
UTF-8 strings in namelists.
* io/write.c (nml_write_obj): Add kind=4 character support for
namelist writes.
2014-04-24 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
* configure.ac: Quote usage of ac_cv_func_clock_gettime in if test.

View File

@ -32,6 +32,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
#include <stdlib.h>
#include <ctype.h>
typedef unsigned char uchar;
/* List directed input. Several parsing subroutines are practically
reimplemented from formatted input, the reason being that there are
@ -97,6 +99,37 @@ push_char (st_parameter_dt *dtp, char c)
dtp->u.p.saved_string[dtp->u.p.saved_used++] = c;
}
/* Save a KIND=4 character to a string buffer, enlarging the buffer
as necessary. */
static void
push_char4 (st_parameter_dt *dtp, gfc_char4_t c)
{
gfc_char4_t *new, *p = (gfc_char4_t *) dtp->u.p.saved_string;
if (p == NULL)
{
dtp->u.p.saved_string = xcalloc (SCRATCH_SIZE, sizeof (gfc_char4_t));
dtp->u.p.saved_length = SCRATCH_SIZE;
dtp->u.p.saved_used = 0;
p = (gfc_char4_t *) dtp->u.p.saved_string;
}
if (dtp->u.p.saved_used >= dtp->u.p.saved_length)
{
dtp->u.p.saved_length = 2 * dtp->u.p.saved_length;
new = realloc (p, dtp->u.p.saved_length);
if (new == NULL)
generate_error (&dtp->common, LIBERROR_OS, NULL);
p = new;
memset (new + dtp->u.p.saved_used, 0,
dtp->u.p.saved_length - dtp->u.p.saved_used);
}
p[dtp->u.p.saved_used++] = c;
}
/* Free the input buffer if necessary. */
@ -247,6 +280,57 @@ done:
}
static gfc_char4_t
next_char_utf8 (st_parameter_dt *dtp)
{
static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 };
static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
int i, nb;
gfc_char4_t c;
c = next_char (dtp);
if (c < 0x80)
return c;
/* The number of leading 1-bits in the first byte indicates how many
bytes follow. */
for (nb = 2; nb < 7; nb++)
if ((c & ~masks[nb-1]) == patns[nb-1])
goto found;
goto invalid;
found:
c = (c & masks[nb-1]);
/* Decode the bytes read. */
for (i = 1; i < nb; i++)
{
gfc_char4_t n = next_char (dtp);
if ((n & 0xC0) != 0x80)
goto invalid;
c = ((c << 6) + (n & 0x3F));
}
/* Make sure the shortest possible encoding was used. */
if (c <= 0x7F && nb > 1) goto invalid;
if (c <= 0x7FF && nb > 2) goto invalid;
if (c <= 0xFFFF && nb > 3) goto invalid;
if (c <= 0x1FFFFF && nb > 4) goto invalid;
if (c <= 0x3FFFFFF && nb > 5) goto invalid;
/* Make sure the character is valid. */
if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF))
goto invalid;
return c;
invalid:
generate_error (&dtp->common, LIBERROR_READ_VALUE, "Invalid UTF-8 encoding");
return (gfc_char4_t) '?';
}
/* Push a character back onto the input. */
static void
@ -1087,50 +1171,97 @@ read_character (st_parameter_dt *dtp, int length __attribute__ ((unused)))
}
get_string:
for (;;)
{
if ((c = next_char (dtp)) == EOF)
goto done_eof;
switch (c)
{
case '"':
case '\'':
if (c != quote)
{
if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8)
for (;;)
{
if ((c = next_char_utf8 (dtp)) == EOF)
goto done_eof;
switch (c)
{
case '"':
case '\'':
if (c != quote)
{
push_char4 (dtp, c);
break;
}
/* See if we have a doubled quote character or the end of
the string. */
if ((c = next_char_utf8 (dtp)) == EOF)
goto done_eof;
if (c == quote)
{
push_char4 (dtp, quote);
break;
}
unget_char (dtp, c);
goto done;
CASE_SEPARATORS:
if (quote == ' ')
{
unget_char (dtp, c);
goto done;
}
if (c != '\n' && c != '\r')
push_char4 (dtp, c);
break;
default:
push_char4 (dtp, c);
break;
}
}
else
for (;;)
{
if ((c = next_char (dtp)) == EOF)
goto done_eof;
switch (c)
{
case '"':
case '\'':
if (c != quote)
{
push_char (dtp, c);
break;
}
/* See if we have a doubled quote character or the end of
the string. */
if ((c = next_char (dtp)) == EOF)
goto done_eof;
if (c == quote)
{
push_char (dtp, quote);
break;
}
unget_char (dtp, c);
goto done;
CASE_SEPARATORS:
if (quote == ' ')
{
unget_char (dtp, c);
goto done;
}
if (c != '\n' && c != '\r')
push_char (dtp, c);
break;
}
/* See if we have a doubled quote character or the end of
the string. */
if ((c = next_char (dtp)) == EOF)
goto done_eof;
if (c == quote)
{
push_char (dtp, quote);
break;
}
unget_char (dtp, c);
goto done;
CASE_SEPARATORS:
if (quote == ' ')
{
unget_char (dtp, c);
goto done;
}
if (c != '\n' && c != '\r')
break;
default:
push_char (dtp, c);
break;
default:
push_char (dtp, c);
break;
}
}
break;
}
}
/* At this point, we have to have a separator, or else the string is
invalid. */
@ -1903,7 +2034,7 @@ static int
list_formatted_read_scalar (st_parameter_dt *dtp, bt type, void *p,
int kind, size_t size)
{
gfc_char4_t *q;
gfc_char4_t *q, *r;
int c, i, m;
int err = 0;
@ -2031,13 +2162,19 @@ list_formatted_read_scalar (st_parameter_dt *dtp, bt type, void *p,
{
m = ((int) size < dtp->u.p.saved_used)
? (int) size : dtp->u.p.saved_used;
if (kind == 1)
memcpy (p, dtp->u.p.saved_string, m);
q = (gfc_char4_t *) p;
r = (gfc_char4_t *) dtp->u.p.saved_string;
if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8)
for (i = 0; i < m; i++)
*q++ = *r++;
else
{
q = (gfc_char4_t *) p;
for (i = 0; i < m; i++)
q[i] = (unsigned char) dtp->u.p.saved_string[i];
if (kind == 1)
memcpy (p, dtp->u.p.saved_string, m);
else
for (i = 0; i < m; i++)
*q++ = (unsigned char) dtp->u.p.saved_string[i];
}
}
else
@ -2771,10 +2908,27 @@ nml_read_obj (st_parameter_dt *dtp, namelist_info * nl, index_type offset,
}
else
m = dtp->u.p.saved_used;
pdata = (void*)( pdata + clow - 1 );
memcpy (pdata, dtp->u.p.saved_string, m);
if (m < dlen)
memset ((void*)( pdata + m ), ' ', dlen - m);
if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8)
{
gfc_char4_t *q4, *p4 = pdata;
int i;
q4 = (gfc_char4_t *) dtp->u.p.saved_string;
p4 += clow -1;
for (i = 0; i < m; i++)
*p4++ = *q4++;
if (m < dlen)
for (i = 0; i < dlen - m; i++)
*p4++ = (gfc_char4_t) ' ';
}
else
{
pdata = (void*)( pdata + clow - 1 );
memcpy (pdata, dtp->u.p.saved_string, m);
if (m < dlen)
memset ((void*)( pdata + m ), ' ', dlen - m);
}
break;
default:

View File

@ -1835,7 +1835,10 @@ nml_write_obj (st_parameter_dt *dtp, namelist_info * obj, index_type offset,
break;
case BT_CHARACTER:
write_character (dtp, p, 1, obj->string_length, DELIM);
if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8)
write_character (dtp, p, 4, obj->string_length, DELIM);
else
write_character (dtp, p, 1, obj->string_length, DELIM);
break;
case BT_REAL: