From 42c1e008b0153a83329fb63497f936cbb95c68e1 Mon Sep 17 00:00:00 2001 From: Jerry DeLisle Date: Sat, 26 Apr 2014 21:52:26 +0000 Subject: [PATCH] re PR libfortran/52539 (I/O: Wrong result for UTF-8/UCS-4 list-directed and namelist read and nml write) 2014-04-26 Jerry DeLisle PR libfortran/52539 * io/list_read.c: Add uchar typedef. (push_char4): New function to save kind=4 character. (next_char_utf8): New function to read a single UTF-8 encoded character value. (read_chracter): Update to use the new functions for reading UTF-8 strings. (list_formatted_read_scalar): Update to handle list directed reads of UTF-8 strings. (nml_read_obj): Likewise update for UTF-8 strings in namelists. * io/write.c (nml_write_obj): Add kind=4 character support for namelist writes. From-SVN: r209828 --- libgfortran/ChangeLog | 13 ++ libgfortran/io/list_read.c | 258 +++++++++++++++++++++++++++++-------- libgfortran/io/write.c | 5 +- 3 files changed, 223 insertions(+), 53 deletions(-) diff --git a/libgfortran/ChangeLog b/libgfortran/ChangeLog index 0fb3ccddf7d..dc37a861f02 100644 --- a/libgfortran/ChangeLog +++ b/libgfortran/ChangeLog @@ -1,3 +1,16 @@ +2014-04-26 Jerry DeLisle + + PR libfortran/52539 + * io/list_read.c: Add uchar typedef. (push_char4): New function + to save kind=4 character. (next_char_utf8): New function to read + a single UTF-8 encoded character value. (read_chracter): Update + to use the new functions for reading UTF-8 strings. + (list_formatted_read_scalar): Update to handle list directed + reads of UTF-8 strings. (nml_read_obj): Likewise update for + UTF-8 strings in namelists. + * io/write.c (nml_write_obj): Add kind=4 character support for + namelist writes. + 2014-04-24 Kyrylo Tkachov * configure.ac: Quote usage of ac_cv_func_clock_gettime in if test. diff --git a/libgfortran/io/list_read.c b/libgfortran/io/list_read.c index 625ba0c8594..b052c06b557 100644 --- a/libgfortran/io/list_read.c +++ b/libgfortran/io/list_read.c @@ -32,6 +32,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see #include #include +typedef unsigned char uchar; + /* List directed input. Several parsing subroutines are practically reimplemented from formatted input, the reason being that there are @@ -97,6 +99,37 @@ push_char (st_parameter_dt *dtp, char c) dtp->u.p.saved_string[dtp->u.p.saved_used++] = c; } +/* Save a KIND=4 character to a string buffer, enlarging the buffer + as necessary. */ + +static void +push_char4 (st_parameter_dt *dtp, gfc_char4_t c) +{ + gfc_char4_t *new, *p = (gfc_char4_t *) dtp->u.p.saved_string; + + if (p == NULL) + { + dtp->u.p.saved_string = xcalloc (SCRATCH_SIZE, sizeof (gfc_char4_t)); + dtp->u.p.saved_length = SCRATCH_SIZE; + dtp->u.p.saved_used = 0; + p = (gfc_char4_t *) dtp->u.p.saved_string; + } + + if (dtp->u.p.saved_used >= dtp->u.p.saved_length) + { + dtp->u.p.saved_length = 2 * dtp->u.p.saved_length; + new = realloc (p, dtp->u.p.saved_length); + if (new == NULL) + generate_error (&dtp->common, LIBERROR_OS, NULL); + p = new; + + memset (new + dtp->u.p.saved_used, 0, + dtp->u.p.saved_length - dtp->u.p.saved_used); + } + + p[dtp->u.p.saved_used++] = c; +} + /* Free the input buffer if necessary. */ @@ -247,6 +280,57 @@ done: } +static gfc_char4_t +next_char_utf8 (st_parameter_dt *dtp) +{ + static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 }; + static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; + int i, nb; + gfc_char4_t c; + + c = next_char (dtp); + if (c < 0x80) + return c; + + /* The number of leading 1-bits in the first byte indicates how many + bytes follow. */ + for (nb = 2; nb < 7; nb++) + if ((c & ~masks[nb-1]) == patns[nb-1]) + goto found; + goto invalid; + + found: + c = (c & masks[nb-1]); + + /* Decode the bytes read. */ + for (i = 1; i < nb; i++) + { + gfc_char4_t n = next_char (dtp); + + if ((n & 0xC0) != 0x80) + goto invalid; + + c = ((c << 6) + (n & 0x3F)); + } + + /* Make sure the shortest possible encoding was used. */ + if (c <= 0x7F && nb > 1) goto invalid; + if (c <= 0x7FF && nb > 2) goto invalid; + if (c <= 0xFFFF && nb > 3) goto invalid; + if (c <= 0x1FFFFF && nb > 4) goto invalid; + if (c <= 0x3FFFFFF && nb > 5) goto invalid; + + /* Make sure the character is valid. */ + if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF)) + goto invalid; + + return c; + + invalid: + generate_error (&dtp->common, LIBERROR_READ_VALUE, "Invalid UTF-8 encoding"); + return (gfc_char4_t) '?'; +} + /* Push a character back onto the input. */ static void @@ -1087,50 +1171,97 @@ read_character (st_parameter_dt *dtp, int length __attribute__ ((unused))) } get_string: - for (;;) - { - if ((c = next_char (dtp)) == EOF) - goto done_eof; - switch (c) - { - case '"': - case '\'': - if (c != quote) - { + + if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8) + for (;;) + { + if ((c = next_char_utf8 (dtp)) == EOF) + goto done_eof; + switch (c) + { + case '"': + case '\'': + if (c != quote) + { + push_char4 (dtp, c); + break; + } + + /* See if we have a doubled quote character or the end of + the string. */ + + if ((c = next_char_utf8 (dtp)) == EOF) + goto done_eof; + if (c == quote) + { + push_char4 (dtp, quote); + break; + } + + unget_char (dtp, c); + goto done; + + CASE_SEPARATORS: + if (quote == ' ') + { + unget_char (dtp, c); + goto done; + } + + if (c != '\n' && c != '\r') + push_char4 (dtp, c); + break; + + default: + push_char4 (dtp, c); + break; + } + } + else + for (;;) + { + if ((c = next_char (dtp)) == EOF) + goto done_eof; + switch (c) + { + case '"': + case '\'': + if (c != quote) + { + push_char (dtp, c); + break; + } + + /* See if we have a doubled quote character or the end of + the string. */ + + if ((c = next_char (dtp)) == EOF) + goto done_eof; + if (c == quote) + { + push_char (dtp, quote); + break; + } + + unget_char (dtp, c); + goto done; + + CASE_SEPARATORS: + if (quote == ' ') + { + unget_char (dtp, c); + goto done; + } + + if (c != '\n' && c != '\r') push_char (dtp, c); - break; - } - - /* See if we have a doubled quote character or the end of - the string. */ - - if ((c = next_char (dtp)) == EOF) - goto done_eof; - if (c == quote) - { - push_char (dtp, quote); - break; - } - - unget_char (dtp, c); - goto done; - - CASE_SEPARATORS: - if (quote == ' ') - { - unget_char (dtp, c); - goto done; - } - - if (c != '\n' && c != '\r') + break; + + default: push_char (dtp, c); - break; - - default: - push_char (dtp, c); - break; - } - } + break; + } + } /* At this point, we have to have a separator, or else the string is invalid. */ @@ -1903,7 +2034,7 @@ static int list_formatted_read_scalar (st_parameter_dt *dtp, bt type, void *p, int kind, size_t size) { - gfc_char4_t *q; + gfc_char4_t *q, *r; int c, i, m; int err = 0; @@ -2031,13 +2162,19 @@ list_formatted_read_scalar (st_parameter_dt *dtp, bt type, void *p, { m = ((int) size < dtp->u.p.saved_used) ? (int) size : dtp->u.p.saved_used; - if (kind == 1) - memcpy (p, dtp->u.p.saved_string, m); + + q = (gfc_char4_t *) p; + r = (gfc_char4_t *) dtp->u.p.saved_string; + if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8) + for (i = 0; i < m; i++) + *q++ = *r++; else { - q = (gfc_char4_t *) p; - for (i = 0; i < m; i++) - q[i] = (unsigned char) dtp->u.p.saved_string[i]; + if (kind == 1) + memcpy (p, dtp->u.p.saved_string, m); + else + for (i = 0; i < m; i++) + *q++ = (unsigned char) dtp->u.p.saved_string[i]; } } else @@ -2771,10 +2908,27 @@ nml_read_obj (st_parameter_dt *dtp, namelist_info * nl, index_type offset, } else m = dtp->u.p.saved_used; - pdata = (void*)( pdata + clow - 1 ); - memcpy (pdata, dtp->u.p.saved_string, m); - if (m < dlen) - memset ((void*)( pdata + m ), ' ', dlen - m); + + if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8) + { + gfc_char4_t *q4, *p4 = pdata; + int i; + + q4 = (gfc_char4_t *) dtp->u.p.saved_string; + p4 += clow -1; + for (i = 0; i < m; i++) + *p4++ = *q4++; + if (m < dlen) + for (i = 0; i < dlen - m; i++) + *p4++ = (gfc_char4_t) ' '; + } + else + { + pdata = (void*)( pdata + clow - 1 ); + memcpy (pdata, dtp->u.p.saved_string, m); + if (m < dlen) + memset ((void*)( pdata + m ), ' ', dlen - m); + } break; default: diff --git a/libgfortran/io/write.c b/libgfortran/io/write.c index eccbe7e2a20..e17a3d86203 100644 --- a/libgfortran/io/write.c +++ b/libgfortran/io/write.c @@ -1835,7 +1835,10 @@ nml_write_obj (st_parameter_dt *dtp, namelist_info * obj, index_type offset, break; case BT_CHARACTER: - write_character (dtp, p, 1, obj->string_length, DELIM); + if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8) + write_character (dtp, p, 4, obj->string_length, DELIM); + else + write_character (dtp, p, 1, obj->string_length, DELIM); break; case BT_REAL: