Input_UTF8.java (read): Fixed handling of surrogate characters.
* gnu/gcj/convert/Input_UTF8.java (read): Fixed handling of surrogate characters.
* gnu/gcj/convert/Output_UTF8.java (standardUTF8): Default to true.
(write): Correct handling of surrogate characters.
From-SVN: r35569
This commit is contained in:
parent
97e242b0a7
commit
6dd1b06886
@ -1,3 +1,11 @@
|
|||||||
|
2000-08-08 Tom Tromey <tromey@cygnus.com>
|
||||||
|
|
||||||
|
* gnu/gcj/convert/Input_UTF8.java (read): Fixed handling of
|
||||||
|
surrogate characters.
|
||||||
|
* gnu/gcj/convert/Output_UTF8.java (standardUTF8): Default to
|
||||||
|
true.
|
||||||
|
(write): Correct handling of surrogate characters.
|
||||||
|
|
||||||
2000-08-07 Tom Tromey <tromey@cygnus.com>
|
2000-08-07 Tom Tromey <tromey@cygnus.com>
|
||||||
|
|
||||||
* java/lang/reflect/Method.java (hashCode): Use getName().
|
* java/lang/reflect/Method.java (hashCode): Use getName().
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
/* Copyright (C) 1999 Free Software Foundation
|
/* Copyright (C) 1999, 2000 Free Software Foundation
|
||||||
|
|
||||||
This file is part of libgcj.
|
This file is part of libgcj.
|
||||||
|
|
||||||
@ -56,10 +56,11 @@ public class Input_UTF8 extends BytesToUnicode
|
|||||||
// partial == (hi-0xD800)*0x10+((lo-0xDC00)>>6)+0x400.
|
// partial == (hi-0xD800)*0x10+((lo-0xDC00)>>6)+0x400.
|
||||||
// The definition lo>=0xDC00 && lo<=0xDFFF implies
|
// The definition lo>=0xDC00 && lo<=0xDFFF implies
|
||||||
// that (lo-0xDC00)>>6 is in the range 0..15.
|
// that (lo-0xDC00)>>6 is in the range 0..15.
|
||||||
// Hence we can infer (partial-0x400)>>4 == (hi-0xDB00)
|
// Hence we can solve for `hi' and we can emit
|
||||||
// and we can emit the high-surrogate without waiting
|
// the high-surrogate without waiting for the
|
||||||
// for the final byte:
|
// final byte:
|
||||||
outbuffer[outpos++] = (char) (0xDA00+(partial>>4));
|
outbuffer[outpos++]
|
||||||
|
= (char) (0xD800 + ((partial - 0x400) >> 4));
|
||||||
|
|
||||||
// Now we want to set it up so that when we read
|
// Now we want to set it up so that when we read
|
||||||
// the final byte on the next iteration, we will
|
// the final byte on the next iteration, we will
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
/* Copyright (C) 1999 Free Software Foundation
|
/* Copyright (C) 1999, 2000 Free Software Foundation
|
||||||
|
|
||||||
This file is part of libgcj.
|
This file is part of libgcj.
|
||||||
|
|
||||||
@ -21,7 +21,7 @@ public class Output_UTF8 extends UnicodeToBytes
|
|||||||
/** True if a surrogate pair should be emitted as a single UTF8 sequence.
|
/** True if a surrogate pair should be emitted as a single UTF8 sequence.
|
||||||
* Otherwise, a surrogate pair is treated as two separate characters.
|
* Otherwise, a surrogate pair is treated as two separate characters.
|
||||||
* Also, '\0' is emitted as {0} if true, and as {0xC0,0x80} if false. */
|
* Also, '\0' is emitted as {0} if true, and as {0xC0,0x80} if false. */
|
||||||
public boolean standardUTF8;
|
public boolean standardUTF8 = true;
|
||||||
|
|
||||||
// Saves the previous char if it was a high-surrogate.
|
// Saves the previous char if it was a high-surrogate.
|
||||||
char hi_part;
|
char hi_part;
|
||||||
@ -60,9 +60,27 @@ public class Output_UTF8 extends UnicodeToBytes
|
|||||||
while (bytes_todo > 0 && avail > 0);
|
while (bytes_todo > 0 && avail > 0);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
char ch = inbuffer[inpos++];
|
char ch = inbuffer[inpos++];
|
||||||
inlength--;
|
inlength--;
|
||||||
if (ch < 128 && (ch != 0 || standardUTF8))
|
|
||||||
|
if ((hi_part != 0 && (ch <= 0xDBFF || ch > 0xDFFF))
|
||||||
|
|| (hi_part == 0 && ch >= 0xDC00 && ch <= 0xDFFF))
|
||||||
|
{
|
||||||
|
// If the previous character was a high surrogate, and we
|
||||||
|
// don't now have a low surrogate, we print the high
|
||||||
|
// surrogate as an isolated character. If this character
|
||||||
|
// is a low surrogate and we didn't previously see a high
|
||||||
|
// surrogate, we do the same thing.
|
||||||
|
--inpos;
|
||||||
|
++inlength;
|
||||||
|
buf[count++] = (byte) (0xE0 | (hi_part >> 12));
|
||||||
|
value = hi_part;
|
||||||
|
hi_part = 0;
|
||||||
|
avail--;
|
||||||
|
bytes_todo = 2;
|
||||||
|
}
|
||||||
|
else if (ch < 128 && (ch != 0 || standardUTF8))
|
||||||
{
|
{
|
||||||
avail--;
|
avail--;
|
||||||
buf[count++] = (byte) ch;
|
buf[count++] = (byte) ch;
|
||||||
@ -78,19 +96,16 @@ public class Output_UTF8 extends UnicodeToBytes
|
|||||||
{
|
{
|
||||||
if (ch <= 0xDBFF) // High surrogates
|
if (ch <= 0xDBFF) // High surrogates
|
||||||
{
|
{
|
||||||
// The first byte is (0xF0 | value>>18), where value is the
|
// Just save the high surrogate until the next
|
||||||
// Unicode scalar value of the combine character - which
|
// character comes along.
|
||||||
// we may not know yet. But from substituting:
|
|
||||||
// value == (hi-0xD800)*0x400+(lo-0xDC00)+0x10000,
|
|
||||||
// hi==ch, and cancelling we get:
|
|
||||||
buf[count++] = (byte) (0xF0 | ((ch-0xD800) >> 8));
|
|
||||||
avail--;
|
|
||||||
hi_part = ch;
|
hi_part = ch;
|
||||||
}
|
}
|
||||||
else // Low surrogates
|
else // Low surrogates
|
||||||
{
|
{
|
||||||
value = (hi_part - 0xD800) * 0x400 + (ch - 0xDC00) + 0x10000;
|
value = (hi_part - 0xD800) * 0x400 + (ch - 0xDC00) + 0x10000;
|
||||||
|
buf[count++] = (byte) (0xF0 | (value >> 18));
|
||||||
bytes_todo = 3;
|
bytes_todo = 3;
|
||||||
|
hi_part = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
Loading…
Reference in New Issue
Block a user