109 lines
2.9 KiB
Java
109 lines
2.9 KiB
Java
|
/* Copyright (C) 1999 Cygnus Solutions
|
||
|
|
||
|
This file is part of libgcj.
|
||
|
|
||
|
This software is copyrighted work licensed under the terms of the
|
||
|
Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
|
||
|
details. */
|
||
|
|
||
|
package gnu.gcj.convert;
|
||
|
|
||
|
public class Output_UTF8 extends UnicodeToBytes
|
||
|
{
|
||
|
public String getName() { return "UTF8"; }
|
||
|
|
||
|
/** True if a surrogate pair should be emitted as a single UTF8 sequence.
|
||
|
* Otherwise, a surrogate pair is treated as two separate characters.
|
||
|
* Also, '\0' is emitted as {0} if true, and as {0xC0,0x80} if false. */
|
||
|
public boolean standardUTF8;
|
||
|
|
||
|
// Saves the previous char if it was a high-surrogate.
|
||
|
char hi_part;
|
||
|
// Value of imcomplete character.
|
||
|
int value;
|
||
|
// Number of continuation bytes still to emit.
|
||
|
int bytes_todo;
|
||
|
|
||
|
public int write (char[] inbuffer, int inpos, int inlength)
|
||
|
{
|
||
|
int start_pos = inpos;
|
||
|
int avail = buf.length - count;
|
||
|
for (;;)
|
||
|
{
|
||
|
if (inlength == 0 || avail == 0)
|
||
|
break;
|
||
|
// The algororith is made more complicated because we want to write
|
||
|
// at least one byte in the output buffer, if there is room for
|
||
|
// that byte, and at least one input character is available.
|
||
|
// This makes the code more robust, since client code will
|
||
|
// always "make progress", even in the complicated cases,
|
||
|
// where the output buffer only has room for only *part* of a
|
||
|
// multi-byte sequence, or the input char buffer only has half
|
||
|
// of a surrogate pair (when standardUTF8 is set), or both.
|
||
|
|
||
|
// Handle continuation characters we did not have room for before.
|
||
|
if (bytes_todo > 0)
|
||
|
{
|
||
|
do
|
||
|
{
|
||
|
bytes_todo--;
|
||
|
buf[count++] = (byte)
|
||
|
(((value >> (bytes_todo * 6)) & 0xC0) | 0x80);
|
||
|
avail--;
|
||
|
}
|
||
|
while (bytes_todo > 0 && avail > 0);
|
||
|
continue;
|
||
|
}
|
||
|
char ch = inbuffer[inpos++];
|
||
|
inlength--;
|
||
|
if (ch < 128 && (ch != 0 || standardUTF8))
|
||
|
{
|
||
|
avail--;
|
||
|
buf[count++] = (byte) ch;
|
||
|
}
|
||
|
else if (ch <= 0x07FF)
|
||
|
{
|
||
|
buf[count++] = (byte) (0xC0 | (ch >> 6));
|
||
|
if (--avail > 0)
|
||
|
{
|
||
|
buf[count++] = (byte) ((ch & 0x3F) | 0x80);
|
||
|
avail--;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
value = ch;
|
||
|
bytes_todo = 1;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
else if (ch >= 0xD800 && ch <= 0xDFFF && standardUTF8)
|
||
|
{
|
||
|
if (ch <= 0xDBFF) // High surrogates
|
||
|
{
|
||
|
// The first byte is (0xF0 | value>>18), where value is the
|
||
|
// Unicode scalar value of the combine character - which
|
||
|
// we may not know yet. But from substituting:
|
||
|
// value == (hi-0xD800)*0x400+(lo-0xDC00)+0x10000,
|
||
|
// hi==ch, and cancelling we get:
|
||
|
buf[count++] = (byte) (0xF0 | ((ch-0xD800) >> 8));
|
||
|
avail--;
|
||
|
hi_part = ch;
|
||
|
}
|
||
|
else // Low surrogates
|
||
|
{
|
||
|
value = (hi_part - 0xD800) * 0x400 + (ch - 0xDC00) + 0x10000;
|
||
|
bytes_todo = 3;
|
||
|
}
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
buf[count++] = (byte) (0xE0 | (ch >> 12));
|
||
|
value = ch;
|
||
|
avail--;
|
||
|
bytes_todo = 2;
|
||
|
}
|
||
|
}
|
||
|
return inpos - start_pos;
|
||
|
}
|
||
|
}
|