/*
*
*
* Copyright 1990-2009 Sun Microsystems, Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 only, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License version 2 for more details (a copy is
* included at /legal/license.txt).
*
* You should have received a copy of the GNU General Public License
* version 2 along with this work; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA
*
* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
* Clara, CA 95054 or visit www.sun.com if you need additional
* information or have any questions.
*/
package com.sun.cldc.i18n.j2me;
import com.sun.cldc.i18n.StreamReader;
import java.io.*;
/** Reader for UTF-8 encoded input streams. */
public class UTF_8_Reader extends StreamReader {
/** signals that no byte is available, but not the end of stream */
private static final int NO_BYTE = -2;
/** 'replacement character' [Unicode 1.1.0] */
private static final int RC = 0xFFFD;
/** read ahead buffer to hold a part of char from the last read.
* The only case this buffer is needed is like following:
* after a number of characters (at least one) have been read,
* the next character is encoded by 4 bytes, of which only 3 are
* already available in the input stream. In this case read()
* will finish without waiting for the last byte of the character.
*/
private int[] readAhead;
/* the number of UTF8 bytes that may encode one character */
private static final int MAX_BYTES_PER_CHAR = 4;
/**
* If non-zero, the last read code point must be represented by two
* surrogate code units, and the low surrogate code unit has not yet
* been retrieved during the last read operation.
*/
protected int pendingSurrogate = 0;
/** information saved by mark() and later used by reset() */
protected class MarkInfo {
/** a copy of the enclosing instance's readAhead buffer
* at the moment of execution of mark()
*/
int[] readAhead = new int[MAX_BYTES_PER_CHAR];
/** a copy of the enclosing instance's pendingSurrogate
* at the moment of execution of mark()
*/
int pendingSurrogate = 0;
}
/** information saved by mark() and later used by reset() */
MarkInfo markInfo = null;
/** false if mark() has not been invoked yet */
boolean markIsSet;
/** Constructs a UTF-8 reader. */
public UTF_8_Reader() {
readAhead = new int[MAX_BYTES_PER_CHAR];
}
public Reader open(InputStream in, String enc)
throws UnsupportedEncodingException {
super.open(in, enc);
markIsSet = false;
prepareForNextChar(NO_BYTE);
return this;
}
/**
* maps the number of extra bytes onto the minimal valid value that may
* be encoded with this number of bytes
*/
private static final int[] minimalValidValue
= {0x00, 0x80, 0x800, 0x10000 /*, 0x200000*/};
/**
* Read a block of UTF8 characters.
*
* @param cbuf output buffer for converted characters read
* @param off initial offset into the provided buffer
* @param len length of characters in the buffer
* @return the number of converted characters
* @exception IOException is thrown if the input stream
* could not be read for the raw unconverted character
*/
public int read(char cbuf[], int off, int len) throws IOException {
int count = 0;
int firstByte;
int extraBytes;
int currentChar = 0;
int nextByte;
int headByte = NO_BYTE;
if (len == 0) {
return 0;
}
if (pendingSurrogate != 0) {
cbuf[off + count] = (char)pendingSurrogate;
count++;
pendingSurrogate = 0;
if (len == 1) {
return 1;
}
}
while (count < len) {
// must wait for the first character, and
// other characters are read only if they are available
final boolean mustBlockTillGetsAChar = (0 == count);
firstByte = getByteOfCurrentChar(0, mustBlockTillGetsAChar);
if (firstByte < 0) {
if (firstByte == -1 && count == 0) {
// end of stream
return -1;
}
return count;
}
/* Let's reduce amount of case-mode comparisons */
if ((firstByte&0x80) == 0) {
extraBytes = 0;
currentChar = firstByte;
} else {
switch (firstByte >> 4) {
case 12: case 13:
/* 11 bits: 110x xxxx 10xx xxxx */
extraBytes = 1;
currentChar = firstByte & 0x1F;
break;
case 14:
/* 16 bits: 1110 xxxx 10xx xxxx 10xx xxxx */
extraBytes = 2;
currentChar = firstByte & 0x0F;
break;
case 15:
if ((firstByte&0x08)==0) {
/* 21 bits: 1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx */
extraBytes = 3;
currentChar = firstByte & 0x07;
break;
} // else as default
default:
/* we do replace malformed character with special symbol */
extraBytes = 0;
currentChar = RC;
}
}
for (int j = 1; j <= extraBytes; j++) {
nextByte = getByteOfCurrentChar(j, mustBlockTillGetsAChar);
if (nextByte == NO_BYTE) {
// done for now, comeback later for the rest of char
return count;
}
if (nextByte == -1) {
// end of stream in the middle of char -- set 'RC'
currentChar = RC;
break;
}
if ((nextByte & 0xC0) != 0x80) {
// invalid byte - move it at head of next read sequence
currentChar = RC;
headByte = nextByte;
break;
}
// each extra byte has 6 bits more of the char
currentChar = (currentChar << 6) + (nextByte & 0x3F);
}
if (currentChar < minimalValidValue[extraBytes]) {
// the character is malformed: it should be encoded
// with a shorter sequence of bytes
currentChar = RC;
cbuf[off + count] = (char)currentChar;
count++;
} else if (currentChar <= 0xd7ff
// d800...d8ff and dc00...dfff are high and low surrogate code
// points, they do not represent characters
|| (0xe000 <= currentChar && currentChar <= 0xffff)) {
cbuf[off + count] = (char)currentChar;
count++;
} else if (0xffff < currentChar && currentChar <= 0x10ffff) {
int highSurrogate = 0xd800 | ((currentChar-0x10000) >> 10);
int lowSurrogate = 0xdc00 | (currentChar & 0x3ff);
cbuf[off + count] = (char)highSurrogate;
count++;
if (count < len) {
cbuf[off + count] = (char)lowSurrogate;
count++;
} else {
pendingSurrogate=lowSurrogate;
}
} else {
currentChar = RC;
cbuf[off + count] = (char)currentChar;
count++;
}
prepareForNextChar(headByte);
}
return count;
}
/**
* Get one of the raw bytes for the current character.
* The byte first gets read into the read ahead buffer, unless
* it's already there.
*
* @param byteOfChar which raw byte to get 0 for the first, 3 for the last.
* The bytes must be accessed sequentially, that is,
* the only possible order of byteOfChar values
* in a series of calls is 0, 1, 2, 3.
* @param allowBlockingRead false allows returning NO_BYTE if no byte is
* available in the input stream; true forces reading.
* @return a byte value, NO_BYTE for no byte available or -1 for end of
* stream
*
* @exception IOException if an I/O error occurs.
*/
private int getByteOfCurrentChar(int byteOfChar, boolean allowBlockingRead) throws IOException {
if (readAhead[byteOfChar] != NO_BYTE) {
return readAhead[byteOfChar];
}
/*
* allowBlockingRead will be true for the first character.
* Our read method must block until it gets one char so don't call
* available() for the first character.
*/
if (allowBlockingRead || in.available() > 0) {
readAhead[byteOfChar] = in.read();
}
return readAhead[byteOfChar];
}
/**
* Prepare the reader for the next character by clearing the look
* ahead buffer.
* @param headByte value of first byte. If previous sequence is interrupted
* by malformed byte - this byte should be moved at head of next sequence
*/
private void prepareForNextChar(int headByte) {
readAhead[0] = headByte;
for (int i=1; i<MAX_BYTES_PER_CHAR; i++) {
readAhead[i]=NO_BYTE;
}
}
/**
* Mark the present position in the stream.
*
* @param readAheadLimit number of characters to buffer ahead
* @exception IOException If an I/O error occurs or
* marking is not supported by the underlying input stream.
*/
public void mark(int readAheadLimit) throws IOException {
if (in.markSupported()) {
if (markInfo == null) {
markInfo = new MarkInfo();
}
markInfo.pendingSurrogate = pendingSurrogate;
System.arraycopy(readAhead,0,markInfo.readAhead,0,MAX_BYTES_PER_CHAR);
markIsSet = true;
in.mark(readAheadLimit*MAX_BYTES_PER_CHAR);
} else {
throw new IOException("mark() not supported");
}
}
/**
* Reset the read ahead marks is not supported for UTF8 readers.
* @exception IOException is thrown, for all calls to this method
* because marking is not supported for UTF8 readers
*/
public void reset() throws IOException {
if (in.markSupported()) {
if (markIsSet) {
pendingSurrogate = markInfo.pendingSurrogate;
System.arraycopy(markInfo.readAhead,0,readAhead,0,MAX_BYTES_PER_CHAR);
in.reset();
} else {
throw new IOException("reset(): no mark has been set");
}
} else {
throw new IOException("reset() not supported");
}
}
/**
* Get the size in chars of an array of bytes.
*
* @param array Source buffer
* @param offset Offset at which to start counting characters
* @param length number of bytes to use for counting
*
* @return number of characters that would be converted
*/
/*
* This method is only used by our internal Helper class in the method
* byteToCharArray to know how much to allocate before using a
* reader. If we encounter bad encoding we should return a count
* that includes that character so the reader will throw an IOException
*/
public int sizeOf(byte[] array, int offset, int length) {
int count = 0;
int endOfArray;
int extraBytes;
for (endOfArray = offset + length; offset < endOfArray; ) {
int oldCount = count;
count++;
/* Reduce amount of case-mode comparisons */
if ((array[offset]&0x80) == 0) {
extraBytes = 0;
} else {
switch (((int)array[offset] & 0xff) >> 4) {
case 12: case 13:
/* 11 bits: 110x xxxx 10xx xxxx */
extraBytes = 1;
break;
case 14:
/* 16 bits: 1110 xxxx 10xx xxxx 10xx xxxx */
extraBytes = 2;
break;
case 15:
if (((int)array[offset] & 0x08)==0) {
/* 21 bits: 1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx */
// we imply that the 5 high bits are not all zeroes
extraBytes = 3;
count++;
break;
} // else as default
default:
/*
* this byte will be replaced with 'RC'
*/
extraBytes = 0;
}
}
offset++;
// test if extra bytes are in form 10xx xxxx
while (extraBytes-- > 0){
if (offset < endOfArray) {
if ((((int)array[offset]) & 0xC0) != 0x80) {
break; // test fails: char will be replaced with 'RC'
} else {
offset++;
}
} else {
// broken sequence of bytes detected at the array tail
// the broken char still must be counted
count = oldCount+1;
break;
}
}
}
return count;
}
}