/*
* This file is part of the Jikes RVM project (http://jikesrvm.org).
*
* This file is licensed to You under the Eclipse Public License (EPL);
* You may not use this file except in compliance with the License. You
* may obtain a copy of the License at
*
* http://www.opensource.org/licenses/eclipse-1.0.php
*
* See the COPYRIGHT.txt file distributed with this work for information
* regarding copyright ownership.
*/
package org.jikesrvm.classloader;
import java.io.UTFDataFormatException;
import java.nio.ByteBuffer;
import org.vmmagic.pragma.Pure;
import org.jikesrvm.VM;
import org.vmmagic.pragma.Inline;
import org.vmmagic.pragma.NoInline;
/**
* Abstract class that contains conversion routines to/from utf8
* and/or pseudo-utf8. It does not support utf8 encodings of
* more than 3 bytes.
* <p>
* The difference between utf8 and pseudo-utf8 is the special
* treatment of null. In utf8, null is encoded as a single byte
* directly, whereas in pseudo-utf8, it is encoded as a two-byte
* sequence. See the JVM specification for more information.
*/
public abstract class UTF8Convert {
/**
* Strictly check the format of the utf8/pseudo-utf8 byte array in
* fromUTF8.
*/
static final boolean STRICTLY_CHECK_FORMAT = false;
/**
* Set fromUTF8 to not throw an exception when given a normal utf8
* byte array.
*/
static final boolean ALLOW_NORMAL_UTF8 = false;
/**
* Set fromUTF8 to not throw an exception when given a pseudo utf8
* byte array.
*/
static final boolean ALLOW_PSEUDO_UTF8 = true;
/**
* Set toUTF8 to write in pseudo-utf8 (rather than normal utf8).
*/
static final boolean WRITE_PSEUDO_UTF8 = true;
/**
* UTF8 character visitor abstraction
*/
private abstract static class UTF8CharacterVisitor {
abstract void visit_char(char c);
}
/**
* Visitor that builds up a char[] as characters are decoded
*/
private static final class ByteArrayStringEncoderVisitor extends UTF8CharacterVisitor {
final char[] result;
int index;
ByteArrayStringEncoderVisitor(int length) {
result = new char[length];
index = 0;
}
@Override
void visit_char(char c) {
result[index] = c;
index++;
}
@Override
public String toString() {
if (VM.runningVM) {
return java.lang.JikesRVMSupport.newStringWithoutCopy(result, 0, index);
} else {
return new String(result, 0, index);
}
}
}
/**
* Visitor that builds up a char[] as characters are decoded
*/
private static final class ByteBufferStringEncoderVisitor extends UTF8CharacterVisitor {
final char[] result;
int index;
ByteBufferStringEncoderVisitor(int length) {
result = new char[length];
index = 0;
}
@Override
void visit_char(char c) {
result[index] = c;
index++;
}
@Override
public String toString() {
if (VM.runningVM) {
return java.lang.JikesRVMSupport.newStringWithoutCopy(result, 0, index);
} else {
return new String(result, 0, index);
}
}
}
/**
* Visitor that builds up a String.hashCode form hashCode as characters are decoded
*/
private static final class StringHashCodeVisitor extends UTF8CharacterVisitor {
int result = 0;
@Override
void visit_char(char c) {
result = result * 31 + c;
}
int getResult() {
return result;
}
}
/**
* Convert the given sequence of (pseudo-)utf8 formatted bytes
* into a String.<p>
*
* The acceptable input formats are controlled by the
* STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8
* flags.
*
* @param utf8 (pseudo-)utf8 byte array
* @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8
* @return unicode string
*/
public static String fromUTF8(byte[] utf8) throws UTFDataFormatException {
UTF8CharacterVisitor visitor = new ByteArrayStringEncoderVisitor(utf8.length);
visitUTF8(utf8, visitor);
return visitor.toString();
}
/**
* Convert the given sequence of (pseudo-)utf8 formatted bytes
* into a String.
*
* The acceptable input formats are controlled by the
* STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8
* flags.<p>
*
* @param utf8 (pseudo-)utf8 byte array
* @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8
* @return unicode string
*/
public static String fromUTF8(ByteBuffer utf8) throws UTFDataFormatException {
UTF8CharacterVisitor visitor = new ByteBufferStringEncoderVisitor(utf8.remaining());
visitUTF8(utf8, visitor);
return visitor.toString();
}
/**
* Convert the given sequence of (pseudo-)utf8 formatted bytes
* into a String hashCode.<p>
*
* The acceptable input formats are controlled by the
* STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8
* flags.
*
* @param utf8 (pseudo-)utf8 byte array
* @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8
* @return hashCode corresponding to if this were a String.hashCode
*/
public static int computeStringHashCode(byte[] utf8) throws UTFDataFormatException {
StringHashCodeVisitor visitor = new StringHashCodeVisitor();
visitUTF8(utf8, visitor);
return visitor.getResult();
}
@NoInline
private static void throwDataFormatException(String message, int location) throws UTFDataFormatException {
throw new UTFDataFormatException(message + " at location " + location);
}
/**
* Visit all bytes of the given utf8 string calling the visitor when a
* character is decoded.<p>
*
* The acceptable input formats are controlled by the
* STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8
* flags.
*
* @param utf8 (pseudo-)utf8 byte array
* @param visitor called when characters are decoded
* @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8
*/
@Inline
private static void visitUTF8(byte[] utf8, UTF8CharacterVisitor visitor) throws UTFDataFormatException {
for (int i = 0, n = utf8.length; i < n;) {
byte b = utf8[i++];
if (STRICTLY_CHECK_FORMAT && !ALLOW_NORMAL_UTF8) {
if (b == 0) {
throwDataFormatException("0 byte encountered", i - 1);
}
}
if (b >= 0) { // < 0x80 unsigned
// in the range '\001' to '\177'
visitor.visit_char((char) b);
continue;
}
try {
byte nb = utf8[i++];
if (b < -32) { // < 0xe0 unsigned
// '\000' or in the range '\200' to '\u07FF'
char c = (char) (((b & 0x1f) << 6) | (nb & 0x3f));
visitor.visit_char(c);
if (STRICTLY_CHECK_FORMAT) {
if (((b & 0xe0) != 0xc0) || ((nb & 0xc0) != 0x80)) {
throwDataFormatException("invalid marker bits for double byte char" , i - 2);
}
if (c < '\200') {
if (!ALLOW_PSEUDO_UTF8 || (c != '\000')) {
throwDataFormatException("encountered double byte char that should have been single byte", i - 2);
}
} else if (c > '\u07FF') {
throwDataFormatException("encountered double byte char that should have been single byte", i - 2);
}
}
} else {
byte nnb = utf8[i++];
// in the range '\u0800' to '\uFFFF'
char c = (char) (((b & 0x0f) << 12) | ((nb & 0x3f) << 6) | (nnb & 0x3f));
visitor.visit_char(c);
if (STRICTLY_CHECK_FORMAT) {
if (((b & 0xf0) != 0xe0) || ((nb & 0xc0) != 0x80) || ((nnb & 0xc0) != 0x80)) {
throwDataFormatException("invalid marker bits for triple byte char", i - 3);
}
if (c < '\u0800') {
throwDataFormatException("encountered triple byte char that should have been fewer bytes", i - 3);
}
}
}
} catch (ArrayIndexOutOfBoundsException e) {
throwDataFormatException("unexpected end", i);
}
}
}
/**
* Visit all bytes of the given utf8 string calling the visitor when a
* character is decoded.<p>
*
* The acceptable input formats are controlled by the
* STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8
* flags.
*
* @param utf8 (pseudo-)utf8 byte array
* @param visitor called when characters are decoded
* @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8
*/
@Inline
private static void visitUTF8(ByteBuffer utf8, UTF8CharacterVisitor visitor) throws UTFDataFormatException {
while (utf8.hasRemaining()) {
byte b = utf8.get();
if (STRICTLY_CHECK_FORMAT && !ALLOW_NORMAL_UTF8) {
if (b == 0) {
throwDataFormatException("0 byte encountered", utf8.position() - 1);
}
}
if (b >= 0) { // < 0x80 unsigned
// in the range '\001' to '\177'
visitor.visit_char((char) b);
continue;
}
try {
byte nb = utf8.get();
if (b < -32) { // < 0xe0 unsigned
// '\000' or in the range '\200' to '\u07FF'
char c = (char) (((b & 0x1f) << 6) | (nb & 0x3f));
visitor.visit_char(c);
if (STRICTLY_CHECK_FORMAT) {
if (((b & 0xe0) != 0xc0) || ((nb & 0xc0) != 0x80)) {
throwDataFormatException("invalid marker bits for double byte char", utf8.position() - 2);
}
if (c < '\200') {
if (!ALLOW_PSEUDO_UTF8 || (c != '\000')) {
throwDataFormatException("encountered double byte char that should have been single byte", utf8.position() - 2);
}
} else if (c > '\u07FF') {
throwDataFormatException("encountered double byte char that should have been single byte", utf8.position() - 2);
}
}
} else {
byte nnb = utf8.get();
// in the range '\u0800' to '\uFFFF'
char c = (char) (((b & 0x0f) << 12) | ((nb & 0x3f) << 6) | (nnb & 0x3f));
visitor.visit_char(c);
if (STRICTLY_CHECK_FORMAT) {
if (((b & 0xf0) != 0xe0) || ((nb & 0xc0) != 0x80) || ((nnb & 0xc0) != 0x80)) {
throwDataFormatException("invalid marker bits for triple byte char", utf8.position() - 3);
}
if (c < '\u0800') {
throwDataFormatException("encountered triple byte char that should have been fewer bytes", utf8.position() - 3);
}
}
}
} catch (ArrayIndexOutOfBoundsException e) {
throwDataFormatException("unexpected end", utf8.position());
}
}
}
/**
* Convert the given String into a sequence of (pseudo-)utf8
* formatted bytes.<p>
*
* The output format is controlled by the WRITE_PSEUDO_UTF8 flag.
*
* @param s String to convert
* @return array containing sequence of (pseudo-)utf8 formatted bytes
*/
public static byte[] toUTF8(String s) {
byte[] result = new byte[utfLength(s)];
int result_index = 0;
for (int i = 0, n = s.length(); i < n; ++i) {
char c = s.charAt(i);
// in all shifts below, c is an (unsigned) char,
// so either >>> or >> is ok
if (((!WRITE_PSEUDO_UTF8) || (c >= 0x0001)) && (c <= 0x007F)) {
result[result_index++] = (byte) c;
} else if (c > 0x07FF) {
result[result_index++] = (byte) (0xe0 | (byte) (c >> 12));
result[result_index++] = (byte) (0x80 | ((c & 0xfc0) >> 6));
result[result_index++] = (byte) (0x80 | (c & 0x3f));
} else {
result[result_index++] = (byte) (0xc0 | (byte) (c >> 6));
result[result_index++] = (byte) (0x80 | (c & 0x3f));
}
}
return result;
}
/**
* Convert the given String into a sequence of (pseudo-)utf8
* formatted bytes.<p>
*
* The output format is controlled by the WRITE_PSEUDO_UTF8 flag.
*
* @param s String to convert
* @param b Byte buffer to hold result
*/
@Inline
public static void toUTF8(String s, ByteBuffer b) {
int result_index = 0;
for (int i = 0, n = s.length(); i < n; ++i) {
char c = s.charAt(i);
// in all shifts below, c is an (unsigned) char,
// so either >>> or >> is ok
if (((!WRITE_PSEUDO_UTF8) || (c >= 0x0001)) && (c <= 0x007F)) {
b.put((byte) c);
} else if (c > 0x07FF) {
b.put((byte) (0xe0 | (byte) (c >> 12)));
b.put((byte) (0x80 | ((c & 0xfc0) >> 6)));
b.put((byte) (0x80 | (c & 0x3f)));
} else {
b.put((byte) (0xc0 | (byte) (c >> 6)));
b.put((byte) (0x80 | (c & 0x3f)));
}
}
}
@Pure
public static int utfLength(String s) {
int utflen = 0;
for (int i = 0, n = s.length(); i < n; ++i) {
int c = s.charAt(i);
if (((!WRITE_PSEUDO_UTF8) || (c >= 0x0001)) && (c <= 0x007F)) {
++utflen;
} else if (c > 0x07FF) {
utflen += 3;
} else {
utflen += 2;
}
}
return utflen;
}
/**
* Check whether the given sequence of bytes is valid (pseudo-)utf8.
*
* @param bytes byte array to check
* @return {@code true} iff the given sequence is valid (pseudo-)utf8.
*/
public static boolean check(byte[] bytes) {
for (int i = 0, n = bytes.length; i < n;) {
byte b = bytes[i++];
if (!ALLOW_NORMAL_UTF8) {
if (b == 0) return false;
}
if (b >= 0) { // < 0x80 unsigned
// in the range '\001' to '\177'
continue;
}
try {
byte nb = bytes[i++];
if (b < -32) { // < 0xe0 unsigned
// '\000' or in the range '\200' to '\u07FF'
char c = (char) (((b & 0x1f) << 6) | (nb & 0x3f));
if (((b & 0xe0) != 0xc0) || ((nb & 0xc0) != 0x80)) {
return false;
}
if (c < '\200') {
if (!ALLOW_PSEUDO_UTF8 || (c != '\000')) {
return false;
}
} else if (c > '\u07FF') {
return false;
}
} else {
byte nnb = bytes[i++];
// in the range '\u0800' to '\uFFFF'
char c = (char) (((b & 0x0f) << 12) | ((nb & 0x3f) << 6) | (nnb & 0x3f));
if (((b & 0xf0) != 0xe0) || ((nb & 0xc0) != 0x80) || ((nnb & 0xc0) != 0x80)) {
return false;
}
if (c < '\u0800') {
return false;
}
}
} catch (ArrayIndexOutOfBoundsException e) {
return false;
}
}
return true;
}
}