package com.owlike.genson; import java.io.*; /** * This is an internal class that might evolve in the future into a JsonReader Factory and be moved * to the stream package. */ public final class EncodingAwareReaderFactory { static enum UTFEncoding { UTF_32BE(4), UTF_32LE(4), UTF_16BE(2), UTF_16LE(2), UTF_8(1), UNKNOWN(-1); final int bytes; private UTFEncoding(int bytes) { this.bytes = bytes; } public String encoding() { return name().replace('_', '-'); } } /** * Creates java.io.Reader instances with detected encoding from the input stream * using BOM if present or JSON spec. * * Some links: * http://www.herongyang.com/Unicode/ * http://www.ietf.org/rfc/rfc4627.txt * * @throws IOException * @throws UnsupportedEncodingException */ public Reader createReader(InputStream is) throws IOException { byte[] bytes = new byte[4]; int len = fetchBytes(bytes, is); if (len < 1) return new InputStreamReader(is); // read first 4 bytes if available int bits_32 = (bytes[0] & 0xFF) << 24 | (bytes[1] & 0xFF) << 16 | (bytes[2] & 0xFF) << 8 | (bytes[3] & 0xFF); UTFEncoding encoding = UTFEncoding.UNKNOWN; boolean hasBOM = false; // try to detect the encoding from those 4 bytes if BOM is used if (len == 4) encoding = detectEncodingFromBOM(bits_32); // no BOM then fall back to JSON spec if (encoding == UTFEncoding.UNKNOWN) { encoding = detectEncodingUsingJSONSpec(bits_32); } else hasBOM = true; // should not happen as we default to UTF-8 if (encoding == UTFEncoding.UNKNOWN) { throw new UnsupportedEncodingException("The encoding could not be detected from the stream."); } int usedBOMBytes = hasBOM ? len - (4 - encoding.bytes) : 0; int bytesToUnread = len - usedBOMBytes; // small optimization to avoid encapsulation when there is nothing to unread if (bytesToUnread == 0) { return new InputStreamReader(is, encoding.encoding()); } else { PushbackInputStream pis = new PushbackInputStream(is, bytesToUnread); pis.unread(bytes, usedBOMBytes, bytesToUnread); return new InputStreamReader(pis, encoding.encoding()); } } private UTFEncoding detectEncodingFromBOM(int bits_32) { int bits_16 = bits_32 >>> 16; if (bits_32 == 0x0000FEFF) return UTFEncoding.UTF_32BE; else if (bits_32 == 0xFFFE0000) return UTFEncoding.UTF_32LE; else if (bits_16 == 0xFEFF) return UTFEncoding.UTF_16BE; else if (bits_16 == 0xFFFE) return UTFEncoding.UTF_16LE; else if (bits_32 >>> 8 == 0xEFBBBF) return UTFEncoding.UTF_8; else return UTFEncoding.UNKNOWN; } private UTFEncoding detectEncodingUsingJSONSpec(int bits_32) { int bits_16 = bits_32 >>> 16; if (bits_32 >>> 8 == 0) return UTFEncoding.UTF_32BE; else if ((bits_32 & 0x00FFFFFF) == 0) return UTFEncoding.UTF_32LE; else if ((bits_16 & 0xFF00) == 0) return UTFEncoding.UTF_16BE; else if ((bits_16 & 0x00FF) == 0) return UTFEncoding.UTF_16LE; else return UTFEncoding.UTF_8; } private int fetchBytes(byte[] bytes, InputStream is) throws IOException { int start = 0; int bytesRead; while(start < bytes.length-1 && (bytesRead = is.read(bytes, start, bytes.length-start)) > -1) { start += bytesRead; } return start; } }