package divconq.json3.io; import java.lang.ref.SoftReference; import divconq.json3.util.BufferRecycler; import divconq.json3.util.ByteArrayBuilder; import divconq.json3.util.TextBuffer; /** * Helper class used for efficient encoding of JSON String values (including * JSON field names) into Strings or UTF-8 byte arrays. *<p> * Note that methods in here are somewhat optimized, but not ridiculously so. * Reason is that conversion method results are expected to be cached so that * these methods will not be hot spots during normal operation. */ public final class JsonStringEncoder { private final static char[] HC = CharTypes.copyHexChars(); private final static byte[] HB = CharTypes.copyHexBytes(); private final static int SURR1_FIRST = 0xD800; private final static int SURR1_LAST = 0xDBFF; private final static int SURR2_FIRST = 0xDC00; private final static int SURR2_LAST = 0xDFFF; // private final static int INT_BACKSLASH = '\\'; // private final static int INT_U = 'u'; // private final static int INT_0 = '0'; /** * This <code>ThreadLocal</code> contains a {@link java.lang.ref.SoftReference} * to a {@link BufferRecycler} used to provide a low-cost * buffer recycling between reader and writer instances. */ final protected static ThreadLocal<SoftReference<JsonStringEncoder>> _threadEncoder = new ThreadLocal<SoftReference<JsonStringEncoder>>(); /** * Lazily constructed text buffer used to produce JSON encoded Strings * as characters (without UTF-8 encoding) */ protected TextBuffer _text; /** * Lazily-constructed builder used for UTF-8 encoding of text values * (quoted and unquoted) */ protected ByteArrayBuilder _bytes; /** * Temporary buffer used for composing quote/escape sequences */ protected final char[] _qbuf; /* /********************************************************** /* Construction, instance access /********************************************************** */ public JsonStringEncoder() { _qbuf = new char[6]; _qbuf[0] = '\\'; _qbuf[2] = '0'; _qbuf[3] = '0'; } /* * Factory method for getting an instance; this is either recycled per-thread instance, * or a newly constructed one. */ public static JsonStringEncoder getInstance() { SoftReference<JsonStringEncoder> ref = _threadEncoder.get(); JsonStringEncoder enc = (ref == null) ? null : ref.get(); if (enc == null) { enc = new JsonStringEncoder(); _threadEncoder.set(new SoftReference<JsonStringEncoder>(enc)); } return enc; } /* /********************************************************** /* Public API /********************************************************** */ /* * Method that will quote text contents using JSON standard quoting, * and return results as a character array */ public char[] quoteAsString(String input) { TextBuffer textBuffer = _text; if (textBuffer == null) { // no allocator; can add if we must, shouldn't need to _text = textBuffer = new TextBuffer(null); } char[] outputBuffer = textBuffer.emptyAndGetCurrentSegment(); final int[] escCodes = CharTypes.get7BitOutputEscapes(); final int escCodeCount = escCodes.length; int inPtr = 0; final int inputLen = input.length(); int outPtr = 0; outer: while (inPtr < inputLen) { tight_loop: while (true) { char c = input.charAt(inPtr); if (c < escCodeCount && escCodes[c] != 0) { break tight_loop; } if (outPtr >= outputBuffer.length) { outputBuffer = textBuffer.finishCurrentSegment(); outPtr = 0; } outputBuffer[outPtr++] = c; if (++inPtr >= inputLen) { break outer; } } // something to escape; 2 or 6-char variant? char d = input.charAt(inPtr++); int escCode = escCodes[d]; int length = (escCode < 0) ? _appendNumeric(d, _qbuf) : _appendNamed(escCode, _qbuf); ; if ((outPtr + length) > outputBuffer.length) { int first = outputBuffer.length - outPtr; if (first > 0) { System.arraycopy(_qbuf, 0, outputBuffer, outPtr, first); } outputBuffer = textBuffer.finishCurrentSegment(); int second = length - first; System.arraycopy(_qbuf, first, outputBuffer, 0, second); outPtr = second; } else { System.arraycopy(_qbuf, 0, outputBuffer, outPtr, length); outPtr += length; } } textBuffer.setCurrentLength(outPtr); return textBuffer.contentsAsArray(); } /* * Will quote given JSON String value using standard quoting, encode * results as UTF-8, and return result as a byte array. */ public byte[] quoteAsUTF8(String text) { ByteArrayBuilder bb = _bytes; if (bb == null) { // no allocator; can add if we must, shouldn't need to _bytes = bb = new ByteArrayBuilder(null); } int inputPtr = 0; int inputEnd = text.length(); int outputPtr = 0; byte[] outputBuffer = bb.resetAndGetFirstSegment(); main: while (inputPtr < inputEnd) { final int[] escCodes = CharTypes.get7BitOutputEscapes(); inner_loop: // ASCII and escapes while (true) { int ch = text.charAt(inputPtr); if (ch > 0x7F || escCodes[ch] != 0) { break inner_loop; } if (outputPtr >= outputBuffer.length) { outputBuffer = bb.finishCurrentSegment(); outputPtr = 0; } outputBuffer[outputPtr++] = (byte) ch; if (++inputPtr >= inputEnd) { break main; } } if (outputPtr >= outputBuffer.length) { outputBuffer = bb.finishCurrentSegment(); outputPtr = 0; } // Ok, so what did we hit? int ch = (int) text.charAt(inputPtr++); if (ch <= 0x7F) { // needs quoting int escape = escCodes[ch]; // ctrl-char, 6-byte escape... outputPtr = _appendByte(ch, escape, bb, outputPtr); outputBuffer = bb.getCurrentSegment(); continue main; } if (ch <= 0x7FF) { // fine, just needs 2 byte output outputBuffer[outputPtr++] = (byte) (0xc0 | (ch >> 6)); ch = (0x80 | (ch & 0x3f)); } else { // 3 or 4 bytes // Surrogates? if (ch < SURR1_FIRST || ch > SURR2_LAST) { // nope outputBuffer[outputPtr++] = (byte) (0xe0 | (ch >> 12)); if (outputPtr >= outputBuffer.length) { outputBuffer = bb.finishCurrentSegment(); outputPtr = 0; } outputBuffer[outputPtr++] = (byte) (0x80 | ((ch >> 6) & 0x3f)); ch = (0x80 | (ch & 0x3f)); } else { // yes, surrogate pair if (ch > SURR1_LAST) { // must be from first range _illegal(ch); } // and if so, followed by another from next range if (inputPtr >= inputEnd) { _illegal(ch); } ch = _convert(ch, text.charAt(inputPtr++)); if (ch > 0x10FFFF) { // illegal, as per RFC 4627 _illegal(ch); } outputBuffer[outputPtr++] = (byte) (0xf0 | (ch >> 18)); if (outputPtr >= outputBuffer.length) { outputBuffer = bb.finishCurrentSegment(); outputPtr = 0; } outputBuffer[outputPtr++] = (byte) (0x80 | ((ch >> 12) & 0x3f)); if (outputPtr >= outputBuffer.length) { outputBuffer = bb.finishCurrentSegment(); outputPtr = 0; } outputBuffer[outputPtr++] = (byte) (0x80 | ((ch >> 6) & 0x3f)); ch = (0x80 | (ch & 0x3f)); } } if (outputPtr >= outputBuffer.length) { outputBuffer = bb.finishCurrentSegment(); outputPtr = 0; } outputBuffer[outputPtr++] = (byte) ch; } return _bytes.completeAndCoalesce(outputPtr); } /* * Will encode given String as UTF-8 (without any quoting), return * resulting byte array. */ public byte[] encodeAsUTF8(String text) { ByteArrayBuilder byteBuilder = _bytes; if (byteBuilder == null) { // no allocator; can add if we must, shouldn't need to _bytes = byteBuilder = new ByteArrayBuilder(null); } int inputPtr = 0; int inputEnd = text.length(); int outputPtr = 0; byte[] outputBuffer = byteBuilder.resetAndGetFirstSegment(); int outputEnd = outputBuffer.length; main_loop: while (inputPtr < inputEnd) { int c = text.charAt(inputPtr++); // first tight loop for ascii while (c <= 0x7F) { if (outputPtr >= outputEnd) { outputBuffer = byteBuilder.finishCurrentSegment(); outputEnd = outputBuffer.length; outputPtr = 0; } outputBuffer[outputPtr++] = (byte) c; if (inputPtr >= inputEnd) { break main_loop; } c = text.charAt(inputPtr++); } // then multi-byte... if (outputPtr >= outputEnd) { outputBuffer = byteBuilder.finishCurrentSegment(); outputEnd = outputBuffer.length; outputPtr = 0; } if (c < 0x800) { // 2-byte outputBuffer[outputPtr++] = (byte) (0xc0 | (c >> 6)); } else { // 3 or 4 bytes // Surrogates? if (c < SURR1_FIRST || c > SURR2_LAST) { // nope outputBuffer[outputPtr++] = (byte) (0xe0 | (c >> 12)); if (outputPtr >= outputEnd) { outputBuffer = byteBuilder.finishCurrentSegment(); outputEnd = outputBuffer.length; outputPtr = 0; } outputBuffer[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f)); } else { // yes, surrogate pair if (c > SURR1_LAST) { // must be from first range _illegal(c); } // and if so, followed by another from next range if (inputPtr >= inputEnd) { _illegal(c); } c = _convert(c, text.charAt(inputPtr++)); if (c > 0x10FFFF) { // illegal, as per RFC 4627 _illegal(c); } outputBuffer[outputPtr++] = (byte) (0xf0 | (c >> 18)); if (outputPtr >= outputEnd) { outputBuffer = byteBuilder.finishCurrentSegment(); outputEnd = outputBuffer.length; outputPtr = 0; } outputBuffer[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f)); if (outputPtr >= outputEnd) { outputBuffer = byteBuilder.finishCurrentSegment(); outputEnd = outputBuffer.length; outputPtr = 0; } outputBuffer[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f)); } } if (outputPtr >= outputEnd) { outputBuffer = byteBuilder.finishCurrentSegment(); outputEnd = outputBuffer.length; outputPtr = 0; } outputBuffer[outputPtr++] = (byte) (0x80 | (c & 0x3f)); } return _bytes.completeAndCoalesce(outputPtr); } /* /********************************************************** /* Internal methods /********************************************************** */ private int _appendNumeric(int value, char[] qbuf) { qbuf[1] = 'u'; // We know it's a control char, so only the last 2 chars are non-0 qbuf[4] = HC[value >> 4]; qbuf[5] = HC[value & 0xF]; return 6; } private int _appendNamed(int esc, char[] qbuf) { qbuf[1] = (char) esc; return 2; } private int _appendByte(int ch, int esc, ByteArrayBuilder bb, int ptr) { bb.setCurrentSegmentLength(ptr); bb.append('\\'); if (esc < 0) { // standard escape bb.append('u'); if (ch > 0xFF) { int hi = (ch >> 8); bb.append(HB[hi >> 4]); bb.append(HB[hi & 0xF]); ch &= 0xFF; } else { bb.append('0'); bb.append('0'); } bb.append(HB[ch >> 4]); bb.append(HB[ch & 0xF]); } else { // 2-char simple escape bb.append((byte) esc); } return bb.getCurrentSegmentLength(); } private static int _convert(int p1, int p2) { // Ok, then, is the second part valid? if (p2 < SURR2_FIRST || p2 > SURR2_LAST) { throw new IllegalArgumentException("Broken surrogate pair: first char 0x"+Integer.toHexString(p1)+", second 0x"+Integer.toHexString(p2)+"; illegal combination"); } return 0x10000 + ((p1 - SURR1_FIRST) << 10) + (p2 - SURR2_FIRST); } private static void _illegal(int c) { throw new IllegalArgumentException(illegalSurrogateDesc(c)); } protected static String illegalSurrogateDesc(int code) { if (code > 0x10FFFF) { // over max? return "Illegal character point (0x"+Integer.toHexString(code)+") to output; max is 0x10FFFF as per RFC 4627"; } if (code >= SURR1_FIRST) { if (code <= SURR1_LAST) { // Unmatched first part (closing without second part?) return "Unmatched first part of surrogate pair (0x"+Integer.toHexString(code)+")"; } return "Unmatched second part of surrogate pair (0x"+Integer.toHexString(code)+")"; } // should we ever get this? return "Illegal character point (0x"+Integer.toHexString(code)+") to output"; } }