/* ************************************************************************
#
#  DivConq
#
#  http://divconq.com/
#
#  Copyright:
#    Copyright 2014 eTimeline, LLC. All rights reserved.
#
#  License:
#    See the license.txt file in the project's top-level directory for details.
#
#  Authors:
#    * Andy White
#
************************************************************************ */
package divconq.lang.chars;

import io.netty.buffer.ByteBuf;

import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;

import divconq.lang.Memory;
import divconq.lang.StringBuilder32;

/**
 * Incremental UTF-8 decoder.
 *
 * <p>Bytes are fed one at a time through {@link #readByteNeedMore(byte, boolean)};
 * once a full sequence has been read the decoded code point is available from
 * {@link #getCharacter()}. The {@code processBytes*} methods drive that loop over
 * various buffer types and collect the code points into {@link StringBuilder32}
 * instances.
 *
 * <p>In "safe" mode the control characters below 0x09, 0x0E through 0x1F, and
 * 0x7F are recorded as the "last special character" and replaced by a space.
 *
 * <p>Instances carry state between calls (partially read sequences and partially
 * built strings), which is what allows a string to span several buffers.
 */
public class Utf8Decoder implements ICharDecoder {
    /** Largest code point the decoder accepts. */
    private static final int MAX_CODE_POINT = 0x10FFFF;

    private static final int SURROGATE_FIRST = 0xD800;
    private static final int SURROGATE_LAST = 0xDFFF;

    // bits accumulated so far for the multi-byte sequence being read
    private int leftBits = 0;
    // number of bytes of the current multi-byte sequence already consumed (0 = not in a sequence)
    private int leftSoFar = 0;
    // total length in bytes of the current multi-byte sequence (0 = not in a sequence)
    private int leftSize = 0;
    // most recently decoded code point
    private int character = 0;
    // true while a multi-byte sequence is incomplete
    private boolean needMore = false;

    // TODO consider maybe array of char instead of string builder? then return new String(array)?
    // prescan for size
    protected List<StringBuilder32> result = new ArrayList<StringBuilder32>();

    // last control character replaced by a space in safe mode, or -1
    private int lastSpecial = -1;

    /**
     * @return the most recently decoded code point
     */
    @Override
    public int getCharacter() {
        return this.character;
    }

    /**
     * @return the last control character that safe mode replaced with a space,
     *         or -1 when there is none
     */
    public int getLastSpecialCharacter() {
        return this.lastSpecial;
    }

    /**
     * @return true while a multi-byte sequence has been started but not completed
     */
    @Override
    public boolean needsMore() {
        return (this.needMore && (this.leftSize != 0));
    }

    /**
     * Returns the most recently decoded code point and then clears all decoder
     * state via {@link #reset()}.
     *
     * @return the code point that was current before the reset
     */
    @Override
    public int getCharacterAndReset() {
        int decoded = this.character;
        this.reset();
        return decoded;
    }

    /**
     * Clears the sequence state, the current character and the last special
     * character. The list of partially built strings is not touched.
     */
    @Override
    public void reset() {
        this.clearSequenceState();
        this.character = 0;
        this.lastSpecial = -1;
    }

    /**
     * Decodes a complete byte array in safe mode.
     *
     * @param values UTF-8 bytes, may be null
     * @return the decoded text, or null when {@code values} is null
     */
    @Override
    public CharSequence processBytes(byte[] values) {
        if (values == null)
            return null;

        return this.processBytes(values, values.length);
    }

    /**
     * Decodes the first {@code len} bytes of an array in safe mode.
     *
     * <p>Decoding stops at the first malformed sequence; whatever was decoded
     * before it is returned.
     *
     * @param values UTF-8 bytes, may be null
     * @param len number of bytes to read; values beyond the array length are
     *            treated as the array length
     * @return the decoded text, or null when {@code values} is null
     */
    public CharSequence processBytes(byte[] values, int len) {
        if (values == null)
            return null;

        StringBuilder32 sb = new StringBuilder32();
        int limit = Math.min(len, values.length);

        try {
            for (int pos = 0; pos < limit; pos++) {
                if (!this.readByteNeedMore(values[pos], true))
                    sb.append(this.getCharacterAndReset());
            }
        }
        catch (Exception x) {
            // TODO report the error; for now malformed input ends decoding and
            // the text decoded so far is returned
        }

        return sb;
    }

    /**
     * Reads strings from the buffer, starting a new string each time the special
     * character {@code partSep} is seen, until some other special character is
     * seen.
     *
     * <p>State is kept between calls, so the strings may span several buffers.
     *
     * @param buffer source of UTF-8 bytes
     * @param partSep special character that separates the parts
     * @return the collected parts once a special character other than
     *         {@code partSep} is read (available from
     *         {@link #getLastSpecialCharacter()}); null when the buffer is null,
     *         runs out first, or decoding fails
     */
    public List<StringBuilder32> processBytesSplit(ByteBuffer buffer, Special partSep) {
        if (buffer == null)
            return null;

        // continue to build up strings from last call
        StringBuilder32 sb = this.currentBuilder();

        this.lastSpecial = -1;

        try {
            while (buffer.hasRemaining()) {
                if (this.readByteNeedMore(buffer.get(), true))
                    continue;

                if (this.lastSpecial != -1) {
                    if (partSep.getCode() != this.lastSpecial) {
                        List<StringBuilder32> res = this.result;
                        this.result = new ArrayList<StringBuilder32>();
                        return res;
                    }

                    // separator: start the next part
                    sb = new StringBuilder32();
                    this.result.add(sb);
                    this.reset();
                    continue;
                }

                sb.append(this.getCharacterAndReset());
            }
        }
        catch (Exception x) {
            // TODO report the error; for now it is indistinguishable from "need more data"
        }

        return null;
    }

    /**
     * Reads a string from the buffer up to, but not including, the character
     * {@code partChar}. Special characters and carriage returns are skipped.
     *
     * <p>State is kept between calls, so the string may span several buffers.
     *
     * @param buffer source of UTF-8 bytes
     * @param partChar code point that ends the string
     * @return the string once {@code partChar} is read; null when the buffer is
     *         null, runs out first, or decoding fails
     */
    // look for string from buffer, up to the char. no special chars allowed
    public StringBuilder32 processBytesUntil(ByteBuffer buffer, int partChar) {
        if (buffer == null)
            return null;

        // continue to build up strings from last call
        StringBuilder32 sb = this.currentBuilder();

        this.lastSpecial = -1;

        try {
            while (buffer.hasRemaining()) {
                if (this.readByteNeedMore(buffer.get(), true))
                    continue;

                // ignore special chars and CR
                if ((this.lastSpecial != -1) || (this.character == '\r')) {
                    this.lastSpecial = -1;
                    continue;
                }

                if (partChar == this.character) {
                    this.result = new ArrayList<StringBuilder32>();
                    return sb;
                }

                sb.append(this.getCharacterAndReset());
            }
        }
        catch (Exception x) {
            // TODO report the error; for now it is indistinguishable from "need more data"
        }

        return null;
    }

    /**
     * Reads a string from the buffer up to the special character {@code partSep}.
     * Any other special character is appended as the space it was replaced with.
     *
     * <p>State is kept between calls, so the string may span several buffers.
     *
     * @param buffer source of UTF-8 bytes
     * @param partSep special character that ends the string
     * @return the string once {@code partSep} is read; null when the buffer is
     *         null, runs out first, or decoding fails
     */
    public StringBuilder32 processBytesUntil(ByteBuffer buffer, Special partSep) {
        if (buffer == null)
            return null;

        // continue to build up strings from last call
        StringBuilder32 sb = this.currentBuilder();

        this.lastSpecial = -1;

        try {
            while (buffer.hasRemaining()) {
                if (this.readByteNeedMore(buffer.get(), true))
                    continue;

                if ((this.lastSpecial != -1) && (partSep.getCode() == this.lastSpecial)) {
                    this.result = new ArrayList<StringBuilder32>();
                    return sb;
                }

                sb.append(this.getCharacterAndReset());
            }
        }
        catch (Exception x) {
            // TODO report the error; for now it is indistinguishable from "need more data"
        }

        return null;
    }

    /**
     * Reads a string from the buffer up to the first special character, which is
     * then available from {@link #getLastSpecialCharacter()}.
     *
     * <p>State is kept between calls, so the string may span several buffers.
     *
     * @param buffer source of UTF-8 bytes, may be null
     * @return the string once a special character is read; null when the buffer
     *         is null, runs out first, or decoding fails
     */
    public StringBuilder32 processBytesUntilSpecial(ByteBuffer buffer) {
        if (buffer == null)
            return null;

        // continue to build up strings from last call
        StringBuilder32 sb = this.currentBuilder();

        this.lastSpecial = -1;

        try {
            while (buffer.hasRemaining()) {
                if (this.readByteNeedMore(buffer.get(), true))
                    continue;

                if (this.lastSpecial != -1) {
                    this.result = new ArrayList<StringBuilder32>();
                    return sb;
                }

                sb.append(this.getCharacterAndReset());
            }
        }
        catch (Exception x) {
            // TODO report the error; for now it is indistinguishable from "need more data"
        }

        return null;
    }

    /**
     * Reads a string from the buffer up to the first special character, which is
     * then available from {@link #getLastSpecialCharacter()}.
     *
     * <p>State is kept between calls, so the string may span several buffers.
     *
     * @param buffer source of UTF-8 bytes, may be null
     * @return the string once a special character is read; null when the buffer
     *         is null, runs out first, or decoding fails
     */
    public StringBuilder32 processBytesUntilSpecial(ByteBuf buffer) {
        if (buffer == null)
            return null;

        // continue to build up strings from last call
        StringBuilder32 sb = this.currentBuilder();

        this.lastSpecial = -1;

        try {
            while (buffer.readableBytes() > 0) {
                if (this.readByteNeedMore(buffer.readByte(), true))
                    continue;

                if (this.lastSpecial != -1) {
                    this.result = new ArrayList<StringBuilder32>();
                    return sb;
                }

                sb.append(this.getCharacterAndReset());
            }
        }
        catch (Exception x) {
            // TODO report the error; for now it is indistinguishable from "need more data"
        }

        return null;
    }

    /**
     * Reads a string from the buffer up to the first special character, which is
     * then available from {@link #getLastSpecialCharacter()}.
     *
     * <p>State is kept between calls, so the string may span several buffers.
     *
     * @param buffer source of UTF-8 bytes, may be null
     * @return the string once a special character is read; null when the buffer
     *         is null, runs out first, or decoding fails
     */
    public StringBuilder32 processBytesUntilSpecial(Memory buffer) {
        if (buffer == null)
            return null;

        // continue to build up strings from last call
        StringBuilder32 sb = this.currentBuilder();

        this.lastSpecial = -1;

        try {
            while (buffer.readableBytes() > 0) {
                if (this.readByteNeedMore((byte) buffer.readByte(), true))
                    continue;

                if (this.lastSpecial != -1) {
                    this.result = new ArrayList<StringBuilder32>();
                    return sb;
                }

                sb.append(this.getCharacterAndReset());
            }
        }
        catch (Exception x) {
            // TODO report the error; for now it is indistinguishable from "need more data"
        }

        return null;
    }

    /*
    public boolean processBytesUntil(byte[] values, int offset, int len, Special sectionSep) {
        this.lastSpecial = -1;

        try {
            for (int pos = offset; pos < len; pos++) {
                if (this.readByteNeedMore(values[pos], true))
                    return false;

                if (this.lastSpecial != -1) {
                    if (sectionSep.getCode() == this.lastSpecial)
                        return true;

                    this.lastSpecial = -1;
                }
            }
        }
        catch (Exception x) {
            // TODO
        }

        return false;
    }
    */

    /**
     * Returns the builder that the streaming methods should append to: the last
     * one left over from a previous call, or a fresh one added to {@code result}.
     */
    private StringBuilder32 currentBuilder() {
        if (this.result.size() == 0)
            this.result.add(new StringBuilder32());

        return this.result.get(this.result.size() - 1);
    }

    /** Forgets any partially read multi-byte sequence. */
    private void clearSequenceState() {
        this.leftBits = 0;
        this.leftSoFar = 0;
        this.leftSize = 0;
        this.needMore = false;
    }

    /**
     * Smallest value that legitimately needs a sequence of the given length;
     * anything smaller encoded at that length is an overlong form.
     */
    private static int minimumValueFor(int sequenceLength) {
        switch (sequenceLength) {
        case 2:
            return 0x80;
        case 3:
            return 0x0800;
        case 4:
            return 0x10000;
        case 5:
            return 0x200000;
        case 6:
            return 0x04000000;
        default:
            return 0;
        }
    }

    /**
     * Handles the first byte of a sequence: either stores a single-byte
     * character or records how many continuation bytes are expected.
     *
     * @param ch the byte as an unsigned value (0 - 255)
     * @param safe when true, control characters are replaced with a space and
     *             remembered as the last special character
     * @throws Exception when the byte cannot start a UTF-8 sequence
     */
    private void processFirstByte(int ch, boolean safe) throws Exception {
        if (ch < 0x80) {
            // unsafe characters get turned into spaces
            if (safe && ((ch < 0x9) || ((ch < 0x20) && (ch > 0xD)) || (ch == 0x7F))) {
                this.lastSpecial = ch;
                ch = 0x20;
            }

            this.character = ch;
            return;
        }

        int size;
        int bits;

        if ((ch & 0xE0) == 0xC0) {
            // Double-byte UTF-8 character.
            size = 2;
            bits = ch & 0x1F;
        }
        else if ((ch & 0xF0) == 0xE0) {
            // Three-byte UTF-8 character.
            size = 3;
            bits = ch & 0x0F;
        }
        else if ((ch & 0xF8) == 0xF0) {
            // Four-byte UTF-8 character.
            size = 4;
            bits = ch & 0x07;
        }
        else if ((ch & 0xFC) == 0xF8) {
            // Five-byte UTF-8 character.
            size = 5;
            bits = ch & 0x03;
        }
        else if ((ch & 0xFE) == 0xFC) {
            // Six-byte UTF-8 character.
            size = 6;
            bits = ch & 0x01;
        }
        else {
            // only flag "need more" for a valid lead byte, otherwise the
            // following single-byte characters would be swallowed
            this.clearSequenceState();
            throw new Exception("UTF decoder error: Invalid UTF-8 start character.");
        }

        this.needMore = true;
        this.leftBits = bits;
        this.leftSoFar = 1;
        this.leftSize = size;
    }

    /**
     * Feeds one byte to the decoder.
     *
     * <p>When a sequence completes or is found to be malformed the sequence state
     * is cleared, so the next byte is always treated as the start of a new
     * sequence.
     *
     * @param bch the next byte of input
     * @param safe when true, control characters are replaced with a space and
     *             remembered as the last special character
     * @return true when more bytes are needed to finish the current character,
     *         false when a character is ready in {@link #getCharacter()}
     * @throws Exception when the input is not valid UTF-8: bad start byte, bad
     *         continuation byte, overlong form, surrogate, or a value above
     *         0x10FFFF
     */
    @Override
    public boolean readByteNeedMore(byte bch, boolean safe) throws Exception {
        int ch = 0xFF & bch;

        // read first byte
        if (this.leftSoFar == 0) {
            this.processFirstByte(ch, safe);
            return this.needMore;
        }

        // Process an extra byte in a multi-byte sequence.
        if ((ch & 0xC0) != 0x80) {
            this.clearSequenceState();
            throw new Exception("UTF decoder error: Invalid UTF-8 sequence.");
        }

        this.leftBits = ((this.leftBits << 6) | (ch & 0x3F));

        if (++this.leftSoFar < this.leftSize)
            return true;

        // We have a complete character now.
        int value = this.leftBits;
        int size = this.leftSize;

        // signal end, whatever the outcome of validation
        this.clearSequenceState();

        if (value > MAX_CODE_POINT)
            throw new Exception("UTF decoder error: Invalid UTF-8 sequence.");

        // is it an overlong ?
        if (value < minimumValueFor(size))
            throw new Exception("UTF decoder error: Invalid UTF-8 sequence, overlong value.");

        if ((value >= SURROGATE_FIRST) && (value <= SURROGATE_LAST))
            throw new Exception("UTF decoder error: Invalid UTF-8 sequence, surrogate characters not allowed.");

        this.character = value;
        return false;
    }

    /**
     * Decodes a complete byte array with a fresh decoder.
     *
     * @param buffer UTF-8 bytes, may be null
     * @return the decoded text, or null when {@code buffer} is null
     */
    // must pass in a complete buffer
    public static CharSequence decode(byte[] buffer) {
        if (buffer == null)
            return null;

        Utf8Decoder decoder = new Utf8Decoder();
        return decoder.processBytes(buffer);
    }

    /**
     * Decodes all readable bytes of a buffer with a fresh decoder.
     *
     * @param buffer source of UTF-8 bytes, may be null
     * @return the decoded text, or null when {@code buffer} is null
     */
    // TODO by directly reading the byte buf, no copy
    public static CharSequence decode(ByteBuf buffer) {
        if (buffer == null)
            return null;

        byte[] dest = new byte[buffer.readableBytes()];
        buffer.readBytes(dest);
        return Utf8Decoder.decode(dest);
    }

    /**
     * Decodes at most {@code max} readable bytes of a buffer with a fresh decoder.
     *
     * @param buffer source of UTF-8 bytes, may be null
     * @param max upper bound on the number of bytes to read; negative values are
     *            treated as zero
     * @return the decoded text, or null when {@code buffer} is null
     */
    // TODO by directly reading the byte buf, no copy
    public static CharSequence decode(ByteBuf buffer, int max) {
        if (buffer == null)
            return null;

        byte[] dest = new byte[Math.max(0, Math.min(buffer.readableBytes(), max))];
        buffer.readBytes(dest);
        return Utf8Decoder.decode(dest);
    }
}