/* ************************************************************************
#
#  DivConq
#
#  http://divconq.com/
#
#  Copyright:
#    Copyright 2014 eTimeline, LLC. All rights reserved.
#
#  License:
#    See the license.txt file in the project's top-level directory for details.
#
#  Authors:
#    * Andy White
#
************************************************************************ */
package divconq.lang.chars;

import io.netty.buffer.ByteBuf;

/**
 * Hand-rolled UTF-16 to UTF-8 encoder.
 *
 * Well-formed surrogate pairs are combined and written as a single four byte
 * sequence. Unpaired surrogates (a high surrogate not followed by a low
 * surrogate, or a low surrogate with no preceding high surrogate) cannot be
 * represented in UTF-8 and are skipped; they contribute no bytes to the
 * output. {@link #size(CharSequence)} and both sequence encoders share the
 * same scanning logic, so the reported size always matches the bytes written.
 */
public class Utf8Encoder {
    /** Marker returned by {@link #scalarAt} for an unpaired surrogate. */
    static private final int UNPAIRED = -1;

    /**
     * Encodes a character sequence as UTF-8.
     *
     * @param chars source, may be null
     * @return a new array holding exactly the encoded bytes, or null when
     *         chars is null
     */
    static public byte[] encode(CharSequence chars) {
        if (chars == null)
            return null;

        // first pass sizes the array exactly, second pass fills it
        byte[] buffer = new byte[size(chars)];
        int bufidx = 0;
        int len = chars.length();
        int seqidx = 0;

        while (seqidx < len) {
            int cp = scalarAt(chars, seqidx);

            // a combined pair consumed two chars, everything else one
            seqidx += (cp >= 0x10000) ? 2 : 1;

            if (cp == UNPAIRED)
                continue;

            bufidx = put(cp, buffer, bufidx);
        }

        return buffer;
    }

    /**
     * Encodes a single code point as UTF-8.
     *
     * @param ch code point to encode; -1 is treated as "no character"
     * @return the encoded bytes (four bytes for U+10000 through U+10FFFF), or
     *         null when ch is -1, is a surrogate (U+D800 through U+DFFF) or
     *         is above U+10FFFF
     */
    static public byte[] encode(int ch) {
        if (ch == -1)
            return null;

        // surrogates and values beyond the Unicode range have no UTF-8 form
        if ((ch >= 0xD800 && ch <= 0xDFFF) || (ch > 0x10FFFF))
            return null;

        byte[] buffer = new byte[encodedLength(ch)];
        put(ch, buffer, 0);
        return buffer;
    }

    /**
     * Computes how many bytes {@link #encode(CharSequence)} would produce.
     *
     * @param chars source, may be null
     * @return the UTF-8 byte count (unpaired surrogates count as zero), or 0
     *         when chars is null
     */
    static public int size(CharSequence chars) {
        if (chars == null)
            return 0;

        int total = 0;
        int len = chars.length();
        int seqidx = 0;

        while (seqidx < len) {
            int cp = scalarAt(chars, seqidx);

            seqidx += (cp >= 0x10000) ? 2 : 1;

            if (cp != UNPAIRED)
                total += encodedLength(cp);
        }

        return total;
    }

    /**
     * Encodes a character sequence as UTF-8 directly into a buffer. Does
     * nothing when either argument is null.
     *
     * This method performs no capacity check of its own, *you* have to be
     * sure the content will fit; use {@link #size(CharSequence)} to find out
     * how many bytes will be written.
     *
     * @param chars source
     * @param buffer destination
     */
    static public void encode(CharSequence chars, ByteBuf buffer) {
        if ((chars == null) || (buffer == null))
            return;

        int len = chars.length();
        int seqidx = 0;

        while (seqidx < len) {
            int cp = scalarAt(chars, seqidx);

            seqidx += (cp >= 0x10000) ? 2 : 1;

            if (cp == UNPAIRED)
                continue;

            if (cp < 0x80) {
                buffer.writeByte(cp);
            }
            else if (cp < 0x800) {
                buffer.writeByte(0xC0 | (cp >> 6));
                buffer.writeByte(0x80 | (cp & 0x3F));
            }
            else if (cp < 0x10000) {
                buffer.writeByte(0xE0 | (cp >> 12));
                buffer.writeByte(0x80 | ((cp >> 6) & 0x3F));
                buffer.writeByte(0x80 | (cp & 0x3F));
            }
            else {
                buffer.writeByte(0xF0 | (cp >> 18));
                buffer.writeByte(0x80 | ((cp >> 12) & 0x3F));
                buffer.writeByte(0x80 | ((cp >> 6) & 0x3F));
                buffer.writeByte(0x80 | (cp & 0x3F));
            }
        }
    }

    /**
     * Reads the Unicode scalar value that starts at idx: the char itself for
     * a non-surrogate, the combined code point (always at least 0x10000) for
     * a high surrogate immediately followed by a low surrogate, or
     * {@link #UNPAIRED} for any other surrogate.
     */
    static private int scalarAt(CharSequence chars, int idx) {
        char ch = chars.charAt(idx);

        if (Character.isHighSurrogate(ch)) {
            if (idx + 1 < chars.length()) {
                char low = chars.charAt(idx + 1);

                if (Character.isLowSurrogate(low))
                    return Character.toCodePoint(ch, low);
            }

            // high surrogate at end of input or followed by something else
            return UNPAIRED;
        }

        // low surrogate without a leading high surrogate
        if (Character.isLowSurrogate(ch))
            return UNPAIRED;

        return ch;
    }

    /** Number of UTF-8 bytes (1 to 4) used for the given scalar value. */
    static private int encodedLength(int cp) {
        if (cp < 0x80)
            return 1;

        if (cp < 0x800)
            return 2;

        if (cp < 0x10000)
            return 3;

        return 4;
    }

    /**
     * Writes the UTF-8 form of cp into buffer starting at offset and returns
     * the offset just past the last byte written.
     */
    static private int put(int cp, byte[] buffer, int offset) {
        if (cp < 0x80) {
            buffer[offset++] = (byte) cp;
        }
        else if (cp < 0x800) {
            buffer[offset++] = (byte) (0xC0 | (cp >> 6));
            buffer[offset++] = (byte) (0x80 | (cp & 0x3F));
        }
        else if (cp < 0x10000) {
            buffer[offset++] = (byte) (0xE0 | (cp >> 12));
            buffer[offset++] = (byte) (0x80 | ((cp >> 6) & 0x3F));
            buffer[offset++] = (byte) (0x80 | (cp & 0x3F));
        }
        else {
            buffer[offset++] = (byte) (0xF0 | (cp >> 18));
            buffer[offset++] = (byte) (0x80 | ((cp >> 12) & 0x3F));
            buffer[offset++] = (byte) (0x80 | ((cp >> 6) & 0x3F));
            buffer[offset++] = (byte) (0x80 | (cp & 0x3F));
        }

        return offset;
    }
}