/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tajo.util; import io.netty.buffer.ByteBuf; import java.io.ByteArrayOutputStream; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import static io.netty.util.internal.StringUtil.isSurrogate; /** * Extra utilities for bytes */ public class BytesUtils { /** * Parse the first byte of a vint/vlong to determine the number of bytes * @param value the first byte of the vint/vlong * @return the total number of bytes (1 to 9) */ public static int decodeVIntSize(byte value) { if (value >= -112) { return 1; } else if (value < -120) { return -119 - value; } return -111 - value; } /** * @param n Long to make a VLong of. * @return VLong as bytes array. */ public static byte[] vlongToBytes(long n) { byte [] result; int offset = 0; if (n >= -112 && n <= 127) { result = new byte[1]; result[offset] = (byte) n; return result; } int len = -112; if (n < 0) { n ^= -1L; // take one's complement' len = -120; } long tmp = n; while (tmp != 0) { tmp = tmp >> 8; len--; } int size = decodeVIntSize((byte) len); result = new byte[size]; result[offset++] = (byte) len; len = (len < -120) ? -(len + 120) : -(len + 112); for (int idx = len; idx != 0; idx--) { int shiftbits = (idx - 1) * 8; long mask = 0xFFL << shiftbits; result[offset++] = (byte)((n & mask) >> shiftbits); } return result; } public static void writeVLong(ByteArrayOutputStream byteStream, long l) { byte[] vLongBytes = vlongToBytes(l); byteStream.write(vLongBytes, 0, vLongBytes.length); } /** * Converts a char array to a ascii byte array. * * @param chars string * @return the byte array */ static byte[] toASCIIBytes(char[] chars) { byte[] buffer = new byte[chars.length]; for (int i = 0; i < chars.length; i++) { buffer[i] = (byte) chars[i]; } return buffer; } public static byte[][] splitPreserveAllTokens(byte[] str, char separatorChar, int[] target, int numColumns) { return splitWorker(str, 0, -1, separatorChar, target, numColumns); } public static byte[][] splitPreserveAllTokens(byte[] str, int offset, int length, byte[] separator, int[] target, int numColumns) { return splitWorker(str, offset, length, separator, target, numColumns); } public static byte[][] splitPreserveAllTokens(byte[] str, char separatorChar, int numColumns) { return splitWorker(str, 0, -1, separatorChar, null, numColumns); } private static byte[][] splitWorker(byte[] str, int offset, int length, char separatorChar, int[] target, int numColumns) { return splitWorker(str, offset, length, new byte[] {(byte)separatorChar}, target, numColumns); } /** * Performs the logic for the <code>split</code> and * <code>splitPreserveAllTokens</code> methods that do not return a * maximum array length. * * @param str the String to parse, may be <code>null</code> * @param length amount of bytes to str * @param separator the ascii separate characters * @param target the projection target * @param numColumns number of columns to be retrieved * @return an array of parsed Strings, <code>null</code> if null String input */ private static byte[][] splitWorker(byte[] str, int offset, int length, byte[] separator, int[] target, int numColumns) { if (str == null) { return null; } if (length == 0) { return new byte[numColumns][0]; } if (length < 0) { length = str.length - offset; } int indexMax = 0; if (target != null) { for (int index : target) { indexMax = Math.max(indexMax, index + 1); } } else { indexMax = numColumns; } int[][] indices = split(str, offset, length, separator, new int[indexMax][]); byte[][] result = new byte[numColumns][]; // not-picked -> null, picked but not-exists -> byte[0] if (target != null) { for (int i : target) { int[] index = indices[i]; result[i] = index == null ? new byte[0] : Arrays.copyOfRange(str, index[0], index[1]); } } else { for (int i = 0; i < result.length; i++) { int[] index = indices[i]; result[i] = index == null ? new byte[0] : Arrays.copyOfRange(str, index[0], index[1]); } } return result; } public static int[][] split(byte[] str, int offset, int length, byte[] separator, int[][] indices) { if (indices.length == 0) { return indices; // trivial } final int limit = offset + length; int start = offset; int colIndex = 0; for (int index = offset; index < limit;) { if (onDelimiter(str, index, limit, separator)) { indices[colIndex++] = new int[] {start, index}; if (colIndex >= indices.length) { return indices; } index += separator.length; start = index; } else { index++; } } if (colIndex < indices.length) { indices[colIndex] = new int[]{start, limit}; } return indices; } private static boolean onDelimiter(byte[] input, int offset, int limit, byte[] delimiter) { for (int i = 0; i < delimiter.length; i++) { if (offset + i >= limit || input[offset + i] != delimiter[i]) { return false; } } return true; } public static byte[][] splitTrivial(byte[] value, byte delimiter) { List<byte[]> split = new ArrayList<>(); int prev = 0; for (int i = 0; i < value.length; i++) { if (value[i] == delimiter) { split.add(Arrays.copyOfRange(value, prev, i)); prev = i + 1; } } if (prev <= value.length) { split.add(Arrays.copyOfRange(value, prev, value.length)); } return split.toArray(new byte[split.size()][]); } /** * It gets the maximum length among all given the array of bytes. * Then, it adds padding (i.e., \0) to byte arrays which are shorter * than the maximum length. * * @param bytes Byte arrays to be padded * @return The array of padded bytes */ public static byte[][] padBytes(byte []...bytes) { byte [][] padded = new byte[bytes.length][]; int maxLen = Integer.MIN_VALUE; for (byte[] aByte : bytes) { maxLen = Math.max(maxLen, aByte.length); } for (int i = 0; i < bytes.length; i++) { int padLen = maxLen - bytes[i].length; if (padLen == 0) { padded[i] = bytes[i]; } else if (padLen > 0) { padded[i] = Bytes.padTail(bytes[i], padLen); } else { throw new RuntimeException("maximum length: " + maxLen + ", bytes[" + i + "].length:" + bytes[i].length); } } return padded; } public static byte [] trimBytes(byte [] bytes) { return new String(bytes).trim().getBytes(); } /** * this is an implementation copied from ByteBufUtil in netty4 */ public static int writeUtf8(ByteBuf buffer, char[] chars, boolean ignoreSurrogate) { int oldWriterIndex = buffer.writerIndex(); int writerIndex = oldWriterIndex; // We can use the _set methods as these not need to do any index checks and reference checks. // This is possible as we called ensureWritable(...) before. for (int i = 0; i < chars.length; i++) { char c = chars[i]; if (c < 0x80) { buffer.setByte(writerIndex++, (byte) c); } else if (c < 0x800) { buffer.setByte(writerIndex++, (byte) (0xc0 | (c >> 6))); buffer.setByte(writerIndex++, (byte) (0x80 | (c & 0x3f))); } else if (!ignoreSurrogate && isSurrogate(c)) { if (!Character.isHighSurrogate(c)) { throw new IllegalArgumentException("Invalid encoding. " + "Expected high (leading) surrogate at index " + i + " but got " + c); } final char c2; try { // Surrogate Pair consumes 2 characters. Optimistically try to get the next character to avoid // duplicate bounds checking with charAt. If an IndexOutOfBoundsException is thrown we will // re-throw a more informative exception describing the problem. c2 = chars[++i]; } catch (IndexOutOfBoundsException e) { throw new IllegalArgumentException("Underflow. " + "Expected low (trailing) surrogate at index " + i + " but no more characters found.", e); } if (!Character.isLowSurrogate(c2)) { throw new IllegalArgumentException("Invalid encoding. " + "Expected low (trailing) surrogate at index " + i + " but got " + c2); } int codePoint = Character.toCodePoint(c, c2); // See http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G2630. buffer.setByte(writerIndex++, (byte) (0xf0 | (codePoint >> 18))); buffer.setByte(writerIndex++, (byte) (0x80 | ((codePoint >> 12) & 0x3f))); buffer.setByte(writerIndex++, (byte) (0x80 | ((codePoint >> 6) & 0x3f))); buffer.setByte(writerIndex++, (byte) (0x80 | (codePoint & 0x3f))); } else { buffer.setByte(writerIndex++, (byte) (0xe0 | (c >> 12))); buffer.setByte(writerIndex++, (byte) (0x80 | ((c >> 6) & 0x3f))); buffer.setByte(writerIndex++, (byte) (0x80 | (c & 0x3f))); } } // update the writerIndex without any extra checks for performance reasons buffer.writerIndex(writerIndex); return writerIndex - oldWriterIndex; } }