TextUtils.java example

Explorer
tajo-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tajo.datum.protobuf;

import com.google.protobuf.ByteString;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.math.BigInteger;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.util.regex.Pattern;

/**
 * Text coercion helpers for the protocol buffer text format: rendering of
 * unsigned integers, parsing of numbers and booleans, and escaping/unescaping
 * of byte strings.
 *
 * <p>Largely follows google/protobuf/text_format.cc.
 */
public class TextUtils {
    private static final Pattern DOUBLE_INFINITY =
            Pattern.compile("-?inf(inity)?", Pattern.CASE_INSENSITIVE);
    private static final Pattern FLOAT_INFINITY =
            Pattern.compile("-?inf(inity)?f?", Pattern.CASE_INSENSITIVE);
    private static final Pattern FLOAT_NAN =
            Pattern.compile("nanf?", Pattern.CASE_INSENSITIVE);
    // One or more ASCII digits. isDigits() requires the whole text to match,
    // so the quantifier is what lets multi-digit input succeed.
    private static final Pattern DIGITS = Pattern.compile("[0-9]+");

    /** Capacity, in chars, of the scratch buffer used by {@link #toStringBuilder(Readable)}. */
    private static final int BUFFER_SIZE = 4096;

    /**
     * Renders a {@code long} as the decimal string of the unsigned 64-bit
     * integer that has the same bit pattern.
     *
     * @param value the 64 bits to interpret as unsigned
     * @return the unsigned decimal representation
     */
    public static String unsignedToString(final long value) {
        if (value >= 0) {
            return Long.toString(value);
        }
        // Pull off the most-significant bit so that BigInteger doesn't think
        // the number is negative, then set it again using setBit().
        return BigInteger.valueOf(value & 0x7FFFFFFFFFFFFFFFL).setBit(63).toString();
    }

    /**
     * Renders an {@code int} as the decimal string of the unsigned 32-bit
     * integer that has the same bit pattern.
     *
     * @param value the 32 bits to interpret as unsigned
     * @return the unsigned decimal representation
     */
    public static String unsignedToString(final int value) {
        if (value >= 0) {
            return Integer.toString(value);
        }
        // Widening to long sign-extends; the mask keeps only the low 32 bits.
        return Long.toString(value & 0x00000000FFFFFFFFL);
    }

    /**
     * Returns {@code value} boxed as an {@link Integer} with its bit pattern
     * unchanged.
     *
     * <p>An {@code Integer} cannot represent unsigned values above
     * {@link Integer#MAX_VALUE}, so a negative input is returned as the same
     * negative number (masking to 32 bits and narrowing back to {@code int}
     * is an identity). Use {@link #unsignedToString(int)} to obtain the
     * unsigned decimal form.
     *
     * @param value the 32-bit value
     * @return the same value, boxed
     */
    public static Integer unsignedInt(int value) {
        return value;
    }

    /**
     * Returns {@code value} boxed as a {@link Long} with its bit pattern
     * unchanged.
     *
     * <p>A {@code Long} cannot represent unsigned values above
     * {@link Long#MAX_VALUE}, so a negative input is returned as the same
     * negative number (the low 64 bits of the unsigned magnitude are exactly
     * the input bits). Use {@link #unsignedToString(long)} to obtain the
     * unsigned decimal form.
     *
     * @param value the 64-bit value
     * @return the same value, boxed
     */
    public static Long unsignedLong(long value) {
        return value;
    }

    /**
     * Is this an ASCII hex digit ({@code 0-9}, {@code a-f}, {@code A-F})?
     */
    public static boolean isHex(final char c) {
        return ('0' <= c && c <= '9')
                || ('a' <= c && c <= 'f')
                || ('A' <= c && c <= 'F');
    }

    /**
     * Is this an ASCII octal digit ({@code 0-7})?
     */
    public static boolean isOctal(final char c) {
        return '0' <= c && c <= '7';
    }

    /**
     * Interpret a character as a digit (in any base up to 36) and return the
     * numeric value.  This is like {@code Character.digit()} but we don't accept
     * non-ASCII digits.
     *
     * @param c an ASCII digit or letter
     * @return 0-9 for {@code '0'-'9'}, 10-35 for {@code 'a'-'z'} / {@code 'A'-'Z'}
     * @throws IllegalArgumentException if {@code c} is not an ASCII digit or letter
     */
    public static int digitValue(final char c) {
        if ('0' <= c && c <= '9') {
            return c - '0';
        } else if ('a' <= c && c <= 'z') {
            return c - 'a' + 10;
        } else if ('A' <= c && c <= 'Z') {
            return c - 'A' + 10;
        } else {
            throw new IllegalArgumentException("Expected \"0-9\" or \"a-z, A-Z\".");
        }
    }

    /**
     * Tells whether {@code text} consists solely of one or more ASCII digits.
     *
     * @param text the text to test
     * @return {@code true} for inputs such as {@code "7"} or {@code "123"};
     *         {@code false} for the empty string or any text containing a
     *         non-digit character
     */
    public static boolean isDigits(final String text) {
        return DIGITS.matcher(text).matches();
    }

    /**
     * Reads {@code input} until it reports end of input and returns everything
     * read.
     *
     * @param input the source of characters
     * @return a builder holding all characters read
     * @throws IOException if reading from {@code input} fails
     */
    // TODO(chrisn): See if working around java.io.Reader#read(CharBuffer)
    // overhead is worthwhile
    public static StringBuilder toStringBuilder(Readable input) throws IOException {
        StringBuilder text = new StringBuilder();
        CharBuffer buffer = CharBuffer.allocate(BUFFER_SIZE);
        while (true) {
            int n = input.read(buffer);
            if (n == -1) {
                break;
            }
            buffer.flip();
            text.append(buffer, 0, n);
            // Reset position and limit so the next read gets the full capacity.
            // Without this the limit stays at the size of the previous read and
            // a zero-length read would leave no room for any later read.
            buffer.clear();
        }
        return text;
    }

    /**
     * Wraps the bytes of {@code input}, encoded with the platform default
     * charset, in an in-memory stream.
     *
     * @param input the text to expose as a stream
     * @return a stream over the encoded bytes
     */
    public static InputStream toInputStream(String input) {
        return toInputStream(input, Charset.defaultCharset());
    }

    /**
     * Wraps the bytes of {@code input}, encoded with {@code cs}, in an
     * in-memory stream.
     *
     * @param input the text to expose as a stream
     * @param cs the charset used to encode {@code input}
     * @return a stream over the encoded bytes
     */
    public static InputStream toInputStream(String input, Charset cs) {
        return new ByteArrayInputStream(input.getBytes(cs));
    }

    /**
     * Parse a double and return its value.
     * Otherwise, throw a {@link NumberFormatException}.
     *
     * <p>In addition to what {@link Double#parseDouble(String)} accepts,
     * {@code "inf"}, {@code "infinity"} (each optionally preceded by
     * {@code '-'}) and {@code "nan"} are recognized case-insensitively.
     *
     * @param text the text to parse
     * @return the parsed value
     * @throws NumberFormatException if {@code text} is not a valid double
     */
    public static double parseDouble(final String text) throws NumberFormatException {
        // Infinity and NaN are handled here because Double.parseDouble() only
        // recognizes the exact spellings "Infinity" and "NaN".
        if (DOUBLE_INFINITY.matcher(text).matches()) {
            return text.startsWith("-") ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY;
        }
        if (text.equalsIgnoreCase("nan")) {
            return Double.NaN;
        }
        return Double.parseDouble(text);
    }

    /**
     * Parse a float and return its value.
     * Otherwise, throw a {@link NumberFormatException}.
     *
     * <p>In addition to what {@link Float#parseFloat(String)} accepts,
     * {@code "inf"}, {@code "infinity"} (each optionally preceded by
     * {@code '-'}) and {@code "nan"} are recognized case-insensitively, each
     * with an optional trailing {@code 'f'}.
     *
     * @param text the text to parse
     * @return the parsed value
     * @throws NumberFormatException if {@code text} is not a valid float
     */
    public static float parseFloat(final String text) throws NumberFormatException {
        // Infinity and NaN are handled here because Float.parseFloat() only
        // recognizes the exact spellings "Infinity" and "NaN".
        if (FLOAT_INFINITY.matcher(text).matches()) {
            return text.startsWith("-") ? Float.NEGATIVE_INFINITY : Float.POSITIVE_INFINITY;
        }
        if (FLOAT_NAN.matcher(text).matches()) {
            return Float.NaN;
        }
        return Float.parseFloat(text);
    }

    /**
     * Parse a boolean and return its value.
     * Otherwise, throw a {@link IllegalArgumentException}.
     *
     * <p>{@code "true"} / {@code "t"} (any case) and {@code "1"} yield
     * {@code true}; {@code "false"} / {@code "f"} (any case) and {@code "0"}
     * yield {@code false}.
     *
     * @param text the text to parse
     * @return the parsed value
     * @throws IllegalArgumentException if {@code text} is none of the accepted spellings
     */
    public static boolean parseBoolean(final String text) throws IllegalArgumentException {
        if (text.equalsIgnoreCase("true") || text.equalsIgnoreCase("t")
                || text.equals("1")) {
            return true;
        } else if (text.equalsIgnoreCase("false") || text.equalsIgnoreCase("f")
                || text.equals("0")) {
            return false;
        } else {
            throw new IllegalArgumentException("Expected \"true\" or \"false\".");
        }
    }

    /**
     * Parse a 32-bit signed integer from the text.  Unlike the Java standard
     * {@code Integer.parseInt()}, this function recognizes the prefixes "0x"
     * and "0" to signify hexadecimal and octal numbers, respectively.
     *
     * @throws NumberFormatException if the text is malformed or out of range
     */
    public static int parseInt32(final String text) throws NumberFormatException {
        return (int) parseInteger(text, true, false);
    }

    /**
     * Parse a 32-bit unsigned integer from the text.  Unlike the Java standard
     * {@code Integer.parseInt()}, this function recognizes the prefixes "0x"
     * and "0" to signify hexadecimal and octal numbers, respectively.  The
     * result is coerced to a (signed) {@code int} when returned since Java has
     * no unsigned integer type.
     *
     * @throws NumberFormatException if the text is malformed, negative or out of range
     */
    public static int parseUInt32(final String text) throws NumberFormatException {
        return (int) parseInteger(text, false, false);
    }

    /**
     * Parse a 64-bit signed integer from the text.  Unlike the Java standard
     * {@code Long.parseLong()}, this function recognizes the prefixes "0x"
     * and "0" to signify hexadecimal and octal numbers, respectively.
     *
     * @throws NumberFormatException if the text is malformed or out of range
     */
    public static long parseInt64(final String text) throws NumberFormatException {
        return parseInteger(text, true, true);
    }

    /**
     * Parse a 64-bit unsigned integer from the text.  Unlike the Java standard
     * {@code Long.parseLong()}, this function recognizes the prefixes "0x"
     * and "0" to signify hexadecimal and octal numbers, respectively.  The
     * result is coerced to a (signed) {@code long} when returned since Java has
     * no unsigned long type.
     *
     * @throws NumberFormatException if the text is malformed, negative or out of range
     */
    public static long parseUInt64(final String text) throws NumberFormatException {
        return parseInteger(text, false, true);
    }

    /**
     * Parses an integer written in decimal, hexadecimal ("0x" prefix) or octal
     * ("0" prefix) notation, optionally preceded by {@code '-'} when signed
     * values are allowed, and checks it against the requested width.
     *
     * @param text the text to parse
     * @param isSigned whether negative values are permitted; also selects the
     *        signed or unsigned range for the bounds check
     * @param isLong {@code true} to check against the 64-bit range,
     *        {@code false} for the 32-bit range
     * @return the parsed value; unsigned values above the signed maximum are
     *         returned as the {@code long} with the same low-order bits
     * @throws NumberFormatException if the text is malformed, is negative
     *         while {@code isSigned} is false, or is out of range
     */
    public static long parseInteger(final String text,
            final boolean isSigned, final boolean isLong) throws NumberFormatException {
        int pos = 0;

        boolean negative = false;
        if (text.startsWith("-", pos)) {
            if (!isSigned) {
                throw new NumberFormatException("Number must be positive: " + text);
            }
            ++pos;
            negative = true;
        }

        int radix = 10;
        if (text.startsWith("0x", pos)) {
            pos += 2;
            radix = 16;
        } else if (text.startsWith("0", pos)) {
            // The leading "0" is left in place; it is a valid octal digit.
            radix = 8;
        }

        final String numberText = text.substring(pos);

        // Long.parseLong() and BigInteger accept their own leading sign. Once a
        // "-" or "0x" prefix has been consumed, a further sign would let "--5"
        // parse as 5 and "0x-1" slip through as an "unsigned" -1, so reject it.
        if (pos > 0 && (numberText.startsWith("-") || numberText.startsWith("+"))) {
            throw new NumberFormatException("Invalid number: " + text);
        }

        long result = 0;
        if (numberText.length() < 16) {
            // Can safely assume no overflow.
            result = Long.parseLong(numberText, radix);
            if (negative) {
                result = -result;
            }

            // Check bounds.
            // No need to check for 64-bit numbers since they'd have to be 16 chars
            // or longer to overflow.
            if (!isLong) {
                if (isSigned) {
                    if (result > Integer.MAX_VALUE || result < Integer.MIN_VALUE) {
                        throw new NumberFormatException(
                            "Number out of range for 32-bit signed integer: " + text);
                    }
                } else {
                    if (result >= (1L << 32) || result < 0) {
                        throw new NumberFormatException(
                            "Number out of range for 32-bit unsigned integer: " + text);
                    }
                }
            }
        } else {
            BigInteger bigValue = new BigInteger(numberText, radix);
            if (negative) {
                bigValue = bigValue.negate();
            }

            // Check bounds. bitLength() excludes the sign bit, so a signed
            // N-bit value has a bit length of at most N - 1.
            if (!isLong) {
                if (isSigned) {
                    if (bigValue.bitLength() > 31) {
                        throw new NumberFormatException(
                            "Number out of range for 32-bit signed integer: " + text);
                    }
                } else {
                    if (bigValue.bitLength() > 32) {
                        throw new NumberFormatException(
                            "Number out of range for 32-bit unsigned integer: " + text);
                    }
                }
            } else {
                if (isSigned) {
                    if (bigValue.bitLength() > 63) {
                        throw new NumberFormatException(
                            "Number out of range for 64-bit signed integer: " + text);
                    }
                } else {
                    if (bigValue.bitLength() > 64) {
                        throw new NumberFormatException(
                            "Number out of range for 64-bit unsigned integer: " + text);
                    }
                }
            }

            result = bigValue.longValue();
        }

        return result;
    }

    /**
     * Escapes bytes in the format used in protocol buffer text format, which is the same as the
     * format used for C string literals. All bytes that are not printable 7-bit ASCII characters
     * are escaped, as well as backslash, single-quote, and double-quote characters. Characters for
     * which no defined short-hand escape sequence is defined will be escaped using 3-digit octal
     * sequences.
     *
     * @param input the bytes to escape
     * @return the escaped text, consisting only of printable ASCII characters
     */
    public static String escapeBytes(final ByteString input) {
        final StringBuilder builder = new StringBuilder(input.size());
        for (int i = 0; i < input.size(); i++) {
            final byte b = input.byteAt(i);
            switch (b) {
                // Java does not recognize \a or \v, apparently.
                case 0x07: builder.append("\\a" ); break;
                case '\b': builder.append("\\b" ); break;
                case '\f': builder.append("\\f" ); break;
                case '\n': builder.append("\\n" ); break;
                case '\r': builder.append("\\r" ); break;
                case '\t': builder.append("\\t" ); break;
                case 0x0b: builder.append("\\v" ); break;
                case '\\': builder.append("\\\\"); break;
                case '\'': builder.append("\\\'"); break;
                case '"' : builder.append("\\\""); break;
                default:
                    // Only 0x20 (space) through 0x7e (tilde) are printable ASCII.
                    // Bytes with the high-order bit set are negative because bytes
                    // are signed, so they fail the lower bound; 0x7f (DEL) fails
                    // the upper bound. Everything else is written as 3-digit octal.
                    if (b >= 0x20 && b <= 0x7e) {
                        builder.append((char) b);
                    } else {
                        builder.append('\\');
                        builder.append((char) ('0' + ((b >>> 6) & 3)));
                        builder.append((char) ('0' + ((b >>> 3) & 7)));
                        builder.append((char) ('0' + (b & 7)));
                    }
                    break;
            }
        }
        return builder.toString();
    }

    /**
     * Un-escape a byte sequence as escaped using
     * {@link #escapeBytes(com.google.protobuf.ByteString)}.  Two-digit hex escapes (starting with
     * "\x") are also recognized.
     *
     * <p>Octal escapes are not range-checked: a value above 0xFF is narrowed
     * to a single byte.
     *
     * @param charString the escaped text
     * @return the unescaped bytes
     * @throws IllegalArgumentException if the text ends with a lone backslash,
     *         contains "\x" without a hex digit, or contains an unknown escape
     */
    public static ByteString unescapeBytes(final CharSequence charString) {
        // First convert the Java character sequence to UTF-8 bytes.
        ByteString input = ByteString.copyFromUtf8(charString.toString());
        // Then unescape certain byte sequences introduced by ASCII '\\'.  The valid
        // escapes can all be expressed with ASCII characters, so it is safe to
        // operate on bytes here.
        //
        // Unescaping the input byte array will result in a byte sequence that's no
        // longer than the input.  That's because each escape sequence is between
        // two and four bytes long and stands for a single byte.
        final byte[] result = new byte[input.size()];
        int pos = 0;
        for (int i = 0; i < input.size(); i++) {
            byte c = input.byteAt(i);
            if (c == '\\') {
                if (i + 1 < input.size()) {
                    ++i;
                    c = input.byteAt(i);
                    if (isOctal((char) c)) {
                        // Octal escape: one to three digits.
                        int code = digitValue((char) c);
                        if (i + 1 < input.size() && isOctal((char) input.byteAt(i + 1))) {
                            ++i;
                            code = code * 8 + digitValue((char) input.byteAt(i));
                        }
                        if (i + 1 < input.size() && isOctal((char) input.byteAt(i + 1))) {
                            ++i;
                            code = code * 8 + digitValue((char) input.byteAt(i));
                        }
                        // TODO: Check that 0 <= code && code <= 0xFF.
                        result[pos++] = (byte) code;
                    } else {
                        switch (c) {
                            case 'a' : result[pos++] = 0x07; break;
                            case 'b' : result[pos++] = '\b'; break;
                            case 'f' : result[pos++] = '\f'; break;
                            case 'n' : result[pos++] = '\n'; break;
                            case 'r' : result[pos++] = '\r'; break;
                            case 't' : result[pos++] = '\t'; break;
                            case 'v' : result[pos++] = 0x0b; break;
                            case '\\': result[pos++] = '\\'; break;
                            case '\'': result[pos++] = '\''; break;
                            case '"' : result[pos++] = '\"'; break;

                            case 'x':
                                // Hex escape: one mandatory digit, one optional.
                                int code = 0;
                                if (i + 1 < input.size() && isHex((char) input.byteAt(i + 1))) {
                                    ++i;
                                    code = digitValue((char) input.byteAt(i));
                                } else {
                                    throw new IllegalArgumentException(
                                        "Invalid escape sequence: '\\x' with no digits");
                                }
                                if (i + 1 < input.size() && isHex((char) input.byteAt(i + 1))) {
                                    ++i;
                                    code = code * 16 + digitValue((char) input.byteAt(i));
                                }
                                result[pos++] = (byte) code;
                                break;

                            default:
                                throw new IllegalArgumentException(
                                    "Invalid escape sequence: '\\" + (char) c + '\'');
                        }
                    }
                } else {
                    throw new IllegalArgumentException(
                        "Invalid escape sequence: '\\' at end of string.");
                }
            } else {
                result[pos++] = c;
            }
        }

        return ByteString.copyFrom(result, 0, pos);
    }

}