BytesRef.java example

Explorer
indextank-engine-master
package com.flaptor.org.apache.lucene.util;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.Comparator;

/** Represents byte[], as a slice (offset + length) into an
 *  existing byte[].  The {@link #bytes} member should never be null;
 *  use {@link #EMPTY_BYTES} if necessary.
 *
 *  @lucene.experimental */
public final class BytesRef implements Comparable<BytesRef> {

  static final int HASH_PRIME = 31;
  public static final byte[] EMPTY_BYTES = new byte[0]; 

  /** The contents of the BytesRef. Should never be {@code null}. */
  public byte[] bytes;

  /** Offset of first valid byte. */
  public int offset;

  /** Length of used bytes. */
  public int length;

  public BytesRef() {
    bytes = EMPTY_BYTES;
  }

  /** This instance will directly reference bytes w/o making a copy.
   * bytes should not be null.
   */
  public BytesRef(byte[] bytes, int offset, int length) {
    assert bytes != null;
    this.bytes = bytes;
    this.offset = offset;
    this.length = length;
  }

  /** This instance will directly reference bytes w/o making a copy.
   * bytes should not be null */
  public BytesRef(byte[] bytes) {
    assert bytes != null;
    this.bytes = bytes;
    this.offset = 0;
    this.length = bytes.length;
  }

  public BytesRef(int capacity) {
    this.bytes = new byte[capacity];
  }

  /** Incoming IntsRef values must be Byte.MIN_VALUE -
   *  Byte.MAX_VALUE. */
  public BytesRef(IntsRef intsRef) {
    bytes = new byte[intsRef.length];
    for(int idx=0;idx<intsRef.length;idx++) {
      final int v = intsRef.ints[intsRef.offset + idx];
      assert v >= Byte.MIN_VALUE && v <= Byte.MAX_VALUE;
      bytes[idx] = (byte) v;
    }
    length = intsRef.length;
  }

  /**
   * @param text Initialize the byte[] from the UTF8 bytes
   * for the provided String.  This must be well-formed
   * unicode text, with no unpaired surrogates or U+FFFF.
   */
  public BytesRef(CharSequence text) {
    this();
    copy(text);
  }
  
  /**
   * @param text Initialize the byte[] from the UTF8 bytes
   * for the provided array.  This must be well-formed
   * unicode text, with no unpaired surrogates or U+FFFF.
   */
  public BytesRef(char text[], int offset, int length) {
    this(length * 4);
    copy(text, offset, length);
  }

  public BytesRef(BytesRef other) {
    this();
    copy(other);
  }

  /* // maybe?
  public BytesRef(BytesRef other, boolean shallow) {
    this();
    if (shallow) {
      offset = other.offset;
      length = other.length;
      bytes = other.bytes;
    } else {
      copy(other);
    }
  }
  */

  /**
   * Copies the UTF8 bytes for this string.
   * 
   * @param text Must be well-formed unicode text, with no
   * unpaired surrogates or invalid UTF16 code units.
   */
  public void copy(CharSequence text) {
    UnicodeUtil2.UTF16toUTF8(text, 0, text.length(), this);
  }

  /**
   * Copies the UTF8 bytes for this string.
   * 
   * @param text Must be well-formed unicode text, with no
   * unpaired surrogates or invalid UTF16 code units.
   */
  public void copy(char text[], int offset, int length) {
    UnicodeUtil2.UTF16toUTF8(text, offset, length, this);
  }
  
  public boolean bytesEquals(BytesRef other) {
    if (length == other.length) {
      int otherUpto = other.offset;
      final byte[] otherBytes = other.bytes;
      final int end = offset + length;
      for(int upto=offset;upto<end;upto++,otherUpto++) {
        if (bytes[upto] != otherBytes[otherUpto]) {
          return false;
        }
      }
      return true;
    } else {
      return false;
    }
  }

  @Override
  public Object clone() {
    return new BytesRef(this);
  }

  private boolean sliceEquals(BytesRef other, int pos) {
    if (pos < 0 || length - pos < other.length) {
      return false;
    }
    int i = offset + pos;
    int j = other.offset;
    final int k = other.offset + other.length;
    
    while (j < k) {
      if (bytes[i++] != other.bytes[j++]) {
        return false;
      }
    }
    
    return true;
  }
  
  public boolean startsWith(BytesRef other) {
    return sliceEquals(other, 0);
  }

  public boolean endsWith(BytesRef other) {
    return sliceEquals(other, length - other.length);
  }
  
  /** Calculates the hash code as required by TermsHash during indexing.
   * <p>It is defined as:
   * <pre>
   *  int hash = 0;
   *  for (int i = offset; i < offset + length; i++) {
   *    hash = 31*hash + bytes[i];
   *  }
   * </pre>
   */
  @Override
  public int hashCode() {
    int result = 0;
    final int end = offset + length;
    for(int i=offset;i<end;i++) {
      result = HASH_PRIME * result + bytes[i];
    }
    return result;
  }

  @Override
  public boolean equals(Object other) {
    if (other == null) {
      return false;
    }
    return this.bytesEquals((BytesRef) other);
  }

  /** Interprets stored bytes as UTF8 bytes, returning the
   *  resulting string */
  public String utf8ToString() {
    final CharsRef ref = new CharsRef(length);
    UnicodeUtil2.UTF8toUTF16(bytes, offset, length, ref);
    return ref.toString(); 
  }
  
  /** Interprets stored bytes as UTF8 bytes into the given {@link CharsRef} */
  public CharsRef utf8ToChars(CharsRef ref) {
    UnicodeUtil2.UTF8toUTF16(bytes, offset, length, ref);
    return ref;
  }

  /** Returns hex encoded bytes, eg [0x6c 0x75 0x63 0x65 0x6e 0x65] */
  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder();
    sb.append('[');
    final int end = offset + length;
    for(int i=offset;i<end;i++) {
      if (i > offset) {
        sb.append(' ');
      }
      sb.append(Integer.toHexString(bytes[i]&0xff));
    }
    sb.append(']');
    return sb.toString();
  }

  /**
   * Copies the given {@link BytesRef}
   * <p>
   * NOTE: this method resets the offset to 0 and resizes the reference array
   * if needed.
   */
  public void copy(BytesRef other) {
    if (bytes.length < other.length) {
      bytes = new byte[other.length];
    }
    System.arraycopy(other.bytes, other.offset, bytes, 0, other.length);
    length = other.length;
    offset = 0;
  }

  /**
   * Copies the given long value and encodes it as 8 byte Big-Endian.
   * <p>
   * NOTE: this method resets the offset to 0, length to 8 and resizes the reference array
   * if needed.
   */
  public void copy(long value) {
    if (bytes.length < 8) {
      bytes = new byte[8];
    }
    copyInternal((int) (value >> 32), offset = 0);
    copyInternal((int) value, 4);
    length = 8;
  }
  
  /**
   * Copies the given int value and encodes it as 4 byte Big-Endian.
   * <p>
   * NOTE: this method resets the offset to 0, length to 4 and resizes the reference array
   * if needed.
   */
  public void copy(int value) {
    if (bytes.length < 4) {
      bytes = new byte[4];
    }
    copyInternal(value, offset = 0);
    length = 4;
  }

  /**
   * Copies the given short value and encodes it as a 2 byte Big-Endian.
   * <p>
   * NOTE: this method resets the offset to 0, length to 2 and resizes the reference array
   * if needed.
   */
  public void copy(short value) {
    if (bytes.length < 2) {
      bytes = new byte[2];
    }
    bytes[offset] = (byte) (value >> 8);
    bytes[offset + 1] = (byte) (value);

  }
  
  /**
   * Converts 2 consecutive bytes from the current offset to a short. Bytes are
   * interpreted as Big-Endian (most significant bit first)
   * <p>
   * NOTE: this method does <b>NOT</b> check the bounds of the referenced array.
   */
  public short asShort() {
    int pos = offset;
    return (short) (0xFFFF & ((bytes[pos++] & 0xFF) << 8) | (bytes[pos] & 0xFF));
  }

  /**
   * Converts 4 consecutive bytes from the current offset to an int. Bytes are
   * interpreted as Big-Endian (most significant bit first)
   * <p>
   * NOTE: this method does <b>NOT</b> check the bounds of the referenced array.
   */
  public int asInt() {
    return asIntInternal(offset);
  }

  /**
   * Converts 8 consecutive bytes from the current offset to a long. Bytes are
   * interpreted as Big-Endian (most significant bit first)
   * <p>
   * NOTE: this method does <b>NOT</b> check the bounds of the referenced array.
   */
  public long asLong() {
    return (((long) asIntInternal(offset) << 32) | asIntInternal(offset + 4) & 0xFFFFFFFFL);
  }

  private void copyInternal(int value, int startOffset) {
    bytes[startOffset] = (byte) (value >> 24);
    bytes[startOffset + 1] = (byte) (value >> 16);
    bytes[startOffset + 2] = (byte) (value >> 8);
    bytes[startOffset + 3] = (byte) (value);
  }

  private int asIntInternal(int pos) {
    return ((bytes[pos++] & 0xFF) << 24) | ((bytes[pos++] & 0xFF) << 16)
        | ((bytes[pos++] & 0xFF) << 8) | (bytes[pos] & 0xFF);
  }

  public void append(BytesRef other) {
    int newLen = length + other.length;
    if (bytes.length < newLen) {
      byte[] newBytes = new byte[newLen];
      System.arraycopy(bytes, offset, newBytes, 0, length);
      offset = 0;
      bytes = newBytes;
    }
    System.arraycopy(other.bytes, other.offset, bytes, length+offset, other.length);
    length = newLen;
  }

  public void grow(int newLength) {
    bytes = ArrayUtil2.grow(bytes, newLength);
  }

  /** Unsigned byte order comparison */
  public int compareTo(BytesRef other) {
    if (this == other) return 0;

    final byte[] aBytes = this.bytes;
    int aUpto = this.offset;
    final byte[] bBytes = other.bytes;
    int bUpto = other.offset;

    final int aStop = aUpto + Math.min(this.length, other.length);

    while(aUpto < aStop) {
      int aByte = aBytes[aUpto++] & 0xff;
      int bByte = bBytes[bUpto++] & 0xff;
      int diff = aByte - bByte;
      if (diff != 0) return diff;
    }

    // One is a prefix of the other, or, they are equal:
    return this.length - other.length;
  }
  
  private final static Comparator<BytesRef> utf8SortedAsUnicodeSortOrder = new UTF8SortedAsUnicodeComparator();

  public static Comparator<BytesRef> getUTF8SortedAsUnicodeComparator() {
    return utf8SortedAsUnicodeSortOrder;
  }

  private static class UTF8SortedAsUnicodeComparator implements Comparator<BytesRef> {
    // Only singleton
    private UTF8SortedAsUnicodeComparator() {};

    public int compare(BytesRef a, BytesRef b) {
      final byte[] aBytes = a.bytes;
      int aUpto = a.offset;
      final byte[] bBytes = b.bytes;
      int bUpto = b.offset;
      
      final int aStop;
      if (a.length < b.length) {
        aStop = aUpto + a.length;
      } else {
        aStop = aUpto + b.length;
      }

      while(aUpto < aStop) {
        int aByte = aBytes[aUpto++] & 0xff;
        int bByte = bBytes[bUpto++] & 0xff;

        int diff = aByte - bByte;
        if (diff != 0) {
          return diff;
        }
      }

      // One is a prefix of the other, or, they are equal:
      return a.length - b.length;
    }    
  }

  private final static Comparator<BytesRef> utf8SortedAsUTF16SortOrder = new UTF8SortedAsUTF16Comparator();

  public static Comparator<BytesRef> getUTF8SortedAsUTF16Comparator() {
    return utf8SortedAsUTF16SortOrder;
  }

  private static class UTF8SortedAsUTF16Comparator implements Comparator<BytesRef> {
    // Only singleton
    private UTF8SortedAsUTF16Comparator() {};

    public int compare(BytesRef a, BytesRef b) {

      final byte[] aBytes = a.bytes;
      int aUpto = a.offset;
      final byte[] bBytes = b.bytes;
      int bUpto = b.offset;
      
      final int aStop;
      if (a.length < b.length) {
        aStop = aUpto + a.length;
      } else {
        aStop = aUpto + b.length;
      }

      while(aUpto < aStop) {
        int aByte = aBytes[aUpto++] & 0xff;
        int bByte = bBytes[bUpto++] & 0xff;

        if (aByte != bByte) {

          // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order

          // We know the terms are not equal, but, we may
          // have to carefully fixup the bytes at the
          // difference to match UTF16's sort order:
          
          // NOTE: instead of moving supplementary code points (0xee and 0xef) to the unused 0xfe and 0xff, 
          // we move them to the unused 0xfc and 0xfd [reserved for future 6-byte character sequences]
          // this reserves 0xff for preflex's term reordering (surrogate dance), and if unicode grows such
          // that 6-byte sequences are needed we have much bigger problems anyway.
          if (aByte >= 0xee && bByte >= 0xee) {
            if ((aByte & 0xfe) == 0xee) {
              aByte += 0xe;
            }
            if ((bByte&0xfe) == 0xee) {
              bByte += 0xe;
            }
          }
          return aByte - bByte;
        }
      }

      // One is a prefix of the other, or, they are equal:
      return a.length - b.length;
    }
  }
}