ByteBloomFilter.java example

Explorer
SecureBase-master
- hbase-0.92.1
  - security
    - src
      - main
        java
        org
        apache
        hadoop
        hbase
        ipc
        SecureClient.java
        SecureConnectionHeader.java
        SecureRpcEngine.java
        SecureServer.java
        security
        AccessDeniedException.java
        HBasePolicyProvider.java
        HBaseSaslRpcClient.java
        HBaseSaslRpcServer.java
        access
        AccessControlFilter.java
        AccessControlLists.java
        AccessController.java
        AccessControllerProtocol.java
        Permission.java
        TableAuthManager.java
        TablePermission.java
        UserPermission.java
        ZKPermissionWatcher.java
        token
        AuthenticationKey.java
        AuthenticationProtocol.java
        AuthenticationTokenIdentifier.java
        AuthenticationTokenSecretManager.java
        AuthenticationTokenSelector.java
        TokenProvider.java
        TokenUtil.java
        ZKSecretWatcher.java
      - test
        java
        org
        apache
        hadoop
        hbase
        security
        access
        SecureTestUtil.java
        TestAccessControlFilter.java
        TestAccessController.java
        TestTablePermissions.java
        TestZKPermissionsWatcher.java
        token
        TestTokenAuthentication.java
        TestZKSecretWatcher.java
  - src
/*
 * Copyright 2010 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hbase.util;

import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.text.NumberFormat;
import java.util.Random;

/**
 * Implements a <i>Bloom filter</i>, as defined by Bloom in 1970.
 * <p>
 * The Bloom filter is a data structure that was introduced in 1970 and that has
 * been adopted by the networking research community in the past decade thanks
 * to the bandwidth efficiencies that it offers for the transmission of set
 * membership information between networked hosts. A sender encodes the
 * information into a bit vector, the Bloom filter, that is more compact than a
 * conventional representation. Computation and space costs for construction are
 * linear in the number of elements. The receiver uses the filter to test
 * whether various elements are members of the set. Though the filter will
 * occasionally return a false positive, it will never return a false negative.
 * When creating the filter, the sender can choose its desired point in a
 * trade-off between the false positive rate and the size.
 *
 * <p>
 * Originally inspired by <a href="http://www.one-lab.org">European Commission
 * One-Lab Project 034819</a>.
 *
 * Bloom filters are very sensitive to the number of elements inserted into
 * them. For HBase, the number of entries depends on the size of the data stored
 * in the column. Currently the default region size is 256MB, so entry count ~=
 * 256MB / (average value size for column). Despite this rule of thumb, there is
 * no efficient way to calculate the entry count after compactions. Therefore,
 * it is often easier to use a dynamic bloom filter that will add extra space
 * instead of allowing the error rate to grow.
 *
 * ( http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/BloomFilterSurvey
 * .pdf )
 *
 * Notation: m denotes the number of bits in the Bloom filter (bitSize); n
 * denotes the number of elements inserted into the Bloom filter (maxKeys); k
 * represents the number of hash functions used (nbHash); e represents the
 * desired false positive rate for the bloom (err).
 *
 * If we fix the error rate (e) and know the number of entries, then the optimal
 * bloom size m = -(n * ln(err)) / (ln(2)^2) ~= n * ln(err) / ln(0.6185)
 *
 * The probability of false positives is minimized when k = (m/n) * ln(2).
 *
 * @see BloomFilter The general behavior of a filter
 *
 * @see <a
 *      href="http://portal.acm.org/citation.cfm?id=362692&dl=ACM&coll=portal">
 *      Space/Time Trade-Offs in Hash Coding with Allowable Errors</a>
 */
public class ByteBloomFilter implements BloomFilter, BloomFilterWriter {

  /**
   * Current file format version. This is the first int emitted by the meta
   * data writer of this class.
   */
  public static final int VERSION = 1;

  /**
   * Bytes (B) in the array. Declared as a long, but it actually has to fit
   * into an int: {@code sanityCheck()} rejects larger values and the bit array
   * is allocated with an int-sized capacity.
   */
  protected long byteSize;
  /** Number of hash functions, i.e. bits set/probed per key */
  protected int hashCount;
  /** Hash type identifier, resolved to {@link #hash} via {@code Hash.getInstance} */
  protected final int hashType;
  /** Hash function implementation selected by {@link #hashType} */
  protected final Hash hash;
  /** Keys currently in the bloom; incremented on every add */
  protected int keyCount;
  /** Max Keys expected for the bloom */
  protected int maxKeys;
  /** Bloom bits; stays null until allocated */
  protected ByteBuffer bloom;

  /** Record separator for the Bloom filter statistics human-readable string */
  public static final String STATS_RECORD_SEP = "; ";

  /**
   * Used in computing the optimal Bloom filter size. This approximately equals
   * 0.480453.
   */
  public static final double LOG2_SQUARED = Math.log(2) * Math.log(2);

  /**
   * A random number generator to use for "fake lookups" when testing to
   * estimate the ideal false positive rate. Null means production mode; it is
   * only assigned by {@code setFakeLookupMode(boolean)}.
   */
  private static Random randomGeneratorForTest;

  /**
   * Bit-value lookup array to prevent doing the same work over and over.
   * Entry i is the single-bit mask with bit i set (1 << i).
   */
  private static final byte [] bitvals = {
    (byte) 0x01,
    (byte) 0x02,
    (byte) 0x04,
    (byte) 0x08,
    (byte) 0x10,
    (byte) 0x20,
    (byte) 0x40,
    (byte) 0x80
  };

  /**
   * Loads bloom filter meta data from file input.
   * @param meta stored bloom meta data
   * @throws IllegalArgumentException meta data is invalid
   */
  public ByteBloomFilter(DataInput meta)
      throws IOException, IllegalArgumentException {
    this.byteSize = meta.readInt();
    this.hashCount = meta.readInt();
    this.hashType = meta.readInt();
    this.keyCount = meta.readInt();
    this.maxKeys = this.keyCount;

    this.hash = Hash.getInstance(this.hashType);
    if (hash == null) {
      throw new IllegalArgumentException("Invalid hash type: " + hashType);
    }
    sanityCheck();
  }

  /**
   * Computes how many bits a Bloom filter needs for a given capacity and
   * target false positive rate: {@code ceil(maxKeys * -ln(errorRate) / ln(2)^2)}.
   *
   * @param maxKeys number of keys the filter should hold
   * @param errorRate target false positive rate
   * @return the number of bits for a Bloom filter than can hold the given
   *         number of keys and provide the given error rate, assuming that the
   *         optimal number of hash functions is used and it does not have to
   *         be an integer.
   */
  public static long computeBitSize(long maxKeys, double errorRate) {
    final double bitsPerKey = -Math.log(errorRate) / LOG2_SQUARED;
    final double exactBits = maxKeys * bitsPerKey;
    return (long) Math.ceil(exactBits);
  }

  /**
   * The maximum number of keys we can put into a Bloom filter of a certain
   * size to maintain the given error rate, assuming the number of hash
   * functions is chosen optimally and does not even have to be an integer
   * (hence the "ideal" in the function name). This is the inverse of
   * {@link #computeBitSize(long, double)}, rounded down.
   *
   * @param bitSize number of bits in the filter
   * @param errorRate target false positive rate
   * @return maximum number of keys that can be inserted into the Bloom filter
   * @see #computeMaxKeys(long, double, int) for a more precise estimate
   */
  public static long idealMaxKeys(long bitSize, double errorRate) {
    final double keysPerBit = LOG2_SQUARED / -Math.log(errorRate);
    // The cast truncates the fraction on purpose: rounding up could admit
    // more keys than the target error rate allows.
    return (long) (bitSize * keysPerBit);
  }

  /**
   * The maximum number of keys we can put into a Bloom filter of a certain
   * size to get the given error rate, with the given number of hash functions.
   * Evaluates {@code -(bitSize / hashCount) * ln(1 - errorRate^(1/hashCount))}
   * and drops the fractional part.
   *
   * @param bitSize number of bits in the filter
   * @param errorRate target false positive rate
   * @param hashCount number of hash functions actually used
   * @return the maximum number of keys that can be inserted in a Bloom filter
   *         to maintain the target error rate, if the number of hash functions
   *         is provided.
   */
  public static long computeMaxKeys(long bitSize, double errorRate,
      int hashCount) {
    // errorRate^(1/hashCount), computed through logarithms
    final double perHashRate = Math.exp(Math.log(errorRate) / hashCount);
    final double negativeBitsPerHash = -bitSize * 1.0 / hashCount;
    return (long) (negativeBitsPerHash * Math.log(1 - perHashRate));
  }

  /**
   * Computes the error rate for this Bloom filter, taking into account the
   * actual number of hash functions and keys inserted. The return value of
   * this function changes as a Bloom filter is being populated. Used for
   * reporting the actual error rate of compound Bloom filters when writing
   * them out.
   *
   * @return error rate for this particular Bloom filter
   */
  public double actualErrorRate() {
    final long bitCount = byteSize * 8;
    return actualErrorRate(keyCount, bitCount, hashCount);
  }

  /**
   * Computes the actual error rate for the given number of elements, number
   * of bits, and number of hash functions. Taken directly from the
   * <a href=
   * "http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives"
   * > Wikipedia Bloom filter article</a>: {@code (1 - e^(-k*n/m))^k}, with the
   * outer power evaluated through logarithms.
   *
   * @param maxKeys number of keys (n)
   * @param bitSize number of bits (m)
   * @param functionCount number of hash functions (k)
   * @return the actual error rate
   */
  public static double actualErrorRate(long maxKeys, long bitSize,
      int functionCount) {
    final double exponent = -functionCount * maxKeys * 1.0 / bitSize;
    final double bitSetProbability = 1 - Math.exp(exponent);
    return Math.exp(Math.log(bitSetProbability) * functionCount);
  }

  /**
   * Converts a bit size into a byte size (rounding up to whole bytes) and then
   * increases that byte size until it can be folded by the given factor, i.e.
   * until its low {@code foldFactor} bits are all zero.
   *
   * @param bitSize number of bits the filter must be able to hold
   * @param foldFactor number of times the resulting array must be halvable
   * @return Foldable byte size
   * @throws IllegalArgumentException if the resulting byte size does not fit
   *         into an int
   */
  public static int computeFoldableByteSize(long bitSize, int foldFactor) {
    long foldableBytes = (bitSize + 7) / 8;
    final int lowBits = (1 << foldFactor) - 1;
    if ((lowBits & foldableBytes) != 0) {
      // Not yet a multiple of 2^foldFactor: round up to the next one.
      foldableBytes = ((foldableBytes >> foldFactor) + 1) << foldFactor;
    }
    if (foldableBytes > Integer.MAX_VALUE) {
      throw new IllegalArgumentException("byteSize=" + foldableBytes + " too "
          + "large for bitSize=" + bitSize + ", foldFactor=" + foldFactor);
    }
    return (int) foldableBytes;
  }

  /**
   * Picks the number of hash functions for the given capacity and bit size as
   * {@code ceil(ln(2) * (bitSize / maxKeys))}.
   * <p>
   * NOTE(review): {@code bitSize / maxKeys} is an integer division, so the
   * bits-per-key ratio is truncated before it is scaled, and a
   * {@code maxKeys} of zero raises {@link ArithmeticException}.
   *
   * @param maxKeys expected number of keys
   * @param bitSize number of bits in the filter
   * @return number of hash functions to use
   */
  private static int optimalFunctionCount(int maxKeys, long bitSize) {
    final long wholeBitsPerKey = bitSize / maxKeys;
    return (int) Math.ceil(Math.log(2) * wholeBitsPerKey);
  }

  /**
   * Private constructor used by other constructors and factory methods.
   * Records the hash type and resolves the matching hash function; all other
   * fields keep their defaults. No validation happens here, so {@code hash}
   * may be null for an unknown type until {@code sanityCheck()} is run by the
   * caller.
   *
   * @param hashType type of hash function to use
   */
  private ByteBloomFilter(int hashType) {
    this.hashType = hashType;
    this.hash = Hash.getInstance(hashType);
  }

  /**
   * Determines and initializes bloom filter meta data from user config. Call
   * {@link #allocBloom()} to allocate bloom filter data.
   *
   * @param maxKeys Maximum expected number of keys that will be stored in this
   *          bloom
   * @param errorRate Desired false positive error rate. Lower rate = more
   *          storage required
   * @param hashType Type of hash function to use
   * @param foldFactor When finished adding entries, you may be able to 'fold'
   *          this bloom to save space. Tradeoff potentially excess bytes in
   *          bloom for ability to fold if keyCount is exponentially greater
   *          than maxKeys.
   * @throws IllegalArgumentException if the derived configuration fails the
   *           sanity check or the byte size does not fit into an int
   */
  public ByteBloomFilter(int maxKeys, double errorRate, int hashType,
      int foldFactor) throws IllegalArgumentException {
    this(hashType);

    final long requiredBits = computeBitSize(maxKeys, errorRate);
    this.hashCount = optimalFunctionCount(maxKeys, requiredBits);
    this.maxKeys = maxKeys;

    // Pad the byte size so the array can later be halved foldFactor times.
    this.byteSize = computeFoldableByteSize(requiredBits, foldFactor);

    sanityCheck();
  }

  /**
   * Creates a Bloom filter of the given size.
   * <p>
   * The capacity ({@code maxKeys}) and hash function count are derived from
   * the final, fold-adjusted size and the requested error rate. The bit array
   * is not allocated and no sanity check is run on the result.
   * <p>
   * NOTE(review): a hint so small that {@link #idealMaxKeys(long, double)}
   * yields 0 makes the hash-count computation divide by zero.
   *
   * @param byteSizeHint the desired number of bytes for the Bloom filter bit
   *          array. Will be increased so that folding is possible.
   * @param errorRate target false positive rate of the Bloom filter
   * @param hashType Bloom filter hash function type
   * @param foldFactor number of times the bit array must be halvable; the byte
   *          size is rounded up to a multiple of 2^foldFactor
   * @return the new Bloom filter of the desired size
   * @throws IllegalArgumentException if the adjusted byte size does not fit
   *           into an int
   */
  public static ByteBloomFilter createBySize(int byteSizeHint,
      double errorRate, int hashType, int foldFactor) {
    ByteBloomFilter bbf = new ByteBloomFilter(hashType);

    // Multiply as long: "byteSizeHint * 8" in int arithmetic wraps around for
    // hints above 2^28 bytes before the value is widened.
    bbf.byteSize = computeFoldableByteSize(byteSizeHint * 8L, foldFactor);
    long bitSize = bbf.byteSize * 8;
    bbf.maxKeys = (int) idealMaxKeys(bitSize, errorRate);
    bbf.hashCount = optimalFunctionCount(bbf.maxKeys, bitSize);

    // Adjust max keys to bring error rate closer to what was requested,
    // because byteSize was adjusted to allow for folding, and hashCount was
    // rounded.
    bbf.maxKeys = (int) computeMaxKeys(bitSize, errorRate, bbf.hashCount);

    return bbf;
  }

  /**
   * Creates another similar Bloom filter. Does not copy the actual bits, and
   * leaves the new filter's key count at zero. Hash type, byte size, hash
   * function count and capacity are carried over.
   *
   * @return a Bloom filter with the same configuration as this
   */
  public ByteBloomFilter createAnother() {
    final ByteBloomFilter sibling = new ByteBloomFilter(this.hashType);
    sibling.maxKeys = this.maxKeys;
    sibling.hashCount = this.hashCount;
    sibling.byteSize = this.byteSize;
    return sibling;
  }

  /**
   * Allocates the zeroed bit array of {@code byteSize} bytes. May be called
   * only once per filter.
   *
   * @throws IllegalArgumentException if the bit array already exists
   */
  @Override
  public void allocBloom() {
    if (bloom != null) {
      throw new IllegalArgumentException("can only create bloom once.");
    }
    final int capacity = (int) byteSize;
    bloom = ByteBuffer.allocate(capacity);
    // Other methods reach into the backing array directly.
    assert bloom.hasArray();
  }

  /**
   * Validates the filter's meta data: the byte size must be in
   * (0, Integer.MAX_VALUE], at least one hash function is required, the hash
   * implementation must have been resolved, and the key count must not be
   * negative.
   *
   * @throws IllegalArgumentException if any of these conditions is violated
   */
  void sanityCheck() throws IllegalArgumentException {
    final boolean byteSizeInRange =
        byteSize > 0 && byteSize <= Integer.MAX_VALUE;
    if (!byteSizeInRange) {
      throw new IllegalArgumentException("Invalid byteSize: " + this.byteSize);
    }

    if (hashCount <= 0) {
      throw new IllegalArgumentException("Hash function count must be > 0");
    }

    if (hash == null) {
      throw new IllegalArgumentException("hashType must be known");
    }

    // Zero keys is acceptable; only a negative count is rejected.
    if (keyCount < 0) {
      throw new IllegalArgumentException("must have positive keyCount");
    }
  }

  /**
   * Verifies that the given buffer's limit equals the configured byte size.
   *
   * @param bloom buffer holding Bloom filter bits
   * @throws IllegalArgumentException if the limit differs from
   *           {@code byteSize}
   */
  void bloomCheck(ByteBuffer bloom)  throws IllegalArgumentException {
    final boolean lengthMatches = bloom.limit() == this.byteSize;
    if (!lengthMatches) {
      throw new IllegalArgumentException(
          "Configured bloom length should match actual length");
    }
  }

  /**
   * Adds the whole byte array as one key; shorthand for
   * {@code add(buf, 0, buf.length)}.
   *
   * @param buf key bytes
   */
  public void add(byte [] buf) {
    add(buf, 0, buf.length);
  }

  /**
   * Adds a key to the filter: sets {@code hashCount} bits derived from the
   * key's hash and increments the key count. The bit array must have been
   * allocated beforehand.
   *
   * @param buf buffer containing the key
   * @param offset start of the key within {@code buf}
   * @param len length of the key in bytes
   */
  @Override
  public void add(byte [] buf, int offset, int len) {
    /*
     * For faster hashing, use combinatorial generation
     * http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/esa06.pdf
     *
     * Only two real hashes are computed; probe i uses base + i * step.
     */
    final int baseHash = this.hash.hash(buf, offset, len, 0);
    final int stepHash = this.hash.hash(buf, offset, len, baseHash);
    final long bitCount = this.byteSize * 8;

    for (int probe = 0; probe < this.hashCount; probe++) {
      // int arithmetic on purpose: wrap-around matches the lookup side.
      final int combinedHash = baseHash + probe * stepHash;
      set(Math.abs(combinedHash % bitCount));
    }

    ++this.keyCount;
  }

  /**
   * Should only be used in tests. Looks up the whole array as a key in this
   * filter's own bit array.
   *
   * @param buf key bytes
   * @return false if the key is definitely absent, true if it may be present
   */
  boolean contains(byte [] buf) {
    return contains(buf, 0, buf.length, this.bloom);
  }

  /**
   * Should only be used in tests. Looks up a slice of the array as a key in
   * this filter's own bit array.
   *
   * @param buf buffer containing the key
   * @param offset start of the key within {@code buf}
   * @param length length of the key in bytes
   * @return false if the key is definitely absent, true if it may be present
   */
  boolean contains(byte [] buf, int offset, int length) {
    return contains(buf, offset, length, bloom);
  }

  /**
   * Should only be used in tests. Looks up the whole array as a key in the
   * supplied bit buffer.
   *
   * @param buf key bytes
   * @param bloom bit buffer to probe
   * @return false if the key is definitely absent, true if it may be present
   */
  boolean contains(byte[] buf, ByteBuffer bloom) {
    return contains(buf, 0, buf.length, bloom);
  }

  /**
   * Checks whether a key may be present, probing either the supplied buffer
   * or, when none is supplied, this filter's own bit array.
   *
   * @param buf buffer containing the key
   * @param offset start of the key within {@code buf}
   * @param length length of the key in bytes
   * @param theBloom bit buffer to probe, or null to use the internal one
   * @return false if the key is definitely absent, true if it may be present
   * @throws IllegalArgumentException if the probed buffer's limit differs from
   *           the configured byte size
   */
  @Override
  public boolean contains(byte[] buf, int offset, int length,
      ByteBuffer theBloom) {
    // In a version 1 HFile Bloom filter data is stored in a separate meta
    // block which is loaded on demand, but in version 2 it is pre-loaded.
    // We want to use the same API in both cases.
    final ByteBuffer bits = (theBloom != null) ? theBloom : bloom;

    final int actualSize = bits.limit();
    if (actualSize != byteSize) {
      throw new IllegalArgumentException("Bloom does not match expected size:"
          + " theBloom.limit()=" + actualSize + ", byteSize=" + byteSize);
    }

    final byte[] backingArray = bits.array();
    final int backingOffset = bits.arrayOffset();
    return contains(buf, offset, length, backingArray, backingOffset,
        (int) byteSize, hash, hashCount);
  }

  /**
   * Checks whether a key may be present in a Bloom filter stored in a plain
   * byte array.
   * <p>
   * In production mode the same {@code hashCount} bit positions are probed
   * that {@code add} sets for this key. When fake lookup mode is enabled (see
   * {@code setFakeLookupMode}), random positions are probed instead.
   *
   * @param buf buffer containing the key
   * @param offset start of the key within {@code buf}
   * @param length length of the key in bytes
   * @param bloomArray array holding the Bloom filter bits
   * @param bloomOffset index in {@code bloomArray} of the first filter byte
   * @param bloomSize size of the Bloom filter in bytes
   * @param hash hash function the filter was built with
   * @param hashCount number of bits to probe per key
   * @return false if the key is definitely absent, true if it may be present
   */
  public static boolean contains(byte[] buf, int offset, int length,
      byte[] bloomArray, int bloomOffset, int bloomSize, Hash hash,
      int hashCount) {

    int hash1 = hash.hash(buf, offset, length, 0);
    int hash2 = hash.hash(buf, offset, length, hash1);
    // Computed as long: "bloomSize * 8" in int arithmetic overflows for
    // filters above 2^28 bytes, which would make lookups probe different bits
    // than add() sets (add() already uses a long modulus).
    long bloomBitSize = bloomSize * 8L;

    // Read the shared generator once so the mode cannot change mid-lookup.
    Random fakeLookupRandom = randomGeneratorForTest;

    if (fakeLookupRandom == null) {
      // Production mode.
      for (int i = 0; i < hashCount; i++) {
        long hashLoc = Math.abs((hash1 + i * hash2) % bloomBitSize);
        if (!get(hashLoc, bloomArray, bloomOffset)) {
          return false;
        }
      }
    } else {
      // Test mode with "fake lookups" to estimate "ideal false positive rate".
      for (int i = 0; i < hashCount; i++) {
        // nextInt keeps the established sequence whenever the bit count fits
        // into an int; only larger filters need the double-based draw.
        long hashLoc = bloomBitSize <= Integer.MAX_VALUE
            ? fakeLookupRandom.nextInt((int) bloomBitSize)
            : (long) (fakeLookupRandom.nextDouble() * bloomBitSize);
        if (!get(hashLoc, bloomArray, bloomOffset)) {
          return false;
        }
      }
    }

    return true;
  }

  //---------------------------------------------------------------------------
  /** Private helpers */

  /**
   * Set the bit at the specified index to 1. Bit {@code pos} lives in byte
   * {@code pos / 8} of the bit array, at bit {@code pos % 8} within that byte.
   *
   * @param pos index of bit
   */
  void set(long pos) {
    final int byteIndex = (int) (pos / 8);
    final int bitIndex = (int) (pos % 8);
    final byte updated = (byte) (bloom.get(byteIndex) | bitvals[bitIndex]);
    bloom.put(byteIndex, updated);
  }

  /**
   * Check if bit at specified index is 1.
   *
   * @param pos index of bit
   * @param bloomArray array holding the Bloom filter bits
   * @param bloomOffset index in {@code bloomArray} of the first filter byte
   * @return true if bit at specified index is 1, false if 0.
   */
  static boolean get(long pos, byte[] bloomArray, int bloomOffset) {
    final int byteIndex = (int) (pos / 8);
    final int bitIndex = (int) (pos % 8);
    final int masked = bloomArray[bloomOffset + byteIndex] & bitvals[bitIndex];
    return masked != 0;
  }

  /** @return number of keys added so far, or the stored count when loaded from meta data */
  @Override
  public long getKeyCount() {
    return keyCount;
  }

  /** @return the maximum number of keys this filter is configured for */
  @Override
  public long getMaxKeys() {
    return maxKeys;
  }

  /** @return size of the bit array in bytes */
  @Override
  public long getByteSize() {
    return byteSize;
  }

  /** @return the hash type identifier this filter was created with */
  public int getHashType() {
    return hashType;
  }

  /**
   * Shrinks ("folds") the bit array when far fewer keys were added than the
   * filter was sized for.
   * <p>
   * While the byte size is even and the capacity is more than twice the key
   * count, both are halved. If at least one halving happened, every higher
   * piece of the backing array is OR-ed into the first piece and the buffer is
   * re-sliced to the new size; {@code byteSize} and {@code maxKeys} are
   * updated to match. Nothing happens when no keys were added or the buffer
   * has no accessible backing array.
   */
  @Override
  public void compactBloom() {
    // see if the actual size is exponentially smaller than expected.
    if (this.keyCount <= 0 || !this.bloom.hasArray()) {
      return;
    }

    // Widened to long: "keyCount << 1" wraps negative in int arithmetic once
    // keyCount reaches 2^30, which would let the loop below over-fold.
    final long twiceKeyCount = ((long) this.keyCount) << 1;

    int pieces = 1;
    int newByteSize = (int) this.byteSize;
    int newMaxKeys = this.maxKeys;

    // while exponentially smaller & folding is lossless
    while ((newByteSize & 1) == 0 && newMaxKeys > twiceKeyCount) {
      pieces <<= 1;
      newByteSize >>= 1;
      newMaxKeys >>= 1;
    }

    // if we should fold these into pieces
    if (pieces > 1) {
      byte[] array = this.bloom.array();
      int start = this.bloom.arrayOffset();
      int end = start + newByteSize;
      // "off" walks through pieces 1..pieces-1 back to back, while "pos"
      // restarts at the beginning of piece 0 for every piece.
      int off = end;
      for (int p = 1; p < pieces; ++p) {
        for (int pos = start; pos < end; ++pos) {
          array[pos] |= array[off++];
        }
      }
      // folding done, only use a subset of this array
      this.bloom.rewind();
      this.bloom.limit(newByteSize);
      this.bloom = this.bloom.slice();
      this.byteSize = newByteSize;
      this.maxKeys = newMaxKeys;
    }
  }


  //---------------------------------------------------------------------------

  /**
   * Writes just the bloom filter bits (no meta data) to the given output:
   * the first {@code limit()} bytes of the backing array, starting at the
   * buffer's array offset.
   *
   * @param out output to place bloom
   * @throws IOException Error writing bloom array, or the buffer has no
   *           accessible backing array
   */
  public void writeBloom(final DataOutput out) throws IOException {
    final ByteBuffer bits = this.bloom;
    if (!bits.hasArray()) {
      throw new IOException("Only writes ByteBuffer with underlying array.");
    }
    final byte[] backingArray = bits.array();
    final int firstByte = bits.arrayOffset();
    final int byteCount = bits.limit();
    out.write(backingArray, firstByte, byteCount);
  }

  /** @return a write-only {@link Writable} that emits this filter's meta data */
  @Override
  public Writable getMetaWriter() {
    return new MetaWriter();
  }

  /** @return a write-only {@link Writable} that emits this filter's bit array */
  @Override
  public Writable getDataWriter() {
    return new DataWriter();
  }

  /**
   * Write-only {@link Writable} that serializes the enclosing filter's meta
   * data as five ints: version, byte size, hash count, hash type, key count.
   * NOTE(review): the {@code DataInput} constructor of the enclosing class
   * reads only the last four, so the version is presumably consumed by the
   * caller first -- confirm.
   */
  private class MetaWriter implements Writable {
    protected MetaWriter() {}
    /** Not supported: always throws. */
    @Override
    public void readFields(DataInput arg0) throws IOException {
      throw new IOException("Cant read with this class.");
    }

    /** Emits version, byte size, hash count, hash type and key count, in that order. */
    @Override
    public void write(DataOutput out) throws IOException {
      out.writeInt(VERSION);
      // byteSize is kept as a long but is validated to fit into an int
      out.writeInt((int) byteSize);
      out.writeInt(hashCount);
      out.writeInt(hashType);
      out.writeInt(keyCount);
    }
  }

  /**
   * Write-only {@link Writable} that serializes the enclosing filter's bit
   * array by delegating to {@code writeBloom}.
   */
  private class DataWriter implements Writable {
    protected DataWriter() {}
    /** Not supported: always throws. */
    @Override
    public void readFields(DataInput arg0) throws IOException {
      throw new IOException("Cant read with this class.");
    }

    /** Writes the raw bloom bits to {@code out}. */
    @Override
    public void write(DataOutput out) throws IOException {
      writeBloom(out);
    }
  }

  /** @return number of hash functions (bits probed per key) */
  public int getHashCount() {
    return hashCount;
  }

  /** @return true if this filter currently holds its bit array (it is non-null) */
  @Override
  public boolean supportsAutoLoading() {
    return bloom != null;
  }

  /**
   * Switches the static lookup routine between production mode and "fake
   * lookup" test mode. Enabling installs a fresh random generator with a
   * fixed seed; disabling removes it. The setting is static and therefore
   * affects lookups on every filter.
   *
   * @param enabled true to probe random bit positions instead of hashed ones
   */
  public static void setFakeLookupMode(boolean enabled) {
    randomGeneratorForTest = enabled ? new Random(283742987L) : null;
  }

  /**
   * {@inheritDoc}
   * Just concatenate row and column by default. May return the original row
   * buffer (not a copy) if the column qualifier is empty and the row spans
   * the whole of {@code rowBuf}. A non-positive {@code qualLen} is treated as
   * "no qualifier"; {@code qualBuf} is not touched in that case.
   */
  @Override
  public byte[] createBloomKey(byte[] rowBuf, int rowOffset, int rowLen,
      byte[] qualBuf, int qualOffset, int qualLen) {
    // Optimize the frequent case when only the row is provided.
    if (qualLen <= 0 && rowOffset == 0 && rowLen == rowBuf.length) {
      return rowBuf;
    }

    // Clamp: a negative qualifier length used to shrink the result array
    // below rowLen, making the row copy below fail.
    final int effectiveQualLen = Math.max(qualLen, 0);

    byte [] result = new byte[rowLen + effectiveQualLen];
    System.arraycopy(rowBuf, rowOffset, result, 0,  rowLen);
    if (effectiveQualLen > 0) {
      System.arraycopy(qualBuf, qualOffset, result, rowLen, effectiveQualLen);
    }
    return result;
  }

  /** @return the raw byte-array comparator {@code Bytes.BYTES_RAWCOMPARATOR} */
  @Override
  public RawComparator<byte[]> getComparator() {
    return Bytes.BYTES_RAWCOMPARATOR;
  }

  /**
   * A human-readable string with statistics for the given Bloom filter: byte
   * size, key count, maximum key count and, when the maximum is positive, the
   * fill percentage formatted for the default locale.
   *
   * @param bloomFilter the Bloom filter to output statistics for;
   * @return a string consisting of {@code "<key>: <value>"} parts
   *         separated by {@link #STATS_RECORD_SEP}.
   */
  public static String formatStats(BloomFilterBase bloomFilter) {
    final long keysInBloom = bloomFilter.getKeyCount();
    final long keyCapacity = bloomFilter.getMaxKeys();

    final StringBuilder stats = new StringBuilder();
    stats.append("BloomSize: ")
        .append(bloomFilter.getByteSize())
        .append(STATS_RECORD_SEP);
    stats.append("No of Keys in bloom: ")
        .append(keysInBloom)
        .append(STATS_RECORD_SEP);
    stats.append("Max Keys for bloom: ").append(keyCapacity);

    // Skip the percentage when it would mean dividing by zero.
    if (keyCapacity > 0) {
      final double fillRatio = keysInBloom * 1.0 / keyCapacity;
      stats.append(STATS_RECORD_SEP)
          .append("Percentage filled: ")
          .append(NumberFormat.getPercentInstance().format(fillRatio));
    }
    return stats.toString();
  }

  /**
   * @return the statistics from {@code formatStats} followed by the actual
   *         error rate printed with eight decimal places
   */
  @Override
  public String toString() {
    return formatStats(this) + STATS_RECORD_SEP + "Actual error rate: "
        + String.format("%.8f", actualErrorRate());
  }

}