HFileBlockIndex.java example

Explorer
SecureBase-master
- hbase-0.92.1
  - security
    - src
      - main
        java
        org
        apache
        hadoop
        hbase
        ipc
        SecureClient.java
        SecureConnectionHeader.java
        SecureRpcEngine.java
        SecureServer.java
        security
        AccessDeniedException.java
        HBasePolicyProvider.java
        HBaseSaslRpcClient.java
        HBaseSaslRpcServer.java
        access
        AccessControlFilter.java
        AccessControlLists.java
        AccessController.java
        AccessControllerProtocol.java
        Permission.java
        TableAuthManager.java
        TablePermission.java
        UserPermission.java
        ZKPermissionWatcher.java
        token
        AuthenticationKey.java
        AuthenticationProtocol.java
        AuthenticationTokenIdentifier.java
        AuthenticationTokenSecretManager.java
        AuthenticationTokenSelector.java
        TokenProvider.java
        TokenUtil.java
        ZKSecretWatcher.java
      - test
        java
        org
        apache
        hadoop
        hbase
        security
        access
        SecureTestUtil.java
        TestAccessControlFilter.java
        TestAccessController.java
        TestTablePermissions.java
        TestZKPermissionsWatcher.java
        token
        TestTokenAuthentication.java
        TestZKSecretWatcher.java
  - src
/*
 * Copyright 2011 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.io.hfile;

import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.atomic.AtomicReference;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.HeapSize;
import org.apache.hadoop.hbase.io.hfile.HFile.CachingBlockReader;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.ClassSize;
import org.apache.hadoop.hbase.util.CompoundBloomFilterWriter;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.util.StringUtils;

/**
 * Provides functionality to write ({@link BlockIndexWriter}) and read
 * ({@link BlockIndexReader}) single-level and multi-level block indexes.
 *
 * Examples of how to use the block index writer can be found in
 * {@link CompoundBloomFilterWriter} and {@link HFileWriterV2}. Examples of how
 * to use the reader can be found in {@link HFileReaderV2} and
 * TestHFileBlockIndex.
 */
public class HFileBlockIndex {

  private static final Log LOG = LogFactory.getLog(HFileBlockIndex.class);

  static final int DEFAULT_MAX_CHUNK_SIZE = 128 * 1024;

  /**
   * The maximum size guideline for index blocks (both leaf, intermediate, and
   * root). If not specified, <code>DEFAULT_MAX_CHUNK_SIZE</code> is used.
   */
  public static final String MAX_CHUNK_SIZE_KEY = "hfile.index.block.max.size";

  /**
   * The number of bytes stored in each "secondary index" entry in addition to
   * key bytes in the non-root index block format. The first long is the file
   * offset of the deeper-level block the entry points to, and the int that
   * follows is that block's on-disk size without including header.
   */
  static final int SECONDARY_INDEX_ENTRY_OVERHEAD = Bytes.SIZEOF_INT
      + Bytes.SIZEOF_LONG;

  /**
   * Error message when trying to use inline block API in single-level mode.
   */
  private static final String INLINE_BLOCKS_NOT_ALLOWED =
      "Inline blocks are not allowed in the single-level-only mode";

  /**
   * Configuration key to cache leaf- and intermediate-level index blocks on
   * write.
   */
  public static final String CACHE_INDEX_BLOCKS_ON_WRITE_KEY =
      "hfile.block.index.cacheonwrite";

  /**
   * The size of a meta-data record used for finding the mid-key in a
   * multi-level index. Consists of the middle leaf-level index block offset
   * (long), its on-disk size without header included (int), and the mid-key
   * entry's zero-based index in that leaf index block.
   */
  private static final int MID_KEY_METADATA_SIZE = Bytes.SIZEOF_LONG +
      2 * Bytes.SIZEOF_INT;

  /**
   * The reader will always hold the root level index in the memory. Index
   * blocks at all other levels will be cached in the LRU cache in practice,
   * although this API does not enforce that.
   *
   * All non-root (leaf and intermediate) index blocks contain what we call a
   * "secondary index": an array of offsets to the entries within the block.
   * This allows us to do binary search for the entry corresponding to the
   * given key without having to deserialize the block.
   */
  public static class BlockIndexReader implements HeapSize {
    /** Needed doing lookup on blocks. */
    private final RawComparator<byte[]> comparator;

    // Root-level data.
    private byte[][] blockKeys;
    private long[] blockOffsets;
    private int[] blockDataSizes;
    private int rootByteSize = 0;
    private int rootCount = 0;

    // Mid-key metadata.
    private long midLeafBlockOffset = -1;
    private int midLeafBlockOnDiskSize = -1;
    private int midKeyEntry = -1;

    /** Pre-computed mid-key */
    private AtomicReference<byte[]> midKey = new AtomicReference<byte[]>();

    /**
     * The number of levels in the block index tree. One if there is only root
     * level, two for root and leaf levels, etc.
     */
    private int searchTreeLevel;

    /** A way to read {@link HFile} blocks at a given offset */
    private CachingBlockReader cachingBlockReader;

    /**
     * Creates a reader that is able to load non-root index blocks and data
     * blocks through the supplied block reader.
     *
     * @param c comparator used for key lookups
     * @param treeLevel the number of levels in the block index tree
     * @param cachingBlockReader used to read blocks at a given file offset
     */
    public BlockIndexReader(final RawComparator<byte[]> c, final int treeLevel,
        final CachingBlockReader cachingBlockReader) {
      this.comparator = c;
      this.searchTreeLevel = treeLevel;
      this.cachingBlockReader = cachingBlockReader;
    }

    /**
     * Creates a reader without a block reader. Only the in-memory root-level
     * index can be consulted; {@link #midkey()} throws if it would have to
     * read the middle leaf block.
     *
     * @param c comparator used for key lookups
     * @param treeLevel the number of levels in the block index tree
     */
    public BlockIndexReader(final RawComparator<byte[]> c,
        final int treeLevel) {
      this.comparator = c;
      this.searchTreeLevel = treeLevel;
    }

    /**
     * Tells whether the root-level index holds any entries. An index whose
     * root level has not been read yet is reported as empty instead of
     * failing with a {@link NullPointerException}.
     *
     * @return true if the block index is empty or has not been loaded
     */
    public boolean isEmpty() {
      return blockKeys == null || blockKeys.length == 0;
    }

    /**
     * Verifies that the block index has been loaded and is non-empty, and
     * throws an {@link IllegalStateException} otherwise.
     *
     * @throws IllegalStateException if the root-level index has not been read
     *           yet or contains no entries
     */
    public void ensureNonEmpty() {
      // blockKeys stays null until the root index is read; without the null
      // check the "not loaded" case surfaced as a NullPointerException.
      if (blockKeys == null || blockKeys.length == 0) {
        throw new IllegalStateException("Block index is empty or not loaded");
      }
    }

    /**
     * Returns the data block which may contain the given key, descending from
     * the in-memory root-level index through intermediate and leaf index
     * blocks. This function will only be called when the HFile version is
     * larger than 1.
     *
     * @param key the byte array containing the key we are looking for
     * @param keyOffset the offset of the key in its byte array
     * @param keyLength the length of the key
     * @param currentBlock the block the caller currently holds, or null; it is
     *          reused instead of being re-read when its offset matches
     * @param cacheBlocks whether the data block read should be cached; blocks
     *          above the last level are always requested with caching on
     * @param pread passed through to the block reader
     * @param isCompaction passed through to the block reader
     * @return the data block found, or null if the root-level lookup yields no
     *         entry for the key (e.g. the key is before the first root key)
     * @throws IOException if a block cannot be read, no block reader is
     *           available, or the tree depth does not match the expected
     *           number of levels
     */
    public HFileBlock seekToDataBlock(final byte[] key, int keyOffset,
        int keyLength, HFileBlock currentBlock, boolean cacheBlocks,
        boolean pread, boolean isCompaction)
        throws IOException {
      int rootLevelIndex = rootBlockContainingKey(key, keyOffset, keyLength);
      if (rootLevelIndex < 0 || rootLevelIndex >= blockOffsets.length) {
        return null;
      }

      // Read the next-level (intermediate or leaf) index block.
      long currentOffset = blockOffsets[rootLevelIndex];
      int currentOnDiskSize = blockDataSizes[rootLevelIndex];

      int lookupLevel = 1; // How many levels deep we are in our lookup.

      HFileBlock block;
      while (true) {

        if (currentBlock != null && currentBlock.getOffset() == currentOffset)
        {
          // Avoid reading the same block again, even with caching turned off.
          // This is crucial for compaction-type workload which might have
          // caching turned off. This is like a one-block cache inside the
          // scanner.
          block = currentBlock;
        } else {
          // The two-argument constructor leaves the block reader unset; fail
          // with a descriptive IOException (as midkey() does) rather than a
          // NullPointerException.
          if (cachingBlockReader == null) {
            throw new IOException("Have to read a block but no block reader " +
                "available");
          }

          // Call HFile's caching block reader API. We always cache index
          // blocks, otherwise we might get terrible performance.
          boolean shouldCache = cacheBlocks || (lookupLevel < searchTreeLevel);
          block = cachingBlockReader.readBlock(currentOffset, currentOnDiskSize,
              shouldCache, pread, isCompaction);
        }

        if (block == null) {
          throw new IOException("Failed to read block at offset " +
              currentOffset + ", onDiskSize=" + currentOnDiskSize);
        }

        // Found a data block, break the loop and check our level in the tree.
        if (block.getBlockType().equals(BlockType.DATA)) {
          break;
        }

        // Not a data block. This must be a leaf-level or intermediate-level
        // index block. We don't allow going deeper than searchTreeLevel.
        if (++lookupLevel > searchTreeLevel) {
          throw new IOException("Search Tree Level overflow: lookupLevel="+
              lookupLevel + ", searchTreeLevel=" + searchTreeLevel);
        }

        // Locate the entry corresponding to the given key in the non-root
        // (leaf or intermediate-level) index block. On success the buffer is
        // positioned at the entry's (offset, onDiskSize) pair.
        ByteBuffer buffer = block.getBufferWithoutHeader();
        if (!locateNonRootIndexEntry(buffer, key, keyOffset, keyLength,
            comparator)) {
          throw new IOException("The key "
              + Bytes.toStringBinary(key, keyOffset, keyLength)
              + " is before the" + " first key of the non-root index block "
              + block);
        }

        currentOffset = buffer.getLong();
        currentOnDiskSize = buffer.getInt();
      }

      if (lookupLevel != searchTreeLevel) {
        throw new IOException("Reached a data block at level " + lookupLevel +
            " but the number of levels is " + searchTreeLevel);
      }

      return block;
    }

    /**
     * An approximation to the {@link HFile}'s mid-key. Operates on block
     * boundaries, and does not go inside blocks. In other words, returns the
     * first key of the middle block of the file. The result is computed once
     * and then served from the cached value.
     *
     * @return the first key of the middle block
     * @throws IOException if the index has no root-level entries, or if the
     *           middle leaf block has to be read and no block reader is
     *           available
     */
    public byte[] midkey() throws IOException {
      if (rootCount == 0) {
        throw new IOException("HFile empty");
      }

      byte[] targetMidKey = this.midKey.get();
      if (targetMidKey != null) {
        return targetMidKey;
      }

      if (midLeafBlockOffset >= 0) {
        if (cachingBlockReader == null) {
          throw new IOException("Have to read the middle leaf block but " +
              "no block reader available");
        }

        // Caching, using pread, assuming this is not a compaction.
        HFileBlock midLeafBlock = cachingBlockReader.readBlock(
            midLeafBlockOffset, midLeafBlockOnDiskSize, true, true, false);

        ByteBuffer b = midLeafBlock.getBufferWithoutHeader();
        int numDataBlocks = b.getInt();
        int keyRelOffset = b.getInt(Bytes.SIZEOF_INT * (midKeyEntry + 1));

        // Two consecutive secondary index elements delimit the whole
        // (offset, onDiskSize, key) tuple, so the offset/size overhead must
        // be subtracted to obtain the key length, exactly as
        // binarySearchNonRootIndex does. Without it the copied key would
        // run past the entry by SECONDARY_INDEX_ENTRY_OVERHEAD bytes.
        int keyLen = b.getInt(Bytes.SIZEOF_INT * (midKeyEntry + 2)) -
            keyRelOffset - SECONDARY_INDEX_ENTRY_OVERHEAD;

        // Skip the entry count and secondary index, the preceding entries,
        // and this entry's own offset and on-disk size.
        int keyOffset = b.arrayOffset() +
            Bytes.SIZEOF_INT * (numDataBlocks + 2) + keyRelOffset +
            SECONDARY_INDEX_ENTRY_OVERHEAD;
        targetMidKey = Arrays.copyOfRange(b.array(), keyOffset,
            keyOffset + keyLen);
      } else {
        // The middle of the root-level index.
        targetMidKey = blockKeys[(rootCount - 1) / 2];
      }

      this.midKey.set(targetMidKey);
      return targetMidKey;
    }

    /**
     * Returns the key of a root-level index entry. The array returned is the
     * one held by this reader, not a copy.
     *
     * @param i from 0 to {@code getRootBlockCount() - 1}
     * @return the key stored at position {@code i} of the root-level index
     */
    public byte[] getRootBlockKey(int i) {
      return blockKeys[i];
    }

    /**
     * Returns the file offset recorded for a root-level index entry.
     *
     * @param i from 0 to {@code getRootBlockCount() - 1}
     * @return the offset stored at position {@code i} of the root-level index
     */
    public long getRootBlockOffset(int i) {
      return blockOffsets[i];
    }

    /**
     * Returns the size recorded for a root-level index entry.
     *
     * @param i zero-based index of a root-level block
     * @return the on-disk size of the root-level block for version 2, or the
     *         uncompressed size for version 1
     */
    public int getRootBlockDataSize(int i) {
      return blockDataSizes[i];
    }

    /**
     * Returns how many entries have been added to the root-level index so
     * far.
     *
     * @return the number of root-level blocks in this block index
     */
    public int getRootBlockCount() {
      return rootCount;
    }

    /**
     * Finds the root-level index entry whose key range covers the given key.
     *
     * @param key the byte array containing the key to find
     * @param offset the offset of the key in its byte array
     * @param length the length of the key
     * @return the index j of the root-level entry such that
     *         blockKeys[j] <= key < blockKeys[j + 1] (between 0 and the number
     *         of blocks - 1), or -1 if the key is smaller than the first
     *         root-level key, meaning the file does not contain it
     */
    public int rootBlockContainingKey(final byte[] key, int offset,
        int length) {
      // The result lies between -(blockKeys.length + 1) and
      // blockKeys.length - 1, see binarySearch's javadoc.
      final int searchResult = Bytes.binarySearch(blockKeys, key, offset,
          length, comparator);

      if (searchResult < 0) {
        // No exact match: searchResult = -(i + 1), where i in
        // [0, blockKeys.length] is the insertion point, i.e.
        // blockKeys[i - 1] < key < blockKeys[i]. The covering entry is the
        // one just before the insertion point; that is -1 when the key
        // precedes every root-level key.
        final int insertionPoint = -searchResult - 1;
        assert 0 <= insertionPoint && insertionPoint <= blockKeys.length;
        return insertionPoint - 1;
      }

      // Exact match with an element of blockKeys.
      assert searchResult < blockKeys.length;
      return searchResult;
    }

    /**
     * Appends one entry to the root-level index arrays and updates the entry
     * count and the accumulated root byte size. Only used when reading.
     *
     * @param key the key of the entry
     * @param offset file offset where the referenced block is stored
     * @param dataSize the size recorded for the referenced block
     */
    private void add(final byte[] key, final long offset, final int dataSize) {
      final int slot = rootCount;
      blockOffsets[slot] = offset;
      blockKeys[slot] = key;
      blockDataSizes[slot] = dataSize;

      rootCount = slot + 1;
      rootByteSize = rootByteSize + SECONDARY_INDEX_ENTRY_OVERHEAD
          + key.length;
    }

    /**
     * Binary-searches a non-root index block for the given key. Relies on the
     * block's secondary index, which holds the relative offsets of the
     * (offset, onDiskSize, firstKey) tuples of all entries, so that no entry
     * has to be deserialized other than the ones compared against.
     *
     * @param key the byte array containing the key we are searching for
     * @param keyOffset the offset of the key in its byte array
     * @param keyLength the length of the key
     * @param nonRootIndex the non-root index block buffer, starting with the
     *          number of entries followed by the secondary index. The
     *          position is ignored and left untouched.
     * @param comparator the comparator used to order keys
     * @return the index i in [0, numEntries - 1] such that keys[i] <= key <
     *         keys[i + 1], if keys is the array of all keys being searched, or
     *         -1 if the key is smaller than the first key
     * @throws IllegalStateException if the search ends in an inconsistent
     *           state
     */
    static int binarySearchNonRootIndex(byte[] key, int keyOffset,
        int keyLength, ByteBuffer nonRootIndex,
        RawComparator<byte[]> comparator) {

      final int numEntries = nonRootIndex.getInt(0);

      // Entries begin after the entry count and the secondary index, which
      // itself takes numEntries + 1 ints.
      final int entriesOffset = Bytes.SIZEOF_INT * (numEntries + 2);

      // With keys[-1] = -Infinity and keys[numEntries] = Infinity, the loop
      // keeps keys[low - 1] < key < keys[high + 1] true at all times.
      int low = 0;
      int high = numEntries - 1;

      while (low <= high) {
        final int mid = (low + high) >>> 1;

        // Start of the middle entry and of the entry following it, both
        // relative to the end of the secondary index.
        final int midKeyRelOffset = nonRootIndex.getInt(
            Bytes.SIZEOF_INT * (mid + 1));
        final int nextRelOffset = nonRootIndex.getInt(
            Bytes.SIZEOF_INT * (mid + 2));

        // Position of the middle key inside the buffer: past the secondary
        // index, past all earlier entries, past this entry's offset and
        // on-disk size.
        final int midKeyOffset = entriesOffset + midKeyRelOffset
            + SECONDARY_INDEX_ENTRY_OVERHEAD;

        // The whole tuple spans the two relative offsets; dropping the
        // offset and on-disk-size overhead leaves the key length.
        final int midLength = nextRelOffset - midKeyRelOffset
            - SECONDARY_INDEX_ENTRY_OVERHEAD;

        // The searched key has to be the left operand, because the
        // comparator has special logic when the 'left side' is a special key.
        final int cmp = comparator.compare(key, keyOffset, keyLength,
            nonRootIndex.array(), nonRootIndex.arrayOffset() + midKeyOffset,
            midLength);

        if (cmp == 0) {
          return mid; // exact match
        }
        if (cmp > 0) {
          low = mid + 1; // key is above mid: keeps keys[low - 1] < key
        } else {
          high = mid - 1; // key is below mid: keeps key < keys[high + 1]
        }
      }

      // The invariant gives low - 1 < high + 1, the loop exit gives
      // low >= high + 1, hence low must equal high + 1.
      if (low != high + 1) {
        throw new IllegalStateException("Binary search broken: low=" + low
            + " instead of " + (high + 1));
      }

      // keys[low - 1] < key < keys[low], so the entry covering the key is
      // the one at low - 1.
      final int result = low - 1;

      // Some extra validation on the result.
      if (result < -1 || result >= numEntries) {
        throw new IllegalStateException("Binary search broken: result is " +
            result + " but expected to be between -1 and (numEntries - 1) = " +
            (numEntries - 1));
      }

      return result;
    }

    /**
     * Looks up a key in a non-root index block via its secondary index. When
     * the key is found, the buffer is positioned at the start of the matching
     * entry, so that the file offset and the on-disk size can be read next.
     * When it is not found, the buffer position is not changed.
     *
     * @param nonRootBlock a non-root block without header. Initial position
     *          does not matter.
     * @param key the byte array containing the key
     * @param keyOffset the offset of the key in its byte array
     * @param keyLength the length of the key
     * @param comparator the comparator used to order keys
     * @return true in the case the index entry containing the given key was
     *         found, false in the case the given key is before the first key
     */
    static boolean locateNonRootIndexEntry(ByteBuffer nonRootBlock, byte[] key,
        int keyOffset, int keyLength, RawComparator<byte[]> comparator) {
      final int entryIndex = binarySearchNonRootIndex(key, keyOffset,
          keyLength, nonRootBlock, comparator);

      if (entryIndex != -1) {
        final int numEntries = nonRootBlock.getInt(0);

        // Entries start right after the entry count and the secondary index.
        final int entriesOffset = Bytes.SIZEOF_INT * (numEntries + 2);

        // Where the wanted entry starts, relative to the end of the
        // secondary index.
        final int entryRelOffset = nonRootBlock.getInt(
            Bytes.SIZEOF_INT * (1 + entryIndex));

        nonRootBlock.position(entriesOffset + entryRelOffset);
        return true;
      }

      return false;
    }

    /**
     * Loads the root-level index from the given input. The data must match
     * what was written into the root level by
     * {@link BlockIndexWriter#writeIndexBlocks(FSDataOutputStream)} at the
     * offset that function returned. Each entry is read as a long offset, an
     * int size and a byte-array key, in that order.
     *
     * @param in the buffered input stream or wrapped byte input stream
     * @param numEntries the number of root-level index entries
     * @throws IOException if reading from the input fails
     */
    public void readRootIndex(DataInput in, final int numEntries)
        throws IOException {
      blockOffsets = new long[numEntries];
      blockKeys = new byte[numEntries][];
      blockDataSizes = new int[numEntries];

      // Nothing is read when the entry count is zero, i.e. when no index was
      // written.
      for (int remaining = numEntries; remaining > 0; --remaining) {
        final long entryOffset = in.readLong();
        final int entryDataSize = in.readInt();
        final byte[] entryKey = Bytes.readByteArray(in);
        add(entryKey, entryOffset, entryDataSize);
      }
    }

    /**
     * Loads the root level of a multi-level block index. Does the same as
     * {@link #readRootIndex(DataInput, int)} and then, if enough bytes are
     * still available, also reads the metadata needed to compute the mid-key
     * of a multi-level index.
     *
     * @param in the buffered or byte input stream to read from
     * @param numEntries the number of root-level index entries
     * @throws IOException if reading from the input fails
     */
    public void readMultiLevelIndexRoot(DataInputStream in,
        final int numEntries) throws IOException {
      readRootIndex(in, numEntries);

      // The mid-key metadata is only present if a full record is left in the
      // stream; otherwise the mid-key fields keep their current values.
      if (in.available() >= MID_KEY_METADATA_SIZE) {
        midLeafBlockOffset = in.readLong();
        midLeafBlockOnDiskSize = in.readInt();
        midKeyEntry = in.readInt();
      }
    }

    /**
     * Renders the root-level index: the entry count on the first line,
     * followed by the key, offset and data size of every root-level entry.
     */
    @Override
    public String toString() {
      final StringBuilder out = new StringBuilder();
      out.append("size=").append(rootCount).append('\n');
      for (int idx = 0; idx < rootCount; idx++) {
        out.append("key=").append(KeyValue.keyToString(blockKeys[idx]));
        out.append("\n  offset=").append(blockOffsets[idx]);
        out.append(", dataSize=").append(blockDataSizes[idx]).append('\n');
      }
      return out.toString();
    }

    /**
     * Estimates the heap footprint of this reader: the object itself with
     * its references and int fields, the mid-key metadata, and whichever of
     * the three root-level arrays have been allocated (including the bytes of
     * every root-level key).
     *
     * @return the aligned heap size estimate in bytes
     */
    @Override
    public long heapSize() {
      long total = ClassSize.align(6 * ClassSize.REFERENCE +
          3 * Bytes.SIZEOF_INT + ClassSize.OBJECT);

      // Mid-key metadata.
      total += MID_KEY_METADATA_SIZE;

      if (blockKeys != null) {
        // The outer array with its references ...
        total += ClassSize.align(ClassSize.ARRAY + blockKeys.length
            * ClassSize.REFERENCE);

        // ... plus every key's own byte array.
        for (int k = 0; k < blockKeys.length; ++k) {
          total += ClassSize.align(ClassSize.ARRAY + blockKeys[k].length);
        }
      }

      if (blockOffsets != null) {
        total += ClassSize.align(ClassSize.ARRAY + blockOffsets.length
            * Bytes.SIZEOF_LONG);
      }

      if (blockDataSizes != null) {
        total += ClassSize.align(ClassSize.ARRAY + blockDataSizes.length
            * Bytes.SIZEOF_INT);
      }

      return ClassSize.align(total);
    }

  }

  /**
   * Writes the block index into the output stream. Generate the tree from
   * bottom up. The leaf level is written to disk as a sequence of inline
   * blocks, if it is larger than a certain number of bytes. If the leaf level
   * is not large enough, we write all entries to the root level instead.
   *
   * After all leaf blocks have been written, we end up with an index
   * referencing the resulting leaf index blocks. If that index is larger than
   * the allowed root index size, the writer will break it up into
   * reasonable-size intermediate-level index block chunks, write those chunks
   * out, and create another index referencing those chunks. This will be
   * repeated until the remaining index is small enough to become the root
   * index. However, in most practical cases we will only have leaf-level
   * blocks and the root index, or just the root index.
   */
  public static class BlockIndexWriter implements InlineBlockWriter {
    /**
     * While the index is being written, this represents the current block
     * index referencing all leaf blocks, with one exception. If the file is
     * being closed and there are not enough blocks to complete even a single
     * leaf block, no leaf blocks get written and this contains the entire
     * block index. After all levels of the index were written by
     * {@link #writeIndexBlocks(FSDataOutputStream)}, this contains the final
     * root-level index.
     */
    private BlockIndexChunk rootChunk = new BlockIndexChunk();

    /**
     * Current leaf-level chunk. New entries referencing data blocks get added
     * to this chunk until it grows large enough to be written to disk.
     */
    private BlockIndexChunk curInlineChunk = new BlockIndexChunk();

    /**
     * The number of block index levels. This is one if there is only a root
     * level (even empty), two if there is a leaf level and a root level, and
     * higher if there are intermediate levels. This is only final after
     * {@link #writeIndexBlocks(FSDataOutputStream)} has been called. The
     * initial value accounts for the root level, and will be increased to two
     * as soon as we find out there is a leaf-level in
     * {@link #blockWritten(long, int)}.
     */
    private int numLevels = 1;

    private HFileBlock.Writer blockWriter;
    private byte[] firstKey = null;

    /**
     * The total number of leaf-level entries, i.e. entries referenced by
     * leaf-level blocks. For the data block index this is equal to the number
     * of data blocks.
     */
    private long totalNumEntries;

    /** Total compressed size of all index blocks. */
    private long totalBlockOnDiskSize;

    /** Total uncompressed size of all index blocks. */
    private long totalBlockUncompressedSize;

    /** The maximum size guideline of all multi-level index blocks. */
    private int maxChunkSize;

    /** Whether we require this block index to always be single-level. */
    private boolean singleLevelOnly;

    /** Block cache, or null if cache-on-write is disabled */
    private BlockCache blockCache;

    /** Name to use for computing cache keys */
    private String nameForCaching;

    /**
     * Creates a single-level block index writer: no block writer, no block
     * cache and no cache name are set, and single-level-only mode is on.
     */
    public BlockIndexWriter() {
      this(null, null, null);
      this.singleLevelOnly = true;
    }

    /**
     * Creates a multi-level block index writer.
     *
     * @param blockWriter the block writer to use to write index blocks
     * @param blockCache if this is not null, index blocks will be cached
     *    on write into this block cache.
     */
    public BlockIndexWriter(HFileBlock.Writer blockWriter,
        BlockCache blockCache, String nameForCaching) {
      if ((blockCache == null) != (nameForCaching == null)) {
        throw new IllegalArgumentException("Block cache and file name for " +
            "caching must be both specified or both null");
      }

      this.blockWriter = blockWriter;
      this.blockCache = blockCache;
      this.nameForCaching = nameForCaching;
      this.maxChunkSize = HFileBlockIndex.DEFAULT_MAX_CHUNK_SIZE;
    }

    /**
     * Sets the maximum size guideline for index blocks.
     *
     * @param maxChunkSize the new maximum chunk size; must be positive
     * @throws IllegalArgumentException if the value is zero or negative
     */
    public void setMaxChunkSize(int maxChunkSize) {
      if (maxChunkSize > 0) {
        this.maxChunkSize = maxChunkSize;
        return;
      }
      throw new IllegalArgumentException("Invald maximum index block size");
    }

    /**
     * Writes the root level and intermediate levels of the block index into
     * the output stream, generating the tree from bottom up. Assumes that the
     * leaf level has been inline-written to the disk if there is enough data
     * for more than one leaf block. We iterate by breaking the current level
     * of the block index, starting with the index of all leaf-level blocks,
     * into chunks small enough to be written to disk, and generate its parent
     * level, until we end up with a level small enough to become the root
     * level.
     *
     * If the leaf level is not large enough, there is no inline block index
     * anymore, so we only write that level of block index to disk as the root
     * level.
     *
     * @param out the stream the index blocks are written to
     * @return the position in <code>out</code> at which the root-level index
     *         block starts
     * @throws IOException if the last inline chunk still holds entries, or if
     *           writing fails
     */
    public long writeIndexBlocks(FSDataOutputStream out) throws IOException {
      if (curInlineChunk.getNumEntries() != 0) {
        throw new IOException("Trying to write a multi-level block index, " +
            "but are " + curInlineChunk.getNumEntries() + " entries in the " +
            "last inline chunk.");
      }

      // We need to get mid-key metadata before we create intermediate
      // indexes and overwrite the root chunk.
      byte[] midKeyMetadata = numLevels > 1 ? rootChunk.getMidKeyMetadata()
          : null;

      // NOTE(review): this loop only ends once a level fits into
      // maxChunkSize; confirm that each intermediate level is guaranteed to
      // be smaller than the one below it (e.g. with a single oversized key).
      while (rootChunk.getRootSize() > maxChunkSize) {
        rootChunk = writeIntermediateLevel(out, rootChunk);
        numLevels += 1;
      }

      // Remember where the root level starts before anything is written.
      long rootLevelIndexPos = out.getPos();

      DataOutput blockStream = blockWriter.startWriting(BlockType.ROOT_INDEX,
          false);
      rootChunk.writeRoot(blockStream);
      if (midKeyMetadata != null) {
        blockStream.write(midKeyMetadata);
      }
      blockWriter.writeHeaderAndData(out);

      // Add root index block size
      totalBlockOnDiskSize += blockWriter.getOnDiskSizeWithoutHeader();
      totalBlockUncompressedSize +=
          blockWriter.getUncompressedSizeWithoutHeader();

      if (LOG.isTraceEnabled()) {
        // Report the start of the root level; out.getPos() has already moved
        // past the root block at this point.
        LOG.trace("Wrote a " + numLevels + "-level index with root level at pos "
          + rootLevelIndexPos + ", " + rootChunk.getNumEntries()
          + " root-level entries, " + totalNumEntries + " total entries, "
          + StringUtils.humanReadableInt(this.totalBlockOnDiskSize) +
          " on-disk size, "
          + StringUtils.humanReadableInt(totalBlockUncompressedSize) +
          " total uncompressed size.");
      }
      return rootLevelIndexPos;
    }

    /**
     * Writes the whole block index as one root level, without any block
     * framing. The entries collected in the current inline chunk become the
     * root chunk, and a fresh inline chunk is started.
     *
     * @param out the buffered output stream to write the index to. Typically a
     *          stream writing into an {@link HFile} block.
     * @param description a short description of the index being written. Used
     *          in a log message.
     * @throws IOException if single-level mode is off, if root-level entries
     *           were already added, or if writing fails
     */
    public void writeSingleLevelIndex(DataOutput out, String description)
        throws IOException {
      expectNumLevels(1);

      if (!singleLevelOnly) {
        throw new IOException("Single-level mode is turned off");
      }

      if (rootChunk.getNumEntries() > 0) {
        throw new IOException("Root-level entries already added in " +
            "single-level mode");
      }

      // Promote the inline chunk to root level and start a new inline chunk.
      final BlockIndexChunk singleLevel = curInlineChunk;
      rootChunk = singleLevel;
      curInlineChunk = new BlockIndexChunk();

      if (LOG.isTraceEnabled()) {
        LOG.trace("Wrote a single-level " + description + " index with "
          + singleLevel.getNumEntries() + " entries, "
          + singleLevel.getRootSize() + " bytes");
      }
      singleLevel.writeRoot(out);
    }

    /**
     * Splits one level of the block index into intermediate index blocks and
     * writes them out. Entries of the given level are accumulated into a
     * chunk; whenever that chunk's root size reaches the maximum chunk size
     * it is handed to
     * {@link #writeIntermediateBlock(FSDataOutputStream, BlockIndexChunk, BlockIndexChunk)},
     * and so is any non-empty chunk left over at the end.
     *
     * @param out the stream the intermediate-level blocks are written to
     * @param currentLevel the current level of the block index, such as a
     *          chunk referencing all leaf-level index blocks
     * @return the parent level block index, which becomes the root index after
     *         a few (usually zero) iterations
     * @throws IOException if writing a block fails
     */
    private BlockIndexChunk writeIntermediateLevel(FSDataOutputStream out,
        BlockIndexChunk currentLevel) throws IOException {
      // Collects the entries referencing the intermediate-level blocks that
      // are about to be created.
      final BlockIndexChunk parent = new BlockIndexChunk();

      // The intermediate-level chunk currently being filled.
      final BlockIndexChunk curChunk = new BlockIndexChunk();

      int i = 0;
      while (i < currentLevel.getNumEntries()) {
        curChunk.add(currentLevel.getBlockKey(i),
            currentLevel.getBlockOffset(i), currentLevel.getOnDiskDataSize(i));

        if (curChunk.getRootSize() >= maxChunkSize) {
          writeIntermediateBlock(out, parent, curChunk);
        }
        ++i;
      }

      // Write out whatever did not fill up a whole chunk.
      if (curChunk.getNumEntries() > 0) {
        writeIntermediateBlock(out, parent, curChunk);
      }

      return parent;
    }

    /**
     * Writes the given chunk as one intermediate-level index block, records
     * an entry for it in the parent chunk, and clears the chunk for reuse.
     *
     * @param out the stream the block is written to
     * @param parent the chunk receiving the entry that points to the block
     * @param curChunk the entries making up the block; emptied on return
     * @throws IOException if writing the block fails
     */
    private void writeIntermediateBlock(FSDataOutputStream out,
        BlockIndexChunk parent, BlockIndexChunk curChunk) throws IOException {
      // Position of the block about to be written; the parent entry and the
      // cache key both refer to it.
      final long blockStart = out.getPos();

      final DataOutputStream blockStream = blockWriter.startWriting(
          BlockType.INTERMEDIATE_INDEX, cacheOnWrite());
      curChunk.writeNonRoot(blockStream);
      final byte[] firstKeyInBlock = curChunk.getBlockKey(0);
      blockWriter.writeHeaderAndData(out);

      if (blockCache != null) {
        blockCache.cacheBlock(
            HFile.getBlockCacheKey(nameForCaching, blockStart),
            blockWriter.getBlockForCaching());
      }

      // Account for the size of the intermediate index block.
      totalBlockOnDiskSize += blockWriter.getOnDiskSizeWithoutHeader();
      totalBlockUncompressedSize +=
          blockWriter.getUncompressedSizeWithoutHeader();

      // The parent entry consists of the first key of the chunk, the offset
      // at which the block begins, and the block's on-disk size including
      // its header.
      parent.add(firstKeyInBlock, blockStart,
          blockWriter.getOnDiskSizeWithHeader());

      // Make the chunk reusable for the next intermediate block.
      curChunk.clear();
    }

    /**
     * @return the number of block index entries currently held by the
     *         root-level chunk
     */
    public final int getNumRootEntries() {
      final int rootEntryCount = rootChunk.getNumEntries();
      return rootEntryCount;
    }

    /**
     * @return how many levels this block index currently has
     */
    public int getNumLevels() {
      return this.numLevels;
    }

    /**
     * Verifies that the index currently has exactly the given number of
     * levels.
     *
     * @param expectedNumLevels the number of levels the caller relies on
     * @throws IllegalStateException if the current number of levels differs
     *           from {@code expectedNumLevels}
     */
    private void expectNumLevels(int expectedNumLevels) {
      if (numLevels != expectedNumLevels) {
        // The leading space before "but" keeps the two numbers apart in the
        // rendered message.
        throw new IllegalStateException("Number of block index levels is "
            + numLevels + " but is expected to be " + expectedNumLevels);
      }
    }

    /**
     * Tells whether there is an inline block ready to be written. In general,
     * a leaf-level index block is written as an inline block as soon as its
     * size, as serialized in the non-root format, reaches
     * {@code maxChunkSize}.
     *
     * <p>
     * When {@code closing} is true and no root-level entries exist yet, the
     * pending entries are moved to the root level instead of being written
     * as a leaf block, and false is returned.
     *
     * @param closing whether the caller is closing
     * @return true if the current inline chunk should be written out now
     * @throws UnsupportedOperationException in single-level mode
     */
    @Override
    public boolean shouldWriteBlock(boolean closing) {
      if (singleLevelOnly) {
        throw new UnsupportedOperationException(INLINE_BLOCKS_NOT_ALLOWED);
      }

      // Nothing has been collected, so there is nothing to write.
      if (curInlineChunk.getNumEntries() == 0) {
        return false;
      }

      // From here on the current inline chunk has at least one entry.
      if (!closing) {
        return curInlineChunk.getNonRootSize() >= maxChunkSize;
      }

      if (rootChunk.getNumEntries() != 0) {
        // Leaf-level blocks exist already; the remainder becomes one more.
        return true;
      }

      // No leaf-level blocks were added yet. Rather than creating a leaf
      // level consisting of one block, turn these entries into the root
      // level.
      expectNumLevels(1);
      rootChunk = curInlineChunk;
      curInlineChunk = new BlockIndexChunk();
      return false;
    }

    /**
     * Writes the current inline index block to the given output in the
     * non-root index format, remembers its first key for the parent-level
     * entry, and then empties the inline chunk.
     *
     * @param out the output the inline block is written to
     * @throws IOException if writing the chunk fails
     * @throws UnsupportedOperationException in single-level mode
     */
    @Override
    public void writeInlineBlock(DataOutput out) throws IOException {
      if (singleLevelOnly) {
        throw new UnsupportedOperationException(INLINE_BLOCKS_NOT_ALLOWED);
      }

      final BlockIndexChunk leafChunk = curInlineChunk;

      // Inline blocks are non-root blocks, hence the non-root format.
      leafChunk.writeNonRoot(out);

      // Keep the first key of this block; blockWritten() needs it when it
      // adds the entry to the parent-level index.
      firstKey = leafChunk.getBlockKey(0);

      // Begin a new inline index block.
      leafChunk.clear();
    }

    /**
     * Called after an inline block has been written so that an entry
     * referring to that block can be added to the parent-level index.
     *
     * <p>
     * All checks that can reject the call run before any size counter is
     * updated, so a rejected call leaves the accumulated totals untouched.
     *
     * @param offset the offset at which the inline block was written
     * @param onDiskSize the on-disk size of the inline block
     * @param uncompressedSize the uncompressed size of the inline block
     * @throws UnsupportedOperationException in single-level mode
     * @throws IllegalStateException if no first key was recorded by
     *           {@link #writeInlineBlock(DataOutput)}, or if the index has an
     *           unexpected number of levels when the first leaf block arrives
     */
    @Override
    public void blockWritten(long offset, int onDiskSize, int uncompressedSize)
    {
      if (singleLevelOnly) {
        throw new UnsupportedOperationException(INLINE_BLOCKS_NOT_ALLOWED);
      }

      if (firstKey == null) {
        throw new IllegalStateException("Trying to add second-level index " +
            "entry with offset=" + offset + " and onDiskSize=" + onDiskSize +
            " but the first key was not set in writeInlineBlock");
      }

      if (rootChunk.getNumEntries() == 0) {
        // We are writing the first leaf block, so increase index level.
        expectNumLevels(1);
        numLevels = 2;
      }

      // Add leaf index block size
      totalBlockOnDiskSize += onDiskSize;
      totalBlockUncompressedSize += uncompressedSize;

      // Add another entry to the second-level index. Include the number of
      // entries in all previous leaf-level chunks for mid-key calculation.
      rootChunk.add(firstKey, offset, onDiskSize, totalNumEntries);

      // The key has been consumed; the next inline block must set its own.
      firstKey = null;
    }

    /**
     * @return the block type reported for this writer's inline blocks, which
     *         is always {@link BlockType#LEAF_INDEX}
     */
    @Override
    public BlockType getInlineBlockType() {
      final BlockType inlineType = BlockType.LEAF_INDEX;
      return inlineType;
    }

    /**
     * Appends one index entry to the current leaf-level (inline) chunk and
     * counts it towards the total number of entries. Once the leaf-level
     * block gets large enough, it is flushed to disk as an inline block.
     *
     * @param firstKey the first key of the data block
     * @param blockOffset the offset of the data block
     * @param blockDataSize the on-disk size of the data block ({@link HFile}
     *          format version 2), or the uncompressed size of the data block
     *          ({@link HFile} format version 1).
     */
    public void addEntry(byte[] firstKey, long blockOffset, int blockDataSize)
    {
      final BlockIndexChunk leafChunk = curInlineChunk;
      leafChunk.add(firstKey, blockOffset, blockDataSize);

      // Count the entry only after it was added successfully.
      totalNumEntries += 1;
    }

    /**
     * Checks that the index written so far has not grown beyond one level.
     *
     * @throws IOException if we happened to write a multi-level index.
     */
    public void ensureSingleLevel() throws IOException {
      if (numLevels <= 1) {
        return;
      }

      final int rootEntries = rootChunk.getNumEntries();
      throw new IOException ("Wrote a " + numLevels + "-level index with " +
          rootEntries + " root-level entries, but " +
          "this is expected to be a single-level block index.");
    }

    /**
     * Reports whether cache-on-write is in effect, which is the case exactly
     * when a block cache is present.
     *
     * @return true if {@code blockCache} is not null
     */
    @Override
    public boolean cacheOnWrite() {
      final boolean haveBlockCache = blockCache != null;
      return haveBlockCache;
    }

    /**
     * Returns the uncompressed size accumulated over the index blocks written
     * so far (leaf-level and intermediate-level blocks add to this total as
     * they are written).
     *
     * @return the total uncompressed size of all index blocks
     */
    public long getTotalUncompressedSize() {
      return this.totalBlockUncompressedSize;
    }

  }

  /**
   * A single chunk of the block index in the process of writing. The data in
   * this chunk can become a leaf-level, intermediate-level, or root index
   * block.
   *
   * <p>
   * Entries are kept in parallel lists (key, offset, on-disk size and,
   * optionally, the cumulative sub-entry count). The chunk tracks its
   * serialized size in both the root and the non-root format while entries
   * are added, so the size getters do not need to walk the entries.
   */
  static class BlockIndexChunk {

    /** First keys of the key range corresponding to each index entry. */
    private final List<byte[]> blockKeys = new ArrayList<byte[]>();

    /** Block offset in backing stream. */
    private final List<Long> blockOffsets = new ArrayList<Long>();

    /** On-disk data sizes of lower-level data or index blocks. */
    private final List<Integer> onDiskDataSizes = new ArrayList<Integer>();

    /**
     * The cumulative number of sub-entries, i.e. entries on deeper-level block
     * index entries. numSubEntriesAt[i] is the number of sub-entries in the
     * blocks corresponding to this chunk's entries #0 through #i inclusively.
     * Stays empty when entries are added without a sub-entry count.
     */
    private final List<Long> numSubEntriesAt = new ArrayList<Long>();

    /**
     * The offset of the next entry to be added, relative to the end of the
     * "secondary index" in the "non-root" format representation of this index
     * chunk. This is the next value to be added to the secondary index.
     */
    private int curTotalNonRootEntrySize = 0;

    /**
     * The accumulated size of this chunk if stored in the root index format.
     */
    private int curTotalRootSize = 0;

    /**
     * The "secondary index" used for binary search over variable-length
     * records in a "non-root" format block. These offsets are relative to the
     * end of this secondary index.
     */
    private final List<Integer> secondaryIndexOffsetMarks =
        new ArrayList<Integer>();

    /**
     * Adds a new entry to this block index chunk.
     *
     * @param firstKey the first key in the block pointed to by this entry
     * @param blockOffset the offset of the next-level block pointed to by this
     *          entry
     * @param onDiskDataSize the on-disk size of the block pointed to by this
     *          entry, including header size
     * @param curTotalNumSubEntries if this chunk is the root index chunk under
     *          construction, this specifies the current total number of
     *          sub-entries in all leaf-level chunks, including the one
     *          corresponding to the second-level entry being added. Pass -1
     *          to add the entry without a sub-entry count.
     * @throws IllegalStateException if a sub-entry count is given but earlier
     *           entries were added without one, so that the parallel lists
     *           are out of sync
     */
    void add(byte[] firstKey, long blockOffset, int onDiskDataSize,
        long curTotalNumSubEntries) {
      // Record the offset for the secondary index
      secondaryIndexOffsetMarks.add(curTotalNonRootEntrySize);
      curTotalNonRootEntrySize += SECONDARY_INDEX_ENTRY_OVERHEAD
          + firstKey.length;

      // Root format: offset (long), size (int), vint key length, key bytes.
      curTotalRootSize += Bytes.SIZEOF_LONG + Bytes.SIZEOF_INT
          + WritableUtils.getVIntSize(firstKey.length) + firstKey.length;

      blockKeys.add(firstKey);
      blockOffsets.add(blockOffset);
      onDiskDataSizes.add(onDiskDataSize);

      if (curTotalNumSubEntries != -1) {
        numSubEntriesAt.add(curTotalNumSubEntries);

        // Make sure the parallel arrays are in sync.
        if (numSubEntriesAt.size() != blockKeys.size()) {
          throw new IllegalStateException("Only have key/value count " +
              "stats for " + numSubEntriesAt.size() + " block index " +
              "entries out of " + blockKeys.size());
        }
      }
    }

    /**
     * The same as {@link #add(byte[], long, int, long)} but does not record a
     * cumulative sub-entry (key/value) count. Used for single-level indexes.
     *
     * @param firstKey the first key in the block pointed to by this entry
     * @param blockOffset the offset of the block pointed to by this entry
     * @param onDiskDataSize the on-disk size of the block pointed to by this
     *          entry
     * @see #add(byte[], long, int, long)
     */
    public void add(byte[] firstKey, long blockOffset, int onDiskDataSize) {
      add(firstKey, blockOffset, onDiskDataSize, -1);
    }

    /**
     * Removes all entries and resets the accumulated sizes, so that the chunk
     * can be reused.
     */
    public void clear() {
      blockKeys.clear();
      blockOffsets.clear();
      onDiskDataSizes.clear();
      secondaryIndexOffsetMarks.clear();
      numSubEntriesAt.clear();
      curTotalNonRootEntrySize = 0;
      curTotalRootSize = 0;
    }

    /**
     * Finds the entry corresponding to the deeper-level index block containing
     * the given deeper-level entry (a "sub-entry"), assuming a global 0-based
     * ordering of sub-entries.
     *
     * <p>
     * <i> Implementation note. </i> We are looking for i such that
     * numSubEntriesAt[i - 1] &lt;= k &lt; numSubEntriesAt[i], because a
     * deeper-level block #i (0-based) contains sub-entries
     * # numSubEntriesAt[i - 1]'th through numSubEntriesAt[i] - 1, assuming a
     * global 0-based ordering of sub-entries. i is by definition the
     * insertion point of k in numSubEntriesAt.
     *
     * @param k sub-entry index, from 0 to the total number sub-entries - 1
     * @return the 0-based index of the entry corresponding to the given
     *         sub-entry
     */
    public int getEntryBySubEntry(long k) {
      // We define mid-key as the key corresponding to k'th sub-entry
      // (0-based).

      int i = Collections.binarySearch(numSubEntriesAt, k);

      // Exact match: numSubEntriesAt[i] = k. This means chunks #0 through
      // #i contain exactly k sub-entries, and the sub-entry #k (0-based)
      // is in the (i + 1)'th chunk.
      if (i >= 0) {
        return i + 1;
      }

      // Inexact match. binarySearch returned (-(insertion point) - 1), so
      // undo that encoding and return the insertion point.
      return -i - 1;
    }

    /**
     * Used when writing the root block index of a multi-level block index.
     * Serializes additional information allowing to efficiently identify the
     * mid-key: the offset and on-disk size of the block containing the
     * mid-key, followed by the index of the mid-key within that block.
     *
     * @return a few serialized fields for finding the mid-key
     * @throws IOException if could not create metadata for computing mid-key,
     *           e.g. because the chunk is empty or was filled without
     *           sub-entry counts
     */
    public byte[] getMidKeyMetadata() throws IOException {
      final int numEntries = blockKeys.size();

      // Without entries, or without a sub-entry count for every entry, the
      // lookups below would fail with an unchecked index error. Report the
      // condition through the declared exception instead.
      if (numEntries == 0) {
        throw new IOException("No root-level entries, mid-key unavailable");
      }
      if (numSubEntriesAt.size() != numEntries) {
        throw new IOException("Mid-key unavailable: only have key/value "
            + "count stats for " + numSubEntriesAt.size()
            + " block index entries out of " + numEntries);
      }

      ByteArrayOutputStream baos = new ByteArrayOutputStream(
          MID_KEY_METADATA_SIZE);
      DataOutputStream baosDos = new DataOutputStream(baos);
      long totalNumSubEntries = numSubEntriesAt.get(numEntries - 1);
      if (totalNumSubEntries == 0) {
        throw new IOException("No leaf-level entries, mid-key unavailable");
      }
      long midKeySubEntry = (totalNumSubEntries - 1) / 2;
      int midKeyEntry = getEntryBySubEntry(midKeySubEntry);

      baosDos.writeLong(blockOffsets.get(midKeyEntry));
      baosDos.writeInt(onDiskDataSizes.get(midKeyEntry));

      // Position of the mid-key relative to the first sub-entry of the block
      // that contains it.
      long numSubEntriesBefore = midKeyEntry > 0
          ? numSubEntriesAt.get(midKeyEntry - 1) : 0;
      long subEntryWithinEntry = midKeySubEntry - numSubEntriesBefore;
      if (subEntryWithinEntry < 0 || subEntryWithinEntry > Integer.MAX_VALUE)
      {
        throw new IOException("Could not identify mid-key index within the "
            + "leaf-level block containing mid-key: out of range ("
            + subEntryWithinEntry + ", numSubEntriesBefore="
            + numSubEntriesBefore + ", midKeySubEntry=" + midKeySubEntry
            + ")");
      }

      baosDos.writeInt((int) subEntryWithinEntry);

      if (baosDos.size() != MID_KEY_METADATA_SIZE) {
        throw new IOException("Could not write mid-key metadata: size=" +
            baosDos.size() + ", correct size: " + MID_KEY_METADATA_SIZE);
      }

      // Close just to be good citizens, although this has no effect.
      baos.close();

      return baos.toByteArray();
    }

    /**
     * Writes the block index chunk in the non-root index block format. This
     * format contains the number of entries, an index of integer offsets
     * for quick binary search on variable-length records, and tuples of
     * block offset, on-disk block size, and the first key for each entry.
     *
     * @param out the output to write the chunk to
     * @throws IOException if the secondary index is out of sync with the
     *           entries (checked before anything is written), or if writing
     *           to {@code out} fails
     */
    void writeNonRoot(DataOutput out) throws IOException {
      // Validate first, so that a corrupted chunk does not leave a partially
      // written block behind.
      if (secondaryIndexOffsetMarks.size() != blockKeys.size()) {
        throw new IOException("Corrupted block index chunk writer: " +
            blockKeys.size() + " entries but " +
            secondaryIndexOffsetMarks.size() + " secondary index items");
      }

      // The number of entries in the block.
      out.writeInt(blockKeys.size());

      // For each entry, write a "secondary index" of relative offsets to the
      // entries from the end of the secondary index. This works, because at
      // read time we read the number of entries and know where the secondary
      // index ends.
      for (int currentSecondaryIndex : secondaryIndexOffsetMarks) {
        out.writeInt(currentSecondaryIndex);
      }

      // We include one other element in the secondary index to calculate the
      // size of each entry more easily by subtracting secondary index elements.
      out.writeInt(curTotalNonRootEntrySize);

      for (int i = 0; i < blockKeys.size(); ++i) {
        out.writeLong(blockOffsets.get(i));
        out.writeInt(onDiskDataSizes.get(i));
        out.write(blockKeys.get(i));
      }
    }

    /**
     * @return the size of this chunk if stored in the non-root index block
     *         format
     */
    int getNonRootSize() {
      return Bytes.SIZEOF_INT                          // Number of entries
          + Bytes.SIZEOF_INT * (blockKeys.size() + 1)  // Secondary index
          + curTotalNonRootEntrySize;                  // All entries
    }

    /**
     * Writes this chunk into the given output stream in the root block index
     * format. This format is similar to the {@link HFile} version 1 block
     * index format, except that we store on-disk size of the block instead of
     * its uncompressed size.
     *
     * @param out the data output stream to write the block index to. Typically
     *          a stream writing into an {@link HFile} block.
     * @throws IOException if writing to {@code out} fails
     */
    void writeRoot(DataOutput out) throws IOException {
      for (int i = 0; i < blockKeys.size(); ++i) {
        out.writeLong(blockOffsets.get(i));
        out.writeInt(onDiskDataSizes.get(i));
        Bytes.writeByteArray(out, blockKeys.get(i));
      }
    }

    /**
     * @return the size of this chunk if stored in the root index block format
     */
    int getRootSize() {
      return curTotalRootSize;
    }

    /**
     * @return the number of entries in this block index chunk
     */
    public int getNumEntries() {
      return blockKeys.size();
    }

    /**
     * @param i 0-based entry index
     * @return the first key recorded for entry {@code i}
     */
    public byte[] getBlockKey(int i) {
      return blockKeys.get(i);
    }

    /**
     * @param i 0-based entry index
     * @return the block offset recorded for entry {@code i}
     */
    public long getBlockOffset(int i) {
      return blockOffsets.get(i);
    }

    /**
     * @param i 0-based entry index
     * @return the on-disk size recorded for entry {@code i}
     */
    public int getOnDiskDataSize(int i) {
      return onDiskDataSizes.get(i);
    }

    /**
     * @param i 0-based entry index; a negative value stands for "before the
     *          first entry"
     * @return the cumulative number of sub-entries in entries #0 through
     *         #i inclusively, or 0 if {@code i} is negative
     */
    public long getCumulativeNumKV(int i) {
      if (i < 0) {
        return 0;
      }
      return numSubEntriesAt.get(i);
    }

  }

  /**
   * Looks up {@code CACHE_INDEX_BLOCKS_ON_WRITE_KEY} in the given
   * configuration; the setting is off unless configured otherwise.
   *
   * @param conf the configuration to consult
   * @return true if the given configuration specifies that we should
   *         cache-on-write index blocks
   */
  public static boolean shouldCacheOnWrite(Configuration conf) {
    final boolean cacheIndexBlocksOnWrite =
        conf.getBoolean(CACHE_INDEX_BLOCKS_ON_WRITE_KEY, false);
    return cacheIndexBlocksOnWrite;
  }

  /**
   * Looks up {@code MAX_CHUNK_SIZE_KEY} in the given configuration.
   *
   * @param conf the configuration to consult
   * @return the configured value, or {@code DEFAULT_MAX_CHUNK_SIZE} if the
   *         key is not set
   */
  public static int getMaxChunkSize(Configuration conf) {
    final int configuredMaxChunkSize =
        conf.getInt(MAX_CHUNK_SIZE_KEY, DEFAULT_MAX_CHUNK_SIZE);
    return configuredMaxChunkSize;
  }

}