/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 */
package org.apache.hadoop.hbase.regionserver;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoder;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
import org.apache.hadoop.hbase.io.encoding.EncodedDataBlock;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.hadoop.hbase.io.hfile.Compression;
import org.apache.hadoop.hbase.io.hfile.Compression.Algorithm;
import org.apache.hadoop.hbase.io.hfile.NoOpDataBlockEncoder;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.compress.Compressor;
import org.apache.hadoop.io.compress.Decompressor;

/**
 * Tests various algorithms for key compression on an existing HFile. Useful
 * for testing, debugging and benchmarking.
 */
public class DataBlockEncodingTool {
  private static final Log LOG = LogFactory.getLog(
      DataBlockEncodingTool.class);

  private static final boolean includesMemstoreTS = true;

  /**
   * How many times the benchmark should run. More runs yield statistically
   * more reliable numbers. Must be larger than BENCHMARK_N_OMIT.
   */
  public static int BENCHMARK_N_TIMES = 12;

  /**
   * How many of the first runs to omit from the statistics, typically to
   * exclude one-time setup cost. Must be 0 or larger.
   */
  public static int BENCHMARK_N_OMIT = 2;

  /** Compression algorithm to use if not specified on the command line */
  private static final Algorithm DEFAULT_COMPRESSION =
      Compression.Algorithm.GZ;

  private List<EncodedDataBlock> codecs = new ArrayList<EncodedDataBlock>();
  private int totalPrefixLength = 0;
  private int totalKeyLength = 0;
  private int totalValueLength = 0;
  private int totalKeyRedundancyLength = 0;

  private final String compressionAlgorithmName;
  private final Algorithm compressionAlgorithm;
  private final Compressor compressor;
  private final Decompressor decompressor;
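  // Note on the two benchmark constants above: with the default values,
  // every reported figure is computed from BENCHMARK_N_TIMES -
  // BENCHMARK_N_OMIT = 12 - 2 = 10 timed runs; the first two runs are
  // discarded as warm-up.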
  /**
   * @param compressionAlgorithmName What kind of algorithm should be used
   *          as a baseline for comparison (e.g. lzo, gz).
   */
  public DataBlockEncodingTool(String compressionAlgorithmName) {
    this.compressionAlgorithmName = compressionAlgorithmName;
    this.compressionAlgorithm = Compression.getCompressionAlgorithmByName(
        compressionAlgorithmName);
    this.compressor = this.compressionAlgorithm.getCompressor();
    this.decompressor = this.compressionAlgorithm.getDecompressor();
  }

  /**
   * Gather statistics on the given HFile for all data block encoders.
   * @param scanner Scanner over the file to be compressed.
   * @param kvLimit Maximum number of KeyValues to process.
   * @throws IOException thrown if the scanner is invalid
   */
  public void checkStatistics(final KeyValueScanner scanner,
      final int kvLimit) throws IOException {
    scanner.seek(KeyValue.LOWESTKEY);

    KeyValue currentKv;

    byte[] previousKey = null;
    byte[] currentKey;

    List<DataBlockEncoder> dataBlockEncoders =
        DataBlockEncoding.getAllEncoders();

    for (DataBlockEncoder d : dataBlockEncoders) {
      codecs.add(new EncodedDataBlock(d, includesMemstoreTS));
    }

    int j = 0;
    while ((currentKv = scanner.next()) != null && j < kvLimit) {
      // Iterate through key/value pairs.
      j++;
      currentKey = currentKv.getKey();
      if (previousKey != null) {
        // Count how many leading bytes the current key shares with the
        // previous one; this is the redundancy a prefix-based encoder
        // could remove.
        for (int i = 0; i < previousKey.length && i < currentKey.length
            && previousKey[i] == currentKey[i]; ++i) {
          totalKeyRedundancyLength++;
        }
      }

      for (EncodedDataBlock codec : codecs) {
        codec.addKv(currentKv);
      }

      previousKey = currentKey;

      totalPrefixLength += currentKv.getLength() - currentKv.getKeyLength() -
          currentKv.getValueLength();
      totalKeyLength += currentKv.getKeyLength();
      totalValueLength += currentKv.getValueLength();
    }
  }
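  // Worked example for the redundancy counter above (byte values are
  // hypothetical): if two consecutive serialized keys are {5, 1, 2, 3} and
  // {5, 1, 9, 9}, they agree on their first two bytes, so they contribute
  // 2 to totalKeyRedundancyLength.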
  /**
   * Verify that all data block encoders are working properly.
   *
   * @param scanner Scanner over the file that was compressed.
   * @param kvLimit Maximum number of KeyValues to process.
   * @return true if all data block encoders compressed and decompressed
   *         correctly.
   * @throws IOException thrown if the scanner is invalid
   */
  public boolean verifyCodecs(final KeyValueScanner scanner, final int kvLimit)
      throws IOException {
    KeyValue currentKv;

    scanner.seek(KeyValue.LOWESTKEY);
    List<Iterator<KeyValue>> codecIterators =
        new ArrayList<Iterator<KeyValue>>();
    for (EncodedDataBlock codec : codecs) {
      codecIterators.add(codec.getIterator());
    }

    int j = 0;
    while ((currentKv = scanner.next()) != null && j < kvLimit) {
      // Iterate through key/value pairs.
      ++j;
      for (Iterator<KeyValue> it : codecIterators) {
        KeyValue codecKv = it.next();
        if (codecKv == null || 0 != Bytes.compareTo(
            codecKv.getBuffer(), codecKv.getOffset(), codecKv.getLength(),
            currentKv.getBuffer(), currentKv.getOffset(),
            currentKv.getLength())) {
          if (codecKv == null) {
            LOG.error("There is a bug in codec " + it +
                ": it returned a null KeyValue.");
          } else {
            int prefix = 0;
            int limitLength = 2 * Bytes.SIZEOF_INT +
                Math.min(codecKv.getLength(), currentKv.getLength());
            while (prefix < limitLength &&
                codecKv.getBuffer()[prefix + codecKv.getOffset()] ==
                currentKv.getBuffer()[prefix + currentKv.getOffset()]) {
              prefix++;
            }

            LOG.error("There is a bug in codec " + it.toString() +
                "\n on element " + j +
                "\n codecKv.getKeyLength() " + codecKv.getKeyLength() +
                "\n codecKv.getValueLength() " + codecKv.getValueLength() +
                "\n codecKv.getLength() " + codecKv.getLength() +
                "\n currentKv.getKeyLength() " + currentKv.getKeyLength() +
                "\n currentKv.getValueLength() " + currentKv.getValueLength() +
                "\n currentKv.getLength() " + currentKv.getLength() +
                "\n currentKV rowLength " + currentKv.getRowLength() +
                " familyName " + currentKv.getFamilyLength() +
                " qualifier " + currentKv.getQualifierLength() +
                "\n prefix " + prefix +
                "\n codecKv '" + Bytes.toStringBinary(codecKv.getBuffer(),
                    codecKv.getOffset(), prefix) + "' diff '" +
                    Bytes.toStringBinary(codecKv.getBuffer(),
                        codecKv.getOffset() + prefix,
                        codecKv.getLength() - prefix) + "'" +
                "\n currentKv '" + Bytes.toStringBinary(
                    currentKv.getBuffer(),
                    currentKv.getOffset(), prefix) + "' diff '" +
                    Bytes.toStringBinary(currentKv.getBuffer(),
                        currentKv.getOffset() + prefix,
                        currentKv.getLength() - prefix) + "'"
                );
          }
          return false;
        }
      }
    }

    LOG.info("Verification was successful!");

    return true;
  }

  /**
   * Benchmark the speed of each codec.
   */
  public void benchmarkCodecs() {
    int prevTotalSize = -1;
    for (EncodedDataBlock codec : codecs) {
      prevTotalSize = benchmarkEncoder(prevTotalSize, codec);
    }

    byte[] buffer = codecs.get(0).getRawKeyValues();

    benchmarkDefaultCompression(prevTotalSize, buffer);
  }
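  // Timing pattern shared by the benchmark methods below (a sketch; the
  // variable names are illustrative only):
  //
  //   List<Long> durations = new ArrayList<Long>();
  //   for (int run = 0; run < BENCHMARK_N_TIMES; ++run) {
  //     long start = System.nanoTime();
  //     // ... operation under test ...
  //     if (run >= BENCHMARK_N_OMIT) {       // skip warm-up runs
  //       durations.add(System.nanoTime() - start);
  //     }
  //   }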
  /**
   * Benchmark compression/decompression throughput.
   * @param previousTotalSize Total size used for verification. Use -1 if
   *          unknown.
   * @param codec Tested encoder.
   * @return Size of uncompressed data.
   */
  private int benchmarkEncoder(int previousTotalSize, EncodedDataBlock codec) {
    int prevTotalSize = previousTotalSize;
    int totalSize = 0;

    // decompression time
    List<Long> durations = new ArrayList<Long>();
    for (int itTime = 0; itTime < BENCHMARK_N_TIMES; ++itTime) {
      totalSize = 0;

      Iterator<KeyValue> it;

      it = codec.getIterator();

      // Count only the algorithm time, without memory allocations
      // (except for the first run).
      final long startTime = System.nanoTime();
      while (it.hasNext()) {
        totalSize += it.next().getLength();
      }
      final long finishTime = System.nanoTime();
      if (itTime >= BENCHMARK_N_OMIT) {
        durations.add(finishTime - startTime);
      }

      if (prevTotalSize != -1 && prevTotalSize != totalSize) {
        throw new IllegalStateException(String.format(
            "Algorithm '%s' decoded data to a different size",
            codec.toString()));
      }
      prevTotalSize = totalSize;
    }

    // compression time
    List<Long> compressDurations = new ArrayList<Long>();
    for (int itTime = 0; itTime < BENCHMARK_N_TIMES; ++itTime) {
      final long startTime = System.nanoTime();
      codec.doCompressData();
      final long finishTime = System.nanoTime();
      if (itTime >= BENCHMARK_N_OMIT) {
        compressDurations.add(finishTime - startTime);
      }
    }

    System.out.println(codec.toString() + ":");
    printBenchmarkResult(totalSize, compressDurations, false);
    printBenchmarkResult(totalSize, durations, true);

    return prevTotalSize;
  }

  private void benchmarkDefaultCompression(int totalSize, byte[] rawBuffer) {
    benchmarkAlgorithm(compressionAlgorithm, compressor, decompressor,
        compressionAlgorithmName.toUpperCase(), rawBuffer, 0, totalSize);
  }
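  // A minimal sketch of calling benchmarkAlgorithm() directly for an ad-hoc
  // buffer (the buffer is hypothetical and must contain serialized KeyValues,
  // since the decompression check parses it as such; the call mirrors
  // benchmarkDefaultCompression above):
  //
  //   byte[] data = ...; // raw KeyValue bytes to benchmark
  //   Algorithm gz = Compression.Algorithm.GZ;
  //   benchmarkAlgorithm(gz, gz.getCompressor(), gz.getDecompressor(),
  //       "GZ", data, 0, data.length);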
  /**
   * Check the compression and decompression performance of the given
   * algorithm and print the results.
   * @param algorithm Compression algorithm.
   * @param compressorCodec Compressor to be tested.
   * @param decompressorCodec Decompressor of the same algorithm.
   * @param name Name of the algorithm.
   * @param buffer Buffer to be compressed.
   * @param offset Position of the beginning of the data.
   * @param length Length of data in buffer.
   */
  public static void benchmarkAlgorithm(
      Compression.Algorithm algorithm,
      Compressor compressorCodec, Decompressor decompressorCodec,
      String name, byte[] buffer, int offset, int length) {
    System.out.println(name + ":");

    // compress it
    List<Long> compressDurations = new ArrayList<Long>();
    ByteArrayOutputStream compressedStream = new ByteArrayOutputStream();
    OutputStream compressingStream;
    try {
      for (int itTime = 0; itTime < BENCHMARK_N_TIMES; ++itTime) {
        final long startTime = System.nanoTime();
        compressingStream = algorithm.createCompressionStream(
            compressedStream, compressorCodec, 0);
        compressingStream.write(buffer, offset, length);
        compressingStream.flush();
        compressedStream.toByteArray();

        final long finishTime = System.nanoTime();

        // add time record
        if (itTime >= BENCHMARK_N_OMIT) {
          compressDurations.add(finishTime - startTime);
        }

        if (itTime + 1 < BENCHMARK_N_TIMES) { // not the last one
          compressedStream.reset();
        }
      }
    } catch (IOException e) {
      throw new RuntimeException(String.format(
          "Benchmark of encoding algorithm '%s' caused a stream problem",
          name), e);
    }
    printBenchmarkResult(length, compressDurations, false);

    byte[] compBuffer = compressedStream.toByteArray();

    // uncompress it several times and measure performance
    List<Long> durations = new ArrayList<Long>();
    for (int itTime = 0; itTime < BENCHMARK_N_TIMES; ++itTime) {
      final long startTime = System.nanoTime();
      byte[] newBuf = new byte[length + 1];

      try {
        ByteArrayInputStream downStream = new ByteArrayInputStream(compBuffer,
            0, compBuffer.length);
        InputStream decompressedStream = algorithm.createDecompressionStream(
            downStream, decompressorCodec, 0);

        int destOffset = 0;
        int nextChunk;
        while ((nextChunk = decompressedStream.available()) > 0) {
          destOffset += decompressedStream.read(newBuf, destOffset, nextChunk);
        }
        decompressedStream.close();

        // iterate over the KeyValues to make sure the buffer parses
        KeyValue kv;
        for (int pos = 0; pos < length; pos += kv.getLength()) {
          kv = new KeyValue(newBuf, pos);
        }
      } catch (IOException e) {
        throw new RuntimeException(String.format(
            "Decoding path in '%s' algorithm caused an exception", name), e);
      }

      final long finishTime = System.nanoTime();

      // check correctness
      if (0 != Bytes.compareTo(buffer, 0, length, newBuf, 0, length)) {
        int prefix = 0;
        for (; prefix < buffer.length && prefix < newBuf.length; ++prefix) {
          if (buffer[prefix] != newBuf[prefix]) {
            break;
          }
        }
        throw new RuntimeException(String.format(
            "Algorithm '%s' is corrupting the data at position %d",
            name, prefix));
      }

      // add time record
      if (itTime >= BENCHMARK_N_OMIT) {
        durations.add(finishTime - startTime);
      }
    }
    printBenchmarkResult(length, durations, true);
  }

  private static void printBenchmarkResult(int totalSize,
      List<Long> durationsInNanoSec, boolean isDecompression) {
    long meanTime = 0;
    for (long time : durationsInNanoSec) {
      meanTime += time;
    }
    meanTime /= durationsInNanoSec.size();

    long standardDev = 0;
    for (long time : durationsInNanoSec) {
      standardDev += (time - meanTime) * (time - meanTime);
    }
    standardDev = (long) Math.sqrt(standardDev / durationsInNanoSec.size());

    final double nanosPerSecond = 1000.0 * 1000.0 * 1000.0;
    double mbPerSec = (totalSize * nanosPerSecond) /
        (1024.0 * 1024.0 * meanTime);
    double mbPerSecDev = (totalSize * nanosPerSecond) /
        (1024.0 * 1024.0 * (meanTime - standardDev));

    System.out.println(String.format(
        " %s performance:%s %6.2f MB/s (+/- %.2f MB/s)",
        isDecompression ? "Decompression" : "Compression",
        isDecompression ? "" : " ",
        mbPerSec, mbPerSecDev - mbPerSec));
  }
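  // Worked example for the throughput formula above (numbers hypothetical):
  // totalSize = 64 MiB processed in meanTime = 0.5e9 ns gives
  // 64 MiB / 0.5 s = 128 MB/s; the "+/-" figure comes from re-evaluating the
  // same formula at (meanTime - standardDev).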
  /**
   * Display statistics of different compression algorithms.
   */
  public void displayStatistics() {
    int totalLength = totalPrefixLength + totalKeyLength + totalValueLength;
    if (compressor != null) { // might be null, e.g. for pure-Java GZIP
      compressor.reset();
    }

    for (EncodedDataBlock codec : codecs) {
      System.out.println(codec.toString());
      int saved = totalKeyLength + totalPrefixLength + totalValueLength
          - codec.getSize();
      System.out.println(
          String.format(" Saved bytes: %8d", saved));
      double keyRatio = (saved * 100.0) / (totalPrefixLength + totalKeyLength);
      double allRatio = (saved * 100.0) / totalLength;
      System.out.println(
          String.format(" Key compression ratio: %.2f %%", keyRatio));
      System.out.println(
          String.format(" All compression ratio: %.2f %%", allRatio));

      String compressedSizeCaption =
          String.format(" %s compressed size: ",
              compressionAlgorithmName.toUpperCase());
      String compressOnlyRatioCaption =
          String.format(" %s compression ratio: ",
              compressionAlgorithmName.toUpperCase());

      if (compressor != null) {
        int compressedSize = codec.checkCompressedSize(compressor);
        System.out.println(compressedSizeCaption +
            String.format("%8d", compressedSize));
        double compressOnlyRatio =
            100.0 * (1.0 - compressedSize / (0.0 + totalLength));
        System.out.println(compressOnlyRatioCaption +
            String.format("%.2f %%", compressOnlyRatio));
      } else {
        System.out.println(compressedSizeCaption + "N/A");
        System.out.println(compressOnlyRatioCaption + "N/A");
      }
    }

    System.out.println(
        String.format("Total KV prefix length: %8d", totalPrefixLength));
    System.out.println(
        String.format("Total key length: %8d", totalKeyLength));
    System.out.println(
        String.format("Total key redundancy: %8d",
            totalKeyRedundancyLength));
    System.out.println(
        String.format("Total value length: %8d", totalValueLength));
  }
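  // Example invocation of this tool (the path is hypothetical; the options
  // are the ones defined in main() below):
  //
  //   ./hbase org.apache.hadoop.hbase.regionserver.DataBlockEncodingTool \
  //       -f /hbase/mytable/myregion/mycf/myhfile -n 1000000 -b -a lzo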
  /**
   * Test a data block encoder on the given HFile. Output results to console.
   * @param kvLimit Maximum number of KeyValues to analyse.
   * @param hfilePath an HFile path on the file system.
   * @param compressionName Compression algorithm used for comparison.
   * @param doBenchmark Run performance benchmarks.
   * @param doVerify Verify correctness.
   * @throws IOException When pathName is incorrect.
   */
  public static void testCodecs(Configuration conf, int kvLimit,
      String hfilePath, String compressionName, boolean doBenchmark,
      boolean doVerify) throws IOException {
    // create environment
    Path path = new Path(hfilePath);
    CacheConfig cacheConf = new CacheConfig(conf);
    FileSystem fs = FileSystem.get(conf);
    StoreFile hsf = new StoreFile(fs, path, conf, cacheConf,
        StoreFile.BloomType.NONE, NoOpDataBlockEncoder.INSTANCE);

    StoreFile.Reader reader = hsf.createReader();
    reader.loadFileInfo();
    KeyValueScanner scanner = reader.getStoreFileScanner(true, true);

    // run the utilities
    DataBlockEncodingTool comp = new DataBlockEncodingTool(compressionName);
    comp.checkStatistics(scanner, kvLimit);
    if (doVerify) {
      comp.verifyCodecs(scanner, kvLimit);
    }
    if (doBenchmark) {
      comp.benchmarkCodecs();
    }
    comp.displayStatistics();

    // cleanup
    scanner.close();
    reader.close(cacheConf.shouldEvictOnClose());
  }

  private static void printUsage(Options options) {
    System.err.println("Usage:");
    System.err.println(String.format("./hbase %s <options>",
        DataBlockEncodingTool.class.getName()));
    System.err.println("Options:");
    for (Object it : options.getOptions()) {
      Option opt = (Option) it;
      if (opt.hasArg()) {
        System.err.println(String.format("-%s %s: %s", opt.getOpt(),
            opt.getArgName(), opt.getDescription()));
      } else {
        System.err.println(String.format("-%s: %s", opt.getOpt(),
            opt.getDescription()));
      }
    }
  }

  /**
   * A command line interface to benchmarks.
   * @param args Must hold at least the path to an HFile (the -f option).
   * @throws IOException If the specified file cannot be read.
   */
  public static void main(final String[] args) throws IOException {
    // set up user arguments
    Options options = new Options();
    options.addOption("f", true, "HFile to analyse (REQUIRED)");
    options.getOption("f").setArgName("FILENAME");
    options.addOption("n", true,
        "Maximum number of KeyValues to analyse");
    options.getOption("n").setArgName("NUMBER");
    options.addOption("b", false, "Measure read throughput");
    options.addOption("c", false, "Omit correctness tests.");
    options.addOption("a", true,
        "Compression algorithm to use for comparison.");

    // parse arguments
    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;
    try {
      cmd = parser.parse(options, args);
    } catch (ParseException e) {
      System.err.println("Could not parse arguments!");
      System.exit(-1);
      return; // avoid warning
    }

    int kvLimit = Integer.MAX_VALUE;
    if (cmd.hasOption("n")) {
      kvLimit = Integer.parseInt(cmd.getOptionValue("n"));
    }

    // basic argument sanity checks
    if (!cmd.hasOption("f")) {
      System.err.println("ERROR: Filename is required!");
      printUsage(options);
      System.exit(-1);
    }

    String pathName = cmd.getOptionValue("f");
    String compressionName = DEFAULT_COMPRESSION.getName();
    if (cmd.hasOption("a")) {
      compressionName = cmd.getOptionValue("a").toLowerCase();
    }
    boolean doBenchmark = cmd.hasOption("b");
    boolean doVerify = !cmd.hasOption("c");

    final Configuration conf = HBaseConfiguration.create();
    try {
      testCodecs(conf, kvLimit, pathName, compressionName, doBenchmark,
          doVerify);
    } finally {
      (new CacheConfig(conf)).getBlockCache().shutdown();
    }
  }

}