/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 */
package org.apache.hadoop.hbase.regionserver;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoder;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
import org.apache.hadoop.hbase.io.encoding.EncodedDataBlock;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.hadoop.hbase.io.hfile.Compression;
import org.apache.hadoop.hbase.io.hfile.Compression.Algorithm;
import org.apache.hadoop.hbase.io.hfile.NoOpDataBlockEncoder;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.compress.Compressor;
import org.apache.hadoop.io.compress.Decompressor;

/**
 * Tests various algorithms for key compression on an existing HFile. Useful
 * for testing, debugging and benchmarking.
 */
public class DataBlockEncodingTool {
  private static final Log LOG = LogFactory.getLog(
      DataBlockEncodingTool.class);

  private static final boolean includesMemstoreTS = true;

  /**
   * How many times the benchmark should run. More runs yield statistically
   * more reliable numbers. Must be larger than BENCHMARK_N_OMIT.
   */
  public static int BENCHMARK_N_TIMES = 12;

  /**
   * How many of the first runs to omit from the statistics, typically to
   * exclude one-time setup cost. Must be 0 or larger.
   */
  public static int BENCHMARK_N_OMIT = 2;

  /** Compression algorithm to use if not specified on the command line */
  private static final Algorithm DEFAULT_COMPRESSION =
      Compression.Algorithm.GZ;

  private List<EncodedDataBlock> codecs = new ArrayList<EncodedDataBlock>();
  private int totalPrefixLength = 0;
  private int totalKeyLength = 0;
  private int totalValueLength = 0;
  private int totalKeyRedundancyLength = 0;

  private final String compressionAlgorithmName;
  private final Algorithm compressionAlgorithm;
  private final Compressor compressor;
  private final Decompressor decompressor;
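  // Note on the two benchmark constants above: with the default values,
  // every reported figure is computed from BENCHMARK_N_TIMES -
  // BENCHMARK_N_OMIT = 12 - 2 = 10 timed runs; the first two runs are
  // discarded as warm-up.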
  /**
   * @param compressionAlgorithmName What kind of algorithm should be used
   *          as a baseline for comparison (e.g. lzo, gz).
   */
  public DataBlockEncodingTool(String compressionAlgorithmName) {
    this.compressionAlgorithmName = compressionAlgorithmName;
    this.compressionAlgorithm = Compression.getCompressionAlgorithmByName(
        compressionAlgorithmName);
    this.compressor = this.compressionAlgorithm.getCompressor();
    this.decompressor = this.compressionAlgorithm.getDecompressor();
  }

  /**
   * Gather statistics on the given HFile for all data block encoders.
   * @param scanner Scanner over the file to be compressed.
   * @param kvLimit Maximum number of KeyValues to process.
   * @throws IOException thrown if the scanner is invalid
   */
  public void checkStatistics(final KeyValueScanner scanner,
      final int kvLimit) throws IOException {
    scanner.seek(KeyValue.LOWESTKEY);

    KeyValue currentKv;

    byte[] previousKey = null;
    byte[] currentKey;

    List<DataBlockEncoder> dataBlockEncoders =
        DataBlockEncoding.getAllEncoders();

    for (DataBlockEncoder d : dataBlockEncoders) {
      codecs.add(new EncodedDataBlock(d, includesMemstoreTS));
    }

    int j = 0;
    while ((currentKv = scanner.next()) != null && j < kvLimit) {
      // Iterate through key/value pairs.
      j++;
      currentKey = currentKv.getKey();
      if (previousKey != null) {
        // Count how many leading bytes the current key shares with the
        // previous one; this is the redundancy a prefix-based encoder
        // could remove.
        for (int i = 0; i < previousKey.length && i < currentKey.length
            && previousKey[i] == currentKey[i]; ++i) {
          totalKeyRedundancyLength++;
        }
      }

      for (EncodedDataBlock codec : codecs) {
        codec.addKv(currentKv);
      }

      previousKey = currentKey;

      totalPrefixLength += currentKv.getLength() - currentKv.getKeyLength() -
          currentKv.getValueLength();
      totalKeyLength += currentKv.getKeyLength();
      totalValueLength += currentKv.getValueLength();
    }
  }
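  // Worked example for the redundancy counter above (byte values are
  // hypothetical): if two consecutive serialized keys are {5, 1, 2, 3} and
  // {5, 1, 9, 9}, they agree on their first two bytes, so they contribute
  // 2 to totalKeyRedundancyLength.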
  /**
   * Verify that all data block encoders are working properly.
   *
   * @param scanner Scanner over the file that was compressed.
   * @param kvLimit Maximum number of KeyValues to process.
   * @return true if all data block encoders compressed and decompressed
   *         correctly.
   * @throws IOException thrown if the scanner is invalid
   */
  public boolean verifyCodecs(final KeyValueScanner scanner, final int kvLimit)
      throws IOException {
    KeyValue currentKv;

    scanner.seek(KeyValue.LOWESTKEY);
    List<Iterator<KeyValue>> codecIterators =
        new ArrayList<Iterator<KeyValue>>();
    for (EncodedDataBlock codec : codecs) {
      codecIterators.add(codec.getIterator());
    }

    int j = 0;
    while ((currentKv = scanner.next()) != null && j < kvLimit) {
      // Iterate through key/value pairs.
      ++j;
      for (Iterator<KeyValue> it : codecIterators) {
        KeyValue codecKv = it.next();
        if (codecKv == null || 0 != Bytes.compareTo(
            codecKv.getBuffer(), codecKv.getOffset(), codecKv.getLength(),
            currentKv.getBuffer(), currentKv.getOffset(),
            currentKv.getLength())) {
          if (codecKv == null) {
            LOG.error("There is a bug in codec " + it +
                ": it returned a null KeyValue.");
          } else {
            int prefix = 0;
            int limitLength = 2 * Bytes.SIZEOF_INT +
                Math.min(codecKv.getLength(), currentKv.getLength());
            while (prefix < limitLength &&
                codecKv.getBuffer()[prefix + codecKv.getOffset()] ==
                currentKv.getBuffer()[prefix + currentKv.getOffset()]) {
              prefix++;
            }

            LOG.error("There is a bug in codec " + it.toString() +
                "\n on element " + j +
                "\n codecKv.getKeyLength() " + codecKv.getKeyLength() +
                "\n codecKv.getValueLength() " + codecKv.getValueLength() +
                "\n codecKv.getLength() " + codecKv.getLength() +
                "\n currentKv.getKeyLength() " + currentKv.getKeyLength() +
                "\n currentKv.getValueLength() " + currentKv.getValueLength() +
                "\n currentKv.getLength() " + currentKv.getLength() +
                "\n currentKV rowLength " + currentKv.getRowLength() +
                " familyName " + currentKv.getFamilyLength() +
                " qualifier " + currentKv.getQualifierLength() +
                "\n prefix " + prefix +
                "\n codecKv '" + Bytes.toStringBinary(codecKv.getBuffer(),
                    codecKv.getOffset(), prefix) + "' diff '" +
                    Bytes.toStringBinary(codecKv.getBuffer(),
                        codecKv.getOffset() + prefix,
                        codecKv.getLength() - prefix) + "'" +
                "\n currentKv '" + Bytes.toStringBinary(
                    currentKv.getBuffer(),
                    currentKv.getOffset(), prefix) + "' diff '" +
                    Bytes.toStringBinary(currentKv.getBuffer(),
                        currentKv.getOffset() + prefix,
                        currentKv.getLength() - prefix) + "'"
                );
          }
          return false;
        }
      }
    }

    LOG.info("Verification was successful!");

    return true;
  }

  /**
   * Benchmark the speed of each codec.
   */
  public void benchmarkCodecs() {
    int prevTotalSize = -1;
    for (EncodedDataBlock codec : codecs) {
      prevTotalSize = benchmarkEncoder(prevTotalSize, codec);
    }

    byte[] buffer = codecs.get(0).getRawKeyValues();

    benchmarkDefaultCompression(prevTotalSize, buffer);
  }
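  // Timing pattern shared by the benchmark methods below (a sketch; the
  // variable names are illustrative only):
  //
  //   List<Long> durations = new ArrayList<Long>();
  //   for (int run = 0; run < BENCHMARK_N_TIMES; ++run) {
  //     long start = System.nanoTime();
  //     // ... operation under test ...
  //     if (run >= BENCHMARK_N_OMIT) {       // skip warm-up runs
  //       durations.add(System.nanoTime() - start);
  //     }
  //   }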
  /**
   * Benchmark compression/decompression throughput.
   * @param previousTotalSize Total size used for verification. Use -1 if
   *          unknown.
   * @param codec Tested encoder.
   * @return Size of uncompressed data.
   */
  private int benchmarkEncoder(int previousTotalSize, EncodedDataBlock codec) {
    int prevTotalSize = previousTotalSize;
    int totalSize = 0;

    // decompression time
    List<Long> durations = new ArrayList<Long>();
    for (int itTime = 0; itTime < BENCHMARK_N_TIMES; ++itTime) {
      totalSize = 0;

      Iterator<KeyValue> it;

      it = codec.getIterator();

      // Count only the algorithm time, without memory allocations
      // (except for the first run).
      final long startTime = System.nanoTime();
      while (it.hasNext()) {
        totalSize += it.next().getLength();
      }
      final long finishTime = System.nanoTime();
      if (itTime >= BENCHMARK_N_OMIT) {
        durations.add(finishTime - startTime);
      }

      if (prevTotalSize != -1 && prevTotalSize != totalSize) {
        throw new IllegalStateException(String.format(
            "Algorithm '%s' decoded data to a different size",
            codec.toString()));
      }
      prevTotalSize = totalSize;
    }

    // compression time
    List<Long> compressDurations = new ArrayList<Long>();
    for (int itTime = 0; itTime < BENCHMARK_N_TIMES; ++itTime) {
      final long startTime = System.nanoTime();
      codec.doCompressData();
      final long finishTime = System.nanoTime();
      if (itTime >= BENCHMARK_N_OMIT) {
        compressDurations.add(finishTime - startTime);
      }
    }

    System.out.println(codec.toString() + ":");
    printBenchmarkResult(totalSize, compressDurations, false);
    printBenchmarkResult(totalSize, durations, true);

    return prevTotalSize;
  }

  private void benchmarkDefaultCompression(int totalSize, byte[] rawBuffer) {
    benchmarkAlgorithm(compressionAlgorithm, compressor, decompressor,
        compressionAlgorithmName.toUpperCase(), rawBuffer, 0, totalSize);
  }
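  // A minimal sketch of calling benchmarkAlgorithm() directly for an ad-hoc
  // buffer (the buffer is hypothetical and must contain serialized KeyValues,
  // since the decompression check parses it as such; the call mirrors
  // benchmarkDefaultCompression above):
  //
  //   byte[] data = ...; // raw KeyValue bytes to benchmark
  //   Algorithm gz = Compression.Algorithm.GZ;
  //   benchmarkAlgorithm(gz, gz.getCompressor(), gz.getDecompressor(),
  //       "GZ", data, 0, data.length);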
  /**
   * Check the compression and decompression performance of the given
   * algorithm and print the results.
   * @param algorithm Compression algorithm.
   * @param compressorCodec Compressor to be tested.
   * @param decompressorCodec Decompressor of the same algorithm.
   * @param name Name of the algorithm.
   * @param buffer Buffer to be compressed.
   * @param offset Position of the beginning of the data.
   * @param length Length of data in buffer.
   */
  public static void benchmarkAlgorithm(
      Compression.Algorithm algorithm,
      Compressor compressorCodec, Decompressor decompressorCodec,
      String name, byte[] buffer, int offset, int length) {
    System.out.println(name + ":");

    // compress it
    List<Long> compressDurations = new ArrayList<Long>();
    ByteArrayOutputStream compressedStream = new ByteArrayOutputStream();
    OutputStream compressingStream;
    try {
      for (int itTime = 0; itTime < BENCHMARK_N_TIMES; ++itTime) {
        final long startTime = System.nanoTime();
        compressingStream = algorithm.createCompressionStream(
            compressedStream, compressorCodec, 0);
        compressingStream.write(buffer, offset, length);
        compressingStream.flush();
        compressedStream.toByteArray();

        final long finishTime = System.nanoTime();

        // add time record
        if (itTime >= BENCHMARK_N_OMIT) {
          compressDurations.add(finishTime - startTime);
        }

        if (itTime + 1 < BENCHMARK_N_TIMES) { // not the last one
          compressedStream.reset();
        }
      }
    } catch (IOException e) {
      throw new RuntimeException(String.format(
          "Benchmark of encoding algorithm '%s' caused a stream problem",
          name), e);
    }
    printBenchmarkResult(length, compressDurations, false);

    byte[] compBuffer = compressedStream.toByteArray();

    // uncompress it several times and measure performance
    List<Long> durations = new ArrayList<Long>();
    for (int itTime = 0; itTime < BENCHMARK_N_TIMES; ++itTime) {
      final long startTime = System.nanoTime();
      byte[] newBuf = new byte[length + 1];

      try {
        ByteArrayInputStream downStream = new ByteArrayInputStream(compBuffer,
            0, compBuffer.length);
        InputStream decompressedStream = algorithm.createDecompressionStream(
            downStream, decompressorCodec, 0);

        int destOffset = 0;
        int nextChunk;
        while ((nextChunk = decompressedStream.available()) > 0) {
          destOffset += decompressedStream.read(newBuf, destOffset, nextChunk);
        }
        decompressedStream.close();

        // iterate over the KeyValues to make sure the buffer parses
        KeyValue kv;
        for (int pos = 0; pos < length; pos += kv.getLength()) {
          kv = new KeyValue(newBuf, pos);
        }
      } catch (IOException e) {
        throw new RuntimeException(String.format(
            "Decoding path in '%s' algorithm caused an exception", name), e);
      }

      final long finishTime = System.nanoTime();

      // check correctness
      if (0 != Bytes.compareTo(buffer, 0, length, newBuf, 0, length)) {
        int prefix = 0;
        for (; prefix < buffer.length && prefix < newBuf.length; ++prefix) {
          if (buffer[prefix] != newBuf[prefix]) {
            break;
          }
        }
        throw new RuntimeException(String.format(
            "Algorithm '%s' is corrupting the data at position %d",
            name, prefix));
      }

      // add time record
      if (itTime >= BENCHMARK_N_OMIT) {
        durations.add(finishTime - startTime);
      }
    }
    printBenchmarkResult(length, durations, true);
  }

  private static void printBenchmarkResult(int totalSize,
      List<Long> durationsInNanoSec, boolean isDecompression) {
    long meanTime = 0;
    for (long time : durationsInNanoSec) {
      meanTime += time;
    }
    meanTime /= durationsInNanoSec.size();

    long standardDev = 0;
    for (long time : durationsInNanoSec) {
      standardDev += (time - meanTime) * (time - meanTime);
    }
    standardDev = (long) Math.sqrt(standardDev / durationsInNanoSec.size());

    final double nanosPerSecond = 1000.0 * 1000.0 * 1000.0;
    double mbPerSec = (totalSize * nanosPerSecond) /
        (1024.0 * 1024.0 * meanTime);
    double mbPerSecDev = (totalSize * nanosPerSecond) /
        (1024.0 * 1024.0 * (meanTime - standardDev));

    System.out.println(String.format(
        " %s performance:%s %6.2f MB/s (+/- %.2f MB/s)",
        isDecompression ? "Decompression" : "Compression",
        isDecompression ? "" : " ",
        mbPerSec, mbPerSecDev - mbPerSec));
  }
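  // Worked example for the throughput formula above (numbers hypothetical):
  // totalSize = 64 MiB processed in meanTime = 0.5e9 ns gives
  // 64 MiB / 0.5 s = 128 MB/s; the "+/-" figure comes from re-evaluating the
  // same formula at (meanTime - standardDev).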
  /**
   * Display statistics of different compression algorithms.
   */
  public void displayStatistics() {
    int totalLength = totalPrefixLength + totalKeyLength + totalValueLength;
    if (compressor != null) { // might be null, e.g. for pure-Java GZIP
      compressor.reset();
    }

    for (EncodedDataBlock codec : codecs) {
      System.out.println(codec.toString());
      int saved = totalKeyLength + totalPrefixLength + totalValueLength
          - codec.getSize();
      System.out.println(
          String.format(" Saved bytes: %8d", saved));
      double keyRatio = (saved * 100.0) / (totalPrefixLength + totalKeyLength);
      double allRatio = (saved * 100.0) / totalLength;
      System.out.println(
          String.format(" Key compression ratio: %.2f %%", keyRatio));
      System.out.println(
          String.format(" All compression ratio: %.2f %%", allRatio));

      String compressedSizeCaption =
          String.format(" %s compressed size: ",
              compressionAlgorithmName.toUpperCase());
      String compressOnlyRatioCaption =
          String.format(" %s compression ratio: ",
              compressionAlgorithmName.toUpperCase());

      if (compressor != null) {
        int compressedSize = codec.checkCompressedSize(compressor);
        System.out.println(compressedSizeCaption +
            String.format("%8d", compressedSize));
        double compressOnlyRatio =
            100.0 * (1.0 - compressedSize / (0.0 + totalLength));
        System.out.println(compressOnlyRatioCaption +
            String.format("%.2f %%", compressOnlyRatio));
      } else {
        System.out.println(compressedSizeCaption + "N/A");
        System.out.println(compressOnlyRatioCaption + "N/A");
      }
    }

    System.out.println(
        String.format("Total KV prefix length: %8d", totalPrefixLength));
    System.out.println(
        String.format("Total key length: %8d", totalKeyLength));
    System.out.println(
        String.format("Total key redundancy: %8d",
            totalKeyRedundancyLength));
    System.out.println(
        String.format("Total value length: %8d", totalValueLength));
  }
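  // Example invocation of this tool (the path is hypothetical; the options
  // are the ones defined in main() below):
  //
  //   ./hbase org.apache.hadoop.hbase.regionserver.DataBlockEncodingTool \
  //       -f /hbase/mytable/myregion/mycf/myhfile -n 1000000 -b -a lzo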
  /**
   * Test a data block encoder on the given HFile. Output results to console.
   * @param kvLimit Maximum number of KeyValues to analyse.
   * @param hfilePath an HFile path on the file system.
   * @param compressionName Compression algorithm used for comparison.
   * @param doBenchmark Run performance benchmarks.
   * @param doVerify Verify correctness.
   * @throws IOException When pathName is incorrect.
   */
  public static void testCodecs(Configuration conf, int kvLimit,
      String hfilePath, String compressionName, boolean doBenchmark,
      boolean doVerify) throws IOException {
    // create environment
    Path path = new Path(hfilePath);
    CacheConfig cacheConf = new CacheConfig(conf);
    FileSystem fs = FileSystem.get(conf);
    StoreFile hsf = new StoreFile(fs, path, conf, cacheConf,
        StoreFile.BloomType.NONE, NoOpDataBlockEncoder.INSTANCE);

    StoreFile.Reader reader = hsf.createReader();
    reader.loadFileInfo();
    KeyValueScanner scanner = reader.getStoreFileScanner(true, true);

    // run the utilities
    DataBlockEncodingTool comp = new DataBlockEncodingTool(compressionName);
    comp.checkStatistics(scanner, kvLimit);
    if (doVerify) {
      comp.verifyCodecs(scanner, kvLimit);
    }
    if (doBenchmark) {
      comp.benchmarkCodecs();
    }
    comp.displayStatistics();

    // cleanup
    scanner.close();
    reader.close(cacheConf.shouldEvictOnClose());
  }

  private static void printUsage(Options options) {
    System.err.println("Usage:");
    System.err.println(String.format("./hbase %s <options>",
        DataBlockEncodingTool.class.getName()));
    System.err.println("Options:");
    for (Object it : options.getOptions()) {
      Option opt = (Option) it;
      if (opt.hasArg()) {
        System.err.println(String.format("-%s %s: %s", opt.getOpt(),
            opt.getArgName(), opt.getDescription()));
      } else {
        System.err.println(String.format("-%s: %s", opt.getOpt(),
            opt.getDescription()));
      }
    }
  }

  /**
   * A command line interface to benchmarks.
   * @param args Must hold at least the path to an HFile (the -f option).
   * @throws IOException If the specified file cannot be read.
   */
  public static void main(final String[] args) throws IOException {
    // set up user arguments
    Options options = new Options();
    options.addOption("f", true, "HFile to analyse (REQUIRED)");
    options.getOption("f").setArgName("FILENAME");
    options.addOption("n", true,
        "Maximum number of KeyValues to analyse");
    options.getOption("n").setArgName("NUMBER");
    options.addOption("b", false, "Measure read throughput");
    options.addOption("c", false, "Omit correctness tests.");
    options.addOption("a", true,
        "Compression algorithm to use for comparison.");

    // parse arguments
    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;
    try {
      cmd = parser.parse(options, args);
    } catch (ParseException e) {
      System.err.println("Could not parse arguments!");
      System.exit(-1);
      return; // avoid warning
    }

    int kvLimit = Integer.MAX_VALUE;
    if (cmd.hasOption("n")) {
      kvLimit = Integer.parseInt(cmd.getOptionValue("n"));
    }

    // basic argument sanity checks
    if (!cmd.hasOption("f")) {
      System.err.println("ERROR: Filename is required!");
      printUsage(options);
      System.exit(-1);
    }

    String pathName = cmd.getOptionValue("f");
    String compressionName = DEFAULT_COMPRESSION.getName();
    if (cmd.hasOption("a")) {
      compressionName = cmd.getOptionValue("a").toLowerCase();
    }
    boolean doBenchmark = cmd.hasOption("b");
    boolean doVerify = !cmd.hasOption("c");

    final Configuration conf = HBaseConfiguration.create();
    try {
      testCodecs(conf, kvLimit, pathName, compressionName, doBenchmark,
          doVerify);
    } finally {
      (new CacheConfig(conf)).getBlockCache().shutdown();
    }
  }

}