/**
* Copyright 2010 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.regionserver;
import java.io.DataInput;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Map;
import java.util.SortedSet;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HDFSBlocksDistribution;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.KeyValue.KVComparator;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.HalfStoreFileReader;
import org.apache.hadoop.hbase.io.Reference;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.hadoop.hbase.io.hfile.Compression;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.io.hfile.HFileScanner;
import org.apache.hadoop.hbase.io.hfile.HFileWriterV1;
import org.apache.hadoop.hbase.io.hfile.HFileWriterV2;
import org.apache.hadoop.hbase.util.BloomFilter;
import org.apache.hadoop.hbase.util.BloomFilterFactory;
import org.apache.hadoop.hbase.util.BloomFilterWriter;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.Writables;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.WritableUtils;
import com.google.common.base.Function;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Ordering;
/**
* A Store data file. Stores usually have one or more of these files. They
* are produced by flushing the memstore to disk. To
* create, call {@link #createWriter(FileSystem, Path, int, Configuration, CacheConfig)}
* and append data. Be sure to add any metadata before calling close on the
* Writer (use the appendMetadata convenience methods). On close, a StoreFile
* is sitting in the Filesystem. To refer to it, create a StoreFile instance
* passing filesystem and path. To read, call {@link #createReader()}.
* <p>StoreFiles may also reference store files in another Store.
*
* The reason for this weird pattern, where you use a different instance for
* the writer and the reader, is that we write once but read a lot more.
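*
* <p>A minimal write-then-read sketch; the variables used here (fs, familyDir,
* conf, cacheConf, kv, maxSeqId) are hypothetical:
* <pre>
*   StoreFile.Writer w =
*     StoreFile.createWriter(fs, familyDir, 8 * 1024, conf, cacheConf);
*   w.append(kv);                       // KeyValues must arrive in sorted order
*   w.appendMetadata(maxSeqId, false);  // add metadata before close
*   w.close();
*   StoreFile sf = new StoreFile(fs, w.getPath(), conf, cacheConf,
*     BloomType.NONE);
*   StoreFile.Reader r = sf.createReader();
* </pre>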
*/
public class StoreFile {
static final Log LOG = LogFactory.getLog(StoreFile.class.getName());
public static enum BloomType {
/**
* Bloomfilters disabled
*/
NONE,
/**
* Bloom enabled with Table row as Key
*/
ROW,
/**
* Bloom enabled with Table row & column (family+qualifier) as Key
*/
ROWCOL
}
// Keys for fileinfo values in HFile
/** Max Sequence ID in FileInfo */
public static final byte [] MAX_SEQ_ID_KEY = Bytes.toBytes("MAX_SEQ_ID_KEY");
/** Major compaction flag in FileInfo */
public static final byte[] MAJOR_COMPACTION_KEY =
Bytes.toBytes("MAJOR_COMPACTION_KEY");
/** Bloom filter Type in FileInfo */
static final byte[] BLOOM_FILTER_TYPE_KEY =
Bytes.toBytes("BLOOM_FILTER_TYPE");
/** Last Bloom filter key in FileInfo */
private static final byte[] LAST_BLOOM_KEY = Bytes.toBytes("LAST_BLOOM_KEY");
/** Key for Timerange information in metadata*/
public static final byte[] TIMERANGE_KEY = Bytes.toBytes("TIMERANGE");
// Make default block size for StoreFiles 8k while testing. TODO: FIX!
public static final int DEFAULT_BLOCKSIZE_SMALL = 8 * 1024;
private final FileSystem fs;
// This file's path.
private final Path path;
// If this storefile references another, this is the reference instance.
private Reference reference;
// If this StoreFile references another, this is the other file's path.
private Path referencePath;
// Block cache configuration and reference.
private final CacheConfig cacheConf;
// HDFS blocks distribution information
private HDFSBlocksDistribution hdfsBlocksDistribution;
// Keys for metadata stored in backing HFile.
// Set when we obtain a Reader.
private long sequenceid = -1;
// Max of the MemstoreTS in the KVs in this store file.
// Set when we obtain a Reader.
private long maxMemstoreTS = -1;
public long getMaxMemstoreTS() {
return maxMemstoreTS;
}
public void setMaxMemstoreTS(long maxMemstoreTS) {
this.maxMemstoreTS = maxMemstoreTS;
}
// If true, this file was a product of a major compaction. It's set
// whenever you get a Reader.
private AtomicBoolean majorCompaction = null;
/** Meta key set when store file is a result of a bulk load */
public static final byte[] BULKLOAD_TASK_KEY =
Bytes.toBytes("BULKLOAD_SOURCE_TASK");
public static final byte[] BULKLOAD_TIME_KEY =
Bytes.toBytes("BULKLOAD_TIMESTAMP");
/**
* Map of the metadata entries in the corresponding HFile
*/
private Map<byte[], byte[]> metadataMap;
/*
* Regex that will work for straight filenames and for reference names.
* If this is a reference, then the regex has more than just one group.
* Group 1 is this file's id. Group 2 is the referenced region name, etc.
*/
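// Example (hypothetical names): "a23bc5f1" matches with group(1) only, while
// the reference name "a23bc5f1.region1" also fills group(2) with "region1".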
private static final Pattern REF_NAME_PARSER =
Pattern.compile("^([0-9a-f]+)(?:\\.(.+))?$");
// StoreFile.Reader
private volatile Reader reader;
/**
* Bloom filter type specified in column family configuration. Does not
* necessarily correspond to the Bloom filter type present in the HFile.
*/
private final BloomType cfBloomType;
// the last modification time stamp
private long modificationTimeStamp = 0L;
/**
* Constructor, loads a reader and its indices, etc. May allocate a
* substantial amount of ram depending on the underlying files (10-20MB?).
*
* @param fs The current file system to use.
* @param p The path of the file.
* @param conf The current configuration.
* @param cacheConf The cache configuration and block cache reference.
* @param cfBloomType The bloom type to use for this store file as specified
* by column family configuration. This may or may not be the same
* as the Bloom filter type actually present in the HFile, because
* column family configuration might change. If this is
* {@link BloomType#NONE}, the existing Bloom filter is ignored.
* @throws IOException When opening the reader fails.
*/
StoreFile(final FileSystem fs,
final Path p,
final Configuration conf,
final CacheConfig cacheConf,
final BloomType cfBloomType)
throws IOException {
this.fs = fs;
this.path = p;
this.cacheConf = cacheConf;
if (isReference(p)) {
this.reference = Reference.read(fs, p);
this.referencePath = getReferredToFile(this.path);
}
if (BloomFilterFactory.isBloomEnabled(conf)) {
this.cfBloomType = cfBloomType;
} else {
LOG.info("Ignoring bloom filter check for file " + path + ": " +
"cfBloomType=" + cfBloomType + " (disabled in config)");
this.cfBloomType = BloomType.NONE;
}
// cache the modification time stamp of this store file
FileStatus[] stats = FSUtils.listStatus(fs, p, null);
if (stats != null && stats.length == 1) {
this.modificationTimeStamp = stats[0].getModificationTime();
} else {
this.modificationTimeStamp = 0;
}
}
/**
* @return Path or null if this StoreFile was made with a Stream.
*/
Path getPath() {
return this.path;
}
/**
* @return The Store/ColumnFamily this file belongs to.
*/
byte [] getFamily() {
return Bytes.toBytes(this.path.getParent().getName());
}
/**
* @return True if this is a StoreFile Reference. The reference, if present,
* is read in the constructor, so this may be called at any time.
*/
boolean isReference() {
return this.reference != null;
}
/**
* @param p Path to check.
* @return True if the path has the format of an HStoreFile reference.
*/
public static boolean isReference(final Path p) {
return !p.getName().startsWith("_") &&
isReference(p, REF_NAME_PARSER.matcher(p.getName()));
}
/**
* @param p Path to check.
* @param m Matcher to use.
* @return True if the path has the format of an HStoreFile reference.
*/
public static boolean isReference(final Path p, final Matcher m) {
if (m == null || !m.matches()) {
LOG.warn("Failed match of store file name " + p.toString());
throw new RuntimeException("Failed match of store file name " +
p.toString());
}
return m.groupCount() > 1 && m.group(2) != null;
}
/*
* Return path to the file referred to by a Reference. Presumes a directory
* hierarchy of <code>${hbase.rootdir}/tablename/regionname/familyname</code>.
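* Example (hypothetical names): a reference at
* <code>${hbase.rootdir}/table/regionB/family/a23bc5f1.regionA</code>
* resolves to <code>${hbase.rootdir}/table/regionA/family/a23bc5f1</code>.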
* @param p Path to a Reference file.
* @return Calculated path to the referred-to file in the parent region.
*/
static Path getReferredToFile(final Path p) {
Matcher m = REF_NAME_PARSER.matcher(p.getName());
if (m == null || !m.matches()) {
LOG.warn("Failed match of store file name " + p.toString());
throw new RuntimeException("Failed match of store file name " +
p.toString());
}
// Other region name is suffix on the passed Reference file name
String otherRegion = m.group(2);
// Tabledir is up two directories from where Reference was written.
Path tableDir = p.getParent().getParent().getParent();
String nameStrippedOfSuffix = m.group(1);
// Build up new path with the referenced region in place of our current
// region in the reference path. Also strip regionname suffix from name.
return new Path(new Path(new Path(tableDir, otherRegion),
p.getParent().getName()), nameStrippedOfSuffix);
}
/**
* @return True if this file was made by a major compaction.
*/
boolean isMajorCompaction() {
if (this.majorCompaction == null) {
throw new NullPointerException("This has not been set yet");
}
return this.majorCompaction.get();
}
/**
* @return This file's maximum edit sequence id.
*/
public long getMaxSequenceId() {
return this.sequenceid;
}
public long getModificationTimeStamp() {
return modificationTimeStamp;
}
/**
* Return the largest memstoreTS found across all storefiles in
* the given list. Store files that were created by a mapreduce
* bulk load are ignored, as they do not correspond to any specific
* put operation, and thus do not have a memstoreTS associated with them.
* @return 0 if no non-bulk-load files are provided or this is a Store that
* does not yet have any store files.
*/
public static long getMaxMemstoreTSInList(Collection<StoreFile> sfs) {
long max = 0;
for (StoreFile sf : sfs) {
if (!sf.isBulkLoadResult()) {
max = Math.max(max, sf.getMaxMemstoreTS());
}
}
return max;
}
/**
* Return the highest sequence ID found across all storefiles in
* the given list. Store files that were created by a mapreduce
* bulk load are ignored, as they do not correspond to any edit
* log items.
* @return 0 if no non-bulk-load files are provided or this is a Store that
* does not yet have any store files.
*/
public static long getMaxSequenceIdInList(Collection<StoreFile> sfs) {
long max = 0;
for (StoreFile sf : sfs) {
if (!sf.isBulkLoadResult()) {
max = Math.max(max, sf.getMaxSequenceId());
}
}
return max;
}
/**
* @return true if this storefile was created by HFileOutputFormat
* for a bulk load.
*/
boolean isBulkLoadResult() {
return metadataMap.containsKey(BULKLOAD_TIME_KEY);
}
/**
* Return the timestamp at which this bulk load file was generated.
*/
public long getBulkLoadTimestamp() {
return Bytes.toLong(metadataMap.get(BULKLOAD_TIME_KEY));
}
/**
* @return the cached value of HDFS blocks distribution. The cached value is
* calculated when store file is opened.
*/
public HDFSBlocksDistribution getHDFSBlockDistribution() {
return this.hdfsBlocksDistribution;
}
/**
* Helper function to compute the HDFS blocks distribution of a given
* reference file. For reference files we don't compute the exact value;
* we use an estimate instead, given it might be good enough. We assume the
* bottom part takes the first half of the reference file and the top part
* takes the second half. This is just an estimate, given that the midkey of
* the region != the midkey of the HFile, and the number and size of keys
* vary. If this estimate isn't good enough, we can improve it later.
* @param fs The FileSystem
* @param reference The reference
* @param referencePath The path of the referenced file
* @return HDFS blocks distribution
*/
static private HDFSBlocksDistribution computeRefFileHDFSBlockDistribution(
FileSystem fs, Reference reference, Path referencePath) throws IOException {
if (referencePath == null) {
return null;
}
FileStatus status = fs.getFileStatus(referencePath);
long start = 0;
long length = 0;
if (Reference.isTopFileRegion(reference.getFileRegion())) {
start = status.getLen()/2;
length = status.getLen() - status.getLen()/2;
} else {
start = 0;
length = status.getLen()/2;
}
return FSUtils.computeHDFSBlocksDistribution(fs, status, start, length);
}
/**
* Helper function to compute the HDFS blocks distribution of a given file.
* For a reference file, the result is an estimate.
* @param fs The FileSystem
* @param p The path of the file
* @return HDFS blocks distribution
*/
static public HDFSBlocksDistribution computeHDFSBlockDistribution(
FileSystem fs, Path p) throws IOException {
if (isReference(p)) {
Reference reference = Reference.read(fs, p);
Path referencePath = getReferredToFile(p);
return computeRefFileHDFSBlockDistribution(fs, reference, referencePath);
} else {
FileStatus status = fs.getFileStatus(p);
long length = status.getLen();
return FSUtils.computeHDFSBlocksDistribution(fs, status, 0, length);
}
}
/**
* Compute the HDFS blocks distribution. For a reference file, the result is an estimate.
*/
private void computeHDFSBlockDistribution() throws IOException {
if (isReference()) {
this.hdfsBlocksDistribution = computeRefFileHDFSBlockDistribution(
this.fs, this.reference, this.referencePath);
} else {
FileStatus status = this.fs.getFileStatus(this.path);
long length = status.getLen();
this.hdfsBlocksDistribution = FSUtils.computeHDFSBlocksDistribution(
this.fs, status, 0, length);
}
}
/**
* Opens reader on this store file. Called by {@link #createReader()}.
* @return Reader for the store file.
* @throws IOException
* @see #closeReader(boolean)
*/
private Reader open() throws IOException {
if (this.reader != null) {
throw new IllegalAccessError("Already open");
}
if (isReference()) {
this.reader = new HalfStoreFileReader(this.fs, this.referencePath,
this.cacheConf, this.reference);
} else {
this.reader = new Reader(this.fs, this.path, this.cacheConf);
}
computeHDFSBlockDistribution();
// Load up indices and fileinfo. This also loads Bloom filter type.
metadataMap = Collections.unmodifiableMap(this.reader.loadFileInfo());
// Read in our metadata.
byte [] b = metadataMap.get(MAX_SEQ_ID_KEY);
if (b != null) {
// By convention, if halfhfile, the top half has a sequence number > the
// bottom half. That's why we add one below. It's done in case the two
// halves are ever merged back together --rare. Without it, on open of the
// store, since store files are distinguished by sequence id, one half
// would subsume the other.
this.sequenceid = Bytes.toLong(b);
if (isReference()) {
if (Reference.isTopFileRegion(this.reference.getFileRegion())) {
this.sequenceid += 1;
}
}
}
this.reader.setSequenceID(this.sequenceid);
b = metadataMap.get(HFileWriterV2.MAX_MEMSTORE_TS_KEY);
if (b != null) {
this.maxMemstoreTS = Bytes.toLong(b);
}
b = metadataMap.get(MAJOR_COMPACTION_KEY);
if (b != null) {
boolean mc = Bytes.toBoolean(b);
if (this.majorCompaction == null) {
this.majorCompaction = new AtomicBoolean(mc);
} else {
this.majorCompaction.set(mc);
}
} else {
// Presume it is not major compacted if it doesn't explicitly say so.
// HFileOutputFormat explicitly sets the major compacted key.
this.majorCompaction = new AtomicBoolean(false);
}
BloomType hfileBloomType = reader.getBloomFilterType();
if (cfBloomType != BloomType.NONE) {
reader.loadBloomfilter();
if (hfileBloomType != cfBloomType) {
LOG.info("HFile Bloom filter type for "
+ reader.getHFileReader().getName() + ": " + hfileBloomType
+ ", but " + cfBloomType + " specified in column family "
+ "configuration");
}
} else if (hfileBloomType != BloomType.NONE) {
LOG.info("Bloom filter turned off by CF config for "
+ reader.getHFileReader().getName());
}
try {
byte [] timerangeBytes = metadataMap.get(TIMERANGE_KEY);
if (timerangeBytes != null) {
this.reader.timeRangeTracker = new TimeRangeTracker();
Writables.copyWritable(timerangeBytes, this.reader.timeRangeTracker);
}
} catch (IllegalArgumentException e) {
LOG.error("Error reading timestamp range data from meta -- " +
"proceeding without", e);
this.reader.timeRangeTracker = null;
}
return this.reader;
}
/**
* @return Reader for StoreFile. Creates one if necessary.
* @throws IOException
*/
public Reader createReader() throws IOException {
if (this.reader == null) {
this.reader = open();
}
return this.reader;
}
/**
* @return Current reader, or null if {@link #createReader()} has not been called first.
* @see #createReader()
*/
public Reader getReader() {
return this.reader;
}
/**
* Close the open reader, if any.
* @param evictOnClose whether to evict this file's blocks from the block cache on close
* @throws IOException
*/
public synchronized void closeReader(boolean evictOnClose)
throws IOException {
if (this.reader != null) {
this.reader.close(evictOnClose);
this.reader = null;
}
}
/**
* Delete this file, closing the reader first if it is open.
* @throws IOException
*/
public void deleteReader() throws IOException {
closeReader(true);
this.fs.delete(getPath(), true);
}
@Override
public String toString() {
return this.path.toString() +
(isReference()? "-" + this.referencePath + "-" + reference.toString(): "");
}
/**
* @return a lengthy description of this StoreFile, suitable for debug output
*/
public String toStringDetailed() {
StringBuilder sb = new StringBuilder();
sb.append(this.path.toString());
sb.append(", isReference=").append(isReference());
sb.append(", isBulkLoadResult=").append(isBulkLoadResult());
if (isBulkLoadResult()) {
sb.append(", bulkLoadTS=").append(getBulkLoadTimestamp());
} else {
sb.append(", seqid=").append(getMaxSequenceId());
}
sb.append(", majorCompaction=").append(isMajorCompaction());
return sb.toString();
}
/**
* Utility to help with rename.
* @param fs
* @param src
* @param tgt
* @return The target path <code>tgt</code>.
* @throws IOException if the source doesn't exist or the rename fails
*/
public static Path rename(final FileSystem fs,
final Path src,
final Path tgt)
throws IOException {
if (!fs.exists(src)) {
throw new FileNotFoundException(src.toString());
}
if (!fs.rename(src, tgt)) {
throw new IOException("Failed rename of " + src + " to " + tgt);
}
return tgt;
}
/**
* Get a store file writer. Client is responsible for closing file when done.
*
* @param fs
* @param dir Path to family directory. Makes the directory if it doesn't exist.
* Creates a file with a unique name in this directory.
* @param blocksize size per filesystem block
* @return StoreFile.Writer
* @throws IOException
*/
public static Writer createWriter(final FileSystem fs, final Path dir,
final int blocksize, Configuration conf, CacheConfig cacheConf)
throws IOException {
return createWriter(fs, dir, blocksize, null, null, conf, cacheConf,
BloomType.NONE, 0);
}
/**
* Create a store file writer. Client is responsible for closing the file
* when done. If you have metadata, add it BEFORE closing, using appendMetadata().
* @param fs
* @param dir Path to family directory. Makes the directory if it doesn't
* exist. Creates a file with a unique name in this directory.
* @param blocksize size per filesystem block
* @param algorithm Compression algorithm. Pass null to get default.
* @param c Comparator. Pass null to get default.
* @param conf HBase system configuration. Used with bloom filters.
* @param cacheConf Cache configuration and reference.
* @param bloomType column family setting for bloom filters
* @param maxKeyCount estimated maximum number of keys we expect to add
* @return StoreFile.Writer
* @throws IOException
*/
public static StoreFile.Writer createWriter(final FileSystem fs,
final Path dir,
final int blocksize,
final Compression.Algorithm algorithm,
final KeyValue.KVComparator c,
final Configuration conf,
final CacheConfig cacheConf,
BloomType bloomType,
long maxKeyCount)
throws IOException {
if (!fs.exists(dir)) {
fs.mkdirs(dir);
}
Path path = getUniqueFile(fs, dir);
if (!BloomFilterFactory.isBloomEnabled(conf)) {
bloomType = BloomType.NONE;
}
return new Writer(fs, path, blocksize,
algorithm == null? HFile.DEFAULT_COMPRESSION_ALGORITHM: algorithm,
conf, cacheConf, c == null ? KeyValue.COMPARATOR: c, bloomType,
maxKeyCount);
}
/**
* @param fs
* @param dir Directory to create file in.
* @return random filename inside passed <code>dir</code>
*/
public static Path getUniqueFile(final FileSystem fs, final Path dir)
throws IOException {
if (!fs.getFileStatus(dir).isDir()) {
throw new IOException("Expecting " + dir.toString() +
" to be a directory");
}
return getRandomFilename(fs, dir);
}
/**
*
* @param fs
* @param dir
* @return Path to a file that doesn't exist at time of this invocation.
* @throws IOException
*/
static Path getRandomFilename(final FileSystem fs, final Path dir)
throws IOException {
return getRandomFilename(fs, dir, null);
}
/**
*
* @param fs
* @param dir
* @param suffix
* @return Path to a file that doesn't exist at time of this invocation.
* @throws IOException
*/
static Path getRandomFilename(final FileSystem fs,
final Path dir,
final String suffix)
throws IOException {
return new Path(dir, UUID.randomUUID().toString().replaceAll("-", "")
+ (suffix == null ? "" : suffix));
}
/**
* Write out a split reference.
*
* Package local so it doesn't leak out of regionserver.
*
* @param fs
* @param splitDir Presumes path format is actually
* <code>SOME_DIRECTORY/REGIONNAME/FAMILY</code>.
* @param f File to split.
* @param splitRow
* @param range
* @return Path to created reference.
* @throws IOException
*/
static Path split(final FileSystem fs,
final Path splitDir,
final StoreFile f,
final byte [] splitRow,
final Reference.Range range)
throws IOException {
// A reference to the top or bottom half of the store file, per the passed range.
Reference r = new Reference(splitRow, range);
// Add the referred-to region's name as a dot separated suffix.
// See REF_NAME_PARSER regex above. The referred-to region's name is
// up in the path of the passed in <code>f</code> -- parentdir is family,
// then the directory above is the region name.
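// Example (hypothetical names): splitting store file "a23bc5f1" of region
// "regionA" writes a reference file "a23bc5f1.regionA" under splitDir.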
String parentRegionName = f.getPath().getParent().getParent().getName();
// Write reference with same file id only with the other region name as
// suffix and into the new region location (under same family).
Path p = new Path(splitDir, f.getPath().getName() + "." + parentRegionName);
return r.write(fs, p);
}
/**
* A StoreFile writer. Use this to write HBase Store Files. It is package
* local because it is an implementation detail of the HBase regionserver.
*/
public static class Writer {
private final BloomFilterWriter bloomFilterWriter;
private final BloomType bloomType;
private byte[] lastBloomKey;
private int lastBloomKeyOffset, lastBloomKeyLen;
private KVComparator kvComparator;
private KeyValue lastKv = null;
TimeRangeTracker timeRangeTracker = new TimeRangeTracker();
/* isTimeRangeTrackerSet keeps track of whether the timeRange has already
* been set. When flushing a memstore, we set the TimeRange and use this
* variable to indicate that it doesn't need to be recalculated while
* appending KeyValues.
* It is not set during compactions, when it is recalculated using only
* the appended KeyValues. */
boolean isTimeRangeTrackerSet = false;
protected HFile.Writer writer;
/**
* Creates an HFile.Writer that also writes helpful meta data.
* @param fs file system to write to
* @param path file name to create
* @param blocksize HDFS block size
* @param compress HDFS block compression
* @param conf user configuration
* @param cacheConf the cache configuration and block cache reference
* @param comparator key comparator
* @param bloomType bloom filter setting
* @param maxKeys the expected maximum number of keys to be added. Was used
* for Bloom filter size in {@link HFile} format version 1.
* @throws IOException problem writing to FS
*/
public Writer(FileSystem fs, Path path, int blocksize,
Compression.Algorithm compress, final Configuration conf,
CacheConfig cacheConf,
final KVComparator comparator, BloomType bloomType, long maxKeys)
throws IOException {
writer = HFile.getWriterFactory(conf, cacheConf).createWriter(
fs, path, blocksize,
compress, comparator.getRawComparator());
this.kvComparator = comparator;
bloomFilterWriter = BloomFilterFactory.createBloomAtWrite(conf, cacheConf,
bloomType, (int) Math.min(maxKeys, Integer.MAX_VALUE), writer);
if (bloomFilterWriter != null) {
this.bloomType = bloomType;
LOG.info("Bloom filter type for " + path + ": " + this.bloomType +
", "+ bloomFilterWriter.getClass().getSimpleName());
} else {
// Not using Bloom filters.
this.bloomType = BloomType.NONE;
}
}
/**
* Writes meta data.
* Call before {@link #close()} since it's written as meta data to this file.
* @param maxSequenceId Maximum sequence id.
* @param majorCompaction True if this file is product of a major compaction
* @throws IOException problem writing to FS
*/
public void appendMetadata(final long maxSequenceId, final boolean majorCompaction)
throws IOException {
writer.appendFileInfo(MAX_SEQ_ID_KEY, Bytes.toBytes(maxSequenceId));
writer.appendFileInfo(MAJOR_COMPACTION_KEY,
Bytes.toBytes(majorCompaction));
appendTimeRangeMetadata();
}
/**
* Add TimestampRange to Metadata
*/
public void appendTimeRangeMetadata() throws IOException {
appendFileInfo(TIMERANGE_KEY,WritableUtils.toByteArray(timeRangeTracker));
}
/**
* Set TimeRangeTracker
* @param trt
*/
public void setTimeRangeTracker(final TimeRangeTracker trt) {
this.timeRangeTracker = trt;
isTimeRangeTrackerSet = true;
}
/**
* If the timeRangeTracker is not set,
* update TimeRangeTracker to include the timestamp of this key
* @param kv
*/
public void includeInTimeRangeTracker(final KeyValue kv) {
if (!isTimeRangeTrackerSet) {
timeRangeTracker.includeTimestamp(kv);
}
}
/**
* If the timeRangeTracker is not set,
* update TimeRangeTracker to include the timestamp of this key
* @param key
* @throws IOException
*/
public void includeInTimeRangeTracker(final byte [] key) {
if (!isTimeRangeTrackerSet) {
timeRangeTracker.includeTimestamp(key);
}
}
public void append(final KeyValue kv) throws IOException {
if (this.bloomFilterWriter != null) {
// only add to the bloom filter on a new, unique key
boolean newKey = true;
if (this.lastKv != null) {
switch(bloomType) {
case ROW:
newKey = ! kvComparator.matchingRows(kv, lastKv);
break;
case ROWCOL:
newKey = ! kvComparator.matchingRowColumn(kv, lastKv);
break;
case NONE:
newKey = false;
break;
default:
throw new IOException("Invalid Bloom filter type: " + bloomType);
}
}
if (newKey) {
/*
* http://2.bp.blogspot.com/_Cib_A77V54U/StZMrzaKufI/AAAAAAAAADo/ZhK7bGoJdMQ/s400/KeyValue.png
* Key = RowLen + Row + FamilyLen + Column [Family + Qualifier] + TimeStamp
*
* 2 Types of Filtering:
* 1. Row = Row
* 2. RowCol = Row + Qualifier
*/
byte[] bloomKey;
int bloomKeyOffset, bloomKeyLen;
switch (bloomType) {
case ROW:
bloomKey = kv.getBuffer();
bloomKeyOffset = kv.getRowOffset();
bloomKeyLen = kv.getRowLength();
break;
case ROWCOL:
// merge(row, qualifier)
// TODO: could save one buffer copy in case of compound Bloom
// filters when this involves creating a KeyValue
bloomKey = bloomFilterWriter.createBloomKey(kv.getBuffer(),
kv.getRowOffset(), kv.getRowLength(), kv.getBuffer(),
kv.getQualifierOffset(), kv.getQualifierLength());
bloomKeyOffset = 0;
bloomKeyLen = bloomKey.length;
break;
default:
throw new IOException("Invalid Bloom filter type: " + bloomType +
" (ROW or ROWCOL expected)");
}
bloomFilterWriter.add(bloomKey, bloomKeyOffset, bloomKeyLen);
if (lastBloomKey != null
&& bloomFilterWriter.getComparator().compare(bloomKey,
bloomKeyOffset, bloomKeyLen, lastBloomKey,
lastBloomKeyOffset, lastBloomKeyLen) <= 0) {
throw new IOException("Non-increasing Bloom keys: "
+ Bytes.toStringBinary(bloomKey, bloomKeyOffset, bloomKeyLen)
+ " after "
+ Bytes.toStringBinary(lastBloomKey, lastBloomKeyOffset,
lastBloomKeyLen));
}
lastBloomKey = bloomKey;
lastBloomKeyOffset = bloomKeyOffset;
lastBloomKeyLen = bloomKeyLen;
this.lastKv = kv;
}
}
writer.append(kv);
includeInTimeRangeTracker(kv);
}
public Path getPath() {
return this.writer.getPath();
}
boolean hasBloom() {
return this.bloomFilterWriter != null;
}
/**
* For unit testing only.
* @return the Bloom filter used by this writer.
*/
BloomFilterWriter getBloomWriter() {
return bloomFilterWriter;
}
public void close() throws IOException {
// Make sure we wrote something to the Bloom filter before adding it.
boolean haveBloom = bloomFilterWriter != null &&
bloomFilterWriter.getKeyCount() > 0;
if (haveBloom) {
bloomFilterWriter.compactBloom();
writer.addBloomFilter(bloomFilterWriter);
writer.appendFileInfo(BLOOM_FILTER_TYPE_KEY,
Bytes.toBytes(bloomType.toString()));
if (lastBloomKey != null) {
writer.appendFileInfo(LAST_BLOOM_KEY, Arrays.copyOfRange(
lastBloomKey, lastBloomKeyOffset, lastBloomKeyOffset
+ lastBloomKeyLen));
}
}
writer.close();
// Log final Bloom filter statistics. This needs to be done after close()
// because compound Bloom filters might be finalized as part of closing.
if (haveBloom && bloomFilterWriter.getMaxKeys() > 0) {
StoreFile.LOG.info("Bloom added to HFile ("
+ getPath() + "): " +
bloomFilterWriter.toString().replace("\n", "; "));
}
}
public void appendFileInfo(byte[] key, byte[] value) throws IOException {
writer.appendFileInfo(key, value);
}
}
/**
* Reader for a StoreFile.
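*
* <p>A minimal scan sketch; <code>sf</code> is a hypothetical StoreFile:
* <pre>
*   StoreFile.Reader r = sf.createReader();
*   StoreFileScanner scanner = r.getStoreFileScanner(true, false);
* </pre>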
*/
public static class Reader {
static final Log LOG = LogFactory.getLog(Reader.class.getName());
protected BloomFilter bloomFilter = null;
protected BloomType bloomFilterType;
private final HFile.Reader reader;
protected TimeRangeTracker timeRangeTracker = null;
protected long sequenceID = -1;
private byte[] lastBloomKey;
public Reader(FileSystem fs, Path path, CacheConfig cacheConf)
throws IOException {
reader = HFile.createReader(fs, path, cacheConf);
bloomFilterType = BloomType.NONE;
}
/**
* ONLY USE DEFAULT CONSTRUCTOR FOR UNIT TESTS
*/
Reader() {
this.reader = null;
}
public RawComparator<byte []> getComparator() {
return reader.getComparator();
}
/**
* Get a scanner to scan over this StoreFile.
*
* @param cacheBlocks should this scanner cache blocks?
* @param pread use pread (for highly concurrent small readers)
* @return a scanner
*/
public StoreFileScanner getStoreFileScanner(boolean cacheBlocks, boolean pread) {
return getStoreFileScanner(cacheBlocks, pread, false);
}
/**
* Get a scanner to scan over this StoreFile.
*
* @param cacheBlocks should this scanner cache blocks?
* @param pread use pread (for highly concurrent small readers)
* @param isCompaction is scanner being used for compaction?
* @return a scanner
*/
public StoreFileScanner getStoreFileScanner(boolean cacheBlocks,
boolean pread, boolean isCompaction) {
return new StoreFileScanner(this, getScanner(cacheBlocks, pread,
isCompaction), !isCompaction);
}
/**
* Warning: Do not write further code that depends on this call. Instead
* use getStoreFileScanner(), which returns a StoreFileScanner: the
* preferred way to scan a store with higher-level concepts.
*
* @param cacheBlocks should we cache the blocks?
* @param pread use pread (for concurrent small readers)
* @return the underlying HFileScanner
*/
@Deprecated
public HFileScanner getScanner(boolean cacheBlocks, boolean pread) {
return getScanner(cacheBlocks, pread, false);
}
/**
* Warning: Do not write further code that depends on this call. Instead
* use getStoreFileScanner(), which returns a StoreFileScanner: the
* preferred way to scan a store with higher-level concepts.
*
* @param cacheBlocks
* should we cache the blocks?
* @param pread
* use pread (for concurrent small readers)
* @param isCompaction
* is scanner being used for compaction?
* @return the underlying HFileScanner
*/
@Deprecated
public HFileScanner getScanner(boolean cacheBlocks, boolean pread,
boolean isCompaction) {
return reader.getScanner(cacheBlocks, pread, isCompaction);
}
public void close(boolean evictOnClose) throws IOException {
reader.close(evictOnClose);
}
public boolean shouldSeek(Scan scan, final SortedSet<byte[]> columns) {
return (passesTimerangeFilter(scan) && passesBloomFilter(scan, columns));
}
/**
* Check if this storeFile may contain keys within the TimeRange of the given scan.
* @param scan
* @return False if it definitely does not exist in this StoreFile
*/
private boolean passesTimerangeFilter(Scan scan) {
if (timeRangeTracker == null) {
return true;
} else {
return timeRangeTracker.includesTimeRange(scan.getTimeRange());
}
}
/**
* Checks whether the given scan passes the Bloom filter (if present). Only
* checks Bloom filters for single-row or single-row-column scans. Bloom
* filter checking for multi-gets is implemented as part of the store
* scanner system (see {@link StoreFileScanner#seekExactly}) and uses
* the lower-level API {@link #passesBloomFilter(byte[], int, int, byte[],
* int, int)}.
*
* @param scan the scan specification. Used to determine the row, and to
* check whether this is a single-row ("get") scan.
* @param columns the set of columns. Only used for row-column Bloom
* filters.
* @return true if the scan with the given column set passes the Bloom
* filter, or if the Bloom filter is not applicable for the scan.
* False if the Bloom filter is applicable and the scan fails it.
*/
private boolean passesBloomFilter(Scan scan,
final SortedSet<byte[]> columns) {
// Multi-column non-get scans will use Bloom filters through the
// lower-level API function that this function calls.
if (!scan.isGetScan()) {
return true;
}
byte[] row = scan.getStartRow();
switch (this.bloomFilterType) {
case ROW:
return passesBloomFilter(row, 0, row.length, null, 0, 0);
case ROWCOL:
if (columns != null && columns.size() == 1) {
byte[] column = columns.first();
return passesBloomFilter(row, 0, row.length, column, 0,
column.length);
}
// For multi-column queries the Bloom filter is checked from the
// seekExact operation.
return true;
default:
return true;
}
}
/**
* A method for checking Bloom filters. Called directly from
* StoreFileScanner in case of a multi-column query.
*
* @param row
* @param rowOffset
* @param rowLen
* @param col
* @param colOffset
* @param colLen
* @return True if passes
*/
public boolean passesBloomFilter(byte[] row, int rowOffset, int rowLen,
byte[] col, int colOffset, int colLen) {
if (bloomFilter == null)
return true;
byte[] key;
switch (bloomFilterType) {
case ROW:
if (col != null) {
throw new RuntimeException("Row-only Bloom filter called with " +
"column specified");
}
if (rowOffset != 0 || rowLen != row.length) {
throw new AssertionError("For row-only Bloom filters the row "
+ "must occupy the whole array");
}
key = row;
break;
case ROWCOL:
key = bloomFilter.createBloomKey(row, rowOffset, rowLen, col,
colOffset, colLen);
break;
default:
return true;
}
// Cache Bloom filter as a local variable in case it is set to null by
// another thread on an IO error.
BloomFilter bloomFilter = this.bloomFilter;
if (bloomFilter == null) {
return true;
}
// Empty file?
if (reader.getTrailer().getEntryCount() == 0)
return false;
try {
boolean shouldCheckBloom;
ByteBuffer bloom;
if (bloomFilter.supportsAutoLoading()) {
bloom = null;
shouldCheckBloom = true;
} else {
bloom = reader.getMetaBlock(HFileWriterV1.BLOOM_FILTER_DATA_KEY,
true);
shouldCheckBloom = bloom != null;
}
if (shouldCheckBloom) {
boolean exists;
// Whether the primary Bloom key is greater than the last Bloom key
// from the file info. For row-column Bloom filters this is not yet
// a sufficient condition to return false.
boolean keyIsAfterLast = lastBloomKey != null
&& bloomFilter.getComparator().compare(key, lastBloomKey) > 0;
if (bloomFilterType == BloomType.ROWCOL) {
// Since a Row Delete is essentially a DeleteFamily applied to all
// columns, a file might be skipped when using a row+col Bloom filter.
// To ensure this file is included, an additional check looking only
// for the row bloom is required.
byte[] rowBloomKey = bloomFilter.createBloomKey(row, 0, row.length,
null, 0, 0);
if (keyIsAfterLast
&& bloomFilter.getComparator().compare(rowBloomKey,
lastBloomKey) > 0) {
exists = false;
} else {
exists =
this.bloomFilter.contains(key, 0, key.length, bloom) ||
this.bloomFilter.contains(rowBloomKey, 0, rowBloomKey.length,
bloom);
}
} else {
exists = !keyIsAfterLast
&& this.bloomFilter.contains(key, 0, key.length, bloom);
}
return exists;
}
} catch (IOException e) {
LOG.error("Error reading bloom filter data -- proceeding without",
e);
setBloomFilterFaulty();
} catch (IllegalArgumentException e) {
LOG.error("Bad bloom filter data -- proceeding without", e);
setBloomFilterFaulty();
}
return true;
}
public Map<byte[], byte[]> loadFileInfo() throws IOException {
Map<byte [], byte []> fi = reader.loadFileInfo();
byte[] b = fi.get(BLOOM_FILTER_TYPE_KEY);
if (b != null) {
bloomFilterType = BloomType.valueOf(Bytes.toString(b));
}
lastBloomKey = fi.get(LAST_BLOOM_KEY);
return fi;
}
public void loadBloomfilter() {
if (this.bloomFilter != null) {
return; // already loaded
}
try {
DataInput bloomMeta = reader.getBloomFilterMetadata();
if (bloomMeta != null) {
if (bloomFilterType == BloomType.NONE) {
throw new IOException(
"valid bloom filter type not found in FileInfo");
}
bloomFilter = BloomFilterFactory.createFromMeta(bloomMeta, reader);
LOG.info("Loaded " + bloomFilterType + " " +
bloomFilter.getClass().getSimpleName() + " metadata for " +
reader.getName());
}
} catch (IOException e) {
LOG.error("Error reading bloom filter meta -- proceeding without", e);
this.bloomFilter = null;
} catch (IllegalArgumentException e) {
LOG.error("Bad bloom filter meta -- proceeding without", e);
this.bloomFilter = null;
}
}
/**
* The number of Bloom filter entries in this store file, or an estimate
* thereof, if the Bloom filter is not loaded. This always returns an upper
* bound of the number of Bloom filter entries.
*
* @return an estimate of the number of Bloom filter entries in this file
*/
public long getFilterEntries() {
return bloomFilter != null ? bloomFilter.getKeyCount()
: reader.getEntries();
}
public void setBloomFilterFaulty() {
bloomFilter = null;
}
public byte[] getLastKey() {
return reader.getLastKey();
}
public byte[] midkey() throws IOException {
return reader.midkey();
}
public long length() {
return reader.length();
}
public long getTotalUncompressedBytes() {
return reader.getTrailer().getTotalUncompressedBytes();
}
public long getEntries() {
return reader.getEntries();
}
public byte[] getFirstKey() {
return reader.getFirstKey();
}
public long indexSize() {
return reader.indexSize();
}
public BloomType getBloomFilterType() {
return this.bloomFilterType;
}
public long getSequenceID() {
return sequenceID;
}
public void setSequenceID(long sequenceID) {
this.sequenceID = sequenceID;
}
BloomFilter getBloomFilter() {
return bloomFilter;
}
long getUncompressedDataIndexSize() {
return reader.getTrailer().getUncompressedDataIndexSize();
}
public long getTotalBloomSize() {
if (bloomFilter == null)
return 0;
return bloomFilter.getByteSize();
}
public int getHFileVersion() {
return reader.getTrailer().getVersion();
}
HFile.Reader getHFileReader() {
return reader;
}
void disableBloomFilterForTesting() {
bloomFilter = null;
}
}
/**
* Useful comparators for comparing StoreFiles.
*/
abstract static class Comparators {
/**
* Comparator that compares based on the flush time of
* the StoreFiles. All bulk loads are placed before all non-
* bulk loads, and then all files are sorted by sequence ID.
* If there are ties, the path name is used as a tie-breaker.
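*
* <p>A minimal usage sketch; <code>storeFiles</code> is a hypothetical
* <code>List&lt;StoreFile&gt;</code>:
* <pre>
*   Collections.sort(storeFiles, Comparators.FLUSH_TIME);
* </pre>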
*/
static final Comparator<StoreFile> FLUSH_TIME =
Ordering.compound(ImmutableList.of(
Ordering.natural().onResultOf(new GetBulkTime()),
Ordering.natural().onResultOf(new GetSeqId()),
Ordering.natural().onResultOf(new GetPathName())
));
private static class GetBulkTime implements Function<StoreFile, Long> {
@Override
public Long apply(StoreFile sf) {
if (!sf.isBulkLoadResult()) return Long.MAX_VALUE;
return sf.getBulkLoadTimestamp();
}
}
private static class GetSeqId implements Function<StoreFile, Long> {
@Override
public Long apply(StoreFile sf) {
if (sf.isBulkLoadResult()) return -1L;
return sf.getMaxSequenceId();
}
}
private static class GetPathName implements Function<StoreFile, String> {
@Override
public String apply(StoreFile sf) {
return sf.getPath().getName();
}
}
/**
* FILE_SIZE: sorts StoreFiles by size, descending (largest to smallest).
*/
static final Comparator<StoreFile> FILE_SIZE =
Ordering.natural().reverse().onResultOf(new Function<StoreFile, Long>() {
@Override
public Long apply(StoreFile sf) {
return sf.getReader().length();
}
});
}
}