package gobblin.compaction.verify;

import java.io.IOException;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.Path;

import com.google.common.collect.Lists;

import lombok.extern.slf4j.Slf4j;

import gobblin.compaction.conditions.RecompactionConditionBasedOnRatio;
import gobblin.compaction.mapreduce.MRCompactor;
import gobblin.compaction.parser.CompactionPathParser;
import gobblin.configuration.State;
import gobblin.dataset.FileSystemDataset;

/**
 * Compares the record counts of the source and destination avro files to determine
 * whether a compaction is needed.
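 *
 * <p>A minimal usage sketch (illustrative only; assumes the {@link State} carries the
 * per-dataset threshold configuration and the {@link FileSystemDataset} points at the
 * input folder):
 * <pre>{@code
 *   CompactionThresholdVerifier verifier = new CompactionThresholdVerifier(state);
 *   if (verifier.verify(dataset)) {
 *     // record growth exceeded the threshold (or first run): compaction is needed
 *   }
 * }</pre>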
*/
@Slf4j
public class CompactionThresholdVerifier implements CompactionVerifier<FileSystemDataset> {
  public static final String COMPACTION_VERIFIER_THRESHOLD = "compaction-verifier-threshold";

  private final State state;

  /**
   * Constructor
   */
  public CompactionThresholdVerifier(State state) {
    this.state = state;
  }

  /**
   * There are two record counts being compared here:
   * 1) the new record count in the input folder
   * 2) the record count produced by the previous compaction run
   * The relative difference between the two is compared against a predefined threshold.
   *
   * (Alternatively, we could save the previous record count to a state store. However, each
   * input folder is a dataset, and we might end up loading too much redundant job-level
   * state for each dataset. To avoid scalability issues, we choose a stateless approach
   * where each dataset tracks its own record count and persists it in the file system.)
   *
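   * <p>A worked example with illustrative numbers: if the previous run compacted 1,000
   * records, the input folder now holds 1,300, and the threshold is 0.2, the growth
   * ratio is (1300 - 1000) / 1000 = 0.3 &gt; 0.2, so the verifier returns true.
   *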
   * @return true iff the relative growth in record count exceeds the threshold, or this
   *         is the first time the dataset is compacted
   */
  public boolean verify(FileSystemDataset dataset) {
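    // Load the per-dataset (regex -> threshold) map from the job configuration.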
    Map<String, Double> thresholdMap = RecompactionConditionBasedOnRatio
        .getDatasetRegexAndRecompactThreshold(state.getProp(MRCompactor.COMPACTION_LATEDATA_THRESHOLD_FOR_RECOMPACT_PER_DATASET,
            StringUtils.EMPTY));
    CompactionPathParser.CompactionParserResult result = new CompactionPathParser(state).parse(dataset);
    double threshold =
        RecompactionConditionBasedOnRatio.getRatioThresholdByDatasetName(result.getDatasetName(), thresholdMap);
    log.info("Threshold is {} for dataset {}", threshold, result.getDatasetName());

    InputRecordCountHelper helper = new InputRecordCountHelper(state);
    try {
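      // The new count comes from the input folder; the previous count is read from the
      // destination directory, where it was persisted by the last run (the stateless
      // approach described above).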
      double newRecords = helper.calculateRecordCount(Lists.newArrayList(new Path(dataset.datasetURN())));
      double oldRecords = InputRecordCountHelper.readRecordCount(helper.getFs(), new Path(result.getDstAbsoluteDir()));
      log.info("Dataset {} : previous records {}, current records {}", dataset.datasetURN(), oldRecords, newRecords);

      if (oldRecords == 0) {
        return true;
      }
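      // Trigger compaction when the relative growth (new - old) / old exceeds the threshold.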
      if ((newRecords - oldRecords) / oldRecords > threshold) {
        log.info("Dataset {} records exceeded the threshold {}", dataset.datasetURN(), threshold);
        return true;
      }
    } catch (IOException e) {
      // Log the full stack trace; a bare e.toString() would discard it.
      log.error("Failed to verify dataset " + dataset.datasetURN(), e);
    }

    return false;
  }

  /**
   * Get compaction threshold verifier name
   */
  public String getName() {
    return COMPACTION_VERIFIER_THRESHOLD;
  }
}