/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.compaction.conditions;
import java.util.List;
import java.util.Map;
import gobblin.annotation.Alias;
import gobblin.compaction.dataset.DatasetHelper;
import gobblin.compaction.dataset.Dataset;
import gobblin.compaction.mapreduce.MRCompactor;
import gobblin.util.DatasetFilterUtils;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Splitter;
import com.google.common.collect.Maps;
/**
 * An implementation of {@link RecompactionCondition} which examines the late record percentage.
 * If the fraction of late records exceeds the configured threshold, a recompaction is triggered.
 *
 * <p>Per-dataset thresholds are read from the job property
 * {@link MRCompactor#COMPACTION_LATEDATA_THRESHOLD_FOR_RECOMPACT_PER_DATASET}. The value is a list of
 * {@code DATASETS:THRESHOLD} entries separated by
 * {@link #DATASETS_WITH_DIFFERENT_RECOMPACT_THRESHOLDS_SEPARATOR}; inside an entry, several dataset
 * patterns sharing one threshold are separated by
 * {@link #DATASETS_WITH_SAME_RECOMPACT_THRESHOLDS_SEPARATOR}, for example
 * {@code patternA,patternB:0.1;patternC:0.25}.
 */
@Alias("RecompactionConditionBasedOnRatio")
public class RecompactionConditionBasedOnRatio implements RecompactionCondition {
  /** Separates entries that carry different thresholds. */
  public static final char DATASETS_WITH_DIFFERENT_RECOMPACT_THRESHOLDS_SEPARATOR = ';';
  /** Separates dataset patterns that share the same threshold within one entry. */
  public static final char DATASETS_WITH_SAME_RECOMPACT_THRESHOLDS_SEPARATOR = ',';
  /** Separates the dataset pattern list from its threshold within one entry. */
  public static final char DATASETS_AND_RECOMPACT_THRESHOLD_SEPARATOR = ':';

  private static final Logger logger = LoggerFactory.getLogger (RecompactionConditionBasedOnRatio.class);

  /** Late-data ratio above which recompaction is requested for the dataset this condition was built for. */
  private final double ratio;

  /**
   * Resolves the threshold for {@code dataset} from its job properties.
   * Instances are created through {@link Factory}.
   */
  private RecompactionConditionBasedOnRatio (Dataset dataset) {
    Map<String, Double> datasetRegexAndRecompactThreshold = getDatasetRegexAndRecompactThreshold(
        dataset.jobProps().getProp(
            MRCompactor.COMPACTION_LATEDATA_THRESHOLD_FOR_RECOMPACT_PER_DATASET, StringUtils.EMPTY));
    this.ratio = getOwnRatioThreshold (dataset, datasetRegexAndRecompactThreshold);
  }

  /** Factory that builds a {@link RecompactionConditionBasedOnRatio} for a given dataset. */
  @Alias("RecompactBasedOnRatio")
  public static class Factory implements RecompactionConditionFactory {
    @Override public RecompactionCondition createRecompactionCondition (Dataset dataset) {
      return new RecompactionConditionBasedOnRatio (dataset);
    }
  }

  /**
   * Parses a threshold configuration string into a map from dataset pattern list to threshold.
   *
   * <p>Entries that are not of the form {@code DATASETS:THRESHOLD}, or whose threshold is not a
   * parsable number, are logged and skipped so that one bad entry does not discard the others.
   *
   * @param datasetsAndRecompactThresholds the raw configuration value; may be {@code null} or blank
   * @return a map keyed by the (still comma-separated) dataset pattern list, iterating in
   *         configuration order; empty when nothing valid was configured
   */
  public static Map<String, Double> getDatasetRegexAndRecompactThreshold (String datasetsAndRecompactThresholds) {
    // Insertion-ordered so that, when several entries match a dataset, the first configured one wins
    // deterministically instead of depending on hash iteration order.
    Map<String, Double> topicRegexAndRecompactThreshold = Maps.newLinkedHashMap();
    if (StringUtils.isBlank(datasetsAndRecompactThresholds)) {
      return topicRegexAndRecompactThreshold;
    }

    for (String entry : Splitter.on(DATASETS_WITH_DIFFERENT_RECOMPACT_THRESHOLDS_SEPARATOR).trimResults()
        .omitEmptyStrings().splitToList(datasetsAndRecompactThresholds)) {
      List<String> topicsAndRecompactThreshold =
          Splitter.on(DATASETS_AND_RECOMPACT_THRESHOLD_SEPARATOR).trimResults().omitEmptyStrings().splitToList(entry);
      if (topicsAndRecompactThreshold.size() != 2) {
        logger.error("Invalid form (DATASET_NAME:THRESHOLD) in "
            + MRCompactor.COMPACTION_LATEDATA_THRESHOLD_FOR_RECOMPACT_PER_DATASET + ": " + entry);
        continue;
      }
      try {
        topicRegexAndRecompactThreshold.put(topicsAndRecompactThreshold.get(0),
            Double.parseDouble(topicsAndRecompactThreshold.get(1)));
      } catch (NumberFormatException e) {
        // Treat an unparsable threshold like any other malformed entry: report it and move on.
        logger.error("Invalid threshold (DATASET_NAME:THRESHOLD) in "
            + MRCompactor.COMPACTION_LATEDATA_THRESHOLD_FOR_RECOMPACT_PER_DATASET + ": " + entry, e);
      }
    }
    return topicRegexAndRecompactThreshold;
  }

  /** Looks up the threshold that applies to {@code dataset} by its dataset name. */
  private double getOwnRatioThreshold (Dataset dataset, Map<String, Double> datasetRegexAndRecompactThreshold) {
    return getRatioThresholdByDatasetName (dataset.getDatasetName(), datasetRegexAndRecompactThreshold);
  }

  /**
   * Finds the threshold configured for a dataset name.
   *
   * @param datasetName the dataset name to match
   * @param datasetRegexAndRecompactThreshold map from comma-separated dataset pattern list to threshold,
   *        as produced by {@link #getDatasetRegexAndRecompactThreshold(String)}
   * @return the threshold of the first entry (in the map's iteration order) one of whose patterns
   *         matches {@code datasetName}, or
   *         {@link MRCompactor#DEFAULT_COMPACTION_LATEDATA_THRESHOLD_FOR_RECOMPACT_PER_DATASET}
   *         when no entry matches
   */
  public static double getRatioThresholdByDatasetName (String datasetName, Map<String, Double> datasetRegexAndRecompactThreshold) {
    for (Map.Entry<String, Double> topicRegexEntry : datasetRegexAndRecompactThreshold.entrySet()) {
      List<String> patternStrings = Splitter.on(DATASETS_WITH_SAME_RECOMPACT_THRESHOLDS_SEPARATOR)
          .trimResults().omitEmptyStrings().splitToList(topicRegexEntry.getKey());
      if (DatasetFilterUtils.stringInPatterns(datasetName,
          DatasetFilterUtils.getPatternsFromStrings(patternStrings))) {
        return topicRegexEntry.getValue();
      }
    }
    return MRCompactor.DEFAULT_COMPACTION_LATEDATA_THRESHOLD_FOR_RECOMPACT_PER_DATASET;
  }

  /**
   * Decides whether the dataset should be recompacted, based on its late-record ratio.
   *
   * @param datasetHelper source of the late and non-late output record counts
   * @return {@code true} when late / (late + non-late) is strictly greater than the threshold;
   *         {@code false} otherwise, including when there are no output records at all
   */
  public boolean isRecompactionNeeded (DatasetHelper datasetHelper) {
    long lateDataCount = datasetHelper.getLateOutputRecordCount();
    long nonLateDataCount = datasetHelper.getOutputRecordCount();
    long totalCount = lateDataCount + nonLateDataCount;

    // With no records the ratio would be 0/0 = NaN; report it as 0 and skip recompaction explicitly
    // rather than relying on NaN comparing false.
    if (totalCount == 0) {
      logger.info ("Late data ratio is {} and threshold is {}", 0.0, this.ratio);
      return false;
    }

    double lateDataPercent = lateDataCount * 1.0 / totalCount;
    logger.info ("Late data ratio is {} and threshold is {}", lateDataPercent, this.ratio);
    return lateDataPercent > this.ratio;
  }
}