/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.compaction.dataset;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Sets;
import gobblin.compaction.mapreduce.MRCompactor;
import gobblin.configuration.State;
import gobblin.util.DatasetFilterUtils;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.joda.time.Period;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.joda.time.format.PeriodFormatter;
import org.joda.time.format.PeriodFormatterBuilder;
import java.io.IOException;
import java.util.Set;
/**
* An implementation of {@link DatasetsFinder} based on time-based subdirs of the inputDir.
*
* {@link #inputDir} may contain multiple datasets. The path must follow some subdir and time-based pattern,
* which can be configured by compaction.*.subdir and compaction.timebased.folder.pattern.
* For example, the subdir name is 'daily' and the time-based pattern is 'YYYY/MM/dd'.
* A dataset will be created for each qualified folder that matches '[inputDir]/datasetName/daily/YYYY/MM/dd'.
*
* Dataset name is used for blacklist/whitelist, and finding high/normal priorities, and recompaction threshold.
*
* To control which folders to process, use properties compaction.timebased.min.time.ago and
* compaction.timebased.max.time.ago. The format is ?m?d?h (months, days, hours), optionally followed by
* ?min (minutes), e.g., 3m or 2d10h.
*/
@Slf4j
public class TimeBasedSubDirDatasetsFinder extends DatasetsFinder {
// Common prefix of every time-based compaction property key declared below.
private static final String COMPACTION_TIMEBASED_PREFIX = "compaction.timebased.";
/**
* Configuration properties related to time based compaction jobs.
*
* Property key for the time pattern of the folders below each dataset's input subdir. The value is
* handed to Joda-Time's DateTimeFormat.forPattern, so it must be a valid Joda-Time pattern.
*/
public static final String COMPACTION_TIMEBASED_FOLDER_PATTERN = COMPACTION_TIMEBASED_PREFIX + "folder.pattern";
// Folder time pattern used when COMPACTION_TIMEBASED_FOLDER_PATTERN is not set.
public static final String DEFAULT_COMPACTION_TIMEBASED_FOLDER_PATTERN = "YYYY/MM/dd";
// Property key for the glob, relative to inputDir, that is used to locate dataset directories.
public static final String COMPACTION_TIMEBASED_SUBDIR_PATTERN = COMPACTION_TIMEBASED_PREFIX + "subdir.pattern";
// Glob used when COMPACTION_TIMEBASED_SUBDIR_PATTERN is not set.
public static final String DEFAULT_COMPACTION_TIMEBASED_SUBDIR_PATTERN = "*";
// The earliest dataset timestamp to be processed, expressed as a period subtracted from the current time.
// Format = ?m?d?h (m = months, d = days, h = hours); getPeriodFormatter() also accepts a trailing ?min (minutes).
public static final String COMPACTION_TIMEBASED_MAX_TIME_AGO = COMPACTION_TIMEBASED_PREFIX + "max.time.ago";
public static final String DEFAULT_COMPACTION_TIMEBASED_MAX_TIME_AGO = "3d";
// The latest dataset timestamp to be processed, expressed as a period subtracted from the current time.
// Format = ?m?d?h (m = months, d = days, h = hours); getPeriodFormatter() also accepts a trailing ?min (minutes).
public static final String COMPACTION_TIMEBASED_MIN_TIME_AGO = COMPACTION_TIMEBASED_PREFIX + "min.time.ago";
public static final String DEFAULT_COMPACTION_TIMEBASED_MIN_TIME_AGO = "1d";
// Folder time pattern read from COMPACTION_TIMEBASED_FOLDER_PATTERN (or its default).
protected final String folderTimePattern;
// Dataset directory glob read from COMPACTION_TIMEBASED_SUBDIR_PATTERN (or its default).
protected final String subDirPattern;
// Time zone read from MRCompactor.COMPACTION_TIMEZONE; used by timeFormatter and to obtain the current time.
protected final DateTimeZone timeZone;
// Formatter for folderTimePattern in timeZone; parses folder times and renders them back into path segments.
protected final DateTimeFormatter timeFormatter;
// Subdir below each dataset directory that holds the input time folders (MRCompactor.COMPACTION_INPUT_SUBDIR).
protected final String inputSubDir;
// inputSubDir with MRCompactor.COMPACTION_LATE_DIR_SUFFIX appended.
protected final String inputLateSubDir;
// Subdir below each dataset directory in the destination (MRCompactor.COMPACTION_DEST_SUBDIR).
protected final String destSubDir;
// destSubDir with MRCompactor.COMPACTION_LATE_DIR_SUFFIX appended.
protected final String destLateSubDir;
@VisibleForTesting
public TimeBasedSubDirDatasetsFinder(State state, FileSystem fs) throws Exception {
super(state, fs);
this.inputSubDir = getInputSubDir();
this.inputLateSubDir = getInputLateSubDir();
this.destSubDir = getDestSubDir();
this.destLateSubDir = getDestLateSubDir();
this.folderTimePattern = getFolderPattern();
this.subDirPattern = getSubDirPattern();
this.timeZone = DateTimeZone
.forID(this.state.getProp(MRCompactor.COMPACTION_TIMEZONE, MRCompactor.DEFAULT_COMPACTION_TIMEZONE));
this.timeFormatter = DateTimeFormat.forPattern(this.folderTimePattern).withZone(this.timeZone);
}
public TimeBasedSubDirDatasetsFinder(State state) throws Exception {
super(state);
this.inputSubDir = getInputSubDir();
this.inputLateSubDir = getInputLateSubDir();
this.destSubDir = getDestSubDir();
this.destLateSubDir = getDestLateSubDir();
this.folderTimePattern = getFolderPattern();
this.subDirPattern = getSubDirPattern();
this.timeZone = DateTimeZone
.forID(this.state.getProp(MRCompactor.COMPACTION_TIMEZONE, MRCompactor.DEFAULT_COMPACTION_TIMEZONE));
this.timeFormatter = DateTimeFormat.forPattern(this.folderTimePattern).withZone(this.timeZone);
}
/**
 * Derives the dataset name as the part of {@code path} that follows the first occurrence of
 * {@code basePath}, with one leading "/" removed.
 *
 * NOTE: if {@code basePath} does not occur in {@code path}, {@code indexOf} yields -1 and the
 * result is not a meaningful dataset name.
 *
 * @param path full path of the dataset directory
 * @param basePath base directory expected to be contained in {@code path}
 * @return the remainder of {@code path} after {@code basePath}
 */
protected String getDatasetName(String path, String basePath) {
  final int baseEnd = path.indexOf(basePath) + basePath.length();
  final String remainder = path.substring(baseEnd);
  return StringUtils.removeStart(remainder, "/");
}
/**
 * Each subdir in {@link DatasetsFinder#inputDir} that matches the subdir pattern is considered a dataset,
 * if it satisfies blacklist and whitelist.
 *
 * For every such dataset, each time folder below its input subdir whose time can be parsed and lies
 * within the allowed period yields one {@link Dataset}. Folders whose time cannot be parsed are skipped
 * with a warning.
 *
 * @return the datasets found; empty if nothing matches
 * @throws IOException if listing the file system fails
 */
@Override
public Set<Dataset> findDistinctDatasets() throws IOException {
  Set<Dataset> datasets = Sets.newHashSet();

  FileStatus[] datasetDirs = this.fs.globStatus(new Path(inputDir, subDirPattern));
  if (datasetDirs == null) {
    // Hadoop's globStatus returns null (not an empty array) for a non-glob path that does not exist.
    log.warn("No dataset directory found under {} for subdir pattern {}", inputDir, subDirPattern);
    return datasets;
  }

  // The glob derived from the folder time pattern is identical for every dataset, so compute it once.
  String folderStructure = getFolderStructure();

  for (FileStatus datasetsFileStatus : datasetDirs) {
    log.info("Scanning directory : {}", datasetsFileStatus.getPath());
    if (!datasetsFileStatus.isDirectory()) {
      continue;
    }
    String datasetName = getDatasetName(datasetsFileStatus.getPath().toString(), inputDir);
    if (!DatasetFilterUtils.survived(datasetName, this.blacklist, this.whitelist)) {
      continue;
    }
    log.info("Found dataset: {}", datasetName);
    addTimeBasedDatasets(datasets, datasetName, folderStructure);
  }
  return datasets;
}

/**
 * Adds to {@code datasets} one {@link Dataset} per time folder of {@code datasetName} that lies within
 * the allowed period.
 */
private void addTimeBasedDatasets(Set<Dataset> datasets, String datasetName, String folderStructure)
    throws IOException {
  Path inputPath = new Path(this.inputDir, new Path(datasetName, this.inputSubDir));
  Path inputLatePath = new Path(this.inputDir, new Path(datasetName, this.inputLateSubDir));
  Path outputPath = new Path(this.destDir, new Path(datasetName, this.destSubDir));
  Path outputLatePath = new Path(this.destDir, new Path(datasetName, this.destLateSubDir));
  Path outputTmpPath = new Path(this.tmpOutputDir, new Path(datasetName, this.destSubDir));
  double priority = this.getDatasetPriority(datasetName);

  FileStatus[] timeFolders = this.fs.globStatus(new Path(inputPath, folderStructure));
  if (timeFolders == null) {
    // Same null-return contract as above; nothing to compact for this dataset.
    log.warn("No time folder found under {} for folder structure {}", inputPath, folderStructure);
    return;
  }

  for (FileStatus status : timeFolders) {
    Path jobInputPath = status.getPath();
    DateTime folderTime;
    try {
      folderTime = getFolderTime(jobInputPath, inputPath);
    } catch (RuntimeException e) {
      log.warn("{} is not a valid folder. Will be skipped due to exception.", jobInputPath, e);
      continue;
    }
    if (!folderWithinAllowedPeriod(jobInputPath, folderTime)) {
      continue;
    }

    Path jobInputLatePath = appendFolderTime(inputLatePath, folderTime);
    Path jobOutputPath = appendFolderTime(outputPath, folderTime);
    Path jobOutputLatePath = appendFolderTime(outputLatePath, folderTime);
    Path jobOutputTmpPath = appendFolderTime(outputTmpPath, folderTime);

    // When recompacting, the previous output folders are used as the job's input.
    Dataset timeBasedDataset = new Dataset.Builder().withPriority(priority)
        .withDatasetName(datasetName)
        .addInputPath(this.recompactDatasets ? jobOutputPath : jobInputPath)
        .addInputLatePath(this.recompactDatasets ? jobOutputLatePath : jobInputLatePath)
        .withOutputPath(jobOutputPath).withOutputLatePath(jobOutputLatePath)
        .withOutputTmpPath(jobOutputTmpPath).build();

    // Stores the extra information for timeBasedDataset
    timeBasedDataset.setJobProp(MRCompactor.COMPACTION_JOB_DEST_PARTITION,
        folderTime.toString(this.timeFormatter));
    timeBasedDataset.setJobProp(MRCompactor.COMPACTION_INPUT_PATH_TIME, folderTime.getMillis());
    datasets.add(timeBasedDataset);
  }
}
/** Reads the input subdir name from the job state, falling back to the MRCompactor default. */
private String getInputSubDir() {
  final String configuredSubDir =
      this.state.getProp(MRCompactor.COMPACTION_INPUT_SUBDIR, MRCompactor.DEFAULT_COMPACTION_INPUT_SUBDIR);
  return configuredSubDir;
}
/** Returns the input subdir name with {@link MRCompactor#COMPACTION_LATE_DIR_SUFFIX} appended. */
private String getInputLateSubDir() {
  // getInputSubDir() performs the same property lookup the late variant is based on.
  return getInputSubDir() + MRCompactor.COMPACTION_LATE_DIR_SUFFIX;
}
/** Returns the destination subdir name with {@link MRCompactor#COMPACTION_LATE_DIR_SUFFIX} appended. */
private String getDestLateSubDir() {
  // getDestSubDir() performs the same property lookup the late variant is based on.
  return getDestSubDir() + MRCompactor.COMPACTION_LATE_DIR_SUFFIX;
}
/** Reads the destination subdir name from the job state, falling back to the MRCompactor default. */
private String getDestSubDir() {
  final String configuredSubDir =
      this.state.getProp(MRCompactor.COMPACTION_DEST_SUBDIR, MRCompactor.DEFAULT_COMPACTION_DEST_SUBDIR);
  return configuredSubDir;
}
/**
 * Turns the folder time pattern into a glob by replacing every run of letters, digits,
 * {@code =}, {@code '} and {@code -} with a single {@code *}; for example {@code YYYY/MM/dd}
 * becomes {@code *}{@code /*}{@code /*}.
 *
 * @return the glob matching the time-folder layout
 */
protected String getFolderStructure() {
  final String patternTokenRun = "[a-zA-Z0-9='-]+";
  final String wildcard = "*";
  return this.folderTimePattern.replaceAll(patternTokenRun, wildcard);
}
/** Reads the folder time pattern from the job state (or its default) and logs the value in use. */
private String getFolderPattern() {
  final String configuredPattern =
      this.state.getProp(COMPACTION_TIMEBASED_FOLDER_PATTERN, DEFAULT_COMPACTION_TIMEBASED_FOLDER_PATTERN);

  log.info("Compaction folder pattern: " + configuredPattern);

  return configuredPattern;
}
/** Reads the dataset subdir glob from the job state (or its default) and logs the value in use. */
private String getSubDirPattern() {
  final String configuredPattern =
      this.state.getProp(COMPACTION_TIMEBASED_SUBDIR_PATTERN, DEFAULT_COMPACTION_TIMEBASED_SUBDIR_PATTERN);

  log.info("Compaction subdir pattern: " + configuredPattern);

  return configuredPattern;
}
/**
 * Parses the time encoded in the part of {@code path} that follows {@code basePath}.
 *
 * The remainder (minus one leading "/") is parsed with the finder's time formatter. A remainder
 * that does not parse surfaces as a runtime exception, which {@link #findDistinctDatasets()}
 * catches in order to skip the folder.
 *
 * @param path full path of a time folder
 * @param basePath the dataset's input path expected to be contained in {@code path}
 * @return the folder time
 */
protected DateTime getFolderTime(Path path, Path basePath) {
  final String fullPath = path.toString();
  final String base = basePath.toString();
  final int relativeStart = fullPath.indexOf(base) + base.length();
  final String timePart = StringUtils.removeStart(fullPath.substring(relativeStart), "/");
  return this.timeFormatter.parseDateTime(timePart);
}
/**
 * Return true iff input folder time is between compaction.timebased.min.time.ago and
 * compaction.timebased.max.time.ago.
 *
 * Both bounds are computed relative to the current time in the finder's time zone and are
 * inclusive: a folder time equal to either bound is accepted. A rejected folder is logged at
 * INFO level.
 *
 * @param inputFolder the folder being checked; used only for logging
 * @param folderTime the time parsed from the folder's path
 * @return whether the folder should be processed
 */
protected boolean folderWithinAllowedPeriod(Path inputFolder, DateTime folderTime) {
  DateTime currentTime = new DateTime(this.timeZone);
  PeriodFormatter periodFormatter = getPeriodFormatter();
  DateTime earliestAllowedFolderTime = getEarliestAllowedFolderTime(currentTime, periodFormatter);
  DateTime latestAllowedFolderTime = getLatestAllowedFolderTime(currentTime, periodFormatter);

  // Parameterized logging: the message is only assembled when INFO is enabled.
  if (folderTime.isBefore(earliestAllowedFolderTime)) {
    log.info("Folder time for {} is {}, earlier than the earliest allowed folder time, {}. Skipping",
        inputFolder, folderTime, earliestAllowedFolderTime);
    return false;
  }
  if (folderTime.isAfter(latestAllowedFolderTime)) {
    log.info("Folder time for {} is {}, later than the latest allowed folder time, {}. Skipping",
        inputFolder, folderTime, latestAllowedFolderTime);
    return false;
  }
  return true;
}
/**
 * Builds the formatter used for the min/max "time ago" properties: months with suffix "m",
 * days with suffix "d", hours with suffix "h" and minutes with suffix "min", in that order.
 *
 * @return a new period formatter
 */
public static PeriodFormatter getPeriodFormatter() {
  PeriodFormatterBuilder builder = new PeriodFormatterBuilder();
  builder.appendMonths().appendSuffix("m");
  builder.appendDays().appendSuffix("d");
  builder.appendHours().appendSuffix("h");
  builder.appendMinutes().appendSuffix("min");
  return builder.toFormatter();
}
/**
 * Computes the lower bound of the allowed window: {@code currentTime} minus the period configured
 * under {@link #COMPACTION_TIMEBASED_MAX_TIME_AGO} (or its default).
 */
private DateTime getEarliestAllowedFolderTime(DateTime currentTime, PeriodFormatter periodFormatter) {
  final Period lookback = periodFormatter.parsePeriod(
      this.state.getProp(COMPACTION_TIMEBASED_MAX_TIME_AGO, DEFAULT_COMPACTION_TIMEBASED_MAX_TIME_AGO));
  return currentTime.minus(lookback);
}
/**
 * Computes the upper bound of the allowed window: {@code currentTime} minus the period configured
 * under {@link #COMPACTION_TIMEBASED_MIN_TIME_AGO} (or its default).
 */
private DateTime getLatestAllowedFolderTime(DateTime currentTime, PeriodFormatter periodFormatter) {
  final Period lookback = periodFormatter.parsePeriod(
      this.state.getProp(COMPACTION_TIMEBASED_MIN_TIME_AGO, DEFAULT_COMPACTION_TIMEBASED_MIN_TIME_AGO));
  return currentTime.minus(lookback);
}
/**
 * Appends {@code folderTime}, rendered with the finder's time formatter, as a child of {@code path}.
 *
 * @param path parent path
 * @param folderTime time to render into the child path segment(s)
 * @return the combined path
 */
protected Path appendFolderTime(Path path, DateTime folderTime) {
  final String timeSegment = folderTime.toString(this.timeFormatter);
  return new Path(path, timeSegment);
}
}