/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.data.management.retention.profile;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Pattern;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import gobblin.data.management.retention.DatasetCleaner;
import gobblin.dataset.Dataset;
import gobblin.dataset.DatasetsFinder;
import gobblin.util.ConfigUtils;
import gobblin.util.PathUtils;
/**
* A configurable {@link DatasetsFinder} that looks for
* {@link gobblin.data.management.retention.dataset.CleanableDataset}s using a glob pattern.
*/
public abstract class ConfigurableGlobDatasetFinder<T extends Dataset> implements DatasetsFinder<T> {
private static final Logger LOG = LoggerFactory.getLogger(ConfigurableGlobDatasetFinder.class);
private static final String CONFIGURATION_KEY_PREFIX = "gobblin.";
@Deprecated
/** @deprecated use {@link DATASET_FINDER_PATTERN_KEY} */
public static final String DATASET_PATTERN_KEY = DatasetCleaner.CONFIGURATION_KEY_PREFIX + "dataset.pattern";
@Deprecated
/** @deprecated use {@link DATASET_FINDER_BLACKLIST_KEY} */
public static final String DATASET_BLACKLIST_KEY = DatasetCleaner.CONFIGURATION_KEY_PREFIX + "dataset.blacklist";
public static final String DATASET_FINDER_PATTERN_KEY = CONFIGURATION_KEY_PREFIX + "dataset.pattern";
public static final String DATASET_FINDER_BLACKLIST_KEY = CONFIGURATION_KEY_PREFIX + "dataset.blacklist";
protected final Path datasetPattern;
private final Optional<Pattern> blacklist;
private final Path commonRoot;
protected final FileSystem fs;
protected final Properties props;
private static final Map<String, String> DEPRECATIONS = ImmutableMap.of(DATASET_FINDER_PATTERN_KEY,
DATASET_PATTERN_KEY, DATASET_FINDER_BLACKLIST_KEY, DATASET_BLACKLIST_KEY);
public ConfigurableGlobDatasetFinder(FileSystem fs, Properties jobProps, Config config) {
for (String property : requiredProperties()) {
Preconditions.checkArgument(config.hasPath(property) || config.hasPath(DEPRECATIONS.get(property)),
String.format("Missing required property %s", property));
}
if (ConfigUtils.hasNonEmptyPath(config, DATASET_BLACKLIST_KEY)) {
this.blacklist = Optional.of(Pattern.compile(config.getString(DATASET_BLACKLIST_KEY)));
} else if (ConfigUtils.hasNonEmptyPath(config, DATASET_FINDER_BLACKLIST_KEY)) {
this.blacklist = Optional.of(Pattern.compile(config.getString(DATASET_FINDER_BLACKLIST_KEY)));
} else {
this.blacklist = Optional.absent();
}
this.fs = fs;
Path tmpDatasetPattern;
if (config.hasPath(DATASET_FINDER_PATTERN_KEY)) {
tmpDatasetPattern = new Path(config.getString(DATASET_FINDER_PATTERN_KEY));
} else {
tmpDatasetPattern = new Path(config.getString(DATASET_PATTERN_KEY));
}
this.datasetPattern =
tmpDatasetPattern.isAbsolute() ? tmpDatasetPattern : new Path(this.fs.getWorkingDirectory(), tmpDatasetPattern);
this.commonRoot = PathUtils.deepestNonGlobPath(this.datasetPattern);
this.props = jobProps;
}
public ConfigurableGlobDatasetFinder(FileSystem fs, Properties props) {
this(fs, props, ConfigFactory.parseProperties(props));
}
/**
* List of required properties for subclasses of this dataset. The constructor will check that the input
* {@link java.util.Properties} contain all properties returned.
* @return List of all required property keys in the constructor {@link java.util.Properties}.
*/
public List<String> requiredProperties() {
return Lists.newArrayList(DATASET_FINDER_PATTERN_KEY);
}
/**
* Finds all directories satisfying the input glob pattern, and creates a {@link gobblin.data.management.retention.dataset.CleanableDataset}
* for each one using {@link #datasetAtPath}.
* @return List of {@link gobblin.data.management.retention.dataset.CleanableDataset}s in the file system.
* @throws IOException
*/
@Override
public List<T> findDatasets() throws IOException {
List<T> datasets = Lists.newArrayList();
LOG.info("Finding datasets for pattern " + this.datasetPattern);
FileStatus[] fileStatuss = this.getDatasetDirs();
if (fileStatuss != null) {
for (FileStatus fileStatus : fileStatuss) {
Path pathToMatch = PathUtils.getPathWithoutSchemeAndAuthority(fileStatus.getPath());
if (this.blacklist.isPresent() && this.blacklist.get().matcher(pathToMatch.toString()).find()) {
continue;
}
LOG.info("Found dataset at " + fileStatus.getPath());
datasets.add(datasetAtPath(PathUtils.getPathWithoutSchemeAndAuthority(fileStatus.getPath())));
}
}
return datasets;
}
/**
* @return all the directories that satisfy the input glob pattern.
* @throws IOException
*/
protected FileStatus[] getDatasetDirs() throws IOException {
return this.fs.globStatus(this.datasetPattern);
}
/**
* Returns the deepest non-glob ancestor of the dataset pattern.
*/
@Override
public Path commonDatasetRoot() {
return this.commonRoot;
}
/**
* Creates a {@link gobblin.data.management.retention.dataset.CleanableDataset} from a path. The default implementation
* creates a {@link gobblin.data.management.retention.dataset.ConfigurableCleanableDataset}.
* @param path {@link org.apache.hadoop.fs.Path} where dataset is located.
* @return {@link gobblin.data.management.retention.dataset.CleanableDataset} at that path.
* @throws IOException
*/
public abstract T datasetAtPath(Path path) throws IOException;
}