/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.data.management.version.finder;
import gobblin.data.management.version.FileSystemDatasetVersion;
import gobblin.data.management.version.StringDatasetVersion;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.Nullable;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Function;
import com.google.common.base.Optional;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
/**
* Finds watermarked dataset versions as direct subdirectories of the dataset directory. The watermark is assumed
* to be part of the subdirectory name. By default, the watermark is the subdirectory name itself, but a regular
* expression can be provided to extract the watermark from the name; the first capturing group of that
* expression is used as the watermark. Watermarks are ordered by natural String ordering.
*
* <p>
* For example, snapshots of a database can be named by the unix timestamp when the snapshot was dumped:
* /path/to/snapshots/1436223009-snapshot
* /path/to/snapshots/1436234210-snapshot
* In this case the versions are 1436223009-snapshot, 1436234210-snapshot. Since the watermark is at the
* beginning of the name, the natural string ordering is good enough to sort the snapshots, so no regexp is
* required to extract the actual watermark.
* </p>
*/
public class WatermarkDatasetVersionFinder extends DatasetVersionFinder<StringDatasetVersion> {

  public static final Logger LOGGER = LoggerFactory.getLogger(WatermarkDatasetVersionFinder.class);

  /**
   * Configuration key for the optional regular expression used to extract the watermark from a version
   * directory name. The expression must declare at least one capturing group; group 1 is the watermark.
   */
  public static final String WATERMARK_REGEX_KEY = "version.watermark.regex";

  /** Compiled watermark expression; absent when {@link #WATERMARK_REGEX_KEY} is not configured. */
  private final Optional<Pattern> pattern;

  /**
   * Creates a finder configured from {@link Properties}; delegates to the {@link Config} based constructor.
   *
   * @param fs file system handed to the parent finder
   * @param props properties that may contain {@link #WATERMARK_REGEX_KEY}
   */
  public WatermarkDatasetVersionFinder(FileSystem fs, Properties props) {
    this(fs, ConfigFactory.parseProperties(props));
  }

  /**
   * Creates a finder configured from a {@link Config}.
   *
   * @param fs file system handed to the parent finder
   * @param config configuration that may contain {@link #WATERMARK_REGEX_KEY}
   * @throws java.util.regex.PatternSyntaxException if the configured expression is not a valid regular expression
   */
  public WatermarkDatasetVersionFinder(FileSystem fs, Config config) {
    super(fs);
    // Compile once here so every candidate path reuses the same Pattern instance.
    this.pattern = config.hasPath(WATERMARK_REGEX_KEY)
        ? Optional.of(Pattern.compile(config.getString(WATERMARK_REGEX_KEY)))
        : Optional.<Pattern>absent();
  }

  /**
   * @return the concrete version class produced by this finder, {@link StringDatasetVersion}
   */
  @Override
  public Class<? extends FileSystemDatasetVersion> versionClass() {
    return StringDatasetVersion.class;
  }

  /**
   * @return the glob {@code *}; per the class description, candidate versions are the direct children of the
   *         dataset directory
   */
  @Override
  public Path globVersionPattern() {
    return new Path("*");
  }

  /**
   * Builds the version for a candidate path.
   *
   * <p>
   * Without a configured expression the last component of {@code pathRelativeToDatasetRoot} is the watermark.
   * With an expression, the first occurrence found in that name is used and capturing group 1 becomes the
   * watermark.
   * </p>
   *
   * @param pathRelativeToDatasetRoot candidate path relative to the dataset root; only its name is inspected
   * @param fullPath full path of the candidate, stored in the returned version
   * @return the parsed version, or {@code null} if an expression is configured and the name does not match it,
   *         the expression declares no capturing group, or group 1 did not participate in the match
   */
  @Nullable
  @Override
  public StringDatasetVersion getDatasetVersion(Path pathRelativeToDatasetRoot, Path fullPath) {
    String name = pathRelativeToDatasetRoot.getName();
    if (!this.pattern.isPresent()) {
      return new StringDatasetVersion(name, fullPath);
    }

    Matcher matcher = this.pattern.get().matcher(name);
    // group(1) is null when the overall match succeeded but group 1 took no part in it (e.g. "(a)|b" on "b");
    // such a candidate carries no watermark and is treated like a non-matching name.
    String watermark = matcher.find() && matcher.groupCount() >= 1 ? matcher.group(1) : null;
    if (watermark == null) {
      LOGGER.warn("Candidate dataset version at {} does not match expected pattern. Ignoring.",
          pathRelativeToDatasetRoot);
      return null;
    }
    return new StringDatasetVersion(watermark, fullPath);
  }
}