/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.data.management.version.finder;
import java.io.IOException;
import java.util.Collection;
import java.util.List;
import java.util.Properties;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import com.google.common.collect.Lists;
import gobblin.dataset.Dataset;
import gobblin.dataset.FileSystemDataset;
import gobblin.data.management.version.FileSystemDatasetVersion;
import gobblin.util.PathUtils;
/**
* Class to find {@link FileSystemDataset} versions in the file system.
*
* Concrete subclasses should implement a ({@link org.apache.hadoop.fs.FileSystem}, {@link java.util.Properties})
* constructor to be instantiated.
*
* Provides a callback {@link AbstractDatasetVersionFinder#getDatasetVersion(Path, FileStatus)} which subclasses need to
* implement.
*
* @param <T> Type of {@link gobblin.data.management.version.FileSystemDatasetVersion} expected from this class.
*/
public abstract class AbstractDatasetVersionFinder<T extends FileSystemDatasetVersion> implements VersionFinder<T> {

  /** File system that is globbed for dataset versions. */
  protected FileSystem fs;

  /**
   * Create a version finder operating on <code>fs</code>.
   *
   * @param fs {@link org.apache.hadoop.fs.FileSystem} used to glob for dataset versions.
   * @param props {@link java.util.Properties} accepted so all finders share a common constructor signature; this
   *              base class does not read them.
   */
  public AbstractDatasetVersionFinder(FileSystem fs, Properties props) {
    this.fs = fs;
  }

  /**
   * Create a version finder operating on <code>fs</code> with empty {@link java.util.Properties}.
   *
   * @param fs {@link org.apache.hadoop.fs.FileSystem} used to glob for dataset versions.
   */
  public AbstractDatasetVersionFinder(FileSystem fs) {
    this(fs, new Properties());
  }

  /**
   * Find dataset versions of the input {@link Dataset}. Dataset versions are located by globbing
   * {@link #globVersionPattern()} under the dataset root, and each match represents a single manageable unit in the
   * dataset. See {@link gobblin.data.management.retention.DatasetCleaner} for more information.
   *
   * <p>
   * Each matched {@link org.apache.hadoop.fs.FileStatus} is handed to {@link #getDatasetVersion(Path, FileStatus)};
   * matches for which that callback returns <code>null</code> are skipped.
   * </p>
   *
   * @param dataset the dataset whose versions should be found; it is cast to {@link FileSystemDataset}.
   * @return {@link Collection} of dataset versions found; empty if the glob matched nothing.
   * @throws IOException if globbing the file system fails.
   */
  @Override
  public Collection<T> findDatasetVersions(Dataset dataset) throws IOException {
    FileSystemDataset fsDataset = (FileSystemDataset) dataset;
    Path datasetRoot = fsDataset.datasetRoot();
    Path versionGlob = new Path(datasetRoot, globVersionPattern());

    List<T> dataSetVersions = Lists.newArrayList();

    FileStatus[] versionStatuses = this.fs.globStatus(versionGlob);
    if (versionStatuses == null) {
      // FileSystem#globStatus returns null (not an empty array) when the pattern contains no glob
      // characters and the path does not exist; treat that as "no versions" instead of failing with an NPE.
      return dataSetVersions;
    }

    for (FileStatus versionStatus : versionStatuses) {
      Path relativePath = PathUtils.relativizePath(versionStatus.getPath(), datasetRoot);
      T datasetVersion = getDatasetVersion(relativePath, versionStatus);
      if (datasetVersion != null) {
        dataSetVersions.add(datasetVersion);
      }
    }
    return dataSetVersions;
  }

  /**
   * Should return class of T.
   */
  @Override
  public abstract Class<? extends FileSystemDatasetVersion> versionClass();

  /**
   * Glob pattern relative to the root of the dataset used to find {@link org.apache.hadoop.fs.FileStatus} for each
   * dataset version.
   *
   * @return glob pattern relative to dataset root.
   */
  public abstract Path globVersionPattern();

  /**
   * Create a {@link gobblin.data.management.version.DatasetVersion} with <code>versionFileStatus</code> and a path
   * relative to the dataset.
   *
   * @param pathRelativeToDatasetRoot {@link org.apache.hadoop.fs.Path} of dataset version relative to dataset root.
   * @param versionFileStatus {@link FileStatus} of the dataset version.
   * @return {@link gobblin.data.management.version.DatasetVersion} for that {@link FileStatus}, or
   *         <code>null</code> to have {@link #findDatasetVersions(Dataset)} skip this path.
   */
  public abstract T getDatasetVersion(Path pathRelativeToDatasetRoot, FileStatus versionFileStatus);
}