/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.data.management.retention.dataset;
import java.io.IOException;
import java.util.List;
import java.util.Properties;
import org.apache.hadoop.fs.FileSystem;
import org.slf4j.Logger;
import com.google.common.collect.ImmutableList;
import com.typesafe.config.Config;
import gobblin.data.management.retention.policy.RetentionPolicy;
import gobblin.data.management.version.FileSystemDatasetVersion;
import gobblin.data.management.version.finder.VersionFinder;
import gobblin.dataset.FileSystemDataset;
/**
* Implementation of a {@link CleanableDataset} that uses a
* {@link gobblin.data.management.version.finder.VersionFinder} to find dataset versions, a
* {@link gobblin.data.management.retention.policy.RetentionPolicy} to figure out deletable versions, and then deletes
* those files and newly empty parent directories.
*
* <p>
* Concrete subclasses should implement {@link #getVersionFinder} and {@link #getRetentionPolicy}.
* </p>
*
* <p>
* Datasets are directories in the filesystem containing data files organized in version-like directory structures.
* Example datasets:
* </p>
*
* <p>
* For snapshot based datasets, with the directory structure:
* <pre>
* /path/to/table/
* snapshot1/
* dataFiles...
* snapshot2/
* dataFiles...
* </pre>
* snapshot1 and snapshot2 are each a dataset version.
* </p>
*
* <p>
* For tracking datasets, with the directory structure:
* <pre>
* /path/to/tracking/data/
* 2015/
* 06/
* 01/
* dataFiles...
* 02/
* dataFiles...
* </pre>
* 2015/06/01 and 2015/06/02 are each a dataset version.
* </p>
*
* <p>
* {@link CleanableDatasetBase} uses a {@link gobblin.data.management.version.finder.DatasetVersionFinder} to find all
* subdirectories that are versions of this dataset. After that, for each dataset, it uses a
* {@link gobblin.data.management.retention.policy.RetentionPolicy} to decide which versions of the dataset should be
* deleted. For each version deleted, if {@link #deleteEmptyDirectories} is set, it also looks at all parent
* directories and deletes those that are now empty, up to but not including the dataset root.
* </p>
*
* @param <T> type of {@link gobblin.data.management.version.FileSystemDatasetVersion} supported by this
* {@link CleanableDataset}.
*/
public abstract class CleanableDatasetBase<T extends FileSystemDatasetVersion>
    extends MultiVersionCleanableDatasetBase<T> implements CleanableDataset, FileSystemDataset {

  /**
   * Creates the dataset by handing every argument, unchanged, to the matching
   * {@link MultiVersionCleanableDatasetBase} constructor.
   *
   * @param fs file system forwarded to the superclass
   * @param props properties forwarded to the superclass
   * @param config configuration forwarded to the superclass
   * @param log logger forwarded to the superclass
   * @throws IOException if the superclass constructor throws it
   */
  public CleanableDatasetBase(FileSystem fs, Properties props, Config config, Logger log) throws IOException {
    super(fs, props, config, log);
  }

  /**
   * Creates the dataset by handing every argument, unchanged, to the matching
   * {@link MultiVersionCleanableDatasetBase} constructor.
   *
   * @param fs file system forwarded to the superclass
   * @param props properties forwarded to the superclass
   * @param log logger forwarded to the superclass
   * @throws IOException if the superclass constructor throws it
   */
  public CleanableDatasetBase(FileSystem fs, Properties props, Logger log) throws IOException {
    super(fs, props, log);
  }

  /**
   * Creates the dataset from explicit flags, all of which are passed straight through to the matching
   * {@link MultiVersionCleanableDatasetBase} constructor.
   *
   * @param fs file system forwarded to the superclass
   * @param properties properties forwarded to the superclass
   * @param simulate flag forwarded to the superclass
   * @param skipTrash flag forwarded to the superclass
   * @param deleteEmptyDirectories flag forwarded to the superclass
   * @param deleteAsOwner flag forwarded to the superclass
   * @param isDatasetBlacklisted flag forwarded to the superclass
   * @param log logger forwarded to the superclass
   * @throws IOException if the superclass constructor throws it
   */
  public CleanableDatasetBase(FileSystem fs, Properties properties, boolean simulate, boolean skipTrash,
      boolean deleteEmptyDirectories, boolean deleteAsOwner, boolean isDatasetBlacklisted, Logger log)
      throws IOException {
    super(fs, properties, simulate, skipTrash, deleteEmptyDirectories, deleteAsOwner, isDatasetBlacklisted, log);
  }

  /**
   * Variant of the explicit-flag constructor that does not take a blacklist flag; the value used instead is
   * {@code IS_DATASET_BLACKLISTED_DEFAULT} parsed with {@link Boolean#parseBoolean(String)}.
   *
   * @param fs file system forwarded to the superclass
   * @param properties properties forwarded to the superclass
   * @param simulate flag forwarded to the superclass
   * @param skipTrash flag forwarded to the superclass
   * @param deleteEmptyDirectories flag forwarded to the superclass
   * @param deleteAsOwner flag forwarded to the superclass
   * @param log logger forwarded to the superclass
   * @throws IOException if the superclass constructor throws it
   */
  public CleanableDatasetBase(FileSystem fs, Properties properties, boolean simulate, boolean skipTrash,
      boolean deleteEmptyDirectories, boolean deleteAsOwner, Logger log) throws IOException {
    // Chain to the full constructor so the superclass call lives in exactly one place.
    this(fs, properties, simulate, skipTrash, deleteEmptyDirectories, deleteAsOwner,
        Boolean.parseBoolean(IS_DATASET_BLACKLISTED_DEFAULT), log);
  }

  /**
   * Supplies the {@link VersionFinder} that {@link #getVersionFindersAndPolicies()} pairs with the retention policy.
   *
   * @return the version finder for this dataset
   */
  public abstract VersionFinder<? extends T> getVersionFinder();

  /**
   * Supplies the {@link RetentionPolicy} that {@link #getVersionFindersAndPolicies()} pairs with the version finder.
   *
   * @return the retention policy for this dataset
   */
  public abstract RetentionPolicy<T> getRetentionPolicy();

  /**
   * Adapts the single policy/finder pair of this class to the list-based contract of the superclass.
   *
   * @return an immutable one-element list holding a {@link VersionFinderAndPolicy} built from
   *         {@link #getRetentionPolicy()} and {@link #getVersionFinder()}
   */
  @Override
  public List<VersionFinderAndPolicy<T>> getVersionFindersAndPolicies() {
    // The policy is fetched before the finder, matching the argument order of the pair's constructor.
    RetentionPolicy<T> policy = getRetentionPolicy();
    VersionFinder<? extends T> finder = getVersionFinder();
    VersionFinderAndPolicy<T> pair = new VersionFinderAndPolicy<T>(policy, finder);
    return ImmutableList.of(pair);
  }
}