/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.data.management.retention.dataset;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Getter;
import lombok.Singular;
import org.apache.hadoop.fs.FileSystem;
import org.slf4j.Logger;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import gobblin.data.management.policy.EmbeddedRetentionSelectionPolicy;
import gobblin.data.management.policy.SelectNothingPolicy;
import gobblin.data.management.policy.VersionSelectionPolicy;
import gobblin.data.management.retention.action.RetentionAction;
import gobblin.data.management.retention.policy.RetentionPolicy;
import gobblin.data.management.trash.ProxiedTrash;
import gobblin.data.management.version.DatasetVersion;
import gobblin.data.management.version.FileSystemDatasetVersion;
import gobblin.data.management.version.finder.VersionFinder;
import gobblin.dataset.FileSystemDataset;
import gobblin.util.ConfigUtils;
/**
* A {@link CleanableDataset} that may have multiple {@link VersionFinder}, {@link VersionSelectionPolicy}
* and {@link RetentionAction}s. Retention needs to performed for different kinds of {@link DatasetVersion}s. Each
* kind of {@link DatasetVersion} can have its own {@link VersionSelectionPolicy} and/or {@link RetentionAction}
* associated with it.
* <ul>
* <li>{@link MultiVersionCleanableDatasetBase#getVersionFindersAndPolicies()} gets a list {@link VersionFinderAndPolicy}s
* <li>Each {@link VersionFinderAndPolicy} contains a {@link VersionFinder} and a {@link VersionSelectionPolicy}. It can
* optionally have a {@link RetentionAction}
* <li>The {@link MultiVersionCleanableDatasetBase#clean()} method finds all the {@link FileSystemDatasetVersion}s using
* {@link VersionFinderAndPolicy#versionFinder}
* <li> It gets the deletable {@link FileSystemDatasetVersion}s by applying {@link VersionFinderAndPolicy#versionSelectionPolicy}.
* These deletable version are deleted and then deletes empty parent directories.
* <li>If additional retention actions are available at {@link VersionFinderAndPolicy#getRetentionActions()}, all versions
* found by the {@link VersionFinderAndPolicy#versionFinder} are passed to {@link RetentionAction#execute(List)} for
* each {@link RetentionAction}
* </ul>
*
* <p>
* Concrete subclasses should implement {@link #getVersionFindersAndPolicies()}
* </p>
*
* <p>
* Datasets are directories in the filesystem containing data files organized in version-like directory structures.
* Example datasets:
* </p>
*
* <p>
* For snapshot based datasets, with the directory structure:
* <pre>
* /path/to/table/
* snapshot1/
* dataFiles...
* snapshot2/
* dataFiles...
* </pre>
* each of snapshot1 and snapshot2 are dataset versions.
* </p>
*
* <p>
* For tracking datasets, with the directory structure:
* <pre>
* /path/to/tracking/data/
* 2015/
* 06/
* 01/
* dataFiles...
* 02/
* dataFiles...
* </pre>
* each of 2015/06/01 and 2015/06/02 are dataset versions.
* </p>
*
* @param <T> type of {@link FileSystemDatasetVersion} supported by this {@link CleanableDataset}.
*/
public abstract class MultiVersionCleanableDatasetBase<T extends FileSystemDatasetVersion>
implements CleanableDataset, FileSystemDataset {
/**
* @deprecated in favor of {@link FsCleanableHelper}
*/
@Deprecated
public static final String CONFIGURATION_KEY_PREFIX = FsCleanableHelper.CONFIGURATION_KEY_PREFIX;
/**
* @deprecated in favor of {@link FsCleanableHelper}
*/
@Deprecated
public static final String SIMULATE_KEY = FsCleanableHelper.SIMULATE_KEY;
public static final String SIMULATE_DEFAULT = FsCleanableHelper.SIMULATE_DEFAULT;
/**
* @deprecated in favor of {@link FsCleanableHelper}
*/
@Deprecated
public static final String SKIP_TRASH_KEY = FsCleanableHelper.SKIP_TRASH_KEY;
public static final String SKIP_TRASH_DEFAULT = FsCleanableHelper.SKIP_TRASH_DEFAULT;
/**
* @deprecated in favor of {@link FsCleanableHelper}
*/
@Deprecated
public static final String DELETE_EMPTY_DIRECTORIES_KEY = FsCleanableHelper.DELETE_EMPTY_DIRECTORIES_KEY;
public static final String DELETE_EMPTY_DIRECTORIES_DEFAULT = FsCleanableHelper.DELETE_EMPTY_DIRECTORIES_DEFAULT;
/**
* @deprecated in favor of {@link FsCleanableHelper}
*/
@Deprecated
public static final String DELETE_AS_OWNER_KEY = FsCleanableHelper.DELETE_AS_OWNER_KEY;
public static final String DELETE_AS_OWNER_DEFAULT = FsCleanableHelper.DELETE_AS_OWNER_DEFAULT;
public static final String IS_DATASET_BLACKLISTED_KEY = CONFIGURATION_KEY_PREFIX + "dataset.is.blacklisted";
public static final String IS_DATASET_BLACKLISTED_DEFAULT = Boolean.toString(false);
protected final FileSystem fs;
/**
* @deprecated in favor of {@link FsCleanableHelper}
*/
@Deprecated
protected final ProxiedTrash trash;
@Getter
@VisibleForTesting
protected final boolean isDatasetBlacklisted;
private final FsCleanableHelper fsCleanableHelper;
protected final Logger log;
/**
* @deprecated in favor of {@link FsCleanableHelper}
*/
@Deprecated
protected final boolean simulate;
/**
* @deprecated in favor of {@link FsCleanableHelper}
*/
@Deprecated
protected final boolean skipTrash;
/**
* @deprecated in favor of {@link FsCleanableHelper}
*/
@Deprecated
protected final boolean deleteEmptyDirectories;
/**
* @deprecated in favor of {@link FsCleanableHelper}
*/
@Deprecated
protected final boolean deleteAsOwner;
/**
* Get {@link gobblin.data.management.retention.policy.RetentionPolicy} to use.
*/
public abstract List<VersionFinderAndPolicy<T>> getVersionFindersAndPolicies();
public MultiVersionCleanableDatasetBase(final FileSystem fs, final Properties props, Config config, Logger log)
throws IOException {
this(fs, props, Boolean.valueOf(props.getProperty(SIMULATE_KEY, SIMULATE_DEFAULT)),
Boolean.valueOf(props.getProperty(SKIP_TRASH_KEY, SKIP_TRASH_DEFAULT)),
Boolean.valueOf(props.getProperty(DELETE_EMPTY_DIRECTORIES_KEY, DELETE_EMPTY_DIRECTORIES_DEFAULT)),
Boolean.valueOf(props.getProperty(DELETE_AS_OWNER_KEY, DELETE_AS_OWNER_DEFAULT)),
ConfigUtils.getBoolean(config, IS_DATASET_BLACKLISTED_KEY, Boolean.valueOf(IS_DATASET_BLACKLISTED_DEFAULT)), log);
}
public MultiVersionCleanableDatasetBase(final FileSystem fs, final Properties props, Logger log) throws IOException {
// This constructor is used by retention jobs configured through job configs and do not use dataset configs from config store.
// IS_DATASET_BLACKLISTED_KEY is only available with dataset config. Hence set IS_DATASET_BLACKLISTED_KEY to default
// ...false for jobs running with job configs
this(fs, props, ConfigFactory.parseMap(ImmutableMap.<String, String> of(IS_DATASET_BLACKLISTED_KEY,
IS_DATASET_BLACKLISTED_DEFAULT)), log);
}
/**
* Constructor for {@link MultiVersionCleanableDatasetBase}.
* @param fs {@link org.apache.hadoop.fs.FileSystem} where files are located.
* @param properties {@link java.util.Properties} for object.
* @param simulate whether to simulate deletes.
* @param skipTrash if true, delete files and directories immediately.
* @param deleteEmptyDirectories if true, newly empty parent directories will be deleted.
* @param deleteAsOwner if true, all deletions will be executed as the owner of the file / directory.
* @param log logger to use.
* @param isDatasetBlacklisted if true, clean will be skipped for this dataset
*
* @throws IOException
*/
public MultiVersionCleanableDatasetBase(FileSystem fs, Properties properties, boolean simulate, boolean skipTrash,
boolean deleteEmptyDirectories, boolean deleteAsOwner, boolean isDatasetBlacklisted, Logger log)
throws IOException {
this.log = log;
this.fsCleanableHelper = new FsCleanableHelper(fs, properties, simulate, skipTrash, deleteEmptyDirectories, deleteAsOwner, log);
this.fs = fs;
this.simulate = simulate;
this.skipTrash = skipTrash;
this.deleteEmptyDirectories = deleteEmptyDirectories;
this.trash = this.fsCleanableHelper.getTrash();
this.deleteAsOwner = deleteAsOwner;
this.isDatasetBlacklisted = isDatasetBlacklisted;
}
public MultiVersionCleanableDatasetBase(FileSystem fs, Properties properties, boolean simulate, boolean skipTrash,
boolean deleteEmptyDirectories, boolean deleteAsOwner, Logger log) throws IOException {
this(fs, properties, simulate, skipTrash, deleteEmptyDirectories, deleteAsOwner,
Boolean.parseBoolean(IS_DATASET_BLACKLISTED_DEFAULT), log);
}
/**
* Method to perform the Retention operations for this dataset.
*
*<ul>
* <li>{@link MultiVersionCleanableDatasetBase#getVersionFindersAndPolicies()} gets a list {@link VersionFinderAndPolicy}s
* <li>Each {@link VersionFinderAndPolicy} contains a {@link VersionFinder} and a {@link VersionSelectionPolicy}. It can
* optionally have a {@link RetentionAction}
* <li>The {@link MultiVersionCleanableDatasetBase#clean()} method finds all the {@link FileSystemDatasetVersion}s using
* {@link VersionFinderAndPolicy#versionFinder}
* <li> It gets the deletable {@link FileSystemDatasetVersion}s by applying {@link VersionFinderAndPolicy#versionSelectionPolicy}.
* These deletable version are deleted and then deletes empty parent directories.
* <li>If additional retention actions are available at {@link VersionFinderAndPolicy#getRetentionActions()}, all versions
* found by the {@link VersionFinderAndPolicy#versionFinder} are passed to {@link RetentionAction#execute(List)} for
* each {@link RetentionAction}
* </ul>
*
*/
@Override
public void clean() throws IOException {
if (this.isDatasetBlacklisted) {
this.log.info("Dataset blacklisted. Cleanup skipped for " + datasetRoot());
return;
}
boolean atLeastOneFailureSeen = false;
for (VersionFinderAndPolicy<T> versionFinderAndPolicy : getVersionFindersAndPolicies()) {
VersionSelectionPolicy<T> selectionPolicy = versionFinderAndPolicy.getVersionSelectionPolicy();
VersionFinder<? extends T> versionFinder = versionFinderAndPolicy.getVersionFinder();
if (!selectionPolicy.versionClass().isAssignableFrom(versionFinder.versionClass())) {
throw new IOException("Incompatible dataset version classes.");
}
this.log.info(String.format("Cleaning dataset %s. Using version finder %s and policy %s", this,
versionFinder.getClass().getName(), selectionPolicy));
List<T> versions = Lists.newArrayList(versionFinder.findDatasetVersions(this));
if (versions.isEmpty()) {
this.log.warn("No dataset version can be found. Ignoring.");
continue;
}
Collections.sort(versions, Collections.reverseOrder());
Collection<T> deletableVersions = selectionPolicy.listSelectedVersions(versions);
cleanImpl(deletableVersions);
List<DatasetVersion> allVersions = Lists.newArrayList();
for (T ver : versions) {
allVersions.add(ver);
}
for (RetentionAction retentionAction : versionFinderAndPolicy.getRetentionActions()) {
try {
retentionAction.execute(allVersions);
} catch (Throwable t) {
atLeastOneFailureSeen = true;
log.error(String.format("RetentionAction %s failed for dataset %s", retentionAction.getClass().getName(),
this.datasetRoot()), t);
}
}
}
if (atLeastOneFailureSeen) {
throw new RuntimeException(String.format(
"At least one failure happened while processing %s. Look for previous logs for failures", datasetRoot()));
}
}
protected void cleanImpl(Collection<T> deletableVersions) throws IOException {
this.fsCleanableHelper.clean(deletableVersions, this);
}
@Override
public String toString() {
return datasetRoot().toString();
}
@Override
public String datasetURN() {
return this.datasetRoot().toString();
}
/**
* A composition of version finder
* @param <T> the type of {@link FileSystemDatasetVersion} this version finder knows to find
*/
@Getter
@Builder
@AllArgsConstructor
public static class VersionFinderAndPolicy<T extends FileSystemDatasetVersion> {
private final VersionSelectionPolicy<T> versionSelectionPolicy;
private final VersionFinder<? extends T> versionFinder;
@Singular
private final List<RetentionAction> retentionActions;
/**
* Constructor for backward compatibility
* @deprecated use {@link VersionFinderAndPolicyBuilder}
*/
@Deprecated
public VersionFinderAndPolicy(VersionSelectionPolicy<T> versionSelectionPolicy, VersionFinder<? extends T> versionFinder) {
this.versionSelectionPolicy = versionSelectionPolicy;
this.versionFinder = versionFinder;
this.retentionActions = Lists.newArrayList();
}
public VersionFinderAndPolicy(RetentionPolicy<T> retentionPolicy, VersionFinder<? extends T> versionFinder) {
this(new EmbeddedRetentionSelectionPolicy<>(retentionPolicy), versionFinder);
}
public static class VersionFinderAndPolicyBuilder<T extends FileSystemDatasetVersion> {
@SuppressWarnings("unchecked")
public VersionFinderAndPolicy<T> build() {
VersionSelectionPolicy<T> localVersionSelectionPolicy;
List<RetentionAction> localRetentionActions;
if (this.versionSelectionPolicy == null) {
localVersionSelectionPolicy = (VersionSelectionPolicy<T>) new SelectNothingPolicy(new Properties());
} else {
localVersionSelectionPolicy = this.versionSelectionPolicy;
}
if (this.retentionActions == null) {
localRetentionActions = Lists.newArrayList();
} else {
localRetentionActions = Lists.newArrayList(this.retentionActions);
}
return new VersionFinderAndPolicy<T>(localVersionSelectionPolicy, this.versionFinder,
localRetentionActions);
}
}
}
}