/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.data.management.retention.dataset;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import lombok.Getter;
import lombok.extern.slf4j.Slf4j;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigRenderOptions;
import gobblin.data.management.copy.hive.HiveDataset;
import gobblin.data.management.policy.SelectBeforeTimeBasedPolicy;
import gobblin.data.management.policy.VersionSelectionPolicy;
import gobblin.data.management.retention.version.HiveDatasetVersionCleaner;
import gobblin.data.management.version.HiveDatasetVersion;
import gobblin.data.management.version.finder.AbstractHiveDatasetVersionFinder;
import gobblin.data.management.version.finder.DatePartitionHiveVersionFinder;
import gobblin.hive.HiveMetastoreClientPool;
import gobblin.util.AutoReturnableObject;
import gobblin.util.ConfigUtils;
import gobblin.util.reflection.GobblinConstructorUtils;

/**
* <p>
 * A {@link HiveDataset} used for retention. The {@link HiveDataset} represents a Hive table and a {@link HiveDatasetVersion}
 * represents a Hive partition of this table.
* </p>
*
* <ul>
 * <li>A version finder at {@value #VERSION_FINDER_CLASS_KEY} is used to find all the partitions of the dataset.
 * <li>A selection policy at {@value #SELECTION_POLICY_CLASS_KEY} is applied to these partitions to select the partitions to be deleted (see the configuration sketch below).
 * <li>The selected partitions are dropped from the Hive metastore, and the associated data on the {@link FileSystem} is also deleted.
* </ul>
*
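 * <p>
 * A minimal configuration sketch (illustrative only; the keys shown are {@value #VERSION_FINDER_CLASS_KEY} and
 * {@value #SELECTION_POLICY_CLASS_KEY} with their default values, and any policy-specific settings, such as a
 * lookback time for {@link SelectBeforeTimeBasedPolicy}, are omitted):
 * </p>
 * <pre>
 *   version.finder.class=gobblin.data.management.version.finder.DatePartitionHiveVersionFinder
 *   selection.policy.class=gobblin.data.management.policy.SelectBeforeTimeBasedPolicy
 * </pre>
 *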
*/
@Slf4j
@SuppressWarnings({ "rawtypes", "unchecked" })
@Getter
public class CleanableHiveDataset extends HiveDataset implements CleanableDataset {
private static final String SHOULD_DELETE_DATA_KEY = "gobblin.retention.hive.shouldDeleteData";
private static final String SHOULD_DELETE_DATA_DEFAULT = Boolean.toString(false);
private static final String VERSION_FINDER_CLASS_KEY = "version.finder.class";
private static final String DEFAULT_VERSION_FINDER_CLASS = DatePartitionHiveVersionFinder.class.getName();
private static final String SELECTION_POLICY_CLASS_KEY = "selection.policy.class";
private static final String DEFAULT_SELECTION_POLICY_CLASS = SelectBeforeTimeBasedPolicy.class.getName();
private final VersionSelectionPolicy hiveSelectionPolicy;
private final AbstractHiveDatasetVersionFinder hiveDatasetVersionFinder;
private final boolean simulate;
private final boolean shouldDeleteData;
  private final FsCleanableHelper fsCleanableHelper;

public CleanableHiveDataset(FileSystem fs, HiveMetastoreClientPool clientPool, Table table, Properties jobProps,
Config config) throws IOException {
super(fs, clientPool, table, jobProps, config);
try {
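      // Instantiate the selection policy reflectively; constructors taking (Config, Properties), (Config)
      // and (Properties) are tried in that order, using the first one that matches.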
this.hiveSelectionPolicy =
(VersionSelectionPolicy) GobblinConstructorUtils.invokeFirstConstructor(Class.forName(ConfigUtils.getString(
this.datasetConfig, SELECTION_POLICY_CLASS_KEY, DEFAULT_SELECTION_POLICY_CLASS)), ImmutableList.<Object> of(
this.datasetConfig, jobProps), ImmutableList.<Object> of(this.datasetConfig), ImmutableList.<Object> of(jobProps));
log.info(String.format("Configured selection policy %s for dataset:%s with config %s",
ConfigUtils.getString(this.datasetConfig, SELECTION_POLICY_CLASS_KEY, DEFAULT_SELECTION_POLICY_CLASS),
datasetURN(), this.datasetConfig.root().render(ConfigRenderOptions.concise())));
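      // Instantiate the version finder reflectively; constructors taking (FileSystem, Config) and
      // (FileSystem, Properties) are tried in that order, using the first one that matches.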
this.hiveDatasetVersionFinder =
(AbstractHiveDatasetVersionFinder) GobblinConstructorUtils.invokeFirstConstructor(Class.forName(ConfigUtils
.getString(this.datasetConfig, VERSION_FINDER_CLASS_KEY, DEFAULT_VERSION_FINDER_CLASS)), ImmutableList
.<Object> of(this.fs, this.datasetConfig), ImmutableList.<Object> of(this.fs, jobProps));
} catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException | InstantiationException
| ClassNotFoundException e) {
log.error("Failed to instantiate CleanableHiveDataset", e);
throw new IllegalArgumentException(e);
}
this.fsCleanableHelper = new FsCleanableHelper(fs, jobProps, this.datasetConfig, log);
    this.shouldDeleteData = Boolean.parseBoolean(jobProps.getProperty(SHOULD_DELETE_DATA_KEY, SHOULD_DELETE_DATA_DEFAULT));
    this.simulate = Boolean.parseBoolean(jobProps.getProperty(FsCleanableHelper.SIMULATE_KEY, FsCleanableHelper.SIMULATE_DEFAULT));
  }

/**
 * Drops the partitions selected by {@link #hiveSelectionPolicy}. Also deletes the data associated with them.
* <p>
* If an {@link Exception} occurs while processing a {@link Partition}, other {@link Partition}s will still be deleted.
* However, a {@link RuntimeException} is thrown at the end if there was at least one {@link Exception}.
* </p>
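 *
 * <p>
 * A minimal usage sketch (the {@code fs}, {@code pool}, {@code table}, {@code jobProps} and {@code config}
 * arguments are assumed to be supplied by the caller, e.g. by a retention job):
 * </p>
 * <pre>{@code
 *   CleanableHiveDataset dataset = new CleanableHiveDataset(fs, pool, table, jobProps, config);
 *   dataset.clean(); // drops the selected partitions and, if configured, deletes the underlying data
 * }</pre>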
*/
@Override
public void clean() throws IOException {
List versions = Lists.newArrayList(this.hiveDatasetVersionFinder.findDatasetVersions(this));
if (versions.isEmpty()) {
log.warn(String.format("No dataset version can be found. Ignoring %s", this.getTable().getCompleteName()));
return;
}
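    // Sort versions in reverse natural order before applying the selection policy.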
Collections.sort(versions, Collections.reverseOrder());
Collection<HiveDatasetVersion> deletableVersions = this.hiveSelectionPolicy.listSelectedVersions(versions);
log.info(String.format("Cleaning dataset %s .Will drop %s out of %s partitions.", datasetURN(), deletableVersions.size(),
versions.size()));
List<Exception> exceptions = Lists.newArrayList();
for (HiveDatasetVersion hiveDatasetVersion : deletableVersions) {
try {
// Initialize the version cleaner
HiveDatasetVersionCleaner hiveDatasetVersionCleaner = new HiveDatasetVersionCleaner(hiveDatasetVersion, this);
// Perform pre-clean actions
hiveDatasetVersionCleaner.preCleanAction();
// Perform actual cleaning
hiveDatasetVersionCleaner.clean();
        // Perform post-clean actions, e.g. swapping partitions
hiveDatasetVersionCleaner.postCleanAction();
      } catch (IOException e) {
        log.error(String.format("Failed to clean version %s of dataset %s", hiveDatasetVersion, datasetURN()), e);
        exceptions.add(e);
      }
}
if (!exceptions.isEmpty()) {
throw new RuntimeException(String.format("Deletion failed for %s partitions", exceptions.size()));
}
  }

@Override
public Path datasetRoot() {
return super.getTable().getDataLocation();
}
}