/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package gobblin.data.management.copy.hive;

import java.io.IOException;
import java.net.URI;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InvalidInputException;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.thrift.TException;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.io.Closer;
import com.google.gson.Gson;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import gobblin.commit.CommitStep;
import gobblin.configuration.State;
import gobblin.data.management.copy.CopyConfiguration;
import gobblin.data.management.copy.CopyEntity;
import gobblin.data.management.copy.CopyableFile;
import gobblin.data.management.copy.OwnerAndPermission;
import gobblin.data.management.copy.entities.PostPublishStep;
import gobblin.data.management.copy.hive.avro.HiveAvroCopyEntityHelper;
import gobblin.data.management.partition.FileSet;
import gobblin.hive.HiveMetastoreClientPool;
import gobblin.hive.HiveRegProps;
import gobblin.hive.HiveRegisterStep;
import gobblin.hive.PartitionDeregisterStep;
import gobblin.hive.TableDeregisterStep;
import gobblin.hive.metastore.HiveMetaStoreUtils;
import gobblin.hive.spec.HiveSpec;
import gobblin.hive.spec.SimpleHiveSpec;
import gobblin.metrics.event.EventSubmitter;
import gobblin.metrics.event.MultiTimingEvent;
import gobblin.util.ClassAliasResolver;
import gobblin.util.PathUtils;
import gobblin.util.commit.DeleteFileCommitStep;
import gobblin.util.reflection.GobblinConstructorUtils;
import gobblin.util.request_allocation.PushDownRequestor;
import lombok.Builder;
import lombok.Data;
import lombok.Getter;
import lombok.Singular;
import lombok.ToString;
import lombok.extern.slf4j.Slf4j;

/**
* Creates {@link CopyEntity}s for copying a Hive table.
*/
@Slf4j
@Getter
public class HiveCopyEntityHelper {
public static final String EXISTING_ENTITY_POLICY_KEY =
HiveDatasetFinder.HIVE_DATASET_PREFIX + ".existing.entity.conflict.policy";
public static final String DEFAULT_EXISTING_ENTITY_POLICY = ExistingEntityPolicy.ABORT.name();
public static final String UNMANAGED_DATA_POLICY_KEY =
HiveDatasetFinder.HIVE_DATASET_PREFIX + ".unmanaged.data.conflict.policy";
public static final String DEFAULT_UNMANAGED_DATA_POLICY = UnmanagedDataPolicy.ABORT.name();
/** Target metastore URI */
public static final String TARGET_METASTORE_URI_KEY =
HiveDatasetFinder.HIVE_DATASET_PREFIX + ".copy.target.metastore.uri";
/** Target database name */
public static final String TARGET_DATABASE_KEY = HiveDatasetFinder.HIVE_DATASET_PREFIX + ".copy.target.database";
/** A filter to select partitions to copy */
public static final String COPY_PARTITIONS_FILTER_CONSTANT =
HiveDatasetFinder.HIVE_DATASET_PREFIX + ".copy.partition.filter.constant";
/** Use an implementation of {@link PartitionFilterGenerator} to dynamically create the partition filter. The value
 * should be the fully qualified class name of the implementation to use. */
public static final String COPY_PARTITION_FILTER_GENERATOR =
HiveDatasetFinder.HIVE_DATASET_PREFIX + ".copy.partition.filter.generator";
/** A predicate applied to each partition before any file listing.
* If the predicate returns true, the partition will be skipped. */
public static final String FAST_PARTITION_SKIP_PREDICATE =
HiveDatasetFinder.HIVE_DATASET_PREFIX + ".copy.fast.partition.skip.predicate";
/** A predicate applied to a non-partitioned table before any file listing.
 * If the predicate returns true, the table will be skipped. */
public static final String FAST_TABLE_SKIP_PREDICATE =
HiveDatasetFinder.HIVE_DATASET_PREFIX + ".copy.fast.table.skip.predicate";
/** Method for deleting files on deregister. One of {@link DeregisterFileDeleteMethod}. */
public static final String DELETE_FILES_ON_DEREGISTER =
HiveDatasetFinder.HIVE_DATASET_PREFIX + ".copy.deregister.fileDeleteMethod";
public static final DeregisterFileDeleteMethod DEFAULT_DEREGISTER_DELETE_METHOD =
DeregisterFileDeleteMethod.NO_DELETE;
/**
 * Config key selecting a {@link HivePartitionExtendedFilter} (via a {@link HivePartitionExtendedFilterFactory} alias)
 * for cases where {@link IMetaStoreClient}'s {@link IMetaStoreClient#listPartitionsByFilter} is not expressive enough
 * to filter out specific partitions.
 * For example, if you specify "Path" as the filter type and "Hourly" as the filtering condition,
 * only partitions whose path contains '/Hourly/' will be kept.
 */
public static final String HIVE_PARTITION_EXTENDED_FILTER_TYPE = HiveDatasetFinder.HIVE_DATASET_PREFIX + ".extendedFilterType";
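// Illustrative wiring of the keys above into job properties (a sketch; the literal key names assume
// HiveDatasetFinder.HIVE_DATASET_PREFIX resolves to "hive.dataset", and the filter value is a hypothetical
// Hive metastore partition filter expression):
//   hive.dataset.copy.target.metastore.uri=thrift://target-metastore.example.org:9083
//   hive.dataset.copy.target.database=backup_db
//   hive.dataset.existing.entity.conflict.policy=REPLACE_PARTITIONS
//   hive.dataset.copy.partition.filter.constant=datepartition >= "2016-01-01-00"
//   hive.dataset.copy.deregister.fileDeleteMethod=RECURSIVE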
static final Gson gson = new Gson();
private static final String source_client = "source_client";
private static final String target_client = "target_client";
public static final String GOBBLIN_DISTCP = "gobblin-distcp";
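/** Stage names reported through {@link MultiTimingEvent} while setting up the copy. */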
public static class Stages {
public static final String EXISTING_PARTITION = "ExistingPartition";
public static final String PARTITION_SKIP_PREDICATE = "PartitionSkipPredicate";
public static final String CREATE_LOCATIONS = "CreateLocations";
public static final String FULL_PATH_DIFF = "FullPathDiff";
public static final String CREATE_DELETE_UNITS = "CreateDeleteUnits";
public static final String CREATE_COPY_UNITS = "CreateCopyUnits";
public static final String SOURCE_PATH_LISTING = "SourcePathListing";
public static final String TARGET_EXISTING_PATH_LISTING = "TargetExistingPathListing";
public static final String DESIRED_PATHS_LISTING = "DesiredPathsListing";
public static final String PATH_DIFF = "PathDiff";
public static final String COMPUTE_DELETE_PATHS = "ComputeDeletePaths";
public static final String GET_TABLES = "GetTables";
public static final String COMPUTE_TARGETS = "ComputeTargets";
}
private final long startTime;
private final HiveDataset dataset;
private final CopyConfiguration configuration;
private final FileSystem targetFs;
private final HiveMetastoreClientPool targetClientPool;
private final String targetDatabase;
private final HiveRegProps hiveRegProps;
private Optional<Table> existingTargetTable;
private final Table targetTable;
private final Optional<String> targetURI;
private final ExistingEntityPolicy existingEntityPolicy;
private final UnmanagedDataPolicy unmanagedDataPolicy;
private final Optional<String> partitionFilter;
private Optional<? extends HivePartitionExtendedFilter> hivePartitionExtendedFilter;
private final Optional<Predicate<HivePartitionFileSet>> fastPartitionSkip;
private final Optional<Predicate<HiveCopyEntityHelper>> fastTableSkip;
private final DeregisterFileDeleteMethod deleteMethod;
private final Optional<CommitStep> tableRegistrationStep;
private final Map<List<String>, Partition> sourcePartitions;
private final Map<List<String>, Partition> targetPartitions;
private final EventSubmitter eventSubmitter;
@Getter
protected final HiveTargetPathHelper targetPathHelper;
/**
* Defines what should be done for partitions that exist in the target but are not compatible with the source.
*/
public enum ExistingEntityPolicy {
/** Deregister target partition, delete its files, and create a new partition with correct values. */
REPLACE_PARTITIONS,
/** Deregister target table, do NOT delete its files, and create a new table with correct values. */
REPLACE_TABLE,
/** Keep the target table as registered while updating the file location. */
UPDATE_TABLE,
/** Abort copying of the conflicting table. */
ABORT
}
/**
* Defines what should be done for data that is not managed by the existing target table / partition.
*/
public enum UnmanagedDataPolicy {
/** Delete any data that is not managed by the existing target table / partition. */
DELETE_UNMANAGED_DATA,
/** Abort copying of the conflicting table / partition. */
ABORT
}
public enum DeregisterFileDeleteMethod {
/** Delete the files pointed at by the input format. */
INPUT_FORMAT,
/** Delete all files at the partition location recursively. */
RECURSIVE,
/** Don't delete files, just deregister partition. */
NO_DELETE
}
/**
* A container for the differences between desired and existing files.
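*
* <p>Lombok generates a builder with singular adders, e.g. (a sketch with hypothetical variables):
* {@code DiffPathSet.builder().copyFile(statusA).copyFile(statusB).deleteFile(stalePath).build()}.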
*/
@Builder
@ToString
protected static class DiffPathSet {
/** Desired files that don't exist on target */
@Singular(value = "copyFile")
Collection<FileStatus> filesToCopy;
/** Files in target that are not desired */
@Singular(value = "deleteFile")
Collection<Path> pathsToDelete;
}
/**
* Represents a source {@link FileStatus} and a {@link Path} destination.
*/
@Data
private static class SourceAndDestination {
private final FileStatus source;
private final Path destination;
}
HiveCopyEntityHelper(HiveDataset dataset, CopyConfiguration configuration, FileSystem targetFs) throws IOException {
try (Closer closer = Closer.create()) {
log.info("Finding copy entities for table " + dataset.table.getCompleteName());
this.eventSubmitter = new EventSubmitter.Builder(dataset.getMetricContext(), "hive.dataset.copy").build();
MultiTimingEvent multiTimer = closer.register(new MultiTimingEvent(this.eventSubmitter, "HiveCopySetup", true));
this.startTime = System.currentTimeMillis();
this.dataset = dataset;
this.configuration = configuration;
this.targetFs = targetFs;
this.targetPathHelper = new HiveTargetPathHelper(this.dataset);
this.hiveRegProps = new HiveRegProps(new State(this.dataset.getProperties()));
this.targetURI = Optional.fromNullable(this.dataset.getProperties().getProperty(TARGET_METASTORE_URI_KEY));
this.targetClientPool = HiveMetastoreClientPool.get(this.dataset.getProperties(), this.targetURI);
this.targetDatabase = Optional.fromNullable(this.dataset.getProperties().getProperty(TARGET_DATABASE_KEY))
.or(this.dataset.table.getDbName());
this.existingEntityPolicy = ExistingEntityPolicy.valueOf(this.dataset.getProperties()
.getProperty(EXISTING_ENTITY_POLICY_KEY, DEFAULT_EXISTING_ENTITY_POLICY).toUpperCase());
this.unmanagedDataPolicy = UnmanagedDataPolicy.valueOf(
this.dataset.getProperties().getProperty(UNMANAGED_DATA_POLICY_KEY, DEFAULT_UNMANAGED_DATA_POLICY)
.toUpperCase());
this.deleteMethod = this.dataset.getProperties().containsKey(DELETE_FILES_ON_DEREGISTER)
? DeregisterFileDeleteMethod
.valueOf(this.dataset.getProperties().getProperty(DELETE_FILES_ON_DEREGISTER).toUpperCase())
: DEFAULT_DEREGISTER_DELETE_METHOD;
if (this.dataset.getProperties().containsKey(COPY_PARTITION_FILTER_GENERATOR)) {
try {
PartitionFilterGenerator generator = GobblinConstructorUtils.invokeFirstConstructor(
(Class<PartitionFilterGenerator>) Class
.forName(this.dataset.getProperties().getProperty(COPY_PARTITION_FILTER_GENERATOR)),
Lists.<Object> newArrayList(this.dataset.getProperties()), Lists.newArrayList());
this.partitionFilter = Optional.of(generator.getFilter(this.dataset));
log.info(String.format("Dynamic partition filter for table %s: %s.", this.dataset.table.getCompleteName(),
this.partitionFilter.get()));
} catch (ReflectiveOperationException roe) {
throw new IOException(roe);
}
} else {
this.partitionFilter =
Optional.fromNullable(this.dataset.getProperties().getProperty(COPY_PARTITIONS_FILTER_CONSTANT));
}
// Initialize extended partition filter
if (this.dataset.getProperties().containsKey(HIVE_PARTITION_EXTENDED_FILTER_TYPE)) {
String filterType = dataset.getProperties().getProperty(HIVE_PARTITION_EXTENDED_FILTER_TYPE);
try {
Config config = ConfigFactory.parseProperties(this.dataset.getProperties());
this.hivePartitionExtendedFilter =
Optional.of(new ClassAliasResolver<>(HivePartitionExtendedFilterFactory.class).resolveClass(filterType).newInstance().createFilter(config));
} catch (ReflectiveOperationException roe) {
log.error("Could not create extended partition filter for alias " + filterType, roe);
closer.close();
throw new IOException(roe);
}
} else {
this.hivePartitionExtendedFilter = Optional.absent();
}
try {
this.fastPartitionSkip = this.dataset.getProperties().containsKey(FAST_PARTITION_SKIP_PREDICATE)
? Optional.of(GobblinConstructorUtils.invokeFirstConstructor(
(Class<Predicate<HivePartitionFileSet>>) Class
.forName(this.dataset.getProperties().getProperty(FAST_PARTITION_SKIP_PREDICATE)),
Lists.<Object> newArrayList(this), Lists.newArrayList()))
: Optional.<Predicate<HivePartitionFileSet>> absent();
this.fastTableSkip = this.dataset.getProperties().containsKey(FAST_TABLE_SKIP_PREDICATE)
? Optional.of(GobblinConstructorUtils.invokeFirstConstructor(
(Class<Predicate<HiveCopyEntityHelper>>) Class
.forName(this.dataset.getProperties().getProperty(FAST_TABLE_SKIP_PREDICATE)),
Lists.newArrayList()))
: Optional.<Predicate<HiveCopyEntityHelper>> absent();
} catch (ReflectiveOperationException roe) {
closer.close();
throw new IOException(roe);
}
Map<String, HiveMetastoreClientPool> namedPools =
ImmutableMap.of(source_client, this.dataset.clientPool, target_client, this.targetClientPool);
multiTimer.nextStage(Stages.GET_TABLES);
try (HiveMetastoreClientPool.MultiClient multiClient = HiveMetastoreClientPool.safeGetClients(namedPools)) {
if (multiClient.getClient(target_client).tableExists(this.targetDatabase, this.dataset.table.getTableName())) {
this.existingTargetTable = Optional.of(new Table(
multiClient.getClient(target_client).getTable(this.targetDatabase, this.dataset.table.getTableName())));
} else {
this.existingTargetTable = Optional.absent();
}
// Constructing CommitStep object for table registration
Path targetPath = getTargetLocation(dataset.fs, this.targetFs, dataset.table.getDataLocation(),
Optional.<Partition> absent());
this.targetTable = getTargetTable(this.dataset.table, targetPath);
HiveSpec tableHiveSpec = new SimpleHiveSpec.Builder<>(targetPath)
.withTable(HiveMetaStoreUtils.getHiveTable(this.targetTable.getTTable())).build();
CommitStep tableRegistrationStep =
new HiveRegisterStep(this.targetURI, tableHiveSpec, this.hiveRegProps);
this.tableRegistrationStep = Optional.of(tableRegistrationStep);
if (this.existingTargetTable.isPresent() && this.existingTargetTable.get().isPartitioned()) {
checkPartitionedTableCompatibility(this.targetTable, this.existingTargetTable.get());
}
if (HiveUtils.isPartitioned(this.dataset.table)) {
this.sourcePartitions = HiveUtils.getPartitionsMap(multiClient.getClient(source_client), this.dataset.table,
this.partitionFilter, this.hivePartitionExtendedFilter);
// Note: this must be mutable, so we copy the map
this.targetPartitions =
this.existingTargetTable.isPresent() ? Maps.newHashMap(
HiveUtils.getPartitionsMap(multiClient.getClient(target_client),
this.existingTargetTable.get(), this.partitionFilter, this.hivePartitionExtendedFilter))
: Maps.<List<String>, Partition> newHashMap();
} else {
this.sourcePartitions = Maps.newHashMap();
this.targetPartitions = Maps.newHashMap();
}
} catch (TException te) {
closer.close();
throw new IOException("Failed to generate work units for table " + dataset.table.getCompleteName(), te);
}
}
}
/**
 * See {@link #getCopyEntities(CopyConfiguration, Comparator, PushDownRequestor)}. This overload applies no
 * prioritizer and pushes down no requestor.
 */
Iterator<FileSet<CopyEntity>> getCopyEntities(CopyConfiguration configuration) throws IOException {
return getCopyEntities(configuration, null, null);
}
/**
* Finds all files read by the table and generates {@link CopyEntity}s for duplicating the table. The semantics are as follows:
* 1. Find all valid {@link org.apache.hadoop.hive.metastore.api.StorageDescriptor}s. If the table is partitioned, the
* {@link org.apache.hadoop.hive.metastore.api.StorageDescriptor} of the base
* table will be ignored, and we will instead process the {@link org.apache.hadoop.hive.metastore.api.StorageDescriptor} of each partition.
* 2. For each {@link org.apache.hadoop.hive.metastore.api.StorageDescriptor}, find all files referred to by it.
* 3. Generate a {@link CopyableFile} for each file referred to by a {@link org.apache.hadoop.hive.metastore.api.StorageDescriptor}.
* 4. If the table is partitioned, create a file set for each partition.
* 5. Create work units for registering, deregistering partitions / tables, and deleting unnecessary files in the target.
*
* For computation of target locations see {@link HiveTargetPathHelper#getTargetPath}
*/
Iterator<FileSet<CopyEntity>> getCopyEntities(CopyConfiguration configuration, Comparator<FileSet<CopyEntity>> prioritizer,
PushDownRequestor<FileSet<CopyEntity>> requestor) throws IOException {
if (HiveUtils.isPartitioned(this.dataset.table)) {
return new PartitionIterator(this.sourcePartitions, configuration, prioritizer, requestor);
} else {
FileSet<CopyEntity> fileSet = new UnpartitionedTableFileSet(this.dataset.table.getCompleteName(), this.dataset, this);
return Iterators.singletonIterator(fileSet);
}
}
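// Usage sketch (hypothetical driver code; in practice HiveDataset constructs this helper when it
// generates file sets for distcp):
//   HiveCopyEntityHelper helper = new HiveCopyEntityHelper(hiveDataset, copyConfiguration, targetFs);
//   Iterator<FileSet<CopyEntity>> fileSets = helper.getCopyEntities(copyConfiguration);
//   while (fileSets.hasNext()) {
//     FileSet<CopyEntity> fileSet = fileSets.next(); // one partition, or the whole unpartitioned table
//   }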
/**
* An iterator producing a {@link FileSet} of {@link CopyEntity} for each partition in this table. Files are not
* scanned, nor is a {@link FileSet} materialized, until {@link #next} is called.
*/
private class PartitionIterator implements Iterator<FileSet<CopyEntity>> {
static final String DEREGISTER_FILE_SET = "deregister";
private final List<FileSet<CopyEntity>> allFileSets;
private final Iterator<FileSet<CopyEntity>> fileSetIterator;
public PartitionIterator(Map<List<String>, Partition> partitionMap, CopyConfiguration configuration,
Comparator<FileSet<CopyEntity>> prioritizer, PushDownRequestor<FileSet<CopyEntity>> requestor) {
this.allFileSets = generateAllFileSets(partitionMap);
for (FileSet<CopyEntity> fileSet : this.allFileSets) {
fileSet.setRequestor(requestor);
}
if (prioritizer != null) {
Collections.sort(this.allFileSets, prioritizer);
}
this.fileSetIterator = this.allFileSets.iterator();
}
@Override
public boolean hasNext() {
return this.fileSetIterator.hasNext();
}
@Override
public FileSet<CopyEntity> next() {
return this.fileSetIterator.next();
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
private List<FileSet<CopyEntity>> generateAllFileSets(Map<List<String>, Partition> partitionMap) {
List<FileSet<CopyEntity>> fileSets = Lists.newArrayList();
for (Map.Entry<List<String>, Partition> partition : partitionMap.entrySet()) {
fileSets.add(fileSetForPartition(partition.getValue()));
HiveCopyEntityHelper.this.targetPartitions.remove(partition.getKey());
}
if (!HiveCopyEntityHelper.this.targetPartitions.isEmpty()) {
fileSets.add(new HivePartitionsDeregisterFileSet(
HiveCopyEntityHelper.this.dataset.getTable().getCompleteName() + DEREGISTER_FILE_SET,
HiveCopyEntityHelper.this.dataset, HiveCopyEntityHelper.this.targetPartitions.values(), HiveCopyEntityHelper.this));
}
return fileSets;
}
private FileSet<CopyEntity> fileSetForPartition(final Partition partition) {
return new HivePartitionFileSet(HiveCopyEntityHelper.this, partition, HiveCopyEntityHelper.this.dataset.getProperties());
}
}
private Table getTargetTable(Table originTable, Path targetLocation) throws IOException {
try {
Table targetTable = originTable.copy();
targetTable.setDbName(this.targetDatabase);
targetTable.setDataLocation(targetLocation);
/*
* Need to set the table owner as the flow executor
*/
targetTable.setOwner(UserGroupInformation.getCurrentUser().getShortUserName());
targetTable.getTTable().putToParameters(HiveDataset.REGISTERER, GOBBLIN_DISTCP);
targetTable.getTTable().putToParameters(HiveDataset.REGISTRATION_GENERATION_TIME_MILLIS,
Long.toString(this.startTime));
targetTable.getTTable().unsetCreateTime();
HiveAvroCopyEntityHelper.updateTableAttributesIfAvro(targetTable, this);
return targetTable;
} catch (HiveException he) {
throw new IOException(he);
}
}
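/** Adds {@link PostPublishStep}s that deregister, and depending on {@link DeregisterFileDeleteMethod} delete the
 * files of, an existing target partition. Returns the next available step priority. */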
int addPartitionDeregisterSteps(List<CopyEntity> copyEntities, String fileSet, int initialPriority,
Table table, Partition partition) throws IOException {
int stepPriority = initialPriority;
Collection<Path> partitionPaths = Lists.newArrayList();
if (this.deleteMethod == DeregisterFileDeleteMethod.RECURSIVE) {
partitionPaths = Lists.newArrayList(partition.getDataLocation());
} else if (this.deleteMethod == DeregisterFileDeleteMethod.INPUT_FORMAT) {
InputFormat<?, ?> inputFormat = HiveUtils.getInputFormat(partition.getTPartition().getSd());
HiveLocationDescriptor targetLocation = new HiveLocationDescriptor(partition.getDataLocation(), inputFormat,
this.targetFs, this.dataset.getProperties());
partitionPaths = targetLocation.getPaths().keySet();
} else if (this.deleteMethod == DeregisterFileDeleteMethod.NO_DELETE) {
partitionPaths = Lists.newArrayList();
}
if (!partitionPaths.isEmpty()) {
DeleteFileCommitStep deletePaths = DeleteFileCommitStep.fromPaths(this.targetFs, partitionPaths,
this.dataset.getProperties(), table.getDataLocation());
copyEntities.add(new PostPublishStep(fileSet, Maps.<String, String> newHashMap(), deletePaths, stepPriority++));
}
PartitionDeregisterStep deregister =
new PartitionDeregisterStep(table.getTTable(), partition.getTPartition(), this.targetURI, this.hiveRegProps);
copyEntities.add(new PostPublishStep(fileSet, Maps.<String, String> newHashMap(), deregister, stepPriority++));
return stepPriority;
}
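/** Adds {@link PostPublishStep}s that deregister, and depending on {@link DeregisterFileDeleteMethod} delete the
 * files of, an existing target table. Returns the next available step priority. */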
@VisibleForTesting
protected int addTableDeregisterSteps(List<CopyEntity> copyEntities, String fileSet, int initialPriority, Table table)
throws IOException {
int stepPriority = initialPriority;
Collection<Path> tablePaths = Lists.newArrayList();
switch (this.getDeleteMethod()) {
case RECURSIVE:
tablePaths = Lists.newArrayList(table.getDataLocation());
break;
case INPUT_FORMAT:
InputFormat<?, ?> inputFormat = HiveUtils.getInputFormat(table.getSd());
HiveLocationDescriptor targetLocation = new HiveLocationDescriptor(table.getDataLocation(), inputFormat,
this.getTargetFs(), this.getDataset().getProperties());
tablePaths = targetLocation.getPaths().keySet();
break;
case NO_DELETE:
tablePaths = Lists.newArrayList();
break;
default:
tablePaths = Lists.newArrayList();
}
if (!tablePaths.isEmpty()) {
DeleteFileCommitStep deletePaths = DeleteFileCommitStep.fromPaths(this.getTargetFs(), tablePaths,
this.getDataset().getProperties(), table.getDataLocation());
copyEntities.add(new PostPublishStep(fileSet, Maps.<String, String> newHashMap(), deletePaths, stepPriority++));
}
TableDeregisterStep deregister =
new TableDeregisterStep(table.getTTable(), this.getTargetURI(), this.getHiveRegProps());
copyEntities.add(new PostPublishStep(fileSet, Maps.<String, String> newHashMap(), deregister, stepPriority++));
return stepPriority;
}
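/** Adds steps shared by all file sets of this table, currently just the table registration step. */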
int addSharedSteps(List<CopyEntity> copyEntities, String fileSet, int initialPriority) {
int priority = initialPriority;
if (this.tableRegistrationStep.isPresent()) {
copyEntities.add(new PostPublishStep(fileSet, Maps.<String, String> newHashMap(), this.tableRegistrationStep.get(),
priority++));
}
return priority;
}
/**
* Compares three entities to figure out which files should be copied and which files should be deleted in the target
* file system.
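*
* <p>Worked example (hypothetical files): if the source lists {f1, f2}, the existing target partition manages
* {f1, f3}, and the copy of f1 already at the destination is up to date, then f1 is kept, f2 is copied, and f3
* is deleted. Destination files picked up by the new location but managed by neither side either abort the copy
* or are deleted, depending on {@link UnmanagedDataPolicy}.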
* @param sourceLocation Represents the source table or partition.
* @param desiredTargetLocation Represents the new desired table or partition.
* @param currentTargetLocation Represents the corresponding existing table or partition in the target hcat if it exists.
* @param partition If present, contains partition information.
* @param multiTimer Timer used to instrument the stages of the diff.
* @param helper The {@link HiveCopyEntityHelper} driving this copy.
* @return A {@link DiffPathSet} with data on files to copy and delete.
* @throws IOException if the copy of this table / partition should be aborted.
*/
@VisibleForTesting
protected static DiffPathSet fullPathDiff(HiveLocationDescriptor sourceLocation,
HiveLocationDescriptor desiredTargetLocation, Optional<HiveLocationDescriptor> currentTargetLocation,
Optional<Partition> partition, MultiTimingEvent multiTimer, HiveCopyEntityHelper helper) throws IOException {
DiffPathSet.DiffPathSetBuilder builder = DiffPathSet.builder();
multiTimer.nextStage(Stages.SOURCE_PATH_LISTING);
// These are the paths at the source
Map<Path, FileStatus> sourcePaths = sourceLocation.getPaths();
multiTimer.nextStage(Stages.TARGET_EXISTING_PATH_LISTING);
// These are the paths that the existing target table / partition uses now
Map<Path, FileStatus> targetExistingPaths = currentTargetLocation.isPresent()
? currentTargetLocation.get().getPaths() : Maps.<Path, FileStatus> newHashMap();
multiTimer.nextStage(Stages.DESIRED_PATHS_LISTING);
// These are the paths that exist at the destination and the new table / partition would pick up
Map<Path, FileStatus> desiredTargetExistingPaths;
try {
desiredTargetExistingPaths = desiredTargetLocation.getPaths();
} catch (InvalidInputException ioe) {
// Thrown if inputFormat cannot find location in target. Since location doesn't exist, this set is empty.
desiredTargetExistingPaths = Maps.newHashMap();
}
multiTimer.nextStage(Stages.PATH_DIFF);
for (FileStatus sourcePath : sourcePaths.values()) {
// For each source path
Path newPath = helper.getTargetPathHelper().getTargetPath(sourcePath.getPath(), desiredTargetLocation.getFileSystem(), partition, true);
boolean shouldCopy = true;
if (desiredTargetExistingPaths.containsKey(newPath)) {
// If the file exists at the destination, check whether it should be replaced; if not, there is no need to copy it
FileStatus existingTargetStatus = desiredTargetExistingPaths.get(newPath);
if (!helper.shouldReplaceFile(existingTargetStatus, sourcePath)) {
shouldCopy = false;
}
}
if (shouldCopy) {
builder.copyFile(sourcePath);
} else {
// if not copying, we want to keep the file in the target
// at the end of this loop, all files in targetExistingPaths will be marked for deletion, so remove this file
targetExistingPaths.remove(newPath);
desiredTargetExistingPaths.remove(newPath);
}
}
multiTimer.nextStage(Stages.COMPUTE_DELETE_PATHS);
// At this point, targetExistingPaths contains paths managed by the target partition / table that we no longer
// want; mark them for deletion
for (Path delete : targetExistingPaths.keySet()) {
builder.deleteFile(delete);
desiredTargetExistingPaths.remove(delete);
}
// Now desiredTargetExistingPaths contains paths that we don't want, but which are not managed by the existing
// table / partition.
// Ideally, we shouldn't delete them (they're not managed by Hive), and we don't want to pick
// them up in the new table / partition, so if there are any leftover files, we should abort copying
// this table / partition.
if (desiredTargetExistingPaths.size() > 0 && helper.getUnmanagedDataPolicy() != UnmanagedDataPolicy.DELETE_UNMANAGED_DATA) {
throw new IOException(String.format(
"New table / partition would pick up existing, undesired files in target file system. " + "%s, files %s.",
partition.isPresent() ? partition.get().getCompleteName() : helper.getDataset().getTable().getCompleteName(),
Arrays.toString(desiredTargetExistingPaths.keySet().toArray())));
}
// Unless the policy requires us to delete such unmanaged files, in which case we add the leftover files
// to the deletion list.
else if (desiredTargetExistingPaths.size() > 0) {
for (Path delete : desiredTargetExistingPaths.keySet()) {
builder.deleteFile(delete);
}
log.warn(String.format("Un-managed files detected in target file system, however deleting them "
+ "because of the policy: %s Files to be deleted are: %s", UnmanagedDataPolicy.DELETE_UNMANAGED_DATA,
StringUtils.join(desiredTargetExistingPaths.keySet(), ",")));
}
return builder.build();
}
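/**
 * Whether the replacement file should overwrite the reference file: true if their lengths differ or the
 * replacement is newer.
 */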
private static boolean shouldReplaceFile(FileStatus referencePath, FileStatus replacementFile) {
return replacementFile.getLen() != referencePath.getLen()
|| referencePath.getModificationTime() < replacementFile.getModificationTime();
}
private void checkPartitionedTableCompatibility(Table desiredTargetTable, Table existingTargetTable)
throws IOException {
if (!desiredTargetTable.getDataLocation().equals(existingTargetTable.getDataLocation())) {
throw new HiveTableLocationNotMatchException(desiredTargetTable.getDataLocation(),
existingTargetTable.getDataLocation());
}
if (HiveUtils.isPartitioned(desiredTargetTable) != HiveUtils.isPartitioned(existingTargetTable)) {
throw new IOException(String.format(
"%s: Desired target table %s partitioned, existing target table %s partitioned. Tables are incompatible.",
this.dataset.tableIdentifier, HiveUtils.isPartitioned(desiredTargetTable) ? "is" : "is not",
HiveUtils.isPartitioned(existingTargetTable) ? "is" : "is not"));
}
if (desiredTargetTable.isPartitioned()
&& !desiredTargetTable.getPartitionKeys().equals(existingTargetTable.getPartitionKeys())) {
throw new IOException(String.format(
"%s: Desired target table has partition keys %s, existing target table has partition keys %s. "
+ "Tables are incompatible.",
this.dataset.tableIdentifier, gson.toJson(desiredTargetTable.getPartitionKeys()),
gson.toJson(existingTargetTable.getPartitionKeys())));
}
}
/**
* Get builders for a {@link CopyableFile} for each file referred to by a {@link org.apache.hadoop.hive.metastore.api.StorageDescriptor}.
*/
List<CopyableFile.Builder> getCopyableFilesFromPaths(Collection<FileStatus> paths,
CopyConfiguration configuration, Optional<Partition> partition) throws IOException {
List<CopyableFile.Builder> builders = Lists.newArrayList();
List<SourceAndDestination> dataFiles = Lists.newArrayList();
Configuration hadoopConfiguration = new Configuration();
FileSystem actualSourceFs = null;
String referenceScheme = null;
String referenceAuthority = null;
for (FileStatus status : paths) {
dataFiles.add(new SourceAndDestination(status, getTargetPathHelper().getTargetPath(status.getPath(), this.targetFs, partition, true)));
}
for (SourceAndDestination sourceAndDestination : dataFiles) {
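// Re-resolve the source FileSystem only when the scheme or authority changes between consecutive files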
URI uri = sourceAndDestination.getSource().getPath().toUri();
if (actualSourceFs == null || !StringUtils.equals(referenceScheme, uri.getScheme())
|| !StringUtils.equals(referenceAuthority, uri.getAuthority())) {
actualSourceFs = sourceAndDestination.getSource().getPath().getFileSystem(hadoopConfiguration);
referenceScheme = uri.getScheme();
referenceAuthority = uri.getAuthority();
}
if (!this.dataset.getTableRootPath().isPresent()) {
// The logic for computing ancestor owner and permissions for hive copies depends on tables having a non-glob
// location. Currently, this restriction is also imposed by Hive, so this is not a problem. If this ever changes
// on the Hive side, and we try to copy a table with a glob location, this logic will have to change.
throw new IOException(String.format("Table %s does not have a concrete table root path.",
this.dataset.getTable().getCompleteName()));
}
List<OwnerAndPermission> ancestorOwnerAndPermission =
CopyableFile.resolveReplicatedOwnerAndPermissionsRecursively(actualSourceFs,
sourceAndDestination.getSource().getPath().getParent(), this.dataset.getTableRootPath().get().getParent(), configuration);
builders.add(CopyableFile.fromOriginAndDestination(actualSourceFs, sourceAndDestination.getSource(),
sourceAndDestination.getDestination(), configuration).
ancestorsOwnerAndPermission(ancestorOwnerAndPermission));
}
return builders;
}
/**
* Compute the target location for a Hive location.
* @param sourceFs Source {@link FileSystem}.
* @param targetFs Target {@link FileSystem}.
* @param path source {@link Path} in Hive location.
* @param partition partition these paths correspond to.
* @return transformed location in the target.
* @throws IOException if cannot generate a single target location.
*/
Path getTargetLocation(FileSystem sourceFs, FileSystem targetFs, Path path, Optional<Partition> partition)
throws IOException {
return getTargetPathHelper().getTargetPath(path, targetFs, partition, false);
}
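/**
 * Replaces a path prefix. Worked example with hypothetical paths:
 * {@code replacedPrefix(new Path("hdfs://nn/data/db/tbl/f1"), new Path("/data/db"), new Path("/backup/db"))}
 * returns {@code /backup/db/tbl/f1}. The scheme and authority of the source path are dropped before relativizing.
 */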
protected static Path replacedPrefix(Path sourcePath, Path prefixTobeReplaced, Path prefixReplacement) {
Path sourcePathWithoutSchemeAndAuthority = PathUtils.getPathWithoutSchemeAndAuthority(sourcePath);
Preconditions.checkArgument(PathUtils.isAncestor(prefixTobeReplaced, sourcePathWithoutSchemeAndAuthority),
"When replacing prefix, all locations must be descendants of the prefix. "
+ "The prefix: %s, file location: %s.",
prefixTobeReplaced, sourcePathWithoutSchemeAndAuthority);
Path relativePath = PathUtils.relativizePath(sourcePathWithoutSchemeAndAuthority, prefixTobeReplaced);
Path result = new Path(prefixReplacement, relativePath);
return result;
}
public FileSystem getTargetFileSystem() {
return this.targetFs;
}
}