/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.data.management.conversion.hive.converter;
import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Random;
import lombok.extern.slf4j.Slf4j;
import org.apache.avro.Schema;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.thrift.TException;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import gobblin.configuration.WorkUnitState;
import gobblin.converter.Converter;
import gobblin.converter.DataConversionException;
import gobblin.converter.SingleRecordIterable;
import gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset;
import gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset.ConversionConfig;
import gobblin.data.management.conversion.hive.entities.QueryBasedHiveConversionEntity;
import gobblin.data.management.conversion.hive.entities.QueryBasedHivePublishEntity;
import gobblin.data.management.conversion.hive.events.EventWorkunitUtils;
import gobblin.data.management.conversion.hive.query.HiveAvroORCQueryGenerator;
import gobblin.data.management.copy.hive.HiveDatasetFinder;
import gobblin.data.management.copy.hive.HiveUtils;
import gobblin.data.management.copy.hive.WhitelistBlacklist;
import gobblin.hive.HiveMetastoreClientPool;
import gobblin.metrics.event.sla.SlaEventKeys;
import gobblin.util.AutoReturnableObject;
import gobblin.util.HadoopUtils;
/**
* Builds the Hive Avro to ORC conversion queries. The record type for this converter is {@link QueryBasedHiveConversionEntity}. A {@link QueryBasedHiveConversionEntity}
* can represent a Hive table or a Hive partition.
* <p>
* Concrete subclasses define the semantics of Avro to ORC conversion for a specific ORC format by providing {@link ConversionConfig}s.
* </p>
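* <p>
* An illustrative sketch (hypothetical, not an implementation shipped with this class) of a subclass for the
* flattened ORC format, assuming {@link ConvertibleHiveDataset#getConversionConfigForFormat(String)} returns a
* Guava {@link Optional}:
* </p>
* <pre>{@code
* public class FlattenedOrcConverter extends AbstractAvroToOrcConverter {
*   public Schema convertSchema(Schema inputSchema, WorkUnitState workUnit) {
*     return inputSchema; // a real subclass could flatten nested Avro fields here
*   }
*   protected boolean hasConversionConfig() {
*     return this.hiveDataset.getConversionConfigForFormat(OrcFormats.FLATTENED_ORC.getConfigPrefix()).isPresent();
*   }
*   protected ConversionConfig getConversionConfig() {
*     return this.hiveDataset.getConversionConfigForFormat(OrcFormats.FLATTENED_ORC.getConfigPrefix()).get();
*   }
* }
* }</pre>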
*/
@Slf4j
public abstract class AbstractAvroToOrcConverter extends Converter<Schema, Schema, QueryBasedHiveConversionEntity, QueryBasedHiveConversionEntity> {
/***
* Subdirectory within the destination ORC table directory to which data is published
*/
private static final String PUBLISHED_TABLE_SUBDIRECTORY = "final";
private static final String ORC_FORMAT = "orc";
/**
* Hive runtime property key names for tracking
*/
private static final String GOBBLIN_DATASET_URN_KEY = "gobblin.datasetUrn";
private static final String GOBBLIN_PARTITION_NAME_KEY = "gobblin.partitionName";
private static final String GOBBLIN_WORKUNIT_CREATE_TIME_KEY = "gobblin.workunitCreateTime";
/***
* Separators used by Hive
*/
private static final String HIVE_PARTITIONS_INFO = "/";
private static final String HIVE_PARTITIONS_TYPE = ":";
protected final FileSystem fs;
/**
* Supported destination ORC formats
*/
protected enum OrcFormats {
FLATTENED_ORC("flattenedOrc"),
NESTED_ORC("nestedOrc");
private final String configPrefix;
OrcFormats(String configPrefix) {
this.configPrefix = configPrefix;
}
public String getConfigPrefix() {
return this.configPrefix;
}
}
/**
* List of partitions that a partition has replaced, e.g. the list of hourly partitions replaced by a daily partition
*/
public static final String REPLACED_PARTITIONS_HIVE_METASTORE_KEY = "gobblin.replaced.partitions";
/**
* The dataset being converted.
*/
protected ConvertibleHiveDataset hiveDataset;
/**
* If this property is set to true, the group won't be explicitly set on the destination directory permissions.
*/
public static final String HIVE_DATASET_DESTINATION_SKIP_SETGROUP = "hive.dataset.destination.skip.setGroup";
public static final boolean DEFAULT_HIVE_DATASET_DESTINATION_SKIP_SETGROUP = false;
/**
* If set to true, the DDL that sets the storage format is issued separately from the ADD PARTITION DDL
*/
public static final String HIVE_CONVERSION_SETSERDETOAVROEXPLICITELY = "hive.conversion.setSerdeToAvroExplicitly";
public static final boolean DEFAULT_HIVE_CONVERSION_SETSERDETOAVROEXPLICITELY = true;
/***
* Global Hive conversion view registration whitelist / blacklist key
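* For example (illustrative value, assuming the comma-separated {@code db[.table]} pattern syntax used by
* {@link WhitelistBlacklist}): {@code hive.conversion.view.registration.whitelist=mydb.mytable*}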
*/
public static final String HIVE_CONVERSION_VIEW_REGISTRATION_WHITELIST = "hive.conversion.view.registration.whitelist";
public static final String HIVE_CONVERSION_VIEW_REGISTRATION_BLACKLIST = "hive.conversion.view.registration.blacklist";
/**
* Subclasses can convert the {@link Schema} if required.
*
* {@inheritDoc}
* @see gobblin.converter.Converter#convertSchema(java.lang.Object, gobblin.configuration.WorkUnitState)
*/
@Override
public abstract Schema convertSchema(Schema inputSchema, WorkUnitState workUnit);
/**
* <p>
* This method is called by {@link AbstractAvroToOrcConverter#convertRecord(Schema, QueryBasedHiveConversionEntity, WorkUnitState)} before building the
* conversion query. Subclasses can find out if conversion is enabled for their format by calling
* {@link ConvertibleHiveDataset#getConversionConfigForFormat(String)} on the <code>hiveDataset</code>.<br>
* Available ORC formats are defined by the enum {@link OrcFormats}
* </p>
* <p>
* If this method returns false, no Avro to ORC conversion queries will be built for the ORC format.
* </p>
* @return true if conversion is required. false otherwise
*/
protected abstract boolean hasConversionConfig();
/**
* Get the {@link ConversionConfig} required for building the Avro to ORC conversion query
* @return Conversion config
*/
protected abstract ConversionConfig getConversionConfig();
public AbstractAvroToOrcConverter() {
try {
this.fs = FileSystem.get(HadoopUtils.newConfiguration());
} catch (IOException e) {
throw new RuntimeException(e);
}
}
/**
* Populate the Avro to ORC conversion queries. The queries will be added to {@link QueryBasedHiveConversionEntity#getQueries()}
*/
@Override
public Iterable<QueryBasedHiveConversionEntity> convertRecord(Schema outputAvroSchema, QueryBasedHiveConversionEntity conversionEntity, WorkUnitState workUnit)
throws DataConversionException {
Preconditions.checkNotNull(outputAvroSchema, "Avro schema must not be null");
Preconditions.checkNotNull(conversionEntity, "Conversion entity must not be null");
Preconditions.checkNotNull(workUnit, "Workunit state must not be null");
Preconditions.checkNotNull(conversionEntity.getHiveTable(), "Hive table within conversion entity must not be null");
EventWorkunitUtils.setBeginDDLBuildTimeMetadata(workUnit, System.currentTimeMillis());
this.hiveDataset = conversionEntity.getConvertibleHiveDataset();
if (!hasConversionConfig()) {
return new SingleRecordIterable<>(conversionEntity);
}
// Avro table name and location
String avroTableName = conversionEntity.getHiveTable().getTableName();
// ORC table name and location
String orcTableName = getConversionConfig().getDestinationTableName();
String orcStagingTableName = getOrcStagingTableName(getConversionConfig().getDestinationStagingTableName());
String orcTableDatabase = getConversionConfig().getDestinationDbName();
String orcDataLocation = getOrcDataLocation();
String orcStagingDataLocation = getOrcStagingDataLocation(orcStagingTableName);
boolean isEvolutionEnabled = getConversionConfig().isEvolutionEnabled();
Pair<Optional<Table>, Optional<List<Partition>>> destinationMeta = getDestinationTableMeta(orcTableDatabase,
orcTableName, workUnit);
Optional<Table> destinationTableMeta = destinationMeta.getLeft();
// Optional view registration whitelist / blacklist
Optional<WhitelistBlacklist> optionalViewRegistrationWhiteBlacklist = getViewWhiteBackListFromWorkUnit(workUnit);
// wrapperViewName: if specified, a view named 'wrapperViewName' is created over the destination table
// (if it does not already exist)
// isUpdateViewAlwaysEnabled: if false, 'wrapperViewName' is only updated when the schema evolves; if true,
// 'wrapperViewName' is updated on every publish
Optional<String> wrapperViewName = Optional.<String>absent();
if (optionalViewRegistrationWhiteBlacklist.isPresent()) {
wrapperViewName = optionalViewRegistrationWhiteBlacklist.get().acceptTable(orcTableDatabase, orcTableName)
? getConversionConfig().getDestinationViewName() : wrapperViewName;
} else {
wrapperViewName = getConversionConfig().getDestinationViewName();
}
boolean shouldUpdateView = getConversionConfig().isUpdateViewAlwaysEnabled();
// Other properties
Optional<List<String>> clusterBy =
getConversionConfig().getClusterBy().isEmpty()
? Optional.<List<String>> absent()
: Optional.of(getConversionConfig().getClusterBy());
Optional<Integer> numBuckets = getConversionConfig().getNumBuckets();
Optional<Integer> rowLimit = getConversionConfig().getRowLimit();
Properties tableProperties = getConversionConfig().getDestinationTableProperties();
// The partition dir hint helps create different directories for hourly and daily partitions with the same timestamp, such as:
// .. daily_2016-01-01-00 and hourly_2016-01-01-00
// This prevents existing hourly data from being deleted at the time of roll up, so that Hive queries in flight
// .. do not fail
List<String> sourceDataPathIdentifier = getConversionConfig().getSourceDataPathIdentifier();
// Populate optional partition info
Map<String, String> partitionsDDLInfo = Maps.newHashMap();
Map<String, String> partitionsDMLInfo = Maps.newHashMap();
populatePartitionInfo(conversionEntity, partitionsDDLInfo, partitionsDMLInfo);
/*
* Create the ORC data location with the same permissions as the Avro data
*
* Note that Hive can also automatically create the non-existing directories, but it does not
* seem to create them with the desired permissions.
* According to the Hive docs, permissions for newly created directories/files can be controlled using umask, e.g.:
*
* SET hive.warehouse.subdir.inherit.perms=false;
* SET fs.permissions.umask-mode=022;
* Upon testing, however, this did not work
*/
try {
FileStatus sourceDataFileStatus = this.fs.getFileStatus(conversionEntity.getHiveTable().getDataLocation());
FsPermission sourceDataPermission = sourceDataFileStatus.getPermission();
if (!this.fs.mkdirs(new Path(getConversionConfig().getDestinationDataPath()), sourceDataPermission)) {
throw new RuntimeException(String.format("Failed to create path %s with permissions %s", new Path(
getConversionConfig().getDestinationDataPath()), sourceDataPermission));
} else {
this.fs.setPermission(new Path(getConversionConfig().getDestinationDataPath()), sourceDataPermission);
// Set the same group as source
if (!workUnit.getPropAsBoolean(HIVE_DATASET_DESTINATION_SKIP_SETGROUP,
DEFAULT_HIVE_DATASET_DESTINATION_SKIP_SETGROUP)) {
this.fs.setOwner(new Path(getConversionConfig().getDestinationDataPath()), null,
sourceDataFileStatus.getGroup());
}
log.info(String.format("Created %s with permissions %s and group %s", new Path(getConversionConfig()
.getDestinationDataPath()), sourceDataPermission, sourceDataFileStatus.getGroup()));
}
} catch (IOException e) {
Throwables.propagate(e);
}
// Set hive runtime properties from conversion config
for (Map.Entry<Object, Object> entry : getConversionConfig().getHiveRuntimeProperties().entrySet()) {
conversionEntity.getQueries().add(String.format("SET %s=%s", entry.getKey(), entry.getValue()));
}
// Set hive runtime properties for tracking
conversionEntity.getQueries().add(String.format("SET %s=%s", GOBBLIN_DATASET_URN_KEY,
conversionEntity.getHiveTable().getCompleteName()));
if (conversionEntity.getHivePartition().isPresent()) {
conversionEntity.getQueries().add(String.format("SET %s=%s", GOBBLIN_PARTITION_NAME_KEY,
conversionEntity.getHivePartition().get().getCompleteName()));
}
conversionEntity.getQueries().add(String
.format("SET %s=%s", GOBBLIN_WORKUNIT_CREATE_TIME_KEY,
workUnit.getWorkunit().getProp(SlaEventKeys.ORIGIN_TS_IN_MILLI_SECS_KEY)));
// Create DDL statement for table
Map<String, String> hiveColumns = new LinkedHashMap<>();
String createStagingTableDDL =
HiveAvroORCQueryGenerator.generateCreateTableDDL(outputAvroSchema,
orcStagingTableName,
orcStagingDataLocation,
Optional.of(orcTableDatabase),
Optional.of(partitionsDDLInfo),
clusterBy,
Optional.<Map<String, HiveAvroORCQueryGenerator.COLUMN_SORT_ORDER>>absent(),
numBuckets,
Optional.<String>absent(),
Optional.<String>absent(),
Optional.<String>absent(),
tableProperties,
isEvolutionEnabled,
destinationTableMeta,
hiveColumns);
conversionEntity.getQueries().add(createStagingTableDDL);
log.debug("Create staging table DDL: " + createStagingTableDDL);
// Create DDL statement for partition
String orcStagingDataPartitionDirName = getOrcStagingDataPartitionDirName(conversionEntity, sourceDataPathIdentifier);
String orcStagingDataPartitionLocation = orcStagingDataLocation + Path.SEPARATOR + orcStagingDataPartitionDirName;
if (partitionsDMLInfo.size() > 0) {
List<String> createStagingPartitionDDL =
HiveAvroORCQueryGenerator.generateCreatePartitionDDL(orcTableDatabase,
orcStagingTableName,
orcStagingDataPartitionLocation,
partitionsDMLInfo);
conversionEntity.getQueries().addAll(createStagingPartitionDDL);
log.debug("Create staging partition DDL: " + createStagingPartitionDDL);
}
// Create DML statement
String insertInORCStagingTableDML =
HiveAvroORCQueryGenerator
.generateTableMappingDML(conversionEntity.getHiveTable().getAvroSchema(),
outputAvroSchema,
avroTableName,
orcStagingTableName,
Optional.of(conversionEntity.getHiveTable().getDbName()),
Optional.of(orcTableDatabase),
Optional.of(partitionsDMLInfo),
Optional.<Boolean>absent(),
Optional.<Boolean>absent(),
isEvolutionEnabled,
destinationTableMeta,
rowLimit);
conversionEntity.getQueries().add(insertInORCStagingTableDML);
log.debug("Conversion staging DML: " + insertInORCStagingTableDML);
// TODO: Split this method into two (conversion and publish)
// Addition to WUS for Staging publish:
// A. Evolution turned on:
// 1. If table does not exist: simply create it (now it should exist)
// 2. If table exists:
// 2.1 Evolve table (alter table)
// 2.2 If snapshot table:
// 2.2.1 Delete data in final table directory
// 2.2.2 Move data from staging to final table directory
// 2.2.3 Drop this staging table and delete directories
// 2.3 If partitioned table, move partitions from staging to final table; for all partitions:
// 2.3.1 Drop if exists partition in final table
// 2.3.2 Move partition directory
// 2.3.3 Create partition with location
// 2.3.4 Drop this staging table and delete directories
// B. Evolution turned off:
// 1. If table does not exist: simply create it (now it should exist)
// 2. If table exists:
// 2.1 Do not evolve table
// 2.2 If snapshot table:
// 2.2.1 Delete data in final table directory
// 2.2.2 Move data from staging to final table directory
// 2.2.3 Drop this staging table and delete directories
// 2.3 If partitioned table, move partitions from staging to final table; for all partitions:
// 2.3.1 Drop if exists partition in final table
// 2.3.2 Move partition directory
// 2.3.3 Create partition with location
// 2.3.4 Drop this staging table and delete directories
// Note: The queries below also serve as a compatibility check before conversion; an incompatible
// .. schema throws a RuntimeException, hence preventing further execution
QueryBasedHivePublishEntity publishEntity = new QueryBasedHivePublishEntity();
List<String> publishQueries = publishEntity.getPublishQueries();
Map<String, String> publishDirectories = publishEntity.getPublishDirectories();
List<String> cleanupQueries = publishEntity.getCleanupQueries();
List<String> cleanupDirectories = publishEntity.getCleanupDirectories();
// Step:
// A.1, B.1: If table does not exist, simply create it
if (!destinationTableMeta.isPresent()) {
String createTargetTableDDL =
HiveAvroORCQueryGenerator.generateCreateTableDDL(outputAvroSchema,
orcTableName,
orcDataLocation,
Optional.of(orcTableDatabase),
Optional.of(partitionsDDLInfo),
clusterBy,
Optional.<Map<String, HiveAvroORCQueryGenerator.COLUMN_SORT_ORDER>>absent(),
numBuckets,
Optional.<String>absent(),
Optional.<String>absent(),
Optional.<String>absent(),
tableProperties,
isEvolutionEnabled,
destinationTableMeta,
new HashMap<String, String>());
publishQueries.add(createTargetTableDDL);
log.debug("Create final table DDL: " + createTargetTableDDL);
}
// Step:
// A.2.1: If table pre-exists (destinationTableMeta would be present), evolve table
// B.2.1: No-op
List<String> evolutionDDLs = HiveAvroORCQueryGenerator.generateEvolutionDDL(orcStagingTableName,
orcTableName,
Optional.of(orcTableDatabase),
Optional.of(orcTableDatabase),
outputAvroSchema,
isEvolutionEnabled,
hiveColumns,
destinationTableMeta);
log.debug("Evolve final table DDLs: " + evolutionDDLs);
EventWorkunitUtils.setEvolutionMetadata(workUnit, evolutionDDLs);
// View (if present) must be updated if evolution happens
shouldUpdateView |= evolutionDDLs.size() > 0;
publishQueries.addAll(evolutionDDLs);
if (partitionsDDLInfo.size() == 0) {
// Step:
// A.2.2, B.2.2: Snapshot table
// Step:
// A.2.2.1, B.2.2.1: Delete data in final table directory
// A.2.2.2, B.2.2.2: Move data from staging to final table directory
log.info("Snapshot directory to move: " + orcStagingDataLocation + " to: " + orcDataLocation);
publishDirectories.put(orcStagingDataLocation, orcDataLocation);
// Step:
// A.2.2.3, B.2.2.3: Drop this staging table and delete directories
String dropStagingTableDDL = HiveAvroORCQueryGenerator.generateDropTableDDL(orcTableDatabase, orcStagingTableName);
log.debug("Drop staging table DDL: " + dropStagingTableDDL);
cleanupQueries.add(dropStagingTableDDL);
// Delete: orcStagingDataLocation
log.info("Staging table directory to delete: " + orcStagingDataLocation);
cleanupDirectories.add(orcStagingDataLocation);
} else {
// Step:
// A.2.3, B.2.3: If partitioned table, move partitions from staging to final table; for all partitions:
// Step:
// A.2.3.1, B.2.3.1: Drop if exists partition in final table
List<String> dropPartitionsDDL =
HiveAvroORCQueryGenerator.generateDropPartitionsDDL(orcTableDatabase,
orcTableName,
partitionsDMLInfo);
log.debug("Drop partitions if exist in final table: " + dropPartitionsDDL);
publishQueries.addAll(dropPartitionsDDL);
// Step:
// A.2.3.2, B.2.3.2: Move partition directory
// Move: orcStagingDataPartitionLocation to: orcFinalDataPartitionLocation
String orcFinalDataPartitionLocation = orcDataLocation + Path.SEPARATOR + orcStagingDataPartitionDirName;
log.info("Partition directory to move: " + orcStagingDataPartitionLocation + " to: " + orcFinalDataPartitionLocation);
publishDirectories.put(orcStagingDataPartitionLocation, orcFinalDataPartitionLocation);
// Step:
// A.2.3.3, B.2.3.3: Create partition with location (and update storage format if not in ORC already)
String orcDataPartitionLocation = orcDataLocation + Path.SEPARATOR + orcStagingDataPartitionDirName;
if (workUnit.getPropAsBoolean(HIVE_CONVERSION_SETSERDETOAVROEXPLICITELY,
DEFAULT_HIVE_CONVERSION_SETSERDETOAVROEXPLICITELY)) {
List<String> createFinalPartitionDDL =
HiveAvroORCQueryGenerator.generateCreatePartitionDDL(orcTableDatabase,
orcTableName,
orcDataPartitionLocation,
partitionsDMLInfo,
Optional.<String>absent());
log.debug("Create final partition DDL: " + createFinalPartitionDDL);
publishQueries.addAll(createFinalPartitionDDL);
// Updating the storage format non-transactionally is a stop-gap measure until Hive supports transactionally updating
// .. the storage format in the ADD PARTITION command (today it only supports specifying the location)
List<String> updatePartitionStorageFormatDDL =
HiveAvroORCQueryGenerator.generateAlterTableOrPartitionStorageFormatDDL(orcTableDatabase,
orcTableName,
Optional.of(partitionsDMLInfo),
ORC_FORMAT);
log.debug("Update final partition storage format to ORC (if not already in ORC)");
publishQueries.addAll(updatePartitionStorageFormatDDL);
} else {
List<String> createFinalPartitionDDL =
HiveAvroORCQueryGenerator.generateCreatePartitionDDL(orcTableDatabase,
orcTableName,
orcDataPartitionLocation,
partitionsDMLInfo,
Optional.fromNullable(ORC_FORMAT));
log.debug("Create final partition DDL: " + createFinalPartitionDDL);
publishQueries.addAll(createFinalPartitionDDL);
}
// Step:
// A.2.3.4, B.2.3.4: Drop this staging table and delete directories
String dropStagingTableDDL = HiveAvroORCQueryGenerator.generateDropTableDDL(orcTableDatabase, orcStagingTableName);
log.debug("Drop staging table DDL: " + dropStagingTableDDL);
cleanupQueries.add(dropStagingTableDDL);
// Delete: orcStagingDataLocation
log.info("Staging table directory to delete: " + orcStagingDataLocation);
cleanupDirectories.add(orcStagingDataLocation);
}
/*
* Drop the replaced partitions, if any. This is required in case the partition being converted is derived from
* several other partitions. E.g. a daily partition is a replacement of the hourly partitions of the same day. When
* the daily partition is converted to ORC, all of its hourly ORC partitions need to be dropped.
*/
publishQueries.addAll(HiveAvroORCQueryGenerator.generateDropPartitionsDDL(orcTableDatabase,
orcTableName,
getDropPartitionsDDLInfo(conversionEntity)));
/*
* Create or update a view over the ORC table if specified in the config (i.e. the wrapper view name is present in the config)
*/
if (wrapperViewName.isPresent()) {
String viewName = wrapperViewName.get();
List<String> createOrUpdateViewDDLs = HiveAvroORCQueryGenerator.generateCreateOrUpdateViewDDL(orcTableDatabase,
orcTableName, orcTableDatabase, viewName, shouldUpdateView);
log.debug("Create or update View DDLs: " + createOrUpdateViewDDLs);
publishQueries.addAll(createOrUpdateViewDDLs);
}
HiveAvroORCQueryGenerator.serializePublishCommands(workUnit, publishEntity);
log.debug("Publish partition entity: " + publishEntity);
log.debug("Conversion Query " + conversionEntity.getQueries());
EventWorkunitUtils.setEndDDLBuildTimeMetadata(workUnit, System.currentTimeMillis());
return new SingleRecordIterable<>(conversionEntity);
}
/***
* Get the Hive view registration whitelist / blacklist from the Workunit state
* @param workUnit Workunit containing the view whitelist / blacklist properties
* @return Optional {@link WhitelistBlacklist} if the Workunit contains the properties
*/
@VisibleForTesting
public static Optional<WhitelistBlacklist> getViewWhiteBackListFromWorkUnit(WorkUnitState workUnit) {
Optional<WhitelistBlacklist> optionalViewWhiteBlacklist = Optional.absent();
if (workUnit == null) {
return optionalViewWhiteBlacklist;
}
if (workUnit.contains(HIVE_CONVERSION_VIEW_REGISTRATION_WHITELIST)
|| workUnit.contains(HIVE_CONVERSION_VIEW_REGISTRATION_BLACKLIST)) {
String viewWhiteList = workUnit.getProp(HIVE_CONVERSION_VIEW_REGISTRATION_WHITELIST, StringUtils.EMPTY);
String viewBlackList = workUnit.getProp(HIVE_CONVERSION_VIEW_REGISTRATION_BLACKLIST, StringUtils.EMPTY);
try {
optionalViewWhiteBlacklist = Optional.of(new WhitelistBlacklist(viewWhiteList, viewBlackList));
} catch (IOException e) {
Throwables.propagate(e);
}
}
return optionalViewWhiteBlacklist;
}
/***
* Get the staging table name for the current converter. Each converter creates its own staging table.
* @param stagingTableNamePrefix Prefix for the staging table of this converter.
* @return Staging table name.
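* For example (illustrative values), a prefix of {@code orc_staging} may yield {@code orc_staging_14695788000003},
* i.e. the prefix followed by the current epoch millis and a single random digit.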
*/
private String getOrcStagingTableName(String stagingTableNamePrefix) {
int randomNumber = new Random().nextInt(10);
String uniqueStagingTableQualifier = String.format("%s%s", System.currentTimeMillis(), randomNumber);
return stagingTableNamePrefix + "_" + uniqueStagingTableQualifier;
}
/***
* Get the ORC partition directory name of the format: [hourly_][daily_]<partitionSpec1>[partitionSpec ..]
* @param conversionEntity Conversion entity.
* @param sourceDataPathIdentifier Hints to look for in the source partition location, used to prefix the partition
* dir name (e.g. hourly or daily).
* @return Partition directory name.
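* For example (illustrative values), a source partition located under {@code .../hourly/2016/01/01/00} with
* {@code sourceDataPathIdentifier = [hourly, daily]} and partition name {@code datepartition=2016-01-01-00}
* yields {@code hourly_datepartition=2016-01-01-00}.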
*/
private String getOrcStagingDataPartitionDirName(QueryBasedHiveConversionEntity conversionEntity,
List<String> sourceDataPathIdentifier) {
if (conversionEntity.getHivePartition().isPresent()) {
StringBuilder dirNamePrefix = new StringBuilder();
String sourceHivePartitionLocation = conversionEntity.getHivePartition().get().getDataLocation().toString();
if (null != sourceDataPathIdentifier && null != sourceHivePartitionLocation) {
for (String hint : sourceDataPathIdentifier) {
if (sourceHivePartitionLocation.toLowerCase().contains(hint.toLowerCase())) {
dirNamePrefix.append(hint.toLowerCase()).append("_");
}
}
}
return dirNamePrefix + conversionEntity.getHivePartition().get().getName();
} else {
return StringUtils.EMPTY;
}
}
/***
* Get the ORC final table location, of the format: &lt;destination data path&gt;/final
* @return ORC final table location.
*/
private String getOrcDataLocation() {
String orcDataLocation = getConversionConfig().getDestinationDataPath();
return orcDataLocation + Path.SEPARATOR + PUBLISHED_TABLE_SUBDIRECTORY;
}
/***
* Get the ORC staging table location, of the format: &lt;destination data path&gt;/&lt;ORC staging table name&gt;
* @param orcStagingTableName ORC staging table name.
* @return ORC staging table location.
*/
private String getOrcStagingDataLocation(String orcStagingTableName) {
String orcDataLocation = getConversionConfig().getDestinationDataPath();
return orcDataLocation + Path.SEPARATOR + orcStagingTableName;
}
@VisibleForTesting
public static List<Map<String, String>> getDropPartitionsDDLInfo(QueryBasedHiveConversionEntity conversionEntity) {
if (!conversionEntity.getHivePartition().isPresent()) {
return Collections.emptyList();
}
return getDropPartitionsDDLInfo(conversionEntity.getHivePartition().get());
}
/**
* Parse the {@link #REPLACED_PARTITIONS_HIVE_METASTORE_KEY} from partition parameters and return drop-partition info
* for all the partitions to be dropped.
*
* @return A {@link List} of partitions to be dropped. Each element of the list is a {@link Map} from a partition's
* keys to its values.
*
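* For example (illustrative values), a daily partition on a table partitioned by a single {@code datepartition} key,
* with parameter value {@code "2016-01-01-00|2016-01-01-01"}, yields two maps:
* {@code {datepartition=2016-01-01-00}} and {@code {datepartition=2016-01-01-01}} (partitions are separated by
* {@code |}, and partition values within a partition by {@code ,}).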
*/
public static List<Map<String, String>> getDropPartitionsDDLInfo(Partition hivePartition) {
List<Map<String, String>> replacedPartitionsDDLInfo = Lists.newArrayList();
List<FieldSchema> partitionKeys = hivePartition.getTable().getPartitionKeys();
if (StringUtils.isNotBlank(hivePartition.getParameters().get(REPLACED_PARTITIONS_HIVE_METASTORE_KEY))) {
// Partitions are separated by "|"
for (String partitionsInfoString : Splitter.on("|").omitEmptyStrings().split(hivePartition.getParameters().get(REPLACED_PARTITIONS_HIVE_METASTORE_KEY))) {
// Values for a partition are separated by ","
List<String> partitionValues = Splitter.on(",").omitEmptyStrings().trimResults().splitToList(partitionsInfoString);
// Do not drop the partition currently being processed. Sometimes a partition may have replaced another partition with the same values.
if (!partitionValues.equals(hivePartition.getValues())) {
ImmutableMap.Builder<String, String> partitionDDLInfoMap = ImmutableMap.builder();
for (int i = 0; i < partitionKeys.size(); i++) {
partitionDDLInfoMap.put(partitionKeys.get(i).getName(), partitionValues.get(i));
}
replacedPartitionsDDLInfo.add(partitionDDLInfoMap.build());
}
}
}
return replacedPartitionsDDLInfo;
}
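/***
* Populate the partition DDL info (partition key to type) and DML info (partition key to value) for the entity's
* partition, if present.
*
* For example (illustrative values), a partition name of {@code datepartition=2016-01-01-00/region=us} with
* partition column types {@code string:string} yields DDL info {@code {datepartition=string, region=string}} and
* DML info {@code {datepartition=2016-01-01-00, region=us}}.
* @param conversionEntity Conversion entity whose partition (if any) is inspected.
* @param partitionsDDLInfo Map to populate with partition key names and types.
* @param partitionsDMLInfo Map to populate with partition key names and values.
*/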
private void populatePartitionInfo(QueryBasedHiveConversionEntity conversionEntity, Map<String, String> partitionsDDLInfo,
Map<String, String> partitionsDMLInfo) {
String partitionsInfoString = null;
String partitionsTypeString = null;
if (conversionEntity.getHivePartition().isPresent()) {
partitionsInfoString = conversionEntity.getHivePartition().get().getName();
partitionsTypeString = conversionEntity.getHivePartition().get().getSchema().getProperty("partition_columns.types");
}
if (StringUtils.isNotBlank(partitionsInfoString) || StringUtils.isNotBlank(partitionsTypeString)) {
if (StringUtils.isBlank(partitionsInfoString) || StringUtils.isBlank(partitionsTypeString)) {
throw new IllegalArgumentException("Both partitions info and partitions must be present, if one is specified");
}
List<String> pInfo = Splitter.on(HIVE_PARTITIONS_INFO).omitEmptyStrings().trimResults().splitToList(partitionsInfoString);
List<String> pType = Splitter.on(HIVE_PARTITIONS_TYPE).omitEmptyStrings().trimResults().splitToList(partitionsTypeString);
log.debug("PartitionsInfoString: " + partitionsInfoString);
log.debug("PartitionsTypeString: " + partitionsTypeString);
if (pInfo.size() != pType.size()) {
throw new IllegalArgumentException("partitions info and partitions type list should of same size");
}
for (int i = 0; i < pInfo.size(); i++) {
List<String> partitionInfoParts = Splitter.on("=").omitEmptyStrings().trimResults().splitToList(pInfo.get(i));
String partitionType = pType.get(i);
if (partitionInfoParts.size() != 2) {
throw new IllegalArgumentException(
String.format("Partition details should be of the format partitionName=partitionValue. Recieved: %s", pInfo.get(i)));
}
partitionsDDLInfo.put(partitionInfoParts.get(0), partitionType);
partitionsDMLInfo.put(partitionInfoParts.get(0), partitionInfoParts.get(1));
}
}
}
private Pair<Optional<Table>, Optional<List<Partition>>> getDestinationTableMeta(String dbName,
String tableName, WorkUnitState state)
throws DataConversionException {
Optional<Table> table = Optional.<Table>absent();
Optional<List<Partition>> partitions = Optional.<List<Partition>>absent();
try {
HiveMetastoreClientPool pool = HiveMetastoreClientPool.get(state.getJobState().getProperties(),
Optional.fromNullable(state.getJobState().getProp(HiveDatasetFinder.HIVE_METASTORE_URI_KEY)));
try (AutoReturnableObject<IMetaStoreClient> client = pool.getClient()) {
table = Optional.of(client.get().getTable(dbName, tableName));
if (table.isPresent()) {
org.apache.hadoop.hive.ql.metadata.Table qlTable = new org.apache.hadoop.hive.ql.metadata.Table(table.get());
if (HiveUtils.isPartitioned(qlTable)) {
partitions = Optional.of(HiveUtils.getPartitions(client.get(), qlTable, Optional.<String>absent()));
}
}
}
} catch (NoSuchObjectException e) {
return ImmutablePair.of(table, partitions);
} catch (IOException | TException e) {
throw new DataConversionException("Could not fetch destination table metadata", e);
}
return ImmutablePair.of(table, partitions);
}
}