/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.data.management.conversion.hive.query;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import lombok.ToString;
import lombok.extern.slf4j.Slf4j;
import org.apache.avro.AvroRuntimeException;
import org.apache.avro.Schema;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo;
import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import gobblin.configuration.State;
import gobblin.data.management.conversion.hive.entities.QueryBasedHivePublishEntity;
/***
 * Generates Hive DDL and DML queries for converting Avro tables to other Hive storage formats (default: ORC)
*/
@Slf4j
public class HiveAvroORCQueryGenerator {
private static final String SERIALIZED_PUBLISH_TABLE_COMMANDS = "serialized.publish.table.commands";
private static final Gson GSON = new GsonBuilder().setPrettyPrinting().create();
// Table properties keys
public static final String ORC_COMPRESSION_KEY = "orc.compress";
public static final String ORC_ROW_INDEX_STRIDE_KEY = "orc.row.index.stride";
// Default values for Hive DDL / DML query generation
private static final String DEFAULT_DB_NAME = "default";
private static final String DEFAULT_ROW_FORMAT_SERDE = "org.apache.hadoop.hive.ql.io.orc.OrcSerde";
private static final String DEFAULT_ORC_INPUT_FORMAT = "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat";
private static final String DEFAULT_ORC_OUTPUT_FORMAT = "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat";
private static final String DEFAULT_ORC_COMPRESSION = "ZLIB";
private static final String DEFAULT_ORC_ROW_INDEX_STRIDE = "268435456";
private static final Properties DEFAULT_TBL_PROPERTIES = new Properties();
static {
DEFAULT_TBL_PROPERTIES.setProperty(ORC_COMPRESSION_KEY, DEFAULT_ORC_COMPRESSION);
DEFAULT_TBL_PROPERTIES.setProperty(ORC_ROW_INDEX_STRIDE_KEY, DEFAULT_ORC_ROW_INDEX_STRIDE);
}
// Avro to Hive type mapping (Hive 0.12 and above)
private static final Map<Schema.Type, String> AVRO_TO_HIVE_COLUMN_MAPPING_V_12 = ImmutableMap
.<Schema.Type, String>builder()
.put(Schema.Type.NULL, "void")
.put(Schema.Type.BOOLEAN, "boolean")
.put(Schema.Type.INT, "int")
.put(Schema.Type.LONG, "bigint")
.put(Schema.Type.FLOAT, "float")
.put(Schema.Type.DOUBLE, "double")
.put(Schema.Type.BYTES, "binary")
.put(Schema.Type.STRING, "string")
.put(Schema.Type.RECORD, "struct")
.put(Schema.Type.MAP, "map")
.put(Schema.Type.ARRAY, "array")
.put(Schema.Type.UNION, "uniontype")
.put(Schema.Type.ENUM, "string")
.put(Schema.Type.FIXED, "binary")
.build();
// Supported Hive type promotions for schema evolution: each key type may evolve to any type in its value set
private static final Map<String, Set<String>> HIVE_COMPATIBLE_TYPES = ImmutableMap
.<String, Set<String>>builder()
.put("tinyint", ImmutableSet.<String>builder()
.add("smallint", "int", "bigint", "float", "double", "decimal", "string", "varchar").build())
.put("smallint", ImmutableSet.<String>builder().add("int", "bigint", "float", "double", "decimal", "string",
"varchar").build())
.put("int", ImmutableSet.<String>builder().add("bigint", "float", "double", "decimal", "string", "varchar")
.build())
.put("bigint", ImmutableSet.<String>builder().add("float", "double", "decimal", "string", "varchar").build())
.put("float", ImmutableSet.<String>builder().add("double", "decimal", "string", "varchar").build())
.put("double", ImmutableSet.<String>builder().add("decimal", "string", "varchar").build())
.put("decimal", ImmutableSet.<String>builder().add("string", "varchar").build())
.put("string", ImmutableSet.<String>builder().add("double", "decimal", "varchar").build())
.put("varchar", ImmutableSet.<String>builder().add("double", "string", "varchar").build())
.put("timestamp", ImmutableSet.<String>builder().add("string", "varchar").build())
.put("date", ImmutableSet.<String>builder().add("string", "varchar").build())
.put("binary", Sets.<String>newHashSet())
.put("boolean", Sets.<String>newHashSet()).build();
@ToString
public static enum COLUMN_SORT_ORDER {
ASC ("ASC"),
DESC ("DESC");
private final String order;
COLUMN_SORT_ORDER(String s) {
order = s;
}
}
/***
 * Generate DDL query to create a Hive table in a different storage format (default: ORC) for a given Avro schema
 * @param schema Avro schema to use to generate the DDL for the new Hive table
 * @param tblName New Hive table name
 * @param tblLocation New Hive table location
 * @param optionalDbName Optional DB name, if not specified it defaults to 'default'
 * @param optionalPartitionDDLInfo Optional partition info in the form of a map of partition key, partition type pairs.
 *                                 If not specified, the table is assumed to be un-partitioned, i.e. of type snapshot
 * @param optionalClusterInfo Optional cluster info
 * @param optionalSortOrderInfo Optional sort order
 * @param optionalNumOfBuckets Optional number of buckets
 * @param optionalRowFormatSerde Optional row format serde, default is ORC
 * @param optionalInputFormat Optional input format class name, default is ORC
 * @param optionalOutputFormat Optional output format class name, default is ORC
 * @param tableProperties Optional table properties
 * @param isEvolutionEnabled If schema evolution is turned on
 * @param destinationTableMeta Optional destination table metadata
 * @param hiveColumns Map to populate with the generated Hive column names and types, for the caller's reference
 * @return Generated DDL query to create the new Hive table
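 *
 * <p>A minimal sketch of an invocation (the schema JSON, table name, database name, and location below
 * are hypothetical, for illustration only):
 * <pre>{@code
 *   Schema schema = readSchemaFromString(avroSchemaJson);
 *   Map<String, String> hiveColumns = Maps.newLinkedHashMap();
 *   String ddl = generateCreateTableDDL(schema, "event_orc", "/data/event_orc",
 *       Optional.of("events_db"), Optional.<Map<String, String>>absent(),
 *       Optional.<List<String>>absent(), Optional.<Map<String, COLUMN_SORT_ORDER>>absent(),
 *       Optional.<Integer>absent(), Optional.<String>absent(), Optional.<String>absent(),
 *       Optional.<String>absent(), null, true, Optional.<Table>absent(), hiveColumns);
 *   // ddl begins with: CREATE EXTERNAL TABLE IF NOT EXISTS `events_db`.`event_orc` ( ...
 * }</pre>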
*/
public static String generateCreateTableDDL(Schema schema,
String tblName,
String tblLocation,
Optional<String> optionalDbName,
Optional<Map<String, String>> optionalPartitionDDLInfo,
Optional<List<String>> optionalClusterInfo,
Optional<Map<String, COLUMN_SORT_ORDER>> optionalSortOrderInfo,
Optional<Integer> optionalNumOfBuckets,
Optional<String> optionalRowFormatSerde,
Optional<String> optionalInputFormat,
Optional<String> optionalOutputFormat,
Properties tableProperties,
boolean isEvolutionEnabled,
Optional<Table> destinationTableMeta,
Map<String, String> hiveColumns) {
Preconditions.checkNotNull(schema);
Preconditions.checkArgument(StringUtils.isNotBlank(tblName));
Preconditions.checkArgument(StringUtils.isNotBlank(tblLocation));
String dbName = optionalDbName.isPresent() ? optionalDbName.get() : DEFAULT_DB_NAME;
String rowFormatSerde = optionalRowFormatSerde.isPresent() ? optionalRowFormatSerde.get() : DEFAULT_ROW_FORMAT_SERDE;
String inputFormat = optionalInputFormat.isPresent() ? optionalInputFormat.get() : DEFAULT_ORC_INPUT_FORMAT;
String outputFormat = optionalOutputFormat.isPresent() ? optionalOutputFormat.get() : DEFAULT_ORC_OUTPUT_FORMAT;
tableProperties = getTableProperties(tableProperties);
// Start building Hive DDL
// Refer to Hive DDL manual for explanation of clauses:
// https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-Create/Drop/TruncateTable
StringBuilder ddl = new StringBuilder();
// Create statement
ddl.append(String.format("CREATE EXTERNAL TABLE IF NOT EXISTS `%s`.`%s` ", dbName, tblName));
// .. open bracket for CREATE
ddl.append("( \n");
// 1. If evolution is enabled and the destination table does not exist:
// .. use columns from the new schema
//    (evolution does not matter for a new destination table)
// 2. If evolution is enabled and the destination table exists:
// .. use columns from the new schema
//    (ALTER TABLE will be used before moving data from staging to the final table)
// 3. If evolution is disabled and the destination table does not exist:
// .. use columns from the new schema
//    (evolution does not matter for a new destination table)
// 4. If evolution is disabled and the destination table exists:
// .. use columns from the destination schema
if (isEvolutionEnabled || !destinationTableMeta.isPresent()) {
log.info("Generating DDL using source schema");
ddl.append(generateAvroToHiveColumnMapping(schema, Optional.of(hiveColumns), true));
} else {
log.info("Generating DDL using destination schema");
ddl.append(generateDestinationToHiveColumnMapping(Optional.of(hiveColumns), destinationTableMeta.get()));
}
// .. close bracket for CREATE
ddl.append(") \n");
// Partition info
if (optionalPartitionDDLInfo.isPresent() && optionalPartitionDDLInfo.get().size() > 0) {
ddl.append("PARTITIONED BY ( ");
boolean isFirst = true;
Map<String, String> partitionInfoMap = optionalPartitionDDLInfo.get();
for (Map.Entry<String, String> partitionInfo : partitionInfoMap.entrySet()) {
if (isFirst) {
isFirst = false;
} else {
ddl.append(", ");
}
ddl.append(String.format("`%s` %s", partitionInfo.getKey(), partitionInfo.getValue()));
}
ddl.append(" ) \n");
}
if (optionalClusterInfo.isPresent()) {
if (!optionalNumOfBuckets.isPresent()) {
throw new IllegalArgumentException(("CLUSTERED BY requested, but no NUM_BUCKETS specified"));
}
ddl.append("CLUSTERED BY ( ");
boolean isFirst = true;
for (String clusterByCol : optionalClusterInfo.get()) {
if (!hiveColumns.containsKey(clusterByCol)) {
throw new IllegalArgumentException(String.format("Requested CLUSTERED BY column: %s "
+ "is not present in schema", clusterByCol));
}
if (isFirst) {
isFirst = false;
} else {
ddl.append(", ");
}
ddl.append(String.format("`%s`", clusterByCol));
}
ddl.append(" ) ");
if (optionalSortOrderInfo.isPresent() && optionalSortOrderInfo.get().size() > 0) {
Map<String, COLUMN_SORT_ORDER> sortOrderInfoMap = optionalSortOrderInfo.get();
ddl.append("SORTED BY ( ");
isFirst = true;
for (Map.Entry<String, COLUMN_SORT_ORDER> sortOrderInfo : sortOrderInfoMap.entrySet()){
if (!hiveColumns.containsKey(sortOrderInfo.getKey())) {
throw new IllegalArgumentException(String.format(
    "Requested SORTED BY column: %s is not present in schema", sortOrderInfo.getKey()));
}
if (isFirst) {
isFirst = false;
} else {
ddl.append(", ");
}
ddl.append(String.format("`%s` %s", sortOrderInfo.getKey(), sortOrderInfo.getValue()));
}
ddl.append(" ) ");
}
ddl.append(String.format(" INTO %s BUCKETS %n", optionalNumOfBuckets.get()));
} else {
if (optionalSortOrderInfo.isPresent()) {
throw new IllegalArgumentException("SORTED BY requested, but no CLUSTERED BY specified");
}
}
// Row format serde
ddl.append("ROW FORMAT SERDE \n");
ddl.append(String.format(" '%s' %n", rowFormatSerde));
// Stored as ORC
ddl.append("STORED AS INPUTFORMAT \n");
ddl.append(String.format(" '%s' %n", inputFormat));
ddl.append("OUTPUTFORMAT \n");
ddl.append(String.format(" '%s' %n", outputFormat));
// Location
ddl.append("LOCATION \n");
ddl.append(String.format(" '%s' %n", tblLocation));
// Table properties
if (null != tableProperties && tableProperties.size() > 0) {
ddl.append("TBLPROPERTIES ( \n");
boolean isFirst = true;
for (String property : tableProperties.stringPropertyNames()) {
if (isFirst) {
isFirst = false;
} else {
ddl.append(", \n");
}
ddl.append(String.format(" '%s'='%s'", property, tableProperties.getProperty(property)));
}
ddl.append(") \n");
}
return ddl.toString();
}
private static Properties getTableProperties(Properties tableProperties) {
  // Merge user-supplied properties over the defaults into a fresh object, so that callers can
  // neither mutate the shared DEFAULT_TBL_PROPERTIES nor observe it changing underneath them
  Properties mergedProperties = new Properties();
  mergedProperties.putAll(DEFAULT_TBL_PROPERTIES);
  if (null != tableProperties) {
    mergedProperties.putAll(tableProperties);
  }
  return mergedProperties;
}
/***
 * Generate DDL queries to create a Hive partition pointing at a specific location.
* @param dbName Hive database name.
* @param tableName Hive table name.
* @param partitionLocation Physical location of partition.
* @param partitionsDMLInfo Partitions DML info - a map of partition name and partition value.
* @param format Hive partition file format
* @return Commands to create a partition.
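 *
 * <p>A sketch with hypothetical names and a daily partition (all values below are made up):
 * <pre>{@code
 *   List<String> ddls = generateCreatePartitionDDL("events_db", "event_orc",
 *       "/data/event_orc/2016-01-01", ImmutableMap.of("datepartition", "2016-01-01"));
 *   // ddls.get(0): USE events_db
 *   // ddls.get(1): ALTER TABLE `event_orc` ADD IF NOT EXISTS
 *   //              PARTITION (`datepartition`='2016-01-01') LOCATION '/data/event_orc/2016-01-01'
 * }</pre>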
*/
public static List<String> generateCreatePartitionDDL(String dbName, String tableName, String partitionLocation,
Map<String, String> partitionsDMLInfo, Optional<String> format) {
if (null == partitionsDMLInfo || partitionsDMLInfo.size() == 0) {
return Collections.emptyList();
}
// Partition details
StringBuilder partitionSpecs = new StringBuilder();
partitionSpecs.append("PARTITION (");
boolean isFirstPartitionSpec = true;
for (Map.Entry<String, String> partition : partitionsDMLInfo.entrySet()) {
if (isFirstPartitionSpec) {
isFirstPartitionSpec = false;
} else {
partitionSpecs.append(", ");
}
partitionSpecs.append(String.format("`%s`='%s'", partition.getKey(), partition.getValue()));
}
partitionSpecs.append(") \n");
// Create statement
List<String> ddls = Lists.newArrayList();
// Note: Hive does not support fully qualified Hive table names such as db.table for ALTER TABLE in v0.13
// .. hence specifying 'USE dbName' as a precursor to the ALTER TABLE statement
// Refer: HIVE-2496
ddls.add(String.format("USE %s%n", dbName));
if (format.isPresent()) {
ddls.add(String
.format("ALTER TABLE `%s` ADD IF NOT EXISTS %s FILEFORMAT %s LOCATION '%s' ", tableName, partitionSpecs,
format.get(), partitionLocation));
} else {
ddls.add(String.format("ALTER TABLE `%s` ADD IF NOT EXISTS %s LOCATION '%s' ", tableName, partitionSpecs,
partitionLocation));
}
return ddls;
}
public static List<String> generateCreatePartitionDDL(String dbName, String tableName, String partitionLocation,
Map<String, String> partitionsDMLInfo) {
return generateCreatePartitionDDL(dbName, tableName, partitionLocation, partitionsDMLInfo,
Optional.<String>absent());
}
/***
* Generate DDL query to drop a Hive table.
* @param dbName Hive database name.
* @param tableName Hive table name.
* @return Command to drop the table.
*/
public static String generateDropTableDDL(String dbName, String tableName) {
return String.format("DROP TABLE IF EXISTS `%s`.`%s`", dbName, tableName);
}
/***
* Adapt Avro schema / types to Hive column types
* @param schema Schema to adapt and generate Hive columns with corresponding types
* @param hiveColumns Optional Map to populate with the generated hive columns for reference of caller
 * @param topLevel If this is the top level of the schema
 * @return Generated Hive columns with types for the given Avro schema
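 *
 * <p>For illustration (hypothetical schema fragment): an Avro field declared as the union
 * {@code ["null", "long"]} is treated as an OPTION and maps to the Hive type {@code bigint},
 * while a true multi-type union such as {@code ["int", "string"]} maps to
 * {@code uniontype<int,string>}:
 * <pre>{@code
 *   // {"type":"record","name":"Event","fields":[
 *   //   {"name":"id","type":["null","long"]},
 *   //   {"name":"tags","type":{"type":"array","items":"string"}}]}
 *   // .. generates the columns: `id` bigint, `tags` array<string>
 * }</pre>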
*/
private static String generateAvroToHiveColumnMapping(Schema schema,
Optional<Map<String, String>> hiveColumns,
boolean topLevel) {
if (topLevel && !schema.getType().equals(Schema.Type.RECORD)) {
throw new IllegalArgumentException(String.format("Schema for table must be of type RECORD. Received type: %s",
schema.getType()));
}
StringBuilder columns = new StringBuilder();
boolean isFirst;
switch (schema.getType()) {
case RECORD:
isFirst = true;
if (topLevel) {
for (Schema.Field field : schema.getFields()) {
if (isFirst) {
isFirst = false;
} else {
columns.append(", \n");
}
String type = generateAvroToHiveColumnMapping(field.schema(), hiveColumns, false);
if (hiveColumns.isPresent()) {
hiveColumns.get().put(field.name(), type);
}
String flattenSource = field.getProp("flatten_source");
if (StringUtils.isBlank(flattenSource)) {
flattenSource = field.name();
}
columns.append(String.format(" `%s` %s COMMENT 'from flatten_source %s'", field.name(), type, flattenSource));
}
} else {
columns.append(AVRO_TO_HIVE_COLUMN_MAPPING_V_12.get(schema.getType())).append("<");
for (Schema.Field field : schema.getFields()) {
if (isFirst) {
isFirst = false;
} else {
columns.append(",");
}
String type = generateAvroToHiveColumnMapping(field.schema(), hiveColumns, false);
columns.append("`").append(field.name()).append("`").append(":").append(type);
}
columns.append(">");
}
break;
case UNION:
Optional<Schema> optionalType = isOfOptionType(schema);
if (optionalType.isPresent()) {
Schema optionalTypeSchema = optionalType.get();
columns.append(generateAvroToHiveColumnMapping(optionalTypeSchema, hiveColumns, false));
} else {
columns.append(AVRO_TO_HIVE_COLUMN_MAPPING_V_12.get(schema.getType())).append("<");
isFirst = true;
for (Schema unionMember : schema.getTypes()) {
if (Schema.Type.NULL.equals(unionMember.getType())) {
continue;
}
if (isFirst) {
isFirst = false;
} else {
columns.append(",");
}
columns.append(generateAvroToHiveColumnMapping(unionMember, hiveColumns, false));
}
columns.append(">");
}
break;
case MAP:
columns.append(AVRO_TO_HIVE_COLUMN_MAPPING_V_12.get(schema.getType())).append("<");
columns.append("string,").append(generateAvroToHiveColumnMapping(schema.getValueType(), hiveColumns, false));
columns.append(">");
break;
case ARRAY:
columns.append(AVRO_TO_HIVE_COLUMN_MAPPING_V_12.get(schema.getType())).append("<");
columns.append(generateAvroToHiveColumnMapping(schema.getElementType(), hiveColumns, false));
columns.append(">");
break;
case NULL:
break;
case BYTES:
case DOUBLE:
case ENUM:
case FIXED:
case FLOAT:
case INT:
case LONG:
case STRING:
case BOOLEAN:
columns.append(AVRO_TO_HIVE_COLUMN_MAPPING_V_12.get(schema.getType()));
break;
default:
String exceptionMessage = String.format("DDL query generation failed for \"%s\" ", schema);
log.error(exceptionMessage);
throw new AvroRuntimeException(exceptionMessage);
}
return columns.toString();
}
/***
* Use destination table schema to generate column mapping
* @param hiveColumns Optional Map to populate with the generated hive columns for reference of caller
* @param destinationTableMeta destination table metadata
 * @return Generated Hive columns with types, based on the destination table's schema
*/
private static String generateDestinationToHiveColumnMapping(
Optional<Map<String, String>> hiveColumns,
Table destinationTableMeta) {
StringBuilder columns = new StringBuilder();
boolean isFirst = true;
List<FieldSchema> fieldList = destinationTableMeta.getSd().getCols();
for (FieldSchema field : fieldList) {
if (isFirst) {
isFirst = false;
} else {
columns.append(", \n");
}
String name = field.getName();
String type = escapeHiveType(field.getType());
// Field comments can be null in the metastore; guard against emitting the literal string 'null'
String comment = field.getComment() == null ? StringUtils.EMPTY : field.getComment();
if (hiveColumns.isPresent()) {
hiveColumns.get().put(name, type);
}
columns.append(String.format(" `%s` %s COMMENT '%s'", name, type, comment));
}
return columns.toString();
}
/***
* Escape the Hive nested field names.
* @param type Primitive or nested Hive type.
* @return Escaped Hive nested field.
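 *
 * <p>For example (illustrative): {@code escapeHiveType("struct<from:string,to:string>")} returns
 * {@code struct<`from`:string,`to`:string>}, so that reserved words remain usable as nested field names.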
*/
public static String escapeHiveType(String type) {
TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(type);
// Primitive
if (ObjectInspector.Category.PRIMITIVE.equals(typeInfo.getCategory())) {
return type;
}
// List
else if (ObjectInspector.Category.LIST.equals(typeInfo.getCategory())) {
ListTypeInfo listTypeInfo = (ListTypeInfo) typeInfo;
return serdeConstants.LIST_TYPE_NAME + "<"
    + escapeHiveType(listTypeInfo.getListElementTypeInfo().getTypeName()) + ">";
}
// Map
else if (ObjectInspector.Category.MAP.equals(typeInfo.getCategory())) {
MapTypeInfo mapTypeInfo = (MapTypeInfo) typeInfo;
return serdeConstants.MAP_TYPE_NAME + "<"
    + escapeHiveType(mapTypeInfo.getMapKeyTypeInfo().getTypeName()) + ","
    + escapeHiveType(mapTypeInfo.getMapValueTypeInfo().getTypeName()) + ">";
}
// Struct
else if (ObjectInspector.Category.STRUCT.equals(typeInfo.getCategory())) {
StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo;
List<String> allStructFieldNames = structTypeInfo.getAllStructFieldNames();
List<TypeInfo> allStructFieldTypeInfos = structTypeInfo.getAllStructFieldTypeInfos();
StringBuilder sb = new StringBuilder();
sb.append(serdeConstants.STRUCT_TYPE_NAME + "<");
for (int i = 0; i < allStructFieldNames.size(); i++) {
if (i > 0) {
sb.append(",");
}
sb.append("`");
sb.append(allStructFieldNames.get(i));
sb.append("`");
sb.append(":");
sb.append(escapeHiveType(allStructFieldTypeInfos.get(i).getTypeName()));
}
sb.append(">");
return sb.toString();
}
// Union
else if (ObjectInspector.Category.UNION.equals(typeInfo.getCategory())) {
UnionTypeInfo unionTypeInfo = (UnionTypeInfo) typeInfo;
List<TypeInfo> allUnionObjectTypeInfos = unionTypeInfo.getAllUnionObjectTypeInfos();
StringBuilder sb = new StringBuilder();
sb.append(serdeConstants.UNION_TYPE_NAME + "<");
for (int i = 0; i < allUnionObjectTypeInfos.size(); i++) {
if (i > 0) {
sb.append(",");
}
sb.append(escapeHiveType(allUnionObjectTypeInfos.get(i).getTypeName()));
}
sb.append(">");
return sb.toString();
} else {
throw new RuntimeException("Unknown type encountered: " + type);
}
}
/***
 * Check if the Avro Schema is of type OPTION,
 * i.e. [null, TYPE] or [TYPE, null]
 * @param schema Avro Schema to check
 * @return The non-null member Schema if the schema is of type OPTION, absent otherwise
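 *
 * <p>For example (illustrative): the union {@code ["null", "string"]} yields the string schema,
 * whereas {@code ["int", "string"]} and unions with more than two members yield absent.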
*/
private static Optional<Schema> isOfOptionType(Schema schema) {
Preconditions.checkNotNull(schema);
// If not of type UNION, it can't be an OPTION
if (!Schema.Type.UNION.equals(schema.getType())) {
return Optional.<Schema>absent();
}
// Unless it has exactly two members, it can't be an OPTION
List<Schema> types = schema.getTypes();
if (null != types && types.size() == 2) {
Schema first = types.get(0);
Schema second = types.get(1);
// One member should be of type NULL and other of non NULL type
if (Schema.Type.NULL.equals(first.getType()) && !Schema.Type.NULL.equals(second.getType())) {
return Optional.of(second);
} else if (!Schema.Type.NULL.equals(first.getType()) && Schema.Type.NULL.equals(second.getType())) {
return Optional.of(first);
}
}
return Optional.<Schema>absent();
}
/***
* Generate DML mapping query to populate output schema table by selecting from input schema table
 * This method assumes that each output schema field has a corresponding source input table field reference
 * .. in the form of a 'flatten_source' property
* @param inputAvroSchema Input schema that was used to obtain output schema (next argument)
* @param outputOrcSchema Output schema (flattened or nested) that was generated using input schema
* .. and has lineage information compatible with input schema
* @param inputTblName Input table name
* @param outputTblName Output table name
* @param optionalInputDbName Optional input DB name, if not specified it will default to 'default'
* @param optionalOutputDbName Optional output DB name, if not specified it will default to 'default'
* @param optionalPartitionDMLInfo Optional partition info in form of map of partition key, partition value pairs
* @param optionalOverwriteTable Optional overwrite table, if not specified it is set to true
* @param optionalCreateIfNotExists Optional create if not exists, if not specified it is set to false
* @param isEvolutionEnabled If schema evolution is turned on
* @param destinationTableMeta Optional destination table metadata
* @param rowLimit Optional row limit
* @return DML query
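 *
 * <p>A sketch of an invocation (schemas, names, and partition values below are hypothetical):
 * <pre>{@code
 *   String dml = generateTableMappingDML(inputAvroSchema, outputOrcSchema,
 *       "event_avro", "event_orc", Optional.of("events_db"), Optional.of("events_db"),
 *       Optional.of(ImmutableMap.of("datepartition", "2016-01-01")),
 *       Optional.<Boolean>absent(), Optional.<Boolean>absent(),
 *       true, Optional.<Table>absent(), Optional.<Integer>absent());
 *   // dml is roughly:
 *   // INSERT OVERWRITE TABLE `events_db`.`event_orc`
 *   // PARTITION (`datepartition`='2016-01-01')
 *   // SELECT `id`, ... FROM `events_db`.`event_avro` WHERE `datepartition`='2016-01-01'
 * }</pre>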
*/
public static String generateTableMappingDML(Schema inputAvroSchema,
Schema outputOrcSchema,
String inputTblName,
String outputTblName,
Optional<String> optionalInputDbName,
Optional<String> optionalOutputDbName,
Optional<Map<String, String>> optionalPartitionDMLInfo,
Optional<Boolean> optionalOverwriteTable,
Optional<Boolean> optionalCreateIfNotExists,
boolean isEvolutionEnabled,
Optional<Table> destinationTableMeta,
Optional<Integer> rowLimit) {
Preconditions.checkNotNull(inputAvroSchema);
Preconditions.checkNotNull(outputOrcSchema);
Preconditions.checkArgument(StringUtils.isNotBlank(inputTblName));
Preconditions.checkArgument(StringUtils.isNotBlank(outputTblName));
String inputDbName = optionalInputDbName.isPresent() ? optionalInputDbName.get() : DEFAULT_DB_NAME;
String outputDbName = optionalOutputDbName.isPresent() ? optionalOutputDbName.get() : DEFAULT_DB_NAME;
boolean shouldOverwriteTable = optionalOverwriteTable.or(true);
boolean shouldCreateIfNotExists = optionalCreateIfNotExists.or(false);
log.debug("Input Schema: " + inputAvroSchema.toString());
log.debug("Output Schema: " + outputOrcSchema.toString());
// Start building Hive DML
// Refer to Hive DDL manual for explanation of clauses:
// https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DML#LanguageManualDML-InsertingdataintoHiveTablesfromqueries
StringBuilder dmlQuery = new StringBuilder();
// Insert query
if (shouldOverwriteTable) {
dmlQuery.append(String.format("INSERT OVERWRITE TABLE `%s`.`%s` %n", outputDbName, outputTblName));
} else {
dmlQuery.append(String.format("INSERT INTO TABLE `%s`.`%s` %n", outputDbName, outputTblName));
}
// Partition details
if (optionalPartitionDMLInfo.isPresent()) {
if (optionalPartitionDMLInfo.get().size() > 0) {
dmlQuery.append("PARTITION (");
boolean isFirstPartitionSpec = true;
for (Map.Entry<String, String> partition : optionalPartitionDMLInfo.get().entrySet()) {
if (isFirstPartitionSpec) {
isFirstPartitionSpec = false;
} else {
dmlQuery.append(", ");
}
dmlQuery.append(String.format("`%s`='%s'", partition.getKey(), partition.getValue()));
}
dmlQuery.append(") \n");
}
}
// If not exists
if (shouldCreateIfNotExists) {
dmlQuery.append(" IF NOT EXISTS \n");
}
// Select query
dmlQuery.append("SELECT \n");
// 1. If evolution is enabled and the destination table does not exist:
// .. use columns from the new schema
//    (evolution does not matter for a new destination table)
// 2. If evolution is enabled and the destination table exists:
// .. use columns from the new schema
//    (ALTER TABLE will be used before moving data from staging to the final table)
// 3. If evolution is disabled and the destination table does not exist:
// .. use columns from the new schema
//    (evolution does not matter for a new destination table)
// 4. If evolution is disabled and the destination table exists:
// .. use columns from the destination schema
if (isEvolutionEnabled || !destinationTableMeta.isPresent()) {
log.info("Generating DML using source schema");
boolean isFirst = true;
List<Schema.Field> fieldList = outputOrcSchema.getFields();
for (Schema.Field field : fieldList) {
String flattenSource = field.getProp("flatten_source");
String colName;
if (StringUtils.isNotBlank(flattenSource)) {
colName = flattenSource;
} else {
colName = field.name();
}
// Escape the column name
colName = colName.replaceAll("\\.", "`.`");
if (isFirst) {
isFirst = false;
} else {
dmlQuery.append(", \n");
}
dmlQuery.append(String.format(" `%s`", colName));
}
} else {
log.info("Generating DML using destination schema");
boolean isFirst = true;
List<FieldSchema> fieldList = destinationTableMeta.get().getSd().getCols();
for (FieldSchema field : fieldList) {
String colName = StringUtils.EMPTY;
if (field.isSetComment() && field.getComment().startsWith("from flatten_source ")) {
// Retrieve the source column (flatten_source) from the comment
colName = field.getComment().replaceAll("from flatten_source ", "").trim();
} else {
// Otherwise, find the field in the flattened schema
List<Schema.Field> evolvedFieldList = outputOrcSchema.getFields();
for (Schema.Field evolvedField : evolvedFieldList) {
if (evolvedField.name().equalsIgnoreCase(field.getName())) {
String flattenSource = evolvedField.getProp("flatten_source");
if (StringUtils.isNotBlank(flattenSource)) {
colName = flattenSource;
} else {
colName = evolvedField.name();
}
break;
}
}
}
// Escape the column name
colName = colName.replaceAll("\\.", "`.`");
// colName can be blank if it is deleted in new evolved schema, so we shouldn't try to fetch it
if (StringUtils.isNotBlank(colName)) {
if (isFirst) {
isFirst = false;
} else {
dmlQuery.append(", \n");
}
dmlQuery.append(String.format(" `%s`", colName));
}
}
}
dmlQuery.append(String.format(" %n FROM `%s`.`%s` ", inputDbName, inputTblName));
// Partition details
if (optionalPartitionDMLInfo.isPresent()) {
if (optionalPartitionDMLInfo.get().size() > 0) {
dmlQuery.append("WHERE ");
boolean isFirstPartitionSpec = true;
for (Map.Entry<String, String> partition : optionalPartitionDMLInfo.get().entrySet()) {
if (isFirstPartitionSpec) {
isFirstPartitionSpec = false;
} else {
dmlQuery.append(" AND ");
}
dmlQuery.append(String.format("`%s`='%s'",
partition.getKey(), partition.getValue()));
}
dmlQuery.append(" \n");
}
}
// Limit clause
if (rowLimit.isPresent()) {
dmlQuery.append(String.format("LIMIT %s", rowLimit.get()));
}
return dmlQuery.toString();
}
public static Schema readSchemaFromString(String schemaStr)
throws IOException {
return new Schema.Parser().parse(schemaStr);
}
/***
* Generate DDLs to evolve final destination table.
* @param stagingTableName Staging table.
* @param finalTableName Un-evolved final destination table.
 * @param optionalStagingDbName Optional staging database name, defaults to 'default'.
 * @param optionalFinalDbName Optional final database name, defaults to 'default'.
* @param evolvedSchema Evolved Avro Schema.
* @param isEvolutionEnabled Is schema evolution enabled.
* @param evolvedColumns Evolved columns in Hive format.
* @param destinationTableMeta Destination table metadata.
* @return DDLs to evolve final destination table.
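 *
 * <p>A sketch with hypothetical names and in-scope variables, assuming the destination table lacks
 * an evolved column {@code context} of type {@code string}:
 * <pre>{@code
 *   List<String> ddls = generateEvolutionDDL("event_orc_staging", "event_orc",
 *       Optional.of("events_db"), Optional.of("events_db"), evolvedSchema, true,
 *       evolvedColumns, destinationTableMeta);
 *   // ddls would contain:
 *   // USE events_db
 *   // ALTER TABLE `event_orc` ADD COLUMNS (context string COMMENT 'from flatten_source context')
 * }</pre>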
*/
public static List<String> generateEvolutionDDL(String stagingTableName,
String finalTableName,
Optional<String> optionalStagingDbName,
Optional<String> optionalFinalDbName,
Schema evolvedSchema,
boolean isEvolutionEnabled,
Map<String, String> evolvedColumns,
Optional<Table> destinationTableMeta) {
// If schema evolution is disabled, or the destination table does not exist, then do nothing
if (!isEvolutionEnabled || !destinationTableMeta.isPresent()) {
return Collections.emptyList();
}
String stagingDbName = optionalStagingDbName.isPresent() ? optionalStagingDbName.get() : DEFAULT_DB_NAME;
String finalDbName = optionalFinalDbName.isPresent() ? optionalFinalDbName.get() : DEFAULT_DB_NAME;
List<String> ddl = Lists.newArrayList();
// Evolve schema
Table destinationTable = destinationTableMeta.get();
if (destinationTable.getSd().getCols().size() == 0) {
log.warn("Desination Table: " + destinationTable + " does not has column details in StorageDescriptor. "
+ "It is probably of Avro type. Cannot evolve via traditional HQL, so skipping evolution checks.");
return ddl;
}
for (Map.Entry<String, String> evolvedColumn : evolvedColumns.entrySet()) {
// Find evolved column in destination table
boolean found = false;
for (FieldSchema destinationField : destinationTable.getSd().getCols()) {
if (destinationField.getName().equalsIgnoreCase(evolvedColumn.getKey())) {
// If the evolved column is found but its type has changed, evolve it
// .. if the types are incompatible, isTypeEvolved will throw an exception
if (isTypeEvolved(evolvedColumn.getValue(), destinationField.getType())) {
ddl.add(String.format("USE %s%n", finalDbName));
ddl.add(String.format("ALTER TABLE `%s` CHANGE COLUMN %s %s %s COMMENT '%s'",
finalTableName, evolvedColumn.getKey(), evolvedColumn.getKey(), evolvedColumn.getValue(),
destinationField.getComment()));
}
found = true;
break;
}
}
if (!found) {
// If the evolved column is not found, i.e. it's new, add this column
String flattenSource = evolvedSchema.getField(evolvedColumn.getKey()).getProp("flatten_source");
if (StringUtils.isBlank(flattenSource)) {
flattenSource = evolvedSchema.getField(evolvedColumn.getKey()).name();
}
// Note: Hive does not support fully qualified Hive table names such as db.table for ALTER TABLE in v0.13
// .. hence specifying 'USE dbName' as a precursor to the ALTER TABLE statement
// Refer: HIVE-2496
ddl.add(String.format("USE %s%n", finalDbName));
ddl.add(String.format("ALTER TABLE `%s` ADD COLUMNS (%s %s COMMENT 'from flatten_source %s')",
finalTableName, evolvedColumn.getKey(), evolvedColumn.getValue(), flattenSource));
}
}
return ddl;
}
/**
* Generate DDL for dropping partitions of a table.
* <p>
* ALTER TABLE finalTableName DROP IF EXISTS PARTITION partition_spec, PARTITION partition_spec, ...;
* </p>
 * @param dbName Database where <code>finalTableName</code> resides
 * @param finalTableName Table name where partitions are dropped
* @param partitionsDMLInfo Partitions to be dropped
* @return DDL to drop partitions in <code>finalTableName</code>
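 *
 * <p>For illustration (hypothetical names):
 * <pre>{@code
 *   generateDropPartitionsDDL("events_db", "event_orc", ImmutableMap.of("datepartition", "2016-01-01"));
 *   // Returns: USE events_db
 *   //          ALTER TABLE event_orc DROP IF EXISTS PARTITION (`datepartition`='2016-01-01')
 * }</pre>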
*/
public static List<String> generateDropPartitionsDDL(final String dbName, final String finalTableName,
final Map<String, String> partitionsDMLInfo) {
if (null == partitionsDMLInfo || partitionsDMLInfo.isEmpty()) {
return Collections.emptyList();
}
// Partition details
StringBuilder partitionSpecs = new StringBuilder();
partitionSpecs.append("PARTITION (");
boolean isFirstPartitionSpec = true;
for (Map.Entry<String, String> partition : partitionsDMLInfo.entrySet()) {
if (isFirstPartitionSpec) {
isFirstPartitionSpec = false;
} else {
partitionSpecs.append(", ");
}
partitionSpecs.append(String.format("`%s`='%s'", partition.getKey(), partition.getValue()));
}
partitionSpecs.append(") ");
List<String> ddls = Lists.newArrayList();
// Note: Hive does not support fully qualified Hive table names such as db.table for ALTER TABLE in v0.13
// .. hence specifying 'USE dbName' as a precursor to the ALTER TABLE statement
// Refer: HIVE-2496
ddls.add(String.format("USE %s%n", dbName));
ddls.add(String.format("ALTER TABLE %s DROP IF EXISTS %s", finalTableName, partitionSpecs));
return ddls;
}
/**
* Generate DDL for dropping partitions of a table.
* <p>
* ALTER TABLE finalTableName DROP IF EXISTS PARTITION partition_spec, PARTITION partition_spec, ...;
* </p>
 * @param dbName Database where <code>finalTableName</code> resides
 * @param finalTableName Table name where partitions are dropped
* @param partitionDMLInfos list of Partition to be dropped
* @return DDL to drop partitions in <code>finalTableName</code>
*/
public static List<String> generateDropPartitionsDDL(final String dbName, final String finalTableName,
final List<Map<String, String>> partitionDMLInfos) {
if (null == partitionDMLInfos || partitionDMLInfos.isEmpty()) {
  return Collections.emptyList();
}
List<String> ddls = Lists.newArrayList();
ddls.add(String.format("USE %s%n", dbName));
// Join the partition specs
ddls.add(String.format("ALTER TABLE %s DROP IF EXISTS %s", finalTableName,
Joiner.on(",").join(Iterables.transform(partitionDMLInfos, PARTITION_SPEC_GENERATOR))));
return ddls;
}
/***
* Generate DDL for creating and updating view over a table.
*
* Create view:
* <p>
* CREATE VIEW IF NOT EXISTS db.viewName AS SELECT * FROM db.tableName
* </p>
*
* Update view:
* <p>
* ALTER VIEW db.viewName AS SELECT * FROM db.tableName
* </p>
*
* @param tableDbName Database for the table over which view has to be created.
* @param tableName Table over which view has to be created.
* @param viewDbName Database for the view to be created.
* @param viewName View to be created.
 * @param shouldUpdateView If the view should be forcibly rebuilt.
* @return DDLs to create and / or update view over a table
*/
public static List<String> generateCreateOrUpdateViewDDL(final String tableDbName, final String tableName,
final String viewDbName, final String viewName, final boolean shouldUpdateView) {
Preconditions.checkArgument(StringUtils.isNotBlank(tableName), "Table name should not be empty");
Preconditions.checkArgument(StringUtils.isNotBlank(viewName), "View name should not be empty");
// Resolve defaults
String resolvedTableDbName = (StringUtils.isBlank(tableDbName)) ? DEFAULT_DB_NAME : tableDbName;
String resolvedViewDbName = (StringUtils.isBlank(viewDbName)) ? DEFAULT_DB_NAME : viewDbName;
List<String> ddls = Lists.newArrayList();
// No-op if view already exists
ddls.add(String.format("CREATE VIEW IF NOT EXISTS `%s`.`%s` AS SELECT * FROM `%s`.`%s`",
resolvedViewDbName, viewName,
resolvedTableDbName, tableName));
// This will force a rebuild of the view
if (shouldUpdateView) {
ddls.add(String.format("ALTER VIEW `%s`.`%s` AS SELECT * FROM `%s`.`%s`",
resolvedViewDbName, viewName,
resolvedTableDbName, tableName));
}
return ddls;
}
/***
* Generate DDL for updating file format of table or partition.
 * If the partition spec is absent, a DDL query to change the storage format of the table is generated.
*
* Query syntax:
* <p>
* ALTER TABLE tableName [PARTITION partition_spec] SET FILEFORMAT fileFormat
* </p>
*
* @param dbName Database for the table for which storage format needs to be changed.
* @param tableName Table for which storage format needs to be changed.
* @param partitionsDMLInfo Optional partition spec for which storage format needs to be changed.
* @param format Storage format.
* @return DDL to change storage format for Table or Partition.
*/
public static List<String> generateAlterTableOrPartitionStorageFormatDDL(final String dbName,
final String tableName,
final Optional<Map<String, String>> partitionsDMLInfo,
String format) {
Preconditions.checkArgument(StringUtils.isNotBlank(tableName), "Table name should not be empty");
Preconditions.checkArgument(StringUtils.isNotBlank(format), "Format should not be empty");
// Resolve defaults
String resolvedDbName = (StringUtils.isBlank(dbName)) ? DEFAULT_DB_NAME : dbName;
// Partition details
StringBuilder partitionSpecs = new StringBuilder();
if (partitionsDMLInfo.isPresent()) {
partitionSpecs.append("PARTITION (");
boolean isFirstPartitionSpec = true;
for (Map.Entry<String, String> partition : partitionsDMLInfo.get().entrySet()) {
if (isFirstPartitionSpec) {
isFirstPartitionSpec = false;
} else {
partitionSpecs.append(", ");
}
partitionSpecs.append(String.format("`%s`='%s'", partition.getKey(), partition.getValue()));
}
partitionSpecs.append(") ");
}
List<String> ddls = Lists.newArrayList();
// Note: Hive does not support fully qualified Hive table names such as db.table for ALTER TABLE in v0.13
// .. hence specifying 'USE dbName' as a precursor to the ALTER TABLE statement
// Refer: HIVE-2496
ddls.add(String.format("USE %s%n", resolvedDbName));
ddls.add(String.format("ALTER TABLE %s %s SET FILEFORMAT %s", tableName, partitionSpecs, format));
return ddls;
}
/***
* Serialize a {@link QueryBasedHivePublishEntity} into a {@link State} at {@link #SERIALIZED_PUBLISH_TABLE_COMMANDS}.
* @param state {@link State} to serialize entity into.
* @param queryBasedHivePublishEntity to carry to publisher.
*/
public static void serializePublishCommands(State state, QueryBasedHivePublishEntity queryBasedHivePublishEntity) {
state.setProp(HiveAvroORCQueryGenerator.SERIALIZED_PUBLISH_TABLE_COMMANDS,
GSON.toJson(queryBasedHivePublishEntity));
}
/***
* Deserialize the publish entity from a {@link State} at {@link #SERIALIZED_PUBLISH_TABLE_COMMANDS}.
* @param state {@link State} to look into for serialized entity.
* @return Publish table entity.
*/
public static QueryBasedHivePublishEntity deserializePublishCommands(State state) {
QueryBasedHivePublishEntity queryBasedHivePublishEntity =
GSON.fromJson(state.getProp(HiveAvroORCQueryGenerator.SERIALIZED_PUBLISH_TABLE_COMMANDS),
QueryBasedHivePublishEntity.class);
return queryBasedHivePublishEntity == null ? new QueryBasedHivePublishEntity() : queryBasedHivePublishEntity;
}
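/***
 * Check whether a column type has evolved between the evolved schema and the destination table,
 * and whether Hive supports that evolution.
 * @param evolvedType Column type in the evolved schema.
 * @param destinationType Column type in the existing destination table.
 * @return false if the types are identical, true if the promotion is supported (see HIVE_COMPATIBLE_TYPES).
 * @throws RuntimeException if the type evolution is incompatible.
 */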
public static boolean isTypeEvolved(String evolvedType, String destinationType) {
if (evolvedType.equalsIgnoreCase(destinationType)) {
// Same type, not evolved
return false;
}
// Check whether the destination type can be promoted to the evolved type
if (HIVE_COMPATIBLE_TYPES.containsKey(destinationType)) {
if (HIVE_COMPATIBLE_TYPES.get(destinationType).contains(evolvedType)) {
return true;
} else {
throw new RuntimeException(String.format("Incompatible type evolution from: %s to: %s",
destinationType, evolvedType));
}
} else {
// We assume all complex types are compatible
// TODO: Add compatibility check when ORC evolution supports complex types
return true;
}
}
/**
* Generate partition spec in Hive standard syntax. (partition_column=partition_col_value, partition_column=partition_col_value, ...)
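 * <p>For example (illustrative): {@code ImmutableMap.of("datepartition", "2016-01-01")} becomes
 * {@code PARTITION (datepartition='2016-01-01')}.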
*/
private static final Function<Map<String, String>, String> PARTITION_SPEC_GENERATOR = new Function<Map<String, String>, String>() {
@Override
public String apply(Map<String, String> partitionDMLInfo) {
if (partitionDMLInfo == null) {
return StringUtils.EMPTY;
}
return String.format(" PARTITION (%s)", Joiner.on(",").withKeyValueSeparator("=").join(Maps.transformValues(partitionDMLInfo, QUOTE_PARTITION_VALUES)));
}
};
private static final Function<String, String> QUOTE_PARTITION_VALUES = new Function<String, String>() {
@Override
public String apply(String value) {
return String.format("'%s'", value);
}
};
}