/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.data.management.conversion.hive.converter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import lombok.extern.slf4j.Slf4j;
import org.apache.avro.Schema;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.Order;
import org.apache.hadoop.hive.metastore.api.SerDeInfo;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.Table;
import org.testng.Assert;
import org.testng.annotations.Test;
import com.google.common.base.Optional;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import gobblin.data.management.ConversionHiveTestUtils;
import gobblin.data.management.conversion.hive.query.HiveAvroORCQueryGenerator;
import gobblin.util.AvroFlattener;
/**
 * Tests for {@link HiveAvroORCQueryGenerator} DDL / DML generation with Hive schema evolution
 * enabled or disabled, for both new and pre-existing destination tables.
 */
@Slf4j
@Test(groups = { "gobblin.data.management.conversion" })
public class HiveSchemaEvolutionTest {
  /** Test-resource directory holding the source schema and the expected DDL / DML fixture files. */
  private static final String resourceDir = "avroToOrcSchemaEvolutionTest";
  private static final String schemaName = "sourceSchema";
  private static final String hiveDbName = "hiveDb";
  private static final AvroFlattener avroFlattener = new AvroFlattener();
  /** No row limit is applied to the generated DML in these tests. */
  private static final Optional<Integer> rowLimit = Optional.absent();
  // Assigned once in the static initializer below; left non-final only because they are set inside a try block.
  private static Schema inputSchema;
  private static Schema outputSchema;

  static {
    try {
      inputSchema = ConversionHiveTestUtils.readSchemaFromJsonFile(resourceDir, "source_schema.json");
      // Flatten nested records so the generated ORC table has top-level columns only.
      outputSchema = avroFlattener.flatten(inputSchema, true);
    } catch (IOException e) {
      throw new RuntimeException("Could not initialize tests", e);
    }
  }

  @Test
  public void testEvolutionEnabledForExistingTable() throws IOException {
    boolean isEvolutionEnabled = true;
    Optional<Table> destinationTableMeta = createEvolvedDestinationTable(schemaName, "default", "", true);

    Assert.assertEquals(generateDdl(isEvolutionEnabled, destinationTableMeta),
        ConversionHiveTestUtils.readQueryFromFile(resourceDir, "source_schema_evolution_enabled.ddl"),
        "Generated DDL did not match expected for evolution enabled");

    Assert.assertEquals(generateDml(isEvolutionEnabled, destinationTableMeta),
        ConversionHiveTestUtils.readQueryFromFile(resourceDir, "source_schema_evolution_enabled.dml"),
        "Generated DML did not match expected for evolution enabled");
  }

  @Test
  public void testEvolutionEnabledForNewTable() throws IOException {
    boolean isEvolutionEnabled = true;
    Optional<Table> destinationTableMeta = Optional.absent();

    Assert.assertEquals(generateDdl(isEvolutionEnabled, destinationTableMeta),
        ConversionHiveTestUtils.readQueryFromFile(resourceDir, "source_schema_evolution_enabled.ddl"),
        "Generated DDL did not match expected for evolution enabled");

    Assert.assertEquals(generateDml(isEvolutionEnabled, destinationTableMeta),
        ConversionHiveTestUtils.readQueryFromFile(resourceDir, "source_schema_evolution_enabled.dml"),
        "Generated DML did not match expected for evolution enabled");
  }

  @Test
  public void testEvolutionDisabledForExistingTable() throws IOException {
    boolean isEvolutionEnabled = false;
    Optional<Table> destinationTableMeta = createEvolvedDestinationTable(schemaName, "default", "", true);

    Assert.assertEquals(generateDdl(isEvolutionEnabled, destinationTableMeta),
        ConversionHiveTestUtils.readQueryFromFile(resourceDir, "source_schema_evolution_disabled.ddl"),
        "Generated DDL did not match expected for evolution disabled");

    Assert.assertEquals(generateDml(isEvolutionEnabled, destinationTableMeta),
        ConversionHiveTestUtils.readQueryFromFile(resourceDir, "source_schema_evolution_disabled.dml"),
        "Generated DML did not match expected for evolution disabled");
  }

  @Test
  public void testEvolutionDisabledForNewTable() throws IOException {
    boolean isEvolutionEnabled = false;
    Optional<Table> destinationTableMeta = Optional.absent();

    // NOTE: with no pre-existing destination table the full flattened schema is used, so the
    // generated queries intentionally match the evolution-*enabled* fixtures even though
    // evolution is disabled here.
    Assert.assertEquals(generateDdl(isEvolutionEnabled, destinationTableMeta),
        ConversionHiveTestUtils.readQueryFromFile(resourceDir, "source_schema_evolution_enabled.ddl"),
        "Generated DDL did not match expected for evolution disabled");

    Assert.assertEquals(generateDml(isEvolutionEnabled, destinationTableMeta),
        ConversionHiveTestUtils.readQueryFromFile(resourceDir, "source_schema_evolution_enabled.dml"),
        "Generated DML did not match expected for evolution disabled");
  }

  @Test
  public void testLineageMissing() throws IOException {
    boolean isEvolutionEnabled = false;
    // Destination table columns carry no "from flatten_source ..." comments, i.e. lineage is missing.
    Optional<Table> destinationTableMeta = createEvolvedDestinationTable(schemaName, "default", "", false);

    Assert.assertEquals(generateDdl(isEvolutionEnabled, destinationTableMeta),
        ConversionHiveTestUtils.readQueryFromFile(resourceDir, "source_schema_lineage_missing.ddl"),
        "Generated DDL did not match expected for evolution disabled");

    Assert.assertEquals(generateDml(isEvolutionEnabled, destinationTableMeta),
        ConversionHiveTestUtils.readQueryFromFile(resourceDir, "source_schema_lineage_missing.dml"),
        "Generated DML did not match expected for evolution disabled");
  }

  @Test
  public void testEvolutionEnabledGenerateEvolutionDDL() {
    String orcStagingTableName = schemaName + "_staging";
    String orcTableName = schemaName;
    boolean isEvolutionEnabled = true;
    Optional<Table> destinationTableMeta = createEvolvedDestinationTable(schemaName, "default", "", true);
    Map<String, String> hiveColumns = new HashMap<>();

    // Run DDL generation only for its side effect of populating hiveColumns via the real code path.
    generateDdl("/tmp/dummy", isEvolutionEnabled, destinationTableMeta, hiveColumns);

    // Destination table exists: evolution DDL must ADD the column the destination is missing.
    List<String> generateEvolutionDDL = HiveAvroORCQueryGenerator
        .generateEvolutionDDL(orcStagingTableName, orcTableName, Optional.of(hiveDbName), Optional.of(hiveDbName),
            outputSchema, isEvolutionEnabled, hiveColumns, destinationTableMeta);
    Assert.assertEquals(generateEvolutionDDL.size(), 2);
    Assert.assertEquals(generateEvolutionDDL.get(1),
        "ALTER TABLE `sourceSchema` ADD COLUMNS (parentFieldRecord__nestedFieldInt int "
            + "COMMENT 'from flatten_source parentFieldRecord.nestedFieldInt')",
        "Generated evolution DDL did not match for evolution enabled");

    // Destination table does not exist: no DDL expected, because create table covers the destination.
    destinationTableMeta = Optional.absent();
    generateEvolutionDDL = HiveAvroORCQueryGenerator
        .generateEvolutionDDL(orcStagingTableName, orcTableName, Optional.of(hiveDbName), Optional.of(hiveDbName),
            outputSchema, isEvolutionEnabled, hiveColumns, destinationTableMeta);
    Assert.assertEquals(generateEvolutionDDL.size(), 0,
        "Generated evolution DDL did not match for evolution enabled");
  }

  @Test
  public void testEvolutionDisabledGenerateEvolutionDDL() {
    String orcStagingTableName = schemaName + "_staging";
    String orcTableName = schemaName;
    boolean isEvolutionEnabled = false;
    Optional<Table> destinationTableMeta = createEvolvedDestinationTable(schemaName, "default", "", true);
    Map<String, String> hiveColumns = new HashMap<>();

    // Run DDL generation only for its side effect of populating hiveColumns via the real code path.
    generateDdl("/tmp/dummy", isEvolutionEnabled, destinationTableMeta, hiveColumns);

    // Destination table exists: no DDL expected, because the select based on the destination table
    // will selectively project columns instead of evolving the schema.
    List<String> generateEvolutionDDL = HiveAvroORCQueryGenerator
        .generateEvolutionDDL(orcStagingTableName, orcTableName, Optional.of(hiveDbName), Optional.of(hiveDbName),
            outputSchema, isEvolutionEnabled, hiveColumns, destinationTableMeta);
    Assert.assertEquals(generateEvolutionDDL.size(), 0,
        "Generated evolution DDL did not match for evolution disabled");

    // Destination table does not exist: no DDL expected, because create table covers the destination.
    destinationTableMeta = Optional.absent();
    generateEvolutionDDL = HiveAvroORCQueryGenerator
        .generateEvolutionDDL(orcStagingTableName, orcTableName, Optional.of(hiveDbName), Optional.of(hiveDbName),
            outputSchema, isEvolutionEnabled, hiveColumns, destinationTableMeta);
    Assert.assertEquals(generateEvolutionDDL.size(), 0,
        "Generated evolution DDL did not match for evolution disabled");
  }

  /**
   * Builds metadata for a destination ORC table whose schema has drifted from the flattened source
   * schema: the flattened column {@code parentFieldRecord__nestedFieldInt} is absent (simulating an
   * un-evolved table) and an extra column {@code parentFieldRecord__nestedFieldString2} is present.
   *
   * @param tableName destination table name
   * @param dbName destination database name
   * @param location storage location for the table's {@link StorageDescriptor}
   * @param withComment if true, columns carry the "from flatten_source ..." lineage comments
   * @return present {@link Optional} wrapping the constructed {@link Table} metadata
   */
  private Optional<Table> createEvolvedDestinationTable(String tableName, String dbName, String location,
      boolean withComment) {
    List<FieldSchema> cols = new ArrayList<>();
    // Existing columns that match avroToOrcSchemaEvolutionTest/source_schema_evolution_enabled.ddl
    cols.add(new FieldSchema("parentFieldRecord__nestedFieldRecord__superNestedFieldString", "string",
        withComment ? "from flatten_source parentFieldRecord.nestedFieldRecord.superNestedFieldString" : ""));
    cols.add(new FieldSchema("parentFieldRecord__nestedFieldRecord__superNestedFieldInt", "int",
        withComment ? "from flatten_source parentFieldRecord.nestedFieldRecord.superNestedFieldInt" : ""));
    cols.add(new FieldSchema("parentFieldRecord__nestedFieldString", "string",
        withComment ? "from flatten_source parentFieldRecord.nestedFieldString" : ""));
    // The following column is skipped (simulating un-evolved schema):
    // Column name   : parentFieldRecord__nestedFieldInt
    // Column type   : int
    // Column comment: from flatten_source parentFieldRecord.nestedFieldInt
    cols.add(new FieldSchema("parentFieldInt", "int",
        withComment ? "from flatten_source parentFieldInt" : ""));
    // Extra schema
    cols.add(new FieldSchema("parentFieldRecord__nestedFieldString2", "string",
        withComment ? "from flatten_source parentFieldRecord.nestedFieldString2" : ""));

    String inputFormat = "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat";
    String outputFormat = "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat";
    StorageDescriptor storageDescriptor = new StorageDescriptor(cols, location, inputFormat, outputFormat, false, 0,
        new SerDeInfo(), null, Lists.<Order>newArrayList(), null);
    Table table = new Table(tableName, dbName, "ketl_dev", 0, 0, 0, storageDescriptor,
        Lists.<FieldSchema>newArrayList(), Maps.<String, String>newHashMap(), "", "", "");

    return Optional.of(table);
  }

  /**
   * Generates the CREATE TABLE DDL for the flattened schema at the given location; as a side
   * effect, {@code hiveColumns} is populated with the generated column name-to-type mapping.
   */
  private static String generateDdl(String location, boolean isEvolutionEnabled,
      Optional<Table> destinationTableMeta, Map<String, String> hiveColumns) {
    return HiveAvroORCQueryGenerator.generateCreateTableDDL(outputSchema, schemaName, location,
        Optional.<String>absent(), Optional.<Map<String, String>>absent(), Optional.<List<String>>absent(),
        Optional.<Map<String, HiveAvroORCQueryGenerator.COLUMN_SORT_ORDER>>absent(), Optional.<Integer>absent(),
        Optional.<String>absent(), Optional.<String>absent(), Optional.<String>absent(),
        null, isEvolutionEnabled, destinationTableMeta, hiveColumns);
  }

  /** Convenience overload using the default warehouse location and a throwaway column map. */
  private static String generateDdl(boolean isEvolutionEnabled, Optional<Table> destinationTableMeta) {
    return generateDdl("file:/user/hive/warehouse/" + schemaName, isEvolutionEnabled, destinationTableMeta,
        new HashMap<String, String>());
  }

  /** Generates the Avro-to-ORC table mapping DML for the configured source and staging tables. */
  private static String generateDml(boolean isEvolutionEnabled, Optional<Table> destinationTableMeta) {
    return HiveAvroORCQueryGenerator
        .generateTableMappingDML(inputSchema, outputSchema, schemaName, schemaName + "_orc",
            Optional.<String>absent(), Optional.<String>absent(), Optional.<Map<String, String>>absent(),
            Optional.<Boolean>absent(), Optional.<Boolean>absent(), isEvolutionEnabled, destinationTableMeta,
            rowLimit);
  }
}