/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.data.management.conversion.hive.query;
import java.util.List;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.ql.metadata.Partition;
import com.google.common.base.Optional;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset;
import gobblin.data.management.copy.hive.HiveDataset;
/***
* Generate Hive queries for validation
*
* @author Abhishek Tiwari
*/
public class HiveValidationQueryGenerator {
/***
* Generate Hive queries for validating converted Hive table.
* @param hiveDataset Source {@link HiveDataset}.
* @param sourcePartition Source {@link Partition} if any.
* @param conversionConfig {@link ConvertibleHiveDataset.ConversionConfig} for conversion.
* @return Validation Hive queries.
*/
public static List<String> generateCountValidationQueries(HiveDataset hiveDataset,
Optional<Partition> sourcePartition, ConvertibleHiveDataset.ConversionConfig conversionConfig) {
// Source and converted destination details
String sourceDatabase = hiveDataset.getDbAndTable().getDb();
String sourceTable = hiveDataset.getDbAndTable().getTable();
String destinationDatabase = conversionConfig.getDestinationDbName();
String destinationTable = conversionConfig.getDestinationTableName();
// Build query.
List<String> queries = Lists.newArrayList();
if (sourcePartition.isPresent()) {
StringBuilder partitionClause = new StringBuilder();
boolean isFirst = true;
String partitionInfo = sourcePartition.get().getName();
List<String> pInfo = Splitter.on(",").omitEmptyStrings().trimResults().splitToList(partitionInfo);
for (String aPInfo : pInfo) {
List<String> pInfoParts = Splitter.on("=").omitEmptyStrings().trimResults().splitToList(aPInfo);
if (pInfoParts.size() != 2) {
throw new IllegalArgumentException(String
.format("Partition details should be of the format " + "partitionName=partitionValue. Recieved: %s",
aPInfo));
}
if (isFirst) {
isFirst = false;
} else {
partitionClause.append(" and ");
}
partitionClause.append("`").append(pInfoParts.get(0)).append("`='").append(pInfoParts.get(1)).append("'");
}
queries.add(String
.format("SELECT count(*) FROM `%s`.`%s` WHERE %s ", sourceDatabase, sourceTable, partitionClause));
queries.add(String.format("SELECT count(*) FROM `%s`.`%s` WHERE %s ", destinationDatabase, destinationTable,
partitionClause));
} else {
queries.add(String.format("SELECT count(*) FROM `%s`.`%s` ", sourceDatabase, sourceTable));
queries.add(String.format("SELECT count(*) FROM `%s`.`%s` ", destinationDatabase, destinationTable));
}
return queries;
}
/***
* Generates Hive SQL that can be used to validate the quality between two {@link Table}s or optionally
* {@link Partition}. The query returned is a basic join query that returns the number of records matched
* between the two {@link Table}s.
* The responsibility of actually comparing this value with the expected module should be implemented by
* the user.
*
* @param sourceTable Source Hive {@link Table} name.
* @param sourceDb Source Hive database name.
* @param targetTable Target Hive {@link Table} name.
* @param optionalPartition Optional {@link Partition} to limit the comparison.
* @return Query to find number of rows common between two tables.
*/
public static String generateDataValidationQuery(String sourceTable, String sourceDb, Table targetTable,
Optional<Partition> optionalPartition, boolean isNestedORC) {
StringBuilder sb = new StringBuilder();
// Query head
sb.append("SELECT count(*) FROM `")
.append(sourceDb).append("`.`").append(sourceTable).append("` s JOIN `")
.append(targetTable.getDbName()).append("`.`").append(targetTable.getTableName()).append("` t ON \n");
// Columns equality
boolean isFirst = true;
List<FieldSchema> fieldList = targetTable.getSd().getCols();
for (FieldSchema field : fieldList) {
// Do not add maps in the join clause. Hive does not support map joins LIHADOOP-21956
if (StringUtils.startsWithIgnoreCase(field.getType(), "map")) {
continue;
}
if (StringUtils.containsIgnoreCase(field.getType(), ":map")) {
continue;
}
if (isFirst) {
isFirst = false;
} else {
sb.append(" AND \n");
}
if (isNestedORC) {
sb.append("\ts.`").append(field.getName()).append("`<=>");
} else {
// The source column lineage information is available in field's comment. Remove the description prefix "from flatten_source"
String colName = field.getComment().replaceAll("from flatten_source ", "").trim();
sb.append("\ts.`").append(colName.replaceAll("\\.", "`.`")).append("`<=>");
}
sb.append("t.`").append(field.getName()).append("` ");
}
sb.append("\n");
// Partition projection
if (optionalPartition.isPresent()) {
Partition partition = optionalPartition.get();
String partitionsInfoString = partition.getName();
List<String> pInfo = Splitter.on(",").omitEmptyStrings().trimResults().splitToList(partitionsInfoString);
for (int i = 0; i < pInfo.size(); i++) {
List<String> partitionInfoParts = Splitter.on("=").omitEmptyStrings().trimResults().splitToList(pInfo.get(i));
if (partitionInfoParts.size() != 2) {
throw new IllegalArgumentException(
String.format("Partition details should be of the format partitionName=partitionValue. Recieved: %s", pInfo.get(i)));
}
if (i==0) {
// add where clause
sb.append(" WHERE \n");
} else {
sb.append(" AND ");
}
// add project for source and destination partition
sb.append(String.format("s.`%s`='%s' ", partitionInfoParts.get(0), partitionInfoParts.get(1)));
sb.append(" AND ");
sb.append(String.format("t.`%s`='%s' ", partitionInfoParts.get(0), partitionInfoParts.get(1)));
}
}
return sb.toString();
}
}