/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.hive.policy;
import gobblin.hive.metastore.HiveMetaStoreUtils;
import java.io.IOException;
import java.net.URI;
import java.util.Collection;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.reflect.ConstructorUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.metastore.TableType;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.collect.Lists;
import gobblin.annotation.Alpha;
import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.State;
import gobblin.hive.HivePartition;
import gobblin.hive.HiveRegProps;
import gobblin.hive.HiveSerDeManager;
import gobblin.hive.HiveTable;
import gobblin.hive.spec.HiveSpec;
import gobblin.hive.spec.SimpleHiveSpec;
/**
* A base implementation of {@link HiveRegistrationPolicy}. It obtains database name from
* property {@link #HIVE_DATABASE_NAME} or {@link #HIVE_DATABASE_REGEX} (group 1), obtains
* table name from property {@link #HIVE_TABLE_NAME} and {@link #HIVE_TABLE_REGEX} (group 1),
* and builds a {@link SimpleHiveSpec}.
*
* @author Ziyang Liu
*/
@Alpha
public class HiveRegistrationPolicyBase implements HiveRegistrationPolicy {
public static final String HIVE_DATABASE_NAME = "hive.database.name";
public static final String ADDITIONAL_HIVE_DATABASE_NAMES = "additional.hive.database.names";
public static final String HIVE_DATABASE_REGEX = "hive.database.regex";
public static final String HIVE_DATABASE_NAME_PREFIX = "hive.database.name.prefix";
public static final String HIVE_DATABASE_NAME_SUFFIX = "hive.database.name.suffix";
public static final String HIVE_TABLE_NAME = "hive.table.name";
public static final String ADDITIONAL_HIVE_TABLE_NAMES = "additional.hive.table.names";
public static final String HIVE_TABLE_REGEX = "hive.table.regex";
public static final String HIVE_TABLE_NAME_PREFIX = "hive.table.name.prefix";
public static final String HIVE_TABLE_NAME_SUFFIX = "hive.table.name.suffix";
public static final String HIVE_SANITIZE_INVALID_NAMES = "hive.sanitize.invalid.names";
public static final String HIVE_FS_URI = "hive.registration.fs.uri";
// {@value PRIMARY_TABLE_TOKEN} if present in {@value ADDITIONAL_HIVE_TABLE_NAMES} or dbPrefix.{@value HIVE_TABLE_NAME}
// .. will be replaced by the table name determined via {@link #getTableName(Path)}
public static final String PRIMARY_TABLE_TOKEN = "$PRIMARY_TABLE";
/**
* A valid db or table name should start with an alphanumeric character, and contains only
* alphanumeric characters and '_'.
*/
private static final Pattern VALID_DB_TABLE_NAME_PATTERN_1 = Pattern.compile("[a-z0-9][a-z0-9_]*");
/**
* A valid db or table name should contain at least one letter or '_' (i.e., should not be numbers only).
*/
private static final Pattern VALID_DB_TABLE_NAME_PATTERN_2 = Pattern.compile(".*[a-z_].*");
protected final HiveRegProps props;
protected final FileSystem fs;
protected final boolean sanitizeNameAllowed;
protected final Optional<Pattern> dbNamePattern;
protected final Optional<Pattern> tableNamePattern;
protected final String dbNamePrefix;
protected final String dbNameSuffix;
protected final String tableNamePrefix;
protected final String tableNameSuffix;
public HiveRegistrationPolicyBase(State props) throws IOException {
Preconditions.checkNotNull(props);
this.props = new HiveRegProps(props);
if (props.contains(HiveRegistrationPolicyBase.HIVE_FS_URI)) {
this.fs = FileSystem.get(URI.create(props.getProp(HiveRegistrationPolicyBase.HIVE_FS_URI)), new Configuration());
} else {
this.fs = FileSystem.get(new Configuration());
}
this.sanitizeNameAllowed = props.getPropAsBoolean(HIVE_SANITIZE_INVALID_NAMES, true);
this.dbNamePattern = props.contains(HIVE_DATABASE_REGEX)
? Optional.of(Pattern.compile(props.getProp(HIVE_DATABASE_REGEX))) : Optional.<Pattern> absent();
this.tableNamePattern = props.contains(HIVE_TABLE_REGEX)
? Optional.of(Pattern.compile(props.getProp(HIVE_TABLE_REGEX))) : Optional.<Pattern> absent();
this.dbNamePrefix = props.getProp(HIVE_DATABASE_NAME_PREFIX, StringUtils.EMPTY);
this.dbNameSuffix = props.getProp(HIVE_DATABASE_NAME_SUFFIX, StringUtils.EMPTY);
this.tableNamePrefix = props.getProp(HIVE_TABLE_NAME_PREFIX, StringUtils.EMPTY);
this.tableNameSuffix = props.getProp(HIVE_TABLE_NAME_SUFFIX, StringUtils.EMPTY);
}
/**
* This method first tries to obtain the database name from {@link #HIVE_DATABASE_NAME}.
* If this property is not specified, it then tries to obtain the database name using
* the first group of {@link #HIVE_DATABASE_REGEX}.
*
*/
protected Optional<String> getDatabaseName(Path path) {
if (!this.props.contains(HIVE_DATABASE_NAME) && !this.props.contains(HIVE_DATABASE_REGEX)) {
return Optional.<String> absent();
}
return Optional.<String> of(
this.dbNamePrefix + getDatabaseOrTableName(path, HIVE_DATABASE_NAME, HIVE_DATABASE_REGEX, this.dbNamePattern)
+ this.dbNameSuffix);
}
/**
* Obtain Hive database names. The returned {@link Iterable} contains the database name returned by
* {@link #getDatabaseName(Path)} (if present) plus additional database names specified in
* {@link #ADDITIONAL_HIVE_DATABASE_NAMES}.
*
*/
protected Iterable<String> getDatabaseNames(Path path) {
List<String> databaseNames = Lists.newArrayList();
Optional<String> databaseName;
if ((databaseName = getDatabaseName(path)).isPresent()) {
databaseNames.add(databaseName.get());
}
if (!Strings.isNullOrEmpty(this.props.getProp(ADDITIONAL_HIVE_DATABASE_NAMES))) {
for (String additionalDbName : this.props.getPropAsList(ADDITIONAL_HIVE_DATABASE_NAMES)) {
databaseNames.add(this.dbNamePrefix + additionalDbName + this.dbNameSuffix);
}
}
Preconditions.checkState(!databaseNames.isEmpty(), "Hive database name not specified");
return databaseNames;
}
/**
* This method first tries to obtain the database name from {@link #HIVE_TABLE_NAME}.
* If this property is not specified, it then tries to obtain the database name using
* the first group of {@link #HIVE_TABLE_REGEX}.
*/
protected Optional<String> getTableName(Path path) {
if (!this.props.contains(HIVE_TABLE_NAME) && !this.props.contains(HIVE_TABLE_REGEX)) {
return Optional.<String> absent();
}
return Optional.<String> of(
this.tableNamePrefix + getDatabaseOrTableName(path, HIVE_TABLE_NAME, HIVE_TABLE_REGEX, this.tableNamePattern)
+ this.tableNameSuffix);
}
/***
* Obtain Hive table names.
*
* The returned {@link Iterable} contains:
* 1. Table name returned by {@link #getTableName(Path)}
* 2. Table names specified by <code>additional.hive.table.names</code>
*
* In table names above, the {@value PRIMARY_TABLE_TOKEN} if present is also replaced by the
* table name obtained via {@link #getTableName(Path)}.
*
* @param path Path for the table on filesystem.
* @return Table names to register.
*/
protected Iterable<String> getTableNames(Path path) {
List<String> tableNames = getTableNames(Optional.<String>absent(), path);
Preconditions.checkState(!tableNames.isEmpty(), "Hive table name not specified");
return tableNames;
}
/***
* Obtain Hive table names filtered by <code>dbPrefix</code> (if present).
*
* The returned {@link List} contains:
* A. If <code>dbPrefix</code> is absent:
* 1. Table name returned by {@link #getTableName(Path)}
* 2. Table names specified by <code>additional.hive.table.names</code>
* B. If dbPrefix is present:
* 1. Table names specified by <code>dbPrefix.hive.table.names</code>
*
* In table names above, the {@value PRIMARY_TABLE_TOKEN} if present is also replaced by the
* table name obtained via {@link #getTableName(Path)}.
*
* @param dbPrefix Prefix to the property <code>additional.table.names</code>, to obtain table names only
* for the specified db. Eg. If <code>dbPrefix</code> is db, then
* <code>db.hive.table.names</code> is the resolved property name.
* @param path Path for the table on filesystem.
* @return Table names to register.
*/
protected List<String> getTableNames(Optional<String> dbPrefix, Path path) {
List<String> tableNames = Lists.newArrayList();
Optional<String> primaryTableName;
if ((primaryTableName = getTableName(path)).isPresent() && !dbPrefix.isPresent()) {
tableNames.add(primaryTableName.get());
}
String additionalNamesProp;
if (dbPrefix.isPresent()) {
additionalNamesProp = String.format("%s.%s", dbPrefix.get(), HIVE_TABLE_NAME);
} else {
additionalNamesProp = ADDITIONAL_HIVE_TABLE_NAMES;
}
if (!Strings.isNullOrEmpty(this.props.getProp(additionalNamesProp))) {
for (String additionalTableName : this.props.getPropAsList(additionalNamesProp)) {
String resolvedTableName = primaryTableName.isPresent() ?
StringUtils.replace(additionalTableName, PRIMARY_TABLE_TOKEN, primaryTableName.get()) :
additionalTableName;
tableNames.add(this.tableNamePrefix + resolvedTableName + this.tableNameSuffix);
}
}
return tableNames;
}
protected String getDatabaseOrTableName(Path path, String nameKey, String regexKey, Optional<Pattern> pattern) {
String name;
if (this.props.contains(nameKey)) {
name = this.props.getProp(nameKey);
} else if (pattern.isPresent()) {
name = pattern.get().matcher(path.toString()).group();
} else {
throw new IllegalStateException("Missing required property " + nameKey + " or " + regexKey);
}
return sanitizeAndValidateName(name);
}
protected String sanitizeAndValidateName(String name) {
name = name.toLowerCase();
if (this.sanitizeNameAllowed && !isNameValid(name)) {
name = sanitizeName(name);
}
if (isNameValid(name)) {
return name;
}
throw new IllegalStateException(name + " is not a valid Hive database or table name");
}
/**
* A base implementation for creating {@link HiveTable}s given a {@link Path}.
*
* <p>
* This method returns a list of {@link Hivetable}s that contains one table per db name
* (returned by {@link #getDatabaseNames(Path)}) and table name (returned by {@link #getTableNames(Path)}.
* </p>
*
* @param path a {@link Path} used to create the {@link HiveTable}.
* @return a list of {@link HiveTable}s for the given {@link Path}.
* @throws IOException
*/
protected List<HiveTable> getTables(Path path) throws IOException {
List<HiveTable> tables = Lists.newArrayList();
for (String databaseName : getDatabaseNames(path)) {
// Get tables to register ONLY for this Hive database (specified via prefix filter in properties)
boolean foundTablesViaDbFilter = false;
for (String tableName : getTableNames(Optional.of(databaseName), path)) {
tables.add(getTable(path, databaseName, tableName));
foundTablesViaDbFilter = true;
}
// If no tables found via db filter, get tables to register in all Hive databases and add them for this database
if (!foundTablesViaDbFilter) {
for (String tableName : getTableNames(path)) {
tables.add(getTable(path, databaseName, tableName));
}
}
}
return tables;
}
/**
* A base implementation for creating a non bucketed, external {@link HiveTable} for a {@link Path}.
*
* @param path a {@link Path} used to create the {@link HiveTable}.
* @param dbName the database name for the created {@link HiveTable}.
* @param tableName the table name for the created {@link HiveTable}.
* @return a {@link HiveTable}s for the given {@link Path}.
* @throws IOException
*/
protected HiveTable getTable(Path path, String dbName, String tableName) throws IOException {
HiveTable table = new HiveTable.Builder().withDbName(dbName).withTableName(tableName)
.withSerdeManaager(HiveSerDeManager.get(this.props)).build();
table.setLocation(this.fs.makeQualified(getTableLocation(path)).toString());
table.setSerDeProps(path);
// Setting table-level props.
State tableProps = new State(this.props.getTablePartitionProps());
if (this.props.getRuntimeTableProps().isPresent()){
tableProps.setProp(HiveMetaStoreUtils.RUNTIME_PROPS, this.props.getRuntimeTableProps().get());
}
table.setProps(tableProps);
table.setStorageProps(this.props.getStorageProps());
table.setSerDeProps(this.props.getSerdeProps());
table.setNumBuckets(-1);
table.setBucketColumns(Lists.<String> newArrayList());
table.setTableType(TableType.EXTERNAL_TABLE.toString());
return table;
}
protected Optional<HivePartition> getPartition(Path path, HiveTable table) throws IOException {
return Optional.<HivePartition> absent();
}
protected Path getTableLocation(Path path) {
return path;
}
/**
* Determine whether a database or table name is valid.
*
* A name is valid if and only if: it starts with an alphanumeric character, contains only alphanumeric characters
* and '_', and is NOT composed of numbers only.
*/
protected static boolean isNameValid(String name) {
Preconditions.checkNotNull(name);
name = name.toLowerCase();
return VALID_DB_TABLE_NAME_PATTERN_1.matcher(name).matches()
&& VALID_DB_TABLE_NAME_PATTERN_2.matcher(name).matches();
}
/**
* Attempt to sanitize an invalid database or table name by replacing characters that are not alphanumeric
* or '_' with '_'.
*/
protected static String sanitizeName(String name) {
return name.replaceAll("[^a-zA-Z0-9_]", "_");
}
@Override
public Collection<HiveSpec> getHiveSpecs(Path path) throws IOException {
List<HiveSpec> specs = Lists.newArrayList();
for (HiveTable table : getTables(path)) {
specs.add(new SimpleHiveSpec.Builder<>(path).withTable(table).withPartition(getPartition(path, table)).build());
}
return specs;
}
/**
* Get a {@link HiveRegistrationPolicy} from a {@link State} object.
*
* @param props A {@link State} object that contains property, {@link #HIVE_REGISTRATION_POLICY},
* which is the class name of the desired policy. This policy class must have a constructor that
* takes a {@link State} object.
*/
public static HiveRegistrationPolicy getPolicy(State props) {
Preconditions.checkArgument(props.contains(ConfigurationKeys.HIVE_REGISTRATION_POLICY));
String policyType = props.getProp(ConfigurationKeys.HIVE_REGISTRATION_POLICY);
try {
return (HiveRegistrationPolicy) ConstructorUtils.invokeConstructor(Class.forName(policyType), props);
} catch (ReflectiveOperationException e) {
throw new RuntimeException(
"Unable to instantiate " + HiveRegistrationPolicy.class.getSimpleName() + " with type " + policyType, e);
}
}
}