/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.hive;
import java.io.IOException;
import java.util.List;
import org.apache.hadoop.fs.Path;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.reflect.TypeToken;
import gobblin.annotation.Alpha;
import gobblin.configuration.State;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
/**
 * A class that represents a Hive table or partition.
 *
 * <p>
 * Parameters are kept in three {@link State} objects: table/partition parameters ({@code props}),
 * storage parameters ({@code storageProps}) and serde parameters ({@code serDeProps}). A parameter whose key
 * matches one of the dedicated {@link Optional} fields of this class (for example
 * {@link HiveConstants#LOCATION}) is moved out of its {@link State} and stored in that field instead, both
 * at construction time and when it is set through one of the {@code set*Prop(s)} methods.
 * </p>
 *
 * @author Ziyang Liu
 */
@Getter
@Alpha
@ToString
public class HiveRegistrationUnit {

  /** Splits a comma-separated list value, trimming entries and dropping empty ones. */
  private static final Splitter LIST_SPLITTER = Splitter.on(',').omitEmptyStrings().trimResults();

  protected final String dbName;
  protected final String tableName;
  protected final List<Column> columns = Lists.newArrayList();
  protected final State props = new State();
  protected final State storageProps = new State();
  protected final State serDeProps = new State();
  protected final Optional<HiveSerDeManager> serDeManager;

  /**
   * Table or Partition properties
   */
  protected Optional<Long> createTime;
  protected Optional<Long> lastAccessTime;

  /**
   * Storage properties
   */
  protected Optional<String> location;
  protected Optional<String> inputFormat;
  protected Optional<String> outputFormat;
  protected Optional<Boolean> isCompressed;
  protected Optional<Integer> numBuckets;
  protected Optional<List<String>> bucketColumns;
  protected Optional<Boolean> isStoredAsSubDirs;

  /**
   * SerDe properties
   */
  protected Optional<String> serDeType;

  /**
   * Create a unit from a {@link Builder}. The builder's columns and parameters are copied, so later changes
   * to the builder do not affect this object.
   *
   * @param builder the builder holding the database name, table name, columns, parameters and serde manager
   * @throws IllegalArgumentException if the builder's database name or table name is null or empty
   */
  HiveRegistrationUnit(Builder<?> builder) {
    Preconditions.checkArgument(!Strings.isNullOrEmpty(builder.dbName), "dbName must not be null or empty");
    Preconditions.checkArgument(!Strings.isNullOrEmpty(builder.tableName), "tableName must not be null or empty");

    this.dbName = builder.dbName;
    this.tableName = builder.tableName;
    this.columns.addAll(builder.columns);
    this.props.addAll(builder.props);
    this.storageProps.addAll(builder.storageProps);
    this.serDeProps.addAll(builder.serDeProps);
    this.serDeManager = builder.serDeManager;

    // NOTE: these are overridable methods invoked during construction; an override runs before the
    // subclass constructor body and therefore must not depend on subclass state set up there.
    populateTablePartitionFields(this.props);
    populateStorageFields(this.storageProps);
    populateSerDeFields(this.serDeProps);
  }

  /**
   * Move the table/partition parameters that have a dedicated field ({@link HiveConstants#CREATE_TIME} and
   * {@link HiveConstants#LAST_ACCESS_TIME}) out of {@code state} and into those fields.
   *
   * @param state the table/partition parameters; recognized keys are removed from it
   */
  @SuppressWarnings("serial")
  protected void populateTablePartitionFields(State state) {
    this.createTime = populateField(state, HiveConstants.CREATE_TIME, new TypeToken<Long>() {});
    this.lastAccessTime = populateField(state, HiveConstants.LAST_ACCESS_TIME, new TypeToken<Long>() {});
  }

  /**
   * Move the storage parameters that have a dedicated field (location, input/output format, compressed,
   * number of buckets, bucket columns, stored-as-sub-dirs) out of {@code state} and into those fields.
   *
   * @param state the storage parameters; recognized keys are removed from it
   */
  @SuppressWarnings({ "serial" })
  protected void populateStorageFields(State state) {
    this.location = populateField(state, HiveConstants.LOCATION, new TypeToken<String>() {});
    this.inputFormat = populateField(state, HiveConstants.INPUT_FORMAT, new TypeToken<String>() {});
    this.outputFormat = populateField(state, HiveConstants.OUTPUT_FORMAT, new TypeToken<String>() {});
    this.isCompressed = populateField(state, HiveConstants.COMPRESSED, new TypeToken<Boolean>() {});
    this.numBuckets = populateField(state, HiveConstants.NUM_BUCKETS, new TypeToken<Integer>() {});
    this.bucketColumns = populateField(state, HiveConstants.BUCKET_COLUMNS, new TypeToken<List<String>>() {});
    this.isStoredAsSubDirs = populateField(state, HiveConstants.STORED_AS_SUB_DIRS, new TypeToken<Boolean>() {});
  }

  /**
   * Move the serde parameter that has a dedicated field ({@link HiveConstants#SERDE_TYPE}) out of
   * {@code state} and into that field.
   *
   * @param state the serde parameters; recognized keys are removed from it
   */
  @SuppressWarnings("serial")
  protected void populateSerDeFields(State state) {
    this.serDeType = populateField(state, HiveConstants.SERDE_TYPE, new TypeToken<String>() {});
  }

  /**
   * Read the value of {@code key} from {@code state} as the type described by {@code token}, and remove the
   * key from {@code state}.
   *
   * <p>
   * Boolean, Integer, Long and {@code List<String>} tokens are read with the corresponding
   * {@code getPropAs*} method of {@link State}; any other token is read with {@link State#getProp(String)}.
   * </p>
   *
   * @param state the state to read from; {@code key} is removed from it if present
   * @param key the parameter name
   * @param token the type the value should be read as
   * @return the value, or {@link Optional#absent()} if {@code state} does not contain {@code key}
   */
  @SuppressWarnings({ "serial", "unchecked" })
  protected static <T> Optional<T> populateField(State state, String key, TypeToken<T> token) {
    if (state.contains(key)) {
      Optional<T> fieldValue;
      if (new TypeToken<Boolean>() {}.isAssignableFrom(token)) {
        fieldValue = (Optional<T>) Optional.of(state.getPropAsBoolean(key));
      } else if (new TypeToken<Integer>() {}.isAssignableFrom(token)) {
        fieldValue = (Optional<T>) Optional.of(state.getPropAsInt(key));
      } else if (new TypeToken<Long>() {}.isAssignableFrom(token)) {
        fieldValue = (Optional<T>) Optional.of(state.getPropAsLong(key));
      } else if (new TypeToken<List<String>>() {}.isAssignableFrom(token)) {
        fieldValue = (Optional<T>) Optional.of(state.getPropAsList(key));
      } else {
        fieldValue = (Optional<T>) Optional.of(state.getProp(key));
      }
      // The value now lives in a dedicated field, so it is dropped from the generic parameters.
      state.removeProp(key);
      return fieldValue;
    }
    return Optional.<T> absent();
  }

  /**
   * Set the columns for a table or partition, replacing any columns set before.
   *
   * <p>
   * Columns do not need to be set for a table if the table's serde already provides the schema,
   * such as Avro tables. Columns do not need to be set for a partition if they are the same as
   * the table's columns.
   * </p>
   *
   * @param columns the new columns; the list is copied, and it may safely be the list returned by this
   *        object's own columns getter
   */
  public void setColumns(List<Column> columns) {
    // Snapshot first: if the caller passes this.columns itself, clearing it would otherwise lose every column.
    List<Column> snapshot = Lists.newArrayList(columns);
    this.columns.clear();
    this.columns.addAll(snapshot);
  }

  /**
   * Set a table/partition parameter.
   *
   * <p>
   * When using {@link gobblin.hive.metastore.HiveMetaStoreBasedRegister}, since it internally uses
   * {@link org.apache.hadoop.hive.metastore.api.Table} and {@link org.apache.hadoop.hive.metastore.api.Partition}
   * which distinguish between table/partition parameters, storage descriptor parameters, and serde parameters,
   * one may need to distinguish them when constructing a {@link HiveRegistrationUnit} by using
   * {@link #setProp(String, Object)}, {@link #setStorageProp(String, Object)} and
   * {@link #setSerDeProp(String, Object)}. When using query-based Hive registration, they do not need to be
   * distinguished since all parameters will be passed via TBLPROPERTIES.
   * </p>
   *
   * @param key the parameter name
   * @param value the parameter value
   */
  public void setProp(String key, Object value) {
    this.props.setProp(key, value);
    updateTablePartitionFields(this.props, key, value);
  }

  /**
   * Set a storage parameter for a table/partition.
   *
   * <p>
   * When using {@link gobblin.hive.metastore.HiveMetaStoreBasedRegister}, since it internally uses
   * {@link org.apache.hadoop.hive.metastore.api.Table} and {@link org.apache.hadoop.hive.metastore.api.Partition}
   * which distinguish between table/partition parameters, storage descriptor parameters, and serde parameters,
   * one may need to distinguish them when constructing a {@link HiveRegistrationUnit} by using
   * {@link #setProp(String, Object)}, {@link #setStorageProp(String, Object)} and
   * {@link #setSerDeProp(String, Object)}. When using query-based Hive registration, they do not need to be
   * distinguished since all parameters will be passed via TBLPROPERTIES.
   * </p>
   *
   * @param key the parameter name
   * @param value the parameter value
   */
  public void setStorageProp(String key, Object value) {
    this.storageProps.setProp(key, value);
    updateStorageFields(this.storageProps, key, value);
  }

  /**
   * Set a serde parameter for a table/partition.
   *
   * <p>
   * When using {@link gobblin.hive.metastore.HiveMetaStoreBasedRegister}, since it internally uses
   * {@link org.apache.hadoop.hive.metastore.api.Table} and {@link org.apache.hadoop.hive.metastore.api.Partition}
   * which distinguish between table/partition parameters, storage descriptor parameters, and serde parameters,
   * one may need to distinguish them when constructing a {@link HiveRegistrationUnit} by using
   * {@link #setProp(String, Object)}, {@link #setStorageProp(String, Object)} and
   * {@link #setSerDeProp(String, Object)}. When using query-based Hive registration, they do not need to be
   * distinguished since all parameters will be passed via TBLPROPERTIES.
   * </p>
   *
   * @param key the parameter name
   * @param value the parameter value
   */
  public void setSerDeProp(String key, Object value) {
    this.serDeProps.setProp(key, value);
    updateSerDeFields(this.serDeProps, key, value);
  }

  /**
   * Set table/partition parameters.
   *
   * <p>
   * When using {@link gobblin.hive.metastore.HiveMetaStoreBasedRegister}, since it internally uses
   * {@link org.apache.hadoop.hive.metastore.api.Table} and {@link org.apache.hadoop.hive.metastore.api.Partition}
   * which distinguish between table/partition parameters, storage descriptor parameters, and serde parameters,
   * one may need to distinguish them when constructing a {@link HiveRegistrationUnit} by using
   * {@link #setProps(State)}, {@link #setStorageProps(State)} and
   * {@link #setSerDeProps(State)}. When using query-based Hive registration, they do not need to be
   * distinguished since all parameters will be passed via TBLPROPERTIES.
   * </p>
   *
   * @param props the parameters to set; each one is applied through {@link #setProp(String, Object)}
   */
  public void setProps(State props) {
    for (String propKey : props.getPropertyNames()) {
      setProp(propKey, props.getProp(propKey));
    }
  }

  /**
   * Set storage parameters for a table/partition.
   *
   * <p>
   * When using {@link gobblin.hive.metastore.HiveMetaStoreBasedRegister}, since it internally uses
   * {@link org.apache.hadoop.hive.metastore.api.Table} and {@link org.apache.hadoop.hive.metastore.api.Partition}
   * which distinguish between table/partition parameters, storage descriptor parameters, and serde parameters,
   * one may need to distinguish them when constructing a {@link HiveRegistrationUnit} by using
   * {@link #setProps(State)}, {@link #setStorageProps(State)} and
   * {@link #setSerDeProps(State)}. When using query-based Hive registration, they do not need to be
   * distinguished since all parameters will be passed via TBLPROPERTIES.
   * </p>
   *
   * @param storageProps the parameters to set; each one is applied through
   *        {@link #setStorageProp(String, Object)}
   */
  public void setStorageProps(State storageProps) {
    for (String propKey : storageProps.getPropertyNames()) {
      setStorageProp(propKey, storageProps.getProp(propKey));
    }
  }

  /**
   * Set serde parameters for a table/partition.
   *
   * <p>
   * When using {@link gobblin.hive.metastore.HiveMetaStoreBasedRegister}, since it internally uses
   * {@link org.apache.hadoop.hive.metastore.api.Table} and {@link org.apache.hadoop.hive.metastore.api.Partition}
   * which distinguish between table/partition parameters, storage descriptor parameters, and serde parameters,
   * one may need to distinguish them when constructing a {@link HiveRegistrationUnit} by using
   * {@link #setProps(State)}, {@link #setStorageProps(State)} and
   * {@link #setSerDeProps(State)}. When using query-based Hive registration, they do not need to be
   * distinguished since all parameters will be passed via TBLPROPERTIES.
   * </p>
   *
   * @param serdeProps the parameters to set; each one is applied through {@link #setSerDeProp(String, Object)}
   */
  public void setSerDeProps(State serdeProps) {
    for (String propKey : serdeProps.getPropertyNames()) {
      setSerDeProp(propKey, serdeProps.getProp(propKey));
    }
  }

  /**
   * If {@code key} is a table/partition parameter with a dedicated field, store {@code value} in that field
   * and remove {@code key} from {@code state}; otherwise do nothing.
   *
   * @param state the table/partition parameters that {@code key} has already been written to
   * @param key the parameter name
   * @param value the parameter value; a {@link Long} is used as is, any other object is parsed from its
   *        string form
   * @throws NumberFormatException if a time value cannot be parsed as a long
   */
  protected void updateTablePartitionFields(State state, String key, Object value) {
    switch (key) {
      case HiveConstants.CREATE_TIME:
        this.createTime = Optional.of(asLong(value));
        break;
      case HiveConstants.LAST_ACCESS_TIME:
        this.lastAccessTime = Optional.of(asLong(value));
        break;
      default:
        // Not a dedicated field: the parameter stays in the state.
        return;
    }
    state.removeProp(key);
  }

  /**
   * If {@code key} is a storage parameter with a dedicated field, store {@code value} in that field
   * and remove {@code key} from {@code state}; otherwise do nothing.
   *
   * @param state the storage parameters that {@code key} has already been written to
   * @param key the parameter name
   * @param value the parameter value; a value of the field's own type is used as is, any other object is
   *        converted from its string form (bucket columns also accept an {@link Iterable})
   * @throws NumberFormatException if the number of buckets cannot be parsed as an int
   */
  protected void updateStorageFields(State state, String key, Object value) {
    switch (key) {
      case HiveConstants.LOCATION:
        this.location = Optional.of(value.toString());
        break;
      case HiveConstants.INPUT_FORMAT:
        this.inputFormat = Optional.of(value.toString());
        break;
      case HiveConstants.OUTPUT_FORMAT:
        this.outputFormat = Optional.of(value.toString());
        break;
      case HiveConstants.COMPRESSED:
        this.isCompressed = Optional.of(asBoolean(value));
        break;
      case HiveConstants.NUM_BUCKETS:
        this.numBuckets = Optional.of(asInt(value));
        break;
      case HiveConstants.BUCKET_COLUMNS:
        this.bucketColumns = Optional.of(asStringList(value));
        break;
      case HiveConstants.STORED_AS_SUB_DIRS:
        this.isStoredAsSubDirs = Optional.of(asBoolean(value));
        break;
      default:
        // Not a dedicated field: the parameter stays in the state.
        return;
    }
    state.removeProp(key);
  }

  /**
   * If {@code key} is a serde parameter with a dedicated field, store {@code value} in that field
   * and remove {@code key} from {@code state}; otherwise do nothing.
   *
   * @param state the serde parameters that {@code key} has already been written to
   * @param key the parameter name
   * @param value the parameter value, stored in its string form
   */
  protected void updateSerDeFields(State state, String key, Object value) {
    switch (key) {
      case HiveConstants.SERDE_TYPE:
        this.serDeType = Optional.of(value.toString());
        break;
      default:
        // Not a dedicated field: the parameter stays in the state.
        return;
    }
    state.removeProp(key);
  }

  /** Convert a parameter value to a long: a {@link Long} is unboxed, anything else is parsed from its string form. */
  private static long asLong(Object value) {
    return value instanceof Long ? (Long) value : Long.parseLong(value.toString().trim());
  }

  /** Convert a parameter value to an int: an {@link Integer} is unboxed, anything else is parsed from its string form. */
  private static int asInt(Object value) {
    return value instanceof Integer ? (Integer) value : Integer.parseInt(value.toString().trim());
  }

  /** Convert a parameter value to a boolean: a {@link Boolean} is unboxed, anything else is parsed from its string form. */
  private static boolean asBoolean(Object value) {
    return value instanceof Boolean ? (Boolean) value : Boolean.parseBoolean(value.toString().trim());
  }

  /**
   * Convert a parameter value to a list of strings: the elements of an {@link Iterable} are taken one by one,
   * anything else is treated as a comma-separated string.
   */
  private static List<String> asStringList(Object value) {
    if (value instanceof Iterable) {
      ImmutableList.Builder<String> elements = ImmutableList.builder();
      for (Object element : (Iterable<?>) value) {
        elements.add(String.valueOf(element));
      }
      return elements.build();
    }
    return LIST_SPLITTER.splitToList(value.toString());
  }

  /**
   * Return the serde manager, failing with a descriptive message if none was specified on the builder.
   *
   * @throws IllegalStateException if no {@link HiveSerDeManager} is present
   */
  private HiveSerDeManager requireSerDeManager() {
    Preconditions.checkState(this.serDeManager.isPresent(),
        "Cannot set serde properties for %s.%s: no HiveSerDeManager was specified", this.dbName, this.tableName);
    return this.serDeManager.get();
  }

  /**
   * Set serde properties for a table/partition using the table/partition's {@link HiveSerDeManager}.
   *
   * <p>
   * Requires that the {@link HiveSerDeManager} of the table/partition must be specified in
   * {@link Builder#withSerdeManaager(HiveSerDeManager)}, and the table/partition's location must be specified
   * either in {@link #setLocation(String)} or via {@link HiveConstants#LOCATION}.
   * </p>
   *
   * @param path the path handed to the serde manager
   * @throws IOException if the serde manager fails to add the properties
   * @throws IllegalStateException if no {@link HiveSerDeManager} was specified
   */
  public void setSerDeProps(Path path) throws IOException {
    requireSerDeManager().addSerDeProperties(path, this);
  }

  /**
   * Set serde properties for a table/partition using another table/partition's serde properties.
   *
   * <p>
   * A benefit of doing this is to avoid obtaining the schema multiple times when creating a table and a partition
   * with the same schema, or creating several tables and partitions with the same schema. After the first
   * table/partition is created, one can use the same SerDe properties to create the other tables/partitions.
   * </p>
   *
   * @param other the table/partition whose serde properties are used
   * @throws IOException if the serde manager fails to add the properties
   * @throws IllegalStateException if no {@link HiveSerDeManager} was specified
   */
  public void setSerDeProps(HiveRegistrationUnit other) throws IOException {
    requireSerDeManager().addSerDeProperties(other, this);
  }

  /** Set the create time field directly. */
  public void setCreateTime(long createTime) {
    this.createTime = Optional.of(createTime);
  }

  /** Set the last access time field directly. */
  public void setLastAccessTime(long lastAccessTime) {
    this.lastAccessTime = Optional.of(lastAccessTime);
  }

  /** Set the location field directly. */
  public void setLocation(String location) {
    this.location = Optional.of(location);
  }

  /** Set the input format field directly. */
  public void setInputFormat(String inputFormat) {
    this.inputFormat = Optional.of(inputFormat);
  }

  /** Set the output format field directly. */
  public void setOutputFormat(String outputFormat) {
    this.outputFormat = Optional.of(outputFormat);
  }

  /** Set the compressed flag directly. */
  public void setCompressed(boolean isCompressed) {
    this.isCompressed = Optional.of(isCompressed);
  }

  /** Set the number of buckets directly. */
  public void setNumBuckets(int numBuckets) {
    this.numBuckets = Optional.of(numBuckets);
  }

  /** Set the bucket columns directly; the given list is copied into an immutable list. */
  public void setBucketColumns(List<String> bucketColumns) {
    this.bucketColumns = Optional.<List<String>> of(ImmutableList.<String> copyOf(bucketColumns));
  }

  /** Set the stored-as-sub-dirs flag directly. */
  public void setStoredAsSubDirs(boolean isStoredAsSubDirs) {
    this.isStoredAsSubDirs = Optional.of(isStoredAsSubDirs);
  }

  /** Set the serde type field directly. */
  public void setSerDeType(String serDeType) {
    this.serDeType = Optional.of(serDeType);
  }

  /**
   * Base builder for {@link HiveRegistrationUnit} and its subclasses.
   *
   * @param <T> the concrete builder type returned by the {@code with*} methods, so calls can be chained
   */
  static abstract class Builder<T extends Builder<?>> {
    private String dbName;
    private String tableName;
    private List<Column> columns = Lists.newArrayList();
    private State props = new State();
    private State storageProps = new State();
    private State serDeProps = new State();
    private Optional<HiveSerDeManager> serDeManager = Optional.absent();

    /** Set the database name (required, must be non-empty when the unit is constructed). */
    @SuppressWarnings("unchecked")
    public T withDbName(String dbName) {
      this.dbName = dbName;
      return (T) this;
    }

    /** Set the table name (required, must be non-empty when the unit is constructed). */
    @SuppressWarnings("unchecked")
    public T withTableName(String tableName) {
      this.tableName = tableName;
      return (T) this;
    }

    /** Set the columns; the list is held by reference until the unit is constructed. */
    @SuppressWarnings("unchecked")
    public T withColumns(List<Column> columns) {
      this.columns = columns;
      return (T) this;
    }

    /** Set the table/partition parameters; the state is held by reference until the unit is constructed. */
    @SuppressWarnings("unchecked")
    public T withProps(State props) {
      this.props = props;
      return (T) this;
    }

    /** Set the storage parameters; the state is held by reference until the unit is constructed. */
    @SuppressWarnings("unchecked")
    public T withStorageProps(State storageProps) {
      this.storageProps = storageProps;
      return (T) this;
    }

    /** Set the serde parameters; the state is held by reference until the unit is constructed. */
    @SuppressWarnings("unchecked")
    public T withSerdeProps(State serDeProps) {
      this.serDeProps = serDeProps;
      return (T) this;
    }

    /** Set the {@link HiveSerDeManager} used by the unit's {@code setSerDeProps} methods; must not be null. */
    @SuppressWarnings("unchecked")
    public T withSerdeManaager(HiveSerDeManager serDeManager) {
      this.serDeManager = Optional.of(serDeManager);
      return (T) this;
    }

    /** Build the table or partition described by this builder. */
    public abstract HiveRegistrationUnit build();
  }

  /**
   * A column of a Hive table or partition: its name, type and comment.
   */
  @AllArgsConstructor
  @Getter
  public static class Column {
    private final String name;
    private final String type;
    private final String comment;
  }
}