/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.hive.avro;
import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import com.google.common.base.Preconditions;
import gobblin.annotation.Alpha;
import gobblin.configuration.State;
import gobblin.hive.HiveRegistrationUnit;
import gobblin.hive.HiveSerDeManager;
import gobblin.hive.HiveSerDeWrapper;
import gobblin.util.AvroUtils;
import gobblin.util.HadoopUtils;
import lombok.extern.slf4j.Slf4j;
/**
* A {@link HiveSerDeManager} for registering Avro tables and partitions.
*
* @author Ziyang Liu
*/
@Slf4j
@Alpha
public class HiveAvroSerDeManager extends HiveSerDeManager {
public static final String SCHEMA_LITERAL = "avro.schema.literal";
public static final String SCHEMA_URL = "avro.schema.url";
public static final String USE_SCHEMA_FILE = "use.schema.file";
public static final boolean DEFAULT_USE_SCHEMA_FILE = false;
public static final String SCHEMA_FILE_NAME = "schema.file.name";
public static final String DEFAULT_SCHEMA_FILE_NAME = "_schema.avsc";
public static final String SCHEMA_LITERAL_LENGTH_LIMIT = "schema.literal.length.limit";
public static final int DEFAULT_SCHEMA_LITERAL_LENGTH_LIMIT = 4000;
protected final FileSystem fs;
protected final boolean useSchemaFile;
protected final String schemaFileName;
protected final int schemaLiteralLengthLimit;
protected final HiveSerDeWrapper serDeWrapper = HiveSerDeWrapper.get("AVRO");
public HiveAvroSerDeManager(State props) throws IOException {
super(props);
this.fs = FileSystem.get(HadoopUtils.getConfFromState(props));
this.useSchemaFile = props.getPropAsBoolean(USE_SCHEMA_FILE, DEFAULT_USE_SCHEMA_FILE);
this.schemaFileName = props.getProp(SCHEMA_FILE_NAME, DEFAULT_SCHEMA_FILE_NAME);
this.schemaLiteralLengthLimit =
props.getPropAsInt(SCHEMA_LITERAL_LENGTH_LIMIT, DEFAULT_SCHEMA_LITERAL_LENGTH_LIMIT);
}
/**
* Add an Avro {@link Schema} to the given {@link HiveRegistrationUnit}.
*
* <p>
* If {@link #USE_SCHEMA_FILE} is true, the schema will be added via {@link #SCHEMA_URL} pointing to
* the schema file named {@link #SCHEMA_FILE_NAME}.
* </p>
*
* <p>
* If {@link #USE_SCHEMA_FILE} is false, the schema will be obtained by {@link #getDirectorySchema(Path)}.
* If the length of the schema is less than {@link #SCHEMA_LITERAL_LENGTH_LIMIT}, it will be added via
* {@link #SCHEMA_LITERAL}. Otherwise, the schema will be written to {@link #SCHEMA_FILE_NAME} and added
* via {@link #SCHEMA_URL}.
* </p>
*/
@Override
public void addSerDeProperties(Path path, HiveRegistrationUnit hiveUnit) throws IOException {
hiveUnit.setSerDeType(this.serDeWrapper.getSerDe().getClass().getName());
hiveUnit.setInputFormat(this.serDeWrapper.getInputFormatClassName());
hiveUnit.setOutputFormat(this.serDeWrapper.getOutputFormatClassName());
addSchemaProperties(path, hiveUnit);
}
@Override
public void addSerDeProperties(HiveRegistrationUnit source, HiveRegistrationUnit target) throws IOException {
if (source.getSerDeType().isPresent()) {
target.setSerDeType(source.getSerDeType().get());
}
if (source.getInputFormat().isPresent()) {
target.setInputFormat(source.getInputFormat().get());
}
if (source.getOutputFormat().isPresent()) {
target.setOutputFormat(source.getOutputFormat().get());
}
if (source.getSerDeProps().contains(SCHEMA_LITERAL)) {
target.setSerDeProp(SCHEMA_LITERAL, source.getSerDeProps().getProp(SCHEMA_LITERAL));
}
if (source.getSerDeProps().contains(SCHEMA_URL)) {
target.setSerDeProp(SCHEMA_URL, source.getSerDeProps().getProp(SCHEMA_URL));
}
}
private void addSchemaProperties(Path path, HiveRegistrationUnit hiveUnit) throws IOException {
Preconditions.checkArgument(this.fs.getFileStatus(path).isDirectory(), path + " is not a directory.");
Path schemaFile = new Path(path, this.schemaFileName);
if (this.useSchemaFile) {
hiveUnit.setSerDeProp(SCHEMA_URL, schemaFile.toString());
} else {
Schema schema = getDirectorySchema(path);
addSchemaFromAvroFile(schema, schemaFile, hiveUnit);
}
}
/**
* Get schema for a directory using {@link AvroUtils#getDirectorySchema(Path, FileSystem, boolean)}.
*/
protected Schema getDirectorySchema(Path directory) throws IOException {
return AvroUtils.getDirectorySchema(directory, this.fs, true);
}
/**
* Add a {@link Schema} obtained from an Avro data file to the given {@link HiveRegistrationUnit}.
*
* <p>
* If the length of the schema is less than {@link #SCHEMA_LITERAL_LENGTH_LIMIT}, it will be added via
* {@link #SCHEMA_LITERAL}. Otherwise, the schema will be written to {@link #SCHEMA_FILE_NAME} and added
* via {@link #SCHEMA_URL}.
* </p>
*/
protected void addSchemaFromAvroFile(Schema schema, Path schemaFile, HiveRegistrationUnit hiveUnit)
throws IOException {
Preconditions.checkNotNull(schema);
String schemaStr = schema.toString();
if (schemaStr.length() <= this.schemaLiteralLengthLimit) {
hiveUnit.setSerDeProp(SCHEMA_LITERAL, schema.toString());
} else {
AvroUtils.writeSchemaToFile(schema, schemaFile, this.fs, true);
log.info("Using schema file " + schemaFile.toString());
hiveUnit.setSerDeProp(SCHEMA_URL, schemaFile.toString());
}
}
@Override
public void updateSchema(HiveRegistrationUnit existingUnit, HiveRegistrationUnit newUnit) throws IOException {
Preconditions.checkArgument(
newUnit.getSerDeProps().contains(SCHEMA_LITERAL) || newUnit.getSerDeProps().contains(SCHEMA_URL));
if (newUnit.getSerDeProps().contains(SCHEMA_LITERAL)) {
existingUnit.setSerDeProp(SCHEMA_LITERAL, newUnit.getSerDeProps().getProp(SCHEMA_LITERAL));
} else {
existingUnit.setSerDeProp(SCHEMA_URL, newUnit.getSerDeProps().getProp(SCHEMA_URL));
}
}
@Override
public boolean haveSameSchema(HiveRegistrationUnit unit1, HiveRegistrationUnit unit2) {
if (unit1.getSerDeProps().contains(HiveAvroSerDeManager.SCHEMA_LITERAL)
&& unit2.getSerDeProps().contains(HiveAvroSerDeManager.SCHEMA_LITERAL)) {
return unit1.getSerDeProps().getProp(HiveAvroSerDeManager.SCHEMA_LITERAL)
.equals(unit2.getSerDeProps().getProp(HiveAvroSerDeManager.SCHEMA_LITERAL));
} else if (unit1.getSerDeProps().contains(HiveAvroSerDeManager.SCHEMA_URL)
&& unit2.getSerDeProps().contains(HiveAvroSerDeManager.SCHEMA_URL)) {
return unit1.getSerDeProps().getProp(HiveAvroSerDeManager.SCHEMA_URL)
.equals(unit2.getSerDeProps().getProp(HiveAvroSerDeManager.SCHEMA_URL));
}
return false;
}
}