/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.data.management.conversion.hive.avro;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import lombok.extern.slf4j.Slf4j;
import org.apache.avro.Schema;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import com.google.common.hash.Hashing;
import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.State;
import gobblin.data.management.conversion.hive.query.HiveAvroORCQueryGenerator;
import gobblin.hive.avro.HiveAvroSerDeManager;
import gobblin.util.AvroUtils;
import gobblin.util.HadoopUtils;
/**
* Avro schema for a {@link Partition} or {@link Table} is available at multiple locations. This class is used to decide
* the schema to use. It also creates a temporary schema file on the {@link FileSystem}.
*
* <ul>
* 1. The {@link Schema} can be set as a literal in the serde info<br>
* 2. The {@link Schema} url can set as a property in the serde info<br>
* 3. The {@link Schema} can be inferred using the physical data location of the {@link Table} or {@link Partition}<br>
* </ul>
*
* Callers request for the schema url using {@link #getSchemaUrl(Partition)} or {@link #getSchemaUrl(Table)}.
*<ul>
* In case (1.), the literal is written as a {@link Schema} file under {@link #schemaDir}. The {@link Path} to this file
* is uses as the {@link Schema} url<br>
* In case (2.), the url itself is used as the {@link Schema} url<br>
* In case (3.), a {@link Schema} file is created under {@link #schemaDir} for {@link Schema} of latest data file.<br>
*</ul>
*
* In all three cases the mapping of {@link Schema} to temporary Schema file path is cached.
* If multiple {@link Partition}s have the same {@link Schema} a duplicate schema file in not created. Already existing
* {@link Schema} url for this {@link Schema} is used.
*/
@Slf4j
public class AvroSchemaManager {
private static final String HIVE_SCHEMA_TEMP_DIR_PATH_KEY = "hive.schema.dir";
private static final String DEFAULT_HIVE_SCHEMA_TEMP_DIR_PATH_KEY = "/tmp/gobblin_schemas";
private final FileSystem fs;
/**
* A mapping of {@link Schema} hash to its {@link Path} on {@link FileSystem}
*/
private final Map<String, Path> schemaPaths;
/**
* A temporary directory to hold all Schema files. The path is job id specific.
* Deleting it will not affect other job executions
*/
private final Path schemaDir;
public AvroSchemaManager(FileSystem fs, State state) {
this.fs = fs;
this.schemaPaths = Maps.newHashMap();
this.schemaDir = new Path(state.getProp(HIVE_SCHEMA_TEMP_DIR_PATH_KEY, DEFAULT_HIVE_SCHEMA_TEMP_DIR_PATH_KEY),
state.getProp(ConfigurationKeys.JOB_ID_KEY));
}
/**
* Get the url to <code>table</code>'s avro {@link Schema} file.
*
* @param table whose avro schema is to be returned
* @return a {@link Path} to table's avro {@link Schema} file.
*/
public Path getSchemaUrl(Table table) throws IOException {
return getSchemaUrl(table.getTTable().getSd());
}
/**
* Get the url to <code>partition</code>'s avro {@link Schema} file.
*
* @param partition whose avro schema is to be returned
* @return a {@link Path} to table's avro {@link Schema} file.
*/
public Path getSchemaUrl(Partition partition) throws IOException {
return getSchemaUrl(partition.getTPartition().getSd());
}
/**
* Delete the temporary {@link #schemaDir}
*/
public void cleanupTempSchemas() throws IOException {
HadoopUtils.deleteIfExists(this.fs, this.schemaDir, true);
}
public static Schema getSchemaFromUrl(Path schemaUrl, FileSystem fs) throws IOException {
return AvroUtils.parseSchemaFromFile(schemaUrl, fs);
}
private Path getSchemaUrl(StorageDescriptor sd) throws IOException {
String schemaString = StringUtils.EMPTY;
try {
// Try to fetch from SCHEMA URL
if (sd.getSerdeInfo().getParameters().containsKey(HiveAvroSerDeManager.SCHEMA_URL)) {
String schemaUrl = sd.getSerdeInfo().getParameters().get(HiveAvroSerDeManager.SCHEMA_URL);
if (schemaUrl.startsWith("http")) {
// Fetch schema literal via HTTP GET if scheme is http(s)
schemaString = IOUtils.toString(new URI(schemaUrl), StandardCharsets.UTF_8);
log.debug("Schema string is: " + schemaString);
Schema schema = HiveAvroORCQueryGenerator.readSchemaFromString(schemaString);
return getOrGenerateSchemaFile(schema);
} else {
// .. else fetch from HDFS or local filesystem
return new Path(sd.getSerdeInfo().getParameters().get(HiveAvroSerDeManager.SCHEMA_URL));
}
}
// Try to fetch from SCHEMA LITERAL
else if (sd.getSerdeInfo().getParameters().containsKey(HiveAvroSerDeManager.SCHEMA_LITERAL)) {
schemaString = sd.getSerdeInfo().getParameters().get(HiveAvroSerDeManager.SCHEMA_LITERAL);
log.debug("Schema string is: " + schemaString);
Schema schema = HiveAvroORCQueryGenerator.readSchemaFromString(schemaString);
return getOrGenerateSchemaFile(schema);
}
} catch (URISyntaxException e) {
log.error(String.format("Failed to parse schema from schema string. Falling back to HDFS schema: %s",
schemaString), e);
}
// Try to fetch from HDFS
Schema schema = AvroUtils.getDirectorySchema(new Path(sd.getLocation()), this.fs, true);
if (schema == null) {
throw new SchemaNotFoundException("Failed to get avro schema");
}
return getOrGenerateSchemaFile(schema);
}
/**
* If url for schema already exists, return the url. If not create a new temporary schema file and return a the url.
*/
private Path getOrGenerateSchemaFile(Schema schema) throws IOException {
Preconditions.checkNotNull(schema, "Avro Schema should not be null");
String hashedSchema = Hashing.sha256().hashString(schema.toString(), StandardCharsets.UTF_8).toString();
if (!this.schemaPaths.containsKey(hashedSchema)) {
Path schemaFilePath = new Path(this.schemaDir, String.valueOf(System.currentTimeMillis() + ".avsc"));
AvroUtils.writeSchemaToFile(schema, schemaFilePath, fs, true);
this.schemaPaths.put(hashedSchema, schemaFilePath);
}
return this.schemaPaths.get(hashedSchema);
}
}