/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.converter;
import com.google.common.base.Optional;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.WorkUnitState;
import gobblin.converter.filter.AvroProjectionConverter;
import gobblin.converter.filter.AvroSchemaFieldRemover;
import gobblin.metrics.kafka.KafkaSchemaRegistry;
import gobblin.metrics.kafka.KafkaSchemaRegistryFactory;
import gobblin.metrics.kafka.SchemaRegistryException;
import gobblin.util.AvroUtils;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.ExecutionException;
import javax.xml.bind.DatatypeConverter;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.Decoder;
import org.apache.avro.io.DecoderFactory;
/**
* A converter for extracting schema/records from an envelope schema.
* Input schema: envelope schema - must have fields payloadSchemaId (the schema registry key of the output
* schema) and payload (byte data for output record)
* Input record: record corresponding to input schema
* Output schema: schema obtained from schema registry using key provided in input record's {@link #PAYLOAD_SCHEMA_ID_FIELD}
* Output record: record corresponding to output schema obtained from input record's {@link #PAYLOAD_FIELD} as bytes
*/
public class EnvelopeSchemaConverter extends Converter<Schema, String, GenericRecord, GenericRecord> {
public static final String PAYLOAD_SCHEMA_ID_FIELD = "EnvelopeSchemaConverter.schemaIdField";
public static final String PAYLOAD_FIELD = "EnvelopeSchemaConverter.payloadField";
public static final String DEFAULT_PAYLOAD_SCHEMA_ID_FIELD ="payloadSchemaId";
public static final String DEFAULT_PAYLOAD_FIELD = "payload";
public static final String DEFAULT_KAFKA_SCHEMA_REGISTRY_FACTORY_CLASS = "gobblin.metrics.kafka.KafkaAvroSchemaRegistryFactory";
private Optional<AvroSchemaFieldRemover> fieldRemover;
private KafkaSchemaRegistry registry;
private DecoderFactory decoderFactory;
private LoadingCache<Schema, GenericDatumReader<GenericRecord>> readers;
/**
* To remove certain fields from the Avro schema or records of a topic/table, set property
* {topic/table name}.remove.fields={comma-separated, fully qualified field names} in workUnit.
*/
@Override
public EnvelopeSchemaConverter init(WorkUnitState workUnit) {
if (workUnit.contains(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY)) {
String removeFieldsPropName = workUnit.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY) + AvroProjectionConverter.REMOVE_FIELDS;
if (workUnit.contains(removeFieldsPropName)) {
this.fieldRemover = Optional.of(new AvroSchemaFieldRemover(workUnit.getProp(removeFieldsPropName)));
} else {
this.fieldRemover = Optional.absent();
}
}
String registryFactoryField = workUnit.contains(KafkaSchemaRegistryFactory.KAFKA_SCHEMA_REGISTRY_FACTORY_CLASS) ?
workUnit.getProp(KafkaSchemaRegistryFactory.KAFKA_SCHEMA_REGISTRY_FACTORY_CLASS) : DEFAULT_KAFKA_SCHEMA_REGISTRY_FACTORY_CLASS;
try {
KafkaSchemaRegistryFactory registryFactory = ((Class<? extends KafkaSchemaRegistryFactory>) Class.forName(registryFactoryField)).newInstance();
this.registry = registryFactory.create(workUnit.getProperties());
} catch (ClassNotFoundException | IllegalAccessException | InstantiationException e) {
return null;
}
this.decoderFactory = DecoderFactory.get();
this.readers = CacheBuilder.newBuilder().build(new CacheLoader<Schema, GenericDatumReader<GenericRecord>>() {
@Override
public GenericDatumReader<GenericRecord> load(final Schema key) throws Exception {
return new GenericDatumReader<>(key);
}
});
return this;
}
/**
* Do nothing, actual schema must be obtained from records.
*/
@Override
public String convertSchema(Schema inputSchema, WorkUnitState workUnit) throws SchemaConversionException {
return EnvelopeSchemaConverter.class.getName();
}
/**
* Get actual schema from registry and deserialize payload using it.
*/
@Override
public Iterable<GenericRecord> convertRecord(String outputSchema, GenericRecord inputRecord, WorkUnitState workUnit)
throws DataConversionException {
try {
String schemaIdField = workUnit.contains(PAYLOAD_SCHEMA_ID_FIELD) ?
workUnit.getProp(PAYLOAD_SCHEMA_ID_FIELD) : DEFAULT_PAYLOAD_SCHEMA_ID_FIELD;
String payloadField = workUnit.contains(PAYLOAD_FIELD) ?
workUnit.getProp(PAYLOAD_FIELD) : DEFAULT_PAYLOAD_FIELD;
String schemaKey = String.valueOf(inputRecord.get(schemaIdField));
Schema payloadSchema = (Schema) this.registry.getSchemaByKey(schemaKey);
byte[] payload = getPayload(inputRecord, payloadField);
GenericRecord outputRecord = deserializePayload(payload, payloadSchema);
if (this.fieldRemover.isPresent()) {
payloadSchema = this.fieldRemover.get().removeFields(payloadSchema);
}
return new SingleRecordIterable<>(AvroUtils.convertRecordSchema(outputRecord, payloadSchema));
} catch (IOException | SchemaRegistryException | ExecutionException e) {
throw new DataConversionException(e);
}
}
/**
* Get payload field from GenericRecord and convert to byte array
*/
public byte[] getPayload(GenericRecord inputRecord, String payloadFieldName) {
ByteBuffer bb = (ByteBuffer) inputRecord.get(payloadFieldName);
byte[] payloadBytes;
if (bb.hasArray()) {
payloadBytes = bb.array();
} else {
payloadBytes = new byte[bb.remaining()];
bb.get(payloadBytes);
}
String hexString = new String(payloadBytes, StandardCharsets.UTF_8);
return DatatypeConverter.parseHexBinary(hexString);
}
/**
* Deserialize payload using payload schema
*/
public GenericRecord deserializePayload(byte[] payload, Schema payloadSchema) throws IOException, ExecutionException {
Decoder decoder = this.decoderFactory.binaryDecoder(payload, null);
GenericDatumReader<GenericRecord> reader = this.readers.get(payloadSchema);
return reader.read(null, decoder);
}
}