/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.writer;
import java.util.Collections;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.fs.Path;
import gobblin.codec.StreamCodec;
import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.State;
import gobblin.util.AvroUtils;
import gobblin.util.ForkOperatorUtils;
import gobblin.util.WriterUtils;
/**
* An abstract {@link DataWriterBuilder} for building {@link DataWriter}s that write to
* {@link org.apache.hadoop.fs.FileSystem}s.
*
* @param <S> schema type
* @param <D> data record type
*
* @author Ziyang Liu
*/
public abstract class FsDataWriterBuilder<S, D> extends PartitionAwareDataWriterBuilder<S, D> {
public static final String WRITER_INCLUDE_PARTITION_IN_FILE_NAMES =
ConfigurationKeys.WRITER_PREFIX + ".include.partition.in.file.names";
public static final String WRITER_REPLACE_PATH_SEPARATORS_IN_PARTITIONS =
ConfigurationKeys.WRITER_PREFIX + ".replace.path.separators.in.partitions";
private List<StreamCodec> encoders;
/**
* Get the file name to be used by the writer. If a {@link gobblin.writer.partitioner.WriterPartioner} is used,
* the partition will be added as part of the file name.
*/
public String getFileName(State properties) {
String extension =
this.format.equals(WriterOutputFormat.OTHER) ? getExtension(properties) : this.format.getExtension();
String fileName = WriterUtils.getWriterFileName(properties, this.branches, this.branch, this.writerId, extension);
if (this.partition.isPresent()) {
fileName = getPartitionedFileName(properties, fileName);
}
List<StreamCodec> encoders = getEncoders();
if (!encoders.isEmpty()) {
StringBuilder filenameBuilder = new StringBuilder(fileName);
for (StreamCodec codec : encoders) {
filenameBuilder.append('.');
filenameBuilder.append(codec.getTag());
}
fileName = filenameBuilder.toString();
}
return fileName;
}
private static String getExtension(State properties) {
return properties.getProp(ConfigurationKeys.WRITER_OUTPUT_FORMAT_KEY, StringUtils.EMPTY);
}
protected String getPartitionPath(State properties) {
if (this.partition.isPresent()) {
boolean includePartitionerFieldNames = properties.getPropAsBoolean(ForkOperatorUtils
.getPropertyNameForBranch(WRITER_INCLUDE_PARTITION_IN_FILE_NAMES, this.branches, this.branch), false);
boolean removePathSeparators = properties.getPropAsBoolean(ForkOperatorUtils
.getPropertyNameForBranch(WRITER_REPLACE_PATH_SEPARATORS_IN_PARTITIONS, this.branches, this.branch), false);
return AvroUtils.serializeAsPath(this.partition.get(), includePartitionerFieldNames, removePathSeparators).toString();
} else {
return null;
}
}
protected String getPartitionedFileName(State properties, String originalFileName) {
return new Path(
getPartitionPath(properties),
originalFileName).toString();
}
@Override
public boolean validatePartitionSchema(Schema partitionSchema) {
return true;
}
/**
* Get list of encoders configured for the writer.
*/
public synchronized List<StreamCodec> getEncoders() {
if (encoders == null) {
encoders = buildEncoders();
}
return encoders;
}
/**
* Build and cache encoders for the writer based on configured options as encoder
* construction can potentially be expensive.
*/
protected List<StreamCodec> buildEncoders() {
// Should be overridden by subclasses if their associated writers are
// encoder aware
return Collections.emptyList();
}
}