/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.writer;

import java.io.IOException;
import java.io.OutputStream;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.io.Closer;

import gobblin.codec.StreamCodec;
import gobblin.commit.SpeculativeAttemptAwareConstruct;
import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.State;
import gobblin.metadata.types.GlobalMetadata;
import gobblin.util.FinalState;
import gobblin.util.ForkOperatorUtils;
import gobblin.util.HadoopUtils;
import gobblin.util.JobConfigurationUtils;
import gobblin.util.WriterUtils;
import gobblin.util.recordcount.IngestionRecordCountProvider;

/**
* An abstract {@link DataWriter} implementation that sets up the staging/output
* directories, creates the {@link FileSystem} instance, and handles committing the
* staging file to the final output location.
*
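* <p>
* A minimal sketch of a concrete subclass (the {@code SimpleStringWriter} name is
* hypothetical and the remaining abstract methods are omitted for brevity):
* <pre>{@code
* public class SimpleStringWriter extends FsDataWriter<String> {
*   private final OutputStream out;
*
*   public SimpleStringWriter(FsDataWriterBuilder<?, ?> builder, State state) throws IOException {
*     super(builder, state);
*     this.out = createStagingFileOutputStream(); // records go to the staging file first
*   }
*
*   public void write(String record) throws IOException { // implements DataWriter.write
*     this.out.write(record.getBytes(StandardCharsets.UTF_8));
*   }
* }
* }</pre>
*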
* @author akshay@nerdwallet.com
*/
public abstract class FsDataWriter<D> implements DataWriter<D>, FinalState, MetadataAwareWriter, SpeculativeAttemptAwareConstruct {
private static final Logger LOG = LoggerFactory.getLogger(FsDataWriter.class);
public static final String WRITER_INCLUDE_RECORD_COUNT_IN_FILE_NAMES =
ConfigurationKeys.WRITER_PREFIX + ".include.record.count.in.file.names";
protected final State properties;
protected final String id;
protected final int numBranches;
protected final int branchId;
protected final String fileName;
protected final FileSystem fs;
protected final Path stagingFile;
private final GlobalMetadata defaultMetadata;
protected Path outputFile;
protected final String allOutputFilesPropName;
protected final boolean shouldIncludeRecordCountInFileName;
protected final int bufferSize;
protected final short replicationFactor;
protected final long blockSize;
protected final FsPermission filePermission;
protected final FsPermission dirPermission;
protected final Optional<String> group;
protected final Closer closer = Closer.create();
protected final Optional<String> writerAttemptIdOptional;
protected Optional<Long> bytesWritten;
private final List<StreamCodec> encoders;
public FsDataWriter(FsDataWriterBuilder<?, ?> builder, State properties) throws IOException {
this.properties = properties;
this.id = builder.getWriterId();
this.numBranches = builder.getBranches();
this.branchId = builder.getBranch();
this.fileName = builder.getFileName(properties);
this.writerAttemptIdOptional = Optional.fromNullable(builder.getWriterAttemptId());
this.encoders = builder.getEncoders();
Configuration conf = new Configuration();
// Add all job configuration properties so they are picked up by Hadoop
JobConfigurationUtils.putStateIntoConfiguration(properties, conf);
this.fs = WriterUtils.getWriterFS(properties, this.numBranches, this.branchId);
// Initialize the staging and output file paths
Path writerStagingDir = this.writerAttemptIdOptional.isPresent() ? WriterUtils
.getWriterStagingDir(properties, this.numBranches, this.branchId, this.writerAttemptIdOptional.get())
: WriterUtils.getWriterStagingDir(properties, this.numBranches, this.branchId);
this.stagingFile = new Path(writerStagingDir, this.fileName);
this.outputFile =
new Path(WriterUtils.getWriterOutputDir(properties, this.numBranches, this.branchId), this.fileName);
this.allOutputFilesPropName = ForkOperatorUtils
.getPropertyNameForBranch(ConfigurationKeys.WRITER_FINAL_OUTPUT_FILE_PATHS, this.numBranches, this.branchId);
// Delete the staging file if it already exists, which can happen if the task failed
// and the staging file didn't get cleaned up for some reason. Deleting the staging
// file prevents a task retry from being blocked.
if (this.fs.exists(this.stagingFile)) {
LOG.warn(String.format("Task staging file %s already exists, deleting it", this.stagingFile));
HadoopUtils.deletePath(this.fs, this.stagingFile, false);
}
this.shouldIncludeRecordCountInFileName = properties.getPropAsBoolean(ForkOperatorUtils
.getPropertyNameForBranch(WRITER_INCLUDE_RECORD_COUNT_IN_FILE_NAMES, this.numBranches, this.branchId), false);
this.bufferSize = properties.getPropAsInt(ForkOperatorUtils
.getPropertyNameForBranch(ConfigurationKeys.WRITER_BUFFER_SIZE, this.numBranches, this.branchId),
ConfigurationKeys.DEFAULT_BUFFER_SIZE);
this.replicationFactor = properties.getPropAsShort(ForkOperatorUtils
.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_REPLICATION_FACTOR, this.numBranches, this.branchId),
this.fs.getDefaultReplication(this.outputFile));
this.blockSize = properties.getPropAsLong(ForkOperatorUtils
.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_BLOCK_SIZE, this.numBranches, this.branchId),
this.fs.getDefaultBlockSize(this.outputFile));
this.filePermission = HadoopUtils.deserializeWriterFilePermissions(properties, this.numBranches, this.branchId);
this.dirPermission = HadoopUtils.deserializeWriterDirPermissions(properties, this.numBranches, this.branchId);
this.group = Optional.fromNullable(properties.getProp(ForkOperatorUtils
.getPropertyNameForBranch(ConfigurationKeys.WRITER_GROUP_NAME, this.numBranches, this.branchId)));
// Create the parent directory of the output file if it does not exist
WriterUtils.mkdirsWithRecursivePermission(this.fs, this.outputFile.getParent(), this.dirPermission);
this.bytesWritten = Optional.absent();
this.defaultMetadata = new GlobalMetadata();
for (StreamCodec c : getEncoders()) {
this.defaultMetadata.addTransferEncoding(c.getTag());
}
String partitionPath = builder.getPartitionPath(properties);
if (partitionPath != null) {
properties.setProp(ConfigurationKeys.WRITER_PARTITION_PATH_KEY + builder.getWriterId(), partitionPath);
}
}
/**
* Create the staging output file and an {@link OutputStream} to write to the file.
*
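* <p>
* Configured encoders each wrap the stream returned by the previous step, so records
* are transformed by the encoders in list order before reaching the file. A sketch,
* assuming two hypothetical codecs:
* <pre>{@code
* // encoders = [compressor, encryptor]
* // resulting stream: caller -> compressor -> encryptor -> staging file,
* // i.e. records are compressed first, then encrypted, then written out
* }</pre>
*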
* @return an {@link OutputStream} to write to the staging file
* @throws IOException if it fails to create the file and the {@link OutputStream}
*/
protected OutputStream createStagingFileOutputStream()
throws IOException {
OutputStream out = this.fs
.create(this.stagingFile, this.filePermission, true, this.bufferSize, this.replicationFactor, this.blockSize,
null);
// Attach encoders in reverse order so that the first encoder in the list becomes the
// outermost wrapper; records are then transformed by the encoders in list order
// before reaching the underlying file
for (StreamCodec encoder : Lists.reverse(getEncoders())) {
out = encoder.encodeOutputStream(out);
}
return this.closer.register(out);
}
/**
* Set the group name of the staging output file.
*
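* <p>
* The group name comes from the branch-aware {@link ConfigurationKeys#WRITER_GROUP_NAME}
* property; when that property is absent, this method is a no-op.
*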
* @throws IOException if it fails to set the group name
*/
protected void setStagingFileGroup()
throws IOException {
Preconditions.checkArgument(this.fs.exists(this.stagingFile),
String.format("Staging output file %s does not exist", this.stagingFile));
if (this.group.isPresent()) {
HadoopUtils.setGroup(this.fs, this.stagingFile, this.group.get());
}
}
protected List<StreamCodec> getEncoders() {
return encoders;
}
public GlobalMetadata getDefaultMetadata() {
return defaultMetadata;
}
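/**
* Get the number of bytes written by this writer.
*
* <p>
* The byte count is captured from the staging file's length during {@link #commit()},
* so this method returns 0 until the writer has been committed.
* </p>
*
* @return the number of bytes written, or 0 if the count is not yet available
*/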
@Override
public long bytesWritten()
throws IOException {
if (this.bytesWritten.isPresent()) {
return this.bytesWritten.get().longValue();
}
return 0L;
}
/**
* {@inheritDoc}.
*
* <p>
* This default implementation simply renames the staging file to the output file. If the output file
* already exists, it will delete it first before doing the renaming.
* </p>
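* <p>
* For example, with purely illustrative paths, a staging file
* {@code /tmp/job/staging/part.0001} is renamed to the output file
* {@code /data/job/output/part.0001}, replacing any leftover output file from a
* previous failed attempt.
* </p>
*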
*
* @throws IOException if any file operation fails
*/
@Override
public void commit()
throws IOException {
this.closer.close();
setStagingFileGroup();
if (!this.fs.exists(this.stagingFile)) {
throw new IOException(String.format("File %s does not exist", this.stagingFile));
}
FileStatus stagingFileStatus = this.fs.getFileStatus(this.stagingFile);
// Double check permission of staging file
if (!stagingFileStatus.getPermission().equals(this.filePermission)) {
this.fs.setPermission(this.stagingFile, this.filePermission);
}
this.bytesWritten = Optional.of(Long.valueOf(stagingFileStatus.getLen()));
LOG.info(String.format("Moving data from %s to %s", this.stagingFile, this.outputFile));
// For the same reason as deleting the staging file if it already exists, deleting
// the output file if it already exists prevents task retry from being blocked.
if (this.fs.exists(this.outputFile)) {
LOG.warn(String.format("Task output file %s already exists", this.outputFile));
HadoopUtils.deletePath(this.fs, this.outputFile, false);
}
HadoopUtils.renamePath(this.fs, this.stagingFile, this.outputFile);
}
/**
* {@inheritDoc}.
*
* <p>
* This default implementation simply deletes the staging file if it exists.
* </p>
*
* @throws IOException if deletion of the staging file fails
*/
@Override
public void cleanup()
throws IOException {
// Delete the staging file
if (this.fs.exists(this.stagingFile)) {
HadoopUtils.deletePath(this.fs, this.stagingFile, false);
}
}
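/**
* {@inheritDoc}
*
* <p>
* In addition to closing the underlying stream, this records the final output file path
* (optionally renamed to include the record count) under the
* {@link ConfigurationKeys#WRITER_FINAL_OUTPUT_FILE_PATHS} property for this branch.
* </p>
*/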
@Override
public void close()
throws IOException {
this.closer.close();
if (this.shouldIncludeRecordCountInFileName) {
String filePathWithRecordCount = addRecordCountToFileName();
this.properties.appendToSetProp(this.allOutputFilesPropName, filePathWithRecordCount);
} else {
this.properties.appendToSetProp(this.allOutputFilesPropName, getOutputFilePath());
}
}
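/**
* Rename the output file so that its name includes the number of records written, using
* the naming scheme defined by {@link IngestionRecordCountProvider}, and update
* {@link #outputFile} to point at the renamed path.
*
* @return the output file path with the record count embedded in it
* @throws IOException if the rename fails
*/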
private synchronized String addRecordCountToFileName()
throws IOException {
String filePath = getOutputFilePath();
String filePathWithRecordCount = IngestionRecordCountProvider.constructFilePath(filePath, recordsWritten());
LOG.info("Renaming " + filePath + " to " + filePathWithRecordCount);
HadoopUtils.renamePath(this.fs, new Path(filePath), new Path(filePathWithRecordCount));
this.outputFile = new Path(filePathWithRecordCount);
return filePathWithRecordCount;
}
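/**
* {@inheritDoc}
*
* <p>
* The returned {@link State} carries the {@code RecordsWritten} and {@code BytesWritten}
* counters; {@code BytesWritten} is omitted when the writer cannot report it.
* </p>
*/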
@Override
public State getFinalState() {
State state = new State();
state.setProp("RecordsWritten", recordsWritten());
try {
state.setProp("BytesWritten", bytesWritten());
} catch (Exception exception) {
// If the writer fails to return bytesWritten, the method may be unimplemented or
// implemented incorrectly. Omit the property instead of failing.
}
return state;
}
/**
* Get the output file path.
*
* @return the output file path
*/
public String getOutputFilePath() {
return this.outputFile.toString();
}
/**
* Get the fully-qualified output file path.
*
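* <p>
* For example, {@code /data/output/part.0001} may become
* {@code hdfs://namenode:8020/data/output/part.0001} (host and port are illustrative).
*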
* @return the fully-qualified output file path
*/
public String getFullyQualifiedOutputFilePath() {
return this.fs.makeQualified(this.outputFile).toString();
}
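/**
* {@inheritDoc}
*
* <p>
* This base implementation is speculative-attempt safe only when a writer attempt id is
* present, so that each attempt stages its output in its own directory, and only for
* {@link FsDataWriter} itself; subclasses must opt in by overriding this method.
* </p>
*/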
@Override
public boolean isSpeculativeAttemptSafe() {
return this.writerAttemptIdOptional.isPresent() && this.getClass() == FsDataWriter.class;
}
}