/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package gobblin.util.io;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.WritableByteChannel;
import java.util.zip.GZIPInputStream;

import com.google.common.base.Optional;
import com.google.common.base.Preconditions;

import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
import org.apache.commons.compress.utils.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PositionedReadable;
import org.apache.hadoop.fs.Seekable;

import gobblin.configuration.ConfigurationKeys;

/**
* Utility class with helper methods for working with input and output streams.
*/
public class StreamUtils {
/**
* Convert an instance of {@link InputStream} to a {@link FSDataInputStream} that is {@link Seekable} and
* {@link PositionedReadable}.
*
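* <p>
* A minimal usage sketch (the byte array and variable names are illustrative placeholders):
* </p>
* <pre>{@code
* InputStream rawStream = new ByteArrayInputStream(new byte[] { 1, 2, 3 });
* FSDataInputStream seekableStream = StreamUtils.convertStream(rawStream);
* seekableStream.seek(1); // seeking is now supported
* }</pre>
*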
* @see SeekableFSInputStream
*
*/
public static FSDataInputStream convertStream(InputStream in) throws IOException {
return new FSDataInputStream(new SeekableFSInputStream(in));
}
/**
* Copies an {@link InputStream} to an {@link OutputStream} using {@link Channels}.
*
* <p>
* <b>Note:</b> This method does not close the {@link InputStream} or the {@link OutputStream}. However, the
* {@link ReadableByteChannel} and {@link WritableByteChannel} are closed.
* </p>
*
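* <p>
* Illustrative usage (file names are placeholders); since this method does not close the streams, the caller
* does so here via try-with-resources:
* </p>
* <pre>{@code
* try (InputStream in = new FileInputStream("input.dat");
*     OutputStream out = new FileOutputStream("output.dat")) {
*   long bytesCopied = StreamUtils.copy(in, out);
* }
* }</pre>
*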
* @return Total bytes copied
*/
public static long copy(InputStream is, OutputStream os) throws IOException {
return new StreamCopier(is, os).copy();
}
/**
* Copies a {@link ReadableByteChannel} to a {@link WritableByteChannel}.
* <p>
* <b>Note:</b> The {@link ReadableByteChannel} and {@link WritableByteChannel} are NOT closed by this method.
* </p>
*
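* <p>
* Illustrative usage (the channels here come from {@code FileInputStream}/{@code FileOutputStream}, which is
* just one way to obtain channels; file names are placeholders):
* </p>
* <pre>{@code
* try (FileInputStream in = new FileInputStream("in.bin");
*     FileOutputStream out = new FileOutputStream("out.bin")) {
*   long bytesCopied = StreamUtils.copy(in.getChannel(), out.getChannel());
* }
* }</pre>
*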
* @return Total bytes copied
*/
public static long copy(ReadableByteChannel inputChannel, WritableByteChannel outputChannel) throws IOException {
return new StreamCopier(inputChannel, outputChannel).copy();
}
/**
* Creates a tar gzip file using a given {@link Path} as input and a given {@link Path} as a destination. If the given
* input is a file, then only that file will be added to the tarball. If it is a directory, then the entire directory will
* be recursively put into the tarball.
*
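* <p>
* A usage sketch, assuming a local {@link FileSystem} and illustrative placeholder paths:
* </p>
* <pre>{@code
* FileSystem fs = FileSystem.getLocal(new Configuration());
* StreamUtils.tar(fs, new Path("/tmp/input-dir"), new Path("/tmp/input-dir.tar.gz"));
* }</pre>
*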
* @param fs the {@link FileSystem} on which the input exists and to which the output should be written.
* @param sourcePath the {@link Path} of the input; this can be either a file or a directory.
* @param destPath the {@link Path} the tarball should be written to.
*/
public static void tar(FileSystem fs, Path sourcePath, Path destPath) throws IOException {
tar(fs, fs, sourcePath, destPath);
}
/**
* Similar to {@link #tar(FileSystem, Path, Path)}, except the source and destination {@link FileSystem}s can be different.
*
* @see #tar(FileSystem, Path, Path)
*/
public static void tar(FileSystem sourceFs, FileSystem destFs, Path sourcePath, Path destPath) throws IOException {
try (FSDataOutputStream fsDataOutputStream = destFs.create(destPath);
TarArchiveOutputStream tarArchiveOutputStream = new TarArchiveOutputStream(
new GzipCompressorOutputStream(fsDataOutputStream), ConfigurationKeys.DEFAULT_CHARSET_ENCODING.name())) {
FileStatus fileStatus = sourceFs.getFileStatus(sourcePath);
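// A directory is archived recursively; a single file becomes the only entry in the tarball.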
if (sourceFs.isDirectory(sourcePath)) {
dirToTarArchiveOutputStreamRecursive(fileStatus, sourceFs, Optional.<Path> absent(), tarArchiveOutputStream);
} else {
try (FSDataInputStream fsDataInputStream = sourceFs.open(sourcePath)) {
fileToTarArchiveOutputStream(fileStatus, fsDataInputStream, new Path(sourcePath.getName()),
tarArchiveOutputStream);
}
}
}
}
/**
* Helper method for {@link #tar(FileSystem, FileSystem, Path, Path)} that recursively adds a directory to a given
* {@link TarArchiveOutputStream}.
*/
private static void dirToTarArchiveOutputStreamRecursive(FileStatus dirFileStatus, FileSystem fs,
Optional<Path> destDir, TarArchiveOutputStream tarArchiveOutputStream) throws IOException {
Preconditions.checkState(fs.isDirectory(dirFileStatus.getPath()));
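// Build this directory's entry path relative to the root of the tarball.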
Path dir = destDir.isPresent() ? new Path(destDir.get(), dirFileStatus.getPath().getName())
: new Path(dirFileStatus.getPath().getName());
dirToTarArchiveOutputStream(dir, tarArchiveOutputStream);
for (FileStatus childFileStatus : fs.listStatus(dirFileStatus.getPath())) {
Path childFile = new Path(dir, childFileStatus.getPath().getName());
if (fs.isDirectory(childFileStatus.getPath())) {
dirToTarArchiveOutputStreamRecursive(childFileStatus, fs, Optional.of(childFile), tarArchiveOutputStream);
} else {
try (FSDataInputStream fsDataInputStream = fs.open(childFileStatus.getPath())) {
fileToTarArchiveOutputStream(childFileStatus, fsDataInputStream, childFile, tarArchiveOutputStream);
}
}
}
}
/**
* Helper method for {@link #tar(FileSystem, FileSystem, Path, Path)} that adds a directory entry to a given
* {@link TarArchiveOutputStream}.
*/
private static void dirToTarArchiveOutputStream(Path destDir, TarArchiveOutputStream tarArchiveOutputStream)
throws IOException {
TarArchiveEntry tarArchiveEntry = new TarArchiveEntry(formatPathToDir(destDir));
tarArchiveEntry.setModTime(System.currentTimeMillis());
tarArchiveOutputStream.putArchiveEntry(tarArchiveEntry);
tarArchiveOutputStream.closeArchiveEntry();
}
/**
* Helper method for {@link #tar(FileSystem, FileSystem, Path, Path)} that adds a file entry to a given
* {@link TarArchiveOutputStream} and copies the contents of the file to the new entry.
*/
private static void fileToTarArchiveOutputStream(FileStatus fileStatus, FSDataInputStream fsDataInputStream,
Path destFile, TarArchiveOutputStream tarArchiveOutputStream) throws IOException {
TarArchiveEntry tarArchiveEntry = new TarArchiveEntry(formatPathToFile(destFile));
tarArchiveEntry.setSize(fileStatus.getLen());
tarArchiveEntry.setModTime(System.currentTimeMillis());
tarArchiveOutputStream.putArchiveEntry(tarArchiveEntry);
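// Always close the archive entry, even if the copy fails, so the TarArchiveOutputStream stays consistent.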
try {
IOUtils.copy(fsDataInputStream, tarArchiveOutputStream);
} finally {
tarArchiveOutputStream.closeArchiveEntry();
}
}
/**
* Convert a {@link Path} to a {@link String} and make sure it is properly formatted to be recognized as a directory
* by {@link TarArchiveEntry}.
*/
private static String formatPathToDir(Path path) {
return path.toString().endsWith(Path.SEPARATOR) ? path.toString() : path.toString() + Path.SEPARATOR;
}
/**
* Convert a {@link Path} to a {@link String} and make sure it is properly formatted to be recognized as a file
* by {@link TarArchiveEntry}.
*/
private static String formatPathToFile(Path path) {
return StringUtils.removeEnd(path.toString(), Path.SEPARATOR);
}
/**
* Determines whether a byte array holds GZip-compressed data. The java.util.zip GZip
* implementation does not expose the GZip header, so it is difficult to determine
* whether a byte array is compressed without inspecting the magic bytes directly.
* Copied from Helix GZipCompressionUtil.
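*
* <p>
* For example, the check looks only at the two leading GZip magic bytes (the sample arrays are illustrative):
* </p>
* <pre>{@code
* StreamUtils.isCompressed(new byte[] { (byte) 0x1f, (byte) 0x8b }); // true: GZip magic bytes
* StreamUtils.isCompressed(new byte[] { 0x00, 0x01 });               // false
* }</pre>
*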
* @param bytes an array of bytes
* @return true if the array is compressed or false otherwise
*/
public static boolean isCompressed(byte[] bytes) {
if ((bytes == null) || (bytes.length < 2)) {
return false;
} else {
return ((bytes[0] == (byte) (GZIPInputStream.GZIP_MAGIC)) &&
(bytes[1] == (byte) (GZIPInputStream.GZIP_MAGIC >> 8)));
}
}
/**
* Reads the full contents of a ByteBuffer and writes them to an OutputStream. The ByteBuffer is
* consumed by this operation; e.g., in.remaining() will be 0 after it completes successfully.
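* <p>
* Illustrative usage (the buffer contents are placeholders):
* </p>
* <pre>{@code
* ByteBuffer buffer = ByteBuffer.wrap("hello".getBytes(StandardCharsets.UTF_8));
* ByteArrayOutputStream baos = new ByteArrayOutputStream();
* StreamUtils.byteBufferToOutputStream(buffer, baos);
* // the buffer has been consumed: buffer.remaining() == 0
* }</pre>
*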
* @param in ByteBuffer to write into the OutputStream
* @param out Destination stream
* @throws IOException If there is an error writing into the OutputStream
*/
public static void byteBufferToOutputStream(ByteBuffer in, OutputStream out)
throws IOException {
final int BUF_SIZE = 8192;
if (in.hasArray()) {
// Heap-backed buffer: write the backing array directly, then advance the position so the buffer is consumed.
out.write(in.array(), in.arrayOffset() + in.position(), in.remaining());
in.position(in.limit());
} else {
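// Non-array-backed (e.g. direct) buffers are drained through a bounded temporary array.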
final byte[] b = new byte[Math.min(in.remaining(), BUF_SIZE)];
while (in.remaining() > 0) {
int bytesToRead = Math.min(in.remaining(), BUF_SIZE);
in.get(b, 0, bytesToRead);
out.write(b, 0, bytesToRead);
}
}
}
}