package org.gbif.occurrence.download.file.simplecsv;

import org.gbif.dwc.terms.Term;
import org.gbif.hadoop.compress.d2.D2CombineInputStream;
import org.gbif.hadoop.compress.d2.D2Utils;
import org.gbif.hadoop.compress.d2.zip.ModalZipOutputStream;
import org.gbif.hadoop.compress.d2.zip.ZipEntry;
import org.gbif.occurrence.download.file.common.DownloadFileUtils;
import org.gbif.occurrence.download.hive.DownloadTerms;
import org.gbif.occurrence.download.hive.HiveColumns;
import org.gbif.occurrence.download.inject.DownloadWorkflowModule;
import org.gbif.utils.file.properties.PropertiesUtil;

import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Properties;
import java.util.zip.ZipOutputStream;
import javax.annotation.Nullable;

import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Throwables;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.io.ByteStreams;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Utility class that creates a zip file from a directory that stores the data of a Hive table.
 */
public class SimpleCsvArchiveBuilder {

  private static final Logger LOG = LoggerFactory.getLogger(SimpleCsvArchiveBuilder.class);

  //Occurrences file name extensions
  private static final String CSV_EXTENSION = ".csv";
  private static final String ZIP_EXTENSION = ".zip";

  private static final String ERROR_ZIP_MSG = "Error creating zip file";

  //Header file is named '0' to appear first when listing the content of the directory.
  private static final String HEADER_FILE_NAME = "0";

  //String that contains the file HEADER for the simple table format.
  private static final String HEADER =
    Joiner.on('\t').join(Iterables.transform(DownloadTerms.SIMPLE_DOWNLOAD_TERMS, new Function<Term, String>() {
      @Nullable
      @Override
      public String apply(@Nullable Term input) {
        return HiveColumns.columnFor(input).replaceAll("_", "");
      }
    })) + '\n';

  /**
   * Merges the content of sourceFS:sourcePath into a file named downloadKey.zip under targetFS:targetPath.
   * The HEADER file is added to the sourcePath directory so it appears in the resulting zip file.
   */
  public static void mergeToZip(
    final FileSystem sourceFS,
    FileSystem targetFS,
    String sourcePath,
    String targetPath,
    String downloadKey,
    ModalZipOutputStream.MODE mode
  ) throws IOException {
    Path outputPath = new Path(targetPath, downloadKey + ZIP_EXTENSION);
    if (ModalZipOutputStream.MODE.PRE_DEFLATED == mode) {
      //Use hadoop-compress for pre-deflated files
      zipPreDeflated(sourceFS, targetFS, sourcePath, outputPath, downloadKey);
    } else {
      //Use standard Java libraries for uncompressed input
      zipDefault(sourceFS, targetFS, sourcePath, outputPath, downloadKey);
    }
  }

  /**
   * Merges the files of the source directory into a single zip entry using the standard java.util.zip classes.
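   * <p>This path is taken when mergeToZip is invoked with {@code MODE.DEFAULT}. A minimal usage
   * sketch (the file system, paths and download key below are illustrative, not real values):</p>
   * <pre>{@code
   * FileSystem fs = FileSystem.get(new Configuration());
   * //writes /downloads/0001234-567.zip containing the single entry 0001234-567.csv
   * SimpleCsvArchiveBuilder.mergeToZip(fs, fs, "/tmp/0001234-567_tmp", "/downloads",
   *   "0001234-567", ModalZipOutputStream.MODE.DEFAULT);
   * }</pre>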
   */
  private static void zipDefault(
    final FileSystem sourceFS,
    final FileSystem targetFS,
    String sourcePath,
    Path outputPath,
    String downloadKey
  ) {
    try (
      FSDataOutputStream zipped = targetFS.create(outputPath, true);
      ZipOutputStream zos = new ZipOutputStream(zipped)
    ) {
      //appends the header file
      appendHeaderFile(sourceFS, new Path(sourcePath), ModalZipOutputStream.MODE.DEFAULT);
      java.util.zip.ZipEntry ze = new java.util.zip.ZipEntry(Paths.get(downloadKey + CSV_EXTENSION).toString());
      zos.putNextEntry(ze);
      //files are read from the source file system and sorted by name,
      //which guarantees that the header file '0' is copied first
      FileStatus[] files = sourceFS.listStatus(new Path(sourcePath));
      Arrays.sort(files);
      for (FileStatus fileInZip : files) {
        try (InputStream fileInZipInputStream = sourceFS.open(fileInZip.getPath())) {
          ByteStreams.copy(fileInZipInputStream, zos);
          zos.flush();
        }
      }
      zos.closeEntry();
    } catch (Exception ex) {
      LOG.error(ERROR_ZIP_MSG, ex);
      throw Throwables.propagate(ex);
    }
  }

  /**
   * Merges the pre-deflated content using the hadoop-compress library.
   */
  private static void zipPreDeflated(
    final FileSystem sourceFS,
    FileSystem targetFS,
    String sourcePath,
    Path outputPath,
    String downloadKey
  ) throws IOException {
    try (
      FSDataOutputStream zipped = targetFS.create(outputPath, true);
      ModalZipOutputStream zos = new ModalZipOutputStream(new BufferedOutputStream(zipped))
    ) {
      final Path inputPath = new Path(sourcePath);
      //appends the header file
      appendHeaderFile(sourceFS, inputPath, ModalZipOutputStream.MODE.PRE_DEFLATED);
      //Gets all the files inside the directory and creates a list of InputStreams.
      try {
        D2CombineInputStream in = new D2CombineInputStream(
          Lists.transform(Lists.newArrayList(sourceFS.listStatus(inputPath)),
            new Function<FileStatus, InputStream>() {
              @Nullable
              @Override
              public InputStream apply(@Nullable FileStatus input) {
                try {
                  return sourceFS.open(input.getPath());
                } catch (IOException ex) {
                  throw Throwables.propagate(ex);
                }
              }
            }));
        ZipEntry ze = new ZipEntry(Paths.get(downloadKey + CSV_EXTENSION).toString());
        zos.putNextEntry(ze, ModalZipOutputStream.MODE.PRE_DEFLATED);
        ByteStreams.copy(in, zos);
        in.close(); // required to get the sizes
        ze.setSize(in.getUncompressedLength()); // important to set the sizes and CRC
        ze.setCompressedSize(in.getCompressedLength());
        ze.setCrc(in.getCrc32());
        zos.closeEntry();
      } catch (Exception ex) {
        LOG.error(ERROR_ZIP_MSG, ex);
        throw Throwables.propagate(ex);
      }
    }
  }

  /**
   * Creates a compressed file named '0' that contains the content of the file HEADER.
   */
  private static void appendHeaderFile(FileSystem fileSystem, Path dir, ModalZipOutputStream.MODE mode)
    throws IOException {
    try (FSDataOutputStream fsDataOutputStream = fileSystem.create(new Path(dir, HEADER_FILE_NAME))) {
      if (ModalZipOutputStream.MODE.PRE_DEFLATED == mode) {
        D2Utils.compress(new ByteArrayInputStream(HEADER.getBytes()), fsDataOutputStream);
      } else {
        fsDataOutputStream.write(HEADER.getBytes());
      }
    }
  }

  /**
   * Executes the archive/zip creation process.
   * The expected parameters are:
   * 0. sourcePath: HDFS path to the directory that contains the data files.
   * 1. targetPath: HDFS path where the resulting file will be copied.
   * 2. downloadKey: occurrence download key.
   * 3. mode: ModalZipOutputStream.MODE of the input files.
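   * <p>A hypothetical invocation (the paths and download key are illustrative only):</p>
   * <pre>{@code
   * java org.gbif.occurrence.download.file.simplecsv.SimpleCsvArchiveBuilder \
   *   /user/hive/warehouse/0001234-567_tmp /occurrence-download 0001234-567 PRE_DEFLATED
   * }</pre>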
   */
  public static void main(String[] args) throws IOException {
    Properties properties = PropertiesUtil.loadProperties(DownloadWorkflowModule.CONF_FILE);
    FileSystem sourceFileSystem =
      DownloadFileUtils.getHdfs(properties.getProperty(DownloadWorkflowModule.DefaultSettings.NAME_NODE_KEY));
    //the same HDFS file system is used as source and target
    mergeToZip(sourceFileSystem, sourceFileSystem, args[0], args[1], args[2],
               ModalZipOutputStream.MODE.valueOf(args[3]));
  }

  /**
   * Private constructor.
   */
  private SimpleCsvArchiveBuilder() {
    //do nothing
  }
}
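
/*
 * Note on MODE.PRE_DEFLATED: each part file in the source directory is expected to be a raw
 * deflate stream, so D2CombineInputStream can concatenate the parts without recompressing them;
 * the combined lengths and CRC are stamped on the ZipEntry after the copy. A minimal sketch of
 * producing one such pre-deflated part with the same D2Utils call used for the header above
 * (the file system and paths are hypothetical):
 *
 *   try (InputStream raw = fs.open(new Path("/tmp/parts/part-00000"));
 *        OutputStream deflated = fs.create(new Path("/tmp/deflated/part-00000"))) {
 *     D2Utils.compress(raw, deflated);
 *   }
 */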