/**
 * Copyright 2015 StreamSets Inc.
 *
 * Licensed under the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.streamsets.pipeline.lib.parser;

import com.google.common.annotations.VisibleForTesting;
import com.streamsets.pipeline.api.Record;
import com.streamsets.pipeline.api.ext.DataCollectorServices;
import com.streamsets.pipeline.api.ext.json.JsonMapper;
import com.streamsets.pipeline.api.impl.Utils;
import com.streamsets.pipeline.config.Compression;
import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.commons.io.input.ProxyInputStream;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.FileSystems;
import java.nio.file.PathMatcher;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;

/**
 * A {@link DataParser} that reads records out of a plain, compressed, archived or compressed-archive
 * input stream by delegating the actual record parsing to parsers obtained from a
 * {@link DataParserFactory}.
 *
 * <p>The stream is unwrapped lazily on the first call to {@link #parse()}. For archives, one delegate
 * parser is created per eligible archive entry; when a delegate returns {@code null} it is closed and
 * the next entry is opened.</p>
 */
public class CompressionDataParser extends AbstractDataParser {

  /** Offset denoting the beginning of a stream or archive entry. */
  public static final String ZERO = "0";
  /** Offset reported by {@link #getOffset()} once the whole input has been consumed. */
  public static final String MINUS_ONE = "-1";
  public static final String PATH_SEPARATOR = "/";

  // Passed to CompressorStreamFactory so that concatenated compressed streams are read until EOF.
  private static final boolean DECOMPRESS_UNTIL_EOF = true;

  private final InputStream is;
  private final String id;
  private final Compression compression;
  private final String compressionFilePattern;
  private final DataParserFactory dataParserFactory;

  // Offset handed to the next delegate parser that gets created.
  private String offset;
  // Delegate parser for the stream (or archive entry) currently being read; null between entries.
  private DataParser parser;
  private boolean eof = false;
  // Created on the first parse() call; null before that and after close().
  private CompressionInput compressionInput;

  /**
   * @param id identifier forwarded to the delegate parsers
   * @param is the raw (possibly compressed / archived) input stream; closed by {@link #close()}
   * @param offset offset to resume from; {@code null} or empty is treated as {@link #ZERO}
   * @param compression how {@code is} is packaged; {@code null} is treated like uncompressed input
   * @param compressionFilePattern glob used to select entries when the input is an archive
   * @param dataParserFactory factory creating the delegate parser for each decompressed stream
   */
  public CompressionDataParser(
      String id,
      InputStream is,
      String offset,
      Compression compression,
      String compressionFilePattern,
      DataParserFactory dataParserFactory
  ) {
    this.id = id;
    this.is = is;
    this.offset = offset;
    this.compression = compression;
    this.compressionFilePattern = compressionFilePattern;
    this.dataParserFactory = dataParserFactory;
  }

  /**
   * Returns the next record, moving on to the next stream/archive entry whenever the current
   * delegate parser is exhausted.
   *
   * @return the next record, or {@code null} once no further input stream is available
   */
  @Override
  public Record parse() throws IOException, DataParserException {
    if (compressionInput == null) {
      // First invocation of this method on the parser: initialize.
      if (offset == null || offset.isEmpty()) {
        offset = ZERO;
      }
      compressionInput = new CompressionInputBuilder(compression, compressionFilePattern, is, offset).build();
      // NOTE(review): for an archive offset whose file offset is "-1" the archive input skips to the
      // next eligible entry, yet the first delegate parser is still created with "-1" — confirm intended.
      offset = compressionInput.getStreamPosition(offset);
    }

    Record record = null;
    while (!eof && record == null) {
      if (parser == null) {
        InputStream nextInputStream = compressionInput.getNextInputStream();
        if (nextInputStream != null) {
          // The delegate must not close the shared compression/archive stream when it is closed.
          parser = dataParserFactory.getParser(id, new NonClosingProxyInputStream(nextInputStream), offset);
        } else {
          // Reached end of compression/archive stream.
          eof = true;
        }
      }
      if (!eof) {
        // Capture the position before reading so the record header points at the record's start.
        String offsetBeforeRead = compressionInput.getStreamPosition(getOffset());
        record = parser.parse();
        if (record == null) {
          parser.close();
          parser = null;
          // For subsequent entries the offset always starts at ZERO.
          offset = ZERO;
        } else {
          compressionInput.wrapRecordHeaders(record.getHeader(), offsetBeforeRead);
        }
      }
    }
    return record;
  }

  /**
   * @return {@link #MINUS_ONE} after end of input, the delegate parser's offset wrapped by the
   *     compression input while a delegate is active, and {@link #ZERO} otherwise
   */
  @Override
  public String getOffset() throws IOException, DataParserException {
    if (eof) {
      return MINUS_ONE;
    }
    if (parser != null) {
      return compressionInput.wrapOffset(parser.getOffset());
    }
    return ZERO;
  }

  /**
   * Closes the delegate parser, the compression input and the underlying stream. Every resource is
   * attempted even if closing an earlier one throws, so a failure cannot leak the remaining ones.
   */
  @Override
  public void close() throws IOException {
    try {
      if (parser != null) {
        parser.close();
      }
    } finally {
      parser = null;
      try {
        if (compressionInput != null) {
          compressionInput.close();
        }
      } finally {
        compressionInput = null;
        if (is != null) {
          is.close();
        }
      }
    }
  }

  /**
   * Wrapper Input Stream that does nothing on calling close().
   */
  static class NonClosingProxyInputStream extends ProxyInputStream {

    public NonClosingProxyInputStream(InputStream proxy) {
      super(proxy);
    }

    @Override
    public void close() throws IOException {
      // NO-OP
    }
  }

  /**
   * Abstraction over the packaging of the input: hands out the decompressed stream(s) to parse and
   * translates between delegate-parser offsets and the offsets exposed by {@link CompressionDataParser}.
   */
  interface CompressionInput {

    /** @return the next stream to parse, or {@code null} when there is none left */
    InputStream getNextInputStream() throws IOException;

    /** @return the externally visible offset for the given delegate-parser offset */
    String wrapOffset(String offset) throws IOException;

    /** @return the delegate-parser offset contained in the given externally visible offset */
    String getStreamPosition(String offset) throws IOException;

    String wrapRecordId(String recordId);

    /** Adds packaging-specific attributes to a record header; {@code offset} is the record's start. */
    void wrapRecordHeaders(Record.Header header, String offset);

    void close() throws IOException;
  }

  /**
   * Builds the {@link CompressionInput} matching a {@link Compression} setting.
   */
  @VisibleForTesting
  static class CompressionInputBuilder {

    private final Compression compressionInputFormat;
    private final String compressedFilePattern;
    private final InputStream inputStream;
    private final String offset;

    public CompressionInputBuilder(
        Compression compressionInputFormat,
        String compressedFilePattern,
        InputStream inputStream,
        String offset
    ) {
      this.compressionInputFormat = compressionInputFormat;
      this.compressedFilePattern = compressedFilePattern;
      this.inputStream = inputStream;
      this.offset = offset;
    }

    /**
     * @return the input for the configured format; a {@code null} format yields a pass-through input
     * @throws IOException if a compressor stream cannot be created for the input
     * @throws IllegalArgumentException if the format is not one of the handled constants
     */
    public CompressionDataParser.CompressionInput build() throws IOException {
      if (compressionInputFormat != null) {
        switch (compressionInputFormat) {
          case NONE:
            return new None(inputStream);
          case COMPRESSED_FILE:
            return new CompressorInput(inputStream);
          case ARCHIVE:
            return new ArchiveInput(compressedFilePattern, new None(inputStream), offset);
          case COMPRESSED_ARCHIVE:
            return new ArchiveInput(compressedFilePattern, new CompressorInput(inputStream), offset);
          default:
            throw new IllegalArgumentException("Unsupported compression format: " + compressionInputFormat);
        }
      }
      return new None(inputStream);
    }

    /**
     * Pass-through input: hands out the original stream once and leaves offsets, ids and headers
     * untouched. It does not close the stream.
     */
    @VisibleForTesting
    static class None implements CompressionDataParser.CompressionInput {

      // Set to null once handed out so the stream is returned exactly once.
      private InputStream inputStream;

      public None(InputStream inputStream) {
        this.inputStream = inputStream;
      }

      @Override
      public String wrapOffset(String offset) {
        return offset;
      }

      @Override
      public String getStreamPosition(String offset) {
        return offset;
      }

      @Override
      public String wrapRecordId(String recordId) {
        return recordId;
      }

      @Override
      public void wrapRecordHeaders(Record.Header header, String offset) {
        // NO-OP
      }

      @Override
      public void close() {
        // NO-OP
      }

      @Override
      public InputStream getNextInputStream() {
        InputStream temp = inputStream;
        inputStream = null;
        return temp;
      }
    }

    /**
     * Input for a single compressed file: wraps the stream in a compressor stream created by
     * {@link CompressorStreamFactory} and hands that stream out once.
     */
    @VisibleForTesting
    static class CompressorInput implements CompressionDataParser.CompressionInput {

      // Kept for the whole lifetime so close() can release it even after it has been handed out.
      private final InputStream decompressedStream;
      private boolean handedOut;

      /**
       * @throws IOException wrapping the {@link CompressorException} if no compressor stream can be
       *     created for {@code inputStream}
       */
      public CompressorInput(InputStream inputStream) throws IOException {
        InputStream stream;
        try {
          stream = new CompressorStreamFactory(DECOMPRESS_UNTIL_EOF).createCompressorInputStream(
              new BufferedInputStream(inputStream));
        } catch (CompressorException e) {
          throw new IOException(e);
        }
        this.decompressedStream = stream;
      }

      @Override
      public String wrapOffset(String offset) {
        return offset;
      }

      @Override
      public InputStream getNextInputStream() {
        if (handedOut) {
          return null;
        }
        handedOut = true;
        return decompressedStream;
      }

      @Override
      public String getStreamPosition(String offset) {
        return offset;
      }

      @Override
      public String wrapRecordId(String recordId) {
        return recordId;
      }

      @Override
      public void wrapRecordHeaders(Record.Header header, String offset) {
        // NO-OP
      }

      /**
       * Closes the compressor stream whether or not it was handed out; consumers only ever see it
       * behind a non-closing proxy, so nobody else closes it.
       */
      @Override
      public void close() throws IOException {
        decompressedStream.close();
      }
    }

    /**
     * Input for an archive: iterates over the non-directory entries whose name matches the configured
     * glob and exposes offsets as a JSON map of entry name and offset inside that entry.
     */
    @VisibleForTesting
    static class ArchiveInput implements CompressionDataParser.CompressionInput {

      public static final String FILE_NAME = "fileName";
      public static final String FILE_OFFSET = "fileOffset";
      public static final String FILE_PATH_INSIDE_ARCHIVE = "filePathInsideArchive";
      public static final String FILE_NAME_INSIDE_ARCHIVE = "fileNameInsideArchive";
      public static final String FILE_OFFSET_INSIDER_ARCHIVE = "fileOffsetInsideArchive";

      private static final JsonMapper objectMapper = DataCollectorServices.instance().get(JsonMapper.SERVICE_KEY);

      private final PathMatcher pathMatcher;
      private final CompressionDataParser.CompressionInput compressionInput;
      private ArchiveEntry currentEntry;
      // Created lazily on the first getNextInputStream() call.
      private ArchiveInputStream archiveInputStream;
      private String wrappedOffset;
      private InputStream nextInputStream;

      /**
       * @param compressedFilePattern glob selecting the archive entries to read; must not be null
       * @param compressionInput input supplying the (already decompressed, if needed) archive bytes
       * @param wrappedOffset offset to resume from: {@link #ZERO} or a JSON offset; must not be null
       */
      public ArchiveInput(
          String compressedFilePattern,
          CompressionDataParser.CompressionInput compressionInput,
          String wrappedOffset
      ) {
        Utils.checkNotNull(compressedFilePattern, "Compressed File Pattern cannot be null");
        Utils.checkNotNull(wrappedOffset, "Offset cannot be null");
        pathMatcher = FileSystems.getDefault().getPathMatcher("glob:" + compressedFilePattern);
        this.wrappedOffset = wrappedOffset;
        this.compressionInput = compressionInput;
      }

      /**
       * @return JSON with the current entry name (null when no entry is selected) under
       *     {@link #FILE_NAME} and {@code offset} under {@link #FILE_OFFSET}
       */
      @Override
      public String wrapOffset(String offset) throws IOException {
        String fileName = null;
        if (currentEntry != null) {
          fileName = currentEntry.getName();
        }
        Map<String, Object> archiveOffset = new HashMap<>();
        archiveOffset.put(FILE_NAME, fileName);
        archiveOffset.put(FILE_OFFSET, offset);
        return objectMapper.writeValueAsString(archiveOffset);
      }

      /**
       * Returns the archive stream positioned at the next eligible entry. The first call opens the
       * archive and seeks according to the initial offset; later calls advance to the next entry.
       *
       * @return the archive stream, or {@code null} when no eligible entry is left
       */
      @SuppressWarnings("unchecked")
      @Override
      public InputStream getNextInputStream() throws IOException {
        if (archiveInputStream == null) {
          // Very first call to getNextInputStream: initialize archiveInputStream using the wrappedOffset.
          wrappedOffset = wrappedOffset.equals(ZERO) ? wrapOffset(wrappedOffset) : wrappedOffset;
          Map<String, Object> archiveInputOffset = objectMapper.readValue(wrappedOffset, Map.class);
          try {
            archiveInputStream = new ArchiveStreamFactory().createArchiveInputStream(
                new BufferedInputStream(compressionInput.getNextInputStream()));
          } catch (ArchiveException e) {
            throw new IOException(e);
          }
          seekToOffset(archiveInputOffset);
          nextInputStream = archiveInputStream;
        }
        if (nextInputStream == null) {
          // Reached the end of an entry within the archive: seek to the next eligible entry.
          seekToNextEligibleEntry();
          if (currentEntry != null) {
            // Not end of archive.
            nextInputStream = archiveInputStream;
          }
        }
        InputStream temp = nextInputStream;
        nextInputStream = null;
        return temp;
      }

      /**
       * @return {@link #ZERO} for {@link #ZERO}; otherwise the {@link #FILE_OFFSET} value of the JSON offset
       */
      @SuppressWarnings("unchecked")
      @Override
      public String getStreamPosition(String offset) throws IOException {
        if (ZERO.equals(offset)) {
          return ZERO;
        }
        Map<String, Object> map = objectMapper.readValue(offset, Map.class);
        return (String) map.get(FILE_OFFSET);
      }

      @Override
      public String wrapRecordId(String recordId) {
        if (currentEntry != null) {
          return recordId + PATH_SEPARATOR + currentEntry.getName();
        }
        return recordId;
      }

      /**
       * Sets the entry's directory path, file name and the given offset as header attributes when an
       * entry is selected; the path is empty for entries without a {@code '/'} in their name.
       */
      @Override
      public void wrapRecordHeaders(Record.Header header, String offset) {
        if (currentEntry != null) {
          String fullPathToFile = currentEntry.getName();
          int lastPathIndex = fullPathToFile.lastIndexOf('/');
          String fileName = fullPathToFile.substring(lastPathIndex + 1);
          String filePath = (lastPathIndex != -1) ? fullPathToFile.substring(0, lastPathIndex) : "";
          header.setAttribute(FILE_PATH_INSIDE_ARCHIVE, filePath);
          header.setAttribute(FILE_NAME_INSIDE_ARCHIVE, fileName);
          header.setAttribute(FILE_OFFSET_INSIDER_ARCHIVE, offset);
        }
      }

      /**
       * Closes the archive stream if it was opened and always closes the wrapped input, so that a
       * stream created by the wrapped input is released even when the archive was never opened.
       */
      @Override
      public void close() throws IOException {
        try {
          if (archiveInputStream != null) {
            archiveInputStream.close();
          }
        } finally {
          compressionInput.close();
        }
      }

      /**
       * Positions the archive at the entry described by the offset map: the first eligible entry when
       * no file name is recorded, the recorded entry when its offset is not -1, or the eligible entry
       * following the recorded one when its offset is -1. Leaves {@code currentEntry} null if no such
       * entry exists.
       */
      private void seekToOffset(Map<String, Object> archiveInputOffset) throws IOException {
        String fileName = (String) archiveInputOffset.get(FILE_NAME);
        long longOffset = Long.parseLong((String) archiveInputOffset.get(FILE_OFFSET));
        currentEntry = archiveInputStream.getNextEntry();
        while (currentEntry != null) {
          if (isEligibleEntry(currentEntry)) {
            // A match is when
            //  - it is the first file read within the archive
            //  - it is the file that was last processed
            //    - if offset is -1 then return the next eligible entry in the archive
            //    - if offset is not -1 then return the current entry [the file that was last
            //      processed] as it is not completely read
            if (fileName == null) {
              // Match - first file to match the pattern within the archive.
              break;
            } else if (currentEntry.getName().equals(fileName)) {
              // Reached the last processed file.
              if (longOffset == -1) {
                // It was completely read: return the next eligible entry in the archive.
                seekToNextEligibleEntry();
              }
              break;
            }
          }
          currentEntry = archiveInputStream.getNextEntry();
        }
      }

      /** Advances {@code currentEntry} to the next eligible entry, or null at the end of the archive. */
      private void seekToNextEligibleEntry() throws IOException {
        currentEntry = archiveInputStream.getNextEntry();
        while (currentEntry != null && !isEligibleEntry(currentEntry)) {
          currentEntry = archiveInputStream.getNextEntry();
        }
      }

      /** @return true for a non-directory entry whose name matches the configured glob */
      private boolean isEligibleEntry(ArchiveEntry currentEntry) {
        return !currentEntry.isDirectory() && pathMatcher.matches(Paths.get(currentEntry.getName()));
      }
    }
  }
}