/*******************************************************************************
* Trombone is a flexible text processing and analysis library used
* primarily by Voyant Tools (voyant-tools.org).
*
* Copyright (©) 2007-2012 Stéfan Sinclair & Geoffrey Rockwell
*
* This file is part of Trombone.
*
* Trombone is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Trombone is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Trombone. If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
package org.voyanttools.trombone.input.expand;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.voyanttools.trombone.input.source.InputSource;
import org.voyanttools.trombone.input.source.InputStreamInputSource;
import org.voyanttools.trombone.input.source.Source;
import org.voyanttools.trombone.model.DocumentMetadata;
import org.voyanttools.trombone.model.StoredDocumentSource;
import org.voyanttools.trombone.storage.StoredDocumentSourceStorage;
/**
* An expander for compressed files. This is supported through the Apache
* Commons Compress library and supports common cases for a range of formats
* like "bzip2", "bz2", "gzip", "gz", "pack200", "xz" (though not
* all of these are tested). Note that for multi-file archives (like "tar"
* and "zip") the {@link ArchiveExpander} should be used.
*
* @author "Stéfan Sinclair"
*/
class CompressedExpander implements Expander {

    /**
     * Matches the trailing extension of the single-file compression formats
     * whose suffix is stripped to derive the child document's name. Compiled
     * once because the expression never changes.
     */
    private static final Pattern COMPRESSED_EXTENSION_PATTERN =
            Pattern.compile("\\.(bzip2|bz2|gz|gzip|xz)$", Pattern.CASE_INSENSITIVE);

    /**
     * filename used when the parent document's metadata has no location
     */
    private static final String DEFAULT_FILENAME = "uncompressed";

    /**
     * the primary expander (child documents are expanded with this)
     */
    private final Expander expander;

    /**
     * the stored document storage strategy
     */
    private final StoredDocumentSourceStorage storedDocumentSourceStorage;

    /**
     * Create a new instance of this expander (this should only be done by
     * {@link StoredDocumentSourceExpander}).
     *
     * @param storedDocumentSourceStorage a stored storage strategy
     * @param storedDocumentSoruceExpander a reference to the primary expander
     */
    CompressedExpander(StoredDocumentSourceStorage storedDocumentSourceStorage, StoredDocumentSourceExpander storedDocumentSoruceExpander) {
        this.storedDocumentSourceStorage = storedDocumentSourceStorage;
        this.expander = storedDocumentSoruceExpander;
    }

    /**
     * Decompress the specified stored document source and expand the result
     * with the primary expander. A previously stored, non-empty expansion for
     * the same parent id is returned as is.
     *
     * @param storedDocumentSource the compressed document source to expand
     * @return the stored document sources produced by expanding the
     *         decompressed content
     * @throws IOException if the stream can't be read or if it can't be
     *         handled as a compressed stream
     * @see org.voyanttools.trombone.input.expand.Expander#getExpandedStoredDocumentSources(org.voyanttools.trombone.model.StoredDocumentSource)
     */
    public List<StoredDocumentSource> getExpandedStoredDocumentSources(
            StoredDocumentSource storedDocumentSource) throws IOException {

        // first try to see if we've been here already
        String parentId = storedDocumentSource.getId();
        List<StoredDocumentSource> cachedDocumentSources = storedDocumentSourceStorage.getMultipleExpandedStoredDocumentSources(parentId);
        if (cachedDocumentSources != null && !cachedDocumentSources.isEmpty()) {
            return cachedDocumentSources;
        }

        DocumentMetadata metadata = storedDocumentSource.getMetadata();
        String filename = metadata.getLocation();
        if (filename == null || filename.isEmpty()) {
            filename = DEFAULT_FILENAME;
        }

        // Collect results in a list we own: the cache lookup above may have
        // returned null or an empty list that isn't ours to modify.
        List<StoredDocumentSource> expandedDocumentSources = new ArrayList<StoredDocumentSource>();

        // Always refers to the outermost stream opened so far, so that a
        // single close() in the finally block releases the whole chain.
        InputStream streamToClose = null;
        try {
            streamToClose = storedDocumentSourceStorage.getStoredDocumentSourceInputStream(parentId);

            // the stream is buffered before being handed to the factory, which
            // chooses the decompressor on its own
            InputStream decompressedInputStream = new CompressorStreamFactory().createCompressorInputStream(new BufferedInputStream(streamToClose));
            streamToClose = decompressedInputStream;

            String modifiedFilename = COMPRESSED_EXTENSION_PATTERN.matcher(filename).replaceFirst("");

            DocumentMetadata childMetadata = metadata.asParent(parentId, DocumentMetadata.ParentType.EXPANSION);
            childMetadata.setLocation(modifiedFilename);
            childMetadata.setModified(metadata.getModified());
            childMetadata.setSource(Source.STREAM);
            childMetadata.setTitle(modifiedFilename);

            // the child id is derived from the parent id only
            String id = DigestUtils.md5Hex(parentId+"uncompressed");
            InputSource decompressedInputSource = new InputStreamInputSource(id, childMetadata, decompressedInputStream);
            StoredDocumentSource decompressedStoredDocumentSource = storedDocumentSourceStorage.getStoredDocumentSource(decompressedInputSource);

            // expand this recursively
            expandedDocumentSources.addAll(this.expander.getExpandedStoredDocumentSources(decompressedStoredDocumentSource));

            storedDocumentSourceStorage.setMultipleExpandedStoredDocumentSources(parentId, expandedDocumentSources);
            return expandedDocumentSources;
        } catch (CompressorException e) {
            throw new IOException("A problem was encountered reading this compressed file: "+metadata.getLocation(), e);
        } finally {
            if (streamToClose != null) {
                streamToClose.close();
            }
        }
    }
}