ArchiveExpander.java example

Explorer

trombone-master
- src
  - main
    - java
  - test
    - java
      - org
        voyanttools
        trombone
        document
        MetadataTest.java
        input
        expand
        ArchiveExpanderTest.java
        CompressedExpanderTest.java
        XmlExpanderTest.java
        XslExpanderTest.java
        extract
        BagItExtractorTest.java
        TikaExtractorTest.java
        XmlExtractorTest.java
        index
        LuceneIndexerTest.java
        lucene
        StoredToLuceneDocumentMapperTest.java
        analysis
        OpenNlpLemmaTokenizerTest.java
        StanfordNlpLemmaTokenizerTest.java
        search
        FieldPrefixAwareSimpleQueryParserTest.java
        model
        CorpusCollocateTest.java
        CorpusTermMinimalsDBTest.java
        CorpusTermsQueueTest.java
        DocumentTermsTest.java
        KeywordsTest.java
        TableTest.java
        storage
        file
        FileStoredDocumentSourceStorageTest.java
        TromboneMigration.java
        tool
        DocumentCollocatesTest.java
        DocumentTermsTest.java
        KwicsTest.java
        StoredResourceTest.java
        TableCorrelationsTest.java
        TableManagerTest.java
        build
        CorpusBuilderTest.java
        CorpusCreatorTest.java
        DocumentExpanderTest.java
        DocumentExtractorTest.java
        DocumentStorerTest.java
        corpus
        CorpusCollocatesTest.java
        CorpusFacetsTest.java
        CorpusManagerTest.java
        CorpusTermsTest.java
        DocumentContextsTest.java
        DocumentNgramsTest.java
        DocumentTermsTest.java
        DocumentTokensTest.java
        DocumentsMetadataTest.java
        SimpleSortedSetFacetsExample.java
        VelizaTest.java
        util
        EmbeddedWebServer.java
        TestHelper.java

/*******************************************************************************
 * Trombone is a flexible text processing and analysis library used
 * primarily by Voyant Tools (voyant-tools.org).
 * 
 * Copyright (©) 2007-2012 Stéfan Sinclair & Geoffrey Rockwell
 * 
 * This file is part of Trombone.
 * 
 * Trombone is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * Trombone is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with Trombone.  If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package org.voyanttools.trombone.input.expand;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.zip.GZIPInputStream;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.voyanttools.trombone.input.source.InputSource;
import org.voyanttools.trombone.input.source.InputStreamInputSource;
import org.voyanttools.trombone.input.source.Source;
import org.voyanttools.trombone.model.DocumentFormat;
import org.voyanttools.trombone.model.DocumentMetadata;
import org.voyanttools.trombone.model.StoredDocumentSource;
import org.voyanttools.trombone.storage.StoredDocumentSourceStorage;
import org.voyanttools.trombone.util.FlexibleParameters;

/**
 * An expander for compressed archives. This is supported through the Apache 
 * Commons Compress library and supports common cases for a range of formats
 * like "ar", "cpio", "dump", "jar", "tar", "tgz", "tbz2", "zip" (though not
 * all of these are tested). Note that for single compressed files (like "gz"
 * and "bzip2") the {@link CompressedExpander} should be used.
 * 
 * @author Stéfan Sinclair
 */
class ArchiveExpander implements Expander {

	/**
	 * the primary expander (child documents are expanded with this)
	 */
	private Expander expander;
	
	/**
	 * the stored document storage strategy
	 */
	private StoredDocumentSourceStorage storedDocumentSourceStorage;
	
	private FlexibleParameters parameters;
	
	/**
	 * Create a new instance of this expander (this should only be done by
	 * {@link StoredDocumentSourceExpander}.
	 * 
	 * @param storedDocumentSourceStorage a stored storage strategy
	 * @param storedDocumentSoruceExpander a reference to the primary expander
	 */
	ArchiveExpander(StoredDocumentSourceStorage storedDocumentSourceStorage, StoredDocumentSourceExpander storedDocumentSoruceExpander, FlexibleParameters parameters) {
		this.storedDocumentSourceStorage = storedDocumentSourceStorage;
		this.expander = storedDocumentSoruceExpander;
		this.parameters = parameters;
	}
	
	public List<StoredDocumentSource> getExpandedStoredDocumentSources(StoredDocumentSource storedDocumentSource)
			throws IOException {
		
		// first try to see if we've been here already
		String id = storedDocumentSource.getId();
		List<StoredDocumentSource> archivedStoredDocumentSources = storedDocumentSourceStorage.getMultipleExpandedStoredDocumentSources(id);
		if (archivedStoredDocumentSources!=null && archivedStoredDocumentSources.isEmpty()==false) {
			return archivedStoredDocumentSources;
		}
		
		InputStream inputStream = null;
		try {
			ArchiveStreamFactory archiveStreamFactory = new ArchiveStreamFactory();
			inputStream = storedDocumentSourceStorage.getStoredDocumentSourceInputStream(storedDocumentSource.getId());
			BufferedInputStream bis = new BufferedInputStream(inputStream);
			
			String filename = storedDocumentSource.getMetadata().getLocation();
			ArchiveInputStream archiveInputStream;
			
			if (filename.toLowerCase().endsWith("tgz") || filename.toLowerCase().endsWith("tar.gz")) { // decompress and then untar
				archiveInputStream = archiveStreamFactory.createArchiveInputStream(ArchiveStreamFactory.TAR, new GZIPInputStream(bis));
			}
			else if (filename.toLowerCase().endsWith("tbz2") || filename.toLowerCase().endsWith("tar.bz2")) { // decompress and then untar
				archiveInputStream = archiveStreamFactory.createArchiveInputStream(ArchiveStreamFactory.TAR, new BZip2CompressorInputStream(bis));
			}
			else {
				archiveInputStream = archiveStreamFactory.createArchiveInputStream(bis);
			}
			archivedStoredDocumentSources = getExpandedDocumentSources(archiveInputStream, storedDocumentSource);
			storedDocumentSourceStorage.setMultipleExpandedStoredDocumentSources(storedDocumentSource.getId(), archivedStoredDocumentSources);
			return archivedStoredDocumentSources;
		} catch (ArchiveException e) {
			throw new IOException("A problem was encountered reading this archive: "+storedDocumentSource.getMetadata().getLocation(), e);
		}
		finally {
			if (inputStream != null) {
				inputStream.close();
			}
		}
	}

	/**
	 * Get a list of stored document sources from the specified archive stream
	 * (that corresponds to the specfied parent stored document source).
	 * 
	 * @param archiveInputStream the full archive input stream
	 * @param parentStoredDocumentSource the parent stored document source
	 * @return a list of stored document sources in this archive
	 * @throws IOException thrown when an IO exception occurs during unarchiving
	 */
	private List<StoredDocumentSource> getExpandedDocumentSources(
			ArchiveInputStream archiveInputStream, StoredDocumentSource parentStoredDocumentSource) throws IOException {
		
		List<StoredDocumentSource> expandedDocumentSources = new ArrayList<StoredDocumentSource>();
		
		ArchiveEntry archiveEntry = archiveInputStream.getNextEntry();
		String parentId = parentStoredDocumentSource.getId();
		DocumentMetadata parentMetadata = parentStoredDocumentSource.getMetadata();
		DocumentFormat parentDocumentFormat = parentMetadata.getDocumentFormat();
				Map<String, Expander> clonedExpanders = new HashMap<String, Expander>();
		while (archiveEntry != null) {
			
			if (archiveEntry.isDirectory()==false) {
				final String filename = archiveEntry.getName();
				final File file = new File(filename);
				
				// auto-detect a BagIt (this may result in expanding other files unnecessarily, but that's ok). 
				if (file.getName().equals("bagit.txt") || file.getName().equals("bag-info.txt") || file.getName().equals("CWRC.bin")) {
					BagItExpander bagitExpander = new BagItExpander(storedDocumentSourceStorage, parameters);
					return bagitExpander.getExpandedStoredDocumentSources(parentStoredDocumentSource);
//					expandedDocumentSources.clear();
//					expandedDocumentSources.add(parentStoredDocumentSource);
//					parentMetadata.setDocumentFormat(DocumentFormat.BAGIT);
//					// ensure the parent metadata is right
//					storedDocumentSourceStorage.updateStoredDocumentSourceMetadata(parentStoredDocumentSource.getId(), parentMetadata);
//					return expandedDocumentSources;
				}

				// skip directories and skippable files
				if (DocumentFormat.isSkippable(file)==false) {
					DocumentMetadata childMetadata = parentMetadata.asParent(parentStoredDocumentSource.getId(), DocumentMetadata.ParentType.EXPANSION);
					childMetadata.setLocation(file.toString());
					childMetadata.setModified(archiveEntry.getLastModifiedDate().getTime());
					childMetadata.setSource(Source.STREAM);
					childMetadata.setTitle(file.getName().replaceFirst("\\.\\w+$", ""));
					DocumentFormat childDocumentFormat = null;
					if (parentDocumentFormat==DocumentFormat.PBLIT) { // use cloned expander without parameters
						if (file.getName().equals("satorbase.xml")) {
							childDocumentFormat=DocumentFormat.SATORBASE;
						} else if (file.getParent().equals("hyperlistes")) {
							childDocumentFormat=DocumentFormat.HYPERLISTES;
						} else if (file.getParent().equals("toucher")) {
							childDocumentFormat=DocumentFormat.TOUCHER;
						} else {
							throw new IllegalStateException("Unrecognized file for PBLIT bundle");
						}
						childMetadata.setDocumentFormat(childDocumentFormat);
					}
					String id = DigestUtils.md5Hex(parentId+filename);
					InputSource inputSource = new InputStreamInputSource(id, childMetadata, new CloseShieldInputStream(archiveInputStream));
					StoredDocumentSource storedDocumentSource = storedDocumentSourceStorage.getStoredDocumentSource(inputSource);
					if (parentDocumentFormat==DocumentFormat.PBLIT) { // use cloned expander without parameters
						if (!clonedExpanders.containsKey(childDocumentFormat.name())) {
							FlexibleParameters clonedParams = new FlexibleParameters();
							clonedParams.setParameter("inputFormat", childDocumentFormat.name());
							clonedExpanders.put(childDocumentFormat.name(), new StoredDocumentSourceExpander(storedDocumentSourceStorage, clonedParams));
						}
//						if (childDocumentFormat==DocumentFormat.HYPERLISTES) {
							expandedDocumentSources.addAll(clonedExpanders.get(childDocumentFormat.name()).getExpandedStoredDocumentSources(storedDocumentSource)); // expand this recursively						
//						}
					} else {
						expandedDocumentSources.addAll(this.expander.getExpandedStoredDocumentSources(storedDocumentSource)); // expand this recursively
					}
				}
			}
			archiveEntry = archiveInputStream.getNextEntry();
		}
		
		return expandedDocumentSources;
	}

}