/******************************************************************************* * Trombone is a flexible text processing and analysis library used * primarily by Voyant Tools (voyant-tools.org). * * Copyright (©) 2007-2012 Stéfan Sinclair & Geoffrey Rockwell * * This file is part of Trombone. * * Trombone is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Trombone is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Trombone. If not, see <http://www.gnu.org/licenses/>. ******************************************************************************/ package org.voyanttools.trombone.input.expand; import java.io.BufferedInputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.zip.GZIPInputStream; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.compress.archivers.ArchiveEntry; import org.apache.commons.compress.archivers.ArchiveException; import org.apache.commons.compress.archivers.ArchiveInputStream; import org.apache.commons.compress.archivers.ArchiveStreamFactory; import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; import org.apache.commons.io.input.CloseShieldInputStream; import org.voyanttools.trombone.input.source.InputSource; import org.voyanttools.trombone.input.source.InputStreamInputSource; import org.voyanttools.trombone.input.source.Source; import org.voyanttools.trombone.model.DocumentFormat; import org.voyanttools.trombone.model.DocumentMetadata; import org.voyanttools.trombone.model.StoredDocumentSource; import org.voyanttools.trombone.storage.StoredDocumentSourceStorage; import org.voyanttools.trombone.util.FlexibleParameters; /** * An expander for compressed archives. This is supported through the Apache * Commons Compress library and supports common cases for a range of formats * like "ar", "cpio", "dump", "jar", "tar", "tgz", "tbz2", "zip" (though not * all of these are tested). Note that for single compressed files (like "gz" * and "bzip2") the {@link CompressedExpander} should be used. * * @author Stéfan Sinclair */ class ArchiveExpander implements Expander { /** * the primary expander (child documents are expanded with this) */ private Expander expander; /** * the stored document storage strategy */ private StoredDocumentSourceStorage storedDocumentSourceStorage; private FlexibleParameters parameters; /** * Create a new instance of this expander (this should only be done by * {@link StoredDocumentSourceExpander}. * * @param storedDocumentSourceStorage a stored storage strategy * @param storedDocumentSoruceExpander a reference to the primary expander */ ArchiveExpander(StoredDocumentSourceStorage storedDocumentSourceStorage, StoredDocumentSourceExpander storedDocumentSoruceExpander, FlexibleParameters parameters) { this.storedDocumentSourceStorage = storedDocumentSourceStorage; this.expander = storedDocumentSoruceExpander; this.parameters = parameters; } public List<StoredDocumentSource> getExpandedStoredDocumentSources(StoredDocumentSource storedDocumentSource) throws IOException { // first try to see if we've been here already String id = storedDocumentSource.getId(); List<StoredDocumentSource> archivedStoredDocumentSources = storedDocumentSourceStorage.getMultipleExpandedStoredDocumentSources(id); if (archivedStoredDocumentSources!=null && archivedStoredDocumentSources.isEmpty()==false) { return archivedStoredDocumentSources; } InputStream inputStream = null; try { ArchiveStreamFactory archiveStreamFactory = new ArchiveStreamFactory(); inputStream = storedDocumentSourceStorage.getStoredDocumentSourceInputStream(storedDocumentSource.getId()); BufferedInputStream bis = new BufferedInputStream(inputStream); String filename = storedDocumentSource.getMetadata().getLocation(); ArchiveInputStream archiveInputStream; if (filename.toLowerCase().endsWith("tgz") || filename.toLowerCase().endsWith("tar.gz")) { // decompress and then untar archiveInputStream = archiveStreamFactory.createArchiveInputStream(ArchiveStreamFactory.TAR, new GZIPInputStream(bis)); } else if (filename.toLowerCase().endsWith("tbz2") || filename.toLowerCase().endsWith("tar.bz2")) { // decompress and then untar archiveInputStream = archiveStreamFactory.createArchiveInputStream(ArchiveStreamFactory.TAR, new BZip2CompressorInputStream(bis)); } else { archiveInputStream = archiveStreamFactory.createArchiveInputStream(bis); } archivedStoredDocumentSources = getExpandedDocumentSources(archiveInputStream, storedDocumentSource); storedDocumentSourceStorage.setMultipleExpandedStoredDocumentSources(storedDocumentSource.getId(), archivedStoredDocumentSources); return archivedStoredDocumentSources; } catch (ArchiveException e) { throw new IOException("A problem was encountered reading this archive: "+storedDocumentSource.getMetadata().getLocation(), e); } finally { if (inputStream != null) { inputStream.close(); } } } /** * Get a list of stored document sources from the specified archive stream * (that corresponds to the specfied parent stored document source). * * @param archiveInputStream the full archive input stream * @param parentStoredDocumentSource the parent stored document source * @return a list of stored document sources in this archive * @throws IOException thrown when an IO exception occurs during unarchiving */ private List<StoredDocumentSource> getExpandedDocumentSources( ArchiveInputStream archiveInputStream, StoredDocumentSource parentStoredDocumentSource) throws IOException { List<StoredDocumentSource> expandedDocumentSources = new ArrayList<StoredDocumentSource>(); ArchiveEntry archiveEntry = archiveInputStream.getNextEntry(); String parentId = parentStoredDocumentSource.getId(); DocumentMetadata parentMetadata = parentStoredDocumentSource.getMetadata(); DocumentFormat parentDocumentFormat = parentMetadata.getDocumentFormat(); Map<String, Expander> clonedExpanders = new HashMap<String, Expander>(); while (archiveEntry != null) { if (archiveEntry.isDirectory()==false) { final String filename = archiveEntry.getName(); final File file = new File(filename); // auto-detect a BagIt (this may result in expanding other files unnecessarily, but that's ok). if (file.getName().equals("bagit.txt") || file.getName().equals("bag-info.txt") || file.getName().equals("CWRC.bin")) { BagItExpander bagitExpander = new BagItExpander(storedDocumentSourceStorage, parameters); return bagitExpander.getExpandedStoredDocumentSources(parentStoredDocumentSource); // expandedDocumentSources.clear(); // expandedDocumentSources.add(parentStoredDocumentSource); // parentMetadata.setDocumentFormat(DocumentFormat.BAGIT); // // ensure the parent metadata is right // storedDocumentSourceStorage.updateStoredDocumentSourceMetadata(parentStoredDocumentSource.getId(), parentMetadata); // return expandedDocumentSources; } // skip directories and skippable files if (DocumentFormat.isSkippable(file)==false) { DocumentMetadata childMetadata = parentMetadata.asParent(parentStoredDocumentSource.getId(), DocumentMetadata.ParentType.EXPANSION); childMetadata.setLocation(file.toString()); childMetadata.setModified(archiveEntry.getLastModifiedDate().getTime()); childMetadata.setSource(Source.STREAM); childMetadata.setTitle(file.getName().replaceFirst("\\.\\w+$", "")); DocumentFormat childDocumentFormat = null; if (parentDocumentFormat==DocumentFormat.PBLIT) { // use cloned expander without parameters if (file.getName().equals("satorbase.xml")) { childDocumentFormat=DocumentFormat.SATORBASE; } else if (file.getParent().equals("hyperlistes")) { childDocumentFormat=DocumentFormat.HYPERLISTES; } else if (file.getParent().equals("toucher")) { childDocumentFormat=DocumentFormat.TOUCHER; } else { throw new IllegalStateException("Unrecognized file for PBLIT bundle"); } childMetadata.setDocumentFormat(childDocumentFormat); } String id = DigestUtils.md5Hex(parentId+filename); InputSource inputSource = new InputStreamInputSource(id, childMetadata, new CloseShieldInputStream(archiveInputStream)); StoredDocumentSource storedDocumentSource = storedDocumentSourceStorage.getStoredDocumentSource(inputSource); if (parentDocumentFormat==DocumentFormat.PBLIT) { // use cloned expander without parameters if (!clonedExpanders.containsKey(childDocumentFormat.name())) { FlexibleParameters clonedParams = new FlexibleParameters(); clonedParams.setParameter("inputFormat", childDocumentFormat.name()); clonedExpanders.put(childDocumentFormat.name(), new StoredDocumentSourceExpander(storedDocumentSourceStorage, clonedParams)); } // if (childDocumentFormat==DocumentFormat.HYPERLISTES) { expandedDocumentSources.addAll(clonedExpanders.get(childDocumentFormat.name()).getExpandedStoredDocumentSources(storedDocumentSource)); // expand this recursively // } } else { expandedDocumentSources.addAll(this.expander.getExpandedStoredDocumentSources(storedDocumentSource)); // expand this recursively } } } archiveEntry = archiveInputStream.getNextEntry(); } return expandedDocumentSources; } }