/**
*
*/
package org.voyanttools.trombone.input.extract;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.UUID;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.cxf.helpers.IOUtils;
import org.voyanttools.trombone.input.source.InputSource;
import org.voyanttools.trombone.input.source.InputStreamInputSource;
import org.voyanttools.trombone.input.source.StoredDocumentSourceInputSource;
import org.voyanttools.trombone.model.DocumentFormat;
import org.voyanttools.trombone.model.DocumentMetadata;
import org.voyanttools.trombone.model.DocumentMetadata.ParentType;
import org.voyanttools.trombone.model.StoredDocumentSource;
import org.voyanttools.trombone.storage.StoredDocumentSourceStorage;
import org.voyanttools.trombone.util.FlexibleParameters;
/**
 * Extractor for BagIt archives whose entries follow the CWRC naming used below.
 *
 * <p>The archive is read sequentially: {@code MODS.bin} supplies title and author,
 * {@code DC.xml} supplies the {@code cwrcIdentifier} extra, and {@code CWRC.bin}
 * holds the document contents that are stored and then extracted.</p>
 *
 * @author sgsin
 */
public class BagItExtractor implements Extractor {

    // these filenames are all hard-coded for CWRC for now, not sure how to generalize this
    private static final String MODS_ENTRY_NAME = "MODS.bin";
    private static final String DC_ENTRY_NAME = "DC.xml";
    private static final String CONTENTS_ENTRY_NAME = "CWRC.bin";

    private final StoredDocumentSourceStorage storedDocumentSourceStorage;

    FlexibleParameters parameters;

    /**
     * @param storedDocumentSourceStorage storage used to read the archive and to store extracted entries
     * @param parameters parameters kept on this instance (not read by this class itself)
     */
    public BagItExtractor(StoredDocumentSourceStorage storedDocumentSourceStorage, FlexibleParameters parameters) {
        this.storedDocumentSourceStorage = storedDocumentSourceStorage;
        this.parameters = parameters;
    }

    /**
     * Reads the archive behind the given stored document source and returns an input source
     * for the document extracted from its {@code CWRC.bin} entry.
     *
     * @param storedDocumentSource the stored archive to read
     * @return an input source backed by the extracted stored document
     * @throws IOException if the archive cannot be read or recognized, or if no
     *         {@code CWRC.bin} entry was found
     * @see org.voyanttools.trombone.input.extract.Extractor#getExtractableInputSource(org.voyanttools.trombone.model.StoredDocumentSource)
     */
    @Override
    public InputSource getExtractableInputSource(StoredDocumentSource storedDocumentSource) throws IOException {
        String id = DigestUtils.md5Hex(storedDocumentSource.getId() + "-bagit");
        DocumentMetadata metadata = storedDocumentSource.getMetadata().asParent(id, ParentType.EXTRACTION);
        StoredDocumentSource extractedDocumentSource = null;
        ArchiveStreamFactory archiveStreamFactory = new ArchiveStreamFactory();

        // Both streams are declared as resources so that the underlying stream is closed even when
        // the archive format cannot be detected and no archive stream is ever created.
        try (InputStream bis = new BufferedInputStream(
                    storedDocumentSourceStorage.getStoredDocumentSourceInputStream(storedDocumentSource.getId()));
                ArchiveInputStream archiveInputStream = archiveStreamFactory.createArchiveInputStream(bis)) {

            for (ArchiveEntry archiveEntry = archiveInputStream.getNextEntry();
                    archiveEntry != null;
                    archiveEntry = archiveInputStream.getNextEntry()) {

                if (archiveEntry.isDirectory()) {
                    continue;
                }

                // only the last path segment of the entry name is compared
                String entryName = new File(archiveEntry.getName()).getName();

                if (MODS_ENTRY_NAME.equals(entryName)) { // get metadata if CWRC.bin doesn't have header
                    DocumentMetadata docMetadata = getMetadata(archiveInputStream, "MODS");
                    metadata.setTitle(docMetadata.getTitle());
                    metadata.setAuthor(docMetadata.getAuthor());
                } else if (DC_ENTRY_NAME.equals(entryName)) { // get CWRC ID
                    DocumentMetadata docMetadata = getMetadata(archiveInputStream, "DC");
                    metadata.setExtra("cwrcIdentifier", docMetadata.getExtra("cwrcIdentifier"));
                } else if (CONTENTS_ENTRY_NAME.equals(entryName)) {
                    // if several entries match, the last one read wins
                    extractedDocumentSource = extractContents(archiveInputStream, metadata);
                }
            }
        } catch (ArchiveException e) {
            throw new IOException(e);
        }

        if (extractedDocumentSource == null) {
            throw new IOException("Unable to find BagIt contents.");
        }
        return new StoredDocumentSourceInputSource(storedDocumentSourceStorage, extractedDocumentSource);
    }

    /**
     * Stores the current archive entry as a document and runs it through the stored document extractor.
     *
     * @param archiveInputStream archive stream positioned on the contents entry
     * @param metadata metadata to attach to the stored contents
     * @return the extracted stored document source
     * @throws IOException if storing, reading back or extracting the contents fails
     */
    private StoredDocumentSource extractContents(ArchiveInputStream archiveInputStream, DocumentMetadata metadata) throws IOException {
        // the shield keeps consumers of this entry from closing the whole archive stream
        InputSource is = new InputStreamInputSource(DigestUtils.md5Hex(UUID.randomUUID().toString()), metadata, new CloseShieldInputStream(archiveInputStream));
        // NOTE(review): metadata is handed over here; presumably values from MODS.bin/DC.xml are
        // only reflected if those entries were read before CWRC.bin - confirm against storage behaviour
        StoredDocumentSource storedDocSource = storedDocumentSourceStorage.getStoredDocumentSource(is);
        FlexibleParameters params = new FlexibleParameters();

        // we'll have a peek at the file to see if we can determine its format, we do this here because otherwise it's treated much more generically
        String contents;
        try (InputStream storedInputStream = storedDocumentSourceStorage.getStoredDocumentSourceInputStream(storedDocSource.getId())) {
            contents = IOUtils.readStringFromStream(storedInputStream);
        }
        if (contents.contains("<?xml") && contents.contains("<TEI") && contents.contains("GutenTag")) {
            params.setParameter("inputFormat", "GUTENTAG");
            metadata.setDefaultFormat(DocumentFormat.XML);
        }

        StoredDocumentSourceExtractor extractor = new StoredDocumentSourceExtractor(storedDocumentSourceStorage, params);
        return extractor.getExtractedStoredDocumentSource(storedDocSource);
    }

    /**
     * Stores the current archive entry and wraps it in an XML extractable input source
     * configured with the given input format.
     *
     * @param archiveInputStream archive stream positioned on the entry to read
     * @param inputFormat value for the {@code inputFormat} parameter given to the XML extractor
     * @return the extractable input source produced by the XML extractor
     * @throws IOException if storing the entry or creating the input source fails
     */
    private InputSource getExtractableInputSource(ArchiveInputStream archiveInputStream, String inputFormat) throws IOException {
        FlexibleParameters params = new FlexibleParameters(new String[]{"inputFormat=" + inputFormat});
        // the shield keeps consumers of this entry from closing the whole archive stream
        InputSource is = new InputStreamInputSource(DigestUtils.md5Hex(UUID.randomUUID().toString()), new DocumentMetadata(), new CloseShieldInputStream(archiveInputStream));
        StoredDocumentSource storedDocSource = storedDocumentSourceStorage.getStoredDocumentSource(is);
        XmlExtractor extractor = new XmlExtractor(storedDocumentSourceStorage, params);
        return extractor.getExtractableInputSource(storedDocSource);
    }

    /**
     * Returns the metadata obtained by treating the current archive entry as XML of the given format.
     *
     * @param archiveInputStream archive stream positioned on the metadata entry
     * @param inputFormat input format name passed to the XML extractor (e.g. {@code MODS} or {@code DC})
     * @return the metadata reported by the extractable input source
     * @throws IOException if the entry cannot be stored or read
     */
    private DocumentMetadata getMetadata(ArchiveInputStream archiveInputStream, String inputFormat) throws IOException {
        InputSource inputSource = getExtractableInputSource(archiveInputStream, inputFormat);
        inputSource.getInputStream().close(); // make sure it's read
        return inputSource.getMetadata();
    }
}