/*******************************************************************************
* Trombone is a flexible text processing and analysis library used
* primarily by Voyant Tools (voyant-tools.org).
*
* Copyright (©) 2007-2012 Stéfan Sinclair & Geoffrey Rockwell
*
* This file is part of Trombone.
*
* Trombone is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Trombone is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Trombone. If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
package org.voyanttools.trombone.input.extract;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.html.DefaultHtmlMapper;
import org.apache.tika.parser.html.HtmlMapper;
import org.voyanttools.trombone.input.source.InputSource;
import org.voyanttools.trombone.model.DocumentFormat;
import org.voyanttools.trombone.model.DocumentMetadata;
import org.voyanttools.trombone.model.StoredDocumentSource;
import org.voyanttools.trombone.storage.StoredDocumentSourceStorage;
import org.voyanttools.trombone.util.FlexibleParameters;
import org.voyanttools.trombone.util.LangDetector;
/**
 * An {@link Extractor} that uses Apache Tika to convert stored documents in a
 * variety of formats (PDF, MS Word, HTML, etc.) into simple XHTML whose body
 * is cleaned up and returned as UTF-8 text.
 *
 * @author sgs
*/
public class TikaExtractor implements Extractor {
private ParseContext context;
private Parser parser;
private Detector detector;
private StoredDocumentSourceStorage storedDocumentSourceStorage;
private FlexibleParameters parameters;
TikaExtractor(StoredDocumentSourceStorage storedDocumentSourceStorage, FlexibleParameters parameters) {
this.storedDocumentSourceStorage = storedDocumentSourceStorage;
this.parameters = parameters;
context = new ParseContext();
detector = new DefaultDetector();
parser = new AutoDetectParser(detector);
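// registering the parser in the context allows Tika to recurse into embedded documents (e.g. archive entries or attachments)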
context.set(Parser.class, parser);
context.set(HtmlMapper.class, new CustomHtmlMapper());
}
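/**
 * Wraps a stored document in a lazily-extracted input source whose id is an
 * MD5 digest of the stored document's id plus any extraction-relevant
 * parameters, so each distinct parameter combination is cached separately.
 * A minimal usage sketch (hypothetical setup; in practice this package-private
 * constructor is called from within Trombone's extraction pipeline):
 * <pre>{@code
 * TikaExtractor extractor = new TikaExtractor(storage, parameters);
 * InputSource extracted = extractor.getExtractableInputSource(storedDocumentSource);
 * InputStream stream = extracted.getInputStream(); // extraction happens here
 * }</pre>
 */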
public InputSource getExtractableInputSource(StoredDocumentSource storedDocumentSource) throws IOException {
StringBuilder id = new StringBuilder(storedDocumentSource.getId()).append("tika-extracted");
// only parameters that affect extraction contribute to the id, so cached extractions survive unrelated parameter changes
for (String param : new String[]{"language","inputRemoveFrom","inputRemoveFromAfter","inputRemoveUntil","inputRemoveUntilAfter"}) {
if (parameters.containsKey(param)) {
id.append(param).append(parameters.getParameterValue(param));
}
}
return new ExtractableTikaInputSource(DigestUtils.md5Hex(id.toString()), storedDocumentSource);
}
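/**
 * An {@link HtmlMapper} that, unlike {@link DefaultHtmlMapper}, keeps every
 * element and attribute (normalized to lower case) rather than only a
 * whitelist of known-safe ones, while additionally discarding iframes.
 */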
private class CustomHtmlMapper extends DefaultHtmlMapper {
@Override
public String mapSafeElement(String name) {
return name.toLowerCase();
}
@Override
public String mapSafeAttribute(String elementName, String attributeName) {
return attributeName.toLowerCase();
}
@Override
public boolean isDiscardElement(String name) {
return super.isDiscardElement(name) || name.equalsIgnoreCase("iframe");
}
}
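/**
 * A lazy wrapper around a stored document source: nothing is parsed until
 * {@link #getInputStream()} is called, at which point Tika converts the
 * document to XHTML, the body is isolated and cleaned up, and metadata
 * (title, author, keywords, language) is filled in as a side effect.
 */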
private class ExtractableTikaInputSource implements InputSource {
private String id;
private StoredDocumentSource storedDocumentSource;
private DocumentMetadata metadata;
private boolean isProcessed = false;
private ExtractableTikaInputSource(String id, StoredDocumentSource storedDocumentSource) {
this.id = id;
this.storedDocumentSource = storedDocumentSource;
this.metadata = storedDocumentSource.getMetadata();
}
@Override
public InputStream getInputStream() throws IOException {
org.apache.tika.metadata.Metadata extractedMetadata = new org.apache.tika.metadata.Metadata();
// this was added to override poor Unicode detection for non-ASCII characters, but a Tika upgrade has since resolved the issue
// if (metadata.getDocumentFormat()==DocumentFormat.TEXT) {
// extractedMetadata.set(Metadata.CONTENT_TYPE, MediaType.TEXT_PLAIN.toString());
// }
StringWriter sw = new StringWriter();
SAXTransformerFactory factory = (SAXTransformerFactory)
SAXTransformerFactory.newInstance();
InputStream input = storedDocumentSourceStorage.getStoredDocumentSourceInputStream(storedDocumentSource.getId());
// do a first pass to convert various formats to simple HTML
try {
TransformerHandler handler = factory.newTransformerHandler();
// set the output to xhtml instead of html to avoid "Illegal HTML character" exceptions from the transformer
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xhtml");
handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
handler.setResult(new StreamResult(sw));
parser.parse(input, handler, extractedMetadata, context);
} catch (Exception e) {
throw new IOException("Unable to parse document: "+storedDocumentSource.getMetadata(), e);
} finally {
input.close();
}
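// the writer now holds the XHTML serialization of the parsed document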
String extractedContent = sw.toString();
// special handling for PDFs from the xhtml output
if (metadata.getDocumentFormat()==DocumentFormat.PDF) {
// we get empty paragraphs for some reason
extractedContent = extractedContent.replaceAll("<p></p>", "")
// newlines seem to be added superfluously, especially since paragraphs are formed properly
.replaceAll("(
|
|
)+", "")
// hard spaces seem to be added superfluously as well
.replaceAll("\\t[\\s \u00A0]+", " ");
}
if (metadata.getDocumentFormat()==DocumentFormat.TOUCHER) {
metadata.setExtra("collection", "Toucher");
}
for (String name : extractedMetadata.names()) {
String value = extractedMetadata.get(name);
if (value.trim().isEmpty()) {continue;}
if (name.equals("title") || name.equals("dc:title")) {
DocumentFormat f = metadata.getDocumentFormat();
// don't overwrite an existing title for text or unknown formats
if (metadata.getTitle().isEmpty() || (f!=DocumentFormat.UNKNOWN && f!=DocumentFormat.TEXT)) {
metadata.setTitle(value);
}
}
else if (name.toLowerCase().equals("meta:author") || name.toLowerCase().equals("author")) {
metadata.setAuthor(value);
}
else if (name.toLowerCase().equals("keywords")) {
metadata.setKeywords(value);
}
else {
metadata.setExtra(name, value);
}
}
// now extract the body from the simple HTML; we can take a shortcut since the content has already been processed
int start = extractedContent.indexOf("<body");
int end = extractedContent.indexOf("</body");
if (start > -1 && end > start) {
int startend = extractedContent.indexOf('>', start)+1; // index just past the opening <body ...> tag
if (startend>start && startend<end) {
extractedContent = extractedContent.substring(startend, end);
}
}
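// extractedContent now holds just the inner body markup (or the full output if no body tags were found)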
DocumentFormat format = storedDocumentSource.getMetadata().getDocumentFormat();
if (format==DocumentFormat.PDF) {
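// clean up carriage return entities, redundant whitespace and empty paragraph tags left over from PDF extraction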
extractedContent = extractedContent.replaceAll("\\s+\\&\\#xD;\\s+", " ");
extractedContent = extractedContent.replaceAll("\\s+ ", " ");
extractedContent = extractedContent.replaceAll("<p/>", "");
}
else if (format==DocumentFormat.TEXT || format==DocumentFormat.UNKNOWN) {
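// plain text: strip newlines before closing paragraph tags, turn blank lines into paragraph breaks and remaining newlines into line breaks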
extractedContent = extractedContent.replaceAll("
</p>", "</p>");
extractedContent = extractedContent.replaceAll("
+", "</p>\n <p>");
extractedContent = extractedContent.replaceAll("
", "<br />\n ");
}
// try to determine language
metadata.setLanguageCode(LangDetector.langDetector.detect(extractedContent, parameters));
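// trim the content according to the optional inputRemove* parameters; for example
// (hypothetical values) inputRemoveUntilAfter="\\*\\*\\* START OF.*" and
// inputRemoveFrom="\\*\\*\\* END OF" could strip a Project Gutenberg header and
// footer, keeping only the text between them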
if (parameters.containsKey("inputRemoveUntil")) {
Pattern pattern = Pattern.compile(parameters.getParameterValue("inputRemoveUntil"));
Matcher matcher = pattern.matcher(extractedContent);
if (matcher.find()) {
extractedContent = extractedContent.substring(matcher.start());
}
}
if (parameters.containsKey("inputRemoveUntilAfter")) {
Pattern pattern = Pattern.compile(parameters.getParameterValue("inputRemoveUntilAfter"));
Matcher matcher = pattern.matcher(extractedContent);
if (matcher.find()) {
extractedContent = extractedContent.substring(matcher.end());
}
}
if (parameters.containsKey("inputRemoveFrom")) {
Pattern pattern = Pattern.compile(parameters.getParameterValue("inputRemoveFrom"));
Matcher matcher = pattern.matcher(extractedContent);
if (matcher.find()) {
extractedContent = extractedContent.substring(0, matcher.start());
}
}
if (parameters.containsKey("inputRemoveFromAfter")) {
Pattern pattern = Pattern.compile(parameters.getParameterValue("inputRemoveFromAfter"));
Matcher matcher = pattern.matcher(extractedContent);
if (matcher.find()) {
extractedContent = extractedContent.substring(0, matcher.end());
}
}
isProcessed = true;
return new ByteArrayInputStream(extractedContent.getBytes(StandardCharsets.UTF_8));
}
@Override
public DocumentMetadata getMetadata() throws IOException {
return isProcessed ? this.metadata : storedDocumentSourceStorage.getStoredDocumentSourceMetadata(id);
}
@Override
public String getUniqueId() {
return this.id;
}
}
}