/*
* <p><b>License and Copyright: </b>The contents of this file are subject to the
* same open source license as the Fedora Repository System at www.fedora-commons.org
* Copyright © 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 by The Technical University of Denmark.
* All rights reserved.</p>
*/
package dk.defxws.fedoragsearch.server;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Arrays;
import javax.xml.transform.stream.StreamSource;
import org.apache.log4j.Logger;
import org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser;
import org.apache.lucene.benchmark.byTask.feeds.DocData;
import org.apache.lucene.benchmark.byTask.feeds.TrecContentSource;
//import org.apache.lucene.demo.html.HTMLParser;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.sax.WriteOutContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import dk.defxws.fedoragsearch.server.errors.GenericSearchException;
/**
* Performs transformations from formatted documents (plain text, XML, HTML, PDF) to text for indexing.
*
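* <p>Typical use, as a minimal sketch (in GSearch the datastream bytes and MIME type
* are supplied by the calling operations code; the file name below is hypothetical):</p>
* <pre>
* byte[] pdfBytes = java.nio.file.Files.readAllBytes(java.nio.file.Paths.get("example.pdf"));
* StringBuffer text = new TransformerToText().getText(pdfBytes, "application/pdf");
* </pre>
*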
* @author gsp@dtv.dk
* @version
*/
public class TransformerToText {
private static final Logger logger =
Logger.getLogger(TransformerToText.class);
public static final String[] handledMimeTypes = {"text/plain", "text/xml", "application/xml", "text/html", "application/pdf"};
public TransformerToText() {
}
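/**
 * Extracts text and metadata from a datastream with the Tika AutoDetectParser
 * and renders them as index field elements.
 *
 * @param fullDsId full datastream id, used in log messages
 * @param doc the datastream content
 * @param indexFieldTagName element name for each index field, either "IndexField" or "field"
 * @param textIndexField name of the index field that receives the extracted text, may be null or empty
 * @param indexFieldNamePrefix prefix prepended to the metadata-derived index field names
 * @param selectedFields comma-separated list of Tika metadata field specs to include;
 *        an empty, non-null value selects all metadata fields
 * @param writeLimit maximum number of characters Tika writes before truncating the extracted text
 * @return the index field elements as a StringBuffer
 * @throws GenericSearchException if parsing fails for a reason other than the write limit being reached
 */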
public StringBuffer getFromTika(String fullDsId, byte[] doc, String indexFieldTagName, String textIndexField, String indexFieldNamePrefix, String selectedFields, int writeLimit)
throws GenericSearchException {
StringBuffer indexFields = new StringBuffer();
InputStream isr = new ByteArrayInputStream(doc);
WriteOutContentHandler textHandler = new WriteOutContentHandler(writeLimit);
Metadata metadata = new Metadata();
AutoDetectParser parser = new AutoDetectParser();
try {
parser.parse(isr, textHandler, metadata);
} catch (Exception e) {
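// Tika reports a reached write limit via an exception; treat it as truncation, not as a failure.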
if (!textHandler.isWriteLimitReached(e)) {
throw new GenericSearchException(e.toString());
} else {
logger.warn("getFromTika"
+" fullDsId="+fullDsId
+" writeLimit reached="+e);
}
} finally {
try {
isr.close();
} catch (Exception e) {
throw new GenericSearchException(e.toString());
}
}
if (logger.isDebugEnabled()) {
logger.debug("getFromTika"
+" fullDsId="+fullDsId
+" indexFieldTagName="+indexFieldTagName
+" textIndexField="+textIndexField
+" indexFieldNamePrefix="+indexFieldNamePrefix
+" selectedFields="+selectedFields);
for (int i=0; i<metadata.names().length; i++) {
String name = metadata.names()[i];
StringBuffer metadataValue = new StringBuffer(metadata.get(name));
String[] metadataValues = metadata.getValues(name);
if (metadataValues.length>1) {
for (int j=1; j<metadataValues.length; j++) {
metadataValue.append(" "+metadataValues[j]);
}
}
logger.debug(" METADATA name="+name+" value="+metadataValue);
}
}
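// Emit one index field element per selected Tika metadata field (plus one for the
// extracted text when textIndexField is set): a non-empty selectedFields is a
// comma-separated list of field specs; an empty, non-null selectedFields selects all
// metadata names, with characters unsafe in field names replaced by '_'.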
if ("IndexField".equals(indexFieldTagName) || "field".equals(indexFieldTagName)) {
String[] names = new String[0];
if (selectedFields != null && selectedFields.length()>0){
names = selectedFields.split(",");
} else {
if (selectedFields != null){
names = metadata.names();
for (int i=0; i<names.length; i++) {
names[i] = names[i]+"="
+names[i].replace(' ', '_')
.replace(':', '_')
.replace('/', '_')
.replace('=', '_')
.replace('(', '_')
.replace(')', '_')
.replace('&', '_');
}
}
}
if (textIndexField != null && textIndexField.length() > 0) {
names = Arrays.copyOf(names, names.length+1);
names[names.length-1] = textIndexField;
}
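// Each selected field spec has the form
//   [metadataName=]indexFieldName[/index[/store[/termVector[/boost]]]]
// where the optional parameters default to TOKENIZED, YES, YES and 1.0,
// and metadataName defaults to indexFieldName.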
for (int i=0; i<names.length; i++) {
if (logger.isDebugEnabled())
logger.debug("getFromTika"
+" metadata names["+i+"]="+names[i]);
String fieldSpec = names[i].trim();
if (fieldSpec.length() > 0) {
String[] fieldNameWithParams = fieldSpec.split("/");
String fieldName = fieldNameWithParams[0].trim();
String fieldNameOrg = fieldName;
if (fieldSpec.indexOf("=") > 0) {
fieldNameOrg = fieldSpec.substring(0, fieldSpec.indexOf("="));
fieldSpec = fieldSpec.substring(fieldSpec.indexOf("=")+1);
fieldNameWithParams = fieldSpec.split("/");
fieldName = fieldNameWithParams[0].trim();
}
String index = "TOKENIZED";
String store = "YES";
String termVector = "YES";
String boost = "1.0";
if (fieldNameWithParams.length > 1) {
if (fieldNameWithParams[1].length() > 0)
index = fieldNameWithParams[1];
if (fieldNameWithParams.length > 2) {
if (fieldNameWithParams[2].length() > 0)
store = fieldNameWithParams[2];
if (fieldNameWithParams.length > 3) {
if (fieldNameWithParams[3].length() > 0)
termVector = fieldNameWithParams[3];
if (fieldNameWithParams.length > 4) {
if (fieldNameWithParams[4].length() > 0)
boost = fieldNameWithParams[4];
}
}
}
}
StringBuffer indexFieldValue = new StringBuffer();
String indexFieldName = fieldName;
if (textIndexField != null && textIndexField.length() > 0 && i == names.length-1) {
indexFieldValue.append(textHandler.toString());
} else {
indexFieldName = indexFieldNamePrefix+fieldName;
indexFieldValue.append(metadata.get(fieldNameOrg));
String[] indexFieldValues = metadata.getValues(fieldNameOrg);
if (indexFieldValues.length>1) {
for (int j=1; j<indexFieldValues.length; j++) {
indexFieldValue.append(" "+indexFieldValues[j]);
}
}
}
StringBuffer indexField = new StringBuffer();
if (indexFieldValue.length()>0) {
if ("IndexField".equals(indexFieldTagName)) {
indexField.append("\n<IndexField IFname=\""+indexFieldName+"\" index=\""+index+"\" store=\""+store+"\" termVector=\""+termVector+"\" boost=\""+boost+"\">");
} else if ("field".equals(indexFieldTagName)) {
indexField.append("\n<field name=\""+indexFieldName+"\">");
}
indexField.append(indexFieldValue.toString().replaceAll("&", "&amp;").replaceAll("<", "&lt;").replaceAll(">", "&gt;").replaceAll("\"", "&quot;"));
indexField.append("</"+indexFieldTagName+">");
indexField.append("<!--"+names[i]+"-->");
indexFields.append(indexField);
}
if (logger.isDebugEnabled())
logger.debug("getFromTika"
+" metadataFieldName="+fieldNameOrg
+" indexField="+indexField);
}
}
}
StringBuffer docText = new StringBuffer(indexFields);
// replace control characters (other than tab, LF and CR), which the indexing stylesheet cannot handle, with spaces
char c;
for (int i=0; i<docText.length(); i++) {
c = docText.charAt(i);
if (c < 32 && c != 9 && c != 10 && c != 13) {
if (logger.isDebugEnabled())
logger.debug("getTextFromTika index="+i+" char="+c+" set to 32");
docText.replace(i, i+1, " ");
}
}
return docText;
}
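/**
 * Dispatches on MIME type to the appropriate text extractor.
 * Unsupported types, including application/ps and application/msword, yield an empty buffer.
 */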
public StringBuffer getText(byte[] doc, String mimetype)
throws GenericSearchException {
if (mimetype.equals("text/plain")) {
return getTextFromText(doc);
} else if(mimetype.equals("text/xml") || mimetype.equals("application/xml")) {
return getTextFromXML(doc);
} else if(mimetype.equals("text/html")) {
return getTextFromHTML(doc);
} else if(mimetype.equals("application/pdf")) {
return getTextFromPDF(doc);
} else if(mimetype.equals("application/ps")) {
return new StringBuffer();
} else if(mimetype.equals("application/msword")) {
return new StringBuffer();
} else return new StringBuffer();
}
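// Reads the bytes through an InputStreamReader using the platform default charset
// and returns the characters unchanged.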
private StringBuffer getTextFromText(byte[] doc)
throws GenericSearchException {
StringBuffer docText = new StringBuffer();
InputStreamReader isr = new InputStreamReader(new ByteArrayInputStream(doc));
try {
int c = isr.read();
while (c>-1) {
docText.append((char)c);
c=isr.read();
}
} catch (IOException e) {
throw new GenericSearchException(e.toString());
}
return docText;
}
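// Applies the fgsconfigFinal/textFromXml stylesheet to the document and strips
// everything up to and including the first '>' (typically the XML declaration emitted by the transformer).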
private StringBuffer getTextFromXML(byte[] doc)
throws GenericSearchException {
InputStreamReader isr = new InputStreamReader(new ByteArrayInputStream(doc));
StringBuffer docText = (new GTransformer()).transform(
"fgsconfigFinal/textFromXml",
new StreamSource(isr));
docText.delete(0, docText.indexOf(">")+1);
return docText;
}
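// Extracts the body text of the HTML document with the DemoHTMLParser from the Lucene benchmark module.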
private StringBuffer getTextFromHTML(byte[] doc)
throws GenericSearchException {
StringBuffer docText = new StringBuffer();
DemoHTMLParser htmlParser = new DemoHTMLParser();
try {
DocData docData = htmlParser.parse(new DocData(), null, null, new InputSource(new ByteArrayInputStream(doc)), new TrecContentSource());
docText = new StringBuffer(docData.getBody());
} catch (IOException e) {
throw new GenericSearchException(e.toString());
} catch (SAXException e) {
throw new GenericSearchException(e.toString());
}
// HTMLParser htmlParser = new HTMLParser(new ByteArrayInputStream(doc));
// try {
// InputStreamReader isr = (InputStreamReader) htmlParser.getReader();
// int c = isr.read();
// while (c>-1) {
// docText.append((char)c);
// c=isr.read();
// }
// } catch (IOException e) {
// throw new GenericSearchException(e.toString());
// }
return docText;
}
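/**
 * Extracts text from a PDF datastream with PDFBox
 * (PDFParser, COSDocument, PDDocument, PDFTextStripper),
 * closing the intermediate resources at each failure point.
 */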
private StringBuffer getTextFromPDF(byte[] doc)
throws GenericSearchException {
// Extract the PDF document's textual content.
// Encrypted and/or password-protected PDF documents cannot be indexed by GSearch;
// the exception thrown here is caught by the calling GenericOperationsImpl method,
// which then returns an empty index field.
if (logger.isDebugEnabled())
logger.debug("getTextFromPDF");
StringBuffer docText = new StringBuffer();
ByteArrayInputStream bais = null;
try {
bais = new ByteArrayInputStream(doc);
} catch (Exception e) {
closeBAIS(bais);
if (logger.isDebugEnabled())
logger.debug("getTextFromPDF new ByteArrayInputStream: ", e);
throw new GenericSearchException(
"getTextFromPDF new ByteArrayInputStream: ", e);
}
if (logger.isDebugEnabled())
logger.debug("getTextFromPDF new ByteArrayInputStream");
PDFParser parser;
try {
parser = new PDFParser(bais);
} catch (Exception e) {
closeBAIS(bais);
if (logger.isDebugEnabled())
logger.debug("getTextFromPDF new PDFParser: ", e);
throw new GenericSearchException(
"getTextFromPDF new PDFParser: ", e);
}
if (logger.isDebugEnabled())
logger.debug("getTextFromPDF new PDFParser");
try {
parser.parse();
} catch (Exception e) {
closeBAIS(bais);
if (logger.isDebugEnabled())
logger.debug("getTextFromPDF parser.parse: ", e);
throw new GenericSearchException(
"getTextFromPDF parser.parse: ", e);
}
if (logger.isDebugEnabled())
logger.debug("getTextFromPDF parser.parse");
COSDocument cosDoc = null;
try {
cosDoc = parser.getDocument();
}
catch (Exception e) {
closeBAIS(bais);
closeCOSDocument(cosDoc);
if (logger.isDebugEnabled())
logger.debug("getTextFromPDF parser.getDocument: ", e);
throw new GenericSearchException(
"getTextFromPDF parser.getDocument: ", e);
}
if (logger.isDebugEnabled())
logger.debug("getTextFromPDF parser.getDocument");
PDDocument pdDoc = null;
try {
pdDoc = new PDDocument(cosDoc);
} catch (Exception e) {
closeBAIS(bais);
closeCOSDocument(cosDoc);
closePDDocument(pdDoc);
if (logger.isDebugEnabled())
logger.debug("getTextFromPDF new PDDocument: ", e);
throw new GenericSearchException(
"getTextFromPDF new PDDocument: ", e);
}
if (logger.isDebugEnabled())
logger.debug("getTextFromPDF new PDDocument isEncrypted="+pdDoc.isEncrypted()+" getNumberOfPages="+pdDoc.getNumberOfPages());
PDFTextStripper stripper;
try {
stripper = new PDFTextStripper();
} catch (Exception e) {
closeBAIS(bais);
closeCOSDocument(cosDoc);
closePDDocument(pdDoc);
if (logger.isDebugEnabled())
logger.debug("getTextFromPDF new PDFTextStripper: ", e);
throw new GenericSearchException(
"getTextFromPDF new PDFTextStripper: ", e);
}
if (logger.isDebugEnabled())
logger.debug("getTextFromPDF new PDFTextStripper getStartPage="+stripper. getStartPage()+" getEndPage="+stripper.getEndPage());
String docString = "";
try {
docString = stripper.getText(pdDoc);
}
catch (Exception e) {
if (logger.isDebugEnabled())
logger.debug("getTextFromPDF stripper.getText: ", e);
throw new GenericSearchException(
"getTextFromPDF stripper.getText: ", e);
}
finally {
if (logger.isDebugEnabled())
logger.debug("getTextFromPDF stripper.getText finally");
closeBAIS(bais);
closeCOSDocument(cosDoc);
closePDDocument(pdDoc);
}
if (logger.isDebugEnabled())
logger.debug("getTextFromPDF stripper.getText");
docText = new StringBuffer(docString);
// replace control characters (other than tab, LF and CR), which the indexing stylesheet cannot handle, with spaces
char c;
for (int i=0; i<docText.length(); i++) {
c = docText.charAt(i);
if (c < 32 && c != 9 && c != 10 && c != 13) {
if (logger.isDebugEnabled())
logger.debug("getTextFromPDF index="+i+" char="+c+" set to 32");
docText.replace(i, i+1, " ");
}
}
return docText;
}
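// Resource-closing helpers: each one wraps a close failure in a GenericSearchException
// so that callers report it uniformly.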
private void closeCOSDocument(COSDocument cosDoc)
throws GenericSearchException {
if (cosDoc != null) {
try {
cosDoc.close();
}
catch (Exception e) {
throw new GenericSearchException(
"Cannot close COSDocument: ", e);
}
}
}
private void closePDDocument(PDDocument pdDoc)
throws GenericSearchException {
if (pdDoc != null) {
try {
pdDoc.close();
}
catch (Exception e) {
throw new GenericSearchException(
"Cannot close PDDocument: ", e);
}
}
}
private void closeBAIS(ByteArrayInputStream bais)
throws GenericSearchException {
if (bais != null) {
try {
bais.close();
}
catch (Exception e) {
throw new GenericSearchException(
"Cannot close ByteArrayInputStream: ", e);
}
}
}
}