/**
* pdfXtk - PDF Extraction Toolkit
* Copyright (c) by the authors/contributors. All rights reserved.
* This project includes code from PDFBox and TouchGraph.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* 3. Neither the names pdfXtk or PDF Extraction Toolkit; nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* http://pdfxtk.sourceforge.net
*
*/
package at.ac.tuwien.dbai.pdfwrap;
import at.ac.tuwien.dbai.pdfwrap.analysis.PageProcessor;
import at.ac.tuwien.dbai.pdfwrap.exceptions.DocumentProcessingException;
import at.ac.tuwien.dbai.pdfwrap.model.document.GenericSegment;
import at.ac.tuwien.dbai.pdfwrap.model.document.IXHTMLSegment;
import at.ac.tuwien.dbai.pdfwrap.model.document.Page;
import at.ac.tuwien.dbai.pdfwrap.model.graph.AdjacencyGraph;
import at.ac.tuwien.dbai.pdfwrap.pdfread.PDFObjectExtractor;
import at.ac.tuwien.dbai.pdfwrap.pdfread.PDFPage;
import at.ac.tuwien.dbai.pdfwrap.utils.Utils;
import org.apache.log4j.Logger;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.XMLSerializer;
import org.w3c.dom.DOMImplementation;
import org.w3c.dom.Element;
import org.w3c.dom.Text;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import java.io.*;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
* This is the main program that parses the pdf document and transforms it.
* Based upon PDFBox code example from Ben Litchfield
*
* @author Tamir Hassan, pdfanalyser@tamirhassan.com
* @version PDF Analyser 0.9
* @author Ben Litchfield (ben@csh.rit.edu)
*/
public class ProcessFile
{
// TODO: move somewhere sensible! this is a global var, at least for GUI
// moved to GUI 30.11.06
//public static float XML_RESOLUTION = 150;
/** Log4j logger for this class. */
private static final Logger LOG = Logger.getLogger( ProcessFile.class );
/**
* Encoding that is used when the caller does not supply one.
* The commented-out values are alternatives kept for reference.
*/
public static final String DEFAULT_ENCODING =
//null;
//"ISO-8859-1";
//"ISO-8859-6"; //arabic
//"US-ASCII";
"UTF-8";
//"UTF-16";
//"UTF-16BE";
//"UTF-16LE";
//private static Document resultDocument;
// Former output stream field, kept only as a commented-out reference:
//protected static Writer output;
// Names of the command-line options recognised by main().
// 27.12.08 changed to public due to GraphMatcher.java
/** Option followed by the password used to decrypt the document. */
public static final String PASSWORD = "-password";
/** Option followed by the name of the output encoding. */
public static final String ENCODING = "-encoding";
/** Option that sends the result to the console instead of a file. */
public static final String CONSOLE = "-console";
/** Option followed by the first page to extract (parsed as an integer). */
public static final String START_PAGE = "-startPage";
/** Option followed by the last page to extract (parsed as an integer). */
public static final String END_PAGE = "-endPage";
// public static final String TABLE = "-table";
// public static final String AUTOTABLE = "-autotable";
/** Option that selects XMIllum XML output instead of XHTML. */
public static final String XMILLUM = "-xmillum";
/** Option that switches off the table border styling in XHTML output. */
public static final String NOBORDERS = "-noborders";
/** Option described in the usage text as splitting low-level segments according to spaces. */
public static final String PROCESS_SPACES = "-spaces";
/** Option that switches off the processing of ruling lines. */
public static final String NORULINGLINES = "-norulinglines";
/*
public static String STR_INFILE = "";
public static String STR_OUTPUT_PATH = ".";
public static int STR_CURR_PAGE_NO = -1;
public static final String STR_IMAGE_PREFIX = "-imgPrefix";
*/
/*
* possible conversions:
* pdf -> xml, pdf -> xhtml,
* gecko -> xml, gecko -> xhtml
*/
/**
 * Loads a PDF document from memory, extracts the objects of the requested
 * pages and runs the given page processor over every extracted page.
 *
 * @param theFile the PDF document as a byte array
 * @param pp the page processor applied to each extracted page
 * @param startPage first page to extract; 0 is replaced by 1
 * @param endPage last page to extract; 0 is replaced by Integer.MAX_VALUE
 * @param encoding output encoding; null or empty is replaced by
 *        {@link #DEFAULT_ENCODING} (the value is not otherwise consulted here)
 * @param password password used if the document is encrypted; null is
 *        treated as the empty password
 * @param adjGraphList if non-null, receives the adjacency graph of each
 *        processed page, in page order
 * @param GUI if false, the document-wide step pp.processDocPages is run
 *        on the list of processed pages
 * @return the processed pages
 * @throws DocumentProcessingException if the document cannot be read or
 *         decrypted
 */
public static List<Page> processPDF(byte[] theFile, PageProcessor pp,
int startPage, int endPage, String encoding, String password,
List<AdjacencyGraph<GenericSegment>> adjGraphList, boolean GUI)
throws DocumentProcessingException
{
    // Strings are compared by content; "==" only matches the same instance,
    // which made a non-interned empty string look like a supplied password.
    final boolean passwordSupplied = password != null && password.length() > 0;
    if (password == null)
        password = "";
    if (encoding == null || encoding.length() == 0)
        encoding = DEFAULT_ENCODING;
    if (startPage == 0)
        startPage = 1;
    if (endPage == 0)
        endPage = Integer.MAX_VALUE;

    ByteArrayInputStream inStream = new ByteArrayInputStream(theFile);
    PDDocument document = null;
    // set right before the regular close() so that the finally block
    // does not close the document a second time
    boolean closeAttempted = false;
    try
    {
        PDFObjectExtractor extractor = new PDFObjectExtractor();
        document = PDDocument.load( inStream );

        if( document.isEncrypted() )
        {
            try
            {
                document.decrypt( password );
            }
            catch( InvalidPasswordException e )
            {
                if (passwordSupplied)
                {
                    // they supplied the wrong password
                    throw new DocumentProcessingException
                        ("Error: The supplied password is incorrect.");
                }
                // they didn't supply a password and the default of "" was wrong
                throw new DocumentProcessingException
                    ( "Error: The document is encrypted." );
            }
            catch (CryptographyException e)
            {
                throw new DocumentProcessingException(e);
            }
        }

        extractor.setStartPage( startPage );
        extractor.setEndPage( endPage );
        List<PDFPage> thePages = extractor.findObjects(document);

        // now the DU part: process the pages one by one
        List<Page> theResult = new ArrayList<Page>();
        for (PDFPage thePage : thePages)
        {
            theResult.add(pp.processPage(thePage));
            if (adjGraphList != null)
                adjGraphList.add(pp.getAdjGraph());
        }

        // 17.11.10 document-wide processing for headers, footers, etc.
        if (!GUI)
            theResult = pp.processDocPages(theResult, null);

        // a failing close on the regular path is still reported to the caller
        closeAttempted = true;
        document.close();
        return theResult;
    }
    catch (IOException e)
    {
        // the cause travels with the wrapping exception
        throw new DocumentProcessingException(e);
    }
    finally
    {
        // error path: release the document without masking the primary exception
        if (document != null && !closeAttempted)
        {
            try
            {
                document.close();
            }
            catch (IOException ignored)
            {
                // nothing sensible to do; the original failure is more relevant
            }
        }
    }
}
/**
 * Converts a single processed page into a DOM document by wrapping it
 * in a one-element list and delegating to processResultToXMLDocument.
 *
 * @param resultPage the page to convert
 * @param toXHTML passed through to processResultToXMLDocument
 * @param borders passed through to processResultToXMLDocument
 * @return the document produced by processResultToXMLDocument
 * @throws DocumentProcessingException if the delegate fails
 */
public static org.w3c.dom.Document processResultPageToXMLDocument
(Page resultPage, boolean toXHTML, boolean borders)
throws DocumentProcessingException
{
    final List<Page> singlePage = new ArrayList<Page>(1);
    singlePage.add(resultPage);
    return processResultToXMLDocument(singlePage, toXHTML, borders);
}
/**
 * Builds a DOM document from a list of processed segments.
 *
 * With toXHTML the root element is "html": an optional "head" with a CSS
 * "style" element (only if borders is set) followed by a "body" that the
 * segments add themselves to. Otherwise the root element is "PDFResult"
 * and every Page contributes one "page" child carrying a 1-based
 * "page_number" attribute.
 *
 * @param theResult the segments to output; entries that are neither a Page
 *        nor (in XHTML mode) an IXHTMLSegment are skipped
 * @param toXHTML true for XHTML output, false for the XMIllum format
 * @param borders whether to add the table border style sheet; only
 *        consulted when toXHTML is true
 * @return the newly created document
 * @throws DocumentProcessingException if the XML document cannot be set up
 */
public static org.w3c.dom.Document processResultToXMLDocument
(List<Page> theResult, boolean toXHTML, boolean borders)
throws DocumentProcessingException
{
    // set up the XML document with the root element of the chosen format
    org.w3c.dom.Document resultDocument;
    try
    {
        resultDocument = setUpXML(toXHTML ? "html" : "PDFResult");
    }
    catch (ParserConfigurationException e)
    {
        throw new DocumentProcessingException(e);
    }
    Element rootElement = resultDocument.getDocumentElement();

    // the body element only exists for XHTML output
    Element bodyElement = null;
    if (toXHTML)
    {
        if (borders)
        {
            Element headElement = resultDocument.createElement("head");
            Element styleElement = resultDocument.createElement("style");
            styleElement.setAttribute("type", "text/css");
            Text tableRule = resultDocument.createTextNode
                ("table {border-collapse: collapse;}");
            Text cellRule = resultDocument.createTextNode
                ("td, th {border: 1px solid grey; padding: 2px 4px;}");
            styleElement.appendChild(tableRule);
            styleElement.appendChild(cellRule);
            headElement.appendChild(styleElement);
            rootElement.appendChild(headElement);
        }
        bodyElement = resultDocument.createElement("body");
    }

    int pageNo = 0;
    // iterate as Object so that no implicit cast to Page is inserted
    for (Object entry : theResult)
    {
        GenericSegment segment = (GenericSegment) entry;
        if (segment instanceof Page)
        {
            Page page = (Page) segment;
            pageNo++;
            if (toXHTML)
            {
                page.setPageNo(pageNo);
                page.addAsXHTML(resultDocument, bodyElement);
            }
            else
            {
                Element pageElement = resultDocument.createElement("page");
                pageElement.setAttribute("page_number", Integer.toString(pageNo));
                // we want to use the MediaBox, so no bounding box is computed here
                page.addAsXmillum(resultDocument, pageElement,
                    page, Utils.XML_RESOLUTION);
                rootElement.appendChild(pageElement);
            }
        }
        else if (toXHTML && segment instanceof IXHTMLSegment)
        {
            // for XMIllum output, the top-level segment is always a Page
            ((IXHTMLSegment) segment).addAsXHTML(resultDocument, bodyElement);
        }
    }

    if (toXHTML)
    {
        rootElement.appendChild(bodyElement);
    }
    return resultDocument;
}
/**
 * Processes a PDF held in memory and returns the result as a DOM document.
 * This chains processPDF (called without adjacency graph list and with the
 * GUI flag off) and processResultToXMLDocument.
 *
 * @param theFile the PDF document as a byte array
 * @param pp the page processor implementation to use
 * @param toXHTML true to produce XHTML, false for the XMIllum visualization format
 * @param borders adds borders to table cells in the output; only effective when toXHTML is true
 * @param startPage the first page to extract (1 based)
 * @param endPage the last page to extract (inclusive)
 * @param encoding (ISO-8859-1,UTF-16BE,UTF-16LE,...)
 * @param password password to decrypt the document
 *
 * @return a new DOM document representing the processing results
 * @throws DocumentProcessingException if processing or conversion fails
 */
public static org.w3c.dom.Document processPDFToXMLDocument(byte[] theFile,
PageProcessor pp, boolean toXHTML, boolean borders,
int startPage, int endPage, String encoding, String password)
throws DocumentProcessingException
{
    final List<AdjacencyGraph<GenericSegment>> noGraphs = null;
    final List<Page> pages = processPDF(theFile, pp, startPage, endPage,
        encoding, password, noGraphs, false);
    return processResultToXMLDocument(pages, toXHTML, borders);
}
/*
public static byte[] processPDFToByteArray(byte[] theFile,
PageProcessor pp, int toXHTML,
int startPage, int endPage, String encoding, String password)
throws DocumentProcessingException
{
org.w3c.dom.Document resultDocument;
// calls the above and returns a byte[] from the XML Document.
List<Page> theResult = processPDF(theFile, pp, startPage, endPage,
encoding, password, null, false);
resultDocument = processResultToXMLDocument(theResult, toXHTML, borders);
return serializeXML(resultDocument);
}
*/
/**
 * Processes a PDF held in memory and returns the serialized XML result.
 * The document is produced by processPDFToXMLDocument and turned into
 * bytes by serializeXML.
 *
 * @param theFile the PDF document as a byte array
 * @param pp the page processor implementation to use
 * @param toXHTML passed through to processPDFToXMLDocument
 * @param borders passed through to processPDFToXMLDocument
 * @param startPage passed through to processPDFToXMLDocument
 * @param endPage passed through to processPDFToXMLDocument
 * @param encoding passed through to processPDFToXMLDocument
 * @param password passed through to processPDFToXMLDocument
 * @return the serialized document
 * @throws DocumentProcessingException if processing or serialization fails
 */
public static byte[] processPDFToByteArray(byte[] theFile,
PageProcessor pp, boolean toXHTML, boolean borders,
int startPage, int endPage, String encoding, String password)
throws DocumentProcessingException
{
    return serializeXML(
        processPDFToXMLDocument(theFile, pp, toXHTML, borders,
            startPage, endPage, encoding, password));
}
/**
 * Command-line entry point: converts one PDF file to XHTML (default) or
 * XMIllum XML and writes the result to a file or to the console.
 *
 * @param args options (see usage()), followed by the input file and an
 *        optional output file; if no output file is given, its name is
 *        derived from the input file name
 *
 * @throws Exception If there is an error parsing the document.
 */
public static void main(String[] args) throws Exception
{
    boolean toConsole = false;
    boolean toXHTML = true;
    boolean borders = true;
    boolean rulingLines = true;
    boolean processSpaces = false;
    String password = "";
    String encoding = DEFAULT_ENCODING;
    String inFile = null;
    String outFile = null;
    int startPage = 1;
    int endPage = Integer.MAX_VALUE;

    for( int i=0; i<args.length; i++ )
    {
        if( args[i].equals( PASSWORD ) )
        {
            i++;
            if( i >= args.length )
            {
                usage();
            }
            password = args[i];
        }
        else if( args[i].equals( ENCODING ) )
        {
            i++;
            if( i >= args.length )
            {
                usage();
            }
            encoding = args[i];
        }
        else if( args[i].equals( START_PAGE ) )
        {
            i++;
            if( i >= args.length )
            {
                usage();
            }
            startPage = Integer.parseInt( args[i] );
        }
        else if( args[i].equals( END_PAGE ) )
        {
            i++;
            if( i >= args.length )
            {
                usage();
            }
            endPage = Integer.parseInt( args[i] );
        }
        else if( args[i].equals( CONSOLE ) )
        {
            toConsole = true;
        }
        else if( args[i].equals( NOBORDERS ))
        {
            borders = false;
        }
        else if( args[i].equals( XMILLUM ) )
        {
            toXHTML = false;
        }
        else if( args[i].equals( NORULINGLINES ))
        {
            rulingLines = false;
        }
        else if( args[i].equals( PROCESS_SPACES ))
        {
            // the flag switches space processing ON; it used to re-assign
            // the default value and therefore had no effect
            processSpaces = true;
        }
        else
        {
            // first free argument is the input file, a later one the output file
            if( inFile == null )
            {
                inFile = args[i];
            }
            else
            {
                outFile = args[i];
            }
        }
    }

    if( inFile == null )
    {
        usage();
    }

    if( outFile == null )
    {
        if( inFile.length() > 4 )
        {
            // swap a four-character suffix such as ".pdf" for ".html"
            outFile = inFile.substring( 0, inFile.length() -4 ) + ".html";
        }
        else
        {
            // name too short to carry such a suffix: append instead of
            // leaving outFile null (which crashed in FileOutputStream)
            outFile = inFile + ".html";
        }
    }

    // load the input file
    File inputFile = new File(inFile);
    byte[] inputDoc = getBytesFromFile(inputFile);

    // set up page processor object
    PageProcessor pp = new PageProcessor();
    pp.setProcessType(PageProcessor.PP_BLOCK);
    pp.setRulingLines(rulingLines);
    pp.setProcessSpaces(processSpaces);

    // do the processing
    org.w3c.dom.Document resultDocument =
        processPDFToXMLDocument(inputDoc, pp, toXHTML, borders,
            startPage, endPage, encoding, password);

    System.out.println("Using input file: " + inFile);
    System.out.println("Using output file: " + outFile);

    // now output the XML Document by serializing it to output
    Writer output = null;
    try
    {
        if( toConsole )
        {
            output = new OutputStreamWriter( System.out );
        }
        else if( encoding != null )
        {
            output = new OutputStreamWriter(
                new FileOutputStream( outFile ), encoding );
        }
        else
        {
            // use default encoding
            output = new OutputStreamWriter(
                new FileOutputStream( outFile ) );
        }
        serializeXML(resultDocument, output);
    }
    finally
    {
        // close (and thereby flush) the writer even if serialization failed
        if( output != null )
        {
            output.close();
        }
    }
}
/**
 * Convenience method: converts a PDF held in memory to serialized XHTML
 * using a block-level page processor, with table borders switched on.
 *
 * @param theFile the PDF document as a byte array
 * @param startPage passed through to processPDFToByteArray
 * @param endPage passed through to processPDFToByteArray
 * @param encoding passed through to processPDFToByteArray
 * @param password passed through to processPDFToByteArray
 * @return the serialized XHTML document
 * @throws DocumentProcessingException if processing or serialization fails
 */
public static byte[] PDFToXHTML(byte[] theFile,
int startPage, int endPage, String encoding, String password)
throws DocumentProcessingException
{
    final boolean toXHTML = true;
    final boolean borders = true;
    PageProcessor blockProcessor = new PageProcessor(PageProcessor.PP_BLOCK);
    return processPDFToByteArray(theFile, blockProcessor, toXHTML, borders,
        startPage, endPage, encoding, password);
}
/**
 * Creates an empty DOM document whose root element has the given name and
 * lives in the namespace "at.ac.tuwien.dbai.pdfwrap"; no doctype is set.
 * Exception handling is left to the calling method (since 9.04.06).
 *
 * @param nodeName qualified name of the root element
 * @return the new document
 * @throws ParserConfigurationException if no document builder can be created
 */
protected static org.w3c.dom.Document setUpXML(String nodeName)
throws ParserConfigurationException
{
    DOMImplementation domImplementation = DocumentBuilderFactory
        .newInstance()
        .newDocumentBuilder()
        .getDOMImplementation();
    return domImplementation.createDocument("at.ac.tuwien.dbai.pdfwrap", nodeName, null);
}
/**
 * Returns the contents of the file in a byte array.
 *
 * @param file the file to read
 * @return all bytes of the file
 * @throws IOException if the file cannot be opened, is larger than
 *         Integer.MAX_VALUE bytes, or ends before the length reported
 *         by File.length() has been read
 */
public static byte[] getBytesFromFile(File file) throws IOException {
    // An array cannot be sized with a long, so refuse files that do not
    // fit instead of silently truncating the length in the cast below.
    long length = file.length();
    if (length > Integer.MAX_VALUE) {
        throw new IOException("File is too large to be read into memory: "+file.getName());
    }

    InputStream is = new FileInputStream(file);
    try {
        byte[] bytes = new byte[(int)length];

        // read() may return fewer bytes than requested, so loop until the
        // array is full or the end of the stream is reached
        int offset = 0;
        int numRead;
        while (offset < bytes.length
                && (numRead = is.read(bytes, offset, bytes.length - offset)) >= 0) {
            offset += numRead;
        }

        // Ensure all the bytes have been read in
        if (offset < bytes.length) {
            throw new IOException("Could not completely read file "+file.getName());
        }
        return bytes;
    } finally {
        // release the file handle on the error paths as well
        is.close();
    }
}
/**
 * Serializes the document into a byte array, using a writer in
 * DEFAULT_ENCODING and the Writer-based serializeXML overload.
 *
 * @param resultDocument the document to serialize
 * @return the serialized document
 * @throws DocumentProcessingException if serialization fails with an IOException
 */
public static byte[] serializeXML(org.w3c.dom.Document resultDocument)
throws DocumentProcessingException
{
    final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    try
    {
        serializeXML(resultDocument,
            new OutputStreamWriter(buffer, DEFAULT_ENCODING));
    }
    catch (IOException e)
    {
        throw new DocumentProcessingException(e);
    }
    return buffer.toByteArray();
}
/**
 * Serializes the document to the given stream, using a writer in
 * DEFAULT_ENCODING and the Writer-based serializeXML overload.
 * The stream is not closed by this method.
 *
 * @param resultDocument the document to serialize
 * @param outStream the stream to write to
 * @throws DocumentProcessingException if serialization fails with an IOException
 */
public static void serializeXML(org.w3c.dom.Document resultDocument, OutputStream outStream)
throws DocumentProcessingException
{
    try
    {
        final Writer encodedWriter = new OutputStreamWriter(outStream, DEFAULT_ENCODING);
        serializeXML(resultDocument, encodedWriter);
    }
    catch (IOException ioFailure)
    {
        throw new DocumentProcessingException(ioFailure);
    }
}
/**
 * Serializes the document to the given writer and flushes the writer
 * afterwards (flush added by THA 21.03.05). The writer is not closed.
 *
 * The output format is created with encoding "UTF-8" and with the
 * indenting argument set to true.
 * NOTE(review): an earlier comment here claimed that indenting had been
 * switched off because of bugs corrupting the text, but the code passes
 * true - confirm which of the two is intended.
 * NOTE(review): the encoding is fixed to "UTF-8" regardless of the
 * encoding of the supplied writer - confirm callers only pass UTF-8 writers.
 *
 * @param resultDocument the document to serialize
 * @param output the writer to serialize to
 * @throws IOException if serializing or flushing fails; a message is
 *         printed to System.err before the exception is rethrown
 */
public static void serializeXML
(org.w3c.dom.Document resultDocument, Writer output)
throws IOException
{
    final OutputFormat format = new OutputFormat(resultDocument, "UTF-8", true);
    final XMLSerializer serializer = new XMLSerializer(output, format);
    try
    {
        serializer.serialize(resultDocument);
        output.flush();
    }
    catch (IOException failure)
    {
        System.err.println("Couldn't serialize document: "+ failure.getMessage());
        throw failure;
    }
}
/**
 * Prints the usage requirements to System.err and terminates the JVM
 * with exit status 1. Every option accepted by main is listed,
 * including -noborders, which was previously missing from the text.
 */
private static void usage()
{
    System.err.println( "Usage: java at.ac.tuwien.dbai.pdfwrap.ProcessFile [OPTIONS] <PDF file> [Text File]\n" +
        " -password <password> Password to decrypt document\n" +
        " -encoding <output encoding> (ISO-8859-1,UTF-16BE,UTF-16LE,...)\n" +
        " -xmillum output XMIllum XML (instead of XHTML)\n" +
        " -noborders do not add borders to table cells (XHTML only)\n" +
        " -norulinglines do not process ruling lines\n" +
        " -spaces split low-level segments according to spaces\n" +
        " -console Send text to console instead of file\n" +
        " -startPage <number> The first page to start extraction(1 based)\n" +
        " -endPage <number> The last page to extract(inclusive)\n" +
        " <PDF file> The PDF document to use\n" +
        " [Text File] The file to write the text to\n"
        );
    System.exit( 1 );
}
}
// the above taken from:
// http://userpage.fu-berlin.de/~ram/pub/pub_jf47htqHHt/java_sax_parser_en
/** Utility class with a factory method for SAX readers. */
final class XML
{
    /**
     * Creates a new XML reader: a SAXParserFactory is instantiated, a
     * SAXParser is obtained from it and that parser's XMLReader is returned.
     *
     * @return the XML reader of a newly created SAX parser
     * @throws Exception if the factory, the parser or the reader cannot be created
     */
    public static final org.xml.sax.XMLReader makeXMLReader()
    throws Exception
    {
        return javax.xml.parsers.SAXParserFactory
            .newInstance()
            .newSAXParser()
            .getXMLReader();
    }
}