/**
 * pdfXtk - PDF Extraction Toolkit
 * Copyright (c) by the authors/contributors. All rights reserved.
 * This project includes code from PDFBox and TouchGraph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 * 3. Neither the names pdfXtk or PDF Extraction Toolkit; nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * http://pdfxtk.sourceforge.net
 *
 */
package at.ac.tuwien.dbai.pdfwrap.analysis;

import at.ac.tuwien.dbai.pdfwrap.comparators.YComparator;
import at.ac.tuwien.dbai.pdfwrap.gui.EdgeSegment;
import at.ac.tuwien.dbai.pdfwrap.model.document.*;
import at.ac.tuwien.dbai.pdfwrap.model.graph.AdjacencyEdge;
import at.ac.tuwien.dbai.pdfwrap.model.graph.AdjacencyGraph;
import at.ac.tuwien.dbai.pdfwrap.pdfread.PDFObjectExtractor;
import at.ac.tuwien.dbai.pdfwrap.pdfread.PDFPage;
import at.ac.tuwien.dbai.pdfwrap.utils.ListUtils;
import at.ac.tuwien.dbai.pdfwrap.utils.Utils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import java.awt.image.BufferedImage;
import java.io.IOException;
import java.util.*;

/**
 * Takes a {@link PDFPage} and turns it into a processed {@link Page}, with the
 * granularity of the result selected by the configured process type (one of
 * the {@code PP_*} constants).
 *
 * <p>The intermediate lists of a run are kept in protected fields and are
 * overwritten by every call to {@link #doProcessPage(PDFPage)}.
 *
 * @author Tamir Hassan, pdfanalyser@tamirhassan.com
 * @author Ben Litchfield (ben@csh.rit.edu)
 * @version PDF Analyser 0.9
 */
public class PageProcessor
{
    private static final Log log = LogFactory.getLog(PageProcessor.class);

    // Types of clustering -- kept as separate modes while clustering is still
    // being perfected. (A former PP_MONOSPACE = 0 and PP_DEFAULT = 100 were
    // retired by the original author.)
    public static final int PP_INSTRUCTION = 1;
    public static final int PP_FRAGMENT = 2;
    public static final int PP_CHAR = 3;
    public static final int PP_LINE = 4;
    public static final int PP_BLOCK = 5;
    public static final int PP_MERGED_LINES = 16;

    // State of the most recent run. Held in fields (rather than locals) so
    // that further steps, e.g. table understanding, can be invoked through a
    // separate method.
    protected Page retVal;

    protected List<CharSegment> charList;
    protected List<TextFragment> fragList;
    protected List<ImageSegment> imageList;
    protected List<LineSegment> lineList;
    protected List<RectSegment> rectList;

    /** Segments at the chosen granularity for graph matching. */
    protected List<GenericSegment> processingResult;
    /** The edges that will be finally displayed. */
    protected List<EdgeSegment> edgeSegmentList;
    protected List<TextLine> textLines;
    protected List<TextBlock> mergedLines;
    protected List<TextBlock> textBlocks;
    protected RulingObjectProcessor rop;

    protected AdjacencyGraph<GenericSegment> adjGraph;

    float currentX = 0.0f;
    float currentY = 0.0f;

    protected int processType = PP_BLOCK;
    protected boolean rulingLines = true;
    protected boolean processSpaces = false;
    protected int noIterations = -1;

    /**
     * Intended to hold text split by article divisions (beads): for a
     * two-column page with two beads there would be five entries -- text
     * before the first article, the first article, text between the two
     * articles, the second article, and text after it. Most PDFs have no
     * beads, in which case a single entry would be expected.
     *
     * <p>Currently not used.
     */
    private Vector charactersByArticle = new Vector();

    private Map characterListMapping = new HashMap();

    private String lineSeparator = System.getProperty("line.separator");
    private String pageSeparator = System.getProperty("line.separator");
    private String wordSeparator = " ";

    /**
     * Creates a processor that uses the default process type
     * ({@link #PP_BLOCK}).
     */
    public PageProcessor()
    {
    }

    /**
     * Creates a processor for the given process type.
     *
     * @param processType one of the {@code PP_*} constants
     */
    public PageProcessor(int processType)
    {
        this.processType = processType;
    }

    /**
     * Hook for steps that have to run after all pages have been understood.
     * At present the list is handed back untouched.
     *
     * @param thePages the processed pages
     * @param pageImage not used by this implementation
     * @return {@code thePages}, unchanged
     */
    public static List<Page> processDocPages(List<Page> thePages, BufferedImage pageImage)
    {
        return thePages;
    }

    /**
     * Processes the contents of one page: runs {@link #doProcessPage(PDFPage)},
     * then {@link #postProcessing(int, Page)} with the configured process
     * type, and finally copies the last operator index of the source page onto
     * the result.
     *
     * @param thisPage the page to process
     * @return the processed page
     */
    public Page processPage(PDFPage thisPage)
    {
        Page page = doProcessPage(thisPage);

        // NOTE(review): customProcessing(int) is not invoked here (nor anywhere
        // else in this class) -- confirm whether that is intended.
        postProcessing(processType, page);

        page.setLastOpIndex(thisPage.getLastOpIndex());
        return page;
    }

    /**
     * Builds the intermediate structures for a page and stores them in the
     * protected fields of this object.
     *
     * <p>For {@link #PP_CHAR} and {@link #PP_FRAGMENT} only the raw item lists
     * are selected. For every other process type text lines are found, ruling
     * objects are consolidated (when enabled), lines are clustered into text
     * blocks, and the blocks are split into "merged lines".
     *
     * @param thisPage the page to process
     * @return the new (still empty of items) result page, also kept in
     *         {@link #retVal}
     */
    protected Page doProcessPage(PDFPage thisPage)
    {
        final long startProcess = System.currentTimeMillis();

        retVal = new Page();
        retVal.setBoundingBox(thisPage.getBoundingBox());
        retVal.setRotation(thisPage.getRotation());

        charList = ListUtils.selectCharacters(thisPage.getItems());
        fragList = ListUtils.selectTextFragments(thisPage.getItems());
        imageList = ListUtils.selectImageSegments(thisPage.getItems());
        lineList = ListUtils.selectLineSegments(thisPage.getItems());
        rectList = ListUtils.selectRectSegments(thisPage.getItems());

        processingResult = new ArrayList<GenericSegment>();
        edgeSegmentList = new ArrayList<EdgeSegment>();
        textLines = new ArrayList<TextLine>();
        mergedLines = new ArrayList<TextBlock>();
        textBlocks = new ArrayList<TextBlock>();
        rop = new RulingObjectProcessor();

        logElapsed("time A: ", startProcess);

        PDFObjectExtractor.removeLeadingTrailingSpaces(fragList);

        // Character and fragment modes need nothing beyond the raw lists.
        if (processType == PP_CHAR || processType == PP_FRAGMENT)
        {
            return retVal;
        }

        // Lines or coarser granular levels from here on.
        logElapsed("time E: ", startProcess);

        detectTextLines();

        AdjacencyGraph<TextLine> lineAG = new AdjacencyGraph<TextLine>();
        lineAG.addList(textLines);

        if (log.isDebugEnabled())
        {
            log.debug("number of items pageFromLines: " + textLines.size());
            log.debug("Time for preprocessing: "
                + (System.currentTimeMillis() - startProcess));
        }

        // Generate the neighbourhood graph over the text lines.
        final long graphStart = System.currentTimeMillis();
        lineAG.generateEdgesSingle();
        logElapsed("Time for AG generation: ", graphStart);

        final long clusterStart = System.currentTimeMillis();

        if (rulingLines)
        {
            consolidateRulingObjects();
        }

        // Best-first clustering (block finding). Only the blocks are returned;
        // second-level clustering takes place separately. The configured
        // iteration count is applied only in PP_BLOCK mode, otherwise 0.
        TextBlockPageSegmenter segmenter = new TextBlockPageSegmenter();
        segmenter.setMaxIterations(processType == PP_BLOCK ? noIterations : 0);
        textBlocks = segmenter.clusterLinesIntoTextBlocks(lineAG);

        logElapsed("Time for ordered edge cluster: ", clusterStart);

        splitBlocksIntoAtomicLines();

        logElapsed("total pp time: ", startProcess);

        return retVal;
    }

    /**
     * Emits {@code label} followed by the milliseconds elapsed since
     * {@code since}, but only when timing display and debug logging are both
     * switched on.
     */
    private static void logElapsed(String label, long since)
    {
        if (Utils.DISPLAY_TIMINGS && log.isDebugEnabled())
        {
            log.debug(label + (System.currentTimeMillis() - since));
        }
    }

    /**
     * Fills {@link #textLines}. With {@code processSpaces} set, single-space
     * characters are first removed from {@link #charList} (added 2011-11-04 as
     * a test) and lines are found from the remaining characters; otherwise
     * lines are found from the text fragments.
     */
    private void detectTextLines()
    {
        if (processSpaces)
        {
            List<CharSegment> blanks = new ArrayList<CharSegment>();
            for (CharSegment candidate : charList)
            {
                if (candidate.getText().equals(" "))
                {
                    blanks.add(candidate);
                }
            }
            charList.removeAll(blanks);

            // original author's note: good - 0.3 or 0.4
            textLines = LineProcessor.findLinesFromCharacters(
                charList, 0.3f, false, false);
        }
        else
        {
            // 19.10.10: value changed (from 0.20f) after PDF-TREX comparison
            textLines = LineProcessor.findLinesFromTextFragments(
                fragList, 0.80f, false, false);
        }
    }

    /**
     * Feeds line and rectangle segments to the ruling object processor and
     * replaces {@link #lineList} with the ruling lines it reports;
     * {@link #rectList} is emptied afterwards.
     */
    private void consolidateRulingObjects()
    {
        rop.addRulingObjects(lineList);
        // per the original author's note, rectangles are automatically
        // processed into their constituent lines here
        rop.addRulingObjects(rectList);
        // per the original author's note, this removes duplicate lines and
        // 'joins' touching lines
        rop.removeDuplicateLines();

        lineList = rop.getRulingLines();
        rectList = new ArrayList<RectSegment>();
    }

    /**
     * Finds the atomic lines inside every text block and appends one
     * {@link TextBlock} per found line to {@link #mergedLines}. Items of a
     * found line are wrapped up to {@link TextLine} level as required; items
     * of any other class are skipped.
     */
    private void splitBlocksIntoAtomicLines()
    {
        for (TextBlock block : textBlocks)
        {
            CandidateCluster cluster = new CandidateCluster();
            // generic problems -- cannot simply add the list in one go
            for (TextSegment member : block.getItems())
            {
                cluster.getItems().add(member);
            }
            cluster.findLinesWidth();

            for (CompositeSegment<? extends TextSegment> foundLine : cluster.getFoundLines())
            {
                TextBlock lineBlock = new TextBlock();
                lineBlock.setCalculatedFields(foundLine);

                for (TextSegment item : foundLine.getItems())
                {
                    Class<?> kind = item.getClass();

                    if (kind == TextLine.class)
                    {
                        lineBlock.getItems().add((TextLine) item);
                    }
                    else if (kind == LineFragment.class)
                    {
                        TextLine line = new TextLine();
                        line.getItems().add((LineFragment) item);
                        line.setCalculatedFields(item);
                        lineBlock.getItems().add(line);
                    }
                    else if (kind == TextFragment.class)
                    {
                        LineFragment fragment = new LineFragment();
                        fragment.getItems().add((TextFragment) item);
                        fragment.setCalculatedFields(item);

                        TextLine line = new TextLine();
                        line.getItems().add(fragment);
                        line.setCalculatedFields(fragment);
                        lineBlock.getItems().add(line);
                    }
                    else if (kind == CharSegment.class)
                    {
                        TextFragment chars = new TextFragment();
                        chars.getItems().add((CharSegment) item);
                        chars.setCalculatedFields(item);

                        LineFragment fragment = new LineFragment();
                        fragment.getItems().add(chars);
                        fragment.setCalculatedFields(chars);

                        TextLine line = new TextLine();
                        line.getItems().add(fragment);
                        line.setCalculatedFields(fragment);
                        lineBlock.getItems().add(line);
                    }
                    // TODO: any other class (e.g. a TextBlock -- add its
                    // items, not the block itself?) is silently ignored
                }

                mergedLines.add(lineBlock);
            }
        }
    }

    /**
     * Assembles the final content of the page from the state left behind by
     * {@link #doProcessPage(PDFPage)}: chooses the segments matching
     * {@code processType}, builds the adjacency graph for all modes except
     * {@link #PP_INSTRUCTION}, {@link #PP_FRAGMENT} and {@link #PP_CHAR}, adds
     * all collected lists to the page and sorts its items with a
     * {@link YComparator}.
     *
     * @param processType one of the {@code PP_*} constants
     * @param retVal the page that receives the items
     */
    public void postProcessing(int processType, Page retVal)
    {
        switch (processType)
        {
            case PP_CHAR:
                // debugging view only: one block per character
                for (CharSegment c : charList)
                {
                    TextFragment tf = new TextFragment();
                    tf.getItems().add(c);
                    tf.setCalculatedFields(c);

                    LineFragment lf = new LineFragment();
                    lf.getItems().add(tf);
                    lf.setCalculatedFields(tf);

                    TextLine tl = new TextLine();
                    tl.getItems().add(lf);
                    tl.setCalculatedFields(lf);

                    TextBlock tb = new TextBlock();
                    tb.getItems().add(tl);
                    tb.setCalculatedFields(tl);

                    processingResult.add(tb);
                }
                break;

            case PP_FRAGMENT:
                // debugging view only: one block per text fragment
                for (TextFragment tf : fragList)
                {
                    LineFragment lf = new LineFragment();
                    lf.getItems().add(tf);
                    lf.setCalculatedFields(tf);

                    TextLine tl = new TextLine();
                    tl.getItems().add(lf);
                    tl.setCalculatedFields(lf);

                    TextBlock tb = new TextBlock();
                    tb.getItems().add(tl);
                    tb.setCalculatedFields(tl);

                    processingResult.add(tb);
                }
                break;

            case PP_LINE:
                // also a debugging view; for wrapping, PP_MERGED_LINES should
                // generally be used
                for (TextLine tl : textLines)
                {
                    TextBlock tb = new TextBlock();
                    tb.getItems().add(tl);
                    tb.setCalculatedFields(tl);

                    processingResult.add(tb);
                }
                break;

            case PP_MERGED_LINES:
                processingResult.addAll(mergedLines);
                break;

            case PP_BLOCK:
                processingResult.addAll(textBlocks);
                break;

            default:
                // PP_INSTRUCTION and unknown values: nothing to select
                break;
        }

        boolean graphless = processType == PP_INSTRUCTION
            || processType == PP_FRAGMENT
            || processType == PP_CHAR;
        if (!graphless)
        {
            buildAdjacencyGraph();
        }

        // add the text clusters (segments) and all raw lists to the page
        List<GenericSegment> pageItems = retVal.getItems();
        pageItems.addAll(processingResult);
        pageItems.addAll(textLines);
        pageItems.addAll(fragList);
        pageItems.addAll(charList);
        pageItems.addAll(imageList);
        pageItems.addAll(lineList);
        pageItems.addAll(rectList);
        pageItems.addAll(edgeSegmentList);

        Collections.sort(retVal.getItems(), new YComparator());
    }

    /**
     * Creates {@link #adjGraph} from {@link #processingResult} and generates
     * its edges.
     */
    private void buildAdjacencyGraph()
    {
        adjGraph = new AdjacencyGraph<GenericSegment>();
        adjGraph.addList(processingResult);
        adjGraph.generateEdgesSingle();

        if (log.isDebugEnabled())
        {
            log.debug("PP.edges: " + adjGraph.getEdges().size());
        }

        // NOTE(review): the displayable segments are collected in a local list
        // that is never copied into edgeSegmentList, so no edge segments reach
        // the page. Kept as found -- confirm whether this is deliberate.
        List<EdgeSegment> displayable = new ArrayList<EdgeSegment>();
        for (AdjacencyEdge<GenericSegment> edge : adjGraph.getEdges())
        {
            displayable.add(edge.toDisplayableSegment());
        }
    }

    /**
     * Extension point for custom processing; does nothing here. Override in a
     * subclass.
     *
     * @param processType one of the {@code PP_*} constants
     */
    public void customProcessing(int processType)
    {
    }

    /** @return the adjacency graph built by the last post-processing step */
    public AdjacencyGraph<GenericSegment> getAdjGraph()
    {
        return adjGraph;
    }

    /** @param adjGraph the adjacency graph to store */
    public void setAdjGraph(AdjacencyGraph<GenericSegment> adjGraph)
    {
        this.adjGraph = adjGraph;
    }

    /** @return the configured process type (a {@code PP_*} constant) */
    public int getProcessType()
    {
        return processType;
    }

    /** @param processType the process type to use (a {@code PP_*} constant) */
    public void setProcessType(int processType)
    {
        this.processType = processType;
    }

    /** @return whether ruling objects are processed */
    public boolean isRulingLines()
    {
        return rulingLines;
    }

    /** @param rulingLines whether ruling objects should be processed */
    public void setRulingLines(boolean rulingLines)
    {
        this.rulingLines = rulingLines;
    }

    /** @return whether lines are found from characters with spaces removed */
    public boolean isProcessSpaces()
    {
        return processSpaces;
    }

    /** @param processSpaces whether to find lines from characters with spaces removed */
    public void setProcessSpaces(boolean processSpaces)
    {
        this.processSpaces = processSpaces;
    }

    /** @return the iteration limit handed to block finding in PP_BLOCK mode */
    public int getNoIterations()
    {
        return noIterations;
    }

    /** @param noIterations the iteration limit handed to block finding in PP_BLOCK mode */
    public void setNoIterations(int noIterations)
    {
        this.noIterations = noIterations;
    }
}