/** * pdfXtk - PDF Extraction Toolkit * Copyright (c) by the authors/contributors. All rights reserved. * This project includes code from PDFBox and TouchGraph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * 3. Neither the names pdfXtk or PDF Extraction Toolkit; nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * http://pdfxtk.sourceforge.net * */ package at.ac.tuwien.dbai.pdfwrap.analysis; import at.ac.tuwien.dbai.pdfwrap.comparators.XYTextComparator; import at.ac.tuwien.dbai.pdfwrap.model.document.*; import at.ac.tuwien.dbai.pdfwrap.utils.SegmentUtils; import at.ac.tuwien.dbai.pdfwrap.utils.Utils; import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; import java.util.List; /** * Methods to find lines of text from fragments on a page * and within text blocks/candidate clusters * * @author Tamir Hassan, pdfanalyser@tamirhassan.com * @version PDF Analyser 0.9 */ public class LineProcessor { // (take LineFragment -- find line) // take TextFragment OR any other TextSegment(e.g. Char) -- create LineFragments and Lines // public static ... public static List<TextLine> findLinesFromLineFragments(List<LineFragment> textBlocks, float maxX, boolean postNG, boolean ignoreFontsize) { List<TextLine> retVal = new ArrayList<TextLine>(); List<CompositeSegment<? extends TextSegment>> foundLines = findLines(textBlocks, maxX, postNG, ignoreFontsize); for (CompositeSegment<? extends TextSegment> cs : foundLines) { TextLine tl = new TextLine(); tl.getItems().addAll((List<? extends LineFragment>) cs.getItems()); tl.setCalculatedFields(cs); retVal.add(tl); } return retVal; } public static List<TextLine> findLinesFromTextFragments(List<TextFragment> textBlocks, float maxX, boolean postNG, boolean ignoreFontsize) { List<TextLine> retVal = new ArrayList<TextLine>(); List<CompositeSegment<? extends TextSegment>> foundLines = findLines(textBlocks, maxX, postNG, ignoreFontsize); for (CompositeSegment<? extends TextSegment> cs : foundLines) { LineFragment lf = new LineFragment(); lf.getItems().addAll((List<? extends TextFragment>) cs.getItems()); lf.setCalculatedFields(cs); TextLine tl = new TextLine(); tl.getItems().add(lf); tl.setCalculatedFields(lf); retVal.add(tl); } return retVal; } public static List<TextLine> findLinesFromCharacters(List<CharSegment> textBlocks, float maxX, boolean postNG, boolean ignoreFontsize) { List<TextLine> retVal = new ArrayList<TextLine>(); List<CompositeSegment<? extends TextSegment>> foundLines = findLines(textBlocks, maxX, postNG, ignoreFontsize); for (CompositeSegment<? extends TextSegment> cs : foundLines) { TextFragment tf = new TextFragment(); tf.getItems().addAll((List<? extends CharSegment>) cs.getItems()); tf.setCalculatedFields(cs); LineFragment lf = new LineFragment(); lf.getItems().add(tf); lf.setCalculatedFields(tf); TextLine tl = new TextLine(); tl.getItems().add(lf); tl.setCalculatedFields(lf); retVal.add(tl); } return retVal; } public static List<TextLine> findLinesFromTextLines(List<TextLine> textBlocks, float maxX, boolean postNG, boolean ignoreFontsize) { List<TextLine> retVal = new ArrayList<TextLine>(); List<CompositeSegment<? extends TextSegment>> foundLines = findLines(textBlocks, maxX, postNG, ignoreFontsize); for (CompositeSegment<? extends TextSegment> cs : foundLines) { TextLine tl = new TextLine(); for (TextSegment ts : cs.getItems()) { TextLine tl2 = (TextLine)ts; tl.getItems().addAll(tl2.getItems()); } tl.setCalculatedFields(cs); retVal.add(tl); } return retVal; } // 2011-01-26: changed to public -- called directly by CandidateCluster.findLines() public static List<CompositeSegment<? extends TextSegment>> findLines( List<? extends TextSegment> textBlocks, float maxX, boolean postNG, boolean ignoreFontsize) //throws Exception { // TODO: support super/subscript natively -- or allow misc segments ... // pre: textBlocks in collection must be sorted in y-then-x order Collections.sort(textBlocks, new XYTextComparator()); // pre: all items in textBlocks must be TextPosition objects // TODO: create a specific exception here List<CompositeSegment<? extends TextSegment>> retVal = new ArrayList<CompositeSegment<? extends TextSegment>>(); TextSegment lastBlock = null; List<TextSegment> newItems = new ArrayList<TextSegment>(); // variables for controlling new line objects to be added // these can be generated later -- not for the preNG cluster //String newString = ""; boolean merge = false; Iterator iter = textBlocks.iterator(); while (iter.hasNext()) { TextSegment thisBlock = null; // if empty text block, try again :) // (required so that empty text blocks do not interfere with processing) while (iter.hasNext() && (thisBlock == null || thisBlock.isEmpty())) { thisBlock = (TextSegment)iter.next(); } if (lastBlock != null) { // should return null if no lastBlock...? if (sameLine(lastBlock, thisBlock, maxX, postNG, ignoreFontsize)) // we "merge" { // TODO: delete! // System.out.println("merging " + newString + " withspace " + thisBlock.getCharacter()); if (merge) { newItems.add(thisBlock); } else { newItems = new ArrayList<TextSegment>(); newItems.add(thisBlock); merge = true; } } else // we don't merge { // TODO: add all sub-objects, and fix font! (not null) CompositeSegment<TextSegment> newLine = new CompositeSegment<TextSegment>(); newLine.setItems(newItems); newLine.setCalculatedFields(); retVal.add(newLine); // nothing to merge with => // simply assign all new variables newItems = new ArrayList<TextSegment>(); newItems.add(thisBlock); // TODO: replace with a proper average (mode?) // newFontSize = fontSize; //first = false; merge = true; } } else { // nothing to merge with => // simply assign all new variables newItems = new ArrayList<TextSegment>(); newItems.add(thisBlock); // TODO: replace with a proper average (mode?) // newFontSize = fontSize; //first = false; merge = true; } lastBlock = thisBlock; //first = false; } // add last block if appropriate if (newItems.size() > 0) { CompositeSegment<TextSegment> newLine = new CompositeSegment<TextSegment>(); newLine.setItems(newItems); newLine.setCalculatedFields(); retVal.add(newLine); } // 2011-05-28: lines; fontsize should be the maximum of all blocks! for (CompositeSegment<? extends TextSegment> l : retVal) { float largestFontSize = 0.0f; boolean changeMade = false; for (GenericSegment gs : l.getItems()) { if (gs instanceof TextSegment) { float thisFontSize = ((TextSegment)gs).getFontSize(); if (thisFontSize > largestFontSize) { largestFontSize = thisFontSize; changeMade = true; } } } if (changeMade) l.setFontSize(largestFontSize); } return retVal; } // TODO: rewrite to make clearer -- it all works with TextSegments now! private static boolean sameLine(TextSegment lastBlock, TextSegment thisBlock, float maxX, boolean postNG, boolean ignoreFontsize) { // added 12.06.07 if(thisBlock.getX1() < lastBlock.getXmid()) return false; if (postNG) return (SegmentUtils.vertIntersect(lastBlock, thisBlock.getYmid()) || SegmentUtils.vertIntersect(thisBlock, lastBlock.getYmid())); // problem with atomic line finding on tm_03dec08_p04z.pdf // changed 4.05.09 //GenericSegment.vertIntersect(lastBlock, thisBlock); float fontSize; boolean sameFontSize; boolean xGuard; if (lastBlock instanceof TextSegment && thisBlock instanceof TextSegment) { fontSize = (lastBlock.getFontSize() + thisBlock.getFontSize()) / 2.0f; sameFontSize = Utils.within(lastBlock.getFontSize(), thisBlock.getFontSize(), fontSize * 0.15f); //System.out.println("fontSize: " + fontSize + " maxX: " + maxX + " product: " + fontSize * maxX); xGuard = Utils.within(lastBlock.getX2(), thisBlock.getX1(), fontSize * maxX); } else { // completely nonsensical to line-find on GenericSegments(!) fontSize = -1.0f; sameFontSize = false; xGuard = false; } if (ignoreFontsize) sameFontSize = true; // for PDF-TREX comparison was 0.1f; later changed to 0.25f return (Utils.within(lastBlock.getY1(), thisBlock.getY1(), fontSize * Utils.sameLineTolerance) //&& !crosses(lastBlock, thisBlock, pageDivs) && sameFontSize && xGuard); } }