/** * pdfXtk - PDF Extraction Toolkit * Copyright (c) by the authors/contributors. All rights reserved. * This project includes code from PDFBox and TouchGraph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * 3. Neither the names pdfXtk or PDF Extraction Toolkit; nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * http://pdfxtk.sourceforge.net * */ package at.ac.tuwien.dbai.pdfwrap.analysis; import at.ac.tuwien.dbai.pdfwrap.comparators.EdgeAttributeComparator; import at.ac.tuwien.dbai.pdfwrap.model.document.CompositeSegment; import at.ac.tuwien.dbai.pdfwrap.model.document.GenericSegment; import at.ac.tuwien.dbai.pdfwrap.model.document.TextSegment; import at.ac.tuwien.dbai.pdfwrap.model.graph.AdjacencyEdge; import at.ac.tuwien.dbai.pdfwrap.utils.SegmentUtils; import at.ac.tuwien.dbai.pdfwrap.utils.Utils; import java.util.Comparator; import java.util.List; /** * Text block segmentation rules * * @author Tamir Hassan, pdfanalyser@tamirhassan.com * @version PDF Analyser 0.9 */ public class TextBlockPageSegmenter extends AbstractPageSegmenter { public static float MAX_CLUST_LINE_SPACING = 1.75f; // 5524.pdf i-cite public static float MIN_CLUST_LINE_SPACING = 0.25f; // Baghdad problem! 30.07.08 public static float MAX_COL_LINE_THRESHOLD = 3.5f; // final static float LINE_SPACING_TOLERANCE = 0.25f; public static float LINE_SPACING_TOLERANCE = 0.05f; // changed 30.10.10 // NOTE! This linespacing tolerance does not apply to OCR; // 9.01.11 also does not apply to str conversions; // PageProcessor changes this value if a page image is used /* protected static boolean clusterTogether(AdjacencyEdge<GenericSegment> ae, CandidateCluster clustFrom, CandidateCluster clustTo, List<AdjacencyEdge<GenericSegment>> allEdges, HashMap vertNeighbourMap, List<? extends GenericSegment> items, int processPhase) { if (processPhase == 2) return clusterTogether2(ae, clustFrom, clustTo, allEdges, items); else return clusterTogether1(ae, clustFrom, clustTo, allEdges, vertNeighbourMap); } */ // TODO: only vertical edges actually need to be passed here...? public int clusterTogether(AdjacencyEdge<GenericSegment> ae, CandidateCluster clustFrom, CandidateCluster clustTo) { TextSegment segFrom = (TextSegment)ae.getNodeFrom(); TextSegment segTo = (TextSegment)ae.getNodeTo(); // 30.10.10 -- Cluster.lineSpacing is not a multiple, but rather the absolute linespacing // float lineSpacing = ae.getEdgeLength(); boolean boolRetVal; // caution: do not confuse segFrom with clustFrom :) // don't cluster the same segment together(!) // (should not happen anyway...) if (segFrom == segTo) boolRetVal = false; else { if (ae.isHorizontal()) boolRetVal = clusterTogetherHoriz(ae, clustFrom, clustTo); else boolRetVal = clusterTogetherVert(ae, clustFrom, clustTo); } if (boolRetVal == true) return 1; //true else return -1; //false // 0 -- third state } protected boolean clusterTogetherHoriz(AdjacencyEdge<GenericSegment> ae, CandidateCluster clustFrom, CandidateCluster clustTo) { TextSegment segFrom = (TextSegment)ae.getNodeFrom(); TextSegment segTo = (TextSegment)ae.getNodeTo(); if (clustFrom == null) { clustFrom = new CandidateCluster(); clustFrom.getItems().add(segFrom); clustFrom.findLinesWidth(); clustFrom.findBoundingBox(); // precondition for findNVN } if (clustTo == null) { clustTo = new CandidateCluster(); clustTo.getItems().add(segFrom); clustTo.findLinesWidth(); clustFrom.findBoundingBox(); // precondition for findNVN } // don't cluster the same cluster together(!) if (clustFrom == clustTo) return false; long t = System.currentTimeMillis(); // changed on 30.04.09 to use segments rather than clusters List<GenericSegment> neighboursFrom = AbstractPageSegmenter.findNearestVerticalNeighbours(segFrom, allEdges, vertNeighbourMap); List<GenericSegment> neighboursTo = AbstractPageSegmenter.findNearestVerticalNeighbours(segTo, allEdges, vertNeighbourMap); TextSegment closestNeighbourFrom = null; if (neighboursFrom.get(0) != null && neighboursFrom.get(1) != null) { float distanceAbove = ((TextSegment)neighboursFrom.get(0)).getY1() - segFrom.getY2(); float distanceBelow = segFrom.getY1() - ((TextSegment)neighboursFrom.get(1)).getY2(); if (distanceAbove < distanceBelow) closestNeighbourFrom = (TextSegment)neighboursFrom.get(0); else closestNeighbourFrom = (TextSegment)neighboursFrom.get(1); } else if (neighboursFrom.get(0) != null) { closestNeighbourFrom = (TextSegment)neighboursFrom.get(0); } else if (neighboursFrom.get(1) != null) { closestNeighbourFrom = (TextSegment)neighboursFrom.get(1); } TextSegment closestNeighbourTo = null; if (neighboursTo.get(0) != null && neighboursTo.get(1) != null) { float distanceAbove = ((TextSegment)neighboursTo.get(0)).getY1() - segTo.getY2(); float distanceBelow = segTo.getY1() - ((TextSegment)neighboursTo.get(1)).getY2(); if (distanceAbove < distanceBelow) closestNeighbourTo = (TextSegment)neighboursTo.get(0); else closestNeighbourTo = (TextSegment)neighboursTo.get(1); } else if (neighboursTo.get(0) != null) { closestNeighbourTo = (TextSegment)neighboursTo.get(0); } else if (neighboursTo.get(1) != null) { closestNeighbourTo = (TextSegment)neighboursTo.get(1); } TextSegment closestNeighbour = null; float neighbourDistance = -1; if (closestNeighbourFrom != null && closestNeighbourTo != null) { float distanceFrom; if (closestNeighbourFrom.getYmid() < segFrom.getYmid()) distanceFrom = segFrom.getY1() - closestNeighbourFrom.getY2(); else distanceFrom = closestNeighbourFrom.getY1() - segFrom.getY2(); float distanceTo; if (closestNeighbourTo.getYmid() < segTo.getYmid()) distanceTo = segTo.getY1() - closestNeighbourTo.getY2(); else distanceTo = closestNeighbourTo.getY1() - segTo.getY2(); if (distanceFrom < distanceTo) { closestNeighbour = closestNeighbourFrom; neighbourDistance = distanceFrom; } else { closestNeighbour = closestNeighbourTo; neighbourDistance = distanceTo; } } else if (closestNeighbourFrom != null) { closestNeighbour = closestNeighbourFrom; float distanceFrom; if (closestNeighbourFrom.getYmid() < segFrom.getYmid()) distanceFrom = segFrom.getY1() - closestNeighbourFrom.getY2(); else distanceFrom = closestNeighbourFrom.getY1() - segFrom.getY2(); neighbourDistance = distanceFrom; } else if (closestNeighbourTo != null) { closestNeighbour = closestNeighbourTo; float distanceTo; if (closestNeighbourTo.getYmid() < segTo.getYmid()) distanceTo = segTo.getY1() - closestNeighbourTo.getY2(); else distanceTo = closestNeighbourTo.getY1() - segTo.getY2(); neighbourDistance = distanceTo; } // TODO: neighbourDistance is not used at all! float max_horiz_edge_width = 0.75f; if (!(clustFrom.getFoundLines().size() <= 2 || clustTo.getFoundLines().size() <= 2)) max_horiz_edge_width = 0.85f; if (!(clustFrom.getFoundLines().size() <= 1 || clustTo.getFoundLines().size() <= 1)) max_horiz_edge_width = 1.0f; // if baseline of both segs doesn't match, reduce to 0.3 // addition of 30.04.09 boolean sameBaseline = Utils.within(segFrom.getY1(), segTo.getY1(), Utils.calculateThreshold(segFrom, segTo, 0.20f)); if (!sameBaseline) max_horiz_edge_width = 0.3f; //float d = neighbourDistance / ae.getFontSize(); // 29.04.09: we recalculate (at least for horiz. edges) // the lineSpacing (i.e. relative edge length) // using the smallest of both fontsize values... float smallestFontSize = ((TextSegment)ae.getNodeFrom()).getFontSize(); if (((TextSegment)ae.getNodeFrom()).getFontSize() > ((TextSegment)ae.getNodeTo()).getFontSize()) smallestFontSize = ((TextSegment)ae.getNodeTo()).getFontSize(); float horizGap = ae.physicalLength() / smallestFontSize; if (horizGap > max_horiz_edge_width) return false; return true; } protected boolean clusterTogetherVert(AdjacencyEdge<GenericSegment> ae, CandidateCluster clustFrom, CandidateCluster clustTo) { TextSegment segFrom = (TextSegment)ae.getNodeFrom(); TextSegment segTo = (TextSegment)ae.getNodeTo(); float lineSpacing; if (ae.getDirection() == AdjacencyEdge.REL_ABOVE) lineSpacing = ae.getNodeTo().getY1() - ae.getNodeFrom().getY1(); else // REL_BELOW lineSpacing = ae.getNodeFrom().getY1() - ae.getNodeTo().getY1(); lineSpacing = lineSpacing/ae.avgFontSize(); // System.out.println("eins"); if (!(Utils.sameFontSize(segFrom, segTo))) return false; // System.out.println("lineSpacing: " + lineSpacing); // System.out.println("zwei"); if (!(lineSpacing <= MAX_CLUST_LINE_SPACING && lineSpacing >= MIN_CLUST_LINE_SPACING)) return false; // System.out.println("drei"); // 2011-10-28: the first three if clauses are not executed any more, // as the algorithm now takes singleton clusters as input if (clustFrom == null && clustTo == null) { // System.out.println("drei punkt eins"); return true; } else if (clustFrom == null) { // System.out.println("drei punkt zwei"); // check if line spacing matches that of cluster, or has not yet been // assigned if (clustTo.getRelLineSpacing() == 0.0f || Utils.within(lineSpacing, clustTo.getRelLineSpacing(), LINE_SPACING_TOLERANCE)) return true; } else if (clustTo == null) { // System.out.println("drei punkt drei"); // check if line spacing matches that of cluster, or has not yet been // assigned if (clustFrom.getRelLineSpacing() == 0.0f || Utils.within(lineSpacing, clustFrom.getRelLineSpacing(), LINE_SPACING_TOLERANCE)) return true; } else { // System.out.println("drei punkt vier"); // don't cluster the same segments together! if (clustFrom == clustTo) return false; // check that the line spacings are the same and ?within the threshold? boolean sameLineSpacing = (Utils.within(clustFrom.getRelLineSpacing(), clustTo.getRelLineSpacing(), LINE_SPACING_TOLERANCE)); if (clustFrom.getRelLineSpacing() == 0.0f || clustTo.getRelLineSpacing() == 0.0f) sameLineSpacing = true; // System.out.println("drei punkt fünf"); // System.out.println("ls: " + clustFrom.getLineSpacing() + " clustFrom: " + clustFrom); // System.out.println("ls: " + clustTo.getLineSpacing() + " clustTo: " + clustFrom); // System.out.println("LINE_SPACING_TOLERANCE = " + LINE_SPACING_TOLERANCE); // highly unlikely that it will succeed with sameLineSpacing but // fail here but just in case... // also return true if either linespacing unassigned boolean clustFromValidLineSpacing = clustFrom.getRelLineSpacing() == 0.0f || Utils.within(lineSpacing, clustFrom.getRelLineSpacing(), LINE_SPACING_TOLERANCE); boolean clustToValidLineSpacing = clustTo.getRelLineSpacing() == 0.0f || Utils.within(lineSpacing, clustTo.getRelLineSpacing(), LINE_SPACING_TOLERANCE); return (sameLineSpacing && clustFromValidLineSpacing && clustToValidLineSpacing); } // System.out.println("vier"); return false; } public boolean isValidCluster(CandidateCluster c) { // prerequisite for calling this method is that the lines have been found ... // and that the average linespacing has been found //c.findLinesWidth(); c.setCalculatedFields(); // now, we check that the linespacing is constant by comparing the // spacing of each consecutive line with the average linespacing boolean clashingLines = false; CompositeSegment<? extends GenericSegment> prevLine = null; for (CompositeSegment<? extends GenericSegment> l : c.getFoundLines()) { if (prevLine != null) { float lineSpacing = (prevLine.getY1() - l.getY1()) / c.getFontSize(); if (SegmentUtils.vertIntersect(prevLine, l.getYmid())) clashingLines = true; // System.out.println("lineSpacing: " + lineSpacing); if (!Utils.within(lineSpacing, c.getRelLineSpacing(), LINE_SPACING_TOLERANCE)) return false; // fontsize check too } prevLine = l; } // System.out.println("returning: " + !checkForChasms(c)); return !AbstractPageSegmenter.checkForChasms(c); } public Comparator<AdjacencyEdge<? extends GenericSegment>> edgeComparator() { return new EdgeAttributeComparator(); } public boolean horizSkip() { return true; } public boolean doSwallow() { return true; } // no effect if doSwallow is true public boolean doOverlap() { return true; } public boolean neighbourMap() { return true; } }