/**
* pdfXtk - PDF Extraction Toolkit
* Copyright (c) by the authors/contributors. All rights reserved.
* This project includes code from PDFBox and TouchGraph.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* 3. Neither the names pdfXtk or PDF Extraction Toolkit; nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* http://pdfxtk.sourceforge.net
*
*/
package at.ac.tuwien.dbai.pdfwrap.analysis;
import at.ac.tuwien.dbai.pdfwrap.comparators.YComparator;
import at.ac.tuwien.dbai.pdfwrap.comparators.EdgeAttributeComparator;
import at.ac.tuwien.dbai.pdfwrap.model.document.*;
import at.ac.tuwien.dbai.pdfwrap.model.graph.AdjacencyEdge;
import at.ac.tuwien.dbai.pdfwrap.model.graph.AdjacencyGraph;
import at.ac.tuwien.dbai.pdfwrap.utils.ListUtils;
import at.ac.tuwien.dbai.pdfwrap.utils.SegmentUtils;
import at.ac.tuwien.dbai.pdfwrap.utils.Utils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.*;
/**
 * Abstract framework for implementing page segmentation algorithms.
 * Concrete subclasses supply the clustering policy by implementing the
 * abstract methods (edgeComparator, clusterTogether, isValidCluster, ...);
 * this class provides the greedy, best-first edge-processing machinery.
*
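 * <p>A minimal usage sketch (hypothetical: {@code MyBlockSegmenter} stands in
 * for any concrete subclass and {@code lineAG} for a line-level adjacency
 * graph built elsewhere):</p>
 * <pre>{@code
 * AbstractPageSegmenter segmenter = new MyBlockSegmenter();
 * segmenter.setMaxIterations(0); // values <= 0 mean "no iteration limit"
 * List<TextBlock> blocks = segmenter.clusterLinesIntoTextBlocks(lineAG);
 * }</pre>
 *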
* @author Tamir Hassan, pdfanalyser@tamirhassan.com
* @version PDF Analyser 0.9
*/
public abstract class AbstractPageSegmenter {
private static final Log log = LogFactory.getLog( AbstractPageSegmenter.class );
protected int maxIterations = Integer.MAX_VALUE;
protected List<GenericSegment> allSegments;
protected List<GenericSegment> unusedSegments;
protected List<AdjacencyEdge<GenericSegment>> allEdges;
    protected List<AdjacencyEdge<GenericSegment>> priorityEdges; // sorted version of the edge list
protected AdjacencyGraph<? extends GenericSegment> ag;
protected HashMap<GenericSegment, CandidateCluster> clustHash;
protected HashMap<GenericSegment, List<GenericSegment>> vertNeighbourMap;
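    // Policy hooks implemented by concrete segmenters:
    // - edgeComparator():  ordering in which edges are processed (best first)
    // - clusterTogether(): 1 = join across this edge, 0 = postpone (the edge is
    //                      kept in remainingEdges), any other value = do not join
    // - isValidCluster():  veto on a freshly built candidate cluster
    // - doSwallow(), doOverlap(): control the swallowing behaviour (see swallow())
    // - horizSkip(), neighbourMap(): further feature switches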
public abstract Comparator<AdjacencyEdge<? extends GenericSegment>> edgeComparator();
public abstract int clusterTogether(AdjacencyEdge<GenericSegment> ae,
CandidateCluster clustFrom, CandidateCluster clustTo);
public abstract boolean isValidCluster(CandidateCluster c);
public abstract boolean horizSkip();
public abstract boolean doSwallow();
public abstract boolean doOverlap();
public abstract boolean neighbourMap();
protected boolean isDebugMode() {
return false;
}
    // returns true iff the two collections differ (different sizes, or cols
    // contains an item that values does not)
    protected boolean checkHashes(Collection<?> cols, Collection<?> values) {
        if (cols.size() == values.size()) {
            for (Object item : cols) {
                if (!values.contains(item))
                    return true;
            }
            return false;
        } else {
            return true;
        }
    }
    protected boolean inSwallowGroup(CandidateCluster c,
            List<GenericSegment> swallowedItems) {
        if (doSwallow())
            return true;
        // collect the segments that would be newly added to c
        List<GenericSegment> swallowedSegments = new ArrayList<GenericSegment>();
        for (GenericSegment o : swallowedItems) {
            if (!c.getItems().contains(o))
                swallowedSegments.add(o);
        }
        // now we need to make sure that each new segment can (will) be added
        // to c anyway, i.e. that priorityEdges contains an edge between the
        // segment and some item already in c
        for (GenericSegment s : swallowedSegments) {
            boolean foundMemberOfCGetItems = false;
            for (AdjacencyEdge<GenericSegment> e : priorityEdges) {
                if (e.getNodeFrom() == s || e.getNodeTo() == s) {
                    if (c.getItems().contains(e.getNodeFrom()))
                        foundMemberOfCGetItems = true;
                    if (c.getItems().contains(e.getNodeTo()))
                        foundMemberOfCGetItems = true;
                }
            }
            if (!foundMemberOfCGetItems)
                return false;
        }
        return true;
    }
public List<TextBlock> clusterLinesIntoTextBlocks(
AdjacencyGraph<? extends GenericSegment> lineAG) {
List<TextBlock> retVal = new ArrayList<TextBlock>();
List<CandidateCluster> l = orderedEdgeCluster(lineAG);
for (CandidateCluster c : l) {
TextBlock tb = new TextBlock(c.getX1(), c.getX2(), c.getY1(),
c.getY2(), c.getText(), c.getFontName(), c.getFontSize());
tb.setLineSpacing(c.getRelLineSpacing());
for (TextSegment ts : c.getItems())
                tb.getItems().add((TextLine) ts); // NOTE: not type-safe; crashes here if given e.g. TextFragments as input
retVal.add(tb);
}
return retVal;
}
public List<TextBlock> clusterFragsIntoTextBlocks(
AdjacencyGraph<? extends GenericSegment> lineAG) {
List<TextBlock> retVal = new ArrayList<TextBlock>();
List<CandidateCluster> l = orderedEdgeCluster(lineAG);
for (CandidateCluster c : l) {
TextBlock tb = new TextBlock(c.getX1(), c.getX2(), c.getY1(),
c.getY2(), c.getText(), c.getFontName(), c.getFontSize());
tb.setLineSpacing(c.getRelLineSpacing());
for (TextSegment ts : c.getItems()) {
TextFragment tf = (TextFragment) ts;
LineFragment lf = new LineFragment();
lf.getItems().add(tf);
lf.setCalculatedFields(tf);
TextLine tl = new TextLine();
tl.getItems().add(lf);
tl.setCalculatedFields(lf);
                tb.getItems().add(tl); // NOTE: the cast to TextFragment above is not type-safe; crashes if the input items are not TextFragments
}
retVal.add(tb);
}
return retVal;
}
public List<TextBlock> clusterCharsIntoTextBlocks(
AdjacencyGraph<? extends GenericSegment> lineAG) {
List<TextBlock> retVal = new ArrayList<TextBlock>();
List<CandidateCluster> l = orderedEdgeCluster(lineAG);
for (CandidateCluster c : l) {
TextBlock tb = new TextBlock(c.getX1(), c.getX2(), c.getY1(),
c.getY2(), c.getText(), c.getFontName(), c.getFontSize());
tb.setLineSpacing(c.getRelLineSpacing());
for (TextSegment ts : c.getItems()) {
CharSegment cs = (CharSegment) ts;
TextFragment tf = new TextFragment();
tf.getItems().add(cs);
tf.setCalculatedFields(cs);
LineFragment lf = new LineFragment();
lf.getItems().add(tf);
lf.setCalculatedFields(tf);
TextLine tl = new TextLine();
tl.getItems().add(lf);
tl.setCalculatedFields(lf);
                tb.getItems().add(tl); // NOTE: the cast to CharSegment above is not type-safe; crashes if the input items are not CharSegments
}
retVal.add(tb);
}
return retVal;
}
    // NOTE: when starting from a partial segmentation, pass the existing clustHash;
    // otherwise pass null
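    // e.g. (hypothetical usage):
    //   segmenter.initializeSegmenter(ag, null);              // fresh segmentation
    //   segmenter.initializeSegmenter(ag, existingClustHash); // resume a partial result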
public void initializeSegmenter(
AdjacencyGraph<? extends GenericSegment> ag,
HashMap<GenericSegment, CandidateCluster> clustHash) {
if (clustHash != null)
this.clustHash = clustHash;
this.ag = ag;
if (maxIterations <= 0)
maxIterations = Integer.MAX_VALUE;
else
System.out.println("running with " + maxIterations + " iterations");
unusedSegments = new ArrayList<GenericSegment>();
allSegments = new ArrayList<GenericSegment>();
for (GenericSegment s : ag.getVertSegmentList()) {
allSegments.add(s);
unusedSegments.add(s);
}
priorityEdges = new ArrayList<AdjacencyEdge<GenericSegment>>();
allEdges = new ArrayList<AdjacencyEdge<GenericSegment>>();
// optimizationEdges = new ArrayList<AdjacencyEdge<GenericSegment>>();
for (AdjacencyEdge<?> e : ag.getEdges()) {
// if (e.isHorizontal())
if (e.isVertical()) {
AdjacencyEdge<GenericSegment> aegs = new AdjacencyEdge<GenericSegment>(
e.getNodeFrom(), e.getNodeTo(), e.getDirection(),
e.getWeight());
priorityEdges.add(aegs);
allEdges.add(aegs);
}
}
Collections.sort(priorityEdges, edgeComparator());
// priorityEdges.removeDuplicateEdges();
List<AdjacencyEdge<GenericSegment>> edgesToRemove = new ArrayList<AdjacencyEdge<GenericSegment>>();
int count = 0;
for (AdjacencyEdge<GenericSegment> ae : priorityEdges) {
count++;
if (count > maxIterations)
edgesToRemove.add(ae);
}
priorityEdges.removeAll(edgesToRemove);
// System.out.println("reduced size to: " + priorityEdges.size());
        // added 2011-10-28
        // initialize vertNeighbourMap, used by some clusterTogether methods;
        // note that this map is only populated on demand
vertNeighbourMap = new HashMap<GenericSegment, List<GenericSegment>>();
}
/**
 * Greedy, best-first page segmentation: the vertical edges of the adjacency
 * graph are sorted by edgeComparator() and processed in order, growing and
 * merging candidate clusters via clusterTogether / isValidCluster.
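 *
 * <p>Sketch of the overall flow (simplified from initializeSegmenter and
 * processEdges below; {@code lineAG} is assumed to have been built elsewhere
 * from the page's text lines):</p>
 * <pre>{@code
 * // 1. collect the vertical edges of the adjacency graph
 * // 2. sort them with edgeComparator()
 * // 3. for each edge, start, extend or merge clusters
 * //    (clusterTogether / swallow / isValidCluster)
 * List<CandidateCluster> clusters = segmenter.orderedEdgeCluster(lineAG);
 * }</pre>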
*/
public List<CandidateCluster> orderedEdgeCluster(
AdjacencyGraph<? extends GenericSegment> lineAG) {
initializeSegmenter(lineAG, null);
// ListUtils.printList(priorityEdges);
// System.out.println("allSegments: " + allSegments);
SegmentationResult retInt = processEdges(new SegmentationResult(
allSegments), priorityEdges);
// , new FirstPassSegmentationRules());
return retInt.getSegments();
}
    // operates on a clone of sr; the passed SegmentationResult is not modified
public SegmentationResult processEdges(SegmentationResult sr,
            List<AdjacencyEdge<GenericSegment>> selectedEdges) {
SegmentationResult cloneSegResult = sr.clone();
int counter = 0;
for (AdjacencyEdge<GenericSegment> ae : selectedEdges) {
counter++;
if (counter <= maxIterations) {
if (isDebugMode())
System.out
.println("processing edge " + counter + ": " + ae);
// if (counter % 100 == 0)
// System.out.println("processing edge " + counter + ": " + ae);
processEdge(ae, cloneSegResult);// , rules);
// and find redundant edges...
// find which items were SWALLOWED -- these should take us
// directly to the edges
// ListUtils.removeDuplicates(sr.joinedEdges);
}
}
return cloneSegResult;
}
    // operates on a clone of sr; the passed SegmentationResult is not modified
public SegmentationResult processEdgesResort(SegmentationResult sr,
            List<AdjacencyEdge<GenericSegment>> selectedEdges) {
SegmentationResult cloneSegResult = sr.clone();
int counter = 0;
List<AdjacencyEdge<GenericSegment>> cloneEdges = new ArrayList<AdjacencyEdge<GenericSegment>>();
for (AdjacencyEdge<GenericSegment> ae : selectedEdges)
cloneEdges.add(ae);
while (cloneEdges.size() > 0) {
            // re-sort the remaining edges before each step
            Collections.sort(cloneEdges, this.edgeComparator());
AdjacencyEdge<GenericSegment> ae = cloneEdges.remove(0);
counter++;
if (isDebugMode())
System.out.println("processing edge sort " + counter + ": "
+ ae);
processEdge(ae, cloneSegResult);// , rules);
}
return cloneSegResult;
}
    // TODO (static)
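    // Processes a single edge: depending on whether each endpoint already belongs
    // to a cluster, this either starts a new cluster, extends an existing one, or
    // merges two clusters, subject to clusterTogether / isValidCluster and the
    // swallowing check.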
public void processEdge(AdjacencyEdge<GenericSegment> ae,
SegmentationResult sr)// , ISegmentationRules rules)
{
TextSegment segFrom = (TextSegment) ae.getNodeFrom();
TextSegment segTo = (TextSegment) ae.getNodeTo();
if (sr.clustHash.get(segFrom) == null
&& sr.clustHash.get(segTo) == null) {
if (isDebugMode())
System.out.println("one");
            int ctTest = clusterTogether(ae, null, null);
if (ctTest == 1) {
if (isDebugMode())
System.out.println("two");
List<AdjacencyEdge<GenericSegment>> swallowedEdges = new ArrayList<AdjacencyEdge<GenericSegment>>();
if (!doSwallow())
swallowedEdges.add(ae);
List<GenericSegment> swallowedSegments = swallow(
createList(segFrom), createList(segTo), sr.clustHash,
swallowedEdges); // clustHash not modified; only lookup
if (isDebugMode())
System.out.println("three");
CandidateCluster newc = makeCluster(swallowedSegments);
if (isValidCluster(newc)) {
if (isDebugMode())
System.out.println("four");
sr.addSegmentUpdateHash(newc);
sr.addJoinedEdges(swallowedEdges);
}
} else if (ctTest == 0) {
sr.remainingEdges.add(ae);
}
// else do nothing
} else if (sr.clustHash.get(segFrom) == null) {
if (isDebugMode())
System.out.println("five");
CandidateCluster c = sr.clustHash.get(segTo);
            int ctTest = clusterTogether(ae, null, c);
if (ctTest == 1) {
if (isDebugMode())
System.out.println("six");
List<AdjacencyEdge<GenericSegment>> swallowedEdges = new ArrayList<AdjacencyEdge<GenericSegment>>();
if (!doSwallow())
swallowedEdges.add(ae);
List<GenericSegment> swallowedSegments = swallow(
cloneList(c.getItems()), createList(segFrom),
                        sr.clustHash, swallowedEdges); // clustHash not modified; only lookup
if (isDebugMode())
System.out.println("seven");
CandidateCluster newc = makeCluster(swallowedSegments);
if (isValidCluster(newc)) {
if (isDebugMode())
System.out.println("eight");
sr.addSegmentUpdateHash(newc);
sr.addJoinedEdges(swallowedEdges);
sr.segments.remove(c);
}
} else if (ctTest == 0) {
sr.remainingEdges.add(ae);
}
} else if (sr.clustHash.get(segTo) == null) {
System.out.println("segTo: " + segTo);
System.out.println("null reached, containskey: "
+ sr.clustHash.containsKey(segTo));
if (isDebugMode())
System.out.println("nine");
CandidateCluster c = sr.clustHash.get(segFrom);
            int ctTest = clusterTogether(ae, c, null);
if (ctTest == 1) {
if (isDebugMode())
System.out.println("ten");
List<AdjacencyEdge<GenericSegment>> swallowedEdges = new ArrayList<AdjacencyEdge<GenericSegment>>();
if (!doSwallow())
swallowedEdges.add(ae);
List<GenericSegment> swallowedSegments = swallow(
cloneList(c.getItems()), createList(segTo),
sr.clustHash, swallowedEdges);
// check if the addition doesn't swallow any additional elements
if (isDebugMode())
System.out.println("eleven");
CandidateCluster newc = makeCluster(swallowedSegments);
if (isValidCluster(newc)) {
if (isDebugMode())
System.out.println("twelve");
sr.addSegmentUpdateHash(newc);
sr.addJoinedEdges(swallowedEdges);
sr.segments.remove(c);
}
} else if (ctTest == 0) {
sr.remainingEdges.add(ae);
}
} else // both segments already used, merge
{
if (isDebugMode())
System.out.println("thirteen");
// only possibility for horizontal edge, as all other segments added
// as singletons by now
// merge the two clusters if compatible
CandidateCluster c1 = sr.clustHash.get(segFrom);
CandidateCluster c2 = sr.clustHash.get(segTo);
boolean skip = false;
// if (ae.isHorizontal() && horizSkip()) skip = true;
if (c1 == c2)
skip = true; // in clusterTogether -- redundant!
if (!skip) {
if (isDebugMode())
System.out.println("thirteenandahalf");
if (isDebugMode())
System.out.println("c1: " + c1);
if (isDebugMode())
System.out.println("c2: " + c2);
                int ctTest = clusterTogether(ae, c1, c2);
if (ctTest == 1) {
if (isDebugMode())
System.out.println("fourteen");
List<AdjacencyEdge<GenericSegment>> swallowedEdges = new ArrayList<AdjacencyEdge<GenericSegment>>();
if (!doSwallow())
swallowedEdges.add(ae);
List<GenericSegment> swallowedSegments = swallow(
cloneList(c1.getItems()), cloneList(c2.getItems()),
sr.clustHash, swallowedEdges);
// check if the addition doesn't swallow any additional
// elements
// if (ae.isVertical() || ae.isHorizontal() &&
// swallowedSegments.size() <= c1.getItems().size() +
// c2.getItems().size())
if (true) {
if (isDebugMode())
System.out.println("fifteen");
CandidateCluster newc = makeCluster(swallowedSegments);
if (isValidCluster(newc)) {
if (isDebugMode())
System.out.println("sixteen");
sr.addSegmentUpdateHash(newc);
sr.addJoinedEdges(swallowedEdges);
if (isDebugMode())
System.out.println("16a");
newc.findBoundingBox();
newc.setFontSize(ae.avgFontSize());
if (isDebugMode())
System.out.println("16b");
// newc.setLineSpacing(lineSpacing); // removed
// 29.10.10
newc.setCalculatedFields(); // 29.10.10
sr.segments.remove(c2);
sr.segments.remove(c1);
if (isDebugMode())
System.out.println("16c");
}
}
} else if (ctTest == 0) {
sr.remainingEdges.add(ae);
}
} else {
// part of same cluster; not merging :)
}
}
}
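    // Returns a two-element list [lowest neighbour above c, highest neighbour
    // below c] with respect to the vertical edges in allEdges, caching the
    // result in vertNeighbourMap. Either entry may be null if no such
    // neighbour exists.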
protected static List<GenericSegment> findNearestVerticalNeighbours(
GenericSegment c, List<AdjacencyEdge<GenericSegment>> allEdges,
HashMap<GenericSegment, List<GenericSegment>> vertNeighbourMap) {
if (vertNeighbourMap.containsKey(c)) {
return vertNeighbourMap.get(c);
} else {
// find lowest neighbourAbove and highest neighbourBelow
GenericSegment lowestNeighbourAbove = null;
GenericSegment highestNeighbourBelow = null;
            for (AdjacencyEdge<GenericSegment> ae : allEdges) {
                GenericSegment segFrom = ae.getNodeFrom();
                GenericSegment segTo = ae.getNodeTo();
if (ae.isVertical()) {
// we can assume that the bounding box of the current
// cluster is correct
// edge points into or out of cluster
if (c == segFrom && c != segTo) {
// segTo is the outside element
if (segTo.getYmid() > c.getY2()) {
// segTo is above the cluster
if (lowestNeighbourAbove == null
|| segTo.getYmid() < lowestNeighbourAbove
.getYmid()) {
lowestNeighbourAbove = segTo;
}
} else if (segTo.getYmid() < c.getY1()) {
// segTo is below the cluster
if (highestNeighbourBelow == null
|| segTo.getYmid() > highestNeighbourBelow
.getYmid()) {
highestNeighbourBelow = segTo;
}
} else {
// do nothing if within boundary of cluster
// but not swallowed for some reason
}
} else if (c != segFrom && c == segTo) {
// segFrom is the outside element
if (segFrom.getYmid() > c.getY2()) {
                            // segFrom is above the cluster
if (lowestNeighbourAbove == null
|| segFrom.getYmid() < lowestNeighbourAbove
.getYmid()) {
lowestNeighbourAbove = segFrom;
}
} else if (segFrom.getYmid() < c.getY1()) {
                            // segFrom is below the cluster
if (highestNeighbourBelow == null
|| segFrom.getYmid() > highestNeighbourBelow
.getYmid()) {
highestNeighbourBelow = segFrom;
}
} else {
// do nothing if within boundary of cluster
// but not swallowed for some reason
}
}
}
}
// System.out.println("in foo section");
// System.out.println("lowestNeighbourAbove: " +
// lowestNeighbourAbove);
// System.out.println("highestNeighbourBelow: " +
// highestNeighbourBelow);
List<GenericSegment> retVal = new ArrayList<GenericSegment>();
retVal.add(lowestNeighbourAbove);
retVal.add(highestNeighbourBelow);
vertNeighbourMap.put(c, retVal);
return retVal;
}
}
protected List<GenericSegment> swallow(List<GenericSegment> l1,
List<GenericSegment> l2) {
return swallow(l1, l2, this.clustHash, null);
}
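    // Swallowing: combines l1 and l2 and, depending on doSwallow()/doOverlap(),
    // repeatedly grows the combined bounding box to take in further segments it
    // intersects until a fixed point is reached. In the doSwallow() case the
    // entire cluster a swallowed segment belongs to is taken in as well, and any
    // edges joining two swallowed segments are collected in redundantEdges (if a
    // list was passed).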
protected List<GenericSegment> swallow(List<GenericSegment> l1,
List<GenericSegment> l2,
HashMap<GenericSegment, CandidateCluster> clustHash,
List<AdjacencyEdge<GenericSegment>> redundantEdges) {
/*
* if (!rules.swallow()) { // override swallow method to return just the
* items, no swallowing
*
* List<GenericSegment> retVal = new ArrayList<GenericSegment>();
* retVal.addAll(l1); retVal.addAll(l2); return retVal; }
*/
// TODO: RETURN LIST OF SWALLOWED ITEMS
CompositeSegment<GenericSegment> temp = new CompositeSegment<GenericSegment>();
temp.getItems().addAll(l1);
temp.getItems().addAll(l2);
temp.findBoundingBox();
List<GenericSegment> swallowedItems = new ArrayList<GenericSegment>();
// long startTime = System.currentTimeMillis();
if (!doSwallow()) {
if (doOverlap()) {
boolean loop = true;
while (loop) {
boolean changeMade = false;
// swallowedItems =
// ListUtils.findElementsIntersectingBBox(items, temp);
List<GenericSegment> itemsToAdd = new ArrayList<GenericSegment>();
                    for (GenericSegment gs : temp.getItems()) // loop through items in seg
                    {
                        for (GenericSegment gs2 : allSegments) // loop through _other_ items on page
                        {
if (!temp.getItems().contains(gs2)) {
if (gs != gs2
&& SegmentUtils.intersects(gs2, gs)) {
itemsToAdd.add(gs2);
temp.growBoundingBox(gs2);
changeMade = true;
}
}
}
}
temp.getItems().addAll(itemsToAdd);
if (!changeMade)
loop = false;
}
}
// System.out.println("finished swallow in: " +
// (System.currentTimeMillis() - startTime));
return temp.getItems();
} else {
boolean loop = true;
while (loop) {
// System.out.println("allsegments.size: " +
// allSegments.size());
// System.out.println("temp: " + temp.toExtendedString());
                swallowedItems = ListUtils.findElementsIntersectingBBox(allSegments, temp);
List<GenericSegment> newItems = new ArrayList<GenericSegment>();
for (GenericSegment gs : swallowedItems) {
                    if (clustHash.get(gs) != null) // if it belongs to another cluster
{
CandidateCluster clust = clustHash.get(gs);
newItems.addAll(clust.getItems());
// System.out.println("adding cc items: " +
// clust.toExtendedString());
}
}
swallowedItems.addAll(newItems);
ListUtils.removeDuplicates(swallowedItems);
if (temp.getItems().size() == swallowedItems.size()) {
// no items swallowed
loop = false;
} else // elsepart added 18.05.07
{
// commented out 6.06.07
// if (newSegment instanceof Cluster) return true;
}
temp.setItems(swallowedItems);
temp.findBoundingBox();
}
            if (redundantEdges != null) // if a list was passed to collect redundant edges
for (AdjacencyEdge<GenericSegment> ae : allEdges) {
if (swallowedItems.contains(ae.getNodeFrom())
&& swallowedItems.contains(ae.getNodeTo()))
redundantEdges.add(ae);
}
return swallowedItems;
}
}
// TODO: This method will crash if the input list contains non-TextSegments
// 2011-10-27 changed to static!
protected static CandidateCluster makeCluster(List<GenericSegment> items) {
CandidateCluster retVal = new CandidateCluster();
for (GenericSegment gs : items)
retVal.getItems().add((TextSegment) gs);
retVal.findFontSize(); // added 13.08.08
// retVal.findLinesWidth(); // this method uses pnglf and the entire
// width
// Collections.sort(retVal.getFoundLines(), new YComparator());
CandidateCluster tempClust = new CandidateCluster();
for (GenericSegment gs : items)
tempClust.getItems().add((TextSegment) gs);
// tempClust.flattenByOneLevel();
        // 14.08.08: findLines ensures that the resulting lines are SORTED,
        // but IntegrateLines destroys this... :(
        tempClust.findLines(Float.MAX_VALUE);
retVal.setFoundLines(tempClust.getFoundLines());
// above doesn't work due to NPE, but need to fix it soon TODO
// TODO: Integrate lines?
// retVal.integrateLines();
Collections.sort(retVal.getFoundLines(), new YComparator());
retVal.findBoundingBox();
return retVal;
}
// pre: foundLines and fontSize are set
// TODO: what when the line spacing is different? replace fontSize for
// vertical with lineSpacing?
// TODO: consider the shape of the gap and generate a score?
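    // Example (from the constants below): with a font size of 10pt, a gap only
    // counts as a chasm if it is more than 0.5 * 10 = 5pt wide and more than
    // 3.5 * 10 = 35pt tall, i.e. it spans roughly three or more lines of text.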
public static boolean checkForChasms(CandidateCluster cts) {
float minChasmHeight = 3.5f;
float minChasmWidth = 0.5f;
List<List<GenericSegment>> lineGaps = findLineGaps(cts, minChasmWidth
* cts.getFontSize());
        // TODO: tidy up these static helper methods
List<GenericSegment> gaps = mergeLineGaps(lineGaps,
minChasmWidth * cts.getFontSize(),
minChasmHeight * cts.getFontSize());
for (GenericSegment gap : gaps) {
if ((gap.getWidth() > minChasmWidth * cts.getFontSize())
&& (gap.getHeight() > minChasmHeight * cts.getFontSize()))
return true;
}
return false;
}
    // pre: foundLines must be set
    // returns a list of lists of gap segments, one inner list per found line
public static List<List<GenericSegment>> findLineGaps(CandidateCluster cts,
float minWidth) {
List<List<GenericSegment>> retVal = new ArrayList<List<GenericSegment>>();
for (CompositeSegment<? extends GenericSegment> l : cts.getFoundLines()) {
List<GenericSegment> lineGaps = new ArrayList<GenericSegment>();
for (int n = 1; n < l.getItems().size(); n++) {
GenericSegment a = l.getItems().get(n - 1);
GenericSegment b = l.getItems().get(n);
                // assume that a and b intersect and that b is to the right of a
if ((b.getX1() - a.getX2()) > minWidth) {
float newY1 = Utils.maximum(a.getY1(), b.getY1());
float newY2 = Utils.minimum(a.getY2(), b.getY2());
GenericSegment gapSeg = new GenericSegment(a.getX2(),
b.getX1(), newY1, newY2);
lineGaps.add(gapSeg);
}
}
retVal.add(lineGaps);
}
return retVal;
}
// TODO: for speedup, find a way to make this work with iterators instead
// of for/next loops.
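    // Merges gaps from consecutive lines into larger gap regions: a gap is
    // carried over to any horizontally overlapping gap in the next line (the
    // horizontal extent is narrowed to the common overlap and the vertical
    // extent grown); a gap with no continuation in the next line is emitted to
    // the result.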
public static List<GenericSegment> mergeLineGaps(
List<List<GenericSegment>> lineGaps, float minWidth, float minHeight) {
List<GenericSegment> retVal = new ArrayList<GenericSegment>();
for (int n = 1; n <= lineGaps.size(); n++) {
List<GenericSegment> thisLineGaps = lineGaps.get(n - 1);
List<GenericSegment> nextLineGaps = new ArrayList<GenericSegment>();
if (n < lineGaps.size())
nextLineGaps = lineGaps.get(n);
// compare this with next
// Iterator i = thisLineGaps.iterator();
// while(i.hasNext())
boolean potentialNewGap = false;
float lastX2 = -1.0f;
float lastY1 = -1.0f;
float lastY2 = -1.0f;
int lastIndex = -1;
for (int i = 0; i < thisLineGaps.size(); i++) {
            GenericSegment thisGap = thisLineGaps.get(i);
boolean intersects = false;
boolean addedGap = false;
if (potentialNewGap) {
if (lastX2 >= thisGap.getX1()) {
float newX2 = lastX2;
if (lastX2 <= thisGap.getX2()) {
potentialNewGap = false;
newX2 = thisGap.getX2();
}
GenericSegment newGap = new GenericSegment(
thisGap.getX1(), newX2, lastY1, thisGap.getY2());
nextLineGaps.add(lastIndex + 1, newGap);
addedGap = true;
intersects = true;
}
}
if (!addedGap) {
                for (int j = 0; j < nextLineGaps.size(); j++) {
                    GenericSegment nextGap = nextLineGaps.get(j);
if (SegmentUtils.horizIntersect(thisGap, nextGap)) {
intersects = true;
// update next with vertical co-ordinates of this;
// shrink x if necc.
nextGap.setY2(thisGap.getY2());
if (thisGap.getX1() > nextGap.getX1())
nextGap.setX1(thisGap.getX1());
if (thisGap.getX2() < nextGap.getX2()) {
lastX2 = nextGap.getX2();
lastY1 = nextGap.getY1();
lastY2 = nextGap.getY2();
nextGap.setX2(thisGap.getX2());
potentialNewGap = true;
lastIndex = j;
} else {
potentialNewGap = false;
}
}
}
}
// the last row doesn't intersect with anything anyway :)
if (!intersects)// || n == (lineGaps.size()))
{
// add the last one to the result
// add to result
retVal.add(thisGap);
}
}
// here add remaining last row of gaps (nextLineGaps)
// if (n == lineGaps.size() - 1)
// retVal.addAll(nextLineGaps);
}
return retVal;
}
protected static List<GenericSegment> createList(GenericSegment gs) {
List<GenericSegment> retVal = new ArrayList<GenericSegment>();
retVal.add(gs);
return (retVal);
}
protected static List<GenericSegment> cloneList(
List<? extends GenericSegment> l) {
List<GenericSegment> retVal = new ArrayList<GenericSegment>();
for (GenericSegment gs : l)
retVal.add(gs);
return (retVal);
}
public AdjacencyGraph<? extends GenericSegment> getAG() {
return ag;
}
public void setAG(AdjacencyGraph<? extends GenericSegment> ag) {
this.ag = ag;
}
public int getMaxIterations() {
return maxIterations;
}
public void setMaxIterations(int maxIterations) {
this.maxIterations = maxIterations;
}
public HashMap<GenericSegment, CandidateCluster> getClustHash() {
return clustHash;
}
public void setClustHash(HashMap<GenericSegment, CandidateCluster> clustHash) {
this.clustHash = clustHash;
}
}