/**
* pdfXtk - PDF Extraction Toolkit
* Copyright (c) by the authors/contributors. All rights reserved.
* This project includes code from PDFBox and TouchGraph.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* 3. Neither the names pdfXtk or PDF Extraction Toolkit; nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* http://pdfxtk.sourceforge.net
*
*/
package at.ac.tuwien.dbai.pdfwrap.analysis;
import at.ac.tuwien.dbai.pdfwrap.model.document.CompositeSegment;
import at.ac.tuwien.dbai.pdfwrap.model.document.TextSegment;
import at.ac.tuwien.dbai.pdfwrap.utils.SegmentUtils;
import at.ac.tuwien.dbai.pdfwrap.utils.Utils;
import java.util.ArrayList;
import java.util.List;
/**
 * Candidate cluster used in the segmentation algorithm.
 *
 * <p>A cluster is a composite of {@link TextSegment} items. Besides the
 * items themselves it caches a number of layout properties (text alignment,
 * average line spacing, whether font size and line spacing are constant
 * across lines, ...) that are derived from the lines found within the
 * cluster by {@link #findLines(float)} and {@link #processLines()}.
 *
 * @author Tamir Hassan, pdfanalyser@tamirhassan.com
 * @version PDF Analyser 0.9
 */
public class CandidateCluster extends CompositeSegment<TextSegment>
{
    /** Externally maintained flag; only read/written through its accessor pair in this class. */
    protected boolean constantFont = true;

    /** Externally maintained flag; only read/written through its accessor pair in this class. */
    protected boolean constantFontSize = true;

    /** Average difference between the Y1 coordinates of consecutive found lines. */
    protected float absLineSpacing;

    /** One of the {@code ALIGN_*} constants; {@link #ALIGN_UNSET} until computed or set. */
    protected int textAlignment;

    /** Set by {@link #processLines()}: every line spacing is close to the average spacing. */
    protected boolean constantLS;

    /** Set by {@link #processLines()}: every line's font size is close to the average font size. */
    protected boolean constantFS;

    /**
     * Never computed in this class, only exposed through its accessor pair.
     * NOTE(review): the meaning of "GS" is not visible here - confirm with callers.
     */
    protected boolean constantGS;

    /** Set by {@link #processLines()}: no two consecutive lines were found to clash vertically. */
    protected boolean uniqueLines;

    /** Externally maintained flag; only read/written through its accessor pair in this class. */
    protected boolean strContainsSuperSubscript = false;

    /** Lines found among the items; {@code null} until a line-finding method or the setter is called. */
    protected List<CompositeSegment<? extends TextSegment>> foundLines;

    /** Left, centre and right edges are all constant across lines. */
    public final static int ALIGN_LCR = 31;
    /** Left edge and centre are constant across lines. */
    public final static int ALIGN_LC = 32;
    /** Centre and right edge are constant across lines. */
    public final static int ALIGN_CR = 33;
    /** Only the left edge is constant across lines. */
    public final static int ALIGN_L = 34;
    /** Only the centre is constant across lines. */
    public final static int ALIGN_C = 35;
    /** Only the right edge is constant across lines. */
    public final static int ALIGN_R = 36;
    /** Neither edge nor the centre is constant across lines. */
    public final static int ALIGN_NONE = 37;
    /** Default value of the alignment field before anything has been computed. */
    public final static int ALIGN_UNSET = 0;

    /* 30.11.06: these constructors appear to be useless */
    // 1.12.06: but they are used by TextBlock...

    /**
     * Creates an empty cluster with the given bounds and text attributes.
     *
     * @param x1
     *            The x1 coordinate of the segment.
     * @param x2
     *            The x2 coordinate of the segment.
     * @param y1
     *            The y1 coordinate of the segment.
     * @param y2
     *            The y2 coordinate of the segment.
     * @param text
     *            The textual contents of the segment.
     * @param fontName
     *            The (main) font of the segment.
     * @param fontSize
     *            The (main) font size in the segment.
     */
    public CandidateCluster(float x1, float x2, float y1, float y2,
        String text, String fontName, float fontSize)
    {
        super(x1, x2, y1, y2, text, fontName, fontSize);
        this.items = new ArrayList<TextSegment>();
    }

    /**
     * Creates an empty cluster with the given bounds.
     *
     * @param x1 The x1 coordinate of the segment.
     * @param x2 The x2 coordinate of the segment.
     * @param y1 The y1 coordinate of the segment.
     * @param y2 The y2 coordinate of the segment.
     */
    public CandidateCluster(float x1, float x2, float y1, float y2)
    {
        super(x1, x2, y1, y2);
        this.items = new ArrayList<TextSegment>();
    }

    /**
     * Creates a cluster with the given bounds and text attributes around an
     * existing item list. The list is stored as-is (no copy is made).
     *
     * @param x1 The x1 coordinate of the segment.
     * @param x2 The x2 coordinate of the segment.
     * @param y1 The y1 coordinate of the segment.
     * @param y2 The y2 coordinate of the segment.
     * @param text The textual contents of the segment.
     * @param fontName The (main) font of the segment.
     * @param fontSize The (main) font size in the segment.
     * @param items The items making up this cluster.
     */
    public CandidateCluster(float x1, float x2, float y1, float y2,
        String text, String fontName, float fontSize, List<TextSegment> items)
    {
        super(x1, x2, y1, y2, text, fontName, fontSize);
        this.items = items;
    }

    /**
     * Creates a cluster with the given bounds around an existing item list.
     * The list is stored as-is (no copy is made).
     *
     * @param x1 The x1 coordinate of the segment.
     * @param x2 The x2 coordinate of the segment.
     * @param y1 The y1 coordinate of the segment.
     * @param y2 The y2 coordinate of the segment.
     * @param items The items making up this cluster.
     */
    public CandidateCluster(float x1, float x2, float y1, float y2,
        List<TextSegment> items)
    {
        super(x1, x2, y1, y2);
        this.items = items;
    }

    /**
     * Creates a cluster around an existing item list. The list is stored
     * as-is (no copy is made).
     *
     * @param items The items making up this cluster.
     */
    public CandidateCluster(List<TextSegment> items)
    {
        super();
        this.items = items;
    }

    /**
     * Creates an empty cluster. This is the most common way of initialization
     * now: the fields are filled in once all the items have been added.
     */
    public CandidateCluster()
    {
        super();
        this.items = new ArrayList<TextSegment>();
    }

    /** @return the current value of the constant-font flag */
    public boolean isConstantFont() {
        return constantFont;
    }

    /** @param constantFont the new value of the constant-font flag */
    public void setConstantFont(boolean constantFont) {
        this.constantFont = constantFont;
    }

    /** @return the current value of the constant-font-size flag */
    public boolean isConstantFontSize() {
        return constantFontSize;
    }

    /** @param constantFontSize the new value of the constant-font-size flag */
    public void setConstantFontSize(boolean constantFontSize) {
        this.constantFontSize = constantFontSize;
    }

    /**
     * Returns the line spacing relative to the font size.
     *
     * <p>This is a plain float division with no guard: a font size of zero
     * yields an infinite or NaN result rather than an exception.
     *
     * @return the absolute line spacing divided by the font size
     */
    public float getRelLineSpacing() {
        return (absLineSpacing / fontSize);
    }

    /** @return the absolute line spacing */
    public float getAbsLineSpacing() {
        return absLineSpacing;
    }

    /** @param absLineSpacing the new absolute line spacing */
    public void setAbsLineSpacing(float absLineSpacing) {
        this.absLineSpacing = absLineSpacing;
    }

    /** @return the text alignment as one of the {@code ALIGN_*} constants */
    public int getTextAlignment() {
        return textAlignment;
    }

    /** @param textAlignment the new alignment, one of the {@code ALIGN_*} constants */
    public void setTextAlignment(int textAlignment) {
        this.textAlignment = textAlignment;
    }

    /** @return {@code true} if the alignment is {@code ALIGN_L}, {@code ALIGN_LC} or {@code ALIGN_LCR} */
    public boolean isLeftAligned()
    {
        return textAlignment == ALIGN_L ||
            textAlignment == ALIGN_LC ||
            textAlignment == ALIGN_LCR;
    }

    /**
     * @return {@code true} if the alignment is {@code ALIGN_C},
     *         {@code ALIGN_LC}, {@code ALIGN_CR} or {@code ALIGN_LCR}
     */
    public boolean isCentreAligned()
    {
        return textAlignment == ALIGN_C ||
            textAlignment == ALIGN_LC ||
            textAlignment == ALIGN_CR ||
            textAlignment == ALIGN_LCR;
    }

    /** @return {@code true} if the alignment is {@code ALIGN_R}, {@code ALIGN_CR} or {@code ALIGN_LCR} */
    public boolean isRightAligned()
    {
        return textAlignment == ALIGN_R ||
            textAlignment == ALIGN_CR ||
            textAlignment == ALIGN_LCR;
    }

    /** @return whether the line spacing was found to be constant */
    public boolean isConstantLS() {
        return constantLS;
    }

    /** @param constantLS the new value of the constant-line-spacing flag */
    public void setConstantLS(boolean constantLS) {
        this.constantLS = constantLS;
    }

    /** @return whether the per-line font size was found to be constant */
    public boolean isConstantFS() {
        return constantFS;
    }

    /** @param constantFS the new value of the constant-font-size-across-lines flag */
    public void setConstantFS(boolean constantFS) {
        this.constantFS = constantFS;
    }

    /** @return the current value of the {@code constantGS} flag */
    public boolean isConstantGS() {
        return constantGS;
    }

    /** @param constantGS the new value of the {@code constantGS} flag */
    public void setConstantGS(boolean constantGS) {
        this.constantGS = constantGS;
    }

    /** @return whether no consecutive found lines were found to clash vertically */
    public boolean isUniqueLines() {
        return uniqueLines;
    }

    /** @param uniqueLines the new value of the unique-lines flag */
    public void setUniqueLines(boolean uniqueLines) {
        this.uniqueLines = uniqueLines;
    }

    /** @return the current value of the super/subscript flag */
    public boolean isStrContainsSuperSubscript() {
        return strContainsSuperSubscript;
    }

    /** @param strContainsSuperSubscript the new value of the super/subscript flag */
    public void setStrContainsSuperSubscript(boolean strContainsSuperSubscript) {
        this.strContainsSuperSubscript = strContainsSuperSubscript;
    }

    /**
     * @return the lines found so far, or {@code null} if no line-finding
     *         method has run and no list has been set
     */
    public List<CompositeSegment<? extends TextSegment>> getFoundLines() {
        return foundLines;
    }

    /**
     * Replaces the list of found lines. The derived fields are not
     * recomputed; call {@link #processLines()} for that.
     *
     * @param foundLines the new list of lines (stored as-is)
     */
    public void setFoundLines
        (List<CompositeSegment<? extends TextSegment>> foundLines) {
        this.foundLines = foundLines;
    }

    /** Finds lines using a horizontal threshold of {@code 0.5f}. */
    public void findLines()
    {
        findLines(0.5f);
    }

    /** Finds lines using {@link Float#MAX_VALUE} as the horizontal threshold. */
    public void findLinesWidth()
    {
        findLines(Float.MAX_VALUE);
    }

    /**
     * Finds the lines among the items of this cluster and recomputes the
     * line-derived fields via {@link #processLines()}.
     *
     * @param horizThreshold threshold handed through to
     *            {@code LineProcessor.findLines}
     */
    public void findLines(float horizThreshold)
    {
        foundLines =
            LineProcessor.findLines(items, horizThreshold, true, false);
        processLines();
    }

    /**
     * Recomputes all calculated fields of this cluster: lines (and the
     * fields derived from them), font size, bounding box and text.
     */
    public void setCalculatedFields()
    {
        //TODO: e.g. super.setCalculatedFields();
        // findLinesWidth() already runs processLines() on the fresh lines,
        // so no separate processLines() call is needed here.
        findLinesWidth(); // TODO: does this replacement (of findLines()) cause a problem?
        findFontSize(); // TODO: overlaps with the font size set by processLines
        findBoundingBox(); // NOT WITH NEW METHOD HERE!
        // 17.01.07 done automatically now during findLines
        findText();
    }

    /**
     * Derives alignment, average font size, average line spacing and the
     * constant-spacing / constant-font-size / unique-lines flags from the
     * found lines.
     *
     * <p>With fewer than two lines (including the case that no lines have
     * been computed yet) there is nothing to compare, so only the alignment
     * is set, to {@link #ALIGN_LCR}.
     */
    public void processLines()
    {
        // if singleton, or if no sub-objects etc.
        // NOTE(review): the other line-derived fields keep whatever value
        // they had before; confirm whether they should be reset here too.
        if (foundLines == null || foundLines.size() < 2)
        {
            textAlignment = ALIGN_LCR;
            return;
        }

        final int lineCount = foundLines.size();

        // First pass: accumulate sums for the averages and detect clashes
        // between consecutive lines.
        float avgX1 = 0.0f;
        float avgXcen = 0.0f;
        float avgX2 = 0.0f;
        float afs = 0.0f;
        float als = 0.0f;
        boolean clashingLines = false;

        CompositeSegment<? extends TextSegment> prevLine = null;
        for (CompositeSegment<? extends TextSegment> l : foundLines)
        {
            avgX1 += l.getX1();
            avgXcen += l.getXmid();
            avgX2 += l.getX2();
            afs += l.getFontSize();
            if (prevLine != null)
            {
                als += prevLine.getY1() - l.getY1();
                if (SegmentUtils.vertIntersect(prevLine, l.getYmid()))
                {
                    clashingLines = true;
                }
            }
            prevLine = l;
        }

        avgX1 /= lineCount;
        avgXcen /= lineCount;
        avgX2 /= lineCount;
        afs /= lineCount;
        als /= (lineCount - 1); // n lines have n - 1 gaps between them

        fontSize = afs;
        absLineSpacing = als; // absolute, no longer divided by afs (changed 30.10.10)

        // Second pass: check every line against the averages, with
        // tolerances expressed as fractions of the average font size.
        boolean constantX1 = true;
        boolean constantXcen = true;
        boolean constantX2 = true;
        boolean constantfs = true;
        boolean constantls = true;
        final float tolerance = afs * 0.5f;

        prevLine = null;
        for (CompositeSegment<? extends TextSegment> l : foundLines)
        {
            if (!Utils.within(l.getX1(), avgX1, tolerance)) constantX1 = false;
            if (!Utils.within(l.getXmid(), avgXcen, tolerance)) constantXcen = false;
            if (!Utils.within(l.getX2(), avgX2, tolerance)) constantX2 = false;
            if (!Utils.within(l.getFontSize(), afs, afs * 0.1f)) constantfs = false;
            if (prevLine != null)
            {
                float lineSpacing = prevLine.getY1() - l.getY1();
                if (!Utils.within(lineSpacing, als, afs * 0.2f)) constantls = false;
            }
            prevLine = l;
        }

        textAlignment = classifyAlignment(constantX1, constantXcen, constantX2);
        constantLS = constantls;
        constantFS = constantfs;
        uniqueLines = !clashingLines;
    }

    /**
     * Maps the three "is this coordinate constant across lines" results to
     * one of the {@code ALIGN_*} constants.
     */
    private static int classifyAlignment(boolean constantX1,
        boolean constantXcen, boolean constantX2)
    {
        if (constantX1 && constantX2) return ALIGN_LCR;
        if (constantX1 && constantXcen) return ALIGN_LC;
        if (constantXcen && constantX2) return ALIGN_CR;
        if (constantX1) return ALIGN_L;
        if (constantXcen) return ALIGN_C;
        if (constantX2) return ALIGN_R;
        return ALIGN_NONE;
    }

    // TODO: whether the 10% error allowed in the font size comparison makes
    // sense depends on which algorithm we use to work out the font size :)

    /**
     * Tells whether some found line contains the given element and has a
     * font size matching the element's, using 10% of the element's font size
     * as the tolerance argument of {@code Utils.within}.
     */
    private boolean matchesFontSizeOfContainingLine(TextSegment s)
    {
        for (CompositeSegment<? extends TextSegment> l : foundLines)
        {
            if (l.getItems().contains(s)
                && Utils.within(s.getFontSize(), l.getFontSize(), s.getFontSize() * 0.1f))
            {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns the first item (in item order) whose font size matches that of
     * a found line containing it.
     *
     * <p>Precondition: findLinesWidth has been carried out and the elements
     * are sorted in ascending order.
     *
     * @return the first matching item, or {@code null} if there is none or
     *         no lines have been found yet
     */
    public TextSegment getTopElementMatchingFontsizeAfterSorting()
    {
        if (foundLines == null)
        {
            return null;
        }
        for (TextSegment s : items)
        {
            if (matchesFontSizeOfContainingLine(s))
            {
                return s;
            }
        }
        return null;
    }

    /**
     * Returns the last item (in item order) whose font size matches that of
     * a found line containing it.
     *
     * <p>Precondition: findLinesWidth has been carried out and the elements
     * are sorted in ascending order.
     *
     * @return the last matching item, or {@code null} if there is none or
     *         no lines have been found yet
     */
    public TextSegment getBottomElementMatchingFontsizeAfterSorting()
    {
        if (foundLines == null)
        {
            return null;
        }
        for (int n = items.size() - 1; n >= 0; n--)
        {
            TextSegment s = items.get(n);
            if (matchesFontSizeOfContainingLine(s))
            {
                return s;
            }
        }
        return null;
    }
}