/**
* pdfXtk - PDF Extraction Toolkit
* Copyright (c) by the authors/contributors. All rights reserved.
* This project includes code from PDFBox and TouchGraph.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* 3. Neither the names pdfXtk or PDF Extraction Toolkit; nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* http://pdfxtk.sourceforge.net
*
*/
package at.ac.tuwien.dbai.pdfwrap.model.document;
import at.ac.tuwien.dbai.pdfwrap.utils.ListUtils;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.util.TextPosition;
import java.util.ArrayList;
import java.util.List;
/**
* Text fragment element; represents an atomic fragment corresponding
* to one COS instruction
*
* @author Tamir Hassan, pdfanalyser@tamirhassan.com
* @version PDF Analyser 0.9
*/
public class TextFragment extends CompositeSegment<CharSegment>
{
// TODO: what about rotated text/text with negative width/height?
// should these variables be included even in the base class?
/*
float xScale;
float yScale;
float wordSpacing;
float widthOfSpace;
*/
boolean overprint = false;
// 2011-01-24 unnecessary
// protected String tagName = "text-fragment";
/**
* Constructor.
*
* @param x1 The x1 coordinate of the segment.
* @param x2 The x2 coordinate of the segment.
* @param y1 The y1 coordinate of the segment.
* @param y2 The y2 coordinate of the segment.
* @param text The textual contents of the segment.
* @param font The (main) font of the segment.
* @param fontSize The (main) font size in the segment.
*/
public TextFragment(
float x1,
float x2,
float y1,
float y2,
String text,
String fontName,
float fontSize
)
{
super(x1, x2, y1, y2, text, fontName, fontSize);
}
public TextFragment(
float x1,
float x2,
float y1,
float y2,
String text,
PDFont font,
float fontSize
)
{
super(x1, x2, y1, y2, text, findFontName(font), fontSize);
}
public TextFragment(
float x1,
float x2,
float y1,
float y2
)
{
super(x1, x2, y1, y2);
}
public TextFragment()
{
super();
}
public TextFragment(CharSegment c)
{
super(c.getX1(), c.getX2(), c.getY1(), c.getY2());
items.add(c);
text = c.getText();
fontName = c.getFontName();
fontSize = c.getFontSize();
}
// not in current use (I think) -- WRONG
// now by default sets level to zero (primitive)
public TextFragment(TextPosition tPos)
{
super(tPos.getX(),
tPos.getX() + (tPos.getWidth()),
tPos.getY(),
tPos.getY() + (tPos.getFontSize() * tPos.getYScale()),
tPos.getCharacter(),
findFontName(tPos.getFont()),
tPos.getFontSize() * tPos.getYScale());
// todo: trim the name of the font
String fontName = tPos.getFont().getBaseFont();
}
/**
* This will create a TextFragment object from a TextPosition object.
* As of PDFBox 0.7.2, this is the method currently in use, which
* converts co-ordinates back to the original system.
*
* @param tPos - the TextPosition object; pageDim - page dimensions in order to
* convert co-ordinates
* @return The new TextFragment object
*/
public TextFragment(TextPosition tPos, GenericSegment pageDim)
{
super(tPos.getX(),
tPos.getX() + tPos.getWidth(),
pageDim.getY2() - tPos.getY(),
pageDim.getY2() - tPos.getY() + (tPos.getFontSize() * tPos.getYScale()),
tPos.getCharacter(),
tPos.getFont().getBaseFont(),
tPos.getFontSize() * tPos.getYScale());
// uncomment to print the contents of all text fragments to the screen
// System.out.println("Created text fragment: x1: " + tPos.getX() + " x2: " + (tPos.getX() + tPos.getWidth()) + " y1: " + tPos.getY() + " y2: " + (tPos.getY() + (tPos.getFontSize() * tPos.getYScale())) + " Text: " + text + " Font size: " + tPos.getFontSize() + " X Scale: " + tPos.getYScale() + " Y Scale: " + tPos.getYScale());
// todo: trim the name of the font
String fontName = tPos.getFont().getBaseFont();
/*
this.xScale = tPos.getXScale();
this.yScale = tPos.getYScale();
this.widthOfSpace = tPos.getWidthOfSpace();
this.wordSpacing = tPos.getWordSpacing();
*/
}
protected static String findFontName(PDFont font)
{
if (font.getBaseFont().matches("^[A-Z]{6}\\+.+"))
return font.getBaseFont().substring(7);
else return font.getBaseFont();
}
public List<OpTuple> sourceOps()
{
List<OpTuple> retVal = new ArrayList<OpTuple>();
for (CharSegment cs : items)
retVal.add(cs.getSourceOp());
ListUtils.removeDuplicates(retVal);
return retVal;
}
public boolean isOverprint() {
return overprint;
}
public void setOverprint(boolean overprint) {
this.overprint = overprint;
}
/*
* with the tagName stuff this should not be necessary
*
public void addAsXML(Document resultDocument, Element parent, GenericSegment pageDim,
float resolution)
{
//TODO: find a better name for this element?
Element newSegmentElement = resultDocument.createElement("text-fragment");
super.setElementAttributes(resultDocument, newSegmentElement, pageDim, resolution);
parent.appendChild(newSegmentElement);
}
*/
// 2011-11-02 overrides method from CompositeSegment
// to ignore whitespace characters (with sometimes fancy coordinates)
public void findBoundingBox()
{
boolean first = true;
int noItems = 0;
double fontSizeTotal = 0.0;
for (CharSegment thisSegment : items)
{
if (!(thisSegment instanceof IBlankSegment) &&
!(thisSegment.getText().equals(" ")))
{
if (thisSegment instanceof TextSegment)
{
noItems++;
fontSizeTotal += ((TextSegment) thisSegment).getFontSize();
}
if (first)
{
x1 = thisSegment.getX1();
x2 = thisSegment.getX2();
y1 = thisSegment.getY1();
y2 = thisSegment.getY2();
first = false;
} else
{
growBoundingBox(thisSegment);
}
}
}
if (noItems >= 0)
{
fontSize = (float) (fontSizeTotal / noItems);
} else
{
fontSize = -1.0f;
}
}
}