/** * pdfXtk - PDF Extraction Toolkit * Copyright (c) by the authors/contributors. All rights reserved. * This project includes code from PDFBox and TouchGraph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * 3. Neither the names pdfXtk or PDF Extraction Toolkit; nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * http://pdfxtk.sourceforge.net * */ package at.ac.tuwien.dbai.pdfwrap.model.document; import at.ac.tuwien.dbai.pdfwrap.utils.Utils; import org.w3c.dom.Document; import org.w3c.dom.Element; import java.util.List; /** * This represents a text segment (base class) * * @author Tamir Hassan, pdfanalyser@tamirhassan.com * @version PDF Analyser 0.9 */ public class TextSegment extends GenericSegment { // only for str conversions -- is not saved or represented in graph protected boolean isUnderlined; protected int superSubscript = 0; // -1 for subscript, 1 for superscript, 0 for normal protected String text; // protected PDFont segFont; protected float fontSize; protected String fontName; public String getFontName() { return fontName; } public void setFontName(String fontName) { this.fontName = fontName; } /** * Constructor. * * @param x1 The x1 coordinate of the segment. * @param x2 The x2 coordinate of the segment. * @param y1 The y1 coordinate of the segment. * @param y2 The y2 coordinate of the segment. * @param text The textual contents of the segment. * @param font The (main) font of the segment. * @param fontSize The (main) font size in the segment. */ public TextSegment( float x1, float x2, float y1, float y2, String text, String fontName, float fontSize ) { super(x1, x2, y1, y2); this.text = text; // this.segFont = font; // TODO: remove subsetting preceding chars this.fontName = fontName; this.setFontSize(fontSize); // also sets node text font } public TextSegment( float x1, float x2, float y1, float y2 ) { super(x1, x2, y1, y2); this.text = Utils.EMPTY_STRING; } public TextSegment() { super(); this.text = Utils.EMPTY_STRING; } /** * This will get the text of the segment. * * @return The text. */ public String getText() { return text; } /** * @param scale The text to set. */ public void setText(String text) { this.text = text; } /** * This will get the font size of the segment. * * @return The font size of the segment. */ public float getFontSize() { return fontSize; } /** * @param scale The font size to set. */ public void setFontSize(float fontSize) { this.fontSize = fontSize; } /** * This will return whether the text segment is 'empty', i.e. * contains just an empty string or a string with only spaces * * @return TRUE if empty */ public boolean isEmpty() { if (text != null) return (text.trim().length() == 0); else return false; } @Override public void mergeSegment(GenericSegment seg) { super.mergeSegment(seg); //Merge the text content of the two segments text = text.concat(" ").concat(((TextSegment)seg).getText()); } @Override public List<AttributeTuple> getAttributes() { List<AttributeTuple> attributeList = super.getAttributes(); attributeList.add(new AttributeTuple("text", text)); attributeList.add(new AttributeTuple("fontsize", fontSize)); attributeList.add(new AttributeTuple("font", fontName)); attributeList.add(new AttributeTuple("bold", isBold())); attributeList.add(new AttributeTuple("italic", isItalic())); return attributeList; } public boolean isBold() { if (fontName == null) return false; if (Utils.containsSubstring(fontName, "Bold") || Utils.containsSubstring(fontName, "bold") || Utils.containsSubstring(fontName, "Black") || Utils.containsSubstring(fontName, "black") || Utils.containsSubstring(fontName, "Heavy") || Utils.containsSubstring(fontName, "heavy")) return true; else return false; } public boolean isItalic() { if (fontName == null) return false; if (Utils.containsSubstring(fontName, "Italic") || Utils.containsSubstring(fontName, "italic") || Utils.containsSubstring(fontName, "Cursive") || Utils.containsSubstring(fontName, "cursive") || Utils.containsSubstring(fontName, "Kursiv") || Utils.containsSubstring(fontName, "kursiv")) return true; else return false; } public boolean isCapitals() { // TODO: 22.01.2011 include regexp or remove this method! used anywhere? // doesn't include lower-case A-Z return false; } public boolean isNumeric() { // TODO: 22.01.2011 include regexp or remove this method! used anywhere? // doesn't include A-Z/a-z return false; } /* public void addAsXML(Document resultDocument, Element parent, GenericSegment pageDim, float resolution) { // TODO: inelegant solution //if (potentialTable) tagName = "potential-table-cell"; //TODO: find a better name for this element? Element newSegmentElement = resultDocument.createElement(getTagName()); this.setElementAttributes(resultDocument, newSegmentElement, pageDim, resolution); parent.appendChild(newSegmentElement); } */ public void setElementAttributes (Document resultDocument, Element newSegmentElement, GenericSegment pageDim, float resolution) { // float fontSizeRatio = (resolution / SCREEN_RESOLUTION) * // FONT_SIZE_RATIO; // 22.01.2011 why font size ratio? newSegmentElement.setAttribute ("font-name", this.getFontName()); newSegmentElement.setAttribute ("font-size", Float.toString(this.getFontSize())); // * fontSizeRatio)); newSegmentElement.setAttribute ("bold", Boolean.toString(isBold())); newSegmentElement.setAttribute ("italic", Boolean.toString(isItalic())); //newSegmentElement.setAttribute // ("text-ratio", Float.toString(this.getTextRatio())); // XMIllum does not like 0-length text // this method will allow us to easily see when this might happen! //System.out.println("foobar"); if (this.isEmpty()) { if (this.getText().length() > 0) { newSegmentElement.appendChild (resultDocument.createTextNode("[empty:spaces]")); } else { newSegmentElement.appendChild (resultDocument.createTextNode("[empty:empty]")); } } else { //if (this.getClass() == TableCell.class) //System.out.println("class: " + this.getClass()); //System.out.println("appending text: " + Utils.removeInvalidXMLCharacters(this.getSegText())); //resultDocument.removeTextNode(); newSegmentElement.appendChild (resultDocument.createTextNode(Utils.removeInvalidXMLCharacters(this.getText()))); } super.setElementAttributes(resultDocument, newSegmentElement, pageDim, resolution); } public boolean isUnderlined() { return isUnderlined; } public void setUnderlined(boolean isUnderlined) { this.isUnderlined = isUnderlined; } public int getSuperSubscript() { return superSubscript; } public void setSuperSubscript(int superSubscript) { this.superSubscript = superSubscript; } }