/** * pdfXtk - PDF Extraction Toolkit * Copyright (c) by the authors/contributors. All rights reserved. * This project includes code from PDFBox and TouchGraph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * 3. Neither the names pdfXtk or PDF Extraction Toolkit; nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * http://pdfxtk.sourceforge.net * */ package at.ac.tuwien.dbai.pdfwrap.model.document; import org.w3c.dom.Document; import org.w3c.dom.Element; import java.util.Iterator; import java.util.List; /** * Default granular object for segmentation output * * @author Tamir Hassan, pdfanalyser@tamirhassan.com * @version PDF Analyser 0.9 */ public class TextBlock extends CompositeSegment<TextLine> implements IXHTMLSegment { // Generic TextBlock stuff // protected List<TextLine> items; protected int textAlignment; protected float lineSpacing; public float strXPosNewline = -1.0f; public final static int NO_CLASSIFICATIONS = 3; public final static int ALIGN_LCR = 31; public final static int ALIGN_LC = 32; public final static int ALIGN_CR = 33; public final static int ALIGN_L = 34; public final static int ALIGN_C = 35; public final static int ALIGN_R = 36; public final static int ALIGN_NONE = 37; public final static int ALIGN_UNSET = 0; // Paragraph stuff protected int classification; public final static int PARAGRAPH = 0; // this is only for backwards compatibility // with older stuff -- HEADING should be // updated to be a more specific OTHER_TEXT... public final static int HEADING = 1; public final static int BODY = 0; // these classifications ONLY for str. mode // as line-finding is carried out again public final static int BODY_TEXT = 40; public final static int HEADING_1 = 41; public final static int HEADING_2 = 42; public final static int HEADING_3 = 43; public final static int ORDERED_LIST_ITEM = 51; public final static int UNORDERED_LIST_ITEM = 52; //public final static int MISC = 2; // changed 29.10.06, as differing // between heading and misc will require // generic knowledge (i.e. avg. font size of page) public final static int OTHER_TEXT = 9; // this is to include headings, captions and other // misc stuff // headings can later be detected (easy!) public final static int CELL = 2; public TextBlock() { super(); } public TextBlock(List<TextLine> items) { super(); // super(theItems); this.items = items; } public TextBlock( float x1, float x2, float y1, float y2, String text, String fontName, float fontSize ) { super(x1, x2, y1, y2, text, fontName, fontSize); } public TextBlock( float x1, float x2, float y1, float y2 ) { super(x1, x2, y1, y2); } public TextBlock( float x1, float x2, float y1, float y2, String text, String fontName, float fontSize, List<TextLine> items ) { // super(x1, x2, y1, y2, text, fontName, fontSize, items); super(x1, x2, y1, y2, text, fontName, fontSize); this.items = items; } public TextBlock( float x1, float x2, float y1, float y2, List<TextLine> items ) { // super(x1, x2, y1, y2, items); super(x1, x2, y1, y2); this.items = items; } // IXMillumSegment public void setElementAttributes(Document resultDocument, Element newSegmentElement, GenericSegment pageDim, float resolution) { super.setElementAttributes(resultDocument, newSegmentElement, pageDim, resolution); // TODO: HACK -- the below lines refer to the this.getText() method, as the // text currently is not stored. But this is due to change when the // line-finding is integrated. newSegmentElement.setAttribute ("font-size", Float.toString(this.getFontSize())); // newSegmentElement.setAttribute // ("text-ratio", Float.toString(this.getTextRatio())); // newSegmentElement.setAttribute // ("info", getInfoString()); String type = "unknown"; switch(classification) { case PARAGRAPH: type = "paragraph"; break; case HEADING: type = "heading"; break; case OTHER_TEXT: type = "other-text"; break; case CELL: type = "cell"; break; default: type = "error"; } newSegmentElement.setAttribute("type", type); // System.out.println("creating text node: " + this.getText()); // done in super! //newSegmentElement.appendChild //(resultDocument.createTextNode(this.getText())); } public void addAsXHTML(Document resultDocument, Element parent)//, GenericSegment pageDim) { Element newParagraphElement, newTextElement, tempElement = null; if (classification == HEADING || classification == HEADING_3) newParagraphElement = resultDocument.createElement("h3"); else if (classification == HEADING_2) newParagraphElement = resultDocument.createElement("h2"); else if (classification == HEADING_1) newParagraphElement = resultDocument.createElement("h1"); else if (classification == UNORDERED_LIST_ITEM) { // tempElement = resultDocument.createElement("ul"); // newParagraphElement = resultDocument.createElement("li"); // 22.01.2011 <ul>s are now separate objects newParagraphElement = resultDocument.createElement("li"); } else newParagraphElement = resultDocument.createElement("p"); // HEADING_1 to HEADING_3 in str mode if (classification >= 40 && classification < 60) { boolean bold = false; boolean italic = false; boolean underlined = false; int superSubscript = 0; String textToAdd = ""; float prevX2 = -1.0f; Iterator iter1 = items.iterator(); while(iter1.hasNext()) { TextLine tl1 = (TextLine)iter1.next(); // System.out.println("tl1: " + tl1); if (tl1.getX1() < prevX2) { // new line: do we insert a carriage return? if (prevX2 < strXPosNewline) textToAdd = textToAdd + ("\n"); } prevX2 = tl1.getX2(); Iterator iter2 = tl1.getItems().iterator(); while(iter2.hasNext()) { TextLine tl2 = (TextLine)iter2.next(); // System.out.println("tl2: " + tl2); TextFragment prevFrag = null; Iterator iter3 = tl2.getItems().iterator(); while(iter3.hasNext()) { TextFragment tf = (TextFragment)iter3.next(); // System.out.println("tf: " + tf); // System.out.println("tf is superSubscript: " + tf.isStrIsUnderlined()); // if neither matches the whitespace character // and horiz gap > 0.25(afs) if (prevFrag != null) { float horizGap = tf.getX1() - prevFrag.getX2(); float afs = (tf.getFontSize() + prevFrag.getFontSize()) / 2.0f; if (!(tf.getText().trim().matches("[\\s]") || prevFrag.getText().trim().matches("[\\s]")) && horizGap > afs * 0.15f) { textToAdd = textToAdd + " "; } } if (tf.isBold() == bold && tf.isItalic() == italic && tf.isUnderlined() == underlined && tf.getSuperSubscript() == superSubscript) { // same style as previous character // textToAdd.concat(tf.getText()); // doesn't work?!? textToAdd = textToAdd + (tf.getText()); } else { // add text // if (textToAdd.length() > 0) if (textToAdd.trim().length() > 0) { if (superSubscript == 1) { if (underlined) { // System.out.println("underlined with textToAdd: " + textToAdd); // System.out.println("textToAdd.length: " + textToAdd.trim().length()); if (bold && italic) { Element newTextElement4 = resultDocument.createElement("sup"); newParagraphElement.appendChild(newTextElement4); Element newTextElement3 = resultDocument.createElement("u"); newTextElement4.appendChild(newTextElement3); Element newTextElement2 = resultDocument.createElement("b"); newTextElement3.appendChild(newTextElement2); newTextElement = resultDocument.createElement("i"); newTextElement2.appendChild(newTextElement); } else if (bold) { Element newTextElement3 = resultDocument.createElement("sup"); newParagraphElement.appendChild(newTextElement3); Element newTextElement2 = resultDocument.createElement("u"); newTextElement3.appendChild(newTextElement2); newTextElement = resultDocument.createElement("b"); newTextElement2.appendChild(newTextElement); } else if (italic) { Element newTextElement3 = resultDocument.createElement("sup"); newParagraphElement.appendChild(newTextElement3); Element newTextElement2 = resultDocument.createElement("u"); newTextElement3.appendChild(newTextElement2); newTextElement = resultDocument.createElement("i"); newTextElement2.appendChild(newTextElement); } else { Element newTextElement2 = resultDocument.createElement("sup"); newParagraphElement.appendChild(newTextElement2); newTextElement = resultDocument.createElement("u"); newTextElement2.appendChild(newTextElement); } } else { if (bold && italic) { Element newTextElement3 = resultDocument.createElement("sup"); newParagraphElement.appendChild(newTextElement3); Element newTextElement2 = resultDocument.createElement("b"); newTextElement3.appendChild(newTextElement2); newTextElement = resultDocument.createElement("i"); newTextElement2.appendChild(newTextElement); } else if (bold) { Element newTextElement2 = resultDocument.createElement("sup"); newParagraphElement.appendChild(newTextElement2); newTextElement = resultDocument.createElement("b"); newTextElement2.appendChild(newTextElement); } else if (italic) { Element newTextElement2 = resultDocument.createElement("sup"); newParagraphElement.appendChild(newTextElement2); newTextElement = resultDocument.createElement("i"); newTextElement2.appendChild(newTextElement); } else { newTextElement = resultDocument.createElement("sup"); newParagraphElement.appendChild(newTextElement); } } } else if (superSubscript == -1) { // System.out.println("outputting subscript"); if (underlined) { // System.out.println("underlined with textToAdd: " + textToAdd); // System.out.println("textToAdd.length: " + textToAdd.trim().length()); if (bold && italic) { Element newTextElement4 = resultDocument.createElement("sub"); newParagraphElement.appendChild(newTextElement4); Element newTextElement3 = resultDocument.createElement("u"); newTextElement4.appendChild(newTextElement3); Element newTextElement2 = resultDocument.createElement("b"); newTextElement3.appendChild(newTextElement2); newTextElement = resultDocument.createElement("i"); newTextElement2.appendChild(newTextElement); } else if (bold) { Element newTextElement3 = resultDocument.createElement("sub"); newParagraphElement.appendChild(newTextElement3); Element newTextElement2 = resultDocument.createElement("u"); newTextElement3.appendChild(newTextElement2); newTextElement = resultDocument.createElement("b"); newTextElement2.appendChild(newTextElement); } else if (italic) { Element newTextElement3 = resultDocument.createElement("sub"); newParagraphElement.appendChild(newTextElement3); Element newTextElement2 = resultDocument.createElement("u"); newTextElement3.appendChild(newTextElement2); newTextElement = resultDocument.createElement("i"); newTextElement2.appendChild(newTextElement); } else { Element newTextElement2 = resultDocument.createElement("sub"); newParagraphElement.appendChild(newTextElement2); newTextElement = resultDocument.createElement("u"); newTextElement2.appendChild(newTextElement); } } else { if (bold && italic) { Element newTextElement3 = resultDocument.createElement("sub"); newParagraphElement.appendChild(newTextElement3); Element newTextElement2 = resultDocument.createElement("b"); newTextElement3.appendChild(newTextElement2); newTextElement = resultDocument.createElement("i"); newTextElement2.appendChild(newTextElement); } else if (bold) { Element newTextElement2 = resultDocument.createElement("sub"); newParagraphElement.appendChild(newTextElement2); newTextElement = resultDocument.createElement("b"); newTextElement2.appendChild(newTextElement); } else if (italic) { Element newTextElement2 = resultDocument.createElement("sub"); newParagraphElement.appendChild(newTextElement2); newTextElement = resultDocument.createElement("i"); newTextElement2.appendChild(newTextElement); } else { newTextElement = resultDocument.createElement("sub"); newParagraphElement.appendChild(newTextElement); } } } else // normal text (required in order to initialize newTextElement) { if (underlined) { // System.out.println("underlined with textToAdd: " + textToAdd); // System.out.println("textToAdd.length: " + textToAdd.trim().length()); if (bold && italic) { Element newTextElement3 = resultDocument.createElement("u"); newParagraphElement.appendChild(newTextElement3); Element newTextElement2 = resultDocument.createElement("b"); newTextElement3.appendChild(newTextElement2); newTextElement = resultDocument.createElement("i"); newTextElement2.appendChild(newTextElement); } else if (bold) { Element newTextElement2 = resultDocument.createElement("u"); newParagraphElement.appendChild(newTextElement2); newTextElement = resultDocument.createElement("b"); newTextElement2.appendChild(newTextElement); } else if (italic) { Element newTextElement2 = resultDocument.createElement("u"); newParagraphElement.appendChild(newTextElement2); newTextElement = resultDocument.createElement("i"); newTextElement2.appendChild(newTextElement); } else { newTextElement = resultDocument.createElement("u"); newParagraphElement.appendChild(newTextElement); } } else { if (bold && italic) { Element newTextElement2 = resultDocument.createElement("b"); newParagraphElement.appendChild(newTextElement2); newTextElement = resultDocument.createElement("i"); newTextElement2.appendChild(newTextElement); } else if (bold) { // System.out.println("bold with textToAdd: " + textToAdd); // System.out.println("textToAdd.length: " + textToAdd.trim().length()); newTextElement = resultDocument.createElement("b"); newParagraphElement.appendChild(newTextElement); } else if (italic) { newTextElement = resultDocument.createElement("i"); newParagraphElement.appendChild(newTextElement); } else { newTextElement = newParagraphElement; } } } // the following lines would just add the string // without <br/>s //newColumnElement.appendChild //(resultDocument.createTextNode(theText)); String textSection = new String(); for (int n = 0; n < textToAdd.length(); n ++) { String thisChar = textToAdd.substring(n, n + 1); if (thisChar.equals("\n")) { newTextElement.appendChild (resultDocument.createTextNode(textSection)); newTextElement.appendChild (resultDocument.createElement("br")); textSection = ""; } else { textSection = textSection.concat(thisChar); } } if (textSection.length() > 0) newTextElement.appendChild (resultDocument.createTextNode(textSection)); } // update bold and italic bold = tf.isBold(); italic = tf.isItalic(); underlined = tf.isUnderlined(); textToAdd = ""; textToAdd = textToAdd + (tf.getText()); superSubscript = tf.getSuperSubscript(); } prevFrag = tf; } } } // if remaining text if(textToAdd.trim().length() > 0) { if (bold && italic) { Element newTextElement2 = resultDocument.createElement("b"); newParagraphElement.appendChild(newTextElement2); newTextElement = resultDocument.createElement("i"); newTextElement2.appendChild(newTextElement); } else if (bold) { newTextElement = resultDocument.createElement("b"); newParagraphElement.appendChild(newTextElement); } else if (italic) { newTextElement = resultDocument.createElement("i"); newParagraphElement.appendChild(newTextElement); } else { newTextElement = newParagraphElement; } // the following lines would just add the string // without <br/>s //newColumnElement.appendChild //(resultDocument.createTextNode(theText)); String textSection = new String(); for (int n = 0; n < textToAdd.length(); n ++) { String thisChar = textToAdd.substring(n, n + 1); if (thisChar.equals("\n")) { newTextElement.appendChild (resultDocument.createTextNode(textSection)); newTextElement.appendChild (resultDocument.createElement("br")); textSection = ""; } else { textSection = textSection.concat(thisChar); } } if (textSection.length() > 0) newTextElement.appendChild (resultDocument.createTextNode(textSection)); } } else // normal mode { if (isBold() && isItalic()) { Element newTextElement2 = resultDocument.createElement("b"); newParagraphElement.appendChild(newTextElement2); newTextElement = resultDocument.createElement("i"); newTextElement2.appendChild(newTextElement); } else if (isBold()) { newTextElement = resultDocument.createElement("b"); newParagraphElement.appendChild(newTextElement); } else if (isItalic()) { newTextElement = resultDocument.createElement("i"); newParagraphElement.appendChild(newTextElement); } else { newTextElement = newParagraphElement; } String theText = this.getText(); // the following lines would just add the string // without <br/>s //newColumnElement.appendChild //(resultDocument.createTextNode(theText)); String textSection = new String(); for (int n = 0; n < theText.length(); n ++) { String thisChar = theText.substring(n, n + 1); if (thisChar.equals("\n")) { newTextElement.appendChild (resultDocument.createTextNode(textSection)); newTextElement.appendChild (resultDocument.createElement("br")); textSection = ""; } else { textSection = textSection.concat(thisChar); } } if (textSection.length() > 0) newTextElement.appendChild (resultDocument.createTextNode(textSection)); } /* newParagraphElement.appendChild (resultDocument.createTextNode(this.getText())); */ // 22.01.2011 <ul>s are now separate objects /* if (classification == UNORDERED_LIST_ITEM) { tempElement.appendChild(newParagraphElement); parent.appendChild(tempElement); } else */ parent.appendChild(newParagraphElement); } public int getTextAlignment() { return textAlignment; } public void setTextAlignment(int textAlignment) { this.textAlignment = textAlignment; } public float getLineSpacing() { return lineSpacing; } public void setLineSpacing(float lineSpacing) { this.lineSpacing = lineSpacing; } public float getStrXPosNewline() { return strXPosNewline; } public void setStrXPosNewline(float strXPosNewline) { this.strXPosNewline = strXPosNewline; } }