/** * pdfXtk - PDF Extraction Toolkit * Copyright (c) by the authors/contributors. All rights reserved. * This project includes code from PDFBox and TouchGraph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * 3. Neither the names pdfXtk or PDF Extraction Toolkit; nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * http://pdfxtk.sourceforge.net * */ package at.ac.tuwien.dbai.pdfwrap.model.document; import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.util.TextPosition; import java.util.List; /** * Text fragment element; represents a single character * This class is identical in functionality to TextFragment! * ... however, does not extend it in order to allow typing * of lists ... * * @author Tamir Hassan, pdfanalyser@tamirhassan.com * @version PDF Analyser 0.9 */ public class CharSegment extends TextSegment { boolean overprint = false; protected OpTuple sourceOp; /** * Constructor. * * @param x1 The x1 coordinate of the segment. * @param x2 The x2 coordinate of the segment. * @param y1 The y1 coordinate of the segment. * @param y2 The y2 coordinate of the segment. * @param text The textual contents of the segment. * @param font The (main) font of the segment. * @param fontSize The (main) font size in the segment. */ public CharSegment( float x1, float x2, float y1, float y2, String text, String fontName, float fontSize ) { super(x1, x2, y1, y2, text, fontName, fontSize); } public CharSegment( float x1, float x2, float y1, float y2, String text, PDFont font, float fontSize, OpTuple sourceOp ) { super(x1, x2, y1, y2, text, findFontName(font), fontSize); this.sourceOp = sourceOp; } public CharSegment( float x1, float x2, float y1, float y2 ) { super(x1, x2, y1, y2); } // not in current use (I think) -- WRONG // now by default sets level to zero (primitive) public CharSegment(TextPosition tPos) { super(tPos.getX(), tPos.getX() + (tPos.getWidth()), tPos.getY(), tPos.getY() + (tPos.getFontSize() * tPos.getYScale()), tPos.getCharacter(), findFontName(tPos.getFont()), tPos.getFontSize() * tPos.getYScale()); // todo: trim the name of the font String fontName = tPos.getFont().getBaseFont(); } /** * This will create a TextFragment object from a TextPosition object. * As of PDFBox 0.7.2, this is the method currently in use, which * converts co-ordinates back to the original system. * * @param tPos - the TextPosition object; pageDim - page dimensions in order to * convert co-ordinates * @return The new TextFragment object */ public CharSegment(TextPosition tPos, GenericSegment pageDim) { super(tPos.getX(), tPos.getX() + tPos.getWidth(), pageDim.getY2() - tPos.getY(), pageDim.getY2() - tPos.getY() + (tPos.getFontSize() * tPos.getYScale()), tPos.getCharacter(), tPos.getFont().getBaseFont(), tPos.getFontSize() * tPos.getYScale()); // uncomment to print the contents of all text fragments to the screen // System.out.println("Created text fragment: x1: " + tPos.getX() + " x2: " + (tPos.getX() + tPos.getWidth()) + " y1: " + tPos.getY() + " y2: " + (tPos.getY() + (tPos.getFontSize() * tPos.getYScale())) + " Text: " + text + " Font size: " + tPos.getFontSize() + " X Scale: " + tPos.getYScale() + " Y Scale: " + tPos.getYScale()); // todo: trim the name of the font String fontName = tPos.getFont().getBaseFont(); /* this.xScale = tPos.getXScale(); this.yScale = tPos.getYScale(); this.widthOfSpace = tPos.getWidthOfSpace(); this.wordSpacing = tPos.getWordSpacing(); */ } protected static String findFontName(PDFont font) { if (font.getBaseFont().matches("^[A-Z]{6}\\+.+")) return font.getBaseFont().substring(7); else return font.getBaseFont(); } @Override public List<AttributeTuple> getAttributes() { List<AttributeTuple> attributeList = super.getAttributes(); attributeList.add(new AttributeTuple("opindex", sourceOp.getOpIndex())); attributeList.add(new AttributeTuple("argindex", sourceOp.getArgIndex())); return attributeList; } public boolean isOverprint() { return overprint; } public void setOverprint(boolean overprint) { this.overprint = overprint; } public OpTuple getSourceOp() { return sourceOp; } public void setSourceOp(OpTuple sourceOp) { this.sourceOp = sourceOp; } }