/**
* pdfXtk - PDF Extraction Toolkit
* Copyright (c) by the authors/contributors. All rights reserved.
* This project includes code from PDFBox and TouchGraph.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* 3. Neither the names pdfXtk or PDF Extraction Toolkit; nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* http://pdfxtk.sourceforge.net
*
*/
package at.ac.tuwien.dbai.pdfwrap.pdfread;
import at.ac.tuwien.dbai.pdfwrap.model.document.*;
import at.ac.tuwien.dbai.pdfwrap.utils.SegmentUtils;
import at.ac.tuwien.dbai.pdfwrap.utils.Utils;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.pdmodel.encryption.PDEncryptionDictionary;
import org.apache.pdfbox.pdmodel.encryption.PDStandardEncryption;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAppearanceDictionary;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAppearanceStream;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.apache.pdfbox.util.*;
import java.awt.*;
import java.awt.geom.AffineTransform;
import java.awt.geom.GeneralPath;
import java.awt.geom.Point2D;
import java.awt.image.BufferedImage;
import java.io.IOException;
import java.util.*;
import java.util.List;
/**
* Extracts the low-level objects of a PDF page (characters, text fragments,
* lines, rectangles and images).
*
* Based on PDFBox code.
* @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
* @author Tamir Hassan, pdfanalyser@tamirhassan.com
* @version PDF Analyser 0.9
*/
public class PDFObjectExtractor extends PDFStreamEngine
{
// History note (13.04.09, 23:32):
// showString and addCharacter were copied from PDFObjectExtractorOld.java;
// these methods did not exist in this class before.
// Adding of text block items within showCharacter was disabled.
// Refactoring would be a good idea...
//
// Byte code of the space character (32); showString uses it to look up the
// width of a space in the current font.
// Copied from PDFStreamEngine as it is not visible to this class.
public static final byte[] SPACE_BYTES = { (byte)32 };
// Graphics context, page size and page currently being processed; all three
// are assigned at the start of drawPage.
protected Graphics2D graphics;
protected Dimension pageSize;
protected PDPage page;
// Sub-paths (cast to GeneralPath by fillPath/StrokePath) and the current path.
protected List lineSubPaths = new ArrayList();
protected GeneralPath linePath = new GeneralPath();
// 100409: necessary?
// Colours tracked by the colour getters/setters of this class;
// setNonStrokingColor also copies its argument into currentColor.
protected Color strokingColor = Color.BLACK;
protected Color nonStrokingColor = Color.BLACK;
protected Color currentColor = Color.black;
// Result lists, one per kind of extracted object.
protected List<ImageSegment> imageList = new ArrayList<ImageSegment>();
// Text fragments built by addCharacter.
protected List<TextFragment> fragmentList = new ArrayList<TextFragment>();
protected List<LineSegment> lineList = new ArrayList<LineSegment>();
protected List<RectSegment> rectList = new ArrayList<RectSegment>();
// 22.01.07
// Individual characters; addCharacter adds every character whose text is not a single space.
protected List<CharSegment> charList = new ArrayList<CharSegment>();
// current subpath
protected List<LineSegment> currentLines = new ArrayList<LineSegment>();
protected List<RectSegment> currentRects = new ArrayList<RectSegment>();
// current path
protected List<LineSegment> linesToAdd = new ArrayList<LineSegment>();
protected List<RectSegment> rectsToAdd = new ArrayList<RectSegment>();
// The following relate to subpaths!
// Subpaths containing curves are thrown away.
protected boolean pathContainsCurve = false;
protected float pathBeginX, pathBeginY;
protected boolean pathBeginSet = false, pathClosed = false;
// new from 23 Apr. 09
protected GenericSegment clipBounds = null;
protected Stack clipBoundsStack = new Stack();
protected float currentX = -1;
protected float currentY = -1;
protected boolean newPath = false;
// The string fragment built by the previous showString call; showString compares
// each new string against it to detect a string printed twice at the same place.
protected CompositeSegment lastStringFragment = null;
// 24.04.09 Tj operator
// When mergeAcrossTextInstructions is set, addCharacter appends a character to the
// last fragment exactly when newTextFragment is false (and resets the flag afterwards).
protected boolean newTextFragment = false;
protected boolean mergeAcrossTextInstructions = false;
// Incremented by processOperator for every operator (so the first operator has
// index 0); showString stores it in the OpTuple of each character.
protected int opIndex = -1;
// NOTE(review): the page-range, bookmark and article fields below are not used
// in the part of the class reviewed here -- presumably leftovers from the
// PDFBox text stripper; verify before relying on them.
protected int currentPageNo = 0;
protected int startPage = 1;
protected int endPage = Integer.MAX_VALUE;
protected PDOutlineItem startBookmark = null;
protected int startBookmarkPageNumber = -1;
protected PDOutlineItem endBookmark = null;
protected int endBookmarkPageNumber = -1;
protected PDDocument document;
protected boolean suppressDuplicateOverlappingText = true;
protected boolean shouldSeparateByBeads = true;
protected boolean sortByPosition = false;
protected List pageArticles = null;
/**
* The charactersByArticle vector is used to extract text by article divisions.
* For example, for a PDF that has two columns like a newspaper, we want to
* extract the first column and then the second column. In this example the
* PDF would have 2 beads (or articles), one for each column. The size of
* charactersByArticle would be 5, because not all text on the screen will
* fall into one of the articles. The five divisions are shown below:
*
* Text before first article
* first article text
* text between first article and second article
* second article text
* text after second article
*
* Most PDFs won't have any beads, so charactersByArticle will contain a single entry.
*/
protected Vector charactersByArticle = new Vector();
protected Map characterListMapping = new HashMap();
// Separators; the line and page separators both default to the platform line separator.
protected String lineSeparator = System.getProperty("line.separator");
protected String pageSeparator = System.getProperty("line.separator");
protected String wordSeparator = " ";
/**
* Default constructor. Loads the properties from the
* <code>PDFObjectExtractor.properties</code> resource via ResourceLoader and
* passes them to the PDFStreamEngine constructor.
*
* @throws IOException If there is an error loading properties from the file.
*/
public PDFObjectExtractor() throws IOException
{
// Earlier revisions loaded the properties from other locations:
// super( ResourceLoader.loadProperties( "Resources/PDFObjectExtractor.properties", true ) );
// 100904: the meaning of the boolean parameter is not known here, but it was set to false before...
// super( ResourceLoader.loadProperties( Utils.getRootDir() + "/Resources/PDFObjectExtractor.properties", false ) );
super( ResourceLoader.loadProperties( "PDFObjectExtractor.properties", false ) );
}
/**
* Processes a page: first its content stream, then the normal appearance
* stream of every annotation on the page.
*
* A page without a content stream (<code>getContents()</code> returns null)
* is treated as an empty page instead of failing with a
* NullPointerException; its annotations are still processed. Annotations
* without an appearance dictionary, without normal appearances, or without
* an appearance stream of the selected name are skipped.
*
* @param g The graphics context to draw onto; it is cast to Graphics2D.
* @param p The page to draw.
* @param pageDimension The size of the page to draw.
*
* @throws IOException If there is an IO error while drawing the page.
*/
public void drawPage( Graphics g, PDPage p, Dimension pageDimension ) throws IOException
{
graphics = (Graphics2D)g;
page = p;
pageSize = pageDimension;
graphics.setRenderingHint( RenderingHints.KEY_ANTIALIASING, RenderingHints.VALUE_ANTIALIAS_ON );
// Only process the content stream if there is one; otherwise the page is
// simply empty and only the annotations remain to be handled.
PDStream contents = page.getContents();
if( contents != null )
{
PDResources resources = page.findResources();
processStream( page, resources, contents.getStream() );
}
List annotations = page.getAnnotations();
for( int i=0; i<annotations.size(); i++ )
{
PDAnnotation annot = (PDAnnotation)annotations.get( i );
PDRectangle rect = annot.getRectangle();
String appearanceName = annot.getAppearanceStream();
PDAppearanceDictionary appearDictionary = annot.getAppearance();
if( appearDictionary == null )
{
continue;
}
if( appearanceName == null )
{
appearanceName = "default";
}
Map appearanceMap = appearDictionary.getNormalAppearance();
if( appearanceMap == null )
{
// no normal appearance entry at all
continue;
}
PDAppearanceStream appearance =
(PDAppearanceStream)appearanceMap.get( appearanceName );
if( appearance == null )
{
continue;
}
// Move the origin to the lower-left corner of the annotation rectangle
// while its appearance stream is processed ...
g.translate( (int)rect.getLowerLeftX(), (int)-rect.getLowerLeftY() );
try
{
processSubStream( page, appearance.getResources(), appearance.getStream() );
}
finally
{
// ... and always move it back, even if processing the stream fails,
// so that the graphics context is not left shifted.
g.translate( (int)-rect.getLowerLeftX(), (int)+rect.getLowerLeftY() );
}
}
// Note on page transformations (currently not applied here): they should be
// done in the order 1 - translate, 2 - rotate, 3 - scale.
// Refer to PDF Reference p176 (or 188 in xpdf).
}
/**
* Counts the operator and then hands it on to the standard operator
* processing of the superclass. The running count (opIndex) is what
* showString records as the source operator of the characters it creates.
*
* @param arg0 The operator to process.
* @param arg1 The operands of the operator.
*
* @throws IOException If the superclass fails to process the operator.
*/
@Override
protected void processOperator(PDFOperator arg0, List arg1)
throws IOException
{
// advance the operator index before the operator itself is handled
opIndex += 1;
super.processOperator(arg0, arg1);
}
/**
* Processes the bytes of a string that is being shown.
*
* The bytes are decoded with the current font, one character code at a time
* (a second byte is added when a single byte cannot be decoded). For every
* code the text matrix is advanced and a CharSegment is created. The
* characters are then grouped into a CompositeSegment; if that segment has
* the same text and (within a tolerance) the same bounding box as the
* segment of the previous call, the string is regarded as an overprint and
* its characters are dropped. Otherwise every character is passed to
* addCharacter.
*
* Only page rotations of 0, 90/-270 and 270/-90 degrees are handled
* explicitly.
*
* @param string The bytes of the string to display, as encoded for the current font.
* @param argIndex Index that is stored, together with the current operator
* index, in the OpTuple attached to every character created here.
*
* @throws IOException If there is an error showing the string
*/
public void showString( byte[] string , int argIndex) throws IOException
{
// identifies the operator/argument that the characters of this string come from
OpTuple sourceOp = new OpTuple(opIndex, argIndex);
//super.showString(string);
// CharSegments created for this string; added to the result only at the very end
ArrayList charactersToAdd = new ArrayList();
// NOTE: individualWidths, spaceWidth, wordSpacingDisplacement, totalStringWidth
// and totalStringHeight are computed below but only consumed by the
// commented-out showCharacter call further down.
float[] individualWidths = new float[2048];
float spaceWidth = 0;
float spacing = 0;
StringBuffer stringResult = new StringBuffer(string.length);
float characterHorizontalDisplacement = 0;
float characterVerticalDisplacement = 0;
float spaceDisplacement = 0;
float fontSize = getGraphicsState().getTextState().getFontSize();
float horizontalScaling = getGraphicsState().getTextState().getHorizontalScalingPercent()/100f;
float verticalScaling = horizontalScaling;//not sure if this is right but what else to do???
float rise = getGraphicsState().getTextState().getRise();
final float wordSpacing = getGraphicsState().getTextState().getWordSpacing();
final float characterSpacing = getGraphicsState().getTextState().getCharacterSpacing();
float wordSpacingDisplacement = 0;
//We won't know the actual number of characters until
//we process the byte data (could be two bytes each) but
//it won't ever be more than string.length*2 (there are some cases
//where a single byte will result in two output characters, e.g. "fi")
PDFont font = getGraphicsState().getTextState().getFont();
//This will typically be 1000 but in the case of a type3 font
//this might be a different number
float glyphSpaceToTextSpaceFactor = 1f/font.getFontMatrix().getValue( 0, 0 );
float averageWidth = font.getAverageFontWidth();
// identity matrix, except that the text rise is stored in element (2,1)
Matrix initialMatrix = new Matrix();
initialMatrix.setValue(0,0,1);
initialMatrix.setValue(0,1,0);
initialMatrix.setValue(0,2,0);
initialMatrix.setValue(1,0,0);
initialMatrix.setValue(1,1,1);
initialMatrix.setValue(1,2,0);
initialMatrix.setValue(2,0,0);
initialMatrix.setValue(2,1,rise);
initialMatrix.setValue(2,2,1);
// number of bytes consumed for the current character code (1 or 2)
int codeLength = 1;
Matrix ctm = getGraphicsState().getCurrentTransformationMatrix();
//lets see what the space displacement should be
spaceDisplacement = (font.getFontWidth( SPACE_BYTES, 0, 1 )/glyphSpaceToTextSpaceFactor);
if( spaceDisplacement == 0 )
{
// the font reports no width for the space character: fall back to the average width
spaceDisplacement = (averageWidth/glyphSpaceToTextSpaceFactor);
//The average space width appears to be higher than necessary
//so lets make it a little bit smaller.
spaceDisplacement *= .80f;
}
int pageRotation = page.findRotation();
// rise * text matrix * current transformation matrix
Matrix trm = initialMatrix.multiply( getTextMatrix() ).multiply( ctm );
float x = trm.getValue(2,0);
float y = trm.getValue(2,1);
// Adjust the translation part of trm for the page rotation: for unrotated
// pages y is measured from the top of the media box; for rotated pages the
// x and y translations are swapped.
if( pageRotation == 0 )
{
trm.setValue( 2,1, -y + page.findMediaBox().getHeight() );
}
else if( pageRotation == 90 || pageRotation == -270 )
{
trm.setValue( 2,0, y );
trm.setValue( 2,1, x ); //2013-03-12 fixed shifted coordinates in e.g. eu-014.pdf (table dataset)
// TODO: update GUI code to correct positioning of raster graphic
// trm.setValue( 2,1, x + page.findMediaBox().getHeight() - page.findMediaBox().getWidth() );
}
else if( pageRotation == 270 || pageRotation == -90 )
{
trm.setValue( 2,0, -y + page.findMediaBox().getHeight() );
trm.setValue( 2,1, x );
}
float xScale = trm.getXScale();
float yScale = trm.getYScale();
float xPos = trm.getXPosition();
float yPos = trm.getYPosition();
spaceWidth = spaceDisplacement * xScale * fontSize;
wordSpacingDisplacement = wordSpacing*xScale * fontSize;
float totalStringWidth = 0;
// addition 15.04.09
// to go back to older version, simply make x and yTrans 0
// this handles (more or less) edocsacs.pdf
// xTrans/yTrans: offset between the position in trm and the position in the
// text matrix at the start of the string; computed once so that the same
// value is applied to every character of the string.
float xPosBefore2 = getTextMatrix().getXPosition();
float yPosBefore2 = getTextMatrix().getYPosition();
float xTrans = 0, yTrans = 0;
if (pageRotation == 0)
{
xTrans = xPos - xPosBefore2;
//System.out.println("xPos: " + xPos + " tmxp: " + getTextMatrix().getXPosition() + " xTrans: " + xTrans);
yTrans = yPos - yPosBefore2;
//System.out.println("yPos: " + yPos + " tmyp: " + getTextMatrix().getYPosition() + " yTrans: " + yTrans);
}
else if (pageRotation == 90 || pageRotation == -270)
{
xTrans = xPos - yPosBefore2;
///// System.out.println("xPos: " + xPos + " tmyp: " + getTextMatrix().getYPosition() + " xTrans: " + xTrans);
yTrans = yPos - xPosBefore2;
///// System.out.println("yPos: " + yPos + " tmxp: " + getTextMatrix().getXPosition() + " yTrans: " + yTrans);
}
else if (pageRotation == 270 || pageRotation == -90)
{
yTrans = xPos - yPosBefore2;
///// System.out.println("xPos: " + xPos + " tmyp: " + getTextMatrix().getYPosition() + " xTrans: " + xTrans);
xTrans = yPos - xPosBefore2;
///// System.out.println("yPos: " + yPos + " tmxp: " + getTextMatrix().getXPosition() + " yTrans: " + yTrans);
}
//xTrans = 0;
//yTrans = 0;
// end addition
// one iteration per character code; codeLength is the number of bytes it used
for( int i=0; i<string.length; i+=codeLength )
{
codeLength = 1;
String c = font.encode( string, i, codeLength );
if( c == null && i+1<string.length)
{
//maybe a multibyte encoding
codeLength++;
c = font.encode( string, i, codeLength );
}
//todo, handle horizontal displacement
// System.out.println("font: " + font + " font width: " + font.getFontWidth( string, i, codeLength ));
characterHorizontalDisplacement = (font.getFontWidth( string, i, codeLength )/glyphSpaceToTextSpaceFactor);
// running maximum of the glyph heights seen so far in this string
characterVerticalDisplacement =
Math.max(
characterVerticalDisplacement,
font.getFontHeight( string, i, codeLength)/glyphSpaceToTextSpaceFactor);
// PDF Spec - 5.5.2 Word Spacing
//
// Word spacing works the same way as character spacing, but applies
// only to the space character, code 32.
//
// Note: Word spacing is applied to every occurrence of the single-byte
// character code 32 in a string. This can occur when using a simple
// font or a composite font that defines code 32 as a single-byte code.
// It does not apply to occurrences of the byte value 32 in multiple-byte
// codes.
//
// RDD - My interpretation of this is that only character code 32's that
// encode to spaces should have word spacing applied. Cases have been
// observed where a font has a space character with a character code
// other than 32, and where word spacing (Tw) was used. In these cases,
// applying word spacing to either the non-32 space or to the character
// code 32 non-space resulted in errors consistent with this interpretation.
//
if( (string[i] == 0x20) && c != null && c.equals( " " ) )
{
spacing = wordSpacing + characterSpacing;
}
else
{
spacing = characterSpacing;
}
// We want to update the textMatrix using the width, in text space units.
//
//The adjustment will always be zero. The adjustment as shown in the
//TJ operator will be handled separately.
float adjustment=0;
//todo, need to compute the vertical displacement
float ty = 0;
// tx: advance of the text matrix, including character/word spacing
float tx = ((characterHorizontalDisplacement-adjustment/glyphSpaceToTextSpaceFactor)*fontSize + spacing)
*horizontalScaling;
// tx2, td2 added by TH sometime
// tx2: width of the glyph alone, without character/word spacing
float tx2 = (characterHorizontalDisplacement-adjustment/glyphSpaceToTextSpaceFactor)*fontSize * horizontalScaling;
// System.out.println("String: " + string + " tx: " + tx + " tx2: " + tx2);
// System.out.println("characterHorizontalDisplacement: " + characterHorizontalDisplacement);
// System.out.println("horizontalScaling: " + horizontalScaling);
// System.out.println("adjustment: " + adjustment);
tx2 = (characterHorizontalDisplacement)*fontSize * horizontalScaling;
Matrix td = new Matrix();
td.setValue( 2, 0, tx );
td.setValue( 2, 1, ty );
Matrix td2 = new Matrix();
td2.setValue( 2, 0, tx2 );
td2.setValue( 2, 1, ty );
// remember where this character starts before the text matrix is advanced
float xPosBefore = getTextMatrix().getXPosition();
float yPosBefore = getTextMatrix().getYPosition();
// textMatrix2 is a copy advanced by the glyph width only (td2); the real
// text matrix is advanced by the full displacement (td)
Matrix textMatrix2 = (Matrix)getTextMatrix().clone();
setTextMatrix(td.multiply( getTextMatrix() ));
textMatrix2 = td2.multiply( textMatrix2 );
/*
// addition 15.04.09
* moved above as it needs to be the same value for the string...
float xTrans = xPos - xPosBefore;
System.out.println("xTrans: " + xTrans);
float yTrans = yPos - yPosBefore;
System.out.println("yTrans: " + yTrans);
// end addition
*/
// width: advance including spacing; width2: glyph width only.
// Both are measured along the axis that matches the page rotation.
float width = 0, width2 = 0;
if( pageRotation == 0 )
{
width = (getTextMatrix().getXPosition() - xPosBefore);
width2 = (textMatrix2.getXPosition() - xPosBefore);
}
else if( pageRotation == 90 || pageRotation == -270)
{
width = (getTextMatrix().getYPosition() - yPosBefore);
width2 = (textMatrix2.getYPosition() - yPosBefore);
}
else if( pageRotation == 270 || pageRotation == -90 )
{
width = (yPosBefore - getTextMatrix().getYPosition());
width2 = (yPosBefore - textMatrix2.getYPosition());
//width = (textMatrix.getYPosition() - yPosBefore);
//width2 = (textMatrix2.getYPosition() - yPosBefore);
}
//there are several cases where one character code will
//output multiple characters. For example "fi" or a
//glyphname that has no mapping like "visiblespace"
if( c != null )
{
// spread the width evenly over all output characters of this code
float widthOfEachCharacterForCode = width/c.length();
for( int j=0; j<c.length(); j++)
{
if( stringResult.length()+j <individualWidths.length )
{
if( c.equals("-"))
{
//System.out.println( "stringResult.length()+j=" + (widthOfEachCharacterForCode));
}
individualWidths[stringResult.length()+j] = widthOfEachCharacterForCode;
}
}
}
totalStringWidth += width;
//System.out.println("new instruction: " + c +
// " width: " + width + " xPosBefore: " + xPosBefore + " yPosBefore: " + yPosBefore);
// TODO:
// with this code (incorrect results), try on matrox
// with indiv. characters as clusters (NG generation)
// to see whether the graph generation does actually terminate :)
/*
TextFragment thisChar = new TextFragment
(xPosBefore, xPosBefore + width, yPosBefore,
yPosBefore + + (fontSize * yScale), c, font, fontSize);
addCharacter(thisChar);
*/
CharSegment thisChar;
// a code that could not be decoded still yields a (textless) character
if (c == null) c = "";
if (pageRotation == 0)
{
// horizontal extent from the text matrix (shifted by xTrans), using the
// glyph width without spacing; vertical extent from yPos and the scaled font size
thisChar = new CharSegment
(xPosBefore + xTrans, xPosBefore + width2 + xTrans, yPos,
yPos + (fontSize * yScale), c, font, fontSize * yScale, sourceOp);
// addCharacter(thisChar, false);
//System.out.println("thisChar: " + thisChar);
charactersToAdd.add(thisChar);
}
else // any non-zero page rotation (not only 90)
{
/*
System.out.println("c: " + c + " yPosBefore: " + yPosBefore +
" (width:) " + width + " width2: " + width2 + " xPos: " + xPos +
" xPosBefore: " + xPosBefore + " yPos: " + yPos + " yPosBefore: "
+ yPosBefore +
" fontSize: " + fontSize + " xScale: " + xScale + " yScale: " +
yScale);
*/
// xScale? need to find a doc where they are different ;)
// here the y position of the text matrix supplies the horizontal extent,
// and the font size is scaled by xScale instead of yScale
thisChar = new CharSegment
(yPosBefore + xTrans, yPosBefore + width2 + xTrans, yPos,
yPos + (fontSize * xScale), c, font, fontSize * xScale, sourceOp);
//System.out.println("thisChar: " + thisChar);
// NOTE: both branches below do the same thing; the distinction only
// mattered for the commented-out addCharacter calls
if (pageRotation == 270 || pageRotation == 90)
//addCharacter(thisChar, true);
charactersToAdd.add(thisChar);
else
//addCharacter(thisChar, false);
charactersToAdd.add(thisChar);
}
// System.out.println("yPos1.5: " + yPos);
stringResult.append( c );
}
// System.out.println("yPos2: " + yPos);
float totalStringHeight = characterVerticalDisplacement * fontSize * yScale;
String resultingString = stringResult.toString();
// System.out.println("yPos2.5: " + yPos);
// trim (or grow) the widths array to the number of decoded characters
if( individualWidths.length != resultingString.length() )
{
float[] tmp = new float[resultingString.length()];
System.arraycopy( individualWidths, 0, tmp, 0, Math.min( individualWidths.length, resultingString.length() ));
individualWidths = tmp;
if( resultingString.equals( "- " ))
{
//System.out.println( "EQUALS " + individualWidths[0] );
}
}
// System.out.println("yPos3: " + yPos);
// NOTE: charX1..charY2 and c are only referenced by the commented-out code below
float charX1 = xPos;
float charX2 = xPos + totalStringWidth;
float charY1 = yPos;
float charY2 = yPos + (fontSize * yScale);//totalStringHeight;
String c = stringResult.toString();
//TextFragment thisChar = new TextFragment(charX1, charX2, charY1, charY2,
// c, font, fontSize * yScale);
//addCharacter(thisChar);
// System.out.println("showCharacter with xPos: " + xPos + " and yPos " + yPos + " and string " + stringResult.toString());
// addCharacter already called above
// showCharacter does nothing; neither in super nor in this class
/* start commented out 1.1
showCharacter(
new TextPosition(
xPos,
yPos,
xScale,
yScale,
totalStringWidth,
individualWidths,
totalStringHeight,
spaceWidth,
stringResult.toString(),
font,
fontSize,
wordSpacingDisplacement ));
//System.out.println("relief");
end commented out 1.1 */
//now call addCharacter for the list of characters to be processed...
// group the characters of this string so that the whole string can be
// compared with the string of the previous call
CompositeSegment thisStringFragment = new CompositeSegment();
thisStringFragment.setText(stringResult.toString());
//System.out.println("stringResult: " + stringResult);
thisStringFragment.getItems().addAll(charactersToAdd);
thisStringFragment.findBoundingBox();
//System.out.println("thisStringFragment: " + thisStringFragment);
//System.out.println("lastStringFragment: " + lastStringFragment);
//float tolerance = fontSize * 0.05f;
// changed on 7.04.09 to allow for google19.pdf and other pdfs where yscale is used
// to determine fontsize...
float tolerance = fontSize * yScale * 0.05f;
//System.out.println("lastStringFragment: " + lastStringFragment);
//System.out.println("thisStringFragment: " + thisStringFragment);
//System.out.println("tolerance: " + tolerance);
// same text and (within the tolerance) the same bounding box as the previous string?
if (lastStringFragment != null &&
thisStringFragment.getText().equals
(lastStringFragment.getText()) &&
Utils.within(thisStringFragment.getX1(),
lastStringFragment.getX1(), tolerance) &&
Utils.within(thisStringFragment.getX2(),
lastStringFragment.getX2(), tolerance) &&
Utils.within(thisStringFragment.getY1(),
lastStringFragment.getY1(), tolerance) &&
Utils.within(thisStringFragment.getY2(),
lastStringFragment.getY2(), tolerance))
//if (false)
{
//System.out.println("stringResult overprint...");
// it's an overprint: the characters of this string are not added.
// (further overprint handling code to come here)
}
else
{
// add the text fragment!
Iterator<CharSegment> ctaIter = charactersToAdd.iterator();
while(ctaIter.hasNext())
{
CharSegment thisChar = ctaIter.next();
// co-ordinates are reversed only for pages rotated by 270 (-90) degrees
if (pageRotation == 270 || pageRotation == -90)
//if (pageRotation == 270 || pageRotation == 90)
addCharacter(thisChar, true);
else
addCharacter(thisChar, false);
}
}
// remember this string for the overprint check of the next call
lastStringFragment = thisStringFragment;
}
/**
* Adds a character to the extraction result.
*
* The character is compared with the last character of the most recent text
* fragment. If both have the same text, font size and font name and (within
* a tolerance of 5% of the font size) the same bounding box, the character
* is treated as an overprint: the previous character is flagged and no
* fragment is changed. Otherwise the character is appended to the most
* recent fragment when it lies on the same line and close enough to it (or,
* if mergeAcrossTextInstructions is set, whenever no new text fragment was
* signalled); failing that, a new fragment is started.
*
* Every character whose text is not a single space is also added to
* charList, and the new-text-fragment flag is reset.
*
* @param thisChar The character to add.
* @param reverseCoords If true, the x co-ordinates of the last fragment and
* of the character are negated while the spacing is measured and the
* fragment is extended, and restored afterwards (showString passes true for
* pages rotated by 270 / -90 degrees).
*/
public void addCharacter(CharSegment thisChar, boolean reverseCoords)
{
boolean concatenate = false;
if (fragmentList.size() > 0)
{
TextFragment lastFragment = fragmentList.get(fragmentList.size() - 1);
// TODO: this overprint detection doesn't work on
// google19.pdf, where complete sentences are
// rendered at once... :(
CharSegment lastChar = (CharSegment)lastFragment.getItems().
get(lastFragment.getItems().size() - 1);
float tolerance = thisChar.getFontSize() * 0.05f;
// Font names must be compared by value: with == two equal names held in
// different objects would never match, and such overprints would go undetected.
Object lastFontName = lastChar.getFontName();
Object thisFontName = thisChar.getFontName();
boolean sameFontName = (lastFontName == null)
? (thisFontName == null)
: lastFontName.equals(thisFontName);
boolean overprint =
lastChar.getText().equals(thisChar.getText()) &&
lastChar.getFontSize() == thisChar.getFontSize() &&
Utils.within(lastChar.getX1(), thisChar.getX1(), tolerance) &&
Utils.within(lastChar.getX2(), thisChar.getX2(), tolerance) &&
Utils.within(lastChar.getY1(), thisChar.getY1(), tolerance) &&
Utils.within(lastChar.getY2(), thisChar.getY2(), tolerance) &&
sameFontName;
if (overprint)
{
lastChar.setOverprint(true);
}
else
{
// 23.06.08 added error margin
boolean sameLine =
Utils.within(lastFragment.getY1(), thisChar.getY1(), tolerance);
if (reverseCoords)
{
// mirror both segments on the y axis, keeping their widths
// (the widths must be read before x1 is changed)
float lfWidth = lastFragment.getWidth();
float tcWidth = thisChar.getWidth();
lastFragment.setX1(0 - lastFragment.getX1());
lastFragment.setX2(lastFragment.getX1() + lfWidth);
thisChar.setX1(0 - thisChar.getX1());
thisChar.setX2(thisChar.getX1() + tcWidth);
}
// gap between the end of the last fragment and the start of this character
float spacing = thisChar.getX1() - lastFragment.getX2();
// same word: the gap is less than a quarter of the character width,
// and the character does not overlap backwards by a full width or more
boolean sameWord =
spacing < thisChar.getWidth() * 0.25 &&
spacing > (0 - thisChar.getWidth() * 1.0);
concatenate = sameLine && sameWord;
if (mergeAcrossTextInstructions)
{
// in this mode the geometry is ignored: merge unless a new
// text fragment has been signalled
concatenate = !newTextFragment;
}
if (concatenate)
{
// extend the last fragment of fragmentList
// (logically, the size of fragmentList must be at least one)
lastFragment.setX2(thisChar.getX2());
lastFragment.setText(lastFragment.getText().concat(thisChar.getText()));
lastFragment.getItems().add(thisChar);
}
if (reverseCoords)
{
// revert the mirroring before a new fragment may be created from
// thisChar; the fragment width is re-read because it may have grown
float lfWidth = lastFragment.getWidth();
float tcWidth = thisChar.getWidth();
lastFragment.setX1(0 - lastFragment.getX1());
lastFragment.setX2(lastFragment.getX1() + lfWidth);
thisChar.setX1(0 - thisChar.getX1());
thisChar.setX2(thisChar.getX1() + tcWidth);
}
if (!concatenate)
{
// 2011-01-25 the constructor copies the character's properties automatically
TextFragment newFrag = new TextFragment(thisChar);
fragmentList.add(newFrag);
}
}
}
else
{
// very first character: it starts the first fragment
// 2011-01-25 the constructor copies the character's properties automatically
TextFragment newFrag = new TextFragment(thisChar);
fragmentList.add(newFrag);
}
// do not add empty space characters (bmw example)
if (!thisChar.getText().equals(" "))
{
charList.add(thisChar);
}
setNewTextFragment(false); // only needed when mergeAcrossTextInstructions == true
}
/**
* Strips single-space characters from the beginning and the end of every
* text fragment, recomputing the fragment's bounding box and text after
* each removal, and finally removes the fragments that are left without
* any characters from the list.
*
* Precondition: the characters of each fragment are in the correct order.
*
* @param fragmentList The fragments to clean up; the list is modified in place.
*/
public static void removeLeadingTrailingSpaces
(List<TextFragment> fragmentList)
{
List<TextFragment> emptied = new ArrayList<TextFragment>();
Iterator<TextFragment> fragments = fragmentList.iterator();
while (fragments.hasNext())
{
TextFragment fragment = fragments.next();
// strip spaces at the front
while (!fragment.getItems().isEmpty() &&
fragment.getItems().get(0).getText().equals(" "))
{
fragment.getItems().remove(0);
fragment.findBoundingBox();
fragment.findText();
}
// strip spaces at the back; the index of the last item is
// re-derived from the current size on every pass
while (!fragment.getItems().isEmpty() &&
fragment.getItems().get(fragment.getItems().size() - 1).getText().equals(" "))
{
fragment.getItems().remove(fragment.getItems().size() - 1);
fragment.findBoundingBox();
fragment.findText();
}
// nothing left: mark the fragment for removal
if (fragment.getItems().isEmpty())
{
emptied.add(fragment);
}
}
fragmentList.removeAll(emptied);
}
/**
* You should override this method if you want to perform an action when a
* string is being shown.
*
* @param text The string to display.
*/
/* start commented out 1.1
protected void showCharacter( TextPosition text )
{
//should use colorspaces for the font color but for now assume that
//the font color is black
try
{
if( this.getGraphicsState().getTextState().getRenderingMode() == PDTextState.RENDERING_MODE_FILL_TEXT )
{
graphics.setColor( this.getGraphicsState().getNonStrokingColorSpace().createColor() );
}
else if( this.getGraphicsState().getTextState().getRenderingMode() == PDTextState.RENDERING_MODE_STROKE_TEXT )
{
graphics.setColor( this.getGraphicsState().getStrokingColorSpace().createColor() );
}
else
{
//need to implement....
}
PDFont font = text.getFont();
// System.out.println("gotten font: " + font.getBaseFont() + " subtype: " + font.getSubType());
try
{
font.drawString( text.getCharacter(), graphics, text.getFontSize(), text.getXScale(), text.getYScale(),
text.getX(), text.getY() );
}
catch (Exception e)
{
// e.g. font name not found
e.printStackTrace();
}
// works for the top part of Daimler!! edocsacs.pdf
//TextFragment newFragment = new TextFragment(text);
//TextLine newLine = new TextLine(newFragment);
//newLine.findBoundingBox();
//newLine.findText(false);
//fragmentList.add(newLine);
//System.out.println("added fragment: " + newLine);
}
catch( IOException io )
{
io.printStackTrace();
}
}
end commented out 1.1 */
/**
* Returns the graphics context that was handed to the most recent
* drawPage call.
*
* @return The graphics we are drawing on.
*/
public Graphics2D getGraphics()
{
return this.graphics;
}
/**
* Returns the page that was handed to the most recent drawPage call.
*
* @return The page that is being drawn.
*/
public PDPage getPage()
{
return this.page;
}
/**
* Returns the page size that was handed to the most recent drawPage call.
*
* @return The size of the page that is being drawn.
*/
public Dimension getPageSize()
{
return this.pageSize;
}
/**
* Flips a y coordinate vertically within the page: the result is the page
* height minus the given y value.
*
* @param x The x coordinate (not used in the calculation).
* @param y The y coordinate.
* @return The updated y coordinate.
*/
public double fixY( double x, double y )
{
final double pageHeight = pageSize.getHeight();
return pageHeight - y;
}
/**
* Returns the line path that is currently being built.
*
* @return The current line path to be drawn.
*/
public GeneralPath getLinePath()
{
return this.linePath;
}
/**
* Sets or extends the line path to draw. If the current line path already
* has a current point, the new path is appended to it (without connecting
* the two); otherwise the new path replaces the current one.
*
* @param newLinePath The line path to set or to append.
*/
public void setLinePath(GeneralPath newLinePath)
{
boolean pathInProgress = linePath != null && linePath.getCurrentPoint() != null;
if (pathInProgress)
{
linePath.append (newLinePath, false);
return;
}
linePath = newLinePath;
}
/**
* Returns the list holding the line sub-paths collected so far.
*
* @return The current list of line paths to be drawn.
*/
public List getLineSubPaths()
{
return this.lineSubPaths;
}
/**
* Replaces the list of line sub-paths to draw with the given list.
*
* @param newLineSubPaths The new list of line paths to draw.
*/
public void setLineSubPaths(List newLineSubPaths)
{
this.lineSubPaths = newLineSubPaths;
}
/**
* Fills the collected sub-paths and the current line path with the
* non-stroking colour of the graphics state. Every sub-path that has a
* current point is closed before it is filled; the current line path is
* reset afterwards. The list of sub-paths itself is left untouched.
*
* @param windingRule The winding rule this path will use.
*
* @throws IOException May be thrown while the non-stroking colour is
* resolved from the graphics state.
*/
public void fillPath(int windingRule) throws IOException
{
graphics.setColor( getGraphicsState().getNonStrokingColor().getJavaColor() );
GeneralPath currentPath = getLinePath();
currentPath.setWindingRule(windingRule);
// filling is done without anti-aliasing
graphics.setRenderingHint( RenderingHints.KEY_ANTIALIASING, RenderingHints.VALUE_ANTIALIAS_OFF );
for (Object entry : getLineSubPaths())
{
GeneralPath subPath = (GeneralPath)entry;
// close only sub-paths that contain a point (Sector9's suggestion in bug 1672556)
if (subPath.getCurrentPoint() != null)
{
subPath.closePath();
}
graphics.fill( subPath );
}
graphics.fill( currentPath );
currentPath.reset();
}
/**
 * Apply a stroke to the graphics context returned by getGraphics().
 *
 * @param newStroke The stroke to use for subsequent drawing.
 */
public void setStroke(BasicStroke newStroke)
{
    Graphics2D target = getGraphics();
    target.setStroke( newStroke );
}
/**
 * Draw the outline of every collected sub path and of the current line
 * path, using the stroking colour of the current graphics state. The sub
 * path list is emptied and the current line path is reset afterwards.
 *
 * @throws IOException If the colour of the graphics state cannot be resolved.
 */
public void StrokePath() throws IOException
{
    //per Ben's 11/15 change in StrokePath.java
    graphics.setColor( getGraphicsState().getStrokingColor().getJavaColor() );

    List pieces = getLineSubPaths();
    Iterator pending = pieces.iterator();
    while( pending.hasNext() )
    {
        graphics.draw( (GeneralPath)pending.next() );
    }
    pieces.clear();

    GeneralPath current = getLinePath();
    graphics.draw( current );
    current.reset();
}
// these colour methods added from old method 100409
/**
 * Accessor for the colour used for non stroking (fill) operations.
 *
 * @return The stored non stroking colour.
 */
public Color getNonStrokingColor()
{
    return this.nonStrokingColor;
}
/**
 * Store a new non stroking colour and make it the current colour as well.
 *
 * @param newNonStrokingColor The non stroking colour.
 */
public void setNonStrokingColor(Color newNonStrokingColor)
{
    // both fields end up referring to the same colour
    currentColor = nonStrokingColor = newNonStrokingColor;
}
/**
 * Accessor for the colour used for stroking (outline) operations.
 *
 * @return The stored stroking colour.
 */
public Color getStrokingColor()
{
    return this.strokingColor;
}
/**
 * Store a new stroking colour and make it the current colour as well.
 *
 * @param newStrokingColor The stroking colour.
 */
public void setStrokingColor(Color newStrokingColor)
{
    // both fields end up referring to the same colour
    currentColor = strokingColor = newStrokingColor;
}
/**
 * Hook that is meant to be invoked when a colour changes. This
 * implementation does nothing; put colour-change handling here or in an
 * override of this method.
 *
 * @param bStroking Flag naming the colour that changed (stroking or non stroking).
 *
 * @throws IOException Declared so that overrides may report errors.
 */
public void ColorChanged(Boolean bStroking) throws IOException
{
    // intentionally empty
}
/**
 * Map a point through the scale and translation components of the current
 * transformation matrix and flip it vertically with {@link #fixY}.
 * A constant 0.6 is subtracted from the resulting y value (the reason for
 * this offset is not documented in the code).
 * <p>
 * This code generalizes the code Jim Lynch wrote for AppendRectangleToPath.
 *
 * @param x The x coordinate to transform.
 * @param y The y coordinate to transform.
 * @return The transformed point.
 */
public java.awt.geom.Point2D.Double TransformedPoint (double x, double y)
{
    AffineTransform transform =
        getGraphicsState().getCurrentTransformationMatrix().createAffineTransform();

    // scale first, then translate
    Point2D scaled = ScaledPoint( x, y, transform.getScaleX(), transform.getScaleY() );
    double deviceX = scaled.getX() + transform.getTranslateX();
    double unflippedY = scaled.getY() + transform.getTranslateY();

    double deviceY = fixY( deviceX, unflippedY ) - .6;
    return new java.awt.geom.Point2D.Double( deviceX, deviceY );
}
/**
 * Multiply a point by the given scale factors.
 * <p>
 * Use ScaledPoint rather than TransformedPoint in situations where most of
 * the translation need not be repeated. Consider, for example, the second
 * coordinate of a rectangle.
 * <p>
 * A scale factor that is not strictly positive yields 0 for the
 * corresponding coordinate.
 *
 * @param x      The x coordinate.
 * @param y      The y coordinate.
 * @param scaleX The horizontal scale factor.
 * @param scaleY The vertical scale factor.
 * @return The scaled point.
 */
public java.awt.geom.Point2D.Double ScaledPoint (double x, double y, double scaleX, double scaleY)
{
    final double scaledX = ( scaleX > 0 ) ? x * scaleX : 0.0;
    final double scaledY = ( scaleY > 0 ) ? y * scaleY : 0.0;
    return new java.awt.geom.Point2D.Double( scaledX, scaledY );
}
/**
 * Multiply a point by the scale factors of the current transformation
 * matrix; delegates to the four argument ScaledPoint variant.
 *
 * @param x The x coordinate.
 * @param y The y coordinate.
 * @return The scaled point.
 */
public java.awt.geom.Point2D.Double ScaledPoint (double x, double y)
{
    AffineTransform transform =
        getGraphicsState().getCurrentTransformationMatrix().createAffineTransform();
    final double horizontalScale = transform.getScaleX();
    final double verticalScale = transform.getScaleY();
    return ScaledPoint( x, y, horizontalScale, verticalScale );
}
// 100409 methods from old class below...
/**
 * Hook invoked when processing of a page begins. This implementation is
 * empty; subclasses may override it to do additional work.
 *
 * @param page The page that is about to be processed.
 *
 * @throws IOException Declared so that overrides may report errors.
 */
protected void startPage( PDPage page ) throws IOException
{
    // nothing to do by default
}
/**
 * Hook invoked when processing of a page has finished. This implementation
 * is empty; subclasses may override it to do additional work.
 *
 * @param page The page that has just been processed.
 *
 * @throws IOException Declared so that overrides may report errors.
 */
protected void endPage( PDPage page ) throws IOException
{
    // nothing to do by default
}
/**
 * Choose the first page that will be extracted.
 *
 * @param startPageValue New value of property startPage.
 */
public void setStartPage(int startPageValue)
{
    this.startPage = startPageValue;
}
/**
 * Choose the last page that will be extracted.
 *
 * @param endPageValue New value of property endPage.
 */
public void setEndPage(int endPageValue)
{
    this.endPage = endPageValue;
}
/**
 * The page on which extraction starts. Pages are counted from 1, so for a
 * 5 page document a start page of 1 covers all pages, while a start page
 * of 4 covers pages 4 and 5.
 *
 * @return Value of property startPage.
 */
public int getStartPage()
{
    return this.startPage;
}
/**
 * The last page that will be extracted, inclusive. For a 5 page document
 * an end page of 5 covers the entire document, while an end page of 2
 * covers pages 1 and 2.
 *
 * @return Value of property endPage.
 */
public int getEndPage()
{
    return this.endPage;
}
/**
 * Return the first image found among the resources of one page of the
 * document, converted to an RGB image.
 * <p>
 * Side effects: the current page counter is reset to 1 and <code>doc</code>
 * becomes the document held by this object. If the document is encrypted,
 * decryption with the empty password is attempted.
 * <p>
 * Resources are looked up with <code>findResources()</code> so that
 * resources inherited from a parent node are found too, and a page without
 * any resources is reported as "No images found" instead of failing with a
 * NullPointerException.
 *
 * @param doc    The document to read from.
 * @param pageNo Index of the page in the document's page list; it is passed
 *               unchanged to <code>List.get</code>, i.e. it is zero based.
 * @return The RGB image of the first image resource of the page.
 *
 * @throws IOException If content extraction is not permitted, if decryption
 *                     fails, or if the page has no image resource.
 */
public BufferedImage pageImage(PDDocument doc, int pageNo) throws IOException
{
    // *** start copied from beginning of getObjects method
    PDEncryptionDictionary encDictionary = doc.getEncryptionDictionary();

    //only care about standard encryption and if it was decrypted with the
    //user password
    if( encDictionary instanceof PDStandardEncryption &&
        !doc.wasDecryptedWithOwnerPassword() )
    {
        PDStandardEncryption stdEncryption = (PDStandardEncryption)encDictionary;
        if( !stdEncryption.canExtractContent() )
        {
            throw new IOException( "You do not have permission to extract text" );
        }
    }
    currentPageNo = 1;
    document = doc;

    if( document.isEncrypted() )
    {
        // We are expecting non-encrypted documents here, but it is common
        // for users to pass in a document that is encrypted with an empty
        // password (such a document appears to not be encrypted by
        // someone viewing the document, thus the confusion).  We will
        // attempt to decrypt with the empty password to handle this case.
        try
        {
            document.decrypt("");
        }
        catch (CryptographyException e)
        {
            // keep the original exception as the cause for diagnosis
            IOException wrapped =
                new IOException("Error decrypting document, details: " + e.getMessage());
            wrapped.initCause(e);
            throw wrapped;
        }
        catch (InvalidPasswordException e)
        {
            IOException wrapped = new IOException("Error: document is encrypted");
            wrapped.initCause(e);
            throw wrapped;
        }
    }

    List allPages = document.getDocumentCatalog().getAllPages();
    // *** end copied from beginning of getObjects method

    PDPage requestedPage = (PDPage)allPages.get(pageNo);
    PDResources resources = requestedPage.findResources();
    Map images = (resources == null) ? null : resources.getImages();
    if( images != null )
    {
        // ** TODO: return exception if >1 image on page?
        Iterator imageIter = images.values().iterator();
        if( imageIter.hasNext() )
        {
            PDXObjectImage image = (PDXObjectImage)imageIter.next();
            return image.getRGBImage();
        }
    }
    throw new IOException("No images found");
}
/**
 * This will take a PDDocument and return a list of PDFPage objects, one for
 * each processed page.
 * <p>
 * Side effects: the current page counter is reset to 1 and <code>doc</code>
 * becomes the document held by this object. If the document is encrypted,
 * decryption with the empty password is attempted. Negative start/end page
 * values are replaced by <code>numPages + value</code>, and start and end
 * page are swapped if the start page is greater than the end page; the
 * adjusted values are written back to the startPage/endPage fields.
 *
 * @param doc The document to get the data from.
 * @return The list of processed pages.
 *
 * @throws IOException If content extraction is not permitted, if decryption
 *                     fails, or if the doc is in an invalid state.
 */
public List<PDFPage> findObjects(PDDocument doc) throws IOException
{
    PDEncryptionDictionary encDictionary = doc.getEncryptionDictionary();

    //only care about standard encryption and if it was decrypted with the
    //user password
    if( encDictionary instanceof PDStandardEncryption &&
        !doc.wasDecryptedWithOwnerPassword() )
    {
        PDStandardEncryption stdEncryption = (PDStandardEncryption)encDictionary;
        if( !stdEncryption.canExtractContent() )
        {
            throw new IOException( "You do not have permission to extract text" );
        }
    }
    currentPageNo = 1;
    document = doc;

    if( document.isEncrypted() )
    {
        // We are expecting non-encrypted documents here, but it is common
        // for users to pass in a document that is encrypted with an empty
        // password (such a document appears to not be encrypted by
        // someone viewing the document, thus the confusion).  We will
        // attempt to decrypt with the empty password to handle this case.
        try
        {
            document.decrypt("");
        }
        catch (CryptographyException e)
        {
            // keep the original exception as the cause for diagnosis
            IOException wrapped =
                new IOException("Error decrypting document, details: " + e.getMessage());
            wrapped.initCause(e);
            throw wrapped;
        }
        catch (InvalidPasswordException e)
        {
            IOException wrapped = new IOException("Error: document is encrypted");
            wrapped.initCause(e);
            throw wrapped;
        }
    }

    List allPages = document.getDocumentCatalog().getAllPages();
    int numPages = allPages.size();

    // take care of negative page numbers
    // and swap startPage and endPage if necessary
    if (startPage < 0)
    {
        startPage = numPages + startPage;
    }
    if (endPage < 0)
    {
        endPage = numPages + endPage;
    }
    if (startPage > endPage)
    {
        int tempVar = startPage;
        startPage = endPage;
        endPage = tempVar;
    }

    return processPages(allPages);
}
/**
 * This will process all of the pages and collect a PDFPage for every page
 * that lies inside the requested start/end page range and the optional
 * start/end bookmark range and that has a content stream.
 * <p>
 * For each page the page boxes and the rotation are printed to standard
 * output. A missing media box is replaced by the first available of bleed,
 * crop, trim and art box.
 * <p>
 * The current page counter is advanced for every page of the list. A page
 * that is processed is counted by processPage; all other pages, including
 * pages without a content stream, are counted here. Counting them keeps
 * the numbers of the following pages in step with the page range.
 *
 * @param pages The pages object in the document.
 * @return The processed pages, in document order.
 *
 * @throws IOException If there is an error parsing the text or no usable
 *                     page bounding box can be found.
 */
protected List<PDFPage> processPages( List pages ) throws IOException
{
    List<PDFPage> retVal = new ArrayList<PDFPage>();

    if( startBookmark != null )
    {
        startBookmarkPageNumber = currentPageNumber( startBookmark, pages );
    }
    if( endBookmark != null )
    {
        endBookmarkPageNumber = currentPageNumber( endBookmark, pages );
    }
    if( startBookmarkPageNumber == -1 && startBookmark != null &&
        endBookmarkPageNumber == -1 && endBookmark != null &&
        startBookmark.getCOSObject() == endBookmark.getCOSObject() )
    {
        //this is a special case where both the start and end bookmark
        //are the same but point to nothing.  In this case
        //we will not extract any text.
        startBookmarkPageNumber = 0;
        endBookmarkPageNumber = 0;
    }

    Iterator pageIter = pages.iterator();
    while( pageIter.hasNext() )
    {
        PDPage nextPage = (PDPage)pageIter.next();
        page = nextPage;

        // best effort diagnostics; the box accessors may fail with a
        // NullPointerException when a box is missing
        try
        {
            System.out.println("Current page no: " + currentPageNo);
            if (page.getArtBox() != null)
                System.out.println("Art Box: " + new GenericSegment(page.getArtBox()));
            if (page.getBleedBox() != null)
                System.out.println("Bleed Box: " + new GenericSegment(page.getBleedBox()));
            // the following value caused problems for pp120-hassan.pdf
            if (page.getCropBox() != null)
                System.out.println("Crop Box: " + new GenericSegment(page.getCropBox()));
            if (page.getMediaBox() != null)
                System.out.println("Media Box: " + new GenericSegment(page.getMediaBox()));
            if (page.getTrimBox() != null)
                System.out.println("Trim Box: " + new GenericSegment(page.getTrimBox()));

            // if media box is null (not allowed!) find a substitute
            if (page.getMediaBox() == null)
            {
                if (page.getBleedBox() != null)
                    page.setMediaBox(page.getBleedBox());
                else if (page.getCropBox() != null)
                    page.setMediaBox(page.getCropBox());
                else if (page.getTrimBox() != null)
                    page.setMediaBox(page.getTrimBox());
                else if (page.getArtBox() != null)
                    page.setMediaBox(page.getArtBox());
            }
        }
        catch(NullPointerException npe)
        {
            System.out.println("at least one of the boxes is missing!");
            npe.printStackTrace();
        }

        System.out.println("Page rotation: " + page.findRotation());

        PDStream contentStream = nextPage.getContents();

        // a page needs a media box or, failing that, an art box
        if (page.getMediaBox() == null && page.getArtBox() == null)
        {
            throw new IOException
                ("Cannot find a suitable page bounding box!");
        }

        System.out.println("Processing page: " + currentPageNo);

        boolean insidePageRange =
            currentPageNo >= startPage && currentPageNo <= endPage;
        boolean insideBookmarkRange =
            (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber ) &&
            (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber );

        if( insidePageRange && insideBookmarkRange && contentStream != null )
        {
            // processPage sets the bounding box itself (because of the
            // complications with rotation) and advances currentPageNo
            retVal.add( processPage( nextPage, contentStream.getStream() ) );
        }
        else
        {
            // page is skipped or has no content: it still counts as a page
            currentPageNo ++;
        }
    }
    return retVal;
}
/**
 * Determine the one based number of the page a bookmark points to.
 *
 * @param bookmark The bookmark whose destination page is looked up in the current document.
 * @param allPages The list of pages in which the destination page is searched.
 * @return The one based position of the destination page in
 *         <code>allPages</code>, 0 if the page is not contained in the list,
 *         or -1 if the bookmark has no destination page.
 *
 * @throws IOException If the destination of the bookmark cannot be determined.
 */
protected int currentPageNumber( PDOutlineItem bookmark, List allPages ) throws IOException
{
    PDPage destination = bookmark.findDestinationPage( document );
    if( destination == null )
    {
        return -1;
    }
    //use one based indexing
    return allPages.indexOf( destination ) + 1;
}
/**
 * This will process the contents of a page and return the objects found on
 * it as a PDFPage.
 * modified by TH
 * <p>
 * Outline of the steps:
 * <ol>
 * <li>reset the clip bounds and the per page lists, take the page
 *     dimensions from the crop box (or the media box if there is no crop
 *     box) and prepare an off screen graphics context for the PDFBox
 *     drawing methods;</li>
 * <li>advance the current page counter and run the content stream through
 *     processStream;</li>
 * <li>collect text fragments and characters, and drop every item that does
 *     not intersect the crop box;</li>
 * <li>adjust coordinates for pages rotated by 270 (or -90) degrees, add
 *     rectangles, lines and images and normalise the coordinates.</li>
 * </ol>
 *
 * @param page The page to process.
 * @param content The contents of the page.
 * @return The page object holding the extracted items.
 *
 * @throws IOException If there is an error processing the page.
 */
protected PDFPage processPage( PDPage page, COSStream content ) throws IOException
{
    // new from 23.04.09
    clipBounds = new GenericSegment(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY,
        Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY);

    this.page = page;

    PDFPage thisPage = new PDFPage();

    GenericSegment pageDim;
    if (page.getCropBox() != null)
    {
        pageDim = new GenericSegment(page.getCropBox());
    }
    else
    {
        pageDim = new GenericSegment(page.findMediaBox());
    }
    pageSize = page.findMediaBox().createDimension(); // needed for PDFBox graphic drawing methods

    // if statement added for EFLensChart.pdf
    if (page.getRotation() == null) thisPage.setRotation(0);
    else thisPage.setRotation(page.getRotation());

    // PDFBox code for rasterizing image; the image itself is not used
    // afterwards, it only backs the graphics context
    int scaling = 2;
    PDRectangle mBox = page.findMediaBox();
    int width = (int)(mBox.getWidth());
    int height = (int)(mBox.getHeight());
    //note we are doing twice as many pixels because
    //the default size is not really good resolution,
    //so create an image that is twice the size
    //and let the client scale it down.
    BufferedImage retval = new BufferedImage
        ( width*scaling, height*scaling, BufferedImage.TYPE_BYTE_INDEXED );
    graphics = (Graphics2D)retval.getGraphics();
    graphics.setColor( Color.WHITE );
    graphics.fillRect(0,0,width*scaling, height*scaling);
    graphics.scale( scaling, scaling );

    imageList = new ArrayList<ImageSegment>();
    lineList = new ArrayList<LineSegment>();
    rectList = new ArrayList<RectSegment>();
    fragmentList = new ArrayList<TextFragment>();
    charList = new ArrayList<CharSegment>();

    // TODO: method within GenericSegment instead of here?
    thisPage.setX1(pageDim.getX1());
    thisPage.setX2(pageDim.getX2());
    thisPage.setY1(pageDim.getY1());
    thisPage.setY2(pageDim.getY2());

    List<GenericSegment> pageItems = thisPage.getItems();

    currentPageNo++;

    startPage( page );
    pageArticles = page.getThreadBeads();
    int numberOfArticleSections = 1 + pageArticles.size() * 2;
    if( !shouldSeparateByBeads )
    {
        numberOfArticleSections = 1;
    }
    int originalSize = charactersByArticle.size();
    charactersByArticle.setSize( numberOfArticleSections );
    for( int i=0; i<numberOfArticleSections; i++ )
    {
        if( numberOfArticleSections < originalSize )
        {
            ((List)charactersByArticle.get( i )).clear();
        }
        else
        {
            charactersByArticle.set( i, new ArrayList() );
        }
    }

    characterListMapping.clear();

    processStream( page, page.findResources(), content );

    // TODO: rescue this bit of code someday!
    // add all text elements to page object
    for( int i=0; i<charactersByArticle.size(); i++)
    {
        List textList = (List)charactersByArticle.get( i );
        Iterator textIter = textList.iterator();
        while( textIter.hasNext() )
        {
            TextPosition thisPos = (TextPosition)textIter.next();
            pageItems.add(new TextFragment(thisPos, pageDim));
            System.out.println("text fragment by article found");
        }
    }

    thisPage.getItems().addAll(fragmentList);
    // line below uncommented 17.03.07
    thisPage.getItems().addAll(charList);

    // remove anything not within the crop box
    if (page.getCropBox() != null)
    {
        GenericSegment cropBox = new GenericSegment(page.getCropBox());

        // flip the crop box vertically against the media box height;
        // findMediaBox also covers a media box inherited from a parent node
        float new_Y2 = page.findMediaBox().getHeight() - cropBox.getY1();
        float new_Y1 = page.findMediaBox().getHeight() - cropBox.getY2();
        cropBox.setY1(new_Y1);
        cropBox.setY2(new_Y2);

        if (thisPage.getRotation() == 270 || thisPage.getRotation() == -90)
        {
            // TODO: refactor as e.g. GenericSegment.rotate
            // 3x 90 deg rotate
            // well, that's the cheeky way... :)
            for (int turn = 0; turn < 3; turn ++)
            {
                cropBox = new GenericSegment(cropBox.getX1(),
                    cropBox.getX1() + cropBox.getHeight(), cropBox.getY1(),
                    cropBox.getY1() + cropBox.getWidth());
            }
        }
        else if (thisPage.getRotation() == 90 || thisPage.getRotation() == -270)
        {
            cropBox = new GenericSegment(cropBox.getX1(),
                cropBox.getX1() + cropBox.getHeight(), cropBox.getY1(),
                cropBox.getY1() + cropBox.getWidth());
        }

        // remove through the iterator so that exactly the inspected
        // element is dropped, whatever equals() says about other elements
        Iterator<GenericSegment> itemIter = pageItems.iterator();
        while (itemIter.hasNext())
        {
            GenericSegment gs = itemIter.next();
            if (!SegmentUtils.intersects(gs, cropBox))
            {
                itemIter.remove();
            }
        }
    }

    if (thisPage.getRotation() == 270 || thisPage.getRotation() == -90)
    {
        // I imagine somewhere in the PDFBox code it tries to be
        // clever and reverse the co-ordinates with the incorrect
        // page dimensions; we need to undo this(!)
        thisPage.reverseYCoordinatesPDF();

        // not quite as easy as just swapping X and Y co-ordinates
        // page needs to be rotated around its _top_ axis; this means
        // subtracting the difference between height & width...
        thisPage.setBoundingBox(new float[]
            {thisPage.getY1(), thisPage.getY2(),
             thisPage.getY2() - thisPage.getWidth(), thisPage.getY2()});
        // note getWidth is actually now the height :)

        thisPage.normalizeCoordinates();
        thisPage.reverseXCoordinates();
    }
    // for all other rotations reverseYCoordinates is common, so do nothing here

    pageItems.addAll(rectList);
    pageItems.addAll(lineList);
    pageItems.addAll(imageList);

    thisPage.normalizeCoordinates();
    thisPage.reverseYCoordinatesPDF();

    thisPage.setLastOpIndex(opIndex);

    endPage( page );

    return thisPage;
}
/**
 * Accessor for the newTextFragment flag.
 *
 * @return The current value of the flag.
 */
public boolean isNewTextFragment()
{
    return this.newTextFragment;
}
/**
 * Set the newTextFragment flag.
 *
 * @param newTextFragment The new value of the flag.
 */
public void setNewTextFragment(boolean newTextFragment)
{
    // the parameter shadows the field, hence the explicit "this"
    this.newTextFragment = newTextFragment;
}
// Saves a copy (clone) of the current clip bounds on the clip bounds stack;
// popClipBounds() restores the most recently saved copy.
public void pushClipBounds()
{
clipBoundsStack.push(clipBounds.clone());
// Deliberately NOT done here: resetting clipBounds to unlimited bounds.
// The code that used to do so is kept below, disabled, for reference.
//clipBounds = new GenericSegment(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY,
// Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY);
}
/**
 * Restore the clip bounds that were saved most recently on the clip
 * bounds stack.
 */
public void popClipBounds()
{
    Object restored = clipBoundsStack.pop();
    clipBounds = (GenericSegment)restored;
}
/**
 * Shrink the clip bounds using the bounds of the path that is currently
 * being built.
 */
public void simpleModifyClippingPath()
{
    // TODO: getDilatedSegment and enlarge... redundancy?
    GenericSegment pathBounds = currentPathBounds();
    clipBounds.shrinkBoundingBox( pathBounds );
}
/**
 * Build a composite segment from all lines and rectangles of the path
 * under construction (the pending ones as well as those of the current sub
 * path) and compute its bounding box.
 *
 * @return The composite segment after findBoundingBox() has been applied.
 */
public GenericSegment currentPathBounds()
{
    CompositeSegment pathContents = new CompositeSegment();

    // pending segments first, then those of the current sub path
    pathContents.getItems().addAll( linesToAdd );
    pathContents.getItems().addAll( rectsToAdd );
    pathContents.getItems().addAll( currentLines );
    pathContents.getItems().addAll( currentRects );

    pathContents.findBoundingBox();
    return pathContents;
}
/**
 * Start a new sub path: the lines and rectangles of the sub path built so
 * far are moved to the pending lists, the path state flags are reset and
 * fresh lists are created for the next sub path.
 */
public void newPath()
{
    // sub paths containing curves are moved as well (the check on
    // pathContainsCurve is disabled)
    linesToAdd.addAll( currentLines );
    rectsToAdd.addAll( currentRects );

    newPath = true;
    pathContainsCurve = false;
    pathBeginSet = false;
    pathClosed = false;

    currentLines = new ArrayList<LineSegment>();
    currentRects = new ArrayList<RectSegment>();
}
/**
 * Finish the current path: the path state flags are reset and the pending
 * line and rectangle lists are replaced by empty lists.
 */
public void endPath()
{
    linesToAdd = new ArrayList<LineSegment>();
    rectsToAdd = new ArrayList<RectSegment>();

    newPath = true;
    pathContainsCurve = false;
    pathBeginSet = false;
    pathClosed = false;
}
/**
 * Record a curve as three straight segments that connect the current
 * point, the two intermediate points and the end point. Each segment is
 * flagged as a curve and added to the lines of the current sub path; the
 * end point becomes the new current point.
 * The points are expected to be transformed already.
 *
 * @param x1 x coordinate of the first intermediate point.
 * @param y1 y coordinate of the first intermediate point.
 * @param x2 x coordinate of the second intermediate point.
 * @param y2 y coordinate of the second intermediate point.
 * @param x3 x coordinate of the end point.
 * @param y3 y coordinate of the end point.
 */
public void simpleCurveTo(float x1, float y1, float x2, float y2, float x3, float y3)
{
    pathContainsCurve = true;

    LineSegment[] legs =
    {
        new LineSegment( currentX, x1, currentY, y1 ),
        new LineSegment( x1, x2, y1, y2 ),
        new LineSegment( x2, x3, y2, y3 )
    };
    for( LineSegment leg : legs )
    {
        leg.setCurve( true );
    }
    for( LineSegment leg : legs )
    {
        currentLines.add( leg );
    }

    currentX = x3;
    currentY = y3;
}
/**
 * Close the current path by drawing a line back to its starting point.
 * Nothing happens if no starting point has been recorded or if the path
 * has been closed already.
 */
public void simpleClosePath()
{
    if( !pathBeginSet || pathClosed )
    {
        return;
    }
    simpleLineTo( pathBeginX, pathBeginY );
    pathClosed = true;
}
/**
 * Move the current point to the transformed position of (x, y).
 * newPath() is not called here; that is done by the calling
 * OperatorProcessor method.
 *
 * @param x The untransformed x coordinate.
 * @param y The untransformed y coordinate.
 */
public void simpleMoveTo(float x, float y)
{
    Point2D destination = TransformedPoint( x, y );
    currentX = (float)destination.getX();
    currentY = (float)destination.getY();
}
/**
 * Record a straight line from the current point to the transformed
 * position of (x, y). The line is normalised with
 * correctNegativeDimensions(), rotated for the current page and added to
 * the list of pending lines; the end point becomes the new current point.
 * <p>
 * The stroking colour is not consulted here, so a line can be recorded
 * before any stroking colour has been set.
 *
 * @param x The untransformed x coordinate of the end point.
 * @param y The untransformed y coordinate of the end point.
 */
public void simpleLineTo(float x, float y)
{
    Point2D Pto = TransformedPoint(x, y);
    float toX = (float)Pto.getX();
    float toY = (float)Pto.getY();

    LineSegment newLine = new LineSegment(currentX, toX, currentY, toY);
    newLine.correctNegativeDimensions();
    newLine.rotate(page);

    // the line goes to the pending list only; adding it to lineList here
    // as well was removed on 4.08.09
    linesToAdd.add(newLine);

    currentX = toX;
    currentY = toY;
}
/**
 * Record a rectangle with corner (x, y), width w and height h. The
 * coordinates are expected to be transformed already by the calling
 * OperatorProcessor method. The rectangle is rotated for the current page,
 * normalised with correctNegativeDimensions() and added to the list of
 * pending rectangles; (x, y) becomes the new current point.
 * <p>
 * The stroking colour is not consulted here, so a rectangle can be
 * recorded before any stroking colour has been set.
 *
 * @param x The x coordinate of the corner.
 * @param y The y coordinate of the corner.
 * @param w The width, may be negative.
 * @param h The height, may be negative.
 */
public void simpleAddRect(float x, float y, float w, float h)
{
    RectSegment newRect = new RectSegment(x, x+w, y, y+h);

    // rotate first: correcting negative dimensions before the rotation
    // did not work right
    newRect.rotate(page);
    newRect.correctNegativeDimensions();
    rectsToAdd.add(newRect);

    currentX = x;
    currentY = y;
}
/**
 * Record an image that occupies the given rectangle. The segment is
 * normalised, rotated for the current page and shrunk to the clip bounds;
 * it is added to the image list unless it has zero size afterwards.
 * <p>
 * TODO: extract byte[] from img and add it to ImageSegment
 *
 * @param x1 The first x coordinate of the image rectangle.
 * @param x2 The second x coordinate of the image rectangle.
 * @param y1 The first y coordinate of the image rectangle.
 * @param y2 The second y coordinate of the image rectangle.
 * @param img The image object (currently not used).
 */
public void simpleDrawImage(float x1, float x2, float y1, float y2, PDXObjectImage img )
{
    ImageSegment placed = new ImageSegment( x1, x2, y1, y2 );
    placed.correctNegativeDimensions();
    placed.rotate( page );
    placed.shrinkBoundingBox( clipBounds );

    if( placed.isZeroSize() )
    {
        // nothing of the image is left inside the clip bounds
        return;
    }
    imageList.add( placed );
}
/**
 * Stroke the current path: unless the stroking colour is close to white
 * (all three RGB components above 0.9), every pending line and rectangle
 * is shrunk to the clip bounds and, if it does not have zero size, added
 * to the line or rectangle list of the page. Rectangles are marked as not
 * filled. The pending lists are emptied in any case.
 * <p>
 * If the stroking colour cannot be resolved, the stack trace is printed
 * and the colour already set on the graphics context is used.
 */
public void simpleStrokePath()
{
    newPath(); // adds contents of last sub-path to toAdd lists

    // from PageDrawer.java (Ben) hack 4.08.09
    try
    {
        //per Ben's 11/15 change in StrokePath.java
        graphics.setColor( getGraphicsState().getStrokingColor().getJavaColor() );
    }
    catch (IOException e)
    {
        e.printStackTrace();
    }

    float[] rgb = graphics.getColor().getRGBColorComponents(null);
    boolean nearWhite = rgb[0] > 0.9 && rgb[1] > 0.9 && rgb[2] > 0.9;

    if( !nearWhite )
    {
        for( LineSegment line : linesToAdd )
        {
            line.shrinkBoundingBox( clipBounds );
            if( !line.isZeroSize() )
            {
                lineList.add( line );
            }
        }
        for( RectSegment rect : rectsToAdd )
        {
            rect.setFilled( false ); // is false anyway
            rect.shrinkBoundingBox( clipBounds );
            if( !rect.isZeroSize() )
            {
                rectList.add( rect );
            }
        }
    }

    // empty toAdd lists
    endPath();
}
/**
 * Fill the current path: unless the non stroking colour is close to white
 * (all three RGB components above 0.9), every pending rectangle is marked
 * as filled, shrunk to the clip bounds and, if it does not have zero size,
 * added to the rectangle list of the page. In addition, one rectangle
 * enclosing all pending lines is added if there is at least one pending
 * line. The pending lists are emptied in any case.
 * <p>
 * If the non stroking colour cannot be resolved, the stack trace is
 * printed and the colour already set on the graphics context is used.
 */
public void simpleFillPath()
{
    newPath(); // adds contents of last sub-path to toAdd lists

    // from PageDrawer.java (Ben) hack 4.08.09
    try
    {
        graphics.setColor( getGraphicsState().getNonStrokingColor().getJavaColor() );
    }
    catch (IOException e)
    {
        e.printStackTrace();
    }

    float[] rgb = graphics.getColor().getRGBColorComponents(null);
    boolean nearWhite = rgb[0] > 0.9 && rgb[1] > 0.9 && rgb[2] > 0.9;

    if( !nearWhite )
    {
        // in practice, should not be more than one rect in this list
        for( RectSegment rect : rectsToAdd )
        {
            rect.setFilled( true );
            rect.shrinkBoundingBox( clipBounds );
            if( !rect.isZeroSize() )
            {
                rectList.add( rect );
            }
        }

        // TODO: rectangle recognition from lines
        // this part added 4.08.09 (Sydney)
        // TODO: accept only if resembles a rectangle
        // find BBox
        // any lines with points o/s bbox (within error margin) lead to rejection
        RectSegment enclosing = null;
        for( LineSegment line : linesToAdd )
        {
            if( enclosing != null )
            {
                enclosing.growBoundingBox( line );
            }
            else
            {
                // the first line provides the initial bounds
                enclosing = new RectSegment
                    ( line.getX1(), line.getX2(), line.getY1(), line.getY2() );
            }
        }
        if( enclosing != null )
        {
            rectList.add( enclosing );
        }
    }

    // empty toAdd lists
    endPath();
}
}