package technology.tabula;
import java.awt.Shape;
import java.awt.geom.AffineTransform;
import java.awt.geom.GeneralPath;
import java.awt.geom.Line2D;
import java.awt.geom.PathIterator;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.apache.fontbox.ttf.TrueTypeFont;
import org.apache.fontbox.util.BoundingBox;
import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDCIDFont;
import org.apache.pdfbox.pdmodel.font.PDCIDFontType2;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import org.apache.pdfbox.pdmodel.font.PDType3Font;
import org.apache.pdfbox.pdmodel.font.encoding.GlyphList;
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.util.Matrix;
import org.apache.pdfbox.util.Vector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
class ObjectExtractorStreamEngine extends PDFGraphicsStreamEngine {
// Non-breaking space; showGlyph replaces it with a regular space.
private static final String NBSP = "\u00A0";
// Smallest width / height seen among the text elements accepted by showGlyph
// (both start at Float.MAX_VALUE in the constructor).
protected float minCharWidth;
protected float minCharHeight;
// Text elements accepted by showGlyph, in the order they were shown.
protected List<TextElement> characters;
// Rulings collected from stroked / filled paths.
protected List<Ruling> rulings;
// Spatial index holding the same text elements as 'characters'.
protected RectangleSpatialIndex<TextElement> spatialIndex;
// Transform built in the constructor from the page rotation and crop box;
// applied to paths and clipping paths before they are recorded.
private AffineTransform pageTransform;
// When true, showGlyph also records the clipping bounds it encounters.
private boolean debugClippingPaths;
// When false, stroked / filled paths are discarded without producing rulings.
private boolean extractRulingLines = true;
private Logger log;
// Winding rule of a clip requested via clip(); -1 means no clip is pending.
private int clipWindingRule = -1;
// Path being built by the path-construction callbacks.
private GeneralPath currentPath = new GeneralPath();
// Clipping bounds recorded while debugClippingPaths is enabled.
// No initial value is assigned at declaration.
public List<Shape> clippingPaths;
// Rotation and crop box of the page, captured in the constructor.
private int pageRotation;
private PDRectangle pageSize;
// Never assigned in this class, so it is always null when getTextPosition reads it.
private Matrix translateMatrix;
// Glyph list passed to PDFont.toUnicode in getTextPosition.
private GlyphList glyphList;
/**
 * Creates an engine for a single page.
 *
 * <p>Initialises the result collections, builds the page transform from the
 * page's rotation and crop box, and loads PDFBox's additional glyph list on
 * top of the Adobe glyph list. If the additional list cannot be loaded the
 * plain Adobe glyph list is used instead, so the glyph list is never left
 * {@code null}.
 *
 * @param page the page whose content stream will be processed
 */
protected ObjectExtractorStreamEngine(PDPage page) {
    super(page);
    this.log = LoggerFactory.getLogger(ObjectExtractorStreamEngine.class);
    this.characters = new ArrayList<TextElement>();
    this.rulings = new ArrayList<Ruling>();
    this.spatialIndex = new RectangleSpatialIndex<TextElement>();
    this.minCharWidth = Float.MAX_VALUE;
    this.minCharHeight = Float.MAX_VALUE;
    this.pageRotation = page.getRotation();
    this.pageSize = page.getCropBox();

    // calculate page transform
    PDRectangle cb = this.getPage().getCropBox();
    int rotation = this.getPage().getRotation();
    if (Math.abs(rotation) == 90 || Math.abs(rotation) == 270) {
        this.pageTransform = AffineTransform.getRotateInstance(rotation * (Math.PI / 180.0), 0, 0);
    } else {
        this.pageTransform = AffineTransform.getTranslateInstance(0, cb.getHeight());
    }
    // both branches finish by flipping the y axis
    this.pageTransform.concatenate(AffineTransform.getScaleInstance(1, -1));

    // load additional glyph list for Unicode mapping
    String path = "org/apache/pdfbox/resources/glyphlist/additional.txt";
    GlyphList loaded = null;
    InputStream input = GlyphList.class.getClassLoader().getResourceAsStream(path);
    if (input == null) {
        // getResourceAsStream signals a missing resource with null, not an exception
        this.log.error("Error loading glyph list: resource {} not found", path);
    } else {
        try {
            loaded = new GlyphList(GlyphList.getAdobeGlyphList(), input);
        } catch (IOException e) {
            this.log.error("Error loading glyph list", e);
        } finally {
            try {
                input.close();
            } catch (IOException ignored) {
                // nothing useful can be done if closing a classpath resource fails
            }
        }
    }
    // fall back to the base list so Unicode lookup never receives null
    this.glyphList = (loaded != null) ? loaded : GlyphList.getAdobeGlyphList();
}
/**
 * Records one shown glyph as a {@link TextElement}.
 *
 * <p>The glyph is skipped when no text position can be built for it or when
 * its Unicode text contains no printable character. A non-breaking space is
 * stored as a regular space. The element is kept only if it intersects the
 * current clipping bounds; kept elements update the minimum character
 * width/height and are added to both the character list and the spatial index.
 *
 * <p>When clipping-path debugging is enabled, the current clipping bounds are
 * appended to {@code clippingPaths} if not already present. The list is
 * created on first use, since nothing else in this class initialises it.
 *
 * @param textRenderingMatrix text rendering matrix of the glyph
 * @param font                font the glyph is drawn with
 * @param code                character code in the font
 * @param unicode             Unicode text supplied by the caller
 * @param displacement        glyph displacement
 * @throws IOException if building the text position fails
 */
@Override
protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code, String unicode, Vector displacement)
        throws IOException {
    TextPosition textPosition = getTextPosition(textRenderingMatrix, font, code, unicode, displacement);
    if (textPosition == null) {
        return;
    }

    String c = textPosition.getUnicode();
    // if c not printable, return
    if (!isPrintable(c)) {
        return;
    }
    if (c.equals(NBSP)) { // replace non-breaking space for space
        c = " ";
    }

    float h = textPosition.getHeightDir();
    // workaround a possible bug in PDFBox:
    // https://issues.apache.org/jira/browse/PDFBOX-1755
    float wos = textPosition.getWidthOfSpace();

    TextElement te = new TextElement(Utils.round(textPosition.getYDirAdj() - h, 2),
            Utils.round(textPosition.getXDirAdj(), 2), Utils.round(textPosition.getWidthDirAdj(), 2),
            Utils.round(h, 2), textPosition.getFont(), textPosition.getFontSize(), c,
            wos, textPosition.getDir());

    // the clip does not change while this glyph is handled, so compute it once
    Rectangle2D clip = this.currentClippingPath();

    if (clip.intersects(te)) {
        this.minCharWidth = (float) Math.min(this.minCharWidth, te.getWidth());
        this.minCharHeight = (float) Math.min(this.minCharHeight, te.getHeight());
        this.spatialIndex.add(te);
        this.characters.add(te);
    }

    if (this.isDebugClippingPaths()) {
        if (this.clippingPaths == null) {
            this.clippingPaths = new ArrayList<Shape>();
        }
        if (!this.clippingPaths.contains(clip)) {
            this.clippingPaths.add(clip);
        }
    }
}
/**
 * Appends a closed four-corner outline to the current path: a move to
 * {@code p0}, lines to {@code p1}, {@code p2} and {@code p3}, then a close.
 */
@Override
public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3) throws IOException {
    final GeneralPath outline = this.currentPath;
    outline.moveTo((float) p0.getX(), (float) p0.getY());
    for (Point2D corner : new Point2D[] {p1, p2, p3}) {
        outline.lineTo((float) corner.getX(), (float) corner.getY());
    }
    outline.closePath();
}
/**
 * Remembers the winding rule of a requested clip.
 *
 * @param windingRule winding rule to use when the clip is applied
 */
@Override
public void clip(int windingRule) throws IOException {
    // Only the rule is stored here; the clipping path itself is updated
    // when the succeeding operator is handled.
    this.clipWindingRule = windingRule;
}
/** Closes the current subpath of the path under construction. */
@Override
public void closePath() throws IOException {
    this.currentPath.closePath();
}
/**
 * Appends a cubic curve to the path under construction, with control points
 * {@code (x1, y1)}, {@code (x2, y2)} and end point {@code (x3, y3)}.
 */
@Override
public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3) throws IOException {
    this.currentPath.curveTo(x1, y1, x2, y2, x3, y3);
}
/**
 * Image callback; the body is empty, so nothing is recorded for images.
 */
@Override
public void drawImage(PDImage arg0) throws IOException {
// No-op: this engine does nothing with the image it is given.
}
/**
 * Ends the current path without painting it.
 *
 * <p>If a clip was requested through {@link #clip(int)}, the current path is
 * given that winding rule and intersected with the graphics state's clipping
 * path, and the pending request is cleared. The current path is reset in
 * every case.
 */
@Override
public void endPath() throws IOException {
    final int pendingRule = this.clipWindingRule;
    final boolean clipRequested = pendingRule != -1;
    if (clipRequested) {
        this.currentPath.setWindingRule(pendingRule);
        this.getGraphicsState().intersectClippingPath(this.currentPath);
        this.clipWindingRule = -1;
    }
    this.currentPath.reset();
}
/**
 * Handles a fill-and-stroke operation by processing the current path as a
 * fill. The winding-rule argument is not used.
 */
@Override
public void fillAndStrokePath(int arg0) throws IOException {
    final boolean asFill = true;
    this.strokeOrFillPath(asFill);
}
/**
 * Handles a fill operation by processing the current path as a fill.
 * The winding-rule argument is not used.
 */
@Override
public void fillPath(int arg0) throws IOException {
    final boolean asFill = true;
    this.strokeOrFillPath(asFill);
}
/**
 * Returns the current point of the path under construction, as reported by
 * {@link GeneralPath#getCurrentPoint()}.
 */
@Override
public Point2D getCurrentPoint() throws IOException {
    final Point2D current = this.currentPath.getCurrentPoint();
    return current;
}
/** Appends a straight segment ending at {@code (x, y)} to the path under construction. */
@Override
public void lineTo(float x, float y) throws IOException {
    this.currentPath.lineTo(x, y);
}
/** Starts a new subpath at {@code (x, y)} in the path under construction. */
@Override
public void moveTo(float x, float y) throws IOException {
    this.currentPath.moveTo(x, y);
}
/**
 * Shading-fill callback; the body is empty, so shading fills are ignored.
 */
@Override
public void shadingFill(COSName arg0) throws IOException {
// No-op: this engine does nothing with the shading it is given.
}
/** Handles a stroke operation by processing the current path as a non-fill. */
@Override
public void strokePath() throws IOException {
    final boolean asFill = false;
    this.strokeOrFillPath(asFill);
}
/**
 * Turns the straight segments of the current path into {@link Ruling}s and
 * then resets the path.
 *
 * <p>The path is discarded without producing rulings when ruling extraction
 * is disabled, when the path is empty, when its first segment is not a
 * MOVETO, or when it contains any segment other than MOVETO, LINETO or CLOSE.
 * Otherwise every LINETO, and every CLOSE (as a line back to the most recent
 * MOVETO), is clipped to the current clipping bounds and kept if the clipped
 * ruling is longer than 0.01.
 *
 * <p>A CLOSE that directly follows the initial MOVETO has no previous end
 * point; it is treated as a zero-length segment instead of dereferencing
 * {@code null}.
 *
 * @param isFill whether the path is being filled; currently not used
 */
private void strokeOrFillPath(boolean isFill) {
    GeneralPath path = this.currentPath;

    if (!this.extractRulingLines) {
        path.reset();
        return;
    }

    AffineTransform transform = this.getPageTransform();
    float[] c = new float[6];

    // skip paths that are empty, whose first operation is not a MOVETO,
    // or that contain operations other than LINETO, MOVETO or CLOSE
    PathIterator pi = path.getPathIterator(transform);
    if (pi.isDone() || pi.currentSegment(c) != PathIterator.SEG_MOVETO) {
        path.reset();
        return;
    }
    for (pi.next(); !pi.isDone(); pi.next()) {
        int kind = pi.currentSegment(c);
        if (kind != PathIterator.SEG_LINETO && kind != PathIterator.SEG_CLOSE
                && kind != PathIterator.SEG_MOVETO) {
            path.reset();
            return;
        }
    }

    // TODO: how to implement color filter?

    // skip the first path operation and save it as the starting position
    float[] first = new float[6];
    pi = path.getPathIterator(transform);
    pi.currentSegment(first);

    Point2D.Float startPos = new Point2D.Float(Utils.round(first[0], 2), Utils.round(first[1], 2));
    Point2D.Float lastMove = startPos;
    Point2D.Float endPos = null;
    PointComparator pc = new PointComparator();

    while (!pi.isDone()) {
        pi.next();
        // next() runs before the read, so the final iteration reads one
        // position past the last segment. The original note says this must
        // still be processed, "otherwise us-017.pdf fails the last value".
        // NOTE(review): reading past the end looks unintended and may depend
        // on what the iterator returns there - confirm against us-017.pdf
        // before changing the loop shape.
        int currentSegment;
        try {
            currentSegment = pi.currentSegment(c);
        } catch (IndexOutOfBoundsException ex) {
            continue;
        }

        switch (currentSegment) {
        case PathIterator.SEG_LINETO:
            endPos = new Point2D.Float(c[0], c[1]);
            addClippedRuling(orderedLine(pc, startPos, endPos));
            break;
        case PathIterator.SEG_MOVETO:
            lastMove = new Point2D.Float(c[0], c[1]);
            endPos = lastMove;
            break;
        case PathIterator.SEG_CLOSE:
            // according to PathIterator docs:
            // "the preceding subpath should be closed by appending a line
            // segment back to the point corresponding to the most recent
            // SEG_MOVETO."
            if (endPos == null) {
                // nothing was drawn since the initial MOVETO: the closing
                // segment has zero length, so there is no ruling to add
                endPos = lastMove;
            } else {
                addClippedRuling(orderedLine(pc, endPos, lastMove));
            }
            break;
        default:
            break;
        }
        startPos = endPos;
    }
    path.reset();
}

/** Builds a line whose first point is the one the comparator orders first. */
private Line2D.Float orderedLine(PointComparator pc, Point2D.Float a, Point2D.Float b) {
    return pc.compare(a, b) == -1 ? new Line2D.Float(a, b) : new Line2D.Float(b, a);
}

/** Clips the line to the current clipping bounds and keeps it if longer than 0.01. */
private void addClippedRuling(Line2D.Float line) {
    Rectangle2D clip = this.currentClippingPath();
    if (!line.intersects(clip)) {
        return;
    }
    Ruling r = new Ruling(line.getP1(), line.getP2()).intersect(clip);
    if (r.length() > 0.01) {
        this.rulings.add(r);
    }
}
/**
 * Returns the transform built in the constructor from the page's rotation
 * and crop box. The internal instance is returned, not a copy.
 */
public AffineTransform getPageTransform() {
    final AffineTransform transform = this.pageTransform;
    return transform;
}
/**
 * Returns the bounding box of the graphics state's current clipping path
 * after the page transform has been applied to it.
 */
public Rectangle2D currentClippingPath() {
    final Shape rawClip = this.getGraphicsState().getCurrentClippingPath();
    return this.getPageTransform().createTransformedShape(rawClip).getBounds2D();
}
/**
 * Tells whether the string contains at least one printable character.
 *
 * <p>A character counts as printable when it is not an ISO control
 * character, belongs to a known Unicode block, and that block is not
 * {@code SPECIALS}. An empty string is therefore not printable.
 *
 * @param s text to inspect
 * @return {@code true} if any character of {@code s} is printable
 */
private static boolean isPrintable(String s) {
    for (int idx = 0, len = s.length(); idx < len; idx++) {
        final char ch = s.charAt(idx);
        if (Character.isISOControl(ch)) {
            continue;
        }
        final Character.UnicodeBlock blk = Character.UnicodeBlock.of(ch);
        if (blk != null && blk != Character.UnicodeBlock.SPECIALS) {
            // one printable character is enough
            return true;
        }
    }
    return false;
}
/**
 * Builds the {@link TextPosition} describing one glyph.
 *
 * <p>The computation is marked in the code as coming from
 * LegacyPDFStreamEngine / PDFTextStreamEngine: it derives the glyph height
 * from the font bounding box (capped by the descriptor's cap height), the
 * glyph advance from the displacement, and the width of a space from the
 * font metrics, all converted to display units.
 *
 * <p>The {@code unicode} argument is not used as given: it is replaced by
 * {@code font.toUnicode(code, glyphList)}. If that yields {@code null}, the
 * character code is coerced to a char for {@link PDSimpleFont}s, and
 * {@code null} is returned for every other font.
 *
 * @param textRenderingMatrix text rendering matrix of the glyph
 * @param font                font the glyph is drawn with
 * @param code                character code in the font
 * @param unicode             Unicode text supplied by the caller (overwritten, see above)
 * @param displacement        glyph displacement
 * @return the text position, or {@code null} when a non-simple font has no
 *         Unicode mapping for {@code code}
 * @throws IOException propagated from the font and matrix calls made here
 */
private TextPosition getTextPosition(Matrix textRenderingMatrix, PDFont font, int code, String unicode,
Vector displacement) throws IOException {
// LegacyPDFStreamEngine
PDGraphicsState state = getGraphicsState();
Matrix ctm = state.getCurrentTransformationMatrix();
float fontSize = state.getTextState().getFontSize();
// horizontal scaling is divided by 100 to obtain a factor
float horizontalScaling = state.getTextState().getHorizontalScaling() / 100f;
Matrix textMatrix = getTextMatrix();
BoundingBox bbox = font.getBoundingBox();
if (bbox.getLowerLeftY() < Short.MIN_VALUE) {
// PDFBOX-2158 and PDFBOX-3130
// files by Salmat eSolutions / ClibPDF Library
// note: this writes the corrected value back into the bounding box object
bbox.setLowerLeftY(-(bbox.getLowerLeftY() + 65536));
}
// 1/2 the bbox is used as the height todo: why?
float glyphHeight = bbox.getHeight() / 2;
PDFontDescriptor fontDescriptor = font.getFontDescriptor();
if (fontDescriptor != null)
{
float capHeight = fontDescriptor.getCapHeight();
// prefer the cap height when it is set and smaller than half the bbox
if (capHeight != 0 && capHeight < glyphHeight)
{
glyphHeight = capHeight;
}
}
// transformPoint from glyph space -> text space
float height;
if (font instanceof PDType3Font) {
height = font.getFontMatrix().transformPoint(0, glyphHeight).y;
} else {
height = glyphHeight / 1000;
}
float displacementX = displacement.getX();
// the sorting algorithm is based on the width of the character. As the
// displacement for vertical characters doesn't provide any suitable value
// for it, we have to calculate our own
if (font.isVertical()) {
displacementX = font.getWidth(code) / 1000;
// there may be an additional scaling factor for true type fonts
TrueTypeFont ttf = null;
if (font instanceof PDTrueTypeFont) {
ttf = ((PDTrueTypeFont) font).getTrueTypeFont();
} else if (font instanceof PDType0Font) {
PDCIDFont cidFont = ((PDType0Font) font).getDescendantFont();
if (cidFont instanceof PDCIDFontType2) {
ttf = ((PDCIDFontType2) cidFont).getTrueTypeFont();
}
}
if (ttf != null && ttf.getUnitsPerEm() != 1000) {
displacementX *= 1000f / ttf.getUnitsPerEm();
}
}
// (modified) combined displacement, this is calculated *without* taking
// the character spacing and word spacing into account, due to legacy code
// in TextStripper
float tx = displacementX * fontSize * horizontalScaling;
float ty = displacement.getY() * fontSize;
// (modified) combined displacement matrix
Matrix td = Matrix.getTranslateInstance(tx, ty);
// (modified) text rendering matrix
Matrix nextTextRenderingMatrix = td.multiply(textMatrix).multiply(ctm); // text space -> device space
float nextX = nextTextRenderingMatrix.getTranslateX();
float nextY = nextTextRenderingMatrix.getTranslateY();
// (modified) width and height calculations
float dxDisplay = nextX - textRenderingMatrix.getTranslateX();
float dyDisplay = height * textRenderingMatrix.getScalingFactorY();
//
// start of the original method
//
// Note on variable names. There are three different units being used in
// this code. Character sizes are given in glyph units, text locations are
// initially given in text units, and we want to save the data in display
// units. The variable names should end with Text or Disp to represent if
// the values are in text or disp units (no glyph units are saved).
float glyphSpaceToTextSpaceFactor = 1 / 1000f;
if (font instanceof PDType3Font) {
glyphSpaceToTextSpaceFactor = font.getFontMatrix().getScaleX();
}
float spaceWidthText = 0;
try {
// to avoid crash as described in PDFBOX-614, see what the space
// displacement should be
spaceWidthText = font.getSpaceWidth() * glyphSpaceToTextSpaceFactor;
} catch (Throwable exception) {
// deliberately broad: any failure here only costs the space width,
// which falls back to the estimates below
this.log.warn("Error getting spaceWidthText", exception);
}
if (spaceWidthText == 0) {
spaceWidthText = font.getAverageFontWidth() * glyphSpaceToTextSpaceFactor;
// the average space width appears to be higher than necessary so
// make it smaller
spaceWidthText *= .80f;
}
if (spaceWidthText == 0) {
spaceWidthText = 1.0f; // if could not find font, use a generic value
}
// the space width has to be transformed into display units
float spaceWidthDisplay = spaceWidthText * textRenderingMatrix.getScalingFactorX();
// use our additional glyph list for Unicode mapping
unicode = font.toUnicode(code, glyphList);
// when there is no Unicode mapping available, Acrobat simply coerces the
// character code into Unicode, so we do the same. Subclasses of
// PDFStreamEngine don't necessarily want this, which is why we leave it
// until this point in PDFTextStreamEngine.
if (unicode == null) {
if (font instanceof PDSimpleFont) {
char c = (char) code;
unicode = new String(new char[]{c});
} else {
// Acrobat doesn't seem to coerce composite font's character codes,
// instead it skips them. See the "allah2.pdf" TestTextStripper file.
return null;
}
}
// adjust for cropbox if needed
// (translateMatrix is never assigned in this class, so the first branch is
// the one taken)
Matrix translatedTextRenderingMatrix;
if (translateMatrix == null) {
translatedTextRenderingMatrix = textRenderingMatrix;
} else {
translatedTextRenderingMatrix = Matrix.concatenate(translateMatrix, textRenderingMatrix);
nextX -= pageSize.getLowerLeftX();
nextY -= pageSize.getLowerLeftY();
}
return new TextPosition(pageRotation, pageSize.getWidth(), pageSize.getHeight(), translatedTextRenderingMatrix,
nextX, nextY, Math.abs(dyDisplay), dxDisplay, Math.abs(spaceWidthDisplay), unicode, new int[]{code},
font, fontSize, (int) (fontSize * textMatrix.getScalingFactorX()));
}
/**
 * Tells whether clipping-path debugging is enabled.
 *
 * @return the current value of the debug flag
 */
public boolean isDebugClippingPaths() {
    final boolean enabled = this.debugClippingPaths;
    return enabled;
}
/**
 * Enables or disables clipping-path debugging.
 *
 * <p>When debugging is switched on and {@code clippingPaths} has not been
 * assigned yet, an empty list is created so that recording clipping bounds
 * (and reading the list afterwards) does not hit a {@code null} reference.
 * An already assigned list is left untouched.
 *
 * @param debugClippingPaths {@code true} to record clipping bounds
 */
public void setDebugClippingPaths(boolean debugClippingPaths) {
    this.debugClippingPaths = debugClippingPaths;
    if (debugClippingPaths && this.clippingPaths == null) {
        this.clippingPaths = new ArrayList<Shape>();
    }
}
/**
 * Orders points by their y coordinate, then by their x coordinate, after
 * both coordinates have been passed through {@code Utils.round(value, 2)}.
 * The result is always exactly -1, 0 or 1.
 */
class PointComparator implements Comparator<Point2D> {
    @Override
    public int compare(Point2D o1, Point2D o2) {
        final float firstX = Utils.round(o1.getX(), 2);
        final float firstY = Utils.round(o1.getY(), 2);
        final float secondX = Utils.round(o2.getX(), 2);
        final float secondY = Utils.round(o2.getY(), 2);

        // y decides; x only breaks ties
        final int byY = order(firstY, secondY);
        return byY != 0 ? byY : order(firstX, secondX);
    }

    /** Returns 1 if {@code a > b}, -1 if {@code a < b}, and 0 otherwise. */
    private int order(float a, float b) {
        if (a > b) {
            return 1;
        }
        return a < b ? -1 : 0;
    }
}
}