package technology.tabula.detectors;

import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.image.BufferedImage;
import java.awt.image.Raster;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.rendering.ImageType;

import technology.tabula.Line;
import technology.tabula.Page;
import technology.tabula.Rectangle;
import technology.tabula.Ruling;
import technology.tabula.TextChunk;
import technology.tabula.TextElement;
import technology.tabula.Utils;
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;

/**
 * Created by matt on 2015-12-17.
 * <p>
 * Attempt at an implementation of the table finding algorithm described by
 * Anssi Nurminen's master's thesis:
 * http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3
 * <p>
 * Detection runs in three passes: (1) ruling lines found in a rendered image of the page are turned
 * into cells and grouped into table areas, (2) those areas are widened using intersecting rulings and
 * text rows, and (3) remaining text rows are searched for aligned text edges that suggest tables
 * without any rulings.
 */
public class NurminenDetectionAlgorithm implements DetectionAlgorithm {

    private static final int GRAYSCALE_INTENSITY_THRESHOLD = 25;
    private static final int HORIZONTAL_EDGE_WIDTH_MINIMUM = 50;
    private static final int VERTICAL_EDGE_HEIGHT_MINIMUM = 10;
    private static final int CELL_CORNER_DISTANCE_MAXIMUM = 10;
    private static final float POINT_SNAP_DISTANCE_THRESHOLD = 8f;
    private static final float TABLE_PADDING_AMOUNT = 1.0f;
    private static final int REQUIRED_TEXT_LINES_FOR_EDGE = 4;
    private static final int REQUIRED_CELLS_FOR_TABLE = 4;
    private static final float IDENTICAL_TABLE_OVERLAP_RATIO = 0.9f;

    // resolution passed to Utils.pageConvertToImage; detect() halves image coordinates afterwards
    private static final int IMAGE_RENDER_DPI = 144;

    /**
     * Helper class that encapsulates a text edge
     */
    private static final class TextEdge extends Line2D.Float {
        // types of text edges
        public static final int LEFT = 0;
        public static final int MID = 1;
        public static final int RIGHT = 2;
        public static final int NUM_TYPES = 3;

        public int intersectingTextRowCount;

        public TextEdge(float x1, float y1, float x2, float y2) {
            super(x1, y1, x2, y2);
            this.intersectingTextRowCount = 0;
        }
    }

    /**
     * Helper container for all text edges on a page. The list index of each element matches the
     * corresponding {@link TextEdge} type constant (LEFT, MID, RIGHT).
     */
    private static final class TextEdges extends ArrayList<List<TextEdge>> {
        public TextEdges(List<TextEdge> leftEdges, List<TextEdge> midEdges, List<TextEdge> rightEdges) {
            super(3);
            this.add(leftEdges);
            this.add(midEdges);
            this.add(rightEdges);
        }
    }

    /**
     * Helper container for relevant text edge info. An {@code edgeType} of -1 means that no
     * relevant edge type was found.
     */
    private static final class RelevantEdges {
        public int edgeType;
        public int edgeCount;

        public RelevantEdges(int edgeType, int edgeCount) {
            this.edgeType = edgeType;
            this.edgeCount = edgeCount;
        }
    }

    /**
     * Finds the areas of the given page that look like tables.
     *
     * @param page the page to inspect
     * @return the detected table areas with duplicates removed; an empty list when the page could
     *         not be rendered or its text could not be stripped
     */
    @Override
    public List<Rectangle> detect(Page page) {

        // get horizontal & vertical lines
        // we get these from an image of the PDF and not the PDF itself because sometimes there are invisible PDF
        // instructions that are interpreted incorrectly as visible elements - we really want to capture what a
        // person sees when they look at the PDF
        BufferedImage image;
        PDPage pdfPage = page.getPDPage();
        try {
            image = Utils.pageConvertToImage(pdfPage, IMAGE_RENDER_DPI, ImageType.GRAY);
        } catch (IOException e) {
            return new ArrayList<Rectangle>();
        }

        List<Ruling> horizontalRulings = this.getHorizontalRulings(image);

        // now check the page for vertical lines, but remove the text first to make things less confusing
        // NOTE(review): removeText replaces the contents of pdfPage in place and they are not restored
        // afterwards - confirm that later consumers of this PDPage do not need the text operators
        PDDocument removeTextDocument = null;
        try {
            removeTextDocument = this.removeText(pdfPage);
            image = Utils.pageConvertToImage(pdfPage, IMAGE_RENDER_DPI, ImageType.GRAY);
        } catch (Exception e) {
            return new ArrayList<Rectangle>();
        } finally {
            if (removeTextDocument != null) {
                try {
                    removeTextDocument.close();
                } catch (IOException e) {
                    // the image has already been rendered, so a failed close does not affect detection
                    e.printStackTrace();
                }
            }
        }

        List<Ruling> verticalRulings = this.getVerticalRulings(image);

        List<Ruling> allEdges = new ArrayList<Ruling>(horizontalRulings);
        allEdges.addAll(verticalRulings);

        List<Rectangle> tableAreas = new ArrayList<Rectangle>();

        // if we found some edges, try to find some tables based on them
        if (!allEdges.isEmpty()) {
            // now we need to snap edge endpoints to a grid
            Utils.snapPoints(allEdges, POINT_SNAP_DISTANCE_THRESHOLD, POINT_SNAP_DISTANCE_THRESHOLD);

            // normalize the rulings to make sure snapping didn't create any wacky non-horizontal/vertical rulings
            for (List<Ruling> rulings : Arrays.asList(horizontalRulings, verticalRulings)) {
                for (Iterator<Ruling> iterator = rulings.iterator(); iterator.hasNext(); ) {
                    Ruling ruling = iterator.next();

                    ruling.normalize();
                    if (ruling.oblique()) {
                        iterator.remove();
                    }
                }
            }

            // merge the edge lines into rulings - this makes finding edges between crossing points in the next step easier
            // we use a larger pixel expansion than the normal spreadsheet extraction method to cover gaps in the
            // edge detection/pixel snapping steps
            horizontalRulings = Ruling.collapseOrientedRulings(horizontalRulings, 5);
            verticalRulings = Ruling.collapseOrientedRulings(verticalRulings, 5);

            // use the rulings and points to find cells
            List<? extends Rectangle> cells = SpreadsheetExtractionAlgorithm.findCells(horizontalRulings, verticalRulings);

            // then use those cells to make table areas
            tableAreas = this.getTableAreasFromCells(cells);
        }

        // next find any vertical rulings that intersect tables - sometimes these won't have completely been captured as
        // cells if there are missing horizontal lines (which there often are)
        // let's assume though that these lines should be part of the table
        for (Line2D.Float verticalRuling : verticalRulings) {
            for (Rectangle tableArea : tableAreas) {
                if (verticalRuling.intersects(tableArea)
                        && !(tableArea.contains(verticalRuling.getP1()) && tableArea.contains(verticalRuling.getP2()))) {

                    tableArea.setTop((float) Math.floor(Math.min(tableArea.getTop(), verticalRuling.getY1())));
                    tableArea.setBottom((float) Math.ceil(Math.max(tableArea.getBottom(), verticalRuling.getY2())));
                    break;
                }
            }
        }

        // the tabula Page coordinate space is half the size of the PDFBox image coordinate space
        // so halve the table area size before proceeding and add a bit of padding to make sure we capture everything
        for (Rectangle area : tableAreas) {
            area.x = (float) Math.floor(area.x / 2) - TABLE_PADDING_AMOUNT;
            area.y = (float) Math.floor(area.y / 2) - TABLE_PADDING_AMOUNT;
            area.width = (float) Math.ceil(area.width / 2) + TABLE_PADDING_AMOUNT;
            area.height = (float) Math.ceil(area.height / 2) + TABLE_PADDING_AMOUNT;
        }

        // we're going to want halved horizontal lines later too
        for (Line2D.Float ruling : horizontalRulings) {
            ruling.x1 = ruling.x1 / 2;
            ruling.y1 = ruling.y1 / 2;
            ruling.x2 = ruling.x2 / 2;
            ruling.y2 = ruling.y2 / 2;
        }

        // now look at text rows to help us find more tables and flesh out existing ones
        List<TextChunk> textChunks = TextElement.mergeWords(page.getText());
        List<Line> lines = TextChunk.groupByLines(textChunks);

        // first look for text rows that intersect an existing table - those lines should probably be part of the table
        for (Line textRow : lines) {
            for (Rectangle tableArea : tableAreas) {
                if (!tableArea.contains(textRow) && textRow.intersects(tableArea)) {
                    tableArea.setLeft((float) Math.floor(Math.min(textRow.getLeft(), tableArea.getLeft())));
                    tableArea.setRight((float) Math.ceil(Math.max(textRow.getRight(), tableArea.getRight())));
                }
            }
        }

        // get rid of tables that DO NOT intersect any text areas - these are likely graphs or some sort of graphic
        for (Iterator<Rectangle> iterator = tableAreas.iterator(); iterator.hasNext(); ) {
            Rectangle table = iterator.next();

            boolean intersectsText = false;
            for (Line textRow : lines) {
                if (table.intersects(textRow)) {
                    intersectsText = true;
                    break;
                }
            }

            if (!intersectsText) {
                iterator.remove();
            }
        }

        // lastly, there may be some tables that don't have any vertical rulings at all
        // we'll use text edges we've found to try and guess which text rows are part of a table

        // in his thesis nurminen goes through every row to try to assign a probability that the line is in a table
        // we're going to try a general heuristic instead, trying to find what type of edge (left/right/mid) intersects
        // the most text rows, and then use that magic number of "relevant" edges to decide what text rows should be
        // part of a table.
        boolean foundTable;

        do {
            foundTable = false;

            // get rid of any text lines contained within existing tables, this allows us to find more tables
            for (Iterator<Line> iterator = lines.iterator(); iterator.hasNext(); ) {
                Line textRow = iterator.next();
                for (Rectangle table : tableAreas) {
                    if (table.contains(textRow)) {
                        iterator.remove();
                        break;
                    }
                }
            }

            // get text edges from remaining lines in the document
            TextEdges textEdges = this.getTextEdges(lines);

            // find the relevant text edges (the ones we think define where a table is)
            RelevantEdges relevantEdgeInfo = this.getRelevantEdges(textEdges, lines);

            // we found something relevant so let's look for rows that fit our criteria
            if (relevantEdgeInfo.edgeType != -1) {
                // TextEdges stores its lists at the index of their edge type constant
                List<TextEdge> relevantEdges = textEdges.get(relevantEdgeInfo.edgeType);

                Rectangle table = this.getTableFromText(lines, relevantEdges, relevantEdgeInfo.edgeCount, horizontalRulings);

                if (table != null) {
                    foundTable = true;
                    tableAreas.add(table);
                }
            }
        } while (foundTable);

        return removeDuplicateTables(tableAreas);
    }

    /**
     * Drops every table area that duplicates one that appears earlier in the list.
     * <p>
     * Each candidate is compared against every table kept so far. The relative order of the
     * surviving tables is the order in which they were detected.
     *
     * @param tableAreas the detected table areas, possibly containing duplicates
     * @return a new list holding the first occurrence of each distinct table
     */
    private static List<Rectangle> removeDuplicateTables(List<Rectangle> tableAreas) {
        List<Rectangle> uniqueTables = new ArrayList<Rectangle>();

        for (Rectangle candidate : tableAreas) {
            boolean duplicate = false;
            for (Rectangle kept : uniqueTables) {
                if (isSameTable(candidate, kept)) {
                    duplicate = true;
                    break;
                }
            }

            if (!duplicate) {
                uniqueTables.add(candidate);
            }
        }

        return uniqueTables;
    }

    /**
     * Decides whether two table areas should be treated as the same table: they are equal, one
     * contains the other, or their overlap ratio reaches {@link #IDENTICAL_TABLE_OVERLAP_RATIO}.
     */
    private static boolean isSameTable(Rectangle o1, Rectangle o2) {
        if (o1.equals(o2)) {
            return true;
        }

        // o1 is "equal" to o2 if one contains all of the other
        if (o2.contains(o1) || o1.contains(o2)) {
            return true;
        }

        // otherwise see if these tables are "mostly" the same
        float overlap = o1.overlapRatio(o2);
        return overlap >= IDENTICAL_TABLE_OVERLAP_RATIO;
    }

    /**
     * Builds a table area out of consecutive text rows that cross roughly the expected number of
     * relevant text edges, then extends it using nearby horizontal rulings.
     *
     * @param lines             the text rows to examine, in the order given by the caller
     * @param relevantEdges     the text edges that are expected to cross table rows
     * @param relevantEdgeCount the number of relevant edges a table row is expected to cross
     * @param horizontalRulings horizontal rulings in page coordinates
     * @return the padded table area, or {@code null} if no row qualified
     */
    private Rectangle getTableFromText(List<Line> lines,
                                       List<TextEdge> relevantEdges,
                                       int relevantEdgeCount,
                                       List<Ruling> horizontalRulings) {

        Rectangle table = new Rectangle();

        Line prevRow = null;
        Line firstTableRow = null;
        Line lastTableRow = null;

        int tableSpaceCount = 0;
        float totalRowSpacing = 0;

        // for larger tables, be a little lenient on the number of relevant rows the text intersects
        // for smaller tables, not so much - otherwise we'll end up treating paragraphs as tables too
        int relativeEdgeDifferenceThreshold = 1;
        if (relevantEdgeCount <= 3) {
            relativeEdgeDifferenceThreshold = 0;
        }

        // go through the lines and find the ones that have the correct count of the relevant edges
        for (Line textRow : lines) {
            int numRelevantEdges = 0;

            if (firstTableRow != null && tableSpaceCount > 0) {
                // check to make sure this text row is within a line or so of the other lines already added
                // if it's not, we should stop the table here
                float tableLineThreshold = (totalRowSpacing / tableSpaceCount) * 2.5f;
                float lineDistance = textRow.getTop() - prevRow.getTop();

                if (lineDistance > tableLineThreshold) {
                    lastTableRow = prevRow;
                    break;
                }
            }

            for (TextEdge edge : relevantEdges) {
                if (textRow.intersectsLine(edge)) {
                    numRelevantEdges++;
                }
            }

            // see if we have a candidate text row
            if (numRelevantEdges >= (relevantEdgeCount - relativeEdgeDifferenceThreshold)) {
                // keep track of table row spacing
                if (prevRow != null && firstTableRow != null) {
                    tableSpaceCount++;
                    totalRowSpacing += (textRow.getTop() - prevRow.getTop());
                }

                // row is part of a table
                if (table.getArea() == 0) {
                    firstTableRow = textRow;
                    table.setRect(textRow);
                } else {
                    table.setLeft(Math.min(table.getLeft(), textRow.getLeft()));
                    table.setBottom(Math.max(table.getBottom(), textRow.getBottom()));
                    table.setRight(Math.max(table.getRight(), textRow.getRight()));
                }
            } else {
                // no dice
                // if we're at the end of the table, save the last row
                if (firstTableRow != null && lastTableRow == null) {
                    lastTableRow = prevRow;
                }
            }

            prevRow = textRow;
        }

        // if we don't have a table now, we won't after the next step either
        if (table.getArea() == 0) {
            return null;
        }

        if (lastTableRow == null) {
            // takes care of one-row tables or tables that end at the bottom of a page
            lastTableRow = prevRow;
        }

        // use the average row height and nearby horizontal lines to extend the table area
        float avgRowHeight;
        if (tableSpaceCount > 0) {
            avgRowHeight = totalRowSpacing / tableSpaceCount;
        } else {
            avgRowHeight = lastTableRow.height;
        }

        float rowHeightThreshold = avgRowHeight * 1.5f;

        // check lines after the bottom of the table
        for (Line2D.Float ruling : horizontalRulings) {

            if (ruling.getY1() < table.getBottom()) {
                continue;
            }

            float distanceFromTable = (float) ruling.getY1() - table.getBottom();
            if (distanceFromTable <= rowHeightThreshold) {
                // use this ruling to help define the table
                table.setBottom((float) Math.max(table.getBottom(), ruling.getY1()));
                table.setLeft((float) Math.min(table.getLeft(), ruling.getX1()));
                table.setRight((float) Math.max(table.getRight(), ruling.getX2()));
            } else {
                // no use checking any further
                break;
            }
        }

        // do the same for lines at the top, but make the threshold greater since table headings tend to be
        // larger to fit up to three-ish rows of text (at least but we don't want to grab too much)
        rowHeightThreshold = avgRowHeight * 3.8f;

        for (int i = horizontalRulings.size() - 1; i >= 0; i--) {
            Line2D.Float ruling = horizontalRulings.get(i);

            if (ruling.getY1() > table.getTop()) {
                continue;
            }

            float distanceFromTable = table.getTop() - (float) ruling.getY1();
            if (distanceFromTable <= rowHeightThreshold) {
                table.setTop((float) Math.min(table.getTop(), ruling.getY1()));
                table.setLeft((float) Math.min(table.getLeft(), ruling.getX1()));
                table.setRight((float) Math.max(table.getRight(), ruling.getX2()));
            } else {
                break;
            }
        }

        // add a bit of padding since the halved horizontal lines are a little fuzzy anyways
        table.setTop((float) Math.floor(table.getTop()) - TABLE_PADDING_AMOUNT);
        table.setBottom((float) Math.ceil(table.getBottom()) + TABLE_PADDING_AMOUNT);
        table.setLeft((float) Math.floor(table.getLeft()) - TABLE_PADDING_AMOUNT);
        table.setRight((float) Math.ceil(table.getRight()) + TABLE_PADDING_AMOUNT);

        return table;
    }

    /**
     * Picks the edge type (left/mid/right) and edge count that best describe a table in the
     * remaining text rows.
     *
     * @param textEdges the text edges found for {@code lines}
     * @param lines     the text rows the edges were computed from
     * @return the chosen edge type and count; the type is -1 (and the count 0) when nothing qualifies
     */
    private RelevantEdges getRelevantEdges(TextEdges textEdges, List<Line> lines) {
        // first we'll find the number of lines each type of edge crosses
        // row index = (number of text rows an edge crosses) - 1, column index = edge type
        int[][] edgeCountsPerLine = new int[lines.size()][TextEdge.NUM_TYPES];

        for (int edgeType = 0; edgeType < TextEdge.NUM_TYPES; edgeType++) {
            for (TextEdge edge : textEdges.get(edgeType)) {
                edgeCountsPerLine[edge.intersectingTextRowCount - 1][edgeType]++;
            }
        }

        // now let's find the relevant edge type and the number of those edges we should look for
        // we'll only take a minimum of two edges to look for tables
        int relevantEdgeType = -1;
        int relevantEdgeCount = 0;

        // start with the edges that cross the most rows and stop at the first row count that qualifies
        for (int i = edgeCountsPerLine.length - 1; i > 2; i--) {
            int leftCount = edgeCountsPerLine[i][TextEdge.LEFT];
            int midCount = edgeCountsPerLine[i][TextEdge.MID];
            int rightCount = edgeCountsPerLine[i][TextEdge.RIGHT];

            // note that left edges need more than two hits while right and mid edges need more than one
            if (leftCount > 2 && leftCount >= rightCount && leftCount >= midCount) {
                relevantEdgeCount = leftCount;
                relevantEdgeType = TextEdge.LEFT;
                break;
            }

            if (rightCount > 1 && rightCount >= leftCount && rightCount >= midCount) {
                relevantEdgeCount = rightCount;
                relevantEdgeType = TextEdge.RIGHT;
                break;
            }

            if (midCount > 1 && midCount >= rightCount && midCount >= leftCount) {
                relevantEdgeCount = midCount;
                relevantEdgeType = TextEdge.MID;
                break;
            }
        }

        return new RelevantEdges(relevantEdgeType, relevantEdgeCount);
    }

    /**
     * Gets all text edges (lines that align with the left, middle and right of chunks of text) that
     * extend uninterrupted over at least {@link #REQUIRED_TEXT_LINES_FOR_EDGE} chunks of text.
     *
     * @param lines the text rows to scan
     * @return the left, middle and right text edges that were found
     */
    private TextEdges getTextEdges(List<Line> lines) {
        List<TextEdge> leftTextEdges = new ArrayList<TextEdge>();
        List<TextEdge> midTextEdges = new ArrayList<TextEdge>();
        List<TextEdge> rightTextEdges = new ArrayList<TextEdge>();

        // edges that are still "open", keyed by their integer x position
        Map<Integer, List<TextChunk>> currLeftEdges = new HashMap<Integer, List<TextChunk>>();
        Map<Integer, List<TextChunk>> currMidEdges = new HashMap<Integer, List<TextChunk>>();
        Map<Integer, List<TextChunk>> currRightEdges = new HashMap<Integer, List<TextChunk>>();

        int lineCount = lines.size();

        for (Line textRow : lines) {
            for (TextChunk text : textRow.getTextElements()) {
                int left = (int) Math.floor(text.getLeft());
                int right = (int) Math.floor(text.getRight());
                int mid = left + ((right - left) / 2);

                // first put this chunk into any edge buckets it belongs to
                addToEdgeBucket(currLeftEdges, left, text);
                addToEdgeBucket(currMidEdges, mid, text);
                addToEdgeBucket(currRightEdges, right, text);

                // now see if this text chunk blows up any other edges
                closeInterruptedEdges(currLeftEdges, leftTextEdges, left, right, mid, false, lineCount);
                closeInterruptedEdges(currMidEdges, midTextEdges, left, right, mid, true, lineCount);
                closeInterruptedEdges(currRightEdges, rightTextEdges, left, right, mid, false, lineCount);
            }
        }

        // add the leftovers
        closeRemainingEdges(currLeftEdges, leftTextEdges, lineCount);
        closeRemainingEdges(currMidEdges, midTextEdges, lineCount);
        closeRemainingEdges(currRightEdges, rightTextEdges, lineCount);

        return new TextEdges(leftTextEdges, midTextEdges, rightTextEdges);
    }

    /**
     * Appends a chunk to the open edge at the given x position, creating the edge if needed.
     */
    private static void addToEdgeBucket(Map<Integer, List<TextChunk>> openEdges, int position, TextChunk chunk) {
        List<TextChunk> bucket = openEdges.get(position);
        if (bucket == null) {
            bucket = new ArrayList<TextChunk>();
            openEdges.put(position, bucket);
        }
        bucket.add(chunk);
    }

    /**
     * Closes every open edge whose x position lies strictly between {@code left} and {@code right},
     * i.e. every edge that the current text chunk runs across. Closed edges that are long enough are
     * added to {@code finishedEdges}; shorter ones are discarded.
     *
     * @param spareNearMid when true, edges within two units of {@code mid} are left open
     */
    private static void closeInterruptedEdges(Map<Integer, List<TextChunk>> openEdges,
                                              List<TextEdge> finishedEdges,
                                              int left,
                                              int right,
                                              int mid,
                                              boolean spareNearMid,
                                              int lineCount) {
        for (Iterator<Map.Entry<Integer, List<TextChunk>>> iterator = openEdges.entrySet().iterator(); iterator.hasNext(); ) {
            Map.Entry<Integer, List<TextChunk>> entry = iterator.next();
            int position = entry.getKey();

            boolean crossed = position > left && position < right;
            if (!crossed || (spareNearMid && Math.abs(position - mid) <= 2)) {
                continue;
            }

            List<TextChunk> edgeChunks = entry.getValue();
            iterator.remove();
            addEdgeIfLongEnough(finishedEdges, position, edgeChunks, lineCount);
        }
    }

    /**
     * Converts every edge that is still open into a finished edge, provided it is long enough.
     */
    private static void closeRemainingEdges(Map<Integer, List<TextChunk>> openEdges,
                                            List<TextEdge> finishedEdges,
                                            int lineCount) {
        for (Map.Entry<Integer, List<TextChunk>> entry : openEdges.entrySet()) {
            addEdgeIfLongEnough(finishedEdges, entry.getKey(), entry.getValue(), lineCount);
        }
    }

    /**
     * Adds a vertical edge at {@code position} spanning from the top of the first chunk to the bottom
     * of the last chunk, but only if at least {@link #REQUIRED_TEXT_LINES_FOR_EDGE} chunks line up.
     */
    private static void addEdgeIfLongEnough(List<TextEdge> finishedEdges,
                                            int position,
                                            List<TextChunk> edgeChunks,
                                            int lineCount) {
        if (edgeChunks.size() < REQUIRED_TEXT_LINES_FOR_EDGE) {
            return;
        }

        TextChunk first = edgeChunks.get(0);
        TextChunk last = edgeChunks.get(edgeChunks.size() - 1);

        TextEdge edge = new TextEdge(position, first.getTop(), position, last.getBottom());
        // capped at the line count because getRelevantEdges uses (count - 1) as an index into an
        // array that has one row per line
        edge.intersectingTextRowCount = Math.min(edgeChunks.size(), lineCount);

        finishedEdges.add(edge);
    }

    /**
     * Groups cells whose corners nearly touch and returns the bounding box of every group that has
     * at least {@link #REQUIRED_CELLS_FOR_TABLE} cells.
     *
     * @param cells the cells found from the rulings
     * @return one bounding rectangle per sufficiently large cell group
     */
    private List<Rectangle> getTableAreasFromCells(List<? extends Rectangle> cells) {
        List<List<Rectangle>> cellGroups = new ArrayList<List<Rectangle>>();
        for (Rectangle cell : cells) {
            boolean addedToGroup = false;
            Point2D[] candidateCorners = cell.getPoints();

            // a cell joins the first group that has a cell with a corner close to one of its own
            cellCheck:
            for (List<Rectangle> cellGroup : cellGroups) {
                for (Rectangle groupCell : cellGroup) {
                    Point2D[] groupCellCorners = groupCell.getPoints();

                    for (int i = 0; i < candidateCorners.length; i++) {
                        for (int j = 0; j < groupCellCorners.length; j++) {
                            if (candidateCorners[i].distance(groupCellCorners[j]) < CELL_CORNER_DISTANCE_MAXIMUM) {
                                cellGroup.add(cell);
                                addedToGroup = true;
                                break cellCheck;
                            }
                        }
                    }
                }
            }

            if (!addedToGroup) {
                ArrayList<Rectangle> cellGroup = new ArrayList<Rectangle>();
                cellGroup.add(cell);
                cellGroups.add(cellGroup);
            }
        }

        // create table areas based on cell group
        List<Rectangle> tableAreas = new ArrayList<Rectangle>();
        for (List<Rectangle> cellGroup : cellGroups) {
            // less than four cells should not make a table
            if (cellGroup.size() < REQUIRED_CELLS_FOR_TABLE) {
                continue;
            }

            // Float.MIN_VALUE is the smallest POSITIVE float, so the running maxima have to start
            // from -Float.MAX_VALUE instead
            float top = Float.MAX_VALUE;
            float left = Float.MAX_VALUE;
            float bottom = -Float.MAX_VALUE;
            float right = -Float.MAX_VALUE;

            for (Rectangle cell : cellGroup) {
                if (cell.getTop() < top) top = cell.getTop();
                if (cell.getLeft() < left) left = cell.getLeft();
                if (cell.getBottom() > bottom) bottom = cell.getBottom();
                if (cell.getRight() > right) right = cell.getRight();
            }

            tableAreas.add(new Rectangle(top, left, right - left, bottom - top));
        }

        return tableAreas;
    }

    /**
     * Gets all horizontal edges, which we'll define as a change in grayscale colour along a straight
     * line of a certain length.
     *
     * @param image the rendered page; only band 0 of each pixel is read
     * @return rulings in image coordinates that are wider than {@link #HORIZONTAL_EDGE_WIDTH_MINIMUM}
     */
    private List<Ruling> getHorizontalRulings(BufferedImage image) {
        ArrayList<Ruling> horizontalRulings = new ArrayList<Ruling>();

        Raster r = image.getRaster();
        int width = r.getWidth();
        int height = r.getHeight();

        // scan each column top to bottom looking for a jump in intensity
        for (int x = 0; x < width; x++) {

            int[] lastPixel = r.getPixel(x, 0, (int[]) null);

            for (int y = 1; y < height - 1; y++) {

                int[] currPixel = r.getPixel(x, y, (int[]) null);

                int diff = Math.abs(currPixel[0] - lastPixel[0]);
                if (diff > GRAYSCALE_INTENSITY_THRESHOLD) {
                    // we hit what could be a line
                    // don't bother scanning it if we've hit a pixel in the line before
                    boolean alreadyChecked = false;
                    for (Line2D.Float line : horizontalRulings) {
                        if (y == line.getY1() && x >= line.getX1() && x <= line.getX2()) {
                            alreadyChecked = true;
                            break;
                        }
                    }

                    if (alreadyChecked) {
                        lastPixel = currPixel;
                        continue;
                    }

                    // follow the edge to the right while the pixel still differs from the one above it
                    // and still matches the intensity of the pixel where the edge started
                    int lineX = x + 1;

                    while (lineX < width) {
                        int[] linePixel = r.getPixel(lineX, y, (int[]) null);
                        int[] abovePixel = r.getPixel(lineX, y - 1, (int[]) null);

                        if (Math.abs(linePixel[0] - abovePixel[0]) <= GRAYSCALE_INTENSITY_THRESHOLD
                                || Math.abs(currPixel[0] - linePixel[0]) > GRAYSCALE_INTENSITY_THRESHOLD) {
                            break;
                        }

                        lineX++;
                    }

                    int endX = lineX - 1;
                    int lineWidth = endX - x;
                    if (lineWidth > HORIZONTAL_EDGE_WIDTH_MINIMUM) {
                        horizontalRulings.add(new Ruling(new Point2D.Float(x, y), new Point2D.Float(endX, y)));
                    }
                }

                lastPixel = currPixel;
            }
        }

        return horizontalRulings;
    }

    /**
     * Gets all vertical edges, which we'll define as a change in grayscale colour along a straight
     * line of a certain length.
     *
     * @param image the rendered page; only band 0 of each pixel is read
     * @return rulings in image coordinates that are taller than {@link #VERTICAL_EDGE_HEIGHT_MINIMUM}
     */
    private List<Ruling> getVerticalRulings(BufferedImage image) {
        ArrayList<Ruling> verticalRulings = new ArrayList<Ruling>();

        Raster r = image.getRaster();
        int width = r.getWidth();
        int height = r.getHeight();

        // scan each row left to right looking for a jump in intensity
        for (int y = 0; y < height; y++) {

            int[] lastPixel = r.getPixel(0, y, (int[]) null);

            for (int x = 1; x < width - 1; x++) {

                int[] currPixel = r.getPixel(x, y, (int[]) null);

                int diff = Math.abs(currPixel[0] - lastPixel[0]);
                if (diff > GRAYSCALE_INTENSITY_THRESHOLD) {
                    // we hit what could be a line
                    // don't bother scanning it if we've hit a pixel in the line before
                    boolean alreadyChecked = false;
                    for (Line2D.Float line : verticalRulings) {
                        if (x == line.getX1() && y >= line.getY1() && y <= line.getY2()) {
                            alreadyChecked = true;
                            break;
                        }
                    }

                    if (alreadyChecked) {
                        lastPixel = currPixel;
                        continue;
                    }

                    // follow the edge downwards while the pixel still differs from the one to its left
                    // and still matches the intensity of the pixel where the edge started
                    int lineY = y + 1;

                    while (lineY < height) {
                        int[] linePixel = r.getPixel(x, lineY, (int[]) null);
                        int[] leftPixel = r.getPixel(x - 1, lineY, (int[]) null);

                        if (Math.abs(linePixel[0] - leftPixel[0]) <= GRAYSCALE_INTENSITY_THRESHOLD
                                || Math.abs(currPixel[0] - linePixel[0]) > GRAYSCALE_INTENSITY_THRESHOLD) {
                            break;
                        }

                        lineY++;
                    }

                    int endY = lineY - 1;
                    int lineLength = endY - y;
                    if (lineLength > VERTICAL_EDGE_HEIGHT_MINIMUM) {
                        verticalRulings.add(new Ruling(new Point2D.Float(x, y), new Point2D.Float(x, endY)));
                    }
                }

                lastPixel = currPixel;
            }
        }

        return verticalRulings;
    }

    /**
     * Rewrites the content stream of the given page without its {@code Tj}/{@code TJ} text-showing
     * operators and attaches the page to a new document.
     * <p>
     * The page passed in is modified: its contents are replaced by the stripped stream. The caller
     * is responsible for closing the returned document.
     * <p>
     * taken from http://www.docjar.com/html/api/org/apache/pdfbox/examples/util/RemoveAllText.java.html
     *
     * @param page the page whose text should be removed
     * @return the new document that owns the rewritten content stream
     * @throws IOException if the page cannot be parsed or the new stream cannot be written
     */
    private PDDocument removeText(PDPage page) throws IOException {

        PDFStreamParser parser = new PDFStreamParser(page);
        parser.parse();
        List<Object> tokens = parser.getTokens();
        List<Object> newTokens = new ArrayList<Object>();
        for (Object token : tokens) {
            if (token instanceof Operator) {
                Operator op = (Operator) token;
                if ("TJ".equals(op.getName()) || "Tj".equals(op.getName())) {
                    // remove the one argument to this operator
                    // (a malformed stream may have no operand in front of the operator)
                    if (!newTokens.isEmpty()) {
                        newTokens.remove(newTokens.size() - 1);
                    }
                    continue;
                }
            }
            newTokens.add(token);
        }

        PDDocument document = new PDDocument();
        boolean succeeded = false;
        try {
            document.addPage(page);

            PDStream newContents = new PDStream(document);
            OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE);
            try {
                ContentStreamWriter writer = new ContentStreamWriter(out);
                writer.writeTokens(newTokens);
            } finally {
                out.close();
            }
            page.setContents(newContents);

            succeeded = true;
            return document;
        } finally {
            if (!succeeded) {
                // nobody will receive the document, so release it here
                try {
                    document.close();
                } catch (IOException ignored) {
                    // the exception that made us give up is the one worth reporting
                }
            }
        }
    }
}