NurminenDetectionAlgorithm.java example

Explorer
tabula-java-master
- src
  - main
    - java
      - technology
        tabula
        Cell.java
        CohenSutherlandClipping.java
        CommandLineApp.java
        DummyGraphics2D.java
        HasText.java
        Line.java
        ObjectExtractor.java
        ObjectExtractorStreamEngine.java
        Page.java
        PageIterator.java
        ProjectionProfile.java
        QuickSort.java
        Rectangle.java
        RectangleSpatialIndex.java
        RectangularTextContainer.java
        Ruling.java
        Table.java
        TableWithRulingLines.java
        TextChunk.java
        TextElement.java
        Utils.java
        debug
        Debug.java
        detectors
        DetectionAlgorithm.java
        NurminenDetectionAlgorithm.java
        SpreadsheetDetectionAlgorithm.java
        extractors
        BasicExtractionAlgorithm.java
        ExtractionAlgorithm.java
        SpreadsheetExtractionAlgorithm.java
        json
        RulingSerializer.java
        TableSerializer.java
        TextChunkSerializer.java
        writers
        CSVWriter.java
        JSONWriter.java
        TSVWriter.java
        Writer.java
  - test
    - java
      - technology
        tabula
        TestBasicExtractor.java
        TestCell.java
        TestCellPosition.java
        TestCommandLineApp.java
        TestDebug.java
        TestLine.java
        TestObjectExtractor.java
        TestProjectionProfile.java
        TestRectangle.java
        TestRectangleSpatialIndex.java
        TestRuling.java
        TestSpreadsheetExtractor.java
        TestTableDetection.java
        TestTextElement.java
        TestUtils.java
        TestWriters.java
        UtilsForTesting.java
package technology.tabula.detectors;

import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.image.BufferedImage;
import java.awt.image.Raster;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.rendering.ImageType;

import technology.tabula.Line;
import technology.tabula.Page;
import technology.tabula.Rectangle;
import technology.tabula.Ruling;
import technology.tabula.TextChunk;
import technology.tabula.TextElement;
import technology.tabula.Utils;
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;

/**
 * Created by matt on 2015-12-17.
 * <p>
 * Attempt at an implementation of the table finding algorithm described by
 * Anssi Nurminen's master's thesis:
 * http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3
 */
public class NurminenDetectionAlgorithm implements DetectionAlgorithm {

    // minimum grayscale difference between neighbouring pixels for the image scan to treat the spot as a
    // possible ruling
    private static final int GRAYSCALE_INTENSITY_THRESHOLD = 25;
    // NOTE(review): not referenced in the visible part of this file - presumably the minimum pixel length of a
    // horizontal edge found in the image; confirm against getHorizontalRulings
    private static final int HORIZONTAL_EDGE_WIDTH_MINIMUM = 50;
    // NOTE(review): not referenced in the visible part of this file - presumably the minimum pixel length of a
    // vertical edge found in the image; confirm against getVerticalRulings
    private static final int VERTICAL_EDGE_HEIGHT_MINIMUM = 10;
    // two cells are put into the same group when any pair of their corners is closer than this
    private static final int CELL_CORNER_DISTANCE_MAXIMUM = 10;
    // passed to Utils.snapPoints as both threshold arguments when snapping ruling endpoints to a grid
    private static final float POINT_SNAP_DISTANCE_THRESHOLD = 8f;
    // padding applied around detected table areas
    private static final float TABLE_PADDING_AMOUNT = 1.0f;
    // minimum number of aligned text chunks needed before an alignment is recorded as a text edge
    private static final int REQUIRED_TEXT_LINES_FOR_EDGE = 4;
    // cell groups with fewer cells than this are not turned into table areas
    private static final int REQUIRED_CELLS_FOR_TABLE = 4;
    // two table areas whose overlap ratio reaches this value are treated as the same table
    private static final float IDENTICAL_TABLE_OVERLAP_RATIO = 0.9f;

    /**
     * A line segment that follows an alignment of text chunks, together with the number of
     * text rows it was found to run through.
     */
    private static final class TextEdge extends Line2D.Float {
        // kinds of text edge: aligned with the left, middle or right of text chunks
        public static final int LEFT = 0;
        public static final int MID = 1;
        public static final int RIGHT = 2;
        // how many kinds there are
        public static final int NUM_TYPES = 3;

        // set by whoever builds the edge; zero until then
        public int intersectingTextRowCount = 0;

        public TextEdge(float x1, float y1, float x2, float y2) {
            super(x1, y1, x2, y2);
        }
    }

    /**
     * Holds every text edge found on a page as three lists, stored at the positions
     * TextEdge.LEFT, TextEdge.MID and TextEdge.RIGHT.
     */
    private static final class TextEdges extends ArrayList<List<TextEdge>> {
        public TextEdges(List<TextEdge> leftEdges, List<TextEdge> midEdges, List<TextEdge> rightEdges) {
            super(TextEdge.NUM_TYPES);
            // the order of insertion has to follow the TextEdge type constants, which callers use as indices
            add(leftEdges);
            add(midEdges);
            add(rightEdges);
        }
    }

    /**
     * Simple pair describing the text edges considered relevant for table finding:
     * the kind of edge (one of the TextEdge type constants) and how many of them there are.
     */
    private static final class RelevantEdges {
        // kind of edge that was picked
        public int edgeType;
        // number of edges of that kind
        public int edgeCount;

        public RelevantEdges(int type, int count) {
            edgeType = type;
            edgeCount = count;
        }
    }

    /**
     * Finds the regions of a page that look like tables.
     * <p>
     * Works in three passes: (1) rulings are detected on rendered images of the page and combined into cells,
     * which are grouped into table areas; (2) those areas are stretched to cover vertical rulings and text
     * rows that cross them, and areas that touch no text are discarded; (3) text edges of the text rows not
     * yet inside a table are used to guess tables that have no rulings. Finally, areas that equal, contain,
     * are contained by, or mostly overlap an area found earlier are dropped.
     *
     * @param page the page to examine
     * @return the detected table areas; an empty list if the page could not be rendered
     */
    @Override
    public List<Rectangle> detect(Page page) {

        // get horizontal & vertical lines
        // we get these from an image of the PDF and not the PDF itself because sometimes there are invisible PDF
        // instructions that are interpreted incorrectly as visible elements - we really want to capture what a
        // person sees when they look at the PDF
        BufferedImage image;
        PDPage pdfPage = page.getPDPage();
        try {
            image = Utils.pageConvertToImage(pdfPage, 144, ImageType.GRAY);
        } catch (IOException e) {
            return new ArrayList<Rectangle>();
        }

        List<Ruling> horizontalRulings = this.getHorizontalRulings(image);

        // now check the page for vertical lines, but remove the text first to make things less confusing
        PDDocument removeTextDocument = null;
        try {
            removeTextDocument = this.removeText(pdfPage);

            // render the text-free copy, not the original page - otherwise the text we just stripped is
            // still in the image and the whole removal step has no effect
            // NOTE(review): assumes removeText returns a document whose first page is the stripped copy
            if (removeTextDocument != null && removeTextDocument.getNumberOfPages() > 0) {
                pdfPage = removeTextDocument.getPage(0);
            }
            image = Utils.pageConvertToImage(pdfPage, 144, ImageType.GRAY);
        } catch (Exception e) {
            return new ArrayList<Rectangle>();
        } finally {
            if (removeTextDocument != null) {
                try {
                    removeTextDocument.close();
                } catch (IOException e) {
                    // the temporary document has already been rendered (or rendering failed), so a failure
                    // to close it does not affect the result - report it and carry on
                    e.printStackTrace();
                }
            }
        }

        List<Ruling> verticalRulings = this.getVerticalRulings(image);

        List<Ruling> allEdges = new ArrayList<Ruling>(horizontalRulings);
        allEdges.addAll(verticalRulings);

        List<Rectangle> tableAreas = new ArrayList<Rectangle>();

        // if we found some edges, try to find some tables based on them
        if (!allEdges.isEmpty()) {
            // now we need to snap edge endpoints to a grid
            Utils.snapPoints(allEdges, POINT_SNAP_DISTANCE_THRESHOLD, POINT_SNAP_DISTANCE_THRESHOLD);

            // normalize the rulings to make sure snapping didn't create any wacky non-horizontal/vertical rulings
            for (List<Ruling> rulings : Arrays.asList(horizontalRulings, verticalRulings)) {
                for (Iterator<Ruling> iterator = rulings.iterator(); iterator.hasNext(); ) {
                    Ruling ruling = iterator.next();

                    ruling.normalize();
                    if (ruling.oblique()) {
                        iterator.remove();
                    }
                }
            }

            // merge the edge lines into rulings - this makes finding edges between crossing points in the next step easier
            // we use a larger pixel expansion than the normal spreadsheet extraction method to cover gaps in the
            // edge detection/pixel snapping steps
            horizontalRulings = Ruling.collapseOrientedRulings(horizontalRulings, 5);
            verticalRulings = Ruling.collapseOrientedRulings(verticalRulings, 5);

            // use the rulings and points to find cells
            List<? extends Rectangle> cells = SpreadsheetExtractionAlgorithm.findCells(horizontalRulings, verticalRulings);

            // then use those cells to make table areas
            tableAreas = this.getTableAreasFromCells(cells);
        }

        // next find any vertical rulings that intersect tables - sometimes these won't have completely been captured as
        // cells if there are missing horizontal lines (which there often are)
        // let's assume though that these lines should be part of the table
        for (Line2D.Float verticalRuling : verticalRulings) {
            for (Rectangle tableArea : tableAreas) {
                if (verticalRuling.intersects(tableArea) &&
                        !(tableArea.contains(verticalRuling.getP1()) && tableArea.contains(verticalRuling.getP2()))) {

                    tableArea.setTop((float) Math.floor(Math.min(tableArea.getTop(), verticalRuling.getY1())));
                    tableArea.setBottom((float) Math.ceil(Math.max(tableArea.getBottom(), verticalRuling.getY2())));
                    break;
                }
            }
        }

        // the tabula Page coordinate space is half the size of the PDFBox image coordinate space
        // so halve the table area size before proceeding and add a bit of padding to make sure we capture everything
        for (Rectangle area : tableAreas) {
            area.x = (float) Math.floor(area.x / 2) - TABLE_PADDING_AMOUNT;
            area.y = (float) Math.floor(area.y / 2) - TABLE_PADDING_AMOUNT;
            area.width = (float) Math.ceil(area.width / 2) + TABLE_PADDING_AMOUNT;
            area.height = (float) Math.ceil(area.height / 2) + TABLE_PADDING_AMOUNT;
        }

        // we're going to want halved horizontal lines later too
        for (Line2D.Float ruling : horizontalRulings) {
            ruling.x1 = ruling.x1 / 2;
            ruling.y1 = ruling.y1 / 2;
            ruling.x2 = ruling.x2 / 2;
            ruling.y2 = ruling.y2 / 2;
        }

        // now look at text rows to help us find more tables and flesh out existing ones
        List<TextChunk> textChunks = TextElement.mergeWords(page.getText());
        List<Line> lines = TextChunk.groupByLines(textChunks);

        // first look for text rows that intersect an existing table - those lines should probably be part of the table
        for (Line textRow : lines) {
            for (Rectangle tableArea : tableAreas) {
                if (!tableArea.contains(textRow) && textRow.intersects(tableArea)) {
                    tableArea.setLeft((float) Math.floor(Math.min(textRow.getLeft(), tableArea.getLeft())));
                    tableArea.setRight((float) Math.ceil(Math.max(textRow.getRight(), tableArea.getRight())));
                }
            }
        }

        // get rid of tables that DO NOT intersect any text areas - these are likely graphs or some sort of graphic
        for (Iterator<Rectangle> iterator = tableAreas.iterator(); iterator.hasNext(); ) {
            Rectangle table = iterator.next();

            boolean intersectsText = false;
            for (Line textRow : lines) {
                if (table.intersects(textRow)) {
                    intersectsText = true;
                    break;
                }
            }

            if (!intersectsText) {
                iterator.remove();
            }
        }

        // lastly, there may be some tables that don't have any vertical rulings at all
        // we'll use text edges we've found to try and guess which text rows are part of a table

        // in his thesis nurminen goes through every row to try to assign a probability that the line is in a table
        // we're going to try a general heuristic instead, trying to find what type of edge (left/right/mid) intersects
        // the most text rows, and then use that magic number of "relevant" edges to decide what text rows should be
        // part of a table.

        boolean foundTable;

        do {
            foundTable = false;

            // get rid of any text lines contained within existing tables, this allows us to find more tables
            for (Iterator<Line> iterator = lines.iterator(); iterator.hasNext(); ) {
                Line textRow = iterator.next();
                for (Rectangle table : tableAreas) {
                    if (table.contains(textRow)) {
                        iterator.remove();
                        break;
                    }
                }
            }

            // get text edges from remaining lines in the document
            TextEdges textEdges = this.getTextEdges(lines);
            List<TextEdge> leftTextEdges = textEdges.get(TextEdge.LEFT);
            List<TextEdge> midTextEdges = textEdges.get(TextEdge.MID);
            List<TextEdge> rightTextEdges = textEdges.get(TextEdge.RIGHT);

            // find the relevant text edges (the ones we think define where a table is)
            RelevantEdges relevantEdgeInfo = this.getRelevantEdges(textEdges, lines);

            // we found something relevant so let's look for rows that fit our criteria
            if (relevantEdgeInfo.edgeType != -1) {
                List<TextEdge> relevantEdges = null;
                switch (relevantEdgeInfo.edgeType) {
                    case TextEdge.LEFT:
                        relevantEdges = leftTextEdges;
                        break;
                    case TextEdge.MID:
                        relevantEdges = midTextEdges;
                        break;
                    case TextEdge.RIGHT:
                        relevantEdges = rightTextEdges;
                        break;
                }

                Rectangle table = this.getTableFromText(lines, relevantEdges, relevantEdgeInfo.edgeCount, horizontalRulings);

                if (table != null) {
                    foundTable = true;
                    tableAreas.add(table);
                }
            }
        } while (foundTable);

        // eliminate duplicate tables, keeping the one that was found first
        return dropDuplicateTables(tableAreas);
    }

    /**
     * Returns the given tables without duplicates, in their original order. A table is a duplicate when it
     * is the same table (see isSameTable) as one that appears earlier in the list.
     * <p>
     * Every candidate is checked against every table kept so far. A sorted set cannot do this job: "is the
     * same table" is not an ordering, so a tree would only compare a new table with the few tables on its
     * insertion path and could let duplicates through.
     */
    private static List<Rectangle> dropDuplicateTables(List<Rectangle> tables) {
        List<Rectangle> uniqueTables = new ArrayList<Rectangle>();
        for (Rectangle candidate : tables) {
            boolean duplicate = false;
            for (Rectangle kept : uniqueTables) {
                if (isSameTable(candidate, kept)) {
                    duplicate = true;
                    break;
                }
            }

            if (!duplicate) {
                uniqueTables.add(candidate);
            }
        }
        return uniqueTables;
    }

    /**
     * Two areas count as the same table when they are equal, when one contains the other, or when their
     * overlap ratio is at least IDENTICAL_TABLE_OVERLAP_RATIO.
     */
    private static boolean isSameTable(Rectangle candidate, Rectangle kept) {
        return candidate.equals(kept)
                || kept.contains(candidate)
                || candidate.contains(kept)
                || candidate.overlapRatio(kept) >= IDENTICAL_TABLE_OVERLAP_RATIO;
    }

    /**
     * Builds a table area out of the text rows that cross enough of the relevant text edges, then stretches
     * it to nearby horizontal rulings and pads it.
     *
     * @param lines             text rows to scan, in list order
     * @param relevantEdges     the text edges a row has to cross to count as a table row
     * @param relevantEdgeCount how many of those edges a table row is expected to cross
     * @param horizontalRulings horizontal rulings used to extend the area above and below
     *                          (NOTE(review): the early exits below look like they expect these sorted
     *                          from top to bottom - confirm with the caller)
     * @return the padded table area, or null when no row qualified
     */
    private Rectangle getTableFromText(List<Line> lines,
                                       List<TextEdge> relevantEdges,
                                       int relevantEdgeCount,
                                       List<Ruling> horizontalRulings) {

        Rectangle area = new Rectangle();

        Line previous = null;
        Line firstRow = null;
        Line lastRow = null;

        int gapCount = 0;
        float gapTotal = 0;

        // bigger tables may miss one relevant edge per row; with three edges or fewer no miss is allowed,
        // or ordinary paragraphs would start to look like tables
        int slack = (relevantEdgeCount <= 3) ? 0 : 1;
        int edgesNeeded = relevantEdgeCount - slack;

        for (Line row : lines) {
            // once row spacing is known, a row much further down than usual ends the table
            if (firstRow != null && gapCount > 0) {
                float maxGap = (gapTotal / gapCount) * 2.5f;
                float gap = row.getTop() - previous.getTop();

                if (gap > maxGap) {
                    lastRow = previous;
                    break;
                }
            }

            int crossed = 0;
            for (TextEdge edge : relevantEdges) {
                if (row.intersectsLine(edge)) {
                    crossed++;
                }
            }

            if (crossed < edgesNeeded) {
                // not a table row; the first such row after the table started marks the row before it
                if (firstRow != null && lastRow == null) {
                    lastRow = previous;
                }
            } else {
                // table row: record the spacing to the row before it once the table has started
                if (previous != null && firstRow != null) {
                    gapCount++;
                    gapTotal += (row.getTop() - previous.getTop());
                }

                if (area.getArea() == 0) {
                    // first row of the table
                    firstRow = row;
                    area.setRect(row);
                } else {
                    area.setLeft(Math.min(area.getLeft(), row.getLeft()));
                    area.setBottom(Math.max(area.getBottom(), row.getBottom()));
                    area.setRight(Math.max(area.getRight(), row.getRight()));
                }
            }

            previous = row;
        }

        // nothing collected - the ruling step below cannot create a table either
        if (area.getArea() == 0) {
            return null;
        }

        // single-row tables, or tables running to the end of the rows
        if (lastRow == null) {
            lastRow = previous;
        }

        // typical row height: mean spacing when available, else the height of the last row
        float typicalRowHeight = (gapCount > 0) ? (gapTotal / gapCount) : lastRow.height;

        // rulings at or below the bottom of the table, within one and a half rows
        float reach = typicalRowHeight * 1.5f;
        for (Line2D.Float ruling : horizontalRulings) {
            if (ruling.getY1() < area.getBottom()) {
                continue;
            }

            float distance = (float) ruling.getY1() - area.getBottom();
            if (!(distance <= reach)) {
                // too far away - stop looking
                break;
            }

            area.setBottom((float) Math.max(area.getBottom(), ruling.getY1()));
            area.setLeft((float) Math.min(area.getLeft(), ruling.getX1()));
            area.setRight((float) Math.max(area.getRight(), ruling.getX2()));
        }

        // rulings at or above the top, walking the list backwards; the reach is larger because table
        // headings tend to be taller (up to three-ish rows of text)
        reach = typicalRowHeight * 3.8f;
        int index = horizontalRulings.size() - 1;
        while (index >= 0) {
            Line2D.Float ruling = horizontalRulings.get(index);
            index--;

            if (ruling.getY1() > area.getTop()) {
                continue;
            }

            float distance = area.getTop() - (float) ruling.getY1();
            if (!(distance <= reach)) {
                break;
            }

            area.setTop((float) Math.min(area.getTop(), ruling.getY1()));
            area.setLeft((float) Math.min(area.getLeft(), ruling.getX1()));
            area.setRight((float) Math.max(area.getRight(), ruling.getX2()));
        }

        // round outwards and pad, the halved horizontal lines being a little fuzzy anyway
        area.setTop((float) Math.floor(area.getTop()) - TABLE_PADDING_AMOUNT);
        area.setBottom((float) Math.ceil(area.getBottom()) + TABLE_PADDING_AMOUNT);
        area.setLeft((float) Math.floor(area.getLeft()) - TABLE_PADDING_AMOUNT);
        area.setRight((float) Math.ceil(area.getRight()) + TABLE_PADDING_AMOUNT);

        return area;
    }

    /**
     * Picks the kind of text edge (left, mid or right) that should be used to look for a table, and how
     * many edges of that kind to expect.
     * <p>
     * Edges are tallied by the number of text rows they cross. The tallies are then scanned from the
     * largest row count downwards (stopping before row counts below four) and the first row count with a
     * dominant edge kind wins: left needs more than two edges, right and mid need more than one, and the
     * kind must be at least as frequent as the other two. Left is tried first, then right, then mid.
     *
     * @param textEdges the text edges found on the page
     * @param lines     the text rows the edges were built from; its size bounds the tally table
     * @return the chosen edge type and count, or type -1 with count 0 when nothing qualifies
     */
    private RelevantEdges getRelevantEdges(TextEdges textEdges, List<Line> lines) {
        // tally[rows - 1][type] = number of edges of that type crossing exactly that many rows
        int[][] tally = new int[lines.size()][TextEdge.NUM_TYPES];

        for (int type = 0; type < TextEdge.NUM_TYPES; type++) {
            for (TextEdge edge : textEdges.get(type)) {
                tally[edge.intersectingTextRowCount - 1][type]++;
            }
        }

        for (int row = tally.length - 1; row > 2; row--) {
            int leftCount = tally[row][TextEdge.LEFT];
            int midCount = tally[row][TextEdge.MID];
            int rightCount = tally[row][TextEdge.RIGHT];

            if (leftCount > 2 && leftCount >= rightCount && leftCount >= midCount) {
                return new RelevantEdges(TextEdge.LEFT, leftCount);
            }

            if (rightCount > 1 && rightCount >= leftCount && rightCount >= midCount) {
                return new RelevantEdges(TextEdge.RIGHT, rightCount);
            }

            if (midCount > 1 && midCount >= rightCount && midCount >= leftCount) {
                return new RelevantEdges(TextEdge.MID, midCount);
            }
        }

        // nothing stood out
        return new RelevantEdges(-1, 0);
    }

    /**
     * Finds all text edges: x positions that the left side, middle or right side of text chunks line up
     * on, without interruption, for at least REQUIRED_TEXT_LINES_FOR_EDGE chunks.
     * <p>
     * Chunks are visited row by row. Each chunk joins the open alignment at its own left, middle and right
     * x position (floored to an integer), and closes every open alignment that lies strictly between its
     * left and right side. A closed alignment becomes a text edge if it is long enough; alignments still
     * open after the last row are treated the same way.
     *
     * @param lines the text rows to examine
     * @return the left, middle and right text edges that were found
     */
    private TextEdges getTextEdges(List<Line> lines) {
        List<TextEdge> leftTextEdges = new ArrayList<TextEdge>();
        List<TextEdge> midTextEdges = new ArrayList<TextEdge>();
        List<TextEdge> rightTextEdges = new ArrayList<TextEdge>();

        // alignments that are still open, keyed by x position
        Map<Integer, List<TextChunk>> currLeftEdges = new HashMap<Integer, List<TextChunk>>();
        Map<Integer, List<TextChunk>> currMidEdges = new HashMap<Integer, List<TextChunk>>();
        Map<Integer, List<TextChunk>> currRightEdges = new HashMap<Integer, List<TextChunk>>();

        int lineCount = lines.size();

        for (Line textRow : lines) {
            for (TextChunk text : textRow.getTextElements()) {
                int left = (int) Math.floor(text.getLeft());
                int right = (int) Math.floor(text.getRight());
                int mid = left + ((right - left) / 2);

                // first put this chunk into any edge buckets it belongs to
                addToEdgeBucket(currLeftEdges, left, text);
                addToEdgeBucket(currMidEdges, mid, text);
                addToEdgeBucket(currRightEdges, right, text);

                // now see if this text chunk blows up any other edges
                closeInterruptedEdges(currLeftEdges, left, right, mid, false, leftTextEdges, lineCount);
                closeInterruptedEdges(currMidEdges, left, right, mid, true, midTextEdges, lineCount);
                closeInterruptedEdges(currRightEdges, left, right, mid, false, rightTextEdges, lineCount);
            }
        }

        // add the leftovers
        for (Map.Entry<Integer, List<TextChunk>> entry : currLeftEdges.entrySet()) {
            collectEdge(entry.getKey(), entry.getValue(), lineCount, leftTextEdges);
        }

        for (Map.Entry<Integer, List<TextChunk>> entry : currMidEdges.entrySet()) {
            collectEdge(entry.getKey(), entry.getValue(), lineCount, midTextEdges);
        }

        for (Map.Entry<Integer, List<TextChunk>> entry : currRightEdges.entrySet()) {
            collectEdge(entry.getKey(), entry.getValue(), lineCount, rightTextEdges);
        }

        return new TextEdges(leftTextEdges, midTextEdges, rightTextEdges);
    }

    /**
     * Appends a chunk to the open alignment at position x, starting a new alignment if there is none.
     */
    private static void addToEdgeBucket(Map<Integer, List<TextChunk>> openEdges, int x, TextChunk chunk) {
        List<TextChunk> bucket = openEdges.get(x);
        if (bucket == null) {
            bucket = new ArrayList<TextChunk>();
            openEdges.put(x, bucket);
        }
        bucket.add(chunk);
    }

    /**
     * Closes every open alignment whose x position lies strictly between left and right, i.e. that is cut
     * through by the current chunk. Closed alignments that are long enough are added to completedEdges.
     * When spareKeysNearMid is set, alignments within two units of the chunk's own middle are left open.
     */
    private static void closeInterruptedEdges(Map<Integer, List<TextChunk>> openEdges,
                                              int left,
                                              int right,
                                              int mid,
                                              boolean spareKeysNearMid,
                                              List<TextEdge> completedEdges,
                                              int lineCount) {
        for (Iterator<Map.Entry<Integer, List<TextChunk>>> iterator = openEdges.entrySet().iterator(); iterator.hasNext(); ) {
            Map.Entry<Integer, List<TextChunk>> entry = iterator.next();
            int x = entry.getKey();

            boolean cutThrough = x > left && x < right;
            if (cutThrough && spareKeysNearMid) {
                cutThrough = Math.abs(x - mid) > 2;
            }

            if (cutThrough) {
                iterator.remove();
                collectEdge(x, entry.getValue(), lineCount, completedEdges);
            }
        }
    }

    /**
     * Turns an alignment into a text edge and adds it to target, provided it holds at least
     * REQUIRED_TEXT_LINES_FOR_EDGE chunks. The edge runs at position x from the top of the first chunk to
     * the bottom of the last; its row count is the number of chunks, capped at the number of text rows.
     */
    private static void collectEdge(int x, List<TextChunk> edgeChunks, int lineCount, List<TextEdge> target) {
        if (edgeChunks.size() < REQUIRED_TEXT_LINES_FOR_EDGE) {
            return;
        }

        TextChunk first = edgeChunks.get(0);
        TextChunk last = edgeChunks.get(edgeChunks.size() - 1);

        TextEdge edge = new TextEdge(x, first.getTop(), x, last.getBottom());
        edge.intersectingTextRowCount = Math.min(edgeChunks.size(), lineCount);

        target.add(edge);
    }

    /**
     * Groups cells that lie next to each other and returns the bounding box of every group that is big
     * enough to be a table.
     * <p>
     * A cell joins the first existing group that has a cell with a corner closer than
     * CELL_CORNER_DISTANCE_MAXIMUM to one of its own corners; otherwise it starts a new group. Groups with
     * fewer than REQUIRED_CELLS_FOR_TABLE cells are ignored.
     *
     * @param cells the cells to group
     * @return one bounding rectangle per sufficiently large group of cells
     */
    private List<Rectangle> getTableAreasFromCells(List<? extends Rectangle> cells) {
        List<List<Rectangle>> cellGroups = new ArrayList<List<Rectangle>>();
        for (Rectangle cell : cells) {
            // the corners of the candidate do not change while we look for a group, so fetch them once
            Point2D[] candidateCorners = cell.getPoints();

            List<Rectangle> matchingGroup = null;
            for (List<Rectangle> cellGroup : cellGroups) {
                if (hasNearbyCorner(candidateCorners, cellGroup)) {
                    matchingGroup = cellGroup;
                    break;
                }
            }

            if (matchingGroup == null) {
                matchingGroup = new ArrayList<Rectangle>();
                cellGroups.add(matchingGroup);
            }
            matchingGroup.add(cell);
        }

        // create table areas based on cell group
        List<Rectangle> tableAreas = new ArrayList<Rectangle>();
        for (List<Rectangle> cellGroup : cellGroups) {
            // less than four cells should not make a table
            if (cellGroup.size() < REQUIRED_CELLS_FOR_TABLE) {
                continue;
            }

            // seed the maxima with the most negative float: Float.MIN_VALUE is the smallest POSITIVE float
            // and would give wrong bounds for cells at zero or negative coordinates
            float top = Float.MAX_VALUE;
            float left = Float.MAX_VALUE;
            float bottom = -Float.MAX_VALUE;
            float right = -Float.MAX_VALUE;

            for (Rectangle cell : cellGroup) {
                if (cell.getTop() < top) {
                    top = cell.getTop();
                }
                if (cell.getLeft() < left) {
                    left = cell.getLeft();
                }
                if (cell.getBottom() > bottom) {
                    bottom = cell.getBottom();
                }
                if (cell.getRight() > right) {
                    right = cell.getRight();
                }
            }

            tableAreas.add(new Rectangle(top, left, right - left, bottom - top));
        }

        return tableAreas;
    }

    /**
     * Tells whether any of the given corners is closer than CELL_CORNER_DISTANCE_MAXIMUM to a corner of
     * any cell in the group.
     */
    private static boolean hasNearbyCorner(Point2D[] corners, List<Rectangle> cellGroup) {
        for (Rectangle groupCell : cellGroup) {
            Point2D[] groupCellCorners = groupCell.getPoints();

            for (int i = 0; i < corners.length; i++) {
                for (int j = 0; j < groupCellCorners.length; j++) {
                    if (corners[i].distance(groupCellCorners[j]) < CELL_CORNER_DISTANCE_MAXIMUM) {
                        return true;
                    }
                }
            }
        }
        return false;
    }

    /**
     * Finds horizontal edges in the image and returns them as rulings.
     *
     * <p>Each column is walked top to bottom (rows {@code 1} to {@code height - 2}), comparing the
     * first band of every pixel with the pixel directly above it. Where the difference exceeds
     * {@code GRAYSCALE_INTENSITY_THRESHOLD} the edge is traced to the right for as long as it
     * keeps both its contrast with the row above and its own shade. Traces longer than
     * {@code HORIZONTAL_EDGE_WIDTH_MINIMUM} are kept.
     *
     * @param image the image to scan
     * @return the horizontal rulings found, in discovery order
     */
    private List<Ruling> getHorizontalRulings(BufferedImage image) {
        List<Ruling> found = new ArrayList<Ruling>();

        Raster raster = image.getRaster();
        int imageWidth = raster.getWidth();
        int imageHeight = raster.getHeight();

        for (int col = 0; col < imageWidth; col++) {
            int[] pixelAbove = raster.getPixel(col, 0, (int[]) null);

            for (int row = 1; row < imageHeight - 1; row++) {
                int[] pixelHere = raster.getPixel(col, row, (int[]) null);
                int[] comparedAgainst = pixelAbove;
                // whatever happens below, the next row is compared against this pixel
                pixelAbove = pixelHere;

                if (Math.abs(pixelHere[0] - comparedAgainst[0]) <= GRAYSCALE_INTENSITY_THRESHOLD) {
                    // no noticeable change in intensity: not an edge
                    continue;
                }

                // possible start of a line, unless an earlier trace already ran through this pixel
                boolean coveredByKnownRuling = false;
                for (Line2D.Float known : found) {
                    boolean sameRow = row == known.getY1();
                    if (sameRow && known.getX1() <= col && col <= known.getX2()) {
                        coveredByKnownRuling = true;
                        break;
                    }
                }
                if (coveredByKnownRuling) {
                    continue;
                }

                // follow the edge rightwards until it fades or changes shade
                int scanX = col + 1;
                for (; scanX < imageWidth; scanX++) {
                    int edgeGray = raster.getPixel(scanX, row, (int[]) null)[0];
                    int aboveGray = raster.getPixel(scanX, row - 1, (int[]) null)[0];

                    boolean contrastsWithRowAbove =
                            Math.abs(edgeGray - aboveGray) > GRAYSCALE_INTENSITY_THRESHOLD;
                    boolean sameShadeAsStart =
                            Math.abs(pixelHere[0] - edgeGray) <= GRAYSCALE_INTENSITY_THRESHOLD;
                    if (!(contrastsWithRowAbove && sameShadeAsStart)) {
                        break;
                    }
                }

                int lastX = scanX - 1;
                if (lastX - col > HORIZONTAL_EDGE_WIDTH_MINIMUM) {
                    Point2D.Float start = new Point2D.Float(col, row);
                    Point2D.Float end = new Point2D.Float(lastX, row);
                    found.add(new Ruling(start, end));
                }
            }
        }

        return found;
    }

    /**
     * Finds vertical edges in the image and returns them as rulings.
     *
     * <p>Each row is walked left to right (columns {@code 1} to {@code width - 2}), comparing the
     * first band of every pixel with the pixel directly to its left. Where the difference exceeds
     * {@code GRAYSCALE_INTENSITY_THRESHOLD} the edge is traced downwards for as long as it keeps
     * both its contrast with the column to the left and its own shade. Traces longer than
     * {@code VERTICAL_EDGE_HEIGHT_MINIMUM} are kept.
     *
     * @param image the image to scan
     * @return the vertical rulings found, in discovery order
     */
    private List<Ruling> getVerticalRulings(BufferedImage image) {
        List<Ruling> found = new ArrayList<Ruling>();

        Raster raster = image.getRaster();
        int imageWidth = raster.getWidth();
        int imageHeight = raster.getHeight();

        for (int row = 0; row < imageHeight; row++) {
            int[] pixelToLeft = raster.getPixel(0, row, (int[]) null);

            for (int col = 1; col < imageWidth - 1; col++) {
                int[] pixelHere = raster.getPixel(col, row, (int[]) null);
                int[] comparedAgainst = pixelToLeft;
                // whatever happens below, the next column is compared against this pixel
                pixelToLeft = pixelHere;

                if (Math.abs(pixelHere[0] - comparedAgainst[0]) <= GRAYSCALE_INTENSITY_THRESHOLD) {
                    // no noticeable change in intensity: not an edge
                    continue;
                }

                // possible start of a line, unless an earlier trace already ran through this pixel
                boolean coveredByKnownRuling = false;
                for (Line2D.Float known : found) {
                    boolean sameColumn = col == known.getX1();
                    if (sameColumn && known.getY1() <= row && row <= known.getY2()) {
                        coveredByKnownRuling = true;
                        break;
                    }
                }
                if (coveredByKnownRuling) {
                    continue;
                }

                // follow the edge downwards until it fades or changes shade
                int scanY = row + 1;
                for (; scanY < imageHeight; scanY++) {
                    int edgeGray = raster.getPixel(col, scanY, (int[]) null)[0];
                    int leftGray = raster.getPixel(col - 1, scanY, (int[]) null)[0];

                    boolean contrastsWithColumnToLeft =
                            Math.abs(edgeGray - leftGray) > GRAYSCALE_INTENSITY_THRESHOLD;
                    boolean sameShadeAsStart =
                            Math.abs(pixelHere[0] - edgeGray) <= GRAYSCALE_INTENSITY_THRESHOLD;
                    if (!(contrastsWithColumnToLeft && sameShadeAsStart)) {
                        break;
                    }
                }

                int lastY = scanY - 1;
                if (lastY - row > VERTICAL_EDGE_HEIGHT_MINIMUM) {
                    Point2D.Float start = new Point2D.Float(col, row);
                    Point2D.Float end = new Point2D.Float(col, lastY);
                    found.add(new Ruling(start, end));
                }
            }
        }

        return found;
    }


    /**
     * Strips the text-showing operations from a page's content stream and wraps the page in a
     * new document.
     *
     * <p>All four text-showing operators are removed together with their operands:
     * {@code Tj}, {@code TJ} and {@code '} (one operand each) and {@code "} (three operands).
     * Every other token is copied unchanged. Note that {@code page} itself is modified: its
     * contents are replaced with the filtered stream and it is added to the returned document.
     *
     * <p>Taken from
     * http://www.docjar.com/html/api/org/apache/pdfbox/examples/util/RemoveAllText.java.html
     *
     * @param page the page whose text should be removed
     * @return a new document containing the text-free page
     * @throws IOException if the content stream cannot be parsed or rewritten
     */
    private PDDocument removeText(PDPage page) throws IOException {

        PDFStreamParser parser = new PDFStreamParser(page);
        parser.parse();
        List<Object> tokens = parser.getTokens();
        List<Object> newTokens = new ArrayList<Object>();
        for (Object token : tokens) {
            if (token instanceof Operator) {
                String opName = ((Operator) token).getName();

                // number of operands that precede a text-showing operator; 0 for anything else
                int operandCount = 0;
                if ("TJ".equals(opName) || "Tj".equals(opName) || "'".equals(opName)) {
                    operandCount = 1;
                } else if ("\"".equals(opName)) {
                    operandCount = 3;
                }

                if (operandCount > 0) {
                    // Remove the operands that were already copied. In a malformed stream there
                    // may be fewer than expected, so never remove from an empty list and never
                    // remove a preceding operator.
                    for (int i = 0; i < operandCount; i++) {
                        if (newTokens.isEmpty()
                                || newTokens.get(newTokens.size() - 1) instanceof Operator) {
                            break;
                        }
                        newTokens.remove(newTokens.size() - 1);
                    }
                    continue;
                }
            }
            newTokens.add(token);
        }

        PDDocument document = new PDDocument();
        document.addPage(page);

        PDStream newContents = new PDStream(document);
        OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE);
        try {
            ContentStreamWriter writer = new ContentStreamWriter(out);
            writer.writeTokens(newTokens);
        } finally {
            // release the stream even when writing the tokens fails
            out.close();
        }
        page.setContents(newContents);

        return document;

    }
}