SpreadsheetDetectionAlgorithm.java example

Explorer

tabula-java-master
- src
  - main
    - java
      - technology
        tabula
        Cell.java
        CohenSutherlandClipping.java
        CommandLineApp.java
        DummyGraphics2D.java
        HasText.java
        Line.java
        ObjectExtractor.java
        ObjectExtractorStreamEngine.java
        Page.java
        PageIterator.java
        ProjectionProfile.java
        QuickSort.java
        Rectangle.java
        RectangleSpatialIndex.java
        RectangularTextContainer.java
        Ruling.java
        Table.java
        TableWithRulingLines.java
        TextChunk.java
        TextElement.java
        Utils.java
        debug
        Debug.java
        detectors
        DetectionAlgorithm.java
        NurminenDetectionAlgorithm.java
        SpreadsheetDetectionAlgorithm.java
        extractors
        BasicExtractionAlgorithm.java
        ExtractionAlgorithm.java
        SpreadsheetExtractionAlgorithm.java
        json
        RulingSerializer.java
        TableSerializer.java
        TextChunkSerializer.java
        writers
        CSVWriter.java
        JSONWriter.java
        TSVWriter.java
        Writer.java
  - test
    - java
      - technology
        tabula
        TestBasicExtractor.java
        TestCell.java
        TestCellPosition.java
        TestCommandLineApp.java
        TestDebug.java
        TestLine.java
        TestObjectExtractor.java
        TestProjectionProfile.java
        TestRectangle.java
        TestRectangleSpatialIndex.java
        TestRuling.java
        TestSpreadsheetExtractor.java
        TestTableDetection.java
        TestTextElement.java
        TestUtils.java
        TestWriters.java
        UtilsForTesting.java

package technology.tabula.detectors;

import org.apache.pdfbox.pdmodel.PDDocument;
import technology.tabula.Cell;
import technology.tabula.Page;
import technology.tabula.Rectangle;
import technology.tabula.Ruling;
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;

import java.io.File;
import java.util.Collections;
import java.util.List;

/**
 * Created by matt on 2015-12-14.
 *
 * The basic spreadsheet table detection algorithm currently implemented in tabula (web).
 *
 * Tables are located from the intersections of a page's ruling lines: the horizontal and
 * vertical rulings are turned into cells, and the cells are then grouped into spreadsheet
 * areas by {@link SpreadsheetExtractionAlgorithm}.
 */
public class SpreadsheetDetectionAlgorithm implements DetectionAlgorithm {

    /**
     * Detects the table areas on the given page.
     *
     * @param page the page whose horizontal and vertical rulings are examined
     * @return the detected table areas, sorted by {@link Rectangle}'s natural ordering
     *         (intended to list tables from the top of the page to the bottom)
     */
    @Override
    public List<Rectangle> detect(Page page) {
        // Step 1: build cells out of the page's horizontal and vertical rulings.
        final List<Cell> gridCells = SpreadsheetExtractionAlgorithm.findCells(
                page.getHorizontalRulings(),
                page.getVerticalRulings());

        // Step 2: group those cells into spreadsheet (table) areas.
        final List<Rectangle> detectedAreas =
                new SpreadsheetExtractionAlgorithm().findSpreadsheetsFromCells(gridCells);

        // Step 3: callers expect the tables ordered from top to bottom on the page;
        // the list is sorted in place before being handed back.
        Collections.sort(detectedAreas);
        return detectedAreas;
    }
}