package technology.tabula.detectors;
import org.apache.pdfbox.pdmodel.PDDocument;
import technology.tabula.Cell;
import technology.tabula.Page;
import technology.tabula.Rectangle;
import technology.tabula.Ruling;
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
import java.io.File;
import java.util.Collections;
import java.util.List;
/**
* Created by matt on 2015-12-14.
*
* This is the basic spreadsheet table detection algorithm currently implemented in tabula (web).
*
* It uses intersecting ruling lines to find tables.
*/
public class SpreadsheetDetectionAlgorithm implements DetectionAlgorithm {

    /**
     * Detects table areas on a page from its ruling lines.
     *
     * <p>Cells are first derived from the page's horizontal and vertical
     * rulings; those cells are then grouped into spreadsheet areas by
     * {@link SpreadsheetExtractionAlgorithm}.
     *
     * @param page the page whose rulings are inspected
     * @return the detected table areas, sorted by {@link Rectangle}'s natural
     *         ordering (intended to list tables from top to bottom on the page)
     */
    @Override
    public List<Rectangle> detect(Page page) {
        // Step 1: turn the ruling lines into individual cells.
        final List<Cell> detectedCells = SpreadsheetExtractionAlgorithm.findCells(
                page.getHorizontalRulings(),
                page.getVerticalRulings());

        // Step 2: group the cells into the spreadsheet areas that contain them.
        final List<Rectangle> tableAreas =
                new SpreadsheetExtractionAlgorithm().findSpreadsheetsFromCells(detectedCells);

        // Step 3: callers expect the tables in top-to-bottom page order.
        Collections.sort(tableAreas);
        return tableAreas;
    }
}