package technology.tabula.extractors;
import java.awt.geom.Point2D;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import technology.tabula.Cell;
import technology.tabula.Page;
import technology.tabula.Rectangle;
import technology.tabula.Ruling;
import technology.tabula.Table;
import technology.tabula.TableWithRulingLines;
import technology.tabula.TextElement;
import technology.tabula.Utils;
import technology.tabula.writers.CSVWriter;
/**
* @author manuel
*
*/
public class SpreadsheetExtractionAlgorithm implements ExtractionAlgorithm {
private static final float MAGIC_HEURISTIC_NUMBER = 0.65f;
private static final Comparator<Point2D> POINT_COMPARATOR = new Comparator<Point2D>() {
@Override
public int compare(Point2D arg0, Point2D arg1) {
int rv = 0;
float arg0X = Utils.round(arg0.getX(), 2);
float arg0Y = Utils.round(arg0.getY(), 2);
float arg1X = Utils.round(arg1.getX(), 2);
float arg1Y = Utils.round(arg1.getY(), 2);
if (arg0Y > arg1Y) {
rv = 1;
}
else if (arg0Y < arg1Y) {
rv = -1;
}
else if (arg0X > arg1X) {
rv = 1;
}
else if (arg0X < arg1X) {
rv = -1;
}
return rv;
}
};
private static final Comparator<Point2D> X_FIRST_POINT_COMPARATOR = new Comparator<Point2D>() {
@Override
public int compare(Point2D arg0, Point2D arg1) {
int rv = 0;
float arg0X = Utils.round(arg0.getX(), 2);
float arg0Y = Utils.round(arg0.getY(), 2);
float arg1X = Utils.round(arg1.getX(), 2);
float arg1Y = Utils.round(arg1.getY(), 2);
if (arg0X > arg1X) {
rv = 1;
}
else if (arg0X < arg1X) {
rv = -1;
}
else if (arg0Y > arg1Y) {
rv = 1;
}
else if (arg0Y < arg1Y) {
rv = -1;
}
return rv;
}
};
@Override
public List<? extends Table> extract(Page page) {
return extract(page, page.getRulings());
}
/**
* Extract a list of Table from page using rulings as separators
*/
public List<? extends Table> extract(Page page, List<Ruling> rulings) {
// split rulings into horizontal and vertical
List<Ruling> horizontalR = new ArrayList<Ruling>(),
verticalR = new ArrayList<Ruling>();
for (Ruling r: rulings) {
if (r.horizontal()) {
horizontalR.add(r);
}
else if (r.vertical()) {
verticalR.add(r);
}
}
horizontalR = Ruling.collapseOrientedRulings(horizontalR);
verticalR = Ruling.collapseOrientedRulings(verticalR);
List<Cell> cells = findCells(horizontalR, verticalR);
List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells);
List<TableWithRulingLines> spreadsheets = new ArrayList<TableWithRulingLines>();
for (Rectangle area: spreadsheetAreas) {
List<Cell> overlappingCells = new ArrayList<Cell>();
for (Cell c: cells) {
if (c.intersects(area)) {
c.setTextElements(TextElement.mergeWords(page.getText(c)));
overlappingCells.add(c);
}
}
List<Ruling> horizontalOverlappingRulings = new ArrayList<Ruling>();
for (Ruling hr: horizontalR) {
if (area.intersectsLine(hr)) {
horizontalOverlappingRulings.add(hr);
}
}
List<Ruling> verticalOverlappingRulings = new ArrayList<Ruling>();
for (Ruling vr: verticalR) {
if (area.intersectsLine(vr)) {
verticalOverlappingRulings.add(vr);
}
}
TableWithRulingLines t = new TableWithRulingLines(area, page, overlappingCells,
horizontalOverlappingRulings, verticalOverlappingRulings);
t.setExtractionAlgorithm(this);
spreadsheets.add(t);
}
Utils.sort(spreadsheets);
return spreadsheets;
}
public boolean isTabular(Page page) {
// if there's no text at all on the page, it's not a table
// (we won't be able to do anything with it though)
if(page.getText().isEmpty()){
return false;
}
// get minimal region of page that contains every character (in effect,
// removes white "margins")
Page minimalRegion = page.getArea(Utils.bounds(page.getText()));
List<? extends Table> tables = new SpreadsheetExtractionAlgorithm().extract(minimalRegion);
if (tables.size() == 0) {
return false;
}
Table table = tables.get(0);
int rowsDefinedByLines = table.getRows().size();
int colsDefinedByLines = table.getCols().size();
tables = new BasicExtractionAlgorithm().extract(minimalRegion);
if (tables.size() == 0) {
// TODO WHAT DO WE DO HERE?
}
table = tables.get(0);
int rowsDefinedWithoutLines = table.getRows().size();
int colsDefinedWithoutLines = table.getCols().size();
float ratio = (((float) colsDefinedByLines / colsDefinedWithoutLines) + ((float) rowsDefinedByLines / rowsDefinedWithoutLines)) / 2.0f;
return ratio > MAGIC_HEURISTIC_NUMBER && ratio < (1/MAGIC_HEURISTIC_NUMBER);
}
public static List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
List<Cell> cellsFound = new ArrayList<Cell>();
Map<Point2D, Ruling[]> intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines);
List<Point2D> intersectionPointsList = new ArrayList<Point2D>(intersectionPoints.keySet());
Collections.sort(intersectionPointsList, POINT_COMPARATOR);
boolean doBreak = false;
for (int i = 0; i < intersectionPointsList.size(); i++) {
Point2D topLeft = intersectionPointsList.get(i);
Ruling[] hv = intersectionPoints.get(topLeft);
doBreak = false;
// CrossingPointsDirectlyBelow( topLeft );
List<Point2D> xPoints = new ArrayList<Point2D>();
// CrossingPointsDirectlyToTheRight( topLeft );
List<Point2D> yPoints = new ArrayList<Point2D>();
for (Point2D p: intersectionPointsList.subList(i, intersectionPointsList.size())) {
if (p.getX() == topLeft.getX() && p.getY() > topLeft.getY()) {
xPoints.add(p);
}
if (p.getY() == topLeft.getY() && p.getX() > topLeft.getX()) {
yPoints.add(p);
}
}
outer:
for (Point2D xPoint: xPoints) {
if (doBreak) { break; }
// is there a vertical edge b/w topLeft and xPoint?
if (!hv[1].equals(intersectionPoints.get(xPoint)[1])) {
continue;
}
for (Point2D yPoint: yPoints) {
// is there an horizontal edge b/w topLeft and yPoint ?
if (!hv[0].equals(intersectionPoints.get(yPoint)[0])) {
continue;
}
Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY());
if (intersectionPoints.containsKey(btmRight)
&& intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0])
&& intersectionPoints.get(btmRight)[1].equals(intersectionPoints.get(yPoint)[1])) {
cellsFound.add(new Cell(topLeft, btmRight));
doBreak = true;
break outer;
}
}
}
}
// TODO create cells for vertical ruling lines with aligned endpoints at the top/bottom of a grid
// that aren't connected with an horizontal ruler?
// see: https://github.com/jazzido/tabula-extractor/issues/78#issuecomment-41481207
return cellsFound;
}
public static List<Rectangle> findSpreadsheetsFromCells(List<? extends Rectangle> cells) {
// via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon
List<Rectangle> rectangles = new ArrayList<Rectangle>();
Set<Point2D> pointSet = new HashSet<Point2D>();
Map<Point2D, Point2D> edgesH = new HashMap<Point2D, Point2D>();
Map<Point2D, Point2D> edgesV = new HashMap<Point2D, Point2D>();
int i = 0;
cells = new ArrayList<Rectangle>(new HashSet<Rectangle>(cells));
Collections.sort(cells);
for (Rectangle cell: cells) {
for(Point2D pt: cell.getPoints()) {
if (pointSet.contains(pt)) { // shared vertex, remove it
pointSet.remove(pt);
}
else {
pointSet.add(pt);
}
}
}
// X first sort
List<Point2D> pointsSortX = new ArrayList<Point2D>(pointSet);
Collections.sort(pointsSortX, X_FIRST_POINT_COMPARATOR);
// Y first sort
List<Point2D> pointsSortY = new ArrayList<Point2D>(pointSet);
Collections.sort(pointsSortY, POINT_COMPARATOR);
while (i < pointSet.size()) {
float currY = (float) pointsSortY.get(i).getY();
while (i < pointSet.size() && Utils.feq(pointsSortY.get(i).getY(), currY)) {
edgesH.put(pointsSortY.get(i), pointsSortY.get(i+1));
edgesH.put(pointsSortY.get(i+1), pointsSortY.get(i));
i += 2;
}
}
i = 0;
while (i < pointSet.size()) {
float currX = (float) pointsSortX.get(i).getX();
while (i < pointSet.size() && Utils.feq(pointsSortX.get(i).getX(), currX)) {
edgesV.put(pointsSortX.get(i), pointsSortX.get(i+1));
edgesV.put(pointsSortX.get(i+1), pointsSortX.get(i));
i += 2;
}
}
// Get all the polygons
List<List<PolygonVertex>> polygons = new ArrayList<List<PolygonVertex>>();
Point2D nextVertex;
while (!edgesH.isEmpty()) {
ArrayList<PolygonVertex> polygon = new ArrayList<PolygonVertex>();
Point2D first = edgesH.keySet().iterator().next();
polygon.add(new PolygonVertex(first, Direction.HORIZONTAL));
edgesH.remove(first);
while (true) {
PolygonVertex curr = polygon.get(polygon.size() - 1);
PolygonVertex lastAddedVertex;
if (curr.direction == Direction.HORIZONTAL) {
nextVertex = edgesV.get(curr.point);
edgesV.remove(curr.point);
lastAddedVertex = new PolygonVertex(nextVertex, Direction.VERTICAL);
polygon.add(lastAddedVertex);
}
else {
nextVertex = edgesH.get(curr.point);
edgesH.remove(curr.point);
lastAddedVertex = new PolygonVertex(nextVertex, Direction.HORIZONTAL);
polygon.add(lastAddedVertex);
}
if (lastAddedVertex.equals(polygon.get(0))) {
// closed polygon
polygon.remove(polygon.size() - 1);
break;
}
}
for (PolygonVertex vertex: polygon) {
edgesH.remove(vertex.point);
edgesV.remove(vertex.point);
}
polygons.add(polygon);
}
// calculate grid-aligned minimum area rectangles for each found polygon
for(List<PolygonVertex> poly: polygons) {
float top = java.lang.Float.MAX_VALUE;
float left = java.lang.Float.MAX_VALUE;
float bottom = java.lang.Float.MIN_VALUE;
float right = java.lang.Float.MIN_VALUE;
for (PolygonVertex pt: poly) {
top = (float) Math.min(top, pt.point.getY());
left = (float) Math.min(left, pt.point.getX());
bottom = (float) Math.max(bottom, pt.point.getY());
right = (float) Math.max(right, pt.point.getX());
}
rectangles.add(new Rectangle(top, left, right - left, bottom - top));
}
return rectangles;
}
@Override
public String toString() {
return "lattice";
}
private enum Direction {
HORIZONTAL,
VERTICAL
}
static class PolygonVertex {
Point2D point;
Direction direction;
public PolygonVertex(Point2D point, Direction direction) {
this.direction = direction;
this.point = point;
}
public boolean equals(Object other) {
if (this == other)
return true;
if (!(other instanceof PolygonVertex))
return false;
return this.point.equals(((PolygonVertex) other).point);
}
public int hashCode() {
return this.point.hashCode();
}
public String toString() {
return String.format("%s[point=%s,direction=%s]", this.getClass().getName(), this.point.toString(), this.direction.toString());
}
}
}