package technology.tabula.extractors;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Arrays;
import technology.tabula.Line;
import technology.tabula.Page;
import technology.tabula.Rectangle;
import technology.tabula.Ruling;
import technology.tabula.Table;
import technology.tabula.TextChunk;
import technology.tabula.TextElement;
/**
 * Extraction algorithm that assigns the text chunks of a page to table cells
 * by line (row) and by a list of x-axis column boundaries.
 *
 * <p>The column boundaries are either taken from explicitly supplied vertical
 * rulings or, when none are set, computed from the text itself by
 * {@link #columnPositions(List)}.
 *
 * <p>Instances hold the current vertical rulings as mutable state, which
 * {@link #extract(Page, List)} overwrites.
 */
public class BasicExtractionAlgorithm implements ExtractionAlgorithm {

    /** Vertical rulings used as column separators; {@code null} means "derive columns from the text". */
    private List<Ruling> verticalRulings = null;

    /** Creates an algorithm with no vertical rulings set. */
    public BasicExtractionAlgorithm() {
    }

    /**
     * Creates an algorithm that uses the given vertical rulings as column separators.
     *
     * @param verticalRulings rulings to use; the list is stored by reference but is
     *                        not reordered by this class. May be {@code null}, which
     *                        is equivalent to the no-argument constructor.
     */
    public BasicExtractionAlgorithm(List<Ruling> verticalRulings) {
        this.verticalRulings = verticalRulings;
    }

    /**
     * Extracts a table from {@code page}, using the given x positions as column separators.
     *
     * <p>One {@link Ruling} is built per position and the resulting list replaces this
     * instance's vertical rulings, so it is also used by later calls to
     * {@link #extract(Page)}.
     *
     * @param page                    the page to extract from
     * @param verticalRulingPositions x positions of the column separators
     * @return the result of {@link #extract(Page)} with the new rulings in place
     */
    public List<Table> extract(Page page, List<Float> verticalRulingPositions) {
        List<Ruling> rulings = new ArrayList<Ruling>(verticalRulingPositions.size());
        for (Float position : verticalRulingPositions) {
            rulings.add(new Ruling((float) page.getTop(), (float) position, 0.0f, (float) page.getHeight()));
        }
        this.verticalRulings = rulings;
        return this.extract(page);
    }

    /**
     * Extracts a single table from {@code page}.
     *
     * <p>The page text is merged into chunks and grouped into lines. Each chunk that is
     * not pure whitespace is added to the table at row {@code i} (the index of its line)
     * and at the column index of the first boundary that is not left of the chunk's left
     * edge. Chunks to the right of every boundary go into column {@code columns.size()}.
     *
     * @param page the page to extract from
     * @return a one-element list: {@code Table.EMPTY} when the page has no text,
     *         otherwise the populated table
     */
    @Override
    public List<Table> extract(Page page) {
        List<TextElement> textElements = page.getText();
        if (textElements.isEmpty()) {
            return Arrays.asList(new Table[] { Table.EMPTY });
        }

        List<TextChunk> textChunks = this.verticalRulings == null
                ? TextElement.mergeWords(textElements)
                : TextElement.mergeWords(textElements, this.verticalRulings);
        List<Line> lines = TextChunk.groupByLines(textChunks);

        List<Float> columns = this.verticalRulings != null
                ? sortedRulingPositions(this.verticalRulings)
                : columnPositions(lines);

        Comparator<TextChunk> byLeftEdge = new Comparator<TextChunk>() {
            @Override
            public int compare(TextChunk o1, TextChunk o2) {
                // Same ordering as Float.compareTo, without boxing.
                return Float.compare(o1.getLeft(), o2.getLeft());
            }
        };

        Table table = new Table(page, this);
        for (int i = 0; i < lines.size(); i++) {
            List<TextChunk> elements = lines.get(i).getTextElements();
            Collections.sort(elements, byLeftEdge);
            for (TextChunk tc : elements) {
                if (tc.isSameChar(Line.WHITE_SPACE_CHARS)) {
                    continue;
                }
                table.add(tc, i, columnIndexFor(tc.getLeft(), columns));
            }
        }
        return Arrays.asList(new Table[] { table });
    }

    /** Returns the left coordinates of {@code rulings} in ascending order, leaving {@code rulings} itself untouched. */
    private static List<Float> sortedRulingPositions(List<Ruling> rulings) {
        // Sort a copy: the list may belong to the caller and may be unmodifiable.
        List<Ruling> sorted = new ArrayList<Ruling>(rulings);
        Collections.sort(sorted, new Comparator<Ruling>() {
            @Override
            public int compare(Ruling arg0, Ruling arg1) {
                return Double.compare(arg0.getLeft(), arg1.getLeft());
            }
        });
        List<Float> positions = new ArrayList<Float>(sorted.size());
        for (Ruling ruling : sorted) {
            positions.add(ruling.getLeft());
        }
        return positions;
    }

    /**
     * Returns the index of the first boundary in {@code columns} that is greater than or
     * equal to {@code left}, or {@code columns.size()} when there is none.
     */
    private static int columnIndexFor(float left, List<Float> columns) {
        int index = 0;
        while (index < columns.size() && left > columns.get(index)) {
            index++;
        }
        return index;
    }

    /** Returns the text chunks of {@code line} that are not made up solely of whitespace characters. */
    private static List<TextChunk> nonWhitespaceChunks(Line line) {
        List<TextChunk> chunks = new ArrayList<TextChunk>();
        for (TextChunk tc : line.getTextElements()) {
            if (!tc.isSameChar(Line.WHITE_SPACE_CHARS)) {
                chunks.add(tc);
            }
        }
        return chunks;
    }

    /** @return the string {@code "stream"} */
    @Override
    public String toString() {
        return "stream";
    }

    /**
     * Computes column boundaries from the horizontal extent of the text in {@code lines}.
     *
     * <p>Each non-whitespace chunk of the first line seeds a region. For every following
     * line, chunks that horizontally overlap an existing region are merged into it, and
     * the remaining chunks start new regions. The boundaries are the right edges of the
     * final regions.
     *
     * @param lines must be a list of lines sorted by their +top+ attribute
     * @return a list of column boundaries (x axis) in ascending order; empty when
     *         {@code lines} is empty
     */
    public static List<java.lang.Float> columnPositions(List<Line> lines) {
        List<java.lang.Float> rv = new ArrayList<java.lang.Float>();
        if (lines.isEmpty()) {
            // No text lines means no columns; avoids lines.get(0) failing below.
            return rv;
        }

        List<Rectangle> regions = new ArrayList<Rectangle>();
        for (TextChunk tc : nonWhitespaceChunks(lines.get(0))) {
            Rectangle r = new Rectangle();
            r.setRect(tc);
            regions.add(r);
        }

        for (Line l : lines.subList(1, lines.size())) {
            List<TextChunk> lineTextElements = nonWhitespaceChunks(l);

            for (Rectangle cr : regions) {
                // Collect the overlaps first: merging widens cr, which must not
                // influence the overlap test for the rest of this line's chunks.
                List<TextChunk> overlaps = new ArrayList<TextChunk>();
                for (TextChunk te : lineTextElements) {
                    if (cr.horizontallyOverlaps(te)) {
                        overlaps.add(te);
                    }
                }
                for (TextChunk te : overlaps) {
                    cr.merge(te);
                }
                lineTextElements.removeAll(overlaps);
            }

            // Whatever matched no existing region starts a region of its own.
            for (TextChunk te : lineTextElements) {
                Rectangle r = new Rectangle();
                r.setRect(te);
                regions.add(r);
            }
        }

        for (Rectangle r : regions) {
            rv.add((float) r.getRight());
        }
        Collections.sort(rv);
        return rv;
    }
}