package technology.tabula;
import java.util.ArrayList;
import java.util.List;
import org.apache.pdfbox.pdmodel.font.PDFont;
@SuppressWarnings("serial")
public class TextElement extends Rectangle implements HasText {
private final String text;
private final PDFont font;
private float fontSize;
private float widthOfSpace, dir;
private static final float AVERAGE_CHAR_TOLERANCE = 0.3f;
public TextElement(float y, float x, float width, float height,
PDFont font, float fontSize, String c, float widthOfSpace) {
this(y, x, width, height, font, fontSize, c, widthOfSpace, 0f);
}
public TextElement(float y, float x, float width, float height,
PDFont font, float fontSize, String c, float widthOfSpace, float dir) {
super();
this.setRect(x, y, width, height);
this.text = c;
this.widthOfSpace = widthOfSpace;
this.fontSize = fontSize;
this.font = font;
this.dir = dir;
}
public String getText() {
return text;
}
public float getDirection() {
return dir;
}
public float getWidthOfSpace() {
return widthOfSpace;
}
public PDFont getFont() {
return font;
}
public float getFontSize() {
return fontSize;
}
public String toString() {
StringBuilder sb = new StringBuilder();
String s = super.toString();
sb.append(s.substring(0, s.length() - 1));
sb.append(String.format(",text=\"%s\"]", this.getText()));
return sb.toString();
}
@Override
public int hashCode() {
final int prime = 31;
int result = super.hashCode();
result = prime * result + java.lang.Float.floatToIntBits(dir);
result = prime * result + ((font == null) ? 0 : font.hashCode());
result = prime * result + java.lang.Float.floatToIntBits(fontSize);
result = prime * result + ((text == null) ? 0 : text.hashCode());
result = prime * result + java.lang.Float.floatToIntBits(widthOfSpace);
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (!super.equals(obj))
return false;
if (getClass() != obj.getClass())
return false;
TextElement other = (TextElement) obj;
if (java.lang.Float.floatToIntBits(dir) != java.lang.Float
.floatToIntBits(other.dir))
return false;
if (font == null) {
if (other.font != null)
return false;
} else if (!font.equals(other.font))
return false;
if (java.lang.Float.floatToIntBits(fontSize) != java.lang.Float
.floatToIntBits(other.fontSize))
return false;
if (text == null) {
if (other.text != null)
return false;
} else if (!text.equals(other.text))
return false;
if (java.lang.Float.floatToIntBits(widthOfSpace) != java.lang.Float
.floatToIntBits(other.widthOfSpace))
return false;
return true;
}
public static List<TextChunk> mergeWords(List<TextElement> textElements) {
return mergeWords(textElements, new ArrayList<Ruling>());
}
/**
* heuristically merge a list of TextElement into a list of TextChunk
* ported from from PDFBox's PDFTextStripper.writePage, with modifications.
* Here be dragons
*/
public static List<TextChunk> mergeWords(List<TextElement> textElements, List<Ruling> verticalRulings) {
List<TextChunk> textChunks = new ArrayList<TextChunk>();
if (textElements.isEmpty()) {
return textChunks;
}
// it's a problem that this `remove` is side-effecty
// other things depend on `textElements` and it can sometimes lead to the first textElement in textElement
// not appearing in the final output because it's been removed here.
// https://github.com/tabulapdf/tabula-java/issues/78
List<TextElement> copyOfTextElements = new ArrayList<TextElement>(textElements);
textChunks.add(new TextChunk(copyOfTextElements.remove(0)));
TextChunk firstTC = textChunks.get(0);
float previousAveCharWidth = (float) firstTC.getWidth();
float endOfLastTextX = (float) firstTC.getRight();
float maxYForLine = (float) firstTC.getBottom();
float maxHeightForLine = (float) firstTC.getHeight();
float minYTopForLine = (float) firstTC.getTop();
float lastWordSpacing = -1;
float wordSpacing, deltaSpace, averageCharWidth, deltaCharWidth;
float expectedStartOfNextWordX, dist;
TextElement sp, prevChar;
TextChunk currentChunk;
boolean sameLine, acrossVerticalRuling;
for (TextElement chr : copyOfTextElements) {
currentChunk = textChunks.get(textChunks.size() - 1);
prevChar = currentChunk.textElements.get(currentChunk.textElements.size() - 1);
// if same char AND overlapped, skip
if ((chr.getText().equals(prevChar.getText())) && (prevChar.overlapRatio(chr) > 0.5)) {
continue;
}
// if chr is a space that overlaps with prevChar, skip
if (chr.getText().equals(" ") && Utils.feq(prevChar.getLeft(), chr.getLeft()) && Utils.feq(prevChar.getTop(), chr.getTop())) {
continue;
}
// Resets the average character width when we see a change in font
// or a change in the font size
if ((chr.getFont() != prevChar.getFont()) || !Utils.feq(chr.getFontSize(), prevChar.getFontSize())) {
previousAveCharWidth = -1;
}
// is there any vertical ruling that goes across chr and prevChar?
acrossVerticalRuling = false;
for (Ruling r : verticalRulings) {
if (
(verticallyOverlapsRuling(prevChar, r) && verticallyOverlapsRuling(chr, r)) &&
(prevChar.x < r.getPosition() && chr.x > r.getPosition()) || (prevChar.x > r.getPosition() && chr.x < r.getPosition())
) {
acrossVerticalRuling = true;
break;
}
}
// Estimate the expected width of the space based on the
// space character with some margin.
wordSpacing = chr.getWidthOfSpace();
deltaSpace = 0;
if (java.lang.Float.isNaN(wordSpacing) || wordSpacing == 0) {
deltaSpace = java.lang.Float.MAX_VALUE;
} else if (lastWordSpacing < 0) {
deltaSpace = wordSpacing * 0.5f; // 0.5 == spacing tolerance
} else {
deltaSpace = ((wordSpacing + lastWordSpacing) / 2.0f) * 0.5f;
}
// Estimate the expected width of the space based on the
// average character width with some margin. This calculation does not
// make a true average (average of averages) but we found that it gave the
// best results after numerous experiments. Based on experiments we also found that
// .3 worked well.
if (previousAveCharWidth < 0) {
averageCharWidth = (float) (chr.getWidth() / chr.getText().length());
} else {
averageCharWidth = (float) ((previousAveCharWidth + (chr.getWidth() / chr.getText().length())) / 2.0f);
}
deltaCharWidth = averageCharWidth * AVERAGE_CHAR_TOLERANCE;
// Compares the values obtained by the average method and the wordSpacing method and picks
// the smaller number.
expectedStartOfNextWordX = -java.lang.Float.MAX_VALUE;
if (endOfLastTextX != -1) {
expectedStartOfNextWordX = endOfLastTextX + Math.min(deltaCharWidth, deltaSpace);
}
// new line?
sameLine = true;
if (!Utils.overlap((float) chr.getBottom(), chr.height, maxYForLine, maxHeightForLine)) {
endOfLastTextX = -1;
expectedStartOfNextWordX = -java.lang.Float.MAX_VALUE;
maxYForLine = -java.lang.Float.MAX_VALUE;
maxHeightForLine = -1;
minYTopForLine = java.lang.Float.MAX_VALUE;
sameLine = false;
}
endOfLastTextX = (float) chr.getRight();
// should we add a space?
if (!acrossVerticalRuling &&
sameLine &&
expectedStartOfNextWordX < chr.getLeft() &&
!prevChar.getText().endsWith(" ")) {
sp = new TextElement((float) prevChar.getTop(),
(float) prevChar.getLeft(),
(float) (expectedStartOfNextWordX - prevChar.getLeft()),
(float) prevChar.getHeight(),
prevChar.getFont(),
prevChar.getFontSize(),
" ",
prevChar.getWidthOfSpace());
currentChunk.add(sp);
} else {
sp = null;
}
maxYForLine = (float) Math.max(chr.getBottom(), maxYForLine);
maxHeightForLine = (float) Math.max(maxHeightForLine, chr.getHeight());
minYTopForLine = (float) Math.min(minYTopForLine, chr.getTop());
dist = (float) (chr.getLeft() - (sp != null ? sp.getRight() : prevChar.getRight()));
if (!acrossVerticalRuling &&
sameLine &&
(dist < 0 ? currentChunk.verticallyOverlaps(chr) : dist < wordSpacing)) {
currentChunk.add(chr);
} else { // create a new chunk
textChunks.add(new TextChunk(chr));
}
lastWordSpacing = wordSpacing;
previousAveCharWidth = (float) (sp != null ? (averageCharWidth + sp.getWidth()) / 2.0f : averageCharWidth);
}
List<TextChunk> textChunksSeparatedByDirectionality = new ArrayList<TextChunk>();
// count up characters by directionality
for (TextChunk chunk : textChunks) {
// choose the dominant direction
boolean isLtrDominant = chunk.isLtrDominant() != -1; // treat neutral as LTR
TextChunk dirChunk = chunk.groupByDirectionality(isLtrDominant);
textChunksSeparatedByDirectionality.add(dirChunk);
}
return textChunksSeparatedByDirectionality;
}
private static boolean verticallyOverlapsRuling(TextElement te, Ruling r) {
return Math.max(0, Math.min(te.getBottom(), r.getY2()) - Math.max(te.getTop(), r.getY1())) > 0;
}
}