package technology.tabula;

import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * An ordered group of {@link TextElement}s handled as a single rectangular text container.
 * <p>
 * Every element added to the chunk is appended to {@link #getTextElements()} and merged into
 * the chunk's rectangle through the inherited {@code merge} (presumably growing the bounds to
 * cover the element -- see the superclass).
 */
@SuppressWarnings("serial")
public class TextChunk extends RectangularTextContainer<TextElement> implements HasText {

    /** Shared chunk with a zero-sized rectangle and no text elements. */
    public static final TextChunk EMPTY = new TextChunk(0, 0, 0, 0);

    /** The elements of this chunk, in insertion order. */
    List<TextElement> textElements = new ArrayList<>();

    /**
     * Creates a chunk with the given rectangle and no text elements.
     */
    public TextChunk(float top, float left, float width, float height) {
        super(top, left, width, height);
    }

    /**
     * Creates a chunk whose rectangle and content start out as the given element.
     */
    public TextChunk(TextElement textElement) {
        super(textElement.y, textElement.x, textElement.width, textElement.height);
        this.add(textElement);
    }

    /**
     * Creates a chunk holding all the given elements, in order.
     *
     * @param textElements the elements to add; must contain at least one element, otherwise
     *                     {@code get(0)} throws {@link IndexOutOfBoundsException}
     */
    public TextChunk(List<TextElement> textElements) {
        this(textElements.get(0));
        for (int i = 1; i < textElements.size(); i++) {
            this.add(textElements.get(i));
        }
    }

    /** Coarse text direction a bidirectional character type is folded into. */
    private enum DirectionalityOptions {
        LTR, NONE, RTL
    }

    /**
     * Maps each {@code Character.DIRECTIONALITY_*} value handled here to a coarse direction.
     * BCT = bidirectional character type. Types that are not listed (for instance any added by
     * later Java versions) yield {@code null} on lookup.
     */
    private static final Map<Byte, DirectionalityOptions> directionalities = new HashMap<>();

    static {
        directionalities.put(Character.DIRECTIONALITY_ARABIC_NUMBER, DirectionalityOptions.LTR);              // Weak BCT "AN" in the Unicode specification.
        directionalities.put(Character.DIRECTIONALITY_BOUNDARY_NEUTRAL, DirectionalityOptions.NONE);          // Weak BCT "BN" in the Unicode specification.
        directionalities.put(Character.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR, DirectionalityOptions.LTR);    // Weak BCT "CS" in the Unicode specification.
        directionalities.put(Character.DIRECTIONALITY_EUROPEAN_NUMBER, DirectionalityOptions.LTR);            // Weak BCT "EN" in the Unicode specification.
        directionalities.put(Character.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR, DirectionalityOptions.LTR);  // Weak BCT "ES" in the Unicode specification.
        directionalities.put(Character.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR, DirectionalityOptions.LTR); // Weak BCT "ET" in the Unicode specification.
        directionalities.put(Character.DIRECTIONALITY_LEFT_TO_RIGHT, DirectionalityOptions.LTR);              // Strong BCT "L" in the Unicode specification.
        directionalities.put(Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING, DirectionalityOptions.LTR);    // Strong BCT "LRE" in the Unicode specification.
        directionalities.put(Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE, DirectionalityOptions.LTR);     // Strong BCT "LRO" in the Unicode specification.
        directionalities.put(Character.DIRECTIONALITY_NONSPACING_MARK, DirectionalityOptions.NONE);           // Weak BCT "NSM" in the Unicode specification.
        directionalities.put(Character.DIRECTIONALITY_OTHER_NEUTRALS, DirectionalityOptions.NONE);            // Neutral BCT "ON" in the Unicode specification.
        directionalities.put(Character.DIRECTIONALITY_PARAGRAPH_SEPARATOR, DirectionalityOptions.NONE);       // Neutral BCT "B" in the Unicode specification.
        directionalities.put(Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT, DirectionalityOptions.NONE);    // Weak BCT "PDF" in the Unicode specification.
        directionalities.put(Character.DIRECTIONALITY_RIGHT_TO_LEFT, DirectionalityOptions.RTL);              // Strong BCT "R" in the Unicode specification.
        directionalities.put(Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC, DirectionalityOptions.RTL);       // Strong BCT "AL" in the Unicode specification.
        directionalities.put(Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING, DirectionalityOptions.RTL);    // Strong BCT "RLE" in the Unicode specification.
        directionalities.put(Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE, DirectionalityOptions.RTL);     // Strong BCT "RLO" in the Unicode specification.
        // Segment separators are neutral, so they carry no direction of their own -- same
        // treatment as the other neutral types (B, WS, ON) in this table.
        directionalities.put(Character.DIRECTIONALITY_SEGMENT_SEPARATOR, DirectionalityOptions.NONE);         // Neutral BCT "S" in the Unicode specification.
        directionalities.put(Character.DIRECTIONALITY_UNDEFINED, DirectionalityOptions.NONE);                 // Undefined BCT.
        directionalities.put(Character.DIRECTIONALITY_WHITESPACE, DirectionalityOptions.NONE);                // Neutral BCT "WS" in the Unicode specification.
    }

    /**
     * Splits this chunk into runs of a single directionality, reverses the RTL runs, and returns
     * the concatenation as a new chunk.
     * <p>
     * What we're doing here is <em>reversing</em> the Unicode bidi algorithm; in the language of
     * that algorithm, each run is a (maximal) directional run. Elements without a direction of
     * their own join the current run. A whitespace element joining a run whose direction is the
     * non-dominant one is placed at the front of that run instead of the back.
     * <p>
     * Only the first character of each element's text is inspected.
     *
     * @param isLtrDominant {@code true} if the surrounding text is predominantly left-to-right;
     *                      when {@code false} the order of the runs themselves is reversed too.
     *                      Must not be {@code null} (it is unboxed).
     * @return a new chunk holding this chunk's elements in the reordered sequence
     * @throws IllegalArgumentException if this chunk has no text elements
     */
    public TextChunk groupByDirectionality(Boolean isLtrDominant) {
        if (this.getTextElements().size() <= 0) {
            throw new IllegalArgumentException();
        }

        List<List<TextElement>> chunks = new ArrayList<>();
        List<TextElement> buff = new ArrayList<>();
        DirectionalityOptions buffDirectionality = DirectionalityOptions.NONE; // the directionality of the characters in buff

        for (TextElement te : this.getTextElements()) {
            // TODO: we need to loop over the text element's characters, because it is possible
            // for a text element to contain multiple characters.
            byte rawDirectionality = Character.getDirectionality(te.getText().charAt(0));
            DirectionalityOptions teDirectionality = directionalities.get(rawDirectionality);

            if (buff.size() == 0) {
                buff.add(te);
                buffDirectionality = teDirectionality;
                continue;
            }

            if (buffDirectionality == DirectionalityOptions.NONE) {
                // The run so far is direction-less: it adopts the direction of this element.
                buffDirectionality = teDirectionality;
            }

            if (teDirectionality == buffDirectionality || teDirectionality == DirectionalityOptions.NONE) {
                boolean isWhitespace = rawDirectionality == Character.DIRECTIONALITY_WHITESPACE;
                if (isWhitespace
                        && buffDirectionality == (isLtrDominant ? DirectionalityOptions.RTL : DirectionalityOptions.LTR)) {
                    buff.add(0, te);
                } else {
                    buff.add(te);
                }
            } else {
                // finish this run
                if (buffDirectionality == DirectionalityOptions.RTL) {
                    Collections.reverse(buff);
                }
                chunks.add(buff);
                // and start a new one
                buffDirectionality = teDirectionality;
                buff = new ArrayList<>();
                buff.add(te);
            }
        }

        if (buffDirectionality == DirectionalityOptions.RTL) {
            Collections.reverse(buff);
        }
        chunks.add(buff);

        if (!isLtrDominant) {
            Collections.reverse(chunks);
        }

        List<TextElement> everything = new ArrayList<>();
        for (List<TextElement> group : chunks) {
            everything.addAll(group);
        }
        return new TextChunk(everything);
    }

    /*
     * We're comparing based on the logical ordering of text here. When the two rectangles
     * overlap vertically by more than VERTICAL_COMPARISON_THRESHOLD, the one with the lower X
     * is "before" the other -- unless both are RTL-dominant, in which case the order is
     * reversed. Otherwise the bottoms are compared.
     */
    @Override
    public int compareTo(Rectangle other) {
        double thisBottom = this.getBottom();
        double otherBottom = other.getBottom();
        int rv;

        if (this.equals(other)) {
            return 0;
        }

        if (this.verticalOverlap(other) > VERTICAL_COMPARISON_THRESHOLD) {
            rv = Double.compare(this.getX(), other.getX());
            // reverse the ordering if both TextChunks are RTL
            if (this.isLtrDominant() == -1 && other.isLtrDominant() == -1) {
                rv = -1 * rv;
            }
        } else {
            rv = Double.compare(thisBottom, otherBottom);
        }
        return rv;
    }

    /**
     * Counts strongly-LTR against strongly-RTL characters over every character of every element.
     *
     * @return 1 if LTR characters are in the majority, -1 if RTL characters are, 0 on a tie
     *         (including when there are none of either)
     */
    public int isLtrDominant() {
        int ltrCnt = 0;
        int rtlCnt = 0;
        for (TextElement element : this.getTextElements()) {
            String elementText = element.getText();
            for (int j = 0; j < elementText.length(); j++) {
                byte dir = Character.getDirectionality(elementText.charAt(j));
                if ((dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT)
                        || (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING)
                        || (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE)) {
                    ltrCnt++;
                } else if ((dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT)
                        || (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC)
                        || (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING)
                        || (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE)) {
                    rtlCnt++;
                }
            }
        }
        return Integer.compare(ltrCnt, rtlCnt); // 1 is LTR, 0 is neutral, -1 is RTL
    }

    /**
     * Merges the other chunk's rectangle into this one via the superclass. The other chunk's
     * text elements are not copied.
     *
     * @return this chunk
     */
    public TextChunk merge(TextChunk other) {
        super.merge(other);
        return this;
    }

    /**
     * Appends the element and merges its rectangle into this chunk.
     */
    public void add(TextElement textElement) {
        this.textElements.add(textElement);
        this.merge(textElement);
    }

    /**
     * Appends every element of the list, in order.
     */
    public void add(List<TextElement> textElements) {
        for (TextElement te : textElements) {
            this.add(te);
        }
    }

    /**
     * @return the live list of elements backing this chunk (not a copy)
     */
    public List<TextElement> getTextElements() {
        return textElements;
    }

    /**
     * @return the concatenated text of all elements, NFKC-normalized and trimmed; the empty
     *         string when the chunk has no elements
     */
    public String getText() {
        if (this.textElements.size() == 0) {
            return "";
        }

        StringBuilder sb = new StringBuilder();
        for (TextElement te : this.textElements) {
            sb.append(te.getText());
        }
        return Normalizer.normalize(sb.toString(), Normalizer.Form.NFKC).trim();
    }

    /**
     * Returns the same text as {@link #getText()}. This class never inserts line breaks when
     * joining its elements, so the flag has no effect.
     *
     * @param useLineReturns ignored
     * @return the chunk's text, never {@code null}
     */
    @Override
    public String getText(boolean useLineReturns) {
        return getText();
    }

    /**
     * Returns true if text contained in this TextChunk is the same repeated character.
     * Also true when the chunk's text is empty.
     */
    public boolean isSameChar(Character c) {
        return isSameChar(new Character[]{c});
    }

    /**
     * Returns true if every character of this chunk's text is one of the given characters.
     * Also true when the chunk's text is empty.
     */
    public boolean isSameChar(Character[] c) {
        String s = this.getText();
        List<Character> chars = Arrays.asList(c);
        for (int i = 0; i < s.length(); i++) {
            if (!chars.contains(s.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Splits a TextChunk in two, at the position of the i-th TextElement.
     *
     * @param i index of the first element of the second half; must satisfy
     *          {@code 1 <= i < getTextElements().size()} so that neither half is empty
     * @return a two-element array: elements {@code [0, i)} and elements {@code [i, size)}
     * @throws IllegalArgumentException if {@code i} is out of that range
     */
    public TextChunk[] splitAt(int i) {
        if (i < 1 || i >= this.getTextElements().size()) {
            throw new IllegalArgumentException();
        }

        TextChunk[] rv = new TextChunk[]{
                new TextChunk(this.getTextElements().subList(0, i)),
                new TextChunk(this.getTextElements().subList(i, this.getTextElements().size()))
        };
        return rv;
    }

    /**
     * Removes runs of identical TextElements in this TextChunk.
     * For example, if the TextChunk contains this string of characters: "1234xxxxx56xx"
     * and c == 'x' and minRunLength == 4, this method will return a list of TextChunk
     * such that: ["1234", "56xx"]
     * <p>
     * Each element is represented by the first character of its text (after trimming when the
     * text is longer than one character). When no split happens the result is {@code [this]};
     * a qualifying run at the very end of the chunk is dropped and only the part before it is
     * returned.
     *
     * @param c            the character whose runs are removed
     * @param minRunLength the minimum run length that triggers a removal
     * @return the remaining pieces, in order
     */
    public List<TextChunk> squeeze(Character c, int minRunLength) {
        Character currentChar, lastChar = null;
        int subSequenceLength = 0, subSequenceStart = 0;
        TextChunk[] t;
        List<TextChunk> rv = new ArrayList<>();

        for (int i = 0; i < this.getTextElements().size(); i++) {
            TextElement textElement = this.getTextElements().get(i);
            String text = textElement.getText();
            if (text.length() > 1) {
                currentChar = text.trim().charAt(0);
            } else {
                currentChar = text.charAt(0);
            }

            if (lastChar != null && currentChar.equals(c) && lastChar.equals(currentChar)) {
                // Still inside a run of c.
                subSequenceLength++;
            } else {
                boolean runEnded = (lastChar != null && !lastChar.equals(currentChar))
                        || i + 1 == this.getTextElements().size();
                if (runEnded && subSequenceLength >= minRunLength) {
                    if (subSequenceStart == 0 && subSequenceLength <= this.getTextElements().size() - 1) {
                        // The run sits at the very start: nothing precedes it, keep only the tail.
                        t = this.splitAt(subSequenceLength);
                    } else {
                        // Keep what precedes the run; the tail still begins with the run and is
                        // dealt with by the recursive call.
                        t = this.splitAt(subSequenceStart);
                        rv.add(t[0]);
                    }
                    rv.addAll(t[1].squeeze(c, minRunLength)); // Lo and behold, recursion.
                    break;
                }
                subSequenceLength = 1;
                subSequenceStart = i;
            }
            lastChar = currentChar;
        }

        if (rv.isEmpty()) { // no splits occurred, hence this.squeeze() == [this]
            if (subSequenceLength >= minRunLength && subSequenceLength < this.textElements.size()) {
                // The chunk ends with a qualifying run: keep only what precedes it.
                TextChunk[] chunks = this.splitAt(subSequenceStart);
                rv.add(chunks[0]);
            } else {
                rv.add(this);
            }
        }

        return rv;
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = super.hashCode();
        result = prime * result + ((textElements == null) ? 0 : textElements.hashCode());
        return result;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!super.equals(obj)) {
            return false;
        }
        if (getClass() != obj.getClass()) {
            return false;
        }
        TextChunk other = (TextChunk) obj;
        if (textElements == null) {
            if (other.textElements != null) {
                return false;
            }
        } else if (!textElements.equals(other.textElements)) {
            return false;
        }
        return true;
    }

    /**
     * Returns true iff at least two of the given chunks have non-empty text and the text of
     * every non-empty chunk consists solely of one and the same character. Chunks with empty
     * text are skipped. A list of exactly one chunk always yields false.
     * <p>
     * The chunk that supplies the reference character is checked for uniformity as well, so
     * that e.g. {@code ["..a", "..."]} is not reported as all-same-char.
     */
    public static boolean allSameChar(List<TextChunk> textChunks) {
        if (textChunks.size() == 1) {
            return false;
        }

        int nonEmptyChunks = 0;
        char first = '\u0000';
        for (TextChunk tc : textChunks) {
            String text = tc.getText();
            if (text.length() == 0) {
                continue;
            }
            if (nonEmptyChunks == 0) {
                first = text.charAt(0);
            }
            if (!tc.isSameChar(first)) {
                return false;
            }
            nonEmptyChunks++;
        }
        return nonEmptyChunks >= 2;
    }

    /**
     * Groups consecutive chunks into lines: a chunk starts a new line when its vertical overlap
     * ratio with the current line is below 0.1. Lines that span more than 90% of the bounding
     * box width of all chunks and consist only of one repeated character are discarded once
     * complete. Every remaining line is passed through
     * {@code Line.removeRepeatedCharacters(line, ' ', 3)}.
     * <p>
     * The given list is only read; it is not modified.
     *
     * @param textChunks the chunks to group, in the order they should be visited
     * @return the resulting lines; empty if {@code textChunks} is empty
     */
    public static List<Line> groupByLines(List<TextChunk> textChunks) {
        List<Line> lines = new ArrayList<>();

        if (textChunks.size() == 0) {
            return lines;
        }

        float bbwidth = Rectangle.boundingBoxOf(textChunks).width;

        Line last = new Line();
        last.addTextChunk(textChunks.get(0));
        lines.add(last);

        // Walk the remaining chunks through a view instead of removing the head from the
        // caller's list, which would mutate the argument and fail on unmodifiable lists.
        for (TextChunk te : textChunks.subList(1, textChunks.size())) {
            if (last.verticalOverlapRatio(te) < 0.1) {
                if (isFullWidthRepeatedCharLine(last, bbwidth)) {
                    lines.remove(lines.size() - 1);
                }
                last = new Line();
                lines.add(last);
            }
            last.addTextChunk(te);
        }

        if (isFullWidthRepeatedCharLine(last, bbwidth)) {
            lines.remove(lines.size() - 1);
        }

        List<Line> rv = new ArrayList<>(lines.size());
        for (Line line : lines) {
            rv.add(Line.removeRepeatedCharacters(line, ' ', 3));
        }
        return rv;
    }

    /** True if the line spans more than 90% of {@code bbwidth} and is one repeated character. */
    private static boolean isFullWidthRepeatedCharLine(Line line, float bbwidth) {
        return line.width / bbwidth > 0.9 && TextChunk.allSameChar(line.getTextElements());
    }
}