TextChunk.java example

Explorer
tabula-java-master
- src
  - main
    - java
      - technology
        tabula
        Cell.java
        CohenSutherlandClipping.java
        CommandLineApp.java
        DummyGraphics2D.java
        HasText.java
        Line.java
        ObjectExtractor.java
        ObjectExtractorStreamEngine.java
        Page.java
        PageIterator.java
        ProjectionProfile.java
        QuickSort.java
        Rectangle.java
        RectangleSpatialIndex.java
        RectangularTextContainer.java
        Ruling.java
        Table.java
        TableWithRulingLines.java
        TextChunk.java
        TextElement.java
        Utils.java
        debug
        Debug.java
        detectors
        DetectionAlgorithm.java
        NurminenDetectionAlgorithm.java
        SpreadsheetDetectionAlgorithm.java
        extractors
        BasicExtractionAlgorithm.java
        ExtractionAlgorithm.java
        SpreadsheetExtractionAlgorithm.java
        json
        RulingSerializer.java
        TableSerializer.java
        TextChunkSerializer.java
        writers
        CSVWriter.java
        JSONWriter.java
        TSVWriter.java
        Writer.java
  - test
    - java
      - technology
        tabula
        TestBasicExtractor.java
        TestCell.java
        TestCellPosition.java
        TestCommandLineApp.java
        TestDebug.java
        TestLine.java
        TestObjectExtractor.java
        TestProjectionProfile.java
        TestRectangle.java
        TestRectangleSpatialIndex.java
        TestRuling.java
        TestSpreadsheetExtractor.java
        TestTableDetection.java
        TestTextElement.java
        TestUtils.java
        TestWriters.java
        UtilsForTesting.java
package technology.tabula;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Collections;
import java.util.HashMap;
import java.text.Normalizer;

@SuppressWarnings("serial")
public class TextChunk extends RectangularTextContainer<TextElement> implements HasText {
    public static final TextChunk EMPTY = new TextChunk(0, 0, 0, 0);
    List<TextElement> textElements = new ArrayList<TextElement>();

    public TextChunk(float top, float left, float width, float height) {
        super(top, left, width, height);
    }

    public TextChunk(TextElement textElement) {
        super(textElement.y, textElement.x, textElement.width, textElement.height);
        this.add(textElement);
    }

    public TextChunk(List<TextElement> textElements) {
        this(textElements.get(0));
        for (int i = 1; i < textElements.size(); i++) {
            this.add(textElements.get(i));
        }
    }

    private enum DirectionalityOptions {
        LTR, NONE, RTL
    }

    // I hate Java so bad.
    // we're making this HashMap static! which requires really funky initialization per http://stackoverflow.com/questions/6802483/how-to-directly-initialize-a-hashmap-in-a-literal-way/6802502#6802502
    private static HashMap<Byte, DirectionalityOptions> directionalities;

    static {
        directionalities = new HashMap<Byte, DirectionalityOptions>();
        // BCT = bidirectional character type
        directionalities.put(java.lang.Character.DIRECTIONALITY_ARABIC_NUMBER, DirectionalityOptions.LTR);               // Weak BCT    "AN" in the Unicode specification.
        directionalities.put(java.lang.Character.DIRECTIONALITY_BOUNDARY_NEUTRAL, DirectionalityOptions.NONE);            // Weak BCT    "BN" in the Unicode specification.
        directionalities.put(java.lang.Character.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR, DirectionalityOptions.LTR);     // Weak BCT    "CS" in the Unicode specification.
        directionalities.put(java.lang.Character.DIRECTIONALITY_EUROPEAN_NUMBER, DirectionalityOptions.LTR);             // Weak BCT    "EN" in the Unicode specification.
        directionalities.put(java.lang.Character.DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR, DirectionalityOptions.LTR);   // Weak BCT    "ES" in the Unicode specification.
        directionalities.put(java.lang.Character.DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR, DirectionalityOptions.LTR);  // Weak BCT    "ET" in the Unicode specification.
        directionalities.put(java.lang.Character.DIRECTIONALITY_LEFT_TO_RIGHT, DirectionalityOptions.LTR);              // Strong BCT  "L" in the Unicode specification.
        directionalities.put(java.lang.Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING, DirectionalityOptions.LTR);     // Strong BCT  "LRE" in the Unicode specification.
        directionalities.put(java.lang.Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE, DirectionalityOptions.LTR);      // Strong BCT  "LRO" in the Unicode specification.
        directionalities.put(java.lang.Character.DIRECTIONALITY_NONSPACING_MARK, DirectionalityOptions.NONE);             // Weak BCT    "NSM" in the Unicode specification.
        directionalities.put(java.lang.Character.DIRECTIONALITY_OTHER_NEUTRALS, DirectionalityOptions.NONE);              // Neutral BCT "ON" in the Unicode specification.
        directionalities.put(java.lang.Character.DIRECTIONALITY_PARAGRAPH_SEPARATOR, DirectionalityOptions.NONE);         // Neutral BCT "B" in the Unicode specification.
        directionalities.put(java.lang.Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT, DirectionalityOptions.NONE);      // Weak BCT    "PDF" in the Unicode specification.
        directionalities.put(java.lang.Character.DIRECTIONALITY_RIGHT_TO_LEFT, DirectionalityOptions.RTL);              // Strong BCT  "R" in the Unicode specification.
        directionalities.put(java.lang.Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC, DirectionalityOptions.RTL);       // Strong BCT  "AL" in the Unicode specification.
        directionalities.put(java.lang.Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING, DirectionalityOptions.RTL);    // Strong BCT  "RLE" in the Unicode specification.
        directionalities.put(java.lang.Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE, DirectionalityOptions.RTL);     // Strong BCT  "RLO" in the Unicode specification.
        directionalities.put(java.lang.Character.DIRECTIONALITY_SEGMENT_SEPARATOR, DirectionalityOptions.RTL);          // Neutral BCT "S" in the Unicode specification.
        directionalities.put(java.lang.Character.DIRECTIONALITY_UNDEFINED, DirectionalityOptions.NONE);                   // Undefined BCT.
        directionalities.put(java.lang.Character.DIRECTIONALITY_WHITESPACE, DirectionalityOptions.NONE);                  // Neutral BCT "WS" in the Unicode specification.
    }

    /**
     * Splits a TextChunk into N TextChunks, where each chunk is of a single directionality, and
     * then reverse the RTL ones.
     * what we're doing here is *reversing* the Unicode bidi algorithm
     * in the language of that algorithm, each chunk is a (maximal) directional run.
     * We attach whitespace to the beginning of non-RTL
     **/
    public TextChunk groupByDirectionality(Boolean isLtrDominant) {
        if (this.getTextElements().size() <= 0) {
            throw new IllegalArgumentException();
        }

        ArrayList<ArrayList<TextElement>> chunks = new ArrayList<ArrayList<TextElement>>();
        ArrayList<TextElement> buff = new ArrayList<TextElement>();
        DirectionalityOptions buffDirectionality = DirectionalityOptions.NONE; // the directionality of the characters in buff;

        for (TextElement te : this.getTextElements()) {
            //TODO: we need to loop over the textelement characters
            //      because it is possible for a textelement to contain multiple characters?


            // System.out.println(te.getText() + " is " + Character.getDirectionality(te.getText().charAt(0) ) + " " + directionalities.get(Character.getDirectionality(te.getText().charAt(0) )));
            if (buff.size() == 0) {
                buff.add(te);
                buffDirectionality = directionalities.get(Character.getDirectionality(te.getText().charAt(0)));
            } else {
                if (buffDirectionality == DirectionalityOptions.NONE) {
                    buffDirectionality = directionalities.get(Character.getDirectionality(te.getText().charAt(0)));
                }
                DirectionalityOptions teDirectionality = directionalities.get(Character.getDirectionality(te.getText().charAt(0)));

                if (teDirectionality == buffDirectionality || teDirectionality == DirectionalityOptions.NONE) {
                    if (Character.getDirectionality(te.getText().charAt(0)) == java.lang.Character.DIRECTIONALITY_WHITESPACE && (buffDirectionality == (isLtrDominant ? DirectionalityOptions.RTL : DirectionalityOptions.LTR))) {
                        buff.add(0, te);
                    } else {
                        buff.add(te);
                    }
                } else {
                    // finish this chunk
                    if (buffDirectionality == DirectionalityOptions.RTL) {
                        Collections.reverse(buff);
                    }
                    chunks.add(buff);

                    // and start a new one
                    buffDirectionality = directionalities.get(Character.getDirectionality(te.getText().charAt(0)));
                    buff = new ArrayList<TextElement>();
                    buff.add(te);
                }
            }
        }
        if (buffDirectionality == DirectionalityOptions.RTL) {
            Collections.reverse(buff);
        }
        chunks.add(buff);
        ArrayList<TextElement> everything = new ArrayList<TextElement>();
        if (!isLtrDominant) {
            Collections.reverse(chunks);
        }
        for (ArrayList<TextElement> group : chunks) {
            everything.addAll(group);
        }
        return new TextChunk(everything);
    }

    @Override
    /*
        We're comparing based on ordering in the logical ordering of text here. 
        Assuming identical Y-axis positions, if TextChunk A has a lower X-axis 
        than TextChunk B, then A is "before" it -- iff this is LTR text. Otherwise, 
        it is A is after B.
    */
    public int compareTo(Rectangle other) {
        double thisBottom = this.getBottom();
        double otherBottom = other.getBottom();
        int rv;

        if (this.equals(other)) return 0;

        if (this.verticalOverlap(other) > VERTICAL_COMPARISON_THRESHOLD) {
            rv = java.lang.Double.compare(this.getX(), other.getX());

            // reverse the ordering if both TextChunks are RTL
            if (this.isLtrDominant() == -1 && other.isLtrDominant() == -1) {
                rv = -1 * rv;
            }
        } else {
            rv = java.lang.Double.compare(thisBottom, otherBottom);
        }
        return rv;
    }

    public int isLtrDominant() {
        int ltrCnt = 0;
        int rtlCnt = 0;
        for (int i = 0; i < this.getTextElements().size(); i++) {
            String elementText = this.getTextElements().get(i).getText();
            for (int j = 0; j < elementText.length(); j++) {
                byte dir = Character.getDirectionality(elementText.charAt(j));
                if ((dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT) ||
                        (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING) ||
                        (dir == Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE)) {
                    ltrCnt++;
                } else if ((dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT) ||
                        (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC) ||
                        (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING) ||
                        (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE)) {
                    rtlCnt++;
                }
            }
        }
        return java.lang.Integer.compare(ltrCnt, rtlCnt); // 1 is LTR, 0 is neutral, -1 is RTL
    }


    public TextChunk merge(TextChunk other) {
        super.merge(other);
        return this;
    }

    public void add(TextElement textElement) {
        this.textElements.add(textElement);
        this.merge(textElement);
    }

    public void add(List<TextElement> textElements) {
        for (TextElement te : textElements) {
            this.add(te);
        }
    }

    public List<TextElement> getTextElements() {
        return textElements;
    }

    public String getText() {
        if (this.textElements.size() == 0) {
            return "";
        }

        StringBuilder sb = new StringBuilder();
        for (TextElement te : this.textElements) {
            sb.append(te.getText());
        }
        return Normalizer.normalize(sb.toString(), Normalizer.Form.NFKC).trim();
    }

    @Override
    public String getText(boolean useLineReturns) {
        // TODO Auto-generated method stub
        return null;
    }


    /**
     * Returns true if text contained in this TextChunk is the same repeated character
     */
    public boolean isSameChar(Character c) {
        return isSameChar(new Character[]{c});
    }

    public boolean isSameChar(Character[] c) {
        String s = this.getText();
        List<Character> chars = Arrays.asList(c);
        for (int i = 0; i < s.length(); i++) {
            if (!chars.contains(s.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Splits a TextChunk in two, at the position of the i-th TextElement
     */
    public TextChunk[] splitAt(int i) {
        if (i < 1 || i >= this.getTextElements().size()) {
            throw new IllegalArgumentException();
        }

        TextChunk[] rv = new TextChunk[]{
                new TextChunk(this.getTextElements().subList(0, i)),
                new TextChunk(this.getTextElements().subList(i, this.getTextElements().size()))
        };
        return rv;
    }

    /**
     * Removes runs of identical TextElements in this TextChunk
     * For example, if the TextChunk contains this string of characters: "1234xxxxx56xx"
     * and c == 'x' and minRunLength == 4, this method will return a list of TextChunk
     * such that: ["1234", "56xx"]
     */
    public List<TextChunk> squeeze(Character c, int minRunLength) {
        Character currentChar, lastChar = null;
        int subSequenceLength = 0, subSequenceStart = 0;
        TextChunk[] t;
        List<TextChunk> rv = new ArrayList<TextChunk>();

        for (int i = 0; i < this.getTextElements().size(); i++) {
            TextElement textElement = this.getTextElements().get(i);
            String text = textElement.getText();
            if (text.length() > 1) {
                currentChar = text.trim().charAt(0);
            } else {
                currentChar = text.charAt(0);
            }


            if (lastChar != null && currentChar.equals(c) && lastChar.equals(currentChar)) {
                subSequenceLength++;
            } else {
                if (((lastChar != null && !lastChar.equals(currentChar)) || i + 1 == this.getTextElements().size()) && subSequenceLength >= minRunLength) {

                    if (subSequenceStart == 0 && subSequenceLength <= this.getTextElements().size() - 1) {
                        t = this.splitAt(subSequenceLength);
                    } else {
                        t = this.splitAt(subSequenceStart);
                        rv.add(t[0]);
                    }
                    rv.addAll(t[1].squeeze(c, minRunLength)); // Lo and behold, recursion.
                    break;

                }
                subSequenceLength = 1;
                subSequenceStart = i;
            }
            lastChar = currentChar;
        }


        if (rv.isEmpty()) { // no splits occurred, hence this.squeeze() == [this]
            if (subSequenceLength >= minRunLength && subSequenceLength < this.textElements.size()) {
                TextChunk[] chunks = this.splitAt(subSequenceStart);
                rv.add(chunks[0]);
            } else {
                rv.add(this);
            }
        }

        return rv;

    }


    @Override
    public int hashCode() {
        final int prime = 31;
        int result = super.hashCode();
        result = prime * result
                + ((textElements == null) ? 0 : textElements.hashCode());
        return result;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (!super.equals(obj))
            return false;
        if (getClass() != obj.getClass())
            return false;
        TextChunk other = (TextChunk) obj;
        if (textElements == null) {
            if (other.textElements != null)
                return false;
        } else if (!textElements.equals(other.textElements))
            return false;
        return true;
    }

    public static boolean allSameChar(List<TextChunk> textChunks) {
        /* the previous, far more elegant version of this method failed when there was an empty TextChunk in textChunks.
         * so I rewrote it in an ugly way. but it works!
         * it would be good for this to get rewritten eventually
         * the purpose is basically just to return true iff there are 2+ TextChunks and they're identical.
         * -Jeremy 5/13/2016
         */

        if (textChunks.size() == 1) return false;
        boolean hasHadAtLeastOneNonEmptyTextChunk = false;
        char first = '\u0000';
        for (TextChunk tc : textChunks) {
            if (tc.getText().length() == 0) {
                continue;
            }
            if (first == '\u0000') {
                first = tc.getText().charAt(0);
            } else {
                hasHadAtLeastOneNonEmptyTextChunk = true;
                if (!tc.isSameChar(first)) return false;
            }
        }
        return hasHadAtLeastOneNonEmptyTextChunk;
    }

    public static List<Line> groupByLines(List<TextChunk> textChunks) {
        List<Line> lines = new ArrayList<Line>();

        if (textChunks.size() == 0) {
            return lines;
        }

        float bbwidth = Rectangle.boundingBoxOf(textChunks).width;

        Line l = new Line();
        l.addTextChunk(textChunks.get(0));
        textChunks.remove(0);
        lines.add(l);

        Line last = lines.get(lines.size() - 1);
        for (TextChunk te : textChunks) {
            if (last.verticalOverlapRatio(te) < 0.1) {
                if (last.width / bbwidth > 0.9 && TextChunk.allSameChar(last.getTextElements())) {
                    lines.remove(lines.size() - 1);
                }
                lines.add(new Line());
                last = lines.get(lines.size() - 1);
            }
            last.addTextChunk(te);
        }

        if (last.width / bbwidth > 0.9 && TextChunk.allSameChar(last.getTextElements())) {
            lines.remove(lines.size() - 1);
        }

        List<Line> rv = new ArrayList<Line>(lines.size());

        for (Line line : lines) {
            rv.add(Line.removeRepeatedCharacters(line, ' ', 3));
        }

        return rv;
    }

}