package edu.harvard.wcfia.yoshikoder.document; import java.awt.Font; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Locale; import java.util.logging.Logger; import java.util.regex.Pattern; import javax.swing.JOptionPane; import javax.swing.JScrollPane; import javax.swing.JTable; import edu.harvard.wcfia.yoshikoder.document.tokenizer.WordTokenizer; /** * TokenStructuredDocument is a decorator that adds a tokenization to a regular YKDocument * and allows Pattern searches over the tokens. * <p> * <b>EXPERIMENTAL</b> * * @author will * */ public class TokenStructuredDocument implements YKDocument { private static Logger log = Logger.getLogger("edu.harvard.wcfia.yoshikoder.document.TokenStructuredDocument"); protected int[][] tokenSpans; protected YKDocument instance; public TokenStructuredDocument(YKDocument document){ instance = document; } /** * Returns how many tokens there are in the document. * @return * @throws IOException */ public int getLengthInTokens() throws IOException { if (tokenSpans == null){ tokenize(); } return tokenSpans.length; } /** * Hands back the spans corresponding to the tokens in this document. * @return * @throws IOException */ public int[][] getTokenSpans() throws IOException { if (tokenSpans == null){ tokenize(); } return tokenSpans; } /** * Hands back the tokens of this document as Strings. * @return * @throws IOException */ public String[] getTokens() throws IOException { int[][] tspans = getTokenSpans(); String[] tokens = new String[tspans.length]; String txt = getText(); for (int ii = 0; ii < tspans.length; ii++) tokens[ii] = txt.substring(tspans[ii][0], tspans[ii][1]); return tokens; } /** * Returns spans (beginning and end indexes) defined over tokenSpan <i>rows</i>, * not the document's characters. * This is a helper function to reduce redundant match computations when retrieving * pattern spans themselves, counts of them, or concordance spans around them. * None of the public routines do any regular expression matching. * <p> * Be careful with this one. Row spans start with the index of the * first element and end with the index of the last element. Not the index * of the last element + 1. Things might well be easier if they did... * * @param entity a sequence of regular expression to match * @return spans over rows of tokenSpan (beginning and end rows for each pattern sequence) * @throws IOException */ protected int[][] getRowSpansThatMatchPattern(Pattern[] entity) throws IOException { if (tokenSpans == null){ tokenize(); } List list = new ArrayList(); String txt = instance.getText(); // grab the text for the duration of this method for (int ii=0; ii < tokenSpans.length-entity.length + 1; ii++){ boolean matches = true; int jj; for (jj=0; jj<entity.length; jj++){ if (!entity[jj].matcher(txt.substring(tokenSpans[ii+jj][0], tokenSpans[ii+jj][1])).matches()){ matches = false; break; } } if (matches) list.add(new int[]{ii, ii+jj-1}); } return (int[][])list.toArray(new int[list.size()][2]); } /** * The spans corresponding to token sequences matching the Pattern sequence. Each * row is a match. The first element is the start character index of the matching * part of the text, and the last element is the last character index of the matching part of * the text. E.g. substring(result[2][0], result[2][1]) is the text of the second match * @param entity sequence of Patterns that need to be all match, in order. * @return * @throws IOException */ public int[][] getPatternMatchSpans(Pattern[] entity) throws IOException { int[][] rowSpans = getRowSpansThatMatchPattern(entity); int[][] pms = new int[rowSpans.length][2]; for (int ii=0; ii<pms.length; ii++){ pms[ii][0] = tokenSpans[rowSpans[ii][0]][0]; // start char of first token pms[ii][1] = tokenSpans[rowSpans[ii][1]][1]; // end char of last token } return pms; } public int[][] getPatternMatchSpans(Pattern p) throws IOException { return getPatternMatchSpans(new Pattern[]{p}); } /** * Computes how many matches there were to the sequence of patterns handed in. * This does not call getPatternMatchSpans first. * @param entity * @return * @throws IOException */ public int getPatternMatchCounts(Pattern[] entity) throws IOException { int[][] rowSpans = getRowSpansThatMatchPattern(entity); return rowSpans.length; } public int getPatternMatchCounts(Pattern p) throws IOException { return getPatternMatchCounts(new Pattern[]{p}); } /** * Generates a concordance from a concordance span. A concordance is a * N x 3 String array. When no tokens can be shown on left or right hand side * an empty string is returned. * @param entity * @param window * @return * @throws IOException */ public String[][] getPatternConcordance(Pattern[] entity, int window) throws IOException { int[][] pcs = getPatternConcordanceSpans(entity, window); String [][] s = new String[pcs.length][3]; String txt = getText(); // note that we'll be handing out references to the text here... for (int ii=0; ii<pcs.length; ii++){ if (pcs[ii][0] != -1) s[ii][0] = txt.substring(pcs[ii][0], pcs[ii][1]); else s[ii][0] = ""; s[ii][1] = txt.substring(pcs[ii][2], pcs[ii][3]); if (pcs[ii][4] != -1) s[ii][2] = txt.substring(pcs[ii][4], pcs[ii][5]); else s[ii][2] = ""; } return s; } public String[][] getPatternConcordance(Pattern pattern, int window) throws IOException { return getPatternConcordance(new Pattern[]{pattern}, window); } public int[][] getPatternConcordanceSpans(Pattern p, int window) throws IOException { return getPatternConcordanceSpans(new Pattern[]{p}, window); } /** * Returns the spans of the left, center, and right hand sides of each line of a * concordance. <b>Note:</b> when thers is nothing to be displayed on the left * or right hand side, i.e. the keyword is flush to one end of the text, then the * spans are set to -1. * * @param entity the pattern sequence whose context is to be found * @param window how many tokens either side of entity to show * @return * @throws IOException */ public int[][] getPatternConcordanceSpans(Pattern[] entity, int window) throws IOException{ int[][] rowSpans = getRowSpansThatMatchPattern(entity); int[][] concSpans = new int[rowSpans.length][6]; if (window < 1){ // ugly special case... for (int ii=0; ii<concSpans.length; ii++){ concSpans[ii][0] = -1; concSpans[ii][1] = -1; concSpans[ii][2] = tokenSpans[rowSpans[ii][0]][0]; // first char of the entity concSpans[ii][3] = tokenSpans[rowSpans[ii][1]][1]; // last char of the entity concSpans[ii][4] = -1; concSpans[ii][5] = -1; } return concSpans; } int docLength = tokenSpans.length; for (int ii=0; ii<concSpans.length; ii++){ // construct the left hand side int lhsStartIndex = rowSpans[ii][0] - window; int lhsEndIndex = rowSpans[ii][0]-1; if (lhsStartIndex >= 0){ concSpans[ii][0] = tokenSpans[lhsStartIndex][0]; // beginning of lhs concSpans[ii][1] = tokenSpans[lhsEndIndex][1]; // end of lhs } else if (lhsStartIndex > -window){ concSpans[ii][0] = tokenSpans[0][0]; // beginning of first token concSpans[ii][1] = tokenSpans[lhsEndIndex][1]; // end of lhs } else if (lhsStartIndex == -window){ concSpans[ii][0] = -1; // flag that there is nothing to show on the left concSpans[ii][1] = -1; } else { log.info("Should never get here (lhs)"); } // target pattern is easy concSpans[ii][2] = tokenSpans[rowSpans[ii][0]][0]; // first char of the entity concSpans[ii][3] = tokenSpans[rowSpans[ii][1]][1]; // last char of the entity // construct the right hand side int rhsStartIndex = rowSpans[ii][1] + 1; int rhsEndIndex = rowSpans[ii][1] + window; if (rhsEndIndex < docLength){ concSpans[ii][4] = tokenSpans[rhsStartIndex][0]; // beginning of rhs concSpans[ii][5] = tokenSpans[rhsEndIndex][1]; // end of rhs } else if (rhsEndIndex < docLength + window -1 ){ concSpans[ii][4] = tokenSpans[rhsStartIndex][0]; // beginning of token after target concSpans[ii][5] = tokenSpans[docLength-1][1]; // end of tokens } else if (rhsEndIndex == docLength + window - 1){ concSpans[ii][4] = -1; // flag that there is nothing to show on the left concSpans[ii][5] = -1; } else { log.info("Should never get here (rhs)"); } } return concSpans; } /** * Tokenizes the document using a WordTokenizer. This is not the final version. * */ protected void tokenize() throws IOException { String txt = getText(); WordTokenizer tok = new WordTokenizer(getLocale()); tokenSpans = tok.getTokenSpans(txt); } /** * Triggers a fresh tokenization which updates all the token spans. * Useful for when a new tokenizer is available for this document's locale. * @throws IOException */ public void retokenize() throws IOException { tokenize(); } /** * Dumps the document's tokenization to String, creating first if necessary. * @return * @throws IOException */ public String debugToString() throws IOException{ StringBuffer sb = new StringBuffer(); if (tokenSpans == null) tokenize(); for (int ii=0; ii<tokenSpans.length; ii++){ int start = tokenSpans[ii][0]; int end = tokenSpans[ii][1]; sb.append(ii + ": " + getText().substring(start, end) + " [" + start + "," + end + "]\n"); } return sb.toString(); } // delegate YKDocument methods and toString to the instance public String getCharsetName() { return instance.getCharsetName(); } public Locale getLocale() { return instance.getLocale(); } public File getLocation() { return instance.getLocation(); } public Font getPreferredFont() { return instance.getPreferredFont(); } public String getText() throws IOException { return instance.getText(); } public String getTitle() { return instance.getTitle(); } public void setCharsetName(String csname) { instance.setCharsetName(csname); } public void setLocale(Locale loc) { instance.setLocale(loc); } public void setLocation(File f) { instance.setLocation(f); } public void setPreferedFont(Font f) { instance.setPreferedFont(f); } public void setTitle(String title) { instance.setTitle(title); } public String toString(){ return instance.toString(); } public static void main(String[] args) throws Exception { File f = new File("/Users/will/spantest.txt"); TokenStructuredDocument doc = new TokenStructuredDocument(new LazyYKDocument("span test", f)); System.out.println(doc.debugToString()); Pattern[] pats = new Pattern[]{Pattern.compile("second"), Pattern.compile("third")}; int[][] spans = doc.getPatternMatchSpans(pats); System.out.println(spans.length); for (int ii=0; ii<spans.length; ii++){ System.out.println(doc.getText().substring(spans[ii][0], spans[ii][1])); } int[][] concSpans = doc.getPatternConcordanceSpans(pats, 0); for (int ii=0; ii<concSpans.length; ii++){ if (concSpans[ii][0] + concSpans[ii][1] != -2) System.out.print(doc.getText().substring(concSpans[ii][0], concSpans[ii][1])); System.out.print(" [ " + doc.getText().substring(concSpans[ii][2], concSpans[ii][3]) + " ] "); if (concSpans[ii][4] + concSpans[ii][5] != -2) System.out.println(doc.getText().substring(concSpans[ii][4], concSpans[ii][5])); } String[][] pc = doc.getPatternConcordance(pats, 5); JTable table = new JTable(pc, new String[]{"LHS", "Target", "RHS"}); JOptionPane.showMessageDialog(null, new JScrollPane(table)); System.exit(0); } }