ParsingFrame.java example

Explorer

GATEinSpring-master
- gate
  - WEB-INF
    - gate-files
      - plugins
        Gazetteer_LKB
        src
        com
        ontotext
        kim
        gate
        KimGazetteer.java
        KimLookupParser.java
        SesameEnrichment.java
        SettingsHashBuilder.java
        model
        AliasCacheImpl.java
        AliasTextTransformer.java
        ClassStatisticListener.java
        DataFeedFactory.java
        DictionaryFeeder.java
        EntitiesQueryListener.java
        EntityPriority.java
        HashRegister.java
        HashedAlias.java
        KIMDataFeed.java
        ParsingFrame.java
        StatisticListener.java
        test
        PrivateRepositoryFeedTest.java
        util
        datastore
        PopulateDictionary.java
        PrivateRepositoryFeed.java
        RepositoryFeed.java
        Groovy
        src
        gate
        groovy
        GateGroovyMethods.java
        GroovySupport.java
        ScriptPR.java
        ScriptableControllerPersistence.java
        gui
        ControllerScriptEditor.java
        Module7practice
        src
        module7
        DocStats.java
        StatsViewer.java
        Stemmer_Snowball
        src
        org
        tartarus
        snowball
        Among.java
        SnowballProgram.java
        SnowballStemmer.java
        TestApp.java
        ext
        danishStemmer.java
        dutchStemmer.java
        englishStemmer.java
        finnishStemmer.java
        frenchStemmer.java
        germanStemmer.java
        hungarianStemmer.java
        italianStemmer.java
        norwegianStemmer.java
        porterStemmer.java
        portugueseStemmer.java
        romanianStemmer.java
        russianStemmer.java
        spanishStemmer.java
        swedishStemmer.java
        turkishStemmer.java
        stemmer
        SnowballStemmer.java
    - src
      - ca
        openlanguage
        nlpgate
        GateHandler.java

package com.ontotext.kim.model;

import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.collections.Transformer;
import org.apache.commons.collections.TransformerUtils;

import com.ontotext.kim.model.AliasCacheImpl.Stats;


/**
 * Text parsing tool used by <code>AliasCacheImpl</code> and
 * <code>KimLookupParser</code>; it produces the data needed by
 * <code>HashedAlias</code>.<br>
 * The source string is parsed into a sequence of plain numeric and letter
 * lexemes (alpha-numeric lexemes, ANL) delimited by optional
 * non-alpha-numeric character sequences of arbitrary length. This is the
 * base for the creation of a Hashed Alias and also the base for searching
 * it in a text.<br>
 * The class offers a <i>frame</i> view over the parsed string. The frame
 * spans one or more sequential ANLs and their non-alpha-numeric context
 * (surrounding punctuation and white spaces). It has three parts:
 * <ul>
 * <li><b>middle</b> - the part of the source that starts at the beginning
 * of the first ANL of the frame and ends at the end of its last ANL;</li>
 * <li><b>prefix</b> - the non-alpha-numeric sequence before the first ANL
 * of the frame;</li>
 * <li><b>suffix</b> - the non-alpha-numeric sequence after the last ANL of
 * the frame.</li>
 * </ul>
 * The frame is manipulated with {@link #parseOne()} (extend by one lexeme)
 * and {@link #moveOne()} (move the frame start one lexeme forward, which
 * resets the frame length to one lexeme on the next {@link #parseOne()}).
 * <br>
 * Retrieval methods expose the current frame offsets, the lengths of the
 * three parts and the two hash codes specific to a Hashed Alias. They throw
 * {@link IllegalStateException} while the frame holds no parsed content,
 * i.e. before the first {@link #parseOne()} and between a successful
 * {@link #moveOne()} and the following {@link #parseOne()}.
 *
 * @author danko
 *
 */
public class ParsingFrame {

	/** Compile-time switch selecting the letter-lexeme sub-pattern: when
	 * <b>true</b> a lexeme is either optional upper-case letters followed
	 * by lower-case letters or a run of upper-case letters; when
	 * <b>false</b> any run of upper/lower-case letters is one lexeme. */
	public static final boolean SPLIT_AT_SMALL_TO_CAPITAL_CASE_CHANGE = false;

	//================================================
	// ParsingFrame: Lexeme parsing regular expression
	//================================================
	/** The Reg-Ex pattern used to search for the ANLs. Group 1 is the
	 * lexeme itself, group 2 is the (possibly empty) non-alpha-numeric
	 * sequence that follows it. */
	private static final Pattern LEXEME_MATCH_PAT = Pattern.compile(
			"(" +
			// Number only lexemes
			"(?:\\d)+|" +

			(SPLIT_AT_SMALL_TO_CAPITAL_CASE_CHANGE
					? "(?:\\p{javaUpperCase}{0,}\\p{javaLowerCase}{1,})|(?:\\p{javaUpperCase}{1,})"
					: "(?:[\\p{javaUpperCase}\\p{javaLowerCase}]+)" ) +
			")"
			// Non-Alpha-Numeric suffix
			+ "([^\\d\\p{javaUpperCase}\\p{javaLowerCase}]*)"
	);

	/** The delimiting character used for the normalized form of the parsed
	 * string. The normalized form is used to calculate the H1 hash value
	 * of the <code>HashedAlias</code>. */
	private static final String H1_DELIMITER = "^";

	/** Transformation applied to the frame text before a hash code is
	 * calculated. Initialized with the no-op transformer; the field is
	 * public and non-final so it can be replaced from outside. */
	public static Transformer frameTT = TransformerUtils.nopTransformer();

	//================================================
	// ParsingFrame: Parse result buffering
	//================================================
	/** This class represents a single alpha-numeric lexeme (ANL) and its
	 * non-alpha-numeric context. */
	private static class ParsingBufferElement {
		/** Length of the non-alpha-numeric sequence before the lexeme. */
		public final int pref;
		/** Length of the lexeme. */
		public final int midd;
		/** Length of the non-alpha-numeric sequence after the lexeme. */
		public final int suff;
		/** Text of the lexeme. */
		public final String middTxt;
		/** Offset in the source string where the prefix starts. */
		public final int offset;

		public ParsingBufferElement(int pref, int midd, int suff,
				String middTxt, int offset) {
			this.pref = pref;
			this.midd = midd;
			this.suff = suff;
			this.middTxt = middTxt;
			this.offset = offset;
		}
	}

	/** Placeholder used by {@link #parseOne()} when no buffer element
	 * could be taken (e.g. the source contains no lexemes). */
	private static final ParsingBufferElement EMPTY_ELEMENT =
		new ParsingBufferElement(0, 0, 0, "", 0);

	/** This is the parsing buffer that stores the results from parsing of
	 * the input text. The result is stored as a sequence of ANL elements
	 * with the sizes of their non-alpha-numeric prefix and suffix. */
	private final ArrayList<ParsingBufferElement> parsingBuffer =
		new ArrayList<ParsingBufferElement>();

	//================================================
	// ParsingFrame: The definition data of the frame
	//================================================
	/** Index of the last buffer element covered by the frame. */
	private int parsingIx = -1;
	/** Index of the first buffer element covered by the frame. */
	private int frameIx = 0;
	/** Index the frame restarts from on the next {@link #parseOne()};
	 * negative when no restart is pending. */
	private int restartIx = 0;
	private boolean frameCanExpand = true;

	private int prefix = 0;
	private int middle = 0;
	private int suffix = 0;
	private final String source;
	private String normalizedAlias = "";
	private int lexemeCount = 0;

	//================================================
	// ParsingFrame: class constructor
	//================================================
	/** The constructor parses the source string with the regular expression
	 * based <code>java.util.regex.Matcher</code> and stores the results in
	 * the parsing buffer <code>parsingBuffer</code>. The frame itself stays
	 * empty until {@link #parseOne()} is called.
	 * @param source - the String to be processed through the
	 * <code>ParsingFrame</code> class
	 */
	public ParsingFrame (String source) {
		Stats.markIt(-1);
		this.source = source;
		Matcher lexMatcher = LEXEME_MATCH_PAT.matcher(source);
		// Length of the delimiter run in front of the current lexeme;
		// negative until the first lexeme has been found.
		int precedingLen = -1;
		while (lexMatcher.find()) {
			if (precedingLen < 0) {
				// Everything before the first lexeme is its prefix
				precedingLen = lexMatcher.start();
			}
			int followingLen = lexMatcher.end(2) - lexMatcher.start(2);
			parsingBuffer.add(new ParsingBufferElement(
					precedingLen,
					lexMatcher.end(1) - lexMatcher.start(1),
					followingLen,
					lexMatcher.group(1),
					lexMatcher.start() - precedingLen));
			// The suffix of this lexeme is the prefix of the next one
			precedingLen = followingLen;
		}

		Stats.markIt(7);
	}

	//================================================
	// ParsingFrame: LookUp and Annotation data
	//================================================
	/** Cached hash codes; 0 means "not calculated yet". */
	private int aliasHash1 = 0;
	private int aliasHash2 = 0;
	/** Start / end offsets in the source of the effective alias text. */
	private int aliasOffset1 = 0;
	private int aliasOffset2 = 0;
	/** Last effective prefix / suffix lengths; -1 means "none yet". */
	private int oldPref = -1;
	private int oldSuff = -1;

	/** Invalidates the lookup data after a frame change and recalculates
	 * the alias offsets for the full prefix and suffix of the frame. */
	private void resetLAD() {
		aliasHash1 = 0;
		aliasHash2 = 0;
		oldPref = -1;
		oldSuff = -1;
		this.setNewPrefSufLen(prefix, suffix);
	}

	/** Returns the source offset where the prefix of the first frame
	 * element starts, or 0 when the parsing buffer is empty. */
	private int frameOffset() {
		return parsingBuffer.isEmpty() ? 0 : parsingBuffer.get(frameIx).offset;
	}

	/** Restricts a requested length to the range [0, available]. */
	private static int clampLength(int requested, int available) {
		return Math.max(0, Math.min(requested, available));
	}

	/** This method changes the effective lengths of the non-alpha-numeric
	 * prefix and suffix. This change affects the results of methods:<br>
	 * <code>getAliasHash2</code>, <code>getLength</code>,
	 * <code>getAliasStart</code> and <code>getAliasEnd</code>.<br>
	 * A requested length larger than what the frame actually has is cut
	 * down to the available length; a negative length is treated as 0.<br>
	 * The last effective values are stored so if called again with the
	 * same values - no recalculations will be performed.
	 * @param prefLen - the new effective prefix length.
	 * @param suffLen - the new effective suffix length.
	 */
	public void setNewPrefSufLen(int prefLen, int suffLen) {
		int effectivePref = clampLength(prefLen, prefix);
		int effectiveSuff = clampLength(suffLen, suffix);
		if (effectivePref == oldPref && effectiveSuff == oldSuff) {
			return;
		}

		if (effectivePref != oldPref) {
			// Skip the part of the prefix that is not taken into account
			aliasOffset1 = frameOffset() + prefix - effectivePref;
		}
		aliasOffset2 = aliasOffset1 + effectivePref + middle + effectiveSuff;
		aliasHash2 = 0;  // Recalculation will be done when and if required

		oldPref = effectivePref;
		oldSuff = effectiveSuff;
	}

	//================================================
	// ParsingFrame: Frame manipulation methods
	//================================================
	/** Method attempts to extend the frame to cover one more parsing-buffer
	 * element. If a restart is pending (initially, or after
	 * {@link #moveOne()}) the frame is first reset to start at the restart
	 * element. If successful - recalculates the frame state metrics.
	 * @return <b>true</b> if the content of the frame was extended with
	 * one more buffer element
	 */
	public boolean parseOne() {
		if (!frameCanMove()) {
			return false;
		}

		Stats.markIt(-1);
		final boolean restarting = (restartIx >= 0);
		ParsingBufferElement pbe = EMPTY_ELEMENT;
		boolean oneParsed;
		if (restarting) {
			frameIx = restartIx;
			lexemeCount = 0;
			oneParsed = (restartIx < parsingBuffer.size());
			if (oneParsed) {
				parsingIx = restartIx;
			}
		}
		else {
			oneParsed = ((parsingIx + 1) < parsingBuffer.size());
			if (oneParsed) {
				parsingIx++;
			}
		}

		if (oneParsed) {
			pbe = parsingBuffer.get(parsingIx);
			lexemeCount++;
		}
		Stats.markIt(8);

		Stats.markIt(-1);
		if (restarting) {
			// The frame now consists of the single element pbe
			prefix = pbe.pref;
			middle = pbe.midd;
			suffix = pbe.suff;
			normalizedAlias = H1_DELIMITER + pbe.middTxt + H1_DELIMITER;
		}
		else {
			// The old suffix becomes part of the middle
			middle += suffix + pbe.midd;
			suffix = pbe.suff;
			normalizedAlias += pbe.middTxt + H1_DELIMITER;
		}

		// Allows for preliminary detection of parse ending
		frameCanExpand = oneParsed && ((parsingIx + 1) < parsingBuffer.size());
		restartIx = -1;

		resetLAD();
		Stats.markIt(9);

		return oneParsed;
	}

	/** A shortcut method that extends the frame to cover the input string
	 * up to its end. A restart scheduled by {@link #moveOne()} is honoured
	 * first, so the frame is never left without parsed content. */
	public void parseAll() {
		while (frameCanMove()) {
			parseOne();
		}
	}

	/** The method moves the start of the frame - one buffer element forward
	 * and shrinks the size of the frame to one element. The move only
	 * schedules a restart; it takes effect on the next
	 * {@link #parseOne()}. Nothing happens when a restart is already
	 * pending or when the frame already starts at the last element. */
	public void moveOne() {
		if (restartIx >= 0) {
			return;
		}
		// Calculate the offset for new parsing start
		int nextStart = frameIx + 1;

		// Conditions under which the frame-movement attempt fails
		restartIx = (nextStart < parsingBuffer.size()) ? nextStart : -1;
	}

	//==================================================
	// ParsingFrame: Extraction of frame dynamic flags
	//==================================================
	/** @return <b>true</b> if there is a buffer element after the frame */
	public boolean frameCanExpand() { return frameCanExpand; }
	/** @return <b>true</b> if {@link #parseOne()} may still be attempted,
	 * i.e. the frame can expand or a restart is pending */
	public boolean frameCanMove() { return frameCanExpand || restartIx >= 0; }

	//==================================================
	// ParsingFrame: Extraction of frame state metrics
	//==================================================
	/** Retrieves the <code>HashedAlias</code> related Hash-Code-1. It is
	 * calculated over the normalized form of the underlying text. A text
	 * transformation with externally provided transformation logic is
	 * performed prior to hash-code calculation.
	 * @return - the value of the hash-code.
	 * @throws IllegalStateException if the frame has no parsed content
	 */
	public int getAliasHash1() {
		checkValid();
		if (aliasHash1 == 0) {
			aliasHash1 = frameTT.transform(normalizedAlias).hashCode();
		}
		return aliasHash1;
	}
	/** Retrieves the <code>HashedAlias</code> related Hash-Code-2. It is
	 * calculated over the plain underlying text between the alias start
	 * and the alias end. A text transformation with externally provided
	 * transformation logic is performed prior to hash-code calculation.
	 * @return - the value of the hash-code.
	 * @throws IllegalStateException if the frame has no parsed content
	 */
	public int getAliasHash2() {
		checkValid();
		if (aliasHash2 == 0) {
			aliasHash2 = frameTT.transform(source.substring(
					aliasOffset1, aliasOffset2)).hashCode();
		}
		return aliasHash2;
	}
	/** Retrieves the length of the non-alpha-numeric prefix of the frame
	 * @return non-alpha-numeric prefix length
	 */
	public int getPrefixLen() { checkValid(); return prefix; }
	/** Retrieves the length of the middle part of the frame
	 * @return middle part length
	 */
	public int getMiddleLen() { checkValid(); return middle; }
	/** Retrieves the length of the non-alpha-numeric suffix of the frame
	 * @return non-alpha-numeric suffix length
	 */
	public int getSuffixLen() { checkValid(); return suffix; }
	/** Retrieves the offset in the original input text of the frame start
	 * @return frame start offset
	 */
	public int getAliasStart() { checkValid(); return aliasOffset1; }
	/** Retrieves the offset in the original input text of the frame end
	 * @return frame end offset
	 */
	public int getAliasEnd() { checkValid(); return aliasOffset2; }
	/** Retrieves the length of the text corresponding to the current frame
	 * @return frame text length
	 */
	public int getLength() { checkValid(); return aliasOffset2 - aliasOffset1; }
	/** Retrieves the count of ANLs in the frame
	 * @return ANL count
	 */
	public int getLexemeCount() { checkValid(); return lexemeCount; }

	/** Guards the retrieval methods: while a restart is pending the frame
	 * metrics do not describe any parsed content.
	 * @throws IllegalStateException if a restart is pending
	 */
	private void checkValid() {
		if (restartIx >= 0) {
			throw new IllegalStateException("No parsed content in the Frame!");
		}
	}

	//========================================
	// Methods overridden for Testing purposes
	//========================================
	/** Two frames are equal when both hold parsed content and agree on the
	 * frame start offset, the lexeme count and Hash-Code-1. The source
	 * strings themselves are not compared. A frame without parsed content
	 * is equal only to itself. The method never throws. */
	@Override
	public boolean equals(Object o) {
		if (this == o) {
			return true;
		}
		if (!(o instanceof ParsingFrame)) {
			return false;
		}
		ParsingFrame pf = (ParsingFrame) o;
		if (this.restartIx >= 0 || pf.restartIx >= 0) {
			// No parsed content to compare (getAliasHash1 would throw)
			return false;
		}
		return pf.frameOffset() == this.frameOffset()
			&& pf.lexemeCount == this.lexemeCount
			&& pf.getAliasHash1() == this.getAliasHash1();
	}

	/** Constant on purpose: equality depends on the mutable frame state,
	 * so a state-derived value would change while the frame moves. A
	 * constant keeps the equals/hashCode contract at the cost of hashing
	 * efficiency. */
	@Override
	public int hashCode() {
		return 1;
	}

	/** Debug representation: normalized alias, frame offset / source
	 * length, the three part lengths and the dynamic flags. Safe to call
	 * for a source without any lexemes. */
	@Override
	public String toString() {
		return this.normalizedAlias +
		"(" + frameOffset() + "/" +
		source.length() + ")" +
		prefix + "-" + middle + "-" + suffix +
		((frameCanExpand)? " CAN-expand": " NO-expand") + " " +
		((restartIx>=0)? ("RELOCATE="+restartIx): "");
	}
}