package won.matcher.utils.preprocessing;

import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;

import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Pattern;

/**
 * Class uses OpenNLP to extract tokens from a text. Used for pre-processing the need attribute content.
 *
 * User: hfriedrich
 * Date: 13.06.2015
 */
public class OpenNlpTokenExtraction {

  // filters out tokens of length 1, tokens that start with non-word characters and tokens that start with digits
  private static final Pattern TOKEN_FILTER = Pattern.compile(".{1}+|\\W.*|\\d.*");

  private Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
  private POSTaggerME posTagger;

  public OpenNlpTokenExtraction() throws IOException {
    // load the POS tagger model from the classpath and close the stream after the model is built
    try (InputStream modelIn = this.getClass().getClassLoader().getResourceAsStream("en-pos-maxent.bin")) {
      POSModel model = new POSModel(modelIn);
      posTagger = new POSTaggerME(model);
    }
  }

  /**
   * Extract tokens that are words and have length > 1 from a text.
   *
   * @param text input text
   * @return filtered word tokens
   */
  public String[] extractWordTokens(String text) {
    text = text.toLowerCase();
    String[] tokens = tokenizer.tokenize(text);
    // filter out tokens with length 1, tokens that start with non-word characters or numbers
    return filterTokens(Arrays.asList(tokens), TOKEN_FILTER);
  }

  /**
   * Extract tokens that are words of type noun, adjective or foreign word and have length > 1 from a text.
   * NOTE: This method uses the POS tagger and is expected to be slower than extractWordTokens().
   *
   * @param text input text
   * @return filtered word tokens of the relevant word types
   */
  public String[] extractRelevantWordTokens(String text) {
    text = text.toLowerCase();
    List<String> extracted = new LinkedList<>();
    String[] tokens = tokenizer.tokenize(text);
    String[] tags = posTagger.tag(tokens);

    // extract nouns, adjectives and foreign words
    for (int i = 0; i < tags.length; i++) {
      if (tags[i].startsWith("N") || tags[i].startsWith("J") || tags[i].equals("FW")) {
        extracted.add(tokens[i]);
      }
    }

    // filter out tokens with length 1, tokens that start with non-word characters or numbers
    return filterTokens(extracted, TOKEN_FILTER);
  }

  private String[] filterTokens(Iterable<String> tokens, Pattern pattern) {
    List<String> extracted = new LinkedList<>();
    for (String token : tokens) {
      if (!pattern.matcher(token).matches()) {
        extracted.add(token);
      }
    }
    return extracted.toArray(new String[extracted.size()]);
  }
}
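
/*
 * Usage sketch (illustrative only, not part of the class): assuming the OpenNLP model file
 * "en-pos-maxent.bin" is available on the classpath, the extractor could be used as shown
 * below. The example input string and the listed output tokens are hypothetical.
 *
 *   OpenNlpTokenExtraction extraction = new OpenNlpTokenExtraction();
 *   String[] tokens = extraction.extractRelevantWordTokens("Offering a red mountain bike, almost new");
 *   // expected to keep content words such as "red", "mountain", "bike" and drop the rest
 */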