/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package postaggersalanguage.five;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Map;
import java.util.Vector;
import org.voyanttools.trombone.nlp.PosLemmas;
import com.shef.ac.uk.util.Util;
import opennlp.tools.cmdline.postag.POSModelLoader;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.Span;
/**
*
* @author ahmetaker
*/
public class POSTaggersALanguage {
private String lang;
private POSModel itsPOSModel = null;
private SentenceModel itsSentenceModel = null;
private TokenizerModel itsTokenizerModel = null;
private Map<String, String> nounDic;
private Map<String, String> adjDic;
private Map<String, String> advDic;
private Map<String, String> verbDic;
private Map<String, String> detDic;
private Map<String, String> pronDic;
private Map<String, String> posMap;
public POSTaggersALanguage(String lang) throws IOException {
this.lang = lang;
String file = this.getClass().getResource("").getFile();
nounDic = Util.loadDictionary(file + "//dictionaries//" + lang + "//nounDic.txt");
adjDic = Util.loadDictionary(file + "//dictionaries//" + lang + "//adjDic.txt");
advDic = Util.loadDictionary(file + "//dictionaries//" + lang + "//advDic.txt");
verbDic = Util.loadDictionary(file + "//dictionaries//" + lang + "//verbDic.txt");
detDic = Util.loadDictionary(file + "//dictionaries//" + lang + "//detDic.txt");
pronDic = Util.loadDictionary(file + "//dictionaries//" + lang + "//pronounDic.txt");
posMap = Util.getFileContentAsMap(file + "/universal-pos-tags/" + lang + "POSMapping.txt", "######", true);
}
public Span[] tokenizePos(String aSentence, String aResourceFolder) throws InvalidFormatException, IOException {
if (itsTokenizerModel == null) {
InputStream is = new FileInputStream(aResourceFolder + "/tokenizerModels/" + lang + "-token.bin");
itsTokenizerModel = new TokenizerModel(is);
is.close();
}
Tokenizer tokenizer = new TokenizerME(itsTokenizerModel);
Span[] tokens = tokenizer.tokenizePos(aSentence);
//now apply also some rules!
ArrayList<Span> array = new ArrayList<Span>();
for (int i = 0; i < tokens.length; i++) {
String token = aSentence.substring(tokens[i].getStart(), tokens[i].getEnd());
if ("".equals(token)) {
continue;
}
char chraters[] = token.toCharArray();
Vector<String> take = new Vector<String>();
StringBuffer buffer = new StringBuffer();
for (int j = 0; j < chraters.length; j++) {
String c = chraters[j] + "";
if (Heuristics.isPunctuation(c)) {
String str = buffer.toString().trim();
if (!str.equals("")) {
take.add(buffer.toString());
}
buffer = new StringBuffer();
take.add(c);
} else {
buffer.append(c);
}
}
if (!buffer.toString().equals("")) {
take.add(buffer.toString());
}
for (int j = 0; j < take.size(); j++) {
String string = take.get(j);
array.add(new Span(tokens[i].getStart(), tokens[i].getEnd(), string));
}
}
Span a[] = new Span[array.size()];
return array.toArray(a);
}
public Span[] sentenceDetectPos(String aText) throws InvalidFormatException, IOException {
if (itsSentenceModel == null) {
InputStream is = new FileInputStream(this.getClass().getResource("").getFile() + "/setenceDetectionModels/" + lang + "-sent.bin");
itsSentenceModel = new SentenceModel(is);
is.close();
}
SentenceDetectorME sdetector = new SentenceDetectorME(itsSentenceModel);
Span[] sentences = sdetector.sentPosDetect(aText);
return sentences;
}
public String[] posTag(String aSentence[], String aResourceFolder) {
String posTaggedVersion[] = null;
if (itsPOSModel == null) {
itsPOSModel = new POSModelLoader()
.load(new File(this.getClass().getResource("").getFile() + "/posModels/" + lang + "-pos-maxent.bin"));
}
//PerformanceMonitor perfMon = new PerformanceMonitor(System.err, "sent");
POSTaggerME tagger = new POSTaggerME(itsPOSModel);
posTaggedVersion = tagger.tag(aSentence);
return posTaggedVersion;
}
public PosLemmas getLemmatized(String text) throws IOException {
String file = this.getClass().getResource("").getFile();
PosLemmas posLemmas = new PosLemmas(text);
Span[] sentences = sentenceDetectPos(text);
for (Span sentence : sentences) {
int sentenceStart = sentence.getStart();
String sentenceString = text.substring(sentenceStart, sentence.getEnd());
Span[] tokens = tokenizePos(sentenceString, file);
String[] strings = Span.spansToStrings(tokens, sentenceString);
String[] pos = posTag(strings, file);
for (int i=0; i<tokens.length; i++) {
String token = strings[i];
String lemma = null;
String posType = pos[i];
if ("it".equalsIgnoreCase(lang)) {
posType = posType.substring(0, 1);
}
String generalType = posMap.get(posType.toLowerCase());
if (Heuristics.isNumber(token)==false && Heuristics.isPunctuation(token)==false) {
if (generalType != null) {
if ("NOUN".equalsIgnoreCase(generalType)) {
lemma = nounDic.get(token.toLowerCase());
} else if ("VERB".equalsIgnoreCase(generalType)) {
lemma = verbDic.get(token.toLowerCase());
} else if ("ADJ".equalsIgnoreCase(generalType)) {
lemma = adjDic.get(token.toLowerCase());
} else if ("ADV".equalsIgnoreCase(generalType)) {
lemma = advDic.get(token.toLowerCase());
} else if ("PRON".equalsIgnoreCase(generalType)) {
lemma = pronDic.get(token.toLowerCase());
}
if (!"nl".equalsIgnoreCase(lang) && lemma == null) {
try {
lemma = Lemmatizer.getLemma(file, token, lang, generalType);
} catch (Exception e) {
try {
lemma = Lemmatizer.getLemma(file, token.toLowerCase(), lang, generalType);
} catch (Exception e2) {
}
}
}
}
posLemmas.add(token, generalType, lemma, sentenceStart+tokens[i].getStart(), sentenceStart+tokens[i].getEnd());
}
// if (lemma!=null) {
// posLemmas.add(token, generalType, lemma, sentenceStart+tokens[i].getStart(), sentenceStart+tokens[i].getEnd());
// spans.add(new Span(sentenceStart+tokens[i].getStart(), sentenceStart+tokens[i].getEnd(), lemma));
// }
}
}
return posLemmas;
}
public static void main(String args[]) throws InvalidFormatException, IOException {
String lang = "en";
POSTaggersALanguage posTagger = new POSTaggersALanguage(lang);
String text = "This time, it’s your turn: advise Parliament in the first LinkedIn discussion on an ongoing report. The rapporteur wants to hear your views @...(read more). --- Keywords ---";
PosLemmas lemmas = posTagger.getLemmatized(text);
Iterator<PosLemmas> iterator = lemmas.iterator();
while (iterator.hasNext()) {
iterator.next();
System.out.println(lemmas.getCurrentTerm()+"-"+lemmas.getCurrentLemma());
}
}
}