package io.github.infolis.algorithm;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import io.github.infolis.datastore.DataStoreClient;
import io.github.infolis.datastore.FileResolver;

import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.InvalidFormatException;

/**
 * Tokenizer for different languages, based on the OpenNLP tokenizer and sentence detector.
*
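 * Illustrative usage sketch (the {@link DataStoreClient} and {@link FileResolver}
 * instances are assumed to be provided by the surrounding execution):
 * <pre>{@code
 * TokenizerOpenNLP tokenizer = new TokenizerOpenNLP(inputClient, outputClient, inputResolver, outputResolver);
 * List<String> tokenizedSentences = tokenizer.getTokenizedSentences(new File("document.txt"));
 * }</pre>
 *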
* @author kata
*
*/
public class TokenizerOpenNLP extends Tokenizer {

    // TODO path to model as Execution param
    // TODO download script for models...

    public TokenizerOpenNLP(DataStoreClient inputDataStoreClient, DataStoreClient outputDataStoreClient,
            FileResolver inputFileResolver, FileResolver outputFileResolver) throws IOException {
        super(inputDataStoreClient, outputDataStoreClient, inputFileResolver, outputFileResolver);
        initialize("src/main/resources/de-token.bin", "src/main/resources/de-sent.bin");
    }

    private opennlp.tools.tokenize.Tokenizer tokenizer;
    private SentenceDetectorME sentenizer;

    private static final Logger log = LoggerFactory.getLogger(TokenizerOpenNLP.class);
    private static final List<String> executionTags = Arrays.asList("TOKENIZED_OPENNLP");
    private static final String tokenizeTag = "TOKENIZED_OPENNLP";

    protected static String getTokenizeTag() {
        return tokenizeTag;
    }

    protected List<String> getExecutionTags() {
        return executionTags;
    }
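
    /**
     * Loads the OpenNLP tokenizer and sentence detection models from the given file paths.
     */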
    public void initialize(String modelPathTokenize, String modelPathSentenize) throws InvalidFormatException, IOException {
        // load the tokenization model; try-with-resources ensures the stream is closed
        try (InputStream modelInTokenize = new FileInputStream(modelPathTokenize)) {
            TokenizerModel modelTokenize = new TokenizerModel(modelInTokenize);
            tokenizer = new TokenizerME(modelTokenize);
        }
        // load the sentence detection model; try-with-resources ensures the stream is closed
        try (InputStream modelInSentenize = new FileInputStream(modelPathSentenize)) {
            SentenceModel modelSentenize = new SentenceModel(modelInSentenize);
            sentenizer = new SentenceDetectorME(modelSentenize);
        }
    }
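
    /**
     * Tokenizes the given text with the OpenNLP tokenizer and returns the tokens joined by single spaces.
     */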
    public String tokenize(String input) throws InvalidFormatException, IOException {
        return String.join(" ", this.tokenizer.tokenize(input));
    }
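
    /**
     * Splits the given text into sentences using the OpenNLP sentence detector.
     */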
    public String[] sentenize(String input) throws InvalidFormatException, IOException {
        return this.sentenizer.sentDetect(input);
    }
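
    /**
     * Splits the given text into sentences and tokenizes each sentence.
     *
     * @return one whitespace-separated token string per detected sentence
     */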
    public List<String> getTokenizedSentences(String text) throws InvalidFormatException, IOException {
        String[] sentences = sentenize(text);
        List<String> tokenizedSentences = new ArrayList<>();
        for (String sentence : sentences) {
            tokenizedSentences.add(tokenize(sentence));
        }
        return tokenizedSentences;
    }
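
    /**
     * Reads the given file and returns its tokenized sentences.
     */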
    public List<String> getTokenizedSentences(File file) throws InvalidFormatException, IOException {
        // assumes UTF-8 encoded input files; the charset-less readFileToString overload is deprecated and platform-dependent
        return getTokenizedSentences(FileUtils.readFileToString(file, "UTF-8"));
    }

    // TODO
    @Override
    public void validate() {
    }
}