package io.github.infolis.algorithm;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.PTBTokenizer.PTBTokenizerFactory;
import io.github.infolis.datastore.DataStoreClient;
import io.github.infolis.datastore.FileResolver;

/**
 * {@link Tokenizer} implementation that delegates sentence splitting and
 * tokenization to Stanford's {@link DocumentPreprocessor} configured with a
 * {@link PTBTokenizer} factory.
 *
 * <p>Every token produced by the Stanford tokenizer is additionally passed
 * through {@link #splitCompounds(String)}, which puts spaces around hyphens,
 * en dashes and slashes that sit between two non-whitespace characters.
 *
 * <p>The instance keeps a mutable list of execution tags that records which
 * tokenizer options have been used; the list is not synchronized.
 *
 * @author kata
 */
public class TokenizerStanford extends Tokenizer {

    private static final Logger log = LoggerFactory.getLogger(TokenizerStanford.class);

    /**
     * Regex fragments (not literal strings) matching the characters that
     * separate the parts of a compound: hyphen, en dash and slash.
     */
    private final static List<String> compoundMarkers = Arrays.asList("(-)", "(–)", "(/)");

    /**
     * Matches a compound marker that has a non-whitespace character directly
     * before and after it. Compiled once because it is applied to every token.
     * Group 1 is the matched marker.
     */
    private static final Pattern COMPOUND_PATTERN = Pattern.compile(
            "(?<=\\S)(" + String.join("|", compoundMarkers) + ")(?=\\S)");

    private static final String tokenizeTag = "TOKENIZED_STANFORD";
    private static final String tokenizeNLsTag = "TOKENIZENLS";
    private static final String NLsNotTokenizedTag = "NLSNOTTOKENIZED";
    private static final String ptb3EscapingTag = "PTB3ESCAPING";
    private static final String noPtb3EscapingTag = "NOPTB3ESCAPING";

    /** Tags describing this tokenizer and the options it has been run with. */
    private final List<String> executionTags = new ArrayList<>(Arrays.asList(tokenizeTag));

    /**
     * Creates a tokenizer; all arguments are forwarded unchanged to the
     * {@link Tokenizer} constructor.
     *
     * @param inputDataStoreClient  client passed to the superclass
     * @param outputDataStoreClient client passed to the superclass
     * @param inputFileResolver     resolver passed to the superclass
     * @param outputFileResolver    resolver passed to the superclass
     * @throws IOException declared for the superclass constructor
     */
    public TokenizerStanford(DataStoreClient inputDataStoreClient, DataStoreClient outputDataStoreClient,
            FileResolver inputFileResolver, FileResolver outputFileResolver) throws IOException {
        super(inputDataStoreClient, outputDataStoreClient, inputFileResolver, outputFileResolver);
    }

    /**
     * Returns the live (mutable) list of tags collected so far: the tokenizer
     * tag plus one tag per option value that has been passed to a
     * {@code tokenize} call.
     *
     * @return the internal tag list
     */
    protected List<String> getExecutionTags() {
        return executionTags;
    }

    /** @return the tag identifying output of this tokenizer */
    protected static String getTokenizeTag() {
        return tokenizeTag;
    }

    /** @return the tag recorded when {@code tokenizeNLs} is enabled */
    protected static String getTokenizeNLsTag() {
        return tokenizeNLsTag;
    }

    /** @return the tag recorded when {@code ptb3Escaping} is enabled */
    protected static String getPtb3EscapingTag() {
        return ptb3EscapingTag;
    }

    /**
     * Tokenizes the given text using the {@code tokenizeNLs} and
     * {@code ptb3Escaping} settings of the current execution.
     *
     * @param text the text to tokenize
     * @return one string per sentence, tokens separated by spaces
     */
    public List<String> getTokenizedSentences(String text) {
        Reader reader = new StringReader(text);
        return tokenize(reader, getExecution().getTokenizeNLs(), getExecution().getPtb3Escaping());
    }

    /**
     * Tokenizes the content of the given file using the {@code tokenizeNLs}
     * and {@code ptb3Escaping} settings of the current execution.
     *
     * @param file the file to tokenize; its absolute path is handed to the
     *             Stanford preprocessor
     * @return one string per sentence, tokens separated by spaces
     */
    public List<String> getTokenizedSentences(File file) {
        return tokenize(file.getAbsolutePath(), getExecution().getTokenizeNLs(),
                getExecution().getPtb3Escaping());
    }

    /**
     * Tokenizes the document at the given location and records the used
     * options in the execution tags.
     *
     * @param filename     location handed to {@link DocumentPreprocessor}
     * @param tokenizeNLs  value of the PTBTokenizer {@code tokenizeNLs} option
     * @param ptb3Escaping value of the PTBTokenizer {@code ptb3Escaping} option
     * @return one string per sentence, tokens separated by spaces
     */
    public List<String> tokenize(String filename, boolean tokenizeNLs, boolean ptb3Escaping) {
        recordOptionTags(tokenizeNLs, ptb3Escaping);
        DocumentPreprocessor dp = new DocumentPreprocessor(filename);
        return applyPTBTokenizer(dp, tokenizeNLs, ptb3Escaping);
    }

    /**
     * Tokenizes the text supplied by the reader and records the used options
     * in the execution tags.
     *
     * @param reader       source of the text to tokenize
     * @param tokenizeNLs  value of the PTBTokenizer {@code tokenizeNLs} option
     * @param ptb3Escaping value of the PTBTokenizer {@code ptb3Escaping} option
     * @return one string per sentence, tokens separated by spaces
     */
    public List<String> tokenize(Reader reader, boolean tokenizeNLs, boolean ptb3Escaping) {
        recordOptionTags(tokenizeNLs, ptb3Escaping);
        DocumentPreprocessor dp = new DocumentPreprocessor(reader);
        return applyPTBTokenizer(dp, tokenizeNLs, ptb3Escaping);
    }

    /**
     * Adds the tags that correspond to the given option values to the
     * execution tags.
     */
    private void recordOptionTags(boolean tokenizeNLs, boolean ptb3Escaping) {
        addTagIfAbsent(tokenizeNLs ? tokenizeNLsTag : NLsNotTokenizedTag);
        addTagIfAbsent(ptb3Escaping ? ptb3EscapingTag : noPtb3EscapingTag);
    }

    /**
     * Adds a tag unless it is already present, so that tokenizing several
     * inputs with the same instance does not pile up duplicate tags.
     */
    private void addTagIfAbsent(String tag) {
        if (!executionTags.contains(tag)) {
            executionTags.add(tag);
        }
    }

    /**
     * Runs the preprocessor with a PTB word tokenizer and renders every list
     * of words it yields as one sentence string.
     *
     * @return the sentence strings; each token is preceded by a single space,
     *         so every non-empty sentence starts with a space
     */
    private static List<String> applyPTBTokenizer(DocumentPreprocessor dp, boolean tokenizeNLs,
            boolean ptb3Escaping) {
        PTBTokenizerFactory<Word> tf = PTBTokenizer.PTBTokenizerFactory.newWordTokenizerFactory(
                "tokenizeNLs=" + tokenizeNLs + ",ptb3Escaping=" + ptb3Escaping + ",asciiQuotes=true");
        dp.setTokenizerFactory(tf);

        List<String> sentences = new ArrayList<>();
        for (List<HasWord> wordList : dp) {
            // StringBuilder avoids re-copying the whole sentence for every token.
            StringBuilder sentence = new StringBuilder();
            for (HasWord word : wordList) {
                sentence.append(' ').append(splitCompounds(word.word()));
            }
            sentences.add(sentence.toString());
        }
        return sentences;
    }

    /**
     * Surrounds every compound marker that is enclosed by non-whitespace
     * characters with spaces, e.g. {@code "a-b"} becomes {@code "a - b"}.
     *
     * @param text a single token
     * @return the token with enclosed compound markers spaced out
     */
    private static String splitCompounds(String text) {
        return COMPOUND_PATTERN.matcher(text).replaceAll(" $1 ");
    }

    /**
     * Currently performs no validation.
     */
    @Override
    public void validate() {
        // TODO: not implemented yet. Presumably the execution's tokenizeNLs and
        // ptb3Escaping settings should be checked here - confirm the intended rules.
    }
}