package io.github.infolis.algorithm;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.io.IOUtils;
import org.junit.Ignore;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import io.github.infolis.InfolisBaseTest;
import io.github.infolis.model.Execution;
import io.github.infolis.model.entity.InfolisFile;
import opennlp.tools.util.InvalidFormatException;

/**
 *
 * @author kata
 *
 */
public class TokenizerTest extends InfolisBaseTest {

    private static final Logger log = LoggerFactory.getLogger(TokenizerTest.class);

    List<InfolisFile> testFiles;

    String[] testStrings = {
            "On the one hand, the granularity (what is the smallest element of research data in need of description?) and the possible, aggregating intermediary steps vary widely."
                    + System.getProperty("line.separator")
                    + "On the other-hand, ...",
            "Funktioniert der \nTokenizer auch gut für z.B. deutsch? Und was macht er hiermit/damit, hiermit\\damit oder hiermit / damit? Oder hier-mit? Und mit dem ALLBUS-Datensatz, dem ALLBUS -Datensatz oder dem ALLBUS- Datensatz?\n"
    };

    List<String> uris = new ArrayList<>();

    public TokenizerTest() throws Exception {
        testFiles = createTestTextFiles(2, testStrings);
        for (InfolisFile file : testFiles) {
            uris.add(file.getUri());
        }
    }

    @Test
    public void testStanfordTokenize() throws InvalidFormatException, IOException {
        Execution exec = new Execution();
        exec.setInputFiles(uris);
        exec.setAlgorithm(TokenizerStanford.class);
        exec.setTokenizeNLs(true);
        exec.setPtb3Escaping(true);
        exec.instantiateAlgorithm(dataStoreClient, fileResolver).run();

        List<InfolisFile> outFiles = dataStoreClient.get(InfolisFile.class, exec.getOutputFiles());
        for (InfolisFile outFile : outFiles) {
            InputStream is = fileResolver.openInputStream(outFile);
            String content = IOUtils.toString(is);
            log.debug("output stanford: " + content);
        }
    }

    // TODO path to model as param
    // TODO download script for model...
    @Ignore
    @Test
    public void testOpenNLPTokenize() throws InvalidFormatException, IOException {
        Execution exec = new Execution();
        exec.setInputFiles(uris);
        exec.setAlgorithm(TokenizerOpenNLP.class);
        exec.instantiateAlgorithm(dataStoreClient, fileResolver).run();

        List<InfolisFile> outFiles = dataStoreClient.get(InfolisFile.class, exec.getOutputFiles());
        for (InfolisFile outFile : outFiles) {
            InputStream is = fileResolver.openInputStream(outFile);
            String content = IOUtils.toString(is);
            log.debug("output openNLP: " + content);
        }
    }
}