package io.github.infolis.algorithm;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import io.github.infolis.InfolisConfig;
import io.github.infolis.datastore.DataStoreClient;
import io.github.infolis.datastore.FileResolver;
import io.github.infolis.model.Execution;
import io.github.infolis.model.entity.InfolisFile;
import io.github.infolis.util.SerializationUtils;
import opennlp.tools.util.InvalidFormatException;
/**
 * Base class for algorithms that split the text of input files into sentences,
 * tokenize them and store the result as new {@link InfolisFile}s.
 *
 * Subclasses supply the actual sentence splitting / tokenization and the tags
 * that mark files produced by them.
 *
 * @author kata
 *
 */
public abstract class Tokenizer extends BaseAlgorithm {

    private static final Logger log = LoggerFactory.getLogger(Tokenizer.class);

    /** Name prefix of the temporary directory created when no output directory is configured. */
    private static final String TOKENIZED_DIR_PREFIX = "tokenized-";

    public Tokenizer (DataStoreClient inputDataStoreClient, DataStoreClient outputDataStoreClient, FileResolver inputFileResolver, FileResolver outputFileResolver) throws IOException {
        super(inputDataStoreClient, outputDataStoreClient, inputFileResolver, outputFileResolver);
    }

    /**
     * Tags that are added to every file produced by this tokenizer.
     *
     * @return the tags to attach to each output file
     */
    protected abstract List<String> getExecutionTags();

    /**
     * Splits text into sentences and tokenizes all words.
     *
     * @param text the text to process
     * @return the tokenized sentences
     * @throws InvalidFormatException
     * @throws IOException
     */
    public abstract List<String> getTokenizedSentences(String text) throws InvalidFormatException, IOException;

    /**
     * Splits text in file into sentences and tokenizes all words.
     *
     * @param file the file whose text is processed
     * @return the tokenized sentences
     * @throws InvalidFormatException
     * @throws IOException
     */
    public abstract List<String> getTokenizedSentences(File file) throws InvalidFormatException, IOException;

    /**
     * Joins tokenized sentences into one text, one sentence per line.
     *
     * @param tokenizedSentences the sentences to join
     * @return the sentences separated by the value of the {@code line.separator} system property
     */
    public String getTokenizedText(List<String> tokenizedSentences) {
        return String.join(System.getProperty("line.separator"), tokenizedSentences);
    }

    /**
     * Derives the name of the output file: the extension is replaced by
     * {@code tokenized.txt} and, if an output directory is given, the base
     * directory is replaced by it.
     *
     * @param filename  name of the input file
     * @param outputDir target directory; ignored when {@code null} or empty
     * @return the name of the output file
     */
    private String transformFilename(String filename, String outputDir) {
        String outFileName = SerializationUtils.changeFileExtension(filename, "tokenized.txt");
        if (null != outputDir && !outputDir.isEmpty()) {
            outFileName = SerializationUtils.changeBaseDir(outFileName, outputDir);
        }
        return outFileName;
    }

    /**
     * Writes the tokenized sentences to a new text file and posts the
     * corresponding {@link InfolisFile} to the output data store.
     *
     * @param filename           name of the input file the output name is derived from
     * @param entity             value stored as the entity manifested by the new file
     * @param tokenizedSentences the sentences to write, one per line
     * @param tags               tags to set on the new file
     * @return the URI of the posted file
     * @throws IOException if the output stream cannot be opened or written to
     */
    public String createInfolisFile(String filename, String entity, List<String> tokenizedSentences, Set<String> tags) throws IOException {
        String outFileName = transformFilename(filename, getExecution().getOutputDirectory());
        String asText = getTokenizedText(tokenizedSentences);

        InfolisFile infolisFile = new InfolisFile();
        infolisFile.setFileName(outFileName);
        infolisFile.setMediaType("text/plain");
        infolisFile.setTags(tags);
        infolisFile.setMd5(SerializationUtils.getHexMd5(asText));
        infolisFile.setFileStatus("AVAILABLE");

        // Opening and writing are handled separately so that each failure is
        // logged exactly once and with the message that matches its cause.
        OutputStream openedStream;
        try {
            openedStream = getOutputFileResolver().openOutputStream(infolisFile);
        } catch (IOException e) {
            warn(log, "Error opening output stream to text file: " + e);
            throw e;
        }
        try (OutputStream outStream = openedStream) {
            IOUtils.write(asText, outStream);
        } catch (IOException e) {
            warn(log, "Error copying text to output stream: " + e);
            throw e;
        }

        infolisFile.setManifestsEntity(entity);
        getOutputDataStoreClient().post(InfolisFile.class, infolisFile);
        return infolisFile.getUri();
    }

    /**
     * Tokenizes all input files of the execution.
     *
     * First runs a {@link TagSearcher} sub-execution with this execution's
     * file tags and adds the input files of that sub-execution to this
     * execution's input files. Then every input file is read, tokenized and
     * stored as a new file whose URI is added to the execution's output files.
     *
     * @throws IOException if reading an input file or writing an output file fails
     */
    @Override
    public void execute() throws IOException {
        Execution tagExec = getExecution().createSubExecution(TagSearcher.class);
        tagExec.getInfolisFileTags().addAll(getExecution().getInfolisFileTags());
        tagExec.instantiateAlgorithm(this).run();
        getExecution().getInputFiles().addAll(tagExec.getInputFiles());

        ensureOutputDirectory();

        for (String inputFileURI : getExecution().getInputFiles()) {
            InfolisFile infolisFile = getInputDataStoreClient().get(InfolisFile.class, inputFileURI);
            // TODO update status
            List<String> tokenizedSentences = getTokenizedSentences(readText(infolisFile));
            Set<String> tagsToSet = collectOutputTags(infolisFile);
            String outputFileURI = createInfolisFile(infolisFile.getFileName(), infolisFile.getManifestsEntity(), tokenizedSentences, tagsToSet);
            getExecution().getOutputFiles().add(outputFileURI);
        }
    }

    /** If no output directory is given, creates a temporary one that is deleted on JVM exit. */
    private void ensureOutputDirectory() throws IOException {
        String outputDir = getExecution().getOutputDirectory();
        if (null == outputDir || outputDir.equals("")) {
            String tempDir = Files.createTempDirectory(InfolisConfig.getTmpFilePath().toAbsolutePath(), TOKENIZED_DIR_PREFIX).toString();
            FileUtils.forceDeleteOnExit(new File(tempDir));
            getExecution().setOutputDirectory(tempDir);
        }
    }

    /** Reads the complete content of the given file as text and closes the underlying stream. */
    private String readText(InfolisFile infolisFile) throws IOException {
        try (InputStream inStream = getInputFileResolver().openInputStream(infolisFile)) {
            return IOUtils.toString(inStream);
        }
    }

    /**
     * Builds the tag set for the output file belonging to the given input
     * file: execution tags, the input file's tags and the tokenizer's own
     * tags, minus the "untokenized" marker of {@link TextExtractor}.
     */
    private Set<String> collectOutputTags(InfolisFile infolisFile) {
        // Work on a copy: modifying the execution's own set would change the
        // execution and leak the tags of one input file into all later ones.
        Set<String> tagsToSet = new HashSet<>(getExecution().getTags());
        tagsToSet.addAll(infolisFile.getTags());
        tagsToSet.addAll(getExecutionTags());
        tagsToSet.remove(TextExtractor.getExecutionTagUntokenized());
        return tagsToSet;
    }

    /**
     * Currently performs no validation.
     */
    @Override
    //TODO
    public void validate() {
    }
}