/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package io.github.infolis.algorithm;
import io.github.infolis.datastore.DataStoreClient;
import io.github.infolis.datastore.FileResolver;
import io.github.infolis.model.Execution;
import io.github.infolis.model.ExecutionStatus;
import io.github.infolis.model.TextualReference;
import io.github.infolis.model.entity.InfolisFile;
import io.github.infolis.model.entity.InfolisPattern;
import io.github.infolis.util.LimitedTimeMatcher;
import io.github.infolis.util.RegexUtils;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import javax.ws.rs.BadRequestException;
import javax.ws.rs.ProcessingException;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
*
* @author domi
* @author kata
* @author kba
*/
public class RegexSearcher extends BaseAlgorithm {
public RegexSearcher(DataStoreClient inputDataStoreClient, DataStoreClient outputDataStoreClient, FileResolver inputFileResolver, FileResolver outputFileResolver) {
super(inputDataStoreClient, outputDataStoreClient, inputFileResolver, outputFileResolver);
}
private static final Logger log = LoggerFactory.getLogger(RegexSearcher.class);
private String getFileAsString(InfolisFile file)
throws IOException {
InputStream in = getInputFileResolver().openInputStream(file);
String input = IOUtils.toString(in);
in.close();
return input;
}
private List<TextualReference> searchForPatterns(InfolisFile file)
throws IOException {
String inputClean = getFileAsString(file);
List<TextualReference> res = new ArrayList<>();
for (String patternURI : this.getExecution().getPatterns()) {
log.trace(patternURI);
InfolisPattern pattern = getOutputDataStoreClient().get(InfolisPattern.class, patternURI);
log.trace("Searching for pattern '{}'", pattern.getPatternRegex());
Pattern p = Pattern.compile(pattern.getPatternRegex());
// call m.find() as a thread: catastrophic backtracking may occur
// which causes application to hang
// thus monitor runtime of threat and terminate if processing takes
// too long
LimitedTimeMatcher ltm = new LimitedTimeMatcher(p, inputClean, RegexUtils.maxTimeMillis,
file.getFileName() + "\n" + pattern.getPatternRegex());
ltm.run();
// thread was aborted due to long processing time
if (!ltm.finished()) {
// TODO: what to do if search was aborted?
log.warn("Search was aborted. TODO");
}
while (ltm.matched()) {
log.debug(String.format("found pattern %s in file %s, match: %s", pattern.getPatternRegex(), file, ltm.group()));
String referencedTerm = ltm.group(getExecution().getReferenceGroup()).trim();
log.trace("referenced term: " + referencedTerm);
String leftContext = ltm.group(getExecution().getLeftContextGroup());
String rightContext = ltm.group(getExecution().getRightContextGroup());
log.trace("leftContext: " + leftContext);
log.trace("rightContext: " + rightContext);
if (null == leftContext || leftContext.isEmpty()) leftContext = " ";
if (null == rightContext || rightContext.isEmpty()) rightContext = " ";
Set<String> tagsToSet = getExecution().getTags();
tagsToSet.addAll(file.getTags());
TextualReference textRef = new TextualReference(leftContext, referencedTerm, rightContext,
file.getUri(), patternURI, file.getManifestsEntity());
textRef.setTags(tagsToSet);
log.trace("added reference: " + textRef);
res.add(textRef);
log.trace("Searching for next match of pattern " + pattern.getPatternRegex());
ltm.run();
}
}
log.trace("Done searching for patterns in " + file);
return res;
}
@Override
public void execute() throws IOException {
Execution tagExec = getExecution().createSubExecution(TagSearcher.class);
tagExec.getInfolisFileTags().addAll(getExecution().getInfolisFileTags());
tagExec.getInfolisPatternTags().addAll(getExecution().getInfolisPatternTags());
tagExec.instantiateAlgorithm(this).run();
getExecution().getPatterns().addAll(tagExec.getPatterns());
getExecution().getInputFiles().addAll(tagExec.getInputFiles());
List<TextualReference> detectedContexts = new ArrayList<>();
int counter = 0, size = getExecution().getInputFiles().size();
System.out.println("number of documents to process: " + size);
for (String inputFileURI : getExecution().getInputFiles()) {
counter++;
log.trace("Input file URI: '{}'", inputFileURI);
InfolisFile inputFile;
try {
inputFile = getInputDataStoreClient().get(InfolisFile.class, inputFileURI);
} catch (BadRequestException | ProcessingException e) {
error(log, "Could not retrieve file " + inputFileURI + ": " + e.getMessage());
getExecution().setStatus(ExecutionStatus.FAILED);
return;
}
if (null == inputFile) {
throw new RuntimeException("File was not registered with the data store: " + inputFileURI);
}
if (null == inputFile.getMediaType()) {
throw new RuntimeException("File has no mediaType: " + inputFileURI);
}
// if the input file is not a text file
if (!inputFile.getMediaType().startsWith("text/plain")) {
// if the input file is a PDF file, convert it
if (inputFile.getMediaType().startsWith("application/pdf")) {
Execution convertExec = getExecution().createSubExecution(TextExtractor.class);
convertExec.setInputFiles(Arrays.asList(inputFile.getUri()));
// TODO wire this more efficiently so files are stored temporarily
Algorithm algo = convertExec.instantiateAlgorithm(this);
// do the actual conversion
algo.run();
// Set the inputFile to the file we just created
InfolisFile convertedInputFile = algo.getOutputDataStoreClient().get(InfolisFile.class, convertExec.getOutputFiles().get(0));
log.debug("Converted {} -> {}", inputFile.getUri(), convertedInputFile.getUri());
log.trace("Content: " + IOUtils.toString(algo.getInputFileResolver().openInputStream(convertedInputFile)));
inputFile = convertedInputFile;
} else {
throw new RuntimeException(getClass() + " execution / inputFiles " + "Can only search through text files or PDF files");
}
}
log.trace("Start extracting from '{}'.", inputFile);
updateProgress(counter, size);
detectedContexts.addAll(searchForPatterns(inputFile));
}
for (TextualReference sC : detectedContexts) {
getOutputDataStoreClient().post(TextualReference.class, sC);
this.getExecution().getTextualReferences().add(sC.getUri());
}
getExecution().setStatus(ExecutionStatus.FINISHED);
log.debug("No. contexts found: {}", getExecution().getTextualReferences().size());
}
@Override
public void validate() {
Execution exec = this.getExecution();
if ((null == exec.getInputFiles() || exec.getInputFiles().isEmpty()) &&
(null == exec.getInfolisFileTags() || exec.getInfolisFileTags().isEmpty())) {
throw new IllegalArgumentException("Must set at least one inputFile!");
}
if ((null == exec.getPatterns() || exec.getPatterns().isEmpty()) &&
(null == exec.getInfolisPatternTags() || exec.getInfolisPatternTags().isEmpty())) {
throw new IllegalArgumentException("No patterns given.");
}
}
}