package io.github.infolis.algorithm;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.net.MediaType;
import io.github.infolis.datastore.DataStoreClient;
import io.github.infolis.datastore.FileResolver;
import io.github.infolis.model.Execution;
import io.github.infolis.model.entity.InfolisFile;
/**
 * A ComplexAlgorithm is a BaseAlgorithm that requires preprocessing of its
 * input files (text extraction, bibliography removal and/or tokenization)
 * before the actual algorithm is run.
*
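 * A minimal sketch of a concrete subclass (hypothetical names, imports
 * omitted; assumes the abstract execute() method declared by BaseAlgorithm):
 *
 * <pre>{@code
 * public class MyComplexAlgorithm extends ComplexAlgorithm {
 *
 *     public MyComplexAlgorithm(DataStoreClient inputDataStoreClient,
 *             DataStoreClient outputDataStoreClient,
 *             FileResolver inputFileResolver, FileResolver outputFileResolver) {
 *         super(inputDataStoreClient, outputDataStoreClient,
 *                 inputFileResolver, outputFileResolver);
 *     }
 *
 *     public void execute() throws IOException {
 *         preprocessInputFiles();
 *         // the execution's input files now contain only preprocessed text files
 *     }
 * }
 * }</pre>
 *
 * @author kata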
*/
public abstract class ComplexAlgorithm extends BaseAlgorithm {
public ComplexAlgorithm(DataStoreClient inputDataStoreClient, DataStoreClient outputDataStoreClient,
FileResolver inputFileResolver, FileResolver outputFileResolver) {
super(inputDataStoreClient, outputDataStoreClient, inputFileResolver, outputFileResolver);
}
private static final Logger log = LoggerFactory.getLogger(ComplexAlgorithm.class);
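    /**
     * Collects the uris of all input files with pdf media type and removes
     * them from the execution's input files: these files are scheduled for
     * text extraction instead.
     *
     * @return the uris of all input files requiring text extraction
     */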
private List<String> getToTextExtract() {
List<String> toTextExtract = new ArrayList<>();
for (InfolisFile file : getInputDataStoreClient().get(
InfolisFile.class, getExecution().getInputFiles())) {
            // null-safe: compare against the constant in case no media type is set
            if (MediaType.PDF.toString().equals(file.getMediaType())) {
toTextExtract.add(file.getUri());
getExecution().getInputFiles().remove(file.getUri());
}
}
debug(log, "Scheduling {} of {} files for text extraction",
toTextExtract.size(), toTextExtract.size() + getExecution().getInputFiles().size());
return toTextExtract;
}
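    /**
     * Runs the TextExtractor on all files scheduled for text extraction,
     * passing on the tokenization and bibliography removal parameters of this
     * execution, and adds the resulting text files to the execution's input
     * files.
     */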
private void extract(List<String> toTextExtract) {
if (!toTextExtract.isEmpty()) {
Execution textExtract = getExecution().createSubExecution(TextExtractor.class);
textExtract.setTokenize(getExecution().isTokenize());
textExtract.setTokenizeNLs(getExecution().getTokenizeNLs());
textExtract.setPtb3Escaping(getExecution().getPtb3Escaping());
textExtract.setRemoveBib(getExecution().isRemoveBib());
textExtract.setTags(getExecution().getTags());
textExtract.setInputFiles(toTextExtract);
textExtract.setOutputDirectory(getExecution().getOutputDirectory());
textExtract.setStartPage(getExecution().getStartPage());
textExtract.instantiateAlgorithm(this).run();
getExecution().getInputFiles().addAll(textExtract.getOutputFiles());
}
}
/**
 * If bibliography removal is enabled, removes the bibliography sections of all
 * given input files. If a file is also scheduled for tokenization, its old uri
 * in the tokenization list is replaced by the uri of the bibliography-stripped
 * file generated by this method; otherwise, the new uri is added directly to
 * the execution's input files.
*
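 * A hypothetical illustration of the uri replacement (uris shortened):
 *
 * <pre>{@code
 * // "f1" needs bibliography removal only,
 * // "f2" needs both bibliography removal and tokenization
 * List<String> toTokenize = removeBibs(Arrays.asList("f1", "f2"),
 *         new ArrayList<>(Arrays.asList("f2")));
 * // the bibliography-stripped version of "f1" is now among the execution's
 * // input files; toTokenize contains the stripped version of "f2" instead of "f2"
 * }</pre>
 *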
 * @param toBibExtract uris of the files whose bibliography sections should be removed
 * @param toTokenize uris of the files scheduled for tokenization
 * @return the updated list of uris scheduled for tokenization
*/
private List<String> removeBibs(List<String> toBibExtract, List<String> toTokenize) {
if (getExecution().isRemoveBib() && !toBibExtract.isEmpty()) {
for (String uri : toBibExtract) {
Execution bibRemoverExec = getExecution().createSubExecution(BibliographyExtractor.class);
bibRemoverExec.setTags(getExecution().getTags());
bibRemoverExec.setInputFiles(Arrays.asList(uri));
bibRemoverExec.instantiateAlgorithm(this).run();
debug(log, "Removed bibliographies of input file: " + uri);
if (!toTokenize.contains(uri)) {
getExecution().getInputFiles().add(bibRemoverExec.getOutputFiles().get(0));
}
else {
toTokenize.remove(uri);
toTokenize.add(bibRemoverExec.getOutputFiles().get(0));
}
}
}
return toTokenize;
}
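    /**
     * Runs TokenizerStanford on all files scheduled for tokenization, passing
     * on the tokenization parameters of this execution, and adds the resulting
     * files to the execution's input files.
     */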
private void tokenize(List<String> toTokenize) {
if (getExecution().isTokenize() && !toTokenize.isEmpty()) {
Execution tokenizerExec = getExecution().createSubExecution(TokenizerStanford.class);
tokenizerExec.setTags(getExecution().getTags());
tokenizerExec.setTokenizeNLs(getExecution().getTokenizeNLs());
tokenizerExec.setPtb3Escaping(getExecution().getPtb3Escaping());
tokenizerExec.setInputFiles(toTokenize);
tokenizerExec.instantiateAlgorithm(this).run();
debug(log, "Tokenized {} files with parameters tokenizeNLs={} ptb3Escaping={}", toTokenize.size(), tokenizerExec.getTokenizeNLs(), tokenizerExec.getPtb3Escaping());
log.debug(toTokenize.toString());
getExecution().getInputFiles().addAll(tokenizerExec.getOutputFiles());
}
}
/**
 * Checks whether the input files have been preprocessed according to the
 * given parameters. If not, applies the appropriate preprocessing where
 * possible and posts warnings to the log otherwise.
 *
 * This method allows for the fact that the list of input files may contain a
 * mixture of pdf and text files, the latter possibly with different
 * preprocessing steps already applied.
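 *
 * A sketch of the overall flow (file names hypothetical):
 *
 * <pre>{@code
 * // inputFiles = [a.pdf, b.txt], isTokenize() = true, isRemoveBib() = true
 * preprocessInputFiles();
 * // a.pdf was passed through TextExtractor, which applies tokenization and
 * // bibliography removal itself as configured on the sub-execution;
 * // b.txt was passed through BibliographyExtractor and TokenizerStanford
 * // unless its tags show that this has already been done;
 * // inputFiles now only contains the resulting text files
 * }</pre>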
*/
public void preprocessInputFiles() {
List<String> toTextExtract = getToTextExtract();
extract(toTextExtract);
        // at this point, all given pdf files have been extracted and preprocessed properly;
        // if text files were given in addition to (or instead of) the pdfs, check whether
        // their preprocessing corresponds to the given parameters
List<List<String>> toTokenizeAndToBibExtract = getToTokenizeAndToBibExtract();
List<String> toTokenize = toTokenizeAndToBibExtract.get(0);
List<String> toBibExtract = toTokenizeAndToBibExtract.get(1);
toTokenize = removeBibs(toBibExtract, toTokenize);
tokenize(toTokenize);
}
/**
 * For each input file, checks whether its preprocessing is compatible with the
 * given parameters. If not, schedules the file for preprocessing, where
 * applicable, or posts a warning to the log.
 * (Checking the tokenize and bibExtract options in separate passes would be
 * more concise but would require more get operations on the dataStoreClient
 * and would thus be less efficient.)
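 *
 * A hypothetical illustration of how file tags encode the preprocessing state
 * (tag names shortened; the actual values come from TokenizerStanford and
 * BibliographyExtractor):
 *
 * <pre>{@code
 * // tags = [tokenizeTag, tokenizeNLsTag], isTokenize() = true, getTokenizeNLs() = false
 * // -> file is not re-tokenized, but an NL-token mismatch warning is logged
 * // tags = [], isTokenize() = true, isRemoveBib() = true
 * // -> file is scheduled for both tokenization and bibliography removal
 * }</pre>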
*
 * @return two lists: the uris of the files to tokenize and the uris of the files to strip of their bibliographies
*/
private List<List<String>> getToTokenizeAndToBibExtract() {
List<String> toTokenize = new ArrayList<>();
List<String> toBibExtract = new ArrayList<>();
if (getExecution().isTokenize() || getExecution().isRemoveBib()) {
String tokenizeTag = TokenizerStanford.getTokenizeTag();
String tokenizeNLsTag = TokenizerStanford.getTokenizeNLsTag();
String ptb3EscapingTag = TokenizerStanford.getPtb3EscapingTag();
for (InfolisFile file : getInputDataStoreClient().get(
InfolisFile.class, getExecution().getInputFiles())) {
                // if the input file isn't tokenized yet, schedule it for tokenization
                if (getExecution().isTokenize()) {
                    if (!file.getTags().contains(tokenizeTag)) {
                        toTokenize.add(file.getUri());
                        getExecution().getInputFiles().remove(file.getUri());
                    }
                    else if (file.getTags().contains(TokenizerOpenNLP.getTokenizeTag())) {
                        warn(log, "This algorithm is optimized for input files tokenized with TokenizerStanford, but your "
                                + "input files have been tokenized with TokenizerOpenNLP. This may hurt performance.");
                    }
                    // if the input texts were tokenized using the same tokenizer but with
                    // different parameters than specified, display a warning for each mismatch
                    else {
                        if (file.getTags().contains(tokenizeNLsTag) && !getExecution().getTokenizeNLs()) {
                            warn(log, "You specified that this algorithm should not make use of NL-tokens but your input files "
                                    + "seem to contain such tokens. This may hurt performance.");
                        }
                        else if (!file.getTags().contains(tokenizeNLsTag) && getExecution().getTokenizeNLs()) {
                            warn(log, "You specified that this algorithm should make use of NL-tokens but your input files "
                                    + "do not seem to contain such tokens. This may hurt performance.");
                        }
                        if (file.getTags().contains(ptb3EscapingTag) && !getExecution().getPtb3Escaping()) {
                            warn(log, "You specified that this algorithm should not make use of PTB3-escaped tokens but your "
                                    + "input files seem to contain such tokens. This may hurt performance.");
                        }
                        else if (!file.getTags().contains(ptb3EscapingTag) && getExecution().getPtb3Escaping()) {
                            warn(log, "You specified that this algorithm should make use of PTB3-escaped tokens but your "
                                    + "input files do not seem to contain such tokens. This may hurt performance.");
                        }
                    }
                }
// removing bibliographies is optional
// if it is to be performed, check whether input files are stripped of
// their bibliography sections already
                if (getExecution().isRemoveBib()
                        && !file.getTags().contains(BibliographyExtractor.getExecutionTags().get(0))) {
                    toBibExtract.add(file.getUri());
                    getExecution().getInputFiles().remove(file.getUri());
                }
}
}
debug(log, "Scheduling {} files for tokenization", toTokenize.size());
log.debug(toTokenize.toString());
debug(log, "Scheduling {} files for removal of bibliographies", toBibExtract.size());
log.debug(toBibExtract.toString());
return Arrays.asList(toTokenize, toBibExtract);
}
}