package io.github.infolis.algorithm;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.net.MediaType;

import io.github.infolis.datastore.DataStoreClient;
import io.github.infolis.datastore.FileResolver;
import io.github.infolis.model.Execution;
import io.github.infolis.model.entity.InfolisFile;

/**
 * A ComplexAlgorithm is a BaseAlgorithm that requires preprocessing of its
 * input.
 *
 * @author kata
 */
public abstract class ComplexAlgorithm extends BaseAlgorithm {

    public ComplexAlgorithm(DataStoreClient inputDataStoreClient, DataStoreClient outputDataStoreClient,
            FileResolver inputFileResolver, FileResolver outputFileResolver) {
        super(inputDataStoreClient, outputDataStoreClient, inputFileResolver, outputFileResolver);
    }

    private static final Logger log = LoggerFactory.getLogger(ComplexAlgorithm.class);

    /**
     * Collects the URIs of all PDF input files and removes them from the
     * execution's input file list; the extracted text files are re-added
     * later by extract().
     */
    private List<String> getToTextExtract() {
        List<String> toTextExtract = new ArrayList<>();
        for (InfolisFile file : getInputDataStoreClient().get(InfolisFile.class, getExecution().getInputFiles())) {
            if (file.getMediaType().equals(MediaType.PDF.toString())) {
                toTextExtract.add(file.getUri());
                getExecution().getInputFiles().remove(file.getUri());
            }
        }
        debug(log, "Scheduling {} of {} files for text extraction", toTextExtract.size(),
                toTextExtract.size() + getExecution().getInputFiles().size());
        return toTextExtract;
    }

    /**
     * Runs a TextExtractor sub-execution on the given PDF files and adds the
     * resulting text files to the execution's input file list.
     */
    private void extract(List<String> toTextExtract) {
        if (!toTextExtract.isEmpty()) {
            Execution textExtract = getExecution().createSubExecution(TextExtractor.class);
            textExtract.setTokenize(getExecution().isTokenize());
            textExtract.setTokenizeNLs(getExecution().getTokenizeNLs());
            textExtract.setPtb3Escaping(getExecution().getPtb3Escaping());
            textExtract.setRemoveBib(getExecution().isRemoveBib());
            textExtract.setTags(getExecution().getTags());
            textExtract.setInputFiles(toTextExtract);
            textExtract.setOutputDirectory(getExecution().getOutputDirectory());
            textExtract.setStartPage(getExecution().getStartPage());
            textExtract.instantiateAlgorithm(this).run();
            getExecution().getInputFiles().addAll(textExtract.getOutputFiles());
        }
    }
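    /*
     * A minimal sketch (hypothetical URIs) of the combined effect of the two
     * methods above on an execution whose input mixes PDF and text files:
     *
     *   inputFiles before: ["pdf-1", "txt-1"]    // "pdf-1" has media type application/pdf
     *   getToTextExtract() returns ["pdf-1"];    inputFiles: ["txt-1"]
     *   extract(["pdf-1"]) runs TextExtractor;   inputFiles: ["txt-1", "txt-from-pdf-1"]
     *
     * Because extract() forwards the tokenize, tokenizeNLs, ptb3Escaping and
     * removeBib parameters to the TextExtractor sub-execution, files produced
     * from PDFs come back fully preprocessed; only pre-existing text files
     * still need the checks performed by getToTokenizeAndToBibExtract() below.
     */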
    /**
     * Removes the bibliographies of all given input files. If a file is also
     * scheduled for tokenization, its old URI is replaced in the tokenization
     * list by the new URI generated by this method.
     *
     * @param toBibExtract URIs of the files whose bibliographies are to be removed
     * @param toTokenize URIs of the files scheduled for tokenization
     * @return the updated list of URIs scheduled for tokenization
     */
    private List<String> removeBibs(List<String> toBibExtract, List<String> toTokenize) {
        if (getExecution().isRemoveBib() && !toBibExtract.isEmpty()) {
            for (String uri : toBibExtract) {
                Execution bibRemoverExec = getExecution().createSubExecution(BibliographyExtractor.class);
                bibRemoverExec.setTags(getExecution().getTags());
                bibRemoverExec.setInputFiles(Arrays.asList(uri));
                bibRemoverExec.instantiateAlgorithm(this).run();
                debug(log, "Removed bibliography of input file: " + uri);
                if (!toTokenize.contains(uri)) {
                    getExecution().getInputFiles().add(bibRemoverExec.getOutputFiles().get(0));
                } else {
                    toTokenize.remove(uri);
                    toTokenize.add(bibRemoverExec.getOutputFiles().get(0));
                }
            }
        }
        return toTokenize;
    }

    /**
     * Runs a TokenizerStanford sub-execution on the given files and adds the
     * tokenized output files to the execution's input file list.
     */
    private void tokenize(List<String> toTokenize) {
        if (getExecution().isTokenize() && !toTokenize.isEmpty()) {
            Execution tokenizerExec = getExecution().createSubExecution(TokenizerStanford.class);
            tokenizerExec.setTags(getExecution().getTags());
            tokenizerExec.setTokenizeNLs(getExecution().getTokenizeNLs());
            tokenizerExec.setPtb3Escaping(getExecution().getPtb3Escaping());
            tokenizerExec.setInputFiles(toTokenize);
            tokenizerExec.instantiateAlgorithm(this).run();
            debug(log, "Tokenized {} files with parameters tokenizeNLs={} ptb3Escaping={}",
                    toTokenize.size(), tokenizerExec.getTokenizeNLs(), tokenizerExec.getPtb3Escaping());
            log.debug(toTokenize.toString());
            getExecution().getInputFiles().addAll(tokenizerExec.getOutputFiles());
        }
    }

    /**
     * Checks whether the input files have been preprocessed according to the
     * given parameters. If not, applies the appropriate preprocessing where
     * possible and logs warnings otherwise.
     *
     * This method allows for the fact that the list of input files may contain
     * a mixture of PDF and text files, the latter possibly with different
     * preprocessing steps already applied.
     */
    public void preprocessInputFiles() {
        List<String> toTextExtract = getToTextExtract();
        extract(toTextExtract);
        // All given PDF files have now been preprocessed properly. If text
        // files were given in addition to (or instead of) the PDFs, check
        // whether their preprocessing matches the given parameters.
        List<List<String>> toTokenizeAndToBibExtract = getToTokenizeAndToBibExtract();
        List<String> toTokenize = toTokenizeAndToBibExtract.get(0);
        List<String> toBibExtract = toTokenizeAndToBibExtract.get(1);
        toTokenize = removeBibs(toBibExtract, toTokenize);
        tokenize(toTokenize);
    }
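    /*
     * Illustration of how file tags steer the scheduling performed below,
     * assuming tokenize=true and removeBib=true (the tag strings are
     * placeholders; the real values come from TokenizerStanford.getTokenizeTag()
     * and BibliographyExtractor.getExecutionTags()):
     *
     *   tags = []                                  -> tokenize + remove bibliography
     *   tags = ["tokenizedStanford"]               -> remove bibliography only
     *   tags = ["tokenizedStanford", "bibRemoved"] -> use as-is; warn on parameter mismatch
     */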
    /**
     * For each input file, checks whether its preprocessing is compatible with
     * the given parameters. If not, schedules the file for preprocessing where
     * applicable, or posts a warning to the log.
     *
     * (Checking the tokenize and bibExtract options separately would be more
     * concise but would require more get operations on the dataStoreClient and
     * would thus be less efficient.)
     *
     * @return a list containing the URIs of files to tokenize and the URIs of
     *         files to strip of their bibliographies, in that order
     */
    private List<List<String>> getToTokenizeAndToBibExtract() {
        List<String> toTokenize = new ArrayList<>();
        List<String> toBibExtract = new ArrayList<>();
        if (getExecution().isTokenize() || getExecution().isRemoveBib()) {
            String tokenizeTag = TokenizerStanford.getTokenizeTag();
            String tokenizeNLsTag = TokenizerStanford.getTokenizeNLsTag();
            String ptb3EscapingTag = TokenizerStanford.getPtb3EscapingTag();
            for (InfolisFile file : getInputDataStoreClient().get(InfolisFile.class, getExecution().getInputFiles())) {
                if (getExecution().isTokenize()) {
                    // if an input file isn't tokenized, apply the tokenizer
                    if (!file.getTags().contains(tokenizeTag)) {
                        toTokenize.add(file.getUri());
                        getExecution().getInputFiles().remove(file.getUri());
                    } else if (file.getTags().contains(TokenizerOpenNLP.getTokenizeTag())) {
                        warn(log, "This algorithm is optimized for input files tokenized with TokenizerStanford; your input files "
                                + "have been tokenized with TokenizerOpenNLP. This may hurt performance");
                    }
                    // In the remaining branches, the file is known to carry tokenizeTag:
                    // if the input texts were tokenized with the same tokenizer but with
                    // different parameters than specified, display a warning.
                    else if (file.getTags().contains(tokenizeNLsTag) && !getExecution().getTokenizeNLs()) {
                        warn(log, "You specified that this algorithm should not make use of NL tokens but your input files "
                                + "seem to contain such tokens. This may hurt performance");
                    } else if (!file.getTags().contains(tokenizeNLsTag) && getExecution().getTokenizeNLs()) {
                        warn(log, "You specified that this algorithm should make use of NL tokens but your input files "
                                + "do not seem to contain such tokens. This may hurt performance");
                    } else if (file.getTags().contains(ptb3EscapingTag) && !getExecution().getPtb3Escaping()) {
                        warn(log, "You specified that this algorithm should not make use of ptb3Escaping but your input files "
                                + "seem to contain such escaped tokens. This may hurt performance");
                    } else if (!file.getTags().contains(ptb3EscapingTag) && getExecution().getPtb3Escaping()) {
                        warn(log, "You specified that this algorithm should make use of ptb3Escaping but your input files "
                                + "do not seem to contain such escaped tokens. This may hurt performance");
                    }
                }
                // Removing bibliographies is optional. If it is to be performed,
                // check whether the input files have already been stripped of
                // their bibliography sections.
                if (getExecution().isRemoveBib()) {
                    if (!file.getTags().contains(BibliographyExtractor.getExecutionTags().get(0))) {
                        toBibExtract.add(file.getUri());
                        getExecution().getInputFiles().remove(file.getUri());
                    }
                }
            }
        }
        debug(log, "Scheduling {} files for tokenization", toTokenize.size());
        log.debug(toTokenize.toString());
        debug(log, "Scheduling {} files for removal of bibliographies", toBibExtract.size());
        log.debug(toBibExtract.toString());
        return Arrays.asList(toTokenize, toBibExtract);
    }
}
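/*
 * Usage sketch (hypothetical subclass, for illustration only): a concrete
 * algorithm extending ComplexAlgorithm would typically call
 * preprocessInputFiles() at the start of its execute() implementation, e.g.
 *
 *     public class MyKeywordTagger extends ComplexAlgorithm {
 *         ...
 *         @Override
 *         public void execute() throws IOException {
 *             preprocessInputFiles();
 *             // operate on the now uniformly preprocessed text files in
 *             // getExecution().getInputFiles()
 *         }
 *     }
 */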