package io.github.infolis.model;

import io.github.infolis.algorithm.Algorithm;
import io.github.infolis.algorithm.BaseAlgorithm;
import io.github.infolis.algorithm.FederatedSearcher;
import io.github.infolis.algorithm.SearchResultLinker;
import io.github.infolis.algorithm.LuceneSearcher;
import io.github.infolis.algorithm.TextExtractor;
import io.github.infolis.datastore.DataStoreClient;
import io.github.infolis.datastore.FileResolver;
import io.github.infolis.infolink.querying.QueryService;
import io.github.infolis.util.RegexUtils;

import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonInclude.Include;
import java.lang.reflect.Field;

/**
 * Model of a single algorithm run: which algorithm to execute, its status,
 * log and timing information, and every input/output parameter the
 * individual algorithms read or write.
 *
 * @author domi
 * @author kba
 * @author kata
 */
@JsonIgnoreProperties(ignoreUnknown = true)
@JsonInclude(Include.NON_NULL)
public class Execution extends BaseModel {

    private static final Logger logger = LoggerFactory.getLogger(Execution.class);

    //
    //
    //
    // CONSTRUCTORS AND METHODS
    //
    //
    //

    /**
     * Creates an execution without an algorithm. The algorithm has to be
     * set via {@link #setAlgorithm(Class)} before
     * {@code instantiateAlgorithm} can be called.
     */
    public Execution() {
    }

    /**
     * Creates an execution for the given algorithm class.
     *
     * @param algo the algorithm class this execution is going to run
     */
    public Execution(Class<? extends Algorithm> algo) {
        this.algorithm = algo;
    }

    /**
     * Instantiates the algorithm, using the same data store client and file
     * resolver for both input and output.
     *
     * @param dataStoreClient client used for input and output
     * @param fileResolver    resolver used for input and output
     * @return the new algorithm instance, bound to this execution
     */
    public Algorithm instantiateAlgorithm(DataStoreClient dataStoreClient, FileResolver fileResolver) {
        return instantiateAlgorithm(dataStoreClient, dataStoreClient, fileResolver, fileResolver);
    }

    /**
     * Instantiates the algorithm, taking the data store clients and file
     * resolvers from an existing algorithm instance.
     *
     * @param copyFrom algorithm whose clients and resolvers are reused
     * @return the new algorithm instance, bound to this execution
     */
    public Algorithm instantiateAlgorithm(Algorithm copyFrom) {
        return instantiateAlgorithm(
                copyFrom.getInputDataStoreClient(),
                copyFrom.getOutputDataStoreClient(),
                copyFrom.getInputFileResolver(),
                copyFrom.getOutputFileResolver());
    }

    /**
     * Instantiates the configured algorithm class through its
     * four-argument constructor
     * {@code (DataStoreClient, DataStoreClient, FileResolver, FileResolver)}
     * and binds the new instance to this execution.
     *
     * @param inputDataStoreClient  client the algorithm reads from
     * @param outputDataStoreClient client the algorithm writes to
     * @param inputFileResolver     resolver for input files
     * @param outputFileResolver    resolver for output files
     * @return the new algorithm instance
     * @throws IllegalArgumentException if no algorithm class has been set
     * @throws RuntimeException         wrapping any reflection failure
     *                                  (missing constructor, constructor threw, ...)
     */
    public Algorithm instantiateAlgorithm(
            DataStoreClient inputDataStoreClient,
            DataStoreClient outputDataStoreClient,
            FileResolver inputFileResolver,
            FileResolver outputFileResolver) {
        if (null == this.getAlgorithm()) {
            throw new IllegalArgumentException(
                    "Must set 'algorithm' of execution before calling instantiateAlgorithm.");
        }
        Algorithm algo;
        try {
            Constructor<? extends Algorithm> constructor = this.algorithm.getDeclaredConstructor(
                    DataStoreClient.class, DataStoreClient.class, FileResolver.class, FileResolver.class);
            algo = constructor.newInstance(
                    inputDataStoreClient, outputDataStoreClient, inputFileResolver, outputFileResolver);
        } catch (InstantiationException | IllegalAccessException | NoSuchMethodException
                | SecurityException | IllegalArgumentException | InvocationTargetException e) {
            throw new RuntimeException(e);
        }
        algo.setExecution(this);
        logger.debug("Created instance for algorithm '{}'", this.getAlgorithm());
        return algo;
    }

    //
    //
    //
    // EXECUTION ATTRIBUTES
    //
    //
    //

    /**
     * The algorithm which is supposed to be executed within this
     * execution.
     *
     * {@link Algorithm}
     */
    private Class<? extends Algorithm> algorithm;

    /**
     * Status of the execution (PENDING, STARTED, FINISHED, FAILED).
     * Default (when starting): ExecutionStatus.PENDING
     *
     * {@link ExecutionStatus}
     */
    private ExecutionStatus status = ExecutionStatus.PENDING;

    /**
     * Log messages of this execution.
     */
    private List<String> log = new ArrayList<>();

    /**
     * Timestamp when execution started.
     */
    private Date startTime;

    /**
     * Timestamp when execution ended.
     */
    private Date endTime;

    /**
     * Progress of the execution in percent, a value between [0..100].
     */
    private long progress = 0;

    //
    //
    //
    // Parameters
    //
    //
    //

    /**
     * Input files can either be pdfs or text files.
     * They are for example used to search patterns within the
     * Pattern Applier algorithm.
     *
     * {@link TextExtractor} {@link Bootstrapping}
     * {@link InfolisPatternSearcher} {@link SearchPatternsAndCreateLinks}
     * {@link Indexer}
     */
    private List<String> inputFiles = new ArrayList<>();

    /**
     * Output files to save the output files (txt files) of algorithms.
     * These files can serve as input for following algorithms.
     * For example, the TextExtraction algorithm extracts texts of pdfs
     * and stores these texts as output files.
     *
     * {@link LuceneSearcher} {@link TextExtractor}
     */
    private List<String> outputFiles = new ArrayList<>();

    /**
     * Whether to remove bibliographies from text/plain articles.
     * Default: false
     *
     * {@link TextExtractor}
     */
    private boolean removeBib = false;

    /**
     * Whether to tokenize text input. Bootstrapping requires tokenized
     * input texts to perform well. It can either be called on tokenized
     * input texts or it can be called on untokenized text or pdf files and
     * perform tokenization itself. If unspecified, defaults to false for
     * TextExtractor and to true for Bootstrapping.
     * Default: null
     *
     * {@link TextExtractor} {@link Bootstrapping}
     */
    private Boolean tokenize = null;

    /**
     * Output directory of Indexer and TextExtractor.
     *
     * {@link TextExtractor} {@link Indexer} {@link Bootstrapping}
     */
    private String outputDirectory = "";

    /**
     * Input directory of LuceneSearcher = output directory of indexer.
     *
     * {@link Indexer} {@link LuceneSearcher}
     */
    private String indexDirectory = "";

    /**
     * The slop for phrases used by the Lucene query parser.
     * It determines how similar two phrases must be to be matched.
     * If zero, then only exact phrase matches, if 10 up to 10 edit
     * operations may be carried out.
     * Default: 10
     *
     * {@link Bootstrapping} {@link LuceneSearcher}
     */
    private int phraseSlop = 10;

    /**
     * Determines whether the Lucene query parser is allowed to
     * use leading wildcard characters.
     * Default: true
     *
     * {@link Bootstrapping} {@link LuceneSearcher}
     */
    private boolean allowLeadingWildcards = true;

    /**
     * The maximum number of clauses permitted per BooleanQuery (Lucene search).
     * A boolean query represents a query that matches documents
     * matching boolean combinations of other queries.
     * Default: Integer max value
     *
     * {@link Bootstrapping} {@link LuceneSearcher}
     */
    private int maxClauseCount = Integer.MAX_VALUE;

    /**
     * A search term that can be used in different algorithms
     * whenever a certain term needs to be searched in a text.
     * For example, the bootstrapping algorithms need a seed in the
     * beginning to start the whole process. The search term represents
     * such a seed, e.g. the study name "ALLBUS".
     *
     * {@link LuceneSearcher}
     */
    private String searchTerm;

    /**
     * Any kind of search query that can be used within the algorithms.
     * For example, it represents the search query which is used
     * to perform a search in different repositories to find
     * fitting research data.
     *
     * {@link FederatedSearcher} {@link SearchPatternsAndCreateLinks}
     */
    private String searchQuery;

    /**
     * Group numbers to use for RegexSearcher: group of reference term.
     *
     * {@link RegexSearcher}
     */
    private int referenceGroup = RegexUtils.doiGroupNum;

    /**
     * Group numbers to use for RegexSearcher: group of left context.
     *
     * {@link RegexSearcher}
     */
    private int leftContextGroup = RegexUtils.doiLeftContextGroupNum;

    /**
     * Group numbers to use for RegexSearcher: group of right context.
     *
     * {@link RegexSearcher}
     */
    private int rightContextGroup = RegexUtils.doiRightContextGroupNum;

    /**
     * A textual reference represents any kind of reference that
     * can be found in a text, e.g. a term like a study name has been found in a publication.
     * Besides the text and the term that has been found in the text,
     * it also contains the context, i.e. where the term has been detected.
     *
     * {@link FederatedSearcher} {@link MetaDataExtractor}
     * {@link Resolver} {@link LuceneSearcher} {@link SearchPatternsAndCreateLinks}
     * {@link PatternApplier} {@link Bootstrapping}
     */
    private List<String> textualReferences = new ArrayList<>();

    /**
     * A list of patterns (internally expressed as regular expression)
     * that can be applied on texts, e.g. to find links to research data.
     *
     * {@link PatternApplier} {@link ApplyPatternAndResolve} {@link Bootstrapping}
     */
    private List<String> patterns = new ArrayList<>();

    /**
     * Indicates whether we require a term to contain
     * at least one upper case character.
     * The idea behind is that especially a study name is supposed to be a
     * named entity and thus should contain at least one upper-case character.
     * Default: false
     *
     * {@link PatternApplier} {@link Bootstrapping}
     */
    private boolean upperCaseConstraint = false;

    /**
     * Seeds used for bootstrapping, e.g. study names to start
     * with like "ALLBUS".
     *
     * {@link Bootstrapping}
     */
    private List<String> seeds = new ArrayList<>();

    /**
     * Maximum number of iterations during the bootstrapping process.
     * A high number of iterations can lead to an increased run time.
     * Default: 10
     *
     * {@link Bootstrapping}
     */
    private int maxIterations = 10;

    /**
     * Number of words used for creation of patterns.
     *
     * {@link StandardPatternInducer}
     */
    private int windowsize = 3;

    //TODO: also used for frequencyBasedBootstrapping, should we just name
    //it bootstrapping threshold?
    /**
     * Determines which patterns (and entities for reliability based bootstrapping)
     * are the relevant ones. For the frequency based bootstrapping
     * this means how often a pattern needs to occur and for the
     * reliability based bootstrapping how reliable the pattern and the entities
     * used to generate this pattern are.
     *
     * Default: 0.8
     *
     * {@link Bootstrapping}
     */
    private double reliabilityThreshold = 0.8;

    /**
     * Strategy to use for bootstrapping. Can either be:
     * mergeCurrent, mergeNew, mergeAll, separate, reliability.
     * The first four strategies are different kinds of
     * strategies for the frequency based bootstrapping. They mainly differ
     * in the way how to handle patterns that have been generated in previous
     * iterations. The strategy reliability refers to the reliability
     * based bootstrapping.
     * Default: mergeAll
     *
     * {@link BootstrapStrategy} {@link Bootstrapping}
     */
    private BootstrapStrategy bootstrapStrategy = BootstrapStrategy.mergeAll;

    /**
     * The SearchResultLinkerClass determines the SearchResultLinker to
     * use. That class is responsible for deciding which SearchResults to
     * select for creating links.
     */
    private Class<? extends SearchResultLinker> searchResultLinkerClass;

    /**
     * As a final step, links between the texts and the discovered
     * named entities (research data) are established and saved in this list.
     *
     * {@link Resolver} {@link ApplyPatternAndResolve}
     */
    private List<String> links;

    /**
     * We can search different repositories for named entities.
     * One query service represents one specific type of search, e.g.
     * a SOLR-based search or a search within a portal returning HTML.
     * This list contains all query services that should be used.
     *
     * {@link FederatedSearcher} {@link ApplyPatternAndResolve}
     */
    private List<String> queryServices;

    /**
     * We can search different repositories for named entities.
     * TODO
     */
    private List<Class<? extends QueryService>> queryServiceClasses;

    /**
     * After a search in one or more repositories, a list
     * of search results is returned. These results not only contain
     * the repository which was searched but also information like
     * the relevance score.
     *
     * {@link FederatedSearcher} {@link ApplyPatternAndResolve}
     */
    private List<String> searchResults;

    //TODO: include local search
    /**
     * Beside the search in external repositories, we can also
     * search in our own database. As use case, we get a URN for a publication
     * from a user and want to show all named entities that are linked to
     * this publication. With an internal search using the generated links,
     * we can find these entities which are returned in this list.
     *
     *
     * {@link LocalResolver}
     */
    private List<String> linkedEntities;

    /**
     * List of tags to identify a specific set of InfolisPatterns
     */
    private Set<String> infolisPatternTags = new HashSet<>();

    /**
     * List of tags to identify a specific set of InfolisFiles
     */
    private Set<String> infolisFileTags = new HashSet<>();

    /**
     * List of tags to identify a specific set of TextualReferences
     */
    private Set<String> textualReferenceTags = new HashSet<>();

    /**
     * Flag used by TextExtractor: if set to false, pdfs for which corresponding text
     * files already exist in the specified text directory will not be converted again, instead
     * the existing text files will be returned as InfolisFile instances. If set to true, all
     * pdfs will be converted regardless of any existing files in the text directory.
     * Default: true.
     * {@link TextExtractor}
     */
    private boolean overwriteTextfiles = true;

    /**
     * Determines whether new line characters are to be tokenized.
     * {@link Tokenizer}
     */
    private boolean tokenizeNLs = false;

    /**
     * Enable all traditional PTB3 token transforms (like parentheses becoming -LRB-, -RRB-).
     * {@link Tokenizer}
     */
    private boolean ptb3Escaping = true;

    /**
     * Index (starting at 1 rather than 0) of the first page to extract.
     * Useful to ignore title pages if present.
     * {@link TextExtractor}
     */
    private int startPage = 1;

    /**
     * List of entities (URIs of the entities) for which key words should be
     * generated. Serves as input for the keyword tagging algorithm.
     *
     * {@link KeywordTagger}
     */
    private List<String> entitiesForKeywordTagging = new ArrayList<>();

    /**
     * Key words which are generated by the KeywordTagger.
     * {@link KeywordTagger}
     */
    private List<String> keyWords = new ArrayList<>();

    /**
     * Path/URI to the thesaurus which is used for the key word tagging.
     * {@link KeywordTagger}
     */
    private String thesaurus = "";

    /**
     * Language of the abstracts
     * {@link KeywordTagger}
     */
    private String abstractLanguage = "en";

    /**
     * Meta data files of the publications in which information about
     * the title, author, abstract, subject, identifier, URL, and language
     * can be found.
     * {@link TextAndMetaDataExtractor}
     */
    private List<String> metaDataFiles = new ArrayList<>();

    //
    //
    //
    // GETTERS / SETTERS
    //
    //
    //

    public ExecutionStatus getStatus() {
        return status;
    }

    public void setStatus(ExecutionStatus status) {
        this.status = status;
    }

    public List<String> getLog() {
        return log;
    }

    public void setLog(List<String> log) {
        this.log = log;
    }

    public List<String> getInputFiles() {
        return inputFiles;
    }

    public void setInputFiles(List<String> paramPdfInput) {
        this.inputFiles = paramPdfInput;
    }

    /**
     * @return the first input file
     * @throws IndexOutOfBoundsException if there are no input files
     */
    @JsonIgnore
    public String getFirstInputFile() {
        return inputFiles.get(0);
    }

    /**
     * Replaces the first input file, or adds it if the list is empty.
     * A {@code null} list is replaced by a fresh one first.
     *
     * @param fileName the file to store at index 0
     */
    @JsonIgnore
    public void setFirstInputFile(String fileName) {
        if (null == inputFiles) {
            inputFiles = new ArrayList<>();
        }
        if (inputFiles.size() > 0) {
            inputFiles.set(0, fileName);
        } else {
            inputFiles.add(fileName);
        }
    }

    public List<String> getOutputFiles() {
        return outputFiles;
    }

    public void setOutputFiles(List<String> paramPdfOutput) {
        this.outputFiles = paramPdfOutput;
    }

    /**
     * @return the first output file
     * @throws IndexOutOfBoundsException if there are no output files
     */
    @JsonIgnore
    public String getFirstOutputFile() {
        return outputFiles.get(0);
    }

    /**
     * Replaces the first output file, or adds it if the list is empty.
     * A {@code null} list is replaced by a fresh one first.
     *
     * @param fileName the file to store at index 0
     */
    @JsonIgnore
    public void setFirstOutputFile(String fileName) {
        if (null == outputFiles) {
            outputFiles = new ArrayList<>();
        }
        if (outputFiles.size() > 0) {
            outputFiles.set(0, fileName);
        } else {
            outputFiles.add(fileName);
        }
    }

    public boolean isRemoveBib() {
        return removeBib;
    }

    public void setRemoveBib(boolean removeBib) {
        this.removeBib = removeBib;
    }

    /**
     * @return the tokenize flag, or {@code null} if it was never set
     */
    public Boolean isTokenize() {
        return this.tokenize;
    }

    public void setTokenize(boolean tokenize) {
        this.tokenize = tokenize;
    }

    public String getOutputDirectory() {
        return outputDirectory;
    }

    public void setOutputDirectory(String outputDirectory) {
        this.outputDirectory = outputDirectory;
    }

    public String getIndexDirectory() {
        return indexDirectory;
    }

    public void setIndexDirectory(String indexDirectory) {
        this.indexDirectory = indexDirectory;
    }

    public Class<? extends Algorithm> getAlgorithm() {
        return algorithm;
    }

    public void setAlgorithm(Class<? extends Algorithm> algorithm) {
        this.algorithm = algorithm;
    }

    public Date getStartTime() {
        return startTime;
    }

    public void setStartTime(Date startTime) {
        this.startTime = startTime;
    }

    public Date getEndTime() {
        return endTime;
    }

    public void setEndTime(Date endTime) {
        this.endTime = endTime;
    }

    public int getPhraseSlop() {
        return phraseSlop;
    }

    public void setPhraseSlop(int phraseSlop) {
        this.phraseSlop = phraseSlop;
    }

    public boolean isAllowLeadingWildcards() {
        return allowLeadingWildcards;
    }

    public void setAllowLeadingWildcards(boolean allowLeadingWildcards) {
        this.allowLeadingWildcards = allowLeadingWildcards;
    }

    public int getMaxClauseCount() {
        return maxClauseCount;
    }

    public void setMaxClauseCount(int maxClauseCount) {
        this.maxClauseCount = maxClauseCount;
    }

    public String getSearchTerm() {
        return searchTerm;
    }

    public void setSearchTerm(String searchTerm) {
        this.searchTerm = searchTerm;
    }

    public String getSearchQuery() {
        return searchQuery;
    }

    public void setSearchQuery(String searchQuery) {
        this.searchQuery = searchQuery;
    }

    public void setLeftContextGroup(int groupNum) {
        this.leftContextGroup = groupNum;
    }

    public void setRightContextGroup(int groupNum) {
        this.rightContextGroup = groupNum;
    }

    public void setReferenceGroup(int groupNum) {
        this.referenceGroup = groupNum;
    }

    public int getLeftContextGroup() {
        return this.leftContextGroup;
    }

    public int getRightContextGroup() {
        return this.rightContextGroup;
    }

    public int getReferenceGroup() {
        return this.referenceGroup;
    }

    public List<String> getTextualReferences() {
        return textualReferences;
    }

    public void setTextualReferences(List<String> textualReferences) {
        this.textualReferences = textualReferences;
    }

    public List<String> getPatterns() {
        return patterns;
    }

    public void setPatterns(List<String> patterns) {
        this.patterns = patterns;
    }

    public boolean isUpperCaseConstraint() {
        return upperCaseConstraint;
    }

    public void setUpperCaseConstraint(boolean upperCaseConstraint) {
        this.upperCaseConstraint = upperCaseConstraint;
    }

    public List<String> getLinks() {
        return links;
    }

    public List<String> getSeeds() {
        return seeds;
    }

    public void setSeeds(List<String> terms) {
        this.seeds = terms;
    }

    public int getMaxIterations() {
        return maxIterations;
    }

    public void setMaxIterations(int maxIterations) {
        this.maxIterations = maxIterations;
    }

    public int getWindowsize() {
        return this.windowsize;
    }

    public void setWindowsize(int windowsize) {
        this.windowsize = windowsize;
    }

    public double getReliabilityThreshold() {
        return reliabilityThreshold;
    }

    public void setReliabilityThreshold(double threshold) {
        this.reliabilityThreshold = threshold;
    }

    public BootstrapStrategy getBootstrapStrategy() {
        return bootstrapStrategy;
    }

    public void setBootstrapStrategy(BootstrapStrategy bootstrapStrategy) {
        this.bootstrapStrategy = bootstrapStrategy;
    }

    public List<String> getQueryServices() {
        return queryServices;
    }

    public void setQueryServices(List<String> queryServices) {
        this.queryServices = queryServices;
    }

    public List<String> getSearchResults() {
        return searchResults;
    }

    public void setSearchResults(List<String> searchResults) {
        this.searchResults = searchResults;
    }

    public void setLinks(List<String> links) {
        this.links = links;
    }

    public Class<? extends SearchResultLinker> getSearchResultLinkerClass() {
        return this.searchResultLinkerClass;
    }

    public void setSearchResultLinkerClass(Class<? extends SearchResultLinker> searchResultLinkerClass) {
        this.searchResultLinkerClass = searchResultLinkerClass;
    }

    public List<String> getLinkedEntities() {
        return linkedEntities;
    }

    public void setLinkedEntities(List<String> linkedEntities) {
        this.linkedEntities = linkedEntities;
    }

    public long getProgress() {
        return progress;
    }

    public Set<String> getInfolisPatternTags() {
        return infolisPatternTags;
    }

    public void setInfolisPatternTags(Set<String> infolisPatternTags) {
        this.infolisPatternTags = infolisPatternTags;
    }

    public Set<String> getInfolisFileTags() {
        return infolisFileTags;
    }

    public void setInfolisFileTags(Set<String> infolisFileTags) {
        this.infolisFileTags = infolisFileTags;
    }

    public Set<String> getTextualReferenceTags() {
        return this.textualReferenceTags;
    }

    public void setTextualReferenceTags(Set<String> textualReferenceTags) {
        this.textualReferenceTags = textualReferenceTags;
    }

    public void setProgress(long progress) {
        this.progress = progress;
    }

    public void setOverwriteTextfiles(boolean overwriteTextfiles) {
        this.overwriteTextfiles = overwriteTextfiles;
    }

    public boolean getOverwriteTextfiles() {
        return this.overwriteTextfiles;
    }

    public void setTokenizeNLs(boolean tokenizeNLs) {
        this.tokenizeNLs = tokenizeNLs;
    }

    public boolean getTokenizeNLs() {
        return this.tokenizeNLs;
    }

    public void setPtb3Escaping(boolean ptb3Escaping) {
        this.ptb3Escaping = ptb3Escaping;
    }

    public boolean getPtb3Escaping() {
        return this.ptb3Escaping;
    }

    public void setStartPage(int startPage) {
        this.startPage = startPage;
    }

    public int getStartPage() {
        return this.startPage;
    }

    public void setEntitiesForKeywordTagging(List<String> ents) {
        this.entitiesForKeywordTagging = ents;
    }

    public List<String> getEntitiesForKeywordTagging() {
        return this.entitiesForKeywordTagging;
    }

    public void setKeyWords(List<String> keywords) {
        this.keyWords = keywords;
    }

    public List<String> getKeyWords() {
        return this.keyWords;
    }

    public void setThesaurus(String thesaurus) {
        this.thesaurus = thesaurus;
    }

    public String getThesaurus() {
        return this.thesaurus;
    }

    public void setAbstractLanguage(String absLang) {
        this.abstractLanguage = absLang;
    }

    public String getAbstractLanguage() {
        return this.abstractLanguage;
    }

    public List<String> getMetaDataFiles() {
        return metaDataFiles;
    }

    public void setMetaDataFiles(List<String> metaDataDir) {
        this.metaDataFiles = metaDataDir;
    }

    public List<Class<? extends QueryService>> getQueryServiceClasses() {
        return queryServiceClasses;
    }

    /**
     * Replaces the list of query service classes.
     * <p>
     * Every class is test-instantiated through its no-argument constructor
     * before anything is stored, so a class that cannot be instantiated
     * leaves the previous list untouched. Use
     * {@link #addQueryServiceClasses(Class)} to append a single class
     * instead of replacing the list.
     *
     * @param queryServiceClasses the new classes; {@code null} clears the list
     * @throws IllegalArgumentException if the list contains {@code null}
     * @throws RuntimeException         if one of the classes cannot be instantiated
     */
    public void setQueryServiceClasses(List<Class<? extends QueryService>> queryServiceClasses) {
        if (queryServiceClasses == null) {
            this.queryServiceClasses = null;
            return;
        }
        // Validate into a private copy first: the field is only swapped once
        // every class has proven to be instantiable.
        List<Class<? extends QueryService>> validated = new ArrayList<>(queryServiceClasses.size());
        for (Class<? extends QueryService> qs : queryServiceClasses) {
            instantiateQueryService(qs);
            validated.add(qs);
        }
        this.queryServiceClasses = validated;
    }

    /**
     * Appends one query service class after checking that it can be
     * instantiated through its no-argument constructor.
     *
     * @param queryServiceClasses the class to append
     * @throws IllegalArgumentException if the class is {@code null}
     * @throws RuntimeException         if the class cannot be instantiated
     */
    public void addQueryServiceClasses(Class<? extends QueryService> queryServiceClasses) {
        instantiateQueryService(queryServiceClasses);
        if (this.queryServiceClasses == null) {
            this.queryServiceClasses = new ArrayList<>();
        }
        this.queryServiceClasses.add(queryServiceClasses);
    }

    /**
     * Creates an instance of the given query service class through its
     * no-argument constructor. The setters above use this purely as a
     * validity check and discard the instance.
     *
     * @param qs the class to instantiate
     * @return the new instance
     * @throws IllegalArgumentException if {@code qs} is {@code null}
     * @throws RuntimeException         wrapping any reflection failure
     */
    private QueryService instantiateQueryService(Class<? extends QueryService> qs) {
        if (null == qs) {
            throw new IllegalArgumentException(
                    "Must set 'queryServiceClass' of execution before calling.");
        }
        QueryService queryService;
        try {
            Constructor<? extends QueryService> constructor = qs.getDeclaredConstructor();
            queryService = constructor.newInstance();
        } catch (InstantiationException | IllegalAccessException | NoSuchMethodException
                | SecurityException | IllegalArgumentException | InvocationTargetException e) {
            throw new RuntimeException(e);
        }
        logger.debug("Created instance for queryService '{}'", qs);
        return queryService;
    }

    /**
     * Sets a field of this execution by name via reflection.
     * <p>
     * The field is looked up on the runtime class and then on each
     * superclass, so inherited fields and fields accessed through a
     * subclass of {@code Execution} are found as well. For primitive
     * fields (and for {@code Boolean} fields given a non-{@code Boolean}
     * value) the value's {@code toString()} is parsed into the field type;
     * for {@code char} the first character is used. All other values are
     * assigned as they are.
     *
     * @param fieldName name of the field to set
     * @param value     the new value
     * @throws NoSuchFieldException     if no field of that name exists in the class hierarchy
     * @throws IllegalAccessException   if the field cannot be written (e.g. it is final)
     * @throws IllegalArgumentException if the value is {@code null} for a primitive field,
     *                                  cannot be parsed, or has an incompatible type
     */
    public void setProperty(String fieldName, Object value) throws NoSuchFieldException, IllegalAccessException {
        Field field = findField(fieldName);
        field.setAccessible(true);
        Class<?> type = field.getType();
        if (type.isPrimitive()) {
            if (value == null) {
                throw new IllegalArgumentException(
                        "Cannot assign null to primitive field '" + fieldName + "'");
            }
            field.set(this, parsePrimitive(type, value.toString(), fieldName));
        } else if (type == Boolean.class && value != null && !(value instanceof Boolean)) {
            // Treat the nullable Boolean like the primitive boolean fields so
            // that textual input ("true"/"false") is accepted for both.
            field.set(this, Boolean.valueOf(value.toString()));
        } else {
            field.set(this, value);
        }
    }

    /**
     * Finds a declared field by name on the runtime class or any of its superclasses.
     */
    private Field findField(String fieldName) throws NoSuchFieldException {
        for (Class<?> current = this.getClass(); current != null; current = current.getSuperclass()) {
            try {
                return current.getDeclaredField(fieldName);
            } catch (NoSuchFieldException ignored) {
                // Not declared at this level; continue with the superclass.
            }
        }
        throw new NoSuchFieldException(fieldName);
    }

    /**
     * Parses the textual form of a value into the boxed equivalent of the given primitive type.
     */
    private static Object parsePrimitive(Class<?> type, String text, String fieldName) {
        if (type == Character.TYPE) {
            if (text.isEmpty()) {
                throw new IllegalArgumentException(
                        "Cannot assign an empty value to char field '" + fieldName + "'");
            }
            return text.charAt(0);
        }
        if (type == Short.TYPE) {
            return Short.parseShort(text);
        }
        if (type == Integer.TYPE) {
            return Integer.parseInt(text);
        }
        if (type == Long.TYPE) {
            return Long.parseLong(text);
        }
        if (type == Float.TYPE) {
            return Float.parseFloat(text);
        }
        if (type == Double.TYPE) {
            return Double.parseDouble(text);
        }
        if (type == Byte.TYPE) {
            return Byte.parseByte(text);
        }
        if (type == Boolean.TYPE) {
            return Boolean.parseBoolean(text);
        }
        throw new IllegalArgumentException(
                "Unsupported primitive type " + type + " for field '" + fieldName + "'");
    }

    /**
     * Creates a new execution for the given algorithm that carries over
     * this execution's tags. The log is not shared: the sub-execution
     * starts with its own empty log.
     *
     * @param algo the algorithm class the sub-execution is going to run
     * @return the new sub-execution
     */
    public Execution createSubExecution(Class<? extends BaseAlgorithm> algo) {
        Execution subExec = new Execution(algo);
        // getTags/setTags are not declared in this class; presumably inherited from BaseModel.
        subExec.setTags(getTags());
        return subExec;
    }
}