package io.github.infolis.algorithm;

import java.io.IOException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import io.github.infolis.datastore.DataStoreClient;
import io.github.infolis.datastore.FileResolver;
import io.github.infolis.model.BootstrapStrategy;
import io.github.infolis.model.Execution;
import io.github.infolis.model.ExecutionStatus;
import io.github.infolis.util.SerializationUtils;

/**
 * Complex algorithm that chains two sub-executions: a bootstrapping run that
 * learns patterns and extracts textual references, followed by a
 * {@link ReferenceLinker} run that turns those references into links.
 *
 * <p>Before the two steps, a {@link TagSearcher} sub-execution is run with this
 * execution's file tags and the files it reports are appended to this
 * execution's input files.
 *
 * @author kata
 */
public class LearnPatternsAndCreateLinks extends ComplexAlgorithm {

    private static final Logger log = LoggerFactory.getLogger(LearnPatternsAndCreateLinks.class);

    public LearnPatternsAndCreateLinks(DataStoreClient inputDataStoreClient,
            DataStoreClient outputDataStoreClient,
            FileResolver inputFileResolver,
            FileResolver outputFileResolver) {
        super(inputDataStoreClient, outputDataStoreClient, inputFileResolver, outputFileResolver);
    }

    /**
     * Runs the tag search, preprocesses the input files, then learns patterns
     * and creates links. On success the textual references and links of the
     * linking sub-execution are copied to this execution and its status is set
     * to {@link ExecutionStatus#FINISHED}; if one of the caught exceptions is
     * thrown the status is set to {@link ExecutionStatus#FAILED}.
     */
    @Override
    public void execute() {
        // Collect the files reported by the tag search and add them to the input files.
        Execution tagExec = getExecution().createSubExecution(TagSearcher.class);
        tagExec.getInfolisFileTags().addAll(getExecution().getInfolisFileTags());
        tagExec.instantiateAlgorithm(this).run();
        getExecution().getInputFiles().addAll(tagExec.getInputFiles());

        preprocessInputFiles();

        try {
            debug(log, "Step1: Learning patterns and extracting textual references...");
            Execution learnExec = learn();
            updateProgress(1, 2);

            debug(log, "Step 2: Creating links...");
            Execution linkingExec = createLinks(learnExec);
            updateProgress(2, 2);

            getExecution().setTextualReferences(linkingExec.getTextualReferences());
            getExecution().setLinks(linkingExec.getLinks());
            debug(log, "Done. Returning {} textual references and {} entity links",
                    getExecution().getTextualReferences().size(),
                    getExecution().getLinks().size());

            // Only serialize the links when the output will actually be logged:
            // the CSV dump is purely diagnostic and must neither cost a full
            // serialization pass nor fail the execution when debug logging is off.
            if (log.isDebugEnabled()) {
                log.debug(SerializationUtils.toCsv(getExecution().getLinks(), getOutputDataStoreClient()));
            }
            getExecution().setStatus(ExecutionStatus.FINISHED);
        } catch (IllegalArgumentException | IllegalAlgorithmArgumentException | IOException e) {
            error(log, "Execution threw an Exception: {}", e);
            getExecution().setStatus(ExecutionStatus.FAILED);
        }
    }

    /**
     * Creates and runs a {@link ReferenceLinker} sub-execution.
     *
     * <p>The sub-execution receives this execution's search result linker class
     * and input files, the textual references of {@code learnExec}, and the
     * query service classes / query services of this execution where those are
     * non-null.
     *
     * @param learnExec the finished learning sub-execution providing the textual references
     * @return the linking sub-execution after it has been run
     * @throws IllegalAlgorithmArgumentException declared for callers; see the sub-algorithm
     * @throws IOException declared for callers; see the sub-algorithm
     */
    private Execution createLinks(Execution learnExec) throws IllegalAlgorithmArgumentException, IOException {
        Execution linkExec = getExecution().createSubExecution(ReferenceLinker.class);
        linkExec.setSearchResultLinkerClass(getExecution().getSearchResultLinkerClass());
        linkExec.setInputFiles(getExecution().getInputFiles());
        linkExec.setTextualReferences(learnExec.getTextualReferences());
        if (null != getExecution().getQueryServiceClasses()) {
            linkExec.setQueryServiceClasses(getExecution().getQueryServiceClasses());
        }
        if (null != getExecution().getQueryServices()) {
            linkExec.setQueryServices(getExecution().getQueryServices());
        }
        linkExec.instantiateAlgorithm(this).run();
        return linkExec;
    }

    /**
     * Creates and runs the bootstrapping sub-execution.
     *
     * <p>{@link ReliabilityBasedBootstrapping} is used when the bootstrap
     * strategy equals {@link BootstrapStrategy#reliability}, otherwise
     * {@link FrequencyBasedBootstrapping}. The bootstrapping-related parameters
     * of this execution are copied to the sub-execution before it is run.
     *
     * @return the learning sub-execution after it has been run
     * @throws IllegalArgumentException declared for callers; see the sub-algorithm
     * @throws IOException declared for callers; see the sub-algorithm
     */
    private Execution learn() throws IllegalArgumentException, IOException {
        Execution learnExec;
        if (getExecution().getBootstrapStrategy().equals(BootstrapStrategy.reliability)) {
            learnExec = getExecution().createSubExecution(ReliabilityBasedBootstrapping.class);
        } else {
            learnExec = getExecution().createSubExecution(FrequencyBasedBootstrapping.class);
        }
        learnExec.setInputFiles(getExecution().getInputFiles());
        learnExec.setBootstrapStrategy(getExecution().getBootstrapStrategy());
        learnExec.setStartPage(getExecution().getStartPage());
        learnExec.setRemoveBib(getExecution().isRemoveBib());
        learnExec.setTokenize(getExecution().isTokenize());
        learnExec.setTokenizeNLs(getExecution().getTokenizeNLs());
        learnExec.setPtb3Escaping(getExecution().getPtb3Escaping());
        learnExec.setPhraseSlop(getExecution().getPhraseSlop());
        learnExec.setSeeds(getExecution().getSeeds());
        learnExec.setUpperCaseConstraint(getExecution().isUpperCaseConstraint());
        learnExec.setReliabilityThreshold(getExecution().getReliabilityThreshold());
        learnExec.setMaxIterations(getExecution().getMaxIterations());
        learnExec.instantiateAlgorithm(this).run();
        return learnExec;
    }

    /**
     * Checks the parameters of the current execution.
     *
     * <p>Required: non-empty seeds; non-empty input files or non-empty file
     * tags; a bootstrap strategy; at least one non-empty collection of query
     * service classes or query services; a search result linker class. An
     * unspecified {@code tokenize} parameter is not an error: a warning is
     * emitted and it is set to {@code true}.
     *
     * @throws IllegalAlgorithmArgumentException if a required parameter is missing
     */
    @Override
    public void validate() throws IllegalAlgorithmArgumentException {
        Execution exec = this.getExecution();
        if (null == exec.getSeeds() || exec.getSeeds().isEmpty()) {
            throw new IllegalAlgorithmArgumentException(getClass(), "seeds",
                    "Required parameter 'seeds' is missing!");
        }
        // Input may be given either directly as files or indirectly via tags.
        if ((null == exec.getInputFiles() || exec.getInputFiles().isEmpty())
                && (null == exec.getInfolisFileTags() || exec.getInfolisFileTags().isEmpty())) {
            throw new IllegalAlgorithmArgumentException(getClass(), "inputFiles",
                    "Required parameter 'inputFiles' is missing!");
        }
        if (null == exec.getBootstrapStrategy()) {
            throw new IllegalAlgorithmArgumentException(getClass(), "bootstrapStrategy",
                    "Required parameter 'bootstrapStrategy' is missing!");
        }
        if (null == exec.isTokenize()) {
            warn(log, "Warning: tokenize parameter unspecified. Defaulting to true for LearnPatternsAndCreateLinks.");
            this.getExecution().setTokenize(true);
        }
        boolean hasQueryServiceClasses = null != exec.getQueryServiceClasses()
                && !exec.getQueryServiceClasses().isEmpty();
        boolean hasQueryServices = null != exec.getQueryServices()
                && !exec.getQueryServices().isEmpty();
        if (!hasQueryServiceClasses && !hasQueryServices) {
            throw new IllegalAlgorithmArgumentException(getClass(), "queryService",
                    "Required parameter 'query services' is missing!");
        }
        if (null == exec.getSearchResultLinkerClass()) {
            throw new IllegalAlgorithmArgumentException(getClass(), "searchResultLinkerClass",
                    "Required parameter 'SearchResultLinkerClass' is missing!");
        }
    }
}