package io.github.infolis.algorithm;
import io.github.infolis.datastore.DataStoreClient;
import io.github.infolis.datastore.FileResolver;
import io.github.infolis.infolink.patternLearner.BootstrapLearner;
import io.github.infolis.model.Execution;
import io.github.infolis.model.ExecutionStatus;
import io.github.infolis.model.TextualReference;
import io.github.infolis.model.entity.InfolisPattern;
import io.github.infolis.util.RegexUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.queryparser.classic.ParseException;
import org.slf4j.LoggerFactory;
/**
*
* @author kata
*
*/
public abstract class Bootstrapping extends ComplexAlgorithm implements BootstrapLearner {
public Bootstrapping(DataStoreClient inputDataStoreClient, DataStoreClient outputDataStoreClient, FileResolver inputFileResolver, FileResolver outputFileResolver) throws IOException {
super(inputDataStoreClient, outputDataStoreClient, inputFileResolver, outputFileResolver);
}
private static final org.slf4j.Logger log = LoggerFactory.getLogger(Bootstrapping.class);
public Execution indexerExecution;
public abstract List<TextualReference> bootstrap() throws ParseException, IOException, InstantiationException, IllegalAccessException;
public abstract static class PatternInducer {
protected abstract List<InfolisPattern> induce(TextualReference context, Double[] thresholds);
public abstract int getPatternsPerContext();
};
// TODO define getBestPatterns - method
public abstract class PatternRanker {};
Execution createIndex() throws IOException {
Execution execution = getExecution().createSubExecution(Indexer.class);
execution.setInputFiles(getExecution().getInputFiles());
execution.setOutputDirectory(getExecution().getIndexDirectory());
execution.instantiateAlgorithm(this).run();
getOutputDataStoreClient().post(Execution.class, execution);
return execution;
}
List<TextualReference> getContextsForSeed(String seed) {
// use lucene index to search for term in corpus
Execution execution = getExecution().createSubExecution(LuceneSearcher.class);
execution.setIndexDirectory(indexerExecution.getOutputDirectory());
execution.setPhraseSlop(0);
execution.setAllowLeadingWildcards(true);
execution.setMaxClauseCount(getExecution().getMaxClauseCount());
execution.setSearchTerm(seed);
InfolisPattern termPattern = new InfolisPattern(RegexUtils.normalizeQuery(seed, true));
DataStoreClient tempClient = getTempDataStoreClient();
tempClient.post(InfolisPattern.class, termPattern);
execution.setPatterns(Arrays.asList(termPattern.getUri()));
execution.setInputFiles(getExecution().getInputFiles());
execution.setReliabilityThreshold(getExecution().getReliabilityThreshold());
Algorithm algo = execution.instantiateAlgorithm(
getInputDataStoreClient(), tempClient, getInputFileResolver(), getOutputFileResolver());
algo.run();
getOutputDataStoreClient().post(Execution.class, execution);
getExecution().getLog().addAll(execution.getLog());
List<TextualReference> textualReferences = new ArrayList<>();
for (String uri : execution.getTextualReferences()) {
textualReferences.add(tempClient.get(TextualReference.class, uri));
}
tempClient.clear();
return textualReferences;
}
private List<String> getPatternUris(Collection<InfolisPattern> patternList) {
List<String> patternUris = new ArrayList<String>();
for (InfolisPattern curPat : patternList) {
if (curPat.getUri() == null)
throw new RuntimeException("Pattern does not have a URI!");
patternUris.add(curPat.getUri());
log.debug("pattern " + curPat + " has uri " + curPat.getUri());
}
return patternUris;
}
List<String> getContextsForPatterns(Collection<InfolisPattern> patterns,
DataStoreClient outputDataStoreClient) {
Execution searcherExec = getExecution().createSubExecution(InfolisPatternSearcher.class);
searcherExec.setInputFiles(getExecution().getInputFiles());
searcherExec.setIndexDirectory(indexerExecution.getOutputDirectory());
searcherExec.setPatterns(getPatternUris(patterns));
searcherExec.setUpperCaseConstraint(getExecution().isUpperCaseConstraint());
searcherExec.setPhraseSlop(getExecution().getPhraseSlop());
searcherExec.setAllowLeadingWildcards(true);
searcherExec.setMaxClauseCount(getExecution().getMaxClauseCount());
searcherExec.instantiateAlgorithm(getInputDataStoreClient(), outputDataStoreClient,
getInputFileResolver(), getOutputFileResolver()).run();
return searcherExec.getTextualReferences();
}
@Override
public void validate() {
Execution exec = this.getExecution();
if (null == exec.getSeeds() || exec.getSeeds().isEmpty()) {
throw new IllegalArgumentException("Must set at least one term as seed!");
}
if ((null == exec.getInputFiles() || exec.getInputFiles().isEmpty()) &&
(null == exec.getInfolisFileTags() || exec.getInfolisFileTags().isEmpty())) {
throw new IllegalArgumentException("Must set at least one input file!");
}
if (null == exec.getBootstrapStrategy()) {
throw new IllegalArgumentException("Must set the bootstrap strategy!");
}
if (null == exec.isTokenize()) {
warn(log, "tokenize parameter unspecified. Setting to true for Bootstrapping");
exec.setTokenize(true);
}
}
@Override
public void execute() throws IOException {
Execution tagExec = getExecution().createSubExecution(TagSearcher.class);
tagExec.getInfolisFileTags().addAll(getExecution().getInfolisFileTags());
tagExec.getInfolisPatternTags().addAll(getExecution().getInfolisPatternTags());
tagExec.instantiateAlgorithm(this).run();
getExecution().getPatterns().addAll(tagExec.getPatterns());
getExecution().getInputFiles().addAll(tagExec.getInputFiles());
preprocessInputFiles();
this.indexerExecution = createIndex();
List<TextualReference> detectedReferences = new ArrayList<>();
try {
detectedReferences = bootstrap();
} catch (ParseException | IOException | InstantiationException | IllegalAccessException ex) {
log.error("Could not apply reliability bootstrapping: " + ex);
getExecution().setStatus(ExecutionStatus.FAILED);
}
Set<String> detectedPatterns = new HashSet<>();
for (TextualReference ref : detectedReferences) {
this.getExecution().getTextualReferences().add(ref.getUri());
detectedPatterns.add(ref.getPattern());
}
this.getExecution().getPatterns().addAll(detectedPatterns);
debug(log, "Final list of patterns: ");
for (InfolisPattern p : getOutputDataStoreClient().get(InfolisPattern.class, this.getExecution().getPatterns())) {
debug(log, p.getPatternRegex() + "=" + p.getPatternReliability());
}
getExecution().setStatus(ExecutionStatus.FINISHED);
}
}