package io.github.infolis.algorithm;
import io.github.infolis.datastore.DataStoreClient;
import io.github.infolis.datastore.FileResolver;
import io.github.infolis.infolink.patternLearner.Reliability;
import io.github.infolis.infolink.patternLearner.StandardPatternInducer;
import io.github.infolis.model.entity.InfolisPattern;
import io.github.infolis.model.EntityType;
import io.github.infolis.model.TextualReference;
import io.github.infolis.model.entity.Entity;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.queryparser.classic.ParseException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Bootstrapping algorithm that iteratively searches the corpus for seed instances,
* induces candidate patterns from their contexts, ranks the candidates by reliability
* and uses the top-ranked patterns to extract new instances, which in turn serve as
* seeds for the next iteration.
*
* @author kata
*/
public class ReliabilityBasedBootstrapping extends Bootstrapping {
public ReliabilityBasedBootstrapping(DataStoreClient inputDataStoreClient, DataStoreClient outputDataStoreClient, FileResolver inputFileResolver, FileResolver outputFileResolver) throws IOException {
super(inputDataStoreClient, outputDataStoreClient, inputFileResolver, outputFileResolver);
}
private static final Logger log = LoggerFactory.getLogger(ReliabilityBasedBootstrapping.class);
private Reliability r = new Reliability();
public PatternInducer getPatternInducer() {
return new StandardPatternInducer(getExecution().getWindowsize());
}
public PatternRanker getPatternRanker() {
//return new ReliabilityPatternRanker();
return new RelativeReliabilityPatternRanker();
}
public List<TextualReference> bootstrap() throws IOException, ParseException {
Set<Entity> reliableInstances = new HashSet<>();
Set<InfolisPattern> reliablePatterns = new HashSet<>();
//ReliabilityPatternRanker patternRanker = new ReliabilityPatternRanker();
RelativeReliabilityPatternRanker patternRanker = new RelativeReliabilityPatternRanker();
//TODO define and use generic PatternRanker
//PatternRanker patternRanker = getPatternRanker();
int numIter = 1;
Set<Entity> seeds = new HashSet<>();
Set<String> seedTerms = new HashSet<>();
seedTerms.addAll(getExecution().getSeeds());
this.r.setSeedTerms(seedTerms);
Map<String, Double> lastTopK = new HashMap<>();
// initialize bootstrapping:
// 1. search for all initial seeds and save contexts
for (String seed : seedTerms) {
log.info("Bootstrapping with seed \"" + seed + "\"");
Entity newSeed = new Entity(seed);
newSeed.setTags(getExecution().getTags());
newSeed.setEntityType(EntityType.citedData);
newSeed.setTextualReferences(this.getContextsForSeed(seed));
newSeed.setIsSeed();
seeds.add(newSeed);
}
log.info("Extracted contexts of all seeds.");
log.info("--- Entering Pattern Induction phase ---");
// start bootstrapping
while (numIter < getExecution().getMaxIterations()) {
log.info("Bootstrapping... Iteration: " + numIter);
log.debug("Current reliable instances: ");
for (Entity instance : reliableInstances) log.debug(instance.getName());
log.debug("Current top patterns: " + lastTopK);
// add seeds selected in last iteration to list of reliable instances
reliableInstances.addAll(seeds);
// delete the cache of reliability scores as they may change with new evidence in later iterations
r.deleteScoreCache();
// Pattern Induction
double threshold = getExecution().getReliabilityThreshold();
Double[] thresholds = new Double[9];
// use equal threshold for all candidates
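// (the inducer presumably produces up to nine candidates per context, one per generality
// level, hence the fixed array size of 9)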
Arrays.fill(thresholds, threshold);
List<List<InfolisPattern>> candidatePatterns = constructCandidates(seeds, thresholds);
log.info("Pattern Induction completed.");
log.info("--- Entering Pattern Selection phase ---");
// Pattern Ranking/Selection
// 2. get reliable patterns along with their textual references
// reset list of reliable patterns found in this iteration
Collection<InfolisPattern> reliablePatterns_iteration = new HashSet<>();
Map<String, Double> reliableRegex_iteration = patternRanker.getReliablePatterns(candidatePatterns, reliableInstances);
for (String relRegex : reliableRegex_iteration.keySet()) {
reliablePatterns.add(patternRanker.knownPatterns.get(relRegex));
reliablePatterns_iteration.add(patternRanker.knownPatterns.get(relRegex));
}
log.info("Pattern Selection completed.");
log.info("--- Entering Instance Extraction phase ---");
// Instance Extraction: filter seeds, select only reliable ones
seeds = new HashSet<>();
log.debug("selected " + reliablePatterns_iteration.size() + " patterns");
// get list of new instances from textual references of reliablePatterns_iteration
// compute reliability of all instances
Collection<TextualReference> reliableContexts_iteration = new ArrayList<>();
for (InfolisPattern reliablePattern : reliablePatterns_iteration) reliableContexts_iteration.addAll(reliablePattern.getTextualReferences());
log.debug("extracted " + reliableContexts_iteration.size() + " textual references in this iteration");
Set<String> newInstanceNames = new HashSet<>();
for (TextualReference sC : reliableContexts_iteration) {
String newInstanceName = sC.getReference();
Collection<String> reliableInstanceTerms = new HashSet<>();
for (Entity i : reliableInstances) {
i.setTags(sC.getTags());
reliableInstanceTerms.add(i.getName()); }
if (!reliableInstanceTerms.contains(newInstanceName)) {
newInstanceNames.add(newInstanceName);
log.debug("Found new instance: " + newInstanceName);
}
}
for (String newInstanceName : newInstanceNames) {
Entity newInstance = new Entity(newInstanceName);
newInstance.setEntityType(EntityType.citedData);
// counts of instances are required for computation of pmi
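// (for orientation only: the usual PMI definition is pmi(i, p) = log( count(i, p) / (count(i) * count(p) / N) );
// the exact computation used here is implemented elsewhere (see Reliability and the isReliable methods),
// but it needs the instance's own contexts to obtain count(i))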
newInstance.setTextualReferences(this.getContextsForSeed(newInstanceName));
log.debug("new Instance stored contexts: " + newInstance.getTextualReferences());
// for computation of reliability, save time and consider only patterns of this iteration:
// if the instance had been found by patterns of earlier iterations, it would not be
// considered a new instance here
/*
if (newInstance.isReliable(reliablePatterns_iteration, getExecution().getInputFiles().size(), r, getExecution().getReliabilityThreshold())) {
seeds.add(newInstance);
}*/
//TODO include reliability computation for instances again!
seeds.add(newInstance);
log.debug("Reliability of instance \"" + newInstanceName + "\": " + newInstance.getEntityReliability());
}
for (Entity i : r.getInstances()) {
log.debug("stored instance: \"" + i.getName() + "\"=" + i.getEntityReliability());
log.debug("stored associations: " + i.getAssociations().size());
}
for (InfolisPattern p : r.getPatterns()) {
log.debug("stored pattern: " + p.getPatternRegex());
log.debug("stored associations: " + p.getAssociations().size());
}
// stop iterating if the pattern set is stable or the seed set is empty
if (patternRanker.topK.equals(lastTopK)) {
log.debug("pattern set is stable, nothing more to do. Returning.");
break;
} else if (seeds.isEmpty()) {
log.debug("no new seeds, nothing more to do. Returning.");
break;
} else {
lastTopK = patternRanker.topK;
numIter++;
}
}
Collection<TextualReference> topContexts = new ArrayList<>();
for (String regex : patternRanker.topK.keySet()) {
InfolisPattern topPattern = patternRanker.knownPatterns.get(regex);
this.getOutputDataStoreClient().post(InfolisPattern.class, topPattern);
// textual references hold temporary URIs for patterns
//TODO also for entities?
for (TextualReference textRef : topPattern.getTextualReferences()) {
textRef.setPattern(topPattern.getUri());
/*InfolisFile infolisFile = this.getOutputDataStoreClient().get(InfolisFile.class, textRef.getFile());
textRef.setMentionsReference(infolisFile.getEntity());*/
}
topContexts.addAll(topPattern.getTextualReferences());
}
log.info("Final iteration: " + numIter);
log.debug("Final reliable instances: ");
for (Entity i : reliableInstances) { log.debug(i.getName() + "=" + i.getEntityReliability()); }
log.debug("Final top patterns: ");
for (Map.Entry<String, Double> k : patternRanker.topK.entrySet()) {
log.debug(String.format("%s=%s", k.getKey(), k.getValue()));
}
List<TextualReference> reliableReferences = removeUnreliableInstances(topContexts, reliableInstances);
this.getOutputDataStoreClient().post(TextualReference.class, reliableReferences);
return reliableReferences;
}
private List<TextualReference> removeUnreliableInstances(Collection<TextualReference> contexts, Set<Entity> reliableInstances) {
Set<String> reliableInstanceTerms = new HashSet<String>();
for (Entity i : reliableInstances) { reliableInstanceTerms.add(i.getName()); }
List<TextualReference> res = new ArrayList<>();
for (TextualReference context : contexts) {
if (reliableInstanceTerms.contains(context.getReference())) {
res.add(context);
}
}
return res;
}
/**
* Resolves a list of study context (textual reference) URIs and returns the
* corresponding textual references.
*
* @param URIs list of study context URIs
* @param client data store client used to resolve the URIs
* @return list of corresponding study contexts
*/
private List<TextualReference> getStudyContexts(Collection<String> URIs, DataStoreClient client) {
List<TextualReference> contexts = new ArrayList<>();
for (String uri : URIs) {
contexts.add(client.get(TextualReference.class, uri));
}
return contexts;
}
/**
* Constructs all pattern candidates from the textual references of the given
* instances, using the specified thresholds for the different kinds of patterns
* (different generality levels).
*
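* <p>For orientation, this mirrors how the method is called from {@link #bootstrap()}:
* <pre>{@code
* double threshold = getExecution().getReliabilityThreshold();
* Double[] thresholds = new Double[9];
* Arrays.fill(thresholds, threshold);
* List<List<InfolisPattern>> candidatePatterns = constructCandidates(seeds, thresholds);
* }</pre>
*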
* @param instances instances whose textual references (retrieved through term
* search for the seeds) serve as the basis for pattern induction
* @param thresholds reliability thresholds for the different generality levels of patterns
* @return one list of candidate patterns per textual reference
*/
private List<List<InfolisPattern>> constructCandidates(Collection<Entity> instances, Double[] thresholds) {
List<List<InfolisPattern>> candidateList = new ArrayList<>();
for (Entity i : instances) {
for (TextualReference context : i.getTextualReferences()) {
candidateList.add(getPatternInducer().induce(context, thresholds));
}
}
return candidateList;
}
/**
* Class for pattern ranking and selection using an absolute reliability threshold:
* only candidates whose score reaches the configured threshold are kept, and the
* best of these are returned as the top-k set.
*
* @author kata
*
*/
private class ReliabilityPatternRanker extends Bootstrapping.PatternRanker {
//TODO custom comparator for entities..
private Map<String,InfolisPattern> knownPatterns = new HashMap<>();
private Map<Double, Collection<String>> reliableRegex = new HashMap<>();
private Map<String, Double> topK = new HashMap<>();
private DataStoreClient tempClient = getTempDataStoreClient();
/**
* Ranks all candidate patterns and returns the most reliable ones.
*
* @param candidatesPerContext list of candidate patterns induced per context
* @param relInstances instances currently considered reliable
* @return map of the top-ranked pattern regexes and their reliability scores
* @throws IOException
* @throws ParseException
*/
private Map<String, Double> getReliablePatterns(List<List<InfolisPattern>> candidatesPerContext, Set<Entity> relInstances) throws IOException, ParseException {
int size = getExecution().getInputFiles().size();
List<String> processedRegex_iteration = new ArrayList<>();
for (List<InfolisPattern> candidatesForContext : candidatesPerContext) {
for (InfolisPattern candidate : candidatesForContext) {
// may be null if the context had fewer words than the window size used for pattern induction
if (null == candidate.getLuceneQuery()) continue;
log.debug("Checking if pattern is reliable: " + candidate.getPatternRegex());
// Do not process a pattern more than once within one iteration: its score does not change.
// Scores may change from iteration to iteration, however, so do not exclude
// patterns that were already checked in a previous iteration.
if (processedRegex_iteration.contains(candidate.getPatternRegex())) {
log.debug("Pattern already known, continuing.");
break; // this prohibits induction of less general patterns
//continue; // this prohibits induction of duplicate patterns but allows less general ones
}
// compute reliability again for patterns known from previous iterations - scores may change
if (this.knownPatterns.containsKey(candidate.getPatternRegex())) {
candidate = this.knownPatterns.get(candidate.getPatternRegex());
//contexts_pattern = candidatePattern.getTextualReferences();
} // even potentially unreliable candidates need a URI for extraction of contexts
else {
tempClient.post(InfolisPattern.class, candidate);
// TODO: use on set of candidates instead of on single candidate
candidate.setTextualReferences(getStudyContexts(getContextsForPatterns(Arrays.asList(candidate), tempClient), tempClient));
this.knownPatterns.put(candidate.getPatternRegex(), candidate);
}
// Pattern Ranking / Selection
if (candidate.isReliable(size, relInstances, r)) {
double candidateReliability = candidate.getPatternReliability();
log.debug("Pattern reliable, score: " + candidateReliability);
Collection<String> regexWithSameScore = new ArrayList<>();
if (this.reliableRegex.containsKey(candidateReliability)) {
regexWithSameScore = this.reliableRegex.get(candidateReliability);
}
regexWithSameScore.add(candidate.getPatternRegex());
this.reliableRegex.put(candidateReliability, regexWithSameScore);
// this returns the top k patterns regardless of whether their score is above the threshold
//topK = getTopK(this.reliableRegex, 5);
// this returns all top k patterns above the threshold
//TODO: start with small k and increase with each iteration
//TODO: at the same time, decrease thresholds slightly
this.topK = getTopK(removeBelowThreshold(this.reliableRegex, getExecution().getReliabilityThreshold()), 100);
processedRegex_iteration.add(candidate.getPatternRegex());
break; // this prohibits induction of less general patterns
// and the equally general pattern of the other type (e.g. candidate2 vs. candidateB)
//continue; // this prohibits induction of duplicate patterns but allows less general ones
} else {
processedRegex_iteration.add(candidate.getPatternRegex());
log.debug("Pattern unreliable, score: " + candidate.getPatternReliability());
}
}
}
tempClient.clear();
// this returns only the most reliable patterns, not all reliable ones
// thus, new seeds are generated based on the most reliable patterns only
return this.topK;
}
}
/**
* Class for pattern ranking and selection using a relative threshold: the top-k
* pattern set is only replaced if the new set does not change the average
* confidence by more than the configured reliability threshold.
*
* @author kata
*
*/
private class RelativeReliabilityPatternRanker extends Bootstrapping.PatternRanker {
//TODO custom comparator for entities..
private Map<String,InfolisPattern> knownPatterns = new HashMap<>();
private Map<Double, Collection<String>> reliableRegex = new HashMap<>();
private Map<String, Double> topK = new HashMap<>();
private DataStoreClient tempClient = getTempDataStoreClient();
/**
* Ranks all candidate patterns and updates the top-k set, keeping the previous
* set if the new one diverges too much in average confidence.
*
* @param candidatesPerContext list of candidate patterns induced per context
* @param relInstances instances currently considered reliable
* @return map of the top-ranked pattern regexes and their reliability scores
* @throws IOException
* @throws ParseException
*/
private Map<String, Double> getReliablePatterns(List<List<InfolisPattern>> candidatesPerContext, Set<Entity> relInstances) throws IOException, ParseException {
int size = getExecution().getInputFiles().size();
List<String> processedRegex_iteration = new ArrayList<>();
Map<String, Double> lastTopK = new HashMap<>(topK);
boolean firstIteration = lastTopK.isEmpty();
double summedConfidenceTopK = 0;
for (double confidence : lastTopK.values()) {
summedConfidenceTopK += confidence;
}
double averageConfidenceTopK = 0;
if (!firstIteration) averageConfidenceTopK = summedConfidenceTopK / lastTopK.size();
// let k grow by one with each iteration; start with k = 20 in the first iteration
int newK = lastTopK.size() + 1;
if (firstIteration) newK += 19;
for (List<InfolisPattern> candidatesForContext : candidatesPerContext) {
for (InfolisPattern candidate : candidatesForContext) {
// may be null if the context had fewer words than the window size used for pattern induction
if (null == candidate.getLuceneQuery()) continue;
log.debug("Checking if pattern is reliable: " + candidate.getPatternRegex());
// Do not process a pattern more than once within one iteration: its score does not change.
// Scores may change from iteration to iteration, however, so do not exclude
// patterns that were already checked in a previous iteration.
if (processedRegex_iteration.contains(candidate.getPatternRegex())) {
log.debug("Pattern already known, continuing.");
break; // this prohibits induction of less general patterns
//continue; // this prohibits induction of duplicate patterns but allows less general ones
}
// compute reliability again for patterns known from previous iterations - scores may change
if (this.knownPatterns.containsKey(candidate.getPatternRegex())) {
candidate = this.knownPatterns.get(candidate.getPatternRegex());
//contexts_pattern = candidatePattern.getTextualReferences();
} // even potentially unreliable candidates need a URI for extraction of contexts
else {
tempClient.post(InfolisPattern.class, candidate);
// TODO: use on set of candidates instead of on single candidate
candidate.setTextualReferences(getStudyContexts(getContextsForPatterns(Arrays.asList(candidate), tempClient), tempClient));
this.knownPatterns.put(candidate.getPatternRegex(), candidate);
}
// Pattern Ranking / Selection
candidate.isReliable(size, relInstances, r);
double candidateReliability = candidate.getPatternReliability();
// TODO is the approximate highlighter implementation the cause for this happening?
if (Double.isNaN(candidateReliability)) {
log.warn("Pattern has score of NaN. Ignoring: " + candidate.getLuceneQuery());
continue;
}
log.debug("Pattern score: " + candidateReliability);
Collection<String> regexWithSameScore = new ArrayList<>();
if (this.reliableRegex.containsKey(candidateReliability)) {
regexWithSameScore = this.reliableRegex.get(candidateReliability);
}
regexWithSameScore.add(candidate.getPatternRegex());
this.reliableRegex.put(candidateReliability, regexWithSameScore);
processedRegex_iteration.add(candidate.getPatternRegex());
}
}
Map<String, Double> newTopK = getTopK(this.reliableRegex, newK);
double summedConfidenceNewTopK = 0;
if (firstIteration) {
this.topK = newTopK;
tempClient.clear();
return this.topK;
}
else {
for (double confidence : newTopK.values()) {
summedConfidenceNewTopK += confidence;
}
double averageConfidenceNewTopK = summedConfidenceNewTopK / newTopK.size();
// if the new patterns would decrease the average confidence more than allowed, reset pattern set
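// illustrative numbers only: with a current average of 0.6, a new average of 0.3 and a
// reliability threshold of 0.2, the divergence of 0.3 exceeds the threshold and the
// previous top-k set is kept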
log.debug("average score of current top k patterns: " + averageConfidenceTopK);
log.debug("average score of new top k patterns: " + averageConfidenceNewTopK);
log.debug("divergence: " + Math.abs(averageConfidenceTopK - averageConfidenceNewTopK));
if (Math.abs(averageConfidenceTopK - averageConfidenceNewTopK) > getExecution().getReliabilityThreshold()) {
this.topK = lastTopK;
}
else this.topK = newTopK;
tempClient.clear();
return this.topK;
}
}
}
/**
* Checks whether the score of a map entry is below the given threshold.
*
* @param item map entry having the score as key
* @param threshold threshold for the score
* @return true if the entry's score is below the threshold
*/
static boolean isBelowThreshold(Map.Entry<Double, Collection<String>> item, double threshold) {
return (item.getKey() < threshold);
}
/**
* Removes, in place, all entries from the given map whose score is below the given threshold.
*
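* <p>A minimal sketch (scores and pattern strings are made up):
* <pre>{@code
* Map<Double, Collection<String>> scored = new HashMap<>();
* scored.put(0.9, Arrays.asList("patternA"));
* scored.put(0.4, Arrays.asList("patternB"));
* // with a threshold of 0.5, only the 0.9 entry survives; the map is modified in place
* removeBelowThreshold(scored, 0.5);
* }</pre>
*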
* @param patternScoreMap map with scores as keys
* @param threshold threshold for acceptance of entries
* @return map containing only entries with scores higher than or equal to
* threshold
*/
static Map<Double, Collection<String>> removeBelowThreshold(Map<Double, Collection<String>> patternScoreMap, double threshold) {
Iterator<Map.Entry<Double, Collection<String>>> iter = patternScoreMap.entrySet().iterator();
while (iter.hasNext()) {
Map.Entry<Double, Collection<String>> entry = iter.next();
if (isBelowThreshold(entry, threshold)) {
iter.remove();
}
}
return patternScoreMap;
}
/**
* Filters the given map and returns only the top k entries.
*
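* <p>A minimal sketch (scores and pattern strings are made up):
* <pre>{@code
* Map<Double, Collection<String>> scored = new HashMap<>();
* scored.put(0.9, Arrays.asList("patternA"));
* scored.put(0.4, Arrays.asList("patternB", "patternC"));
* // k = 2 keeps "patternA" (0.9) and "patternB" (0.4); "patternC" is cut off
* Map<String, Double> top2 = getTopK(scored, 2);
* }</pre>
*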
* @param patternScoreMap map with scores as keys and a collection of
* patterns with this score as values
* @param k maximal number of entries to return
* @return map of k-best entries having pattern strings as keys and scores
* as values
*/
static Map<String, Double> getTopK(Map<Double, Collection<String>> patternScoreMap, int k) {
Map<String, Double> topK = new HashMap<>();
List<Double> scores = new ArrayList<>(patternScoreMap.keySet());
Collections.sort(scores, Collections.reverseOrder());
int n = 0;
for (double score : scores.subList(0, Math.min(k, scores.size()))) {
for (String value : patternScoreMap.get(score)) {
if (n >= k) {
break;
}
topK.put(value, score);
n++;
}
}
return topK;
}
}