package io.github.infolis.infolink.patternLearner;

import io.github.infolis.model.entity.InfolisPattern;
import io.github.infolis.model.TextualReference;
import io.github.infolis.model.entity.Entity;
import io.github.infolis.util.MathUtils;

import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Class for storing espresso-like pattern ranking and instance ranking reliability scores.
 *
 * <p>Implemented based on the algorithm in:
 * Patrick Pantel and Marco Pennacchiotti. 2006.
 * Espresso: leveraging generic patterns for automatically harvesting semantic relations.
 * In Proceedings of the 21st International Conference on Computational Linguistics and
 * the 44th annual meeting of the Association for Computational Linguistics (ACL-44).
 * Association for Computational Linguistics, Stroudsburg, PA, USA, 113-120.
 * DOI=10.3115/1220175.1220190 http://dx.doi.org/10.3115/1220175.1220190
 * http://www.anthology.aclweb.org/P/P06/P06-1.pdf#page=153
 *
 * <p>Instances are keyed by their name, patterns by their regex string. Both share one
 * score cache, which is also keyed by name / regex string.
 *
 * @author kata
 */
public class Reliability {

    private static final Logger log = LoggerFactory.getLogger(Reliability.class);

    /** Known instances, keyed by instance name. */
    Map<String, Entity> instances;
    /** Known patterns, keyed by pattern regex. */
    Map<String, InfolisPattern> patterns;
    /** Names of the seed instances; seeds always have reliability 1.0. */
    Set<String> seedTerms;
    /** Highest pmi score registered so far; used to normalize pmi scores. */
    double maximumPmi;

    // Reliability scores may change between iterations in bootstrapping but not during one
    // iteration. The cache avoids multiple computations of the score for the same entities
    // inside the same iteration. To allow scores to change between iterations, reset all
    // scores at the beginning of a new iteration (see deleteScoreCache()).
    Map<String, Double> scoreCache;

    /**
     * Class constructor initializing empty collections for instances, patterns, seed terms
     * and cached scores. The maximum pmi starts at -100.0.
     */
    public Reliability() {
        this.instances = new HashMap<>();
        this.patterns = new HashMap<>();
        this.maximumPmi = -100.0;
        this.seedTerms = new HashSet<>();
        this.scoreCache = new HashMap<>();
    }

    /**
     * Discards all cached reliability scores by replacing the cache with a new empty map.
     */
    public void deleteScoreCache() {
        this.scoreCache = new HashMap<>();
    }

    /**
     * Sets the seed terms. The given set is stored as is (not copied).
     *
     * @param seedTerms names of the seed instances
     */
    public void setSeedTerms(Set<String> seedTerms) {
        this.seedTerms = seedTerms;
    }

    /**
     * @return the seed terms currently stored
     */
    public Set<String> getSeedTerms() {
        return this.seedTerms;
    }

    /**
     * @return the values view of the internal instance map
     */
    public Collection<Entity> getInstances() {
        return this.instances.values();
    }

    /**
     * @return the values view of the internal pattern map
     */
    public Collection<InfolisPattern> getPatterns() {
        return this.patterns.values();
    }

    /**
     * Looks up a pattern by its regex.
     *
     * @param regex regex string the pattern was stored under
     * @return the stored pattern, or {@code null} if no pattern is stored under regex
     */
    public InfolisPattern getPattern(String regex) {
        return this.patterns.get(regex);
    }

    /**
     * Adds a new Instance instance. The instance may have been added before
     * with only a subset of all initializing patterns. Thus, when adding a new
     * instance, checks if an instance with the same name is already known and
     * if so, the new associations are added to the existing associations; the
     * merged associations are set on the given instance, which then replaces
     * the stored one.
     *
     * @param instance Instance instance to be added
     * @return true, if instance was not included in this instances before,
     *         false if already in this instances
     */
    public boolean addInstance(Entity instance) {
        String name = instance.getName();
        if (!this.instances.containsKey(name)) {
            this.instances.put(name, instance);
            return true;
        }
        // Merge: entries of the new instance overwrite stored entries with the same key.
        Map<String, Double> merged = this.instances.get(name).getAssociations();
        merged.putAll(instance.getAssociations());
        instance.setAssociations(merged);
        this.instances.put(name, instance);
        return false;
    }

    /**
     * Adds a new Pattern instance. The pattern may have been added before with
     * only a subset of all extracted instances. Thus, when adding a new
     * pattern, checks if a pattern with the same regex is already known and if
     * so, the new associations are added to the existing associations; the
     * merged associations are set on the given pattern, which then replaces
     * the stored one.
     *
     * @param pattern Pattern instance to be added
     * @return true, if pattern was not included in this patterns before, false
     *         if already in this patterns
     */
    public boolean addPattern(InfolisPattern pattern) {
        String regex = pattern.getPatternRegex();
        if (!this.patterns.containsKey(regex)) {
            this.patterns.put(regex, pattern);
            return true;
        }
        // Merge: entries of the new pattern overwrite stored entries with the same key.
        Map<String, Double> merged = this.patterns.get(regex).getAssociations();
        merged.putAll(pattern.getAssociations());
        pattern.setAssociations(merged);
        this.patterns.put(regex, pattern);
        return false;
    }

    /**
     * Sets this maximumPmi to pmi if higher than or equal to the current maximum.
     *
     * @param pmi the new value to maybe become the new maximum
     * @return true, if pmi is the new maximum (or equal to the existing one),
     *         false otherwise (if lesser than maximum)
     */
    public boolean setMaxPmi(double pmi) {
        if (pmi >= this.maximumPmi) {
            this.maximumPmi = pmi;
            return true;
        }
        return false;
    }

    /**
     * @return the highest pmi registered so far (-100.0 if none was registered)
     */
    public double getMaxPmi() {
        return this.maximumPmi;
    }

    /**
     * Counts joint occurrences of instance and pattern.
     * Needed for computation of probabilities for computation of pmi scores.
     *
     * @param instance instance whose name is searched for
     * @param pattern pattern whose textual references are searched
     * @return number of textual references of pattern whose reference equals the name of instance
     */
    private int countJointOccurrences(Entity instance, InfolisPattern pattern) {
        // Joint occurrences can be calculated in two different ways:
        // either search for pattern in textual references of instance (note: search pattern
        // in the context strings, not search for pattern listed as extracting pattern there
        // as the contexts were extracted by term search, not by applying patterns)
        // or search for term in contexts of pattern.
        // Most efficient solution: search for term in contexts of pattern.
        String instanceName = instance.getName();
        int jointOccurrences = 0;
        for (TextualReference context : pattern.getTextualReferences()) {
            if (context.getReference().equals(instanceName)) {
                jointOccurrences++;
            }
        }
        return jointOccurrences;
    }

    /**
     * Computes the point-wise mutual information score for instance and pattern.
     *
     * <p>The probabilities are estimated as counts divided by dataSize: the number of
     * textual references of the instance, the number of textual references of the pattern
     * and the number of joint occurrences. The score itself is computed by
     * {@code MathUtils.pmi}.
     *
     * @param dataSize size of input data (number of input documents)
     * @param instance the instance
     * @param pattern the pattern
     * @return point-wise mutual information score of instance and pattern; callers in this
     *         class treat NaN and infinite values as "not associated"
     */
    private double computePmi(int dataSize, Entity instance, InfolisPattern pattern) {
        log.debug("computing pmi of instance \"{}\" and pattern \"{}\"",
                instance.getName(), pattern.getPatternRegex());
        int patternCount = pattern.getTextualReferences().size();
        int instanceCount = instance.getTextualReferences().size();
        int jointOccurrences = countJointOccurrences(instance, pattern);

        double size = (double) dataSize;
        // probability of instance occurring in the data
        double probInstance = (double) instanceCount / size;
        // probability of pattern occurring in the data
        double probPattern = (double) patternCount / size;
        // joint probability of pattern and instance occurring in the data
        double probJoint = (double) jointOccurrences / size;

        double pmiScore = MathUtils.pmi(probJoint, probInstance, probPattern);

        log.debug("data size: {}", size);
        log.debug("total studycontexts where instance can be found: {}", instanceCount);
        log.debug("total studycontexts where pattern can be found: {}", patternCount);
        log.debug("total studycontexts where both instance and pattern can be found: {}",
                jointOccurrences);
        log.trace("p_xy: {}", probJoint);
        log.trace("p_x: {}", probInstance);
        log.trace("p_y: {}", probPattern);
        log.trace("pmi: {}", pmiScore);
        return pmiScore;
    }

    /**
     * Computes the reliability score of pattern based on the given data.
     *
     * <p>For every textual reference of pattern whose reference is the name of one of the
     * reliable instances, the pmi of that instance and the pattern is computed, stored as
     * association on both, both are registered with this object and the maximum pmi is
     * updated.
     *
     * @param dataSize size of the input data (number of input documents)
     * @param reliableInstances all currently known instances
     * @param pattern pattern to compute the reliability score for
     * @return pattern reliability score
     * @throws IllegalStateException if the pmi of the pattern and an instance found in its
     *         textual references is NaN or infinite
     */
    public double computeReliability(int dataSize, Set<Entity> reliableInstances,
            InfolisPattern pattern) {
        //TODO: use custom comparator for Entities to avoid necessity of building this map
        Map<String, Entity> reliableInstanceNames = new HashMap<>();
        for (Entity reliable : reliableInstances) {
            reliableInstanceNames.put(reliable.getName(), reliable);
        }

        // compute pmi for every known instance referenced using pattern
        for (TextualReference ref : pattern.getTextualReferences()) {
            // do not try to compute reliability of unknown instances at this step
            if (!reliableInstanceNames.containsKey(ref.getReference())) {
                continue;
            }
            Entity instance = reliableInstanceNames.get(ref.getReference());
            double pmi = this.computePmi(dataSize, instance, pattern);
            // NaN / infinite: instance and pattern do not occur together in the data and
            // are thus not associated. Should not happen here because instance is found
            // as term in the textual references of pattern.
            if (Double.isNaN(pmi) || Double.isInfinite(pmi)) {
                throw new IllegalStateException(
                        "Spurious association of pattern \"" + pattern.getPatternRegex()
                        + "\" and instance \"" + instance.getName() + "\"");
            }
            pattern.addAssociation(instance.getName(), pmi);
            //TODO: why use regex for storing association? Shouldn't the URI be used?
            instance.addAssociation(pattern.getPatternRegex(), pmi);
            this.addPattern(pattern);
            this.addInstance(instance);
            this.setMaxPmi(pmi);
        }
        return this.reliability(pattern, new HashSet<String>());
    }

    /**
     * Computes the reliability score of instance based on the given data.
     *
     * <p>For every reliable pattern, the pmi of the instance and the pattern is computed.
     * Pairs with NaN or infinite pmi are skipped; for all others the pmi is stored as
     * association on both, both are registered with this object and the maximum pmi is
     * updated.
     *
     * @param dataSize size of the input data (number of input documents)
     * @param reliablePatterns currently known reliable patterns
     * @param instance instance to compute the reliability score for
     * @return instance reliability score
     */
    public double computeReliability(int dataSize, Collection<InfolisPattern> reliablePatterns,
            Entity instance) {
        // for every known pattern, check whether instance is associated with it
        for (InfolisPattern pattern : reliablePatterns) {
            double pmi = this.computePmi(dataSize, instance, pattern);
            // instance and pattern never occur together and thus are not associated
            // this may happen here and is not an error
            if (Double.isNaN(pmi) || Double.isInfinite(pmi)) {
                continue;
            }
            instance.addAssociation(pattern.getPatternRegex(), pmi);
            this.addInstance(instance);
            pattern.addAssociation(instance.getName(), pmi);
            this.addPattern(pattern);
            this.setMaxPmi(pmi);
        }
        return this.reliability(instance, new HashSet<String>());
    }

    /**
     * Computes the reliability of an instance.
     *
     * <p>Seed terms always score 1.0. Otherwise a cached score is returned if present.
     * Otherwise the score is the sum, over all associated patterns not contained in
     * callingEntitiesTrace, of {@code (pmi / maximumPmi) * reliability(pattern)}, divided
     * by the total number of associated patterns. The result is cached under the instance
     * name. An instance without any associations scores 0.0 (instead of the NaN a division
     * by zero would yield).
     *
     * @param instance instance to score
     * @param callingEntitiesTrace names / regexes of the entities whose scores are currently
     *        being computed; used to avoid circles. The instance name is added to this set.
     * @return the reliability score
     */
    public double reliability(Entity instance, Set<String> callingEntitiesTrace) {
        String instanceName = instance.getName();
        log.debug("Computing reliability of instance: {}", instanceName);
        if (this.seedTerms.contains(instanceName)) {
            return 1.0;
        }
        if (scoreCache.containsKey(instanceName)) {
            return scoreCache.get(instanceName);
        }
        double rp = 0.0;
        Map<String, Double> patternsAndPmis = instance.getAssociations();
        int associationCount = patternsAndPmis.size();
        for (Map.Entry<String, Double> association : patternsAndPmis.entrySet()) {
            String patternString = association.getKey();
            // avoid circles
            if (callingEntitiesTrace.contains(patternString)) {
                continue;
            }
            double pmi = association.getValue();
            InfolisPattern pattern = this.patterns.get(patternString);
            // Kept inside the loop on purpose: the instance is only put on the trace once
            // an association is actually followed.
            callingEntitiesTrace.add(instanceName);
            if (maximumPmi != 0) {
                rp += ((pmi / maximumPmi) * reliability(pattern, callingEntitiesTrace));
            }
        }
        // Guard against 0.0 / 0 = NaN for instances without associations.
        double score = (associationCount == 0) ? 0.0 : rp / associationCount;
        log.debug("instance max pmi: {}", maximumPmi);
        log.debug("instance number of associations: {}", (float) associationCount);
        log.debug("instance rp: {}", rp);
        log.debug("instance returned reliability: {}", score);
        scoreCache.put(instanceName, score);
        return score;
    }

    /**
     * Computes the reliability of a pattern.
     *
     * <p>A cached score is returned if present. Otherwise the score is the sum, over all
     * associated instances not contained in callingEntitiesTrace, of
     * {@code (pmi / maximumPmi) * reliability(instance)}, divided by the total number of
     * associated instances. The result is cached under the pattern regex. A pattern without
     * any associations scores 0.0 (instead of the NaN a division by zero would yield).
     *
     * @param pattern pattern to score
     * @param callingEntitiesTrace names / regexes of the entities whose scores are currently
     *        being computed; used to avoid circles. The pattern regex is added to this set.
     * @return the reliability score
     */
    public double reliability(InfolisPattern pattern, Set<String> callingEntitiesTrace) {
        String regex = pattern.getPatternRegex();
        log.debug("Computing reliability of pattern: {}", regex);
        if (scoreCache.containsKey(regex)) {
            return scoreCache.get(regex);
        }
        double rp = 0.0;
        Map<String, Double> instancesAndPmis = pattern.getAssociations();
        int associationCount = instancesAndPmis.size();
        for (Map.Entry<String, Double> association : instancesAndPmis.entrySet()) {
            String instanceName = association.getKey();
            // avoid circles
            if (callingEntitiesTrace.contains(instanceName)) {
                continue;
            }
            double pmi = association.getValue();
            Entity instance = instances.get(instanceName);
            // Kept inside the loop on purpose: the pattern is only put on the trace once
            // an association is actually followed.
            callingEntitiesTrace.add(regex);
            // The instance score is computed (and cached) even if maximumPmi is 0.
            double instanceReliability = reliability(instance, callingEntitiesTrace);
            log.debug("stored pmi for pattern \"{}\" and instance \"{}\": {}",
                    regex, instanceName, pmi);
            if (maximumPmi != 0) {
                rp += ((pmi / maximumPmi) * instanceReliability);
            }
        }
        // Guard against 0.0 / 0 = NaN for patterns without associations.
        double score = (associationCount == 0) ? 0.0 : rp / associationCount;
        log.debug("max pmi: {}", maximumPmi);
        log.debug("pattern number of associations: {}", (float) associationCount);
        log.debug("pattern rp: {}", rp);
        log.debug("returned pattern reliability: {}", score);
        scoreCache.put(regex, score);
        return score;
    }
}