package io.github.infolis.algorithm; import io.github.infolis.datastore.DataStoreClient; import io.github.infolis.datastore.FileResolver; import io.github.infolis.infolink.patternLearner.StandardPatternInducer; import io.github.infolis.model.BootstrapStrategy; import io.github.infolis.model.EntityType; import io.github.infolis.model.entity.InfolisPattern; import io.github.infolis.model.TextualReference; import io.github.infolis.model.entity.Entity; import io.github.infolis.util.RegexUtils; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.lucene.queryparser.classic.ParseException; import org.slf4j.LoggerFactory; /** * * @author kata */ public class FrequencyBasedBootstrapping extends Bootstrapping { public FrequencyBasedBootstrapping(DataStoreClient inputDataStoreClient, DataStoreClient outputDataStoreClient, FileResolver inputFileResolver, FileResolver outputFileResolver) throws IOException { super(inputDataStoreClient, outputDataStoreClient, inputFileResolver, outputFileResolver); } private static final org.slf4j.Logger log = LoggerFactory.getLogger(FrequencyBasedBootstrapping.class); public PatternInducer getPatternInducer() { return new StandardPatternInducer(getExecution().getWindowsize()); } public PatternRanker getPatternRanker() { return new FrequencyPatternRanker(); } /** * Generates extraction patterns using an iterative bootstrapping approach. * * <ol> * <li>searches for seeds in the specified corpus and extracts the * surrounding words as contexts</li> * <li>analyzes contexts and generates extraction patterns</li> * <li>applies extraction patterns on corpus to extract new seeds</li> * <li>continues with 1) until maximum number of iterations is reached</li> * <li>outputs found seeds, contexts and extraction patterns</li> * </ol> * * Method for assessing pattern validity is frequency-based. * * @param seed the term to be searched as starting point in the current * iteration * @param threshold threshold for accepting patterns * @param maxIterations maximum number of iterations for algorithm * @throws IllegalAccessException * @throws InstantiationException * */ public List<TextualReference> bootstrap() throws ParseException, IOException, InstantiationException, IllegalAccessException { int numIter = 1; List<TextualReference> extractedContextsFromSeeds = new ArrayList<>(); List<TextualReference> extractedContextsFromPatterns = new ArrayList<>(); Map<String, Entity> processedSeeds = new HashMap<>(); List<String> processedPatterns = new ArrayList<>(); Set<Entity> seeds = new HashSet<>(); Set<Entity> newSeedsIteration = new HashSet<>(); Set<String> newSeedTermsIteration = new HashSet<>(); FrequencyPatternRanker ranker = new FrequencyPatternRanker(); //TODO define and use generic PatternRanker //PatternRanker ranker = getPatternRanker(); for (String term : getExecution().getSeeds()) { Entity entity = new Entity(term); entity.setTags(getExecution().getTags()); entity.setEntityType(EntityType.citedData); entity.setIsSeed(); newSeedsIteration.add(entity); } while (numIter < getExecution().getMaxIterations()) { updateProgress(numIter, getExecution().getMaxIterations()); seeds = newSeedsIteration; newSeedsIteration = new HashSet<>(); newSeedTermsIteration = new HashSet<>(); HashSet<String> addedSeeds = new HashSet<>(); //info(log, "Bootstrapping... Iteration: " + numIter); debug(log, "Bootstrapping... Iteration: " + numIter); Set<InfolisPattern> newPatterns = new HashSet<>(); List<TextualReference> contexts_currentIteration = new ArrayList<>(); for (Entity seed : seeds) { //info(log, "Bootstrapping with seed \"" + seed.getName() + "\""); debug(log, "Bootstrapping with seed \"" + seed.getName() + "\""); if (processedSeeds.keySet().contains(seed.getName())) { if (getExecution().getBootstrapStrategy() == BootstrapStrategy.mergeCurrent) { // add context of each seed only once even if seed was found multiple times if (!addedSeeds.contains(seed.getName())) { contexts_currentIteration.addAll(processedSeeds.get(seed.getName()).getTextualReferences()); addedSeeds.add(seed.getName()); } } //log.trace("seed " + seed.getName() + " already known, continuing."); continue; } // 1. use lucene index to search for term in corpus List<TextualReference> detectedContexts = this.getContextsForSeed(seed.getName()); contexts_currentIteration.addAll(detectedContexts); extractedContextsFromSeeds.addAll(detectedContexts); seed.setTextualReferences(detectedContexts); processedSeeds.put(seed.getName(), seed); addedSeeds.add(seed.getName()); //info(log, "Extracted contexts of seed."); debug(log, "Extracted contexts of seed."); // 2. generate patterns if (getExecution().getBootstrapStrategy() == BootstrapStrategy.separate) { debug(log, "--- Entering Pattern Induction phase ---");//info List<InfolisPattern> candidates = inducePatterns(detectedContexts); debug(log, "Pattern Induction completed.");//info debug(log, "--- Entering Pattern Selection phase ---");//info //newPatterns.addAll(ranker.getBestPatterns(candidates, detectedContexts, processedPatterns, new HashSet<Entity>())); newPatterns.addAll(ranker.getBestPatterns(candidates, processedPatterns, new HashSet<Entity>())); } } // mergeNew and mergeCurrent have different contexts_currentIteration at this point, with previously processed seeds filtered for mergeNew but not for mergeCurrent if (getExecution().getBootstrapStrategy() == BootstrapStrategy.mergeCurrent || getExecution().getBootstrapStrategy() == BootstrapStrategy.mergeNew) { debug(log, "--- Entering Pattern Induction phase ---");//info List<InfolisPattern> candidates = inducePatterns(contexts_currentIteration); debug(log, "Pattern Induction completed.");//info debug(log, "--- Entering Pattern Selection phase ---");//info //newPatterns.addAll(ranker.getBestPatterns(candidates, contexts_currentIteration, processedPatterns, new HashSet<Entity>())); newPatterns.addAll(ranker.getBestPatterns(candidates, processedPatterns, new HashSet<Entity>())); } if (getExecution().getBootstrapStrategy() == BootstrapStrategy.mergeAll) { debug(log, "--- Entering Pattern Induction phase ---");//info List<InfolisPattern> candidates = inducePatterns(extractedContextsFromSeeds); debug(log, "Pattern Induction completed.");//info debug(log, "--- Entering Pattern Selection phase ---");//info //newPatterns.addAll(ranker.getBestPatterns(candidates, extractedContextsFromSeeds, processedPatterns, new HashSet<Entity>())); newPatterns.addAll(ranker.getBestPatterns(candidates, processedPatterns, new HashSet<Entity>())); } // POST the patterns getOutputDataStoreClient().post(InfolisPattern.class, newPatterns); for (InfolisPattern pattern : newPatterns) { processedPatterns.add(pattern.getPatternRegex()); } debug(log, "Pattern Selection completed.");//info debug(log, "Selected " + newPatterns.size() + " new patterns"); debug(log, "--- Entering Instance Extraction phase ---");//info // 3. search for patterns in corpus if (!newPatterns.isEmpty()) { //List<String> res = this.getContextsForPatterns(newPatterns); List<String> res = this.getContextsForPatterns(newPatterns, this.getOutputDataStoreClient()); for (TextualReference studyContext : getInputDataStoreClient().get(TextualReference.class, res)) { extractedContextsFromPatterns.add(studyContext); Entity entity = new Entity(studyContext.getReference()); entity.setTags(studyContext.getTags()); entity.setEntityType(EntityType.citedData); // an entity is just as reliable as the textual reference it was extracted from try { entity.setEntityReliability(studyContext.getReferenceReliability()); } catch (NullPointerException npe) { log.debug("Cannot set reliability of entity: textual reference's reliability score is null"); } newSeedsIteration.add(entity); newSeedTermsIteration.add(studyContext.getReference()); } } debug(log, String.format("Found %s seeds in current iteration (%s occurrences): %s)", newSeedTermsIteration.size(), newSeedsIteration.size(), newSeedTermsIteration)); numIter++; if (newSeedTermsIteration.isEmpty() || processedSeeds.keySet().containsAll(newSeedTermsIteration)) { debug(log, "No new seeds found in iteration, returning."); // extractedContexts contains all contexts resulting from searching a seed term // extractedContexts_patterns contains all contexts resulting from searching for the induced patterns // thus, return the latter here debug(log, "Final iteration: " + numIter);//info debug(log, "Final list of instances: "); for (Entity i : processedSeeds.values()) { debug(log, i.getName() + "=" + i.getEntityReliability()); } return extractedContextsFromPatterns; } } debug(log, "Maximum number of iterations reached, returning."); debug(log, "Final iteration: " + numIter);//info debug(log, "Final list of instances: "); for (Entity i : processedSeeds.values()) { debug(log, i.getName() + "=" + i.getEntityReliability()); } return extractedContextsFromPatterns; } private List<InfolisPattern> inducePatterns(Collection<TextualReference> contexts) { List<InfolisPattern> patterns = new ArrayList<>(); int n = 0; double threshold = getExecution().getReliabilityThreshold(); for (TextualReference context : contexts) { n++; log.debug("Inducing patterns for context " + n + " of " + contexts.size()); Double[] thresholds = {threshold, threshold - 0.02, threshold - 0.04, threshold - 0.06, threshold - 0.08, threshold - 0.02, threshold - 0.04, threshold - 0.06, threshold - 0.08}; patterns.addAll(getPatternInducer().induce(context, thresholds)); } return patterns; } class FrequencyPatternRanker extends Bootstrapping.PatternRanker { protected Set<InfolisPattern> getBestPatterns(List<InfolisPattern> candidates, List<String> processedRegex, Set<Entity> reliableSeeds) { Set<InfolisPattern> patterns = new HashSet<>(); Set<String> processedRegex_iteration = new HashSet<>(); // constraint for patterns: at least one component not be a stopword // prevent induction of patterns less general than already known patterns: // check whether pattern is known before continuing int acceptedPatterns = 0; for (int candidateNo = 0; candidateNo < candidates.size(); candidateNo++) { InfolisPattern candidate = candidates.get(candidateNo); int candidateNoInCurrentContext = candidateNo % getPatternInducer().getPatternsPerContext(); int remainingCandidatesForContext = getPatternInducer().getPatternsPerContext() - candidateNoInCurrentContext; log.debug(String.format("Processing candidate no. %s (context no. %s)", String.valueOf(candidateNo), String.valueOf( ((candidateNo - remainingCandidatesForContext) / getPatternInducer().getPatternsPerContext())))); log.debug("Checking if pattern is relevant: " + candidate.getPatternRegex()); if (processedRegex.contains(candidate.getPatternRegex()) || processedRegex_iteration.contains(candidate.getPatternRegex())) { log.debug("Pattern already known, continuing with candidates for next context."); // skip less general pattern candidates of the same context candidateNo += remainingCandidatesForContext -1; continue; } boolean nonStopwordPresent = false; double relevance = 0; try { relevance = computeRelevance(candidate, candidates); } catch (NullPointerException npe) {} for (String word : candidate.getWords()) { if (!RegexUtils.isStopword(word)) { nonStopwordPresent = true; continue; } } if (!nonStopwordPresent) { log.debug("Pattern rejected - stopwords only"); if (acceptedPatterns == 1) { candidateNo += remainingCandidatesForContext -1; acceptedPatterns = 0; continue; } } else if (nonStopwordPresent && isRelevant(candidate, relevance)) { candidate.setTags(getExecution().getTags()); candidate.setPatternReliability(relevance); patterns.add(candidate); processedRegex_iteration.add(candidate.getPatternRegex()); log.debug("Pattern accepted"); if (acceptedPatterns == 1) { candidateNo += remainingCandidatesForContext -1; acceptedPatterns = 0; continue; } // omit only less general patterns, do not limit generation of patterns on same level else { acceptedPatterns ++; } } else { log.debug("Pattern rejected - not relevant"); if (acceptedPatterns == 1) { candidateNo += remainingCandidatesForContext -1; acceptedPatterns = 0; continue; } } } return patterns; } private double computeRelevance(int count, int size, int minCount) { double score = 0.0; if (count >= minCount) { //TODO make configurable int norm = 1; score = ((double) count / size) * norm; } log.debug("Relevance score: " + score); log.debug("Occurrences: " + count); log.debug("Size: " + size); return score; } private double computeRelevance(InfolisPattern pattern, List<InfolisPattern> candidateList) { //TODO make configurable int minCount = 0; int count = 0; for (InfolisPattern candidateP : candidateList) { if (pattern.getPatternRegex().equals(candidateP.getPatternRegex())) count++; } return computeRelevance(count, candidateList.size() / getPatternInducer().getPatternsPerContext(), minCount); } private boolean isRelevant(InfolisPattern pattern, double score) { return (score >= pattern.getThreshold()); } } }