/* * To change this license header, choose License Headers in Project Properties. * To change this template file, choose Tools | Templates * and open the template in the editor. */ package io.github.infolis.algorithm; import io.github.infolis.datastore.DataStoreClient; import io.github.infolis.datastore.FileResolver; import io.github.infolis.model.Execution; import io.github.infolis.model.ExecutionStatus; import io.github.infolis.model.TextualReference; import io.github.infolis.model.entity.InfolisPattern; import io.github.infolis.util.RegexUtils; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * * @author kata * */ public class InfolisPatternSearcher extends BaseAlgorithm { public InfolisPatternSearcher(DataStoreClient inputDataStoreClient, DataStoreClient outputDataStoreClient, FileResolver inputFileResolver, FileResolver outputFileResolver) { super(inputDataStoreClient, outputDataStoreClient, inputFileResolver, outputFileResolver); } private static final Logger log = LoggerFactory.getLogger(InfolisPatternSearcher.class); private List<String> getTextRefsForLuceneQueries( List<String> patternUris, DataStoreClient client) { Execution exec = getExecution().createSubExecution(LuceneSearcher.class); exec.setIndexDirectory(getExecution().getIndexDirectory()); exec.setPhraseSlop(getExecution().getPhraseSlop()); exec.setAllowLeadingWildcards(getExecution().isAllowLeadingWildcards()); exec.setMaxClauseCount(getExecution().getMaxClauseCount()); exec.setPatterns(patternUris); exec.setInputFiles(getExecution().getInputFiles()); // LuceneSearcher posts textual references but they are temporary exec.instantiateAlgorithm(this.getInputDataStoreClient(), client, this.getInputFileResolver(), this.getOutputFileResolver()).run(); return exec.getTextualReferences(); } private static String getReference(String text, String regex) { Pattern p = Pattern.compile(regex); Matcher m = p.matcher(text); if (m.find()) return m.group(1); return ""; } private static boolean satisfiesUpperCaseConstraint(String string) { // do not treat -RRB-, -LRB- and *NL* tokens as uppercase words return !(string.replaceAll("-RRB-", "").replaceAll("-LRB-", "") .replaceAll("\\*NL\\*", "").toLowerCase() .equals(string.replaceAll("-RRB-", "") .replaceAll("-LRB-", "") .replaceAll("\\*NL\\*", ""))); } /** * Retrieves contexts for InfolisPatterns using LuceneSearcher and validates them using * the patterns' regular expressions. Validation is necessary because * <ul> * <li>lucene queries in the InfolisPatterns may have wildcards for words that must match * a regular expression, e.g. consist of digits only</li> * <li>finding named entities consisting of more than one word is enabled using lucene's * phraseSlop parameter. This fuzzy matching may cause text snippets to match that are * not supposed to match</li> * <li>lucene's Highlighters perform approximate matching of queries and text. Highlighted * snippets may not always truely contain a match</li> * </ul> * @param patterns * @return */ private List<String> getContextsForPatterns(List<String> patternUris) { int counter = 0, size = patternUris.size(); log.debug("number of patterns to search for: " + size); DataStoreClient tempClient = this.getTempDataStoreClient(); // for all patterns, retrieve documents in which they occur (using lucene) for (String patternUri : patternUris) { tempClient.put(InfolisPattern.class, getOutputDataStoreClient() .get(InfolisPattern.class, patternUri), patternUri); } List<String> textRefsForPatterns = getTextRefsForLuceneQueries( patternUris, tempClient); List<String> validatedTextualReferences = new ArrayList<>(); // open each reference once and validate with the corresponding regular expression for (String textRefUri : textRefsForPatterns) { TextualReference textRef = tempClient.get(TextualReference.class, textRefUri); InfolisPattern pattern = tempClient.get(InfolisPattern.class, textRef.getPattern()); log.debug("pattern: " + pattern.getPatternRegex()); log.debug("candidate textual reference: " + textRef.getLeftText()); String referencedTerm = getReference(textRef.getLeftText(), pattern.getPatternRegex()); // textual reference does not match regex if ("".equals(referencedTerm)) { log.debug("Textual reference does not match regex: " + pattern.getPatternRegex()); log.debug("Textual reference: " + textRef.getLeftText()); continue; } if ((getExecution().isUpperCaseConstraint() && !satisfiesUpperCaseConstraint(referencedTerm))) { log.debug("Referenced term does not satisfy uppercase-constraint \"" + referencedTerm + "\""); continue; } // if referencedTerm contains no characters or is a stopword: ignore // TODO: not accurate - include accents etc in match... \p{M}? if (referencedTerm.matches("\\P{L}+") || RegexUtils.isStopword(referencedTerm)) { log.debug("Invalid referenced term \"" + referencedTerm + "\""); continue; } TextualReference validatedTextRef = LuceneSearcher.getContext(referencedTerm, textRef.getLeftText(), textRef.getTextFile(), pattern.getUri(), textRef.getMentionsReference(), textRef.getTags()); // a textual reference is just as reliable as the pattern that was used to extract it try { validatedTextRef.setReferenceReliability(pattern.getPatternReliability()); } catch (NullPointerException npe) { log.debug("Cannot set reliability of textual reference: pattern's reliability score is null"); } getOutputDataStoreClient().post(TextualReference.class, validatedTextRef); validatedTextualReferences.add(validatedTextRef.getUri()); log.debug("added textual reference " + validatedTextRef); counter++; updateProgress(counter, size); } tempClient.clear(); return validatedTextualReferences; } Execution createIndex() throws IOException { Execution execution = getExecution().createSubExecution(Indexer.class); execution.setInputFiles(getExecution().getInputFiles()); getOutputDataStoreClient().post(Execution.class, execution); execution.instantiateAlgorithm(this).run(); return execution; } @Override public void execute() throws IOException { Execution tagExec = getExecution().createSubExecution(TagSearcher.class); tagExec.getInfolisFileTags().addAll(getExecution().getInfolisFileTags()); tagExec.getInfolisPatternTags().addAll(getExecution().getInfolisPatternTags()); tagExec.instantiateAlgorithm(this).run(); getExecution().getPatterns().addAll(tagExec.getPatterns()); getExecution().getInputFiles().addAll(tagExec.getInputFiles()); if (null == getExecution().getIndexDirectory() || getExecution().getIndexDirectory().isEmpty()) { debug(log, "No index directory specified, indexing on demand"); Execution indexerExecution = createIndex(); getExecution().setIndexDirectory(indexerExecution.getOutputDirectory()); } log.debug("started"); getExecution().setTextualReferences(getContextsForPatterns(getExecution().getPatterns())); log.debug("No. contexts found: {}", getExecution().getTextualReferences().size()); getExecution().setStatus(ExecutionStatus.FINISHED); } @Override public void validate() { Execution exec = this.getExecution(); if ((null == exec.getInputFiles() || exec.getInputFiles().isEmpty()) && (null == exec.getInfolisFileTags() || exec.getInfolisFileTags().isEmpty())) { throw new IllegalArgumentException("Must set at least one inputFile!"); } if ((null == exec.getPatterns() || exec.getPatterns().isEmpty()) && (null == exec.getInfolisPatternTags() || exec.getInfolisPatternTags().isEmpty())) { throw new IllegalArgumentException("No patterns given."); } } }