/* * To change this license header, choose License Headers in Project Properties. * To change this template file, choose Tools | Templates * and open the template in the editor. */ package io.github.infolis.model.entity; import io.github.infolis.infolink.patternLearner.Reliability; import io.github.infolis.util.RegexUtils; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.util.HashSet; import java.util.Map; import java.util.Set; import org.apache.lucene.queryparser.classic.ParseException; import org.slf4j.LoggerFactory; import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import com.fasterxml.jackson.annotation.JsonInclude; import io.github.infolis.model.BaseModel; import io.github.infolis.model.TextualReference; import java.util.Collection; import java.util.HashMap; /** * * @author kata * @author domi * @author kba */ @JsonIgnoreProperties(ignoreUnknown = true) @JsonInclude(JsonInclude.Include.NON_NULL) public class InfolisPattern extends BaseModel { private static final org.slf4j.Logger log = LoggerFactory.getLogger(InfolisPattern.class); // TODO can this be final? private String patternRegex; private String luceneQuery; private Set<String> words = new HashSet<>(); private double threshold; private double patternReliability; private Map<String, Double> associations = new HashMap<>(); //TODO: change to URI -> string? private Collection<TextualReference> textualReferences; public InfolisPattern(String patternRegex, String luceneQuery, Set<String> words, double threshold) { this.setLuceneQuery(luceneQuery); this.setPatternRegex(patternRegex); this.setWords(words); this.setThreshold(threshold); } public InfolisPattern(String luceneQuery) { this.luceneQuery = luceneQuery; } public InfolisPattern() { } public void setTextualReferences(Collection<TextualReference> textualReferences) { this.textualReferences = textualReferences; } public Collection<TextualReference> getTextualReferences() { return this.textualReferences; } /** * @return the patternRegex */ public String getPatternRegex() { return patternRegex; } /** * @return the luceneQuery */ public String getLuceneQuery() { return luceneQuery; } /** * @return the words */ public Set<String> getWords() { return words; } public void setPatternRegex(String patternRegex) { this.patternRegex = patternRegex; } public void setLuceneQuery(String luceneQuery) { this.luceneQuery = luceneQuery; } /** * * @param words the words to set */ public void setWords(Set<String> words) { this.words = words; } /** * Generates a regular expression to capture given <emph>title</emph> as * dataset title along with any number specifications. * * @param title name of the dataset to find inside the regex * @return a regular expression for finding the given title along with any * number specifications */ private static String constructTitleVersionRegex(String title) { // at least one whitespace required... return "(" + title + ")" + "\\S*?" + "\\s+" + "\\S*?" + "\\s*" + "\\S*?" + "\\s*" + "\\S*?" + "\\s*" + "\\S*?" + "\\s*" + "\\S*?" + "\\s*" + "((" + RegexUtils.yearRegex + "\\s*((-)|(–))\\s*\\d\\d(\\d\\d)?" + ")|(" + RegexUtils.yearRegex + ")|(\\d+[.,-/\\\\]?\\d*))"; } /** * Generates regular expressions for finding dataset names listed in * <emph>filename</emph> * with titles and number specifications. * * @param filename Name of the file containing a list of dataset names (one * name per line) * @return A Set of Patterns */ public static Set<InfolisPattern> constructPatterns(String filename) { Set<InfolisPattern> patternSet = new HashSet<>(); try { File f = new File(filename); InputStreamReader isr = new InputStreamReader(new FileInputStream(f), "UTF8"); BufferedReader reader = new BufferedReader(isr); String studyTitle; while ((studyTitle = reader.readLine()) != null) { if (!studyTitle.matches("\\s*")) { InfolisPattern p = new InfolisPattern(constructTitleVersionRegex(studyTitle)); patternSet.add(p); } } reader.close(); } catch (IOException ioe) { ioe.printStackTrace(); } return patternSet; } public boolean isReliable(int dataSize, Set<Entity> reliableInstances, Reliability r) throws IOException, ParseException { this.patternReliability = r.computeReliability(dataSize, reliableInstances, this); if (this.getPatternReliability() >= this.getThreshold()) { return true; } else { return false; } } /** * @return the threshold */ public double getThreshold() { return threshold; } /** * @param threshold the threshold to set */ public void setThreshold(double threshold) { this.threshold = threshold; } /** * @return the reliability */ public double getPatternReliability() { return this.patternReliability; } /** * @param reliability the reliability to set */ public void setPatternReliability(double reliability) { this.patternReliability = reliability; } /** * @return the associations */ public Map<String, Double> getAssociations() { return associations; } /** * @param associations the associations to set */ public void setAssociations(Map<String, Double> associations) { this.associations = associations; } public boolean addAssociation(String entityName, double score) { if (this.getAssociations().containsKey(entityName)) { log.debug("association between entity " + this.getPatternRegex() + " and entity " + entityName + " already known, overwriting previously saved score."); } return (this.getAssociations().put(entityName, score) == null); } }