package io.github.infolis.infolink.querying;
import io.github.infolis.algorithm.SearchResultLinker;
import io.github.infolis.model.TextualReference;
import io.github.infolis.model.entity.Entity;
import io.github.infolis.model.entity.EntityLink;
import io.github.infolis.model.entity.SearchResult;
import io.github.infolis.util.InformationExtractor;
import io.github.infolis.util.RegexUtils;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Scores search results against the numeric information (years, abbreviated years,
 * ranges, enumerations, plain numbers) found in an entity or textual reference and
 * derives the temporal relations to attach to the resulting candidate.
 *
 * @author kata
 */
public class SearchResultScorer {

    private static final Logger log = LoggerFactory.getLogger(SearchResultScorer.class);

    /** Score assigned when both sides carry numeric information but none of it matches. */
    private static final double SCORE_NO_MATCH = 0.0;
    /** Score assigned when the search result carries no numeric information. */
    private static final double SCORE_CANDIDATE_WITHOUT_NUMBERS = 0.5;
    /** Score assigned when only the search result carries numeric information. */
    private static final double SCORE_REFERENCE_WITHOUT_NUMBERS = 0.7;
    /** Score assigned as soon as one pair of numeric information matches. */
    private static final double SCORE_MATCH = 1.0;

    // The patterns are compiled once; they must match the ENTIRE input (Matcher#matches).
    private static final Pattern YEAR_PATTERN =
            Pattern.compile(".*?" + RegexUtils.yearRegex + ".*?");
    /** Group 1 holds the two digits of the abbreviated year, without a leading apostrophe. */
    private static final Pattern ABBREVIATED_YEAR_PATTERN =
            Pattern.compile("[\\D]*?" + "'?(\\d\\d)" + "[^\\d\\.]*?");
    private static final Pattern ENUM_PATTERN =
            Pattern.compile(".*?\\d\\s*" + RegexUtils.enumRegex + "\\s*\\d.*?");
    private static final Pattern RANGE_PATTERN =
            Pattern.compile(".*?\\d\\s*" + RegexUtils.rangeRegex + "\\s*\\d.*?");

    /**
     * Computes score and temporal relations based on correspondence of numbers in an entity and
     * a search result. Considers ranges, abbreviated years and exact matches.
     *
     * @param entity          entity whose numeric information is compared
     * @param targetCandidate search result to score; if it has no numeric information yet, the
     *                        numbers extracted from its first title are stored on it
     * @return candidate wrapping {@code targetCandidate} with score and relations set
     */
    public static SearchResultLinker.CandidateTargetEntity computeScoreBasedOnNumbers(
            Entity entity, SearchResult targetCandidate) {
        return scoreByNumericInfo(entity.getNumericInfo(), targetCandidate);
    }

    /**
     * Computes score and temporal relations based on correspondence of numbers in a textual
     * reference and a search result. Considers ranges, abbreviated years and exact matches.
     *
     * @param reference       textual reference whose numeric information is extracted and compared
     * @param targetCandidate search result to score; if it has no numeric information yet, the
     *                        numbers extracted from its first title are stored on it
     * @return candidate wrapping {@code targetCandidate} with score and relations set
     */
    public static SearchResultLinker.CandidateTargetEntity computeScoreBasedOnNumbers(
            TextualReference reference, SearchResult targetCandidate) {
        return scoreByNumericInfo(InformationExtractor.extractNumericInfo(reference), targetCandidate);
    }

    /**
     * Shared implementation of both {@code computeScoreBasedOnNumbers} overloads.
     *
     * <ul>
     * <li>candidate without numbers: score 0.5, {@code same_as_temporal} if the reference has no
     * numbers either, {@code part_of_temporal} otherwise</li>
     * <li>reference without numbers: score 0.7, {@code superset_of_temporal}</li>
     * <li>first matching pair: score 1.0 with the relations of that pair</li>
     * <li>no matching pair: score 0.0, no relations</li>
     * </ul>
     *
     * @param referenceNumInfo numeric information of the reference side; {@code null} is treated
     *                         like an empty list
     * @param targetCandidate  search result to score
     * @return the scored candidate, never {@code null}
     */
    private static SearchResultLinker.CandidateTargetEntity scoreByNumericInfo(
            List<String> referenceNumInfo, SearchResult targetCandidate) {
        SearchResultLinker.CandidateTargetEntity candidate = new SearchResultLinker.CandidateTargetEntity();
        candidate.setSearchResult(targetCandidate);
        boolean referenceHasNumbers = referenceNumInfo != null && !referenceNumInfo.isEmpty();

        List<String> candidateNumInfo = targetCandidate.getNumericInformation();
        // Lazily derive the candidate's numeric information from its first title. A candidate
        // without any title simply stays without numeric information instead of failing.
        if ((candidateNumInfo == null || candidateNumInfo.isEmpty())
                && targetCandidate.getTitles() != null && !targetCandidate.getTitles().isEmpty()) {
            targetCandidate.setNumericInformation(
                    InformationExtractor.extractNumbers(targetCandidate.getTitles().get(0)));
            candidateNumInfo = targetCandidate.getNumericInformation();
        }

        if (candidateNumInfo == null || candidateNumInfo.isEmpty()) {
            candidate.setScore(SCORE_CANDIDATE_WITHOUT_NUMBERS);
            candidate.addEntityRelation(referenceHasNumbers
                    ? EntityLink.EntityRelation.part_of_temporal
                    : EntityLink.EntityRelation.same_as_temporal);
            return candidate;
        }
        if (!referenceHasNumbers) {
            candidate.setScore(SCORE_REFERENCE_WITHOUT_NUMBERS);
            candidate.addEntityRelation(EntityLink.EntityRelation.superset_of_temporal);
            return candidate;
        }

        // one matching pair is sufficient
        for (String referenceInfo : referenceNumInfo) {
            for (String candidateInfo : candidateNumInfo) {
                Set<EntityLink.EntityRelation> entityRelations = numericInfoMatches(referenceInfo, candidateInfo);
                if (null != entityRelations) {
                    candidate.setScore(SCORE_MATCH);
                    candidate.setEntityRelations(entityRelations);
                    return candidate;
                }
            }
        }
        candidate.setScore(SCORE_NO_MATCH);
        return candidate;
    }

    /** Returns true if the whole string matches "anything, {@code RegexUtils.yearRegex}, anything". */
    private static boolean containsYear(String number) {
        return YEAR_PATTERN.matcher(number).matches();
    }

    /**
     * Returns true if the string consists of optional non-digits, an optional apostrophe, exactly
     * two digits and optional trailing characters that are neither digits nor dots.
     */
    private static boolean containsAbbreviatedYear(String number) {
        return ABBREVIATED_YEAR_PATTERN.matcher(number).matches();
    }

    /** Returns true if the string contains two digits separated by {@code RegexUtils.enumRegex}. */
    private static boolean containsEnum(String number) {
        return ENUM_PATTERN.matcher(number).matches();
    }

    /** Returns true if the string contains two digits separated by {@code RegexUtils.rangeRegex}. */
    private static boolean containsRange(String number) {
        return RANGE_PATTERN.matcher(number).matches();
    }

    /**
     * Expands an abbreviated year to its two full-year readings.
     *
     * @param extractedNumber number as extracted from a text
     * @return {@code {"19dd", "20dd"}} if the input is an abbreviated year with digits
     *         {@code dd}; otherwise the input itself, twice
     */
    private static String[] getFullYearVariants(String extractedNumber) {
        java.util.regex.Matcher abbreviated = ABBREVIATED_YEAR_PATTERN.matcher(extractedNumber);
        if (abbreviated.matches()) {
            // Prefix only the two digits: prefixing the raw input would turn e.g. "'90" into
            // "19'90", which can never equal or parse as a year.
            String digits = abbreviated.group(1);
            return new String[]{"19" + digits, "20" + digits};
        }
        return new String[]{extractedNumber, extractedNumber};
    }

    /**
     * Matches the parts of an enumeration against {@code info2}; the first matching part wins.
     * Because a matching part is only one element of the enumeration,
     * {@code same_as_temporal} is replaced by {@code part_of_temporal}.
     *
     * @param enumInfo strings that are each split on {@code RegexUtils.enumRegex}
     * @param info2    numeric information every part is compared with
     * @return relations of the first matching part, or {@code null} if no part matches
     */
    // TODO if not all enumerated values have a match, superset relation should be added...
    private static Set<EntityLink.EntityRelation> enumMatches(List<String> enumInfo, String info2) {
        for (String info : enumInfo) {
            for (String enumeratedNumber : info.split(RegexUtils.enumRegex)) {
                log.debug("computing score for enum part \"{}\"", enumeratedNumber);
                Set<EntityLink.EntityRelation> rels = numericInfoMatches(enumeratedNumber, info2);
                if (null != rels) {
                    Set<EntityLink.EntityRelation> relations = new HashSet<>(rels);
                    // if it is part of an enumeration, it's just a part
                    relations.remove(EntityLink.EntityRelation.same_as_temporal);
                    relations.add(EntityLink.EntityRelation.part_of_temporal);
                    return relations;
                }
            }
        }
        return null;
    }

    /**
     * Compares a range (the first two entries of {@code numericInfo1}) with the second numeric
     * information, which is either a range itself or a single value (its first entry).
     * Range boundaries may be abbreviated years; both the "19" and the "20" reading are tried.
     *
     * @param numericInfo1                  numbers of the range; at least two are required
     * @param numericInfo2                  numbers of the other numeric information
     * @param containsRange_numericInfo2    whether the other information is a range as well
     * @param containsYear_numericInfo2     whether the other information contains a year
     * @param containsAbbrYear_numericInfo2 whether the other information is an abbreviated year
     * @param invert                        whether the arguments were swapped by the caller, so
     *                                      that the resulting relation has to be inverted
     * @return the relations found, or {@code null} if the range does not match or either list
     *         holds too few numbers
     */
    private static Set<EntityLink.EntityRelation> rangeMatches(
            List<String> numericInfo1, List<String> numericInfo2,
            boolean containsRange_numericInfo2, boolean containsYear_numericInfo2,
            boolean containsAbbrYear_numericInfo2, boolean invert) {
        // not enough numbers to form a range / nothing to compare the range with
        if (numericInfo1.size() < 2 || numericInfo2.isEmpty()) return null;

        // ranges may contain abbreviated years
        String[] lower1 = getFullYearVariants(numericInfo1.get(0));
        String[] upper1 = getFullYearVariants(numericInfo1.get(1));
        List<String> range1In1900s = Arrays.asList(lower1[0], upper1[0]);
        List<String> range1In2000s = Arrays.asList(lower1[1], upper1[1]);
        Set<EntityLink.EntityRelation> relations = new HashSet<>();

        if (containsRange_numericInfo2) {
            log.debug("2nd argument also contains range, computing overlap");
            if (numericInfo2.size() < 2) return null;
            // ranges may contain abbreviated years
            String[] lower2 = getFullYearVariants(numericInfo2.get(0));
            String[] upper2 = getFullYearVariants(numericInfo2.get(1));
            Set<EntityLink.EntityRelation> relations1 =
                    overlap(range1In1900s, Arrays.asList(lower2[0], upper2[0]), invert);
            Set<EntityLink.EntityRelation> relations2 =
                    overlap(range1In2000s, Arrays.asList(lower2[1], upper2[1]), invert);
            if (null != relations1) relations.addAll(relations1);
            if (null != relations2) relations.addAll(relations2);
            return relations.isEmpty() ? null : relations;
        }

        String value = numericInfo2.get(0);
        boolean insideRange = false;
        // a year or a plain number must be inside of the range as it is
        if (containsYear_numericInfo2 || !containsAbbrYear_numericInfo2) {
            insideRange = inRange(range1In1900s, value) || inRange(range1In2000s, value);
        }
        // an abbreviated year must be inside of the range in one of its full readings
        if (!insideRange && containsAbbrYear_numericInfo2) {
            insideRange = inRange(range1In1900s, "19" + value)
                    || inRange(range1In2000s, "19" + value)
                    || inRange(range1In1900s, "20" + value)
                    || inRange(range1In2000s, "20" + value);
        }
        if (!insideRange) return null;

        relations.add(invert
                ? EntityLink.EntityRelation.part_of_temporal
                : EntityLink.EntityRelation.superset_of_temporal);
        return relations;
    }

    /**
     * Checks whether any entry of {@code numericInfo1} equals any entry of {@code numericInfo2}.
     * If the second information is an abbreviated year, its entries are expanded to their
     * "19"/"20" readings before comparing.
     *
     * @param numericInfo1                  years to look for
     * @param numericInfo2                  values to compare with
     * @param containsYear_numericInfo2     not used; kept for interface compatibility
     * @param containsAbbrYear_numericInfo2 whether {@code numericInfo2} holds abbreviated years
     * @return true on the first equal pair, false if there is none
     */
    public static boolean yearsMatch(List<String> numericInfo1, List<String> numericInfo2, boolean containsYear_numericInfo2, boolean containsAbbrYear_numericInfo2) {
        for (String year : numericInfo1) {
            for (String other : numericInfo2) {
                // candidate numeric info contains a year as well or is some number,
                // unless it is flagged as abbreviated year
                String[] readings = containsAbbrYear_numericInfo2
                        ? getFullYearVariants(other)
                        : new String[]{other};
                for (String year2 : readings) {
                    if (year.equals(year2)) {
                        log.debug("Years match: {} <-> {}", year, year2);
                        return true;
                    }
                    log.debug("No year match: {} <-> {}", year, year2);
                }
            }
        }
        return false;
    }

    /**
     * Checks whether an abbreviated year matches the second numeric information.
     * Two abbreviated years are compared as strings. Otherwise both sides are compared as
     * floats, because 90 may be an abbreviated year or a number and 90 == 90.0; values that
     * cannot be parsed as float are skipped.
     *
     * @param numericInfo1                  abbreviated years
     * @param numericInfo2                  values to compare with
     * @param containsAbbrYear_numericInfo2 whether {@code numericInfo2} holds abbreviated years
     * @return true on the first matching pair, false if there is none
     */
    public static boolean abbreviatedYearsMatch(List<String> numericInfo1, List<String> numericInfo2, boolean containsAbbrYear_numericInfo2) {
        // abbreviated year must match abbreviated year
        if (containsAbbrYear_numericInfo2) {
            for (String abbreviatedYear : numericInfo1) {
                for (String abbreviatedYear2 : numericInfo2) {
                    if (abbreviatedYear.equals(abbreviatedYear2)) return true;
                }
            }
            return false;
        }
        for (String info2 : numericInfo2) {
            Float parsed2 = parseFloatOrNull(info2);
            if (parsed2 == null) continue;
            float number2 = parsed2;
            for (String abbreviatedYear : numericInfo1) {
                Float parsed1 = parseFloatOrNull(abbreviatedYear);
                if (parsed1 == null) continue;
                float number1 = parsed1;
                if (Math.abs(number1 - number2) < 0.00001) {
                    log.debug("Equal: {} <-> {}", number1, number2);
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Checks whether any entry of the first list is numerically equal to any entry of the
     * second list. Values that cannot be parsed as float are skipped.
     */
    private static boolean floatsMatch(List<String> numericInfo1, List<String> numericInfo2) {
        for (String info1 : numericInfo1) {
            Float parsed1 = parseFloatOrNull(info1);
            if (parsed1 == null) continue;
            float number1 = parsed1;
            for (String info2 : numericInfo2) {
                Float parsed2 = parseFloatOrNull(info2);
                if (parsed2 != null && number1 == parsed2.floatValue()) return true;
            }
        }
        return false;
    }

    /**
     * Parses a float without throwing.
     *
     * @return the parsed value, or {@code null} if {@code value} is {@code null} or not a float
     */
    private static Float parseFloatOrNull(String value) {
        if (value == null) return null;
        try {
            return Float.parseFloat(value);
        }
        catch (NumberFormatException nfe) {
            log.debug(nfe.getMessage());
            return null;
        }
    }

    /**
     * Compares two pieces of numeric information and returns the temporal relations between
     * them. The checks run in this order and the first hit is returned: enumeration (either
     * side), range (first side, then second side with inverted relation), year (either side),
     * abbreviated year (either side), plain number.
     *
     * @param numericInfo numeric information of the reference side
     * @param string      numeric information of the candidate side
     * @return the relations of the first successful check; {@code superset_of_temporal} if
     *         either argument is {@code null}; {@code null} if nothing matches
     */
    protected static Set<EntityLink.EntityRelation> numericInfoMatches(String numericInfo, String string) {
        // for study references without any specified years / numbers, accept all candidates
        // TODO: match to higher order entity according to dataset ontology (study, not dataset)
        Set<EntityLink.EntityRelation> relations = new HashSet<>();
        if (numericInfo == null || string == null) {
            relations.add(EntityLink.EntityRelation.superset_of_temporal);
            return relations;
        }
        List<String> numericInfo1 = InformationExtractor.extractNumbers(numericInfo);
        List<String> numericInfo2 = InformationExtractor.extractNumbers(string);
        boolean containsRange_numericInfo1 = containsRange(numericInfo);
        boolean containsRange_numericInfo2 = containsRange(string);
        boolean containsEnum_numericInfo1 = containsEnum(numericInfo);
        boolean containsEnum_numericInfo2 = containsEnum(string);
        boolean containsYear_numericInfo1 = containsYear(numericInfo);
        boolean containsYear_numericInfo2 = containsYear(string);
        boolean containsAbbrYear_numericInfo1 = containsAbbreviatedYear(numericInfo);
        boolean containsAbbrYear_numericInfo2 = containsAbbreviatedYear(string);

        if (containsEnum_numericInfo1) {
            log.debug("Enum match for: {} <-> {}?", numericInfo, string);
            return enumMatches(numericInfo1, string);
        }
        if (containsEnum_numericInfo2) {
            log.debug("Enum match for: {} <-> {}?", string, numericInfo);
            return enumMatches(numericInfo2, numericInfo);
        }
        // extracted numeric information is a range specification
        if (containsRange_numericInfo1) {
            log.debug("Range match for: {} <-> {}?", numericInfo, string);
            log.debug("first argument is range");
            Set<EntityLink.EntityRelation> rels = rangeMatches(numericInfo1, numericInfo2,
                    containsRange_numericInfo2, containsYear_numericInfo2,
                    containsAbbrYear_numericInfo2, false);
            // continue if range does not match or is not a valid range - maybe parts do
            if (null != rels && !rels.isEmpty()) return rels;
        }
        if (containsRange_numericInfo2) {
            log.debug("Range match for: {} <-> {}?", numericInfo, string);
            log.debug("second argument is range");
            Set<EntityLink.EntityRelation> rels = rangeMatches(numericInfo2, numericInfo1,
                    containsRange_numericInfo1, containsYear_numericInfo1,
                    containsAbbrYear_numericInfo1, true);
            // continue if range does not match or is not a valid range - maybe parts do
            if (null != rels && !rels.isEmpty()) return rels;
        }
        // extracted numeric info contains a year
        if (containsYear_numericInfo1) {
            log.debug("Year match for: {} <-> {}?", numericInfo, string);
            if (yearsMatch(numericInfo1, numericInfo2,
                    containsYear_numericInfo2, containsAbbrYear_numericInfo2)) {
                relations.add(EntityLink.EntityRelation.same_as_temporal);
                return relations;
            }
        }
        // candidate numeric info contains a year
        if (containsYear_numericInfo2) {
            log.debug("Year match for: {} <-> {}?", string, numericInfo);
            if (yearsMatch(numericInfo2, numericInfo1, containsYear_numericInfo1, containsAbbrYear_numericInfo1)) {
                relations.add(EntityLink.EntityRelation.same_as_temporal);
                return relations;
            }
        }
        if (containsAbbrYear_numericInfo1) {
            log.debug("Abbreviated year match for: {} <-> {}?", numericInfo, string);
            if (abbreviatedYearsMatch(numericInfo1, numericInfo2, containsAbbrYear_numericInfo2)) {
                relations.add(EntityLink.EntityRelation.same_as_temporal);
                return relations;
            }
        }
        // NOTE: the plain number comparison is the alternative to this last check only, i.e. it
        // runs whenever the second argument is not an abbreviated year.
        if (containsAbbrYear_numericInfo2) {
            log.debug("Abbreviated year match for: {} <-> {}?", string, numericInfo);
            if (abbreviatedYearsMatch(numericInfo2, numericInfo1, containsAbbrYear_numericInfo1)) {
                relations.add(EntityLink.EntityRelation.same_as_temporal);
                return relations;
            }
        }
        else {
            log.debug("Number match for: {} <-> {}?", numericInfo, string);
            if (floatsMatch(numericInfo1, numericInfo2)) {
                relations.add(EntityLink.EntityRelation.same_as_temporal);
                return relations;
            }
        }
        return null;
    }

    /**
     * Parses every entry of the list as float.
     *
     * @throws NumberFormatException if an entry is not a parsable float
     */
    private static float[] toFloatArray(List<String> numberList) {
        float[] res = new float[numberList.size()];
        int i = 0;
        for (String number : numberList) {
            res[i++] = Float.parseFloat(number);
        }
        return res;
    }

    /**
     * Checks whether given value lies inside of range1.
     *
     * @param range1 lower and upper boundary of the range
     * @param value  value to check
     * @return true if the value lies inside of the range (boundaries included); false if it
     *         does not or if boundaries or value cannot be parsed as float
     */
    static boolean inRange(List<String> range1, String value) {
        try {
            float[] info1 = toFloatArray(range1);
            float year2 = Float.parseFloat(value);
            return inRange(info1, year2);
        }
        catch (NumberFormatException nfe) { log.debug(nfe.getMessage()); return false; }
    }

    /**
     * Checks whether given value lies inside of range1.
     *
     * @param range1 lower and upper boundary of the range
     * @param value  value to check
     * @return true if the value lies inside of the range (boundaries included); false if it
     *         does not, if fewer than two boundaries are given or if the lower boundary is
     *         greater than the upper one
     */
    static boolean inRange(float[] range1, float value) {
        if (range1.length < 2) return false;
        float year1a = range1[0];
        float year1b = range1[1];
        // probably not a range after all (e.g. Ausländer in Deutschland 2000 - 2. Welle)
        if (year1a > year1b) return false;
        return value >= year1a && value <= year1b;
    }

    /**
     * Computes the temporal relations between two overlapping ranges.
     * Period a: year1a - year1b (range1); period b: year2a - year2b (range2).
     * <ol start="0">
     * <li>a and b are equal, e.g. 1991-1999 and 1991-1999: {@code same_as_temporal}</li>
     * <li>b is entirely covered by a, e.g. b = 1991-1999, a = 1990-2000:
     * {@code superset_of_temporal} ({@code part_of_temporal} if inverted)</li>
     * <li>a is entirely covered by b, e.g. a = 1990-2000, b = 1980-2013:
     * {@code part_of_temporal} ({@code superset_of_temporal} if inverted)</li>
     * <li>a and b cover each other only partly, e.g. 1980-1999 and 1990-2000:
     * both {@code superset_of_temporal} and {@code part_of_temporal}</li>
     * </ol>
     * Containment is decided before partial overlap, so ranges sharing exactly one boundary
     * are treated as containment and honour {@code invert}.
     *
     * @param range1 lower and upper boundary of period a
     * @param range2 lower and upper boundary of period b
     * @param invert whether the arguments were swapped by the caller, so that the resulting
     *               relation has to be inverted
     * @return the relations found; an empty set if a lower boundary is greater than its upper
     *         boundary; {@code null} if the ranges do not overlap, hold fewer than two
     *         boundaries or cannot be parsed as floats
     */
    static Set<EntityLink.EntityRelation> overlap(List<String> range1,
            List<String> range2, boolean invert) {
        log.debug("computing overlap between '{}' and '{}'. Invert: {}", range1, range2, invert);
        Set<EntityLink.EntityRelation> relations = new HashSet<>();
        if (range1.size() < 2 || range2.size() < 2) return null;

        float year1a, year1b, year2a, year2b;
        try {
            float[] info1 = toFloatArray(range1);
            float[] info2 = toFloatArray(range2);
            year1a = info1[0]; year1b = info1[1];
            year2a = info2[0]; year2b = info2[1];
        }
        catch (NumberFormatException nfe) { log.debug(nfe.getMessage()); return null; }

        // probably not a range after all (e.g. Ausländer in Deutschland 2000 - 2. Welle)
        if (year1a > year1b || year2a > year2b) return relations;

        if (year1a == year2a && year1b == year2b) {
            // periods are equal
            relations.add(EntityLink.EntityRelation.same_as_temporal);
        }
        else if (year2a >= year1a && year2b <= year1b) {
            // period b is entirely covered by period a
            relations.add(invert
                    ? EntityLink.EntityRelation.part_of_temporal
                    : EntityLink.EntityRelation.superset_of_temporal);
        }
        else if (year2a <= year1a && year2b >= year1b) {
            // period a is entirely covered by period b
            relations.add(invert
                    ? EntityLink.EntityRelation.superset_of_temporal
                    : EntityLink.EntityRelation.part_of_temporal);
        }
        else if (year2a <= year1b && year1a <= year2b) {
            // periods cover each other partly; the relation is symmetric
            relations.add(EntityLink.EntityRelation.superset_of_temporal);
            relations.add(EntityLink.EntityRelation.part_of_temporal);
        }
        else {
            // periods are disjoint
            return null;
        }
        log.debug("returning {}", relations);
        return relations;
    }
}