package io.github.infolis.algorithm; import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Set; import io.github.infolis.datastore.DataStoreClient; import io.github.infolis.datastore.FileResolver; import io.github.infolis.model.EntityType; import io.github.infolis.model.TextualReference; import io.github.infolis.model.entity.Entity; import io.github.infolis.model.entity.EntityLink; import io.github.infolis.model.entity.SearchResult; import io.github.infolis.infolink.querying.SearchResultScorer; import io.github.infolis.infolink.querying.QueryService; import io.github.infolis.infolink.querying.QueryService.QueryField; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * * @author kata * @author domi * */ public abstract class SearchResultLinker extends BaseAlgorithm { private static final Logger log = LoggerFactory.getLogger(SearchResultLinker.class); // weight for number-based score, weight for reliability of QueryService, weight for list index private float[] weights = {1.0f, 1.0f, 1.0f}; Set<QueryField> queryStrategy; private int maxNum = 1000; public SearchResultLinker(DataStoreClient inputDataStoreClient, DataStoreClient outputDataStoreClient, FileResolver inputFileResolver, FileResolver outputFileResolver) { super(inputDataStoreClient, outputDataStoreClient, inputFileResolver, outputFileResolver); } public void setWeightForNumberBasedScore(float weight) { weights[0] = weight; } public void setWeightForQSReliability(float weight) { weights[1] = weight; } public void setWeightForListIndex(float weight) { weights[2] = weight; } public void setQueryStrategy(Set<QueryField> queryStrategy) { this.queryStrategy = queryStrategy; } public Set<QueryField> getQueryStrategy() { return this.queryStrategy; } public void setMaxNum(int maxNum) { this.maxNum = maxNum; } public int getMaxNum() { return this.maxNum; } public static class CandidateTargetEntity { SearchResult searchResult; double score; Set<EntityLink.EntityRelation> entityRelations = new HashSet<>(); public void setSearchResult(SearchResult searchResult) { this.searchResult = searchResult; } public void setScore(double score) { this.score = score; } public void setEntityRelations(Set<EntityLink.EntityRelation> entityRelations) { this.entityRelations = entityRelations; } public void addEntityRelation(EntityLink.EntityRelation entityRelation) { this.entityRelations.add(entityRelation); } } public List<CandidateTargetEntity> rankResults(Entity entity) { List<String> searchResultURIs = getExecution().getSearchResults(); List<SearchResult> searchResults = getInputDataStoreClient().get( SearchResult.class, searchResultURIs); List<CandidateTargetEntity> candidates = new ArrayList<>(); int counter = 0; for (SearchResult searchResult : searchResults) { counter++; double confidenceValue = 0.0; int factors = 0; CandidateTargetEntity searchResultCandidate = SearchResultScorer.computeScoreBasedOnNumbers(entity, searchResult); if (0 != weights[0]) { log.debug("Computing score based on numbers. Weight: " + weights[0]); confidenceValue = weights[0] * searchResultCandidate.score; factors++; log.debug("number score: " + confidenceValue); } if (0 != weights[1]) { log.debug("Adding score based on query service reliability. Weight: " + weights[1]); double serviceScore = weights[1] * getInputDataStoreClient().get(QueryService.class, searchResult.getQueryService()).getServiceReliability(); factors++; log.debug("service score: " + serviceScore); confidenceValue += serviceScore; } if (0 != weights[2]) { log.debug("Adding score based on list index. Weight: " + weights[2]); // normalize: +1 to avoid NaN if only results contains only one search result double indexScore = weights[2] * (1 - ((double) searchResult.getListIndex() / ((double) searchResults.get(searchResults.size() - 1).getListIndex() + 1))); factors++; log.debug("index score: " + indexScore); confidenceValue += indexScore; } if (0 != factors) confidenceValue = confidenceValue / factors; log.debug("Confidence score: " + confidenceValue); CandidateTargetEntity candidate = new CandidateTargetEntity(); candidate.searchResult = searchResult; candidate.score = confidenceValue; candidate.entityRelations = searchResultCandidate.entityRelations; candidates.add(candidate); updateProgress(counter, searchResults.size()); } return candidates; } public List<CandidateTargetEntity> rankResults(TextualReference textRef) { List<String> searchResultURIs = getExecution().getSearchResults(); List<SearchResult> searchResults = getInputDataStoreClient().get(SearchResult.class, searchResultURIs); List<CandidateTargetEntity> candidates = new ArrayList<>(); int counter = 0; for (SearchResult searchResult : searchResults) { counter++; double confidenceValue = 0.0; int factors = 0; CandidateTargetEntity searchResultCandidate = SearchResultScorer.computeScoreBasedOnNumbers(textRef, searchResult); if (0 != weights[0]) { log.debug("Computing score based on numbers. Weight: " + weights[0]); confidenceValue = weights[0] * searchResultCandidate.score; factors++; log.debug("number score: " + confidenceValue); } if (0 != weights[1]) { log.debug("Adding score based on query service reliability. Weight: " + weights[1]); double serviceScore = weights[1] * getInputDataStoreClient().get(QueryService.class, searchResult.getQueryService()).getServiceReliability(); factors++; log.debug("service score: " + serviceScore); confidenceValue += serviceScore; } if (0 != weights[2]) { log.debug("Adding score based on list index. Weight: " + weights[2]); // normalize: +1 to avoid NaN if only results contains only one search result double indexScore = weights[2] * (1 - ((double) searchResult.getListIndex() / ((double) searchResults.get(searchResults.size() - 1).getListIndex() + 1))); factors++; log.debug("index score: " + indexScore); confidenceValue += indexScore; } if (0 != factors) confidenceValue = confidenceValue / factors; log.debug("Confidence score: " + confidenceValue); CandidateTargetEntity candidate = new CandidateTargetEntity(); candidate.searchResult = searchResult; candidate.score = confidenceValue; candidate.entityRelations = searchResultCandidate.entityRelations; candidates.add(candidate); updateProgress(counter, searchResults.size()); } return candidates; } // the confidence score of the best result equals the confidence score of the QueryService public List<CandidateTargetEntity> getBestResultsAtFirstIndex() { List<CandidateTargetEntity> candidates = new ArrayList<>(); List<SearchResult> searchResults = getInputDataStoreClient().get( SearchResult.class, getExecution().getSearchResults()); for (SearchResult searchResult : searchResults) { if (searchResult.getListIndex() == 0) { double confidence = weights[1] * getInputDataStoreClient().get(QueryService.class, searchResult.getQueryService()) .getServiceReliability(); CandidateTargetEntity candidate = new CandidateTargetEntity(); candidate.searchResult = searchResult; candidate.score = confidence; candidate.entityRelations = new HashSet<>(Arrays.asList( EntityLink.EntityRelation.unknown)); candidates.add(candidate); } } return candidates; } public List<CandidateTargetEntity> getBestSearchResult(List<CandidateTargetEntity> candidates) { CandidateTargetEntity bestCandidate = null; double bestScore = -1.0; log.debug("Selecting the best search results"); for (CandidateTargetEntity candidate : candidates) { if (candidate.score > bestScore) { bestScore = candidate.score; bestCandidate = candidate; } } log.debug("Best search result: " + bestCandidate.searchResult.getIdentifier() + ": " + bestCandidate.searchResult.getTitles()); log.debug("Score: " + bestScore); List<CandidateTargetEntity> bestCandidates = new ArrayList<>(); CandidateTargetEntity candidate = new CandidateTargetEntity(); candidate.searchResult = bestCandidate.searchResult; candidate.score = bestScore; candidate.entityRelations = new HashSet<>(Arrays.asList( EntityLink.EntityRelation.unknown)); bestCandidates.add(candidate); return bestCandidates; } public List<CandidateTargetEntity> getMatchingSearchResults( List<CandidateTargetEntity> candidates, double threshold) { log.debug("Selecting all search results with score above or equal to threshold"); List<CandidateTargetEntity> matchingCandidates = new ArrayList<>(); for (CandidateTargetEntity candidate : candidates) { log.debug("Score for search result " + candidate.searchResult.getUri() + ": " + candidate.score); if (candidate.score >= threshold) { matchingCandidates.add(candidate); } } return matchingCandidates; } public List<String> createLinks(Entity fromEntity, List<CandidateTargetEntity> candidates) { List<String> entityLinks = new ArrayList<>(); for (CandidateTargetEntity candidate : candidates) { Entity toEntity = new Entity(); toEntity.setTags(candidate.searchResult.getTags()); toEntity.addAllTags(getExecution().getTags()); // TODO as of now, setting EntityType to dataset is always correct // if queryservices are added which incorporate databases also, // distinguish the types here toEntity.setEntityType(EntityType.dataset); toEntity.addIdentifier(candidate.searchResult.getIdentifier()); if (candidate.searchResult.getTitles() != null && candidate.searchResult.getTitles().size()>0) { toEntity.setName(candidate.searchResult.getTitles().get(0)); } if (candidate.searchResult.getNumericInformation() != null && candidate.searchResult.getNumericInformation().size()>0) { List<String> numInfo = new ArrayList<>(); numInfo.add(candidate.searchResult.getNumericInformation().get(0)); toEntity.setNumericInfo(numInfo); } getOutputDataStoreClient().post(Entity.class, toEntity); log.debug("Creating link for entity: " + fromEntity.getUri()); EntityLink el = new EntityLink(fromEntity.getUri(), toEntity.getUri(), candidate.score, ""); el.setEntityRelations(candidate.entityRelations); el.setTags(toEntity.getTags()); getOutputDataStoreClient().post(EntityLink.class, el); entityLinks.add(el.getUri()); } return entityLinks; } public List<String> createLinks(TextualReference textRef, List<CandidateTargetEntity> candidates) { List<String> entityLinks = new ArrayList<>(); for (CandidateTargetEntity candidate : candidates) { Entity referencedInstance = new Entity(); referencedInstance.setTags(candidate.searchResult.getTags()); referencedInstance.addAllTags(getExecution().getTags()); // TODO as of now, setting EntityType to dataset is always correct // if queryservices are added which incorporate databases also, // distinguish the types here referencedInstance.setEntityType(EntityType.dataset); referencedInstance.addIdentifier(candidate.searchResult.getIdentifier()); if(candidate.searchResult.getTitles() != null && candidate.searchResult.getTitles().size()>0) { referencedInstance.setName(candidate.searchResult.getTitles().get(0)); } if(candidate.searchResult.getNumericInformation() != null && candidate.searchResult.getNumericInformation().size()>0) { List<String> numInfo = new ArrayList<>(); numInfo.add(candidate.searchResult.getNumericInformation().get(0)); referencedInstance.setNumericInfo(numInfo); } getOutputDataStoreClient().post(Entity.class, referencedInstance); String linkReason = textRef.getUri(); log.debug("Creating link for TextualReference: " + textRef.getReference() + "; mentionsReference: " + textRef.getMentionsReference()); log.debug("File: " + textRef.getTextFile()); EntityLink el = new EntityLink(textRef.getMentionsReference(), referencedInstance.getUri(), candidate.score, linkReason); el.setEntityRelations(candidate.entityRelations); el.setTags(referencedInstance.getTags()); getOutputDataStoreClient().post(EntityLink.class, el); entityLinks.add(el.getUri()); } return entityLinks; } @Override public void validate() throws IllegalAlgorithmArgumentException { if (null == getExecution().getSearchResults() || getExecution().getSearchResults().isEmpty() ){ throw new IllegalAlgorithmArgumentException(getClass(), "searchResults", "Required parameter 'search results' is missing!"); } if (null == getExecution().getTextualReferences() || getExecution().getTextualReferences().isEmpty()) { if (null == getExecution().getLinkedEntities() || getExecution().getLinkedEntities().isEmpty()) { throw new IllegalAlgorithmArgumentException(getClass(), "linkedEntities/textualReferences", "Required parameter 'linked entities' or 'textual references' is missing!"); } } } }