package org.wikibrain.sr.evaluation;
import gnu.trove.map.TIntIntMap;
import gnu.trove.map.TObjectDoubleMap;
import gnu.trove.map.TObjectIntMap;
import gnu.trove.map.hash.TIntIntHashMap;
import gnu.trove.map.hash.TObjectDoubleHashMap;
import gnu.trove.map.hash.TObjectIntHashMap;
import org.wikibrain.core.lang.Language;
import org.wikibrain.sr.utils.KnownSim;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * The known similarity judgments for one phrase in one language: the phrase,
 * the page id most frequently associated with it, and the list of phrases it
 * has been compared against (one merged entry per distinct second phrase).
 *
 * @author Shilad Sen
 */
public class KnownMostSim {
    private final Language language;
    private final String phrase;
    private final int pageId;
    private final List<KnownSim> mostSimilar;

    /**
     * Creates a new KnownMostSim using a threshold of negative infinity,
     * i.e. without filtering entries by a minimum similarity.
     *
     * @see #KnownMostSim(java.util.List, double)
     * @param mostSim the known similarities; must not be empty
     * @throws IllegalArgumentException if the list is empty, or its entries
     *         disagree on phrase1 or language
     */
    public KnownMostSim(List<KnownSim> mostSim) {
        this(mostSim, Double.NEGATIVE_INFINITY);
    }

    /**
     * Creates a new KnownMostSim from a list of KnownSims.
     *
     * Each KnownSim's phrase1 and language must be identical.
     * If the list has duplicate phrase2, they will be merged into a single KnownSim with the mean similarity score.
     * All (postmerged) KnownSims with similarity less than threshold will be removed.
     * The final list is sorted in reverse order of similarity.
     *
     * @param mostSim the known similarities; must not be empty
     * @param threshold minimum mean similarity (inclusive) a merged entry needs in order to be kept
     * @throws IllegalArgumentException if the list is empty, or its entries
     *         disagree on phrase1 or language
     */
    public KnownMostSim(List<KnownSim> mostSim, double threshold) {
        if (mostSim.isEmpty()) {
            throw new IllegalArgumentException("mostSim must contain at least one KnownSim");
        }

        // The first entry defines the phrase and language; every other entry must agree.
        phrase = mostSim.get(0).phrase1;
        language = mostSim.get(0).language;
        for (KnownSim ks : mostSim) {
            if (!ks.phrase1.equals(phrase)) {
                throw new IllegalArgumentException("expected phrase " + phrase + ", received " + ks.phrase1);
            }
            if (!ks.language.equals(language)) {
                throw new IllegalArgumentException("expected language " + language + ", received " + ks.language);
            }
        }

        this.pageId = mostCommonPageId(mostSim);

        // Accumulate, per distinct phrase2, the number of judgments and the sum of
        // their similarities. For the page id, the value from the last occurrence wins.
        TObjectIntMap<String> ids = new TObjectIntHashMap<String>();
        TObjectIntMap<String> counts = new TObjectIntHashMap<String>();
        TObjectDoubleMap<String> sums = new TObjectDoubleHashMap<String>();
        for (KnownSim ks : mostSim) {
            ids.put(ks.phrase2, ks.wpId2);
            counts.adjustOrPutValue(ks.phrase2, 1, 1);
            sums.adjustOrPutValue(ks.phrase2, ks.similarity, ks.similarity);
        }

        // Emit one merged entry per phrase2 whose mean similarity reaches the threshold.
        this.mostSimilar = new ArrayList<KnownSim>();
        for (String phrase2 : counts.keySet()) {
            double mean = sums.get(phrase2) / counts.get(phrase2);
            if (mean >= threshold) {
                mostSimilar.add(new KnownSim(phrase, phrase2, pageId, ids.get(phrase2), mean, language));
            }
        }

        // Natural ordering of KnownSim, reversed.
        Collections.sort(this.mostSimilar);
        Collections.reverse(this.mostSimilar);
    }

    /**
     * Internal constructor for an instance whose entries are already merged,
     * filtered and ordered. Unlike the public constructors it accepts an empty
     * list, because phrase, language and page id are supplied explicitly.
     */
    private KnownMostSim(Language language, String phrase, int pageId, List<KnownSim> mostSimilar) {
        this.language = language;
        this.phrase = phrase;
        this.pageId = pageId;
        this.mostSimilar = mostSimilar;
    }

    /**
     * Returns the non-negative wpId1 that occurs most often in the list, or -1
     * if no entry has a non-negative wpId1. On a tie, the id that reaches the
     * winning count first (in list order) is returned.
     */
    private static int mostCommonPageId(List<KnownSim> sims) {
        int maxIdCount = 0;
        int maxId = -1;
        TIntIntMap idCounts = new TIntIntHashMap();
        for (KnownSim ks : sims) {
            if (ks.wpId1 >= 0) {
                int n = idCounts.adjustOrPutValue(ks.wpId1, 1, 1);
                if (n > maxIdCount) {
                    maxIdCount = n;
                    maxId = ks.wpId1;
                }
            }
        }
        return maxId;
    }

    /**
     * Returns a new KnownMostSim for the same phrase, language and page id that
     * keeps only the entries whose similarity is at least {@code threshold}.
     *
     * The result may contain no entries at all, and this method may be called
     * on such an instance as well.
     *
     * @param threshold minimum similarity (inclusive) an entry needs in order to be kept
     * @return a new, independently stored KnownMostSim; this instance is not modified
     */
    public KnownMostSim getAboveThreshold(double threshold) {
        // The entries are already merged and ordered, so filtering in place
        // (order preserved) is sufficient. Fresh KnownSim objects are created
        // so the two instances do not share entries.
        List<KnownSim> kept = new ArrayList<KnownSim>();
        for (KnownSim ks : mostSimilar) {
            if (ks.similarity >= threshold) {
                kept.add(new KnownSim(phrase, ks.phrase2, pageId, ks.wpId2, ks.similarity, language));
            }
        }
        return new KnownMostSim(language, phrase, pageId, kept);
    }

    /**
     * @return the language shared by every entry
     */
    public Language getLanguage() {
        return language;
    }

    /**
     * @return the merged entries, in the order established at construction.
     *         This is the internal list itself, not a copy.
     */
    public List<KnownSim> getMostSimilar() {
        return mostSimilar;
    }

    /**
     * @return the phrase (phrase1) shared by every entry
     */
    public String getPhrase() {
        return phrase;
    }

    /**
     * @return the most common non-negative wpId1 among the entries supplied at
     *         construction, or -1 if there was none
     */
    public int getPageId() {
        return pageId;
    }
}