package edu.gslis.ttg.jaccard;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.NavigableMap;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Pattern;
/**
 * In-memory store of pairwise Jaccard similarity scores between documents that are
 * identified by {@code long} ids.
 *
 * <p>A pair is unordered: {@code (a, b)} and {@code (b, a)} refer to the same entry. Scores
 * can be looked up by pair ({@link #getScore}) or pairs can be looked up by score
 * ({@link #getDocsForScore}, {@link #getDocsGreaterThanScore}).
 *
 * <p>This class performs no synchronization; it is backed by a plain {@link HashMap} and
 * {@link TreeMap}.
 */
public class JaccardStore {

    /** Matches any single character that is not an ASCII letter or digit (term separator). */
    private static final Pattern NON_ALPHANUMERIC = Pattern.compile("[^A-Za-z0-9]");

    // Keyed by a value type rather than long[]: arrays hash by identity, so a freshly built
    // long[] would never match a previously stored key.
    private final Map<DocPair, Double> scores; // <docPair, jaccardScore>
    private final TreeMap<Double, List<long[]>> scoreLookup; // <jaccardScore, docPairs>

    /** Creates an empty store. */
    public JaccardStore() {
        scores = new HashMap<DocPair, Double>();
        scoreLookup = new TreeMap<Double, List<long[]>>();
    }

    /**
     * Returns the score stored for the unordered pair {@code (doc1, doc2)}.
     *
     * @param doc1 id of one document
     * @param doc2 id of the other document
     * @return the score most recently passed to {@link #setScore} for this pair
     * @throws IllegalArgumentException if no score has been stored for this pair
     */
    public double getScore(long doc1, long doc2) {
        Double score = scores.get(new DocPair(doc1, doc2));
        if (score == null) {
            throw new IllegalArgumentException(
                    "No Jaccard score stored for documents " + doc1 + " and " + doc2);
        }
        return score.doubleValue();
    }

    /**
     * Stores {@code score} for the unordered pair {@code (doc1, doc2)}, replacing any score
     * previously stored for that pair in both the pair-to-score and score-to-pairs indexes.
     *
     * @param doc1 id of one document
     * @param doc2 id of the other document
     * @param score the similarity score to associate with the pair
     */
    public void setScore(long doc1, long doc2, double score) {
        DocPair pair = new DocPair(doc1, doc2);
        Double previous = scores.put(pair, score);
        if (previous != null) {
            // Drop the pair from its old bucket so the reverse index never reports a stale
            // score (and never lists the same pair twice).
            removeFromLookup(previous, pair);
        }
        List<long[]> bucket = scoreLookup.get(score);
        if (bucket == null) {
            bucket = new ArrayList<long[]>();
            scoreLookup.put(score, bucket);
        }
        bucket.add(pair.toArray());
    }

    /**
     * Returns the pairs whose stored score is exactly {@code score}.
     *
     * @param score the score to look up
     * @return the store's own list of two-element arrays {@code {smallerId, largerId}}
     *     (not a copy), or {@code null} if no pair currently has that score
     */
    public List<long[]> getDocsForScore(double score) {
        return scoreLookup.get(score);
    }

    /**
     * Returns all pairs whose stored score is greater than <em>or equal to</em> {@code score},
     * grouped by score in ascending order.
     *
     * @param score the inclusive lower bound
     * @return a view of the store's score index (not a copy); each value is a list of
     *     two-element arrays {@code {smallerId, largerId}}
     */
    public NavigableMap<Double, List<long[]>> getDocsGreaterThanScore(double score) {
        return scoreLookup.tailMap(score, true);
    }

    /**
     * Returns the number of distinct document pairs that have a stored score.
     *
     * @return the pair count
     */
    public int size() {
        return scores.size();
    }

    /** Removes {@code pair} from the bucket for {@code score}, discarding the bucket if emptied. */
    private void removeFromLookup(Double score, DocPair pair) {
        List<long[]> bucket = scoreLookup.get(score);
        if (bucket == null) {
            return;
        }
        long[] target = pair.toArray();
        for (int i = 0; i < bucket.size(); i++) {
            if (Arrays.equals(bucket.get(i), target)) {
                bucket.remove(i);
                break;
            }
        }
        if (bucket.isEmpty()) {
            scoreLookup.remove(score);
        }
    }

    /**
     * Computes the Jaccard similarity {@code |A ∩ B| / |A ∪ B|} of two term sets.
     *
     * @param doc1 terms of the first document; not modified
     * @param doc2 terms of the second document; not modified
     * @return a value in {@code [0, 1]}; {@code 1.0} when both sets are empty, since the
     *     ratio is otherwise undefined (0/0) and two empty documents are identical
     */
    public static double computeJaccardSimilarity(Set<String> doc1, Set<String> doc2) {
        Set<String> union = new HashSet<String>(doc1);
        union.addAll(doc2);
        if (union.isEmpty()) {
            return 1.0;
        }
        Set<String> intersection = new HashSet<String>(doc1);
        intersection.retainAll(doc2);
        return intersection.size() / (double) union.size();
    }

    /**
     * Computes the Jaccard similarity of two texts over their sets of distinct terms.
     *
     * <p>Each text is lower-cased and split on every character that is not an ASCII letter
     * or digit; empty fragments are discarded.
     *
     * @param doc1 text of the first document
     * @param doc2 text of the second document
     * @return the similarity as defined by {@link #computeJaccardSimilarity(Set, Set)}
     */
    public static double computeJaccardSimilarity(String doc1, String doc2) {
        return computeJaccardSimilarity(tokenize(doc1), tokenize(doc2));
    }

    /** Returns the distinct lower-cased ASCII-alphanumeric terms of {@code doc}. */
    private static Set<String> tokenize(String doc) {
        Set<String> terms = new HashSet<String>();
        // Locale.ROOT keeps casing independent of the JVM default locale (e.g. under a
        // Turkish locale "I" would otherwise lower-case to a non-ASCII dotless i).
        for (String term : NON_ALPHANUMERIC.split(doc.toLowerCase(Locale.ROOT))) {
            if (term.length() > 0) {
                terms.add(term);
            }
        }
        return terms;
    }

    /** Unordered pair of document ids with value-based equality, usable as a hash key. */
    private static final class DocPair {
        private final long first; // smaller id
        private final long second; // larger id

        DocPair(long doc1, long doc2) {
            this.first = Math.min(doc1, doc2);
            this.second = Math.max(doc1, doc2);
        }

        /** Returns a new array {@code {smallerId, largerId}}. */
        long[] toArray() {
            return new long[] {first, second};
        }

        @Override
        public boolean equals(Object other) {
            if (this == other) {
                return true;
            }
            if (!(other instanceof DocPair)) {
                return false;
            }
            DocPair that = (DocPair) other;
            return first == that.first && second == that.second;
        }

        @Override
        public int hashCode() {
            int hash = (int) (first ^ (first >>> 32));
            return 31 * hash + (int) (second ^ (second >>> 32));
        }
    }
}