package org.wikibrain.sr.utils; import gnu.trove.iterator.TIntDoubleIterator; import gnu.trove.map.TIntDoubleMap; import gnu.trove.map.TIntFloatMap; import gnu.trove.map.hash.TIntDoubleHashMap; import gnu.trove.map.hash.TIntFloatHashMap; import org.apache.commons.lang3.ArrayUtils; import org.wikibrain.lucene.WikiBrainScoreDoc; import org.wikibrain.matrix.MatrixRow; import java.util.*; /** * * */ public class SimUtils { public static double cosineSimilarity(TIntDoubleMap X, TIntDoubleMap Y) { double xDotX = 0.0; double yDotY = 0.0; double xDotY = 0.0; for (int id : X.keys()) { double x = X.get(id); xDotX += x * x; if (Y.containsKey(id)) { xDotY += x * Y.get(id); } } for (double y : Y.values()) { yDotY += y * y; } return xDotX * yDotY != 0 ? xDotY / Math.sqrt(xDotX * yDotY): 0.0; } public static double cosineSimilarity(TIntFloatMap X, TIntFloatMap Y) { double xDotX = 0.0; double yDotY = 0.0; double xDotY = 0.0; for (int id : X.keys()) { double x = X.get(id); xDotX += x * x; if (Y.containsKey(id)) { xDotY += x * Y.get(id); } } for (double y : Y.values()) { yDotY += y * y; } return xDotX * yDotY != 0 ? xDotY / Math.sqrt(xDotX * yDotY): 0.0; } public static double cosineSimilarity(MatrixRow a, MatrixRow b) { double adota = 0.0; double bdotb = 0.0; double adotb = 0.0; int na = a.getNumCols(); int nb = b.getNumCols(); int i = 0, j = 0; if((na == 0 || nb == 0)) { // do not perform calculations if one or both are 0 return 0; } int ca = a.getColIndex(i); int cb = b.getColIndex(j); while (i < na && j < nb) { if (ca < cb) { // if matrix a has a lower value, then get the next column float va = a.getColValue(i++); adota += va * va; ca = a.getColIndex(i); } else if (ca > cb) { // if matrix b has a lower value, then get the next column float vb = b.getColValue(j++); bdotb += vb * vb; cb = b.getColIndex(j); } else { // if both have the same value, increment the intersection and get the next columns in both matrices float va = a.getColValue(i++); float vb = b.getColValue(j++); adota += va * va; bdotb += vb * vb; adotb += va * vb; ca = a.getColIndex(i); cb = b.getColIndex(j); } } for (; i < na; i++) { float va = a.getColValue(i); adota += va * va; } for (; j < nb; j++) { float vb = b.getColValue(j); bdotb += vb * vb; } if (adota * bdotb * adotb == 0) { return 0.0; } else { return adotb / Math.sqrt(adota * bdotb); } } public static double googleSimilarity(int sizeA, int sizeB, int intersection, int numTotal) { return 1.0 - (Math.log(Math.max(sizeA,sizeB))-Math.log(intersection)) / (Math.log(numTotal)-Math.log(Math.min(sizeA,sizeB))); } /** * Normalize a vector to unit length. * @param X * @return */ public static TIntDoubleMap normalizeVector(TIntDoubleMap X) { TIntDoubleHashMap Y = new TIntDoubleHashMap(); double sumSquares = 0.0; for (double x : X.values()) { sumSquares += x * x; } if (sumSquares != 0.0) { double norm = Math.sqrt(sumSquares); for (int id : X.keys()) { Y.put(id, X.get(id) / norm); } return Y; } return X; } /** * Normalize a vector to unit length. * @param X * @return */ public static TIntFloatMap normalizeVector(TIntFloatMap X) { TIntFloatHashMap Y = new TIntFloatHashMap(); double sumSquares = 0.0; for (double x : X.values()) { sumSquares += x * x; } if (sumSquares != 0.0) { double norm = Math.sqrt(sumSquares); for (int id : X.keys()) { Y.put(id, (float) (X.get(id) / norm)); } return Y; } return X; } public static Map sortByValue(TIntDoubleHashMap unsortMap) { if (unsortMap.isEmpty()) { return new HashMap(); } HashMap<Integer, Double> tempMap = new HashMap<Integer, Double>(); TIntDoubleIterator iterator = unsortMap.iterator(); for ( int i = unsortMap.size(); i-- > 0; ) { iterator.advance(); tempMap.put( iterator.key(), iterator.value() ); } List<Map.Entry> list = new LinkedList<Map.Entry>(tempMap.entrySet()); // sort list based on comparator Collections.sort(list, Collections.reverseOrder(new Comparator() { public int compare(Object o1, Object o2) { return ((Comparable) ((Map.Entry) (o1)).getValue()) .compareTo(((Map.Entry) (o2)).getValue()); } })); Map sortedMap = new LinkedHashMap(); for (Iterator it = list.iterator(); it.hasNext();) { Map.Entry entry = (Map.Entry) it.next(); sortedMap.put(entry.getKey(), entry.getValue()); } return sortedMap; } /** * Prune a WikiBrainScoreDoc array. * @param wikibrainScoreDocs array of WikiBrainScoreDoc */ public static WikiBrainScoreDoc[] pruneSimilar(WikiBrainScoreDoc[] wikibrainScoreDocs) { if (wikibrainScoreDocs.length == 0) { return wikibrainScoreDocs; } int cutoff = wikibrainScoreDocs.length; double threshold = 0.005 * wikibrainScoreDocs[0].score; for (int i = 0, j = 100; j < wikibrainScoreDocs.length; i++, j++) { float delta = wikibrainScoreDocs[i].score - wikibrainScoreDocs[j].score; if (delta < threshold) { cutoff = j; break; } } if (cutoff < wikibrainScoreDocs.length) { // LOG.info("pruned results from " + docs.scoreDocs.length + " to " + cutoff); wikibrainScoreDocs = ArrayUtils.subarray(wikibrainScoreDocs, 0, cutoff); } return wikibrainScoreDocs; } public static double cosineSimilarity(float[] X, float[] Y) { if (X == null || Y == null) { return 0.0; } else if (X.length != Y.length) { throw new IllegalArgumentException(); } double xDotX = 0.0; double yDotY = 0.0; double xDotY = 0.0; for (int i = 0; i < X.length; i++) { xDotX += X[i] * X[i]; yDotY += Y[i] * Y[i]; xDotY += X[i] * Y[i]; } return xDotX * yDotY != 0 ? xDotY / Math.sqrt(xDotX * yDotY): 0.0; } }