package org.wikibrain.sr.vector; import gnu.trove.map.TIntFloatMap; import gnu.trove.map.hash.TIntFloatHashMap; import org.junit.Ignore; import org.junit.Test; import org.wikibrain.matrix.SparseMatrixRow; import org.wikibrain.matrix.ValueConf; import org.wikibrain.sr.utils.SimUtils; import java.util.Random; import static org.junit.Assert.*; /** * @author Shilad Sen */ public class TestCosineSimilarity { static final int [] ROW1_IDS = { 7, 9, 12, 5, 6, 2 }; static final float [] ROW1_VALS = { 0.3f, 0.5f, 0.2f, 0.7f, 0.8f, 0.1f }; static final int [] ROW2_IDS = { 5, 3, 2, 4, 7 }; static final float [] ROW2_VALS = { 0.8f, 0.1f, 0.2f, 0.4f, 0.5f }; @Test public void testUtil() { TIntFloatMap row1 = getMap(ROW1_IDS, ROW1_VALS); TIntFloatMap row2 = getMap(ROW2_IDS, ROW2_VALS); double expected = cosineSimilarity(row1, row2); double actual = SimUtils.cosineSimilarity(row1, row2); assertEquals(expected, actual, 0.0001); } @Test public void testMap() { TIntFloatMap row1 = getMap(ROW1_IDS, ROW1_VALS); TIntFloatMap row2 = getMap(ROW2_IDS, ROW2_VALS); double expected = cosineSimilarity(row1, row2); double actual = new CosineSimilarity().similarity(row1, row2); assertEquals(expected, actual, 0.0001); actual = new CosineSimilarity().similarity(row2, row1); assertEquals(expected, actual, 0.0001); } @Test public void testRows() { TIntFloatMap map1 = getMap(ROW1_IDS, ROW1_VALS); TIntFloatMap map2 = getMap(ROW2_IDS, ROW2_VALS); SparseMatrixRow row1 = getRow(ROW1_IDS, ROW1_VALS); SparseMatrixRow row2 = getRow(ROW2_IDS, ROW2_VALS); double expected = cosineSimilarity(map1, map2); double actual = new CosineSimilarity().similarity(row1, row2); assertEquals(expected, actual, 0.0001); actual = new CosineSimilarity().similarity(row2, row1); assertEquals(expected, actual, 0.0001); } private double cosineSimilarity(TIntFloatMap row1, TIntFloatMap row2) { double adota = 0.0; double bdotb = 0.0; double adotb = 0.0; for (double v : row1.values()) { adota += v * v; } for (double v : row2.values()) { bdotb += v * v; } for (int id : row1.keys()) { if (row2.containsKey(id)) { adotb += row1.get(id) * row2.get(id); } } return adotb / (Math.sqrt(adota) * Math.sqrt(bdotb)); } private SparseMatrixRow getRow(int [] ids, float [] vals) { assertEquals(ids.length, vals.length); return new SparseMatrixRow(new ValueConf(), 34, ids, vals); } private TIntFloatMap getMap(int []ids, float [] vals) { assertEquals(ids.length, vals.length); TIntFloatHashMap map = new TIntFloatHashMap(); for (int i = 0; i < ids.length; i++) { map.put(ids[i], vals[i]); } return map; } @Ignore @Test public void benchmark() { int numOuter = 100; int numInner = 10000; long before = System.currentTimeMillis(); double sum = 0; Random random = new Random(); for (int i = 0; i < numOuter; i++) { int overlap[] = new int[10]; for (int j = 0; j < overlap.length; j++) { overlap[j] = random.nextInt(Integer.MAX_VALUE / 10); } SparseMatrixRow row1 = makeRow(100, overlap); SparseMatrixRow row2 = makeRow(100, overlap); CosineSimilarity sim = new CosineSimilarity(); for (int j = 0; j < numInner; j++) { sum += sim.similarity(row1, row2); } } long after = System.currentTimeMillis(); System.out.println("elapsed is " + (after - before) + " sim is " + (sum / (numOuter * numInner))); } private SparseMatrixRow makeRow(int size, int[] mustInclude) { Random random = new Random(); int ids[] = new int[size]; float vals[] = new float[size]; for (int i = 0; i < size; i++) { ids[i] = (i < mustInclude.length) ? mustInclude[i] : random.nextInt(Integer.MAX_VALUE / 10); vals[i] = random.nextFloat(); } return getRow(ids, vals); } }