//Copyright (C) 2010 Steffen Rendle, Zeno Gantner
//Copyright (C) 2011 Zeno Gantner, Chris Newell
//
//This file is part of MyMediaLite.
//
//MyMediaLite is free software: you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation, either version 3 of the License, or
//(at your option) any later version.
//
//MyMediaLite is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with MyMediaLite. If not, see <http://www.gnu.org/licenses/>.
package org.mymedialite.correlation;
import it.unimi.dsi.fastutil.ints.IntList;
import java.util.HashSet;
import org.mymedialite.datatype.IBooleanMatrix;
import org.mymedialite.datatype.SymmetricMatrix;
import org.mymedialite.util.Memory;
/**
 * Class for storing cosine similarities between binary (set-valued) vectors.
 * http://en.wikipedia.org/wiki/Cosine_similarity
 *
 * For two binary vectors the cosine similarity is the number of shared
 * entries divided by the square root of the product of the two entry counts.
 * @version 2.03
 */
public final class BinaryCosine extends BinaryDataCorrelationMatrix {

  /**
   * Creates an object of type Cosine.
   * @param numEntities the number of entities
   */
  public BinaryCosine(int numEntities) {
    super(numEntities);
  }

  /**
   * Creates a Cosine similarity matrix from given data.
   * @param vectors the boolean data
   * @return the similarity matrix based on the data
   * @throws OutOfMemoryError rethrown (after logging the entity count to stderr)
   *         if the matrix cannot be allocated
   */
  public static CorrelationMatrix create(IBooleanMatrix vectors) {
    BinaryDataCorrelationMatrix cm;
    int numEntities = vectors.numberOfRows();
    try {
      cm = new BinaryCosine(numEntities);
    } catch (OutOfMemoryError e) {
      System.err.println("Too many entities: " + numEntities);
      throw e;
    }
    cm.computeCorrelations(vectors);
    return cm;
  }

  /**
   * Computes the cosine similarity for every pair of rows of the given data
   * and stores the results in this matrix.
   *
   * The pairwise overlap counters are kept as shorts when the number of
   * columns does not exceed Short.MAX_VALUE, and as ints otherwise.
   * @param entityData the boolean data, one row per entity
   */
  public void computeCorrelations(IBooleanMatrix entityData) {
    // If possible, save some memory
    if (entityData.numberOfColumns() > Short.MAX_VALUE) {
      computeCorrelationsUIntOverlap(entityData);
    } else {
      computeCorrelationsUShortOverlap(entityData);
    }
  }

  /**
   * Computes all pairwise cosine similarities using int overlap counters.
   * @param entityData the boolean data, one row per entity
   */
  void computeCorrelationsUIntOverlap(IBooleanMatrix entityData) {
    IBooleanMatrix transpose = (IBooleanMatrix) entityData.transpose();
    SymmetricMatrix<Integer> overlap =
        new SymmetricMatrix<Integer>(entityData.numberOfRows(), Integer.valueOf(0));
    overlap.init(Integer.valueOf(0));

    // Each row of the transpose lists the entities sharing one column;
    // every pair in that list gains one common entry.
    for (int rowId = 0; rowId < transpose.numberOfRows(); rowId++) {
      IntList row = transpose.getEntriesByRow(rowId);
      int rowSize = row.size();
      for (int i = 0; i < rowSize; i++) {
        int x = row.getInt(i);
        for (int j = i + 1; j < rowSize; j++) {
          int y = row.getInt(j);
          overlap.set(x, y, overlap.get(x, y) + 1);
        }
      }
    }

    // The diagonal of the correlation matrix
    for (int i = 0; i < numEntities; i++) {
      set(i, i, 1.0F);
    }

    // Compute cosine
    for (int x = 0; x < numEntities; x++) {
      int countX = entityData.numEntriesByRow(x);
      for (int y = 0; y < x; y++) {
        set(x, y, cosine(overlap.get(x, y), countX, entityData.numEntriesByRow(y)));
      }
    }
  }

  /**
   * Computes all pairwise cosine similarities using short overlap counters.
   * @param entity_data the boolean data, one row per entity
   */
  void computeCorrelationsUShortOverlap(IBooleanMatrix entity_data) {
    IBooleanMatrix transpose = (IBooleanMatrix) entity_data.transpose();
    SymmetricMatrix<Short> overlap =
        new SymmetricMatrix<Short>(entity_data.numberOfRows(), Short.valueOf((short) 0));
    overlap.init(Short.valueOf((short) 0));

    // Each row of the transpose lists the entities sharing one column;
    // every pair in that list gains one common entry.
    for (int rowId = 0; rowId < transpose.numberOfRows(); rowId++) {
      IntList row = transpose.getEntriesByRow(rowId);
      int rowSize = row.size();
      for (int i = 0; i < rowSize; i++) {
        int x = row.getInt(i);
        for (int j = i + 1; j < rowSize; j++) {
          int y = row.getInt(j);
          overlap.set(x, y, (short) (overlap.get(x, y) + 1));
        }
      }
    }

    // The diagonal of the correlation matrix
    for (int i = 0; i < numEntities; i++) {
      set(i, i, 1.0F);
    }

    // Compute cosine
    for (int x = 0; x < numEntities; x++) {
      int countX = entity_data.numEntriesByRow(x);
      for (int y = 0; y < x; y++) {
        set(x, y, cosine(overlap.get(x, y), countX, entity_data.numEntriesByRow(y)));
      }
    }
  }

  /**
   * Turns an overlap count and two entry counts into a cosine similarity.
   * @param overlap the number of entries the two vectors have in common
   * @param countX the number of entries of the first vector
   * @param countY the number of entries of the second vector
   * @return 0 if either vector is empty, otherwise overlap / sqrt(countX * countY)
   */
  private static float cosine(int overlap, int countX, int countY) {
    if (countX == 0 || countY == 0) {
      return 0.0F;
    }
    // Multiply as double: an int product wraps around once it exceeds Integer.MAX_VALUE.
    return (float) (overlap / Math.sqrt((double) countX * countY));
  }

  /**
   * Computes the cosine similarity of two binary vectors.
   * @param vectorI the first vector
   * @param vectorJ the second vector
   * @return the cosine similarity between the two vectors; 0 if either vector is empty
   */
  public static float computeCorrelation(HashSet<Integer> vectorI, HashSet<Integer> vectorJ) {
    // An empty vector would otherwise yield 0/0 = NaN; the matrix computation uses 0 here.
    if (vectorI.isEmpty() || vectorJ.isEmpty()) {
      return 0.0F;
    }

    // Walk the smaller set and probe the larger one; the intersection size is the same.
    HashSet<Integer> smaller = vectorI.size() <= vectorJ.size() ? vectorI : vectorJ;
    HashSet<Integer> larger = (smaller == vectorI) ? vectorJ : vectorI;

    int cntr = 0;
    for (Integer k : smaller) {
      if (larger.contains(k)) {
        cntr++;
      }
    }

    // Multiply as double: an int product wraps around once it exceeds Integer.MAX_VALUE.
    return cntr / (float) Math.sqrt((double) vectorI.size() * vectorJ.size());
  }
}