//Copyright (C) 2011 Zeno Gantner, Chris Newell
//
//This file is part of MyMediaLite.
//
//MyMediaLite is free software: you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation, either version 3 of the License, or
//(at your option) any later version.
//
//MyMediaLite is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with MyMediaLite. If not, see <http://www.gnu.org/licenses/>.
//
package org.mymedialite.data;
import java.util.Calendar;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.mymedialite.datatype.SparseBooleanMatrix;
/**
* Extension methods for dataset statistics.
* @version 2.03
*/
public class Extensions {

    // Prevent instantiation.
    private Extensions() { }

    /**
     * Build a textual summary of a rating prediction dataset.
     *
     * The summary contains one line for the training data, one line for the test data
     * (if given), a rating period line for each dataset that is an {@link ITimedRatings},
     * optionally the number of users/items that occur in the test data but not in the
     * training data, and finally the attribute statistics.
     *
     * @param train the training data
     * @param test the test data, may be null
     * @param user_attributes the user attributes, may be null
     * @param item_attributes the item attributes, may be null
     * @param display_overlap if set true, display the user/item overlap between train and test
     * @return the statistics as a (possibly multi-line) string
     */
    public static String statistics(IRatings train,
            IRatings test,
            SparseBooleanMatrix user_attributes,
            SparseBooleanMatrix item_attributes,
            boolean display_overlap) {

        // Training data stats
        String s = datasetSummary("Training data", train.allUsers().size(), train.allItems().size(), train.size(), "ratings");
        s += ratingPeriod(train);

        // Test data stats
        if (test != null) {
            // TODO depends on the eval scheme whether this is correct
            s += datasetSummary("Test data", test.allUsers().size(), test.allItems().size(), test.size(), "ratings");
            s += ratingPeriod(test);
        }

        // Count and display the overlap between train and test
        if (display_overlap && test != null) {
            long start = System.currentTimeMillis();
            int num_new_users = countUnknown(test.allUsers(), train.allUsers());
            int num_new_items = countUnknown(test.allItems(), train.allItems());
            s += num_new_users + " new users, " + num_new_items + " new items " + (System.currentTimeMillis() - start) / 1000 + " seconds\n";
        }

        return s + statistics(user_attributes, item_attributes);
    }

    /**
     * Build a textual summary of an item recommendation dataset.
     *
     * @param training_data the training dataset
     * @param test_data the test dataset, may be null
     * @param user_attributes the user attributes, may be null
     * @param item_attributes the item attributes, may be null
     * @return the statistics as a (possibly multi-line) string
     */
    public static String statistics(IPosOnlyFeedback training_data,
            IPosOnlyFeedback test_data,
            SparseBooleanMatrix user_attributes,
            SparseBooleanMatrix item_attributes) {

        // Training data stats (the line is newline-terminated so that the following output starts on its own line)
        String s = datasetSummary("Training data", training_data.allUsers().size(), training_data.allItems().size(), training_data.size(), "events");

        // Test data stats
        if (test_data != null) {
            // TODO depends on the eval scheme whether this is correct
            s += datasetSummary("Test data", test_data.allUsers().size(), test_data.allItems().size(), test_data.size(), "events");
        }

        return s + statistics(user_attributes, item_attributes);
    }

    /**
     * Build a textual summary of user and item attributes.
     *
     * @param user_attributes the user attributes, may be null
     * @param item_attributes the item attributes, may be null
     * @return the statistics as a string; empty if both arguments are null
     */
    public static String statistics(SparseBooleanMatrix user_attributes, SparseBooleanMatrix item_attributes) {
        String s = "";
        if (user_attributes != null || item_attributes != null) {
            s += "Attribute data: ";
        }
        if (user_attributes != null) {
            s += user_attributes.numberOfColumns() + " user attributes for " +
                    user_attributes.numberOfRows() + " users, " +
                    user_attributes.numberOfEntries() + " assignments, " +
                    user_attributes.nonEmptyRowIDs().size() + " users with attribute assignments\n";
        }
        if (item_attributes != null) {
            s += item_attributes.nonEmptyColumnIDs().size() + " item attributes for " +
                    item_attributes.numberOfRows() + " items, " +
                    item_attributes.numberOfEntries() + " assignments, " +
                    item_attributes.nonEmptyRowIDs().size() + " items with attribute assignments\n";
        }
        return s;
    }

    /**
     * Format the one-line size/sparsity summary shared by all dataset kinds.
     *
     * @param label the line prefix, e.g. "Training data"
     * @param num_users the number of users in the dataset
     * @param num_items the number of items in the dataset
     * @param num_events the number of ratings/events in the dataset
     * @param event_name the noun used for the events, e.g. "ratings"
     * @return the newline-terminated summary line
     */
    private static String datasetSummary(String label, int num_users, int num_items, long num_events, String event_name) {
        // Multiply as long: users * items can exceed the int range.
        long matrix_size = (long) num_users * num_items;
        long empty_size = matrix_size - num_events;
        // Percentage of empty user-item cells; floating point division, so an empty matrix does not throw.
        double sparsity = (double) 100L * empty_size / matrix_size;
        // TODO floating point format for sparsity
        return label + ": " + num_users + " users, " + num_items + " items, " + num_events + " " + event_name + ", sparsity " + sparsity + "\n";
    }

    /**
     * Format the rating period line for datasets that carry time information.
     *
     * @param ratings the dataset
     * @return the newline-terminated rating period line, or an empty string if the dataset is not an ITimedRatings
     */
    private static String ratingPeriod(IRatings ratings) {
        if (!(ratings instanceof ITimedRatings)) {
            return "";
        }
        ITimedRatings timed_ratings = (ITimedRatings) ratings;
        return "Rating period: " + timed_ratings.earliestTime() + " to " + timed_ratings.latestTime() + "\n";
    }

    /**
     * Count the entries of one ID list that do not occur in another, without modifying either list.
     *
     * @param candidates the IDs to check
     * @param known the IDs that are already known
     * @return the number of entries in candidates that are not contained in known
     */
    private static int countUnknown(List<Integer> candidates, List<Integer> known) {
        // Hash the known IDs once so every membership test is constant time.
        Set<Integer> known_ids = new HashSet<Integer>(known);
        int count = 0;
        for (Integer id : candidates) {
            if (!known_ids.contains(id)) {
                count++;
            }
        }
        return count;
    }
}