// Copyright (C) 2010, 2011 Zeno Gantner, Chris Newell
//
// This file is part of MyMediaLite.
//
// MyMediaLite is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// MyMediaLite is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with MyMediaLite. If not, see <http://www.gnu.org/licenses/>.

package org.mymedialite.util;

import java.util.Arrays;
import java.util.Calendar;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.mymedialite.data.IPosOnlyFeedback;
import org.mymedialite.data.IRatings;
import org.mymedialite.data.ITimedRatings;
import org.mymedialite.datatype.SparseBooleanMatrix;

/**
 * Class containing utility functions.
 * @version 2.03
 */
public class Utils {

  // Prevent instantiation.
  private Utils() {}

  /**
   * Shuffle a list in-place.
   *
   * Fisher-Yates shuffle, see
   * http://en.wikipedia.org/wiki/Fisher–Yates_shuffle
   *
   * The random numbers are drawn from the generator returned by
   * {@code org.mymedialite.util.Random.getInstance()}.
   *
   * @param <T> the element type of the list
   * @param list the list to shuffle; it must support {@code set}
   */
  public static <T> void shuffle(List<T> list) {
    Random random = org.mymedialite.util.Random.getInstance();
    // The bound is deliberately i >= 0: the final iteration swaps element 0 with itself,
    // but keeping it leaves the number of draws taken from the generator unchanged.
    for (int i = list.size() - 1; i >= 0; i--) {
      int r = random.nextInt(i + 1);

      // Swap position i with position r
      T tmp = list.get(i);
      list.set(i, list.get(r));
      list.set(r, tmp);
    }
  }

  /**
   * Display dataset statistics.
   * @param train the training data
   * @param test the test data; may be null, in which case only training statistics are shown
   * @param user_attributes the user attributes; may be null
   * @param item_attributes the item attributes; may be null
   * @param display_overlap if set true, display the number of users/items that occur in test but not in train
   */
  public static void displayDataStats(
      IRatings train,
      IRatings test,
      SparseBooleanMatrix user_attributes,
      SparseBooleanMatrix item_attributes,
      boolean display_overlap) {

    // Training data stats
    int num_users = train.allUsers().size();
    int num_items = train.allItems().size();
    double sparsity = sparsityPercent(num_users, num_items, train.size());
    System.out.println("Training data: " + num_users + " users, " + num_items + " items, " + train.size() + " ratings, sparsity " + sparsity);
    if (train instanceof ITimedRatings) {
      ITimedRatings time_train = (ITimedRatings) train;
      System.out.println("Rating period: " + time_train.earliestTime() + " to " + time_train.latestTime());
    }

    // Test data stats
    if (test != null) {
      num_users = test.allUsers().size();
      num_items = test.allItems().size();
      // TODO depends on the eval scheme whether this is correct
      sparsity = sparsityPercent(num_users, num_items, test.size());
      System.out.println("Test data: " + num_users + " users, " + num_items + " items, " + test.size() + " ratings, sparsity " + sparsity);
      if (test instanceof ITimedRatings) {
        ITimedRatings time_test = (ITimedRatings) test;
        System.out.println("rating period: " + time_test.earliestTime() + " to " + time_test.latestTime());
      }
    }

    // Count and display the users/items that appear in test but are unknown to train
    if (display_overlap && test != null) {
      long start = System.currentTimeMillis();
      int num_new_users = countUnknown(test.allUsers(), train.allUsers());
      int num_new_items = countUnknown(test.allItems(), train.allItems());
      System.out.println(num_new_users + " new users, " + num_new_items + " new items, " + (System.currentTimeMillis() - start) / 1000 + " seconds");
    }

    displayAttributeStats(user_attributes, item_attributes);
  }

  /**
   * Display data statistics for item recommendation datasets.
   * @param training_data the training dataset
   * @param test_data the test dataset; may be null, in which case only training statistics are shown
   * @param user_attributes the user attributes; may be null
   * @param item_attributes the item attributes; may be null
   */
  public static void displayDataStats(
      IPosOnlyFeedback training_data,
      IPosOnlyFeedback test_data,
      SparseBooleanMatrix user_attributes,
      SparseBooleanMatrix item_attributes) {

    // training data stats
    int num_users = training_data.allUsers().size();
    int num_items = training_data.allItems().size();
    double sparsity = sparsityPercent(num_users, num_items, training_data.size());
    System.out.println("Training data: " + num_users + " users, " + num_items + " items, " + training_data.size() + " events, sparsity " + sparsity);

    // test data stats
    if (test_data != null) {
      num_users = test_data.allUsers().size();
      num_items = test_data.allItems().size();
      // TODO depends on the eval scheme whether this is correct
      sparsity = sparsityPercent(num_users, num_items, test_data.size());
      System.out.println("Test data: " + num_users + " users, " + num_items + " items, " + test_data.size() + " events, sparsity " + sparsity);
    }

    displayAttributeStats(user_attributes, item_attributes);
  }

  /**
   * Display statistics for user and item attributes.
   * Nothing is printed when both arguments are null.
   * @param user_attributes the user attributes; may be null
   * @param item_attributes the item attributes; may be null
   */
  public static void displayAttributeStats(SparseBooleanMatrix user_attributes, SparseBooleanMatrix item_attributes) {
    if (user_attributes != null || item_attributes != null) {
      System.out.print("Attribute data: ");
    }

    if (user_attributes != null) {
      System.out.println(
          user_attributes.numberOfColumns() + " user attributes for "
          + user_attributes.numberOfRows() + " users, "
          + user_attributes.numberOfEntries() + " assignments, "
          + user_attributes.nonEmptyRowIDs().size() + " users with attribute assignments"
      );
    }

    if (item_attributes != null) {
      System.out.println(
          item_attributes.nonEmptyColumnIDs().size() + " item attributes for "
          + item_attributes.numberOfRows() + " items, "
          + item_attributes.numberOfEntries() + " assignments, "
          + item_attributes.nonEmptyRowIDs().size() + " items with attribute assignments"
      );
    }
  }

  /**
   * Join a directory and a file name into one path.
   *
   * A "/" is inserted between the two parts unless the directory already ends with "/" or "\".
   * If the directory is null or empty, the file name is returned unchanged, so that a relative
   * file name is not turned into an absolute path by a leading "/".
   *
   * @param directory the directory part; may be null or empty
   * @param filename the file name part
   * @return the combined path
   */
  public static String combine(String directory, String filename) {
    if (directory == null || directory.length() == 0) {
      return filename;
    }
    if (!directory.endsWith("/") && !directory.endsWith("\\")) {
      directory = directory + "/";
    }
    return directory + filename;
  }

  /**
   * Compute the arithmetic mean of a collection of numbers.
   * @param <T> the numeric element type
   * @param values the values to average
   * @return the sum of the values (as doubles) divided by their count; NaN for an empty collection
   */
  public static <T extends Number> double average(Collection<T> values) {
    double sum = 0;
    for (Number value : values) {
      sum += value.doubleValue();
    }
    return sum / values.size();
  }

  /**
   * Compute the set of elements contained in both collections.
   * @param <T> the element type
   * @param a the first collection
   * @param b the second collection
   * @return a new set holding the elements of a that are also in b; the arguments are not modified
   */
  public static <T> Collection<T> intersect(Collection<T> a, Collection<T> b) {
    Set<T> intersection = new HashSet<T>(a);
    intersection.retainAll(b);
    return intersection;
  }

  /**
   * Compute the set of elements contained in at least one of the two collections.
   * @param <T> the element type
   * @param a the first collection
   * @param b the second collection
   * @return a new set holding the elements of a and of b; the arguments are not modified
   */
  public static <T> Collection<T> union(Collection<T> a, Collection<T> b) {
    Set<T> union = new HashSet<T>(a);
    union.addAll(b);
    return union;
  }

  /**
   * Parse a string as a decimal integer without throwing on malformed input.
   * @param string the text to parse
   * @return the parsed value, or null if the text is not a valid integer
   */
  public static Integer parseInteger(String string) {
    try {
      return Integer.valueOf(string);
    } catch (NumberFormatException e) {
      return null;
    }
  }

  /** Percentage of empty cells in a num_users x num_items matrix that holds num_entries entries. */
  private static double sparsityPercent(int num_users, int num_items, long num_entries) {
    // Multiply as long so that large user/item counts do not overflow int.
    long matrix_size = (long) num_users * num_items;
    long empty_size = matrix_size - num_entries;
    return (double) 100L * empty_size / matrix_size;
  }

  /** Count the IDs in candidates that do not occur in known. */
  private static int countUnknown(Iterable<Integer> candidates, Iterable<Integer> known) {
    // Index the known IDs once so that each lookup is a hash probe instead of a list scan.
    Set<Integer> known_ids = new HashSet<Integer>();
    for (Integer id : known) {
      known_ids.add(id);
    }

    int count = 0;
    for (Integer id : candidates) {
      if (!known_ids.contains(id)) {
        count++;
      }
    }
    return count;
  }
}