// Copyright (C) 2010, 2011 Zeno Gantner, Chris Newell
//
// This file is part of MyMediaLite.
//
// MyMediaLite is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// MyMediaLite is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with MyMediaLite.  If not, see <http://www.gnu.org/licenses/>.

package org.mymedialite.data;

import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntList;

import java.util.ArrayList;
import java.util.List;

/**
 * k-fold cross-validation split for rating prediction.
 *
 * Please note that k-fold cross-validation is not the best/most realistic way of evaluating
 * recommender system algorithms. In particular, chronological splits (see RatingsChronologicalSplit)
 * are more realistic.
 *
 * The dataset must not be modified after the split - this would lead to undefined behavior.
 * @version 2.03
 */
public class RatingCrossValidationSplit implements ISplit<IRatings> {

  private final int numberOfFolds;

  private final List<IRatings> train;

  private final List<IRatings> test;

  /**
   * Get the number of folds in this split.
   * @return the number of folds passed to the constructor
   */
  public int numberOfFolds() {
    return numberOfFolds;
  }

  /**
   * Get the training parts of the split.
   * @return one training dataset per fold; entry i belongs to the same fold as entry i of {@link #test()}
   */
  public List<IRatings> train() {
    return train;
  }

  /**
   * Get the test parts of the split.
   * @return one test dataset per fold; entry i belongs to the same fold as entry i of {@link #train()}
   */
  public List<IRatings> test() {
    return test;
  }

  /**
   * Create a k-fold split of rating prediction data.
   *
   * Every index delivered by {@code ratings.randomIndex()} ends up in the test part of exactly one
   * fold and in the training part of all other folds.
   *
   * @param ratings the dataset
   * @param num_folds the number of folds
   * @throws IllegalArgumentException if num_folds is less than 2
   */
  public RatingCrossValidationSplit(IRatings ratings, int num_folds) {
    if (num_folds < 2) {
      throw new IllegalArgumentException("num_folds must be at least 2.");
    }

    this.numberOfFolds = num_folds;

    // Randomize
    // NOTE(review): presumably a shuffled permutation of all rating indices - confirm against IRatings.
    List<Integer> random_indices = ratings.randomIndex();

    // Create index lists
    List<IntList> train_indices = new ArrayList<IntList>(num_folds);
    List<IntList> test_indices  = new ArrayList<IntList>(num_folds);

    for (int fold = 0; fold < num_folds; fold++) {
      train_indices.add(new IntArrayList());
      test_indices.add(new IntArrayList());
    }

    // Assign indices to folds.
    // The test fold is chosen from the POSITION in the shuffled list, not from the index value
    // itself; otherwise fold membership would always be "index modulo num_folds" and the
    // shuffle would only influence the order of the entries inside each fold.
    int position = 0;
    for (int index : random_indices) {
      int test_fold = position % num_folds;
      for (int fold = 0; fold < num_folds; fold++) {
        if (fold == test_fold) {
          test_indices.get(fold).add(index);
        } else {
          train_indices.get(fold).add(index);
        }
      }
      position++;
    }

    // Create split data structures
    train = new ArrayList<IRatings>(num_folds);
    test  = new ArrayList<IRatings>(num_folds);

    for (int fold = 0; fold < num_folds; fold++) {
      train.add(createProxy(ratings, train_indices.get(fold)));
      test.add(createProxy(ratings, test_indices.get(fold)));
    }
  }

  /**
   * Build the proxy object for one part of a fold.
   * A TimedRatingsProxy is used when the dataset is an ITimedRatings, a RatingsProxy otherwise.
   */
  private static IRatings createProxy(IRatings ratings, IntList indices) {
    if (ratings instanceof ITimedRatings) {
      return new TimedRatingsProxy((ITimedRatings) ratings, indices);
    }
    return new RatingsProxy(ratings, indices);
  }

}