//Copyright (C) 2010, 2011 Zeno Gantner, Chris
//
//This file is part of MyMediaLite.
//
//MyMediaLite is free software: you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation, either version 3 of the License, or
//(at your option) any later version.
//
//MyMediaLite is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with MyMediaLite. If not, see <http://www.gnu.org/licenses/>.
package org.mymedialite.data;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntList;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.List;
/**
 * Per-user chronological split for rating prediction.
 *
 * Chronological splits (splits according to the time of the rating) treat all ratings before
 * a certain time as training ratings, and the ones after that time as test/validation ratings.
 *
 * Here, the split date may differ from user to user.
 * In the constructor, you can either specify which part (ratio) or how many of a user's ratings
 * are supposed to be used for validation.
 *
 * This split always consists of a single fold: after construction, {@link #train} and
 * {@link #test} each hold exactly one element.
 *
 * The dataset must not be modified after the split - this would lead to undefined behavior.
 * @version 2.03
 */
public class RatingsPerUserChronologicalSplit implements ISplit<ITimedRatings> {

  /**
   * Training part of the split. Each constructor appends exactly one element.
   * Initialized eagerly so that the constructors can add to it.
   */
  public List<ITimedRatings> train = new ArrayList<ITimedRatings>();

  /**
   * Test part of the split. Each constructor appends exactly one element.
   * Initialized eagerly so that the constructors can add to it.
   */
  public List<ITimedRatings> test = new ArrayList<ITimedRatings>();

  /**
   * Create a chronological split of rating prediction data.
   *
   * If ratings have exactly the same date and time, and they are close to the threshold between
   * train and test, there is no guaranteed order between them (ties are broken according to how the
   * sorting procedure sorts the ratings).
   *
   * For each user, the number of test ratings is the user's rating count multiplied by
   * {@code ratio}, rounded to the nearest integer; the remaining ratings go to the training part.
   *
   * @param ratings the dataset
   * @param ratio the ratio of ratings to use for validation (per user); must be strictly
   *        between 0 and 1
   * @throws IllegalArgumentException if {@code ratio} is not strictly between 0 and 1
   */
  public RatingsPerUserChronologicalSplit(ITimedRatings ratings, double ratio) {
    // Written as a negated range check so that NaN is rejected as well.
    if (!(ratio > 0 && ratio < 1))
      throw new IllegalArgumentException("ratio must be between 0 and 1");

    IntList train_indices = new IntArrayList();
    IntList test_indices = new IntArrayList();

    // For every user, perform the split and assign the ratings accordingly
    for (int u : ratings.allUsers()) {
      List<Integer> chronological_index = sortedUserIndices(ratings, u);
      int num_test_ratings = (int) Math.round(chronological_index.size() * ratio);
      assignIndices(chronological_index, num_test_ratings, train_indices, test_indices);
    }

    // Create split data structures
    train.add(new TimedRatingsProxy(ratings, train_indices));
    test.add(new TimedRatingsProxy(ratings, test_indices));
  }

  /**
   * Create a chronological split of rating prediction data.
   *
   * If ratings have exactly the same date and time, and they are close to the threshold between
   * train and test, there is no guaranteed order between them (ties are broken according to how the
   * sorting procedure sorts the ratings).
   *
   * Users with fewer ratings than {@code num_test_ratings_per_user} contribute all of their
   * ratings to the test part and none to the training part.
   *
   * @param ratings the dataset
   * @param num_test_ratings_per_user the number of test ratings (per user); must not be negative
   * @throws IllegalArgumentException if {@code num_test_ratings_per_user} is negative
   */
  public RatingsPerUserChronologicalSplit(ITimedRatings ratings, int num_test_ratings_per_user) {
    if (num_test_ratings_per_user < 0)
      throw new IllegalArgumentException("num_test_ratings_per_user must not be negative");

    IntList train_indices = new IntArrayList();
    IntList test_indices = new IntArrayList();

    // For every user, perform the split and assign the ratings accordingly
    for (int u : ratings.allUsers()) {
      List<Integer> chronological_index = sortedUserIndices(ratings, u);
      int num_test_ratings = Math.min(num_test_ratings_per_user, chronological_index.size());
      assignIndices(chronological_index, num_test_ratings, train_indices, test_indices);
    }

    // Create split data structures
    train.add(new TimedRatingsProxy(ratings, train_indices));
    test.add(new TimedRatingsProxy(ratings, test_indices));
  }

  /**
   * Get the number of folds of this split.
   *
   * @return always 1
   */
  public int numberOfFolds() {
    return 1;
  }

  /**
   * Get the training part of the split.
   *
   * @return the {@link #train} list
   */
  @Override
  public List<ITimedRatings> train() {
    return train;
  }

  /**
   * Get the test part of the split.
   *
   * @return the {@link #test} list
   */
  @Override
  public List<ITimedRatings> test() {
    return test;
  }

  /**
   * Return a sorted copy of one user's rating indices, ordered by using the dataset itself as
   * the comparator (presumably oldest rating first - confirm against the ITimedRatings
   * implementation).
   *
   * A copy is sorted rather than the list returned by {@code byUser()}, so that creating a
   * split does not reorder the dataset's own per-user index.
   */
  private static List<Integer> sortedUserIndices(ITimedRatings ratings, int user_id) {
    List<Integer> sorted_indices = new ArrayList<Integer>(ratings.byUser().get(user_id));
    Collections.sort(sorted_indices, ratings);
    return sorted_indices;
  }

  /**
   * Append the last {@code num_test_ratings} entries of {@code sorted_indices} to
   * {@code test_indices} and all entries before them to {@code train_indices}.
   */
  private static void assignIndices(List<Integer> sorted_indices, int num_test_ratings,
                                    IntList train_indices, IntList test_indices) {
    int num_train_ratings = sorted_indices.size() - num_test_ratings;

    // Assign indices to training part
    for (int i = 0; i < num_train_ratings; i++)
      train_indices.add(sorted_indices.get(i));

    // Assign indices to test part
    for (int i = num_train_ratings; i < sorted_indices.size(); i++)
      test_indices.add(sorted_indices.get(i));
  }
}