// Copyright (C) 2012 Chris Newell // // This file is part of MyMediaLite. // // MyMediaLite is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // MyMediaLite is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with MyMediaLite. If not, see <http://www.gnu.org/licenses/>. package org.mymedialite.io; import java.io.BufferedReader; import java.io.FileReader; import java.io.IOException; import org.mymedialite.data.IEntityMapping; import org.mymedialite.datatype.SparseBooleanMatrix; /** * Class that offers static methods to read hierarchical attribute data into SparseBooleanMatrix objects. * * @version 2.03 */ public class HierarchicalAttributeData { // Prevent instantiation. private HierarchicalAttributeData() {} private static final String separator = "/"; /** * Read hierachical attribute data from a file. * * Each line must consist of an item identifier followed by one or more hierarchical attributes * which are represented by a path. The path is divided into hierarchical nodes by a "/" character. * character. Leading and trailing separator characters are ignored. * * There can be one or more lines per item. Empty lines are ignored. * * See Constants.SPLIT_CHARS for details of the permissible field separators. * * @param filename the name of the file to be read from * @param itemMapping the mapping object for the given entity type * @param attributeMapping the mapping object for the attributes * @return the attribute data */ public static SparseBooleanMatrix read(String filename, IEntityMapping itemMapping, IEntityMapping attributeMapping) throws IOException { try { BufferedReader reader = new BufferedReader(new FileReader(filename)); return read(reader, itemMapping, attributeMapping); } catch (IOException e) { throw new IOException("Unable to read file " + filename + ": " + e.getMessage()); } } /** * Read hierachical attribute data from a BufferedReader. * * Each line must consist of an item identifier followed by one or more hierarchical attributes * which are represented by a path. The path is divided into hierarchical nodes by a "/" character. * character. Leading and trailing separator characters are ignored. * * There can be one or more lines per item. Empty lines are ignored. * * See Constants.SPLIT_CHARS for details of the permissible field separators. * * @param reader a BufferedReader to be read from * @param itemMapping the mapping object for the given entity type * @param attributeMapping the mapping object for the attributes * @return the attribute data */ public static SparseBooleanMatrix read(BufferedReader reader, IEntityMapping itemMapping, IEntityMapping attributeMapping) throws IOException { SparseBooleanMatrix matrix = new SparseBooleanMatrix(); String line; while ((line = reader.readLine()) != null) { line = line.trim(); if (line.length() == 0) continue; String[] tokens = line.split(Constants.SPLIT_CHARS, 0); if (tokens.length < 2) throw new IOException("Expected at least 2 columns: " + line); int entity_id = itemMapping.toInternalID(tokens[0]); for(int i = 1; i < tokens.length; i++) { String path = trim(tokens[i]); Integer attr_id = attributeMapping.toInternalID(path); matrix.set(entity_id, attr_id, true); int index; while((index = path.lastIndexOf(separator)) > 0) { path = path.substring(0, index); Integer parent_id = attributeMapping.toInternalID(path); matrix.set(entity_id, parent_id, true); } } } return matrix; } private static String trim(String path) { int start = 0; int length = path.length(); if(path.indexOf(separator) == 0) start++; if(path.lastIndexOf(separator) == length - 1) length--; //return path.substring(start, length); // TODO remove this path = path.substring(start, length); if(path.startsWith("genres/")) path = path.substring(7); if(path.startsWith("formats/")) path = path.substring(8); if(path.startsWith("people/")) path = path.substring(7); if(path.startsWith("subjects/")) path = path.substring(9); if(path.startsWith("places/")) path = path.substring(7); if(path.startsWith("topics/")) path = path.substring(7); return path; } }