/*
* RapidMiner
*
* Copyright (C) 2001-2014 by RapidMiner and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapidminer.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.io;
import java.io.File;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import javax.xml.parsers.ParserConfigurationException;
import org.xml.sax.SAXException;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.DataRowFactory;
import com.rapidminer.example.table.ExampleTable;
import com.rapidminer.example.table.FileDataRowReader;
import com.rapidminer.example.table.MemoryExampleTable;
import com.rapidminer.gui.wizards.ExampleSourceConfigurationWizardCreator;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.ports.metadata.AttributeMetaData;
import com.rapidminer.operator.ports.metadata.ExampleSetMetaData;
import com.rapidminer.operator.ports.metadata.MetaData;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeAttributeFile;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.ParameterTypeConfiguration;
import com.rapidminer.parameter.ParameterTypeDouble;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.parameter.ParameterTypeString;
import com.rapidminer.parameter.conditions.BooleanParameterCondition;
import com.rapidminer.tools.RandomGenerator;
import com.rapidminer.tools.att.AttributeDataSource;
import com.rapidminer.tools.att.AttributeDataSources;
import com.rapidminer.tools.att.AttributeSet;
import com.rapidminer.tools.io.Encoding;
/**
* <p>
* This operator reads an example set from (a) file(s). You can probably use the default parameter values for most
* file formats (including the format produced by the ExampleSetWriter, CSV, ...). Please refer to section
* {@rapidminer.ref sec:inputfiles|First steps/File formats} for details on the attribute description file set by the
* parameter <var>attributes</var> used to specify attribute types. You can use the wizard of this operator or the tool
* Attribute Editor in order to create those meta data .aml files for your datasets.
* </p>
*
* <p>
* This operator supports the reading of data from multiple source files. Each attribute (including special attributes
* like labels, weights, ...) might be read from another file. Please note that only the minimum number of lines of all
* files will be read, i.e. if one of the data source files has fewer lines than the others, only this number of examples
* will be read.
* </p>
*
* <p>
* The split points can be defined with regular expressions (please refer to the annex of the RapidMiner tutorial for an
* overview). The default split parameter ",\s*|;\s*|\s+" should work for most file formats. This regular
* expression describes the following column separators
* <ul>
* <li>the character "," followed by a whitespace of arbitrary length (also no white space)</li>
* <li>the character ";" followed by a whitespace of arbitrary length (also no white space)</li>
* <li>a whitespace of arbitrary length (min. 1)</li>
* </ul>
* Alternatives are combined with "|" (logical OR). Other useful separators might be "\t" for tabs, " " for a single
* space, and "\s" for any whitespace.
* </p>
*
* <p>
* Quoting is also possible with ". You can escape quotes with a backslash, i.e. \". Please note that you can
* change these characters by adjusting the corresponding settings.
* </p>
*
* <p>
* Additionally you can specify comment characters which can be used at arbitrary locations of the data lines. Any
* content after the comment character will be ignored. Unknown attribute values can be marked with empty strings (if
* this is possible for your column separators) or by a question mark (recommended).
* </p>
*
* @author Simon Fischer, Ingo Mierswa
*/
public class ExampleSource extends AbstractExampleSource {

    /**
     * The parameter name for "Filename for the XML attribute description file. This file also contains the names
     * of the files to read the data from."
     */
    public static final String PARAMETER_ATTRIBUTES = "attributes";

    static {
        // Registers this operator as the reader for the "aml" key, using the attributes parameter as file parameter.
        AbstractReader.registerReaderDescription(new ReaderDescription("aml", ExampleSource.class, PARAMETER_ATTRIBUTES));
    }

    /**
     * The parameter name for "The fraction of the data set which should be read (1 = all; only used if sample_size
     * = -1)"
     */
    public static final String PARAMETER_SAMPLE_RATIO = "sample_ratio";

    /**
     * The parameter name for "The exact number of samples which should be read (-1 = use sample ratio; if not -1,
     * sample_ratio will not have any effect)"
     */
    public static final String PARAMETER_SAMPLE_SIZE = "sample_size";

    /** The parameter name for "Indicates if the loaded data should be permutated." */
    public static final String PARAMETER_PERMUTATE = "permute";

    /** The parameter name for "Column separators for data files (regular expression)" */
    public static final String PARAMETER_COLUMN_SEPARATORS = "column_separators";

    /** The parameter name for "Indicates if a comment character should be used." */
    public static final String PARAMETER_USE_COMMENT_CHARACTERS = "use_comment_characters";

    /** The parameter name for "Any content in a line after one of these characters will be ignored." */
    public static final String PARAMETER_COMMENT_CHARS = "comment_chars";

    /** The parameter name for "Character that is used as decimal point." */
    public static final String PARAMETER_DECIMAL_POINT_CHARACTER = "decimal_point_character";

    /** The parameter name for "Indicates if quotes should be regarded." */
    public static final String PARAMETER_USE_QUOTES = "use_quotes";

    /** Specifies the used quoting character. */
    public static final String PARAMETER_QUOTE_CHARACTER = "quote_character";

    /** Specifies the used character for escaping quoting. */
    public static final String PARAMETER_QUOTING_ESCAPE_CHARACTER = "quoting_escape_character";

    /** Indicates if the lines should be trimmed during reading. */
    public static final String PARAMETER_TRIM_LINES = "trim_lines";

    /** Indicates if lines leading to errors should be skipped. */
    public static final String PARAMETER_SKIP_ERROR_LINES = "skip_error_lines";

    /** The parameter name for "Determines, how the data is represented internally." */
    public static final String PARAMETER_DATAMANAGEMENT = "datamanagement";

    /**
     * Creates the operator; the description is handed to the superclass unchanged.
     *
     * @param description
     *            the operator description passed on to {@link AbstractExampleSource}
     */
    public ExampleSource(OperatorDescription description) {
        super(description);
    }

    /**
     * Derives the meta data of the example set from the attribute description file without reading any data
     * rows. Special attributes are added first (with their role set to the key under which the attribute set
     * lists them), followed by the regular attributes.
     *
     * @return an empty {@link ExampleSetMetaData} if the attributes parameter yields no file, otherwise the meta
     *         data built from the attribute description file
     * @throws OperatorException
     *             if the attribute description file cannot be read or parsed
     */
    @Override
    public MetaData getGeneratedMetaData() throws OperatorException {
        getLogger().fine("Generating meta data for " + this.getName());

        File attributeFile = getParameterAsFile(PARAMETER_ATTRIBUTES);
        if (attributeFile == null) {
            return new ExampleSetMetaData();
        }

        AttributeSet attributeSet = new AttributeSet(loadAttributeDataSources(attributeFile));
        ExampleSetMetaData emd = new ExampleSetMetaData();
        for (Map.Entry<String, Attribute> special : attributeSet.getSpecialAttributes().entrySet()) {
            AttributeMetaData specialMetaData = new AttributeMetaData(special.getValue());
            specialMetaData.setRole(special.getKey());
            emd.addAttribute(specialMetaData);
        }
        for (Attribute regular : attributeSet.getRegularAttributes()) {
            emd.addAttribute(new AttributeMetaData(regular));
        }
        return emd;
    }

    /**
     * Always returns {@code true}.
     * NOTE(review): presumably this allows the superclass to cache the generated meta data; confirm against
     * the superclass.
     */
    @Override
    protected boolean isMetaDataCacheable() {
        return true;
    }

    /**
     * Reads the attribute description file, builds a {@link FileDataRowReader} from the operator parameters and
     * loads the rows into a {@link MemoryExampleTable}.
     *
     * @return the example set created from the in-memory table and the attribute set of the description file
     * @throws OperatorException
     *             as {@link UserError} 302 if an {@link IOException} occurs while reading the description file or
     *             while creating the row reader, as {@link UserError} 401 if the description file cannot be
     *             parsed; exceptions raised by parameter access are passed on unchanged
     */
    @Override
    public ExampleSet createExampleSet() throws OperatorException {
        File attributeFile = getParameterAsFile(PARAMETER_ATTRIBUTES);
        AttributeDataSources attributeDataSources = loadAttributeDataSources(attributeFile);

        // null is handed to the reader when comment handling is switched off
        char[] commentCharacters = null;
        if (getParameterAsBoolean(PARAMETER_USE_COMMENT_CHARACTERS)) {
            commentCharacters = getParameterAsString(PARAMETER_COMMENT_CHARS).toCharArray();
        }

        FileDataRowReader reader;
        try {
            reader = new FileDataRowReader(
                    new DataRowFactory(getParameterAsInt(PARAMETER_DATAMANAGEMENT), getParameterAsString(PARAMETER_DECIMAL_POINT_CHARACTER).charAt(0)),
                    attributeDataSources.getDataSources(),
                    getParameterAsDouble(PARAMETER_SAMPLE_RATIO),
                    getParameterAsInt(PARAMETER_SAMPLE_SIZE),
                    getParameterAsString(PARAMETER_COLUMN_SEPARATORS),
                    commentCharacters,
                    getParameterAsBoolean(PARAMETER_USE_QUOTES),
                    getParameterAsString(PARAMETER_QUOTE_CHARACTER).charAt(0),
                    getParameterAsString(PARAMETER_QUOTING_ESCAPE_CHARACTER).charAt(0),
                    getParameterAsBoolean(PARAMETER_TRIM_LINES),
                    getParameterAsBoolean(PARAMETER_SKIP_ERROR_LINES),
                    // only null if old version of description format: Then emulate old behavior using root operator
                    attributeDataSources.getEncoding() == null ? Encoding.getEncoding(this) : attributeDataSources.getEncoding(),
                    RandomGenerator.getRandomGenerator(getParameterAsBoolean(RandomGenerator.PARAMETER_USE_LOCAL_RANDOM_SEED), getParameterAsInt(RandomGenerator.PARAMETER_LOCAL_RANDOM_SEED)));
        } catch (IOException e) {
            throw new UserError(this, e, 302, new Object[] { attributeFile, e.getMessage() });
        }

        AttributeSet attributeSet = new AttributeSet(attributeDataSources);
        ExampleTable table = new MemoryExampleTable(attributeSet.getAllAttributes(), reader, getParameterAsBoolean(PARAMETER_PERMUTATE));
        return table.createExampleSet(attributeSet);
    }

    /**
     * Always returns {@code true}.
     * NOTE(review): presumably this makes the superclass offer an encoding parameter; confirm against the
     * superclass.
     */
    @Override
    protected boolean supportsEncoding() {
        return true;
    }

    /**
     * Assembles the parameter list of this operator: the configuration wizard, the attribute description file,
     * the sampling, parsing and data management settings, then the parameters of the superclass and finally the
     * random generator parameters.
     *
     * @return a new list containing the parameter types in the order described above
     */
    @Override
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = new LinkedList<ParameterType>();

        ParameterType type = new ParameterTypeConfiguration(ExampleSourceConfigurationWizardCreator.class, this);
        type.setExpert(false);
        types.add(type);

        types.add(new ParameterTypeAttributeFile(PARAMETER_ATTRIBUTES, "Filename for the xml attribute description file. This file also contains the names of the files to read the data from.", false));

        // sampling and permutation
        type = new ParameterTypeDouble(PARAMETER_SAMPLE_RATIO, "The fraction of the data set which should be read (1 = all; only used if sample_size = -1)", 0.0d, 1.0d, 1.0d);
        type.setExpert(false);
        types.add(type);
        types.add(new ParameterTypeInt(PARAMETER_SAMPLE_SIZE, "The exact number of samples which should be read (-1 = use sample ratio; if not -1, sample_ratio will not have any effect)", -1, Integer.MAX_VALUE, -1));
        types.add(new ParameterTypeBoolean(PARAMETER_PERMUTATE, "Indicates if the loaded data should be permutated.", false));

        // number format and column splitting
        types.add(new ParameterTypeString(PARAMETER_DECIMAL_POINT_CHARACTER, "Character that is used as decimal point.", "."));
        types.add(new ParameterTypeString(PARAMETER_COLUMN_SEPARATORS, "Column separators for data files (regular expression)", ",\\s*|;\\s*|\\s+"));

        // comment characters; the character list is registered as depending on the boolean switch
        types.add(new ParameterTypeBoolean(PARAMETER_USE_COMMENT_CHARACTERS, "Indicates if a comment character should be used.", true));
        type = new ParameterTypeString(PARAMETER_COMMENT_CHARS, "Any content in a line after one of these characters will be ignored.", "#");
        type.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_USE_COMMENT_CHARACTERS, false, true));
        types.add(type);

        // quoting; quote and escape characters are registered as depending on the boolean switch
        types.add(new ParameterTypeBoolean(PARAMETER_USE_QUOTES, "Indicates if quotes should be regarded.", true));
        type = new ParameterTypeString(PARAMETER_QUOTE_CHARACTER, "Specifies the character which should be used for quoting.", "\"");
        type.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_USE_QUOTES, false, true));
        types.add(type);
        type = new ParameterTypeString(PARAMETER_QUOTING_ESCAPE_CHARACTER, "Specifies the character which should be used for escape the quoting.", "\\");
        type.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_USE_QUOTES, false, true));
        types.add(type);

        // line handling and internal data representation
        types.add(new ParameterTypeBoolean(PARAMETER_TRIM_LINES, "Indicates if lines should be trimmed (empty spaces are removed at the beginning and the end) before the column split is performed.", false));
        types.add(new ParameterTypeBoolean(PARAMETER_SKIP_ERROR_LINES, "Indicates if lines which can not be read should be skipped instead of letting this operator fail its execution.", false));
        types.add(new ParameterTypeCategory(PARAMETER_DATAMANAGEMENT, "Determines, how the data is represented internally.", DataRowFactory.TYPE_NAMES, DataRowFactory.TYPE_DOUBLE_ARRAY));

        types.addAll(super.getParameterTypes());
        types.addAll(RandomGenerator.getRandomGeneratorParameters(this));
        return types;
    }

    /**
     * Loads the attribute data sources from the given description file and converts every checked exception of
     * the loader into a {@link UserError}, so that meta data generation and data loading report identical
     * errors.
     *
     * @param attributeFile
     *            the attribute description file to load
     * @return the attribute data sources described by the file
     * @throws OperatorException
     *             as {@link UserError} 302 (with the file and the exception message) on an {@link IOException},
     *             as {@link UserError} 401 on XML, parser configuration or SAX problems; any operator exception
     *             thrown by the loader itself is passed on unchanged
     */
    private AttributeDataSources loadAttributeDataSources(File attributeFile) throws OperatorException {
        try {
            return AttributeDataSource.createAttributeDataSources(attributeFile, true, this);
        } catch (IOException e) {
            throw new UserError(this, e, 302, new Object[] { attributeFile, e.getMessage() });
        } catch (com.rapidminer.tools.XMLException e) {
            throw new UserError(this, e, 401, e.getMessage());
        } catch (ParserConfigurationException e) {
            throw new UserError(this, e, 401, e.toString());
        } catch (SAXException e) {
            throw new UserError(this, e, 401, e.toString());
        }
    }
}