ExampleSource.java example

/*
 *  RapidMiner
 *
 *  Copyright (C) 2001-2014 by RapidMiner and the contributors
 *
 *  Complete list of developers available at our web site:
 *
 *       http://rapidminer.com
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.operator.io;

import java.io.File;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import javax.xml.parsers.ParserConfigurationException;

import org.xml.sax.SAXException;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.DataRowFactory;
import com.rapidminer.example.table.ExampleTable;
import com.rapidminer.example.table.FileDataRowReader;
import com.rapidminer.example.table.MemoryExampleTable;
import com.rapidminer.gui.wizards.ExampleSourceConfigurationWizardCreator;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.ports.metadata.AttributeMetaData;
import com.rapidminer.operator.ports.metadata.ExampleSetMetaData;
import com.rapidminer.operator.ports.metadata.MetaData;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeAttributeFile;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.ParameterTypeConfiguration;
import com.rapidminer.parameter.ParameterTypeDouble;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.parameter.ParameterTypeString;
import com.rapidminer.parameter.conditions.BooleanParameterCondition;
import com.rapidminer.tools.RandomGenerator;
import com.rapidminer.tools.att.AttributeDataSource;
import com.rapidminer.tools.att.AttributeDataSources;
import com.rapidminer.tools.att.AttributeSet;
import com.rapidminer.tools.io.Encoding;

/**
 * <p>
 * This operator reads an example set from (a) file(s). Probably you can use the default parameter values for the most
 * file formats (including the format produced by the ExampleSetWriter, CSV, ...). Please refer to section
 * {@rapidminer.ref sec:inputfiles|First steps/File formats} for details on the attribute description file set by the
 * parameter <var>attributes</var> used to specify attribute types. You can use the wizard of this operator or the tool
 * Attribute Editor in order to create those meta data .aml files for your datasets.
 * </p>
 * 
 * <p>
 * This operator supports the reading of data from multiple source files. Each attribute (including special attributes
 * like labels, weights, ...) might be read from another file. Please note that only the minimum number of lines of all
 * files will be read, i.e. if one of the data source files has less lines than the others, only this number of examples
 * will be read.
 * </p>
 * 
 * <p>
 * The split points can be defined with regular expressions (please refer to the annex of the RapidMiner tutorial for an
 * overview). The default split parameter ",\s*|;\s*|\s+" should work for most file formats. This regular
 * expression describes the following column separators
 * <ul>
 * <li>the character "," followed by a whitespace of arbitrary length (also no white space)</li>
 * <li>the character ";" followed by a whitespace of arbitrary length (also no white space)</li>
 * <li>a whitespace of arbitrary length (min. 1)</li>
 * </ul>
 * A logical XOR is defined by "|". Other useful separators might be "\t" for tabulars, "
 * " for a single whitespace, and "\s" for any whitespace.
 * </p>
 * 
 * <p>
 * Quoting is also possible with ". You can escape quotes with a backslash, i.e. \". Please note that you can
 * change these characters by adjusting the corresponding settings.
 * </p>
 * 
 * <p>
 * Additionally you can specify comment characters which can be used at arbitrary locations of the data lines. Any
 * content after the comment character will be ignored. Unknown attribute values can be marked with empty strings (if
 * this is possible for your column separators) or by a question mark (recommended).
 * </p>
 * 
 * @author Simon Fischer, Ingo Mierswa
 */
public class ExampleSource extends AbstractExampleSource {

	/**
	 * The parameter name for "Filename for the XML attribute description file. This file also contains the names
	 * of the files to read the data from."
	 */
	public static final String PARAMETER_ATTRIBUTES = "attributes";

	static {
		AbstractReader.registerReaderDescription(new ReaderDescription("aml", ExampleSource.class, PARAMETER_ATTRIBUTES));
	}

	/**
	 * The parameter name for "The fraction of the data set which should be read (1 = all; only used if sample_size
	 * = -1)"
	 */
	public static final String PARAMETER_SAMPLE_RATIO = "sample_ratio";

	/**
	 * The parameter name for "The exact number of samples which should be read (-1 = use sample ratio; if not -1,
	 * sample_ratio will not have any effect)"
	 */
	public static final String PARAMETER_SAMPLE_SIZE = "sample_size";

	/** The parameter name for "Indicates if the loaded data should be permuted." */
	public static final String PARAMETER_PERMUTATE = "permute";

	/** The parameter name for "Column separators for data files (regular expression)" */
	public static final String PARAMETER_COLUMN_SEPARATORS = "column_separators";

	/** The parameter name for "Indicates if a comment character should be used" */
	public static final String PARAMETER_USE_COMMENT_CHARACTERS = "use_comment_characters";

	/** The parameter name for "Lines beginning with these characters are ignored." */
	public static final String PARAMETER_COMMENT_CHARS = "comment_chars";

	/** The parameter name for "Character that is used as decimal point." */
	public static final String PARAMETER_DECIMAL_POINT_CHARACTER = "decimal_point_character";

	/** The parameter name for "Indicates if quotes should be regarded (slower!)." */
	public static final String PARAMETER_USE_QUOTES = "use_quotes";

	/** Specifies the used quoting character. */
	public static final String PARAMETER_QUOTE_CHARACTER = "quote_character";

	/** Specifies the used character for escaping quoting. */
	public static final String PARAMETER_QUOTING_ESCAPE_CHARACTER = "quoting_escape_character";

	/** Indicates if the lines should be trimmed during reading. */
	public static final String PARAMETER_TRIM_LINES = "trim_lines";

	/** Indicates if lines leading to errors should be skipped. */
	public static final String PARAMETER_SKIP_ERROR_LINES = "skip_error_lines";

	/** The parameter name for "Determines, how the data is represented internally." */
	public static final String PARAMETER_DATAMANAGEMENT = "datamanagement";

	public ExampleSource(OperatorDescription description) {
		super(description);
	}

	@Override
	public MetaData getGeneratedMetaData() throws OperatorException {
		getLogger().fine("Generating meta data for " + this.getName());
		File attributeFile = getParameterAsFile(PARAMETER_ATTRIBUTES);
		if (attributeFile == null) {
			return new ExampleSetMetaData();
		}
		AttributeDataSources attributeDataSources;
		try {
			attributeDataSources = AttributeDataSource.createAttributeDataSources(attributeFile, true, this);
		} catch (IOException e) {
			throw new UserError(this, e, 302, new Object[] { attributeFile, e.getMessage() });
		} catch (com.rapidminer.tools.XMLException e) {
			throw new UserError(this, e, 401, e.getMessage());
		} catch (ParserConfigurationException e) {
			throw new UserError(this, e, 401, e.toString());
		} catch (SAXException e) {
			throw new UserError(this, e, 401, e.toString());
		}
		AttributeSet attributeSet = new AttributeSet(attributeDataSources);
		ExampleSetMetaData emd = new ExampleSetMetaData();
		for (Map.Entry<String, Attribute> entry : attributeSet.getSpecialAttributes().entrySet()) {
			AttributeMetaData a = new AttributeMetaData(entry.getValue());
			a.setRole(entry.getKey());
			emd.addAttribute(a);
		}
		for (Attribute attribute : attributeSet.getRegularAttributes()) {
			emd.addAttribute(new AttributeMetaData(attribute));
		}

		return emd;
	}

	@Override
	protected boolean isMetaDataCacheable() {
		return true;
	}

	@Override
	public ExampleSet createExampleSet() throws OperatorException {

		AttributeDataSources attributeDataSources = null;
		FileDataRowReader reader = null;

		File attributeFile = getParameterAsFile(PARAMETER_ATTRIBUTES);
		try {
			attributeDataSources = AttributeDataSource.createAttributeDataSources(attributeFile, true, this);
			char[] commentCharacters = null;
			if (getParameterAsBoolean(PARAMETER_USE_COMMENT_CHARACTERS)) {
				commentCharacters = getParameterAsString(PARAMETER_COMMENT_CHARS).toCharArray();
			}
			reader = new FileDataRowReader(
					new DataRowFactory(getParameterAsInt(PARAMETER_DATAMANAGEMENT), getParameterAsString(PARAMETER_DECIMAL_POINT_CHARACTER).charAt(0)), 
					attributeDataSources.getDataSources(), 
					getParameterAsDouble(PARAMETER_SAMPLE_RATIO), 
					getParameterAsInt(PARAMETER_SAMPLE_SIZE), 
					getParameterAsString(PARAMETER_COLUMN_SEPARATORS),
					commentCharacters, 
					getParameterAsBoolean(PARAMETER_USE_QUOTES), 
					getParameterAsString(PARAMETER_QUOTE_CHARACTER).charAt(0), 
					getParameterAsString(PARAMETER_QUOTING_ESCAPE_CHARACTER).charAt(0), 
					getParameterAsBoolean(PARAMETER_TRIM_LINES), 
					getParameterAsBoolean(PARAMETER_SKIP_ERROR_LINES), 
					// only null if old version of description format: Then emulate old behavior using root operator
					attributeDataSources.getEncoding() == null ? Encoding.getEncoding(this) : attributeDataSources.getEncoding(), 
							RandomGenerator.getRandomGenerator(getParameterAsBoolean(RandomGenerator.PARAMETER_USE_LOCAL_RANDOM_SEED), getParameterAsInt(RandomGenerator.PARAMETER_LOCAL_RANDOM_SEED)));
		} catch (IOException e) {
			throw new UserError(this, e, 302, new Object[] { attributeFile, e.getMessage() });
		} catch (com.rapidminer.tools.XMLException e) {
			throw new UserError(this, e, 401, e.getMessage());
		} catch (ParserConfigurationException e) {
			throw new UserError(this, e, 401, e.toString());
		} catch (SAXException e) {
			throw new UserError(this, e, 401, e.toString());
		}

		AttributeSet attributeSet = new AttributeSet(attributeDataSources);

		ExampleTable table = new MemoryExampleTable(attributeSet.getAllAttributes(), reader, getParameterAsBoolean(PARAMETER_PERMUTATE));
		ExampleSet result = table.createExampleSet(attributeSet);
		return result;
	}

	@Override
	protected boolean supportsEncoding() {
		return true;
	}

	@Override
	public List<ParameterType> getParameterTypes() {
		List<ParameterType> types = new LinkedList<ParameterType>();
		ParameterType type = new ParameterTypeConfiguration(ExampleSourceConfigurationWizardCreator.class, this);
		type.setExpert(false);
		types.add(type);
		types.add(new ParameterTypeAttributeFile(PARAMETER_ATTRIBUTES, "Filename for the xml attribute description file. This file also contains the names of the files to read the data from.", false));
		type = new ParameterTypeDouble(PARAMETER_SAMPLE_RATIO, "The fraction of the data set which should be read (1 = all; only used if sample_size = -1)", 0.0d, 1.0d, 1.0d);
		type.setExpert(false);
		types.add(type);
		types.add(new ParameterTypeInt(PARAMETER_SAMPLE_SIZE, "The exact number of samples which should be read (-1 = use sample ratio; if not -1, sample_ratio will not have any effect)", -1, Integer.MAX_VALUE, -1));
		types.add(new ParameterTypeBoolean(PARAMETER_PERMUTATE, "Indicates if the loaded data should be permutated.", false));

		types.add(new ParameterTypeString(PARAMETER_DECIMAL_POINT_CHARACTER, "Character that is used as decimal point.", "."));

		types.add(new ParameterTypeString(PARAMETER_COLUMN_SEPARATORS, "Column separators for data files (regular expression)", ",\\s*|;\\s*|\\s+"));

		types.add(new ParameterTypeBoolean(PARAMETER_USE_COMMENT_CHARACTERS, "Indicates if a comment character should be used.", true));
		type = new ParameterTypeString(PARAMETER_COMMENT_CHARS, "Any content in a line after one of these characters will be ignored.", "#");
		type.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_USE_COMMENT_CHARACTERS, false, true));
		types.add(type);

		types.add(new ParameterTypeBoolean(PARAMETER_USE_QUOTES, "Indicates if quotes should be regarded.", true));
		type = new ParameterTypeString(PARAMETER_QUOTE_CHARACTER, "Specifies the character which should be used for quoting.", "\"");
		type.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_USE_QUOTES, false, true));
		types.add(type);

		type = new ParameterTypeString(PARAMETER_QUOTING_ESCAPE_CHARACTER, "Specifies the character which should be used for escape the quoting.", "\\");
		type.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_USE_QUOTES, false, true));
		types.add(type);

		types.add(new ParameterTypeBoolean(PARAMETER_TRIM_LINES, "Indicates if lines should be trimmed (empty spaces are removed at the beginning and the end) before the column split is performed.", false));

		types.add(new ParameterTypeBoolean(PARAMETER_SKIP_ERROR_LINES, "Indicates if lines which can not be read should be skipped instead of letting this operator fail its execution.", false));

		types.add(new ParameterTypeCategory(PARAMETER_DATAMANAGEMENT, "Determines, how the data is represented internally.", DataRowFactory.TYPE_NAMES, DataRowFactory.TYPE_DOUBLE_ARRAY));

		types.addAll(super.getParameterTypes());
		types.addAll(RandomGenerator.getRandomGeneratorParameters(this));


		return types;
	}
}