* RapidMiner
* Copyright (C) 2001-2014 by RapidMiner and the contributors
* Complete list of developers available at our web site:
* http://rapidminer.com
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
package com.rapidminer.operator.features.selection;
import java.util.LinkedList;
import java.util.List;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.features.FeatureOperator;
import com.rapidminer.operator.features.Individual;
import com.rapidminer.operator.features.KeepBest;
import com.rapidminer.operator.features.Population;
import com.rapidminer.operator.features.PopulationOperator;
import com.rapidminer.operator.features.RedundanceRemoval;
import com.rapidminer.operator.ports.metadata.ExampleSetMetaData;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.parameter.UndefinedParameterError;
import com.rapidminer.parameter.conditions.BooleanParameterCondition;
* <p>
* This operator realizes the two deterministic greedy feature selection
* algorithms forward selection and backward elimination. However, we added some
* enhancements to the standard algorithms which are described below:
* </p>
* <h4>Forward Selection</h4>
* <ol>
* <li>Create an initial population with {@rapidminer.math n} individuals where
* {@rapidminer.math n} is the input example set's number of attributes. Each
* individual will use exactly one of the features.</li>
* <li>Evaluate the attribute sets and select only the best {@rapidminer.math k}.</li>
* <li>For each of the {@rapidminer.math k} attribute sets do: If there are
* {@rapidminer.math j} unused attributes, make {@rapidminer.math j} copies of the attribute
* set and add exactly one of the previously unused attributes to the attribute
* set.</li>
* <li>As long as the performance improved in the last {@rapidminer.math p}
* iterations go to 2</li>
* </ol>
* <h4>Backward Elimination</h4>
* <ol>
* <li>Start with an attribute set which uses all features.</li>
* <li>Evaluate all attribute sets and select the best {@rapidminer.math k}.</li>
* <li>For each of the {@rapidminer.math k} attribute sets do: If there are
* {@rapidminer.math j} attributes used, make {@rapidminer.math j} copies of the attribute
* set and remove exactly one of the previously used attributes from the
* attribute set.</li>
* <li>As long as the performance improved in the last {@rapidminer.math p}
* iterations go to 2</li>
* </ol>
* <p>
* The parameter {@rapidminer.math k} can be specified by the parameter
* <code>keep_best</code>, the parameter {@rapidminer.math p} can be specified by
* the parameter <code>generations_without_improval</code>. These parameters
* both default to 1, which means that the standard selection algorithms are
* used. Using other values increases the runtime but might help to avoid local
* extrema in the search for the global optimum.
* </p>
* <p>
* Another unusual parameter is <code>maximum_number_of_generations</code>.
* This parameter bounds the number of iterations to this maximum of feature
* selections / deselections. In combination with
* <code>generations_without_improval</code> this allows several different
* selection schemes (which are described for forward selection, backward
* elimination works analogously):
* <ul>
* <li><code>maximum_number_of_generations</code> = {@rapidminer.math m} and
* <code>generations_without_improval</code> = {@rapidminer.math p}: Selects
* at most {@rapidminer.math m} features. The selection stops if no performance
* improvement was measured in the last {@rapidminer.math p} generations.</li>
* <li><code>maximum_number_of_generations</code> = {@rapidminer.math -1} and
* <code>generations_without_improval</code> = {@rapidminer.math p}: Tries to
* select new features until no performance improvement was measured in the
* last {@rapidminer.math p} generations.</li>
* <li><code>maximum_number_of_generations</code> = {@rapidminer.math m} and
* <code>generations_without_improval</code> = {@rapidminer.math -1}: Selects
* at most {@rapidminer.math m} features. The selection is not stopped until all
* combinations with at most {@rapidminer.math m} features were tried. However, the
* result might contain fewer features than that.</li>
* <li><code>maximum_number_of_generations</code> = {@rapidminer.math -1} and
* <code>generations_without_improval</code> = {@rapidminer.math -1}: Test all
* combinations of attributes (brute force, this might take a very long time and
* should only be applied to small attribute sets).</li>
* </ul>
* </p>
* @author Simon Fischer, Ingo Mierswa
public class FeatureSelectionOperator extends FeatureOperator {
/** The parameter name for "Forward selection or backward elimination." */
public static final String PARAMETER_SELECTION_DIRECTION = "selection_direction";
/** The parameter name for "Keep the best n individuals in each generation." */
public static final String PARAMETER_KEEP_BEST = "keep_best";
/** The parameter name for "Stop after n generations without improval of the performance." */
public static final String PARAMETER_GENERATIONS_WITHOUT_IMPROVAL = "generations_without_improval";
/**
 * The parameter name for the boolean switch that enables the
 * {@link #PARAMETER_GENERATIONS_WITHOUT_IMPROVAL} stopping criterion (the
 * integer parameter is only shown while this switch is set).
 */
public static final String PARAMETER_LIMIT_GENERATIONS_WITHOUT_IMPROVAL = "limit_generations_without_improval";
/** The parameter name for "Defines if the number of generations should be limited on a specific number." */
public static final String PARAMETER_LIMIT_NUMBER_OF_GENERATIONS = "limit_number_of_generations";
/** The parameter name for "Defines the maximum amount of generations." */
public static final String PARAMETER_MAXIMUM_NUMBER_OF_GENERATIONS = "maximum_number_of_generations";
/** Value of the selection direction parameter that selects forward selection (index of "forward" in DIRECTIONS). */
public static final int FORWARD_SELECTION = 0;
/** Value of the selection direction parameter that selects backward elimination (index of "backward" in DIRECTIONS). */
public static final int BACKWARD_ELIMINATION = 1;
/** Category labels of the selection direction parameter; the positions match FORWARD_SELECTION and BACKWARD_ELIMINATION. */
private static final String[] DIRECTIONS = { "forward", "backward" };
/**
 * Number of generations without improvement after which the search stops;
 * the stopping check only applies it while it is greater than 0.
 * NOTE(review): it is assigned outside the code visible here, presumably from
 * the parameters in doWork() -- confirm.
 */
private int generationsWOImp;
/**
 * Generation limit used by the stopping check. A non-positive value is
 * replaced by a default derived from the attribute count when the
 * pre-evaluation operators are created.
 */
private int maxGenerations;
/**
 * Creates the operator from the given operator description.
 * NOTE(review): the constructor body is not visible here; presumably it only
 * passes the description on to the superclass -- confirm.
 *
 * @param description the description this operator is created from
 */
public FeatureSelectionOperator(OperatorDescription description) {
protected ExampleSetMetaData modifyInnerOutputExampleSet(ExampleSetMetaData metaData) {
return metaData;
protected ExampleSetMetaData modifyOutputExampleSet(ExampleSetMetaData metaData) {
return metaData;
/**
 * Runs this operator.
 * NOTE(review): the method body is not visible here; presumably it initializes
 * generationsWOImp and maxGenerations from the operator parameters before the
 * actual search starts -- confirm.
 *
 * @throws OperatorException as declared by the signature
 */
public void doWork() throws OperatorException {
/**
 * Supplies the default of the selection direction parameter; the value is
 * passed to the category parameter as the default index into DIRECTIONS.
 * NOTE(review): the method body is not visible here; the method is
 * package-private, presumably so that variants in this package can pick a
 * different default -- confirm.
 *
 * @return the default direction index
 */
int getDefaultDirection() {
* Let <tt>es</tt> have <i>n</i> features. The initial population
* contains (depending on whether forward selection or backward elimination
* is used) either
* <ul>
* <li><i>n</i> elements with exactly 1 feature switched on or
* <li>1 element with all <i>n</i> features switched on.
* </ul>
public Population createInitialPopulation(ExampleSet es) throws UndefinedParameterError {
int direction = getParameterAsInt(PARAMETER_SELECTION_DIRECTION);
Population initP = new Population();
if (direction == FORWARD_SELECTION) {
for (int a = 0; a < es.getAttributes().size(); a++) {
double[] weights = new double[es.getAttributes().size()];
weights[a] = 1.0d;
initP.add(new Individual(weights));
} else {
double[] weights = new double[es.getAttributes().size()];
for (int a = 0; a < es.getAttributes().size(); a++) {
weights[a] = 1.0d;
initP.add(new Individual(weights));
return initP;
* The operator performs three steps:
* <ol>
* <li>kick out all but the <tt>keep_best</tt> individuals
* <li>forward selection/backward elimination
* <li>remove redundant individuals
* </ol>
public List<PopulationOperator> getPreEvaluationPopulationOperators(ExampleSet input) throws OperatorException {
int direction = getParameterAsInt(PARAMETER_SELECTION_DIRECTION);
int keepBest = getParameterAsInt(PARAMETER_KEEP_BEST);
List<PopulationOperator> preOp = new LinkedList<PopulationOperator>();
preOp.add(new KeepBest(keepBest));
if (direction == FORWARD_SELECTION) {
preOp.add(new ForwardSelection());
if (this.maxGenerations <= 0)
this.maxGenerations = input.getAttributes().size() - 1;
this.maxGenerations--; // ensures the correct number of
// features
} else {
preOp.add(new BackwardElimination());
if (this.maxGenerations <= 0)
this.maxGenerations = input.getAttributes().size();
preOp.add(new RedundanceRemoval());
return preOp;
/** empty list */
public List<PopulationOperator> getPostEvaluationPopulationOperators(ExampleSet input) throws OperatorException {
return new LinkedList<PopulationOperator>();
* Returns true if the population is empty, if the performance did not improve
* within the configured number of generations, or if the maximum number of
* generations has been reached.
public boolean solutionGoodEnough(Population pop) throws OperatorException {
return pop.empty() || ((generationsWOImp > 0) && (pop.getGenerationsWithoutImproval() >= generationsWOImp)) || (pop.getGeneration() >= maxGenerations);
public List<ParameterType> getParameterTypes() {
List<ParameterType> types = new LinkedList<ParameterType>();
ParameterType type = new ParameterTypeCategory(PARAMETER_SELECTION_DIRECTION, "Forward selection or backward elimination.", DIRECTIONS, getDefaultDirection());
type = new ParameterTypeBoolean(PARAMETER_LIMIT_GENERATIONS_WITHOUT_IMPROVAL, "Indicates if the optimization should be aborted if this number of generations showed no improvement. If unchecked, always the maximal number of generations will be used.", true);
type = new ParameterTypeInt(PARAMETER_GENERATIONS_WITHOUT_IMPROVAL, "Stop after n generations without improval of the performance.", 1, Integer.MAX_VALUE, 1);
type.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_LIMIT_GENERATIONS_WITHOUT_IMPROVAL, false, true));
types.add(new ParameterTypeBoolean(PARAMETER_LIMIT_NUMBER_OF_GENERATIONS, "Defines if the number of generations should be limited on a specific number.", false, false));
type = new ParameterTypeInt(PARAMETER_MAXIMUM_NUMBER_OF_GENERATIONS, "Defines the maximum amount of generations.", 1, Integer.MAX_VALUE, 10);
type.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_LIMIT_NUMBER_OF_GENERATIONS, true, true));
types.add(new ParameterTypeInt(PARAMETER_KEEP_BEST, "Keep the best n individuals in each generation.", 1, Integer.MAX_VALUE, 1));
return types;