/*
 * RapidMiner
 *
 * Copyright (C) 2001-2014 by RapidMiner and the contributors
 *
 * Complete list of developers available at our web site:
 *
 * http://rapidminer.com
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.operator.learner.meta;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.Vector;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.Attributes;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.set.SplittedExampleSet;
import com.rapidminer.operator.Model;
import com.rapidminer.operator.OperatorChain;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.ValueDouble;
import com.rapidminer.operator.learner.PredictionModel;
import com.rapidminer.operator.ports.InputPort;
import com.rapidminer.operator.ports.OutputPort;
import com.rapidminer.operator.ports.metadata.AttributeMetaData;
import com.rapidminer.operator.ports.metadata.ExampleSetMetaData;
import com.rapidminer.operator.ports.metadata.ExampleSetPassThroughRule;
import com.rapidminer.operator.ports.metadata.GeneratePredictionModelTransformationRule;
import com.rapidminer.operator.ports.metadata.SetRelation;
import com.rapidminer.operator.ports.metadata.SubprocessTransformRule;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeDouble;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.RandomGenerator;
import com.rapidminer.tools.Tools;
import com.rapidminer.tools.container.Pair;
/**
 * Subgroup discovery learner.
 *
 * @author Martin Scholz
 */
public class SDRulesetInduction extends OperatorChain {
private InputPort exampleSetInput = getInputPorts().createPort("training set", ExampleSet.class);
private OutputPort trainingInnerSource = getSubprocess(0).getInnerSources().createPort("training set");
private InputPort modelInnerSink = getSubprocess(0).getInnerSinks().createPort("model", PredictionModel.class);
private OutputPort modelOutput = getOutputPorts().createPort("model");
* Name of the variable specifying the maximal number of iterations of the
* learner.
public static final String PARAMETER_ITERATIONS = "iterations";
/** Name of the flag indicating internal bootstrapping. */
public static final String PARAMETER_RATIO_INTERNAL_BOOTSTRAP = "ratio_internal_bootstrap";
* A parameter whether to discard all rules not lying on the convex hull in
* ROC space.
public static final String PARAMETER_ROC_CONVEX_HULL_FILTER = "ROC_convex_hull_filter";
* Boolean parameter: true for additive reweighting, false for
* multiplicative.
public static final String PARAMETER_ADDITIVE_REWEIGHT = "additive_reweight";
* Boolean parameter to specify whether the label priors should be equally
* likely after first iteration.
public static final String PARAMETER_GAMMA = "gamma";
* Name of special attribute counting the times an example has been covered
* by a rule. This attribute is created for additive reweighting, only.
/** Discard models with an advantage of less than the specified value. */
public static final double MIN_ADVANTAGE = 0.001;
// A performance measure to be visualized. Not yet implemented!
private double performance = 0;
// field for visualizing performance
private int currentIteration;
/** Constructor. */
public SDRulesetInduction(OperatorDescription description) {
super(description, "Training");
getTransformer().addRule(new ExampleSetPassThroughRule(exampleSetInput, trainingInnerSource, SetRelation.EQUAL) {
public ExampleSetMetaData modifyExampleSet(ExampleSetMetaData metaData) {
AttributeMetaData weightAttribute = new AttributeMetaData("weight", Ontology.REAL, Attributes.WEIGHT_NAME);
AttributeMetaData specialAttribute = new AttributeMetaData(TIMES_COVERED, Ontology.REAL, TIMES_COVERED);
return metaData;
getTransformer().addRule(new SubprocessTransformRule(getSubprocess(0)));
getTransformer().addRule(new GeneratePredictionModelTransformationRule(exampleSetInput, modelOutput, PredictionModel.class));
addValue(new ValueDouble("performance", "The performance.") {
public double getDoubleValue() {
return performance;
addValue(new ValueDouble("iteration", "The current iteration.") {
public double getDoubleValue() {
return currentIteration;
public static int getPosIndex(Attribute label) {
return (label.getMapping().getPositiveIndex());
* Creates a weight attribute if not yet done and fills it with an initial
* value so that positive and negative examples are equally probable.
* @param exampleSet
* the example set to be prepared
private double[] prepareWeights(ExampleSet exampleSet) throws OperatorException {
Attribute weightAttr = com.rapidminer.example.Tools.createWeightAttribute(exampleSet);
Attribute timesCoveredAttrib = null;
boolean additive = this.getParameterAsBoolean(PARAMETER_ADDITIVE_REWEIGHT);
if (additive && (timesCoveredAttrib = exampleSet.getAttributes().get(TIMES_COVERED)) == null) {
timesCoveredAttrib = com.rapidminer.example.Tools.createSpecialAttribute(exampleSet, TIMES_COVERED, Ontology.INTEGER);
Iterator<Example> exRead = exampleSet.iterator();
int numPos = 0;
final int positiveClass = getPosIndex(exampleSet.getAttributes().getLabel());
final int negativeClass = 1 - positiveClass;
while (exRead.hasNext()) {
if ((exRead.next().getLabel()) == positiveClass)
final double[] classPriors = new double[2];
classPriors[positiveClass] = ((double) numPos) / exampleSet.size();
classPriors[negativeClass] = 1.0d - classPriors[positiveClass];
final double posWeight = 0.5 / classPriors[positiveClass];
final double negWeight = 0.5 / classPriors[negativeClass];
exRead = exampleSet.iterator();
while (exRead.hasNext()) {
Example example = exRead.next();
double w = (example.getLabel() == positiveClass) ? posWeight : negWeight;
example.setValue(weightAttr, w);
if (additive)
example.setValue(timesCoveredAttrib, 0);
return classPriors;
* Runs the "embedded" learner on the example set and retuns a
* model.
* @param exampleSet
* an <code>ExampleSet</code> to train a model for
* @return a <code>Model</code>
private Model trainModel(ExampleSet exampleSet) throws OperatorException {
return modelInnerSink.getData(Model.class);
* Constructs a <code>Model</code> repeatedly running a weak learner,
* reweighting the training example set accordingly, and combining the
* hypothesis using the available weighted performance values. If the input
* contains a model, then this model is used as a starting point for
* weighting the examples.
public void doWork() throws OperatorException {
// Reads the input example set and initializes its weights.
ExampleSet exampleSet = exampleSetInput.getData(ExampleSet.class);
// Check if label is present and fits the learning task
if (exampleSet.getAttributes().getLabel() == null) {
throw new UserError(this, 105);
Model model = this.trainRuleset(exampleSet, this.prepareWeights(exampleSet));
/**
 * Main method for training the ensemble classifier.
 * <p>
 * Reads the bootstrap ratio, the iteration count and the ROC filter flag from
 * the operator parameters, trains one inner model per iteration via
 * {@link #trainModel(ExampleSet)}, evaluates it through
 * {@link SDReweightMeasures}, reweights the training set and collects models
 * together with a 2x2 matrix of estimates. The collected information is
 * finally wrapped into an {@link SDEnsemble}.
 * <p>
 * NOTE(review): no closing braces are visible anywhere in this method in the
 * reviewed listing, so the nesting of the statements below (end of the
 * <code>if</code> blocks, end of the loop) cannot be confirmed from here.
 * Verify against a complete copy before relying on the structure.
 *
 * @param trainingSet
 *            the example set to train on; it is also the set passed to
 *            <code>reweightExamples</code>
 * @param classPriors
 *            the class priors, handed on unchanged to the
 *            {@link SDEnsemble} constructor
 * @return an {@link SDEnsemble} built from the training set, the collected
 *         models, the class priors and the combination method selected by
 *         {@link #PARAMETER_ADDITIVE_REWEIGHT}
 * @throws OperatorException
 *             declared for the parameter lookups and the inner training
 */
private SDEnsemble trainRuleset(ExampleSet trainingSet, final double[] classPriors) throws OperatorException {
// for models and their probability estimates
Vector<Pair<Model, double[][]>> modelInfo = new Vector<Pair<Model, double[][]>>();
// check whether to use the complete training set for training:
// bootstrapping is active only for a ratio strictly between 0 and 1
final double splitRatio = this.getParameterAsDouble(PARAMETER_RATIO_INTERNAL_BOOTSTRAP);
final boolean bootstrap = ((splitRatio > 0) && (splitRatio < 1.0));
log(bootstrap ? "Bootstrapping enabled." : "Bootstrapping disabled.");
// maximum number of iterations
final int iterations = this.getParameterAsInt(PARAMETER_ITERATIONS);
final boolean roc_filter = this.getParameterAsBoolean(PARAMETER_ROC_CONVEX_HULL_FILTER);
// The hull is only allocated when the filter is on. It is seeded with the
// two corner points; isOnConvexHull reads index 0 as TPr and index 1 as FPr.
List<double[]> rocCurve = null;
if (roc_filter) {
rocCurve = new LinkedList<double[]>();
rocCurve.add(new double[] { 0, 0 });
rocCurve.add(new double[] { 1, 1 });
for (int i = 0; i < iterations; i++) {
// remembered in a field so that the "iteration" logging value can report it
this.currentIteration = i;
// int size = trainingSet.getSize();
ExampleSet splittedSet = trainingSet;
if (bootstrap == true) {
// NOTE(review): the two random seed parameters read here must be declared by
// getParameterTypes() - confirm that they are.
splittedSet = new SplittedExampleSet(trainingSet, splitRatio, SplittedExampleSet.SHUFFLED_SAMPLING, getParameterAsBoolean(RandomGenerator.PARAMETER_USE_LOCAL_RANDOM_SEED), getParameterAsInt(RandomGenerator.PARAMETER_LOCAL_RANDOM_SEED));
((SplittedExampleSet) splittedSet).selectSingleSubset(0); // switch to training set
// train one model per iteration
Model model = this.trainModel(splittedSet);
ExampleSet resultSet = null;
if (bootstrap == true) {
((SplittedExampleSet) splittedSet).selectSingleSubset(1); // switch to out-of-bag set
resultSet = model.apply(splittedSet); // apply model to the currently selected subset
} else {
resultSet = model.apply(trainingSet); // apply model to all examples
// get the weighted performance value of the example set with
// respect to the model
SDReweightMeasures wp = new SDReweightMeasures(resultSet);
final boolean additive = this.getParameterAsBoolean(PARAMETER_ADDITIVE_REWEIGHT);
// NOTE(review): PARAMETER_GAMMA is declared by this operator but not read anywhere in
// the reviewed listing; presumably it is meant to be applied in this branch - confirm.
if (!additive) {
// Calculate the unweighted distributions and the true/false
// positive rate:
double[][] modelWeightMatrix = new double[2][2];
double tpr = 0;
double fpr = 0;
boolean defaultRule = false;
// assuming indexes "0" and "1" for predictions:
int[][] predClasses = new int[2][];
predClasses[0] = wp.getCoveredExamplesNumForPred(0);
predClasses[1] = wp.getCoveredExamplesNumForPred(1);
// rowTotals[p] = number of examples with prediction p, total = all examples counted
int[] rowTotals = new int[2];
rowTotals[0] = predClasses[0][0] + predClasses[0][1];
rowTotals[1] = predClasses[1][0] + predClasses[1][1];
int total = rowTotals[0] + rowTotals[1];
// Just the distribution for the covered subset is stored.
// It is not visible which label is explicitly predicted
// (syntactically)
// in the rule, so we assume the label that results in higher
// WRAcc.
double cov0 = ((double) rowTotals[0]) / total;
double cov1 = ((double) rowTotals[1]) / total;
double prior0 = ((double) predClasses[0][0] + predClasses[1][0]) / total;
double prior1 = ((double) predClasses[0][1] + predClasses[1][1]) / total; // used later
// a row total of 0 yields NaN here; the isNaN test below covers bias1 only
double bias0 = Math.abs(((double) predClasses[0][0] / rowTotals[0]) - prior0);
double bias1 = Math.abs(((double) predClasses[1][0] / rowTotals[1]) - prior0);
int subset = (Double.isNaN(bias1) || cov0 * bias0 >= cov1 * bias1) ? 0 : 1; // WRAcc is coverage * bias
// The subset not covered by the rule is marked with zero
// estimates.
modelWeightMatrix[subset][0] = ((double) predClasses[subset][0]) / rowTotals[subset];
modelWeightMatrix[subset][1] = ((double) predClasses[subset][1]) / rowTotals[subset];
double ratio0 = (((double) predClasses[subset][0]) / total) / prior0;
double ratio1 = (((double) predClasses[subset][1]) / total) / prior1;
// Reweight the example set with respect to the weighted
// performance values.
// The second argument is the positive class. It is selected so
// that TPr is higher.
wp.reweightExamples(trainingSet, ((ratio0 > ratio1) ? 0 : 1), subset);
// As "positive" and "negative" depend on the explicitly
// predicted class
// (which is not visible) we sometimes need to translate tnr
// into tpr.
if (roc_filter) {
tpr = Math.max(ratio0, ratio1);
fpr = Math.min(ratio0, ratio1);
// a rule whose prediction 0 or prediction 1 covers no example is treated as default rule
defaultRule = (cov0 == 0) || (cov1 == 0);
// If activated just keep rules lying on the convex hull in ROC
// space:
if ((defaultRule == false) && (roc_filter == false || this.isOnConvexHull(rocCurve, tpr, fpr))) {
// Add the new model and its weights to the collection of
// models:
modelInfo.add(new Pair<Model, double[][]> (model, modelWeightMatrix));
if (roc_filter) {
// NOTE(review): the message assembled below is not handed to any logging call in
// the reviewed listing - confirm whether a log statement is missing.
StringBuffer message = new StringBuffer("The convex hull in ROC space contains the following points (TPr/FPr):" + Tools.getLineSeparator());
Iterator it = rocCurve.iterator();
while (it.hasNext()) {
double[] tpfp = (double[]) it.next();
message.append("(" + tpfp[0] + ", " + tpfp[1] + ") ");
// Build a Model object.
short combinationMethod = this.getParameterAsBoolean(PARAMETER_ADDITIVE_REWEIGHT) ? SDEnsemble.RULE_COMBINE_ADDITIVE : SDEnsemble.RULE_COMBINE_MULTIPLY;
return new SDEnsemble(trainingSet, modelInfo, classPriors, combinationMethod);
private void debugMessage(SDReweightMeasures wp) {
String message = Tools.getLineSeparator() + "Model learned - training performance of rule:" + Tools.getLineSeparator() + "TPR: " + wp.getProbability(0, 0) + " FPR: " + wp.getProbability(1, 0) + " | Positively predicted: " + (wp.getProbability(1, 0) + wp.getProbability(0, 0)) + Tools.getLineSeparator() + "FNR: " + wp.getProbability(0, 1) + " TNR: "
+ wp.getProbability(1, 1) + " | Negatively predicted: " + (wp.getProbability(0, 1) + wp.getProbability(1, 1)) + Tools.getLineSeparator() + "Positively labeled: " + (wp.getProbability(0, 0) + wp.getProbability(0, 1)) + Tools.getLineSeparator() + "Negatively labeled: " + (wp.getProbability(1, 0) + wp.getProbability(1, 1));
LogService.getGlobal().log(message, LogService.STATUS);
private boolean isOnConvexHull(List<double[]> rocCurve, double tpr, double fpr) {
if ((tpr <= 0) || (tpr > 1) || (fpr < 0) || (fpr >= 1))
return false;
ListIterator iter = rocCurve.listIterator();
double slope = Double.POSITIVE_INFINITY;
boolean fprGreater = true;
while (fprGreater) {
double[] current = (double[]) (iter.next());
fprGreater = (fpr > current[1]);
if (fprGreater) {
double newSlope = (tpr - current[0]) / (fpr - current[1]);
if (newSlope >= slope) {
} else {
slope = newSlope; // slope connecting the new point to the
// candidate
double finalSlope = (1 - current[0]) / (1 - current[1]); // connection
// new
// point
// to
// (1,1)
if (slope <= finalSlope) { // slope needs to be greater
// than connection to (1,1)
return false; // candidate lies below
} else if (fpr == current[1]) { // no slope defined
if (tpr > current[0]) {
rocCurve.set(iter.previousIndex(), new double[] { tpr, fpr });
} else
return false;
} else { // The last slope is still available. It must be higher
// than the next one!
double nextSlope = (current[0] - tpr) / (current[1] - fpr);
if (slope > nextSlope) {
rocCurve.add(iter.previousIndex(), new double[] { tpr, fpr });
} else
return false;
slope = (1 - tpr) / (1 - fpr); // slope of connecting line between
// candidate and (1,1)
iter = rocCurve.listIterator(rocCurve.size());
while (iter.hasPrevious()) {
double[] current = (double[]) iter.previous();
if (current[1] <= fpr) // found the candidate from end of list
return true; // done.
double newSlope = (current[0] - tpr) / (current[1] - fpr); // slope
// new
// point
// to
// candidate
if ((current[1]) < 1 && (newSlope <= slope)) { // needs to be
// greater than last
// slope
} else
slope = newSlope;
return true;
* Adds the parameters "number of iterations" and "model
* file".
public List<ParameterType> getParameterTypes() {
List<ParameterType> types = super.getParameterTypes();
ParameterType type = new ParameterTypeInt(PARAMETER_ITERATIONS, "The maximum number of iterations.", 1, Integer.MAX_VALUE, 10);
types.add(new ParameterTypeDouble(PARAMETER_RATIO_INTERNAL_BOOTSTRAP, "Fraction of examples used for training (internal bootstrapping). If activated (value < 1) only the rest is used to estimate the biases.", 0, 1, 0.7));
types.add(new ParameterTypeBoolean(PARAMETER_ROC_CONVEX_HULL_FILTER, "A parameter whether to discard all rules not lying on the convex hull in ROC space.", true));
types.add(new ParameterTypeBoolean(PARAMETER_ADDITIVE_REWEIGHT, "If enabled then resampling is done by additive reweighting, otherwise by multiplicative reweighting.", true));
types.add(new ParameterTypeDouble(PARAMETER_GAMMA, "Factor used for multiplicative reweighting. Has no effect in case of additive reweighting.", 0, 1, 0.9));
return types;