/*
* RapidMiner
*
* Copyright (C) 2001-2014 by RapidMiner and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapidminer.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.clustering.clusterer;
import java.util.ArrayList;
import java.util.List;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.Attributes;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.Tools;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.operator.OperatorCapability;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.clustering.CentroidClusterModel;
import com.rapidminer.operator.clustering.ClusterModel;
import com.rapidminer.operator.learner.CapabilityProvider;
import com.rapidminer.operator.ports.metadata.DistanceMeasurePrecondition;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.RandomGenerator;
import com.rapidminer.tools.math.similarity.DistanceMeasure;
import com.rapidminer.tools.math.similarity.DistanceMeasureHelper;
import com.rapidminer.tools.math.similarity.DistanceMeasures;
/**
* This operator represents an implementation of k-medoids. This operator will create a cluster attribute if not present
* yet.
*
* @author Sebastian Land
*/
public class KMedoids extends RMAbstractClusterer implements CapabilityProvider {
/** The parameter name for "the maximal number of clusters" */
public static final String PARAMETER_K = "k";
/**
* The parameter name for "the maximal number of runs of the k method with random initialization that are
* performed"
*/
public static final String PARAMETER_MAX_RUNS = "max_runs";
/** The parameter name for "the maximal number of iterations performed for one run of the k method" */
public static final String PARAMETER_MAX_OPTIMIZATION_STEPS = "max_optimization_steps";
private DistanceMeasureHelper measureHelper = new DistanceMeasureHelper(this);
public KMedoids(OperatorDescription description) {
super(description);
getExampleSetInputPort().addPrecondition(new DistanceMeasurePrecondition(getExampleSetInputPort(), this));
}
@Override
public boolean supportsCapability(OperatorCapability capability) {
int measureType = DistanceMeasures.MIXED_MEASURES_TYPE;
try {
measureType = measureHelper.getSelectedMeasureType();
} catch (Exception e) {
}
switch (capability) {
case BINOMINAL_ATTRIBUTES:
case POLYNOMINAL_ATTRIBUTES:
return (measureType == DistanceMeasures.MIXED_MEASURES_TYPE) ||
(measureType == DistanceMeasures.NOMINAL_MEASURES_TYPE);
case NUMERICAL_ATTRIBUTES:
return (measureType == DistanceMeasures.MIXED_MEASURES_TYPE) ||
(measureType == DistanceMeasures.DIVERGENCES_TYPE) ||
(measureType == DistanceMeasures.NUMERICAL_MEASURES_TYPE);
case POLYNOMINAL_LABEL:
case BINOMINAL_LABEL:
case NUMERICAL_LABEL:
case WEIGHTED_EXAMPLES:
case MISSING_VALUES:
return true;
default:
return false;
}
}
@Override
public ClusterModel generateClusterModel(ExampleSet exampleSet) throws OperatorException {
int k = getParameterAsInt(PARAMETER_K);
int maxOptimizationSteps = getParameterAsInt(PARAMETER_MAX_OPTIMIZATION_STEPS);
int maxRuns = getParameterAsInt(PARAMETER_MAX_RUNS);
DistanceMeasure measure = measureHelper.getInitializedMeasure(exampleSet);
// checking and creating ids if necessary
Tools.checkAndCreateIds(exampleSet);
// additional checks
Tools.onlyNonMissingValues(exampleSet, "KMedoids");
if (exampleSet.size() < k) {
throw new UserError(this, 142, k);
}
// extracting attribute names
Attributes attributes = exampleSet.getAttributes();
ArrayList<String> attributeNames = new ArrayList<String>(attributes.size());
for (Attribute attribute : attributes)
attributeNames.add(attribute.getName());
RandomGenerator generator = RandomGenerator.getRandomGenerator(this);
double minimalIntraClusterDistance = Double.POSITIVE_INFINITY;
CentroidClusterModel bestModel = null;
int[] bestAssignments = null;
double[] values = new double[attributes.size()];
for (int iter = 0; iter < maxRuns; iter++) {
checkForStop();
CentroidClusterModel model = new CentroidClusterModel(exampleSet, k, attributeNames, measure, getParameterAsBoolean(RMAbstractClusterer.PARAMETER_ADD_AS_LABEL), getParameterAsBoolean(RMAbstractClusterer.PARAMETER_REMOVE_UNLABELED));
// init centroids
int i = 0;
for (Integer index : generator.nextIntSetWithRange(0, exampleSet.size(), k)) {
model.assignExample(i, getAsDoubleArray(exampleSet.getExample(index), attributes, values));
i++;
}
model.finishAssign();
// run optimization steps
int[] centroidAssignments = new int[exampleSet.size()];
boolean stable = false;
for (int step = 0; (step < maxOptimizationSteps) && !stable; step++) {
checkForStop();
// assign examples to new centroids
i = 0;
for (Example example : exampleSet) {
double[] exampleValues = getAsDoubleArray(example, attributes, values);
double nearestDistance = measure.calculateDistance(model.getCentroidCoordinates(0), exampleValues);
int nearestIndex = 0;
for (int centroidIndex = 1; centroidIndex < k; centroidIndex++) {
double distance = measure.calculateDistance(model.getCentroidCoordinates(centroidIndex), exampleValues);
if (distance < nearestDistance) {
nearestDistance = distance;
nearestIndex = centroidIndex;
}
}
centroidAssignments[i] = nearestIndex;
i++;
}
for (int clusterIndex = 0; clusterIndex < k; clusterIndex++) {
double[] bestMedoidValues = new double[attributes.size()];
double bestDistanceSum = Double.POSITIVE_INFINITY;
for (Example medoid : exampleSet) {
// calculate intra cluster distance if this example is used as medoid
double distanceSum = 0;
double[] medoidValues = getAsDoubleArray(medoid, attributes, values);
int j = 0;
for (Example example : exampleSet) {
// add only if in current cluster
if (centroidAssignments[j] == clusterIndex)
distanceSum += measure.calculateDistance(getAsDoubleArray(example, attributes, values), medoidValues);
j++;
}
if (distanceSum < bestDistanceSum) {
bestDistanceSum = distanceSum;
bestMedoidValues = medoidValues;
}
}
// assigning into model as best point using average of one
model.getCentroid(clusterIndex).assignExample(bestMedoidValues);
}
stable = model.finishAssign();
}
// assessing quality of this model
double distanceSum = 0;
i = 0;
for (Example example : exampleSet) {
double distance = measure.calculateDistance(model.getCentroidCoordinates(centroidAssignments[i]), getAsDoubleArray(example, attributes, values));
distanceSum += distance * distance;
i++;
}
if (distanceSum < minimalIntraClusterDistance || Double.isInfinite(minimalIntraClusterDistance)) {
bestModel = model;
minimalIntraClusterDistance = distanceSum;
bestAssignments = centroidAssignments;
}
}
bestModel.setClusterAssignments(bestAssignments, exampleSet);
if (addsClusterAttribute()) {
Attribute cluster = AttributeFactory.createAttribute("cluster", Ontology.NOMINAL);
exampleSet.getExampleTable().addAttribute(cluster);
exampleSet.getAttributes().setCluster(cluster);
int i = 0;
for (Example example : exampleSet) {
example.setValue(cluster, "cluster_" + bestAssignments[i]);
i++;
}
}
return bestModel;
}
private double[] getAsDoubleArray(Example example, Attributes attributes, double[] values) {
int i = 0;
for (Attribute attribute : attributes) {
values[i] = example.getValue(attribute);
i++;
}
return values;
}
@Override
public Class<? extends ClusterModel> getClusterModelClass() {
return CentroidClusterModel.class;
}
@Override
public List<ParameterType> getParameterTypes() {
List<ParameterType> types = super.getParameterTypes();
types.add(new ParameterTypeInt(PARAMETER_K, "The number of clusters which should be detected.", 2, Integer.MAX_VALUE, 2, false));
types.add(new ParameterTypeInt(PARAMETER_MAX_RUNS, "The maximal number of runs of k-Means with random initialization that are performed.", 1, Integer.MAX_VALUE, 10, false));
types.add(new ParameterTypeInt(PARAMETER_MAX_OPTIMIZATION_STEPS, "The maximal number of iterations performed for one run of k-Means.", 1, Integer.MAX_VALUE, 100, false));
types.addAll(RandomGenerator.getRandomGeneratorParameters(this));
types.addAll(DistanceMeasures.getParameterTypes(this));
return types;
}
}