/*
* RapidMiner
*
* Copyright (C) 2001-2014 by RapidMiner and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapidminer.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package de.dfki.madm.operator;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.ProcessStoppedException;
import com.rapidminer.operator.clustering.ClusterModel;
import com.rapidminer.operator.clustering.clusterer.RMAbstractClusterer;
import com.rapidminer.tools.RandomGenerator;
import com.rapidminer.tools.math.similarity.DistanceMeasure;
/**
* This algorithm is the first part of K-Means++ described in the paper "k-means++: The Advantages of Careful Seeding" by David Arthur and Sergei Vassilvitskii
*
* @author Patrick Kalka
*
*/
public class KMeanspp extends RMAbstractClusterer {

    /** Short description for GUI */
    public static final String SHORT_DESCRIPTION = "Determine the first k centroids using the K-Means++ heuristic described in \"k-means++: The Advantages of Careful Seeding\" by David Arthur and Sergei Vassilvitskii 2007";

    /** Label for button */
    public static final String PARAMETER_USE_KPP = "determine_good_start_values";

    /** ExampleSet to work on */
    private final ExampleSet exampleSet;

    /** DistanceMeasure to use */
    private final DistanceMeasure measure;

    /** Random source used to draw the first centroid */
    private final RandomGenerator generator;

    /** Number of examples, captured from the example set at construction time */
    private final int examplesize;

    /** Number of start centroids to determine */
    private final int minK;

    /**
     * Initialization of K-Means++
     *
     * @param description operator description handed to the super class
     * @param anz initial Cluster count
     * @param es ExampleSet to work on
     * @param measure DistanceMeasure to use
     * @param generator random generator used to pick the first centroid
     * @throws OperatorException declared for callers; this constructor body itself only stores its arguments
     */
    public KMeanspp(OperatorDescription description, int anz, ExampleSet es, DistanceMeasure measure, RandomGenerator generator) throws OperatorException {
        super(description);
        this.minK = anz;
        this.exampleSet = es;
        this.examplesize = es.size();
        this.measure = measure;
        this.generator = generator;
    }

    /**
     * Determines the start centroids.
     * <p>
     * The first centroid is drawn at random. Every further centroid is the not yet chosen
     * example whose squared distance to its nearest already chosen centroid is largest.
     * Note that this is a deterministic "farthest point" selection, not the distance-weighted
     * random sampling of the k-means++ paper; it is kept that way so that existing results
     * do not change.
     * <p>
     * If no remaining example has a positive squared distance (for instance because all
     * remaining examples are duplicates of chosen centroids), the first not yet chosen example
     * is used, so the result never contains an invalid index.
     *
     * @return array with Ids of the centroids; empty if zero centroids were requested
     * @throws ProcessStoppedException if the process was stopped while searching
     * @throws IllegalArgumentException if more centroids are requested than examples exist
     */
    public int[] getStart() throws ProcessStoppedException {
        if (minK == 0) {
            return new int[0];
        }
        if (minK > examplesize) {
            throw new IllegalArgumentException("Cannot determine " + minK + " start centroids from only "
                    + examplesize + " examples");
        }

        int[] ret = new int[minK];
        // marks the examples that already are centroids so that none is selected twice
        boolean[] chosen = new boolean[examplesize];
        int anz = 0;

        // take the first centroid at random
        for (Integer index : generator.nextIntSetWithRange(0, examplesize, 1)) {
            ret[anz] = index;
            chosen[index] = true;
            anz++;
        }

        // shortest[j] = distance between example j and its nearest chosen centroid;
        // only meaningful once at least one centroid has been folded in
        double[] shortest = new double[examplesize];
        if (anz < minK) {
            for (int c = 0; c < anz; c++) {
                updateShortestDistances(shortest, ret[c], c == 0);
            }
        }

        while (anz < minK) {
            checkForStop();

            // Dividing every squared distance by the same constant would not change which
            // one is largest, so the raw squared distances are compared directly.
            int next = -1;
            double maxSquared = 0;
            for (int j = 0; j < examplesize; j++) {
                if (chosen[j]) {
                    continue;
                }
                double squared = shortest[j] * shortest[j];
                if (squared > maxSquared) {
                    maxSquared = squared;
                    next = j;
                }
            }

            if (next == -1) {
                // No candidate with a positive score: fall back to the first free example.
                // One always exists here because fewer than minK <= examplesize are chosen.
                for (int j = 0; j < examplesize; j++) {
                    if (!chosen[j]) {
                        next = j;
                        break;
                    }
                }
            }

            ret[anz] = next;
            chosen[next] = true;
            anz++;

            // the distances are only needed if another centroid has to be selected
            if (anz < minK) {
                updateShortestDistances(shortest, next, anz == 1);
            }
        }
        return ret;
    }

    /**
     * Folds one centroid into the per-example nearest-centroid distances.
     *
     * @param shortest current nearest-centroid distance per example, updated in place
     * @param centroid index of the centroid to take into account
     * @param first {@code true} if no centroid has been folded in yet, in which case every entry is overwritten
     */
    private void updateShortestDistances(double[] shortest, int centroid, boolean first) {
        Example centroidExample = exampleSet.getExample(centroid);
        for (int j = 0; j < examplesize; j++) {
            double dist = measure.calculateDistance(exampleSet.getExample(j), centroidExample);
            if (first || shortest[j] > dist) {
                shortest[j] = dist;
            }
        }
    }

    /**
     * Not supported by this class: no cluster model is built here, the method always
     * returns {@code null}.
     *
     * @param exampleSet ignored
     * @return always {@code null}
     * @throws OperatorException never thrown by this implementation
     */
    @Override
    public ClusterModel generateClusterModel(ExampleSet exampleSet)
            throws OperatorException {
        return null;
    }
}