/*
* Copyright 2007-2010 Sun Microsystems, Inc.
*
* This file is part of Project Darkstar Server.
*
* Project Darkstar Server is free software: you can redistribute it
* and/or modify it under the terms of the GNU General Public License
* version 2 as published by the Free Software Foundation and
* distributed hereunder to you.
*
* Project Darkstar Server is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* --
*/
package com.sun.sgs.impl.service.nodemap.affinity;
import com.sun.sgs.impl.service.nodemap.affinity.graph.LabelVertex;
import com.sun.sgs.impl.service.nodemap.affinity.graph.WeightedEdge;
import com.sun.sgs.auth.Identity;
import edu.uci.ics.jung.graph.Graph;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Set;
/**
* Utility methods for "goodness" measurements of found groups.
*/
/**
 * Utility methods for "goodness" measurements of found affinity groups.
 * <p>
 * This class contains only static methods and cannot be instantiated.
 */
public final class AffinityGroupGoodness {

    /**
     * A private constructor: we do not want instances of this class to
     * be constructed, as it contains only static utility methods.
     */
    private AffinityGroupGoodness() {
    }

    /**
     * Given a graph and a set of partitions of it, calculate the modularity.
     * Modularity is a quality measure for the goodness of a clustering
     * algorithm, and is, essentially, the number of edges within communities
     * subtracted by the expected number of such edges.
     * <p>
     * The modularity will be a number between 0.0 and 1.0, with a higher
     * number being better.  A graph containing no edges is reported as
     * {@code 0.0}.
     * <p>
     * See "Finding community structure in networks using eigenvectors
     * of matrices" 2006 Mark Newman and "Finding community structure in
     * very large networks" 2004 Clauset, Newman, Moore.
     * <p>
     * Note that modularity can only be calculated on a complete graph.
     *
     * @param graph the graph which was divided into communities
     * @param groups the communities found in the graph
     * @return the modularity of the groups found in the graph
     */
    public static double calcModularity(Graph<LabelVertex, WeightedEdge> graph,
                                        Collection<AffinityGroup> groups)
    {
        // NOTE: this algorithm might need to be optimized if we use it for
        // more than goodness testing.

        // m is the sum of edge weights for all edges in the graph
        long m = 0;
        for (WeightedEdge e : graph.getEdges()) {
            m = m + e.getWeight();
        }
        // Guard: with no edges (or all zero-weight edges) the division
        // below would be 0.0/0, yielding NaN, and NaN would propagate
        // through the Math.min/Math.max clamp, violating the documented
        // [0.0, 1.0] contract.  Report zero modularity instead.
        if (m == 0) {
            return 0.0;
        }
        final long doublem = 2 * m;
        final long doublemsquare = doublem * doublem;

        // For each pair of vertices that are in the same community,
        // compute 1/(2m) * Sum(A[i,j] - P[i,j]), where P[i,j] is k[i]k[j]/2m.
        // See equation (18) in Newman's 2006 paper.  P[i,j] is the probable
        // weight of an edge between vertices i and j, and A[i,j] is the
        // actual weight.  k[i] is the sum of weights of edges connected to
        // vertex i.
        //
        // Note also that modularity can be expressed as
        // Sum(e[i,i] - a[i]*a[i]) where e[i,i] is the fraction of edges inside
        // the community i and a[i] is the fraction of ends of edges that are
        // attached to vertices in community i.
        // See equation (7) in Clauset, Newman, Moore 2004 paper.
        long sum = 0;
        for (AffinityGroup g : groups) {
            // ingroup is weighted edge count within the community
            long ingroup = 0;
            // totEdges is the total number of connections for this community
            long totEdges = 0;

            Set<Identity> ids = g.getIdentities();
            int size = ids.size();
            ArrayList<LabelVertex> groupList =
                    new ArrayList<LabelVertex>(size);
            for (Identity id : ids) {
                groupList.add(new LabelVertex(id));
            }
            for (LabelVertex vertex : groupList) {
                for (WeightedEdge edge : graph.getIncidentEdges(vertex)) {
                    totEdges = totEdges + edge.getWeight();
                }
            }

            // Look at each of the pairs in the community to find the number
            // of edges within
            for (int i = 0; i < size - 1; i++) {
                LabelVertex v1 = groupList.get(i);
                for (int j = i + 1; j < size; j++) {
                    LabelVertex v2 = groupList.get(j);
                    // Calculate the adjacency info for v1 and v2; each edge
                    // is counted twice to account for the two vertices it
                    // connects.
                    // We allow parallel (multiple) edges in the graph so
                    // use findEdgeSet.
                    Collection<WeightedEdge> edges = graph.findEdgeSet(v1, v2);
                    for (WeightedEdge edge : edges) {
                        ingroup = ingroup + (edge.getWeight() * 2);
                    }
                }
            }

            // ingroup is e[i,i] * doublem.
            // totEdges is a[i] * doublem.
            // Multiply ingroup by doublem here so we can, outside this loop,
            // divide the sum by doublemsquare to remove the effects of
            // counting each edge twice.
            sum = sum + (ingroup * doublem - (totEdges * totEdges));
        }

        double q = (double) sum / doublemsquare;
        // Ensure that the final value is between 0.0 and 1.0.  This number
        // can go slightly negative if we have groups with single nodes.
        q = Math.min(1.0, Math.max(0.0, q));
        return q;
    }

    /**
     * Calculates Jaccard's index for a pair of affinity group collections,
     * which is a measurement of similarity of the groups found in the two
     * collections.  The value will be between {@code 0.0} and {@code 1.0},
     * with higher values indicating stronger similarity between two samples.
     * See page 8 of "Near linear time algorithm to detect community structures
     * in large-scale networks" 2007 Raghavan, Albert, Kumara.
     * <p>
     * Because Jaccard's index uses computed groups, rather than a graph,
     * it can be useful when the graphs are distributed or incomplete.
     * <p>
     * @param sample1 the first sample
     * @param sample2 the second sample
     * @return the Jaccard index, a value between {@code 0.0} and {@code 1.0},
     *         with higher values indicating more similarity
     */
    public static double calcJaccard(Collection<AffinityGroup> sample1,
                                     Collection<AffinityGroup> sample2)
    {
        // a is number of pairs of identities in same affinity group
        //   in both samples
        // b is number of pairs that are in the same affinity group
        //   in the first sample only
        // c is the number of pairs that are in the same affinity group
        //   in the second sample only
        long a = 0;
        long b = 0;
        long c = 0;
        for (AffinityGroup group : sample1) {
            ArrayList<Identity> groupList =
                    new ArrayList<Identity>(group.getIdentities());
            int size = groupList.size();
            for (int i = 0; i < size - 1; i++) {
                Identity v1 = groupList.get(i);
                for (int j = i + 1; j < size; j++) {
                    Identity v2 = groupList.get(j);
                    // v1 and v2 are in the same group in sample1.  Are they
                    // in the same group in sample2?
                    if (inSameGroup(v1, v2, sample2)) {
                        a++;
                    } else {
                        b++;
                    }
                }
            }
        }
        for (AffinityGroup group : sample2) {
            ArrayList<Identity> groupList =
                    new ArrayList<Identity>(group.getIdentities());
            int size = groupList.size();
            for (int i = 0; i < size - 1; i++) {
                Identity v1 = groupList.get(i);
                for (int j = i + 1; j < size; j++) {
                    Identity v2 = groupList.get(j);
                    // v1 and v2 are in the same group in sample2.  Count those
                    // that are not in the same group in sample1.
                    if (!inSameGroup(v1, v2, sample1)) {
                        c++;
                    }
                }
            }
        }
        // Guard: if neither sample contains any pair of identities sharing
        // a group (all groups are empty or singletons), a + b + c is zero
        // and the division below would return NaN, violating the documented
        // [0.0, 1.0] contract.  The two pair structures are then trivially
        // identical, so report full similarity.
        long total = a + b + c;
        if (total == 0) {
            return 1.0;
        }
        // Jaccard's index (or coefficient) is defined as a/(a+b+c).
        return ((double) a / (double) total);
    }

    /**
     * Returns {@code true} if two identities are in the same
     * {@code AffinityGroup} in a given affinity group collection.
     *
     * @param id1 the first identity
     * @param id2 the second identity
     * @param sample the affinity group collection
     * @return {@code true} if {@code id1} and {@code id2} are in the
     *         same affinity group in the {@code sample} collection of
     *         affinity groups
     */
    private static boolean inSameGroup(Identity id1, Identity id2,
                                       Collection<AffinityGroup> sample)
    {
        // Note: this method doesn't assume that affinity groups will
        // contain disjoint members - it is legal for an Identity to
        // be found in two groups.
        for (AffinityGroup g : sample) {
            Set<Identity> idents = g.getIdentities();
            if (idents.contains(id1) && idents.contains(id2)) {
                return true;
            }
        }
        return false;
    }
}