/*
 * Copyright 2007-2010 Sun Microsystems, Inc.
 *
 * This file is part of Project Darkstar Server.
 *
 * Project Darkstar Server is free software: you can redistribute it
 * and/or modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation and
 * distributed hereunder to you.
 *
 * Project Darkstar Server is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * --
 */

package com.sun.sgs.impl.service.nodemap.affinity;

import com.sun.sgs.impl.service.nodemap.affinity.graph.LabelVertex;
import com.sun.sgs.impl.service.nodemap.affinity.graph.WeightedEdge;
import com.sun.sgs.auth.Identity;
import edu.uci.ics.jung.graph.Graph;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Set;

/**
 * Utility methods for "goodness" measurements of found groups.
 */
public final class AffinityGroupGoodness {

    /**
     * A private constructor:  we do not want instances of this class to
     * be constructed, as it contains only static utility methods.
     */
    private AffinityGroupGoodness() {
    }

    /**
     * Given a graph and a partition of it into groups, calculate the
     * modularity.  Modularity is a quality measure for the goodness of a
     * clustering algorithm, and is, essentially, the number of edges within
     * communities minus the expected number of such edges.
     * <p>
     * The modularity will be a number between 0.0 and 1.0, with a higher
     * number being better.
     * <p>
     * See "Finding community structure in networks using eigenvectors
     * of matrices" 2006 Mark Newman and "Finding community structure in
     * very large networks" 2004 Clauset, Newman, Moore.
     * <p>
     * Note that modularity can only be calculated over the complete graph.
     *
     * @param graph the graph which was divided into communities
     * @param groups the communities found in the graph
     * @return the modularity of the groups found in the graph
     */
    public static double calcModularity(Graph<LabelVertex, WeightedEdge> graph,
                                        Collection<AffinityGroup> groups)
    {
        // NOTE: this algorithm might need to be optimized if we use it for
        // more than goodness testing.

        // m is the sum of edge weights for all edges in the graph
        long m = 0;
        for (WeightedEdge e : graph.getEdges()) {
            m = m + e.getWeight();
        }
        final long doublem = 2 * m;
        final long doublemsquare = doublem * doublem;

        // For each pair of vertices that are in the same community,
        // compute 1/(2m) * Sum(A[i,j] - P[i,j]), where P[i,j] is k[i]k[j]/2m.
        // See equation (18) in Newman's 2006 paper.  P[i,j] is the probable
        // weight of an edge between vertices i and j, and A[i,j] is the
        // actual weight.  k[i] is the sum of weights of edges connected to
        // vertex i.
        //
        // Note also that modularity can be expressed as
        // Sum(e[i,i] - a[i]*a[i]), where e[i,i] is the fraction of edges
        // inside community i and a[i] is the fraction of ends of edges that
        // are attached to vertices in community i.
        // See equation (7) in the Clauset, Newman, Moore 2004 paper.
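        // A small worked example (illustrative only, assuming unit edge
        // weights):  for two triangles {A,B,C} and {D,E,F} joined by a single
        // edge C-D, m = 7 and doublem = 14.  Each triangle contributes
        // ingroup = 6 (three internal edges, each counted twice) and
        // totEdges = 7, so sum = 2 * (6 * 14 - 7 * 7) = 70 and
        // q = 70 / 196, roughly 0.36.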
        long sum = 0;
        for (AffinityGroup g : groups) {
            // ingroup is the weighted edge count within the community
            long ingroup = 0;
            // totEdges is the total weight of the edges incident to this
            // community's vertices
            long totEdges = 0;

            Set<Identity> ids = g.getIdentities();
            int size = ids.size();
            ArrayList<LabelVertex> groupList =
                    new ArrayList<LabelVertex>(size);
            for (Identity id : ids) {
                groupList.add(new LabelVertex(id));
            }
            for (LabelVertex vertex : groupList) {
                for (WeightedEdge edge : graph.getIncidentEdges(vertex)) {
                    totEdges = totEdges + edge.getWeight();
                }
            }

            // Look at each of the pairs in the community to find the
            // weighted count of edges within it
            for (int i = 0; i < size - 1; i++) {
                LabelVertex v1 = groupList.get(i);
                for (int j = i + 1; j < size; j++) {
                    LabelVertex v2 = groupList.get(j);
                    // Calculate the adjacency info for v1 and v2; each edge
                    // is counted twice to account for the two vertices it
                    // connects.
                    // We allow parallel (multiple) edges in the graph, so
                    // use findEdgeSet.
                    Collection<WeightedEdge> edges = graph.findEdgeSet(v1, v2);
                    for (WeightedEdge edge : edges) {
                        ingroup = ingroup + (edge.getWeight() * 2);
                    }
                }
            }

            // ingroup is e[i,i] * doublem.
            // totEdges is a[i] * doublem.
            // Multiply ingroup by doublem here so we can, outside this loop,
            // divide the sum by doublemsquare to remove the effects of
            // counting each edge twice.
            sum = sum + (ingroup * doublem - (totEdges * totEdges));
        }

        double q = (double) sum / doublemsquare;
        // Ensure that the final value is between 0.0 and 1.0.  This number
        // can go slightly negative if we have groups with single nodes.
        q = Math.min(1.0, Math.max(0.0, q));
        return q;
    }

    /**
     * Calculates Jaccard's index for a pair of affinity group collections,
     * which is a measurement of the similarity of the groups found in the two
     * collections.  The value will be between {@code 0.0} and {@code 1.0},
     * with higher values indicating stronger similarity between the two
     * samples.  See page 8 of "Near linear time algorithm to detect community
     * structures in large-scale networks" 2007 Raghavan, Albert, Kumara.
     * <p>
     * Because Jaccard's index uses computed groups, rather than a graph,
     * it can be useful when the graphs are distributed or incomplete.
     *
     * @param sample1 the first sample
     * @param sample2 the second sample
     * @return the Jaccard index, a value between {@code 0.0} and {@code 1.0},
     *         with higher values indicating more similarity
     */
    public static double calcJaccard(Collection<AffinityGroup> sample1,
                                     Collection<AffinityGroup> sample2)
    {
        // a is the number of pairs of identities in the same affinity group
        //   in both samples
        // b is the number of pairs that are in the same affinity group
        //   in the first sample only
        // c is the number of pairs that are in the same affinity group
        //   in the second sample only
        long a = 0;
        long b = 0;
        long c = 0;
        for (AffinityGroup group : sample1) {
            ArrayList<Identity> groupList =
                    new ArrayList<Identity>(group.getIdentities());
            int size = groupList.size();
            for (int i = 0; i < size - 1; i++) {
                Identity v1 = groupList.get(i);
                for (int j = i + 1; j < size; j++) {
                    Identity v2 = groupList.get(j);
                    // v1 and v2 are in the same group in sample1.  Are they
                    // in the same group in sample2?
                    if (inSameGroup(v1, v2, sample2)) {
                        a++;
                    } else {
                        b++;
                    }
                }
            }
        }
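        // Second pass: count pairs that share a group only in sample2.
        // As a small illustration (A, B, C, D are hypothetical identities,
        // not anything defined in this class):  if sample1 = {{A,B,C}, {D}}
        // and sample2 = {{A,B}, {C,D}}, then (A,B), (A,C), and (B,C) share a
        // group in sample1 while (A,B) and (C,D) share a group in sample2,
        // so a = 1, b = 2, c = 1 and the index is 1 / (1 + 2 + 1) = 0.25.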
        for (AffinityGroup group : sample2) {
            ArrayList<Identity> groupList =
                    new ArrayList<Identity>(group.getIdentities());
            int size = groupList.size();
            for (int i = 0; i < size - 1; i++) {
                Identity v1 = groupList.get(i);
                for (int j = i + 1; j < size; j++) {
                    Identity v2 = groupList.get(j);
                    // v1 and v2 are in the same group in sample2.  Count those
                    // that are not in the same group in sample1.
                    if (!inSameGroup(v1, v2, sample1)) {
                        c++;
                    }
                }
            }
        }

        // Jaccard's index (or coefficient) is defined as a/(a+b+c).
        return ((double) a / (double) (a + b + c));
    }

    /**
     * Returns {@code true} if two identities are in the same
     * {@code AffinityGroup} in a given affinity group collection.
     *
     * @param id1 the first identity
     * @param id2 the second identity
     * @param sample the affinity group collection
     * @return {@code true} if {@code id1} and {@code id2} are in the
     *         same affinity group in the {@code sample} collection of
     *         affinity groups
     */
    private static boolean inSameGroup(Identity id1, Identity id2,
                                       Collection<AffinityGroup> sample)
    {
        // Note: this method doesn't assume that affinity groups will
        // contain disjoint members - it is legal for an Identity to
        // be found in two groups.
        for (AffinityGroup g : sample) {
            Set<Identity> idents = g.getIdentities();
            if (idents.contains(id1) && idents.contains(id2)) {
                return true;
            }
        }
        return false;
    }
}
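// Usage sketch.  The names "graph", "foundGroups", and "expectedGroups" are
// placeholders for values produced elsewhere (for example, by a graph builder
// and an affinity group finder); they are not part of this class:
//
//     double modularity =
//         AffinityGroupGoodness.calcModularity(graph, foundGroups);
//     double similarity =
//         AffinityGroupGoodness.calcJaccard(foundGroups, expectedGroups);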