package org.baderlab.csplugins.brainlib; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; import java.util.TreeSet; /** * Copyright (c) 2005 Memorial Sloan-Kettering Cancer Center * * * * Code written by: Gary Bader * * Authors: Gary Bader, Chris Sander * * * * This library is free software; you can redistribute it and/or modify it * * under the terms of the GNU Lesser General Public License as published * * by the Free Software Foundation; either version 2.1 of the License, or * * any later version. * * * * This library is distributed in the hope that it will be useful, but * * WITHOUT ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF * * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. The software and * * documentation provided hereunder is on an "as is" basis, and * * Memorial Sloan-Kettering Cancer Center * * has no obligations to provide maintenance, support, * * updates, enhancements or modifications. In no event shall the * * Memorial Sloan-Kettering Cancer Center * * be liable to any party for direct, indirect, special, * * incidental or consequential damages, including lost profits, arising * * out of the use of this software and its documentation, even if * * Memorial Sloan-Kettering Cancer Center * * has been advised of the possibility of such damage. See * * the GNU Lesser General Public License for more details. * * * * You should have received a copy of the GNU Lesser General Public License * * along with this library; if not, write to the Free Software Foundation, * * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. * * * * User: GaryBader * * Date: Aug 4, 2005 * * Time: 5:59:00 PM */ /** * Implements average linkage hierarchical clustering */ public class AvgLinkHierarchicalClustering { /** * result: The clustering solution. Each row in the matrix describes one linking event, * with the two columns containing the integer identifier of the nodes that were joined. * The original elements are numbered 0..nelements-1, nodes are numbered * -1..-(nelements-1). */ protected int[][] result = null; /** * linkDistance: For each node, the distance between the two subnodes that were joined. The * number of nodes is equal to the number of items clustered minus one. */ protected double[] linkDistance = null; protected DistanceMatrix distanceMatrix = null; protected int nelements = 0; //number of elements in the distance matrix protected int[] leafOrder; //stores the order of the final output of the matrix - filled during clustering protected boolean optimalLeafOrdering = true; //by default, use optimal leaf ordering (otherwise heuristic is used) protected boolean singleLinkage = false; //by default, use average linkage clustering protected ArrayList labelHighlight; //a list of LabelColorPair objects public final static String LEAF_ORDERING_BARJOSEPH2003 = "Bar-Joseph"; /** * linkedLeaves: pairs of leaves linked at each node of the result. Each row in this matrix * describes one linking event, with the 2 columns containing the integer identifier of the leaves * that were joined. */ protected int[][] linkedLeaves = null; /** * @param distanceMatrix The distance matrix, with zeros along the diagonal. * The distance matrix is a ragged array containing the distances * As the distance matrix is symmetric, with zeros on the diagonal, only the * lower triangular half of the distance matrix is saved. * Distances must be normalized to 0..1 since these methods * need to convert from distances to similarities occasionally. */ public AvgLinkHierarchicalClustering(DistanceMatrix distanceMatrix) { this.distanceMatrix = distanceMatrix; nelements = distanceMatrix.getMatrixDimension(); result = new int[nelements - 1][2]; linkedLeaves = new int[nelements - 1][2]; linkDistance = new double[nelements]; leafOrder = new int[nelements]; } /** * Get the String label of the clustered element at original index 'elementIndex' * * @param elementIndex The index of the element in the original distance matrix used to create the clustering * @return The String element label */ public String getLabel(int elementIndex) { String elementLabel = (String) distanceMatrix.getLabels().get(elementIndex); return elementLabel; } /** * Private class to hold a pair of integers for distance matrix lookup */ private class Pair { public int i, j; public Pair() { this.i = 0; this.j = 0; } } /** * This function searches the distance matrix to find the pair with the shortest * distance between them. The indices of the pair are returned in pair; the * distance itself is returned by the function. * * @param pair index i and j of the pair stored in a Pair object * @return The shortest distance found */ private double findClosestPair(int nNodes, Pair pair, DistanceMatrix dmInternal) { double distance = dmInternal.getValue(1, 0); for (int i = 0; i < nNodes; i++) { for (int j = 0; j < i; j++) { if (dmInternal.getValue(i, j) < distance) { distance = dmInternal.getValue(i, j); pair.i = i; pair.j = j; } } } //if closest pair not found in loop, the 1,0 is the closest pair and this should be captured outside of this call return distance; } /** * Performs hierarchical clustering using the distance matrix provided in the constructor * (average linkage clustering) * This clustering code ported to Java from Michiel de Hoon's open source clustering library * http://bonsai.ims.u-tokyo.ac.jp/%7Emdehoon/software/software.html */ public void run() { /* Keep track of the number of elements in each cluster * (needed to calculate the average) */ int[] number = new int[nelements]; /* Setup a list specifying to which cluster a gene belongs */ int[] clusterid = new int[nelements]; for (int j = 0; j < nelements; j++) { number[j] = 1; clusterid[j] = j; } //copy the distance matrix so that the original matrix is not affected by this method DistanceMatrix dmInternal = distanceMatrix.copy(); //hierarchical clustering step Pair pair = new Pair(); for (int nNodes = nelements; nNodes > 1; nNodes--) { pair.i = 1; //set to start coordinates of the search in findClosestPair pair.j = 0; linkDistance[nelements - nNodes] = findClosestPair(nNodes, pair, dmInternal); int isaved = pair.i; int jsaved = pair.j; int sum = 0; /* Save linked leaves from the closest pair */ linkedLeaves[nelements - nNodes][0] = pair.i; linkedLeaves[nelements - nNodes][1] = pair.j; /* Save result */ result[nelements - nNodes][0] = clusterid[isaved]; result[nelements - nNodes][1] = clusterid[jsaved]; if (!singleLinkage) { /* Update the distances - average linkage */ sum = number[isaved] + number[jsaved]; for (int j = 0; j < jsaved; j++) { dmInternal.setValue(jsaved, j, dmInternal.getValue(isaved, j) * number[isaved] + dmInternal.getValue(jsaved, j) * number[jsaved]); dmInternal.setValue(jsaved, j, dmInternal.getValue(jsaved, j) / sum); } for (int j = jsaved + 1; j < isaved; j++) { dmInternal.setValue(j, jsaved, dmInternal.getValue(isaved, j) * number[isaved] + dmInternal.getValue(j, jsaved) * number[jsaved]); dmInternal.setValue(j, jsaved, dmInternal.getValue(j, jsaved) / sum); } for (int j = isaved + 1; j < nNodes; j++) { dmInternal.setValue(j, jsaved, dmInternal.getValue(j, isaved) * number[isaved] + dmInternal.getValue(j, jsaved) * number[jsaved]); dmInternal.setValue(j, jsaved, dmInternal.getValue(j, jsaved) / sum); } for (int j = 0; j < isaved; j++) dmInternal.setValue(isaved, j, dmInternal.getValue(nNodes - 1, j)); for (int j = isaved + 1; j < nNodes - 1; j++) dmInternal.setValue(j, isaved, dmInternal.getValue(nNodes - 1, j)); } /* Update number of elements in the clusters */ number[jsaved] = sum; number[isaved] = number[nNodes - 1]; /* Update clusterids */ clusterid[jsaved] = nNodes - nelements - 1; clusterid[isaved] = clusterid[nNodes - 1]; } dmInternal = null; //free this potentially large matrix //Create an ordering for the leaves if (optimalLeafOrdering) { orderLeavesBarJoseph2003(result, distanceMatrix, leafOrder); } else { orderLeavesEisenHeuristic(leafOrder); } } public void old_run() { /* Keep track of the number of elements in each cluster * (needed to calculate the average) */ int[] number = new int[nelements]; /* Setup a list specifying to which cluster a gene belongs */ int[] clusterid = new int[nelements]; for (int j = 0; j < nelements; j++) { number[j] = 1; clusterid[j] = j; } //copy the distance matrix so that the original matrix is not affected by this method DistanceMatrix dmInternal = distanceMatrix.copy(); //hierarchical clustering step Pair pair = new Pair(); for (int nNodes = nelements; nNodes > 1; nNodes--) { pair.i = 1; //set to start coordinates of the search in findClosestPair pair.j = 0; linkDistance[nelements - nNodes] = findClosestPair(nNodes, pair, dmInternal); int isaved = pair.i; int jsaved = pair.j; int sum = 0; /* Save result */ result[nelements - nNodes][0] = clusterid[isaved]; result[nelements - nNodes][1] = clusterid[jsaved]; /* Update the distances - average linkage */ sum = number[isaved] + number[jsaved]; for (int j = 0; j < jsaved; j++) { dmInternal.setValue(jsaved, j, dmInternal.getValue(isaved, j) * number[isaved] + dmInternal.getValue(jsaved, j) * number[jsaved]); dmInternal.setValue(jsaved, j, dmInternal.getValue(jsaved, j) / sum); } for (int j = jsaved + 1; j < isaved; j++) { dmInternal.setValue(j, jsaved, dmInternal.getValue(isaved, j) * number[isaved] + dmInternal.getValue(j, jsaved) * number[jsaved]); dmInternal.setValue(j, jsaved, dmInternal.getValue(j, jsaved) / sum); } for (int j = isaved + 1; j < nNodes; j++) { dmInternal.setValue(j, jsaved, dmInternal.getValue(j, isaved) * number[isaved] + dmInternal.getValue(j, jsaved) * number[jsaved]); dmInternal.setValue(j, jsaved, dmInternal.getValue(j, jsaved) / sum); } for (int j = 0; j < isaved; j++) dmInternal.setValue(isaved, j, dmInternal.getValue(nNodes - 1, j)); for (int j = isaved + 1; j < nNodes - 1; j++) dmInternal.setValue(j, isaved, dmInternal.getValue(nNodes - 1, j)); /* Update number of elements in the clusters */ number[jsaved] = sum; number[isaved] = number[nNodes - 1]; /* Update clusterids */ clusterid[jsaved] = nNodes - nelements - 1; clusterid[isaved] = clusterid[nNodes - 1]; } dmInternal = null; //free this potentially large matrix //Create an ordering for the leaves if (optimalLeafOrdering) { orderLeavesBarJoseph2003(result, distanceMatrix, leafOrder); } else { orderLeavesEisenHeuristic(leafOrder); } } /** * Takes hierarchical clustering output and divides the elements in the tree structure * into clusters. The number of clusters is specified by the user. * <p/> * This clustering code ported to Java from Michiel de Hoon's open source clustering library * http://bonsai.ims.u-tokyo.ac.jp/%7Emdehoon/software/software.html * * @param nclusters The number of clusters to be formed. Ranges from 1..nelements. * @return int[nelements] - The number of the cluster to which each element was assigned. */ public int[] cutTree(int nclusters) { int i, j, k; int icluster = 0; int n = nelements - nclusters; /* number of nodes to join */ int[] nodeid; int clusterid[] = new int[nelements]; /* Check the tree */ boolean flag = false; if (nclusters > nelements || nclusters < 1) flag = true; for (i = 0; i < nelements - 1; i++) { if (result[i][0] >= nelements || result[i][0] < -i || result[i][1] >= nelements || result[i][1] < -i) { flag = true; break; } } /* Assign all elements to cluster -1 and return if an error is found. */ if (flag) { for (i = 0; i < nelements; i++) clusterid[i] = -1; return null; } /* The tree array is safe to use. */ for (i = nelements - 2; i >= n; i--) { k = result[i][0]; if (k >= 0) { clusterid[k] = icluster; icluster++; } k = result[i][1]; if (k >= 0) { clusterid[k] = icluster; icluster++; } } nodeid = new int[n]; for (i = 0; i < n; i++) nodeid[i] = -1; for (i = n - 1; i >= 0; i--) { if (nodeid[i] < 0) { j = icluster; nodeid[i] = j; icluster++; } else j = nodeid[i]; k = result[i][0]; if (k < 0) nodeid[-k - 1] = j; else clusterid[k] = j; k = result[i][1]; if (k < 0) nodeid[-k - 1] = j; else clusterid[k] = j; } return clusterid; } /** * Checks if optimal leaf ordering is turned on (this is the default) * */ public boolean isOptimalLeafOrdering() { return optimalLeafOrdering; } /** * If true (default), uses Bar-Joseph 2003 optimal leaf ordering * If false, uses Eisen heuristic leaf ordering * * @param optimalLeafOrdering */ public void setOptimalLeafOrdering(boolean optimalLeafOrdering) { this.optimalLeafOrdering = optimalLeafOrdering; } public void setLeafOrderingMethod(String leafOrderingMethod) { if (leafOrderingMethod == LEAF_ORDERING_BARJOSEPH2003) { this.optimalLeafOrdering = true; } else { this.optimalLeafOrdering = false; } } public boolean isSingleLinkage() { return singleLinkage; } public void setSingleLinkage(boolean flag) { singleLinkage = flag; } /** * Returns the clustering solution as a HierarchicalClusteringResultTree object. * Note: Each time this method is called, a HierarchicalClusteringResultTree * is created from more efficient internal data structures. * * @return The root of a tree describing the clustering solution */ public HierarchicalClusteringResultTree getResult() { return convertResultTreeToTreeClass(result, linkDistance, leafOrder); } /** * Convert hierarchical clustering result tree to HierarchicalClusteringResultTree * Note: converts from the leaves up towards the root */ private HierarchicalClusteringResultTree convertResultTreeToTreeClass(int[][] resultTree, double[] linkDistance, int[] leafOrder) { //save the leaf order list for convenient searching ArrayList leafOrderList = new ArrayList(); for (int i = 0; i < leafOrder.length; i++) { int index = leafOrder[i]; leafOrderList.add(new Integer(index)); } ArrayList clusteredElementLabels = distanceMatrix.getLabels(); /** * resultTree: Each row in the matrix describes one linking event, * with the two columns containing the name of the nodes that were joined. * The original elements are numbered 0..nelements-1, nodes are numbered * -1..-(nelements-1). */ HierarchicalClusteringResultTree t = null; //root HierarchicalClusteringResultTree[] internalNodeList = new HierarchicalClusteringResultTree[resultTree.length + 1]; //one for each internal node (node index starts at 1) for (int i = 0; i < resultTree.length; i++) { HierarchicalClusteringResultTree tleft = null; HierarchicalClusteringResultTree tright = null; //left if (resultTree[i][0] >= 0) { //leaf tleft = new HierarchicalClusteringResultTree(resultTree[i][0], leafOrderList.indexOf(new Integer(resultTree[i][0])), (String) clusteredElementLabels.get(resultTree[i][0])); } else { //node tleft = internalNodeList[-resultTree[i][0]]; } //right if (resultTree[i][1] >= 0) { //leaf tright = new HierarchicalClusteringResultTree(resultTree[i][1], leafOrderList.indexOf(new Integer(resultTree[i][1])), (String) clusteredElementLabels.get(resultTree[i][1])); } else { //node tright = internalNodeList[-resultTree[i][1]]; } HierarchicalClusteringResultTree leftLeaf = new HierarchicalClusteringResultTree(linkedLeaves[i][0], leafOrderList.indexOf(new Integer(linkedLeaves[i][0])), (String) clusteredElementLabels.get(linkedLeaves[i][0])); HierarchicalClusteringResultTree rightLeaf = new HierarchicalClusteringResultTree(linkedLeaves[i][1], leafOrderList.indexOf(new Integer(linkedLeaves[i][1])), (String) clusteredElementLabels.get(linkedLeaves[i][1])); t = internalNodeList[i + 1] = new HierarchicalClusteringResultTree(tleft, tright, i + 1, linkDistance[i], leftLeaf, rightLeaf); } return t; } /** * Gets the number of elements that were clustered */ public int getNelements() { return nelements; } /** * Return the maximum distance found in the clustering result */ public double getMaxDistance() { double maxDistance = 0.0; for (int i = 0; i < linkDistance.length; i++) { maxDistance = Math.max(maxDistance, linkDistance[i]); } return maxDistance; } /** * Return the order to output the leaves * Array values are leaf indices, so i..length provides the order of leaf indices */ public int[] getLeafOrder() { return leafOrder; } /** * Order the leaves on the tree in a reasonable order. This imposes an ordering on the rows of the * distance matrix. (Used for the Eisen heuristic leaf ordering) * * @param order Original order of elements * @param nodeorder Order of nodes * @param nodecounts Number of elements per node (cluster) * @param NodeElement Representation of tree - each row represents a node and contains 2 children * @param leafOrder Will store the output order for the leaves */ private void treeSort(double[] order, double[] nodeorder, int[] nodecounts, int NodeElement[][], int[] leafOrder) { int nNodes = nelements - 1; double[] neworder = new double[nelements]; /* initialized to 0.0 */ int[] clusterids = new int[nelements]; for (int i = 0; i < nelements; i++) { clusterids[i] = i; } for (int i = 0; i < nNodes; i++) { int i1 = NodeElement[i][0]; int i2 = NodeElement[i][1]; double order1 = (i1 < 0) ? nodeorder[-i1 - 1] : order[i1]; double order2 = (i2 < 0) ? nodeorder[-i2 - 1] : order[i2]; int count1 = (i1 < 0) ? nodecounts[-i1 - 1] : 1; int count2 = (i2 < 0) ? nodecounts[-i2 - 1] : 1; /* If order1 and order2 are equal, their order is determined by * the order in which they were clustered */ if (i1 < i2) { double increase = (order1 < order2) ? count1 : count2; for (int j = 0; j < nelements; j++) { int clusterid = clusterids[j]; if (clusterid == i1 && order1 >= order2) neworder[j] += increase; if (clusterid == i2 && order1 < order2) neworder[j] += increase; if (clusterid == i1 || clusterid == i2) clusterids[j] = -i - 1; } } else { double increase = (order1 <= order2) ? count1 : count2; for (int j = 0; j < nelements; j++) { int clusterid = clusterids[j]; if (clusterid == i1 && order1 > order2) neworder[j] += increase; if (clusterid == i2 && order1 <= order2) neworder[j] += increase; if (clusterid == i1 || clusterid == i2) clusterids[j] = -i - 1; } } } //keep track of how to output the final matrix so that it is this order sort(neworder, leafOrder); return; } /** * Sets up an index table given the data, such that data[index[]] is in * increasing order. The array data is unchanged. */ private void sort(double data[], int index[]) { /*this is like a 2-column sort in Excel, where one colomn defines the sort and then the other column contains the order you want*/ class DataIndexPair implements Comparable { public double data; public int index; public DataIndexPair(double data, int index) { this.data = data; this.index = index; } public int compareTo(Object o) { DataIndexPair that = (DataIndexPair) o; return (int) (this.data - that.data); } } TreeSet indexValue = new TreeSet(); for (int i = 0; i < data.length; i++) { DataIndexPair dataIndexDataIndexPair = new DataIndexPair(data[i], i); indexValue.add(dataIndexDataIndexPair); } //values are sorted by keys Iterator values = indexValue.iterator(); int i = 0; while (values.hasNext()) { DataIndexPair dataIndexPair = (DataIndexPair) values.next(); index[i] = dataIndexPair.index; i++; } } /** * Orders leaf nodes according to a heuristic * Note: final ordering highly dependent on input ordering * <p/> * Ported from Michael Eisen's Cluster software code * http://rana.lbl.gov/EisenSoftware.htm * * @param leafOrder */ private void orderLeavesEisenHeuristic(int[] leafOrder) { int nNodes = nelements - 1; double[] nodeorder = new double[nNodes]; int[] nodecounts = new int[nNodes]; //number of elements in a node(cluster) //order of elements to cluster - this is just the index of the elements of the DistanceMatrix double[] origOrder = new double[nelements]; //order of elements in the distance matrix for (int i = 0; i < origOrder.length; i++) { origOrder[i] = i; } for (int i = 0; i < nNodes; i++) { int min1 = result[i][0]; int min2 = result[i][1]; /* min1 and min2 are the elements that are to be joined */ double order1; double order2; int counts1; int counts2; if (min1 < 0) { int index1 = -min1 - 1; order1 = nodeorder[index1]; counts1 = nodecounts[index1]; linkDistance[i] = Math.max(linkDistance[i], linkDistance[index1]); } else { order1 = origOrder[min1]; counts1 = 1; } if (min2 < 0) { int index2 = -min2 - 1; order2 = nodeorder[index2]; counts2 = nodecounts[index2]; linkDistance[i] = Math.max(linkDistance[i], linkDistance[index2]); } else { order2 = origOrder[min2]; counts2 = 1; } nodecounts[i] = counts1 + counts2; nodeorder[i] = (counts1 * order1 + counts2 * order2) / (counts1 + counts2); } /* Now set up order based on the tree structure */ for (int i = 0; i < nelements; i++) { leafOrder[i] = i; //geneindex should be nelements long - global and filled here } treeSort(origOrder, nodeorder, nodecounts, result, leafOrder); } //following code starting here is for optimal leaf ordering private final int rightTree = 2; private final int leftTree = 1; private final double maxAdd = 2; private class LeafPair { private int leftLeaf; private int rightLeaf; private LeafPair preLeft; private LeafPair preRight; private int n1, n2; // Pair for best tree construction public LeafPair(int l, int r, LeafPair pl, LeafPair pr, int t1, int t2) { leftLeaf = l; rightLeaf = r; preLeft = pl; preRight = pr; n1 = t1; n2 = t2; } } private class LeafDist implements Comparable { public int n; public double dist; public LeafPair best; public LeafDist(int to, double d, LeafPair p) { n = to; dist = d; best = p; } public int compareTo(Object o) { LeafDist ld = (LeafDist) o; //Changed the comparator. In Java 7 compareTo method throws a java.lang.IllegalArgumentException: Comparison method violates its general contract! //In Java 7 they changed the implementation of the compareto method cause this thrown exception //suggestions on web included forcing java to use the old implementation, checking for Nan values or changing the implementation. //Checking for Nan did not solve the problem. if(this.dist > ld.dist) return 1; if(this.dist < ld.dist) return -1; else return 0; //return (int) (this.dist - ld.dist); } } private class Leaf { private int index; private LeafDist[] curDist, newDist; private double[][] distMat; private int listSize, newSize; public double bestNew; void setSize(int size) { listSize = size; } int giveSize() { return listSize; } LeafDist[] giveList() { return curDist; } void initNewSize(int nSize) { newDist = new LeafDist[nSize]; } void initNewDist() { newDist = null; } int giveIndex() { return index; } // Each gene is assigned a leaf, initially it does not have // any pairing gene. public Leaf(int num, double[][] mat) { index = num; distMat = mat; LeafPair oneLeaf = new LeafPair(index, -1, null, null, 1, 0); curDist = new LeafDist[1]; curDist[0] = new LeafDist(index, 0, oneLeaf); listSize = 1; newDist = null; newSize = 0; bestNew = Double.MAX_VALUE; } // replace previous pair list (from previous subtree) with // a new one from the current subtree void replace() { int i; // delete previous distance list for (i = 0; i < listSize; i++) { curDist[i] = null; } curDist = null; listSize = newSize; // bestNew becomes the maximum distance that may help in future // trees. bestNew += maxAdd; curDist = new LeafDist[listSize]; for (i = 0; i < newSize; i++) { curDist[i] = newDist[i]; } newDist = null; Arrays.sort(curDist); newSize = 0; bestNew = Double.MAX_VALUE; } // Adds corner to list of corners, and finds the best distance // with corner on the other side. void addToNew(Leaf[] corner1, Leaf[] corner2, int n1, int n2, int c1, int c2, double max1) { int fromNum, fromIndex; LeafDist[] fDist; Leaf curLeaf; int i, j; double[] bestC = new double[nelements + 1]; double curVal, bestD; double bestPos; int[] bForC = new int[nelements + 1]; int cForD = 0, dPlace = 0; double maxAC = Double.MAX_VALUE; for (j = 0; j < c1; j++) { fromIndex = corner1[j].giveIndex(); bestC[fromIndex] = Double.MAX_VALUE; for (i = 0; i < listSize; i++) { bestPos = curDist[i].dist + max1; if (bestPos > bestC[fromIndex]) // optimization i = listSize; else { curVal = curDist[i].dist + distMat[curDist[i].n][fromIndex]; if (curVal < bestC[fromIndex]) { bestC[fromIndex] = curVal; bForC[fromIndex] = i; } } } if (bestC[fromIndex] < maxAC) maxAC = bestC[fromIndex]; } for (j = 0; j < c2; j++) { bestD = Double.MAX_VALUE; curLeaf = corner2[j]; fromNum = curLeaf.giveSize(); fromIndex = curLeaf.giveIndex(); fDist = curLeaf.giveList(); for (i = 0; i < fromNum; i++) { bestPos = curLeaf.curDist[i].dist + maxAC; if (bestPos > bestD) i = fromNum; // optimization else { curVal = bestC[fDist[i].n] + curLeaf.curDist[i].dist; if (curVal < bestD) { bestD = curVal; cForD = curLeaf.curDist[i].n; dPlace = i; } } } LeafPair newPair = new LeafPair(index, fromIndex, curDist[bForC[cForD]].best, fDist[dPlace].best, n1, n2); newDist[newSize] = new LeafDist(fromIndex, bestD, newPair); newSize++; LeafPair cornerPair = new LeafPair(fromIndex, index, fDist[dPlace].best, curDist[bForC[cForD]].best, n2, n1); curLeaf.addNewDist(index, bestD, cornerPair); } bForC = null; bestC = null; } // adds a new pair to the dist list void addNewDist(int n, double dist, LeafPair p) { newDist[newSize] = new LeafDist(n, dist, p); newSize++; } // the optimization for the last two subtrees, no need // to compute distance to all leaves, the best will suffice (see paper). int findLast(Leaf[] corner1, Leaf[] corner2, int n1, int n2) { LeafDist[] myDist; Leaf curLeaf; int i, j; double curVal, best = Double.MAX_VALUE; double myVal; int myInd = 0, bestIndl = 0, bestIndr = 0, mBestl = 0, mBestr = 0; LeafPair lpre = null, rpre = null; LeafDist[][] fDist = new LeafDist[n2][]; for (i = 0; i < n2; i++) { fDist[i] = corner2[i].giveList(); } for (j = 0; j < n1; j++) { curLeaf = corner1[j]; myDist = curLeaf.giveList(); myVal = myDist[0].dist; myInd = curLeaf.giveIndex(); for (i = 0; i < n2; i++) { curVal = myVal + fDist[i][0].dist + distMat[myInd][corner2[i].giveIndex()]; if (best > curVal) { best = curVal; bestIndl = myDist[0].n; bestIndr = fDist[i][0].n; mBestl = myInd; mBestr = corner2[i].giveIndex(); } } } LeafPair newPair; int place = 0, size; for (i = 0; i < n2; i++) { if (corner2[i].giveIndex() == bestIndr) { size = corner2[i].giveSize(); for (j = 0; j < size; j++) { if (fDist[i][j].n == mBestr) { rpre = fDist[i][j].best; j = size; } } i = n2; } } for (i = 0; i < n1; i++) { if (corner1[i].giveIndex() == bestIndl) { size = corner1[i].giveSize(); myDist = corner1[i].giveList(); for (j = 0; j < size; j++) { if (myDist[j].n == mBestl) { lpre = myDist[j].best; j = size; } } newPair = new LeafPair(bestIndl, bestIndr, lpre, rpre, n1, n2); place = i; corner1[i].initNewSize(1); corner1[i].addNewDist(bestIndr, best, newPair); corner1[i].replace(); i = n1; } } fDist = null; return place; } } private class Tree { private Tree left, right; private int numLeafs; private Leaf[] allLeafs; //list of Leaf instances private int nodeNum; private double[][] mat; // generate a leaf (tree with only one node) public Tree(int index, double[][] m) { mat = m; nodeNum = index; allLeafs = new Leaf[1]; Leaf myLeaf = new Leaf(index, mat); allLeafs[0] = myLeaf; numLeafs = 1; left = right = null; } // combine two trees public Tree(Tree t1, Tree t2, int nNum) { nodeNum = nNum; mat = t1.giveMat(); int n1 = t1.giveNumLeafs(); int n2 = t2.giveNumLeafs(); numLeafs = n1 + n2; allLeafs = new Leaf[numLeafs]; int i; Leaf[] l = t1.giveLeafs(); for (i = 0; i < n1; i++) allLeafs[i] = l[i]; l = t2.giveLeafs(); for (i = 0; i < n2; i++) allLeafs[n1 + i] = l[i]; left = t1; right = t2; } // compute the optimal order for this tree int compDist() { if (numLeafs == 1) { // no nodes to flip return 0; } int i, j; left.compDist(); // compute optimal order matrix t subtree right.compDist(); int n1 = left.giveNumLeafs(); int n2 = right.giveNumLeafs(); if (n1 + n2 == nelements) { // last two subtrees return lastTree(n1, n2); // this is an optimization which // reduces the running time by half, see paper for details } if (n1 > 1 && n2 > 1) { return compDist(n1, n2); // calling another optimization } else { // one subtree has only one leaf Leaf[] l1 = left.giveLeafs(); Leaf[] l2 = right.giveLeafs(); for (j = 0; j < n2; j++) l2[j].initNewSize(n1); for (i = 0; i < n1; i++) { l1[i].initNewSize(n2); // this function actually computes the set of optimal orders // of leftmost and rightmost leaves in the combined tree. l1[i].addToNew(l2, l2, n1, n2, n2, n2, Double.MIN_VALUE); l1[i].replace(); } for (j = 0; j < n2; j++) l2[j].replace(); } return 0; } // optimization, perfromed for combining the last two subtrees int lastTree(int n1, int n2) { Leaf[] l1 = left.giveLeafs(); Leaf[] l2 = right.giveLeafs(); int res = l1[0].findLast(l1, l2, n1, n2); return res; } // optimization which allows for early termination of the search // see paper for details int compDist(int tot1, int tot2) { Tree t1 = left; Tree t2 = right; Tree t1l = t1.left; Tree t1r = t1.right; Tree t2l = t2.left; Tree t2r = t2.right; int n1 = t1l.giveNumLeafs(); int n2 = t1r.giveNumLeafs(); int n3 = t2l.giveNumLeafs(); int n4 = t2r.giveNumLeafs(); Leaf[] l1 = t1l.giveLeafs(); Leaf[] l2 = t1r.giveLeafs(); Leaf[] l3 = t2l.giveLeafs(); Leaf[] l4 = t2r.giveLeafs(); Leaf[] c2 = t2.giveLeafs(); double mint1lt2l, mint1lt2r, mint1rt2l, mint1rt2r; mint1lt2l = mint1lt2r = mint1rt2l = mint1rt2r = 1; int i, j, i1, i2, j3, j4; // compute minimum similarity between genes in these // two subtrees for (i = 0; i < n1; i++) { i1 = l1[i].giveIndex(); for (j = 0; j < n3; j++) { j3 = l3[j].giveIndex(); if (mat[i1][j3] < mint1rt2r) { mint1rt2r = mat[i1][j3]; } } for (j = 0; j < n4; j++) { j4 = l4[j].giveIndex(); if (mat[i1][j4] < mint1rt2l) { mint1rt2l = mat[i1][j4]; } } } for (i = 0; i < n2; i++) { i2 = l2[i].giveIndex(); for (j = 0; j < n3; j++) { j3 = l3[j].giveIndex(); if (mat[i2][j3] < mint1lt2r) { mint1lt2r = mat[i2][j3]; } } for (j = 0; j < n4; j++) { j4 = l4[j].giveIndex(); if (mat[i2][j4] < mint1lt2l) { mint1lt2l = mat[i2][j4]; } } } for (j = 0; j < tot2; j++) c2[j].initNewSize(tot1); for (i = 0; i < n1; i++) { l1[i].initNewSize(tot2); // use precomputed minimums to terminate the search early l1[i].addToNew(l4, l3, tot1, tot2, n4, n3, mint1lt2l); l1[i].addToNew(l3, l4, tot1, tot2, n3, n4, mint1lt2r); l1[i].replace(); } for (i = 0; i < n2; i++) { l2[i].initNewSize(tot2); l2[i].addToNew(l4, l3, tot1, tot2, n4, n3, mint1rt2l); l2[i].addToNew(l3, l4, tot1, tot2, n3, n4, mint1rt2r); l2[i].replace(); } for (j = 0; j < tot2; j++) c2[j].replace(); return 0; } // called by the main program to return the optimal order // of the tree leaves Object[] returnOrder() { int start = compDist(); LeafDist[] myDist = allLeafs[start].giveList(); Double bestDist = new Double(myDist[0].dist); LeafPair best = myDist[0].best; int[] res = new int[numLeafs]; // used to find the correct order of the leaves // of the tree compTree(best.preLeft, res, 0, best.n1 - 1, leftTree); compTree(best.preRight, res, best.n1, numLeafs - 1, rightTree); Object[] ret = new Object[2]; ret[0] = res; ret[1] = bestDist; return ret; } // computes the new order of the leaves, based on the optimal // ordering computation (using the 'best' struct). void compTree(LeafPair best, int[] res, int start, int last, int l) { if (start == last) { res[start] = best.leftLeaf; return; } if (l == leftTree) { compTree(best.preLeft, res, start, start + best.n1 - 1, leftTree); compTree(best.preRight, res, start + best.n1, last, rightTree); } if (l == rightTree) { compTree(best.preLeft, res, start + best.n2, last, rightTree); compTree(best.preRight, res, start, start + best.n2 - 1, leftTree); } } // computes the sum of similarities in the current order // of the tree leaves double curDist(double[][] m) { if (numLeafs == 1) return 0; double d1 = left.curDist(m); double d2 = right.curDist(m); int lCorner = left.findRight(); int rCorner = right.findLeft(); return (d1 + d2 + m[lCorner][rCorner]); } // find the rightmost leaf int findRight() { if (numLeafs == 1) return allLeafs[0].giveIndex(); return right.findRight(); } int findLeft() { if (numLeafs == 1) return allLeafs[0].giveIndex(); return left.findLeft(); } // finds the initial order of the tree leaves int[] initOrder() { int[] res = new int[numLeafs + 1]; fillArray(res, 0, numLeafs - 1); return res; } void fillArray(int[] array, int s, int l) { if (numLeafs == 1) { array[s] = allLeafs[0].giveIndex(); return; } int n1 = left.giveNumLeafs(); left.fillArray(array, s, s + n1 - 1); right.fillArray(array, s + n1, l); return; } boolean isLeaf() { return (numLeafs == 1); } int giveIndex() { return nodeNum; } double[][] giveMat() { return mat; } int giveNumLeafs() { return numLeafs; } Leaf[] giveLeafs() { return allLeafs; } } //TODO: reimplement similarity matrix for efficiency purposes /** * Find an optimal leaf ordering * Note: exact ordering still depends on input order, but much less than heuristic ordering * <p/> * This code is ported from Ziv Bar-Joseph's optimal leaf ordering code available here: * http://www.cs.cmu.edu/~zivbj/ and as described in the following papers: * <p/> * Bar-Joseph Z, Gifford DK, Jaakkola TS * Fast optimal leaf ordering for hierarchical clustering * Bioinformatics. 2001;17 Suppl 1:S22-9 * <p/> * Bar-Joseph Z, Demaine ED, Gifford DK, Srebro N, Hamel AM, Jaakkola TS * K-ary clustering with optimal leaf ordering for gene expression data * Bioinformatics. 2003 Jun 12;19(9):1070-8 * * @param resultTree The result of hierarchical clustering * @param dm The distance matrix used to cluster */ private void orderLeavesBarJoseph2003(int[][] resultTree, DistanceMatrix dm, int[] leafOrder) { //convert dm to double matrix for this code - all indices in this code start from 1 double[][] mat = new double[dm.getMatrixDimension() + 1][dm.getMatrixDimension() + 1]; for (int i = 1; i < dm.getMatrixDimension() + 1; i++) { for (int j = 1; j < dm.getMatrixDimension() + 1; j++) { mat[i][j] = (double) (dm.getValue(i - 1, j - 1) - 1.0f); //convert to negative similarity matrix - assumes dm is normalized } } //convert resultTree to Tree Tree t = convertResultTreeToTreeClass(resultTree, mat); //debug //t = barJosephClustering(mat); //double init = -1 * t.curDist(mat); // initial ordering score //System.out.println("Initial sum of similarities: " + init); Object[] ret = t.returnOrder(); // the optimal leaf algorithm call int[] arr = (int[]) ret[0]; //move optimal ordering to leafOrder for (int i = 0; i < nelements; i++) { leafOrder[i] = arr[i] - 1; } //double min = -1 * ((Double) ret[1]).doubleValue(); //System.out.println("Optimal sum of similarities: " + min); //System.out.println("Improvement: " + ((min - init) / (init) * 100) + "%"); } /** * Convert hierarchical clustering result tree to Tree */ private Tree convertResultTreeToTreeClass(int[][] resultTree, double[][] mat) { Tree t = null; //root Tree[] tlist = new Tree[nelements]; //one for each internal node for (int i = 0; i < nelements - 1; i++) { Tree tleft = null; Tree tright = null; if (resultTree[i][0] >= 0) { //leaf tleft = new Tree(resultTree[i][0] + 1, mat); //convert to 1..nelements } else { //node tleft = tlist[-resultTree[i][0]]; } if (resultTree[i][1] >= 0) { //leaf tright = new Tree(resultTree[i][1] + 1, mat); //convert to 1..nelements } else { //node tright = tlist[-resultTree[i][1]]; } t = tlist[i + 1] = new Tree(tright, tleft, i + 1); //right/left is reversed here in bar joseph code (not very important) } return t; } //from original bar joseph code - only here for debug purposes private Tree barJosephClustering(double[][] m) { int i, j, k; int num = nelements; double[][] newM = new double[num + 1][]; Tree[] allTrees = new Tree[num + 1]; // copy similarity matrix for (i = 1; i < num + 1; i++) { newM[i] = new double[num + 1]; for (j = 1; j < num + 1; j++) { newM[i][j] = m[i][j]; } } // initially each gene is assigned to its own cluster for (i = 1; i < num + 1; i++) { allTrees[i] = new Tree(i, m); } double max; int r = 0, l = 0; double lSize, rSize; Tree temp; for (i = 1; i < num; i++) { max = Double.MIN_VALUE; // find minimum entry in (converted) similarity matrix for (k = 1; k < num; k++) { if (allTrees[k] != null) { for (j = k + 1; j < num + 1; j++) { if (allTrees[j] != null) { if (max < (-1 * newM[j][k])) { max = (-1 * newM[j][k]); l = k; r = j; } } } } } rSize = allTrees[r].giveNumLeafs(); lSize = allTrees[l].giveNumLeafs(); System.out.print("NODE" + i + "X" + '\t'); if (allTrees[l].isLeaf()) System.out.print("GENE" + allTrees[l].giveIndex() + "X\t"); else System.out.print("NODE" + allTrees[l].giveIndex() + "X\t"); if (allTrees[r].isLeaf()) System.out.print("GENE" + allTrees[r].giveIndex() + "X\t"); else System.out.print("NODE" + allTrees[r].giveIndex() + "X\t"); System.out.println(max); temp = allTrees[l]; allTrees[l] = null; // combine the two clusters allTrees[l] = new Tree(temp, allTrees[r], i); allTrees[r] = null; // update similarity matrix for (j = 1; j < num + 1; j++) { if (allTrees[j] != null && j != l) { newM[j][l] = (lSize * newM[j][l] + rSize * newM[j][r]) / (lSize + rSize); newM[l][j] = newM[j][l]; } } } Tree res = null; // find root of the tree for (i = 1; i < num + 1; i++) { if (allTrees[i] != null) { res = allTrees[i]; } } return res; } /** * Write the distance matrix out as a Cytoscape SIF file (stores the connections) and an EA file (stores the * distance measure of each connection) * * @param sifFileName The SIF filename * @param edgeAttributeFileName The edge attribute filename * @param distanceCutoff Don't output connections above this cutoff * @throws IOException If there is a problem writing either file * TODO: move this and other format writer methods into a utility class. Also create a method to copy this * TODO: information directly into Cytoscape from a Cytoscape plugin */ public void writeResultsToCytoscapeFormat(File sifFileName, File edgeAttributeFileName, double distanceCutoff) throws IOException { //initialization ArrayList labels = distanceMatrix.getLabels(); BufferedWriter brSIF = new BufferedWriter(new FileWriter(sifFileName)); BufferedWriter brEA = new BufferedWriter(new FileWriter(edgeAttributeFileName)); //create EA file header brEA.write("ClusterDistance"); brEA.newLine(); //convert distance matrix to Cytoscape format for (int i = 0; i < nelements; i++) { for (int j = i; j < nelements; j++) { if ((distanceMatrix.getValue(i, j) <= distanceCutoff) && (distanceMatrix.getValue(i, j) != 0.0)) { //create SIF file brSIF.write(labels.get(i) + "\tcl\t" + labels.get(j)); brSIF.newLine(); //create EA file brEA.write(labels.get(i) + " (cl) " + labels.get(j) + " = " + distanceMatrix.getValue(i, j)); brEA.newLine(); } } } brSIF.close(); brEA.close(); } /** * Saves the clustering results to GTR format for loading up into e.g Java Treeview * http://jtreeview.sourceforge.net * Column 1: NodeIdentifier * Column 2: Left child of node * Column 3: Right child of node * Column 4: Correlation between the left and right child */ public String writeResultsToGTRFormat() { StringBuffer sb = new StringBuffer(); String lineSep = System.getProperty("line.separator"); for (int i = 0; i < result.length; i++) { //node identifier sb.append("NODE" + (i + 1) + "X" + "\t"); //left child sb.append((result[i][0] >= 0 ? "GENE" + Math.abs(result[i][0]) : "NODE" + Math.abs(result[i][0])) + "X" + "\t"); //right child sb.append(((result[i][1] >= 0) ? "GENE" + Math.abs(result[i][1]) : "NODE" + Math.abs(result[i][1])) + "X" + "\t"); sb.append((1 - linkDistance[i]) + lineSep); } return sb.toString(); } /** * Used to store label list / color pairs for highlighting output of CDT files */ private class LabelColorPair { private String color; private ArrayList labels; public LabelColorPair(String color, ArrayList labels) { this.color = color; this.labels = labels; } public String getColor() { return color; } public void setColor(String color) { this.color = color; } public ArrayList getLabels() { return labels; } public void setLabels(ArrayList labels) { this.labels = labels; } } /** * Set a list of labels to highlight in the CDT file output * Labels should not overlap previously set labels * If a single label name is set to be highlighted in two or more colors, the behavior is undefined. * * @param labelsToHighlight The list of label Strings to highlight * @param color The color to highlight in the typical HTML format e.g. #FFFFFF=white, #FFFF00=yellow * Background color will always be white. */ public void setLabelHighlightInCDTOutput(ArrayList labelsToHighlight, String color) { if (labelHighlight == null) { labelHighlight = new ArrayList(); } LabelColorPair lcp = new LabelColorPair(color, labelsToHighlight); labelHighlight.add(lcp); } /** * Saves the distance matrix as a CDT file suitable for loading into a tree/cluster viewing program like jtreeview * Elements are highlighted in according to colors set by the setLabelHighlightInCDTOutput method */ public String toCDTString() { StringBuffer sb = new StringBuffer(); String lineSep = System.getProperty("line.separator"); //header line 1 if (labelHighlight != null) { sb.append("GID\tUNIQID\tNAME\tBGCOLOR\tGWEIGHT"); } else { sb.append("GID\tUNIQID\tNAME\tGWEIGHT"); } ArrayList labels = distanceMatrix.getLabels(); for (int i = 0; i < labels.size(); i++) { int indexi = leafOrder[i]; sb.append("\t" + (String) labels.get(indexi)); } sb.append(lineSep); //header line 2 sb.append("EWEIGHT\t\t\t"); for (int i = 0; i < nelements; i++) { sb.append("\t1.000000"); } sb.append(lineSep); //rows for (int i = 0; i < nelements; i++) { int indexi = leafOrder[i]; sb.append("GENE" + indexi + "X\t" + labels.get(indexi) + "\t" + labels.get(indexi) + "\t"); if (labelHighlight != null) { //go through labelHighlight list to check for a label highlight color boolean found = false; for (int j = 0; j < labelHighlight.size(); j++) { LabelColorPair labelColorPair = (LabelColorPair) labelHighlight.get(j); ArrayList labelsToHighlight = labelColorPair.getLabels(); if (labelsToHighlight.contains(labels.get(indexi))) { found = true; sb.append(labelColorPair.getColor()); break; } } if (!found) { //output the default sb.append("#FFFFFF"); } } sb.append("\t1.000000"); for (int j = 0; j < nelements; j++) { int indexj = leafOrder[j]; sb.append("\t" + distanceMatrix.getValue(indexi, indexj)); } sb.append(lineSep); } return sb.toString(); } }