package org.seqcode.math.stats; public class InfoTheoryUtil { // the symbols in the sequence are coded as 0...numStates-1 // the entropy if returned in nats // p is a vector to calculate probabilities public static double discreteEntropy(int[] sequence,int[] p) { double v = 0.0; double vt = 0.0; int i = 0; int numStates = p.length; for (i=0;i<numStates;i++) { p[i] = 0; } for (i=0;i<sequence.length;i++) { p[sequence[i]]++; } for (i=0;i<numStates;i++) { if (p[i] > 0) { vt = ((double) p[i])/((double) sequence.length); v = v - vt*Math.log(vt); } } if (v <= 0.0) { v = 0.0; } return v; } public static double discreteEntropy(int[] sequence,int numStates) { int[] p = new int[numStates]; return discreteEntropy(sequence,p); } // the sequences must be of the same length // returns the mutual information in nats public static double discreteMutualInformation(int[] sequenceX,int[] sequenceY,int[] pX, int[] pY, int[][] pXY) { double v = 0.0; double vt = 0.0; int i = 0; int j = 0; int numStatesX = pX.length; int numStatesY = pY.length; v = v + discreteEntropy(sequenceX,pX); v = v + discreteEntropy(sequenceY,pY); for (i=0;i<numStatesX;i++) { for (j=0;j<numStatesY;j++) { pXY[i][j] = 0; } } for (i=0;i<sequenceX.length;i++) { pXY[sequenceX[i]][sequenceY[i]]++; } for (i=0;i<numStatesX;i++) { for (j=0;j<numStatesY;j++) { if (pXY[i][j] > 0) { vt = ((double) pXY[i][j])/((double) sequenceX.length); v = v + vt*Math.log(vt); } } } if (v <= 0.0) { v = 0.0; } return v; } public static double discreteMutualInformation(int[] sequenceX, int[] sequenceY,int numStates) { int[] pX = new int[numStates]; int[] pY = new int[numStates]; int[][] pXY = new int[numStates][numStates]; return discreteMutualInformation(sequenceX,sequenceY,pX,pY,pXY); } // zeros are treated specially (essentially as missing values) - only entries for which // one sequence has non-zero entries are considered public static double discreteMutualInformationOverlap(int[] sequenceX,int[] sequenceY,int[] pX, int[] pY, int[][] pXY) { double v = 0.0; double vt = 0.0; int i = 0; int j = 0; int numStatesX = pX.length; int numStatesY = pY.length; int seqL = 0; for (i=0;i<sequenceX.length;i++) { if (sequenceX[i] > 0 | sequenceY[i] > 0) { seqL++; pX[sequenceX[i]]++; pY[sequenceY[i]]++; pXY[sequenceX[i]][sequenceY[i]]++; } } // calculate entropy of X for (i=0;i<numStatesX;i++) { if (pX[i] > 0) { vt = ((double) pX[i])/((double) seqL); v = v - vt*Math.log(vt); } } // calculate entropy of Y for (i=0;i<numStatesY;i++) { if (pY[i] > 0) { vt = ((double) pY[i])/((double) seqL); v = v - vt*Math.log(vt); } } // calculate joint entropy for (i=0;i<numStatesX;i++) { for (j=0;j<numStatesY;j++) { if (pXY[i][j] > 0) { vt = ((double) pXY[i][j])/((double) seqL); v = v + vt*Math.log(vt); } } } if (v <= 0.0) { v = 0.0; } return v; } public static double discreteMutualInformationOverlap(int[] sequenceX, int[] sequenceY,int numStates) { int[] pX = new int[numStates]; int[] pY = new int[numStates]; int[][] pXY = new int[numStates][numStates]; return discreteMutualInformationOverlap(sequenceX,sequenceY,pX,pY,pXY); } }