package gr.iti.mklab.visual.dimreduction; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import org.ejml.data.DenseMatrix64F; import org.ejml.factory.DecompositionFactory; import org.ejml.factory.SingularValueDecomposition; import org.ejml.ops.CommonOps; import org.ejml.ops.SingularOps; import gr.iti.mklab.visual.utilities.Normalization; /** * <p> * The following class uses EJML to perform basic principle component analysis. This class is a modification * of the * <a href="https://code.google.com/p/efficient-java-matrix-library/wiki/PrincipleComponentAnalysisExample" > * class</a> written by Peter Abeles. * </p> * <p> * PCA is typically derived as an eigenvalue problem. However in this implementation * {@link org.ejml.factory.SingularValueDecomposition SVD} is used since it can produce a more numerically * stable solution. Computation using EVD requires explicitly computing the variance of each sample set. The * variance is computed by squaring the residual, which can cause loss of precision. * </p> * <p> * The class was extended to allow simultaneous PCA projection and whitening. * </p> * * @author Peter Abeles * @author Elefterios Spyromitros-Xioufis */ public class PCA { /** how many principle components are used **/ private int numComponents; /** number of elements in each sample **/ private int sampleSize; /** number of samples that will be used for learning the PCA **/ private int numSamples; /** counts the number of currently loaded samples **/ private int sampleIndex; /** where the data is stored **/ private DenseMatrix64F A = new DenseMatrix64F(1, 1); /** mean values of each element across all samples **/ private DenseMatrix64F means; /** principle component subspace is stored in the rows **/ private DenseMatrix64F V_t; /** a diagonal matrix with the singular values, required if whitening is applied **/ private DenseMatrix64F W; /** whether to apply whitening */ private boolean doWhitening; /** whether to compute the SVD in compact form, false by default **/ private boolean compact = false; private boolean isPcaInitialized = false; /** * Constructor. Whitening is not applied. * * @param numComponents * Number of components it will use to describe each sample. Typically much smaller than the * number of elements in each sample. * @param numSamples * Number of samples that will be processed. Only required for allocating memory for the data * array when learning is performed. 1 can be used at projection time. * @param sampleSize * number of elements in each sample */ public PCA(int numComponents, int numSamples, int sampleSize) { this(numComponents, numSamples, sampleSize, false); } /** * Constructor. * * @param numComponents * Number of components it will use to describe each sample. Typically much smaller than the * number of elements in each sample. * @param numSamples * Number of samples that will be processed. Only required for allocating memory for the data * array when learning is performed. 1 can be used at projection time. * @param sampleSize * number of elements in each sample * @param doWhitening * whether to apply whitening */ public PCA(int numComponents, int numSamples, int sampleSize, boolean doWhitening) { if (numComponents > sampleSize) { throw new IllegalArgumentException("More components requested than the data's length."); } this.numComponents = numComponents; this.numSamples = numSamples; this.sampleSize = sampleSize; this.doWhitening = doWhitening; A.reshape(numSamples, sampleSize, false); sampleIndex = 0; } /** * Adds a new sample of the raw data to internal data structure for later processing. All the samples must * be added before computeBasis is called. * * @param sampleData * sample from original raw data */ public void addSample(double[] sampleData) { if (sampleIndex >= numSamples) throw new IllegalArgumentException("Too many samples"); if (sampleData.length != sampleSize) throw new IllegalArgumentException("Unexpected sample size"); for (int i = 0; i < sampleData.length; i++) { A.set(sampleIndex, i, sampleData[i]); } sampleIndex++; } /** * Computes a basis (the principle components) from the most dominant eigenvectors. */ public void computeBasis() { if (sampleIndex != numSamples) throw new IllegalArgumentException("Not all the data has been added"); if (numComponents > numSamples) throw new IllegalArgumentException( "More data needed to compute the desired number of components"); means = new DenseMatrix64F(sampleSize, 1); // compute the mean of all the samples for (int i = 0; i < numSamples; i++) { for (int j = 0; j < sampleSize; j++) { double val = means.get(j); means.set(j, val + A.get(i, j)); } } for (int j = 0; j < sampleSize; j++) { double avg = means.get(j) / numSamples; means.set(j, avg); } // subtract the mean from the original data for (int i = 0; i < numSamples; i++) { for (int j = 0; j < sampleSize; j++) { A.set(i, j, A.get(i, j) - means.get(j)); } } // compute SVD and save time by not computing U SingularValueDecomposition<DenseMatrix64F> svd = DecompositionFactory.svd(numSamples, sampleSize, false, true, compact); if (!svd.decompose(A)) throw new RuntimeException("SVD failed"); V_t = svd.getV(null, true); W = svd.getW(null); // singular values are in an arbitrary order initially and need to be sorted in descending order SingularOps.descendingOrder(null, false, W, V_t, true); // strip off unneeded components and find the basis V_t.reshape(numComponents, sampleSize, true); } /** * Converts a vector from sample space into eigen space. If {@link #doWhitening} is true, then the vector * is also L2 normalized after projection. * * @param sampleData * Sample space vector * @return Eigen space projected vector * @throws Exception */ public double[] sampleToEigenSpace(double[] sampleData) throws Exception { if (!isPcaInitialized) { // initiallize PCA correctly! throw new Exception("PCA is not correctly initiallized!"); } if (sampleData.length != sampleSize) { throw new IllegalArgumentException("Unexpected vector length!"); } DenseMatrix64F sample = new DenseMatrix64F(sampleSize, 1, true, sampleData); DenseMatrix64F projectedSample = new DenseMatrix64F(numComponents, 1); // mean subtraction CommonOps.sub(sample, means, sample); // projection CommonOps.mult(V_t, sample, projectedSample); // whitening if (doWhitening) { // always perform this normalization step when whitening is used return Normalization.normalizeL2(projectedSample.data); } else { return projectedSample.data; } } /** * Writes the means, the eigenvalues and the PCA matrix to a text file. The 1st row of the file contains * the training sample means per component, the 2nd row contains the eigenvalues in descending order and * subsequent rows contain contain the eigenvectors in descending eigenvalue order. * * @param PCAFileName * the PCA file * @throws Exception */ public void savePCAToFile(String PCAFileName) throws Exception { if (isPcaInitialized) { throw new Exception("Cannot save, PCA is initialized!"); } if (V_t == null) { throw new Exception("Cannot save to file, PCA matrix is null!"); } BufferedWriter out = new BufferedWriter(new FileWriter(PCAFileName)); // the first line of the file contains the training sample means per component for (int i = 0; i < sampleSize - 1; i++) { out.write(means.get(i) + " "); } out.write(means.get(sampleSize - 1) + "\n"); // the second line of the file contains the eigenvalues in descending order for (int i = 0; i < numComponents - 1; i++) { out.write(W.get(i, i) + " "); } out.write(W.get(numComponents - 1, numComponents - 1) + "\n"); // the next lines of the file contain the eigenvectors in descending eigenvalue order for (int i = 0; i < numComponents; i++) { for (int j = 0; j < sampleSize - 1; j++) { out.write(V_t.get(i, j) + " "); } out.write(V_t.get(i, sampleSize - 1) + "\n"); } out.close(); } /** * Loads the PCA matrix, means and eigenvalues matrix (if {@link #doWhitening} is true) from the given * file. The file is supposed to be generated by the {@link #savePCAToFile(String)} method. * * @param PCAFileName * the PCA file * @throws Exception */ public void loadPCAFromFile(String PCAFileName) throws Exception { // parse the PCA projection file and put the PCA components in a 2-d double array BufferedReader in = new BufferedReader(new FileReader(PCAFileName)); String line = ""; // parse the first line which contains the training sample means line = in.readLine(); String[] meanString = line.trim().split(" "); if (meanString.length != sampleSize) { throw new Exception("Means line is wrong!"); } means = new DenseMatrix64F(sampleSize, 1); for (int j = 0; j < sampleSize; j++) { means.set(j, Double.parseDouble(meanString[j])); } // parse the first line which contains the eigenvalues and initialize the diagonal eigenvalue matrix W line = in.readLine(); if (doWhitening) { String[] eigenvaluesString = line.trim().split(" "); if (eigenvaluesString.length < numComponents) { throw new Exception("Eigenvalues line is wrong!"); } W = new DenseMatrix64F(numComponents, numComponents); for (int i = 0; i < numComponents; i++) { // transform the eigenValues double eigenvalue = Double.parseDouble(eigenvaluesString[i]); eigenvalue = Math.pow(eigenvalue, -0.5); W.set(i, i, eigenvalue); } } V_t = new DenseMatrix64F(numComponents, sampleSize); for (int i = 0; i < numComponents; i++) { if (i % 100 == 0) { System.out.println(i + " PCA components loaded."); } try { line = in.readLine(); } catch (IOException e) { throw new Exception( "Check whether the given PCA matrix contains the correct number of components!"); } String[] componentString = line.trim().split(" "); for (int j = 0; j < sampleSize; j++) { double componentElement = Double.parseDouble(componentString[j]); V_t.set(i, j, componentElement); } } // if whitening is true then whiten the PCA matrix V_t by multiplying it with W if (doWhitening) { System.out.print("Whitening the PCA matrix.."); DenseMatrix64F V_t_w = new DenseMatrix64F(numComponents, sampleSize); CommonOps.mult(W, V_t, V_t_w); V_t = V_t_w; } in.close(); isPcaInitialized = true; } public void setCompact(boolean compact) { this.compact = compact; } }