package com.jujutsu.tsne; /* * Copyright (c) 2009-2014, Peter Abeles. All Rights Reserved. * * This file is part of Efficient Java Matrix Library (EJML). * * Adapted by 2014, Leif Jonsson, added pca method. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.ejml.data.DenseMatrix64F; import org.ejml.factory.DecompositionFactory; import org.ejml.interfaces.decomposition.SingularValueDecomposition; import org.ejml.ops.CommonOps; import org.ejml.ops.NormOps; import org.ejml.ops.SingularOps; /** * <p> * The following is a simple example of how to perform basic principal component analysis in EJML. * </p> * * <p> * Principal Component Analysis (PCA) is typically used to develop a linear model for a set of data * (e.g. face images) which can then be used to test for membership. PCA works by converting the * set of data to a new basis that is a subspace of the original set. The subspace is selected * to maximize information. * </p> * <p> * PCA is typically derived as an eigenvalue problem. However in this implementation {@link org.ejml.interfaces.decomposition.SingularValueDecomposition SVD} * is used instead because it will produce a more numerically stable solution. Computation using EVD requires explicitly * computing the variance of each sample set. The variance is computed by squaring the residual, which can * cause loss of precision. * </p> * * <p> * Usage:<br> * 1) call setup()<br> * 2) For each sample (e.g. an image ) call addSample()<br> * 3) After all the samples have been added call computeBasis()<br> * 4) Call sampleToEigenSpace() , eigenToSampleSpace() , errorMembership() , response() * </p> * * @author Peter Abeles */ public class PrincipalComponentAnalysis { // principal component subspace is stored in the rows private DenseMatrix64F V_t; // how many principal components are used private int numComponents; // where the data is stored private DenseMatrix64F A = new DenseMatrix64F(1,1); private int sampleIndex; // mean values of each element across all the samples double mean[]; public PrincipalComponentAnalysis() { } /** * Must be called before any other functions. Declares and sets up internal data structures. * * @param numSamples Number of samples that will be processed. * @param sampleSize Number of elements in each sample. */ public void setup( int numSamples , int sampleSize ) { mean = new double[ sampleSize ]; A.reshape(numSamples,sampleSize,false); sampleIndex = 0; numComponents = -1; } /** * Adds a new sample of the raw data to internal data structure for later processing. All the samples * must be added before computeBasis is called. * * @param sampleData Sample from original raw data. */ public void addSample( double[] sampleData ) { if( A.getNumCols() != sampleData.length ) throw new IllegalArgumentException("Unexpected sample size"); if( sampleIndex >= A.getNumRows() ) throw new IllegalArgumentException("Too many samples"); for( int i = 0; i < sampleData.length; i++ ) { A.set(sampleIndex,i,sampleData[i]); } sampleIndex++; } /** * Computes a basis (the principal components) from the most dominant eigenvectors. * * @param numComponents Number of vectors it will use to describe the data. Typically much * smaller than the number of elements in the input vector. */ public void computeBasis( int numComponents ) { if( numComponents > A.getNumCols() ) throw new IllegalArgumentException("More components requested that the data's length."); if( sampleIndex != A.getNumRows() ) throw new IllegalArgumentException("Not all the data has been added"); if( numComponents > sampleIndex ) throw new IllegalArgumentException("More data needed to compute the desired number of components"); this.numComponents = numComponents; // compute the mean of all the samples for( int i = 0; i < A.getNumRows(); i++ ) { for( int j = 0; j < mean.length; j++ ) { mean[j] += A.get(i,j); } } for( int j = 0; j < mean.length; j++ ) { mean[j] /= A.getNumRows(); } // subtract the mean from the original data for( int i = 0; i < A.getNumRows(); i++ ) { for( int j = 0; j < mean.length; j++ ) { A.set(i,j,A.get(i,j)-mean[j]); } } // Compute SVD and save time by not computing U SingularValueDecomposition<DenseMatrix64F> svd = DecompositionFactory.svd(A.numRows, A.numCols, false, true, false); if( !svd.decompose(A) ) throw new RuntimeException("SVD failed"); V_t = svd.getV(null,true); DenseMatrix64F W = svd.getW(null); // Singular values are in an arbitrary order initially SingularOps.descendingOrder(null,false,W,V_t,true); // strip off unneeded components and find the basis V_t.reshape(numComponents,mean.length,true); } /** * Returns a vector from the PCA's basis. * * @param which Which component's vector is to be returned. * @return Vector from the PCA basis. */ public double[] getBasisVector( int which ) { if( which < 0 || which >= numComponents ) throw new IllegalArgumentException("Invalid component"); DenseMatrix64F v = new DenseMatrix64F(1,A.numCols); CommonOps.extract(V_t,which,which+1,0,A.numCols,v,0,0); return v.data; } /** * Converts a vector from sample space into eigen space. * * @param sampleData Sample space data. * @return Eigen space projection. */ public double[] sampleToEigenSpace( double[] sampleData ) { if( sampleData.length != A.getNumCols() ) throw new IllegalArgumentException("Unexpected sample length"); DenseMatrix64F mean = DenseMatrix64F.wrap(A.getNumCols(),1,this.mean); DenseMatrix64F s = new DenseMatrix64F(A.getNumCols(),1,true,sampleData); DenseMatrix64F r = new DenseMatrix64F(numComponents,1); CommonOps.subtract(s, mean, s); CommonOps.mult(V_t,s,r); return r.data; } /** * Converts a vector from eigen space into sample space. * * @param eigenData Eigen space data. * @return Sample space projection. */ public double[] eigenToSampleSpace( double[] eigenData ) { if( eigenData.length != numComponents ) throw new IllegalArgumentException("Unexpected sample length"); DenseMatrix64F s = new DenseMatrix64F(A.getNumCols(),1); DenseMatrix64F r = DenseMatrix64F.wrap(numComponents,1,eigenData); CommonOps.multTransA(V_t,r,s); DenseMatrix64F mean = DenseMatrix64F.wrap(A.getNumCols(),1,this.mean); CommonOps.add(s,mean,s); return s.data; } /** * <p> * The membership error for a sample. If the error is less than a threshold then * it can be considered a member. The threshold's value depends on the data set. * </p> * <p> * The error is computed by projecting the sample into eigenspace then projecting * it back into sample space and * </p> * * @param sampleA The sample whose membership status is being considered. * @return Its membership error. */ public double errorMembership( double[] sampleA ) { double[] eig = sampleToEigenSpace(sampleA); double[] reproj = eigenToSampleSpace(eig); double total = 0; for( int i = 0; i < reproj.length; i++ ) { double d = sampleA[i] - reproj[i]; total += d*d; } return Math.sqrt(total); } /** * Computes the dot product of each basis vector against the sample. Can be used as a measure * for membership in the training sample set. High values correspond to a better fit. * * @param sample Sample of original data. * @return Higher value indicates it is more likely to be a member of input dataset. */ public double response( double[] sample ) { if( sample.length != A.numCols ) throw new IllegalArgumentException("Expected input vector to be in sample space"); DenseMatrix64F dots = new DenseMatrix64F(numComponents,1); DenseMatrix64F s = DenseMatrix64F.wrap(A.numCols,1,sample); CommonOps.mult(V_t,s,dots); return NormOps.normF(dots); } public double [][] pca(double [][]matrix, int no_dims) { double [][] trafoed = new double[matrix.length][matrix[0].length]; setup(matrix.length, matrix[0].length); for (int i = 0; i < matrix.length; i++) { addSample(matrix[i]); } computeBasis(no_dims); for (int i = 0; i < matrix.length; i++) { trafoed[i] = sampleToEigenSpace(matrix[i]); for (int j = 0; j < trafoed[i].length; j++) { trafoed[i][j] *= -1; } } return trafoed; } }