// PCA.java
/*
* Licensed to the Hipparchus project under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The Hipparchus project licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.hipparchus.stat.projection;
import org.hipparchus.exception.MathIllegalStateException;
import org.hipparchus.linear.EigenDecompositionSymmetric;
import org.hipparchus.linear.MatrixUtils;
import org.hipparchus.linear.RealMatrix;
import org.hipparchus.stat.LocalizedStatFormats;
import org.hipparchus.stat.StatUtils;
import org.hipparchus.stat.correlation.Covariance;
import org.hipparchus.stat.descriptive.moment.StandardDeviation;
/**
 * Principal component analysis (PCA) is a statistical technique for reducing the dimensionality of a dataset.
 * <a href="https://en.wikipedia.org/wiki/Principal_component_analysis">PCA</a> can be thought of as a
 * projection or scaling of the data to reduce the number of dimensions but done in a way
 * that preserves as much information as possible.
 * <p>
 * An instance has to be fitted, using either {@link #fit(double[][])} or
 * {@link #fitAndTransform(double[][])}, before {@link #transform(double[][])} or the
 * model accessors can be called; calling them earlier raises a
 * {@link MathIllegalStateException}.
 * </p>
 * @since 3.0
 */
public class PCA {

    /**
     * The number of components (reduced dimensions) for this projection.
     */
    private final int numC;

    /**
     * Whether to scale (standardize) the input data as well as center (normalize).
     */
    private final boolean scale;

    /**
     * Whether to correct for bias when standardizing. Ignored when only centering.
     */
    private final boolean biasCorrection;

    /**
     * The by column (feature) averages (means) from the fitted data,
     * {@code null} as long as no fit has succeeded.
     */
    private double[] center;

    /**
     * The by column (feature) standard deviations from the fitted data,
     * only populated when scaling is enabled.
     */
    private double[] std;

    /**
     * The eigenvalues (variance) of our projection model.
     */
    private double[] eigenValues;

    /**
     * The eigenvectors (components) of our projection model, one column per component.
     */
    private RealMatrix principalComponents;

    /**
     * Utility class when scaling, {@code null} when scaling is disabled.
     */
    private final StandardDeviation sd;

    /**
     * Create a PCA with the ability to adjust scaling parameters.
     *
     * @param numC the number of components
     * @param scale whether to also scale (correlation) rather than just center (covariance)
     * @param biasCorrection whether to adjust for bias when scaling
     */
    public PCA(int numC, boolean scale, boolean biasCorrection) {
        this.numC           = numC;
        this.scale          = scale;
        this.biasCorrection = biasCorrection;
        this.sd             = scale ? new StandardDeviation(biasCorrection) : null;
    }

    /**
     * A default PCA will center but not scale.
     *
     * @param numC the number of components
     */
    public PCA(int numC) {
        this(numC, false, true);
    }

    /** Get number of components.
     * @return the number of components
     */
    public int getNumComponents() {
        return numC;
    }

    /** Check whether scaling (correlation) or no scaling (covariance) is used.
     * @return whether scaling (correlation) or no scaling (covariance) is used
     */
    public boolean isScale() {
        return scale;
    }

    /** Check whether scaling (correlation), if in use, adjusts for bias.
     * @return whether scaling (correlation), if in use, adjusts for bias
     */
    public boolean isBiasCorrection() {
        return biasCorrection;
    }

    /** Get principal component variances.
     * <p>
     * The returned array is a copy of all the eigenvalues provided by the
     * decomposition (one per feature of the fitted data), not only the
     * first {@link #getNumComponents()} ones.
     * </p>
     * @return the principal component variances, ordered from largest to smallest, which are the eigenvalues of the covariance or correlation matrix of the fitted data
     * @exception MathIllegalStateException if the model has not been fitted yet
     */
    public double[] getVariance() {
        validateState("getVariance");
        return eigenValues.clone();
    }

    /** Get by column center (or mean) of the fitted data.
     * @return a copy of the by column center (or mean) of the fitted data
     * @exception MathIllegalStateException if the model has not been fitted yet
     */
    public double[] getCenter() {
        validateState("getCenter");
        return center.clone();
    }

    /**
     * Returns the principal components of our projection model.
     * These are the eigenvectors of our covariance/correlation matrix,
     * stored as columns (one row per feature, one column per component).
     *
     * @return the principal components
     * @exception MathIllegalStateException if the model has not been fitted yet
     */
    public double[][] getComponents() {
        validateState("getComponents");
        return principalComponents.getData();
    }

    /**
     * Fit our model to the data and then transform it to the reduced dimensions.
     * <p>
     * If fitting fails with an exception, the state of this instance is left
     * as it was before the call.
     * </p>
     *
     * @param data the input data, one row per sample and one column per feature
     * @return the transformed data, one row per sample and one column per component
     */
    public double[][] fitAndTransform(double[][] data) {
        // the normalized matrix built while fitting is reused for the projection
        final RealMatrix normalizedM = fitModel(data);
        return normalizedM.multiply(principalComponents).getData();
    }

    /**
     * Transform the supplied data using our projection model.
     *
     * @param data the input data, one row per sample and one column per feature
     * @return the transformed data, one row per sample and one column per component
     * @exception MathIllegalStateException if the model has not been fitted yet
     */
    public double[][] transform(double[][] data) {
        validateState("transform");
        return normalize(data, center, std).multiply(principalComponents).getData();
    }

    /**
     * Fit our model to the data, ready for subsequent transforms.
     * <p>
     * If fitting fails with an exception, the state of this instance is left
     * as it was before the call.
     * </p>
     *
     * @param data the input data, one row per sample and one column per feature
     * @return this
     */
    public PCA fit(double[][] data) {
        fitModel(data);
        return this;
    }

    /** Check if the state allows an operation to be performed.
     * @param from name of the operation
     * @exception MathIllegalStateException if the state does not allow the operation
     */
    private void validateState(String from) {
        // center is only ever assigned together with the rest of the model,
        // so it is a sufficient marker for "has been fitted"
        if (center == null) {
            throw new MathIllegalStateException(LocalizedStatFormats.ILLEGAL_STATE_PCA, from);
        }
    }

    /** Compute the normalization parameters, eigenvalues and principal components.
     * <p>
     * Everything is first computed in local variables and stored in the
     * instance only once all steps succeeded, so a failure (for example a
     * number of components larger than the number of features) can never
     * leave a partially updated model behind.
     * </p>
     * @param data the input data, one row per sample and one column per feature
     * @return the normalized (or standardized) version of the input data
     */
    private RealMatrix fitModel(final double[][] data) {

        final int numS = data.length;
        final int numF = data[0].length;

        // by column normalization parameters
        final double[] newCenter = new double[numF];
        final double[] newStd    = scale ? new double[numF] : null;
        final double[] column    = new double[numS];
        for (int f = 0; f < numF; f++) {
            for (int s = 0; s < numS; s++) {
                column[s] = data[s][f];
            }
            newCenter[f] = StatUtils.mean(column);
            if (scale) {
                newStd[f] = sd.evaluate(column, newCenter[f]);
            }
        }

        // eigen decomposition of the covariance of the normalized data
        final RealMatrix normalizedM = normalize(data, newCenter, newStd);
        final RealMatrix covarianceM = new Covariance(normalizedM).getCovarianceMatrix();
        final EigenDecompositionSymmetric decomposition = new EigenDecompositionSymmetric(covarianceM);
        final double[] newEigenValues = decomposition.getEigenvalues();

        // keep the first numC eigenvectors, one per column;
        // each eigenvector is fetched only once
        final RealMatrix newComponents = MatrixUtils.createRealMatrix(newEigenValues.length, numC);
        for (int c = 0; c < numC; c++) {
            newComponents.setColumnVector(c, decomposition.getEigenvector(c));
        }

        // everything succeeded, commit the new model
        center              = newCenter;
        std                 = newStd;
        eigenValues         = newEigenValues;
        principalComponents = newComponents;

        return normalizedM;

    }

    /**
     * This will either normalize (center) or
     * standardize (center plus scale) the input data.
     * <p>
     * The number of features is taken from the first row of the input.
     * </p>
     *
     * @param input the input data, one row per sample and one column per feature
     * @param mean the by column means to subtract
     * @param deviation the by column standard deviations to divide by
     * (only used, and only required to be non-null, when scaling is enabled)
     * @return the normalized (or standardized) matrix
     */
    private RealMatrix normalize(final double[][] input, final double[] mean, final double[] deviation) {
        final int numS = input.length;
        final int numF = input[0].length;
        final double[][] normalized = new double[numS][numF];
        for (int s = 0; s < numS; s++) {
            final double[] row    = input[s];
            final double[] target = normalized[s];
            for (int f = 0; f < numF; f++) {
                final double centered = row[f] - mean[f];
                target[f] = scale ? centered / deviation[f] : centered;
            }
        }
        return MatrixUtils.createRealMatrix(normalized);
    }

}