View Javadoc
1   /*
2    * Licensed to the Hipparchus project under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The Hipparchus project licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.hipparchus.stat.projection;
18  
19  import org.hipparchus.exception.MathIllegalStateException;
20  import org.hipparchus.linear.EigenDecompositionSymmetric;
21  import org.hipparchus.linear.MatrixUtils;
22  import org.hipparchus.linear.RealMatrix;
23  import org.hipparchus.stat.LocalizedStatFormats;
24  import org.hipparchus.stat.StatUtils;
25  import org.hipparchus.stat.correlation.Covariance;
26  import org.hipparchus.stat.descriptive.moment.StandardDeviation;
27  
28  /**
29   * Principal component analysis (PCA) is a statistical technique for reducing the dimensionality of a dataset.
30   * <a href="https://en.wikipedia.org/wiki/Principal_component_analysis">PCA</a> can be thought of as a
31   * projection or scaling of the data to reduce the number of dimensions but done in a way
32   * that preserves as much information as possible.
33   * @since 3.0
34   */
35  public class PCA {
36      /**
37       * The number of components (reduced dimensions) for this projection.
38       */
39      private final int numC;
40  
41      /**
42       * Whether to scale (standardize) the input data as well as center (normalize).
43       */
44      private final boolean scale;
45  
46      /**
47       * Whether to correct for bias when standardizing. Ignored when only centering.
48       */
49      private final boolean biasCorrection;
50  
51      /**
52       * The by column (feature) averages (means) from the fitted data.
53       */
54      private double[] center;
55  
56      /**
57       * The by column (feature) standard deviates from the fitted data.
58       */
59      private double[] std;
60  
61      /**
62       * The eigenValues (variance) of our projection model.
63       */
64      private double[] eigenValues;
65  
66      /**
67       * The eigenVectors (components) of our projection model.
68       */
69      private RealMatrix principalComponents;
70  
71      /**
72       * Utility class when scaling.
73       */
74      private final StandardDeviation sd;
75  
76      /**
77       * Create a PCA with the ability to adjust scaling parameters.
78       *
79       * @param numC the number of components
80       * @param scale whether to also scale (correlation) rather than just center (covariance)
81       * @param biasCorrection whether to adjust for bias when scaling
82       */
83      public PCA(int numC, boolean scale, boolean biasCorrection) {
84          this.numC = numC;
85          this.scale = scale;
86          this.biasCorrection = biasCorrection;
87          sd = scale ? new StandardDeviation(biasCorrection) : null;
88      }
89  
90      /**
91       * A default PCA will center but not scale.
92       *
93       * @param numC the number of components
94       */
95      public PCA(int numC) {
96          this(numC, false, true);
97      }
98  
99      /** GEt number of components.
100      * @return the number of components
101      */
102     public int getNumComponents() {
103         return numC;
104     }
105 
106     /** Check whether scaling (correlation) or no scaling (covariance) is used.
107      * @return whether scaling (correlation) or no scaling (covariance) is used
108      */
109     public boolean isScale() {
110         return scale;
111     }
112 
113     /** Check whether scaling (correlation), if in use, adjusts for bias.
114      * @return whether scaling (correlation), if in use, adjusts for bias
115      */
116     public boolean isBiasCorrection() {
117         return biasCorrection;
118     }
119 
120     /** Get principal component variances.
121      * @return the principal component variances, ordered from largest to smallest, which are the eigenvalues of the covariance or correlation matrix of the fitted data
122      */
123     public double[] getVariance() {
124         validateState("getVariance");
125         return eigenValues.clone();
126     }
127 
128     /** Get by column center (or mean) of the fitted data.
129      * @return the by column center (or mean) of the fitted data
130      */
131     public double[] getCenter() {
132         validateState("getCenter");
133         return center.clone();
134     }
135 
136     /**
137      * Returns the principal components of our projection model.
138      * These are the eigenvectors of our covariance/correlation matrix.
139      *
140      * @return the principal components
141      */
142     public double[][] getComponents() {
143         validateState("getComponents");
144         return principalComponents.getData();
145     }
146 
147     /**
148      * Fit our model to the data and then transform it to the reduced dimensions.
149      *
150      * @param data the input data
151      * @return the fitted data
152      */
153     public double[][] fitAndTransform(double[][] data) {
154         center = null;
155         RealMatrix normalizedM = getNormalizedMatrix(data);
156         calculatePrincipalComponents(normalizedM);
157         return normalizedM.multiply(principalComponents).getData();
158     }
159 
160     /**
161      * Transform the supplied data using our projection model.
162      *
163      * @param data the input data
164      * @return the fitted data
165      */
166     public double[][] transform(double[][] data) {
167         validateState("transform");
168         RealMatrix normalizedM = getNormalizedMatrix(data);
169         return normalizedM.multiply(principalComponents).getData();
170     }
171 
172     /**
173      * Fit our model to the data, ready for subsequence transforms.
174      *
175      * @param data the input data
176      * @return this
177      */
178     public PCA fit(double[][] data) {
179         center = null;
180         RealMatrix normalized = getNormalizedMatrix(data);
181         calculatePrincipalComponents(normalized);
182         return this;
183     }
184 
185     /** Check if the state allows an operation to be performed.
186      * @param from name of the operation
187      * @exception MathIllegalStateException if the state does not allows operation
188      */
189     private void validateState(String from) {
190         if (center == null) {
191             throw new MathIllegalStateException(LocalizedStatFormats.ILLEGAL_STATE_PCA, from);
192         }
193 
194     }
195 
196     /** Compute eigenvalues and principal components.
197      * <p>
198      * The results are stored in the instance itself
199      * <p>
200      * @param normalizedM normalized matrix
201      */
202     private void calculatePrincipalComponents(RealMatrix normalizedM) {
203         RealMatrix covarianceM = new Covariance(normalizedM).getCovarianceMatrix();
204         EigenDecompositionSymmetric decomposition = new EigenDecompositionSymmetric(covarianceM);
205         eigenValues = decomposition.getEigenvalues();
206         principalComponents = MatrixUtils.createRealMatrix(eigenValues.length, numC);
207         for (int c = 0; c < numC; c++) {
208             for (int f = 0; f < eigenValues.length; f++) {
209                 principalComponents.setEntry(f, c, decomposition.getEigenvector(c).getEntry(f));
210             }
211         }
212     }
213 
214     /**
215      * This will either normalize (center) or
216      * standardize (center plus scale) the input data.
217      *
218      * @param input the input data
219      * @return the normalized (or standardized) matrix
220      */
221     private RealMatrix getNormalizedMatrix(double[][] input) {
222         int numS = input.length;
223         int numF = input[0].length;
224         boolean calculating = center == null;
225         if (calculating) {
226             center = new double[numF];
227             if (scale) {
228                 std = new double[numF];
229             }
230         }
231 
232         double[][] normalized = new double[numS][numF];
233         for (int f = 0; f < numF; f++) {
234             if (calculating) {
235                 calculateNormalizeParameters(input, numS, f);
236             }
237             for (int s = 0; s < numS; s++) {
238                 normalized[s][f] = input[s][f] - center[f];
239             }
240             if (scale) {
241                 for (int s = 0; s < numS; s++) {
242                     normalized[s][f] /= std[f];
243                 }
244             }
245         }
246 
247         return MatrixUtils.createRealMatrix(normalized);
248     }
249 
250     /** compute normalized parameters.
251      * @param input the input data
252      * @param numS number of data rows
253      * @param f index of the component
254      */
255     private void calculateNormalizeParameters(double[][] input, int numS, int f) {
256         double[] column = new double[numS];
257         for (int s = 0; s < numS; s++) {
258             column[s] = input[s][f];
259         }
260         center[f] = StatUtils.mean(column);
261         if (scale) {
262             std[f] = sd.evaluate(column, center[f]);
263         }
264     }
265 }