View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  /*
19   * This is not the original file distributed by the Apache Software Foundation
20   * It has been modified by the Hipparchus project
21   */
22  package org.hipparchus.stat.correlation;
23  
24  import org.hipparchus.distribution.continuous.TDistribution;
25  import org.hipparchus.exception.LocalizedCoreFormats;
26  import org.hipparchus.exception.MathIllegalArgumentException;
27  import org.hipparchus.linear.BlockRealMatrix;
28  import org.hipparchus.linear.RealMatrix;
29  import org.hipparchus.stat.LocalizedStatFormats;
30  import org.hipparchus.stat.regression.SimpleRegression;
31  import org.hipparchus.util.FastMath;
32  import org.hipparchus.util.MathArrays;
33  import org.hipparchus.util.MathUtils;
34  
35  /**
36   * Computes Pearson's product-moment correlation coefficients for pairs of arrays
37   * or columns of a matrix.
38   * <p>
39   * The constructors that take <code>RealMatrix</code> or
40   * <code>double[][]</code> arguments generate correlation matrices.  The
41   * columns of the input matrices are assumed to represent variable values.
42   * Correlations are given by the formula:
43   * <p>
44   * <code>cor(X, Y) = &Sigma;[(x<sub>i</sub> - E(X))(y<sub>i</sub> - E(Y))] / [(n - 1)s(X)s(Y)]</code>
45   * <p>
46   * where <code>E(X)</code> is the mean of <code>X</code>, <code>E(Y)</code>
47   * is the mean of the <code>Y</code> values and s(X), s(Y) are standard deviations.
48   * <p>
49   * To compute the correlation coefficient for a single pair of arrays, use {@link #PearsonsCorrelation()}
50   * to construct an instance with no data and then {@link #correlation(double[], double[])}.
51   * Correlation matrices can also be computed directly from an instance with no data using
52   * {@link #computeCorrelationMatrix(double[][])}. In order to use {@link #getCorrelationMatrix()},
53   * {@link #getCorrelationPValues()},  or {@link #getCorrelationStandardErrors()}; however, one of the
54   * constructors supplying data or a covariance matrix must be used to create the instance.
55   */
56  public class PearsonsCorrelation {
57  
58      /** correlation matrix */
59      private final RealMatrix correlationMatrix;
60  
61      /** number of observations */
62      private final int nObs;
63  
64      /**
65       * Create a PearsonsCorrelation instance without data.
66       */
67      public PearsonsCorrelation() {
68          super();
69          correlationMatrix = null;
70          nObs = 0;
71      }
72  
73      /**
74       * Create a PearsonsCorrelation from a rectangular array
75       * whose columns represent values of variables to be correlated.
76       *
77       * Throws MathIllegalArgumentException if the input array does not have at least
78       * two columns and two rows.  Pairwise correlations are set to NaN if one
79       * of the correlates has zero variance.
80       *
81       * @param data rectangular array with columns representing variables
82       * @throws MathIllegalArgumentException if the input data array is not
83       * rectangular with at least two rows and two columns.
84       * @see #correlation(double[], double[])
85       */
86      public PearsonsCorrelation(double[][] data) {
87          this(new BlockRealMatrix(data));
88      }
89  
90      /**
91       * Create a PearsonsCorrelation from a RealMatrix whose columns
92       * represent variables to be correlated.
93       *
94       * Throws MathIllegalArgumentException if the matrix does not have at least
95       * two columns and two rows.  Pairwise correlations are set to NaN if one
96       * of the correlates has zero variance.
97       *
98       * @param matrix matrix with columns representing variables to correlate
99       * @throws MathIllegalArgumentException if the matrix does not contain sufficient data
100      * @see #correlation(double[], double[])
101      */
102     public PearsonsCorrelation(RealMatrix matrix) {
103         nObs = matrix.getRowDimension();
104         correlationMatrix = computeCorrelationMatrix(matrix);
105     }
106 
107     /**
108      * Create a PearsonsCorrelation from a {@link Covariance}.  The correlation
109      * matrix is computed by scaling the Covariance's covariance matrix.
110      * The Covariance instance must have been created from a data matrix with
111      * columns representing variable values.
112      *
113      * @param covariance Covariance instance
114      */
115     public PearsonsCorrelation(Covariance covariance) {
116         RealMatrix covarianceMatrix = covariance.getCovarianceMatrix();
117         MathUtils.checkNotNull(covarianceMatrix, LocalizedStatFormats.COVARIANCE_MATRIX);
118         nObs = covariance.getN();
119         correlationMatrix = covarianceToCorrelation(covarianceMatrix);
120     }
121 
122     /**
123      * Create a PearsonsCorrelation from a covariance matrix. The correlation
124      * matrix is computed by scaling the covariance matrix.
125      *
126      * @param covarianceMatrix covariance matrix
127      * @param numberOfObservations the number of observations in the dataset used to compute
128      * the covariance matrix
129      */
130     public PearsonsCorrelation(RealMatrix covarianceMatrix, int numberOfObservations) {
131         nObs = numberOfObservations;
132         correlationMatrix = covarianceToCorrelation(covarianceMatrix);
133     }
134 
135     /**
136      * Returns the correlation matrix.
137      *
138      * <p>This method will return null if the argumentless constructor was used
139      * to create this instance, even if {@link #computeCorrelationMatrix(double[][])}
140      * has been called before it is activated.</p>
141      *
142      * @return correlation matrix
143      */
144     public RealMatrix getCorrelationMatrix() {
145         return correlationMatrix;
146     }
147 
148     /**
149      * Returns a matrix of standard errors associated with the estimates
150      * in the correlation matrix.<br>
151      * <code>getCorrelationStandardErrors().getEntry(i,j)</code> is the standard
152      * error associated with <code>getCorrelationMatrix.getEntry(i,j)</code>
153      *
154      * <p>The formula used to compute the standard error is <br>
155      * <code>SE<sub>r</sub> = ((1 - r<sup>2</sup>) / (n - 2))<sup>1/2</sup></code>
156      * where <code>r</code> is the estimated correlation coefficient and
157      * <code>n</code> is the number of observations in the source dataset.</p>
158      *
159      * <p>To use this method, one of the constructors that supply an input
160      * matrix must have been used to create this instance.</p>
161      *
162      * @return matrix of correlation standard errors
163      * @throws NullPointerException if this instance was created with no data
164      */
165     public RealMatrix getCorrelationStandardErrors() {
166         int nVars = correlationMatrix.getColumnDimension();
167         double[][] out = new double[nVars][nVars];
168         for (int i = 0; i < nVars; i++) {
169             for (int j = 0; j < nVars; j++) {
170                 double r = correlationMatrix.getEntry(i, j);
171                 out[i][j] = FastMath.sqrt((1 - r * r) /(nObs - 2));
172             }
173         }
174         return new BlockRealMatrix(out);
175     }
176 
177     /**
178      * Returns a matrix of p-values associated with the (two-sided) null
179      * hypothesis that the corresponding correlation coefficient is zero.
180      *
181      * <p><code>getCorrelationPValues().getEntry(i,j)</code> is the probability
182      * that a random variable distributed as <code>t<sub>n-2</sub></code> takes
183      * a value with absolute value greater than or equal to <br>
184      * <code>|r|((n - 2) / (1 - r<sup>2</sup>))<sup>1/2</sup></code></p>
185      *
186      * <p>The values in the matrix are sometimes referred to as the
187      * <i>significance</i> of the corresponding correlation coefficients.</p>
188      *
189      * <p>To use this method, one of the constructors that supply an input
190      * matrix must have been used to create this instance.</p>
191      *
192      * @return matrix of p-values
193      * @throws org.hipparchus.exception.MathIllegalStateException
194      * if an error occurs estimating probabilities
195      * @throws NullPointerException if this instance was created with no data
196      */
197     public RealMatrix getCorrelationPValues() {
198         TDistribution tDistribution = new TDistribution(nObs - 2);
199         int nVars = correlationMatrix.getColumnDimension();
200         double[][] out = new double[nVars][nVars];
201         for (int i = 0; i < nVars; i++) {
202             for (int j = 0; j < nVars; j++) {
203                 if (i == j) {
204                     out[i][j] = 0d;
205                 } else {
206                     double r = correlationMatrix.getEntry(i, j);
207                     double t = FastMath.abs(r * FastMath.sqrt((nObs - 2)/(1 - r * r)));
208                     out[i][j] = 2 * tDistribution.cumulativeProbability(-t);
209                 }
210             }
211         }
212         return new BlockRealMatrix(out);
213     }
214 
215 
216     /**
217      * Computes the correlation matrix for the columns of the
218      * input matrix, using {@link #correlation(double[], double[])}.
219      *
220      * Throws MathIllegalArgumentException if the matrix does not have at least
221      * two columns and two rows.  Pairwise correlations are set to NaN if one
222      * of the correlates has zero variance.
223      *
224      * @param matrix matrix with columns representing variables to correlate
225      * @return correlation matrix
226      * @throws MathIllegalArgumentException if the matrix does not contain sufficient data
227      * @see #correlation(double[], double[])
228      */
229     public RealMatrix computeCorrelationMatrix(RealMatrix matrix) {
230         checkSufficientData(matrix);
231         int nVars = matrix.getColumnDimension();
232         RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars);
233         for (int i = 0; i < nVars; i++) {
234             for (int j = 0; j < i; j++) {
235               double corr = correlation(matrix.getColumn(i), matrix.getColumn(j));
236               outMatrix.setEntry(i, j, corr);
237               outMatrix.setEntry(j, i, corr);
238             }
239             outMatrix.setEntry(i, i, 1d);
240         }
241         return outMatrix;
242     }
243 
244     /**
245      * Computes the correlation matrix for the columns of the
246      * input rectangular array.  The columns of the array represent values
247      * of variables to be correlated.
248      *
249      * Throws MathIllegalArgumentException if the matrix does not have at least
250      * two columns and two rows or if the array is not rectangular. Pairwise
251      * correlations are set to NaN if one of the correlates has zero variance.
252      *
253      * @param data matrix with columns representing variables to correlate
254      * @return correlation matrix
255      * @throws MathIllegalArgumentException if the array does not contain sufficient data
256      * @see #correlation(double[], double[])
257      */
258     public RealMatrix computeCorrelationMatrix(double[][] data) {
259        return computeCorrelationMatrix(new BlockRealMatrix(data));
260     }
261 
262     /**
263      * Computes the Pearson's product-moment correlation coefficient between two arrays.
264      *
265      * <p>Throws MathIllegalArgumentException if the arrays do not have the same length
266      * or their common length is less than 2.  Returns {@code NaN} if either of the arrays
267      * has zero variance (i.e., if one of the arrays does not contain at least two distinct
268      * values).</p>
269      *
270      * @param xArray first data array
271      * @param yArray second data array
272      * @return Returns Pearson's correlation coefficient for the two arrays
273      * @throws MathIllegalArgumentException if the arrays lengths do not match
274      * @throws MathIllegalArgumentException if there is insufficient data
275      */
276     public double correlation(final double[] xArray, final double[] yArray) {
277         MathArrays.checkEqualLength(xArray, yArray);
278         if (xArray.length < 2) {
279             throw new MathIllegalArgumentException(LocalizedCoreFormats.INSUFFICIENT_DIMENSION,
280                                                    xArray.length, 2);
281         }
282 
283         SimpleRegression regression = new SimpleRegression();
284         for(int i = 0; i < xArray.length; i++) {
285             regression.addData(xArray[i], yArray[i]);
286         }
287         return regression.getR();
288     }
289 
290     /**
291      * Derives a correlation matrix from a covariance matrix.
292      *
293      * <p>Uses the formula <br>
294      * <code>r(X,Y) = cov(X,Y)/s(X)s(Y)</code> where
295      * <code>r(&middot;,&middot;)</code> is the correlation coefficient and
296      * <code>s(&middot;)</code> means standard deviation.</p>
297      *
298      * @param covarianceMatrix the covariance matrix
299      * @return correlation matrix
300      */
301     public RealMatrix covarianceToCorrelation(RealMatrix covarianceMatrix) {
302         int nVars = covarianceMatrix.getColumnDimension();
303         RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars);
304         for (int i = 0; i < nVars; i++) {
305             double sigma = FastMath.sqrt(covarianceMatrix.getEntry(i, i));
306             outMatrix.setEntry(i, i, 1d);
307             for (int j = 0; j < i; j++) {
308                 double entry = covarianceMatrix.getEntry(i, j) /
309                        (sigma * FastMath.sqrt(covarianceMatrix.getEntry(j, j)));
310                 outMatrix.setEntry(i, j, entry);
311                 outMatrix.setEntry(j, i, entry);
312             }
313         }
314         return outMatrix;
315     }
316 
317     /**
318      * Throws MathIllegalArgumentException if the matrix does not have at least
319      * two columns and two rows.
320      *
321      * @param matrix matrix to check for sufficiency
322      * @throws MathIllegalArgumentException if there is insufficient data
323      */
324     private void checkSufficientData(final RealMatrix matrix) {
325         int nRows = matrix.getRowDimension();
326         int nCols = matrix.getColumnDimension();
327         if (nRows < 2 || nCols < 2) {
328             throw new MathIllegalArgumentException(LocalizedCoreFormats.INSUFFICIENT_ROWS_AND_COLUMNS,
329                                                    nRows, nCols);
330         }
331     }
332 }