1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 /*
19 * This is not the original file distributed by the Apache Software Foundation
20 * It has been modified by the Hipparchus project
21 */
22 package org.hipparchus.stat.correlation;
23
24 import org.hipparchus.distribution.continuous.TDistribution;
25 import org.hipparchus.exception.LocalizedCoreFormats;
26 import org.hipparchus.exception.MathIllegalArgumentException;
27 import org.hipparchus.linear.BlockRealMatrix;
28 import org.hipparchus.linear.RealMatrix;
29 import org.hipparchus.stat.LocalizedStatFormats;
30 import org.hipparchus.stat.regression.SimpleRegression;
31 import org.hipparchus.util.FastMath;
32 import org.hipparchus.util.MathArrays;
33 import org.hipparchus.util.MathUtils;
34
35 /**
36 * Computes Pearson's product-moment correlation coefficients for pairs of arrays
37 * or columns of a matrix.
38 * <p>
39 * The constructors that take <code>RealMatrix</code> or
40 * <code>double[][]</code> arguments generate correlation matrices. The
41 * columns of the input matrices are assumed to represent variable values.
42 * Correlations are given by the formula:
43 * <p>
44 * <code>cor(X, Y) = Σ[(x<sub>i</sub> - E(X))(y<sub>i</sub> - E(Y))] / [(n - 1)s(X)s(Y)]</code>
45 * <p>
46 * where <code>E(X)</code> is the mean of <code>X</code>, <code>E(Y)</code>
47 * is the mean of the <code>Y</code> values and s(X), s(Y) are standard deviations.
48 * <p>
49 * To compute the correlation coefficient for a single pair of arrays, use {@link #PearsonsCorrelation()}
50 * to construct an instance with no data and then {@link #correlation(double[], double[])}.
51 * Correlation matrices can also be computed directly from an instance with no data using
52 * {@link #computeCorrelationMatrix(double[][])}. In order to use {@link #getCorrelationMatrix()},
53 * {@link #getCorrelationPValues()}, or {@link #getCorrelationStandardErrors()}; however, one of the
54 * constructors supplying data or a covariance matrix must be used to create the instance.
55 */
56 public class PearsonsCorrelation {
57
58 /** correlation matrix */
59 private final RealMatrix correlationMatrix;
60
61 /** number of observations */
62 private final int nObs;
63
64 /**
65 * Create a PearsonsCorrelation instance without data.
66 */
67 public PearsonsCorrelation() {
68 super();
69 correlationMatrix = null;
70 nObs = 0;
71 }
72
73 /**
74 * Create a PearsonsCorrelation from a rectangular array
75 * whose columns represent values of variables to be correlated.
76 *
77 * Throws MathIllegalArgumentException if the input array does not have at least
78 * two columns and two rows. Pairwise correlations are set to NaN if one
79 * of the correlates has zero variance.
80 *
81 * @param data rectangular array with columns representing variables
82 * @throws MathIllegalArgumentException if the input data array is not
83 * rectangular with at least two rows and two columns.
84 * @see #correlation(double[], double[])
85 */
86 public PearsonsCorrelation(double[][] data) {
87 this(new BlockRealMatrix(data));
88 }
89
90 /**
91 * Create a PearsonsCorrelation from a RealMatrix whose columns
92 * represent variables to be correlated.
93 *
94 * Throws MathIllegalArgumentException if the matrix does not have at least
95 * two columns and two rows. Pairwise correlations are set to NaN if one
96 * of the correlates has zero variance.
97 *
98 * @param matrix matrix with columns representing variables to correlate
99 * @throws MathIllegalArgumentException if the matrix does not contain sufficient data
100 * @see #correlation(double[], double[])
101 */
102 public PearsonsCorrelation(RealMatrix matrix) {
103 nObs = matrix.getRowDimension();
104 correlationMatrix = computeCorrelationMatrix(matrix);
105 }
106
107 /**
108 * Create a PearsonsCorrelation from a {@link Covariance}. The correlation
109 * matrix is computed by scaling the Covariance's covariance matrix.
110 * The Covariance instance must have been created from a data matrix with
111 * columns representing variable values.
112 *
113 * @param covariance Covariance instance
114 */
115 public PearsonsCorrelation(Covariance covariance) {
116 RealMatrix covarianceMatrix = covariance.getCovarianceMatrix();
117 MathUtils.checkNotNull(covarianceMatrix, LocalizedStatFormats.COVARIANCE_MATRIX);
118 nObs = covariance.getN();
119 correlationMatrix = covarianceToCorrelation(covarianceMatrix);
120 }
121
122 /**
123 * Create a PearsonsCorrelation from a covariance matrix. The correlation
124 * matrix is computed by scaling the covariance matrix.
125 *
126 * @param covarianceMatrix covariance matrix
127 * @param numberOfObservations the number of observations in the dataset used to compute
128 * the covariance matrix
129 */
130 public PearsonsCorrelation(RealMatrix covarianceMatrix, int numberOfObservations) {
131 nObs = numberOfObservations;
132 correlationMatrix = covarianceToCorrelation(covarianceMatrix);
133 }
134
135 /**
136 * Returns the correlation matrix.
137 *
138 * <p>This method will return null if the argumentless constructor was used
139 * to create this instance, even if {@link #computeCorrelationMatrix(double[][])}
140 * has been called before it is activated.</p>
141 *
142 * @return correlation matrix
143 */
144 public RealMatrix getCorrelationMatrix() {
145 return correlationMatrix;
146 }
147
148 /**
149 * Returns a matrix of standard errors associated with the estimates
150 * in the correlation matrix.<br>
151 * <code>getCorrelationStandardErrors().getEntry(i,j)</code> is the standard
152 * error associated with <code>getCorrelationMatrix.getEntry(i,j)</code>
153 *
154 * <p>The formula used to compute the standard error is <br>
155 * <code>SE<sub>r</sub> = ((1 - r<sup>2</sup>) / (n - 2))<sup>1/2</sup></code>
156 * where <code>r</code> is the estimated correlation coefficient and
157 * <code>n</code> is the number of observations in the source dataset.</p>
158 *
159 * <p>To use this method, one of the constructors that supply an input
160 * matrix must have been used to create this instance.</p>
161 *
162 * @return matrix of correlation standard errors
163 * @throws NullPointerException if this instance was created with no data
164 */
165 public RealMatrix getCorrelationStandardErrors() {
166 int nVars = correlationMatrix.getColumnDimension();
167 double[][] out = new double[nVars][nVars];
168 for (int i = 0; i < nVars; i++) {
169 for (int j = 0; j < nVars; j++) {
170 double r = correlationMatrix.getEntry(i, j);
171 out[i][j] = FastMath.sqrt((1 - r * r) /(nObs - 2));
172 }
173 }
174 return new BlockRealMatrix(out);
175 }
176
177 /**
178 * Returns a matrix of p-values associated with the (two-sided) null
179 * hypothesis that the corresponding correlation coefficient is zero.
180 *
181 * <p><code>getCorrelationPValues().getEntry(i,j)</code> is the probability
182 * that a random variable distributed as <code>t<sub>n-2</sub></code> takes
183 * a value with absolute value greater than or equal to <br>
184 * <code>|r|((n - 2) / (1 - r<sup>2</sup>))<sup>1/2</sup></code></p>
185 *
186 * <p>The values in the matrix are sometimes referred to as the
187 * <i>significance</i> of the corresponding correlation coefficients.</p>
188 *
189 * <p>To use this method, one of the constructors that supply an input
190 * matrix must have been used to create this instance.</p>
191 *
192 * @return matrix of p-values
193 * @throws org.hipparchus.exception.MathIllegalStateException
194 * if an error occurs estimating probabilities
195 * @throws NullPointerException if this instance was created with no data
196 */
197 public RealMatrix getCorrelationPValues() {
198 TDistribution tDistribution = new TDistribution(nObs - 2);
199 int nVars = correlationMatrix.getColumnDimension();
200 double[][] out = new double[nVars][nVars];
201 for (int i = 0; i < nVars; i++) {
202 for (int j = 0; j < nVars; j++) {
203 if (i == j) {
204 out[i][j] = 0d;
205 } else {
206 double r = correlationMatrix.getEntry(i, j);
207 double t = FastMath.abs(r * FastMath.sqrt((nObs - 2)/(1 - r * r)));
208 out[i][j] = 2 * tDistribution.cumulativeProbability(-t);
209 }
210 }
211 }
212 return new BlockRealMatrix(out);
213 }
214
215
216 /**
217 * Computes the correlation matrix for the columns of the
218 * input matrix, using {@link #correlation(double[], double[])}.
219 *
220 * Throws MathIllegalArgumentException if the matrix does not have at least
221 * two columns and two rows. Pairwise correlations are set to NaN if one
222 * of the correlates has zero variance.
223 *
224 * @param matrix matrix with columns representing variables to correlate
225 * @return correlation matrix
226 * @throws MathIllegalArgumentException if the matrix does not contain sufficient data
227 * @see #correlation(double[], double[])
228 */
229 public RealMatrix computeCorrelationMatrix(RealMatrix matrix) {
230 checkSufficientData(matrix);
231 int nVars = matrix.getColumnDimension();
232 RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars);
233 for (int i = 0; i < nVars; i++) {
234 for (int j = 0; j < i; j++) {
235 double corr = correlation(matrix.getColumn(i), matrix.getColumn(j));
236 outMatrix.setEntry(i, j, corr);
237 outMatrix.setEntry(j, i, corr);
238 }
239 outMatrix.setEntry(i, i, 1d);
240 }
241 return outMatrix;
242 }
243
244 /**
245 * Computes the correlation matrix for the columns of the
246 * input rectangular array. The columns of the array represent values
247 * of variables to be correlated.
248 *
249 * Throws MathIllegalArgumentException if the matrix does not have at least
250 * two columns and two rows or if the array is not rectangular. Pairwise
251 * correlations are set to NaN if one of the correlates has zero variance.
252 *
253 * @param data matrix with columns representing variables to correlate
254 * @return correlation matrix
255 * @throws MathIllegalArgumentException if the array does not contain sufficient data
256 * @see #correlation(double[], double[])
257 */
258 public RealMatrix computeCorrelationMatrix(double[][] data) {
259 return computeCorrelationMatrix(new BlockRealMatrix(data));
260 }
261
262 /**
263 * Computes the Pearson's product-moment correlation coefficient between two arrays.
264 *
265 * <p>Throws MathIllegalArgumentException if the arrays do not have the same length
266 * or their common length is less than 2. Returns {@code NaN} if either of the arrays
267 * has zero variance (i.e., if one of the arrays does not contain at least two distinct
268 * values).</p>
269 *
270 * @param xArray first data array
271 * @param yArray second data array
272 * @return Returns Pearson's correlation coefficient for the two arrays
273 * @throws MathIllegalArgumentException if the arrays lengths do not match
274 * @throws MathIllegalArgumentException if there is insufficient data
275 */
276 public double correlation(final double[] xArray, final double[] yArray) {
277 MathArrays.checkEqualLength(xArray, yArray);
278 if (xArray.length < 2) {
279 throw new MathIllegalArgumentException(LocalizedCoreFormats.INSUFFICIENT_DIMENSION,
280 xArray.length, 2);
281 }
282
283 SimpleRegression regression = new SimpleRegression();
284 for(int i = 0; i < xArray.length; i++) {
285 regression.addData(xArray[i], yArray[i]);
286 }
287 return regression.getR();
288 }
289
290 /**
291 * Derives a correlation matrix from a covariance matrix.
292 *
293 * <p>Uses the formula <br>
294 * <code>r(X,Y) = cov(X,Y)/s(X)s(Y)</code> where
295 * <code>r(·,·)</code> is the correlation coefficient and
296 * <code>s(·)</code> means standard deviation.</p>
297 *
298 * @param covarianceMatrix the covariance matrix
299 * @return correlation matrix
300 */
301 public RealMatrix covarianceToCorrelation(RealMatrix covarianceMatrix) {
302 int nVars = covarianceMatrix.getColumnDimension();
303 RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars);
304 for (int i = 0; i < nVars; i++) {
305 double sigma = FastMath.sqrt(covarianceMatrix.getEntry(i, i));
306 outMatrix.setEntry(i, i, 1d);
307 for (int j = 0; j < i; j++) {
308 double entry = covarianceMatrix.getEntry(i, j) /
309 (sigma * FastMath.sqrt(covarianceMatrix.getEntry(j, j)));
310 outMatrix.setEntry(i, j, entry);
311 outMatrix.setEntry(j, i, entry);
312 }
313 }
314 return outMatrix;
315 }
316
317 /**
318 * Throws MathIllegalArgumentException if the matrix does not have at least
319 * two columns and two rows.
320 *
321 * @param matrix matrix to check for sufficiency
322 * @throws MathIllegalArgumentException if there is insufficient data
323 */
324 private void checkSufficientData(final RealMatrix matrix) {
325 int nRows = matrix.getRowDimension();
326 int nCols = matrix.getColumnDimension();
327 if (nRows < 2 || nCols < 2) {
328 throw new MathIllegalArgumentException(LocalizedCoreFormats.INSUFFICIENT_ROWS_AND_COLUMNS,
329 nRows, nCols);
330 }
331 }
332 }