1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * https://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 /* 19 * This is not the original file distributed by the Apache Software Foundation 20 * It has been modified by the Hipparchus project 21 */ 22 package org.hipparchus.stat.correlation; 23 24 import org.hipparchus.distribution.continuous.TDistribution; 25 import org.hipparchus.exception.LocalizedCoreFormats; 26 import org.hipparchus.exception.MathIllegalArgumentException; 27 import org.hipparchus.linear.BlockRealMatrix; 28 import org.hipparchus.linear.RealMatrix; 29 import org.hipparchus.stat.LocalizedStatFormats; 30 import org.hipparchus.stat.regression.SimpleRegression; 31 import org.hipparchus.util.FastMath; 32 import org.hipparchus.util.MathArrays; 33 import org.hipparchus.util.MathUtils; 34 35 /** 36 * Computes Pearson's product-moment correlation coefficients for pairs of arrays 37 * or columns of a matrix. 38 * <p> 39 * The constructors that take <code>RealMatrix</code> or 40 * <code>double[][]</code> arguments generate correlation matrices. The 41 * columns of the input matrices are assumed to represent variable values. 42 * Correlations are given by the formula: 43 * <p> 44 * <code>cor(X, Y) = Σ[(x<sub>i</sub> - E(X))(y<sub>i</sub> - E(Y))] / [(n - 1)s(X)s(Y)]</code> 45 * <p> 46 * where <code>E(X)</code> is the mean of <code>X</code>, <code>E(Y)</code> 47 * is the mean of the <code>Y</code> values and s(X), s(Y) are standard deviations. 48 * <p> 49 * To compute the correlation coefficient for a single pair of arrays, use {@link #PearsonsCorrelation()} 50 * to construct an instance with no data and then {@link #correlation(double[], double[])}. 51 * Correlation matrices can also be computed directly from an instance with no data using 52 * {@link #computeCorrelationMatrix(double[][])}. In order to use {@link #getCorrelationMatrix()}, 53 * {@link #getCorrelationPValues()}, or {@link #getCorrelationStandardErrors()}; however, one of the 54 * constructors supplying data or a covariance matrix must be used to create the instance. 55 */ 56 public class PearsonsCorrelation { 57 58 /** correlation matrix */ 59 private final RealMatrix correlationMatrix; 60 61 /** number of observations */ 62 private final int nObs; 63 64 /** 65 * Create a PearsonsCorrelation instance without data. 66 */ 67 public PearsonsCorrelation() { 68 super(); 69 correlationMatrix = null; 70 nObs = 0; 71 } 72 73 /** 74 * Create a PearsonsCorrelation from a rectangular array 75 * whose columns represent values of variables to be correlated. 76 * 77 * Throws MathIllegalArgumentException if the input array does not have at least 78 * two columns and two rows. Pairwise correlations are set to NaN if one 79 * of the correlates has zero variance. 80 * 81 * @param data rectangular array with columns representing variables 82 * @throws MathIllegalArgumentException if the input data array is not 83 * rectangular with at least two rows and two columns. 84 * @see #correlation(double[], double[]) 85 */ 86 public PearsonsCorrelation(double[][] data) { 87 this(new BlockRealMatrix(data)); 88 } 89 90 /** 91 * Create a PearsonsCorrelation from a RealMatrix whose columns 92 * represent variables to be correlated. 93 * 94 * Throws MathIllegalArgumentException if the matrix does not have at least 95 * two columns and two rows. Pairwise correlations are set to NaN if one 96 * of the correlates has zero variance. 97 * 98 * @param matrix matrix with columns representing variables to correlate 99 * @throws MathIllegalArgumentException if the matrix does not contain sufficient data 100 * @see #correlation(double[], double[]) 101 */ 102 public PearsonsCorrelation(RealMatrix matrix) { 103 nObs = matrix.getRowDimension(); 104 correlationMatrix = computeCorrelationMatrix(matrix); 105 } 106 107 /** 108 * Create a PearsonsCorrelation from a {@link Covariance}. The correlation 109 * matrix is computed by scaling the Covariance's covariance matrix. 110 * The Covariance instance must have been created from a data matrix with 111 * columns representing variable values. 112 * 113 * @param covariance Covariance instance 114 */ 115 public PearsonsCorrelation(Covariance covariance) { 116 RealMatrix covarianceMatrix = covariance.getCovarianceMatrix(); 117 MathUtils.checkNotNull(covarianceMatrix, LocalizedStatFormats.COVARIANCE_MATRIX); 118 nObs = covariance.getN(); 119 correlationMatrix = covarianceToCorrelation(covarianceMatrix); 120 } 121 122 /** 123 * Create a PearsonsCorrelation from a covariance matrix. The correlation 124 * matrix is computed by scaling the covariance matrix. 125 * 126 * @param covarianceMatrix covariance matrix 127 * @param numberOfObservations the number of observations in the dataset used to compute 128 * the covariance matrix 129 */ 130 public PearsonsCorrelation(RealMatrix covarianceMatrix, int numberOfObservations) { 131 nObs = numberOfObservations; 132 correlationMatrix = covarianceToCorrelation(covarianceMatrix); 133 } 134 135 /** 136 * Returns the correlation matrix. 137 * 138 * <p>This method will return null if the argumentless constructor was used 139 * to create this instance, even if {@link #computeCorrelationMatrix(double[][])} 140 * has been called before it is activated.</p> 141 * 142 * @return correlation matrix 143 */ 144 public RealMatrix getCorrelationMatrix() { 145 return correlationMatrix; 146 } 147 148 /** 149 * Returns a matrix of standard errors associated with the estimates 150 * in the correlation matrix.<br> 151 * <code>getCorrelationStandardErrors().getEntry(i,j)</code> is the standard 152 * error associated with <code>getCorrelationMatrix.getEntry(i,j)</code> 153 * 154 * <p>The formula used to compute the standard error is <br> 155 * <code>SE<sub>r</sub> = ((1 - r<sup>2</sup>) / (n - 2))<sup>1/2</sup></code> 156 * where <code>r</code> is the estimated correlation coefficient and 157 * <code>n</code> is the number of observations in the source dataset.</p> 158 * 159 * <p>To use this method, one of the constructors that supply an input 160 * matrix must have been used to create this instance.</p> 161 * 162 * @return matrix of correlation standard errors 163 * @throws NullPointerException if this instance was created with no data 164 */ 165 public RealMatrix getCorrelationStandardErrors() { 166 int nVars = correlationMatrix.getColumnDimension(); 167 double[][] out = new double[nVars][nVars]; 168 for (int i = 0; i < nVars; i++) { 169 for (int j = 0; j < nVars; j++) { 170 double r = correlationMatrix.getEntry(i, j); 171 out[i][j] = FastMath.sqrt((1 - r * r) /(nObs - 2)); 172 } 173 } 174 return new BlockRealMatrix(out); 175 } 176 177 /** 178 * Returns a matrix of p-values associated with the (two-sided) null 179 * hypothesis that the corresponding correlation coefficient is zero. 180 * 181 * <p><code>getCorrelationPValues().getEntry(i,j)</code> is the probability 182 * that a random variable distributed as <code>t<sub>n-2</sub></code> takes 183 * a value with absolute value greater than or equal to <br> 184 * <code>|r|((n - 2) / (1 - r<sup>2</sup>))<sup>1/2</sup></code></p> 185 * 186 * <p>The values in the matrix are sometimes referred to as the 187 * <i>significance</i> of the corresponding correlation coefficients.</p> 188 * 189 * <p>To use this method, one of the constructors that supply an input 190 * matrix must have been used to create this instance.</p> 191 * 192 * @return matrix of p-values 193 * @throws org.hipparchus.exception.MathIllegalStateException 194 * if an error occurs estimating probabilities 195 * @throws NullPointerException if this instance was created with no data 196 */ 197 public RealMatrix getCorrelationPValues() { 198 TDistribution tDistribution = new TDistribution(nObs - 2); 199 int nVars = correlationMatrix.getColumnDimension(); 200 double[][] out = new double[nVars][nVars]; 201 for (int i = 0; i < nVars; i++) { 202 for (int j = 0; j < nVars; j++) { 203 if (i == j) { 204 out[i][j] = 0d; 205 } else { 206 double r = correlationMatrix.getEntry(i, j); 207 double t = FastMath.abs(r * FastMath.sqrt((nObs - 2)/(1 - r * r))); 208 out[i][j] = 2 * tDistribution.cumulativeProbability(-t); 209 } 210 } 211 } 212 return new BlockRealMatrix(out); 213 } 214 215 216 /** 217 * Computes the correlation matrix for the columns of the 218 * input matrix, using {@link #correlation(double[], double[])}. 219 * 220 * Throws MathIllegalArgumentException if the matrix does not have at least 221 * two columns and two rows. Pairwise correlations are set to NaN if one 222 * of the correlates has zero variance. 223 * 224 * @param matrix matrix with columns representing variables to correlate 225 * @return correlation matrix 226 * @throws MathIllegalArgumentException if the matrix does not contain sufficient data 227 * @see #correlation(double[], double[]) 228 */ 229 public RealMatrix computeCorrelationMatrix(RealMatrix matrix) { 230 checkSufficientData(matrix); 231 int nVars = matrix.getColumnDimension(); 232 RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars); 233 for (int i = 0; i < nVars; i++) { 234 for (int j = 0; j < i; j++) { 235 double corr = correlation(matrix.getColumn(i), matrix.getColumn(j)); 236 outMatrix.setEntry(i, j, corr); 237 outMatrix.setEntry(j, i, corr); 238 } 239 outMatrix.setEntry(i, i, 1d); 240 } 241 return outMatrix; 242 } 243 244 /** 245 * Computes the correlation matrix for the columns of the 246 * input rectangular array. The columns of the array represent values 247 * of variables to be correlated. 248 * 249 * Throws MathIllegalArgumentException if the matrix does not have at least 250 * two columns and two rows or if the array is not rectangular. Pairwise 251 * correlations are set to NaN if one of the correlates has zero variance. 252 * 253 * @param data matrix with columns representing variables to correlate 254 * @return correlation matrix 255 * @throws MathIllegalArgumentException if the array does not contain sufficient data 256 * @see #correlation(double[], double[]) 257 */ 258 public RealMatrix computeCorrelationMatrix(double[][] data) { 259 return computeCorrelationMatrix(new BlockRealMatrix(data)); 260 } 261 262 /** 263 * Computes the Pearson's product-moment correlation coefficient between two arrays. 264 * 265 * <p>Throws MathIllegalArgumentException if the arrays do not have the same length 266 * or their common length is less than 2. Returns {@code NaN} if either of the arrays 267 * has zero variance (i.e., if one of the arrays does not contain at least two distinct 268 * values).</p> 269 * 270 * @param xArray first data array 271 * @param yArray second data array 272 * @return Returns Pearson's correlation coefficient for the two arrays 273 * @throws MathIllegalArgumentException if the arrays lengths do not match 274 * @throws MathIllegalArgumentException if there is insufficient data 275 */ 276 public double correlation(final double[] xArray, final double[] yArray) { 277 MathArrays.checkEqualLength(xArray, yArray); 278 if (xArray.length < 2) { 279 throw new MathIllegalArgumentException(LocalizedCoreFormats.INSUFFICIENT_DIMENSION, 280 xArray.length, 2); 281 } 282 283 SimpleRegression regression = new SimpleRegression(); 284 for(int i = 0; i < xArray.length; i++) { 285 regression.addData(xArray[i], yArray[i]); 286 } 287 return regression.getR(); 288 } 289 290 /** 291 * Derives a correlation matrix from a covariance matrix. 292 * 293 * <p>Uses the formula <br> 294 * <code>r(X,Y) = cov(X,Y)/s(X)s(Y)</code> where 295 * <code>r(·,·)</code> is the correlation coefficient and 296 * <code>s(·)</code> means standard deviation.</p> 297 * 298 * @param covarianceMatrix the covariance matrix 299 * @return correlation matrix 300 */ 301 public RealMatrix covarianceToCorrelation(RealMatrix covarianceMatrix) { 302 int nVars = covarianceMatrix.getColumnDimension(); 303 RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars); 304 for (int i = 0; i < nVars; i++) { 305 double sigma = FastMath.sqrt(covarianceMatrix.getEntry(i, i)); 306 outMatrix.setEntry(i, i, 1d); 307 for (int j = 0; j < i; j++) { 308 double entry = covarianceMatrix.getEntry(i, j) / 309 (sigma * FastMath.sqrt(covarianceMatrix.getEntry(j, j))); 310 outMatrix.setEntry(i, j, entry); 311 outMatrix.setEntry(j, i, entry); 312 } 313 } 314 return outMatrix; 315 } 316 317 /** 318 * Throws MathIllegalArgumentException if the matrix does not have at least 319 * two columns and two rows. 320 * 321 * @param matrix matrix to check for sufficiency 322 * @throws MathIllegalArgumentException if there is insufficient data 323 */ 324 private void checkSufficientData(final RealMatrix matrix) { 325 int nRows = matrix.getRowDimension(); 326 int nCols = matrix.getColumnDimension(); 327 if (nRows < 2 || nCols < 2) { 328 throw new MathIllegalArgumentException(LocalizedCoreFormats.INSUFFICIENT_ROWS_AND_COLUMNS, 329 nRows, nCols); 330 } 331 } 332 }