PearsonsCorrelation.java
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- /*
- * This is not the original file distributed by the Apache Software Foundation
- * It has been modified by the Hipparchus project
- */
- package org.hipparchus.stat.correlation;
- import org.hipparchus.distribution.continuous.TDistribution;
- import org.hipparchus.exception.LocalizedCoreFormats;
- import org.hipparchus.exception.MathIllegalArgumentException;
- import org.hipparchus.linear.BlockRealMatrix;
- import org.hipparchus.linear.RealMatrix;
- import org.hipparchus.stat.LocalizedStatFormats;
- import org.hipparchus.stat.regression.SimpleRegression;
- import org.hipparchus.util.FastMath;
- import org.hipparchus.util.MathArrays;
- import org.hipparchus.util.MathUtils;
- /**
- * Computes Pearson's product-moment correlation coefficients for pairs of arrays
- * or columns of a matrix.
- * <p>
- * The constructors that take <code>RealMatrix</code> or
- * <code>double[][]</code> arguments generate correlation matrices. The
- * columns of the input matrices are assumed to represent variable values.
- * Correlations are given by the formula:
- * <p>
- * <code>cor(X, Y) = Σ[(x<sub>i</sub> - E(X))(y<sub>i</sub> - E(Y))] / [(n - 1)s(X)s(Y)]</code>
- * <p>
- * where <code>E(X)</code> is the mean of <code>X</code>, <code>E(Y)</code>
- * is the mean of the <code>Y</code> values and s(X), s(Y) are standard deviations.
- * <p>
- * To compute the correlation coefficient for a single pair of arrays, use {@link #PearsonsCorrelation()}
- * to construct an instance with no data and then {@link #correlation(double[], double[])}.
- * Correlation matrices can also be computed directly from an instance with no data using
- * {@link #computeCorrelationMatrix(double[][])}. In order to use {@link #getCorrelationMatrix()},
- * {@link #getCorrelationPValues()}, or {@link #getCorrelationStandardErrors()}; however, one of the
- * constructors supplying data or a covariance matrix must be used to create the instance.
- */
- public class PearsonsCorrelation {
- /** correlation matrix */
- private final RealMatrix correlationMatrix;
- /** number of observations */
- private final int nObs;
- /**
- * Create a PearsonsCorrelation instance without data.
- */
- public PearsonsCorrelation() {
- super();
- correlationMatrix = null;
- nObs = 0;
- }
- /**
- * Create a PearsonsCorrelation from a rectangular array
- * whose columns represent values of variables to be correlated.
- *
- * Throws MathIllegalArgumentException if the input array does not have at least
- * two columns and two rows. Pairwise correlations are set to NaN if one
- * of the correlates has zero variance.
- *
- * @param data rectangular array with columns representing variables
- * @throws MathIllegalArgumentException if the input data array is not
- * rectangular with at least two rows and two columns.
- * @see #correlation(double[], double[])
- */
- public PearsonsCorrelation(double[][] data) {
- this(new BlockRealMatrix(data));
- }
- /**
- * Create a PearsonsCorrelation from a RealMatrix whose columns
- * represent variables to be correlated.
- *
- * Throws MathIllegalArgumentException if the matrix does not have at least
- * two columns and two rows. Pairwise correlations are set to NaN if one
- * of the correlates has zero variance.
- *
- * @param matrix matrix with columns representing variables to correlate
- * @throws MathIllegalArgumentException if the matrix does not contain sufficient data
- * @see #correlation(double[], double[])
- */
- public PearsonsCorrelation(RealMatrix matrix) {
- nObs = matrix.getRowDimension();
- correlationMatrix = computeCorrelationMatrix(matrix);
- }
- /**
- * Create a PearsonsCorrelation from a {@link Covariance}. The correlation
- * matrix is computed by scaling the Covariance's covariance matrix.
- * The Covariance instance must have been created from a data matrix with
- * columns representing variable values.
- *
- * @param covariance Covariance instance
- */
- public PearsonsCorrelation(Covariance covariance) {
- RealMatrix covarianceMatrix = covariance.getCovarianceMatrix();
- MathUtils.checkNotNull(covarianceMatrix, LocalizedStatFormats.COVARIANCE_MATRIX);
- nObs = covariance.getN();
- correlationMatrix = covarianceToCorrelation(covarianceMatrix);
- }
- /**
- * Create a PearsonsCorrelation from a covariance matrix. The correlation
- * matrix is computed by scaling the covariance matrix.
- *
- * @param covarianceMatrix covariance matrix
- * @param numberOfObservations the number of observations in the dataset used to compute
- * the covariance matrix
- */
- public PearsonsCorrelation(RealMatrix covarianceMatrix, int numberOfObservations) {
- nObs = numberOfObservations;
- correlationMatrix = covarianceToCorrelation(covarianceMatrix);
- }
- /**
- * Returns the correlation matrix.
- *
- * <p>This method will return null if the argumentless constructor was used
- * to create this instance, even if {@link #computeCorrelationMatrix(double[][])}
- * has been called before it is activated.</p>
- *
- * @return correlation matrix
- */
- public RealMatrix getCorrelationMatrix() {
- return correlationMatrix;
- }
- /**
- * Returns a matrix of standard errors associated with the estimates
- * in the correlation matrix.<br>
- * <code>getCorrelationStandardErrors().getEntry(i,j)</code> is the standard
- * error associated with <code>getCorrelationMatrix.getEntry(i,j)</code>
- *
- * <p>The formula used to compute the standard error is <br>
- * <code>SE<sub>r</sub> = ((1 - r<sup>2</sup>) / (n - 2))<sup>1/2</sup></code>
- * where <code>r</code> is the estimated correlation coefficient and
- * <code>n</code> is the number of observations in the source dataset.</p>
- *
- * <p>To use this method, one of the constructors that supply an input
- * matrix must have been used to create this instance.</p>
- *
- * @return matrix of correlation standard errors
- * @throws NullPointerException if this instance was created with no data
- */
- public RealMatrix getCorrelationStandardErrors() {
- int nVars = correlationMatrix.getColumnDimension();
- double[][] out = new double[nVars][nVars];
- for (int i = 0; i < nVars; i++) {
- for (int j = 0; j < nVars; j++) {
- double r = correlationMatrix.getEntry(i, j);
- out[i][j] = FastMath.sqrt((1 - r * r) /(nObs - 2));
- }
- }
- return new BlockRealMatrix(out);
- }
- /**
- * Returns a matrix of p-values associated with the (two-sided) null
- * hypothesis that the corresponding correlation coefficient is zero.
- *
- * <p><code>getCorrelationPValues().getEntry(i,j)</code> is the probability
- * that a random variable distributed as <code>t<sub>n-2</sub></code> takes
- * a value with absolute value greater than or equal to <br>
- * <code>|r|((n - 2) / (1 - r<sup>2</sup>))<sup>1/2</sup></code></p>
- *
- * <p>The values in the matrix are sometimes referred to as the
- * <i>significance</i> of the corresponding correlation coefficients.</p>
- *
- * <p>To use this method, one of the constructors that supply an input
- * matrix must have been used to create this instance.</p>
- *
- * @return matrix of p-values
- * @throws org.hipparchus.exception.MathIllegalStateException
- * if an error occurs estimating probabilities
- * @throws NullPointerException if this instance was created with no data
- */
- public RealMatrix getCorrelationPValues() {
- TDistribution tDistribution = new TDistribution(nObs - 2);
- int nVars = correlationMatrix.getColumnDimension();
- double[][] out = new double[nVars][nVars];
- for (int i = 0; i < nVars; i++) {
- for (int j = 0; j < nVars; j++) {
- if (i == j) {
- out[i][j] = 0d;
- } else {
- double r = correlationMatrix.getEntry(i, j);
- double t = FastMath.abs(r * FastMath.sqrt((nObs - 2)/(1 - r * r)));
- out[i][j] = 2 * tDistribution.cumulativeProbability(-t);
- }
- }
- }
- return new BlockRealMatrix(out);
- }
- /**
- * Computes the correlation matrix for the columns of the
- * input matrix, using {@link #correlation(double[], double[])}.
- *
- * Throws MathIllegalArgumentException if the matrix does not have at least
- * two columns and two rows. Pairwise correlations are set to NaN if one
- * of the correlates has zero variance.
- *
- * @param matrix matrix with columns representing variables to correlate
- * @return correlation matrix
- * @throws MathIllegalArgumentException if the matrix does not contain sufficient data
- * @see #correlation(double[], double[])
- */
- public RealMatrix computeCorrelationMatrix(RealMatrix matrix) {
- checkSufficientData(matrix);
- int nVars = matrix.getColumnDimension();
- RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars);
- for (int i = 0; i < nVars; i++) {
- for (int j = 0; j < i; j++) {
- double corr = correlation(matrix.getColumn(i), matrix.getColumn(j));
- outMatrix.setEntry(i, j, corr);
- outMatrix.setEntry(j, i, corr);
- }
- outMatrix.setEntry(i, i, 1d);
- }
- return outMatrix;
- }
- /**
- * Computes the correlation matrix for the columns of the
- * input rectangular array. The columns of the array represent values
- * of variables to be correlated.
- *
- * Throws MathIllegalArgumentException if the matrix does not have at least
- * two columns and two rows or if the array is not rectangular. Pairwise
- * correlations are set to NaN if one of the correlates has zero variance.
- *
- * @param data matrix with columns representing variables to correlate
- * @return correlation matrix
- * @throws MathIllegalArgumentException if the array does not contain sufficient data
- * @see #correlation(double[], double[])
- */
- public RealMatrix computeCorrelationMatrix(double[][] data) {
- return computeCorrelationMatrix(new BlockRealMatrix(data));
- }
- /**
- * Computes the Pearson's product-moment correlation coefficient between two arrays.
- *
- * <p>Throws MathIllegalArgumentException if the arrays do not have the same length
- * or their common length is less than 2. Returns {@code NaN} if either of the arrays
- * has zero variance (i.e., if one of the arrays does not contain at least two distinct
- * values).</p>
- *
- * @param xArray first data array
- * @param yArray second data array
- * @return Returns Pearson's correlation coefficient for the two arrays
- * @throws MathIllegalArgumentException if the arrays lengths do not match
- * @throws MathIllegalArgumentException if there is insufficient data
- */
- public double correlation(final double[] xArray, final double[] yArray) {
- MathArrays.checkEqualLength(xArray, yArray);
- if (xArray.length < 2) {
- throw new MathIllegalArgumentException(LocalizedCoreFormats.INSUFFICIENT_DIMENSION,
- xArray.length, 2);
- }
- SimpleRegression regression = new SimpleRegression();
- for(int i = 0; i < xArray.length; i++) {
- regression.addData(xArray[i], yArray[i]);
- }
- return regression.getR();
- }
- /**
- * Derives a correlation matrix from a covariance matrix.
- *
- * <p>Uses the formula <br>
- * <code>r(X,Y) = cov(X,Y)/s(X)s(Y)</code> where
- * <code>r(·,·)</code> is the correlation coefficient and
- * <code>s(·)</code> means standard deviation.</p>
- *
- * @param covarianceMatrix the covariance matrix
- * @return correlation matrix
- */
- public RealMatrix covarianceToCorrelation(RealMatrix covarianceMatrix) {
- int nVars = covarianceMatrix.getColumnDimension();
- RealMatrix outMatrix = new BlockRealMatrix(nVars, nVars);
- for (int i = 0; i < nVars; i++) {
- double sigma = FastMath.sqrt(covarianceMatrix.getEntry(i, i));
- outMatrix.setEntry(i, i, 1d);
- for (int j = 0; j < i; j++) {
- double entry = covarianceMatrix.getEntry(i, j) /
- (sigma * FastMath.sqrt(covarianceMatrix.getEntry(j, j)));
- outMatrix.setEntry(i, j, entry);
- outMatrix.setEntry(j, i, entry);
- }
- }
- return outMatrix;
- }
- /**
- * Throws MathIllegalArgumentException if the matrix does not have at least
- * two columns and two rows.
- *
- * @param matrix matrix to check for sufficiency
- * @throws MathIllegalArgumentException if there is insufficient data
- */
- private void checkSufficientData(final RealMatrix matrix) {
- int nRows = matrix.getRowDimension();
- int nCols = matrix.getColumnDimension();
- if (nRows < 2 || nCols < 2) {
- throw new MathIllegalArgumentException(LocalizedCoreFormats.INSUFFICIENT_ROWS_AND_COLUMNS,
- nRows, nCols);
- }
- }
- }