1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * https://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 /* 19 * This is not the original file distributed by the Apache Software Foundation 20 * It has been modified by the Hipparchus project 21 */ 22 23 package org.hipparchus.stat.regression; 24 import java.io.Serializable; 25 26 import org.hipparchus.distribution.continuous.TDistribution; 27 import org.hipparchus.exception.LocalizedCoreFormats; 28 import org.hipparchus.exception.MathIllegalArgumentException; 29 import org.hipparchus.stat.LocalizedStatFormats; 30 import org.hipparchus.util.FastMath; 31 import org.hipparchus.util.MathUtils; 32 import org.hipparchus.util.Precision; 33 34 /** 35 * Estimates an ordinary least squares regression model 36 * with one independent variable. 37 * <p> 38 * <code> y = intercept + slope * x </code></p> 39 * <p> 40 * Standard errors for <code>intercept</code> and <code>slope</code> are 41 * available as well as ANOVA, r-square and Pearson's r statistics.</p> 42 * <p> 43 * Observations (x,y pairs) can be added to the model one at a time or they 44 * can be provided in a 2-dimensional array. The observations are not stored 45 * in memory, so there is no limit to the number of observations that can be 46 * added to the model.</p> 47 * <p>* <strong>Usage Notes</strong>:</p> 48 * <ul> 49 * <li> When there are fewer than two observations in the model, or when 50 * there is no variation in the x values (i.e. all x values are the same) 51 * all statistics return <code>NaN</code>. At least two observations with 52 * different x coordinates are required to estimate a bivariate regression 53 * model. 54 * </li> 55 * <li> Getters for the statistics always compute values based on the current 56 * set of observations -- i.e., you can get statistics, then add more data 57 * and get updated statistics without using a new instance. There is no 58 * "compute" method that updates all statistics. Each of the getters performs 59 * the necessary computations to return the requested statistic. 60 * </li> 61 * <li> The intercept term may be suppressed by passing {@code false} to 62 * the {@link #SimpleRegression(boolean)} constructor. When the 63 * {@code hasIntercept} property is false, the model is estimated without a 64 * constant term and {@link #getIntercept()} returns {@code 0}.</li> 65 * </ul> 66 * 67 */ 68 public class SimpleRegression implements Serializable, UpdatingMultipleLinearRegression { 69 70 /** Serializable version identifier */ 71 private static final long serialVersionUID = -3004689053607543335L; 72 73 /** sum of x values */ 74 private double sumX; 75 76 /** total variation in x (sum of squared deviations from xbar) */ 77 private double sumXX; 78 79 /** sum of y values */ 80 private double sumY; 81 82 /** total variation in y (sum of squared deviations from ybar) */ 83 private double sumYY; 84 85 /** sum of products */ 86 private double sumXY; 87 88 /** number of observations */ 89 private long n; 90 91 /** mean of accumulated x values, used in updating formulas */ 92 private double xbar; 93 94 /** mean of accumulated y values, used in updating formulas */ 95 private double ybar; 96 97 /** include an intercept or not */ 98 private final boolean hasIntercept; 99 // ---------------------Public methods-------------------------------------- 100 101 /** 102 * Create an empty SimpleRegression instance 103 */ 104 public SimpleRegression() { 105 this(true); 106 } 107 /** 108 * Create a SimpleRegression instance, specifying whether or not to estimate 109 * an intercept. 110 * 111 * <p>Use {@code false} to estimate a model with no intercept. When the 112 * {@code hasIntercept} property is false, the model is estimated without a 113 * constant term and {@link #getIntercept()} returns {@code 0}.</p> 114 * 115 * @param includeIntercept whether or not to include an intercept term in 116 * the regression model 117 */ 118 public SimpleRegression(boolean includeIntercept) { 119 super(); 120 hasIntercept = includeIntercept; 121 } 122 123 /** 124 * Adds the observation (x,y) to the regression data set. 125 * <p> 126 * Uses updating formulas for means and sums of squares defined in 127 * "Algorithms for Computing the Sample Variance: Analysis and 128 * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J. 129 * 1983, American Statistician, vol. 37, pp. 242-247, referenced in 130 * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985.</p> 131 * 132 * 133 * @param x independent variable value 134 * @param y dependent variable value 135 */ 136 public void addData(final double x,final double y) { 137 if (n == 0) { 138 xbar = x; 139 ybar = y; 140 } else { 141 if( hasIntercept ){ 142 final double fact1 = 1.0 + n; 143 final double fact2 = n / (1.0 + n); 144 final double dx = x - xbar; 145 final double dy = y - ybar; 146 sumXX += dx * dx * fact2; 147 sumYY += dy * dy * fact2; 148 sumXY += dx * dy * fact2; 149 xbar += dx / fact1; 150 ybar += dy / fact1; 151 } 152 } 153 if( !hasIntercept ){ 154 sumXX += x * x ; 155 sumYY += y * y ; 156 sumXY += x * y ; 157 } 158 sumX += x; 159 sumY += y; 160 n++; 161 } 162 163 /** 164 * Appends data from another regression calculation to this one. 165 * 166 * <p>The mean update formulae are based on a paper written by Philippe 167 * Pébay: 168 * <a 169 * href="http://prod.sandia.gov/techlib/access-control.cgi/2008/086212.pdf"> 170 * Formulas for Robust, One-Pass Parallel Computation of Covariances and 171 * Arbitrary-Order Statistical Moments</a>, 2008, Technical Report 172 * SAND2008-6212, Sandia National Laboratories.</p> 173 * 174 * @param reg model to append data from 175 */ 176 public void append(SimpleRegression reg) { 177 if (n == 0) { 178 xbar = reg.xbar; 179 ybar = reg.ybar; 180 sumXX = reg.sumXX; 181 sumYY = reg.sumYY; 182 sumXY = reg.sumXY; 183 } else { 184 if (hasIntercept) { 185 final double fact1 = reg.n / (double) (reg.n + n); 186 final double fact2 = n * reg.n / (double) (reg.n + n); 187 final double dx = reg.xbar - xbar; 188 final double dy = reg.ybar - ybar; 189 sumXX += reg.sumXX + dx * dx * fact2; 190 sumYY += reg.sumYY + dy * dy * fact2; 191 sumXY += reg.sumXY + dx * dy * fact2; 192 xbar += dx * fact1; 193 ybar += dy * fact1; 194 }else{ 195 sumXX += reg.sumXX; 196 sumYY += reg.sumYY; 197 sumXY += reg.sumXY; 198 } 199 } 200 sumX += reg.sumX; 201 sumY += reg.sumY; 202 n += reg.n; 203 } 204 205 /** 206 * Removes the observation (x,y) from the regression data set. 207 * <p> 208 * Mirrors the addData method. This method permits the use of 209 * SimpleRegression instances in streaming mode where the regression 210 * is applied to a sliding "window" of observations, however the caller is 211 * responsible for maintaining the set of observations in the window.</p> 212 * 213 * The method has no effect if there are no points of data (i.e. n=0) 214 * 215 * @param x independent variable value 216 * @param y dependent variable value 217 */ 218 public void removeData(final double x,final double y) { 219 if (n > 0) { 220 if (hasIntercept) { 221 final double fact1 = n - 1.0; 222 final double fact2 = n / (n - 1.0); 223 final double dx = x - xbar; 224 final double dy = y - ybar; 225 sumXX -= dx * dx * fact2; 226 sumYY -= dy * dy * fact2; 227 sumXY -= dx * dy * fact2; 228 xbar -= dx / fact1; 229 ybar -= dy / fact1; 230 } else { 231 final double fact1 = n - 1.0; 232 sumXX -= x * x; 233 sumYY -= y * y; 234 sumXY -= x * y; 235 xbar -= x / fact1; 236 ybar -= y / fact1; 237 } 238 sumX -= x; 239 sumY -= y; 240 n--; 241 } 242 } 243 244 /** 245 * Adds the observations represented by the elements in 246 * <code>data</code>. 247 * <p> 248 * <code>(data[0][0],data[0][1])</code> will be the first observation, then 249 * <code>(data[1][0],data[1][1])</code>, etc.</p> 250 * <p> 251 * This method does not replace data that has already been added. The 252 * observations represented by <code>data</code> are added to the existing 253 * dataset.</p> 254 * <p> 255 * To replace all data, use <code>clear()</code> before adding the new 256 * data.</p> 257 * 258 * @param data array of observations to be added 259 * @throws MathIllegalArgumentException if the length of {@code data[i]} is not 260 * greater than or equal to 2 261 */ 262 public void addData(final double[][] data) throws MathIllegalArgumentException { 263 for (int i = 0; i < data.length; i++) { 264 if( data[i].length < 2 ){ 265 throw new MathIllegalArgumentException(LocalizedStatFormats.INVALID_REGRESSION_OBSERVATION, 266 data[i].length, 2); 267 } 268 addData(data[i][0], data[i][1]); 269 } 270 } 271 272 /** 273 * Adds one observation to the regression model. 274 * 275 * @param x the independent variables which form the design matrix 276 * @param y the dependent or response variable 277 * @throws MathIllegalArgumentException if the length of {@code x} does not equal 278 * the number of independent variables in the model 279 */ 280 @Override 281 public void addObservation(final double[] x,final double y) 282 throws MathIllegalArgumentException { 283 if( x == null || x.length == 0 ){ 284 throw new MathIllegalArgumentException(LocalizedStatFormats.INVALID_REGRESSION_OBSERVATION,x!=null?x.length:0, 1); 285 } 286 addData( x[0], y ); 287 } 288 289 /** 290 * Adds a series of observations to the regression model. The lengths of 291 * x and y must be the same and x must be rectangular. 292 * 293 * @param x a series of observations on the independent variables 294 * @param y a series of observations on the dependent variable 295 * The length of x and y must be the same 296 * @throws MathIllegalArgumentException if {@code x} is not rectangular, does not match 297 * the length of {@code y} or does not contain sufficient data to estimate the model 298 */ 299 @Override 300 public void addObservations(final double[][] x,final double[] y) throws MathIllegalArgumentException { 301 MathUtils.checkNotNull(x, LocalizedCoreFormats.INPUT_ARRAY); 302 MathUtils.checkNotNull(y, LocalizedCoreFormats.INPUT_ARRAY); 303 MathUtils.checkDimension(x.length, y.length); 304 boolean obsOk = true; 305 for( int i = 0 ; i < x.length; i++){ 306 if( x[i] == null || x[i].length == 0 ){ 307 obsOk = false; 308 } 309 } 310 if( !obsOk ){ 311 throw new MathIllegalArgumentException( 312 LocalizedStatFormats.NOT_ENOUGH_DATA_FOR_NUMBER_OF_PREDICTORS, 313 0, 1); 314 } 315 for( int i = 0 ; i < x.length ; i++){ 316 addData( x[i][0], y[i] ); 317 } 318 } 319 320 /** 321 * Removes observations represented by the elements in <code>data</code>. 322 * <p> 323 * If the array is larger than the current n, only the first n elements are 324 * processed. This method permits the use of SimpleRegression instances in 325 * streaming mode where the regression is applied to a sliding "window" of 326 * observations, however the caller is responsible for maintaining the set 327 * of observations in the window.</p> 328 * <p> 329 * To remove all data, use <code>clear()</code>.</p> 330 * 331 * @param data array of observations to be removed 332 */ 333 public void removeData(double[][] data) { 334 for (int i = 0; i < data.length && n > 0; i++) { 335 removeData(data[i][0], data[i][1]); 336 } 337 } 338 339 /** 340 * Clears all data from the model. 341 */ 342 @Override 343 public void clear() { 344 sumX = 0d; 345 sumXX = 0d; 346 sumY = 0d; 347 sumYY = 0d; 348 sumXY = 0d; 349 n = 0; 350 } 351 352 /** 353 * Returns the number of observations that have been added to the model. 354 * 355 * @return n number of observations that have been added. 356 */ 357 @Override 358 public long getN() { 359 return n; 360 } 361 362 /** 363 * Returns the "predicted" <code>y</code> value associated with the 364 * supplied <code>x</code> value, based on the data that has been 365 * added to the model when this method is activated. 366 * <p> 367 * <code> predict(x) = intercept + slope * x </code></p> 368 * <p>* <strong>Preconditions</strong>:</p> 369 * <ul> 370 * <li>At least two observations (with at least two different x values) 371 * must have been added before invoking this method. If this method is 372 * invoked before a model can be estimated, <code>Double,NaN</code> is 373 * returned. 374 * </li></ul> 375 * 376 * @param x input <code>x</code> value 377 * @return predicted <code>y</code> value 378 */ 379 public double predict(final double x) { 380 final double b1 = getSlope(); 381 if (hasIntercept) { 382 return getIntercept(b1) + b1 * x; 383 } 384 return b1 * x; 385 } 386 387 /** 388 * Returns the intercept of the estimated regression line, if 389 * {@link #hasIntercept()} is true; otherwise 0. 390 * <p> 391 * The least squares estimate of the intercept is computed using the 392 * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>. 393 * The intercept is sometimes denoted b0.</p> 394 * <p><strong>Preconditions</strong>:</p> 395 * <ul> 396 * <li>At least two observations (with at least two different x values) 397 * must have been added before invoking this method. If this method is 398 * invoked before a model can be estimated, <code>Double,NaN</code> is 399 * returned. 400 * </li></ul> 401 * 402 * @return the intercept of the regression line if the model includes an 403 * intercept; 0 otherwise 404 * @see #SimpleRegression(boolean) 405 */ 406 public double getIntercept() { 407 return hasIntercept ? getIntercept(getSlope()) : 0.0; 408 } 409 410 /** 411 * Returns true if the model includes an intercept term. 412 * 413 * @return true if the regression includes an intercept; false otherwise 414 * @see #SimpleRegression(boolean) 415 */ 416 @Override 417 public boolean hasIntercept() { 418 return hasIntercept; 419 } 420 421 /** 422 * Returns the slope of the estimated regression line. 423 * <p> 424 * The least squares estimate of the slope is computed using the 425 * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>. 426 * The slope is sometimes denoted b1.</p> 427 * <p>* <strong>Preconditions</strong>:</p> 428 * <ul> 429 * <li>At least two observations (with at least two different x values) 430 * must have been added before invoking this method. If this method is 431 * invoked before a model can be estimated, <code>Double.NaN</code> is 432 * returned. 433 * </li></ul> 434 * 435 * @return the slope of the regression line 436 */ 437 public double getSlope() { 438 if (n < 2) { 439 return Double.NaN; //not enough data 440 } 441 if (FastMath.abs(sumXX) < 10 * Double.MIN_VALUE) { 442 return Double.NaN; //not enough variation in x 443 } 444 return sumXY / sumXX; 445 } 446 447 /** 448 * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm"> 449 * sum of squared errors</a> (SSE) associated with the regression 450 * model. 451 * <p> 452 * The sum is computed using the computational formula</p> 453 * <p> 454 * <code>SSE = SYY - (SXY * SXY / SXX)</code></p> 455 * <p> 456 * where <code>SYY</code> is the sum of the squared deviations of the y 457 * values about their mean, <code>SXX</code> is similarly defined and 458 * <code>SXY</code> is the sum of the products of x and y mean deviations. 459 * </p><p> 460 * The sums are accumulated using the updating algorithm referenced in 461 * {@link #addData}.</p> 462 * <p> 463 * The return value is constrained to be non-negative - i.e., if due to 464 * rounding errors the computational formula returns a negative result, 465 * 0 is returned.</p> 466 * <p>* <strong>Preconditions</strong>:</p> 467 * <ul> 468 * <li>At least two observations (with at least two different x values) 469 * must have been added before invoking this method. If this method is 470 * invoked before a model can be estimated, <code>Double,NaN</code> is 471 * returned. 472 * </li></ul> 473 * 474 * @return sum of squared errors associated with the regression model 475 */ 476 public double getSumSquaredErrors() { 477 return FastMath.max(0d, sumYY - sumXY * sumXY / sumXX); 478 } 479 480 /** 481 * Returns the sum of squared deviations of the y values about their mean. 482 * <p> 483 * This is defined as SSTO 484 * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p> 485 * <p> 486 * If {@code n < 2}, this returns <code>Double.NaN</code>.</p> 487 * 488 * @return sum of squared deviations of y values 489 */ 490 public double getTotalSumSquares() { 491 if (n < 2) { 492 return Double.NaN; 493 } 494 return sumYY; 495 } 496 497 /** 498 * Returns the sum of squared deviations of the x values about their mean. 499 * 500 * <p>If {@code n < 2}, this returns <code>Double.NaN</code>.</p> 501 * 502 * @return sum of squared deviations of x values 503 */ 504 public double getXSumSquares() { 505 if (n < 2) { 506 return Double.NaN; 507 } 508 return sumXX; 509 } 510 511 /** 512 * Returns the sum of crossproducts, x<sub>i</sub>*y<sub>i</sub>. 513 * 514 * @return sum of cross products 515 */ 516 public double getSumOfCrossProducts() { 517 return sumXY; 518 } 519 520 /** 521 * Returns the sum of squared deviations of the predicted y values about 522 * their mean (which equals the mean of y). 523 * <p> 524 * This is usually abbreviated SSR or SSM. It is defined as SSM 525 * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a></p> 526 * <p>* <strong>Preconditions</strong>:</p> 527 * <ul> 528 * <li>At least two observations (with at least two different x values) 529 * must have been added before invoking this method. If this method is 530 * invoked before a model can be estimated, <code>Double.NaN</code> is 531 * returned. 532 * </li></ul> 533 * 534 * @return sum of squared deviations of predicted y values 535 */ 536 public double getRegressionSumSquares() { 537 return getRegressionSumSquares(getSlope()); 538 } 539 540 /** 541 * Returns the sum of squared errors divided by the degrees of freedom, 542 * usually abbreviated MSE. 543 * <p> 544 * If there are fewer than <strong>three</strong> data pairs in the model, 545 * or if there is no variation in <code>x</code>, this returns 546 * <code>Double.NaN</code>.</p> 547 * 548 * @return sum of squared deviations of y values 549 */ 550 public double getMeanSquareError() { 551 if (n < 3) { 552 return Double.NaN; 553 } 554 return hasIntercept ? (getSumSquaredErrors() / (n - 2)) : (getSumSquaredErrors() / (n - 1)); 555 } 556 557 /** 558 * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html"> 559 * Pearson's product moment correlation coefficient</a>, 560 * usually denoted r. 561 * <p>* <strong>Preconditions</strong>:</p> 562 * <ul> 563 * <li>At least two observations (with at least two different x values) 564 * must have been added before invoking this method. If this method is 565 * invoked before a model can be estimated, <code>Double,NaN</code> is 566 * returned. 567 * </li></ul> 568 * 569 * @return Pearson's r 570 */ 571 public double getR() { 572 double b1 = getSlope(); 573 double result = FastMath.sqrt(getRSquare()); 574 if (b1 < 0) { 575 result = -result; 576 } 577 return result; 578 } 579 580 /** 581 * Returns the <a href="http://www.xycoon.com/coefficient1.htm"> 582 * coefficient of determination</a>, 583 * usually denoted r-square. 584 * <p>* <strong>Preconditions</strong>:</p> 585 * <ul> 586 * <li>At least two observations (with at least two different x values) 587 * must have been added before invoking this method. If this method is 588 * invoked before a model can be estimated, <code>Double,NaN</code> is 589 * returned. 590 * </li></ul> 591 * 592 * @return r-square 593 */ 594 public double getRSquare() { 595 double ssto = getTotalSumSquares(); 596 return (ssto - getSumSquaredErrors()) / ssto; 597 } 598 599 /** 600 * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm"> 601 * standard error of the intercept estimate</a>, 602 * usually denoted s(b0). 603 * <p> 604 * If there are fewer that <strong>three</strong> observations in the 605 * model, or if there is no variation in x, this returns 606 * <code>Double.NaN</code>.</p> Additionally, a <code>Double.NaN</code> is 607 * returned when the intercept is constrained to be zero 608 * 609 * @return standard error associated with intercept estimate 610 */ 611 public double getInterceptStdErr() { 612 if( !hasIntercept ){ 613 return Double.NaN; 614 } 615 return FastMath.sqrt( 616 getMeanSquareError() * ((1d / n) + (xbar * xbar) / sumXX)); 617 } 618 619 /** 620 * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard 621 * error of the slope estimate</a>, 622 * usually denoted s(b1). 623 * <p> 624 * If there are fewer that <strong>three</strong> data pairs in the model, 625 * or if there is no variation in x, this returns <code>Double.NaN</code>. 626 * </p> 627 * 628 * @return standard error associated with slope estimate 629 */ 630 public double getSlopeStdErr() { 631 return FastMath.sqrt(getMeanSquareError() / sumXX); 632 } 633 634 /** 635 * Returns the half-width of a 95% confidence interval for the slope 636 * estimate. 637 * <p> 638 * The 95% confidence interval is</p> 639 * <p> 640 * <code>(getSlope() - getSlopeConfidenceInterval(), 641 * getSlope() + getSlopeConfidenceInterval())</code></p> 642 * <p> 643 * If there are fewer that <strong>three</strong> observations in the 644 * model, or if there is no variation in x, this returns 645 * <code>Double.NaN</code>.</p> 646 * <p>* <strong>Usage Note</strong>:</p> 647 * <p> 648 * The validity of this statistic depends on the assumption that the 649 * observations included in the model are drawn from a 650 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html"> 651 * Bivariate Normal Distribution</a>.</p> 652 * 653 * @return half-width of 95% confidence interval for the slope estimate 654 * @throws MathIllegalArgumentException if the confidence interval can not be computed. 655 */ 656 public double getSlopeConfidenceInterval() throws MathIllegalArgumentException { 657 return getSlopeConfidenceInterval(0.05d); 658 } 659 660 /** 661 * Returns the half-width of a (100-100*alpha)% confidence interval for 662 * the slope estimate. 663 * <p> 664 * The (100-100*alpha)% confidence interval is </p> 665 * <p> 666 * <code>(getSlope() - getSlopeConfidenceInterval(), 667 * getSlope() + getSlopeConfidenceInterval())</code></p> 668 * <p> 669 * To request, for example, a 99% confidence interval, use 670 * <code>alpha = .01</code></p> 671 * <p> 672 * <strong>Usage Note</strong>:<br> 673 * The validity of this statistic depends on the assumption that the 674 * observations included in the model are drawn from a 675 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html"> 676 * Bivariate Normal Distribution</a>.</p> 677 * <p> 678 * <strong> Preconditions:</strong></p><ul> 679 * <li>If there are fewer that <strong>three</strong> observations in the 680 * model, or if there is no variation in x, this returns 681 * <code>Double.NaN</code>. 682 * </li> 683 * <li>{@code (0 < alpha < 1)}; otherwise an 684 * <code>MathIllegalArgumentException</code> is thrown. 685 * </li></ul> 686 * 687 * @param alpha the desired significance level 688 * @return half-width of 95% confidence interval for the slope estimate 689 * @throws MathIllegalArgumentException if the confidence interval can not be computed. 690 */ 691 public double getSlopeConfidenceInterval(final double alpha) 692 throws MathIllegalArgumentException { 693 if (n < 3) { 694 return Double.NaN; 695 } 696 if (alpha >= 1 || alpha <= 0) { 697 throw new MathIllegalArgumentException(LocalizedStatFormats.SIGNIFICANCE_LEVEL, 698 alpha, 0, 1); 699 } 700 // No advertised MathIllegalArgumentException here - will return NaN above 701 TDistribution distribution = new TDistribution(n - 2); 702 return getSlopeStdErr() * 703 distribution.inverseCumulativeProbability(1d - alpha / 2d); 704 } 705 706 /** 707 * Returns the significance level of the slope (equiv) correlation. 708 * <p> 709 * Specifically, the returned value is the smallest <code>alpha</code> 710 * such that the slope confidence interval with significance level 711 * equal to <code>alpha</code> does not include <code>0</code>. 712 * On regression output, this is often denoted <code>Prob(|t| > 0)</code> 713 * </p><p> 714 * <strong>Usage Note</strong>:<br> 715 * The validity of this statistic depends on the assumption that the 716 * observations included in the model are drawn from a 717 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html"> 718 * Bivariate Normal Distribution</a>.</p> 719 * <p> 720 * If there are fewer that <strong>three</strong> observations in the 721 * model, or if there is no variation in x, this returns 722 * <code>Double.NaN</code>.</p> 723 * 724 * @return significance level for slope/correlation 725 * @throws org.hipparchus.exception.MathIllegalStateException 726 * if the significance level can not be computed. 727 */ 728 public double getSignificance() { 729 if (n < 3) { 730 return Double.NaN; 731 } 732 // No advertised MathIllegalArgumentException here - will return NaN above 733 TDistribution distribution = new TDistribution(n - 2); 734 return 2d * (1.0 - distribution.cumulativeProbability( 735 FastMath.abs(getSlope()) / getSlopeStdErr())); 736 } 737 738 // ---------------------Private methods----------------------------------- 739 740 /** 741 * Returns the intercept of the estimated regression line, given the slope. 742 * <p> 743 * Will return <code>NaN</code> if slope is <code>NaN</code>.</p> 744 * 745 * @param slope current slope 746 * @return the intercept of the regression line 747 */ 748 private double getIntercept(final double slope) { 749 if( hasIntercept){ 750 return (sumY - slope * sumX) / n; 751 } 752 return 0.0; 753 } 754 755 /** 756 * Computes SSR from b1. 757 * 758 * @param slope regression slope estimate 759 * @return sum of squared deviations of predicted y values 760 */ 761 private double getRegressionSumSquares(final double slope) { 762 return slope * slope * sumXX; 763 } 764 765 /** 766 * Performs a regression on data present in buffers and outputs a RegressionResults object. 767 * 768 * <p>If there are fewer than 3 observations in the model and {@code hasIntercept} is true 769 * a {@code MathIllegalArgumentException} is thrown. If there is no intercept term, the model must 770 * contain at least 2 observations.</p> 771 * 772 * @return RegressionResults acts as a container of regression output 773 * @throws MathIllegalArgumentException if the model is not correctly specified 774 * @throws MathIllegalArgumentException if there is not sufficient data in the model to 775 * estimate the regression parameters 776 */ 777 @Override 778 public RegressionResults regress() throws MathIllegalArgumentException { 779 if (hasIntercept) { 780 if (n < 3) { 781 throw new MathIllegalArgumentException(LocalizedStatFormats.NOT_ENOUGH_DATA_REGRESSION); 782 } 783 if (FastMath.abs(sumXX) > Precision.SAFE_MIN) { 784 final double[] params = { getIntercept(), getSlope() }; 785 final double mse = getMeanSquareError(); 786 final double _syy = sumYY + sumY * sumY / n; 787 final double[] vcv = { mse * (xbar * xbar / sumXX + 1.0 / n), -xbar * mse / sumXX, mse / sumXX }; 788 return new RegressionResults(params, new double[][] { vcv }, true, n, 2, sumY, _syy, getSumSquaredErrors(), true, 789 false); 790 } else { 791 final double[] params = { sumY / n, Double.NaN }; 792 // final double mse = getMeanSquareError(); 793 final double[] vcv = { ybar / (n - 1.0), Double.NaN, Double.NaN }; 794 return new RegressionResults(params, new double[][] { vcv }, true, n, 1, sumY, sumYY, getSumSquaredErrors(), true, 795 false); 796 } 797 } else { 798 if (n < 2) { 799 throw new MathIllegalArgumentException(LocalizedStatFormats.NOT_ENOUGH_DATA_REGRESSION); 800 } 801 if (!Double.isNaN(sumXX)) { 802 final double[] vcv = { getMeanSquareError() / sumXX }; 803 final double[] params = { sumXY / sumXX }; 804 return new RegressionResults(params, new double[][] { vcv }, true, n, 1, sumY, sumYY, getSumSquaredErrors(), false, 805 false); 806 } else { 807 final double[] vcv = { Double.NaN }; 808 final double[] params = { Double.NaN }; 809 return new RegressionResults(params, new double[][] { vcv }, true, n, 1, Double.NaN, Double.NaN, Double.NaN, false, 810 false); 811 } 812 } 813 } 814 815 /** 816 * Performs a regression on data present in buffers including only regressors 817 * indexed in variablesToInclude and outputs a RegressionResults object 818 * @param variablesToInclude an array of indices of regressors to include 819 * @return RegressionResults acts as a container of regression output 820 * @throws MathIllegalArgumentException if the variablesToInclude array is null or zero length 821 * @throws MathIllegalArgumentException if a requested variable is not present in model 822 */ 823 @Override 824 public RegressionResults regress(int[] variablesToInclude) throws MathIllegalArgumentException { 825 if (variablesToInclude == null || variablesToInclude.length == 0) { 826 throw new MathIllegalArgumentException(LocalizedCoreFormats.ARRAY_ZERO_LENGTH_OR_NULL_NOT_ALLOWED); 827 } 828 if (variablesToInclude.length > 2 || (variablesToInclude.length > 1 && !hasIntercept)) { 829 throw new MathIllegalArgumentException( 830 LocalizedCoreFormats.ARRAY_SIZE_EXCEEDS_MAX_VARIABLES, 831 (variablesToInclude.length > 1 && !hasIntercept) ? 1 : 2); 832 } 833 834 if (hasIntercept) { 835 if (variablesToInclude.length == 2) { 836 if (variablesToInclude[0] == 1) { 837 throw new MathIllegalArgumentException(LocalizedCoreFormats.NOT_INCREASING_SEQUENCE); 838 } else if (variablesToInclude[0] != 0) { 839 throw new MathIllegalArgumentException(LocalizedCoreFormats.OUT_OF_RANGE_SIMPLE, 840 variablesToInclude[0], 0, 1); 841 } 842 if (variablesToInclude[1] != 1) { 843 throw new MathIllegalArgumentException(LocalizedCoreFormats.OUT_OF_RANGE_SIMPLE, 844 variablesToInclude[0], 0, 1); 845 } 846 return regress(); 847 }else{ 848 if( variablesToInclude[0] != 1 && variablesToInclude[0] != 0 ) { 849 throw new MathIllegalArgumentException(LocalizedCoreFormats.OUT_OF_RANGE_SIMPLE, 850 variablesToInclude[0], 0, 1); 851 } 852 final double _mean = sumY * sumY / n; 853 final double _syy = sumYY + _mean; 854 if( variablesToInclude[0] == 0 ){ 855 //just the mean 856 final double[] vcv = { sumYY/(((n-1)*n)) }; 857 final double[] params = { ybar }; 858 return new RegressionResults( 859 params, new double[][] {vcv}, true, n, 1, 860 sumY, _syy+_mean, sumYY,true,false); 861 862 }else if( variablesToInclude[0] == 1){ 863 //final double _syy = sumYY + sumY * sumY / ((double) n); 864 final double _sxx = sumXX + sumX * sumX / n; 865 final double _sxy = sumXY + sumX * sumY / n; 866 final double _sse = FastMath.max(0d, _syy - _sxy * _sxy / _sxx); 867 final double _mse = _sse/((n-1)); 868 if( !Double.isNaN(_sxx) ){ 869 final double[] vcv = { _mse / _sxx }; 870 final double[] params = { _sxy/_sxx }; 871 return new RegressionResults( 872 params, new double[][] {vcv}, true, n, 1, 873 sumY, _syy, _sse,false,false); 874 }else{ 875 final double[] vcv = {Double.NaN }; 876 final double[] params = { Double.NaN }; 877 return new RegressionResults( 878 params, new double[][] {vcv}, true, n, 1, 879 Double.NaN, Double.NaN, Double.NaN,false,false); 880 } 881 } 882 } 883 } else { 884 if (variablesToInclude[0] != 0) { 885 throw new MathIllegalArgumentException(LocalizedCoreFormats.OUT_OF_RANGE_SIMPLE, 886 variablesToInclude[0], 0, 0); 887 } 888 return regress(); 889 } 890 891 return null; 892 } 893 }