1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 /*
19 * This is not the original file distributed by the Apache Software Foundation
20 * It has been modified by the Hipparchus project
21 */
22
23 package org.hipparchus.stat.regression;
24 import java.io.Serializable;
25
26 import org.hipparchus.distribution.continuous.TDistribution;
27 import org.hipparchus.exception.LocalizedCoreFormats;
28 import org.hipparchus.exception.MathIllegalArgumentException;
29 import org.hipparchus.stat.LocalizedStatFormats;
30 import org.hipparchus.util.FastMath;
31 import org.hipparchus.util.MathUtils;
32 import org.hipparchus.util.Precision;
33
34 /**
35 * Estimates an ordinary least squares regression model
36 * with one independent variable.
37 * <p>
38 * <code> y = intercept + slope * x </code></p>
39 * <p>
40 * Standard errors for <code>intercept</code> and <code>slope</code> are
41 * available as well as ANOVA, r-square and Pearson's r statistics.</p>
42 * <p>
43 * Observations (x,y pairs) can be added to the model one at a time or they
44 * can be provided in a 2-dimensional array. The observations are not stored
45 * in memory, so there is no limit to the number of observations that can be
46 * added to the model.</p>
47 * <p>* <strong>Usage Notes</strong>:</p>
48 * <ul>
49 * <li> When there are fewer than two observations in the model, or when
50 * there is no variation in the x values (i.e. all x values are the same)
51 * all statistics return <code>NaN</code>. At least two observations with
52 * different x coordinates are required to estimate a bivariate regression
53 * model.
54 * </li>
55 * <li> Getters for the statistics always compute values based on the current
56 * set of observations -- i.e., you can get statistics, then add more data
57 * and get updated statistics without using a new instance. There is no
58 * "compute" method that updates all statistics. Each of the getters performs
59 * the necessary computations to return the requested statistic.
60 * </li>
61 * <li> The intercept term may be suppressed by passing {@code false} to
62 * the {@link #SimpleRegression(boolean)} constructor. When the
63 * {@code hasIntercept} property is false, the model is estimated without a
64 * constant term and {@link #getIntercept()} returns {@code 0}.</li>
65 * </ul>
66 *
67 */
68 public class SimpleRegression implements Serializable, UpdatingMultipleLinearRegression {
69
70 /** Serializable version identifier */
71 private static final long serialVersionUID = -3004689053607543335L;
72
73 /** sum of x values */
74 private double sumX;
75
76 /** total variation in x (sum of squared deviations from xbar) */
77 private double sumXX;
78
79 /** sum of y values */
80 private double sumY;
81
82 /** total variation in y (sum of squared deviations from ybar) */
83 private double sumYY;
84
85 /** sum of products */
86 private double sumXY;
87
88 /** number of observations */
89 private long n;
90
91 /** mean of accumulated x values, used in updating formulas */
92 private double xbar;
93
94 /** mean of accumulated y values, used in updating formulas */
95 private double ybar;
96
97 /** include an intercept or not */
98 private final boolean hasIntercept;
99 // ---------------------Public methods--------------------------------------
100
101 /**
102 * Create an empty SimpleRegression instance
103 */
104 public SimpleRegression() {
105 this(true);
106 }
107 /**
108 * Create a SimpleRegression instance, specifying whether or not to estimate
109 * an intercept.
110 *
111 * <p>Use {@code false} to estimate a model with no intercept. When the
112 * {@code hasIntercept} property is false, the model is estimated without a
113 * constant term and {@link #getIntercept()} returns {@code 0}.</p>
114 *
115 * @param includeIntercept whether or not to include an intercept term in
116 * the regression model
117 */
118 public SimpleRegression(boolean includeIntercept) {
119 super();
120 hasIntercept = includeIntercept;
121 }
122
123 /**
124 * Adds the observation (x,y) to the regression data set.
125 * <p>
126 * Uses updating formulas for means and sums of squares defined in
127 * "Algorithms for Computing the Sample Variance: Analysis and
128 * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J.
129 * 1983, American Statistician, vol. 37, pp. 242-247, referenced in
130 * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985.</p>
131 *
132 *
133 * @param x independent variable value
134 * @param y dependent variable value
135 */
136 public void addData(final double x,final double y) {
137 if (n == 0) {
138 xbar = x;
139 ybar = y;
140 } else {
141 if( hasIntercept ){
142 final double fact1 = 1.0 + n;
143 final double fact2 = n / (1.0 + n);
144 final double dx = x - xbar;
145 final double dy = y - ybar;
146 sumXX += dx * dx * fact2;
147 sumYY += dy * dy * fact2;
148 sumXY += dx * dy * fact2;
149 xbar += dx / fact1;
150 ybar += dy / fact1;
151 }
152 }
153 if( !hasIntercept ){
154 sumXX += x * x ;
155 sumYY += y * y ;
156 sumXY += x * y ;
157 }
158 sumX += x;
159 sumY += y;
160 n++;
161 }
162
163 /**
164 * Appends data from another regression calculation to this one.
165 *
166 * <p>The mean update formulae are based on a paper written by Philippe
167 * Pébay:
168 * <a
169 * href="http://prod.sandia.gov/techlib/access-control.cgi/2008/086212.pdf">
170 * Formulas for Robust, One-Pass Parallel Computation of Covariances and
171 * Arbitrary-Order Statistical Moments</a>, 2008, Technical Report
172 * SAND2008-6212, Sandia National Laboratories.</p>
173 *
174 * @param reg model to append data from
175 */
176 public void append(SimpleRegression reg) {
177 if (n == 0) {
178 xbar = reg.xbar;
179 ybar = reg.ybar;
180 sumXX = reg.sumXX;
181 sumYY = reg.sumYY;
182 sumXY = reg.sumXY;
183 } else {
184 if (hasIntercept) {
185 final double fact1 = reg.n / (double) (reg.n + n);
186 final double fact2 = n * reg.n / (double) (reg.n + n);
187 final double dx = reg.xbar - xbar;
188 final double dy = reg.ybar - ybar;
189 sumXX += reg.sumXX + dx * dx * fact2;
190 sumYY += reg.sumYY + dy * dy * fact2;
191 sumXY += reg.sumXY + dx * dy * fact2;
192 xbar += dx * fact1;
193 ybar += dy * fact1;
194 }else{
195 sumXX += reg.sumXX;
196 sumYY += reg.sumYY;
197 sumXY += reg.sumXY;
198 }
199 }
200 sumX += reg.sumX;
201 sumY += reg.sumY;
202 n += reg.n;
203 }
204
205 /**
206 * Removes the observation (x,y) from the regression data set.
207 * <p>
208 * Mirrors the addData method. This method permits the use of
209 * SimpleRegression instances in streaming mode where the regression
210 * is applied to a sliding "window" of observations, however the caller is
211 * responsible for maintaining the set of observations in the window.</p>
212 *
213 * The method has no effect if there are no points of data (i.e. n=0)
214 *
215 * @param x independent variable value
216 * @param y dependent variable value
217 */
218 public void removeData(final double x,final double y) {
219 if (n > 0) {
220 if (hasIntercept) {
221 final double fact1 = n - 1.0;
222 final double fact2 = n / (n - 1.0);
223 final double dx = x - xbar;
224 final double dy = y - ybar;
225 sumXX -= dx * dx * fact2;
226 sumYY -= dy * dy * fact2;
227 sumXY -= dx * dy * fact2;
228 xbar -= dx / fact1;
229 ybar -= dy / fact1;
230 } else {
231 final double fact1 = n - 1.0;
232 sumXX -= x * x;
233 sumYY -= y * y;
234 sumXY -= x * y;
235 xbar -= x / fact1;
236 ybar -= y / fact1;
237 }
238 sumX -= x;
239 sumY -= y;
240 n--;
241 }
242 }
243
244 /**
245 * Adds the observations represented by the elements in
246 * <code>data</code>.
247 * <p>
248 * <code>(data[0][0],data[0][1])</code> will be the first observation, then
249 * <code>(data[1][0],data[1][1])</code>, etc.</p>
250 * <p>
251 * This method does not replace data that has already been added. The
252 * observations represented by <code>data</code> are added to the existing
253 * dataset.</p>
254 * <p>
255 * To replace all data, use <code>clear()</code> before adding the new
256 * data.</p>
257 *
258 * @param data array of observations to be added
259 * @throws MathIllegalArgumentException if the length of {@code data[i]} is not
260 * greater than or equal to 2
261 */
262 public void addData(final double[][] data) throws MathIllegalArgumentException {
263 for (double[] datum : data) {
264 if (datum.length < 2) {
265 throw new MathIllegalArgumentException(LocalizedStatFormats.INVALID_REGRESSION_OBSERVATION,
266 datum.length, 2);
267 }
268 addData(datum[0], datum[1]);
269 }
270 }
271
272 /**
273 * Adds one observation to the regression model.
274 *
275 * @param x the independent variables which form the design matrix
276 * @param y the dependent or response variable
277 * @throws MathIllegalArgumentException if the length of {@code x} does not equal
278 * the number of independent variables in the model
279 */
280 @Override
281 public void addObservation(final double[] x,final double y)
282 throws MathIllegalArgumentException {
283 if( x == null || x.length == 0 ){
284 throw new MathIllegalArgumentException(LocalizedStatFormats.INVALID_REGRESSION_OBSERVATION,x!=null?x.length:0, 1);
285 }
286 addData( x[0], y );
287 }
288
289 /**
290 * Adds a series of observations to the regression model. The lengths of
291 * x and y must be the same and x must be rectangular.
292 *
293 * @param x a series of observations on the independent variables
294 * @param y a series of observations on the dependent variable
295 * The length of x and y must be the same
296 * @throws MathIllegalArgumentException if {@code x} is not rectangular, does not match
297 * the length of {@code y} or does not contain sufficient data to estimate the model
298 */
299 @Override
300 public void addObservations(final double[][] x,final double[] y) throws MathIllegalArgumentException {
301 MathUtils.checkNotNull(x, LocalizedCoreFormats.INPUT_ARRAY);
302 MathUtils.checkNotNull(y, LocalizedCoreFormats.INPUT_ARRAY);
303 MathUtils.checkDimension(x.length, y.length);
304 boolean obsOk = true;
305 for (double[] doubles : x) {
306 if (doubles == null || doubles.length == 0) {
307 obsOk = false;
308 }
309 }
310 if( !obsOk ){
311 throw new MathIllegalArgumentException(
312 LocalizedStatFormats.NOT_ENOUGH_DATA_FOR_NUMBER_OF_PREDICTORS,
313 0, 1);
314 }
315 for( int i = 0 ; i < x.length ; i++){
316 addData( x[i][0], y[i] );
317 }
318 }
319
320 /**
321 * Removes observations represented by the elements in <code>data</code>.
322 * <p>
323 * If the array is larger than the current n, only the first n elements are
324 * processed. This method permits the use of SimpleRegression instances in
325 * streaming mode where the regression is applied to a sliding "window" of
326 * observations, however the caller is responsible for maintaining the set
327 * of observations in the window.</p>
328 * <p>
329 * To remove all data, use <code>clear()</code>.</p>
330 *
331 * @param data array of observations to be removed
332 */
333 public void removeData(double[][] data) {
334 for (int i = 0; i < data.length && n > 0; i++) {
335 removeData(data[i][0], data[i][1]);
336 }
337 }
338
339 /**
340 * Clears all data from the model.
341 */
342 @Override
343 public void clear() {
344 sumX = 0d;
345 sumXX = 0d;
346 sumY = 0d;
347 sumYY = 0d;
348 sumXY = 0d;
349 n = 0;
350 }
351
352 /**
353 * Returns the number of observations that have been added to the model.
354 *
355 * @return n number of observations that have been added.
356 */
357 @Override
358 public long getN() {
359 return n;
360 }
361
362 /**
363 * Returns the "predicted" <code>y</code> value associated with the
364 * supplied <code>x</code> value, based on the data that has been
365 * added to the model when this method is activated.
366 * <p>
367 * <code> predict(x) = intercept + slope * x </code></p>
368 * <p>* <strong>Preconditions</strong>:</p>
369 * <ul>
370 * <li>At least two observations (with at least two different x values)
371 * must have been added before invoking this method. If this method is
372 * invoked before a model can be estimated, <code>Double,NaN</code> is
373 * returned.
374 * </li></ul>
375 *
376 * @param x input <code>x</code> value
377 * @return predicted <code>y</code> value
378 */
379 public double predict(final double x) {
380 final double b1 = getSlope();
381 if (hasIntercept) {
382 return getIntercept(b1) + b1 * x;
383 }
384 return b1 * x;
385 }
386
387 /**
388 * Returns the intercept of the estimated regression line, if
389 * {@link #hasIntercept()} is true; otherwise 0.
390 * <p>
391 * The least squares estimate of the intercept is computed using the
392 * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
393 * The intercept is sometimes denoted b0.</p>
394 * <p><strong>Preconditions</strong>:</p>
395 * <ul>
396 * <li>At least two observations (with at least two different x values)
397 * must have been added before invoking this method. If this method is
398 * invoked before a model can be estimated, <code>Double,NaN</code> is
399 * returned.
400 * </li></ul>
401 *
402 * @return the intercept of the regression line if the model includes an
403 * intercept; 0 otherwise
404 * @see #SimpleRegression(boolean)
405 */
406 public double getIntercept() {
407 return hasIntercept ? getIntercept(getSlope()) : 0.0;
408 }
409
410 /**
411 * Returns true if the model includes an intercept term.
412 *
413 * @return true if the regression includes an intercept; false otherwise
414 * @see #SimpleRegression(boolean)
415 */
416 @Override
417 public boolean hasIntercept() {
418 return hasIntercept;
419 }
420
421 /**
422 * Returns the slope of the estimated regression line.
423 * <p>
424 * The least squares estimate of the slope is computed using the
425 * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
426 * The slope is sometimes denoted b1.</p>
427 * <p>* <strong>Preconditions</strong>:</p>
428 * <ul>
429 * <li>At least two observations (with at least two different x values)
430 * must have been added before invoking this method. If this method is
431 * invoked before a model can be estimated, <code>Double.NaN</code> is
432 * returned.
433 * </li></ul>
434 *
435 * @return the slope of the regression line
436 */
437 public double getSlope() {
438 if (n < 2) {
439 return Double.NaN; //not enough data
440 }
441 if (FastMath.abs(sumXX) < 10 * Double.MIN_VALUE) {
442 return Double.NaN; //not enough variation in x
443 }
444 return sumXY / sumXX;
445 }
446
447 /**
448 * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm">
449 * sum of squared errors</a> (SSE) associated with the regression
450 * model.
451 * <p>
452 * The sum is computed using the computational formula</p>
453 * <p>
454 * <code>SSE = SYY - (SXY * SXY / SXX)</code></p>
455 * <p>
456 * where <code>SYY</code> is the sum of the squared deviations of the y
457 * values about their mean, <code>SXX</code> is similarly defined and
458 * <code>SXY</code> is the sum of the products of x and y mean deviations.
459 * </p><p>
460 * The sums are accumulated using the updating algorithm referenced in
461 * {@link #addData}.</p>
462 * <p>
463 * The return value is constrained to be non-negative - i.e., if due to
464 * rounding errors the computational formula returns a negative result,
465 * 0 is returned.</p>
466 * <p>* <strong>Preconditions</strong>:</p>
467 * <ul>
468 * <li>At least two observations (with at least two different x values)
469 * must have been added before invoking this method. If this method is
470 * invoked before a model can be estimated, <code>Double,NaN</code> is
471 * returned.
472 * </li></ul>
473 *
474 * @return sum of squared errors associated with the regression model
475 */
476 public double getSumSquaredErrors() {
477 return FastMath.max(0d, sumYY - sumXY * sumXY / sumXX);
478 }
479
480 /**
481 * Returns the sum of squared deviations of the y values about their mean.
482 * <p>
483 * This is defined as SSTO
484 * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p>
485 * <p>
486 * If {@code n < 2}, this returns <code>Double.NaN</code>.</p>
487 *
488 * @return sum of squared deviations of y values
489 */
490 public double getTotalSumSquares() {
491 if (n < 2) {
492 return Double.NaN;
493 }
494 return sumYY;
495 }
496
497 /**
498 * Returns the sum of squared deviations of the x values about their mean.
499 *
500 * <p>If {@code n < 2}, this returns <code>Double.NaN</code>.</p>
501 *
502 * @return sum of squared deviations of x values
503 */
504 public double getXSumSquares() {
505 if (n < 2) {
506 return Double.NaN;
507 }
508 return sumXX;
509 }
510
511 /**
512 * Returns the sum of crossproducts, x<sub>i</sub>*y<sub>i</sub>.
513 *
514 * @return sum of cross products
515 */
516 public double getSumOfCrossProducts() {
517 return sumXY;
518 }
519
520 /**
521 * Returns the sum of squared deviations of the predicted y values about
522 * their mean (which equals the mean of y).
523 * <p>
524 * This is usually abbreviated SSR or SSM. It is defined as SSM
525 * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a></p>
526 * <p>* <strong>Preconditions</strong>:</p>
527 * <ul>
528 * <li>At least two observations (with at least two different x values)
529 * must have been added before invoking this method. If this method is
530 * invoked before a model can be estimated, <code>Double.NaN</code> is
531 * returned.
532 * </li></ul>
533 *
534 * @return sum of squared deviations of predicted y values
535 */
536 public double getRegressionSumSquares() {
537 return getRegressionSumSquares(getSlope());
538 }
539
540 /**
541 * Returns the sum of squared errors divided by the degrees of freedom,
542 * usually abbreviated MSE.
543 * <p>
544 * If there are fewer than <strong>three</strong> data pairs in the model,
545 * or if there is no variation in <code>x</code>, this returns
546 * <code>Double.NaN</code>.</p>
547 *
548 * @return sum of squared deviations of y values
549 */
550 public double getMeanSquareError() {
551 if (n < 3) {
552 return Double.NaN;
553 }
554 return hasIntercept ? (getSumSquaredErrors() / (n - 2)) : (getSumSquaredErrors() / (n - 1));
555 }
556
557 /**
558 * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">
559 * Pearson's product moment correlation coefficient</a>,
560 * usually denoted r.
561 * <p>* <strong>Preconditions</strong>:</p>
562 * <ul>
563 * <li>At least two observations (with at least two different x values)
564 * must have been added before invoking this method. If this method is
565 * invoked before a model can be estimated, <code>Double,NaN</code> is
566 * returned.
567 * </li></ul>
568 *
569 * @return Pearson's r
570 */
571 public double getR() {
572 double b1 = getSlope();
573 double result = FastMath.sqrt(getRSquare());
574 if (b1 < 0) {
575 result = -result;
576 }
577 return result;
578 }
579
580 /**
581 * Returns the <a href="http://www.xycoon.com/coefficient1.htm">
582 * coefficient of determination</a>,
583 * usually denoted r-square.
584 * <p>* <strong>Preconditions</strong>:</p>
585 * <ul>
586 * <li>At least two observations (with at least two different x values)
587 * must have been added before invoking this method. If this method is
588 * invoked before a model can be estimated, <code>Double,NaN</code> is
589 * returned.
590 * </li></ul>
591 *
592 * @return r-square
593 */
594 public double getRSquare() {
595 double ssto = getTotalSumSquares();
596 return (ssto - getSumSquaredErrors()) / ssto;
597 }
598
599 /**
600 * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm">
601 * standard error of the intercept estimate</a>,
602 * usually denoted s(b0).
603 * <p>
604 * If there are fewer that <strong>three</strong> observations in the
605 * model, or if there is no variation in x, this returns
606 * <code>Double.NaN</code>.</p> Additionally, a <code>Double.NaN</code> is
607 * returned when the intercept is constrained to be zero
608 *
609 * @return standard error associated with intercept estimate
610 */
611 public double getInterceptStdErr() {
612 if( !hasIntercept ){
613 return Double.NaN;
614 }
615 return FastMath.sqrt(
616 getMeanSquareError() * ((1d / n) + (xbar * xbar) / sumXX));
617 }
618
619 /**
620 * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
621 * error of the slope estimate</a>,
622 * usually denoted s(b1).
623 * <p>
624 * If there are fewer that <strong>three</strong> data pairs in the model,
625 * or if there is no variation in x, this returns <code>Double.NaN</code>.
626 * </p>
627 *
628 * @return standard error associated with slope estimate
629 */
630 public double getSlopeStdErr() {
631 return FastMath.sqrt(getMeanSquareError() / sumXX);
632 }
633
634 /**
635 * Returns the half-width of a 95% confidence interval for the slope
636 * estimate.
637 * <p>
638 * The 95% confidence interval is</p>
639 * <p>
640 * <code>(getSlope() - getSlopeConfidenceInterval(),
641 * getSlope() + getSlopeConfidenceInterval())</code></p>
642 * <p>
643 * If there are fewer that <strong>three</strong> observations in the
644 * model, or if there is no variation in x, this returns
645 * <code>Double.NaN</code>.</p>
646 * <p>* <strong>Usage Note</strong>:</p>
647 * <p>
648 * The validity of this statistic depends on the assumption that the
649 * observations included in the model are drawn from a
650 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
651 * Bivariate Normal Distribution</a>.</p>
652 *
653 * @return half-width of 95% confidence interval for the slope estimate
654 * @throws MathIllegalArgumentException if the confidence interval can not be computed.
655 */
656 public double getSlopeConfidenceInterval() throws MathIllegalArgumentException {
657 return getSlopeConfidenceInterval(0.05d);
658 }
659
660 /**
661 * Returns the half-width of a (100-100*alpha)% confidence interval for
662 * the slope estimate.
663 * <p>
664 * The (100-100*alpha)% confidence interval is </p>
665 * <p>
666 * <code>(getSlope() - getSlopeConfidenceInterval(),
667 * getSlope() + getSlopeConfidenceInterval())</code></p>
668 * <p>
669 * To request, for example, a 99% confidence interval, use
670 * <code>alpha = .01</code></p>
671 * <p>
672 * <strong>Usage Note</strong>:<br>
673 * The validity of this statistic depends on the assumption that the
674 * observations included in the model are drawn from a
675 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
676 * Bivariate Normal Distribution</a>.</p>
677 * <p>
678 * <strong> Preconditions:</strong></p><ul>
679 * <li>If there are fewer that <strong>three</strong> observations in the
680 * model, or if there is no variation in x, this returns
681 * <code>Double.NaN</code>.
682 * </li>
683 * <li>{@code (0 < alpha < 1)}; otherwise an
684 * <code>MathIllegalArgumentException</code> is thrown.
685 * </li></ul>
686 *
687 * @param alpha the desired significance level
688 * @return half-width of 95% confidence interval for the slope estimate
689 * @throws MathIllegalArgumentException if the confidence interval can not be computed.
690 */
691 public double getSlopeConfidenceInterval(final double alpha)
692 throws MathIllegalArgumentException {
693 if (n < 3) {
694 return Double.NaN;
695 }
696 if (alpha >= 1 || alpha <= 0) {
697 throw new MathIllegalArgumentException(LocalizedStatFormats.SIGNIFICANCE_LEVEL,
698 alpha, 0, 1);
699 }
700 // No advertised MathIllegalArgumentException here - will return NaN above
701 TDistribution distribution = new TDistribution(n - 2);
702 return getSlopeStdErr() *
703 distribution.inverseCumulativeProbability(1d - alpha / 2d);
704 }
705
706 /**
707 * Returns the significance level of the slope (equiv) correlation.
708 * <p>
709 * Specifically, the returned value is the smallest <code>alpha</code>
710 * such that the slope confidence interval with significance level
711 * equal to <code>alpha</code> does not include <code>0</code>.
712 * On regression output, this is often denoted <code>Prob(|t| > 0)</code>
713 * </p><p>
714 * <strong>Usage Note</strong>:<br>
715 * The validity of this statistic depends on the assumption that the
716 * observations included in the model are drawn from a
717 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
718 * Bivariate Normal Distribution</a>.</p>
719 * <p>
720 * If there are fewer that <strong>three</strong> observations in the
721 * model, or if there is no variation in x, this returns
722 * <code>Double.NaN</code>.</p>
723 *
724 * @return significance level for slope/correlation
725 * @throws org.hipparchus.exception.MathIllegalStateException
726 * if the significance level can not be computed.
727 */
728 public double getSignificance() {
729 if (n < 3) {
730 return Double.NaN;
731 }
732 // No advertised MathIllegalArgumentException here - will return NaN above
733 TDistribution distribution = new TDistribution(n - 2);
734 return 2d * (1.0 - distribution.cumulativeProbability(
735 FastMath.abs(getSlope()) / getSlopeStdErr()));
736 }
737
738 // ---------------------Private methods-----------------------------------
739
740 /**
741 * Returns the intercept of the estimated regression line, given the slope.
742 * <p>
743 * Will return <code>NaN</code> if slope is <code>NaN</code>.</p>
744 *
745 * @param slope current slope
746 * @return the intercept of the regression line
747 */
748 private double getIntercept(final double slope) {
749 if( hasIntercept){
750 return (sumY - slope * sumX) / n;
751 }
752 return 0.0;
753 }
754
755 /**
756 * Computes SSR from b1.
757 *
758 * @param slope regression slope estimate
759 * @return sum of squared deviations of predicted y values
760 */
761 private double getRegressionSumSquares(final double slope) {
762 return slope * slope * sumXX;
763 }
764
765 /**
766 * Performs a regression on data present in buffers and outputs a RegressionResults object.
767 *
768 * <p>If there are fewer than 3 observations in the model and {@code hasIntercept} is true
769 * a {@code MathIllegalArgumentException} is thrown. If there is no intercept term, the model must
770 * contain at least 2 observations.</p>
771 *
772 * @return RegressionResults acts as a container of regression output
773 * @throws MathIllegalArgumentException if the model is not correctly specified
774 * @throws MathIllegalArgumentException if there is not sufficient data in the model to
775 * estimate the regression parameters
776 */
777 @Override
778 public RegressionResults regress() throws MathIllegalArgumentException {
779 if (hasIntercept) {
780 if (n < 3) {
781 throw new MathIllegalArgumentException(LocalizedStatFormats.NOT_ENOUGH_DATA_REGRESSION);
782 }
783 if (FastMath.abs(sumXX) > Precision.SAFE_MIN) {
784 final double[] params = { getIntercept(), getSlope() };
785 final double mse = getMeanSquareError();
786 final double _syy = sumYY + sumY * sumY / n;
787 final double[] vcv = { mse * (xbar * xbar / sumXX + 1.0 / n), -xbar * mse / sumXX, mse / sumXX };
788 return new RegressionResults(params, new double[][] { vcv }, true, n, 2, sumY, _syy, getSumSquaredErrors(), true,
789 false);
790 } else {
791 final double[] params = { sumY / n, Double.NaN };
792 // final double mse = getMeanSquareError();
793 final double[] vcv = { ybar / (n - 1.0), Double.NaN, Double.NaN };
794 return new RegressionResults(params, new double[][] { vcv }, true, n, 1, sumY, sumYY, getSumSquaredErrors(), true,
795 false);
796 }
797 } else {
798 if (n < 2) {
799 throw new MathIllegalArgumentException(LocalizedStatFormats.NOT_ENOUGH_DATA_REGRESSION);
800 }
801 if (!Double.isNaN(sumXX)) {
802 final double[] vcv = { getMeanSquareError() / sumXX };
803 final double[] params = { sumXY / sumXX };
804 return new RegressionResults(params, new double[][] { vcv }, true, n, 1, sumY, sumYY, getSumSquaredErrors(), false,
805 false);
806 } else {
807 final double[] vcv = { Double.NaN };
808 final double[] params = { Double.NaN };
809 return new RegressionResults(params, new double[][] { vcv }, true, n, 1, Double.NaN, Double.NaN, Double.NaN, false,
810 false);
811 }
812 }
813 }
814
815 /**
816 * Performs a regression on data present in buffers including only regressors
817 * indexed in variablesToInclude and outputs a RegressionResults object
818 * @param variablesToInclude an array of indices of regressors to include
819 * @return RegressionResults acts as a container of regression output
820 * @throws MathIllegalArgumentException if the variablesToInclude array is null or zero length
821 * @throws MathIllegalArgumentException if a requested variable is not present in model
822 */
823 @Override
824 public RegressionResults regress(int[] variablesToInclude) throws MathIllegalArgumentException {
825 if (variablesToInclude == null || variablesToInclude.length == 0) {
826 throw new MathIllegalArgumentException(LocalizedCoreFormats.ARRAY_ZERO_LENGTH_OR_NULL_NOT_ALLOWED);
827 }
828 if (variablesToInclude.length > 2 || (variablesToInclude.length > 1 && !hasIntercept)) {
829 throw new MathIllegalArgumentException(
830 LocalizedCoreFormats.ARRAY_SIZE_EXCEEDS_MAX_VARIABLES,
831 (variablesToInclude.length > 1 && !hasIntercept) ? 1 : 2);
832 }
833
834 if (hasIntercept) {
835 if (variablesToInclude.length == 2) {
836 if (variablesToInclude[0] == 1) {
837 throw new MathIllegalArgumentException(LocalizedCoreFormats.NOT_INCREASING_SEQUENCE);
838 } else if (variablesToInclude[0] != 0) {
839 throw new MathIllegalArgumentException(LocalizedCoreFormats.OUT_OF_RANGE_SIMPLE,
840 variablesToInclude[0], 0, 1);
841 }
842 if (variablesToInclude[1] != 1) {
843 throw new MathIllegalArgumentException(LocalizedCoreFormats.OUT_OF_RANGE_SIMPLE,
844 variablesToInclude[0], 0, 1);
845 }
846 return regress();
847 }else{
848 if( variablesToInclude[0] != 1 && variablesToInclude[0] != 0 ) {
849 throw new MathIllegalArgumentException(LocalizedCoreFormats.OUT_OF_RANGE_SIMPLE,
850 variablesToInclude[0], 0, 1);
851 }
852 final double _mean = sumY * sumY / n;
853 final double _syy = sumYY + _mean;
854 if( variablesToInclude[0] == 0 ){
855 //just the mean
856 final double[] vcv = { sumYY/(((n-1)*n)) };
857 final double[] params = { ybar };
858 return new RegressionResults(
859 params, new double[][] {vcv}, true, n, 1,
860 sumY, _syy+_mean, sumYY,true,false);
861
862 }else if( variablesToInclude[0] == 1){
863 //final double _syy = sumYY + sumY * sumY / ((double) n);
864 final double _sxx = sumXX + sumX * sumX / n;
865 final double _sxy = sumXY + sumX * sumY / n;
866 final double _sse = FastMath.max(0d, _syy - _sxy * _sxy / _sxx);
867 final double _mse = _sse/((n-1));
868 if( !Double.isNaN(_sxx) ){
869 final double[] vcv = { _mse / _sxx };
870 final double[] params = { _sxy/_sxx };
871 return new RegressionResults(
872 params, new double[][] {vcv}, true, n, 1,
873 sumY, _syy, _sse,false,false);
874 }else{
875 final double[] vcv = {Double.NaN };
876 final double[] params = { Double.NaN };
877 return new RegressionResults(
878 params, new double[][] {vcv}, true, n, 1,
879 Double.NaN, Double.NaN, Double.NaN,false,false);
880 }
881 }
882 }
883 } else {
884 if (variablesToInclude[0] != 0) {
885 throw new MathIllegalArgumentException(LocalizedCoreFormats.OUT_OF_RANGE_SIMPLE,
886 variablesToInclude[0], 0, 0);
887 }
888 return regress();
889 }
890
891 return null;
892 }
893 }