SimpleRegression.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * This is not the original file distributed by the Apache Software Foundation
 * It has been modified by the Hipparchus project
 */

package org.hipparchus.stat.regression;
import java.io.Serializable;

import org.hipparchus.distribution.continuous.TDistribution;
import org.hipparchus.exception.LocalizedCoreFormats;
import org.hipparchus.exception.MathIllegalArgumentException;
import org.hipparchus.stat.LocalizedStatFormats;
import org.hipparchus.util.FastMath;
import org.hipparchus.util.MathUtils;
import org.hipparchus.util.Precision;
/**
 * Estimates an ordinary least squares regression model
 * with one independent variable.
 * <p>
 * <code> y = intercept + slope * x </code></p>
 * <p>
 * Standard errors for <code>intercept</code> and <code>slope</code> are
 * available as well as ANOVA, r-square and Pearson's r statistics.</p>
 * <p>
 * Observations (x,y pairs) can be added to the model one at a time or they
 * can be provided in a 2-dimensional array.  The observations are not stored
 * in memory, so there is no limit to the number of observations that can be
 * added to the model.</p>
 * <p><strong>Usage Notes</strong>:</p>
 * <ul>
 * <li> When there are fewer than two observations in the model, or when
 * there is no variation in the x values (i.e. all x values are the same),
 * all statistics return <code>NaN</code>. At least two observations with
 * different x coordinates are required to estimate a bivariate regression
 * model.
 * </li>
 * <li> Getters for the statistics always compute values based on the current
 * set of observations -- i.e., you can get statistics, then add more data
 * and get updated statistics without using a new instance.  There is no
 * "compute" method that updates all statistics.  Each of the getters performs
 * the necessary computations to return the requested statistic.
 * </li>
 * <li> The intercept term may be suppressed by passing {@code false} to
 * the {@link #SimpleRegression(boolean)} constructor.  When the
 * {@code hasIntercept} property is false, the model is estimated without a
 * constant term and {@link #getIntercept()} returns {@code 0}.</li>
 * </ul>
 *
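 * <p>
 * A minimal usage sketch (illustrative only; every method shown is declared in
 * this class, and the data values are arbitrary):</p>
 * <pre>{@code
 * SimpleRegression regression = new SimpleRegression();
 * regression.addData(1d, 2d);
 * regression.addData(3d, 3d);
 * regression.addData(4d, 6d);
 *
 * double slope     = regression.getSlope();
 * double intercept = regression.getIntercept();
 * double predicted = regression.predict(1.5d); // intercept + slope * 1.5
 * }</pre>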
 */
public class SimpleRegression implements Serializable, UpdatingMultipleLinearRegression {

    /** Serializable version identifier */
    private static final long serialVersionUID = -3004689053607543335L;

    /** sum of x values */
    private double sumX;

    /** total variation in x (sum of squared deviations from xbar) */
    private double sumXX;

    /** sum of y values */
    private double sumY;

    /** total variation in y (sum of squared deviations from ybar) */
    private double sumYY;

    /** sum of products */
    private double sumXY;

    /** number of observations */
    private long n;

    /** mean of accumulated x values, used in updating formulas */
    private double xbar;

    /** mean of accumulated y values, used in updating formulas */
    private double ybar;

    /** include an intercept or not */
    private final boolean hasIntercept;
    // ---------------------Public methods--------------------------------------

    /**
     * Create an empty SimpleRegression instance
     */
    public SimpleRegression() {
        this(true);
    }

    /**
     * Create a SimpleRegression instance, specifying whether or not to estimate
     * an intercept.
     *
     * <p>Use {@code false} to estimate a model with no intercept.  When the
     * {@code hasIntercept} property is false, the model is estimated without a
     * constant term and {@link #getIntercept()} returns {@code 0}.</p>
     *
     * @param includeIntercept whether or not to include an intercept term in
     * the regression model
     */
    public SimpleRegression(boolean includeIntercept) {
        super();
        hasIntercept = includeIntercept;
    }

    /**
     * Adds the observation (x,y) to the regression data set.
     * <p>
     * Uses updating formulas for means and sums of squares defined in
     * "Algorithms for Computing the Sample Variance: Analysis and
     * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J.
     * 1983, American Statistician, vol. 37, pp. 242-247, referenced in
     * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985.</p>
     *
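     * <p>
     * For reference, the update applied for a single new point (x, y) when an
     * intercept is estimated, with n the number of observations already added
     * (this restates the code below rather than introducing a new algorithm):</p>
     * <pre>
     * dx     = x - xbar;               dy     = y - ybar;
     * sumXX += dx * dx * n / (n + 1);  sumYY += dy * dy * n / (n + 1);
     * sumXY += dx * dy * n / (n + 1);
     * xbar  += dx / (n + 1);           ybar  += dy / (n + 1);
     * </pre>
     *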
     * @param x independent variable value
     * @param y dependent variable value
     */
    public void addData(final double x, final double y) {
        if (n == 0) {
            xbar = x;
            ybar = y;
        } else {
            if (hasIntercept) {
                final double fact1 = 1.0 + n;
                final double fact2 = n / (1.0 + n);
                final double dx = x - xbar;
                final double dy = y - ybar;
                sumXX += dx * dx * fact2;
                sumYY += dy * dy * fact2;
                sumXY += dx * dy * fact2;
                xbar += dx / fact1;
                ybar += dy / fact1;
            }
        }
        if (!hasIntercept) {
            sumXX += x * x;
            sumYY += y * y;
            sumXY += x * y;
        }
        sumX += x;
        sumY += y;
        n++;
    }

    /**
     * Appends data from another regression calculation to this one.
     *
     * <p>The mean update formulae are based on a paper written by Philippe
     * P&eacute;bay:
     * <a
     * href="http://prod.sandia.gov/techlib/access-control.cgi/2008/086212.pdf">
     * Formulas for Robust, One-Pass Parallel Computation of Covariances and
     * Arbitrary-Order Statistical Moments</a>, 2008, Technical Report
     * SAND2008-6212, Sandia National Laboratories.</p>
     *
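     * <p>
     * Illustrative sketch: two partial regressions built on separate data
     * partitions can be combined and, up to floating-point rounding, yield the
     * same statistics as a single instance that saw all of the observations:</p>
     * <pre>{@code
     * SimpleRegression left  = new SimpleRegression();
     * SimpleRegression right = new SimpleRegression();
     * left.addData(1d, 2d);
     * left.addData(2d, 3d);
     * right.addData(3d, 5d);
     * right.addData(4d, 6d);
     * left.append(right); // left now reflects all four observations
     * }</pre>
     *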
     * @param reg model to append data from
     */
    public void append(SimpleRegression reg) {
        if (n == 0) {
            xbar = reg.xbar;
            ybar = reg.ybar;
            sumXX = reg.sumXX;
            sumYY = reg.sumYY;
            sumXY = reg.sumXY;
        } else {
            if (hasIntercept) {
                final double fact1 = reg.n / (double) (reg.n + n);
                final double fact2 = n * reg.n / (double) (reg.n + n);
                final double dx = reg.xbar - xbar;
                final double dy = reg.ybar - ybar;
                sumXX += reg.sumXX + dx * dx * fact2;
                sumYY += reg.sumYY + dy * dy * fact2;
                sumXY += reg.sumXY + dx * dy * fact2;
                xbar += dx * fact1;
                ybar += dy * fact1;
            } else {
                sumXX += reg.sumXX;
                sumYY += reg.sumYY;
                sumXY += reg.sumXY;
            }
        }
        sumX += reg.sumX;
        sumY += reg.sumY;
        n += reg.n;
    }

    /**
     * Removes the observation (x,y) from the regression data set.
     * <p>
     * Mirrors the addData method.  This method permits the use of
     * SimpleRegression instances in streaming mode where the regression
     * is applied to a sliding "window" of observations; however, the caller is
     * responsible for maintaining the set of observations in the window.</p>
     * <p>
     * The method has no effect if there are no points of data (i.e. n=0).</p>
     *
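     * <p>
     * Illustrative sliding-window sketch; the {@code java.util.ArrayDeque},
     * {@code windowSize}, {@code x} and {@code y} below are hypothetical
     * caller-side names, not part of this class:</p>
     * <pre>{@code
     * Deque<double[]> window = new ArrayDeque<>();
     * SimpleRegression regression = new SimpleRegression();
     * // on each new observation (x, y):
     * if (window.size() == windowSize) {
     *     double[] oldest = window.removeFirst();
     *     regression.removeData(oldest[0], oldest[1]);
     * }
     * window.addLast(new double[] { x, y });
     * regression.addData(x, y);
     * }</pre>
     *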
     * @param x independent variable value
     * @param y dependent variable value
     */
    public void removeData(final double x, final double y) {
        if (n > 0) {
            if (hasIntercept) {
                final double fact1 = n - 1.0;
                final double fact2 = n / (n - 1.0);
                final double dx = x - xbar;
                final double dy = y - ybar;
                sumXX -= dx * dx * fact2;
                sumYY -= dy * dy * fact2;
                sumXY -= dx * dy * fact2;
                xbar -= dx / fact1;
                ybar -= dy / fact1;
            } else {
                final double fact1 = n - 1.0;
                sumXX -= x * x;
                sumYY -= y * y;
                sumXY -= x * y;
                xbar -= x / fact1;
                ybar -= y / fact1;
            }
            sumX -= x;
            sumY -= y;
            n--;
        }
    }

    /**
     * Adds the observations represented by the elements in
     * <code>data</code>.
     * <p>
     * <code>(data[0][0],data[0][1])</code> will be the first observation, then
     * <code>(data[1][0],data[1][1])</code>, etc.</p>
     * <p>
     * This method does not replace data that has already been added.  The
     * observations represented by <code>data</code> are added to the existing
     * dataset.</p>
     * <p>
     * To replace all data, use <code>clear()</code> before adding the new
     * data.</p>
     *
     * @param data array of observations to be added
     * @throws MathIllegalArgumentException if the length of {@code data[i]} is not
     * greater than or equal to 2
     */
    public void addData(final double[][] data) throws MathIllegalArgumentException {
        for (int i = 0; i < data.length; i++) {
            if (data[i].length < 2) {
                throw new MathIllegalArgumentException(LocalizedStatFormats.INVALID_REGRESSION_OBSERVATION,
                    data[i].length, 2);
            }
            addData(data[i][0], data[i][1]);
        }
    }

    /**
     * Adds one observation to the regression model.
     *
     * @param x the independent variables which form the design matrix
     * @param y the dependent or response variable
     * @throws MathIllegalArgumentException if the length of {@code x} does not equal
     * the number of independent variables in the model
     */
    @Override
    public void addObservation(final double[] x, final double y)
            throws MathIllegalArgumentException {
        if (x == null || x.length == 0) {
            throw new MathIllegalArgumentException(LocalizedStatFormats.INVALID_REGRESSION_OBSERVATION,
                                                   x != null ? x.length : 0, 1);
        }
        addData(x[0], y);
    }

    /**
     * Adds a series of observations to the regression model. The lengths of
     * x and y must be the same and x must be rectangular.
     *
     * @param x a series of observations on the independent variables
     * @param y a series of observations on the dependent variable
     * The length of x and y must be the same
     * @throws MathIllegalArgumentException if {@code x} is not rectangular, does not match
     * the length of {@code y} or does not contain sufficient data to estimate the model
     */
    @Override
    public void addObservations(final double[][] x, final double[] y) throws MathIllegalArgumentException {
        MathUtils.checkNotNull(x, LocalizedCoreFormats.INPUT_ARRAY);
        MathUtils.checkNotNull(y, LocalizedCoreFormats.INPUT_ARRAY);
        MathUtils.checkDimension(x.length, y.length);
        boolean obsOk = true;
        for (int i = 0; i < x.length; i++) {
            if (x[i] == null || x[i].length == 0) {
                obsOk = false;
            }
        }
        if (!obsOk) {
            throw new MathIllegalArgumentException(
                  LocalizedStatFormats.NOT_ENOUGH_DATA_FOR_NUMBER_OF_PREDICTORS,
                  0, 1);
        }
        for (int i = 0; i < x.length; i++) {
            addData(x[i][0], y[i]);
        }
    }

    /**
     * Removes observations represented by the elements in <code>data</code>.
     * <p>
     * If the array is larger than the current n, only the first n elements are
     * processed.  This method permits the use of SimpleRegression instances in
     * streaming mode where the regression is applied to a sliding "window" of
     * observations; however, the caller is responsible for maintaining the set
     * of observations in the window.</p>
     * <p>
     * To remove all data, use <code>clear()</code>.</p>
     *
     * @param data array of observations to be removed
     */
    public void removeData(double[][] data) {
        for (int i = 0; i < data.length && n > 0; i++) {
            removeData(data[i][0], data[i][1]);
        }
    }

    /**
     * Clears all data from the model.
     */
    @Override
    public void clear() {
        sumX = 0d;
        sumXX = 0d;
        sumY = 0d;
        sumYY = 0d;
        sumXY = 0d;
        n = 0;
    }

    /**
     * Returns the number of observations that have been added to the model.
     *
     * @return n number of observations that have been added.
     */
    @Override
    public long getN() {
        return n;
    }

    /**
     * Returns the "predicted" <code>y</code> value associated with the
     * supplied <code>x</code> value, based on the data that has been
     * added to the model when this method is invoked.
     * <p>
     * <code> predict(x) = intercept + slope * x </code></p>
     * <p><strong>Preconditions</strong>:</p>
     * <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @param x input <code>x</code> value
     * @return predicted <code>y</code> value
     */
    public double predict(final double x) {
        final double b1 = getSlope();
        if (hasIntercept) {
            return getIntercept(b1) + b1 * x;
        }
        return b1 * x;
    }

    /**
     * Returns the intercept of the estimated regression line, if
     * {@link #hasIntercept()} is true; otherwise 0.
     * <p>
     * The least squares estimate of the intercept is computed using the
     * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
     * The intercept is sometimes denoted b0.</p>
     * <p><strong>Preconditions</strong>:</p>
     * <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return the intercept of the regression line if the model includes an
     * intercept; 0 otherwise
     * @see #SimpleRegression(boolean)
     */
    public double getIntercept() {
        return hasIntercept ? getIntercept(getSlope()) : 0.0;
    }

    /**
     * Returns true if the model includes an intercept term.
     *
     * @return true if the regression includes an intercept; false otherwise
     * @see #SimpleRegression(boolean)
     */
    @Override
    public boolean hasIntercept() {
        return hasIntercept;
    }

    /**
     * Returns the slope of the estimated regression line.
     * <p>
     * The least squares estimate of the slope is computed using the
     * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
     * The slope is sometimes denoted b1.</p>
     * <p><strong>Preconditions</strong>:</p>
     * <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return the slope of the regression line
     */
    public double getSlope() {
        if (n < 2) {
            return Double.NaN; // not enough data
        }
        if (FastMath.abs(sumXX) < 10 * Double.MIN_VALUE) {
            return Double.NaN; // not enough variation in x
        }
        return sumXY / sumXX;
    }

    /**
     * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm">
     * sum of squared errors</a> (SSE) associated with the regression
     * model.
     * <p>
     * The sum is computed using the computational formula</p>
     * <p>
     * <code>SSE = SYY - (SXY * SXY / SXX)</code></p>
     * <p>
     * where <code>SYY</code> is the sum of the squared deviations of the y
     * values about their mean, <code>SXX</code> is similarly defined and
     * <code>SXY</code> is the sum of the products of x and y mean deviations.
     * </p><p>
     * The sums are accumulated using the updating algorithm referenced in
     * {@link #addData}.</p>
     * <p>
     * The return value is constrained to be non-negative - i.e., if due to
     * rounding errors the computational formula returns a negative result,
     * 0 is returned.</p>
     * <p><strong>Preconditions</strong>:</p>
     * <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return sum of squared errors associated with the regression model
     */
    public double getSumSquaredErrors() {
        return FastMath.max(0d, sumYY - sumXY * sumXY / sumXX);
    }

    /**
     * Returns the sum of squared deviations of the y values about their mean.
     * <p>
     * This is defined as SSTO
     * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p>
     * <p>
     * If {@code n < 2}, this returns <code>Double.NaN</code>.</p>
     *
     * @return sum of squared deviations of y values
     */
    public double getTotalSumSquares() {
        if (n < 2) {
            return Double.NaN;
        }
        return sumYY;
    }

    /**
     * Returns the sum of squared deviations of the x values about their mean.
     *
     * <p>If {@code n < 2}, this returns <code>Double.NaN</code>.</p>
     *
     * @return sum of squared deviations of x values
     */
    public double getXSumSquares() {
        if (n < 2) {
            return Double.NaN;
        }
        return sumXX;
    }

    /**
     * Returns the sum of crossproducts, x<sub>i</sub>*y<sub>i</sub>.
     *
     * @return sum of cross products
     */
    public double getSumOfCrossProducts() {
        return sumXY;
    }

    /**
     * Returns the sum of squared deviations of the predicted y values about
     * their mean (which equals the mean of y).
     * <p>
     * This is usually abbreviated SSR or SSM.  It is defined as SSM
     * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p>
     * <p><strong>Preconditions</strong>:</p>
     * <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return sum of squared deviations of predicted y values
     */
    public double getRegressionSumSquares() {
        return getRegressionSumSquares(getSlope());
    }

    /**
     * Returns the sum of squared errors divided by the degrees of freedom,
     * usually abbreviated MSE.
     * <p>
     * If there are fewer than <strong>three</strong> data pairs in the model,
     * or if there is no variation in <code>x</code>, this returns
     * <code>Double.NaN</code>.</p>
     *
     * @return the mean square error of the regression
     */
    public double getMeanSquareError() {
        if (n < 3) {
            return Double.NaN;
        }
        return hasIntercept ? (getSumSquaredErrors() / (n - 2)) : (getSumSquaredErrors() / (n - 1));
    }

    /**
     * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">
     * Pearson's product moment correlation coefficient</a>,
     * usually denoted r.
     * <p><strong>Preconditions</strong>:</p>
     * <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return Pearson's r
     */
    public double getR() {
        double b1 = getSlope();
        double result = FastMath.sqrt(getRSquare());
        if (b1 < 0) {
            result = -result;
        }
        return result;
    }

    /**
     * Returns the <a href="http://www.xycoon.com/coefficient1.htm">
     * coefficient of determination</a>,
     * usually denoted r-square.
     * <p><strong>Preconditions</strong>:</p>
     * <ul>
     * <li>At least two observations (with at least two different x values)
     * must have been added before invoking this method. If this method is
     * invoked before a model can be estimated, <code>Double.NaN</code> is
     * returned.
     * </li></ul>
     *
     * @return r-square
     */
    public double getRSquare() {
        double ssto = getTotalSumSquares();
        return (ssto - getSumSquaredErrors()) / ssto;
    }

    /**
     * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm">
     * standard error of the intercept estimate</a>,
     * usually denoted s(b0).
     * <p>
     * If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.</p>
     * <p>
     * Additionally, <code>Double.NaN</code> is returned when the intercept is
     * constrained to be zero.</p>
     *
     * @return standard error associated with intercept estimate
     */
    public double getInterceptStdErr() {
        if (!hasIntercept) {
            return Double.NaN;
        }
        return FastMath.sqrt(
            getMeanSquareError() * ((1d / n) + (xbar * xbar) / sumXX));
    }

    /**
     * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
     * error of the slope estimate</a>,
     * usually denoted s(b1).
     * <p>
     * If there are fewer than <strong>three</strong> data pairs in the model,
     * or if there is no variation in x, this returns <code>Double.NaN</code>.
     * </p>
     *
     * @return standard error associated with slope estimate
     */
    public double getSlopeStdErr() {
        return FastMath.sqrt(getMeanSquareError() / sumXX);
    }

    /**
     * Returns the half-width of a 95% confidence interval for the slope
     * estimate.
     * <p>
     * The 95% confidence interval is</p>
     * <p>
     * <code>(getSlope() - getSlopeConfidenceInterval(),
     * getSlope() + getSlopeConfidenceInterval())</code></p>
     * <p>
     * If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.</p>
     * <p><strong>Usage Note</strong>:</p>
     * <p>
     * The validity of this statistic depends on the assumption that the
     * observations included in the model are drawn from a
     * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
     * Bivariate Normal Distribution</a>.</p>
     *
     * @return half-width of 95% confidence interval for the slope estimate
     * @throws MathIllegalArgumentException if the confidence interval can not be computed.
     */
    public double getSlopeConfidenceInterval() throws MathIllegalArgumentException {
        return getSlopeConfidenceInterval(0.05d);
    }

    /**
     * Returns the half-width of a (100-100*alpha)% confidence interval for
     * the slope estimate.
     * <p>
     * The (100-100*alpha)% confidence interval is</p>
     * <p>
     * <code>(getSlope() - getSlopeConfidenceInterval(),
     * getSlope() + getSlopeConfidenceInterval())</code></p>
     * <p>
     * To request, for example, a 99% confidence interval, use
     * <code>alpha = .01</code></p>
     * <p>
     * <strong>Usage Note</strong>:<br>
     * The validity of this statistic depends on the assumption that the
     * observations included in the model are drawn from a
     * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
     * Bivariate Normal Distribution</a>.</p>
     * <p>
     * <strong>Preconditions:</strong></p><ul>
     * <li>If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.
     * </li>
     * <li>{@code (0 < alpha < 1)}; otherwise a
     * <code>MathIllegalArgumentException</code> is thrown.
     * </li></ul>
     *
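     * <p>
     * Illustrative sketch, given a populated instance {@code regression}
     * (a hypothetical variable name):</p>
     * <pre>{@code
     * double halfWidth = regression.getSlopeConfidenceInterval(0.01); // 99% interval
     * double lower = regression.getSlope() - halfWidth;
     * double upper = regression.getSlope() + halfWidth;
     * }</pre>
     *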
     * @param alpha the desired significance level
     * @return half-width of the (100-100*alpha)% confidence interval for the slope estimate
     * @throws MathIllegalArgumentException if the confidence interval can not be computed.
     */
    public double getSlopeConfidenceInterval(final double alpha)
            throws MathIllegalArgumentException {
        if (n < 3) {
            return Double.NaN;
        }
        if (alpha >= 1 || alpha <= 0) {
            throw new MathIllegalArgumentException(LocalizedStatFormats.SIGNIFICANCE_LEVEL,
                                          alpha, 0, 1);
        }
        // No advertised MathIllegalArgumentException here - will return NaN above
        TDistribution distribution = new TDistribution(n - 2);
        return getSlopeStdErr() *
            distribution.inverseCumulativeProbability(1d - alpha / 2d);
    }

    /**
     * Returns the significance level of the slope (equiv) correlation.
     * <p>
     * Specifically, the returned value is the smallest <code>alpha</code>
     * such that the slope confidence interval with significance level
     * equal to <code>alpha</code> does not include <code>0</code>.
     * On regression output, this is often denoted <code>Prob(|t| &gt; 0)</code>.
     * </p><p>
     * <strong>Usage Note</strong>:<br>
     * The validity of this statistic depends on the assumption that the
     * observations included in the model are drawn from a
     * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
     * Bivariate Normal Distribution</a>.</p>
     * <p>
     * If there are fewer than <strong>three</strong> observations in the
     * model, or if there is no variation in x, this returns
     * <code>Double.NaN</code>.</p>
     *
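     * <p>
     * Equivalently, restating the computation performed by this method: with
     * {@code t = getSlope() / getSlopeStdErr()} and {@code T} the t-distribution
     * with {@code n - 2} degrees of freedom,</p>
     * <pre>
     * significance = 2 * (1 - T.cumulativeProbability(|t|))
     * </pre>
     *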
     * @return significance level for slope/correlation
     * @throws org.hipparchus.exception.MathIllegalStateException
     * if the significance level can not be computed.
     */
    public double getSignificance() {
        if (n < 3) {
            return Double.NaN;
        }
        // No advertised MathIllegalArgumentException here - will return NaN above
        TDistribution distribution = new TDistribution(n - 2);
        return 2d * (1.0 - distribution.cumulativeProbability(
                    FastMath.abs(getSlope()) / getSlopeStdErr()));
    }

    // ---------------------Private methods-----------------------------------

    /**
     * Returns the intercept of the estimated regression line, given the slope.
     * <p>
     * Will return <code>NaN</code> if slope is <code>NaN</code>.</p>
     *
     * @param slope current slope
     * @return the intercept of the regression line
     */
    private double getIntercept(final double slope) {
        if (hasIntercept) {
            return (sumY - slope * sumX) / n;
        }
        return 0.0;
    }

    /**
     * Computes SSR from b1.
     *
     * @param slope regression slope estimate
     * @return sum of squared deviations of predicted y values
     */
    private double getRegressionSumSquares(final double slope) {
        return slope * slope * sumXX;
    }

    /**
     * Performs a regression on data present in buffers and outputs a RegressionResults object.
     *
     * <p>If there are fewer than 3 observations in the model and {@code hasIntercept} is true,
     * a {@code MathIllegalArgumentException} is thrown.  If there is no intercept term, the model must
     * contain at least 2 observations.</p>
     *
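     * <p>
     * Illustrative sketch, assuming a populated instance {@code regression} and the
     * usual {@code RegressionResults} accessors {@code getParameterEstimate} and
     * {@code getRSquared}:</p>
     * <pre>{@code
     * RegressionResults results = regression.regress();
     * double intercept = results.getParameterEstimate(0);
     * double slope     = results.getParameterEstimate(1);
     * double rSquared  = results.getRSquared();
     * }</pre>
     *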
     * @return RegressionResults acts as a container of regression output
     * @throws MathIllegalArgumentException if the model is not correctly specified
     * @throws MathIllegalArgumentException if there is not sufficient data in the model to
     * estimate the regression parameters
     */
    @Override
    public RegressionResults regress() throws MathIllegalArgumentException {
        if (hasIntercept) {
            if (n < 3) {
                throw new MathIllegalArgumentException(LocalizedStatFormats.NOT_ENOUGH_DATA_REGRESSION);
            }
            if (FastMath.abs(sumXX) > Precision.SAFE_MIN) {
                final double[] params = { getIntercept(), getSlope() };
                final double mse = getMeanSquareError();
                final double _syy = sumYY + sumY * sumY / n;
                final double[] vcv = { mse * (xbar * xbar / sumXX + 1.0 / n), -xbar * mse / sumXX, mse / sumXX };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 2, sumY, _syy, getSumSquaredErrors(), true,
                        false);
            } else {
                final double[] params = { sumY / n, Double.NaN };
                // final double mse = getMeanSquareError();
                final double[] vcv = { ybar / (n - 1.0), Double.NaN, Double.NaN };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 1, sumY, sumYY, getSumSquaredErrors(), true,
                        false);
            }
        } else {
            if (n < 2) {
                throw new MathIllegalArgumentException(LocalizedStatFormats.NOT_ENOUGH_DATA_REGRESSION);
            }
            if (!Double.isNaN(sumXX)) {
                final double[] vcv = { getMeanSquareError() / sumXX };
                final double[] params = { sumXY / sumXX };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 1, sumY, sumYY, getSumSquaredErrors(), false,
                        false);
            } else {
                final double[] vcv = { Double.NaN };
                final double[] params = { Double.NaN };
                return new RegressionResults(params, new double[][] { vcv }, true, n, 1, Double.NaN, Double.NaN, Double.NaN, false,
                        false);
            }
        }
    }

    /**
     * Performs a regression on data present in buffers including only regressors
     * indexed in variablesToInclude and outputs a RegressionResults object.
     * @param variablesToInclude an array of indices of regressors to include
     * @return RegressionResults acts as a container of regression output
     * @throws MathIllegalArgumentException if the variablesToInclude array is null or zero length
     * @throws MathIllegalArgumentException if a requested variable is not present in model
     */
    @Override
    public RegressionResults regress(int[] variablesToInclude) throws MathIllegalArgumentException {
        if (variablesToInclude == null || variablesToInclude.length == 0) {
            throw new MathIllegalArgumentException(LocalizedCoreFormats.ARRAY_ZERO_LENGTH_OR_NULL_NOT_ALLOWED);
        }
        if (variablesToInclude.length > 2 || (variablesToInclude.length > 1 && !hasIntercept)) {
            throw new MathIllegalArgumentException(
                    LocalizedCoreFormats.ARRAY_SIZE_EXCEEDS_MAX_VARIABLES,
                    (variablesToInclude.length > 1 && !hasIntercept) ? 1 : 2);
        }

        if (hasIntercept) {
            if (variablesToInclude.length == 2) {
                if (variablesToInclude[0] == 1) {
                    throw new MathIllegalArgumentException(LocalizedCoreFormats.NOT_INCREASING_SEQUENCE);
                } else if (variablesToInclude[0] != 0) {
                    throw new MathIllegalArgumentException(LocalizedCoreFormats.OUT_OF_RANGE_SIMPLE,
                                                           variablesToInclude[0], 0, 1);
                }
                if (variablesToInclude[1] != 1) {
                    throw new MathIllegalArgumentException(LocalizedCoreFormats.OUT_OF_RANGE_SIMPLE,
                                                           variablesToInclude[1], 0, 1);
                }
                return regress();
            } else {
                if (variablesToInclude[0] != 1 && variablesToInclude[0] != 0) {
                    throw new MathIllegalArgumentException(LocalizedCoreFormats.OUT_OF_RANGE_SIMPLE,
                                                           variablesToInclude[0], 0, 1);
                }
                final double _mean = sumY * sumY / n;
                final double _syy = sumYY + _mean;
                if (variablesToInclude[0] == 0) {
                    // just the mean
                    final double[] vcv = { sumYY / ((n - 1) * n) };
                    final double[] params = { ybar };
                    return new RegressionResults(
                      params, new double[][] { vcv }, true, n, 1,
                      sumY, _syy + _mean, sumYY, true, false);

                } else if (variablesToInclude[0] == 1) {
                    //final double _syy = sumYY + sumY * sumY / ((double) n);
                    final double _sxx = sumXX + sumX * sumX / n;
                    final double _sxy = sumXY + sumX * sumY / n;
                    final double _sse = FastMath.max(0d, _syy - _sxy * _sxy / _sxx);
                    final double _mse = _sse / (n - 1);
                    if (!Double.isNaN(_sxx)) {
                        final double[] vcv = { _mse / _sxx };
                        final double[] params = { _sxy / _sxx };
                        return new RegressionResults(
                                    params, new double[][] { vcv }, true, n, 1,
                                    sumY, _syy, _sse, false, false);
                    } else {
                        final double[] vcv = { Double.NaN };
                        final double[] params = { Double.NaN };
                        return new RegressionResults(
                                    params, new double[][] { vcv }, true, n, 1,
                                    Double.NaN, Double.NaN, Double.NaN, false, false);
                    }
                }
            }
        } else {
            if (variablesToInclude[0] != 0) {
                throw new MathIllegalArgumentException(LocalizedCoreFormats.OUT_OF_RANGE_SIMPLE,
                                                       variablesToInclude[0], 0, 0);
            }
            return regress();
        }

        return null;
    }
}