Variance.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      https://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */

  17. /*
  18.  * This is not the original file distributed by the Apache Software Foundation
  19.  * It has been modified by the Hipparchus project
  20.  */
  21. package org.hipparchus.stat.descriptive.moment;

  22. import java.io.Serializable;

  23. import org.hipparchus.exception.MathIllegalArgumentException;
  24. import org.hipparchus.exception.NullArgumentException;
  25. import org.hipparchus.stat.StatUtils;
  26. import org.hipparchus.stat.descriptive.AbstractStorelessUnivariateStatistic;
  27. import org.hipparchus.stat.descriptive.AggregatableStatistic;
  28. import org.hipparchus.stat.descriptive.WeightedEvaluation;
  29. import org.hipparchus.util.MathArrays;
  30. import org.hipparchus.util.MathUtils;

  31. /**
  32.  * Computes the variance of the available values.  By default, the unbiased
  33.  * "sample variance" definitional formula is used:
  34.  * <p>
  35.  * variance = sum((x_i - mean)^2) / (n - 1)
  36.  * <p>
  37.  * where mean is the {@link Mean} and <code>n</code> is the number
  38.  * of sample observations.
  39.  * <p>
  40.  * The definitional formula does not have good numerical properties, so
  41.  * this implementation does not compute the statistic using the definitional
  42.  * formula.
  43.  * <ul>
  44.  * <li> The <code>getResult</code> method computes the variance using
  45.  * updating formulas based on West's algorithm, as described in
  46.  * <a href="http://doi.acm.org/10.1145/359146.359152"> Chan, T. F. and
  47.  * J. G. Lewis 1979, <i>Communications of the ACM</i>,
  48.  * vol. 22 no. 9, pp. 526-531.</a></li>
  49.  * <li> The <code>evaluate</code> methods leverage the fact that they have the
  50.  * full array of values in memory to execute a two-pass algorithm.
  51.  * Specifically, these methods use the "corrected two-pass algorithm" from
  52.  * Chan, Golub, Levesque, <i>Algorithms for Computing the Sample Variance</i>,
  53.  * American Statistician, vol. 37, no. 3 (1983) pp. 242-247.</li>
  54.  * </ul>
  55.  * <p>
  56.  * Note that adding values using <code>increment</code> or
  57.  * <code>incrementAll</code> and then executing <code>getResult</code> will
  58.  * sometimes give a different, less accurate, result than executing
  59.  * <code>evaluate</code> with the full array of values. The former approach
  60.  * should only be used when the full array of values is not available.
  61.  * <p>
  62.  * The "population variance"  ( sum((x_i - mean)^2) / n ) can also
  63.  * be computed using this statistic.  The <code>isBiasCorrected</code>
  64.  * property determines whether the "population" or "sample" value is
  65.  * returned by the <code>evaluate</code> and <code>getResult</code> methods.
  66.  * To compute population variances, set this property to <code>false.</code>
  67.  * <p>
  68.  * <strong>Note that this implementation is not synchronized.</strong> If
  69.  * multiple threads access an instance of this class concurrently, and at least
  70.  * one of the threads invokes the <code>increment()</code> or
  71.  * <code>clear()</code> method, it must be synchronized externally.
  72.  */
  73. public class Variance extends AbstractStorelessUnivariateStatistic
  74.     implements AggregatableStatistic<Variance>, WeightedEvaluation, Serializable {

  75.     /** Serializable version identifier */
  76.     private static final long serialVersionUID = 20150412L;

  77.     /** SecondMoment is used in incremental calculation of Variance*/
  78.     protected final SecondMoment moment;

  79.     /**
  80.      * Whether or not {@link #increment(double)} should increment
  81.      * the internal second moment. When a Variance is constructed with an
  82.      * external SecondMoment as a constructor parameter, this property is
  83.      * set to false and increments must be applied to the second moment
  84.      * directly.
  85.      */
  86.     protected final boolean incMoment;

  87.     /**
  88.      * Whether or not bias correction is applied when computing the
  89.      * value of the statistic. True means that bias is corrected.  See
  90.      * {@link Variance} for details on the formula.
  91.      */
  92.     private final boolean isBiasCorrected;

  93.     /**
  94.      * Constructs a Variance with default (true) <code>isBiasCorrected</code>
  95.      * property.
  96.      */
  97.     public Variance() {
  98.         this(true);
  99.     }

  100.     /**
  101.      * Constructs a Variance based on an external second moment.
  102.      * <p>
  103.      * When this constructor is used, the statistic may only be
  104.      * incremented via the moment, i.e., {@link #increment(double)}
  105.      * does nothing; whereas {@code m2.increment(value)} increments
  106.      * both {@code m2} and the Variance instance constructed from it.
  107.      *
  108.      * @param m2 the SecondMoment (Third or Fourth moments work here as well.)
  109.      */
  110.     public Variance(final SecondMoment m2) {
  111.         this(true, m2);
  112.     }

  113.     /**
  114.      * Constructs a Variance with the specified <code>isBiasCorrected</code>
  115.      * property.
  116.      *
  117.      * @param isBiasCorrected  setting for bias correction - true means
  118.      * bias will be corrected and is equivalent to using the argumentless
  119.      * constructor
  120.      */
  121.     public Variance(boolean isBiasCorrected) {
  122.         this(new SecondMoment(), true, isBiasCorrected);
  123.     }

  124.     /**
  125.      * Constructs a Variance with the specified <code>isBiasCorrected</code>
  126.      * property and the supplied external second moment.
  127.      *
  128.      * @param isBiasCorrected  setting for bias correction - true means
  129.      * bias will be corrected
  130.      * @param m2 the SecondMoment (Third or Fourth moments work
  131.      * here as well.)
  132.      */
  133.     public Variance(boolean isBiasCorrected, SecondMoment m2) {
  134.         this(m2, false, isBiasCorrected);
  135.     }

  136.     /**
  137.      * Constructs a Variance with the specified parameters.
  138.      *
  139.      * @param m2 the SecondMoment (Third or Fourth moments work
  140.      * here as well.)
  141.      * @param incMoment if the moment shall be incremented
  142.      * @param isBiasCorrected  setting for bias correction - true means
  143.      * bias will be corrected
  144.      */
  145.     private Variance(SecondMoment m2, boolean incMoment, boolean isBiasCorrected) {
  146.         this.moment          = m2;
  147.         this.incMoment       = incMoment;
  148.         this.isBiasCorrected = isBiasCorrected;
  149.     }

  150.     /**
  151.      * Copy constructor, creates a new {@code Variance} identical
  152.      * to the {@code original}.
  153.      *
  154.      * @param original the {@code Variance} instance to copy
  155.      * @throws NullArgumentException if original is null
  156.      */
  157.     public Variance(Variance original) throws NullArgumentException {
  158.         MathUtils.checkNotNull(original);
  159.         this.moment          = original.moment.copy();
  160.         this.incMoment       = original.incMoment;
  161.         this.isBiasCorrected = original.isBiasCorrected;
  162.     }

  163.     /**
  164.      * {@inheritDoc}
  165.      * <p>If all values are available, it is more accurate to use
  166.      * {@link #evaluate(double[])} rather than adding values one at a time
  167.      * using this method and then executing {@link #getResult}, since
  168.      * <code>evaluate</code> leverages the fact that is has the full
  169.      * list of values together to execute a two-pass algorithm.
  170.      * See {@link Variance}.
  171.      * <p>
  172.      * Note also that when {@link #Variance(SecondMoment)} is used to
  173.      * create a Variance, this method does nothing. In that case, the
  174.      * SecondMoment should be incremented directly.
  175.      */
  176.     @Override
  177.     public void increment(final double d) {
  178.         if (incMoment) {
  179.             moment.increment(d);
  180.         }
  181.     }

  182.     /** {@inheritDoc} */
  183.     @Override
  184.     public double getResult() {
  185.         if (moment.n == 0) {
  186.             return Double.NaN;
  187.         } else if (moment.n == 1) {
  188.             return 0d;
  189.         } else {
  190.             if (isBiasCorrected) {
  191.                 return moment.m2 / (moment.n - 1d);
  192.             } else {
  193.                 return moment.m2 / (moment.n);
  194.             }
  195.         }
  196.     }

  197.     /** {@inheritDoc} */
  198.     @Override
  199.     public long getN() {
  200.         return moment.getN();
  201.     }

  202.     /** {@inheritDoc} */
  203.     @Override
  204.     public void clear() {
  205.         if (incMoment) {
  206.             moment.clear();
  207.         }
  208.     }

  209.     /** {@inheritDoc} */
  210.     @Override
  211.     public void aggregate(Variance other) {
  212.         MathUtils.checkNotNull(other);
  213.         if (incMoment) {
  214.             this.moment.aggregate(other.moment);
  215.         }
  216.     }

  217.     /**
  218.      * Returns the variance of the entries in the specified portion of
  219.      * the input array, or <code>Double.NaN</code> if the designated subarray
  220.      * is empty.  Note that Double.NaN may also be returned if the input
  221.      * includes NaN and / or infinite values.
  222.      * <p>
  223.      * See {@link Variance} for details on the computing algorithm.</p>
  224.      * <p>
  225.      * Returns 0 for a single-value (i.e. length = 1) sample.</p>
  226.      * <p>
  227.      * Does not change the internal state of the statistic.</p>
  228.      * <p>
  229.      * Throws <code>MathIllegalArgumentException</code> if the array is null.</p>
  230.      *
  231.      * @param values the input array
  232.      * @param begin index of the first array element to include
  233.      * @param length the number of elements to include
  234.      * @return the variance of the values or Double.NaN if length = 0
  235.      * @throws MathIllegalArgumentException if the array is null or the array index
  236.      *  parameters are not valid
  237.      */
  238.     @Override
  239.     public double evaluate(final double[] values, final int begin, final int length)
  240.         throws MathIllegalArgumentException {

  241.         double var = Double.NaN;

  242.         if (MathArrays.verifyValues(values, begin, length)) {
  243.             if (length == 1) {
  244.                 var = 0.0;
  245.             } else if (length > 1) {
  246.                 double m = StatUtils.mean(values, begin, length);
  247.                 var = evaluate(values, m, begin, length);
  248.             }
  249.         }
  250.         return var;
  251.     }

  252.     /**
  253.      * Returns the weighted variance of the entries in the specified portion of
  254.      * the input array, or <code>Double.NaN</code> if the designated subarray
  255.      * is empty.
  256.      * <p>
  257.      * Uses the formula
  258.      * <pre>
  259.      *   &Sigma;(weights[i]*(values[i] - weightedMean)²)/(&Sigma;(weights[i]) - 1)
  260.      * </pre>
  261.      * where weightedMean is the weighted mean.
  262.      * <p>
  263.      * This formula will not return the same result as the unweighted variance when all
  264.      * weights are equal, unless all weights are equal to 1. The formula assumes that
  265.      * weights are to be treated as "expansion values," as will be the case if for example
  266.      * the weights represent frequency counts. To normalize weights so that the denominator
  267.      * in the variance computation equals the length of the input vector minus one, use
  268.      * <pre>
  269.      *   <code>evaluate(values, MathArrays.normalizeArray(weights, values.length));</code>
  270.      * </pre>
  271.      * <p>
  272.      * Returns 0 for a single-value (i.e. length = 1) sample.
  273.      * <p>
  274.      * Throws <code>IllegalArgumentException</code> if any of the following are true:
  275.      * <ul><li>the values array is null</li>
  276.      *     <li>the weights array is null</li>
  277.      *     <li>the weights array does not have the same length as the values array</li>
  278.      *     <li>the weights array contains one or more infinite values</li>
  279.      *     <li>the weights array contains one or more NaN values</li>
  280.      *     <li>the weights array contains negative values</li>
  281.      *     <li>the start and length arguments do not determine a valid array</li>
  282.      * </ul>
  283.      * <p>
  284.      * Does not change the internal state of the statistic.
  285.      *
  286.      * @param values the input array
  287.      * @param weights the weights array
  288.      * @param begin index of the first array element to include
  289.      * @param length the number of elements to include
  290.      * @return the weighted variance of the values or Double.NaN if length = 0
  291.      * @throws MathIllegalArgumentException if the parameters are not valid
  292.      */
  293.     @Override
  294.     public double evaluate(final double[] values, final double[] weights,
  295.                            final int begin, final int length)
  296.         throws MathIllegalArgumentException {

  297.         double var = Double.NaN;
  298.         if (MathArrays.verifyValues(values, weights,begin, length)) {
  299.             if (length == 1) {
  300.                 var = 0.0;
  301.             } else if (length > 1) {
  302.                 Mean mean = new Mean();
  303.                 double m = mean.evaluate(values, weights, begin, length);
  304.                 var = evaluate(values, weights, m, begin, length);
  305.             }
  306.         }
  307.         return var;
  308.     }

  309.     /**
  310.      * Returns the variance of the entries in the specified portion of
  311.      * the input array, using the precomputed mean value. Returns
  312.      * <code>Double.NaN</code> if the designated subarray is empty.
  313.      * <p>
  314.      * See {@link Variance} for details on the computing algorithm.
  315.      * <p>
  316.      * The formula used assumes that the supplied mean value is the arithmetic
  317.      * mean of the sample data, not a known population parameter.  This method
  318.      * is supplied only to save computation when the mean has already been
  319.      * computed.
  320.      * <p>
  321.      * Returns 0 for a single-value (i.e. length = 1) sample.
  322.      * <p>
  323.      * Does not change the internal state of the statistic.
  324.      *
  325.      * @param values the input array
  326.      * @param mean the precomputed mean value
  327.      * @param begin index of the first array element to include
  328.      * @param length the number of elements to include
  329.      * @return the variance of the values or Double.NaN if length = 0
  330.      * @throws MathIllegalArgumentException if the array is null or the array index
  331.      *  parameters are not valid
  332.      */
  333.     public double evaluate(final double[] values, final double mean,
  334.                            final int begin, final int length)
  335.         throws MathIllegalArgumentException {

  336.         double var = Double.NaN;
  337.         if (MathArrays.verifyValues(values, begin, length)) {
  338.             if (length == 1) {
  339.                 var = 0.0;
  340.             } else if (length > 1) {
  341.                 double accum = 0.0;
  342.                 double accum2 = 0.0;
  343.                 for (int i = begin; i < begin + length; i++) {
  344.                     final double dev = values[i] - mean;
  345.                     accum += dev * dev;
  346.                     accum2 += dev;
  347.                 }
  348.                 double len = length;
  349.                 if (isBiasCorrected) {
  350.                     var = (accum - (accum2 * accum2 / len)) / (len - 1.0);
  351.                 } else {
  352.                     var = (accum - (accum2 * accum2 / len)) / len;
  353.                 }
  354.             }
  355.         }
  356.         return var;
  357.     }

  358.     /**
  359.      * Returns the variance of the entries in the input array, using the
  360.      * precomputed mean value.  Returns <code>Double.NaN</code> if the array
  361.      * is empty.
  362.      * <p>
  363.      * See {@link Variance} for details on the computing algorithm.
  364.      * <p>
  365.      * If <code>isBiasCorrected</code> is <code>true</code> the formula used
  366.      * assumes that the supplied mean value is the arithmetic mean of the
  367.      * sample data, not a known population parameter.  If the mean is a known
  368.      * population parameter, or if the "population" version of the variance is
  369.      * desired, set <code>isBiasCorrected</code> to <code>false</code> before
  370.      * invoking this method.
  371.      * <p>
  372.      * Returns 0 for a single-value (i.e. length = 1) sample.
  373.      * <p>
  374.      * Does not change the internal state of the statistic.
  375.      *
  376.      * @param values the input array
  377.      * @param mean the precomputed mean value
  378.      * @return the variance of the values or Double.NaN if the array is empty
  379.      * @throws MathIllegalArgumentException if the array is null
  380.      */
  381.     public double evaluate(final double[] values, final double mean)
  382.         throws MathIllegalArgumentException {
  383.         return evaluate(values, mean, 0, values.length);
  384.     }

  385.     /**
  386.      * Returns the weighted variance of the entries in the specified portion of
  387.      * the input array, using the precomputed weighted mean value.  Returns
  388.      * <code>Double.NaN</code> if the designated subarray is empty.
  389.      * <p>
  390.      * Uses the formula
  391.      * <pre>
  392.      *   &Sigma;(weights[i]*(values[i] - mean)²)/(&Sigma;(weights[i]) - 1)
  393.      * </pre>
  394.      * <p>
  395.      * The formula used assumes that the supplied mean value is the weighted arithmetic
  396.      * mean of the sample data, not a known population parameter. This method
  397.      * is supplied only to save computation when the mean has already been
  398.      * computed.
  399.      * <p>
  400.      * This formula will not return the same result as the unweighted variance when all
  401.      * weights are equal, unless all weights are equal to 1. The formula assumes that
  402.      * weights are to be treated as "expansion values," as will be the case if for example
  403.      * the weights represent frequency counts. To normalize weights so that the denominator
  404.      * in the variance computation equals the length of the input vector minus one, use
  405.      * <pre>
  406.      *   <code>evaluate(values, MathArrays.normalizeArray(weights, values.length), mean);</code>
  407.      * </pre>
  408.      * <p>
  409.      * Returns 0 for a single-value (i.e. length = 1) sample.
  410.      * <p>
  411.      * Throws <code>MathIllegalArgumentException</code> if any of the following are true:
  412.      * <ul><li>the values array is null</li>
  413.      *     <li>the weights array is null</li>
  414.      *     <li>the weights array does not have the same length as the values array</li>
  415.      *     <li>the weights array contains one or more infinite values</li>
  416.      *     <li>the weights array contains one or more NaN values</li>
  417.      *     <li>the weights array contains negative values</li>
  418.      *     <li>the start and length arguments do not determine a valid array</li>
  419.      * </ul>
  420.      * <p>
  421.      * Does not change the internal state of the statistic.
  422.      *
  423.      * @param values the input array
  424.      * @param weights the weights array
  425.      * @param mean the precomputed weighted mean value
  426.      * @param begin index of the first array element to include
  427.      * @param length the number of elements to include
  428.      * @return the variance of the values or Double.NaN if length = 0
  429.      * @throws MathIllegalArgumentException if the parameters are not valid
  430.      */
  431.     public double evaluate(final double[] values, final double[] weights,
  432.                            final double mean, final int begin, final int length)
  433.         throws MathIllegalArgumentException {

  434.         double var = Double.NaN;

  435.         if (MathArrays.verifyValues(values, weights, begin, length)) {
  436.             if (length == 1) {
  437.                 var = 0.0;
  438.             } else if (length > 1) {
  439.                 double accum = 0.0;
  440.                 double accum2 = 0.0;
  441.                 for (int i = begin; i < begin + length; i++) {
  442.                     final double dev = values[i] - mean;
  443.                     accum += weights[i] * (dev * dev);
  444.                     accum2 += weights[i] * dev;
  445.                 }

  446.                 double sumWts = 0;
  447.                 for (int i = begin; i < begin + length; i++) {
  448.                     sumWts += weights[i];
  449.                 }

  450.                 if (isBiasCorrected) {
  451.                     var = (accum - (accum2 * accum2 / sumWts)) / (sumWts - 1.0);
  452.                 } else {
  453.                     var = (accum - (accum2 * accum2 / sumWts)) / sumWts;
  454.                 }
  455.             }
  456.         }
  457.         return var;
  458.     }

  459.     /**
  460.      * Returns the weighted variance of the values in the input array, using
  461.      * the precomputed weighted mean value.
  462.      * <p>
  463.      * Uses the formula
  464.      * <pre>
  465.      *   &Sigma;(weights[i]*(values[i] - mean)²)/(&Sigma;(weights[i]) - 1)
  466.      * </pre>
  467.      * <p>
  468.      * The formula used assumes that the supplied mean value is the weighted arithmetic
  469.      * mean of the sample data, not a known population parameter. This method
  470.      * is supplied only to save computation when the mean has already been
  471.      * computed.
  472.      * <p>
  473.      * This formula will not return the same result as the unweighted variance when all
  474.      * weights are equal, unless all weights are equal to 1. The formula assumes that
  475.      * weights are to be treated as "expansion values," as will be the case if for example
  476.      * the weights represent frequency counts. To normalize weights so that the denominator
  477.      * in the variance computation equals the length of the input vector minus one, use
  478.      * <pre>
  479.      *   <code>evaluate(values, MathArrays.normalizeArray(weights, values.length), mean);</code>
  480.      * </pre>
  481.      * <p>
  482.      * Returns 0 for a single-value (i.e. length = 1) sample.
  483.      * <p>
  484.      * Throws <code>MathIllegalArgumentException</code> if any of the following are true:
  485.      * <ul><li>the values array is null</li>
  486.      *     <li>the weights array is null</li>
  487.      *     <li>the weights array does not have the same length as the values array</li>
  488.      *     <li>the weights array contains one or more infinite values</li>
  489.      *     <li>the weights array contains one or more NaN values</li>
  490.      *     <li>the weights array contains negative values</li>
  491.      * </ul>
  492.      * <p>
  493.      * Does not change the internal state of the statistic.
  494.      *
  495.      * @param values the input array
  496.      * @param weights the weights array
  497.      * @param mean the precomputed weighted mean value
  498.      * @return the variance of the values or Double.NaN if length = 0
  499.      * @throws MathIllegalArgumentException if the parameters are not valid
  500.      */
  501.     public double evaluate(final double[] values, final double[] weights, final double mean)
  502.         throws MathIllegalArgumentException {
  503.         return evaluate(values, weights, mean, 0, values.length);
  504.     }

  505.     /** Check if bias is corrected.
  506.      * @return Returns the isBiasCorrected.
  507.      */
  508.     public boolean isBiasCorrected() {
  509.         return isBiasCorrected;
  510.     }

  511.     /**
  512.      * Returns a new copy of this variance with the given bias correction
  513.      * setting.
  514.      *
  515.      * @param biasCorrection The bias correction flag to set.
  516.      * @return a copy of this instance with the given bias correction setting
  517.      */
  518.     public Variance withBiasCorrection(boolean biasCorrection) {
  519.         return new Variance(this.moment, this.incMoment, biasCorrection);
  520.     }

  521.     /** {@inheritDoc} */
  522.     @Override
  523.     public Variance copy() {
  524.         return new Variance(this);
  525.     }

  526. }