OneWayAnova.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      https://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */

  17. /*
  18.  * This is not the original file distributed by the Apache Software Foundation
  19.  * It has been modified by the Hipparchus project
  20.  */
  21. package org.hipparchus.stat.inference;

  22. import java.util.ArrayList;
  23. import java.util.Collection;

  24. import org.hipparchus.distribution.continuous.FDistribution;
  25. import org.hipparchus.exception.MathIllegalArgumentException;
  26. import org.hipparchus.exception.MathIllegalStateException;
  27. import org.hipparchus.exception.NullArgumentException;
  28. import org.hipparchus.stat.LocalizedStatFormats;
  29. import org.hipparchus.stat.descriptive.StreamingStatistics;
  30. import org.hipparchus.util.MathUtils;

  31. /**
  32.  * Implements one-way ANOVA (analysis of variance) statistics.
  33.  *
  34.  * <p> Tests for differences between two or more categories of univariate data
  35.  * (for example, the body mass index of accountants, lawyers, doctors and
  36.  * computer programmers).  When two categories are given, this is equivalent to
  37.  * the {@link TTest}.
  38.  * </p><p>
  39.  * Uses the {@link FDistribution
  40.  * Hipparchus F Distribution implementation} to estimate exact p-values.</p>
  41.  * <p>This implementation is based on a description at
  42.  * <a href="http://faculty.vassar.edu/lowry/ch13pt1.html">One way Anova (dead link)</a></p>
  43.  * <pre>
  44.  * Abbreviations: bg = between groups,
  45.  *                wg = within groups,
  46.  *                ss = sum squared deviations
  47.  * </pre>
  48.  *
  49.  */
  50. public class OneWayAnova {

  51.     /** Empty constructor.
  52.      * <p>
  53.      * This constructor is not strictly necessary, but it prevents spurious
  54.      * javadoc warnings with JDK 18 and later.
  55.      * </p>
  56.      * @since 3.0
  57.      */
  58.     public OneWayAnova() { // NOPMD - unnecessary constructor added intentionally to make javadoc happy
  59.         // nothing to do
  60.     }

  61.     /**
  62.      * Computes the ANOVA F-value for a collection of <code>double[]</code>
  63.      * arrays.
  64.      *
  65.      * <p><strong>Preconditions</strong>:</p>
  66.      * <ul>
  67.      * <li>The categoryData <code>Collection</code> must contain
  68.      * <code>double[]</code> arrays.</li>
  69.      * <li> There must be at least two <code>double[]</code> arrays in the
  70.      * <code>categoryData</code> collection and each of these arrays must
  71.      * contain at least two values.</li></ul>
  72.      * <p>
  73.      * This implementation computes the F statistic using the definitional
  74.      * formula</p><pre>
  75.      *   F = msbg/mswg</pre>
  76.      * <p>where</p><pre>
  77.      *  msbg = between group mean square
  78.      *  mswg = within group mean square</pre>
  79.      *  <p>
  80.      * are as defined <a href="http://faculty.vassar.edu/lowry/ch13pt1.html">
  81.      * here</a></p>
  82.      *
  83.      * @param categoryData <code>Collection</code> of <code>double[]</code>
  84.      * arrays each containing data for one category
  85.      * @return Fvalue
  86.      * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
  87.      * @throws MathIllegalArgumentException if the length of the <code>categoryData</code>
  88.      * array is less than 2 or a contained <code>double[]</code> array does not have
  89.      * at least two values
  90.      */
  91.     public double anovaFValue(final Collection<double[]> categoryData)
  92.         throws MathIllegalArgumentException, NullArgumentException {

  93.         return anovaStats(categoryData).F;

  94.     }

  95.     /**
  96.      * Computes the ANOVA P-value for a collection of <code>double[]</code>
  97.      * arrays.
  98.      *
  99.      * <p><strong>Preconditions</strong>:</p>
  100.      * <ul>
  101.      * <li>The categoryData <code>Collection</code> must contain
  102.      * <code>double[]</code> arrays.</li>
  103.      * <li> There must be at least two <code>double[]</code> arrays in the
  104.      * <code>categoryData</code> collection and each of these arrays must
  105.      * contain at least two values.</li></ul>
  106.      * <p>
  107.      * This implementation uses the
  108.      * {@link org.hipparchus.distribution.continuous.FDistribution
  109.      * Hipparchus F Distribution implementation} to estimate the exact
  110.      * p-value, using the formula</p><pre>
  111.      *   p = 1 - cumulativeProbability(F)</pre>
  112.      * <p>where <code>F</code> is the F value and <code>cumulativeProbability</code>
  113.      * is the Hipparchus implementation of the F distribution.</p>
  114.      *
  115.      * @param categoryData <code>Collection</code> of <code>double[]</code>
  116.      * arrays each containing data for one category
  117.      * @return Pvalue
  118.      * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
  119.      * @throws MathIllegalArgumentException if the length of the <code>categoryData</code>
  120.      * array is less than 2 or a contained <code>double[]</code> array does not have
  121.      * at least two values
  122.      * @throws MathIllegalStateException if the p-value can not be computed due to a convergence error
  123.      * @throws MathIllegalStateException if the maximum number of iterations is exceeded
  124.      */
  125.     public double anovaPValue(final Collection<double[]> categoryData)
  126.         throws MathIllegalArgumentException, NullArgumentException,
  127.         MathIllegalStateException {

  128.         final AnovaStats a = anovaStats(categoryData);
  129.         // No try-catch or advertised exception because args are valid
  130.         final FDistribution fdist = new FDistribution(a.dfbg, a.dfwg);
  131.         return 1.0 - fdist.cumulativeProbability(a.F);

  132.     }

  133.     /**
  134.      * Computes the ANOVA P-value for a collection of {@link StreamingStatistics}.
  135.      *
  136.      * <p><strong>Preconditions</strong>:</p>
  137.      * <ul>
  138.      * <li>The categoryData <code>Collection</code> must contain
  139.      * {@link StreamingStatistics}.</li>
  140.      * <li> There must be at least two {@link StreamingStatistics} in the
  141.      * <code>categoryData</code> collection and each of these statistics must
  142.      * contain at least two values.</li></ul>
  143.      * <p>
  144.      * This implementation uses the
  145.      * {@link org.hipparchus.distribution.continuous.FDistribution
  146.      * Hipparchus F Distribution implementation} to estimate the exact
  147.      * p-value, using the formula</p><pre>
  148.      *   p = 1 - cumulativeProbability(F)</pre>
  149.      * <p>where <code>F</code> is the F value and <code>cumulativeProbability</code>
  150.      * is the Hipparchus implementation of the F distribution.</p>
  151.      *
  152.      * @param categoryData <code>Collection</code> of {@link StreamingStatistics}
  153.      * each containing data for one category
  154.      * @param allowOneElementData if true, allow computation for one catagory
  155.      * only or for one data element per category
  156.      * @return Pvalue
  157.      * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
  158.      * @throws MathIllegalArgumentException if the length of the <code>categoryData</code>
  159.      * array is less than 2 or a contained {@link StreamingStatistics} does not have
  160.      * at least two values
  161.      * @throws MathIllegalStateException if the p-value can not be computed due to a convergence error
  162.      * @throws MathIllegalStateException if the maximum number of iterations is exceeded
  163.      */
  164.     public double anovaPValue(final Collection<StreamingStatistics> categoryData,
  165.                               final boolean allowOneElementData)
  166.         throws MathIllegalArgumentException, NullArgumentException,
  167.         MathIllegalStateException {

  168.         final AnovaStats a = anovaStats(categoryData, allowOneElementData);
  169.         final FDistribution fdist = new FDistribution(a.dfbg, a.dfwg);
  170.         return 1.0 - fdist.cumulativeProbability(a.F);

  171.     }

  172.     /**
  173.      * This method calls the method that actually does the calculations (except
  174.      * P-value).
  175.      *
  176.      * @param categoryData
  177.      *            <code>Collection</code> of <code>double[]</code> arrays each
  178.      *            containing data for one category
  179.      * @return computed AnovaStats
  180.      * @throws NullArgumentException
  181.      *             if <code>categoryData</code> is <code>null</code>
  182.      * @throws MathIllegalArgumentException
  183.      *             if the length of the <code>categoryData</code> array is less
  184.      *             than 2 or a contained <code>double[]</code> array does not
  185.      *             contain at least two values
  186.      */
  187.     private AnovaStats anovaStats(final Collection<double[]> categoryData)
  188.         throws MathIllegalArgumentException, NullArgumentException {

  189.         MathUtils.checkNotNull(categoryData);

  190.         final Collection<StreamingStatistics> categoryDataSummaryStatistics =
  191.                 new ArrayList<>(categoryData.size());

  192.         // convert arrays to SummaryStatistics
  193.         for (final double[] data : categoryData) {
  194.             final StreamingStatistics dataSummaryStatistics = new StreamingStatistics();
  195.             categoryDataSummaryStatistics.add(dataSummaryStatistics);
  196.             for (final double val : data) {
  197.                 dataSummaryStatistics.addValue(val);
  198.             }
  199.         }

  200.         return anovaStats(categoryDataSummaryStatistics, false);

  201.     }

  202.     /**
  203.      * Performs an ANOVA test, evaluating the null hypothesis that there
  204.      * is no difference among the means of the data categories.
  205.      *
  206.      * <p><strong>Preconditions</strong>:</p>
  207.      * <ul>
  208.      * <li>The categoryData <code>Collection</code> must contain
  209.      * <code>double[]</code> arrays.</li>
  210.      * <li> There must be at least two <code>double[]</code> arrays in the
  211.      * <code>categoryData</code> collection and each of these arrays must
  212.      * contain at least two values.</li>
  213.      * <li>alpha must be strictly greater than 0 and less than or equal to 0.5.
  214.      * </li></ul>
  215.      * <p>
  216.      * This implementation uses the
  217.      * {@link org.hipparchus.distribution.continuous.FDistribution
  218.      * Hipparchus F Distribution implementation} to estimate the exact
  219.      * p-value, using the formula</p><pre>
  220.      *   p = 1 - cumulativeProbability(F)</pre>
  221.      * <p>where <code>F</code> is the F value and <code>cumulativeProbability</code>
  222.      * is the Hipparchus implementation of the F distribution.</p>
  223.      * <p>True is returned iff the estimated p-value is less than alpha.</p>
  224.      *
  225.      * @param categoryData <code>Collection</code> of <code>double[]</code>
  226.      * arrays each containing data for one category
  227.      * @param alpha significance level of the test
  228.      * @return true if the null hypothesis can be rejected with
  229.      * confidence 1 - alpha
  230.      * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
  231.      * @throws MathIllegalArgumentException if the length of the <code>categoryData</code>
  232.      * array is less than 2 or a contained <code>double[]</code> array does not have
  233.      * at least two values
  234.      * @throws MathIllegalArgumentException if <code>alpha</code> is not in the range (0, 0.5]
  235.      * @throws MathIllegalStateException if the p-value can not be computed due to a convergence error
  236.      * @throws MathIllegalStateException if the maximum number of iterations is exceeded
  237.      */
  238.     public boolean anovaTest(final Collection<double[]> categoryData,
  239.                              final double alpha)
  240.         throws MathIllegalArgumentException, NullArgumentException, MathIllegalStateException {

  241.         if ((alpha <= 0) || (alpha > 0.5)) {
  242.             throw new MathIllegalArgumentException(
  243.                     LocalizedStatFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL,
  244.                     alpha, 0, 0.5);
  245.         }
  246.         return anovaPValue(categoryData) < alpha;
  247.     }

  248.     /**
  249.      * This method actually does the calculations (except P-value).
  250.      *
  251.      * @param categoryData <code>Collection</code> of <code>double[]</code>
  252.      * arrays each containing data for one category
  253.      * @param allowOneElementData if true, allow computation for one category
  254.      * only or for one data element per category
  255.      * @return computed AnovaStats
  256.      * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
  257.      * @throws MathIllegalArgumentException if <code>allowOneElementData</code> is false and the number of
  258.      * categories is less than 2 or a contained SummaryStatistics does not contain
  259.      * at least two values
  260.      */
  261.     private AnovaStats anovaStats(final Collection<StreamingStatistics> categoryData,
  262.                                   final boolean allowOneElementData)
  263.         throws MathIllegalArgumentException, NullArgumentException {

  264.         MathUtils.checkNotNull(categoryData);

  265.         if (!allowOneElementData) {
  266.             // check if we have enough categories
  267.             if (categoryData.size() < 2) {
  268.                 throw new MathIllegalArgumentException(LocalizedStatFormats.TWO_OR_MORE_CATEGORIES_REQUIRED,
  269.                                                        categoryData.size(), 2);
  270.             }

  271.             // check if each category has enough data
  272.             for (final StreamingStatistics array : categoryData) {
  273.                 if (array.getN() <= 1) {
  274.                     throw new MathIllegalArgumentException(LocalizedStatFormats.TWO_OR_MORE_VALUES_IN_CATEGORY_REQUIRED,
  275.                                                            (int) array.getN(), 2);
  276.                 }
  277.             }
  278.         }

  279.         int dfwg = 0;
  280.         double sswg = 0;
  281.         double totsum = 0;
  282.         double totsumsq = 0;
  283.         int totnum = 0;

  284.         for (final StreamingStatistics data : categoryData) {

  285.             final double sum = data.getSum();
  286.             final double sumsq = data.getSumOfSquares();
  287.             final int num = (int) data.getN();
  288.             totnum += num;
  289.             totsum += sum;
  290.             totsumsq += sumsq;

  291.             dfwg += num - 1;
  292.             final double ss = sumsq - ((sum * sum) / num);
  293.             sswg += ss;
  294.         }

  295.         final double sst = totsumsq - ((totsum * totsum) / totnum);
  296.         final double ssbg = sst - sswg;
  297.         final int dfbg = categoryData.size() - 1;
  298.         final double msbg = ssbg / dfbg;
  299.         final double mswg = sswg / dfwg;
  300.         final double F = msbg / mswg;

  301.         return new AnovaStats(dfbg, dfwg, F);

  302.     }

  303.     /**
  304.      * Convenience class to pass dfbg,dfwg,F values around within OneWayAnova.
  305.      * No get/set methods provided.
  306.      */
  307.     private static class AnovaStats {

  308.         /** Degrees of freedom in numerator (between groups). */
  309.         private final int dfbg;

  310.         /** Degrees of freedom in denominator (within groups). */
  311.         private final int dfwg;

  312.         /** Statistic. */
  313.         private final double F;

  314.         /**
  315.          * Constructor
  316.          * @param dfbg degrees of freedom in numerator (between groups)
  317.          * @param dfwg degrees of freedom in denominator (within groups)
  318.          * @param F statistic
  319.          */
  320.         private AnovaStats(int dfbg, int dfwg, double F) {
  321.             this.dfbg = dfbg;
  322.             this.dfwg = dfwg;
  323.             this.F = F;
  324.         }
  325.     }

  326. }