OneWayAnova.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This is not the original file distributed by the Apache Software Foundation
* It has been modified by the Hipparchus project
*/
package org.hipparchus.stat.inference;
import java.util.ArrayList;
import java.util.Collection;
import org.hipparchus.distribution.continuous.FDistribution;
import org.hipparchus.exception.MathIllegalArgumentException;
import org.hipparchus.exception.MathIllegalStateException;
import org.hipparchus.exception.NullArgumentException;
import org.hipparchus.stat.LocalizedStatFormats;
import org.hipparchus.stat.descriptive.StreamingStatistics;
import org.hipparchus.util.MathUtils;
/**
* Implements one-way ANOVA (analysis of variance) statistics.
*
* <p> Tests for differences between two or more categories of univariate data
* (for example, the body mass index of accountants, lawyers, doctors and
* computer programmers). When two categories are given, this is equivalent to
* the {@link TTest}.
* </p><p>
* Uses the {@link FDistribution
* Hipparchus F Distribution implementation} to estimate exact p-values.</p>
* <p>This implementation is based on a description at
* <a href="http://faculty.vassar.edu/lowry/ch13pt1.html">One way Anova (dead link)</a></p>
* <pre>
* Abbreviations: bg = between groups,
* wg = within groups,
* ss = sum squared deviations
* </pre>
*
*/
public class OneWayAnova {

    /** Empty constructor.
     * <p>
     * This constructor is not strictly necessary, but it prevents spurious
     * javadoc warnings with JDK 18 and later.
     * </p>
     * @since 3.0
     */
    public OneWayAnova() { // NOPMD - unnecessary constructor added intentionally to make javadoc happy
        // nothing to do
    }

    /**
     * Computes the ANOVA F-value for a collection of <code>double[]</code>
     * arrays.
     *
     * <p><strong>Preconditions</strong>:</p>
     * <ul>
     * <li>The categoryData <code>Collection</code> must contain
     * <code>double[]</code> arrays.</li>
     * <li> There must be at least two <code>double[]</code> arrays in the
     * <code>categoryData</code> collection and each of these arrays must
     * contain at least two values.</li></ul>
     * <p>
     * This implementation computes the F statistic using the definitional
     * formula</p><pre>
     *   F = msbg/mswg</pre>
     * <p>where</p><pre>
     *  msbg = between group mean square
     *  mswg = within group mean square</pre>
     * <p>
     * are as defined <a href="http://faculty.vassar.edu/lowry/ch13pt1.html">
     * here (dead link)</a></p>
     *
     * @param categoryData <code>Collection</code> of <code>double[]</code>
     * arrays each containing data for one category
     * @return Fvalue
     * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
     * @throws MathIllegalArgumentException if the length of the <code>categoryData</code>
     * array is less than 2 or a contained <code>double[]</code> array does not have
     * at least two values
     */
    public double anovaFValue(final Collection<double[]> categoryData)
        throws MathIllegalArgumentException, NullArgumentException {
        return anovaStats(categoryData).fValue;
    }

    /**
     * Computes the ANOVA P-value for a collection of <code>double[]</code>
     * arrays.
     *
     * <p><strong>Preconditions</strong>:</p>
     * <ul>
     * <li>The categoryData <code>Collection</code> must contain
     * <code>double[]</code> arrays.</li>
     * <li> There must be at least two <code>double[]</code> arrays in the
     * <code>categoryData</code> collection and each of these arrays must
     * contain at least two values.</li></ul>
     * <p>
     * This implementation uses the
     * {@link org.hipparchus.distribution.continuous.FDistribution
     * Hipparchus F Distribution implementation} to estimate the exact
     * p-value, using the formula</p><pre>
     *   p = 1 - cumulativeProbability(F)</pre>
     * <p>where <code>F</code> is the F value and <code>cumulativeProbability</code>
     * is the Hipparchus implementation of the F distribution.</p>
     *
     * @param categoryData <code>Collection</code> of <code>double[]</code>
     * arrays each containing data for one category
     * @return Pvalue
     * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
     * @throws MathIllegalArgumentException if the length of the <code>categoryData</code>
     * array is less than 2 or a contained <code>double[]</code> array does not have
     * at least two values
     * @throws MathIllegalStateException if the p-value can not be computed due to a convergence error
     * @throws MathIllegalStateException if the maximum number of iterations is exceeded
     */
    public double anovaPValue(final Collection<double[]> categoryData)
        throws MathIllegalArgumentException, NullArgumentException,
               MathIllegalStateException {
        return pValue(anovaStats(categoryData));
    }

    /**
     * Computes the ANOVA P-value for a collection of {@link StreamingStatistics}.
     *
     * <p><strong>Preconditions</strong> (enforced only when
     * <code>allowOneElementData</code> is <code>false</code>):</p>
     * <ul>
     * <li>The categoryData <code>Collection</code> must contain
     * {@link StreamingStatistics}.</li>
     * <li> There must be at least two {@link StreamingStatistics} in the
     * <code>categoryData</code> collection and each of these statistics must
     * contain at least two values.</li></ul>
     * <p>
     * This implementation uses the
     * {@link org.hipparchus.distribution.continuous.FDistribution
     * Hipparchus F Distribution implementation} to estimate the exact
     * p-value, using the formula</p><pre>
     *   p = 1 - cumulativeProbability(F)</pre>
     * <p>where <code>F</code> is the F value and <code>cumulativeProbability</code>
     * is the Hipparchus implementation of the F distribution.</p>
     *
     * @param categoryData <code>Collection</code> of {@link StreamingStatistics}
     * each containing data for one category
     * @param allowOneElementData if true, allow computation for one category
     * only or for one data element per category
     * @return Pvalue
     * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
     * @throws MathIllegalArgumentException if <code>allowOneElementData</code> is
     * <code>false</code> and the length of the <code>categoryData</code>
     * array is less than 2 or a contained {@link StreamingStatistics} does not have
     * at least two values
     * @throws MathIllegalStateException if the p-value can not be computed due to a convergence error
     * @throws MathIllegalStateException if the maximum number of iterations is exceeded
     */
    public double anovaPValue(final Collection<StreamingStatistics> categoryData,
                              final boolean allowOneElementData)
        throws MathIllegalArgumentException, NullArgumentException,
               MathIllegalStateException {
        return pValue(anovaStats(categoryData, allowOneElementData));
    }

    /**
     * Performs an ANOVA test, evaluating the null hypothesis that there
     * is no difference among the means of the data categories.
     *
     * <p><strong>Preconditions</strong>:</p>
     * <ul>
     * <li>The categoryData <code>Collection</code> must contain
     * <code>double[]</code> arrays.</li>
     * <li> There must be at least two <code>double[]</code> arrays in the
     * <code>categoryData</code> collection and each of these arrays must
     * contain at least two values.</li>
     * <li>alpha must be strictly greater than 0 and less than or equal to 0.5.
     * </li></ul>
     * <p>
     * This implementation uses the
     * {@link org.hipparchus.distribution.continuous.FDistribution
     * Hipparchus F Distribution implementation} to estimate the exact
     * p-value, using the formula</p><pre>
     *   p = 1 - cumulativeProbability(F)</pre>
     * <p>where <code>F</code> is the F value and <code>cumulativeProbability</code>
     * is the Hipparchus implementation of the F distribution.</p>
     * <p>True is returned iff the estimated p-value is less than alpha.</p>
     *
     * @param categoryData <code>Collection</code> of <code>double[]</code>
     * arrays each containing data for one category
     * @param alpha significance level of the test
     * @return true if the null hypothesis can be rejected with
     * confidence 1 - alpha
     * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
     * @throws MathIllegalArgumentException if the length of the <code>categoryData</code>
     * array is less than 2 or a contained <code>double[]</code> array does not have
     * at least two values
     * @throws MathIllegalArgumentException if <code>alpha</code> is not in the range (0, 0.5]
     * @throws MathIllegalStateException if the p-value can not be computed due to a convergence error
     * @throws MathIllegalStateException if the maximum number of iterations is exceeded
     */
    public boolean anovaTest(final Collection<double[]> categoryData,
                             final double alpha)
        throws MathIllegalArgumentException, NullArgumentException, MathIllegalStateException {
        // the significance level is validated before any statistic is computed
        if ((alpha <= 0) || (alpha > 0.5)) {
            throw new MathIllegalArgumentException(
                    LocalizedStatFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL,
                    alpha, 0, 0.5);
        }
        return anovaPValue(categoryData) < alpha;
    }

    /**
     * Converts ANOVA statistics into a p-value, computed as
     * <code>1 - cumulativeProbability(F)</code> on an {@link FDistribution}
     * built from the between-groups and within-groups degrees of freedom.
     * <p>
     * When the statistics come from strictly validated input (at least two
     * categories, each holding at least two values), both degrees of freedom
     * are at least 1. No such guarantee exists when the validation was
     * bypassed with <code>allowOneElementData</code>.
     * </p>
     *
     * @param stats degrees of freedom and F statistic
     * @return Pvalue
     * @throws MathIllegalStateException if the p-value can not be computed due to a convergence error
     */
    private static double pValue(final AnovaStats stats)
        throws MathIllegalStateException {
        final FDistribution fdist = new FDistribution(stats.dfbg, stats.dfwg);
        return 1.0 - fdist.cumulativeProbability(stats.fValue);
    }

    /**
     * Summarizes each <code>double[]</code> category into a
     * {@link StreamingStatistics} and delegates to the method that actually
     * does the calculations (except P-value), with strict validation
     * (<code>allowOneElementData</code> set to <code>false</code>).
     *
     * @param categoryData
     *            <code>Collection</code> of <code>double[]</code> arrays each
     *            containing data for one category
     * @return computed AnovaStats
     * @throws NullArgumentException
     *             if <code>categoryData</code> is <code>null</code>
     * @throws MathIllegalArgumentException
     *             if the length of the <code>categoryData</code> array is less
     *             than 2 or a contained <code>double[]</code> array does not
     *             contain at least two values
     */
    private AnovaStats anovaStats(final Collection<double[]> categoryData)
        throws MathIllegalArgumentException, NullArgumentException {

        MathUtils.checkNotNull(categoryData);

        final Collection<StreamingStatistics> categoryDataSummaryStatistics =
                new ArrayList<>(categoryData.size());

        // convert each array to a StreamingStatistics holding its values
        for (final double[] data : categoryData) {
            final StreamingStatistics dataSummaryStatistics = new StreamingStatistics();
            categoryDataSummaryStatistics.add(dataSummaryStatistics);
            for (final double val : data) {
                dataSummaryStatistics.addValue(val);
            }
        }

        return anovaStats(categoryDataSummaryStatistics, false);

    }

    /**
     * This method actually does the calculations (except P-value).
     * <p>
     * The sums of squared deviations are derived from each category's sum,
     * sum of squares and count. When <code>allowOneElementData</code> is
     * <code>true</code> no validation is performed, so a single category
     * (zero between-groups degrees of freedom) or one value per category
     * (zero within-groups degrees of freedom) leads to a division by zero
     * in floating point and hence a non-finite F.
     * </p>
     *
     * @param categoryData <code>Collection</code> of {@link StreamingStatistics}
     * each containing data for one category
     * @param allowOneElementData if true, allow computation for one category
     * only or for one data element per category
     * @return computed AnovaStats
     * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
     * @throws MathIllegalArgumentException if <code>allowOneElementData</code> is false and the number of
     * categories is less than 2 or a contained {@link StreamingStatistics} does not contain
     * at least two values
     */
    private AnovaStats anovaStats(final Collection<StreamingStatistics> categoryData,
                                  final boolean allowOneElementData)
        throws MathIllegalArgumentException, NullArgumentException {

        MathUtils.checkNotNull(categoryData);

        if (!allowOneElementData) {
            // check if we have enough categories
            if (categoryData.size() < 2) {
                throw new MathIllegalArgumentException(LocalizedStatFormats.TWO_OR_MORE_CATEGORIES_REQUIRED,
                                                       categoryData.size(), 2);
            }

            // check if each category has enough data
            for (final StreamingStatistics array : categoryData) {
                if (array.getN() <= 1) {
                    // the count is 0 or 1 here, so narrowing it for the message is lossless
                    throw new MathIllegalArgumentException(LocalizedStatFormats.TWO_OR_MORE_VALUES_IN_CATEGORY_REQUIRED,
                                                           (int) array.getN(), 2);
                }
            }
        }

        // counts are accumulated as long so that large samples are neither
        // truncated nor overflowed when summed over the categories
        long dfwg = 0;
        double sswg = 0;
        double totsum = 0;
        double totsumsq = 0;
        long totnum = 0;

        for (final StreamingStatistics data : categoryData) {

            final double sum = data.getSum();
            final double sumsq = data.getSumOfSquares();
            final long num = data.getN();
            totnum += num;
            totsum += sum;
            totsumsq += sumsq;

            dfwg += num - 1;
            // sum of squared deviations within this category
            final double ss = sumsq - ((sum * sum) / num);
            sswg += ss;
        }

        // total sum of squared deviations, then the between-groups share
        final double sst = totsumsq - ((totsum * totsum) / totnum);
        final double ssbg = sst - sswg;
        final int dfbg = categoryData.size() - 1;
        final double msbg = ssbg / dfbg;
        final double mswg = sswg / dfwg;
        final double fValue = msbg / mswg;

        return new AnovaStats(dfbg, dfwg, fValue);

    }

    /**
     * Convenience class to pass dfbg,dfwg,F values around within OneWayAnova.
     * No get/set methods provided.
     */
    private static final class AnovaStats {

        /** Degrees of freedom in numerator (between groups). */
        private final int dfbg;

        /** Degrees of freedom in denominator (within groups). */
        private final long dfwg;

        /** Statistic. */
        private final double fValue;

        /**
         * Constructor
         * @param dfbg degrees of freedom in numerator (between groups)
         * @param dfwg degrees of freedom in denominator (within groups)
         * @param fValue statistic
         */
        private AnovaStats(final int dfbg, final long dfwg, final double fValue) {
            this.dfbg = dfbg;
            this.dfwg = dfwg;
            this.fValue = fValue;
        }
    }

}