1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 /*
19 * This is not the original file distributed by the Apache Software Foundation
20 * It has been modified by the Hipparchus project
21 */
22 package org.hipparchus.stat.inference;
23
24 import java.util.ArrayList;
25 import java.util.Collection;
26
27 import org.hipparchus.distribution.continuous.FDistribution;
28 import org.hipparchus.exception.MathIllegalArgumentException;
29 import org.hipparchus.exception.MathIllegalStateException;
30 import org.hipparchus.exception.NullArgumentException;
31 import org.hipparchus.stat.LocalizedStatFormats;
32 import org.hipparchus.stat.descriptive.StreamingStatistics;
33 import org.hipparchus.util.MathUtils;
34
35 /**
36 * Implements one-way ANOVA (analysis of variance) statistics.
37 *
38 * <p> Tests for differences between two or more categories of univariate data
39 * (for example, the body mass index of accountants, lawyers, doctors and
40 * computer programmers). When two categories are given, this is equivalent to
41 * the {@link TTest}.
42 * </p><p>
43 * Uses the {@link FDistribution
44 * Hipparchus F Distribution implementation} to estimate exact p-values.</p>
45 * <p>This implementation is based on a description at
46 * <a href="http://faculty.vassar.edu/lowry/ch13pt1.html">One way Anova (dead link)</a></p>
47 * <pre>
48 * Abbreviations: bg = between groups,
49 * wg = within groups,
50 * ss = sum squared deviations
51 * </pre>
52 *
53 */
54 public class OneWayAnova {
55
56 /** Empty constructor.
57 * <p>
58 * This constructor is not strictly necessary, but it prevents spurious
59 * javadoc warnings with JDK 18 and later.
60 * </p>
61 * @since 3.0
62 */
63 public OneWayAnova() { // NOPMD - unnecessary constructor added intentionally to make javadoc happy
64 // nothing to do
65 }
66
67 /**
68 * Computes the ANOVA F-value for a collection of <code>double[]</code>
69 * arrays.
70 *
71 * <p><strong>Preconditions</strong>:</p>
72 * <ul>
73 * <li>The categoryData <code>Collection</code> must contain
74 * <code>double[]</code> arrays.</li>
75 * <li> There must be at least two <code>double[]</code> arrays in the
76 * <code>categoryData</code> collection and each of these arrays must
77 * contain at least two values.</li></ul>
78 * <p>
79 * This implementation computes the F statistic using the definitional
80 * formula</p><pre>
81 * F = msbg/mswg</pre>
82 * <p>where</p><pre>
83 * msbg = between group mean square
84 * mswg = within group mean square</pre>
85 * <p>
86 * are as defined <a href="http://faculty.vassar.edu/lowry/ch13pt1.html">
87 * here</a></p>
88 *
89 * @param categoryData <code>Collection</code> of <code>double[]</code>
90 * arrays each containing data for one category
91 * @return Fvalue
92 * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
93 * @throws MathIllegalArgumentException if the length of the <code>categoryData</code>
94 * array is less than 2 or a contained <code>double[]</code> array does not have
95 * at least two values
96 */
97 public double anovaFValue(final Collection<double[]> categoryData)
98 throws MathIllegalArgumentException, NullArgumentException {
99
100 return anovaStats(categoryData).F;
101
102 }
103
104 /**
105 * Computes the ANOVA P-value for a collection of <code>double[]</code>
106 * arrays.
107 *
108 * <p><strong>Preconditions</strong>:</p>
109 * <ul>
110 * <li>The categoryData <code>Collection</code> must contain
111 * <code>double[]</code> arrays.</li>
112 * <li> There must be at least two <code>double[]</code> arrays in the
113 * <code>categoryData</code> collection and each of these arrays must
114 * contain at least two values.</li></ul>
115 * <p>
116 * This implementation uses the
117 * {@link org.hipparchus.distribution.continuous.FDistribution
118 * Hipparchus F Distribution implementation} to estimate the exact
119 * p-value, using the formula</p><pre>
120 * p = 1 - cumulativeProbability(F)</pre>
121 * <p>where <code>F</code> is the F value and <code>cumulativeProbability</code>
122 * is the Hipparchus implementation of the F distribution.</p>
123 *
124 * @param categoryData <code>Collection</code> of <code>double[]</code>
125 * arrays each containing data for one category
126 * @return Pvalue
127 * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
128 * @throws MathIllegalArgumentException if the length of the <code>categoryData</code>
129 * array is less than 2 or a contained <code>double[]</code> array does not have
130 * at least two values
131 * @throws MathIllegalStateException if the p-value can not be computed due to a convergence error
132 * @throws MathIllegalStateException if the maximum number of iterations is exceeded
133 */
134 public double anovaPValue(final Collection<double[]> categoryData)
135 throws MathIllegalArgumentException, NullArgumentException,
136 MathIllegalStateException {
137
138 final AnovaStats a = anovaStats(categoryData);
139 // No try-catch or advertised exception because args are valid
140 final FDistribution fdist = new FDistribution(a.dfbg, a.dfwg);
141 return 1.0 - fdist.cumulativeProbability(a.F);
142
143 }
144
145 /**
146 * Computes the ANOVA P-value for a collection of {@link StreamingStatistics}.
147 *
148 * <p><strong>Preconditions</strong>:</p>
149 * <ul>
150 * <li>The categoryData <code>Collection</code> must contain
151 * {@link StreamingStatistics}.</li>
152 * <li> There must be at least two {@link StreamingStatistics} in the
153 * <code>categoryData</code> collection and each of these statistics must
154 * contain at least two values.</li></ul>
155 * <p>
156 * This implementation uses the
157 * {@link org.hipparchus.distribution.continuous.FDistribution
158 * Hipparchus F Distribution implementation} to estimate the exact
159 * p-value, using the formula</p><pre>
160 * p = 1 - cumulativeProbability(F)</pre>
161 * <p>where <code>F</code> is the F value and <code>cumulativeProbability</code>
162 * is the Hipparchus implementation of the F distribution.</p>
163 *
164 * @param categoryData <code>Collection</code> of {@link StreamingStatistics}
165 * each containing data for one category
166 * @param allowOneElementData if true, allow computation for one catagory
167 * only or for one data element per category
168 * @return Pvalue
169 * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
170 * @throws MathIllegalArgumentException if the length of the <code>categoryData</code>
171 * array is less than 2 or a contained {@link StreamingStatistics} does not have
172 * at least two values
173 * @throws MathIllegalStateException if the p-value can not be computed due to a convergence error
174 * @throws MathIllegalStateException if the maximum number of iterations is exceeded
175 */
176 public double anovaPValue(final Collection<StreamingStatistics> categoryData,
177 final boolean allowOneElementData)
178 throws MathIllegalArgumentException, NullArgumentException,
179 MathIllegalStateException {
180
181 final AnovaStats a = anovaStats(categoryData, allowOneElementData);
182 final FDistribution fdist = new FDistribution(a.dfbg, a.dfwg);
183 return 1.0 - fdist.cumulativeProbability(a.F);
184
185 }
186
187 /**
188 * This method calls the method that actually does the calculations (except
189 * P-value).
190 *
191 * @param categoryData
192 * <code>Collection</code> of <code>double[]</code> arrays each
193 * containing data for one category
194 * @return computed AnovaStats
195 * @throws NullArgumentException
196 * if <code>categoryData</code> is <code>null</code>
197 * @throws MathIllegalArgumentException
198 * if the length of the <code>categoryData</code> array is less
199 * than 2 or a contained <code>double[]</code> array does not
200 * contain at least two values
201 */
202 private AnovaStats anovaStats(final Collection<double[]> categoryData)
203 throws MathIllegalArgumentException, NullArgumentException {
204
205 MathUtils.checkNotNull(categoryData);
206
207 final Collection<StreamingStatistics> categoryDataSummaryStatistics =
208 new ArrayList<>(categoryData.size());
209
210 // convert arrays to SummaryStatistics
211 for (final double[] data : categoryData) {
212 final StreamingStatistics dataSummaryStatistics = new StreamingStatistics();
213 categoryDataSummaryStatistics.add(dataSummaryStatistics);
214 for (final double val : data) {
215 dataSummaryStatistics.addValue(val);
216 }
217 }
218
219 return anovaStats(categoryDataSummaryStatistics, false);
220
221 }
222
223 /**
224 * Performs an ANOVA test, evaluating the null hypothesis that there
225 * is no difference among the means of the data categories.
226 *
227 * <p><strong>Preconditions</strong>:</p>
228 * <ul>
229 * <li>The categoryData <code>Collection</code> must contain
230 * <code>double[]</code> arrays.</li>
231 * <li> There must be at least two <code>double[]</code> arrays in the
232 * <code>categoryData</code> collection and each of these arrays must
233 * contain at least two values.</li>
234 * <li>alpha must be strictly greater than 0 and less than or equal to 0.5.
235 * </li></ul>
236 * <p>
237 * This implementation uses the
238 * {@link org.hipparchus.distribution.continuous.FDistribution
239 * Hipparchus F Distribution implementation} to estimate the exact
240 * p-value, using the formula</p><pre>
241 * p = 1 - cumulativeProbability(F)</pre>
242 * <p>where <code>F</code> is the F value and <code>cumulativeProbability</code>
243 * is the Hipparchus implementation of the F distribution.</p>
244 * <p>True is returned iff the estimated p-value is less than alpha.</p>
245 *
246 * @param categoryData <code>Collection</code> of <code>double[]</code>
247 * arrays each containing data for one category
248 * @param alpha significance level of the test
249 * @return true if the null hypothesis can be rejected with
250 * confidence 1 - alpha
251 * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
252 * @throws MathIllegalArgumentException if the length of the <code>categoryData</code>
253 * array is less than 2 or a contained <code>double[]</code> array does not have
254 * at least two values
255 * @throws MathIllegalArgumentException if <code>alpha</code> is not in the range (0, 0.5]
256 * @throws MathIllegalStateException if the p-value can not be computed due to a convergence error
257 * @throws MathIllegalStateException if the maximum number of iterations is exceeded
258 */
259 public boolean anovaTest(final Collection<double[]> categoryData,
260 final double alpha)
261 throws MathIllegalArgumentException, NullArgumentException, MathIllegalStateException {
262
263 if ((alpha <= 0) || (alpha > 0.5)) {
264 throw new MathIllegalArgumentException(
265 LocalizedStatFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL,
266 alpha, 0, 0.5);
267 }
268 return anovaPValue(categoryData) < alpha;
269 }
270
271 /**
272 * This method actually does the calculations (except P-value).
273 *
274 * @param categoryData <code>Collection</code> of <code>double[]</code>
275 * arrays each containing data for one category
276 * @param allowOneElementData if true, allow computation for one category
277 * only or for one data element per category
278 * @return computed AnovaStats
279 * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
280 * @throws MathIllegalArgumentException if <code>allowOneElementData</code> is false and the number of
281 * categories is less than 2 or a contained SummaryStatistics does not contain
282 * at least two values
283 */
284 private AnovaStats anovaStats(final Collection<StreamingStatistics> categoryData,
285 final boolean allowOneElementData)
286 throws MathIllegalArgumentException, NullArgumentException {
287
288 MathUtils.checkNotNull(categoryData);
289
290 if (!allowOneElementData) {
291 // check if we have enough categories
292 if (categoryData.size() < 2) {
293 throw new MathIllegalArgumentException(LocalizedStatFormats.TWO_OR_MORE_CATEGORIES_REQUIRED,
294 categoryData.size(), 2);
295 }
296
297 // check if each category has enough data
298 for (final StreamingStatistics array : categoryData) {
299 if (array.getN() <= 1) {
300 throw new MathIllegalArgumentException(LocalizedStatFormats.TWO_OR_MORE_VALUES_IN_CATEGORY_REQUIRED,
301 (int) array.getN(), 2);
302 }
303 }
304 }
305
306 int dfwg = 0;
307 double sswg = 0;
308 double totsum = 0;
309 double totsumsq = 0;
310 int totnum = 0;
311
312 for (final StreamingStatistics data : categoryData) {
313
314 final double sum = data.getSum();
315 final double sumsq = data.getSumOfSquares();
316 final int num = (int) data.getN();
317 totnum += num;
318 totsum += sum;
319 totsumsq += sumsq;
320
321 dfwg += num - 1;
322 final double ss = sumsq - ((sum * sum) / num);
323 sswg += ss;
324 }
325
326 final double sst = totsumsq - ((totsum * totsum) / totnum);
327 final double ssbg = sst - sswg;
328 final int dfbg = categoryData.size() - 1;
329 final double msbg = ssbg / dfbg;
330 final double mswg = sswg / dfwg;
331 final double F = msbg / mswg;
332
333 return new AnovaStats(dfbg, dfwg, F);
334
335 }
336
337 /**
338 * Convenience class to pass dfbg,dfwg,F values around within OneWayAnova.
339 * No get/set methods provided.
340 */
341 private static class AnovaStats {
342
343 /** Degrees of freedom in numerator (between groups). */
344 private final int dfbg;
345
346 /** Degrees of freedom in denominator (within groups). */
347 private final int dfwg;
348
349 /** Statistic. */
350 private final double F;
351
352 /**
353 * Constructor
354 * @param dfbg degrees of freedom in numerator (between groups)
355 * @param dfwg degrees of freedom in denominator (within groups)
356 * @param F statistic
357 */
358 private AnovaStats(int dfbg, int dfwg, double F) {
359 this.dfbg = dfbg;
360 this.dfwg = dfwg;
361 this.F = F;
362 }
363 }
364
365 }