1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * https://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 /* 19 * This is not the original file distributed by the Apache Software Foundation 20 * It has been modified by the Hipparchus project 21 */ 22 package org.hipparchus.stat.inference; 23 24 import java.util.ArrayList; 25 import java.util.Collection; 26 27 import org.hipparchus.distribution.continuous.FDistribution; 28 import org.hipparchus.exception.MathIllegalArgumentException; 29 import org.hipparchus.exception.MathIllegalStateException; 30 import org.hipparchus.exception.NullArgumentException; 31 import org.hipparchus.stat.LocalizedStatFormats; 32 import org.hipparchus.stat.descriptive.StreamingStatistics; 33 import org.hipparchus.util.MathUtils; 34 35 /** 36 * Implements one-way ANOVA (analysis of variance) statistics. 37 * 38 * <p> Tests for differences between two or more categories of univariate data 39 * (for example, the body mass index of accountants, lawyers, doctors and 40 * computer programmers). When two categories are given, this is equivalent to 41 * the {@link TTest}. 42 * </p><p> 43 * Uses the {@link FDistribution 44 * Hipparchus F Distribution implementation} to estimate exact p-values.</p> 45 * <p>This implementation is based on a description at 46 * <a href="http://faculty.vassar.edu/lowry/ch13pt1.html">One way Anova (dead link)</a></p> 47 * <pre> 48 * Abbreviations: bg = between groups, 49 * wg = within groups, 50 * ss = sum squared deviations 51 * </pre> 52 * 53 */ 54 public class OneWayAnova { 55 56 /** Empty constructor. 57 * <p> 58 * This constructor is not strictly necessary, but it prevents spurious 59 * javadoc warnings with JDK 18 and later. 60 * </p> 61 * @since 3.0 62 */ 63 public OneWayAnova() { // NOPMD - unnecessary constructor added intentionally to make javadoc happy 64 // nothing to do 65 } 66 67 /** 68 * Computes the ANOVA F-value for a collection of <code>double[]</code> 69 * arrays. 70 * 71 * <p><strong>Preconditions</strong>:</p> 72 * <ul> 73 * <li>The categoryData <code>Collection</code> must contain 74 * <code>double[]</code> arrays.</li> 75 * <li> There must be at least two <code>double[]</code> arrays in the 76 * <code>categoryData</code> collection and each of these arrays must 77 * contain at least two values.</li></ul> 78 * <p> 79 * This implementation computes the F statistic using the definitional 80 * formula</p><pre> 81 * F = msbg/mswg</pre> 82 * <p>where</p><pre> 83 * msbg = between group mean square 84 * mswg = within group mean square</pre> 85 * <p> 86 * are as defined <a href="http://faculty.vassar.edu/lowry/ch13pt1.html"> 87 * here</a></p> 88 * 89 * @param categoryData <code>Collection</code> of <code>double[]</code> 90 * arrays each containing data for one category 91 * @return Fvalue 92 * @throws NullArgumentException if <code>categoryData</code> is <code>null</code> 93 * @throws MathIllegalArgumentException if the length of the <code>categoryData</code> 94 * array is less than 2 or a contained <code>double[]</code> array does not have 95 * at least two values 96 */ 97 public double anovaFValue(final Collection<double[]> categoryData) 98 throws MathIllegalArgumentException, NullArgumentException { 99 100 return anovaStats(categoryData).F; 101 102 } 103 104 /** 105 * Computes the ANOVA P-value for a collection of <code>double[]</code> 106 * arrays. 107 * 108 * <p><strong>Preconditions</strong>:</p> 109 * <ul> 110 * <li>The categoryData <code>Collection</code> must contain 111 * <code>double[]</code> arrays.</li> 112 * <li> There must be at least two <code>double[]</code> arrays in the 113 * <code>categoryData</code> collection and each of these arrays must 114 * contain at least two values.</li></ul> 115 * <p> 116 * This implementation uses the 117 * {@link org.hipparchus.distribution.continuous.FDistribution 118 * Hipparchus F Distribution implementation} to estimate the exact 119 * p-value, using the formula</p><pre> 120 * p = 1 - cumulativeProbability(F)</pre> 121 * <p>where <code>F</code> is the F value and <code>cumulativeProbability</code> 122 * is the Hipparchus implementation of the F distribution.</p> 123 * 124 * @param categoryData <code>Collection</code> of <code>double[]</code> 125 * arrays each containing data for one category 126 * @return Pvalue 127 * @throws NullArgumentException if <code>categoryData</code> is <code>null</code> 128 * @throws MathIllegalArgumentException if the length of the <code>categoryData</code> 129 * array is less than 2 or a contained <code>double[]</code> array does not have 130 * at least two values 131 * @throws MathIllegalStateException if the p-value can not be computed due to a convergence error 132 * @throws MathIllegalStateException if the maximum number of iterations is exceeded 133 */ 134 public double anovaPValue(final Collection<double[]> categoryData) 135 throws MathIllegalArgumentException, NullArgumentException, 136 MathIllegalStateException { 137 138 final AnovaStats a = anovaStats(categoryData); 139 // No try-catch or advertised exception because args are valid 140 final FDistribution fdist = new FDistribution(a.dfbg, a.dfwg); 141 return 1.0 - fdist.cumulativeProbability(a.F); 142 143 } 144 145 /** 146 * Computes the ANOVA P-value for a collection of {@link StreamingStatistics}. 147 * 148 * <p><strong>Preconditions</strong>:</p> 149 * <ul> 150 * <li>The categoryData <code>Collection</code> must contain 151 * {@link StreamingStatistics}.</li> 152 * <li> There must be at least two {@link StreamingStatistics} in the 153 * <code>categoryData</code> collection and each of these statistics must 154 * contain at least two values.</li></ul> 155 * <p> 156 * This implementation uses the 157 * {@link org.hipparchus.distribution.continuous.FDistribution 158 * Hipparchus F Distribution implementation} to estimate the exact 159 * p-value, using the formula</p><pre> 160 * p = 1 - cumulativeProbability(F)</pre> 161 * <p>where <code>F</code> is the F value and <code>cumulativeProbability</code> 162 * is the Hipparchus implementation of the F distribution.</p> 163 * 164 * @param categoryData <code>Collection</code> of {@link StreamingStatistics} 165 * each containing data for one category 166 * @param allowOneElementData if true, allow computation for one catagory 167 * only or for one data element per category 168 * @return Pvalue 169 * @throws NullArgumentException if <code>categoryData</code> is <code>null</code> 170 * @throws MathIllegalArgumentException if the length of the <code>categoryData</code> 171 * array is less than 2 or a contained {@link StreamingStatistics} does not have 172 * at least two values 173 * @throws MathIllegalStateException if the p-value can not be computed due to a convergence error 174 * @throws MathIllegalStateException if the maximum number of iterations is exceeded 175 */ 176 public double anovaPValue(final Collection<StreamingStatistics> categoryData, 177 final boolean allowOneElementData) 178 throws MathIllegalArgumentException, NullArgumentException, 179 MathIllegalStateException { 180 181 final AnovaStats a = anovaStats(categoryData, allowOneElementData); 182 final FDistribution fdist = new FDistribution(a.dfbg, a.dfwg); 183 return 1.0 - fdist.cumulativeProbability(a.F); 184 185 } 186 187 /** 188 * This method calls the method that actually does the calculations (except 189 * P-value). 190 * 191 * @param categoryData 192 * <code>Collection</code> of <code>double[]</code> arrays each 193 * containing data for one category 194 * @return computed AnovaStats 195 * @throws NullArgumentException 196 * if <code>categoryData</code> is <code>null</code> 197 * @throws MathIllegalArgumentException 198 * if the length of the <code>categoryData</code> array is less 199 * than 2 or a contained <code>double[]</code> array does not 200 * contain at least two values 201 */ 202 private AnovaStats anovaStats(final Collection<double[]> categoryData) 203 throws MathIllegalArgumentException, NullArgumentException { 204 205 MathUtils.checkNotNull(categoryData); 206 207 final Collection<StreamingStatistics> categoryDataSummaryStatistics = 208 new ArrayList<>(categoryData.size()); 209 210 // convert arrays to SummaryStatistics 211 for (final double[] data : categoryData) { 212 final StreamingStatistics dataSummaryStatistics = new StreamingStatistics(); 213 categoryDataSummaryStatistics.add(dataSummaryStatistics); 214 for (final double val : data) { 215 dataSummaryStatistics.addValue(val); 216 } 217 } 218 219 return anovaStats(categoryDataSummaryStatistics, false); 220 221 } 222 223 /** 224 * Performs an ANOVA test, evaluating the null hypothesis that there 225 * is no difference among the means of the data categories. 226 * 227 * <p><strong>Preconditions</strong>:</p> 228 * <ul> 229 * <li>The categoryData <code>Collection</code> must contain 230 * <code>double[]</code> arrays.</li> 231 * <li> There must be at least two <code>double[]</code> arrays in the 232 * <code>categoryData</code> collection and each of these arrays must 233 * contain at least two values.</li> 234 * <li>alpha must be strictly greater than 0 and less than or equal to 0.5. 235 * </li></ul> 236 * <p> 237 * This implementation uses the 238 * {@link org.hipparchus.distribution.continuous.FDistribution 239 * Hipparchus F Distribution implementation} to estimate the exact 240 * p-value, using the formula</p><pre> 241 * p = 1 - cumulativeProbability(F)</pre> 242 * <p>where <code>F</code> is the F value and <code>cumulativeProbability</code> 243 * is the Hipparchus implementation of the F distribution.</p> 244 * <p>True is returned iff the estimated p-value is less than alpha.</p> 245 * 246 * @param categoryData <code>Collection</code> of <code>double[]</code> 247 * arrays each containing data for one category 248 * @param alpha significance level of the test 249 * @return true if the null hypothesis can be rejected with 250 * confidence 1 - alpha 251 * @throws NullArgumentException if <code>categoryData</code> is <code>null</code> 252 * @throws MathIllegalArgumentException if the length of the <code>categoryData</code> 253 * array is less than 2 or a contained <code>double[]</code> array does not have 254 * at least two values 255 * @throws MathIllegalArgumentException if <code>alpha</code> is not in the range (0, 0.5] 256 * @throws MathIllegalStateException if the p-value can not be computed due to a convergence error 257 * @throws MathIllegalStateException if the maximum number of iterations is exceeded 258 */ 259 public boolean anovaTest(final Collection<double[]> categoryData, 260 final double alpha) 261 throws MathIllegalArgumentException, NullArgumentException, MathIllegalStateException { 262 263 if ((alpha <= 0) || (alpha > 0.5)) { 264 throw new MathIllegalArgumentException( 265 LocalizedStatFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL, 266 alpha, 0, 0.5); 267 } 268 return anovaPValue(categoryData) < alpha; 269 } 270 271 /** 272 * This method actually does the calculations (except P-value). 273 * 274 * @param categoryData <code>Collection</code> of <code>double[]</code> 275 * arrays each containing data for one category 276 * @param allowOneElementData if true, allow computation for one category 277 * only or for one data element per category 278 * @return computed AnovaStats 279 * @throws NullArgumentException if <code>categoryData</code> is <code>null</code> 280 * @throws MathIllegalArgumentException if <code>allowOneElementData</code> is false and the number of 281 * categories is less than 2 or a contained SummaryStatistics does not contain 282 * at least two values 283 */ 284 private AnovaStats anovaStats(final Collection<StreamingStatistics> categoryData, 285 final boolean allowOneElementData) 286 throws MathIllegalArgumentException, NullArgumentException { 287 288 MathUtils.checkNotNull(categoryData); 289 290 if (!allowOneElementData) { 291 // check if we have enough categories 292 if (categoryData.size() < 2) { 293 throw new MathIllegalArgumentException(LocalizedStatFormats.TWO_OR_MORE_CATEGORIES_REQUIRED, 294 categoryData.size(), 2); 295 } 296 297 // check if each category has enough data 298 for (final StreamingStatistics array : categoryData) { 299 if (array.getN() <= 1) { 300 throw new MathIllegalArgumentException(LocalizedStatFormats.TWO_OR_MORE_VALUES_IN_CATEGORY_REQUIRED, 301 (int) array.getN(), 2); 302 } 303 } 304 } 305 306 int dfwg = 0; 307 double sswg = 0; 308 double totsum = 0; 309 double totsumsq = 0; 310 int totnum = 0; 311 312 for (final StreamingStatistics data : categoryData) { 313 314 final double sum = data.getSum(); 315 final double sumsq = data.getSumOfSquares(); 316 final int num = (int) data.getN(); 317 totnum += num; 318 totsum += sum; 319 totsumsq += sumsq; 320 321 dfwg += num - 1; 322 final double ss = sumsq - ((sum * sum) / num); 323 sswg += ss; 324 } 325 326 final double sst = totsumsq - ((totsum * totsum) / totnum); 327 final double ssbg = sst - sswg; 328 final int dfbg = categoryData.size() - 1; 329 final double msbg = ssbg / dfbg; 330 final double mswg = sswg / dfwg; 331 final double F = msbg / mswg; 332 333 return new AnovaStats(dfbg, dfwg, F); 334 335 } 336 337 /** 338 * Convenience class to pass dfbg,dfwg,F values around within OneWayAnova. 339 * No get/set methods provided. 340 */ 341 private static class AnovaStats { 342 343 /** Degrees of freedom in numerator (between groups). */ 344 private final int dfbg; 345 346 /** Degrees of freedom in denominator (within groups). */ 347 private final int dfwg; 348 349 /** Statistic. */ 350 private final double F; 351 352 /** 353 * Constructor 354 * @param dfbg degrees of freedom in numerator (between groups) 355 * @param dfwg degrees of freedom in denominator (within groups) 356 * @param F statistic 357 */ 358 private AnovaStats(int dfbg, int dfwg, double F) { 359 this.dfbg = dfbg; 360 this.dfwg = dfwg; 361 this.F = F; 362 } 363 } 364 365 }