1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * https://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 /* 19 * This is not the original file distributed by the Apache Software Foundation 20 * It has been modified by the Hipparchus project 21 */ 22 package org.hipparchus.stat.inference; 23 24 import java.util.Map; 25 import java.util.TreeMap; 26 import java.util.stream.LongStream; 27 28 import org.hipparchus.distribution.continuous.NormalDistribution; 29 import org.hipparchus.exception.LocalizedCoreFormats; 30 import org.hipparchus.exception.MathIllegalArgumentException; 31 import org.hipparchus.exception.MathIllegalStateException; 32 import org.hipparchus.exception.NullArgumentException; 33 import org.hipparchus.stat.LocalizedStatFormats; 34 import org.hipparchus.stat.ranking.NaNStrategy; 35 import org.hipparchus.stat.ranking.NaturalRanking; 36 import org.hipparchus.stat.ranking.TiesStrategy; 37 import org.hipparchus.util.FastMath; 38 import org.hipparchus.util.Precision; 39 40 /** 41 * An implementation of the Mann-Whitney U test. 42 * <p> 43 * The definitions and computing formulas used in this implementation follow 44 * those in the article, 45 * <a href="http://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U"> Mann-Whitney U 46 * Test</a> 47 * <p> 48 * In general, results correspond to (and have been tested against) the R 49 * wilcox.test function, with {@code exact} meaning the same thing in both APIs 50 * and {@code CORRECT} uniformly true in this implementation. For example, 51 * wilcox.test(x, y, alternative = "two.sided", mu = 0, paired = FALSE, exact = FALSE 52 * correct = TRUE) will return the same p-value as mannWhitneyUTest(x, y, 53 * false). The minimum of the W value returned by R for wilcox.test(x, y...) and 54 * wilcox.test(y, x...) should equal mannWhitneyU(x, y...). 55 */ 56 public class MannWhitneyUTest { // NOPMD - this is not a Junit test class, PMD false positive here 57 58 /** 59 * If the combined dataset contains no more values than this, test defaults to 60 * exact test. 61 */ 62 private static final int SMALL_SAMPLE_SIZE = 50; 63 64 /** Ranking algorithm. */ 65 private final NaturalRanking naturalRanking; 66 67 /** Normal distribution */ 68 private final NormalDistribution standardNormal; 69 70 /** 71 * Create a test instance using where NaN's are left in place and ties get 72 * the average of applicable ranks. 73 */ 74 public MannWhitneyUTest() { 75 naturalRanking = new NaturalRanking(NaNStrategy.FIXED, 76 TiesStrategy.AVERAGE); 77 standardNormal = new NormalDistribution(0, 1); 78 } 79 80 /** 81 * Create a test instance using the given strategies for NaN's and ties. 82 * 83 * @param nanStrategy specifies the strategy that should be used for 84 * Double.NaN's 85 * @param tiesStrategy specifies the strategy that should be used for ties 86 */ 87 public MannWhitneyUTest(final NaNStrategy nanStrategy, 88 final TiesStrategy tiesStrategy) { 89 naturalRanking = new NaturalRanking(nanStrategy, tiesStrategy); 90 standardNormal = new NormalDistribution(0, 1); 91 } 92 93 /** 94 * Computes the 95 * <a href="http://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U"> 96 * Mann-Whitney U statistic</a> comparing means for two independent samples 97 * possibly of different lengths. 98 * <p> 99 * This statistic can be used to perform a Mann-Whitney U test evaluating 100 * the null hypothesis that the two independent samples have equal mean. 101 * <p> 102 * Let X<sub>i</sub> denote the i'th individual of the first sample and 103 * Y<sub>j</sub> the j'th individual in the second sample. Note that the 104 * samples can have different lengths. 105 * <p> 106 * <strong>Preconditions</strong>: 107 * <ul> 108 * <li>All observations in the two samples are independent.</li> 109 * <li>The observations are at least ordinal (continuous are also 110 * ordinal).</li> 111 * </ul> 112 * 113 * @param x the first sample 114 * @param y the second sample 115 * @return Mann-Whitney U statistic (minimum of U<sup>x</sup> and 116 * U<sup>y</sup>) 117 * @throws NullArgumentException if {@code x} or {@code y} are {@code null}. 118 * @throws MathIllegalArgumentException if {@code x} or {@code y} are 119 * zero-length. 120 */ 121 public double mannWhitneyU(final double[] x, final double[] y) 122 throws MathIllegalArgumentException, NullArgumentException { 123 124 ensureDataConformance(x, y); 125 126 final double[] z = concatenateSamples(x, y); 127 final double[] ranks = naturalRanking.rank(z); 128 129 double sumRankX = 0; 130 131 /* 132 * The ranks for x is in the first x.length entries in ranks because x 133 * is in the first x.length entries in z 134 */ 135 for (int i = 0; i < x.length; ++i) { 136 sumRankX += ranks[i]; 137 } 138 139 /* 140 * U1 = R1 - (n1 * (n1 + 1)) / 2 where R1 is sum of ranks for sample 1, 141 * e.g. x, n1 is the number of observations in sample 1. 142 */ 143 final double U1 = sumRankX - ((long) x.length * (x.length + 1)) / 2; 144 145 /* 146 * U1 + U2 = n1 * n2 147 */ 148 final double U2 = (long) x.length * y.length - U1; 149 150 return FastMath.min(U1, U2); 151 } 152 153 /** 154 * Concatenate the samples into one array. 155 * 156 * @param x first sample 157 * @param y second sample 158 * @return concatenated array 159 */ 160 private double[] concatenateSamples(final double[] x, final double[] y) { 161 final double[] z = new double[x.length + y.length]; 162 163 System.arraycopy(x, 0, z, 0, x.length); 164 System.arraycopy(y, 0, z, x.length, y.length); 165 166 return z; 167 } 168 169 /** 170 * Returns the asymptotic <i>observed significance level</i>, or 171 * <a href="http://www.cas.lancs.ac.uk/glossary_v1.1/hyptest.html#pvalue"> 172 * p-value</a>, associated with a <a href= 173 * "http://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U">Mann-Whitney U 174 * Test</a> comparing means for two independent samples. 175 * <p> 176 * Let X<sub>i</sub> denote the i'th individual of the first sample and 177 * Y<sub>j</sub> the j'th individual in the second sample. 178 * <p> 179 * <strong>Preconditions</strong>: 180 * <ul> 181 * <li>All observations in the two samples are independent.</li> 182 * <li>The observations are at least ordinal.</li> 183 * </ul> 184 * <p> 185 * If there are no ties in the data and both samples are small (less than or 186 * equal to 50 values in the combined dataset), an exact test is performed; 187 * otherwise the test uses the normal approximation (with continuity 188 * correction). 189 * <p> 190 * If the combined dataset contains ties, the variance used in the normal 191 * approximation is bias-adjusted using the formula in the reference above. 192 * 193 * @param x the first sample 194 * @param y the second sample 195 * @return approximate 2-sized p-value 196 * @throws NullArgumentException if {@code x} or {@code y} are {@code null}. 197 * @throws MathIllegalArgumentException if {@code x} or {@code y} are 198 * zero-length 199 */ 200 public double mannWhitneyUTest(final double[] x, final double[] y) 201 throws MathIllegalArgumentException, NullArgumentException { 202 ensureDataConformance(x, y); 203 204 // If samples are both small and there are no ties, perform exact test 205 if (x.length + y.length <= SMALL_SAMPLE_SIZE && 206 tiesMap(x, y).isEmpty()) { 207 return mannWhitneyUTest(x, y, true); 208 } else { // Normal approximation 209 return mannWhitneyUTest(x, y, false); 210 } 211 } 212 213 /** 214 * Returns the asymptotic <i>observed significance level</i>, or 215 * <a href="http://www.cas.lancs.ac.uk/glossary_v1.1/hyptest.html#pvalue"> 216 * p-value</a>, associated with a <a href= 217 * "http://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U">Mann-Whitney U 218 * Test</a> comparing means for two independent samples. 219 * <p> 220 * Let X<sub>i</sub> denote the i'th individual of the first sample and 221 * Y<sub>j</sub> the j'th individual in the second sample. 222 * <p> 223 * <strong>Preconditions</strong>: 224 * <ul> 225 * <li>All observations in the two samples are independent.</li> 226 * <li>The observations are at least ordinal.</li> 227 * </ul> 228 * <p> 229 * If {@code exact} is {@code true}, the p-value reported is exact, computed 230 * using the exact distribution of the U statistic. The computation in this 231 * case requires storage on the order of the product of the two sample 232 * sizes, so this should not be used for large samples. 233 * <p> 234 * If {@code exact} is {@code false}, the normal approximation is used to 235 * estimate the p-value. 236 * <p> 237 * If the combined dataset contains ties and {@code exact} is {@code true}, 238 * MathIllegalArgumentException is thrown. If {@code exact} is {@code false} 239 * and the ties are present, the variance used to compute the approximate 240 * p-value in the normal approximation is bias-adjusted using the formula in 241 * the reference above. 242 * 243 * @param x the first sample 244 * @param y the second sample 245 * @param exact true means compute the p-value exactly, false means use the 246 * normal approximation 247 * @return approximate 2-sided p-value 248 * @throws NullArgumentException if {@code x} or {@code y} are {@code null}. 249 * @throws MathIllegalArgumentException if {@code x} or {@code y} are 250 * zero-length or if {@code exact} is {@code true} and ties are 251 * present in the data 252 */ 253 public double mannWhitneyUTest(final double[] x, final double[] y, 254 final boolean exact) 255 throws MathIllegalArgumentException, NullArgumentException { 256 ensureDataConformance(x, y); 257 final Map<Double, Integer> tiesMap = tiesMap(x, y); 258 final double u = mannWhitneyU(x, y); 259 if (exact) { 260 if (!tiesMap.isEmpty()) { 261 throw new MathIllegalArgumentException(LocalizedStatFormats.TIES_ARE_NOT_ALLOWED); 262 } 263 return exactP(x.length, y.length, u); 264 } 265 266 return approximateP(u, x.length, y.length, 267 varU(x.length, y.length, tiesMap)); 268 } 269 270 /** 271 * Ensures that the provided arrays fulfills the assumptions. 272 * 273 * @param x first sample 274 * @param y second sample 275 * @throws NullArgumentException if {@code x} or {@code y} are {@code null}. 276 * @throws MathIllegalArgumentException if {@code x} or {@code y} are 277 * zero-length. 278 */ 279 private void ensureDataConformance(final double[] x, final double[] y) 280 throws MathIllegalArgumentException, NullArgumentException { 281 282 if (x == null || y == null) { 283 throw new NullArgumentException(); 284 } 285 if (x.length == 0 || y.length == 0) { 286 throw new MathIllegalArgumentException(LocalizedCoreFormats.NO_DATA); 287 } 288 } 289 290 /** 291 * Estimates the 2-sided p-value associated with a Mann-Whitney U statistic 292 * value using the normal approximation. 293 * <p> 294 * The variance passed in is assumed to be corrected for ties. Continuity 295 * correction is applied to the normal approximation. 296 * 297 * @param u Mann-Whitney U statistic 298 * @param n1 number of subjects in first sample 299 * @param n2 number of subjects in second sample 300 * @param varU variance of U (corrected for ties if these exist) 301 * @return two-sided asymptotic p-value 302 * @throws MathIllegalStateException if the p-value can not be computed due 303 * to a convergence error 304 * @throws MathIllegalStateException if the maximum number of iterations is 305 * exceeded 306 */ 307 private double approximateP(final double u, final int n1, final int n2, 308 final double varU) 309 throws MathIllegalStateException { 310 311 final double mu = (long) n1 * n2 / 2.0; 312 313 // If u == mu, return 1 314 if (Precision.equals(mu, u)) { 315 return 1; 316 } 317 318 // Force z <= 0 so we get tail probability. Also apply continuity 319 // correction 320 final double z = -Math.abs((u - mu) + 0.5) / FastMath.sqrt(varU); 321 322 return 2 * standardNormal.cumulativeProbability(z); 323 } 324 325 /** 326 * Calculates the (2-sided) p-value associated with a Mann-Whitney U 327 * statistic. 328 * <p> 329 * To compute the p-value, the probability densities for each value of U up 330 * to and including u are summed and the resulting tail probability is 331 * multiplied by 2. 332 * <p> 333 * The result of this computation is only valid when the combined n + m 334 * sample has no tied values. 335 * <p> 336 * This method should not be used for large values of n or m as it maintains 337 * work arrays of size n*m. 338 * 339 * @param u Mann-Whitney U statistic value 340 * @param n first sample size 341 * @param m second sample size 342 * @return two-sided exact p-value 343 */ 344 private double exactP(final int n, final int m, final double u) { 345 final double nm = m * n; 346 if (u > nm) { // Quick exit if u is out of range 347 return 1; 348 } 349 // Need to convert u to a mean deviation, so cumulative probability is 350 // tail probability 351 final double crit = u < nm / 2 ? u : nm / 2 - u; 352 353 double cum = 0d; 354 for (int ct = 0; ct <= crit; ct++) { 355 cum += uDensity(n, m, ct); 356 } 357 return 2 * cum; 358 } 359 360 /** 361 * Computes the probability density function for the Mann-Whitney U 362 * statistic. 363 * <p> 364 * This method should not be used for large values of n or m as it maintains 365 * work arrays of size n*m. 366 * 367 * @param n first sample size 368 * @param m second sample size 369 * @param u U-statistic value 370 * @return the probability that a U statistic derived from random samples of 371 * size n and m (containing no ties) equals u 372 */ 373 private double uDensity(final int n, final int m, double u) { 374 if (u < 0 || u > m * n) { 375 return 0; 376 } 377 final long[] freq = uFrequencies(n, m); 378 return freq[(int) FastMath.round(u + 1)] / 379 (double) LongStream.of(freq).sum(); 380 } 381 382 /** 383 * Computes frequency counts for values of the Mann-Whitney U statistc. If 384 * freq[] is the returned array, freq[u + 1] counts the frequency of U = u 385 * among all possible n-m orderings. Therefore, P(u = U) = freq[u + 1] / sum 386 * where sum is the sum of the values in the returned array. 387 * <p> 388 * Implements the algorithm presented in "Algorithm AS 62: A Generator for 389 * the Sampling Distribution of the Mann-Whitney U Statistic", L. C. Dinneen 390 * and B. C. Blakesley Journal of the Royal Statistical Society. Series C 391 * (Applied Statistics) Vol. 22, No. 2 (1973), pp. 269-273. 392 * 393 * @param n first sample size 394 * @param m second sample size 395 * @return array of U statistic value frequencies 396 */ 397 private long[] uFrequencies(final int n, final int m) { 398 final int max = FastMath.max(m, n); 399 if (max > 100) { 400 throw new MathIllegalArgumentException(LocalizedCoreFormats.NUMBER_TOO_LARGE, 401 max, 100); 402 } 403 final int min = FastMath.min(m, n); 404 final long[] out = new long[n * m + 2]; 405 final long[] work = new long[n * m + 2]; 406 for (int i = 1; i < out.length; i++) { 407 out[i] = (i <= (max + 1)) ? 1 : 0; 408 } 409 work[1] = 0; 410 int in = max; 411 for (int i = 2; i <= min; i++) { 412 work[i] = 0; 413 in = in + max; 414 int n1 = in + 2; 415 long l = 1 + in / 2; 416 int k = i; 417 for (int j = 1; j <= l; j++) { 418 k++; 419 n1 = n1 - 1; 420 final long sum = out[j] + work[j]; 421 out[j] = sum; 422 work[k] = sum - out[n1]; 423 out[n1] = sum; 424 } 425 } 426 return out; 427 } 428 429 /** 430 * Computes the variance for a U-statistic associated with samples of 431 * sizes{@code n} and {@code m} and ties described by {@code tiesMap}. If 432 * {@code tiesMap} is non-empty, the multiplicity counts in its values set 433 * are used to adjust the variance. 434 * 435 * @param n first sample size 436 * @param m second sample size 437 * @param tiesMap map of <value, multiplicity> 438 * @return ties-adjusted variance 439 */ 440 private double varU(final int n, final int m, 441 Map<Double, Integer> tiesMap) { 442 final double nm = (long) n * m; 443 if (tiesMap.isEmpty()) { 444 return nm * (n + m + 1) / 12.0; 445 } 446 final long tSum = tiesMap.entrySet().stream() 447 .mapToLong(e -> e.getValue() * e.getValue() * e.getValue() - 448 e.getValue()) 449 .sum(); 450 final double totalN = n + m; 451 return (nm / 12) * (totalN + 1 - tSum / (totalN * (totalN - 1))); 452 453 } 454 455 /** 456 * Creates a map whose keys are values occurring more than once in the 457 * combined dataset formed from x and y. Map entry values are the number of 458 * occurrences. The returned map is empty iff there are no ties in the data. 459 * 460 * @param x first dataset 461 * @param y second dataset 462 * @return map of <value, number of times it occurs> for values occurring 463 * more than once or an empty map if there are no ties (the returned 464 * map is <em>not</em> thread-safe, which is OK in the context of the callers) 465 */ 466 private Map<Double, Integer> tiesMap(final double[] x, final double[] y) { 467 final Map<Double, Integer> tiesMap = new TreeMap<>(); // NOPMD - no concurrent access in the callers context 468 for (int i = 0; i < x.length; i++) { 469 tiesMap.merge(x[i], 1, Integer::sum); 470 } 471 for (int i = 0; i < y.length; i++) { 472 tiesMap.merge(y[i], 1, Integer::sum); 473 } 474 tiesMap.entrySet().removeIf(e -> e.getValue() == 1); 475 return tiesMap; 476 } 477 }