1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 /*
19 * This is not the original file distributed by the Apache Software Foundation
20 * It has been modified by the Hipparchus project
21 */
22 package org.hipparchus.stat.inference;
23
24 import java.util.Map;
25 import java.util.TreeMap;
26 import java.util.stream.LongStream;
27
28 import org.hipparchus.distribution.continuous.NormalDistribution;
29 import org.hipparchus.exception.LocalizedCoreFormats;
30 import org.hipparchus.exception.MathIllegalArgumentException;
31 import org.hipparchus.exception.MathIllegalStateException;
32 import org.hipparchus.exception.NullArgumentException;
33 import org.hipparchus.stat.LocalizedStatFormats;
34 import org.hipparchus.stat.ranking.NaNStrategy;
35 import org.hipparchus.stat.ranking.NaturalRanking;
36 import org.hipparchus.stat.ranking.TiesStrategy;
37 import org.hipparchus.util.FastMath;
38 import org.hipparchus.util.Precision;
39
40 /**
41 * An implementation of the Mann-Whitney U test.
42 * <p>
43 * The definitions and computing formulas used in this implementation follow
44 * those in the article,
45 * <a href="http://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U"> Mann-Whitney U
46 * Test</a>
47 * <p>
48 * In general, results correspond to (and have been tested against) the R
49 * wilcox.test function, with {@code exact} meaning the same thing in both APIs
50 * and {@code CORRECT} uniformly true in this implementation. For example,
51 * wilcox.test(x, y, alternative = "two.sided", mu = 0, paired = FALSE, exact = FALSE
52 * correct = TRUE) will return the same p-value as mannWhitneyUTest(x, y,
53 * false). The minimum of the W value returned by R for wilcox.test(x, y...) and
54 * wilcox.test(y, x...) should equal mannWhitneyU(x, y...).
55 */
56 public class MannWhitneyUTest { // NOPMD - this is not a Junit test class, PMD false positive here
57
58 /**
59 * If the combined dataset contains no more values than this, test defaults to
60 * exact test.
61 */
62 private static final int SMALL_SAMPLE_SIZE = 50;
63
64 /** Ranking algorithm. */
65 private final NaturalRanking naturalRanking;
66
67 /** Normal distribution */
68 private final NormalDistribution standardNormal;
69
70 /**
71 * Create a test instance using where NaN's are left in place and ties get
72 * the average of applicable ranks.
73 */
74 public MannWhitneyUTest() {
75 naturalRanking = new NaturalRanking(NaNStrategy.FIXED,
76 TiesStrategy.AVERAGE);
77 standardNormal = new NormalDistribution(0, 1);
78 }
79
80 /**
81 * Create a test instance using the given strategies for NaN's and ties.
82 *
83 * @param nanStrategy specifies the strategy that should be used for
84 * Double.NaN's
85 * @param tiesStrategy specifies the strategy that should be used for ties
86 */
87 public MannWhitneyUTest(final NaNStrategy nanStrategy,
88 final TiesStrategy tiesStrategy) {
89 naturalRanking = new NaturalRanking(nanStrategy, tiesStrategy);
90 standardNormal = new NormalDistribution(0, 1);
91 }
92
93 /**
94 * Computes the
95 * <a href="http://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U">
96 * Mann-Whitney U statistic</a> comparing means for two independent samples
97 * possibly of different lengths.
98 * <p>
99 * This statistic can be used to perform a Mann-Whitney U test evaluating
100 * the null hypothesis that the two independent samples have equal mean.
101 * <p>
102 * Let X<sub>i</sub> denote the i'th individual of the first sample and
103 * Y<sub>j</sub> the j'th individual in the second sample. Note that the
104 * samples can have different lengths.
105 * <p>
106 * <strong>Preconditions</strong>:
107 * <ul>
108 * <li>All observations in the two samples are independent.</li>
109 * <li>The observations are at least ordinal (continuous are also
110 * ordinal).</li>
111 * </ul>
112 *
113 * @param x the first sample
114 * @param y the second sample
115 * @return Mann-Whitney U statistic (minimum of U<sup>x</sup> and
116 * U<sup>y</sup>)
117 * @throws NullArgumentException if {@code x} or {@code y} are {@code null}.
118 * @throws MathIllegalArgumentException if {@code x} or {@code y} are
119 * zero-length.
120 */
121 public double mannWhitneyU(final double[] x, final double[] y)
122 throws MathIllegalArgumentException, NullArgumentException {
123
124 ensureDataConformance(x, y);
125
126 final double[] z = concatenateSamples(x, y);
127 final double[] ranks = naturalRanking.rank(z);
128
129 double sumRankX = 0;
130
131 /*
132 * The ranks for x is in the first x.length entries in ranks because x
133 * is in the first x.length entries in z
134 */
135 for (int i = 0; i < x.length; ++i) {
136 sumRankX += ranks[i];
137 }
138
139 /*
140 * U1 = R1 - (n1 * (n1 + 1)) / 2 where R1 is sum of ranks for sample 1,
141 * e.g. x, n1 is the number of observations in sample 1.
142 */
143 final double U1 = sumRankX - ((long) x.length * (x.length + 1)) / 2;
144
145 /*
146 * U1 + U2 = n1 * n2
147 */
148 final double U2 = (long) x.length * y.length - U1;
149
150 return FastMath.min(U1, U2);
151 }
152
153 /**
154 * Concatenate the samples into one array.
155 *
156 * @param x first sample
157 * @param y second sample
158 * @return concatenated array
159 */
160 private double[] concatenateSamples(final double[] x, final double[] y) {
161 final double[] z = new double[x.length + y.length];
162
163 System.arraycopy(x, 0, z, 0, x.length);
164 System.arraycopy(y, 0, z, x.length, y.length);
165
166 return z;
167 }
168
169 /**
170 * Returns the asymptotic <i>observed significance level</i>, or
171 * <a href="http://www.cas.lancs.ac.uk/glossary_v1.1/hyptest.html#pvalue">
172 * p-value</a>, associated with a <a href=
173 * "http://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U">Mann-Whitney U
174 * Test</a> comparing means for two independent samples.
175 * <p>
176 * Let X<sub>i</sub> denote the i'th individual of the first sample and
177 * Y<sub>j</sub> the j'th individual in the second sample.
178 * <p>
179 * <strong>Preconditions</strong>:
180 * <ul>
181 * <li>All observations in the two samples are independent.</li>
182 * <li>The observations are at least ordinal.</li>
183 * </ul>
184 * <p>
185 * If there are no ties in the data and both samples are small (less than or
186 * equal to 50 values in the combined dataset), an exact test is performed;
187 * otherwise the test uses the normal approximation (with continuity
188 * correction).
189 * <p>
190 * If the combined dataset contains ties, the variance used in the normal
191 * approximation is bias-adjusted using the formula in the reference above.
192 *
193 * @param x the first sample
194 * @param y the second sample
195 * @return approximate 2-sized p-value
196 * @throws NullArgumentException if {@code x} or {@code y} are {@code null}.
197 * @throws MathIllegalArgumentException if {@code x} or {@code y} are
198 * zero-length
199 */
200 public double mannWhitneyUTest(final double[] x, final double[] y)
201 throws MathIllegalArgumentException, NullArgumentException {
202 ensureDataConformance(x, y);
203
204 // If samples are both small and there are no ties, perform exact test
205 if (x.length + y.length <= SMALL_SAMPLE_SIZE &&
206 tiesMap(x, y).isEmpty()) {
207 return mannWhitneyUTest(x, y, true);
208 } else { // Normal approximation
209 return mannWhitneyUTest(x, y, false);
210 }
211 }
212
213 /**
214 * Returns the asymptotic <i>observed significance level</i>, or
215 * <a href="http://www.cas.lancs.ac.uk/glossary_v1.1/hyptest.html#pvalue">
216 * p-value</a>, associated with a <a href=
217 * "http://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U">Mann-Whitney U
218 * Test</a> comparing means for two independent samples.
219 * <p>
220 * Let X<sub>i</sub> denote the i'th individual of the first sample and
221 * Y<sub>j</sub> the j'th individual in the second sample.
222 * <p>
223 * <strong>Preconditions</strong>:
224 * <ul>
225 * <li>All observations in the two samples are independent.</li>
226 * <li>The observations are at least ordinal.</li>
227 * </ul>
228 * <p>
229 * If {@code exact} is {@code true}, the p-value reported is exact, computed
230 * using the exact distribution of the U statistic. The computation in this
231 * case requires storage on the order of the product of the two sample
232 * sizes, so this should not be used for large samples.
233 * <p>
234 * If {@code exact} is {@code false}, the normal approximation is used to
235 * estimate the p-value.
236 * <p>
237 * If the combined dataset contains ties and {@code exact} is {@code true},
238 * MathIllegalArgumentException is thrown. If {@code exact} is {@code false}
239 * and the ties are present, the variance used to compute the approximate
240 * p-value in the normal approximation is bias-adjusted using the formula in
241 * the reference above.
242 *
243 * @param x the first sample
244 * @param y the second sample
245 * @param exact true means compute the p-value exactly, false means use the
246 * normal approximation
247 * @return approximate 2-sided p-value
248 * @throws NullArgumentException if {@code x} or {@code y} are {@code null}.
249 * @throws MathIllegalArgumentException if {@code x} or {@code y} are
250 * zero-length or if {@code exact} is {@code true} and ties are
251 * present in the data
252 */
253 public double mannWhitneyUTest(final double[] x, final double[] y,
254 final boolean exact)
255 throws MathIllegalArgumentException, NullArgumentException {
256 ensureDataConformance(x, y);
257 final Map<Double, Integer> tiesMap = tiesMap(x, y);
258 final double u = mannWhitneyU(x, y);
259 if (exact) {
260 if (!tiesMap.isEmpty()) {
261 throw new MathIllegalArgumentException(LocalizedStatFormats.TIES_ARE_NOT_ALLOWED);
262 }
263 return exactP(x.length, y.length, u);
264 }
265
266 return approximateP(u, x.length, y.length,
267 varU(x.length, y.length, tiesMap));
268 }
269
270 /**
271 * Ensures that the provided arrays fulfills the assumptions.
272 *
273 * @param x first sample
274 * @param y second sample
275 * @throws NullArgumentException if {@code x} or {@code y} are {@code null}.
276 * @throws MathIllegalArgumentException if {@code x} or {@code y} are
277 * zero-length.
278 */
279 private void ensureDataConformance(final double[] x, final double[] y)
280 throws MathIllegalArgumentException, NullArgumentException {
281
282 if (x == null || y == null) {
283 throw new NullArgumentException();
284 }
285 if (x.length == 0 || y.length == 0) {
286 throw new MathIllegalArgumentException(LocalizedCoreFormats.NO_DATA);
287 }
288 }
289
290 /**
291 * Estimates the 2-sided p-value associated with a Mann-Whitney U statistic
292 * value using the normal approximation.
293 * <p>
294 * The variance passed in is assumed to be corrected for ties. Continuity
295 * correction is applied to the normal approximation.
296 *
297 * @param u Mann-Whitney U statistic
298 * @param n1 number of subjects in first sample
299 * @param n2 number of subjects in second sample
300 * @param varU variance of U (corrected for ties if these exist)
301 * @return two-sided asymptotic p-value
302 * @throws MathIllegalStateException if the p-value can not be computed due
303 * to a convergence error
304 * @throws MathIllegalStateException if the maximum number of iterations is
305 * exceeded
306 */
307 private double approximateP(final double u, final int n1, final int n2,
308 final double varU)
309 throws MathIllegalStateException {
310
311 final double mu = (long) n1 * n2 / 2.0;
312
313 // If u == mu, return 1
314 if (Precision.equals(mu, u)) {
315 return 1;
316 }
317
318 // Force z <= 0 so we get tail probability. Also apply continuity
319 // correction
320 final double z = -Math.abs((u - mu) + 0.5) / FastMath.sqrt(varU);
321
322 return 2 * standardNormal.cumulativeProbability(z);
323 }
324
325 /**
326 * Calculates the (2-sided) p-value associated with a Mann-Whitney U
327 * statistic.
328 * <p>
329 * To compute the p-value, the probability densities for each value of U up
330 * to and including u are summed and the resulting tail probability is
331 * multiplied by 2.
332 * <p>
333 * The result of this computation is only valid when the combined n + m
334 * sample has no tied values.
335 * <p>
336 * This method should not be used for large values of n or m as it maintains
337 * work arrays of size n*m.
338 *
339 * @param u Mann-Whitney U statistic value
340 * @param n first sample size
341 * @param m second sample size
342 * @return two-sided exact p-value
343 */
344 private double exactP(final int n, final int m, final double u) {
345 final double nm = m * n;
346 if (u > nm) { // Quick exit if u is out of range
347 return 1;
348 }
349 // Need to convert u to a mean deviation, so cumulative probability is
350 // tail probability
351 final double crit = u < nm / 2 ? u : nm / 2 - u;
352
353 double cum = 0d;
354 for (int ct = 0; ct <= crit; ct++) {
355 cum += uDensity(n, m, ct);
356 }
357 return 2 * cum;
358 }
359
360 /**
361 * Computes the probability density function for the Mann-Whitney U
362 * statistic.
363 * <p>
364 * This method should not be used for large values of n or m as it maintains
365 * work arrays of size n*m.
366 *
367 * @param n first sample size
368 * @param m second sample size
369 * @param u U-statistic value
370 * @return the probability that a U statistic derived from random samples of
371 * size n and m (containing no ties) equals u
372 */
373 private double uDensity(final int n, final int m, double u) {
374 if (u < 0 || u > m * n) {
375 return 0;
376 }
377 final long[] freq = uFrequencies(n, m);
378 return freq[(int) FastMath.round(u + 1)] /
379 (double) LongStream.of(freq).sum();
380 }
381
382 /**
383 * Computes frequency counts for values of the Mann-Whitney U statistc. If
384 * freq[] is the returned array, freq[u + 1] counts the frequency of U = u
385 * among all possible n-m orderings. Therefore, P(u = U) = freq[u + 1] / sum
386 * where sum is the sum of the values in the returned array.
387 * <p>
388 * Implements the algorithm presented in "Algorithm AS 62: A Generator for
389 * the Sampling Distribution of the Mann-Whitney U Statistic", L. C. Dinneen
390 * and B. C. Blakesley Journal of the Royal Statistical Society. Series C
391 * (Applied Statistics) Vol. 22, No. 2 (1973), pp. 269-273.
392 *
393 * @param n first sample size
394 * @param m second sample size
395 * @return array of U statistic value frequencies
396 */
397 private long[] uFrequencies(final int n, final int m) {
398 final int max = FastMath.max(m, n);
399 if (max > 100) {
400 throw new MathIllegalArgumentException(LocalizedCoreFormats.NUMBER_TOO_LARGE,
401 max, 100);
402 }
403 final int min = FastMath.min(m, n);
404 final long[] out = new long[n * m + 2];
405 final long[] work = new long[n * m + 2];
406 for (int i = 1; i < out.length; i++) {
407 out[i] = (i <= (max + 1)) ? 1 : 0;
408 }
409 work[1] = 0;
410 int in = max;
411 for (int i = 2; i <= min; i++) {
412 work[i] = 0;
413 in = in + max;
414 int n1 = in + 2;
415 long l = 1 + in / 2;
416 int k = i;
417 for (int j = 1; j <= l; j++) {
418 k++;
419 n1 = n1 - 1;
420 final long sum = out[j] + work[j];
421 out[j] = sum;
422 work[k] = sum - out[n1];
423 out[n1] = sum;
424 }
425 }
426 return out;
427 }
428
429 /**
430 * Computes the variance for a U-statistic associated with samples of
431 * sizes{@code n} and {@code m} and ties described by {@code tiesMap}. If
432 * {@code tiesMap} is non-empty, the multiplicity counts in its values set
433 * are used to adjust the variance.
434 *
435 * @param n first sample size
436 * @param m second sample size
437 * @param tiesMap map of <value, multiplicity>
438 * @return ties-adjusted variance
439 */
440 private double varU(final int n, final int m,
441 Map<Double, Integer> tiesMap) {
442 final double nm = (long) n * m;
443 if (tiesMap.isEmpty()) {
444 return nm * (n + m + 1) / 12.0;
445 }
446 final long tSum = tiesMap.entrySet().stream()
447 .mapToLong(e -> e.getValue() * e.getValue() * e.getValue() -
448 e.getValue())
449 .sum();
450 final double totalN = n + m;
451 return (nm / 12) * (totalN + 1 - tSum / (totalN * (totalN - 1)));
452
453 }
454
455 /**
456 * Creates a map whose keys are values occurring more than once in the
457 * combined dataset formed from x and y. Map entry values are the number of
458 * occurrences. The returned map is empty iff there are no ties in the data.
459 *
460 * @param x first dataset
461 * @param y second dataset
462 * @return map of <value, number of times it occurs> for values occurring
463 * more than once or an empty map if there are no ties (the returned
464 * map is <em>not</em> thread-safe, which is OK in the context of the callers)
465 */
466 private Map<Double, Integer> tiesMap(final double[] x, final double[] y) {
467 final Map<Double, Integer> tiesMap = new TreeMap<>(); // NOPMD - no concurrent access in the callers context
468 for (double value : x) {
469 tiesMap.merge(value, 1, Integer::sum);
470 }
471 for (double v : y) {
472 tiesMap.merge(v, 1, Integer::sum);
473 }
474 tiesMap.entrySet().removeIf(e -> e.getValue() == 1);
475 return tiesMap;
476 }
477 }