MannWhitneyUTest.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      https://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */

  17. /*
  18.  * This is not the original file distributed by the Apache Software Foundation
  19.  * It has been modified by the Hipparchus project
  20.  */
  21. package org.hipparchus.stat.inference;

  22. import java.util.Map;
  23. import java.util.TreeMap;
  24. import java.util.stream.LongStream;

  25. import org.hipparchus.distribution.continuous.NormalDistribution;
  26. import org.hipparchus.exception.LocalizedCoreFormats;
  27. import org.hipparchus.exception.MathIllegalArgumentException;
  28. import org.hipparchus.exception.MathIllegalStateException;
  29. import org.hipparchus.exception.NullArgumentException;
  30. import org.hipparchus.stat.LocalizedStatFormats;
  31. import org.hipparchus.stat.ranking.NaNStrategy;
  32. import org.hipparchus.stat.ranking.NaturalRanking;
  33. import org.hipparchus.stat.ranking.TiesStrategy;
  34. import org.hipparchus.util.FastMath;
  35. import org.hipparchus.util.Precision;

  36. /**
  37.  * An implementation of the Mann-Whitney U test.
  38.  * <p>
  39.  * The definitions and computing formulas used in this implementation follow
  40.  * those in the article,
  41.  * <a href="http://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U"> Mann-Whitney U
  42.  * Test</a>
  43.  * <p>
  44.  * In general, results correspond to (and have been tested against) the R
  45.  * wilcox.test function, with {@code exact} meaning the same thing in both APIs
  46.  * and {@code CORRECT} uniformly true in this implementation. For example,
  47.  * wilcox.test(x, y, alternative = "two.sided", mu = 0, paired = FALSE, exact = FALSE
  48.  * correct = TRUE) will return the same p-value as mannWhitneyUTest(x, y,
  49.  * false). The minimum of the W value returned by R for wilcox.test(x, y...) and
  50.  * wilcox.test(y, x...) should equal mannWhitneyU(x, y...).
  51.  */
  52. public class MannWhitneyUTest { // NOPMD - this is not a Junit test class, PMD false positive here

  53.     /**
  54.      * If the combined dataset contains no more values than this, test defaults to
  55.      * exact test.
  56.      */
  57.     private static final int SMALL_SAMPLE_SIZE = 50;

  58.     /** Ranking algorithm. */
  59.     private final NaturalRanking naturalRanking;

  60.     /** Normal distribution */
  61.     private final NormalDistribution standardNormal;

  62.     /**
  63.      * Create a test instance using where NaN's are left in place and ties get
  64.      * the average of applicable ranks.
  65.      */
  66.     public MannWhitneyUTest() {
  67.         naturalRanking = new NaturalRanking(NaNStrategy.FIXED,
  68.                                             TiesStrategy.AVERAGE);
  69.         standardNormal = new NormalDistribution(0, 1);
  70.     }

  71.     /**
  72.      * Create a test instance using the given strategies for NaN's and ties.
  73.      *
  74.      * @param nanStrategy specifies the strategy that should be used for
  75.      *        Double.NaN's
  76.      * @param tiesStrategy specifies the strategy that should be used for ties
  77.      */
  78.     public MannWhitneyUTest(final NaNStrategy nanStrategy,
  79.                             final TiesStrategy tiesStrategy) {
  80.         naturalRanking = new NaturalRanking(nanStrategy, tiesStrategy);
  81.         standardNormal = new NormalDistribution(0, 1);
  82.     }

  83.     /**
  84.      * Computes the
  85.      * <a href="http://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U">
  86.      * Mann-Whitney U statistic</a> comparing means for two independent samples
  87.      * possibly of different lengths.
  88.      * <p>
  89.      * This statistic can be used to perform a Mann-Whitney U test evaluating
  90.      * the null hypothesis that the two independent samples have equal mean.
  91.      * <p>
  92.      * Let X<sub>i</sub> denote the i'th individual of the first sample and
  93.      * Y<sub>j</sub> the j'th individual in the second sample. Note that the
  94.      * samples can have different lengths.
  95.      * <p>
  96.      * <strong>Preconditions</strong>:
  97.      * <ul>
  98.      * <li>All observations in the two samples are independent.</li>
  99.      * <li>The observations are at least ordinal (continuous are also
  100.      * ordinal).</li>
  101.      * </ul>
  102.      *
  103.      * @param x the first sample
  104.      * @param y the second sample
  105.      * @return Mann-Whitney U statistic (minimum of U<sup>x</sup> and
  106.      *         U<sup>y</sup>)
  107.      * @throws NullArgumentException if {@code x} or {@code y} are {@code null}.
  108.      * @throws MathIllegalArgumentException if {@code x} or {@code y} are
  109.      *         zero-length.
  110.      */
  111.     public double mannWhitneyU(final double[] x, final double[] y)
  112.         throws MathIllegalArgumentException, NullArgumentException {

  113.         ensureDataConformance(x, y);

  114.         final double[] z = concatenateSamples(x, y);
  115.         final double[] ranks = naturalRanking.rank(z);

  116.         double sumRankX = 0;

  117.         /*
  118.          * The ranks for x is in the first x.length entries in ranks because x
  119.          * is in the first x.length entries in z
  120.          */
  121.         for (int i = 0; i < x.length; ++i) {
  122.             sumRankX += ranks[i];
  123.         }

  124.         /*
  125.          * U1 = R1 - (n1 * (n1 + 1)) / 2 where R1 is sum of ranks for sample 1,
  126.          * e.g. x, n1 is the number of observations in sample 1.
  127.          */
  128.         final double U1 = sumRankX - ((long) x.length * (x.length + 1)) / 2;

  129.         /*
  130.          * U1 + U2 = n1 * n2
  131.          */
  132.         final double U2 = (long) x.length * y.length - U1;

  133.         return FastMath.min(U1, U2);
  134.     }

  135.     /**
  136.      * Concatenate the samples into one array.
  137.      *
  138.      * @param x first sample
  139.      * @param y second sample
  140.      * @return concatenated array
  141.      */
  142.     private double[] concatenateSamples(final double[] x, final double[] y) {
  143.         final double[] z = new double[x.length + y.length];

  144.         System.arraycopy(x, 0, z, 0, x.length);
  145.         System.arraycopy(y, 0, z, x.length, y.length);

  146.         return z;
  147.     }

  148.     /**
  149.      * Returns the asymptotic <i>observed significance level</i>, or
  150.      * <a href="http://www.cas.lancs.ac.uk/glossary_v1.1/hyptest.html#pvalue">
  151.      * p-value</a>, associated with a <a href=
  152.      * "http://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U">Mann-Whitney U
  153.      * Test</a> comparing means for two independent samples.
  154.      * <p>
  155.      * Let X<sub>i</sub> denote the i'th individual of the first sample and
  156.      * Y<sub>j</sub> the j'th individual in the second sample.
  157.      * <p>
  158.      * <strong>Preconditions</strong>:
  159.      * <ul>
  160.      * <li>All observations in the two samples are independent.</li>
  161.      * <li>The observations are at least ordinal.</li>
  162.      * </ul>
  163.      * <p>
  164.      * If there are no ties in the data and both samples are small (less than or
  165.      * equal to 50 values in the combined dataset), an exact test is performed;
  166.      * otherwise the test uses the normal approximation (with continuity
  167.      * correction).
  168.      * <p>
  169.      * If the combined dataset contains ties, the variance used in the normal
  170.      * approximation is bias-adjusted using the formula in the reference above.
  171.      *
  172.      * @param x the first sample
  173.      * @param y the second sample
  174.      * @return approximate 2-sized p-value
  175.      * @throws NullArgumentException if {@code x} or {@code y} are {@code null}.
  176.      * @throws MathIllegalArgumentException if {@code x} or {@code y} are
  177.      *         zero-length
  178.      */
  179.     public double mannWhitneyUTest(final double[] x, final double[] y)
  180.         throws MathIllegalArgumentException, NullArgumentException {
  181.         ensureDataConformance(x, y);

  182.         // If samples are both small and there are no ties, perform exact test
  183.         if (x.length + y.length <= SMALL_SAMPLE_SIZE &&
  184.             tiesMap(x, y).isEmpty()) {
  185.             return mannWhitneyUTest(x, y, true);
  186.         } else { // Normal approximation
  187.             return mannWhitneyUTest(x, y, false);
  188.         }
  189.     }

  190.     /**
  191.      * Returns the asymptotic <i>observed significance level</i>, or
  192.      * <a href="http://www.cas.lancs.ac.uk/glossary_v1.1/hyptest.html#pvalue">
  193.      * p-value</a>, associated with a <a href=
  194.      * "http://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U">Mann-Whitney U
  195.      * Test</a> comparing means for two independent samples.
  196.      * <p>
  197.      * Let X<sub>i</sub> denote the i'th individual of the first sample and
  198.      * Y<sub>j</sub> the j'th individual in the second sample.
  199.      * <p>
  200.      * <strong>Preconditions</strong>:
  201.      * <ul>
  202.      * <li>All observations in the two samples are independent.</li>
  203.      * <li>The observations are at least ordinal.</li>
  204.      * </ul>
  205.      * <p>
  206.      * If {@code exact} is {@code true}, the p-value reported is exact, computed
  207.      * using the exact distribution of the U statistic. The computation in this
  208.      * case requires storage on the order of the product of the two sample
  209.      * sizes, so this should not be used for large samples.
  210.      * <p>
  211.      * If {@code exact} is {@code false}, the normal approximation is used to
  212.      * estimate the p-value.
  213.      * <p>
  214.      * If the combined dataset contains ties and {@code exact} is {@code true},
  215.      * MathIllegalArgumentException is thrown. If {@code exact} is {@code false}
  216.      * and the ties are present, the variance used to compute the approximate
  217.      * p-value in the normal approximation is bias-adjusted using the formula in
  218.      * the reference above.
  219.      *
  220.      * @param x the first sample
  221.      * @param y the second sample
  222.      * @param exact true means compute the p-value exactly, false means use the
  223.      *        normal approximation
  224.      * @return approximate 2-sided p-value
  225.      * @throws NullArgumentException if {@code x} or {@code y} are {@code null}.
  226.      * @throws MathIllegalArgumentException if {@code x} or {@code y} are
  227.      *         zero-length or if {@code exact} is {@code true} and ties are
  228.      *         present in the data
  229.      */
  230.     public double mannWhitneyUTest(final double[] x, final double[] y,
  231.                                    final boolean exact)
  232.         throws MathIllegalArgumentException, NullArgumentException {
  233.         ensureDataConformance(x, y);
  234.         final Map<Double, Integer> tiesMap = tiesMap(x, y);
  235.         final double u = mannWhitneyU(x, y);
  236.         if (exact) {
  237.             if (!tiesMap.isEmpty()) {
  238.                 throw new MathIllegalArgumentException(LocalizedStatFormats.TIES_ARE_NOT_ALLOWED);
  239.             }
  240.             return exactP(x.length, y.length, u);
  241.         }

  242.         return approximateP(u, x.length, y.length,
  243.                             varU(x.length, y.length, tiesMap));
  244.     }

  245.     /**
  246.      * Ensures that the provided arrays fulfills the assumptions.
  247.      *
  248.      * @param x first sample
  249.      * @param y second sample
  250.      * @throws NullArgumentException if {@code x} or {@code y} are {@code null}.
  251.      * @throws MathIllegalArgumentException if {@code x} or {@code y} are
  252.      *         zero-length.
  253.      */
  254.     private void ensureDataConformance(final double[] x, final double[] y)
  255.         throws MathIllegalArgumentException, NullArgumentException {

  256.         if (x == null || y == null) {
  257.             throw new NullArgumentException();
  258.         }
  259.         if (x.length == 0 || y.length == 0) {
  260.             throw new MathIllegalArgumentException(LocalizedCoreFormats.NO_DATA);
  261.         }
  262.     }

  263.     /**
  264.      * Estimates the 2-sided p-value associated with a Mann-Whitney U statistic
  265.      * value using the normal approximation.
  266.      * <p>
  267.      * The variance passed in is assumed to be corrected for ties. Continuity
  268.      * correction is applied to the normal approximation.
  269.      *
  270.      * @param u Mann-Whitney U statistic
  271.      * @param n1 number of subjects in first sample
  272.      * @param n2 number of subjects in second sample
  273.      * @param varU variance of U (corrected for ties if these exist)
  274.      * @return two-sided asymptotic p-value
  275.      * @throws MathIllegalStateException if the p-value can not be computed due
  276.      *         to a convergence error
  277.      * @throws MathIllegalStateException if the maximum number of iterations is
  278.      *         exceeded
  279.      */
  280.     private double approximateP(final double u, final int n1, final int n2,
  281.                                 final double varU)
  282.         throws MathIllegalStateException {

  283.         final double mu = (long) n1 * n2 / 2.0;

  284.         // If u == mu, return 1
  285.         if (Precision.equals(mu, u)) {
  286.             return 1;
  287.         }

  288.         // Force z <= 0 so we get tail probability. Also apply continuity
  289.         // correction
  290.         final double z = -Math.abs((u - mu) + 0.5) / FastMath.sqrt(varU);

  291.         return 2 * standardNormal.cumulativeProbability(z);
  292.     }

  293.     /**
  294.      * Calculates the (2-sided) p-value associated with a Mann-Whitney U
  295.      * statistic.
  296.      * <p>
  297.      * To compute the p-value, the probability densities for each value of U up
  298.      * to and including u are summed and the resulting tail probability is
  299.      * multiplied by 2.
  300.      * <p>
  301.      * The result of this computation is only valid when the combined n + m
  302.      * sample has no tied values.
  303.      * <p>
  304.      * This method should not be used for large values of n or m as it maintains
  305.      * work arrays of size n*m.
  306.      *
  307.      * @param u Mann-Whitney U statistic value
  308.      * @param n first sample size
  309.      * @param m second sample size
  310.      * @return two-sided exact p-value
  311.      */
  312.     private double exactP(final int n, final int m, final double u) {
  313.         final double nm = m * n;
  314.         if (u > nm) { // Quick exit if u is out of range
  315.             return 1;
  316.         }
  317.         // Need to convert u to a mean deviation, so cumulative probability is
  318.         // tail probability
  319.         final double crit = u < nm / 2 ? u : nm / 2 - u;

  320.         double cum = 0d;
  321.         for (int ct = 0; ct <= crit; ct++) {
  322.             cum += uDensity(n, m, ct);
  323.         }
  324.         return 2 * cum;
  325.     }

  326.     /**
  327.      * Computes the probability density function for the Mann-Whitney U
  328.      * statistic.
  329.      * <p>
  330.      * This method should not be used for large values of n or m as it maintains
  331.      * work arrays of size n*m.
  332.      *
  333.      * @param n first sample size
  334.      * @param m second sample size
  335.      * @param u U-statistic value
  336.      * @return the probability that a U statistic derived from random samples of
  337.      *         size n and m (containing no ties) equals u
  338.      */
  339.     private double uDensity(final int n, final int m, double u) {
  340.         if (u < 0 || u > m * n) {
  341.             return 0;
  342.         }
  343.         final long[] freq = uFrequencies(n, m);
  344.         return freq[(int) FastMath.round(u + 1)] /
  345.                (double) LongStream.of(freq).sum();
  346.     }

  347.     /**
  348.      * Computes frequency counts for values of the Mann-Whitney U statistc. If
  349.      * freq[] is the returned array, freq[u + 1] counts the frequency of U = u
  350.      * among all possible n-m orderings. Therefore, P(u = U) = freq[u + 1] / sum
  351.      * where sum is the sum of the values in the returned array.
  352.      * <p>
  353.      * Implements the algorithm presented in "Algorithm AS 62: A Generator for
  354.      * the Sampling Distribution of the Mann-Whitney U Statistic", L. C. Dinneen
  355.      * and B. C. Blakesley Journal of the Royal Statistical Society. Series C
  356.      * (Applied Statistics) Vol. 22, No. 2 (1973), pp. 269-273.
  357.      *
  358.      * @param n first sample size
  359.      * @param m second sample size
  360.      * @return array of U statistic value frequencies
  361.      */
  362.     private long[] uFrequencies(final int n, final int m) {
  363.         final int max = FastMath.max(m, n);
  364.         if (max > 100) {
  365.             throw new MathIllegalArgumentException(LocalizedCoreFormats.NUMBER_TOO_LARGE,
  366.                                                    max, 100);
  367.         }
  368.         final int min = FastMath.min(m, n);
  369.         final long[] out = new long[n * m + 2];
  370.         final long[] work = new long[n * m + 2];
  371.         for (int i = 1; i < out.length; i++) {
  372.             out[i] = (i <= (max + 1)) ? 1 : 0;
  373.         }
  374.         work[1] = 0;
  375.         int in = max;
  376.         for (int i = 2; i <= min; i++) {
  377.             work[i] = 0;
  378.             in = in + max;
  379.             int n1 = in + 2;
  380.             long l = 1 + in / 2;
  381.             int k = i;
  382.             for (int j = 1; j <= l; j++) {
  383.                 k++;
  384.                 n1 = n1 - 1;
  385.                 final long sum = out[j] + work[j];
  386.                 out[j] = sum;
  387.                 work[k] = sum - out[n1];
  388.                 out[n1] = sum;
  389.             }
  390.         }
  391.         return out;
  392.     }

  393.     /**
  394.      * Computes the variance for a U-statistic associated with samples of
  395.      * sizes{@code n} and {@code m} and ties described by {@code tiesMap}. If
  396.      * {@code tiesMap} is non-empty, the multiplicity counts in its values set
  397.      * are used to adjust the variance.
  398.      *
  399.      * @param n first sample size
  400.      * @param m second sample size
  401.      * @param tiesMap map of <value, multiplicity>
  402.      * @return ties-adjusted variance
  403.      */
  404.     private double varU(final int n, final int m,
  405.                         Map<Double, Integer> tiesMap) {
  406.         final double nm = (long) n * m;
  407.         if (tiesMap.isEmpty()) {
  408.             return nm * (n + m + 1) / 12.0;
  409.         }
  410.         final long tSum = tiesMap.entrySet().stream()
  411.             .mapToLong(e -> e.getValue() * e.getValue() * e.getValue() -
  412.                             e.getValue())
  413.             .sum();
  414.         final double totalN = n + m;
  415.         return (nm / 12) * (totalN + 1 - tSum / (totalN * (totalN - 1)));

  416.     }

  417.     /**
  418.      * Creates a map whose keys are values occurring more than once in the
  419.      * combined dataset formed from x and y. Map entry values are the number of
  420.      * occurrences. The returned map is empty iff there are no ties in the data.
  421.      *
  422.      * @param x first dataset
  423.      * @param y second dataset
  424.      * @return map of <value, number of times it occurs> for values occurring
  425.      *         more than once or an empty map if there are no ties (the returned
  426.      *         map is <em>not</em> thread-safe, which is OK in the context of the callers)
  427.      */
  428.     private Map<Double, Integer> tiesMap(final double[] x, final double[] y) {
  429.         final Map<Double, Integer> tiesMap = new TreeMap<>(); // NOPMD - no concurrent access in the callers context
  430.         for (int i = 0; i < x.length; i++) {
  431.             tiesMap.merge(x[i], 1, Integer::sum);
  432.         }
  433.         for (int i = 0; i < y.length; i++) {
  434.             tiesMap.merge(y[i], 1, Integer::sum);
  435.         }
  436.         tiesMap.entrySet().removeIf(e -> e.getValue() == 1);
  437.         return tiesMap;
  438.     }
  439. }