1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  /*
19   * This is not the original file distributed by the Apache Software Foundation
20   * It has been modified by the Hipparchus project
21   */
22  
23  package org.hipparchus.stat.inference;
24  
25  import java.math.RoundingMode;
26  import java.util.Arrays;
27  import java.util.HashSet;
28  
29  import org.hipparchus.distribution.RealDistribution;
30  import org.hipparchus.exception.LocalizedCoreFormats;
31  import org.hipparchus.exception.MathIllegalArgumentException;
32  import org.hipparchus.exception.MathIllegalStateException;
33  import org.hipparchus.exception.MathRuntimeException;
34  import org.hipparchus.fraction.BigFraction;
35  import org.hipparchus.fraction.BigFractionField;
36  import org.hipparchus.linear.Array2DRowFieldMatrix;
37  import org.hipparchus.linear.FieldMatrix;
38  import org.hipparchus.linear.MatrixUtils;
39  import org.hipparchus.linear.RealMatrix;
40  import org.hipparchus.random.RandomDataGenerator;
41  import org.hipparchus.random.RandomGenerator;
42  import org.hipparchus.stat.LocalizedStatFormats;
43  import org.hipparchus.util.FastMath;
44  import org.hipparchus.util.MathArrays;
45  import org.hipparchus.util.MathUtils;
46  
47  /**
48   * Implementation of the <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">
49   * Kolmogorov-Smirnov (K-S) test</a> for equality of continuous distributions.
50   * <p>
51   * The K-S test uses a statistic based on the maximum deviation of the empirical distribution of
52   * sample data points from the distribution expected under the null hypothesis. For one-sample tests
53   * evaluating the null hypothesis that a set of sample data points follow a given distribution, the
54   * test statistic is \(D_n=\sup_x |F_n(x)-F(x)|\), where \(F\) is the expected distribution and
55   * \(F_n\) is the empirical distribution of the \(n\) sample data points. The distribution of
56   * \(D_n\) is estimated using a method based on [1] with certain quick decisions for extreme values
57   * given in [2].
58   * <p>
59   * Two-sample tests are also supported, evaluating the null hypothesis that the two samples
60   * {@code x} and {@code y} come from the same underlying distribution. In this case, the test
61   * statistic is \(D_{n,m}=\sup_t | F_n(t)-F_m(t)|\) where \(n\) is the length of {@code x}, \(m\) is
62   * the length of {@code y}, \(F_n\) is the empirical distribution that puts mass \(1/n\) at each of
63   * the values in {@code x} and \(F_m\) is the empirical distribution of the {@code y} values. The
64   * default 2-sample test method, {@link #kolmogorovSmirnovTest(double[], double[])} works as
65   * follows:
66   * <ul>
67   * <li>For small samples (where the product of the sample sizes is less than
68   * {@link #LARGE_SAMPLE_PRODUCT}), the method presented in [4] is used to compute the
69   * exact p-value for the 2-sample test.</li>
70   * <li>When the product of the sample sizes exceeds {@link #LARGE_SAMPLE_PRODUCT}, the asymptotic
71   * distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)} for details on
72   * the approximation.</li>
73   * </ul>
74   * <p>
75   * If the product of the sample sizes is less than {@link #LARGE_SAMPLE_PRODUCT} and the sample
76   * data contains ties, random jitter is added to the sample data to break ties before applying
77   * the algorithm above. Alternatively, the {@link #bootstrap(double[], double[], int, boolean)}
78   * method, modeled after <a href="http://sekhon.berkeley.edu/matching/ks.boot.html">ks.boot</a>
79   * in the R Matching package [3], can be used if ties are known to be present in the data.
80   * <p>
81   * In the two-sample case, \(D_{n,m}\) has a discrete distribution. This makes the p-value
82   * associated with the null hypothesis \(H_0 : D_{n,m} \ge d \) differ from \(H_0 : D_{n,m} &gt; d \)
83   * by the mass of the observed value \(d\). To distinguish these, the two-sample tests use a boolean
84   * {@code strict} parameter. This parameter is ignored for large samples.
85   * <p>
86   * The methods used by the 2-sample default implementation are also exposed directly:
87   * <ul>
88   * <li>{@link #exactP(double, int, int, boolean)} computes exact 2-sample p-values</li>
89   * <li>{@link #approximateP(double, int, int)} uses the asymptotic distribution. The {@code boolean}
90   * argument of {@link #exactP(double, int, int, boolean)} allows the probability used to estimate the
91   * p-value to be expressed using strict or non-strict inequality. See
92   * {@link #kolmogorovSmirnovTest(double[], double[], boolean)}.</li>
93   * </ul>
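     * <p>
     * A minimal usage sketch (the arrays {@code data}, {@code x} and {@code y} stand for hypothetical
     * samples supplied by the caller, and the normal distribution is only an example reference
     * distribution):
     * <pre>{@code
     * KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
     * // one-sample test: does data follow a standard normal distribution?
     * double pOneSample = test.kolmogorovSmirnovTest(
     *     new org.hipparchus.distribution.continuous.NormalDistribution(0, 1), data);
     * // two-sample test: do x and y come from the same underlying distribution?
     * double pTwoSample = test.kolmogorovSmirnovTest(x, y);
     * }</pre>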
94   * <p>
95   * References:
96   * <ul>
97   * <li>[1] <a href="http://www.jstatsoft.org/v08/i18/"> Evaluating Kolmogorov's Distribution</a> by
98   * George Marsaglia, Wai Wan Tsang, and Jingbo Wang</li>
99   * <li>[2] <a href="http://www.jstatsoft.org/v39/i11/"> Computing the Two-Sided Kolmogorov-Smirnov
100  * Distribution</a> by Richard Simard and Pierre L'Ecuyer</li>
101  * <li>[3] Jasjeet S. Sekhon. 2011. <a href="http://www.jstatsoft.org/article/view/v042i07">
102  * Multivariate and Propensity Score Matching Software with Automated Balance Optimization:
103  * The Matching package for R</a> Journal of Statistical Software, 42(7): 1-52.</li>
104  * <li>[4] Kim, P. J. and Jennrich, R. I. (1970). Tables of the Exact Sampling Distribution of the
105  * Two-Sample Kolmogorov-Smirnov Criterion D_mn, m ≦ n, in Selected Tables in Mathematical Statistics,
106  * Vol. 1, H. L. Harter and D. B. Owen, editors.</li>
107  * </ul>
108  * <p>
109  * Note that [1] contains an error in computing h, refer to <a
110  * href="https://issues.apache.org/jira/browse/MATH-437">MATH-437</a> for details.
111  */
112 public class KolmogorovSmirnovTest { // NOPMD - this is not a Junit test class, PMD false positive here
113 
114     /**
115      * Bound on the number of partial sums in {@link #ksSum(double, double, int)}
116      */
117     protected static final int MAXIMUM_PARTIAL_SUM_COUNT = 100000;
118 
119     /** Convergence criterion for {@link #ksSum(double, double, int)} */
120     protected static final double KS_SUM_CAUCHY_CRITERION = 1E-20;
121 
122     /** Convergence criterion for the sums in {@link #pelzGood(double, int)} */
123     protected static final double PG_SUM_RELATIVE_ERROR = 1.0e-10;
124 
125     /**
126      * When product of sample sizes exceeds this value, 2-sample K-S test uses asymptotic
127      * distribution to compute the p-value.
128      */
129     protected static final int LARGE_SAMPLE_PRODUCT = 10000;
130 
131     /**
132      * RandomDataGenerator used by {@link #bootstrap(double[], double[], int)}
133      * or to generate jitter to break ties in the data.
134      */
135     private final RandomDataGenerator gen = new RandomDataGenerator();
136 
137     /**
138      * Construct a KolmogorovSmirnovTest instance.
139      */
140     public KolmogorovSmirnovTest() {
141         super();
142     }
143 
144     /**
145      * Construct a KolmogorovSmirnovTest instance providing a seed for the PRNG
146      * used by the {@link #bootstrap(double[], double[], int)} method.
147      *
148      * @param seed the seed for the PRNG
149      */
150     public KolmogorovSmirnovTest(long seed) {
151         super();
152         gen.setSeed(seed);
153     }
154 
155     /**
156      * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
157      * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
158      * evaluating the null hypothesis that {@code data} conforms to {@code distribution}. If
159      * {@code exact} is true, the distribution used to compute the p-value is computed using
160      * extended precision. See {@link #cdfExact(double, int)}.
161      *
162      * @param distribution reference distribution
163      * @param data sample being evaluated
164      * @param exact whether or not to force exact computation of the p-value
165      * @return the p-value associated with the null hypothesis that {@code data} is a sample from
166      *         {@code distribution}
167      * @throws MathIllegalArgumentException if {@code data} does not have length at least 2
168      * @throws org.hipparchus.exception.NullArgumentException if {@code data} is null
169      */
170     public double kolmogorovSmirnovTest(RealDistribution distribution, double[] data, boolean exact) {
171         return 1d - cdf(kolmogorovSmirnovStatistic(distribution, data), data.length, exact);
172     }
173 
174     /**
175      * Computes the one-sample Kolmogorov-Smirnov test statistic, \(D_n=\sup_x |F_n(x)-F(x)|\) where
176      * \(F\) is the distribution (cdf) function associated with {@code distribution}, \(n\) is the
177      * length of {@code data} and \(F_n\) is the empirical distribution that puts mass \(1/n\) at
178      * each of the values in {@code data}.
179      *
180      * @param distribution reference distribution
181      * @param data sample being evaluated
182      * @return Kolmogorov-Smirnov statistic \(D_n\)
183      * @throws MathIllegalArgumentException if {@code data} does not have length at least 2
184      * @throws org.hipparchus.exception.NullArgumentException if {@code data} is null
185      */
186     public double kolmogorovSmirnovStatistic(RealDistribution distribution, double[] data) {
187         checkArray(data);
188         final int n = data.length;
189         final double nd = n;
190         final double[] dataCopy = new double[n];
191         System.arraycopy(data, 0, dataCopy, 0, n);
192         Arrays.sort(dataCopy);
193         double d = 0d;
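            // For sorted data, the supremum of |F_n(x) - F(x)| is attained at a sample point, either
            // just below the jump of F_n (where F_n = (i - 1) / n) or just above it (F_n = i / n),
            // hence the two candidates compared below.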
194         for (int i = 1; i <= n; i++) {
195             final double yi = distribution.cumulativeProbability(dataCopy[i - 1]);
196             final double currD = FastMath.max(yi - (i - 1) / nd, i / nd - yi);
197             if (currD > d) {
198                 d = currD;
199             }
200         }
201         return d;
202     }
203 
204     /**
205      * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a
206      * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
207      * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same
208      * probability distribution. Specifically, what is returned is an estimate of the probability
209      * that the {@link #kolmogorovSmirnovStatistic(double[], double[])} associated with a randomly
210      * selected partition of the combined sample into subsamples of sizes {@code x.length} and
211      * {@code y.length} will strictly exceed (if {@code strict} is {@code true}) or be at least as
212      * large as (if {@code strict} is {@code false}) {@code kolmogorovSmirnovStatistic(x, y)}.
213      * <ul>
214      * <li>For small samples (where the product of the sample sizes is less than
215      * {@link #LARGE_SAMPLE_PRODUCT}), the exact p-value is computed using the method presented
216      * in [4], implemented in {@link #exactP(double, int, int, boolean)}. </li>
217      * <li>When the product of the sample sizes exceeds {@link #LARGE_SAMPLE_PRODUCT}, the
218      * asymptotic distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)}
219      * for details on the approximation.</li>
220      * </ul><p>
221      * If {@code x.length * y.length} &lt; {@link #LARGE_SAMPLE_PRODUCT} and the combined set of values in
222      * {@code x} and {@code y} contains ties, random jitter is added to {@code x} and {@code y} to
223      * break ties before computing \(D_{n,m}\) and the p-value. The jitter is uniformly distributed
224      * on (-minDelta / 2, minDelta / 2) where minDelta is the smallest pairwise difference between
225      * values in the combined sample.</p>
226      * <p>
227      * If ties are known to be present in the data, {@link #bootstrap(double[], double[], int, boolean)}
228      * may be used as an alternative method for estimating the p-value.</p>
229      *
230      * @param x first sample dataset
231      * @param y second sample dataset
232      * @param strict whether or not the probability to compute is expressed as a strict inequality
233      *        (ignored for large samples)
234      * @return p-value associated with the null hypothesis that {@code x} and {@code y} represent
235      *         samples from the same distribution
236      * @throws MathIllegalArgumentException if either {@code x} or {@code y} does not have length at
237      *         least 2
238      * @throws org.hipparchus.exception.NullArgumentException if either {@code x} or {@code y} is null
239      * @see #bootstrap(double[], double[], int, boolean)
240      */
241     public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) {
242         final long lengthProduct = (long) x.length * y.length;
243         final double[] xa;
244         final double[] ya;
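            // Small samples with ties: jitter copies of the data to break the ties, then use the exact
            // distribution; large samples fall through to the asymptotic approximation on the original arrays.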
245         if (lengthProduct < LARGE_SAMPLE_PRODUCT && hasTies(x,y)) {
246             xa = x.clone();
247             ya = y.clone();
248             fixTies(xa, ya);
249         } else {
250             xa = x;
251             ya = y;
252         }
253         if (lengthProduct < LARGE_SAMPLE_PRODUCT) {
254             return exactP(kolmogorovSmirnovStatistic(xa, ya), x.length, y.length, strict);
255         }
256         return approximateP(kolmogorovSmirnovStatistic(x, y), x.length, y.length);
257     }
258 
259     /**
260      * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a
261      * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
262      * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same
263      * probability distribution. Assumes the strict form of the inequality used to compute the
264      * p-value. See {@link #kolmogorovSmirnovTest(double[], double[], boolean)}.
265      *
266      * @param x first sample dataset
267      * @param y second sample dataset
268      * @return p-value associated with the null hypothesis that {@code x} and {@code y} represent
269      *         samples from the same distribution
270      * @throws MathIllegalArgumentException if either {@code x} or {@code y} does not have length at
271      *         least 2
272      * @throws org.hipparchus.exception.NullArgumentException if either {@code x} or {@code y} is null
273      */
274     public double kolmogorovSmirnovTest(double[] x, double[] y) {
275         return kolmogorovSmirnovTest(x, y, true);
276     }
277 
278     /**
279      * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)
280      * where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
281      * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
282      * is the empirical distribution of the {@code y} values.
283      *
284      * @param x first sample
285      * @param y second sample
286      * @return test statistic \(D_{n,m}\) used to evaluate the null hypothesis that {@code x} and
287      *         {@code y} represent samples from the same underlying distribution
288      * @throws MathIllegalArgumentException if either {@code x} or {@code y} does not have length at
289      *         least 2
290      * @throws org.hipparchus.exception.NullArgumentException if either {@code x} or {@code y} is null
291      */
292     public double kolmogorovSmirnovStatistic(double[] x, double[] y) {
293         return integralKolmogorovSmirnovStatistic(x, y)/((double)(x.length * (long)y.length));
294     }
295 
296     /**
297      * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)
298      * where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
299      * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
300      * is the empirical distribution of the {@code y} values. Finally \(n m D_{n,m}\) is returned
301      * as long value.
302      *
303      * @param x first sample
304      * @param y second sample
305      * @return test statistic \(n m D_{n,m}\) used to evaluate the null hypothesis that {@code x} and
306      *         {@code y} represent samples from the same underlying distribution
307      * @throws MathIllegalArgumentException if either {@code x} or {@code y} does not have length at
308      *         least 2
309      * @throws org.hipparchus.exception.NullArgumentException if either {@code x} or {@code y} is null
310      */
311     private long integralKolmogorovSmirnovStatistic(double[] x, double[] y) {
312         checkArray(x);
313         checkArray(y);
314         // Copy and sort the sample arrays
315         final double[] sx = x.clone();
316         final double[] sy = y.clone();
317         Arrays.sort(sx);
318         Arrays.sort(sy);
319         final int n = sx.length;
320         final int m = sy.length;
321 
322         int rankX = 0;
323         int rankY = 0;
324         long curD = 0L;
325 
326         // Find the max difference between cdf_x and cdf_y
327         long supD = 0L;
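            // curD tracks n * m * (F_n(z) - F_m(z)) as z sweeps through the merged sorted samples:
            // passing an x value adds m, passing a y value subtracts n, so the quantity stays integral.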
328         do {
329             double z = Double.compare(sx[rankX], sy[rankY]) <= 0 ? sx[rankX] : sy[rankY];
330             while(rankX < n && Double.compare(sx[rankX], z) == 0) {
331                 rankX += 1;
332                 curD += m;
333             }
334             while(rankY < m && Double.compare(sy[rankY], z) == 0) {
335                 rankY += 1;
336                 curD -= n;
337             }
338             if (curD > supD) {
339                 supD = curD;
340             }
341             else if (-curD > supD) {
342                 supD = -curD;
343             }
344         } while(rankX < n && rankY < m);
345         return supD;
346     }
347 
348     /**
349      * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
350      * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
351      * evaluating the null hypothesis that {@code data} conforms to {@code distribution}.
352      *
353      * @param distribution reference distribution
354      * @param data sample being evaluated
355      * @return the p-value associated with the null hypothesis that {@code data} is a sample from
356      *         {@code distribution}
357      * @throws MathIllegalArgumentException if {@code data} does not have length at least 2
358      * @throws org.hipparchus.exception.NullArgumentException if {@code data} is null
359      */
360     public double kolmogorovSmirnovTest(RealDistribution distribution, double[] data) {
361         return kolmogorovSmirnovTest(distribution, data, false);
362     }
363 
364     /**
365      * Performs a <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov
366      * test</a> evaluating the null hypothesis that {@code data} conforms to {@code distribution}.
367      *
368      * @param distribution reference distribution
369      * @param data sample being evaluated
370      * @param alpha significance level of the test
371      * @return true iff the null hypothesis that {@code data} is a sample from {@code distribution}
372      *         can be rejected with confidence 1 - {@code alpha}
373      * @throws MathIllegalArgumentException if {@code data} does not have length at least 2
374      * @throws org.hipparchus.exception.NullArgumentException if {@code data} is null
375      */
376     public boolean kolmogorovSmirnovTest(RealDistribution distribution, double[] data, double alpha) {
377         if ((alpha <= 0) || (alpha > 0.5)) {
378             throw new MathIllegalArgumentException(LocalizedStatFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL, alpha, 0, 0.5);
379         }
380         return kolmogorovSmirnovTest(distribution, data) < alpha;
381     }
382 
383     /**
384      * Estimates the <i>p-value</i> of a two-sample
385      * <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
386      * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same
387      * probability distribution. This method estimates the p-value by repeatedly sampling sets of size
388      * {@code x.length} and {@code y.length} from the empirical distribution of the combined sample.
389      * When {@code strict} is true, this is equivalent to the algorithm implemented in the R function
390      * {@code ks.boot}, described in <pre>
391      * Jasjeet S. Sekhon. 2011. 'Multivariate and Propensity Score Matching
392      * Software with Automated Balance Optimization: The Matching package for R.'
393      * Journal of Statistical Software, 42(7): 1-52.
394      * </pre>
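         * <p>
         * A usage sketch (assuming {@code x} and {@code y} are samples that may contain ties; the seed and
         * iteration count are arbitrary):</p>
         * <pre>{@code
         * KolmogorovSmirnovTest test = new KolmogorovSmirnovTest(1234L); // seeded for reproducibility
         * double p = test.bootstrap(x, y, 10000, true);
         * }</pre>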
395      * @param x first sample
396      * @param y second sample
397      * @param iterations number of bootstrap resampling iterations
398      * @param strict whether or not the null hypothesis is expressed as a strict inequality
399      * @return estimated p-value
400      */
401     public double bootstrap(double[] x, double[] y, int iterations, boolean strict) {
402         final int xLength = x.length;
403         final int yLength = y.length;
404         final double[] combined = new double[xLength + yLength];
405         System.arraycopy(x, 0, combined, 0, xLength);
406         System.arraycopy(y, 0, combined, xLength, yLength);
407         final long d = integralKolmogorovSmirnovStatistic(x, y);
408         int greaterCount = 0;
409         int equalCount = 0;
410         double[] curX;
411         double[] curY;
412         long curD;
413         for (int i = 0; i < iterations; i++) {
414             curX = resample(combined, xLength);
415             curY = resample(combined, yLength);
416             curD = integralKolmogorovSmirnovStatistic(curX, curY);
417             if (curD > d) {
418                 greaterCount++;
419             } else if (curD == d) {
420                 equalCount++;
421             }
422         }
423         return strict ? greaterCount / (double) iterations :
424             (greaterCount + equalCount) / (double) iterations;
425     }
426 
427     /**
428      * Computes {@code bootstrap(x, y, iterations, true)}.
429      * This is equivalent to ks.boot(x,y, nboots=iterations) using the R Matching
430      * package function. See {@link #bootstrap(double[], double[], int, boolean)}.
431      *
432      * @param x first sample
433      * @param y second sample
434      * @param iterations number of bootstrap resampling iterations
435      * @return estimated p-value
436      */
437     public double bootstrap(double[] x, double[] y, int iterations) {
438         return bootstrap(x, y, iterations, true);
439     }
440 
441     /**
442      * Return a bootstrap sample (with replacement) of size k from sample.
443      *
444      * @param sample array to sample from
445      * @param k size of bootstrap sample
446      * @return bootstrap sample
447      */
448     private double[] resample(double[] sample, int k) {
449         final int len = sample.length;
450         final double[] out = new double[k];
451         for (int i = 0; i < k; i++) {
452             out[i] = sample[gen.nextInt(len)];
453         }
454         return out;
455     }
456 
457     /**
458      * Calculates {@code P(D_n < d)} using the method described in [1] with quick decisions for extreme
459      * values given in [2] (see above). The result is not exact as with
460      * {@link #cdfExact(double, int)} because calculations are based on
461      * {@code double} rather than {@link org.hipparchus.fraction.BigFraction}.
462      *
463      * @param d statistic
464      * @param n sample size
465      * @return \(P(D_n &lt; d)\)
466      * @throws MathRuntimeException if algorithm fails to convert {@code h} to a
467      *         {@link org.hipparchus.fraction.BigFraction} in expressing {@code d} as \((k
468      *         - h) / m\) for integer {@code k, m} and \(0 &lt;= h &lt; 1\)
469      */
470     public double cdf(double d, int n)
471         throws MathRuntimeException {
472         return cdf(d, n, false);
473     }
474 
475     /**
476      * Calculates {@code P(D_n < d)}. The result is exact in the sense that BigFraction/BigReal is
477      * used everywhere at the expense of very slow execution time. Almost never choose this in real
478      * applications unless you are very sure; this is almost solely for verification purposes.
479      * Normally, you would choose {@link #cdf(double, int)}. See the class
480      * javadoc for definitions and algorithm description.
481      *
482      * @param d statistic
483      * @param n sample size
484      * @return \(P(D_n &lt; d)\)
485      * @throws MathRuntimeException if the algorithm fails to convert {@code h} to a
486      *         {@link org.hipparchus.fraction.BigFraction} in expressing {@code d} as \((k
487      *         - h) / m\) for integer {@code k, m} and \(0 &lt;= h &lt; 1\)
488      */
489     public double cdfExact(double d, int n)
490         throws MathRuntimeException {
491         return cdf(d, n, true);
492     }
493 
494     /**
495      * Calculates {@code P(D_n < d)} using the method described in [1] with quick decisions for extreme
496      * values given in [2] (see above).
497      *
498      * @param d statistic
499      * @param n sample size
500      * @param exact whether the probability should be calculated exactly using
501      *        {@link org.hipparchus.fraction.BigFraction} everywhere at the expense of
502      *        very slow execution time, or if {@code double} should be used in convenient places to
503      *        gain speed. Almost never choose {@code true} in real applications unless you are very
504      *        sure; {@code true} is almost solely for verification purposes.
505      * @return \(P(D_n &lt; d)\)
506      * @throws MathRuntimeException if algorithm fails to convert {@code h} to a
507      *         {@link org.hipparchus.fraction.BigFraction} in expressing {@code d} as \((k
508      *         - h) / m\) for integer {@code k, m} and \(0 &lt;= h &lt; 1\).
509      */
510     public double cdf(double d, int n, boolean exact)
511         throws MathRuntimeException {
512 
513         final double ninv = 1 / ((double) n);
514         final double ninvhalf = 0.5 * ninv;
515 
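            // Quick decisions for extreme values of d (cf. [2]): the probability is 0 at or below 1/(2n),
            // has simple closed forms on (1/(2n), 1/n] and [1 - 1/n, 1), and is 1 at or above 1.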
516         if (d <= ninvhalf) {
517             return 0;
518         } else if (ninvhalf < d && d <= ninv) {
519             double res = 1;
520             final double f = 2 * d - ninv;
521             // n! f^n = n*f * (n-1)*f * ... * 1*f
522             for (int i = 1; i <= n; ++i) {
523                 res *= i * f;
524             }
525             return res;
526         } else if (1 - ninv <= d && d < 1) {
527             return 1 - 2 * Math.pow(1 - d, n);
528         } else if (1 <= d) {
529             return 1;
530         }
531         if (exact) {
532             return exactK(d, n);
533         }
534         if (n <= 140) {
535             return roundedK(d, n);
536         }
537         return pelzGood(d, n);
538     }
539 
540     /**
541      * Calculates the exact value of {@code P(D_n < d)} using the method described in [1] (reference
542      * in class javadoc above) and {@link org.hipparchus.fraction.BigFraction} (see
543      * above).
544      *
545      * @param d statistic
546      * @param n sample size
547      * @return the two-sided probability of \(P(D_n < d)\)
548      * @throws MathRuntimeException if algorithm fails to convert {@code h} to a
549      *         {@link org.hipparchus.fraction.BigFraction} in expressing {@code d} as \((k
550      *         - h) / m\) for integer {@code k, m} and \(0 \le h < 1\).
551      */
552     private double exactK(double d, int n)
553         throws MathRuntimeException {
554 
555         final int k = (int) Math.ceil(n * d);
556 
557         final FieldMatrix<BigFraction> H = this.createExactH(d, n);
558         final FieldMatrix<BigFraction> Hpower = H.power(n);
559 
560         BigFraction pFrac = Hpower.getEntry(k - 1, k - 1);
561 
562         for (int i = 1; i <= n; ++i) {
563             pFrac = pFrac.multiply(i).divide(n);
564         }
565 
566         /*
567          * BigFraction.doubleValue converts numerator to double and the denominator to double and
568          * divides afterwards. That gives NaN quite easy. This does not (scale is the number of
569          * digits):
570          */
571         return pFrac.bigDecimalValue(20, RoundingMode.HALF_UP).doubleValue();
572     }
573 
574     /**
575      * Calculates {@code P(D_n < d)} using the method described in [1] and doubles (see above).
576      *
577      * @param d statistic
578      * @param n sample size
579      * @return \(P(D_n < d)\)
580      */
581     private double roundedK(double d, int n) {
582 
583         final int k = (int) Math.ceil(n * d);
584         final RealMatrix H = this.createRoundedH(d, n);
585         final RealMatrix Hpower = H.power(n);
586 
587         double pFrac = Hpower.getEntry(k - 1, k - 1);
588         for (int i = 1; i <= n; ++i) {
589             pFrac *= (double) i / (double) n;
590         }
591 
592         return pFrac;
593     }
594 
595     /**
596      * Computes the Pelz-Good approximation for \(P(D_n &lt; d)\) as described in [2] in the class javadoc.
597      *
598      * @param d value of d-statistic (x in [2])
599      * @param n sample size
600      * @return \(P(D_n &lt; d)\)
601      */
602     public double pelzGood(double d, int n) {
603         // Change the variable since approximation is for the distribution evaluated at d / sqrt(n)
604         final double sqrtN = FastMath.sqrt(n);
605         final double z = d * sqrtN;
606         final double z2 = d * d * n;
607         final double z4 = z2 * z2;
608         final double z6 = z4 * z2;
609         final double z8 = z4 * z4;
610 
611         // Compute K_0(z)
612         double sum = 0;
613         double z2Term = MathUtils.PI_SQUARED / (8 * z2);
614         int k = 1;
615         for (; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
616             final double kTerm = 2 * k - 1;
617             final double increment = FastMath.exp(-z2Term * kTerm * kTerm);
618             sum += increment;
619             if (increment <= PG_SUM_RELATIVE_ERROR * sum) {
620                 break;
621             }
622         }
623         if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
624             throw new MathIllegalStateException(LocalizedCoreFormats.MAX_COUNT_EXCEEDED, MAXIMUM_PARTIAL_SUM_COUNT);
625         }
626         double ret = sum * FastMath.sqrt(2 * FastMath.PI) / z;
627 
628         // K_1(z)
629         // Sum is -inf to inf, but k term is always (k + 1/2) ^ 2, so really have
630         // twice the sum from k = 0 to inf (k = -1 is same as 0, -2 same as 1, ...)
631         final double twoZ2 = 2 * z2;
632         sum = 0;
633         for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
634             final double kTerm = k + 0.5;
635             final double kTerm2 = kTerm * kTerm;
636             final double increment = (MathUtils.PI_SQUARED * kTerm2 - z2) * FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
637             sum += increment;
638             if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) {
639                 break;
640             }
641         }
642         if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
643             throw new MathIllegalStateException(LocalizedCoreFormats.MAX_COUNT_EXCEEDED, MAXIMUM_PARTIAL_SUM_COUNT);
644         }
645         final double sqrtHalfPi = FastMath.sqrt(FastMath.PI / 2);
646         // Instead of doubling sum, divide by 3 instead of 6
647         ret += sum * sqrtHalfPi / (3 * z4 * sqrtN);
648 
649         // K_2(z)
650         // Same drill as K_1, but with two doubly infinite sums, all k terms are even powers.
651         final double z4Term = 2 * z4;
652         final double z6Term = 6 * z6;
653         z2Term = 5 * z2;
654         final double pi4 = MathUtils.PI_SQUARED * MathUtils.PI_SQUARED;
655         sum = 0;
656         for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
657             final double kTerm = k + 0.5;
658             final double kTerm2 = kTerm * kTerm;
659             final double increment =  (z6Term + z4Term + MathUtils.PI_SQUARED * (z4Term - z2Term) * kTerm2 +
660                     pi4 * (1 - twoZ2) * kTerm2 * kTerm2) * FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
661             sum += increment;
662             if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) {
663                 break;
664             }
665         }
666         if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
667             throw new MathIllegalStateException(LocalizedCoreFormats.MAX_COUNT_EXCEEDED, MAXIMUM_PARTIAL_SUM_COUNT);
668         }
669         double sum2 = 0;
670         for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
671             final double kTerm2 = k * k;
672             final double increment = MathUtils.PI_SQUARED * kTerm2 * FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
673             sum2 += increment;
674             if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum2)) {
675                 break;
676             }
677         }
678         if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
679             throw new MathIllegalStateException(LocalizedCoreFormats.MAX_COUNT_EXCEEDED, MAXIMUM_PARTIAL_SUM_COUNT);
680         }
681         // Again, adjust coefficients instead of doubling sum, sum2
682         ret += (sqrtHalfPi / n) * (sum / (36 * z2 * z2 * z2 * z) - sum2 / (18 * z2 * z));
683 
684         // K_3(z) One more time with feeling - two doubly infinite sums, all k powers even.
685         // Multiply coefficient denominators by 2, so omit doubling sums.
686         final double pi6 = pi4 * MathUtils.PI_SQUARED;
687         sum = 0;
688         for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
689             final double kTerm = k + 0.5;
690             final double kTerm2 = kTerm * kTerm;
691             final double kTerm4 = kTerm2 * kTerm2;
692             final double kTerm6 = kTerm4 * kTerm2;
693             final double increment = (pi6 * kTerm6 * (5 - 30 * z2) + pi4 * kTerm4 * (-60 * z2 + 212 * z4) +
694                             MathUtils.PI_SQUARED * kTerm2 * (135 * z4 - 96 * z6) - 30 * z6 - 90 * z8) *
695                     FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
696             sum += increment;
697             if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) {
698                 break;
699             }
700         }
701         if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
702             throw new MathIllegalStateException(LocalizedCoreFormats.MAX_COUNT_EXCEEDED, MAXIMUM_PARTIAL_SUM_COUNT);
703         }
704         sum2 = 0;
705         for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
706             final double kTerm2 = k * k;
707             final double kTerm4 = kTerm2 * kTerm2;
708             final double increment = (-pi4 * kTerm4 + 3 * MathUtils.PI_SQUARED * kTerm2 * z2) *
709                     FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
710             sum2 += increment;
711             if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum2)) {
712                 break;
713             }
714         }
715         if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
716             throw new MathIllegalStateException(LocalizedCoreFormats.MAX_COUNT_EXCEEDED, MAXIMUM_PARTIAL_SUM_COUNT);
717         }
718         return ret + (sqrtHalfPi / (sqrtN * n)) * (sum / (3240 * z6 * z4) +
719                 sum2 / (108 * z6));
720 
721     }
722 
723     /**
724      * Creates {@code H} of size {@code m x m} as described in [1] (see above).
725      *
726      * @param d statistic
727      * @param n sample size
728      * @return H matrix
729      * @throws MathIllegalArgumentException if the fractional part {@code h} is greater than or equal to 1
730      * @throws MathIllegalStateException if the algorithm fails to convert {@code h} to a
731      *         {@link org.hipparchus.fraction.BigFraction} in expressing {@code d} as \((k
732      *         - h) / m\) for integer {@code k, m} and \(0 &lt;= h &lt; 1\).
733      */
734     private FieldMatrix<BigFraction> createExactH(double d, int n)
735         throws MathIllegalArgumentException, MathIllegalStateException {
736 
737         final int k = (int) Math.ceil(n * d);
738         final int m = 2 * k - 1;
739         final double hDouble = k - n * d;
740         if (hDouble >= 1) {
741             throw new MathIllegalArgumentException(LocalizedCoreFormats.NUMBER_TOO_LARGE_BOUND_EXCLUDED,
742                                                    hDouble, 1.0);
743         }
744         BigFraction h;
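            // Convert h to a BigFraction, relaxing the approximation tolerance whenever the
            // continued-fraction construction fails to converge at the tighter epsilon.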
745         try {
746             h = new BigFraction(hDouble, 1.0e-20, 10000);
747         } catch (final MathIllegalStateException e1) {
748             try {
749                 h = new BigFraction(hDouble, 1.0e-10, 10000);
750             } catch (final MathIllegalStateException e2) {
751                 h = new BigFraction(hDouble, 1.0e-5, 10000);
752             }
753         }
754         final BigFraction[][] Hdata = new BigFraction[m][m];
755 
756         /*
757          * Start by filling everything with either 0 or 1.
758          */
759         for (int i = 0; i < m; ++i) {
760             for (int j = 0; j < m; ++j) {
761                 if (i - j + 1 < 0) {
762                     Hdata[i][j] = BigFraction.ZERO;
763                 } else {
764                     Hdata[i][j] = BigFraction.ONE;
765                 }
766             }
767         }
768 
769         /*
770          * Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ...
771          * hPowers[m-1] = h^m
772          */
773         final BigFraction[] hPowers = new BigFraction[m];
774         hPowers[0] = h;
775         for (int i = 1; i < m; ++i) {
776             hPowers[i] = h.multiply(hPowers[i - 1]);
777         }
778 
779         /*
780          * The first column and the last row have special values (each the reverse of the other).
781          */
782         for (int i = 0; i < m; ++i) {
783             Hdata[i][0] = Hdata[i][0].subtract(hPowers[i]);
784             Hdata[m - 1][i] = Hdata[m - 1][i].subtract(hPowers[m - i - 1]);
785         }
786 
787         /*
788          * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m +
789          * (2h - 1)^m )/m!" Since 0 <= h < 1, it is sufficient to check whether h > 1/2:
790          */
791         if (h.compareTo(BigFraction.ONE_HALF) > 0) {
792             Hdata[m - 1][0] = Hdata[m - 1][0].add(h.multiply(2).subtract(1).pow(m));
793         }
794 
795         /*
796          * Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if i -
797          * j + 1 >= 0, else 0. 1's and 0's are already put, so only division with (i - j + 1)! is
798          * needed in the elements that have 1's. There is no need to calculate (i - j + 1)! and then
799          * divide - small steps avoid overflows. Note that i - j + 1 > 0 <=> j < i + 1, so the inner
800          * loop over j only needs to run up to i rather than all the way to m. Also note that g starts
801          * at 2 because dividing by 1 is not necessary.
802          */
803         for (int i = 0; i < m; ++i) {
804             for (int j = 0; j < i + 1; ++j) {
805                 if (i - j + 1 > 0) {
806                     for (int g = 2; g <= i - j + 1; ++g) {
807                         Hdata[i][j] = Hdata[i][j].divide(g);
808                     }
809                 }
810             }
811         }
812         return new Array2DRowFieldMatrix<BigFraction>(BigFractionField.getInstance(), Hdata);
813     }
814 
815     /**
816      * Creates {@code H} of size {@code m x m} as described in [1] (see above)
817      * using double-precision.
818      *
819      * @param d statistic
820      * @param n sample size
821      * @return H matrix
822      * @throws MathIllegalArgumentException if the fractional part {@code h} is greater than or equal to 1
823      */
824     private RealMatrix createRoundedH(double d, int n)
825         throws MathIllegalArgumentException {
826 
827         final int k = (int) Math.ceil(n * d);
828         final int m = 2 * k - 1;
829         final double h = k - n * d;
830         if (h >= 1) {
831             throw new MathIllegalArgumentException(LocalizedCoreFormats.NUMBER_TOO_LARGE_BOUND_EXCLUDED,
832                                                    h, 1.0);
833         }
834         final double[][] Hdata = new double[m][m];
835 
836         /*
837          * Start by filling everything with either 0 or 1.
838          */
839         for (int i = 0; i < m; ++i) {
840             for (int j = 0; j < m; ++j) {
841                 if (i - j + 1 < 0) {
842                     Hdata[i][j] = 0;
843                 } else {
844                     Hdata[i][j] = 1;
845                 }
846             }
847         }
848 
849         /*
850          * Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ...
851          * hPowers[m-1] = h^m
852          */
853         final double[] hPowers = new double[m];
854         hPowers[0] = h;
855         for (int i = 1; i < m; ++i) {
856             hPowers[i] = h * hPowers[i - 1];
857         }
858 
859         /*
860          * The first column and the last row have special values (each the reverse of the other).
861          */
862         for (int i = 0; i < m; ++i) {
863             Hdata[i][0] = Hdata[i][0] - hPowers[i];
864             Hdata[m - 1][i] -= hPowers[m - i - 1];
865         }
866 
867         /*
868          * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m +
869          * (2h - 1)^m )/m!" Since 0 <= h < 1, it is sufficient to check whether h > 1/2:
870          */
871         if (Double.compare(h, 0.5) > 0) {
872             Hdata[m - 1][0] += FastMath.pow(2 * h - 1, m);
873         }
874 
875         /*
876          * Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if i -
877          * j + 1 >= 0, else 0. 1's and 0's are already put, so only division with (i - j + 1)! is
878          * needed in the elements that have 1's. There is no need to calculate (i - j + 1)! and then
879          * divide - small steps avoid overflows. Note that i - j + 1 > 0 <=> j < i + 1, so the inner
880          * loop over j only needs to run up to i rather than all the way to m. Also note that g starts
881          * at 2 because dividing by 1 is not necessary.
882          */
883         for (int i = 0; i < m; ++i) {
884             for (int j = 0; j < i + 1; ++j) {
885                 if (i - j + 1 > 0) {
886                     for (int g = 2; g <= i - j + 1; ++g) {
887                         Hdata[i][j] /= g;
888                     }
889                 }
890             }
891         }
892         return MatrixUtils.createRealMatrix(Hdata);
893     }
894 
895     /**
896      * Verifies that {@code array} has length at least 2.
897      *
898      * @param array array to test
899      * @throws org.hipparchus.exception.NullArgumentException if array is null
900      * @throws MathIllegalArgumentException if array is too short
901      */
902     private void checkArray(double[] array) {
903         MathUtils.checkNotNull(array);
904         if (array.length < 2) {
905             throw new MathIllegalArgumentException(LocalizedCoreFormats.INSUFFICIENT_OBSERVED_POINTS_IN_SAMPLE, array.length,
906                                                    2);
907         }
908     }
909 
910     /**
911      * Computes \( 1 + 2 \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2} \) stopping when successive partial
912      * sums are within {@code tolerance} of one another, or when {@code maxIterations} partial sums
913      * have been computed. If the sum does not converge before {@code maxIterations} iterations a
914      * {@link MathIllegalStateException} is thrown.
915      *
916      * @param t argument
917      * @param tolerance Cauchy criterion for partial sums
918      * @param maxIterations maximum number of partial sums to compute
919      * @return Kolmogorov sum evaluated at t
920      * @throws MathIllegalStateException if the series does not converge
921      */
922     public double ksSum(double t, double tolerance, int maxIterations) {
923         if (t == 0.0) {
924             return 0.0;
925         }
926 
927         // TODO: for small t (say less than 1), the alternative expansion in part 3 of [1]
928         // from class javadoc should be used.
929 
930         final double x = -2 * t * t;
931         int sign = -1;
932         long i = 1;
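            // Start the partial sum at 1/2 so that doubling it at the end yields
            // 1 + 2 * sum_{i >= 1} (-1)^i * exp(-2 i^2 t^2)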
933         double partialSum = 0.5d;
934         double delta = 1;
935         while (delta > tolerance && i < maxIterations) {
936             delta = FastMath.exp(x * i * i);
937             partialSum += sign * delta;
938             sign *= -1;
939             i++;
940         }
941         if (i == maxIterations) {
942             throw new MathIllegalStateException(LocalizedCoreFormats.MAX_COUNT_EXCEEDED, maxIterations);
943         }
944         return partialSum * 2;
945     }
946 
947     /**
948      * Computes \(P(D_{n,m} &gt; d)\) if {@code strict} is {@code true}; otherwise \(P(D_{n,m} \ge
949      * d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic. See
950      * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
951      * <p>
952      * The returned probability is exact, implemented by unwinding the recursive function
953      * definitions presented in [4] from the class javadoc.
954      * </p>
955      *
956      * @param d D-statistic value
957      * @param n first sample size
958      * @param m second sample size
959      * @param strict whether or not the probability to compute is expressed as a strict inequality
960      * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
961      *         greater than (resp. greater than or equal to) {@code d}
962      */
963     public double exactP(double d, int n, int m, boolean strict) {
964         if (d < 1 / ((double) m * n)) {
965             return 1.0;
966         } else if (d >= 1) {
967             return 0;
968         }
969         double normalizeD = normalizeD(d, n, m);
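            // D_{n,m} only takes values on the 1/(nm) grid, so P(D >= d) at a meshpoint equals
            // P(D > d - 1/(nm)); the shift below reduces the non-strict case to the strict computation.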
970         if (!strict) {
971             normalizeD -= 1 / ((double)n * m);
972         }
973         return exactPAtMeshpoint(normalizeD, n, m);
974     }
975 
976     /**
977      * Normalizes a value to an integral multiple of 1/mn between 0 and 1.
978      * If d < 1/mn, 0 is returned; if d > 1, 1 is returned; if d is very close
979      * to an integral multiple of 1/mn, that value is returned; otherwise the
980      * returned value is the smallest multiple of 1/mn less than or equal to d.
981      *
982      * @param d d value
983      * @param n first sample size
984      * @param m second sample size
985      * @return d value suitable for input to exactPAtMeshpoint(d, m, n)
986      */
987     private double normalizeD(double d, int n, int m) {
988         final double resolution = 1 / ((double)n * m);
989         final double tol = 1e-12;
990 
991         // If d is smaller than the first mesh point, return 0
992         // If greater than 1, return 1
993         if (d < resolution) {
994             return 0;
995         } else if (d > 1) {
996             return 1;
997         }
998 
999         // Normalize d to the smallest mesh point less than or equal to d;
1000         // except if d is less than tol less than the next mesh point, bump it up
1001         final double resolutions = d / resolution;
1002         final double ceil = FastMath.ceil(resolutions);
1003         if (ceil - resolutions < tol) {
1004            return ceil * resolution;
1005         } else {
1006            return FastMath.floor(resolutions) * resolution;
1007         }
1008 
1009     }
1010 
1011     /**
1012      * Computes \(P(D_{n,m} &gt; d)\) where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic. See
1013      * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
1014      * <p>
1015      * The returned probability is exact, implemented by unwinding the recursive function
1016      * definitions presented in [4].
1017      *
1018      * @param d D-statistic value (must be a "meshpoint" - i.e., a possible actual value of D(m,n)).
1019      * @param n first sample size
1020      * @param m second sample size
1021      * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
1022      *         strictly greater than {@code d}
1023      */
1024     private double exactPAtMeshpoint(double d, int n, int m) {
1025         final int nn = FastMath.max(n, m);
1026         final int mm = FastMath.min(n, m);
1027         final double[] u = new double[nn + 2];
1028         final double k = mm * nn * d + 0.5;
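             // Kim & Jennrich [4] recursion: u[j + 1] holds the (rescaled) weight of monotone lattice paths
             // to (i, j) that stay inside the band |nn * i - mm * j| <= mm * nn * d; the added 0.5 guards
             // against round-off at meshpoints. The per-row factor w normalizes the weights so that the
             // final entry u[nn + 1] is P(D_{n,m} <= d), whose complement is returned.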
1029         u[1] = 1d;
1030         for (int j = 1; j < nn + 1; j++) {
1031             u[j + 1] = 1;
1032             if (mm * j > k) {
1033                 u[j + 1] = 0;
1034             }
1035         }
1036         for (int i = 1; i < mm + 1; i++) {
1037             final double w = (double) i / (double) (i + nn);
1038             u[1] = w * u[1];
1039             if (nn * i > k) {
1040                 u[1] = 0;
1041             }
1042             for (int j = 1; j < nn + 1; j++) {
1043                 u[j + 1] = u[j] + u[j + 1] * w;
1044                 if (FastMath.abs(nn * i - mm * j) > k) {
1045                     u[j + 1] = 0;
1046                 }
1047             }
1048         }
1049         return 1 - u[nn + 1];
1050     }
1051 
1052     /**
1053      * Uses the Kolmogorov-Smirnov distribution to approximate \(P(D_{n,m} &gt; d)\) where \(D_{n,m}\)
1054      * is the 2-sample Kolmogorov-Smirnov statistic. See
1055      * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
1056      * <p>
1057      * Specifically, what is returned is \(1 - k(d \sqrt{mn / (m + n)})\) where \(k(t) = 1 + 2
1058      * \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2}\). See {@link #ksSum(double, double, int)} for
1059      * details on how convergence of the sum is determined. This implementation passes {@code ksSum}
1060      * {@link #KS_SUM_CAUCHY_CRITERION} as {@code tolerance} and
1061      * {@link #MAXIMUM_PARTIAL_SUM_COUNT} as {@code maxIterations}.
1062      * </p>
1063      *
1064      * @param d D-statistic value
1065      * @param n first sample size
1066      * @param m second sample size
1067      * @return approximate probability that a randomly selected m-n partition of m + n generates
1068      *         \(D_{n,m}\) greater than {@code d}
1069      */
1070     public double approximateP(double d, int n, int m) {
1071         final double dm = m;
1072         final double dn = n;
1073         return 1 - ksSum(d * FastMath.sqrt((dm * dn) / (dm + dn)),
1074                          KS_SUM_CAUCHY_CRITERION, MAXIMUM_PARTIAL_SUM_COUNT);
1075     }
1076 
1077     /**
1078      * Fills a boolean array randomly with a fixed number of {@code true} values.
1079      * The method uses a simplified version of the Fisher-Yates shuffle algorithm.
1080      * By processing first the {@code true} values followed by the remaining {@code false} values
1081      * fewer random numbers need to be generated. The method is optimized for the case
1082      * that the number of {@code true} values is larger than or equal to the number of
1083      * {@code false} values.
1084      *
1085      * @param b boolean array
1086      * @param numberOfTrueValues number of {@code true} values the boolean array should finally have
1087      * @param rng random data generator
1088      */
1089     static void fillBooleanArrayRandomlyWithFixedNumberTrueValues(final boolean[] b, final int numberOfTrueValues, final RandomGenerator rng) {
1090         Arrays.fill(b, true);
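             // All entries start true; for each index k beyond numberOfTrueValues pick a random slot
             // r in [0, k] and clear b[r] if it is still true, otherwise clear b[k], which is still true
             // at that point, so that exactly numberOfTrueValues entries remain true at the end.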
1091         for (int k = numberOfTrueValues; k < b.length; k++) {
1092             final int r = rng.nextInt(k + 1);
1093             b[(b[r]) ? r : k] = false;
1094         }
1095     }
1096 
1097     /**
1098      * If there are no ties in the combined dataset formed from x and y, this
1099      * method is a no-op.  If there are ties, a uniform random deviate in
1100      * (-minDelta / 2, minDelta / 2) - {0} is added to each value in x and y, where
1101      * minDelta is the minimum difference between unequal values in the combined
1102      * sample.  A fixed seed is used to generate the jitter, so repeated invocations
1103      * with the same input arrays result in the same values.
1104      *
1105      * NOTE: if there are ties in the data, this method overwrites the data in
1106      * x and y with the jittered values.
1107      *
1108      * @param x first sample
1109      * @param y second sample
1110      */
1111     private void fixTies(double[] x, double[] y) {
1112        final double[] values = MathArrays.unique(MathArrays.concatenate(x,y));
1113        if (values.length == x.length + y.length) {
1114            return;  // There are no ties
1115        }
1116 
1117        // Find the smallest difference between values, or 1 if all values are the same
1118        double minDelta = 1;
1119        double prev = values[0];
1120        for (int i = 1; i < values.length; i++) {
1121           final double delta = prev - values[i];
1122           if (delta < minDelta) {
1123               minDelta = delta;
1124           }
1125           prev = values[i];
1126        }
1127        minDelta /= 2;
1128 
1129        // Add jitter using a fixed seed (so same arguments always give same results),
1130        // low-initialization-overhead generator
1131        gen.setSeed(100);
1132 
1133        // It is theoretically possible that jitter does not break ties, so repeat
1134        // until all ties are gone.  Bound the loop and throw MIE if bound is exceeded.
1135        int ct = 0;
1136        boolean ties;
1137        do {
1138            jitter(x, minDelta);
1139            jitter(y, minDelta);
1140            ties = hasTies(x, y);
1141            ct++;
1142        } while (ties && ct < 1000);
1143        if (ties) {
1144            throw MathRuntimeException.createInternalError(); // Should never happen
1145        }
1146     }
1147 
1148     /**
1149      * Returns true iff there are ties in the combined sample
1150      * formed from x and y.
1151      *
1152      * @param x first sample
1153      * @param y second sample
1154      * @return true if x and y together contain ties
1155      */
1156     private static boolean hasTies(double[] x, double[] y) {
1157         final HashSet<Double> values = new HashSet<>();
1158         for (int i = 0; i < x.length; i++) {
1159             if (!values.add(x[i])) {
1160                 return true;
1161             }
1162         }
1163         for (int i = 0; i < y.length; i++) {
1164             if (!values.add(y[i])) {
1165                 return true;
1166             }
1167         }
1168         return false;
1169     }
1170 
1171     /**
1172      * Adds random jitter to {@code data} using uniform deviates between {@code -delta} and {@code delta}.
1173      * <p>
1174      * Note that jitter is applied in-place - i.e., the array
1175      * values are overwritten with the result of applying jitter.</p>
1176      *
1177      * @param data input/output data array - entries overwritten by the method
1178      * @param delta max magnitude of jitter
1179      * @throws NullPointerException if either of the parameters is null
1180      */
1181     private void jitter(double[] data, double delta) {
1182         for (int i = 0; i < data.length; i++) {
1183             data[i] += gen.nextUniform(-delta, delta);
1184         }
1185     }
1186 
1187 }