1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * https://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 /* 19 * This is not the original file distributed by the Apache Software Foundation 20 * It has been modified by the Hipparchus project 21 */ 22 23 package org.hipparchus.stat.inference; 24 25 import java.math.RoundingMode; 26 import java.util.Arrays; 27 import java.util.HashSet; 28 29 import org.hipparchus.distribution.RealDistribution; 30 import org.hipparchus.exception.LocalizedCoreFormats; 31 import org.hipparchus.exception.MathIllegalArgumentException; 32 import org.hipparchus.exception.MathIllegalStateException; 33 import org.hipparchus.exception.MathRuntimeException; 34 import org.hipparchus.fraction.BigFraction; 35 import org.hipparchus.fraction.BigFractionField; 36 import org.hipparchus.linear.Array2DRowFieldMatrix; 37 import org.hipparchus.linear.FieldMatrix; 38 import org.hipparchus.linear.MatrixUtils; 39 import org.hipparchus.linear.RealMatrix; 40 import org.hipparchus.random.RandomDataGenerator; 41 import org.hipparchus.random.RandomGenerator; 42 import org.hipparchus.stat.LocalizedStatFormats; 43 import org.hipparchus.util.FastMath; 44 import org.hipparchus.util.MathArrays; 45 import org.hipparchus.util.MathUtils; 46 47 /** 48 * 
Implementation of the <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> 49 * Kolmogorov-Smirnov (K-S) test</a> for equality of continuous distributions. 50 * <p> 51 * The K-S test uses a statistic based on the maximum deviation of the empirical distribution of 52 * sample data points from the distribution expected under the null hypothesis. For one-sample tests 53 * evaluating the null hypothesis that a set of sample data points follow a given distribution, the 54 * test statistic is \(D_n=\sup_x |F_n(x)-F(x)|\), where \(F\) is the expected distribution and 55 * \(F_n\) is the empirical distribution of the \(n\) sample data points. The distribution of 56 * \(D_n\) is estimated using a method based on [1] with certain quick decisions for extreme values 57 * given in [2]. 58 * <p> 59 * Two-sample tests are also supported, evaluating the null hypothesis that the two samples 60 * {@code x} and {@code y} come from the same underlying distribution. In this case, the test 61 * statistic is \(D_{n,m}=\sup_t | F_n(t)-F_m(t)|\) where \(n\) is the length of {@code x}, \(m\) is 62 * the length of {@code y}, \(F_n\) is the empirical distribution that puts mass \(1/n\) at each of 63 * the values in {@code x} and \(F_m\) is the empirical distribution of the {@code y} values. The 64 * default 2-sample test method, {@link #kolmogorovSmirnovTest(double[], double[])} works as 65 * follows: 66 * <ul> 67 * <li>For small samples (where the product of the sample sizes is less than 68 * {@link #LARGE_SAMPLE_PRODUCT}), the method presented in [4] is used to compute the 69 * exact p-value for the 2-sample test.</li> 70 * <li>When the product of the sample sizes exceeds {@link #LARGE_SAMPLE_PRODUCT}, the asymptotic 71 * distribution of \(D_{n,m}\) is used. 
See {@link #approximateP(double, int, int)} for details on 72 * the approximation.</li> 73 * </ul> 74 * <p> 75 * If the product of the sample sizes is less than {@link #LARGE_SAMPLE_PRODUCT} and the sample 76 * data contains ties, random jitter is added to the sample data to break ties before applying 77 * the algorithm above. Alternatively, the {@link #bootstrap(double[], double[], int, boolean)} 78 * method, modeled after <a href="http://sekhon.berkeley.edu/matching/ks.boot.html">ks.boot</a> 79 * in the R Matching package [3], can be used if ties are known to be present in the data. 80 * <p> 81 * In the two-sample case, \(D_{n,m}\) has a discrete distribution. This makes the p-value 82 * associated with the null hypothesis \(H_0 : D_{n,m} \ge d \) differ from \(H_0 : D_{n,m} > d \) 83 * by the mass of the observed value \(d\). To distinguish these, the two-sample tests use a boolean 84 * {@code strict} parameter. This parameter is ignored for large samples. 85 * <p> 86 * The methods used by the 2-sample default implementation are also exposed directly: 87 * <ul> 88 * <li>{@link #exactP(double, int, int, boolean)} computes exact 2-sample p-values</li> 89 * <li>{@link #approximateP(double, int, int)} uses the asymptotic distribution The {@code boolean} 90 * arguments in the first two methods allow the probability used to estimate the p-value to be 91 * expressed using strict or non-strict inequality. See 92 * {@link #kolmogorovSmirnovTest(double[], double[], boolean)}.</li> 93 * </ul> 94 * <p> 95 * References: 96 * <ul> 97 * <li>[1] <a href="http://www.jstatsoft.org/v08/i18/"> Evaluating Kolmogorov's Distribution</a> by 98 * George Marsaglia, Wai Wan Tsang, and Jingbo Wang</li> 99 * <li>[2] <a href="http://www.jstatsoft.org/v39/i11/"> Computing the Two-Sided Kolmogorov-Smirnov 100 * Distribution</a> by Richard Simard and Pierre L'Ecuyer</li> 101 * <li>[3] Jasjeet S. Sekhon. 2011. 
<a href="http://www.jstatsoft.org/article/view/v042i07"> 102 * Multivariate and Propensity Score Matching Software with Automated Balance Optimization: 103 * The Matching package for R</a> Journal of Statistical Software, 42(7): 1-52.</li> 104 * <li>[4] Kim, P. J. and Jennrich, R. I. (1970). Tables of the Exact Sampling Distribution of the 105 * Two-Sample Kolmogorov-Smirnov Criterion D_mn ,m≦n in Selected Tables in Mathematical Statistics, 106 * Vol. 1, H. L. Harter and D. B. Owen, editors.</li> 107 * </ul> 108 * <p> 109 * Note that [1] contains an error in computing h, refer to <a 110 * href="https://issues.apache.org/jira/browse/MATH-437">MATH-437</a> for details. 111 */ 112 public class KolmogorovSmirnovTest { // NOPMD - this is not a Junit test class, PMD false positive here 113 114 /** 115 * Bound on the number of partial sums in {@link #ksSum(double, double, int)} 116 */ 117 protected static final int MAXIMUM_PARTIAL_SUM_COUNT = 100000; 118 119 /** Convergence criterion for {@link #ksSum(double, double, int)} */ 120 protected static final double KS_SUM_CAUCHY_CRITERION = 1E-20; 121 122 /** Convergence criterion for the sums in #pelzGood(double, double, int)} */ 123 protected static final double PG_SUM_RELATIVE_ERROR = 1.0e-10; 124 125 /** 126 * When product of sample sizes exceeds this value, 2-sample K-S test uses asymptotic 127 * distribution to compute the p-value. 128 */ 129 protected static final int LARGE_SAMPLE_PRODUCT = 10000; 130 131 /** 132 * RandomDataGenerator used by {@link #bootstrap(double[], double[], int)} 133 * or to generate jitter to break ties in the data. 134 */ 135 private final RandomDataGenerator gen = new RandomDataGenerator(); 136 137 /** 138 * Construct a KolmogorovSmirnovTest instance. 139 */ 140 public KolmogorovSmirnovTest() { 141 super(); 142 } 143 144 /** 145 * Construct a KolmogorovSmirnovTest instance providing a seed for the PRNG 146 * used by the {@link #bootstrap(double[], double[], int)} method. 
147 * 148 * @param seed the seed for the PRNG 149 */ 150 public KolmogorovSmirnovTest(long seed) { 151 super(); 152 gen.setSeed(seed); 153 } 154 155 /** 156 * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a 157 * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a> 158 * evaluating the null hypothesis that {@code data} conforms to {@code distribution}. If 159 * {@code exact} is true, the distribution used to compute the p-value is computed using 160 * extended precision. See {@link #cdfExact(double, int)}. 161 * 162 * @param distribution reference distribution 163 * @param data sample being being evaluated 164 * @param exact whether or not to force exact computation of the p-value 165 * @return the p-value associated with the null hypothesis that {@code data} is a sample from 166 * {@code distribution} 167 * @throws MathIllegalArgumentException if {@code data} does not have length at least 2 168 * @throws org.hipparchus.exception.NullArgumentException if {@code data} is null 169 */ 170 public double kolmogorovSmirnovTest(RealDistribution distribution, double[] data, boolean exact) { 171 return 1d - cdf(kolmogorovSmirnovStatistic(distribution, data), data.length, exact); 172 } 173 174 /** 175 * Computes the one-sample Kolmogorov-Smirnov test statistic, \(D_n=\sup_x |F_n(x)-F(x)|\) where 176 * \(F\) is the distribution (cdf) function associated with {@code distribution}, \(n\) is the 177 * length of {@code data} and \(F_n\) is the empirical distribution that puts mass \(1/n\) at 178 * each of the values in {@code data}. 
179 * 180 * @param distribution reference distribution 181 * @param data sample being evaluated 182 * @return Kolmogorov-Smirnov statistic \(D_n\) 183 * @throws MathIllegalArgumentException if {@code data} does not have length at least 2 184 * @throws org.hipparchus.exception.NullArgumentException if {@code data} is null 185 */ 186 public double kolmogorovSmirnovStatistic(RealDistribution distribution, double[] data) { 187 checkArray(data); 188 final int n = data.length; 189 final double nd = n; 190 final double[] dataCopy = new double[n]; 191 System.arraycopy(data, 0, dataCopy, 0, n); 192 Arrays.sort(dataCopy); 193 double d = 0d; 194 for (int i = 1; i <= n; i++) { 195 final double yi = distribution.cumulativeProbability(dataCopy[i - 1]); 196 final double currD = FastMath.max(yi - (i - 1) / nd, i / nd - yi); 197 if (currD > d) { 198 d = currD; 199 } 200 } 201 return d; 202 } 203 204 /** 205 * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a 206 * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a> 207 * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same 208 * probability distribution. Specifically, what is returned is an estimate of the probability 209 * that the {@link #kolmogorovSmirnovStatistic(double[], double[])} associated with a randomly 210 * selected partition of the combined sample into subsamples of sizes {@code x.length} and 211 * {@code y.length} will strictly exceed (if {@code strict} is {@code true}) or be at least as 212 * large as {@code strict = false}) as {@code kolmogorovSmirnovStatistic(x, y)}. 213 * <ul> 214 * <li>For small samples (where the product of the sample sizes is less than 215 * {@link #LARGE_SAMPLE_PRODUCT}), the exact p-value is computed using the method presented 216 * in [4], implemented in {@link #exactP(double, int, int, boolean)}. 
</li> 217 * <li>When the product of the sample sizes exceeds {@link #LARGE_SAMPLE_PRODUCT}, the 218 * asymptotic distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)} 219 * for details on the approximation.</li> 220 * </ul><p> 221 * If {@code x.length * y.length} < {@link #LARGE_SAMPLE_PRODUCT} and the combined set of values in 222 * {@code x} and {@code y} contains ties, random jitter is added to {@code x} and {@code y} to 223 * break ties before computing \(D_{n,m}\) and the p-value. The jitter is uniformly distributed 224 * on (-minDelta / 2, minDelta / 2) where minDelta is the smallest pairwise difference between 225 * values in the combined sample.</p> 226 * <p> 227 * If ties are known to be present in the data, {@link #bootstrap(double[], double[], int, boolean)} 228 * may be used as an alternative method for estimating the p-value.</p> 229 * 230 * @param x first sample dataset 231 * @param y second sample dataset 232 * @param strict whether or not the probability to compute is expressed as a strict inequality 233 * (ignored for large samples) 234 * @return p-value associated with the null hypothesis that {@code x} and {@code y} represent 235 * samples from the same distribution 236 * @throws MathIllegalArgumentException if either {@code x} or {@code y} does not have length at 237 * least 2 238 * @throws org.hipparchus.exception.NullArgumentException if either {@code x} or {@code y} is null 239 * @see #bootstrap(double[], double[], int, boolean) 240 */ 241 public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) { 242 final long lengthProduct = (long) x.length * y.length; 243 final double[] xa; 244 final double[] ya; 245 if (lengthProduct < LARGE_SAMPLE_PRODUCT && hasTies(x,y)) { 246 xa = x.clone(); 247 ya = y.clone(); 248 fixTies(xa, ya); 249 } else { 250 xa = x; 251 ya = y; 252 } 253 if (lengthProduct < LARGE_SAMPLE_PRODUCT) { 254 return exactP(kolmogorovSmirnovStatistic(xa, ya), x.length, y.length, strict); 255 
} 256 return approximateP(kolmogorovSmirnovStatistic(x, y), x.length, y.length); 257 } 258 259 /** 260 * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a 261 * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a> 262 * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same 263 * probability distribution. Assumes the strict form of the inequality used to compute the 264 * p-value. See {@link #kolmogorovSmirnovTest(RealDistribution, double[], boolean)}. 265 * 266 * @param x first sample dataset 267 * @param y second sample dataset 268 * @return p-value associated with the null hypothesis that {@code x} and {@code y} represent 269 * samples from the same distribution 270 * @throws MathIllegalArgumentException if either {@code x} or {@code y} does not have length at 271 * least 2 272 * @throws org.hipparchus.exception.NullArgumentException if either {@code x} or {@code y} is null 273 */ 274 public double kolmogorovSmirnovTest(double[] x, double[] y) { 275 return kolmogorovSmirnovTest(x, y, true); 276 } 277 278 /** 279 * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\) 280 * where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the 281 * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\) 282 * is the empirical distribution of the {@code y} values. 
283 * 284 * @param x first sample 285 * @param y second sample 286 * @return test statistic \(D_{n,m}\) used to evaluate the null hypothesis that {@code x} and 287 * {@code y} represent samples from the same underlying distribution 288 * @throws MathIllegalArgumentException if either {@code x} or {@code y} does not have length at 289 * least 2 290 * @throws org.hipparchus.exception.NullArgumentException if either {@code x} or {@code y} is null 291 */ 292 public double kolmogorovSmirnovStatistic(double[] x, double[] y) { 293 return integralKolmogorovSmirnovStatistic(x, y)/((double)(x.length * (long)y.length)); 294 } 295 296 /** 297 * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\) 298 * where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the 299 * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\) 300 * is the empirical distribution of the {@code y} values. Finally \(n m D_{n,m}\) is returned 301 * as long value. 
302 * 303 * @param x first sample 304 * @param y second sample 305 * @return test statistic \(n m D_{n,m}\) used to evaluate the null hypothesis that {@code x} and 306 * {@code y} represent samples from the same underlying distribution 307 * @throws MathIllegalArgumentException if either {@code x} or {@code y} does not have length at 308 * least 2 309 * @throws org.hipparchus.exception.NullArgumentException if either {@code x} or {@code y} is null 310 */ 311 private long integralKolmogorovSmirnovStatistic(double[] x, double[] y) { 312 checkArray(x); 313 checkArray(y); 314 // Copy and sort the sample arrays 315 final double[] sx = x.clone(); 316 final double[] sy = y.clone(); 317 Arrays.sort(sx); 318 Arrays.sort(sy); 319 final int n = sx.length; 320 final int m = sy.length; 321 322 int rankX = 0; 323 int rankY = 0; 324 long curD = 0l; 325 326 // Find the max difference between cdf_x and cdf_y 327 long supD = 0l; 328 do { 329 double z = Double.compare(sx[rankX], sy[rankY]) <= 0 ? sx[rankX] : sy[rankY]; 330 while(rankX < n && Double.compare(sx[rankX], z) == 0) { 331 rankX += 1; 332 curD += m; 333 } 334 while(rankY < m && Double.compare(sy[rankY], z) == 0) { 335 rankY += 1; 336 curD -= n; 337 } 338 if (curD > supD) { 339 supD = curD; 340 } 341 else if (-curD > supD) { 342 supD = -curD; 343 } 344 } while(rankX < n && rankY < m); 345 return supD; 346 } 347 348 /** 349 * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a 350 * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a> 351 * evaluating the null hypothesis that {@code data} conforms to {@code distribution}. 
352 * 353 * @param distribution reference distribution 354 * @param data sample being being evaluated 355 * @return the p-value associated with the null hypothesis that {@code data} is a sample from 356 * {@code distribution} 357 * @throws MathIllegalArgumentException if {@code data} does not have length at least 2 358 * @throws org.hipparchus.exception.NullArgumentException if {@code data} is null 359 */ 360 public double kolmogorovSmirnovTest(RealDistribution distribution, double[] data) { 361 return kolmogorovSmirnovTest(distribution, data, false); 362 } 363 364 /** 365 * Performs a <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov 366 * test</a> evaluating the null hypothesis that {@code data} conforms to {@code distribution}. 367 * 368 * @param distribution reference distribution 369 * @param data sample being being evaluated 370 * @param alpha significance level of the test 371 * @return true iff the null hypothesis that {@code data} is a sample from {@code distribution} 372 * can be rejected with confidence 1 - {@code alpha} 373 * @throws MathIllegalArgumentException if {@code data} does not have length at least 2 374 * @throws org.hipparchus.exception.NullArgumentException if {@code data} is null 375 */ 376 public boolean kolmogorovSmirnovTest(RealDistribution distribution, double[] data, double alpha) { 377 if ((alpha <= 0) || (alpha > 0.5)) { 378 throw new MathIllegalArgumentException(LocalizedStatFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL, alpha, 0, 0.5); 379 } 380 return kolmogorovSmirnovTest(distribution, data) < alpha; 381 } 382 383 /** 384 * Estimates the <i>p-value</i> of a two-sample 385 * <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a> 386 * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same 387 * probability distribution. 
This method estimates the p-value by repeatedly sampling sets of size 388 * {@code x.length} and {@code y.length} from the empirical distribution of the combined sample. 389 * When {@code strict} is true, this is equivalent to the algorithm implemented in the R function 390 * {@code ks.boot}, described in <pre> 391 * Jasjeet S. Sekhon. 2011. 'Multivariate and Propensity Score Matching 392 * Software with Automated Balance Optimization: The Matching package for R.' 393 * Journal of Statistical Software, 42(7): 1-52. 394 * </pre> 395 * @param x first sample 396 * @param y second sample 397 * @param iterations number of bootstrap resampling iterations 398 * @param strict whether or not the null hypothesis is expressed as a strict inequality 399 * @return estimated p-value 400 */ 401 public double bootstrap(double[] x, double[] y, int iterations, boolean strict) { 402 final int xLength = x.length; 403 final int yLength = y.length; 404 final double[] combined = new double[xLength + yLength]; 405 System.arraycopy(x, 0, combined, 0, xLength); 406 System.arraycopy(y, 0, combined, xLength, yLength); 407 final long d = integralKolmogorovSmirnovStatistic(x, y); 408 int greaterCount = 0; 409 int equalCount = 0; 410 double[] curX; 411 double[] curY; 412 long curD; 413 for (int i = 0; i < iterations; i++) { 414 curX = resample(combined, xLength); 415 curY = resample(combined, yLength); 416 curD = integralKolmogorovSmirnovStatistic(curX, curY); 417 if (curD > d) { 418 greaterCount++; 419 } else if (curD == d) { 420 equalCount++; 421 } 422 } 423 return strict ? greaterCount / (double) iterations : 424 (greaterCount + equalCount) / (double) iterations; 425 } 426 427 /** 428 * Computes {@code bootstrap(x, y, iterations, true)}. 429 * This is equivalent to ks.boot(x,y, nboots=iterations) using the R Matching 430 * package function. See #bootstrap(double[], double[], int, boolean). 
431 * 432 * @param x first sample 433 * @param y second sample 434 * @param iterations number of bootstrap resampling iterations 435 * @return estimated p-value 436 */ 437 public double bootstrap(double[] x, double[] y, int iterations) { 438 return bootstrap(x, y, iterations, true); 439 } 440 441 /** 442 * Return a bootstrap sample (with replacement) of size k from sample. 443 * 444 * @param sample array to sample from 445 * @param k size of bootstrap sample 446 * @return bootstrap sample 447 */ 448 private double[] resample(double[] sample, int k) { 449 final int len = sample.length; 450 final double[] out = new double[k]; 451 for (int i = 0; i < k; i++) { 452 out[i] = gen.nextInt(len); 453 } 454 return out; 455 } 456 457 /** 458 * Calculates {@code P(D_n < d)} using the method described in [1] with quick decisions for extreme 459 * values given in [2] (see above). The result is not exact as with 460 * {@link #cdfExact(double, int)} because calculations are based on 461 * {@code double} rather than {@link org.hipparchus.fraction.BigFraction}. 462 * 463 * @param d statistic 464 * @param n sample size 465 * @return \(P(D_n < d)\) 466 * @throws MathRuntimeException if algorithm fails to convert {@code h} to a 467 * {@link org.hipparchus.fraction.BigFraction} in expressing {@code d} as \((k 468 * - h) / m\) for integer {@code k, m} and \(0 <= h < 1\) 469 */ 470 public double cdf(double d, int n) 471 throws MathRuntimeException { 472 return cdf(d, n, false); 473 } 474 475 /** 476 * Calculates {@code P(D_n < d)}. The result is exact in the sense that BigFraction/BigReal is 477 * used everywhere at the expense of very slow execution time. Almost never choose this in real 478 * applications unless you are very sure; this is almost solely for verification purposes. 479 * Normally, you would choose {@link #cdf(double, int)}. See the class 480 * javadoc for definitions and algorithm description. 
481 * 482 * @param d statistic 483 * @param n sample size 484 * @return \(P(D_n < d)\) 485 * @throws MathRuntimeException if the algorithm fails to convert {@code h} to a 486 * {@link org.hipparchus.fraction.BigFraction} in expressing {@code d} as \((k 487 * - h) / m\) for integer {@code k, m} and \(0 <= h < 1\) 488 */ 489 public double cdfExact(double d, int n) 490 throws MathRuntimeException { 491 return cdf(d, n, true); 492 } 493 494 /** 495 * Calculates {@code P(D_n < d)} using method described in [1] with quick decisions for extreme 496 * values given in [2] (see above). 497 * 498 * @param d statistic 499 * @param n sample size 500 * @param exact whether the probability should be calculated exact using 501 * {@link org.hipparchus.fraction.BigFraction} everywhere at the expense of 502 * very slow execution time, or if {@code double} should be used convenient places to 503 * gain speed. Almost never choose {@code true} in real applications unless you are very 504 * sure; {@code true} is almost solely for verification purposes. 505 * @return \(P(D_n < d)\) 506 * @throws MathRuntimeException if algorithm fails to convert {@code h} to a 507 * {@link org.hipparchus.fraction.BigFraction} in expressing {@code d} as \((k 508 * - h) / m\) for integer {@code k, m} and \(0 \lt;= h < 1\). 509 */ 510 public double cdf(double d, int n, boolean exact) 511 throws MathRuntimeException { 512 513 final double ninv = 1 / ((double) n); 514 final double ninvhalf = 0.5 * ninv; 515 516 if (d <= ninvhalf) { 517 return 0; 518 } else if (ninvhalf < d && d <= ninv) { 519 double res = 1; 520 final double f = 2 * d - ninv; 521 // n! f^n = n*f * (n-1)*f * ... 
* 1*x 522 for (int i = 1; i <= n; ++i) { 523 res *= i * f; 524 } 525 return res; 526 } else if (1 - ninv <= d && d < 1) { 527 return 1 - 2 * Math.pow(1 - d, n); 528 } else if (1 <= d) { 529 return 1; 530 } 531 if (exact) { 532 return exactK(d, n); 533 } 534 if (n <= 140) { 535 return roundedK(d, n); 536 } 537 return pelzGood(d, n); 538 } 539 540 /** 541 * Calculates the exact value of {@code P(D_n < d)} using the method described in [1] (reference 542 * in class javadoc above) and {@link org.hipparchus.fraction.BigFraction} (see 543 * above). 544 * 545 * @param d statistic 546 * @param n sample size 547 * @return the two-sided probability of \(P(D_n < d)\) 548 * @throws MathRuntimeException if algorithm fails to convert {@code h} to a 549 * {@link org.hipparchus.fraction.BigFraction} in expressing {@code d} as \((k 550 * - h) / m\) for integer {@code k, m} and \(0 \le h < 1\). 551 */ 552 private double exactK(double d, int n) 553 throws MathRuntimeException { 554 555 final int k = (int) Math.ceil(n * d); 556 557 final FieldMatrix<BigFraction> H = this.createExactH(d, n); 558 final FieldMatrix<BigFraction> Hpower = H.power(n); 559 560 BigFraction pFrac = Hpower.getEntry(k - 1, k - 1); 561 562 for (int i = 1; i <= n; ++i) { 563 pFrac = pFrac.multiply(i).divide(n); 564 } 565 566 /* 567 * BigFraction.doubleValue converts numerator to double and the denominator to double and 568 * divides afterwards. That gives NaN quite easy. This does not (scale is the number of 569 * digits): 570 */ 571 return pFrac.bigDecimalValue(20, RoundingMode.HALF_UP).doubleValue(); 572 } 573 574 /** 575 * Calculates {@code P(D_n < d)} using method described in [1] and doubles (see above). 
     *
     * @param d statistic
     * @param n sample size
     * @return \(P(D_n < d)\)
     */
    private double roundedK(double d, int n) {

        // k and H are as in [1]: H is m x m with m = 2k - 1 (built by createRoundedH)
        final int k = (int) Math.ceil(n * d);
        final RealMatrix H = this.createRoundedH(d, n);
        final RealMatrix Hpower = H.power(n);

        double pFrac = Hpower.getEntry(k - 1, k - 1);
        // Multiply by n!/n^n one small factor at a time to avoid overflow
        for (int i = 1; i <= n; ++i) {
            pFrac *= (double) i / (double) n;
        }

        return pFrac;
    }

    /**
     * Computes the Pelz-Good approximation for \(P(D_n < d)\) as described in [2] in the class javadoc.
     *
     * @param d value of d-statistic (x in [2])
     * @param n sample size
     * @return \(P(D_n < d)\)
     * @throws MathIllegalStateException if any of the partial sums fails to converge within
     *         {@link #MAXIMUM_PARTIAL_SUM_COUNT} terms
     */
    public double pelzGood(double d, int n) {
        // Change the variable since approximation is for the distribution evaluated at d / sqrt(n)
        final double sqrtN = FastMath.sqrt(n);
        final double z = d * sqrtN;
        final double z2 = d * d * n;
        final double z4 = z2 * z2;
        final double z6 = z4 * z2;
        final double z8 = z4 * z4;

        // Compute K_0(z)
        double sum = 0;
        double z2Term = MathUtils.PI_SQUARED / (8 * z2);
        int k = 1;
        for (; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            final double kTerm = 2 * k - 1;
            final double increment = FastMath.exp(-z2Term * kTerm * kTerm);
            sum += increment;
            // Terms are positive and decreasing, so a small increment means convergence
            if (increment <= PG_SUM_RELATIVE_ERROR * sum) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new MathIllegalStateException(LocalizedCoreFormats.MAX_COUNT_EXCEEDED, MAXIMUM_PARTIAL_SUM_COUNT);
        }
        double ret = sum * FastMath.sqrt(2 * FastMath.PI) / z;

        // K_1(z)
        // Sum is -inf to inf, but k term is always (k + 1/2) ^ 2, so really have
        // twice the sum from k = 0 to inf (k = -1 is same as 0, -2 same as 1, ...)
        final double twoZ2 = 2 * z2;
        sum = 0;
        for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            final double kTerm = k + 0.5;
            final double kTerm2 = kTerm * kTerm;
            final double increment = (MathUtils.PI_SQUARED * kTerm2 - z2) * FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
            sum += increment;
            // Terms may change sign, so compare magnitudes
            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new MathIllegalStateException(LocalizedCoreFormats.MAX_COUNT_EXCEEDED, MAXIMUM_PARTIAL_SUM_COUNT);
        }
        final double sqrtHalfPi = FastMath.sqrt(FastMath.PI / 2);
        // Instead of doubling sum, divide by 3 instead of 6
        ret += sum * sqrtHalfPi / (3 * z4 * sqrtN);

        // K_2(z)
        // Same drill as K_1, but with two doubly infinite sums, all k terms are even powers.
        final double z4Term = 2 * z4;
        final double z6Term = 6 * z6;
        z2Term = 5 * z2;
        final double pi4 = MathUtils.PI_SQUARED * MathUtils.PI_SQUARED;
        sum = 0;
        for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            final double kTerm = k + 0.5;
            final double kTerm2 = kTerm * kTerm;
            final double increment = (z6Term + z4Term + MathUtils.PI_SQUARED * (z4Term - z2Term) * kTerm2 +
                    pi4 * (1 - twoZ2) * kTerm2 * kTerm2) * FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
            sum += increment;
            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new MathIllegalStateException(LocalizedCoreFormats.MAX_COUNT_EXCEEDED, MAXIMUM_PARTIAL_SUM_COUNT);
        }
        double sum2 = 0;
        for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            final double kTerm2 = k * k;
            final double increment = MathUtils.PI_SQUARED * kTerm2 * FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
            sum2 += increment;
            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum2)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new MathIllegalStateException(LocalizedCoreFormats.MAX_COUNT_EXCEEDED, MAXIMUM_PARTIAL_SUM_COUNT);
        }
        // Again, adjust coefficients instead of doubling sum, sum2
        ret += (sqrtHalfPi / n) * (sum / (36 * z2 * z2 * z2 * z) - sum2 / (18 * z2 * z));

        // K_3(z) One more time with feeling - two doubly infinite sums, all k powers even.
        // Multiply coefficient denominators by 2, so omit doubling sums.
        final double pi6 = pi4 * MathUtils.PI_SQUARED;
        sum = 0;
        for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            final double kTerm = k + 0.5;
            final double kTerm2 = kTerm * kTerm;
            final double kTerm4 = kTerm2 * kTerm2;
            final double kTerm6 = kTerm4 * kTerm2;
            final double increment = (pi6 * kTerm6 * (5 - 30 * z2) + pi4 * kTerm4 * (-60 * z2 + 212 * z4) +
                    MathUtils.PI_SQUARED * kTerm2 * (135 * z4 - 96 * z6) - 30 * z6 - 90 * z8) *
                    FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
            sum += increment;
            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new MathIllegalStateException(LocalizedCoreFormats.MAX_COUNT_EXCEEDED, MAXIMUM_PARTIAL_SUM_COUNT);
        }
        sum2 = 0;
        for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
            final double kTerm2 = k * k;
            final double kTerm4 = kTerm2 * kTerm2;
            final double increment = (-pi4 * kTerm4 + 3 * MathUtils.PI_SQUARED * kTerm2 * z2) *
                    FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
            sum2 += increment;
            if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum2)) {
                break;
            }
        }
        if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
            throw new MathIllegalStateException(LocalizedCoreFormats.MAX_COUNT_EXCEEDED, MAXIMUM_PARTIAL_SUM_COUNT);
        }
        // NOTE(review): the "+ +" below is a stray unary plus (harmless, sign unchanged)
        return ret + (sqrtHalfPi / (sqrtN * n)) * (sum / (3240 * z6 * z4) +
                + sum2 / (108 * z6));

    }

    /***
     * Creates {@code H} of size {@code m x m} as
described in [1] (see above). 725 * 726 * @param d statistic 727 * @param n sample size 728 * @return H matrix 729 * @throws MathIllegalArgumentException if fractional part is greater than 1 730 * @throws MathIllegalStateException if algorithm fails to convert {@code h} to a 731 * {@link org.hipparchus.fraction.BigFraction} in expressing {@code d} as \((k 732 * - h) / m\) for integer {@code k, m} and \(0 <= h < 1\). 733 */ 734 private FieldMatrix<BigFraction> createExactH(double d, int n) 735 throws MathIllegalArgumentException, MathIllegalStateException { 736 737 final int k = (int) Math.ceil(n * d); 738 final int m = 2 * k - 1; 739 final double hDouble = k - n * d; 740 if (hDouble >= 1) { 741 throw new MathIllegalArgumentException(LocalizedCoreFormats.NUMBER_TOO_LARGE_BOUND_EXCLUDED, 742 hDouble, 1.0); 743 } 744 BigFraction h; 745 try { 746 h = new BigFraction(hDouble, 1.0e-20, 10000); 747 } catch (final MathIllegalStateException e1) { 748 try { 749 h = new BigFraction(hDouble, 1.0e-10, 10000); 750 } catch (final MathIllegalStateException e2) { 751 h = new BigFraction(hDouble, 1.0e-5, 10000); 752 } 753 } 754 final BigFraction[][] Hdata = new BigFraction[m][m]; 755 756 /* 757 * Start by filling everything with either 0 or 1. 758 */ 759 for (int i = 0; i < m; ++i) { 760 for (int j = 0; j < m; ++j) { 761 if (i - j + 1 < 0) { 762 Hdata[i][j] = BigFraction.ZERO; 763 } else { 764 Hdata[i][j] = BigFraction.ONE; 765 } 766 } 767 } 768 769 /* 770 * Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ... 771 * hPowers[m-1] = h^m 772 */ 773 final BigFraction[] hPowers = new BigFraction[m]; 774 hPowers[0] = h; 775 for (int i = 1; i < m; ++i) { 776 hPowers[i] = h.multiply(hPowers[i - 1]); 777 } 778 779 /* 780 * First column and last row has special values (each other reversed). 
781 */ 782 for (int i = 0; i < m; ++i) { 783 Hdata[i][0] = Hdata[i][0].subtract(hPowers[i]); 784 Hdata[m - 1][i] = Hdata[m - 1][i].subtract(hPowers[m - i - 1]); 785 } 786 787 /* 788 * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m + 789 * (2h - 1)^m )/m!" Since 0 <= h < 1, then if h > 1/2 is sufficient to check: 790 */ 791 if (h.compareTo(BigFraction.ONE_HALF) == 1) { 792 Hdata[m - 1][0] = Hdata[m - 1][0].add(h.multiply(2).subtract(1).pow(m)); 793 } 794 795 /* 796 * Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if i - 797 * j + 1 >= 0, else 0. 1's and 0's are already put, so only division with (i - j + 1)! is 798 * needed in the elements that have 1's. There is no need to calculate (i - j + 1)! and then 799 * divide - small steps avoid overflows. Note that i - j + 1 > 0 <=> i + 1 > j instead of 800 * j'ing all the way to m. Also note that it is started at g = 2 because dividing by 1 isn't 801 * really necessary. 802 */ 803 for (int i = 0; i < m; ++i) { 804 for (int j = 0; j < i + 1; ++j) { 805 if (i - j + 1 > 0) { 806 for (int g = 2; g <= i - j + 1; ++g) { 807 Hdata[i][j] = Hdata[i][j].divide(g); 808 } 809 } 810 } 811 } 812 return new Array2DRowFieldMatrix<BigFraction>(BigFractionField.getInstance(), Hdata); 813 } 814 815 /*** 816 * Creates {@code H} of size {@code m x m} as described in [1] (see above) 817 * using double-precision. 
     *
     * @param d statistic
     * @param n sample size
     * @return H matrix
     * @throws MathIllegalArgumentException if fractional part is greater than 1
     */
    private RealMatrix createRoundedH(double d, int n)
        throws MathIllegalArgumentException {

        // Express d as (k - h) / n with integer k and 0 <= h < 1; H is (2k - 1) x (2k - 1).
        final int k = (int) Math.ceil(n * d);
        final int m = 2 * k - 1;
        final double h = k - n * d;
        if (h >= 1) {
            throw new MathIllegalArgumentException(LocalizedCoreFormats.NUMBER_TOO_LARGE_BOUND_EXCLUDED,
                                                   h, 1.0);
        }
        final double[][] Hdata = new double[m][m];

        /*
         * Start by filling everything with either 0 or 1.
         */
        for (int i = 0; i < m; ++i) {
            for (int j = 0; j < m; ++j) {
                if (i - j + 1 < 0) {
                    Hdata[i][j] = 0;
                } else {
                    Hdata[i][j] = 1;
                }
            }
        }

        /*
         * Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ...
         * hPowers[m-1] = h^m
         */
        final double[] hPowers = new double[m];
        hPowers[0] = h;
        for (int i = 1; i < m; ++i) {
            hPowers[i] = h * hPowers[i - 1];
        }

        /*
         * First column and last row has special values (each other reversed).
         */
        for (int i = 0; i < m; ++i) {
            Hdata[i][0] = Hdata[i][0] - hPowers[i];
            Hdata[m - 1][i] -= hPowers[m - i - 1];
        }

        /*
         * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m +
         * (2h - 1)^m )/m!" Since 0 <= h < 1, then if h > 1/2 is sufficient to check:
         */
        if (Double.compare(h, 0.5) > 0) {
            Hdata[m - 1][0] += FastMath.pow(2 * h - 1, m);
        }

        /*
         * Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if i -
         * j + 1 >= 0, else 0. 1's and 0's are already put, so only division with (i - j + 1)! is
         * needed in the elements that have 1's. There is no need to calculate (i - j + 1)! and then
         * divide - small steps avoid overflows. Note that i - j + 1 > 0 <=> i + 1 > j instead of
         * j'ing all the way to m. Also note that it is started at g = 2 because dividing by 1 isn't
         * really necessary.
         */
        for (int i = 0; i < m; ++i) {
            for (int j = 0; j < i + 1; ++j) {
                if (i - j + 1 > 0) {
                    for (int g = 2; g <= i - j + 1; ++g) {
                        Hdata[i][j] /= g;
                    }
                }
            }
        }
        return MatrixUtils.createRealMatrix(Hdata);
    }

    /**
     * Verifies that {@code array} has length at least 2.
     *
     * @param array array to test
     * @throws org.hipparchus.exception.NullArgumentException if array is null
     * @throws MathIllegalArgumentException if array is too short
     */
    private void checkArray(double[] array) {
        MathUtils.checkNotNull(array);
        if (array.length < 2) {
            throw new MathIllegalArgumentException(LocalizedCoreFormats.INSUFFICIENT_OBSERVED_POINTS_IN_SAMPLE, array.length,
                                                   2);
        }
    }

    /**
     * Computes \( 1 + 2 \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2} \) stopping when successive partial
     * sums are within {@code tolerance} of one another, or when {@code maxIterations} partial sums
     * have been computed. If the sum does not converge before {@code maxIterations} iterations a
     * {@link MathIllegalStateException} is thrown.
     *
     * @param t argument
     * @param tolerance Cauchy criterion for partial sums
     * @param maxIterations maximum number of partial sums to compute
     * @return Kolmogorov sum evaluated at t
     * @throws MathIllegalStateException if the series does not converge
     */
    public double ksSum(double t, double tolerance, int maxIterations) {
        // The alternating series does not converge at t = 0; the distribution's
        // limiting value there is 0, so handle it explicitly.
        if (t == 0.0) {
            return 0.0;
        }

        // TODO: for small t (say less than 1), the alternative expansion in part 3 of [1]
        // from class javadoc should be used.
929 930 final double x = -2 * t * t; 931 int sign = -1; 932 long i = 1; 933 double partialSum = 0.5d; 934 double delta = 1; 935 while (delta > tolerance && i < maxIterations) { 936 delta = FastMath.exp(x * i * i); 937 partialSum += sign * delta; 938 sign *= -1; 939 i++; 940 } 941 if (i == maxIterations) { 942 throw new MathIllegalStateException(LocalizedCoreFormats.MAX_COUNT_EXCEEDED, maxIterations); 943 } 944 return partialSum * 2; 945 } 946 947 /** 948 * Computes \(P(D_{n,m} > d)\) if {@code strict} is {@code true}; otherwise \(P(D_{n,m} \ge 949 * d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic. See 950 * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\). 951 * <p> 952 * The returned probability is exact, implemented by unwinding the recursive function 953 * definitions presented in [4] from the class javadoc. 954 * </p> 955 * 956 * @param d D-statistic value 957 * @param n first sample size 958 * @param m second sample size 959 * @param strict whether or not the probability to compute is expressed as a strict inequality 960 * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\) 961 * greater than (resp. greater than or equal to) {@code d} 962 */ 963 public double exactP(double d, int n, int m, boolean strict) { 964 if (d < 1 / (double)( m * n)) { 965 return 1.0; 966 } else if (d >= 1) { 967 return 0; 968 } 969 double normalizeD = normalizeD(d, n, m); 970 if (!strict) { 971 normalizeD -= 1 / ((double)n * m); 972 } 973 return exactPAtMeshpoint(normalizeD, n, m); 974 } 975 976 /** 977 * Normalizes a value to an integral multiple of 1/mn between 0 and 1. 978 * If d < 1/mn, 0 is returned; if d > 1, 1 is returned; if d is very close 979 * to an integral multiple of 1/mn, that value is returned; otherwise the 980 * returned value is the smallest multiple of 1/mn less than or equal to d. 
981 * 982 * @param d d value 983 * @param n first sample size 984 * @param m second sample size 985 * @return d value suitable for input to exactPAtMeshpoint(d, m, n) 986 */ 987 private double normalizeD(double d, int n, int m) { 988 final double resolution = 1 / ((double)n * m); 989 final double tol = 1e-12; 990 991 // If d is smaller that the first mesh point, return 0 992 // If greater than 1, return 1 993 if (d < resolution) { 994 return 0; 995 } else if (d > 1) { 996 return 1; 997 } 998 999 // Normalize d to the smallest mesh point less than or equal to d; 1000 // except if d is less than tol less than the next mesh point, bump it up 1001 final double resolutions = d / resolution; 1002 final double ceil = FastMath.ceil(resolutions); 1003 if (ceil - resolutions < tol) { 1004 return ceil * resolution; 1005 } else { 1006 return FastMath.floor(resolutions) * resolution; 1007 } 1008 1009 } 1010 1011 /** 1012 * Computes \(P(D_{n,m} > d)\) where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic. See 1013 * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\). 1014 * <p> 1015 * The returned probability is exact, implemented by unwinding the recursive function 1016 * definitions presented in [4]. 1017 * 1018 * @param d D-statistic value (must be a "meshpoint" - i.e., a possible actual value of D(m,n)). 1019 * @param n first sample size 1020 * @param m second sample size 1021 * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\) 1022 * greater than (resp. 
greater than or equal to) {@code d} 1023 */ 1024 private double exactPAtMeshpoint(double d, int n, int m) { 1025 final int nn = FastMath.max(n, m); 1026 final int mm = FastMath.min(n, m); 1027 final double[] u = new double[nn + 2]; 1028 final double k = mm * nn * d + 0.5; 1029 u[1] = 1d; 1030 for (int j = 1; j < nn + 1; j++) { 1031 u[j + 1] = 1; 1032 if (mm * j > k) { 1033 u[j + 1] = 0; 1034 } 1035 } 1036 for (int i = 1; i < mm + 1; i++) { 1037 final double w = (double) i / (double) (i + nn); 1038 u[1] = w * u[1]; 1039 if (nn * i > k) { 1040 u[1] = 0; 1041 } 1042 for (int j = 1; j < nn + 1; j++) { 1043 u[j + 1] = u[j] + u[j + 1] * w; 1044 if (FastMath.abs(nn * i - mm * j) > k) { 1045 u[j + 1] = 0; 1046 } 1047 } 1048 } 1049 return 1 - u[nn + 1]; 1050 } 1051 1052 /** 1053 * Uses the Kolmogorov-Smirnov distribution to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\) 1054 * is the 2-sample Kolmogorov-Smirnov statistic. See 1055 * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\). 1056 * <p> 1057 * Specifically, what is returned is \(1 - k(d \sqrt{mn / (m + n)})\) where \(k(t) = 1 + 2 1058 * \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2}\). See {@link #ksSum(double, double, int)} for 1059 * details on how convergence of the sum is determined. This implementation passes {@code ksSum} 1060 * {@link #KS_SUM_CAUCHY_CRITERION} as {@code tolerance} and 1061 * {@link #MAXIMUM_PARTIAL_SUM_COUNT} as {@code maxIterations}. 
1062 * </p> 1063 * 1064 * @param d D-statistic value 1065 * @param n first sample size 1066 * @param m second sample size 1067 * @return approximate probability that a randomly selected m-n partition of m + n generates 1068 * \(D_{n,m}\) greater than {@code d} 1069 */ 1070 public double approximateP(double d, int n, int m) { 1071 final double dm = m; 1072 final double dn = n; 1073 return 1 - ksSum(d * FastMath.sqrt((dm * dn) / (dm + dn)), 1074 KS_SUM_CAUCHY_CRITERION, MAXIMUM_PARTIAL_SUM_COUNT); 1075 } 1076 1077 /** 1078 * Fills a boolean array randomly with a fixed number of {@code true} values. 1079 * The method uses a simplified version of the Fisher-Yates shuffle algorithm. 1080 * By processing first the {@code true} values followed by the remaining {@code false} values 1081 * less random numbers need to be generated. The method is optimized for the case 1082 * that the number of {@code true} values is larger than or equal to the number of 1083 * {@code false} values. 1084 * 1085 * @param b boolean array 1086 * @param numberOfTrueValues number of {@code true} values the boolean array should finally have 1087 * @param rng random data generator 1088 */ 1089 static void fillBooleanArrayRandomlyWithFixedNumberTrueValues(final boolean[] b, final int numberOfTrueValues, final RandomGenerator rng) { 1090 Arrays.fill(b, true); 1091 for (int k = numberOfTrueValues; k < b.length; k++) { 1092 final int r = rng.nextInt(k + 1); 1093 b[(b[r]) ? r : k] = false; 1094 } 1095 } 1096 1097 /** 1098 * If there are no ties in the combined dataset formed from x and y, this 1099 * method is a no-op. If there are ties, a uniform random deviate in 1100 * (-minDelta / 2, minDelta / 2) - {0} is added to each value in x and y, where 1101 * minDelta is the minimum difference between unequal values in the combined 1102 * sample. A fixed seed is used to generate the jitter, so repeated activations 1103 * with the same input arrays result in the same values. 
1104 * 1105 * NOTE: if there are ties in the data, this method overwrites the data in 1106 * x and y with the jittered values. 1107 * 1108 * @param x first sample 1109 * @param y second sample 1110 */ 1111 private void fixTies(double[] x, double[] y) { 1112 final double[] values = MathArrays.unique(MathArrays.concatenate(x,y)); 1113 if (values.length == x.length + y.length) { 1114 return; // There are no ties 1115 } 1116 1117 // Find the smallest difference between values, or 1 if all values are the same 1118 double minDelta = 1; 1119 double prev = values[0]; 1120 for (int i = 1; i < values.length; i++) { 1121 final double delta = prev - values[i]; 1122 if (delta < minDelta) { 1123 minDelta = delta; 1124 } 1125 prev = values[i]; 1126 } 1127 minDelta /= 2; 1128 1129 // Add jitter using a fixed seed (so same arguments always give same results), 1130 // low-initialization-overhead generator 1131 gen.setSeed(100); 1132 1133 // It is theoretically possible that jitter does not break ties, so repeat 1134 // until all ties are gone. Bound the loop and throw MIE if bound is exceeded. 1135 int ct = 0; 1136 boolean ties; 1137 do { 1138 jitter(x, minDelta); 1139 jitter(y, minDelta); 1140 ties = hasTies(x, y); 1141 ct++; 1142 } while (ties && ct < 1000); 1143 if (ties) { 1144 throw MathRuntimeException.createInternalError(); // Should never happen 1145 } 1146 } 1147 1148 /** 1149 * Returns true iff there are ties in the combined sample 1150 * formed from x and y. 
1151 * 1152 * @param x first sample 1153 * @param y second sample 1154 * @return true if x and y together contain ties 1155 */ 1156 private static boolean hasTies(double[] x, double[] y) { 1157 final HashSet<Double> values = new HashSet<>(); 1158 for (int i = 0; i < x.length; i++) { 1159 if (!values.add(x[i])) { 1160 return true; 1161 } 1162 } 1163 for (int i = 0; i < y.length; i++) { 1164 if (!values.add(y[i])) { 1165 return true; 1166 } 1167 } 1168 return false; 1169 } 1170 1171 /** 1172 * Adds random jitter to {@code data} using uniform deviates between {@code -delta} and {@code delta}. 1173 * <p> 1174 * Note that jitter is applied in-place - i.e., the array 1175 * values are overwritten with the result of applying jitter.</p> 1176 * 1177 * @param data input/output data array - entries overwritten by the method 1178 * @param delta max magnitude of jitter 1179 * @throws NullPointerException if either of the parameters is null 1180 */ 1181 private void jitter(double[] data, double delta) { 1182 for (int i = 0; i < data.length; i++) { 1183 data[i] += gen.nextUniform(-delta, delta); 1184 } 1185 } 1186 1187 }