1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 /*
19 * This is not the original file distributed by the Apache Software Foundation
20 * It has been modified by the Hipparchus project
21 */
22
23 package org.hipparchus.stat.fitting;
24
25 import java.io.BufferedReader;
26 import java.io.Closeable;
27 import java.io.File;
28 import java.io.IOException;
29 import java.io.InputStream;
30 import java.io.InputStreamReader;
31 import java.net.URL;
32 import java.nio.charset.Charset;
33 import java.nio.file.Files;
34 import java.util.ArrayList;
35 import java.util.List;
36
37 import org.hipparchus.distribution.RealDistribution;
38 import org.hipparchus.distribution.continuous.AbstractRealDistribution;
39 import org.hipparchus.distribution.continuous.ConstantRealDistribution;
40 import org.hipparchus.distribution.continuous.NormalDistribution;
41 import org.hipparchus.exception.LocalizedCoreFormats;
42 import org.hipparchus.exception.MathIllegalArgumentException;
43 import org.hipparchus.exception.MathIllegalStateException;
44 import org.hipparchus.exception.MathRuntimeException;
45 import org.hipparchus.exception.NullArgumentException;
46 import org.hipparchus.random.RandomDataGenerator;
47 import org.hipparchus.random.RandomGenerator;
48 import org.hipparchus.stat.descriptive.StatisticalSummary;
49 import org.hipparchus.stat.descriptive.StreamingStatistics;
50 import org.hipparchus.util.FastMath;
51 import org.hipparchus.util.MathUtils;
52
53 /**
54 * <p>Represents an <a href="http://http://en.wikipedia.org/wiki/Empirical_distribution_function">
55 * empirical probability distribution</a> -- a probability distribution derived
56 * from observed data without making any assumptions about the functional form
57 * of the population distribution that the data come from.</p>
58 *
59 * <p>An <code>EmpiricalDistribution</code> maintains data structures, called
60 * <i>distribution digests</i>, that describe empirical distributions and
61 * support the following operations:
62 * </p>
63 * <ul>
64 * <li>loading the distribution from a file of observed data values</li>
65 * <li>dividing the input data into "bin ranges" and reporting bin frequency
66 * counts (data for histogram)</li>
67 * <li>reporting univariate statistics describing the full set of data values
68 * as well as the observations within each bin</li>
69 * <li>generating random values from the distribution</li>
70 * </ul>
71 * <p>
72 * Applications can use <code>EmpiricalDistribution</code> to build grouped
73 * frequency histograms representing the input data or to generate random values
74 * "like" those in the input file -- i.e., the values generated will follow the
75 * distribution of the values in the file.
76 * </p>
77 *
78 * <p>The implementation uses what amounts to the
79 * <a href="http://nedwww.ipac.caltech.edu/level5/March02/Silverman/Silver2_6.html">
80 * Variable Kernel Method</a> with Gaussian smoothing:</p>
81 * <p><strong>Digesting the input file</strong></p>
82 * <ol><li>Pass the file once to compute min and max.</li>
83 * <li>Divide the range from min-max into <code>binCount</code> "bins."</li>
84 * <li>Pass the data file again, computing bin counts and univariate
85 * statistics (mean, std dev.) for each of the bins </li>
86 * <li>Divide the interval (0,1) into subintervals associated with the bins,
87 * with the length of a bin's subinterval proportional to its count.</li></ol>
88 * <strong>Generating random values from the distribution</strong><ol>
89 * <li>Generate a uniformly distributed value in (0,1) </li>
90 * <li>Select the subinterval to which the value belongs.
91 * <li>Generate a random Gaussian value with mean = mean of the associated
92 * bin and std dev = std dev of associated bin.</li></ol>
93 *
94 * <p>EmpiricalDistribution implements the {@link RealDistribution} interface
95 * as follows. Given x within the range of values in the dataset, let B
96 * be the bin containing x and let K be the within-bin kernel for B. Let P(B-)
97 * be the sum of the probabilities of the bins below B and let K(B) be the
98 * mass of B under K (i.e., the integral of the kernel density over B). Then
99 * set P(X < x) = P(B-) + P(B) * K(x) / K(B) where K(x) is the kernel distribution
100 * evaluated at x. This results in a cdf that matches the grouped frequency
101 * distribution at the bin endpoints and interpolates within bins using
102 * within-bin kernels.</p>
103 *
104 *<p><strong>USAGE NOTES:</strong></p>
105 *<ul>
106 *<li>The <code>binCount</code> is set by default to 1000. A good rule of thumb
107 * is to set the bin count to approximately the length of the input file divided
108 * by 10. </li>
109 *<li>The input file <i>must</i> be a plain text file containing one valid numeric
110 * entry per line.</li>
111 * </ul>
112 *
113 */
114 public class EmpiricalDistribution extends AbstractRealDistribution {
115
116 /** Default bin count */
117 public static final int DEFAULT_BIN_COUNT = 1000;
118
119 /** Character set for file input */
120 private static final String FILE_CHARSET = "US-ASCII";
121
122 /** Serializable version identifier */
123 private static final long serialVersionUID = 5729073523949762654L;
124
125 /** RandomDataGenerator instance to use in repeated calls to getNext() */
126 protected final RandomDataGenerator randomData;
127
128 /** List of SummaryStatistics objects characterizing the bins */
129 private final List<StreamingStatistics> binStats;
130
131 /** Sample statistics */
132 private StreamingStatistics sampleStats;
133
134 /** Max loaded value */
135 private double max = Double.NEGATIVE_INFINITY;
136
137 /** Min loaded value */
138 private double min = Double.POSITIVE_INFINITY;
139
140 /** Grid size */
141 private double delta;
142
143 /** number of bins */
144 private final int binCount;
145
146 /** is the distribution loaded? */
147 private boolean loaded;
148
149 /** upper bounds of subintervals in (0,1) "belonging" to the bins */
150 private double[] upperBounds;
151
152 /**
153 * Creates a new EmpiricalDistribution with the default bin count.
154 */
155 public EmpiricalDistribution() {
156 this(DEFAULT_BIN_COUNT);
157 }
158
159 /**
160 * Creates a new EmpiricalDistribution with the specified bin count.
161 *
162 * @param binCount number of bins. Must be strictly positive.
163 * @throws MathIllegalArgumentException if {@code binCount <= 0}.
164 */
165 public EmpiricalDistribution(int binCount) {
166 this(binCount, new RandomDataGenerator());
167 }
168
169 /**
170 * Creates a new EmpiricalDistribution with the specified bin count using the
171 * provided {@link RandomGenerator} as the source of random data.
172 *
173 * @param binCount number of bins. Must be strictly positive.
174 * @param generator random data generator (may be null, resulting in default JDK generator)
175 * @throws MathIllegalArgumentException if {@code binCount <= 0}.
176 */
177 public EmpiricalDistribution(int binCount, RandomGenerator generator) {
178 this(binCount, RandomDataGenerator.of(generator));
179 }
180
181 /**
182 * Creates a new EmpiricalDistribution with default bin count using the
183 * provided {@link RandomGenerator} as the source of random data.
184 *
185 * @param generator random data generator (may be null, resulting in default JDK generator)
186 */
187 public EmpiricalDistribution(RandomGenerator generator) {
188 this(DEFAULT_BIN_COUNT, generator);
189 }
190
191 /**
192 * Private constructor to allow lazy initialisation of the RNG contained
193 * in the {@link #randomData} instance variable.
194 *
195 * @param binCount number of bins. Must be strictly positive.
196 * @param randomData Random data generator.
197 * @throws MathIllegalArgumentException if {@code binCount <= 0}.
198 */
199 private EmpiricalDistribution(int binCount,
200 RandomDataGenerator randomData) {
201 if (binCount <= 0) {
202 throw new MathIllegalArgumentException(LocalizedCoreFormats.NUMBER_TOO_SMALL_BOUND_EXCLUDED,
203 binCount, 0);
204 }
205 this.binCount = binCount;
206 this.randomData = randomData;
207 binStats = new ArrayList<>();
208 }
209
210 /**
211 * Computes the empirical distribution from the provided
212 * array of numbers.
213 *
214 * @param in the input data array
215 * @exception NullArgumentException if in is null
216 */
217 public void load(double[] in) throws NullArgumentException {
218 try (DataAdapter da = new ArrayDataAdapter(in)) {
219 da.computeStats();
220 // new adapter for the second pass
221 fillBinStats(new ArrayDataAdapter(in));
222 } catch (IOException ex) {
223 // Can't happen
224 throw MathRuntimeException.createInternalError();
225 }
226 loaded = true;
227
228 }
229
230 /**
231 * Computes the empirical distribution using data read from a URL.
232 *
233 * <p>The input file <i>must</i> be an ASCII text file containing one
234 * valid numeric entry per line.</p>
235 *
236 * @param url url of the input file
237 *
238 * @throws IOException if an IO error occurs
239 * @throws NullArgumentException if url is null
240 * @throws MathIllegalArgumentException if URL contains no data
241 */
242 public void load(URL url) throws IOException, MathIllegalArgumentException, NullArgumentException {
243 MathUtils.checkNotNull(url);
244 Charset charset = Charset.forName(FILE_CHARSET);
245 try (InputStream is1 = url.openStream();
246 InputStreamReader isr1 = new InputStreamReader(is1, charset);
247 BufferedReader br1 = new BufferedReader(isr1);
248 DataAdapter da1 = new StreamDataAdapter(br1)) {
249 da1.computeStats();
250 if (sampleStats.getN() == 0) {
251 throw new MathIllegalArgumentException(LocalizedCoreFormats.URL_CONTAINS_NO_DATA, url);
252 }
253 // new adapter for the second pass
254 try (InputStream is2 = url.openStream();
255 InputStreamReader isr2 = new InputStreamReader(is2, charset);
256 BufferedReader br2 = new BufferedReader(isr2);
257 DataAdapter da2 = new StreamDataAdapter(br2)) {
258 fillBinStats(da2);
259 loaded = true;
260 }
261 }
262 }
263
264 /**
265 * Computes the empirical distribution from the input file.
266 *
267 * <p>The input file <i>must</i> be an ASCII text file containing one
268 * valid numeric entry per line.</p>
269 *
270 * @param file the input file
271 * @throws IOException if an IO error occurs
272 * @throws NullArgumentException if file is null
273 */
274 public void load(File file) throws IOException, NullArgumentException {
275 MathUtils.checkNotNull(file);
276 Charset charset = Charset.forName(FILE_CHARSET);
277 try (InputStream is1 = Files.newInputStream(file.toPath());
278 BufferedReader br1 = new BufferedReader(new InputStreamReader(is1, charset));
279 DataAdapter da1 = new StreamDataAdapter(br1)) {
280 da1.computeStats();
281 // new adapter for second pass
282 try (InputStream is2 = Files.newInputStream(file.toPath());
283 BufferedReader in2 = new BufferedReader(new InputStreamReader(is2, charset));
284 DataAdapter da2 = new StreamDataAdapter(in2)) {
285 fillBinStats(da2);
286 }
287 loaded = true;
288 }
289 }
290
291 /**
292 * Provides methods for computing <code>sampleStats</code> and
293 * <code>beanStats</code> abstracting the source of data.
294 */
295 private abstract class DataAdapter implements Closeable {
296
297 /**
298 * Compute bin stats.
299 *
300 * @throws IOException if an error occurs computing bin stats
301 */
302 public abstract void computeBinStats() throws IOException;
303
304 /**
305 * Compute sample statistics.
306 *
307 * @throws IOException if an error occurs computing sample stats
308 */
309 public abstract void computeStats() throws IOException;
310
311 }
312
313 /**
314 * <code>DataAdapter</code> for data provided through some input stream
315 */
316 private class StreamDataAdapter extends DataAdapter {
317
318 /** Input stream providing access to the data */
319 private BufferedReader inputStream;
320
321 /**
322 * Create a StreamDataAdapter from a BufferedReader
323 *
324 * @param in BufferedReader input stream
325 */
326 StreamDataAdapter(BufferedReader in) {
327 inputStream = in;
328 }
329
330 /** {@inheritDoc} */
331 @Override
332 public void computeBinStats() throws IOException {
333 for (String str = inputStream.readLine(); str != null; str = inputStream.readLine()) {
334 final double val = Double.parseDouble(str);
335 StreamingStatistics stats = binStats.get(findBin(val));
336 stats.addValue(val);
337 }
338 }
339
340 /** {@inheritDoc} */
341 @Override
342 public void computeStats() throws IOException {
343 sampleStats = new StreamingStatistics();
344 for (String str = inputStream.readLine(); str != null; str = inputStream.readLine()) {
345 final double val = Double.parseDouble(str);
346 sampleStats.addValue(val);
347 }
348 }
349
350 /** {@inheritDoc} */
351 @Override
352 public void close() throws IOException {
353 if (inputStream != null) {
354 inputStream.close();
355 inputStream = null;
356 }
357 }
358
359 }
360
361 /**
362 * <code>DataAdapter</code> for data provided as array of doubles.
363 */
364 private class ArrayDataAdapter extends DataAdapter {
365
366 /** Array of input data values */
367 private final double[] inputArray;
368
369 /**
370 * Construct an ArrayDataAdapter from a double[] array
371 *
372 * @param in double[] array holding the data, a reference to the array will be stored
373 * @throws NullArgumentException if in is null
374 */
375 ArrayDataAdapter(double[] in) throws NullArgumentException {
376 super();
377 MathUtils.checkNotNull(in);
378 inputArray = in; // NOPMD - storing a reference to the array is intentional and documented here
379 }
380
381 /** {@inheritDoc} */
382 @Override
383 public void computeStats() {
384 sampleStats = new StreamingStatistics();
385 for (int i = 0; i < inputArray.length; i++) {
386 sampleStats.addValue(inputArray[i]);
387 }
388 }
389
390 /** {@inheritDoc} */
391 @Override
392 public void computeBinStats() {
393 for (int i = 0; i < inputArray.length; i++) {
394 StreamingStatistics stats =
395 binStats.get(findBin(inputArray[i]));
396 stats.addValue(inputArray[i]);
397 }
398 }
399
400 /** {@inheritDoc} */
401 @Override
402 public void close() {
403 // nothing to do
404 }
405
406 }
407
408 /**
409 * Fills binStats array (second pass through data file).
410 *
411 * @param da object providing access to the data
412 * @throws IOException if an IO error occurs
413 */
414 private void fillBinStats(final DataAdapter da)
415 throws IOException {
416 // Set up grid
417 min = sampleStats.getMin();
418 max = sampleStats.getMax();
419 delta = (max - min)/binCount;
420
421 // Initialize binStats ArrayList
422 if (!binStats.isEmpty()) {
423 binStats.clear();
424 }
425 for (int i = 0; i < binCount; i++) {
426 StreamingStatistics stats = new StreamingStatistics();
427 binStats.add(i,stats);
428 }
429
430 // Filling data in binStats Array
431 da.computeBinStats();
432
433 // Assign upperBounds based on bin counts
434 upperBounds = new double[binCount];
435 upperBounds[0] =
436 ((double) binStats.get(0).getN()) / (double) sampleStats.getN();
437 for (int i = 1; i < binCount-1; i++) {
438 upperBounds[i] = upperBounds[i-1] +
439 ((double) binStats.get(i).getN()) / (double) sampleStats.getN();
440 }
441 upperBounds[binCount-1] = 1.0d;
442 }
443
444 /**
445 * Returns the index of the bin to which the given value belongs
446 *
447 * @param value the value whose bin we are trying to find
448 * @return the index of the bin containing the value
449 */
450 private int findBin(double value) {
451 return FastMath.min(
452 FastMath.max((int) FastMath.ceil((value - min) / delta) - 1, 0),
453 binCount - 1);
454 }
455
456 /**
457 * Generates a random value from this distribution.
458 * <strong>Preconditions:</strong><ul>
459 * <li>the distribution must be loaded before invoking this method</li></ul>
460 * @return the random value.
461 * @throws MathIllegalStateException if the distribution has not been loaded
462 */
463 public double getNextValue() throws MathIllegalStateException {
464
465 if (!loaded) {
466 throw new MathIllegalStateException(LocalizedCoreFormats.DISTRIBUTION_NOT_LOADED);
467 }
468
469 return inverseCumulativeProbability(randomData.nextDouble());
470 }
471
472 /**
473 * Returns a {@link StatisticalSummary} describing this distribution.
474 * <strong>Preconditions:</strong><ul>
475 * <li>the distribution must be loaded before invoking this method</li></ul>
476 *
477 * @return the sample statistics
478 * @throws IllegalStateException if the distribution has not been loaded
479 */
480 public StatisticalSummary getSampleStats() {
481 return sampleStats;
482 }
483
484 /**
485 * Returns the number of bins.
486 *
487 * @return the number of bins.
488 */
489 public int getBinCount() {
490 return binCount;
491 }
492
493 /**
494 * Returns a List of {@link StreamingStatistics} instances containing
495 * statistics describing the values in each of the bins. The list is
496 * indexed on the bin number.
497 *
498 * @return List of bin statistics.
499 */
500 public List<StreamingStatistics> getBinStats() {
501 return binStats;
502 }
503
504 /**
505 * <p>Returns a fresh copy of the array of upper bounds for the bins.
506 * Bins are: <br>
507 * [min,upperBounds[0]],(upperBounds[0],upperBounds[1]],...,
508 * (upperBounds[binCount-2], upperBounds[binCount-1] = max].</p>
509 *
510 * @return array of bin upper bounds
511 */
512 public double[] getUpperBounds() {
513 double[] binUpperBounds = new double[binCount];
514 for (int i = 0; i < binCount - 1; i++) {
515 binUpperBounds[i] = min + delta * (i + 1);
516 }
517 binUpperBounds[binCount - 1] = max;
518 return binUpperBounds;
519 }
520
521 /**
522 * <p>Returns a fresh copy of the array of upper bounds of the subintervals
523 * of [0,1] used in generating data from the empirical distribution.
524 * Subintervals correspond to bins with lengths proportional to bin counts.</p>
525 *
526 * <strong>Preconditions:</strong><ul>
527 * <li>the distribution must be loaded before invoking this method</li></ul>
528 *
529 * @return array of upper bounds of subintervals used in data generation
530 * @throws NullPointerException unless a {@code load} method has been
531 * called beforehand.
532 */
533 public double[] getGeneratorUpperBounds() {
534 int len = upperBounds.length;
535 double[] out = new double[len];
536 System.arraycopy(upperBounds, 0, out, 0, len);
537 return out;
538 }
539
540 /**
541 * Property indicating whether or not the distribution has been loaded.
542 *
543 * @return true if the distribution has been loaded
544 */
545 public boolean isLoaded() {
546 return loaded;
547 }
548
549 /**
550 * Reseeds the random number generator used by {@link #getNextValue()}.
551 *
552 * @param seed random generator seed
553 */
554 public void reSeed(long seed) {
555 randomData.setSeed(seed);
556 }
557
558 // Distribution methods ---------------------------
559
560 /**
561 * {@inheritDoc}
562 *
563 * <p>Returns the kernel density normalized so that its integral over each bin
564 * equals the bin mass.</p>
565 *
566 * <p>Algorithm description:</p>
567 * <ol>
568 * <li>Find the bin B that x belongs to.</li>
569 * <li>Compute K(B) = the mass of B with respect to the within-bin kernel (i.e., the
570 * integral of the kernel density over B).</li>
571 * <li>Return k(x) * P(B) / K(B), where k is the within-bin kernel density
572 * and P(B) is the mass of B.</li></ol>
573 */
574 @Override
575 public double density(double x) {
576 if (x < min || x > max) {
577 return 0d;
578 }
579 final int binIndex = findBin(x);
580 final RealDistribution kernel = getKernel(binStats.get(binIndex));
581 double kernelDensity = kernel.density(x);
582 if (kernelDensity == 0d) {
583 return 0d;
584 }
585 return kernelDensity * pB(binIndex) / kB(binIndex);
586 }
587
588 /**
589 * {@inheritDoc}
590 *
591 * <p>Algorithm description:</p>
592 * <ol>
593 * <li>Find the bin B that x belongs to.</li>
594 * <li>Compute P(B) = the mass of B and P(B-) = the combined mass of the bins below B.</li>
595 * <li>Compute K(B) = the probability mass of B with respect to the within-bin kernel
596 * and K(B-) = the kernel distribution evaluated at the lower endpoint of B</li>
597 * <li>Return P(B-) + P(B) * [K(x) - K(B-)] / K(B) where
598 * K(x) is the within-bin kernel distribution function evaluated at x.</li></ol>
599 * <p>If K is a constant distribution, we return P(B-) + P(B) (counting the full
600 * mass of B).</p>
601 *
602 */
603 @Override
604 public double cumulativeProbability(double x) {
605 if (x < min) {
606 return 0d;
607 } else if (x >= max) {
608 return 1d;
609 }
610 final int binIndex = findBin(x);
611 final double pBminus = pBminus(binIndex);
612 final double pB = pB(binIndex);
613 final RealDistribution kernel = k(x);
614 if (kernel instanceof ConstantRealDistribution) {
615 if (x < kernel.getNumericalMean()) {
616 return pBminus;
617 } else {
618 return pBminus + pB;
619 }
620 }
621 final double[] binBounds = getUpperBounds();
622 final double kB = kB(binIndex);
623 final double lower = binIndex == 0 ? min : binBounds[binIndex - 1];
624 final double withinBinCum =
625 (kernel.cumulativeProbability(x) - kernel.cumulativeProbability(lower)) / kB;
626 return pBminus + pB * withinBinCum;
627 }
628
629 /**
630 * {@inheritDoc}
631 *
632 * <p>Algorithm description:</p>
633 * <ol>
634 * <li>Find the smallest i such that the sum of the masses of the bins
635 * through i is at least p.</li>
636 * <li>
637 * Let K be the within-bin kernel distribution for bin i.<br>
638 * Let K(B) be the mass of B under K. <br>
639 * Let K(B-) be K evaluated at the lower endpoint of B (the combined
640 * mass of the bins below B under K).<br>
641 * Let P(B) be the probability of bin i.<br>
642 * Let P(B-) be the sum of the bin masses below bin i. <br>
643 * Let pCrit = p - P(B-)<br>
644 * <li>Return the inverse of K evaluated at <br>
645 * K(B-) + pCrit * K(B) / P(B) </li>
646 * </ol>
647 *
648 */
649 @Override
650 public double inverseCumulativeProbability(final double p) throws MathIllegalArgumentException {
651 MathUtils.checkRangeInclusive(p, 0, 1);
652
653 if (p == 0.0) {
654 return getSupportLowerBound();
655 }
656
657 if (p == 1.0) {
658 return getSupportUpperBound();
659 }
660
661 int i = 0;
662 while (cumBinP(i) < p) {
663 i++;
664 }
665
666 final RealDistribution kernel = getKernel(binStats.get(i));
667 final double kB = kB(i);
668 final double[] binBounds = getUpperBounds();
669 final double lower = i == 0 ? min : binBounds[i - 1];
670 final double kBminus = kernel.cumulativeProbability(lower);
671 final double pB = pB(i);
672 final double pBminus = pBminus(i);
673 final double pCrit = p - pBminus;
674 if (pCrit <= 0) {
675 return lower;
676 }
677 return kernel.inverseCumulativeProbability(kBminus + pCrit * kB / pB);
678 }
679
680 /**
681 * {@inheritDoc}
682 */
683 @Override
684 public double getNumericalMean() {
685 return sampleStats.getMean();
686 }
687
688 /**
689 * {@inheritDoc}
690 */
691 @Override
692 public double getNumericalVariance() {
693 return sampleStats.getVariance();
694 }
695
696 /**
697 * {@inheritDoc}
698 */
699 @Override
700 public double getSupportLowerBound() {
701 return min;
702 }
703
704 /**
705 * {@inheritDoc}
706 */
707 @Override
708 public double getSupportUpperBound() {
709 return max;
710 }
711
712 /**
713 * {@inheritDoc}
714 */
715 @Override
716 public boolean isSupportConnected() {
717 return true;
718 }
719
720 /**
721 * Reseed the underlying PRNG.
722 *
723 * @param seed new seed value
724 */
725 public void reseedRandomGenerator(long seed) {
726 randomData.setSeed(seed);
727 }
728
729 /**
730 * The probability of bin i.
731 *
732 * @param i the index of the bin
733 * @return the probability that selection begins in bin i
734 */
735 private double pB(int i) {
736 return i == 0 ? upperBounds[0] :
737 upperBounds[i] - upperBounds[i - 1];
738 }
739
740 /**
741 * The combined probability of the bins up to but not including bin i.
742 *
743 * @param i the index of the bin
744 * @return the probability that selection begins in a bin below bin i.
745 */
746 private double pBminus(int i) {
747 return i == 0 ? 0 : upperBounds[i - 1];
748 }
749
750 /**
751 * Mass of bin i under the within-bin kernel of the bin.
752 *
753 * @param i index of the bin
754 * @return the difference in the within-bin kernel cdf between the
755 * upper and lower endpoints of bin i
756 */
757 private double kB(int i) {
758 final double[] binBounds = getUpperBounds();
759 final RealDistribution kernel = getKernel(binStats.get(i));
760 return i == 0 ? kernel.probability(min, binBounds[0]) :
761 kernel.probability(binBounds[i - 1], binBounds[i]);
762 }
763
764 /**
765 * The within-bin kernel of the bin that x belongs to.
766 *
767 * @param x the value to locate within a bin
768 * @return the within-bin kernel of the bin containing x
769 */
770 private RealDistribution k(double x) {
771 final int binIndex = findBin(x);
772 return getKernel(binStats.get(binIndex));
773 }
774
775 /**
776 * The combined probability of the bins up to and including binIndex.
777 *
778 * @param binIndex maximum bin index
779 * @return sum of the probabilities of bins through binIndex
780 */
781 private double cumBinP(int binIndex) {
782 return upperBounds[binIndex];
783 }
784
785 /**
786 * The within-bin smoothing kernel. Returns a Gaussian distribution
787 * parameterized by {@code bStats}, unless the bin contains less than 2
788 * observations, in which case a constant distribution is returned.
789 *
790 * @param bStats summary statistics for the bin
791 * @return within-bin kernel parameterized by bStats
792 */
793 protected RealDistribution getKernel(StreamingStatistics bStats) {
794 if (bStats.getN() < 2 || bStats.getVariance() == 0) {
795 return new ConstantRealDistribution(bStats.getMean());
796 } else {
797 return new NormalDistribution(bStats.getMean(),
798 bStats.getStandardDeviation());
799 }
800 }
801 }