RandomPercentile.java

  1. /*
  2.  * Licensed to the Hipparchus project under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The Hipparchus project licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      https://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */
  17. package org.hipparchus.stat.descriptive.rank;

  18. import java.io.Serializable;
  19. import java.util.ArrayList;
  20. import java.util.Arrays;
  21. import java.util.Collection;
  22. import java.util.HashMap;
  23. import java.util.Iterator;
  24. import java.util.List;
  25. import java.util.Map;
  26. import java.util.NoSuchElementException;
  27. import java.util.UUID;

  28. import org.hipparchus.exception.LocalizedCoreFormats;
  29. import org.hipparchus.exception.MathIllegalArgumentException;
  30. import org.hipparchus.exception.MathIllegalStateException;
  31. import org.hipparchus.exception.NullArgumentException;
  32. import org.hipparchus.random.RandomGenerator;
  33. import org.hipparchus.random.Well19937c;
  34. import org.hipparchus.stat.StatUtils;
  35. import org.hipparchus.stat.descriptive.AbstractStorelessUnivariateStatistic;
  36. import org.hipparchus.stat.descriptive.AggregatableStatistic;
  37. import org.hipparchus.stat.descriptive.StorelessUnivariateStatistic;
  38. import org.hipparchus.util.FastMath;
  39. import org.hipparchus.util.MathArrays;

  40. /**
  41.  * A {@link StorelessUnivariateStatistic} estimating percentiles using the
  42.  * <a href=http://dimacs.rutgers.edu/~graham/pubs/papers/nquantiles.pdf>RANDOM</a>
  43.  * Algorithm.
  44.  * <p>
  45.  * Storage requirements for the RANDOM algorithm depend on the desired
  46.  * accuracy of quantile estimates. Quantile estimate accuracy is defined as follows.
  47.  * <p>
  48.  * Let \(X\) be the set of all data values consumed from the stream and let \(q\)
  49.  * be a quantile (measured between 0 and 1) to be estimated. If <ul>
  50.  * <li>\(\epsilon\) is the configured accuracy</li>
  51.  * <li> \(\hat{q}\) is a RandomPercentile estimate for \(q\) (what is returned
  52.  *      by {@link #getResult()} or {@link #getResult(double)}) with \(100q\) as
  53.  *      actual parameter)</li>
  54.  * <li> \(rank(\hat{q}) = |\{x \in X : x &lt; \hat{q}\}|\) is the actual rank of
  55.  *      \(\hat{q}\) in the full data stream</li>
  56.  * <li>\(n = |X|\) is the number of observations</li></ul>
  57.  * then we can expect \((q - \epsilon)n &lt; rank(\hat{q}) &lt; (q + \epsilon)n\).
  58.  * <p>
  59.  * The algorithm maintains \(\left\lceil {log_{2}(1/\epsilon)}\right\rceil + 1\) buffers
  60.  * of size \(\left\lceil {1/\epsilon \sqrt{log_2(1/\epsilon)}}\right\rceil\).  When
  61.  * {@code epsilon} is set to the default value of \(10^{-4}\), this makes 15 buffers
  62.  * of size 36,453.
  63.  * <p>
  64.  * The algorithm uses the buffers to maintain samples of data from the stream.  Until
  65.  * all buffers are full, the entire sample is stored in the buffers.
  66.  * If one of the {@code getResult} methods is called when all data are available in memory
  67.  * and there is room to make a copy of the data (meaning the combined set of buffers is
  68.  * less than half full), the {@code getResult} method delegates to a {@link Percentile}
  69.  * instance to compute and return the exact value for the desired quantile.
  70.  * For default {@code epsilon}, this means exact values will be returned whenever fewer than
  71.  * \(\left\lceil {15 \times 36453 / 2} \right\rceil = 273,398\) values have been consumed
  72.  * from the data stream.
  73.  * <p>
  74.  * When buffers become full, the algorithm merges buffers so that they effectively represent
  75.  * a larger set of values than they can hold. Subsequently, data values are sampled from the
  76.  * stream to fill buffers freed by merge operations.  Both the merging and the sampling
  77.  * require random selection, which is done using a {@code RandomGenerator}.  To get
  78.  * repeatable results for large data streams, users should provide {@code RandomGenerator}
  79.  * instances with fixed seeds. {@code RandomPercentile} itself does not reseed or otherwise
  80.  * initialize the {@code RandomGenerator} provided to it.  By default, it uses a
  81.  * {@link Well19937c} generator with the default seed.
  82.  * <p>
  83.  * Note: This implementation is not thread-safe.
  84.  */
  85. public class RandomPercentile
  86.     extends AbstractStorelessUnivariateStatistic implements StorelessUnivariateStatistic,
  87.     AggregatableStatistic<RandomPercentile>, Serializable {

  88.     /** Default quantile estimation error setting */
  89.     public static final double DEFAULT_EPSILON = 1e-4;
  90.     /** Serialization version id */
  91.     private static final long serialVersionUID = 1L;
  92.     /** Storage size of each buffer */
  93.     private final int s;
  94.     /** Maximum number of buffers minus 1 */
  95.     private final int h;
  96.     /** Data structure used to manage buffers */
  97.     private final BufferMap bufferMap;
  98.     /** Bound on the quantile estimation error */
  99.     private final double epsilon;
  100.     /** Source of random data */
  101.     private final RandomGenerator randomGenerator;
  102.     /** Number of elements consumed from the input data stream */
  103.     private long n;
  104.     /** Buffer currently being filled */
  105.     private Buffer currentBuffer;

  106.     /**
  107.      * Constructs a {@code RandomPercentile} with quantile estimation error
  108.      * {@code epsilon} using {@code randomGenerator} as its source of random data.
  109.      *
  110.      * @param epsilon bound on quantile estimation error (see class javadoc)
  111.      * @param randomGenerator PRNG used in sampling and merge operations
  112.      * @throws MathIllegalArgumentException if percentile is not in the range [0, 100]
  113.      */
  114.     public RandomPercentile(double epsilon, RandomGenerator randomGenerator) {
  115.         if (epsilon <= 0) {
  116.             throw new MathIllegalArgumentException(LocalizedCoreFormats.NUMBER_TOO_SMALL,
  117.                                                    epsilon, 0);
  118.         }
  119.         this.h = (int) FastMath.ceil(log2(1/epsilon));
  120.         this.s = (int) FastMath.ceil(FastMath.sqrt(log2(1/epsilon)) / epsilon);
  121.         this.randomGenerator = randomGenerator;
  122.         bufferMap = new BufferMap(h + 1, s, randomGenerator);
  123.         currentBuffer = bufferMap.create(0);
  124.         this.epsilon = epsilon;
  125.     }

  126.     /**
  127.      * Constructs a {@code RandomPercentile} with default estimation error
  128.      * using {@code randomGenerator} as its source of random data.
  129.      *
  130.      * @param randomGenerator PRNG used in sampling and merge operations
  131.      * @throws MathIllegalArgumentException if percentile is not in the range [0, 100]
  132.      */
  133.     public RandomPercentile(RandomGenerator randomGenerator) {
  134.         this(DEFAULT_EPSILON, randomGenerator);
  135.     }

  136.     /**
  137.      * Constructs a {@code RandomPercentile} with quantile estimation error
  138.      * {@code epsilon} using the default PRNG as source of random data.
  139.      *
  140.      * @param epsilon bound on quantile estimation error (see class javadoc)
  141.      * @throws MathIllegalArgumentException if percentile is not in the range [0, 100]
  142.      */
  143.     public RandomPercentile(double epsilon) {
  144.         this(epsilon, new Well19937c());
  145.     }

  146.     /**
  147.      * Constructs a {@code RandomPercentile} with quantile estimation error
  148.      * set to the default ({@link #DEFAULT_EPSILON}), using the default PRNG
  149.      * as source of random data.
  150.      */
  151.     public RandomPercentile() {
  152.         this(DEFAULT_EPSILON, new Well19937c());
  153.     }

  154.     /**
  155.      * Copy constructor, creates a new {@code RandomPercentile} identical
  156.      * to the {@code original}.  Note: the RandomGenerator used by the new
  157.      * instance is referenced, not copied - i.e., the new instance shares
  158.      * a generator with the original.
  159.      *
  160.      * @param original the {@code PSquarePercentile} instance to copy
  161.      */
  162.     public RandomPercentile(RandomPercentile original) {
  163.         super();
  164.         this.h = original.h;
  165.         this.n = original.n;
  166.         this.s = original.s;
  167.         this.epsilon = original.epsilon;
  168.         this.bufferMap = new BufferMap(original.bufferMap);
  169.         this.randomGenerator = original.randomGenerator;
  170.         Iterator<Buffer> iterator = bufferMap.iterator();
  171.         Buffer current = null;
  172.         Buffer curr = null;
  173.         // See if there is a partially filled buffer - that will be currentBuffer
  174.         while (current == null && iterator.hasNext()) {
  175.             curr = iterator.next();
  176.             if (curr.hasCapacity()) {
  177.                 current = curr;
  178.             }
  179.         }
  180.         // If there is no partially filled buffer, just assign the last one.
  181.         // Next increment() will find no capacity and create a new one or trigger
  182.         // a merge.
  183.         this.currentBuffer = current == null ? curr : current;
  184.     }

  185.     @Override
  186.     public long getN() {
  187.         return n;
  188.     }

  189.     /**
  190.      * Returns an estimate of the given percentile, computed using the designated
  191.      * array segment as input data.
  192.      *
  193.      * @param values source of input data
  194.      * @param begin position of the first element of the values array to include
  195.      * @param length number of array elements to include
  196.      * @param percentile desired percentile (scaled 0 - 100)
  197.      *
  198.      * @return estimated percentile
  199.      * @throws MathIllegalArgumentException if percentile is out of the range [0, 100]
  200.      */
  201.     public double evaluate(final double percentile, final double[] values, final int begin, final int length)
  202.         throws MathIllegalArgumentException {
  203.         if (MathArrays.verifyValues(values, begin, length)) {
  204.             RandomPercentile randomPercentile = new RandomPercentile(this.epsilon,
  205.                                                                      this.randomGenerator);
  206.             randomPercentile.incrementAll(values, begin, length);
  207.             return randomPercentile.getResult(percentile);
  208.         }
  209.         return Double.NaN;
  210.     }

  211.     /**
  212.      * Returns an estimate of the median, computed using the designated
  213.      * array segment as input data.
  214.      *
  215.      * @param values source of input data
  216.      * @param begin position of the first element of the values array to include
  217.      * @param length number of array elements to include
  218.      *
  219.      * @return estimated percentile
  220.      * @throws MathIllegalArgumentException if percentile is out of the range [0, 100]
  221.      */
  222.     @Override
  223.     public double evaluate(final double[] values, final int begin, final int length) {
  224.         return evaluate(50d, values, begin, length);
  225.     }

  226.     /**
  227.      * Returns an estimate of percentile over the given array.
  228.      *
  229.      * @param values source of input data
  230.      * @param percentile desired percentile (scaled 0 - 100)
  231.      *
  232.      * @return estimated percentile
  233.      * @throws MathIllegalArgumentException if percentile is out of the range [0, 100]
  234.      */
  235.     public double evaluate(final double percentile, final double[] values) {
  236.         return evaluate(percentile, values, 0, values.length);
  237.     }

  238.     @Override
  239.     public RandomPercentile copy() {
  240.         return new RandomPercentile(this);
  241.     }

  242.     @Override
  243.     public void clear() {
  244.         n = 0;
  245.         bufferMap.clear();
  246.         currentBuffer = bufferMap.create(0);
  247.     }

  248.     /**
  249.      * Returns an estimate of the median.
  250.      */
  251.     @Override
  252.     public double getResult() {
  253.         return getResult(50d);

  254.     }

  255.     /**
  256.      * Returns an estimate of the given percentile.
  257.      *
  258.      * @param percentile desired percentile (scaled 0 - 100)
  259.      * @return estimated percentile
  260.      * @throws MathIllegalArgumentException if percentile is out of the range [0, 100]
  261.      */
  262.     public double getResult(double percentile) {
  263.         if (percentile > 100 || percentile < 0) {
  264.             throw new MathIllegalArgumentException(LocalizedCoreFormats.OUT_OF_RANGE,
  265.                                                    percentile, 0, 100);
  266.         }
  267.         // Convert to internal quantile scale
  268.         final double q = percentile / 100;
  269.         // First get global min and max to bound search.
  270.         double min = Double.POSITIVE_INFINITY;
  271.         double max = Double.NEGATIVE_INFINITY;
  272.         double bMin;
  273.         double bMax;
  274.         for (Buffer buffer : bufferMap) {
  275.             bMin = buffer.min();
  276.             if (bMin < min) {
  277.                 min = bMin;
  278.             }
  279.             bMax = buffer.max();
  280.             if (bMax > max) {
  281.                 max = bMax;
  282.             }
  283.         }

  284.         // Handle degenerate cases
  285.         if (Double.compare(q, 0d) == 0 || n == 1) {
  286.             return min;
  287.         }
  288.         if (Double.compare(q, 1) == 0) {
  289.             return max;
  290.         }
  291.         if (n == 0) {
  292.             return Double.NaN;
  293.         }

  294.         // See if we have all data in memory and enough free memory to copy.
  295.         // If so, use Percentile to perform exact computation.
  296.         if (bufferMap.halfEmpty()) {
  297.             return new Percentile(percentile).evaluate(bufferMap.levelZeroData());
  298.         }

  299.         // Compute target rank
  300.         final double targetRank = q * n;

  301.         // Start with initial guess min + quantile * (max - min).
  302.         double estimate = min + q * (max - min);
  303.         double estimateRank = getRank(estimate);
  304.         double lower;
  305.         double upper;
  306.         if (estimateRank > targetRank) {
  307.             upper = estimate;
  308.             lower = min;
  309.         } else if (estimateRank < targetRank) {
  310.             lower = estimate;
  311.             upper = max;
  312.         } else {
  313.             return estimate;
  314.         }
  315.         final double eps = epsilon / 2;
  316.         final double rankTolerance = eps * n;
  317.         final double minWidth = eps / n;
  318.         double intervalWidth = FastMath.abs(upper - lower);
  319.         while (FastMath.abs(estimateRank - targetRank) > rankTolerance && intervalWidth > minWidth) {
  320.             if (estimateRank > targetRank) {
  321.                 upper = estimate;
  322.             } else {
  323.                 lower = estimate;
  324.             }
  325.             intervalWidth = upper - lower;
  326.             estimate = lower + intervalWidth / 2;
  327.             estimateRank = getRank(estimate);
  328.         }
  329.         return estimate;
  330.     }

  331.     /**
  332.      * Gets the estimated rank of {@code value}, i.e.  \(|\{x \in X : x &lt; value\}|\)
  333.      * where \(X\) is the set of values that have been consumed from the stream.
  334.      *
  335.      * @param value value whose overall rank is sought
  336.      * @return estimated number of sample values that are strictly less than {@code value}
  337.      */
  338.     public double getRank(double value) {
  339.         double rankSum = 0;
  340.         for (Buffer buffer : bufferMap) {
  341.             rankSum += buffer.rankOf(value) * FastMath.pow(2, buffer.level);
  342.         }
  343.         return rankSum;
  344.     }

  345.     /**
  346.      * Returns the estimated quantile position of value in the dataset.
  347.      * Specifically, what is returned is an estimate of \(|\{x \in X : x &lt; value\}| / |X|\)
  348.      * where \(X\) is the set of values that have been consumed from the stream.
  349.      *
  350.      * @param value value whose quantile rank is sought.
  351.      * @return estimated proportion of sample values that are strictly less than {@code value}
  352.      */
  353.     public double getQuantileRank(double value) {
  354.         return getRank(value) / getN();
  355.     }

  356.     @Override
  357.     public void increment(double d) {
  358.         n++;
  359.         if (!currentBuffer.hasCapacity()) { // Need to get a new buffer to fill
  360.             // First see if we have not yet created all the buffers
  361.             if (bufferMap.canCreate()) {
  362.                 final int level = (int) Math.ceil(Math.max(0, log2(n/(s * FastMath.pow(2, h - 1)))));
  363.                 currentBuffer = bufferMap.create(level);
  364.             } else { // All buffers have been created - need to merge to free one
  365.                 currentBuffer = bufferMap.merge();
  366.             }
  367.         }
  368.         currentBuffer.consume(d);
  369.     }

  370.     /**
  371.      * Maintains a buffer of values sampled from the input data stream.
  372.      * <p>
  373.      * The {@link #level} of a buffer determines its sampling frequency.
  374.      * The {@link #consume(double)} method retains 1 out of every 2^level values
  375.      * read in from the stream.
  376.      * <p>
  377.      * The {@link #size} of the buffer is the number of values that it can store
  378.      * The buffer is considered full when it has consumed 2^level * size values.
  379.      * <p>
  380.      * The {@link #blockSize} of a buffer is 2^level.
  381.      * The consume method starts each block by generating a random integer in
  382.      * [0, blockSize - 1].  It then skips over all but the element with that offset
  383.      * in the block, retaining only the selected value. So after 2^level * size
  384.      * elements have been consumed, it will have retained size elements - one
  385.      * from each 2^level block.
  386.      * <p>
  387.      * The {@link #mergeWith(Buffer)} method merges this buffer with another one,
  388.      * The merge operation merges the data from the other buffer into this and clears
  389.      * the other buffer (so it can consume data). Both buffers have their level
  390.      * incremented by the merge. This operation is only used on full buffers.
  391.      */
  392.     private static class Buffer implements Serializable {
  393.         /** Serialization version id */
  394.         private static final long serialVersionUID = 1L;
  395.         /** Number of values actually stored in the buffer */
  396.         private final int size;
  397.         /** Data sampled from the stream */
  398.         private final double[] data;
  399.         /** PRNG used for merges and stream sampling */
  400.         private final RandomGenerator randomGenerator;
  401.         /** Level of the buffer */
  402.         private int level;
  403.         /** Block size  = 2^level */
  404.         private long blockSize;
  405.         /** Next location in backing array for stored (taken) value */
  406.         private int next;
  407.         /** Number of values consumed in current 2^level block of values from the stream */
  408.         private long consumed;
  409.         /** Index of next value to take in current 2^level block */
  410.         private long nextToTake;
  411.         /** ID */
  412.         private final UUID id;

  413.         /**
  414.          * Creates a new buffer capable of retaining size values with the given level.
  415.          *
  416.          * @param size number of values the buffer can retain
  417.          * @param level the base 2 log of the sampling frequency
  418.          *        (one out of every 2^level values is retained)
  419.          * @param randomGenerator PRNG used for sampling and merge operations
  420.          */
  421.         Buffer(int size, int level, RandomGenerator randomGenerator) {
  422.             this.size = size;
  423.             data = new double[size];
  424.             this.level = level;
  425.             this.randomGenerator = randomGenerator;
  426.             this.id = UUID.randomUUID();
  427.             computeBlockSize();
  428.         }

  429.         /**
  430.          * Sets blockSize and nextToTake based on level.
  431.          */
  432.         private void computeBlockSize() {
  433.             if (level == 0) {
  434.                 blockSize = 1;
  435.             } else {
  436.                 long product = 1;
  437.                 for (int i = 0; i < level; i++) {
  438.                     product *= 2;
  439.                 }
  440.                 blockSize = product;
  441.             }
  442.             if (blockSize > 1) {
  443.                 nextToTake = randomGenerator.nextLong(blockSize);
  444.             }
  445.         }

  446.         /**
  447.          * Consumes a value from the input stream.
  448.          * <p>
  449.          * For each 2^level values consumed, one is added to the buffer.
  450.          * The buffer is not considered full until 2^level * size values
  451.          * have been consumed.
  452.          * <p>
  453.          * Sorts the data array if the consumption renders the buffer full.
  454.          * <p>
  455.          * There is no capacity check in this method.  Clients are expected
  456.          * to use {@link #hasCapacity()} before invoking this method.  If
  457.          * it is invoked on a full buffer, an ArrayIndexOutOfBounds exception
  458.          * will result.
  459.          *
  460.          * @param value value to consume from the stream
  461.          */
  462.         public void consume(double value) {
  463.             if (consumed == nextToTake) {
  464.                 data[next] = value;
  465.                 next++;
  466.             }
  467.             consumed++;
  468.             if (consumed == blockSize) {
  469.                 if (next == size) {   // Buffer is full
  470.                     Arrays.sort(data);
  471.                 } else {              // Reset in-block counter and nextToTake
  472.                     consumed = 0;
  473.                     if (blockSize > 1) {
  474.                         nextToTake = randomGenerator.nextLong(blockSize);
  475.                     }
  476.                 }
  477.             }
  478.         }

  479.         /**
  480.          * Merges this with other.
  481.          * <p>
  482.          * After the merge, this will be the merged buffer and other will be free.
  483.          * Both will have level+1.
  484.          * Post-merge, other can be used to accept new data.
  485.          * <p>
  486.          * The contents of the merged buffer (this after the merge) are determined
  487.          * by randomly choosing one of the two retained elements in each of the
  488.          * [0...size - 1] positions in the two buffers.
  489.          * <p>
  490.          * This and other must have the same level and both must be full.
  491.          *
  492.          * @param other initially full other buffer at the same level as this.
  493.          * @throws MathIllegalArgumentException if either buffer is not full or they
  494.          * have different levels
  495.          */
  496.         public void mergeWith(Buffer other) {
  497.             // Make sure both this and other are full and have the same level
  498.             if (this.hasCapacity() || other.hasCapacity() || other.level != this.level) {
  499.                 throw new MathIllegalArgumentException(LocalizedCoreFormats.INTERNAL_ERROR);
  500.             }
  501.             // Randomly select one of the two entries for each slot
  502.             for (int i = 0; i < size; i++) {
  503.                 if (randomGenerator.nextBoolean()) {
  504.                     data[i] = other.data[i];
  505.                 }
  506.             }
  507.             // Re-sort data
  508.             Arrays.sort(data);
  509.             // Bump level of both buffers
  510.             other.setLevel(level + 1);
  511.             this.setLevel(level + 1);
  512.             // Clear the free one (and compute new blocksize)
  513.             other.clear();
  514.         }

  515.         /**
  516.          * Merge this into a higher-level buffer.
  517.          * <p>
  518.          * Does not alter this; but after the merge, higher may have some of its
  519.          * data replaced by data from this.  Levels are not changed for either buffer.
  520.          * <p>
  521.          * Probability of selection into the newly constituted higher buffer is weighted
  522.          * according to level. So for example, if this has level 0 and higher has level
  523.          * 2, the ith element of higher is 4 times more likely than the corresponding
  524.          * element of this to retain its spot.
  525.          * <p>
  526.          * This method is only used when aggregating RandomPercentile instances.
  527.          * <p>
  528.          * Preconditions:
  529.          * <ol><li> this.level < higher.level </li>
  530.          *     <li> this.size = higher.size </li>
  531.          *     <li> Both buffers are full </li>
  532.          * </ol>
  533.          *
  534.          * @param higher higher-level buffer to merge this into
  535.          * @throws MathIllegalArgumentException if the buffers have different sizes,
  536.          * either buffer is not full or this has level greater than or equal to higher
  537.          */
  538.         public void mergeInto(Buffer higher) {
  539.             // Check preconditions
  540.             if (this.size != higher.size || this.hasCapacity() || higher.hasCapacity() ||
  541.                     this.level >= higher.level) {
  542.                 throw new MathIllegalArgumentException(LocalizedCoreFormats.INTERNAL_ERROR);
  543.             }
  544.             final int levelDifference = higher.level - this.level;
  545.             int m = 1;
  546.             for (int i = 0; i < levelDifference; i++) {
  547.                 m *= 2;
  548.             }
  549.             // Randomly select one of the two entries for each slot in higher, giving
  550.             // m-times higher weight to the entries of higher.
  551.             for (int i = 0; i < size; i++) {
  552.                 // data[i] <-> {0}, higher.data[i] <-> {1, ..., m}
  553.                 if (randomGenerator.nextInt(m + 1) == 0) {
  554.                     higher.data[i] = data[i];
  555.                 }
  556.             }
  557.             // Resort higher's data
  558.             Arrays.sort(higher.data);
  559.         }

  560.         /**
  561.          * @return true if the buffer has capacity; false if it is full
  562.          */
  563.         public boolean hasCapacity() {
  564.             // Buffer has capacity if it has not yet set all of its data
  565.             // values or if it has but still has not finished its last block
  566.             return next < size || consumed < blockSize;
  567.         }

  568.         /**
  569.          * Sets the level of the buffer.
  570.          *
  571.          * @param level new level value
  572.          */
  573.         public void setLevel(int level) {
  574.             this.level = level;
  575.         }

  576.         /**
  577.          * Clears data, recomputes blockSize and resets consumed and nextToTake.
  578.          */
  579.         public void clear() {
  580.             consumed = 0;
  581.             next = 0;
  582.             computeBlockSize();
  583.         }

  584.         /**
  585.          * Returns a copy of the data that has been added to the buffer
  586.          *
  587.          * @return possibly unsorted copy of the portion of the buffer that has been filled
  588.          */
  589.         public double[] getData() {
  590.             final double[] out = new double[next];
  591.             System.arraycopy(data, 0, out, 0, next);
  592.             return out;
  593.         }

  594.         /**
  595.          * Returns the ordinal rank of value among the sampled values in this buffer.
  596.          *
  597.          * @param value value whose rank is sought
  598.          * @return |{v in data : v < value}|
  599.          */
  600.         public int rankOf(double value) {
  601.             int ret = 0;
  602.             if (!hasCapacity()) { // Full sorted buffer, can do binary search
  603.                 ret = Arrays.binarySearch(data, value);
  604.                 if (ret < 0) {
  605.                     return -ret - 1;
  606.                 } else {
  607.                     return ret;
  608.                 }
  609.             } else { // have to count - not sorted yet and can't sort yet
  610.                 for (int i = 0; i < next; i++) {
  611.                     if (data[i] < value) {
  612.                         ret++;
  613.                     }
  614.                 }
  615.                 return ret;
  616.             }
  617.         }

  618.         /**
  619.          * @return the smallest value held in this buffer
  620.          */
  621.         public double min() {
  622.             if (!hasCapacity()) {
  623.                 return data[0];
  624.             } else {
  625.                 return StatUtils.min(getData());
  626.             }
  627.         }

  628.         /**
  629.          * @return the largest value held in this buffer
  630.          */
  631.         public double max() {
  632.             if (!hasCapacity()) {
  633.                 return data[data.length - 1];
  634.             } else {
  635.                 return StatUtils.max(getData());
  636.             }
  637.         }

  638.         /**
  639.          * @return the level of this buffer
  640.          */
  641.         public int getLevel() {
  642.             return level;
  643.         }

  644.         /**
  645.          * @return the id
  646.          */
  647.         public UUID getId() {
  648.             return id;
  649.         }
  650.     }

  651.     /**
  652.      * Computes base 2 log of the argument.
  653.      *
  654.      * @param x input value
  655.      * @return the value y such that 2^y = x
  656.      */
  657.     private static double log2(double x) {
  658.         return Math.log(x) / Math.log(2);
  659.     }

  660.     /**
  661.      * A map structure to hold the buffers.
  662.      * Keys are levels and values are lists of buffers at the given level.
  663.      * Overall capacity is limited by the total number of buffers.
  664.      */
  665.     private static class BufferMap implements Iterable<Buffer>, Serializable {
  666.         /** Serialization version ID */
  667.         private static final long serialVersionUID = 1L;
  668.         /** Total number of buffers that can be created - cap for count */
  669.         private final int capacity;
  670.         /** PRNG used in merges */
  671.         private final RandomGenerator randomGenerator;
  672.         /** Total count of all buffers */
  673.         private int count;
  674.         /** Uniform buffer size */
  675.         private final int bufferSize;
  676.         /** Backing store for the buffer map. Keys are levels, values are lists of registered buffers. */
  677.         private final Map<Integer,List<Buffer>> registry;
  678.         /** Maximum buffer level */
  679.         private int maxLevel;

  680.         /**
  681.          * Creates a BufferMap that can manage up to capacity buffers.
  682.          * Buffers created by the pool with have size = buffersize.
  683.          *
  684.          * @param capacity cap on the number of buffers
  685.          * @param bufferSize size of each buffer
  686.          * @param randomGenerator RandomGenerator to use in merges
  687.          */
  688.         BufferMap(int capacity, int bufferSize, RandomGenerator randomGenerator) {
  689.             this.bufferSize = bufferSize;
  690.             this.capacity = capacity;
  691.             this.randomGenerator = randomGenerator;
  692.             this.registry = new HashMap<>();
  693.         }

  694.         /**
  695.          * Copy constructor.
  696.          *
  697.          * @param original BufferMap to copy
  698.          */
  699.         BufferMap(BufferMap original) {
  700.             super();
  701.             this.bufferSize = original.bufferSize;
  702.             this.capacity = original.capacity;
  703.             this.count = 0;
  704.             this.randomGenerator = original.randomGenerator;
  705.             this.registry = new HashMap<>();
  706.             for (Buffer current : original) {
  707.                 // Create and register a new buffer at the same level
  708.                 final Buffer newCopy = create(current.getLevel());
  709.                 // Consume the data
  710.                 final double[] data = current.getData();
  711.                 for (double value : data) {
  712.                     newCopy.consume(value);
  713.                 }
  714.             }
  715.         }

  716.         /**
  717.          * Tries to create a buffer with the given level.
  718.          * <p>
  719.          * If there is capacity to create a new buffer (i.e., fewer than
  720.          * count have been created), a new buffer is created with the given
  721.          * level, registered and returned.  If capacity has been reached,
  722.          * null is returned.
  723.          *
  724.          * @param level level of the new buffer
  725.          * @return an empty buffer or null if a buffer can't be provided
  726.          */
  727.         public Buffer create(int level) {
  728.             if (!canCreate()) {
  729.                 return null;
  730.             }
  731.             count++;
  732.             Buffer buffer = new Buffer(bufferSize, level, randomGenerator);
  733.             List<Buffer> bufferList = registry.computeIfAbsent(level, k -> new ArrayList<>());
  734.             bufferList.add(buffer);
  735.             if (level > maxLevel) {
  736.                 maxLevel = level;
  737.             }
  738.             return buffer;
  739.         }

  740.         /**
  741.          * Returns true if there is capacity to create a new buffer.
  742.          *
  743.          * @return true if fewer than capacity buffers have been created.
  744.          */
  745.         public boolean canCreate() {
  746.             return count < capacity;
  747.         }

  748.         /**
  749.          * Returns true if we have used less than half of the allocated storage.
  750.          * <p>
  751.          * Includes a check to make sure all buffers have level 0;
  752.          * but this should always be the case.
  753.          * <p>
  754.          * When this method returns true, we have all consumed data in storage
  755.          * and enough space to make a copy of the combined dataset.
  756.          *
  757.          * @return true if all buffers have level 0 and less than half of the
  758.          * available storage has been used
  759.          */
  760.         public boolean halfEmpty() {
  761.             return count * 2 < capacity &&
  762.                     registry.size() == 1 &&
  763.                     registry.containsKey(0);
  764.         }

  765.         /**
  766.          * Returns a fresh copy of all data from level 0 buffers.
  767.          *
  768.          * @return combined data stored in all level 0 buffers
  769.          */
  770.         public double[] levelZeroData() {
  771.             List<Buffer> levelZeroBuffers = registry.get(0);
  772.             // First determine the combined size of the data
  773.             int length = 0;
  774.             for (Buffer buffer : levelZeroBuffers) {
  775.                 if (!buffer.hasCapacity()) { // full buffer
  776.                     length += buffer.size;
  777.                 } else {
  778.                     length += buffer.next;  // filled amount
  779.                 }
  780.             }
  781.             // Copy the data
  782.             int pos = 0;
  783.             int currLen;
  784.             final double[] out = new double[length];
  785.             for (Buffer buffer : levelZeroBuffers) {
  786.                 if (!buffer.hasCapacity()) {
  787.                     currLen = buffer.size;
  788.                 } else {
  789.                     currLen =  buffer.next;
  790.                 }
  791.                 System.arraycopy(buffer.data, 0, out, pos, currLen);
  792.                 pos += currLen;
  793.             }
  794.             return out;
  795.         }

  796.         /**
  797.          * Finds the lowest level l where there exist at least two buffers,
  798.          * merges them to create a new buffer with level l+1 and returns
  799.          * a free buffer with level l+1.
  800.          *
  801.          * @return free buffer that can accept data
  802.          */
  803.         public Buffer merge() {
  804.             int l = 0;
  805.             List<Buffer> mergeCandidates = null;
  806.             // Find the lowest level containing at least two buffers
  807.             while (mergeCandidates == null && l <= maxLevel) {
  808.                 final List<Buffer> bufferList = registry.get(l);
  809.                 if (bufferList != null && bufferList.size() > 1) {
  810.                     mergeCandidates = bufferList;
  811.                 } else {
  812.                     l++;
  813.                 }
  814.             }
  815.             if (mergeCandidates == null) {
  816.                 // Should never happen
  817.                 throw new MathIllegalStateException(LocalizedCoreFormats.INTERNAL_ERROR);
  818.             }
  819.             Buffer buffer1 = mergeCandidates.get(0);
  820.             Buffer buffer2 = mergeCandidates.get(1);
  821.             // Remove buffers to be merged
  822.             mergeCandidates.remove(0);
  823.             mergeCandidates.remove(0);
  824.             // If these are the last level-l buffers, remove the empty list
  825.             if (registry.get(l).isEmpty()) {
  826.                 registry.remove(l);
  827.             }
  828.             // Merge the buffers
  829.             buffer1.mergeWith(buffer2);
  830.             // Now both buffers have level l+1; buffer1 is full and buffer2 is free.
  831.             // Register both buffers
  832.             register(buffer1);
  833.             register(buffer2);

  834.             // Return the free one
  835.             return buffer2;
  836.         }

  837.         /**
  838.          * Clears the buffer map.
  839.          */
  840.         public void clear() {
  841.             for (List<Buffer> bufferList : registry.values()) {
  842.                 bufferList.clear();
  843.             }
  844.             registry.clear();
  845.             count = 0;
  846.         }

  847.         /**
  848.          * Registers a buffer.
  849.          *
  850.          * @param buffer Buffer to be registered.
  851.          */
  852.         public void register(Buffer buffer) {
  853.             final int level = buffer.getLevel();
  854.             List<Buffer> list = registry.get(level);
  855.             if (list == null) {
  856.                 list = new ArrayList<>();
  857.                 registry.put(level, list);
  858.                 if (level > maxLevel) {
  859.                     maxLevel = level;
  860.                 }
  861.             }
  862.             list.add(buffer);
  863.         }

  864.         /**
  865.          * De-register a buffer, without clearing it.
  866.          *
  867.          * @param buffer Buffer to be de-registered
  868.          * @throws IllegalStateException if the buffer is not registered
  869.          */
  870.         public void deRegister(Buffer buffer) {
  871.             final Iterator<Buffer> iterator = registry.get(buffer.getLevel()).iterator();
  872.             while (iterator.hasNext()) {
  873.                 if (iterator.next().getId().equals(buffer.getId())) {
  874.                     iterator.remove();
  875.                     return;
  876.                 }
  877.             }
  878.             throw new MathIllegalStateException(LocalizedCoreFormats.INTERNAL_ERROR);
  879.         }

  880.         /**
  881.          * Returns an iterator over all of the buffers. Iteration goes by level, with
  882.          * level 0 first.  Assumes there are no empty buffer lists.
  883.          */
  884.         @Override
  885.         public Iterator<Buffer> iterator() {
  886.             return new Iterator<Buffer>() {

  887.                 /** Outer loop iterator, from level to level. */
  888.                 private final Iterator<Integer> levelIterator = registry.keySet().iterator();

  889.                 /** List of buffers at current level. */
  890.                 private List<Buffer> currentList = registry.get(levelIterator.next()); // NOPMD - cannot use local variable in anonymous class

  891.                 /** Inner loop iterator, from buffer to buffer. */
  892.                 private Iterator<Buffer> bufferIterator =
  893.                         currentList == null ? null : currentList.iterator();

  894.                 @Override
  895.                 public boolean hasNext() {
  896.                     if (bufferIterator == null) {
  897.                         return false;
  898.                     }
  899.                     if (bufferIterator.hasNext()) {
  900.                         return true;
  901.                     }
  902.                     // The current level iterator has just finished, try to bump level
  903.                     if (levelIterator.hasNext()) {
  904.                         List<Buffer> currentList = registry.get(levelIterator.next());
  905.                         bufferIterator = currentList.iterator();
  906.                         return true;
  907.                     } else {
  908.                         // Nothing left, signal this by nulling bufferIterator
  909.                         bufferIterator = null;
  910.                         return false;
  911.                     }
  912.                 }

  913.                 @Override
  914.                 public Buffer next() {
  915.                      if (hasNext()) {
  916.                          return bufferIterator.next();
  917.                      }
  918.                      throw new NoSuchElementException();
  919.                 }

  920.                 @Override
  921.                 public void remove() {
  922.                     throw new UnsupportedOperationException();
  923.                 }
  924.             };
  925.         }

  926.         /**
  927.          * Absorbs the data in other into this, merging buffers as necessary to trim
  928.          * the aggregate down to capacity. This method is only used when aggregating
  929.          * RandomPercentile instances.
  930.          *
  931.          * @param other other BufferMap to merge in
  932.          */
  933.         public void absorb(BufferMap other) {
  934.             // Add all of other's buffers to the map - possibly exceeding cap
  935.             boolean full = true;
  936.             for (Buffer buffer : other) {
  937.                 if (buffer.hasCapacity()) {
  938.                     full = false;
  939.                 }
  940.                 register(buffer);
  941.                 count++;
  942.             }
  943.             // Now eliminate the excess by merging
  944.             while (count > capacity || (count == capacity && full)) {
  945.                 mergeUp();
  946.                 count--;
  947.             }
  948.         }

  949.         /**
  950.          * Find two buffers, first and second, of minimal level. Then merge
  951.          * first into second and discard first.
  952.          * <p>
  953.          * If the buffers have different levels, make second the higher level
  954.          * buffer and make probability of selection in the merge proportional
  955.          * to level weight ratio.
  956.          * <p>
  957.          * This method is only used when aggregating RandomPercentile instances.
  958.          */
  959.         public void mergeUp() {
  960.             // Find two minimum-level buffers to merge
  961.             // Loop depends on two invariants:
  962.             //   0) iterator goes in level order
  963.             //   1) there are no empty lists in the registry
  964.             Iterator<Buffer> bufferIterator = iterator();
  965.             Buffer first = null;
  966.             Buffer second = null;
  967.             while ((first == null || second == null) && bufferIterator.hasNext()) {
  968.                 Buffer buffer = bufferIterator.next();
  969.                 if (!buffer.hasCapacity()) { // Skip not full buffers
  970.                     if (first == null) {
  971.                         first = buffer;
  972.                     } else {
  973.                         second = buffer;
  974.                     }
  975.                 }
  976.             }
  977.             if (first == null || second == null || first.level > second.level) {
  978.                 throw new MathIllegalStateException(LocalizedCoreFormats.INTERNAL_ERROR);
  979.             }
  980.             // Merge first into second and deregister first.
  981.             // Assumes that first has level <= second (checked above).
  982.             if (first.getLevel() == second.getLevel()) {
  983.                 deRegister(first);
  984.                 deRegister(second);
  985.                 second.mergeWith(first);
  986.                 register(second);
  987.             } else {
  988.                 deRegister(first);
  989.                 first.mergeInto(second);
  990.             }
  991.         }
  992.     }

  993.     /**
  994.      * Computes the given percentile by combining the data from the collection
  995.      * of aggregates. The result describes the combined sample of all data added
  996.      * to any of the aggregates.
  997.      *
  998.      * @param percentile desired percentile (scaled 0-100)
  999.      * @param aggregates RandomPercentile instances to combine data from
  1000.      * @return estimate of the given percentile using combined data from the aggregates
  1001.      * @throws MathIllegalArgumentException if percentile is out of the range [0, 100]
  1002.      */
  1003.     public double reduce(double percentile, Collection<RandomPercentile> aggregates) {
  1004.         if (percentile > 100 || percentile < 0) {
  1005.             throw new MathIllegalArgumentException(LocalizedCoreFormats.OUT_OF_RANGE,
  1006.                                                    percentile, 0, 100);
  1007.         }

  1008.         // First see if we can copy all data and just compute exactly.
  1009.         // The following could be improved to verify that all have only level 0 buffers
  1010.         // and the sum of the data sizes is less than 1/2 total capacity.  Here we
  1011.         // just check that each of the aggregates is less than half full.
  1012.         Iterator<RandomPercentile> iterator = aggregates.iterator();
  1013.         boolean small = true;
  1014.         while (small && iterator.hasNext()) {
  1015.             small = iterator.next().bufferMap.halfEmpty();
  1016.         }
  1017.         if (small) {
  1018.             iterator = aggregates.iterator();
  1019.             double[] combined = {};
  1020.             while (iterator.hasNext()) {
  1021.                combined = MathArrays.concatenate(combined, iterator.next().bufferMap.levelZeroData());
  1022.             }
  1023.             final Percentile exactP = new Percentile(percentile);
  1024.             return exactP.evaluate(combined);
  1025.         }

  1026.         // Below largely duplicates code in getResult(percentile).
  1027.         // Common binary search code with function parameter should be factored out.

  1028.         // Get global max and min to bound binary search and total N
  1029.         double min = Double.POSITIVE_INFINITY;
  1030.         double max = Double.NEGATIVE_INFINITY;
  1031.         double combinedN = 0;
  1032.         iterator = aggregates.iterator();
  1033.         while (iterator.hasNext()) {
  1034.             final RandomPercentile curr = iterator.next();
  1035.             final double curMin = curr.getResult(0);
  1036.             final double curMax = curr.getResult(100);
  1037.             if (curMin < min) {
  1038.                 min = curMin;
  1039.             }
  1040.             if (curMax > max) {
  1041.                 max = curMax;
  1042.             }
  1043.             combinedN += curr.getN();
  1044.         }

  1045.         final double q = percentile / 100;
  1046.         // Handle degenerate cases
  1047.         if (Double.compare(q, 0d) == 0) {
  1048.             return min;
  1049.         }
  1050.         if (Double.compare(q, 1) == 0) {
  1051.             return max;
  1052.         }

  1053.         // Compute target rank
  1054.         final double targetRank = q * combinedN;

  1055.         // Perform binary search using aggregated rank computation
  1056.         // Start with initial guess min + quantile * (max - min).
  1057.         double estimate = min + q * (max - min);
  1058.         double estimateRank = getAggregateRank(estimate, aggregates);
  1059.         double lower;
  1060.         double upper;
  1061.         if (estimateRank > targetRank) {
  1062.             upper = estimate;
  1063.             lower = min;
  1064.         } else if (estimateRank < targetRank) {
  1065.             lower = estimate;
  1066.             upper = max;
  1067.         } else {
  1068.             return estimate;
  1069.         }
  1070.         final double eps = epsilon / 2;
  1071.         double intervalWidth = FastMath.abs(upper - lower);
  1072.         while (FastMath.abs(estimateRank / combinedN - q) > eps && intervalWidth > eps / combinedN) {
  1073.             if (estimateRank == targetRank) {
  1074.                 return estimate;
  1075.             }
  1076.             if (estimateRank > targetRank) {
  1077.                 upper = estimate;
  1078.             } else {
  1079.                 lower = estimate;
  1080.             }
  1081.             intervalWidth = FastMath.abs(upper - lower);
  1082.             estimate = lower + intervalWidth / 2;
  1083.             estimateRank = getAggregateRank(estimate, aggregates);
  1084.         }
  1085.         return estimate;
  1086.     }

  1087.     /**
  1088.      * Computes the estimated rank of value in the combined dataset of the aggregates.
  1089.      * Sums the values from {@link #getRank(double)}.
  1090.      *
  1091.      * @param value value whose rank is sought
  1092.      * @param aggregates collection to aggregate rank over
  1093.      * @return estimated number of elements in the combined dataset that are less than value
  1094.      */
  1095.     public double getAggregateRank(double value, Collection<RandomPercentile> aggregates) {
  1096.         double result = 0;
  1097.         for (RandomPercentile aggregate : aggregates) {
  1098.             result += aggregate.getRank(value);
  1099.         }
  1100.         return result;
  1101.     }

  1102.     /**
  1103.      * Returns the estimated quantile position of value in the combined dataset of the aggregates.
  1104.      * Specifically, what is returned is an estimate of \(|\{x \in X : x &lt; value\}| / |X|\)
  1105.      * where \(X\) is the set of values that have been consumed from all of the datastreams
  1106.      * feeding the aggregates.
  1107.      *
  1108.      * @param value value whose quantile rank is sought.
  1109.      * @param aggregates collection of RandomPercentile instances being combined
  1110.      * @return estimated proportion of combined sample values that are strictly less than {@code value}
  1111.      */
  1112.     public double getAggregateQuantileRank(double value, Collection<RandomPercentile> aggregates) {
  1113.         return getAggregateRank(value, aggregates) / getAggregateN(aggregates);
  1114.     }

  1115.     /**
  1116.      * Returns the total number of values that have been consumed by the aggregates.
  1117.      *
  1118.      * @param aggregates collection of RandomPercentile instances whose combined sample size is sought
  1119.      * @return total number of values that have been consumed by the aggregates
  1120.      */
  1121.     public double getAggregateN(Collection<RandomPercentile> aggregates) {
  1122.         double result = 0;
  1123.         for (RandomPercentile aggregate : aggregates) {
  1124.             result += aggregate.getN();
  1125.         }
  1126.         return result;
  1127.     }

  1128.     /**
  1129.      * Aggregates the provided instance into this instance.
  1130.      * <p>
  1131.      * Other must have the same buffer size as this. If the combined data size
  1132.      * exceeds the maximum storage configured for this instance, buffers are
  1133.      * merged to create capacity. If all that is needed is computation of
  1134.      * aggregate results, {@link #reduce(double, Collection)} is faster,
  1135.      * may be more accurate and does not require the buffer sizes to be the same.
  1136.      *
  1137.      * @param other the instance to aggregate into this instance
  1138.      * @throws NullArgumentException if the input is null
  1139.      * @throws IllegalArgumentException if other has different buffer size than this
  1140.      */
  1141.     @Override
  1142.     public void aggregate(RandomPercentile other)
  1143.         throws NullArgumentException {
  1144.         if (other == null) {
  1145.             throw new NullArgumentException();
  1146.         }
  1147.         if (other.s != s) {
  1148.             throw new MathIllegalArgumentException(LocalizedCoreFormats.INTERNAL_ERROR);
  1149.         }
  1150.         bufferMap.absorb(other.bufferMap);
  1151.         n += other.n;
  1152.     }

  1153.     /**
  1154.      * Returns the maximum number of {@code double} values that a {@code RandomPercentile}
  1155.      * instance created with the given {@code epsilon} value will retain in memory.
  1156.      * <p>
  1157.      * If the number of values that have been consumed from the stream is less than 1/2
  1158.      * of this value, reported statistics are exact.
  1159.      *
  1160.      * @param epsilon bound on the relative quantile error (see class javadoc)
  1161.      * @return upper bound on the total number of primitive double values retained in memory
  1162.      * @throws MathIllegalArgumentException if epsilon is not in the interval (0,1)
  1163.      */
  1164.     public static long maxValuesRetained(double epsilon) {
  1165.         if (epsilon >= 1) {
  1166.             throw new MathIllegalArgumentException(
  1167.                     LocalizedCoreFormats.NUMBER_TOO_LARGE_BOUND_EXCLUDED, epsilon, 1);
  1168.         }
  1169.         if (epsilon <= 0) {
  1170.             throw new MathIllegalArgumentException(
  1171.                     LocalizedCoreFormats.NUMBER_TOO_SMALL_BOUND_EXCLUDED, epsilon, 0);
  1172.         }
  1173.         final long h = (long) FastMath.ceil(log2(1/epsilon));
  1174.         final long s = (long) FastMath.ceil(FastMath.sqrt(log2(1/epsilon)) / epsilon);
  1175.         return (h+1) * s;
  1176.     }
  1177. }