StreamingStatistics.java
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- /*
- * This is not the original file distributed by the Apache Software Foundation
- * It has been modified by the Hipparchus project
- */
- package org.hipparchus.stat.descriptive;
- import java.io.Serializable;
- import java.util.function.DoubleConsumer;
- import org.hipparchus.exception.NullArgumentException;
- import org.hipparchus.random.RandomGenerator;
- import org.hipparchus.stat.descriptive.moment.GeometricMean;
- import org.hipparchus.stat.descriptive.moment.Mean;
- import org.hipparchus.stat.descriptive.moment.SecondMoment;
- import org.hipparchus.stat.descriptive.moment.Variance;
- import org.hipparchus.stat.descriptive.rank.Max;
- import org.hipparchus.stat.descriptive.rank.Min;
- import org.hipparchus.stat.descriptive.rank.RandomPercentile;
- import org.hipparchus.stat.descriptive.summary.Sum;
- import org.hipparchus.stat.descriptive.summary.SumOfLogs;
- import org.hipparchus.stat.descriptive.summary.SumOfSquares;
- import org.hipparchus.util.FastMath;
- import org.hipparchus.util.MathUtils;
- import org.hipparchus.util.Precision;
- /**
- * Computes summary statistics for a stream of data values added using the
- * {@link #addValue(double) addValue} method. The data values are not stored in
- * memory, so this class can be used to compute statistics for very large data
- * streams.
- * <p>
- * By default, all statistics other than percentiles are maintained. Percentile
- * calculations use an embedded {@link RandomPercentile} which carries more memory
- * and compute overhead than the other statistics, so it is disabled by default.
- * To enable percentiles, either pass {@code true} to the constructor or use a
- * {@link StreamingStatisticsBuilder} to configure an instance with percentiles turned
- * on. Other stats can also be selectively disabled using
- * {@code StreamingStatisticsBulder}.
- * <p>
- * Note: This class is not thread-safe.
- */
- public class StreamingStatistics
- implements StatisticalSummary, AggregatableStatistic<StreamingStatistics>,
- DoubleConsumer, Serializable {
- /** Serialization UID */
- private static final long serialVersionUID = 20160422L;
- /** count of values that have been added */
- private long n;
- /** SecondMoment is used to compute the mean and variance */
- private final SecondMoment secondMoment;
- /** min of values that have been added */
- private final Min minImpl;
- /** max of values that have been added */
- private final Max maxImpl;
- /** sum of values that have been added */
- private final Sum sumImpl;
- /** sum of the square of each value that has been added */
- private final SumOfSquares sumOfSquaresImpl;
- /** sumLog of values that have been added */
- private final SumOfLogs sumOfLogsImpl;
- /** mean of values that have been added */
- private final Mean meanImpl;
- /** variance of values that have been added */
- private final Variance varianceImpl;
- /** geoMean of values that have been added */
- private final GeometricMean geoMeanImpl;
- /** population variance of values that have been added */
- private final Variance populationVariance;
- /** source of percentiles */
- private final RandomPercentile randomPercentile;
- /** whether or not moment stats (sum, mean, variance) are maintained */
- private final boolean computeMoments;
- /** whether or not sum of squares and quadratic mean are maintained */
- private final boolean computeSumOfSquares;
- /** whether or not sum of logs and geometric mean are maintained */
- private final boolean computeSumOfLogs;
- /** whether or not min and max are maintained */
- private final boolean computeExtrema;
- /**
- * Construct a new StreamingStatistics instance, maintaining all statistics
- * other than percentiles.
- */
- public StreamingStatistics() {
- this(Double.NaN, null);
- }
- /**
- * Construct a new StreamingStatistics instance, maintaining all statistics
- * other than percentiles and with/without percentiles per the arguments.
- *
- * @param epsilon bound on quantile estimation error (see {@link RandomGenerator})
- * @param randomGenerator PRNG used in sampling and merge operations (null if percentiles should not be computed)
- * @since 2.3
- */
- public StreamingStatistics(final double epsilon, final RandomGenerator randomGenerator) {
- this(true, true, true, true, epsilon, randomGenerator);
- }
- /**
- * Private constructor used by {@link StreamingStatisticsBuilder}.
- *
- * @param computeMoments whether or not moment stats (mean, sum, variance) are maintained
- * @param computeSumOfLogs whether or not sum of logs and geometric mean are maintained
- * @param computeSumOfSquares whether or not sum of squares and quadratic mean are maintained
- * @param computeExtrema whether or not min and max are maintained
- * @param epsilon bound on quantile estimation error (see {@link RandomGenerator})
- * @param randomGenerator PRNG used in sampling and merge operations (null if percentiles should not be computed)
- * @since 2.3
- */
- private StreamingStatistics(final boolean computeMoments,
- final boolean computeSumOfLogs, final boolean computeSumOfSquares,
- final boolean computeExtrema,
- final double epsilon, final RandomGenerator randomGenerator) {
- this.computeMoments = computeMoments;
- this.computeSumOfLogs = computeSumOfLogs;
- this.computeSumOfSquares = computeSumOfSquares;
- this.computeExtrema = computeExtrema;
- this.secondMoment = computeMoments ? new SecondMoment() : null;
- this.maxImpl = computeExtrema ? new Max() : null;
- this.minImpl = computeExtrema ? new Min() : null;
- this.sumImpl = computeMoments ? new Sum() : null;
- this.sumOfSquaresImpl = computeSumOfSquares ? new SumOfSquares() : null;
- this.sumOfLogsImpl = computeSumOfLogs ? new SumOfLogs() : null;
- this.meanImpl = computeMoments ? new Mean(this.secondMoment) : null;
- this.varianceImpl = computeMoments ? new Variance(this.secondMoment) : null;
- this.geoMeanImpl = computeSumOfLogs ? new GeometricMean(this.sumOfLogsImpl) : null;
- this.populationVariance = computeMoments ? new Variance(false, this.secondMoment) : null;
- this.randomPercentile = randomGenerator == null ? null : new RandomPercentile(epsilon, randomGenerator);
- }
- /**
- * A copy constructor. Creates a deep-copy of the {@code original}.
- *
- * @param original the {@code StreamingStatistics} instance to copy
- * @throws NullArgumentException if original is null
- */
- StreamingStatistics(StreamingStatistics original) throws NullArgumentException {
- MathUtils.checkNotNull(original);
- this.n = original.n;
- this.secondMoment = original.computeMoments ? original.secondMoment.copy() : null;
- this.maxImpl = original.computeExtrema ? original.maxImpl.copy() : null;
- this.minImpl = original.computeExtrema ? original.minImpl.copy() : null;
- this.sumImpl = original.computeMoments ? original.sumImpl.copy() : null;
- this.sumOfLogsImpl = original.computeSumOfLogs ? original.sumOfLogsImpl.copy() : null;
- this.sumOfSquaresImpl = original.computeSumOfSquares ? original.sumOfSquaresImpl.copy() : null;
- // Keep statistics with embedded moments in synch
- this.meanImpl = original.computeMoments ? new Mean(this.secondMoment) : null;
- this.varianceImpl = original.computeMoments ? new Variance(this.secondMoment) : null;
- this.geoMeanImpl = original.computeSumOfLogs ? new GeometricMean(this.sumOfLogsImpl) : null;
- this.populationVariance = original.computeMoments ? new Variance(false, this.secondMoment) : null;
- this.randomPercentile = original.randomPercentile != null ? original.randomPercentile.copy() : null;
- this.computeMoments = original.computeMoments;
- this.computeSumOfLogs = original.computeSumOfLogs;
- this.computeSumOfSquares = original.computeSumOfSquares;
- this.computeExtrema = original.computeExtrema;
- }
- /**
- * Returns a copy of this StreamingStatistics instance with the same internal state.
- *
- * @return a copy of this
- */
- public StreamingStatistics copy() {
- return new StreamingStatistics(this);
- }
- /**
- * Return a {@link StatisticalSummaryValues} instance reporting current
- * statistics.
- * @return Current values of statistics
- */
- public StatisticalSummary getSummary() {
- return new StatisticalSummaryValues(getMean(), getVariance(), getN(),
- getMax(), getMin(), getSum());
- }
- /**
- * Add a value to the data
- * @param value the value to add
- */
- public void addValue(double value) {
- if (computeMoments) {
- secondMoment.increment(value);
- sumImpl.increment(value);
- }
- if (computeExtrema) {
- minImpl.increment(value);
- maxImpl.increment(value);
- }
- if (computeSumOfSquares) {
- sumOfSquaresImpl.increment(value);
- }
- if (computeSumOfLogs) {
- sumOfLogsImpl.increment(value);
- }
- if (randomPercentile != null) {
- randomPercentile.increment(value);
- }
- n++;
- }
- /** {@inheritDoc} */
- @Override
- public void accept(double value) {
- addValue(value);
- }
- /**
- * Resets all statistics and storage.
- */
- public void clear() {
- this.n = 0;
- if (computeExtrema) {
- minImpl.clear();
- maxImpl.clear();
- }
- if (computeMoments) {
- sumImpl.clear();
- secondMoment.clear();
- }
- if (computeSumOfLogs) {
- sumOfLogsImpl.clear();
- }
- if (computeSumOfSquares) {
- sumOfSquaresImpl.clear();
- }
- if (randomPercentile != null) {
- randomPercentile.clear();
- }
- }
- /** {@inheritDoc} */
- @Override
- public long getN() {
- return n;
- }
- /** {@inheritDoc} */
- @Override
- public double getMax() {
- return computeExtrema ? maxImpl.getResult() : Double.NaN;
- }
- /** {@inheritDoc} */
- @Override
- public double getMin() {
- return computeExtrema ? minImpl.getResult() : Double.NaN;
- }
- /** {@inheritDoc} */
- @Override
- public double getSum() {
- return computeMoments ? sumImpl.getResult() : Double.NaN;
- }
- /**
- * Returns the sum of the squares of the values that have been added.
- * <p>
- * Double.NaN is returned if no values have been added.
- *
- * @return The sum of squares
- */
- public double getSumOfSquares() {
- return computeSumOfSquares ? sumOfSquaresImpl.getResult() : Double.NaN;
- }
- /** {@inheritDoc} */
- @Override
- public double getMean() {
- return computeMoments ? meanImpl.getResult() : Double.NaN;
- }
- /** {@inheritDoc} */
- @Override
- public double getVariance() {
- return computeMoments ? varianceImpl.getResult() : Double.NaN;
- }
- /**
- * Returns the <a href="http://en.wikibooks.org/wiki/Statistics/Summary/Variance">
- * population variance</a> of the values that have been added.
- * <p>
- * Double.NaN is returned if no values have been added.
- *
- * @return the population variance
- */
- public double getPopulationVariance() {
- return computeMoments ? populationVariance.getResult() : Double.NaN;
- }
- /**
- * Returns the geometric mean of the values that have been added.
- * <p>
- * Double.NaN is returned if no values have been added.
- *
- * @return the geometric mean
- */
- public double getGeometricMean() {
- return computeSumOfLogs ? geoMeanImpl.getResult() : Double.NaN;
- }
- /**
- * Returns the sum of the logs of the values that have been added.
- * <p>
- * Double.NaN is returned if no values have been added.
- *
- * @return the sum of logs
- */
- public double getSumOfLogs() {
- return computeSumOfLogs ? sumOfLogsImpl.getResult() : Double.NaN;
- }
- /**
- * Returns a statistic related to the Second Central Moment. Specifically,
- * what is returned is the sum of squared deviations from the sample mean
- * among the values that have been added.
- * <p>
- * Returns <code>Double.NaN</code> if no data values have been added and
- * returns <code>0</code> if there is just one value in the data set.
- *
- * @return second central moment statistic
- */
- public double getSecondMoment() {
- return computeMoments ? secondMoment.getResult() : Double.NaN;
- }
- /**
- * Returns the quadratic mean, a.k.a.
- * <a href="http://mathworld.wolfram.com/Root-Mean-Square.html">
- * root-mean-square</a> of the available values
- *
- * @return The quadratic mean or {@code Double.NaN} if no values
- * have been added.
- */
- public double getQuadraticMean() {
- if (computeSumOfSquares) {
- long size = getN();
- return size > 0 ? FastMath.sqrt(getSumOfSquares() / size) : Double.NaN;
- } else {
- return Double.NaN;
- }
- }
- /**
- * Returns the standard deviation of the values that have been added.
- * <p>
- * Double.NaN is returned if no values have been added.
- *
- * @return the standard deviation
- */
- @Override
- public double getStandardDeviation() {
- long size = getN();
- if (computeMoments) {
- if (size > 0) {
- return size > 1 ? FastMath.sqrt(getVariance()) : 0.0;
- } else {
- return Double.NaN;
- }
- } else {
- return Double.NaN;
- }
- }
- /**
- * Returns an estimate of the median of the values that have been entered.
- * See {@link RandomPercentile} for a description of the algorithm used for large
- * data streams.
- *
- * @return the median
- */
- public double getMedian() {
- return randomPercentile != null ? randomPercentile.getResult(50d) : Double.NaN;
- }
- /**
- * Returns an estimate of the given percentile of the values that have been entered.
- * See {@link RandomPercentile} for a description of the algorithm used for large
- * data streams.
- *
- * @param percentile the desired percentile (must be between 0 and 100)
- * @return estimated percentile
- */
- public double getPercentile(double percentile) {
- return randomPercentile == null ? Double.NaN : randomPercentile.getResult(percentile);
- }
- /**
- * {@inheritDoc}
- * Statistics are aggregated only when both this and other are maintaining them. For example,
- * if this.computeMoments is false, but other.computeMoments is true, the moment data in other
- * will be lost.
- */
- @Override
- public void aggregate(StreamingStatistics other) {
- MathUtils.checkNotNull(other);
- if (other.n > 0) {
- this.n += other.n;
- if (computeMoments && other.computeMoments) {
- this.secondMoment.aggregate(other.secondMoment);
- this.sumImpl.aggregate(other.sumImpl);
- }
- if (computeExtrema && other.computeExtrema) {
- this.minImpl.aggregate(other.minImpl);
- this.maxImpl.aggregate(other.maxImpl);
- }
- if (computeSumOfLogs && other.computeSumOfLogs) {
- this.sumOfLogsImpl.aggregate(other.sumOfLogsImpl);
- }
- if (computeSumOfSquares && other.computeSumOfSquares) {
- this.sumOfSquaresImpl.aggregate(other.sumOfSquaresImpl);
- }
- if (randomPercentile != null && other.randomPercentile != null) {
- this.randomPercentile.aggregate(other.randomPercentile);
- }
- }
- }
- /**
- * Generates a text report displaying summary statistics from values that
- * have been added.
- *
- * @return String with line feeds displaying statistics
- */
- @Override
- public String toString() {
- StringBuilder outBuffer = new StringBuilder(200); // the size is just a wild guess
- String endl = "\n";
- outBuffer.append("StreamingStatistics:").append(endl).
- append("n: ").append(getN()).append(endl).
- append("min: ").append(getMin()).append(endl).
- append("max: ").append(getMax()).append(endl).
- append("sum: ").append(getSum()).append(endl).
- append("mean: ").append(getMean()).append(endl).
- append("variance: ").append(getVariance()).append(endl).
- append("population variance: ").append(getPopulationVariance()).append(endl).
- append("standard deviation: ").append(getStandardDeviation()).append(endl).
- append("geometric mean: ").append(getGeometricMean()).append(endl).
- append("second moment: ").append(getSecondMoment()).append(endl).
- append("sum of squares: ").append(getSumOfSquares()).append(endl).
- append("sum of logs: ").append(getSumOfLogs()).append(endl);
- return outBuffer.toString();
- }
- /**
- * Returns true iff <code>object</code> is a <code>StreamingStatistics</code>
- * instance and all statistics have the same values as this.
- *
- * @param object the object to test equality against.
- * @return true if object equals this
- */
- @Override
- public boolean equals(Object object) {
- if (object == this) {
- return true;
- }
- if (!(object instanceof StreamingStatistics)) {
- return false;
- }
- StreamingStatistics other = (StreamingStatistics)object;
- return other.getN() == getN() &&
- Precision.equalsIncludingNaN(other.getMax(), getMax()) &&
- Precision.equalsIncludingNaN(other.getMin(), getMin()) &&
- Precision.equalsIncludingNaN(other.getSum(), getSum()) &&
- Precision.equalsIncludingNaN(other.getGeometricMean(), getGeometricMean()) &&
- Precision.equalsIncludingNaN(other.getMean(), getMean()) &&
- Precision.equalsIncludingNaN(other.getSumOfSquares(), getSumOfSquares()) &&
- Precision.equalsIncludingNaN(other.getSumOfLogs(), getSumOfLogs()) &&
- Precision.equalsIncludingNaN(other.getVariance(), getVariance()) &&
- Precision.equalsIncludingNaN(other.getMedian(), getMedian());
- }
- /**
- * Returns hash code based on values of statistics.
- * @return hash code
- */
- @Override
- public int hashCode() {
- int result = 31 + MathUtils.hash(getN());
- result = result * 31 + MathUtils.hash(getMax());
- result = result * 31 + MathUtils.hash(getMin());
- result = result * 31 + MathUtils.hash(getSum());
- result = result * 31 + MathUtils.hash(getGeometricMean());
- result = result * 31 + MathUtils.hash(getMean());
- result = result * 31 + MathUtils.hash(getSumOfSquares());
- result = result * 31 + MathUtils.hash(getSumOfLogs());
- result = result * 31 + MathUtils.hash(getVariance());
- result = result * 31 + MathUtils.hash(getMedian());
- return result;
- }
- /**
- * Returns a {@link StreamingStatisticsBuilder} to source configured
- * {@code StreamingStatistics} instances.
- *
- * @return a StreamingStatisticsBuilder instance
- */
- public static StreamingStatisticsBuilder builder() {
- return new StreamingStatisticsBuilder();
- }
- /**
- * Builder for StreamingStatistics instances.
- */
- public static class StreamingStatisticsBuilder {
- /** whether or not moment statistics are maintained by instances created by this factory */
- private boolean computeMoments;
- /** whether or not sum of squares and quadratic mean are maintained by instances created by this factory */
- private boolean computeSumOfSquares;
- /** whether or not sum of logs and geometric mean are maintained by instances created by this factory */
- private boolean computeSumOfLogs;
- /** whether or not min and max are maintained by instances created by this factory */
- private boolean computeExtrema;
- /** bound on quantile estimation error for percentiles.
- * @since 2.3
- */
- private double epsilon;
- /** PRNG used in sampling and merge operations.
- * @since 2.3
- */
- private RandomGenerator randomGenerator;
- /** Simple constructor.
- */
- public StreamingStatisticsBuilder() {
- computeMoments = true;
- computeSumOfSquares = true;
- computeSumOfLogs = true;
- computeExtrema = true;
- percentiles(Double.NaN, null);
- }
- /**
- * Sets the computeMoments setting of the factory
- *
- * @param arg whether or not instances created using {@link #build()} will
- * maintain moment statistics
- * @return a factory with the given computeMoments property set
- */
- public StreamingStatisticsBuilder moments(boolean arg) {
- this.computeMoments = arg;
- return this;
- }
- /**
- * Sets the computeSumOfLogs setting of the factory
- *
- * @param arg whether or not instances created using {@link #build()} will
- * maintain log sums
- * @return a factory with the given computeSumOfLogs property set
- */
- public StreamingStatisticsBuilder sumOfLogs(boolean arg) {
- this.computeSumOfLogs = arg;
- return this;
- }
- /**
- * Sets the computeSumOfSquares setting of the factory.
- *
- * @param arg whether or not instances created using {@link #build()} will
- * maintain sums of squares
- * @return a factory with the given computeSumOfSquares property set
- */
- public StreamingStatisticsBuilder sumOfSquares(boolean arg) {
- this.computeSumOfSquares = arg;
- return this;
- }
- /**
- * Sets the computePercentiles setting of the factory.
- * @param epsilonBound bound on quantile estimation error (see {@link RandomGenerator})
- * @param generator PRNG used in sampling and merge operations
- * @return a factory with the given computePercentiles property set
- * @since 2.3
- */
- public StreamingStatisticsBuilder percentiles(final double epsilonBound, final RandomGenerator generator) {
- this.epsilon = epsilonBound;
- this.randomGenerator = generator;
- return this;
- }
- /**
- * Sets the computeExtrema setting of the factory.
- *
- * @param arg whether or not instances created using {@link #build()} will
- * compute min and max
- * @return a factory with the given computeExtrema property set
- */
- public StreamingStatisticsBuilder extrema(boolean arg) {
- this.computeExtrema = arg;
- return this;
- }
- /**
- * Builds a StreamingStatistics instance with currently defined properties.
- *
- * @return newly configured StreamingStatistics instance
- */
- public StreamingStatistics build() {
- return new StreamingStatistics(computeMoments,
- computeSumOfLogs, computeSumOfSquares,
- computeExtrema,
- epsilon, randomGenerator);
- }
- }
- }