StreamingStatistics.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This is not the original file distributed by the Apache Software Foundation
* It has been modified by the Hipparchus project
*/
package org.hipparchus.stat.descriptive;
import java.io.Serializable;
import java.util.function.DoubleConsumer;
import org.hipparchus.exception.NullArgumentException;
import org.hipparchus.random.RandomGenerator;
import org.hipparchus.stat.descriptive.moment.GeometricMean;
import org.hipparchus.stat.descriptive.moment.Mean;
import org.hipparchus.stat.descriptive.moment.SecondMoment;
import org.hipparchus.stat.descriptive.moment.Variance;
import org.hipparchus.stat.descriptive.rank.Max;
import org.hipparchus.stat.descriptive.rank.Min;
import org.hipparchus.stat.descriptive.rank.RandomPercentile;
import org.hipparchus.stat.descriptive.summary.Sum;
import org.hipparchus.stat.descriptive.summary.SumOfLogs;
import org.hipparchus.stat.descriptive.summary.SumOfSquares;
import org.hipparchus.util.FastMath;
import org.hipparchus.util.MathUtils;
import org.hipparchus.util.Precision;
/**
* Computes summary statistics for a stream of data values added using the
* {@link #addValue(double) addValue} method. The data values are not stored in
* memory, so this class can be used to compute statistics for very large data
* streams.
* <p>
* By default, all statistics other than percentiles are maintained. Percentile
* calculations use an embedded {@link RandomPercentile} which carries more memory
* and compute overhead than the other statistics, so it is disabled by default.
* To enable percentiles, either pass a non-null {@link RandomGenerator} to the
* constructor or use a {@link StreamingStatisticsBuilder} to configure an instance
* with percentiles turned on. Other stats can also be selectively disabled using
* {@code StreamingStatisticsBuilder}.
* <p>
* Note: This class is not thread-safe.
*/
public class StreamingStatistics
implements StatisticalSummary, AggregatableStatistic<StreamingStatistics>,
DoubleConsumer, Serializable {
/** Serialization UID. */
private static final long serialVersionUID = 20160422L;
/** Count of values that have been added; updated by addValue, clear and aggregate. */
private long n;
/**
* Second moment accumulator shared by the mean, variance and population
* variance statistics; null when moment statistics are not maintained.
*/
private final SecondMoment secondMoment;
/** Min of values that have been added; null when extrema are not maintained. */
private final Min minImpl;
/** Max of values that have been added; null when extrema are not maintained. */
private final Max maxImpl;
/** Sum of values that have been added; null when moment statistics are not maintained. */
private final Sum sumImpl;
/** Sum of the square of each value that has been added; null when sums of squares are not maintained. */
private final SumOfSquares sumOfSquaresImpl;
/**
* Sum of logs of values that have been added (also backs the geometric mean);
* null when sums of logs are not maintained.
*/
private final SumOfLogs sumOfLogsImpl;
/** Mean of values that have been added, built on top of {@link #secondMoment}; null when moments are not maintained. */
private final Mean meanImpl;
/** Variance of values that have been added, built on top of {@link #secondMoment}; null when moments are not maintained. */
private final Variance varianceImpl;
/** Geometric mean of values that have been added, built on top of {@link #sumOfLogsImpl}; null when sums of logs are not maintained. */
private final GeometricMean geoMeanImpl;
/** Population variance of values that have been added, built on top of {@link #secondMoment}; null when moments are not maintained. */
private final Variance populationVariance;
/** Source of percentiles; null when percentiles are not computed. */
private final RandomPercentile randomPercentile;
/** Whether or not moment stats (sum, mean, variance) are maintained. */
private final boolean computeMoments;
/** Whether or not sum of squares and quadratic mean are maintained. */
private final boolean computeSumOfSquares;
/** Whether or not sum of logs and geometric mean are maintained. */
private final boolean computeSumOfLogs;
/** Whether or not min and max are maintained. */
private final boolean computeExtrema;
/**
* Construct a new StreamingStatistics instance, maintaining all statistics
* other than percentiles.
* <p>
* Equivalent to {@code new StreamingStatistics(Double.NaN, null)}: passing a
* null generator disables percentile estimation, so no epsilon is needed.
*/
public StreamingStatistics() {
this(Double.NaN, null);
}
/**
* Construct a new StreamingStatistics instance, maintaining all statistics
* other than percentiles, and maintaining percentiles only when a random
* generator is supplied.
*
* @param epsilon bound on quantile estimation error (see {@link RandomPercentile});
* not used when {@code randomGenerator} is null
* @param randomGenerator PRNG used in sampling and merge operations (null if percentiles should not be computed)
* @since 2.3
*/
public StreamingStatistics(final double epsilon, final RandomGenerator randomGenerator) {
// Moments, sums of logs, sums of squares and extrema are all enabled here;
// only percentiles depend on the arguments.
this(true, true, true, true, epsilon, randomGenerator);
}
/**
 * Private constructor used by {@link StreamingStatisticsBuilder}.
 * <p>
 * The statistics belonging to a group are instantiated only when the
 * corresponding flag is set; the fields of a disabled group are left
 * {@code null}.
 *
 * @param computeMoments whether or not moment stats (mean, sum, variance) are maintained
 * @param computeSumOfLogs whether or not sum of logs and geometric mean are maintained
 * @param computeSumOfSquares whether or not sum of squares and quadratic mean are maintained
 * @param computeExtrema whether or not min and max are maintained
 * @param epsilon bound on quantile estimation error (see {@link RandomPercentile})
 * @param randomGenerator PRNG used in sampling and merge operations (null if percentiles should not be computed)
 * @since 2.3
 */
private StreamingStatistics(final boolean computeMoments,
                            final boolean computeSumOfLogs, final boolean computeSumOfSquares,
                            final boolean computeExtrema,
                            final double epsilon, final RandomGenerator randomGenerator) {
    // Record the configuration.
    this.computeExtrema      = computeExtrema;
    this.computeSumOfSquares = computeSumOfSquares;
    this.computeSumOfLogs    = computeSumOfLogs;
    this.computeMoments      = computeMoments;

    // Moment group: mean and both variances share a single SecondMoment.
    if (computeMoments) {
        this.secondMoment       = new SecondMoment();
        this.sumImpl            = new Sum();
        this.meanImpl           = new Mean(this.secondMoment);
        this.varianceImpl       = new Variance(this.secondMoment);
        this.populationVariance = new Variance(false, this.secondMoment);
    } else {
        this.secondMoment       = null;
        this.sumImpl            = null;
        this.meanImpl           = null;
        this.varianceImpl       = null;
        this.populationVariance = null;
    }

    // Extrema group.
    if (computeExtrema) {
        this.maxImpl = new Max();
        this.minImpl = new Min();
    } else {
        this.maxImpl = null;
        this.minImpl = null;
    }

    // Sum of squares.
    if (computeSumOfSquares) {
        this.sumOfSquaresImpl = new SumOfSquares();
    } else {
        this.sumOfSquaresImpl = null;
    }

    // Sum of logs group: the geometric mean is built on the sum of logs.
    if (computeSumOfLogs) {
        this.sumOfLogsImpl = new SumOfLogs();
        this.geoMeanImpl   = new GeometricMean(this.sumOfLogsImpl);
    } else {
        this.sumOfLogsImpl = null;
        this.geoMeanImpl   = null;
    }

    // Percentiles are enabled by the mere presence of a generator.
    if (randomGenerator == null) {
        this.randomPercentile = null;
    } else {
        this.randomPercentile = new RandomPercentile(epsilon, randomGenerator);
    }
}
/**
 * A copy constructor. Creates a deep-copy of the {@code original}.
 * <p>
 * Accumulators are duplicated through their own {@code copy()} methods, and
 * the derived statistics (mean, variances, geometric mean) are rebuilt on
 * top of the copied accumulators so that they stay in sync with them.
 *
 * @param original the {@code StreamingStatistics} instance to copy
 * @throws NullArgumentException if original is null
 */
StreamingStatistics(StreamingStatistics original) throws NullArgumentException {
    MathUtils.checkNotNull(original);

    // Plain state and configuration flags.
    this.n                   = original.n;
    this.computeMoments      = original.computeMoments;
    this.computeSumOfLogs    = original.computeSumOfLogs;
    this.computeSumOfSquares = original.computeSumOfSquares;
    this.computeExtrema      = original.computeExtrema;

    // Moment group: derived statistics wrap the copied SecondMoment.
    if (original.computeMoments) {
        this.secondMoment       = original.secondMoment.copy();
        this.sumImpl            = original.sumImpl.copy();
        this.meanImpl           = new Mean(this.secondMoment);
        this.varianceImpl       = new Variance(this.secondMoment);
        this.populationVariance = new Variance(false, this.secondMoment);
    } else {
        this.secondMoment       = null;
        this.sumImpl            = null;
        this.meanImpl           = null;
        this.varianceImpl       = null;
        this.populationVariance = null;
    }

    // Extrema group.
    if (original.computeExtrema) {
        this.maxImpl = original.maxImpl.copy();
        this.minImpl = original.minImpl.copy();
    } else {
        this.maxImpl = null;
        this.minImpl = null;
    }

    // Sum of logs group: the geometric mean wraps the copied sum of logs.
    if (original.computeSumOfLogs) {
        this.sumOfLogsImpl = original.sumOfLogsImpl.copy();
        this.geoMeanImpl   = new GeometricMean(this.sumOfLogsImpl);
    } else {
        this.sumOfLogsImpl = null;
        this.geoMeanImpl   = null;
    }

    // Sum of squares.
    if (original.computeSumOfSquares) {
        this.sumOfSquaresImpl = original.sumOfSquaresImpl.copy();
    } else {
        this.sumOfSquaresImpl = null;
    }

    // Percentiles, when the original maintains them.
    if (original.randomPercentile == null) {
        this.randomPercentile = null;
    } else {
        this.randomPercentile = original.randomPercentile.copy();
    }
}
/**
* Returns a copy of this StreamingStatistics instance with the same internal state.
* <p>
* Delegates to the copy constructor.
*
* @return a copy of this
*/
public StreamingStatistics copy() {
return new StreamingStatistics(this);
}
/**
 * Return a {@link StatisticalSummaryValues} instance reporting current
 * statistics.
 * <p>
 * The summary is built from the mean, variance, count, max, min and sum as
 * reported by the corresponding getters at the time of the call.
 *
 * @return Current values of statistics
 */
public StatisticalSummary getSummary() {
    final double mean     = getMean();
    final double variance = getVariance();
    final long   count    = getN();
    final double max      = getMax();
    final double min      = getMin();
    final double sum      = getSum();
    return new StatisticalSummaryValues(mean, variance, count, max, min, sum);
}
/**
 * Add a value to the data.
 * <p>
 * The value is forwarded to every statistic this instance maintains, and
 * the count of values is incremented.
 *
 * @param value the value to add
 */
public void addValue(double value) {
    // Each enabled accumulator is updated independently of the others.
    if (computeExtrema) {
        maxImpl.increment(value);
        minImpl.increment(value);
    }
    if (computeMoments) {
        sumImpl.increment(value);
        secondMoment.increment(value);
    }
    if (computeSumOfLogs) {
        sumOfLogsImpl.increment(value);
    }
    if (computeSumOfSquares) {
        sumOfSquaresImpl.increment(value);
    }
    if (randomPercentile != null) {
        randomPercentile.increment(value);
    }
    // The count is tracked whatever the configuration.
    ++n;
}
/**
* {@inheritDoc}
* <p>
* Delegates to {@link #addValue(double)}.
*/
@Override
public void accept(double value) {
addValue(value);
}
/**
 * Resets all statistics and storage.
 * <p>
 * Every maintained accumulator is cleared and the count of values is set
 * back to zero.
 */
public void clear() {
    if (computeMoments) {
        secondMoment.clear();
        sumImpl.clear();
    }
    if (computeExtrema) {
        maxImpl.clear();
        minImpl.clear();
    }
    if (computeSumOfSquares) {
        sumOfSquaresImpl.clear();
    }
    if (computeSumOfLogs) {
        sumOfLogsImpl.clear();
    }
    if (randomPercentile != null) {
        randomPercentile.clear();
    }
    this.n = 0;
}
/**
* {@inheritDoc}
* <p>
* The count is maintained regardless of which statistics are enabled.
*/
@Override
public long getN() {
return n;
}
/**
 * {@inheritDoc}
 * <p>
 * {@code Double.NaN} is returned when this instance does not maintain extrema.
 */
@Override
public double getMax() {
    if (!computeExtrema) {
        return Double.NaN;
    }
    return maxImpl.getResult();
}
/**
 * {@inheritDoc}
 * <p>
 * {@code Double.NaN} is returned when this instance does not maintain extrema.
 */
@Override
public double getMin() {
    if (!computeExtrema) {
        return Double.NaN;
    }
    return minImpl.getResult();
}
/**
 * {@inheritDoc}
 * <p>
 * {@code Double.NaN} is returned when this instance does not maintain
 * moment statistics.
 */
@Override
public double getSum() {
    if (!computeMoments) {
        return Double.NaN;
    }
    return sumImpl.getResult();
}
/**
 * Returns the sum of the squares of the values that have been added.
 * <p>
 * {@code Double.NaN} is returned when this instance does not maintain sums
 * of squares; otherwise the result (including the value reported for an
 * empty data set) is that of the underlying {@link SumOfSquares} statistic.
 *
 * @return The sum of squares
 */
public double getSumOfSquares() {
    if (!computeSumOfSquares) {
        return Double.NaN;
    }
    return sumOfSquaresImpl.getResult();
}
/**
 * {@inheritDoc}
 * <p>
 * {@code Double.NaN} is returned when this instance does not maintain
 * moment statistics.
 */
@Override
public double getMean() {
    if (!computeMoments) {
        return Double.NaN;
    }
    return meanImpl.getResult();
}
/**
 * {@inheritDoc}
 * <p>
 * {@code Double.NaN} is returned when this instance does not maintain
 * moment statistics.
 */
@Override
public double getVariance() {
    if (!computeMoments) {
        return Double.NaN;
    }
    return varianceImpl.getResult();
}
/**
 * Returns the <a href="http://en.wikibooks.org/wiki/Statistics/Summary/Variance">
 * population variance</a> of the values that have been added.
 * <p>
 * {@code Double.NaN} is returned when this instance does not maintain
 * moment statistics; otherwise the result (including the value reported for
 * an empty data set) is that of the underlying {@link Variance} statistic.
 *
 * @return the population variance
 */
public double getPopulationVariance() {
    if (!computeMoments) {
        return Double.NaN;
    }
    return populationVariance.getResult();
}
/**
 * Returns the geometric mean of the values that have been added.
 * <p>
 * {@code Double.NaN} is returned when this instance does not maintain sums
 * of logs; otherwise the result (including the value reported for an empty
 * data set) is that of the underlying {@link GeometricMean} statistic.
 *
 * @return the geometric mean
 */
public double getGeometricMean() {
    if (!computeSumOfLogs) {
        return Double.NaN;
    }
    return geoMeanImpl.getResult();
}
/**
 * Returns the sum of the logs of the values that have been added.
 * <p>
 * {@code Double.NaN} is returned when this instance does not maintain sums
 * of logs; otherwise the result (including the value reported for an empty
 * data set) is that of the underlying {@link SumOfLogs} statistic.
 *
 * @return the sum of logs
 */
public double getSumOfLogs() {
    if (!computeSumOfLogs) {
        return Double.NaN;
    }
    return sumOfLogsImpl.getResult();
}
/**
 * Returns a statistic related to the Second Central Moment. Specifically,
 * what is returned is the sum of squared deviations from the sample mean
 * among the values that have been added.
 * <p>
 * {@code Double.NaN} is returned when this instance does not maintain
 * moment statistics; otherwise the result (including the value reported for
 * an empty or single-valued data set) is that of the underlying
 * {@link SecondMoment} statistic.
 *
 * @return second central moment statistic
 */
public double getSecondMoment() {
    if (!computeMoments) {
        return Double.NaN;
    }
    return secondMoment.getResult();
}
/**
 * Returns the quadratic mean, a.k.a.
 * <a href="http://mathworld.wolfram.com/Root-Mean-Square.html">
 * root-mean-square</a> of the available values.
 * <p>
 * It is computed as the square root of the sum of squares divided by the
 * number of values.
 *
 * @return The quadratic mean or {@code Double.NaN} if no values
 * have been added or if sums of squares are not maintained.
 */
public double getQuadraticMean() {
    if (!computeSumOfSquares) {
        return Double.NaN;
    }
    final long count = getN();
    if (count <= 0) {
        // Nothing to average yet.
        return Double.NaN;
    }
    return FastMath.sqrt(getSumOfSquares() / count);
}
/**
 * Returns the standard deviation of the values that have been added.
 * <p>
 * Double.NaN is returned if no values have been added or if moment
 * statistics are not maintained; 0 is returned when exactly one value has
 * been added.
 *
 * @return the standard deviation
 */
@Override
public double getStandardDeviation() {
    final long count = getN();
    if (!computeMoments || count <= 0) {
        return Double.NaN;
    }
    // A single value has no spread; the variance is only consulted beyond that.
    return count > 1 ? FastMath.sqrt(getVariance()) : 0.0;
}
/**
 * Returns an estimate of the median of the values that have been entered.
 * See {@link RandomPercentile} for a description of the algorithm used for large
 * data streams.
 * <p>
 * {@code Double.NaN} is returned when this instance does not compute percentiles.
 *
 * @return the median
 */
public double getMedian() {
    if (randomPercentile == null) {
        return Double.NaN;
    }
    return randomPercentile.getResult(50d);
}
/**
 * Returns an estimate of the given percentile of the values that have been entered.
 * See {@link RandomPercentile} for a description of the algorithm used for large
 * data streams.
 * <p>
 * {@code Double.NaN} is returned when this instance does not compute percentiles.
 *
 * @param percentile the desired percentile (must be between 0 and 100)
 * @return estimated percentile
 */
public double getPercentile(double percentile) {
    if (randomPercentile != null) {
        return randomPercentile.getResult(percentile);
    }
    return Double.NaN;
}
/**
 * {@inheritDoc}
 * Statistics are aggregated only when both this and other are maintaining them. For example,
 * if this.computeMoments is false, but other.computeMoments is true, the moment data in other
 * will be lost.
 * <p>
 * Nothing is changed when {@code other} holds no values.
 */
@Override
public void aggregate(StreamingStatistics other) {
    MathUtils.checkNotNull(other);
    if (other.n <= 0) {
        // An empty instance contributes nothing.
        return;
    }

    this.n += other.n;

    final boolean bothMoments      = computeMoments && other.computeMoments;
    final boolean bothExtrema      = computeExtrema && other.computeExtrema;
    final boolean bothSumOfLogs    = computeSumOfLogs && other.computeSumOfLogs;
    final boolean bothSumOfSquares = computeSumOfSquares && other.computeSumOfSquares;
    final boolean bothPercentiles  = randomPercentile != null && other.randomPercentile != null;

    if (bothMoments) {
        this.secondMoment.aggregate(other.secondMoment);
        this.sumImpl.aggregate(other.sumImpl);
    }
    if (bothExtrema) {
        this.minImpl.aggregate(other.minImpl);
        this.maxImpl.aggregate(other.maxImpl);
    }
    if (bothSumOfLogs) {
        this.sumOfLogsImpl.aggregate(other.sumOfLogsImpl);
    }
    if (bothSumOfSquares) {
        this.sumOfSquaresImpl.aggregate(other.sumOfSquaresImpl);
    }
    if (bothPercentiles) {
        this.randomPercentile.aggregate(other.randomPercentile);
    }
}
/**
 * Generates a text report displaying summary statistics from values that
 * have been added.
 * <p>
 * The report has a header line followed by one {@code label: value} line
 * per statistic, each terminated by a line feed.
 *
 * @return String with line feeds displaying statistics
 */
@Override
public String toString() {
    final String nl = "\n";
    // Initial capacity is only a rough estimate of the report size.
    final StringBuilder report = new StringBuilder(200);
    report.append("StreamingStatistics:").append(nl);
    report.append("n: ").append(getN()).append(nl);
    report.append("min: ").append(getMin()).append(nl);
    report.append("max: ").append(getMax()).append(nl);
    report.append("sum: ").append(getSum()).append(nl);
    report.append("mean: ").append(getMean()).append(nl);
    report.append("variance: ").append(getVariance()).append(nl);
    report.append("population variance: ").append(getPopulationVariance()).append(nl);
    report.append("standard deviation: ").append(getStandardDeviation()).append(nl);
    report.append("geometric mean: ").append(getGeometricMean()).append(nl);
    report.append("second moment: ").append(getSecondMoment()).append(nl);
    report.append("sum of squares: ").append(getSumOfSquares()).append(nl);
    report.append("sum of logs: ").append(getSumOfLogs()).append(nl);
    return report.toString();
}
/**
 * Returns true iff <code>object</code> is a <code>StreamingStatistics</code>
 * instance and all statistics have the same values as this.
 * <p>
 * The compared statistics are the count, max, min, sum, geometric mean,
 * mean, sum of squares, sum of logs, variance and median; floating point
 * values are compared with {@link Precision#equalsIncludingNaN(double, double)}.
 *
 * @param object the object to test equality against.
 * @return true if object equals this
 */
@Override
public boolean equals(Object object) {
    if (this == object) {
        return true;
    }
    if (!(object instanceof StreamingStatistics)) {
        return false;
    }
    final StreamingStatistics that = (StreamingStatistics) object;
    if (that.getN() != getN()) {
        return false;
    }
    return Precision.equalsIncludingNaN(that.getMax(), getMax())
        && Precision.equalsIncludingNaN(that.getMin(), getMin())
        && Precision.equalsIncludingNaN(that.getSum(), getSum())
        && Precision.equalsIncludingNaN(that.getGeometricMean(), getGeometricMean())
        && Precision.equalsIncludingNaN(that.getMean(), getMean())
        && Precision.equalsIncludingNaN(that.getSumOfSquares(), getSumOfSquares())
        && Precision.equalsIncludingNaN(that.getSumOfLogs(), getSumOfLogs())
        && Precision.equalsIncludingNaN(that.getVariance(), getVariance())
        && Precision.equalsIncludingNaN(that.getMedian(), getMedian());
}
/**
 * Returns hash code based on values of statistics.
 * <p>
 * The hash combines the same statistics, in the same order, as those
 * compared by {@link #equals(Object)}.
 *
 * @return hash code
 */
@Override
public int hashCode() {
    int hash = 31 + MathUtils.hash(getN());
    final double[] stats = {
        getMax(),
        getMin(),
        getSum(),
        getGeometricMean(),
        getMean(),
        getSumOfSquares(),
        getSumOfLogs(),
        getVariance(),
        getMedian()
    };
    for (final double stat : stats) {
        hash = hash * 31 + MathUtils.hash(stat);
    }
    return hash;
}
/**
* Returns a {@link StreamingStatisticsBuilder} to source configured
* {@code StreamingStatistics} instances.
* <p>
* The returned builder starts with moments, sums of squares, sums of logs
* and extrema enabled and with no random generator configured for percentiles.
*
* @return a StreamingStatisticsBuilder instance
*/
public static StreamingStatisticsBuilder builder() {
return new StreamingStatisticsBuilder();
}
/**
 * Builder for StreamingStatistics instances.
 * <p>
 * A new builder has moments, sums of squares, sums of logs and extrema
 * enabled, and no random generator configured for percentiles.
 */
public static class StreamingStatisticsBuilder {

    /** whether or not moment statistics are maintained by instances created by this factory */
    private boolean computeMoments;

    /** whether or not sum of squares and quadratic mean are maintained by instances created by this factory */
    private boolean computeSumOfSquares;

    /** whether or not sum of logs and geometric mean are maintained by instances created by this factory */
    private boolean computeSumOfLogs;

    /** whether or not min and max are maintained by instances created by this factory */
    private boolean computeExtrema;

    /** bound on quantile estimation error for percentiles.
     * @since 2.3
     */
    private double epsilon;

    /** PRNG used in sampling and merge operations (null when percentiles are not computed).
     * @since 2.3
     */
    private RandomGenerator randomGenerator;

    /** Simple constructor.
     * <p>
     * The percentile settings are assigned directly instead of going through
     * {@link #percentiles(double, RandomGenerator)}: that method is public and
     * overridable, and invoking it here would run subclass code before the
     * subclass has been initialized.
     */
    public StreamingStatisticsBuilder() {
        computeMoments      = true;
        computeSumOfSquares = true;
        computeSumOfLogs    = true;
        computeExtrema      = true;
        // Percentiles are off by default: no generator, hence no meaningful epsilon.
        epsilon             = Double.NaN;
        randomGenerator     = null;
    }

    /**
     * Sets the computeMoments setting of the factory
     *
     * @param arg whether or not instances created using {@link #build()} will
     * maintain moment statistics
     * @return a factory with the given computeMoments property set
     */
    public StreamingStatisticsBuilder moments(boolean arg) {
        this.computeMoments = arg;
        return this;
    }

    /**
     * Sets the computeSumOfLogs setting of the factory
     *
     * @param arg whether or not instances created using {@link #build()} will
     * maintain log sums
     * @return a factory with the given computeSumOfLogs property set
     */
    public StreamingStatisticsBuilder sumOfLogs(boolean arg) {
        this.computeSumOfLogs = arg;
        return this;
    }

    /**
     * Sets the computeSumOfSquares setting of the factory.
     *
     * @param arg whether or not instances created using {@link #build()} will
     * maintain sums of squares
     * @return a factory with the given computeSumOfSquares property set
     */
    public StreamingStatisticsBuilder sumOfSquares(boolean arg) {
        this.computeSumOfSquares = arg;
        return this;
    }

    /**
     * Sets the percentile settings of the factory.
     * <p>
     * Both values are stored as given and handed to the
     * {@code StreamingStatistics} constructor by {@link #build()}.
     *
     * @param epsilonBound bound on quantile estimation error (see {@link RandomPercentile})
     * @param generator PRNG used in sampling and merge operations
     * (null if percentiles should not be computed)
     * @return a factory with the given percentile settings
     * @since 2.3
     */
    public StreamingStatisticsBuilder percentiles(final double epsilonBound, final RandomGenerator generator) {
        this.epsilon = epsilonBound;
        this.randomGenerator = generator;
        return this;
    }

    /**
     * Sets the computeExtrema setting of the factory.
     *
     * @param arg whether or not instances created using {@link #build()} will
     * compute min and max
     * @return a factory with the given computeExtrema property set
     */
    public StreamingStatisticsBuilder extrema(boolean arg) {
        this.computeExtrema = arg;
        return this;
    }

    /**
     * Builds a StreamingStatistics instance with currently defined properties.
     *
     * @return newly configured StreamingStatistics instance
     */
    public StreamingStatistics build() {
        return new StreamingStatistics(computeMoments,
                                       computeSumOfLogs, computeSumOfSquares,
                                       computeExtrema,
                                       epsilon, randomGenerator);
    }

}
}