ZipfDistribution.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      https://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */

  17. /*
  18.  * This is not the original file distributed by the Apache Software Foundation
  19.  * It has been modified by the Hipparchus project
  20.  */

  21. package org.hipparchus.distribution.discrete;

  22. import org.hipparchus.exception.LocalizedCoreFormats;
  23. import org.hipparchus.exception.MathIllegalArgumentException;
  24. import org.hipparchus.util.FastMath;

  25. /**
  26.  * Implementation of the Zipf distribution.
  27.  * <p>
  28.  * <strong>Parameters:</strong>
  29.  * For a random variable {@code X} whose values are distributed according to this
  30.  * distribution, the probability mass function is given by
  31.  * </p>
  32.  * <pre>
  33.  *   P(X = k) = H(N,s) * 1 / k^s    for {@code k = 1,2,...,N}.
  34.  * </pre>
  35.  * <p>
  36.  * {@code H(N,s)} is the normalizing constant
  37.  * which corresponds to the generalized harmonic number of order N of s.
  38.  * </p>
  39.  * <ul>
  40.  * <li>{@code N} is the number of elements</li>
  41.  * <li>{@code s} is the exponent</li>
  42.  * </ul>
  43.  *
  44.  * @see <a href="https://en.wikipedia.org/wiki/Zipf's_law">Zipf's law (Wikipedia)</a>
  45.  * @see <a href="https://en.wikipedia.org/wiki/Harmonic_number#Generalized_harmonic_numbers">Generalized harmonic numbers</a>
  46.  */
  47. public class ZipfDistribution extends AbstractIntegerDistribution {
  48.     /** Serializable version identifier. */
  49.     private static final long serialVersionUID = 20150501L;
  50.     /** Number of elements. */
  51.     private final int numberOfElements;
  52.     /** Exponent parameter of the distribution. */
  53.     private final double exponent;
  54.     /** Cached values of the nth generalized harmonic. */
  55.     private final double nthHarmonic;
  56.     /** Cached numerical mean */
  57.     private double numericalMean = Double.NaN;
  58.     /** Whether or not the numerical mean has been calculated */
  59.     private boolean numericalMeanIsCalculated;
  60.     /** Cached numerical variance */
  61.     private double numericalVariance = Double.NaN;
  62.     /** Whether or not the numerical variance has been calculated */
  63.     private boolean numericalVarianceIsCalculated;

  64.     /**
  65.      * Create a new Zipf distribution with the given number of elements and
  66.      * exponent.
  67.      *
  68.      * @param numberOfElements Number of elements.
  69.      * @param exponent Exponent.
  70.      * @exception MathIllegalArgumentException if {@code numberOfElements <= 0}
  71.      * or {@code exponent <= 0}.
  72.      */
  73.     public ZipfDistribution(final int numberOfElements, final double exponent)
  74.         throws MathIllegalArgumentException {
  75.         if (numberOfElements <= 0) {
  76.             throw new MathIllegalArgumentException(LocalizedCoreFormats.DIMENSION,
  77.                                                    numberOfElements);
  78.         }
  79.         if (exponent <= 0) {
  80.             throw new MathIllegalArgumentException(LocalizedCoreFormats.EXPONENT,
  81.                                                    exponent);
  82.         }

  83.         this.numberOfElements = numberOfElements;
  84.         this.exponent = exponent;
  85.         this.nthHarmonic = generalizedHarmonic(numberOfElements, exponent);
  86.     }

  87.     /**
  88.      * Get the number of elements (e.g. corpus size) for the distribution.
  89.      *
  90.      * @return the number of elements
  91.      */
  92.     public int getNumberOfElements() {
  93.         return numberOfElements;
  94.     }

  95.     /**
  96.      * Get the exponent characterizing the distribution.
  97.      *
  98.      * @return the exponent
  99.      */
  100.     public double getExponent() {
  101.         return exponent;
  102.     }

  103.     /** {@inheritDoc} */
  104.     @Override
  105.     public double probability(final int x) {
  106.         if (x <= 0 || x > numberOfElements) {
  107.             return 0.0;
  108.         }

  109.         return (1.0 / FastMath.pow(x, exponent)) / nthHarmonic;
  110.     }

  111.     /** {@inheritDoc} */
  112.     @Override
  113.     public double logProbability(int x) {
  114.         if (x <= 0 || x > numberOfElements) {
  115.             return Double.NEGATIVE_INFINITY;
  116.         }

  117.         return -FastMath.log(x) * exponent - FastMath.log(nthHarmonic);
  118.     }

  119.     /** {@inheritDoc} */
  120.     @Override
  121.     public double cumulativeProbability(final int x) {
  122.         if (x <= 0) {
  123.             return 0.0;
  124.         } else if (x >= numberOfElements) {
  125.             return 1.0;
  126.         }

  127.         return generalizedHarmonic(x, exponent) / nthHarmonic;
  128.     }

  129.     /**
  130.      * {@inheritDoc}
  131.      *
  132.      * For number of elements {@code N} and exponent {@code s}, the mean is
  133.      * {@code Hs1 / Hs}, where
  134.      * <ul>
  135.      *  <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li>
  136.      *  <li>{@code Hs = generalizedHarmonic(N, s)}.</li>
  137.      * </ul>
  138.      */
  139.     @Override
  140.     public double getNumericalMean() {
  141.         if (!numericalMeanIsCalculated) {
  142.             numericalMean = calculateNumericalMean();
  143.             numericalMeanIsCalculated = true;
  144.         }
  145.         return numericalMean;
  146.     }

  147.     /**
  148.      * Used by {@link #getNumericalMean()}.
  149.      *
  150.      * @return the mean of this distribution
  151.      */
  152.     protected double calculateNumericalMean() {
  153.         final int N = getNumberOfElements();
  154.         final double s = getExponent();

  155.         final double Hs1 = generalizedHarmonic(N, s - 1);
  156.         final double Hs = nthHarmonic;

  157.         return Hs1 / Hs;
  158.     }

  159.     /**
  160.      * {@inheritDoc}
  161.      *
  162.      * For number of elements {@code N} and exponent {@code s}, the mean is
  163.      * {@code (Hs2 / Hs) - (Hs1^2 / Hs^2)}, where
  164.      * <ul>
  165.      *  <li>{@code Hs2 = generalizedHarmonic(N, s - 2)},</li>
  166.      *  <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li>
  167.      *  <li>{@code Hs = generalizedHarmonic(N, s)}.</li>
  168.      * </ul>
  169.      */
  170.     @Override
  171.     public double getNumericalVariance() {
  172.         if (!numericalVarianceIsCalculated) {
  173.             numericalVariance = calculateNumericalVariance();
  174.             numericalVarianceIsCalculated = true;
  175.         }
  176.         return numericalVariance;
  177.     }

  178.     /**
  179.      * Used by {@link #getNumericalVariance()}.
  180.      *
  181.      * @return the variance of this distribution
  182.      */
  183.     protected double calculateNumericalVariance() {
  184.         final int N = getNumberOfElements();
  185.         final double s = getExponent();

  186.         final double Hs2 = generalizedHarmonic(N, s - 2);
  187.         final double Hs1 = generalizedHarmonic(N, s - 1);
  188.         final double Hs = nthHarmonic;

  189.         return (Hs2 / Hs) - ((Hs1 * Hs1) / (Hs * Hs));
  190.     }

  191.     /**
  192.      * Calculates the Nth generalized harmonic number. See
  193.      * <a href="http://mathworld.wolfram.com/HarmonicSeries.html">Harmonic
  194.      * Series</a>.
  195.      *
  196.      * @param n Term in the series to calculate (must be larger than 1)
  197.      * @param m Exponent (special case {@code m = 1} is the harmonic series).
  198.      * @return the n<sup>th</sup> generalized harmonic number.
  199.      */
  200.     private double generalizedHarmonic(final int n, final double m) {
  201.         double value = 0;
  202.         for (int k = n; k > 0; --k) {
  203.             value += 1.0 / FastMath.pow(k, m);
  204.         }
  205.         return value;
  206.     }

  207.     /**
  208.      * {@inheritDoc}
  209.      *
  210.      * The lower bound of the support is always 1 no matter the parameters.
  211.      *
  212.      * @return lower bound of the support (always 1)
  213.      */
  214.     @Override
  215.     public int getSupportLowerBound() {
  216.         return 1;
  217.     }

  218.     /**
  219.      * {@inheritDoc}
  220.      *
  221.      * The upper bound of the support is the number of elements.
  222.      *
  223.      * @return upper bound of the support
  224.      */
  225.     @Override
  226.     public int getSupportUpperBound() {
  227.         return getNumberOfElements();
  228.     }

  229.     /**
  230.      * {@inheritDoc}
  231.      *
  232.      * The support of this distribution is connected.
  233.      *
  234.      * @return {@code true}
  235.      */
  236.     @Override
  237.     public boolean isSupportConnected() {
  238.         return true;
  239.     }
  240. }