View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  /*
19   * This is not the original file distributed by the Apache Software Foundation
20   * It has been modified by the Hipparchus project
21   */
22  
23  package org.hipparchus.distribution.discrete;
24  
25  import org.hipparchus.exception.LocalizedCoreFormats;
26  import org.hipparchus.exception.MathIllegalArgumentException;
27  import org.hipparchus.util.FastMath;
28  
29  /**
30   * Implementation of the Zipf distribution.
31   * <p>
32   * <strong>Parameters:</strong>
33   * For a random variable {@code X} whose values are distributed according to this
34   * distribution, the probability mass function is given by
35   * </p>
36   * <pre>
37   *   P(X = k) = H(N,s) * 1 / k^s    for {@code k = 1,2,...,N}.
38   * </pre>
39   * <p>
40   * {@code H(N,s)} is the normalizing constant
41   * which corresponds to the generalized harmonic number of order N of s.
42   * </p>
43   * <ul>
44   * <li>{@code N} is the number of elements</li>
45   * <li>{@code s} is the exponent</li>
46   * </ul>
47   *
48   * @see <a href="https://en.wikipedia.org/wiki/Zipf's_law">Zipf's law (Wikipedia)</a>
49   * @see <a href="https://en.wikipedia.org/wiki/Harmonic_number#Generalized_harmonic_numbers">Generalized harmonic numbers</a>
50   */
51  public class ZipfDistribution extends AbstractIntegerDistribution {
52      /** Serializable version identifier. */
53      private static final long serialVersionUID = 20150501L;
54      /** Number of elements. */
55      private final int numberOfElements;
56      /** Exponent parameter of the distribution. */
57      private final double exponent;
58      /** Cached values of the nth generalized harmonic. */
59      private final double nthHarmonic;
60      /** Cached numerical mean */
61      private double numericalMean = Double.NaN;
62      /** Whether or not the numerical mean has been calculated */
63      private boolean numericalMeanIsCalculated;
64      /** Cached numerical variance */
65      private double numericalVariance = Double.NaN;
66      /** Whether or not the numerical variance has been calculated */
67      private boolean numericalVarianceIsCalculated;
68  
69      /**
70       * Create a new Zipf distribution with the given number of elements and
71       * exponent.
72       *
73       * @param numberOfElements Number of elements.
74       * @param exponent Exponent.
75       * @exception MathIllegalArgumentException if {@code numberOfElements <= 0}
76       * or {@code exponent <= 0}.
77       */
78      public ZipfDistribution(final int numberOfElements, final double exponent)
79          throws MathIllegalArgumentException {
80          if (numberOfElements <= 0) {
81              throw new MathIllegalArgumentException(LocalizedCoreFormats.DIMENSION,
82                                                     numberOfElements);
83          }
84          if (exponent <= 0) {
85              throw new MathIllegalArgumentException(LocalizedCoreFormats.EXPONENT,
86                                                     exponent);
87          }
88  
89          this.numberOfElements = numberOfElements;
90          this.exponent = exponent;
91          this.nthHarmonic = generalizedHarmonic(numberOfElements, exponent);
92      }
93  
94      /**
95       * Get the number of elements (e.g. corpus size) for the distribution.
96       *
97       * @return the number of elements
98       */
99      public int getNumberOfElements() {
100         return numberOfElements;
101     }
102 
103     /**
104      * Get the exponent characterizing the distribution.
105      *
106      * @return the exponent
107      */
108     public double getExponent() {
109         return exponent;
110     }
111 
112     /** {@inheritDoc} */
113     @Override
114     public double probability(final int x) {
115         if (x <= 0 || x > numberOfElements) {
116             return 0.0;
117         }
118 
119         return (1.0 / FastMath.pow(x, exponent)) / nthHarmonic;
120     }
121 
122     /** {@inheritDoc} */
123     @Override
124     public double logProbability(int x) {
125         if (x <= 0 || x > numberOfElements) {
126             return Double.NEGATIVE_INFINITY;
127         }
128 
129         return -FastMath.log(x) * exponent - FastMath.log(nthHarmonic);
130     }
131 
132     /** {@inheritDoc} */
133     @Override
134     public double cumulativeProbability(final int x) {
135         if (x <= 0) {
136             return 0.0;
137         } else if (x >= numberOfElements) {
138             return 1.0;
139         }
140 
141         return generalizedHarmonic(x, exponent) / nthHarmonic;
142     }
143 
144     /**
145      * {@inheritDoc}
146      *
147      * For number of elements {@code N} and exponent {@code s}, the mean is
148      * {@code Hs1 / Hs}, where
149      * <ul>
150      *  <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li>
151      *  <li>{@code Hs = generalizedHarmonic(N, s)}.</li>
152      * </ul>
153      */
154     @Override
155     public double getNumericalMean() {
156         if (!numericalMeanIsCalculated) {
157             numericalMean = calculateNumericalMean();
158             numericalMeanIsCalculated = true;
159         }
160         return numericalMean;
161     }
162 
163     /**
164      * Used by {@link #getNumericalMean()}.
165      *
166      * @return the mean of this distribution
167      */
168     protected double calculateNumericalMean() {
169         final int N = getNumberOfElements();
170         final double s = getExponent();
171 
172         final double Hs1 = generalizedHarmonic(N, s - 1);
173         final double Hs = nthHarmonic;
174 
175         return Hs1 / Hs;
176     }
177 
178     /**
179      * {@inheritDoc}
180      *
181      * For number of elements {@code N} and exponent {@code s}, the mean is
182      * {@code (Hs2 / Hs) - (Hs1^2 / Hs^2)}, where
183      * <ul>
184      *  <li>{@code Hs2 = generalizedHarmonic(N, s - 2)},</li>
185      *  <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li>
186      *  <li>{@code Hs = generalizedHarmonic(N, s)}.</li>
187      * </ul>
188      */
189     @Override
190     public double getNumericalVariance() {
191         if (!numericalVarianceIsCalculated) {
192             numericalVariance = calculateNumericalVariance();
193             numericalVarianceIsCalculated = true;
194         }
195         return numericalVariance;
196     }
197 
198     /**
199      * Used by {@link #getNumericalVariance()}.
200      *
201      * @return the variance of this distribution
202      */
203     protected double calculateNumericalVariance() {
204         final int N = getNumberOfElements();
205         final double s = getExponent();
206 
207         final double Hs2 = generalizedHarmonic(N, s - 2);
208         final double Hs1 = generalizedHarmonic(N, s - 1);
209         final double Hs = nthHarmonic;
210 
211         return (Hs2 / Hs) - ((Hs1 * Hs1) / (Hs * Hs));
212     }
213 
214     /**
215      * Calculates the Nth generalized harmonic number. See
216      * <a href="http://mathworld.wolfram.com/HarmonicSeries.html">Harmonic
217      * Series</a>.
218      *
219      * @param n Term in the series to calculate (must be larger than 1)
220      * @param m Exponent (special case {@code m = 1} is the harmonic series).
221      * @return the n<sup>th</sup> generalized harmonic number.
222      */
223     private double generalizedHarmonic(final int n, final double m) {
224         double value = 0;
225         for (int k = n; k > 0; --k) {
226             value += 1.0 / FastMath.pow(k, m);
227         }
228         return value;
229     }
230 
231     /**
232      * {@inheritDoc}
233      *
234      * The lower bound of the support is always 1 no matter the parameters.
235      *
236      * @return lower bound of the support (always 1)
237      */
238     @Override
239     public int getSupportLowerBound() {
240         return 1;
241     }
242 
243     /**
244      * {@inheritDoc}
245      *
246      * The upper bound of the support is the number of elements.
247      *
248      * @return upper bound of the support
249      */
250     @Override
251     public int getSupportUpperBound() {
252         return getNumberOfElements();
253     }
254 
255     /**
256      * {@inheritDoc}
257      *
258      * The support of this distribution is connected.
259      *
260      * @return {@code true}
261      */
262     @Override
263     public boolean isSupportConnected() {
264         return true;
265     }
266 }