1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 /*
19 * This is not the original file distributed by the Apache Software Foundation
20 * It has been modified by the Hipparchus project
21 */
22
23 package org.hipparchus.distribution.discrete;
24
25 import org.hipparchus.exception.LocalizedCoreFormats;
26 import org.hipparchus.exception.MathIllegalArgumentException;
27 import org.hipparchus.util.FastMath;
28
29 /**
30 * Implementation of the Zipf distribution.
31 * <p>
32 * <strong>Parameters:</strong>
33 * For a random variable {@code X} whose values are distributed according to this
34 * distribution, the probability mass function is given by
35 * </p>
36 * <pre>
37 * P(X = k) = H(N,s) * 1 / k^s for {@code k = 1,2,...,N}.
38 * </pre>
39 * <p>
40 * {@code H(N,s)} is the normalizing constant
41 * which corresponds to the generalized harmonic number of order N of s.
42 * </p>
43 * <ul>
44 * <li>{@code N} is the number of elements</li>
45 * <li>{@code s} is the exponent</li>
46 * </ul>
47 *
48 * @see <a href="https://en.wikipedia.org/wiki/Zipf's_law">Zipf's law (Wikipedia)</a>
49 * @see <a href="https://en.wikipedia.org/wiki/Harmonic_number#Generalized_harmonic_numbers">Generalized harmonic numbers</a>
50 */
51 public class ZipfDistribution extends AbstractIntegerDistribution {
52 /** Serializable version identifier. */
53 private static final long serialVersionUID = 20150501L;
54 /** Number of elements. */
55 private final int numberOfElements;
56 /** Exponent parameter of the distribution. */
57 private final double exponent;
58 /** Cached values of the nth generalized harmonic. */
59 private final double nthHarmonic;
60 /** Cached numerical mean */
61 private double numericalMean = Double.NaN;
62 /** Whether or not the numerical mean has been calculated */
63 private boolean numericalMeanIsCalculated;
64 /** Cached numerical variance */
65 private double numericalVariance = Double.NaN;
66 /** Whether or not the numerical variance has been calculated */
67 private boolean numericalVarianceIsCalculated;
68
69 /**
70 * Create a new Zipf distribution with the given number of elements and
71 * exponent.
72 *
73 * @param numberOfElements Number of elements.
74 * @param exponent Exponent.
75 * @exception MathIllegalArgumentException if {@code numberOfElements <= 0}
76 * or {@code exponent <= 0}.
77 */
78 public ZipfDistribution(final int numberOfElements, final double exponent)
79 throws MathIllegalArgumentException {
80 if (numberOfElements <= 0) {
81 throw new MathIllegalArgumentException(LocalizedCoreFormats.DIMENSION,
82 numberOfElements);
83 }
84 if (exponent <= 0) {
85 throw new MathIllegalArgumentException(LocalizedCoreFormats.EXPONENT,
86 exponent);
87 }
88
89 this.numberOfElements = numberOfElements;
90 this.exponent = exponent;
91 this.nthHarmonic = generalizedHarmonic(numberOfElements, exponent);
92 }
93
94 /**
95 * Get the number of elements (e.g. corpus size) for the distribution.
96 *
97 * @return the number of elements
98 */
99 public int getNumberOfElements() {
100 return numberOfElements;
101 }
102
103 /**
104 * Get the exponent characterizing the distribution.
105 *
106 * @return the exponent
107 */
108 public double getExponent() {
109 return exponent;
110 }
111
112 /** {@inheritDoc} */
113 @Override
114 public double probability(final int x) {
115 if (x <= 0 || x > numberOfElements) {
116 return 0.0;
117 }
118
119 return (1.0 / FastMath.pow(x, exponent)) / nthHarmonic;
120 }
121
122 /** {@inheritDoc} */
123 @Override
124 public double logProbability(int x) {
125 if (x <= 0 || x > numberOfElements) {
126 return Double.NEGATIVE_INFINITY;
127 }
128
129 return -FastMath.log(x) * exponent - FastMath.log(nthHarmonic);
130 }
131
132 /** {@inheritDoc} */
133 @Override
134 public double cumulativeProbability(final int x) {
135 if (x <= 0) {
136 return 0.0;
137 } else if (x >= numberOfElements) {
138 return 1.0;
139 }
140
141 return generalizedHarmonic(x, exponent) / nthHarmonic;
142 }
143
144 /**
145 * {@inheritDoc}
146 *
147 * For number of elements {@code N} and exponent {@code s}, the mean is
148 * {@code Hs1 / Hs}, where
149 * <ul>
150 * <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li>
151 * <li>{@code Hs = generalizedHarmonic(N, s)}.</li>
152 * </ul>
153 */
154 @Override
155 public double getNumericalMean() {
156 if (!numericalMeanIsCalculated) {
157 numericalMean = calculateNumericalMean();
158 numericalMeanIsCalculated = true;
159 }
160 return numericalMean;
161 }
162
163 /**
164 * Used by {@link #getNumericalMean()}.
165 *
166 * @return the mean of this distribution
167 */
168 protected double calculateNumericalMean() {
169 final int N = getNumberOfElements();
170 final double s = getExponent();
171
172 final double Hs1 = generalizedHarmonic(N, s - 1);
173 final double Hs = nthHarmonic;
174
175 return Hs1 / Hs;
176 }
177
178 /**
179 * {@inheritDoc}
180 *
181 * For number of elements {@code N} and exponent {@code s}, the mean is
182 * {@code (Hs2 / Hs) - (Hs1^2 / Hs^2)}, where
183 * <ul>
184 * <li>{@code Hs2 = generalizedHarmonic(N, s - 2)},</li>
185 * <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li>
186 * <li>{@code Hs = generalizedHarmonic(N, s)}.</li>
187 * </ul>
188 */
189 @Override
190 public double getNumericalVariance() {
191 if (!numericalVarianceIsCalculated) {
192 numericalVariance = calculateNumericalVariance();
193 numericalVarianceIsCalculated = true;
194 }
195 return numericalVariance;
196 }
197
198 /**
199 * Used by {@link #getNumericalVariance()}.
200 *
201 * @return the variance of this distribution
202 */
203 protected double calculateNumericalVariance() {
204 final int N = getNumberOfElements();
205 final double s = getExponent();
206
207 final double Hs2 = generalizedHarmonic(N, s - 2);
208 final double Hs1 = generalizedHarmonic(N, s - 1);
209 final double Hs = nthHarmonic;
210
211 return (Hs2 / Hs) - ((Hs1 * Hs1) / (Hs * Hs));
212 }
213
214 /**
215 * Calculates the Nth generalized harmonic number. See
216 * <a href="http://mathworld.wolfram.com/HarmonicSeries.html">Harmonic
217 * Series</a>.
218 *
219 * @param n Term in the series to calculate (must be larger than 1)
220 * @param m Exponent (special case {@code m = 1} is the harmonic series).
221 * @return the n<sup>th</sup> generalized harmonic number.
222 */
223 private double generalizedHarmonic(final int n, final double m) {
224 double value = 0;
225 for (int k = n; k > 0; --k) {
226 value += 1.0 / FastMath.pow(k, m);
227 }
228 return value;
229 }
230
231 /**
232 * {@inheritDoc}
233 *
234 * The lower bound of the support is always 1 no matter the parameters.
235 *
236 * @return lower bound of the support (always 1)
237 */
238 @Override
239 public int getSupportLowerBound() {
240 return 1;
241 }
242
243 /**
244 * {@inheritDoc}
245 *
246 * The upper bound of the support is the number of elements.
247 *
248 * @return upper bound of the support
249 */
250 @Override
251 public int getSupportUpperBound() {
252 return getNumberOfElements();
253 }
254
255 /**
256 * {@inheritDoc}
257 *
258 * The support of this distribution is connected.
259 *
260 * @return {@code true}
261 */
262 @Override
263 public boolean isSupportConnected() {
264 return true;
265 }
266 }