1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * https://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 /* 19 * This is not the original file distributed by the Apache Software Foundation 20 * It has been modified by the Hipparchus project 21 */ 22 23 package org.hipparchus.distribution.discrete; 24 25 import org.hipparchus.exception.LocalizedCoreFormats; 26 import org.hipparchus.exception.MathIllegalArgumentException; 27 import org.hipparchus.util.FastMath; 28 29 /** 30 * Implementation of the Zipf distribution. 31 * <p> 32 * <strong>Parameters:</strong> 33 * For a random variable {@code X} whose values are distributed according to this 34 * distribution, the probability mass function is given by 35 * </p> 36 * <pre> 37 * P(X = k) = H(N,s) * 1 / k^s for {@code k = 1,2,...,N}. 38 * </pre> 39 * <p> 40 * {@code H(N,s)} is the normalizing constant 41 * which corresponds to the generalized harmonic number of order N of s. 42 * </p> 43 * <ul> 44 * <li>{@code N} is the number of elements</li> 45 * <li>{@code s} is the exponent</li> 46 * </ul> 47 * 48 * @see <a href="https://en.wikipedia.org/wiki/Zipf's_law">Zipf's law (Wikipedia)</a> 49 * @see <a href="https://en.wikipedia.org/wiki/Harmonic_number#Generalized_harmonic_numbers">Generalized harmonic numbers</a> 50 */ 51 public class ZipfDistribution extends AbstractIntegerDistribution { 52 /** Serializable version identifier. */ 53 private static final long serialVersionUID = 20150501L; 54 /** Number of elements. */ 55 private final int numberOfElements; 56 /** Exponent parameter of the distribution. */ 57 private final double exponent; 58 /** Cached values of the nth generalized harmonic. */ 59 private final double nthHarmonic; 60 /** Cached numerical mean */ 61 private double numericalMean = Double.NaN; 62 /** Whether or not the numerical mean has been calculated */ 63 private boolean numericalMeanIsCalculated; 64 /** Cached numerical variance */ 65 private double numericalVariance = Double.NaN; 66 /** Whether or not the numerical variance has been calculated */ 67 private boolean numericalVarianceIsCalculated; 68 69 /** 70 * Create a new Zipf distribution with the given number of elements and 71 * exponent. 72 * 73 * @param numberOfElements Number of elements. 74 * @param exponent Exponent. 75 * @exception MathIllegalArgumentException if {@code numberOfElements <= 0} 76 * or {@code exponent <= 0}. 77 */ 78 public ZipfDistribution(final int numberOfElements, final double exponent) 79 throws MathIllegalArgumentException { 80 if (numberOfElements <= 0) { 81 throw new MathIllegalArgumentException(LocalizedCoreFormats.DIMENSION, 82 numberOfElements); 83 } 84 if (exponent <= 0) { 85 throw new MathIllegalArgumentException(LocalizedCoreFormats.EXPONENT, 86 exponent); 87 } 88 89 this.numberOfElements = numberOfElements; 90 this.exponent = exponent; 91 this.nthHarmonic = generalizedHarmonic(numberOfElements, exponent); 92 } 93 94 /** 95 * Get the number of elements (e.g. corpus size) for the distribution. 96 * 97 * @return the number of elements 98 */ 99 public int getNumberOfElements() { 100 return numberOfElements; 101 } 102 103 /** 104 * Get the exponent characterizing the distribution. 105 * 106 * @return the exponent 107 */ 108 public double getExponent() { 109 return exponent; 110 } 111 112 /** {@inheritDoc} */ 113 @Override 114 public double probability(final int x) { 115 if (x <= 0 || x > numberOfElements) { 116 return 0.0; 117 } 118 119 return (1.0 / FastMath.pow(x, exponent)) / nthHarmonic; 120 } 121 122 /** {@inheritDoc} */ 123 @Override 124 public double logProbability(int x) { 125 if (x <= 0 || x > numberOfElements) { 126 return Double.NEGATIVE_INFINITY; 127 } 128 129 return -FastMath.log(x) * exponent - FastMath.log(nthHarmonic); 130 } 131 132 /** {@inheritDoc} */ 133 @Override 134 public double cumulativeProbability(final int x) { 135 if (x <= 0) { 136 return 0.0; 137 } else if (x >= numberOfElements) { 138 return 1.0; 139 } 140 141 return generalizedHarmonic(x, exponent) / nthHarmonic; 142 } 143 144 /** 145 * {@inheritDoc} 146 * 147 * For number of elements {@code N} and exponent {@code s}, the mean is 148 * {@code Hs1 / Hs}, where 149 * <ul> 150 * <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li> 151 * <li>{@code Hs = generalizedHarmonic(N, s)}.</li> 152 * </ul> 153 */ 154 @Override 155 public double getNumericalMean() { 156 if (!numericalMeanIsCalculated) { 157 numericalMean = calculateNumericalMean(); 158 numericalMeanIsCalculated = true; 159 } 160 return numericalMean; 161 } 162 163 /** 164 * Used by {@link #getNumericalMean()}. 165 * 166 * @return the mean of this distribution 167 */ 168 protected double calculateNumericalMean() { 169 final int N = getNumberOfElements(); 170 final double s = getExponent(); 171 172 final double Hs1 = generalizedHarmonic(N, s - 1); 173 final double Hs = nthHarmonic; 174 175 return Hs1 / Hs; 176 } 177 178 /** 179 * {@inheritDoc} 180 * 181 * For number of elements {@code N} and exponent {@code s}, the mean is 182 * {@code (Hs2 / Hs) - (Hs1^2 / Hs^2)}, where 183 * <ul> 184 * <li>{@code Hs2 = generalizedHarmonic(N, s - 2)},</li> 185 * <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li> 186 * <li>{@code Hs = generalizedHarmonic(N, s)}.</li> 187 * </ul> 188 */ 189 @Override 190 public double getNumericalVariance() { 191 if (!numericalVarianceIsCalculated) { 192 numericalVariance = calculateNumericalVariance(); 193 numericalVarianceIsCalculated = true; 194 } 195 return numericalVariance; 196 } 197 198 /** 199 * Used by {@link #getNumericalVariance()}. 200 * 201 * @return the variance of this distribution 202 */ 203 protected double calculateNumericalVariance() { 204 final int N = getNumberOfElements(); 205 final double s = getExponent(); 206 207 final double Hs2 = generalizedHarmonic(N, s - 2); 208 final double Hs1 = generalizedHarmonic(N, s - 1); 209 final double Hs = nthHarmonic; 210 211 return (Hs2 / Hs) - ((Hs1 * Hs1) / (Hs * Hs)); 212 } 213 214 /** 215 * Calculates the Nth generalized harmonic number. See 216 * <a href="http://mathworld.wolfram.com/HarmonicSeries.html">Harmonic 217 * Series</a>. 218 * 219 * @param n Term in the series to calculate (must be larger than 1) 220 * @param m Exponent (special case {@code m = 1} is the harmonic series). 221 * @return the n<sup>th</sup> generalized harmonic number. 222 */ 223 private double generalizedHarmonic(final int n, final double m) { 224 double value = 0; 225 for (int k = n; k > 0; --k) { 226 value += 1.0 / FastMath.pow(k, m); 227 } 228 return value; 229 } 230 231 /** 232 * {@inheritDoc} 233 * 234 * The lower bound of the support is always 1 no matter the parameters. 235 * 236 * @return lower bound of the support (always 1) 237 */ 238 @Override 239 public int getSupportLowerBound() { 240 return 1; 241 } 242 243 /** 244 * {@inheritDoc} 245 * 246 * The upper bound of the support is the number of elements. 247 * 248 * @return upper bound of the support 249 */ 250 @Override 251 public int getSupportUpperBound() { 252 return getNumberOfElements(); 253 } 254 255 /** 256 * {@inheritDoc} 257 * 258 * The support of this distribution is connected. 259 * 260 * @return {@code true} 261 */ 262 @Override 263 public boolean isSupportConnected() { 264 return true; 265 } 266 }