1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * https://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 /* 19 * This is not the original file distributed by the Apache Software Foundation 20 * It has been modified by the Hipparchus project 21 */ 22 package org.hipparchus.clustering; 23 24 import java.util.ArrayList; 25 import java.util.Collection; 26 import java.util.HashMap; 27 import java.util.HashSet; 28 import java.util.List; 29 import java.util.Map; 30 import java.util.Set; 31 32 import org.hipparchus.clustering.distance.DistanceMeasure; 33 import org.hipparchus.clustering.distance.EuclideanDistance; 34 import org.hipparchus.exception.LocalizedCoreFormats; 35 import org.hipparchus.exception.MathIllegalArgumentException; 36 import org.hipparchus.exception.NullArgumentException; 37 import org.hipparchus.util.MathUtils; 38 39 /** 40 * DBSCAN (density-based spatial clustering of applications with noise) algorithm. 41 * <p> 42 * The DBSCAN algorithm forms clusters based on the idea of density connectivity, i.e. 43 * a point p is density connected to another point q, if there exists a chain of 44 * points p<sub>i</sub>, with i = 1 .. n and p<sub>1</sub> = p and p<sub>n</sub> = q, 45 * such that each pair <p<sub>i</sub>, p<sub>i+1</sub>> is directly density-reachable. 46 * A point q is directly density-reachable from point p if it is in the ε-neighborhood 47 * of this point. 48 * <p> 49 * Any point that is not density-reachable from a formed cluster is treated as noise, and 50 * will thus not be present in the result. 51 * <p> 52 * The algorithm requires two parameters: 53 * <ul> 54 * <li>eps: the distance that defines the ε-neighborhood of a point 55 * <li>minPoints: the minimum number of density-connected points required to form a cluster 56 * </ul> 57 * 58 * @param <T> type of the points to cluster 59 * @see <a href="http://en.wikipedia.org/wiki/DBSCAN">DBSCAN (wikipedia)</a> 60 * @see <a href="http://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf"> 61 * A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise</a> 62 */ 63 public class DBSCANClusterer<T extends Clusterable> extends Clusterer<T> { 64 65 /** Maximum radius of the neighborhood to be considered. */ 66 private final double eps; 67 68 /** Minimum number of points needed for a cluster. */ 69 private final int minPts; 70 71 /** Status of a point during the clustering process. */ 72 private enum PointStatus { 73 /** The point has is considered to be noise. */ 74 NOISE, 75 /** The point is already part of a cluster. */ 76 PART_OF_CLUSTER 77 } 78 79 /** 80 * Creates a new instance of a DBSCANClusterer. 81 * <p> 82 * The euclidean distance will be used as default distance measure. 83 * 84 * @param eps maximum radius of the neighborhood to be considered 85 * @param minPts minimum number of points needed for a cluster 86 * @throws MathIllegalArgumentException if {@code eps < 0.0} or {@code minPts < 0} 87 */ 88 public DBSCANClusterer(final double eps, final int minPts) 89 throws MathIllegalArgumentException { 90 this(eps, minPts, new EuclideanDistance()); 91 } 92 93 /** 94 * Creates a new instance of a DBSCANClusterer. 95 * 96 * @param eps maximum radius of the neighborhood to be considered 97 * @param minPts minimum number of points needed for a cluster 98 * @param measure the distance measure to use 99 * @throws MathIllegalArgumentException if {@code eps < 0.0} or {@code minPts < 0} 100 */ 101 public DBSCANClusterer(final double eps, final int minPts, final DistanceMeasure measure) 102 throws MathIllegalArgumentException { 103 super(measure); 104 105 if (eps < 0.0d) { 106 throw new MathIllegalArgumentException(LocalizedCoreFormats.NUMBER_TOO_SMALL, eps, 0); 107 } 108 if (minPts < 0) { 109 throw new MathIllegalArgumentException(LocalizedCoreFormats.NUMBER_TOO_SMALL, minPts, 0); 110 } 111 this.eps = eps; 112 this.minPts = minPts; 113 } 114 115 /** 116 * Returns the maximum radius of the neighborhood to be considered. 117 * @return maximum radius of the neighborhood 118 */ 119 public double getEps() { 120 return eps; 121 } 122 123 /** 124 * Returns the minimum number of points needed for a cluster. 125 * @return minimum number of points needed for a cluster 126 */ 127 public int getMinPts() { 128 return minPts; 129 } 130 131 /** 132 * Performs DBSCAN cluster analysis. 133 * 134 * @param points the points to cluster 135 * @return the list of clusters 136 * @throws NullArgumentException if the data points are null 137 */ 138 @Override 139 public List<Cluster<T>> cluster(final Collection<T> points) throws NullArgumentException { 140 141 // sanity checks 142 MathUtils.checkNotNull(points); 143 144 final List<Cluster<T>> clusters = new ArrayList<>(); 145 final Map<Clusterable, PointStatus> visited = new HashMap<>(); 146 147 for (final T point : points) { 148 if (visited.get(point) != null) { 149 continue; 150 } 151 final List<T> neighbors = getNeighbors(point, points); 152 if (neighbors.size() >= minPts) { 153 // DBSCAN does not care about center points 154 final Cluster<T> cluster = new Cluster<>(); 155 clusters.add(expandCluster(cluster, point, neighbors, points, visited)); 156 } else { 157 visited.put(point, PointStatus.NOISE); 158 } 159 } 160 161 return clusters; 162 } 163 164 /** 165 * Expands the cluster to include density-reachable items. 166 * 167 * @param cluster Cluster to expand 168 * @param point Point to add to cluster 169 * @param neighbors List of neighbors 170 * @param points the data set 171 * @param visited the set of already visited points 172 * @return the expanded cluster 173 */ 174 private Cluster<T> expandCluster(final Cluster<T> cluster, 175 final T point, 176 final List<T> neighbors, 177 final Collection<T> points, 178 final Map<Clusterable, PointStatus> visited) { 179 cluster.addPoint(point); 180 visited.put(point, PointStatus.PART_OF_CLUSTER); 181 182 List<T> seeds = new ArrayList<>(neighbors); 183 int index = 0; 184 while (index < seeds.size()) { 185 final T current = seeds.get(index); 186 PointStatus pStatus = visited.get(current); 187 // only check non-visited points 188 if (pStatus == null) { 189 final List<T> currentNeighbors = getNeighbors(current, points); 190 if (currentNeighbors.size() >= minPts) { 191 seeds = merge(seeds, currentNeighbors); 192 } 193 } 194 195 if (pStatus != PointStatus.PART_OF_CLUSTER) { 196 visited.put(current, PointStatus.PART_OF_CLUSTER); 197 cluster.addPoint(current); 198 } 199 200 index++; 201 } 202 return cluster; 203 } 204 205 /** 206 * Returns a list of density-reachable neighbors of a {@code point}. 207 * 208 * @param point the point to look for 209 * @param points possible neighbors 210 * @return the List of neighbors 211 */ 212 private List<T> getNeighbors(final T point, final Collection<T> points) { 213 final List<T> neighbors = new ArrayList<>(); 214 for (final T neighbor : points) { 215 if (point != neighbor && distance(neighbor, point) <= eps) { 216 neighbors.add(neighbor); 217 } 218 } 219 return neighbors; 220 } 221 222 /** 223 * Merges two lists together. 224 * 225 * @param one first list 226 * @param two second list 227 * @return merged lists 228 */ 229 private List<T> merge(final List<T> one, final List<T> two) { 230 final Set<T> oneSet = new HashSet<>(one); 231 for (T item : two) { 232 if (!oneSet.contains(item)) { 233 one.add(item); 234 } 235 } 236 return one; 237 } 238 }