1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 /*
19 * This is not the original file distributed by the Apache Software Foundation
20 * It has been modified by the Hipparchus project
21 */
22 package org.hipparchus.clustering;
23
24 import java.util.ArrayList;
25 import java.util.Collection;
26 import java.util.HashMap;
27 import java.util.HashSet;
28 import java.util.List;
29 import java.util.Map;
30 import java.util.Set;
31
32 import org.hipparchus.clustering.distance.DistanceMeasure;
33 import org.hipparchus.clustering.distance.EuclideanDistance;
34 import org.hipparchus.exception.LocalizedCoreFormats;
35 import org.hipparchus.exception.MathIllegalArgumentException;
36 import org.hipparchus.exception.NullArgumentException;
37 import org.hipparchus.util.MathUtils;
38
39 /**
40 * DBSCAN (density-based spatial clustering of applications with noise) algorithm.
41 * <p>
42 * The DBSCAN algorithm forms clusters based on the idea of density connectivity, i.e.
43 * a point p is density connected to another point q, if there exists a chain of
44 * points p<sub>i</sub>, with i = 1 .. n and p<sub>1</sub> = p and p<sub>n</sub> = q,
45 * such that each pair <p<sub>i</sub>, p<sub>i+1</sub>> is directly density-reachable.
46 * A point q is directly density-reachable from point p if it is in the ε-neighborhood
47 * of this point.
48 * <p>
49 * Any point that is not density-reachable from a formed cluster is treated as noise, and
50 * will thus not be present in the result.
51 * <p>
52 * The algorithm requires two parameters:
53 * <ul>
54 * <li>eps: the distance that defines the ε-neighborhood of a point
55 * <li>minPoints: the minimum number of density-connected points required to form a cluster
56 * </ul>
57 *
58 * @param <T> type of the points to cluster
59 * @see <a href="http://en.wikipedia.org/wiki/DBSCAN">DBSCAN (wikipedia)</a>
60 * @see <a href="http://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf">
61 * A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise</a>
62 */
63 public class DBSCANClusterer<T extends Clusterable> extends Clusterer<T> {
64
65 /** Maximum radius of the neighborhood to be considered. */
66 private final double eps;
67
68 /** Minimum number of points needed for a cluster. */
69 private final int minPts;
70
71 /** Status of a point during the clustering process. */
72 private enum PointStatus {
73 /** The point has is considered to be noise. */
74 NOISE,
75 /** The point is already part of a cluster. */
76 PART_OF_CLUSTER
77 }
78
79 /**
80 * Creates a new instance of a DBSCANClusterer.
81 * <p>
82 * The euclidean distance will be used as default distance measure.
83 *
84 * @param eps maximum radius of the neighborhood to be considered
85 * @param minPts minimum number of points needed for a cluster
86 * @throws MathIllegalArgumentException if {@code eps < 0.0} or {@code minPts < 0}
87 */
88 public DBSCANClusterer(final double eps, final int minPts)
89 throws MathIllegalArgumentException {
90 this(eps, minPts, new EuclideanDistance());
91 }
92
93 /**
94 * Creates a new instance of a DBSCANClusterer.
95 *
96 * @param eps maximum radius of the neighborhood to be considered
97 * @param minPts minimum number of points needed for a cluster
98 * @param measure the distance measure to use
99 * @throws MathIllegalArgumentException if {@code eps < 0.0} or {@code minPts < 0}
100 */
101 public DBSCANClusterer(final double eps, final int minPts, final DistanceMeasure measure)
102 throws MathIllegalArgumentException {
103 super(measure);
104
105 if (eps < 0.0d) {
106 throw new MathIllegalArgumentException(LocalizedCoreFormats.NUMBER_TOO_SMALL, eps, 0);
107 }
108 if (minPts < 0) {
109 throw new MathIllegalArgumentException(LocalizedCoreFormats.NUMBER_TOO_SMALL, minPts, 0);
110 }
111 this.eps = eps;
112 this.minPts = minPts;
113 }
114
115 /**
116 * Returns the maximum radius of the neighborhood to be considered.
117 * @return maximum radius of the neighborhood
118 */
119 public double getEps() {
120 return eps;
121 }
122
123 /**
124 * Returns the minimum number of points needed for a cluster.
125 * @return minimum number of points needed for a cluster
126 */
127 public int getMinPts() {
128 return minPts;
129 }
130
131 /**
132 * Performs DBSCAN cluster analysis.
133 *
134 * @param points the points to cluster
135 * @return the list of clusters
136 * @throws NullArgumentException if the data points are null
137 */
138 @Override
139 public List<Cluster<T>> cluster(final Collection<T> points) throws NullArgumentException {
140
141 // sanity checks
142 MathUtils.checkNotNull(points);
143
144 final List<Cluster<T>> clusters = new ArrayList<>();
145 final Map<Clusterable, PointStatus> visited = new HashMap<>();
146
147 for (final T point : points) {
148 if (visited.get(point) != null) {
149 continue;
150 }
151 final List<T> neighbors = getNeighbors(point, points);
152 if (neighbors.size() >= minPts) {
153 // DBSCAN does not care about center points
154 final Cluster<T> cluster = new Cluster<>();
155 clusters.add(expandCluster(cluster, point, neighbors, points, visited));
156 } else {
157 visited.put(point, PointStatus.NOISE);
158 }
159 }
160
161 return clusters;
162 }
163
164 /**
165 * Expands the cluster to include density-reachable items.
166 *
167 * @param cluster Cluster to expand
168 * @param point Point to add to cluster
169 * @param neighbors List of neighbors
170 * @param points the data set
171 * @param visited the set of already visited points
172 * @return the expanded cluster
173 */
174 private Cluster<T> expandCluster(final Cluster<T> cluster,
175 final T point,
176 final List<T> neighbors,
177 final Collection<T> points,
178 final Map<Clusterable, PointStatus> visited) {
179 cluster.addPoint(point);
180 visited.put(point, PointStatus.PART_OF_CLUSTER);
181
182 List<T> seeds = new ArrayList<>(neighbors);
183 int index = 0;
184 while (index < seeds.size()) {
185 final T current = seeds.get(index);
186 PointStatus pStatus = visited.get(current);
187 // only check non-visited points
188 if (pStatus == null) {
189 final List<T> currentNeighbors = getNeighbors(current, points);
190 if (currentNeighbors.size() >= minPts) {
191 seeds = merge(seeds, currentNeighbors);
192 }
193 }
194
195 if (pStatus != PointStatus.PART_OF_CLUSTER) {
196 visited.put(current, PointStatus.PART_OF_CLUSTER);
197 cluster.addPoint(current);
198 }
199
200 index++;
201 }
202 return cluster;
203 }
204
205 /**
206 * Returns a list of density-reachable neighbors of a {@code point}.
207 *
208 * @param point the point to look for
209 * @param points possible neighbors
210 * @return the List of neighbors
211 */
212 private List<T> getNeighbors(final T point, final Collection<T> points) {
213 final List<T> neighbors = new ArrayList<>();
214 for (final T neighbor : points) {
215 if (point != neighbor && distance(neighbor, point) <= eps) {
216 neighbors.add(neighbor);
217 }
218 }
219 return neighbors;
220 }
221
222 /**
223 * Merges two lists together.
224 *
225 * @param one first list
226 * @param two second list
227 * @return merged lists
228 */
229 private List<T> merge(final List<T> one, final List<T> two) {
230 final Set<T> oneSet = new HashSet<>(one);
231 for (T item : two) {
232 if (!oneSet.contains(item)) {
233 one.add(item);
234 }
235 }
236 return one;
237 }
238 }