DBSCANClusterer.java
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- /*
- * This is not the original file distributed by the Apache Software Foundation
- * It has been modified by the Hipparchus project
- */
- package org.hipparchus.clustering;
- import java.util.ArrayList;
- import java.util.Collection;
- import java.util.HashMap;
- import java.util.HashSet;
- import java.util.List;
- import java.util.Map;
- import java.util.Set;
- import org.hipparchus.clustering.distance.DistanceMeasure;
- import org.hipparchus.clustering.distance.EuclideanDistance;
- import org.hipparchus.exception.LocalizedCoreFormats;
- import org.hipparchus.exception.MathIllegalArgumentException;
- import org.hipparchus.exception.NullArgumentException;
- import org.hipparchus.util.MathUtils;
- /**
- * DBSCAN (density-based spatial clustering of applications with noise) algorithm.
- * <p>
- * The DBSCAN algorithm forms clusters based on the idea of density connectivity, i.e.
- * a point p is density connected to another point q, if there exists a chain of
- * points p<sub>i</sub>, with i = 1 .. n and p<sub>1</sub> = p and p<sub>n</sub> = q,
- * such that each pair <p<sub>i</sub>, p<sub>i+1</sub>> is directly density-reachable.
- * A point q is directly density-reachable from point p if it is in the ε-neighborhood
- * of this point.
- * <p>
- * Any point that is not density-reachable from a formed cluster is treated as noise, and
- * will thus not be present in the result.
- * <p>
- * The algorithm requires two parameters:
- * <ul>
- * <li>eps: the distance that defines the ε-neighborhood of a point
- * <li>minPoints: the minimum number of density-connected points required to form a cluster
- * </ul>
- *
- * @param <T> type of the points to cluster
- * @see <a href="http://en.wikipedia.org/wiki/DBSCAN">DBSCAN (wikipedia)</a>
- * @see <a href="http://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf">
- * A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise</a>
- */
- public class DBSCANClusterer<T extends Clusterable> extends Clusterer<T> {
- /** Maximum radius of the neighborhood to be considered. */
- private final double eps;
- /** Minimum number of points needed for a cluster. */
- private final int minPts;
- /** Status of a point during the clustering process. */
- private enum PointStatus {
- /** The point has is considered to be noise. */
- NOISE,
- /** The point is already part of a cluster. */
- PART_OF_CLUSTER
- }
- /**
- * Creates a new instance of a DBSCANClusterer.
- * <p>
- * The euclidean distance will be used as default distance measure.
- *
- * @param eps maximum radius of the neighborhood to be considered
- * @param minPts minimum number of points needed for a cluster
- * @throws MathIllegalArgumentException if {@code eps < 0.0} or {@code minPts < 0}
- */
- public DBSCANClusterer(final double eps, final int minPts)
- throws MathIllegalArgumentException {
- this(eps, minPts, new EuclideanDistance());
- }
- /**
- * Creates a new instance of a DBSCANClusterer.
- *
- * @param eps maximum radius of the neighborhood to be considered
- * @param minPts minimum number of points needed for a cluster
- * @param measure the distance measure to use
- * @throws MathIllegalArgumentException if {@code eps < 0.0} or {@code minPts < 0}
- */
- public DBSCANClusterer(final double eps, final int minPts, final DistanceMeasure measure)
- throws MathIllegalArgumentException {
- super(measure);
- if (eps < 0.0d) {
- throw new MathIllegalArgumentException(LocalizedCoreFormats.NUMBER_TOO_SMALL, eps, 0);
- }
- if (minPts < 0) {
- throw new MathIllegalArgumentException(LocalizedCoreFormats.NUMBER_TOO_SMALL, minPts, 0);
- }
- this.eps = eps;
- this.minPts = minPts;
- }
- /**
- * Returns the maximum radius of the neighborhood to be considered.
- * @return maximum radius of the neighborhood
- */
- public double getEps() {
- return eps;
- }
- /**
- * Returns the minimum number of points needed for a cluster.
- * @return minimum number of points needed for a cluster
- */
- public int getMinPts() {
- return minPts;
- }
- /**
- * Performs DBSCAN cluster analysis.
- *
- * @param points the points to cluster
- * @return the list of clusters
- * @throws NullArgumentException if the data points are null
- */
- @Override
- public List<Cluster<T>> cluster(final Collection<T> points) throws NullArgumentException {
- // sanity checks
- MathUtils.checkNotNull(points);
- final List<Cluster<T>> clusters = new ArrayList<>();
- final Map<Clusterable, PointStatus> visited = new HashMap<>();
- for (final T point : points) {
- if (visited.get(point) != null) {
- continue;
- }
- final List<T> neighbors = getNeighbors(point, points);
- if (neighbors.size() >= minPts) {
- // DBSCAN does not care about center points
- final Cluster<T> cluster = new Cluster<>();
- clusters.add(expandCluster(cluster, point, neighbors, points, visited));
- } else {
- visited.put(point, PointStatus.NOISE);
- }
- }
- return clusters;
- }
- /**
- * Expands the cluster to include density-reachable items.
- *
- * @param cluster Cluster to expand
- * @param point Point to add to cluster
- * @param neighbors List of neighbors
- * @param points the data set
- * @param visited the set of already visited points
- * @return the expanded cluster
- */
- private Cluster<T> expandCluster(final Cluster<T> cluster,
- final T point,
- final List<T> neighbors,
- final Collection<T> points,
- final Map<Clusterable, PointStatus> visited) {
- cluster.addPoint(point);
- visited.put(point, PointStatus.PART_OF_CLUSTER);
- List<T> seeds = new ArrayList<>(neighbors);
- int index = 0;
- while (index < seeds.size()) {
- final T current = seeds.get(index);
- PointStatus pStatus = visited.get(current);
- // only check non-visited points
- if (pStatus == null) {
- final List<T> currentNeighbors = getNeighbors(current, points);
- if (currentNeighbors.size() >= minPts) {
- seeds = merge(seeds, currentNeighbors);
- }
- }
- if (pStatus != PointStatus.PART_OF_CLUSTER) {
- visited.put(current, PointStatus.PART_OF_CLUSTER);
- cluster.addPoint(current);
- }
- index++;
- }
- return cluster;
- }
- /**
- * Returns a list of density-reachable neighbors of a {@code point}.
- *
- * @param point the point to look for
- * @param points possible neighbors
- * @return the List of neighbors
- */
- private List<T> getNeighbors(final T point, final Collection<T> points) {
- final List<T> neighbors = new ArrayList<>();
- for (final T neighbor : points) {
- if (point != neighbor && distance(neighbor, point) <= eps) {
- neighbors.add(neighbor);
- }
- }
- return neighbors;
- }
- /**
- * Merges two lists together.
- *
- * @param one first list
- * @param two second list
- * @return merged lists
- */
- private List<T> merge(final List<T> one, final List<T> two) {
- final Set<T> oneSet = new HashSet<>(one);
- for (T item : two) {
- if (!oneSet.contains(item)) {
- one.add(item);
- }
- }
- return one;
- }
- }