DBSCANClusterer.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      https://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */

  17. /*
  18.  * This is not the original file distributed by the Apache Software Foundation
  19.  * It has been modified by the Hipparchus project
  20.  */
  21. package org.hipparchus.clustering;

  22. import java.util.ArrayList;
  23. import java.util.Collection;
  24. import java.util.HashMap;
  25. import java.util.HashSet;
  26. import java.util.List;
  27. import java.util.Map;
  28. import java.util.Set;

  29. import org.hipparchus.clustering.distance.DistanceMeasure;
  30. import org.hipparchus.clustering.distance.EuclideanDistance;
  31. import org.hipparchus.exception.LocalizedCoreFormats;
  32. import org.hipparchus.exception.MathIllegalArgumentException;
  33. import org.hipparchus.exception.NullArgumentException;
  34. import org.hipparchus.util.MathUtils;

  35. /**
  36.  * DBSCAN (density-based spatial clustering of applications with noise) algorithm.
  37.  * <p>
  38.  * The DBSCAN algorithm forms clusters based on the idea of density connectivity, i.e.
  39.  * a point p is density connected to another point q, if there exists a chain of
  40.  * points p<sub>i</sub>, with i = 1 .. n and p<sub>1</sub> = p and p<sub>n</sub> = q,
  41.  * such that each pair &lt;p<sub>i</sub>, p<sub>i+1</sub>&gt; is directly density-reachable.
  42.  * A point q is directly density-reachable from point p if it is in the &epsilon;-neighborhood
  43.  * of this point.
  44.  * <p>
  45.  * Any point that is not density-reachable from a formed cluster is treated as noise, and
  46.  * will thus not be present in the result.
  47.  * <p>
  48.  * The algorithm requires two parameters:
  49.  * <ul>
  50.  *   <li>eps: the distance that defines the &epsilon;-neighborhood of a point
  51.  *   <li>minPoints: the minimum number of density-connected points required to form a cluster
  52.  * </ul>
  53.  *
  54.  * @param <T> type of the points to cluster
  55.  * @see <a href="http://en.wikipedia.org/wiki/DBSCAN">DBSCAN (wikipedia)</a>
  56.  * @see <a href="http://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf">
  57.  * A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise</a>
  58.  */
  59. public class DBSCANClusterer<T extends Clusterable> extends Clusterer<T> {

  60.     /** Maximum radius of the neighborhood to be considered. */
  61.     private final double              eps;

  62.     /** Minimum number of points needed for a cluster. */
  63.     private final int                 minPts;

  64.     /** Status of a point during the clustering process. */
  65.     private enum PointStatus {
  66.         /** The point has is considered to be noise. */
  67.         NOISE,
  68.         /** The point is already part of a cluster. */
  69.         PART_OF_CLUSTER
  70.     }

  71.     /**
  72.      * Creates a new instance of a DBSCANClusterer.
  73.      * <p>
  74.      * The euclidean distance will be used as default distance measure.
  75.      *
  76.      * @param eps maximum radius of the neighborhood to be considered
  77.      * @param minPts minimum number of points needed for a cluster
  78.      * @throws MathIllegalArgumentException if {@code eps < 0.0} or {@code minPts < 0}
  79.      */
  80.     public DBSCANClusterer(final double eps, final int minPts)
  81.         throws MathIllegalArgumentException {
  82.         this(eps, minPts, new EuclideanDistance());
  83.     }

  84.     /**
  85.      * Creates a new instance of a DBSCANClusterer.
  86.      *
  87.      * @param eps maximum radius of the neighborhood to be considered
  88.      * @param minPts minimum number of points needed for a cluster
  89.      * @param measure the distance measure to use
  90.      * @throws MathIllegalArgumentException if {@code eps < 0.0} or {@code minPts < 0}
  91.      */
  92.     public DBSCANClusterer(final double eps, final int minPts, final DistanceMeasure measure)
  93.         throws MathIllegalArgumentException {
  94.         super(measure);

  95.         if (eps < 0.0d) {
  96.             throw new MathIllegalArgumentException(LocalizedCoreFormats.NUMBER_TOO_SMALL, eps, 0);
  97.         }
  98.         if (minPts < 0) {
  99.             throw new MathIllegalArgumentException(LocalizedCoreFormats.NUMBER_TOO_SMALL, minPts, 0);
  100.         }
  101.         this.eps = eps;
  102.         this.minPts = minPts;
  103.     }

  104.     /**
  105.      * Returns the maximum radius of the neighborhood to be considered.
  106.      * @return maximum radius of the neighborhood
  107.      */
  108.     public double getEps() {
  109.         return eps;
  110.     }

  111.     /**
  112.      * Returns the minimum number of points needed for a cluster.
  113.      * @return minimum number of points needed for a cluster
  114.      */
  115.     public int getMinPts() {
  116.         return minPts;
  117.     }

  118.     /**
  119.      * Performs DBSCAN cluster analysis.
  120.      *
  121.      * @param points the points to cluster
  122.      * @return the list of clusters
  123.      * @throws NullArgumentException if the data points are null
  124.      */
  125.     @Override
  126.     public List<Cluster<T>> cluster(final Collection<T> points) throws NullArgumentException {

  127.         // sanity checks
  128.         MathUtils.checkNotNull(points);

  129.         final List<Cluster<T>> clusters = new ArrayList<>();
  130.         final Map<Clusterable, PointStatus> visited = new HashMap<>();

  131.         for (final T point : points) {
  132.             if (visited.get(point) != null) {
  133.                 continue;
  134.             }
  135.             final List<T> neighbors = getNeighbors(point, points);
  136.             if (neighbors.size() >= minPts) {
  137.                 // DBSCAN does not care about center points
  138.                 final Cluster<T> cluster = new Cluster<>();
  139.                 clusters.add(expandCluster(cluster, point, neighbors, points, visited));
  140.             } else {
  141.                 visited.put(point, PointStatus.NOISE);
  142.             }
  143.         }

  144.         return clusters;
  145.     }

  146.     /**
  147.      * Expands the cluster to include density-reachable items.
  148.      *
  149.      * @param cluster Cluster to expand
  150.      * @param point Point to add to cluster
  151.      * @param neighbors List of neighbors
  152.      * @param points the data set
  153.      * @param visited the set of already visited points
  154.      * @return the expanded cluster
  155.      */
  156.     private Cluster<T> expandCluster(final Cluster<T> cluster,
  157.                                      final T point,
  158.                                      final List<T> neighbors,
  159.                                      final Collection<T> points,
  160.                                      final Map<Clusterable, PointStatus> visited) {
  161.         cluster.addPoint(point);
  162.         visited.put(point, PointStatus.PART_OF_CLUSTER);

  163.         List<T> seeds = new ArrayList<>(neighbors);
  164.         int index = 0;
  165.         while (index < seeds.size()) {
  166.             final T current = seeds.get(index);
  167.             PointStatus pStatus = visited.get(current);
  168.             // only check non-visited points
  169.             if (pStatus == null) {
  170.                 final List<T> currentNeighbors = getNeighbors(current, points);
  171.                 if (currentNeighbors.size() >= minPts) {
  172.                     seeds = merge(seeds, currentNeighbors);
  173.                 }
  174.             }

  175.             if (pStatus != PointStatus.PART_OF_CLUSTER) {
  176.                 visited.put(current, PointStatus.PART_OF_CLUSTER);
  177.                 cluster.addPoint(current);
  178.             }

  179.             index++;
  180.         }
  181.         return cluster;
  182.     }

  183.     /**
  184.      * Returns a list of density-reachable neighbors of a {@code point}.
  185.      *
  186.      * @param point the point to look for
  187.      * @param points possible neighbors
  188.      * @return the List of neighbors
  189.      */
  190.     private List<T> getNeighbors(final T point, final Collection<T> points) {
  191.         final List<T> neighbors = new ArrayList<>();
  192.         for (final T neighbor : points) {
  193.             if (point != neighbor && distance(neighbor, point) <= eps) {
  194.                 neighbors.add(neighbor);
  195.             }
  196.         }
  197.         return neighbors;
  198.     }

  199.     /**
  200.      * Merges two lists together.
  201.      *
  202.      * @param one first list
  203.      * @param two second list
  204.      * @return merged lists
  205.      */
  206.     private List<T> merge(final List<T> one, final List<T> two) {
  207.         final Set<T> oneSet = new HashSet<>(one);
  208.         for (T item : two) {
  209.             if (!oneSet.contains(item)) {
  210.                 one.add(item);
  211.             }
  212.         }
  213.         return one;
  214.     }
  215. }