Acts/Utilities/DBScan.hpp

0001 // This file is part of the ACTS project.
0002 //
0003 // Copyright (C) 2016 CERN for the benefit of the ACTS project
0004 //
0005 // This Source Code Form is subject to the terms of the Mozilla Public
0006 // License, v. 2.0. If a copy of the MPL was not distributed with this
0007 // file, You can obtain one at https://mozilla.org/MPL/2.0/.
0008
0009 #pragma once
0010
0011 #include "Acts/Utilities/KDTree.hpp"
0012
0013 #include <algorithm>
0014 #include <array>
0015 #include <cmath>
0016 #include <functional>
0017 #include <iostream>
0018 #include <memory>
0019 #include <string>
0020 #include <vector>
0021
0022 namespace Acts {
0023 /// @brief A general implementation of an N dimensional DBScan clustering algorithm.
0024 ///
0025 /// This is a general implementation of an N dimensional DBScan clustering
0026 /// algorithm. The DBScan algorithm uses density information to cluster together
0027 /// points that are close to each other.
0028 ///
0029 /// For each point, we will look for the neighbours that are within the epsilon
0030 /// radius. If the number of neighbours is greater than the minimum number of
0031 /// points, we will start a new cluster and assign the current point to it. We
0032 /// will then look for the neighbours of the neighbours and assign them to the
0033 /// current cluster if they are not already assigned to a cluster. If the
0034 /// neighbours have itself more than the minimum number of points as neighbours,
0035 /// we will repeat the process on those neighbours.
0036 ///
0037 /// To speed up the search for the neighbours, we use the KDTree implemented in
0038 /// ACTS. It performs a range search in the orthogonal hypercube with a length
0039 /// of 2 epsilon. An extra cut is used to only keep the neighbours that are
0040 /// within the epsilon radius.
0041 ///
0042 /// @tparam kDims The number of dimensions.
0043 /// @tparam scalar_t The scalar type used to construct position vectors.
0044 /// @tparam kLeafSize The maximum number of points in a leaf node of the KDTree.
0045 template <std::size_t kDims, typename scalar_t = double,
0046           std::size_t kLeafSize = 4>
0047 class DBScan {
0048  public:
0049   /// Type alias for coordinates of points in N-dimensional space
0050   using Point = std::array<scalar_t, kDims>;
0051
0052   /// Type alias for vector of coordinate points
0053   using VectorPoints = std::vector<Point>;
0054
0055   /// Type alias for pair of point coordinates with unique identifier
0056   using Pair = std::pair<Point, std::size_t>;
0057
0058   /// Type alias for vector of coordinate-ID pairs
0059   using VectorPairs = std::vector<Pair>;
0060
0061   /// Type alias for KDTree used for neighbor search during clustering
0062   using Tree = KDTree<kDims, std::size_t, scalar_t, std::array, kLeafSize>;
0063
0064   // Remove the default constructor.
0065   DBScan() = delete;
0066
0067   /// @brief Construct the DBScan algorithm with a given epsilon and minPoints.
0068   ///
0069   /// @param epsilon The epsilon radius used to find the neighbours.
0070   /// @param minPoints The minimum number of points to form a cluster.
0071   /// @param onePointCluster If true, all the noise points are considered as
0072   /// individual one point clusters.
0073   explicit DBScan(scalar_t epsilon = 1.0, std::size_t minPoints = 1,
0074                   bool onePointCluster = false)
0075       : m_eps(epsilon),
0076         m_minPoints(minPoints),
0077         m_onePointCluster(onePointCluster) {}
0078
0079   /// @brief Cluster the input points.
0080   ///
0081   /// This function implements the main loop of the DBScan algorithm.
0082   /// It loops over all the point and will try to start new cluster
0083   /// if it finds points that have yet to be clustered.
0084   ///
0085   /// @param inputPoints The input points to cluster.
0086   /// @param clusteredPoints Vector containing the cluster ID of each point.
0087   /// @return The number of clusters (excluding noise if onePointCluster==False).
0088   ///
0089   int cluster(const VectorPoints& inputPoints,
0090               std::vector<int>& clusteredPoints) {
0091     // Transform the initial vector of input point to a vector of pairs
0092     // with the index of the point in the initial vector.
0093     VectorPairs inputPointsWithIndex;
0094     for (std::size_t id = 0; id < inputPoints.size(); id++) {
0095       inputPointsWithIndex.push_back(std::make_pair(inputPoints[id], id));
0096     }
0097     // Build the KDTree with the input points.
0098     Tree tree = Tree(std::move(inputPointsWithIndex));
0099
0100     // Initialize the cluster ID to 0.
0101     int clusterID = 0;
0102     // By default all the points are considered as noise.
0103     clusteredPoints = std::vector<int>(inputPoints.size(), -1);
0104
0105     // Loop over all the points
0106     for (std::size_t id = 0; id < inputPoints.size(); id++) {
0107       // If the point is already assigned to a cluster, skip it.
0108       if (clusteredPoints[id] != -1) {
0109         continue;
0110       }
0111       // If not we try to build a new cluster
0112       std::vector<std::size_t> pointToProcess{id};
0113       expandCluster(tree, inputPoints, pointToProcess, clusteredPoints,
0114                     clusterID);
0115       // If the cluster has been created, increment the cluster ID.
0116       if (clusteredPoints[id] != -1) {
0117         clusterID++;
0118       }
0119     }
0120     if (m_onePointCluster) {
0121       // If noise is present and onePointCluster is true, all the noise points
0122       // are considered as individual one point clusters. Loop over all the
0123       // points in the KDTree.
0124       for (auto& cluster : clusteredPoints) {
0125         // If the point is assigned to noise, assign it to a new cluster.
0126         if (cluster == -1) {
0127           cluster = clusterID;
0128           clusterID++;
0129         }
0130       }
0131     }
0132     return clusterID;
0133   }
0134
0135  private:
0136   /// @brief Extend the cluster.
0137   ///
0138   /// This function will extend the cluster by finding all the neighbours of the
0139   /// current point and assign them to the current cluster.
0140   /// The KDTree is used to find the neighbours and an extra cut is used to only
0141   /// keep the neighbours that are within the epsilon radius.
0142   ///
0143   /// @param tree The KDTree containing all the points.
0144   /// @param inputPoints The vector containing the input points.
0145   /// @param pointsToProcess The vector containing the ids of the points that need to be
0146   /// processed.
0147   /// @param clusteredPoints Vector containing the cluster ID of each point.
0148   /// @param clusterID The ID of the current cluster.
0149   ///
0150   void expandCluster(const Tree& tree, const VectorPoints& inputPoints,
0151                      const std::vector<std::size_t>& pointsToProcess,
0152                      std::vector<int>& clusteredPoints, const int clusterID) {
0153     // Loop over all the points that need to be process.
0154     for (const auto id : pointsToProcess) {
0155       // Lets look for the neighbours of the current point.
0156       const Point currentPoint = inputPoints[id];
0157       std::vector<std::size_t> neighbours;
0158       // We create the range in which we will look for the neighbours (an
0159       // hypercube with a length of 2 epsilon).
0160       typename Tree::range_t range;
0161       for (std::size_t dim = 0; dim < kDims; dim++) {
0162         range[dim] = Range1D<scalar_t>(currentPoint[dim] - m_eps,
0163                                        currentPoint[dim] + m_eps);
0164       }
0165       // We use the KDTree to find the neighbours.
0166       // An extra cut needs to be applied to only keep the neighbours that
0167       // are within the epsilon radius.
0168       tree.rangeSearchMapDiscard(
0169           range, [this, &neighbours, currentPoint](
0170                      const typename Tree::coordinate_t& pos,
0171                      const typename Tree::value_t& val) {
0172             scalar_t distance = 0;
0173             for (std::size_t dim = 0; dim < kDims; dim++) {
0174               distance += (pos[dim] - currentPoint[dim]) *
0175                           (pos[dim] - currentPoint[dim]);
0176             }
0177             if (distance <= m_eps * m_eps) {
0178               neighbours.push_back(val);
0179             }
0180           });
0181       std::size_t nNeighbours = neighbours.size();
0182       // If a cluster has already been started we add the neighbours to it
0183       if (clusteredPoints[id] != -1) {
0184         updateNeighbours(neighbours, clusteredPoints, clusterID);
0185       }
0186       if (nNeighbours >= m_minPoints) {
0187         // If the cluster has not been started yet and we have enough
0188         // neighbours, we start the cluster and assign the current point and its
0189         // neighbours to it.
0190         if (clusteredPoints[id] == -1) {
0191           clusteredPoints[id] = clusterID;
0192           updateNeighbours(neighbours, clusteredPoints, clusterID);
0193         }
0194         // Try to extend the cluster with the neighbours.
0195         expandCluster(tree, inputPoints, neighbours, clusteredPoints,
0196                       clusterID);
0197       }
0198     }
0199   }
0200
0201   /// @brief Update the neighbours.
0202   ///
0203   /// This function will remove the neighbours that are already assigned to a
0204   /// cluster and assign the remaining ones to the current cluster.
0205   ///
0206   /// @param neighbours The vector containing the ids of the neighbours.
0207   /// @param clusteredPoints Vector containing the cluster ID of each point.
0208   /// @param clusterID The ID of the current cluster.
0209   ///
0210   void updateNeighbours(std::vector<std::size_t>& neighbours,
0211                         std::vector<int>& clusteredPoints,
0212                         const int clusterID) {
0213     neighbours.erase(std::remove_if(neighbours.begin(), neighbours.end(),
0214                                     [&clusteredPoints](int i) {
0215                                       return clusteredPoints[i] != -1;
0216                                     }),
0217                      neighbours.end());
0218
0219     for (const auto& neighbour : neighbours) {
0220       clusteredPoints[neighbour] = clusterID;
0221     }
0222   }
0223
0224   // The epsilon radius used to find the neighbours.
0225   scalar_t m_eps;
0226   // The minimum number of points to form a cluster.
0227   std::size_t m_minPoints = 1;
0228   // If true, all the noise points are considered as individual one point
0229   // clusters.
0230   bool m_onePointCluster = false;
0231 };
0232
0233 }  // namespace Acts