import glob
import math
import sys
import time

import numpy as np
import pandas as pd

import torch

from sklearn.cluster import DBSCAN
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

from ambiguity_solver_network import prepareDataSet, DuplicateClassifier, Normalise
def readDataSet(CKS_files: list[str]) -> list[pd.DataFrame]:
    """Read the dataset from the different files, remove the pure duplicate tracks and prepare each of them

    @param[in] CKS_files: list of paths to the CSV track files (usually one file per event)
    @return: list of DataFrames, one per file, with the tracks ordered by truth particle ID within each event
    """
    data = []
    for f in CKS_files:
        datafile = pd.read_csv(f)
        datafile = prepareDataSet(datafile)
        data.append(datafile)
    return data
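
# Illustrative call (assuming the CKF track CSVs live in odd_output/, as in the glob further down):
#   data = readDataSet(sorted(glob.glob("odd_output/event*-tracks_ckf.csv")))
# Each element of the returned list is the prepared DataFrame of one event.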


def prepareInferenceData(data: pd.DataFrame) -> tuple[np.ndarray, np.ndarray]:
    """Prepare the data for network inference

    @param[in] data: input DataFrame to be prepared
    @return: array of the network inputs and array of the corresponding truth labels
    """
    target_column = "good/duplicate/fake"
    # Integer-encode the truth labels and drop the columns that are not network inputs
    y = LabelEncoder().fit(data[target_column]).transform(data[target_column])
    input_data = data.drop(
        columns=[
            target_column,
            "track_id",
            "nMajorityHits",
            "nSharedHits",
            "truthMatchProbability",
            "Hits_ID",
            "chi2",
            "pT",
            "cluster",
        ]
    )
    # Ordinal-encode the object (categorical) columns and concatenate them in front of the input columns
    x_cat = OrdinalEncoder().fit_transform(input_data.select_dtypes("object"))
    x = np.concatenate((x_cat, input_data), axis=1)
    return x, y
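
# Note: sklearn's LabelEncoder assigns the integer labels in alphabetical order, so the truth
# labels come out as duplicate=0, fake=1, good=2 here; the feature matrix starts with the
# ordinal-encoded object columns followed by the columns of the reduced input DataFrame.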


def clusterTracks(
    event: pd.DataFrame, DBSCAN_eps: float = 0.07, DBSCAN_min_samples: int = 2
) -> pd.DataFrame:
    """Cluster together all the tracks that appear to belong to the same truth particle.
    The tracks are first clustered with a DBSCAN in (eta, phi), then each DBSCAN cluster is
    sub-clustered based on the hits shared by its tracks.

    @param[in] event: input DataFrame containing all the tracks of one event
    @param[in] DBSCAN_eps: maximum (eta, phi) distance between two tracks for DBSCAN to consider them neighbours
    @param[in] DBSCAN_min_samples: minimum number of tracks needed for DBSCAN to create a cluster
    @return: DataFrame identical to the input with an added column containing the cluster ID of each track
    """
    # First pass: cluster the tracks by their (eta, phi) direction
    trackDir = event[["eta", "phi"]].to_numpy()
    clustering = DBSCAN(eps=DBSCAN_eps, min_samples=DBSCAN_min_samples).fit(trackDir)
    event["cluster"] = clustering.labels_

    # Second pass: split each DBSCAN cluster into sub-clusters of tracks sharing hits
    sorted_event = event.sort_values(
        ["cluster", "nMeasurements"], ascending=[True, False]
    )
    updatedCluster = []
    cluster_hits = sorted_event.loc[:, ("Hits_ID", "cluster")]
    for key, frame in cluster_hits.groupby("cluster"):
        clusterarray = frame.to_numpy()
        clusterarray = subClustering(clusterarray, key, key)
        updatedCluster.extend(clusterarray[:, 1])
    sorted_event.loc[:, ("cluster")] = updatedCluster

    # Rename the (float) sub-cluster IDs to consecutive integers
    sorted_event = sorted_event.sort_values("cluster")
    clusterarray = sorted_event.loc[:, ("Hits_ID", "cluster")].to_numpy()
    clusterarray = renameCluster(clusterarray)
    sorted_event.loc[:, ("cluster")] = clusterarray[:, 1]
    return sorted_event
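
# Minimal illustration of the first (DBSCAN) stage, with made-up values:
#   DBSCAN(eps=0.07, min_samples=2).fit([[0.10, 1.00], [0.11, 1.01], [2.50, -0.30]]).labels_
# gives [0, 0, -1]: the first two tracks are close in (eta, phi) and form a cluster, the third
# one is labelled as noise (-1); the hit-sharing sub-clustering then refines each group.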


def subClustering(clusterarray: np.ndarray, c: int, lastCluster: float) -> np.ndarray:
    """SubClustering algorithm: cluster together tracks that share hits (TODO: doesn't handle real shared hits)

    @param[in] clusterarray: numpy array containing the hit IDs and the cluster ID of each track
    @param[in] c: ID of the cluster we are working on
    @param[in] lastCluster: ID given to the last sub-cluster
    @return: numpy array with updated cluster IDs
    """
    # Create a new sub-cluster ID strictly between c and c + 1
    newCluster = math.nextafter(lastCluster, c + 1)
    if newCluster >= c + 1:
        raise RuntimeError(
            "Too many sub-clusters in the cluster, this shouldn't be possible."
        )
    hits_IDs = []
    set_IDs = set(hits_IDs)
    # Move to the new sub-cluster every track sharing a hit with the first track still in cluster c
    for track in clusterarray:
        if track[1] == c:
            if hits_IDs == []:
                hits_IDs = track[0]
                set_IDs = set(hits_IDs)
            if set_IDs & set(track[0]):
                track[1] = newCluster
    # If a track was found in cluster c, recurse to assign the remaining ones; otherwise we are done
    if hits_IDs == []:
        return clusterarray
    else:
        clusterarray = subClustering(clusterarray, c, newCluster)
        return clusterarray
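
# The sub-cluster IDs are floats squeezed between c and c + 1 by math.nextafter, e.g. for c = 3
# the first sub-cluster gets the ID math.nextafter(3, 4) == 3.0000000000000004, the next one the
# float right after that, and so on; renameCluster() below maps them back to consecutive integers.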


def renameCluster(clusterarray: np.ndarray) -> np.ndarray:
    """Rename the cluster IDs to be consecutive integers starting from 0

    @param[in] clusterarray: numpy array containing the hit IDs and the cluster ID of each track
    @return: numpy array with updated cluster IDs
    """
    last_id = -1
    new_id = -1
    for track in clusterarray:
        if track[1] != last_id:
            last_id = track[1]
            new_id = new_id + 1
        track[1] = new_id
    return clusterarray
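
# Illustrative example: a sorted cluster column [2.0, 2.0, 2.0000000000000004, 3.0] is renamed
# to [0, 0, 1, 2]; tracks sharing the same float sub-cluster ID stay in the same cluster.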


start = time.time()

# subClustering is recursive; raise the recursion limit so that large clusters can be processed
sys.setrecursionlimit(10**6)

# Read the CKF track files (one CSV per event)
CKF_files = sorted(glob.glob("odd_output/event0000000[0-9][0-9]-tracks_ckf.csv"))
data = readDataSet(CKF_files)


clusteredData = []
cleanedData = []

t1 = time.time()

# Cluster the tracks event by event
for event in data:
    clustered = clusterTracks(event)
    clusteredData.append(clustered)

t2 = time.time()
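
# Note: torch.load() unpickles the full model object saved by the training script; this is
# presumably why DuplicateClassifier and Normalise are imported above even though they are never
# referenced directly, since their class definitions must be importable for unpickling to work.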
duplicateClassifier = torch.load("duplicateClassifier.pt")

t3 = time.time()


for clusteredEvent in clusteredData:
    x_test, y_test = prepareInferenceData(clusteredEvent)

    # Score every track with the classifier
    output_predict = []
    for x in x_test:
        x = torch.tensor(x, dtype=torch.float32)
        output_predict.append(duplicateClassifier(x).item())

    clusteredEvent["score"] = output_predict
    cleanedEvent = clusteredEvent

    # In each cluster keep only the track(s) with the highest score
    idx = (
        cleanedEvent.groupby(["cluster"])["score"].transform("max")
        == cleanedEvent["score"]
    )
    cleanedEvent = cleanedEvent[idx]
    cleanedData.append(cleanedEvent)
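
# Minimal illustration of the per-cluster selection above, with made-up values:
#   df = pd.DataFrame({"cluster": [0, 0, 1], "score": [0.2, 0.9, 0.5]})
#   df[df.groupby("cluster")["score"].transform("max") == df["score"]]
# keeps the rows with scores 0.9 and 0.5; tracks tying on the cluster maximum are all kept.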

t4 = time.time()


nb_part = 0
nb_track = 0
nb_fake = 0
nb_duplicate = 0

nb_good_match = 0
nb_reco_part = 0
nb_reco_fake = 0
nb_reco_duplicate = 0
nb_reco_track = 0

for clusteredEvent, cleanedEvent in zip(clusteredData, cleanedData):
    nb_part += clusteredEvent.loc[
        clusteredEvent["good/duplicate/fake"] != "fake"
    ].index.nunique()
    nb_track += clusteredEvent.shape[0]
    nb_fake += clusteredEvent.loc[
        clusteredEvent["good/duplicate/fake"] == "fake"
    ].shape[0]
    nb_duplicate += clusteredEvent.loc[
        clusteredEvent["good/duplicate/fake"] == "duplicate"
    ].shape[0]

    nb_good_match += cleanedEvent.loc[
        cleanedEvent["good/duplicate/fake"] == "good"
    ].shape[0]
    nb_reco_fake += cleanedEvent.loc[
        cleanedEvent["good/duplicate/fake"] == "fake"
    ].shape[0]
    nb_reco_duplicate += cleanedEvent.loc[
        cleanedEvent["good/duplicate/fake"] == "duplicate"
    ].shape[0]
    nb_reco_part += cleanedEvent.loc[
        cleanedEvent["good/duplicate/fake"] != "fake"
    ].index.nunique()
    nb_reco_track += cleanedEvent.shape[0]

end = time.time()

print("===Initial efficiencies===")
print("nb particles : ", nb_part)
print("nb track : ", nb_track)
print("duplicate rate: ", 100 * nb_duplicate / nb_track, " %")
print("Fake rate: ", 100 * nb_fake / nb_track, " %")

print("===computed efficiencies===")
print("nb particles : ", nb_part)
print("nb good match : ", nb_good_match)
print("nb particle reco : ", nb_reco_part)
print("nb track reco : ", nb_reco_track)
print("Efficiency (good track) : ", 100 * nb_good_match / nb_part, " %")
print("Efficiency (particle reco) : ", 100 * nb_reco_part / nb_part, " %")
print(
    "duplicate rate: ",
    100 * ((nb_good_match + nb_reco_duplicate) - nb_reco_part) / nb_reco_track,
    " %",
)
print("Fake rate: ", 100 * nb_reco_fake / nb_reco_track, " %")

print("===computed speed===")
print("Clustering : ", (t2 - t1) * 1000 / len(CKF_files), "ms/event")
print("Inference : ", (t4 - t3) * 1000 / len(CKF_files), "ms/event")
print("tot : ", (end - start) * 1000 / len(CKF_files), "ms/event")

# Save the cleaned tracks next to the input files
for file, cleanedEvent in zip(CKF_files, cleanedData):
    newFile = file[:-4] + "-Cleaned.csv"
    cleanedEvent = cleanedEvent.sort_values("track_id")
    cleanedEvent.to_csv(path_or_buf=newFile)