import glob
import math
import sys
import time

import numpy as np
import pandas as pd

import torch

from sklearn.cluster import DBSCAN
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

from ambiguity_solver_network import prepareDataSet, DuplicateClassifier, Normalise
def readDataSet(CKS_files: list[str]) -> list[pd.DataFrame]:
    """Read the dataset from the different files, remove the pure duplicate tracks and prepare each of them

    @param[in] CKS_files: list of paths to the CSV track files (usually one file per event)
    @return: list of DataFrames, one per file, with the tracks ordered by truth particle ID within each event
    """
    data = []
    for f in CKS_files:
        datafile = pd.read_csv(f)
        datafile = prepareDataSet(datafile)
        data.append(datafile)
    return data
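
# Illustrative call (assuming the CKF track CSVs live in odd_output/, as in the glob further down):
#   data = readDataSet(sorted(glob.glob("odd_output/event*-tracks_ckf.csv")))
# Each element of the returned list is the prepared DataFrame of one event.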


def prepareInferenceData(data: pd.DataFrame) -> tuple[np.ndarray, np.ndarray]:
    """Prepare the data for network inference

    @param[in] data: input DataFrame to be prepared
    @return: array of the network inputs and array of the corresponding truth labels
    """
    target_column = "good/duplicate/fake"
    # Integer-encode the truth labels and drop the columns that are not network inputs
    y = LabelEncoder().fit(data[target_column]).transform(data[target_column])
    input_data = data.drop(
        columns=[
            target_column,
            "track_id",
            "nMajorityHits",
            "nSharedHits",
            "truthMatchProbability",
            "Hits_ID",
            "chi2",
            "pT",
            "cluster",
        ]
    )
    # Ordinal-encode the object (categorical) columns and concatenate them in front of the input columns
    x_cat = OrdinalEncoder().fit_transform(input_data.select_dtypes("object"))
    x = np.concatenate((x_cat, input_data), axis=1)
    return x, y
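
# Note: sklearn's LabelEncoder assigns the integer labels in alphabetical order, so the truth
# labels come out as duplicate=0, fake=1, good=2 here; the feature matrix starts with the
# ordinal-encoded object columns followed by the columns of the reduced input DataFrame.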


def clusterTracks(
    event: pd.DataFrame, DBSCAN_eps: float = 0.07, DBSCAN_min_samples: int = 2
) -> pd.DataFrame:
    """Cluster together all the tracks that appear to belong to the same truth particle.
    The tracks are first clustered with a DBSCAN in (eta, phi), then each DBSCAN cluster is
    sub-clustered based on the hits shared by its tracks.

    @param[in] event: input DataFrame containing all the tracks of one event
    @param[in] DBSCAN_eps: maximum (eta, phi) distance between two tracks for DBSCAN to consider them neighbours
    @param[in] DBSCAN_min_samples: minimum number of tracks needed for DBSCAN to create a cluster
    @return: DataFrame identical to the input with an added column containing the cluster ID of each track
    """
    # First pass: cluster the tracks by their (eta, phi) direction
    trackDir = event[["eta", "phi"]].to_numpy()
    clustering = DBSCAN(eps=DBSCAN_eps, min_samples=DBSCAN_min_samples).fit(trackDir)
    event["cluster"] = clustering.labels_

    # Second pass: split each DBSCAN cluster into sub-clusters of tracks sharing hits
    sorted_event = event.sort_values(
        ["cluster", "nMeasurements"], ascending=[True, False]
    )
    updatedCluster = []
    cluster_hits = sorted_event.loc[:, ("Hits_ID", "cluster")]
    for key, frame in cluster_hits.groupby("cluster"):
        clusterarray = frame.to_numpy()
        clusterarray = subClustering(clusterarray, key, key)
        updatedCluster.extend(clusterarray[:, 1])
    sorted_event.loc[:, ("cluster")] = updatedCluster

    # Rename the (float) sub-cluster IDs to consecutive integers
    sorted_event = sorted_event.sort_values("cluster")
    clusterarray = sorted_event.loc[:, ("Hits_ID", "cluster")].to_numpy()
    clusterarray = renameCluster(clusterarray)
    sorted_event.loc[:, ("cluster")] = clusterarray[:, 1]
    return sorted_event
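
# Minimal illustration of the first (DBSCAN) stage, with made-up values:
#   DBSCAN(eps=0.07, min_samples=2).fit([[0.10, 1.00], [0.11, 1.01], [2.50, -0.30]]).labels_
# gives [0, 0, -1]: the first two tracks are close in (eta, phi) and form a cluster, the third
# one is labelled as noise (-1); the hit-sharing sub-clustering then refines each group.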


def subClustering(clusterarray: np.ndarray, c: int, lastCluster: float) -> np.ndarray:
    """SubClustering algorithm: cluster together tracks that share hits (TODO: doesn't handle real shared hits)

    @param[in] clusterarray: numpy array containing the hit IDs and the cluster ID of each track
    @param[in] c: ID of the cluster we are working on
    @param[in] lastCluster: ID given to the last sub-cluster
    @return: numpy array with updated cluster IDs
    """
    # Create a new sub-cluster ID strictly between c and c + 1
    newCluster = math.nextafter(lastCluster, c + 1)
    if newCluster >= c + 1:
        raise RuntimeError(
            "Too many sub-clusters in the cluster, this shouldn't be possible."
        )
    hits_IDs = []
    set_IDs = set(hits_IDs)
    # Move to the new sub-cluster every track sharing a hit with the first track still in cluster c
    for track in clusterarray:
        if track[1] == c:
            if hits_IDs == []:
                hits_IDs = track[0]
                set_IDs = set(hits_IDs)
            if set_IDs & set(track[0]):
                track[1] = newCluster
    # If a track was found in cluster c, recurse to assign the remaining ones; otherwise we are done
    if hits_IDs == []:
        return clusterarray
    else:
        clusterarray = subClustering(clusterarray, c, newCluster)
        return clusterarray
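
# The sub-cluster IDs are floats squeezed between c and c + 1 by math.nextafter, e.g. for c = 3
# the first sub-cluster gets the ID math.nextafter(3, 4) == 3.0000000000000004, the next one the
# float right after that, and so on; renameCluster() below maps them back to consecutive integers.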


def renameCluster(clusterarray: np.ndarray) -> np.ndarray:
    """Rename the cluster IDs to be consecutive integers starting from 0

    @param[in] clusterarray: numpy array containing the hit IDs and the cluster ID of each track
    @return: numpy array with updated cluster IDs
    """
    last_id = -1
    new_id = -1
    for track in clusterarray:
        if track[1] != last_id:
            last_id = track[1]
            new_id = new_id + 1
        track[1] = new_id
    return clusterarray
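
# Illustrative example: a sorted cluster column [2.0, 2.0, 2.0000000000000004, 3.0] is renamed
# to [0, 0, 1, 2]; tracks sharing the same float sub-cluster ID stay in the same cluster.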


start = time.time()

# subClustering is recursive; raise the recursion limit so that large clusters can be processed
sys.setrecursionlimit(10**6)

# Read the CKF track files (one CSV per event)
CKF_files = sorted(glob.glob("odd_output/event0000000[0-9][0-9]-tracks_ckf.csv"))
data = readDataSet(CKF_files)


clusteredData = []
cleanedData = []

t1 = time.time()

# Cluster the tracks event by event
for event in data:
    clustered = clusterTracks(event)
    clusteredData.append(clustered)

t2 = time.time()
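
# Note: torch.load() unpickles the full model object saved by the training script; this is
# presumably why DuplicateClassifier and Normalise are imported above even though they are never
# referenced directly, since their class definitions must be importable for unpickling to work.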
duplicateClassifier = torch.load("duplicateClassifier.pt")

t3 = time.time()


for clusteredEvent in clusteredData:
    x_test, y_test = prepareInferenceData(clusteredEvent)

    # Score every track with the classifier
    output_predict = []
    for x in x_test:
        x = torch.tensor(x, dtype=torch.float32)
        output_predict.append(duplicateClassifier(x).item())

    clusteredEvent["score"] = output_predict
    cleanedEvent = clusteredEvent

    # In each cluster keep only the track(s) with the highest score
    idx = (
        cleanedEvent.groupby(["cluster"])["score"].transform("max")
        == cleanedEvent["score"]
    )
    cleanedEvent = cleanedEvent[idx]
    cleanedData.append(cleanedEvent)
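
# Minimal illustration of the per-cluster selection above, with made-up values:
#   df = pd.DataFrame({"cluster": [0, 0, 1], "score": [0.2, 0.9, 0.5]})
#   df[df.groupby("cluster")["score"].transform("max") == df["score"]]
# keeps the rows with scores 0.9 and 0.5; tracks tying on the cluster maximum are all kept.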

t4 = time.time()


nb_part = 0
nb_track = 0
nb_fake = 0
nb_duplicate = 0

nb_good_match = 0
nb_reco_part = 0
nb_reco_fake = 0
nb_reco_duplicate = 0
nb_reco_track = 0

for clusteredEvent, cleanedEvent in zip(clusteredData, cleanedData):
    nb_part += clusteredEvent.loc[
        clusteredEvent["good/duplicate/fake"] != "fake"
    ].index.nunique()
    nb_track += clusteredEvent.shape[0]
    nb_fake += clusteredEvent.loc[
        clusteredEvent["good/duplicate/fake"] == "fake"
    ].shape[0]
    nb_duplicate += clusteredEvent.loc[
        clusteredEvent["good/duplicate/fake"] == "duplicate"
    ].shape[0]

    nb_good_match += cleanedEvent.loc[
        cleanedEvent["good/duplicate/fake"] == "good"
    ].shape[0]
    nb_reco_fake += cleanedEvent.loc[
        cleanedEvent["good/duplicate/fake"] == "fake"
    ].shape[0]
    nb_reco_duplicate += cleanedEvent.loc[
        cleanedEvent["good/duplicate/fake"] == "duplicate"
    ].shape[0]
    nb_reco_part += cleanedEvent.loc[
        cleanedEvent["good/duplicate/fake"] != "fake"
    ].index.nunique()
    nb_reco_track += cleanedEvent.shape[0]

end = time.time()

print("===Initial efficiencies===")
print("nb particles : ", nb_part)
print("nb track : ", nb_track)
print("duplicate rate: ", 100 * nb_duplicate / nb_track, " %")
print("Fake rate: ", 100 * nb_fake / nb_track, " %")

print("===computed efficiencies===")
print("nb particles : ", nb_part)
print("nb good match : ", nb_good_match)
print("nb particle reco : ", nb_reco_part)
print("nb track reco : ", nb_reco_track)
print("Efficiency (good track) : ", 100 * nb_good_match / nb_part, " %")
print("Efficiency (particle reco) : ", 100 * nb_reco_part / nb_part, " %")
print(
    "duplicate rate: ",
    100 * ((nb_good_match + nb_reco_duplicate) - nb_reco_part) / nb_reco_track,
    " %",
)
print("Fake rate: ", 100 * nb_reco_fake / nb_reco_track, " %")

print("===computed speed===")
print("Clustering : ", (t2 - t1) * 1000 / len(CKF_files), "ms/event")
print("Inference : ", (t4 - t3) * 1000 / len(CKF_files), "ms/event")
print("tot : ", (end - start) * 1000 / len(CKF_files), "ms/event")

# Save the cleaned tracks next to the input files
for file, cleanedEvent in zip(CKF_files, cleanedData):
    newFile = file[:-4] + "-Cleaned.csv"
    cleanedEvent = cleanedEvent.sort_values("track_id")
    cleanedEvent.to_csv(path_or_buf=newFile)