import glob

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils
from torch.utils.tensorboard import SummaryWriter

from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder

from ambiguity_solver_network import prepareDataSet, DuplicateClassifier, Normalise

# Running averages of the normalisation parameters of the input features,
# accumulated by prepareTrainingData and used later to build the
# normalisation layer
avg_mean = [0, 0, 0, 0, 0, 0, 0, 0]
avg_sdv = [0, 0, 0, 0, 0, 0, 0, 0]
events = 0
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def readDataSet(CKS_files: list[str]) -> pd.DataFrame:
    """Read the dataset from the different files, remove the pure duplicate tracks and combine the datasets.

    @param[in] CKS_files: list of paths to the track files (usually one file per event)
    @return: combined DataFrame containing all tracks, ordered by event and then by truth particle ID within each event
    """
    data = pd.DataFrame()
    for f in CKS_files:
        datafile = pd.read_csv(f)
        # Relabel fake tracks as duplicates: the network only needs to single
        # out one good track per truth particle
        datafile.loc[
            datafile["good/duplicate/fake"] == "fake", "good/duplicate/fake"
        ] = "duplicate"
        datafile = prepareDataSet(datafile)
        # Combine the data of all events into a single DataFrame
        data = pd.concat([data, datafile])
    return data


def prepareTrainingData(data: pd.DataFrame) -> tuple[np.ndarray, np.ndarray]:
    """Prepare the data for the training.

    @param[in] data: input DataFrame to be prepared
    @return: array of the network inputs and array of the corresponding truth labels
    """
    target_column = "good/duplicate/fake"
    # Separate the truth from the input parameters; LabelEncoder sorts the
    # classes alphabetically, so "duplicate" maps to 0 and "good" to 1
    y = LabelEncoder().fit(data[target_column]).transform(data[target_column])
    input_data = data.drop(
        columns=[
            target_column,
            "track_id",
            "nMajorityHits",
            "nSharedHits",
            "truthMatchProbability",
            "Hits_ID",
            "chi2",
            "pT",
        ]
    )
    # Accumulate the normalisation parameters of the numerical columns into
    # the global running averages
    scale = StandardScaler()
    scale.fit(input_data.select_dtypes("number"))
    global avg_mean
    avg_mean = avg_mean + scale.mean_
    global avg_sdv
    avg_sdv = avg_sdv + scale.var_
    global events
    events = events + 1
    # Encode the categorical (string) columns as ordinals and prepend them to
    # the network input
    x_cat = OrdinalEncoder().fit_transform(input_data.select_dtypes("object"))
    x = np.concatenate((x_cat, input_data), axis=1)
    return x, y


def batchSplit(
    data: tuple[np.ndarray, np.ndarray, np.ndarray], batch_size: int
) -> list[tuple[np.ndarray, np.ndarray, np.ndarray]]:
    """Split the data into batches, each containing @batch_size truth particles (the number of corresponding tracks may vary).

    @param[in] data: tuple of (particle ID, input, truth) arrays to be cut into batches
    @param[in] batch_size: number of truth particles per batch
    @return: list of tuples, each element corresponding to one batch
    """
    batch = []
    pid = data[0][0]
    n_particle = 0
    idx_prev = 0
    idx = 0
    # Count the truth particles by watching the particle ID change, and cut a
    # new batch every @batch_size particles
    for index, _, _ in zip(data[0], data[1], data[2]):
        if index != pid:
            pid = index
            n_particle += 1
            if n_particle == batch_size:
                b = data[0][idx_prev:idx], data[1][idx_prev:idx], data[2][idx_prev:idx]
                batch.append(b)
                n_particle = 0
                idx_prev = idx
        idx += 1
    return batch


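# Toy illustration of batchSplit (hypothetical particle IDs): with
# data[0] = [1, 1, 2, 2, 3] and batch_size = 2, the single complete batch
# contains the four tracks of particles 1 and 2. The tracks of particle 3 are
# dropped, since a batch is only emitted once batch_size particles have been
# completed.

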
def computeLoss(
    score_good: torch.Tensor,
    score_duplicate: list[torch.Tensor],
    batch_loss: torch.Tensor,
    margin: float = 0.05,
) -> torch.Tensor:
    """Compute one loss term for each duplicate track associated with the particle and add it to the batch loss.

    @param[in] score_good: score returned by the model for the good track associated with this particle
    @param[in] score_duplicate: list of the scores of all duplicate tracks associated with this particle
    @param[in] batch_loss: loss accumulated over the current batch so far
    @param[in] margin: margin used in the computation of the margin ranking loss
    @return: the updated loss
    """
    if score_duplicate:
        for s in score_duplicate:
            # Margin ranking loss: penalise every duplicate whose score is not
            # at least @margin below the good track's score, averaged over the
            # duplicates
            batch_loss += F.relu(s - score_good + margin) / len(score_duplicate)
    return batch_loss


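# Toy sanity check of computeLoss with made-up scores (illustrative only).
# Each per-pair term equals torch's margin ranking loss with target +1:
# relu(0.2 - 0.8 + 0.05) / 2 = 0 for the first duplicate, and
# relu(0.79 - 0.8 + 0.05) / 2 = 0.02 for the second, which scores within the
# margin of the good track.
_toy_loss = computeLoss(
    torch.tensor(0.8),
    [torch.tensor(0.2), torch.tensor(0.79)],
    torch.tensor(0.0),
    margin=0.05,
)
assert abs(_toy_loss.item() - 0.02) < 1e-6

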
def scoringBatch(
    batch: list[tuple[np.ndarray, np.ndarray, np.ndarray]], Optimiser=None
) -> tuple[int, int, float]:
    """Run the MLP on a batch and compute the corresponding efficiency and loss. If an optimiser is specified, train the MLP.

    @param[in] batch: list of batches as returned by batchSplit
    @param[in] Optimiser: optimiser for the MLP; if one is specified the network is trained on the batch
    @return: number of particles, number of particles where the good track was found, and the loss
    """
    # Number of particles
    nb_part = 0
    # Number of particles whose good track received the highest score
    nb_good_match = 0
    # Loss accumulated over the batches
    loss = 0
    # Best track score seen for the current particle
    max_score = 0
    # Whether the best score belongs to the good track
    max_match = 0
    for b_data in batch:
        # ID of the current particle
        pid = b_data[0][0]
        # Loss accumulated over the current batch
        batch_loss = 0
        # Score of the good track of the current particle
        score_good = 0
        # Scores of the duplicate tracks of the current particle
        score_duplicate = []
        if Optimiser:
            Optimiser.zero_grad()
        input_tensor = torch.tensor(b_data[1], dtype=torch.float32)
        input_tensor = input_tensor.to(device)
        # Score the whole batch at once with the global duplicateClassifier
        prediction = duplicateClassifier(input_tensor)
        # Loop over all the tracks in the batch
        for index, pred, truth in zip(b_data[0], prediction, b_data[2]):
            # When the particle ID changes, update the loss and the
            # efficiency for the previous particle
            if index != pid:
                if max_match == 1:
                    nb_good_match += 1
                batch_loss = computeLoss(
                    score_good, score_duplicate, batch_loss, margin=0.05
                )
                nb_part += 1
                # Reinitialise the bookkeeping for the next particle
                pid = index
                score_duplicate = []
                score_good = 0
                max_score = 0
                max_match = 0
            # Store the scores of the good and duplicate tracks separately
            if truth:
                score_good = pred
            else:
                score_duplicate.append(pred)
            # Track the best score; ties count against the good track
            if pred == max_score:
                max_match = 0
            if pred > max_score:
                max_score = pred
                max_match = truth
        # Account for the last particle of the batch
        if max_match == 1:
            nb_good_match += 1
        batch_loss = computeLoss(score_good, score_duplicate, batch_loss, margin=0.05)
        nb_part += 1
        # Normalise the loss to the number of tracks in the batch
        batch_loss = batch_loss / len(b_data[0])
        loss += batch_loss.item()
        # Perform the gradient descent if an optimiser was specified
        if Optimiser:
            batch_loss.backward()
            Optimiser.step()
    loss = loss / len(batch)
    return nb_part, nb_good_match, loss


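# Bookkeeping example for the efficiency above (made-up scores): for a
# particle whose tracks score [0.3 (good), 0.7 (duplicate), 0.1 (duplicate)],
# the highest score belongs to a duplicate, so max_match ends at 0 and the
# particle does not count towards nb_good_match. A duplicate that merely ties
# the good track also counts as a failure, since an equal prediction resets
# max_match to 0.

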
def train(
    duplicateClassifier: DuplicateClassifier,
    data: tuple[np.ndarray, np.ndarray, np.ndarray],
    epochs: int = 20,
    batch: int = 32,
    validation: float = 0.3,
) -> DuplicateClassifier:
    """Training of the MLP.

    @param[in] duplicateClassifier: model to be trained
    @param[in] data: tuple containing three lists; each element of those lists corresponds to a given track and represents the truth particle ID, the track parameters and the truth
    @param[in] epochs: number of epochs the model will be trained for
    @param[in] batch: number of truth particles per batch
    @param[in] validation: fraction of the batches kept for validation
    @return: trained model
    """
    # Prepare tensorboard for monitoring the training
    writer = SummaryWriter()
    opt = torch.optim.Adam(duplicateClassifier.parameters())
    # Split the data into batches; the batches after val_batch are kept for validation
    batches = batchSplit(data, batch)
    val_batch = int(len(batches) * (1 - validation))

    for epoch in range(epochs):
        print("Epoch: ", epoch, " / ", epochs)
        # Train on the training batches and monitor the loss and efficiency
        nb_part, nb_good_match, loss = scoringBatch(batches[:val_batch], Optimiser=opt)
        print("Loss/train: ", loss, " Eff/train: ", nb_good_match / nb_part)
        writer.add_scalar("Loss/train", loss, epoch)
        writer.add_scalar("Eff/train", nb_good_match / nb_part, epoch)

        # Evaluate on the validation batches (no optimiser, so no training)
        if validation > 0.0:
            nb_part, nb_good_match, loss = scoringBatch(batches[val_batch:])
            writer.add_scalar("Loss/val", loss, epoch)
            writer.add_scalar("Eff/val", nb_good_match / nb_part, epoch)
            print("Loss/val: ", loss, " Eff/val: ", nb_good_match / nb_part)

    writer.close()
    return duplicateClassifier


# Events 0 to 79 from the odd_output directory are used as the training sample
CKF_files = sorted(glob.glob("odd_output" + "/event0000000[0-7][0-9]-tracks_ckf.csv"))
data = readDataSet(CKF_files)

# Prepare the data
x_train, y_train = prepareTrainingData(data)

# Average the normalisation parameters accumulated by prepareTrainingData
avg_mean = [x / events for x in avg_mean]
avg_sdv = [x / events for x in avg_sdv]

# Build the model: a normalisation layer followed by the classifier MLP
input_dim = np.shape(x_train)[1]
layers_dim = [10, 15, 10]

duplicateClassifier = nn.Sequential(
    Normalise(avg_mean, avg_sdv), DuplicateClassifier(input_dim, layers_dim)
)
duplicateClassifier = duplicateClassifier.to(device)

# Train the model; the truth particle ID is stored in the DataFrame index
train_input = data.index, x_train, y_train
train(duplicateClassifier, train_input, epochs=20, batch=128, validation=0.3)
duplicateClassifier.eval()

# Save the trained model, both as a torch model and as an ONNX model traced
# with a single training sample
input_test = torch.tensor(x_train, dtype=torch.float32).to(device)
torch.save(duplicateClassifier, "duplicateClassifier.pt")
torch.onnx.export(
    duplicateClassifier,
    input_test[0:1],
    "duplicateClassifier.onnx",
    input_names=["x"],
    output_names=["y"],
    dynamic_axes={"x": {0: "batch_size"}, "y": {0: "batch_size"}},
)


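# Optional sanity check of the exported model (hypothetical snippet, assuming
# the onnxruntime package is available):
#   import onnxruntime as ort
#   sess = ort.InferenceSession("duplicateClassifier.onnx")
#   onnx_out = sess.run(["y"], {"x": input_test[0:1].cpu().numpy()})[0]
#   torch_out = duplicateClassifier(input_test[0:1]).detach().cpu().numpy()
#   assert np.allclose(onnx_out, torch_out, atol=1e-5)

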
# Apply the trained model to the test sample (events 80 to 99)
CKF_files_test = sorted(
    glob.glob("odd_output" + "/event0000000[8-9][0-9]-tracks_ckf.csv")
)
test = readDataSet(CKF_files_test)

# Prepare the test data
x_test, y_test = prepareTrainingData(test)

# Score each track of the test sample with the trained network
output_predict = []

x_test = torch.tensor(x_test, dtype=torch.float32)
x_test = x_test.to(device)
with torch.no_grad():
    for x in x_test:
        output_predict.append(duplicateClassifier(x))


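# Note: the loop above scores one track at a time for clarity. Since the
# classifier is a plain MLP, the whole test set could equally be scored in a
# single batched call, e.g. output_predict = list(duplicateClassifier(x_test)).
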
# Print a few examples of the network output
for sample_test, sample_predict, sample_true in zip(
    test.index[0:100], output_predict[0:100], y_test[0:100]
):
    print(sample_test, sample_predict, sample_true)

# Compute the test efficiency: a particle counts as a good match if its
# highest-scoring track is the good one
pid = test.index[0]
nb_part = 1
nb_good_match = 0
max_match = 0
max_score = 0

for index, pred, truth in zip(test.index, output_predict, y_test):
    if index != pid:
        # Moving to a new particle: record whether the previous one was matched
        if max_match == 1:
            nb_good_match += 1
        pid = index
        nb_part += 1
        max_score = 0
        max_match = 0
    # Ties count against the good track
    if pred == max_score:
        max_match = 0
    if pred > max_score:
        max_score = pred
        max_match = truth
# Account for the last particle of the sample
if max_match == 1:
    nb_good_match += 1

print("nb particles: ", nb_part)
print("nb good match: ", nb_good_match)
print("Efficiency: ", 100 * nb_good_match / nb_part, " %")