import glob

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils
from torch.utils.tensorboard import SummaryWriter

from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder

from seed_solver_network import (
    prepareDataSet,
    DuplicateClassifier,
    Normalise,
)

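# Running sums of the normalisation statistics (mean and variance) of the 14
# numerical input features; they are averaged over the number of processed
# datasets below and used to build the Normalise layer of the classifier.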
avg_mean = [0] * 14
avg_sdv = [0] * 14
events = 0
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def readDataSet(Seed_files: list[str]) -> pd.DataFrame:
    """Read the dataset from the different files, remove the particles with only fake seeds and combine the datasets.

    @param[in] Seed_files: list of paths to the seed files (usually one file per event)
    @return: combined DataFrame containing all the seeds, ordered by event and then by truth particle ID within each event
    """
    data = pd.DataFrame()
    for f in Seed_files:
        datafile = pd.read_csv(f)
        datafile = prepareDataSet(datafile)
        data = pd.concat([data, datafile])
    return data


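# Note: LabelEncoder orders the classes alphabetically, so assuming the
# "good/duplicate/fake" column holds exactly those three string values, the
# truth labels used below are 0 = duplicate, 1 = fake and 2 = good.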
def prepareTrainingData(data: pd.DataFrame) -> tuple[np.ndarray, np.ndarray]:
    """Prepare the data for the training.

    @param[in] data: input DataFrame to be prepared
    @return: array of the network inputs and array of the corresponding truth labels
    """
    target_column = "good/duplicate/fake"
    # Separate the truth from the input variables
    y = LabelEncoder().fit(data[target_column]).transform(data[target_column])
    input = data.drop(
        columns=[
            target_column,
            "seed_id",
            "Hits_ID",
        ]
    )
    # Compute the normalisation parameters of the numerical input variables
    scale = StandardScaler()
    scale.fit(input.select_dtypes("number"))
    # Accumulate them in the global running sums used later by the Normalise layer
    global avg_mean
    avg_mean = avg_mean + scale.mean_
    global avg_sdv
    avg_sdv = avg_sdv + scale.var_
    global events
    events = events + 1
    # Encode the categorical variables and prepend them to the input
    x_cat = OrdinalEncoder().fit_transform(input.select_dtypes("object"))
    x = np.concatenate((x_cat, input), axis=1)
    return x, y


def batchSplit(data: tuple, batch_size: int) -> list[tuple]:
    """Split the data into batches, each containing @batch_size truth particles (the number of corresponding seeds may vary).

    @param[in] data: tuple of three aligned arrays (truth particle IDs, seed parameters, truth labels) to be cut into batches
    @param[in] batch_size: number of truth particles per batch
    @return: list of batches, each a tuple of the three aligned slices
    """
    batch = []
    pid = data[0][0]
    n_particle = 0
    id_prev = 0
    id = 0
    for index, row, truth in zip(data[0], data[1], data[2]):
        # count the number of truth particles seen so far
        if index != pid:
            pid = index
            n_particle += 1
            # once enough particles have been accumulated, store the corresponding slice
            if n_particle == batch_size:
                b = data[0][id_prev:id], data[1][id_prev:id], data[2][id_prev:id]
                batch.append(b)
                n_particle = 0
                id_prev = id
        id += 1
    return batch


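# For one truth particle, computeLoss accumulates a margin-ranking style penalty
# for each duplicate and fake seed, normalised by the total number of seeds
# N = n_duplicate + n_fake + 1:
#     relu(score_duplicate - score_good + margin_duplicate) / N
#     relu(score_fake - score_good + margin_fake) / N
# plus an extra margin_fake / N offset when fake seeds are present, so that the
# good seed is pushed to score higher than its duplicates and fakes by at least
# the corresponding margin.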
def computeLoss(
    score_good: torch.Tensor,
    score_duplicate: list[torch.Tensor],
    score_fake: list[torch.Tensor],
    batch_loss: torch.Tensor,
    margin_duplicate: float = 0.3,
    margin_fake: float = 0.9,
) -> torch.Tensor:
    """Compute one loss term for each duplicate and fake seed associated with the particle and add them to the batch loss.

    @param[in] score_good: score returned by the model for the good seed associated with this particle
    @param[in] score_duplicate: list of the scores of all duplicate seeds associated with this particle
    @param[in] score_fake: list of the scores of all fake seeds associated with this particle
    @param[in] batch_loss: loss accumulated so far for the current batch
    @param[in] margin_duplicate: margin used in the computation of the MarginRankingLoss for duplicate seeds
    @param[in] margin_fake: margin used in the computation of the MarginRankingLoss for fake seeds
    @return: the updated loss
    """
    if score_duplicate:
        for s in score_duplicate:
            batch_loss += F.relu(s - score_good + margin_duplicate) / (
                len(score_duplicate) + len(score_fake) + 1
            )
    if score_fake:
        for s in score_fake:
            batch_loss += F.relu(s - score_good + margin_fake) / (
                len(score_duplicate) + len(score_fake) + 1
            )
        batch_loss += margin_fake / (len(score_duplicate) + len(score_fake) + 1)

    return batch_loss


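# scoringBatch runs the classifier on every seed of each batch. For each truth
# particle it collects the scores of its good, duplicate and fake seeds, counts
# a "good match" when the highest-scoring seed is the good or a duplicate seed
# and a "best match" when it is the good seed itself, and accumulates the loss
# defined above. When an optimiser is passed, one optimisation step is performed
# per batch.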
def scoringBatch(batch: list[tuple], Optimiser=None) -> tuple[int, int, int, float]:
    """Run the MLP on a list of batches and compute the corresponding efficiencies and loss. If an optimiser is specified, train the MLP.

    @param[in] batch: list of batches as returned by batchSplit
    @param[in] Optimiser: optimiser for the MLP; if one is specified the network is trained on the batches
    @return: number of particles, number of particles whose best-scored seed is good or duplicate, number of particles whose best-scored seed is the good one, and the average loss
    """
    # number of truth particles
    nb_part = 0
    # number of particles whose best-scored seed is the good or a duplicate seed
    nb_good_match = 0
    # number of particles whose best-scored seed is the good seed itself
    nb_best_match = 0
    loss = 0
    # best score seen so far for the current particle
    max_score = 0
    # truth label of the best-scored seed of the current particle
    max_match = 1
    # loop over all the batches
    for b_data in batch:
        # ID of the current truth particle
        pid = b_data[0][0]
        # loss accumulated over this batch
        batch_loss = 0
        # score of the good seed
        score_good = 1
        # scores of the duplicate seeds
        score_duplicate = []
        # scores of the fake seeds
        score_fake = []
        if Optimiser:
            Optimiser.zero_grad()
        # run the classifier on all the seeds of the batch
        input = torch.tensor(b_data[1], dtype=torch.float32)
        input = input.to(device)
        prediction = duplicateClassifier(input)
        # loop over all the seeds in the batch
        for index, pred, truth in zip(b_data[0], prediction, b_data[2]):
            # when moving to a new particle, score the previous one
            if index != pid:
                if max_match == 0 or max_match == 2:
                    nb_good_match += 1
                if max_match == 2:
                    nb_best_match += 1
                batch_loss = computeLoss(
                    score_good,
                    score_duplicate,
                    score_fake,
                    batch_loss,
                    margin_duplicate=0.2,
                    margin_fake=0.4,
                )
                nb_part += 1
                # reset the bookkeeping for the new particle
                pid = index
                score_duplicate = []
                score_fake = []
                score_good = 1
                max_score = 0
                max_match = 1
            # store the score of the current seed according to its truth label
            if truth == 2:
                score_good = pred
            elif truth == 0:
                score_duplicate.append(pred)
            else:
                score_fake.append(pred)
            # keep track of the best-scored seed of the current particle
            if pred > max_score:
                max_score = pred
                max_match = truth
        # score the last particle of the batch
        if max_match == 0 or max_match == 2:
            nb_good_match += 1
        if max_match == 2:
            nb_best_match += 1
        batch_loss = computeLoss(
            score_good,
            score_duplicate,
            score_fake,
            batch_loss,
            margin_duplicate=0.2,
            margin_fake=0.4,
        )
        nb_part += 1
        # normalise the batch loss by the number of seeds and accumulate it
        batch_loss = batch_loss / len(b_data[0])
        loss += batch_loss.item()
        # if an optimiser was given, perform the backpropagation step
        if Optimiser:
            batch_loss.backward()
            Optimiser.step()
    loss = loss / len(batch)
    return nb_part, nb_good_match, nb_best_match, loss


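# train() splits the input into batches of truth particles, optimises the model
# on the first (1 - validation) fraction of the batches and, if validation > 0,
# evaluates it on the remaining batches, logging the loss and efficiencies to
# TensorBoard at every epoch.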
def train(
    duplicateClassifier: DuplicateClassifier,
    data: tuple[np.ndarray, np.ndarray, np.ndarray],
    epochs: int = 20,
    batch: int = 32,
    validation: float = 0.3,
) -> DuplicateClassifier:
    """Training of the MLP.

    @param[in] duplicateClassifier: model to be trained
    @param[in] data: tuple of three aligned arrays; for each seed they give the truth particle ID, the seed parameters and the truth label
    @param[in] epochs: number of epochs the model will be trained for
    @param[in] batch: number of truth particles per batch used in the training
    @param[in] validation: fraction of the batches used for validation
    @return: trained model
    """
    # Setup the TensorBoard writer and the optimiser
    writer = SummaryWriter()
    opt = torch.optim.Adam(duplicateClassifier.parameters())
    # Split the data into batches and reserve the last fraction for validation
    batch = batchSplit(data, batch)
    val_batch = int(len(batch) * (1 - validation))

    for epoch in range(epochs):
        print("Epoch: ", epoch, " / ", epochs)
        loss = 0.0
        nb_part = 0.0
        nb_good_match = 0.0

        # Train on the training fraction of the batches
        nb_part, nb_good_match, nb_best_match, loss = scoringBatch(
            batch[:val_batch], Optimiser=opt
        )
        print(
            "Loss/train: ",
            loss,
            " Eff/train: ",
            nb_good_match / nb_part,
            " Eff_best/train: ",
            nb_best_match / nb_part,
        )
        writer.add_scalar("Loss/train", loss, epoch)
        writer.add_scalar("Eff/train", nb_good_match / nb_part, epoch)
        writer.add_scalar("Eff_best/train", nb_best_match / nb_part, epoch)

        # Evaluate on the validation fraction of the batches
        if validation > 0.0:
            nb_part, nb_good_match, nb_best_match, loss = scoringBatch(
                batch[val_batch:]
            )
            writer.add_scalar("Loss/val", loss, epoch)
            writer.add_scalar("Eff/val", nb_good_match / nb_part, epoch)
            writer.add_scalar("Eff_best/val", nb_best_match / nb_part, epoch)
            print(
                "Loss/val: ",
                loss,
                " Eff/val: ",
                nb_good_match / nb_part,
                " Eff_best/val: ",
                nb_best_match / nb_part,
            )

    writer.close()
    return duplicateClassifier


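# Read the cleaned seed files used for the training (one CSV file per event)
# and prepare the training input.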
CKF_files = sorted(
    glob.glob("odd_output" + "/event000000[0-9][0-9][0-9]-seed_cleaned.csv")
)
data = readDataSet(CKF_files)

x_train, y_train = prepareTrainingData(data)

avg_mean = [x / events for x in avg_mean]
avg_sdv = [x / events for x in avg_sdv]

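# Build the classifier: a Normalise layer using the averaged statistics,
# followed by the MLP defined in seed_solver_network.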
input_dim = np.shape(x_train)[1]
layers_dim = [80, 80, 100, 80, 80]

duplicateClassifier = nn.Sequential(
    Normalise(avg_mean, avg_sdv), DuplicateClassifier(input_dim, layers_dim)
)
duplicateClassifier = duplicateClassifier.to(device)

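# Train the classifier, save it with torch.save and export it to ONNX with a
# dynamic batch dimension.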
input = data.index, x_train, y_train
train(duplicateClassifier, input, epochs=30, batch=128, validation=0.3)
duplicateClassifier.eval()
input_test = torch.tensor(x_train, dtype=torch.float32)
torch.save(duplicateClassifier, "seedduplicateClassifier.pt")
torch.onnx.export(
    duplicateClassifier,
    input_test[0:1],
    "seedduplicateClassifier.onnx",
    input_names=["x"],
    output_names=["y"],
    dynamic_axes={"x": {0: "batch_size"}, "y": {0: "batch_size"}},
)


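# Evaluate the trained classifier on an independent set of events.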
CKF_files_test = sorted(
    glob.glob("odd_output" + "/event000001[0-0][0-9][0-9]-seed_cleaned.csv")
)
test = readDataSet(CKF_files_test)

x_test, y_test = prepareTrainingData(test)

output_predict = []

model = torch.load("seedduplicateClassifier.pt")

x_test = torch.tensor(x_test, dtype=torch.float32)
x_test = x_test.to(device)
for x in x_test:
    output_predict.append(model(x))

for sample_test, sample_predict, sample_true in zip(
    test.index[0:100], output_predict[0:100], y_test[0:100]
):
    print(sample_test, sample_predict, sample_true)

pid = test.index[0]
nb_part = 0
nb_good_match = 0
nb_best_match = 0
max_match = 1
max_score = 0

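# For each truth particle, check whether its highest-scoring seed is a good or
# duplicate seed (good match) and whether it is the good seed itself (best match).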
for index, pred, truth in zip(test.index, output_predict, y_test):
    # when moving to a new particle, check the best-scored seed of the previous one
    if index != pid:
        nb_part += 1
        if max_match == 0 or max_match == 2:
            nb_good_match += 1
        if max_match == 2:
            nb_best_match += 1
        pid = index
        max_match = 1
        max_score = 0

    # keep track of the best-scored seed of the current particle
    if pred > max_score:
        max_score = pred
        max_match = truth

# account for the last particle of the test sample
nb_part += 1
if max_match == 0 or max_match == 2:
    nb_good_match += 1
if max_match == 2:
    nb_best_match += 1

0384 print("nb particles: ", nb_part)
0385 print("nb good match: ", nb_good_match)
0386 print("nb best match: ", nb_best_match)
0387 print("Efficiency: ", 100 * nb_good_match / nb_part, " %")
0388 print("Efficiency_best: ", 100 * nb_best_match / nb_part, " %")