import glob

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter

from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder

from seed_solver_network import (
    prepareDataSet,
    DuplicateClassifier,
    Normalise,
)

avg_mean = [0] * 14
avg_sdv = [0] * 14
events = 0
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def readDataSet(Seed_files: list[str]) -> pd.DataFrame:
    """Read the dataset from the different files, remove the particles with only fake seeds and combine the datasets

    @param[in] Seed_files: list of seed files to read (usually one file per event)
    @return: combined DataFrame containing all the seeds, ordered by event and then by truth particle ID within each event
    """
    data = pd.DataFrame()
    for f in Seed_files:
        datafile = pd.read_csv(f)
        datafile = prepareDataSet(datafile)
        data = pd.concat([data, datafile])
    return data


def prepareTrainingData(data: pd.DataFrame) -> tuple[np.ndarray, np.ndarray]:
    """Prepare the data for training

    @param[in] data: input DataFrame to be prepared
    @return: array of the network input and the corresponding truth
    """
    # Separate the truth from the input variables
    target_column = "good/duplicate/fake"
    y = LabelEncoder().fit(data[target_column]).transform(data[target_column])
    # Remove the truth and the variables not used as input
    input = data.drop(
        columns=[
            target_column,
            "seed_id",
            "Hits_ID",
        ]
    )
    # Compute the normalisation factors
    scale = StandardScaler()
    scale.fit(input.select_dtypes("number"))
    # Accumulate the normalisation factors, averaged over all files later on
    global avg_mean
    avg_mean = avg_mean + scale.mean_
    global avg_sdv
    avg_sdv = avg_sdv + scale.var_
    global events
    events = events + 1
    # Prepare the input features
    x_cat = OrdinalEncoder().fit_transform(input.select_dtypes("object"))
    x = np.concatenate((x_cat, input), axis=1)
    return x, y
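
# Note added for clarity (not in the original script): LabelEncoder orders the
# classes alphabetically, so if the "good/duplicate/fake" column holds those
# three string labels the encoded truth is duplicate = 0, fake = 1, good = 2.
# The rest of this script relies on that convention (truth == 2 marks the good
# seed).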


def batchSplit(
    data: tuple[np.ndarray, np.ndarray, np.ndarray], batch_size: int
) -> list[tuple[np.ndarray, np.ndarray, np.ndarray]]:
    """Split the data into batches, each containing @batch_size truth particles (the number of corresponding seeds may vary)

    @param[in] data: input data (particle ID, input features, truth) to be cut into batches
    @param[in] batch_size: number of truth particles per batch
    @return: list of tuples, each element corresponding to a batch
    """
    batch = []
    pid = data[0][0]
    n_particle = 0
    id_prev = 0
    id = 0
    for index, row, truth in zip(data[0], data[1], data[2]):
        if index != pid:
            pid = index
            n_particle += 1
            if n_particle == batch_size:
                b = data[0][id_prev:id], data[1][id_prev:id], data[2][id_prev:id]
                batch.append(b)
                n_particle = 0
                id_prev = id
        id += 1
    return batch


def computeLoss(
    score_good: torch.Tensor,
    score_duplicate: list[torch.Tensor],
    score_fake: list[torch.Tensor],
    batch_loss: torch.Tensor,
    margin_duplicate: float = 0.3,
    margin_fake: float = 0.9,
) -> torch.Tensor:
    """Compute one loss for each duplicate and fake seed associated with the particle

    @param[in] score_good: score returned by the model for the good seed associated with this particle
    @param[in] score_duplicate: list of the scores of all duplicate seeds associated with this particle
    @param[in] score_fake: list of the scores of all fake seeds associated with this particle
    @param[in] batch_loss: loss accumulated so far for this batch
    @param[in] margin_duplicate: margin used in the computation of the MarginRankingLoss for duplicate seeds
    @param[in] margin_fake: margin used in the computation of the MarginRankingLoss for fake seeds
    @return: the updated loss
    """
    # Compute the losses using the MarginRankingLoss with respect to the good seed score
    if score_duplicate:
        for s in score_duplicate:
            batch_loss += F.relu(s - score_good + margin_duplicate) / (
                len(score_duplicate) + len(score_fake) + 1
            )
    if score_fake:
        for s in score_fake:
            batch_loss += F.relu(s - score_good + margin_fake) / (
                len(score_duplicate) + len(score_fake) + 1
            )
    batch_loss += margin_fake / (len(score_duplicate) + len(score_fake) + 1)

    return batch_loss
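
# Note added for clarity (not in the original script): for one particle this
# function effectively adds to batch_loss
#   [ sum_d relu(s_d - s_good + margin_duplicate)
#     + sum_f relu(s_f - s_good + margin_fake) + margin_fake ] / N
# with N = len(score_duplicate) + len(score_fake) + 1, i.e. a margin ranking
# loss of every duplicate/fake score against the good-seed score, plus a
# constant term that only shifts the loss value.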


def scoringBatch(
    batch: list[tuple[np.ndarray, np.ndarray, np.ndarray]], Optimiser=0
) -> tuple[int, int, int, float]:
    """Run the MLP on a batch and compute the corresponding efficiency and loss. If an optimiser is specified, the MLP is trained.

    @param[in] batch: list of tuples, each element corresponding to a batch
    @param[in] Optimiser: optimiser for the MLP, if one is specified the network is trained on the batch
    @return: number of particles, number of particles where the good seed was found, number of particles where the good seed has the best score, and the loss
    """
    # number of particles
    nb_part = 0
    # number of particles associated with a good seed
    nb_good_match = 0
    # number of particles whose best seed is the good seed
    nb_best_match = 0
    # loss for the batch
    loss = 0
    # best seed score for a particle
    max_score = 0
    # is the best score associated with the good seed
    max_match = 1
    # loop over all the batches
    for b_data in batch:
        # ID of the current particle
        pid = b_data[0][0]
        # loss for the current batch
        batch_loss = 0
        # score of the good seed
        score_good = 1
        # scores of the duplicate seeds
        score_duplicate = []
        # scores of the fake seeds
        score_fake = []

        if Optimiser:
            Optimiser.zero_grad()
        input = torch.tensor(b_data[1], dtype=torch.float32)
        input = input.to(device)
        prediction = duplicateClassifier(input)
        # loop over all the seeds in the batch
        for index, pred, truth in zip(b_data[0], prediction, b_data[2]):
            # If we are changing particle, update the loss
            if index != pid:
                # Starting a new particle, compute the loss for the previous one
                if max_match == 0 or max_match == 2:
                    nb_good_match += 1
                if max_match == 2:
                    nb_best_match += 1
                batch_loss = computeLoss(
                    score_good,
                    score_duplicate,
                    score_fake,
                    batch_loss,
                    margin_duplicate=0.2,
                    margin_fake=0.4,
                )
                nb_part += 1
                # Reinitialise the variables for the next particle
                pid = index
                score_duplicate = []
                score_fake = []
                score_good = 1
                max_score = 0
                max_match = 1
            # Store seed scores
            if truth == 2:
                score_good = pred
            elif truth == 0:
                score_duplicate.append(pred)
            else:
                score_fake.append(pred)
            # Prepare efficiency computation
            if pred > max_score:
                max_score = pred
                max_match = truth
        # Compute the loss for the last particle when reaching the end of the batch
        if max_match == 0 or max_match == 2:
            nb_good_match += 1
        if max_match == 2:
            nb_best_match += 1
        batch_loss = computeLoss(
            score_good,
            score_duplicate,
            score_fake,
            batch_loss,
            margin_duplicate=0.2,
            margin_fake=0.4,
        )
        nb_part += 1
        # Normalise the loss to the batch size
        batch_loss = batch_loss / len(b_data[0])
        loss += batch_loss.item()
        # Perform the gradient descent if an optimiser was specified
        if Optimiser:
            batch_loss.backward()
            Optimiser.step()
    loss = loss / len(batch)
    return nb_part, nb_good_match, nb_best_match, loss


def train(
    duplicateClassifier: DuplicateClassifier,
    data: tuple[np.ndarray, np.ndarray, np.ndarray],
    epochs: int = 20,
    batch: int = 32,
    validation: float = 0.3,
) -> DuplicateClassifier:
    """Training of the MLP

    @param[in] duplicateClassifier: model to be trained
    @param[in] data: tuple containing three lists; each element of those lists corresponds to a given seed and represents: the truth particle ID, the seed parameters and the truth
    @param[in] epochs: number of epochs the model will be trained for
    @param[in] batch: size of the batches used in the training
    @param[in] validation: fraction of the batches used for validation
    @return: trained model
    """
    # Prepare tensorboard for the training plot
    # use 'tensorboard --logdir=runs' to access the plot afterward
    writer = SummaryWriter()
    opt = torch.optim.Adam(duplicateClassifier.parameters())
    # Split the data in batches
    batches = batchSplit(data, batch)
    val_batch = int(len(batches) * (1 - validation))
    # Loop over all the epochs
    for epoch in range(epochs):
        print("Epoch: ", epoch, " / ", epochs)
        loss = 0.0
        nb_part = 0.0
        nb_good_match = 0.0

        # Train the network over the training batches
        nb_part, nb_good_match, nb_best_match, loss = scoringBatch(
            batches[:val_batch], Optimiser=opt
        )
        print(
            "Loss/train: ",
            loss,
            " Eff/train: ",
            nb_good_match / nb_part,
            " Eff_best/train: ",
            nb_best_match / nb_part,
        )
        writer.add_scalar("Loss/train", loss, epoch)
        writer.add_scalar("Eff/train", nb_good_match / nb_part, epoch)
        writer.add_scalar("Eff_best/train", nb_best_match / nb_part, epoch)

        # If using validation, compute the efficiency and loss over the validation batches
        if validation > 0.0:
            nb_part, nb_good_match, nb_best_match, loss = scoringBatch(
                batches[val_batch:]
            )
            writer.add_scalar("Loss/val", loss, epoch)
            writer.add_scalar("Eff/val", nb_good_match / nb_part, epoch)
            writer.add_scalar("Eff_best/val", nb_best_match / nb_part, epoch)
            print(
                "Loss/val: ",
                loss,
                " Eff/val: ",
                nb_good_match / nb_part,
                " Eff_best/val: ",
                nb_best_match / nb_part,
            )

    writer.close()
    return duplicateClassifier


# ==================================================================

# ttbar events used as the training input, here we assume 1000 events are available
CKF_files = sorted(
    glob.glob("odd_output" + "/event000000[0-9][0-9][0-9]-seed_cleaned.csv")
)
data = readDataSet(CKF_files)
# Prepare the data
x_train, y_train = prepareTrainingData(data)

avg_mean = [x / events for x in avg_mean]
avg_sdv = [x / events for x in avg_sdv]

# Create our model and choose the layer sizes
input_dim = np.shape(x_train)[1]
layers_dim = [80, 80, 100, 80, 80]

duplicateClassifier = nn.Sequential(
    Normalise(avg_mean, avg_sdv), DuplicateClassifier(input_dim, layers_dim)
)
duplicateClassifier = duplicateClassifier.to(device)

# Train the model and save it
input = data.index, x_train, y_train
train(duplicateClassifier, input, epochs=30, batch=128, validation=0.3)
duplicateClassifier.eval()
input_test = torch.tensor(x_train, dtype=torch.float32).to(device)
torch.save(duplicateClassifier, "seedduplicateClassifier.pt")
torch.onnx.export(
    duplicateClassifier,
    input_test[0:1],
    "seedduplicateClassifier.onnx",
    input_names=["x"],
    output_names=["y"],
    dynamic_axes={"x": {0: "batch_size"}, "y": {0: "batch_size"}},
)
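
# Illustrative sketch (added for clarity, not part of the original chain): the
# exported ONNX model could be cross-checked with onnxruntime, assuming that
# package is installed, e.g.
#   import onnxruntime as ort
#   sess = ort.InferenceSession("seedduplicateClassifier.onnx")
#   scores = sess.run(["y"], {"x": input_test[0:1].cpu().numpy()})[0]
# where "x" and "y" are the input/output names declared in the export above.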

# ==================================================================

# ttbar events for the test, here we assume 40 events are available
CKF_files_test = sorted(
    glob.glob("odd_output" + "/event000001[0-0][0-9][0-9]-seed_cleaned.csv")
)

test = readDataSet(CKF_files_test)

# Prepare the data
x_test, y_test = prepareTrainingData(test)

# Write the network score to a list
output_predict = []

model = torch.load("seedduplicateClassifier.pt")

x_test = torch.tensor(x_test, dtype=torch.float32)
x_test = x_test.to(device)
for x in x_test:
    output_predict.append(model(x))

# For the first 100 seeds print the particle ID, the score and the truth
for sample_test, sample_predict, sample_true in zip(
    test.index[0:100], output_predict[0:100], y_test[0:100]
):
    print(sample_test, sample_predict, sample_true)

pid = test.index[0]
nb_part = 0
nb_good_match = 0
nb_best_match = 0
max_match = 1
max_score = 0

# Compute the efficiency
for index, pred, truth in zip(test.index, output_predict, y_test):
    if index != pid:
        nb_part += 1
        if max_match == 0 or max_match == 2:
            nb_good_match += 1
        if max_match == 2:
            nb_best_match += 1
        pid = index
        max_match = 1
        max_score = 0

    if pred > max_score:
        max_score = pred
        max_match = truth

print("nb particles: ", nb_part)
print("nb good match: ", nb_good_match)
print("nb best match: ", nb_best_match)
print("Efficiency: ", 100 * nb_good_match / nb_part, " %")
print("Efficiency_best: ", 100 * nb_best_match / nb_part, " %")