import glob

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils
from torch.utils.tensorboard import SummaryWriter

from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder

from ambiguity_solver_network import prepareDataSet, DuplicateClassifier, Normalise

# Running averages of the normalisation parameters of the input features,
# accumulated by prepareTrainingData and used later to build the
# normalisation layer
avg_mean = [0, 0, 0, 0, 0, 0, 0, 0]
avg_sdv = [0, 0, 0, 0, 0, 0, 0, 0]
events = 0
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def readDataSet(CKS_files: list[str]) -> pd.DataFrame:
    """Read the dataset from the different files, remove the pure duplicate tracks and combine the datasets.

    @param[in] CKS_files: list of paths to the track files (usually one file per event)
    @return: combined DataFrame containing all tracks, ordered by event and then by truth particle ID within each event
    """
    data = pd.DataFrame()
    for f in CKS_files:
        datafile = pd.read_csv(f)
        # Relabel fake tracks as duplicates: the network only needs to single
        # out one good track per truth particle
        datafile.loc[
            datafile["good/duplicate/fake"] == "fake", "good/duplicate/fake"
        ] = "duplicate"
        datafile = prepareDataSet(datafile)
        # Combine the data of all events into a single DataFrame
        data = pd.concat([data, datafile])
    return data


def prepareTrainingData(data: pd.DataFrame) -> tuple[np.ndarray, np.ndarray]:
    """Prepare the data for the training.

    @param[in] data: input DataFrame to be prepared
    @return: array of the network inputs and array of the corresponding truth labels
    """
    target_column = "good/duplicate/fake"
    # Separate the truth from the input parameters; LabelEncoder sorts the
    # classes alphabetically, so "duplicate" maps to 0 and "good" to 1
    y = LabelEncoder().fit(data[target_column]).transform(data[target_column])
    input_data = data.drop(
        columns=[
            target_column,
            "track_id",
            "nMajorityHits",
            "nSharedHits",
            "truthMatchProbability",
            "Hits_ID",
            "chi2",
            "pT",
        ]
    )
    # Accumulate the normalisation parameters of the numerical columns into
    # the global running averages
    scale = StandardScaler()
    scale.fit(input_data.select_dtypes("number"))
    global avg_mean
    avg_mean = avg_mean + scale.mean_
    global avg_sdv
    avg_sdv = avg_sdv + scale.var_
    global events
    events = events + 1
    # Encode the categorical (string) columns as ordinals and prepend them to
    # the network input
    x_cat = OrdinalEncoder().fit_transform(input_data.select_dtypes("object"))
    x = np.concatenate((x_cat, input_data), axis=1)
    return x, y


def batchSplit(
    data: tuple[np.ndarray, np.ndarray, np.ndarray], batch_size: int
) -> list[tuple[np.ndarray, np.ndarray, np.ndarray]]:
    """Split the data into batches, each containing @batch_size truth particles (the number of corresponding tracks may vary).

    @param[in] data: tuple of (particle ID, input, truth) arrays to be cut into batches
    @param[in] batch_size: number of truth particles per batch
    @return: list of tuples, each element corresponding to one batch
    """
    batch = []
    pid = data[0][0]
    n_particle = 0
    idx_prev = 0
    idx = 0
    # Count the truth particles by watching the particle ID change, and cut a
    # new batch every @batch_size particles
    for index, _, _ in zip(data[0], data[1], data[2]):
        if index != pid:
            pid = index
            n_particle += 1
            if n_particle == batch_size:
                b = data[0][idx_prev:idx], data[1][idx_prev:idx], data[2][idx_prev:idx]
                batch.append(b)
                n_particle = 0
                idx_prev = idx
        idx += 1
    return batch


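# Toy illustration of batchSplit (hypothetical particle IDs): with
# data[0] = [1, 1, 2, 2, 3] and batch_size = 2, the single complete batch
# contains the four tracks of particles 1 and 2. The tracks of particle 3 are
# dropped, since a batch is only emitted once batch_size particles have been
# completed.

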
def computeLoss(
    score_good: torch.Tensor,
    score_duplicate: list[torch.Tensor],
    batch_loss: torch.Tensor,
    margin: float = 0.05,
) -> torch.Tensor:
    """Compute one loss term for each duplicate track associated with the particle and add it to the batch loss.

    @param[in] score_good: score returned by the model for the good track associated with this particle
    @param[in] score_duplicate: list of the scores of all duplicate tracks associated with this particle
    @param[in] batch_loss: loss accumulated over the current batch so far
    @param[in] margin: margin used in the computation of the margin ranking loss
    @return: the updated loss
    """
    if score_duplicate:
        for s in score_duplicate:
            # Margin ranking loss: penalise every duplicate whose score is not
            # at least @margin below the good track's score, averaged over the
            # duplicates
            batch_loss += F.relu(s - score_good + margin) / len(score_duplicate)
    return batch_loss


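# Toy sanity check of computeLoss with made-up scores (illustrative only).
# Each per-pair term equals torch's margin ranking loss with target +1:
# relu(0.2 - 0.8 + 0.05) / 2 = 0 for the first duplicate, and
# relu(0.79 - 0.8 + 0.05) / 2 = 0.02 for the second, which scores within the
# margin of the good track.
_toy_loss = computeLoss(
    torch.tensor(0.8),
    [torch.tensor(0.2), torch.tensor(0.79)],
    torch.tensor(0.0),
    margin=0.05,
)
assert abs(_toy_loss.item() - 0.02) < 1e-6

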
def scoringBatch(
    batch: list[tuple[np.ndarray, np.ndarray, np.ndarray]], Optimiser=None
) -> tuple[int, int, float]:
    """Run the MLP on a batch and compute the corresponding efficiency and loss. If an optimiser is specified, train the MLP.

    @param[in] batch: list of batches as returned by batchSplit
    @param[in] Optimiser: optimiser for the MLP; if one is specified the network is trained on the batch
    @return: number of particles, number of particles where the good track was found, and the loss
    """
    # Number of particles
    nb_part = 0
    # Number of particles whose good track received the highest score
    nb_good_match = 0
    # Loss accumulated over the batches
    loss = 0
    # Best track score seen for the current particle
    max_score = 0
    # Whether the best score belongs to the good track
    max_match = 0
    for b_data in batch:
        # ID of the current particle
        pid = b_data[0][0]
        # Loss accumulated over the current batch
        batch_loss = 0
        # Score of the good track of the current particle
        score_good = 0
        # Scores of the duplicate tracks of the current particle
        score_duplicate = []
        if Optimiser:
            Optimiser.zero_grad()
        input_tensor = torch.tensor(b_data[1], dtype=torch.float32)
        input_tensor = input_tensor.to(device)
        # Score the whole batch at once with the global duplicateClassifier
        prediction = duplicateClassifier(input_tensor)
        # Loop over all the tracks in the batch
        for index, pred, truth in zip(b_data[0], prediction, b_data[2]):
            # When the particle ID changes, update the loss and the
            # efficiency for the previous particle
            if index != pid:
                if max_match == 1:
                    nb_good_match += 1
                batch_loss = computeLoss(
                    score_good, score_duplicate, batch_loss, margin=0.05
                )
                nb_part += 1
                # Reinitialise the bookkeeping for the next particle
                pid = index
                score_duplicate = []
                score_good = 0
                max_score = 0
                max_match = 0
            # Store the scores of the good and duplicate tracks separately
            if truth:
                score_good = pred
            else:
                score_duplicate.append(pred)
            # Track the best score; ties count against the good track
            if pred == max_score:
                max_match = 0
            if pred > max_score:
                max_score = pred
                max_match = truth
        # Account for the last particle of the batch
        if max_match == 1:
            nb_good_match += 1
        batch_loss = computeLoss(score_good, score_duplicate, batch_loss, margin=0.05)
        nb_part += 1
        # Normalise the loss to the number of tracks in the batch
        batch_loss = batch_loss / len(b_data[0])
        loss += batch_loss.item()
        # Perform the gradient descent if an optimiser was specified
        if Optimiser:
            batch_loss.backward()
            Optimiser.step()
    loss = loss / len(batch)
    return nb_part, nb_good_match, loss


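# Bookkeeping example for the efficiency above (made-up scores): for a
# particle whose tracks score [0.3 (good), 0.7 (duplicate), 0.1 (duplicate)],
# the highest score belongs to a duplicate, so max_match ends at 0 and the
# particle does not count towards nb_good_match. A duplicate that merely ties
# the good track also counts as a failure, since an equal prediction resets
# max_match to 0.

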
def train(
    duplicateClassifier: DuplicateClassifier,
    data: tuple[np.ndarray, np.ndarray, np.ndarray],
    epochs: int = 20,
    batch: int = 32,
    validation: float = 0.3,
) -> DuplicateClassifier:
    """Training of the MLP.

    @param[in] duplicateClassifier: model to be trained
    @param[in] data: tuple containing three lists; each element of those lists corresponds to a given track and represents the truth particle ID, the track parameters and the truth
    @param[in] epochs: number of epochs the model will be trained for
    @param[in] batch: number of truth particles per batch
    @param[in] validation: fraction of the batches kept for validation
    @return: trained model
    """
    # Prepare tensorboard for monitoring the training
    writer = SummaryWriter()
    opt = torch.optim.Adam(duplicateClassifier.parameters())
    # Split the data into batches; the batches after val_batch are kept for validation
    batches = batchSplit(data, batch)
    val_batch = int(len(batches) * (1 - validation))

    for epoch in range(epochs):
        print("Epoch: ", epoch, " / ", epochs)
        # Train on the training batches and monitor the loss and efficiency
        nb_part, nb_good_match, loss = scoringBatch(batches[:val_batch], Optimiser=opt)
        print("Loss/train: ", loss, " Eff/train: ", nb_good_match / nb_part)
        writer.add_scalar("Loss/train", loss, epoch)
        writer.add_scalar("Eff/train", nb_good_match / nb_part, epoch)

        # Evaluate on the validation batches (no optimiser, so no training)
        if validation > 0.0:
            nb_part, nb_good_match, loss = scoringBatch(batches[val_batch:])
            writer.add_scalar("Loss/val", loss, epoch)
            writer.add_scalar("Eff/val", nb_good_match / nb_part, epoch)
            print("Loss/val: ", loss, " Eff/val: ", nb_good_match / nb_part)

    writer.close()
    return duplicateClassifier


# Events 0 to 79 from the odd_output directory are used as the training sample
CKF_files = sorted(glob.glob("odd_output" + "/event0000000[0-7][0-9]-tracks_ckf.csv"))
data = readDataSet(CKF_files)

# Prepare the data
x_train, y_train = prepareTrainingData(data)

# Average the normalisation parameters accumulated by prepareTrainingData
avg_mean = [x / events for x in avg_mean]
avg_sdv = [x / events for x in avg_sdv]

# Build the model: a normalisation layer followed by the classifier MLP
input_dim = np.shape(x_train)[1]
layers_dim = [10, 15, 10]

duplicateClassifier = nn.Sequential(
    Normalise(avg_mean, avg_sdv), DuplicateClassifier(input_dim, layers_dim)
)
duplicateClassifier = duplicateClassifier.to(device)

# Train the model; the truth particle ID is stored in the DataFrame index
train_input = data.index, x_train, y_train
train(duplicateClassifier, train_input, epochs=20, batch=128, validation=0.3)
duplicateClassifier.eval()

# Save the trained model, both as a torch model and as an ONNX model traced
# with a single training sample
input_test = torch.tensor(x_train, dtype=torch.float32).to(device)
torch.save(duplicateClassifier, "duplicateClassifier.pt")
torch.onnx.export(
    duplicateClassifier,
    input_test[0:1],
    "duplicateClassifier.onnx",
    input_names=["x"],
    output_names=["y"],
    dynamic_axes={"x": {0: "batch_size"}, "y": {0: "batch_size"}},
)


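# Optional sanity check of the exported model (hypothetical snippet, assuming
# the onnxruntime package is available):
#   import onnxruntime as ort
#   sess = ort.InferenceSession("duplicateClassifier.onnx")
#   onnx_out = sess.run(["y"], {"x": input_test[0:1].cpu().numpy()})[0]
#   torch_out = duplicateClassifier(input_test[0:1]).detach().cpu().numpy()
#   assert np.allclose(onnx_out, torch_out, atol=1e-5)

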
# Apply the trained model to the test sample (events 80 to 99)
CKF_files_test = sorted(
    glob.glob("odd_output" + "/event0000000[8-9][0-9]-tracks_ckf.csv")
)
test = readDataSet(CKF_files_test)

# Prepare the test data
x_test, y_test = prepareTrainingData(test)

# Score each track of the test sample with the trained network
output_predict = []

x_test = torch.tensor(x_test, dtype=torch.float32)
x_test = x_test.to(device)
with torch.no_grad():
    for x in x_test:
        output_predict.append(duplicateClassifier(x))


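# Note: the loop above scores one track at a time for clarity. Since the
# classifier is a plain MLP, the whole test set could equally be scored in a
# single batched call, e.g. output_predict = list(duplicateClassifier(x_test)).
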
# Print a few examples of the network output
for sample_test, sample_predict, sample_true in zip(
    test.index[0:100], output_predict[0:100], y_test[0:100]
):
    print(sample_test, sample_predict, sample_true)

# Compute the test efficiency: a particle counts as a good match if its
# highest-scoring track is the good one
pid = test.index[0]
nb_part = 1
nb_good_match = 0
max_match = 0
max_score = 0

for index, pred, truth in zip(test.index, output_predict, y_test):
    if index != pid:
        # Moving to a new particle: record whether the previous one was matched
        if max_match == 1:
            nb_good_match += 1
        pid = index
        nb_part += 1
        max_score = 0
        max_match = 0
    # Ties count against the good track
    if pred == max_score:
        max_match = 0
    if pred > max_score:
        max_score = pred
        max_match = truth
# Account for the last particle of the sample
if max_match == 1:
    nb_good_match += 1

print("nb particles: ", nb_part)
print("nb good match: ", nb_good_match)
print("Efficiency: ", 100 * nb_good_match / nb_part, " %")