Python/MLAmbiguityResolution/match_good_track-seed.py

0001 import glob
0002
0003 import pandas as pd
0004
0005
def _write_sorted_by_seed_id(frame: pd.DataFrame, path: str) -> None:
    """Write frame to the CSV file path, sorted and indexed by seed_id, without the temporary goodSeed column."""
    ordered = frame.sort_values("seed_id").set_index("seed_id")
    ordered.drop(columns=["goodSeed"]).to_csv(path)


def matchGood(seed_files: list[str], ckf_files: list[str]) -> None:
    """Relabel the seeds of each event according to the tracks they lead to.

    The two lists are walked in parallel (i-th track file with i-th seed file;
    if the lists differ in length the extra files are ignored). For each pair:
      - a seed whose seed_id belongs to a track labelled "good" becomes "good";
      - any other seed previously labelled "good" becomes "duplicate";
      - every seed of a particle that has no good seed at all becomes "fake".
    Rows with a missing particleId are not relabelled as "fake" and are never
    part of the cleaned output.

    Two CSV files are written next to each seed file (its last 4 characters,
    i.e. the ".csv" extension, are replaced):
      - "<seed file>_matched.csv": all the seeds with the updated labels;
      - "<seed file>_cleaned.csv": only the seeds of particles that have at
        least one good seed (header only if there is none).
    Both are sorted and indexed by seed_id.

    @param[in] seed_files: List of files containing seeds data (1 file per events usually)
    @param[in] ckf_files: List of files containing tracks data (1 file per events usually)
    """
    label = "good/duplicate/fake"
    for f_ckf, f_seed in zip(ckf_files, seed_files):
        print("reading file: ", f_ckf, f_seed)
        # Collect the seed IDs associated to the good tracks
        tracks = pd.read_csv(f_ckf)
        good_seed_ids = tracks.loc[tracks[label] == "good", "seed_id"]

        seeds = pd.read_csv(f_seed)
        # Temporary helper column, dropped again before the files are written
        seeds["goodSeed"] = seeds["seed_id"].isin(good_seed_ids)

        # Order matters: first demote every truth-matched seed, then promote
        # the ones that actually produced a good track.
        seeds.loc[seeds[label] == "good", label] = "duplicate"
        seeds.loc[seeds["goodSeed"], label] = "good"

        # Particles owning at least one good seed. NaN IDs are removed from the
        # lookup because isin() would otherwise match NaN with NaN.
        particle = seeds["particleId"]
        good_particles = particle[seeds["goodSeed"]].dropna()
        has_good_seed = particle.isin(good_particles)

        # Particles associated to only non-good seeds are considered as fake
        seeds.loc[~has_good_seed & particle.notna(), label] = "fake"
        # Slicing (rather than concatenating per particle) keeps the columns
        # even when no row is selected, so the sort below cannot fail.
        cleaned = seeds.loc[has_good_seed]

        # Save both datasets for future use
        base = f_seed[:-4]
        _write_sorted_by_seed_id(seeds, base + "_matched.csv")
        _write_sorted_by_seed_id(cleaned, base + "_cleaned.csv")
0061
0062
# Gather the per-event seed and CKF track files from the "odd_output" directory.
# Both lists are sorted because matchGood pairs them positionally.
seed_files = glob.glob("odd_output" + "/event*-seed.csv")
seed_files.sort()
ckf_files = glob.glob("odd_output" + "/event*-tracks_ckf.csv")
ckf_files.sort()
# Match the seeds with the tracks to find which seeds lead to the good tracks
matchGood(seed_files=seed_files, ckf_files=ckf_files)