Warning, /detector_benchmarks/benchmarks/calo_pid/calo_pid.org is written in an unsupported language. File is not indexed.
0001 #+PROPERTY: header-args:jupyter-python :session /jpy:localhost#8888:benchmark :async yes :results drawer :exports both
0002
0003 #+TITLE: ePIC EEEMCal calorimetry PID
0004 #+AUTHOR: Dmitry Kalinkin
0005 #+OPTIONS: d:t
0006
0007 #+LATEX_CLASS_OPTIONS: [9pt,letter]
0008 #+BIND: org-latex-image-default-width ""
0009 #+BIND: org-latex-image-default-option "scale=0.3"
0010 #+BIND: org-latex-images-centered nil
0011 #+BIND: org-latex-minted-options (("breaklines") ("bgcolor" "black!5") ("frame" "single"))
0012 #+LATEX_HEADER: \usepackage[margin=1in]{geometry}
0013 #+LATEX_HEADER: \setlength{\parindent}{0pt}
0014 #+LATEX: \sloppy
0015
0016 #+begin_src jupyter-python :results silent
0017 import os
0018 from pathlib import Path
0019
0020 import awkward as ak
0021 import boost_histogram as bh
0022 import numpy as np
0023 import vector
0024 import uproot
0025 from sklearn.metrics import roc_curve
0026
0027 vector.register_awkward()
0028 #+end_src
0029
0030 * Parameters
0031
0032 #+begin_src jupyter-python :results silent
0033 DETECTOR_CONFIG=os.environ.get("DETECTOR_CONFIG")
0034 PLOT_TITLE=os.environ.get("PLOT_TITLE")
0035 INPUT_PIONS=os.environ.get("INPUT_PIONS")
0036 INPUT_ELECTRONS=os.environ.get("INPUT_ELECTRONS")
0037
0038 output_dir=Path(os.environ.get("OUTPUT_DIR", "./"))
0039 output_dir.mkdir(parents=True, exist_ok=True)
0040 #+end_src
0041
0042 * Plotting setup
0043
0044 #+begin_src jupyter-python :results silent
0045 import matplotlib as mpl
0046 import matplotlib.pyplot as plt
0047
0048 def setup_presentation_style():
0049 mpl.rcParams.update(mpl.rcParamsDefault)
0050 plt.style.use('ggplot')
0051 plt.rcParams.update({
0052 'axes.labelsize': 8,
0053 'axes.titlesize': 9,
0054 'figure.titlesize': 9,
0055 'figure.figsize': (4, 3),
0056 'legend.fontsize': 7,
0057 'xtick.labelsize': 8,
0058 'ytick.labelsize': 8,
0059 'pgf.rcfonts': False,
0060 })
0061
0062 setup_presentation_style()
0063 #+end_src
0064
0065 * Analysis
0066
0067 #+begin_src jupyter-python
0068 def filter_pointing(events):
0069 part_momentum = ak.zip({
0070 "m": events["MCParticles.mass"],
0071 "px": events["MCParticles.momentum.x"],
0072 "py": events["MCParticles.momentum.y"],
0073 "pz": events["MCParticles.momentum.z"],
0074 }, with_name="Momentum4D")
0075 cond = (part_momentum.eta[:,0] > -3.5) & (part_momentum.eta[:,0] < -2.)
0076 return events[cond]
0077
0078 def readlist(path):
0079 with open(path, "rt") as fp:
0080 paths = [line.rstrip() for line in fp.readlines()]
0081 return paths
0082
0083 e = filter_pointing(uproot.concatenate({filename: "events" for filename in readlist(INPUT_ELECTRONS)}, filter_name=["MCParticles.*", "*EcalEndcapN*"]))
0084 pi = filter_pointing(uproot.concatenate({filename: "events" for filename in readlist(INPUT_PIONS)}, filter_name=["MCParticles.*", "*EcalEndcapN*"]))
0085
0086 e_train = e[:len(pi)//2]
0087 pi_train = pi[:len(pi)//2]
0088 e_eval = e[len(pi)//2:]
0089 pi_eval = pi[len(pi)//2:]
0090 #+end_src
0091
0092 #+RESULTS:
0093 :results:
0094 :end:
0095
0096 #+begin_src jupyter-python
0097 nums = ak.num(pi_train["_EcalEndcapNParticleIDInput_features_floatData"], axis=1)
0098 num_features = ak.min(nums[nums > 0])
0099 print(f"{num_features} features")
0100 nums = ak.num(pi_train["_EcalEndcapNParticleIDTarget_int64Data"], axis=1)
0101 num_targets = ak.min(nums[nums > 0])
0102 print(f"{num_targets} targets")
0103
0104 pi_train_leading_cluster_ixs = ak.argsort(pi_train["EcalEndcapNClusters.energy"], ascending=False)[:,:1]
0105 e_train_leading_cluster_ixs = ak.argsort(e_train["EcalEndcapNClusters.energy"], ascending=False)[:,:1]
0106 pi_train_x = ak.flatten(ak.unflatten(pi_train["_EcalEndcapNParticleIDInput_features_floatData"], num_features, axis=1)[pi_train_leading_cluster_ixs])
0107 pi_train_y = ak.flatten(ak.unflatten(pi_train["_EcalEndcapNParticleIDTarget_int64Data"], num_targets, axis=1)[pi_train_leading_cluster_ixs])
0108 e_train_x = ak.flatten(ak.unflatten(e_train["_EcalEndcapNParticleIDInput_features_floatData"], num_features, axis=1)[e_train_leading_cluster_ixs])
0109 e_train_y = ak.flatten(ak.unflatten(e_train["_EcalEndcapNParticleIDTarget_int64Data"], num_targets, axis=1)[e_train_leading_cluster_ixs])
0110 train_x = ak.concatenate([pi_train_x, e_train_x])
0111 train_y = ak.concatenate([pi_train_y, e_train_y])
0112
0113 pi_eval_leading_cluster_ixs = ak.argsort(pi_eval["EcalEndcapNClusters.energy"], ascending=False)[:,:1]
0114 e_eval_leading_cluster_ixs = ak.argsort(e_eval["EcalEndcapNClusters.energy"], ascending=False)[:,:1]
0115 pi_eval_x = ak.flatten(ak.unflatten(pi_eval["_EcalEndcapNParticleIDInput_features_floatData"], num_features, axis=1)[pi_eval_leading_cluster_ixs])
0116 pi_eval_y = ak.flatten(ak.unflatten(pi_eval["_EcalEndcapNParticleIDTarget_int64Data"], num_targets, axis=1)[pi_eval_leading_cluster_ixs])
0117 e_eval_x = ak.flatten(ak.unflatten(e_eval["_EcalEndcapNParticleIDInput_features_floatData"], num_features, axis=1)[e_eval_leading_cluster_ixs])
0118 e_eval_y = ak.flatten(ak.unflatten(e_eval["_EcalEndcapNParticleIDTarget_int64Data"], num_targets, axis=1)[e_eval_leading_cluster_ixs])
0119 eval_x = ak.concatenate([pi_eval_x, e_eval_x])
0120 eval_y = ak.concatenate([pi_eval_y, e_eval_y])
0121 #+end_src
0122
0123 #+RESULTS:
0124 :results:
0125 : 11 features
0126 : 2 targets
0127 :end:
0128
0129 #+begin_src jupyter-python
0130
0131 #+end_src
0132
0133 #+RESULTS:
0134 :results:
0135 #+begin_export html
0136 <pre>[5.11,
0137 0.0424,
0138 3.03,
0139 2.16,
0140 17.7,
0141 8.32,
0142 -4.54e-07,
0143 0.000456,
0144 0,
0145 69.2,
0146 0]
0147 ------------------
0148 type: 11 * float32</pre>
0149 #+end_export
0150 :end:
0151
0152 #+begin_src jupyter-python
0153 ak.sum((ak.num(pi_train_leading_cluster_ixs) != 0)), ak.num(pi_train_leading_cluster_ixs, axis=0)
0154 #+end_src
0155
0156 #+RESULTS:
0157 :results:
0158 | 87721 | array | (88210) |
0159 :end:
0160
0161 #+begin_src jupyter-python
0162 plt.hist(pi_eval_x[:,0])
0163 plt.hist(e_eval_x[:,0], alpha=0.5)
0164 plt.show()
0165 #+end_src
0166
0167 #+RESULTS:
0168 :results:
0169 [[file:./.ob-jupyter/5381c9bd149f0bb8855bf539e7ce8ef927a2e1a9.png]]
0170 :end:
0171
0172 #+begin_src jupyter-python
0173 """
0174 fig, axs = plt.subplots(2, 4, sharex=True, sharey=True, figsize=(15, 6))
0175 fig_log, axs_log = plt.subplots(2, 4, sharex=True, sharey=True, figsize=(15, 6))
0176 fig_roc, axs_roc = plt.subplots(2, 4, sharex=True, sharey=True, figsize=(15, 6))
0177
0178 axs = np.ravel(np.array(axs))
0179 axs_log = np.ravel(np.array(axs_log))
0180 axs_roc = np.ravel(np.array(axs_roc))
0181
0182 rocs = {}
0183
0184 for ix, energy in enumerate(energies):
0185 for c in [False, True]:
0186 energy_value = float(energy.replace("GeV", "").replace("MeV", "e-3"))
0187 if c:
0188 clf_label = "leading cluster"
0189 else:
0190 clf_label = "sum all hits"
0191 def clf(events):
0192 if c:
0193 return ak.drop_none(ak.max(events["EcalEndcapNClusters.energy"], axis=-1)) / energy_value
0194 else:
0195 return ak.sum(events["EcalEndcapNRecHits.energy"], axis=-1) / energy_value
0196 e_pred = clf(e_eval[energy])
0197 pi_pred = clf(pi_eval[energy])
0198
0199 for do_log in [False, True]:
0200 plt.sca(axs[ix])
0201 plt.hist(e_pred, bins=np.linspace(0., 1.01, 101), label=rf"$e^-$ {clf_label}")
0202 plt.hist(pi_pred, bins=np.linspace(0., 1.01, 101), label=rf"$\pi^-$ {clf_label}", histtype="step")
0203 plt.title(f"{energy}")
0204 plt.legend()
0205 if do_log: plt.yscale("log")
0206
0207 plt.sca(axs_roc[ix])
0208 fpr, tpr, _ = roc_curve(
0209 np.concatenate([np.ones_like(e_pred), np.zeros_like(pi_pred)]),
0210 np.concatenate([e_pred, pi_pred]),
0211 )
0212 cond = fpr != 0 # avoid infinite rejection (region of large uncertainty)
0213 cond &= tpr != 1 # avoid linear interpolation (region of large uncertainty)
0214 def mk_interp(tpr, fpr):
0215 def interp(eff):
0216 return np.interp(eff, tpr, fpr)
0217 return interp
0218 rocs.setdefault(clf_label, {})[energy] = mk_interp(tpr, fpr)
0219 plt.plot(tpr[cond] * 100, 1 / fpr[cond], label=f"{clf_label}")
0220 plt.title(f"{energy}")
0221 plt.legend()
0222
0223 fig.show()
0224 plt.close(fig_log)
0225 fig_roc.show()
0226
0227 plt.figure()
0228 for clf_label, roc in rocs.items():
0229 plt.plot(
0230 [float(energy.replace("GeV", "").replace("MeV", "e-3")) for energy in energies],
0231 [1 / roc[energy](0.95) for energy in energies],
0232 marker=".",
0233 label=f"{clf_label}",
0234 )
0235 plt.legend()
0236 plt.show()
0237 """
0238 #+end_src
0239
0240 #+begin_src jupyter-python
0241 import catboost
0242 clf = {}
0243
0244 from sklearn.metrics import roc_curve
0245 roc = {}
0246
0247 clf = catboost.CatBoostClassifier(loss_function="CrossEntropy", verbose=0, n_estimators=1000)
0248 clf.fit(
0249 train_x.to_numpy(),
0250 train_y.to_numpy()[:,1], # index 1 = is electron
0251 )
0252 plt.hist(clf.predict_proba(e_eval_x.to_numpy())[:,1], bins=np.linspace(0., 1.01, 101), label=r"$e^-$")
0253 plt.hist(clf.predict_proba(pi_eval_x.to_numpy())[:,1], bins=np.linspace(0., 1.01, 101), label=r"$\pi^-$", histtype="step")
0254 plt.xlabel("Classifier's probability prediction", loc="right")
0255 plt.ylabel("Number of events", loc="top")
0256 plt.legend(loc="upper center")
0257 plt.savefig(output_dir / "predict_proba.pdf", bbox_inches="tight")
0258 plt.show()
0259 #+end_src
0260
0261 #+begin_src jupyter-python
0262 energy_bin_edges = np.arange(0., 20. + 1e-7, 1.)
0263
0264 _eval_energy = eval_x[:,0]
0265
0266 for energy_bin_ix, (energy_bin_low, energy_bin_high) in enumerate(zip(energy_bin_edges[:-1], energy_bin_edges[1:])):
0267 cond = (_eval_energy >= energy_bin_low) & (_eval_energy < energy_bin_high)
0268 print(energy_bin_low, energy_bin_high, ak.sum(cond))
0269
0270 pi_cond = (pi_eval_x[:,0] >= energy_bin_low) & (pi_eval_x[:,0] < energy_bin_high)
0271 e_cond = (e_eval_x[:,0] >= energy_bin_low) & (e_eval_x[:,0] < energy_bin_high)
0272 plt.hist(pi_eval_x[pi_cond][:,1], bins=np.linspace(0., 1.01, 101))
0273 plt.hist(e_eval_x[e_cond][:,1], bins=np.linspace(0., 1.01, 101))
0274 plt.yscale("log")
0275 plt.show()
0276 plt.clf()
0277
0278 fpr, tpr, _ = roc_curve(
0279 eval_y[cond][:,1],
0280 #eval_x[cond][:,1],
0281 clf.predict_proba(eval_x[cond].to_numpy())[:,1],
0282 )
0283 cond = fpr != 0 # avoid infinite rejection (region of large uncertainty)
0284 cond &= tpr != 1 # avoid linear interpolation (region of large uncertainty)
0285 #cond &= tpr > 0.5
0286 plt.plot(tpr[cond] * 100, 1 / fpr[cond])
0287
0288 def mk_interp(tpr, fpr):
0289 def interp(eff):
0290 return np.interp(eff, tpr, fpr)
0291 return interp
0292 roc[energy_bin_ix] = mk_interp(tpr, fpr)
0293
0294 plt.xlabel("Electron efficiency, %", loc="right")
0295 plt.ylabel("Pion rejection factor", loc="top")
0296 plt.title(rf"${energy_bin_low:.1f} < |\vec{{p}}| < {energy_bin_high:.1f}$ GeV")
0297 plt.legend(loc="lower left")
0298 plt.yscale("log")
0299 plt.savefig(output_dir / f"roc_{energy_bin_low:.1f}_{energy_bin_high:.1f}.pdf", bbox_inches="tight")
0300 plt.show()
0301 plt.clf()
0302
0303 plt.plot(
0304 (energy_bin_edges[:-1] + energy_bin_edges[1:]) / 2,
0305 [1 / roc[energy_bin_ix](0.95) for energy_bin_ix in range(len(energy_bin_edges) - 1)],
0306 marker=".",
0307 label="",
0308 )
0309 plt.yscale("log")
0310 plt.legend()
0311 plt.xlabel("Energy, GeV", loc="right")
0312 plt.ylabel("Pion rejection at 95%", loc="top")
0313 plt.savefig(output_dir / f"pion_rej.pdf", bbox_inches="tight")
0314 plt.show()
0315 #+end_src
0316
0317 #+begin_src jupyter-python
0318 clf.save_model(
0319 output_dir / "EcalEndcapN_pi_rejection.onnx",
0320 format="onnx",
0321 export_parameters={
0322 "onnx_doc_string": "Classifier model for pion rejection in EEEMCal",
0323 "onnx_graph_name": "CalorimeterParticleID_BinaryClassification",
0324 }
0325 )
0326 import onnx
0327
0328 model = onnx.load(output_dir / "EcalEndcapN_pi_rejection.onnx")
0329 onnx.checker.check_model(model)
0330 graph_def = onnx.helper.make_graph(
0331 nodes=[model.graph.node[0]],
0332 name=model.graph.name,
0333 inputs=model.graph.input,
0334 outputs=[model.graph.output[0], model.graph.value_info[0]],
0335 initializer=model.graph.initializer
0336 )
0337 model_def = onnx.helper.make_model(graph_def, producer_name=model.producer_name)
0338 del model_def.opset_import[:]
0339 op_set = model_def.opset_import.add()
0340 op_set.domain = "ai.onnx.ml"
0341 op_set.version = 2
0342 model_def = onnx.shape_inference.infer_shapes(model_def)
0343 onnx.checker.check_model(model_def)
0344 onnx.save(model_def, output_dir / "EcalEndcapN_pi_rejection.onnx")
0345 #+end_src
0346
0347 #+RESULTS:
0348 :results:
0349 :end:
0350
0351 #+begin_src jupyter-python
0352 if "_EcalEndcapNParticleIDOutput_probability_tensor_floatData" in pi_train.fields:
0353 nums = ak.num(pi_train["_EcalEndcapNParticleIDOutput_probability_tensor_floatData"], axis=1)
0354 num_outputs = ak.min(nums[nums > 0])
0355 print(f"{num_outputs} outputs")
0356
0357 pi_train_proba = ak.flatten(ak.unflatten(pi_train["_EcalEndcapNParticleIDOutput_probability_tensor_floatData"], num_outputs, axis=1)[pi_train_leading_cluster_ixs])
0358 e_train_proba = ak.flatten(ak.unflatten(e_train["_EcalEndcapNParticleIDOutput_probability_tensor_floatData"], num_outputs, axis=1)[e_train_leading_cluster_ixs])
0359 train_proba = ak.concatenate([pi_train_proba, e_train_proba])
0360
0361 pi_eval_proba = ak.flatten(ak.unflatten(pi_eval["_EcalEndcapNParticleIDOutput_probability_tensor_floatData"], num_outputs, axis=1)[pi_eval_leading_cluster_ixs])
0362 e_eval_proba = ak.flatten(ak.unflatten(e_eval["_EcalEndcapNParticleIDOutput_probability_tensor_floatData"], num_outputs, axis=1)[e_eval_leading_cluster_ixs])
0363 eval_proba = ak.concatenate([pi_eval_proba, e_eval_proba])
0364
0365 plt.hist(clf.predict_proba(eval_x.to_numpy())[:,1] - eval_proba[:,1].to_numpy())
0366 plt.savefig(output_dir / f"proba_diff.pdf", bbox_inches="tight")
0367 plt.show()
0368 else:
0369 print("EcalEndcapNParticleIDOutput not present")
0370 #+end_src