epic_capybara/cli/bara.py

0001 import gzip
0002 import os
0003 import re
0004
0005 import awkward as ak
0006 import click
0007 import numpy as np
0008 import uproot
0009 from bokeh.events import DocumentReady
0010 from bokeh.io import curdoc
0011 from bokeh.layouts import gridplot
0012 from bokeh.models import ColumnDataSource
0013 from bokeh.models import CustomJSExpr
0014 from bokeh.models import Range1d
0015 from bokeh.models import PrintfTickFormatter
0016 from bokeh.plotting import figure, output_file, save
0017 from hist import Hist
0018 from scipy.stats import kstest
0019
0020 from ..util import skip_common_prefix
0021
0022 _MIDPOINT_EXPR_CODE = """
0023 const y1 = this.data.y1;
0024 const y2 = this.data.y2;
0025 return y1.map((v, i) => (v + y2[i]) / 2);
0026 """
0027
0028
0029 def _is_leaf(obj):
0030     """Check if an uproot branch/field object is a leaf (has no sub-branches/sub-fields).
0031
0032     Supports both TTree TBranch objects (which use `.branches`) and
0033     RNTuple RField objects (which use `.fields`).
0034     """
0035     if hasattr(obj, 'branches'):
0036         return len(obj.branches) == 0
0037     if hasattr(obj, 'fields'):
0038         return len(obj.fields) == 0
0039     return True
0040
0041
0042 def _normalize_key(key):
0043     """Normalize uproot key format between TTree and RNTuple styles.
0044
0045     TTree EDM4hep keys follow 'CollectionName/CollectionName.fieldPath' pattern.
0046     RNTuple EDM4hep keys follow 'CollectionName.fieldPath' pattern.
0047     This function converts TTree-style keys to the RNTuple-style format so that
0048     the same physics quantity has the same key regardless of the input file format.
0049
0050     TTree keys for fixed-size array branches include a trailing '[N]' size
0051     annotation (e.g. 'covariance.covariance[21]') which is absent in RNTuple
0052     keys; this suffix is stripped so the two formats match.
0053
0054     >>> _normalize_key('MCParticles/MCParticles.momentum.x')
0055     'MCParticles.momentum.x'
0056     >>> _normalize_key('MCParticles.momentum.x')
0057     'MCParticles.momentum.x'
0058     >>> _normalize_key('EventHeader/EventHeader.eventNumber')
0059     'EventHeader.eventNumber'
0060     >>> _normalize_key('CentralCKFTrackParameters/CentralCKFTrackParameters.covariance.covariance[21]')
0061     'CentralCKFTrackParameters.covariance.covariance'
0062     """
0063     if "/" in key:
0064         _, field_part = key.split("/", 1)
0065     else:
0066         field_part = key
0067     return re.sub(r'\[\d+\]$', '', field_part)
0068
0069
0070 def match_filter(key, match, unmatch):
0071     accept = True
0072     if match:
0073         accept = False
0074         for regex in match:
0075             if regex.match(key):
0076                 accept = True
0077     for regex in unmatch:
0078         if regex.match(key):
0079             accept = False
0080     return accept
0081
0082
0083 @click.command()
0084 @click.argument("files", type=click.File('rb'), nargs=-1)
0085 @click.option(
0086     "-m", "--match", multiple=True,
0087     help="Only include collections with names matching a regex"
0088 )
0089 @click.option(
0090     "-M", "--unmatch", multiple=True,
0091     help="Exclude collections with names matching a regex"
0092 )
0093 @click.option(
0094     "--serve", is_flag=True,
0095     default=False,
0096     help="Run a local HTTP server to view the report"
0097 )
0098 def bara(files, match, unmatch, serve):
0099     arr = {}
0100
0101     match = list(map(re.compile, match))
0102     unmatch = list(map(re.compile, unmatch))
0103
0104     for _file in files:
0105         tree = uproot.open(_file)["events"]
0106
0107         sort_by_evtnum = None
0108         for evtnum_key in ["EventHeader/EventHeader.eventNumber", "EventHeader.eventNumber"]:
0109             if evtnum_key in tree.keys(recursive=True):
0110                 evtnum = tree[evtnum_key].array()
0111                 sort_by_evtnum = ak.argsort(ak.flatten(evtnum))
0112                 break
0113
0114         for key in tree.keys(recursive=True):
0115             if not key.startswith("PARAMETERS") and _is_leaf(tree[key]):
0116                 normalized = _normalize_key(key)
0117                 if match_filter(normalized, match, unmatch):
0118                     val = tree[key].array()
0119                     if sort_by_evtnum is not None:
0120                         val = val[sort_by_evtnum]
0121                     arr.setdefault(normalized, {})[_file] = val
0122
0123     paths = skip_common_prefix([_file.name.split("/") for _file in files])
0124     paths = skip_common_prefix([reversed(list(path)) for path in paths])
0125     labels = ["/".join(reversed(list(reversed_path))) for reversed_path in paths]
0126
0127     collection_figs = {}
0128     collection_with_diffs = {}
0129     collection_step_exprs = {}
0130
0131     for key in sorted(arr.keys()):
0132         if any("string" in str(ak.type(a)) for a in arr[key].values()):
0133             click.echo(f"String value detected for key \"{key}\". Skipping...")
0134             continue
0135         x_min = min(filter(
0136             lambda v: v is not None,
0137             map(lambda a: ak.min(ak.mask(a, np.isfinite(a))), arr[key].values())
0138         ), default=None)
0139         if x_min is None:
0140             continue
0141         x_range = max(filter(
0142             lambda v: v is not None,
0143             map(lambda a: ak.max(ak.mask(a - x_min, np.isfinite(a))), arr[key].values())
0144         ), default=None)
0145         nbins = 10
0146
0147         if (any("* uint" in str(ak.type(a)) for a in arr[key].values())
0148            or any("* int" in str(ak.type(a)) for a in arr[key].values())):
0149             x_range = x_range + 1
0150             nbins = int(min(100, np.ceil(x_range)))
0151         else:
0152             x_range = x_range * 1.1
0153
0154         if x_range == 0:
0155             x_range = 1
0156
0157         if "." in key:
0158             branch_name = key.split(".", 1)[0]
0159             leaf_name = key
0160         else:
0161             branch_name = key
0162             leaf_name = key
0163
0164         midpoint_expr = collection_step_exprs.setdefault(
0165             branch_name,
0166             CustomJSExpr(code=_MIDPOINT_EXPR_CODE),
0167         )
0168         fig = figure(x_axis_label=leaf_name, y_axis_label="Entries")
0169         if x_range < 1.:
0170             fig.xaxis.formatter = PrintfTickFormatter(format="%.2g")
0171         collection_figs.setdefault(branch_name, []).append(fig)
0172         y_max = 0
0173
0174         prev_file_arr = None
0175         vis_params = [
0176           ("green", 1.5, "solid", " "),
0177           ("red", 3, "dashed", ","),
0178           ("blue", 2, "dotted", "."),
0179         ]
0180
0181         if set(arr[key].keys()) != set(files):
0182             # not every file has the key
0183             collection_with_diffs[branch_name] = 0.0
0184
0185         for _file, label, (color, line_width, line_dash, hatch_pattern) in zip(files, labels, vis_params):
0186             if _file not in arr[key]:
0187                 continue
0188             file_arr = arr[key][_file]
0189
0190             # diff and KS test
0191             pvalue = None
0192             if prev_file_arr is not None:
0193                 if ((ak.num(file_arr, axis=0) != ak.num(prev_file_arr, axis=0))
0194                    or ak.any(ak.num(file_arr, axis=1)
0195                              != ak.num(prev_file_arr, axis=1))
0196                    or ak.any(ak.nan_to_none(file_arr)
0197                              != ak.nan_to_none(prev_file_arr))):
0198                     if (ak.num(ak.flatten(file_arr, axis=None), axis=0) > 0 and
0199                         ak.num(ak.flatten(prev_file_arr, axis=None), axis=0) > 0):
0200                         # We can only apply the KS test on non-empty arrays
0201                         pvalue = kstest(
0202                                 ak.to_numpy(ak.flatten(file_arr, axis=None)),
0203                                 ak.to_numpy(ak.flatten(prev_file_arr, axis=None))
0204                             ).pvalue
0205                     else:
0206                         pvalue = 0
0207                     print(key, f"p = {pvalue:.3f}")
0208                     print(prev_file_arr)
0209                     print(file_arr)
0210                     collection_with_diffs[branch_name] = min(pvalue, collection_with_diffs.get(branch_name, 1.))
0211
0212             # Figure
0213             h = (
0214                 Hist.new
0215                 .Reg(nbins, 0, x_range, name="x", label=key)
0216                 .Int64()
0217             )
0218             h.fill(x=ak.flatten(file_arr - x_min, axis=None))
0219
0220             ys, edges = h.to_numpy()
0221             y0 = np.concatenate([ys, [ys[-1]]])
0222             legend_label=label + (f"\n{100*pvalue:.0f}%CL KS" if pvalue is not None else "")
0223             source = ColumnDataSource(
0224                 {
0225                     "x": edges + x_min,
0226                     "y1": y0 - np.sqrt(y0),
0227                     "y2": y0 + np.sqrt(y0),
0228                 }
0229             )
0230             step_r = fig.step(
0231                 x="x",
0232                 y={"expr": midpoint_expr},
0233                 mode="after",
0234                 source=source,
0235                 legend_label=legend_label,
0236                 line_color=color,
0237                 line_width=line_width,
0238                 line_dash=line_dash,
0239             )
0240             step_r.nonselection_glyph = step_r.glyph
0241             varea_r = fig.varea_step(
0242                 x="x",
0243                 y1="y1",
0244                 y2="y2",
0245                 step_mode="after",
0246                 source=source,
0247                 legend_label=legend_label,
0248                 fill_color=color if hatch_pattern == " " else None,
0249                 fill_alpha=0.25,
0250                 hatch_color=color,
0251                 hatch_alpha=0.5,
0252                 hatch_pattern=hatch_pattern,
0253             )
0254             varea_r.nonselection_glyph = varea_r.glyph
0255             fig.legend.background_fill_alpha = 0.5 # make legend more transparent
0256
0257             y_max = max(y_max, np.max(y0 + np.sqrt(y0)))
0258             prev_file_arr = file_arr
0259
0260         x_bounds = (x_min - 0.05 * x_range, x_min + 1.05 * x_range)
0261         y_bounds = (- 0.05 * y_max, 1.05 * y_max)
0262         # Set y range for histograms
0263         if np.all(np.isfinite(x_bounds)):
0264             try:
0265                 fig.x_range = Range1d(
0266                     *x_bounds,
0267                     bounds=x_bounds)
0268             except ValueError as e:
0269                 click.secho(str(e), fg="red", err=True)
0270         else:
0271             click.secho(f"overflow while calculating x bounds for \"{key}\"", fg="red", err=True)
0272         if np.all(np.isfinite(y_bounds)):
0273             try:
0274                 fig.y_range = Range1d(
0275                     *y_bounds,
0276                     bounds=y_bounds)
0277             except ValueError as e:
0278                 click.secho(str(e), fg="red", err=True)
0279         else:
0280             click.secho(f"overflow while calculating y bounds for \"{key}\"", fg="red", err=True)
0281
0282     def to_filename(branch_name):
0283         return branch_name.replace("#", "__pound__")
0284
0285     def option_key(item):
0286         collection_name, figs = item
0287         key = ""
0288         if collection_name in collection_with_diffs:
0289             if collection_with_diffs[collection_name] > 0.99:
0290                 key += " 0.99"
0291             elif collection_with_diffs[collection_name] > 0.95:
0292                 key += " 0.95"
0293             elif collection_with_diffs[collection_name] > 0.67:
0294                 key += " 0.67"
0295             else:
0296                 key += " 0.00"
0297         key += collection_name.lstrip("_")
0298         return key
0299
0300     options = [("", "")]
0301     for collection_name, figs in sorted(collection_figs.items(), key=option_key):
0302         marker = ""
0303         if collection_name in collection_with_diffs:
0304             if collection_with_diffs[collection_name] > 0.99:
0305                 marker = " (*)"
0306             elif collection_with_diffs[collection_name] > 0.95:
0307                 marker = " (**)"
0308             elif collection_with_diffs[collection_name] > 0.67:
0309                 marker = " (***)"
0310             else:
0311                 marker = " (****)"
0312         options.append((to_filename(collection_name), collection_name + marker))
0313
0314     from bokeh.models import CustomJS, Select
0315     def mk_dropdown(value=""):
0316         dropdown = Select(title="Select branch (**** < 67% CL, ..., * > 99% CL stat. equiv.):", value=value, options=options)
0317         dropdown.js_on_change("value", CustomJS(code="""
0318           console.log('dropdown: ' + this.value, this.toString())
0319           if (this.value != "") {
0320             window.location.hash = "#" + this.value;
0321             fetchAndReplaceBokehDocument(this.value);
0322           }
0323         """))
0324         return dropdown
0325
0326     def mk_dropdown_minimal(value=""):
0327         # Embed only the currently selected option; the full list is stored once
0328         # in index.html's JavaScript and restored client-side after each load.
0329         # This avoids repeating a ~54 KB options list in every .json.gz file.
0330         label = next((lbl for val, lbl in options if val == value), value)
0331         minimal_options = [("", "")] + ([(value, label)] if value else [])
0332         dropdown = Select(title="Select branch (**** < 67% CL, ..., * > 99% CL stat. equiv.):", value=value, options=minimal_options)
0333         dropdown.js_on_change("value", CustomJS(code="""
0334           console.log('dropdown: ' + this.value, this.toString())
0335           if (this.value != "") {
0336             window.location.hash = "#" + this.value;
0337             fetchAndReplaceBokehDocument(this.value);
0338           }
0339         """))
0340         return dropdown
0341
0342     from bokeh.layouts import column
0343     from bokeh.embed import json_item
0344     import json
0345
0346     os.makedirs("capybara-reports", exist_ok=True)
0347
0348     for collection_name, figs in collection_figs.items():
0349         item = column(
0350           mk_dropdown_minimal(collection_name),
0351           gridplot(figs, ncols=3, width=400, height=300),
0352         )
0353
0354         with gzip.open(f"capybara-reports/{to_filename(collection_name)}.json.gz", "wt") as fp:
0355             json.dump(json_item(item), fp, separators=(',', ':'))
0356
0357     curdoc().js_on_event(DocumentReady, CustomJS(args={"all_options": options}, code="""
0358       window._bokehSelectOptions = all_options;
0359
0360       function fetchAndReplaceBokehDocument(location) {
0361         fetch(location + '.json.gz')
0362           .then(async function(response) {
0363             if (!response.ok) {
0364                 throw new Error('Network response was not ok');
0365             }
0366
0367             const ds = new DecompressionStream('gzip');
0368             const decompressedStream = response.body.pipeThrough(ds);
0369             const decompressedResponse = new Response(decompressedStream);
0370             const item = await decompressedResponse.json();
0371
0372             Bokeh.documents[0].replace_with_json(item.doc);
0373
0374             // Restore the full options list to the newly loaded Select widget.
0375             for (const [, model] of Bokeh.documents[0]._all_models) {
0376               if (model.options instanceof Array) {
0377                 model.options = window._bokehSelectOptions;
0378                 model.value = location;
0379                 break;
0380               }
0381             }
0382           })
0383           .catch(function(error) {
0384             console.error('Fetch or decompression failed:', error);
0385           });
0386       }
0387
0388       window.onhashchange = function() {
0389         var location = window.location.hash.replace(/^#/, "");
0390         if ((location != "") && ((typeof current_location === 'undefined') || (current_location != location))) {
0391           fetchAndReplaceBokehDocument(location);
0392           window.current_location = location;
0393         }
0394       }
0395       window.onhashchange();
0396     """))
0397     output_file(filename="capybara-reports/index.html", title="ePIC capybara report")
0398     save(mk_dropdown())
0399
0400     if serve:
0401         os.chdir("capybara-reports/")
0402         from http.server import SimpleHTTPRequestHandler
0403         from socketserver import TCPServer
0404         with TCPServer(("127.0.0.1", 24535), SimpleHTTPRequestHandler) as httpd:
0405             print("Serving report at http://127.0.0.1:24535")
0406             try:
0407                 httpd.serve_forever()
0408             except KeyboardInterrupt:
0409                 pass