Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-06-26 08:40:22

0001 #!/usr/bin/env python3
0002 """
0003 backfill_sample_discriminator.py — one-shot identity backfill.
0004 
0005 Make the composed dataset name (``Dataset.build_dataset_name``) a unique
0006 identity for every live dataset, by materializing the two discriminators that
0007 the tag run alone does not carry (PCS.md §Sample Variants, JEDI_INTEGRATION
0008 §Output naming). Two independent corrections:
0009 
0010   1. Radiative corrections -> evgen tag. A generator whose EVGEN paths carry
0011      both ``Rad`` and ``noRad`` conflates two distinct physics configurations
0012      under one evgen tag. Split it: the existing tag becomes the noRad tag
0013      (``radiative='off'``), a new tag carries ``radiative='on'``, and the Rad
0014      datasets rebind to it. Generators seen with only one radiative state just
0015      get the matching ``radiative`` value recorded (no new tag, no rebind).
0016 
0017   2. Single-particle angle -> ``Dataset.sample_name``. SINGLE samples share a
0018      ``(particle, energy)`` physics tag and differ only by polar-angle range —
0019      a production discriminator, not a tag. Set ``sample_name`` from the path
0020      tail (``single_particle_angle``), so it composes after the tag run.
0021 
0022 Scope: live datasets only — those referenced by a non-``past_output`` ProdTask.
0023 Idempotent: re-running makes no further change once applied.
0024 
0025 Default is a DRY RUN: the work is performed inside a transaction that is then
0026 rolled back, so the before/after composed-name collision counts it prints are
0027 real, not predicted. Pass ``--apply`` to commit.
0028 
0029 Usage::
0030 
0031     cd /data/wenauseic/github/swf-monitor/src
0032     source ../../swf-testbed/.venv/bin/activate && source ~/.env
0033     python ../scripts/backfill_sample_discriminator.py            # dry-run
0034     python ../scripts/backfill_sample_discriminator.py --apply    # persist
0035 
0036 One-shot intra-app backfill, not an operational tool, so it boots Django
0037 directly rather than going through REST. After running once on a deployment it
0038 can be archived.
0039 """
0040 import argparse
0041 import os
0042 import sys
0043 from pathlib import Path
0044 
0045 
class _Rollback(Exception):
    """Private control-flow signal for the dry run.

    Raised inside the atomic block once the before/after measurements have
    been printed, so that the transaction is rolled back instead of committed.
    It carries no payload and is caught immediately by ``main``.
    """
0048 
0049 
def main():
    """Run the identity backfill over live datasets and report its effect.

    Parses ``--apply`` from the command line, boots Django in-process, then
    performs both corrections inside a single ``transaction.atomic()`` block:

      1. Records ``radiative`` in the parameters of each evgen tag whose live
         datasets carry a ``Rad`` / ``noRad`` path segment. A tag seen with
         both states is split: it keeps ``radiative='off'``, a tag with
         ``radiative='on'`` is found or created, and the ``Rad`` datasets are
         rebound to it. A tag that also owns datasets with no radiative
         segment is left untouched and reported as a warning.
      2. Sets ``Dataset.sample_name`` to the value returned by
         ``single_particle_angle`` for the dataset's path, when that value is
         non-empty and differs from the current one.

    Composed-name collision counts are printed before and after the work.
    Without ``--apply`` the transaction is rolled back by raising
    ``_Rollback``; with ``--apply`` it commits.

    Returns:
        int: always ``0``; the caller uses it as the process exit status.
    """
    ap = argparse.ArgumentParser(description=__doc__.strip().splitlines()[0])
    ap.add_argument('--apply', action='store_true',
                    help='Persist changes (default: dry-run, rolled back)')
    args = ap.parse_args()

    # The Django project lives in the ``src`` directory next to this script's
    # parent directory; make it importable before calling django.setup().
    src = Path(__file__).resolve().parent.parent / 'src'
    sys.path.insert(0, str(src))
    os.environ.setdefault('DJANGO_SETTINGS_MODULE',
                          'swf_monitor_project.settings')
    import django
    django.setup()

    # App imports must come after django.setup().
    from collections import defaultdict
    from django.db import transaction
    from pcs.models import Dataset, ProdTask
    from pcs.physics_match import single_particle_angle
    from pcs.services import _task_name_from_path, find_or_create_evgen_tag

    # "Live" = referenced by at least one task whose status is not past_output.
    live_ids = set(
        ProdTask.objects.exclude(status='past_output')
        .values_list('dataset_id', flat=True)
    )
    live = list(
        Dataset.objects.filter(id__in=live_ids).select_related(
            'physics_tag', 'evgen_tag', 'simu_tag', 'reco_tag', 'background_tag')
    )
    print(f'Live datasets (referenced by non-past_output tasks): {len(live)}')

    def collisions(datasets):
        """Map each composed name shared by 2+ datasets to their ids."""
        by_name = defaultdict(list)
        for d in datasets:
            by_name[d.build_dataset_name()].append(d.id)
        return {n: ids for n, ids in by_name.items() if len(ids) > 1}

    before = collisions(live)
    print(f'Composed-name collisions BEFORE: {len(before)} names over '
          f'{sum(len(v) for v in before.values())} datasets')
    print()

    def relpath(d):
        """Return the dataset's source location reduced to its task path."""
        return _task_name_from_path(d.source_location)

    def rad_state(d):
        """Return 'off' / 'on' / '' from a noRad / Rad path segment."""
        segs = relpath(d).split('/')
        # noRad is tested first, so a path with both segments counts as 'off'.
        if 'noRad' in segs:
            return 'off'
        if 'Rad' in segs:
            return 'on'
        return ''

    changes = {'tag_radiative_set': 0, 'rad_tags_created': 0,
               'datasets_rebound': 0, 'sample_name_set': 0, 'warnings': []}

    def warn(msg):
        """Record a warning for the summary and print it immediately."""
        changes['warnings'].append(msg)
        print(f'  WARNING: {msg}')

    try:
        with transaction.atomic():
            # ---- 1. Radiative corrections -> evgen tag ----
            # One pass over the live datasets: cache each dataset's radiative
            # state (the source path does not change during the run), group
            # the labelled datasets by evgen tag, and remember which tags also
            # own unlabelled datasets. Only datasets in state 'on' are ever
            # rebound below, so the unlabelled set stays valid for the whole
            # loop.
            rad_of = {}
            by_tag = defaultdict(list)
            unlabelled_tag_ids = set()
            for d in live:
                state = rad_state(d)
                rad_of[d.id] = state
                if state:
                    by_tag[d.evgen_tag_id].append(d)
                else:
                    unlabelled_tag_ids.add(d.evgen_tag_id)

            for tag_id, ds_list in by_tag.items():
                tag = ds_list[0].evgen_tag
                if tag is None:
                    # NOTE(review): only reachable if Dataset.evgen_tag is
                    # nullable — confirm against the model. There is no tag to
                    # label, so flag the datasets instead of crashing.
                    warn(f'{len(ds_list)} dataset(s) with a radiative path '
                         f'segment have no evgen tag — skipped')
                    continue

                # A tag that also carries datasets with no radiative path
                # segment can't be cleanly labelled — flag, don't guess.
                if tag_id in unlabelled_tag_ids:
                    warn(f'evgen {tag.tag_label} mixes radiative and '
                         f'non-radiative datasets — left unsplit')
                    continue

                states = sorted({rad_of[d.id] for d in ds_list})
                if states in (['off'], ['on']):
                    # Single radiative state: just record it on the tag.
                    v = states[0]
                    params = dict(tag.parameters or {})
                    if params.get('radiative') != v:
                        params['radiative'] = v
                        tag.parameters = params
                        tag.save(update_fields=['parameters'])
                        changes['tag_radiative_set'] += 1
                        print(f'  set {tag.tag_label} radiative={v} '
                              f'({len(ds_list)} datasets)')
                else:  # both 'off' and 'on' under one tag — split
                    # The existing tag becomes the noRad tag ...
                    params = dict(tag.parameters or {})
                    if params.get('radiative') != 'off':
                        params['radiative'] = 'off'
                        tag.parameters = params
                        tag.save(update_fields=['parameters'])
                        changes['tag_radiative_set'] += 1
                    # ... and the Rad tag is the same parameters with
                    # radiative flipped to 'on'.
                    on_params = dict(tag.parameters or {})
                    on_params['radiative'] = 'on'
                    on_tag, action = find_or_create_evgen_tag(
                        on_params, created_by='backfill-radiative')
                    if action == 'create':
                        changes['rad_tags_created'] += 1
                    print(f'  split {tag.tag_label} -> noRad stays '
                          f'{tag.tag_label}, Rad -> {on_tag.tag_label} '
                          f'({action})')
                    for d in ds_list:
                        if rad_of[d.id] == 'on':
                            d.evgen_tag = on_tag
                            d.evgen_tag_id = on_tag.id
                            d.save(update_fields=['evgen_tag'])
                            changes['datasets_rebound'] += 1

            # ---- 2. Single-particle angle -> sample_name ----
            for d in live:
                angle = single_particle_angle(relpath(d))
                # Skip datasets with no angle or an already-correct name, so
                # a re-run writes nothing.
                if angle and d.sample_name != angle:
                    d.sample_name = angle
                    d.save(update_fields=['sample_name'])
                    changes['sample_name_set'] += 1

            # Measured on the in-memory objects updated above, while still
            # inside the transaction.
            after = collisions(live)
            print()
            print(f"evgen radiative recorded:   {changes['tag_radiative_set']}")
            print(f"new Rad evgen tags created: {changes['rad_tags_created']}")
            print(f"datasets rebound to Rad tag:{changes['datasets_rebound']}")
            print(f"sample_name set (SINGLE):   {changes['sample_name_set']}")
            print(f"warnings (see above):       {len(changes['warnings'])}")
            print()
            print(f'Composed-name collisions AFTER:  {len(after)} names over '
                  f'{sum(len(v) for v in after.values())} datasets')
            if after:
                print('  Residual collisions:')
                for name, ids in after.items():
                    print(f'    {name}  <- datasets {ids}')

            if not args.apply:
                # Leaving the atomic block via an exception rolls it back.
                raise _Rollback
    except _Rollback:
        print()
        print('DRY RUN — transaction rolled back, nothing written. '
              'Re-run with --apply to commit.')
        return 0

    print()
    print('APPLIED — changes committed.')
    return 0
0192 
0193 
if __name__ == '__main__':
    # Script entry point: hand main()'s return value to the interpreter as
    # the process exit status.
    raise SystemExit(main())