File indexing completed on 2026-06-26 08:40:22
0001
0002 """
0003 backfill_sample_discriminator.py — one-shot identity backfill.
0004
0005 Make the composed dataset name (``Dataset.build_dataset_name``) a unique
0006 identity for every live dataset, by materializing the two discriminators that
0007 the tag run alone does not carry (PCS.md §Sample Variants, JEDI_INTEGRATION
0008 §Output naming). Two independent corrections:
0009
0010 1. Radiative corrections -> evgen tag. A generator whose EVGEN paths carry
0011 both ``Rad`` and ``noRad`` conflates two distinct physics configurations
0012 under one evgen tag. Split it: the existing tag becomes the noRad tag
0013 (``radiative='off'``), a new tag carries ``radiative='on'``, and the Rad
0014 datasets rebind to it. Generators seen with only one radiative state just
0015 get the matching ``radiative`` value recorded (no new tag, no rebind).
0016
0017 2. Single-particle angle -> ``Dataset.sample_name``. SINGLE samples share a
0018 ``(particle, energy)`` physics tag and differ only by polar-angle range —
0019 a production discriminator, not a tag. Set ``sample_name`` from the path
0020 tail (``single_particle_angle``), so it composes after the tag run.
0021
0022 Scope: live datasets only — those referenced by a non-``past_output`` ProdTask.
0023 Idempotent: re-running makes no further change once applied.
0024
0025 Default is a DRY RUN: the work is performed inside a transaction that is then
0026 rolled back, so the before/after composed-name collision counts it prints are
0027 real, not predicted. Pass ``--apply`` to commit.
0028
0029 Usage::
0030
0031 cd /data/wenauseic/github/swf-monitor/src
0032 source ../../swf-testbed/.venv/bin/activate && source ~/.env
0033 python ../scripts/backfill_sample_discriminator.py # dry-run
0034 python ../scripts/backfill_sample_discriminator.py --apply # persist
0035
0036 One-shot intra-app backfill, not an operational tool, so it boots Django
0037 directly rather than going through REST. After running once on a deployment it
0038 can be archived.
0039 """
0040 import argparse
0041 import os
0042 import sys
0043 from pathlib import Path
0044
0045
class _Rollback(Exception):
    """Control-flow signal that aborts the dry-run transaction.

    ``main`` raises this inside its ``transaction.atomic()`` block when
    ``--apply`` was not given, after the before/after numbers have been
    printed, and catches it immediately outside the block.
    """
0048
0049
def main():
    """Run the one-shot radiative / single-particle-angle identity backfill.

    Parses ``--apply``, boots Django against ``swf_monitor_project.settings``,
    loads every dataset referenced by a non-``past_output`` ProdTask and then,
    inside a single transaction:

    1. records ``radiative`` on evgen tags, splitting a tag whose datasets
       carry both ``Rad`` and ``noRad`` path segments and rebinding the
       ``Rad`` datasets to the ``radiative='on'`` tag;
    2. sets ``sample_name`` from ``single_particle_angle`` of the dataset's
       path wherever that returns a value different from the stored one.

    Composed-name collision counts are printed before and after the work.
    Without ``--apply`` the transaction is rolled back by raising
    ``_Rollback``, so nothing is persisted.

    Returns:
        int: process exit code; ``0`` on both the dry-run and applied paths.
    """
    ap = argparse.ArgumentParser(description=__doc__.strip().splitlines()[0])
    ap.add_argument('--apply', action='store_true',
                    help='Persist changes (default: dry-run, rolled back)')
    args = ap.parse_args()

    # Make the sibling ``src`` tree importable and boot Django before any
    # model import; the ``pcs`` imports below rely on ``django.setup()``.
    src = Path(__file__).resolve().parent.parent / 'src'
    sys.path.insert(0, str(src))
    os.environ.setdefault('DJANGO_SETTINGS_MODULE',
                          'swf_monitor_project.settings')
    import django
    django.setup()

    from collections import defaultdict
    from django.db import transaction
    from pcs.models import Dataset, ProdTask
    from pcs.physics_match import single_particle_angle
    from pcs.services import _task_name_from_path, find_or_create_evgen_tag

    # "Live" = referenced by at least one task that is not past_output.
    live_ids = set(
        ProdTask.objects.exclude(status='past_output')
        .values_list('dataset_id', flat=True)
    )
    live = list(
        Dataset.objects.filter(id__in=live_ids).select_related(
            'physics_tag', 'evgen_tag', 'simu_tag', 'reco_tag',
            'background_tag')
    )
    print(f'Live datasets (referenced by non-past_output tasks): {len(live)}')

    def collisions(datasets):
        """Map each composed name shared by >1 dataset to those dataset ids."""
        by_name = defaultdict(list)
        for d in datasets:
            by_name[d.build_dataset_name()].append(d.id)
        return {n: ids for n, ids in by_name.items() if len(ids) > 1}

    def rad_state(path):
        """Return 'off' for a ``noRad`` segment, 'on' for ``Rad``, else ''."""
        segs = path.split('/')
        # ``noRad`` is tested first, so a path carrying both reads as 'off'.
        if 'noRad' in segs:
            return 'off'
        if 'Rad' in segs:
            return 'on'
        return ''

    def set_radiative(tag, value):
        """Persist ``radiative=value`` on *tag*; return True if it changed."""
        params = dict(tag.parameters or {})
        if params.get('radiative') == value:
            return False
        params['radiative'] = value
        tag.parameters = params
        tag.save(update_fields=['parameters'])
        return True

    before = collisions(live)
    print(f'Composed-name collisions BEFORE: {len(before)} names over '
          f'{sum(len(v) for v in before.values())} datasets')
    print()

    # Resolve each dataset's relative path and radiative state exactly once;
    # both are consulted repeatedly below.
    # NOTE(review): assumes _task_name_from_path is a pure function of
    # source_location — confirm against pcs.services.
    path_of = {d.id: _task_name_from_path(d.source_location) for d in live}
    rad_of = {d.id: rad_state(path_of[d.id]) for d in live}

    changes = {'tag_radiative_set': 0, 'rad_tags_created': 0,
               'datasets_rebound': 0, 'sample_name_set': 0, 'warnings': []}

    def warn(msg):
        """Record *msg* in ``changes['warnings']`` and print it."""
        changes['warnings'].append(msg)
        print(f' WARNING: {msg}')

    try:
        with transaction.atomic():

            # --- 1. radiative corrections -> evgen tag -------------------
            # Group radiative-marked datasets by evgen tag and, in the same
            # pass, note which tags also own datasets with no Rad/noRad
            # marker. Unmarked datasets are never rebound below, so this set
            # stays valid for the whole loop.
            by_tag = defaultdict(list)
            unmarked_tag_ids = set()
            for d in live:
                if rad_of[d.id]:
                    by_tag[d.evgen_tag_id].append(d)
                else:
                    unmarked_tag_ids.add(d.evgen_tag_id)

            for tag_id, ds_list in by_tag.items():
                tag = ds_list[0].evgen_tag

                # A marked dataset without an evgen tag has nothing to
                # record the state on; skip it instead of failing on None.
                if tag is None:
                    warn(f'{len(ds_list)} radiative-marked datasets have no '
                         f'evgen tag — skipped')
                    continue

                # A tag shared with unmarked datasets cannot be classified
                # unambiguously, so it is reported and left alone.
                if tag_id in unmarked_tag_ids:
                    warn(f'evgen {tag.tag_label} mixes radiative and '
                         f'non-radiative datasets — left unsplit')
                    continue

                states = {rad_of[d.id] for d in ds_list}
                if len(states) == 1:
                    # Only one radiative state seen: just record it.
                    v = next(iter(states))
                    if set_radiative(tag, v):
                        changes['tag_radiative_set'] += 1
                        print(f' set {tag.tag_label} radiative={v} '
                              f'({len(ds_list)} datasets)')
                else:
                    # Both states seen: the existing tag becomes the noRad
                    # tag and the Rad datasets move to a radiative='on' tag
                    # that otherwise has the same parameters.
                    if set_radiative(tag, 'off'):
                        changes['tag_radiative_set'] += 1
                    on_params = dict(tag.parameters or {})
                    on_params['radiative'] = 'on'
                    on_tag, action = find_or_create_evgen_tag(
                        on_params, created_by='backfill-radiative')
                    if action == 'create':
                        changes['rad_tags_created'] += 1
                    print(f' split {tag.tag_label} -> noRad stays '
                          f'{tag.tag_label}, Rad -> {on_tag.tag_label} '
                          f'({action})')
                    for d in ds_list:
                        if rad_of[d.id] == 'on':
                            d.evgen_tag = on_tag
                            d.evgen_tag_id = on_tag.id
                            d.save(update_fields=['evgen_tag'])
                            changes['datasets_rebound'] += 1

            # --- 2. single-particle angle -> Dataset.sample_name ---------
            for d in live:
                angle = single_particle_angle(path_of[d.id])
                if angle and d.sample_name != angle:
                    d.sample_name = angle
                    d.save(update_fields=['sample_name'])
                    changes['sample_name_set'] += 1

            # Measured on the mutated in-memory objects while the
            # transaction is still open, so dry-run numbers are real.
            after = collisions(live)
            print()
            print(f"evgen radiative recorded: {changes['tag_radiative_set']}")
            print(f"new Rad evgen tags created: {changes['rad_tags_created']}")
            print(f"datasets rebound to Rad tag:{changes['datasets_rebound']}")
            print(f"sample_name set (SINGLE): {changes['sample_name_set']}")
            if changes['warnings']:
                print(f"warnings: {len(changes['warnings'])}")
            print()
            print(f'Composed-name collisions AFTER: {len(after)} names over '
                  f'{sum(len(v) for v in after.values())} datasets')
            if after:
                print(' Residual collisions:')
                for name, ids in after.items():
                    print(f' {name} <- datasets {ids}')

            # Raising out of the atomic block makes Django roll it back.
            if not args.apply:
                raise _Rollback
    except _Rollback:
        print()
        print('DRY RUN — transaction rolled back, nothing written. '
              'Re-run with --apply to commit.')
        return 0

    print()
    print('APPLIED — changes committed.')
    return 0
0192
0193
# Script entry point: run the backfill and hand its return value to the
# interpreter as the process exit status.
if __name__ == '__main__':
    exit_code = main()
    sys.exit(exit_code)