File indexing completed on 2026-06-26 08:40:22
0001
0002 """
0003 pcs_physics_completion_dryrun.py — read-only preview of the physics-tag re-tag.
0004
0005 Runs the real derivation + matching (``physics_match.derive_physics`` +
0006 ``services.find_or_create_physics_tag(dry_run=True)``) over both catalog
0007 populations — the csv_import EVGEN paths and the 4900 past Rucio DIDs — and
0008 reports what a catalog reload WOULD do once the completed-physics code is
0009 deployed: rows resolved, existing tags reused, distinct new tags created (per
0010 category), backgrounds routed to the signal-free p6001 tag, the p1006 anchor
0011 unwind, and any path whose physics cannot be derived. Writes nothing.
0012
0013 Usage::
0014 cd /data/wenauseic/github/swf-monitor/src
0015 source ../../swf-testbed/.venv/bin/activate && source ~/.env
0016 python ../scripts/pcs_physics_completion_dryrun.py
0017 """
0018 import os
0019 import sys
0020 from collections import Counter, defaultdict
0021
# Resolve this script's own directory so the path tweak below does not depend
# on the current working directory.
THIS_DIR = os.path.dirname(os.path.abspath(__file__))
# Put the ``src`` directory that sits next to this script's directory first on
# sys.path.
sys.path.insert(0, os.path.join(THIS_DIR, '..', 'src'))
# setdefault: a DJANGO_SETTINGS_MODULE already present in the environment wins.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'swf_monitor_project.settings')
# Django is imported and set up only after the path and settings module are in
# place.
import django
django.setup()
0027 from pcs.models import Dataset
0028 from pcs.physics_match import derive_physics
0029 from pcs.services import (
0030 find_or_create_physics_tag, _task_name_from_path, _extract_csv_filters,
0031 _physics_key, _PROCESS_CATEGORY,
0032 )
0033
# Display names for the category codes. main() looks these up with the values
# it reads from _PROCESS_CATEGORY and prints "?" for any code missing here.
CAT_NAME = {1: 'Single', 2: 'DIS', 3: 'DVCS', 4: 'SIDIS', 5: 'Exclusive', 6: 'Background'}
0035
0036
def _rows():
    """Yield ``(population, current_tag_label, path, derived)`` per catalog row.

    Two populations are walked in turn, selected by dataset-name prefix:

    * ``csv_import`` -- *path* is the task name taken from the row's
      ``source``/``location`` metadata, or the dataset name when no task
      name comes back; the beam is the ``beam`` entry returned by
      ``_extract_csv_filters``.
    * ``past`` -- *path* and the beam are read from the row's
      ``past_output`` metadata (``path.path_remainder`` and
      ``filters.beam``), each falling back to ``''`` when the key is absent.

    *current_tag_label* is the label of the physics tag the row is bound to
    now, and *derived* is whatever ``derive_physics(path, beam=beam)``
    returns.
    """
    csv_datasets = (
        Dataset.objects
        .filter(dataset_name__startswith='csv_import.')
        .select_related('physics_tag')
    )
    for dataset in csv_datasets.iterator():
        location = dataset.get_metadata_value('source', 'location', default='') or ''
        name = _task_name_from_path(location) or dataset.dataset_name
        csv_filters = _extract_csv_filters(location, 'epic_craterlake')
        beam = csv_filters.get('beam', '')
        label = dataset.physics_tag.tag_label
        yield 'csv_import', label, name, derive_physics(name, beam=beam)

    past_datasets = (
        Dataset.objects
        .filter(dataset_name__startswith='past.')
        .select_related('physics_tag')
    )
    for dataset in past_datasets.iterator():
        past_output = dataset.get_metadata_value('past_output', default={}) or {}
        path_info = past_output.get('path') or {}
        filter_info = past_output.get('filters') or {}
        remainder = path_info.get('path_remainder', '')
        beam = filter_info.get('beam', '')
        label = dataset.physics_tag.tag_label
        yield 'past', label, remainder, derive_physics(remainder, beam=beam)
0051
0052
def main():
    """Scan every catalog row and print the dry-run re-tag report.

    Each row produced by ``_rows()`` lands in exactly one bucket:

    * unrecognised -- ``derive_physics`` gave ``None``; the path is kept
      for the listing at the end of the report;
    * background -- the derived process is ``BEAMGAS`` or ``SYNRAD``; the
      row is only counted and no tag lookup is made for it;
    * resolved -- ``find_or_create_physics_tag(derived, dry_run=True)`` is
      called and the row is tallied under its category either as a reuse
      of an existing tag label or as a new distinct physics key.

    The report is written to stdout. An empty catalog is reported as such
    instead of failing on the "dominant binding" lookup, and a process with
    no entry in ``_PROCESS_CATEGORY`` (category ``None``) is listed after
    the known categories rather than breaking the sort.

    Returns:
        int: always ``0``; the ``__main__`` guard passes it to ``sys.exit``.
    """
    n = Counter()                     # rows per population, plus backgrounds
    current = Counter()               # rows per currently-bound tag label
    reuse_by_cat = defaultdict(set)   # category -> existing tag labels reused
    new_keys = defaultdict(set)       # category -> distinct new physics keys
    unrecognised = []                 # paths whose physics could not be derived
    moved_off_anchor = 0              # rows whose binding would change

    for pop, cur_label, path, derived in _rows():
        n[pop] += 1
        current[cur_label] += 1
        if derived is None:
            unrecognised.append(path)
            continue
        proc = derived.get('process')
        if proc in ('BEAMGAS', 'SYNRAD'):
            n['background_p6001'] += 1
            continue
        tag, action = find_or_create_physics_tag(derived, dry_run=True)
        cat = _PROCESS_CATEGORY.get(proc)
        if action == 'reuse':
            reuse_by_cat[cat].add(tag.tag_label)
            # A reuse only counts as a move when it lands on a different tag.
            if tag.tag_label != cur_label:
                moved_off_anchor += 1
        else:
            new_keys[cat].add(_physics_key(derived))
            moved_off_anchor += 1

    # Counter.most_common(1) is an empty list when nothing was scanned, so
    # indexing it unconditionally would raise IndexError on an empty catalog.
    if current:
        anchor_label, anchor_n = current.most_common(1)[0]
        anchor_line = (f'\ncurrent dominant binding (the anchor): '
                       f'{anchor_label} holds {anchor_n} rows')
    else:
        anchor_line = '\ncurrent dominant binding (the anchor): none (no rows scanned)'

    print('=' * 70)
    print('PCS PHYSICS-TAG RE-TAG — DRY RUN (no writes)')
    print('=' * 70)
    print(f'rows scanned: csv_import={n["csv_import"]} past={n["past"]}')
    print(f' resolved to a physics tag, moved off current binding: {moved_off_anchor}')
    print(f' backgrounds -> p6001 (signal-free): {n["background_p6001"]}')
    print(f' unrecognised (kept on anchor): {len(unrecognised)}')
    print(anchor_line)

    print('\nresulting physics tags per category (reuse existing + create new):')
    total_reuse = total_new = 0
    # Sort on (is-None, value): a None category never gets compared with an
    # int, which would raise TypeError, and it sorts after the known ones.
    categories = sorted(set(reuse_by_cat) | set(new_keys),
                        key=lambda c: (c is None, c))
    for cat in categories:
        r, c = len(reuse_by_cat.get(cat, ())), len(new_keys.get(cat, ()))
        total_reuse += r
        total_new += c
        print(f' cat {cat} {CAT_NAME.get(cat, "?"):10s} reuse {r:3d} create {c:3d} total {r + c:3d}')
    print(f' {"":15s} reuse {total_reuse:3d} create {total_new:3d} '
          f'TOTAL {total_reuse + total_new:3d} distinct physics tags')

    if unrecognised:
        u = Counter(unrecognised)
        shown = 20
        print(f'\nUNRECOGNISED paths ({len(unrecognised)} rows, {len(u)} distinct):')
        for p, c in u.most_common(shown):
            print(f' {c:4d}x {p!r}')
        # Make the truncation visible instead of silently dropping the tail.
        if len(u) > shown:
            print(f' ... {len(u) - shown} more distinct paths not shown')
    return 0
0109
0110
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == '__main__':
    sys.exit(main())