File indexing completed on 2026-06-26 08:40:25
0001 """Derive canonical physics-tag parameters from a catalog task's EVGEN path.
0002
0003 The legacy ``csv_import`` catalog encodes a task's real physics in its path
0004 (``EVGEN/<category>/<sub>/.../<beam>/...``), not in a physics tag — the import
0005 pinned every row to one placeholder anchor tag, so the bound physics (beam,
0006 process) is wrong for almost all of them. This module parses the path into the
0007 parameters a physics tag carries, so each task can be matched to the correct
0008 existing tag, or have the right tag created. See EPICPROD_TASK_CATALOG.md.
0009
0010 The single source of truth is the path. ``derive_physics(path)`` returns the
0011 canonical params; the caller matches/creates the PhysicsTag and rebinds.
0012 """
0013 import re
0014
0015
0016
0017 _ABCONV = '_ABCONV'
0018
0019
0020 def _strip_abconv(s):
0021 return s[:-len(_ABCONV)] if s.endswith(_ABCONV) else s
0022
0023
0024 def _beam_pair(beam):
0025 """'18x275' -> ('18', '275'); '' -> ('N/A', 'N/A')."""
0026 if beam and 'x' in beam:
0027 e, h = beam.split('x', 1)
0028 return (e or 'N/A', h or 'N/A')
0029 return ('N/A', 'N/A')
0030
0031
0032
0033
0034
0035
0036 _KNOWN_AREAS = {'SINGLE', 'DIS', 'DDIS', 'SIDIS', 'EXCLUSIVE', 'EW_BSM', 'BACKGROUNDS'}
0037 _BEAM_RE = re.compile(r'^\d+x\d+$')
0038 _Q2_RE = re.compile(r'^(minQ2=\d+|q2_\S+)$')
0039 _ION_RE = re.compile(r'^e(H[0-9]+|He[0-9]+|Li[0-9]*|Ca|Cu|Ru|Pb|Au|C|O)$')
0040 _NUCLEON = {'ep', 'en'}
0041 _MASS_RE = re.compile(r'^ma_[0-9]')
0042 _CHANNEL_RE = re.compile(r'^aem')
0043 _UPSILON_RE = re.compile(r'^upsilon(1s|2s|3s)(photo|_threshold)_ab_(hiAcc|hiDiv)_(\d+x\d+)')
0044 _BEAMCONFIG = {'hiAcc', 'hiDiv'}
0045 _DECAY = {'edecay', 'mudecay'}
0046 _CHARGE = {'hplus', 'hminus'}
0047 _HELICITY = {'hel_plus', 'hel_minus'}
0048 _COHERENCE = {'coherent', 'Coherent'}
0049 _MODEL = {'bsat'}
0050 _POLAR = {'unpolarised', 'polarised'}
0051 _FINAL_STATE = {'pi+', 'pi-', 'pi0', 'K+', 'K-', 'K0', 'K+Lambda'}
0052
0053
0054
0055 def _split_compounds(tok):
0056 """Yield a segment and its '_'-split parts so embedded physics tokens are
0057 seen: 'coherent_ep' -> coherent, ep; 'ep_noradcor' -> ep (noradcor ignored)."""
0058 yield tok
0059 if '_' in tok:
0060 yield from tok.split('_')
0061
0062
0063 def _physics_area_segments(path):
0064 """Return path segments from the first recognised physics area onward,
0065 dropping any leading prefix — the ``EVGEN`` root, or a past Rucio-DID
0066 background-overlay chain (``Bkg_*/Synrad_*/GoldC*/<um>/…``). ``[]`` if no
0067 physics area is present."""
0068 segs = [s for s in (path or '').split('/') if s]
0069 for i, s in enumerate(segs):
0070 if s in _KNOWN_AREAS:
0071 return segs[i:]
0072 return []
0073
0074
0075 def _beam_split(tok):
0076 e, h = tok.split('x', 1)
0077 return (e or 'N/A', h or 'N/A')
0078
0079
def derive_physics(path, beam=''):
    """Derive the physics-tag parameter dict from an EVGEN (or Rucio-DID) path.

    The path is cut at its first recognised physics area; the segments after
    it are scanned token by token (order-independent, not positional).

    Args:
        path: catalog path; anything before the physics area is ignored.
        beam: optional 'ExH' beam string, used for BACKGROUNDS and as the
            fallback when the path itself carries no beam token.

    Returns:
        ``None`` when no physics area is present, otherwise a dict that always
        has ``process``, ``beam_energy_electron`` and ``beam_energy_hadron``
        plus whichever optional keys the path supplied. The angle range (a
        sample variant), radiation and generator (EVGEN-tag axes) are not
        included.

    BACKGROUNDS resolve to process BEAMGAS/SYNRAD so the caller can route them
    to the signal-free p6001 physics tag plus a k background tag.
    """
    segments = _physics_area_segments(path)
    if not segments:
        return None
    area = segments[0]
    rest = segments[1:]

    if area == 'SINGLE':
        # Particle gun: no beams; particle and energy are the next two segments.
        particle = rest[0] if rest else ''
        gun_energy = rest[1] if len(rest) > 1 else ''
        return {'process': 'SINGLE',
                'beam_energy_electron': 'N/A', 'beam_energy_hadron': 'N/A',
                'particle': particle,
                'gun_energy': gun_energy}

    if area == 'BACKGROUNDS':
        # Background beams come from the ``beam`` argument, not from the path.
        electron, hadron = _beam_pair(beam)
        if rest and rest[0] == 'SYNRAD':
            return {'process': 'SYNRAD',
                    'beam_energy_electron': electron, 'beam_energy_hadron': hadron}
        # Every other background sub-area is reported as BEAMGAS.
        return {'process': 'BEAMGAS',
                'beam_energy_electron': electron, 'beam_energy_hadron': hadron,
                'bg_source': rest[1] if len(rest) > 1 else '',
                'bg_mechanism': rest[2] if len(rest) > 2 else ''}

    # --- process name for the signal areas ---
    if area in ('DIS', 'DDIS'):
        if 'NC' in rest:
            process = 'DIS_NC'
        elif 'CC' in rest:
            process = 'DIS_CC'
        else:
            process = area
    elif area == 'SIDIS':
        sub_area = _strip_abconv(rest[0]) if rest else ''
        process = 'SIDIS_' + sub_area if sub_area in ('D0', 'DIJET', 'Lc') else 'SIDIS'
    elif area == 'EXCLUSIVE':
        process = _strip_abconv(rest[0]) if rest else 'EXCLUSIVE'
    elif area == 'EW_BSM':
        process = rest[0] if rest else 'EW_BSM'
    else:
        process = area
    sig = {'process': process}

    # Exact-match token sets, checked in this order. A non-None third item is
    # the fixed value stored instead of the token itself.
    keyword_fields = (
        (_BEAMCONFIG, 'beam_config', None),
        (_DECAY, 'decay_mode', None),
        (_CHARGE, 'hadron_charge', None),
        (_HELICITY, 'helicity', None),
        (_COHERENCE, 'coherence', 'coherent'),
        (_MODEL, 'model', None),
        (_POLAR, 'polarization', None),
        (_FINAL_STATE, 'final_state', None),
    )

    ions = []
    nucleons = []
    for raw in rest:
        upsilon = _UPSILON_RE.match(raw)
        if upsilon:
            # An upsilon segment is consumed whole; it is not token-scanned.
            state, mechanism, beam_config, energies = upsilon.groups()
            sig['state'] = state
            sig['mechanism'] = mechanism.lstrip('_')
            sig['beam_config'] = beam_config
            sig['beam_energy_electron'], sig['beam_energy_hadron'] = _beam_split(energies)
            continue
        for tok in _split_compounds(raw):
            if _BEAM_RE.match(tok):
                sig['beam_energy_electron'], sig['beam_energy_hadron'] = _beam_split(tok)
                continue
            if _Q2_RE.match(tok):
                sig['q2_range'] = tok
                continue
            if _ION_RE.match(tok):
                ions.append(tok)
                continue
            if tok in _NUCLEON:
                nucleons.append(tok)
                continue
            for members, key, fixed in keyword_fields:
                if tok in members:
                    sig[key] = tok if fixed is None else fixed
                    break
            else:
                # Prefix patterns are tried only after every exact set missed.
                if _MASS_RE.match(tok):
                    sig['mass'] = tok
                elif _CHANNEL_RE.match(tok):
                    sig['channel'] = tok

    # The first ion wins as beam species; a nucleon is then recorded
    # separately. With no ion, the first nucleon is the beam species.
    species = ions or nucleons
    if species:
        sig['beam_species'] = species[0]
    if ions and nucleons:
        sig['nucleon'] = nucleons[0]

    if 'beam_energy_electron' not in sig:
        # No beam token in the path: fall back to the ``beam`` argument.
        sig['beam_energy_electron'], sig['beam_energy_hadron'] = _beam_pair(beam)
    return sig
0165
0166
0167
0168
0169 _BG_MECHANISMS = ('brems', 'coulomb', 'touschek')
0170 _BG_BEAM_NXM = re.compile(r'^(\d+)x(\d+)$')
0171 _BG_BEAM_SINGLE = re.compile(r'^(\d+)GeV$')
0172
0173
0174 def _bg_beam(segs, source):
0175 """Beam energies from a backgrounds path. 'NxM' -> (N, M). A bare 'NGeV' is
0176 assigned by source: a proton source to the hadron beam, otherwise electron."""
0177 for s in segs:
0178 m = _BG_BEAM_NXM.match(s)
0179 if m:
0180 return m.group(1), m.group(2)
0181 m = _BG_BEAM_SINGLE.match(s)
0182 if m:
0183 return ('N/A', m.group(1)) if source == 'proton' else (m.group(1), 'N/A')
0184 return 'N/A', 'N/A'
0185
0186
def derive_background(path):
    """Derive the canonical background (k) tag params from a backgrounds path.

    Expects a stripped path whose first two segments are exactly
    ``EVGEN/BACKGROUNDS`` followed by at least one more segment; anything else
    returns ``None``.

    Layouts handled:
      * ``EVGEN/BACKGROUNDS/SYNRAD/<generator>/…`` — no source or mechanism.
      * ``EVGEN/BACKGROUNDS/<type>/<source>/<mechanism-or-generator>/…`` — the
        segment after ``<source>`` is a mechanism when it is one of
        brems/coulomb/touschek (the generator is then built from the next two
        segments, skipping beam-shaped ones); otherwise it is the generator.

    All values are passed through as the path spells them. The returned dict
    always has every match field (blank or 'N/A' where absent) so the
    tag-dedup lookup is consistent.
    """
    segs = (path or '').split('/')
    if len(segs) < 3 or segs[:2] != ['EVGEN', 'BACKGROUNDS']:
        return None

    def seg_at(index):
        # Segment at ``index``, or '' when the path is too short.
        return segs[index] if len(segs) > index else ''

    kind = segs[2]
    source = mechanism = generator = ''
    if kind == 'SYNRAD':
        generator = seg_at(3)
    else:
        source = seg_at(3)
        candidate = seg_at(4)
        if candidate in _BG_MECHANISMS:
            mechanism = candidate
            generator = '/'.join(
                s for s in segs[5:7]
                if s and not (_BG_BEAM_NXM.match(s) or _BG_BEAM_SINGLE.match(s))
            )
        else:
            generator = candidate

    # Beam energies are read from the whole path for every background type.
    electron, hadron = _bg_beam(segs, source)
    return {
        'background_type': kind,
        'bg_source': source,
        'bg_mechanism': mechanism,
        'bg_generator': generator,
        'beam_energy_electron': electron,
        'beam_energy_hadron': hadron,
    }
0225
0226
0227
0228
0229
0230 _EVGEN_NAMES = ('EpIC', 'BeAGLE', 'sartre', 'eSTARlight', 'eicMesonSFGen',
0231 'lAger', 'rapgap', 'DEMPgen', 'DJANGOH', 'GETaLM')
0232 _PYTHIA_RE = re.compile(r'^[Pp]ythia[ _]?(\d)(.*)$')
0233
0234
0235 def _split_gen_token(tok):
0236 """Split a '<Generator><Version>' token into (generator, generator_version),
0237 or (None, None) when there is no known generator *with a non-empty version*.
0238
0239 A bare generator name (e.g. 'pythia8', 'eSTARlight') has no version in the
0240 source and resolves to (None, None) — left for manual association, not
0241 guessed. The leading 'v' of a version is preserved (EpIC 'v1.1.6-1.2');
0242 pythiaN keeps its major-version digit in both the family and the version.
0243 """
0244 t = (tok or '').strip()
0245 if not t:
0246 return None, None
0247 m = _PYTHIA_RE.match(t)
0248 if m:
0249 if not m.group(2).strip(' ._-'):
0250 return None, None
0251 return f'pythia{m.group(1)}', f'{m.group(1)}{m.group(2)}'.lstrip('._- ')
0252 for g in _EVGEN_NAMES:
0253 if t.lower().startswith(g.lower()):
0254 ver = t[len(g):].lstrip('._- ')
0255 return (g, ver) if ver else (None, None)
0256 return None, None
0257
0258
def derive_evgen(path, gen_version=''):
    """Resolve the curated generator params for a catalog row.

    Returns a dict with ``generator`` and ``generator_version`` (plus
    ``radiative`` = 'off'/'on' when the path has a 'noRad'/'Rad' segment), or
    ``None`` when nothing resolves confidently — such rows are left for manual
    association, never guessed.

    Resolution order:
      1. ``EVGEN/SINGLE/...`` is the particle gun (empty version, no radiative).
      2. A ``gen_version`` containing 'pythia-rad-corr' resolves to ``None``.
      3. A GitHub ``dataprod_rel*`` release names no generator, so the
         repository name is used (e.g. EIC_ESR_Xsuite, EIC_SR_Geant4) with the
         release as version.
      4. Otherwise the release tag, the full ``gen_version`` and each path
         segment are tried in turn as a '<Generator><Version>' token; a bare
         generator with no version does not resolve.
    """
    segs = (path or '').split('/')
    if segs[:2] == ['EVGEN', 'SINGLE']:
        return {'generator': 'particle_gun', 'generator_version': ''}

    # 'noRad' takes precedence over 'Rad' when both segments are present.
    if 'noRad' in segs:
        radiative = 'off'
    elif 'Rad' in segs:
        radiative = 'on'
    else:
        radiative = ''

    def tagged(generator, version):
        # Build the result, adding 'radiative' only when the path named it.
        result = {'generator': generator, 'generator_version': version}
        if radiative:
            result['radiative'] = radiative
        return result

    gv = (gen_version or '').strip()
    if 'pythia-rad-corr' in gv.lower():
        return None

    # Last '/'-separated component of gen_version, ignoring trailing slashes.
    release = gv.rstrip('/').rsplit('/', 1)[-1] if gv else ''
    if release.startswith('dataprod_rel') and 'github.com/' in gv:
        after_host = gv.split('github.com/', 1)[1]
        repo_path = after_host.split('/releases', 1)[0]
        repo = repo_path.split('/')[-1]
        if repo:
            return tagged(repo, release)

    for candidate in (release, gv, *segs):
        generator, version = _split_gen_token(candidate)
        if generator:
            return tagged(generator, version)
    return None
0302
0303
def single_particle_angle(path):
    """Return the angular-range tail of a single-particle path, or ''.

    Single-particle samples share one ``(particle, gun_energy)`` physics tag
    and differ only by polar-angle range, which ``derive_physics`` leaves out
    on purpose. This returns whatever follows
    ``EVGEN/SINGLE/<particle>/<energy>/`` so the importer can keep it as a
    per-task override. Paths that do not start with ``EVGEN/SINGLE`` (and
    falsy paths) give ''.
    """
    segs = (path or '').split('/')
    if segs[:2] == ['EVGEN', 'SINGLE']:
        # Skip EVGEN, SINGLE, <particle>, <energy>; the remainder is the tail.
        return '/'.join(segs[4:])
    return ''