Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-27 07:41:45

0001 """Query helpers for the alarm dashboard — ORM over the Entry / EntryVersion
0002 tables. No model logic beyond what the views need.
0003 """
0004 from __future__ import annotations
0005 
0006 import json
0007 import time
0008 from datetime import datetime, timezone, timedelta
0009 
0010 from django.db.models import Count, Max
0011 
0012 from .models import Entry, EntryContext, EntryVersion
0013 
0014 
# context_id used by every alarm / event / engine_run query in this module.
CONTEXT_NAME = 'swf-alarms'
# context_id used by the team lookups (kind='team').
TEAMS_CONTEXT = 'teams'
0017 
0018 
def _active_alarm_configs_qs():
    """Queryset of alarm config entries that are neither archived nor
    deleted, ordered oldest-created first."""
    live_alarms = Entry.objects.filter(
        context_id=CONTEXT_NAME,
        kind='alarm',
        archived=False,
        deleted_at__isnull=True,
    )
    return live_alarms.order_by('timestamp_created')
0024 
0025 
def alarm_configs() -> list[dict]:
    """Return one display dict per active alarm config, oldest first.

    Each dict carries the entry's id/title/content/timestamps, the raw
    ``data`` payload, and a few denormalised fields for the templates:

    - ``entry_id``: ``data['entry_id']``, or ``''`` when missing or null.
    - ``name``: ``entry_id`` with a leading ``alarm_`` prefix removed,
      falling back to the first 8 characters of the entry id.
    - ``enabled``: defaults to True when the key is absent.
    - ``recipients_text``: recipients rendered as a single string.
    - ``params``: a shallow copy of ``data['params']`` (``{}`` if unset).
    """
    prefix = 'alarm_'
    out = []
    for e in _active_alarm_configs_qs():
        data = e.data or {}
        # `or ''` also covers an explicit JSON null, which would otherwise
        # reach the string methods below as None and crash.
        entry_id = data.get('entry_id') or ''
        # Strip the prefix only when it really is a prefix (same rule as
        # _event_entry_id_for); a replace() would also eat an 'alarm_'
        # found in the middle of the id.
        short = entry_id[len(prefix):] if entry_id.startswith(prefix) else entry_id
        out.append({
            'id': e.id,
            'entry_id': entry_id,
            'name': short or e.id[:8],
            'title': e.title,
            'content': e.content,
            'enabled': bool(data.get('enabled', True)),
            # Render recipients as a plain string for display. Storage
            # may be str (new-style, user-typed, preserved) or list[str]
            # (legacy seed rows). NEVER `list(x)` on an unknown-typed x —
            # that iterates characters when x is already a string.
            'recipients_text': _recipients_display(data.get('recipients')),
            'params': dict(data.get('params') or {}),
            'created': e.timestamp_created,
            'modified': e.timestamp_modified,
            'data': data,
        })
    return out
0048 
0049 
0050 def _recipients_display(value) -> str:
0051     if value is None or value == '':
0052         return ''
0053     if isinstance(value, str):
0054         return value
0055     if isinstance(value, (list, tuple)):
0056         return ', '.join(str(v) for v in value)
0057     return str(value)
0058 
0059 
def get_alarm_config_by_entry_id(entry_id: str) -> Entry | None:
    """Return the non-deleted alarm config whose ``data.entry_id`` equals
    *entry_id*, or None when there is no such entry.

    Archived entries are not excluded by this lookup.
    """
    # QuerySet.first() returns None on an empty result instead of raising
    # DoesNotExist, so no exception handling is needed here.
    return (Entry.objects
            .filter(context_id=CONTEXT_NAME, kind='alarm',
                    data__entry_id=entry_id,
                    deleted_at__isnull=True)
            .first())
0069 
0070 
def events_since(alarm_entry_id: str, hours: int, limit: int = 500) -> list[dict]:
    """Events of this alarm created within the last *hours* hours.

    Newest first, capped at *limit* rows. Each row is converted with
    ``_event_to_dict`` so the template gets denormalised fields.
    """
    window_start = time.time() - hours * 3600
    recent = Entry.objects.filter(
        context_id=CONTEXT_NAME,
        kind='event',
        data__entry_id=_event_entry_id_for(alarm_entry_id),
        archived=False,
        deleted_at__isnull=True,
        timestamp_created__gte=window_start,
    ).order_by('-timestamp_created')
    return [_event_to_dict(row) for row in recent[:limit]]
0085 
0086 
def count_events_since(alarm_entry_id: str, hours: int) -> int:
    """Number of events of this alarm created within the last *hours*
    hours (non-archived, non-deleted rows only)."""
    window_start = time.time() - hours * 3600
    matching = Entry.objects.filter(
        context_id=CONTEXT_NAME,
        kind='event',
        data__entry_id=_event_entry_id_for(alarm_entry_id),
        archived=False,
        deleted_at__isnull=True,
        timestamp_created__gte=window_start,
    )
    return matching.count()
0096 
0097 
def _active_events_qs(alarm_entry_id: str):
    """List of this alarm's non-archived, non-deleted event rows whose
    ``clear_time`` is missing or null, newest first.

    The clear_time test runs in Python rather than in the query: Django's
    __isnull on a JSON path only matches missing keys, not keys holding a
    JSON ``null``.
    """
    candidates = Entry.objects.filter(
        context_id=CONTEXT_NAME,
        kind='event',
        data__entry_id=_event_entry_id_for(alarm_entry_id),
        archived=False,
        deleted_at__isnull=True,
    ).order_by('-timestamp_created')
    still_active = []
    for row in candidates:
        payload = row.data or {}
        if payload.get('clear_time') is None:
            still_active.append(row)
    return still_active
0109 
0110 
def active_event_count(alarm_entry_id: str) -> int:
    """How many events of this alarm are currently active (not cleared)."""
    active_rows = _active_events_qs(alarm_entry_id)
    return len(active_rows)
0113 
0114 
def active_event_rows(alarm_entry_id: str) -> list:
    """Currently-active events of this alarm as raw Entry objects."""
    active_rows = _active_events_qs(alarm_entry_id)
    return active_rows
0118 
0119 
def active_events(alarm_entry_id: str) -> list[dict]:
    """Present-state view of this alarm: one dict per active event.

    Rows keep the order produced by ``_active_events_qs``.
    """
    def _as_row(entry) -> dict:
        payload = entry.data or {}
        fired_at = payload.get('fire_time')
        return {
            'id': entry.id,
            'subject': payload.get('subject') or entry.title or '?',
            'dedupe_key': payload.get('dedupe_key') or '',
            'fire_time': fired_at,
            'fire_time_dt': _ts_to_dt(fired_at),
            'last_seen': payload.get('last_seen'),
            'metric': _event_metric(payload),
        }

    return [_as_row(entry) for entry in _active_events_qs(alarm_entry_id)]
0136 
0137 
0138 def _event_metric(data: dict) -> str:
0139     """Trigger metric as a display string.
0140 
0141     Preferred: detection set an explicit `metric` key (formatted string).
0142     Fallback: derive from `computed_failurerate` for old rows that
0143     predate the metric field.
0144     """
0145     m = data.get('metric')
0146     if isinstance(m, str) and m:
0147         return m
0148     cfr = data.get('computed_failurerate')
0149     if isinstance(cfr, (int, float)):
0150         return f"{cfr*100:.1f}%"
0151     return ''
0152 
0153 
0154 def _ts_to_dt(ts):
0155     if ts is None or ts == '':
0156         return None
0157     try:
0158         return datetime.fromtimestamp(float(ts), tz=timezone.utc)
0159     except (TypeError, ValueError, OSError, OverflowError):
0160         return None
0161 
0162 
def events_for_task(alarm_entry_id: str, dedupe_key: str,
                    hours: int) -> list[dict]:
    """Events for one (alarm, dedupe_key) pair created in the last *hours*
    hours, newest first, each converted with ``_event_to_dict``."""
    window_start = time.time() - hours * 3600
    matching = Entry.objects.filter(
        context_id=CONTEXT_NAME,
        kind='event',
        data__entry_id=_event_entry_id_for(alarm_entry_id),
        data__dedupe_key=dedupe_key,
        archived=False,
        deleted_at__isnull=True,
        timestamp_created__gte=window_start,
    ).order_by('-timestamp_created')
    return [_event_to_dict(row) for row in matching]
0176 
0177 
def task_history_bins(alarm_entry_id: str, dedupe_key: str,
                      hours: int) -> list[dict]:
    """One row per engine tick in the last N hours: state of this (alarm,
    entity) at that tick.

    Each row is ``{'tick': float, 'state': str, 'run_id': ...}``, oldest
    tick first.

    state ∈ {'firing', 'clear', 'unknown'}:
      - 'firing' — at that tick, an event for this task had fire_time
        ≤ tick ≤ (clear_time or now), i.e. the alarm was active.
      - 'clear' — the tick ran cleanly and the task was not firing.
      - 'unknown' — the tick errored or didn't finish (no truth).

    Events whose fire_time or clear_time cannot be read as a number are
    ignored rather than aborting the whole history.
    """
    now = time.time()
    cutoff = now - hours * 3600
    event_entry_id = _event_entry_id_for(alarm_entry_id)

    def _as_float(value):
        # Best-effort conversion; None signals "not a usable number".
        try:
            return float(value)
        except (TypeError, ValueError):
            return None

    # All engine runs in window, oldest first.
    runs = (Entry.objects
            .filter(context_id=CONTEXT_NAME, kind='engine_run',
                    deleted_at__isnull=True,
                    timestamp_created__gte=cutoff)
            .order_by('timestamp_created'))

    # All events for this (alarm, entity) whose interval intersects the
    # window. An event is a closed interval [fire_time, clear_time|now];
    # it intersects [cutoff, now] unless clear_time < cutoff.
    evs = (Entry.objects
           .filter(context_id=CONTEXT_NAME, kind='event',
                   data__entry_id=event_entry_id,
                   data__dedupe_key=dedupe_key,
                   archived=False, deleted_at__isnull=True))
    intervals: list[tuple[float, float]] = []
    for e in evs:
        d = e.data or {}
        # A missing fire_time counts as 0, i.e. "fired before any tick".
        ft = _as_float(d.get('fire_time') or 0)
        ct = d.get('clear_time')
        ct_f = now if ct is None else _as_float(ct)
        if ft is None or ct_f is None:
            # Malformed timestamp: the interval is undefined, skip the row
            # instead of letting one bad event break the view.
            continue
        if ct_f < cutoff:
            continue
        intervals.append((ft, ct_f))

    bins: list[dict] = []
    for run in runs:
        rd = run.data or {}
        tick = _as_float(rd.get('started_at') or run.timestamp_created)
        if tick is None:
            # Unreadable started_at: fall back to the row's creation time.
            tick = float(run.timestamp_created)
        # Older runs stored the per-alarm summary under 'per_check'; apply
        # the same legacy-key rule as recent_runs() so their errors are
        # not missed.
        per_alarm = rd.get('per_alarm')
        if 'per_alarm' not in rd and 'per_check' in rd:
            per_alarm = rd['per_check']
        pa = (per_alarm or {}).get(alarm_entry_id) or {}
        errored = bool(pa.get('errors')) or (rd.get('finished_at') is None)
        firing = any(ft <= tick <= ct for ft, ct in intervals)
        if errored:
            state = 'unknown'
        elif firing:
            state = 'firing'
        else:
            state = 'clear'
        bins.append({
            'tick': tick,
            'state': state,
            'run_id': run.id,
        })
    return bins
0238 
0239 
def last_fired(alarm_entry_id: str):
    """Latest timestamp at which this alarm was observed firing.

    Scans every non-archived, non-deleted event of the alarm and takes
    the maximum over its ``last_seen``, ``clear_time`` and ``fire_time``
    values (those that parse as numbers) and the row's creation
    timestamp. Returns that maximum as a float, or None when nothing
    positive was found.
    """
    events = Entry.objects.filter(
        context_id=CONTEXT_NAME,
        kind='event',
        data__entry_id=_event_entry_id_for(alarm_entry_id),
        archived=False,
        deleted_at__isnull=True,
    )
    latest = 0.0
    for event in events:
        payload = event.data or {}
        for field in ('last_seen', 'clear_time', 'fire_time'):
            raw = payload.get(field)
            if raw is None:
                continue
            try:
                stamp = float(raw)
            except (TypeError, ValueError):
                # Unparseable value: ignore this field.
                continue
            latest = max(latest, stamp)
        created = event.timestamp_created
        if created and created > latest:
            latest = float(created)
    if latest > 0:
        return latest
    return None
0270 
0271 
def get_event(event_uuid: str) -> dict | None:
    """Look up one non-deleted event by id and return it as a dict
    (see ``_event_to_dict``), or None when no such event exists."""
    try:
        entry = Entry.objects.get(
            id=event_uuid,
            context_id=CONTEXT_NAME,
            kind='event',
            deleted_at__isnull=True,
        )
    except Entry.DoesNotExist:
        return None
    else:
        return _event_to_dict(entry)
0279 
0280 
def versions_for(entry_uuid: str, limit: int = 50) -> list[dict]:
    """Version history of one entry, highest version number first.

    At most *limit* versions are returned. Besides the stored fields each
    dict has a ``preview`` (the title, else the first content line cut to
    120 characters, else ``''``) and a ``line_count`` for the content.
    """
    history = (EntryVersion.objects
               .filter(entry_id=entry_uuid)
               .order_by('-version_num')[:limit])
    rows = []
    for version in history:
        body = version.content or ''
        if version.title:
            preview = version.title
        elif version.content:
            preview = body.splitlines()[0][:120]
        else:
            preview = ''
        rows.append({
            'id': version.id,
            'version_num': version.version_num,
            'title': version.title,
            'content': version.content,
            'data': version.data,
            'changed_by': version.changed_by,
            'timestamp': version.timestamp,
            'preview': preview,
            # Newline count, plus one whenever there is any content.
            'line_count': body.count('\n') + (1 if body else 0),
        })
    return rows
0296 
0297 
def recent_runs(limit: int = 20) -> list[dict]:
    """The newest *limit* engine runs as ``{'id', 'data'}`` dicts, newest
    first.

    ``data`` is a copy of the stored payload in which the legacy keys
    ``per_check`` / ``checks_run`` are mirrored to ``per_alarm`` /
    ``alarms_run`` when the modern key is absent, so templates read one
    shape.
    """
    legacy_aliases = (
        ('per_alarm', 'per_check'),
        ('alarms_run', 'checks_run'),
    )
    latest = Entry.objects.filter(
        context_id=CONTEXT_NAME,
        kind='engine_run',
        archived=False,
        deleted_at__isnull=True,
    ).order_by('-timestamp_created')[:limit]
    runs = []
    for entry in latest:
        payload = dict(entry.data or {})
        for modern, legacy in legacy_aliases:
            if modern not in payload and legacy in payload:
                payload[modern] = payload[legacy]
        runs.append({'id': entry.id, 'data': payload})
    return runs
0313 
0314 
def quiet_alarms(quiet_ticks: int = 3, history_ticks: int = 12) -> set[str]:
    """Alarm entry_ids that look suspiciously silent.

    Works on the newest `history_ticks` engine runs. An alarm is flagged
    when both hold:
      - it appears without errors in every one of the `quiet_ticks`
        newest runs and each of those saw zero detections, AND
      - its error-free runs within the whole window saw at least one
        detection in total.

    Runs where the alarm reported errors are left out of both tallies,
    so an error inside the newest `quiet_ticks` runs keeps the alarm
    from being flagged. Returns an empty set when fewer than
    `quiet_ticks` runs exist.

    Purely heuristic: an alarm with no earlier detections in the window
    is never flagged, so only recently-gone-silent cases are caught.
    """
    runs = recent_runs(limit=history_ticks)
    if len(runs) < quiet_ticks:
        return set()

    latest_counts: dict[str, list[int]] = {}
    window_totals: dict[str, int] = {}
    for position, run in enumerate(runs):
        per_alarm = run['data'].get('per_alarm') or {}
        for alarm_id, stats in per_alarm.items():
            stats = stats or {}
            if stats.get('errors'):
                # Errored run: contributes to neither tally.
                continue
            detections = int(stats.get('alarms_seen') or 0)
            if position < quiet_ticks:
                latest_counts.setdefault(alarm_id, []).append(detections)
            window_totals[alarm_id] = window_totals.get(alarm_id, 0) + detections

    return {
        alarm_id
        for alarm_id, counts in latest_counts.items()
        if len(counts) >= quiet_ticks
        and not any(counts)
        and window_totals.get(alarm_id, 0) > 0
    }
0351 
0352 
def engine_health() -> dict:
    """Traffic-light summary of the newest engine run for the dashboard
    header.

    Returns ``{'status', 'reasons'}`` and, when a run exists, also
    ``last_run`` (its data) and ``last_run_id``. Status is 'unknown' when
    the lookup fails or no run exists, 'warn' for an unfinished run,
    'bad' for a run that finished more than 15 minutes ago or reported
    errors, and 'ok' otherwise.
    """
    try:
        latest = (Entry.objects
                  .filter(context_id=CONTEXT_NAME, kind='engine_run',
                          deleted_at__isnull=True)
                  .order_by('-timestamp_created')
                  .first())
    except Exception:  # noqa: BLE001
        return {'status': 'unknown', 'reasons': ['DB not reachable.']}

    if latest is None:
        return {'status': 'unknown', 'reasons': ['Engine has never run.']}

    payload = latest.data or {}
    finished_at = payload.get('finished_at')
    status = 'ok'
    reasons: list[str] = []

    if finished_at:
        age = time.time() - float(finished_at)
        if age > 15 * 60:
            reasons.append(
                f'Engine stale: last run finished {int(age // 60)} min ago.')
            status = 'bad'
    else:
        reasons.append('Last engine run did not finish.')
        status = 'warn'

    # Errors override any earlier 'warn'.
    error_count = payload.get('errors')
    if error_count:
        reasons.append(f"Last run had {error_count} error(s).")
        status = 'bad'

    if not reasons:
        reasons.append('All checks healthy.')
    return {
        'status': status,
        'reasons': reasons,
        'last_run': payload,
        'last_run_id': latest.id,
    }
0386 
0387 
0388 # ── internal helpers ───────────────────────────────────────────────────────
0389 
0390 # ── teams ────────────────────────────────────────────────────────────────
0391 
def list_teams() -> list[dict]:
    """Every non-archived, non-deleted team in the 'teams' context,
    ordered by name.

    ``members`` is the team's content parsed into recipient tokens.
    """
    teams = Entry.objects.filter(
        context_id=TEAMS_CONTEXT,
        kind='team',
        archived=False,
        deleted_at__isnull=True,
    ).order_by('name')
    return [
        {
            'id': team.id,
            'name': team.name,                    # '@prodops'
            'title': team.title,
            'content': team.content,
            'members': _parse_recipient_tokens(team.content),
            'created': team.timestamp_created,
            'modified': team.timestamp_modified,
        }
        for team in teams
    ]
0410 
0411 
def get_team(at_name: str) -> Entry | None:
    """Fetch a team entry by its @name; the leading '@' is optional.

    Returns None for an empty name or when no non-archived, non-deleted
    team matches.
    """
    if not at_name:
        return None
    handle = at_name if at_name.startswith('@') else '@' + at_name
    candidates = Entry.objects.filter(
        context_id=TEAMS_CONTEXT,
        kind='team',
        name=handle,
        archived=False,
        deleted_at__isnull=True,
    )
    return candidates.first()
0422 
0423 
def get_team_by_id(entry_id: str) -> Entry | None:
    """Fetch a non-deleted team entry by its id, or None if there is no
    such team. Archived teams are not excluded."""
    try:
        team = Entry.objects.get(
            id=entry_id,
            context_id=TEAMS_CONTEXT,
            kind='team',
            deleted_at__isnull=True,
        )
    except Entry.DoesNotExist:
        return None
    else:
        return team
0430 
0431 
0432 # ── recipient parsing / expansion ────────────────────────────────────────
0433 
def _parse_recipient_tokens(raw) -> list[str]:
    """Split a string or list of strings into normalised recipient tokens.

    Tokens may be separated by commas, semicolons, any whitespace, or a
    mix of these. Blank tokens are dropped. Each token is either an email
    address or an @<teamname>.

    A list or tuple is parsed element by element (recursively). None
    gives ``[]``; any other value is parsed via ``str()``.

    Returns the tokens in the order given, deduped (case-insensitive
    match for emails; @names kept as-is).
    """
    if raw is None:
        return []
    if isinstance(raw, (list, tuple)):
        parts: list[str] = []
        for chunk in raw:
            parts.extend(_parse_recipient_tokens(chunk))
        return _dedup_preserve(parts)
    # String path: turn the punctuation separators into spaces, then let
    # str.split() handle every kind of whitespace. Splitting on ' ' alone
    # left tokens glued together by e.g. a non-breaking space or form
    # feed in the middle of pasted text.
    text = str(raw).replace(',', ' ').replace(';', ' ')
    return _dedup_preserve(text.split())
0456 
0457 
0458 def _dedup_preserve(seq: list[str]) -> list[str]:
0459     seen: set[str] = set()
0460     out: list[str] = []
0461     for t in seq:
0462         key = t.lower() if '@' in t and not t.startswith('@') else t
0463         if key in seen:
0464             continue
0465         seen.add(key)
0466         out.append(t)
0467     return out
0468 
0469 
def parse_recipients_input(text) -> list[str]:
    """Public wrapper for views/engine: normalise user-supplied recipient
    input into a list of tokens."""
    tokens = _parse_recipient_tokens(text)
    return tokens
0473 
0474 
def expand_recipients(tokens) -> tuple[list[str], list[str]]:
    """Expand @<team> tokens into their member emails.

    Returns (emails, unresolved). `emails` is the final dedup'd list of
    deliverable addresses. `unresolved` is a dedup'd list of @<team>
    tokens that didn't resolve — callers should log but not fail on
    those.

    A team token is unresolved when the team does not exist or its
    content yields no tokens (empty, null, or separators only). Teams
    are expanded one level only: an @<team> token found inside a team's
    content is reported in `unresolved` instead of being passed on as if
    it were an address.
    """
    emails: list[str] = []
    unresolved: list[str] = []
    for tok in _parse_recipient_tokens(tokens):
        if not tok.startswith('@'):
            emails.append(tok)
            continue
        team = get_team(tok)
        # The parser accepts None, so a team with null content is handled
        # like an empty one instead of raising on `.strip()`.
        members = _parse_recipient_tokens(team.content) if team is not None else []
        if not members:
            unresolved.append(tok)
            continue
        for member in members:
            if member.startswith('@'):
                # Nested team reference: not a deliverable address.
                unresolved.append(member)
            else:
                emails.append(member)
    return _dedup_preserve(emails), _dedup_preserve(unresolved)
0494 
0495 
0496 # ── internal helpers ─────────────────────────────────────────────────────
0497 
0498 def _event_entry_id_for(alarm_entry_id: str) -> str:
0499     if alarm_entry_id.startswith('alarm_'):
0500         return 'event_' + alarm_entry_id[len('alarm_'):]
0501     return 'event_' + alarm_entry_id
0502 
0503 
0504 _EVENT_INTERNAL_KEYS = {
0505     'entry_id', 'fire_time', 'clear_time', 'last_seen', 'last_notified',
0506     'dedupe_key', 'subject', 'recipients', 'alarm_config_id', 'severity',
0507 }
0508 
0509 
def _event_to_dict(e: Entry) -> dict:
    """Flatten an event entry into the dict shape the templates read.

    ``state`` is 'active' while ``clear_time`` is missing or null and
    'cleared' otherwise. ``context_data`` is the data payload without
    the plumbing keys listed in ``_EVENT_INTERNAL_KEYS`` (those already
    have dedicated rows at the top of the event-detail page).
    """
    payload = e.data or {}
    cleared_at = payload.get('clear_time')

    extras = {}
    for key, value in payload.items():
        if key in _EVENT_INTERNAL_KEYS:
            continue
        extras[key] = value

    return {
        'id': e.id,
        'title': e.title,
        'entry_id': payload.get('entry_id'),
        'subject': payload.get('subject', ''),
        'dedupe_key': payload.get('dedupe_key'),
        'fire_time': payload.get('fire_time'),
        'clear_time': cleared_at,
        'last_seen': payload.get('last_seen'),
        'state': 'cleared' if cleared_at is not None else 'active',
        'recipients': payload.get('recipients') or [],
        'content': e.content,
        'data': payload,
        'context_data': extras,
        'timestamp_created': e.timestamp_created,
        'timestamp_modified': e.timestamp_modified,
    }