Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-27 07:41:45

#!/bin/bash
# check-testbed.sh - Bootstrap and verify testbed infrastructure
#
# Ensures agent manager and supervisord are healthy.
# Starts agent manager if not running.
# Reports all problems loudly.
#
# Environment:
#   TESTBED_DIR  Testbed directory to check (optional; defaults to
#                /data/wenauseic/github/swf-testbed).
#
# Exit status: the value of STATUS, i.e. 0 when every check passed and
# 1 when a check reported a problem.

set -euo pipefail

# Testbed root. Taken from the environment when set, so the script is not tied
# to one host's directory layout; otherwise the original default applies.
TESTBED_DIR="${TESTBED_DIR:-/data/wenauseic/github/swf-testbed}"
# supervisord configuration file, looked up inside TESTBED_DIR.
AGENTS_CONF="agents.supervisord.conf"
# Directory holding the testbed's activate script and supervisorctl.
VENV="$TESTBED_DIR/.venv/bin"
SUPERVISORCTL="$VENV/supervisorctl"
# Overall result: stays 0 unless a check reports a problem.
STATUS=0

echo "=== Testbed Infrastructure Check ==="
echo ""
0017 
# --- Agent Manager ---
# Looks for a "testbed agent-manager" process owned by the current user. When
# none is found, launches one in the background and polls until it shows up.
# Sets STATUS=1 if it still cannot be found once polling gives up.
echo "--- Agent Manager ---"

# pgrep exits non-zero when nothing matches; "|| true" keeps that from counting
# as a failure and simply leaves AM_PID empty.
AM_PID=$(pgrep -f "testbed agent-manager" -u "$(whoami)" 2>/dev/null || true)
# Several matches come back one per line; join them so messages stay on one line.
AM_PID=${AM_PID//$'\n'/ }

if [ -n "$AM_PID" ]; then
    echo "RUNNING (PID $AM_PID)"
else
    echo "NOT RUNNING - starting..."
    cd "$TESTBED_DIR"
    # shellcheck source=/dev/null
    source "$VENV/activate"
    # ~/.env is optional: failure to source it is deliberately ignored.
    # shellcheck source=/dev/null
    source ~/.env 2>/dev/null || true
    # Run detached in the background; stdout and stderr go to the log that the
    # polling loop below tails.
    nohup testbed agent-manager > /tmp/agent-manager.log 2>&1 &

    AM_POLL_SECONDS=5    # pause before each process lookup
    AM_POLL_ATTEMPTS=6   # lookups before giving up (6 x 5s = 30s)

    STARTED=false
    for ((i = 1; i <= AM_POLL_ATTEMPTS; i++)); do
        sleep "$AM_POLL_SECONDS"
        # Real time waited so far; derived from the interval so the messages
        # cannot drift from the actual sleep duration.
        ELAPSED=$((i * AM_POLL_SECONDS))
        AM_PID=$(pgrep -f "testbed agent-manager" -u "$(whoami)" 2>/dev/null || true)
        AM_PID=${AM_PID//$'\n'/ }
        if [ -n "$AM_PID" ]; then
            echo "STARTED (PID $AM_PID) after ${ELAPSED}s"
            STARTED=true
            break
        fi
        echo "  ...waiting (${ELAPSED}s elapsed)"
        # Show the latest log lines while waiting; the log may not exist yet.
        tail -n 3 /tmp/agent-manager.log 2>/dev/null || true
    done

    if [ "$STARTED" = false ]; then
        echo "ERROR: Agent manager failed to start after $((AM_POLL_ATTEMPTS * AM_POLL_SECONDS))s"
        echo "Log output:"
        tail -n 20 /tmp/agent-manager.log 2>/dev/null || echo "(no log)"
        STATUS=1
    fi
fi

echo ""
0053 
# --- Supervisord ---
# Queries supervisorctl for agent status. If supervisord is not answering but
# a matching supervisord process still exists, that stale process is killed.
# An unusable supervisorctl, or a stale process that survives, sets STATUS=1.
echo "--- Supervisord ---"

# Map supervisorctl's exit status ($1) and its combined stdout/stderr ($2) to
# the SV_EXIT code used below. Prints one of:
#   0   - supervisorctl produced an answer
#   4   - control socket missing or connection refused
#   127 - supervisorctl itself could not be executed
classify_supervisorctl() {
    local rc=$1 output=$2
    # 126 and 127 are the shell's own "cannot execute" and "command not found"
    # statuses: supervisorctl never ran, so the captured text is a shell error.
    if [ "$rc" -eq 126 ] || [ "$rc" -eq 127 ]; then
        echo 127
    # Pattern matching instead of "echo | grep -q": under pipefail that
    # pipeline can report failure when grep exits early and echo gets SIGPIPE.
    elif [[ "$output" == *"no such file"* || "$output" == *"refused"* ]]; then
        echo 4
    else
        # Any other answer, including stopped agents, counts as reachable.
        echo 0
    fi
}

# Print, space-separated, those of the given PIDs that still exist.
# kill -0 only probes the process; it delivers no signal.
alive_pids() {
    local pid alive=""
    for pid in "$@"; do
        if kill -0 "$pid" 2>/dev/null; then
            alive="${alive:+$alive }$pid"
        fi
    done
    printf '%s' "$alive"
}

# Keep the real exit status: it is needed to tell "supervisorctl could not
# run" apart from "supervisorctl ran and reported something".
SV_RC=0
SV_OUTPUT=$("$SUPERVISORCTL" -c "$TESTBED_DIR/$AGENTS_CONF" status 2>&1) || SV_RC=$?
SV_EXIT=$(classify_supervisorctl "$SV_RC" "$SV_OUTPUT")

if [ "$SV_EXIT" -eq 0 ]; then
    echo "REACHABLE"
    echo "$SV_OUTPUT"
elif [ "$SV_EXIT" -eq 4 ]; then
    STALE_PID=$(pgrep -f "supervisord.*$AGENTS_CONF" -u "$(whoami)" 2>/dev/null || true)
    # pgrep prints one PID per line; flatten so the list can be printed on one
    # line and word-split into individual PIDs below.
    STALE_PID=${STALE_PID//$'\n'/ }
    if [ -n "$STALE_PID" ]; then
        echo "Stale process found (PID $STALE_PID) - killing..."
        # Word splitting of the PID lists below is intentional: one PID per word.
        for pid in $STALE_PID; do
            kill "$pid" 2>/dev/null || true
        done
        sleep 2
        # shellcheck disable=SC2086
        SURVIVORS=$(alive_pids $STALE_PID)
        if [ -n "$SURVIVORS" ]; then
            echo "SIGTERM didn't work, sending SIGKILL..."
            for pid in $SURVIVORS; do
                kill -9 "$pid" 2>/dev/null || true
            done
            sleep 1
            # shellcheck disable=SC2086
            SURVIVORS=$(alive_pids $SURVIVORS)
        fi
        if [ -n "$SURVIVORS" ]; then
            echo "ERROR: Failed to kill stale supervisord (PID $SURVIVORS)"
            STATUS=1
        else
            echo "Killed. Supervisord will be started fresh when testbed starts."
        fi
    else
        echo "NOT RUNNING (normal when testbed is stopped)"
    fi
else
    # supervisorctl is missing or not executable: report it instead of
    # presenting the shell's error text as a healthy status.
    echo "ERROR: Cannot execute supervisorctl ($SUPERVISORCTL)"
    echo "$SV_OUTPUT"
    STATUS=1
fi

echo ""
0093 
# --- Refresh heartbeat ---
# After any fixes, signal agent manager to send immediate heartbeat
# so MCP status reflects the current verified state.
# Skipped when no agent manager process is found or when STATUS is non-zero.
AM_PID=$(pgrep -f "testbed agent-manager" -u "$(whoami)" 2>/dev/null || true)
if [ -n "$AM_PID" ] && [ "$STATUS" -eq 0 ]; then
    # pgrep prints one PID per line. Signal each PID on its own: a single
    # multi-line argument is rejected by kill, and that error is suppressed
    # here, so no process would ever receive the signal.
    # Word splitting of $AM_PID is intentional.
    for am_pid in $AM_PID; do
        kill -USR1 "$am_pid" 2>/dev/null || true
    done
    # NOTE(review): presumably gives the agent manager time to act on the
    # signal before the summary is printed - confirm.
    sleep 2
fi
0102 
# --- Summary ---
# Print the final verdict, then hand STATUS back to the caller as the exit code.
printf '%s\n' "--- Summary ---"
if ! [ "$STATUS" -eq 0 ]; then
    printf '%s\n' "PROBLEMS DETECTED. Fix issues above before proceeding."
else
    printf '%s\n' "Infrastructure OK. Ready for MCP operations."
fi

exit $STATUS