HOALedgerIQ_Website/scripts/agent-health-check.py

#!/usr/bin/env python3
"""
Agent Health Check - Proactive Monitoring
Runs every 4 hours to detect agent issues before they impact business
"""
import json
import subprocess
import sys
from datetime import datetime
from pathlib import Path

# Destination passed as `--target` to `openclaw message send` when an alert is sent.
TELEGRAM_TARGET = "telegram:8269921691"
# NOTE(review): not referenced by any code visible in this file; jobs are
# flagged as soon as their status column reads 'error'. Confirm whether a
# consecutive-error threshold was meant to be applied.
ERROR_THRESHOLD = 3  # Alert if consecutive errors > this

def run_command(cmd):
    """Execute *cmd* through the shell and return the text it wrote to stdout.

    stderr and the exit status are captured but not inspected, so a command
    that fails simply yields whatever stdout it produced (possibly "").
    """
    completed = subprocess.run(
        cmd,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    return completed.stdout

def send_alert(message):
    """Send *message* to the configured Telegram target via the `openclaw` CLI.

    The command is passed as an argument list with no shell involved, so
    quotes, `$`, backticks, backslashes and newlines inside *message* reach
    the CLI verbatim instead of being interpreted (or breaking the command
    line) as they would inside a double-quoted shell string.

    Delivery is best-effort: a failure is reported on stderr and never raised,
    so a broken notifier cannot crash the health check itself.

    Args:
        message: Text of the alert to deliver.

    Returns:
        None.
    """
    argv = [
        "openclaw", "message", "send",
        "--channel", "telegram",
        "--target", TELEGRAM_TARGET,
        "--message", message,
    ]
    try:
        result = subprocess.run(argv, capture_output=True, text=True)
    except OSError as exc:
        # Without a shell, a missing or non-executable binary raises here
        # rather than producing a non-zero exit status.
        print(f"send_alert: unable to run openclaw: {exc}", file=sys.stderr)
        return

    if result.returncode != 0:
        print(
            f"send_alert: openclaw exited with status {result.returncode}: "
            f"{result.stderr.strip()}",
            file=sys.stderr,
        )

def check_agent_health():
    """Classify every job listed by `openclaw cron list` as healthy or failing.

    The CLI's tabular output is parsed positionally: after skipping the header
    line, each row is split on whitespace and columns 1, 2, 3 and 8 are taken
    as the job id, name, schedule and status.

    Returns:
        A ``(operational, issues)`` tuple of lists. Each element is a dict
        with the keys ``'id'``, ``'name'``, ``'schedule'`` and ``'status'``.
        ``issues`` holds the jobs whose status is exactly ``'error'``;
        every other parsed job goes into ``operational``.
    """
    # NOTE(review): run_command returns only stdout, so an empty listing here
    # may mean "no jobs" or "the CLI call failed" -- confirm whether that case
    # should be treated as an issue.
    output = run_command("openclaw cron list")

    operational = []
    issues = []

    # The first line of the listing is the column header.
    for line in output.strip().split('\n')[1:]:
        parts = line.split()
        # Status is read from the 8th column; blank or truncated rows are skipped.
        # NOTE(review): a job name containing whitespace would shift the
        # columns -- confirm against the CLI's actual output format.
        if len(parts) < 8:
            continue

        job_info = {
            'id': parts[0],
            'name': parts[1],
            'schedule': parts[2],
            'status': parts[7],
        }

        if job_info['status'] == 'error':
            issues.append(job_info)
        else:
            operational.append(job_info)

    return operational, issues

def generate_report():
    """Build the health summary, send it as an alert if any job is failing.

    Returns:
        The number of jobs currently reported with issues.
    """
    healthy, failing = check_agent_health()

    timestamp = datetime.now().strftime('%I:%M %p')
    sections = [
        f"🔔 *AGENT HEALTH CHECK* - {timestamp}\n\n",
        f"✅ Operational: {len(healthy)}\n",
        f"⚠️ Issues: {len(failing)}\n\n",
    ]

    if failing:
        sections.append("*Issues Detected:*\n")
        sections.extend(f"• {job['name']} ({job['status']})\n" for job in failing)
        sections.append("\n_Reviewing details..._")
        # Only a report that contains issues is sent as an alert.
        send_alert("".join(sections))
    else:
        # The all-clear text is assembled but nothing is sent in this case.
        sections.append("All agents operational! ✅\n")

    return len(failing)

if __name__ == "__main__":
    # Exit status mirrors the check: 0 when no job has issues, 1 otherwise.
    sys.exit(1 if generate_report() != 0 else 0)