# HOALedgerIQ_Website/scripts/agent-health-check.py

#!/usr/bin/env python3
"""
Agent Health Check - Proactive Monitoring
Runs every 4 hours to detect agent issues before they impact business
"""
import json
import subprocess
import sys
from datetime import datetime
from pathlib import Path

# Value passed as --target to `openclaw message send` when an alert is sent.
TELEGRAM_TARGET = "telegram:8269921691"
# Consecutive-error cutoff for flagging a failing job.
# NOTE(review): confirm that the checks in this file read this constant
# rather than repeating the number as a literal.
ERROR_THRESHOLD = 3  # Alert if consecutive errors > this

def run_command(cmd):
    """Execute *cmd* through the shell and return its captured stdout as text.

    stderr is captured but discarded, and the exit status is not inspected,
    so a failing command simply yields whatever it wrote to stdout.
    """
    completed = subprocess.run(
        cmd,
        shell=True,
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    return completed.stdout

def send_alert(message):
    """Send *message* to TELEGRAM_TARGET through the ``openclaw`` CLI.

    The command is executed from an argument list without a shell, so quotes,
    ``$``, backticks and other shell metacharacters inside the message are
    delivered literally instead of being interpreted by the shell or
    breaking the command line.

    Delivery is best-effort: a failure is reported on stderr and never raised.

    Args:
        message: Text of the alert; converted with ``str()`` before sending.

    Returns:
        None.
    """
    argv = [
        "openclaw", "message", "send",
        "--channel", "telegram",
        "--target", TELEGRAM_TARGET,
        "--message", str(message),
    ]
    try:
        result = subprocess.run(argv, capture_output=True, text=True)
    except OSError as exc:
        # e.g. the openclaw binary is missing or not executable.
        print(f"send_alert: could not run openclaw: {exc}", file=sys.stderr)
        return

    if result.returncode != 0:
        detail = (result.stderr or "").strip()
        print(
            f"send_alert: openclaw exited with status {result.returncode}: {detail}",
            file=sys.stderr,
        )

def check_agent_health(raw_output=None, error_threshold=None):
    """Classify cron jobs as operational or failing.

    Args:
        raw_output: JSON text describing the jobs. When None (the default) it
            is obtained by running ``openclaw cron list --json``.
        error_threshold: A job whose last run errored with more consecutive
            errors than this is recorded without the ``degraded`` flag; an
            errored job at or below it is recorded with ``degraded=True``.
            Defaults to the module-level ERROR_THRESHOLD.

    Returns:
        A tuple ``(operational, issues)`` of lists of dicts with the keys
        ``id``, ``name``, ``status`` and ``errors`` (plus ``degraded`` on
        low-error-count issues). Both lists are empty when the job list
        cannot be parsed; that condition is reported on stderr.
    """
    if raw_output is None:
        raw_output = run_command("openclaw cron list --json")
    if error_threshold is None:
        error_threshold = ERROR_THRESHOLD

    try:
        jobs = json.loads(raw_output)
    except (TypeError, ValueError) as exc:
        # json.JSONDecodeError is a ValueError. Report the failure instead of
        # silently pretending every agent is healthy.
        print(f"check_agent_health: cannot parse cron list: {exc}", file=sys.stderr)
        return [], []

    # NOTE(review): tolerate a {"jobs": [...]} wrapper in case the CLI emits
    # one - confirm the actual output shape of `openclaw cron list --json`.
    if isinstance(jobs, dict):
        jobs = jobs.get('jobs')
    if not isinstance(jobs, list):
        print("check_agent_health: cron list is not a JSON array", file=sys.stderr)
        return [], []

    operational = []
    issues = []

    for job in jobs:
        if not isinstance(job, dict):
            continue  # skip malformed entries rather than crash the whole check

        state = job.get('state')
        if not isinstance(state, dict):
            state = {}  # covers a missing or null "state"

        status = state.get('lastRunStatus', 'unknown')
        # A null counter is treated as zero so later comparisons cannot raise.
        errors = state.get('consecutiveErrors') or 0

        job_info = {
            'id': job.get('id', ''),
            'name': job.get('name', 'unknown'),
            'status': status,
            'errors': errors,
        }

        if status != 'error':
            operational.append(job_info)
            continue

        # Every errored job is an issue; those at or below the threshold are
        # additionally marked as degraded (not critical).
        if errors <= error_threshold:
            job_info['degraded'] = True
        issues.append(job_info)

    return operational, issues

def generate_report(critical_threshold=10):
    """Send a Telegram alert when any job is in a critical state.

    A job counts as critical when it appears in the issues returned by
    check_agent_health() and its consecutive error count is greater than
    *critical_threshold*. Issues at or below the threshold produce no alert.

    Args:
        critical_threshold: Consecutive-error count a failing job must exceed
            to be included in the alert. Defaults to 10.

    Returns:
        The number of critical jobs reported (0 when no alert was sent).
    """
    _operational, issues = check_agent_health()

    # A missing or null error count is treated as zero so the comparison
    # cannot raise.
    critical = [
        issue for issue in issues
        if (issue.get('errors') or 0) > critical_threshold
    ]

    # No critical issues - stay silent.
    if not critical:
        return 0

    timestamp = datetime.now().strftime('%I:%M %p')
    parts = [
        f"🔔 *AGENT HEALTH ALERT* - {timestamp}\n\n",
        f"*Status:* {len(critical)} agent(s) need attention\n\n",
        "*Critical Issues:*\n",
    ]
    for issue in critical:
        parts.append(f"• *{issue['name']}*\n")
        parts.append(f"  Errors: {issue.get('errors', 'unknown')} consecutive\n")
        parts.append(f"  Status: {issue['status']}\n\n")
    parts.append("_Checking system..._")

    send_alert("".join(parts))
    return len(critical)

if __name__ == "__main__":
    # Exit status is 1 when generate_report() reported any critical issue,
    # and 0 otherwise.
    critical_count = generate_report()
    exit_code = 1 if critical_count != 0 else 0
    sys.exit(exit_code)