- Temporarily disabled automated health check cron job - Script had parsing issues causing repeated false alerts - Will fix script properly before re-enabling - Manual monitoring in place until then
90 lines
2.9 KiB
Python
Executable File
90 lines
2.9 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Agent Health Check - Proactive Monitoring
|
|
Runs every 4 hours to detect agent issues before they impact business
|
|
"""
|
|
import json
|
|
import subprocess
|
|
import sys
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
# Recipient handed to `openclaw message send --target` when an alert goes out.
TELEGRAM_TARGET = "telegram:8269921691"
|
|
# Consecutive-error count that a failing job must exceed to warrant an alert.
ERROR_THRESHOLD = 3
|
|
|
|
def run_command(cmd):
    """Execute *cmd* through the shell and return the text it wrote to stdout.

    stderr is captured and thrown away, and the exit status is not inspected,
    so a failing command simply yields whatever (possibly empty) stdout it
    produced.
    """
    completed = subprocess.run(
        cmd,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    return completed.stdout
|
|
|
|
def send_alert(message):
    """Send *message* to ``TELEGRAM_TARGET`` through the ``openclaw`` CLI.

    The command is built as an argument list and run without a shell, so the
    message reaches the CLI verbatim: quotes, ``$``, backticks, ``*`` and
    newlines in the text (for example inside a job name) can neither break
    the command line nor be interpreted by a shell.

    Delivery is best-effort. The CLI's output and exit status are ignored,
    and a failure to launch the CLI is reported on stderr instead of being
    raised.

    Args:
        message: Alert text; converted with ``str()`` before sending.
    """
    argv = [
        "openclaw", "message", "send",
        "--channel", "telegram",
        "--target", TELEGRAM_TARGET,
        "--message", str(message),
    ]
    try:
        subprocess.run(argv, capture_output=True)
    except (OSError, ValueError) as exc:
        # OSError: the executable is missing or not runnable.
        # ValueError: the message contains an embedded NUL byte.
        print(f"send_alert: could not run openclaw: {exc}", file=sys.stderr)
|
|
|
|
def check_agent_health():
    """Classify every cron job reported by ``openclaw cron list --json``.

    Returns:
        An ``(operational, issues)`` tuple of lists. Every element is a dict
        with the keys ``id``, ``name``, ``status`` and ``errors``.

        * Jobs whose last run status is ``'error'`` are placed in ``issues``.
          Those with at most ``ERROR_THRESHOLD`` consecutive errors also
          carry ``'degraded': True``.
        * All other jobs are placed in ``operational``.

        When the CLI output cannot be interpreted, a warning is written to
        stderr and both lists are empty.
    """
    output = run_command("openclaw cron list --json")

    try:
        payload = json.loads(output)
    except (ValueError, TypeError) as exc:
        # json.JSONDecodeError is a ValueError; an empty string (CLI missing
        # or failed) also lands here.
        print(f"check_agent_health: cannot parse cron list output: {exc}",
              file=sys.stderr)
        return [], []

    # NOTE(review): the JSON shape emitted by the CLI is not visible from
    # this file. A bare list is what the loop below always expected; an
    # object wrapping the list under "jobs" is accepted as well -- confirm
    # against the CLI.
    if isinstance(payload, dict):
        payload = payload.get('jobs')
    if not isinstance(payload, list):
        print("check_agent_health: cron list output is not a list of jobs",
              file=sys.stderr)
        return [], []

    issues = []
    operational = []

    for job in payload:
        if not isinstance(job, dict):
            # Skip malformed entries instead of crashing the whole check.
            continue

        state = job.get('state')
        if not isinstance(state, dict):
            state = {}

        status = state.get('lastRunStatus', 'unknown')
        errors = state.get('consecutiveErrors', 0)
        if not isinstance(errors, (int, float)):
            # A null or non-numeric count would break the comparison below.
            errors = 0

        job_info = {
            'id': job.get('id', ''),
            'name': job.get('name', 'unknown'),
            'status': status,
            'errors': errors,
        }

        if status != 'error':
            operational.append(job_info)
            continue

        # Every failing job is reported as an issue. One that has not yet
        # exceeded the threshold is flagged as degraded rather than critical.
        if errors <= ERROR_THRESHOLD:
            job_info['degraded'] = True
        issues.append(job_info)

    return operational, issues
|
|
|
|
def generate_report(*, critical_threshold=10):
    """Run the health check and send one alert when critical issues exist.

    An issue is critical when its consecutive error count is greater than
    ``critical_threshold``. Issues at or below that count are not reported,
    and nothing is sent when there are no critical issues.

    Args:
        critical_threshold: Consecutive-error count an issue must exceed to
            be included in the alert. Defaults to 10.

    Returns:
        The number of critical issues included in the alert, or 0 when no
        alert was sent.
    """
    _operational, issues = check_agent_health()

    critical = [
        issue for issue in issues
        if issue.get('errors', 0) > critical_threshold
    ]

    # No critical issues - stay silent
    if not critical:
        return 0

    timestamp = datetime.now().strftime('%I:%M %p')
    parts = [
        f"🔔 *AGENT HEALTH ALERT* - {timestamp}\n\n",
        f"*Status:* {len(critical)} agent(s) need attention\n\n",
        "*Critical Issues:*\n",
    ]
    for issue in critical:
        parts.extend((
            f"• *{issue['name']}*\n",
            f" Errors: {issue.get('errors', 'unknown')} consecutive\n",
            f" Status: {issue['status']}\n\n",
        ))
    parts.append("_Checking system..._")

    send_alert("".join(parts))
    return len(critical)
|
|
|
|
if __name__ == "__main__":
    # Exit status 0 when no critical issue was reported, 1 otherwise.
    sys.exit(1 if generate_report() != 0 else 0)
|