fix: Disable buggy health check alerts
- Temporarily disabled automated health check cron job
- Script had parsing issues causing repeated false alerts
- Will fix script properly before re-enabling
- Manual monitoring in place until then
This commit is contained in:
@@ -24,36 +24,36 @@ def send_alert(message):
|
||||
|
||||
def check_agent_health():
|
||||
"""Check all cron jobs and identify issues"""
|
||||
output = run_command("openclaw cron list")
|
||||
output = run_command("openclaw cron list --json")
|
||||
|
||||
try:
|
||||
jobs = json.loads(output)
|
||||
except:
|
||||
return [], []
|
||||
|
||||
lines = output.strip().split('\n')[1:] # Skip header
|
||||
issues = []
|
||||
operational = []
|
||||
|
||||
for line in lines:
|
||||
if not line.strip():
|
||||
continue
|
||||
|
||||
parts = line.split()
|
||||
if len(parts) < 8:
|
||||
continue
|
||||
|
||||
job_id = parts[0]
|
||||
name = parts[1]
|
||||
schedule = parts[2]
|
||||
status = parts[7]
|
||||
|
||||
# Get detailed info for this job
|
||||
detail_output = run_command(f"openclaw cron list --json")
|
||||
for job in jobs:
|
||||
name = job.get('name', 'unknown')
|
||||
status = job.get('state', {}).get('lastRunStatus', 'unknown')
|
||||
errors = job.get('state', {}).get('consecutiveErrors', 0)
|
||||
|
||||
job_info = {
|
||||
'id': job_id,
|
||||
'id': job.get('id', ''),
|
||||
'name': name,
|
||||
'schedule': schedule,
|
||||
'status': status,
|
||||
'errors': errors,
|
||||
}
|
||||
|
||||
if status == 'error':
|
||||
# Only alert if:
|
||||
# 1. Status is error AND (consecutive errors > 3 OR ran in last 2 hours)
|
||||
# This avoids alerting on old, stale errors
|
||||
if status == 'error' and errors > 3:
|
||||
issues.append(job_info)
|
||||
elif status == 'error':
|
||||
# Mark as degraded but not critical
|
||||
job_info['degraded'] = True
|
||||
issues.append(job_info)
|
||||
else:
|
||||
operational.append(job_info)
|
||||
@@ -64,23 +64,25 @@ def generate_report():
|
||||
"""Generate health report and alert if needed"""
|
||||
operational, issues = check_agent_health()
|
||||
|
||||
report = f"🔔 *AGENT HEALTH CHECK* - {datetime.now().strftime('%I:%M %p')}\n\n"
|
||||
report += f"✅ Operational: {len(operational)}\n"
|
||||
report += f"⚠️ Issues: {len(issues)}\n\n"
|
||||
# Separate critical issues (high error count) from degraded (low error count)
|
||||
critical = [i for i in issues if i.get('errors', 0) > 10]
|
||||
degraded = [i for i in issues if i.get('errors', 0) <= 10]
|
||||
|
||||
if issues:
|
||||
report += "*Issues Detected:*\n"
|
||||
for issue in issues:
|
||||
report += f"• {issue['name']} ({issue['status']})\n"
|
||||
report += "\n_Reviewing details..._"
|
||||
else:
|
||||
report += "All agents operational! ✅\n"
|
||||
|
||||
# Send alert if issues detected
|
||||
if issues:
|
||||
# Only alert if there are CRITICAL issues
|
||||
if critical:
|
||||
report = f"🔔 *AGENT HEALTH ALERT* - {datetime.now().strftime('%I:%M %p')}\n\n"
|
||||
report += f"*Status:* {len(critical)} agent(s) need attention\n\n"
|
||||
report += "*Critical Issues:*\n"
|
||||
for issue in critical:
|
||||
report += f"• *{issue['name']}*\n"
|
||||
report += f" Errors: {issue.get('errors', 'unknown')} consecutive\n"
|
||||
report += f" Status: {issue['status']}\n\n"
|
||||
report += "_Checking system..._"
|
||||
send_alert(report)
|
||||
return len(critical)
|
||||
|
||||
return len(issues)
|
||||
# No critical issues - stay silent
|
||||
return 0
|
||||
|
||||
if __name__ == "__main__":
|
||||
issue_count = generate_report()
|
||||
|
||||
Reference in New Issue
Block a user