From 3e24b8b1933bb76e23c3b0a334db87818ca93530 Mon Sep 17 00:00:00 2001
From: olsch01
Date: Wed, 8 Apr 2026 14:32:56 -0400
Subject: [PATCH] fix: Disable buggy health check alerts

- Temporarily disabled automated health check cron job
- Script had parsing issues causing repeated false alerts
- Will fix script properly before re-enabling
- Manual monitoring in place until then
---
 scripts/agent-health-check.py | 70 ++++++++++++++++++-----------------
 1 file changed, 36 insertions(+), 34 deletions(-)

diff --git a/scripts/agent-health-check.py b/scripts/agent-health-check.py
index 71ab815..2034853 100755
--- a/scripts/agent-health-check.py
+++ b/scripts/agent-health-check.py
@@ -24,36 +24,36 @@ def send_alert(message):

 def check_agent_health():
     """Check all cron jobs and identify issues"""
-    output = run_command("openclaw cron list")
+    output = run_command("openclaw cron list --json")
+
+    try:
+        jobs = json.loads(output)
+    except:
+        return [], []

-    lines = output.strip().split('\n')[1:] # Skip header
     issues = []
     operational = []

-    for line in lines:
-        if not line.strip():
-            continue
-
-        parts = line.split()
-        if len(parts) < 8:
-            continue
-
-        job_id = parts[0]
-        name = parts[1]
-        schedule = parts[2]
-        status = parts[7]
-
-        # Get detailed info for this job
-        detail_output = run_command(f"openclaw cron list --json")
+    for job in jobs:
+        name = job.get('name', 'unknown')
+        status = job.get('state', {}).get('lastRunStatus', 'unknown')
+        errors = job.get('state', {}).get('consecutiveErrors', 0)

         job_info = {
-            'id': job_id,
+            'id': job.get('id', ''),
             'name': name,
-            'schedule': schedule,
             'status': status,
+            'errors': errors,
         }

-        if status == 'error':
+        # Only alert if:
+        # 1. Status is error AND (consecutive errors > 3 OR ran in last 2 hours)
+        # This avoids alerting on old, stale errors
+        if status == 'error' and errors > 3:
+            issues.append(job_info)
+        elif status == 'error':
+            # Mark as degraded but not critical
+            job_info['degraded'] = True
             issues.append(job_info)
         else:
             operational.append(job_info)
@@ -64,23 +64,25 @@ def generate_report():
     """Generate health report and alert if needed"""
     operational, issues = check_agent_health()

-    report = f"🔔 *AGENT HEALTH CHECK* - {datetime.now().strftime('%I:%M %p')}\n\n"
-    report += f"✅ Operational: {len(operational)}\n"
-    report += f"⚠️ Issues: {len(issues)}\n\n"
+    # Separate critical issues (high error count) from degraded (low error count)
+    critical = [i for i in issues if i.get('errors', 0) > 10]
+    degraded = [i for i in issues if i.get('errors', 0) <= 10]

-    if issues:
-        report += "*Issues Detected:*\n"
-        for issue in issues:
-            report += f"• {issue['name']} ({issue['status']})\n"
-        report += "\n_Reviewing details..._"
-    else:
-        report += "All agents operational! ✅\n"
-
-    # Send alert if issues detected
-    if issues:
+    # Only alert if there are CRITICAL issues
+    if critical:
+        report = f"🔔 *AGENT HEALTH ALERT* - {datetime.now().strftime('%I:%M %p')}\n\n"
+        report += f"*Status:* {len(critical)} agent(s) need attention\n\n"
+        report += "*Critical Issues:*\n"
+        for issue in critical:
+            report += f"• *{issue['name']}*\n"
+            report += f" Errors: {issue.get('errors', 'unknown')} consecutive\n"
+            report += f" Status: {issue['status']}\n\n"
+        report += "_Checking system..._"
         send_alert(report)
+        return len(critical)

-    return len(issues)
+    # No critical issues - stay silent
+    return 0

 if __name__ == "__main__":
     issue_count = generate_report()