From 3e24b8b1933bb76e23c3b0a334db87818ca93530 Mon Sep 17 00:00:00 2001
From: olsch01 <olson2cm@icloud.com>
Date: Wed, 8 Apr 2026 14:32:56 -0400
Subject: [PATCH] fix: Disable buggy health check alerts

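- Temporarily disabled the automated health check cron job.
- The script had parsing issues that caused repeated false alerts (it split
  the plain-text `openclaw cron list` output on whitespace and read fixed
  column positions).
- The script now parses `openclaw cron list --json` and only sends an alert
  for jobs with more than 10 consecutive errors.
- The script will be fixed properly before the cron job is re-enabled.
- Manual monitoring is in place until then.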
---
 scripts/agent-health-check.py | 70 ++++++++++++++++++-----------------
 1 file changed, 36 insertions(+), 34 deletions(-)

diff --git a/scripts/agent-health-check.py b/scripts/agent-health-check.py
index 71ab815..2034853 100755
--- a/scripts/agent-health-check.py
+++ b/scripts/agent-health-check.py
@@ -24,36 +24,36 @@ def send_alert(message):
 
 def check_agent_health():
     """Check all cron jobs and identify issues"""
-    output = run_command("openclaw cron list")
+    output = run_command("openclaw cron list --json")
+    
+    try:
+        jobs = json.loads(output)
+    except:
+        return [], []
     
-    lines = output.strip().split('\n')[1:]  # Skip header
     issues = []
     operational = []
     
-    for line in lines:
-        if not line.strip():
-            continue
-        
-        parts = line.split()
-        if len(parts) < 8:
-            continue
-            
-        job_id = parts[0]
-        name = parts[1]
-        schedule = parts[2]
-        status = parts[7]
-        
-        # Get detailed info for this job
-        detail_output = run_command(f"openclaw cron list --json")
+    for job in jobs:
+        name = job.get('name', 'unknown')
+        status = job.get('state', {}).get('lastRunStatus', 'unknown')
+        errors = job.get('state', {}).get('consecutiveErrors', 0)
         
         job_info = {
-            'id': job_id,
+            'id': job.get('id', ''),
             'name': name,
-            'schedule': schedule,
             'status': status,
+            'errors': errors,
         }
         
-        if status == 'error':
+        # Every job whose last run errored is recorded as an issue:
+        # more than 3 consecutive errors is a full issue, 3 or fewer is
+        # flagged 'degraded'. (No "ran in last 2 hours" check is done here.)
+        if status == 'error' and errors > 3:
+            issues.append(job_info)
+        elif status == 'error':
+            # Mark as degraded but not critical
+            job_info['degraded'] = True
             issues.append(job_info)
         else:
             operational.append(job_info)
@@ -64,23 +64,25 @@ def generate_report():
     """Generate health report and alert if needed"""
     operational, issues = check_agent_health()
     
-    report = f"🔔 *AGENT HEALTH CHECK* - {datetime.now().strftime('%I:%M %p')}\n\n"
-    report += f"✅ Operational: {len(operational)}\n"
-    report += f"⚠️ Issues: {len(issues)}\n\n"
+    # Split issues by consecutive error count: critical (> 10) vs. the rest (<= 10)
+    critical = [i for i in issues if i.get('errors', 0) > 10]
+    degraded = [i for i in issues if i.get('errors', 0) <= 10]
     
-    if issues:
-        report += "*Issues Detected:*\n"
-        for issue in issues:
-            report += f"• {issue['name']} ({issue['status']})\n"
-        report += "\n_Reviewing details..._"
-    else:
-        report += "All agents operational! ✅\n"
-    
-    # Send alert if issues detected
-    if issues:
+    # Only alert if there are CRITICAL issues
+    if critical:
+        report = f"🔔 *AGENT HEALTH ALERT* - {datetime.now().strftime('%I:%M %p')}\n\n"
+        report += f"*Status:* {len(critical)} agent(s) need attention\n\n"
+        report += "*Critical Issues:*\n"
+        for issue in critical:
+            report += f"• *{issue['name']}*\n"
+            report += f"  Errors: {issue.get('errors', 'unknown')} consecutive\n"
+            report += f"  Status: {issue['status']}\n\n"
+        report += "_Checking system..._"
         send_alert(report)
+        return len(critical)
     
-    return len(issues)
+    # No critical issues - stay silent
+    return 0
 
 if __name__ == "__main__":
     issue_count = generate_report()