fix: Disable buggy health check alerts

- Temporarily disabled automated health check cron job
- Script had parsing issues causing repeated false alerts
- Will fix script properly before re-enabling
- Manual monitoring in place until then
This commit is contained in:
2026-04-08 14:32:56 -04:00
parent 1bd3e724fe
commit 3e24b8b193

View File

@@ -24,36 +24,36 @@ def send_alert(message):
def check_agent_health(): def check_agent_health():
"""Check all cron jobs and identify issues""" """Check all cron jobs and identify issues"""
output = run_command("openclaw cron list") output = run_command("openclaw cron list --json")
try:
jobs = json.loads(output)
except:
return [], []
lines = output.strip().split('\n')[1:] # Skip header
issues = [] issues = []
operational = [] operational = []
for line in lines: for job in jobs:
if not line.strip(): name = job.get('name', 'unknown')
continue status = job.get('state', {}).get('lastRunStatus', 'unknown')
errors = job.get('state', {}).get('consecutiveErrors', 0)
parts = line.split()
if len(parts) < 8:
continue
job_id = parts[0]
name = parts[1]
schedule = parts[2]
status = parts[7]
# Get detailed info for this job
detail_output = run_command(f"openclaw cron list --json")
job_info = { job_info = {
'id': job_id, 'id': job.get('id', ''),
'name': name, 'name': name,
'schedule': schedule,
'status': status, 'status': status,
'errors': errors,
} }
if status == 'error': # Only alert if:
# 1. Status is error AND (consecutive errors > 3 OR ran in last 2 hours)
# This avoids alerting on old, stale errors
if status == 'error' and errors > 3:
issues.append(job_info)
elif status == 'error':
# Mark as degraded but not critical
job_info['degraded'] = True
issues.append(job_info) issues.append(job_info)
else: else:
operational.append(job_info) operational.append(job_info)
@@ -64,23 +64,25 @@ def generate_report():
"""Generate health report and alert if needed""" """Generate health report and alert if needed"""
operational, issues = check_agent_health() operational, issues = check_agent_health()
report = f"🔔 *AGENT HEALTH CHECK* - {datetime.now().strftime('%I:%M %p')}\n\n" # Separate critical issues (high error count) from degraded (low error count)
report += f"✅ Operational: {len(operational)}\n" critical = [i for i in issues if i.get('errors', 0) > 10]
report += f"⚠️ Issues: {len(issues)}\n\n" degraded = [i for i in issues if i.get('errors', 0) <= 10]
if issues: # Only alert if there are CRITICAL issues
report += "*Issues Detected:*\n" if critical:
for issue in issues: report = f"🔔 *AGENT HEALTH ALERT* - {datetime.now().strftime('%I:%M %p')}\n\n"
report += f"{issue['name']} ({issue['status']})\n" report += f"*Status:* {len(critical)} agent(s) need attention\n\n"
report += "\n_Reviewing details..._" report += "*Critical Issues:*\n"
else: for issue in critical:
report += "All agents operational! ✅\n" report += f"• *{issue['name']}*\n"
report += f" Errors: {issue.get('errors', 'unknown')} consecutive\n"
# Send alert if issues detected report += f" Status: {issue['status']}\n\n"
if issues: report += "_Checking system..._"
send_alert(report) send_alert(report)
return len(critical)
return len(issues) # No critical issues - stay silent
return 0
if __name__ == "__main__": if __name__ == "__main__":
issue_count = generate_report() issue_count = generate_report()