#!/usr/bin/env python3
"""
Agent Health Check - Proactive Monitoring
Runs every 4 hours to detect agent issues before they impact business
"""

import json
import subprocess
import sys
from datetime import datetime
from pathlib import Path

TELEGRAM_TARGET = "telegram:8269921691"
# Errored jobs with at most this many consecutive errors are tagged "degraded".
ERROR_THRESHOLD = 3
# A Telegram alert is sent only for jobs with more consecutive errors than this.
CRITICAL_THRESHOLD = 10


def _warn(text):
    """Write a diagnostic line to stderr without affecting the return values."""
    print(f"agent-health-check: {text}", file=sys.stderr)


def _as_count(value):
    """Coerce a consecutive-error value to an int; anything unusable counts as 0."""
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return 0


def run_command(cmd):
    """Run a command and return its captured standard output as text.

    Args:
        cmd: Either a shell command string (executed through the shell, as
            this function always did) or a list of arguments (executed
            directly without a shell, so no quoting or escaping is needed).

    Returns:
        The command's stdout. An empty string is returned when the process
        cannot be started at all.
    """
    try:
        result = subprocess.run(
            cmd,
            shell=isinstance(cmd, str),
            capture_output=True,
            text=True,
        )
    except OSError as exc:
        # With a shell, a missing program just yields empty output; keep that
        # contract for the list form instead of raising.
        _warn(f"could not run {cmd!r}: {exc}")
        return ""
    return result.stdout


def send_alert(message):
    """Send *message* to TELEGRAM_TARGET via the ``openclaw`` CLI.

    The arguments are passed as a list rather than through a shell so that
    quotes, ``$`` or backticks inside the message (for example in a job name)
    are delivered literally and cannot alter the command.

    Delivery is best-effort: failures are reported on stderr, never raised.
    """
    argv = [
        "openclaw", "message", "send",
        "--channel", "telegram",
        "--target", TELEGRAM_TARGET,
        "--message", message,
    ]
    try:
        result = subprocess.run(argv, capture_output=True)
    except OSError as exc:
        _warn(f"could not send alert: {exc}")
        return
    if result.returncode != 0:
        detail = result.stderr.decode(errors="replace").strip()
        _warn(f"alert command exited with {result.returncode}: {detail}")


def classify_jobs(jobs):
    """Split cron job records into operational jobs and jobs with issues.

    Every job whose last run status is ``'error'`` is an issue. Issues with at
    most ERROR_THRESHOLD consecutive errors additionally carry
    ``'degraded': True``. Entries that are not dicts are skipped.

    Args:
        jobs: Iterable of job dicts; ``state.lastRunStatus`` and
            ``state.consecutiveErrors`` are read from each.

    Returns:
        ``(operational, issues)``: two lists of dicts with the keys ``id``,
        ``name``, ``status`` and ``errors``.
    """
    operational = []
    issues = []
    for job in jobs:
        if not isinstance(job, dict):
            continue
        state = job.get('state')
        if not isinstance(state, dict):
            # Covers both a missing key and an explicit JSON null.
            state = {}
        status = state.get('lastRunStatus', 'unknown')
        errors = _as_count(state.get('consecutiveErrors', 0))
        job_info = {
            'id': job.get('id', ''),
            'name': job.get('name', 'unknown'),
            'status': status,
            'errors': errors,
        }
        if status != 'error':
            operational.append(job_info)
            continue
        if errors <= ERROR_THRESHOLD:
            # Errored, but not repeatedly enough to count as persistent.
            job_info['degraded'] = True
        issues.append(job_info)
    return operational, issues


def check_agent_health():
    """Check all cron jobs and identify issues.

    Returns:
        ``(operational, issues)`` as produced by ``classify_jobs``. Both lists
        are empty when the job list cannot be obtained or parsed.
    """
    output = run_command(["openclaw", "cron", "list", "--json"])
    try:
        payload = json.loads(output)
    except json.JSONDecodeError as exc:
        _warn(f"could not parse cron list output: {exc}")
        return [], []

    # NOTE(review): defensive - tolerate the list being wrapped in an object
    # under "jobs"; confirm the actual CLI output shape.
    if isinstance(payload, dict):
        payload = payload.get('jobs', [])
    if not isinstance(payload, list):
        _warn("cron list output is not a list of jobs")
        return [], []
    return classify_jobs(payload)


def format_alert(critical, now=None):
    """Build the Telegram alert text for the given critical issues.

    Args:
        critical: List of issue dicts (``name``, ``status``, ``errors``).
        now: ``datetime`` used for the timestamp; defaults to the current
            local time.

    Returns:
        The alert message as a single string.
    """
    stamp = (now or datetime.now()).strftime('%I:%M %p')
    parts = [
        f"🔔 *AGENT HEALTH ALERT* - {stamp}\n\n",
        f"*Status:* {len(critical)} agent(s) need attention\n\n",
        "*Critical Issues:*\n",
    ]
    for issue in critical:
        parts.append(f"• *{issue['name']}*\n")
        parts.append(f" Errors: {issue.get('errors', 'unknown')} consecutive\n")
        parts.append(f" Status: {issue['status']}\n\n")
    parts.append("_Checking system..._")
    return "".join(parts)


def generate_report():
    """Generate health report and alert if needed.

    Only issues with more than CRITICAL_THRESHOLD consecutive errors trigger
    an alert; anything less stays silent.

    Returns:
        The number of critical issues (0 when no alert was sent).
    """
    _operational, issues = check_agent_health()
    critical = [
        issue for issue in issues
        if _as_count(issue.get('errors', 0)) > CRITICAL_THRESHOLD
    ]
    if not critical:
        # No critical issues - stay silent
        return 0
    send_alert(format_alert(critical))
    return len(critical)


if __name__ == "__main__":
    issue_count = generate_report()
    sys.exit(0 if issue_count == 0 else 1)