feat: Add Chatwoot Agent Bot prototype and FAQ knowledge base

- Created chatwoot-agent-bot/ with Node.js webhook server
- Bot detects intent (greeting, billing, technical, features, account)
- Auto-responds from FAQ knowledge base or escalates to human
- FAQ-KB.md: Living knowledge base that grows with customer questions
- CHATWOOT-SETUP.md: Complete deployment and configuration guide
- Supports Telegram notifications on escalation
- Bot runs on port 3001, ready for Chatwoot webhook integration
2026-04-01 16:26:05 -04:00
parent 7ba19752de
commit 5319bcd30b
1074 changed files with 456376 additions and 0 deletions
--- a/agents/sales-prospector/prospector-v2.py
+++ b/agents/sales-prospector/prospector-v2.py
@@ -0,0 +1,241 @@
+#!/usr/bin/env python3
+"""
+Sales Prospector v2 - Intelligent HOA Lead Generation
+Searches for HOA websites, crawls for contact info, extracts board/mgmt contacts
+"""
+
+import json
+import os
+import re
+import time
+import subprocess
+from datetime import datetime
+from urllib.parse import urlparse, urljoin
+from pathlib import Path
+
+# Config: every working directory lives next to this script
+SCRIPT_DIR = Path(__file__).parent.absolute()
+STATE_DIR = SCRIPT_DIR.joinpath("state")
+LOG_DIR = SCRIPT_DIR.joinpath("logs")
+LEADS_DIR = SCRIPT_DIR.joinpath("leads")
+
+# Create the working directories at import time (no-op when they already exist)
+for d in (STATE_DIR, LOG_DIR, LEADS_DIR):
+    d.mkdir(parents=True, exist_ok=True)
+
+# Persistent state is one JSON file; the log file name carries the date on
+# which this module was loaded.
+STATE_FILE = STATE_DIR.joinpath("prospector-v2-state.json")
+LOG_FILE = LOG_DIR.joinpath(f"prospector-v2-{datetime.now().strftime('%Y%m%d')}.log")
+
+# Metro areas that searches are generated for
+METROS = [
+    "Charlotte NC",
+    "Atlanta GA",
+    "Orlando FL",
+    "Phoenix AZ",
+]
+
+# Search config: query templates, "{metro}" is the placeholder for a METROS entry
+SEARCHES_PER_METRO = [
+    '{metro} HOA "board of directors"',
+    '{metro} homeowners association contact',
+    '{metro} HOA management company',
+    '{metro} HOA board members',
+    '{metro} community association management',
+]
+
+# Keywords for validating HOA sites (matched as substrings of the domain name)
+HOA_KEYWORDS = [
+    'hoa',
+    'homeowners',
+    'association',
+    'board',
+    'community',
+    'management',
+    'condo',
+    'townhome',
+]
+
+# CRM Config
+# The API key is read from the TWENTY_TOKEN environment variable so that no
+# credential is stored in version control; it is an empty string when the
+# variable is unset.
+# NOTE(review): a key was previously committed in this file and should be
+# treated as compromised -- revoke/rotate it in the CRM.
+TWENTY_TOKEN = os.environ.get("TWENTY_TOKEN", "")
+TWENTY_BASE = "https://salesforce.hoaledgeriq.com/rest"
+
+def log(msg):
+    """Print a timestamped message and append the same line to LOG_FILE."""
+    entry = f"[{datetime.now():%Y-%m-%d %H:%M:%S}] {msg}"
+    print(entry)
+    # Append mode: earlier entries in the log file are kept
+    with open(LOG_FILE, 'a') as handle:
+        print(entry, file=handle)
+
+def load_state():
+    """Return the state stored in STATE_FILE, or a fresh default state.
+
+    The default state is used only when STATE_FILE does not exist yet.
+    """
+    if not STATE_FILE.exists():
+        return dict(
+            metro_index=0,
+            search_index=0,
+            processed_domains=[],
+            leads_found=0,
+            domains_queue=[],  # Domains found but not yet crawled
+            current_domain=None,
+            cycle_count=0,
+        )
+    with open(STATE_FILE) as handle:
+        return json.load(handle)
+
+def save_state(state):
+    """Persist ``state`` to STATE_FILE as indented JSON.
+
+    The JSON is first written to a temporary sibling file and then renamed
+    over STATE_FILE. An interruption or a serialization error part-way
+    through therefore cannot leave a truncated STATE_FILE behind, which
+    load_state() would be unable to parse.
+    """
+    tmp_file = STATE_FILE.with_name(STATE_FILE.name + ".tmp")
+    with open(tmp_file, 'w') as f:
+        json.dump(state, f, indent=2)
+    # Path.replace overwrites an existing destination file
+    tmp_file.replace(STATE_FILE)
+
+def get_throttle_delay():
+    """Return the delay in seconds for the current local hour.
+
+    120 seconds from 09:00 up to (not including) 18:00, otherwise 60.
+    """
+    business_hours = range(9, 18)
+    if datetime.now().hour in business_hours:
+        return 120  # 2 min business hours
+    return 60  # 1 min overnight
+
+def extract_domain(url):
+    """Extract clean domain from URL
+
+    Returns the lower-cased host name without a leading "www.", and without
+    any port or "user:password@" part. Returns "" when the URL has no
+    network location (for example when the scheme is missing) and None when
+    ``url`` is not a string or cannot be parsed.
+    """
+    if not isinstance(url, str):
+        return None
+    try:
+        # .hostname is already lower-cased and excludes port/credentials
+        host = urlparse(url).hostname
+    except ValueError:
+        # urlparse rejects malformed network locations, e.g. "http://[bad"
+        return None
+    domain = host or ''
+    if domain.startswith('www.'):
+        domain = domain[4:]
+    return domain
+
+def is_hoa_domain(domain):
+    """Return True when the domain contains any HOA_KEYWORDS entry.
+
+    The check is a case-insensitive substring match; an empty or None
+    domain yields False.
+    """
+    if not domain:
+        return False
+    haystack = domain.lower()
+    for keyword in HOA_KEYWORDS:
+        if keyword in haystack:
+            return True
+    return False
+
+def search_web(query, count=10):
+    """Run web search via openclaw web_search tool
+
+    Invokes ``openclaw web-search <query> --count <count>`` and returns the
+    distinct http(s) URLs found anywhere in its standard output, in order of
+    first appearance. Returns an empty list when the command fails, times
+    out (60 s), or prints nothing.
+    """
+    log(f"SEARCH: {query}")
+    try:
+        # Use openclaw CLI for web search
+        result = subprocess.run(
+            ['openclaw', 'web-search', query, '--count', str(count)],
+            capture_output=True,
+            text=True,
+            timeout=60
+        )
+    except Exception as e:
+        # Best-effort: a missing binary or a timeout must not stop the run
+        log(f"Search error: {e}")
+        return []
+
+    if result.returncode != 0:
+        log(f"Search error: openclaw exited with code {result.returncode}")
+        return []
+    if not result.stdout:
+        return []
+
+    urls = []
+    # findall picks up every URL, including several on one line or URLs
+    # embedded in markdown links, without dragging along surrounding text
+    for candidate in re.findall(r'https?://[^\s\)\]\"\']+', result.stdout):
+        # Drop sentence punctuation that happened to follow the URL
+        cleaned = candidate.rstrip('.,;:!?>')
+        if cleaned:
+            urls.append(cleaned)
+    # dict.fromkeys de-duplicates while keeping first-seen order
+    return list(dict.fromkeys(urls))
+
+def fetch_page(url, max_chars=3000):
+    """Fetch page content via web_fetch
+
+    Runs ``openclaw web-fetch <url> --max-chars <max_chars>`` with a 30 s
+    timeout. Returns the command's standard output on exit code 0 and None
+    otherwise; any exception is logged and also yields None.
+    """
+    try:
+        command = ['openclaw', 'web-fetch', url, '--max-chars', str(max_chars)]
+        completed = subprocess.run(command, capture_output=True, text=True, timeout=30)
+        succeeded = completed.returncode == 0
+        if succeeded:
+            return completed.stdout
+    except Exception as e:
+        log(f"Fetch error for {url}: {e}")
+    return None
+
+def extract_emails(text):
+    """Extract email addresses from text
+
+    Returns the distinct addresses in order of first appearance. Addresses
+    at the placeholder domains example.com, test.com, domain.com and
+    email.com (or their subdomains) are dropped. Empty or None text yields
+    an empty list.
+    """
+    if not text:
+        return []
+    # Pattern for emails ("|" is not a valid TLD character, so the final
+    # character class is letters only)
+    pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
+    placeholder_domains = ('example.com', 'test.com', 'domain.com', 'email.com')
+
+    unique = {}
+    for email in re.findall(pattern, text):
+        host = email.rsplit('@', 1)[1].lower()
+        # Compare the host itself rather than searching for a substring, so
+        # that e.g. "contest.com" is not mistaken for "test.com"
+        if any(host == d or host.endswith('.' + d) for d in placeholder_domains):
+            continue
+        unique.setdefault(email, None)
+    return list(unique)
+
+def extract_phones(text):
+    """Extract phone numbers from text
+
+    Recognizes 10-digit numbers written with separators, such as
+    "(555) 123-4567", "555-123-4567" or "555.123.4567", optionally preceded
+    by a "1" / "+1" country code. Each number is returned once, exactly as
+    written, in order of first appearance. Empty or None text yields an
+    empty list.
+    """
+    if not text:
+        return []
+    # One pattern with an optional country code, so a number such as
+    # "+1 (555) 123-4567" is reported once rather than once per overlapping
+    # pattern. The digit look-arounds keep it from matching a fragment of a
+    # longer digit sequence.
+    pattern = r'(?<!\d)(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]\d{3}[-.\s]\d{4}(?!\d)'
+    matches = (m.group(0) for m in re.finditer(pattern, text))
+    return list(dict.fromkeys(matches))
+
+def extract_names_and_titles(text):
+    """Extract potential board member names with titles
+
+    Scans the text line by line for patterns like "John Smith, President"
+    or "Board Member: Jane Doe". For each line that contains a known title
+    (as a whole word, optionally plural) and a capitalized two- or
+    three-word name, one ``{"name": ..., "title": ...}`` dict is returned.
+    The title is the first one on the line, preferring the longest title at
+    that position, and is title-cased. Empty or None text yields an empty
+    list.
+    """
+    if not text:
+        return []
+
+    titles = ['president', 'vice president', 'vp', 'treasurer', 'secretary', 'board member',
+              'director', 'manager', 'community manager', 'property manager']
+
+    # Longest alternatives first, so "vice president" is not reported as
+    # "president" and "community manager" not as "manager". Word boundaries
+    # keep "vp" from matching inside e.g. "MVP".
+    alternation = '|'.join(re.escape(t) for t in sorted(titles, key=len, reverse=True))
+    title_re = re.compile(r'\b(' + alternation + r')s?\b', re.IGNORECASE)
+    # Simple: 2-3 capitalized words
+    name_re = re.compile(r'[A-Z][a-z]+\s[A-Z][a-z]+(?:\s[A-Z][a-z]+)?')
+
+    results = []
+    for line in text.split('\n'):
+        title_match = title_re.search(line)
+        if not title_match:
+            continue
+        # Blank out the titles before looking for a name; otherwise a
+        # capitalized title such as "Board Member" is itself taken as the name
+        remainder = title_re.sub(',', line)
+        name_match = name_re.search(remainder)
+        if name_match:
+            results.append({
+                "name": name_match.group(0),
+                "title": title_match.group(1).lower().title(),
+            })
+
+    return results
+
+def extract_hoa_info(domain, content):
+    """Extract HOA name and details from content
+
+    Returns a dict with the keys "name", "homes" and "location":
+
+    - "name": text of the first markdown heading (a line starting with "#")
+      among the first 20 lines, if it is longer than 3 characters.
+    - "homes": the first home/unit count found, as a string of digits
+      (thousands separators removed).
+    - "location": always None; nothing in this function fills it in.
+
+    ``domain`` is accepted but not used. Empty or None content returns the
+    dict with all values None.
+    """
+    info = {
+        "name": None,
+        "homes": None,
+        "location": None
+    }
+
+    if not content:
+        return info
+
+    # Try to find HOA name from title or first heading
+    for line in content.split('\n')[:20]:
+        stripped = line.strip()
+        # Only a line that *starts* with "#" is a markdown header; a "#"
+        # elsewhere (e.g. "Unit #12") is ordinary text
+        if stripped.startswith('#'):
+            clean = stripped.strip('#').strip()
+            if len(clean) > 3:
+                info['name'] = clean
+                break
+
+    # Look for home count patterns; the number may use thousands separators
+    count = r'(\d{1,3}(?:,\d{3})+|\d+)'
+    home_patterns = [
+        count + r'\s+(?:homes|units|properties|residences|households)',
+        r'(?:over|more than)\s+' + count + r'\s+(?:homes|units)',
+    ]
+    for pattern in home_patterns:
+        match = re.search(pattern, content, re.IGNORECASE)
+        if match:
+            info['homes'] = match.group(1).replace(',', '')
+            break
+
+    return info
+
+def assess_quality(emails, phones, names, info):
+    """Assess lead quality based on available data
+
+    Each piece of data that is present adds its weight to a score:
+    emails 3, phones 2, names 2, HOA name 1, home count 2. A score of 7 or
+    more is "HOT", 4 to 6 is "WARM", anything lower is "COLD".
+    """
+    weighted_signals = (
+        (emails, 3),
+        (phones, 2),
+        (names, 2),
+        (info.get('name'), 1),
+        (info.get('homes'), 2),
+    )
+    points = sum(weight for present, weight in weighted_signals if present)
+
+    if points < 4:
+        return "COLD"
+    return "HOT" if points >= 7 else "WARM"
+
+def push_to_crm(lead):
+    """Push lead to Twenty CRM"""
+    try:
+        body = f"""## HOA Prospect - {lead['quality']}
+
+**Name:** {lead.get('hoa_name