- Created chatwoot-agent-bot/ with Node.js webhook server - Bot detects intent (greeting, billing, technical, features, account) - Auto-responds from FAQ knowledge base or escalates to human - FAQ-KB.md: Living knowledge base that grows with customer questions - CHATWOOT-SETUP.md: Complete deployment and configuration guide - Supports Telegram notifications on escalation - Bot runs on port 3001, ready for Chatwoot webhook integration
233 lines
7.4 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Sales Prospector v3 - Working HOA Lead Generation
|
|
Actually searches, extracts, and pushes real leads to CRM
|
|
"""
|
|
import json
|
|
import os
|
|
import re
|
|
import time
|
|
import subprocess
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
# Config
|
|
SCRIPT_DIR = Path(__file__).parent.absolute()
|
|
STATE_DIR = SCRIPT_DIR / "state"
|
|
LOG_DIR = SCRIPT_DIR / "logs"
|
|
LEADS_DIR = SCRIPT_DIR / "leads"
|
|
for d in [STATE_DIR, LOG_DIR, LEADS_DIR]:
|
|
d.mkdir(parents=True, exist_ok=True)
|
|
|
|
STATE_FILE = STATE_DIR / "prospector-v3-state.json"
|
|
LOG_FILE = LOG_DIR / f"prospector-v3-{datetime.now().strftime('%Y%m%d')}.log"
|
|
|
|
METROS = ["Charlotte NC", "Atlanta GA", "Orlando FL", "Phoenix AZ"]
|
|
|
|
# Extended search queries for better coverage
|
|
SEARCHES_PER_METRO = [
|
|
'{metro} HOA contact email',
|
|
'{metro} homeowners association website',
|
|
'{metro} HOA management contact',
|
|
'{metro} community association board',
|
|
'{metro} condo association contact',
|
|
]
|
|
|
|
# CRM Config
|
|
TWENTY_TOKEN = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI5M2FmNGFmNS0zZWQ0LTQ1ZDMtOWE5Zi01MDMzZjc3YTY3MjMiLCJ0eXBlIjoiQVBJX0tFWSIsIndvcmtzcGFjZUlkIjoiOTNhZjRhZjUtM2VkNC00NWQzLTlhOWYtNTAzM2Y3N2E2NzIzIiwiaWF0IjoxNzMzMjg0NDMsImV4cCI6MTgwNDc4MTY0MiwianRpIjoiMjBmMTJjOTAtNGQwNy00YmY2LWIzOTctNmM2ZTczOWYxOGM4In0.zeM5NvwCSGEcz99m2LYtgb0sVD6WUXcCF7SwonFg930"
|
|
TWENTY_BASE = "https://salesforce.hoaledgeriq.com/rest"
|
|
|
|
def log(msg):
|
|
ts = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
|
line = f"[{ts}] {msg}"
|
|
print(line)
|
|
with open(LOG_FILE, 'a') as f:
|
|
f.write(line + '\n')
|
|
|
|
def load_state():
    """Return the persisted prospector state, or a fresh default state dict."""
    if not STATE_FILE.exists():
        # First run: nothing persisted yet.
        return {
            "metro_index": 0,
            "processed_domains": [],
            "leads_found": 0,
            "cycle_count": 0,
        }
    with open(STATE_FILE) as handle:
        return json.load(handle)
|
|
|
|
def save_state(state):
    """Persist *state* to STATE_FILE as pretty-printed JSON."""
    STATE_FILE.write_text(json.dumps(state, indent=2))
|
|
|
|
def extract_domain(url):
|
|
try:
|
|
from urllib.parse import urlparse
|
|
parsed = urlparse(url)
|
|
domain = parsed.netloc.lower()
|
|
if domain.startswith('www.'):
|
|
domain = domain[4:]
|
|
return domain
|
|
except:
|
|
return None
|
|
|
|
def is_hoa_domain(domain):
    """Heuristic: does *domain* look like an HOA / community-association site?"""
    if not domain:
        return False
    keywords = ('hoa', 'homeowners', 'association', 'community',
                'condo', 'village', 'creek', 'farms', 'estates')
    lowered = domain.lower()
    for keyword in keywords:
        if keyword in lowered:
            return True
    return False
|
|
|
|
def search_web(query, count=5):
    """Run an `openclaw web-search` for *query* and return HOA-looking URLs.

    The result is de-duplicated to one URL per domain and filtered through
    is_hoa_domain().  Returns [] on any failure (non-zero exit, empty
    output, timeout, or a missing `openclaw` binary).
    """
    log(f"SEARCH: {query}")
    try:
        proc = subprocess.run(
            ['openclaw', 'web-search', query, '--count', str(count)],
            capture_output=True, text=True, timeout=60,
        )
        if proc.returncode == 0 and proc.stdout:
            # Keep only lines that look like URLs.
            candidates = [ln.strip() for ln in proc.stdout.split('\n')
                          if ln.strip().startswith('http')]
            # One URL per domain, HOA-looking domains only.
            seen_domains = set()
            filtered = []
            for candidate in candidates:
                dom = extract_domain(candidate)
                if dom and dom not in seen_domains and is_hoa_domain(dom):
                    seen_domains.add(dom)
                    filtered.append(candidate)
            return filtered
    except Exception as exc:
        log(f"Search error: {exc}")
    return []
|
|
|
|
def fetch_page(url, max_chars=1500):
    """Fetch *url* via `openclaw web-fetch`; return its stdout text or None."""
    try:
        proc = subprocess.run(
            ['openclaw', 'web-fetch', url, '--max-chars', str(max_chars)],
            capture_output=True, text=True, timeout=30,
        )
    except Exception as exc:
        log(f"Fetch error: {exc}")
        return None
    # Non-zero exit (or an exception above) means no usable content.
    return proc.stdout if proc.returncode == 0 else None
|
|
|
|
def extract_emails(text):
    """Extract up to 3 plausible business email addresses from *text*.

    Placeholder addresses and free-mail providers are filtered out.
    Returns lowercase addresses, de-duplicated in order of first appearance.
    """
    if not text:
        return []
    # FIX: the TLD class was [A-Z|a-z], where '|' is a literal character
    # inside a class — it also matched pipes.  Corrected to [A-Za-z].
    pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    emails = re.findall(pattern, text)
    bad = ['example', 'test', 'domain', 'email', 'noreply', 'no-reply',
           '@gmail.com', '@yahoo.com', '@hotmail.com']
    filtered = [e.lower() for e in emails
                if not any(b in e.lower() for b in bad)]
    # dict.fromkeys de-duplicates while preserving order; list(set(...))
    # made the [:3] truncation nondeterministic across runs.
    return list(dict.fromkeys(filtered))[:3]
|
|
|
|
def extract_phones(text):
    """Extract up to 2 US-style phone numbers from *text*."""
    if not text:
        return []
    pattern = r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'
    phones = re.findall(pattern, text)
    # Order-preserving de-dupe; list(set(...)) made the [:2] truncation
    # nondeterministic across runs.
    return list(dict.fromkeys(phones))[:2]
|
|
|
|
def extract_hoa_name(content, domain):
    """Best-effort HOA name: markdown heading, then an HOA-ish phrase, then domain."""
    fallback = domain.replace('-', ' ').title()
    if not content:
        return fallback
    # A markdown heading is the most reliable signal.
    heading = re.search(r'#\s*(.+)', content)
    if heading:
        return heading.group(1).strip()
    # Otherwise, a capitalized phrase ending in an HOA keyword.
    phrase = re.search(
        r'([A-Z][A-Za-z\s]+(?:HOA|Homeowners|Community|Association))', content)
    if phrase:
        return phrase.group(1).strip()
    return fallback
|
|
|
|
def assess_quality(emails, phones):
    """Score a lead (3 pts/email, 2 pts/phone) and bucket it HOT/WARM/COLD."""
    score = 3 * len(emails or []) + 2 * len(phones or [])
    if score >= 7:
        return "HOT"
    if score >= 4:
        return "WARM"
    return "COLD"
|
|
|
|
def push_to_crm(lead):
    """POST *lead* as a note to the Twenty CRM; return True on success.

    Expects keys: quality, metro, url, domain, hoa_name; optional lists
    under 'emails' and 'phones'.  Any exception is logged and swallowed.
    """
    try:
        body = f"""## 🎯 HOA Prospect - {lead['quality']}

**HOA Name:** {lead.get('hoa_name', 'Unknown')}
**Metro:** {lead['metro']}
**Website:** {lead['url']}
**Domain:** {lead['domain']}
"""
        if lead.get('emails'):
            body += f"**Email(s):** {', '.join(lead['emails'])}\n"
        if lead.get('phones'):
            body += f"**Phone(s):** {', '.join(lead['phones'])}\n"

        body += f"\n**Source:** Prospector v3\n**Found:** {datetime.now().strftime('%Y-%m-%d %H:%M')}"

        # FIX: build the payload with json.dumps on a dict so EVERY field is
        # escaped.  The old code hand-assembled the JSON string and only
        # escaped the body, so a quote in hoa_name produced invalid JSON.
        note_data = json.dumps({
            "title": f"🎯 {lead['quality']}: {lead['hoa_name']} | {lead['metro']}",
            "bodyV2": {"markdown": body},
        })

        # NOTE(review): `curl -s` exits 0 even on HTTP 4xx/5xx, so the FAIL
        # branch below only fires on transport-level errors — confirm whether
        # `--fail` is wanted here.
        curl_cmd = [
            'curl', '-s', '-X', 'POST',
            f'{TWENTY_BASE}/notes',
            '-H', f'Authorization: Bearer {TWENTY_TOKEN}',
            '-H', 'Content-Type: application/json',
            '-d', note_data,
        ]

        result = subprocess.run(curl_cmd, capture_output=True, text=True, timeout=10)
        if result.returncode == 0:
            log(f"CRM PUSH: {lead['hoa_name']} ({lead['quality']})")
            return True
        else:
            log(f"CRM FAIL: {result.stderr[:100]}")
    except Exception as e:
        log(f"CRM ERROR: {e}")
    return False
|
|
|
|
def save_lead(lead):
    """Write *lead* to leads/<domain>.json as pretty-printed JSON."""
    target = LEADS_DIR / f"{lead['domain']}.json"
    target.write_text(json.dumps(lead, indent=2))
|
|
|
|
def main():
|
|
log("=== Sales Prospector v3 Started ===")
|
|
state = load_state()
|
|
cycle = state['cycle_count']
|
|
|
|
while True:
|
|
cycle += 1
|
|
metro_idx = state['metro_index']
|
|
metro = METROS[metro_idx]
|
|
|
|
log(f"=== CYCLE {cycle}: {metro} ===")
|
|
|
|
# Search queries
|
|
queries = [q.format(metro=metro) for q in SEARCHES_PER_METRO]
|
|
|
|
for query_idx, query in enumerate(queries):
|
|
log(f"QUERY: {query}")
|
|
urls = search_web(query, count=3)
|
|
log(f"Found {len(urls)} potential HOA sites")
|
|
|
|
for url in urls:
|
|
domain = extract_domain(url)
|
|
if domain in state['processed_domains']:
|
|
log(f"SKIP: Already processed {domain}")
|
|
continue
|
|
|
|
log(f"FETCH: {url}")
|
|
content = fetch_page(url)
|
|
if not content:
|
|
state['processed_domains'].append(domain)
|
|
save_state(state)
|
|
continue
|
|
|
|
emails = extract_em |