#!/usr/bin/env python3
"""
Sales Prospector v2 - Intelligent HOA Lead Generation
Searches for HOA websites, crawls for contact info, extracts board/mgmt contacts
"""

import json
import os
import re
import time
import subprocess
from datetime import datetime
from urllib.parse import urlparse, urljoin
from pathlib import Path

# Config
SCRIPT_DIR = Path(__file__).parent.absolute()
STATE_DIR = SCRIPT_DIR / "state"
LOG_DIR = SCRIPT_DIR / "logs"
LEADS_DIR = SCRIPT_DIR / "leads"

# Working directories are created at import time so later file writes succeed.
for d in [STATE_DIR, LOG_DIR, LEADS_DIR]:
    d.mkdir(parents=True, exist_ok=True)

STATE_FILE = STATE_DIR / "prospector-v2-state.json"
# The date in the log file name is fixed when the module is imported.
LOG_FILE = LOG_DIR / f"prospector-v2-{datetime.now().strftime('%Y%m%d')}.log"

METROS = ["Charlotte NC", "Atlanta GA", "Orlando FL", "Phoenix AZ"]

# Search config
SEARCHES_PER_METRO = [
    '{metro} HOA "board of directors"',
    '{metro} homeowners association contact',
    '{metro} HOA management company',
    '{metro} HOA board members',
    '{metro} community association management',
]

# Keywords for validating HOA sites
HOA_KEYWORDS = ['hoa', 'homeowners', 'association', 'board', 'community',
                'management', 'condo', 'townhome']

# CRM Config
# SECURITY(review): an API credential is committed in source below. Set the
# TWENTY_API_TOKEN environment variable to override it; the literal is kept
# only so existing deployments keep working and should be rotated and removed.
TWENTY_TOKEN = os.environ.get("TWENTY_API_TOKEN") or "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI5M2FmNGFmNS0zZWQ0LTQ1ZDMtOWE5Zi01MDMzZjc3YTY3MjMiLCJ0eXBlIjoiQVBJX0tFWSIsIndvcmtzcGFjZUlkIjoiOTNhZjRhZjUtM2VkNC00NWQzLTlhOWYtNTAzM2Y3N2E2NzIzIiwiaWF0IjoxNzczMzI4NDQzLCJleHAiOjE4MDQ3ODE2NDIsImp0aSI6IjIwZjEyYzkwLTRkMDctNGJmNi1iMzk3LTZjNmU3MzlmMThjOCJ9.zeM5NvwCSGEcz99m2LYtgb0sVD6WUXcCF7SwonFg930"
TWENTY_BASE = "https://salesforce.hoaledgeriq.com/rest"


def log(msg):
    """Print ``msg`` with a timestamp prefix and append the same line to LOG_FILE."""
    ts = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    line = f"[{ts}] {msg}"
    print(line)
    with open(LOG_FILE, 'a', encoding='utf-8') as f:
        f.write(line + '\n')


def _default_state():
    """Return a fresh copy of the initial prospector state."""
    return {
        "metro_index": 0,
        "search_index": 0,
        "processed_domains": [],
        "leads_found": 0,
        "domains_queue": [],  # Domains found but not yet crawled
        "current_domain": None,
        "cycle_count": 0
    }


def load_state():
    """Load the prospector state from STATE_FILE.

    Returns:
        dict: the default state, overlaid with whatever keys the saved file
        contains. Merging (rather than returning the file verbatim) means a
        state file that lacks some keys still yields every default key.
        When STATE_FILE does not exist the plain defaults are returned.

    Raises:
        json.JSONDecodeError: if STATE_FILE exists but is not valid JSON.
    """
    state = _default_state()
    if not STATE_FILE.exists():
        return state

    with open(STATE_FILE, encoding='utf-8') as f:
        saved = json.load(f)

    if isinstance(saved, dict):
        state.update(saved)
    else:
        # A non-object JSON document cannot be used as state; fall back.
        log(f"State file {STATE_FILE} does not contain a JSON object; using defaults")
    return state
def save_state(state):
    """Persist ``state`` to STATE_FILE as indented JSON.

    The JSON is written to a sibling ``.tmp`` file and then moved over
    STATE_FILE with ``os.replace``, so an interrupted write cannot leave a
    truncated, unparseable state file behind.
    """
    tmp_path = STATE_FILE.with_name(STATE_FILE.name + ".tmp")
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(state, f, indent=2)
    os.replace(tmp_path, STATE_FILE)


def get_throttle_delay(now=None):
    """Return the delay in seconds between requests, based on business hours.

    Args:
        now: optional datetime to evaluate; defaults to ``datetime.now()``.

    Returns:
        int: 120 when the hour is in [9, 18), otherwise 60.
    """
    hour = (datetime.now() if now is None else now).hour
    if 9 <= hour < 18:
        return 120  # 2 min business hours
    return 60  # 1 min overnight


def extract_domain(url):
    """Extract a clean, lower-cased host name from ``url``.

    Port numbers and ``user:password@`` prefixes are dropped, a leading
    ``www.`` is removed, and scheme-less input such as ``www.site.org/page``
    is accepted.

    Returns:
        str | None: the host name, or None when none can be determined.
    """
    if not url or not isinstance(url, str):
        return None
    candidate = url.strip()
    try:
        parsed = urlparse(candidate)
        if not parsed.netloc and not parsed.scheme:
            # Without "//" urlparse puts the host into the path component.
            parsed = urlparse('//' + candidate.lstrip('/'))
        host = parsed.hostname
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. a broken "[" host).
        return None
    if not host or any(ch.isspace() for ch in host):
        return None
    if host.startswith('www.'):
        host = host[4:]
    return host or None


def is_hoa_domain(domain):
    """Return True when ``domain`` contains any HOA_KEYWORDS entry as a substring."""
    if not domain:
        return False
    lowered = domain.lower()
    return any(keyword in lowered for keyword in HOA_KEYWORDS)


_URL_RE = re.compile(r'https?://[^\s\)\]\"\']+')


def _parse_search_urls(output):
    """Return the unique URLs found in ``output``, in first-seen order."""
    cleaned = (match.rstrip('.,;:') for match in _URL_RE.findall(output))
    return list(dict.fromkeys(u for u in cleaned if u))


def search_web(query, count=10):
    """Run a web search through the ``openclaw web-search`` CLI.

    Args:
        query: search string passed to the CLI as a single argument.
        count: value passed to the CLI's ``--count`` option.

    Returns:
        list[str]: unique URLs found anywhere in the command output, in the
        order they first appear; empty when the command fails or times out.
    """
    log(f"SEARCH: {query}")
    try:
        # Use openclaw CLI for web search
        result = subprocess.run(
            ['openclaw', 'web-search', query, '--count', str(count)],
            capture_output=True,
            text=True,
            timeout=60
        )
    except (subprocess.SubprocessError, OSError, ValueError) as e:
        # Covers timeouts, a missing binary and undecodable output.
        log(f"Search error: {e}")
        return []
    if result.returncode != 0:
        log(f"Search failed (exit {result.returncode}): {query}")
        return []
    return _parse_search_urls(result.stdout or '')


def fetch_page(url, max_chars=3000):
    """Fetch page content through the ``openclaw web-fetch`` CLI.

    Args:
        url: page to fetch.
        max_chars: value passed to the CLI's ``--max-chars`` option.

    Returns:
        str | None: the command's stdout on exit code 0, otherwise None.
    """
    try:
        result = subprocess.run(
            ['openclaw', 'web-fetch', url, '--max-chars', str(max_chars)],
            capture_output=True,
            text=True,
            timeout=30
        )
    except (subprocess.SubprocessError, OSError, ValueError) as e:
        log(f"Fetch error for {url}: {e}")
        return None
    if result.returncode == 0:
        return result.stdout
    return None


# The TLD class is letters only; the previous "[A-Z|a-z]" also accepted "|".
_EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
_PLACEHOLDER_EMAIL_DOMAINS = ('example.com', 'test.com', 'domain.com', 'email.com')
# Names like "logo@2x.png" look like addresses to the regex but are files.
_ASSET_SUFFIXES = ('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp')


def extract_emails(text):
    """Extract email addresses from ``text``.

    Addresses whose domain is (or is a subdomain of) a known placeholder
    domain are dropped, as are matches that end in an image file extension.
    The placeholder check compares the domain part, so real domains that
    merely contain a placeholder as a substring (``contest.com``) are kept.

    Returns:
        list[str]: unique addresses (case-insensitive) in first-seen order.
    """
    if not text:
        return []
    unique = {}
    for email in _EMAIL_RE.findall(text):
        lowered = email.lower()
        if lowered.endswith(_ASSET_SUFFIXES):
            continue
        domain = lowered.rpartition('@')[2]
        if any(domain == placeholder or domain.endswith('.' + placeholder)
               for placeholder in _PLACEHOLDER_EMAIL_DOMAINS):
            continue
        unique.setdefault(lowered, email)
    return list(unique.values())
# One pattern with an optional +1/1 country prefix. The digit look-arounds
# stop it from matching inside a longer run of digits.
_PHONE_RE = re.compile(
    r'(?<!\d)(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]\d{3}[-.\s]\d{4}(?!\d)'
)


def extract_phones(text):
    """Extract US-style phone numbers from ``text``.

    Recognised shapes include ``(555) 123-4567``, ``555-123-4567`` and
    ``+1 (555) 123-4567``. Each number is returned as written in the text;
    duplicates are detected by comparing digits only (ignoring a leading
    country code ``1``), so one number in two formats is reported once.

    Returns:
        list[str]: unique numbers in first-seen order.
    """
    if not text:
        return []
    unique = {}
    for raw in _PHONE_RE.findall(text):
        digits = re.sub(r'\D', '', raw)
        if len(digits) == 11 and digits.startswith('1'):
            digits = digits[1:]
        unique.setdefault(digits, raw)
    return list(unique.values())


_TITLES = ('president', 'vice president', 'vp', 'treasurer', 'secretary',
           'board member', 'director', 'manager', 'community manager',
           'property manager')
# Longest titles first so "vice president" wins over "president"; word
# boundaries keep "vp" from matching inside words such as "RSVP".
_TITLE_RE = re.compile(
    r'\b(' + '|'.join(re.escape(t) for t in sorted(_TITLES, key=len, reverse=True))
    + r')s?\b',
    re.IGNORECASE,
)
_NAME_RE = re.compile(r'[A-Z][a-z]+\s[A-Z][a-z]+(?:\s[A-Z][a-z]+)?')


def extract_names_and_titles(text):
    """Extract potential board member names with titles.

    Each line that contains a known title (e.g. "John Smith, President" or
    "Board Member: Jane Doe") contributes at most one entry: the first title
    on the line paired with the first run of 2-3 capitalised words found
    once the title words themselves are removed, so the title is never
    mistaken for the person's name.

    Returns:
        list[dict]: ``{"name": ..., "title": ...}`` entries without
        duplicates, in order of appearance. Titles are title-cased.
    """
    if not text:
        return []
    results = []
    seen = set()
    for line in text.split('\n'):
        title_match = _TITLE_RE.search(line)
        if not title_match:
            continue
        # Blank out every title so the name search cannot capture one.
        remainder = _TITLE_RE.sub(' | ', line)
        name_match = _NAME_RE.search(remainder)
        if not name_match:
            continue
        entry = (name_match.group(0), title_match.group(1).lower().title())
        if entry not in seen:
            seen.add(entry)
            results.append({"name": entry[0], "title": entry[1]})
    return results


_HEADING_RE = re.compile(r'\s*#+\s*(.+?)\s*#*\s*$')
# Accepts plain counts ("350") and comma-grouped ones ("1,200"). This also
# covers "over N homes" phrasing, which needed a separate pattern before.
_HOME_COUNT_RE = re.compile(
    r'(?<!\d)(\d{1,3}(?:,\d{3})+|\d+)\s+'
    r'(?:homes|units|properties|residences|households)\b',
    re.IGNORECASE,
)


def extract_hoa_info(domain, content):
    """Extract HOA name and details from page content.

    Args:
        domain: domain the content came from (currently unused).
        content: page text; Markdown-style ``#`` headings are recognised.

    Returns:
        dict: ``name`` is the first heading longer than 3 characters among
        the first 20 lines, ``homes`` is the first home/unit count as a
        digit string with thousands separators removed, and ``location`` is
        always None here. Values are None when nothing is found.
    """
    info = {
        "name": None,
        "homes": None,
        "location": None
    }
    if not content:
        return info

    # Only lines that start with '#' count as headings; a '#' elsewhere in a
    # line (e.g. "Unit #5") is ordinary text.
    for line in content.split('\n')[:20]:
        heading = _HEADING_RE.match(line)
        if heading and len(heading.group(1)) > 3:
            info['name'] = heading.group(1)
            break

    count = _HOME_COUNT_RE.search(content)
    if count:
        info['homes'] = count.group(1).replace(',', '')

    return info


def assess_quality(emails, phones, names, info):
    """Assess lead quality based on available data.

    Scoring: emails +3, phones +2, names +2, HOA name +1, home count +2.

    Returns:
        str: "HOT" for a score of 7 or more, "WARM" for 4 to 6, else "COLD".
    """
    info = info or {}
    signals = (
        (emails, 3),
        (phones, 2),
        (names, 2),
        (info.get('name'), 1),
        (info.get('homes'), 2),
    )
    score = sum(weight for present, weight in signals if present)

    if score >= 7:
        return "HOT"
    elif score >= 4:
        return "WARM"
    return "COLD"
score = 0 if emails: score += 3 if phones: score += 2 if names: score += 2 if info.get('name'): score += 1 if info.get('homes'): score += 2 if score >= 7: return "HOT" elif score >= 4: return "WARM" return "COLD" def push_to_crm(lead): """Push lead to Twenty CRM""" try: body = f"""## HOA Prospect - {lead['quality']} **Name:** {lead.get('hoa_name