Files
HOALedgerIQ_Website/agents/sales-prospector/prospector-v9.py
olsch01 5319bcd30b feat: Add Chatwoot Agent Bot prototype and FAQ knowledge base
- Created chatwoot-agent-bot/ with Node.js webhook server
- Bot detects intent (greeting, billing, technical, features, account)
- Auto-responds from FAQ knowledge base or escalates to human
- FAQ-KB.md: Living knowledge base that grows with customer questions
- CHATWOOT-SETUP.md: Complete deployment and configuration guide
- Supports Telegram notifications on escalation
- Bot runs on port 3001, ready for Chatwoot webhook integration
2026-04-01 16:26:05 -04:00

184 lines
7.3 KiB
Python

#!/usr/bin/env python3
"""Sales Prospector v9 - Rate limiting + CRM dedupe"""
import json, re, time, urllib.request, urllib.parse, urllib.error
from datetime import datetime
from pathlib import Path
import ssl
# SECURITY(review): this disables TLS certificate verification for EVERY
# https request made by this process — presumably to tolerate a self-signed
# cert on the CRM host. Confirm, and prefer a scoped SSLContext if possible.
ssl._create_default_https_context = ssl._create_unverified_context
# All working directories live beside this script; created on import.
SCRIPT_DIR = Path(__file__).parent
STATE_DIR, LOG_DIR, LEADS_DIR = SCRIPT_DIR / "state", SCRIPT_DIR / "logs", SCRIPT_DIR / "leads"
for d in [STATE_DIR, LOG_DIR, LEADS_DIR]: d.mkdir(parents=True, exist_ok=True)
# Persisted progress (metro rotation, CRM dedupe set, lead count).
STATE_FILE = STATE_DIR / "prospector-v9-state.json"
# One log file per calendar day (date fixed at import time).
LOG_FILE = LOG_DIR / f"prospector-v9-{datetime.now().strftime('%Y%m%d')}.log"
# Target metros, rotated round-robin by main().
METROS = ["Charlotte NC", "Atlanta GA", "Orlando FL", "Phoenix AZ", "Austin TX",
"Denver CO", "Nashville TN", "Raleigh NC", "Tampa FL", "Dallas TX",
"Houston TX", "Miami FL", "Seattle WA", "Portland OR", "Las Vegas NV"]
# SECURITY(review): live API credentials are hard-coded in source (and now in
# git history). Move to environment variables / a secrets store and ROTATE
# both keys — they must be considered compromised.
BRAVE_KEY = "BSACPtwjz5lrsXC10pwjFVqzFGN2gr4"
TWENTY_TOKEN = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI5M2FmNGFmNS0zZWQ0LTQ1ZDMtOWE5Zi01MDMzZjc3YTY3MjMiLCJ0eXBlIjoiQVBJX0tFWSIsIndvcmtzcGFjZUlkIjoiOTNhZjRhZjUtM2VkNC00NWQzLTlhOWYtNTAzM2Y3N2E2NzIzIiwiaWF0IjoxNzczMzI4NDQzLCJleHAiOjE4MDQ3ODE2NDIsImp0aSI6IjIwZjEyYzkwLTRkMDctNGJmNi1iMzk3LTZjNmU3MzlmMThjOCJ9.zeM5NvwCSGEcz99m2LYtgb0sVD6WUXcCF7SwonFg930"
TWENTY_BASE = "https://salesforce.hoaledgeriq.com/rest"
# Shared rate-limit state used by rate_limited_sleep()/search_brave():
# epoch of last search, consecutive-429 counter, absolute backoff deadline.
LAST_SEARCH, CONSEC_429, BACKOFF_UNTIL = 0, 0, 0
def log(msg):
    """Echo *msg* to stdout and append it, timestamped, to today's log file."""
    stamped = f"[{datetime.now().strftime('%H:%M:%S')}] {msg}"
    print(stamped)
    with open(LOG_FILE, 'a') as fh:
        fh.write(stamped + '\n')
def get_backoff():
    """Seconds to wait between searches.

    Normally a flat 2s; after N consecutive 429 responses, exponential
    backoff of 120 * 2**(N-1) seconds, capped at 900s (15 minutes).
    """
    if CONSEC_429 <= 0:
        return 2
    return min(120 * 2 ** (CONSEC_429 - 1), 900)
def rate_limited_sleep():
    """Block until it is safe to issue the next Brave search.

    Honors an absolute backoff deadline (BACKOFF_UNTIL, set on 429s) first,
    then enforces a minimum gap of get_backoff() seconds since LAST_SEARCH.
    Updates LAST_SEARCH to the current time before returning.
    """
    global LAST_SEARCH, BACKOFF_UNTIL
    now = time.time()
    if now < BACKOFF_UNTIL:
        time.sleep(BACKOFF_UNTIL - now)
    # NOTE(review): `now` is not refreshed after the backoff sleep above, so
    # the gap check below uses the pre-sleep timestamp — confirm intentional.
    delay = get_backoff()
    if now - LAST_SEARCH < delay:
        time.sleep(delay - (now - LAST_SEARCH))
    LAST_SEARCH = time.time()
def load_state():
    """Load persisted progress from STATE_FILE, or return a fresh state.

    The 'crm' and 'checked' fields are stored as JSON lists and rehydrated
    into sets here.
    """
    if not STATE_FILE.exists():
        return {"metro_idx": 0, "crm": set(), "checked": set(), "leads": 0, "cycle": 0}
    state = json.loads(STATE_FILE.read_text())
    for key in ('crm', 'checked'):
        state[key] = set(state.get(key, []))
    return state
def save_state(s):
    """Persist state *s* to STATE_FILE, converting set fields to JSON lists.

    Works on a shallow copy so the caller's sets are left untouched.
    """
    serializable = s.copy()
    for key in ('crm', 'checked'):
        serializable[key] = list(s[key])
    STATE_FILE.write_text(json.dumps(serializable, indent=2))
def search_brave(query):
    """Run *query* against the Brave web-search API and return result URLs.

    Rate-limited via rate_limited_sleep(). On HTTP 429 the consecutive-429
    counter is bumped and an absolute backoff deadline is set; any error
    path returns [] rather than raising.
    """
    global CONSEC_429, BACKOFF_UNTIL
    rate_limited_sleep()
    log(f"SEARCH: {query}")
    try:
        url = f"https://api.search.brave.com/res/v1/web/search?q={urllib.parse.quote(query)}&count=8"
        with urllib.request.urlopen(urllib.request.Request(url,
                headers={"X-Subscription-Token": BRAVE_KEY, "Accept": "application/json"}), timeout=30) as r:
            # Keep only results that actually carry a 'url' field.
            urls = [x.get('url') for x in json.loads(r.read().decode()).get('web', {}).get('results', []) if x.get('url')]
        # A successful response clears any accumulated 429 backoff.
        if CONSEC_429 > 0:
            log(f" -> Rate cleared after {CONSEC_429} tries")
            CONSEC_429 = 0
        log(f" -> {len(urls)} URLs")
        return urls
    except urllib.error.HTTPError as e:
        if e.code == 429:
            CONSEC_429 += 1
            delay = get_backoff()
            # Absolute deadline consumed by rate_limited_sleep() next call.
            BACKOFF_UNTIL = time.time() + delay
            log(f" -> 429 (try #{CONSEC_429}), backoff {delay/60:.1f}min")
        # Non-429 HTTP errors fall through silently with an empty result.
        return []
    except Exception as e:
        log(f" -> Error: {str(e)[:40]}")
        return []
def fetch_page(url):
    """Fetch *url* and return up to 2000 chars of visible page text.

    Scripts, styles and tags are stripped and whitespace collapsed.
    Returns "" on any fetch/decoding error.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=8) as r:
            html = r.read().decode('utf-8', errors='ignore')
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate instead of being swallowed as an empty page.
        return ""
    # Text cleanup happens outside the try: these re.sub calls cannot fail
    # for str input, and keeping the try body minimal is clearer.
    text = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.I)
    text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL | re.I)
    text = re.sub(r'<[^>]+>', ' ', text)
    return re.sub(r'\s+', ' ', text)[:2000]
def get_domain(url):
try:
d = urllib.parse.urlparse(url).netloc.lower()
return d[4:] if d.startswith('www.') else d
except:
return None
def is_hoa(d):
    """Heuristic: does domain *d* look like an HOA / community site?

    Requires at least one positive keyword and no aggregator / social /
    news marker. Empty or None input is never an HOA.
    """
    if not d:
        return False
    domain = d.lower()
    negative = ('google', 'facebook', 'yelp', 'bbb', 'wiki', 'reddit',
                'linkedin', 'blog', 'news')
    if any(marker in domain for marker in negative):
        return False
    positive = ('hoa', 'homeowners', 'association', 'community', 'condo',
                'village', 'creek', 'estates', 'mgmt', 'management')
    return any(marker in domain for marker in positive)
def get_emails(t):
    """Extract up to 3 unique email addresses from text *t*.

    Filters placeholder domains and generic mailboxes (noreply@ / info@ /
    support@, matched case-sensitively against the raw text, as before).
    Returns [] for empty/None input.
    """
    if not t:
        return []
    blocked = ('example.com', 'test.com', 'noreply@', 'info@', 'support@')
    found = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', t)
    # The regex guarantees an '@', so the old redundant `'@' in e` is gone.
    kept = [e.lower() for e in found
            if len(e) > 8 and not any(b in e for b in blocked)]
    # dict.fromkeys dedupes while preserving first-seen order. The previous
    # list(set(...))[:3] picked 3 emails in hash order, which varies with
    # PYTHONHASHSEED — results were nondeterministic across runs.
    return list(dict.fromkeys(kept))[:3]
def push_crm(lead):
    """POST *lead* as a note to the Twenty CRM.

    Returns True on success; on any failure logs the error and returns
    False instead of raising.
    """
    try:
        markdown = (f"## {lead['q']} Lead\n\n**HOA:** {lead['n']}\n"
                    f"**Metro:** {lead['m']}\n**Site:** {lead['u']}\n"
                    f"**Emails:** {', '.join(lead['e']) or 'None'}")
        payload = {"title": f"{lead['q']}: {lead['d']}",
                   "bodyV2": {"markdown": markdown}}
        request = urllib.request.Request(
            f"{TWENTY_BASE}/notes",
            headers={"Authorization": f"Bearer {TWENTY_TOKEN}",
                     "Content-Type": "application/json"},
            data=json.dumps(payload).encode(),
            method='POST')
        with urllib.request.urlopen(request, timeout=10):
            log(f"CRM: {lead['d']}")
            return True
    except Exception as e:
        log(f"CRM error: {e}")
        return False
def main():
    """Run prospecting cycles until 50 leads have been pushed to the CRM.

    Each cycle searches one metro with three query templates, filters the
    results through the domain heuristics, and pushes new leads to the CRM.
    Progress is persisted to STATE_FILE after every cycle.
    """
    # NOTE(review): main only READS these globals; the `global` statement is
    # not required for that, but is harmless.
    global CONSEC_429, BACKOFF_UNTIL
    log("=== Prospector v9 ===")
    s = load_state()
    queries = ["{m} HOA", "{m} homeowners association", "{m} community management"]
    while True:
        s['cycle'] += 1
        metro = METROS[s['metro_idx'] % len(METROS)]
        log(f"CYCLE {s['cycle']}: {metro}" + (" (backoff)" if CONSEC_429 > 0 else ""))
        start = time.time()
        found = 0
        for qtmpl in queries:
            if s['leads'] >= 50: break
            urls = search_brave(qtmpl.format(m=metro))
            # Repeated 429s with no results: abandon this metro's remaining
            # queries and let the backoff run down.
            if CONSEC_429 > 1 and not urls: break
            for url in urls[:5]:
                if s['leads'] >= 50: break
                dom = get_domain(url)
                if not dom: continue
                # DEDUPE: Skip if already in CRM
                if dom in s['crm']:
                    continue
                # NOTE(review): 'checked' is recorded but never consulted as
                # a skip-list here — confirm whether that is intentional.
                s['checked'].add(dom)
                if not is_hoa(dom):
                    continue
                text = fetch_page(url)
                emails = get_emails(text)
                # Quality tier: HOT = 2+ emails, WARM = 1, COLD = none.
                lead = {'n': dom.split('.')[0].replace('-', ' ').title() + " HOA",
                        'm': metro, 'u': url, 'd': dom, 'e': emails,
                        'q': "HOT" if len(emails) >= 2 else "WARM" if emails else "COLD"}
                if push_crm(lead):
                    s['crm'].add(dom)
                    s['leads'] += 1
                    found += 1
                    log(f"LEAD {s['leads']}: {lead['n']} ({lead['q']})")
        # Advance the metro rotation and persist progress every cycle.
        s['metro_idx'] = (s['metro_idx'] + 1) % len(METROS)
        save_state(s)
        log(f"Done: {found} new, {s['leads']} total, {time.time()-start:.1f}s")
        if s['leads'] >= 50:
            log("TARGET: 50 leads!")
            break
        # Empty cycle: pause before trying the next metro.
        if found == 0:
            time.sleep(30)

if __name__ == "__main__":
    main()