Files
HOALedgerIQ_Website/agents/sales-prospector/prospector-v12.py
olsch01 5319bcd30b feat: Add Chatwoot Agent Bot prototype and FAQ knowledge base
- Created chatwoot-agent-bot/ with Node.js webhook server
- Bot detects intent (greeting, billing, technical, features, account)
- Auto-responds from FAQ knowledge base or escalates to human
- FAQ-KB.md: Living knowledge base that grows with customer questions
- CHATWOOT-SETUP.md: Complete deployment and configuration guide
- Supports Telegram notifications on escalation
- Bot runs on port 3001, ready for Chatwoot webhook integration
2026-04-01 16:26:05 -04:00

145 lines
5.9 KiB
Python

#!/usr/bin/env python3
"""Sales Prospector v12b - Aggressive SearXNG harvesting"""
import json, re, time, random, urllib.request, urllib.parse
from datetime import datetime
from pathlib import Path
import ssl

# WARNING(security): this disables TLS certificate verification for EVERY
# https request made by this process, exposing all traffic to MITM.
# Presumably done to tolerate a self-signed cert on the SearXNG host --
# confirm, and prefer a scoped ssl.SSLContext passed to urlopen instead.
ssl._create_default_https_context = ssl._create_unverified_context

# Create the working directories next to this script (idempotent).
for d in [Path(__file__).parent / x for x in ["state", "logs", "leads"]]:
    d.mkdir(parents=True, exist_ok=True)

# Persistent crawl state (JSON) and a per-day log file.
STATE_FILE = Path(__file__).parent / "state" / "prospector-v12-state.json"
LOG_FILE = Path(__file__).parent / "logs" / f"prospector-v12-{datetime.now().strftime('%Y%m%d')}.log"

# Metro areas cycled round-robin by main(), one metro per cycle.
METROS = ["Charlotte NC", "Atlanta GA", "Orlando FL", "Phoenix AZ", "Austin TX",
          "Denver CO", "Nashville TN", "Raleigh NC", "Tampa FL", "Dallas TX",
          "Houston TX", "Miami FL", "Seattle WA", "Portland OR", "Las Vegas NV",
          "San Antonio TX", "Indianapolis IN", "Columbus OH", "Kansas City MO",
          "Salt Lake City UT", "San Diego CA", "Sacramento CA", "San Jose CA",
          "New Orleans LA", "Oklahoma City OK"]

# SearXNG instance queried by search().
SEARXNG = "https://search.sensetostyle.com"

# WARNING(security): hard-coded API credential (JWT) committed to source.
# Rotate this key and load it from an environment variable or secrets store.
TWENTY_TOKEN = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI5M2FmNGFmNS0zZWQ0LTQ1ZDMtOWE5Zi01MDMzZjc3YTY3MjMiLCJ0eXBlIjoiQVBJX0tFWSIsIndvcmtzcGFjZUlkIjoiOTNhZjRhZjUtM2VkNC00NWQzLTlhOWYtNTAzM2Y3N2E2NzIzIiwiaWF0IjoxNzczMzI4NDQzLCJleHAiOjE4MDQ3ODE2NDIsImp0aSI6IjIwZjEyYzkwLTRkMDctNGJmNi1iMzk3LTZjNmU3MzlmMThjOCJ9.zeM5NvwCSGEcz99m2LYtgb0sVD6WUXcCF7SwonFg930"
# Twenty CRM REST base URL that crm_push() posts notes to.
TWENTY_BASE = "https://salesforce.hoaledgeriq.com/rest"

# Wall-clock time of the last outbound HTTP request; managed by throttle().
LAST_REQ = 0
def log(m):
    """Print message *m* with an HH:MM:SS timestamp and append it to today's log.

    The file is reopened per call so every line is flushed immediately and a
    crash/restart never loses buffered output.
    """
    ts = datetime.now().strftime('%H:%M:%S')
    line = f"[{ts}] {m}"
    print(line)
    # Fix: explicit UTF-8 -- the original used the locale default encoding,
    # which can crash or mangle non-ASCII scraped HOA names on some systems.
    with open(LOG_FILE, 'a', encoding='utf-8') as f:
        f.write(line + "\n")
def throttle():
    """Block until at least a random 2-4 s gap has passed since the last request.

    Keeps outbound requests politely spaced so the SearXNG instance and
    target sites are not hammered.
    """
    global LAST_REQ
    gap = random.uniform(2, 4)
    if LAST_REQ > 0:
        elapsed = time.time() - LAST_REQ
        if elapsed < gap:
            time.sleep(gap - elapsed)
    LAST_REQ = time.time()
def load():
    """Load persisted crawl state from STATE_FILE.

    The 'crm' domain list is rehydrated into a set for O(1) dedup checks.
    Returns fresh defaults when no state file exists yet.
    """
    if not STATE_FILE.exists():
        return {"m": 0, "crm": set(), "leads": 0, "cycle": 0}
    state = json.loads(STATE_FILE.read_text())
    state['crm'] = set(state.get('crm', []))
    return state
def save(s):
    """Persist state *s* to STATE_FILE as JSON.

    The 'crm' set is converted to a list in a shallow copy first, since sets
    are not JSON-serializable; *s* itself is left untouched.
    """
    snapshot = {**s, 'crm': list(s['crm'])}
    STATE_FILE.write_text(json.dumps(snapshot, indent=2))
def search(q):
    """Query SearXNG for *q* and return up to 15 result URLs.

    Scrapes href attributes out of the HTML results page, drops links back to
    the search host itself and to archive.org, and deduplicates while
    preserving order. Best-effort: returns [] on any network/parse failure.
    """
    throttle()
    try:
        url = f"{SEARXNG}/search?q={urllib.parse.quote(q)}"
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=15) as r:
            html = r.read().decode('utf-8', errors='ignore')
        urls = [m for m in re.findall(r'href="(https?://[^"]+)"', html)
                if 'sensetostyle' not in m and 'archive.org' not in m]
        # dict.fromkeys dedups while keeping first-seen order.
        return list(dict.fromkeys(urls))[:15]
    except Exception:
        # Fix: was a bare `except:`, which also swallowed KeyboardInterrupt
        # and SystemExit, making the harvester impossible to Ctrl-C mid-request.
        return []
def get_dom(url):
    """Return the lowercased domain of *url* with any leading 'www.' stripped.

    Returns '' for strings with no netloc (urlparse yields an empty netloc),
    and None if parsing fails outright -- both are falsy, which is all the
    caller checks.
    """
    try:
        d = urllib.parse.urlparse(url).netloc.lower()
    except Exception:
        # Fix: was a bare `except:`, which also caught KeyboardInterrupt.
        return None
    return d[4:] if d.startswith('www.') else d
def is_hoa(d):
    """Heuristic filter: does domain *d* look like an HOA / community site?

    A domain qualifies when it contains at least one HOA-ish keyword and none
    of the blocklisted directory/social/search names. Falsy input -> False.
    """
    if not d:
        return False
    name = d.lower()
    blocklist = ('sensetostyle', 'archive.org', 'google', 'facebook', 'yelp',
                 'bbb', 'wiki', 'reddit', 'linkedin')
    if any(blocked in name for blocked in blocklist):
        return False
    keywords = ('hoa', 'homeowners', 'association', 'community', 'condo',
                'village', 'mgmt', 'management', 'hood')
    return any(kw in name for kw in keywords)
def fetch(url):
    """Fetch *url* and return up to 1500 chars of visible page text.

    Strips <script>/<style> blocks and all remaining tags, then collapses
    whitespace. Best-effort: returns '' on any network/decode failure.
    """
    throttle()
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=8) as r:
            html = r.read().decode('utf-8', errors='ignore')
        # Fix: the original patterns ended at any '...script>' / '...style>'
        # (e.g. a </noscript> or attribute text), truncating removal early;
        # anchor on the proper closing tags.
        t = re.sub(r'<script.*?</script>', '', html, flags=re.DOTALL | re.I)
        t = re.sub(r'<style.*?</style>', '', t, flags=re.DOTALL | re.I)
        t = re.sub(r'<[^>]+>', ' ', t)
        return re.sub(r'\s+', ' ', t)[:1500]
    except Exception:
        # Fix: was a bare `except:` -- it also swallowed KeyboardInterrupt.
        return ""
def get_emails(t):
    """Extract up to 3 unique email addresses (lowercased) from text *t*.

    Fix: dedup via dict.fromkeys keeps first-occurrence order -- the previous
    list(set(...)) made the result order nondeterministic, which made the
    HOT/WARM/COLD scoring and CRM note text vary between runs. Addresses of
    8 characters or fewer are discarded as likely junk. The old `'@' in e`
    check was redundant (the regex guarantees one '@'), as was `or []`.
    """
    found = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', t)
    unique = dict.fromkeys(e.lower() for e in found if len(e) > 8)
    return list(unique)[:3]
def crm_push(lead):
    """Create a note for *lead* in the Twenty CRM via its REST API.

    Returns True on success, False on failure. Failures are logged but never
    raised so the harvest loop keeps running.
    """
    try:
        note = {"title": f"{lead['q']}: {lead['d']}",
                "bodyV2": {"markdown": f"## {lead['q']} Lead\n\n**HOA:** {lead['n']}\n**Metro:** {lead['m']}\n**Site:** {lead['u']}\n**Emails:** {', '.join(lead['e']) or 'None'}"}}
        req = urllib.request.Request(
            f"{TWENTY_BASE}/notes",
            headers={"Authorization": f"Bearer {TWENTY_TOKEN}",
                     "Content-Type": "application/json"},
            data=json.dumps(note).encode(),
            method='POST')
        # Fix: context-manage the response -- the original never closed it,
        # leaking the connection on every successful push.
        with urllib.request.urlopen(req, timeout=10):
            pass
        log(f"CRM: {lead['d']}")
        return True
    except Exception as e:
        log(f"FAIL: {e}")
        return False
def main():
    """Run the harvest loop: cycle metros, search, scrape, and push leads.

    Loops until 200 total leads have been pushed to the CRM. State is
    checkpointed after every cycle so the script can be killed and restarted
    without re-pushing known domains.
    """
    log("=== v12 RESTART ===")
    s = load()
    queries = ["{m} HOA", "{m} homeowners association", "{m} HOA management contact",
               "{m} condo association", "{m} community management", "{m} HOA board"]
    while True:
        s['cycle'] += 1
        metro = METROS[s['m'] % len(METROS)]
        log(f"CYCLE {s['cycle']}: {metro} | Leads: {s['leads']}")
        new = 0
        for qt in queries:
            if s['leads'] >= 200:
                break
            urls = search(qt.format(m=metro))
            if urls:
                log(f" Got {len(urls)} URLs")
            for url in urls[:5]:
                if s['leads'] >= 200:
                    break
                dom = get_dom(url)
                # Skip unparseable, already-pushed, or non-HOA-looking domains.
                if not dom or dom in s['crm'] or not is_hoa(dom):
                    continue
                txt = fetch(url)
                # Fix: hoisted -- the original called get_emails(txt) three
                # times (once for 'e', twice for the quality grade).
                emails = get_emails(txt)
                quality = "HOT" if len(emails) >= 2 else "WARM" if emails else "COLD"
                lead = {'n': dom.split('.')[0].replace('-', ' ').title()[:30] + " HOA",
                        'm': metro, 'u': url, 'd': dom,
                        'e': emails, 'q': quality}
                if crm_push(lead):
                    s['crm'].add(dom)
                    s['leads'] += 1
                    new += 1
                    log(f"LEAD {s['leads']}: {lead['n']}")
        # Advance to the next metro and checkpoint state every cycle.
        s['m'] = (s['m'] + 1) % len(METROS)
        save(s)
        log(f"Done: {new} new | {s['leads']} total")
        if s['leads'] >= 200:
            log("TARGET 200!")
            break
        if new == 0:
            # Dry cycle: back off briefly before trying the next metro.
            time.sleep(10)


if __name__ == "__main__":
    main()