Files
HOALedgerIQ_Website/agents/sales-prospector/prospector-v11.py
olsch01 5319bcd30b feat: Add Chatwoot Agent Bot prototype and FAQ knowledge base
- Created chatwoot-agent-bot/ with Node.js webhook server
- Bot detects intent (greeting, billing, technical, features, account)
- Auto-responds from FAQ knowledge base or escalates to human
- FAQ-KB.md: Living knowledge base that grows with customer questions
- CHATWOOT-SETUP.md: Complete deployment and configuration guide
- Supports Telegram notifications on escalation
- Bot runs on port 3001, ready for Chatwoot webhook integration
2026-04-01 16:26:05 -04:00

189 lines
7.5 KiB
Python

#!/usr/bin/env python3
"""Sales Prospector v11 - Bing scraper (more reliable than Google)"""
import json, re, time, random, urllib.request, urllib.parse
from datetime import datetime
from pathlib import Path
import ssl
# SECURITY(review): this disables TLS certificate verification for EVERY
# HTTPS request the process makes (both the Bing scraping and the CRM push),
# leaving them open to man-in-the-middle. Confirm this is deliberate.
ssl._create_default_https_context = ssl._create_unverified_context
# Working directories live next to the script; created on import.
SCRIPT_DIR = Path(__file__).parent
STATE_DIR, LOG_DIR, LEADS_DIR = SCRIPT_DIR / "state", SCRIPT_DIR / "logs", SCRIPT_DIR / "leads"
for d in [STATE_DIR, LOG_DIR, LEADS_DIR]: d.mkdir(parents=True, exist_ok=True)
STATE_FILE = STATE_DIR / "prospector-v11-state.json"
# One log file per calendar day (date resolved once, at import time).
LOG_FILE = LOG_DIR / f"prospector-v11-{datetime.now().strftime('%Y%m%d')}.log"
# Target metro areas; main() rotates through them round-robin.
METROS = ["Charlotte NC", "Atlanta GA", "Orlando FL", "Phoenix AZ", "Austin TX",
          "Denver CO", "Nashville TN", "Raleigh NC", "Tampa FL", "Dallas TX",
          "Houston TX", "Miami FL"]
# SECURITY(review): hard-coded CRM API key committed to source. Rotate this
# credential and load it from an environment variable or secrets store.
TWENTY_TOKEN = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI5M2FmNGFmNS0zZWQ0LTQ1ZDMtOWE5Zi01MDMzZjc3YTY3MjMiLCJ0eXBlIjoiQVBJX0tFWSIsIndvcmtzcGFjZUlkIjoiOTNhZjRhZjUtM2VkNC00NWQzLTlhOWYtNTAzM2Y3N2E2NzIzIiwiaWF0IjoxNzczMzI4NDQzLCJleHAiOjE4MDQ3ODE2NDIsImp0aSI6IjIwZjEyYzkwLTRkMDctNGJmNi1iMzk3LTZjNmU3MzlmMThjOCJ9.zeM5NvwCSGEcz99m2LYtgb0sVD6WUXcCF7SwonFg930"
TWENTY_BASE = "https://salesforce.hoaledgeriq.com/rest"
# Shared throttling/backoff state: last request timestamp and "blocked until"
# epoch, mutated by throttle() and search_bing().
LAST_REQ, BLOCKED_UNTIL = 0, 0
# Desktop browser UA strings, picked at random per request to look less bot-like.
USER_AGENTS = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
               "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"]
def log(msg):
    """Echo *msg* to stdout with a timestamp and append it to today's log file."""
    stamped = f"[{datetime.now().strftime('%H:%M:%S')}] {msg}"
    print(stamped)
    with open(LOG_FILE, 'a') as fh:
        fh.write(stamped + '\n')
def throttle():
    """Block until at least a random 5-10 s gap has passed since the last request.

    Updates the module-level LAST_REQ timestamp as a side effect. The very
    first call (LAST_REQ == 0) does not sleep.
    """
    global LAST_REQ
    gap = random.uniform(5, 10)  # 5-10s between requests
    if LAST_REQ > 0:
        since = time.time() - LAST_REQ
        if since < gap:
            time.sleep(gap - since)
    LAST_REQ = time.time()
def load_state():
    """Load persisted crawler state from STATE_FILE, or return a fresh one.

    The 'crm' field is stored as a JSON list and rehydrated into a set here.
    """
    if not STATE_FILE.exists():
        return {"metro_idx": 0, "crm": set(), "leads": 0, "cycle": 0}
    state = json.loads(STATE_FILE.read_text())
    state['crm'] = set(state.get('crm', []))
    return state
def save_state(s):
    """Persist state *s* to STATE_FILE, serializing the 'crm' set as a list."""
    serializable = dict(s)  # shallow copy so the caller's set is untouched
    serializable['crm'] = list(s['crm'])
    STATE_FILE.write_text(json.dumps(serializable, indent=2))
def search_bing(query):
    """Run a Bing web search for *query* and return up to 10 external result URLs.

    Returns [] while a backoff window is active, on block/captcha detection,
    on rate limiting, or on any other error. Mutates the module-level
    BLOCKED_UNTIL timestamp to schedule backoff.
    """
    global BLOCKED_UNTIL
    if time.time() < BLOCKED_UNTIL:
        return []
    throttle()
    log(f"SEARCH: {query}")
    try:
        # Bing search URL
        # NOTE(review): the query asks for RSS (&format=rss) but the parsing
        # below targets Bing's HTML markup (class="b_algo") — confirm which
        # format Bing actually returns for this request.
        url = f"https://www.bing.com/search?q={urllib.parse.quote(query)}&format=rss"
        req = urllib.request.Request(url, headers={
            "User-Agent": random.choice(USER_AGENTS),
            "Accept": "text/html,application/rss+xml",
            "Accept-Language": "en-US,en;q=0.9"
        })
        with urllib.request.urlopen(req, timeout=15) as r:
            html = r.read().decode('utf-8', errors='ignore')
        # Crude bot-detection heuristic: any of these words in the page body
        # triggers a 30-minute pause for the whole process.
        if any(x in html.lower() for x in ['captcha', 'blocked', 'unusual']):
            BLOCKED_UNTIL = time.time() + 1800
            log(f" -> BLOCKED, pausing 30min")
            return []
        # Extract result URLs
        urls = []
        # Pattern for organic results (Bing wraps them in <li class="b_algo">)
        for m in re.findall(r'<li class="b_algo"[^>]*>.*?<a href="([^"]+)"', html, re.DOTALL):
            if m.startswith('http') and 'bing.com' not in m:
                urls.append(m)
        # Backup pattern: every absolute href on the page — noisy (nav links,
        # ads) but better than returning nothing when the markup changes.
        if not urls:
            for m in re.findall(r'href="(https?://[^"]+)"', html):
                if 'bing.com' not in m and 'microsoft.com' not in m:
                    urls.append(m)
        log(f" -> {len(urls)} URLs")
        return urls[:10]
    except urllib.error.HTTPError as e:
        # 429/503 are rate-limit signals: schedule a 15-minute backoff.
        if e.code in [429, 503]:
            BLOCKED_UNTIL = time.time() + 900
            log(f" -> Rate limited ({e.code}), backoff 15min")
        return []
    except Exception as e:
        log(f" -> Error: {str(e)[:40]}")
        return []
def fetch_page(url):
    """Fetch *url* and return up to 2000 chars of visible page text.

    Scripts, styles, and all remaining tags are stripped and whitespace is
    collapsed. Returns "" on any failure (best-effort scrape).
    """
    throttle()
    try:
        req = urllib.request.Request(url, headers={"User-Agent": random.choice(USER_AGENTS)})
        with urllib.request.urlopen(req, timeout=10) as r:
            html = r.read().decode('utf-8', errors='ignore')
        # Remove script/style bodies first so their contents don't leak into
        # the extracted text, then drop every remaining tag.
        text = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.I)
        text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL | re.I)
        text = re.sub(r'<[^>]+>', ' ', text)
        return re.sub(r'\s+', ' ', text)[:2000]
    except Exception:
        # Was a bare `except:` which also swallowed KeyboardInterrupt /
        # SystemExit; narrowed so Ctrl-C can still stop the crawler.
        return ""
def get_domain(url):
    """Return the host of *url*, lower-cased and with a leading 'www.' stripped.

    Returns None when the URL cannot be parsed at all.
    """
    try:
        host = urllib.parse.urlparse(url).netloc.lower()
        return host[4:] if host.startswith('www.') else host
    except Exception:
        # Was a bare `except:` (also caught KeyboardInterrupt/SystemExit);
        # narrowed to Exception — parse failures still yield None.
        return None
def is_hoa(d):
    """Heuristic check: does domain *d* look like an HOA / community site?

    Returns False for empty input, for known aggregator/social domains, and
    for domains containing none of the HOA-related keywords.
    """
    if not d:
        return False
    domain = d.lower()
    blocklist = ['bing.com', 'microsoft.com', 'facebook.com', 'yelp.com', 'bbb.org']
    if any(blocked in domain for blocked in blocklist):
        return False
    keywords = ['hoa', 'homeowners', 'association', 'community', 'condo',
                'village', 'creek', 'estates', 'mgmt', 'management']
    return any(word in domain for word in keywords)
def get_emails(t):
    """Extract up to 3 plausible, deduplicated email addresses from text *t*.

    Matches are lower-cased; very short matches and throwaway/no-reply
    addresses are filtered out. Returns [] for empty input.
    """
    if not t:
        return []
    candidates = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', t)
    bad = ['example.com', 'test.com', 'noreply@']
    # dict.fromkeys dedupes while keeping first-seen order. The original
    # list(set(...))[:3] picked an arbitrary, hash-randomized 3 whenever
    # more than 3 emails were found — nondeterministic across runs.
    # (The `'@' in e` check was dropped: the regex already guarantees it.)
    unique = dict.fromkeys(
        e.lower() for e in candidates
        if len(e) > 8 and not any(b in e for b in bad)
    )
    return list(unique)[:3]
def push_crm(lead):
    """POST *lead* as a note to the Twenty CRM. Returns True on success.

    Expects keys: 'q' (quality), 'd' (domain), 'n' (name), 'm' (metro),
    'u' (url), 'e' (email list). Returns False on any failure.
    """
    try:
        note = {"title": f"{lead['q']}: {lead['d']}", "bodyV2": {"markdown":
            f"## {lead['q']} Lead\n\n**HOA:** {lead['n']}\n**Metro:** {lead['m']}\n**Site:** {lead['u']}\n**Emails:** {', '.join(lead['e']) or 'None'}"}}
        req = urllib.request.Request(
            f"{TWENTY_BASE}/notes",
            headers={"Authorization": f"Bearer {TWENTY_TOKEN}", "Content-Type": "application/json"},
            data=json.dumps(note).encode(), method='POST')
        # `with` closes the HTTP response; the original never closed it and
        # leaked the connection on every push.
        with urllib.request.urlopen(req, timeout=10):
            pass
        log(f"CRM: {lead['d']}")
        return True
    except Exception:
        # Was a bare `except:`; narrowed so Ctrl-C still stops the crawler.
        return False
def main():
global BLOCKED_UNTIL
log("=== Prospector v11 Started (Bing scraper) ===")
s = load_state()
queries = ["{m} HOA", "{m} homeowners association", "{m} HOA management"]
while True:
s['cycle'] += 1
metro = METROS[s['metro_idx'] % len(METROS)]
status = "(blocked)" if time.time() < BLOCKED_UNTIL else ""
log(f"CYCLE {s['cycle']}: {metro} {status}")
if time.time() < BLOCKED_UNTIL:
time.sleep(60)
continue
start, found = time.time(), 0
for qtmpl in queries:
if s['leads'] >= 50: break
urls = search_bing(qtmpl.format(m=metro))
for url in urls[:6]:
if s['leads'] >= 50: break
dom = get_domain(url)
if not dom or dom in s['crm'] or not is_hoa(dom): continue
text = fetch_page(url)
emails = get_emails(text)
lead = {'n': dom.split('.')[0].replace('-', ' ').title() + " HOA",
'm': metro, 'u': url, 'd': dom, 'e': emails,
'q': "HOT" if len(emails) >= 2 else "WARM" if emails else "COLD"}
if push_crm(lead):
s['crm'].add(dom)
s['leads'] += 1
found += 1
log(f"LEAD {s['leads']}: {lead['n']} ({lead['q']})")
s['metro_idx'] = (s['metro_idx'] + 1) % len(METROS)
save_state(s)
log(f"Done: {found} new, {s['