#!/usr/bin/env python3
"""Sales Prospector v9 - Rate limiting + CRM dedupe"""
import json, re, time, urllib.request, urllib.parse, urllib.error
from datetime import datetime
from pathlib import Path
import ssl

# NOTE: certificate verification is disabled globally for all HTTPS requests.
ssl._create_default_https_context = ssl._create_unverified_context

SCRIPT_DIR = Path(__file__).parent
STATE_DIR, LOG_DIR, LEADS_DIR = SCRIPT_DIR / "state", SCRIPT_DIR / "logs", SCRIPT_DIR / "leads"
for d in [STATE_DIR, LOG_DIR, LEADS_DIR]:
    d.mkdir(parents=True, exist_ok=True)

STATE_FILE = STATE_DIR / "prospector-v9-state.json"
LOG_FILE = LOG_DIR / f"prospector-v9-{datetime.now().strftime('%Y%m%d')}.log"

METROS = ["Charlotte NC", "Atlanta GA", "Orlando FL", "Phoenix AZ", "Austin TX",
          "Denver CO", "Nashville TN", "Raleigh NC", "Tampa FL", "Dallas TX",
          "Houston TX", "Miami FL", "Seattle WA", "Portland OR", "Las Vegas NV"]

BRAVE_KEY = "BSACPtwjz5lrsXC10pwjFVqzFGN2gr4"
TWENTY_TOKEN = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI5M2FmNGFmNS0zZWQ0LTQ1ZDMtOWE5Zi01MDMzZjc3YTY3MjMiLCJ0eXBlIjoiQVBJX0tFWSIsIndvcmtzcGFjZUlkIjoiOTNhZjRhZjUtM2VkNC00NWQzLTlhOWYtNTAzM2Y3N2E2NzIzIiwiaWF0IjoxNzczMzI4NDQzLCJleHAiOjE4MDQ3ODE2NDIsImp0aSI6IjIwZjEyYzkwLTRkMDctNGJmNi1iMzk3LTZjNmU3MzlmMThjOCJ9.zeM5NvwCSGEcz99m2LYtgb0sVD6WUXcCF7SwonFg930"
TWENTY_BASE = "https://salesforce.hoaledgeriq.com/rest"

# Rate-limiter state: timestamp of the last search, count of consecutive 429s,
# and the earliest time at which the next search may be issued.
LAST_SEARCH, CONSEC_429, BACKOFF_UNTIL = 0, 0, 0


def log(msg):
    ts = datetime.now().strftime('%H:%M:%S')
    line = f"[{ts}] {msg}"
    print(line)
    with open(LOG_FILE, 'a') as f:
        f.write(line + '\n')


def get_backoff():
    # 2s between searches normally; after consecutive 429s, 120s doubling per
    # retry, capped at 900s.
    return min(120 * (2 ** (CONSEC_429 - 1)), 900) if CONSEC_429 > 0 else 2


def rate_limited_sleep():
    global LAST_SEARCH, BACKOFF_UNTIL
    now = time.time()
    if now < BACKOFF_UNTIL:
        time.sleep(BACKOFF_UNTIL - now)
    delay = get_backoff()
    if now - LAST_SEARCH < delay:
        time.sleep(delay - (now - LAST_SEARCH))
    LAST_SEARCH = time.time()


def load_state():
    if STATE_FILE.exists():
        s = json.loads(STATE_FILE.read_text())
        s['crm'] = set(s.get('crm', []))
        s['checked'] = set(s.get('checked', []))
        return s
    return {"metro_idx": 0, "crm": set(), "checked": set(), "leads": 0, "cycle": 0}


def save_state(s):
    # Sets are not JSON-serializable, so convert them to lists before writing.
    tmp = s.copy()
    tmp['crm'] = list(s['crm'])
    tmp['checked'] = list(s['checked'])
    STATE_FILE.write_text(json.dumps(tmp, indent=2))


def search_brave(query):
    global CONSEC_429, BACKOFF_UNTIL
    rate_limited_sleep()
    log(f"SEARCH: {query}")
    try:
        url = f"https://api.search.brave.com/res/v1/web/search?q={urllib.parse.quote(query)}&count=8"
        req = urllib.request.Request(url, headers={"X-Subscription-Token": BRAVE_KEY,
                                                   "Accept": "application/json"})
        with urllib.request.urlopen(req, timeout=30) as r:
            results = json.loads(r.read().decode()).get('web', {}).get('results', [])
        urls = [x.get('url') for x in results if x.get('url')]
        if CONSEC_429 > 0:
            log(f" -> Rate cleared after {CONSEC_429} tries")
        CONSEC_429 = 0
        log(f" -> {len(urls)} URLs")
        return urls
    except urllib.error.HTTPError as e:
        if e.code == 429:
            CONSEC_429 += 1
            delay = get_backoff()
            BACKOFF_UNTIL = time.time() + delay
            log(f" -> 429 (try #{CONSEC_429}), backoff {delay/60:.1f}min")
        else:
            log(f" -> HTTP {e.code}")
        return []
    except Exception as e:
        log(f" -> Error: {str(e)[:40]}")
        return []


def fetch_page(url):
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=8) as r:
            html = r.read().decode('utf-8', errors='ignore')
        # Strip script/style blocks, then all remaining tags, then collapse whitespace.
        text = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.I)
        text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL | re.I)
        text = re.sub(r'<[^>]+>', ' ', text)
        return re.sub(r'\s+', ' ', text)[:2000]
    except Exception:
        return ""


def get_domain(url):
    try:
        d = urllib.parse.urlparse(url).netloc.lower()
        return d[4:] if d.startswith('www.') else d
    except Exception:
        return None


def is_hoa(d):
    if not d:
        return False
    dl = d.lower()
    good = ['hoa', 'homeowners', 'association', 'community', 'condo', 'village',
            'creek', 'estates', 'mgmt', 'management']
    bad = ['google', 'facebook', 'yelp', 'bbb', 'wiki', 'reddit', 'linkedin', 'blog', 'news']
    return any(k in dl for k in good) and not any(b in dl for b in bad)


def get_emails(t):
    if not t:
        return []
    bad = ['example.com', 'test.com', 'noreply@', 'info@', 'support@']
    ems = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', t)
    return list(set([e.lower() for e in ems
                     if len(e) > 8 and '@' in e and not any(b in e for b in bad)]))[:3]


def push_crm(lead):
    try:
        note = {"title": f"{lead['q']}: {lead['d']}",
                "bodyV2": {"markdown": f"## {lead['q']} Lead\n\n**HOA:** {lead['n']}\n"
                                       f"**Metro:** {lead['m']}\n**Site:** {lead['u']}\n"
                                       f"**Emails:** {', '.join(lead['e']) or 'None'}"}}
        req = urllib.request.Request(f"{TWENTY_BASE}/notes",
                                     headers={"Authorization": f"Bearer {TWENTY_TOKEN}",
                                              "Content-Type": "application/json"},
                                     data=json.dumps(note).encode(), method='POST')
        with urllib.request.urlopen(req, timeout=10):
            log(f"CRM: {lead['d']}")
        return True
    except Exception as e:
        log(f"CRM error: {e}")
        return False


def main():
    global CONSEC_429, BACKOFF_UNTIL
    log("=== Prospector v9 ===")
    s = load_state()
    queries = ["{m} HOA", "{m} homeowners association", "{m} community management"]
    while True:
        s['cycle'] += 1
        metro = METROS[s['metro_idx'] % len(METROS)]
        log(f"CYCLE {s['cycle']}: {metro}" + (" (backoff)" if CONSEC_429 > 0 else ""))
        start = time.time()
        found = 0
        for qtmpl in queries:
            if s['leads'] >= 50:
                break
            urls = search_brave(qtmpl.format(m=metro))
            if CONSEC_429 > 1 and not urls:
                break
            for url in urls[:5]:
                if s['leads'] >= 50:
                    break
                dom = get_domain(url)
                if not dom:
                    continue
                # DEDUPE: Skip if already in CRM
                if dom in s['crm']:
                    continue
                s['checked'].add(dom)
                if not is_hoa(dom):
                    continue
                text = fetch_page(url)
                emails = get_emails(text)
                lead = {'n': dom.split('.')[0].replace('-', ' ').title() + " HOA",
                        'm': metro, 'u': url, 'd': dom, 'e': emails,
                        'q': "HOT" if len(emails) >= 2 else "WARM" if emails else "COLD"}
                if push_crm(lead):
                    s['crm'].add(dom)
                    s['leads'] += 1
                    found += 1
                    log(f"LEAD {s['leads']}: {lead['n']} ({lead['q']})")
        s['metro_idx'] = (s['metro_idx'] + 1) % len(METROS)
        save_state(s)
        log(f"Done: {found} new, {s['leads']} total, {time.time()-start:.1f}s")
        if s['leads'] >= 50:
            log("TARGET: 50 leads!")
            break
        if found == 0:
            time.sleep(30)


if __name__ == "__main__":
    main()
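
# Worked example of the backoff schedule computed by get_backoff() above: with
# no recent 429s the pacing between searches is 2s; each consecutive 429 doubles
# the wait starting at 120s, capped at 900s.
#
#   >>> [min(120 * (2 ** (n - 1)), 900) if n > 0 else 2 for n in range(5)]
#   [2, 120, 240, 480, 900]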