#!/usr/bin/env python3
"""Sales Prospector v12b - Aggressive SearXNG harvesting"""
import json, re, time, random, urllib.request, urllib.parse
from datetime import datetime
from pathlib import Path
import ssl

# SECURITY: this disables TLS certificate verification for EVERY https request
# the process makes (SearXNG, target HOA sites, and the CRM). Presumably the
# endpoints use self-signed certs -- confirm before removing.
ssl._create_default_https_context = ssl._create_unverified_context

# Ensure the working directories exist next to this script.
for d in [Path(__file__).parent / x for x in ["state", "logs", "leads"]]:
    d.mkdir(parents=True, exist_ok=True)

STATE_FILE = Path(__file__).parent / "state" / "prospector-v12-state.json"
LOG_FILE = Path(__file__).parent / "logs" / f"prospector-v12-{datetime.now().strftime('%Y%m%d')}.log"

# Metros are cycled round-robin via state key 'm'.
METROS = ["Charlotte NC", "Atlanta GA", "Orlando FL", "Phoenix AZ", "Austin TX",
          "Denver CO", "Nashville TN", "Raleigh NC", "Tampa FL", "Dallas TX",
          "Houston TX", "Miami FL", "Seattle WA", "Portland OR", "Las Vegas NV",
          "San Antonio TX", "Indianapolis IN", "Columbus OH", "Kansas City MO",
          "Salt Lake City UT", "San Diego CA", "Sacramento CA", "San Jose CA",
          "New Orleans LA", "Oklahoma City OK"]

SEARXNG = "https://search.sensetostyle.com"
# SECURITY: hard-coded bearer token checked into source. Move to an environment
# variable / secrets store and rotate this credential.
TWENTY_TOKEN = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiI5M2FmNGFmNS0zZWQ0LTQ1ZDMtOWE5Zi01MDMzZjc3YTY3MjMiLCJ0eXBlIjoiQVBJX0tFWSIsIndvcmtzcGFjZUlkIjoiOTNhZjRhZjUtM2VkNC00NWQzLTlhOWYtNTAzM2Y3N2E2NzIzIiwiaWF0IjoxNzczMzI4NDQzLCJleHAiOjE4MDQ3ODE2NDIsImp0aSI6IjIwZjEyYzkwLTRkMDctNGJmNi1iMzk3LTZjNmU3MzlmMThjOCJ9.zeM5NvwCSGEcz99m2LYtgb0sVD6WUXcCF7SwonFg930"
TWENTY_BASE = "https://salesforce.hoaledgeriq.com/rest"

# Wall-clock timestamp of the most recent outbound request (see throttle()).
LAST_REQ = 0


def log(m):
    """Print a timestamped message and append it to today's log file."""
    ts = datetime.now().strftime('%H:%M:%S')
    print(f"[{ts}] {m}")
    with open(LOG_FILE, 'a') as f:
        f.write(f"[{ts}] {m}\n")


def throttle():
    """Sleep as needed to keep outbound requests 2-4 s apart (jittered)."""
    global LAST_REQ
    dly = random.uniform(2, 4)
    if LAST_REQ > 0 and (time.time() - LAST_REQ) < dly:
        time.sleep(dly - (time.time() - LAST_REQ))
    LAST_REQ = time.time()


def load():
    """Load persisted state, converting the stored domain list back to a set.

    State keys: m (metro index), crm (domains already pushed), leads (count),
    cycle (loop iteration counter).
    """
    if STATE_FILE.exists():
        s = json.loads(STATE_FILE.read_text())
        s['crm'] = set(s.get('crm', []))
        return s
    return {"m": 0, "crm": set(), "leads": 0, "cycle": 0}


def save(s):
    """Persist state to STATE_FILE (sets are JSON-ified as lists)."""
    tmp = s.copy()
    tmp['crm'] = list(s['crm'])
    STATE_FILE.write_text(json.dumps(tmp, indent=2))


def search(q):
    """Query SearXNG and return up to 15 deduplicated result URLs.

    Self-referential and archive.org links are filtered out. Returns [] on
    any failure (best-effort by design).
    """
    throttle()
    try:
        url = f"{SEARXNG}/search?q={urllib.parse.quote(q)}"
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=15) as r:
            html = r.read().decode('utf-8', errors='ignore')
        urls = [m for m in re.findall(r'href="(https?://[^"]+)"', html)
                if 'sensetostyle' not in m and 'archive.org' not in m]
        # dict.fromkeys dedupes while preserving first-seen order.
        return list(dict.fromkeys(urls))[:15]
    except Exception:
        return []


def get_dom(url):
    """Return the lowercase hostname of *url* without a leading 'www.', or None."""
    try:
        d = urllib.parse.urlparse(url).netloc.lower()
        return d[4:] if d.startswith('www.') else d
    except Exception:
        return None


def is_hoa(d):
    """Heuristic: does domain *d* look like an HOA/community-management site?

    Requires at least one HOA-ish keyword and no blocklisted aggregator names.
    """
    if not d:
        return False
    dl = d.lower()
    good = ['hoa', 'homeowners', 'association', 'community', 'condo',
            'village', 'mgmt', 'management', 'hood']
    bad = ['sensetostyle', 'archive.org', 'google', 'facebook', 'yelp',
           'bbb', 'wiki', 'reddit', 'linkedin']
    return any(k in dl for k in good) and not any(b in dl for b in bad)


def fetch(url):
    """Fetch *url* and return up to 1500 chars of whitespace-collapsed text.

    Strips <script>/<style> blocks before removing tags so contact text is not
    drowned out by JS/CSS. Returns "" on any failure (best-effort).
    """
    throttle()
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=8) as r:
            t = r.read().decode('utf-8', errors='ignore')
        # BUGFIX: the original had empty regex patterns here (no-ops); the
        # DOTALL|I flags show script/style stripping was intended.
        t = re.sub(r'<script\b.*?</script\s*>', '', t, flags=re.DOTALL | re.I)
        t = re.sub(r'<style\b.*?</style\s*>', '', t, flags=re.DOTALL | re.I)
        return re.sub(r'\s+', ' ', re.sub(r'<[^>]+>', ' ', t))[:1500]
    except Exception:
        return ""


def get_emails(t):
    """Extract up to 3 unique lowercase email addresses (len > 8) from text."""
    ems = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', t)
    return list({e.lower() for e in ems if len(e) > 8})[:3]


def crm_push(lead):
    """POST the lead as a note to the Twenty CRM; return True on success."""
    try:
        note = {"title": f"{lead['q']}: {lead['d']}",
                "bodyV2": {"markdown": f"## {lead['q']} Lead\n\n**HOA:** {lead['n']}\n**Metro:** {lead['m']}\n**Site:** {lead['u']}\n**Emails:** {', '.join(lead['e']) or 'None'}"}}
        req = urllib.request.Request(
            f"{TWENTY_BASE}/notes",
            headers={"Authorization": f"Bearer {TWENTY_TOKEN}",
                     "Content-Type": "application/json"},
            data=json.dumps(note).encode(), method='POST')
        # BUGFIX: close the response (original leaked the connection).
        with urllib.request.urlopen(req, timeout=10):
            pass
        log(f"CRM: {lead['d']}")
        return True
    except Exception as e:
        log(f"FAIL: {e}")
        return False


def main():
    """Round-robin the metros, harvest HOA-looking sites, push leads to CRM.

    Runs until 200 total leads have been recorded in state; state is saved
    after every metro so the script can resume where it left off.
    """
    log("=== v12 RESTART ===")
    s = load()
    queries = ["{m} HOA", "{m} homeowners association",
               "{m} HOA management contact", "{m} condo association",
               "{m} community management", "{m} HOA board"]
    while True:
        s['cycle'] += 1
        metro = METROS[s['m'] % len(METROS)]
        log(f"CYCLE {s['cycle']}: {metro} | Leads: {s['leads']}")
        new = 0
        for qt in queries:
            if s['leads'] >= 200:
                break
            urls = search(qt.format(m=metro))
            if urls:
                log(f" Got {len(urls)} URLs")
            for url in urls[:5]:
                if s['leads'] >= 200:
                    break
                dom = get_dom(url)
                # Skip non-domains, domains already in CRM, and non-HOA sites.
                if not dom or dom in s['crm'] or not is_hoa(dom):
                    continue
                txt = fetch(url)
                emails = get_emails(txt)  # hoisted: original called this 3x
                lead = {'n': dom.split('.')[0].replace('-', ' ').title()[:30] + " HOA",
                        'm': metro, 'u': url, 'd': dom, 'e': emails,
                        'q': "HOT" if len(emails) >= 2 else "WARM" if emails else "COLD"}
                if crm_push(lead):
                    s['crm'].add(dom)
                    s['leads'] += 1
                    new += 1
                    log(f"LEAD {s['leads']}: {lead['n']}")
        s['m'] = (s['m'] + 1) % len(METROS)
        save(s)
        log(f"Done: {new} new | {s['leads']} total")
        if s['leads'] >= 200:
            log("TARGET 200!")
            break
        if new == 0:
            time.sleep(10)


if __name__ == "__main__":
    main()